[RFC v2 2/2] sched/fair: introduce sched-idle balance

From: Abel Wu
Date: Sat Apr 09 2022 - 09:52:22 EST


The periodic (normal/idle) balancing is regulated by intervals on each
sched-domain and the intervals can prevent the unoccupied cpus from
pulling the non-idle tasks. While the newly-idle balancing is triggered
only when the cpus become really idle, and sadly the sched-idle cpus
are not the case. There are also some other constrains to get in the
middle of the way of making unoccupied cpus busier.

Given the above, the sched-idle balancing is an extension to existing
load balance mechanisms on the unoccupied cpus to let them fast pull
non-idle tasks from the overloaded cpus. This is achieved by:

- Quit early in periodic load balancing if the cpu becomes
no idle anymore. This is similar to what we do in newly-
idle case in which we stop balancing once we got some
work to do (althrough this is partly due to newly-idle
can be very frequent, while periodic balancing is not).

- The newly-idle balancing will try harder to pull the non-
idle tasks if overloaded cpus exist.

In this way we will fill the unoccupied cpus more proactively to get
more cpu capacity for the non-idle tasks.

Signed-off-by: Abel Wu <wuyun.abel@xxxxxxxxxxxxx>
---
include/linux/sched/idle.h | 1 +
kernel/sched/core.c | 1 +
kernel/sched/fair.c | 145 ++++++++++++++++++++++++++++++++++++++++++---
kernel/sched/sched.h | 2 +
4 files changed, 142 insertions(+), 7 deletions(-)

diff --git a/include/linux/sched/idle.h b/include/linux/sched/idle.h
index d73d314d59c6..50ec5c770f85 100644
--- a/include/linux/sched/idle.h
+++ b/include/linux/sched/idle.h
@@ -8,6 +8,7 @@ enum cpu_idle_type {
CPU_IDLE,
CPU_NOT_IDLE,
CPU_NEWLY_IDLE,
+ CPU_SCHED_IDLE,
CPU_MAX_IDLE_TYPES
};

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index a372881f8eaf..c05c39541c4e 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -9495,6 +9495,7 @@ void __init sched_init(void)
rq->wake_stamp = jiffies;
rq->wake_avg_idle = rq->avg_idle;
rq->max_idle_balance_cost = sysctl_sched_migration_cost;
+ rq->sched_idle_balance = 0;
rq->overloaded = 0;

INIT_LIST_HEAD(&rq->cfs_tasks);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index fbeb05321615..5fca3bb98273 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -456,6 +456,21 @@ static int se_is_idle(struct sched_entity *se)
return cfs_rq_is_idle(group_cfs_rq(se));
}

+/* Is this an idle task */
+static int task_h_idle(struct task_struct *p)
+{
+ struct sched_entity *se = &p->se;
+
+ if (task_has_idle_policy(p))
+ return 1;
+
+ for_each_sched_entity(se)
+ if (cfs_rq_is_idle(cfs_rq_of(se)))
+ return 1;
+
+ return 0;
+}
+
#else /* !CONFIG_FAIR_GROUP_SCHED */

#define for_each_sched_entity(se) \
@@ -508,6 +523,11 @@ static int se_is_idle(struct sched_entity *se)
return 0;
}

+static inline int task_h_idle(struct task_struct *p)
+{
+ return task_has_idle_policy(p);
+}
+
#endif /* CONFIG_FAIR_GROUP_SCHED */

static __always_inline
@@ -7039,6 +7059,16 @@ static inline bool cfs_rq_overloaded(struct rq *rq)
return rq->cfs.h_nr_running - rq->cfs.idle_h_nr_running > 1;
}

+static inline bool cfs_rq_busy(struct rq *rq)
+{
+ return rq->cfs.h_nr_running - rq->cfs.idle_h_nr_running == 1;
+}
+
+static inline bool need_pull_cfs_task(struct rq *rq)
+{
+ return rq->cfs.h_nr_running == rq->cfs.idle_h_nr_running;
+}
+
/*
* Use locality-friendly rq->overloaded to cache the status of the rq
* to minimize the heavy cost on LLC shared data.
@@ -7837,6 +7867,22 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
if (kthread_is_per_cpu(p))
return 0;

+ if (unlikely(task_h_idle(p))) {
+ /*
+ * Disregard hierarchically idle tasks during sched-idle
+ * load balancing.
+ */
+ if (env->idle == CPU_SCHED_IDLE)
+ return 0;
+ } else if (!static_branch_unlikely(&sched_asym_cpucapacity)) {
+ /*
+ * It's not gonna help if stacking non-idle tasks on one
+ * cpu while leaving some idle.
+ */
+ if (cfs_rq_busy(env->src_rq) && !need_pull_cfs_task(env->dst_rq))
+ return 0;
+ }
+
if (!cpumask_test_cpu(env->dst_cpu, p->cpus_ptr)) {
int cpu;

@@ -10337,6 +10383,68 @@ static inline bool update_newidle_cost(struct sched_domain *sd, u64 cost)
}

/*
+ * The sched-idle balancing tries to make full use of cpu capacity
+ * for non-idle tasks by pulling them for the unoccupied cpus from
+ * the overloaded ones.
+ *
+ * Return 1 if pulled successfully, 0 otherwise.
+ */
+static int sched_idle_balance(struct rq *dst_rq)
+{
+ struct sched_domain *sd;
+ struct task_struct *p = NULL;
+ int dst_cpu = cpu_of(dst_rq), cpu;
+
+ sd = rcu_dereference(per_cpu(sd_llc, dst_cpu));
+ if (unlikely(!sd))
+ return 0;
+
+ if (!atomic_read(&sd->shared->nr_overloaded))
+ return 0;
+
+ for_each_cpu_wrap(cpu, sdo_mask(sd->shared), dst_cpu + 1) {
+ struct rq *rq = cpu_rq(cpu);
+ struct rq_flags rf;
+ struct lb_env env;
+
+ if (cpu == dst_cpu || !cfs_rq_overloaded(rq) ||
+ READ_ONCE(rq->sched_idle_balance))
+ continue;
+
+ WRITE_ONCE(rq->sched_idle_balance, 1);
+ rq_lock_irqsave(rq, &rf);
+
+ env = (struct lb_env) {
+ .sd = sd,
+ .dst_cpu = dst_cpu,
+ .dst_rq = dst_rq,
+ .src_cpu = cpu,
+ .src_rq = rq,
+ .idle = CPU_SCHED_IDLE, /* non-idle only */
+ .flags = LBF_DST_PINNED, /* pin dst_cpu */
+ };
+
+ update_rq_clock(rq);
+ p = detach_one_task(&env);
+ if (p)
+ update_overload_status(rq);
+
+ rq_unlock(rq, &rf);
+ WRITE_ONCE(rq->sched_idle_balance, 0);
+
+ if (p) {
+ attach_one_task(dst_rq, p);
+ local_irq_restore(rf.flags);
+ return 1;
+ }
+
+ local_irq_restore(rf.flags);
+ }
+
+ return 0;
+}
+
+/*
* It checks each scheduling domain to see if it is due to be balanced,
* and initiates a balancing operation if so.
*
@@ -10356,6 +10464,15 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
u64 max_cost = 0;

rcu_read_lock();
+
+ /*
+ * Quit early if this cpu is no idle any more. It might not be a
+ * problem since we have already made some contribution to fix
+ * imbalance.
+ */
+ if (need_pull_cfs_task(rq) && sched_idle_balance(rq))
+ continue_balancing = 0;
+
for_each_domain(cpu, sd) {
/*
* Decay the newidle max times here because this is a regular
@@ -10934,7 +11051,8 @@ static int newidle_balance(struct rq *this_rq, struct rq_flags *rf)
int this_cpu = this_rq->cpu;
u64 t0, t1, curr_cost = 0;
struct sched_domain *sd;
- int pulled_task = 0;
+ struct sched_domain_shared *sds;
+ int pulled_task = 0, has_overloaded_cpus = 0;

update_misfit_status(NULL, this_rq);

@@ -10985,6 +11103,11 @@ static int newidle_balance(struct rq *this_rq, struct rq_flags *rf)
update_blocked_averages(this_cpu);

rcu_read_lock();
+
+ sds = rcu_dereference(per_cpu(sd_llc_shared, this_cpu));
+ if (likely(sds))
+ has_overloaded_cpus = atomic_read(&sds->nr_overloaded);
+
for_each_domain(this_cpu, sd) {
int continue_balancing = 1;
u64 domain_cost;
@@ -10996,9 +11119,9 @@ static int newidle_balance(struct rq *this_rq, struct rq_flags *rf)

if (sd->flags & SD_BALANCE_NEWIDLE) {

- pulled_task = load_balance(this_cpu, this_rq,
- sd, CPU_NEWLY_IDLE,
- &continue_balancing);
+ pulled_task |= load_balance(this_cpu, this_rq,
+ sd, CPU_NEWLY_IDLE,
+ &continue_balancing);

t1 = sched_clock_cpu(this_cpu);
domain_cost = t1 - t0;
@@ -11006,13 +11129,21 @@ static int newidle_balance(struct rq *this_rq, struct rq_flags *rf)

curr_cost += domain_cost;
t0 = t1;
+
+ /*
+ * Stop searching for tasks to pull if there are
+ * now runnable tasks on this rq, given that no
+ * overloaded cpu can be found on this LLC.
+ */
+ if (pulled_task && !has_overloaded_cpus)
+ break;
}

/*
- * Stop searching for tasks to pull if there are
- * now runnable tasks on this rq.
+ * Try harder to pull non-idle tasks to let them use as more
+ * cpu capacity as it can be.
*/
- if (pulled_task || this_rq->nr_running > 0 ||
+ if (this_rq->nr_running > this_rq->cfs.idle_h_nr_running ||
this_rq->ttwu_pending)
break;
}
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index afa1bb68c3ec..dcceaec8d8b4 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1012,6 +1012,8 @@ struct rq {

unsigned char nohz_idle_balance;
unsigned char idle_balance;
+
+ unsigned char sched_idle_balance;
unsigned char overloaded;

unsigned long misfit_task_load;
--
2.11.0