[PATCH 2/2] sched/fair: Update blocked load from newly idle balance

From: Brendan Jackman
Date: Tue Oct 24 2017 - 08:26:14 EST


We now have a NOHZ kick to avoid the load of idle CPUs becoming stale. This is
good, but it brings about CPU wakeups, which have an energy cost. As an
alternative to waking CPUs up to decay blocked load, we can sometimes do it
from newly idle balance. If the newly idle balance is on a domain that covers
all the currently nohz-idle CPUs, we push the value of nohz.next_update into the
future. That means that if such newly idle balances happen often enough, we
never need to wake up a CPU just to update load.

Since we're doing this new update inside a for_each_domain, we need to do
something to avoid doing multiple updates on the same CPU in the same
idle_balance. A tick stamp is set on the rq in update_blocked_averages as a
simple way to do this. Using a simple jiffies-based timestamp, as opposed to the
last_update_time of the root cfs_rq's sched_avg, means we can do this without
taking the rq lock.

Cc: Dietmar Eggemann <dietmar.eggemann@xxxxxxx>
Cc: Vincent Guittot <vincent.guittot@xxxxxxxxxx>
Cc: Ingo Molnar <mingo@xxxxxxxxxx>
Cc: Morten Rasmussen <morten.rasmussen@xxxxxxx>
Cc: Peter Zijlstra <peterz@xxxxxxxxxxxxx>
Signed-off-by: Brendan Jackman <brendan.jackman@xxxxxxx>
---
kernel/sched/core.c | 1 +
kernel/sched/fair.c | 41 +++++++++++++++++++++++++++++++++++------
kernel/sched/sched.h | 1 +
3 files changed, 37 insertions(+), 6 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index d17c5da523a0..d8e71fd27806 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5923,6 +5923,7 @@ void __init sched_init(void)
rq_attach_root(rq, &def_root_domain);
#ifdef CONFIG_NO_HZ_COMMON
rq->last_load_update_tick = jiffies;
+ rq->last_blocked_load_update_tick = jiffies;
rq->nohz_flags = 0;
#endif
#ifdef CONFIG_NO_HZ_FULL
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 9085caf49c76..45e9c8056161 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -7062,6 +7062,7 @@ static void update_blocked_averages(int cpu)
if (cfs_rq_is_decayed(cfs_rq))
list_del_leaf_cfs_rq(cfs_rq);
}
+ rq->last_blocked_load_update_tick = jiffies;
rq_unlock_irqrestore(rq, &rf);
}

@@ -7121,6 +7122,7 @@ static inline void update_blocked_averages(int cpu)
rq_lock_irqsave(rq, &rf);
update_rq_clock(rq);
update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq);
+ rq->last_blocked_load_update_tick = jiffies;
rq_unlock_irqrestore(rq, &rf);
}

@@ -7615,6 +7617,15 @@ static inline enum fbq_type fbq_classify_rq(struct rq *rq)
}
#endif /* CONFIG_NUMA_BALANCING */

+#ifdef CONFIG_NO_HZ_COMMON
+static struct {
+ cpumask_var_t idle_cpus_mask;
+ atomic_t nr_cpus;
+ unsigned long next_balance; /* in jiffy units */
+ unsigned long next_update; /* in jiffy units */
+} nohz ____cacheline_aligned;
+#endif
+
/**
* update_sd_lb_stats - Update sched_domain's statistics for load balancing.
* @env: The load balancing environment.
@@ -7633,6 +7644,30 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
if (child && child->flags & SD_PREFER_SIBLING)
prefer_sibling = 1;

+#ifdef CONFIG_NO_HZ_COMMON
+ if (env->idle == CPU_NEWLY_IDLE) {
+ int cpu;
+
+ /* Update the stats of NOHZ idle CPUs in the sd */
+ for_each_cpu_and(cpu, sched_domain_span(env->sd),
+ nohz.idle_cpus_mask) {
+ struct rq *rq = cpu_rq(cpu);
+
+ /* ... Unless we've already done since the last tick */
+ if (time_after(jiffies,
+ rq->last_blocked_load_update_tick))
+ update_blocked_averages(cpu);
+ }
+ }
+ /*
+ * If we've just updated all of the NOHZ idle CPUs, then we can push
+ * back the next nohz.next_update, which will prevent an unnecessary
+ * wakeup for the nohz stats kick
+ */
+ if (cpumask_subset(nohz.idle_cpus_mask, sched_domain_span(env->sd)))
+ nohz.next_update = jiffies + LOAD_AVG_PERIOD;
+#endif
+
load_idx = get_sd_load_idx(env->sd, env->idle);

do {
@@ -8657,12 +8692,6 @@ static inline int on_null_domain(struct rq *rq)
* needed, they will kick the idle load balancer, which then does idle
* load balancing for all the idle CPUs.
*/
-static struct {
- cpumask_var_t idle_cpus_mask;
- atomic_t nr_cpus;
- unsigned long next_balance; /* in jiffy units */
- unsigned long next_update; /* in jiffy units */
-} nohz ____cacheline_aligned;

static inline int find_new_ilb(void)
{
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 6f95ef653f73..6be8938bb977 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -681,6 +681,7 @@ struct rq {
#ifdef CONFIG_NO_HZ_COMMON
#ifdef CONFIG_SMP
unsigned long last_load_update_tick;
+ unsigned long last_blocked_load_update_tick;
#endif /* CONFIG_SMP */
unsigned long nohz_flags;
#endif /* CONFIG_NO_HZ_COMMON */
--
2.14.1