Re: [PATCH 1/3] sched: Stop nohz stats when decayed

From: Vincent Guittot
Date: Tue Feb 06 2018 - 09:31:59 EST


On 6 February 2018 at 15:16, Valentin Schneider
<valentin.schneider@xxxxxxx> wrote:
> Hi Vincent,
>
> On 02/06/2018 08:32 AM, Vincent Guittot wrote:
>> Stopped the periodic update of blocked load when all idle CPUs have fully
>> decayed. We introduce a new nohz.has_blocked that reflect if some idle
>> CPUs has blocked load that have to be periodiccally updated. nohz.has_blocked
>> is set everytime that a Idle CPU can have blocked load and it is then clear
>> when no more blocked load has been detected during an update. We don't need
>> atomic operation but only to make cure of the right ordering when updating
>> nohz.idle_cpus_mask and nohz.has_blocked.
>>
>> Suggested-by: Peter Zijlstra (Intel) <peterz@xxxxxxxxxxxxx>
>> Signed-off-by: Vincent Guittot <vincent.guittot@xxxxxxxxxx>
>> ---
>> kernel/sched/fair.c | 94 +++++++++++++++++++++++++++++++++++++++++-----------
>> kernel/sched/sched.h | 1 +
>> 2 files changed, 75 insertions(+), 20 deletions(-)
>>
>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>> index 7af1fa9..279f4b2 100644
>> --- a/kernel/sched/fair.c
>> +++ b/kernel/sched/fair.c
>> @@ -5383,8 +5383,9 @@ decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
>> static struct {
>> cpumask_var_t idle_cpus_mask;
>> atomic_t nr_cpus;
>> + int has_blocked; /* Idle CPUS has blocked load */
>> unsigned long next_balance; /* in jiffy units */
>> - unsigned long next_stats;
>> + unsigned long next_blocked; /* Next update of blocked load in jiffies */
>> } nohz ____cacheline_aligned;
>>
>> #endif /* CONFIG_NO_HZ_COMMON */
>> @@ -6951,6 +6952,7 @@ enum fbq_type { regular, remote, all };
>> #define LBF_DST_PINNED 0x04
>> #define LBF_SOME_PINNED 0x08
>> #define LBF_NOHZ_STATS 0x10
>> +#define LBF_NOHZ_AGAIN 0x20
>>
>> struct lb_env {
>> struct sched_domain *sd;
>> @@ -7335,8 +7337,6 @@ static void attach_tasks(struct lb_env *env)
>> rq_unlock(env->dst_rq, &rf);
>> }
>>
>> -#ifdef CONFIG_FAIR_GROUP_SCHED
>> -
>> static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
>> {
>> if (cfs_rq->load.weight)
>> @@ -7354,11 +7354,14 @@ static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
>> return true;
>> }
>>
>> +#ifdef CONFIG_FAIR_GROUP_SCHED
>> +
>> static void update_blocked_averages(int cpu)
>> {
>> struct rq *rq = cpu_rq(cpu);
>> struct cfs_rq *cfs_rq, *pos;
>> struct rq_flags rf;
>> + bool done = true;
>>
>> rq_lock_irqsave(rq, &rf);
>> update_rq_clock(rq);
>> @@ -7388,10 +7391,14 @@ static void update_blocked_averages(int cpu)
>> */
>> if (cfs_rq_is_decayed(cfs_rq))
>> list_del_leaf_cfs_rq(cfs_rq);
>> + else
>> + done = false;
>> }
>>
>> #ifdef CONFIG_NO_HZ_COMMON
>> rq->last_blocked_load_update_tick = jiffies;
>> + if (done)
>> + rq->has_blocked_load = 0;
>> #endif
>> rq_unlock_irqrestore(rq, &rf);
>> }
>> @@ -7454,6 +7461,8 @@ static inline void update_blocked_averages(int cpu)
>> update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq);
>> #ifdef CONFIG_NO_HZ_COMMON
>> rq->last_blocked_load_update_tick = jiffies;
>> + if (cfs_rq_is_decayed(cfs_rq))
>> + rq->has_blocked_load = 0;
>> #endif
>> rq_unlock_irqrestore(rq, &rf);
>> }
>> @@ -7789,18 +7798,25 @@ group_type group_classify(struct sched_group *group,
>> return group_other;
>> }
>>
>> -static void update_nohz_stats(struct rq *rq)
>> +static bool update_nohz_stats(struct rq *rq)
>> {
>> #ifdef CONFIG_NO_HZ_COMMON
>> unsigned int cpu = rq->cpu;
>>
>> + if (!rq->has_blocked_load)
>> + return false;
>> +
>> if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask))
>> - return;
>> + return false;
>>
>> if (!time_after(jiffies, rq->last_blocked_load_update_tick))
>
> I forgot to ask this on the initial thread - what's the point of this condition ? At first glance I would have said that this would make more sense:
>
> if (!time_after(jiffies, rq->last_blocked_load_update_tick + msecs_to_jiffies(LOAD_AVG_PERIOD))
> return false;
>
> But maybe it's simply there to skip an update if it has already been done in the same jiffy interval ?

That's exactly the purpose.

>
>> - return;
>> + return true;
>>
>> update_blocked_averages(cpu);
>> +
>> + return rq->has_blocked_load;
>> +#else
>> + return false;
>> #endif
>> }
>>
>> @@ -7826,8 +7842,8 @@ static inline void update_sg_lb_stats(struct lb_env *env,
>> for_each_cpu_and(i, sched_group_span(group), env->cpus) {
>> struct rq *rq = cpu_rq(i);
>>
>> - if (env->flags & LBF_NOHZ_STATS)
>> - update_nohz_stats(rq);
>> + if ((env->flags & LBF_NOHZ_STATS) && update_nohz_stats(rq))
>> + env->flags |= LBF_NOHZ_AGAIN;
>>
>> /* Bias balancing toward cpus of our domain */
>> if (local_group)
>> @@ -7979,18 +7995,15 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
>> struct sg_lb_stats *local = &sds->local_stat;
>> struct sg_lb_stats tmp_sgs;
>> int load_idx, prefer_sibling = 0;
>> + int has_blocked = READ_ONCE(nohz.has_blocked);
>> bool overload = false;
>>
>> if (child && child->flags & SD_PREFER_SIBLING)
>> prefer_sibling = 1;
>>
>> #ifdef CONFIG_NO_HZ_COMMON
>> - if (env->idle == CPU_NEWLY_IDLE) {
>> + if (env->idle == CPU_NEWLY_IDLE && has_blocked)
>> env->flags |= LBF_NOHZ_STATS;
>> -
>> - if (cpumask_subset(nohz.idle_cpus_mask, sched_domain_span(env->sd)))
>> - nohz.next_stats = jiffies + msecs_to_jiffies(LOAD_AVG_PERIOD);
>> - }
>> #endif
>>
>> load_idx = get_sd_load_idx(env->sd, env->idle);
>> @@ -8046,6 +8059,15 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
>> sg = sg->next;
>> } while (sg != env->sd->groups);
>>
>> +#ifdef CONFIG_NO_HZ_COMMON
>> + if ((env->flags & LBF_NOHZ_AGAIN) &&
>> + cpumask_subset(nohz.idle_cpus_mask, sched_domain_span(env->sd))) {
>> +
>> + WRITE_ONCE(nohz.next_blocked,
>> + jiffies + msecs_to_jiffies(LOAD_AVG_PERIOD));
>> + }
>> +#endif
>> +
>> if (env->sd->flags & SD_NUMA)
>> env->fbq_type = fbq_classify_group(&sds->busiest_stat);
>>
>> @@ -9069,6 +9091,8 @@ static void nohz_balancer_kick(struct rq *rq)
>> struct sched_domain *sd;
>> int nr_busy, i, cpu = rq->cpu;
>> unsigned int flags = 0;
>> + unsigned long has_blocked = READ_ONCE(nohz.has_blocked);
>> + unsigned long next = READ_ONCE(nohz.next_blocked);
>
> What about something slightly more explicit, e.g. next_stats/next_blocked ? There's also nohz.next_balance referenced here so IMO it's best to keep things clear.

ok

>
>>
>> if (unlikely(rq->idle_balance))
>> return;
>> @@ -9086,7 +9110,7 @@ static void nohz_balancer_kick(struct rq *rq)
>> if (likely(!atomic_read(&nohz.nr_cpus)))
>> return;
>>
>> - if (time_after(now, nohz.next_stats))
>> + if (time_after(now, next) && has_blocked)
>> flags = NOHZ_STATS_KICK;
>>
>> if (time_before(now, nohz.next_balance))
>> @@ -9207,13 +9231,15 @@ void nohz_balance_enter_idle(int cpu)
>> if (!housekeeping_cpu(cpu, HK_FLAG_SCHED))
>> return;
>>
>> + rq->has_blocked_load = 1;
>> +
>> if (rq->nohz_tick_stopped)
>> - return;
>> + goto out;
>>
>> /*
>> * If we're a completely isolated CPU, we don't play.
>> */
>> - if (on_null_domain(cpu_rq(cpu)))
>> + if (on_null_domain(rq))
>> return;
>>
>> rq->nohz_tick_stopped = 1;
>> @@ -9222,6 +9248,13 @@ void nohz_balance_enter_idle(int cpu)
>> atomic_inc(&nohz.nr_cpus);
>>
>> set_cpu_sd_state_idle(cpu);
>> +
>> +out:
>> + /*
>> + * Each time a cpu enter idle, we assume that it has blocked load and
>> + * enable the periodic update of the load of idle cpus
>> + */
>> + WRITE_ONCE(nohz.has_blocked, 1);
>> }
>> #else
>> static inline void nohz_balancer_kick(struct rq *rq) { }
>> @@ -9374,6 +9407,16 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
>>
>> SCHED_WARN_ON((flags & NOHZ_KICK_MASK) == NOHZ_BALANCE_KICK);
>>
>> + /*
>> + * We assume there will be no idle load after this update and clear
>> + * the stats state. If a cpu enters idle in the mean time, it will
>
> s/stats state/has_blocked flag/ (repeated a few times in the comment block)

yes. I will update others as well

>
>> + * set the stats state and trig another update of idle load.
>> + * Because a cpu that becomes idle, is added to idle_cpus_mask before
>> + * setting the stats state, we are sure to not clear the state and not
>> + * check the load of an idle cpu.
>> + */
>> + WRITE_ONCE(nohz.has_blocked, 0);
>> +
>> for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
>> if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))
>> continue;
>> @@ -9383,11 +9426,16 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
>> * work being done for other cpus. Next load
>> * balancing owner will pick it up.
>> */
>> - if (need_resched())
>> - break;
>> + if (need_resched()) {
>> + has_blocked_load = true;
>> + goto abort;
>> + }
>>
>> rq = cpu_rq(balance_cpu);
>>
>> + update_blocked_averages(rq->cpu);
>> + has_blocked_load |= rq->has_blocked_load;
>> +
>> /*
>> * If time for next balance is due,
>> * do the balance.
>> @@ -9400,7 +9448,6 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
>> cpu_load_update_idle(rq);
>> rq_unlock_irq(rq, &rf);
>>
>> - update_blocked_averages(rq->cpu);
>> if (flags & NOHZ_BALANCE_KICK)
>> rebalance_domains(rq, CPU_IDLE);
>> }
>> @@ -9415,7 +9462,13 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
>> if (flags & NOHZ_BALANCE_KICK)
>> rebalance_domains(this_rq, CPU_IDLE);
>>
>> - nohz.next_stats = next_stats;
>> + WRITE_ONCE(nohz.next_blocked,
>> + now + msecs_to_jiffies(LOAD_AVG_PERIOD));
>> +
>> +abort:
>> + /* There is still blocked load, enable periodic update */
>> + if (has_blocked_load)
>> + WRITE_ONCE(nohz.has_blocked, 1);
>>
>> /*
>> * next_balance will be updated only when there is a need.
>> @@ -10046,6 +10099,7 @@ __init void init_sched_fair_class(void)
>>
>> #ifdef CONFIG_NO_HZ_COMMON
>> nohz.next_balance = jiffies;
>> + nohz.next_blocked = jiffies;
>> zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
>> #endif
>> #endif /* SMP */
>> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
>> index e200045..ad9b929 100644
>> --- a/kernel/sched/sched.h
>> +++ b/kernel/sched/sched.h
>> @@ -723,6 +723,7 @@ struct rq {
>> #ifdef CONFIG_SMP
>> unsigned long last_load_update_tick;
>> unsigned long last_blocked_load_update_tick;
>> + unsigned int has_blocked_load;
>> #endif /* CONFIG_SMP */
>> unsigned int nohz_tick_stopped;
>> atomic_t nohz_flags;
>>