Re: [RFC tg_shares_up improvements - v1 01/12] sched: rewrite tg_shares_up

From: Paul Turner
Date: Thu Oct 21 2010 - 02:29:31 EST


On Wed, Oct 20, 2010 at 11:04 PM, Bharata B Rao
<bharata@xxxxxxxxxxxxxxxxxx> wrote:
> On Fri, Oct 15, 2010 at 09:43:50PM -0700, pjt@xxxxxxxxxx wrote:
>> From: Peter Zijlstra <a.p.zijlstra@xxxxxxxxx>
>>
>> By tracking a per-cpu load-avg for each cfs_rq and folding it into a
>> global task_group load on each tick we can rework tg_shares_up to be
>> strictly per-cpu.
>>
>> This should improve cpu-cgroup performance for smp systems
>> significantly.
>>
>> [ Paul: changed to use queueing cfs_rq ]
>>
>> Signed-off-by: Peter Zijlstra <a.p.zijlstra@xxxxxxxxx>
>> Signed-off-by: Paul Turner <pjt@xxxxxxxxxx>
>>
>> Index: kernel/sched_fair.c
>> ===================================================================
>> --- kernel/sched_fair.c.orig
>> +++ kernel/sched_fair.c
>> @@ -417,7 +417,6 @@ int sched_proc_update_handler(struct ctl
>>       WRT_SYSCTL(sched_min_granularity);
>>       WRT_SYSCTL(sched_latency);
>>       WRT_SYSCTL(sched_wakeup_granularity);
>> -     WRT_SYSCTL(sched_shares_ratelimit);
>>  #undef WRT_SYSCTL
>>
>>       return 0;
>> @@ -633,7 +632,6 @@ account_entity_enqueue(struct cfs_rq *cf
>>               list_add(&se->group_node, &cfs_rq->tasks);
>>       }
>>       cfs_rq->nr_running++;
>> -     se->on_rq = 1;
>>  }
>>
>>  static void
>> @@ -647,9 +645,89 @@ account_entity_dequeue(struct cfs_rq *cf
>>               list_del_init(&se->group_node);
>>       }
>>       cfs_rq->nr_running--;
>> -     se->on_rq = 0;
>>  }
>>
>> +#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED
>> +static void update_cfs_load(struct cfs_rq *cfs_rq)
>> +{
>> +     u64 period = sched_avg_period();
>> +     u64 now, delta;
>> +
>> +     if (!cfs_rq)
>> +             return;
>> +
>> +     now = rq_of(cfs_rq)->clock;
>> +     delta = now - cfs_rq->load_stamp;
>> +
>> +     cfs_rq->load_stamp = now;
>> +     cfs_rq->load_period += delta;
>> +     cfs_rq->load_avg += delta * cfs_rq->load.weight;
>> +
>> +     while (cfs_rq->load_period > period) {
>> +             /*
>> +              * Inline assembly required to prevent the compiler
>> +              * optimising this loop into a divmod call.
>> +              * See __iter_div_u64_rem() for another example of this.
>> +              */
>> +             asm("" : "+rm" (cfs_rq->load_period));
>> +             cfs_rq->load_period /= 2;
>> +             cfs_rq->load_avg /= 2;
>> +     }
>> +}
>> +
>> +static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
>> +                         unsigned long weight)
>> +{
>> +     if (se->on_rq)
>> +             account_entity_dequeue(cfs_rq, se);
>> +
>> +     update_load_set(&se->load, weight);
>> +
>> +     if (se->on_rq)
>> +             account_entity_enqueue(cfs_rq, se);
>> +}
>> +
>> +static void update_cfs_shares(struct cfs_rq *cfs_rq)
>> +{
>> +     struct task_group *tg;
>> +     struct sched_entity *se;
>> +     long load_weight, load, shares;
>> +
>> +     if (!cfs_rq)
>> +             return;
>> +
>> +     tg = cfs_rq->tg;
>> +     se = tg->se[cpu_of(rq_of(cfs_rq))];
>> +     if (!se)
>> +             return;
>> +
>> +     load = cfs_rq->load.weight;
>> +
>> +     load_weight = atomic_read(&tg->load_weight);
>> +     load_weight -= cfs_rq->load_contribution;
>> +     load_weight += load;
>> +
>> +     shares = (tg->shares * load);
>> +     if (load_weight)
>> +             shares /= load_weight;
>> +
>> +     if (shares < MIN_SHARES)
>> +             shares = MIN_SHARES;
>> +     if (shares > tg->shares)
>> +             shares = tg->shares;
>> +
>> +     reweight_entity(cfs_rq_of(se), se, shares);
>> +}
>> +#else /* CONFIG_FAIR_GROUP_SCHED */
>> +static inline void update_cfs_load(struct cfs_rq *cfs_rq)
>> +{
>> +}
>> +
>> +static inline void update_cfs_shares(struct cfs_rq *cfs_rq)
>> +{
>> +}
>> +#endif /* CONFIG_FAIR_GROUP_SCHED */
>> +
>>  static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
>>  {
>>  #ifdef CONFIG_SCHEDSTATS
>> @@ -771,7 +849,9 @@ enqueue_entity(struct cfs_rq *cfs_rq, st
>>        * Update run-time statistics of the 'current'.
>>        */
>>       update_curr(cfs_rq);
>> +     update_cfs_load(cfs_rq);
>>       account_entity_enqueue(cfs_rq, se);
>
> By placing update_cfs_load() before account_entity_enqueue(), you are
> updating cfs_rq->load_avg before actually taking into account the current
> load increment due to enqueuing. I see the same in dequeue also. Is there a
> reason for this?

Yes -- the update covers the interval between the previous update
(tracked with load_stamp) and the present. That interval elapsed
before the weight delta above is applied, so the new weight only becomes
meaningful against the _next_ interval we account.

>
>> +     update_cfs_shares(cfs_rq_of(se));
>
> Isn't cfs_rq_of(se) the same as the cfs_rq that enqueue_entity() gets
> from enqueue_task_fair()? Same for the dequeue case.
>

Yup.. no need for it, will fix.

Thanks

> Regards,
> Bharata.
>
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/