Re: [PATCH 02/10] sched: Factor out code to should_we_balance()

From: Paul Turner
Date: Thu Aug 22 2013 - 05:59:08 EST


On Mon, Aug 19, 2013 at 9:01 AM, Peter Zijlstra <peterz@xxxxxxxxxxxxx> wrote:
> From: Joonsoo Kim <iamjoonsoo.kim@xxxxxxx>
>
> Now checking whether this cpu is appropriate to balance or not
> is embedded into update_sg_lb_stats() and this checking has no direct
> relationship to this function. There is not enough reason to place
> this checking at update_sg_lb_stats(), except saving one iteration
> for sched_group_cpus.
>
> In this patch, I factor out this checking to should_we_balance() function.
> And before doing actual work for load_balancing, check whether this cpu is
> appropriate to balance via should_we_balance(). If this cpu is not
> a candidate for balancing, it quit the work immediately.
>
> With this change, we can save two memset cost and can expect better
> compiler optimization.
>
> Below is result of this patch.
>
> * Vanilla *
> text data bss dec hex filename
> 34499 1136 116 35751 8ba7 kernel/sched/fair.o
>
> * Patched *
> text data bss dec hex filename
> 34243 1136 116 35495 8aa7 kernel/sched/fair.o
>
> In addition, rename @balance to @should_balance in order to represent
> its purpose more clearly.
>
> Signed-off-by: Joonsoo Kim <iamjoonsoo.kim@xxxxxxx>
> [peterz: style changes and a fix in should_we_balance() ]
> Signed-off-by: Peter Zijlstra <peterz@xxxxxxxxxxxxx>
> Link: http://lkml.kernel.org/r/1375778203-31343-3-git-send-email-iamjoonsoo.kim@xxxxxxx
> ---
> kernel/sched/fair.c | 111 +++++++++++++++++++++++++---------------------------
> 1 file changed, 54 insertions(+), 57 deletions(-)
>
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -4510,22 +4510,17 @@ fix_small_capacity(struct sched_domain *
> * @group: sched_group whose statistics are to be updated.
> * @load_idx: Load index of sched_domain of this_cpu for load calc.
> * @local_group: Does group contain this_cpu.
> - * @balance: Should we balance.
> * @sgs: variable to hold the statistics for this group.
> */
> static inline void update_sg_lb_stats(struct lb_env *env,
> struct sched_group *group, int load_idx,
> - int local_group, int *balance, struct sg_lb_stats *sgs)
> + int local_group, struct sg_lb_stats *sgs)
> {
> unsigned long nr_running, max_nr_running, min_nr_running;
> unsigned long load, max_cpu_load, min_cpu_load;
> - unsigned int balance_cpu = -1, first_idle_cpu = 0;
> unsigned long avg_load_per_task = 0;
> int i;
>
> - if (local_group)
> - balance_cpu = group_balance_cpu(group);
> -
> /* Tally up the load of all CPUs in the group */
> max_cpu_load = 0;
> min_cpu_load = ~0UL;
> @@ -4538,15 +4533,9 @@ static inline void update_sg_lb_stats(st
> nr_running = rq->nr_running;
>
> /* Bias balancing toward cpus of our domain */
> - if (local_group) {
> - if (idle_cpu(i) && !first_idle_cpu &&
> - cpumask_test_cpu(i, sched_group_mask(group))) {
> - first_idle_cpu = 1;
> - balance_cpu = i;
> - }
> -
> + if (local_group)
> load = target_load(i, load_idx);

Keep the braces here:

if (local_group) {
load = target_load(i, load_idx);
} else {
...

> - } else {
> + else {
> load = source_load(i, load_idx);
> if (load > max_cpu_load)
> max_cpu_load = load;
> @@ -4566,22 +4555,9 @@ static inline void update_sg_lb_stats(st
> sgs->idle_cpus++;
> }
>
> - /*
> - * First idle cpu or the first cpu(busiest) in this sched group
> - * is eligible for doing load balancing at this and above
> - * domains. In the newly idle case, we will allow all the cpu's
> - * to do the newly idle load balance.
> - */
> - if (local_group) {
> - if (env->idle != CPU_NEWLY_IDLE) {
> - if (balance_cpu != env->dst_cpu) {
> - *balance = 0;
> - return;
> - }
> - update_group_power(env->sd, env->dst_cpu);
> - } else if (time_after_eq(jiffies, group->sgp->next_update))
> - update_group_power(env->sd, env->dst_cpu);
> - }
> + if (local_group && (env->idle != CPU_NEWLY_IDLE ||
> + time_after_eq(jiffies, group->sgp->next_update)))
> + update_group_power(env->sd, env->dst_cpu);
>
> /* Adjust by relative CPU power of the group */
> sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / group->sgp->power;
> @@ -4663,7 +4639,7 @@ static bool update_sd_pick_busiest(struc
> * @sds: variable to hold the statistics for this sched_domain.
> */
> static inline void update_sd_lb_stats(struct lb_env *env,
> - int *balance, struct sd_lb_stats *sds)
> + struct sd_lb_stats *sds)
> {
> struct sched_domain *child = env->sd->child;
> struct sched_group *sg = env->sd->groups;
> @@ -4680,10 +4656,7 @@ static inline void update_sd_lb_stats(st
>
> local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg));
> memset(&sgs, 0, sizeof(sgs));
> - update_sg_lb_stats(env, sg, load_idx, local_group, balance, &sgs);
> -
> - if (local_group && !(*balance))
> - return;
> + update_sg_lb_stats(env, sg, load_idx, local_group, &sgs);
>
> sds->total_load += sgs.group_load;
> sds->total_pwr += sg->sgp->power;
> @@ -4916,8 +4889,6 @@ static inline void calculate_imbalance(s
> * to restore balance.
> *
> * @env: The load balancing environment.
> - * @balance: Pointer to a variable indicating if this_cpu
> - * is the appropriate cpu to perform load balancing at this_level.
> *
> * Return: - The busiest group if imbalance exists.
> * - If no imbalance and user has opted for power-savings balance,
> @@ -4925,7 +4896,7 @@ static inline void calculate_imbalance(s
> * put to idle by rebalancing its tasks onto our group.
> */
> static struct sched_group *
> -find_busiest_group(struct lb_env *env, int *balance)
> +find_busiest_group(struct lb_env *env)
> {
> struct sd_lb_stats sds;
>
> @@ -4935,14 +4906,7 @@ find_busiest_group(struct lb_env *env, i
> * Compute the various statistics relavent for load balancing at
> * this level.
> */
> - update_sd_lb_stats(env, balance, &sds);
> -
> - /*
> - * this_cpu is not the appropriate cpu to perform load balancing at
> - * this level.
> - */
> - if (!(*balance))
> - goto ret;
> + update_sd_lb_stats(env, &sds);
>
> if ((env->idle == CPU_IDLE || env->idle == CPU_NEWLY_IDLE) &&
> check_asym_packing(env, &sds))
> @@ -5006,7 +4970,6 @@ find_busiest_group(struct lb_env *env, i
> return sds.busiest;
>
> out_balanced:
> -ret:
> env->imbalance = 0;
> return NULL;
> }
> @@ -5088,13 +5051,47 @@ static int need_active_balance(struct lb
>
> static int active_load_balance_cpu_stop(void *data);
>
> +static int should_we_balance(struct lb_env *env)
> +{
> + struct sched_group *sg = env->sd->groups;
> + struct cpumask *sg_cpus, *sg_mask;
> + int cpu, balance_cpu = -1;
> +
> + /*
> + * In the newly idle case, we will allow all the cpu's
> + * to do the newly idle load balance.
> + */
> + if (env->idle == CPU_NEWLY_IDLE)
> + return 1;
> +
> + sg_cpus = sched_group_cpus(sg);
> + sg_mask = sched_group_mask(sg);
> + /* Try to find first idle cpu */
> + for_each_cpu_and(cpu, sg_cpus, env->cpus) {
> + if (!cpumask_test_cpu(cpu, sg_mask) || !idle_cpu(cpu))
> + continue;
> +
> + balance_cpu = cpu;
> + break;
> + }
> +
> + if (balance_cpu == -1)
> + balance_cpu = group_balance_cpu(sg);
> +
> + /*
> + * First idle cpu or the first cpu(busiest) in this sched group
> + * is eligible for doing load balancing at this and above domains.
> + */
> + return balance_cpu != env->dst_cpu;
> +}
> +
> /*
> * Check this_cpu to ensure it is balanced within domain. Attempt to move
> * tasks if there is an imbalance.
> */
> static int load_balance(int this_cpu, struct rq *this_rq,
> struct sched_domain *sd, enum cpu_idle_type idle,
> - int *balance)
> + int *should_balance)
> {
> int ld_moved, cur_ld_moved, active_balance = 0;
> struct sched_group *group;
> @@ -5123,12 +5120,11 @@ static int load_balance(int this_cpu, st
>
> schedstat_inc(sd, lb_count[idle]);
>
> -redo:
> - group = find_busiest_group(&env, balance);
> -
> - if (*balance == 0)
> + if (!(*should_balance = should_we_balance(&env)))
> goto out_balanced;

Given we always initialize *should_balance where we care, it might be
more readable to write this as:

if (!should_we_balance(&env)) {
*continue_balancing = 0;
goto out_balanced;
}

[ With a rename to can_balance ]

>
> +redo:

One behavioral change worth noting here is that in the redo case if a
CPU has become idle we'll continue trying to load-balance in the
!new-idle case.

This could be unpleasant in the case where a package has a pinned busy
core allowing this and a newly idle cpu to start dueling for load.

While more deterministically bad in this case now, it could racily do
this before anyway so perhaps not worth worrying about immediately.

> + group = find_busiest_group(&env);
> if (!group) {
> schedstat_inc(sd, lb_nobusyg[idle]);
> goto out_balanced;
> @@ -5340,7 +5336,7 @@ void idle_balance(int this_cpu, struct r
> rcu_read_lock();
> for_each_domain(this_cpu, sd) {
> unsigned long interval;
> - int balance = 1;
> + int should_balance;
>
> if (!(sd->flags & SD_LOAD_BALANCE))
> continue;
> @@ -5348,7 +5344,8 @@ void idle_balance(int this_cpu, struct r
> if (sd->flags & SD_BALANCE_NEWIDLE) {
> /* If we've pulled tasks over stop searching: */
> pulled_task = load_balance(this_cpu, this_rq,
> - sd, CPU_NEWLY_IDLE, &balance);
> + sd, CPU_NEWLY_IDLE,
> + &should_balance);
> }
>
> interval = msecs_to_jiffies(sd->balance_interval);
> @@ -5586,7 +5583,7 @@ void update_max_interval(void)
> */
> static void rebalance_domains(int cpu, enum cpu_idle_type idle)
> {
> - int balance = 1;
> + int should_balance = 1;
> struct rq *rq = cpu_rq(cpu);
> unsigned long interval;
> struct sched_domain *sd;
> @@ -5618,7 +5615,7 @@ static void rebalance_domains(int cpu, e
> }
>
> if (time_after_eq(jiffies, sd->last_balance + interval)) {
> - if (load_balance(cpu, rq, sd, idle, &balance)) {
> + if (load_balance(cpu, rq, sd, idle, &should_balance)) {
> /*
> * The LBF_SOME_PINNED logic could have changed
> * env->dst_cpu, so we can't know our idle
> @@ -5641,7 +5638,7 @@ static void rebalance_domains(int cpu, e
> * CPU in our sched group which is doing load balancing more
> * actively.
> */
> - if (!balance)
> + if (!should_balance)
> break;
> }
> rcu_read_unlock();
>
>

Reviewed-by: Paul Turner <pjt@xxxxxxxxxx>
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/