Re: [RESEND PATCH 2/2] sched/fair: Optimize __update_sched_avg()
From: Paul Turner
Date:  Thu Mar 30 2017 - 07:21:49 EST
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -2767,7 +2767,7 @@ static const u32 __accumulated_sum_N32[]
>   * Approximate:
>   *   val * y^n,    where y^32 ~= 0.5 (~1 scheduling period)
>   */
> -static __always_inline u64 decay_load(u64 val, u64 n)
> +static u64 decay_load(u64 val, u64 n)
>  {
>         unsigned int local_n;
>
> @@ -2795,32 +2795,113 @@ static __always_inline u64 decay_load(u6
>         return val;
>  }
>
> -/*
> - * For updates fully spanning n periods, the contribution to runnable
> - * average will be: \Sum 1024*y^n
> - *
> - * We can compute this reasonably efficiently by combining:
> - *   y^PERIOD = 1/2 with precomputed \Sum 1024*y^n {for  n <PERIOD}
> - */
> -static u32 __compute_runnable_contrib(u64 n)
> +static u32 __accumulate_sum(u64 periods, u32 period_contrib, u32 remainder)
- The naming here is really ambiguous:
    "__accumulate_sum" -> "__accumulate_pelt_segments"?
- Passing in "remainder" seems irrelevant to the sum accumulation.  It would be
  more clear to handle it from the caller.
>
>  {
> -       u32 contrib = 0;
> +       u32 c1, c2, c3 = remainder; /* y^0 == 1 */
>
> -       if (likely(n <= LOAD_AVG_PERIOD))
> -               return runnable_avg_yN_sum[n];
> -       else if (unlikely(n >= LOAD_AVG_MAX_N))
> +       if (!periods)
> +               return remainder - period_contrib;
This is super confusing.  It only works because remainder already had
period_contrib aggregated _into_ it.  We're literally computing:
  remainder + period_contrib - period_contrib
We should just not call this in the !periods case and handle the remainder
below.
> +
> +       if (unlikely(periods >= LOAD_AVG_MAX_N))
>                 return LOAD_AVG_MAX;
>
> -       /* Since n < LOAD_AVG_MAX_N, n/LOAD_AVG_PERIOD < 11 */
> -       contrib = __accumulated_sum_N32[n/LOAD_AVG_PERIOD];
> -       n %= LOAD_AVG_PERIOD;
> -       contrib = decay_load(contrib, n);
> -       return contrib + runnable_avg_yN_sum[n];
> +       /*
> +        * c1 = d1 y^(p+1)
> +        */
> +       c1 = decay_load((u64)(1024 - period_contrib), periods);
> +
> +       periods -= 1;
> +       /*
> +        * For updates fully spanning n periods, the contribution to runnable
> +        * average will be:
> +        *
> +        *   c2 = 1024 \Sum y^n
> +        *
> +        * We can compute this reasonably efficiently by combining:
> +        *
> +        *   y^PERIOD = 1/2 with precomputed 1024 \Sum y^n {for: n < PERIOD}
> +        */
> +       if (likely(periods <= LOAD_AVG_PERIOD)) {
> +               c2 = runnable_avg_yN_sum[periods];
> +       } else {
> +               c2 = __accumulated_sum_N32[periods/LOAD_AVG_PERIOD];
This still wants the comment justifying why we can't overflow.
> +               periods %= LOAD_AVG_PERIOD;
> +               c2 = decay_load(c2, periods);
> +               c2 += runnable_avg_yN_sum[periods];
> +       }
> +
> +       return c1 + c2 + c3;
>  }
>
>  #define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT)
>
>  /*
> + * Accumulate the three separate parts of the sum; d1 the remainder
> + * of the last (incomplete) period, d2 the span of full periods and d3
> + * the remainder of the (incomplete) current period.
> + *
> + *           d1          d2           d3
> + *           ^           ^            ^
> + *           |           |            |
> + *         |<->|<----------------->|<--->|
> + * ... |---x---|------| ... |------|-----x (now)
> + *
> + *                                p
> + * u' = (u + d1) y^(p+1) + 1024 \Sum y^n + d3 y^0
> + *                               n=1
> + *
> + *    = u y^(p+1) +                            (Step 1)
> + *
> + *                          p
> + *      d1 y^(p+1) + 1024 \Sum y^n + d3 y^0    (Step 2)
> + *                         n=1
> + */
> +static __always_inline u32
> +accumulate_sum(u64 delta, int cpu, struct sched_avg *sa,
> +              unsigned long weight, int running, struct cfs_rq *cfs_rq)
> +{
> +       unsigned long scale_freq, scale_cpu;
> +       u64 periods;
> +       u32 contrib;
> +
> +       scale_freq = arch_scale_freq_capacity(NULL, cpu);
> +       scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
> +
> +       delta += sa->period_contrib;
> +       periods = delta / 1024; /* A period is 1024us (~1ms) */
> +
> +       if (periods) {
> +               sa->load_sum = decay_load(sa->load_sum, periods);
> +               if (cfs_rq) {
> +                       cfs_rq->runnable_load_sum =
> +                               decay_load(cfs_rq->runnable_load_sum, periods);
> +               }
> +               sa->util_sum = decay_load((u64)(sa->util_sum), periods);
Move step 2 here, handle remainder below.
> +       }
> +
> +       /*
> +        * Step 2
> +        */
> +       delta %= 1024;
> +       contrib = __accumulate_sum(periods, sa->period_contrib, delta);
> +       sa->period_contrib = delta;
> +
> +       contrib = cap_scale(contrib, scale_freq);
> +       if (weight) {
> +               sa->load_sum += weight * contrib;
Is this correct in the iterated periods > LOAD_AVG_MAX_N case?
I don't think the decay above is guaranteed to return these to zero.
> +               if (cfs_rq)
> +                       cfs_rq->runnable_load_sum += weight * contrib;
> +       }
> +       if (running)
> +               sa->util_sum += contrib * scale_cpu;
> +
> +       return periods;
> +}
> +
> +/*
>   * We can represent the historical contribution to runnable average as the
>   * coefficients of a geometric series.  To do this we sub-divide our runnable
>   * history into segments of approximately 1ms (1024us); label the segment that
> @@ -2852,10 +2933,7 @@ static __always_inline int
>  ___update_load_avg(u64 now, int cpu, struct sched_avg *sa,
>                   unsigned long weight, int running, struct cfs_rq *cfs_rq)
>  {
> -       u64 delta, scaled_delta, periods;
> -       u32 contrib;
> -       unsigned int delta_w, scaled_delta_w, decayed = 0;
> -       unsigned long scale_freq, scale_cpu;
> +       u64 delta;
>
>         delta = now - sa->last_update_time;
>         /*
> @@ -2876,81 +2954,27 @@ ___update_load_avg(u64 now, int cpu, str
>                 return 0;
>         sa->last_update_time = now;
>
> -       scale_freq = arch_scale_freq_capacity(NULL, cpu);
> -       scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
> -
> -       /* delta_w is the amount already accumulated against our next period */
> -       delta_w = sa->period_contrib;
> -       if (delta + delta_w >= 1024) {
> -               decayed = 1;
> -
> -               /* how much left for next period will start over, we don't know yet */
> -               sa->period_contrib = 0;
> -
> -               /*
> -                * Now that we know we're crossing a period boundary, figure
> -                * out how much from delta we need to complete the current
> -                * period and accrue it.
> -                */
> -               delta_w = 1024 - delta_w;
> -               scaled_delta_w = cap_scale(delta_w, scale_freq);
> -               if (weight) {
> -                       sa->load_sum += weight * scaled_delta_w;
> -                       if (cfs_rq) {
> -                               cfs_rq->runnable_load_sum +=
> -                                               weight * scaled_delta_w;
> -                       }
> -               }
> -               if (running)
> -                       sa->util_sum += scaled_delta_w * scale_cpu;
> -
> -               delta -= delta_w;
> -
> -               /* Figure out how many additional periods this update spans */
> -               periods = delta / 1024;
> -               delta %= 1024;
> -
> -               sa->load_sum = decay_load(sa->load_sum, periods + 1);
> -               if (cfs_rq) {
> -                       cfs_rq->runnable_load_sum =
> -                               decay_load(cfs_rq->runnable_load_sum, periods + 1);
> -               }
> -               sa->util_sum = decay_load((u64)(sa->util_sum), periods + 1);
> -
> -               /* Efficiently calculate \sum (1..n_period) 1024*y^i */
> -               contrib = __compute_runnable_contrib(periods);
> -               contrib = cap_scale(contrib, scale_freq);
> -               if (weight) {
> -                       sa->load_sum += weight * contrib;
> -                       if (cfs_rq)
> -                               cfs_rq->runnable_load_sum += weight * contrib;
> -               }
> -               if (running)
> -                       sa->util_sum += contrib * scale_cpu;
> -       }
> -
> -       /* Remainder of delta accrued against u_0` */
> -       scaled_delta = cap_scale(delta, scale_freq);
> -       if (weight) {
> -               sa->load_sum += weight * scaled_delta;
> -               if (cfs_rq)
> -                       cfs_rq->runnable_load_sum += weight * scaled_delta;
> -       }
> -       if (running)
> -               sa->util_sum += scaled_delta * scale_cpu;
> -
> -       sa->period_contrib += delta;
> +       /*
> +        * Now we know we crossed measurement unit boundaries. The *_avg
> +        * accrues by two steps:
> +        *
> +        * Step 1: accumulate *_sum since last_update_time. If we haven't
> +        * crossed period boundaries, finish.
> +        */
> +       if (!accumulate_sum(delta, cpu, sa, weight, running, cfs_rq))
> +               return 0;
>
> -       if (decayed) {
> -               sa->load_avg = div_u64(sa->load_sum, LOAD_AVG_MAX);
> -               if (cfs_rq) {
> -                       cfs_rq->runnable_load_avg =
> -                               div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX);
> -               }
> -               sa->util_avg = sa->util_sum / LOAD_AVG_MAX;
> +       /*
> +        * Step 2: update *_avg.
> +        */
> +       sa->load_avg = div_u64(sa->load_sum, LOAD_AVG_MAX);
> +       if (cfs_rq) {
> +               cfs_rq->runnable_load_avg =
> +                       div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX);
>         }
> +       sa->util_avg = sa->util_sum / LOAD_AVG_MAX;
>
> -       return decayed;
> +       return 1;
>  }
>
>  static int