[PATCH 3/4] sched/fair: Remove scale_load_down() for load_avg

From: Yuyang Du
Date: Sun Oct 04 2015 - 21:46:59 EST


Currently, load_avg = scale_load_down(load) * runnable%. Scaling the
load down here does not make much sense, because load_avg is primarily
the load with the runnable time ratio taken into account.

We therefore remove scale_load_down() for load_avg. But we need to
carefully consider the overflow risk if load has higher resolution
(2*SCHED_RESOLUTION_SHIFT). The only case in which this change can
introduce an overflow is a 64bit kernel with increased load resolution.
In that case, the 64bit load_sum can accommodate 4251057
(=2^64/47742/88761/1024) entities with the highest load (=88761*1024)
always runnable on one single cfs_rq. This may be an issue in theory,
but should be fine in practice.

Signed-off-by: Yuyang Du <yuyang.du@xxxxxxxxx>
---
include/linux/sched.h | 69 +++++++++++++++++++++++++++++++++++++++++++--------
kernel/sched/fair.c | 11 ++++----
2 files changed, 63 insertions(+), 17 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index b3ba0fb..a63271e 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1185,18 +1185,65 @@ struct load_weight {
};

/*
- * The load_avg/util_avg accumulates an infinite geometric series.
- * 1) load_avg factors frequency scaling into the amount of time that a
- * sched_entity is runnable on a rq into its weight. For cfs_rq, it is the
- * aggregated such weights of all runnable and blocked sched_entities.
- * 2) util_avg factors frequency and cpu capacity scaling into the amount of time
- * that a sched_entity is running on a CPU, in the range [0..SCHED_CAPACITY_SCALE].
- * For cfs_rq, it is the aggregated such times of all runnable and
+ * The load_avg/util_avg accumulates an infinite geometric series
+ * (see __update_load_avg() in kernel/sched/fair.c).
+ *
+ * [load_avg definition]
+ *
+ * load_avg = runnable% * load
+ *
+ * where runnable% is the time ratio that a sched_entity is runnable.
+ * For cfs_rq, it is the aggregated such load_avg of all runnable and
* blocked sched_entities.
- * The 64 bit load_sum can:
- * 1) for cfs_rq, afford 4353082796 (=2^64/47742/88761) entities with
- * the highest weight (=88761) always runnable, we should not overflow
- * 2) for entity, support any load.weight always runnable
+ *
+ * load_avg may also take frequency scaling into account:
+ *
+ * load_avg = runnable% * load * freq%
+ *
+ * where freq% is the CPU frequency normalized to the highest frequency.
+ *
+ * [util_avg definition]
+ *
+ * util_avg = running% * SCHED_CAPACITY_SCALE
+ *
+ * where running% is the time ratio that a sched_entity is running on
+ * a CPU. For cfs_rq, it is the aggregated such util_avg of all running
+ * and blocked sched_entities.
+ *
+ * util_avg may also factor frequency scaling and CPU capacity scaling:
+ *
+ * util_avg = running% * SCHED_CAPACITY_SCALE * freq% * capacity%
+ *
+ * where freq% is the same as above, and capacity% is the CPU capacity
+ * normalized to the greatest capacity (due to uarch differences, etc).
+ *
+ * N.B., the above ratios (runnable%, running%, freq%, and capacity%)
+ * are themselves in the range [0, 1]. To avoid losing intermediate
+ * values to integer rounding, we scale them up to as fine a resolution
+ * as necessary. This is, for example, reflected by util_avg's
+ * SCHED_CAPACITY_SCALE.
+ *
+ * [Overflow issue]
+ *
+ * On 64bit kernel:
+ *
+ * When load has low resolution (SCHED_RESOLUTION_SHIFT), the 64bit
+ * load_sum can have 4353082796 (=2^64/47742/88761) entities with
+ * the highest load (=88761) always runnable on a cfs_rq, so we should
+ * not overflow.
+ *
+ * When load has high resolution (2*SCHED_RESOLUTION_SHIFT), the 64bit
+ * load_sum can have 4251057 (=2^64/47742/88761/1024) entities with
+ * the highest load (=88761*1024) always runnable on ONE cfs_rq, so we
+ * should be fine.
+ *
+ * For all other cases (including 32bit kernel), struct load_weight's
+ * weight will overflow before load_avg does, because:
+ *
+ * Max(load_avg) <= Max(load.weight)
+ *
+ * It is then load_weight's responsibility to consider overflow
+ * issues.
*/
struct sched_avg {
u64 last_update_time, load_sum;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index fdb7937..807d960 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -680,7 +680,7 @@ void init_entity_runnable_average(struct sched_entity *se)
* will definitely be update (after enqueue).
*/
sa->period_contrib = 1023;
- sa->load_avg = scale_load_down(se->load.weight);
+ sa->load_avg = se->load.weight;
sa->load_sum = sa->load_avg * LOAD_AVG_MAX;
sa->util_avg = SCHED_CAPACITY_SCALE;
sa->util_sum = sa->util_avg * LOAD_AVG_MAX;
@@ -2697,7 +2697,7 @@ static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
}

decayed = __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa,
- scale_load_down(cfs_rq->load.weight), cfs_rq->curr != NULL, cfs_rq);
+ cfs_rq->load.weight, cfs_rq->curr != NULL, cfs_rq);

#ifndef CONFIG_64BIT
smp_wmb();
@@ -2718,8 +2718,7 @@ static inline void update_load_avg(struct sched_entity *se, int update_tg)
* Track task load average for carrying it to new CPU after migrated, and
* track group sched_entity load average for task_h_load calc in migration
*/
- __update_load_avg(now, cpu, &se->avg,
- se->on_rq * scale_load_down(se->load.weight),
+ __update_load_avg(now, cpu, &se->avg, se->on_rq * se->load.weight,
cfs_rq->curr == se, NULL);

if (update_cfs_rq_load_avg(now, cfs_rq) && update_tg)
@@ -2756,7 +2755,7 @@ skip_aging:
static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
__update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)),
- &se->avg, se->on_rq * scale_load_down(se->load.weight),
+ &se->avg, se->on_rq * se->load.weight,
cfs_rq->curr == se, NULL);

cfs_rq->avg.load_avg = max_t(long, cfs_rq->avg.load_avg - se->avg.load_avg, 0);
@@ -2776,7 +2775,7 @@ enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
migrated = !sa->last_update_time;
if (!migrated) {
__update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa,
- se->on_rq * scale_load_down(se->load.weight),
+ se->on_rq * se->load.weight,
cfs_rq->curr == se, NULL);
}

--
2.1.4

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/