[PATCH 3/8] sched,fair: redefine runnable_load_avg as the sum of task_h_load

From: Rik van Riel
Date: Wed Jun 12 2019 - 15:37:33 EST

Next message: Rik van Riel: "[PATCH 2/8] sched: change /proc/sched_debug fields"
Previous message: Rik van Riel: "[PATCH 8/8] sched,fair: flatten hierarchical runqueues"
In reply to: Rik van Riel: "[PATCH 8/8] sched,fair: flatten hierarchical runqueues"
Next in thread: Rik van Riel: "[PATCH 2/8] sched: change /proc/sched_debug fields"
Messages sorted by: [ date ] [ thread ] [ subject ] [ author ]

The runnable_load magic is used to quickly propagate information about
runnable tasks up the hierarchy of runqueues. lhen switching to a flat
runqueue, that no longer works.

Redefine the CPU cfs_rq runnable_load_avg to be the sum of task_h_loads
of the runnable tasks. This provides enough information to the load
balancer.

The runnable_load_avg of the cgroup cfs_rqs does not appear to be
used for anything, so don't bother calculating those.

This removes one of the things that the code currently traverses the
cgroup hierarchy for, and getting rid of it brings us one step closer
to a flat runqueue for the CPU controller.

Signed-off-by: Rik van Riel <riel@xxxxxxxxxxx>
---
include/linux/sched.h | 3 +-
kernel/sched/core.c | 2 -
kernel/sched/debug.c | 1 +
kernel/sched/fair.c | 125 +++++++++++++-----------------------------
kernel/sched/pelt.c | 49 ++++++-----------
kernel/sched/sched.h | 6 --
6 files changed, 55 insertions(+), 131 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 11837410690f..f5bb6948e40c 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -391,7 +391,6 @@ struct util_est {
struct sched_avg {
u64 last_update_time;
u64 load_sum;
- u64 runnable_load_sum;
u32 util_sum;
u32 period_contrib;
unsigned long load_avg;
@@ -439,7 +438,6 @@ struct sched_statistics {
struct sched_entity {
/* For load-balancing: */
struct load_weight load;
- unsigned long runnable_weight;
struct rb_node run_node;
struct list_head group_node;
unsigned int on_rq;
@@ -455,6 +453,7 @@ struct sched_entity {

#ifdef CONFIG_FAIR_GROUP_SCHED
int depth;
+ unsigned long enqueued_h_load;
struct sched_entity *parent;
/* rq on which this entity is (to be) queued: */
struct cfs_rq *cfs_rq;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 874c427742a9..fbd96900f715 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -744,7 +744,6 @@ static void set_load_weight(struct task_struct *p, bool update_load)
if (task_has_idle_policy(p)) {
load->weight = scale_load(WEIGHT_IDLEPRIO);
load->inv_weight = WMULT_IDLEPRIO;
- p->se.runnable_weight = load->weight;
return;
}

@@ -757,7 +756,6 @@ static void set_load_weight(struct task_struct *p, bool update_load)
} else {
load->weight = scale_load(sched_prio_to_weight[prio]);
load->inv_weight = sched_prio_to_wmult[prio];
- p->se.runnable_weight = load->weight;
}
}

diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index aab4640d66c5..d06e7436d148 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -965,6 +965,7 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,
P(se.avg.load_avg);
P(se.avg.runnable_load_avg);
P(se.avg.util_avg);
+ P(se.enqueued_h_load);
P(se.avg.last_update_time);
P(se.avg.util_est.ewma);
P(se.avg.util_est.enqueued);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index df624f7a68e7..aebd43d74468 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -724,9 +724,7 @@ void init_entity_runnable_average(struct sched_entity *se)
* nothing has been attached to the task group yet.
*/
if (entity_is_task(se))
- sa->runnable_load_avg = sa->load_avg = scale_load_down(se->load.weight);
-
- se->runnable_weight = se->load.weight;
+ sa->load_avg = scale_load_down(se->load.weight);

/* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */
}
@@ -2767,20 +2765,39 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
static inline void
enqueue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- cfs_rq->runnable_weight += se->runnable_weight;
+ if (entity_is_task(se)) {
+ struct cfs_rq *cpu_cfs_rq = &cfs_rq->rq->cfs;
+ se->enqueued_h_load = task_se_h_load(se);

- cfs_rq->avg.runnable_load_avg += se->avg.runnable_load_avg;
- cfs_rq->avg.runnable_load_sum += se_runnable(se) * se->avg.runnable_load_sum;
+ cpu_cfs_rq->avg.runnable_load_avg += se->enqueued_h_load;
+ }
}

static inline void
dequeue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- cfs_rq->runnable_weight -= se->runnable_weight;
+ if (entity_is_task(se)) {
+ struct cfs_rq *cpu_cfs_rq = &cfs_rq->rq->cfs;
+ sub_positive(&cpu_cfs_rq->avg.runnable_load_avg,
+ se->enqueued_h_load);
+ }
+}
+
+static inline void
+update_runnable_load_avg(struct sched_entity *se)
+{
+ struct cfs_rq *cpu_cfs_rq = &cfs_rq_of(se)->rq->cfs;
+ long new_h_load, delta;
+
+ SCHED_WARN_ON(!entity_is_task(se));
+
+ if (!se->on_rq)
+ return;

- sub_positive(&cfs_rq->avg.runnable_load_avg, se->avg.runnable_load_avg);
- sub_positive(&cfs_rq->avg.runnable_load_sum,
- se_runnable(se) * se->avg.runnable_load_sum);
+ new_h_load = task_se_h_load(se);
+ delta = new_h_load - se->enqueued_h_load;
+ cpu_cfs_rq->avg.runnable_load_avg += delta;
+ se->enqueued_h_load = new_h_load;
}

static inline void
@@ -2808,7 +2825,7 @@ dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
#endif

static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
- unsigned long weight, unsigned long runnable)
+ unsigned long weight)
{
if (se->on_rq) {
/* commit outstanding execution time */
@@ -2819,7 +2836,6 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
}
dequeue_load_avg(cfs_rq, se);

- se->runnable_weight = runnable;
update_load_set(&se->load, weight);

#ifdef CONFIG_SMP
@@ -2827,8 +2843,6 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
u32 divider = LOAD_AVG_MAX - 1024 + se->avg.period_contrib;

se->avg.load_avg = div_u64(se_weight(se) * se->avg.load_sum, divider);
- se->avg.runnable_load_avg =
- div_u64(se_runnable(se) * se->avg.runnable_load_sum, divider);
} while (0);
#endif

@@ -2846,7 +2860,7 @@ void reweight_task(struct task_struct *p, int prio)
struct load_weight *load = &se->load;
unsigned long weight = scale_load(sched_prio_to_weight[prio]);

- reweight_entity(cfs_rq, se, weight, weight);
+ reweight_entity(cfs_rq, se, weight);
load->inv_weight = sched_prio_to_wmult[prio];
}

@@ -2959,49 +2973,6 @@ static long calc_group_shares(struct cfs_rq *cfs_rq)
return clamp_t(long, shares, MIN_SHARES, tg_shares);
}

-/*
- * This calculates the effective runnable weight for a group entity based on
- * the group entity weight calculated above.
- *
- * Because of the above approximation (2), our group entity weight is
- * an load_avg based ratio (3). This means that it includes blocked load and
- * does not represent the runnable weight.
- *
- * Approximate the group entity's runnable weight per ratio from the group
- * runqueue:
- *
- * grq->avg.runnable_load_avg
- * ge->runnable_weight = ge->load.weight * -------------------------- (7)
- * grq->avg.load_avg
- *
- * However, analogous to above, since the avg numbers are slow, this leads to
- * transients in the from-idle case. Instead we use:
- *
- * ge->runnable_weight = ge->load.weight *
- *
- * max(grq->avg.runnable_load_avg, grq->runnable_weight)
- * ----------------------------------------------------- (8)
- * max(grq->avg.load_avg, grq->load.weight)
- *
- * Where these max() serve both to use the 'instant' values to fix the slow
- * from-idle and avoid the /0 on to-idle, similar to (6).
- */
-static long calc_group_runnable(struct cfs_rq *cfs_rq, long shares)
-{
- long runnable, load_avg;
-
- load_avg = max(cfs_rq->avg.load_avg,
- scale_load_down(cfs_rq->load.weight));
-
- runnable = max(cfs_rq->avg.runnable_load_avg,
- scale_load_down(cfs_rq->runnable_weight));
-
- runnable *= shares;
- if (load_avg)
- runnable /= load_avg;
-
- return clamp_t(long, runnable, MIN_SHARES, shares);
-}
#endif /* CONFIG_SMP */

static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
@@ -3013,25 +2984,24 @@ static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
static void update_cfs_group(struct sched_entity *se)
{
struct cfs_rq *gcfs_rq = group_cfs_rq(se);
- long shares, runnable;
+ long shares;

- if (!gcfs_rq)
+ if (!gcfs_rq) {
+ update_runnable_load_avg(se);
return;
+ }

if (throttled_hierarchy(gcfs_rq))
return;

#ifndef CONFIG_SMP
- runnable = shares = READ_ONCE(gcfs_rq->tg->shares);
-
if (likely(se->load.weight == shares))
return;
#else
shares = calc_group_shares(gcfs_rq);
- runnable = calc_group_runnable(gcfs_rq, shares);
#endif

- reweight_entity(cfs_rq_of(se), se, shares, runnable);
+ reweight_entity(cfs_rq_of(se), se, shares);
}

#else /* CONFIG_FAIR_GROUP_SCHED */
@@ -3244,8 +3214,8 @@ static inline void
update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
{
long delta_avg, running_sum, runnable_sum = gcfs_rq->prop_runnable_sum;
- unsigned long runnable_load_avg, load_avg;
- u64 runnable_load_sum, load_sum = 0;
+ unsigned long load_avg;
+ u64 load_sum = 0;
s64 delta_sum;

if (!runnable_sum)
@@ -3293,19 +3263,6 @@ update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cf
se->avg.load_avg = load_avg;
add_positive(&cfs_rq->avg.load_avg, delta_avg);
add_positive(&cfs_rq->avg.load_sum, delta_sum);
-
- runnable_load_sum = (s64)se_runnable(se) * runnable_sum;
- runnable_load_avg = div_s64(runnable_load_sum, LOAD_AVG_MAX);
- delta_sum = runnable_load_sum - se_weight(se) * se->avg.runnable_load_sum;
- delta_avg = runnable_load_avg - se->avg.runnable_load_avg;
-
- se->avg.runnable_load_sum = runnable_sum;
- se->avg.runnable_load_avg = runnable_load_avg;
-
- if (se->on_rq) {
- add_positive(&cfs_rq->avg.runnable_load_avg, delta_avg);
- add_positive(&cfs_rq->avg.runnable_load_sum, delta_sum);
- }
}

static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum)
@@ -3400,7 +3357,7 @@ static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum
static inline int
update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
{
- unsigned long removed_load = 0, removed_util = 0, removed_runnable_sum = 0;
+ unsigned long removed_load = 0, removed_util = 0;
struct sched_avg *sa = &cfs_rq->avg;
int decayed = 0;

@@ -3411,7 +3368,6 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
raw_spin_lock(&cfs_rq->removed.lock);
swap(cfs_rq->removed.util_avg, removed_util);
swap(cfs_rq->removed.load_avg, removed_load);
- swap(cfs_rq->removed.runnable_sum, removed_runnable_sum);
cfs_rq->removed.nr = 0;
raw_spin_unlock(&cfs_rq->removed.lock);

@@ -3423,8 +3379,6 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
sub_positive(&sa->util_avg, r);
sub_positive(&sa->util_sum, r * divider);

- add_tg_cfs_propagate(cfs_rq, -(long)removed_runnable_sum);
-
decayed = 1;
}

@@ -3478,8 +3432,6 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
div_u64(se->avg.load_avg * se->avg.load_sum, se_weight(se));
}

- se->avg.runnable_load_sum = se->avg.load_sum;
-
enqueue_load_avg(cfs_rq, se);
cfs_rq->avg.util_avg += se->avg.util_avg;
cfs_rq->avg.util_sum += se->avg.util_sum;
@@ -7736,9 +7688,6 @@ static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
if (cfs_rq->avg.util_sum)
return false;

- if (cfs_rq->avg.runnable_load_sum)
- return false;
-
return true;
}

diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c
index befce29bd882..32dc2791a517 100644
--- a/kernel/sched/pelt.c
+++ b/kernel/sched/pelt.c
@@ -106,7 +106,7 @@ static u32 __accumulate_pelt_segments(u64 periods, u32 d1, u32 d3)
*/
static __always_inline u32
accumulate_sum(u64 delta, struct sched_avg *sa,
- unsigned long load, unsigned long runnable, int running)
+ unsigned long load, int running)
{
u32 contrib = (u32)delta; /* p == 0 -> delta < 1024 */
u64 periods;
@@ -119,8 +119,6 @@ accumulate_sum(u64 delta, struct sched_avg *sa,
*/
if (periods) {
sa->load_sum = decay_load(sa->load_sum, periods);
- sa->runnable_load_sum =
- decay_load(sa->runnable_load_sum, periods);
sa->util_sum = decay_load((u64)(sa->util_sum), periods);

/*
@@ -134,8 +132,6 @@ accumulate_sum(u64 delta, struct sched_avg *sa,

if (load)
sa->load_sum += load * contrib;
- if (runnable)
- sa->runnable_load_sum += runnable * contrib;
if (running)
sa->util_sum += contrib << SCHED_CAPACITY_SHIFT;

@@ -172,7 +168,7 @@ accumulate_sum(u64 delta, struct sched_avg *sa,
*/
static __always_inline int
___update_load_sum(u64 now, struct sched_avg *sa,
- unsigned long load, unsigned long runnable, int running)
+ unsigned long load, int running)
{
u64 delta;

@@ -206,7 +202,7 @@ ___update_load_sum(u64 now, struct sched_avg *sa,
* update_blocked_averages()
*/
if (!load)
- runnable = running = 0;
+ running = 0;

/*
* Now we know we crossed measurement unit boundaries. The *_avg
@@ -215,14 +211,14 @@ ___update_load_sum(u64 now, struct sched_avg *sa,
* Step 1: accumulate *_sum since last_update_time. If we haven't
* crossed period boundaries, finish.
*/
- if (!accumulate_sum(delta, sa, load, runnable, running))
+ if (!accumulate_sum(delta, sa, load, running))
return 0;

return 1;
}

static __always_inline void
-___update_load_avg(struct sched_avg *sa, unsigned long load, unsigned long runnable)
+___update_load_avg(struct sched_avg *sa, unsigned long load)
{
u32 divider = LOAD_AVG_MAX - 1024 + sa->period_contrib;

@@ -230,7 +226,6 @@ ___update_load_avg(struct sched_avg *sa, unsigned long load, unsigned long runna
* Step 2: update *_avg.
*/
sa->load_avg = div_u64(load * sa->load_sum, divider);
- sa->runnable_load_avg = div_u64(runnable * sa->runnable_load_sum, divider);
WRITE_ONCE(sa->util_avg, sa->util_sum / divider);
}

@@ -263,8 +258,8 @@ ___update_load_avg(struct sched_avg *sa, unsigned long load, unsigned long runna

int __update_load_avg_blocked_se(u64 now, struct sched_entity *se)
{
- if (___update_load_sum(now, &se->avg, 0, 0, 0)) {
- ___update_load_avg(&se->avg, se_weight(se), se_runnable(se));
+ if (___update_load_sum(now, &se->avg, 0, 0)) {
+ ___update_load_avg(&se->avg, se_weight(se));
return 1;
}

@@ -273,10 +268,10 @@ int __update_load_avg_blocked_se(u64 now, struct sched_entity *se)

int __update_load_avg_se(u64 now, struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- if (___update_load_sum(now, &se->avg, !!se->on_rq, !!se->on_rq,
+ if (___update_load_sum(now, &se->avg, !!se->on_rq,
cfs_rq->curr == se)) {

- ___update_load_avg(&se->avg, se_weight(se), se_runnable(se));
+ ___update_load_avg(&se->avg, se_weight(se));
cfs_se_util_change(&se->avg);
return 1;
}
@@ -288,10 +283,9 @@ int __update_load_avg_cfs_rq(u64 now, struct cfs_rq *cfs_rq)
{
if (___update_load_sum(now, &cfs_rq->avg,
scale_load_down(cfs_rq->load.weight),
- scale_load_down(cfs_rq->runnable_weight),
cfs_rq->curr != NULL)) {

- ___update_load_avg(&cfs_rq->avg, 1, 1);
+ ___update_load_avg(&cfs_rq->avg, 1);
return 1;
}

@@ -303,20 +297,18 @@ int __update_load_avg_cfs_rq(u64 now, struct cfs_rq *cfs_rq)
*
* util_sum = \Sum se->avg.util_sum but se->avg.util_sum is not tracked
* util_sum = cpu_scale * load_sum
- * runnable_load_sum = load_sum
*
- * load_avg and runnable_load_avg are not supported and meaningless.
+ * load_avg is not supported and meaningless.
*
*/

int update_rt_rq_load_avg(u64 now, struct rq *rq, int running)
{
if (___update_load_sum(now, &rq->avg_rt,
- running,
running,
running)) {

- ___update_load_avg(&rq->avg_rt, 1, 1);
+ ___update_load_avg(&rq->avg_rt, 1);
return 1;
}

@@ -328,18 +320,16 @@ int update_rt_rq_load_avg(u64 now, struct rq *rq, int running)
*
* util_sum = \Sum se->avg.util_sum but se->avg.util_sum is not tracked
* util_sum = cpu_scale * load_sum
- * runnable_load_sum = load_sum
*
*/

int update_dl_rq_load_avg(u64 now, struct rq *rq, int running)
{
if (___update_load_sum(now, &rq->avg_dl,
- running,
running,
running)) {

- ___update_load_avg(&rq->avg_dl, 1, 1);
+ ___update_load_avg(&rq->avg_dl, 1);
return 1;
}

@@ -352,7 +342,6 @@ int update_dl_rq_load_avg(u64 now, struct rq *rq, int running)
*
* util_sum = \Sum se->avg.util_sum but se->avg.util_sum is not tracked
* util_sum = cpu_scale * load_sum
- * runnable_load_sum = load_sum
*
*/

@@ -379,17 +368,11 @@ int update_irq_load_avg(struct rq *rq, u64 running)
* We can safely remove running from rq->clock because
* rq->clock += delta with delta >= running
*/
- ret = ___update_load_sum(rq->clock - running, &rq->avg_irq,
- 0,
- 0,
- 0);
- ret += ___update_load_sum(rq->clock, &rq->avg_irq,
- 1,
- 1,
- 1);
+ ret = ___update_load_sum(rq->clock - running, &rq->avg_irq, 0);
+ ret += ___update_load_sum(rq->clock, &rq->avg_irq, 1);

if (ret)
- ___update_load_avg(&rq->avg_irq, 1, 1);
+ ___update_load_avg(&rq->avg_irq, 1);

return ret;
}
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index b52ed1ada0be..5be14cee61f9 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -487,7 +487,6 @@ struct cfs_bandwidth { };
/* CFS-related fields in a runqueue */
struct cfs_rq {
struct load_weight load;
- unsigned long runnable_weight;
unsigned int nr_running;
unsigned int h_nr_running;

@@ -700,11 +699,6 @@ static inline long se_weight(struct sched_entity *se)
return scale_load_down(se->load.weight);
}

-static inline long se_runnable(struct sched_entity *se)
-{
- return scale_load_down(se->runnable_weight);
-}
-
static inline bool sched_asym_prefer(int a, int b)
{
return arch_asym_cpu_priority(a) > arch_asym_cpu_priority(b);
--
2.20.1

Next message: Rik van Riel: "[PATCH 2/8] sched: change /proc/sched_debug fields"
Previous message: Rik van Riel: "[PATCH 8/8] sched,fair: flatten hierarchical runqueues"
In reply to: Rik van Riel: "[PATCH 8/8] sched,fair: flatten hierarchical runqueues"
Next in thread: Rik van Riel: "[PATCH 2/8] sched: change /proc/sched_debug fields"
Messages sorted by: [ date ] [ thread ] [ subject ] [ author ]