Re: [PATCH v2 for-4.12-fixes 2/2] sched/fair: Fix O(# total cgroups) in load balance path

From: Tejun Heo
Date: Wed May 10 2017 - 11:55:36 EST


On Wed, May 10, 2017 at 10:44:14AM -0400, Tejun Heo wrote:
> Hello,
>
> On Wed, May 10, 2017 at 08:50:14AM +0200, Vincent Guittot wrote:
> > On 9 May 2017 at 18:18, Tejun Heo <tj@xxxxxxxxxx> wrote:
> > > Currently, rq->leaf_cfs_rq_list is a traversal-ordered list of all
> > > live cfs_rqs that have ever been active on the CPU; unfortunately,
> > > this makes update_blocked_averages() O(# total cgroups), which
> > > isn't scalable at all.
> >
> > Dietmar raised a similar optimization in the past. The only question
> > was: what is the impact of re-adding the cfs_rq to leaf_cfs_rq_list
> > on the wakeup path? Have you done any measurements?
>
> I haven't done a perf test yet, but the added cost is a few extra
> branches and a local list operation on the enqueue path, which is
> already fairly expensive; compare that with the load balance path
> being O(total number of cgroups on the system).
>
> Anyways, I'll do some hackbench tests with several levels of layering.
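
The wakeup-path cost being discussed above is the re-add of a pruned
cfs_rq once one of its entities is enqueued again. As a rough
illustration only (plain userspace C with made-up names, not the real
list_add_leaf_cfs_rq(), which additionally keeps each child cfs_rq
ahead of its parent so the bottom-up walk stays correct), the shape of
that work is one "already on the list?" branch plus a local list
insertion:

#include <stdbool.h>

struct list_head { struct list_head *next, *prev; };

static void list_add_tail(struct list_head *new, struct list_head *head)
{
	new->prev = head->prev;
	new->next = head;
	head->prev->next = new;
	head->prev = new;
}

/* Illustrative stand-ins, not the kernel's structures. */
struct cfs_rq_sketch {
	bool on_list;			/* mirrors cfs_rq->on_list */
	struct list_head leaf_node;	/* leaf_cfs_rq_list linkage */
};

struct rq_sketch {
	struct list_head leaf_cfs_rq_list;
};

/* Called on enqueue; a single branch when the cfs_rq is already listed. */
static void add_leaf_cfs_rq(struct rq_sketch *rq, struct cfs_rq_sketch *cfs_rq)
{
	if (cfs_rq->on_list)
		return;
	list_add_tail(&cfs_rq->leaf_node, &rq->leaf_cfs_rq_list);
	cfs_rq->on_list = true;
}

int main(void)
{
	struct rq_sketch rq = {
		.leaf_cfs_rq_list = { &rq.leaf_cfs_rq_list, &rq.leaf_cfs_rq_list },
	};
	struct cfs_rq_sketch cfs_rq = { .on_list = false };

	add_leaf_cfs_rq(&rq, &cfs_rq);	/* pruned cfs_rq gets re-added */
	add_leaf_cfs_rq(&rq, &cfs_rq);	/* already listed: just the branch */
	return 0;
}

Compared to update_blocked_averages() walking every cgroup that has
ever run on the CPU, that per-enqueue branch is small.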

Oh, BTW, do we still need bc4278987e38 ("sched/fair: Fix FTQ noise
bench regression") with this patch? That patch only seems to cut down
on the body of the O(#cgroups) loop, while this one drops fully
decayed cfs_rqs from the list and shrinks the loop itself.

Ying, can you verify whether the following patch on top of the current
linus#master 56868a460b83 also fixes the regression that you saw?

Index: work/kernel/sched/fair.c
===================================================================
--- work.orig/kernel/sched/fair.c
+++ work/kernel/sched/fair.c
@@ -372,6 +372,10 @@ static inline void list_del_leaf_cfs_rq(
 #define for_each_leaf_cfs_rq(rq, cfs_rq) \
 	list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
 
+#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) \
+	list_for_each_entry_safe(cfs_rq, pos, &rq->leaf_cfs_rq_list, \
+				 leaf_cfs_rq_list)
+
 /* Do the two (enqueued) entities belong to the same group ? */
 static inline struct cfs_rq *
 is_same_group(struct sched_entity *se, struct sched_entity *pse)
@@ -466,6 +470,9 @@ static inline void list_del_leaf_cfs_rq(
 #define for_each_leaf_cfs_rq(rq, cfs_rq) \
 	for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
 
+#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) \
+	for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
+
 static inline struct sched_entity *parent_entity(struct sched_entity *se)
 {
 	return NULL;
@@ -3170,36 +3177,6 @@ static inline int propagate_entity_load_
 	return 1;
 }
 
-/*
- * Check if we need to update the load and the utilization of a blocked
- * group_entity:
- */
-static inline bool skip_blocked_update(struct sched_entity *se)
-{
-	struct cfs_rq *gcfs_rq = group_cfs_rq(se);
-
-	/*
-	 * If sched_entity still have not zero load or utilization, we have to
-	 * decay it:
-	 */
-	if (se->avg.load_avg || se->avg.util_avg)
-		return false;
-
-	/*
-	 * If there is a pending propagation, we have to update the load and
-	 * the utilization of the sched_entity:
-	 */
-	if (gcfs_rq->propagate_avg)
-		return false;
-
-	/*
-	 * Otherwise, the load and the utilization of the sched_entity is
-	 * already zero and there is no pending propagation, so it will be a
-	 * waste of time to try to decay it:
-	 */
-	return true;
-}
-
 #else /* CONFIG_FAIR_GROUP_SCHED */
 
 static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {}
@@ -4644,22 +4621,32 @@ static void destroy_cfs_bandwidth(struct
 
 static void __maybe_unused update_runtime_enabled(struct rq *rq)
 {
-	struct cfs_rq *cfs_rq;
+	struct task_group *tg;
+
+	lockdep_assert_held(&rq->lock);
 
-	for_each_leaf_cfs_rq(rq, cfs_rq) {
-		struct cfs_bandwidth *cfs_b = &cfs_rq->tg->cfs_bandwidth;
+	rcu_read_lock();
+	list_for_each_entry_rcu(tg, &task_groups, list) {
+		struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
+		struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
 
 		raw_spin_lock(&cfs_b->lock);
 		cfs_rq->runtime_enabled = cfs_b->quota != RUNTIME_INF;
 		raw_spin_unlock(&cfs_b->lock);
 	}
+	rcu_read_unlock();
 }
 
 static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
 {
-	struct cfs_rq *cfs_rq;
+	struct task_group *tg;
+
+	lockdep_assert_held(&rq->lock);
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(tg, &task_groups, list) {
+		struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
 
-	for_each_leaf_cfs_rq(rq, cfs_rq) {
 		if (!cfs_rq->runtime_enabled)
 			continue;
 
@@ -4677,6 +4664,7 @@ static void __maybe_unused unthrottle_of
 		if (cfs_rq_throttled(cfs_rq))
 			unthrottle_cfs_rq(cfs_rq);
 	}
+	rcu_read_unlock();
 }
 
 #else /* CONFIG_CFS_BANDWIDTH */
@@ -6973,7 +6961,7 @@ static void attach_tasks(struct lb_env *
 static void update_blocked_averages(int cpu)
 {
 	struct rq *rq = cpu_rq(cpu);
-	struct cfs_rq *cfs_rq;
+	struct cfs_rq *cfs_rq, *pos;
 	struct rq_flags rf;
 
 	rq_lock_irqsave(rq, &rf);
@@ -6983,9 +6971,7 @@ static void update_blocked_averages(int
 	 * Iterates the task_group tree in a bottom up fashion, see
 	 * list_add_leaf_cfs_rq() for details.
 	 */
-	for_each_leaf_cfs_rq(rq, cfs_rq) {
-		struct sched_entity *se;
-
+	for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) {
 		/* throttled entities do not contribute to load */
 		if (throttled_hierarchy(cfs_rq))
 			continue;
@@ -6993,10 +6979,17 @@ static void update_blocked_averages(int
 		if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, true))
 			update_tg_load_avg(cfs_rq, 0);
 
-		/* Propagate pending load changes to the parent, if any: */
-		se = cfs_rq->tg->se[cpu];
-		if (se && !skip_blocked_update(se))
-			update_load_avg(se, 0);
+		/* Propagate pending load changes to the parent */
+		if (cfs_rq->tg->se[cpu])
+			update_load_avg(cfs_rq->tg->se[cpu], 0);
+
+		/*
+		 * There can be a lot of idle CPU cgroups. Don't let fully
+		 * decayed cfs_rqs linger on the list.
+		 */
+		if (!cfs_rq->load.weight && !cfs_rq->avg.load_sum &&
+		    !cfs_rq->avg.util_sum && !cfs_rq->runnable_load_sum)
+			list_del_leaf_cfs_rq(cfs_rq);
 	}
 	rq_unlock_irqrestore(rq, &rf);
 }
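
One note on the last hunk: the loop now calls list_del_leaf_cfs_rq()
on the entry it is currently standing on, which is why the plain
iterator is replaced by a _safe variant that caches the next node
before the loop body runs. A self-contained sketch of that generic
pattern (plain userspace C with made-up names, not kernel list.h):

#include <stdio.h>
#include <stdlib.h>

struct node {
	int fully_decayed;
	struct node *next;
};

/*
 * Remember the next pointer before the body runs so the current node
 * can be unlinked (and even freed) mid-iteration; this is what the
 * extra cursor of list_for_each_entry_safe() provides.
 */
static void prune(struct node **head)
{
	struct node **link = head;
	struct node *pos, *n;

	for (pos = *head; pos; pos = n) {
		n = pos->next;		/* cached before any removal */
		if (pos->fully_decayed) {
			*link = n;	/* unlink the current node */
			free(pos);
			continue;
		}
		link = &pos->next;
	}
}

int main(void)
{
	struct node *head = NULL;

	/* Build three nodes: decayed, active, decayed. */
	for (int i = 0; i < 3; i++) {
		struct node *n = malloc(sizeof(*n));

		n->fully_decayed = (i != 1);
		n->next = head;
		head = n;
	}

	prune(&head);

	for (struct node *pos = head; pos; pos = pos->next)
		printf("kept node, fully_decayed=%d\n", pos->fully_decayed);
	return 0;
}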