Re: [PATCH] sched: fix infinity loop in update_blocked_averages

From: Linus Torvalds
Date: Thu Dec 27 2018 - 16:46:52 EST


On Thu, Dec 27, 2018 at 1:09 PM Sargun Dhillon <sargun@xxxxxxxxx> wrote:
>
> This appears to be broken since October on 4.18.5. We've only noticed
> it recently with a workload which does ridiculously parallel compiles
> in cgroups that are rapidly churned.

Yeah, that's probably unusual enough that people will have missed it.

Because it really looks like the bug has been there since 4.13, unless
I'm mis-reading things. Other things have changed there since, so
maybe I am.

> It's also an awkward bug to catch, because none of the lockup
> detectors were catching it in our environment. The only reason we
> caught it was that it was blocking other cores, and those other cores
> were missing IPIs, resulting in catastrophic failure.

My gut feel is that we just need to revert that commit. It doesn't
revert cleanly, but it doesn't look hard to do manually.

Something like the attached?

But we do need Tejun and PeterZ to take a look, since there might be
something subtle going on.

Everybody is probably still on well-deserved vacations, so it might be
a while. But testing the attached patch is probably a good idea
regardless.

Linus

kernel/sched/fair.c | 41 ++++++++---------------------------------
1 file changed, 8 insertions(+), 33 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index d1907506318a..01f3cb89d188 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -353,9 +353,8 @@ static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
}

/* Iterate thr' all leaf cfs_rq's on a runqueue */
-#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) \
- list_for_each_entry_safe(cfs_rq, pos, &rq->leaf_cfs_rq_list, \
- leaf_cfs_rq_list)
+#define for_each_leaf_cfs_rq(rq, cfs_rq) \
+ list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)

/* Do the two (enqueued) entities belong to the same group ? */
static inline struct cfs_rq *
@@ -447,8 +446,8 @@ static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
{
}

-#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) \
- for (cfs_rq = &rq->cfs, pos = NULL; cfs_rq; cfs_rq = pos)
+#define for_each_leaf_cfs_rq(rq, cfs_rq) \
+ for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)

static inline struct sched_entity *parent_entity(struct sched_entity *se)
{
@@ -7647,27 +7646,10 @@ static inline bool others_have_blocked(struct rq *rq)

#ifdef CONFIG_FAIR_GROUP_SCHED

-static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
-{
- if (cfs_rq->load.weight)
- return false;
-
- if (cfs_rq->avg.load_sum)
- return false;
-
- if (cfs_rq->avg.util_sum)
- return false;
-
- if (cfs_rq->avg.runnable_load_sum)
- return false;
-
- return true;
-}
-
static void update_blocked_averages(int cpu)
{
struct rq *rq = cpu_rq(cpu);
- struct cfs_rq *cfs_rq, *pos;
+ struct cfs_rq *cfs_rq;
const struct sched_class *curr_class;
struct rq_flags rf;
bool done = true;
@@ -7679,7 +7661,7 @@ static void update_blocked_averages(int cpu)
* Iterates the task_group tree in a bottom up fashion, see
* list_add_leaf_cfs_rq() for details.
*/
- for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) {
+ for_each_leaf_cfs_rq(rq, cfs_rq) {
struct sched_entity *se;

/* throttled entities do not contribute to load */
@@ -7694,13 +7676,6 @@ static void update_blocked_averages(int cpu)
if (se && !skip_blocked_update(se))
update_load_avg(cfs_rq_of(se), se, 0);

- /*
- * There can be a lot of idle CPU cgroups. Don't let fully
- * decayed cfs_rqs linger on the list.
- */
- if (cfs_rq_is_decayed(cfs_rq))
- list_del_leaf_cfs_rq(cfs_rq);
-
/* Don't need periodic decay once load/util_avg are null */
if (cfs_rq_has_blocked(cfs_rq))
done = false;
@@ -10570,10 +10545,10 @@ const struct sched_class fair_sched_class = {
#ifdef CONFIG_SCHED_DEBUG
void print_cfs_stats(struct seq_file *m, int cpu)
{
- struct cfs_rq *cfs_rq, *pos;
+ struct cfs_rq *cfs_rq;

rcu_read_lock();
- for_each_leaf_cfs_rq_safe(cpu_rq(cpu), cfs_rq, pos)
+ for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq)
print_cfs_rq(m, cpu, cfs_rq);
rcu_read_unlock();
}