Re: [PATCH 2/2] sched/fair: Always propagate runnable_load_avg

From: Tejun Heo
Date: Fri Apr 28 2017 - 16:38:58 EST


Here's the debug patch.

The debug condition triggers when the load balancer picks a group in which
no CPU is running two or more schbench threads over a group that has such a CPU.

/sys/module/fair/parameters/dbg_odd_cnt: resettable counter
/sys/module/fair/parameters/dbg_odd_nth: dump group states on Nth
occurrence via trace_printk()

Loads and weights are printed scaled so that NICE_0_LOAD reads as 1.000.

Thanks.
---
kernel/sched/fair.c | 160 +++++++++++++++++++++++++++++++++++++++++++++++++++-
1 file changed, 159 insertions(+), 1 deletion(-)

--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -32,11 +32,18 @@
#include <linux/mempolicy.h>
#include <linux/migrate.h>
#include <linux/task_work.h>
+#include <linux/moduleparam.h>

#include <trace/events/sched.h>

#include "sched.h"

+static unsigned long dbg_odd_nth;
+static unsigned long dbg_odd_cnt;
+
+module_param(dbg_odd_nth, ulong, 0644);
+module_param(dbg_odd_cnt, ulong, 0644);
+
/*
* Targeted preemption latency for CPU-bound tasks:
*
@@ -7413,6 +7420,149 @@ static inline void update_sg_lb_stats(st
sgs->group_type = group_classify(group, sgs);
}

+static int count_schb(struct rq *rq)
+{
+ unsigned long flags;
+ struct task_struct *p;
+ int cnt = 0;
+
+ raw_spin_lock_irqsave(&rq->lock, flags);
+
+ list_for_each_entry(p, &rq->cfs_tasks, se.group_node)
+ if (!strncmp(p->comm, "schbench", 8))
+ cnt++;
+
+ raw_spin_unlock_irqrestore(&rq->lock, flags);
+
+ return cnt;
+}
+
+static bool sg_has_two_schb(struct sched_group *sg)
+{
+ int cpu;
+
+ for_each_cpu(cpu, sched_group_cpus(sg))
+ if (count_schb(cpu_rq(cpu)) >= 2)
+ return true;
+ return false;
+}
+
+static DEFINE_PER_CPU(char [PAGE_SIZE], odd_buf);
+
+#define lbw(x) (int)((x) / NICE_0_LOAD), (int)(((x) % NICE_0_LOAD) * 1000 / NICE_0_LOAD)
+#define lba(x) (int)((scale_load(x)) / NICE_0_LOAD), (int)(((scale_load(x)) % NICE_0_LOAD) * 1000 / NICE_0_LOAD)
+
+static int odd_append_se(struct sched_entity *se, const char *postfix,
+ int cnt, char *buf, size_t size)
+{
+#define odd_append(fmt, args...) do { \
+ cnt += scnprintf(buf + cnt, size - cnt, fmt, ##args); \
+ cnt = min_t(int, cnt, size); \
+} while (0)
+
+ if (entity_is_task(se)) {
+ struct task_struct *task = task_of(se);
+ odd_append(" %s(%d%s)", task->comm, task->pid, postfix);
+ } else {
+ char nbuf[64];
+ cgroup_name(se->my_q->tg->css.cgroup, nbuf, sizeof(nbuf));
+ odd_append(" %s(%s)", nbuf, postfix);
+ }
+ odd_append(":w=%d.%03d,l=%d.%03d,u=%d.%03d",
+ lbw(se->load.weight),
+ lba(se->avg.load_avg),
+ lba(se->avg.util_avg));
+
+ return cnt;
+}
+
+static void dbg_odd_dump(const char *pref,
+ struct sched_group *sg, struct sg_lb_stats *sgs)
+{
+ int cpu;
+
+ trace_printk("%sgrp=%*pbl w=%u avg=%d.%03d grp=%d.%03d sum=%d.%03d pertask=%d.%03d\n", pref,
+ cpumask_pr_args(sched_group_cpus(sg)), sg->group_weight,
+ lba(sgs->avg_load), lba(sgs->group_load),
+ lba(sgs->sum_weighted_load), lba(sgs->load_per_task));
+ trace_printk("%sgcap=%d.%03d gutil=%d.%03d run=%u idle=%u gwt=%u type=%d nocap=%d\n",
+ pref,
+ lba(sgs->group_capacity), lba(sgs->group_util),
+ sgs->sum_nr_running, sgs->idle_cpus, sgs->group_weight,
+ sgs->group_type, sgs->group_no_capacity);
+
+ for_each_cpu(cpu, sched_group_cpus(sg)) {
+ struct task_group *tg;
+ unsigned long flags;
+
+ trace_printk("%sCPU%03d: run=%u schb=%d\n", pref, cpu,
+ cpu_rq(cpu)->nr_running, count_schb(cpu_rq(cpu)));
+
+ raw_spin_lock_irqsave(&cpu_rq(cpu)->lock, flags);
+
+ list_for_each_entry_rcu(tg, &task_groups, list) {
+ struct cfs_rq *cfs_rq = tg->cfs_rq[cpu];
+ char qname[32] = "root";
+ int depth = 0;
+ long tg_weight = 0, tg_shares = 0;
+ struct sched_entity *se;
+ char *buf = per_cpu_ptr(odd_buf, cpu);
+ int cnt;
+
+ if (!cfs_rq->nr_running)
+ continue;
+
+ if (cfs_rq->tg) {
+ cgroup_name(cfs_rq->tg->css.cgroup, qname, sizeof(qname));
+ if (cfs_rq->tg->se[cpu])
+ depth = cfs_rq->tg->se[cpu]->depth;
+ tg_weight = atomic_long_read(&cfs_rq->tg->load_avg);
+ tg_shares = cfs_rq->tg->shares;
+ }
+
+ trace_printk("%sQ%03d-%s@%d: w=%d.%03d,l=%d.%03d,u=%d.%03d,r=%d.%03d run=%u hrun=%u tgs=%d.%03d tgw=%d.%03d\n",
+ pref, cpu, qname, depth,
+ lbw(cfs_rq->load.weight),
+ lba(cfs_rq->avg.load_avg),
+ lba(cfs_rq->avg.util_avg),
+ lba(cfs_rq->runnable_load_avg),
+ cfs_rq->nr_running, cfs_rq->h_nr_running,
+ lbw(tg_shares),
+ lba(tg_weight));
+
+ buf[0] = '\0';
+ cnt = 0;
+
+ if (cfs_rq->curr)
+ cnt = odd_append_se(cfs_rq->curr, "C", cnt, buf, PAGE_SIZE);
+
+ for (se = __pick_first_entity(cfs_rq); se;
+ se = __pick_next_entity(se))
+ cnt = odd_append_se(se, "", cnt, buf, PAGE_SIZE);
+
+ trace_printk("%sQ%03d-%s@%d: %s\n",
+ pref, cpu, qname, depth, buf);
+ }
+
+ raw_spin_unlock_irqrestore(&cpu_rq(cpu)->lock, flags);
+ }
+}
+
+/* a has >= 2 dts, b doesn't */
+static void dbg_odd(struct lb_env *env,
+ struct sched_group *sga, struct sg_lb_stats *sgsa,
+ struct sched_group *sgb, struct sg_lb_stats *sgsb)
+{
+ if (dbg_odd_nth && (dbg_odd_cnt++ % dbg_odd_nth))
+ return;
+
+ trace_printk("odd: dst=%d idle=%d brk=%u lbtgt=%*pbl type=%d\n",
+ env->dst_cpu, env->idle, env->loop_break,
+ cpumask_pr_args(env->cpus), env->fbq_type);
+ dbg_odd_dump("A: ", sga, sgsa);
+ dbg_odd_dump("B: ", sgb, sgsb);
+}
+
/**
* update_sd_pick_busiest - return 1 on busiest group
* @env: The load balancing environment.
@@ -7432,6 +7582,8 @@ static bool update_sd_pick_busiest(struc
struct sg_lb_stats *sgs)
{
struct sg_lb_stats *busiest = &sds->busiest_stat;
+ bool busiest_has_two = sds->busiest && sg_has_two_schb(sds->busiest);
+ bool sg_has_two = sg_has_two_schb(sg);

if (sgs->group_type > busiest->group_type)
return true;
@@ -7439,8 +7591,14 @@ static bool update_sd_pick_busiest(struc
if (sgs->group_type < busiest->group_type)
return false;

- if (sgs->avg_load <= busiest->avg_load)
+ if (sgs->avg_load <= busiest->avg_load) {
+ if (sg_has_two && !busiest_has_two)
+ dbg_odd(env, sg, sgs, sds->busiest, busiest);
return false;
+ }
+
+ if (!sg_has_two && busiest_has_two)
+ dbg_odd(env, sds->busiest, busiest, sg, sgs);

if (!(env->sd->flags & SD_ASYM_CPUCAPACITY))
goto asym_packing;