[PATCH 07/10] sched/fair: Account for the idle cpu/smt search cost

From: Mel Gorman
Date: Thu Dec 03 2020 - 09:12:27 EST


select_idle_cpu() accounts average search cost for the purposes of
conducting a limited proportional search if SIS_PROP is enabled. The issue
is that select_idle_cpu() does not account for the cost if a candidate
is found, and the cost of select_idle_smt() is ignored entirely.

This patch moves the accounting of avg_cost to cover the cpu/smt search
costs. select_idle_core() costs could be accounted for but it has its
own throttling mechanism that tracks whether idle cores are expected to
exist.

This patch is a bisection hazard because SIS_PROP and how it balances
avg_cost vs avg_idle was probably guided by the fact that avg_cost was
not always accounted for.

Signed-off-by: Mel Gorman <mgorman@xxxxxxxxxxxxxxxxxxx>
---
kernel/sched/fair.c | 82 +++++++++++++++++++++++++--------------------
1 file changed, 46 insertions(+), 36 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 1d8f5c4b4936..185fc6e28f8e 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6006,6 +6006,29 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p
return new_cpu;
}

+static int sis_search_depth(struct sched_domain *sd, struct sched_domain *this_sd)
+{
+ u64 avg_cost, avg_idle, span_avg;
+ int nr = INT_MAX;
+
+ if (sched_feat(SIS_PROP)) {
+ /*
+ * Due to large variance we need a large fuzz factor; hackbench in
+ * particularly is sensitive here.
+ */
+ avg_idle = this_rq()->avg_idle / 512;
+ avg_cost = this_sd->avg_scan_cost + 1;
+
+ span_avg = sd->span_weight * avg_idle;
+ if (span_avg > 4*avg_cost)
+ nr = div_u64(span_avg, avg_cost);
+ else
+ nr = 4;
+ }
+
+ return nr;
+}
+
#ifdef CONFIG_SCHED_SMT
DEFINE_STATIC_KEY_FALSE(sched_smt_present);
EXPORT_SYMBOL_GPL(sched_smt_present);
@@ -6151,35 +6174,11 @@ static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd
* comparing the average scan cost (tracked in sd->avg_scan_cost) against the
* average idle time for this rq (as found in rq->avg_idle).
*/
-static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int target)
+static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd,
+ int target, int nr)
{
struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
- struct sched_domain *this_sd;
- u64 avg_cost, avg_idle;
- u64 time;
- int this = smp_processor_id();
- int cpu, nr = INT_MAX;
-
- this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc));
- if (!this_sd)
- return -1;
-
- /*
- * Due to large variance we need a large fuzz factor; hackbench in
- * particularly is sensitive here.
- */
- avg_idle = this_rq()->avg_idle / 512;
- avg_cost = this_sd->avg_scan_cost + 1;
-
- if (sched_feat(SIS_PROP)) {
- u64 span_avg = sd->span_weight * avg_idle;
- if (span_avg > 4*avg_cost)
- nr = div_u64(span_avg, avg_cost);
- else
- nr = 4;
- }
-
- time = cpu_clock(this);
+ int cpu;

cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
__cpumask_clear_cpu(target, cpus);
@@ -6192,9 +6191,6 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t
break;
}

- time = cpu_clock(this) - time;
- update_avg(&this_sd->avg_scan_cost, time);
-
return cpu;
}

@@ -6245,9 +6241,10 @@ static inline bool asym_fits_capacity(int task_util, int cpu)
*/
static int select_idle_sibling(struct task_struct *p, int prev, int target)
{
- struct sched_domain *sd;
+ struct sched_domain *sd, *this_sd;
unsigned long task_util;
- int i, recent_used_cpu;
+ int i, recent_used_cpu, depth;
+ u64 time;

schedstat_inc(this_rq()->sis_search);

@@ -6337,21 +6334,34 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
if (!sd)
return target;

+ this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc));
+ if (!this_sd)
+ return target;
+
+ depth = sis_search_depth(sd, this_sd);
+
schedstat_inc(this_rq()->sis_domain_search);
i = select_idle_core(p, sd, target);
if ((unsigned)i < nr_cpumask_bits)
return i;

- i = select_idle_cpu(p, sd, target);
+ time = cpu_clock(smp_processor_id());
+ i = select_idle_cpu(p, sd, target, depth);
if ((unsigned)i < nr_cpumask_bits)
- return i;
+ goto acct_cost;

i = select_idle_smt(p, sd, target);
if ((unsigned)i < nr_cpumask_bits)
- return i;
+ goto acct_cost;

schedstat_inc(this_rq()->sis_failed);
- return target;
+ i = target;
+
+acct_cost:
+ time = cpu_clock(smp_processor_id()) - time;
+ update_avg(&this_sd->avg_scan_cost, time);
+
+ return i;
}

/**
--
2.26.2