[PATCH 2/4] sched/fair: Check a task has a fitting cpu when updating misfit

From: Qais Yousef
Date: Sun Feb 04 2024 - 21:08:25 EST


If a misfit task is affined to a subset of the possible cpus, we need to
verify that one of these cpus can fit it. Otherwise the load balancer
code will keep triggering needlessly, causing the balance_interval to
increase in return. Eventually we end up in a situation where real
imbalances take a long time to address because of this impossible
imbalance.

This can happen in Android world where it's common for background tasks
to be restricted to little cores.

Similarly, if the task doesn't fit even on the biggest core, triggering
misfit is pointless as it is the best we can ever get on this system.

To be able to detect that, we use asym_cap_list to iterate through the
capacities in the system to see if the task is able to run at a higher
capacity level based on its p->cpus_ptr. We do that when the affinity
changes, a fair task is forked, or a task is switched to the fair policy.
We store the max_allowed_capacity in task_struct to allow for cheap
comparison in the fast path.

If cpu hotplug causes a capacity level to disappear, we will update the
max_allowed_capacity accordingly.

Improve the check_misfit_status() function by removing redundant checks.
misfit_task_load will be 0 if the task can't move to a bigger CPU, and
nohz_balancer_kick() already checks check_cpu_capacity() before
calling check_misfit_status().

Test:
=====

Add

trace_printk("balance_interval = %lu\n", interval)

in get_sd_balance_interval().

run
if [ "$MASK" != "0" ]; then
adb shell "taskset -a $MASK cat /dev/zero > /dev/null"
fi
sleep 10
// parse ftrace buffer counting the occurrence of each value

Where MASK is either:

* 0: no busy task running
* 1: busy task is pinned to 1 cpu; handled today to not cause
misfit
* f: busy task pinned to little cores, simulates busy background
task, demonstrates the problem to be fixed

Results:
========

Note how occurrence of balance_interval = 128 overshoots for MASK = f.

BEFORE
------

MASK=0

1 balance_interval = 175
120 balance_interval = 128
846 balance_interval = 64
55 balance_interval = 63
215 balance_interval = 32
2 balance_interval = 31
2 balance_interval = 16
4 balance_interval = 8
1870 balance_interval = 4
65 balance_interval = 2

MASK=1

27 balance_interval = 175
37 balance_interval = 127
840 balance_interval = 64
167 balance_interval = 63
449 balance_interval = 32
84 balance_interval = 31
304 balance_interval = 16
1156 balance_interval = 8
2781 balance_interval = 4
428 balance_interval = 2

MASK=f

1 balance_interval = 175
1328 balance_interval = 128
44 balance_interval = 64
101 balance_interval = 63
25 balance_interval = 32
5 balance_interval = 31
23 balance_interval = 16
23 balance_interval = 8
4306 balance_interval = 4
177 balance_interval = 2

AFTER
-----

Note how the high values almost disappear for all MASK values. The
system has background tasks that could trigger the problem, without
simulating it, even with MASK=0.

MASK=0

103 balance_interval = 63
19 balance_interval = 31
194 balance_interval = 8
4827 balance_interval = 4
179 balance_interval = 2

MASK=1

131 balance_interval = 63
1 balance_interval = 31
87 balance_interval = 8
3600 balance_interval = 4
7 balance_interval = 2

MASK=f

8 balance_interval = 127
182 balance_interval = 63
3 balance_interval = 31
9 balance_interval = 16
415 balance_interval = 8
3415 balance_interval = 4
21 balance_interval = 2

Signed-off-by: Qais Yousef <qyousef@xxxxxxxxxxx>
---
include/linux/sched.h | 1 +
init/init_task.c | 1 +
kernel/sched/fair.c | 134 +++++++++++++++++++++++++++++++++++++-----
3 files changed, 120 insertions(+), 16 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index ffe8f618ab86..774cddbeab09 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -835,6 +835,7 @@ struct task_struct {
#endif

unsigned int policy;
+ unsigned long max_allowed_capacity;
int nr_cpus_allowed;
const cpumask_t *cpus_ptr;
cpumask_t *user_cpus_ptr;
diff --git a/init/init_task.c b/init/init_task.c
index 7ecb458eb3da..b3dbab4c959e 100644
--- a/init/init_task.c
+++ b/init/init_task.c
@@ -77,6 +77,7 @@ struct task_struct init_task __aligned(L1_CACHE_BYTES) = {
.cpus_ptr = &init_task.cpus_mask,
.user_cpus_ptr = NULL,
.cpus_mask = CPU_MASK_ALL,
+ .max_allowed_capacity = SCHED_CAPACITY_SCALE,
.nr_cpus_allowed= NR_CPUS,
.mm = NULL,
.active_mm = &init_mm,
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 8e30e2bb77a0..9a9ddf611ffe 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5092,15 +5092,19 @@ static inline int task_fits_cpu(struct task_struct *p, int cpu)

static inline void update_misfit_status(struct task_struct *p, struct rq *rq)
{
+ int cpu = cpu_of(rq);
+
if (!sched_asym_cpucap_active())
return;

- if (!p || p->nr_cpus_allowed == 1) {
- rq->misfit_task_load = 0;
- return;
- }
+ /*
+ * Affinity allows us to go somewhere higher? Or are we on biggest
+ * available CPU already? Or do we fit into this CPU ?
+ */
+ if (!p || (p->nr_cpus_allowed == 1) ||
+ (arch_scale_cpu_capacity(cpu) == p->max_allowed_capacity) ||
+ task_fits_cpu(p, cpu)) {

- if (task_fits_cpu(p, cpu_of(rq))) {
rq->misfit_task_load = 0;
return;
}
@@ -8241,6 +8245,100 @@ static void task_dead_fair(struct task_struct *p)
remove_entity_load_avg(&p->se);
}

+/*
+ * Set the max capacity the task is allowed to run at for misfit detection.
+ */
+static void set_task_max_allowed_capacity(struct task_struct *p, bool hotplug)
+{
+ struct asym_cap_data *entry;
+
+ if (!hotplug && !sched_asym_cpucap_active())
+ return;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(entry, &asym_cap_list, link) {
+ cpumask_t *cpumask;
+
+ cpumask = cpu_capacity_span(entry);
+ if (!cpumask_intersects(cpu_active_mask, cpumask))
+ continue;
+ if (!cpumask_intersects(p->cpus_ptr, cpumask))
+ continue;
+
+ p->max_allowed_capacity = entry->capacity;
+ break;
+ }
+ rcu_read_unlock();
+}
+
+static void __update_tasks_max_allowed_capacity(unsigned long capacity,
+ cpumask_t *cpumask,
+ bool online)
+{
+ struct task_struct *g, *p;
+
+ for_each_process_thread(g, p) {
+ if (p->sched_class != &fair_sched_class)
+ continue;
+
+ if (!cpumask_intersects(p->cpus_ptr, cpumask))
+ continue;
+
+ /*
+ * Should we expand if a capacity level re-appeared?
+ * Or should we shrink if a capacity level disappeared?
+ */
+ if ((online && p->max_allowed_capacity < capacity) ||
+ (!online && p->max_allowed_capacity == capacity))
+ set_task_max_allowed_capacity(p, true);
+ }
+}
+
+/*
+ * Handle a cpu going online/offline changing the available capacity levels.
+ */
+static void update_tasks_max_allowed_capacity(int cpu, bool online)
+{
+ struct asym_cap_data *entry;
+ bool do_update = false;
+ cpumask_t *cpumask;
+
+ /*
+ * We can't check for sched_asym_cpucap_active() here as we can't
+ * differentiate when an online operation will enable the key.
+ */
+
+ if (cpuhp_tasks_frozen)
+ return;
+
+ rcu_read_lock();
+ /* Did a capacity level appear/disappear? */
+ list_for_each_entry_rcu(entry, &asym_cap_list, link) {
+ unsigned int nr_active;
+
+ cpumask = cpu_capacity_span(entry);
+
+ if (!cpumask_test_cpu(cpu, cpumask))
+ continue;
+
+ nr_active = cpumask_weight_and(cpu_active_mask, cpumask);
+ if (online)
+ do_update = nr_active == 1;
+ else
+ do_update = !nr_active;
+ break;
+ }
+ if (do_update)
+ __update_tasks_max_allowed_capacity(entry->capacity, cpumask, online);
+ rcu_read_unlock();
+}
+
+static void set_cpus_allowed_fair(struct task_struct *p, struct affinity_context *ctx)
+{
+ set_cpus_allowed_common(p, ctx);
+ set_task_max_allowed_capacity(p, false);
+}
+
static int
balance_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
{
@@ -8249,6 +8347,8 @@ balance_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)

return newidle_balance(rq, rf) != 0;
}
+#else
+static inline void set_task_max_allowed_capacity(struct task_struct *p, bool hotplug) {}
#endif /* CONFIG_SMP */

static void set_next_buddy(struct sched_entity *se)
@@ -9601,16 +9701,10 @@ check_cpu_capacity(struct rq *rq, struct sched_domain *sd)
(arch_scale_cpu_capacity(cpu_of(rq)) * 100));
}

-/*
- * Check whether a rq has a misfit task and if it looks like we can actually
- * help that task: we can migrate the task to a CPU of higher capacity, or
- * the task's current CPU is heavily pressured.
- */
-static inline int check_misfit_status(struct rq *rq, struct sched_domain *sd)
+/* Check if the rq has a misfit task */
+static inline bool check_misfit_status(struct rq *rq)
{
- return rq->misfit_task_load &&
- (arch_scale_cpu_capacity(rq->cpu) < rq->rd->max_cpu_capacity ||
- check_cpu_capacity(rq, sd));
+ return rq->misfit_task_load;
}

/*
@@ -11922,7 +12016,7 @@ static void nohz_balancer_kick(struct rq *rq)
* When ASYM_CPUCAPACITY; see if there's a higher capacity CPU
* to run the misfit task on.
*/
- if (check_misfit_status(rq, sd)) {
+ if (check_misfit_status(rq)) {
flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
goto unlock;
}
@@ -12461,6 +12555,8 @@ static void rq_online_fair(struct rq *rq)
update_sysctl();

update_runtime_enabled(rq);
+
+ update_tasks_max_allowed_capacity(cpu_of(rq), true);
}

static void rq_offline_fair(struct rq *rq)
@@ -12472,6 +12568,8 @@ static void rq_offline_fair(struct rq *rq)

/* Ensure that we remove rq contribution to group share: */
clear_tg_offline_cfs_rqs(rq);
+
+ update_tasks_max_allowed_capacity(cpu_of(rq), false);
}

#endif /* CONFIG_SMP */
@@ -12645,6 +12743,8 @@ static void task_fork_fair(struct task_struct *p)
rq_lock(rq, &rf);
update_rq_clock(rq);

+ set_task_max_allowed_capacity(p, false);
+
cfs_rq = task_cfs_rq(current);
curr = cfs_rq->curr;
if (curr)
@@ -12768,6 +12868,8 @@ static void switched_to_fair(struct rq *rq, struct task_struct *p)
{
attach_task_cfs_rq(p);

+ set_task_max_allowed_capacity(p, false);
+
if (task_on_rq_queued(p)) {
/*
* We were most likely switched from sched_rt, so
@@ -13139,7 +13241,7 @@ DEFINE_SCHED_CLASS(fair) = {
.rq_offline = rq_offline_fair,

.task_dead = task_dead_fair,
- .set_cpus_allowed = set_cpus_allowed_common,
+ .set_cpus_allowed = set_cpus_allowed_fair,
#endif

.task_tick = task_tick_fair,
--
2.34.1