[PATCH] sched/fair: use dst group while checking imbalance for NUMA balancer

From: Barry Song
Date: Mon Sep 07 2020 - 03:30:23 EST


Something is wrong. In find_busiest_group(), we are checking if src has
higher load, however, in task_numa_find_cpu(), we are checking if dst
will have higher load after balancing. It seems it is not sensible to
check src.
It maybe cause wrong imbalance value, for example, if
dst_running = env->dst_stats.nr_running + 1 results in 3 or above, and
src_running = env->src_stats.nr_running - 1 results in 1;
The current code is thinking imbalance as 0 since src_running is smaller
than 2.
This is inconsistent with load balancer.

Fixes: fb86f5b211 ("sched/numa: Use similar logic to the load balancer for moving between domains with spare capacity")
Cc: Mel Gorman <mgorman@xxxxxxxxxxxxxxxxxxx>
Cc: Peter Zijlstra <a.p.zijlstra@xxxxxxxxx>
Cc: Vincent Guittot <vincent.guittot@xxxxxxxxxx>
Cc: Juri Lelli <juri.lelli@xxxxxxxxxx>
Cc: Dietmar Eggemann <dietmar.eggemann@xxxxxxx>
Cc: Valentin Schneider <valentin.schneider@xxxxxxx>
Cc: Phil Auld <pauld@xxxxxxxxxx>
Cc: Hillf Danton <hdanton@xxxxxxxx>
Cc: Ingo Molnar <mingo@xxxxxxxxxx>
Signed-off-by: Barry Song <song.bao.hua@xxxxxxxxxxxxx>
---
kernel/sched/fair.c | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 1a68a05..90cfee7 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1548,7 +1548,7 @@ struct task_numa_env {

static unsigned long cpu_load(struct rq *rq);
static unsigned long cpu_util(int cpu);
-static inline long adjust_numa_imbalance(int imbalance, int src_nr_running);
+static inline long adjust_numa_imbalance(int imbalance, int nr_running);

static inline enum
numa_type numa_classify(unsigned int imbalance_pct,
@@ -1925,7 +1925,7 @@ static void task_numa_find_cpu(struct task_numa_env *env,
src_running = env->src_stats.nr_running - 1;
dst_running = env->dst_stats.nr_running + 1;
imbalance = max(0, dst_running - src_running);
- imbalance = adjust_numa_imbalance(imbalance, src_running);
+ imbalance = adjust_numa_imbalance(imbalance, dst_running);

/* Use idle CPU if there is no imbalance */
if (!imbalance) {
@@ -8957,7 +8957,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
}
}

-static inline long adjust_numa_imbalance(int imbalance, int src_nr_running)
+static inline long adjust_numa_imbalance(int imbalance, int nr_running)
{
unsigned int imbalance_min;

@@ -8966,7 +8966,7 @@ static inline long adjust_numa_imbalance(int imbalance, int src_nr_running)
* tasks that remain local when the source domain is almost idle.
*/
imbalance_min = 2;
- if (src_nr_running <= imbalance_min)
+ if (nr_running <= imbalance_min)
return 0;

return imbalance;
--
2.7.4