[RFC PATCH v4 24/28] sched: Record average number of running tasks per process

From: Chen Yu
Date: Sat Aug 09 2025 - 01:14:40 EST


A performance regression was found when running hackbench with many
threads per process (that is, with a high fd number). To avoid this
regression, a process with a large number of active threads should be
excluded from cache-aware scheduling.

With sched_cache enabled, record the number of active threads within
the process. This calculation occurs in the periodic task_cache_work():
while iterating over the CPUs, check the task currently running on each
CPU; if that task belongs to the same process as the task that launched
task_cache_work(), increment the active thread count by 1.

If this count exceeds the number of CPUs in the preferred LLC,
sched_cache is prevented from aggregating too many threads into one
LLC domain.
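
A rough sketch of how the recorded average could be consumed (the
helper name and call site below are illustrative only and are not part
of this patch; sd_llc_size, mm_sched_cpu and nr_running_avg are the
existing/added fields referenced by this series):

	/*
	 * Illustrative only: skip cache-aware aggregation when the
	 * process, on average, runs more threads than the preferred
	 * LLC has CPUs.
	 */
	static bool mm_fits_pref_llc(struct mm_struct *mm)
	{
		int llc_size;

		if (mm->mm_sched_cpu < 0)
			return true;

		llc_size = per_cpu(sd_llc_size, mm->mm_sched_cpu);

		return mm->nr_running_avg <= llc_size;
	}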

Reported-by: K Prateek Nayak <kprateek.nayak@xxxxxxx>
Signed-off-by: Chen Yu <yu.c.chen@xxxxxxxxx>
---
include/linux/mm_types.h | 1 +
kernel/sched/fair.c | 14 ++++++++++++--
2 files changed, 13 insertions(+), 2 deletions(-)

diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 41a598a44361..13b715357ccb 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -1033,6 +1033,7 @@ struct mm_struct {
raw_spinlock_t mm_sched_lock;
unsigned long mm_sched_epoch;
int mm_sched_cpu;
+ u64 nr_running_avg;
#endif

#ifdef CONFIG_MMU
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 420d3a080990..2577b4225c3f 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1414,12 +1414,13 @@ static void get_scan_cpumasks(cpumask_var_t cpus, int cache_cpu,

static void __no_profile task_cache_work(struct callback_head *work)
{
- struct task_struct *p = current;
+ struct task_struct *p = current, *cur;
struct mm_struct *mm = p->mm;
unsigned long m_a_occ = 0;
unsigned long last_m_a_occ = 0;
int cpu, m_a_cpu = -1, cache_cpu,
- pref_nid = NUMA_NO_NODE, curr_cpu = smp_processor_id();
+ pref_nid = NUMA_NO_NODE, curr_cpu = smp_processor_id(),
+ nr_running = 0;
cpumask_var_t cpus;

WARN_ON_ONCE(work != &p->cache_work);
@@ -1460,6 +1461,14 @@ static void __no_profile task_cache_work(struct callback_head *work)
m_cpu = i;
}
nr++;
+
+ rcu_read_lock();
+ cur = rcu_dereference(cpu_rq(i)->curr);
+ if (cur && !(cur->flags & (PF_EXITING | PF_KTHREAD)) &&
+ cur->mm == mm)
+ nr_running++;
+ rcu_read_unlock();
+
trace_printk("(%d) occ: %ld m_occ: %ld m_cpu: %d nr: %d\n",
per_cpu(sd_llc_id, i), occ, m_occ, m_cpu, nr);
}
@@ -1489,6 +1498,7 @@ static void __no_profile task_cache_work(struct callback_head *work)
mm->mm_sched_cpu = m_a_cpu;
}

+ update_avg(&mm->nr_running_avg, nr_running);
free_cpumask_var(cpus);
}

--
2.25.1