Re: [PATCH v1 1/1] sched/topology: Add lock guard support

From: K Prateek Nayak
Date: Thu Jun 12 2025 - 01:36:00 EST


Hello Jemmy,

Sorry for the delay!

On 6/5/2025 5:34 PM, Jemmy Wong wrote:
This change replaces manual lock acquisition and release with lock guards
to improve code robustness and reduce the risk of lock mismanagement.
No functional changes to the scheduler topology logic are introduced.

Signed-off-by: Jemmy Wong <jemmywong512@xxxxxxxxx>

I have a few comments below, but other than that, I tested the patch with
a combination of hotplug and cpusets running concurrently and did not see
any issues. Feel free to add:

Tested-by: K Prateek Nayak <kprateek.nayak@xxxxxxx>


---
include/linux/sched.h | 11 ++--
kernel/sched/core.c | 6 +--
kernel/sched/debug.c | 13 ++---
kernel/sched/rt.c | 35 ++++++------
kernel/sched/topology.c | 115 +++++++++++++++++-----------------------
5 files changed, 81 insertions(+), 99 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 4f78a64beb52..10a9d6083b72 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -46,6 +46,7 @@
#include <linux/rv.h>
#include <linux/uidgid_types.h>
#include <linux/tracepoint-defs.h>
+#include <linux/mutex.h>
#include <asm/kmap_size.h>
/* task_struct member predeclarations (sorted alphabetically): */
@@ -395,14 +396,14 @@ enum uclamp_id {
UCLAMP_CNT
};
+extern struct mutex sched_domains_mutex;
#ifdef CONFIG_SMP
extern struct root_domain def_root_domain;
-extern struct mutex sched_domains_mutex;
-extern void sched_domains_mutex_lock(void);
-extern void sched_domains_mutex_unlock(void);
+DEFINE_LOCK_GUARD_0(sched_domains_mutex,
+ mutex_lock(&sched_domains_mutex),
+ mutex_unlock(&sched_domains_mutex))
#else
-static inline void sched_domains_mutex_lock(void) { }
-static inline void sched_domains_mutex_unlock(void) { }
+DEFINE_LOCK_GUARD_0(sched_domains_mutex, ,)
#endif
struct sched_param {
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index dce50fa57471..b2b7a0cae95a 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -8457,9 +8457,9 @@ void __init sched_init_smp(void)
* CPU masks are stable and all blatant races in the below code cannot
* happen.
*/
- sched_domains_mutex_lock();
- sched_init_domains(cpu_active_mask);
- sched_domains_mutex_unlock();
+ scoped_guard(sched_domains_mutex) {
+ sched_init_domains(cpu_active_mask);
+ }

Since sched_init_domains() is only called once during init from here,
perhaps you can move the guard() into sched_init_domains() and avoid
the scoped_guard() here in the caller.
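
Something along these lines perhaps (an untested sketch; the existing
body of sched_init_domains() is elided):

int __init sched_init_domains(const struct cpumask *cpu_map)
{
	/* Serialize domain construction for the whole function. */
	guard(sched_domains_mutex)();

	... /* existing topology setup and build_sched_domains() call */
}

and the call site in sched_init_smp() then simply becomes:

	sched_init_domains(cpu_active_mask);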

/* Move init over to a non-isolated CPU */
if (set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_TYPE_DOMAIN)) < 0)
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 9d71baf08075..1c00016fd54c 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -293,8 +293,8 @@ static ssize_t sched_verbose_write(struct file *filp, const char __user *ubuf,
ssize_t result;
bool orig;
- cpus_read_lock();
- sched_domains_mutex_lock();
+ guard(cpus_read_lock)();
+ guard(sched_domains_mutex)();
orig = sched_debug_verbose;
result = debugfs_write_file_bool(filp, ubuf, cnt, ppos);
@@ -306,9 +306,6 @@ static ssize_t sched_verbose_write(struct file *filp, const char __user *ubuf,
sd_dentry = NULL;
}
- sched_domains_mutex_unlock();
- cpus_read_unlock();
-
return result;
}
#else
@@ -517,9 +514,9 @@ static __init int sched_init_debug(void)
debugfs_create_u32("migration_cost_ns", 0644, debugfs_sched, &sysctl_sched_migration_cost);
debugfs_create_u32("nr_migrate", 0644, debugfs_sched, &sysctl_sched_nr_migrate);
- sched_domains_mutex_lock();
- update_sched_domain_debugfs();
- sched_domains_mutex_unlock();
+ scoped_guard(sched_domains_mutex) {
+ update_sched_domain_debugfs();
+ }
#endif
#ifdef CONFIG_NUMA_BALANCING
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index e40422c37033..3533b471b015 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -2920,36 +2920,37 @@ static int sched_rt_handler(const struct ctl_table *table, int write, void *buff
static DEFINE_MUTEX(mutex);
int ret;
- mutex_lock(&mutex);
- sched_domains_mutex_lock();
+ guard(mutex)(&mutex);
+ guard(sched_domains_mutex)();
+
old_period = sysctl_sched_rt_period;
old_runtime = sysctl_sched_rt_runtime;
ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
if (!ret && write) {
- ret = sched_rt_global_validate();
- if (ret)
- goto undo;
+ do {

Instead of this do-while loop that runs once, how about:

static int sched_validate_constraints(void)
{
	int ret;

	ret = sched_rt_global_validate();
	if (ret)
		return ret;

	... /* similarly for sched_dl_global_validate(), sched_rt_global_constraints() */

	sched_rt_do_global();
	sched_dl_do_global();

	return 0;
}

static int sched_rt_handler()
{
	...

	ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
	if (ret || !write)
		return ret;

	ret = sched_validate_constraints();
	if (ret) {
		sysctl_sched_rt_period = old_period;
		sysctl_sched_rt_runtime = old_runtime;
	}

	return ret;
}

+ ret = sched_rt_global_validate();
+ if (ret)
+ break;
- ret = sched_dl_global_validate();
- if (ret)
- goto undo;
+ ret = sched_dl_global_validate();
+ if (ret)
+ break;
- ret = sched_rt_global_constraints();
- if (ret)
- goto undo;
+ ret = sched_rt_global_constraints();
+ if (ret)
+ break;
- sched_rt_do_global();
- sched_dl_do_global();
+ sched_rt_do_global();
+ sched_dl_do_global();
+ } while (0);
}
- if (0) {
-undo:
+
+ if (ret) {

Previously, the "!write" case and an early error return from
proc_dointvec_minmax() wouldn't do this. It doesn't hurt, but it is
wasteful and can be avoided using the pattern from the suggestion above.

sysctl_sched_rt_period = old_period;
sysctl_sched_rt_runtime = old_runtime;
}
- sched_domains_mutex_unlock();
- mutex_unlock(&mutex);
return ret;
}
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index b958fe48e020..3f89f969666c 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -6,14 +6,6 @@
#include <linux/bsearch.h>
DEFINE_MUTEX(sched_domains_mutex);
-void sched_domains_mutex_lock(void)
-{
- mutex_lock(&sched_domains_mutex);
-}
-void sched_domains_mutex_unlock(void)
-{
- mutex_unlock(&sched_domains_mutex);
-}
/* Protected by sched_domains_mutex: */
static cpumask_var_t sched_domains_tmpmask;
@@ -470,44 +462,41 @@ static void free_rootdomain(struct rcu_head *rcu)
void rq_attach_root(struct rq *rq, struct root_domain *rd)
{
struct root_domain *old_rd = NULL;
- struct rq_flags rf;
- rq_lock_irqsave(rq, &rf);
+ scoped_guard(rq_lock_irqsave, rq) {
+ if (rq->rd) {
+ old_rd = rq->rd;
- if (rq->rd) {
- old_rd = rq->rd;
+ if (cpumask_test_cpu(rq->cpu, old_rd->online))
+ set_rq_offline(rq);
+
+ cpumask_clear_cpu(rq->cpu, old_rd->span);
+
+ /*
+ * If we don't want to free the old_rd yet then
+ * set old_rd to NULL to skip the freeing later
+ * in this function:
+ */
+ if (!atomic_dec_and_test(&old_rd->refcount))
+ old_rd = NULL;
+ }
- if (cpumask_test_cpu(rq->cpu, old_rd->online))
- set_rq_offline(rq);
+ atomic_inc(&rd->refcount);
+ rq->rd = rd;
- cpumask_clear_cpu(rq->cpu, old_rd->span);
+ cpumask_set_cpu(rq->cpu, rd->span);
+ if (cpumask_test_cpu(rq->cpu, cpu_active_mask))
+ set_rq_online(rq);
/*
- * If we don't want to free the old_rd yet then
- * set old_rd to NULL to skip the freeing later
- * in this function:
+ * Because the rq is not a task, dl_add_task_root_domain() did not

nit. Perhaps that comment can now be wrapped at 80 characters. No strong
feelings, just that the comment stood out for being pretty long when
reviewing.
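
i.e. something like:

	/*
	 * Because the rq is not a task, dl_add_task_root_domain() did
	 * not move the fair server bw to the rd if it already started.
	 * Add it now.
	 */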

+ * move the fair server bw to the rd if it already started.
+ * Add it now.
*/
- if (!atomic_dec_and_test(&old_rd->refcount))
- old_rd = NULL;
+ if (rq->fair_server.dl_server)
+ __dl_server_attach_root(&rq->fair_server, rq);
}

--
Thanks and Regards,
Prateek