[PATCH v2 4.19 1/3] cgroup/cpuset: Change cpuset_rwsem and hotplug lock order

From: Cai Xinchen
Date: Sun Mar 19 2023 - 21:41:32 EST


From: Juri Lelli <juri.lelli@xxxxxxxxxx>

commit d74b27d63a8bebe2fe634944e4ebdc7b10db7a39 upstream.

Commit 1243dc518c9da ("cgroup/cpuset: Convert cpuset_mutex to
percpu_rwsem") is a performance patch that has not been backported to
4.19, so this backport uses cpuset_mutex in place of percpu_rwsem.

Commit aa44002e7db25 ("cpuset: Fix unsafe lock order between
cpuset lock and cpuslock") established the lock order cpuset_mutex ->
cpu_hotplug_lock, so the lock order in cpuset_attach() has to be
changed as well (the hotplug lock is now taken before cpuset_mutex).

original commit message:

cpuset_rwsem is going to be acquired from sched_setscheduler() with a
following patch. There are however paths (e.g., spawn_ksoftirqd) in
which sched_setscheduler() is eventually called while holding hotplug lock;
this creates a dependency between hotplug lock (to be always acquired
first) and cpuset_rwsem (to be always acquired after hotplug lock).

Fix paths which currently take the two locks in the wrong order (after
a following patch is applied).

Tested-by: Dietmar Eggemann <dietmar.eggemann@xxxxxxx>
Signed-off-by: Juri Lelli <juri.lelli@xxxxxxxxxx>
Signed-off-by: Peter Zijlstra (Intel) <peterz@xxxxxxxxxxxxx>
Cc: Linus Torvalds <torvalds@xxxxxxxxxxxxxxxxxxxx>
Cc: Peter Zijlstra <peterz@xxxxxxxxxxxxx>
Cc: Thomas Gleixner <tglx@xxxxxxxxxxxxx>
Cc: bristot@xxxxxxxxxx
Cc: claudio@xxxxxxxxxxxxxxx
Cc: lizefan@xxxxxxxxxx
Cc: longman@xxxxxxxxxx
Cc: luca.abeni@xxxxxxxxxxxxxxx
Cc: mathieu.poirier@xxxxxxxxxx
Cc: rostedt@xxxxxxxxxxx
Cc: tj@xxxxxxxxxx
Cc: tommaso.cucinotta@xxxxxxxxxxxxxxx
Link: https://lkml.kernel.org/r/20190719140000.31694-7-juri.lelli@xxxxxxxxxx
Signed-off-by: Ingo Molnar <mingo@xxxxxxxxxx>
Signed-off-by: Cai Xinchen <caixinchen1@xxxxxxxxxx>
---
v2:
* Move the get_online_cpus()/put_online_cpus() calls in cpuset_attach()
to keep the cpuset_mutex and hotplug lock order consistent
---
include/linux/cpuset.h | 8 ++++----
kernel/cgroup/cpuset.c | 24 +++++++++++++++++-------
2 files changed, 21 insertions(+), 11 deletions(-)

diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
index 934633a05d20..7f1478c26a33 100644
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -40,14 +40,14 @@ static inline bool cpusets_enabled(void)

static inline void cpuset_inc(void)
{
- static_branch_inc(&cpusets_pre_enable_key);
- static_branch_inc(&cpusets_enabled_key);
+ static_branch_inc_cpuslocked(&cpusets_pre_enable_key);
+ static_branch_inc_cpuslocked(&cpusets_enabled_key);
}

static inline void cpuset_dec(void)
{
- static_branch_dec(&cpusets_enabled_key);
- static_branch_dec(&cpusets_pre_enable_key);
+ static_branch_dec_cpuslocked(&cpusets_enabled_key);
+ static_branch_dec_cpuslocked(&cpusets_pre_enable_key);
}

extern int cpuset_init(void);
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index dcd5755b1fe2..7169e47fb48b 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -830,8 +830,8 @@ static void rebuild_sched_domains_locked(void)
cpumask_var_t *doms;
int ndoms;

+ lockdep_assert_cpus_held();
lockdep_assert_held(&cpuset_mutex);
- get_online_cpus();

/*
* We have raced with CPU hotplug. Don't do anything to avoid
@@ -839,15 +839,13 @@ static void rebuild_sched_domains_locked(void)
* Anyways, hotplug work item will rebuild sched domains.
*/
if (!cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask))
- goto out;
+ return;

/* Generate domain masks and attrs */
ndoms = generate_sched_domains(&doms, &attr);

/* Have scheduler rebuild the domains */
partition_sched_domains(ndoms, doms, attr);
-out:
- put_online_cpus();
}
#else /* !CONFIG_SMP */
static void rebuild_sched_domains_locked(void)
@@ -857,9 +855,11 @@ static void rebuild_sched_domains_locked(void)

void rebuild_sched_domains(void)
{
+ get_online_cpus();
mutex_lock(&cpuset_mutex);
rebuild_sched_domains_locked();
mutex_unlock(&cpuset_mutex);
+ put_online_cpus();
}

/**
@@ -1528,13 +1528,13 @@ static void cpuset_attach(struct cgroup_taskset *tset)
cgroup_taskset_first(tset, &css);
cs = css_cs(css);

- mutex_lock(&cpuset_mutex);
-
/*
* It should hold cpus lock because a cpu offline event can
* cause set_cpus_allowed_ptr() failed.
*/
get_online_cpus();
+ mutex_lock(&cpuset_mutex);
+
/* prepare for attach */
if (cs == &top_cpuset)
cpumask_copy(cpus_attach, cpu_possible_mask);
@@ -1553,7 +1553,6 @@ static void cpuset_attach(struct cgroup_taskset *tset)
cpuset_change_task_nodemask(task, &cpuset_attach_nodemask_to);
cpuset_update_task_spread_flag(cs, task);
}
- put_online_cpus();

/*
* Change mm for all threadgroup leaders. This is expensive and may
@@ -1589,6 +1588,7 @@ static void cpuset_attach(struct cgroup_taskset *tset)
wake_up(&cpuset_attach_wq);

mutex_unlock(&cpuset_mutex);
+ put_online_cpus();
}

/* The various types of files and directories in a cpuset file system */
@@ -1617,6 +1617,7 @@ static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft,
cpuset_filetype_t type = cft->private;
int retval = 0;

+ get_online_cpus();
mutex_lock(&cpuset_mutex);
if (!is_cpuset_online(cs)) {
retval = -ENODEV;
@@ -1654,6 +1655,7 @@ static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft,
}
out_unlock:
mutex_unlock(&cpuset_mutex);
+ put_online_cpus();
return retval;
}

@@ -1664,6 +1666,7 @@ static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft,
cpuset_filetype_t type = cft->private;
int retval = -ENODEV;

+ get_online_cpus();
mutex_lock(&cpuset_mutex);
if (!is_cpuset_online(cs))
goto out_unlock;
@@ -1678,6 +1681,7 @@ static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft,
}
out_unlock:
mutex_unlock(&cpuset_mutex);
+ put_online_cpus();
return retval;
}

@@ -1716,6 +1720,7 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
kernfs_break_active_protection(of->kn);
flush_work(&cpuset_hotplug_work);

+ get_online_cpus();
mutex_lock(&cpuset_mutex);
if (!is_cpuset_online(cs))
goto out_unlock;
@@ -1741,6 +1746,7 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
free_trial_cpuset(trialcs);
out_unlock:
mutex_unlock(&cpuset_mutex);
+ put_online_cpus();
kernfs_unbreak_active_protection(of->kn);
css_put(&cs->css);
flush_workqueue(cpuset_migrate_mm_wq);
@@ -1985,6 +1991,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
if (!parent)
return 0;

+ get_online_cpus();
mutex_lock(&cpuset_mutex);

set_bit(CS_ONLINE, &cs->flags);
@@ -2035,6 +2042,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
spin_unlock_irq(&callback_lock);
out_unlock:
mutex_unlock(&cpuset_mutex);
+ put_online_cpus();
return 0;
}

@@ -2048,6 +2056,7 @@ static void cpuset_css_offline(struct cgroup_subsys_state *css)
{
struct cpuset *cs = css_cs(css);

+ get_online_cpus();
mutex_lock(&cpuset_mutex);

if (is_sched_load_balance(cs))
@@ -2057,6 +2066,7 @@ static void cpuset_css_offline(struct cgroup_subsys_state *css)
clear_bit(CS_ONLINE, &cs->flags);

mutex_unlock(&cpuset_mutex);
+ put_online_cpus();
}

static void cpuset_css_free(struct cgroup_subsys_state *css)
--
2.17.1