[PATCH 1/2] cgroup/cpuset: Use cpuset_rwsem read lock in cpuset_can_attach()

From: Waiman Long
Date: Thu Dec 08 2022 - 14:58:12 EST


Since commit 1243dc518c9d ("cgroup/cpuset: Convert cpuset_mutex to
percpu_rwsem"), cpuset_mutex is changed to cpuset_rwsem. This has the
undesirable side effect of increasing the latency to take cpuset_rwsem
write lock, potentially up to a RCU grace period later which can be
especially problematic on systems with a large number of CPU cores.

One particular pain point is moving a task from one cpuset to another
one. The locking sequence for such a task migration is as follow:

cgroup_mutex => cgroup_threadgroup_rwsem => cpuset_rwsem

The cpuset_rwsem write lock has to be taken twice - at both
cpuset_can_attach() and cpuset_attach(). This can create significant
delay in blocking the fork/exit path while the task migration is in
progress.

Reduce that latency by using cpuset_rwsem read lock in
cpuset_can_attach() and cpuset_cancel_attach() as they don't need to
change anything in the cpuset except attach_in_progress which is now
changed to an atomic_t type to allow proper concurrent update while
holding cpuset_rwsem read lock. The attach_in_progress field is only read
while holding cpuset_rwsem write lock avoiding possible race condition.

Signed-off-by: Waiman Long <longman@xxxxxxxxxx>
---
kernel/cgroup/cpuset.c | 39 ++++++++++++++++++++++-----------------
1 file changed, 22 insertions(+), 17 deletions(-)

diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index b474289c15b8..800c65de5daa 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -171,7 +171,7 @@ struct cpuset {
* Tasks are being attached to this cpuset. Used to prevent
* zeroing cpus/mems_allowed between ->can_attach() and ->attach().
*/
- int attach_in_progress;
+ atomic_t attach_in_progress;

/* partition number for rebuild_sched_domains() */
int pn;
@@ -369,11 +369,14 @@ static struct cpuset top_cpuset = {
* There are two global locks guarding cpuset structures - cpuset_rwsem and
* callback_lock. We also require taking task_lock() when dereferencing a
* task's cpuset pointer. See "The task_lock() exception", at the end of this
- * comment. The cpuset code uses only cpuset_rwsem write lock. Other
- * kernel subsystems can use cpuset_read_lock()/cpuset_read_unlock() to
- * prevent change to cpuset structures.
- *
- * A task must hold both locks to modify cpusets. If a task holds
+ * comment. The cpuset code uses mostly cpuset_rwsem write lock with the
+ * exception of cpuset_can_attach() and cpuset_cancel_attach(). Other kernel
+ * subsystems can use cpuset_read_lock()/cpuset_read_unlock() to prevent
+ * change to cpuset structures.
+ *
+ * A task must hold both locks to modify cpusets except attach_in_progress
+ * which can be modified by either holding cpuset_rwsem read or write
+ * lock and to be read under cpuset_rwsem write lock. If a task holds
* cpuset_rwsem, it blocks others wanting that rwsem, ensuring that it
* is the only task able to also acquire callback_lock and be able to
* modify cpusets. It can perform various checks on the cpuset structure
@@ -746,7 +749,8 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial)
* be changed to have empty cpus_allowed or mems_allowed.
*/
ret = -ENOSPC;
- if ((cgroup_is_populated(cur->css.cgroup) || cur->attach_in_progress)) {
+ if ((cgroup_is_populated(cur->css.cgroup) ||
+ atomic_read(&cur->attach_in_progress))) {
if (!cpumask_empty(cur->cpus_allowed) &&
cpumask_empty(trial->cpus_allowed))
goto out;
@@ -2448,7 +2452,7 @@ static int cpuset_can_attach(struct cgroup_taskset *tset)
cpuset_attach_old_cs = task_cs(cgroup_taskset_first(tset, &css));
cs = css_cs(css);

- percpu_down_write(&cpuset_rwsem);
+ percpu_down_read(&cpuset_rwsem);

/* allow moving tasks into an empty cpuset if on default hierarchy */
ret = -ENOSPC;
@@ -2475,10 +2479,10 @@ static int cpuset_can_attach(struct cgroup_taskset *tset)
* Mark attach is in progress. This makes validate_change() fail
* changes which zero cpus/mems_allowed.
*/
- cs->attach_in_progress++;
+ atomic_inc(&cs->attach_in_progress);
ret = 0;
out_unlock:
- percpu_up_write(&cpuset_rwsem);
+ percpu_up_read(&cpuset_rwsem);
return ret;
}

@@ -2488,9 +2492,9 @@ static void cpuset_cancel_attach(struct cgroup_taskset *tset)

cgroup_taskset_first(tset, &css);

- percpu_down_write(&cpuset_rwsem);
- css_cs(css)->attach_in_progress--;
- percpu_up_write(&cpuset_rwsem);
+ percpu_down_read(&cpuset_rwsem);
+ atomic_dec(&css_cs(css)->attach_in_progress);
+ percpu_up_read(&cpuset_rwsem);
}

/*
@@ -2562,8 +2566,8 @@ static void cpuset_attach(struct cgroup_taskset *tset)

cs->old_mems_allowed = cpuset_attach_nodemask_to;

- cs->attach_in_progress--;
- if (!cs->attach_in_progress)
+ atomic_inc(&cs->attach_in_progress);
+ if (!atomic_read(&cs->attach_in_progress))
wake_up(&cpuset_attach_wq);

percpu_up_write(&cpuset_rwsem);
@@ -3072,6 +3076,7 @@ cpuset_css_alloc(struct cgroup_subsys_state *parent_css)
nodes_clear(cs->mems_allowed);
nodes_clear(cs->effective_mems);
fmeter_init(&cs->fmeter);
+ atomic_set(&cs->attach_in_progress, 0);
cs->relax_domain_level = -1;

/* Set CS_MEMORY_MIGRATE for default hierarchy */
@@ -3383,7 +3388,7 @@ static void cpuset_hotplug_update_tasks(struct cpuset *cs, struct tmpmasks *tmp)
bool mems_updated;
struct cpuset *parent;
retry:
- wait_event(cpuset_attach_wq, cs->attach_in_progress == 0);
+ wait_event(cpuset_attach_wq, atomic_read(&cs->attach_in_progress) == 0);

percpu_down_write(&cpuset_rwsem);

@@ -3391,7 +3396,7 @@ static void cpuset_hotplug_update_tasks(struct cpuset *cs, struct tmpmasks *tmp)
* We have raced with task attaching. We wait until attaching
* is finished, so we won't attach a task to an empty cpuset.
*/
- if (cs->attach_in_progress) {
+ if (atomic_read(&cs->attach_in_progress)) {
percpu_up_write(&cpuset_rwsem);
goto retry;
}
--
2.31.1