[PATCH v6 4/5] sched: Handle set_cpus_allowed_ptr() & sched_setaffinity() race

From: Waiman Long
Date: Thu Aug 25 2022 - 21:02:15 EST


Races are possible between set_cpus_allowed_ptr() and sched_setaffinity(),
or between multiple sched_setaffinity() calls from different CPUs. To
resolve these race conditions, we need to update both user_cpus_ptr
and cpus_mask in a single lock critical section instead of in separate
ones. This requires moving the user_cpus_ptr update into
affine_move_task(), before it calls task_rq_unlock().

A new argument puser_mask is added to affine_move_task(),
__set_cpus_allowed_ptr_locked() and __set_cpus_allowed_ptr() to do that.

Ideally, user_cpus_ptr should only be updated if sched_setaffinity()
is successful. However, this patch updates user_cpus_ptr as soon as the
first call to __set_cpus_allowed_ptr() succeeds. If sched_setaffinity()
races with a cpuset update, the subsequent calls to
__set_cpus_allowed_ptr() may fail, but user_cpus_ptr will still have
been updated in this corner case.

Signed-off-by: Waiman Long <longman@xxxxxxxxxx>
---
kernel/sched/core.c | 66 ++++++++++++++++++++++++++++-----------------
1 file changed, 42 insertions(+), 24 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 1c2f548e5369..6cd1177fbcea 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2199,7 +2199,7 @@ __do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask, u32

static int __set_cpus_allowed_ptr(struct task_struct *p,
const struct cpumask *new_mask,
- u32 flags);
+ u32 flags, struct cpumask **puser_mask);

static void migrate_disable_switch(struct rq *rq, struct task_struct *p)
{
@@ -2249,7 +2249,7 @@ void migrate_enable(void)
*/
preempt_disable();
if (p->cpus_ptr != &p->cpus_mask)
- __set_cpus_allowed_ptr(p, &p->cpus_mask, SCA_MIGRATE_ENABLE);
+ __set_cpus_allowed_ptr(p, &p->cpus_mask, SCA_MIGRATE_ENABLE, NULL);
/*
* Mustn't clear migration_disabled() until cpus_ptr points back at the
* regular cpus_mask, otherwise things that race (eg.
@@ -2618,6 +2618,15 @@ void release_user_cpus_ptr(struct task_struct *p)
kfree(clear_user_cpus_ptr(p));
}

+static inline void swap_user_cpus_ptr(struct task_struct *p,
+ struct cpumask **puser_mask)
+{
+ if (!puser_mask)
+ return;
+
+ swap(p->user_cpus_ptr, *puser_mask);
+}
+
/*
* This function is wildly self concurrent; here be dragons.
*
@@ -2693,9 +2702,12 @@ void release_user_cpus_ptr(struct task_struct *p)
* Note that the above is safe vs a concurrent migrate_enable(), as any
* pending affinity completion is preceded by an uninstallation of
* p->migration_pending done with p->pi_lock held.
+ *
+ * If puser_mask is non-NULL, the mask it points to is swapped with the
+ * current user_cpus_ptr value if the operation succeeds.
*/
static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flags *rf,
- int dest_cpu, unsigned int flags)
+ int dest_cpu, unsigned int flags, struct cpumask **puser_mask)
__releases(rq->lock)
__releases(p->pi_lock)
{
@@ -2722,6 +2734,7 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag
complete = true;
}

+ swap_user_cpus_ptr(p, puser_mask);
task_rq_unlock(rq, p, rf);

if (push_task) {
@@ -2793,6 +2806,7 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag
if (flags & SCA_MIGRATE_ENABLE)
p->migration_flags &= ~MDF_PUSH;

+ swap_user_cpus_ptr(p, puser_mask);
task_rq_unlock(rq, p, rf);

if (!stop_pending) {
@@ -2813,6 +2827,8 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag
complete = true;
}
}
+
+ swap_user_cpus_ptr(p, puser_mask);
task_rq_unlock(rq, p, rf);

if (complete)
@@ -2843,7 +2859,8 @@ static int __set_cpus_allowed_ptr_locked(struct task_struct *p,
const struct cpumask *new_mask,
u32 flags,
struct rq *rq,
- struct rq_flags *rf)
+ struct rq_flags *rf,
+ struct cpumask **puser_mask)
__releases(rq->lock)
__releases(p->pi_lock)
{
@@ -2908,7 +2925,7 @@ static int __set_cpus_allowed_ptr_locked(struct task_struct *p,

__do_set_cpus_allowed(p, new_mask, flags);

- return affine_move_task(rq, p, rf, dest_cpu, flags);
+ return affine_move_task(rq, p, rf, dest_cpu, flags, puser_mask);

out:
task_rq_unlock(rq, p, rf);
@@ -2926,7 +2943,8 @@ static int __set_cpus_allowed_ptr_locked(struct task_struct *p,
* call is not atomic; no spinlocks may be held.
*/
static int __set_cpus_allowed_ptr(struct task_struct *p,
- const struct cpumask *new_mask, u32 flags)
+ const struct cpumask *new_mask, u32 flags,
+ struct cpumask **puser_mask)
{
struct cpumask *alloc_mask = NULL;
struct rq_flags rf;
@@ -2934,8 +2952,11 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
int ret;

rq = task_rq_lock(p, &rf);
- if (p->user_cpus_ptr) {

+ /*
+ * user_cpus_ptr masking is skipped if puser_mask is defined.
+ */
+ if (p->user_cpus_ptr && !puser_mask) {
/*
* A scratch cpumask is allocated on the percpu runqueues
* to enable additional masking with user_cpus_ptr. This
@@ -2958,7 +2979,8 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
}


- ret = __set_cpus_allowed_ptr_locked(p, new_mask, flags, rq, &rf);
+ ret = __set_cpus_allowed_ptr_locked(p, new_mask, flags, rq, &rf,
+ puser_mask);
if (unlikely(alloc_mask))
kfree(alloc_mask);
return ret;
@@ -2966,7 +2988,7 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,

int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
{
- return __set_cpus_allowed_ptr(p, new_mask, 0);
+ return __set_cpus_allowed_ptr(p, new_mask, 0, NULL);
}
EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);

@@ -3004,7 +3026,7 @@ static int restrict_cpus_allowed_ptr(struct task_struct *p,
goto err_unlock;
}

- return __set_cpus_allowed_ptr_locked(p, new_mask, 0, rq, &rf);
+ return __set_cpus_allowed_ptr_locked(p, new_mask, 0, rq, &rf, NULL);

err_unlock:
task_rq_unlock(rq, p, &rf);
@@ -3551,7 +3573,7 @@ void sched_set_stop_task(int cpu, struct task_struct *stop)

static inline int __set_cpus_allowed_ptr(struct task_struct *p,
const struct cpumask *new_mask,
- u32 flags)
+ u32 flags, struct cpumask **puser_mask)
{
return set_cpus_allowed_ptr(p, new_mask);
}
@@ -8109,29 +8131,25 @@ __sched_setaffinity(struct task_struct *p, const struct cpumask *mask, bool save
}
cpumask_copy(user_mask, mask);
}
-again:
- retval = __set_cpus_allowed_ptr(p, new_mask, SCA_CHECK);
+
+ retval = __set_cpus_allowed_ptr(p, new_mask, SCA_CHECK,
+ user_mask ? &user_mask : NULL);
if (retval)
goto out_free_new_mask;

- cpuset_cpus_allowed(p, cpus_allowed);
- if (!cpumask_subset(new_mask, cpus_allowed)) {
+ for (;;) {
+ cpuset_cpus_allowed(p, cpus_allowed);
+ if (cpumask_subset(new_mask, cpus_allowed))
+ break;
+
/*
* We must have raced with a concurrent cpuset update.
* Just reset the cpumask to the cpuset's cpus_allowed.
*/
cpumask_copy(new_mask, cpus_allowed);
- goto again;
+ retval = __set_cpus_allowed_ptr(p, new_mask, SCA_CHECK, NULL);
}

- if (save_mask) {
- unsigned long flags;
-
- /* Use pi_lock to synchronize changes to user_cpus_ptr */
- raw_spin_lock_irqsave(&p->pi_lock, flags);
- swap(p->user_cpus_ptr, user_mask);
- raw_spin_unlock_irqrestore(&p->pi_lock, flags);
- }
out_free_new_mask:
kfree(user_mask);
free_cpumask_var(new_mask);
--
2.31.1