Re: sched_setscheduler() vs idle_balance() race

From: Mike Galbraith
Date: Sun May 31 2015 - 02:39:18 EST


On Sat, 2015-05-30 at 15:08 +0200, Mike Galbraith wrote:
> On Thu, 2015-05-28 at 15:53 +0200, Peter Zijlstra wrote:
> > On Thu, May 28, 2015 at 09:43:52AM +0200, Mike Galbraith wrote:
> > > Hi Peter,
> > >
> > > I'm not seeing what prevents pull_task() from yanking a task out from
> > > under __sched_setscheduler(). A box sprinkling smoldering 3.0 kernel
> > > wreckage all over my bugzilla mbox isn't seeing it either ;-)
> >
> > Say, how easy can that thing be reproduced?
> >
> > The below is compile tested only, but it might just work if I didn't
> > miss anything :-)
>
> Seems trying to make the target invisible to balancing created a new
> race: dequeue target, do stuff that may drop rq->lock while it's
> dequeued, target sneaks into schedule(), dequeues itself (#2), boom.

Well, the below (ick) plugged it up, but...

I don't see why we can't just say no in can_migrate_task() if ->pi_lock
is held. That plugged the original hole in far fewer lines.

Hohum, time to go pretend I have something better to do on a sunny
Sunday morning ;-)

massive_intr_x-6213 [007] d... 170.579339: pull_rt_task: yup, pulled
massive_intr_x-6213 [002] d... 170.580114: pull_rt_task: yup, pulled
massive_intr_x-6213 [006] d... 170.586083: pull_rt_task: yup, pulled
<...>-6237 [006] d... 170.593878: __schedule: saving the day

---
kernel/sched/core.c | 43 +++++++++++++++++++++++++++++++++++--------
kernel/sched/deadline.c | 6 +++---
kernel/sched/rt.c | 11 +++++++++--
kernel/sched/sched.h | 10 +++++++++-
4 files changed, 56 insertions(+), 14 deletions(-)

--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2745,9 +2745,18 @@ static void __sched __schedule(void)
* can't be reordered with __set_current_state(TASK_INTERRUPTIBLE)
* done by the caller to avoid the race with signal_wake_up().
*/
+dequeued:
smp_mb__before_spinlock();
raw_spin_lock_irq(&rq->lock);

+ if (unlikely(!task_on_rq_queued(prev))) {
+ trace_printk("saving the day\n");
+ tracing_off();
+ raw_spin_unlock_irq(&rq->lock);
+ cpu_relax();
+ goto dequeued;
+ }
+
rq->clock_skip_update <<= 1; /* promote REQ to ACT */

switch_count = &prev->nivcsw;
@@ -3013,8 +3022,10 @@ void rt_mutex_setprio(struct task_struct
prev_class = p->sched_class;
queued = task_on_rq_queued(p);
running = task_current(rq, p);
- if (queued)
+ if (queued) {
dequeue_task(rq, p, 0);
+ p->on_rq = TASK_ON_RQ_DEQUEUED;
+ }
if (running)
put_prev_task(rq, p);

@@ -3067,8 +3078,10 @@ void rt_mutex_setprio(struct task_struct

if (running)
p->sched_class->set_curr_task(rq);
- if (queued)
+ if (queued) {
enqueue_task(rq, p, enqueue_flag);
+ p->on_rq = TASK_ON_RQ_QUEUED;
+ }

/*
* Both switched_to() and prio_changed() are allowed to drop @rq->lock;
@@ -3114,8 +3127,10 @@ void set_user_nice(struct task_struct *p
goto out_unlock;
}
queued = task_on_rq_queued(p);
- if (queued)
+ if (queued) {
dequeue_task(rq, p, 0);
+ p->on_rq = TASK_ON_RQ_DEQUEUED;
+ }

p->static_prio = NICE_TO_PRIO(nice);
set_load_weight(p);
@@ -3125,6 +3140,7 @@ void set_user_nice(struct task_struct *p

if (queued) {
enqueue_task(rq, p, 0);
+ p->on_rq = TASK_ON_RQ_QUEUED;
/*
* If the task increased its priority or is running and
* lowered its priority, then reschedule its CPU:
@@ -3628,8 +3644,10 @@ static int __sched_setscheduler(struct t

queued = task_on_rq_queued(p);
running = task_current(rq, p);
- if (queued)
+ if (queued) {
dequeue_task(rq, p, 0);
+ p->on_rq = TASK_ON_RQ_DEQUEUED;
+ }
if (running)
put_prev_task(rq, p);

@@ -3656,6 +3674,7 @@ static int __sched_setscheduler(struct t
* increased (user space view).
*/
enqueue_task(rq, p, oldprio <= p->prio ? ENQUEUE_HEAD : 0);
+ p->on_rq = TASK_ON_RQ_QUEUED;
}

/*
@@ -4943,8 +4962,10 @@ void sched_setnuma(struct task_struct *p
queued = task_on_rq_queued(p);
running = task_current(rq, p);

- if (queued)
+ if (queued) {
dequeue_task(rq, p, 0);
+ p->on_rq = TASK_ON_RQ_DEQUEUED;
+ }
if (running)
put_prev_task(rq, p);

@@ -4952,8 +4973,10 @@ void sched_setnuma(struct task_struct *p

if (running)
p->sched_class->set_curr_task(rq);
- if (queued)
+ if (queued) {
enqueue_task(rq, p, 0);
+ p->on_rq = TASK_ON_RQ_QUEUED;
+ }
task_rq_unlock(rq, p, &flags);
}
#endif
@@ -7587,8 +7610,10 @@ void sched_move_task(struct task_struct
running = task_current(rq, tsk);
queued = task_on_rq_queued(tsk);

- if (queued)
+ if (queued) {
dequeue_task(rq, tsk, 0);
+ tsk->on_rq = TASK_ON_RQ_DEQUEUED;
+ }
if (unlikely(running))
put_prev_task(rq, tsk);

@@ -7611,8 +7636,10 @@ void sched_move_task(struct task_struct

if (unlikely(running))
tsk->sched_class->set_curr_task(rq);
- if (queued)
+ if (queued) {
enqueue_task(rq, tsk, 0);
+ tsk->on_rq = TASK_ON_RQ_QUEUED;
+ }

task_rq_unlock(rq, tsk, &flags);
}
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -607,7 +607,7 @@ static enum hrtimer_restart dl_task_time
* We can be both throttled and !queued. Replenish the counter
* but do not enqueue -- wait for our wakeup to do that.
*/
- if (!task_on_rq_queued(p)) {
+ if (!task_on_rq_queued_or_dequeued(p)) {
replenish_dl_entity(dl_se, dl_se);
goto unlock;
}
@@ -1526,7 +1526,7 @@ static int pull_dl_task(struct rq *this_
dl_time_before(p->dl.deadline,
this_rq->dl.earliest_dl.curr))) {
WARN_ON(p == src_rq->curr);
- WARN_ON(!task_on_rq_queued(p));
+ WARN_ON(!task_on_rq_queued_or_dequeued(p));

/*
* Then we pull iff p has actually an earlier
@@ -1707,7 +1707,7 @@ static void switched_from_dl(struct rq *
* this is the right place to try to pull some other one
* from an overloaded cpu, if any.
*/
- if (!task_on_rq_queued(p) || rq->dl.dl_nr_running)
+ if (!task_on_rq_queued_or_dequeued(p) || rq->dl.dl_nr_running)
return;

if (pull_dl_task(rq))
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1656,7 +1656,7 @@ static struct rq *find_lock_lowest_rq(st
!cpumask_test_cpu(lowest_rq->cpu,
tsk_cpus_allowed(task)) ||
task_running(rq, task) ||
- !task_on_rq_queued(task))) {
+ !task_on_rq_queued_or_dequeued(task))) {

double_unlock_balance(rq, lowest_rq);
lowest_rq = NULL;
@@ -1953,10 +1953,14 @@ static int pull_rt_task(struct rq *this_
int this_cpu = this_rq->cpu, ret = 0, cpu;
struct task_struct *p;
struct rq *src_rq;
+ int task_dequeued = 0;

if (likely(!rt_overloaded(this_rq)))
return 0;

+ if (this_rq->curr->on_rq == TASK_ON_RQ_DEQUEUED)
+ task_dequeued = 1;
+
/*
* Match the barrier from rt_set_overloaded; this guarantees that if we
* see overloaded we must also see the rto_mask bit.
@@ -2035,6 +2039,9 @@ static int pull_rt_task(struct rq *this_
double_unlock_balance(this_rq, src_rq);
}

+ if (ret && task_dequeued)
+ trace_printk("yup, pulled\n");
+
return ret;
}

@@ -2133,7 +2140,7 @@ static void switched_from_rt(struct rq *
* we may need to handle the pulling of RT tasks
* now.
*/
- if (!task_on_rq_queued(p) || rq->rt.rt_nr_running)
+ if (!task_on_rq_queued_or_dequeued(p) || rq->rt.rt_nr_running)
return;

if (pull_rt_task(rq))
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -19,7 +19,8 @@ struct cpuidle_state;

/* task_struct::on_rq states: */
#define TASK_ON_RQ_QUEUED 1
-#define TASK_ON_RQ_MIGRATING 2
+#define TASK_ON_RQ_DEQUEUED 2
+#define TASK_ON_RQ_MIGRATING 3

extern __read_mostly int scheduler_running;

@@ -1034,6 +1035,13 @@ static inline int task_on_rq_queued(stru
return p->on_rq == TASK_ON_RQ_QUEUED;
}

+static inline int task_on_rq_queued_or_dequeued(struct task_struct *p)
+{
+ if (p->on_rq == TASK_ON_RQ_QUEUED)
+ return 1;
+ return p->on_rq == TASK_ON_RQ_DEQUEUED;
+}
+
static inline int task_on_rq_migrating(struct task_struct *p)
{
return p->on_rq == TASK_ON_RQ_MIGRATING;


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/