Re: sched_setscheduler() vs idle_balance() race

From: Kirill Tkhai
Date: Thu May 28 2015 - 13:07:11 EST

Next message: Jaegeuk Kim: "Re: [PATCH 1/3] f2fs crypto: check context consistent for rename2"
Previous message: Jim Davis: "randconfig build error with next-20150528, in drivers/power/twl4030_charger.c"
In reply to: Mike Galbraith: "Re: sched_setscheduler() vs idle_balance() race"
Next in thread: Mike Galbraith: "Re: sched_setscheduler() vs idle_balance() race"
Messages sorted by: [ date ] [ thread ] [ subject ] [ author ]

В Чт, 28/05/2015 в 15:53 +0200, Peter Zijlstra пишет:
On Thu, May 28, 2015 at 09:43:52AM +0200, Mike Galbraith wrote:
> > Hi Peter,
> >
> > I'm not seeing what prevents pull_task() from yanking a task out from
> > under __sched_setscheduler(). A box sprinkling smoldering 3.0 kernel
> > wreckage all over my bugzilla mbox isn't seeing it either ;-)
>
> Say, how easy can that thing be reproduced?
>
> The below is compile tested only, but it might just work if I didn't
> miss anything :-)
>
>
> ---
> kernel/sched/core.c | 137 +++++++++++++++++++++++++++++-----------------------
> 1 file changed, 77 insertions(+), 60 deletions(-)
>
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 4eec60757b16..28f1ddc0bef2 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -1000,22 +1000,6 @@ inline int task_curr(const struct task_struct *p)
> return cpu_curr(task_cpu(p)) == p;
> }
>
> -/*
> - * Can drop rq->lock because from sched_class::switched_from() methods drop it.
> - */
> -static inline void check_class_changed(struct rq *rq, struct task_struct *p,
> - const struct sched_class *prev_class,
> - int oldprio)
> -{
> - if (prev_class != p->sched_class) {
> - if (prev_class->switched_from)
> - prev_class->switched_from(rq, p);
> - /* Possble rq->lock 'hole'. */
> - p->sched_class->switched_to(rq, p);
> - } else if (oldprio != p->prio || dl_task(p))
> - p->sched_class->prio_changed(rq, p, oldprio);
> -}
> -
> void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
> {
> const struct sched_class *class;
> @@ -3075,12 +3059,38 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
>
> p->prio = prio;
>
> + if (prev_class != p->sched_class) {
> + prev_class->switched_from(rq, p);
>

switched_from_XXX() doesn't depend on on_YYY_rq state, so, yeah, this should be safe.

+ /*
> + * switched_from() is allowed to drop @rq->lock; which opens a
> + * race against load-balancing, however since @p is not
> + * currently enqueued it is invisible to the load-balancer.
> + *
> + * double check @p is still where we thought it was.
> + */
> + WARN_ON_ONCE(task_rq(p) != rq);
> + }
> +
> if (running)
> p->sched_class->set_curr_task(rq);
> if (queued)
> enqueue_task(rq, p, enqueue_flag);
>
> - check_class_changed(rq, p, prev_class, oldprio);
> + /*
> + * Both switched_to() and prio_changed() are allowed to drop @rq->lock;
> + * which opens a race against load-balancing, and since @p is now
> + * enqueued it can indeed be subject to this.
> + *
> + * This means that any balancing done by these functions must double
> + * check a task's rq.
> + */
> + if (prev_class != p->sched_class)
> + p->sched_class->switched_to(rq, p);
> + else if (oldprio != p->prio || dl_task(p))
> + p->sched_class->prio_changed(rq, p, oldprio);
> + /*
> + * It further means we should not rely on @p's rq from here on.
> + */
> out_unlock:
> __task_rq_unlock(rq);
> }
> @@ -3420,7 +3430,7 @@ static bool dl_param_changed(struct task_struct *p,
>
> static int __sched_setscheduler(struct task_struct *p,
> const struct sched_attr *attr,
> - bool user)
> + bool user, bool pi)
> {
> int newprio = dl_policy(attr->sched_policy) ? MAX_DL_PRIO - 1 :
> MAX_RT_PRIO - 1 - attr->sched_priority;
> @@ -3606,18 +3616,20 @@ static int __sched_setscheduler(struct task_struct *p,
> p->sched_reset_on_fork = reset_on_fork;
> oldprio = p->prio;
>
> - /*
> - * Take priority boosted tasks into account. If the new
> - * effective priority is unchanged, we just store the new
> - * normal parameters and do not touch the scheduler class and
> - * the runqueue. This will be done when the task deboost
> - * itself.
> - */
> - new_effective_prio = rt_mutex_get_effective_prio(p, newprio);
> - if (new_effective_prio == oldprio) {
> - __setscheduler_params(p, attr);
> - task_rq_unlock(rq, p, &flags);
> - return 0;
> + if (pi) {
> + /*
> + * Take priority boosted tasks into account. If the new
> + * effective priority is unchanged, we just store the new
> + * normal parameters and do not touch the scheduler class and
> + * the runqueue. This will be done when the task deboost
> + * itself.
> + */
> + new_effective_prio = rt_mutex_get_effective_prio(p, newprio);
> + if (new_effective_prio == oldprio) {
> + __setscheduler_params(p, attr);
> + task_rq_unlock(rq, p, &flags);
> + return 0;
> + }
> }
>
> queued = task_on_rq_queued(p);
> @@ -3628,7 +3640,19 @@ static int __sched_setscheduler(struct task_struct *p,
> put_prev_task(rq, p);
>
> prev_class = p->sched_class;
> - __setscheduler(rq, p, attr, true);
> + __setscheduler(rq, p, attr, pi);
> +
> + if (prev_class != p->sched_class) {
> + prev_class->switched_from(rq, p);
> + /*
> + * switched_from() is allowed to drop @rq->lock; which opens a
> + * race against load-balancing, however since @p is not
> + * currently enqueued it is invisible to the load-balancer.
> + *
> + * double check @p is still where we thought it was.
> + */
> + WARN_ON_ONCE(task_rq(p) != rq);
> + }
>
> if (running)
> p->sched_class->set_curr_task(rq);
> @@ -3640,10 +3664,25 @@ static int __sched_setscheduler(struct task_struct *p,
> enqueue_task(rq, p, oldprio <= p->prio ? ENQUEUE_HEAD : 0);
> }
>
> - check_class_changed(rq, p, prev_class, oldprio);
> + /*
> + * Both switched_to() and prio_changed() are allowed to drop @rq->lock;
> + * which opens a race against load-balancing, and since @p is now
> + * enqueued it can indeed be subject to this.
> + *
> + * This means that any balancing done by these functions must double
> + * check a task's rq.
> + */
> + if (prev_class != p->sched_class)
> + p->sched_class->switched_to(rq, p);
> + else if (oldprio != p->prio || dl_task(p))
> + p->sched_class->prio_changed(rq, p, oldprio);
> + /*
> + * It further means we should not rely on @p's rq from here on.
> + */
> task_rq_unlock(rq, p, &flags);
>
> - rt_mutex_adjust_pi(p);
> + if (pi)
> + rt_mutex_adjust_pi(p);
>
> return 0;
> }
> @@ -3664,7 +3703,7 @@ static int _sched_setscheduler(struct task_struct *p, int policy,
> attr.sched_policy = policy;
> }
>
> - return __sched_setscheduler(p, &attr, check);
> + return __sched_setscheduler(p, &attr, check, true);
> }
> /**
> * sched_setscheduler - change the scheduling policy and/or RT priority of a thread.
> @@ -3685,7 +3724,7 @@ EXPORT_SYMBOL_GPL(sched_setscheduler);
>
> int sched_setattr(struct task_struct *p, const struct sched_attr *attr)
> {
> - return __sched_setscheduler(p, attr, true);
> + return __sched_setscheduler(p, attr, true, true);
> }
> EXPORT_SYMBOL_GPL(sched_setattr);
>
> @@ -7346,32 +7385,12 @@ EXPORT_SYMBOL(___might_sleep);
> #endif
>
> #ifdef CONFIG_MAGIC_SYSRQ
> -static void normalize_task(struct rq *rq, struct task_struct *p)
> +void normalize_rt_tasks(void)
> {
> - const struct sched_class *prev_class = p->sched_class;
> + struct task_struct *g, *p;
> struct sched_attr attr = {
> .sched_policy = SCHED_NORMAL,
> };
> - int old_prio = p->prio;
> - int queued;
> -
> - queued = task_on_rq_queued(p);
> - if (queued)
> - dequeue_task(rq, p, 0);
> - __setscheduler(rq, p, &attr, false);
> - if (queued) {
> - enqueue_task(rq, p, 0);
> - resched_curr(rq);
> - }
> -
> - check_class_changed(rq, p, prev_class, old_prio);
> -}
> -
> -void normalize_rt_tasks(void)
> -{
> - struct task_struct *g, *p;
> - unsigned long flags;
> - struct rq *rq;
>
> read_lock(&tasklist_lock);
> for_each_process_thread(g, p) {
> @@ -7398,9 +7417,7 @@ void normalize_rt_tasks(void)
> continue;
> }
>
> - rq = task_rq_lock(p, &flags);
> - normalize_task(rq, p);
> - task_rq_unlock(rq, p, &flags);
> + __sched_setscheduler(p, &attr, false, false);
> }
> read_unlock(&tasklist_lock);
> }
>
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

Next message: Jaegeuk Kim: "Re: [PATCH 1/3] f2fs crypto: check context consistent for rename2"
Previous message: Jim Davis: "randconfig build error with next-20150528, in drivers/power/twl4030_charger.c"
In reply to: Mike Galbraith: "Re: sched_setscheduler() vs idle_balance() race"
Next in thread: Mike Galbraith: "Re: sched_setscheduler() vs idle_balance() race"
Messages sorted by: [ date ] [ thread ] [ subject ] [ author ]