Re: [patch] sched: fix b5d9d734 blunder in task_new_fair()

From: Peter Zijlstra
Date: Fri Nov 27 2009 - 07:21:51 EST


I'm poking at it like so...

the below hasn't seen a compiler yet, and I know there are still a few
races in the ttwu() path.
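
For illustration only (userspace toy code with made-up names, not kernel code):
the vruntime handling in balance_task() below amounts to sampling the old
cfs_rq's min_vruntime before dropping its lock and re-normalizing against the
new cfs_rq's min_vruntime once the new lock is held, so the task keeps the same
lag across the move. Roughly:

#include <stdint.h>
#include <stdio.h>

/* toy stand-ins for cfs_rq and sched_entity; only the fields we need */
struct toy_cfs_rq { uint64_t min_vruntime; };
struct toy_se { uint64_t vruntime; };

/*
 * Keep the task's lag against min_vruntime constant across a move,
 * in the spirit of set_task_cpu()/balance_task() below; u64
 * wrap-around cancels out in the subtraction just like in the
 * kernel's arithmetic.
 */
static void renormalize(struct toy_se *se, uint64_t old_min, uint64_t new_min)
{
        se->vruntime -= old_min - new_min;
}

int main(void)
{
        struct toy_cfs_rq old_rq = { .min_vruntime = 1000 };
        struct toy_cfs_rq new_rq = { .min_vruntime = 400 };
        struct toy_se se = { .vruntime = 1150 };        /* 150 ahead of old min */
        uint64_t old_min = old_rq.min_vruntime;         /* sampled before "unlock" */

        renormalize(&se, old_min, new_rq.min_vruntime);
        printf("vruntime %llu, lag %llu\n",
               (unsigned long long)se.vruntime,
               (unsigned long long)(se.vruntime - new_rq.min_vruntime));
        return 0;
}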


---
include/linux/sched.h | 6 +-
kernel/fork.c | 2 +-
kernel/sched.c | 170 +++++++++++++++++++++++++++++++------------------
kernel/sched_fair.c | 26 +++----
4 files changed, 124 insertions(+), 80 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 1c46023..ff5ec5a 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1099,7 +1099,7 @@ struct sched_class {

void (*set_curr_task) (struct rq *rq);
void (*task_tick) (struct rq *rq, struct task_struct *p, int queued);
- void (*task_new) (struct rq *rq, struct task_struct *p);
+ void (*fork) (struct task_struct *p);

void (*switched_from) (struct rq *this_rq, struct task_struct *task,
int running);
@@ -2456,7 +2456,7 @@ static inline unsigned int task_cpu(const struct task_struct *p)
return task_thread_info(p)->cpu;
}

-extern void set_task_cpu(struct task_struct *p, unsigned int cpu);
+extern void set_task_cpu_locked(struct task_struct *p, int cpu);

#else

@@ -2465,7 +2465,7 @@ static inline unsigned int task_cpu(const struct task_struct *p)
return 0;
}

-static inline void set_task_cpu(struct task_struct *p, unsigned int cpu)
+static inline void set_task_cpu_locked(struct task_struct *p, int cpu)
{
}

diff --git a/kernel/fork.c b/kernel/fork.c
index 166b8c4..29c4aec 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1242,7 +1242,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
p->rt.nr_cpus_allowed = current->rt.nr_cpus_allowed;
if (unlikely(!cpu_isset(task_cpu(p), p->cpus_allowed) ||
!cpu_online(task_cpu(p))))
- set_task_cpu(p, smp_processor_id());
+ set_task_cpu_locked(p, smp_processor_id());

/* CLONE_PARENT re-uses the old parent */
if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) {
diff --git a/kernel/sched.c b/kernel/sched.c
index 0cbf2ef..dced5af 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1967,7 +1967,7 @@ inline int task_curr(const struct task_struct *p)
return cpu_curr(task_cpu(p)) == p;
}

-static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
+static inline void raw_set_task_cpu(struct task_struct *p, unsigned int cpu)
{
set_task_rq(p, cpu);
#ifdef CONFIG_SMP
@@ -2008,6 +2008,7 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p,
void kthread_bind(struct task_struct *p, unsigned int cpu)
{
struct rq *rq = cpu_rq(cpu);
+ struct rq *old_rq = task_rq(p);
unsigned long flags;

/* Must have done schedule() in kthread() before we set_task_cpu */
@@ -2016,13 +2017,14 @@ void kthread_bind(struct task_struct *p, unsigned int cpu)
return;
}

- spin_lock_irqsave(&rq->lock, flags);
- update_rq_clock(rq);
+ local_irq_save(flags);
+ double_rq_lock(old_rq, rq);
set_task_cpu(p, cpu);
p->cpus_allowed = cpumask_of_cpu(cpu);
p->rt.nr_cpus_allowed = 1;
p->flags |= PF_THREAD_BOUND;
- spin_unlock_irqrestore(&rq->lock, flags);
+ double_rq_unlock(old_rq, rq);
+ local_irq_restore(flags);
}
EXPORT_SYMBOL(kthread_bind);

@@ -2056,40 +2058,102 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
return delta < (s64)sysctl_sched_migration_cost;
}

+static void set_task_cpu_all(struct task_struct *p, int old_cpu, int new_cpu)
+{
+ trace_sched_migrate_task(p, new_cpu);
+
+ p->se.nr_migrations++;
+#ifdef CONFIG_SCHEDSTATS
+ if (task_hot(p, cpu_rq(old_cpu)->clock, NULL))
+ schedstat_inc(p, se.nr_forced2_migrations);
+#endif
+ perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 1, NULL, 0);
+
+ raw_set_task_cpu(p, new_cpu);
+}

-void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
+static void set_task_cpu(struct task_struct *p, int new_cpu)
{
int old_cpu = task_cpu(p);
- struct rq *old_rq = cpu_rq(old_cpu), *new_rq = cpu_rq(new_cpu);
- struct cfs_rq *old_cfsrq = task_cfs_rq(p),
- *new_cfsrq = cpu_cfs_rq(old_cfsrq, new_cpu);
- u64 clock_offset;

- clock_offset = old_rq->clock - new_rq->clock;
+ if (old_cpu == new_cpu)
+ return;

- trace_sched_migrate_task(p, new_cpu);
+ if (p->sched_class == &fair_sched_class) {
+ struct cfs_rq *old_cfsrq = task_cfs_rq(p),
+ *new_cfsrq = cpu_cfs_rq(old_cfsrq, new_cpu);

-#ifdef CONFIG_SCHEDSTATS
- if (p->se.wait_start)
- p->se.wait_start -= clock_offset;
- if (p->se.sleep_start)
- p->se.sleep_start -= clock_offset;
- if (p->se.block_start)
- p->se.block_start -= clock_offset;
-#endif
- if (old_cpu != new_cpu) {
- p->se.nr_migrations++;
-#ifdef CONFIG_SCHEDSTATS
- if (task_hot(p, old_rq->clock, NULL))
- schedstat_inc(p, se.nr_forced2_migrations);
-#endif
- perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS,
- 1, 1, NULL, 0);
+ p->se.vruntime -= old_cfsrq->min_vruntime -
+ new_cfsrq->min_vruntime;
+ }
+
+ set_task_cpu_all(p, old_cpu, new_cpu);
+}
+
+static inline
+int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags)
+{
+ return p->sched_class->select_task_rq(p, sd_flags, wake_flags);
+}
+
+/*
+ * self-balance a task without holding both rq locks.
+ *
+ * used by fork and wakeup; since in neither case is the task present
+ * on a rq, the load-balancer won't encounter it and move it away
+ * underneath us.
+ *
+ * and since we have IRQs disabled we keep hot-unplug at bay.
+ */
+static struct rq *
+balance_task(struct task_struct *p, int sd_flags, int wake_flags)
+{
+ struct rq *rq, *old_rq;
+ u64 vdelta;
+ int cpu;
+
+ rq = old_rq = task_rq(p);
+
+ if (p->sched_class == &fair_sched_class)
+ vdelta = task_cfs_rq(p)->min_vruntime;
+
+ __task_rq_unlock(old_rq);
+
+ cpu = select_task_rq(p, sd_flags, wake_flags);
+
+ rq = cpu_rq(cpu);
+ spin_lock(&rq->lock);
+ if (rq == old_rq)
+ return rq;
+
+ update_rq_clock(rq);
+
+ set_task_cpu_all(p, task_cpu(p), cpu);
+
+ if (p->sched_class == &fair_sched_class) {
+ vdelta -= task_cfs_rq(p)->min_vruntime;
+ p->se.vruntime -= vdelta;
}
- p->se.vruntime -= old_cfsrq->min_vruntime -
- new_cfsrq->min_vruntime;

- __set_task_cpu(p, new_cpu);
+ return rq;
+}
+
+void set_task_cpu_locked(struct task_struct *p, int new_cpu)
+{
+ struct rq *old_rq = task_rq(p);
+ struct rq *new_rq = cpu_rq(new_cpu);
+ unsigned long flags;
+
+ local_irq_save(flags);
+ double_rq_lock(old_rq, new_rq);
+ /*
+ * called from fork() on the child task before it is put on the
+ * tasklist, so there is no way it could have been migrated yet
+ */
+ WARN_ON_ONCE(old_rq != task_rq(p));
+ set_task_cpu(p, new_cpu);
+ double_rq_unlock(old_rq, new_rq);
+ local_irq_restore(flags);
}

struct migration_req {
@@ -2373,19 +2436,10 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state,
*/
if (task_contributes_to_load(p))
rq->nr_uninterruptible--;
+
p->state = TASK_WAKING;
+ rq = balance_task(p, SD_BALANCE_WAKE, wake_flags);
- task_rq_unlock(rq, &flags);

- cpu = p->sched_class->select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
- if (cpu != orig_cpu) {
- local_irq_save(flags);
- rq = cpu_rq(cpu);
- update_rq_clock(rq);
- set_task_cpu(p, cpu);
- local_irq_restore(flags);
- }
- rq = task_rq_lock(p, &flags);
-
WARN_ON(p->state != TASK_WAKING);
cpu = task_cpu(p);

@@ -2557,8 +2612,9 @@ static void __sched_fork(struct task_struct *p)
*/
void sched_fork(struct task_struct *p, int clone_flags)
{
- int cpu = get_cpu();
unsigned long flags;
+ struct rq *rq;
+ int cpu;

__sched_fork(p);

@@ -2592,13 +2648,15 @@ void sched_fork(struct task_struct *p, int clone_flags)
if (!rt_prio(p->prio))
p->sched_class = &fair_sched_class;

-#ifdef CONFIG_SMP
- cpu = p->sched_class->select_task_rq(p, SD_BALANCE_FORK, 0);
-#endif
- local_irq_save(flags);
- update_rq_clock(cpu_rq(cpu));
- set_task_cpu(p, cpu);
- local_irq_restore(flags);
+ rq = task_rq_lock(current, &flags);
+ update_rq_clock(rq);
+
+ if (p->sched_class->fork)
+ p->sched_class->fork(p);
+
+ rq = balance_task(p, SD_BALANCE_FORK, 0);
+ task_rq_unlock(rq, &flags);
+

#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
if (likely(sched_info_on()))
@@ -2631,17 +2689,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
rq = task_rq_lock(p, &flags);
BUG_ON(p->state != TASK_RUNNING);
update_rq_clock(rq);
-
- if (!p->sched_class->task_new || !current->se.on_rq) {
- activate_task(rq, p, 0);
- } else {
- /*
- * Let the scheduling class do new task startup
- * management (if any):
- */
- p->sched_class->task_new(rq, p);
- inc_nr_running(rq);
- }
+ activate_task(rq, p, 0);
trace_sched_wakeup_new(rq, p, 1);
check_preempt_curr(rq, p, WF_FORK);
#ifdef CONFIG_SMP
@@ -3156,7 +3204,7 @@ out:
void sched_exec(void)
{
int new_cpu, this_cpu = get_cpu();
- new_cpu = current->sched_class->select_task_rq(current, SD_BALANCE_EXEC, 0);
+ new_cpu = select_task_rq(current, SD_BALANCE_EXEC, 0);
put_cpu();
if (new_cpu != this_cpu)
sched_migrate_task(current, new_cpu);
@@ -6980,7 +7028,7 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)

idle->prio = idle->normal_prio = MAX_PRIO;
cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu));
- __set_task_cpu(idle, cpu);
+ raw_set_task_cpu(idle, cpu);

rq->curr = rq->idle = idle;
#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 0ff21af..7c27d2f 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1922,28 +1922,26 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
}

/*
- * Share the fairness runtime between parent and child, thus the
- * total amount of pressure for CPU stays equal - new tasks
- * get a chance to run but frequent forkers are not allowed to
- * monopolize the CPU. Note: the parent runqueue is locked,
- * the child is not running yet.
+ * Called at fork, before the child is on the tasklist and before
+ * self-balancing; the current (parent) task's rq is locked.
*/
-static void task_new_fair(struct rq *rq, struct task_struct *p)
+static void fork_fair(struct task_struct *p)
{
- struct cfs_rq *cfs_rq = task_cfs_rq(p);
- struct sched_entity *se = &p->se, *curr = cfs_rq->curr;
+ struct rq *rq = this_rq();
+ struct cfs_rq *cfs_rq = task_cfs_rq(current);
+ struct sched_entity *se = &p->se, *curr = cfs_rq->curr;
int this_cpu = smp_processor_id();

+ if (unlikely(task_cpu(p) != this_cpu))
+ raw_set_task_cpu(p, this_cpu);
+
sched_info_queued(p);
-
update_curr(cfs_rq);
if (curr)
se->vruntime = curr->vruntime;
place_entity(cfs_rq, se, 1);

- /* 'curr' will be NULL if the child belongs to a different group */
- if (sysctl_sched_child_runs_first && this_cpu == task_cpu(p) &&
- curr && entity_before(curr, se)) {
+ if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
/*
* Upon rescheduling, sched_class::put_prev_task() will place
* 'current' within the tree based on its new key value.
@@ -1951,8 +1949,6 @@ static void task_new_fair(struct rq *rq, struct task_struct *p)
swap(curr->vruntime, se->vruntime);
resched_task(rq->curr);
}
-
- enqueue_task_fair(rq, p, 0);
}

/*
@@ -2056,7 +2052,7 @@ static const struct sched_class fair_sched_class = {

.set_curr_task = set_curr_task_fair,
.task_tick = task_tick_fair,
- .task_new = task_new_fair,
+ .fork = fork_fair,

.prio_changed = prio_changed_fair,
.switched_to = switched_to_fair,
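
Aside, for anyone reading along: kthread_bind() and set_task_cpu_locked() above
rely on double_rq_lock() taking the two rq locks in a fixed address order,
collapsing to a single lock when both rqs are the same. A throwaway userspace
sketch of that ordering rule, with pthread mutexes standing in for rq->lock and
made-up names (this is not the kernel helper itself):

#include <pthread.h>
#include <stdint.h>

struct toy_rq { pthread_mutex_t lock; };

/* take both "rq" locks in address order; the same-rq case takes one lock */
void toy_double_rq_lock(struct toy_rq *rq1, struct toy_rq *rq2)
{
        if (rq1 == rq2) {
                pthread_mutex_lock(&rq1->lock);
        } else if ((uintptr_t)rq1 < (uintptr_t)rq2) {
                pthread_mutex_lock(&rq1->lock);
                pthread_mutex_lock(&rq2->lock);
        } else {
                pthread_mutex_lock(&rq2->lock);
                pthread_mutex_lock(&rq1->lock);
        }
}

void toy_double_rq_unlock(struct toy_rq *rq1, struct toy_rq *rq2)
{
        pthread_mutex_unlock(&rq1->lock);
        if (rq1 != rq2)
                pthread_mutex_unlock(&rq2->lock);
}

Whichever CPU gets there first, both locks are always taken lowest-address
first, so two concurrent cross-rq operations can't deadlock against each other.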

