Re: [BUG] CFS vs cpu hotplug

From: Dmitry Adamushko
Date: Sat Jun 28 2008 - 18:17:28 EST


Hello,


It seems to be related to migrate_dead_tasks().

Firstly, I added traces to see all tasks being migrated with
migrate_live_tasks() and migrate_dead_tasks(). On my setup the problem
(the one with "se == NULL" in the loop of pick_next_task_fair()) pops
up shortly after the traces indicate that some task has been migrated
with migrate_dead_tasks(). Btw, I can reproduce it much faster now
with just a plain cpu down/up loop.

[disclaimer] Well, unless I'm really missing something important at
this late hour [/disclaimer], pick_next_task() is not appropriate for
migrate_dead_tasks() :-)
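To spell out what I mean (from memory, so the snippet below is a
simplified sketch, not the exact code): in the normal path, schedule()
always pairs pick_next_task() with a put_prev_task() for the task that
was running, and CFS relies on that pairing to keep its per-cfs_rq
state consistent:

	/*
	 * Simplified sketch of the pairing in schedule() -- every pick is
	 * preceded by a put for 'prev', so the fair class can undo what
	 * set_next_entity() did (re-insert the entity into the rb-tree,
	 * reset cfs_rq->curr):
	 */
	prev->sched_class->put_prev_task(rq, prev);
	next = pick_next_task(rq, prev);

	/*
	 * migrate_dead_tasks(), in contrast, picks tasks in a loop and
	 * moves them away without ever calling put_prev_task() for them,
	 * so the bookkeeping done by pick_next_task_fair() /
	 * set_next_entity() is never undone and the cfs_rq is left in an
	 * inconsistent state.
	 */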

The following change seems to eliminate the problem on my setup
(although I kept it running only for a few minutes, long enough to see
a few messages indicating that migrate_dead_tasks() does move tasks
and the system is still ok).

[ quick hack ]

@@ -5887,6 +5907,7 @@ static void migrate_dead_tasks(unsigned int dead_cpu)
next = pick_next_task(rq, rq->curr);
if (!next)
break;
+ next->sched_class->put_prev_task(rq, next);
migrate_dead(dead_cpu, next);

}
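If I'm reading sched_fair.c right, the extra put_prev_task() call is
what puts the just-picked entity back into the rb-tree and clears
cfs_rq->curr before the task is migrated off; roughly (my reading of
put_prev_entity(), not verbatim):

	/* Simplified from put_prev_entity() in sched_fair.c (not verbatim): */
	static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
	{
		if (prev->on_rq) {
			update_curr(cfs_rq);		/* account the runtime */
			__enqueue_entity(cfs_rq, prev);	/* put 'current' back into the tree */
		}
		cfs_rq->curr = NULL;
	}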

Just in case, all the changes I used for this test are attached "as is".

P.S. Perhaps I won't be able to verify this carefully until late
tomorrow evening.


--
Best regards,
Dmitry Adamushko
diff --git a/kernel/cpu.c b/kernel/cpu.c
index c77bc3a..db92c01 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -15,6 +15,8 @@
#include <linux/stop_machine.h>
#include <linux/mutex.h>

+extern int sched_check_offline_cpu(int cpu);
+
/* Serializes the updates to cpu_online_map, cpu_present_map */
static DEFINE_MUTEX(cpu_add_remove_lock);

@@ -247,7 +249,7 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)

/* This actually kills the CPU. */
__cpu_die(cpu);
-
+
/* CPU is completely dead: tell everyone. Too late to complain. */
if (raw_notifier_call_chain(&cpu_chain, CPU_DEAD | mod,
hcpu) == NOTIFY_BAD)
@@ -255,6 +257,8 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)

check_for_tasks(cpu);

+ sched_check_offline_cpu(cpu);
+
out_thread:
err = kthread_stop(p);
out_allowed:
@@ -289,6 +293,11 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen)
if (cpu_online(cpu) || !cpu_present(cpu))
return -EINVAL;

+ printk("cpu_up:\n");
+ ret = sched_check_offline_cpu(cpu);
+ if (ret)
+ return -EINVAL;
+
cpu_hotplug_begin();
ret = __raw_notifier_call_chain(&cpu_chain, CPU_UP_PREPARE | mod, hcpu,
-1, &nr_calls);
diff --git a/kernel/sched.c b/kernel/sched.c
index 3aaa5c8..f20fe1c 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4135,6 +4135,22 @@ pick_next_task(struct rq *rq, struct task_struct *prev)
}
}

+int sched_check_offline_cpu(int cpu)
+{
+ struct task_group *tgi;
+ int ret;
+
+ ret = check_cfs_tree(&cpu_rq(cpu)->cfs);
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(tgi, &task_groups, list) {
+ ret += check_cfs_tree(tgi->cfs_rq[cpu]);
+ }
+ rcu_read_unlock();
+
+ return ret;
+}
+
/*
* schedule() is the main scheduler function.
*/
@@ -5712,6 +5728,9 @@ static int __migrate_task_irq(struct task_struct *p, int src_cpu, int dest_cpu)
local_irq_disable();
ret = __migrate_task(p, src_cpu, dest_cpu);
local_irq_enable();
+
+ printk(KERN_ERR "__migrate(%d -- %s) -> cpu (%d) == ret (%d)\n",
+ p->pid, p->comm, dest_cpu, ret);
return ret;
}

@@ -5868,6 +5887,7 @@ static void migrate_dead(unsigned int dead_cpu, struct task_struct *p)
* fine.
*/
spin_unlock_irq(&rq->lock);
+ printk(KERN_ERR "---> migrate_dead(%d -- %s)\n", p->pid, p->comm);
move_task_off_dead_cpu(dead_cpu, p);
spin_lock_irq(&rq->lock);

@@ -5887,6 +5907,7 @@ static void migrate_dead_tasks(unsigned int dead_cpu)
next = pick_next_task(rq, rq->curr);
if (!next)
break;
+ next->sched_class->put_prev_task(rq, next);
migrate_dead(dead_cpu, next);

}
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index be16dfc..6d96890 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1235,10 +1235,13 @@ static struct task_struct *pick_next_task_fair(struct rq *rq)
do {
se = pick_next_entity(cfs_rq);

- if (unlikely(!se))
+ if (unlikely(!se)) {
printk(KERN_ERR "BUG: se == NULL but nr_running (%ld), load (%ld),"
" rq-nr_running (%ld), rq-load (%ld)\n",
cfs_rq->nr_running, cfs_rq->load.weight, rq->nr_running, rq->load.weight);
+// BUG();
+ return NULL;
+ }

cfs_rq = group_cfs_rq(se);
} while (cfs_rq);
@@ -1315,6 +1318,41 @@ static struct task_struct *load_balance_next_fair(void *arg)
return __load_balance_iterator(cfs_rq, cfs_rq->balance_iterator);
}

+static int check_cfs_tree(struct cfs_rq *cfs_rq)
+{
+ struct list_head *next = cfs_rq->tasks.next;
+ struct sched_entity *se;
+ struct task_struct *p;
+ int ret = 0;
+
+ if (next == &cfs_rq->tasks)
+ return ret;
+
+ do {
+ se = list_entry(next, struct sched_entity, group_node);
+
+ if (entity_is_task(se)) {
+ p = task_of(se);
+
+ printk("* ERROR: (task) %d - %s\n", p->pid, p->comm);
+ ret = 1;
+ } else {
+ struct cfs_rq *cfs_rq_child = group_cfs_rq(se);
+
+ if (cfs_rq_child->nr_running || cfs_rq_child->load.weight) {
+ printk("* ERROR: (group) %ld - %ld\n",
+ cfs_rq_child->nr_running, cfs_rq_child->load.weight);
+ check_cfs_tree(cfs_rq_child);
+ ret = 1;
+ }
+ }
+
+ next = next->next;
+ } while (next != &cfs_rq->tasks);
+
+ return ret;
+}
+
#ifdef CONFIG_FAIR_GROUP_SCHED
static int cfs_rq_best_prio(struct cfs_rq *cfs_rq)
{