Re: current linux-2.6.git: cpusets completely broken

From: Miao Xie
Date: Sat Jul 12 2008 - 06:03:11 EST


on 2008-7-12 11:28 Linus Torvalds wrote:
>
> On Sat, 12 Jul 2008, Vegard Nossum wrote:
>> Can somebody else please test/ack/review it too? This should eventually
>> go into 2.6.26 if it doesn't break anything else.
>
> And Dmitry, _please_ also explain what was going on. Why did things break
> from calling common_cpu_mem_hotplug_unplug() too much? That function is
> called pretty randomly anyway (for just about any random CPU event), so
> why did it fail in some circumstances?
>
> Linus
>

My explanation:
http://lkml.org/lkml/2008/7/7/75
This bug occurred on a kernel compiled with CONFIG_CPUSETS=y.

As Dmitry said in the following mail, modifying try_to_wake_up() is not an ideal
way to fix this bug. Maybe we need to update the sched domains before migrating tasks.
http://lkml.org/lkml/2008/7/7/94

So I have reworked the patch to fix this bug by updating the sched domains when a
cpu is in the CPU_DOWN_PREPARE state.

I think Vegard Nossum's patch is not ideal because it is not necessary to detach
all the sched domains when taking a cpu offline.

Signed-off-by: Miao Xie <miaox@xxxxxxxxxxxxxx>

---
include/linux/sched.h | 1 +
kernel/cpuset.c | 30 +++++++++++++++++++++++++-----
kernel/sched.c | 28 +++++++++++++++++++++++++++-
3 files changed, 53 insertions(+), 6 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index c5d3f84..cf40eae 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -817,6 +817,7 @@ struct sched_domain {
#endif
};

+extern void detach_sched_domains(int cpu);
extern void partition_sched_domains(int ndoms_new, cpumask_t *doms_new,
struct sched_domain_attr *dattr_new);
extern int arch_reinit_sched_domains(void);
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 9fceb97..64fa742 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1910,13 +1910,33 @@ static void common_cpu_mem_hotplug_unplug(void)
*/

static int cpuset_handle_cpuhp(struct notifier_block *unused_nb,
- unsigned long phase, void *unused_cpu)
+ unsigned long phase, void *hcpu)
{
- if (phase == CPU_DYING || phase == CPU_DYING_FROZEN)
- return NOTIFY_DONE;
+ int cpu = (long)hcpu;

- common_cpu_mem_hotplug_unplug();
- return 0;
+ switch (phase) {
+ case CPU_DOWN_PREPARE:
+ case CPU_DOWN_PREPARE_FROZEN:
+ cgroup_lock();
+ get_online_cpus();
+ detach_sched_domains(cpu);
+ put_online_cpus();
+ cgroup_unlock();
+ break;
+
+ case CPU_DOWN_FAILED:
+ case CPU_DOWN_FAILED_FROZEN:
+ case CPU_DEAD:
+ case CPU_DEAD_FROZEN:
+ case CPU_ONLINE:
+ case CPU_ONLINE_FROZEN:
+ common_cpu_mem_hotplug_unplug();
+ break;
+
+ default:
+ return NOTIFY_DONE;
+ }
+ return NOTIFY_OK;
}

#ifdef CONFIG_MEMORY_HOTPLUG
diff --git a/kernel/sched.c b/kernel/sched.c
index 4e2f603..73e0026 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -7315,6 +7315,32 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
sizeof(struct sched_domain_attr));
}

+
+/*
+ * Detach sched domains from a group of cpus which are in the same domain as
+ * the specified cpu. These cpus will now be attached to the NULL domain.
+ *
+ * Call with hotplug lock and cgroup lock held
+ */
+void detach_sched_domains(int cpu)
+{
+ int i;
+
+ unregister_sched_domain_sysctl();
+
+ mutex_lock(&sched_domains_mutex);
+
+ for (i = 0; i < ndoms_cur; i++) {
+ if (cpu_isset(cpu, doms_cur[i])) {
+ detach_destroy_domains(doms_cur + i);
+ cpus_clear(doms_cur[i]);
+ break;
+ }
+ }
+
+ mutex_unlock(&sched_domains_mutex);
+}
+
/*
* Partition sched domains as specified by the 'ndoms_new'
* cpumasks in the array doms_new[] of cpumasks. This compares
@@ -7481,6 +7507,7 @@ int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
static int update_sched_domains(struct notifier_block *nfb,
unsigned long action, void *hcpu)
{
+#ifndef CONFIG_CPUSETS
switch (action) {
case CPU_UP_PREPARE:
case CPU_UP_PREPARE_FROZEN:
@@ -7506,7 +7533,6 @@ static int update_sched_domains(struct notifier_block *nfb,
return NOTIFY_DONE;
}

-#ifndef CONFIG_CPUSETS
/*
* Create default domain partitioning if cpusets are disabled.
* Otherwise we let cpusets rebuild the domains based on the
--
1.5.4.rc3


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/