[PATCH 32/35] clockevents: Fix cpu down race for hrtimer based broadcasting

From: Peter Zijlstra
Date: Mon Feb 16 2015 - 08:12:59 EST


From: Thomas Gleixner <tglx@xxxxxxxxxxxxx>

Preeti reported a cpu down race with hrtimer based broadcasting:

Assume CPU1 is the CPU which holds the hrtimer broadcasting duty
before it is taken down.

CPU0                                    CPU1
cpu_down()
                                        takedown_cpu()
                                        disable_interrupts()
cpu_die()
  while (CPU1 != DEAD) {
    msleep(100);
      switch_to_idle()
        stop_cpu_timer()
          schedule_broadcast()
  }

tick_cleanup_dead_cpu()
  take_over_broadcast()

So after CPU1 disabled interrupts it cannot handle the broadcast
hrtimer anymore, so CPU0 will be stuck forever.

Doing a "while (CPU1 != DEAD) msleep(100);" periodic poll is silly at
best, but we need to fix that nevertheless.

Split the tick cleanup into two pieces:

1) Shutdown and remove all per cpu clockevent devices from
takedown_cpu()

This is done carefully with respect to existing arch code which
works around the shortcoming of the clockevents core code in
interesting ways. We really want a separate callback for this to
clean up the workarounds, but that's not in the scope of this patch.

2) Take over the broadcast duty explicitly before calling cpu_die()

This is a temporary workaround as well. What we really want is a
callback in the clockevent device which allows us to do that from
the dying CPU by pushing the hrtimer onto a different cpu. That
might involve an IPI and is definitely more complex than this
immediate fix.

Reported-by: Preeti U Murthy <preeti@xxxxxxxxxxxxxxxxxx>
Signed-off-by: Thomas Gleixner <tglx@xxxxxxxxxxxxx>
---
include/linux/tick.h | 9 +++++----
kernel/cpu.c | 6 +++---
kernel/time/clockevents.c | 30 ++++++++++++++++++------------
kernel/time/tick-broadcast.c | 32 ++++++++++++++++++++++----------
kernel/time/tick-common.c | 34 ++++++++++++----------------------
kernel/time/tick-internal.h | 6 +++---
6 files changed, 63 insertions(+), 54 deletions(-)

Index: linux/include/linux/tick.h
===================================================================
--- linux.orig/include/linux/tick.h
+++ linux/include/linux/tick.h
@@ -29,13 +29,12 @@ extern struct tick_device *tick_get_devi
extern void __init tick_init(void);
/* Should be core only, but XEN resume magic requires this */
extern void tick_resume_local(void);
-extern void tick_handover_do_timer(void);
-extern void tick_cleanup_dead_cpu(int cpu);
+/* CPU hotplug */
+extern void tick_shutdown_local(void);
#else /* CONFIG_GENERIC_CLOCKEVENTS */
static inline void tick_init(void) { }
static inline void tick_resume_local(void) { }
-static inline void tick_handover_do_timer(void) { }
-static inline void tick_cleanup_dead_cpu(int cpu) { }
+static inline void tick_shutdown_local(void) { }
#endif /* !CONFIG_GENERIC_CLOCKEVENTS */

#ifdef CONFIG_TICK_ONESHOT
@@ -66,8 +65,10 @@ static inline void tick_broadcast_contro

#if defined(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST) && defined(CONFIG_TICK_ONESHOT)
extern int tick_broadcast_oneshot_control(enum tick_broadcast_state state);
+extern void tick_takeover(int deadcpu);
#else
static inline int tick_broadcast_oneshot_control(enum tick_broadcast_state state) { return 0; }
+static inline void tick_takeover(int deadcpu) { }
#endif

static inline void tick_broadcast_enable(void)
Index: linux/kernel/cpu.c
===================================================================
--- linux.orig/kernel/cpu.c
+++ linux/kernel/cpu.c
@@ -349,8 +349,8 @@ static int __ref take_cpu_down(void *_pa
return err;

cpu_notify(CPU_DYING | param->mod, param->hcpu);
- /* Give up timekeeping duties */
- tick_handover_do_timer();
+ /* Shutdown the per cpu tick */
+ tick_shutdown_local();
/* Park the stopper thread */
kthread_park(current);
return 0;
@@ -428,7 +428,7 @@ static int __ref _cpu_down(unsigned int
__cpu_die(cpu);

/* CPU is completely dead: tell everyone. Too late to complain. */
- tick_cleanup_dead_cpu(cpu);
+ tick_takeover(cpu);
cpu_notify_nofail(CPU_DEAD | mod, hcpu);

check_for_tasks(cpu);
Index: linux/kernel/time/clockevents.c
===================================================================
--- linux.orig/kernel/time/clockevents.c
+++ linux/kernel/time/clockevents.c
@@ -541,26 +541,32 @@ void clockevents_resume(void)
#endif

#ifdef CONFIG_HOTPLUG_CPU
-/**
- * tick_cleanup_dead_cpu - Cleanup the tick and clockevents of a dead cpu
+/*
+ * Cleanup the clock events devices on the dying cpu. curdev is the
+ * current installed tick device on that cpu
*/
-void tick_cleanup_dead_cpu(int cpu)
+void clockevents_cleanup_dying_cpu(struct clock_event_device *curdev)
{
struct clock_event_device *dev, *tmp;
unsigned long flags;
+ int cpu;

raw_spin_lock_irqsave(&clockevents_lock, flags);
-
- tick_shutdown(cpu);
- /*
- * Unregister the clock event devices which were
- * released from the users in the notify chain.
- */
- list_for_each_entry_safe(dev, tmp, &clockevents_released, list)
- list_del(&dev->list);
+ if (!curdev)
+ goto cleanup;
/*
- * Now check whether the CPU has left unused per cpu devices
+ * We cannot call the set mode function here at the moment
+ * because existing architecture cpu down code shuts down
+ * stuff already and we cannot interfere with that. So we just
+ * set the mode to unused for now.
*/
+ curdev->mode = CLOCK_EVT_MODE_UNUSED;
+ list_del(&curdev->list);
+ module_put(curdev->owner);
+
+cleanup:
+ /* Remove the unused percpu devices from the list */
+ cpu = smp_processor_id();
list_for_each_entry_safe(dev, tmp, &clockevent_devices, list) {
if (cpumask_test_cpu(cpu, dev->cpumask) &&
cpumask_weight(dev->cpumask) == 1 &&
Index: linux/kernel/time/tick-broadcast.c
===================================================================
--- linux.orig/kernel/time/tick-broadcast.c
+++ linux/kernel/time/tick-broadcast.c
@@ -421,15 +421,17 @@ void tick_set_periodic_handler(struct cl

#ifdef CONFIG_HOTPLUG_CPU
/*
- * Remove a CPU from broadcasting
+ * Remove a CPU from broadcasting. Called from the dying cpu.
*/
-void tick_shutdown_broadcast(unsigned int cpu)
+void tick_shutdown_broadcast(void)
{
struct clock_event_device *bc;
unsigned long flags;
+ int cpu;

raw_spin_lock_irqsave(&tick_broadcast_lock, flags);

+ cpu = smp_processor_id();
cpumask_clear_cpu(cpu, tick_broadcast_mask);
cpumask_clear_cpu(cpu, tick_broadcast_on);

@@ -906,14 +908,26 @@ void tick_broadcast_switch_to_oneshot(vo
}

#ifdef CONFIG_HOTPLUG_CPU
-static void broadcast_move_bc(int deadcpu)
+/*
+ * Called from the cpu hotplug code after a cpu is dead. This ensures
+ * that a hrtimer based broadcast device is taken over.
+ *
+ * FIXME: This should go away. We should replace this by a mechanism
+ * which pushes the hrtimer over to a different cpu from
+ * tick_shutdown_broadcast_oneshot()
+ */
+void tick_broadcast_takeover_bc(int deadcpu)
{
- struct clock_event_device *bc = tick_broadcast_device.evtdev;
+ struct clock_event_device *bc;
+ unsigned long flags;

- if (!bc || !broadcast_needs_cpu(bc, deadcpu))
- return;
- /* This moves the broadcast assignment to this cpu */
- clockevents_program_event(bc, bc->next_event, 1);
+ raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
+ bc = tick_broadcast_device.evtdev;
+ if (bc && broadcast_needs_cpu(bc, deadcpu)) {
+ /* This moves the broadcast assignment to this cpu */
+ clockevents_program_event(bc, bc->next_event, 1);
+ }
+ raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
}

/*
@@ -929,8 +943,6 @@ static void tick_shutdown_broadcast_ones
cpumask_clear_cpu(cpu, tick_broadcast_oneshot_mask);
cpumask_clear_cpu(cpu, tick_broadcast_pending_mask);
cpumask_clear_cpu(cpu, tick_broadcast_force_mask);
-
- broadcast_move_bc(cpu);
}
#endif

Index: linux/kernel/time/tick-common.c
===================================================================
--- linux.orig/kernel/time/tick-common.c
+++ linux/kernel/time/tick-common.c
@@ -336,10 +336,10 @@ out_bc:
/*
* Transfer the do_timer job away from a dying cpu.
*
- * Called with interrupts disabled. Not locking required. If
- * tick_do_timer_cpu is owned by this cpu, nothing can change it.
+ * No locking required. If tick_do_timer_cpu is owned by this cpu,
+ * nothing can change it.
*/
-void tick_handover_do_timer(void)
+static void tick_handover_do_timer(void)
{
if (tick_do_timer_cpu == smp_processor_id()) {
int cpu = cpumask_first(cpu_online_mask);
@@ -349,32 +349,22 @@ void tick_handover_do_timer(void)
}
}

-/*
- * Shutdown an event device on a given cpu:
+/**
+ * tick_shutdown_local - Shutdown the tick related functions on a cpu
*
- * This is called on a life CPU, when a CPU is dead. So we cannot
- * access the hardware device itself.
- * We just set the mode and remove it from the lists.
+ * This is called from the dying cpu.
*/
-void tick_shutdown(unsigned int cpu)
+void tick_shutdown_local(void)
{
- struct tick_device *td = &per_cpu(tick_cpu_device, cpu);
- struct clock_event_device *dev = td->evtdev;
+ struct tick_device *td = this_cpu_ptr(&tick_cpu_device);

/* Remove the CPU from the broadcast machinery */
- tick_shutdown_broadcast(cpu);
+ tick_shutdown_broadcast();

+ clockevents_cleanup_dying_cpu(td->evtdev);
td->mode = TICKDEV_MODE_PERIODIC;
- if (dev) {
- /*
- * Prevent that the clock events layer tries to call
- * the set mode function!
- */
- dev->mode = CLOCK_EVT_MODE_UNUSED;
- clockevents_exchange_device(dev, NULL);
- dev->event_handler = clockevents_handle_noop;
- td->evtdev = NULL;
- }
+
+ tick_handover_do_timer();
}
#endif

Index: linux/kernel/time/tick-internal.h
===================================================================
--- linux.orig/kernel/time/tick-internal.h
+++ linux/kernel/time/tick-internal.h
@@ -20,7 +20,6 @@ extern int tick_do_timer_cpu __read_most
extern void tick_setup_periodic(struct clock_event_device *dev, int broadcast);
extern void tick_handle_periodic(struct clock_event_device *dev);
extern void tick_check_new_device(struct clock_event_device *dev);
-extern void tick_shutdown(unsigned int cpu);
extern void tick_suspend(void);
extern void tick_resume(void);
extern bool tick_check_replacement(struct clock_event_device *curdev,
@@ -38,6 +37,7 @@ extern void clockevents_shutdown(struct
extern void clockevents_exchange_device(struct clock_event_device *old,
struct clock_event_device *new);
extern void clockevents_handle_noop(struct clock_event_device *dev);
+extern void clockevents_cleanup_dying_cpu(struct clock_event_device *dev);
extern int __clockevents_update_freq(struct clock_event_device *dev, u32 freq);
extern void clockevents_suspend(void);
extern void clockevents_resume(void);
@@ -82,7 +82,7 @@ static inline int tick_check_oneshot_cha
extern int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu);
extern void tick_install_broadcast_device(struct clock_event_device *dev);
extern int tick_is_broadcast_device(struct clock_event_device *dev);
-extern void tick_shutdown_broadcast(unsigned int cpu);
+extern void tick_shutdown_broadcast(void);
extern void tick_suspend_broadcast(void);
extern void tick_resume_broadcast(void);
extern bool tick_resume_check_broadcast(void);
@@ -96,7 +96,7 @@ static inline void tick_install_broadcas
static inline int tick_is_broadcast_device(struct clock_event_device *dev) { return 0; }
static inline int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu) { return 0; }
static inline void tick_do_periodic_broadcast(struct clock_event_device *d) { }
-static inline void tick_shutdown_broadcast(unsigned int cpu) { }
+static inline void tick_shutdown_broadcast(void) { }
static inline void tick_suspend_broadcast(void) { }
static inline void tick_resume_broadcast(void) { }
static inline bool tick_resume_check_broadcast(void) { return false; }


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/