Re: [PATCH V5 6/8] time/cpuidle: Support in tick broadcast frameworkin the absence of external clock device

From: Thomas Gleixner
Date: Wed Jan 22 2014 - 08:28:54 EST


On Wed, 15 Jan 2014, Preeti U Murthy wrote:
> diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
> index 086ad60..d61404e 100644
> --- a/kernel/time/clockevents.c
> +++ b/kernel/time/clockevents.c
> @@ -524,12 +524,13 @@ void clockevents_resume(void)
> #ifdef CONFIG_GENERIC_CLOCKEVENTS
> /**
> * clockevents_notify - notification about relevant events
> + * Returns non zero on error.
> */
> -void clockevents_notify(unsigned long reason, void *arg)
> +int clockevents_notify(unsigned long reason, void *arg)
> {

The interface change of clockevents_notify wants to be a separate
patch.

> diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
> index 9532690..1c23912 100644
> --- a/kernel/time/tick-broadcast.c
> +++ b/kernel/time/tick-broadcast.c
> @@ -20,6 +20,7 @@
> #include <linux/sched.h>
> #include <linux/smp.h>
> #include <linux/module.h>
> +#include <linux/slab.h>
>
> #include "tick-internal.h"
>
> @@ -35,6 +36,15 @@ static cpumask_var_t tmpmask;
> static DEFINE_RAW_SPINLOCK(tick_broadcast_lock);
> static int tick_broadcast_force;
>
> +/*
> + * Helper variables for handling broadcast in the absence of a
> + * tick_broadcast_device.
> + * */
> +static struct hrtimer *bc_hrtimer;
> +static int bc_cpu = -1;
> +static ktime_t bc_next_wakeup;

Why do you need another variable to store the expiry time? The
broadcast code already knows it and the hrtimer expiry value gives you
the same information for free.

> +static int hrtimer_initialized = 0;

What's the point of this hrtimer_initialized dance? Why not simply
making the hrtimer static and avoid that all together. Also adding the
initialization into tick_broadcast_oneshot_available() is
braindamaged. Why not adding this to tick_broadcast_init() which is
the proper place to do?

Aside of that you are making this hrtimer mode unconditional, which
might break existing systems which are not aware of the hrtimer
implications.

What you really want is a pseudo clock event device which has the
proper functions for handling the timer and you can register it from
your architecture code. The broadcast core code needs a few tweaks to
avoid the shutdown of the cpu local clock event device, but aside of
that the whole thing just falls into place. So architectures can use
this if they want and are sure that their low level idle code knows
about the deep idle preventing return value of
clockevents_notify(). Once that works you can register the hrtimer
based broadcast device and a real hardware broadcast device with a
higher rating. It just works.

Find an incomplete and nonfunctional concept patch below. It should be
simple to make it work for real.

Thanks,

tglx
----
Index: linux-2.6/include/linux/clockchips.h
===================================================================
--- linux-2.6.orig/include/linux/clockchips.h
+++ linux-2.6/include/linux/clockchips.h
@@ -62,6 +62,11 @@ enum clock_event_mode {
#define CLOCK_EVT_FEAT_DYNIRQ 0x000020
#define CLOCK_EVT_FEAT_PERCPU 0x000040

+/*
+ * Clockevent device is based on a hrtimer for broadcast
+ */
+#define CLOCK_EVT_FEAT_HRTIMER 0x000080
+
/**
* struct clock_event_device - clock event device descriptor
* @event_handler: Assigned by the framework to be called by the low
@@ -83,6 +88,7 @@ enum clock_event_mode {
* @name: ptr to clock event name
* @rating: variable to rate clock event devices
* @irq: IRQ number (only for non CPU local devices)
+ * @bound_on: Bound on CPU
* @cpumask: cpumask to indicate for which CPUs this device works
* @list: list head for the management code
* @owner: module reference
@@ -113,6 +119,7 @@ struct clock_event_device {
const char *name;
int rating;
int irq;
+ int bound_on;
const struct cpumask *cpumask;
struct list_head list;
struct module *owner;
Index: linux-2.6/kernel/time/tick-broadcast-hrtimer.c
===================================================================
--- /dev/null
+++ linux-2.6/kernel/time/tick-broadcast-hrtimer.c
@@ -0,0 +1,77 @@
+
+static struct hrtimer bctimer;
+
+static void bc_set_mode(enum clock_event_mode mode,
+ struct clock_event_device *bc)
+{
+ switch (mode) {
+ case CLOCK_EVT_MODE_SHUTDOWN:
+ /*
+ * Note, we cannot cancel the timer here as we might
+ * run into the following live lock scenario:
+ *
+ * cpu 0 cpu1
+ * lock(broadcast_lock);
+ * hrtimer_interrupt()
+ * bc_handler()
+ * tick_handle_oneshot_broadcast();
+ * lock(broadcast_lock);
+ * hrtimer_cancel()
+ * wait_for_callback()
+ */
+ hrtimer_try_to_cancel(&bctimer);
+ break;
+ default:
+ break;
+ }
+}
+
+/*
+ * This is called from the guts of the broadcast code when the cpu
+ * which is about to enter idle has the earliest broadcast timer event.
+ */
+static int bc_set_next(ktime_t expires, struct clock_event_device *bc)
+{
+ /*
+ * We try to cancel the timer first. If the callback is on
+ * flight on some other cpu then we let it handle it. If we
+ * were able to cancel the timer nothing can rearm it as we
+ * own broadcast_lock.
+ */
+ if (hrtimer_try_to_cancel(&bctimer) >= 0) {
+ hrtimer_start(&bctimer, expires, HRTIMER_MODE_ABS_PINNED);
+ /* Bind the "device" to the cpu */
+ bc->bound_on = smp_processor_id();
+ }
+ return 0;
+}
+
+static struct clock_event_device ce_broadcast_hrtimer = {
+ .set_mode = bc_set_mode,
+ .set_next_ktime = bc_set_next,
+ .features = CLOCK_EVT_FEAT_ONESHOT |
+ CLOCK_EVT_FEAT_KTIME |
+ CLOCK_EVT_FEAT_HRTIMER,
+ .rating = 0,
+ .bound_on = -1,
+ .min_delta_ns = 1,
+ .max_delta_ns = KTIME_MAX,
+ .min_delta_ticks = 1,
+ .max_delta_ticks = KTIME_MAX,
+ .mult = 1,
+ .shift = 0,
+ .cpumask = cpu_all_mask,
+};
+
+static enum hrtimer_restart bc_handler(struct hrtimer *t)
+{
+ ce_broadcast_hrtimer.event_handler(&ce_broadcast_hrtimer);
+ return HRTIMER_NORESTART;
+}
+
+void tick_setup_hrtimer_broadcast(void)
+{
+ hrtimer_init(&bctimer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
+ bctimer.function = bc_handler;
+ clockevents_register(&ce_broadcast_hrtimer);
+}
Index: linux-2.6/kernel/time/tick-broadcast.c
===================================================================
--- linux-2.6.orig/kernel/time/tick-broadcast.c
+++ linux-2.6/kernel/time/tick-broadcast.c
@@ -630,6 +630,42 @@ again:
raw_spin_unlock(&tick_broadcast_lock);
}

+static int broadcast_needs_cpu(struct clock_event_device *bc, int cpu)
+{
+ if (!(bc->features & CLOCK_EVT_FEAT_HRTIMER))
+ return 0;
+ if (bc->next_event.tv64 == KTIME_MAX)
+ return 0;
+ return bc->bound_on == cpu ? -EBUSY : 0;
+}
+
+static void broadcast_shutdown_local(struct clock_event_device *bc,
+ struct clock_event_device *dev)
+{
+ /*
+ * For hrtimer based broadcasting we cannot shutdown the cpu
+ * local device if our own event is the first one to expire or
+ * if we own the broadcast timer.
+ */
+ if (bc->features & CLOCK_EVT_FEAT_HRTIMER) {
+ if (broadcast_needs_cpu(bc))
+ return;
+ if (dev->next_event.tv64 < bc->next_event.tv64)
+ return;
+ }
+ clockevents_set_mode(dev, CLOCK_EVT_MODE_SHUTDOWN);
+}
+
+static void broadcast_move_bc(int deadcpu)
+{
+ struct clock_event_device *bc = tick_broadcast_device.evtdev;
+
+ if (!bc || !broadcast_needs_cpu(bc, deadcpu))
+ return;
+ /* This moves the broadcast assignment to this cpu */
+ clockevents_program_event(bc, bc->next_event, 1);
+}
+
/*
* Powerstate information: The system enters/leaves a state, where
* affected devices might stop
@@ -639,8 +675,8 @@ void tick_broadcast_oneshot_control(unsi
struct clock_event_device *bc, *dev;
struct tick_device *td;
unsigned long flags;
+ int cpu, ret = 0;
ktime_t now;
- int cpu;

/*
* Periodic mode does not care about the enter/exit of power
@@ -666,7 +702,7 @@ void tick_broadcast_oneshot_control(unsi
if (reason == CLOCK_EVT_NOTIFY_BROADCAST_ENTER) {
if (!cpumask_test_and_set_cpu(cpu, tick_broadcast_oneshot_mask)) {
WARN_ON_ONCE(cpumask_test_cpu(cpu, tick_broadcast_pending_mask));
- clockevents_set_mode(dev, CLOCK_EVT_MODE_SHUTDOWN);
+ broadcast_shutdown_local(bc, dev);
/*
* We only reprogram the broadcast timer if we
* did not mark ourself in the force mask and
@@ -679,6 +715,11 @@ void tick_broadcast_oneshot_control(unsi
dev->next_event.tv64 < bc->next_event.tv64)
tick_broadcast_set_event(bc, cpu, dev->next_event, 1);
}
+ /*
+ * If the current CPU owns the hrtimer broadcast
+ * mechanism, it cannot go deep idle.
+ */
+ ret = broadcast_needs_cpu(bc, cpu);
} else {
if (cpumask_test_and_clear_cpu(cpu, tick_broadcast_oneshot_mask)) {
clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT);
@@ -851,6 +892,8 @@ void tick_shutdown_broadcast_oneshot(uns
cpumask_clear_cpu(cpu, tick_broadcast_pending_mask);
cpumask_clear_cpu(cpu, tick_broadcast_force_mask);

+ broadcast_move_bc(cpu);
+
raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
}

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/