Re: PREEMPT_RT vs 'hrtimer: Prevent hrtimer_enqueue_reprogram race'

From: Steven Rostedt
Date: Thu Mar 21 2013 - 22:31:25 EST


On Fri, 2013-03-22 at 01:11 +0000, Ben Hutchings wrote:
> Commit b22affe0aef4 'hrtimer: Prevent hrtimer_enqueue_reprogram race'
> conflicts with the RT patches
> hrtimer-fixup-hrtimer-callback-changes-for-preempt-r.patch and
> peter_zijlstra-frob-hrtimer.patch, as they all change
> hrtimer_enqueue_reprogram(). It seems that the changes in the RT
> patches now belong in __hrtimer_start_range_ns().
>
> Since I haven't seen any RT releases in a while, here's what I came up
> with for 3.2-rt:

Note, I posted a fix on Tuesday:

https://lkml.org/lkml/2013/3/19/369

I'm waiting for Thomas to give his OK on it before releasing the series.
He told me he'll have a look at it tomorrow. I've already ran the series
through all my tests, and will post it immediately after I get the OK.
Or if there's a issue I will have to fix it and rerun my tests.


>
> ---
> From: Thomas Gleixner <tglx@xxxxxxxxxxxxx>
> Date: Fri, 3 Jul 2009 08:44:31 -0500
> Subject: hrtimer: fixup hrtimer callback changes for preempt-rt
>
> In preempt-rt we can not call the callbacks which take sleeping locks
> from the timer interrupt context.
>
> Bring back the softirq split for now, until we fixed the signal
> delivery problem for real.
>
> Signed-off-by: Thomas Gleixner <tglx@xxxxxxxxxxxxx>
> Signed-off-by: Ingo Molnar <mingo@xxxxxxx>
> [bwh: Pull the changes to hrtimer_enqueue_reprogram() up into
> __hrtimer_start_range_ns(), following changes in
> commit b22affe0aef4 'hrtimer: Prevent hrtimer_enqueue_reprogram race'
> backported into 3.2.40]
> Signed-off-by: Ben Hutchings <ben@xxxxxxxxxxxxxxx>
> ---
> --- a/include/linux/hrtimer.h
> +++ b/include/linux/hrtimer.h
> @@ -111,6 +111,8 @@ struct hrtimer {
> enum hrtimer_restart (*function)(struct hrtimer *);
> struct hrtimer_clock_base *base;
> unsigned long state;
> + struct list_head cb_entry;
> + int irqsafe;
> #ifdef CONFIG_TIMER_STATS
> int start_pid;
> void *start_site;
> @@ -147,6 +149,7 @@ struct hrtimer_clock_base {
> int index;
> clockid_t clockid;
> struct timerqueue_head active;
> + struct list_head expired;
> ktime_t resolution;
> ktime_t (*get_time)(void);
> ktime_t softirq_time;
> --- a/kernel/hrtimer.c
> +++ b/kernel/hrtimer.c
> @@ -589,8 +589,7 @@ static int hrtimer_reprogram(struct hrti
> * When the callback is running, we do not reprogram the clock event
> * device. The timer callback is either running on a different CPU or
> * the callback is executed in the hrtimer_interrupt context. The
> - * reprogramming is handled either by the softirq, which called the
> - * callback or at the end of the hrtimer_interrupt.
> + * reprogramming is handled at the end of the hrtimer_interrupt.
> */
> if (hrtimer_callback_running(timer))
> return 0;
> @@ -625,6 +624,9 @@ static int hrtimer_reprogram(struct hrti
> return res;
> }
>
> +static void __run_hrtimer(struct hrtimer *timer, ktime_t *now);
> +static int hrtimer_rt_defer(struct hrtimer *timer);
> +
> /*
> * Initialize the high resolution related parts of cpu_base
> */
> @@ -730,6 +732,11 @@ static inline int hrtimer_enqueue_reprog
> }
> static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) { }
> static inline void retrigger_next_event(void *arg) { }
> +static inline int hrtimer_reprogram(struct hrtimer *timer,
> + struct hrtimer_clock_base *base)
> +{
> + return 0;
> +}
>
> #endif /* CONFIG_HIGH_RES_TIMERS */
>
> @@ -861,9 +868,9 @@ void hrtimer_wait_for_timer(const struct
> {
> struct hrtimer_clock_base *base = timer->base;
>
> - if (base && base->cpu_base && !hrtimer_hres_active(base->cpu_base))
> + if (base && base->cpu_base && !timer->irqsafe)
> wait_event(base->cpu_base->wait,
> - !(timer->state & HRTIMER_STATE_CALLBACK));
> + !(timer->state & HRTIMER_STATE_CALLBACK));
> }
>
> #else
> @@ -913,6 +920,11 @@ static void __remove_hrtimer(struct hrti
> if (!(timer->state & HRTIMER_STATE_ENQUEUED))
> goto out;
>
> + if (unlikely(!list_empty(&timer->cb_entry))) {
> + list_del_init(&timer->cb_entry);
> + goto out;
> + }
> +
> next_timer = timerqueue_getnext(&base->active);
> timerqueue_del(&base->active, &timer->node);
> if (&timer->node == next_timer) {
> @@ -1011,6 +1023,26 @@ int __hrtimer_start_range_ns(struct hrti
> */
> if (leftmost && new_base->cpu_base == &__get_cpu_var(hrtimer_bases)
> && hrtimer_enqueue_reprogram(timer, new_base)) {
> +#ifdef CONFIG_PREEMPT_RT_BASE
> + again:

What kernel are you working with? I don't see anywhere the "again:"
within a PREEMPT_RT_BASE block.

> + /*
> + * Move softirq based timers away from the rbtree in
> + * case it expired already. Otherwise we would have a
> + * stale base->first entry until the softirq runs.
> + */
> + if (!hrtimer_rt_defer(timer)) {
> + ktime_t now = ktime_get();
> +
> + __run_hrtimer(timer, &now);
> + /*
> + * __run_hrtimer might have requeued timer and
> + * it could be base->first again.
> + */
> + if (&timer->node == new_base->active.next &&
> + hrtimer_enqueue_reprogram(timer, new_base))
> + goto again;
> + } else
> +#endif
> if (wakeup) {
> /*
> * We need to drop cpu_base->lock to avoid a
> @@ -1188,6 +1220,7 @@ static void __hrtimer_init(struct hrtime
>
> base = hrtimer_clockid_to_base(clock_id);
> timer->base = &cpu_base->clock_base[base];
> + INIT_LIST_HEAD(&timer->cb_entry);
> timerqueue_init(&timer->node);
>
> #ifdef CONFIG_TIMER_STATS
> @@ -1271,10 +1304,118 @@ static void __run_hrtimer(struct hrtimer
> timer->state &= ~HRTIMER_STATE_CALLBACK;
> }
>
> -#ifdef CONFIG_HIGH_RES_TIMERS
> -
> static enum hrtimer_restart hrtimer_wakeup(struct hrtimer *timer);
>
> +#ifdef CONFIG_PREEMPT_RT_BASE
> +static void hrtimer_rt_reprogram(int restart, struct hrtimer *timer,
> + struct hrtimer_clock_base *base)
> +{
> + /*
> + * Note, we clear the callback flag before we requeue the
> + * timer otherwise we trigger the callback_running() check
> + * in hrtimer_reprogram().
> + */
> + timer->state &= ~HRTIMER_STATE_CALLBACK;
> +
> + if (restart != HRTIMER_NORESTART) {
> + BUG_ON(hrtimer_active(timer));
> + /*
> + * Enqueue the timer, if it's the leftmost timer then
> + * we need to reprogram it.
> + */
> + if (!enqueue_hrtimer(timer, base))
> + return;
> +
> + if (hrtimer_reprogram(timer, base))
> + goto requeue;
> +
> + } else if (hrtimer_active(timer)) {
> + /*
> + * If the timer was rearmed on another CPU, reprogram
> + * the event device.
> + */
> + if (&timer->node == base->active.next &&
> + hrtimer_reprogram(timer, base))
> + goto requeue;
> + }
> + return;
> +
> +requeue:
> + /*
> + * Timer is expired. Thus move it from tree to pending list
> + * again.
> + */
> + __remove_hrtimer(timer, base, timer->state, 0);
> + list_add_tail(&timer->cb_entry, &base->expired);
> +}
> +
> +/*
> + * The changes in mainline which removed the callback modes from
> + * hrtimer are not yet working with -rt. The non wakeup_process()
> + * based callbacks which involve sleeping locks need to be treated
> + * seperately.
> + */
> +static void hrtimer_rt_run_pending(void)
> +{
> + enum hrtimer_restart (*fn)(struct hrtimer *);
> + struct hrtimer_cpu_base *cpu_base;
> + struct hrtimer_clock_base *base;
> + struct hrtimer *timer;
> + int index, restart;
> +
> + local_irq_disable();
> + cpu_base = &per_cpu(hrtimer_bases, smp_processor_id());
> +
> + raw_spin_lock(&cpu_base->lock);
> +
> + for (index = 0; index < HRTIMER_MAX_CLOCK_BASES; index++) {
> + base = &cpu_base->clock_base[index];
> +
> + while (!list_empty(&base->expired)) {
> + timer = list_first_entry(&base->expired,
> + struct hrtimer, cb_entry);
> +
> + /*
> + * Same as the above __run_hrtimer function
> + * just we run with interrupts enabled.
> + */
> + debug_hrtimer_deactivate(timer);
> + __remove_hrtimer(timer, base, HRTIMER_STATE_CALLBACK, 0);
> + timer_stats_account_hrtimer(timer);
> + fn = timer->function;
> +
> + raw_spin_unlock_irq(&cpu_base->lock);
> + restart = fn(timer);
> + raw_spin_lock_irq(&cpu_base->lock);
> +
> + hrtimer_rt_reprogram(restart, timer, base);
> + }
> + }
> +
> + raw_spin_unlock_irq(&cpu_base->lock);
> +
> + wake_up_timer_waiters(cpu_base);
> +}
> +
> +static int hrtimer_rt_defer(struct hrtimer *timer)
> +{
> + if (timer->irqsafe)
> + return 0;
> +
> + __remove_hrtimer(timer, timer->base, timer->state, 0);
> + list_add_tail(&timer->cb_entry, &timer->base->expired);
> + return 1;
> +}
> +
> +#else
> +
> +static inline void hrtimer_rt_run_pending(void) { }
> +static inline int hrtimer_rt_defer(struct hrtimer *timer) { return 0; }
> +
> +#endif
> +
> +#ifdef CONFIG_HIGH_RES_TIMERS
> +
> /*
> * High resolution timer interrupt
> * Called with interrupts disabled
> @@ -1283,7 +1424,7 @@ void hrtimer_interrupt(struct clock_even
> {
> struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
> ktime_t expires_next, now, entry_time, delta;
> - int i, retries = 0;
> + int i, retries = 0, raise = 0;
>
> BUG_ON(!cpu_base->hres_active);
> cpu_base->nr_events++;
> @@ -1349,7 +1490,10 @@ retry:
> break;
> }
>
> - __run_hrtimer(timer, &basenow);
> + if (!hrtimer_rt_defer(timer))
> + __run_hrtimer(timer, &basenow);
> + else
> + raise = 1;
> }
> }
>
> @@ -1364,6 +1508,10 @@ retry:
> if (expires_next.tv64 == KTIME_MAX ||
> !tick_program_event(expires_next, 0)) {
> cpu_base->hang_detected = 0;
> +
> + if (raise)
> + raise_softirq_irqoff(HRTIMER_SOFTIRQ);
> +
> return;
> }
>
> @@ -1444,6 +1592,12 @@ void hrtimer_peek_ahead_timers(void)
> local_irq_restore(flags);
> }
>
> +#else /* CONFIG_HIGH_RES_TIMERS */
> +
> +static inline void __hrtimer_peek_ahead_timers(void) { }
> +
> +#endif /* !CONFIG_HIGH_RES_TIMERS */
> +
> static void run_hrtimer_softirq(struct softirq_action *h)
> {
> struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
> @@ -1453,15 +1607,9 @@ static void run_hrtimer_softirq(struct s
> clock_was_set();
> }
>
> - hrtimer_peek_ahead_timers();
> + hrtimer_rt_run_pending();
> }
>
> -#else /* CONFIG_HIGH_RES_TIMERS */
> -
> -static inline void __hrtimer_peek_ahead_timers(void) { }
> -
> -#endif /* !CONFIG_HIGH_RES_TIMERS */
> -
> /*
> * Called from timer softirq every jiffy, expire hrtimers:
> *
> @@ -1494,7 +1642,7 @@ void hrtimer_run_queues(void)
> struct timerqueue_node *node;
> struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
> struct hrtimer_clock_base *base;
> - int index, gettime = 1;
> + int index, gettime = 1, raise = 0;
>
> if (hrtimer_hres_active())
> return;
> @@ -1519,12 +1667,16 @@ void hrtimer_run_queues(void)
> hrtimer_get_expires_tv64(timer))
> break;
>
> - __run_hrtimer(timer, &base->softirq_time);
> + if (!hrtimer_rt_defer(timer))
> + __run_hrtimer(timer, &base->softirq_time);
> + else
> + raise = 1;
> }
> raw_spin_unlock(&cpu_base->lock);
> }
>
> - wake_up_timer_waiters(cpu_base);
> + if (raise)
> + raise_softirq_irqoff(HRTIMER_SOFTIRQ);
> }
>
> /*
> @@ -1546,6 +1698,7 @@ static enum hrtimer_restart hrtimer_wake
> void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, struct task_struct *task)
> {
> sl->timer.function = hrtimer_wakeup;
> + sl->timer.irqsafe = 1;
> sl->task = task;
> }
> EXPORT_SYMBOL_GPL(hrtimer_init_sleeper);
> @@ -1684,6 +1837,7 @@ static void __cpuinit init_hrtimers_cpu(
> for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
> cpu_base->clock_base[i].cpu_base = cpu_base;
> timerqueue_init_head(&cpu_base->clock_base[i].active);
> + INIT_LIST_HEAD(&cpu_base->clock_base[i].expired);
> }
>
> hrtimer_init_hres(cpu_base);
> @@ -1802,9 +1956,7 @@ void __init hrtimers_init(void)
> hrtimer_cpu_notify(&hrtimers_nb, (unsigned long)CPU_UP_PREPARE,
> (void *)(long)smp_processor_id());
> register_cpu_notifier(&hrtimers_nb);
> -#ifdef CONFIG_HIGH_RES_TIMERS
> open_softirq(HRTIMER_SOFTIRQ, run_hrtimer_softirq);
> -#endif
> }
>
> /**
> --- a/kernel/sched.c
> +++ b/kernel/sched.c
> @@ -189,6 +189,7 @@ void init_rt_bandwidth(struct rt_bandwid
>
> hrtimer_init(&rt_b->rt_period_timer,
> CLOCK_MONOTONIC, HRTIMER_MODE_REL);
> + rt_b->rt_period_timer.irqsafe = 1;
> rt_b->rt_period_timer.function = sched_rt_period_timer;
> }
>
> @@ -1274,6 +1275,7 @@ static void init_rq_hrtick(struct rq *rq
>
> hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
> rq->hrtick_timer.function = hrtick;
> + rq->hrtick_timer.irqsafe = 1;
> }
> #else /* CONFIG_SCHED_HRTICK */
> static inline void hrtick_clear(struct rq *rq)
> --- a/kernel/time/tick-sched.c
> +++ b/kernel/time/tick-sched.c
> @@ -805,6 +805,7 @@ void tick_setup_sched_timer(void)
> * Emulate tick processing via per-CPU hrtimers:
> */
> hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
> + ts->sched_timer.irqsafe = 1;
> ts->sched_timer.function = tick_sched_timer;
>
> /* Get the next period (per cpu) */
> --- a/kernel/watchdog.c
> +++ b/kernel/watchdog.c
> @@ -436,6 +436,7 @@ static void watchdog_prepare_cpu(int cpu
> WARN_ON(per_cpu(softlockup_watchdog, cpu));
> hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
> hrtimer->function = watchdog_timer_fn;
> + hrtimer->irqsafe = 1;
> }
>
> static int watchdog_enable(int cpu)
> ---
> From: Peter Zijlstra <a.p.zijlstra@xxxxxxxxx>
> Date: Fri, 12 Aug 2011 17:39:54 +0200
> Subject: hrtimer: Don't call the timer handler from hrtimer_start
>
> [<ffffffff812de4a9>] __delay+0xf/0x11
> [<ffffffff812e36e9>] do_raw_spin_lock+0xd2/0x13c
> [<ffffffff815028ee>] _raw_spin_lock+0x60/0x73 rt_b->rt_runtime_lock
> [<ffffffff81068f68>] ? sched_rt_period_timer+0xad/0x281
> [<ffffffff81068f68>] sched_rt_period_timer+0xad/0x281
> [<ffffffff8109e5e1>] __run_hrtimer+0x1e4/0x347
> [<ffffffff81068ebb>] ? enqueue_rt_entity+0x36/0x36
> [<ffffffff8109f2b1>] __hrtimer_start_range_ns+0x2b5/0x40a base->cpu_base->lock (lock_hrtimer_base)
> [<ffffffff81068b6f>] __enqueue_rt_entity+0x26f/0x2aa rt_b->rt_runtime_lock (start_rt_bandwidth)
> [<ffffffff81068ead>] enqueue_rt_entity+0x28/0x36
> [<ffffffff81069355>] enqueue_task_rt+0x3d/0xb0
> [<ffffffff810679d6>] enqueue_task+0x5d/0x64
> [<ffffffff810714fc>] task_setprio+0x210/0x29c rq->lock
> [<ffffffff810b56cb>] __rt_mutex_adjust_prio+0x25/0x2a p->pi_lock
> [<ffffffff810b5d2c>] task_blocks_on_rt_mutex+0x196/0x20f
>
> Instead make __hrtimer_start_range_ns() return -ETIME when the timer
> is in the past. Since body actually uses the hrtimer_start*() return
> value its pretty safe to wreck it.
>
> Also, it will only ever return -ETIME for timer->irqsafe || !wakeup
> timers.
>
> Signed-off-by: Peter Zijlstra <a.p.zijlstra@xxxxxxxxx>
> [bwh: Pull the changes to hrtimer_enqueue_reprogram() up into
> __hrtimer_start_range_ns(), following changes in
> commit b22affe0aef4 'hrtimer: Prevent hrtimer_enqueue_reprogram race'
> backported into 3.2.40]
> Signed-off-by: Ben Hutchings <ben@xxxxxxxxxxxxxxx>
> ---
> --- a/kernel/hrtimer.c
> +++ b/kernel/hrtimer.c
> @@ -1023,37 +1023,31 @@ int __hrtimer_start_range_ns(struct hrti
> */
> if (leftmost && new_base->cpu_base == &__get_cpu_var(hrtimer_bases)
> && hrtimer_enqueue_reprogram(timer, new_base)) {
> -#ifdef CONFIG_PREEMPT_RT_BASE
> - again:

Never mind, I see you applied two patches.

> /*
> * Move softirq based timers away from the rbtree in
> * case it expired already. Otherwise we would have a
> * stale base->first entry until the softirq runs.
> */
> - if (!hrtimer_rt_defer(timer)) {
> - ktime_t now = ktime_get();
> -
> - __run_hrtimer(timer, &now);
> - /*
> - * __run_hrtimer might have requeued timer and
> - * it could be base->first again.
> - */
> - if (&timer->node == new_base->active.next &&
> - hrtimer_enqueue_reprogram(timer, new_base))
> - goto again;
> - } else
> + if (wakeup
> +#ifdef CONFIG_PREEMPT_RT_BASE
> + && hrtimer_rt_defer(timer)
> #endif

This is very similar to what I came up with.

> - if (wakeup) {
> - /*
> - * We need to drop cpu_base->lock to avoid a
> - * lock ordering issue vs. rq->lock.
> - */
> + ) {
> raw_spin_unlock(&new_base->cpu_base->lock);
> raise_softirq_irqoff(HRTIMER_SOFTIRQ);
> local_irq_restore(flags);
> - return ret;
> + return 0;
> } else {
> - __raise_softirq_irqoff(HRTIMER_SOFTIRQ);
> + ret = -ETIME;
> +
> + /*
> + * In case we failed to reprogram the timer (mostly
> + * because out current timer is already elapsed),
> + * remove it again and report a failure. This avoids
> + * stale base->first entries.
> + */
> + __remove_hrtimer(timer, new_base,
> + timer->state & HRTIMER_STATE_CALLBACK, 0);

Yep, looks like we are on the right track :-)

> }
> }
>
> ---
>
> Note, the #ifdef in the second patch can't be removed, as the !RT
> definition of hrtimer_rt_defer() returns 0 whereas we want it to be true
> here. (At least, that seems to match the logic of the previous version
> of the patch.)

Yep. That's what I'm waiting to hear too.

-- Steve



--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/