Re: [PATCH 17/25] hrtimer: Implementation of softirq hrtimer handling

From: Peter Zijlstra
Date: Wed Sep 27 2017 - 12:40:42 EST


On Thu, Aug 31, 2017 at 12:23:42PM -0000, Anna-Maria Gleixner wrote:
> @@ -540,12 +539,23 @@ static ktime_t __hrtimer_get_next_event(
>
> hrtimer_update_next_timer(cpu_base, NULL);
>
> + if (!cpu_base->softirq_activated) {
> + active = cpu_base->active_bases & HRTIMER_ACTIVE_SOFT;
> + expires_next = __hrtimer_next_event_base(cpu_base, active,
> + expires_next);
> + cpu_base->softirq_expires_next = expires_next;
> + }
> +

So this bugged the living daylights out of me. That's a very asymmetric
thing to do. The normal rule is that get_next_event() updates
cpu_base::next_timer and hrtimer_reprogram() updates
cpu_base::expires_next. And here you make a giant mess of things.
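To spell that invariant out with something compilable, here is a minimal
userspace sketch of the split (toy names only, none of this is the actual
hrtimer code): the get_next_event() side only recomputes ::next_timer and
returns the earliest expiry, while the reprogram() side is the sole writer
of ::expires_next.

/*
 * Minimal standalone model of the split; toy names, not kernel code.
 * toy_get_next_event() recomputes ::next_timer and returns the earliest
 * expiry; toy_reprogram() is the only writer of ::expires_next.
 */
#include <stdint.h>
#include <stdio.h>

#define TOY_KTIME_MAX INT64_MAX

struct toy_timer {
	int64_t expires;
	struct toy_timer *next;
};

struct toy_cpu_base {
	struct toy_timer *queue;	/* armed timers, unsorted list */
	struct toy_timer *next_timer;	/* written by toy_get_next_event() */
	int64_t expires_next;		/* written by toy_reprogram() */
};

static int64_t toy_get_next_event(struct toy_cpu_base *b)
{
	int64_t expires_next = TOY_KTIME_MAX;
	struct toy_timer *t;

	b->next_timer = NULL;
	for (t = b->queue; t; t = t->next) {
		if (t->expires < expires_next) {
			expires_next = t->expires;
			b->next_timer = t;
		}
	}
	return expires_next;	/* caller decides whether to reprogram */
}

static void toy_reprogram(struct toy_cpu_base *b, struct toy_timer *timer)
{
	if (timer->expires >= b->expires_next)
		return;
	b->next_timer = timer;
	b->expires_next = timer->expires;
	/* a real implementation would program the clockevent device here */
}

int main(void)
{
	struct toy_timer t1 = { .expires = 100 };
	struct toy_timer t2 = { .expires = 50, .next = &t1 };
	struct toy_cpu_base base = { .queue = &t2, .expires_next = TOY_KTIME_MAX };

	toy_reprogram(&base, &t2);	/* arm the earlier timer */
	printf("next event: %lld\n", (long long)toy_get_next_event(&base));
	return 0;
}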

The below is a fairly large diff on top of this patch, which is
_completely_ untested, but should hopefully restore sanity.

- fixes the mixing of bool and bitfields in cpu_base
- restores the whole get_next_event() / reprogram() symmetry
  by adding cpu_base::softirq_next_timer.
- adds a comment hopefully explaining the why of that
- adds a for_each_active_base() helper (a toy sketch of the pattern follows
  after this list); we had two copies of that, and for a brief moment I had
  another few, luckily those didn't survive :-)
- uses irqsave/irqrestore for the cpu_base->lock fiddling around
__run_hrtimer().
- folds hrtimer_reprogram_softirq() into hrtimer_reprogram()
- removes the superfluous local_bh_disable(), since local_irq_disable()
  already implies much the same.
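In case the helper isn't obvious from the diff alone, here is a tiny
userspace model of the same consume-a-bit iteration pattern; the names are
invented and __builtin_ctz() stands in for the kernel's __ffs().

/*
 * Toy userspace model of the iteration pattern; invented names, with
 * __builtin_ctz() standing in for the kernel's __ffs().
 */
#include <stdio.h>

struct toy_base {
	const char *name;
};

static struct toy_base toy_bases[4] = {
	{ "MONOTONIC" }, { "REALTIME" }, { "MONOTONIC_SOFT" }, { "REALTIME_SOFT" },
};

static struct toy_base *toy_next_base(unsigned int *active)
{
	struct toy_base *base = NULL;

	if (*active) {
		unsigned int idx = __builtin_ctz(*active);	/* lowest set bit */
		*active &= ~(1U << idx);			/* consume it */
		base = &toy_bases[idx];
	}
	return base;
}

#define toy_for_each_active_base(base, active) \
	while ((base = toy_next_base(&(active))))

int main(void)
{
	unsigned int active = 0x5;	/* bases 0 and 2 are active */
	struct toy_base *base;

	toy_for_each_active_base(base, active)
		printf("%s\n", base->name);
	return 0;
}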

Please consider...

---
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -186,21 +186,26 @@ struct hrtimer_cpu_base {
unsigned int cpu;
unsigned int active_bases;
unsigned int clock_was_set_seq;
- bool migration_enabled;
- bool nohz_active;
- bool softirq_activated;
- unsigned int hres_active : 1,
- in_hrtirq : 1,
- hang_detected : 1;
+
+ unsigned int migration_enabled : 1;
+ unsigned int nohz_active : 1;
+ unsigned int softirq_activated : 1;
+ unsigned int hres_active : 1;
+ unsigned int in_hrtirq : 1;
+ unsigned int hang_detected : 1;
+
#ifdef CONFIG_HIGH_RES_TIMERS
unsigned int nr_events;
unsigned int nr_retries;
unsigned int nr_hangs;
unsigned int max_hang_time;
#endif
+
ktime_t expires_next;
struct hrtimer *next_timer;
ktime_t softirq_expires_next;
+ struct hrtimer *softirq_next_timer;
+
struct hrtimer_clock_base clock_base[HRTIMER_MAX_CLOCK_BASES];
} ____cacheline_aligned;

--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -78,6 +78,7 @@
#define MASK_SHIFT (HRTIMER_BASE_MONOTONIC_SOFT)
#define HRTIMER_ACTIVE_HARD ((1U << MASK_SHIFT) - 1)
#define HRTIMER_ACTIVE_SOFT (HRTIMER_ACTIVE_HARD << MASK_SHIFT)
+#define HRTIMER_ACTIVE (HRTIMER_ACTIVE_SOFT | HRTIMER_ACTIVE_HARD)

/*
* The timer bases:
@@ -493,33 +494,49 @@ static inline void debug_deactivate(stru
trace_hrtimer_cancel(timer);
}

-static inline void hrtimer_update_next_timer(struct hrtimer_cpu_base *cpu_base,
- struct hrtimer *timer)
+static inline bool hrtimer_is_softirq(struct hrtimer *timer)
{
- cpu_base->next_timer = timer;
+ return timer->base->index >= HRTIMER_BASE_MONOTONIC_SOFT;
+}
+
+static struct hrtimer_clock_base *
+__next_base(struct hrtimer_cpu_base *cpu_base, unsigned int *active)
+{
+ struct hrtimer_clock_base *base = NULL;
+
+ if (*active) {
+ unsigned int idx = __ffs(*active);
+ *active &= ~(1U << idx);
+ base = &cpu_base->clock_base[idx];
+ }
+
+ return base;
}

+#define for_each_active_base(base, cpu_base, active) \
+ while ((base = __next_base((cpu_base), &(active))))
+
static ktime_t __hrtimer_next_event_base(struct hrtimer_cpu_base *cpu_base,
unsigned int active,
ktime_t expires_next)
{
+ struct hrtimer_clock_base *base;
ktime_t expires;

- while (active) {
- unsigned int id = __ffs(active);
- struct hrtimer_clock_base *base;
+ for_each_active_base(base, cpu_base, active) {
struct timerqueue_node *next;
struct hrtimer *timer;

- active &= ~(1U << id);
- base = cpu_base->clock_base + id;
-
next = timerqueue_getnext(&base->active);
timer = container_of(next, struct hrtimer, node);
expires = ktime_sub(hrtimer_get_expires(timer), base->offset);
if (expires < expires_next) {
expires_next = expires;
- hrtimer_update_next_timer(cpu_base, timer);
+ if (hrtimer_is_softirq(timer))
+ cpu_base->softirq_next_timer = timer;
+ else
+ cpu_base->next_timer = timer;
}
}
/*
@@ -529,30 +546,47 @@ static ktime_t __hrtimer_next_event_base
*/
if (expires_next < 0)
expires_next = 0;
+
return expires_next;
}

-static ktime_t __hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base)
+/*
+ * Recomputes cpu_base::*next_timer and returns the earliest expires_next but
+ * does not set cpu_base::*expires_next, that is done by hrtimer_reprogram.
+ *
+ * When a softirq is pending, we can ignore the HRTIMER_ACTIVE_SOFT bases;
+ * those timers will get run whenever the softirq gets handled. At the end of
+ * hrtimer_run_softirq(), hrtimer_update_softirq_timer() will re-add these bases.
+ *
+ * Therefore softirq values are those from the HRTIMER_ACTIVE_SOFT clock bases.
+ * The !softirq values are the minima across HRTIMER_ACTIVE, unless an actual
+ * softirq is pending, in which case they're the minima of HRTIMER_ACTIVE_HARD.
+ *
+ * @active_mask must be one of:
+ * - HRTIMER_ACTIVE,
+ * - HRTIMER_ACTIVE_SOFT, or
+ * - HRTIMER_ACTIVE_HARD.
+ */
+static ktime_t
+__hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base, unsigned int active_mask)
{
unsigned int active;
+ struct hrtimer *next_timer = NULL;
ktime_t expires_next = KTIME_MAX;

- hrtimer_update_next_timer(cpu_base, NULL);
-
- if (!cpu_base->softirq_activated) {
+ if (!cpu_base->softirq_activated && (active_mask & HRTIMER_ACTIVE_SOFT)) {
active = cpu_base->active_bases & HRTIMER_ACTIVE_SOFT;
- expires_next = __hrtimer_next_event_base(cpu_base, active,
- expires_next);
- cpu_base->softirq_expires_next = expires_next;
- }
+ cpu_base->softirq_next_timer = next_timer;
+ expires_next = __hrtimer_next_event_base(cpu_base, active, expires_next);

- active = cpu_base->active_bases & HRTIMER_ACTIVE_HARD;
- expires_next = __hrtimer_next_event_base(cpu_base, active, expires_next);
+ next_timer = cpu_base->softirq_next_timer;
+ }

- /*
- * cpu_base->expires_next is not updated here. It is set only
- * in hrtimer_reprogramming path!
- */
+ if (active_mask & HRTIMER_ACTIVE_HARD) {
+ active = cpu_base->active_bases & HRTIMER_ACTIVE_HARD;
+ cpu_base->next_timer = next_timer;
+ expires_next = __hrtimer_next_event_base(cpu_base, active, expires_next);
+ }

return expires_next;
}
@@ -621,7 +655,10 @@ hrtimer_force_reprogram(struct hrtimer_c
if (!cpu_base->hres_active)
return;

- expires_next = __hrtimer_get_next_event(cpu_base);
+ /*
+ * Find the current next expiration time.
+ */
+ expires_next = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE);

if (skip_equal && expires_next == cpu_base->expires_next)
return;
@@ -727,6 +764,20 @@ static void hrtimer_reprogram(struct hrt

WARN_ON_ONCE(hrtimer_get_expires_tv64(timer) < 0);

+ if (hrtimer_is_softirq(timer)) {
+ if (cpu_base->softirq_activated)
+ return;
+
+ if (!ktime_before(expires, cpu_base->softirq_expires_next))
+ return;
+
+ cpu_base->softirq_next_timer = timer;
+ cpu_base->softirq_expires_next = expires;
+
+ if (!ktime_before(expires, cpu_base->expires_next))
+ return;
+ }
+
/*
* If the timer is not on the current cpu, we cannot reprogram
* the other cpus clock event device.
@@ -755,7 +806,7 @@ static void hrtimer_reprogram(struct hrt
return;

/* Update the pointer to the next expiring timer */
- hrtimer_update_next_timer(cpu_base, timer);
+ cpu_base->next_timer = timer;
cpu_base->expires_next = expires;

/*
@@ -979,47 +1030,24 @@ static inline ktime_t hrtimer_update_low
return tim;
}

-static void hrtimer_reprogram_softirq(struct hrtimer *timer)
+static void
+hrtimer_update_softirq_timer(struct hrtimer_cpu_base *cpu_base, bool reprogram)
{
- struct hrtimer_clock_base *base = timer->base;
- struct hrtimer_cpu_base *cpu_base = base->cpu_base;
ktime_t expires;

/*
- * The softirq timer is not rearmed, when the softirq was raised
- * and has not yet run to completion.
+ * Find the next SOFT expiration.
*/
- if (cpu_base->softirq_activated)
- return;
-
- expires = ktime_sub(hrtimer_get_expires(timer), base->offset);
-
- if (!ktime_before(expires, cpu_base->softirq_expires_next))
- return;
-
- cpu_base->softirq_expires_next = expires;
-
- if (!ktime_before(expires, cpu_base->expires_next))
- return;
- hrtimer_reprogram(timer);
-}
-
-static void hrtimer_update_softirq_timer(struct hrtimer_cpu_base *cpu_base,
- bool reprogram)
-{
- ktime_t expires;
-
- expires = __hrtimer_get_next_event(cpu_base);
+ expires = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_SOFT);

if (!reprogram || !ktime_before(expires, cpu_base->expires_next))
return;
+
/*
- * next_timer can be used here, because
- * hrtimer_get_next_event() updated the next
- * timer. expires_next is only set when reprogramming function
- * is called.
+ * cpu_base->*next_timer is recomputed by __hrtimer_get_next_event()
+ * cpu_base->*expires_next is only set by hrtimer_reprogram()
*/
- hrtimer_reprogram(cpu_base->next_timer);
+ hrtimer_reprogram(cpu_base->softirq_next_timer);
}

static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
@@ -1060,12 +1088,9 @@ void hrtimer_start_range_ns(struct hrtim

base = lock_hrtimer_base(timer, &flags);

- if (__hrtimer_start_range_ns(timer, tim, delta_ns, mode, base)) {
- if (timer->base->index < HRTIMER_BASE_MONOTONIC_SOFT)
- hrtimer_reprogram(timer);
- else
- hrtimer_reprogram_softirq(timer);
- }
+ if (__hrtimer_start_range_ns(timer, tim, delta_ns, mode, base))
+ hrtimer_reprogram(timer);
+
unlock_hrtimer_base(timer, &flags);
}
EXPORT_SYMBOL_GPL(hrtimer_start_range_ns);
@@ -1163,7 +1188,7 @@ u64 hrtimer_get_next_event(void)
raw_spin_lock_irqsave(&cpu_base->lock, flags);

if (!__hrtimer_hres_active(cpu_base))
- expires = __hrtimer_get_next_event(cpu_base);
+ expires = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE);

raw_spin_unlock_irqrestore(&cpu_base->lock, flags);

@@ -1263,7 +1288,7 @@ EXPORT_SYMBOL_GPL(hrtimer_active);
static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base,
struct hrtimer_clock_base *base,
struct hrtimer *timer, ktime_t *now,
- bool hardirq)
+ unsigned long flags)
{
enum hrtimer_restart (*fn)(struct hrtimer *);
int restart;
@@ -1298,19 +1323,13 @@ static void __run_hrtimer(struct hrtimer
* protected against migration to a different CPU even if the lock
* is dropped.
*/
- if (hardirq)
- raw_spin_unlock(&cpu_base->lock);
- else
- raw_spin_unlock_irq(&cpu_base->lock);
+ raw_spin_unlock_irqrestore(&cpu_base->lock, flags);

trace_hrtimer_expire_entry(timer, now);
restart = fn(timer);
trace_hrtimer_expire_exit(timer);

- if (hardirq)
- raw_spin_lock(&cpu_base->lock);
- else
- raw_spin_lock_irq(&cpu_base->lock);
+ raw_spin_lock_irqsave(&cpu_base->lock, flags);

/*
* Note: We clear the running state after enqueue_hrtimer and
@@ -1339,19 +1358,15 @@ static void __run_hrtimer(struct hrtimer
}

static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now,
- unsigned int active_mask)
+ unsigned int active_mask, unsigned long flags)
{
unsigned int active = cpu_base->active_bases & active_mask;
+ struct hrtimer_clock_base *base;

- while (active) {
- unsigned int id = __ffs(active);
- struct hrtimer_clock_base *base;
+ for_each_active_base(base, cpu_base, active) {
struct timerqueue_node *node;
ktime_t basenow;

- active &= ~(1U << id);
- base = cpu_base->clock_base + id;
-
basenow = ktime_add(now, base->offset);

while ((node = timerqueue_getnext(&base->active))) {
@@ -1374,8 +1389,7 @@ static void __hrtimer_run_queues(struct
if (basenow < hrtimer_get_softexpires_tv64(timer))
break;

- __run_hrtimer(cpu_base, base, timer, &basenow,
- active_mask == HRTIMER_ACTIVE_HARD);
+ __run_hrtimer(cpu_base, base, timer, &basenow, flags);
}
}
}
@@ -1383,17 +1397,18 @@ static void __hrtimer_run_queues(struct
static __latent_entropy void hrtimer_run_softirq(struct softirq_action *h)
{
struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
+ unsigned long flags;
ktime_t now;

- raw_spin_lock_irq(&cpu_base->lock);
+ raw_spin_lock_irqsave(&cpu_base->lock, flags);

now = hrtimer_update_base(cpu_base);
- __hrtimer_run_queues(cpu_base, now, HRTIMER_ACTIVE_SOFT);
+ __hrtimer_run_queues(cpu_base, now, HRTIMER_ACTIVE_SOFT, flags);

cpu_base->softirq_activated = 0;
hrtimer_update_softirq_timer(cpu_base, true);

- raw_spin_unlock_irq(&cpu_base->lock);
+ raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
}

#ifdef CONFIG_HIGH_RES_TIMERS
@@ -1406,13 +1421,14 @@ void hrtimer_interrupt(struct clock_even
{
struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
ktime_t expires_next, now, entry_time, delta;
+ unsigned long flags;
int retries = 0;

BUG_ON(!cpu_base->hres_active);
cpu_base->nr_events++;
dev->next_event = KTIME_MAX;

- raw_spin_lock(&cpu_base->lock);
+ raw_spin_lock_irqsave(&cpu_base->lock, flags);
entry_time = now = hrtimer_update_base(cpu_base);
retry:
cpu_base->in_hrtirq = 1;
@@ -1431,17 +1447,17 @@ void hrtimer_interrupt(struct clock_even
raise_softirq_irqoff(HRTIMER_SOFTIRQ);
}

- __hrtimer_run_queues(cpu_base, now, HRTIMER_ACTIVE_HARD);
+ __hrtimer_run_queues(cpu_base, now, HRTIMER_ACTIVE_HARD, flags);

- /* Reevaluate the hard interrupt clock bases for the next expiry */
- expires_next = __hrtimer_get_next_event(cpu_base);
+ /* Reevaluate the clock bases for the next expiry */
+ expires_next = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE);
/*
* Store the new expiry value so the migration code can verify
* against it.
*/
cpu_base->expires_next = expires_next;
cpu_base->in_hrtirq = 0;
- raw_spin_unlock(&cpu_base->lock);
+ raw_spin_unlock_irqrestore(&cpu_base->lock, flags);

/* Reprogramming necessary ? */
if (!tick_program_event(expires_next, 0)) {
@@ -1462,7 +1478,7 @@ void hrtimer_interrupt(struct clock_even
* Acquire base lock for updating the offsets and retrieving
* the current time.
*/
- raw_spin_lock(&cpu_base->lock);
+ raw_spin_lock_irqsave(&cpu_base->lock, flags);
now = hrtimer_update_base(cpu_base);
cpu_base->nr_retries++;
if (++retries < 3)
@@ -1475,7 +1491,8 @@ void hrtimer_interrupt(struct clock_even
*/
cpu_base->nr_hangs++;
cpu_base->hang_detected = 1;
- raw_spin_unlock(&cpu_base->lock);
+ raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
+
delta = ktime_sub(now, entry_time);
if ((unsigned int)delta > cpu_base->max_hang_time)
cpu_base->max_hang_time = (unsigned int) delta;
@@ -1517,6 +1534,7 @@ static inline void __hrtimer_peek_ahead_
void hrtimer_run_queues(void)
{
struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
+ unsigned long flags;
ktime_t now;

if (__hrtimer_hres_active(cpu_base))
@@ -1534,7 +1552,7 @@ void hrtimer_run_queues(void)
return;
}

- raw_spin_lock(&cpu_base->lock);
+ raw_spin_lock_irqsave(&cpu_base->lock, flags);
now = hrtimer_update_base(cpu_base);

if (!ktime_before(now, cpu_base->softirq_expires_next)) {
@@ -1543,8 +1561,8 @@ void hrtimer_run_queues(void)
raise_softirq_irqoff(HRTIMER_SOFTIRQ);
}

- __hrtimer_run_queues(cpu_base, now, HRTIMER_ACTIVE_HARD);
- raw_spin_unlock(&cpu_base->lock);
+ __hrtimer_run_queues(cpu_base, now, HRTIMER_ACTIVE_HARD, flags);
+ raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
}

/*
@@ -1768,7 +1786,6 @@ int hrtimers_dead_cpu(unsigned int scpu)
BUG_ON(cpu_online(scpu));
tick_cancel_sched_timer(scpu);

- local_bh_disable();
local_irq_disable();
old_base = &per_cpu(hrtimer_bases, scpu);
new_base = this_cpu_ptr(&hrtimer_bases);
@@ -1796,7 +1813,6 @@ int hrtimers_dead_cpu(unsigned int scpu)
/* Check, if we got expired work to do */
__hrtimer_peek_ahead_timers();
local_irq_enable();
- local_bh_enable();
return 0;
}