[PATCH 1/2] timer: Avoid waking up an idle core by migrating a running timer

From: Viresh Kumar
Date: Tue Mar 31 2015 - 02:55:42 EST


While queuing a timer, we try to migrate it to a non-idle core if the local core
is idle, but we don't try that if the timer is re-armed from its handler.

A few unsolved problems have kept us from doing this until now, but there are
cases where solving them is worthwhile. When a timer is always re-armed from
its handler, it never migrates to another core, and it often ends up waking an
idle core just to service the timer, which a non-idle core could have handled
instead.
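
For illustration only (the names here are made up), a timer that always
re-arms itself from its own handler looks roughly like this:

static struct timer_list my_timer;

static void my_timer_fn(unsigned long data)
{
	/* do the periodic work, then re-arm from within the handler */
	mod_timer(&my_timer, jiffies + HZ);
}

Since the re-arm always happens from within the handler, __mod_timer() never
moves the timer to another base, and it keeps firing on the same, possibly
idle, CPU.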

Problems to solve if we allow such migrations:

- Serialization of the timer with itself: the timer may fire on the new base
before its handler has finished on the old base.

- del_timer_sync() can't detect that the timer's handler hasn't finished.

__mod_timer() migrates the timer with the following code:

spin_lock(&old_base->lock);
timer_set_base(timer, NULL);
spin_unlock(&old_base->lock);

spin_lock(&new_base->lock);
timer_set_base(timer, new_base);
spin_unlock(&new_base->lock);

Once new_base->lock is released, del_timer_sync() can take new_base->lock and
will see new_base->running_timer != timer. del_timer_sync() will then remove
the timer and return, even though the timer's handler hasn't finished yet on
the old base.
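
To make the race concrete (CPU labels are illustrative):

CPU0                                   CPU1
----                                   ----
handler runs on old_base
  handler re-arms the timer,
  __mod_timer() moves it to new_base
                                       del_timer_sync()
                                         spin_lock(&new_base->lock)
                                         new_base->running_timer != timer
                                         detach the timer and return
handler still running on old_base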

To fix these issues, let's use another bit of the base pointer to mark whether
a timer's handler is currently running. del_timer_sync() can then use this bit
to verify the timer's state.
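
This works because struct tvec_base is sufficiently aligned that the low bits
of the base pointer are always zero and can carry flags; tbase_get_base()
masks them off before the pointer is used. Condensed from the diff below:

#define TIMER_DEFERRABLE	0x1LU
#define TIMER_IRQSAFE		0x2LU
#define TIMER_RUNNING		0x4LU	/* new: handler is currently executing */
#define TIMER_FLAG_MASK		0x7LU

static inline unsigned int timer_running(struct timer_list *timer)
{
	return (unsigned int)(unsigned long)timer->base & TIMER_RUNNING;
}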

Before running a timer's handler, we must ensure the timer isn't running on
any other CPU. If it is, process other expired timers first, if any, and then
wait until the handler finishes.
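
In __run_timers() this looks roughly as follows (condensed from the diff
below):

again:
	while (!list_empty(head)) {
		timer = list_first_entry(head, struct timer_list, entry);

		if (unlikely(timer_running(timer))) {
			if (list_is_last(&timer->entry, head)) {
				/* nothing else to run: drop the lock and wait */
				spin_unlock(&base->lock);
				while (timer_running(timer))
					cpu_relax();
				spin_lock(&base->lock);
			} else {
				/* rotate the list and service someone else first */
				list_move_tail(&timer->entry, head);
			}
			goto again;
		}

		/* ... mark running, detach, call the handler, then clear running ... */
	}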

Cc: Steven Miao <realmz6@xxxxxxxxx>
Suggested-by: Peter Zijlstra (Intel) <peterz@xxxxxxxxxxxxx>
Signed-off-by: Viresh Kumar <viresh.kumar@xxxxxxxxxx>
---
include/linux/timer.h | 3 +-
kernel/time/timer.c | 93 ++++++++++++++++++++++++++++++++++++++++-----------
2 files changed, 76 insertions(+), 20 deletions(-)

diff --git a/include/linux/timer.h b/include/linux/timer.h
index 8c5a197e1587..68bf09d69352 100644
--- a/include/linux/timer.h
+++ b/include/linux/timer.h
@@ -67,8 +67,9 @@ extern struct tvec_base boot_tvec_bases;
*/
#define TIMER_DEFERRABLE 0x1LU
#define TIMER_IRQSAFE 0x2LU
+#define TIMER_RUNNING 0x4LU

-#define TIMER_FLAG_MASK 0x3LU
+#define TIMER_FLAG_MASK 0x7LU

#define __TIMER_INITIALIZER(_function, _expires, _data, _flags) { \
.entry = { .prev = TIMER_ENTRY_STATIC }, \
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 2d3f5c504939..364644811485 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -105,6 +105,21 @@ static inline unsigned int tbase_get_irqsafe(struct tvec_base *base)
return ((unsigned int)(unsigned long)base & TIMER_IRQSAFE);
}

+static inline unsigned int timer_running(struct timer_list *timer)
+{
+ return ((unsigned int)(unsigned long)timer->base & TIMER_RUNNING);
+}
+
+static inline void timer_set_running(struct timer_list *timer)
+{
+ timer->base = (struct tvec_base *)((unsigned long)timer->base | TIMER_RUNNING);
+}
+
+static inline void timer_clear_running(struct timer_list *timer)
+{
+ timer->base = (struct tvec_base *)((unsigned long)timer->base & ~TIMER_RUNNING);
+}
+
static inline struct tvec_base *tbase_get_base(struct tvec_base *base)
{
return ((struct tvec_base *)((unsigned long)base & ~TIMER_FLAG_MASK));
@@ -781,21 +796,12 @@ __mod_timer(struct timer_list *timer, unsigned long expires,
new_base = per_cpu(tvec_bases, cpu);

if (base != new_base) {
- /*
- * We are trying to schedule the timer on the local CPU.
- * However we can't change timer's base while it is running,
- * otherwise del_timer_sync() can't detect that the timer's
- * handler yet has not finished. This also guarantees that
- * the timer is serialized wrt itself.
- */
- if (likely(base->running_timer != timer)) {
- /* See the comment in lock_timer_base() */
- timer_set_base(timer, NULL);
- spin_unlock(&base->lock);
- base = new_base;
- spin_lock(&base->lock);
- timer_set_base(timer, base);
- }
+ /* See the comment in lock_timer_base() */
+ timer_set_base(timer, NULL);
+ spin_unlock(&base->lock);
+ base = new_base;
+ spin_lock(&base->lock);
+ timer_set_base(timer, base);
}

timer->expires = expires;
@@ -1016,7 +1022,7 @@ int try_to_del_timer_sync(struct timer_list *timer)

base = lock_timer_base(timer, &flags);

- if (base->running_timer != timer) {
+ if (!timer_running(timer)) {
timer_stats_timer_clear_start_info(timer);
ret = detach_if_pending(timer, base, true);
}
@@ -1050,12 +1056,12 @@ EXPORT_SYMBOL(try_to_del_timer_sync);
* ---- ----
* <SOFTIRQ>
* call_timer_fn();
- * base->running_timer = mytimer;
+ * timer_set_running(mytimer);
* spin_lock_irq(somelock);
* <IRQ>
* spin_lock(somelock);
* del_timer_sync(mytimer);
- * while (base->running_timer == mytimer);
+ * while (timer_running(mytimer));
*
* Now del_timer_sync() will never return and never release somelock.
* The interrupt on the other CPU is waiting to grab somelock but
@@ -1189,12 +1195,41 @@ static inline void __run_timers(struct tvec_base *base)
cascade(base, &base->tv5, INDEX(3));
++base->timer_jiffies;
list_replace_init(base->tv1.vec + index, head);
+
+again:
while (!list_empty(head)) {
void (*fn)(unsigned long);
unsigned long data;
bool irqsafe;

- timer = list_first_entry(head, struct timer_list,entry);
+ timer = list_first_entry(head, struct timer_list, entry);
+
+ if (unlikely(timer_running(timer))) {
+ /*
+ * The only way to get here is if the handler,
+ * running on another base, re-queued itself on
+ * this base, and the handler hasn't finished
+ * yet.
+ */
+
+ if (list_is_last(&timer->entry, head)) {
+ /*
+ * Drop lock, so that TIMER_RUNNING can
+ * be cleared on another base.
+ */
+ spin_unlock(&base->lock);
+
+ while (timer_running(timer))
+ cpu_relax();
+
+ spin_lock(&base->lock);
+ } else {
+ /* Rotate the list and try someone else */
+ list_move_tail(&timer->entry, head);
+ }
+ goto again;
+ }
+
fn = timer->function;
data = timer->data;
irqsafe = tbase_get_irqsafe(timer->base);
@@ -1202,6 +1237,7 @@ static inline void __run_timers(struct tvec_base *base)
timer_stats_account_timer(timer);

base->running_timer = timer;
+ timer_set_running(timer);
detach_expired_timer(timer, base);

if (irqsafe) {
@@ -1213,6 +1249,25 @@ static inline void __run_timers(struct tvec_base *base)
call_timer_fn(timer, fn, data);
spin_lock_irq(&base->lock);
}
+
+ /*
+ * Did the handler running on this base re-queue
+ * itself on another base?
+ */
+ if (unlikely(timer->base != base)) {
+ unsigned long flags;
+ struct tvec_base *tbase;
+
+ spin_unlock(&base->lock);
+
+ tbase = lock_timer_base(timer, &flags);
+ timer_clear_running(timer);
+ spin_unlock(&tbase->lock);
+
+ spin_lock(&base->lock);
+ } else {
+ timer_clear_running(timer);
+ }
}
}
base->running_timer = NULL;
--
2.3.0.rc0.44.ga94655d
