Re: [RFC] sched: Add a new lockless wake-from-idle implementation

From: Peter Zijlstra
Date: Thu Feb 13 2014 - 09:50:36 EST


On Wed, Feb 12, 2014 at 05:40:12PM -0800, Andy Lutomirski wrote:
> This is a strawman proposal to simplify the idle implementation, eliminate
> a race
>
> Benefits over current code:
> - ttwu_queue_remote doesn't use an IPI unless needed
> - The diffstat should speak for itself :)
> - Less racy. Spurious IPIs are possible, but only in narrow windows or
> when two wakeups occur in rapid succession.
> - Seems to work (?)
>
> Issues:
> - Am I doing the percpu stuff right?
> - Needs work on non-x86 architectures
> - The !CONFIG_SMP case needs to be checked
> - Is "idlepoll" a good name for the new code? It doesn't have *that*
> much to do with the idle state. Maybe cpukick?
>
> If this turns out okay, TIF_NEED_RESCHED could possibly be deleted as well.

No, we can't do away with that; its used in some fairly critical paths
(return to userspace) and adding a second cacheline load there would be
unfortunate.

I also don't really like how the polling state is an atomic; its a cpu
local property.

Now given we can't get rid of TIF_NEED_RESCHED, and we need an atomic op
on a remote cacheline anyhow; the simplest solution would be to convert
all TS_POLLING users to TIF_POLLING_NRFLAG and use an atomic_or_return()
like construct to do:

atomic_or_return(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG

and avoid the IPI if that is false.

Something a little like this; it does require a lot of auditing; but it
boots on my x86_64.

---
arch/x86/include/asm/mwait.h | 19 ++++----
arch/x86/include/asm/thread_info.h | 4 +-
arch/x86/kernel/process.c | 8 ++--
include/linux/sched.h | 90 +++-----------------------------------
kernel/sched/core.c | 70 ++++++++++++++++++-----------
kernel/sched/idle.c | 40 ++++++++---------
kernel/sched/sched.h | 3 ++
7 files changed, 88 insertions(+), 146 deletions(-)

diff --git a/arch/x86/include/asm/mwait.h b/arch/x86/include/asm/mwait.h
index 1da25a5f96f9..d6a7b7cdc478 100644
--- a/arch/x86/include/asm/mwait.h
+++ b/arch/x86/include/asm/mwait.h
@@ -42,18 +42,15 @@ static inline void __mwait(unsigned long eax, unsigned long ecx)
*/
static inline void mwait_idle_with_hints(unsigned long eax, unsigned long ecx)
{
- if (!current_set_polling_and_test()) {
- if (static_cpu_has(X86_FEATURE_CLFLUSH_MONITOR)) {
- mb();
- clflush((void *)&current_thread_info()->flags);
- mb();
- }
-
- __monitor((void *)&current_thread_info()->flags, 0, 0);
- if (!need_resched())
- __mwait(eax, ecx);
+ if (static_cpu_has(X86_FEATURE_CLFLUSH_MONITOR)) {
+ mb();
+ clflush((void *)&current_thread_info()->flags);
+ mb();
}
- current_clr_polling();
+
+ __monitor((void *)&current_thread_info()->flags, 0, 0);
+ if (!need_resched())
+ __mwait(eax, ecx);
}

#endif /* _ASM_X86_MWAIT_H */
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index e1940c06ed02..1f2fe575cfca 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -88,6 +88,7 @@ struct thread_info {
#define TIF_FORK 18 /* ret_from_fork */
#define TIF_NOHZ 19 /* in adaptive nohz mode */
#define TIF_MEMDIE 20 /* is terminating due to OOM killer */
+#define TIF_POLLING_NRFLAG 21
#define TIF_IO_BITMAP 22 /* uses I/O bitmap */
#define TIF_FORCED_TF 24 /* true if TF in eflags artificially */
#define TIF_BLOCKSTEP 25 /* set when we want DEBUGCTLMSR_BTF */
@@ -111,6 +112,7 @@ struct thread_info {
#define _TIF_IA32 (1 << TIF_IA32)
#define _TIF_FORK (1 << TIF_FORK)
#define _TIF_NOHZ (1 << TIF_NOHZ)
+#define _TIF_POLLING_NRFLAG (1 << TIF_POLLING_NRFLAG)
#define _TIF_IO_BITMAP (1 << TIF_IO_BITMAP)
#define _TIF_FORCED_TF (1 << TIF_FORCED_TF)
#define _TIF_BLOCKSTEP (1 << TIF_BLOCKSTEP)
@@ -234,8 +236,6 @@ static inline struct thread_info *current_thread_info(void)
* have to worry about atomic accesses.
*/
#define TS_COMPAT 0x0002 /* 32bit syscall active (64BIT)*/
-#define TS_POLLING 0x0004 /* idle task polling need_resched,
- skip sending interrupt */
#define TS_RESTORE_SIGMASK 0x0008 /* restore signal mask in do_signal() */

#ifndef __ASSEMBLY__
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 4505e2a950d8..298c002629c6 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -306,9 +306,11 @@ void arch_cpu_idle(void)
*/
void default_idle(void)
{
- trace_cpu_idle_rcuidle(1, smp_processor_id());
- safe_halt();
- trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());
+ if (!current_clr_polling_and_test()) {
+ trace_cpu_idle_rcuidle(1, smp_processor_id());
+ safe_halt();
+ trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());
+ }
}
#ifdef CONFIG_APM_MODULE
EXPORT_SYMBOL(default_idle);
diff --git a/include/linux/sched.h b/include/linux/sched.h
index c49a2585ff7d..b326abe95978 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2692,85 +2692,26 @@ static inline int spin_needbreak(spinlock_t *lock)
* thread_info.status and one based on TIF_POLLING_NRFLAG in
* thread_info.flags
*/
-#ifdef TS_POLLING
-static inline int tsk_is_polling(struct task_struct *p)
-{
- return task_thread_info(p)->status & TS_POLLING;
-}
-static inline void __current_set_polling(void)
-{
- current_thread_info()->status |= TS_POLLING;
-}
-
-static inline bool __must_check current_set_polling_and_test(void)
-{
- __current_set_polling();
-
- /*
- * Polling state must be visible before we test NEED_RESCHED,
- * paired by resched_task()
- */
- smp_mb();
-
- return unlikely(tif_need_resched());
-}
-
-static inline void __current_clr_polling(void)
-{
- current_thread_info()->status &= ~TS_POLLING;
-}
-
-static inline bool __must_check current_clr_polling_and_test(void)
-{
- __current_clr_polling();
-
- /*
- * Polling state must be visible before we test NEED_RESCHED,
- * paired by resched_task()
- */
- smp_mb();
-
- return unlikely(tif_need_resched());
-}
-#elif defined(TIF_POLLING_NRFLAG)
+#ifdef CONFIG_SMP
static inline int tsk_is_polling(struct task_struct *p)
{
return test_tsk_thread_flag(p, TIF_POLLING_NRFLAG);
}

-static inline void __current_set_polling(void)
+static inline void current_set_polling(void)
{
set_thread_flag(TIF_POLLING_NRFLAG);
}

-static inline bool __must_check current_set_polling_and_test(void)
-{
- __current_set_polling();
-
- /*
- * Polling state must be visible before we test NEED_RESCHED,
- * paired by resched_task()
- *
- * XXX: assumes set/clear bit are identical barrier wise.
- */
- smp_mb__after_clear_bit();
-
- return unlikely(tif_need_resched());
-}
-
-static inline void __current_clr_polling(void)
+static inline void current_clr_polling(void)
{
clear_thread_flag(TIF_POLLING_NRFLAG);
}

static inline bool __must_check current_clr_polling_and_test(void)
{
- __current_clr_polling();
+ current_clr_polling();

- /*
- * Polling state must be visible before we test NEED_RESCHED,
- * paired by resched_task()
- */
smp_mb__after_clear_bit();

return unlikely(tif_need_resched());
@@ -2778,34 +2719,15 @@ static inline bool __must_check current_clr_polling_and_test(void)

#else
static inline int tsk_is_polling(struct task_struct *p) { return 0; }
-static inline void __current_set_polling(void) { }
-static inline void __current_clr_polling(void) { }
+static inline void current_set_polling(void) { }
+static inline void current_clr_polling(void) { }

-static inline bool __must_check current_set_polling_and_test(void)
-{
- return unlikely(tif_need_resched());
-}
static inline bool __must_check current_clr_polling_and_test(void)
{
return unlikely(tif_need_resched());
}
#endif

-static inline void current_clr_polling(void)
-{
- __current_clr_polling();
-
- /*
- * Ensure we check TIF_NEED_RESCHED after we clear the polling bit.
- * Once the bit is cleared, we'll get IPIs with every new
- * TIF_NEED_RESCHED and the IPI handler, scheduler_ipi(), will also
- * fold.
- */
- smp_mb(); /* paired with resched_task() */
-
- preempt_fold_need_resched();
-}
-
static __always_inline bool need_resched(void)
{
return unlikely(tif_need_resched());
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index fb9764fbc537..19de9a1cc96c 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -504,45 +504,63 @@ static inline void init_hrtick(void)
}
#endif /* CONFIG_SCHED_HRTICK */

-/*
- * resched_task - mark a task 'to be rescheduled now'.
- *
- * On UP this means the setting of the need_resched flag, on SMP it
- * might also involve a cross-CPU call to trigger the scheduler on
- * the target CPU.
- */
-void resched_task(struct task_struct *p)
+static void __resched_cpu(int cpu)
{
- int cpu;
+ struct rq *rq = cpu_rq(cpu);
+ struct task_struct *p;
+ struct thread_info *ti;
+ unsigned long val, old, new;
+ bool ipi;

- lockdep_assert_held(&task_rq(p)->lock);
+ rcu_read_lock();
+ do {
+ p = ACCESS_ONCE(rq->curr);
+ ti = task_thread_info(p);
+
+ val = ti->flags;
+ for (;;) {
+ new = val | _TIF_NEED_RESCHED;
+ old = cmpxchg(&ti->flags, val, new);
+ if (old == val)
+ break;
+ val = old;
+ }

- if (test_tsk_need_resched(p))
- return;
+ ipi = !(old & _TIF_POLLING_NRFLAG);

- set_tsk_need_resched(p);
+ } while (p != ACCESS_ONCE(rq->curr));
+ rcu_read_unlock();

- cpu = task_cpu(p);
+ if (ipi)
+ smp_send_reschedule(cpu);
+}
+
+void resched_cpu(int cpu)
+{
if (cpu == smp_processor_id()) {
+ set_tsk_need_resched(current);
set_preempt_need_resched();
return;
}

- /* NEED_RESCHED must be visible before we test polling */
- smp_mb();
- if (!tsk_is_polling(p))
- smp_send_reschedule(cpu);
+ __resched_cpu(cpu);
}

-void resched_cpu(int cpu)
+/*
+ * resched_task - mark a task 'to be rescheduled now'.
+ *
+ * On UP this means the setting of the need_resched flag, on SMP it
+ * might also involve a cross-CPU call to trigger the scheduler on
+ * the target CPU.
+ */
+void resched_task(struct task_struct *p)
{
- struct rq *rq = cpu_rq(cpu);
- unsigned long flags;
+ lockdep_assert_held(&task_rq(p)->lock);

- if (!raw_spin_trylock_irqsave(&rq->lock, flags))
+ if (test_tsk_need_resched(p))
return;
- resched_task(cpu_curr(cpu));
- raw_spin_unlock_irqrestore(&rq->lock, flags);
+
+ resched_cpu(task_cpu(p));
}

#ifdef CONFIG_SMP
@@ -1476,7 +1494,7 @@ static int ttwu_remote(struct task_struct *p, int wake_flags)
}

#ifdef CONFIG_SMP
-static void sched_ttwu_pending(void)
+void sched_ttwu_pending(void)
{
struct rq *rq = this_rq();
struct llist_node *llist = llist_del_all(&rq->wake_list);
@@ -2689,7 +2707,7 @@ static void __sched __schedule(void)
update_rq_clock(rq);

next = pick_next_task(rq, prev);
- clear_tsk_need_resched(prev);
+ clear_tsk_need_resched(next);
clear_preempt_need_resched();
rq->skip_clock_update = 0;

diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 14ca43430aee..03748737473e 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -11,6 +11,7 @@
#include <asm/tlb.h>

#include <trace/events/power.h>
+#include "sched.h"

static int __read_mostly cpu_idle_force_poll;

@@ -71,6 +72,8 @@ static void cpu_idle_loop(void)
while (1) {
tick_nohz_idle_enter();

+ current_set_polling();
+
while (!need_resched()) {
check_pgt_cache();
rmb();
@@ -93,29 +96,27 @@ static void cpu_idle_loop(void)
if (cpu_idle_force_poll || tick_check_broadcast_expired()) {
cpu_idle_poll();
} else {
- if (!current_clr_polling_and_test()) {
- stop_critical_timings();
- rcu_idle_enter();
- if (cpuidle_idle_call())
- arch_cpu_idle();
- if (WARN_ON_ONCE(irqs_disabled()))
- local_irq_enable();
- rcu_idle_exit();
- start_critical_timings();
- } else {
+ stop_critical_timings();
+ rcu_idle_enter();
+ if (cpuidle_idle_call())
+ arch_cpu_idle();
+ if (WARN_ON_ONCE(irqs_disabled()))
local_irq_enable();
- }
- __current_set_polling();
+ rcu_idle_exit();
+ start_critical_timings();
}
arch_cpu_idle_exit();
- /*
- * We need to test and propagate the TIF_NEED_RESCHED
- * bit here because we might not have send the
- * reschedule IPI to idle tasks.
- */
- if (tif_need_resched())
- set_preempt_need_resched();
}
+
+ current_clr_polling();
+
+ /*
+ * We need to test and propagate the TIF_NEED_RESCHED bit here
+ * because we might not have send the reschedule IPI to idle
+ * tasks.
+ */
+ set_preempt_need_resched();
+ sched_ttwu_pending();
tick_nohz_idle_exit();
schedule_preempt_disabled();
}
@@ -138,7 +139,6 @@ void cpu_startup_entry(enum cpuhp_state state)
*/
boot_init_stack_canary();
#endif
- __current_set_polling();
arch_cpu_idle_prepare();
cpu_idle_loop();
}
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 1bf34c257d3b..269b1a4e0bdf 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1160,6 +1160,7 @@ extern const struct sched_class idle_sched_class;

#ifdef CONFIG_SMP

+extern void sched_ttwu_pending(void);
extern void update_group_power(struct sched_domain *sd, int cpu);

extern void trigger_load_balance(struct rq *rq);
@@ -1170,6 +1171,8 @@ extern void idle_exit_fair(struct rq *this_rq);

#else /* CONFIG_SMP */

+static inline void sched_ttwu_pending(void) { }
+
static inline void idle_balance(int cpu, struct rq *rq)
{
}



--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/