[PATCH 3/6] x86,tlb: make lazy TLB mode lazier

From: Rik van Riel
Date: Tue Jun 26 2018 - 13:31:53 EST


Lazy TLB mode can result in an idle CPU being woken up by a TLB flush IPI,
when all it really needs to do is reload %CR3 at the next context switch,
assuming no page table pages were freed in the meantime.

Memory ordering is used to prevent race conditions between switch_mm_irqs_off,
which checks whether .tlb_gen changed, and the TLB invalidation code, which
increments .tlb_gen whenever page table entries get invalidated.

The atomic increment in inc_mm_tlb_gen already acts as a full memory barrier;
the context switch code adds an explicit smp_rmb() between reading
tlbstate.is_lazy and reading next->context.tlb_gen.
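
As a simplified sketch (not the literal code in this patch), the pairing
looks roughly like this:

  /* TLB invalidation side, e.g. flush_tlb_mm_range() */
  inc_mm_tlb_gen(mm);        /* atomic64_inc_return(): full barrier */
  /* ... then decide which CPUs still need a flush IPI ... */

  /* context switch side, switch_mm_irqs_off() with prev == next */
  if (was_lazy) {
          smp_rmb();         /* order the is_lazy read vs. the tlb_gen read */
          next_tlb_gen = atomic64_read(&next->context.tlb_gen);
          /* flush and reload %CR3 only if .tlb_gen moved forward */
  }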

Unlike the 2016 version of this patch, CPUs with cpu_tlbstate.is_lazy set
are not removed from mm_cpumask(mm), since doing so would prevent the TLB
flush IPIs sent at page table free time from reaching all the CPUs that
need them.
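
Instead, native_flush_tlb_others() below works on a temporary copy of the
cpumask and only drops lazy CPUs from that copy, leaving mm_cpumask(mm)
itself untouched. Condensed, the change amounts to:

  /* condensed view of the native_flush_tlb_others() change below */
  if (alloc_cpumask_var(&varmask, GFP_ATOMIC)) {
          cpumask_copy(varmask, cpumask);
          for_each_cpu(cpu, varmask)
                  if (per_cpu(cpu_tlbstate.is_lazy, cpu))
                          cpumask_clear_cpu(cpu, varmask);
          mask = varmask;
  }
  smp_call_function_many(mask, flush_tlb_func_remote, (void *)info, 1);
  /* mm_cpumask(mm) stays intact for page table free time flushes */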

Signed-off-by: Rik van Riel <riel@xxxxxxxxxxx>
Tested-by: Song Liu <songliubraving@xxxxxx>
---
arch/x86/include/asm/uv/uv.h | 6 +-
arch/x86/mm/tlb.c | 131 ++++++++++++++++++++++++++++--------------
arch/x86/platform/uv/tlb_uv.c | 2 +-
3 files changed, 92 insertions(+), 47 deletions(-)

diff --git a/arch/x86/include/asm/uv/uv.h b/arch/x86/include/asm/uv/uv.h
index a80c0673798f..d801afb5fe90 100644
--- a/arch/x86/include/asm/uv/uv.h
+++ b/arch/x86/include/asm/uv/uv.h
@@ -17,7 +17,7 @@ extern int is_uv_hubless(void);
extern void uv_cpu_init(void);
extern void uv_nmi_init(void);
extern void uv_system_init(void);
-extern const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
+extern struct cpumask *uv_flush_tlb_others(struct cpumask *cpumask,
const struct flush_tlb_info *info);

#else /* X86_UV */
@@ -27,8 +27,8 @@ static inline int is_uv_system(void) { return 0; }
static inline int is_uv_hubless(void) { return 0; }
static inline void uv_cpu_init(void) { }
static inline void uv_system_init(void) { }
-static inline const struct cpumask *
-uv_flush_tlb_others(const struct cpumask *cpumask,
+static inline struct cpumask *
+uv_flush_tlb_others(struct cpumask *cpumask,
const struct flush_tlb_info *info)
{ return cpumask; }

diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 9a893673c56b..137a2c62c75b 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -7,6 +7,7 @@
#include <linux/export.h>
#include <linux/cpu.h>
#include <linux/debugfs.h>
+#include <linux/gfp.h>

#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
@@ -185,8 +186,11 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
{
struct mm_struct *real_prev = this_cpu_read(cpu_tlbstate.loaded_mm);
u16 prev_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
+ bool was_lazy = this_cpu_read(cpu_tlbstate.is_lazy);
unsigned cpu = smp_processor_id();
u64 next_tlb_gen;
+ bool need_flush;
+ u16 new_asid;

/*
* NB: The scheduler will call us with prev == next when switching
@@ -250,10 +254,20 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
!cpumask_test_cpu(cpu, mm_cpumask(next))))
cpumask_set_cpu(cpu, mm_cpumask(next));

- return;
+ /*
+ * Switching straight from one thread in a process to another
+ * thread in the same process requires no TLB flush at all.
+ */
+ if (!was_lazy)
+ return;
+
+ /*
+ * The code below checks whether there was a TLB flush while
+ * this CPU was in lazy TLB mode. The barrier ensures ordering
+ * with the TLB invalidation code advancing .tlb_gen.
+ */
+ smp_rmb();
} else {
- u16 new_asid;
- bool need_flush;
u64 last_ctx_id = this_cpu_read(cpu_tlbstate.last_ctx_id);

/*
@@ -290,48 +304,47 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
real_prev != &init_mm);
cpumask_clear_cpu(cpu, mm_cpumask(real_prev));

- /*
- * Start remote flushes and then read tlb_gen.
- */
+ /* Start remote flushes. */
cpumask_set_cpu(cpu, mm_cpumask(next));
- next_tlb_gen = atomic64_read(&next->context.tlb_gen);
-
- choose_new_asid(next, next_tlb_gen, &new_asid, &need_flush);
+ }

- if (need_flush) {
- this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id);
- this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen);
- load_new_mm_cr3(next->pgd, new_asid, true);
+ /* Read the tlb_gen to check whether a flush is needed. */
+ next_tlb_gen = atomic64_read(&next->context.tlb_gen);
+ choose_new_asid(next, next_tlb_gen, &new_asid, &need_flush);

- /*
- * NB: This gets called via leave_mm() in the idle path
- * where RCU functions differently. Tracing normally
- * uses RCU, so we need to use the _rcuidle variant.
- *
- * (There is no good reason for this. The idle code should
- * be rearranged to call this before rcu_idle_enter().)
- */
- trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
- } else {
- /* The new ASID is already up to date. */
- load_new_mm_cr3(next->pgd, new_asid, false);
-
- /* See above wrt _rcuidle. */
- trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 0);
- }
+ if (need_flush) {
+ this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id);
+ this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen);
+ load_new_mm_cr3(next->pgd, new_asid, true);

/*
- * Record last user mm's context id, so we can avoid
- * flushing branch buffer with IBPB if we switch back
- * to the same user.
+ * NB: This gets called via leave_mm() in the idle path
+ * where RCU functions differently. Tracing normally
+ * uses RCU, so we need to use the _rcuidle variant.
+ *
+ * (There is no good reason for this. The idle code should
+ * be rearranged to call this before rcu_idle_enter().)
*/
- if (next != &init_mm)
- this_cpu_write(cpu_tlbstate.last_ctx_id, next->context.ctx_id);
+ trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
+ } else {
+ /* The new ASID is already up to date. */
+ load_new_mm_cr3(next->pgd, new_asid, false);

- this_cpu_write(cpu_tlbstate.loaded_mm, next);
- this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid);
+ /* See above wrt _rcuidle. */
+ trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 0);
}

+ /*
+ * Record last user mm's context id, so we can avoid
+ * flushing branch buffer with IBPB if we switch back
+ * to the same user.
+ */
+ if (next != &init_mm)
+ this_cpu_write(cpu_tlbstate.last_ctx_id, next->context.ctx_id);
+
+ this_cpu_write(cpu_tlbstate.loaded_mm, next);
+ this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid);
+
load_mm_cr4(next);
switch_ldt(real_prev, next);
}
@@ -454,6 +467,9 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f,
* paging-structure cache to avoid speculatively reading
* garbage into our TLB. Since switching to init_mm is barely
* slower than a minimal flush, just switch to init_mm.
+ *
+ * This should be rare, with native_flush_tlb_others skipping
+ * IPIs to lazy TLB mode CPUs.
*/
switch_mm_irqs_off(NULL, &init_mm, NULL);
return;
@@ -560,6 +576,22 @@ static void flush_tlb_func_remote(void *info)
void native_flush_tlb_others(const struct cpumask *cpumask,
const struct flush_tlb_info *info)
{
+ struct cpumask *mask = (struct cpumask *)cpumask;
+ cpumask_var_t varmask;
+ bool can_lazy_flush = false;
+ unsigned int cpu;
+
+ /*
+ * A temporary cpumask allows the kernel to skip sending IPIs
+ * to CPUs in lazy TLB state, without also removing them from
+ * mm_cpumask(mm).
+ */
+ if (alloc_cpumask_var(&varmask, GFP_ATOMIC)) {
+ cpumask_copy(varmask, cpumask);
+ mask = varmask;
+ can_lazy_flush = true;
+ }
+
count_vm_tlb_event(NR_TLB_REMOTE_FLUSH);
if (info->end == TLB_FLUSH_ALL)
trace_tlb_flush(TLB_REMOTE_SEND_IPI, TLB_FLUSH_ALL);
@@ -583,17 +615,30 @@ void native_flush_tlb_others(const struct cpumask *cpumask,
* that UV should be updated so that smp_call_function_many(),
* etc, are optimal on UV.
*/
- unsigned int cpu;
-
cpu = smp_processor_id();
- cpumask = uv_flush_tlb_others(cpumask, info);
- if (cpumask)
- smp_call_function_many(cpumask, flush_tlb_func_remote,
+ mask = uv_flush_tlb_others(mask, info);
+ if (mask)
+ smp_call_function_many(mask, flush_tlb_func_remote,
(void *)info, 1);
- return;
+ goto out;
+ }
+
+ /*
+ * Instead of sending IPIs to CPUs in lazy TLB mode, simply skip
+ * them here; they will notice the advanced .tlb_gen and flush the
+ * TLB at their next context switch, or at page table free time.
+ */
+ if (can_lazy_flush) {
+ for_each_cpu(cpu, mask) {
+ if (per_cpu(cpu_tlbstate.is_lazy, cpu))
+ cpumask_clear_cpu(cpu, mask);
+ }
}
- smp_call_function_many(cpumask, flush_tlb_func_remote,
+
+ smp_call_function_many(mask, flush_tlb_func_remote,
(void *)info, 1);
+ out:
+ free_cpumask_var(varmask);
}

/*
diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c
index ca446da48fd2..84a4c6679da6 100644
--- a/arch/x86/platform/uv/tlb_uv.c
+++ b/arch/x86/platform/uv/tlb_uv.c
@@ -1102,7 +1102,7 @@ static int set_distrib_bits(struct cpumask *flush_mask, struct bau_control *bcp,
* Returns pointer to cpumask if some remote flushing remains to be
* done. The returned pointer is valid till preemption is re-enabled.
*/
-const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
+struct cpumask *uv_flush_tlb_others(struct cpumask *cpumask,
const struct flush_tlb_info *info)
{
unsigned int cpu = smp_processor_id();
--
2.14.4