[PATCH] [RFC] Introduce a new pv_mmu_op to free a directory page

From: Peter Zijlstra
Date: Tue May 22 2012 - 07:39:44 EST


At the moment get_user_pages_fast is unsafe on Xen, because it relies on
the fact that flush_tlb_others sends an IPI to flush the tlb but
xen_flush_tlb_others doesn't send any IPIs and always returns succesfully
and immediately.

The kernel offers HAVE_RCU_TABLE_FREE that enables an RCU lock to
protect this kind of software pagetable walks (see
include/asm-generic/tlb.h).
The goal of this patch is to enable HAVE_RCU_TABLE_FREE on Xen without
impacting the native x86 case.

The original patch to enable HAVE_RCU_TABLE_FREE on x86 is by Peter
Zijlstra, see http://marc.info/?l=linux-kernel&m=133595422305515&w=2; I
have only modified it so that it enables HAVE_RCU_TABLE_FREE on Xen
but not on native.
It does so by introducing a new pv_mmu_op to free a directory page:
pv_mmu_ops.tlb_remove_table.
The pvop is set to tlb_remove_page on native and to tlb_remove_table on
Xen.

The result is that if CONFIG_XEN is disabled, the behavior is the same
as today.
If CONFIG_XEN is enabled and we are running on native,
HAVE_RCU_TABLE_FREE is set but tlb_remove_table is never called: we
still call tlb_remove_page so there should be no performance penalty.
If CONFIG_XEN is enabled and we are running on Xen we make full usage of
the RCU directory page freeing as described in tlb.h.

Please advice on how to proceed: is it correct to enable
HAVE_RCU_TABLE_FREE but never call tlb_remove_table?
Is the pvops really the best thing to do here?
Maybe we can just implement get_user_pages_fast as a call to
get_user_pages on Xen?

Signed-off-by: Peter Zijlstra <peterz@xxxxxxxxxxxxx>
Signed-off-by: Stefano Stabellini <stefano.stabellini@xxxxxxxxxxxxx>
---
arch/powerpc/include/asm/pgalloc.h | 1 +
arch/s390/mm/pgtable.c | 1 +
arch/sparc/include/asm/pgalloc_64.h | 1 +
arch/x86/include/asm/paravirt.h | 6 ++++++
arch/x86/include/asm/paravirt_types.h | 3 +++
arch/x86/include/asm/pgalloc.h | 1 +
arch/x86/kernel/paravirt.c | 2 ++
arch/x86/mm/pgtable.c | 6 +++---
arch/x86/xen/Kconfig | 1 +
arch/x86/xen/mmu.c | 7 +++++++
mm/memory.c | 7 +++++++
11 files changed, 33 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/include/asm/pgalloc.h b/arch/powerpc/include/asm/pgalloc.h
index bf301ac..c33ae79 100644
--- a/arch/powerpc/include/asm/pgalloc.h
+++ b/arch/powerpc/include/asm/pgalloc.h
@@ -49,6 +49,7 @@ static inline void __tlb_remove_table(void *_table)

pgtable_free(table, shift);
}
+#define __tlb_remove_table __tlb_remove_table
#else /* CONFIG_SMP */
static inline void pgtable_free_tlb(struct mmu_gather *tlb, void *table, unsigned shift)
{
diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c
index 6e765bf..7029ed7 100644
--- a/arch/s390/mm/pgtable.c
+++ b/arch/s390/mm/pgtable.c
@@ -730,6 +730,7 @@ void __tlb_remove_table(void *_table)
else
free_pages((unsigned long) table, ALLOC_ORDER);
}
+#define __tlb_remove_table __tlb_remove_table

static void tlb_remove_table_smp_sync(void *arg)
{
diff --git a/arch/sparc/include/asm/pgalloc_64.h b/arch/sparc/include/asm/pgalloc_64.h
index 40b2d7a..d10913a 100644
--- a/arch/sparc/include/asm/pgalloc_64.h
+++ b/arch/sparc/include/asm/pgalloc_64.h
@@ -106,6 +106,7 @@ static inline void __tlb_remove_table(void *_table)
is_page = true;
pgtable_free(table, is_page);
}
+#define __tlb_remove_table __tlb_remove_table
#else /* CONFIG_SMP */
static inline void pgtable_free_tlb(struct mmu_gather *tlb, void *table, bool is_page)
{
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index aa0f913..42c6a9b 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -402,6 +402,12 @@ static inline void flush_tlb_others(const struct cpumask *cpumask,
PVOP_VCALL3(pv_mmu_ops.flush_tlb_others, cpumask, mm, va);
}

+static inline void paravirt_tlb_remove_table(struct mmu_gather *tlb,
+ struct page *page)
+{
+ PVOP_VCALL2(pv_mmu_ops.tlb_remove_table, tlb, page);
+}
+
static inline int paravirt_pgd_alloc(struct mm_struct *mm)
{
return PVOP_CALL1(int, pv_mmu_ops.pgd_alloc, mm);
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
index 8e8b9a4..7e5ad6d 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -51,6 +51,7 @@ struct mm_struct;
struct desc_struct;
struct task_struct;
struct cpumask;
+struct mmu_gather;

/*
* Wrapper type for pointers to code which uses the non-standard
@@ -251,6 +252,8 @@ struct pv_mmu_ops {
void (*flush_tlb_others)(const struct cpumask *cpus,
struct mm_struct *mm,
unsigned long va);
+ /* free a directory page */
+ void (*tlb_remove_table)(struct mmu_gather *tlb, struct page *page);

/* Hooks for allocating and freeing a pagetable top-level */
int (*pgd_alloc)(struct mm_struct *mm);
diff --git a/arch/x86/include/asm/pgalloc.h b/arch/x86/include/asm/pgalloc.h
index b4389a4..0cc3251 100644
--- a/arch/x86/include/asm/pgalloc.h
+++ b/arch/x86/include/asm/pgalloc.h
@@ -20,6 +20,7 @@ static inline void paravirt_alloc_pud(struct mm_struct *mm, unsigned long pfn) {
static inline void paravirt_release_pte(unsigned long pfn) {}
static inline void paravirt_release_pmd(unsigned long pfn) {}
static inline void paravirt_release_pud(unsigned long pfn) {}
+#define paravirt_tlb_remove_table(tlb, page) tlb_remove_page(tlb, page)
#endif

/*
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index ab13760..370b3b4 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -37,6 +37,7 @@
#include <asm/fixmap.h>
#include <asm/apic.h>
#include <asm/tlbflush.h>
+#include <asm/tlb.h>
#include <asm/timer.h>
#include <asm/special_insns.h>

@@ -422,6 +423,7 @@ struct pv_mmu_ops pv_mmu_ops = {
.flush_tlb_kernel = native_flush_tlb_global,
.flush_tlb_single = native_flush_tlb_single,
.flush_tlb_others = native_flush_tlb_others,
+ .tlb_remove_table = tlb_remove_page,

.pgd_alloc = __paravirt_pgd_alloc,
.pgd_free = paravirt_nop,
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 8573b83..904d45c 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -51,21 +51,21 @@ void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte)
{
pgtable_page_dtor(pte);
paravirt_release_pte(page_to_pfn(pte));
- tlb_remove_page(tlb, pte);
+ paravirt_tlb_remove_table(tlb, pte);
}

#if PAGETABLE_LEVELS > 2
void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd)
{
paravirt_release_pmd(__pa(pmd) >> PAGE_SHIFT);
- tlb_remove_page(tlb, virt_to_page(pmd));
+ paravirt_tlb_remove_table(tlb, virt_to_page(pmd));
}

#if PAGETABLE_LEVELS > 3
void ___pud_free_tlb(struct mmu_gather *tlb, pud_t *pud)
{
paravirt_release_pud(__pa(pud) >> PAGE_SHIFT);
- tlb_remove_page(tlb, virt_to_page(pud));
+ paravirt_tlb_remove_table(tlb, virt_to_page(pud));
}
#endif /* PAGETABLE_LEVELS > 3 */
#endif /* PAGETABLE_LEVELS > 2 */
diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig
index fdce49c..e2efb29 100644
--- a/arch/x86/xen/Kconfig
+++ b/arch/x86/xen/Kconfig
@@ -6,6 +6,7 @@ config XEN
bool "Xen guest support"
select PARAVIRT
select PARAVIRT_CLOCK
+ select HAVE_RCU_TABLE_FREE if SMP
depends on X86_64 || (X86_32 && X86_PAE && !X86_VISWS)
depends on X86_CMPXCHG && X86_TSC
help
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 69f5857..a58153b 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -62,6 +62,7 @@
#include <asm/init.h>
#include <asm/pat.h>
#include <asm/smp.h>
+#include <asm/tlb.h>

#include <asm/xen/hypercall.h>
#include <asm/xen/hypervisor.h>
@@ -1281,6 +1282,11 @@ static void xen_flush_tlb_others(const struct cpumask *cpus,
xen_mc_issue(PARAVIRT_LAZY_MMU);
}

+static void xen_tlb_remove_table(struct mmu_gather *tlb, struct page *page)
+{
+ tlb_remove_table(tlb, page);
+}
+
static unsigned long xen_read_cr3(void)
{
return this_cpu_read(xen_cr3);
@@ -2006,6 +2012,7 @@ static const struct pv_mmu_ops xen_mmu_ops __initconst = {
.flush_tlb_kernel = xen_flush_tlb,
.flush_tlb_single = xen_flush_tlb_single,
.flush_tlb_others = xen_flush_tlb_others,
+ .tlb_remove_table = xen_tlb_remove_table,

.pte_update = paravirt_nop,
.pte_update_defer = paravirt_nop,
diff --git a/mm/memory.c b/mm/memory.c
index 6105f47..c12685d 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -297,6 +297,13 @@ int __tlb_remove_page(struct mmu_gather *tlb, struct page *page)
* See the comment near struct mmu_table_batch.
*/

+#ifndef __tlb_remove_table
+static void __tlb_remove_table(void *table)
+{
+ free_page_and_swap_cache(table);
+}
+#endif
+
static void tlb_remove_table_smp_sync(void *arg)
{
/* Simply deliver the interrupt */
--
1.7.2.5

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/