[PATCH] mm: clear 1G pages with streaming stores on x86

From: Cannon Matthews
Date: Fri Mar 06 2020 - 20:04:02 EST


Reimplement clear_gigantic_page() to clear gigabyte pages using the
non-temporal streaming store instruction (movnti), which bypasses the
cache, since an entire 1GiB region will not fit in the cache anyway.

Doing an mlock() on a 512GiB 1G-hugetlb region previously would take on
average 134 seconds, about 260ms/GiB which is quite slow. Using `movnti`
and optimizing the control flow over the constituent small pages, this
can be improved roughly by a factor of 3-4x, with the 512GiB mlock()
taking only 34 seconds on average, or 67ms/GiB.

The assembly code for the clear_page_nt routine is more or less
taken directly from the output of gcc with -O3 for the function below,
with some tweaks to support arbitrary sizes and to move the memory
barrier out to the caller:

/* Reference C version; GiB is presumably the byte count of 1GiB (not defined here). */
void clear_page_nt_64i (void *page)
{
for (int i = 0; i < GiB /sizeof(long long int); ++i)
{
/* Streaming 8-byte store of zero to the i-th 64-bit slot. */
_mm_stream_si64 (((long long int*)page) + i, 0);
}
/* NOTE(review): sfence() looks like shorthand for a store fence such as _mm_sfence(). */
sfence();
}

Tested:
Time to `mlock()` a 512GiB region on a Broadwell CPU:

                   AVG time (s)   % imp.   ms/page
clear_page_erms    133.584        -        261
clear_page_nt      34.154         74.43%   67

An earlier version of this code was sent as an RFC patch ~July 2018
https://patchwork.kernel.org/patch/10543193/ but never merged.

Signed-off-by: Cannon Matthews <cannonmatthews@xxxxxxxxxx>
---
MAINTAINERS | 1 +
arch/x86/Kconfig | 4 ++++
arch/x86/include/asm/page_64.h | 1 +
arch/x86/lib/Makefile | 2 +-
arch/x86/lib/clear_gigantic_page.c | 28 ++++++++++++++++++++++++++++
arch/x86/lib/clear_page_64.S | 19 +++++++++++++++++++
include/linux/mm.h | 2 ++
mm/memory.c | 2 ++
8 files changed, 58 insertions(+), 1 deletion(-)
create mode 100644 arch/x86/lib/clear_gigantic_page.c

diff --git a/MAINTAINERS b/MAINTAINERS
index 68eebf3650ac..efe84f085404 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -7702,6 +7702,7 @@ S: Maintained
F: fs/hugetlbfs/
F: mm/hugetlb.c
F: include/linux/hugetlb.h
+F: arch/x86/lib/clear_gigantic_page.c
F: Documentation/admin-guide/mm/hugetlbpage.rst
F: Documentation/vm/hugetlbfs_reserv.rst
F: Documentation/ABI/testing/sysfs-kernel-mm-hugepages
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index beea77046f9b..f49e7b6f6851 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -70,6 +70,7 @@ config X86
select ARCH_HAS_KCOV if X86_64
select ARCH_HAS_MEM_ENCRYPT
select ARCH_HAS_MEMBARRIER_SYNC_CORE
+ select ARCH_HAS_CLEAR_GIGANTIC_PAGE if X86_64
select ARCH_HAS_PMEM_API if X86_64
select ARCH_HAS_PTE_DEVMAP if X86_64
select ARCH_HAS_PTE_SPECIAL
@@ -290,6 +291,9 @@ config ARCH_MAY_HAVE_PC_FDC
config GENERIC_CALIBRATE_DELAY
def_bool y

+config ARCH_HAS_CLEAR_GIGANTIC_PAGE
+ bool
+
config ARCH_HAS_CPU_RELAX
def_bool y

diff --git a/arch/x86/include/asm/page_64.h b/arch/x86/include/asm/page_64.h
index 939b1cff4a7b..6ea60883b6d6 100644
--- a/arch/x86/include/asm/page_64.h
+++ b/arch/x86/include/asm/page_64.h
@@ -55,6 +55,7 @@ static inline void clear_page(void *page)
}

void copy_page(void *to, void *from);
+void clear_page_nt(void *page, u64 page_size);

#endif /* !__ASSEMBLY__ */

diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
index 5246db42de45..a620c6636210 100644
--- a/arch/x86/lib/Makefile
+++ b/arch/x86/lib/Makefile
@@ -56,7 +56,7 @@ endif
else
obj-y += iomap_copy_64.o
lib-y += csum-partial_64.o csum-copy_64.o csum-wrappers_64.o
- lib-y += clear_page_64.o copy_page_64.o
+ lib-y += clear_page_64.o copy_page_64.o clear_gigantic_page.o
lib-y += memmove_64.o memset_64.o
lib-y += copy_user_64.o
lib-y += cmpxchg16b_emu.o
diff --git a/arch/x86/lib/clear_gigantic_page.c b/arch/x86/lib/clear_gigantic_page.c
new file mode 100644
index 000000000000..6fcb494ec9bc
--- /dev/null
+++ b/arch/x86/lib/clear_gigantic_page.c
@@ -0,0 +1,41 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <asm/page.h>
+
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/sched.h>
+
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS)
+
+/**
+ * clear_gigantic_page - zero a gigantic page using non-temporal stores
+ * @page: first struct page of the region; it is cleared linearly starting at
+ *        page_to_virt(@page)
+ * @addr: unused here; kept so the signature matches the generic version
+ * @pages: number of base pages to clear; must be a multiple of one PMD's worth
+ *
+ * Clears one PMD-sized (2M) chunk per clear_page_nt() call and calls
+ * cond_resched() after each chunk.
+ */
+void clear_gigantic_page(struct page *page, unsigned long addr,
+			 unsigned int pages)
+{
+	/*
+	 * Base pages per chunk. Derived from PMD_SIZE because HPAGE_PMD_NR
+	 * expands to BUILD_BUG() without CONFIG_TRANSPARENT_HUGEPAGE, while
+	 * this file is also built for CONFIG_HUGETLBFS alone.
+	 */
+	const unsigned int chunk_nr = PMD_SIZE / PAGE_SIZE;
+	void *dest = page_to_virt(page);
+	unsigned int i;
+
+	/* Sizes that are not a whole number of chunks are not supported. */
+	BUG_ON(pages % chunk_nr != 0);
+	for (i = 0; i < pages; i += chunk_nr) {
+		clear_page_nt(dest + (i * PAGE_SIZE), PMD_SIZE);
+		cond_resched();
+	}
+	/* clear_page_nt requires an `sfence` barrier. */
+	wmb();
+}
+#endif /* defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS) */
diff --git a/arch/x86/lib/clear_page_64.S b/arch/x86/lib/clear_page_64.S
index c4c7dd115953..1224094fd863 100644
--- a/arch/x86/lib/clear_page_64.S
+++ b/arch/x86/lib/clear_page_64.S
@@ -50,3 +50,22 @@ SYM_FUNC_START(clear_page_erms)
ret
SYM_FUNC_END(clear_page_erms)
EXPORT_SYMBOL_GPL(clear_page_erms)
+
+/*
+ * Zero memory using non-temporal stores (movnti), bypassing the cache.
+ * The caller must issue an `sfence` (wmb()) afterwards.
+ * %rdi - destination.
+ * %rsi - size in bytes. Must be a non-zero multiple of 8.
+ */
+SYM_FUNC_START(clear_page_nt)
+ leaq (%rdi,%rsi), %rdx /* %rdx = end address (destination + size) */
+ xorl %eax, %eax /* zero %rax: the value stored on every iteration */
+ .p2align 4,,10
+ .p2align 3
+.L2:
+ movnti %rax, (%rdi) /* non-temporal 8-byte store of zero */
+ addq $8, %rdi /* advance to the next quadword */
+ cmpq %rdx, %rdi
+ jne .L2 /* store-then-test loop: exits only when %rdi == end exactly */
+ ret
+SYM_FUNC_END(clear_page_nt)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index c54fb96cb1e6..a57f9007374b 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2856,6 +2856,11 @@ enum mf_action_page_type {
};

#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS)
+/* Arch override; otherwise mm/memory.c uses its own static version. */
+#ifdef CONFIG_ARCH_HAS_CLEAR_GIGANTIC_PAGE
+extern void clear_gigantic_page(struct page *page, unsigned long addr,
+				unsigned int pages);
+#endif /* CONFIG_ARCH_HAS_CLEAR_GIGANTIC_PAGE */
extern void clear_huge_page(struct page *page,
unsigned long addr_hint,
unsigned int pages_per_huge_page);
diff --git a/mm/memory.c b/mm/memory.c
index e8bfdf0d9d1d..2a13bf102890 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4706,6 +4706,7 @@ static inline void process_huge_page(
}
}

+#ifndef CONFIG_ARCH_HAS_CLEAR_GIGANTIC_PAGE
static void clear_gigantic_page(struct page *page,
unsigned long addr,
unsigned int pages_per_huge_page)
@@ -4720,6 +4721,7 @@ static void clear_gigantic_page(struct page *page,
clear_user_highpage(p, addr + i * PAGE_SIZE);
}
}
+#endif /* CONFIG_ARCH_HAS_CLEAR_GIGANTIC_PAGE */

static void clear_subpage(unsigned long addr, int idx, void *arg)
{
--
2.25.1.481.gfbce0eb801-goog