[PATCH RFC] mm: add MAP_EXCLUSIVE to create exclusive user mappings

From: Mike Rapoport
Date: Sun Oct 27 2019 - 06:17:56 EST


From: Mike Rapoport <rppt@xxxxxxxxxxxxx>

The mappings created with MAP_EXCLUSIVE are visible only in the context of
the owning process and can be used by applications to store secret
information that will not be visible not only to other processes but to the
kernel as well.

The pages in these mappings are removed from the kernel direct map and
marked with PG_user_exclusive flag. When the exclusive area is unmapped,
the pages are mapped back into the direct map.

The MAP_EXCLUSIVE flag implies MAP_POPULATE and MAP_LOCKED.

Signed-off-by: Mike Rapoport <rppt@xxxxxxxxxxxxx>
---
arch/x86/mm/fault.c | 14 ++++++++++
fs/proc/task_mmu.c | 1 +
include/linux/mm.h | 9 +++++++
include/linux/page-flags.h | 7 +++++
include/linux/page_excl.h | 49 ++++++++++++++++++++++++++++++++++
include/trace/events/mmflags.h | 9 ++++++-
include/uapi/asm-generic/mman-common.h | 1 +
kernel/fork.c | 3 ++-
mm/Kconfig | 3 +++
mm/gup.c | 8 ++++++
mm/memory.c | 3 +++
mm/mmap.c | 16 +++++++++++
mm/page_alloc.c | 5 ++++
13 files changed, 126 insertions(+), 2 deletions(-)
create mode 100644 include/linux/page_excl.h

diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 9ceacd1..8f73a75 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -17,6 +17,7 @@
#include <linux/context_tracking.h> /* exception_enter(), ... */
#include <linux/uaccess.h> /* faulthandler_disabled() */
#include <linux/efi.h> /* efi_recover_from_page_fault()*/
+#include <linux/page_excl.h> /* page_is_user_exclusive() */
#include <linux/mm_types.h>

#include <asm/cpufeature.h> /* boot_cpu_has, ... */
@@ -1218,6 +1219,13 @@ static int fault_in_kernel_space(unsigned long address)
return address >= TASK_SIZE_MAX;
}

+static bool fault_in_user_exclusive_page(unsigned long address)
+{
+ struct page *page = virt_to_page(address);
+
+ return page_is_user_exclusive(page);
+}
+
/*
* Called for all faults where 'address' is part of the kernel address
* space. Might get called for faults that originate from *code* that
@@ -1261,6 +1269,12 @@ do_kern_addr_fault(struct pt_regs *regs, unsigned long hw_error_code,
if (spurious_kernel_fault(hw_error_code, address))
return;

+ /* FIXME: warn and handle gracefully */
+ if (unlikely(fault_in_user_exclusive_page(address))) {
+ pr_err("page fault in user exclusive page at %lx", address);
+ force_sig_fault(SIGSEGV, SEGV_MAPERR, (void __user *)address);
+ }
+
/* kprobes don't want to hook the spurious faults: */
if (kprobe_page_fault(regs, X86_TRAP_PF))
return;
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 9442631..99e14d1 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -655,6 +655,7 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma)
#ifdef CONFIG_X86_INTEL_MPX
[ilog2(VM_MPX)] = "mp",
#endif
+ [ilog2(VM_EXCLUSIVE)] = "xl",
[ilog2(VM_LOCKED)] = "lo",
[ilog2(VM_IO)] = "io",
[ilog2(VM_SEQ_READ)] = "sr",
diff --git a/include/linux/mm.h b/include/linux/mm.h
index cc29227..9c43375 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -298,11 +298,13 @@ extern unsigned int kobjsize(const void *objp);
#define VM_HIGH_ARCH_BIT_2 34 /* bit only usable on 64-bit architectures */
#define VM_HIGH_ARCH_BIT_3 35 /* bit only usable on 64-bit architectures */
#define VM_HIGH_ARCH_BIT_4 36 /* bit only usable on 64-bit architectures */
+#define VM_HIGH_ARCH_BIT_5 37 /* bit only usable on 64-bit architectures */
#define VM_HIGH_ARCH_0 BIT(VM_HIGH_ARCH_BIT_0)
#define VM_HIGH_ARCH_1 BIT(VM_HIGH_ARCH_BIT_1)
#define VM_HIGH_ARCH_2 BIT(VM_HIGH_ARCH_BIT_2)
#define VM_HIGH_ARCH_3 BIT(VM_HIGH_ARCH_BIT_3)
#define VM_HIGH_ARCH_4 BIT(VM_HIGH_ARCH_BIT_4)
+#define VM_HIGH_ARCH_5 BIT(VM_HIGH_ARCH_BIT_5)
#endif /* CONFIG_ARCH_USES_HIGH_VMA_FLAGS */

#ifdef CONFIG_ARCH_HAS_PKEYS
@@ -340,6 +342,12 @@ extern unsigned int kobjsize(const void *objp);
# define VM_MPX VM_NONE
#endif

+#ifdef CONFIG_ARCH_USES_HIGH_VMA_FLAGS
+# define VM_EXCLUSIVE VM_HIGH_ARCH_5
+#else
+# define VM_EXCLUSIVE VM_NONE
+#endif
+
#ifndef VM_GROWSUP
# define VM_GROWSUP VM_NONE
#endif
@@ -2594,6 +2602,7 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
#define FOLL_ANON 0x8000 /* don't do file mappings */
#define FOLL_LONGTERM 0x10000 /* mapping lifetime is indefinite: see below */
#define FOLL_SPLIT_PMD 0x20000 /* split huge pmd before returning */
+#define FOLL_EXCLUSIVE 0x40000 /* mapping is exclusive to owning mm */

/*
* NOTE on FOLL_LONGTERM:
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index f91cb88..32d0aee 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -131,6 +131,9 @@ enum pageflags {
PG_young,
PG_idle,
#endif
+#if defined(CONFIG_EXCLUSIVE_USER_PAGES)
+ PG_user_exclusive,
+#endif
__NR_PAGEFLAGS,

/* Filesystems */
@@ -431,6 +434,10 @@ TESTCLEARFLAG(Young, young, PF_ANY)
PAGEFLAG(Idle, idle, PF_ANY)
#endif

+#ifdef CONFIG_EXCLUSIVE_USER_PAGES
+__PAGEFLAG(UserExclusive, user_exclusive, PF_ANY)
+#endif
+
/*
* On an anonymous page mapped into a user virtual memory area,
* page->mapping points to its anon_vma, not to a struct address_space;
diff --git a/include/linux/page_excl.h b/include/linux/page_excl.h
new file mode 100644
index 0000000..b7ea3ce
--- /dev/null
+++ b/include/linux/page_excl.h
@@ -0,0 +1,49 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_MM_PAGE_EXCLUSIVE_H
+#define _LINUX_MM_PAGE_EXCLUSIVE_H
+
+#include <linux/bitops.h>
+#include <linux/page-flags.h>
+#include <linux/set_memory.h>
+#include <asm/tlbflush.h>
+
+#ifdef CONFIG_EXCLUSIVE_USER_PAGES
+
+static inline bool page_is_user_exclusive(struct page *page)
+{
+ return PageUserExclusive(page);
+}
+
+static inline void __set_page_user_exclusive(struct page *page)
+{
+ unsigned long addr = (unsigned long)page_address(page);
+
+ __SetPageUserExclusive(page);
+ set_direct_map_invalid_noflush(page);
+ flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
+}
+
+static inline void __clear_page_user_exclusive(struct page *page)
+{
+ __ClearPageUserExclusive(page);
+ set_direct_map_default_noflush(page);
+}
+
+#else /* !CONFIG_EXCLUSIVE_USER_PAGES */
+
+static inline bool page_is_user_exclusive(struct page *page)
+{
+ return false;
+}
+
+static inline void __set_page_user_exclusive(struct page *page)
+{
+}
+
+static inline void __clear_page_user_exclusive(struct page *page)
+{
+}
+
+#endif /* CONFIG_EXCLUSIVE_USER_PAGES */
+
+#endif /* _LINUX_MM_PAGE_EXCLUSIVE_H */
diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h
index a1675d4..2d3c14a 100644
--- a/include/trace/events/mmflags.h
+++ b/include/trace/events/mmflags.h
@@ -79,6 +79,12 @@
#define IF_HAVE_PG_IDLE(flag,string)
#endif

+#ifdef CONFIG_EXCLUSIVE_USER_PAGES
+#define IF_HAVE_PG_USER_EXCLUSIVE(flag,string) ,{1UL << flag, string}
+#else
+#define IF_HAVE_PG_USER_EXCLUSIVE(flag,string)
+#endif
+
#define __def_pageflag_names \
{1UL << PG_locked, "locked" }, \
{1UL << PG_waiters, "waiters" }, \
@@ -105,7 +111,8 @@ IF_HAVE_PG_MLOCK(PG_mlocked, "mlocked" ) \
IF_HAVE_PG_UNCACHED(PG_uncached, "uncached" ) \
IF_HAVE_PG_HWPOISON(PG_hwpoison, "hwpoison" ) \
IF_HAVE_PG_IDLE(PG_young, "young" ) \
-IF_HAVE_PG_IDLE(PG_idle, "idle" )
+IF_HAVE_PG_IDLE(PG_idle, "idle" ) \
+IF_HAVE_PG_USER_EXCLUSIVE(PG_user_exclusive, "user_exclusive" )

#define show_page_flags(flags) \
(flags) ? __print_flags(flags, "|", \
diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h
index c160a53..bf8f23e 100644
--- a/include/uapi/asm-generic/mman-common.h
+++ b/include/uapi/asm-generic/mman-common.h
@@ -27,6 +27,7 @@
#define MAP_HUGETLB 0x040000 /* create a huge page mapping */
#define MAP_SYNC 0x080000 /* perform synchronous page faults for the mapping */
#define MAP_FIXED_NOREPLACE 0x100000 /* MAP_FIXED which doesn't unmap underlying mapping */
+#define MAP_EXCLUSIVE 0x200000 /* mapping is exclusive to the owning task; the pages in it are dropped from the direct map */

#define MAP_UNINITIALIZED 0x4000000 /* For anonymous mmap, memory could be
* uninitialized */
diff --git a/kernel/fork.c b/kernel/fork.c
index bcdf531..d63adec 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -518,7 +518,8 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) {
struct file *file;

- if (mpnt->vm_flags & VM_DONTCOPY) {
+ if (mpnt->vm_flags & VM_DONTCOPY ||
+ mpnt->vm_flags & VM_EXCLUSIVE) {
vm_stat_account(mm, mpnt->vm_flags, -vma_pages(mpnt));
continue;
}
diff --git a/mm/Kconfig b/mm/Kconfig
index a5dae9a..9d60141 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -736,4 +736,7 @@ config ARCH_HAS_PTE_SPECIAL
config ARCH_HAS_HUGEPD
bool

+config EXCLUSIVE_USER_PAGES
+ def_bool ARCH_USES_HIGH_VMA_FLAGS && ARCH_HAS_SET_DIRECT_MAP
+
endmenu
diff --git a/mm/gup.c b/mm/gup.c
index 8f236a3..a98c0ca0 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -17,6 +17,7 @@
#include <linux/migrate.h>
#include <linux/mm_inline.h>
#include <linux/sched/mm.h>
+#include <linux/page_excl.h>

#include <asm/mmu_context.h>
#include <asm/pgtable.h>
@@ -868,6 +869,10 @@ static long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
ret = PTR_ERR(page);
goto out;
}
+
+ if (gup_flags & FOLL_EXCLUSIVE)
+ __set_page_user_exclusive(page);
+
if (pages) {
pages[i] = page;
flush_anon_page(vma, page, start);
@@ -1216,6 +1221,9 @@ long populate_vma_page_range(struct vm_area_struct *vma,
if (vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC))
gup_flags |= FOLL_FORCE;

+ if (vma->vm_flags & VM_EXCLUSIVE)
+ gup_flags |= FOLL_EXCLUSIVE;
+
/*
* We made sure addr is within a VMA, so the following will
* not result in a stack expansion that recurses back here.
diff --git a/mm/memory.c b/mm/memory.c
index b1ca51a..a4b4cff 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -71,6 +71,7 @@
#include <linux/dax.h>
#include <linux/oom.h>
#include <linux/numa.h>
+#include <linux/page_excl.h>

#include <asm/io.h>
#include <asm/mmu_context.h>
@@ -1062,6 +1063,8 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
page_remove_rmap(page, false);
if (unlikely(page_mapcount(page) < 0))
print_bad_pte(vma, addr, ptent, page);
+ if (page_is_user_exclusive(page))
+ __clear_page_user_exclusive(page);
if (unlikely(__tlb_remove_page(tlb, page))) {
force_flush = 1;
addr += PAGE_SIZE;
diff --git a/mm/mmap.c b/mm/mmap.c
index a7d8c84..d8cc82d 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1574,6 +1574,9 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
vm_flags |= VM_NORESERVE;
}

+ if (flags & MAP_EXCLUSIVE)
+ vm_flags |= VM_EXCLUSIVE;
+
addr = mmap_region(file, addr, len, vm_flags, pgoff, uf);
if (!IS_ERR_VALUE(addr) &&
((vm_flags & VM_LOCKED) ||
@@ -1591,6 +1594,19 @@ unsigned long ksys_mmap_pgoff(unsigned long addr, unsigned long len,

addr = untagged_addr(addr);

+ if (flags & MAP_EXCLUSIVE) {
+ /*
+ * MAP_EXCLUSIVE is only supported for private
+ * anonymous memory not backed by hugetlbfs
+ */
+ if (!(flags & MAP_ANONYMOUS) || !(flags & MAP_PRIVATE) ||
+ (flags & MAP_HUGETLB))
+ return -EINVAL;
+
+ /* and impies MAP_LOCKED and MAP_POPULATE */
+ flags |= (MAP_LOCKED | MAP_POPULATE);
+ }
+
if (!(flags & MAP_ANONYMOUS)) {
audit_mmap_fd(fd, flags);
file = fget(fd);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index ecc3dba..2f1de9d 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -68,6 +68,7 @@
#include <linux/lockdep.h>
#include <linux/nmi.h>
#include <linux/psi.h>
+#include <linux/page_excl.h>

#include <asm/sections.h>
#include <asm/tlbflush.h>
@@ -4779,6 +4780,10 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int preferred_nid,
page = NULL;
}

+ /* FIXME: should not happen! */
+ if (WARN_ON(page_is_user_exclusive(page)))
+ __clear_page_user_exclusive(page);
+
trace_mm_page_alloc(page, order, alloc_mask, ac.migratetype);

return page;
--
2.7.4