Re: [PATCH] mm: Enable MAP_UNINITIALIZED for archs with mmu

From: KAMEZAWA Hiroyuki
Date: Wed Jan 18 2012 - 21:43:19 EST


On Wed, 18 Jan 2012 10:51:02 -0800
Arun Sharma <asharma@xxxxxx> wrote:

> This enables malloc optimizations where we might
> madvise(..,MADV_DONTNEED) a page only to fault it
> back at a different virtual address.
>
> To ensure that we don't leak sensitive data to
> unprivileged processes, we enable this optimization
> only for pages that are reused within a memory
> cgroup.
>
> The idea is to make this opt-in both at the mmap()
> level and cgroup level so the default behavior is
> unchanged after the patch.
>
> TODO: Ask for a VM_UNINITIALIZED bit
> TODO: Implement a cgroup level opt-in flag
>
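If I read the intended usage correctly, the userspace pattern this
enables is roughly the following (a minimal sketch, not from the
patch; MAP_UNINITIALIZED may need a manual define, since libc
headers may not carry it):

#include <stddef.h>
#include <sys/mman.h>

#ifndef MAP_UNINITIALIZED
#define MAP_UNINITIALIZED 0x4000000	/* value from this patch */
#endif

#define ARENA_SIZE (64UL << 20)

/* Opt in per-mapping: without this flag (and kernel support) every
 * freshly faulted anonymous page is zeroed as usual. */
static void *arena_init(void)
{
	return mmap(NULL, ARENA_SIZE, PROT_READ | PROT_WRITE,
		    MAP_PRIVATE | MAP_ANONYMOUS | MAP_UNINITIALIZED,
		    -1, 0);
}

/* Return the backing pages to the kernel; the virtual range stays
 * mapped, so a later store anywhere in the arena faults a page back
 * in -- possibly at a different virtual address than the one that
 * freed it. */
static void chunk_free(void *chunk, size_t len)
{
	madvise(chunk, len, MADV_DONTNEED);
}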

Hmm, then:

1. A new task that joins this cgroup can see any uncleared data...
2. If a memcg pointer is reused (the cgroup is destroyed and a new
   one gets the same address), the information will be leaked to the
   new cgroup.
3. If VM_UNINITIALIZED is set, the process can see any data that was
   freed by another process which doesn't know about VM_UNINITIALIZED
   at all.
4. The process will be able to see file cache data it has no access
   rights to, once that data has been charged to the memcg.

Points 3 & 4 seem too dangerous.

Isn't it better to make this per-task rather than per-memcg, and only
allow a task to reuse pages that it has freed itself?
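i.e. something like this (just a sketch of the idea; page_last_mm()
is a hypothetical helper that would record the mm when the page is
zapped via MADV_DONTNEED -- nothing like it exists in the patch):

/* Sketch only, names hypothetical: skip the clearing only when the
 * page was last freed by this very mm, not by anyone in the memcg. */
static inline bool page_needs_clearing(struct page *page,
				       struct vm_area_struct *vma)
{
	if (!vma_requests_uninitialized(vma))
		return true;
	/* page_last_mm() is NULL for pages that came from other
	 * tasks, the file cache, etc. => those must be cleared. */
	return page_last_mm(page) != vma->vm_mm;
}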


Thanks,
-Kame


> To: linux-kernel@xxxxxxxxxxxxxxx
> Cc: linux-mm@xxxxxxxxx
> Cc: Balbir Singh <bsingharora@xxxxxxxxx>
> Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@xxxxxxxxxxxxxx>
> Cc: akpm@xxxxxxxxxxxxxxxxxxxx
> Signed-off-by: Arun Sharma <asharma@xxxxxx>
> ---
> include/asm-generic/mman-common.h | 6 +-----
> include/linux/highmem.h | 6 ++++++
> include/linux/mm.h | 2 ++
> include/linux/mman.h | 1 +
> include/linux/page_cgroup.h | 29 +++++++++++++++++++++++++++++
> init/Kconfig | 2 +-
> mm/mempolicy.c | 29 +++++++++++++++++++++++------
> 7 files changed, 63 insertions(+), 12 deletions(-)
>
> diff --git a/include/asm-generic/mman-common.h b/include/asm-generic/mman-common.h
> index 787abbb..71e079f 100644
> --- a/include/asm-generic/mman-common.h
> +++ b/include/asm-generic/mman-common.h
> @@ -19,11 +19,7 @@
> #define MAP_TYPE 0x0f /* Mask for type of mapping */
> #define MAP_FIXED 0x10 /* Interpret addr exactly */
> #define MAP_ANONYMOUS 0x20 /* don't use a file */
> -#ifdef CONFIG_MMAP_ALLOW_UNINITIALIZED
> -# define MAP_UNINITIALIZED 0x4000000 /* For anonymous mmap, memory could be uninitialized */
> -#else
> -# define MAP_UNINITIALIZED 0x0 /* Don't support this flag */
> -#endif
> +#define MAP_UNINITIALIZED 0x4000000 /* For anonymous mmap, memory could be uninitialized */
>
> #define MS_ASYNC 1 /* sync memory asynchronously */
> #define MS_INVALIDATE 2 /* invalidate the caches */
> diff --git a/include/linux/highmem.h b/include/linux/highmem.h
> index 3a93f73..caae922 100644
> --- a/include/linux/highmem.h
> +++ b/include/linux/highmem.h
> @@ -4,6 +4,7 @@
> #include <linux/fs.h>
> #include <linux/kernel.h>
> #include <linux/mm.h>
> +#include <linux/page_cgroup.h>
> #include <linux/uaccess.h>
> #include <linux/hardirq.h>
>
> @@ -156,6 +157,11 @@ __alloc_zeroed_user_highpage(gfp_t movableflags,
> struct page *page = alloc_page_vma(GFP_HIGHUSER | movableflags,
> vma, vaddr);
>
> +#ifdef CONFIG_MMAP_ALLOW_UNINITIALIZED
> + if (!page_needs_clearing(page, vma))
> + return page;
> +#endif
> +
> if (page)
> clear_user_highpage(page, vaddr);
>
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index 4baadd1..c6bab01 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -118,6 +118,8 @@ extern unsigned int kobjsize(const void *objp);
> #define VM_SAO 0x20000000 /* Strong Access Ordering (powerpc) */
> #define VM_PFN_AT_MMAP 0x40000000 /* PFNMAP vma that is fully mapped at mmap time */
> #define VM_MERGEABLE 0x80000000 /* KSM may merge identical pages */
> +#define VM_UNINITIALIZED VM_SAO /* Steal a powerpc bit for now, since we're out
> + of bits for 32 bit archs */
>
> /* Bits set in the VMA until the stack is in its final location */
> #define VM_STACK_INCOMPLETE_SETUP (VM_RAND_READ | VM_SEQ_READ)
> diff --git a/include/linux/mman.h b/include/linux/mman.h
> index 8b74e9b..9bef6c9 100644
> --- a/include/linux/mman.h
> +++ b/include/linux/mman.h
> @@ -87,6 +87,7 @@ calc_vm_flag_bits(unsigned long flags)
> return _calc_vm_trans(flags, MAP_GROWSDOWN, VM_GROWSDOWN ) |
> _calc_vm_trans(flags, MAP_DENYWRITE, VM_DENYWRITE ) |
> _calc_vm_trans(flags, MAP_EXECUTABLE, VM_EXECUTABLE) |
> + _calc_vm_trans(flags, MAP_UNINITIALIZED, VM_UNINITIALIZED) |
> _calc_vm_trans(flags, MAP_LOCKED, VM_LOCKED );
> }
> #endif /* __KERNEL__ */
> diff --git a/include/linux/page_cgroup.h b/include/linux/page_cgroup.h
> index 961ecc7..e959869 100644
> --- a/include/linux/page_cgroup.h
> +++ b/include/linux/page_cgroup.h
> @@ -155,6 +155,17 @@ static inline unsigned long page_cgroup_array_id(struct page_cgroup *pc)
> return (pc->flags >> PCG_ARRAYID_SHIFT) & PCG_ARRAYID_MASK;
> }
>
> +static int mm_match_cgroup(const struct mm_struct *mm,
> + const struct mem_cgroup *cgroup);
> +static inline bool page_seen_by_cgroup(struct page *page,
> + const struct mm_struct *mm)
> +{
> + struct page_cgroup *pcg = lookup_page_cgroup(page);
> + if (pcg == NULL)
> + return false;
> + return mm_match_cgroup(mm, pcg->mem_cgroup);
> +}
> +
> #else /* CONFIG_CGROUP_MEM_RES_CTLR */
> struct page_cgroup;
>
> @@ -175,8 +186,26 @@ static inline void __init page_cgroup_init_flatmem(void)
> {
> }
>
> +static inline bool page_seen_by_cgroup(struct page *page,
> + const struct mm_struct *mm)
> +{
> + return false;
> +}
> +
> #endif /* CONFIG_CGROUP_MEM_RES_CTLR */
>
> +static inline bool vma_requests_uninitialized(struct vm_area_struct *vma)
> +{
> + return vma && !vma->vm_file && vma->vm_flags & VM_UNINITIALIZED;
> +}
> +
> +static inline bool page_needs_clearing(struct page *page,
> + struct vm_area_struct *vma)
> +{
> + return !(vma_requests_uninitialized(vma)
> + && page_seen_by_cgroup(page, vma->vm_mm));
> +}
> +
> #include <linux/swap.h>
>
> #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
> diff --git a/init/Kconfig b/init/Kconfig
> index 43298f9..428e047 100644
> --- a/init/Kconfig
> +++ b/init/Kconfig
> @@ -1259,7 +1259,7 @@ endchoice
>
> config MMAP_ALLOW_UNINITIALIZED
> bool "Allow mmapped anonymous memory to be uninitialized"
> - depends on EXPERT && !MMU
> + depends on EXPERT
> default n
> help
> Normally, and according to the Linux spec, anonymous memory obtained
> diff --git a/mm/mempolicy.c b/mm/mempolicy.c
> index c3fdbcb..7c9ab68 100644
> --- a/mm/mempolicy.c
> +++ b/mm/mempolicy.c
> @@ -90,6 +90,7 @@
> #include <linux/syscalls.h>
> #include <linux/ctype.h>
> #include <linux/mm_inline.h>
> +#include <linux/page_cgroup.h>
>
> #include <asm/tlbflush.h>
> #include <asm/uaccess.h>
> @@ -1847,6 +1848,11 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
> struct zonelist *zl;
> struct page *page;
>
> +#ifdef CONFIG_MMAP_ALLOW_UNINITIALIZED
> + if (vma_requests_uninitialized(vma))
> + gfp &= ~__GFP_ZERO;
> +#endif
> +
> get_mems_allowed();
> if (unlikely(pol->mode == MPOL_INTERLEAVE)) {
> unsigned nid;
> @@ -1854,25 +1860,36 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
> nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
> mpol_cond_put(pol);
> page = alloc_page_interleave(gfp, order, nid);
> - put_mems_allowed();
> - return page;
> + goto out;
> }
> zl = policy_zonelist(gfp, pol, node);
> if (unlikely(mpol_needs_cond_ref(pol))) {
> /*
> * slow path: ref counted shared policy
> */
> - struct page *page = __alloc_pages_nodemask(gfp, order,
> - zl, policy_nodemask(gfp, pol));
> + page = __alloc_pages_nodemask(gfp, order,
> + zl, policy_nodemask(gfp, pol));
> __mpol_put(pol);
> - put_mems_allowed();
> - return page;
> + goto out;
> }
> +
> /*
> * fast path: default or task policy
> */
> page = __alloc_pages_nodemask(gfp, order, zl,
> policy_nodemask(gfp, pol));
> +
> +out:
> +#ifdef CONFIG_MMAP_ALLOW_UNINITIALIZED
> + if (page_needs_clearing(page, vma)) {
> + int i;
> + for (i = 0; i < (1 << order); i++) {
> + void *kaddr = kmap_atomic(page + i, KM_USER0);
> + clear_page(kaddr);
> + kunmap_atomic(kaddr, KM_USER0);
> + }
> + }
> +#endif
> put_mems_allowed();
> return page;
> }
> --
> 1.7.4
>
>
