Re: [PATCH 2/6] __wr_after_init: write rare for static allocation

From: Martin Schwidefsky
Date: Wed Dec 12 2018 - 04:49:15 EST


On Wed, 5 Dec 2018 15:13:56 -0800
Andy Lutomirski <luto@xxxxxxxxxx> wrote:

> I added some s390 and powerpc people.
>
> On Tue, Dec 4, 2018 at 4:18 AM Igor Stoppa <igor.stoppa@xxxxxxxxx> wrote:
> >
> > Implementation of write rare for statically allocated data, located in a
> > specific memory section through the use of the __write_rare label.
> >
> > The basic functions are:
> > - wr_memset(): write rare counterpart of memset()
> > - wr_memcpy(): write rare counterpart of memcpy()
> > - wr_assign(): write rare counterpart of the assignment ('=') operator
> > - wr_rcu_assign_pointer(): write rare counterpart of rcu_assign_pointer()
> >
> > The implementation is based on code from Andy Lutomirski and Nadav Amit
> > for patching the text on x86 [here goes reference to commits, once merged]
> >
> > The modification of write protected data is done through an alternate
> > mapping of the same pages, as writable.
> > This mapping is local to each core and is active only for the duration
> > of each write operation.
> > Local interrupts are disabled, while the alternate mapping is active.
> >
> > In theory, it could introduce a non-predictable delay, in a preemptible
> > system, however the amount of data to be altered is likely to be far
> > smaller than a page.
> >
> > Signed-off-by: Igor Stoppa <igor.stoppa@xxxxxxxxxx>
> >
> > CC: Andy Lutomirski <luto@xxxxxxxxxxxxxx>
> > CC: Nadav Amit <nadav.amit@xxxxxxxxx>
> > CC: Matthew Wilcox <willy@xxxxxxxxxxxxx>
> > CC: Peter Zijlstra <peterz@xxxxxxxxxxxxx>
> > CC: Kees Cook <keescook@xxxxxxxxxxxx>
> > CC: Dave Hansen <dave.hansen@xxxxxxxxxxxxxxx>
> > CC: linux-integrity@xxxxxxxxxxxxxxx
> > CC: kernel-hardening@xxxxxxxxxxxxxxxxxx
> > CC: linux-mm@xxxxxxxxx
> > CC: linux-kernel@xxxxxxxxxxxxxxx
> > ---
> > include/linux/prmem.h | 133 ++++++++++++++++++++++++++++++++++++++++++
> > init/main.c | 2 +
> > mm/Kconfig | 4 ++
> > mm/Makefile | 1 +
> > mm/prmem.c | 124 +++++++++++++++++++++++++++++++++++++++
> > 5 files changed, 264 insertions(+)
> > create mode 100644 include/linux/prmem.h
> > create mode 100644 mm/prmem.c
> >
> > diff --git a/include/linux/prmem.h b/include/linux/prmem.h
> > new file mode 100644
> > index 000000000000..b0131c1f5dc0
> > --- /dev/null
> > +++ b/include/linux/prmem.h
> > @@ -0,0 +1,133 @@
> > +/* SPDX-License-Identifier: GPL-2.0 */
> > +/*
> > + * prmem.h: Header for memory protection library
> > + *
> > + * (C) Copyright 2018 Huawei Technologies Co. Ltd.
> > + * Author: Igor Stoppa <igor.stoppa@xxxxxxxxxx>
> > + *
> > + * Support for:
> > + * - statically allocated write rare data
> > + */
> > +
> > +#ifndef _LINUX_PRMEM_H
> > +#define _LINUX_PRMEM_H
> > +
> > +#include <linux/set_memory.h>
> > +#include <linux/mm.h>
> > +#include <linux/vmalloc.h>
> > +#include <linux/string.h>
> > +#include <linux/slab.h>
> > +#include <linux/mutex.h>
> > +#include <linux/compiler.h>
> > +#include <linux/irqflags.h>
> > +
> > +/**
> > + * memtst() - test n bytes of the source to match the c value
> > + * @p: beginning of the memory to test
> > + * @c: byte to compare against
> > + * @len: amount of bytes to test
> > + *
> > + * Returns 0 on success, non-zero otherwise.
> > + */
> > +static inline int memtst(void *p, int c, __kernel_size_t len)
> > +{
> > + __kernel_size_t i;
> > +
> > + for (i = 0; i < len; i++) {
> > + u8 d = *(i + (u8 *)p) - (u8)c;
> > +
> > + if (unlikely(d))
> > + return d;
> > + }
> > + return 0;
> > +}
> > +
> > +
> > +#ifndef CONFIG_PRMEM
> > +
> > +static inline void *wr_memset(void *p, int c, __kernel_size_t len)
> > +{
> > + return memset(p, c, len);
> > +}
> > +
> > +static inline void *wr_memcpy(void *p, const void *q, __kernel_size_t size)
> > +{
> > + return memcpy(p, q, size);
> > +}
> > +
> > +#define wr_assign(var, val) ((var) = (val))
> > +
> > +#define wr_rcu_assign_pointer(p, v) \
> > + rcu_assign_pointer(p, v)
> > +
> > +#else
> > +
> > +enum wr_op_type {
> > + WR_MEMCPY,
> > + WR_MEMSET,
> > + WR_RCU_ASSIGN_PTR,
> > + WR_OPS_NUMBER,
> > +};
> > +
> > +void *__wr_op(unsigned long dst, unsigned long src, __kernel_size_t len,
> > + enum wr_op_type op);
> > +
> > +/**
> > + * wr_memset() - sets n bytes of the destination to the c value
> > + * @p: beginning of the memory to write to
> > + * @c: byte to replicate
> > + * @len: amount of bytes to set
> > + *
> > + * Returns pointer to the destination.
> > + */
> > +static inline void *wr_memset(void *p, int c, __kernel_size_t len)
> > +{
> > + return __wr_op((unsigned long)p, (unsigned long)c, len, WR_MEMSET);
> > +}
> > +
> > +/**
> > + * wr_memcpy() - copies n bytes from source to destination
> > + * @p: beginning of the memory to write to
> > + * @q: beginning of the memory to read from
> > + * @size: amount of bytes to copy
> > + *
> > + * Returns pointer to the destination
> > + */
> > +static inline void *wr_memcpy(void *p, const void *q, __kernel_size_t size)
> > +{
> > + return __wr_op((unsigned long)p, (unsigned long)q, size, WR_MEMCPY);
> > +}
> > +
> > +/**
> > + * wr_assign() - sets a write-rare variable to a specified value
> > + * @var: the variable to set
> > + * @val: the new value
> > + *
> > + * Returns: the variable
> > + *
> > + * Note: it might be possible to optimize this, to use wr_memset in some
> > + * cases (maybe with NULL?).
> > + */
> > +
> > +#define wr_assign(var, val) ({ \
> > + typeof(var) tmp = (typeof(var))val; \
> > + \
> > + wr_memcpy(&var, &tmp, sizeof(var)); \
> > + var; \
> > +})
> > +
> > +/**
> > + * wr_rcu_assign_pointer() - initialize a pointer in rcu mode
> > + * @p: the rcu pointer
> > + * @v: the new value
> > + *
> > + * Returns the value assigned to the rcu pointer.
> > + *
> > + * It is provided as macro, to match rcu_assign_pointer()
> > + */
> > +#define wr_rcu_assign_pointer(p, v) ({ \
> > + __wr_op((unsigned long)&p, v, sizeof(p), WR_RCU_ASSIGN_PTR); \
> > + p; \
> > +})
> > +#endif
> > +#endif
> > diff --git a/init/main.c b/init/main.c
> > index a461150adfb1..a36f2e54f937 100644
> > --- a/init/main.c
> > +++ b/init/main.c
> > @@ -498,6 +498,7 @@ void __init __weak thread_stack_cache_init(void)
> > void __init __weak mem_encrypt_init(void) { }
> >
> > void __init __weak poking_init(void) { }
> > +void __init __weak wr_poking_init(void) { }
> >
> > bool initcall_debug;
> > core_param(initcall_debug, initcall_debug, bool, 0644);
> > @@ -734,6 +735,7 @@ asmlinkage __visible void __init start_kernel(void)
> > delayacct_init();
> >
> > poking_init();
> > + wr_poking_init();
> > check_bugs();
> >
> > acpi_subsystem_init();
> > diff --git a/mm/Kconfig b/mm/Kconfig
> > index d85e39da47ae..9b09339c027f 100644
> > --- a/mm/Kconfig
> > +++ b/mm/Kconfig
> > @@ -142,6 +142,10 @@ config ARCH_DISCARD_MEMBLOCK
> > config MEMORY_ISOLATION
> > bool
> >
> > +config PRMEM
> > + def_bool n
> > + depends on STRICT_KERNEL_RWX && X86_64
> > +
> > #
> > # Only be set on architectures that have completely implemented memory hotplug
> > # feature. If you are not sure, don't touch it.
> > diff --git a/mm/Makefile b/mm/Makefile
> > index d210cc9d6f80..ef3867c16ce0 100644
> > --- a/mm/Makefile
> > +++ b/mm/Makefile
> > @@ -58,6 +58,7 @@ obj-$(CONFIG_SPARSEMEM) += sparse.o
> > obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o
> > obj-$(CONFIG_SLOB) += slob.o
> > obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o
> > +obj-$(CONFIG_PRMEM) += prmem.o
> > obj-$(CONFIG_KSM) += ksm.o
> > obj-$(CONFIG_PAGE_POISONING) += page_poison.o
> > obj-$(CONFIG_SLAB) += slab.o
> > diff --git a/mm/prmem.c b/mm/prmem.c
> > new file mode 100644
> > index 000000000000..e8ab76701831
> > --- /dev/null
> > +++ b/mm/prmem.c
> > @@ -0,0 +1,124 @@
> > +// SPDX-License-Identifier: GPL-2.0
> > +/*
> > + * prmem.c: Memory Protection Library
> > + *
> > + * (C) Copyright 2017-2018 Huawei Technologies Co. Ltd.
> > + * Author: Igor Stoppa <igor.stoppa@xxxxxxxxxx>
> > + */
> > +
> > +#include <linux/mm.h>
> > +#include <linux/string.h>
> > +#include <linux/compiler.h>
> > +#include <linux/slab.h>
> > +#include <linux/mmu_context.h>
> > +#include <linux/rcupdate.h>
> > +#include <linux/prmem.h>
> > +
> > +static __ro_after_init bool wr_ready;
> > +static __ro_after_init struct mm_struct *wr_poking_mm;
> > +static __ro_after_init unsigned long wr_poking_base;
> > +
> > +/*
> > + * The following two variables are statically allocated by the linker
> > + * script at the boundaries of the memory region (rounded up to
> > + * multiples of PAGE_SIZE) reserved for __wr_after_init.
> > + */
> > +extern long __start_wr_after_init;
> > +extern long __end_wr_after_init;
> > +
> > +static inline bool is_wr_after_init(unsigned long ptr, __kernel_size_t size)
> > +{
> > + unsigned long start = (unsigned long)&__start_wr_after_init;
> > + unsigned long end = (unsigned long)&__end_wr_after_init;
> > + unsigned long low = ptr;
> > + unsigned long high = ptr + size;
> > +
> > + return likely(start <= low && low <= high && high <= end);
> > +}
> > +
> > +
> > +void *__wr_op(unsigned long dst, unsigned long src, __kernel_size_t len,
> > + enum wr_op_type op)
> > +{
>
> You might end up wanting something like:
>
> #ifdef __arch_wr_op
> return __arch_wr_op(...);
> #endif
>
> if an arch (s390? powerpc?) decides to have a totally different
> implementation of this.
>
> Hi s390 and powerpc people: it would be nice if this generic
> implementation *worked* on your architectures and if it gave you a
> straightforward way to add a better arch-specific implementation,
> should you think that would be better.

As the code is right now I can guarantee that it will not work on s390.

> > + temporary_mm_state_t prev;
> > + unsigned long flags;
> > + unsigned long offset;
> > + unsigned long wr_poking_addr;
> > +
> > + /* Confirm that the writable mapping exists. */
> > + BUG_ON(!wr_ready);
> > +
> > + if (WARN_ONCE(op >= WR_OPS_NUMBER, "Invalid WR operation.") ||
> > + WARN_ONCE(!is_wr_after_init(dst, len), "Invalid WR range."))
> > + return (void *)dst;
> > +
> > + offset = dst - (unsigned long)&__start_wr_after_init;
> > + wr_poking_addr = wr_poking_base + offset;
> > + local_irq_save(flags);
> > + prev = use_temporary_mm(wr_poking_mm);
> > +
> > + kasan_disable_current();
> > + if (op == WR_MEMCPY)
> > + memcpy((void *)wr_poking_addr, (void *)src, len);
> > + else if (op == WR_MEMSET)
> > + memset((u8 *)wr_poking_addr, (u8)src, len);
> > + else if (op == WR_RCU_ASSIGN_PTR)
> > + /* generic version of rcu_assign_pointer */
> > + smp_store_release((void **)wr_poking_addr,
> > + RCU_INITIALIZER((void **)src));
> > + kasan_enable_current();
>
> Hmm. I suspect this will explode quite badly on sane architectures
> like s390. (In my book, despite how weird s390 is, it has a vastly
> nicer model of "user" memory than any other architecture I know
> of...). I think you should use copy_to_user(), etc, instead. I'm not
> entirely sure what the best smp_store_release() replacement is.
> Making this change may also mean you can get rid of the
> kasan_disable_current().

use_temporary_mm() replaces the *user* mm; on s390 this is completely
independent of the kernel mm (different control registers, cr1 vs cr7).
The wr_poking_addr is therefore not available when running in the kernel
address space.

> > +
> > + barrier(); /* XXX redundant? */
>
> I think it's redundant. If unuse_temporary_mm() allows earlier stores
> to hit the wrong address space, then something is very very wrong, and
> something is also very very wrong if the optimizer starts moving
> stores across a function call that is most definitely a barrier.
>
> > +
> > + unuse_temporary_mm(prev);
> > + /* XXX make the verification optional? */
> > + if (op == WR_MEMCPY)
> > + BUG_ON(memcmp((void *)dst, (void *)src, len));
> > + else if (op == WR_MEMSET)
> > + BUG_ON(memtst((void *)dst, (u8)src, len));
> > + else if (op == WR_RCU_ASSIGN_PTR)
> > + BUG_ON(*(unsigned long *)dst != src);
>
> Hmm. If you allowed cmpxchg or even plain xchg, then these bug_ons
> would be thoroughly buggy, but maybe they're okay. But they should,
> at most, be WARN_ON_ONCE(), given that you can trigger them by writing
> the same addresses from two threads at once, and this isn't even
> entirely obviously bogus given the presence of smp_store_release().
>
> > + local_irq_restore(flags);
> > + return (void *)dst;
> > +}
> > +
> > +struct mm_struct *copy_init_mm(void);
> > +void __init wr_poking_init(void)
> > +{
> > + unsigned long start = (unsigned long)&__start_wr_after_init;
> > + unsigned long end = (unsigned long)&__end_wr_after_init;
> > + unsigned long i;
> > + unsigned long wr_range;
> > +
> > + wr_poking_mm = copy_init_mm();
> > + BUG_ON(!wr_poking_mm);
> > +
> > + /* XXX What if it's too large to fit in the task unmapped mem? */
> > + wr_range = round_up(end - start, PAGE_SIZE);
> > +
> > + /* Randomize the poking address base*/
> > + wr_poking_base = TASK_UNMAPPED_BASE +
> > + (kaslr_get_random_long("Write Rare Poking") & PAGE_MASK) %
> > + (TASK_SIZE - (TASK_UNMAPPED_BASE + wr_range));

The wr_poking_base address is a user space address, no? This will be usable
only with the uaccess primitives on s390.

> > +
> > + /* Create alternate mapping for the entire wr_after_init range. */
> > + for (i = start; i < end; i += PAGE_SIZE) {
> > + struct page *page;
> > + spinlock_t *ptl;
> > + pte_t pte;
> > + pte_t *ptep;
> > + unsigned long wr_poking_addr;
> > +
> > + BUG_ON(!(page = virt_to_page(i)));
> > + wr_poking_addr = i - start + wr_poking_base;
> > +
> > + /* The lock is not needed, but avoids open-coding. */
> > + ptep = get_locked_pte(wr_poking_mm, wr_poking_addr, &ptl);
> > + VM_BUG_ON(!ptep);
> > +
> > + pte = mk_pte(page, PAGE_KERNEL);
> > + set_pte_at(wr_poking_mm, wr_poking_addr, ptep, pte);
> > + spin_unlock(ptl);
> > + }
> > + wr_ready = true;
> > +}
> > --
> > 2.19.1
> >
>


--
blue skies,
Martin.

"Reality continues to ruin my life." - Calvin.