Re: [RFC, PATCH 7/24] i386 Vmi memory hole

From: Gerd Hoffmann
Date: Wed Mar 15 2006 - 03:23:00 EST


>> The complications in my patch come
>> from the fact that the vsyscall page has to be relocated dynamically,
>> requiring, basically run time linking on the page and some tweaks to get
>> sysenter to work. If you don't use vsyscall (say, non-TLS glibc), then
>> you don't need that complexity. But I think it might be needed now,
>> even for Xen.
>
> I believe both Xen and execshield move vsyscall out of fixmap, and then
> map into userspace as normal vma.

Yep, my patch (attached below for reference) moves the vsyscall page
into user address space, just below PAGE_OFFSET. Works basically the
same way the vsyscall page is mapped in the ia32 emulation of the x86_64
architecture. Address stays fixed, thus the relocation magic isn't needed.

Once the vsyscall page is moved out of the fixmap, it's easy to make the
fixmap movable and thus have a runtime-resizable hole at the top of the
address space. That patch is attached too, although it is more of a
proof of concept and doesn't make much sense as-is. It adds a kernel
command line option (fixmap=<hex address>) to specify the top of the
address space so you can play around with it ...

Both patches are against -rc3 and most likely still apply just fine;
I haven't tested that, though.

cheers,

Gerd

--
Gerd 'just married' Hoffmann <kraxel@xxxxxxx>
I'm the hacker formerly known as Gerd Knorr.
http://www.suse.de/~kraxel/just-married.jpeg
Index: vanilla-2.6.16-rc3/arch/i386/kernel/asm-offsets.c
===================================================================
--- vanilla-2.6.16-rc3.orig/arch/i386/kernel/asm-offsets.c 2006-01-03 04:21:10.000000000 +0100
+++ vanilla-2.6.16-rc3/arch/i386/kernel/asm-offsets.c 2006-02-15 10:59:41.000000000 +0100
@@ -68,5 +68,5 @@ void foo(void)
sizeof(struct tss_struct));

DEFINE(PAGE_SIZE_asm, PAGE_SIZE);
- DEFINE(VSYSCALL_BASE, __fix_to_virt(FIX_VSYSCALL));
+ DEFINE(VSYSCALL_BASE, (PAGE_OFFSET - 2*PAGE_SIZE));
}
Index: vanilla-2.6.16-rc3/arch/i386/kernel/sysenter.c
===================================================================
--- vanilla-2.6.16-rc3.orig/arch/i386/kernel/sysenter.c 2006-01-03 04:21:10.000000000 +0100
+++ vanilla-2.6.16-rc3/arch/i386/kernel/sysenter.c 2006-02-13 09:57:36.000000000 +0100
@@ -13,6 +13,7 @@
#include <linux/gfp.h>
#include <linux/string.h>
#include <linux/elf.h>
+#include <linux/mm.h>

#include <asm/cpufeature.h>
#include <asm/msr.h>
@@ -45,23 +46,88 @@ void enable_sep_cpu(void)
*/
extern const char vsyscall_int80_start, vsyscall_int80_end;
extern const char vsyscall_sysenter_start, vsyscall_sysenter_end;
+static void *syscall_page;

int __init sysenter_setup(void)
{
- void *page = (void *)get_zeroed_page(GFP_ATOMIC);
-
- __set_fixmap(FIX_VSYSCALL, __pa(page), PAGE_READONLY_EXEC);
+ syscall_page = (void *)get_zeroed_page(GFP_ATOMIC);

if (!boot_cpu_has(X86_FEATURE_SEP)) {
- memcpy(page,
+ memcpy(syscall_page,
&vsyscall_int80_start,
&vsyscall_int80_end - &vsyscall_int80_start);
return 0;
}

- memcpy(page,
+ memcpy(syscall_page,
&vsyscall_sysenter_start,
&vsyscall_sysenter_end - &vsyscall_sysenter_start);

return 0;
}
+
+static struct page*
+syscall_nopage(struct vm_area_struct *vma, unsigned long adr, int *type)
+{
+ struct page *p = virt_to_page(adr - vma->vm_start + syscall_page);
+ get_page(p);
+ return p;
+}
+
+/* Prevent VMA merging */
+static void syscall_vma_close(struct vm_area_struct *vma)
+{
+}
+
+static struct vm_operations_struct syscall_vm_ops = {
+ .close = syscall_vma_close,
+ .nopage = syscall_nopage,
+};
+
+/* Setup a VMA at program startup for the vsyscall page */
+int arch_setup_additional_pages(struct linux_binprm *bprm, int exstack)
+{
+ struct vm_area_struct *vma;
+ struct mm_struct *mm = current->mm;
+ int ret;
+
+ vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
+ if (!vma)
+ return -ENOMEM;
+
+ memset(vma, 0, sizeof(struct vm_area_struct));
+ /* Could randomize here */
+ vma->vm_start = VSYSCALL_BASE;
+ vma->vm_end = VSYSCALL_BASE + PAGE_SIZE;
+ /* MAYWRITE to allow gdb to COW and set breakpoints */
+ vma->vm_flags = VM_READ|VM_EXEC|VM_MAYREAD|VM_MAYEXEC|VM_MAYWRITE;
+ vma->vm_flags |= mm->def_flags;
+ vma->vm_page_prot = protection_map[vma->vm_flags & 7];
+ vma->vm_ops = &syscall_vm_ops;
+ vma->vm_mm = mm;
+
+ down_write(&mm->mmap_sem);
+ if ((ret = insert_vm_struct(mm, vma))) {
+ up_write(&mm->mmap_sem);
+ kmem_cache_free(vm_area_cachep, vma);
+ return ret;
+ }
+ mm->total_vm++;
+ up_write(&mm->mmap_sem);
+ return 0;
+}
+
+struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
+{
+ return NULL;
+}
+
+int in_gate_area(struct task_struct *task, unsigned long addr)
+{
+ return 0;
+}
+
+int in_gate_area_no_task(unsigned long addr)
+{
+ return 0;
+}
Index: vanilla-2.6.16-rc3/include/asm-i386/a.out.h
===================================================================
--- vanilla-2.6.16-rc3.orig/include/asm-i386/a.out.h 2006-01-03 04:21:10.000000000 +0100
+++ vanilla-2.6.16-rc3/include/asm-i386/a.out.h 2006-02-13 09:57:36.000000000 +0100
@@ -19,7 +19,7 @@ struct exec

#ifdef __KERNEL__

-#define STACK_TOP TASK_SIZE
+#define STACK_TOP (TASK_SIZE - 3*PAGE_SIZE)

#endif

Index: vanilla-2.6.16-rc3/include/asm-i386/elf.h
===================================================================
--- vanilla-2.6.16-rc3.orig/include/asm-i386/elf.h 2006-01-03 04:21:10.000000000 +0100
+++ vanilla-2.6.16-rc3/include/asm-i386/elf.h 2006-02-13 09:57:36.000000000 +0100
@@ -129,11 +129,16 @@ extern int dump_task_extended_fpu (struc
#define ELF_CORE_COPY_FPREGS(tsk, elf_fpregs) dump_task_fpu(tsk, elf_fpregs)
#define ELF_CORE_COPY_XFPREGS(tsk, elf_xfpregs) dump_task_extended_fpu(tsk, elf_xfpregs)

-#define VSYSCALL_BASE (__fix_to_virt(FIX_VSYSCALL))
+#define VSYSCALL_BASE (PAGE_OFFSET - 2*PAGE_SIZE)
#define VSYSCALL_EHDR ((const struct elfhdr *) VSYSCALL_BASE)
#define VSYSCALL_ENTRY ((unsigned long) &__kernel_vsyscall)
extern void __kernel_vsyscall;

+#define ARCH_HAS_SETUP_ADDITIONAL_PAGES
+struct linux_binprm;
+extern int arch_setup_additional_pages(struct linux_binprm *bprm,
+ int executable_stack);
+
#define ARCH_DLINFO \
do { \
NEW_AUX_ENT(AT_SYSINFO, VSYSCALL_ENTRY); \
Index: vanilla-2.6.16-rc3/include/asm-i386/fixmap.h
===================================================================
--- vanilla-2.6.16-rc3.orig/include/asm-i386/fixmap.h 2006-01-03 04:21:10.000000000 +0100
+++ vanilla-2.6.16-rc3/include/asm-i386/fixmap.h 2006-02-14 14:40:15.000000000 +0100
@@ -52,7 +52,6 @@
*/
enum fixed_addresses {
FIX_HOLE,
- FIX_VSYSCALL,
#ifdef CONFIG_X86_LOCAL_APIC
FIX_APIC_BASE, /* local (CPU) APIC) -- required for SMP or not */
#endif
@@ -116,14 +115,6 @@ extern void __set_fixmap (enum fixed_add
#define __fix_to_virt(x) (FIXADDR_TOP - ((x) << PAGE_SHIFT))
#define __virt_to_fix(x) ((FIXADDR_TOP - ((x)&PAGE_MASK)) >> PAGE_SHIFT)

-/*
- * This is the range that is readable by user mode, and things
- * acting like user mode such as get_user_pages.
- */
-#define FIXADDR_USER_START (__fix_to_virt(FIX_VSYSCALL))
-#define FIXADDR_USER_END (FIXADDR_USER_START + PAGE_SIZE)
-
-
extern void __this_fixmap_does_not_exist(void);

/*
Index: vanilla-2.6.16-rc3/include/asm-i386/page.h
===================================================================
--- vanilla-2.6.16-rc3.orig/include/asm-i386/page.h 2006-02-13 09:42:02.000000000 +0100
+++ vanilla-2.6.16-rc3/include/asm-i386/page.h 2006-02-14 14:40:15.000000000 +0100
@@ -139,6 +139,8 @@ extern int page_is_ram(unsigned long pag
((current->personality & READ_IMPLIES_EXEC) ? VM_EXEC : 0 ) | \
VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC)

+#define __HAVE_ARCH_GATE_AREA 1
+
#endif /* __KERNEL__ */

#include <asm-generic/page.h>
Index: vanilla-2.6.16-rc3/include/asm-i386/processor.h
===================================================================
--- vanilla-2.6.16-rc3.orig/include/asm-i386/processor.h 2006-02-13 09:42:02.000000000 +0100
+++ vanilla-2.6.16-rc3/include/asm-i386/processor.h 2006-02-14 14:43:25.000000000 +0100
@@ -318,7 +318,7 @@ extern int bootloader_type;
/*
* User space process size: 3GB (default).
*/
-#define TASK_SIZE (PAGE_OFFSET)
+#define TASK_SIZE (PAGE_OFFSET - 3*PAGE_SIZE)

/* This decides where the kernel will search for a free chunk of vm
* space during mmap's.
Index: vanilla-2.6.16-rc3/arch/i386/kernel/setup.c
===================================================================
--- vanilla-2.6.16-rc3.orig/arch/i386/kernel/setup.c 2006-02-13 09:39:33.000000000 +0100
+++ vanilla-2.6.16-rc3/arch/i386/kernel/setup.c 2006-02-13 09:57:36.000000000 +0100
@@ -922,6 +922,12 @@ static void __init parse_cmdline_early (
else if (!memcmp(from, "vmalloc=", 8))
__VMALLOC_RESERVE = memparse(from+8, &from);

+ /*
+ * fixmap=addr
+ */
+ else if (!memcmp(from, "fixmap=", 7))
+ set_fixaddr_top(simple_strtoul(from+7, NULL, 16));
+
next_char:
c = *(from++);
if (!c)
Index: vanilla-2.6.16-rc3/arch/i386/mm/init.c
===================================================================
--- vanilla-2.6.16-rc3.orig/arch/i386/mm/init.c 2006-02-13 09:39:33.000000000 +0100
+++ vanilla-2.6.16-rc3/arch/i386/mm/init.c 2006-02-13 14:33:40.000000000 +0100
@@ -628,6 +628,42 @@ void __init mem_init(void)
(unsigned long) (totalhigh_pages << (PAGE_SHIFT-10))
);

+#if 1 /* double-sanity-check paranoia */
+ printk("virtual kernel memory layout:\n"
+ " fixmap : 0x%08lx - 0x%08lx (%4ld kB)\n"
+ " pkmap : 0x%08lx - 0x%08lx (%4ld kB)\n"
+ " vmalloc : 0x%08lx - 0x%08lx (%4ld MB)\n"
+ " lowmem : 0x%08lx - 0x%08lx (%4ld MB)\n"
+ " .init : 0x%08lx - 0x%08lx (%4ld kB)\n"
+ " .data : 0x%08lx - 0x%08lx (%4ld kB)\n"
+ " .text : 0x%08lx - 0x%08lx (%4ld kB)\n",
+ FIXADDR_START, FIXADDR_TOP,
+ (FIXADDR_TOP - FIXADDR_START) >> 10,
+
+ PKMAP_BASE, PKMAP_BASE+LAST_PKMAP*PAGE_SIZE,
+ (LAST_PKMAP*PAGE_SIZE) >> 10,
+
+ VMALLOC_START, VMALLOC_END,
+ (VMALLOC_END - VMALLOC_START) >> 20,
+
+ (unsigned long)__va(0), (unsigned long)high_memory,
+ ((unsigned long)high_memory - (unsigned long)__va(0)) >> 20,
+
+ (unsigned long)&__init_begin, (unsigned long)&__init_end,
+ ((unsigned long)&__init_end - (unsigned long)&__init_begin) >> 10,
+
+ (unsigned long)&_etext, (unsigned long)&_edata,
+ ((unsigned long)&_edata - (unsigned long)&_etext) >> 10,
+
+ (unsigned long)&_text, (unsigned long)&_etext,
+ ((unsigned long)&_etext - (unsigned long)&_text) >> 10);
+
+ BUG_ON(PKMAP_BASE+LAST_PKMAP*PAGE_SIZE > FIXADDR_START);
+ BUG_ON(VMALLOC_END > PKMAP_BASE);
+ BUG_ON(VMALLOC_START > VMALLOC_END);
+ BUG_ON((unsigned long)high_memory > VMALLOC_START);
+#endif /* double-sanity-check paranoia */
+
#ifdef CONFIG_X86_PAE
if (!cpu_has_pae)
panic("cannot execute a PAE-enabled kernel on a PAE-less CPU!");
Index: vanilla-2.6.16-rc3/arch/i386/mm/pgtable.c
===================================================================
--- vanilla-2.6.16-rc3.orig/arch/i386/mm/pgtable.c 2006-01-03 04:21:10.000000000 +0100
+++ vanilla-2.6.16-rc3/arch/i386/mm/pgtable.c 2006-02-13 09:57:36.000000000 +0100
@@ -13,6 +13,7 @@
#include <linux/slab.h>
#include <linux/pagemap.h>
#include <linux/spinlock.h>
+#include <linux/module.h>

#include <asm/system.h>
#include <asm/pgtable.h>
@@ -138,6 +139,10 @@ void set_pmd_pfn(unsigned long vaddr, un
__flush_tlb_one(vaddr);
}

+static int fixmaps = 0;
+unsigned long __FIXADDR_TOP = 0xfffff000;
+EXPORT_SYMBOL(__FIXADDR_TOP);
+
void __set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t flags)
{
unsigned long address = __fix_to_virt(idx);
@@ -147,6 +152,14 @@ void __set_fixmap (enum fixed_addresses
return;
}
set_pte_pfn(address, phys >> PAGE_SHIFT, flags);
+ fixmaps++;
+}
+
+void set_fixaddr_top(unsigned long top)
+{
+ BUG_ON(fixmaps > 0);
+ printk("%s: addr=0x%lx\n", __FUNCTION__, top);
+ __FIXADDR_TOP = top - PAGE_SIZE;
}

pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
Index: vanilla-2.6.16-rc3/include/asm-i386/fixmap.h
===================================================================
--- vanilla-2.6.16-rc3.orig/include/asm-i386/fixmap.h 2006-02-13 09:57:36.000000000 +0100
+++ vanilla-2.6.16-rc3/include/asm-i386/fixmap.h 2006-02-13 09:57:36.000000000 +0100
@@ -20,7 +20,7 @@
* Leave one empty page between vmalloc'ed areas and
* the start of the fixmap.
*/
-#define __FIXADDR_TOP 0xfffff000
+extern unsigned long __FIXADDR_TOP;

#ifndef __ASSEMBLY__
#include <linux/kernel.h>
@@ -93,6 +93,7 @@ enum fixed_addresses {

extern void __set_fixmap (enum fixed_addresses idx,
unsigned long phys, pgprot_t flags);
+extern void set_fixaddr_top(unsigned long top);

#define set_fixmap(idx, phys) \
__set_fixmap(idx, phys, PAGE_KERNEL)
Index: vanilla-2.6.16-rc3/include/asm-i386/page.h
===================================================================
--- vanilla-2.6.16-rc3.orig/include/asm-i386/page.h 2006-02-13 09:57:36.000000000 +0100
+++ vanilla-2.6.16-rc3/include/asm-i386/page.h 2006-02-13 14:21:36.000000000 +0100
@@ -121,7 +121,7 @@ extern int page_is_ram(unsigned long pag

#define PAGE_OFFSET ((unsigned long)__PAGE_OFFSET)
#define VMALLOC_RESERVE ((unsigned long)__VMALLOC_RESERVE)
-#define MAXMEM (-__PAGE_OFFSET-__VMALLOC_RESERVE)
+#define MAXMEM (__FIXADDR_TOP-__PAGE_OFFSET-__VMALLOC_RESERVE)
#define __pa(x) ((unsigned long)(x)-PAGE_OFFSET)
#define __va(x) ((void *)((unsigned long)(x)+PAGE_OFFSET))
#define pfn_to_kaddr(pfn) __va((pfn) << PAGE_SHIFT)