[PATCH 09/10] Introduce functions to restore mm

From: Andrey Mirkin
Date: Fri Oct 17 2008 - 19:14:45 EST


Functions to restore mm, VMAs and mm context are added.

Signed-off-by: Andrey Mirkin <major@xxxxxxxxxx>
---
checkpoint/Makefile | 2 +-
checkpoint/checkpoint.h | 1 +
checkpoint/cpt_image.h | 5 +
checkpoint/rst_mm.c | 320 ++++++++++++++++++++++++++++++++++++++++++++++
checkpoint/rst_process.c | 3 +-
mm/mmap.c | 1 +
mm/mprotect.c | 2 +
7 files changed, 332 insertions(+), 2 deletions(-)
create mode 100644 checkpoint/rst_mm.c

diff --git a/checkpoint/Makefile b/checkpoint/Makefile
index 689a0eb..19ca732 100644
--- a/checkpoint/Makefile
+++ b/checkpoint/Makefile
@@ -3,4 +3,4 @@ obj-y += sys_core.o
obj-$(CONFIG_CHECKPOINT) += cptrst.o

cptrst-objs := sys.o checkpoint.o cpt_process.o cpt_mm.o restart.o \
- rst_process.o
+ rst_process.o rst_mm.o
diff --git a/checkpoint/checkpoint.h b/checkpoint/checkpoint.h
index 1d0ca49..195fdc6 100644
--- a/checkpoint/checkpoint.h
+++ b/checkpoint/checkpoint.h
@@ -65,3 +65,4 @@ int cpt_dump_mm(struct task_struct *tsk, struct cpt_context *ctx);
int restart_container(struct cpt_context *ctx);
int rst_get_object(int type, void *tmp, int size, struct cpt_context *ctx);
int rst_restart_process(struct cpt_context *ctx);
+int rst_restore_mm(struct cpt_context *ctx);
diff --git a/checkpoint/cpt_image.h b/checkpoint/cpt_image.h
index 160cf85..e1fb483 100644
--- a/checkpoint/cpt_image.h
+++ b/checkpoint/cpt_image.h
@@ -233,6 +233,11 @@ struct cpt_x86_regs
__u32 cpt_ss;
} __attribute__ ((aligned (8)));

+/*
+ * Turn a pointer value stored in a checkpoint image as a 64-bit integer
+ * into a userspace pointer.  The value is narrowed to unsigned long
+ * before the pointer conversion, so any bits above the native pointer
+ * width are discarded.
+ */
+static inline void __user *cpt_ptr_import(__u64 ptr)
+{
+	/* Cast straight to a __user pointer so the address-space annotation
+	 * matches the declared return type. */
+	return (void __user *)(unsigned long)ptr;
+}
+
static inline __u64 cpt_timespec_export(struct timespec *tv)
{
return (((u64)tv->tv_sec) << 32) + tv->tv_nsec;
diff --git a/checkpoint/rst_mm.c b/checkpoint/rst_mm.c
new file mode 100644
index 0000000..fe53c45
--- /dev/null
+++ b/checkpoint/rst_mm.c
@@ -0,0 +1,320 @@
+/*
+ * Copyright (C) 2008 Parallels, Inc.
+ *
+ * Author: Andrey Mirkin <major@xxxxxxxxxx>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation, version 2 of the
+ * License.
+ *
+ */
+
+#include <linux/sched.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/version.h>
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/mman.h>
+#include <linux/highmem.h>
+#include <linux/pagemap.h>
+#include <linux/vmalloc.h>
+#include <linux/syscalls.h>
+
+#include "checkpoint.h"
+#include "cpt_image.h"
+
+/*
+ * Build the PROT_* mask for a VMA being restored from the VM_* bits saved
+ * in its image: read/write/exec plus the grows-down/grows-up markers.
+ */
+static unsigned long make_prot(struct cpt_vma_image *vmai)
+{
+	unsigned long mask;
+
+	mask  = (vmai->cpt_flags & VM_READ) ? PROT_READ : 0;
+	mask |= (vmai->cpt_flags & VM_WRITE) ? PROT_WRITE : 0;
+	mask |= (vmai->cpt_flags & VM_EXEC) ? PROT_EXEC : 0;
+	mask |= (vmai->cpt_flags & VM_GROWSDOWN) ? PROT_GROWSDOWN : 0;
+	mask |= (vmai->cpt_flags & VM_GROWSUP) ? PROT_GROWSUP : 0;
+
+	return mask;
+}
+
+/*
+ * Build the MAP_* flags used to re-create a VMA from its image.
+ *
+ * The mapping is always MAP_FIXED.  It is shared when the image has
+ * VM_SHARED or VM_MAYSHARE set and private otherwise, and anonymous when
+ * the image references no file (cpt_file == CPT_NULL).  The remaining
+ * VM_* bits are translated to their MAP_* counterparts; a VMA saved
+ * without VM_ACCOUNT is mapped with MAP_NORESERVE.
+ */
+static unsigned long make_flags(struct cpt_vma_image *vmai)
+{
+	const int shared = (vmai->cpt_flags & (VM_SHARED|VM_MAYSHARE)) != 0;
+	unsigned long mflags = MAP_FIXED;
+
+	mflags |= shared ? MAP_SHARED : MAP_PRIVATE;
+	mflags |= (vmai->cpt_file == CPT_NULL) ? MAP_ANONYMOUS : 0;
+	mflags |= (vmai->cpt_flags & VM_GROWSDOWN) ? MAP_GROWSDOWN : 0;
+#ifdef MAP_GROWSUP
+	mflags |= (vmai->cpt_flags & VM_GROWSUP) ? MAP_GROWSUP : 0;
+#endif
+	mflags |= (vmai->cpt_flags & VM_DENYWRITE) ? MAP_DENYWRITE : 0;
+	mflags |= (vmai->cpt_flags & VM_EXECUTABLE) ? MAP_EXECUTABLE : 0;
+	mflags |= (vmai->cpt_flags & VM_ACCOUNT) ? 0 : MAP_NORESERVE;
+
+	return mflags;
+}
+
+/*
+ * Read the CPT_OBJ_NAME object that follows a file-backed VMA image and
+ * open the file it names.
+ *
+ * On success returns 0 and stores the opened file in *filp.  On failure
+ * returns the error and leaves *filp untouched, so the caller never sees
+ * an ERR_PTR() value in its file pointer.
+ */
+static int rst_open_vma_file(struct cpt_context *ctx, struct file **filp)
+{
+	struct cpt_object_hdr h;
+	struct file *file;
+	char *path;
+	int len;
+	int err;
+
+	err = rst_get_object(CPT_OBJ_NAME, &h, sizeof(h), ctx);
+	if (err)
+		return err;
+
+	len = h.cpt_len - sizeof(h);
+	if (len < 0)
+		return -EINVAL;
+
+	/* One spare byte: the image is not trusted to NUL-terminate the name */
+	path = kmalloc(len + 1, GFP_KERNEL);
+	if (!path)
+		return -ENOMEM;
+
+	err = ctx->read(path, len, ctx);
+	if (err) {
+		kfree(path);
+		return err;
+	}
+	path[len] = '\0';
+
+	/* Just open file
+	   TODO: open with correct flags */
+	file = filp_open(path, O_RDONLY, 0);
+	kfree(path);
+	if (IS_ERR(file))
+		return PTR_ERR(file);
+
+	*filp = file;
+	return 0;
+}
+
+/*
+ * Map the area described by @vmai at its saved address and make sure a
+ * VMA starts exactly at vmai->cpt_start.  Takes and releases mmap_sem.
+ *
+ * Returns 0 on success or a negative error.
+ */
+static int rst_map_vma(struct mm_struct *mm, struct cpt_vma_image *vmai,
+		       struct file *file, unsigned long prot)
+{
+	struct vm_area_struct *vma;
+	unsigned long addr;
+	int err = 0;
+
+	down_write(&mm->mmap_sem);
+	addr = do_mmap_pgoff(file, vmai->cpt_start,
+			     vmai->cpt_end - vmai->cpt_start,
+			     prot, make_flags(vmai),
+			     vmai->cpt_pgoff);
+	if (addr != vmai->cpt_start) {
+		err = -EINVAL;
+		if (IS_ERR((void *)addr))
+			err = addr;
+		goto unlock;
+	}
+
+	vma = find_vma(mm, vmai->cpt_start);
+	if (vma == NULL) {
+		eprintk("cannot find mmapped vma\n");
+		err = -ESRCH;
+		goto unlock;
+	}
+
+	/* do_mmap_pgoff() can merge new area to previous one (not to the next,
+	 * we mmap in order, the rest of mm is still unmapped). This can happen
+	 * f.e. if flags are to be adjusted later, or if we had different
+	 * anon_vma on two adjacent regions. Split it by brute force. */
+	if (vma->vm_start != vmai->cpt_start) {
+		err = split_vma(mm, vma, (unsigned long)vmai->cpt_start, 0);
+		if (err)
+			eprintk("cannot split vma\n");
+	}
+
+unlock:
+	up_write(&mm->mmap_sem);
+	return err;
+}
+
+/*
+ * Fill one page block page by page through a kernel mapping, pinning each
+ * page with a forced write fault.  Used for areas that are neither
+ * accounted nor writable.
+ *
+ * The page reference taken by get_user_pages() is dropped on every path,
+ * including read failures and unknown content types.
+ */
+static int rst_fill_pages_forced(struct cpt_page_block *pb,
+				 struct cpt_context *ctx)
+{
+	int j;
+
+	for (j = 0; j < (pb->cpt_end - pb->cpt_start) / PAGE_SIZE; j++) {
+		struct page *page;
+		void *maddr;
+		int err;
+
+		err = get_user_pages(current, current->mm,
+				     (unsigned long)pb->cpt_start +
+				     j * PAGE_SIZE,
+				     1, 1, 1, &page, NULL);
+		if (err == 0)
+			err = -EFAULT;
+		if (err < 0) {
+			eprintk("get_user_pages: %d\n", err);
+			return err;
+		}
+
+		err = 0;
+		maddr = kmap(page);
+		if (pb->cpt_content == CPT_CONTENT_VOID)
+			memset(maddr, 0, PAGE_SIZE);
+		else if (pb->cpt_content == CPT_CONTENT_DATA)
+			err = ctx->read(maddr, PAGE_SIZE, ctx);
+		else
+			err = -EINVAL;
+
+		if (!err)
+			set_page_dirty_lock(page);
+		kunmap(page);
+		page_cache_release(page);
+		if (err)
+			return err;
+	}
+
+	return 0;
+}
+
+/*
+ * Fill one page block through its user addresses.  If the area is not
+ * writable it is made writable for the duration of the copy, and its
+ * saved protection is put back afterwards even when the copy fails.
+ */
+static int rst_fill_user_range(struct cpt_vma_image *vmai,
+			       struct cpt_page_block *pb,
+			       unsigned long prot,
+			       struct cpt_context *ctx)
+{
+	const int readonly = !(prot & PROT_WRITE);
+	int err = 0;
+
+	if (readonly) {
+		err = sys_mprotect(vmai->cpt_start,
+				   vmai->cpt_end - vmai->cpt_start,
+				   prot | PROT_WRITE);
+		if (err)
+			return err;
+	}
+
+	if (pb->cpt_content == CPT_CONTENT_VOID) {
+		int j;
+
+		for (j = 0; j < (pb->cpt_end - pb->cpt_start) / sizeof(unsigned long); j++) {
+			err = __put_user(0UL, ((unsigned long __user *)(unsigned long)pb->cpt_start) + j);
+			if (err) {
+				eprintk("__put_user 2 %d\n", err);
+				break;
+			}
+		}
+	} else if (pb->cpt_content == CPT_CONTENT_DATA) {
+		err = ctx->read(cpt_ptr_import(pb->cpt_start),
+				pb->cpt_end - pb->cpt_start,
+				ctx);
+	} else {
+		err = -EINVAL;
+	}
+
+	if (readonly) {
+		int perr = sys_mprotect(vmai->cpt_start,
+					vmai->cpt_end - vmai->cpt_start,
+					prot);
+
+		/* Report the first failure; a copy error takes precedence */
+		if (!err)
+			err = perr;
+	}
+
+	return err;
+}
+
+/*
+ * Restore one VMA of the current task from the checkpoint image.
+ *
+ * Reads a CPT_OBJ_VMA object, opens the backing file when the image marks
+ * the area as CPT_VMA_FILE, maps the area at its saved address and then
+ * replays the cpt_page_num CPT_OBJ_PAGES blocks that follow it.
+ *
+ * Returns 0 on success or an error from the image reader, the file open,
+ * the mapping, or the page copy.
+ */
+static int rst_restore_one_vma(struct cpt_context *ctx)
+{
+	int err;
+	int i;
+	struct mm_struct *mm = current->mm;
+	struct cpt_vma_image vmai;
+	struct file *file = NULL;
+	unsigned long prot;
+
+	err = rst_get_object(CPT_OBJ_VMA, &vmai, sizeof(vmai), ctx);
+	if (err)
+		return err;
+
+	prot = make_prot(&vmai);
+
+	if (vmai.cpt_vma_type == CPT_VMA_FILE) {
+		err = rst_open_vma_file(ctx, &file);
+		if (err)
+			goto out;
+	}
+
+	err = rst_map_vma(mm, &vmai, file, prot);
+	if (err)
+		goto out;
+
+	for (i = 0; i < vmai.cpt_page_num; i++) {
+		struct cpt_page_block pb;
+
+		err = rst_get_object(CPT_OBJ_PAGES, &pb, sizeof(pb), ctx);
+		if (err)
+			goto out;
+
+		/* The block addresses come from the image and are written to
+		 * without access_ok(); refuse anything outside the area just
+		 * mapped.  NOTE(review): assumes the dump side never emits a
+		 * block that crosses its VMA - confirm against cpt_mm.c. */
+		if (pb.cpt_start > pb.cpt_end ||
+		    pb.cpt_start < vmai.cpt_start ||
+		    pb.cpt_end > vmai.cpt_end) {
+			err = -EINVAL;
+			goto out;
+		}
+
+		if (!(vmai.cpt_flags & VM_ACCOUNT) && !(prot & PROT_WRITE)) {
+			/* I guess this is get_user_pages() messed things,
+			 * this happens f.e. when gdb inserts breakpoints.
+			 */
+			err = rst_fill_pages_forced(&pb, ctx);
+		} else {
+			err = rst_fill_user_range(&vmai, &pb, prot, ctx);
+		}
+		if (err)
+			goto out;
+	}
+
+out:
+	/* file is either NULL or a valid reference, never an ERR_PTR() */
+	if (file)
+		fput(file);
+	return err;
+}
+
+/*
+ * Restore the LDT of the current task from a CPT_OBJ_BITS object.
+ *
+ * Reads b.cpt_size bytes of LDT contents from the image into a new
+ * buffer, installs it in mm->context, reloads the LDT and frees the
+ * previous table.
+ *
+ * Returns 0 on success, -EINVAL for a size that is not a whole number of
+ * LDT entries or exceeds LDT_ENTRIES entries, -ENOMEM if the buffer cannot
+ * be allocated, or the error from the image reader.
+ *
+ * NOTE(review): mm->context is updated without any lock taken here -
+ * confirm no other user of this mm can run during restore.
+ */
+static int rst_restore_mm_context(struct cpt_context *ctx)
+{
+	struct cpt_obj_bits b;
+	struct mm_struct *mm = current->mm;
+	int oldsize = mm->context.size;
+	int err;
+	void *oldldt;
+	void *newldt;
+
+	err = rst_get_object(CPT_OBJ_BITS, &b, sizeof(b), ctx);
+	if (err)
+		return err;
+
+	/* The table is later freed according to context.size * LDT_ENTRY_SIZE,
+	 * so a size with a partial entry could be vmalloc()ed here and
+	 * kfree()d there.  Accept whole entries only, within the LDT limit. */
+	if ((b.cpt_size & (LDT_ENTRY_SIZE - 1)) ||
+	    (__u64)b.cpt_size > LDT_ENTRIES * LDT_ENTRY_SIZE)
+		return -EINVAL;
+
+	if (b.cpt_size > PAGE_SIZE)
+		newldt = vmalloc(b.cpt_size);
+	else
+		newldt = kmalloc(b.cpt_size, GFP_KERNEL);
+
+	if (!newldt)
+		return -ENOMEM;
+
+	err = ctx->read(newldt, b.cpt_size, ctx);
+	if (err) {
+		/* Not installed yet: release with the matching allocator */
+		if (b.cpt_size > PAGE_SIZE)
+			vfree(newldt);
+		else
+			kfree(newldt);
+		return err;
+	}
+
+	oldldt = mm->context.ldt;
+	mm->context.ldt = newldt;
+	mm->context.size = b.cpt_size / LDT_ENTRY_SIZE;
+
+	load_LDT(&mm->context);
+
+	if (oldsize) {
+		if (oldsize * LDT_ENTRY_SIZE > PAGE_SIZE)
+			vfree(oldldt);
+		else
+			kfree(oldldt);
+	}
+
+	return 0;
+}
+
+/*
+ * Restore the address space of the current task from the checkpoint
+ * image.
+ *
+ * Reads a CPT_OBJ_MM object, unmaps everything below TASK_SIZE, copies the
+ * saved layout fields into current->mm, restores m.cpt_map_count VMAs and
+ * finally the mm context.
+ *
+ * Returns 0 on success or the first error met.  On error the address
+ * space is left partially restored.
+ */
+int rst_restore_mm(struct cpt_context *ctx)
+{
+	int err;
+	int i;
+	struct mm_struct *mm = current->mm;
+	struct cpt_mm_image m;
+
+	err = rst_get_object(CPT_OBJ_MM, &m, sizeof(m), ctx);
+	if (err)
+		return err;
+
+	down_write(&mm->mmap_sem);
+
+	/* Do not lay the saved layout over mappings that failed to go away */
+	err = do_munmap(mm, 0, TASK_SIZE);
+	if (err) {
+		up_write(&mm->mmap_sem);
+		return err;
+	}
+
+	mm->start_code = m.cpt_start_code;
+	mm->end_code = m.cpt_end_code;
+	mm->start_data = m.cpt_start_data;
+	mm->end_data = m.cpt_end_data;
+	mm->start_brk = m.cpt_start_brk;
+	mm->brk = m.cpt_brk;
+	mm->start_stack = m.cpt_start_stack;
+	mm->arg_start = m.cpt_start_arg;
+	mm->arg_end = m.cpt_end_arg;
+	mm->env_start = m.cpt_start_env;
+	mm->env_end = m.cpt_end_env;
+	mm->def_flags = m.cpt_def_flags;
+	mm->flags = m.cpt_flags;
+
+	up_write(&mm->mmap_sem);
+
+	/* VMA images follow the mm image in the stream, one per saved area */
+	for (i = 0; i < m.cpt_map_count; i++) {
+		err = rst_restore_one_vma(ctx);
+		if (err < 0)
+			goto out;
+	}
+
+	err = rst_restore_mm_context(ctx);
+out:
+	return err;
+}
+
diff --git a/checkpoint/rst_process.c b/checkpoint/rst_process.c
index b9f745e..9e448b2 100644
--- a/checkpoint/rst_process.c
+++ b/checkpoint/rst_process.c
@@ -210,7 +210,8 @@ static int restart_thread(void *arg)
err = rst_get_object(CPT_OBJ_TASK, ti, sizeof(*ti), ctx);
if (!err)
err = rst_restore_task_struct(current, ti, ctx);
- /* Restore mm here */
+ if (!err)
+ err = rst_restore_mm(ctx);
if (!err)
err = rst_restore_fpustate(current, ti, ctx);
if (!err)
diff --git a/mm/mmap.c b/mm/mmap.c
index 971d0ed..98d1ba9 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1858,6 +1858,7 @@ int split_vma(struct mm_struct * mm, struct vm_area_struct * vma,

return 0;
}
+EXPORT_SYMBOL(split_vma);

/* Munmap is split into 2 main parts -- this part which finds
* what needs doing, and the areas themselves, which do the
diff --git a/mm/mprotect.c b/mm/mprotect.c
index fded06f..47c7d75 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -22,6 +22,7 @@
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/mmu_notifier.h>
+#include <linux/module.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include <asm/cacheflush.h>
@@ -317,3 +318,4 @@ out:
up_write(&current->mm->mmap_sem);
return error;
}
+EXPORT_SYMBOL(sys_mprotect);
--
1.5.6

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/