[PATCH 1/1] checkpoint: add sys_checkpoint() and binfmt_cr.c

From: Serge Hallyn
Date: Wed Jul 16 2008 - 14:38:44 EST


Add a sys_checkpoint() syscall (only for x86_32 right now), implemented by
do_checkpoint(). The intent is to dump process data which isn't
userspace-accessible yet.

Introduce fs/binfmt_cr.c, which executes checkpoint files. At the moment
all it does is execute the original file using its default binary
handler and then reset tsk->mm->arg_start and tsk->did_exec to their
checkpointed values.

Since binfmt_cr only does part of the necessary restart operations, userspace
will need to do the rest.

Cryo, for instance, will have the new process execute the checkpoint file
and will then ptrace it so that the rest of the restore can take place.

Signed-off-by: Serge Hallyn <serge@xxxxxxxxxx>
---
arch/x86/kernel/process_32.c | 15 +++++
arch/x86/kernel/syscall_table_32.S | 1 +
fs/Kconfig.binfmt | 7 +++
fs/Makefile | 3 +-
fs/binfmt_cr.c | 100 ++++++++++++++++++++++++++++++++++++
fs/checkpoint.c | 79 ++++++++++++++++++++++++++++
fs/exec.c | 21 ++++++++
include/asm-x86/unistd_32.h | 1 +
include/linux/binfmts.h | 1 +
include/linux/checkpoint.h | 5 ++
include/linux/sched.h | 2 +
include/linux/syscalls.h | 1 +
kernel/sys_ni.c | 2 +
13 files changed, 237 insertions(+), 1 deletions(-)
create mode 100644 fs/binfmt_cr.c
create mode 100644 fs/checkpoint.c
create mode 100644 include/linux/checkpoint.h

diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index e2db9ac..fd55fec 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -767,3 +767,18 @@ unsigned long arch_randomize_brk(struct mm_struct *mm)
unsigned long range_end = mm->brk + 0x02000000;
return randomize_range(mm->brk, range_end, 0) ? : mm->brk;
}
+
+/*
+ * sys_checkpoint - x86_32 entry point for the checkpoint system call.
+ *
+ * The user-space pointer to the output file name is taken from regs.bx.
+ * The name is fetched with getname(), handed to do_checkpoint() together
+ * with the register frame, and released again with putname().
+ *
+ * Returns the result of do_checkpoint(), or the error encoded in the
+ * pointer returned by getname() if the name could not be fetched.
+ */
+asmlinkage int sys_checkpoint(struct pt_regs regs)
+{
+	char *path;
+	int ret;
+
+	path = getname((char __user *) regs.bx);
+	if (IS_ERR(path))
+		return PTR_ERR(path);
+
+	ret = do_checkpoint(path, &regs);
+	putname(path);
+	return ret;
+}
diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S
index adff556..8195a31 100644
--- a/arch/x86/kernel/syscall_table_32.S
+++ b/arch/x86/kernel/syscall_table_32.S
@@ -326,3 +326,4 @@ ENTRY(sys_call_table)
.long sys_fallocate
.long sys_timerfd_settime /* 325 */
.long sys_timerfd_gettime
+ .long sys_checkpoint
diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt
index 3263084..cb0d19b 100644
--- a/fs/Kconfig.binfmt
+++ b/fs/Kconfig.binfmt
@@ -137,3 +137,10 @@ config BINFMT_MISC
You may say M here for module support and later load the module when
you have use for it; the module is called binfmt_misc. If you
don't know what to answer at this point, say Y.
+
+config BINFMT_CR
+ tristate "Kernel support for executing checkpoint files"
+ default n
+ ---help---
+ Checkpoint files (created using sys_checkpoint) can be executed
+ as though they were binaries using this binary format handler.
diff --git a/fs/Makefile b/fs/Makefile
index 1e7a11b..9230fac 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -11,7 +11,7 @@ obj-y := open.o read_write.o file_table.o super.o \
attr.o bad_inode.o file.o filesystems.o namespace.o aio.o \
seq_file.o xattr.o libfs.o fs-writeback.o \
pnode.o drop_caches.o splice.o sync.o utimes.o \
- stack.o
+ stack.o checkpoint.o

ifeq ($(CONFIG_BLOCK),y)
obj-y += buffer.o bio.o block_dev.o direct-io.o mpage.o ioprio.o
@@ -34,6 +34,7 @@ obj-y += $(nfsd-y) $(nfsd-m)
obj-$(CONFIG_BINFMT_AOUT) += binfmt_aout.o
obj-$(CONFIG_BINFMT_EM86) += binfmt_em86.o
obj-$(CONFIG_BINFMT_MISC) += binfmt_misc.o
+obj-$(CONFIG_BINFMT_CR) += binfmt_cr.o

# binfmt_script is always there
obj-y += binfmt_script.o
diff --git a/fs/binfmt_cr.c b/fs/binfmt_cr.c
new file mode 100644
index 0000000..8a0e173
--- /dev/null
+++ b/fs/binfmt_cr.c
@@ -0,0 +1,100 @@
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/stat.h>
+#include <linux/slab.h>
+#include <linux/binfmts.h>
+#include <linux/init.h>
+#include <linux/file.h>
+#include <linux/err.h>
+#include <linux/fs.h>
+#include <linux/mm_types.h>
+#include <linux/sched.h>
+#include <linux/checkpoint.h>
+
+/*
+ * load_checkpoint - binfmt handler that restarts from a checkpoint file.
+ *
+ * Only the part of the file that is already in bprm->buf (BINPRM_BUF_SIZE
+ * bytes) is examined, so a checkpoint whose header - including the
+ * pathname and its terminating NUL - does not fit in that buffer is
+ * rejected with -EINVAL.  Supporting longer pathnames would need the
+ * first page of the file to be read instead.
+ *
+ * Header layout, as written by dump_checkpoint():
+ *   CKPT_ID '\0' CKPT_VERSION '\0' ' ' arg_start ' ' did_exec ' ' path '\0'
+ *
+ * The original executable named by "path" is opened and run through
+ * search_binary_handler(); on success mm->arg_start and did_exec of the
+ * current task are set to the values stored in the checkpoint.
+ *
+ * Returns -ENOEXEC if the file does not start with CKPT_ID, -EINVAL for a
+ * malformed or truncated header, the error from open_exec() or
+ * prepare_binprm(), or otherwise the result of search_binary_handler().
+ */
+static int load_checkpoint(struct linux_binprm *bprm,struct pt_regs *regs)
+{
+	unsigned long arg_start;
+	unsigned short did_exec;	/* matches the %hu conversions below */
+	char *cp, *end;
+	struct file *file;
+	int retval;
+
+	cp = bprm->buf;
+	end = bprm->buf + BINPRM_BUF_SIZE;
+	if (memcmp(cp, CKPT_ID, strlen(CKPT_ID)))
+		return -ENOEXEC;
+	cp += strlen(CKPT_ID) + 1;
+	printk(KERN_NOTICE "%s: checking version\n", __func__);
+	if (memcmp(cp, CKPT_VERSION, strlen(CKPT_VERSION)))
+		return -EINVAL;
+	cp += strlen(CKPT_VERSION) + 1;
+
+	/*
+	 * Everything after the version is handled as a C string (sscanf,
+	 * strchr, printk "%s", open_exec), so insist that its terminating
+	 * NUL lies inside bprm->buf.  Without this a long or hostile
+	 * header would make us read past the end of the buffer.
+	 */
+	if (!memchr(cp, '\0', end - cp))
+		return -EINVAL;
+
+	if (*cp == ' ') {
+		printk(KERN_NOTICE "Serge: bump by 1\n");
+		cp++;
+	}
+	printk(KERN_NOTICE "%s: reading arg_start\n", __func__);
+	retval = sscanf(cp, "%lu", &arg_start);
+	if (retval != 1)
+		return -EINVAL;
+	printk(KERN_NOTICE "%s: arg_start was %lu\n", __func__, arg_start);
+	printk(KERN_NOTICE "%s: moving cp to did_exec\n", __func__);
+	/* sscanf() succeeded, so *cp is not the NUL and cp + 1 is in range */
+	cp = strchr(cp + 1, ' ');
+	if (!cp)
+		return -EINVAL;
+	cp++;
+	printk(KERN_NOTICE "%s: reading did_exec (cp is %s)\n", __func__, cp);
+	retval = sscanf(cp, "%hu", &did_exec);
+	if (retval != 1)
+		return -EINVAL;
+	printk(KERN_NOTICE "%s: did_exec was %hu\n", __func__, did_exec);
+	printk(KERN_NOTICE "%s: moving cp to fname\n", __func__);
+	cp = strchr(cp + 1, ' ');
+	if (!cp)
+		return -EINVAL;
+	/* Step over the separator; cp now names the original executable */
+	cp++;
+
+	/*
+	 * OK, now restart the process with the original executable's dentry.
+	 */
+	printk(KERN_NOTICE "%s: opening fname: %s\n", __func__, cp);
+	file = open_exec(cp);
+	if (IS_ERR(file))
+		return PTR_ERR(file);
+
+	/*
+	 * Release the checkpoint file before replacing it in bprm, the same
+	 * way binfmt_script releases the script once it switches to the
+	 * interpreter; otherwise its reference and write denial are leaked.
+	 */
+	allow_write_access(bprm->file);
+	fput(bprm->file);
+
+	/* cp points into bprm->buf, which prepare_binprm() refills */
+	printk(KERN_NOTICE "%s: calling prepare_binprm %s\n", __func__, cp);
+	bprm->file = file;
+	retval = prepare_binprm(bprm);
+	if (retval < 0)
+		return retval;
+	retval = search_binary_handler(bprm,regs);
+	if (retval >= 0) {
+		/* execve success */
+		printk(KERN_NOTICE "%s: execve succeeded!\n", __func__);
+		current->mm->arg_start = arg_start;
+		current->did_exec = did_exec;
+	} else
+		printk(KERN_NOTICE "%s: execve failed with %d.\n", __func__, retval);
+	return retval;
+}
+
+/*
+ * Binary format descriptor for checkpoint files.  Only .module and
+ * .load_binary are set here; no other linux_binfmt member is initialised.
+ */
+static struct linux_binfmt cr_format = {
+ .module = THIS_MODULE,
+ .load_binary = load_checkpoint,
+};
+
+/*
+ * Init hook: hand cr_format to register_binfmt() and pass its return
+ * value back to the caller unchanged.
+ */
+static int __init init_cr_binfmt(void)
+{
+	int err = register_binfmt(&cr_format);
+
+	return err;
+}
+
+/* Exit hook: undo init_cr_binfmt() by unregistering cr_format. */
+static void __exit exit_cr_binfmt(void)
+{
+ unregister_binfmt(&cr_format);
+}
+
+/*
+ * init_cr_binfmt() is hooked in through core_initcall and
+ * exit_cr_binfmt() through module_exit.
+ */
+core_initcall(init_cr_binfmt);
+module_exit(exit_cr_binfmt);
+MODULE_LICENSE("GPL");
diff --git a/fs/checkpoint.c b/fs/checkpoint.c
new file mode 100644
index 0000000..784f79a
--- /dev/null
+++ b/fs/checkpoint.c
@@ -0,0 +1,79 @@
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/mm_types.h>
+#include <linux/uaccess.h>
+#include <linux/sched.h>
+#include <linux/checkpoint.h>
+
+/*
+ * checkpoint_write - push nr bytes from addr through file's ->write op.
+ *
+ * The write is issued via file->f_op->write with &file->f_pos as the
+ * position argument.  Returns 1 if exactly nr bytes were reported
+ * written and 0 otherwise; short writes and errors are not told apart.
+ */
+int checkpoint_write(struct file *file, const void *addr, int nr)
+{
+	ssize_t written;
+
+	written = file->f_op->write(file, addr, nr, &file->f_pos);
+	return written == nr;
+}
+
+#ifdef CONFIG_PROC_FS
+/*
+ * get_exe_name - name of the current task's executable (CONFIG_PROC_FS).
+ *
+ * Builds the name from the dentry of current->mm->exe_file into buf
+ * (buflen bytes) using dentry_path().
+ *
+ * Returns whatever dentry_path() returns, or ERR_PTR(-ENOENT) when the
+ * mm has no exe_file recorded.  Callers are expected to test the result
+ * with IS_ERR().
+ */
+char *get_exe_name(char *buf, int buflen)
+{
+	struct file *f = current->mm->exe_file;
+
+	/* exe_file may be unset; fail cleanly instead of dereferencing NULL */
+	if (!f)
+		return ERR_PTR(-ENOENT);
+	return dentry_path(f->f_dentry, buf, buflen);
+}
+#else
+/*
+ * get_exe_name - fallback used when CONFIG_PROC_FS is not set.
+ *
+ * Copies the current task's comm into buf with get_task_comm() and
+ * returns its result.  If buf (buflen bytes) is smaller than
+ * current->comm, ERR_PTR(-ENAMETOOLONG) is returned instead, so callers
+ * can test the result with IS_ERR().
+ */
+char *get_exe_name(char *buf, int buflen)
+{
+	/* compare as signed so that a negative buflen is rejected too */
+	if (buflen < (int)sizeof(current->comm))
+		return ERR_PTR(-ENAMETOOLONG);
+	return get_task_comm(buf, current);
+}
+#endif
+
+/*
+ * dump_checkpoint - write a checkpoint of the current task to file.
+ *
+ * Format of a checkpoint file.
+ * Version 2008-07-14:
+ *   CKPT_ID ("LX_CKPT"), including its terminating NUL
+ *   CKPT_VERSION ("2008-07-14"), including its terminating NUL
+ *   ' ' mm->arg_start (lu) ' '
+ *   current->did_exec (hu) ' '
+ *   filename, including its terminating NUL
+ *
+ * The data is written from kernel buffers, so the address limit is
+ * switched to KERNEL_DS around the writes and restored before returning.
+ * regs is currently unused.
+ *
+ * Returns 0 on success, -ENOMEM if the name buffer cannot be allocated,
+ * the error from get_exe_name(), or -EINVAL if a write is short or a
+ * formatted field does not fit in MMARGSTR bytes.
+ */
+int dump_checkpoint(struct file *file, struct pt_regs * regs)
+{
+	char buf[MMARGSTR];
+	char *exename, *sret;
+	int len;
+	mm_segment_t fs;
+	int retval;
+
+	exename = kmalloc(4096, GFP_KERNEL);
+	/* kmalloc() signals failure with NULL, not with an ERR_PTR value */
+	if (!exename)
+		return -ENOMEM;
+
+	fs = get_fs();
+	set_fs(KERNEL_DS);
+
+	retval = -EINVAL;
+	printk(KERN_NOTICE "%s: writing a dump file\n", __func__);
+	if (!checkpoint_write(file, CKPT_ID, sizeof(CKPT_ID)))
+		goto out_setfs;
+	printk(KERN_NOTICE "%s: wrote ckpt id\n", __func__);
+	if (!checkpoint_write(file, CKPT_VERSION, sizeof(CKPT_VERSION)))
+		goto out_setfs;
+
+	/*
+	 * snprintf() returns the length the text would have had, so a
+	 * value >= MMARGSTR means buf was truncated; writing len bytes
+	 * would then read past the end of buf.
+	 */
+	len = snprintf(buf, MMARGSTR, " %lu ", current->mm->arg_start);
+	if (len >= MMARGSTR || !checkpoint_write(file, buf, len))
+		goto out_setfs;
+	len = snprintf(buf, MMARGSTR, "%hu ", current->did_exec);
+	if (len >= MMARGSTR || !checkpoint_write(file, buf, len))
+		goto out_setfs;
+
+	sret = get_exe_name(exename, 4096);
+	if (IS_ERR(sret)) {
+		retval = PTR_ERR(sret);
+		goto out_setfs;
+	}
+	retval = 0;
+	/* the terminating NUL is part of the on-disk format */
+	if (!checkpoint_write(file, sret, strlen(sret)+1))
+		retval = -EINVAL;
+	printk(KERN_NOTICE "%s: returning %d\n", __func__, retval);
+out_setfs:
+	set_fs(fs);
+	kfree(exename);
+	return retval;
+}
diff --git a/fs/exec.c b/fs/exec.c
index fd92343..68ad85c 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -51,6 +51,7 @@
#include <linux/tsacct_kern.h>
#include <linux/cn_proc.h>
#include <linux/audit.h>
+#include <linux/checkpoint.h>

#include <asm/uaccess.h>
#include <asm/mmu_context.h>
@@ -1790,3 +1791,23 @@ fail_unlock:
fail:
return retval;
}
+
+/*
+ * do_checkpoint - create the checkpoint file and dump the current task.
+ *
+ * filename is opened with O_CREAT|O_NOFOLLOW|O_WRONLY and mode 0600.  If
+ * the resulting file has a ->write operation, dump_checkpoint() is called
+ * on it with regs.  The file is closed again in every case.
+ *
+ * Returns the filp_open() error if the file cannot be opened, -EINVAL if
+ * the file has no write operation, otherwise dump_checkpoint()'s result.
+ */
+int do_checkpoint(char *filename, struct pt_regs * regs)
+{
+	struct file *ckpt;
+	int err;
+
+	printk(KERN_NOTICE "%s: called (filename %s)\n", __func__, filename);
+	ckpt = filp_open(filename, O_CREAT|O_NOFOLLOW|O_WRONLY, 0600);
+	if (IS_ERR(ckpt))
+		return PTR_ERR(ckpt);
+	printk(KERN_NOTICE "%s: create went ok\n", __func__);
+
+	if (ckpt->f_op && ckpt->f_op->write)
+		err = dump_checkpoint(ckpt, regs);
+	else
+		err = -EINVAL;
+
+	filp_close(ckpt, NULL);
+	return err;
+}
diff --git a/include/asm-x86/unistd_32.h b/include/asm-x86/unistd_32.h
index 8317d94..b367465 100644
--- a/include/asm-x86/unistd_32.h
+++ b/include/asm-x86/unistd_32.h
@@ -332,6 +332,7 @@
#define __NR_fallocate 324
#define __NR_timerfd_settime 325
#define __NR_timerfd_gettime 326
+#define __NR_checkpoint 327

#ifdef __KERNEL__

diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h
index ee0ed48..3024e44 100644
--- a/include/linux/binfmts.h
+++ b/include/linux/binfmts.h
@@ -70,6 +70,7 @@ struct linux_binfmt {
int (*load_shlib)(struct file *);
int (*core_dump)(long signr, struct pt_regs *regs, struct file *file, unsigned long limit);
unsigned long min_coredump; /* minimal dump size */
+ int (*checkpoint)(struct pt_regs *regs, struct file *file);
int hasvdso;
};

diff --git a/include/linux/checkpoint.h b/include/linux/checkpoint.h
new file mode 100644
index 0000000..5628f0e
--- /dev/null
+++ b/include/linux/checkpoint.h
@@ -0,0 +1,5 @@
+#ifndef _LINUX_CHECKPOINT_H
+#define _LINUX_CHECKPOINT_H
+
+/* Forward declarations so the prototype below is self-contained. */
+struct file;
+struct pt_regs;
+
+/* Magic string written at the very start of a checkpoint file. */
+#define CKPT_ID "LX_CKPT"
+/* Format version string that follows CKPT_ID in the file. */
+#define CKPT_VERSION "2008-07-14"
+/* Size of the buffer dump_checkpoint() formats one numeric field into. */
+#define MMARGSTR 20
+
+int dump_checkpoint(struct file *file, struct pt_regs * regs);
+
+#endif /* _LINUX_CHECKPOINT_H */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index c5d3f84..7098822 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1861,6 +1861,8 @@ extern int do_execve(char *, char __user * __user *, char __user * __user *, str
extern long do_fork(unsigned long, unsigned long, struct pt_regs *, unsigned long, int __user *, int __user *);
struct task_struct *fork_idle(int);

+extern int do_checkpoint(char *, struct pt_regs *);
+
extern void set_task_comm(struct task_struct *tsk, char *from);
extern char *get_task_comm(char *to, struct task_struct *tsk);

diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 0522f36..f08877d 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -617,5 +617,6 @@ asmlinkage long sys_eventfd(unsigned int count);
asmlinkage long sys_fallocate(int fd, int mode, loff_t offset, loff_t len);

int kernel_execve(const char *filename, char *const argv[], char *const envp[]);
+asmlinkage long sys_checkpoint(const char *filename);

#endif
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 5b9b467..62dcdaa 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -161,3 +161,5 @@ cond_syscall(sys_timerfd_gettime);
cond_syscall(compat_sys_timerfd_settime);
cond_syscall(compat_sys_timerfd_gettime);
cond_syscall(sys_eventfd);
+
+cond_syscall(sys_checkpoint);
--
1.5.4.3


--cNdxnHkX5QqsyA0e
Content-Type: text/x-diff; charset=us-ascii
Content-Disposition: attachment; filename="0001-cryo--sys_checkpoint-first-attempt-at-exploiting.patch"