memory.c rewrite

Patrick Schaaf (bof@briggs.math.uni-sb.de)
Mon, 7 Aug 1995 15:16:57 +0000


Hi!

This is a rewrite of the page table manipulation code in memory.c.

The basic idea of my mangling of the code is to drive it all from
the vm_areas. The old code walks the whole page table range
on fork, exit and exec. The new code walks the vm_areas, and operates
only on their memory range. I get a speedup of 3x to 4x for
small test programs (that do nothing but fork/exit/exec :) - I hope
this will work for normal workloads, too. I'd be especially interested
in the impact of this change on the BYTE benchmarks and similar stuff.

The new code assumes that the vm_areas always cover all page table
areas that need treatment. If this is not correct (and there's a reason
for it) I'm lost.

The code needs testing:
- on other architectures.
- for real workloads and programs with lots of vm_areas; for really
high numbers of vm_areas the changes might even make it slower,
because I have to reach through the page table tree for each area.

There's an ugly hack involved: On the first exec() from within the kernel
resident init I cannot use zap_page_range - the page directory is in use.
I think I understand what's going on, and know how to fix it more nicely
than in the code below - by making the first fork() a clone().

Yeah, that's the second thing I think this code provides - mm cloning
should work correctly, except perhaps for signalling on page faults.
The mm_struct is shared between all clones in a group that use the
same page directory. When a clone execs or forks, it leaves the group.
The vm_task of the group's areas initially points to the task that did
the mmap; when that task forks or execs vm_task is redirected to
another task in the group.

enjoy
Patrick

(Changing the #undef FOLDED_THREE_LEVELS at the top of memory.c to a
#define gives me a small speedup on my system, but this might be due to
bad optimization on the part of gcc 2.5.8. Sorry, no time to check this.
Would it be worth the hassle to make this configurable?)

P.S.: I will be on holiday for two weeks starting on 12 August, so don't
expect me to answer mail during that time.

Here's the diff:

--- linux-1.3.12/include/linux/mm.h Mon Jul 17 09:43:26 1995
+++ linux/include/linux/mm.h Fri Aug 4 22:15:53 1995
@@ -163,31 +163,35 @@
return page;
}

-/* memory.c & swap.c*/
+/* memory.c */

-#define free_page(addr) free_pages((addr),0)
-extern void free_pages(unsigned long addr, unsigned long order);
+extern void oom(struct task_struct * tsk);

-extern void show_free_areas(void);
-extern unsigned long put_dirty_page(struct task_struct * tsk,unsigned long page,
- unsigned long address);
+extern int exec_mm(void);
+extern void exit_mm(void);
+extern int fork_mm(unsigned long clone_flags, struct task_struct *new_task);

-extern void free_page_tables(struct task_struct * tsk);
-extern void clear_page_tables(struct task_struct * tsk);
-extern int copy_page_tables(struct task_struct * to);
-extern int clone_page_tables(struct task_struct * to);
extern int unmap_page_range(unsigned long from, unsigned long size);
extern int remap_page_range(unsigned long from, unsigned long to, unsigned long size, pgprot_t prot);
extern int zeromap_page_range(unsigned long from, unsigned long size, pgprot_t prot);

-extern void handle_mm_fault(struct vm_area_struct *vma, unsigned long address, int write_access);
extern void do_wp_page(struct vm_area_struct * vma, unsigned long address, int write_access);
extern void do_no_page(struct vm_area_struct * vma, unsigned long address, int write_access);

+/* swap.c */
+
+#define free_page(addr) free_pages((addr),0)
+extern void free_pages(unsigned long addr, unsigned long order);
+
+extern void show_free_areas(void);
+extern unsigned long put_dirty_page(struct task_struct * tsk,unsigned long page,
+ unsigned long address);
+
+extern void handle_mm_fault(struct vm_area_struct *vma, unsigned long address, int write_access);
+
extern unsigned long paging_init(unsigned long start_mem, unsigned long end_mem);
extern void mem_init(unsigned long start_mem, unsigned long end_mem);
extern void show_mem(void);
-extern void oom(struct task_struct * tsk);
extern void si_meminfo(struct sysinfo * val);

/* vmalloc.c */
@@ -214,7 +218,6 @@
extern void insert_vm_struct(struct task_struct *, struct vm_area_struct *);
extern void remove_shared_vm_struct(struct vm_area_struct *);
extern void build_mmap_avl(struct task_struct *);
-extern void exit_mmap(struct task_struct *);
extern int do_munmap(unsigned long, size_t);
extern unsigned long get_unmapped_area(unsigned long, unsigned long);

--- linux-1.3.12/include/linux/sched.h Mon Jul 17 09:05:15 1995
+++ linux/include/linux/sched.h Sat Aug 5 02:59:29 1995
@@ -113,7 +113,6 @@
}

struct mm_struct {
- int count;
unsigned long start_code, end_code, start_data, end_data;
unsigned long start_brk, brk, start_stack, start_mmap;
unsigned long arg_start, arg_end, env_start, env_end;
@@ -126,10 +125,12 @@
unsigned long swap_cnt; /* number of pages to swap on next pass */
struct vm_area_struct * mmap;
struct vm_area_struct * mmap_avl;
+ struct task_struct *clones;
};

+#define INIT_MM_PTR &init_mm_struct
+
#define INIT_MM { \
- 0, \
0, 0, 0, 0, \
0, 0, 0, 0, \
0, 0, 0, 0, \
@@ -137,7 +138,8 @@
/* ?_flt */ 0, 0, 0, 0, \
0, \
/* swap */ 0, 0, 0, 0, \
- &init_mmap, &init_mmap }
+ &init_mmap, &init_mmap, \
+ &init_task }

struct task_struct {
/* these are hardcoded - don't touch */
@@ -195,7 +197,9 @@
/* open file information */
struct files_struct files[1];
/* memory management info */
- struct mm_struct mm[1];
+ struct mm_struct *mm;
+ struct task_struct *mm_clone_next;
+ struct task_struct *mm_clone_prev;
};

/*
@@ -254,7 +258,7 @@
/* tss */ INIT_TSS, \
/* fs */ { INIT_FS }, \
/* files */ { INIT_FILES }, \
-/* mm */ { INIT_MM } \
+/* mm */ INIT_MM_PTR, &init_task, &init_task \
}

#ifdef __KERNEL__
--- linux-1.3.12/fs/exec.c Wed Jul 12 03:41:59 1995
+++ linux/fs/exec.c Sat Aug 5 03:30:36 1995
@@ -527,7 +527,7 @@
current->comm[i] = '\0';

/* Release all of the old mmap stuff. */
- exit_mmap(current);
+ exec_mm();

flush_thread();

@@ -545,7 +545,6 @@
if (FD_ISSET(i,&current->files->close_on_exec))
sys_close(i);
FD_ZERO(&current->files->close_on_exec);
- clear_page_tables(current);
if (last_task_used_math == current)
last_task_used_math = NULL;
current->used_math = 0;
@@ -774,6 +773,11 @@
#endif

/* OK, This is the point of no return */
+ /*
+ * FIXME: exec_mm() in flush_old_exec() can return an error now...
+ * I simply ignore it for now; doing it correctly would need an
+ * interface change for flush_old_exec().
+ */
flush_old_exec(bprm);

current->mm->end_code = ex.a_text +
--- linux-1.3.12/kernel/fork.c Tue Jul 18 14:15:38 1995
+++ linux/kernel/fork.c Sat Aug 5 03:31:57 1995
@@ -82,37 +82,6 @@
return new_file;
}

-static int dup_mmap(struct task_struct * tsk)
-{
- struct vm_area_struct * mpnt, **p, *tmp;
-
- tsk->mm->mmap = NULL;
- p = &tsk->mm->mmap;
- for (mpnt = current->mm->mmap ; mpnt ; mpnt = mpnt->vm_next) {
- tmp = (struct vm_area_struct *) kmalloc(sizeof(struct vm_area_struct), GFP_KERNEL);
- if (!tmp) {
- exit_mmap(tsk);
- return -ENOMEM;
- }
- *tmp = *mpnt;
- tmp->vm_task = tsk;
- tmp->vm_next = NULL;
- if (tmp->vm_inode) {
- tmp->vm_inode->i_count++;
- /* insert tmp into the share list, just after mpnt */
- tmp->vm_next_share->vm_prev_share = tmp;
- mpnt->vm_next_share = tmp;
- tmp->vm_prev_share = mpnt;
- }
- if (tmp->vm_ops && tmp->vm_ops->open)
- tmp->vm_ops->open(tmp);
- *p = tmp;
- p = &tmp->vm_next;
- }
- build_mmap_avl(tsk);
- return 0;
-}
-
/*
* SHAREFD not yet implemented..
*/
@@ -132,25 +101,6 @@
}
}

-/*
- * CLONEVM not yet correctly implemented: needs to clone the mmap
- * instead of duplicating it..
- */
-static int copy_mm(unsigned long clone_flags, struct task_struct * p)
-{
- if (clone_flags & COPYVM) {
- p->mm->min_flt = p->mm->maj_flt = 0;
- p->mm->cmin_flt = p->mm->cmaj_flt = 0;
- if (copy_page_tables(p))
- return 1;
- return dup_mmap(p);
- } else {
- if (clone_page_tables(p))
- return 1;
- return dup_mmap(p); /* wrong.. */
- }
-}
-
static void copy_fs(unsigned long clone_flags, struct task_struct * p)
{
if (current->fs->pwd)
@@ -206,7 +156,9 @@
p->utime = p->stime = 0;
p->cutime = p->cstime = 0;
p->start_time = jiffies;
- p->mm->swappable = 0; /* don't try to swap it out before it's set up */
+ if (fork_mm(clone_flags, p))
+ goto bad_fork_free;
+
task[nr] = p;
SET_LINKS(p);
nr_tasks++;
@@ -213,8 +165,6 @@

/* copy all the process information */
copy_thread(nr, clone_flags, usp, p, regs);
- if (copy_mm(clone_flags, p))
- goto bad_fork_cleanup;
p->semundo = NULL;
copy_files(clone_flags, p);
copy_fs(clone_flags, p);
@@ -225,10 +175,7 @@
p->counter = current->counter >> 1;
wake_up_process(p); /* do this last, just in case */
return p->pid;
-bad_fork_cleanup:
- task[nr] = NULL;
- REMOVE_LINKS(p);
- nr_tasks--;
+
bad_fork_free:
free_page(new_stack);
free_page((long) p);
--- linux-1.3.12/kernel/exit.c Mon Jul 17 09:10:39 1995
+++ linux/kernel/exit.c Sat Aug 5 03:26:12 1995
@@ -384,8 +384,7 @@
current->flags |= PF_EXITING;
del_timer(&current->real_timer);
sem_exit();
- exit_mmap(current);
- free_page_tables(current);
+ exit_mm();
exit_files();
exit_fs();
exit_thread();
@@ -450,7 +449,6 @@
last_task_used_math = NULL;
current->state = TASK_ZOMBIE;
current->exit_code = code;
- current->mm->rss = 0;
#ifdef DEBUG_PROC_TREE
audit_ptree();
#endif
--- linux-1.3.12/kernel/sched.c Tue Jul 18 14:56:31 1995
+++ linux/kernel/sched.c Sat Aug 5 03:27:15 1995
@@ -81,6 +81,7 @@
static unsigned long init_kernel_stack[1024] = { STACK_MAGIC, };
unsigned long init_user_stack[1024] = { STACK_MAGIC, };
static struct vm_area_struct init_mmap = INIT_MMAP;
+static struct mm_struct init_mm_struct = INIT_MM;
struct task_struct init_task = INIT_TASK;

unsigned long volatile jiffies=0;
--- linux-1.3.12/arch/i386/kernel/process.c Thu Jun 29 08:44:04 1995
+++ linux/arch/i386/kernel/process.c Sat Aug 5 03:22:51 1995
@@ -1,3 +1,5 @@
+#define CLONE_ACTUALLY_WORKS_OK at_least_for_mm_I_hope
+
/*
* linux/arch/i386/kernel/process.c
*
--- linux-1.3.12/mm/mmap.c Mon Jun 26 08:10:35 1995
+++ linux/mm/mmap.c Fri Aug 4 11:46:47 1995
@@ -781,26 +781,6 @@
avl_insert(vma, &task->mm->mmap_avl);
}

-/* Release all mmaps. */
-void exit_mmap(struct task_struct * task)
-{
- struct vm_area_struct * mpnt;
-
- mpnt = task->mm->mmap;
- task->mm->mmap = NULL;
- task->mm->mmap_avl = NULL;
- while (mpnt) {
- struct vm_area_struct * next = mpnt->vm_next;
- if (mpnt->vm_ops && mpnt->vm_ops->close)
- mpnt->vm_ops->close(mpnt);
- remove_shared_vm_struct(mpnt);
- if (mpnt->vm_inode)
- iput(mpnt->vm_inode);
- kfree(mpnt);
- mpnt = next;
- }
-}
-
/*
* Insert vm structure into process list sorted by address
* and into the inode's i_mmap ring.
--- linux-1.3.12/mm/memory.c Wed Jul 12 04:46:42 1995
+++ linux/mm/memory.c Mon Aug 7 13:17:11 1995
@@ -1,3 +1,5 @@
+#undef FOLDED_THREE_LEVELS
+
/*
* linux/mm/memory.c
*
@@ -44,6 +46,7 @@
#include <linux/ptrace.h>
#include <linux/mman.h>
#include <linux/mm.h>
+#include <linux/malloc.h>

#include <asm/system.h>
#include <asm/segment.h>
@@ -78,46 +81,38 @@
send_sig(SIGKILL,task,1);
}

-static inline void free_one_pte(pte_t * page_table)
-{
- pte_t page = *page_table;
-
- if (pte_none(page))
- return;
- pte_clear(page_table);
- if (!pte_present(page)) {
- swap_free(pte_val(page));
- return;
- }
- free_page(pte_page(page));
- return;
-}
+/*
+ * freeing page tables can be done with or without removing the
+ * page directory/kernel map. forget_vm does it all, forget_vm_tables
+ * leaves a valid directory with no user page tables.
+ * FIXME?:
+ * forget_vm could omit zeroing out of the table references
+ * (pmd_clear/pgd_clear) in the _one_ functions, forget_vm_tables
+ * at least has to zero out the entries in the page directory.
+ * Unfortunately, doing this in the loop in forget_vm_tables does
+ * not work cleanly because pgd_clear(), which would be logically used
+ * there, does nothing with the folded three level setup on x86. I could
+ * fake it there, using a pgd_clear() that _does_ zeroing, but I think
+ * that would be worse.
+ */

-static inline void free_one_pmd(pmd_t * dir)
+static inline void forget_pmd(pmd_t * dir)
{
- int j;
- pte_t * pte;
-
if (pmd_none(*dir))
return;
if (pmd_bad(*dir)) {
- printk("free_one_pmd: bad directory entry %08lx\n", pmd_val(*dir));
- pmd_clear(dir);
+ printk("forget_pmd: bad directory entry %08lx\n", pmd_val(*dir));
return;
}
- pte = pte_offset(dir, 0);
+ pte_free(pte_offset(dir,0));
pmd_clear(dir);
- if (pte_inuse(pte)) {
- pte_free(pte);
- return;
- }
- for (j = 0; j < PTRS_PER_PTE ; j++)
- free_one_pte(pte+j);
- pte_free(pte);
}

-static inline void free_one_pgd(pgd_t * dir)
+static inline void forget_pgd(pgd_t * dir)
{
+#ifdef FOLDED_THREE_LEVELS
+ forget_pmd((pmd_t *)dir);
+#else
int j;
pmd_t * pmd;

@@ -124,222 +119,582 @@
if (pgd_none(*dir))
return;
if (pgd_bad(*dir)) {
- printk("free_one_pgd: bad directory entry %08lx\n", pgd_val(*dir));
+ printk("forget_pgd: bad directory entry %08lx\n", pgd_val(*dir));
pgd_clear(dir);
return;
}
pmd = pmd_offset(dir, 0);
- pgd_clear(dir);
- if (pmd_inuse(pmd)) {
- pmd_free(pmd);
- return;
- }
- for (j = 0; j < PTRS_PER_PMD ; j++)
- free_one_pmd(pmd+j);
+ if (!pmd_inuse(pmd)) /* ??? */
+ for (j = 0; j < PTRS_PER_PMD ; j++)
+ forget_pmd(pmd+j);
pmd_free(pmd);
+ pgd_clear(dir);
+#endif
}
-

-/*
- * This function clears all user-level page tables of a process - this
- * is needed by execve(), so that old pages aren't in the way. Note that
- * unlike 'free_page_tables()', this function still leaves a valid
- * page-table-tree in memory: it just removes the user pages. The two
- * functions are similar, but there is a fundamental difference.
- */
-void clear_page_tables(struct task_struct * tsk)
+static inline void forget_vm(struct task_struct * tsk)
{
int i;
pgd_t * page_dir;

- if (!tsk)
- return;
- if (tsk == task[0])
- panic("task[0] (swapper) doesn't support exec()\n");
page_dir = pgd_offset(tsk, 0);
if (!page_dir || page_dir == swapper_pg_dir) {
- printk("%s trying to clear kernel page-directory: not good\n", tsk->comm);
- return;
- }
- if (pgd_inuse(page_dir)) {
- pgd_t * new_pg;
-
- if (!(new_pg = pgd_alloc())) {
- oom(tsk);
- return;
- }
- for (i = USER_PTRS_PER_PGD ; i < PTRS_PER_PGD ; i++)
- new_pg[i] = page_dir[i];
- SET_PAGE_DIR(tsk, new_pg);
- pgd_free(page_dir);
+ printk("%s trying to free kernel page-directory: not good\n", tsk->comm);
return;
}
+ if (pgd_inuse(page_dir))
+ panic("forget_vm on shared page directory\n");
+ SET_PAGE_DIR(tsk, swapper_pg_dir);
for (i = 0 ; i < USER_PTRS_PER_PGD ; i++)
- free_one_pgd(page_dir + i);
- invalidate();
- return;
+ forget_pgd(page_dir + i);
+ pgd_free(page_dir);
}

/*
- * This function frees up all page tables of a process when it exits.
+ * This function frees all user page tables of current, but leaves a valid
+ * page directory mapping kernel memory in place.
*/
-void free_page_tables(struct task_struct * tsk)
+static inline void forget_vm_tables(void)
{
int i;
pgd_t * page_dir;

- if (!tsk)
+ page_dir = pgd_offset(current, 0);
+ if (!page_dir || page_dir == swapper_pg_dir) {
+ printk("%s trying to clear kernel page-directory: not good\n", current->comm);
return;
- if (tsk == task[0]) {
- printk("task[0] (swapper) killed: unable to recover\n");
- panic("Trying to free up swapper memory space");
}
+ if (pgd_inuse(page_dir))
+ panic("forget_vm_tables on shared page directory\n");
+ for (i = 0 ; i < USER_PTRS_PER_PGD ; i++)
+ forget_pgd(page_dir + i);
+}
+
+/*
+ * clone_vm() clones the page table for a process - both
+ * processes share one page directory.
+ */
+static inline void clone_vm(struct task_struct * tsk)
+{
+ pgd_t * pg_dir;
+
+ pg_dir = pgd_offset(current, 0);
+ pgd_reuse(pg_dir);
+ SET_PAGE_DIR(tsk, pg_dir);
+}
+
+/*
+ * This function drops one reference to the page directory.
+ */
+static inline void leave_vm(struct task_struct * tsk)
+{
+ pgd_t * page_dir;
+
page_dir = pgd_offset(tsk, 0);
if (!page_dir || page_dir == swapper_pg_dir) {
- printk("%s trying to free kernel page-directory: not good\n", tsk->comm);
+ printk("%s trying to drop kernel page-directory: not good\n", tsk->comm);
return;
}
+ if (!pgd_inuse(page_dir))
+ panic("leave_vm on nonshared directory\n");
SET_PAGE_DIR(tsk, swapper_pg_dir);
- if (pgd_inuse(page_dir)) {
- pgd_free(page_dir);
- return;
- }
- for (i = 0 ; i < PTRS_PER_PGD ; i++)
- free_one_pgd(page_dir + i);
pgd_free(page_dir);
- invalidate();
}

/*
- * clone_page_tables() clones the page table for a process - both
- * processes will have the exact same pages in memory. There are
- * probably races in the memory management with cloning, but we'll
- * see..
+ * Allocates a new page directory, and fills in the kernel part from
+ * the page directory of current.
+ * Returns the page directory we copied from, or 0 on error.
*/
-int clone_page_tables(struct task_struct * tsk)
+static inline pgd_t *alloc_vm(struct task_struct * tsk)
{
- pgd_t * pg_dir;
+ pgd_t *old_pgd;
+ pgd_t *new_pgd;

- pg_dir = pgd_offset(current, 0);
- pgd_reuse(pg_dir);
- SET_PAGE_DIR(tsk, pg_dir);
+ new_pgd = pgd_alloc();
+ if (!new_pgd)
+ return 0;
+ old_pgd = pgd_offset(current, 0);
+ memcpy(new_pgd + USER_PTRS_PER_PGD, old_pgd + USER_PTRS_PER_PGD,
+ PTRS_PER_PGD - USER_PTRS_PER_PGD);
+ SET_PAGE_DIR(tsk, new_pgd);
+ return old_pgd;
+}
+
+/*
+ * The current task gets a new page directory only mapping kernel pages.
+ * In addition, the old page directory is freed (dereferenced).
+ */
+static inline int renew_vm(void)
+{
+ pgd_t *old_pgd;
+
+ old_pgd = alloc_vm(current);
+ if (!old_pgd)
+ return -ENOMEM;
+ pgd_free(old_pgd);
return 0;
}

-static inline void copy_one_pte(pte_t * old_pte, pte_t * new_pte)
+/*
+ * zap_page_range removes the user pages in a given range. It does not
+ * bother to update the page tables, or to invalidate. This is used
+ * on exit_mm and exec_mm in a loop over all vm_areas.
+ */
+
+static inline void zap_pte(pte_t page)
+{
+ if (pte_none(page))
+ return;
+ if (pte_present(page)) {
+ free_page(pte_page(page));
+ return;
+ }
+ swap_free(pte_val(page));
+}
+
+static inline void zap_pte_range(pmd_t * pmd, unsigned long address, unsigned long size)
{
- pte_t pte = *old_pte;
+ pte_t * pte;
+ unsigned long end;

- if (pte_none(pte))
+ if (pmd_none(*pmd))
return;
- if (!pte_present(pte)) {
- swap_duplicate(pte_val(pte));
- *new_pte = pte;
+ if (pmd_bad(*pmd)) {
+ printk("zap_pte_range: bad pmd (%08lx)\n", pmd_val(*pmd));
return;
}
- if (pte_page(pte) > high_memory || (mem_map[MAP_NR(pte_page(pte))] & MAP_PAGE_RESERVED)) {
- *new_pte = pte;
+ pte = pte_offset(pmd, address);
+ address &= ~PMD_MASK;
+ end = address + size;
+ if (end >= PMD_SIZE)
+ end = PMD_SIZE;
+ do {
+ pte_t page = *pte;
+ zap_pte(page);
+ address += PAGE_SIZE;
+ pte++;
+ } while (address < end);
+}
+
+static inline void zap_pmd_range(pgd_t * dir, unsigned long address, unsigned long size)
+{
+#ifdef FOLDED_THREE_LEVELS
+ zap_pte_range((pmd_t *)dir, address, size);
+#else
+ pmd_t * pmd;
+ unsigned long end;
+
+ if (pgd_none(*dir))
return;
+ if (pgd_bad(*dir)) {
+ printk("zap_pmd_range: bad pgd (%08lx)\n", pgd_val(*dir));
+ pgd_clear(dir);
+ return;
+ }
+ pmd = pmd_offset(dir, address);
+ address &= ~PGDIR_MASK;
+ end = address + size;
+ if (end > PGDIR_SIZE)
+ end = PGDIR_SIZE;
+ do {
+ zap_pte_range(pmd, address, end - address);
+ address = (address + PMD_SIZE) & PMD_MASK;
+ pmd++;
+ } while (address < end);
+#endif
+}
+
+static void zap_page_range(struct task_struct *task,
+ unsigned long address, unsigned long end)
+{
+ pgd_t * dir;
+
+ dir = pgd_offset(task, address);
+ while (address < end) {
+ zap_pmd_range(dir, address, end - address);
+ address = (address + PGDIR_SIZE) & PGDIR_MASK;
+ dir++;
}
- if (pte_cow(pte))
- pte = pte_wrprotect(pte);
- if (delete_from_swap_cache(pte_page(pte)))
- pte = pte_mkdirty(pte);
- *new_pte = pte_mkold(pte);
- *old_pte = pte;
- mem_map[MAP_NR(pte_page(pte))]++;
}

-static inline int copy_one_pmd(pmd_t * old_pmd, pmd_t * new_pmd)
+/*
+ * copy_page_range copies user pages from a given range in current
+ * to a new task. Page tables are allocated as needed.
+ * This is used in copy_mm in a loop over all vm_areas, after we
+ * got a fresh page directory - the function assumes no valid mappings
+ * are in place in the new task.
+ */
+
+/* intentionally not inlined - this is complex enough */
+static void copy_pte_range(pte_t *old_pte, pte_t *new_pte, pte_t *old_last)
{
- int j;
+ pte_t pte;
+
+ while (1) {
+ pte = *old_pte;
+ if (!pte_none(pte)) {
+ if (!pte_present(pte)) {
+ swap_duplicate(pte_val(pte));
+ } else {
+ if (pte_page(pte) <= high_memory
+ && !(mem_map[MAP_NR(pte_page(pte))]&MAP_PAGE_RESERVED)) {
+ if (pte_cow(pte))
+ pte = pte_wrprotect(pte);
+ if (delete_from_swap_cache(pte_page(pte)))
+ pte = pte_mkdirty(pte);
+ *old_pte = pte;
+ mem_map[MAP_NR(pte_page(pte))]++;
+ pte = pte_mkold(pte);
+ }
+ }
+ *new_pte = pte;
+ }
+ if (old_pte == old_last) return;
+ old_pte++;
+ new_pte++;
+ }
+}
+
+static inline int copy_pmd_range(pmd_t *old_pmd, pmd_t *new_pmd,
+ unsigned long address, unsigned long size)
+{
pte_t *old_pte, *new_pte;
+ unsigned long end;

if (pmd_none(*old_pmd))
return 0;
if (pmd_bad(*old_pmd)) {
- printk("copy_one_pmd: bad page table: probable memory corruption\n");
+ printk("copy_pte_range: bad pmd (%08lx)\n", pmd_val(*old_pmd));
pmd_clear(old_pmd);
- return 0;
+ return -EFAULT;
}
- old_pte = pte_offset(old_pmd, 0);
- if (pte_inuse(old_pte)) {
- pte_reuse(old_pte);
- *new_pmd = *old_pmd;
- return 0;
- }
- new_pte = pte_alloc(new_pmd, 0);
- if (!new_pte)
- return -ENOMEM;
- for (j = 0 ; j < PTRS_PER_PTE ; j++) {
- copy_one_pte(old_pte, new_pte);
- old_pte++;
- new_pte++;
+ old_pte = pte_offset(old_pmd, address);
+ if (pmd_none(*new_pmd)) {
+ if (!pte_alloc(new_pmd, 0))
+ return -ENOMEM;
}
+ new_pte = pte_offset(new_pmd, address);
+ address &= ~PMD_MASK;
+ end = address + size - 1;
+ if (end >= PMD_SIZE)
+ end = PMD_SIZE-1;
+ copy_pte_range(old_pte, new_pte, old_pte + (end-address)/PAGE_SIZE);
return 0;
}

-static inline int copy_one_pgd(pgd_t * old_pgd, pgd_t * new_pgd)
+static inline int copy_pgd_range(pgd_t *old_pgd, pgd_t *new_pgd,
+ unsigned long address, unsigned long size)
{
- int j;
+#ifdef FOLDED_THREE_LEVELS
+ return copy_pmd_range((pmd_t *)old_pgd, (pmd_t *)new_pgd, address, size);
+#else
pmd_t *old_pmd, *new_pmd;
+ unsigned long end;
+ int error;

if (pgd_none(*old_pgd))
return 0;
if (pgd_bad(*old_pgd)) {
- printk("copy_one_pgd: bad page table (%p: %08lx): probable memory corruption\n", old_pgd, pgd_val(*old_pgd));
+ printk("copy_pmd_range: bad pgd (%08lx)\n", pgd_val(*old_pgd));
pgd_clear(old_pgd);
- return 0;
+ return -EFAULT;
}
- old_pmd = pmd_offset(old_pgd, 0);
- if (pmd_inuse(old_pmd)) {
- pmd_reuse(old_pmd);
- *new_pgd = *old_pgd;
- return 0;
+ old_pmd = pmd_offset(old_pgd, address);
+ if (pgd_none(*new_pgd)) {
+ if (!pmd_alloc(new_pgd, 0))
+ return -ENOMEM;
}
- new_pmd = pmd_alloc(new_pgd, 0);
- if (!new_pmd)
- return -ENOMEM;
- for (j = 0 ; j < PTRS_PER_PMD ; j++) {
- int error = copy_one_pmd(old_pmd, new_pmd);
+ new_pmd = pmd_offset(new_pgd, address);
+ address &= ~PGDIR_MASK;
+ end = address + size;
+ if (end > PGDIR_SIZE)
+ end = PGDIR_SIZE;
+ do {
+ error = copy_pmd_range(old_pmd, new_pmd, address, end-address);
if (error)
return error;
+ address = (address + PMD_SIZE) & PMD_MASK;
old_pmd++;
new_pmd++;
+ } while (address < end);
+ return 0;
+#endif
+}
+
+static inline int copy_page_range(struct task_struct *tsk,
+ unsigned long address, unsigned long end)
+{
+ pgd_t *old_pgd, *new_pgd;
+ int error;
+
+ old_pgd = pgd_offset(current, address);
+ new_pgd = pgd_offset(tsk, address);
+ error = 0;
+ while (address < end) {
+ error = copy_pgd_range(old_pgd, new_pgd, address, end-address);
+ if (error)
+ break;
+ address = (address + PGDIR_SIZE) & PGDIR_MASK;
+ old_pgd++;
+ new_pgd++;
+ }
+ return error;
+}
+
+/* mm_struct handling - these are the external entry points for all of
+ * the above, as used on fork, exit and exec.
+ */
+
+static struct mm_struct zombie_mm = {
+ 0,0,0,0,
+ 0,0,0,0,
+ 0,0,0,0,
+ 0,
+ 0,0,0,0,
+ 0,
+ 0,0,0,0,
+ 0,0,
+ 0
+};
+
+/*
+ * Remove the vm_areas of a task, but leave memory alone.
+ * This should go away, but there is this small complication on first exec...
+ */
+
+static int is_first_exec = 1; /* argh... */
+
+static void flush_mmap(void)
+{
+ struct vm_area_struct * mpnt;
+
+ mpnt = current->mm->mmap;
+ current->mm->mmap = NULL;
+ current->mm->mmap_avl = NULL;
+ while (mpnt) {
+ struct vm_area_struct * next = mpnt->vm_next;
+ if (mpnt->vm_ops && mpnt->vm_ops->close)
+ mpnt->vm_ops->close(mpnt);
+ remove_shared_vm_struct(mpnt);
+ if (mpnt->vm_inode)
+ iput(mpnt->vm_inode);
+ kfree(mpnt);
+ mpnt = next;
+ }
+}
+
+/* allocate an unused mm struct */
+static inline int alloc_mm(struct task_struct *tsk)
+{
+ struct mm_struct *mm;
+
+ mm = kmalloc(sizeof(*mm), GFP_KERNEL);
+ if (!mm) return 0;
+ tsk->mm = mm;
+ memset(mm, 0, sizeof(*mm));
+ mm->mmap = 0;
+ mm->mmap_avl = 0;
+ mm->clones = tsk->mm_clone_next = tsk->mm_clone_prev = tsk;
+ return 1;
+}
+
+/* add tsk to clone list */
+static inline void join_mm(struct mm_struct *mm, struct task_struct *tsk)
+{
+ tsk->mm = mm;
+ tsk->mm_clone_next = mm->clones;
+ tsk->mm_clone_prev = tsk->mm_clone_next->mm_clone_prev;
+ tsk->mm_clone_next->mm_clone_prev = tsk;
+ tsk->mm_clone_prev->mm_clone_next = tsk;
+}
+
+/*
+ * Remove tsk from mm clone list. If there are other clones left,
+ * fix up vm_task in all vm_areas if they reference tsk.
+ */
+static inline int leave_mm(struct task_struct *tsk)
+{
+ struct vm_area_struct *mpnt;
+
+ if (tsk == tsk->mm_clone_next) /* am I the only one? */
+ return 0;
+ tsk->mm_clone_prev->mm_clone_next = tsk->mm_clone_next;
+ tsk->mm_clone_next->mm_clone_prev = tsk->mm_clone_prev;
+ if (tsk->mm->clones == tsk)
+ tsk->mm->clones = tsk->mm_clone_next;
+ for (mpnt = tsk->mm->mmap; mpnt; mpnt = mpnt->vm_next)
+ if (mpnt->vm_task == tsk)
+ mpnt->vm_task = tsk->mm->clones;
+ return 1;
+}
+
+/*
+ * exec_mm, used on execv, ensures current has a clean page directory
+ * used only by itself, with all user level page tables removed
+ * and the kernel mappings in place. On exit, current will be the
+ * only one using its mm_struct, and no vm_areas will be attached.
+ */
+int exec_mm(void)
+{
+ struct vm_area_struct * mpnt;
+
+ if (current == task[0])
+ panic("task[0] (swapper) doesn't support exec()\n");
+ if (!current->mm || current->mm == &zombie_mm)
+ panic("exec_mm invalid mm_struct pointer 0x%08lx\n", (unsigned long) current->mm);
+ if (is_first_exec) {
+ is_first_exec = 0;
+ flush_mmap();
+ return renew_vm();
+ }
+ if (leave_mm(current)) {
+ int ret;
+ if (!alloc_mm(current))
+ return -ENOMEM;
+ ret = renew_vm();
+ current->mm->swappable = 1;
+ return ret;
+ }
+ mpnt = current->mm->mmap;
+ memset(current->mm, 0, sizeof(*current->mm));
+ current->mm->mmap = 0;
+ current->mm->mmap_avl = 0;
+ current->mm->clones = current->mm_clone_next = current->mm_clone_prev = current;
+ while (mpnt) {
+ struct vm_area_struct * next = mpnt->vm_next;
+ if (mpnt->vm_ops && mpnt->vm_ops->close)
+ mpnt->vm_ops->close(mpnt);
+ remove_shared_vm_struct(mpnt);
+ if (mpnt->vm_inode)
+ iput(mpnt->vm_inode);
+ zap_page_range(current, mpnt->vm_start, mpnt->vm_end);
+ kfree(mpnt);
+ mpnt = next;
}
+ forget_vm_tables();
+ invalidate();
+ current->mm->swappable = 1;
return 0;
}

+static inline void do_exit_mm(struct task_struct * tsk)
+{
+ struct vm_area_struct * mpnt;
+
+ if (tsk == task[0]) {
+ printk("task[0] (swapper) killed: unable to recover\n");
+ panic("Trying to free up swapper memory space");
+ }
+ if (!tsk->mm || tsk->mm == &zombie_mm)
+ panic("exit_mm invalid mm_struct pointer 0x%08lx\n", (unsigned long) tsk->mm);
+ if (leave_mm(tsk)) {
+ tsk->mm = &zombie_mm;
+ leave_vm(tsk);
+ return;
+ }
+ mpnt = tsk->mm->mmap;
+ kfree_s(tsk->mm, sizeof(*tsk->mm));
+ tsk->mm = &zombie_mm;
+ while (mpnt) {
+ struct vm_area_struct * next = mpnt->vm_next;
+ if (mpnt->vm_ops && mpnt->vm_ops->close)
+ mpnt->vm_ops->close(mpnt);
+ remove_shared_vm_struct(mpnt);
+ if (mpnt->vm_inode)
+ iput(mpnt->vm_inode);
+ zap_page_range(tsk, mpnt->vm_start, mpnt->vm_end);
+ kfree(mpnt);
+ mpnt = next;
+ }
+ forget_vm(tsk);
+}
+
+void exit_mm(void)
+{
+ do_exit_mm(current);
+}
+
/*
- * copy_page_tables() just copies the whole process memory range:
- * note the special handling of RESERVED (ie kernel) pages, which
- * means that they are always shared by all processes.
+ * copy_mm() copies vm_areas from current to a new task, on fork().
+ * When done the new task has a copy of the vm_areas and a fully
+ * initialized page table setup - unlike the old dup_mmap() we
+ * dup memory as we go on.
*/
-int copy_page_tables(struct task_struct * tsk)
+static inline int copy_mm(struct task_struct * tsk)
{
- int i;
- pgd_t *old_pgd;
- pgd_t *new_pgd;
+ struct vm_area_struct * mpnt, **p, *tmp;
+ int error;

- new_pgd = pgd_alloc();
- if (!new_pgd)
+ tsk->mm = (struct mm_struct *) kmalloc(sizeof(*tsk->mm), GFP_KERNEL);
+ if (!tsk->mm)
return -ENOMEM;
- SET_PAGE_DIR(tsk, new_pgd);
- old_pgd = pgd_offset(current, 0);
- for (i = 0 ; i < PTRS_PER_PGD ; i++) {
- int errno = copy_one_pgd(old_pgd, new_pgd);
- if (errno) {
- free_page_tables(tsk);
- invalidate();
- return errno;
+ *tsk->mm = *current->mm;
+ if (!alloc_vm(tsk)) {
+ kfree_s(tsk->mm, sizeof(*tsk->mm));
+ tsk->mm = &zombie_mm;
+ return -ENOMEM;
+ }
+ tsk->mm->swappable = 0;
+ tsk->mm->min_flt = tsk->mm->maj_flt = 0;
+ tsk->mm->cmin_flt = tsk->mm->cmaj_flt = 0;
+ tsk->mm->mmap = NULL;
+ tsk->mm->mmap_avl = NULL;
+ tsk->mm->clones = tsk->mm_clone_next = tsk->mm_clone_prev = tsk;
+ p = &tsk->mm->mmap;
+ for (mpnt = current->mm->mmap ; mpnt ; mpnt = mpnt->vm_next) {
+ tmp = (struct vm_area_struct *) kmalloc(sizeof(struct vm_area_struct), GFP_KERNEL);
+ if (!tmp) {
+ error = -ENOMEM;
+ goto error_out;
}
- old_pgd++;
- new_pgd++;
+ *tmp = *mpnt;
+ tmp->vm_task = tsk;
+ tmp->vm_next = NULL;
+ if (tmp->vm_inode) {
+ tmp->vm_inode->i_count++;
+ /* insert tmp into the share list, just after mpnt */
+ tmp->vm_next_share->vm_prev_share = tmp;
+ mpnt->vm_next_share = tmp;
+ tmp->vm_prev_share = mpnt;
+ }
+ if (tmp->vm_ops && tmp->vm_ops->open)
+ tmp->vm_ops->open(tmp);
+ *p = tmp;
+ p = &tmp->vm_next;
+ error = copy_page_range(tsk, tmp->vm_start, tmp->vm_end);
+ if (error)
+ goto error_out;
}
+ build_mmap_avl(tsk);
+ invalidate();
+ return 0;
+
+error_out:
+ do_exit_mm(tsk);
invalidate();
+ return error;
+}
+
+static inline int clone_mm(struct task_struct *tsk)
+{
+ join_mm(current->mm, tsk);
+ clone_vm(tsk);
return 0;
}

+int fork_mm(unsigned long clone_flags, struct task_struct *new_task)
+{
+ if (clone_flags & COPYVM)
+ return copy_mm(new_task);
+ else
+ return clone_mm(new_task);
+}
+
+/*
+ * support functions for mmap handlers, for manipulating page ranges
+ */
+
static inline void forget_pte(pte_t page)
{
if (pte_none(page))
@@ -407,8 +762,7 @@
}

/*
- * a more complete version of free_page_tables which performs with page
- * granularity.
+ * unmap a range of virtual memory in current.
*/
int unmap_page_range(unsigned long address, unsigned long size)
{