Re: [PATCH 04/10] NOMMU: Make VMAs per MM as for MMU-mode linux [ver #3]

From: Andrew Morton
Date: Tue Jan 13 2009 - 02:57:46 EST


On Thu, 08 Jan 2009 12:54:07 +0000 David Howells <dhowells@xxxxxxxxxx> wrote:

> Make VMAs per mm_struct as for MMU-mode linux.
>
>
> ...
>
> --- a/fs/proc/meminfo.c
> +++ b/fs/proc/meminfo.c
> @@ -74,6 +74,9 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
> "LowTotal: %8lu kB\n"
> "LowFree: %8lu kB\n"
> #endif
> +#ifndef CONFIG_MMU
> + "MmapCopy: %8lu kB\n"
> +#endif
> "SwapTotal: %8lu kB\n"
> "SwapFree: %8lu kB\n"
> "Dirty: %8lu kB\n"
> @@ -116,6 +119,9 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
>  		K(i.totalram-i.totalhigh),
>  		K(i.freeram-i.freehigh),
>  #endif
> +#ifndef CONFIG_MMU
> +		K((unsigned long) atomic_read(&mmap_pages_allocated)),
> +#endif

This is slightly wrong, as atomic_read() returns "int". It will show
weird results on 64-bit nommu machines which allocate more than 2G pages :)
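Something like this would avoid the truncation (a sketch, untested; it
assumes mmap_pages_allocated is converted to atomic_long_t, with the
inc/dec sites switched to the matching atomic_long_*() ops):

	extern atomic_long_t mmap_pages_allocated;
	...
		K(atomic_long_read(&mmap_pages_allocated)),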

>  		K(i.totalswap),
>  		K(i.freeswap),
>  		K(global_page_state(NR_FILE_DIRTY)),
>
> ...
>
> +static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma)
> +{
> +	unsigned long ino = 0;
> +	struct file *file;
> +	dev_t dev = 0;
> +	int flags, len;
> +
> +	flags = vma->vm_flags;
> +	file = vma->vm_file;
> +
> +	if (file) {
> +		struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
> +		dev = inode->i_sb->s_dev;
> +		ino = inode->i_ino;
> +	}
> +
> +	seq_printf(m,
> +		   "%08lx-%08lx %c%c%c%c %08lx %02x:%02x %lu %n",
> +		   vma->vm_start,
> +		   vma->vm_end,
> +		   flags & VM_READ ? 'r' : '-',
> +		   flags & VM_WRITE ? 'w' : '-',
> +		   flags & VM_EXEC ? 'x' : '-',
> +		   flags & VM_MAYSHARE ? flags & VM_SHARED ? 'S' : 's' : 'p',
> +		   vma->vm_pgoff << PAGE_SHIFT,

This will overflow at file offsets >4G?
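One way out (a sketch, untested) is to widen the offset before shifting
and print it with the long long format:

		   "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu %n",
		   ...
		   (unsigned long long) vma->vm_pgoff << PAGE_SHIFT,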

> +		   MAJOR(dev), MINOR(dev), ino, &len);
> +
> +	if (file) {
> +		len = 25 + sizeof(void *) * 6 - len;
> +		if (len < 1)
> +			len = 1;
> +		seq_printf(m, "%*c", len, ' ');
> +		seq_path(m, &file->f_path, "");
> +	}
> +
> +	seq_putc(m, '\n');
> +	return 0;
> +}
> +
>
> ...
>
> --- a/mm/mmap.c
> +++ b/mm/mmap.c
> @@ -2472,3 +2472,13 @@ void mm_drop_all_locks(struct mm_struct *mm)
>
>  	mutex_unlock(&mm_all_locks_mutex);
>  }
> +
> +/*
> + * initialise the VMA slab
> + */
> +void __init mmap_init(void)
> +{
> +	vm_area_cachep = kmem_cache_create("vm_area_struct",
> +			sizeof(struct vm_area_struct), 0,
> +			SLAB_PANIC, NULL);

We have the KMEM_CACHE() macro for this.
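ie, something like this (note that KMEM_CACHE() also passes
__alignof__(struct vm_area_struct) rather than 0):

	vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC);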

It was initially added because we'd been through a period of changing
the kmem_cache_create() arguments every ten minutes and we got tired of
patching zillions of callsites.

(I see this was copy-n-pasted from fork.c. Why was it moved?)

>
> ...
>
> +	vm_region_jar = kmem_cache_create("vm_region_jar",
> +					  sizeof(struct vm_region), 0,
> +					  SLAB_PANIC, NULL);
> +	vm_area_cachep = kmem_cache_create("vm_area_struct",
> +					   sizeof(struct vm_area_struct), 0,
> +					   SLAB_PANIC, NULL);

dittoes
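ie (note that KMEM_CACHE() derives the cache's name from the struct, so
the first cache would show up as "vm_region" rather than "vm_region_jar"):

	vm_region_jar = KMEM_CACHE(vm_region, SLAB_PANIC);
	vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC);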

> -#endif /* DEBUG */
>
>  /*
> - * add a VMA into a process's mm_struct in the appropriate place in the list
> - * - should be called with mm->mmap_sem held writelocked
> + * validate the region tree
> + * - the caller must hold the region lock
>  */
> -static void add_vma_to_mm(struct mm_struct *mm, struct vm_list_struct *vml)
> +#ifdef CONFIG_DEBUG_NOMMU_REGIONS
> +static noinline void validate_nommu_regions(void)
>  {
> -	struct vm_list_struct **ppv;
> +	struct vm_region *region, *last;
> +	struct rb_node *p, *lastp;
> 
> -	for (ppv = &current->mm->context.vmlist; *ppv; ppv = &(*ppv)->next)
> -		if ((*ppv)->vma->vm_start > vml->vma->vm_start)
> -			break;
> +	lastp = rb_first(&nommu_region_tree);
> +	if (!lastp)
> +		return;
> +
> +	last = rb_entry(lastp, struct vm_region, vm_rb);
> +	if (unlikely(last->vm_end <= last->vm_start))
> +		BUG();

open-coded BUG_ON()
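ie:

	BUG_ON(last->vm_end <= last->vm_start);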

> +	while ((p = rb_next(lastp))) {
> +		region = rb_entry(p, struct vm_region, vm_rb);

`region' could be local to this block, if one feels so inclined.
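ie:

	while ((p = rb_next(lastp))) {
		struct vm_region *region = rb_entry(p, struct vm_region, vm_rb);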

> +		last = rb_entry(lastp, struct vm_region, vm_rb);
> +
> +		if (unlikely(region->vm_end <= region->vm_start))
> +			BUG();
> +		if (unlikely(region->vm_start < last->vm_end))
> +			BUG();

BUG_ON()
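ie:

	BUG_ON(region->vm_end <= region->vm_start);
	BUG_ON(region->vm_start < last->vm_end);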

> -	vml->next = *ppv;
> -	*ppv = vml;
> +		lastp = p;
> +	}
>  }
> +#else
> +#define validate_nommu_regions() do {} while(0)

There's no need to implement this in cpp...
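An empty C function does the job just as well, and keeps the callsites
compile-checked even when CONFIG_DEBUG_NOMMU_REGIONS is off:

	static void validate_nommu_regions(void)
	{
	}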

>
> ...
>
> -static inline struct vm_area_struct *find_vma_exact(struct mm_struct *mm,
> -						     unsigned long addr)
> +static void free_page_series(unsigned long from, unsigned long to)
>  {
> -	struct vm_list_struct *vml;
> -
> -	/* search the vm_start ordered list */
> -	for (vml = mm->context.vmlist; vml; vml = vml->next) {
> -		if (vml->vma->vm_start == addr)
> -			return vml->vma;
> -		if (vml->vma->vm_start > addr)
> -			break;
> +	for (; from < to; from += PAGE_SIZE) {
> +		struct page *page = virt_to_page(from);
> +
> +		kdebug("- free %lx", from);
> +		atomic_dec(&mmap_pages_allocated);

This isn't counting "pages allocated". It's counting page *refcounts*.
Now, perhaps these pages will always have a refcount of 1 (why?), in
which case things happen to work out. But if so, the next line isn't
needed:

> +		if (page_count(page) != 1)

should be "== 1", surely?
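ie, if the intent is to report the pages which really are about to be
freed (a guess at the intent, mind):

	if (page_count(page) == 1)
		kdebug("free page %p [%d]", page, page_count(page));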

> + kdebug("free page %p [%d]", page, page_count(page));
> + put_page(page);
> }
> -
> - return NULL;
> }
>
> /*
> - * find a VMA in the global tree
> + * release a reference to a region
> + * - the caller must hold the region semaphore, which this releases

"for writing"

>
> ...
>
> +int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
> +{
> +	struct vm_area_struct *vma;
> +	struct rb_node *rb;
> +	unsigned long end = start + len;
> +	int ret;
>
> -	update_hiwater_vm(mm);
> -	mm->total_vm -= len >> PAGE_SHIFT;
> +	kenter(",%lx,%zx", start, len);
>
> -#ifdef DEBUG
> -	show_process_blocks();
> -#endif
> +	if (len == 0)
> +		return -EINVAL;
> +
> +	/* find the first potentially overlapping VMA */
> +	vma = find_vma(mm, start);
> +	if (!vma) {
> +		printk(KERN_WARNING
> +		       "munmap of memory not mmapped by process %d (%s):"
> +		       " 0x%lx-0x%lx\n",
> +		       current->pid, current->comm, start, start + len - 1);

Unprivileged userspace can spam the logs with this? Perhaps that's not a
concern on typical nommu systems. But merely buggy userspace can spam the
logs too, and that _is_ a problem.
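If the warning is to stay it could at least be ratelimited, something
like (a sketch, untested):

	if (printk_ratelimit())
		printk(KERN_WARNING
		       "munmap of memory not mmapped by process %d (%s):"
		       " 0x%lx-0x%lx\n",
		       current->pid, current->comm, start, start + len - 1);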

> +		return -EINVAL;
> +	}
>
> +	/* we're allowed to split an anonymous VMA but not a file-backed one */
> +	if (vma->vm_file) {
> +		do {
> +			if (start > vma->vm_start) {
> +				kleave(" = -EINVAL [miss]");
> +				return -EINVAL;
> +			}
> +			if (end == vma->vm_end)
> +				goto erase_whole_vma;
> +			rb = rb_next(&vma->vm_rb);
> +			vma = rb_entry(rb, struct vm_area_struct, vm_rb);
> +		} while (rb);
> +		kleave(" = -EINVAL [split file]");
> +		return -EINVAL;
> +	} else {
> +		/* the chunk must be a subset of the VMA found */
> +		if (start == vma->vm_start && end == vma->vm_end)
> +			goto erase_whole_vma;
> +		if (start < vma->vm_start || end > vma->vm_end) {
> +			kleave(" = -EINVAL [superset]");
> +			return -EINVAL;
> +		}
> +		if (start & ~PAGE_MASK) {
> +			kleave(" = -EINVAL [unaligned start]");
> +			return -EINVAL;
> +		}
> +		if (end != vma->vm_end && end & ~PAGE_MASK) {
> +			kleave(" = -EINVAL [unaligned split]");
> +			return -EINVAL;
> +		}
> +		if (start != vma->vm_start && end != vma->vm_end) {
> +			ret = split_vma(mm, vma, start, 1);
> +			if (ret < 0) {
> +				kleave(" = %d [split]", ret);
> +				return ret;
> +			}
> +		}
> +		return shrink_vma(mm, vma, start, end);
> +	}
> +
> +erase_whole_vma:
> +	delete_vma_from_mm(vma);
> +	delete_vma(mm, vma);
> +	kleave(" = 0");
>  	return 0;
>  }
>
> ...
>
