pagetable_ops: Hugetlb character device example

From: Adam Litke
Date: Wed Mar 21 2007 - 15:44:07 EST


The main reason I am advocating a set of pagetable_operations is to
enable the development of a new hugetlb interface. During the hugetlb
BoF session at OLS last year, we talked about a character device that would
behave like /dev/zero. Many of the attendees said that they
just wanted to create MAP_PRIVATE hugetlb mappings without all the fuss
about the hugetlbfs filesystem. /dev/zero is a familiar interface for
getting anonymous memory so bringing that model to huge pages would make
programming for anonymous huge pages easier.

The pagetable_operations API opens up possibilities to do some
additional (and completely sane) things. For example, I have a patch
that alters the character device code below to make use of a hugetlb
ZERO_PAGE. This eliminates almost all the up-front fault time, allowing
pages to be COW'ed only when first written to. We cannot do things like
this with hugetlbfs anymore because we have a set of complex semantics
to preserve.

The following patch is an example of what a simple pagetable_operations
consumer could look like. It does depend on some other cleanups I am
working on (removal of is_file_hugepages(), ...hugetlbfs/inode.c vs.
mm/hugetlb.c separation, etc). So it is unlikely to apply to any trees
you may have. I do think it makes a useful illustration of what
legitimate things can be done with a pagetable_operations interface.

commit be72df1c616fb662693a8d4410ce3058f20c71f3
Author: Adam Litke <agl@xxxxxxxxxx>
Date: Tue Feb 13 14:18:21 2007 -0800

diff --git a/drivers/char/Makefile b/drivers/char/Makefile
index fc11063..c5e755b 100644
--- a/drivers/char/Makefile
+++ b/drivers/char/Makefile
@@ -100,6 +100,7 @@ obj-$(CONFIG_IPMI_HANDLER) += ipmi/

obj-$(CONFIG_HANGCHECK_TIMER) += hangcheck-timer.o
obj-$(CONFIG_TCG_TPM) += tpm/
+obj-$(CONFIG_HUGETLB_PAGE) += page.o

# Files generated that shall be removed upon make clean
clean-files := consolemap_deftbl.c defkeymap.c
diff --git a/drivers/char/page.c b/drivers/char/page.c
new file mode 100644
index 0000000..e903028
--- /dev/null
+++ b/drivers/char/page.c
@@ -0,0 +1,133 @@
+#include <linux/mm.h>
+#include <linux/mman.h>
+#include <linux/init.h>
+#include <linux/err.h>
+#include <linux/device.h>
+#include <linux/fs.h>
+#include <linux/pagemap.h>
+#include <linux/hugetlb.h>
+
+/*
+ * Table of device nodes created by chr_dev_init().  Each entry holds the
+ * minor number, the node name and a mode value; the init code in this
+ * file reads only .minor and .name.
+ */
+static const struct {
+	unsigned int minor;
+	char *name;
+	umode_t mode;
+} devlist[] = {
+	{
+		.minor = 1,
+		.name = "page-huge",
+		.mode = S_IRUGO | S_IWUGO,
+	},
+};
+
+/*
+ * .nopage hook installed through page_vm_ops.  It calls BUG()
+ * unconditionally; the trailing return only satisfies the declared
+ * return type.  None of the arguments are examined.
+ */
+static struct page *page_nopage(struct vm_area_struct *vma,
+				unsigned long address, int *type)
+{
+	BUG();
+	return NULL;
+}
+
+/*
+ * vm_operations attached to every mapping set up by page_mmap().  The only
+ * hook provided is .nopage, which points at page_nopage() and therefore
+ * BUG()s if it is ever invoked.
+ */
+static struct vm_operations_struct page_vm_ops = {
+ .nopage = page_nopage,
+};
+
+/*
+ * page_fault - .fault hook of page_pagetable_ops.
+ * @mm:           address space taking the fault
+ * @vma:          mapping that covers @address
+ * @address:      faulting virtual address
+ * @write_access: non-zero when the access was a write
+ *
+ * If the huge PTE for @address is empty, a huge page is allocated, cleared
+ * and installed.  If the PTE is already populated and the access is a write
+ * to a non-writable entry, the fault is handed to hugetlb_cow().
+ *
+ * Returns VM_FAULT_OOM when the page table or the huge page cannot be
+ * allocated, the result of hugetlb_cow() on the copy-on-write path, and
+ * VM_FAULT_MINOR otherwise.
+ *
+ * All faults are serialised by a function-local static mutex;
+ * mm->page_table_lock is taken around every PTE check and update.
+ */
+static int page_fault(struct mm_struct *mm, struct vm_area_struct *vma,
+			unsigned long address, int write_access)
+{
+	pte_t *ptep;
+	pte_t entry, new_entry;
+	struct page *page;
+	/*
+	 * Default result: covers the paths that neither allocate nor COW,
+	 * which previously returned an uninitialised value.
+	 */
+	int ret = VM_FAULT_MINOR;
+	static DEFINE_MUTEX(hugetlb_instantiation_mutex);
+
+	ptep = huge_pte_alloc(mm, address);
+	if (!ptep)
+		return VM_FAULT_OOM;
+
+	mutex_lock(&hugetlb_instantiation_mutex);
+	entry = *ptep;
+	if (pte_none(entry)) {
+		page = alloc_huge_page(vma, address);
+		if (!page) {
+			/* Must not return with the mutex still held. */
+			ret = VM_FAULT_OOM;
+			goto out_mutex;
+		}
+		clear_huge_page(page, address);
+
+		spin_lock(&mm->page_table_lock);
+		if (!pte_none(*ptep)) {
+			/*
+			 * The PTE was populated while we were allocating;
+			 * give the unused page back instead of leaking it.
+			 * NOTE(review): put_page() mirrors the backout path
+			 * of hugetlb_no_page() in mm/hugetlb.c - confirm it
+			 * matches alloc_huge_page() in the target tree.
+			 */
+			spin_unlock(&mm->page_table_lock);
+			put_page(page);
+			goto out_mutex;
+		}
+		add_mm_counter(mm, file_rss, HPAGE_SIZE / PAGE_SIZE);
+		new_entry = make_huge_pte(vma, page, 0);
+		set_huge_pte_at(mm, address, ptep, new_entry);
+		goto out;
+	}
+
+	spin_lock(&mm->page_table_lock);
+	/* Check for a racing update before calling hugetlb_cow */
+	if (likely(pte_same(entry, *ptep)))
+		if (write_access && !pte_write(entry))
+			ret = hugetlb_cow(mm, vma, address, ptep, entry);
+
+out:
+	spin_unlock(&mm->page_table_lock);
+out_mutex:
+	mutex_unlock(&hugetlb_instantiation_mutex);
+	return ret;
+}
+
+
+/*
+ * pagetable_operations installed on each mapping by page_mmap().  Every
+ * hook except .fault points at an existing hugetlb helper; .fault is the
+ * local page_fault() handler above.
+ */
+static struct pagetable_operations_struct page_pagetable_ops = {
+ .copy_vma = copy_hugetlb_page_range,
+ .pin_pages = follow_hugetlb_page,
+ .unmap_page_range = unmap_hugepage_range,
+ .change_protection = hugetlb_change_protection,
+ .free_pgtable_range = hugetlb_free_pgd_range,
+ .fault = page_fault,
+};
+
+/*
+ * page_mmap - mmap handler for the page devices.
+ *
+ * Rejects with -EINVAL any mapping that is shared, has a non-zero page
+ * offset, does not start and end on a huge page boundary, or spans less
+ * than one huge page.  An accepted mapping is flagged VM_HUGETLB and
+ * VM_RESERVED and gets this driver's vm_ops and pagetable_ops.
+ *
+ * Returns 0 on success.  @file is not used.
+ */
+static int page_mmap(struct file * file, struct vm_area_struct *vma)
+{
+	unsigned long start = vma->vm_start;
+	unsigned long end = vma->vm_end;
+
+	/* Only private mappings at offset zero are supported. */
+	if ((vma->vm_flags & VM_SHARED) || vma->vm_pgoff != 0)
+		return -EINVAL;
+
+	/* Both ends must sit on a huge page boundary. */
+	if ((start & ~HPAGE_MASK) || (end & ~HPAGE_MASK))
+		return -EINVAL;
+
+	/* The range must cover at least one huge page. */
+	if (end - start < HPAGE_SIZE)
+		return -EINVAL;
+
+	vma->vm_flags |= (VM_HUGETLB | VM_RESERVED);
+	vma->pagetable_ops = &page_pagetable_ops;
+	vma->vm_ops = &page_vm_ops;
+
+	return 0;
+}
+
+/*
+ * File operations registered for the "page" character device major in
+ * chr_dev_init().  Only mapping-related hooks are provided: .mmap is the
+ * local page_mmap(), the other two point at hugetlb helpers.
+ * NOTE(review): .prepare_unmapped_area is presumably a field added by the
+ * cleanups this patch depends on - confirm against the target tree.
+ */
+const struct file_operations page_file_operations = {
+ .mmap = page_mmap,
+ .get_unmapped_area = hugetlb_get_unmapped_area,
+ .prepare_unmapped_area = prepare_hugepage_range,
+};
+
+/* Device class under which the nodes listed in devlist[] are created. */
+static struct class *page_class;
+
+/*
+ * chr_dev_init - register the "page" character device and its nodes.
+ *
+ * Requests a dynamically assigned major for page_file_operations, creates
+ * the "page" class and then one class device per devlist[] entry.
+ *
+ * Returns 0 on success.  If the major cannot be registered or the class
+ * cannot be created, the error is returned and nothing stays registered;
+ * previously the code carried on with an invalid major or class pointer.
+ */
+static int __init chr_dev_init(void)
+{
+	int major, i;
+
+	printk("Initializing page devices...");
+	major = register_chrdev(0, "page", &page_file_operations);
+	if (major <= 0) {
+		printk("failed\n");
+		/* Do not build device numbers from an invalid major. */
+		return major < 0 ? major : -ENODEV;
+	}
+	printk("(%i:0)\n", major);
+
+	page_class = class_create(THIS_MODULE, "page");
+	if (IS_ERR(page_class)) {
+		/* Undo the chrdev registration before bailing out. */
+		unregister_chrdev(major, "page");
+		return PTR_ERR(page_class);
+	}
+
+	/* Node creation stays best-effort, as in the original code. */
+	for (i = 0; i < ARRAY_SIZE(devlist); i++)
+		class_device_create(page_class, NULL,
+					MKDEV(major, devlist[i].minor),
+					NULL, devlist[i].name);
+
+	return 0;
+}
+fs_initcall(chr_dev_init);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 4fc0bca..edd4944 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -590,6 +590,7 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,

BUG_ON(!has_pt_op(vma, fault));

+ BUG_ON(!has_pt_op(vma,fault));
spin_lock(&mm->page_table_lock);
while (vaddr < vma->vm_end && remainder) {
pte_t *pte;

--
Adam Litke - (agl at us.ibm.com)
IBM Linux Technology Center

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/