Re: [RFC][PATCH] make /proc/pid/pagemap work with huge pages and return page size

From: Hans Rosenfeld
Date: Tue Feb 26 2008 - 15:34:27 EST


On Mon, Feb 25, 2008 at 10:39:10AM -0800, Dave Hansen wrote:
> Did you read my suggestion? We use one bit in the pte to specify that
> its a large page mapping, then specify a mask to apply to the address to
> get the *first* mapping of the large page, where you're find the actual
> physical address. That keeps us from having to worry about specifying
> *both* the page size and the pfn in the same pte.

I did read it, but I don't see the point. I think we have enough bits in
that pseudo pte to include both the page size and the address/pfn.

> I'm just asking that you please put this in a nice helper function to
> hide it from the poor powerpc/ia64/mips... guys that don't want to see
> x86 code cluttering up otherwise generic functions. Yes, the compiler
> optimizes it away, but it still has a cost to my eyeballs. :)
>
> I eagerly await your next patch!

Ok, here it comes. What I changed is:
- added x86-specific inline function pmd_to_ppte() to asm-x86/pgtable.h
and a dummy definition to asm-generic/pgtable.h
- added a function add_huge_to_pagemap to fs/proc/task_mmu.c that is
completely independent of the huge page size
- use pshift instead of psize in pseudo pte, had to shorten paddr field
to 56 bits which is probably still enough for quite some time
- renamed the struct ppte to struct pagemap_ppte, making more clear what
it is actually used for. Since I weren't sure where to put it, I put
the definition in a new header linux/pagemap_ppte.h. I guess that it
will be put somewhere else, linux/pagemap.h or asm-generic/pgtable.h
are what I had in mind. Using one of those created more problems that
I didn't want to look at.



Signed-off-by: Hans Rosenfeld <hans.rosenfeld@xxxxxxx>

---
fs/proc/task_mmu.c | 54 +++++++++++++++++++++++++---------------
include/asm-generic/pgtable.h | 6 ++++
include/asm-x86/pgtable.h | 16 ++++++++++++
include/asm-x86/pgtable_64.h | 1 -
include/linux/pagemap_ppte.h | 17 +++++++++++++
5 files changed, 73 insertions(+), 21 deletions(-)
create mode 100644 include/linux/pagemap_ppte.h

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 49958cf..b3f07f6 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -527,16 +527,11 @@ struct pagemapread {
char __user *out, *end;
};

-#define PM_ENTRY_BYTES sizeof(u64)
-#define PM_RESERVED_BITS 3
-#define PM_RESERVED_OFFSET (64 - PM_RESERVED_BITS)
-#define PM_RESERVED_MASK (((1LL<<PM_RESERVED_BITS)-1) << PM_RESERVED_OFFSET)
-#define PM_SPECIAL(nr) (((nr) << PM_RESERVED_OFFSET) | PM_RESERVED_MASK)
-#define PM_NOT_PRESENT PM_SPECIAL(1LL)
-#define PM_SWAP PM_SPECIAL(2LL)
-#define PM_END_OF_BUFFER 1
-
-static int add_to_pagemap(unsigned long addr, u64 pfn,
+#define PM_NOT_PRESENT ((struct pagemap_ppte) {0, 0, 0, 0})
+#define PM_ENTRY_BYTES sizeof(struct pagemap_ppte)
+#define PM_END_OF_BUFFER 1
+
+static int add_to_pagemap(unsigned long addr, struct pagemap_ppte ppte,
struct pagemapread *pm)
{
/*
@@ -545,18 +540,30 @@ static int add_to_pagemap(unsigned long addr, u64 pfn,
* the pfn.
*/
if (pm->out + PM_ENTRY_BYTES >= pm->end) {
- if (copy_to_user(pm->out, &pfn, pm->end - pm->out))
+ if (copy_to_user(pm->out, &ppte, pm->end - pm->out))
return -EFAULT;
pm->out = pm->end;
return PM_END_OF_BUFFER;
}

- if (put_user(pfn, pm->out))
+ if (copy_to_user(pm->out, &ppte, sizeof(ppte)))
return -EFAULT;
pm->out += PM_ENTRY_BYTES;
return 0;
}

+static int add_huge_to_pagemap(unsigned long addr, unsigned long end,
+ struct pagemap_ppte ppte, struct pagemapread *pm)
+{
+ int err;
+
+ for (; addr != end; addr += PAGE_SIZE) {
+ err = add_to_pagemap(addr, ppte, pm);
+ if (err)
+ return err;
+ }
+}
+
static int pagemap_pte_hole(unsigned long start, unsigned long end,
void *private)
{
@@ -574,7 +581,7 @@ static int pagemap_pte_hole(unsigned long start, unsigned long end,
u64 swap_pte_to_pagemap_entry(pte_t pte)
{
swp_entry_t e = pte_to_swp_entry(pte);
- return PM_SWAP | swp_type(e) | (swp_offset(e) << MAX_SWAPFILES_SHIFT);
+ return swp_type(e) | (swp_offset(e) << MAX_SWAPFILES_SHIFT);
}

static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
@@ -584,16 +591,23 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
pte_t *pte;
int err = 0;

- for (; addr != end; addr += PAGE_SIZE) {
- u64 pfn = PM_NOT_PRESENT;
+ if (pmd_huge(*pmd))
+ add_huge_to_pagemap(addr, end, pmd_to_ppte(pmd), pm);
+ else for (; addr != end; addr += PAGE_SIZE) {
+ struct pagemap_ppte ppte = { 0, 0, 0, 0};
+
pte = pte_offset_map(pmd, addr);
- if (is_swap_pte(*pte))
- pfn = swap_pte_to_pagemap_entry(*pte);
- else if (pte_present(*pte))
- pfn = pte_pfn(*pte);
+ if (is_swap_pte(*pte)) {
+ ppte.swap = 1;
+ ppte.paddr = swap_pte_to_pagemap_entry(*pte);
+ } else if (pte_present(*pte)) {
+ ppte.present = 1;
+ ppte.pshift = PAGE_SHIFT;
+ ppte.paddr = pte_pfn(*pte) << PAGE_SHIFT;
+ }
/* unmap so we're not in atomic when we copy to userspace */
pte_unmap(pte);
- err = add_to_pagemap(addr, pfn, pm);
+ err = add_to_pagemap(addr, ppte, pm);
if (err)
return err;
}
diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
index 44ef329..d7df89d 100644
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -195,6 +195,12 @@ static inline int pmd_none_or_clear_bad(pmd_t *pmd)
}
return 0;
}
+
+/* dummy for !x86 */
+#ifndef pmd_to_ppte
+#define pmd_to_ppte(x) ((struct pagemap_ppte) {0, 0, 0, 0})
+#endif
+
#endif /* CONFIG_MMU */

/*
diff --git a/include/asm-x86/pgtable.h b/include/asm-x86/pgtable.h
index 174b877..7a446e0 100644
--- a/include/asm-x86/pgtable.h
+++ b/include/asm-x86/pgtable.h
@@ -181,6 +181,8 @@ static inline pmd_t pfn_pmd(unsigned long page_nr, pgprot_t pgprot)
pgprot_val(pgprot)) & __supported_pte_mask);
}

+#define pmd_pfn(x) ((pmd_val(x) & __PHYSICAL_MASK) >> PAGE_SHIFT)
+
static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
{
pteval_t val = pte_val(pte);
@@ -242,6 +244,20 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)

#ifndef __ASSEMBLY__

+#include <linux/pagemap_ppte.h>
+
+static inline struct pagemap_ppte pmd_to_ppte(pmd_t *pmd)
+{
+ struct pagemap_ppte ppte = {
+ .paddr = pmd_pfn(*pmd) << PAGE_SHIFT,
+ .pshift = HPAGE_SHIFT,
+ .swap = 0,
+ .present = 1,
+ };
+
+ return ppte;
+}
+
enum {
PG_LEVEL_NONE,
PG_LEVEL_4K,
diff --git a/include/asm-x86/pgtable_64.h b/include/asm-x86/pgtable_64.h
index 02bd4aa..094a538 100644
--- a/include/asm-x86/pgtable_64.h
+++ b/include/asm-x86/pgtable_64.h
@@ -216,7 +216,6 @@ static inline int pud_large(pud_t pte)
#define pmd_none(x) (!pmd_val(x))
#define pmd_present(x) (pmd_val(x) & _PAGE_PRESENT)
#define pfn_pmd(nr,prot) (__pmd(((nr) << PAGE_SHIFT) | pgprot_val(prot)))
-#define pmd_pfn(x) ((pmd_val(x) & __PHYSICAL_MASK) >> PAGE_SHIFT)

#define pte_to_pgoff(pte) ((pte_val(pte) & PHYSICAL_PAGE_MASK) >> PAGE_SHIFT)
#define pgoff_to_pte(off) ((pte_t) { .pte = ((off) << PAGE_SHIFT) | _PAGE_FILE })
diff --git a/include/linux/pagemap_ppte.h b/include/linux/pagemap_ppte.h
new file mode 100644
index 0000000..4d1256f
--- /dev/null
+++ b/include/linux/pagemap_ppte.h
@@ -0,0 +1,17 @@
+#ifndef _LINUX_PAGEMAP_PPTE_H
+#define _LINUX_PAGEMAP_PPTE_H
+
+#include <linux/types.h>
+
+/*
+ * structure used by /proc/pid/pagemap to describe mappings
+ */
+
+struct pagemap_ppte {
+ __u64 paddr:56;
+ __u64 pshift:6;
+ __u64 swap:1;
+ __u64 present:1;
+};
+
+#endif /* _LINUX_PAGEMAP_PPTE_H */
--
1.5.3.7

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/