[RFC][PATCH 1/3] pagemap: avoid splitting thp when reading /proc/pid/pagemap

From: Naoya Horiguchi
Date: Mon Dec 19 2011 - 13:38:17 EST


Thp split is not necessary if we explicitly check whether pmds are
mapping thps or not. This patch introduces the check and the code
to generate pagemap entries for pmds mapping thps, which reduces
the performance impact of pagemap on thp.

Signed-off-by: Naoya Horiguchi <n-horiguchi@xxxxxxxxxxxxx>
---
fs/proc/task_mmu.c | 48 ++++++++++++++++++++++++++++++++++++++++++++----
1 files changed, 44 insertions(+), 4 deletions(-)

diff --git 3.2-rc5.orig/fs/proc/task_mmu.c 3.2-rc5/fs/proc/task_mmu.c
index e418c5a..90c4b7a 100644
--- 3.2-rc5.orig/fs/proc/task_mmu.c
+++ 3.2-rc5/fs/proc/task_mmu.c
@@ -600,6 +600,9 @@ struct pagemapread {
u64 *buffer;
};

+#define PAGEMAP_WALK_SIZE (PMD_SIZE)
+#define PAGEMAP_WALK_MASK (PMD_MASK)
+
#define PM_ENTRY_BYTES sizeof(u64)
#define PM_STATUS_BITS 3
#define PM_STATUS_OFFSET (64 - PM_STATUS_BITS)
@@ -658,6 +661,22 @@ static u64 pte_to_pagemap_entry(pte_t pte)
return pme;
}

+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+static u64 thp_pte_to_pagemap_entry(pte_t pte, int offset)
+{
+ u64 pme = 0;
+ if (pte_present(pte))
+ pme = PM_PFRAME(pte_pfn(pte) + offset)
+ | PM_PSHIFT(PAGE_SHIFT) | PM_PRESENT;
+ return pme;
+}
+#else
+static inline u64 thp_pte_to_pagemap_entry(pte_t pte, int offset)
+{
+ return 0;
+}
+#endif
+
static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
struct mm_walk *walk)
{
@@ -666,10 +685,33 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
pte_t *pte;
int err = 0;

- split_huge_page_pmd(walk->mm, pmd);
-
/* find the first VMA at or above 'addr' */
vma = find_vma(walk->mm, addr);
+
+ spin_lock(&walk->mm->page_table_lock);
+ if (pmd_trans_huge(*pmd)) {
+ if (pmd_trans_splitting(*pmd)) {
+ spin_unlock(&walk->mm->page_table_lock);
+ wait_split_huge_page(vma->anon_vma, pmd);
+ } else {
+ u64 pfn = PM_NOT_PRESENT;
+
+ for (; addr != end; addr += PAGE_SIZE) {
+ int offset = (addr & ~PAGEMAP_WALK_MASK)
+ >> PAGE_SHIFT;
+ pfn = thp_pte_to_pagemap_entry(*(pte_t *)pmd,
+ offset);
+ err = add_to_pagemap(addr, pfn, pm);
+ if (err)
+ break;
+ }
+ spin_unlock(&walk->mm->page_table_lock);
+ return err;
+ }
+ } else {
+ spin_unlock(&walk->mm->page_table_lock);
+ }
+
for (; addr != end; addr += PAGE_SIZE) {
u64 pfn = PM_NOT_PRESENT;

@@ -754,8 +796,6 @@ static int pagemap_hugetlb_range(pte_t *pte, unsigned long hmask,
* determine which areas of memory are actually mapped and llseek to
* skip over unmapped regions.
*/
-#define PAGEMAP_WALK_SIZE (PMD_SIZE)
-#define PAGEMAP_WALK_MASK (PMD_MASK)
static ssize_t pagemap_read(struct file *file, char __user *buf,
size_t count, loff_t *ppos)
{
--
1.7.7.3

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/