[RFC][PATCH] proc: mm: export PTE sizes directly in smaps

From: Dave Hansen
Date: Wed Nov 16 2016 - 14:05:42 EST



/proc/$pid/smaps has a number of fields that are intended to imply the
kinds of PTEs used to map memory. "AnonHugePages" obviously[1] tells you
how many PMDs are being used. "MMUPageSize" along with the "Hugetlb"
fields tells you how many PTEs you have for a huge page.

The current mechanisms work fine when we have one or two page sizes.
But, they start to get a bit muddled when we mix page sizes inside
one VMA. For instance, the DAX folks were proposing adding a set of
fields like:

DevicePages:
DeviceHugePages:
DeviceGiganticPages:
DeviceGinormousPages:

to unmuddle things when page sizes get mixed. That's fine, but
it does require that userspace know the mapping from our various
arbitrary names to hardware page sizes on each architecture and
kernel configuration.

What folks really want is to know how much memory is mapped with
each page size. How about we just do *that*?

Patch attached. Seems harmless enough. Probably doesn't compile
everywhere. Makes smaps look like this:

Private_Hugetlb: 0 kB
Swap: 0 kB
SwapPss: 0 kB
KernelPageSize: 4 kB
MMUPageSize: 4 kB
Locked: 0 kB
Ptes: 4096 kB @ 4 kB 4096 kB @ 2048 kB
VmFlags: rd wr mr mw me ac

1. I'd like to thank Dan Williams for showing me a mirror as I
complained about the bozo that introduced 'AnonHugePages'.

Cc: Christoph Hellwig <hch@xxxxxx>
Cc: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx>
Cc: Dan Williams <dan.j.williams@xxxxxxxxx>
Cc: Anshuman Khandual <khandual@xxxxxxxxxxxxxxxxxx>

---

b/fs/proc/task_mmu.c | 38 ++++++++++++++++++++++++++++++++++++--
1 file changed, 36 insertions(+), 2 deletions(-)

diff -puN fs/proc/task_mmu.c~smaps-pte-sizes fs/proc/task_mmu.c
--- a/fs/proc/task_mmu.c~smaps-pte-sizes 2016-11-16 08:47:35.609910046 -0800
+++ b/fs/proc/task_mmu.c 2016-11-16 10:40:40.529937746 -0800
@@ -445,6 +445,9 @@ struct mem_size_stats {
unsigned long swap;
unsigned long shared_hugetlb;
unsigned long private_hugetlb;
+ unsigned long rss_pte;
+ unsigned long rss_pmd;
+ unsigned long rss_pud;
u64 pss;
u64 swap_pss;
bool check_shmem_swap;
@@ -519,6 +522,7 @@ static void smaps_pte_entry(pte_t *pte,

if (pte_present(*pte)) {
page = vm_normal_page(vma, addr, *pte);
+ mss->rss_pte += PAGE_SIZE;
} else if (is_swap_pte(*pte)) {
swp_entry_t swpent = pte_to_swp_entry(*pte);

@@ -578,6 +582,7 @@ static void smaps_pmd_entry(pmd_t *pmd,
/* pass */;
else
VM_BUG_ON_PAGE(1, page);
+ mss->rss_pmd += PAGE_SIZE;
smaps_account(mss, page, true, pmd_young(*pmd), pmd_dirty(*pmd));
}
#else
@@ -702,11 +707,13 @@ static int smaps_hugetlb_range(pte_t *pt
}
if (page) {
int mapcount = page_mapcount(page);
+ unsigned long hpage_size = huge_page_size(hstate_vma(vma));

+ mss->rss_pud += hpage_size;
if (mapcount >= 2)
- mss->shared_hugetlb += huge_page_size(hstate_vma(vma));
+ mss->shared_hugetlb += hpage_size;
else
- mss->private_hugetlb += huge_page_size(hstate_vma(vma));
+ mss->private_hugetlb += hpage_size;
}
return 0;
}
@@ -716,6 +723,32 @@ void __weak arch_show_smap(struct seq_fi
{
}

+#define K(x) ((x) >> 10)	/* bytes -> kB */
+
+static void show_smap_ptes(struct seq_file *m, struct mem_size_stats *mss)
+{
+	/* Emit nothing for a VMA with no PTE-, PMD- or PUD-level mappings. */
+	if (!(mss->rss_pte || mss->rss_pmd || mss->rss_pud))
+		return;
+
+	seq_printf(m, "Ptes: ");
+	/*
+	 * rss_* are accumulated in bytes, so convert with K() to match
+	 * the "kB" label. The small-page entry is always printed so each
+	 * line carries a "kB" value, which is less likely to break parsers.
+	 */
+	seq_printf(m, "%8lu kB @ %8lu kB", K(mss->rss_pte), K(PAGE_SIZE));
+	if (mss->rss_pmd) {
+		seq_printf(m, " ");
+		seq_printf(m, "%8lu kB @ %8lu kB", K(mss->rss_pmd), K(PMD_SIZE));
+	}
+	if (mss->rss_pud) {
+		seq_printf(m, " ");
+		seq_printf(m, "%8lu kB @ %8lu kB", K(mss->rss_pud), K(PUD_SIZE));
+	}
+	seq_printf(m, "\n");
+}
+
static int show_smap(struct seq_file *m, void *v, int is_pid)
{
struct vm_area_struct *vma = v;
@@ -799,6 +832,7 @@ static int show_smap(struct seq_file *m,
(vma->vm_flags & VM_LOCKED) ?
(unsigned long)(mss.pss >> (10 + PSS_SHIFT)) : 0);

+ show_smap_ptes(m, &mss);
arch_show_smap(m, vma);
show_smap_vma_flags(m, vma);
m_cache_vma(m, vma);
_