Re: [PATCH] Add /proc/pid/smaps_rollup

From: Daniel Colascione
Date: Tue Aug 08 2017 - 14:23:04 EST


Adding more people.

On Tue, Aug 08 2017, Daniel Colascione wrote:
> /proc/pid/smaps_rollup is a new proc file that improves the
> performance of user programs that determine aggregate memory
> statistics (e.g., total PSS) of a process.
>
> Anroid regularly "samples" the memory usage of various processes in
> order to blance its memory pool sizes. This sampling process involves
> opening /proc/pid/smaps and summing certain fields. For very large
> processes, sampling memory use this way can take several hundred
> milliseconds, due mostly to the overhead of the seq_printf calls in
> task_mmu.c.
>
> smaps_rollup improves the situation. It contains most of the fields of
> /proc/pid/smaps, but instead of a set of fields for each VMA,
> smaps_rollup instead contains one synthetic smaps-format entry
> representing the whole process. In the single smaps_rollup synthetic
> entry, each field is the summation of the corresponding field in all
> of the real-smaps VMAs. Using a common format for smaps_rollup and
> smaps allows userspace parsers to repurpose parsers meant for use with
> non-rollup smaps for smaps_rollup, and it allows userspace to switch
> between smaps_rollup and smaps at runtime (say, based on the
> availablity of smaps_rollup in a given kernel) with minimal fuss.
>
> By using smaps_rollup instead of smaps, a caller can avoid the
> significant overhead of formatting, reading, and parsing each of a
> large process's potentially very numerous memory mappings. For
> sampling system_server's PSS in Android, we measured a 12x speedup,
> representing a savings of several hundred milliseconds.
>
> One alternative to a new per-process proc file would have been
> including PSS information in /proc/pid/status. We considered this
> option but thought that PSS would be too expensive (by a few orders of
> magnitude) to collect relative to what's already emitted as part of
> /proc/pid/status, and slowing every user of /proc/pid/status for the
> sake of readers that happen to want PSS feels wrong.
>
> The code itself works by reusing the existing VMA-walking framework we
> use for regular smaps generation and keeping the mem_size_stats
> structure around between VMA walks instead of using a fresh one for
> each VMA. In this way, summation happens automatically. We let
> seq_file walk over the VMAs just as it does for regular smaps and just
> emit nothing to the seq_file until we hit the last VMA.
>
> Signed-off-by: Daniel Colascione <dancol@xxxxxxxxxx>
> ---
> fs/proc/base.c | 2 +
> fs/proc/internal.h | 3 +
> fs/proc/task_mmu.c | 196 ++++++++++++++++++++++++++++++++++++-----------------
> 3 files changed, 139 insertions(+), 62 deletions(-)
>
> diff --git a/fs/proc/base.c b/fs/proc/base.c
> index 719c2e943ea1..a9587b9cace5 100644
> --- a/fs/proc/base.c
> +++ b/fs/proc/base.c
> @@ -2930,6 +2930,7 @@ static const struct pid_entry tgid_base_stuff[] = {
> #ifdef CONFIG_PROC_PAGE_MONITOR
> REG("clear_refs", S_IWUSR, proc_clear_refs_operations),
> REG("smaps", S_IRUGO, proc_pid_smaps_operations),
> + REG("smaps_rollup", S_IRUGO, proc_pid_smaps_rollup_operations),
> REG("pagemap", S_IRUSR, proc_pagemap_operations),
> #endif
> #ifdef CONFIG_SECURITY
> @@ -3323,6 +3324,7 @@ static const struct pid_entry tid_base_stuff[] = {
> #ifdef CONFIG_PROC_PAGE_MONITOR
> REG("clear_refs", S_IWUSR, proc_clear_refs_operations),
> REG("smaps", S_IRUGO, proc_tid_smaps_operations),
> + REG("smaps_rollup", S_IRUGO, proc_pid_smaps_rollup_operations),
> REG("pagemap", S_IRUSR, proc_pagemap_operations),
> #endif
> #ifdef CONFIG_SECURITY
> diff --git a/fs/proc/internal.h b/fs/proc/internal.h
> index aa2b89071630..2cbfcd32e884 100644
> --- a/fs/proc/internal.h
> +++ b/fs/proc/internal.h
> @@ -269,10 +269,12 @@ extern int proc_remount(struct super_block *, int *, char *);
> /*
> * task_[no]mmu.c
> */
> +struct mem_size_stats;
> struct proc_maps_private {
> struct inode *inode;
> struct task_struct *task;
> struct mm_struct *mm;
> + struct mem_size_stats *rollup;
> #ifdef CONFIG_MMU
> struct vm_area_struct *tail_vma;
> #endif
> @@ -288,6 +290,7 @@ extern const struct file_operations proc_tid_maps_operations;
> extern const struct file_operations proc_pid_numa_maps_operations;
> extern const struct file_operations proc_tid_numa_maps_operations;
> extern const struct file_operations proc_pid_smaps_operations;
> +extern const struct file_operations proc_pid_smaps_rollup_operations;
> extern const struct file_operations proc_tid_smaps_operations;
> extern const struct file_operations proc_clear_refs_operations;
> extern const struct file_operations proc_pagemap_operations;
> diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
> index b836fd61ed87..02b55df7291c 100644
> --- a/fs/proc/task_mmu.c
> +++ b/fs/proc/task_mmu.c
> @@ -252,6 +252,7 @@ static int proc_map_release(struct inode *inode, struct file *file)
> if (priv->mm)
> mmdrop(priv->mm);
>
> + kfree(priv->rollup);
> return seq_release_private(inode, file);
> }
>
> @@ -278,6 +279,23 @@ static int is_stack(struct proc_maps_private *priv,
> vma->vm_end >= vma->vm_mm->start_stack;
> }
>
> +static void show_vma_header_prefix(struct seq_file *m,
> + unsigned long start, unsigned long end,
> + vm_flags_t flags, unsigned long long pgoff,
> + dev_t dev, unsigned long ino)
> +{
> + seq_setwidth(m, 25 + sizeof(void *) * 6 - 1);
> + seq_printf(m, "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu ",
> + start,
> + end,
> + flags & VM_READ ? 'r' : '-',
> + flags & VM_WRITE ? 'w' : '-',
> + flags & VM_EXEC ? 'x' : '-',
> + flags & VM_MAYSHARE ? 's' : 'p',
> + pgoff,
> + MAJOR(dev), MINOR(dev), ino);
> +}
> +
> static void
> show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid)
> {
> @@ -300,17 +318,7 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid)
>
> start = vma->vm_start;
> end = vma->vm_end;
> -
> - seq_setwidth(m, 25 + sizeof(void *) * 6 - 1);
> - seq_printf(m, "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu ",
> - start,
> - end,
> - flags & VM_READ ? 'r' : '-',
> - flags & VM_WRITE ? 'w' : '-',
> - flags & VM_EXEC ? 'x' : '-',
> - flags & VM_MAYSHARE ? 's' : 'p',
> - pgoff,
> - MAJOR(dev), MINOR(dev), ino);
> + show_vma_header_prefix(m, start, end, flags, pgoff, dev, ino);
>
> /*
> * Print the dentry name for named mappings, and a
> @@ -429,6 +437,7 @@ const struct file_operations proc_tid_maps_operations = {
>
> #ifdef CONFIG_PROC_PAGE_MONITOR
> struct mem_size_stats {
> + bool first;
> unsigned long resident;
> unsigned long shared_clean;
> unsigned long shared_dirty;
> @@ -442,7 +451,9 @@ struct mem_size_stats {
> unsigned long swap;
> unsigned long shared_hugetlb;
> unsigned long private_hugetlb;
> + unsigned long first_vma_start;
> u64 pss;
> + u64 pss_locked;
> u64 swap_pss;
> bool check_shmem_swap;
> };
> @@ -718,18 +729,36 @@ void __weak arch_show_smap(struct seq_file *m, struct vm_area_struct *vma)
>
> static int show_smap(struct seq_file *m, void *v, int is_pid)
> {
> + struct proc_maps_private *priv = m->private;
> struct vm_area_struct *vma = v;
> - struct mem_size_stats mss;
> + struct mem_size_stats mss_stack;
> + struct mem_size_stats *mss;
> struct mm_walk smaps_walk = {
> .pmd_entry = smaps_pte_range,
> #ifdef CONFIG_HUGETLB_PAGE
> .hugetlb_entry = smaps_hugetlb_range,
> #endif
> .mm = vma->vm_mm,
> - .private = &mss,
> };
> + int ret = 0;
> + bool rollup_mode;
> + bool last_vma;
> +
> + if (priv->rollup) {
> + rollup_mode = true;
> + mss = priv->rollup;
> + if (mss->first) {
> + mss->first_vma_start = vma->vm_start;
> + mss->first = false;
> + }
> + last_vma = !m_next_vma(priv, vma);
> + } else {
> + rollup_mode = false;
> + memset(&mss_stack, 0, sizeof(mss_stack));
> + mss = &mss_stack;
> + }
>
> - memset(&mss, 0, sizeof mss);
> + smaps_walk.private = mss;
>
> #ifdef CONFIG_SHMEM
> if (vma->vm_file && shmem_mapping(vma->vm_file->f_mapping)) {
> @@ -747,9 +776,9 @@ static int show_smap(struct seq_file *m, void *v, int is_pid)
>
> if (!shmem_swapped || (vma->vm_flags & VM_SHARED) ||
> !(vma->vm_flags & VM_WRITE)) {
> - mss.swap = shmem_swapped;
> + mss->swap = shmem_swapped;
> } else {
> - mss.check_shmem_swap = true;
> + mss->check_shmem_swap = true;
> smaps_walk.pte_hole = smaps_pte_hole;
> }
> }
> @@ -757,54 +786,71 @@ static int show_smap(struct seq_file *m, void *v, int is_pid)
>
> /* mmap_sem is held in m_start */
> walk_page_vma(vma, &smaps_walk);
> + if (vma->vm_flags & VM_LOCKED)
> + mss->pss_locked += mss->pss;
> +
> + if (!rollup_mode) {
> + show_map_vma(m, vma, is_pid);
> + } else if (last_vma) {
> + show_vma_header_prefix(
> + m, mss->first_vma_start, vma->vm_end, 0, 0, 0, 0);
> + seq_pad(m, ' ');
> + seq_puts(m, "[rollup]\n");
> + } else {
> + ret = SEQ_SKIP;
> + }
>
> - show_map_vma(m, vma, is_pid);
> -
> - seq_printf(m,
> - "Size: %8lu kB\n"
> - "Rss: %8lu kB\n"
> - "Pss: %8lu kB\n"
> - "Shared_Clean: %8lu kB\n"
> - "Shared_Dirty: %8lu kB\n"
> - "Private_Clean: %8lu kB\n"
> - "Private_Dirty: %8lu kB\n"
> - "Referenced: %8lu kB\n"
> - "Anonymous: %8lu kB\n"
> - "LazyFree: %8lu kB\n"
> - "AnonHugePages: %8lu kB\n"
> - "ShmemPmdMapped: %8lu kB\n"
> - "Shared_Hugetlb: %8lu kB\n"
> - "Private_Hugetlb: %7lu kB\n"
> - "Swap: %8lu kB\n"
> - "SwapPss: %8lu kB\n"
> - "KernelPageSize: %8lu kB\n"
> - "MMUPageSize: %8lu kB\n"
> - "Locked: %8lu kB\n",
> - (vma->vm_end - vma->vm_start) >> 10,
> - mss.resident >> 10,
> - (unsigned long)(mss.pss >> (10 + PSS_SHIFT)),
> - mss.shared_clean >> 10,
> - mss.shared_dirty >> 10,
> - mss.private_clean >> 10,
> - mss.private_dirty >> 10,
> - mss.referenced >> 10,
> - mss.anonymous >> 10,
> - mss.lazyfree >> 10,
> - mss.anonymous_thp >> 10,
> - mss.shmem_thp >> 10,
> - mss.shared_hugetlb >> 10,
> - mss.private_hugetlb >> 10,
> - mss.swap >> 10,
> - (unsigned long)(mss.swap_pss >> (10 + PSS_SHIFT)),
> - vma_kernel_pagesize(vma) >> 10,
> - vma_mmu_pagesize(vma) >> 10,
> - (vma->vm_flags & VM_LOCKED) ?
> - (unsigned long)(mss.pss >> (10 + PSS_SHIFT)) : 0);
> -
> - arch_show_smap(m, vma);
> - show_smap_vma_flags(m, vma);
> + if (!rollup_mode)
> + seq_printf(m,
> + "Size: %8lu kB\n"
> + "KernelPageSize: %8lu kB\n"
> + "MMUPageSize: %8lu kB\n",
> + (vma->vm_end - vma->vm_start) >> 10,
> + vma_kernel_pagesize(vma) >> 10,
> + vma_mmu_pagesize(vma) >> 10);
> +
> +
> + if (!rollup_mode || last_vma)
> + seq_printf(m,
> + "Rss: %8lu kB\n"
> + "Pss: %8lu kB\n"
> + "Shared_Clean: %8lu kB\n"
> + "Shared_Dirty: %8lu kB\n"
> + "Private_Clean: %8lu kB\n"
> + "Private_Dirty: %8lu kB\n"
> + "Referenced: %8lu kB\n"
> + "Anonymous: %8lu kB\n"
> + "LazyFree: %8lu kB\n"
> + "AnonHugePages: %8lu kB\n"
> + "ShmemPmdMapped: %8lu kB\n"
> + "Shared_Hugetlb: %8lu kB\n"
> + "Private_Hugetlb: %7lu kB\n"
> + "Swap: %8lu kB\n"
> + "SwapPss: %8lu kB\n"
> + "Locked: %8lu kB\n",
> + mss->resident >> 10,
> + (unsigned long)(mss->pss >> (10 + PSS_SHIFT)),
> + mss->shared_clean >> 10,
> + mss->shared_dirty >> 10,
> + mss->private_clean >> 10,
> + mss->private_dirty >> 10,
> + mss->referenced >> 10,
> + mss->anonymous >> 10,
> + mss->lazyfree >> 10,
> + mss->anonymous_thp >> 10,
> + mss->shmem_thp >> 10,
> + mss->shared_hugetlb >> 10,
> + mss->private_hugetlb >> 10,
> + mss->swap >> 10,
> + (unsigned long)(mss->swap_pss >> (10 + PSS_SHIFT)),
> + (unsigned long)(mss->pss >> (10 + PSS_SHIFT)));
> +
> + if (!rollup_mode) {
> + arch_show_smap(m, vma);
> + show_smap_vma_flags(m, vma);
> + }
> m_cache_vma(m, vma);
> - return 0;
> + return ret;
> }
>
> static int show_pid_smap(struct seq_file *m, void *v)
> @@ -836,6 +882,25 @@ static int pid_smaps_open(struct inode *inode, struct file *file)
> return do_maps_open(inode, file, &proc_pid_smaps_op);
> }
>
> +static int pid_smaps_rollup_open(struct inode *inode, struct file *file)
> +{
> + struct seq_file *seq;
> + struct proc_maps_private *priv;
> + int ret = do_maps_open(inode, file, &proc_pid_smaps_op);
> +
> + if (ret < 0)
> + return ret;
> + seq = file->private_data;
> + priv = seq->private;
> + priv->rollup = kzalloc(sizeof(*priv->rollup), GFP_KERNEL);
> + if (!priv->rollup) {
> + proc_map_release(inode, file);
> + return -ENOMEM;
> + }
> + priv->rollup->first = true;
> + return 0;
> +}
> +
> static int tid_smaps_open(struct inode *inode, struct file *file)
> {
> return do_maps_open(inode, file, &proc_tid_smaps_op);
> @@ -848,6 +913,13 @@ const struct file_operations proc_pid_smaps_operations = {
> .release = proc_map_release,
> };
>
> +const struct file_operations proc_pid_smaps_rollup_operations = {
> + .open = pid_smaps_rollup_open,
> + .read = seq_read,
> + .llseek = seq_lseek,
> + .release = proc_map_release,
> +};
> +
> const struct file_operations proc_tid_smaps_operations = {
> .open = tid_smaps_open,
> .read = seq_read,