Re: [PATCH v6 7/8] fs/proc/task_mmu: read proc/pid/maps under per-vma lock

From: Liam R. Howlett
Date: Mon Jul 07 2025 - 14:21:52 EST


* Suren Baghdasaryan <surenb@xxxxxxxxxx> [250704 02:07]:
> With maple_tree supporting vma tree traversal under RCU and per-vma
> locks, /proc/pid/maps can be read while holding individual vma locks
> instead of locking the entire address space.
> Completely lockless approach (walking vma tree under RCU) would be quite
> complex with the main issue being get_vma_name() using callbacks which
> might not work correctly with a stable vma copy, requiring original
> (unstable) vma - see special_mapping_name() for an example.
> When per-vma lock acquisition fails, we take the mmap_lock for reading,
> lock the vma, release the mmap_lock and continue. This fallback to mmap
> read lock guarantees the reader to make forward progress even during
> lock contention. This will interfere with the writer but for a very
> short time while we are acquiring the per-vma lock and only when there
> was contention on the vma reader is interested in. We shouldn't see a
> repeated fallback to mmap read locks in practice, as this require a
> very unlikely series of lock contentions (for instance due to repeated
> vma split operations). However even if this did somehow happen, we would
> still progress.
> One case requiring special handling is when vma changes between the
> time it was found and the time it got locked. A problematic case would
> be if vma got shrunk so that it's start moved higher in the address
> space and a new vma was installed at the beginning:
>
> reader found: |--------VMA A--------|
> VMA is modified: |-VMA B-|----VMA A----|
> reader locks modified VMA A
> reader reports VMA A: | gap |----VMA A----|
>
> This would result in reporting a gap in the address space that does not
> exist. To prevent this we retry the lookup after locking the vma, however
> we do that only when we identify a gap and detect that the address space
> was changed after we found the vma.
> This change is designed to reduce mmap_lock contention and prevent a
> process reading /proc/pid/maps files (often a low priority task, such
> as monitoring/data collection services) from blocking address space
> updates. Note that this change has a userspace visible disadvantage:
> it allows for sub-page data tearing as opposed to the previous mechanism
> where data tearing could happen only between pages of generated output
> data. Since current userspace considers data tearing between pages to be
> acceptable, we assume is will be able to handle sub-page data tearing
> as well.
>
> Signed-off-by: Suren Baghdasaryan <surenb@xxxxxxxxxx>

Reviewed-by: Liam R. Howlett <Liam.Howlett@xxxxxxxxxx>

> ---
> fs/proc/internal.h | 5 ++
> fs/proc/task_mmu.c | 118 ++++++++++++++++++++++++++++++++++----
> include/linux/mmap_lock.h | 11 ++++
> mm/madvise.c | 3 +-
> mm/mmap_lock.c | 88 ++++++++++++++++++++++++++++
> 5 files changed, 214 insertions(+), 11 deletions(-)
>
> diff --git a/fs/proc/internal.h b/fs/proc/internal.h
> index 3d48ffe72583..7c235451c5ea 100644
> --- a/fs/proc/internal.h
> +++ b/fs/proc/internal.h
> @@ -384,6 +384,11 @@ struct proc_maps_private {
> struct task_struct *task;
> struct mm_struct *mm;
> struct vma_iterator iter;
> + loff_t last_pos;
> +#ifdef CONFIG_PER_VMA_LOCK
> + bool mmap_locked;
> + struct vm_area_struct *locked_vma;
> +#endif
> #ifdef CONFIG_NUMA
> struct mempolicy *task_mempolicy;
> #endif
> diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
> index b8bc06d05a72..ff3fe488ce51 100644
> --- a/fs/proc/task_mmu.c
> +++ b/fs/proc/task_mmu.c
> @@ -127,15 +127,107 @@ static void release_task_mempolicy(struct proc_maps_private *priv)
> }
> #endif
>
> -static struct vm_area_struct *proc_get_vma(struct proc_maps_private *priv,
> - loff_t *ppos)
> +#ifdef CONFIG_PER_VMA_LOCK
> +
> +static void unlock_vma(struct proc_maps_private *priv)
> +{
> + if (priv->locked_vma) {
> + vma_end_read(priv->locked_vma);
> + priv->locked_vma = NULL;
> + }
> +}
> +
> +static const struct seq_operations proc_pid_maps_op;
> +
> +static inline bool lock_vma_range(struct seq_file *m,
> + struct proc_maps_private *priv)
> +{
> + /*
> + * smaps and numa_maps perform page table walk, therefore require
> + * mmap_lock but maps can be read with locking just the vma.
> + */
> + if (m->op != &proc_pid_maps_op) {
> + if (mmap_read_lock_killable(priv->mm))
> + return false;
> +
> + priv->mmap_locked = true;
> + } else {
> + rcu_read_lock();
> + priv->locked_vma = NULL;
> + priv->mmap_locked = false;
> + }
> +
> + return true;
> +}
> +
> +static inline void unlock_vma_range(struct proc_maps_private *priv)
> +{
> + if (priv->mmap_locked) {
> + mmap_read_unlock(priv->mm);
> + } else {
> + unlock_vma(priv);
> + rcu_read_unlock();
> + }
> +}
> +
> +static struct vm_area_struct *get_next_vma(struct proc_maps_private *priv,
> + loff_t last_pos)
> +{
> + struct vm_area_struct *vma;
> +
> + if (priv->mmap_locked)
> + return vma_next(&priv->iter);
> +
> + unlock_vma(priv);
> + vma = lock_next_vma(priv->mm, &priv->iter, last_pos);
> + if (!IS_ERR_OR_NULL(vma))
> + priv->locked_vma = vma;
> +
> + return vma;
> +}
> +
> +#else /* CONFIG_PER_VMA_LOCK */
> +
> +static inline bool lock_vma_range(struct seq_file *m,
> + struct proc_maps_private *priv)
> {
> - struct vm_area_struct *vma = vma_next(&priv->iter);
> + return mmap_read_lock_killable(priv->mm) == 0;
> +}
> +
> +static inline void unlock_vma_range(struct proc_maps_private *priv)
> +{
> + mmap_read_unlock(priv->mm);
> +}
> +
> +static struct vm_area_struct *get_next_vma(struct proc_maps_private *priv,
> + loff_t last_pos)
> +{
> + return vma_next(&priv->iter);
> +}
>
> +#endif /* CONFIG_PER_VMA_LOCK */
> +
> +static struct vm_area_struct *proc_get_vma(struct seq_file *m, loff_t *ppos)
> +{
> + struct proc_maps_private *priv = m->private;
> + struct vm_area_struct *vma;
> +
> + vma = get_next_vma(priv, *ppos);
> + /* EINTR is possible */
> + if (IS_ERR(vma))
> + return vma;
> +
> + /* Store previous position to be able to restart if needed */
> + priv->last_pos = *ppos;
> if (vma) {
> - *ppos = vma->vm_start;
> + /*
> + * Track the end of the reported vma to ensure position changes
> + * even if previous vma was merged with the next vma and we
> + * found the extended vma with the same vm_start.
> + */
> + *ppos = vma->vm_end;
> } else {
> - *ppos = -2;
> + *ppos = -2; /* -2 indicates gate vma */
> vma = get_gate_vma(priv->mm);
> }
>
> @@ -163,28 +255,34 @@ static void *m_start(struct seq_file *m, loff_t *ppos)
> return NULL;
> }
>
> - if (mmap_read_lock_killable(mm)) {
> + if (!lock_vma_range(m, priv)) {
> mmput(mm);
> put_task_struct(priv->task);
> priv->task = NULL;
> return ERR_PTR(-EINTR);
> }
>
> + /*
> + * Reset current position if last_addr was set before
> + * and it's not a sentinel.
> + */
> + if (last_addr > 0)
> + *ppos = last_addr = priv->last_pos;
> vma_iter_init(&priv->iter, mm, (unsigned long)last_addr);
> hold_task_mempolicy(priv);
> if (last_addr == -2)
> return get_gate_vma(mm);
>
> - return proc_get_vma(priv, ppos);
> + return proc_get_vma(m, ppos);
> }
>
> static void *m_next(struct seq_file *m, void *v, loff_t *ppos)
> {
> if (*ppos == -2) {
> - *ppos = -1;
> + *ppos = -1; /* -1 indicates no more vmas */
> return NULL;
> }
> - return proc_get_vma(m->private, ppos);
> + return proc_get_vma(m, ppos);
> }
>
> static void m_stop(struct seq_file *m, void *v)
> @@ -196,7 +294,7 @@ static void m_stop(struct seq_file *m, void *v)
> return;
>
> release_task_mempolicy(priv);
> - mmap_read_unlock(mm);
> + unlock_vma_range(priv);
> mmput(mm);
> put_task_struct(priv->task);
> priv->task = NULL;
> diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h
> index 5da384bd0a26..1f4f44951abe 100644
> --- a/include/linux/mmap_lock.h
> +++ b/include/linux/mmap_lock.h
> @@ -309,6 +309,17 @@ void vma_mark_detached(struct vm_area_struct *vma);
> struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm,
> unsigned long address);
>
> +/*
> + * Locks next vma pointed by the iterator. Confirms the locked vma has not
> + * been modified and will retry under mmap_lock protection if modification
> + * was detected. Should be called from read RCU section.
> + * Returns either a valid locked VMA, NULL if no more VMAs or -EINTR if the
> + * process was interrupted.
> + */
> +struct vm_area_struct *lock_next_vma(struct mm_struct *mm,
> + struct vma_iterator *iter,
> + unsigned long address);
> +
> #else /* CONFIG_PER_VMA_LOCK */
>
> static inline void mm_lock_seqcount_init(struct mm_struct *mm) {}
> diff --git a/mm/madvise.c b/mm/madvise.c
> index a34c2c89a53b..e61e32b2cd91 100644
> --- a/mm/madvise.c
> +++ b/mm/madvise.c
> @@ -108,7 +108,8 @@ void anon_vma_name_free(struct kref *kref)
>
> struct anon_vma_name *anon_vma_name(struct vm_area_struct *vma)
> {
> - mmap_assert_locked(vma->vm_mm);
> + if (!rwsem_is_locked(&vma->vm_mm->mmap_lock))
> + vma_assert_locked(vma);
>
> return vma->anon_name;
> }
> diff --git a/mm/mmap_lock.c b/mm/mmap_lock.c
> index 5f725cc67334..ed0e5e2171cd 100644
> --- a/mm/mmap_lock.c
> +++ b/mm/mmap_lock.c
> @@ -178,6 +178,94 @@ struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm,
> count_vm_vma_lock_event(VMA_LOCK_ABORT);
> return NULL;
> }
> +
> +static struct vm_area_struct *lock_vma_under_mmap_lock(struct mm_struct *mm,
> + struct vma_iterator *iter,
> + unsigned long address)
> +{
> + struct vm_area_struct *vma;
> + int ret;
> +
> + ret = mmap_read_lock_killable(mm);
> + if (ret)
> + return ERR_PTR(ret);
> +
> + /* Lookup the vma at the last position again under mmap_read_lock */
> + vma_iter_init(iter, mm, address);
> + vma = vma_next(iter);
> + if (vma)
> + vma_start_read_locked(vma);
> +
> + mmap_read_unlock(mm);
> +
> + return vma;
> +}
> +
> +struct vm_area_struct *lock_next_vma(struct mm_struct *mm,
> + struct vma_iterator *iter,
> + unsigned long address)
> +{
> + struct vm_area_struct *vma;
> + unsigned int mm_wr_seq;
> + bool mmap_unlocked;
> +
> + RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "no rcu read lock held");
> +retry:
> + /* Start mmap_lock speculation in case we need to verify the vma later */
> + mmap_unlocked = mmap_lock_speculate_try_begin(mm, &mm_wr_seq);
> + vma = vma_next(iter);
> + if (!vma)
> + return NULL;
> +
> + vma = vma_start_read(mm, vma);
> +
> + if (IS_ERR_OR_NULL(vma)) {
> + /*
> + * Retry immediately if the vma gets detached from under us.
> + * Infinite loop should not happen because the vma we find will
> + * have to be constantly knocked out from under us.
> + */
> + if (PTR_ERR(vma) == -EAGAIN) {
> + vma_iter_init(iter, mm, address);
> + goto retry;
> + }
> +
> + goto out;
> + }
> +
> + /*
> + * Verify the vma we locked belongs to the same address space and it's
> + * not behind of the last search position.
> + */
> + if (unlikely(vma->vm_mm != mm || address >= vma->vm_end))
> + goto out_unlock;
> +
> + /*
> + * vma can be ahead of the last search position but we need to verify
> + * it was not shrunk after we found it and another vma has not been
> + * installed ahead of it. Otherwise we might observe a gap that should
> + * not be there.
> + */
> + if (address < vma->vm_start) {
> + /* Verify only if the address space might have changed since vma lookup. */
> + if (!mmap_unlocked || mmap_lock_speculate_retry(mm, mm_wr_seq)) {
> + vma_iter_init(iter, mm, address);
> + if (vma != vma_next(iter))
> + goto out_unlock;
> + }
> + }
> +
> + return vma;
> +
> +out_unlock:
> + vma_end_read(vma);
> +out:
> + rcu_read_unlock();
> + vma = lock_vma_under_mmap_lock(mm, iter, address);
> + rcu_read_lock();
> +
> + return vma;
> +}
> #endif /* CONFIG_PER_VMA_LOCK */
>
> #ifdef CONFIG_LOCK_MM_AND_FIND_VMA
> --
> 2.50.0.727.gbf7dc18ff4-goog
>