[PATCH v2] mm: madvise: use per_vma lock for MADV_FREE
From: Barry Song
Date: Wed Jun 11 2025 - 06:51:35 EST
From: Barry Song <v-songbaohua@xxxxxxxx>
MADV_FREE is another option, besides MADV_DONTNEED, for dynamic memory
freeing in user-space native or Java heap memory management. For example,
jemalloc can be configured to use MADV_FREE, and recent versions of the
Android Java heap have also increasingly adopted MADV_FREE. Supporting
per-VMA locking for MADV_FREE thus appears increasingly necessary.
We have replaced walk_page_range() with walk_page_range_vma(). Along with
the proposed madvise_lock_mode by Lorenzo, the necessary infrastructure is
now in place to begin exploring per-VMA locking support for MADV_FREE and
potentially other madvise using walk_page_range_vma().
This patch adds support for the PGWALK_VMA_RDLOCK walk_lock mode in
walk_page_range_vma(), and leverages madvise_lock_mode from
madv_behavior to select the appropriate walk_lock—either mmap_lock or
per-VMA lock—based on the context.
Because we now dynamically update the walk_ops->walk_lock field, we
must ensure this is thread-safe. The madvise_free_walk_ops is now
defined as a stack variable instead of a global constant.
Acked-by: David Hildenbrand <david@xxxxxxxxxx>
Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@xxxxxxxxxx>
Cc: "Liam R. Howlett" <Liam.Howlett@xxxxxxxxxx>
Cc: Vlastimil Babka <vbabka@xxxxxxx>
Cc: Jann Horn <jannh@xxxxxxxxxx>
Cc: Suren Baghdasaryan <surenb@xxxxxxxxxx>
Cc: Lokesh Gidra <lokeshgidra@xxxxxxxxxx>
Cc: Mike Rapoport <rppt@xxxxxxxxxx>
Cc: Michal Hocko <mhocko@xxxxxxxx>
Cc: Tangquan Zheng <zhengtangquan@xxxxxxxx>
Cc: Qi Zheng <zhengqi.arch@xxxxxxxxxxxxx>
Signed-off-by: Barry Song <v-songbaohua@xxxxxxxx>
---
-v2:
* Collect David's acked-by and Lorenzo's reviewed-by;
* refine changelog and code cleanup according to David and Lorenzo
include/linux/pagewalk.h | 2 ++
mm/madvise.c | 25 +++++++++++++++++++------
mm/pagewalk.c | 5 ++++-
3 files changed, 25 insertions(+), 7 deletions(-)
diff --git a/include/linux/pagewalk.h b/include/linux/pagewalk.h
index 9700a29f8afb..a4afa64ef0ab 100644
--- a/include/linux/pagewalk.h
+++ b/include/linux/pagewalk.h
@@ -14,6 +14,8 @@ enum page_walk_lock {
PGWALK_WRLOCK = 1,
/* vma is expected to be already write-locked during the walk */
PGWALK_WRLOCK_VERIFY = 2,
+ /* vma is expected to be already read-locked during the walk */
+ PGWALK_VMA_RDLOCK_VERIFY = 3,
};
/**
diff --git a/mm/madvise.c b/mm/madvise.c
index 790c238d04d4..267d8e4adf31 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -777,10 +777,19 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
return 0;
}
-static const struct mm_walk_ops madvise_free_walk_ops = {
- .pmd_entry = madvise_free_pte_range,
- .walk_lock = PGWALK_RDLOCK,
-};
+static inline enum page_walk_lock get_walk_lock(enum madvise_lock_mode mode)
+{
+ switch (mode) {
+ case MADVISE_VMA_READ_LOCK:
+ return PGWALK_VMA_RDLOCK_VERIFY;
+ case MADVISE_MMAP_READ_LOCK:
+ return PGWALK_RDLOCK;
+ default:
+ /* Other modes don't require fixing up the walk_lock */
+ WARN_ON_ONCE(1);
+ return PGWALK_RDLOCK;
+ }
+}
static int madvise_free_single_vma(struct madvise_behavior *madv_behavior,
struct vm_area_struct *vma,
@@ -789,6 +798,9 @@ static int madvise_free_single_vma(struct madvise_behavior *madv_behavior,
struct mm_struct *mm = vma->vm_mm;
struct mmu_notifier_range range;
struct mmu_gather *tlb = madv_behavior->tlb;
+ struct mm_walk_ops walk_ops = {
+ .pmd_entry = madvise_free_pte_range,
+ };
/* MADV_FREE works for only anon vma at the moment */
if (!vma_is_anonymous(vma))
@@ -808,8 +820,9 @@ static int madvise_free_single_vma(struct madvise_behavior *madv_behavior,
mmu_notifier_invalidate_range_start(&range);
tlb_start_vma(tlb, vma);
+ walk_ops.walk_lock = get_walk_lock(madv_behavior->lock_mode);
walk_page_range_vma(vma, range.start, range.end,
- &madvise_free_walk_ops, tlb);
+ &walk_ops, tlb);
tlb_end_vma(tlb, vma);
mmu_notifier_invalidate_range_end(&range);
return 0;
@@ -1655,7 +1668,6 @@ static enum madvise_lock_mode get_lock_mode(struct madvise_behavior *madv_behavi
case MADV_WILLNEED:
case MADV_COLD:
case MADV_PAGEOUT:
- case MADV_FREE:
case MADV_POPULATE_READ:
case MADV_POPULATE_WRITE:
case MADV_COLLAPSE:
@@ -1664,6 +1676,7 @@ static enum madvise_lock_mode get_lock_mode(struct madvise_behavior *madv_behavi
return MADVISE_MMAP_READ_LOCK;
case MADV_DONTNEED:
case MADV_DONTNEED_LOCKED:
+ case MADV_FREE:
return MADVISE_VMA_READ_LOCK;
default:
return MADVISE_MMAP_WRITE_LOCK;
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index e478777c86e1..74f623159f7b 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -422,7 +422,7 @@ static inline void process_mm_walk_lock(struct mm_struct *mm,
{
if (walk_lock == PGWALK_RDLOCK)
mmap_assert_locked(mm);
- else
+ else if (walk_lock != PGWALK_VMA_RDLOCK_VERIFY)
mmap_assert_write_locked(mm);
}
@@ -437,6 +437,9 @@ static inline void process_vma_walk_lock(struct vm_area_struct *vma,
case PGWALK_WRLOCK_VERIFY:
vma_assert_write_locked(vma);
break;
+ case PGWALK_VMA_RDLOCK_VERIFY:
+ vma_assert_locked(vma);
+ break;
case PGWALK_RDLOCK:
/* PGWALK_RDLOCK is handled by process_mm_walk_lock */
break;
--
2.39.3 (Apple Git-146)