From: Barry Song <v-songbaohua@xxxxxxxx>
MADV_FREE is another option, besides MADV_DONTNEED, for dynamic memory
freeing in user-space native or Java heap memory management. For example,
jemalloc can be configured to use MADV_FREE, and recent versions of the
Android Java heap have also increasingly adopted MADV_FREE. Supporting
per-VMA locking for MADV_FREE thus appears increasingly necessary.
We have replaced walk_page_range() with walk_page_range_vma(). Together
with the madvise_lock_mode proposed by Lorenzo, the necessary
infrastructure is now in place to begin exploring per-VMA locking support
for MADV_FREE, and potentially for other madvise operations that use
walk_page_range_vma().
This patch adds support for the PGWALK_VMA_RDLOCK_VERIFY walk_lock mode
in walk_page_range_vma(), and uses the lock_mode (enum madvise_lock_mode)
recorded in madv_behavior to select the appropriate walk_lock: either
PGWALK_RDLOCK when the mmap_lock is held for read, or
PGWALK_VMA_RDLOCK_VERIFY when only the per-VMA read lock is held.
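Because walk_lock is now chosen per call, the walk ops can no longer be
shared: to ensure thread safety, the mm_walk_ops structure (previously the
global constant madvise_free_walk_ops) is now defined as a stack variable
in madvise_free_single_vma().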
Cc: Lorenzo Stoakes <lorenzo.stoakes@xxxxxxxxxx>
Cc: "Liam R. Howlett" <Liam.Howlett@xxxxxxxxxx>
Cc: David Hildenbrand <david@xxxxxxxxxx>
Cc: Vlastimil Babka <vbabka@xxxxxxx>
Cc: Jann Horn <jannh@xxxxxxxxxx>
Cc: Suren Baghdasaryan <surenb@xxxxxxxxxx>
Cc: Lokesh Gidra <lokeshgidra@xxxxxxxxxx>
Cc: Mike Rapoport <rppt@xxxxxxxxxx>
Cc: Michal Hocko <mhocko@xxxxxxxx>
Cc: Tangquan Zheng <zhengtangquan@xxxxxxxx>
Cc: Qi Zheng <zhengqi.arch@xxxxxxxxxxxxx>
Signed-off-by: Barry Song <v-songbaohua@xxxxxxxx>
---
include/linux/pagewalk.h | 2 ++
mm/madvise.c | 20 ++++++++++++++------
mm/pagewalk.c | 6 ++++++
3 files changed, 22 insertions(+), 6 deletions(-)
diff --git a/include/linux/pagewalk.h b/include/linux/pagewalk.h
index 9700a29f8afb..a4afa64ef0ab 100644
--- a/include/linux/pagewalk.h
+++ b/include/linux/pagewalk.h
@@ -14,6 +14,8 @@ enum page_walk_lock {
PGWALK_WRLOCK = 1,
/* vma is expected to be already write-locked during the walk */
PGWALK_WRLOCK_VERIFY = 2,
+ /* vma is expected to be already read-locked during the walk */
+ PGWALK_VMA_RDLOCK_VERIFY = 3,
};
/**
diff --git a/mm/madvise.c b/mm/madvise.c
index 381eedde8f6d..23d58eb31c8f 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -775,10 +775,14 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
return 0;
}
-static const struct mm_walk_ops madvise_free_walk_ops = {
- .pmd_entry = madvise_free_pte_range,
- .walk_lock = PGWALK_RDLOCK,
-};
+static inline enum page_walk_lock get_walk_lock(enum madvise_lock_mode mode)
+{
+ /* Only the two read-lock modes are expected here (warn once otherwise); other modes don't require fixing up the walk_lock. */
+ VM_WARN_ON_ONCE(mode != MADVISE_VMA_READ_LOCK &&
+ mode != MADVISE_MMAP_READ_LOCK);
+ return mode == MADVISE_VMA_READ_LOCK ?
+ PGWALK_VMA_RDLOCK_VERIFY : PGWALK_RDLOCK;
+}
static int madvise_free_single_vma(struct madvise_behavior *madv_behavior,
struct vm_area_struct *vma,
@@ -787,6 +791,9 @@ static int madvise_free_single_vma(struct madvise_behavior *madv_behavior,
struct mm_struct *mm = vma->vm_mm;
struct mmu_notifier_range range;
struct mmu_gather *tlb = madv_behavior->tlb;
+ struct mm_walk_ops walk_ops = {
+ .pmd_entry = madvise_free_pte_range,
+ };
/* MADV_FREE works for only anon vma at the moment */
if (!vma_is_anonymous(vma))
@@ -806,8 +813,9 @@ static int madvise_free_single_vma(struct madvise_behavior *madv_behavior,
mmu_notifier_invalidate_range_start(&range);
tlb_start_vma(tlb, vma);
+ walk_ops.walk_lock = get_walk_lock(madv_behavior->lock_mode);
walk_page_range_vma(vma, range.start, range.end,
- &madvise_free_walk_ops, tlb);
+ &walk_ops, tlb);
tlb_end_vma(tlb, vma);
mmu_notifier_invalidate_range_end(&range);
return 0;
@@ -1653,7 +1661,6 @@ static enum madvise_lock_mode get_lock_mode(struct madvise_behavior *madv_behavi
case MADV_WILLNEED:
case MADV_COLD:
case MADV_PAGEOUT:
- case MADV_FREE:
case MADV_POPULATE_READ:
case MADV_POPULATE_WRITE:
case MADV_COLLAPSE:
@@ -1662,6 +1669,7 @@ static enum madvise_lock_mode get_lock_mode(struct madvise_behavior *madv_behavi
return MADVISE_MMAP_READ_LOCK;
case MADV_DONTNEED:
case MADV_DONTNEED_LOCKED:
+ case MADV_FREE:
return MADVISE_VMA_READ_LOCK;
default:
return MADVISE_MMAP_WRITE_LOCK;
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index e478777c86e1..c984aacc5552 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -420,6 +420,9 @@ static int __walk_page_range(unsigned long start, unsigned long end,
static inline void process_mm_walk_lock(struct mm_struct *mm,
enum page_walk_lock walk_lock)
{
+ if (walk_lock == PGWALK_VMA_RDLOCK_VERIFY)
+ return;
+
if (walk_lock == PGWALK_RDLOCK)
mmap_assert_locked(mm);
else
@@ -437,6 +440,9 @@ static inline void process_vma_walk_lock(struct vm_area_struct *vma,
case PGWALK_WRLOCK_VERIFY:
vma_assert_write_locked(vma);
break;
+ case PGWALK_VMA_RDLOCK_VERIFY:
+ vma_assert_locked(vma);
+ break;
case PGWALK_RDLOCK:
/* PGWALK_RDLOCK is handled by process_mm_walk_lock */
break;