Re: [PATCH 3/3] mm/mmu_notifier: contextual information for event triggering invalidation

From: Mike Rapoport
Date: Tue Dec 04 2018 - 03:18:06 EST


On Mon, Dec 03, 2018 at 03:18:17PM -0500, jglisse@xxxxxxxxxx wrote:
> From: Jérôme Glisse <jglisse@xxxxxxxxxx>
>
> CPU page table updates can happen for many reasons, not only as a result
> of a syscall (munmap(), mprotect(), mremap(), madvise(), ...) but also
> as a result of kernel activities (memory compression, reclaim, migration,
> ...).
>
> Users of the mmu notifier API track changes to the CPU page table and take
> specific actions for them. However, the current API only provides the range
> of virtual addresses affected by the change, not why the change is happening.
>
> This patchset adds event information so that users of mmu notifier can
> differentiate among broad categories:
> - UNMAP: munmap() or mremap()
> - CLEAR: page table is cleared (migration, compaction, reclaim, ...)
> - PROTECTION_VMA: change in access protections for the range
> - PROTECTION_PAGE: change in access protections for pages in the range
> - SOFT_DIRTY: soft dirtiness tracking
>
> Being able to distinguish munmap() and mremap() from the other reasons why
> the page table is cleared is important to allow users of mmu notifier to
> update their own internal tracking structures accordingly (on munmap or
> mremap it is no longer needed to track the range of virtual addresses as it
> becomes invalid).
>
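With this, a user of the notifier can filter on the event in its
invalidate_range_start() callback, something along these lines (just a
sketch against the range-struct callback introduced earlier in the series;
the mirror structure and mirror_* helpers are made up):

	static int mirror_invalidate_range_start(struct mmu_notifier *mn,
				const struct mmu_notifier_range *range)
	{
		struct mirror *m = container_of(mn, struct mirror, notifier);

		switch (range->event) {
		case MMU_NOTIFY_UNMAP:
			/* The range is gone for good, drop our tracking of it. */
			mirror_drop_range(m, range->start, range->end);
			break;
		default:
			/* Entries changed but the range itself stays valid. */
			mirror_invalidate_range(m, range->start, range->end);
			break;
		}

		return 0;
	}
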
> Signed-off-by: Jérôme Glisse <jglisse@xxxxxxxxxx>
> Cc: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx>
> Cc: Matthew Wilcox <mawilcox@xxxxxxxxxxxxx>
> Cc: Ross Zwisler <zwisler@xxxxxxxxxx>
> Cc: Jan Kara <jack@xxxxxxx>
> Cc: Dan Williams <dan.j.williams@xxxxxxxxx>
> Cc: Paolo Bonzini <pbonzini@xxxxxxxxxx>
> Cc: Radim Krčmář <rkrcmar@xxxxxxxxxx>
> Cc: Michal Hocko <mhocko@xxxxxxxxxx>
> Cc: Christian Koenig <christian.koenig@xxxxxxx>
> Cc: Felix Kuehling <felix.kuehling@xxxxxxx>
> Cc: Ralph Campbell <rcampbell@xxxxxxxxxx>
> Cc: John Hubbard <jhubbard@xxxxxxxxxx>
> Cc: kvm@xxxxxxxxxxxxxxx
> Cc: linux-rdma@xxxxxxxxxxxxxxx
> Cc: linux-fsdevel@xxxxxxxxxxxxxxx
> Cc: dri-devel@xxxxxxxxxxxxxxxxxxxxx
> ---
> fs/dax.c | 1 +
> fs/proc/task_mmu.c | 1 +
> include/linux/mmu_notifier.h | 33 +++++++++++++++++++++++++++++++++
> kernel/events/uprobes.c | 1 +
> mm/huge_memory.c | 4 ++++
> mm/hugetlb.c | 4 ++++
> mm/khugepaged.c | 1 +
> mm/ksm.c | 2 ++
> mm/madvise.c | 1 +
> mm/memory.c | 5 +++++
> mm/migrate.c | 2 ++
> mm/mprotect.c | 1 +
> mm/mremap.c | 1 +
> mm/oom_kill.c | 1 +
> mm/rmap.c | 2 ++
> 15 files changed, 60 insertions(+)
>
> diff --git a/fs/dax.c b/fs/dax.c
> index e22508ee19ec..83092c5ac5f0 100644
> --- a/fs/dax.c
> +++ b/fs/dax.c
> @@ -761,6 +761,7 @@ static void dax_entry_mkclean(struct address_space *mapping, pgoff_t index,
> struct mmu_notifier_range range;
> unsigned long address;
>
> + range.event = MMU_NOTIFY_PROTECTION_PAGE;
> range.mm = vma->vm_mm;
>
> cond_resched();
> diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
> index 53d625925669..4abb1668eeb3 100644
> --- a/fs/proc/task_mmu.c
> +++ b/fs/proc/task_mmu.c
> @@ -1144,6 +1144,7 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
> range.start = 0;
> range.end = -1UL;
> range.mm = mm;
> + range.event = MMU_NOTIFY_SOFT_DIRTY;
> mmu_notifier_invalidate_range_start(&range);
> }
> walk_page_range(0, mm->highest_vm_end, &clear_refs_walk);
> diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
> index cbeece8e47d4..3077d487be8b 100644
> --- a/include/linux/mmu_notifier.h
> +++ b/include/linux/mmu_notifier.h
> @@ -25,10 +25,43 @@ struct mmu_notifier_mm {
> spinlock_t lock;
> };
>
> +/*
> + * What event is triggering the invalidation:

Can you please make it a kernel-doc comment?
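Something along these lines would do (just a sketch, reusing the
descriptions from the comment):

/**
 * enum mmu_notifier_event - reason for the page table invalidation
 *
 * @MMU_NOTIFY_UNMAP: either a munmap() that unmaps the range or a mremap()
 * that moves the range.
 *
 * @MMU_NOTIFY_CLEAR: page table entry is cleared (many reasons for this,
 * like madvise() or replacing a page with another one, ...).
 *
 * @MMU_NOTIFY_PROTECTION_VMA: update is due to a protection change for the
 * range, i.e. using the vma access permission (vm_page_prot) to update the
 * whole range is enough, no need to inspect changes to the CPU page table
 * (mprotect() syscall).
 *
 * @MMU_NOTIFY_PROTECTION_PAGE: update is due to a change in the read/write
 * flags for pages in the range, so to mirror those changes the user must
 * inspect the CPU page table (from the end callback).
 *
 * @MMU_NOTIFY_SOFT_DIRTY: soft dirty accounting (still the same page and
 * the same access flags).
 */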

> + *
> + * MMU_NOTIFY_UNMAP
> + * either a munmap() that unmaps the range or a mremap() that moves the range
> + *
> + * MMU_NOTIFY_CLEAR
> + * clear page table entry (many reasons for this, like madvise() or replacing
> + * a page with another one, ...).
> + *
> + * MMU_NOTIFY_PROTECTION_VMA
> + * update is due to a protection change for the range, i.e. using the vma
> + * access permission (vm_page_prot) to update the whole range is enough, no
> + * need to inspect changes to the CPU page table (mprotect() syscall)
> + *
> + * MMU_NOTIFY_PROTECTION_PAGE
> + * update is due to a change in the read/write flags for pages in the range,
> + * so to mirror those changes the user must inspect the CPU page table (from
> + * the end callback).
> + *
> + *
> + * MMU_NOTIFY_SOFT_DIRTY
> + * soft dirty accounting (still the same page and the same access flags)
> + */
> +enum mmu_notifier_event {
> + MMU_NOTIFY_UNMAP = 0,
> + MMU_NOTIFY_CLEAR,
> + MMU_NOTIFY_PROTECTION_VMA,
> + MMU_NOTIFY_PROTECTION_PAGE,
> + MMU_NOTIFY_SOFT_DIRTY,
> +};
> +
> struct mmu_notifier_range {
> struct mm_struct *mm;
> unsigned long start;
> unsigned long end;
> + enum mmu_notifier_event event;
> bool blockable;
> };
>
> diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
> index aa7996ca361e..b6ef3be1c24e 100644
> --- a/kernel/events/uprobes.c
> +++ b/kernel/events/uprobes.c
> @@ -174,6 +174,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
> struct mmu_notifier_range range;
> struct mem_cgroup *memcg;
>
> + range.event = MMU_NOTIFY_CLEAR;
> range.start = addr;
> range.end = addr + PAGE_SIZE;
> range.mm = mm;
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index 1a7a059dbf7d..4919be71ffd0 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -1182,6 +1182,7 @@ static vm_fault_t do_huge_pmd_wp_page_fallback(struct vm_fault *vmf,
> cond_resched();
> }
>
> + range.event = MMU_NOTIFY_CLEAR;
> range.start = haddr;
> range.end = range.start + HPAGE_PMD_SIZE;
> range.mm = vma->vm_mm;
> @@ -1347,6 +1348,7 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd)
> vma, HPAGE_PMD_NR);
> __SetPageUptodate(new_page);
>
> + range.event = MMU_NOTIFY_CLEAR;
> range.start = haddr;
> range.end = range.start + HPAGE_PMD_SIZE;
> range.mm = vma->vm_mm;
> @@ -2029,6 +2031,7 @@ void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud,
> struct mm_struct *mm = vma->vm_mm;
> struct mmu_notifier_range range;
>
> + range.event = MMU_NOTIFY_CLEAR;
> range.start = address & HPAGE_PUD_MASK;
> range.end = range.start + HPAGE_PUD_SIZE;
> range.mm = mm;
> @@ -2248,6 +2251,7 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
> struct mm_struct *mm = vma->vm_mm;
> struct mmu_notifier_range range;
>
> + range.event = MMU_NOTIFY_CLEAR;
> range.start = address & HPAGE_PMD_MASK;
> range.end = range.start + HPAGE_PMD_SIZE;
> range.mm = mm;
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index 4bfbdab44d51..9ffe34173834 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -3244,6 +3244,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
>
> cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
>
> + range.event = MMU_NOTIFY_CLEAR;
> range.start = vma->vm_start;
> range.end = vma->vm_end;
> range.mm = src;
> @@ -3344,6 +3345,7 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
> unsigned long sz = huge_page_size(h);
> struct mmu_notifier_range range;
>
> + range.event = MMU_NOTIFY_CLEAR;
> range.start = start;
> range.end = end;
> range.mm = mm;
> @@ -3629,6 +3631,7 @@ static vm_fault_t hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
> __SetPageUptodate(new_page);
> set_page_huge_active(new_page);
>
> + range.event = MMU_NOTIFY_CLEAR;
> range.start = haddr;
> range.end = range.start + huge_page_size(h);
> range.mm = mm;
> @@ -4346,6 +4349,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
> bool shared_pmd = false;
> struct mmu_notifier_range range;
>
> + range.event = MMU_NOTIFY_PROTECTION_VMA;
> range.start = start;
> range.end = end;
> range.mm = mm;
> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> index e9fe0c9a9f56..c5c78ba30b38 100644
> --- a/mm/khugepaged.c
> +++ b/mm/khugepaged.c
> @@ -1016,6 +1016,7 @@ static void collapse_huge_page(struct mm_struct *mm,
> pte = pte_offset_map(pmd, address);
> pte_ptl = pte_lockptr(mm, pmd);
>
> + range.event = MMU_NOTIFY_CLEAR;
> range.start = address;
> range.end = range.start + HPAGE_PMD_SIZE;
> range.mm = mm;
> diff --git a/mm/ksm.c b/mm/ksm.c
> index 262694d0cd4c..f8fbb92ca1bd 100644
> --- a/mm/ksm.c
> +++ b/mm/ksm.c
> @@ -1050,6 +1050,7 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page,
>
> BUG_ON(PageTransCompound(page));
>
> + range.event = MMU_NOTIFY_CLEAR;
> range.start = pvmw.address;
> range.end = range.start + PAGE_SIZE;
> range.mm = mm;
> @@ -1139,6 +1140,7 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
> if (!pmd)
> goto out;
>
> + range.event = MMU_NOTIFY_CLEAR;
> range.start = addr;
> range.end = addr + PAGE_SIZE;
> range.mm = mm;
> diff --git a/mm/madvise.c b/mm/madvise.c
> index f20dd80ca21b..c415985d6a04 100644
> --- a/mm/madvise.c
> +++ b/mm/madvise.c
> @@ -466,6 +466,7 @@ static int madvise_free_single_vma(struct vm_area_struct *vma,
> if (!vma_is_anonymous(vma))
> return -EINVAL;
>
> + range.event = MMU_NOTIFY_CLEAR;
> range.start = max(vma->vm_start, start_addr);
> if (range.start >= vma->vm_end)
> return -EINVAL;
> diff --git a/mm/memory.c b/mm/memory.c
> index 36e0b83949fc..4ad63002d770 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -1007,6 +1007,7 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
> * is_cow_mapping() returns true.
> */
> is_cow = is_cow_mapping(vma->vm_flags);
> + range.event = MMU_NOTIFY_PROTECTION_PAGE;
> range.start = addr;
> range.end = end;
> range.mm = src_mm;
> @@ -1334,6 +1335,7 @@ void unmap_vmas(struct mmu_gather *tlb,
> {
> struct mmu_notifier_range range;
>
> + range.event = MMU_NOTIFY_UNMAP;
> range.start = start_addr;
> range.end = end_addr;
> range.mm = vma->vm_mm;
> @@ -1358,6 +1360,7 @@ void zap_page_range(struct vm_area_struct *vma, unsigned long start,
> struct mmu_notifier_range range;
> struct mmu_gather tlb;
>
> + range.event = MMU_NOTIFY_CLEAR;
> range.start = start;
> range.end = range.start + size;
> range.mm = vma->vm_mm;
> @@ -1387,6 +1390,7 @@ static void zap_page_range_single(struct vm_area_struct *vma, unsigned long addr
> struct mmu_notifier_range range;
> struct mmu_gather tlb;
>
> + range.event = MMU_NOTIFY_CLEAR;
> range.start = address;
> range.end = range.start + size;
> range.mm = vma->vm_mm;
> @@ -2260,6 +2264,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
> struct mem_cgroup *memcg;
> struct mmu_notifier_range range;
>
> + range.event = MMU_NOTIFY_CLEAR;
> range.start = vmf->address & PAGE_MASK;
> range.end = range.start + PAGE_SIZE;
> range.mm = mm;
> diff --git a/mm/migrate.c b/mm/migrate.c
> index 4896dd9d8b28..a2caaabfc5a1 100644
> --- a/mm/migrate.c
> +++ b/mm/migrate.c
> @@ -2306,6 +2306,7 @@ static void migrate_vma_collect(struct migrate_vma *migrate)
> struct mmu_notifier_range range;
> struct mm_walk mm_walk;
>
> + range.event = MMU_NOTIFY_CLEAR;
> range.start = migrate->start;
> range.end = migrate->end;
> range.mm = mm_walk.mm;
> @@ -2726,6 +2727,7 @@ static void migrate_vma_pages(struct migrate_vma *migrate)
> if (!notified) {
> notified = true;
>
> + range.event = MMU_NOTIFY_CLEAR;
> range.start = addr;
> range.end = migrate->end;
> range.mm = mm;
> diff --git a/mm/mprotect.c b/mm/mprotect.c
> index f466adf31e12..6d41321b2f3e 100644
> --- a/mm/mprotect.c
> +++ b/mm/mprotect.c
> @@ -186,6 +186,7 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
>
> /* invoke the mmu notifier if the pmd is populated */
> if (!range.start) {
> + range.event = MMU_NOTIFY_PROTECTION_VMA;
> range.start = addr;
> range.end = end;
> range.mm = mm;
> diff --git a/mm/mremap.c b/mm/mremap.c
> index db060acb4a8c..856a5e6bb226 100644
> --- a/mm/mremap.c
> +++ b/mm/mremap.c
> @@ -203,6 +203,7 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
> old_end = old_addr + len;
> flush_cache_range(vma, old_addr, old_end);
>
> + range.event = MMU_NOTIFY_UNMAP;
> range.start = old_addr;
> range.end = old_end;
> range.mm = vma->vm_mm;
> diff --git a/mm/oom_kill.c b/mm/oom_kill.c
> index b29ab2624e95..f4bde1c34714 100644
> --- a/mm/oom_kill.c
> +++ b/mm/oom_kill.c
> @@ -519,6 +519,7 @@ bool __oom_reap_task_mm(struct mm_struct *mm)
> struct mmu_notifier_range range;
> struct mmu_gather tlb;
>
> + range.event = MMU_NOTIFY_CLEAR;
> range.start = vma->vm_start;
> range.end = vma->vm_end;
> range.mm = mm;
> diff --git a/mm/rmap.c b/mm/rmap.c
> index 09c5d9e5c766..b1afbbcc236a 100644
> --- a/mm/rmap.c
> +++ b/mm/rmap.c
> @@ -896,6 +896,7 @@ static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma,
> * We have to assume the worse case ie pmd for invalidation. Note that
> * the page can not be free from this function.
> */
> + range.event = MMU_NOTIFY_PROTECTION_PAGE;
> range.mm = vma->vm_mm;
> range.start = address;
> range.end = min(vma->vm_end, range.start +
> @@ -1372,6 +1373,7 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
> * Note that the page can not be free in this function as call of
> * try_to_unmap() must hold a reference on the page.
> */
> + range.event = MMU_NOTIFY_CLEAR;
> range.mm = vma->vm_mm;
> range.start = vma->vm_start;
> range.end = min(vma->vm_end, range.start +
> --
> 2.17.2
>

--
Sincerely yours,
Mike.