Re: [PATCH v1 03/11] mm: thp: add helpers related to thp/pmd migration
From: Naoya Horiguchi
Date: Thu Mar 03 2016 - 11:22:34 EST
On Thu, Mar 03, 2016 at 01:40:51PM +0300, Kirill A. Shutemov wrote:
> On Thu, Mar 03, 2016 at 04:41:50PM +0900, Naoya Horiguchi wrote:
> > This patch prepares the core code for thp migration. This code will be enabled
> > when unmap_and_move() stops unconditionally splitting thp and get_new_page()
> > starts to allocate destination thps.
> >
> > Signed-off-by: Naoya Horiguchi <n-horiguchi@xxxxxxxxxxxxx>
> > ---
> > arch/x86/include/asm/pgtable.h | 11 ++++++
> > arch/x86/include/asm/pgtable_64.h | 2 +
> > include/linux/swapops.h | 62 +++++++++++++++++++++++++++++++
> > mm/huge_memory.c | 78 +++++++++++++++++++++++++++++++++++++++
> > mm/migrate.c | 23 ++++++++++++
> > 5 files changed, 176 insertions(+)
> >
> > diff --git v4.5-rc5-mmotm-2016-02-24-16-18/arch/x86/include/asm/pgtable.h v4.5-rc5-mmotm-2016-02-24-16-18_patched/arch/x86/include/asm/pgtable.h
> > index 0687c47..0df9afe 100644
> > --- v4.5-rc5-mmotm-2016-02-24-16-18/arch/x86/include/asm/pgtable.h
> > +++ v4.5-rc5-mmotm-2016-02-24-16-18_patched/arch/x86/include/asm/pgtable.h
> > @@ -515,6 +515,17 @@ static inline int pmd_present(pmd_t pmd)
> > return pmd_flags(pmd) & (_PAGE_PRESENT | _PAGE_PROTNONE | _PAGE_PSE);
> > }
> >
> > +/*
> > + * Unlike pmd_present(), __pmd_present() checks only _PAGE_PRESENT bit.
> > + * Combined with is_migration_entry(), this routine is used to detect pmd
> > + * migration entries. To make it work fine, callers should make sure that
> > + * pmd_trans_huge() returns true beforehand.
> > + */
>
> Hm. I don't think this would fly. What prevents false positives for PROT_NONE
> pmds?
Nothing, actually, if __pmd_present() is used alone. However, __pmd_present()
is only called via is_pmd_migration_entry(), combined with is_migration_entry(),
and is_migration_entry() should return false for PROT_NONE pmds, because it only
matches the SWP_MIGRATION_READ/SWP_MIGRATION_WRITE swap types, which a PROT_NONE
pmd doesn't encode. But I admit it might not be robust enough.
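For reference, is_migration_entry() in swapops.h (with CONFIG_MIGRATION) only
matches the two migration swap types:

static inline int is_migration_entry(swp_entry_t entry)
{
	return unlikely(swp_type(entry) == SWP_MIGRATION_READ ||
			swp_type(entry) == SWP_MIGRATION_WRITE);
}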
>
> I guess the problem is _PAGE_PSE, right? I don't really understand why we
> need it in pmd_present().
Yes, the _PAGE_PSE check in pmd_present() is what makes this distinction
complicated. Some simplification seems necessary.
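(For reference, part of what makes it awkward: set_pmd_migration_entry() below
applies pmd_mkhuge() to the migration entry, and on x86 that sets _PAGE_PSE:

static inline pmd_t pmd_mkhuge(pmd_t pmd)
{
	return pmd_set_flags(pmd, _PAGE_PSE);
}

so pmd_present(), which also accepts _PAGE_PSE, still returns true for a pmd
migration entry; only the cleared _PAGE_PRESENT bit distinguishes it, hence
__pmd_present().)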
>
> Andrea?
>
> > +static inline int __pmd_present(pmd_t pmd)
> > +{
> > + return pmd_flags(pmd) & _PAGE_PRESENT;
> > +}
> > +
> > #ifdef CONFIG_NUMA_BALANCING
> > /*
> > * These work without NUMA balancing but the kernel does not care. See the
> > diff --git v4.5-rc5-mmotm-2016-02-24-16-18/arch/x86/include/asm/pgtable_64.h v4.5-rc5-mmotm-2016-02-24-16-18_patched/arch/x86/include/asm/pgtable_64.h
> > index 2ee7811..df869d0 100644
> > --- v4.5-rc5-mmotm-2016-02-24-16-18/arch/x86/include/asm/pgtable_64.h
> > +++ v4.5-rc5-mmotm-2016-02-24-16-18_patched/arch/x86/include/asm/pgtable_64.h
> > @@ -153,7 +153,9 @@ static inline int pgd_large(pgd_t pgd) { return 0; }
> > ((type) << (_PAGE_BIT_PRESENT + 1)) \
> > | ((offset) << SWP_OFFSET_SHIFT) })
> > #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val((pte)) })
> > +#define __pmd_to_swp_entry(pmd) ((swp_entry_t) { pmd_val((pmd)) })
> > #define __swp_entry_to_pte(x) ((pte_t) { .pte = (x).val })
> > +#define __swp_entry_to_pmd(x) ((pmd_t) { .pmd = (x).val })
> >
> > extern int kern_addr_valid(unsigned long addr);
> > extern void cleanup_highmap(void);
> > diff --git v4.5-rc5-mmotm-2016-02-24-16-18/include/linux/swapops.h v4.5-rc5-mmotm-2016-02-24-16-18_patched/include/linux/swapops.h
> > index 5c3a5f3..b402a2c 100644
> > --- v4.5-rc5-mmotm-2016-02-24-16-18/include/linux/swapops.h
> > +++ v4.5-rc5-mmotm-2016-02-24-16-18_patched/include/linux/swapops.h
> > @@ -163,6 +163,68 @@ static inline int is_write_migration_entry(swp_entry_t entry)
> >
> > #endif
> >
> > +#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
> > +extern int set_pmd_migration_entry(struct page *page,
> > + struct mm_struct *mm, unsigned long address);
> > +
> > +extern int remove_migration_pmd(struct page *new,
> > + struct vm_area_struct *vma, unsigned long addr, void *old);
> > +
> > +extern void pmd_migration_entry_wait(struct mm_struct *mm, pmd_t *pmd);
> > +
> > +static inline swp_entry_t pmd_to_swp_entry(pmd_t pmd)
> > +{
> > + swp_entry_t arch_entry;
> > +
> > + arch_entry = __pmd_to_swp_entry(pmd);
> > + return swp_entry(__swp_type(arch_entry), __swp_offset(arch_entry));
> > +}
> > +
> > +static inline pmd_t swp_entry_to_pmd(swp_entry_t entry)
> > +{
> > + swp_entry_t arch_entry;
> > +
> > + arch_entry = __swp_entry(swp_type(entry), swp_offset(entry));
> > + return __swp_entry_to_pmd(arch_entry);
> > +}
> > +
> > +static inline int is_pmd_migration_entry(pmd_t pmd)
> > +{
> > + return !__pmd_present(pmd) && is_migration_entry(pmd_to_swp_entry(pmd));
> > +}
> > +#else
> > +static inline int set_pmd_migration_entry(struct page *page,
> > + struct mm_struct *mm, unsigned long address)
> > +{
> > + return 0;
> > +}
> > +
> > +static inline int remove_migration_pmd(struct page *new,
> > + struct vm_area_struct *vma, unsigned long addr, void *old)
> > +{
> > + return 0;
> > +}
> > +
> > +static inline void pmd_migration_entry_wait(struct mm_struct *m, pmd_t *p) { }
> > +
> > +static inline swp_entry_t pmd_to_swp_entry(pmd_t pmd)
> > +{
> > + return swp_entry(0, 0);
> > +}
> > +
> > +static inline pmd_t swp_entry_to_pmd(swp_entry_t entry)
> > +{
> > + pmd_t pmd = {};
> > +
> > + return pmd;
> > +}
> > +
> > +static inline int is_pmd_migration_entry(pmd_t pmd)
> > +{
> > + return 0;
> > +}
> > +#endif
> > +
> > #ifdef CONFIG_MEMORY_FAILURE
> >
> > extern atomic_long_t num_poisoned_pages __read_mostly;
> > diff --git v4.5-rc5-mmotm-2016-02-24-16-18/mm/huge_memory.c v4.5-rc5-mmotm-2016-02-24-16-18_patched/mm/huge_memory.c
> > index 46ad357..c6d5406 100644
> > --- v4.5-rc5-mmotm-2016-02-24-16-18/mm/huge_memory.c
> > +++ v4.5-rc5-mmotm-2016-02-24-16-18_patched/mm/huge_memory.c
> > @@ -3657,3 +3657,81 @@ static int __init split_huge_pages_debugfs(void)
> > }
> > late_initcall(split_huge_pages_debugfs);
> > #endif
> > +
> > +#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
> > +int set_pmd_migration_entry(struct page *page, struct mm_struct *mm,
> > + unsigned long addr)
> > +{
> > + pte_t *pte;
> > + pmd_t *pmd;
> > + pmd_t pmdval;
> > + pmd_t pmdswp;
> > + swp_entry_t entry;
> > + spinlock_t *ptl;
> > +
> > + mmu_notifier_invalidate_range_start(mm, addr, addr + HPAGE_PMD_SIZE);
> > + if (!page_check_address_transhuge(page, mm, addr, &pmd, &pte, &ptl))
> > + goto out;
> > + if (pte)
> > + goto out;
> > + pmdval = pmdp_huge_get_and_clear(mm, addr, pmd);
> > + entry = make_migration_entry(page, pmd_write(pmdval));
> > + pmdswp = swp_entry_to_pmd(entry);
> > + pmdswp = pmd_mkhuge(pmdswp);
> > + set_pmd_at(mm, addr, pmd, pmdswp);
> > + page_remove_rmap(page, true);
> > + page_cache_release(page);
> > + spin_unlock(ptl);
> > +out:
> > + mmu_notifier_invalidate_range_end(mm, addr, addr + HPAGE_PMD_SIZE);
> > + return SWAP_AGAIN;
> > +}
> > +
> > +int remove_migration_pmd(struct page *new, struct vm_area_struct *vma,
> > + unsigned long addr, void *old)
> > +{
> > + struct mm_struct *mm = vma->vm_mm;
> > + spinlock_t *ptl;
> > + pgd_t *pgd;
> > + pud_t *pud;
> > + pmd_t *pmd;
> > + pmd_t pmde;
> > + swp_entry_t entry;
> > + unsigned long mmun_start = addr & HPAGE_PMD_MASK;
> > + unsigned long mmun_end = mmun_start + HPAGE_PMD_SIZE;
> > +
> > + pgd = pgd_offset(mm, addr);
> > + if (!pgd_present(*pgd))
> > + goto out;
> > + pud = pud_offset(pgd, addr);
> > + if (!pud_present(*pud))
> > + goto out;
> > + pmd = pmd_offset(pud, addr);
> > + if (!pmd)
> > + goto out;
> > + ptl = pmd_lock(mm, pmd);
> > + pmde = *pmd;
> > + barrier();
>
> Do we need a barrier under ptl?
No, I'll drop this. Thank you.
> > + if (!is_pmd_migration_entry(pmde))
> > + goto unlock_ptl;
> > + entry = pmd_to_swp_entry(pmde);
> > + if (migration_entry_to_page(entry) != old)
> > + goto unlock_ptl;
> > + get_page(new);
> > + pmde = mk_huge_pmd(new, vma->vm_page_prot);
> > + if (is_write_migration_entry(entry))
> > + pmde = maybe_pmd_mkwrite(pmde, vma);
> > + flush_cache_range(vma, mmun_start, mmun_end);
> > + page_add_anon_rmap(new, vma, mmun_start, true);
> > + pmdp_huge_clear_flush_notify(vma, mmun_start, pmd);
> > + set_pmd_at(mm, mmun_start, pmd, pmde);
> > + flush_tlb_range(vma, mmun_start, mmun_end);
> > + if (vma->vm_flags & VM_LOCKED)
> > + mlock_vma_page(new);
> > + update_mmu_cache_pmd(vma, addr, pmd);
> > +unlock_ptl:
> > + spin_unlock(ptl);
> > +out:
> > + return SWAP_AGAIN;
> > +}
> > +#endif
> > diff --git v4.5-rc5-mmotm-2016-02-24-16-18/mm/migrate.c v4.5-rc5-mmotm-2016-02-24-16-18_patched/mm/migrate.c
> > index 577c94b..14164f6 100644
> > --- v4.5-rc5-mmotm-2016-02-24-16-18/mm/migrate.c
> > +++ v4.5-rc5-mmotm-2016-02-24-16-18_patched/mm/migrate.c
> > @@ -118,6 +118,8 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
> > if (!ptep)
> > goto out;
> > ptl = huge_pte_lockptr(hstate_vma(vma), mm, ptep);
> > + } else if (PageTransHuge(new)) {
> > + return remove_migration_pmd(new, vma, addr, old);
>
> Hm. THP now can be mapped with PTEs too..
Right, and each call of remove_migration_pte() handles pmd and pte migration
entries separately, so this particular code seems OK to me.
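For reference, with this hunk applied the top of remove_migration_pte()
dispatches roughly as below (pte path details elided); the pmd case is fully
delegated to remove_migration_pmd():

	if (unlikely(PageHuge(new))) {
		ptep = huge_pte_offset(mm, addr);
		if (!ptep)
			goto out;
		ptl = huge_pte_lockptr(hstate_vma(vma), mm, ptep);
	} else if (PageTransHuge(new)) {
		/* pmd-mapped thp: handled by the new pmd helper */
		return remove_migration_pmd(new, vma, addr, old);
	} else {
		/* existing pte path */
		pmd = mm_find_pmd(mm, addr);
		if (!pmd)
			goto out;
		ptep = pte_offset_map(pmd, addr);
		...
	}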
Thanks,
Naoya