Re: [PATCH v2 18/30] powerpc: Implement the new page table range API

From: Matthew Wilcox
Date: Mon Feb 27 2023 - 15:21:07 EST


On Mon, Feb 27, 2023 at 07:45:08PM +0000, Christophe Leroy wrote:
> Hi,
>
> Le 27/02/2023 à 18:57, Matthew Wilcox (Oracle) a écrit :
> > Add set_ptes(), update_mmu_cache_range() and flush_dcache_folio().
> > Change the PG_arch_1 (aka PG_dcache_dirty) flag from being per-page to
> > per-folio.
> >
> > I'm unsure about my merging of flush_dcache_icache_hugepage() and
> > flush_dcache_icache_page() into flush_dcache_icache_folio() and subsequent
> > removal of flush_dcache_icache_phys(). Please review.
>
> Not sure why you want to remove flush_dcache_icache_phys().

Well, I didn't, necessarily. It's just that when I merged
flush_dcache_icache_hugepage() and flush_dcache_icache_page()
together, it was left with no callers.

> Although that's only feasible when the address bus is not wider than 32
> bits, and cannot be done on BOOKE as you can't switch off the MMU on BOOKE,
> flush_dcache_icache_phys() allows flushing unmapped pages without
> having to map them. So it is more efficient.

And it was just never done for the hugepage case?

> > @@ -148,17 +103,20 @@ static void __flush_dcache_icache(void *p)
> > invalidate_icache_range(addr, addr + PAGE_SIZE);
> > }
> >
> > -static void flush_dcache_icache_hugepage(struct page *page)
> > +void flush_dcache_icache_folio(struct folio *folio)
> > {
> > - int i;
> > - int nr = compound_nr(page);
> > + unsigned int i, nr = folio_nr_pages(folio);
> >
> > - if (!PageHighMem(page)) {
> > + if (flush_coherent_icache())
> > + return;
> > +
> > + if (!folio_test_highmem(folio)) {
> > + void *addr = folio_address(folio);
> > for (i = 0; i < nr; i++)
> > - __flush_dcache_icache(lowmem_page_address(page + i));
> > + __flush_dcache_icache(addr + i * PAGE_SIZE);
> > } else {
> > for (i = 0; i < nr; i++) {
> > - void *start = kmap_local_page(page + i);
> > + void *start = kmap_local_folio(folio, i * PAGE_SIZE);
> >
> > __flush_dcache_icache(start);
> > kunmap_local(start);

So you'd like this to be:

} else if (IS_ENABLED(CONFIG_BOOKE) || sizeof(phys_addr_t) > sizeof(void *)) {
for (i = 0; i < nr; i++) {
void *start = kmap_local_folio(folio, i * PAGE_SIZE);
__flush_dcache_icache(start);
kunmap_local(start);
}
} else {
unsigned long pfn = folio_pfn(folio);
for (i = 0; i < nr; i++)
flush_dcache_icache_phys((pfn + i) * PAGE_SIZE);
}

(or maybe you'd prefer a flush_dcache_icache_pfn() that doesn't need to
worry about PAGE_MASK).

> > @@ -166,27 +124,6 @@ static void flush_dcache_icache_hugepage(struct page *page)
> > }
> > }
> >
> > -void flush_dcache_icache_page(struct page *page)
> > -{
> > - if (flush_coherent_icache())
> > - return;
> > -
> > - if (PageCompound(page))
> > - return flush_dcache_icache_hugepage(page);
> > -
> > - if (!PageHighMem(page)) {
> > - __flush_dcache_icache(lowmem_page_address(page));
> > - } else if (IS_ENABLED(CONFIG_BOOKE) || sizeof(phys_addr_t) > sizeof(void *)) {
> > - void *start = kmap_local_page(page);
> > -
> > - __flush_dcache_icache(start);
> > - kunmap_local(start);
> > - } else {
> > - flush_dcache_icache_phys(page_to_phys(page));
> > - }
> > -}
> > -EXPORT_SYMBOL(flush_dcache_icache_page);
> > -
> > void clear_user_page(void *page, unsigned long vaddr, struct page *pg)
> > {
> > clear_page(page);
> > diff --git a/arch/powerpc/mm/nohash/e500_hugetlbpage.c b/arch/powerpc/mm/nohash/e500_hugetlbpage.c
> > index 58c8d9849cb1..f3cb91107a47 100644
> > --- a/arch/powerpc/mm/nohash/e500_hugetlbpage.c
> > +++ b/arch/powerpc/mm/nohash/e500_hugetlbpage.c
> > @@ -178,7 +178,8 @@ book3e_hugetlb_preload(struct vm_area_struct *vma, unsigned long ea, pte_t pte)
> > *
> > * This must always be called with the pte lock held.
> > */
> > -void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, pte_t *ptep)
> > +void update_mmu_cache(struct vm_area_struct *vma, unsigned long address,
> > + pte_t *ptep, unsigned int nr)
> > {
> > if (is_vm_hugetlb_page(vma))
> > book3e_hugetlb_preload(vma, address, *ptep);
> > diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c
> > index cb2dcdb18f8e..b3c7b874a7a2 100644
> > --- a/arch/powerpc/mm/pgtable.c
> > +++ b/arch/powerpc/mm/pgtable.c
> > @@ -58,7 +58,7 @@ static inline int pte_looks_normal(pte_t pte)
> > return 0;
> > }
> >
> > -static struct page *maybe_pte_to_page(pte_t pte)
> > +static struct folio *maybe_pte_to_folio(pte_t pte)
> > {
> > unsigned long pfn = pte_pfn(pte);
> > struct page *page;
> > @@ -68,7 +68,7 @@ static struct page *maybe_pte_to_page(pte_t pte)
> > page = pfn_to_page(pfn);
> > if (PageReserved(page))
> > return NULL;
> > - return page;
> > + return page_folio(page);
> > }
> >
> > #ifdef CONFIG_PPC_BOOK3S
> > @@ -84,12 +84,12 @@ static pte_t set_pte_filter_hash(pte_t pte)
> > pte = __pte(pte_val(pte) & ~_PAGE_HPTEFLAGS);
> > if (pte_looks_normal(pte) && !(cpu_has_feature(CPU_FTR_COHERENT_ICACHE) ||
> > cpu_has_feature(CPU_FTR_NOEXECUTE))) {
> > - struct page *pg = maybe_pte_to_page(pte);
> > - if (!pg)
> > + struct folio *folio = maybe_pte_to_folio(pte);
> > + if (!folio)
> > return pte;
> > - if (!test_bit(PG_dcache_clean, &pg->flags)) {
> > - flush_dcache_icache_page(pg);
> > - set_bit(PG_dcache_clean, &pg->flags);
> > + if (!test_bit(PG_dcache_clean, &folio->flags)) {
> > + flush_dcache_icache_folio(folio);
> > + set_bit(PG_dcache_clean, &folio->flags);
> > }
> > }
> > return pte;
> > @@ -107,7 +107,7 @@ static pte_t set_pte_filter_hash(pte_t pte) { return pte; }
> > */
> > static inline pte_t set_pte_filter(pte_t pte)
> > {
> > - struct page *pg;
> > + struct folio *folio;
> >
> > if (radix_enabled())
> > return pte;
> > @@ -120,18 +120,18 @@ static inline pte_t set_pte_filter(pte_t pte)
> > return pte;
> >
> > /* If you set _PAGE_EXEC on weird pages you're on your own */
> > - pg = maybe_pte_to_page(pte);
> > - if (unlikely(!pg))
> > + folio = maybe_pte_to_folio(pte);
> > + if (unlikely(!folio))
> > return pte;
> >
> > /* If the page clean, we move on */
> > - if (test_bit(PG_dcache_clean, &pg->flags))
> > + if (test_bit(PG_dcache_clean, &folio->flags))
> > return pte;
> >
> > /* If it's an exec fault, we flush the cache and make it clean */
> > if (is_exec_fault()) {
> > - flush_dcache_icache_page(pg);
> > - set_bit(PG_dcache_clean, &pg->flags);
> > + flush_dcache_icache_folio(folio);
> > + set_bit(PG_dcache_clean, &folio->flags);
> > return pte;
> > }
> >
> > @@ -142,7 +142,7 @@ static inline pte_t set_pte_filter(pte_t pte)
> > static pte_t set_access_flags_filter(pte_t pte, struct vm_area_struct *vma,
> > int dirty)
> > {
> > - struct page *pg;
> > + struct folio *folio;
> >
> > if (IS_ENABLED(CONFIG_PPC_BOOK3S_64))
> > return pte;
> > @@ -168,17 +168,17 @@ static pte_t set_access_flags_filter(pte_t pte, struct vm_area_struct *vma,
> > #endif /* CONFIG_DEBUG_VM */
> >
> > /* If you set _PAGE_EXEC on weird pages you're on your own */
> > - pg = maybe_pte_to_page(pte);
> > - if (unlikely(!pg))
> > + folio = maybe_pte_to_folio(pte);
> > + if (unlikely(!folio))
> > goto bail;
> >
> > /* If the page is already clean, we move on */
> > - if (test_bit(PG_dcache_clean, &pg->flags))
> > + if (test_bit(PG_dcache_clean, &folio->flags))
> > goto bail;
> >
> > /* Clean the page and set PG_dcache_clean */
> > - flush_dcache_icache_page(pg);
> > - set_bit(PG_dcache_clean, &pg->flags);
> > + flush_dcache_icache_folio(folio);
> > + set_bit(PG_dcache_clean, &folio->flags);
> >
> > bail:
> > return pte_mkexec(pte);
> > @@ -187,8 +187,8 @@ static pte_t set_access_flags_filter(pte_t pte, struct vm_area_struct *vma,
> > /*
> > * set_pte stores a linux PTE into the linux page table.
> > */
> > -void set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep,
> > - pte_t pte)
> > +void set_ptes(struct mm_struct *mm, unsigned long addr, pte_t *ptep,
> > + pte_t pte, unsigned int nr)
> > {
> > /*
> > * Make sure hardware valid bit is not set. We don't do
> > @@ -203,7 +203,14 @@ void set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep,
> > pte = set_pte_filter(pte);
> >
> > /* Perform the setting of the PTE */
> > - __set_pte_at(mm, addr, ptep, pte, 0);
> > + for (;;) {
> > + __set_pte_at(mm, addr, ptep, pte, 0);
> > + if (--nr == 0)
> > + break;
> > + ptep++;
> > + pte = __pte(pte_val(pte) + PAGE_SIZE);
> > + addr += PAGE_SIZE;
> > + }
> > }
> >
> > void unmap_kernel_page(unsigned long va)