Re: [PATCH] sparc64: Add 16GB hugepage support

From: Paul Gortmaker
Date: Wed May 24 2017 - 23:36:01 EST


[[PATCH] sparc64: Add 16GB hugepage support] On 24/05/2017 (Wed 17:29) Nitin Gupta wrote:

> Orabug: 25362942
>
> Signed-off-by: Nitin Gupta <nitin.m.gupta@xxxxxxxxxx>

If this wasn't an accidental git send-email misfire, then there should
be a long log indicating the use case, the performance increase, the
testing that was done, etc. etc.

Normally I'd not notice, but since I was Cc'd I figured it was worth a
mention -- for example, the vendor ID above doesn't mean a thing to
the rest of us, which is why I suspect it was a git send-email misfire;
sadly, I think we've all accidentally done that at least once....

Paul.
--

> ---
> arch/sparc/include/asm/page_64.h | 3 +-
> arch/sparc/include/asm/pgtable_64.h | 5 +++
> arch/sparc/include/asm/tsb.h | 35 +++++++++++++++++-
> arch/sparc/kernel/tsb.S | 2 +-
> arch/sparc/mm/hugetlbpage.c | 74 ++++++++++++++++++++++++++-----------
> arch/sparc/mm/init_64.c | 41 ++++++++++++++++----
> 6 files changed, 128 insertions(+), 32 deletions(-)
>
> diff --git a/arch/sparc/include/asm/page_64.h b/arch/sparc/include/asm/page_64.h
> index 5961b2d..8ee1f97 100644
> --- a/arch/sparc/include/asm/page_64.h
> +++ b/arch/sparc/include/asm/page_64.h
> @@ -17,6 +17,7 @@
>
> #define HPAGE_SHIFT 23
> #define REAL_HPAGE_SHIFT 22
> +#define HPAGE_16GB_SHIFT 34
> #define HPAGE_2GB_SHIFT 31
> #define HPAGE_256MB_SHIFT 28
> #define HPAGE_64K_SHIFT 16
> @@ -28,7 +29,7 @@
> #define HUGETLB_PAGE_ORDER (HPAGE_SHIFT - PAGE_SHIFT)
> #define HAVE_ARCH_HUGETLB_UNMAPPED_AREA
> #define REAL_HPAGE_PER_HPAGE (_AC(1,UL) << (HPAGE_SHIFT - REAL_HPAGE_SHIFT))
> -#define HUGE_MAX_HSTATE 4
> +#define HUGE_MAX_HSTATE 5
> #endif
>
> #ifndef __ASSEMBLY__
> diff --git a/arch/sparc/include/asm/pgtable_64.h b/arch/sparc/include/asm/pgtable_64.h
> index 6fbd931..2444b02 100644
> --- a/arch/sparc/include/asm/pgtable_64.h
> +++ b/arch/sparc/include/asm/pgtable_64.h
> @@ -414,6 +414,11 @@ static inline bool is_hugetlb_pmd(pmd_t pmd)
> return !!(pmd_val(pmd) & _PAGE_PMD_HUGE);
> }
>
> +static inline bool is_hugetlb_pud(pud_t pud)
> +{
> + return !!(pud_val(pud) & _PAGE_PUD_HUGE);
> +}
> +
> #ifdef CONFIG_TRANSPARENT_HUGEPAGE
> static inline pmd_t pmd_mkhuge(pmd_t pmd)
> {
> diff --git a/arch/sparc/include/asm/tsb.h b/arch/sparc/include/asm/tsb.h
> index 32258e0..fbd8da7 100644
> --- a/arch/sparc/include/asm/tsb.h
> +++ b/arch/sparc/include/asm/tsb.h
> @@ -195,6 +195,36 @@ extern struct tsb_phys_patch_entry __tsb_phys_patch, __tsb_phys_patch_end;
> nop; \
> 699:
>
> + /* PUD has been loaded into REG1, interpret the value, seeing
> + * if it is a HUGE PUD or a normal one. If it is not valid
> + * then jump to FAIL_LABEL. If it is a HUGE PUD, and it
> + * translates to a valid PTE, branch to PTE_LABEL.
> + *
> + * We have to propagate bits [32:22] from the virtual address
> + * to resolve at 4M granularity.
> + */
> +#if defined(CONFIG_HUGETLB_PAGE) || defined(CONFIG_TRANSPARENT_HUGEPAGE)
> +#define USER_PGTABLE_CHECK_PUD_HUGE(VADDR, REG1, REG2, FAIL_LABEL, PTE_LABEL) \
> + brz,pn REG1, FAIL_LABEL; \
> + sethi %uhi(_PAGE_PUD_HUGE), REG2; \
> + sllx REG2, 32, REG2; \
> + andcc REG1, REG2, %g0; \
> + be,pt %xcc, 700f; \
> + sethi %hi(0x1ffc0000), REG2; \
> + sllx REG2, 1, REG2; \
> + brgez,pn REG1, FAIL_LABEL; \
> + andn REG1, REG2, REG1; \
> + and VADDR, REG2, REG2; \
> + brlz,pt REG1, PTE_LABEL; \
> + or REG1, REG2, REG1; \
> +700:
> +#else
> +#define USER_PGTABLE_CHECK_PUD_HUGE(VADDR, REG1, REG2, FAIL_LABEL, PTE_LABEL) \
> + brz,pn REG1, FAIL_LABEL; \
> + nop;
> +#endif
> +
> /* PMD has been loaded into REG1, interpret the value, seeing
> * if it is a HUGE PMD or a normal one. If it is not valid
> * then jump to FAIL_LABEL. If it is a HUGE PMD, and it
> @@ -209,14 +239,14 @@ extern struct tsb_phys_patch_entry __tsb_phys_patch, __tsb_phys_patch_end;
> sethi %uhi(_PAGE_PMD_HUGE), REG2; \
> sllx REG2, 32, REG2; \
> andcc REG1, REG2, %g0; \
> - be,pt %xcc, 700f; \
> + be,pt %xcc, 701f; \
> sethi %hi(4 * 1024 * 1024), REG2; \
> brgez,pn REG1, FAIL_LABEL; \
> andn REG1, REG2, REG1; \
> and VADDR, REG2, REG2; \
> brlz,pt REG1, PTE_LABEL; \
> or REG1, REG2, REG1; \
> -700:
> +701:
> #else
> #define USER_PGTABLE_CHECK_PMD_HUGE(VADDR, REG1, REG2, FAIL_LABEL, PTE_LABEL) \
> brz,pn REG1, FAIL_LABEL; \
> @@ -242,6 +272,7 @@ extern struct tsb_phys_patch_entry __tsb_phys_patch, __tsb_phys_patch_end;
> srlx REG2, 64 - PAGE_SHIFT, REG2; \
> andn REG2, 0x7, REG2; \
> ldxa [REG1 + REG2] ASI_PHYS_USE_EC, REG1; \
> + USER_PGTABLE_CHECK_PUD_HUGE(VADDR, REG1, REG2, FAIL_LABEL, 800f) \
> brz,pn REG1, FAIL_LABEL; \
> sllx VADDR, 64 - (PMD_SHIFT + PMD_BITS), REG2; \
> srlx REG2, 64 - PAGE_SHIFT, REG2; \
> diff --git a/arch/sparc/kernel/tsb.S b/arch/sparc/kernel/tsb.S
> index 10689cf..a0a5a13 100644
> --- a/arch/sparc/kernel/tsb.S
> +++ b/arch/sparc/kernel/tsb.S
> @@ -117,7 +117,7 @@ tsb_miss_page_table_walk_sun4v_fastpath:
> /* Valid PTE is now in %g5. */
>
> #if defined(CONFIG_HUGETLB_PAGE) || defined(CONFIG_TRANSPARENT_HUGEPAGE)
> - sethi %uhi(_PAGE_PMD_HUGE), %g7
> + sethi %uhi(_PAGE_PMD_HUGE | _PAGE_PUD_HUGE), %g7
> sllx %g7, 32, %g7
>
> andcc %g5, %g7, %g0
> diff --git a/arch/sparc/mm/hugetlbpage.c b/arch/sparc/mm/hugetlbpage.c
> index 7c29d38..62c1e62 100644
> --- a/arch/sparc/mm/hugetlbpage.c
> +++ b/arch/sparc/mm/hugetlbpage.c
> @@ -143,6 +143,10 @@ static pte_t sun4v_hugepage_shift_to_tte(pte_t entry, unsigned int shift)
> pte_val(entry) = pte_val(entry) & ~_PAGE_SZALL_4V;
>
> switch (shift) {
> + case HPAGE_16GB_SHIFT:
> + hugepage_size = _PAGE_SZ16GB_4V;
> + pte_val(entry) |= _PAGE_PUD_HUGE;
> + break;
> case HPAGE_2GB_SHIFT:
> hugepage_size = _PAGE_SZ2GB_4V;
> pte_val(entry) |= _PAGE_PMD_HUGE;
> @@ -187,6 +191,9 @@ static unsigned int sun4v_huge_tte_to_shift(pte_t entry)
> unsigned int shift;
>
> switch (tte_szbits) {
> + case _PAGE_SZ16GB_4V:
> + shift = HPAGE_16GB_SHIFT;
> + break;
> case _PAGE_SZ2GB_4V:
> shift = HPAGE_2GB_SHIFT;
> break;
> @@ -263,7 +270,12 @@ pte_t *huge_pte_alloc(struct mm_struct *mm,
>
> pgd = pgd_offset(mm, addr);
> pud = pud_alloc(mm, pgd, addr);
> - if (pud) {
> + if (!pud)
> + return NULL;
> +
> + if (sz >= PUD_SIZE)
> + pte = (pte_t *)pud;
> + else {
> pmd = pmd_alloc(mm, pud, addr);
> if (!pmd)
> return NULL;
> @@ -288,12 +300,16 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
> if (!pgd_none(*pgd)) {
> pud = pud_offset(pgd, addr);
> if (!pud_none(*pud)) {
> - pmd = pmd_offset(pud, addr);
> - if (!pmd_none(*pmd)) {
> - if (is_hugetlb_pmd(*pmd))
> - pte = (pte_t *)pmd;
> - else
> - pte = pte_offset_map(pmd, addr);
> + if (is_hugetlb_pud(*pud))
> + pte = (pte_t *)pud;
> + else {
> + pmd = pmd_offset(pud, addr);
> + if (!pmd_none(*pmd)) {
> + if (is_hugetlb_pmd(*pmd))
> + pte = (pte_t *)pmd;
> + else
> + pte = pte_offset_map(pmd, addr);
> + }
> }
> }
> }
> @@ -304,12 +320,20 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
> void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
> pte_t *ptep, pte_t entry)
> {
> - unsigned int i, nptes, orig_shift, shift;
> - unsigned long size;
> + unsigned int nptes, orig_shift, shift;
> + unsigned long i, size;
> pte_t orig;
>
> size = huge_tte_to_size(entry);
> - shift = size >= HPAGE_SIZE ? PMD_SHIFT : PAGE_SHIFT;
> +
> + shift = PAGE_SHIFT;
> + if (size >= PUD_SIZE)
> + shift = PUD_SHIFT;
> + else if (size >= PMD_SIZE)
> + shift = PMD_SHIFT;
> + else
> + shift = PAGE_SHIFT;
> +
> nptes = size >> shift;
>
> if (!pte_present(*ptep) && pte_present(entry))
> @@ -332,19 +356,23 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
> pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr,
> pte_t *ptep)
> {
> - unsigned int i, nptes, hugepage_shift;
> + unsigned int i, nptes, orig_shift, shift;
> unsigned long size;
> pte_t entry;
>
> entry = *ptep;
> size = huge_tte_to_size(entry);
> - if (size >= HPAGE_SIZE)
> - nptes = size >> PMD_SHIFT;
> +
> + shift = PAGE_SHIFT;
> + if (size >= PUD_SIZE)
> + shift = PUD_SHIFT;
> + else if (size >= PMD_SIZE)
> + shift = PMD_SHIFT;
> else
> - nptes = size >> PAGE_SHIFT;
> + shift = PAGE_SHIFT;
>
> - hugepage_shift = pte_none(entry) ? PAGE_SHIFT :
> - huge_tte_to_shift(entry);
> + nptes = size >> shift;
> + orig_shift = pte_none(entry) ? PAGE_SHIFT : huge_tte_to_shift(entry);
>
> if (pte_present(entry))
> mm->context.hugetlb_pte_count -= nptes;
> @@ -353,11 +381,11 @@ pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr,
> for (i = 0; i < nptes; i++)
> ptep[i] = __pte(0UL);
>
> - maybe_tlb_batch_add(mm, addr, ptep, entry, 0, hugepage_shift);
> + maybe_tlb_batch_add(mm, addr, ptep, entry, 0, orig_shift);
> /* An HPAGE_SIZE'ed page is composed of two REAL_HPAGE_SIZE'ed pages */
> if (size == HPAGE_SIZE)
> maybe_tlb_batch_add(mm, addr + REAL_HPAGE_SIZE, ptep, entry, 0,
> - hugepage_shift);
> + orig_shift);
>
> return entry;
> }
> @@ -370,7 +398,8 @@ int pmd_huge(pmd_t pmd)
>
> int pud_huge(pud_t pud)
> {
> - return 0;
> + return !pud_none(pud) &&
> + (pud_val(pud) & (_PAGE_VALID|_PAGE_PUD_HUGE)) != _PAGE_VALID;
> }
>
> static void hugetlb_free_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
> @@ -434,8 +463,11 @@ static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
> next = pud_addr_end(addr, end);
> if (pud_none_or_clear_bad(pud))
> continue;
> - hugetlb_free_pmd_range(tlb, pud, addr, next, floor,
> - ceiling);
> + if (is_hugetlb_pud(*pud))
> + pud_clear(pud);
> + else
> + hugetlb_free_pmd_range(tlb, pud, addr, next, floor,
> + ceiling);
> } while (pud++, addr = next, addr != end);
>
> start &= PGDIR_MASK;
> diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c
> index 0cda653..7c0fe73 100644
> --- a/arch/sparc/mm/init_64.c
> +++ b/arch/sparc/mm/init_64.c
> @@ -337,6 +337,10 @@ static int __init setup_hugepagesz(char *string)
> hugepage_shift = ilog2(hugepage_size);
>
> switch (hugepage_shift) {
> + case HPAGE_16GB_SHIFT:
> + hv_pgsz_mask = HV_PGSZ_MASK_16GB;
> + hv_pgsz_idx = HV_PGSZ_IDX_16GB;
> + break;
> case HPAGE_2GB_SHIFT:
> hv_pgsz_mask = HV_PGSZ_MASK_2GB;
> hv_pgsz_idx = HV_PGSZ_IDX_2GB;
> @@ -376,6 +380,7 @@ void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, pte_t *
> {
> struct mm_struct *mm;
> unsigned long flags;
> + bool is_huge_tsb;
> pte_t pte = *ptep;
>
> if (tlb_type != hypervisor) {
> @@ -393,15 +398,37 @@ void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, pte_t *
>
> spin_lock_irqsave(&mm->context.lock, flags);
>
> + is_huge_tsb = false;
> #if defined(CONFIG_HUGETLB_PAGE) || defined(CONFIG_TRANSPARENT_HUGEPAGE)
> - if ((mm->context.hugetlb_pte_count || mm->context.thp_pte_count) &&
> - is_hugetlb_pmd(__pmd(pte_val(pte)))) {
> - /* We are fabricating 8MB pages using 4MB real hw pages. */
> - pte_val(pte) |= (address & (1UL << REAL_HPAGE_SHIFT));
> - __update_mmu_tsb_insert(mm, MM_TSB_HUGE, REAL_HPAGE_SHIFT,
> - address, pte_val(pte));
> - } else
> + if (mm->context.hugetlb_pte_count || mm->context.thp_pte_count) {
> + unsigned long hugepage_size = PAGE_SIZE;
> +
> + if (is_vm_hugetlb_page(vma))
> + hugepage_size = huge_page_size(hstate_vma(vma));
> +
> + if (hugepage_size >= PUD_SIZE) {
> + unsigned long mask = 0x1ffc00000UL;
> +
> + /* Transfer bits [32:22] from address to resolve
> + * at 4M granularity.
> + */
> + pte_val(pte) &= ~mask;
> + pte_val(pte) |= (address & mask);
> + } else if (hugepage_size >= PMD_SIZE) {
> + /* We are fabricating 8MB pages using 4MB
> + * real hw pages.
> + */
> + pte_val(pte) |= (address & (1UL << REAL_HPAGE_SHIFT));
> + }
> +
> + if (hugepage_size >= PMD_SIZE) {
> + __update_mmu_tsb_insert(mm, MM_TSB_HUGE,
> + REAL_HPAGE_SHIFT, address, pte_val(pte));
> + is_huge_tsb = true;
> + }
> + }
> #endif
> + if (!is_huge_tsb)
> __update_mmu_tsb_insert(mm, MM_TSB_BASE, PAGE_SHIFT,
> address, pte_val(pte));
>
> --
> 2.9.2
>