Re: [PATCH 1/4] KVM: x86/mmu: Don't require refcounted "struct page" to create huge SPTEs

From: Mingwei Zhang
Date: Sat Jul 16 2022 - 01:53:58 EST


On Fri, Jul 15, 2022, Sean Christopherson wrote:
> Drop the requirement that a pfn be backed by a refcounted, compound or
> ZONE_DEVICE, struct page, and instead rely solely on the host page
> tables to identify huge pages. The PageCompound() check is a remnant of
> an old implementation that identified (well, attempted to identify) huge
> pages without walking the host page tables. The ZONE_DEVICE check was
> added as an exception to the PageCompound() requirement. In other words,
> neither check is actually a hard requirement: if the primary MMU has a
> pfn backed with a huge page, then KVM can back the pfn with a huge page
> regardless of the backing store.
>
> Dropping the @pfn parameter will also allow KVM to query the max host
> mapping level without having to first get the pfn, which is advantageous
> for use outside of the page fault path where KVM wants to take action if
> and only if a page can be mapped huge, i.e. avoids the pfn lookup for
> gfns that can't be backed with a huge page.

Out of curiosity: when the host maps huge pages under a VMA with
VM_PFNMAP, they are basically out of the control of MM (I presume). So,
when the drivers backing those pages remove them from the host page
tables, do we (KVM) get mmu_notifier callbacks?
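
To make the question concrete, below is a rough sketch of the kind of
driver I have in mind (everything named example_* is hypothetical and
not part of this patch): device memory is mapped to userspace with
remap_pfn_range(), which marks the VMA VM_PFNMAP, and is later torn
down with unmap_mapping_range(). My (possibly wrong) reading is that
the teardown still goes through the normal zap path and thus raises
mmu_notifier_invalidate_range_start()/end(), so KVM would see the
invalidation and zap its SPTEs, but I'd like to confirm that this holds
even though MM never took a reference on these pages.

/*
 * Hypothetical driver skeleton, not from this patch; all example_*
 * names are made up purely to illustrate the VM_PFNMAP case above.
 */
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/module.h>

/* Assumed to be set up elsewhere: base pfn of some device memory. */
static unsigned long example_base_pfn;
static struct address_space *example_mapping;

static int example_mmap(struct file *file, struct vm_area_struct *vma)
{
	unsigned long size = vma->vm_end - vma->vm_start;

	example_mapping = file->f_mapping;

	/*
	 * remap_pfn_range() marks the VMA VM_PFNMAP (and VM_IO), so MM
	 * never refcounts the underlying pages.
	 */
	return remap_pfn_range(vma, vma->vm_start, example_base_pfn,
			       size, vma->vm_page_prot);
}

static const struct file_operations example_fops = {
	.owner = THIS_MODULE,
	.mmap  = example_mmap,
};

static void example_teardown(void)
{
	/*
	 * Zap all user mappings of the device memory.  As I read it,
	 * this goes through the regular zap path, which issues
	 * mmu_notifier_invalidate_range_start()/end(), so KVM's
	 * notifier would zap the corresponding SPTEs.  Is that
	 * guaranteed for VM_PFNMAP mappings as well?
	 */
	if (example_mapping)
		unmap_mapping_range(example_mapping, 0, 0, 1);
}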

>
> Cc: Mingwei Zhang <mizhang@xxxxxxxxxx>
> Signed-off-by: Sean Christopherson <seanjc@xxxxxxxxxx>

Reviewed-by: Mingwei Zhang <mizhang@xxxxxxxxxx>
> ---
> arch/x86/kvm/mmu/mmu.c | 23 +++++------------------
> arch/x86/kvm/mmu/mmu_internal.h | 2 +-
> arch/x86/kvm/mmu/tdp_mmu.c | 8 +-------
> 3 files changed, 7 insertions(+), 26 deletions(-)
>
> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index 52664c3caaab..bebff1d5acd4 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -2919,11 +2919,10 @@ static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep)
> __direct_pte_prefetch(vcpu, sp, sptep);
> }
>
> -static int host_pfn_mapping_level(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
> +static int host_pfn_mapping_level(struct kvm *kvm, gfn_t gfn,
> const struct kvm_memory_slot *slot)
> {
> int level = PG_LEVEL_4K;
> - struct page *page;
> unsigned long hva;
> unsigned long flags;
> pgd_t pgd;
> @@ -2931,17 +2930,6 @@ static int host_pfn_mapping_level(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
> pud_t pud;
> pmd_t pmd;
>
> - /*
> - * Note, @slot must be non-NULL, i.e. the caller is responsible for
> - * ensuring @pfn isn't garbage and is backed by a memslot.
> - */
> - page = kvm_pfn_to_refcounted_page(pfn);
> - if (!page)
> - return PG_LEVEL_4K;
> -
> - if (!PageCompound(page) && !kvm_is_zone_device_page(page))
> - return PG_LEVEL_4K;
> -
> /*
> * Note, using the already-retrieved memslot and __gfn_to_hva_memslot()
> * is not solely for performance, it's also necessary to avoid the
> @@ -2994,7 +2982,7 @@ static int host_pfn_mapping_level(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
>
> int kvm_mmu_max_mapping_level(struct kvm *kvm,
> const struct kvm_memory_slot *slot, gfn_t gfn,
> - kvm_pfn_t pfn, int max_level)
> + int max_level)
> {
> struct kvm_lpage_info *linfo;
> int host_level;
> @@ -3009,7 +2997,7 @@ int kvm_mmu_max_mapping_level(struct kvm *kvm,
> if (max_level == PG_LEVEL_4K)
> return PG_LEVEL_4K;
>
> - host_level = host_pfn_mapping_level(kvm, gfn, pfn, slot);
> + host_level = host_pfn_mapping_level(kvm, gfn, slot);
> return min(host_level, max_level);
> }
>
> @@ -3034,8 +3022,7 @@ void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
> * level, which will be used to do precise, accurate accounting.
> */
> fault->req_level = kvm_mmu_max_mapping_level(vcpu->kvm, slot,
> - fault->gfn, fault->pfn,
> - fault->max_level);
> + fault->gfn, fault->max_level);
> if (fault->req_level == PG_LEVEL_4K || fault->huge_page_disallowed)
> return;
>
> @@ -6406,7 +6393,7 @@ static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm,
> */
> if (sp->role.direct &&
> sp->role.level < kvm_mmu_max_mapping_level(kvm, slot, sp->gfn,
> - pfn, PG_LEVEL_NUM)) {
> + PG_LEVEL_NUM)) {
> pte_list_remove(kvm, rmap_head, sptep);
>
> if (kvm_available_flush_tlb_with_range())
> diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h
> index ae2d660e2dab..582def531d4d 100644
> --- a/arch/x86/kvm/mmu/mmu_internal.h
> +++ b/arch/x86/kvm/mmu/mmu_internal.h
> @@ -309,7 +309,7 @@ static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
>
> int kvm_mmu_max_mapping_level(struct kvm *kvm,
> const struct kvm_memory_slot *slot, gfn_t gfn,
> - kvm_pfn_t pfn, int max_level);
> + int max_level);
> void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault);
> void disallowed_hugepage_adjust(struct kvm_page_fault *fault, u64 spte, int cur_level);
>
> diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
> index f3a430d64975..d75d93edc40a 100644
> --- a/arch/x86/kvm/mmu/tdp_mmu.c
> +++ b/arch/x86/kvm/mmu/tdp_mmu.c
> @@ -1733,7 +1733,6 @@ static void zap_collapsible_spte_range(struct kvm *kvm,
> gfn_t end = start + slot->npages;
> struct tdp_iter iter;
> int max_mapping_level;
> - kvm_pfn_t pfn;
>
> rcu_read_lock();
>
> @@ -1745,13 +1744,8 @@ static void zap_collapsible_spte_range(struct kvm *kvm,
> !is_last_spte(iter.old_spte, iter.level))
> continue;
>
> - /*
> - * This is a leaf SPTE. Check if the PFN it maps can
> - * be mapped at a higher level.
> - */
> - pfn = spte_to_pfn(iter.old_spte);
> max_mapping_level = kvm_mmu_max_mapping_level(kvm, slot,
> - iter.gfn, pfn, PG_LEVEL_NUM);
> + iter.gfn, PG_LEVEL_NUM);
>
> WARN_ON(max_mapping_level < iter.level);
>
> --
> 2.37.0.170.g444d1eabd0-goog
>