Re: [PATCH v2 8/10] KVM: MMU: prefetch ptes when intercepted guest #PF

From: Marcelo Tosatti
Date: Mon Jun 28 2010 - 09:10:32 EST

Next message: Gustavo F. Padovan: "Re: [PATCH] Bluetooth: Fix abuse of the preincrement operator"
Previous message: Bernd Petrovitsch: "Re: [PATCH 5/5]bluetooth:hci_bcsp Fix operation on'bcsp->msgq_txseq' may be undefined"
In reply to: Xiao Guangrong: "[PATCH v2 8/10] KVM: MMU: prefetch ptes when intercepted guest #PF"
Next in thread: Xiao Guangrong: "Re: [PATCH v2 8/10] KVM: MMU: prefetch ptes when intercepted guest#PF"
Messages sorted by: [ date ] [ thread ] [ subject ] [ author ]

On Fri, Jun 25, 2010 at 08:07:06PM +0800, Xiao Guangrong wrote:
> Support prefetching ptes when a guest #PF is intercepted, to avoid #PFs on
> later accesses
>
> If we meet any failure in the prefetch path, we will exit it and
> not try the other ptes, to avoid turning it into a heavy path
>
> Note: this speculative path will mark a page dirty even though it was not
> really accessed; the same issue exists in other speculative paths like invlpg
> and pte write. Fortunately, it only affects host memory management. After
> Avi's patchset named "[PATCH v2 1/4] KVM: MMU: Introduce drop_spte()"
> is merged, we can easily fix it. Will do it in the future.
>
> Signed-off-by: Xiao Guangrong <xiaoguangrong@xxxxxxxxxxxxxx>
> ---
> arch/x86/kvm/mmu.c | 69 +++++++++++++++++++++++++++++++++++++++++
> arch/x86/kvm/paging_tmpl.h | 74 ++++++++++++++++++++++++++++++++++++++++++++
> 2 files changed, 143 insertions(+), 0 deletions(-)
>
> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> index 6c06666..b2ad723 100644
> --- a/arch/x86/kvm/mmu.c
> +++ b/arch/x86/kvm/mmu.c
> @@ -89,6 +89,8 @@ module_param(oos_shadow, bool, 0644);
> }
> #endif
>
> +#define PTE_PREFETCH_NUM 16
> +
> #define PT_FIRST_AVAIL_BITS_SHIFT 9
> #define PT64_SECOND_AVAIL_BITS_SHIFT 52
>
> @@ -1998,6 +2000,72 @@ static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
> {
> }
>
> +static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu,
> + struct kvm_mmu_page *sp,
> + u64 *start, u64 *end)
> +{
> + gfn_t gfn;
> + struct page *pages[PTE_PREFETCH_NUM];
> +
> + if (pte_prefetch_topup_memory_cache(vcpu, end - start))
> + return -1;
> +
> + gfn = sp->gfn + start - sp->spt;
> + while (start < end) {
> + unsigned long addr;
> + int entry, j, ret;
> +
> + addr = gfn_to_hva_many(vcpu->kvm, gfn, &entry);
> + if (kvm_is_error_hva(addr))
> + return -1;
> +
> + entry = min(entry, (int)(end - start));
> + ret = __get_user_pages_fast(addr, entry, 1, pages);
> + if (ret <= 0)
> + return -1;
> +
> + for (j = 0; j < ret; j++, gfn++, start++)
> + mmu_set_spte(vcpu, start, ACC_ALL,
> + sp->role.access, 0, 0, 1, NULL,
> + sp->role.level, gfn,
> + page_to_pfn(pages[j]), true, false);
> +
> + if (ret < entry)
> + return -1;
> + }
> + return 0;
> +}
> +
> +static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep)
> +{
> + struct kvm_mmu_page *sp;
> + u64 *start = NULL;
> + int index, i, max;
> +
> + sp = page_header(__pa(sptep));
> + WARN_ON(!sp->role.direct);
> +
> + if (sp->role.level > PT_PAGE_TABLE_LEVEL)
> + return;
> +
> + index = sptep - sp->spt;
> + i = index & ~(PTE_PREFETCH_NUM - 1);
> + max = index | (PTE_PREFETCH_NUM - 1);
> +
> + for (; i < max; i++) {
> + u64 *spte = sp->spt + i;
> +
> + if (*spte != shadow_trap_nonpresent_pte || spte == sptep) {
> + if (!start)
> + continue;
> + if (direct_pte_prefetch_many(vcpu, sp, start, spte) < 0)
> + break;
> + start = NULL;
> + } else if (!start)
> + start = spte;
> + }
> +}
> +
> static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
> int level, gfn_t gfn, pfn_t pfn)
> {
> @@ -2012,6 +2080,7 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
> 0, write, 1, &pt_write,
> level, gfn, pfn, false, true);
> ++vcpu->stat.pf_fixed;
> + direct_pte_prefetch(vcpu, iterator.sptep);
> break;
> }
>
> diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
> index fdba751..134f031 100644
> --- a/arch/x86/kvm/paging_tmpl.h
> +++ b/arch/x86/kvm/paging_tmpl.h
> @@ -291,6 +291,79 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
> gpte_to_gfn(gpte), pfn, true, true);
> }
>
> +static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, u64 *sptep)
> +{
> + struct kvm_mmu_page *sp;
> + pt_element_t gptep[PTE_PREFETCH_NUM];
> + gpa_t first_pte_gpa;
> + int offset = 0, index, i, j, max;
> +
> + sp = page_header(__pa(sptep));
> + index = sptep - sp->spt;
> +
> + if (sp->role.level > PT_PAGE_TABLE_LEVEL)
> + return;
> +
> + if (sp->role.direct)
> + return direct_pte_prefetch(vcpu, sptep);

Can never happen.

> +
> + index = sptep - sp->spt;
> + i = index & ~(PTE_PREFETCH_NUM - 1);
> + max = index | (PTE_PREFETCH_NUM - 1);
> +
> + if (PTTYPE == 32)
> + offset = sp->role.quadrant << PT64_LEVEL_BITS;
> +
> + first_pte_gpa = gfn_to_gpa(sp->gfn) +
> + (offset + i) * sizeof(pt_element_t);
> +
> + if (kvm_read_guest_atomic(vcpu->kvm, first_pte_gpa, gptep,
> + sizeof(gptep)) < 0)
> + return;
> +
> + for (j = 0; i < max; i++, j++) {
> + pt_element_t gpte;
> + unsigned pte_access;
> + u64 *spte = sp->spt + i;
> + gfn_t gfn;
> + pfn_t pfn;
> +
> + if (spte == sptep)
> + continue;
> +
> + if (*spte != shadow_trap_nonpresent_pte)
> + continue;
> +
> + gpte = gptep[j];
> +
> + if (is_rsvd_bits_set(vcpu, gpte, PT_PAGE_TABLE_LEVEL))
> + break;
> +
> + if (!(gpte & PT_ACCESSED_MASK))
> + continue;
> +
> + if (!is_present_gpte(gpte)) {
> + if (!sp->unsync)
> + __set_spte(spte, shadow_notrap_nonpresent_pte);
> + continue;
> + }
> +
> + gfn = gpte_to_gfn(gpte);
> +
> + pfn = gfn_to_pfn_atomic(vcpu->kvm, gfn);
> + if (is_error_pfn(pfn) ||
> + pte_prefetch_topup_memory_cache(vcpu, 1)) {
> + kvm_release_pfn_clean(pfn);
> + break;
> + }
> +
> + pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
> + mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0,
> + is_dirty_gpte(gpte), NULL, sp->role.level, gfn,
> + pfn, true, false);
> + }
> +}
> +
> /*
> * Fetch a shadow pte for a specific level in the paging hierarchy.
> */
> @@ -322,6 +395,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
> user_fault, write_fault,
> dirty, ptwrite, level,
> gw->gfn, pfn, false, true);
> + FNAME(pte_prefetch)(vcpu, sptep);
> break;
> }

I'm afraid this can introduce regressions, since it increases mmu_lock
contention. Can you get some numbers with a 4-vcpu or 8-vcpu guest running
multi-threaded benchmarks, such as kernbench and apachebench (on
non-EPT)?

Also, prefetch should be disabled for EPT, due to the lack of an accessed bit.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

Next message: Gustavo F. Padovan: "Re: [PATCH] Bluetooth: Fix abuse of the preincrement operator"
Previous message: Bernd Petrovitsch: "Re: [PATCH 5/5]bluetooth:hci_bcsp Fix operation on'bcsp->msgq_txseq' may be undefined"
In reply to: Xiao Guangrong: "[PATCH v2 8/10] KVM: MMU: prefetch ptes when intercepted guest #PF"
Next in thread: Xiao Guangrong: "Re: [PATCH v2 8/10] KVM: MMU: prefetch ptes when intercepted guest#PF"
Messages sorted by: [ date ] [ thread ] [ subject ] [ author ]