[RFC PATCH 4/8] KVM: x86/mmu: Add infrastructure for pinning PFNs on demand

From: Sean Christopherson
Date: Fri Jul 31 2020 - 17:23:43 EST


Signed-off-by: Sean Christopherson <sean.j.christopherson@xxxxxxxxx>
---
arch/x86/include/asm/kvm_host.h | 7 ++
arch/x86/kvm/mmu/mmu.c | 111 ++++++++++++++++++++++++++------
2 files changed, 99 insertions(+), 19 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 1bab87a444d78..b14864f3e8e74 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1138,6 +1138,13 @@ struct kvm_x86_ops {

void (*load_mmu_pgd)(struct kvm_vcpu *vcpu, unsigned long cr3);

+ bool (*pin_spte)(struct kvm_vcpu *vcpu, gfn_t gfn, int level,
+ kvm_pfn_t pfn);
+ void (*drop_pinned_spte)(struct kvm *kvm, gfn_t gfn, int level,
+ kvm_pfn_t pfn);
+ void (*zap_pinned_spte)(struct kvm *kvm, gfn_t gfn, int level);
+ void (*unzap_pinned_spte)(struct kvm *kvm, gfn_t gfn, int level);
+
bool (*has_wbinvd_exit)(void);

/* Returns actual tsc_offset set in active VMCS */
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 182f398036248..cab3b2f2f49c3 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -133,6 +133,9 @@ module_param(dbg, bool, 0644);
#define SPTE_AD_WRPROT_ONLY_MASK (2ULL << 52)
#define SPTE_MMIO_MASK (3ULL << 52)

+/* Special SPTE flags that can only be used for non-MMIO SPTEs. */
+#define SPTE_PINNED_MASK BIT_ULL(62)
+
#define PT64_LEVEL_BITS 9

#define PT64_LEVEL_SHIFT(level) \
@@ -211,6 +214,7 @@ enum {
RET_PF_EMULATE = 1,
RET_PF_INVALID = 2,
RET_PF_FIXED = 3,
+ RET_PF_UNZAPPED = 4,
};

struct pte_list_desc {
@@ -635,6 +639,11 @@ static bool is_shadow_present_pte(u64 pte)
return __is_shadow_present_pte(pte) && !is_mmio_spte(pte);
}

+static bool is_pinned_pte(u64 pte)
+{
+ return !!(pte & SPTE_PINNED_MASK);
+}
+
static int is_large_pte(u64 pte)
{
return pte & PT_PAGE_SIZE_MASK;
@@ -937,15 +946,15 @@ static bool mmu_spte_update(u64 *sptep, u64 new_spte)
* state bits, it is used to clear the last level sptep.
* Returns the old PTE.
*/
-static u64 mmu_spte_clear_track_bits(u64 *sptep)
+static u64 __mmu_spte_clear_track_bits(u64 *sptep, u64 clear_value)
{
kvm_pfn_t pfn;
u64 old_spte = *sptep;

if (!spte_has_volatile_bits(old_spte))
- __update_clear_spte_fast(sptep, 0ull);
+ __update_clear_spte_fast(sptep, clear_value);
else
- old_spte = __update_clear_spte_slow(sptep, 0ull);
+ old_spte = __update_clear_spte_slow(sptep, clear_value);

if (!is_shadow_present_pte(old_spte))
return old_spte;
@@ -968,6 +977,11 @@ static u64 mmu_spte_clear_track_bits(u64 *sptep)
return old_spte;
}

+static inline u64 mmu_spte_clear_track_bits(u64 *sptep)
+{
+ return __mmu_spte_clear_track_bits(sptep, 0ull);
+}
+
/*
* Rules for using mmu_spte_clear_no_track:
* Directly clear spte without caring the state bits of sptep,
@@ -1399,7 +1413,7 @@ static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
return pte_list_add(vcpu, spte, rmap_head);
}

-static void rmap_remove(struct kvm *kvm, u64 *spte)
+static void rmap_remove(struct kvm *kvm, u64 *spte, u64 old_spte)
{
struct kvm_mmu_page *sp;
gfn_t gfn;
@@ -1409,6 +1423,10 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt);
rmap_head = gfn_to_rmap(kvm, gfn, sp);
__pte_list_remove(spte, rmap_head);
+
+ if (is_pinned_pte(old_spte))
+ kvm_x86_ops.drop_pinned_spte(kvm, gfn, sp->role.level - 1,
+ spte_to_pfn(old_spte));
}

/*
@@ -1446,7 +1464,7 @@ static u64 *rmap_get_first(struct kvm_rmap_head *rmap_head,
iter->pos = 0;
sptep = iter->desc->sptes[iter->pos];
out:
- BUG_ON(!is_shadow_present_pte(*sptep));
+ BUG_ON(!is_shadow_present_pte(*sptep) && !is_pinned_pte(*sptep));
return sptep;
}

@@ -1491,8 +1509,8 @@ static void drop_spte(struct kvm *kvm, u64 *sptep)
{
u64 old_spte = mmu_spte_clear_track_bits(sptep);

- if (is_shadow_present_pte(old_spte))
- rmap_remove(kvm, sptep);
+ if (is_shadow_present_pte(old_spte) || is_pinned_pte(old_spte))
+ rmap_remove(kvm, sptep, old_spte);
}


@@ -1730,17 +1748,49 @@ static bool rmap_write_protect(struct kvm_vcpu *vcpu, u64 gfn)
return kvm_mmu_slot_gfn_write_protect(vcpu->kvm, slot, gfn);
}

+static bool kvm_mmu_zap_pinned_spte(struct kvm *kvm, u64 *sptep)
+{
+ struct kvm_mmu_page *sp;
+ kvm_pfn_t pfn;
+ gfn_t gfn;
+
+ if (!(*sptep & SPTE_PINNED_MASK))
+ return false;
+
+ sp = sptep_to_sp(sptep);
+ gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt);
+ pfn = spte_to_pfn(*sptep);
+
+ if (kvm_x86_ops.zap_pinned_spte)
+ kvm_x86_ops.zap_pinned_spte(kvm, gfn, sp->role.level - 1);
+
+ __mmu_spte_clear_track_bits(sptep, SPTE_PINNED_MASK | pfn << PAGE_SHIFT);
+ return true;
+}
+
static bool kvm_zap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head)
{
u64 *sptep;
struct rmap_iterator iter;
bool flush = false;

- while ((sptep = rmap_get_first(rmap_head, &iter))) {
+restart:
+ for_each_rmap_spte(rmap_head, &iter, sptep) {
rmap_printk("%s: spte %p %llx.\n", __func__, sptep, *sptep);

+ if (!is_shadow_present_pte(*sptep)) {
+ WARN_ON_ONCE(!is_pinned_pte(*sptep));
+ continue;
+ }
+
+ flush = true;
+
+ /* Keep the rmap if the SPTE is pinned. */
+ if (kvm_mmu_zap_pinned_spte(kvm, sptep))
+ continue;
+
pte_list_remove(rmap_head, sptep);
- flush = true;
+ goto restart;
}

return flush;
@@ -1774,6 +1824,10 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,

need_flush = 1;

+ /* Pinned pages should not be relocated (obviously). */
+ if (WARN_ON_ONCE(is_pinned_pte(*sptep)))
+ continue;
+
if (pte_write(*ptep)) {
pte_list_remove(rmap_head, sptep);
goto restart;
@@ -2630,7 +2684,7 @@ static bool mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp,
struct kvm_mmu_page *child;

pte = *spte;
- if (is_shadow_present_pte(pte)) {
+ if (is_shadow_present_pte(pte) || is_pinned_pte(pte)) {
if (is_last_spte(pte, sp->role.level)) {
drop_spte(kvm, spte);
if (is_large_pte(pte))
@@ -2639,7 +2693,7 @@ static bool mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp,
child = to_shadow_page(pte & PT64_BASE_ADDR_MASK);
drop_parent_pte(child, spte);
}
- return true;
+ return is_shadow_present_pte(pte);
}

if (is_mmio_spte(pte))
@@ -2987,10 +3041,13 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
u64 spte = 0;
int ret = 0;
struct kvm_mmu_page *sp;
+ bool is_mmio_pfn;

if (set_mmio_spte(vcpu, sptep, gfn, pfn, pte_access))
return 0;

+ is_mmio_pfn = kvm_is_mmio_pfn(pfn);
+
sp = sptep_to_sp(sptep);
if (sp_ad_disabled(sp))
spte |= SPTE_AD_DISABLED_MASK;
@@ -3023,15 +3080,14 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
if (level > PG_LEVEL_4K)
spte |= PT_PAGE_SIZE_MASK;
if (tdp_enabled)
- spte |= kvm_x86_ops.get_mt_mask(vcpu, gfn,
- kvm_is_mmio_pfn(pfn));
+ spte |= kvm_x86_ops.get_mt_mask(vcpu, gfn, is_mmio_pfn);

if (host_writable)
spte |= SPTE_HOST_WRITEABLE;
else
pte_access &= ~ACC_WRITE_MASK;

- if (!kvm_is_mmio_pfn(pfn))
+ if (!is_mmio_pfn)
spte |= shadow_me_mask;

spte |= (u64)pfn << PAGE_SHIFT;
@@ -3065,6 +3121,12 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
if (speculative)
spte = mark_spte_for_access_track(spte);

+ if (is_pinned_pte(*sptep) ||
+ (vcpu->arch.mmu->direct_map && !is_mmio_pfn &&
+ kvm_x86_ops.pin_spte &&
+ kvm_x86_ops.pin_spte(vcpu, gfn, level, pfn)))
+ spte |= SPTE_PINNED_MASK;
+
set_pte:
if (mmu_spte_update(sptep, spte))
ret |= SET_SPTE_NEED_REMOTE_TLB_FLUSH;
@@ -3081,29 +3143,33 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
int set_spte_ret;
int ret = RET_PF_FIXED;
bool flush = false;
+ u64 pte = *sptep;

pgprintk("%s: spte %llx write_fault %d gfn %llx\n", __func__,
*sptep, write_fault, gfn);

- if (is_shadow_present_pte(*sptep)) {
+ if (is_shadow_present_pte(pte)) {
/*
* If we overwrite a PTE page pointer with a 2MB PMD, unlink
* the parent of the now unreachable PTE.
*/
- if (level > PG_LEVEL_4K && !is_large_pte(*sptep)) {
+ if (level > PG_LEVEL_4K && !is_large_pte(pte)) {
struct kvm_mmu_page *child;
- u64 pte = *sptep;

child = to_shadow_page(pte & PT64_BASE_ADDR_MASK);
drop_parent_pte(child, sptep);
flush = true;
- } else if (pfn != spte_to_pfn(*sptep)) {
+ } else if (pfn != spte_to_pfn(pte)) {
pgprintk("hfn old %llx new %llx\n",
- spte_to_pfn(*sptep), pfn);
+ spte_to_pfn(pte), pfn);
drop_spte(vcpu->kvm, sptep);
flush = true;
} else
was_rmapped = 1;
+ } else if (is_pinned_pte(pte)) {
+ WARN_ON_ONCE(pfn != spte_to_pfn(pte));
+ ret = RET_PF_UNZAPPED;
+ was_rmapped = 1;
}

set_spte_ret = set_spte(vcpu, sptep, pte_access, level, gfn, pfn,
@@ -3136,6 +3202,9 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
rmap_recycle(vcpu, sptep, gfn);
}

+ if (ret == RET_PF_UNZAPPED && kvm_x86_ops.unzap_pinned_spte)
+ kvm_x86_ops.unzap_pinned_spte(vcpu->kvm, gfn, level - 1);
+
return ret;
}

@@ -5921,6 +5990,10 @@ static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm,
sp = sptep_to_sp(sptep);
pfn = spte_to_pfn(*sptep);

+ /* Dirty logging of pinned pages is not supported. */
+ if (WARN_ON_ONCE(is_pinned_pte(*sptep)))
+ continue;
+
/*
* We cannot do huge page mapping for indirect shadow pages,
* which are found on the last rmap (level = 1) when not using
--
2.28.0