[RFC PATCH v6 057/104] KVM: x86/mmu: steal software usable bit to record if GFN is for shared or not

From: isaku . yamahata
Date: Thu May 05 2022 - 14:24:07 EST


From: Isaku Yamahata <isaku.yamahata@xxxxxxxxx>

With TDX, all GFNs are private at guest boot time. At run time, the guest TD
can explicitly change a GFN from private to shared, or vice-versa, by the
MapGPA hypercall. Once specified, the given GFN can't be used as the other
type. That is, if a guest tells KVM that the GFN is shared, it can't be used
as private, or vice-versa.

Steal a software-usable bit, SPTE_SHARED_MASK, from the MMIO generation
counter to record it. Use the SPTE_SHARED_MASK bit in the shared or private
EPT to determine which mapping, shared or private, is allowed. If the
requested mapping isn't allowed, return RET_PF_RETRY to wait for another vcpu
to change it. The bit is recorded in both the shared and private shadow pages
to avoid traversing one more shadow page when resolving a KVM page fault.

The bit needs to be kept across zapping of the EPT entry. Currently the EPT
entry is unconditionally initialized to SHADOW_NONPRESENT_VALUE, which clears
the SPTE_SHARED_MASK bit. To carry the SPTE_SHARED_MASK bit over, introduce a
helper function that returns the initial value for a zapped entry with the
SPTE_SHARED_MASK bit preserved. Replace SHADOW_NONPRESENT_VALUE with it.

Signed-off-by: Isaku Yamahata <isaku.yamahata@xxxxxxxxx>
---
arch/x86/kvm/mmu/spte.h | 17 +++++++---
arch/x86/kvm/mmu/tdp_mmu.c | 65 ++++++++++++++++++++++++++++++++------
2 files changed, 68 insertions(+), 14 deletions(-)

diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h
index 1ac2a7a91166..d97ffe440536 100644
--- a/arch/x86/kvm/mmu/spte.h
+++ b/arch/x86/kvm/mmu/spte.h
@@ -14,6 +14,9 @@
*/
#define SPTE_MMU_PRESENT_MASK BIT_ULL(11)

+/* Mask used to track whether a GPA is shared */
+#define SPTE_SHARED_MASK BIT_ULL(62)
+
/*
* TDP SPTES (more specifically, EPT SPTEs) may not have A/D bits, and may also
* be restricted to using write-protection (for L2 when CPU dirty logging, i.e.
@@ -104,7 +107,7 @@ static_assert(!(EPT_SPTE_MMU_WRITABLE & SHADOW_ACC_TRACK_SAVED_MASK));
* the memslots generation and is derived as follows:
*
* Bits 0-7 of the MMIO generation are propagated to spte bits 3-10
- * Bits 8-18 of the MMIO generation are propagated to spte bits 52-62
+ * Bits 8-17 of the MMIO generation are propagated to spte bits 52-61
*
* The KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS flag is intentionally not included in
* the MMIO generation number, as doing so would require stealing a bit from
@@ -118,7 +121,7 @@ static_assert(!(EPT_SPTE_MMU_WRITABLE & SHADOW_ACC_TRACK_SAVED_MASK));
#define MMIO_SPTE_GEN_LOW_END 10

#define MMIO_SPTE_GEN_HIGH_START 52
-#define MMIO_SPTE_GEN_HIGH_END 62
+#define MMIO_SPTE_GEN_HIGH_END 61

#define MMIO_SPTE_GEN_LOW_MASK GENMASK_ULL(MMIO_SPTE_GEN_LOW_END, \
MMIO_SPTE_GEN_LOW_START)
@@ -131,7 +134,7 @@ static_assert(!(SPTE_MMU_PRESENT_MASK &
#define MMIO_SPTE_GEN_HIGH_BITS (MMIO_SPTE_GEN_HIGH_END - MMIO_SPTE_GEN_HIGH_START + 1)

/* remember to adjust the comment above as well if you change these */
-static_assert(MMIO_SPTE_GEN_LOW_BITS == 8 && MMIO_SPTE_GEN_HIGH_BITS == 11);
+static_assert(MMIO_SPTE_GEN_LOW_BITS == 8 && MMIO_SPTE_GEN_HIGH_BITS == 10);

#define MMIO_SPTE_GEN_LOW_SHIFT (MMIO_SPTE_GEN_LOW_START - 0)
#define MMIO_SPTE_GEN_HIGH_SHIFT (MMIO_SPTE_GEN_HIGH_START - MMIO_SPTE_GEN_LOW_BITS)
@@ -208,6 +211,7 @@ extern u64 __read_mostly shadow_nonpresent_or_rsvd_mask;
/* Removed SPTEs must not be misconstrued as shadow present PTEs. */
static_assert(!(__REMOVED_SPTE & SPTE_MMU_PRESENT_MASK));
static_assert(!(__REMOVED_SPTE & SHADOW_NONPRESENT_VALUE));
+static_assert(!(__REMOVED_SPTE & SPTE_SHARED_MASK));

/*
* See above comment around __REMOVED_SPTE. REMOVED_SPTE is the actual
@@ -217,7 +221,12 @@ static_assert(!(__REMOVED_SPTE & SHADOW_NONPRESENT_VALUE));

static inline bool is_removed_spte(u64 spte)
{
- return spte == REMOVED_SPTE;
+ return (spte & ~SPTE_SHARED_MASK) == REMOVED_SPTE;
+}
+
+static inline u64 spte_shared_mask(u64 spte)
+{
+ return spte & SPTE_SHARED_MASK;
}

/*
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 2aa2cb8a9b05..1d7642a0acc9 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -736,6 +736,11 @@ static inline int tdp_mmu_set_spte_atomic(struct kvm *kvm,
return 0;
}

+static u64 shadow_nonpresent_spte(u64 old_spte)
+{
+ return SHADOW_NONPRESENT_VALUE | spte_shared_mask(old_spte);
+}
+
static inline int tdp_mmu_zap_spte_atomic(struct kvm *kvm,
struct tdp_iter *iter)
{
@@ -770,7 +775,8 @@ static inline int tdp_mmu_zap_spte_atomic(struct kvm *kvm,
* SHADOW_NONPRESENT_VALUE (which sets "suppress #VE" bit) so it
* can be set when EPT table entries are zapped.
*/
- kvm_tdp_mmu_write_spte(iter->sptep, SHADOW_NONPRESENT_VALUE);
+ kvm_tdp_mmu_write_spte(iter->sptep,
+ shadow_nonpresent_spte(iter->old_spte));

return 0;
}
@@ -948,8 +954,11 @@ static void __tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
continue;

if (!shared)
- tdp_mmu_set_spte(kvm, &iter, SHADOW_NONPRESENT_VALUE);
- else if (tdp_mmu_set_spte_atomic(kvm, &iter, SHADOW_NONPRESENT_VALUE))
+ tdp_mmu_set_spte(kvm, &iter,
+ shadow_nonpresent_spte(iter.old_spte));
+ else if (tdp_mmu_set_spte_atomic(
+ kvm, &iter,
+ shadow_nonpresent_spte(iter.old_spte)))
goto retry;
}
}
@@ -1006,7 +1015,8 @@ bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
return false;

__tdp_mmu_set_spte(kvm, kvm_mmu_page_as_id(sp), sp->ptep, old_spte,
- SHADOW_NONPRESENT_VALUE, sp->gfn, sp->role.level + 1,
+ shadow_nonpresent_spte(old_spte),
+ sp->gfn, sp->role.level + 1,
true, true, is_private_sp(sp));

return true;
@@ -1048,11 +1058,20 @@ static bool tdp_mmu_zap_leafs(struct kvm *kvm, struct kvm_mmu_page *root,
continue;
}

+ /*
+ * SPTE_SHARED_MASK is stored at 4K granularity. The
+ * information is lost if an upper level SPTE page is deleted.
+ * TODO: support large page.
+ */
+ if (kvm_gfn_shared_mask(kvm) && iter.level > PG_LEVEL_4K)
+ continue;
+
if (!is_shadow_present_pte(iter.old_spte) ||
!is_last_spte(iter.old_spte, iter.level))
continue;

- tdp_mmu_set_spte(kvm, &iter, SHADOW_NONPRESENT_VALUE);
+ tdp_mmu_set_spte(kvm, &iter,
+ shadow_nonpresent_spte(iter.old_spte));
flush = true;
}

@@ -1168,18 +1187,44 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu,
gfn_t gfn_unalias = iter->gfn & ~kvm_gfn_shared_mask(vcpu->kvm);

WARN_ON(sp->role.level != fault->goal_level);
+ WARN_ON(is_private_sptep(iter->sptep) != fault->is_private);

- /* TDX shared GPAs are no executable, enforce this for the SDV. */
- if (kvm_gfn_shared_mask(vcpu->kvm) && !fault->is_private)
- pte_access &= ~ACC_EXEC_MASK;
+ if (kvm_gfn_shared_mask(vcpu->kvm)) {
+ if (fault->is_private) {
+ /*
+ * SPTE allows only RWX mapping. The PFN can't be mapped
+ * as READONLY in GPA.
+ */
+ if (fault->slot && !fault->map_writable)
+ return RET_PF_RETRY;
+ /*
+ * This GPA is not allowed to be mapped as private. Let the
+ * vcpu loop in page fault until another vcpu changes it
+ * by MapGPA hypercall.
+ */
+ if (fault->slot &&
+ spte_shared_mask(iter->old_spte))
+ return RET_PF_RETRY;
+ } else {
+ /* This GPA is not allowed to be mapped as shared. */
+ if (fault->slot &&
+ !spte_shared_mask(iter->old_spte))
+ return RET_PF_RETRY;
+ /* TDX shared GPAs are not executable, enforce this. */
+ pte_access &= ~ACC_EXEC_MASK;
+ }
+ }

if (unlikely(!fault->slot))
new_spte = make_mmio_spte(vcpu, gfn_unalias, pte_access);
- else
+ else {
wrprot = make_spte(vcpu, sp, fault->slot, pte_access,
gfn_unalias, fault->pfn, iter->old_spte,
fault->prefetch, true, fault->map_writable,
&new_spte);
+ if (spte_shared_mask(iter->old_spte))
+ new_spte |= SPTE_SHARED_MASK;
+ }

if (new_spte == iter->old_spte)
ret = RET_PF_SPURIOUS;
@@ -1488,7 +1533,7 @@ static bool set_spte_gfn(struct kvm *kvm, struct tdp_iter *iter,
* invariant that the PFN of a present * leaf SPTE can never change.
* See __handle_changed_spte().
*/
- tdp_mmu_set_spte(kvm, iter, SHADOW_NONPRESENT_VALUE);
+ tdp_mmu_set_spte(kvm, iter, shadow_nonpresent_spte(iter->old_spte));

if (!pte_write(range->pte)) {
new_spte = kvm_mmu_changed_pte_notifier_make_spte(iter->old_spte,
--
2.25.1