[RFC PATCH v5 045/104] KVM: x86/tdp_mmu: make REMOVED_SPTE include shadow_initial value

From: isaku . yamahata
Date: Fri Mar 04 2022 - 15:09:55 EST


From: Isaku Yamahata <isaku.yamahata@xxxxxxxxx>

TDP MMU uses REMOVED_SPTE = 0x5a0ULL as special constant to indicate the
intermediate value to indicate one thread is operating on it and the value
should be semi-arbitrary value. For TDX (more correctly to use #VE), the
value should include suppress #VE value which is shadow_init_value.

Define SHADOW_REMOVED_SPTE as shadow_init_value | REMOVED_SPTE, and replace
REMOVED_SPTE with SHADOW_REMOVED_SPTE to use suppress #VE bit properly for
TDX.

Signed-off-by: Isaku Yamahata <isaku.yamahata@xxxxxxxxx>
---
arch/x86/kvm/mmu/spte.h | 14 ++++++++++++--
arch/x86/kvm/mmu/tdp_mmu.c | 23 ++++++++++++++++-------
2 files changed, 28 insertions(+), 9 deletions(-)

diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h
index bde843bce878..e88f796724b4 100644
--- a/arch/x86/kvm/mmu/spte.h
+++ b/arch/x86/kvm/mmu/spte.h
@@ -194,7 +194,9 @@ extern u64 __read_mostly shadow_nonpresent_or_rsvd_mask;
* If a thread running without exclusive control of the MMU lock must perform a
* multi-part operation on an SPTE, it can set the SPTE to REMOVED_SPTE as a
* non-present intermediate value. Other threads which encounter this value
- * should not modify the SPTE.
+ * should not modify the SPTE. When TDX is enabled, shadow_init_value, which
+ * is "suppress #VE" bit set, is also set to removed SPTE, because TDX module
+ * always enables "EPT violation #VE".
*
* Use a semi-arbitrary value that doesn't set RWX bits, i.e. is not-present on
* bot AMD and Intel CPUs, and doesn't set PFN bits, i.e. doesn't create a L1TF
@@ -207,9 +209,17 @@ extern u64 __read_mostly shadow_nonpresent_or_rsvd_mask;
/* Removed SPTEs must not be misconstrued as shadow present PTEs. */
static_assert(!(REMOVED_SPTE & SPTE_MMU_PRESENT_MASK));

+/*
+ * See above comment around REMOVED_SPTE. SHADOW_REMOVED_SPTE is the actual
+ * intermediate value set to the removed SPET. When TDX is enabled, it sets
+ * the "suppress #VE" bit, otherwise it's REMOVED_SPTE.
+ */
+extern u64 __read_mostly shadow_init_value;
+#define SHADOW_REMOVED_SPTE (shadow_init_value | REMOVED_SPTE)
+
static inline bool is_removed_spte(u64 spte)
{
- return spte == REMOVED_SPTE;
+ return spte == SHADOW_REMOVED_SPTE;
}

/*
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index ebd0a02620e8..b6ec2f112c26 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -338,7 +338,7 @@ static void handle_removed_tdp_mmu_page(struct kvm *kvm, tdp_ptep_t pt,
* value to the removed SPTE value.
*/
for (;;) {
- old_child_spte = xchg(sptep, REMOVED_SPTE);
+ old_child_spte = xchg(sptep, SHADOW_REMOVED_SPTE);
if (!is_removed_spte(old_child_spte))
break;
cpu_relax();
@@ -365,10 +365,10 @@ static void handle_removed_tdp_mmu_page(struct kvm *kvm, tdp_ptep_t pt,
* the two branches consistent and simplifies
* the function.
*/
- WRITE_ONCE(*sptep, REMOVED_SPTE);
+ WRITE_ONCE(*sptep, SHADOW_REMOVED_SPTE);
}
handle_changed_spte(kvm, kvm_mmu_page_as_id(sp), gfn,
- old_child_spte, REMOVED_SPTE, level,
+ old_child_spte, SHADOW_REMOVED_SPTE, level,
shared);
}

@@ -537,7 +537,7 @@ static inline bool tdp_mmu_zap_spte_atomic(struct kvm *kvm,
* immediately installing a present entry in its place
* before the TLBs are flushed.
*/
- if (!tdp_mmu_set_spte_atomic(kvm, iter, REMOVED_SPTE))
+ if (!tdp_mmu_set_spte_atomic(kvm, iter, SHADOW_REMOVED_SPTE))
return false;

kvm_flush_remote_tlbs_with_address(kvm, iter->gfn,
@@ -550,8 +550,16 @@ static inline bool tdp_mmu_zap_spte_atomic(struct kvm *kvm,
* special removed SPTE value. No bookkeeping is needed
* here since the SPTE is going from non-present
* to non-present.
+ *
+ * Set non-present value to shadow_init_value, rather than 0.
+ * It is because when TDX is enabled, TDX module always
+ * enables "EPT-violation #VE", so KVM needs to set
+ * "suppress #VE" bit in EPT table entries, in order to get
+ * real EPT violation, rather than TDVMCALL. KVM sets
+ * shadow_init_value (which sets "suppress #VE" bit) so it
+ * can be set when EPT table entries are zapped.
*/
- WRITE_ONCE(*rcu_dereference(iter->sptep), 0);
+ WRITE_ONCE(*rcu_dereference(iter->sptep), shadow_init_value);

return true;
}
@@ -748,7 +756,8 @@ static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
continue;

if (!shared) {
- tdp_mmu_set_spte(kvm, &iter, 0);
+ /* see comments in tdp_mmu_zap_spte_atomic() */
+ tdp_mmu_set_spte(kvm, &iter, shadow_init_value);
flush = true;
} else if (!tdp_mmu_zap_spte_atomic(kvm, &iter)) {
/*
@@ -1135,7 +1144,7 @@ static bool set_spte_gfn(struct kvm *kvm, struct tdp_iter *iter,
* invariant that the PFN of a present * leaf SPTE can never change.
* See __handle_changed_spte().
*/
- tdp_mmu_set_spte(kvm, iter, 0);
+ tdp_mmu_set_spte(kvm, iter, shadow_init_value);

if (!pte_write(range->pte)) {
new_spte = kvm_mmu_changed_pte_notifier_make_spte(iter->old_spte,
--
2.25.1