Re: [Patch] KVM: x86/mmu: Make optimized __handle_changed_spte() for clear dirty log

From: Sean Christopherson
Date: Mon Jan 30 2023 - 18:49:57 EST


On Mon, Jan 30, 2023, David Matlack wrote:
> On Wed, Jan 25, 2023 at 5:49 PM Sean Christopherson <seanjc@xxxxxxxxxx> wrote:
> [...]
> > ---
> > arch/x86/kvm/mmu/tdp_mmu.c | 92 ++++++++++----------------------------
> > 1 file changed, 24 insertions(+), 68 deletions(-)
> >
> > diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
> > index bba33aea0fb0..2f78ca43a276 100644
> > --- a/arch/x86/kvm/mmu/tdp_mmu.c
> > +++ b/arch/x86/kvm/mmu/tdp_mmu.c
> [...]
> > @@ -1289,8 +1244,7 @@ static bool age_gfn_range(struct kvm *kvm, struct tdp_iter *iter,
> > new_spte = mark_spte_for_access_track(new_spte);
> > }
> >
> > - tdp_mmu_set_spte_no_acc_track(kvm, iter, new_spte);
> > -
> > + kvm_tdp_mmu_write_spte(iter->sptep, iter->old_spte, new_spte, iter->level);
>
> This can race with fast_page_fault() setting the W-bit and the CPU
> setting the D-bit. i.e. This call to kvm_tdp_mmu_write_spte() could
> clear the W-bit or D-bit.

Ugh, right. Hrm. Duh, I just didn't go far enough. A straight XCHG is silly.
Except for the access-tracking mess, KVM only wants to clear a single bit.
Writing back the entire SPTE, and potentially clobbering concurrently-set bits,
is wasteful and unnecessarily dangerous. And the access-tracking code already
needs special handling anyway.
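
Spelled out in code, the window David is describing looks roughly like this
(purely illustrative, not a real call site; the function name is made up):

static void clear_dirty_bit_racy(struct tdp_iter *iter)
{
	/* Snapshot the whole SPTE... */
	u64 old_spte = kvm_tdp_mmu_read_spte(iter->sptep);

	/*
	 * ...fast_page_fault() sets the W-bit, or the CPU sets the D-bit,
	 * in *iter->sptep right here...
	 */

	/* ...and writing back the stale snapshot wipes out the new W/D bit. */
	__kvm_tdp_mmu_write_spte(iter->sptep, old_spte & ~shadow_dirty_mask);
}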

We can just simplify this all by adding a helper to clear a single bit (and
maybe even use clear_bit() and __clear_bit() if we save the bit number for the
W/A/D bits and not just the mask). And if it weren't for EPT (different A/D
locations), we could even use static asserts to restrict the usage to the W/A/D
bits :-/ Oh well.
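
If the bit numbers were tracked alongside the masks, the clear_bit() flavor
might look something like the following (a rough, equally untested sketch; the
helper name is made up, and the mask-based variant below avoids needing the
shifts at all):

static inline void kvm_tdp_mmu_clear_spte_bit_nr(struct tdp_iter *iter,
						 unsigned int bit_nr)
{
	/* SPTEs are u64s; treating them as unsigned longs is fine on x86-64. */
	unsigned long *sptep = (unsigned long *)rcu_dereference(iter->sptep);

	if (kvm_tdp_mmu_spte_has_volatile_bits(iter->old_spte, iter->level))
		clear_bit(bit_nr, sptep);	/* atomic RMW of just this bit */
	else
		__clear_bit(bit_nr, sptep);	/* non-atomic is safe here */
}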

E.g. this

static inline void kvm_tdp_mmu_clear_spte_bit(struct tdp_iter *iter, u64 mask)
{
	if (kvm_tdp_mmu_spte_has_volatile_bits(iter->old_spte, iter->level))
		atomic64_and(~mask, (atomic64_t *)rcu_dereference(iter->sptep));
	else
		__kvm_tdp_mmu_write_spte(iter->sptep, iter->old_spte & ~mask);
}

yields another untested patch:

---
arch/x86/kvm/mmu/tdp_iter.h | 24 +++++++++++++++++++-----
arch/x86/kvm/mmu/tdp_mmu.c | 33 +++++++++++++--------------------
2 files changed, 32 insertions(+), 25 deletions(-)

diff --git a/arch/x86/kvm/mmu/tdp_iter.h b/arch/x86/kvm/mmu/tdp_iter.h
index f0af385c56e0..09c8f2640ccc 100644
--- a/arch/x86/kvm/mmu/tdp_iter.h
+++ b/arch/x86/kvm/mmu/tdp_iter.h
@@ -29,11 +29,10 @@ static inline void __kvm_tdp_mmu_write_spte(tdp_ptep_t sptep, u64 new_spte)
WRITE_ONCE(*rcu_dereference(sptep), new_spte);
}

-static inline u64 kvm_tdp_mmu_write_spte(tdp_ptep_t sptep, u64 old_spte,
- u64 new_spte, int level)
+static inline bool kvm_tdp_mmu_spte_has_volatile_bits(u64 old_spte, int level)
{
/*
- * Atomically write the SPTE if it is a shadow-present, leaf SPTE with
+ * An SPTE must be written atomically if it is a shadow-present, leaf SPTE with
* volatile bits, i.e. has bits that can be set outside of mmu_lock.
* The Writable bit can be set by KVM's fast page fault handler, and
* Accessed and Dirty bits can be set by the CPU.
@@ -44,8 +43,15 @@ static inline u64 kvm_tdp_mmu_write_spte(tdp_ptep_t sptep, u64 old_spte,
* logic needs to be reassessed if KVM were to use non-leaf Accessed
* bits, e.g. to skip stepping down into child SPTEs when aging SPTEs.
*/
- if (is_shadow_present_pte(old_spte) && is_last_spte(old_spte, level) &&
- spte_has_volatile_bits(old_spte))
+ return is_shadow_present_pte(old_spte) &&
+ is_last_spte(old_spte, level) &&
+ spte_has_volatile_bits(old_spte);
+}
+
+static inline u64 kvm_tdp_mmu_write_spte(tdp_ptep_t sptep, u64 old_spte,
+ u64 new_spte, int level)
+{
+ if (kvm_tdp_mmu_spte_has_volatile_bits(old_spte, level))
return kvm_tdp_mmu_write_spte_atomic(sptep, new_spte);

__kvm_tdp_mmu_write_spte(sptep, new_spte);
@@ -115,4 +121,12 @@ void tdp_iter_start(struct tdp_iter *iter, struct kvm_mmu_page *root,
void tdp_iter_next(struct tdp_iter *iter);
void tdp_iter_restart(struct tdp_iter *iter);

+static inline void kvm_tdp_mmu_clear_spte_bit(struct tdp_iter *iter, u64 mask)
+{
+ if (kvm_tdp_mmu_spte_has_volatile_bits(iter->old_spte, iter->level))
+ atomic64_and(~mask, (atomic64_t *)rcu_dereference(iter->sptep));
+ else
+ __kvm_tdp_mmu_write_spte(iter->sptep, iter->old_spte & ~mask);
+}
+
#endif /* __KVM_X86_MMU_TDP_ITER_H */
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 2f78ca43a276..32a7209a522d 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -1223,28 +1223,26 @@ static __always_inline bool kvm_tdp_mmu_handle_gfn(struct kvm *kvm,
static bool age_gfn_range(struct kvm *kvm, struct tdp_iter *iter,
struct kvm_gfn_range *range)
{
- u64 new_spte = 0;
+ u64 new_spte;

/* If we have a non-accessed entry we don't need to change the pte. */
if (!is_accessed_spte(iter->old_spte))
return false;

- new_spte = iter->old_spte;
-
- if (spte_ad_enabled(new_spte)) {
- new_spte &= ~shadow_accessed_mask;
+ if (spte_ad_enabled(iter->old_spte)) {
+ kvm_tdp_mmu_clear_spte_bit(iter, shadow_accessed_mask);
} else {
+ new_spte = mark_spte_for_access_track(iter->old_spte);
+ iter->old_spte = kvm_tdp_mmu_write_spte(iter->sptep, iter->old_spte,
+ new_spte, iter->level);
/*
* Capture the dirty status of the page, so that it doesn't get
* lost when the SPTE is marked for access tracking.
*/
- if (is_writable_pte(new_spte))
- kvm_set_pfn_dirty(spte_to_pfn(new_spte));
-
- new_spte = mark_spte_for_access_track(new_spte);
+ if (is_writable_pte(iter->old_spte))
+ kvm_set_pfn_dirty(spte_to_pfn(iter->old_spte));
}

- kvm_tdp_mmu_write_spte(iter->sptep, iter->old_spte, new_spte, iter->level);
return true;
}

@@ -1632,7 +1630,6 @@ static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root,
gfn_t gfn, unsigned long mask, bool wrprot)
{
struct tdp_iter iter;
- u64 new_spte;

rcu_read_lock();

@@ -1648,20 +1645,16 @@ static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root,
mask &= ~(1UL << (iter.gfn - gfn));

if (wrprot || spte_ad_need_write_protect(iter.old_spte)) {
- if (is_writable_pte(iter.old_spte))
- new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
- else
+ if (!is_writable_pte(iter.old_spte))
continue;
+
+ kvm_tdp_mmu_clear_spte_bit(&iter, PT_WRITABLE_MASK);
} else {
- if (iter.old_spte & shadow_dirty_mask)
- new_spte = iter.old_spte & ~shadow_dirty_mask;
- else
+ if (!(iter.old_spte & shadow_dirty_mask))
continue;

- kvm_set_pfn_dirty(spte_to_pfn(iter.old_spte));
+ kvm_tdp_mmu_clear_spte_bit(&iter, shadow_dirty_mask);
}
-
- kvm_tdp_mmu_write_spte(iter.sptep, iter.old_spte, new_spte, iter.level);
}

rcu_read_unlock();

base-commit: 7bb67c88a2d77cc95524912f3b1ca51daf5c0224
--