[PATCH 2/3] KVM: x86: fix nested guest live migration with PML

From: Paolo Bonzini
Date: Thu Sep 26 2019 - 13:18:46 EST


Shadow paging is fundamentally incompatible with the page-modification
log, because the GPAs in the log come from the wrong memory map.
In particular, for the EPT page-modification log, the GPAs in the log come
from L2 rather than L1. (If there were a non-EPT page-modification log,
we couldn't use it for shadow paging because it would log GVAs rather
than GPAs).

Therefore, we need to rely on write protection to record dirty pages.
This has the side effect of bypassing PML, since writes now result in an
EPT violation vmexit.
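
As an illustration only (not KVM code; the helper name and bit positions
below are placeholders standing in for spte_clear_dirty and the EPT
writable/dirty bits), the intended flow is roughly this: clearing the
dirty state of a nested-EPT SPTE drops write access instead of clearing
the D bit, so the next L2 write takes an EPT violation and the fault
path logs the correct L1 GPA.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define EPT_WRITABLE_BIT (1ULL << 1)	/* illustrative: EPT write permission */
#define EPT_DIRTY_BIT    (1ULL << 9)	/* illustrative: EPT D bit */

/* Stand-in for spte_clear_dirty(), called when the dirty log is harvested. */
static void clear_dirty_for_logging(uint64_t *sptep, bool nested_ept)
{
	if (nested_ept)
		*sptep &= ~EPT_WRITABLE_BIT;	/* next write -> EPT violation */
	else
		*sptep &= ~EPT_DIRTY_BIT;	/* D bit + PML keep working */
}

int main(void)
{
	uint64_t spte = EPT_WRITABLE_BIT | EPT_DIRTY_BIT;

	clear_dirty_for_logging(&spte, true);
	printf("nested EPT: writable=%d dirty=%d\n",
	       !!(spte & EPT_WRITABLE_BIT), !!(spte & EPT_DIRTY_BIT));
	return 0;
}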

This is relatively easy to add to KVM, because pretty much the only place
that needs changing is spte_clear_dirty. The first access to the page
already goes through the page fault path and records the correct GPA;
it's only subsequent accesses that are wrong. Therefore, we can equip
set_spte (where the first access happens) to record that the SPTE will
have to be write protected, and then spte_clear_dirty will use this
information to do the right thing.
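
For reference, here is a standalone (outside-of-KVM) illustration of the
bits 52:53 encoding added below; it mirrors the new helpers to show why
the comparisons flip from == to !=: a WRPROT_ONLY SPTE still counts as
A/D-enabled, but it additionally needs write protection when its dirty
bit would otherwise be cleared.

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

#define SPTE_SPECIAL_MASK        (3ULL << 52)
#define SPTE_AD_ENABLED_MASK     (0ULL << 52)
#define SPTE_AD_DISABLED_MASK    (1ULL << 52)
#define SPTE_AD_WRPROT_ONLY_MASK (2ULL << 52)

/* Same logic as the helpers in the diff, minus the MMIO sanity check. */
static bool spte_ad_enabled(uint64_t spte)
{
	return (spte & SPTE_SPECIAL_MASK) != SPTE_AD_DISABLED_MASK;
}

static bool spte_ad_need_write_protect(uint64_t spte)
{
	return (spte & SPTE_SPECIAL_MASK) != SPTE_AD_ENABLED_MASK;
}

int main(void)
{
	/* Ordinary dirty logging: D bit and PML are usable. */
	assert(spte_ad_enabled(SPTE_AD_ENABLED_MASK));
	assert(!spte_ad_need_write_protect(SPTE_AD_ENABLED_MASK));

	/* A/D bits not in use for this SPTE. */
	assert(!spte_ad_enabled(SPTE_AD_DISABLED_MASK));

	/* Nested EPT: still "A/D enabled", but clearing dirty write-protects. */
	assert(spte_ad_enabled(SPTE_AD_WRPROT_ONLY_MASK));
	assert(spte_ad_need_write_protect(SPTE_AD_WRPROT_ONLY_MASK));
	return 0;
}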

Signed-off-by: Paolo Bonzini <pbonzini@xxxxxxxxxx>
---
arch/x86/kvm/mmu.c | 32 ++++++++++++++++++++++++++++++--
1 file changed, 30 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index bac8d228d82b..722a3b5fc0db 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -92,6 +92,7 @@ enum {
 #define SPTE_SPECIAL_MASK (3ULL << 52)
 #define SPTE_AD_ENABLED_MASK (0ULL << 52)
 #define SPTE_AD_DISABLED_MASK (1ULL << 52)
+#define SPTE_AD_WRPROT_ONLY_MASK (2ULL << 52)
 #define SPTE_MMIO_MASK (3ULL << 52)
 
 #define PT64_LEVEL_BITS 9
@@ -328,10 +329,27 @@ static inline bool sp_ad_disabled(struct kvm_mmu_page *sp)
 	return sp->role.ad_disabled;
 }
 
+static inline bool kvm_vcpu_ad_need_write_protect(struct kvm_vcpu *vcpu)
+{
+	/*
+	 * When using the EPT page-modification log, the GPAs in the log
+	 * would come from L2 rather than L1. Therefore, we need to rely
+	 * on write protection to record dirty pages. This also bypasses
+	 * PML, since writes now result in a vmexit.
+	 */
+	return vcpu->arch.mmu == &vcpu->arch.guest_mmu;
+}
+
 static inline bool spte_ad_enabled(u64 spte)
 {
 	MMU_WARN_ON(is_mmio_spte(spte));
-	return (spte & SPTE_SPECIAL_MASK) == SPTE_AD_ENABLED_MASK;
+	return (spte & SPTE_SPECIAL_MASK) != SPTE_AD_DISABLED_MASK;
+}
+
+static inline bool spte_ad_need_write_protect(u64 spte)
+{
+	MMU_WARN_ON(is_mmio_spte(spte));
+	return (spte & SPTE_SPECIAL_MASK) != SPTE_AD_ENABLED_MASK;
 }
 
 static inline u64 spte_shadow_accessed_mask(u64 spte)
@@ -1597,8 +1615,11 @@ static bool spte_clear_dirty(u64 *sptep)
 
 	rmap_printk("rmap_clear_dirty: spte %p %llx\n", sptep, *sptep);
 
-	spte &= ~shadow_dirty_mask;
+	WARN_ON(!spte_ad_enabled(spte));
+	if (spte_ad_need_write_protect(spte))
+		return spte_write_protect(sptep, false);
 
+	spte &= ~shadow_dirty_mask;
 	return mmu_spte_update(sptep, spte);
 }

@@ -1639,6 +1660,11 @@ static bool spte_set_dirty(u64 *sptep)
 
 	rmap_printk("rmap_set_dirty: spte %p %llx\n", sptep, *sptep);
 
+	/*
+	 * Similar to the !kvm_x86_ops->slot_disable_log_dirty case,
+	 * do not bother adding back write access to pages marked
+	 * SPTE_AD_WRPROT_ONLY_MASK.
+	 */
 	spte |= shadow_dirty_mask;
 
 	return mmu_spte_update(sptep, spte);
@@ -2977,6 +3003,8 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 	sp = page_header(__pa(sptep));
 	if (sp_ad_disabled(sp))
 		spte |= SPTE_AD_DISABLED_MASK;
+	else if (kvm_vcpu_ad_need_write_protect(vcpu))
+		spte |= SPTE_AD_WRPROT_ONLY_MASK;
 
 	/*
 	 * For the EPT case, shadow_present_mask is 0 if hardware
--
1.8.3.1