[PATCH] KVM: VMX: Sketch in possible framework for eliding TLB flushes on pCPU migration
From: Sean Christopherson
Date: Tue Aug 05 2025 - 18:58:13 EST
Not-Signed-off-by: Sean Christopherson <seanjc@xxxxxxxxxx>
(anyone that makes this work deserves full credit)
Not-yet-Signed-off-by: Jeremi Piotrowski <jpiotrowski@xxxxxxxxxxxxxxxxxxx>
---
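The idea, roughly: give each EPT root a cpumask tracking which
physical CPUs have flushed that root's TLB entries.  On vCPU
migration, test-and-set the new pCPU in each valid root's mask and
INVEPT only when the pCPU hasn't been seen before, instead of
flushing unconditionally.

Note, the diff calls a vmx_flush_tlb_ept_root() helper that isn't
part of this patch (presumably introduced earlier in the series).  A
minimal sketch of what such a helper might look like, as a guess
only: single-context INVEPT keyed on the root HPA, with stand-in EPTP
attribute bits (a real version would build the EPTP the way
construct_eptp() does, e.g. to handle 5-level walks and A/D bits):

static void vmx_flush_tlb_ept_root(hpa_t root_hpa)
{
	/*
	 * Single-context INVEPT matches mappings by bits 51:12 of the
	 * EPTP, but the descriptor still needs a valid memory type and
	 * page-walk length, hence the WB + 4-level placeholders.
	 */
	ept_sync_context(root_hpa | VMX_EPTP_MT_WB | VMX_EPTP_PWL_4);
}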
 arch/x86/include/asm/kvm-x86-ops.h |  1 +
 arch/x86/include/asm/kvm_host.h    |  3 +++
 arch/x86/kvm/mmu/mmu.c             |  5 +++++
 arch/x86/kvm/mmu/mmu_internal.h    |  4 ++++
 arch/x86/kvm/mmu/tdp_mmu.c         |  4 ++++
 arch/x86/kvm/vmx/main.c            |  1 +
 arch/x86/kvm/vmx/vmx.c             | 28 +++++++++++++++++++++-------
 arch/x86/kvm/vmx/x86_ops.h         |  1 +
 8 files changed, 40 insertions(+), 7 deletions(-)
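At least two gaps remain in this sketch (hence the Not-Signed-off-by).
Nothing ever clears a root's cpu_flushed_mask: once a pCPU is marked,
a later zap+flush of the root's TLB entries leaves the mask stale, and
migrating back to that pCPU would elide a flush that is now needed.
And WARN_ON_ONCE()'ing a failed zalloc_cpumask_var() doesn't prevent
cpumask_test_and_set_cpu() from dereferencing a NULL mask when
CONFIG_CPUMASK_OFFSTACK=y.  A hypothetical fragment for the first gap,
assuming the mask is reset wherever KVM flushes a root on all pCPUs:

	/*
	 * Hypothetical: reset the tracking when the root's TLB entries
	 * are invalidated everywhere, so the next migration to any
	 * pCPU flushes again.
	 */
	if (cpumask_available(root->cpu_flushed_mask))
		cpumask_clear(root->cpu_flushed_mask);
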
diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h
index 8d50e3e0a19b..60351dd22f2f 100644
--- a/arch/x86/include/asm/kvm-x86-ops.h
+++ b/arch/x86/include/asm/kvm-x86-ops.h
@@ -99,6 +99,7 @@ KVM_X86_OP_OPTIONAL(link_external_spt)
 KVM_X86_OP_OPTIONAL(set_external_spte)
 KVM_X86_OP_OPTIONAL(free_external_spt)
 KVM_X86_OP_OPTIONAL(remove_external_spte)
+KVM_X86_OP_OPTIONAL(alloc_root_cpu_mask)
 KVM_X86_OP(has_wbinvd_exit)
 KVM_X86_OP(get_l2_tsc_offset)
 KVM_X86_OP(get_l2_tsc_multiplier)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index b4a391929cdb..a3d415c3ea8b 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1801,6 +1801,9 @@ struct kvm_x86_ops {
 	void (*load_mmu_pgd)(struct kvm_vcpu *vcpu, hpa_t root_hpa,
 			     int root_level);
 
+	/* Allocate per-root pCPU flush mask. */
+	void (*alloc_root_cpu_mask)(struct kvm_mmu_page *root);
+
 	/* Update external mapping with page table link. */
 	int (*link_external_spt)(struct kvm *kvm, gfn_t gfn, enum pg_level level,
 				 void *external_spt);
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 4e06e2e89a8f..721ee8ea76bd 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -20,6 +20,7 @@
#include "ioapic.h"
#include "mmu.h"
#include "mmu_internal.h"
+#include <linux/cpumask.h>
#include "tdp_mmu.h"
#include "x86.h"
#include "kvm_cache_regs.h"
@@ -1820,6 +1821,7 @@ static void kvm_mmu_free_shadow_page(struct kvm_mmu_page *sp)
 	list_del(&sp->link);
 	free_page((unsigned long)sp->spt);
 	free_page((unsigned long)sp->shadowed_translation);
+	free_cpumask_var(sp->cpu_flushed_mask);
 	kmem_cache_free(mmu_page_header_cache, sp);
 }
 
@@ -3827,6 +3829,9 @@ static hpa_t mmu_alloc_root(struct kvm_vcpu *vcpu, gfn_t gfn, int quadrant,
 	sp = kvm_mmu_get_shadow_page(vcpu, gfn, role);
 	++sp->root_count;
 
+	if (level >= PT64_ROOT_4LEVEL)
+		kvm_x86_call(alloc_root_cpu_mask)(sp);
+
 	return __pa(sp->spt);
 }
 
diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h
index db8f33e4de62..5acb3dd34b36 100644
--- a/arch/x86/kvm/mmu/mmu_internal.h
+++ b/arch/x86/kvm/mmu/mmu_internal.h
@@ -7,6 +7,7 @@
 #include <asm/kvm_host.h>
 
 #include "mmu.h"
+#include <linux/cpumask.h>
 
 #ifdef CONFIG_KVM_PROVE_MMU
 #define KVM_MMU_WARN_ON(x) WARN_ON_ONCE(x)
@@ -145,6 +146,9 @@ struct kvm_mmu_page {
 	/* Used for freeing the page asynchronously if it is a TDP MMU page. */
 	struct rcu_head rcu_head;
 #endif
+
+	/* Mask tracking which host CPUs have flushed this EPT root */
+	cpumask_var_t cpu_flushed_mask;
 };
 
 extern struct kmem_cache *mmu_page_header_cache;
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 7f3d7229b2c1..40c7f46f553c 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -3,6 +3,7 @@
#include "mmu.h"
#include "mmu_internal.h"
+#include <linux/cpumask.h>
#include "mmutrace.h"
#include "tdp_iter.h"
#include "tdp_mmu.h"
@@ -57,6 +58,7 @@ static void tdp_mmu_free_sp(struct kvm_mmu_page *sp)
 {
 	free_page((unsigned long)sp->external_spt);
 	free_page((unsigned long)sp->spt);
+	free_cpumask_var(sp->cpu_flushed_mask);
 	kmem_cache_free(mmu_page_header_cache, sp);
 }
 
@@ -293,6 +295,8 @@ void kvm_tdp_mmu_alloc_root(struct kvm_vcpu *vcpu, bool mirror)
 	root = tdp_mmu_alloc_sp(vcpu);
 	tdp_mmu_init_sp(root, NULL, 0, role);
 
+	kvm_x86_call(alloc_root_cpu_mask)(root);
+
 	/*
 	 * TDP MMU roots are kept until they are explicitly invalidated, either
 	 * by a memslot update or by the destruction of the VM.  Initialize the
diff --git a/arch/x86/kvm/vmx/main.c b/arch/x86/kvm/vmx/main.c
index d1e02e567b57..ec7f6899443d 100644
--- a/arch/x86/kvm/vmx/main.c
+++ b/arch/x86/kvm/vmx/main.c
@@ -1005,6 +1005,7 @@ struct kvm_x86_ops vt_x86_ops __initdata = {
 	.write_tsc_multiplier = vt_op(write_tsc_multiplier),
 
 	.load_mmu_pgd = vt_op(load_mmu_pgd),
+	.alloc_root_cpu_mask = vmx_alloc_root_cpu_mask,
 
 	.check_intercept = vmx_check_intercept,
 	.handle_exit_irqoff = vmx_handle_exit_irqoff,
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index eec2d866e7f1..a6d93624c2d4 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -28,6 +28,7 @@
 #include <linux/slab.h>
 #include <linux/tboot.h>
 #include <linux/trace_events.h>
+#include <linux/cpumask.h>
 #include <linux/entry-kvm.h>
 
 #include <asm/apic.h>
@@ -62,6 +63,7 @@
#include "kvm_cache_regs.h"
#include "lapic.h"
#include "mmu.h"
+#include "mmu/spte.h"
#include "nested.h"
#include "pmu.h"
#include "sgx.h"
@@ -1450,7 +1452,7 @@ static void shrink_ple_window(struct kvm_vcpu *vcpu)
 	}
 }
 
-static void vmx_flush_ept_on_pcpu_migration(struct kvm_mmu *mmu);
+static void vmx_flush_ept_on_pcpu_migration(struct kvm_mmu *mmu, int cpu);
 
 void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu)
 {
@@ -1489,8 +1491,8 @@ void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu)
 	 * TLB entries from its previous association with the vCPU.
 	 */
 	if (enable_ept) {
-		vmx_flush_ept_on_pcpu_migration(&vcpu->arch.root_mmu);
-		vmx_flush_ept_on_pcpu_migration(&vcpu->arch.guest_mmu);
+		vmx_flush_ept_on_pcpu_migration(&vcpu->arch.root_mmu, cpu);
+		vmx_flush_ept_on_pcpu_migration(&vcpu->arch.guest_mmu, cpu);
 	} else {
 		kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
 	}
@@ -3307,22 +3309,34 @@ void vmx_flush_tlb_guest(struct kvm_vcpu *vcpu)
 	vpid_sync_context(vmx_get_current_vpid(vcpu));
 }
 
-static void __vmx_flush_ept_on_pcpu_migration(hpa_t root_hpa)
+void vmx_alloc_root_cpu_mask(struct kvm_mmu_page *root)
 {
+	WARN_ON_ONCE(!zalloc_cpumask_var(&root->cpu_flushed_mask,
+					 GFP_KERNEL_ACCOUNT));
+}
+
+static void __vmx_flush_ept_on_pcpu_migration(hpa_t root_hpa, int cpu)
+{
+	struct kvm_mmu_page *root;
+
 	if (!VALID_PAGE(root_hpa))
 		return;
 
+	root = root_to_sp(root_hpa);
+	if (!root || cpumask_test_and_set_cpu(cpu, root->cpu_flushed_mask))
+		return;
+
 	vmx_flush_tlb_ept_root(root_hpa);
 }
 
-static void vmx_flush_ept_on_pcpu_migration(struct kvm_mmu *mmu)
+static void vmx_flush_ept_on_pcpu_migration(struct kvm_mmu *mmu, int cpu)
 {
 	int i;
 
-	__vmx_flush_ept_on_pcpu_migration(mmu->root.hpa);
+	__vmx_flush_ept_on_pcpu_migration(mmu->root.hpa, cpu);
 
 	for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
-		__vmx_flush_ept_on_pcpu_migration(mmu->prev_roots[i].hpa);
+		__vmx_flush_ept_on_pcpu_migration(mmu->prev_roots[i].hpa, cpu);
 }
 
 void vmx_ept_load_pdptrs(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/kvm/vmx/x86_ops.h b/arch/x86/kvm/vmx/x86_ops.h
index b4596f651232..4406d53e6ebe 100644
--- a/arch/x86/kvm/vmx/x86_ops.h
+++ b/arch/x86/kvm/vmx/x86_ops.h
@@ -84,6 +84,7 @@ void vmx_flush_tlb_all(struct kvm_vcpu *vcpu);
 void vmx_flush_tlb_current(struct kvm_vcpu *vcpu);
 void vmx_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t addr);
 void vmx_flush_tlb_guest(struct kvm_vcpu *vcpu);
+void vmx_alloc_root_cpu_mask(struct kvm_mmu_page *root);
 void vmx_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask);
 u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu);
 void vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall);
--
2.39.5