Re: [PATCH v2] KVM: x86: nSVM/nVMX: Fix handling triple fault on RSM instruction
From: Sean Christopherson
Date:  Fri Feb 09 2024 - 10:12:54 EST
On Thu, Feb 08, 2024, Paolo Bonzini wrote:
> On Thu, Feb 8, 2024 at 2:18 PM Wilczynski, Michal
> <michal.wilczynski@xxxxxxxxx> wrote:
> > Hi, I've tested the patch and it seems to work, both on Intel and AMD.
> > There was a problem with applying this chunk though:
> >
> > diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h
> > index ac8b7614e79d..3d18fa7db353 100644
> > --- a/arch/x86/include/asm/kvm-x86-ops.h
> > +++ b/arch/x86/include/asm/kvm-x86-ops.h
> > @@ -119,7 +119,8 @@ KVM_X86_OP(setup_mce)
> >  #ifdef CONFIG_KVM_SMM
> >  KVM_X86_OP(smi_allowed)
> >  KVM_X86_OP()                 // <- This shouldn't be there I guess ?
> > -KVM_X86_OP(leave_smm)
> > +KVM_X86_OP(leave_smm_prepare)
> > +KVM_X86_OP(leave_smm_commit)
> >  KVM_X86_OP(enable_smi_window)
> >  #endif
> >  KVM_X86_OP_OPTIONAL(dev_get_attr)
> >
> > Anyway I was a bit averse to this approach as I noticed in the git log
> > that callbacks like e.g post_leave_smm() used to exist, but they were later
> > removed, so I though the maintainers don't like introducing extra
> > callbacks.
> 
> If they are needed, it's fine. In my opinion a new callback is easier
> to handle and understand than new state.
Yeah, we ripped out post_leave_smm() because its sole usage at the time was buggy,
and having a callback without a purpose would just be dead code.
> > > 2) otherwise, if the problem is that we have not gone through the
> > > vmenter yet, then KVM needs to do that and _then_ inject the triple
> > > fault. The fix is to merge the .triple_fault and .check_nested_events
> > > callbacks, with something like the second attached patch - which
> > > probably has so many problems that I haven't even tried to compile it.
> >
> > Well, in this case if we know that RSM will fail it doesn't seem to me
> > like it make sense to run vmenter just do kill the VM anyway, this would
> > be more confusing.
> 
> Note that the triple fault must not kill the VM, it's just causing a
> nested vmexit from L2 to L1. KVM's algorithm to inject a
> vmexit-causing event is always to first ensure that the VMCS02 (VMCB02
> for AMD) is consistent, and only then trigger the vmexit. So if patch
> 2 or something like it works, that would be even better.
> 
> > I've made the fix this way based on our discussion with Sean in v1, and
> > tried to mark the RSM instruction with a flag, as a one that needs
> > actual HW VMenter to complete succesfully, and based on that information
> > manipulate nested_run_pending.
Heh, you misunderstood my suggestion.
 : But due to nested_run_pending being (unnecessarily) buried in vendor structs, it
 : might actually be easier to do a cleaner fix.  E.g. add yet another flag to track
 : that a hardware VM-Enter needs to be completed in order to complete instruction
 : emulation.
I didn't mean add a flag to the emulator to muck with nested_run_pending, I meant
add a flag to kvm_vcpu_arch to be a superset of nested_run_pending.  E.g. as a
first step, something like the below.  And then as follow up, see if it's doable
to propagate nested_run_pending => insn_emulation_needs_vmenter so that the
nested_run_pending checks in {svm,vmx}_{interrupt,nmi,smi}_allowed() can be
dropped.
---
 arch/x86/include/asm/kvm_host.h |  8 ++++++
 arch/x86/kvm/smm.c              | 10 ++++++--
 arch/x86/kvm/x86.c              | 44 +++++++++++++++++++++++++--------
 3 files changed, 50 insertions(+), 12 deletions(-)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index d271ba20a0b2..bb4250551619 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -769,6 +769,14 @@ struct kvm_vcpu_arch {
 	u64 ia32_misc_enable_msr;
 	u64 smbase;
 	u64 smi_count;
+
+	/*
+	 * Tracks if a successful VM-Enter is needed to complete emulation of
+	 * an instruction, e.g. to ensure emulation of RSM or nested VM-Enter,
+	 * which can directly inject events, completes before KVM attempts to
+	 * inject new events.
+	 */
+	bool insn_emulation_needs_vmenter;
 	bool at_instruction_boundary;
 	bool tpr_access_reporting;
 	bool xfd_no_write_intercept;
diff --git a/arch/x86/kvm/smm.c b/arch/x86/kvm/smm.c
index dc3d95fdca7d..c6e597b8c794 100644
--- a/arch/x86/kvm/smm.c
+++ b/arch/x86/kvm/smm.c
@@ -640,8 +640,14 @@ int emulator_leave_smm(struct x86_emulate_ctxt *ctxt)
 
 #ifdef CONFIG_X86_64
 	if (guest_cpuid_has(vcpu, X86_FEATURE_LM))
-		return rsm_load_state_64(ctxt, &smram.smram64);
+		ret = rsm_load_state_64(ctxt, &smram.smram64);
 	else
 #endif
-		return rsm_load_state_32(ctxt, &smram.smram32);
+		ret = rsm_load_state_32(ctxt, &smram.smram32);
+
+	if (ret != X86EMUL_CONTINUE)
+		return ret;
+
+	vcpu->arch.insn_emulation_needs_vmenter = true;
+	return X86EMUL_CONTINUE;
 }
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index bf10a9073a09..21a7183bbf69 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -10195,6 +10195,30 @@ int kvm_check_nested_events(struct kvm_vcpu *vcpu)
 	return kvm_x86_ops.nested_ops->check_events(vcpu);
 }
 
+static int kvm_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection)
+{
+	if (vcpu->arch.insn_emulation_needs_vmenter)
+		return -EBUSY;
+
+	return static_call(kvm_x86_interrupt_allowed)(vcpu, for_injection);
+}
+
+static int kvm_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
+{
+	if (vcpu->arch.insn_emulation_needs_vmenter)
+		return -EBUSY;
+
+	return static_call(kvm_x86_smi_allowed)(vcpu, for_injection)
+}
+
+static int kvm_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
+{
+	if (vcpu->arch.insn_emulation_needs_vmenter)
+		return -EBUSY;
+
+	return x86_nmi_static_call(kvm_x86_nmi_allowed)(vcpu, for_injection);
+}
+
 static void kvm_inject_exception(struct kvm_vcpu *vcpu)
 {
 	/*
@@ -10384,7 +10408,7 @@ static int kvm_check_and_inject_events(struct kvm_vcpu *vcpu,
 	 */
 #ifdef CONFIG_KVM_SMM
 	if (vcpu->arch.smi_pending) {
-		r = can_inject ? static_call(kvm_x86_smi_allowed)(vcpu, true) : -EBUSY;
+		r = can_inject ? kvm_smi_allowed(vcpu, true) : -EBUSY;
 		if (r < 0)
 			goto out;
 		if (r) {
@@ -10398,7 +10422,7 @@ static int kvm_check_and_inject_events(struct kvm_vcpu *vcpu,
 #endif
 
 	if (vcpu->arch.nmi_pending) {
-		r = can_inject ? static_call(kvm_x86_nmi_allowed)(vcpu, true) : -EBUSY;
+		r = can_inject ? kvm_nmi_allowed(vcpu, true) : -EBUSY;
 		if (r < 0)
 			goto out;
 		if (r) {
@@ -10406,14 +10430,14 @@ static int kvm_check_and_inject_events(struct kvm_vcpu *vcpu,
 			vcpu->arch.nmi_injected = true;
 			static_call(kvm_x86_inject_nmi)(vcpu);
 			can_inject = false;
-			WARN_ON(static_call(kvm_x86_nmi_allowed)(vcpu, true) < 0);
+			WARN_ON_ONCE(kvm_nmi_allowed() < 0);
 		}
 		if (vcpu->arch.nmi_pending)
 			static_call(kvm_x86_enable_nmi_window)(vcpu);
 	}
 
 	if (kvm_cpu_has_injectable_intr(vcpu)) {
-		r = can_inject ? static_call(kvm_x86_interrupt_allowed)(vcpu, true) : -EBUSY;
+		r = can_inject ? kvm_interrupt_allowed(vcpu, true) : -EBUSY;
 		if (r < 0)
 			goto out;
 		if (r) {
@@ -10422,7 +10446,7 @@ static int kvm_check_and_inject_events(struct kvm_vcpu *vcpu,
 			if (!WARN_ON_ONCE(irq == -1)) {
 				kvm_queue_interrupt(vcpu, irq, false);
 				static_call(kvm_x86_inject_irq)(vcpu, false);
-				WARN_ON(static_call(kvm_x86_interrupt_allowed)(vcpu, true) < 0);
+				WARN_ON(kvm_interrupt_allowed(vcpu, true) < 0);
 			}
 		}
 		if (kvm_cpu_has_injectable_intr(vcpu))
@@ -10969,6 +10993,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 		WARN_ON_ONCE((kvm_vcpu_apicv_activated(vcpu) != kvm_vcpu_apicv_active(vcpu)) &&
 			     (kvm_get_apic_mode(vcpu) != LAPIC_MODE_DISABLED));
 
+		vcpu->arch.insn_emulation_needs_vmenter = false;
+
 		exit_fastpath = static_call(kvm_x86_vcpu_run)(vcpu);
 		if (likely(exit_fastpath != EXIT_FASTPATH_REENTER_GUEST))
 			break;
@@ -13051,14 +13077,12 @@ static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu)
 		return true;
 
 	if (kvm_test_request(KVM_REQ_NMI, vcpu) ||
-	    (vcpu->arch.nmi_pending &&
-	     static_call(kvm_x86_nmi_allowed)(vcpu, false)))
+	    (vcpu->arch.nmi_pending && kvm_nmi_allowed(vcpu, false)))
 		return true;
 
 #ifdef CONFIG_KVM_SMM
 	if (kvm_test_request(KVM_REQ_SMI, vcpu) ||
-	    (vcpu->arch.smi_pending &&
-	     static_call(kvm_x86_smi_allowed)(vcpu, false)))
+	    (vcpu->arch.smi_pending && kvm_smi_allowed(vcpu, false)))
 		return true;
 #endif
 
@@ -13136,7 +13160,7 @@ int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu)
 
 int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu)
 {
-	return static_call(kvm_x86_interrupt_allowed)(vcpu, false);
+	return kvm_interrupt_allowed(vcpu, false);
 }
 
 unsigned long kvm_get_linear_rip(struct kvm_vcpu *vcpu)
base-commit: f8fe663bc413d2a14ab9a452638a99b975011a9d
--