[PATCH v3 1/2] KVM: VMX: FIXED+PHYSICAL mode single target IPI fastpath

From: Wanpeng Li
Date: Tue Nov 19 2019 - 22:42:30 EST


From: Wanpeng Li <wanpengli@xxxxxxxxxxx>

In our product observation, ICR and TSCDEADLINE MSR writes cause the bulk
of MSR-write vmexits, and multicast IPIs are not as common as unicast
IPIs like RESCHEDULE_VECTOR and CALL_FUNCTION_SINGLE_VECTOR etc.

This patch tries to optimize the x2APIC physical destination mode, fixed
delivery mode, single target IPI case. The fast path is invoked at
->handle_exit_irqoff() to emulate only the effect of the ICR write
itself, i.e. the sending of the IPI. Sending the IPI early in the VM-Exit
flow reduces the latency of virtual IPIs by avoiding the expensive bits
of transitioning from guest to host, e.g. reacquiring KVM's SRCU lock.
This helps especially when the guest runs with the KVM_CAP_X86_DISABLE_EXITS
capability enabled, or when the target vCPU is otherwise kept running, as
the IPI can then be injected into the target vCPU by posted-interrupt
immediately.
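
For context, here is a minimal sketch of the guest-side operation this
fastpath accelerates; it mirrors what Linux's native_x2apic_icr_write()
does, and the function name, vector and target values are illustrative
only:

#include <asm/apicdef.h>
#include <asm/msr.h>

/* Send a fixed-mode, physical-destination IPI via a single WRMSR to the
 * x2APIC ICR; this one instruction is what the fastpath emulates early. */
static void send_single_ipi(u32 target_apic_id, u8 vector)
{
	/* Bits 63:32 carry the destination APIC ID in x2APIC mode; the
	 * low dword selects the vector and fixed delivery mode. */
	u64 icr = ((u64)target_apic_id << 32) | APIC_DM_FIXED | vector;

	wrmsrl(APIC_BASE_MSR + (APIC_ICR >> 4), icr);
}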

Testing on Xeon Skylake server:

The virtual IPI latency, from the sender issuing the IPI to the receiver
receiving it, is reduced by more than 200 CPU cycles.
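
The benchmark itself is not included in this post; a hypothetical
guest-side measurement along these lines (all names illustrative, reusing
the includes from the sketch above plus <asm/processor.h> for cpu_relax(),
and assuming an invariant TSC that is synchronized across vCPUs) could
look like:

static volatile u64 t_recv;

/* Interrupt handler on the receiving vCPU: timestamp the delivery. */
static void ipi_benchmark_handler(void)
{
	t_recv = rdtsc();
}

/* Return the sender-to-receiver latency of one IPI, in TSC cycles. */
static u64 measure_ipi_latency(u32 target_apic_id, u8 vector)
{
	u64 t_send;

	t_recv = 0;
	t_send = rdtsc();
	wrmsrl(APIC_BASE_MSR + (APIC_ICR >> 4),
	       ((u64)target_apic_id << 32) | APIC_DM_FIXED | vector);
	while (!t_recv)		/* spin until the handler has run */
		cpu_relax();
	return t_recv - t_send;
}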

Cc: Paolo Bonzini <pbonzini@xxxxxxxxxx>
Cc: Radim Krčmář <rkrcmar@xxxxxxxxxx>
Cc: Sean Christopherson <sean.j.christopherson@xxxxxxxxx>
Cc: Vitaly Kuznetsov <vkuznets@xxxxxxxxxx>
Cc: Liran Alon <liran.alon@xxxxxxxxxx>
Signed-off-by: Wanpeng Li <wanpengli@xxxxxxxxxxx>
---
v2 -> v3:
* for both VMX and SVM
* vmx_handle_exit() gets the second parameter by value, not by pointer
* rename parameter to 'accel_exit_completion'
* preserve tracepoint ordering
* rename the handler to handle_accel_set_msr_irqoff and make it more generic
* add comments above handle_accel_set_msr_irqoff
* msr index APIC_BASE_MSR + (APIC_ICR >> 4) (see the note after the changelog)
v1 -> v2:
* add tracepoint
* Instead of a separate vcpu->fast_vmexit, set vmx->exit_reason
to -1 if the fast path succeeds.
* move the "kvm_skip_emulated_instruction(vcpu)" to vmx_handle_exit
* move the handling into vmx_handle_exit_irqoff()
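
Note on the MSR index above: in x2APIC mode an APIC MMIO offset maps to
MSR 0x800 + (offset >> 4), so the expression resolves to the
architectural x2APIC ICR MSR:

/* APIC_BASE_MSR + (APIC_ICR >> 4)
 *   == 0x800 + (0x300 >> 4)
 *   == 0x830, the IA32_X2APIC_ICR MSR
 */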

arch/x86/include/asm/kvm_host.h | 11 ++++++++--
arch/x86/kvm/svm.c | 14 ++++++++----
arch/x86/kvm/vmx/vmx.c | 13 ++++++++---
arch/x86/kvm/x86.c | 48 +++++++++++++++++++++++++++++++++++++++--
arch/x86/kvm/x86.h | 1 +
5 files changed, 76 insertions(+), 11 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 898ab9e..67c7889 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -175,6 +175,11 @@ enum {
VCPU_SREG_LDTR,
};

+enum accel_exit_completion {
+ ACCEL_EXIT_NONE,
+ ACCEL_EXIT_SKIP_EMUL_INS,
+};
+
#include <asm/kvm_emulate.h>

#define KVM_NR_MEM_OBJS 40
@@ -1084,7 +1089,8 @@ struct kvm_x86_ops {
void (*tlb_flush_gva)(struct kvm_vcpu *vcpu, gva_t addr);

void (*run)(struct kvm_vcpu *vcpu);
- int (*handle_exit)(struct kvm_vcpu *vcpu);
+ int (*handle_exit)(struct kvm_vcpu *vcpu,
+ enum accel_exit_completion accel_exit);
int (*skip_emulated_instruction)(struct kvm_vcpu *vcpu);
void (*set_interrupt_shadow)(struct kvm_vcpu *vcpu, int mask);
u32 (*get_interrupt_shadow)(struct kvm_vcpu *vcpu);
@@ -1134,7 +1140,8 @@ struct kvm_x86_ops {
int (*check_intercept)(struct kvm_vcpu *vcpu,
struct x86_instruction_info *info,
enum x86_intercept_stage stage);
- void (*handle_exit_irqoff)(struct kvm_vcpu *vcpu);
+ void (*handle_exit_irqoff)(struct kvm_vcpu *vcpu,
+ enum accel_exit_completion *accel_exit);
bool (*mpx_supported)(void);
bool (*xsaves_supported)(void);
bool (*umip_emulated)(void);
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index d02a73a..060b11d 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -4929,7 +4929,8 @@ static void svm_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2)
*info2 = control->exit_info_2;
}

-static int handle_exit(struct kvm_vcpu *vcpu)
+static int handle_exit(struct kvm_vcpu *vcpu,
+ enum accel_exit_completion accel_exit)
{
struct vcpu_svm *svm = to_svm(vcpu);
struct kvm_run *kvm_run = vcpu->run;
@@ -4987,7 +4988,10 @@ static int handle_exit(struct kvm_vcpu *vcpu)
__func__, svm->vmcb->control.exit_int_info,
exit_code);

- if (exit_code >= ARRAY_SIZE(svm_exit_handlers)
+ if (accel_exit == ACCEL_EXIT_SKIP_EMUL_INS) {
+ kvm_skip_emulated_instruction(vcpu);
+ return 1;
+ } else if (exit_code >= ARRAY_SIZE(svm_exit_handlers)
|| !svm_exit_handlers[exit_code]) {
vcpu_unimpl(vcpu, "svm: unexpected exit reason 0x%x\n", exit_code);
dump_vmcb(vcpu);
@@ -6187,9 +6191,12 @@ static int svm_check_intercept(struct kvm_vcpu *vcpu,
return ret;
}

-static void svm_handle_exit_irqoff(struct kvm_vcpu *vcpu)
+static void svm_handle_exit_irqoff(struct kvm_vcpu *vcpu,
+ enum accel_exit_completion *accel_exit)
{
-
+ if (to_svm(vcpu)->vmcb->control.exit_code == SVM_EXIT_MSR &&
+ to_svm(vcpu)->vmcb->control.exit_info_1)
+ *accel_exit = handle_accel_set_msr_irqoff(vcpu);
}

static void svm_sched_in(struct kvm_vcpu *vcpu, int cpu)
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 621142e5..86c0a23 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -5792,7 +5792,8 @@ void dump_vmcs(void)
* The guest has exited. See if we can fix it or if we need userspace
* assistance.
*/
-static int vmx_handle_exit(struct kvm_vcpu *vcpu)
+static int vmx_handle_exit(struct kvm_vcpu *vcpu,
+ enum accel_exit_completion accel_exit)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
u32 exit_reason = vmx->exit_reason;
@@ -5878,7 +5879,10 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
}
}

- if (exit_reason < kvm_vmx_max_exit_handlers
+ if (accel_exit == ACCEL_EXIT_SKIP_EMUL_INS) {
+ kvm_skip_emulated_instruction(vcpu);
+ return 1;
+ } else if (exit_reason < kvm_vmx_max_exit_handlers
&& kvm_vmx_exit_handlers[exit_reason]) {
#ifdef CONFIG_RETPOLINE
if (exit_reason == EXIT_REASON_MSR_WRITE)
@@ -6223,7 +6227,8 @@ static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu)
}
STACK_FRAME_NON_STANDARD(handle_external_interrupt_irqoff);

-static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu)
+static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu,
+ enum accel_exit_completion *accel_exit)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);

@@ -6231,6 +6236,8 @@ static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu)
handle_external_interrupt_irqoff(vcpu);
else if (vmx->exit_reason == EXIT_REASON_EXCEPTION_NMI)
handle_exception_nmi_irqoff(vmx);
+ else if (vmx->exit_reason == EXIT_REASON_MSR_WRITE)
+ *accel_exit = handle_accel_set_msr_irqoff(vcpu);
}

static bool vmx_has_emulated_msr(int index)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 991dd01..966c659 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1509,6 +1509,49 @@ int kvm_emulate_wrmsr(struct kvm_vcpu *vcpu)
}
EXPORT_SYMBOL_GPL(kvm_emulate_wrmsr);

+static int handle_accel_set_x2apic_icr_irqoff(struct kvm_vcpu *vcpu, u64 data)
+{
+ if (lapic_in_kernel(vcpu) && apic_x2apic_mode(vcpu->arch.apic) &&
+ ((data & KVM_APIC_DEST_MASK) == APIC_DEST_PHYSICAL) &&
+ ((data & APIC_MODE_MASK) == APIC_DM_FIXED)) {
+
+ kvm_lapic_set_reg(vcpu->arch.apic, APIC_ICR2, (u32)(data >> 32));
+ return kvm_lapic_reg_write(vcpu->arch.apic, APIC_ICR, (u32)data);
+ }
+
+ return 1;
+}
+
+/*
+ * The fast path for frequent and performance sensitive wrmsr emulation,
+ * i.e. the sending of IPI. Sending the IPI early in the VM-Exit flow
+ * reduces the latency of virtual IPIs by avoiding the expensive bits of
+ * transitioning from guest to host, e.g. reacquiring KVM's SRCU lock,
+ * in contrast to the other cases, which must run with host IRQs enabled.
+ */
+enum accel_exit_completion handle_accel_set_msr_irqoff(struct kvm_vcpu *vcpu)
+{
+ u32 msr = kvm_rcx_read(vcpu);
+ u64 data = kvm_read_edx_eax(vcpu);
+ int ret = 0;
+
+ switch (msr) {
+ case APIC_BASE_MSR + (APIC_ICR >> 4):
+ ret = handle_accel_set_x2apic_icr_irqoff(vcpu, data);
+ break;
+ default:
+ return ACCEL_EXIT_NONE;
+ }
+
+ if (!ret) {
+ trace_kvm_msr_write(msr, data);
+ return ACCEL_EXIT_SKIP_EMUL_INS;
+ }
+
+ return ACCEL_EXIT_NONE;
+}
+EXPORT_SYMBOL_GPL(handle_accel_set_msr_irqoff);
+
/*
* Adapt set_msr() to msr_io()'s calling convention
*/
@@ -7984,6 +8027,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
bool req_int_win =
dm_request_for_irq_injection(vcpu) &&
kvm_cpu_accept_dm_intr(vcpu);
+ enum accel_exit_completion accel_exit = ACCEL_EXIT_NONE;

bool req_immediate_exit = false;

@@ -8226,7 +8270,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
vcpu->mode = OUTSIDE_GUEST_MODE;
smp_wmb();

- kvm_x86_ops->handle_exit_irqoff(vcpu);
+ kvm_x86_ops->handle_exit_irqoff(vcpu, &accel_exit);

/*
* Consume any pending interrupts, including the possible source of
@@ -8270,7 +8314,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
kvm_lapic_sync_from_vapic(vcpu);

vcpu->arch.gpa_available = false;
- r = kvm_x86_ops->handle_exit(vcpu);
+ r = kvm_x86_ops->handle_exit(vcpu, accel_exit);
return r;

cancel_injection:
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 29391af..f14ec14 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -291,6 +291,7 @@ bool kvm_mtrr_check_gfn_range_consistency(struct kvm_vcpu *vcpu, gfn_t gfn,
bool kvm_vector_hashing_enabled(void);
int x86_emulate_instruction(struct kvm_vcpu *vcpu, unsigned long cr2,
int emulation_type, void *insn, int insn_len);
+enum accel_exit_completion handle_accel_set_msr_irqoff(struct kvm_vcpu *vcpu);

#define KVM_SUPPORTED_XCR0 (XFEATURE_MASK_FP | XFEATURE_MASK_SSE \
| XFEATURE_MASK_YMM | XFEATURE_MASK_BNDREGS \
--
2.7.4