[PATCH v3 11/11] KVM: SVM: allow to avoid not needed updates to is_running

From: Maxim Levitsky
Date: Tue Mar 01 2022 - 13:29:24 EST


Allow optionally to make KVM not update is_running unless it is
functionally needed which is only when a vCPU halts,
or is in the guest mode.

This means security wise that if a vCPU is scheduled out,
other vCPUs could still send doorbell messages to the
last physical CPU where this vCPU was last running.

This in theory can be considered less secure, thus
this option is not enabled by default.

The option is avic_doorbell_strict and is true by
default, setting it to false allows this relaxed
non strict mode.

Signed-off-by: Maxim Levitsky <mlevitsk@xxxxxxxxxx>
---
arch/x86/kvm/svm/avic.c | 39 +++++++++++++++++++++++++++------------
arch/x86/kvm/svm/svm.c | 7 +++++--
arch/x86/kvm/svm/svm.h | 1 +
virt/kvm/kvm_main.c | 3 ++-
4 files changed, 35 insertions(+), 15 deletions(-)

diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c
index dd13fd3588e2b..1d690a9d3952e 100644
--- a/arch/x86/kvm/svm/avic.c
+++ b/arch/x86/kvm/svm/avic.c
@@ -166,10 +166,13 @@ void avic_physid_shadow_table_update_vcpu_location(struct kvm_vcpu *vcpu, int cp
raw_spin_lock_irqsave(&kvm_svm->avic.table_entries_lock, flags);

list_for_each_entry(e, &vcpu_svm->nested.physid_ref_entries, link) {
- u64 sentry = READ_ONCE(*e->sentry);
+ u64 old_sentry = READ_ONCE(*e->sentry);
+ u64 new_sentry = old_sentry;

- physid_entry_set_apicid(&sentry, cpu);
- WRITE_ONCE(*e->sentry, sentry);
+ physid_entry_set_apicid(&new_sentry, cpu);
+
+ if (new_sentry != old_sentry)
+ WRITE_ONCE(*e->sentry, new_sentry);
nentries++;
}

@@ -1507,7 +1510,7 @@ avic_update_iommu_vcpu_affinity(struct kvm_vcpu *vcpu, int cpu, bool r)

void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{
- u64 entry;
+ u64 old_entry, new_entry;
/* ID = 0xff (broadcast), ID > 0xff (reserved) */
int h_physical_id = kvm_cpu_get_apicid(cpu);
struct vcpu_svm *svm = to_svm(vcpu);
@@ -1531,14 +1534,16 @@ void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
if (kvm_vcpu_is_blocking(vcpu))
return;

- entry = READ_ONCE(*(svm->avic_physical_id_cache));
- WARN_ON(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK);
+ old_entry = READ_ONCE(*(svm->avic_physical_id_cache));
+ new_entry = old_entry;

- entry &= ~AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK;
- entry |= (h_physical_id & AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK);
- entry |= AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
+ new_entry &= ~AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK;
+ new_entry |= (h_physical_id & AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK);
+ new_entry |= AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
+
+ if (old_entry != new_entry)
+ WRITE_ONCE(*(svm->avic_physical_id_cache), new_entry);

- WRITE_ONCE(*(svm->avic_physical_id_cache), entry);
avic_update_iommu_vcpu_affinity(vcpu, h_physical_id, true);
}

@@ -1549,14 +1554,24 @@ void avic_vcpu_put(struct kvm_vcpu *vcpu)

lockdep_assert_preemption_disabled();

+ avic_update_iommu_vcpu_affinity(vcpu, -1, 0);
+
+ /*
+ * It is only meaningful to intercept IPIs from the guest
+ * when either vCPU is blocked, or in guest mode.
+ * In all other cases (e.g userspace vmexit, or preemption
+ * by other task, the vCPU is guaranteed to return to guest mode
+ * as soon as it can
+ */
+ if (!avic_doorbell_strict && !kvm_vcpu_is_blocking(vcpu) && !is_guest_mode(vcpu))
+ return;
+
entry = READ_ONCE(*(svm->avic_physical_id_cache));

/* Nothing to do if IsRunning == '0' due to vCPU blocking. */
if (!(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK))
return;

- avic_update_iommu_vcpu_affinity(vcpu, -1, 0);
-
entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
WRITE_ONCE(*(svm->avic_physical_id_cache), entry);
}
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 0d6b715375a69..463b756f665ae 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -202,6 +202,9 @@ module_param(tsc_scaling, int, 0444);
static bool avic;
module_param(avic, bool, 0444);

+bool avic_doorbell_strict = true;
+module_param(avic_doorbell_strict, bool, 0444);
+
bool __read_mostly dump_invalid_vmcb;
module_param(dump_invalid_vmcb, bool, 0644);

@@ -1340,7 +1343,8 @@ static void svm_vcpu_put(struct kvm_vcpu *vcpu)
svm->loaded = false;

if (svm->nested.initialized && svm->avic_enabled)
- avic_physid_shadow_table_update_vcpu_location(vcpu, -1);
+ if (!avic_doorbell_strict || kvm_vcpu_is_blocking(vcpu))
+ avic_physid_shadow_table_update_vcpu_location(vcpu, -1);

svm_prepare_host_switch(vcpu);

@@ -4707,7 +4711,6 @@ static __init void svm_set_cpu_caps(void)
/* CPUID 0x80000001 and 0x8000000A (SVM features) */
if (nested) {
kvm_cpu_cap_set(X86_FEATURE_SVM);
- kvm_cpu_cap_set(X86_FEATURE_VMCBCLEAN);

if (nrips)
kvm_cpu_cap_set(X86_FEATURE_NRIPS);
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index 8ebda12995abe..d0108bae2cdac 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -33,6 +33,7 @@
extern u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly;
extern bool npt_enabled;
extern bool intercept_smi;
+extern bool avic_doorbell_strict;

/*
* Clean bits in VMCB.
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index cdf1fa3c60ae2..67a29233e216b 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -3291,9 +3291,10 @@ bool kvm_vcpu_block(struct kvm_vcpu *vcpu)

vcpu->stat.generic.blocking = 1;

+ prepare_to_rcuwait(wait);
+
kvm_arch_vcpu_blocking(vcpu);

- prepare_to_rcuwait(wait);
for (;;) {
set_current_state(TASK_INTERRUPTIBLE);

--
2.26.3