Re: [PATCH v19 023/130] KVM: TDX: Initialize the TDX module when loading the KVM intel kernel module

From: Huang, Kai
Date: Wed Apr 17 2024 - 09:21:24 EST


On Tue, 2024-04-16 at 13:58 -0700, Sean Christopherson wrote:
> On Fri, Apr 12, 2024, Kai Huang wrote:
> > On 12/04/2024 2:03 am, Sean Christopherson wrote:
> > > On Thu, Apr 11, 2024, Kai Huang wrote:
> > > > I can certainly follow up with this and generate a reviewable patchset if I
> > > > can confirm with you that this is what you want?
> > >
> > > Yes, I think it's the right direction. I still have minor concerns about VMX
> > > being enabled while kvm.ko is loaded, which means that VMXON will _always_ be
> > > enabled if KVM is built-in. But after seeing the complexity that is needed to
> > > safely initialize TDX, and after seeing just how much complexity KVM already
> > > has because it enables VMX on-demand (I hadn't actually tried removing that code
> > > before), I think the cost of that complexity far outweighs the risk of "always"
> > > being post-VMXON.
> >
> > Does always leaving VMXON have any actual damage, given we have emergency
> > virtualization shutdown?
>
> Being post-VMXON increases the risk of kexec() into the kdump kernel failing.
> The tradeoffs that we're trying to balance are: is the risk of kexec() failing
> due to the complexity of the emergency VMX code higher than the risk of us breaking
> things in general due to taking on a ton of complexity to juggle VMXON for TDX?
>
> After seeing the latest round of TDX code, my opinion is that being post-VMXON
> is less risky overall, in no small part because we need that to work anyways for
> hosts that are actively running VMs.
>
> >

How about we only keep VMX always on when TDX is enabled?

In short, we can do this way:

- Do VMXON + unconditional tdx_cpu_enable() in vt_hardware_enable()

- And in vt_hardware_setup():

cpus_read_lock();
hardware_enable_all_nolock(); (this doesn't exist yet)
ret = tdx_enable();
if (!ret)
hardware_disable_all_nolock();
cpus_read_unlock();

- And in vt_hardware_unsetup():

if (TDX is enabled) {
cpus_read_lock();
hardware_disable_all_nolock();
cpus_read_unlock();
}

Note to make this work, we also need to move register/unregister
kvm_online_cpu()/kvm_offline_cpu() from kvm_init() to
hardware_enable_all_nolock() and hardware_disable_all_nolock()
respectively to cover any CPU becoming online after tdx_enable() (well,
more precisely, after hardware_enable_all_nolock()).

This is reasonable anyway even w/o TDX, because only _after_ we have
enabled hardware on all online cpus, we need to handle CPU hotplug.

Calling hardware_enable_all_nolock() w/o holding kvm_lock mutex is also
fine because at this stage it's not possible for userspace to create VM
yet.

Btw, kvm_arch_hardware_enable() does things like TSC backworks, uret_msrs,
etc but they are safe to be called during module load/unload AFAICT. We
can put a comment there for reminder if needed.

If I am not missing anything, below diff to kvm.ko shows my idea:

diff --git a/arch/x86/include/asm/kvm_host.h
b/arch/x86/include/asm/kvm_host.h
index 16e07a2eee19..ed8b2f34af01 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -2318,4 +2318,7 @@ int memslot_rmap_alloc(struct kvm_memory_slot *slot,
unsigned long npages);
*/
#define KVM_EXIT_HYPERCALL_MBZ GENMASK_ULL(31, 1)

+int hardware_enable_all_nolock(void);
+void hardware_disable_all_nolock(void);
+
#endif /* _ASM_X86_KVM_HOST_H */
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index fb49c2a60200..3d2ff7dd0150 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -5601,14 +5601,23 @@ static int kvm_offline_cpu(unsigned int cpu)
return 0;
}

-static void hardware_disable_all_nolock(void)
+static void __hardware_disable_all_nolock(void)
+{
+#ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING
+ cpuhp_remove_state_nocalls(CPUHP_AP_KVM_ONLINE);
+#endif
+ on_each_cpu(hardware_disable_nolock, NULL, 1);
+}
+
+void hardware_disable_all_nolock(void)
{
BUG_ON(!kvm_usage_count);

kvm_usage_count--;
if (!kvm_usage_count)
- on_each_cpu(hardware_disable_nolock, NULL, 1);
+ __hardware_disable_all_nolock();
}
+EXPORT_SYMBOL_GPL(hardware_disable_all_nolock);

static void hardware_disable_all(void)
{
@@ -5619,11 +5628,27 @@ static void hardware_disable_all(void)
cpus_read_unlock();
}

-static int hardware_enable_all(void)
+static int __hardware_enable_all_nolock(void)
{
atomic_t failed = ATOMIC_INIT(0);
int r;

+ on_each_cpu(hardware_enable_nolock, &failed, 1);
+
+ r = atomic_read(&failed);
+ if (r)
+ return -EBUSY;
+
+#ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING
+ r = cpuhp_setup_state_nocalls(CPUHP_AP_KVM_ONLINE,
"kvm/cpu:online",
+ kvm_online_cpu, kvm_offline_cpu);
+#endif
+
+ return r;
+}
+
+int hardware_enable_all_nolock(void)
+{
/*
* Do not enable hardware virtualization if the system is going
down.
* If userspace initiated a forced reboot, e.g. reboot -f, then
it's
@@ -5637,6 +5662,24 @@ static int hardware_enable_all(void)
system_state == SYSTEM_RESTART)
return -EBUSY;

+ kvm_usage_count++;
+ if (kvm_usage_count == 1) {
+ int r = __hardware_enable_all_nolock();
+
+ if (r) {
+ hardware_disable_all_nolock();
+ return r;
+ }
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(hardware_enable_all_nolock);
+
+static int hardware_enable_all(void)
+{
+ int r;
+
/*
* When onlining a CPU, cpu_online_mask is set before
kvm_online_cpu()
* is called, and so on_each_cpu() between them includes the CPU
that
@@ -5648,17 +5691,7 @@ static int hardware_enable_all(void)
cpus_read_lock();
mutex_lock(&kvm_lock);

- r = 0;
-
- kvm_usage_count++;
- if (kvm_usage_count == 1) {
- on_each_cpu(hardware_enable_nolock, &failed, 1);
-
- if (atomic_read(&failed)) {
- hardware_disable_all_nolock();
- r = -EBUSY;
- }
- }
+ r = hardware_enable_all_nolock();

mutex_unlock(&kvm_lock);
cpus_read_unlock();
@@ -6422,11 +6455,6 @@ int kvm_init(unsigned vcpu_size, unsigned
vcpu_align, struct module *module)
int cpu;

#ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING
- r = cpuhp_setup_state_nocalls(CPUHP_AP_KVM_ONLINE,
"kvm/cpu:online",
- kvm_online_cpu, kvm_offline_cpu);
- if (r)
- return r;
-
register_syscore_ops(&kvm_syscore_ops);
#endif

@@ -6528,7 +6556,6 @@ void kvm_exit(void)
kvm_async_pf_deinit();
#ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING
unregister_syscore_ops(&kvm_syscore_ops);
- cpuhp_remove_state_nocalls(CPUHP_AP_KVM_ONLINE);
#endif
kvm_irqfd_exit();
}