[PATCH v12 018/106] KVM: TDX: allocate/free TDX vcpu structure

From: isaku . yamahata
Date: Mon Feb 27 2023 - 03:25:17 EST


From: Isaku Yamahata <isaku.yamahata@xxxxxxxxx>

The next step of TDX guest creation is to create vcpu. Allocate TDX vcpu
structures, initialize it that doesn't require TDX SEAMCALL. Allocate
pages of TDX vcpu for the TDX module. Actual donation TDX vcpu pages to
the TDX module is not done yet.

In the case of the conventional case, cpuid is empty at the initialization.
and cpuid is configured after the vcpu initialization. Because TDX
supports only X2APIC mode, cpuid is forcibly initialized to support X2APIC
on the vcpu initialization in vcpu_reset method. because
kvm_arch_vcpu_create() also initializes kvm MMU that depends on local apic
settings. So x2apic needs to be initialized to X2APIC mode by vcpu_reset
method.

Signed-off-by: Isaku Yamahata <isaku.yamahata@xxxxxxxxx>
---
Changes v11 -> v12:
- add more comments in tdx_vcpu_reset().
- use KVM_BUG_ON()

Changes v10 -> v11:
- NULL check of kvmalloc_array() in tdx_vcpu_reset. Move it to
tdx_vcpu_create()
---
arch/x86/kvm/vmx/main.c | 44 ++++++++++++++++++--
arch/x86/kvm/vmx/tdx.c | 82 ++++++++++++++++++++++++++++++++++++++
arch/x86/kvm/vmx/x86_ops.h | 10 +++++
arch/x86/kvm/x86.c | 2 +
4 files changed, 134 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kvm/vmx/main.c b/arch/x86/kvm/vmx/main.c
index a15ee25d47e0..904b98a9a7ed 100644
--- a/arch/x86/kvm/vmx/main.c
+++ b/arch/x86/kvm/vmx/main.c
@@ -83,6 +83,42 @@ static void vt_vm_free(struct kvm *kvm)
tdx_vm_free(kvm);
}

+static int vt_vcpu_precreate(struct kvm *kvm)
+{
+ if (is_td(kvm))
+ return 0;
+
+ return vmx_vcpu_precreate(kvm);
+}
+
+static int vt_vcpu_create(struct kvm_vcpu *vcpu)
+{
+ if (is_td_vcpu(vcpu))
+ return tdx_vcpu_create(vcpu);
+
+ return vmx_vcpu_create(vcpu);
+}
+
+static void vt_vcpu_free(struct kvm_vcpu *vcpu)
+{
+ if (is_td_vcpu(vcpu)) {
+ tdx_vcpu_free(vcpu);
+ return;
+ }
+
+ vmx_vcpu_free(vcpu);
+}
+
+static void vt_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
+{
+ if (is_td_vcpu(vcpu)) {
+ tdx_vcpu_reset(vcpu, init_event);
+ return;
+ }
+
+ vmx_vcpu_reset(vcpu, init_event);
+}
+
static int vt_mem_enc_ioctl(struct kvm *kvm, void __user *argp)
{
if (!is_td(kvm))
@@ -123,10 +159,10 @@ struct kvm_x86_ops vt_x86_ops __initdata = {
.vm_destroy = vt_vm_destroy,
.vm_free = vt_vm_free,

- .vcpu_precreate = vmx_vcpu_precreate,
- .vcpu_create = vmx_vcpu_create,
- .vcpu_free = vmx_vcpu_free,
- .vcpu_reset = vmx_vcpu_reset,
+ .vcpu_precreate = vt_vcpu_precreate,
+ .vcpu_create = vt_vcpu_create,
+ .vcpu_free = vt_vcpu_free,
+ .vcpu_reset = vt_vcpu_reset,

.prepare_switch_to_guest = vmx_prepare_switch_to_guest,
.vcpu_load = vmx_vcpu_load,
diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
index 8d657bacc050..e6c83634582e 100644
--- a/arch/x86/kvm/vmx/tdx.c
+++ b/arch/x86/kvm/vmx/tdx.c
@@ -316,6 +316,88 @@ int tdx_vm_init(struct kvm *kvm)
return 0;
}

+int tdx_vcpu_create(struct kvm_vcpu *vcpu)
+{
+ struct kvm_cpuid_entry2 *e;
+
+ /*
+ * On cpu creation, cpuid entry is blank. Forcibly enable
+ * X2APIC feature to allow X2APIC.
+ * Because vcpu_reset() can't return error, allocation is done here.
+ */
+ WARN_ON_ONCE(vcpu->arch.cpuid_entries);
+ WARN_ON_ONCE(vcpu->arch.cpuid_nent);
+ /*
+ * Because vcpu->arch.cpuid_entries is freed by kvfree(), use kvmalloc
+ * same to kvm_vcpu_ioctl_set_cpuid().
+ * In error case, the memory freeing is done by kvm_arch_destroy_vm()
+ * => kvm_destroy_vpcus() => kvm_vcpu_destroy() =>
+ * kvm_arch_vcpu_destroy().
+ */
+ e = kvmalloc_array(1, sizeof(*e), GFP_KERNEL_ACCOUNT);
+ if (!e)
+ return -ENOMEM;
+ *e = (struct kvm_cpuid_entry2) {
+ .function = 1, /* Features for X2APIC */
+ .index = 0,
+ .eax = 0,
+ .ebx = 0,
+ .ecx = 1ULL << 21, /* X2APIC */
+ .edx = 0,
+ };
+ vcpu->arch.cpuid_entries = e;
+ vcpu->arch.cpuid_nent = 1;
+
+ /* TDX only supports x2APIC, which requires an in-kernel local APIC. */
+ if (!vcpu->arch.apic)
+ return -EINVAL;
+
+ fpstate_set_confidential(&vcpu->arch.guest_fpu);
+
+ vcpu->arch.efer = EFER_SCE | EFER_LME | EFER_LMA | EFER_NX;
+
+ vcpu->arch.cr0_guest_owned_bits = -1ul;
+ vcpu->arch.cr4_guest_owned_bits = -1ul;
+
+ vcpu->arch.tsc_offset = to_kvm_tdx(vcpu->kvm)->tsc_offset;
+ vcpu->arch.l1_tsc_offset = vcpu->arch.tsc_offset;
+ vcpu->arch.guest_state_protected =
+ !(to_kvm_tdx(vcpu->kvm)->attributes & TDX_TD_ATTRIBUTE_DEBUG);
+
+ return 0;
+}
+
+void tdx_vcpu_free(struct kvm_vcpu *vcpu)
+{
+ /* This is stub for now. More logic will come. */
+}
+
+void tdx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
+{
+ struct msr_data apic_base_msr;
+
+ /* Ignore INIT silently because TDX doesn't support INIT event. */
+ if (init_event)
+ return;
+
+ /*
+ * TDX requires X2APIC. kvm_arch_vcpu_reset() initialize KVM mmu that
+ * depends on local apic setting. Set local apic mode before it.
+ */
+ apic_base_msr.data = APIC_DEFAULT_PHYS_BASE | LAPIC_MODE_X2APIC;
+ if (kvm_vcpu_is_reset_bsp(vcpu))
+ apic_base_msr.data |= MSR_IA32_APICBASE_BSP;
+ apic_base_msr.host_initiated = true;
+ if (KVM_BUG_ON(kvm_set_apic_base(vcpu, &apic_base_msr), vcpu->kvm))
+ return;
+
+ /*
+ * Don't update mp_state to runnable because more initialization
+ * is needed by TDX_VCPU_INIT.
+ */
+ return;
+}
+
int tdx_dev_ioctl(void __user *argp)
{
struct kvm_tdx_capabilities __user *user_caps;
diff --git a/arch/x86/kvm/vmx/x86_ops.h b/arch/x86/kvm/vmx/x86_ops.h
index 4960e7d58add..b7708e725e93 100644
--- a/arch/x86/kvm/vmx/x86_ops.h
+++ b/arch/x86/kvm/vmx/x86_ops.h
@@ -148,7 +148,12 @@ int tdx_vm_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap);
int tdx_vm_init(struct kvm *kvm);
void tdx_mmu_release_hkid(struct kvm *kvm);
void tdx_vm_free(struct kvm *kvm);
+
int tdx_vm_ioctl(struct kvm *kvm, void __user *argp);
+
+int tdx_vcpu_create(struct kvm_vcpu *vcpu);
+void tdx_vcpu_free(struct kvm_vcpu *vcpu);
+void tdx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event);
#else
static inline int tdx_hardware_setup(struct kvm_x86_ops *x86_ops) { return -ENOSYS; }
static inline void tdx_hardware_unsetup(void) {}
@@ -161,7 +166,12 @@ static inline int tdx_vm_init(struct kvm *kvm) { return -EOPNOTSUPP; }
static inline void tdx_mmu_release_hkid(struct kvm *kvm) {}
static inline void tdx_flush_shadow_all_private(struct kvm *kvm) {}
static inline void tdx_vm_free(struct kvm *kvm) {}
+
static inline int tdx_vm_ioctl(struct kvm *kvm, void __user *argp) { return -EOPNOTSUPP; }
+
+static inline int tdx_vcpu_create(struct kvm_vcpu *vcpu) { return -EOPNOTSUPP; }
+static inline void tdx_vcpu_free(struct kvm_vcpu *vcpu) {}
+static inline void tdx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) {}
#endif

#endif /* __KVM_X86_VMX_X86_OPS_H */
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index a2a2d62d490a..275bdbcb3043 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -492,6 +492,7 @@ int kvm_set_apic_base(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
kvm_recalculate_apic_map(vcpu->kvm);
return 0;
}
+EXPORT_SYMBOL_GPL(kvm_set_apic_base);

/*
* Handle a fault on a hardware virtualization (VMX or SVM) instruction.
@@ -12128,6 +12129,7 @@ bool kvm_vcpu_is_reset_bsp(struct kvm_vcpu *vcpu)
{
return vcpu->kvm->arch.bsp_vcpu_id == vcpu->vcpu_id;
}
+EXPORT_SYMBOL_GPL(kvm_vcpu_is_reset_bsp);

bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu)
{
--
2.25.1