[tip: x86/tdx] x86/boot: Avoid #VE during boot for TDX platforms

From: tip-bot2 for Sean Christopherson
Date: Fri Apr 08 2022 - 21:28:04 EST


The following commit has been merged into the x86/tdx branch of tip:

Commit-ID: 77a512e35db7609a8c909e2006b2ea82f2b1616f
Gitweb: https://git.kernel.org/tip/77a512e35db7609a8c909e2006b2ea82f2b1616f
Author: Sean Christopherson <seanjc@xxxxxxxxxx>
AuthorDate: Wed, 06 Apr 2022 02:29:32 +03:00
Committer: Dave Hansen <dave.hansen@xxxxxxxxxxxxxxx>
CommitterDate: Thu, 07 Apr 2022 08:27:53 -07:00

x86/boot: Avoid #VE during boot for TDX platforms

There are a few MSRs and control register bits that the kernel
normally needs to modify during boot. But TDX disallows
modification of these registers to help provide consistent security
guarantees. Fortunately, TDX ensures that these are all in the correct
state before the kernel loads, which means the kernel does not need to
modify them.

The conditions to avoid are listed below; a sketch of the resulting
pattern follows the list:

* Any writes to the EFER MSR
* Clearing CR4.MCE
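
For illustration, the pattern both rules force on the boot code looks
roughly like the C below. This is only a sketch: the real changes are in
early assembly, long before any C can run, and rdmsrl()/wrmsrl() and
native_read_cr4()/native_write_cr4() merely stand in for the RDMSR/WRMSR
and MOV-from/to-CR4 instructions used there.

	u64 efer, want;
	unsigned long cr4;

	rdmsrl(MSR_EFER, efer);		/* value already set up before the kernel */
	want = efer | EFER_SCE;		/* bits the kernel would normally set */
	if (want != efer)		/* unchanged => skip WRMSR => no #VE */
		wrmsrl(MSR_EFER, want);

	cr4 = native_read_cr4();
	/* Keep MCE instead of clearing it (mirrors the trampoline, which
	 * rebuilds CR4 from scratch): */
	cr4 = (cr4 & X86_CR4_MCE) | X86_CR4_PAE;
	native_write_cr4(cr4);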

This theoretically makes the guest boot more fragile. If, for instance,
EFER was set up incorrectly and a WRMSR was performed, it would trigger
an early exception panic, or a triple fault if it happened before early
exception handling was set up. However, such a mistake is likely to trip
up the guest BIOS long before control reaches the kernel. In any case,
these kinds of problems are unlikely to occur in production
environments, and developers have good debug tools to fix them quickly.

Change the common boot code to work on TDX and non-TDX systems.
This should have no functional effect on non-TDX systems.

Signed-off-by: Sean Christopherson <seanjc@xxxxxxxxxx>
Signed-off-by: Kuppuswamy Sathyanarayanan <sathyanarayanan.kuppuswamy@xxxxxxxxxxxxxxx>
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@xxxxxxxxxxxxxxx>
Signed-off-by: Dave Hansen <dave.hansen@xxxxxxxxxxxxxxx>
Reviewed-by: Andi Kleen <ak@xxxxxxxxxxxxxxx>
Reviewed-by: Dan Williams <dan.j.williams@xxxxxxxxx>
Reviewed-by: Thomas Gleixner <tglx@xxxxxxxxxxxxx>
Link: https://lkml.kernel.org/r/20220405232939.73860-24-kirill.shutemov@xxxxxxxxxxxxxxx
---
 arch/x86/Kconfig                     |  1 +
 arch/x86/boot/compressed/head_64.S   | 20 +++++++++++++++++--
 arch/x86/boot/compressed/pgtable.h   |  2 +-
 arch/x86/kernel/head_64.S            | 28 +++++++++++++++++++++++++--
 arch/x86/realmode/rm/trampoline_64.S | 13 ++++++++++++-
 5 files changed, 58 insertions(+), 6 deletions(-)

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index aea4cc4..a181855 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -884,6 +884,7 @@ config INTEL_TDX_GUEST
 	depends on X86_X2APIC
 	select ARCH_HAS_CC_PLATFORM
 	select DYNAMIC_PHYSICAL_MASK
+	select X86_MCE
 	help
 	  Support running as a guest under Intel TDX. Without this support,
 	  the guest kernel can not boot or run under TDX.
diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S
index 7b5d362..f63ec25 100644
--- a/arch/x86/boot/compressed/head_64.S
+++ b/arch/x86/boot/compressed/head_64.S
@@ -642,12 +642,28 @@ SYM_CODE_START(trampoline_32bit_src)
 	movl	$MSR_EFER, %ecx
 	rdmsr
 	btsl	$_EFER_LME, %eax
+	/* Avoid writing EFER if no change was made (for TDX guest) */
+	jc	1f
 	wrmsr
-	popl	%edx
+1:	popl	%edx
 	popl	%ecx
 
+#ifdef CONFIG_X86_MCE
+	/*
+	 * Preserve CR4.MCE if the kernel will enable #MC support.
+	 * Clearing MCE may fault in some environments (that also force #MC
+	 * support). Any machine check that occurs before #MC support is fully
+	 * configured will crash the system regardless of the CR4.MCE value set
+	 * here.
+	 */
+	movl	%cr4, %eax
+	andl	$X86_CR4_MCE, %eax
+#else
+	movl	$0, %eax
+#endif
+
 	/* Enable PAE and LA57 (if required) paging modes */
-	movl	$X86_CR4_PAE, %eax
+	orl	$X86_CR4_PAE, %eax
 	testl	%edx, %edx
 	jz	1f
 	orl	$X86_CR4_LA57, %eax
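
Two details in the hunk above are easy to miss. BTSL copies the old
value of the tested bit into the carry flag before setting it, so the
new "jc 1f" skips the WRMSR exactly when EFER.LME was already set,
which is the state TDX guarantees. And CR4 is now built up with ORs on
top of either 0 or the preserved MCE bit, instead of being written as a
constant. In rough C terms (a sketch, not the actual decompressor
code):

	u32 lo, hi;

	rdmsr(MSR_EFER, lo, hi);
	if (!(lo & EFER_LME)) {		/* BTSL put the old bit in CF; JC skips this */
		lo |= EFER_LME;
		wrmsr(MSR_EFER, lo, hi);
	}
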
diff --git a/arch/x86/boot/compressed/pgtable.h b/arch/x86/boot/compressed/pgtable.h
index 6ff7e81..cc9b252 100644
--- a/arch/x86/boot/compressed/pgtable.h
+++ b/arch/x86/boot/compressed/pgtable.h
@@ -6,7 +6,7 @@
#define TRAMPOLINE_32BIT_PGTABLE_OFFSET 0

#define TRAMPOLINE_32BIT_CODE_OFFSET PAGE_SIZE
-#define TRAMPOLINE_32BIT_CODE_SIZE 0x70
+#define TRAMPOLINE_32BIT_CODE_SIZE 0x80

#define TRAMPOLINE_32BIT_STACK_END TRAMPOLINE_32BIT_SIZE

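The constant grows because the 32-bit trampoline code itself grew: the
decompressor copies exactly TRAMPOLINE_32BIT_CODE_SIZE bytes of
trampoline_32bit_src into the low-memory trampoline buffer, and the new
EFER/CR4 instructions pushed the code past the old 0x70-byte limit. The
copy in paging_prepare() (arch/x86/boot/compressed/pgtable_64.c) looks
roughly like this, quoted from memory, so details may differ:

	memcpy(trampoline_32bit + TRAMPOLINE_32BIT_CODE_OFFSET / sizeof(unsigned long),
	       &trampoline_32bit_src, TRAMPOLINE_32BIT_CODE_SIZE);

If the constant were left at 0x70, the tail of the enlarged trampoline
would silently be truncated.
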
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index b8e3019..39a6ff7 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -142,8 +142,22 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL)
 	addq	$(init_top_pgt - __START_KERNEL_map), %rax
 1:
 
+#ifdef CONFIG_X86_MCE
+	/*
+	 * Preserve CR4.MCE if the kernel will enable #MC support.
+	 * Clearing MCE may fault in some environments (that also force #MC
+	 * support). Any machine check that occurs before #MC support is fully
+	 * configured will crash the system regardless of the CR4.MCE value set
+	 * here.
+	 */
+	movq	%cr4, %rcx
+	andl	$X86_CR4_MCE, %ecx
+#else
+	movl	$0, %ecx
+#endif
+
 	/* Enable PAE mode, PGE and LA57 */
-	movl	$(X86_CR4_PAE | X86_CR4_PGE), %ecx
+	orl	$(X86_CR4_PAE | X86_CR4_PGE), %ecx
 #ifdef CONFIG_X86_5LEVEL
 	testl	$1, __pgtable_l5_enabled(%rip)
 	jz	1f
@@ -249,13 +263,23 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL)
 	/* Setup EFER (Extended Feature Enable Register) */
 	movl	$MSR_EFER, %ecx
 	rdmsr
+	/*
+	 * Preserve current value of EFER for comparison and to skip
+	 * EFER writes if no change was made (for TDX guest)
+	 */
+	movl	%eax, %edx
 	btsl	$_EFER_SCE, %eax	/* Enable System Call */
 	btl	$20,%edi		/* No Execute supported? */
 	jnc	1f
 	btsl	$_EFER_NX, %eax
 	btsq	$_PAGE_BIT_NX,early_pmd_flags(%rip)
-1:	wrmsr				/* Make changes effective */
 
+	/* Avoid writing EFER if no change was made (for TDX guest) */
+1:	cmpl	%edx, %eax
+	je	1f
+	xor	%edx, %edx
+	wrmsr				/* Make changes effective */
+1:
 	/* Setup cr0 */
 	movl	$CR0_STATE, %eax
 	/* Make changes effective */
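
One subtlety in the hunk above: RDMSR/WRMSR transfer the 64-bit MSR
value through the EDX:EAX register pair, and %edx is reused here as a
scratch copy of the old low half for the comparison. Before the WRMSR,
"xor %edx, %edx" therefore restores the high half to zero, which is
correct because EFER's upper 32 bits are reserved as zero. A rough C
rendering of the logic (nx_supported is a made-up flag standing in for
the CPUID NX bit tested via "btl $20, %edi"):

	u32 lo, hi, old;

	rdmsr(MSR_EFER, lo, hi);	/* hi comes back as 0 for EFER */
	old = lo;
	lo |= EFER_SCE;
	if (nx_supported)
		lo |= EFER_NX;
	if (lo != old)			/* no change => no WRMSR => no #VE */
		wrmsr(MSR_EFER, lo, 0);	/* high half written as 0 */
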
diff --git a/arch/x86/realmode/rm/trampoline_64.S b/arch/x86/realmode/rm/trampoline_64.S
index d380f2d..e38d61d 100644
--- a/arch/x86/realmode/rm/trampoline_64.S
+++ b/arch/x86/realmode/rm/trampoline_64.S
@@ -143,11 +143,22 @@ SYM_CODE_START(startup_32)
 	movl	%eax, %cr3
 
 	# Set up EFER
+	movl	$MSR_EFER, %ecx
+	rdmsr
+	/*
+	 * Skip writing to EFER if the register already has desired
+	 * value (to avoid #VE for the TDX guest).
+	 */
+	cmp	pa_tr_efer, %eax
+	jne	.Lwrite_efer
+	cmp	pa_tr_efer + 4, %edx
+	je	.Ldone_efer
+.Lwrite_efer:
 	movl	pa_tr_efer, %eax
 	movl	pa_tr_efer + 4, %edx
-	movl	$MSR_EFER, %ecx
 	wrmsr
 
+.Ldone_efer:
 	# Enable paging and in turn activate Long Mode.
 	movl	$CR0_STATE, %eax
 	movl	%eax, %cr0
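
pa_tr_efer refers to the efer field of the trampoline header, which the
kernel fills in before waking secondary CPUs; roughly (from
setup_real_mode() in arch/x86/realmode/init.c, quoted from memory):

	u64 efer;

	rdmsrl(MSR_EFER, efer);
	trampoline_header->efer = efer & ~EFER_LMA;

Because that stored value is derived from EFER as TDX already
configured it, the two CMP instructions above are expected to match in
a TDX guest: the trampoline jumps to .Ldone_efer and the WRMSR that
would raise a #VE never runs.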