Re: [PATCH v10 6/8] x86/smpboot: Support parallel startup of secondary CPUs

From: Yuan Yao
Date: Fri Feb 24 2023 - 01:00:04 EST


On Tue, Feb 21, 2023 at 10:33:50PM +0000, Usama Arif wrote:
> From: David Woodhouse <dwmw@xxxxxxxxxxxx>
>
> Rework the real-mode startup code to allow for APs to be brought up in
> parallel. This is in two parts:
>
> 1. Introduce a bit-spinlock to prevent them from all using the real
> mode stack at the same time.
>
> 2. Avoid the use of global variables for passing per-CPU information to
> the APs.
>
> To achieve the latter, export the cpuid_to_apicid[] array so that each
> AP can find its own per_cpu data (and thus initial_gs, initial_stack and
> early_gdt_descr) by searching therein based on its APIC ID.
>
> Introduce a global variable 'smpboot_control' indicating to the AP how
> it should find its APIC ID. For a serialized bringup, the APIC ID is
> explicitly passed in the low bits of smpboot_control, while for parallel
> mode there are flags directing the AP to find its APIC ID in CPUID leaf
> 0x0b (for X2APIC mode) or CPUID leaf 0x01 where 8 bits are sufficient.
>
> Parallel startup may be disabled by a command line option, and also if:
> • AMD SEV-ES is in use, since the AP may not use CPUID that early.
> • X2APIC is enabled, but CPUID leaf 0xb is not present and correect.
> • X2APIC is not enabled but not even CPUID leaf 0x01 exists.
>
> Aside from the fact that APs will now look up their per-cpu data via the
> newly-exported cpuid_to_apicid[] table, there is no behavioural change
> intended yet, since new parallel CPUHP states have not — yet — been
> added.
>
> [ tglx: Initial proof of concept patch with bitlock and APIC ID lookup ]
> [ dwmw2: Rework and testing, commit message, CPUID 0x1 and CPU0 support ]
> [ seanc: Fix stray override of initial_gs in common_cpu_up() ]
> [ Oleksandr Natalenko: reported suspend/resume issue fixed in
> x86_acpi_suspend_lowlevel ]
> Co-developed-by: Thomas Gleixner <tglx@xxxxxxxxxxxxx>
> Signed-off-by: Thomas Gleixner <tglx@xxxxxxxxxxxxx>
> Signed-off-by: David Woodhouse <dwmw@xxxxxxxxxxxx>
> Signed-off-by: Usama Arif <usama.arif@xxxxxxxxxxxxx>
> Tested-by: Paul E. McKenney <paulmck@xxxxxxxxxx>
> Tested-by: Kim Phillips <kim.phillips@xxxxxxx>
> Tested-by: Oleksandr Natalenko <oleksandr@xxxxxxxxxxxxxx>
> ---
> .../admin-guide/kernel-parameters.txt | 3 +
> arch/x86/include/asm/realmode.h | 3 +
> arch/x86/include/asm/smp.h | 10 +-
> arch/x86/kernel/acpi/sleep.c | 7 ++
> arch/x86/kernel/apic/apic.c | 2 +-
> arch/x86/kernel/head_64.S | 99 ++++++++++++++++++-
> arch/x86/kernel/smpboot.c | 62 +++++++++++-
> arch/x86/realmode/init.c | 3 +
> arch/x86/realmode/rm/trampoline_64.S | 14 +++
> kernel/smpboot.c | 2 +-
> 10 files changed, 197 insertions(+), 8 deletions(-)
>
> diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
> index 6cfa6e3996cf..ee099b8aac6d 100644
> --- a/Documentation/admin-guide/kernel-parameters.txt
> +++ b/Documentation/admin-guide/kernel-parameters.txt
> @@ -3819,6 +3819,9 @@
>
> nomodule Disable module load
>
> + no_parallel_bringup
> + [X86,SMP] Disable parallel brinugp of secondary cores.
> +
> nopat [X86] Disable PAT (page attribute table extension of
> pagetables) support.
>
> diff --git a/arch/x86/include/asm/realmode.h b/arch/x86/include/asm/realmode.h
> index a336feef0af1..f0357cfe2fb0 100644
> --- a/arch/x86/include/asm/realmode.h
> +++ b/arch/x86/include/asm/realmode.h
> @@ -52,6 +52,7 @@ struct trampoline_header {
> u64 efer;
> u32 cr4;
> u32 flags;
> + u32 lock;
> #endif
> };
>
> @@ -65,6 +66,8 @@ extern unsigned long initial_stack;
> extern unsigned long initial_vc_handler;
> #endif
>
> +extern u32 *trampoline_lock;
> +
> extern unsigned char real_mode_blob[];
> extern unsigned char real_mode_relocs[];
>
> diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
> index b4dbb20dab1a..33c0d5fd8af6 100644
> --- a/arch/x86/include/asm/smp.h
> +++ b/arch/x86/include/asm/smp.h
> @@ -199,5 +199,13 @@ extern void nmi_selftest(void);
> #define nmi_selftest() do { } while (0)
> #endif
>
> -#endif /* __ASSEMBLY__ */
> +extern unsigned int smpboot_control;
> +
> +#endif /* !__ASSEMBLY__ */
> +
> +/* Control bits for startup_64 */
> +#define STARTUP_SECONDARY 0x80000000
> +#define STARTUP_APICID_CPUID_0B 0x40000000
> +#define STARTUP_APICID_CPUID_01 0x20000000
> +
> #endif /* _ASM_X86_SMP_H */
> diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c
> index 3b7f4cdbf2e0..47e75c056cb5 100644
> --- a/arch/x86/kernel/acpi/sleep.c
> +++ b/arch/x86/kernel/acpi/sleep.c
> @@ -16,6 +16,7 @@
> #include <asm/cacheflush.h>
> #include <asm/realmode.h>
> #include <asm/hypervisor.h>
> +#include <asm/smp.h>
>
> #include <linux/ftrace.h>
> #include "../../realmode/rm/wakeup.h"
> @@ -57,6 +58,7 @@ asmlinkage acpi_status __visible x86_acpi_enter_sleep_state(u8 state)
> */
> int x86_acpi_suspend_lowlevel(void)
> {
> + unsigned int __maybe_unused saved_smpboot_ctrl;
> struct wakeup_header *header =
> (struct wakeup_header *) __va(real_mode_header->wakeup_header);
>
> @@ -115,6 +117,8 @@ int x86_acpi_suspend_lowlevel(void)
> early_gdt_descr.address =
> (unsigned long)get_cpu_gdt_rw(smp_processor_id());
> initial_gs = per_cpu_offset(smp_processor_id());
> + /* Force the startup into boot mode */
> + saved_smpboot_ctrl = xchg(&smpboot_control, 0);
> #endif
> initial_code = (unsigned long)wakeup_long64;
> saved_magic = 0x123456789abcdef0L;
> @@ -127,6 +131,9 @@ int x86_acpi_suspend_lowlevel(void)
> pause_graph_tracing();
> do_suspend_lowlevel();
> unpause_graph_tracing();
> +
> + if (IS_ENABLED(CONFIG_64BIT) && IS_ENABLED(CONFIG_SMP))
> + smpboot_control = saved_smpboot_ctrl;
> return 0;
> }
>
> diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
> index 20d9a604da7c..ac1d7e5da1f2 100644
> --- a/arch/x86/kernel/apic/apic.c
> +++ b/arch/x86/kernel/apic/apic.c
> @@ -2377,7 +2377,7 @@ static int nr_logical_cpuids = 1;
> /*
> * Used to store mapping between logical CPU IDs and APIC IDs.
> */
> -static int cpuid_to_apicid[] = {
> +int cpuid_to_apicid[] = {
> [0 ... NR_CPUS - 1] = -1,
> };
>
> diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
> index 222efd4a09bc..0e4e53d231db 100644
> --- a/arch/x86/kernel/head_64.S
> +++ b/arch/x86/kernel/head_64.S
> @@ -25,6 +25,7 @@
> #include <asm/export.h>
> #include <asm/nospec-branch.h>
> #include <asm/fixmap.h>
> +#include <asm/smp.h>
>
> /*
> * We are not able to switch in one step to the final KERNEL ADDRESS SPACE
> @@ -241,6 +242,83 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL)
> UNWIND_HINT_EMPTY
> ANNOTATE_NOENDBR // above
>
> +#ifdef CONFIG_SMP
> + /*
> + * Is this the boot CPU coming up? If so everything is available
> + * in initial_gs, initial_stack and early_gdt_descr.
> + */
> + movl smpboot_control(%rip), %edx
> + testl $STARTUP_SECONDARY, %edx
> + jz .Lsetup_cpu
> +
> + /*
> + * Secondary CPUs find out the offsets via the APIC ID. For parallel
> + * boot the APIC ID is retrieved from CPUID, otherwise it's encoded
> + * in smpboot_control:
> + * Bit 31 STARTUP_SECONDARY flag (checked above)
> + * Bit 30 STARTUP_APICID_CPUID_0B flag (use CPUID 0x0b)
> + * Bit 29 STARTUP_APICID_CPUID_01 flag (use CPUID 0x01)
> + * Bit 0-24 APIC ID if STARTUP_APICID_CPUID_xx flags are not set
> + */
> + testl $STARTUP_APICID_CPUID_0B, %edx
> + jnz .Luse_cpuid_0b
> + testl $STARTUP_APICID_CPUID_01, %edx
> + jnz .Luse_cpuid_01
> + andl $0x0FFFFFFF, %edx
> + jmp .Lsetup_AP
> +
> +.Luse_cpuid_01:
> + mov $0x01, %eax
> + cpuid
> + mov %ebx, %edx
> + shr $24, %edx
> + jmp .Lsetup_AP
> +
> +.Luse_cpuid_0b:
> + mov $0x0B, %eax
> + xorl %ecx, %ecx
> + cpuid
> +
> +.Lsetup_AP:
> + /* EDX contains the APIC ID of the current CPU */
> + xorq %rcx, %rcx
> + leaq cpuid_to_apicid(%rip), %rbx
> +
> +.Lfind_cpunr:
> + cmpl (%rbx,%rcx,4), %edx
> + jz .Linit_cpu_data
> + inc %ecx
> + cmpl nr_cpu_ids(%rip), %ecx
> + jb .Lfind_cpunr
> +
> + /* APIC ID not found in the table. Drop the trampoline lock and bail. */
> + movq trampoline_lock(%rip), %rax
> + lock
> + btrl $0, (%rax)
> +
> +1: cli
> + hlt
> + jmp 1b
> +
> +.Linit_cpu_data:
> + /* Get the per cpu offset for the given CPU# which is in ECX */
> + leaq __per_cpu_offset(%rip), %rbx
> + movq (%rbx,%rcx,8), %rbx
> + /* Save it for GS BASE setup */
> + movq %rbx, initial_gs(%rip)
> +
> + /* Calculate the GDT address */
> + movq $gdt_page, %rcx
> + addq %rbx, %rcx
> + movq %rcx, early_gdt_descr_base(%rip)
> +
> + /* Find the idle task stack */
> + movq idle_threads(%rbx), %rcx
> + movq TASK_threadsp(%rcx), %rcx
> + movq %rcx, initial_stack(%rip)
> +#endif /* CONFIG_SMP */
> +
> +.Lsetup_cpu:
> /*
> * We must switch to a new descriptor in kernel space for the GDT
> * because soon the kernel won't have access anymore to the userspace
> @@ -281,6 +359,14 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL)
> */
> movq initial_stack(%rip), %rsp
>
> + /* Drop the realmode protection. For the boot CPU the pointer is NULL! */
> + movq trampoline_lock(%rip), %rax
> + testq %rax, %rax
> + jz .Lsetup_idt
> + lock
> + btrl $0, (%rax)
> +
> +.Lsetup_idt:
> /* Setup and Load IDT */
> pushq %rsi
> call early_setup_idt
> @@ -372,7 +458,14 @@ SYM_CODE_END(secondary_startup_64)
> SYM_CODE_START(start_cpu0)
> ANNOTATE_NOENDBR
> UNWIND_HINT_EMPTY
> - movq initial_stack(%rip), %rsp
> + /* Load the per-cpu base for CPU#0 */
> + leaq __per_cpu_offset(%rip), %rbx
> + movq (%rbx), %rbx
> +
> + /* Find the idle task stack */
> + movq idle_threads(%rbx), %rcx
> + movq TASK_threadsp(%rcx), %rsp
> +
> jmp .Ljump_to_C_code
> SYM_CODE_END(start_cpu0)
> #endif
> @@ -426,6 +519,7 @@ SYM_DATA(initial_vc_handler, .quad handle_vc_boot_ghcb)
> * reliably detect the end of the stack.
> */
> SYM_DATA(initial_stack, .quad init_thread_union + THREAD_SIZE - FRAME_SIZE)
> +SYM_DATA(trampoline_lock, .quad 0);
> __FINITDATA
>
> __INIT
> @@ -660,6 +754,9 @@ SYM_DATA_END(level1_fixmap_pgt)
> SYM_DATA(early_gdt_descr, .word GDT_ENTRIES*8-1)
> SYM_DATA_LOCAL(early_gdt_descr_base, .quad INIT_PER_CPU_VAR(gdt_page))
>
> + .align 16
> +SYM_DATA(smpboot_control, .long 0)
> +
> .align 16
> /* This must match the first entry in level2_kernel_pgt */
> SYM_DATA(phys_base, .quad 0x0)
> diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
> index b18c1385e181..6924d91b69ca 100644
> --- a/arch/x86/kernel/smpboot.c
> +++ b/arch/x86/kernel/smpboot.c
> @@ -797,6 +797,16 @@ static int __init cpu_init_udelay(char *str)
> }
> early_param("cpu_init_udelay", cpu_init_udelay);
>
> +static bool do_parallel_bringup __ro_after_init = true;
> +
> +static int __init no_parallel_bringup(char *str)
> +{
> + do_parallel_bringup = false;
> +
> + return 0;
> +}
> +early_param("no_parallel_bringup", no_parallel_bringup);
> +
> static void __init smp_quirk_init_udelay(void)
> {
> /* if cmdline changed it from default, leave it alone */
> @@ -1084,8 +1094,6 @@ int common_cpu_up(unsigned int cpu, struct task_struct *idle)
> #ifdef CONFIG_X86_32
> /* Stack for startup_32 can be just as for start_secondary onwards */
> per_cpu(pcpu_hot.top_of_stack, cpu) = task_top_of_stack(idle);
> -#else
> - initial_gs = per_cpu_offset(cpu);
> #endif
> return 0;
> }
> @@ -1110,9 +1118,14 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle,
> start_ip = real_mode_header->trampoline_start64;
> #endif
> idle->thread.sp = (unsigned long)task_pt_regs(idle);
> - early_gdt_descr.address = (unsigned long)get_cpu_gdt_rw(cpu);
> initial_code = (unsigned long)start_secondary;
> - initial_stack = idle->thread.sp;
> +
> + if (IS_ENABLED(CONFIG_X86_32)) {
> + early_gdt_descr.address = (unsigned long)get_cpu_gdt_rw(cpu);
> + initial_stack = idle->thread.sp;
> + } else if (!do_parallel_bringup) {
> + smpboot_control = STARTUP_SECONDARY | apicid;
> + }
>
> /* Enable the espfix hack for this CPU */
> init_espfix_ap(cpu);
> @@ -1512,6 +1525,47 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
>
> speculative_store_bypass_ht_init();
>
> + /*
> + * We can do 64-bit AP bringup in parallel if the CPU reports
> + * its APIC ID in CPUID (either leaf 0x0B if we need the full
> + * APIC ID in X2APIC mode, or leaf 0x01 if 8 bits are
> + * sufficient). Otherwise it's too hard. And not for SEV-ES
> + * guests because they can't use CPUID that early.
> + */
> + if (IS_ENABLED(CONFIG_X86_32) || boot_cpu_data.cpuid_level < 1 ||
> + (x2apic_mode && boot_cpu_data.cpuid_level < 0xb) ||
> + cc_platform_has(CC_ATTR_GUEST_STATE_ENCRYPT))
> + do_parallel_bringup = false;
> +
> + if (do_parallel_bringup && x2apic_mode) {
> + unsigned int eax, ebx, ecx, edx;
> +
> + /*
> + * To support parallel bringup in x2apic mode, the AP will need
> + * to obtain its APIC ID from CPUID 0x0B, since CPUID 0x01 has
> + * only 8 bits. Check that it is present and seems correct.
> + */
> + cpuid_count(0xb, 0, &eax, &ebx, &ecx, &edx);
> +
> + /*
> + * AMD says that if executed with an umimplemented level in
> + * ECX, then it will return all zeroes in EAX. Intel says it
> + * will return zeroes in both EAX and EBX. Checking only EAX
> + * should be sufficient.
> + */
> + if (eax) {
> + pr_debug("Using CPUID 0xb for parallel CPU startup\n");
> + smpboot_control = STARTUP_SECONDARY | STARTUP_APICID_CPUID_0B;
> + } else {
> + pr_info("Disabling parallel bringup because CPUID 0xb looks untrustworthy\n");
> + do_parallel_bringup = false;
> + }
> + } else if (do_parallel_bringup) {
> + /* Without X2APIC, what's in CPUID 0x01 should suffice. */
> + pr_debug("Using CPUID 0x1 for parallel CPU startup\n");
> + smpboot_control = STARTUP_SECONDARY | STARTUP_APICID_CPUID_01;
> + }
> +
> snp_set_wakeup_secondary_cpu();
> }
>
> diff --git a/arch/x86/realmode/init.c b/arch/x86/realmode/init.c
> index af565816d2ba..788e5559549f 100644
> --- a/arch/x86/realmode/init.c
> +++ b/arch/x86/realmode/init.c
> @@ -154,6 +154,9 @@ static void __init setup_real_mode(void)
>
> trampoline_header->flags = 0;
>
> + trampoline_lock = &trampoline_header->lock;
> + *trampoline_lock = 0;
> +
> trampoline_pgd = (u64 *) __va(real_mode_header->trampoline_pgd);
>
> /* Map the real mode stub as virtual == physical */
> diff --git a/arch/x86/realmode/rm/trampoline_64.S b/arch/x86/realmode/rm/trampoline_64.S
> index e38d61d6562e..49ebc1636ffd 100644
> --- a/arch/x86/realmode/rm/trampoline_64.S
> +++ b/arch/x86/realmode/rm/trampoline_64.S
> @@ -49,6 +49,19 @@ SYM_CODE_START(trampoline_start)
> mov %ax, %es
> mov %ax, %ss
>
> + /*
> + * Make sure only one CPU fiddles with the realmode stack
> + */
> +.Llock_rm:
> + btl $0, tr_lock
> + jnc 2f
> + pause
> + jmp .Llock_rm
> +2:
> + lock
> + btsl $0, tr_lock
> + jc .Llock_rm
> +

Looks these changes should be applied to trampoline_start64()
yet, which is used for boot up APs when apic->wakeup_secondary_cpu_64
is available, e.g when ACPI_MADT_TYPE_MULTIPROC_WAKEUP is available.

One case I know is the INTEL TD guest, which using the MADT wakeup
for AP wake up now.

> # Setup stack
> movl $rm_stack_end, %esp
>
> @@ -241,6 +254,7 @@ SYM_DATA_START(trampoline_header)
> SYM_DATA(tr_efer, .space 8)
> SYM_DATA(tr_cr4, .space 4)
> SYM_DATA(tr_flags, .space 4)
> + SYM_DATA(tr_lock, .space 4)
> SYM_DATA_END(trampoline_header)
>
> #include "trampoline_common.S"
> diff --git a/kernel/smpboot.c b/kernel/smpboot.c
> index 2c7396da470c..a18a21dff9bc 100644
> --- a/kernel/smpboot.c
> +++ b/kernel/smpboot.c
> @@ -25,7 +25,7 @@
> * For the hotplug case we keep the task structs around and reuse
> * them.
> */
> -static DEFINE_PER_CPU(struct task_struct *, idle_threads);
> +DEFINE_PER_CPU(struct task_struct *, idle_threads);
>
> struct task_struct *idle_thread_get(unsigned int cpu)
> {
> --
> 2.25.1
>