Re: [PATCH v7 5/9] x86/smpboot: Split up native_cpu_up into separate phases and document them

From: Yuan Yao
Date: Wed Feb 08 2023 - 05:04:26 EST


On Tue, Feb 07, 2023 at 11:04:32PM +0000, Usama Arif wrote:
> From: David Woodhouse <dwmw@xxxxxxxxxxxx>
>
> There are four logical parts to what native_cpu_up() does on the BSP (or
> on the controlling CPU for a later hotplug):
>
> 1) Wake the AP by sending the INIT/SIPI/SIPI sequence.
>
> 2) Wait for the AP to make it as far as wait_for_master_cpu() which
> sets that CPU's bit in cpu_initialized_mask, then sets the bit in
> cpu_callout_mask to let the AP proceed through cpu_init().
>
> 3) Waits for the AP to finish cpu_init() and get as far as the
> smp_callin() call, which sets that CPU's bit in cpu_callin_mask.
>
> 4) Perform the TSC synchronization and wait for the AP to actually
> mark itself online in cpu_online_mask.
>
> In preparation to allow these phases to operate in parallel on multiple
> APs, split them out into separate functions and document the interactions
> a little more clearly in both the BSP and AP code paths.
>
> No functional change intended.
>
> [Usama Arif: fixed rebase conflict]
> Signed-off-by: David Woodhouse <dwmw@xxxxxxxxxxxx>
> Signed-off-by: Usama Arif <usama.arif@xxxxxxxxxxxxx>
> ---
> arch/x86/kernel/smpboot.c | 182 +++++++++++++++++++++++++++-----------
> 1 file changed, 128 insertions(+), 54 deletions(-)
>
> diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
> index 3a793772a2aa..eee717086ab7 100644
> --- a/arch/x86/kernel/smpboot.c
> +++ b/arch/x86/kernel/smpboot.c
> @@ -204,6 +204,10 @@ static void smp_callin(void)
>
> wmb();
>
> + /*
> + * This runs the AP through all the cpuhp states to its target
> + * state (CPUHP_ONLINE in the case of serial bringup).
> + */
> notify_cpu_starting(cpuid);
>
> /*
> @@ -231,17 +235,33 @@ static void notrace start_secondary(void *unused)
> load_cr3(swapper_pg_dir);
> __flush_tlb_all();
> #endif
> + /*
> + * Sync point with do_wait_cpu_initialized(). On boot, all secondary
> + * CPUs reach this stage after receiving INIT/SIPI from do_cpu_up()
> + * in the x86/cpu:kick cpuhp stage. At the start of cpu_init() they
> + * will wait for do_wait_cpu_initialized() to set their bit in
> + * smp_callout_mask to release them.

The last sentence of the comment looks confused. The fact is:

For serial case, The BSP waits AP to set cpu_initialized_mask from
wait_for_master_cpu() after fired INIT/SIPI, then AP starts to wait
cpu_callout_mask set by BSP from do_boot_cpu().

Or the comments below "Bringup step two:..." which also looks clear
enough then above.

> + */
> cpu_init_secondary();
> rcu_cpu_starting(raw_smp_processor_id());
> x86_cpuinit.early_percpu_clock_init();
> +
> + /*
> + * Sync point with do_wait_cpu_callin(). The AP doesn't wait here
> + * but just sets the bit to let the controlling CPU (BSP) know that
> + * it's got this far.
> + */
> smp_callin();
>
> enable_start_cpu0 = 0;
>
> /* otherwise gcc will move up smp_processor_id before the cpu_init */
> barrier();
> +
> /*
> - * Check TSC synchronization with the boot CPU:
> + * Check TSC synchronization with the boot CPU (or whichever CPU
> + * is controlling the bringup). It will do its part of this from
> + * do_wait_cpu_online(), making it an implicit sync point.
> */
> check_tsc_sync_target();
>
> @@ -254,6 +274,7 @@ static void notrace start_secondary(void *unused)
> * half valid vector space.
> */
> lock_vector_lock();
> + /* Sync point with do_wait_cpu_online() */
> set_cpu_online(smp_processor_id(), true);
> lapic_online();
> unlock_vector_lock();
> @@ -1083,7 +1104,6 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle,
> unsigned long start_ip = real_mode_header->trampoline_start;
>
> unsigned long boot_error = 0;
> - unsigned long timeout;
>
> #ifdef CONFIG_X86_64
> /* If 64-bit wakeup method exists, use the 64-bit mode trampoline IP */
> @@ -1144,55 +1164,94 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle,
> boot_error = wakeup_cpu_via_init_nmi(cpu, start_ip, apicid,
> cpu0_nmi_registered);
>
> - if (!boot_error) {
> - /*
> - * Wait 10s total for first sign of life from AP
> - */
> - boot_error = -1;
> - timeout = jiffies + 10*HZ;
> - while (time_before(jiffies, timeout)) {
> - if (cpumask_test_cpu(cpu, cpu_initialized_mask)) {
> - /*
> - * Tell AP to proceed with initialization
> - */
> - cpumask_set_cpu(cpu, cpu_callout_mask);
> - boot_error = 0;
> - break;
> - }
> - schedule();
> - }
> - }
> + return boot_error;
> +}
>
> - if (!boot_error) {
> - /*
> - * Wait till AP completes initial initialization
> - */
> - while (!cpumask_test_cpu(cpu, cpu_callin_mask)) {
> - /*
> - * Allow other tasks to run while we wait for the
> - * AP to come online. This also gives a chance
> - * for the MTRR work(triggered by the AP coming online)
> - * to be completed in the stop machine context.
> - */
> - schedule();
> - }
> +static int do_wait_cpu_cpumask(unsigned int cpu, const struct cpumask *mask)
> +{
> + unsigned long timeout;
> +
> + /*
> + * Wait up to 10s for the CPU to report in.
> + */
> + timeout = jiffies + 10*HZ;
> + while (time_before(jiffies, timeout)) {
> + if (cpumask_test_cpu(cpu, mask))
> + return 0;
> +
> + schedule();
> }
> + return -1;
> +}
>
> - if (x86_platform.legacy.warm_reset) {
> - /*
> - * Cleanup possible dangling ends...
> - */
> - smpboot_restore_warm_reset_vector();
> +/*
> + * Bringup step two: Wait for the target AP to reach cpu_init_secondary()
> + * and thus wait_for_master_cpu(), then set cpu_callout_mask to allow it
> + * to proceed. The AP will then proceed past setting its 'callin' bit
> + * and end up waiting in check_tsc_sync_target() until we reach
> + * do_wait_cpu_online() to tend to it.
> + */
> +static int do_wait_cpu_initialized(unsigned int cpu)
> +{
> + /*
> + * Wait for first sign of life from AP.
> + */
> + if (do_wait_cpu_cpumask(cpu, cpu_initialized_mask))
> + return -1;
> +
> + cpumask_set_cpu(cpu, cpu_callout_mask);
> + return 0;
> +}
> +
> +/*
> + * Bringup step three: Wait for the target AP to reach smp_callin().
> + * The AP is not waiting for us here so we don't need to parallelise
> + * this step. Not entirely clear why we care about this, since we just
> + * proceed directly to TSC synchronization which is the next sync
> + * point with the AP anyway.
> + */
> +static int do_wait_cpu_callin(unsigned int cpu)
> +{
> + /*
> + * Wait till AP completes initial initialization.
> + */
> + return do_wait_cpu_cpumask(cpu, cpu_callin_mask);
> +}
> +
> +/*
> + * Bringup step four: Synchronize the TSC and wait for the target AP
> + * to reach set_cpu_online() in start_secondary().
> + */
> +static int do_wait_cpu_online(unsigned int cpu)
> +{
> + unsigned long flags;
> +
> + /*
> + * Check TSC synchronization with the AP (keep irqs disabled
> + * while doing so):
> + */
> + local_irq_save(flags);
> + check_tsc_sync_source(cpu);
> + local_irq_restore(flags);
> +
> + /*
> + * Wait for the AP to mark itself online. Not entirely
> + * clear why we care, since the generic cpuhp code will
> + * wait for it to each CPUHP_AP_ONLINE_IDLE before going
> + * ahead with the rest of the bringup anyway.
> + */
> + while (!cpu_online(cpu)) {
> + cpu_relax();
> + touch_nmi_watchdog();
> }
>
> - return boot_error;
> + return 0;
> }
>
> -int native_cpu_up(unsigned int cpu, struct task_struct *tidle)
> +static int do_cpu_up(unsigned int cpu, struct task_struct *tidle)
> {
> int apicid = apic->cpu_present_to_apicid(cpu);
> int cpu0_nmi_registered = 0;
> - unsigned long flags;
> int err, ret = 0;
>
> lockdep_assert_irqs_enabled();
> @@ -1239,19 +1298,6 @@ int native_cpu_up(unsigned int cpu, struct task_struct *tidle)
> goto unreg_nmi;
> }
>
> - /*
> - * Check TSC synchronization with the AP (keep irqs disabled
> - * while doing so):
> - */
> - local_irq_save(flags);
> - check_tsc_sync_source(cpu);
> - local_irq_restore(flags);
> -
> - while (!cpu_online(cpu)) {
> - cpu_relax();
> - touch_nmi_watchdog();
> - }
> -
> unreg_nmi:
> /*
> * Clean up the nmi handler. Do this after the callin and callout sync
> @@ -1263,6 +1309,34 @@ int native_cpu_up(unsigned int cpu, struct task_struct *tidle)
> return ret;
> }
>
> +int native_cpu_up(unsigned int cpu, struct task_struct *tidle)
> +{
> + int ret;
> +
> + ret = do_cpu_up(cpu, tidle);
> + if (ret)
> + return ret;
> +
> + ret = do_wait_cpu_initialized(cpu);
> + if (ret)
> + return ret;
> +
> + ret = do_wait_cpu_callin(cpu);
> + if (ret)
> + return ret;
> +
> + ret = do_wait_cpu_online(cpu);
> +
> + if (x86_platform.legacy.warm_reset) {
> + /*
> + * Cleanup possible dangling ends...
> + */
> + smpboot_restore_warm_reset_vector();
> + }
> +
> + return ret;
> +}
> +
> /**
> * arch_disable_smp_support() - disables SMP support for x86 at runtime
> */
> --
> 2.25.1
>