Re: [PATCH] use x86 cpu park to speedup smp_init in kexec situation

From: David Woodhouse
Date: Thu Jan 21 2021 - 12:37:58 EST


On Thu, 2021-01-21 at 15:42 +0000, David Woodhouse wrote:
> [ 2.289283] BUG: kernel NULL pointer dereference, address: 0000000000000000
> [ 2.289283] #PF: supervisor write access in kernel mode
> [ 2.289283] #PF: error_code(0x0002) - not-present page
> [ 2.289283] PGD 0 P4D 0
> [ 2.289283] Oops: 0002 [#1] SMP PTI
> [ 2.289283] CPU: 32 PID: 0 Comm: swapper/32 Not tainted 5.10.0+ #745
> [ 2.289283] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.14.0-1.fc33 04/01/2014
> [ 2.289283] RIP: 0010:init_x2apic_ldr+0xa0/0xb0


OK... in alloc_clustermask() for each CPU we were preallocating a
cluster_mask and storing it in the global cluster_hotplug_mask.

Then later for each CPU we were taking the preallocated cluster_mask
and setting cluster_hotplug_mask to NULL.

That doesn't parallelise well :)

So... ditch the global variable, and let alloc_clustermask() install the
appropriate cluster_mask *directly* into the target CPU's per_cpu data
before that CPU is running. And since we have to derive the cluster ID
from the physical APIC ID anyway, we might as well calculate the logical
APIC ID and set x86_cpu_to_logical_apicid at the same time.

Now all that init_x2apic_ldr() actually *does* on the target CPU is set
that CPU's bit in the pre-existing cluster_mask.

To reduce the number of loops over all (present or online) CPUs, I've
made alloc_clustermask() set the per_cpu cluster_mask for *all* CPUs in
the cluster in one pass at boot time. I think the later hotplug case is
also sane, but I still have to test that.

But it passes the qemu boot test that it was failing earlier, at least...

diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c
index b0889c48a2ac..74bb4cae8b5b 100644
--- a/arch/x86/kernel/apic/x2apic_cluster.c
+++ b/arch/x86/kernel/apic/x2apic_cluster.c
@@ -18,7 +18,6 @@ struct cluster_mask {
static DEFINE_PER_CPU(u32, x86_cpu_to_logical_apicid);
static DEFINE_PER_CPU(cpumask_var_t, ipi_mask);
static DEFINE_PER_CPU(struct cluster_mask *, cluster_masks);
-static struct cluster_mask *cluster_hotplug_mask;

static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
{
@@ -98,54 +97,61 @@ static u32 x2apic_calc_apicid(unsigned int cpu)
static void init_x2apic_ldr(void)
{
struct cluster_mask *cmsk = this_cpu_read(cluster_masks);
- u32 cluster, apicid = apic_read(APIC_LDR);
- unsigned int cpu;

- this_cpu_write(x86_cpu_to_logical_apicid, apicid);
+ BUG_ON(!cmsk);

- if (cmsk)
- goto update;
-
- cluster = apicid >> 16;
- for_each_online_cpu(cpu) {
- cmsk = per_cpu(cluster_masks, cpu);
- /* Matching cluster found. Link and update it. */
- if (cmsk && cmsk->clusterid == cluster)
- goto update;
- }
- cmsk = cluster_hotplug_mask;
- cmsk->clusterid = cluster;
- cluster_hotplug_mask = NULL;
-update:
- this_cpu_write(cluster_masks, cmsk);
cpumask_set_cpu(smp_processor_id(), &cmsk->mask);
}

-static int alloc_clustermask(unsigned int cpu, int node)
+static int alloc_clustermask(unsigned int cpu, u32 cluster, int node)
{
+ struct cluster_mask *cmsk = NULL;
+ u32 apicid;
+
if (per_cpu(cluster_masks, cpu))
return 0;
- /*
- * If a hotplug spare mask exists, check whether it's on the right
- * node. If not, free it and allocate a new one.
+
+ /* For the hotplug case, don't always allocate a new one */
+ for_each_online_cpu(cpu) {
+ apicid = apic->cpu_present_to_apicid(cpu);
+ if (apicid != BAD_APICID && apicid >> 4 == cluster) {
+ cmsk = per_cpu(cluster_masks, cpu);
+ if (cmsk)
+ break;
+ }
+ }
+ if (!cmsk)
+ cmsk = kzalloc_node(sizeof(*cmsk), GFP_KERNEL, node);
+ if (!cmsk)
+ return -ENOMEM;
+
+ cmsk->node = node;
+ cmsk->clusterid = cluster;
+
+ /*
+ * As an optimisation during boot, set the cluster_mask for *all*
+ * present CPUs at once, which will include 'cpu'.
*/
- if (cluster_hotplug_mask) {
- if (cluster_hotplug_mask->node == node)
- return 0;
- kfree(cluster_hotplug_mask);
+ if (system_state < SYSTEM_RUNNING) {
+ for_each_present_cpu(cpu) {
+ u32 apicid = apic->cpu_present_to_apicid(cpu);
+ if (apicid != BAD_APICID && apicid >> 4 == cluster)
+ per_cpu(cluster_masks, cpu) = cmsk;
+ }
}

- cluster_hotplug_mask = kzalloc_node(sizeof(*cluster_hotplug_mask),
- GFP_KERNEL, node);
- if (!cluster_hotplug_mask)
- return -ENOMEM;
- cluster_hotplug_mask->node = node;
return 0;
}

static int x2apic_prepare_cpu(unsigned int cpu)
{
- if (alloc_clustermask(cpu, cpu_to_node(cpu)) < 0)
+ u32 phys_apicid = apic->cpu_present_to_apicid(cpu);
+ u32 cluster = phys_apicid >> 4;
+ u32 logical_apicid = (cluster << 16) | (1 << (phys_apicid & 0xf));
+
+ per_cpu(x86_cpu_to_logical_apicid, cpu) = logical_apicid;
+
+ if (alloc_clustermask(cpu, cluster, cpu_to_node(cpu)) < 0)
return -ENOMEM;
if (!zalloc_cpumask_var(&per_cpu(ipi_mask, cpu), GFP_KERNEL))
return -ENOMEM;

Attachment: smime.p7s
Description: S/MIME cryptographic signature