[PATCH] ppc64: cpu hotplug notifier for numa

From: Paul Mackerras
Date: Mon Oct 25 2004 - 23:40:59 EST


This patch is from Nathan Lynch <nathanl@xxxxxxxxxxxxxx>.

The NUMA properties of all "possible" cpus are not necessarily
available at boot time on ppc64 LPAR. Only the properties for present
cpus are known.

This patch modifies the ppc64 numa code to map a cpu to its node right
before it is brought up -- this means that secondary cpus are now
mapped to their nodes during smp_init(). Cpus are removed from their
nodes after they have gone offline.

Also some minor cleanups:
- Stash the "minimum common depth" in a global at boot time, so we
don't have to rediscover it every time something changes.

- Remove unnecessary variable from of_get_associativity() which is
accessed while possibly uninitialized.

- Remove the cpu portion from dump_numa_topology() since it will show
only the boot cpu now. We could display this information from
smp_cpus_done() if necessary.

Tested on a 4-way 2-node Power5 system.

Signed-off-by: Nathan Lynch <nathanl@xxxxxxxxxxxxxx>
Signed-off-by: Paul Mackerras <paulus@xxxxxxxxx>

diff -puN arch/ppc64/mm/numa.c~ppc64-numa-cpu-hotplug-notifier arch/ppc64/mm/numa.c
--- 2.6.10-rc1/arch/ppc64/mm/numa.c~ppc64-numa-cpu-hotplug-notifier 2004-10-23 15:10:39.000000000 -0500
+++ 2.6.10-rc1-nathanl/arch/ppc64/mm/numa.c 2004-10-23 16:28:58.000000000 -0500
@@ -15,6 +15,8 @@
#include <linux/mmzone.h>
#include <linux/module.h>
#include <linux/nodemask.h>
+#include <linux/cpu.h>
+#include <linux/notifier.h>
#include <asm/lmb.h>
#include <asm/machdep.h>
#include <asm/abs_addr.h>
@@ -39,6 +41,7 @@ int nr_cpus_in_node[MAX_NUMNODES] = { [0
struct pglist_data *node_data[MAX_NUMNODES];
bootmem_data_t __initdata plat_node_bdata[MAX_NUMNODES];
static unsigned long node0_io_hole_size;
+static int min_common_depth;

/*
* We need somewhere to store start/span for each node until we have
@@ -64,7 +67,24 @@ static inline void map_cpu_to_node(int c
}
}

-static struct device_node * __init find_cpu_node(unsigned int cpu)
+#ifdef CONFIG_HOTPLUG_CPU
+static void unmap_cpu_from_node(unsigned long cpu)
+{
+ int node = numa_cpu_lookup_table[cpu];
+
+ dbg("removing cpu %lu from node %d\n", cpu, node);
+
+ if (cpu_isset(cpu, numa_cpumask_lookup_table[node])) {
+ cpu_clear(cpu, numa_cpumask_lookup_table[node]);
+ nr_cpus_in_node[node]--;
+ } else {
+ printk(KERN_ERR "WARNING: cpu %lu not found in node %d\n",
+ cpu, node);
+ }
+}
+#endif /* CONFIG_HOTPLUG_CPU */
+
+static struct device_node * __devinit find_cpu_node(unsigned int cpu)
{
unsigned int hw_cpuid = get_hard_smp_processor_id(cpu);
struct device_node *cpu_node = NULL;
@@ -96,26 +116,21 @@ static struct device_node * __init find_

/* must hold reference to node during call */
static int *of_get_associativity(struct device_node *dev)
- {
- unsigned int *result;
- int len;
-
- result = (unsigned int *)get_property(dev, "ibm,associativity", &len);
-
- if (len <= 0)
- return NULL;
-
- return result;
+{
+ return (unsigned int *)get_property(dev, "ibm,associativity", NULL);
}

-static int of_node_numa_domain(struct device_node *device, int depth)
+static int of_node_numa_domain(struct device_node *device)
{
int numa_domain;
unsigned int *tmp;

+ if (min_common_depth == -1)
+ return 0;
+
tmp = of_get_associativity(device);
- if (tmp && (tmp[0] >= depth)) {
- numa_domain = tmp[depth];
+ if (tmp && (tmp[0] >= min_common_depth)) {
+ numa_domain = tmp[min_common_depth];
} else {
dbg("WARNING: no NUMA information for %s\n",
device->full_name);
@@ -138,7 +153,7 @@ static int of_node_numa_domain(struct de
*
* - Dave Hansen <haveblue@xxxxxxxxxx>
*/
-static int find_min_common_depth(void)
+static int __init find_min_common_depth(void)
{
int depth;
unsigned int *ref_points;
@@ -185,11 +200,72 @@ static unsigned long read_cell_ul(struct
return result;
}

+/*
+ * Figure out to which domain a cpu belongs and stick it there.
+ * Return the id of the domain used.
+ */
+static int numa_setup_cpu(unsigned long lcpu)
+{
+ int numa_domain = 0;
+ struct device_node *cpu = find_cpu_node(lcpu);
+
+ if (!cpu) {
+ WARN_ON(1);
+ goto out;
+ }
+
+ numa_domain = of_node_numa_domain(cpu);
+
+ if (numa_domain >= MAX_NUMNODES) {
+ /*
+ * POWER4 LPAR uses 0xffff as invalid node,
+ * dont warn in this case.
+ */
+ if (numa_domain != 0xffff)
+ printk(KERN_ERR "WARNING: cpu %ld "
+ "maps to invalid NUMA node %d\n",
+ lcpu, numa_domain);
+ numa_domain = 0;
+ }
+out:
+ node_set_online(numa_domain);
+
+ map_cpu_to_node(lcpu, numa_domain);
+
+ of_node_put(cpu);
+
+ return numa_domain;
+}
+
+static int cpu_numa_callback(struct notifier_block *nfb,
+ unsigned long action,
+ void *hcpu)
+{
+ unsigned long lcpu = (unsigned long)hcpu;
+ int ret = NOTIFY_DONE;
+
+ switch (action) {
+ case CPU_UP_PREPARE:
+ if (min_common_depth == -1 || !numa_enabled)
+ map_cpu_to_node(lcpu, 0);
+ else
+ numa_setup_cpu(lcpu);
+ ret = NOTIFY_OK;
+ break;
+#ifdef CONFIG_HOTPLUG_CPU
+ case CPU_DEAD:
+ case CPU_UP_CANCELED:
+ unmap_cpu_from_node(lcpu);
+ break;
+ ret = NOTIFY_OK;
+#endif
+ }
+ return ret;
+}
+
static int __init parse_numa_properties(void)
{
- struct device_node *cpu = NULL;
struct device_node *memory = NULL;
- int depth;
int max_domain = 0;
long entries = lmb_end_of_DRAM() >> MEMORY_INCREMENT_SHIFT;
unsigned long i;
@@ -206,44 +282,13 @@ static int __init parse_numa_properties(
for (i = 0; i < entries ; i++)
numa_memory_lookup_table[i] = ARRAY_INITIALISER;

- depth = find_min_common_depth();
-
- dbg("NUMA associativity depth for CPU/Memory: %d\n", depth);
- if (depth < 0)
- return depth;
-
- for_each_cpu(i) {
- int numa_domain;
-
- cpu = find_cpu_node(i);
-
- if (cpu) {
- numa_domain = of_node_numa_domain(cpu, depth);
- of_node_put(cpu);
-
- if (numa_domain >= MAX_NUMNODES) {
- /*
- * POWER4 LPAR uses 0xffff as invalid node,
- * dont warn in this case.
- */
- if (numa_domain != 0xffff)
- printk(KERN_ERR "WARNING: cpu %ld "
- "maps to invalid NUMA node %d\n",
- i, numa_domain);
- numa_domain = 0;
- }
- } else {
- dbg("WARNING: no NUMA information for cpu %ld\n", i);
- numa_domain = 0;
- }
-
- node_set_online(numa_domain);
+ min_common_depth = find_min_common_depth();

- if (max_domain < numa_domain)
- max_domain = numa_domain;
+ dbg("NUMA associativity depth for CPU/Memory: %d\n", min_common_depth);
+ if (min_common_depth < 0)
+ return min_common_depth;

- map_cpu_to_node(i, numa_domain);
- }
+ max_domain = numa_setup_cpu(boot_cpuid);

memory = NULL;
while ((memory = of_find_node_by_type(memory, "memory")) != NULL) {
@@ -267,7 +312,7 @@ new_range:
start = _ALIGN_DOWN(start, MEMORY_INCREMENT);
size = _ALIGN_UP(size, MEMORY_INCREMENT);

- numa_domain = of_node_numa_domain(memory, depth);
+ numa_domain = of_node_numa_domain(memory);

if (numa_domain >= MAX_NUMNODES) {
if (numa_domain != 0xffff)
@@ -341,8 +386,7 @@ static void __init setup_nonnuma(void)
numa_memory_lookup_table[i] = ARRAY_INITIALISER;
}

- for (i = 0; i < NR_CPUS; i++)
- map_cpu_to_node(i, 0);
+ map_cpu_to_node(boot_cpuid, 0);

node_set_online(0);

@@ -358,35 +402,10 @@ static void __init setup_nonnuma(void)
static void __init dump_numa_topology(void)
{
unsigned int node;
- unsigned int cpu, count;
+ unsigned int count;

- for (node = 0; node < MAX_NUMNODES; node++) {
- if (!node_online(node))
- continue;
-
- printk(KERN_INFO "Node %d CPUs:", node);
-
- count = 0;
- /*
- * If we used a CPU iterator here we would miss printing
- * the holes in the cpumap.
- */
- for (cpu = 0; cpu < NR_CPUS; cpu++) {
- if (cpu_isset(cpu, numa_cpumask_lookup_table[node])) {
- if (count == 0)
- printk(" %u", cpu);
- ++count;
- } else {
- if (count > 1)
- printk("-%u", cpu - 1);
- count = 0;
- }
- }
-
- if (count > 1)
- printk("-%u", NR_CPUS - 1);
- printk("\n");
- }
+ if (min_common_depth == -1 || !numa_enabled)
+ return;

for (node = 0; node < MAX_NUMNODES; node++) {
unsigned long i;
@@ -414,6 +433,7 @@ static void __init dump_numa_topology(vo
printk("-0x%lx", i);
printk("\n");
}
+ return;
}

/*
@@ -460,6 +480,10 @@ static unsigned long careful_allocation(
void __init do_init_bootmem(void)
{
int nid;
+ static struct notifier_block ppc64_numa_nb = {
+ .notifier_call = cpu_numa_callback,
+ .priority = 1 /* Must run before sched domains notifier. */
+ };

min_low_pfn = 0;
max_low_pfn = lmb_end_of_DRAM() >> PAGE_SHIFT;
@@ -470,6 +494,8 @@ void __init do_init_bootmem(void)
else
dump_numa_topology();

+ register_cpu_notifier(&ppc64_numa_nb);
+
for (nid = 0; nid < numnodes; nid++) {
unsigned long start_paddr, end_paddr;
int i;
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/