RE: [PATCH 1/1] sched/topology: Make sched_init_numa() use a set for the deduplicating sort

From: Song Bao Hua (Barry Song)
Date: Sun Jan 24 2021 - 21:25:32 EST




> -----Original Message-----
> From: Valentin Schneider [mailto:valentin.schneider@xxxxxxx]
> Sent: Saturday, January 23, 2021 1:40 AM
> To: linux-kernel@xxxxxxxxxxxxxxx
> Cc: mingo@xxxxxxxxxx; peterz@xxxxxxxxxxxxx; vincent.guittot@xxxxxxxxxx;
> dietmar.eggemann@xxxxxxx; morten.rasmussen@xxxxxxx; mgorman@xxxxxxx; Song Bao
> Hua (Barry Song) <song.bao.hua@xxxxxxxxxxxxx>
> Subject: [PATCH 1/1] sched/topology: Make sched_init_numa() use a set for the
> deduplicating sort
>
> The deduplicating sort in sched_init_numa() assumes that the first line in
> the distance table contains all unique values in the entire table. I've
> been trying to pen what this exactly means for the topology, but it's not
> straightforward. For instance, topology.c uses this example:
>
> node 0 1 2 3
> 0: 10 20 20 30
> 1: 20 10 20 20
> 2: 20 20 10 20
> 3: 30 20 20 10
>
> 0 ----- 1
> | / |
> | / |
> | / |
> 2 ----- 3
>
> Which works out just fine. However, if we swap nodes 0 and 1:
>
> 1 ----- 0
> | / |
> | / |
> | / |
> 2 ----- 3
>
> we get this distance table:
>
> node 0 1 2 3
> 0: 10 20 20 20
> 1: 20 10 20 30
> 2: 20 20 10 20
> 3: 20 30 20 10
>
> Which breaks the deduplicating sort (non-representative first line). In
> this case this would just be a renumbering exercise, but it so happens that
> we can have a deduplicating sort that goes through the whole table in O(n²)
> at the extra cost of a temporary memory allocation (i.e. any form of set).
>
> The ACPI spec (SLIT) mentions distances are encoded on 8 bits. Following
> this, implement the set as a 256-bits bitmap. Should this not be
> satisfactory (i.e. we want to support 32-bit values), then we'll have to go
> for some other sparse set implementation.
>
> This has the added benefit of letting us allocate just the right amount of
> memory for sched_domains_numa_distance[], rather than an arbitrary
> (nr_node_ids + 1).
>
> Note: DT binding equivalent (distance-map) decodes distances as 32-bit
> values.
>
> Signed-off-by: Valentin Schneider <valentin.schneider@xxxxxxx>

Hi Valentin, thanks.
It seems it didn't resolve my problem. The simplest code to workaround
my problem is:
void sched_init_numa(void)
{
...
next_distance = curr_distance;
for (i = 0; i < nr_node_ids; i++) {
for (j = 0; j < nr_node_ids; j++) {
for (k = 0; k < nr_node_ids; k++) {
+ int ii = (i + 1) % nr_node_ids;
- int distance = node_distance(i, k);
+ int distance = node_distance(ii, k);

if (distance > curr_distance &&
(distance < next_distance ||
next_distance == curr_distance))
next_distance = distance;

/*

On the other hand, the patch made the kernel panic during boot:

[ 1.596789] swapper pgtable: 4k pages, 48-bit VAs, pgdp=00000000417c9000
[ 1.598406] [ffff80002e8cc001] pgd=000000013ffff003,
p4d=000000013ffff003, pud=000000013fffe003, pmd=0000000000000000
[ 1.600258] Internal error: Oops: 96000006 [#1] PREEMPT SMP
[ 1.600730] Modules linked in:
[ 1.601072] CPU: 0 PID: 1 Comm: swapper/0 Tainted: G W
5.11.0-rc1-00079-g8da796ff6f58-dirty #114
[ 1.601652] Hardware name: linux,dummy-virt (DT)
[ 1.601917] pstate: 80000005 (Nzcv daif -PAN -UAO -TCO BTYPE=--)
[ 1.602202] pc : build_sched_domains+0x3e4/0x1498
[ 1.604050] lr : build_sched_domains+0x3c0/0x1498
[ 1.604512] sp : ffff800011f1bce0
[ 1.604654] x29: ffff800011f1bce0 x28: ffff800011b7a410
[ 1.604924] x27: ffff800011b799c0 x26: ffff00000263b300
[ 1.605227] x25: 00000000ffffffff x24: ffff00000263b3a0
[ 1.605470] x23: 0000000000000000 x22: ffff800011b799c0
[ 1.605813] x21: 0000000000000000 x20: ffff00000261db00
[ 1.606140] x19: ffff0000025c7600 x18: 0000000000000010
[ 1.606472] x17: 000000001472bb62 x16: 000000006e92504d
[ 1.606885] x15: ffff000002600508 x14: 0000000000000116
[ 1.607408] x13: ffff000002600508 x12: 00000000ffffffea
[ 1.607681] x11: ffff800011c02fe8 x10: ffff800011beafa8
[ 1.607931] x9 : ffff800011beb000 x8 : 0000000000017fe8
[ 1.608225] x7 : 00000000000002a8 x6 : 00000000000000ff
[ 1.608480] x5 : 0000000000000000 x4 : 0000000000000000
[ 1.608690] x3 : 0000000000000000 x2 : ffff80002e8cc000
[ 1.609048] x1 : 0000000000000001 x0 : 0000000000000001
[ 1.609425] Call trace:
[ 1.609550] build_sched_domains+0x3e4/0x1498
[ 1.609777] sched_init_domains+0x88/0xb8
[ 1.609937] sched_init_smp+0x30/0x80
[ 1.610170] kernel_init_freeable+0xf4/0x238
[ 1.610388] kernel_init+0x14/0x118
[ 1.610559] ret_from_fork+0x10/0x30
[ 1.611234] Code: b4000201 93407eb7 aa0103e0 f8777ac2 (f8626800)
[ 1.613107] ---[ end trace 01540465b01c8e3b ]---
[ 1.614871] Kernel panic - not syncing: Attempted to kill init!
exitcode=0x0000000b
[ 1.615433] SMP: stopping secondary CPUs
[ 1.616415] ---[ end Kernel panic - not syncing: Attempted to kill
init! exitcode=0x0000000b ]---

with the below topology:
qemu-system-aarch64 -M virt -nographic \
-smp cpus=8 \
-numa node,cpus=0-1,nodeid=0 \
-numa node,cpus=2-3,nodeid=1 \
-numa node,cpus=4-5,nodeid=2 \
-numa node,cpus=6-7,nodeid=3 \
-numa dist,src=0,dst=1,val=12 \
-numa dist,src=0,dst=2,val=20 \
-numa dist,src=0,dst=3,val=22 \
-numa dist,src=1,dst=2,val=22 \
-numa dist,src=2,dst=3,val=12 \
-numa dist,src=1,dst=3,val=24 \


The panic address is *1294:

if (sdd->sd) {
1280: f9400e61 ldr x1, [x19, #24]
1284: b4000201 cbz x1, 12c4 <build_sched_domains+0x414>
sd = *per_cpu_ptr(sdd->sd, j);
1288: 93407eb7 sxtw x23, w21
128c: aa0103e0 mov x0, x1
1290: f8777ac2 ldr x2, [x22, x23, lsl #3]
*1294: f8626800 ldr x0, [x0, x2]
if (sd && (sd->flags & SD_OVERLAP))
1298: b4000120 cbz x0, 12bc <build_sched_domains+0x40c>
129c: b9403803 ldr w3, [x0, #56]
12a0: 365800e3 tbz w3, #11, 12bc
<build_sched_domains+0x40c>
free_sched_groups(sd->groups, 0);
12a4: f9400800 ldr x0, [x0, #16]
if (!sg)

Thanks
Barry

> ---
> include/linux/topology.h | 1 +
> kernel/sched/topology.c | 99 +++++++++++++++++++---------------------
> 2 files changed, 49 insertions(+), 51 deletions(-)
>
> diff --git a/include/linux/topology.h b/include/linux/topology.h
> index ad03df1cc266..7634cd737061 100644
> --- a/include/linux/topology.h
> +++ b/include/linux/topology.h
> @@ -48,6 +48,7 @@ int arch_update_cpu_topology(void);
> /* Conform to ACPI 2.0 SLIT distance definitions */
> #define LOCAL_DISTANCE 10
> #define REMOTE_DISTANCE 20
> +#define DISTANCE_BITS 8
> #ifndef node_distance
> #define node_distance(from,to) ((from) == (to) ? LOCAL_DISTANCE :
> REMOTE_DISTANCE)
> #endif
> diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
> index 5d3675c7a76b..bf5c9bd10bfe 100644
> --- a/kernel/sched/topology.c
> +++ b/kernel/sched/topology.c
> @@ -1596,66 +1596,58 @@ static void init_numa_topology_type(void)
> }
> }
>
> +
> +#define NR_DISTANCE_VALUES (1 << DISTANCE_BITS)
> +
> void sched_init_numa(void)
> {
> - int next_distance, curr_distance = node_distance(0, 0);
> struct sched_domain_topology_level *tl;
> - int level = 0;
> - int i, j, k;
> -
> - sched_domains_numa_distance = kzalloc(sizeof(int) * (nr_node_ids + 1),
> GFP_KERNEL);
> - if (!sched_domains_numa_distance)
> - return;
> -
> - /* Includes NUMA identity node at level 0. */
> - sched_domains_numa_distance[level++] = curr_distance;
> - sched_domains_numa_levels = level;
> + unsigned long *distance_map;
> + int nr_levels = 0;
> + int i, j;
>
> /*
> * O(nr_nodes^2) deduplicating selection sort -- in order to find the
> * unique distances in the node_distance() table.
> - *
> - * Assumes node_distance(0,j) includes all distances in
> - * node_distance(i,j) in order to avoid cubic time.
> */
> - next_distance = curr_distance;
> + distance_map = bitmap_alloc(NR_DISTANCE_VALUES, GFP_KERNEL);
> + if (!distance_map)
> + return;
> +
> + bitmap_zero(distance_map, NR_DISTANCE_VALUES);
> for (i = 0; i < nr_node_ids; i++) {
> for (j = 0; j < nr_node_ids; j++) {
> - for (k = 0; k < nr_node_ids; k++) {
> - int distance = node_distance(i, k);
> -
> - if (distance > curr_distance &&
> - (distance < next_distance ||
> - next_distance == curr_distance))
> - next_distance = distance;
> -
> - /*
> - * While not a strong assumption it would be nice to know
> - * about cases where if node A is connected to B, B is not
> - * equally connected to A.
> - */
> - if (sched_debug() && node_distance(k, i) != distance)
> - sched_numa_warn("Node-distance not symmetric");
> + int distance = node_distance(i, j);
>
> - if (sched_debug() && i && !find_numa_distance(distance))
> - sched_numa_warn("Node-0 not representative");
> + if (distance < LOCAL_DISTANCE || distance >= NR_DISTANCE_VALUES) {
> + sched_numa_warn("Invalid distance value range");
> + return;
> }
> - if (next_distance != curr_distance) {
> - sched_domains_numa_distance[level++] = next_distance;
> - sched_domains_numa_levels = level;
> - curr_distance = next_distance;
> - } else break;
> +
> + bitmap_set(distance_map, distance, 1);
> }
> + }
> + /*
> + * We can now figure out how many unique distance values there are and
> + * allocate memory accordingly.
> + */
> + nr_levels = bitmap_weight(distance_map, NR_DISTANCE_VALUES);
>
> - /*
> - * In case of sched_debug() we verify the above assumption.
> - */
> - if (!sched_debug())
> - break;
> + sched_domains_numa_distance = kcalloc(nr_levels, sizeof(int),
> GFP_KERNEL);
> + if (!sched_domains_numa_distance) {
> + bitmap_free(distance_map);
> + return;
> + }
> +
> + for (i = 0, j = 0; i < nr_levels; i++, j++) {
> + j = find_next_bit(distance_map, NR_DISTANCE_VALUES, j);
> + sched_domains_numa_distance[i] = j;
> }
>
> + bitmap_free(distance_map);
> +
> /*
> - * 'level' contains the number of unique distances
> + * 'nr_levels' contains the number of unique distances
> *
> * The sched_domains_numa_distance[] array includes the actual distance
> * numbers.
> @@ -1664,15 +1656,15 @@ void sched_init_numa(void)
> /*
> * Here, we should temporarily reset sched_domains_numa_levels to 0.
> * If it fails to allocate memory for array sched_domains_numa_masks[][],
> - * the array will contain less then 'level' members. This could be
> + * the array will contain less then 'nr_levels' members. This could be
> * dangerous when we use it to iterate array sched_domains_numa_masks[][]
> * in other functions.
> *
> - * We reset it to 'level' at the end of this function.
> + * We reset it to 'nr_levels' at the end of this function.
> */
> sched_domains_numa_levels = 0;
>
> - sched_domains_numa_masks = kzalloc(sizeof(void *) * level, GFP_KERNEL);
> + sched_domains_numa_masks = kzalloc(sizeof(void *) * nr_levels,
> GFP_KERNEL);
> if (!sched_domains_numa_masks)
> return;
>
> @@ -1680,7 +1672,7 @@ void sched_init_numa(void)
> * Now for each level, construct a mask per node which contains all
> * CPUs of nodes that are that many hops away from us.
> */
> - for (i = 0; i < level; i++) {
> + for (i = 0; i < nr_levels; i++) {
> sched_domains_numa_masks[i] =
> kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL);
> if (!sched_domains_numa_masks[i])
> @@ -1688,12 +1680,17 @@ void sched_init_numa(void)
>
> for (j = 0; j < nr_node_ids; j++) {
> struct cpumask *mask = kzalloc(cpumask_size(), GFP_KERNEL);
> + int k;
> +
> if (!mask)
> return;
>
> sched_domains_numa_masks[i][j] = mask;
>
> for_each_node(k) {
> + if (sched_debug() && (node_distance(j, k) != node_distance(k,
> j)))
> + sched_numa_warn("Node-distance not symmetric");
> +
> if (node_distance(j, k) > sched_domains_numa_distance[i])
> continue;
>
> @@ -1705,7 +1702,7 @@ void sched_init_numa(void)
> /* Compute default topology size */
> for (i = 0; sched_domain_topology[i].mask; i++);
>
> - tl = kzalloc((i + level + 1) *
> + tl = kzalloc((i + nr_levels) *
> sizeof(struct sched_domain_topology_level), GFP_KERNEL);
> if (!tl)
> return;
> @@ -1728,7 +1725,7 @@ void sched_init_numa(void)
> /*
> * .. and append 'j' levels of NUMA goodness.
> */
> - for (j = 1; j < level; i++, j++) {
> + for (j = 1; j < nr_levels; i++, j++) {
> tl[i] = (struct sched_domain_topology_level){
> .mask = sd_numa_mask,
> .sd_flags = cpu_numa_flags,
> @@ -1740,8 +1737,8 @@ void sched_init_numa(void)
>
> sched_domain_topology = tl;
>
> - sched_domains_numa_levels = level;
> - sched_max_numa_distance = sched_domains_numa_distance[level - 1];
> + sched_domains_numa_levels = nr_levels;
> + sched_max_numa_distance = sched_domains_numa_distance[nr_levels - 1];
>
> init_numa_topology_type();
> }
> --
> 2.27.0