Re: 32bit NUMA and fakeNUMA broken for AMD CPUs

From: Tejun Heo
Date: Wed Jun 29 2011 - 12:15:26 EST


Hans, can you please apply the following patch and post the boot log
from both SPARSEMEM and DISCONTIGMEM kernels? On SPARSEMEM, it should
reject NUMA config and boot w/ flatmem.

Thanks.

diff --git a/arch/x86/include/asm/mmzone_32.h b/arch/x86/include/asm/mmzone_32.h
index 224e8c5..0b6c75b 100644
--- a/arch/x86/include/asm/mmzone_32.h
+++ b/arch/x86/include/asm/mmzone_32.h
@@ -34,15 +34,15 @@ static inline void resume_map_numa_kva(pgd_t *pgd) {}
* 64Gb / 4096bytes/page = 16777216 pages
*/
#define MAX_NR_PAGES 16777216
-#define MAX_ELEMENTS 1024
-#define PAGES_PER_ELEMENT (MAX_NR_PAGES/MAX_ELEMENTS)
+#define MAX_SECTIONS 1024
+#define PAGES_PER_SECTION (MAX_NR_PAGES/MAX_SECTIONS)

extern s8 physnode_map[];

static inline int pfn_to_nid(unsigned long pfn)
{
#ifdef CONFIG_NUMA
- return((int) physnode_map[(pfn) / PAGES_PER_ELEMENT]);
+ return((int) physnode_map[(pfn) / PAGES_PER_SECTIONS]);
#else
return 0;
#endif
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index f5510d8..9d643e2 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -496,6 +496,7 @@ static bool __init numa_meminfo_cover_memory(const struct numa_meminfo *mi)

static int __init numa_register_memblks(struct numa_meminfo *mi)
{
+ unsigned long pfn_align;
int i, nid;

/* Account for nodes with cpus and no memory */
@@ -511,6 +512,15 @@ static int __init numa_register_memblks(struct numa_meminfo *mi)

/* for out of order entries */
sort_node_map();
+
+ pfn_align = node_map_pfn_alignment();
+ if (pfn_align && pfn_align < PAGES_PER_SECTION) {
+ printk(KERN_WARNING "Node alignment %LuMB < min %LuMB, rejecting NUMA config\n",
+ (u64)pfn_align << PAGE_SHIFT >> 20,
+ (u64)PAGES_PER_SECTION << PAGE_SHIFT >> 20);
+ return -EINVAL;
+ }
+
if (!numa_meminfo_cover_memory(mi))
return -EINVAL;

diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c
index 849a975..3adebe7 100644
--- a/arch/x86/mm/numa_32.c
+++ b/arch/x86/mm/numa_32.c
@@ -41,7 +41,7 @@
* physnode_map[16-31] = 1;
* physnode_map[32- ] = -1;
*/
-s8 physnode_map[MAX_ELEMENTS] __read_mostly = { [0 ... (MAX_ELEMENTS - 1)] = -1};
+s8 physnode_map[MAX_SECTIONS] __read_mostly = { [0 ... (MAX_SECTIONS - 1)] = -1};
EXPORT_SYMBOL(physnode_map);

void memory_present(int nid, unsigned long start, unsigned long end)
@@ -52,8 +52,8 @@ void memory_present(int nid, unsigned long start, unsigned long end)
nid, start, end);
printk(KERN_DEBUG " Setting physnode_map array to node %d for pfns:\n", nid);
printk(KERN_DEBUG " ");
- for (pfn = start; pfn < end; pfn += PAGES_PER_ELEMENT) {
- physnode_map[pfn / PAGES_PER_ELEMENT] = nid;
+ for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) {
+ physnode_map[pfn / PAGES_PER_SECTION] = nid;
printk(KERN_CONT "%lx ", pfn);
}
printk(KERN_CONT "\n");
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 9670f71..c70a326 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1313,6 +1313,7 @@ extern void remove_active_range(unsigned int nid, unsigned long start_pfn,
unsigned long end_pfn);
extern void remove_all_active_ranges(void);
void sort_node_map(void);
+unsigned long node_map_pfn_alignment(void);
unsigned long __absent_pages_in_range(int nid, unsigned long start_pfn,
unsigned long end_pfn);
extern unsigned long absent_pages_in_range(unsigned long start_pfn,
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 4e8985a..2ae7dbc 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4585,6 +4585,34 @@ void __init sort_node_map(void)
cmp_node_active_region, NULL);
}

+unsigned long __init node_map_pfn_alignment(void)
+{
+ unsigned long accl_mask = 0, last_end = 0;
+ int last_nid = -1;
+ int i;
+
+ for_each_active_range_index_in_nid(i, MAX_NUMNODES) {
+ int nid = early_node_map[i].nid;
+ unsigned long start = early_node_map[i].start_pfn;
+ unsigned long end = early_node_map[i].end_pfn;
+ unsigned long mask;
+
+ if (!start || last_nid < 0 || last_nid == nid) {
+ last_nid = nid;
+ last_end = end;
+ continue;
+ }
+
+ mask = ~((1 << __ffs(start)) - 1);
+ while (mask && last_end <= (start & (mask << 1)))
+ mask <<= 1;
+
+ accl_mask |= mask;
+ }
+
+ return ~accl_mask + 1;
+}
+
/* Find the lowest pfn for a node */
static unsigned long __init find_min_pfn_for_node(int nid)
{
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/