NUMA allocations fail to be numa allocated

From: Professor Berkley Shands
Date: Tue Feb 12 2013 - 16:32:43 EST

Next message: H. Peter Anvin: "Re: [RFC][PATCH v2] tracing/syscalls: Allow archs to ignore tracingcompat syscalls"
Previous message: Andrew Morton: "Re: [PATCH 1/3] nbd: support FLUSH requests"
Messages sorted by: [ date ] [ thread ] [ subject ] [ author ]

Allocations made through libnuma calls on RedHat 6.3 x86_64, with the default kernel
and with kernels up to 3.4.29, do not land on the specified NUMA nodes, even when forced with numactl.

It appears that setting the NUMA policy and/or the NUMA node mask does little for large allocations.
With HUGETLBFS, you get memory on almost any node BUT the one you asked for.
It appears that the memory is allocated on the last node that did a free().

Here is a small program to demo the lack of numa awareness from user space.

#include <sched.h> // for sched_getcpu() call
#include <stdio.h>
#include <stdlib.h>
#include <string.h> // for memset() call
#include <sys/shm.h>

#include <numa.h>
#include <numaif.h>

// Page-size constants shared by the allocation and verification helpers.
static const unsigned long HUGE_PAGE_SIZE = 1UL << 21; // a 2MB huge page
static const unsigned long HUGE_PAGE_SIZE1 = (1UL << 21) - 1; // less one: offset mask inside a huge page
// A 4KB page is 1UL << 12; the previous value 1UL << 10 was only 1KB, which
// contradicted both the comment and the 4096-byte stride used by the helpers.
static const unsigned long PAGE_SIZE = 1UL << 12; // a 4KB page
static const unsigned long PAGE_SIZE1 = (1UL << 12) - 1; // less one: offset mask inside a 4KB page

// Checks that Count consecutive 4KB pages starting at ptr live on `node`.
// Returns 0 when every page matches, nonzero otherwise.
int VerifyNumaNode(void *ptr, // address
                   int node, // target node
                   int Count); // count of 4KB pages

// Asks the kernel to migrate Count consecutive 4KB pages starting at ptr to
// `node`. Returns 0 when every page ended up there, nonzero otherwise.
int MoveAddrToNodeMulti(void *ptr, int node, int Count);

void *Allocate(size_t length, int OnNode)
{
int shmid = -1;
void *shmaddr = NULL;
size_t new_length = length;
int MaxNumaNode = numa_max_node(); // find highest NUMA number
int LocalNumaNode = numa_node_of_cpu(sched_getcpu());
int NewNumaNode = LocalNumaNode;
unsigned long MaskBits[2] = { 0UL, 0UL }; // up to 128 nodes
struct bitmask NewMask;
NewMask.size = 8; // Max nodes on an HP
struct bitmask *CurrentMask = numa_get_membind();

// see if NUMA allocation is desired
if (OnNode >= 0)
{
if (OnNode > MaxNumaNode)
{
fprintf(stderr, "Invalid NUMA HUGEPages allocation node %d max is %d\n", OnNode, MaxNumaNode);
}
else
{
NewNumaNode = OnNode;
}
}
MaskBits[0] = 1UL << NewNumaNode;
numa_set_membind(&NewMask); // restrict to this node

if (new_length < HUGE_PAGE_SIZE) /* 2MB min alloc for huge pages */
{
new_length = HUGE_PAGE_SIZE;
}

if (new_length & HUGE_PAGE_SIZE1) /* 2MB min alloc for huge pages */
{
new_length = ((new_length >> 21) + 1) << 21;
}
if ((shmid = shmget(IPC_PRIVATE, new_length, /* length */
SHM_HUGETLB | IPC_CREAT | SHM_R | SHM_W)) == -1)
{
fprintf(stderr, "shmget() failed for %ldMB\n", (long) (new_length >> 20));
numa_set_membind(CurrentMask); // unrestrict to this node
return NULL;
}

shmaddr = shmat(shmid, NULL, 0);
if (shmaddr == (void *) -1)
{
shmctl(shmid, IPC_RMID, NULL);
numa_set_membind(CurrentMask); // unrestrict to this node
return NULL;
}
else if ((unsigned long) (shmaddr) & (PAGE_SIZE - 1))
{
fprintf(stderr, "huge page allocation was not page aligned\n");
}

memset(shmaddr, 0x00, new_length);
if (VerifyNumaNode(shmaddr, NewNumaNode, new_length / 4096UL) > 0)
{
MoveAddrToNodeMulti(shmaddr, NewNumaNode, new_length / 4096UL);
}
numa_set_membind(CurrentMask); // unrestrict to this node
VerifyNumaNode(shmaddr, NewNumaNode, new_length / 4096UL);
/* now delete the ID so it will free itself on exit */
shmctl(shmid, IPC_RMID, NULL);
return shmaddr;
}

/*
 * Free - detach a segment previously returned by Allocate().
 *
 * Allocate() marks its segment IPC_RMID before returning, so detaching the
 * mapping is what lets the kernel destroy the segment; the former empty body
 * kept every allocation mapped until process exit.
 *
 * addr  address returned by Allocate(); NULL is accepted and ignored.
 */
void Free(void *addr)
{
    if (addr == NULL)
    {
        return;
    }
    if (shmdt(addr) == -1)
    {
        fprintf(stderr, "shmdt() failed for %p\n", addr);
    }
}

/*
 * NumaNodeFromAddress - report which NUMA node currently backs the page
 * containing Address.
 *
 * Calls move_pages() with a NULL node array, which performs no migration and
 * only fills in the status array.
 *
 * Returns the node number on success. A negative value means the page could
 * not be resolved: either the -errno style status move_pages() stored for the
 * page, or -1 when the call itself failed (a message is printed on stderr).
 */
int NumaNodeFromAddress(void *Address)
{
    int status[1] = { -1 };

    // move_pages() takes an array of the page addresses themselves. The
    // previous code stored the address of a local pointer variable, so it
    // reported the node of the caller's stack instead of Address.
    void *Pages[1] = { (void *) (((unsigned long) Address) & ~4095UL) };

    long retval = move_pages(0, // this thread
                             1, // just one pointer
                             Pages, // The given address
                             NULL, // array of nodes, no moving, just asking
                             status, // array of node results
                             MPOL_MF_MOVE);
    if (retval)
    {
        fprintf(stderr, "Invalid Address %p - No NUMA node\n", Address);
    }
    return status[0];
}

/*
 * MoveAddrToNodeMulti - ask the kernel to migrate Count consecutive 4KB
 * pages, starting at the page containing ptr, to NUMA node `node`.
 *
 * ptr    any address inside the first page to move.
 * node   destination NUMA node.
 * Count  number of 4KB pages; values <= 0 move nothing and return 0.
 *
 * Returns 0 when every page reports `node` afterwards, the number of pages
 * that do not (each is listed on stderr), or -1 when move_pages() itself
 * fails.
 */
int MoveAddrToNodeMulti(void *ptr, int node, int Count)
{
    if (Count <= 0)
    {
        return 0;
    }

    void **Pages = new void *[Count];
    int *Nodes = new int[Count];
    int *status = new int[Count];
    int Result = 0;

    // move_pages() wants the target page addresses themselves. The previous
    // code passed pointers into a temporary array that merely held those
    // addresses, so the kernel was asked to migrate the scratch array.
    const unsigned long Base = ((unsigned long) ptr) & ~4095UL;
    for (int i = 0; i < Count; i++)
    {
        status[i] = -1;
        Nodes[i] = node;
        // Widen before multiplying: i * 4096 overflows int from 2GB upward.
        Pages[i] = (void *) (Base + (unsigned long) i * 4096UL);
    }

    long rc = move_pages(0, // this thread
                         (unsigned long) Count, // lots of pointers
                         Pages, // The given addresses
                         Nodes, // move to new node please
                         status, // array of node results
                         MPOL_MF_MOVE);
    if (rc < 0)
    {
        fprintf(stderr, "MoveAddrToNodeMulti to failed\n");
        Result = -1;
    }
    else
    {
        // A positive return means some pages were not migrated; the status
        // array is still filled in, so report exactly which ones.
        for (int i = 0; i < Count; i++)
        {
            if (status[i] != node)
            {
                fprintf(stderr, "Addr 0x%08lx is node %d not %d\n", (unsigned long) Pages[i], status[i], node);
                Result++;
            }
        }
    }

    delete [] status;
    delete [] Nodes;
    delete [] Pages;
    return Result;
}

/*
 * VerifyNumaNode - check that Count consecutive 4KB pages, starting at the
 * page containing ptr, currently reside on NUMA node `node`.
 *
 * Uses move_pages() with a NULL node array, which migrates nothing and only
 * reports where each page is.
 *
 * ptr    any address inside the first page to check.
 * node   expected NUMA node.
 * Count  number of 4KB pages; values <= 0 check nothing and return 0.
 *
 * Returns 0 when every page is on `node`, the number of pages that are not
 * (each is listed on stderr), or the nonzero move_pages() result when the
 * query itself fails.
 */
int VerifyNumaNode(void *ptr, int node, int Count)
{
    if (Count <= 0)
    {
        return 0;
    }

    void **Pages = new void *[Count];
    int *status = new int[Count];
    int Result = 0;

    // Query the target pages themselves. The previous code passed pointers
    // into a temporary array holding the addresses, so it verified the
    // placement of that scratch array instead of the caller's memory.
    const unsigned long Base = ((unsigned long) ptr) & ~4095UL;
    for (int i = 0; i < Count; i++)
    {
        status[i] = -1;
        // Widen before multiplying: i * 4096 overflows int from 2GB upward.
        Pages[i] = (void *) (Base + (unsigned long) i * 4096UL);
    }

    long rc = move_pages(0, // this thread
                         (unsigned long) Count, // lots of pointers
                         Pages, // The given addresses
                         NULL, // no new node
                         status, // array of node results
                         MPOL_MF_MOVE);
    if (rc)
    {
        fprintf(stderr, "VerifyNumaNode move_pages failed\n");
        Result = (int) rc;
    }
    else
    {
        for (int i = 0; i < Count; i++)
        {
            if (status[i] != node)
            {
                fprintf(stderr, "Verify Addr 0x%08lx is node %d not %d\n", (unsigned long) Pages[i], status[i], node);
                Result++;
            }
        }
    }

    // release temp stuff
    delete [] status;
    delete [] Pages;
    return Result;
}

// small demo program showing:
//
// a: huge page allocations via hugetlb are not node allocated
// b: huge pages cannot be move_page()'ed
// c: Replacing the shm*() with numa_alloc_node() has the exact same problem
// d: 4KB pages or 2MB pages act the same.

/*
 * Usage: prog [node [size_in_MB]]
 *
 * node        NUMA node to allocate on; omitted or negative means the node of
 *             the CPU the program is running on.
 * size_in_MB  allocation size in megabytes (default 32).
 *
 * Exits with EXIT_SUCCESS when Allocate() returns a segment and EXIT_FAILURE
 * otherwise or on malformed arguments. Previously the program always exited
 * with -1, so the exit status carried no information.
 */
int main(int argc, char **argv)
{
    int Node = -1;
    unsigned long Size = 32UL * 1024UL * 1024UL; // default to 32MB

    if (argc >= 2)
    {
        char *End = NULL;
        long Value = strtol(argv[1], &End, 10);
        if (End == argv[1] || *End != '\0' || Value > 0x7fffffffL)
        {
            fprintf(stderr, "Invalid NUMA node '%s'\n", argv[1]);
            return EXIT_FAILURE;
        }
        // Any negative value keeps the "use the local node" meaning.
        Node = (Value < 0) ? -1 : (int) Value;
    }
    if (argc >= 3)
    {
        char *End = NULL;
        long Megabytes = strtol(argv[2], &End, 10);
        // Reject garbage, non-positive sizes, and sizes whose byte count
        // would not fit in an unsigned long.
        if (End == argv[2] || *End != '\0' || Megabytes <= 0 ||
            (unsigned long) Megabytes > (~0UL >> 20))
        {
            fprintf(stderr, "Invalid size in MB '%s'\n", argv[2]);
            return EXIT_FAILURE;
        }
        Size = (unsigned long) Megabytes * 1024UL * 1024UL;
    }

    void *Array = Allocate(Size, Node);
    return (Array != NULL) ? EXIT_SUCCESS : EXIT_FAILURE;
}

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

Next message: H. Peter Anvin: "Re: [RFC][PATCH v2] tracing/syscalls: Allow archs to ignore tracingcompat syscalls"
Previous message: Andrew Morton: "Re: [PATCH 1/3] nbd: support FLUSH requests"
Messages sorted by: [ date ] [ thread ] [ subject ] [ author ]