[<prev] [next>] [day] [month] [year] [list]
Message-ID: <511AB3C5.2080703@seas.wustl.edu>
Date: Tue, 12 Feb 2013 15:27:33 -0600
From: Professor Berkley Shands <berkley@...s.wustl.edu>
To: linux-kernel <linux-kernel@...r.kernel.org>
Subject: NUMA allocations fail to be numa allocated
Allocations made using libnuma calls on RedHat 6.3 x86_64, with the default
kernel and with kernels up to 3.4.29, are not placed on the specified NUMA
nodes, even when forced with numactl.
It appears that setting the NUMA policy and/or the NUMA node mask does little
for large allocations.
With HUGETLBFS, you get memory on almost any node BUT the one you
asked for.
It appears that it allocates on the last node that did a free().
Here is a small program to demo the lack of numa awareness from user space.
#include <stdio.h>
#include <stdlib.h>
#include <string.h> // for memset() call
#include <sched.h> // for sched_getcpu() call
#include <sys/shm.h>
#include <numa.h>
#include <numaif.h>
// Page-size constants. Each "...1" constant is the matching size minus one,
// i.e. the mask of the offset bits within a page of that size.
static const unsigned long HUGE_PAGE_SIZE = 1UL << 21; // a 2MB huge page
static const unsigned long HUGE_PAGE_SIZE1 = (1UL << 21) - 1; // less one
// A 4KB page is 1 << 12 bytes (1 << 10 would only be 1KB); the rest of the
// file steps through memory in 4096-byte pages.
static const unsigned long PAGE_SIZE = 1UL << 12; // a 4KB page
static const unsigned long PAGE_SIZE1 = (1UL << 12) - 1; // less one
// Forward declarations: Allocate() calls both helpers before they are defined.
// Arguments are the start address, the target node and the count of 4KB pages.
int VerifyNumaNode(void *ptr, int node, int Count);
int MoveAddrToNodeMulti(void *ptr, int node, int Count);
void *Allocate(size_t length, int OnNode)
{
int shmid = -1;
void *shmaddr = NULL;
size_t new_length = length;
int MaxNumaNode = numa_max_node(); // find highest NUMA number
int LocalNumaNode = numa_node_of_cpu(sched_getcpu());
int NewNumaNode = LocalNumaNode;
unsigned long MaskBits[2] = { 0UL, 0UL }; // up to 128 nodes
struct bitmask NewMask;
NewMask.size = 8; // Max nodes on an HP
struct bitmask *CurrentMask = numa_get_membind();
// see if NUMA allocation is desired
if (OnNode >= 0)
{
if (OnNode > MaxNumaNode)
{
fprintf(stderr, "Invalid NUMA HUGEPages allocation node %d max
is %d\n", OnNode, MaxNumaNode);
}
else
{
NewNumaNode = OnNode;
}
}
MaskBits[0] = 1UL << NewNumaNode;
numa_set_membind(&NewMask); // restrict to this node
if (new_length < HUGE_PAGE_SIZE) /* 2MB min alloc for huge pages */
{
new_length = HUGE_PAGE_SIZE;
}
if (new_length & HUGE_PAGE_SIZE1) /* 2MB min alloc for huge pages */
{
new_length = ((new_length >> 21) + 1) << 21;
}
if ((shmid = shmget(IPC_PRIVATE, new_length, /* length */
SHM_HUGETLB | IPC_CREAT | SHM_R | SHM_W)) == -1)
{
fprintf(stderr, "shmget() failed for %ldMB\n", (long) (new_length
>> 20));
numa_set_membind(CurrentMask); // unrestrict to this node
return NULL;
}
shmaddr = shmat(shmid, NULL, 0);
if (shmaddr == (void *) -1)
{
shmctl(shmid, IPC_RMID, NULL);
numa_set_membind(CurrentMask); // unrestrict to this node
return NULL;
}
else if ((unsigned long) (shmaddr) & (PAGE_SIZE - 1))
{
fprintf(stderr, "huge page allocation was not page aligned\n");
}
memset(shmaddr, 0x00, new_length);
if (VerifyNumaNode(shmaddr, NewNumaNode, new_length / 4096UL) > 0)
{
MoveAddrToNodeMulti(shmaddr, NewNumaNode, new_length / 4096UL);
}
numa_set_membind(CurrentMask); // unrestrict to this node
VerifyNumaNode(shmaddr, NewNumaNode, new_length / 4096UL);
/* now delete the ID so it will free itself on exit */
shmctl(shmid, IPC_RMID, NULL);
return shmaddr;
}
// Release memory obtained from Allocate() by detaching the shared-memory
// segment from this process.
//
// addr  address returned by Allocate(); NULL is accepted and ignored.
//
// A failing shmdt() is reported on stderr; there is nothing else to undo.
void Free(void *addr)
{
    if (addr == NULL)
    {
        return;
    }
    if (shmdt(addr) == -1)
    {
        perror("shmdt");
    }
}
// Ask the kernel which NUMA node currently holds the page containing Address.
//
// Address  any address inside the page of interest.
//
// Returns the value move_pages() stored in the status slot for that page.
// The slot starts at -1, so -1 is returned if the call fails before writing
// it. Per the move_pages(2) documentation a negative status is a negated
// errno for that page.
int NumaNodeFromAddress(void *Address)
{
    int status[1] = { -1 };
    // move_pages() takes an array of page addresses, so the array element
    // must be the (page-aligned) address itself, not a pointer to a local
    // variable that holds it.
    void *pages[1] = { (void *) ((unsigned long) Address & ~4095UL) };

    long retval = move_pages(0,      // this thread
                             1,      // just one pointer
                             pages,  // the given address
                             NULL,   // array of nodes: no moving, just asking
                             status, // array of node results
                             MPOL_MF_MOVE);
    if (retval)
    {
        fprintf(stderr, "Invalid Address %p - No NUMA node\n", Address);
    }
    return status[0];
}
// Ask the kernel to migrate Count consecutive 4KB pages to a NUMA node.
//
// ptr    address inside the first page; it is rounded down to a 4KB boundary.
// node   destination NUMA node for every page.
// Count  number of 4KB pages; zero or negative does nothing and returns 0.
//
// Returns the non-zero result of move_pages() if the call itself fails,
// otherwise the number of pages whose reported status differs from node
// (0 means every page is on node). Each mismatch is reported on stderr.
int MoveAddrToNodeMulti(void *ptr, int node, int Count)
{
    if (Count <= 0)
    {
        return 0;
    }

    const unsigned long base = ((unsigned long) ptr) & ~4095UL;
    void **pages = new void *[Count];
    int *status = new int[Count];
    int *nodes = new int[Count];

    for (int i = 0; i < Count; i++)
    {
        // The array must hold the page addresses themselves, not pointers
        // into a scratch array. The offset is computed in unsigned long so
        // ranges beyond 2GB do not overflow int.
        pages[i] = (void *) (base + (unsigned long) i * 4096UL);
        status[i] = -1;
        nodes[i] = node;
    }

    int retval = (int) move_pages(0,                     // this thread
                                  (unsigned long) Count, // lots of pointers
                                  pages,                 // the page addresses
                                  nodes,                 // move to new node please
                                  status,                // array of node results
                                  MPOL_MF_MOVE);
    if (retval)
    {
        fprintf(stderr, "MoveAddrToNodeMulti to failed\n");
    }
    else
    {
        for (int i = 0; i < Count; i++)
        {
            if (status[i] != node)
            {
                fprintf(stderr, "Addr 0x%08lx is node %d not %d\n",
                        (unsigned long) pages[i], status[i], node);
                retval++;
            }
        }
    }

    // release temp stuff
    delete [] nodes;
    delete [] status;
    delete [] pages;
    return retval;
}
// Check that Count consecutive 4KB pages currently live on a NUMA node.
// Nothing is moved: move_pages() is called with a NULL node array, which
// only fills in the per-page status.
//
// ptr    address inside the first page; it is rounded down to a 4KB boundary.
// node   NUMA node every page is expected to be on.
// Count  number of 4KB pages; zero or negative does nothing and returns 0.
//
// Returns the non-zero result of move_pages() if the call itself fails,
// otherwise the number of pages whose reported status differs from node
// (0 means every page is on node). Each mismatch is reported on stderr.
int VerifyNumaNode(void *ptr, int node, int Count)
{
    if (Count <= 0)
    {
        return 0;
    }

    const unsigned long base = ((unsigned long) ptr) & ~4095UL;
    void **pages = new void *[Count];
    int *status = new int[Count];

    for (int i = 0; i < Count; i++)
    {
        // The array must hold the page addresses themselves, not pointers
        // into a scratch array. The offset is computed in unsigned long so
        // ranges beyond 2GB do not overflow int.
        pages[i] = (void *) (base + (unsigned long) i * 4096UL);
        status[i] = -1;
    }

    int retval = (int) move_pages(0,                     // this thread
                                  (unsigned long) Count, // lots of pointers
                                  pages,                 // the page addresses
                                  NULL,                  // no new node
                                  status,                // array of node results
                                  MPOL_MF_MOVE);
    if (retval)
    {
        fprintf(stderr, "VerifyNumaNode move_pages failed\n");
    }
    else
    {
        for (int i = 0; i < Count; i++)
        {
            if (status[i] != node)
            {
                fprintf(stderr, "Verify Addr 0x%08lx is node %d not %d\n",
                        (unsigned long) pages[i], status[i], node);
                retval++;
            }
        }
    }

    // release temp stuff
    delete [] status;
    delete [] pages;
    return retval;
}
// small demo program showing:
//
// a: huge page allocations via hugetlb are not node allocated
// b: huge pages cannot be move_page()'ed
// c: Replacing the shm*() with numa_alloc_node() has the exact same problem
// d: 4KB pages or 2MB pages act the same.
int main(int argc, char **argv)
{
int Node = -1;
unsigned long Size = 32UL * 1024UL *1024UL; // default to 32MB
if (argc >= 2)
{
Node = atoi(argv[1]);
}
if (argc >= 3)
{
Size = atol(argv[2]) * 1024UL * 1024UL;
}
unsigned long *Array = (unsigned long *) Allocate(Size, Node);
exit(-1);
}
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
Powered by blists - more mailing lists