lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [thread-next>] [day] [month] [year] [list]
Message-ID: <20240720173543.897972-1-jglisse@google.com>
Date: Sat, 20 Jul 2024 10:35:43 -0700
From: Jerome Glisse <jglisse@...gle.com>
To: Andrew Morton <akpm@...ux-foundation.org>, linux-mm@...ck.org
Cc: Jerome Glisse <jglisse@...gle.com>, linux-kernel@...r.kernel.org, 
	stable@...r.kernel.org
Subject: [PATCH] mm: fix maxnode for mbind(), set_mempolicy() and migrate_pages()

Because maxnode bug there is no way to bind or migrate_pages to the
last node in multi-node NUMA system unless you lie about maxnodes
when making the mbind, set_mempolicy or migrate_pages syscall.

Manpage for those syscall describe maxnodes as the number of bits in
the node bitmap ("bit mask of nodes containing up to maxnode bits").
Thus if maxnode is n then we expect to have a n bit(s) bitmap which
means that the mask of valid bits is ((1 << n) - 1). The get_nodes()
decrement lead to the mask being ((1 << (n - 1)) - 1).

The three syscalls use a common helper get_nodes() and first things
this helper do is decrement maxnode by 1 which leads to using n-1 bits
in the provided mask of nodes (see get_bitmap() an helper function to
get_nodes()).

The lead to two bugs, either the last node in the bitmap provided will
not be use in either of the three syscalls, or the syscalls will error
out and return EINVAL if the only bit set in the bitmap was the last
bit in the mask of nodes (which is ignored because of the bug and an
empty mask of nodes is an invalid argument).

I am surprised this bug was never caught ... it has been in the kernel
since forever.

People can use the following function to detect if the kernel has the
bug:

bool kernel_has_maxnodes_bug(void)
{
    unsigned long nodemask = 1;
    bool has_bug;
    long res;

    res = set_mempolicy(MPOL_BIND, &nodemask, 1);
    has_bug = res && (errno == EINVAL);
    set_mempolicy(MPOL_DEFAULT, NULL, 0);
    return has_bug;
}

You can tested with any of the three program below:

gcc mbind.c -o mbind -lnuma
gcc set_mempolicy.c -o set_mempolicy -lnuma
gcc migrate_pages.c -o migrate_pages -lnuma

First argument is maxnode, second argument is the bit index to set in
the mask of node (0 set the first bit, 1 the second bit, ...).

./mbind 2 1 & sleep 2 && numastat -n -p `pidof mbind` && fg
./set_mempolicy 2 1 & sleep 2 && numastat -n -p `pidof set_mempolicy` && fg
./migrate_pages 2 1 & sleep 2 && numastat -n -p `pidof migrate_pages` && fg

mbind.c %< ----------------------------------------------------------

void *anon_mem(size_t size)
{
    void *ret;

    ret = mmap(NULL, size, PROT_READ|
               PROT_WRITE, MAP_PRIVATE|
               MAP_ANON, -1, 0);
    return ret == MAP_FAILED ? NULL : ret;
}

unsigned long mround(unsigned long v, unsigned long m)
{
    if (m == 0) {
        return v;
    }

    return v + m - (v % m);
}

void bitmap_set(void *_bitmap, unsigned long b)
{
    uint8_t *bitmap = _bitmap;

    bitmap[b >> 3] |= (1 << (b & 7));
}

int main(int argc, char *argv[])
{
    unsigned long *nodemask, maxnode, node, i;
    size_t bytes;
    int8_t *mem;
    long res;

    if (argv[1] == NULL || argv[2] == NULL) {
        printf("missing argument: %s maxnodes node\n", argv[0]);
        return -1;
    }
    maxnode = atoi(argv[1]);
    node = atoi(argv[2]);

    bytes = mround(mround(maxnode, 8) >> 3,
                   sizeof(unsigned long));
    nodemask = calloc(bytes, 1);
    mem = anon_mem(NPAGES << 12);
    if (!mem || !nodemask) {
        return -1;
    }

    // Try to bind memory to node
    bitmap_set(nodemask, node);
    res = mbind(mem, NPAGES << 12, MPOL_BIND,
                nodemask, maxnode, 0);
    if (res) {
        printf("mbind(mem, NPAGES << 12, MPOL_BIND, "
               "nodemask, %d, 0) failed with %d\n",
               maxnode, errno);
        return -1;
    }

    // Write something to breakup from the zero page
    for (unsigned i = 0; i < NPAGES; i++) {
        mem[i << 12] = i + 1;
    }

    // Allow numastats to gather statistics
    getchar();

    return 0;
}

set_mempolicy %< ----------------------------------------------------

void *anon_mem(size_t size)
{
    void *ret;

    ret = mmap(NULL, size, PROT_READ|
               PROT_WRITE, MAP_PRIVATE|
               MAP_ANON, -1, 0);
    return ret == MAP_FAILED ? NULL : ret;
}

unsigned long mround(unsigned long v, unsigned long m)
{
    if (m == 0) {
        return v;
    }

    return v + m - (v % m);
}

void bitmap_set(void *_bitmap, unsigned long b)
{
    uint8_t *bitmap = _bitmap;

    bitmap[b >> 3] |= (1 << (b & 7));
}

int main(int argc, char *argv[])
{
    unsigned long *nodemask, maxnode, node, i;
    size_t bytes;
    int8_t *mem;
    long res;

    if (argv[1] == NULL || argv[2] == NULL) {
        printf("missing argument: %s maxnodes node\n", argv[0]);
        return -1;
    }
    maxnode = atoi(argv[1]);
    node = atoi(argv[2]);

    // bind memory to node 0 ...
    i = 1;
    res = set_mempolicy(MPOL_BIND, i, 2);
    if (res) {
        printf("set_mempolicy(MPOL_BIND, []=1, %d) "
               "failed with %d\n", maxnode, errno);
        return -1;
    }

    bytes = mround(mround(maxnode, 8) >> 3,
                   sizeof(unsigned long));
    nodemask = calloc(bytes, 1);
    mem = anon_mem(NPAGES << 12);
    if (!mem || !nodemask) {
        return -1;
    }

    // Try to bind memory to node
    bitmap_set(nodemask, node);
    res = set_mempolicy(MPOL_BIND, nodemask, maxnode);
    if (res) {
        printf("set_mempolicy(MPOL_BIND, nodemask, %d) "
               "failed with %d\n", maxnode, errno);
        return -1;
    }

    // Write something to breakup from the zero page
    for (unsigned i = 0; i < NPAGES; i++) {
        mem[i << 12] = i + 1;
    }

    // Allow numastats to gather statistics
    getchar();

    return 0;
}

migrate_pages %< ----------------------------------------------------

void *anon_mem(size_t size)
{
    void *ret;

    ret = mmap(NULL, size, PROT_READ|
               PROT_WRITE, MAP_PRIVATE|
               MAP_ANON, -1, 0);
    return ret == MAP_FAILED ? NULL : ret;
}

unsigned long mround(unsigned long v, unsigned long m)
{
    if (m == 0) {
        return v;
    }

    return v + m - (v % m);
}

void bitmap_set(void *_bitmap, unsigned long b)
{
    uint8_t *bitmap = _bitmap;

    bitmap[b >> 3] |= (1 << (b & 7));
}

int main(int argc, char *argv[])
{
    unsigned long *old_nodes, *new_nodes, maxnode, node, i;
    size_t bytes;
    int8_t *mem;
    long res;

    if (argv[1] == NULL || argv[2] == NULL) {
        printf("missing argument: %s maxnodes node\n", argv[0]);
        return -1;
    }
    maxnode = atoi(argv[1]);
    node = atoi(argv[2]);

    // bind memory to node 0 ...
    i = 1;
    res = set_mempolicy(MPOL_BIND, &i, 2);
    if (res) {
        printf("set_mempolicy(MPOL_BIND, []=1, %d) "
               "failed with %d\n", maxnode, errno);
        return -1;
    }

    bytes = mround(mround(maxnode, 8) >> 3,
                   sizeof(unsigned long));
    old_nodes = calloc(bytes, 1);
    new_nodes = calloc(bytes, 1);
    mem = anon_mem(NPAGES << 12);
    if (!mem || !new_nodes || !old_nodes) {
        return -1;
    }

    // Write something to breakup from the zero page
    for (unsigned i = 0; i < NPAGES; i++) {
        mem[i << 12] = i + 1;
    }

    // Try to bind memory to node
    bitmap_set(old_nodes, 0);
    bitmap_set(new_nodes, node);
    res = migrate_pages(getpid(), maxnode,
                        old_nodes, new_nodes);
    if (res) {
        printf("migrate_pages(pid, %d, old_nodes, "
               "new_nodes) failed with %d\n",
               maxnode, errno);
        return -1;
    }

    // Allow numastats to gather statistics
    getchar();

    return 0;
}

Signed-off-by: Jérôme Glisse <jglisse@...gle.com>
To: Andrew Morton <akpm@...ux-foundation.org>
To: linux-mm@...ck.org
Cc: linux-kernel@...r.kernel.org
Cc: stable@...r.kernel.org
---
 mm/mempolicy.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index aec756ae5637..658e5366d266 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1434,7 +1434,6 @@ static int get_bitmap(unsigned long *mask, const unsigned long __user *nmask,
 static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
 		     unsigned long maxnode)
 {
-	--maxnode;
 	nodes_clear(*nodes);
 	if (maxnode == 0 || !nmask)
 		return 0;
-- 
2.45.2.1089.g2a221341d9-goog


Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ