[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <fc3a6902-a845-4b11-ace8-514f10194288@linux.ibm.com>
Date: Wed, 26 Mar 2025 01:22:19 +0530
From: Shrikanth Hegde <sshegde@...ux.ibm.com>
To: Sebastian Andrzej Siewior <bigeasy@...utronix.de>
Cc: Peter Zijlstra <peterz@...radead.org>,
André Almeida <andrealmeid@...lia.com>,
Darren Hart <dvhart@...radead.org>,
Davidlohr Bueso <dave@...olabs.net>, Ingo Molnar <mingo@...hat.com>,
Juri Lelli <juri.lelli@...hat.com>,
Thomas Gleixner <tglx@...utronix.de>,
Valentin Schneider <vschneid@...hat.com>,
Waiman Long <longman@...hat.com>, linux-kernel@...r.kernel.org
Subject: Re: [PATCH v10 20/21] futex: Implement FUTEX2_NUMA
On 3/12/25 20:46, Sebastian Andrzej Siewior wrote:
> From: Peter Zijlstra <peterz@...radead.org>
>
> Extend the futex2 interface to be numa aware.
>
> When FUTEX2_NUMA is specified for a futex, the user value is extended
> to two words (of the same size). The first is the user value we all
> know, the second one will be the node to place this futex on.
>
> struct futex_numa_32 {
> u32 val;
> u32 node;
> };
>
> When node is set to ~0, WAIT will set it to the current node_id such
> that WAKE knows where to find it. If userspace corrupts the node value
> between WAIT and WAKE, the futex will not be found and no wakeup will
> happen.
>
> When FUTEX2_NUMA is not set, the node is simply an extention of the
> hash, such that traditional futexes are still interleaved over the
> nodes.
>
> This is done to avoid having to have a separate !numa hash-table.
>
> [bigeasy: ensure to have at least hashsize of 4 in futex_init(), add
> pr_info() for size and allocation information.]
>
> Signed-off-by: Peter Zijlstra (Intel) <peterz@...radead.org>
> Signed-off-by: Sebastian Andrzej Siewior <bigeasy@...utronix.de>
> ---
> include/linux/futex.h | 3 ++
> include/uapi/linux/futex.h | 8 +++
> kernel/futex/core.c | 100 ++++++++++++++++++++++++++++++-------
> kernel/futex/futex.h | 33 ++++++++++--
> 4 files changed, 124 insertions(+), 20 deletions(-)
>
> diff --git a/include/linux/futex.h b/include/linux/futex.h
> index 7e14d2e9162d2..19c37afa0432a 100644
> --- a/include/linux/futex.h
> +++ b/include/linux/futex.h
> @@ -34,6 +34,7 @@ union futex_key {
> u64 i_seq;
> unsigned long pgoff;
> unsigned int offset;
> + /* unsigned int node; */
> } shared;
> struct {
> union {
> @@ -42,11 +43,13 @@ union futex_key {
> };
> unsigned long address;
> unsigned int offset;
> + /* unsigned int node; */
> } private;
> struct {
> u64 ptr;
> unsigned long word;
> unsigned int offset;
> + unsigned int node; /* NOT hashed! */
> } both;
> };
>
> diff --git a/include/uapi/linux/futex.h b/include/uapi/linux/futex.h
> index d2ee625ea1890..0435025beaae8 100644
> --- a/include/uapi/linux/futex.h
> +++ b/include/uapi/linux/futex.h
> @@ -74,6 +74,14 @@
> /* do not use */
> #define FUTEX_32 FUTEX2_SIZE_U32 /* historical accident :-( */
>
> +
> +/*
> + * When FUTEX2_NUMA doubles the futex word, the second word is a node value.
> + * The special value -1 indicates no-node. This is the same value as
> + * NUMA_NO_NODE, except that value is not ABI, this is.
> + */
> +#define FUTEX_NO_NODE (-1)
> +
> /*
> * Max numbers of elements in a futex_waitv array
> */
> diff --git a/kernel/futex/core.c b/kernel/futex/core.c
> index bc7451287b2ce..b9da7dc6a900a 100644
> --- a/kernel/futex/core.c
> +++ b/kernel/futex/core.c
> @@ -36,6 +36,8 @@
> #include <linux/pagemap.h>
> #include <linux/debugfs.h>
> #include <linux/plist.h>
> +#include <linux/gfp.h>
> +#include <linux/vmalloc.h>
> #include <linux/memblock.h>
> #include <linux/fault-inject.h>
> #include <linux/slab.h>
> @@ -51,11 +53,14 @@
> * reside in the same cacheline.
> */
> static struct {
> - struct futex_hash_bucket *queues;
> unsigned long hashmask;
> + unsigned int hashshift;
> + struct futex_hash_bucket *queues[MAX_NUMNODES];
> } __futex_data __read_mostly __aligned(2*sizeof(long));
> -#define futex_queues (__futex_data.queues)
> -#define futex_hashmask (__futex_data.hashmask)
> +
> +#define futex_hashmask (__futex_data.hashmask)
> +#define futex_hashshift (__futex_data.hashshift)
> +#define futex_queues (__futex_data.queues)
>
> struct futex_private_hash {
> rcuref_t users;
> @@ -326,15 +331,35 @@ __futex_hash(union futex_key *key, struct futex_private_hash *fph)
> {
> struct futex_hash_bucket *hb;
> u32 hash;
> + int node;
>
> hb = __futex_hash_private(key, fph);
> if (hb)
> return hb;
>
> hash = jhash2((u32 *)key,
> - offsetof(typeof(*key), both.offset) / 4,
> + offsetof(typeof(*key), both.offset) / sizeof(u32),
> key->both.offset);
> - return &futex_queues[hash & futex_hashmask];
> + node = key->both.node;
> +
> + if (node == FUTEX_NO_NODE) {
> + /*
> + * In case of !FLAGS_NUMA, use some unused hash bits to pick a
> + * node -- this ensures regular futexes are interleaved across
> + * the nodes and avoids having to allocate multiple
> + * hash-tables.
> + *
> + * NOTE: this isn't perfectly uniform, but it is fast and
> + * handles sparse node masks.
> + */
> + node = (hash >> futex_hashshift) % nr_node_ids;
> + if (!node_possible(node)) {
> + node = find_next_bit_wrap(node_possible_map.bits,
> + nr_node_ids, node);
> + }
> + }
> +
> + return &futex_queues[node][hash & futex_hashmask];
IIUC, when one specifies the numa node it can't be private futex anymore?
> }
>
> /**
> @@ -441,25 +466,49 @@ int get_futex_key(u32 __user *uaddr, unsigned int flags, union futex_key *key,
> struct page *page;
> struct folio *folio;
> struct address_space *mapping;
> - int err, ro = 0;
> + int node, err, size, ro = 0;
> bool fshared;
>
> fshared = flags & FLAGS_SHARED;
> + size = futex_size(flags);
> + if (flags & FLAGS_NUMA)
> + size *= 2;
>
> /*
> * The futex address must be "naturally" aligned.
> */
> key->both.offset = address % PAGE_SIZE;
> - if (unlikely((address % sizeof(u32)) != 0))
> + if (unlikely((address % size) != 0))
> return -EINVAL;
> address -= key->both.offset;
>
> - if (unlikely(!access_ok(uaddr, sizeof(u32))))
> + if (unlikely(!access_ok(uaddr, size)))
> return -EFAULT;
>
> if (unlikely(should_fail_futex(fshared)))
> return -EFAULT;
>
> + if (flags & FLAGS_NUMA) {
> + u32 __user *naddr = uaddr + size / 2;
> +
> + if (futex_get_value(&node, naddr))
> + return -EFAULT;
> +
> + if (node == FUTEX_NO_NODE) {
> + node = numa_node_id();
> + if (futex_put_value(node, naddr))
> + return -EFAULT;
> +
> + } else if (node >= MAX_NUMNODES || !node_possible(node)) {
> + return -EINVAL;
> + }
> +
> + key->both.node = node;
> +
> + } else {
> + key->both.node = FUTEX_NO_NODE;
> + }
> +
> /*
> * PROCESS_PRIVATE futexes are fast.
> * As the mm cannot disappear under us and the 'key' only needs
> @@ -1597,24 +1646,41 @@ int futex_hash_prctl(unsigned long arg2, unsigned long arg3)
> static int __init futex_init(void)
> {
> unsigned long hashsize, i;
> - unsigned int futex_shift;
> + unsigned int order, n;
> + unsigned long size;
>
> #ifdef CONFIG_BASE_SMALL
> hashsize = 16;
> #else
> - hashsize = roundup_pow_of_two(256 * num_possible_cpus());
> + hashsize = 256 * num_possible_cpus();
> + hashsize /= num_possible_nodes();
Wouldn't it be better to use num_online_nodes? each node may get a bigger
hash bucket which means less collision no?
> + hashsize = max(4, hashsize);
> + hashsize = roundup_pow_of_two(hashsize);
> #endif
> + futex_hashshift = ilog2(hashsize);
> + size = sizeof(struct futex_hash_bucket) * hashsize;
> + order = get_order(size);
>
> - futex_queues = alloc_large_system_hash("futex", sizeof(*futex_queues),
> - hashsize, 0, 0,
> - &futex_shift, NULL,
> - hashsize, hashsize);
> - hashsize = 1UL << futex_shift;
> + for_each_node(n) {
> + struct futex_hash_bucket *table;
>
> - for (i = 0; i < hashsize; i++)
> - futex_hash_bucket_init(&futex_queues[i], NULL);
> + if (order > MAX_PAGE_ORDER)
> + table = vmalloc_huge_node(size, GFP_KERNEL, n);
> + else
> + table = alloc_pages_exact_nid(n, size, GFP_KERNEL);
> +
> + BUG_ON(!table);
> +
> + for (i = 0; i < hashsize; i++)
> + futex_hash_bucket_init(&table[i], NULL);
> +
> + futex_queues[n] = table;
> + }
>
> futex_hashmask = hashsize - 1;
> + pr_info("futex hash table entries: %lu (%lu bytes on %d NUMA nodes, total %lu KiB, %s).\n",
> + hashsize, size, num_possible_nodes(), size * num_possible_nodes() / 1024,
> + order > MAX_PAGE_ORDER ? "vmalloc" : "linear");
> return 0;
> }
> core_initcall(futex_init);
> diff --git a/kernel/futex/futex.h b/kernel/futex/futex.h
> index 8eba9982bcae1..11c870a92b5d0 100644
> --- a/kernel/futex/futex.h
> +++ b/kernel/futex/futex.h
> @@ -54,7 +54,7 @@ static inline unsigned int futex_to_flags(unsigned int op)
> return flags;
> }
>
> -#define FUTEX2_VALID_MASK (FUTEX2_SIZE_MASK | FUTEX2_PRIVATE)
> +#define FUTEX2_VALID_MASK (FUTEX2_SIZE_MASK | FUTEX2_NUMA | FUTEX2_PRIVATE)
>
> /* FUTEX2_ to FLAGS_ */
> static inline unsigned int futex2_to_flags(unsigned int flags2)
> @@ -87,6 +87,19 @@ static inline bool futex_flags_valid(unsigned int flags)
> if ((flags & FLAGS_SIZE_MASK) != FLAGS_SIZE_32)
> return false;
>
> + /*
> + * Must be able to represent both FUTEX_NO_NODE and every valid nodeid
> + * in a futex word.
> + */
> + if (flags & FLAGS_NUMA) {
> + int bits = 8 * futex_size(flags);
> + u64 max = ~0ULL;
> +
> + max >>= 64 - bits;
> + if (nr_node_ids >= max)
> + return false;
> + }
> +
> return true;
> }
>
> @@ -290,7 +303,7 @@ static inline int futex_cmpxchg_value_locked(u32 *curval, u32 __user *uaddr, u32
> * This looks a bit overkill, but generally just results in a couple
> * of instructions.
> */
> -static __always_inline int futex_read_inatomic(u32 *dest, u32 __user *from)
> +static __always_inline int futex_get_value(u32 *dest, u32 __user *from)
> {
> u32 val;
>
> @@ -307,12 +320,26 @@ static __always_inline int futex_read_inatomic(u32 *dest, u32 __user *from)
> return -EFAULT;
> }
>
> +static __always_inline int futex_put_value(u32 val, u32 __user *to)
> +{
> + if (can_do_masked_user_access())
> + to = masked_user_access_begin(to);
> + else if (!user_read_access_begin(to, sizeof(*to)))
> + return -EFAULT;
> + unsafe_put_user(val, to, Efault);
> + user_read_access_end();
> + return 0;
> +Efault:
> + user_read_access_end();
> + return -EFAULT;
> +}
> +
> static inline int futex_get_value_locked(u32 *dest, u32 __user *from)
> {
> int ret;
>
> pagefault_disable();
> - ret = futex_read_inatomic(dest, from);
> + ret = futex_get_value(dest, from);
> pagefault_enable();
>
> return ret;
Powered by blists - more mailing lists