lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <fc3a6902-a845-4b11-ace8-514f10194288@linux.ibm.com>
Date: Wed, 26 Mar 2025 01:22:19 +0530
From: Shrikanth Hegde <sshegde@...ux.ibm.com>
To: Sebastian Andrzej Siewior <bigeasy@...utronix.de>
Cc: Peter Zijlstra <peterz@...radead.org>,
        André Almeida <andrealmeid@...lia.com>,
        Darren Hart <dvhart@...radead.org>,
        Davidlohr Bueso <dave@...olabs.net>, Ingo Molnar <mingo@...hat.com>,
        Juri Lelli <juri.lelli@...hat.com>,
        Thomas Gleixner <tglx@...utronix.de>,
        Valentin Schneider <vschneid@...hat.com>,
        Waiman Long <longman@...hat.com>, linux-kernel@...r.kernel.org
Subject: Re: [PATCH v10 20/21] futex: Implement FUTEX2_NUMA



On 3/12/25 20:46, Sebastian Andrzej Siewior wrote:
> From: Peter Zijlstra <peterz@...radead.org>
> 
> Extend the futex2 interface to be numa aware.
> 
> When FUTEX2_NUMA is specified for a futex, the user value is extended
> to two words (of the same size). The first is the user value we all
> know, the second one will be the node to place this futex on.
> 
>    struct futex_numa_32 {
> 	u32 val;
> 	u32 node;
>    };
> 
> When node is set to ~0, WAIT will set it to the current node_id such
> that WAKE knows where to find it. If userspace corrupts the node value
> between WAIT and WAKE, the futex will not be found and no wakeup will
> happen.
> 
> When FUTEX2_NUMA is not set, the node is simply an extention of the
> hash, such that traditional futexes are still interleaved over the
> nodes.
> 
> This is done to avoid having to have a separate !numa hash-table.
> 
> [bigeasy: ensure to have at least hashsize of 4 in futex_init(), add
> pr_info() for size and allocation information.]
> 
> Signed-off-by: Peter Zijlstra (Intel) <peterz@...radead.org>
> Signed-off-by: Sebastian Andrzej Siewior <bigeasy@...utronix.de>
> ---
>   include/linux/futex.h      |   3 ++
>   include/uapi/linux/futex.h |   8 +++
>   kernel/futex/core.c        | 100 ++++++++++++++++++++++++++++++-------
>   kernel/futex/futex.h       |  33 ++++++++++--
>   4 files changed, 124 insertions(+), 20 deletions(-)
> 
> diff --git a/include/linux/futex.h b/include/linux/futex.h
> index 7e14d2e9162d2..19c37afa0432a 100644
> --- a/include/linux/futex.h
> +++ b/include/linux/futex.h
> @@ -34,6 +34,7 @@ union futex_key {
>   		u64 i_seq;
>   		unsigned long pgoff;
>   		unsigned int offset;
> +		/* unsigned int node; */
>   	} shared;
>   	struct {
>   		union {
> @@ -42,11 +43,13 @@ union futex_key {
>   		};
>   		unsigned long address;
>   		unsigned int offset;
> +		/* unsigned int node; */
>   	} private;
>   	struct {
>   		u64 ptr;
>   		unsigned long word;
>   		unsigned int offset;
> +		unsigned int node;	/* NOT hashed! */
>   	} both;
>   };
>   
> diff --git a/include/uapi/linux/futex.h b/include/uapi/linux/futex.h
> index d2ee625ea1890..0435025beaae8 100644
> --- a/include/uapi/linux/futex.h
> +++ b/include/uapi/linux/futex.h
> @@ -74,6 +74,14 @@
>   /* do not use */
>   #define FUTEX_32		FUTEX2_SIZE_U32 /* historical accident :-( */
>   
> +
> +/*
> + * When FUTEX2_NUMA doubles the futex word, the second word is a node value.
> + * The special value -1 indicates no-node. This is the same value as
> + * NUMA_NO_NODE, except that value is not ABI, this is.
> + */
> +#define FUTEX_NO_NODE		(-1)
> +
>   /*
>    * Max numbers of elements in a futex_waitv array
>    */
> diff --git a/kernel/futex/core.c b/kernel/futex/core.c
> index bc7451287b2ce..b9da7dc6a900a 100644
> --- a/kernel/futex/core.c
> +++ b/kernel/futex/core.c
> @@ -36,6 +36,8 @@
>   #include <linux/pagemap.h>
>   #include <linux/debugfs.h>
>   #include <linux/plist.h>
> +#include <linux/gfp.h>
> +#include <linux/vmalloc.h>
>   #include <linux/memblock.h>
>   #include <linux/fault-inject.h>
>   #include <linux/slab.h>
> @@ -51,11 +53,14 @@
>    * reside in the same cacheline.
>    */
>   static struct {
> -	struct futex_hash_bucket *queues;
>   	unsigned long            hashmask;
> +	unsigned int		 hashshift;
> +	struct futex_hash_bucket *queues[MAX_NUMNODES];
>   } __futex_data __read_mostly __aligned(2*sizeof(long));
> -#define futex_queues   (__futex_data.queues)
> -#define futex_hashmask (__futex_data.hashmask)
> +
> +#define futex_hashmask	(__futex_data.hashmask)
> +#define futex_hashshift	(__futex_data.hashshift)
> +#define futex_queues	(__futex_data.queues)
>   
>   struct futex_private_hash {
>   	rcuref_t	users;
> @@ -326,15 +331,35 @@ __futex_hash(union futex_key *key, struct futex_private_hash *fph)
>   {
>   	struct futex_hash_bucket *hb;
>   	u32 hash;
> +	int node;
>   
>   	hb = __futex_hash_private(key, fph);
>   	if (hb)
>   		return hb;
>   
>   	hash = jhash2((u32 *)key,
> -		      offsetof(typeof(*key), both.offset) / 4,
> +		      offsetof(typeof(*key), both.offset) / sizeof(u32),
>   		      key->both.offset);
> -	return &futex_queues[hash & futex_hashmask];
> +	node = key->both.node;
> +
> +	if (node == FUTEX_NO_NODE) {
> +		/*
> +		 * In case of !FLAGS_NUMA, use some unused hash bits to pick a
> +		 * node -- this ensures regular futexes are interleaved across
> +		 * the nodes and avoids having to allocate multiple
> +		 * hash-tables.
> +		 *
> +		 * NOTE: this isn't perfectly uniform, but it is fast and
> +		 * handles sparse node masks.
> +		 */
> +		node = (hash >> futex_hashshift) % nr_node_ids;
> +		if (!node_possible(node)) {
> +			node = find_next_bit_wrap(node_possible_map.bits,
> +						  nr_node_ids, node);
> +		}
> +	}
> +
> +	return &futex_queues[node][hash & futex_hashmask];

IIUC, when one specifies the numa node it can't be private futex anymore?


>   }
>   
>   /**
> @@ -441,25 +466,49 @@ int get_futex_key(u32 __user *uaddr, unsigned int flags, union futex_key *key,
>   	struct page *page;
>   	struct folio *folio;
>   	struct address_space *mapping;
> -	int err, ro = 0;
> +	int node, err, size, ro = 0;
>   	bool fshared;
>   
>   	fshared = flags & FLAGS_SHARED;
> +	size = futex_size(flags);
> +	if (flags & FLAGS_NUMA)
> +		size *= 2;
>   
>   	/*
>   	 * The futex address must be "naturally" aligned.
>   	 */
>   	key->both.offset = address % PAGE_SIZE;
> -	if (unlikely((address % sizeof(u32)) != 0))
> +	if (unlikely((address % size) != 0))
>   		return -EINVAL;
>   	address -= key->both.offset;
>   
> -	if (unlikely(!access_ok(uaddr, sizeof(u32))))
> +	if (unlikely(!access_ok(uaddr, size)))
>   		return -EFAULT;
>   
>   	if (unlikely(should_fail_futex(fshared)))
>   		return -EFAULT;
>   
> +	if (flags & FLAGS_NUMA) {
> +		u32 __user *naddr = uaddr + size / 2;
> +
> +		if (futex_get_value(&node, naddr))
> +			return -EFAULT;
> +
> +		if (node == FUTEX_NO_NODE) {
> +			node = numa_node_id();
> +			if (futex_put_value(node, naddr))
> +				return -EFAULT;
> +
> +		} else if (node >= MAX_NUMNODES || !node_possible(node)) {
> +			return -EINVAL;
> +		}
> +
> +		key->both.node = node;
> +
> +	} else {
> +		key->both.node = FUTEX_NO_NODE;
> +	}
> +
>   	/*
>   	 * PROCESS_PRIVATE futexes are fast.
>   	 * As the mm cannot disappear under us and the 'key' only needs
> @@ -1597,24 +1646,41 @@ int futex_hash_prctl(unsigned long arg2, unsigned long arg3)
>   static int __init futex_init(void)
>   {
>   	unsigned long hashsize, i;
> -	unsigned int futex_shift;
> +	unsigned int order, n;
> +	unsigned long size;
>   
>   #ifdef CONFIG_BASE_SMALL
>   	hashsize = 16;
>   #else
> -	hashsize = roundup_pow_of_two(256 * num_possible_cpus());
> +	hashsize = 256 * num_possible_cpus();
> +	hashsize /= num_possible_nodes();

Wouldn't it be better to use num_online_nodes? each node may get a bigger
hash bucket which means less collision no?


> +	hashsize = max(4, hashsize);
> +	hashsize = roundup_pow_of_two(hashsize);
>   #endif
> +	futex_hashshift = ilog2(hashsize);
> +	size = sizeof(struct futex_hash_bucket) * hashsize;
> +	order = get_order(size);
>   
> -	futex_queues = alloc_large_system_hash("futex", sizeof(*futex_queues),
> -					       hashsize, 0, 0,
> -					       &futex_shift, NULL,
> -					       hashsize, hashsize);
> -	hashsize = 1UL << futex_shift;
> +	for_each_node(n) {
> +		struct futex_hash_bucket *table;
>   
> -	for (i = 0; i < hashsize; i++)
> -		futex_hash_bucket_init(&futex_queues[i], NULL);
> +		if (order > MAX_PAGE_ORDER)
> +			table = vmalloc_huge_node(size, GFP_KERNEL, n);
> +		else
> +			table = alloc_pages_exact_nid(n, size, GFP_KERNEL);
> +
> +		BUG_ON(!table);
> +
> +		for (i = 0; i < hashsize; i++)
> +			futex_hash_bucket_init(&table[i], NULL);
> +
> +		futex_queues[n] = table;
> +	}
>   
>   	futex_hashmask = hashsize - 1;
> +	pr_info("futex hash table entries: %lu (%lu bytes on %d NUMA nodes, total %lu KiB, %s).\n",
> +		hashsize, size, num_possible_nodes(), size * num_possible_nodes() / 1024,
> +		order > MAX_PAGE_ORDER ? "vmalloc" : "linear");
>   	return 0;
>   }
>   core_initcall(futex_init);
> diff --git a/kernel/futex/futex.h b/kernel/futex/futex.h
> index 8eba9982bcae1..11c870a92b5d0 100644
> --- a/kernel/futex/futex.h
> +++ b/kernel/futex/futex.h
> @@ -54,7 +54,7 @@ static inline unsigned int futex_to_flags(unsigned int op)
>   	return flags;
>   }
>   
> -#define FUTEX2_VALID_MASK (FUTEX2_SIZE_MASK | FUTEX2_PRIVATE)
> +#define FUTEX2_VALID_MASK (FUTEX2_SIZE_MASK | FUTEX2_NUMA | FUTEX2_PRIVATE)
>   
>   /* FUTEX2_ to FLAGS_ */
>   static inline unsigned int futex2_to_flags(unsigned int flags2)
> @@ -87,6 +87,19 @@ static inline bool futex_flags_valid(unsigned int flags)
>   	if ((flags & FLAGS_SIZE_MASK) != FLAGS_SIZE_32)
>   		return false;
>   
> +	/*
> +	 * Must be able to represent both FUTEX_NO_NODE and every valid nodeid
> +	 * in a futex word.
> +	 */
> +	if (flags & FLAGS_NUMA) {
> +		int bits = 8 * futex_size(flags);
> +		u64 max = ~0ULL;
> +
> +		max >>= 64 - bits;
> +		if (nr_node_ids >= max)
> +			return false;
> +	}
> +
>   	return true;
>   }
>   
> @@ -290,7 +303,7 @@ static inline int futex_cmpxchg_value_locked(u32 *curval, u32 __user *uaddr, u32
>    * This looks a bit overkill, but generally just results in a couple
>    * of instructions.
>    */
> -static __always_inline int futex_read_inatomic(u32 *dest, u32 __user *from)
> +static __always_inline int futex_get_value(u32 *dest, u32 __user *from)
>   {
>   	u32 val;
>   
> @@ -307,12 +320,26 @@ static __always_inline int futex_read_inatomic(u32 *dest, u32 __user *from)
>   	return -EFAULT;
>   }
>   
> +static __always_inline int futex_put_value(u32 val, u32 __user *to)
> +{
> +	if (can_do_masked_user_access())
> +		to = masked_user_access_begin(to);
> +	else if (!user_read_access_begin(to, sizeof(*to)))
> +		return -EFAULT;
> +	unsafe_put_user(val, to, Efault);
> +	user_read_access_end();
> +	return 0;
> +Efault:
> +	user_read_access_end();
> +	return -EFAULT;
> +}
> +
>   static inline int futex_get_value_locked(u32 *dest, u32 __user *from)
>   {
>   	int ret;
>   
>   	pagefault_disable();
> -	ret = futex_read_inatomic(dest, from);
> +	ret = futex_get_value(dest, from);
>   	pagefault_enable();
>   
>   	return ret;


Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ