[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20250312151634.2183278-22-bigeasy@linutronix.de>
Date: Wed, 12 Mar 2025 16:16:34 +0100
From: Sebastian Andrzej Siewior <bigeasy@...utronix.de>
To: linux-kernel@...r.kernel.org
Cc: André Almeida <andrealmeid@...lia.com>,
Darren Hart <dvhart@...radead.org>,
Davidlohr Bueso <dave@...olabs.net>,
Ingo Molnar <mingo@...hat.com>,
Juri Lelli <juri.lelli@...hat.com>,
Peter Zijlstra <peterz@...radead.org>,
Thomas Gleixner <tglx@...utronix.de>,
Valentin Schneider <vschneid@...hat.com>,
Waiman Long <longman@...hat.com>,
Sebastian Andrzej Siewior <bigeasy@...utronix.de>
Subject: [PATCH v10 21/21] futex: Implement FUTEX2_MPOL
From: Peter Zijlstra <peterz@...radead.org>
Extend the futex2 interface to be aware of mempolicy.
When FUTEX2_MPOL is specified and an MPOL_PREFERRED policy or a home_node
covers the futex address, use the hash-map of that node.
Notably, in this case the futex will go to the global node hashtable,
even if it is a PRIVATE futex.
When FUTEX2_NUMA|FUTEX2_MPOL is specified and the user-specified node
value is FUTEX_NO_NODE, the MPOL lookup (as described above) is tried
first, before falling back to setting the node to the local node.
[bigeasy: add CONFIG_FUTEX_MPOL]
Signed-off-by: Peter Zijlstra (Intel) <peterz@...radead.org>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@...utronix.de>
---
include/linux/mmap_lock.h | 4 ++
include/uapi/linux/futex.h | 2 +-
init/Kconfig | 5 ++
kernel/futex/core.c | 112 +++++++++++++++++++++++++++++++------
kernel/futex/futex.h | 4 ++
5 files changed, 108 insertions(+), 19 deletions(-)
diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h
index 45a21faa3ff62..89fb032545e0d 100644
--- a/include/linux/mmap_lock.h
+++ b/include/linux/mmap_lock.h
@@ -7,6 +7,7 @@
#include <linux/rwsem.h>
#include <linux/tracepoint-defs.h>
#include <linux/types.h>
+#include <linux/cleanup.h>
#define MMAP_LOCK_INITIALIZER(name) \
.mmap_lock = __RWSEM_INITIALIZER((name).mmap_lock),
@@ -217,6 +218,9 @@ static inline void mmap_read_unlock(struct mm_struct *mm)
up_read(&mm->mmap_lock);
}
+DEFINE_GUARD(mmap_read_lock, struct mm_struct *,
+ mmap_read_lock(_T), mmap_read_unlock(_T))
+
static inline void mmap_read_unlock_non_owner(struct mm_struct *mm)
{
__mmap_lock_trace_released(mm, false);
diff --git a/include/uapi/linux/futex.h b/include/uapi/linux/futex.h
index 0435025beaae8..247c425e175ef 100644
--- a/include/uapi/linux/futex.h
+++ b/include/uapi/linux/futex.h
@@ -63,7 +63,7 @@
#define FUTEX2_SIZE_U32 0x02
#define FUTEX2_SIZE_U64 0x03
#define FUTEX2_NUMA 0x04
- /* 0x08 */
+#define FUTEX2_MPOL 0x08
/* 0x10 */
/* 0x20 */
/* 0x40 */
diff --git a/init/Kconfig b/init/Kconfig
index b0a448608446d..a4502a9077e03 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1688,6 +1688,11 @@ config FUTEX_PRIVATE_HASH
depends on FUTEX && !BASE_SMALL && MMU
default y
+config FUTEX_MPOL
+ bool
+ depends on FUTEX && NUMA
+ default y
+
config EPOLL
bool "Enable eventpoll support" if EXPERT
default y
diff --git a/kernel/futex/core.c b/kernel/futex/core.c
index b9da7dc6a900a..65523f3cfe32e 100644
--- a/kernel/futex/core.c
+++ b/kernel/futex/core.c
@@ -43,6 +43,8 @@
#include <linux/slab.h>
#include <linux/prctl.h>
#include <linux/rcuref.h>
+#include <linux/mempolicy.h>
+#include <linux/mmap_lock.h>
#include "futex.h"
#include "../locking/rtmutex_common.h"
@@ -318,6 +320,73 @@ struct futex_hash_bucket *futex_hash(union futex_key *key)
#endif /* CONFIG_FUTEX_PRIVATE_HASH */
+#ifdef CONFIG_FUTEX_MPOL
+static int __futex_key_to_node(struct mm_struct *mm, unsigned long addr)
+{
+ struct vm_area_struct *vma = vma_lookup(mm, addr);
+ struct mempolicy *mpol;
+ int node = FUTEX_NO_NODE;
+
+ if (!vma)
+ return FUTEX_NO_NODE;
+
+ mpol = vma_policy(vma);
+ if (!mpol)
+ return FUTEX_NO_NODE;
+
+ switch (mpol->mode) {
+ case MPOL_PREFERRED:
+ node = first_node(mpol->nodes);
+ break;
+ case MPOL_PREFERRED_MANY:
+ case MPOL_BIND:
+ if (mpol->home_node != NUMA_NO_NODE)
+ node = mpol->home_node;
+ break;
+ default:
+ break;
+ }
+
+ return node;
+}
+
+static int futex_key_to_node_opt(struct mm_struct *mm, unsigned long addr)
+{
+ int seq, node;
+
+ guard(rcu)();
+
+ if (!mmap_lock_speculate_try_begin(mm, &seq))
+ return -EBUSY;
+
+ node = __futex_key_to_node(mm, addr);
+
+ if (mmap_lock_speculate_retry(mm, seq))
+ return -EAGAIN;
+
+ return node;
+}
+
+static int futex_mpol(struct mm_struct *mm, unsigned long addr)
+{
+ int node;
+
+ node = futex_key_to_node_opt(mm, addr);
+ if (node >= FUTEX_NO_NODE)
+ return node;
+
+ guard(mmap_read_lock)(mm);
+ return __futex_key_to_node(mm, addr);
+}
+#else /* !CONFIG_FUTEX_MPOL */
+
+static int futex_mpol(struct mm_struct *mm, unsigned long addr)
+{
+ return FUTEX_NO_NODE;
+}
+
+#endif /* CONFIG_FUTEX_MPOL */
+
/**
* futex_hash - Return the hash bucket in the global hash
* @key: Pointer to the futex key for which the hash is calculated
@@ -329,18 +398,20 @@ struct futex_hash_bucket *futex_hash(union futex_key *key)
static struct futex_hash_bucket *
__futex_hash(union futex_key *key, struct futex_private_hash *fph)
{
- struct futex_hash_bucket *hb;
+ int node = key->both.node;
u32 hash;
- int node;
- hb = __futex_hash_private(key, fph);
- if (hb)
- return hb;
+ if (node == FUTEX_NO_NODE) {
+ struct futex_hash_bucket *hb;
+
+ hb = __futex_hash_private(key, fph);
+ if (hb)
+ return hb;
+ }
hash = jhash2((u32 *)key,
offsetof(typeof(*key), both.offset) / sizeof(u32),
key->both.offset);
- node = key->both.node;
if (node == FUTEX_NO_NODE) {
/*
@@ -488,27 +559,32 @@ int get_futex_key(u32 __user *uaddr, unsigned int flags, union futex_key *key,
if (unlikely(should_fail_futex(fshared)))
return -EFAULT;
+ node = FUTEX_NO_NODE;
+
if (flags & FLAGS_NUMA) {
u32 __user *naddr = uaddr + size / 2;
if (futex_get_value(&node, naddr))
return -EFAULT;
- if (node == FUTEX_NO_NODE) {
- node = numa_node_id();
- if (futex_put_value(node, naddr))
- return -EFAULT;
-
- } else if (node >= MAX_NUMNODES || !node_possible(node)) {
+ if (node >= MAX_NUMNODES || !node_possible(node))
return -EINVAL;
- }
-
- key->both.node = node;
-
- } else {
- key->both.node = FUTEX_NO_NODE;
}
+ if (node == FUTEX_NO_NODE && (flags & FLAGS_MPOL))
+ node = futex_mpol(mm, address);
+
+ if (flags & FLAGS_NUMA) {
+ u32 __user *naddr = uaddr + size / 2;
+
+ if (node == FUTEX_NO_NODE)
+ node = numa_node_id();
+ if (futex_put_value(node, naddr))
+ return -EFAULT;
+ }
+
+ key->both.node = node;
+
/*
* PROCESS_PRIVATE futexes are fast.
* As the mm cannot disappear under us and the 'key' only needs
diff --git a/kernel/futex/futex.h b/kernel/futex/futex.h
index 11c870a92b5d0..52e9c0c4b6c87 100644
--- a/kernel/futex/futex.h
+++ b/kernel/futex/futex.h
@@ -39,6 +39,7 @@
#define FLAGS_HAS_TIMEOUT 0x0040
#define FLAGS_NUMA 0x0080
#define FLAGS_STRICT 0x0100
+#define FLAGS_MPOL 0x0200
/* FUTEX_ to FLAGS_ */
static inline unsigned int futex_to_flags(unsigned int op)
@@ -67,6 +68,9 @@ static inline unsigned int futex2_to_flags(unsigned int flags2)
if (flags2 & FUTEX2_NUMA)
flags |= FLAGS_NUMA;
+ if (flags2 & FUTEX2_MPOL)
+ flags |= FLAGS_MPOL;
+
return flags;
}
--
2.47.2
Powered by blists - more mailing lists