[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20250407155742.968816-13-bigeasy@linutronix.de>
Date: Mon, 7 Apr 2025 17:57:35 +0200
From: Sebastian Andrzej Siewior <bigeasy@...utronix.de>
To: linux-kernel@...r.kernel.org
Cc: André Almeida <andrealmeid@...lia.com>,
Darren Hart <dvhart@...radead.org>,
Davidlohr Bueso <dave@...olabs.net>,
Ingo Molnar <mingo@...hat.com>,
Juri Lelli <juri.lelli@...hat.com>,
Peter Zijlstra <peterz@...radead.org>,
Thomas Gleixner <tglx@...utronix.de>,
Valentin Schneider <vschneid@...hat.com>,
Waiman Long <longman@...hat.com>,
Sebastian Andrzej Siewior <bigeasy@...utronix.de>
Subject: [PATCH v11 12/19] futex: Add basic infrastructure for local task local hash.
The futex hash is system wide and shared by all tasks. Each slot
is hashed based on futex address and the VMA of the thread. Due to
randomized VMAs (and memory allocations) the same logical lock (pointer)
can end up in a different hash bucket on each invocation of the
application. This in turn means that different applications may share a
hash bucket on the first invocation but not on the second and it is not
always clear which applications will be involved. This can result in
high latency's to acquire the futex_hash_bucket::lock especially if the
lock owner is limited to a CPU and can not be effectively PI boosted.
Introduce basic infrastructure for process local hash which is shared by
all threads of process. This hash will only be used for a
PROCESS_PRIVATE FUTEX operation.
The hashmap can be allocated via
prctl(PR_FUTEX_HASH, PR_FUTEX_HASH_SET_SLOTS, num)
A `num' of 0 means that the global hash is used instead of a private
hash.
Other values for `num' specify the number of slots for the hash and the
number must be power of two, starting with two.
The prctl() returns zero on success. This function can only be used
before a thread is created.
The current status for the private hash can be queried via
num = prctl(PR_FUTEX_HASH, PR_FUTEX_HASH_GET_SLOTS);
which return the current number of slots. The value 0 means that the
global hash is used. Values greater than 0 indicate the number of slots
that are used. A negative number indicates an error.
For optimisation, for the private hash jhash2() uses only two arguments
the address and the offset. This omits the VMA which is always the same.
[peterz: Use 0 for global hash. A bit shuffling and renaming. ]
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@...utronix.de>
---
include/linux/futex.h | 26 ++++-
include/linux/mm_types.h | 5 +-
include/uapi/linux/prctl.h | 5 +
init/Kconfig | 5 +
kernel/fork.c | 2 +
kernel/futex/core.c | 206 +++++++++++++++++++++++++++++++++----
kernel/futex/futex.h | 10 ++
kernel/sys.c | 4 +
8 files changed, 242 insertions(+), 21 deletions(-)
diff --git a/include/linux/futex.h b/include/linux/futex.h
index b70df27d7e85c..bbc897e54c864 100644
--- a/include/linux/futex.h
+++ b/include/linux/futex.h
@@ -4,11 +4,11 @@
#include <linux/sched.h>
#include <linux/ktime.h>
+#include <linux/mm_types.h>
#include <uapi/linux/futex.h>
struct inode;
-struct mm_struct;
struct task_struct;
/*
@@ -77,7 +77,22 @@ void futex_exec_release(struct task_struct *tsk);
long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
u32 __user *uaddr2, u32 val2, u32 val3);
-#else
+int futex_hash_prctl(unsigned long arg2, unsigned long arg3);
+
+#ifdef CONFIG_FUTEX_PRIVATE_HASH
+void futex_hash_free(struct mm_struct *mm);
+
+static inline void futex_mm_init(struct mm_struct *mm)
+{
+ mm->futex_phash = NULL;
+}
+
+#else /* !CONFIG_FUTEX_PRIVATE_HASH */
+static inline void futex_hash_free(struct mm_struct *mm) { }
+static inline void futex_mm_init(struct mm_struct *mm) { }
+#endif /* CONFIG_FUTEX_PRIVATE_HASH */
+
+#else /* !CONFIG_FUTEX */
static inline void futex_init_task(struct task_struct *tsk) { }
static inline void futex_exit_recursive(struct task_struct *tsk) { }
static inline void futex_exit_release(struct task_struct *tsk) { }
@@ -88,6 +103,13 @@ static inline long do_futex(u32 __user *uaddr, int op, u32 val,
{
return -EINVAL;
}
+static inline int futex_hash_prctl(unsigned long arg2, unsigned long arg3)
+{
+ return -EINVAL;
+}
+static inline void futex_hash_free(struct mm_struct *mm) { }
+static inline void futex_mm_init(struct mm_struct *mm) { }
+
#endif
#endif
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 56d07edd01f91..a4b5661e41770 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -31,6 +31,7 @@
#define INIT_PASID 0
struct address_space;
+struct futex_private_hash;
struct mem_cgroup;
/*
@@ -1031,7 +1032,9 @@ struct mm_struct {
*/
seqcount_t mm_lock_seq;
#endif
-
+#ifdef CONFIG_FUTEX_PRIVATE_HASH
+ struct futex_private_hash *futex_phash;
+#endif
unsigned long hiwater_rss; /* High-watermark of RSS usage */
unsigned long hiwater_vm; /* High-water virtual memory usage */
diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h
index 15c18ef4eb11a..3b93fb906e3c5 100644
--- a/include/uapi/linux/prctl.h
+++ b/include/uapi/linux/prctl.h
@@ -364,4 +364,9 @@ struct prctl_mm_map {
# define PR_TIMER_CREATE_RESTORE_IDS_ON 1
# define PR_TIMER_CREATE_RESTORE_IDS_GET 2
+/* FUTEX hash management */
+#define PR_FUTEX_HASH 78
+# define PR_FUTEX_HASH_SET_SLOTS 1
+# define PR_FUTEX_HASH_GET_SLOTS 2
+
#endif /* _LINUX_PRCTL_H */
diff --git a/init/Kconfig b/init/Kconfig
index dd2ea3b9a7992..b308b98d79347 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1699,6 +1699,11 @@ config FUTEX_PI
depends on FUTEX && RT_MUTEXES
default y
+config FUTEX_PRIVATE_HASH
+ bool
+ depends on FUTEX && !BASE_SMALL && MMU
+ default y
+
config EPOLL
bool "Enable eventpoll support" if EXPERT
default y
diff --git a/kernel/fork.c b/kernel/fork.c
index c4b26cd8998b8..831dfec450544 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1305,6 +1305,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
RCU_INIT_POINTER(mm->exe_file, NULL);
mmu_notifier_subscriptions_init(mm);
init_tlb_flush_pending(mm);
+ futex_mm_init(mm);
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !defined(CONFIG_SPLIT_PMD_PTLOCKS)
mm->pmd_huge_pte = NULL;
#endif
@@ -1387,6 +1388,7 @@ static inline void __mmput(struct mm_struct *mm)
if (mm->binfmt)
module_put(mm->binfmt->module);
lru_gen_del_mm(mm);
+ futex_hash_free(mm);
mmdrop(mm);
}
diff --git a/kernel/futex/core.c b/kernel/futex/core.c
index afc66780f84fc..479a629b9aeea 100644
--- a/kernel/futex/core.c
+++ b/kernel/futex/core.c
@@ -39,6 +39,7 @@
#include <linux/memblock.h>
#include <linux/fault-inject.h>
#include <linux/slab.h>
+#include <linux/prctl.h>
#include "futex.h"
#include "../locking/rtmutex_common.h"
@@ -55,6 +56,12 @@ static struct {
#define futex_queues (__futex_data.queues)
#define futex_hashmask (__futex_data.hashmask)
+struct futex_private_hash {
+ unsigned int hash_mask;
+ void *mm;
+ bool custom;
+ struct futex_hash_bucket queues[];
+};
/*
* Fault injections for futexes.
@@ -107,9 +114,17 @@ late_initcall(fail_futex_debugfs);
#endif /* CONFIG_FAIL_FUTEX */
-struct futex_private_hash *futex_private_hash(void)
+static struct futex_hash_bucket *
+__futex_hash(union futex_key *key, struct futex_private_hash *fph);
+
+#ifdef CONFIG_FUTEX_PRIVATE_HASH
+static inline bool futex_key_is_private(union futex_key *key)
{
- return NULL;
+ /*
+ * Relies on get_futex_key() to set either bit for shared
+ * futexes -- see comment with union futex_key.
+ */
+ return !(key->both.offset & (FUT_OFF_INODE | FUT_OFF_MMSHARED));
}
bool futex_private_hash_get(struct futex_private_hash *fph)
@@ -117,21 +132,8 @@ bool futex_private_hash_get(struct futex_private_hash *fph)
return false;
}
-void futex_private_hash_put(struct futex_private_hash *fph) { }
-
-/**
- * futex_hash - Return the hash bucket in the global hash
- * @key: Pointer to the futex key for which the hash is calculated
- *
- * We hash on the keys returned from get_futex_key (see below) and return the
- * corresponding hash bucket in the global hash.
- */
-struct futex_hash_bucket *futex_hash(union futex_key *key)
+void futex_private_hash_put(struct futex_private_hash *fph)
{
- u32 hash = jhash2((u32 *)key, offsetof(typeof(*key), both.offset) / 4,
- key->both.offset);
-
- return &futex_queues[hash & futex_hashmask];
}
/**
@@ -144,6 +146,84 @@ struct futex_hash_bucket *futex_hash(union futex_key *key)
void futex_hash_get(struct futex_hash_bucket *hb) { }
void futex_hash_put(struct futex_hash_bucket *hb) { }
+static struct futex_hash_bucket *
+__futex_hash_private(union futex_key *key, struct futex_private_hash *fph)
+{
+ u32 hash;
+
+ if (!futex_key_is_private(key))
+ return NULL;
+
+ if (!fph)
+ fph = key->private.mm->futex_phash;
+ if (!fph || !fph->hash_mask)
+ return NULL;
+
+ hash = jhash2((void *)&key->private.address,
+ sizeof(key->private.address) / 4,
+ key->both.offset);
+ return &fph->queues[hash & fph->hash_mask];
+}
+
+struct futex_private_hash *futex_private_hash(void)
+{
+ struct mm_struct *mm = current->mm;
+ struct futex_private_hash *fph;
+
+ fph = mm->futex_phash;
+ return fph;
+}
+
+struct futex_hash_bucket *futex_hash(union futex_key *key)
+{
+ struct futex_hash_bucket *hb;
+
+ hb = __futex_hash(key, NULL);
+ return hb;
+}
+
+#else /* !CONFIG_FUTEX_PRIVATE_HASH */
+
+static struct futex_hash_bucket *
+__futex_hash_private(union futex_key *key, struct futex_private_hash *fph)
+{
+ return NULL;
+}
+
+struct futex_hash_bucket *futex_hash(union futex_key *key)
+{
+ return __futex_hash(key, NULL);
+}
+
+#endif /* CONFIG_FUTEX_PRIVATE_HASH */
+
+/**
+ * __futex_hash - Return the hash bucket
+ * @key: Pointer to the futex key for which the hash is calculated
+ * @fph: Pointer to private hash if known
+ *
+ * We hash on the keys returned from get_futex_key (see below) and return the
+ * corresponding hash bucket.
+ * If the FUTEX is PROCESS_PRIVATE then a per-process hash bucket (from the
+ * private hash) is returned if existing. Otherwise a hash bucket from the
+ * global hash is returned.
+ */
+static struct futex_hash_bucket *
+__futex_hash(union futex_key *key, struct futex_private_hash *fph)
+{
+ struct futex_hash_bucket *hb;
+ u32 hash;
+
+ hb = __futex_hash_private(key, fph);
+ if (hb)
+ return hb;
+
+ hash = jhash2((u32 *)key,
+ offsetof(typeof(*key), both.offset) / 4,
+ key->both.offset);
+ return &futex_queues[hash & futex_hashmask];
+}
+
/**
* futex_setup_timer - set up the sleeping hrtimer.
* @time: ptr to the given timeout value
@@ -985,6 +1065,13 @@ static void exit_pi_state_list(struct task_struct *curr)
struct futex_pi_state *pi_state;
union futex_key key = FUTEX_KEY_INIT;
+ /*
+ * Ensure the hash remains stable (no resize) during the while loop
+ * below. The hb pointer is acquired under the pi_lock so we can't block
+ * on the mutex.
+ */
+ WARN_ON(curr != current);
+ guard(private_hash)();
/*
* We are a ZOMBIE and nobody can enqueue itself on
* pi_state_list anymore, but we have to be careful
@@ -1160,13 +1247,96 @@ void futex_exit_release(struct task_struct *tsk)
futex_cleanup_end(tsk, FUTEX_STATE_DEAD);
}
-static void futex_hash_bucket_init(struct futex_hash_bucket *fhb)
+static void futex_hash_bucket_init(struct futex_hash_bucket *fhb,
+ struct futex_private_hash *fph)
{
+#ifdef CONFIG_FUTEX_PRIVATE_HASH
+ fhb->priv = fph;
+#endif
atomic_set(&fhb->waiters, 0);
plist_head_init(&fhb->chain);
spin_lock_init(&fhb->lock);
}
+#ifdef CONFIG_FUTEX_PRIVATE_HASH
+void futex_hash_free(struct mm_struct *mm)
+{
+ kvfree(mm->futex_phash);
+}
+
+static int futex_hash_allocate(unsigned int hash_slots, bool custom)
+{
+ struct mm_struct *mm = current->mm;
+ struct futex_private_hash *fph;
+ int i;
+
+ if (hash_slots && (hash_slots == 1 || !is_power_of_2(hash_slots)))
+ return -EINVAL;
+
+ if (mm->futex_phash)
+ return -EALREADY;
+
+ if (!thread_group_empty(current))
+ return -EINVAL;
+
+ fph = kvzalloc(struct_size(fph, queues, hash_slots), GFP_KERNEL_ACCOUNT);
+ if (!fph)
+ return -ENOMEM;
+
+ fph->hash_mask = hash_slots ? hash_slots - 1 : 0;
+ fph->custom = custom;
+ fph->mm = mm;
+
+ for (i = 0; i < hash_slots; i++)
+ futex_hash_bucket_init(&fph->queues[i], fph);
+
+ mm->futex_phash = fph;
+ return 0;
+}
+
+static int futex_hash_get_slots(void)
+{
+ struct futex_private_hash *fph;
+
+ fph = current->mm->futex_phash;
+ if (fph && fph->hash_mask)
+ return fph->hash_mask + 1;
+ return 0;
+}
+
+#else
+
+static int futex_hash_allocate(unsigned int hash_slots, bool custom)
+{
+ return -EINVAL;
+}
+
+static int futex_hash_get_slots(void)
+{
+ return 0;
+}
+#endif
+
+int futex_hash_prctl(unsigned long arg2, unsigned long arg3)
+{
+ int ret;
+
+ switch (arg2) {
+ case PR_FUTEX_HASH_SET_SLOTS:
+ ret = futex_hash_allocate(arg3, true);
+ break;
+
+ case PR_FUTEX_HASH_GET_SLOTS:
+ ret = futex_hash_get_slots();
+ break;
+
+ default:
+ ret = -EINVAL;
+ break;
+ }
+ return ret;
+}
+
static int __init futex_init(void)
{
unsigned long hashsize, i;
@@ -1185,7 +1355,7 @@ static int __init futex_init(void)
hashsize = 1UL << futex_shift;
for (i = 0; i < hashsize; i++)
- futex_hash_bucket_init(&futex_queues[i]);
+ futex_hash_bucket_init(&futex_queues[i], NULL);
futex_hashmask = hashsize - 1;
return 0;
diff --git a/kernel/futex/futex.h b/kernel/futex/futex.h
index 26e69333cb745..899aed5acde12 100644
--- a/kernel/futex/futex.h
+++ b/kernel/futex/futex.h
@@ -118,6 +118,7 @@ struct futex_hash_bucket {
atomic_t waiters;
spinlock_t lock;
struct plist_head chain;
+ struct futex_private_hash *priv;
} ____cacheline_aligned_in_smp;
/*
@@ -204,6 +205,7 @@ futex_setup_timer(ktime_t *time, struct hrtimer_sleeper *timeout,
int flags, u64 range_ns);
extern struct futex_hash_bucket *futex_hash(union futex_key *key);
+#ifdef CONFIG_FUTEX_PRIVATE_HASH
extern void futex_hash_get(struct futex_hash_bucket *hb);
extern void futex_hash_put(struct futex_hash_bucket *hb);
@@ -211,6 +213,14 @@ extern struct futex_private_hash *futex_private_hash(void);
extern bool futex_private_hash_get(struct futex_private_hash *fph);
extern void futex_private_hash_put(struct futex_private_hash *fph);
+#else /* !CONFIG_FUTEX_PRIVATE_HASH */
+static inline void futex_hash_get(struct futex_hash_bucket *hb) { }
+static inline void futex_hash_put(struct futex_hash_bucket *hb) { }
+static inline struct futex_private_hash *futex_private_hash(void) { return NULL; }
+static inline bool futex_private_hash_get(void) { return false; }
+static inline void futex_private_hash_put(struct futex_private_hash *fph) { }
+#endif
+
DEFINE_CLASS(hb, struct futex_hash_bucket *,
if (_T) futex_hash_put(_T),
futex_hash(key), union futex_key *key);
diff --git a/kernel/sys.c b/kernel/sys.c
index c434968e9f5dd..d446d8ecb0b33 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -52,6 +52,7 @@
#include <linux/user_namespace.h>
#include <linux/time_namespace.h>
#include <linux/binfmts.h>
+#include <linux/futex.h>
#include <linux/sched.h>
#include <linux/sched/autogroup.h>
@@ -2820,6 +2821,9 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
return -EINVAL;
error = posixtimer_create_prctl(arg2);
break;
+ case PR_FUTEX_HASH:
+ error = futex_hash_prctl(arg2, arg3);
+ break;
default:
trace_task_prctl_unknown(option, arg2, arg3, arg4, arg5);
error = -EINVAL;
--
2.49.0
Powered by blists - more mailing lists