[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20241026224306.982896-3-bigeasy@linutronix.de>
Date: Sun, 27 Oct 2024 00:34:51 +0200
From: Sebastian Andrzej Siewior <bigeasy@...utronix.de>
To: linux-kernel@...r.kernel.org
Cc: André Almeida <andrealmeid@...lia.com>,
Darren Hart <dvhart@...radead.org>,
Davidlohr Bueso <dave@...olabs.net>,
Ingo Molnar <mingo@...hat.com>,
Juri Lelli <juri.lelli@...hat.com>,
Peter Zijlstra <peterz@...radead.org>,
Valentin Schneider <vschneid@...hat.com>,
Waiman Long <longman@...hat.com>,
Sebastian Andrzej Siewior <bigeasy@...utronix.de>
Subject: [RFC PATCH 2/3] futex: Add basic infrastructure for local task local hash.
The futex hashmap is system wide and shared by random tasks. Each slot
is hashed based on its address and VMA. Due to randomized VMAs the same
logical lock (pointer) can end up in a different hash bucket on each
invocation of the application. This in turn means that different
applications may share a hash bucket on each invocation and it is not
always clear which applications will be involved. This can result in
high latencies to acquire the futex_hash_bucket::lock, especially if the
lock owner is limited to a CPU and cannot be effectively PI boosted.
Introduce a task local hash map. The hashmap can be allocated via
prctl(PR_FUTEX_HASH, PR_FUTEX_HASH_ALLOCATE, 0)
The `0' argument allocates a default number of 4 slots; a higher number
can be specified if desired. The current upper limit is 16.
The hashmap can be shared with other threads within an application via
prctl(PR_FUTEX_HASH, PR_FUTEX_HASH_SHARE);
Once the shared hashmap is enabled, all threads must enable it.
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@...utronix.de>
---
include/linux/futex.h | 8 +++
include/linux/sched.h | 2 +
include/uapi/linux/prctl.h | 5 ++
kernel/futex/core.c | 125 +++++++++++++++++++++++++++++++++++++
kernel/sys.c | 4 ++
5 files changed, 144 insertions(+)
diff --git a/include/linux/futex.h b/include/linux/futex.h
index b70df27d7e85c..e92cbea336e8e 100644
--- a/include/linux/futex.h
+++ b/include/linux/futex.h
@@ -69,6 +69,7 @@ static inline void futex_init_task(struct task_struct *tsk)
tsk->pi_state_cache = NULL;
tsk->futex_state = FUTEX_STATE_OK;
mutex_init(&tsk->futex_exit_mutex);
+ rcu_assign_pointer(tsk->futex_hash_table, NULL);
}
void futex_exit_recursive(struct task_struct *tsk);
@@ -77,6 +78,8 @@ void futex_exec_release(struct task_struct *tsk);
long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
u32 __user *uaddr2, u32 val2, u32 val3);
+int futex_hash_prctl(unsigned long arg2, unsigned long arg3,
+ unsigned long arg4, unsigned long arg5);
#else
static inline void futex_init_task(struct task_struct *tsk) { }
static inline void futex_exit_recursive(struct task_struct *tsk) { }
@@ -88,6 +91,11 @@ static inline long do_futex(u32 __user *uaddr, int op, u32 val,
{
return -EINVAL;
}
+static inline int futex_hash_prctl(unsigned long arg2, unsigned long arg3,
+ unsigned long arg4, unsigned long arg5)
+{
+ return -EINVAL;
+}
#endif
#endif
diff --git a/include/linux/sched.h b/include/linux/sched.h
index ade6417609002..8854c6029a9b4 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -58,6 +58,7 @@ struct bpf_net_context;
struct capture_control;
struct cfs_rq;
struct fs_struct;
+struct futex_hash_table;
struct futex_pi_state;
struct io_context;
struct io_uring_task;
@@ -1281,6 +1282,7 @@ struct task_struct {
#endif
#ifdef CONFIG_FUTEX
struct robust_list_head __user *robust_list;
+ struct futex_hash_table *futex_hash_table;
#ifdef CONFIG_COMPAT
struct compat_robust_list_head __user *compat_robust_list;
#endif
diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h
index 35791791a879b..2475b128ba85d 100644
--- a/include/uapi/linux/prctl.h
+++ b/include/uapi/linux/prctl.h
@@ -328,4 +328,9 @@ struct prctl_mm_map {
# define PR_PPC_DEXCR_CTRL_CLEAR_ONEXEC 0x10 /* Clear the aspect on exec */
# define PR_PPC_DEXCR_CTRL_MASK 0x1f
+/* FUTEX hash management */
+#define PR_FUTEX_HASH 74
+# define PR_FUTEX_HASH_ALLOCATE 1
+# define PR_FUTEX_HASH_SHARE 2
+
#endif /* _LINUX_PRCTL_H */
diff --git a/kernel/futex/core.c b/kernel/futex/core.c
index de6d7f71961eb..7c97fc96f84a3 100644
--- a/kernel/futex/core.c
+++ b/kernel/futex/core.c
@@ -39,6 +39,7 @@
#include <linux/memblock.h>
#include <linux/fault-inject.h>
#include <linux/slab.h>
+#include <linux/prctl.h>
#include "futex.h"
#include "../locking/rtmutex_common.h"
@@ -55,6 +56,12 @@ static struct {
#define futex_queues (__futex_data.queues)
#define futex_hashsize (__futex_data.hashsize)
+struct futex_hash_table {
+ unsigned int slots;
+ int users;
+ spinlock_t lock;
+ struct futex_hash_bucket queues[];
+};
/*
* Fault injections for futexes.
@@ -1040,6 +1047,9 @@ static inline void exit_pi_state_list(struct task_struct *curr) { }
static void futex_cleanup(struct task_struct *tsk)
{
+ struct futex_hash_table *fht;
+ bool need_free = false;
+
if (unlikely(tsk->robust_list)) {
exit_robust_list(tsk);
tsk->robust_list = NULL;
@@ -1054,6 +1064,23 @@ static void futex_cleanup(struct task_struct *tsk)
if (unlikely(!list_empty(&tsk->pi_state_list)))
exit_pi_state_list(tsk);
+
+ rcu_read_lock();
+ fht = rcu_dereference(current->futex_hash_table);
+ if (fht) {
+
+ spin_lock(&fht->lock);
+ fht->users--;
+ WARN_ON_ONCE(fht->users < 0);
+ if (fht->users == 0)
+ need_free = true;
+ spin_unlock(&fht->lock);
+ rcu_assign_pointer(current->futex_hash_table, NULL);
+ }
+ rcu_read_unlock();
+
+ if (need_free)
+ kfree_rcu_mightsleep(fht);
}
/**
@@ -1153,6 +1180,104 @@ static void futex_hash_bucket_init(struct futex_hash_bucket *fhb)
spin_lock_init(&fhb->lock);
}
+static int futex_hash_allocate(unsigned long arg3, unsigned long arg4,
+ unsigned long arg5)
+{
+ unsigned int hash_slots = arg3;
+ struct futex_hash_table *fht;
+ size_t struct_size;
+ int i;
+
+ if (hash_slots == 0)
+ hash_slots = 4;
+ if (hash_slots < 2)
+ hash_slots = 2;
+ if (hash_slots > 16)
+ hash_slots = 16;
+ if (!is_power_of_2(hash_slots))
+ hash_slots = rounddown_pow_of_two(hash_slots);
+
+ if (current->futex_hash_table)
+ return -EALREADY;
+
+ struct_size = hash_slots * sizeof(struct futex_hash_bucket);
+ struct_size += sizeof(struct futex_hash_table);
+ fht = kmalloc(struct_size, GFP_KERNEL);
+ if (!fht)
+ return -ENOMEM;
+
+ fht->slots = hash_slots;
+ fht->users = 1;
+ spin_lock_init(&fht->lock);
+
+ for (i = 0; i < hash_slots; i++)
+ futex_hash_bucket_init(&fht->queues[i]);
+
+ rcu_assign_pointer(current->futex_hash_table, fht);
+ return 0;
+}
+
+static int futex_hash_share(unsigned long arg3, unsigned long arg4,
+ unsigned long arg5)
+{
+ struct futex_hash_table *fht;
+ struct task_struct *task;
+ pid_t task_pid;
+ int ret;
+
+ rcu_read_lock();
+ /* XXX maybe auto attach on fork() */
+ task_pid = task_tgid_vnr(current);
+ task = find_task_by_vpid(task_pid);
+ if (!task) {
+ ret = -ESRCH;
+ goto out;
+ }
+
+ fht = rcu_dereference(task->futex_hash_table);
+ if (!fht) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ spin_lock(&fht->lock);
+ if (fht->users <= 0) {
+ ret = -EINVAL;
+ goto unlock_out;
+ }
+ fht->users++;
+
+ rcu_assign_pointer(current->futex_hash_table, fht);
+ ret = 0;
+
+unlock_out:
+ spin_unlock(&fht->lock);
+out:
+ rcu_read_unlock();
+ return ret;
+}
+
+int futex_hash_prctl(unsigned long arg2, unsigned long arg3,
+ unsigned long arg4, unsigned long arg5)
+{
+ int ret;
+
+ switch (arg2) {
+ case PR_FUTEX_HASH_ALLOCATE:
+ ret = futex_hash_allocate(arg3, arg4, arg5);
+ break;
+
+ case PR_FUTEX_HASH_SHARE:
+ ret = futex_hash_share(arg3, arg4, arg5);
+ break;
+
+ default:
+ ret = -EINVAL;
+ break;
+ }
+ return ret;
+}
+
static int __init futex_init(void)
{
unsigned int futex_shift;
diff --git a/kernel/sys.c b/kernel/sys.c
index 4da31f28fda81..0dcbb8ce9f19d 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -52,6 +52,7 @@
#include <linux/user_namespace.h>
#include <linux/time_namespace.h>
#include <linux/binfmts.h>
+#include <linux/futex.h>
#include <linux/sched.h>
#include <linux/sched/autogroup.h>
@@ -2784,6 +2785,9 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
case PR_RISCV_SET_ICACHE_FLUSH_CTX:
error = RISCV_SET_ICACHE_FLUSH_CTX(arg2, arg3);
break;
+ case PR_FUTEX_HASH:
+ error = futex_hash_prctl(arg2, arg3, arg4, arg5);
+ break;
default:
error = -EINVAL;
break;
--
2.45.2
Powered by blists - more mailing lists