Message-ID: <20250123202446.610203-13-bigeasy@linutronix.de>
Date: Thu, 23 Jan 2025 21:24:42 +0100
From: Sebastian Andrzej Siewior <bigeasy@...utronix.de>
To: linux-kernel@...r.kernel.org
Cc: André Almeida <andrealmeid@...lia.com>,
Darren Hart <dvhart@...radead.org>,
Davidlohr Bueso <dave@...olabs.net>,
Ingo Molnar <mingo@...hat.com>,
Juri Lelli <juri.lelli@...hat.com>,
Peter Zijlstra <peterz@...radead.org>,
Thomas Gleixner <tglx@...utronix.de>,
Valentin Schneider <vschneid@...hat.com>,
Waiman Long <longman@...hat.com>,
Sebastian Andrzej Siewior <bigeasy@...utronix.de>
Subject: [PATCH v7 12/15] futex: Allow to re-allocate the private local hash.
The mm_struct::futex_hash_lock guards assignment and replacement of the
futex_hash_bucket. The futex_hash_allocate()/PR_FUTEX_HASH_SET_SLOTS
operation can now be invoked at runtime to resize an already existing
internal private futex_hash_bucket to a different number of slots.
The reallocation is based on an idea by Thomas Gleixner: The initial
allocation of struct futex_private_hash sets the reference count to one.
Every user acquires a reference on the local hash before using it and
drops it after it has enqueued itself on the hash bucket. No reference is
held while the task is scheduled out waiting for the wakeup.
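Condensed, the lifecycle from a waiter's point of view looks roughly as
follows (a sketch only, not the literal wait path; the helper names are
the ones used in this patch):

	hb = futex_hash(&q->key);         /* takes a reference on the private hash */
	spin_lock(&hb->lock);
	plist_add(&q->list, &hb->chain);  /* waiter is now reachable via the bucket */
	q->lock_ptr = &hb->lock;
	spin_unlock(&hb->lock);
	futex_hash_put(hb);               /* no reference held while sleeping */
	schedule();                       /* wakers find us via q->lock_ptr */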
The resize allocates a new struct futex_private_hash and drops the
initial reference under the mm_struct::futex_hash_lock. If the reference
drop results in destruction of the object then the users currently queued
on the local hash are requeued on the new local hash. At the end
mm_struct::futex_phash is updated, the old pointer is RCU-freed and the
mutex is dropped.
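Condensed, the replacement step under the lock looks roughly like this
(sketch; the initial_ref_dropped/released bookkeeping of
futex_assign_new_hash() below is elided):

	mutex_lock(&mm->futex_hash_lock);
	old = rcu_dereference_protected(mm->futex_phash,
					lockdep_is_held(&mm->futex_hash_lock));
	if (futex_put_private_hash(old)) {
		/* Last reference gone: move queued waiters and publish. */
		futex_rehash_current_users(old, new);
		rcu_assign_pointer(mm->futex_phash, new);
		kvfree_rcu(old, rcu);
	} else {
		/* Still referenced: stash the new hash and defer the switch. */
		mm->futex_phash_new = new;
	}
	mutex_unlock(&mm->futex_hash_lock);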
If the reference drop does not result in destruction of the object then
the new pointer is saved as mm_struct::futex_phash_new and the
replacement is deferred. The user dropping the last reference is not
always the best candidate to perform the replacement. For instance
futex_wait_queue() drops the reference after changing its task state,
which would be clobbered again by blocking on the futex_hash_lock mutex.
Therefore the replacement is deferred to the next task that acquires a
reference on the current local hash.
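The consumer side of that deferral, condensed from the
futex_get_private_hash() hunk below: if a reference can no longer be
obtained, fall back to the mutex, perform (or wait for) the pending
replacement and retry:

	again:
		scoped_guard(rcu) {
			hb_p = rcu_dereference(mm->futex_phash);
			if (!hb_p)
				return NULL;
			if (rcuref_get(&hb_p->users))
				return hb_p;	/* fast path */
		}
		/* Reference count is dead, a replacement is pending. */
		scoped_guard(mutex, &mm->futex_hash_lock)
			futex_assign_new_hash(NULL, mm);
		goto again;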
This scheme preserves the requirement that all waiters/wakers of the
same address always block on the same futex_hash_bucket::lock.
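For reference, a runtime resize from userspace might look like this
(sketch; PR_FUTEX_HASH and its argument layout are assumed from earlier
patches in this series, only PR_FUTEX_HASH_SET_SLOTS is referenced here):

	#include <sys/prctl.h>

	/* Request a 16-slot private hash; a later, larger request resizes it. */
	prctl(PR_FUTEX_HASH, PR_FUTEX_HASH_SET_SLOTS, 16, 0);
	prctl(PR_FUTEX_HASH, PR_FUTEX_HASH_SET_SLOTS, 64, 0);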
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@...utronix.de>
---
include/linux/futex.h | 5 +-
include/linux/mm_types.h | 7 +-
kernel/futex/core.c | 252 +++++++++++++++++++++++++++++++++++----
kernel/futex/futex.h | 1 +
kernel/futex/requeue.c | 5 +
kernel/futex/waitwake.c | 4 +-
6 files changed, 243 insertions(+), 31 deletions(-)
diff --git a/include/linux/futex.h b/include/linux/futex.h
index bad377c30de5e..bfb38764bac7a 100644
--- a/include/linux/futex.h
+++ b/include/linux/futex.h
@@ -83,12 +83,13 @@ void futex_hash_free(struct mm_struct *mm);
static inline void futex_mm_init(struct mm_struct *mm)
{
- mm->futex_hash_bucket = NULL;
+ rcu_assign_pointer(mm->futex_phash, NULL);
+ mutex_init(&mm->futex_hash_lock);
}
static inline bool futex_hash_requires_allocation(void)
{
- if (current->mm->futex_hash_bucket)
+ if (current->mm->futex_phash)
return false;
return true;
}
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index a3db2ae88c1c2..79ad913bccc86 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -30,7 +30,7 @@
#define INIT_PASID 0
struct address_space;
-struct futex_hash_bucket;
+struct futex_private_hash;
struct mem_cgroup;
/*
@@ -934,8 +934,9 @@ struct mm_struct {
#endif
#ifdef CONFIG_FUTEX
- unsigned int futex_hash_mask;
- struct futex_hash_bucket *futex_hash_bucket;
+ struct mutex futex_hash_lock;
+ struct futex_private_hash __rcu *futex_phash;
+ struct futex_private_hash *futex_phash_new;
#endif
unsigned long hiwater_rss; /* High-watermark of RSS usage */
diff --git a/kernel/futex/core.c b/kernel/futex/core.c
index 7130019aa9ec6..e1bf43f7eb277 100644
--- a/kernel/futex/core.c
+++ b/kernel/futex/core.c
@@ -40,6 +40,7 @@
#include <linux/fault-inject.h>
#include <linux/slab.h>
#include <linux/prctl.h>
+#include <linux/rcuref.h>
#include "futex.h"
#include "../locking/rtmutex_common.h"
@@ -56,6 +57,14 @@ static struct {
#define futex_queues (__futex_data.queues)
#define futex_hashsize (__futex_data.hashsize)
+struct futex_private_hash {
+ rcuref_t users;
+ unsigned int hash_mask;
+ struct rcu_head rcu;
+ bool initial_ref_dropped;
+ bool released;
+ struct futex_hash_bucket queues[];
+};
/*
* Fault injections for futexes.
@@ -129,9 +138,122 @@ static struct futex_hash_bucket *futex_hash_private(union futex_key *key,
return &fhb[hash & hash_mask];
}
+static void futex_rehash_current_users(struct futex_private_hash *old,
+ struct futex_private_hash *new)
+{
+ struct futex_hash_bucket *hb_old, *hb_new;
+ unsigned int slots = old->hash_mask + 1;
+ u32 hash_mask = new->hash_mask;
+ unsigned int i;
+
+ for (i = 0; i < slots; i++) {
+ struct futex_q *this, *tmp;
+
+ hb_old = &old->queues[i];
+
+ spin_lock(&hb_old->lock);
+ plist_for_each_entry_safe(this, tmp, &hb_old->chain, list) {
+
+ plist_del(&this->list, &hb_old->chain);
+ futex_hb_waiters_dec(hb_old);
+
+ WARN_ON_ONCE(this->lock_ptr != &hb_old->lock);
+
+ hb_new = futex_hash_private(&this->key, new->queues, hash_mask);
+ futex_hb_waiters_inc(hb_new);
+ /*
+ * The new pointer isn't published yet but an already
+ * moved user can be unqueued due to timeout or signal.
+ */
+ spin_lock_nested(&hb_new->lock, SINGLE_DEPTH_NESTING);
+ plist_add(&this->list, &hb_new->chain);
+ this->lock_ptr = &hb_new->lock;
+ spin_unlock(&hb_new->lock);
+ }
+ spin_unlock(&hb_old->lock);
+ }
+}
+
+static void futex_assign_new_hash(struct futex_private_hash *hb_p_new,
+ struct mm_struct *mm)
+{
+ bool drop_init_ref = hb_p_new != NULL;
+ struct futex_private_hash *hb_p;
+
+ if (!hb_p_new) {
+ hb_p_new = mm->futex_phash_new;
+ mm->futex_phash_new = NULL;
+ }
+ /* Someone was quicker, the current mask is valid */
+ if (!hb_p_new)
+ return;
+
+ hb_p = rcu_dereference_check(mm->futex_phash,
+ lockdep_is_held(&mm->futex_hash_lock));
+ if (hb_p) {
+ if (hb_p->hash_mask >= hb_p_new->hash_mask) {
+ /* It was increased again while we were waiting */
+ kvfree(hb_p_new);
+ return;
+ }
+ /*
+ * If the caller started the resize then the initial reference
+ * needs to be dropped. If the object cannot be destroyed
+ * we save hb_p_new for later and ensure the reference counter
+ * is not dropped again.
+ */
+ if (drop_init_ref &&
+ (hb_p->initial_ref_dropped || !futex_put_private_hash(hb_p))) {
+ mm->futex_phash_new = hb_p_new;
+ hb_p->initial_ref_dropped = true;
+ return;
+ }
+ if (!READ_ONCE(hb_p->released)) {
+ mm->futex_phash_new = hb_p_new;
+ return;
+ }
+
+ futex_rehash_current_users(hb_p, hb_p_new);
+ }
+ rcu_assign_pointer(mm->futex_phash, hb_p_new);
+ kvfree_rcu(hb_p, rcu);
+}
+
struct futex_private_hash *futex_get_private_hash(void)
{
- return NULL;
+ struct mm_struct *mm = current->mm;
+ /*
+ * Ideally we don't loop. If there is a replacement in progress
+ * then a new private hash is already prepared and a reference can't be
+ * obtained once the last user dropped its reference.
+ * In that case we block on mm_struct::futex_hash_lock and either have
+ * to perform the replacement or wait while someone else is doing the
+ * job. Either way, on the second iteration we acquire a reference on the
+ * new private hash or loop again because a new replacement has been
+ * requested.
+ */
+again:
+ scoped_guard(rcu) {
+ struct futex_private_hash *hb_p;
+
+ hb_p = rcu_dereference(mm->futex_phash);
+ if (!hb_p)
+ return NULL;
+
+ if (rcuref_get(&hb_p->users))
+ return hb_p;
+ }
+ scoped_guard(mutex, &current->mm->futex_hash_lock)
+ futex_assign_new_hash(NULL, mm);
+ goto again;
+}
+
+static struct futex_private_hash *futex_get_private_hb(union futex_key *key)
+{
+ if (!futex_key_is_private(key))
+ return NULL;
+
+ return futex_get_private_hash();
}
/**
@@ -144,12 +266,12 @@ struct futex_private_hash *futex_get_private_hash(void)
*/
struct futex_hash_bucket *futex_hash(union futex_key *key)
{
- struct futex_hash_bucket *fhb;
+ struct futex_private_hash *hb_p;
u32 hash;
- fhb = current->mm->futex_hash_bucket;
- if (fhb && futex_key_is_private(key))
- return futex_hash_private(key, fhb, current->mm->futex_hash_mask);
+ hb_p = futex_get_private_hb(key);
+ if (hb_p)
+ return futex_hash_private(key, hb_p->queues, hb_p->hash_mask);
hash = jhash2((u32 *)key,
offsetof(typeof(*key), both.offset) / 4,
@@ -159,11 +281,25 @@ struct futex_hash_bucket *futex_hash(union futex_key *key)
bool futex_put_private_hash(struct futex_private_hash *hb_p)
{
- return false;
+ bool released;
+
+ guard(preempt)();
+ released = rcuref_put_rcusafe(&hb_p->users);
+ if (released)
+ WRITE_ONCE(hb_p->released, true);
+ return released;
}
void futex_hash_put(struct futex_hash_bucket *hb)
{
+ struct futex_private_hash *hb_p;
+
+ if (hb->hb_slot == 0)
+ return;
+ hb_p = container_of(hb, struct futex_private_hash,
+ queues[hb->hb_slot - 1]);
+
+ futex_put_private_hash(hb_p);
}
/**
@@ -175,6 +311,14 @@ void futex_hash_put(struct futex_hash_bucket *hb)
*/
void futex_hash_get(struct futex_hash_bucket *hb)
{
+ struct futex_private_hash *hb_p;
+
+ if (hb->hb_slot == 0)
+ return;
+ hb_p = container_of(hb, struct futex_private_hash,
+ queues[hb->hb_slot - 1]);
+
+ WARN_ON_ONCE(!rcuref_get(&hb_p->users));
}
/**
@@ -622,6 +766,8 @@ int futex_unqueue(struct futex_q *q)
spinlock_t *lock_ptr;
int ret = 0;
+ /* RCU so lock_ptr is not going away during locking. */
+ guard(rcu)();
/* In the common case we don't take the spinlock, which is nice. */
retry:
/*
@@ -1032,10 +1178,22 @@ static void compat_exit_robust_list(struct task_struct *curr)
static void exit_pi_state_list(struct task_struct *curr)
{
struct list_head *next, *head = &curr->pi_state_list;
+ struct futex_private_hash *hb_p;
struct futex_pi_state *pi_state;
struct futex_hash_bucket *hb;
union futex_key key = FUTEX_KEY_INIT;
+ /*
+ * The mutex mm_struct::futex_hash_lock might be acquired.
+ */
+ might_sleep();
+ /*
+ * Ensure the hash remains stable (no resize) during the while loop
+ * below. The hb pointer is acquired under the pi_lock so we can't block
+ * on the mutex.
+ */
+ WARN_ON(curr != current);
+ hb_p = futex_get_private_hash();
/*
* We are a ZOMBIE and nobody can enqueue itself on
* pi_state_list anymore, but we have to be careful
@@ -1061,6 +1219,7 @@ static void exit_pi_state_list(struct task_struct *curr)
if (!refcount_inc_not_zero(&pi_state->refcount)) {
raw_spin_unlock_irq(&curr->pi_lock);
cpu_relax();
+ futex_hash_put(hb);
raw_spin_lock_irq(&curr->pi_lock);
continue;
}
@@ -1076,7 +1235,7 @@ static void exit_pi_state_list(struct task_struct *curr)
if (head->next != next) {
/* retain curr->pi_lock for the loop invariant */
raw_spin_unlock(&pi_state->pi_mutex.wait_lock);
- spin_unlock(&hb->lock);
+ futex_hb_unlock_put(hb);
put_pi_state(pi_state);
continue;
}
@@ -1088,7 +1247,7 @@ static void exit_pi_state_list(struct task_struct *curr)
raw_spin_unlock(&curr->pi_lock);
raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
- spin_unlock(&hb->lock);
+ futex_hb_unlock_put(hb);
rt_mutex_futex_unlock(&pi_state->pi_mutex);
put_pi_state(pi_state);
@@ -1096,6 +1255,8 @@ static void exit_pi_state_list(struct task_struct *curr)
raw_spin_lock_irq(&curr->pi_lock);
}
raw_spin_unlock_irq(&curr->pi_lock);
+ if (hb_p)
+ futex_put_private_hash(hb_p);
}
#else
static inline void exit_pi_state_list(struct task_struct *curr) { }
@@ -1209,8 +1370,9 @@ void futex_exit_release(struct task_struct *tsk)
futex_cleanup_end(tsk, FUTEX_STATE_DEAD);
}
-static void futex_hash_bucket_init(struct futex_hash_bucket *fhb)
+static void futex_hash_bucket_init(struct futex_hash_bucket *fhb, unsigned int slot)
{
+ fhb->hb_slot = slot;
atomic_set(&fhb->waiters, 0);
plist_head_init(&fhb->chain);
spin_lock_init(&fhb->lock);
@@ -1218,20 +1380,33 @@ static void futex_hash_bucket_init(struct futex_hash_bucket *fhb)
void futex_hash_free(struct mm_struct *mm)
{
- kvfree(mm->futex_hash_bucket);
+ struct futex_private_hash *hb_p;
+
+ kvfree(mm->futex_phash_new);
+ /*
+ * The mm_struct belonging to the task is removed so all threads that
+ * ever accessed the private hash are gone and the pointer can be
+ * accessed directly (omitting an RCU read section).
+ * Since there cannot be a thread holding a reference to the private
+ * hash we free it immediately.
+ */
+ hb_p = rcu_access_pointer(mm->futex_phash);
+ if (!hb_p)
+ return;
+
+ if (!hb_p->initial_ref_dropped && WARN_ON(!futex_put_private_hash(hb_p)))
+ return;
+
+ kvfree(hb_p);
}
static int futex_hash_allocate(unsigned int hash_slots)
{
- struct futex_hash_bucket *fhb;
+ struct futex_private_hash *hb_p, *hb_tofree = NULL;
+ struct mm_struct *mm = current->mm;
+ size_t alloc_size;
int i;
- if (current->mm->futex_hash_bucket)
- return -EALREADY;
-
- if (!thread_group_leader(current))
- return -EINVAL;
-
if (hash_slots == 0)
hash_slots = 16;
if (hash_slots < 2)
@@ -1241,16 +1416,39 @@ static int futex_hash_allocate(unsigned int hash_slots)
if (!is_power_of_2(hash_slots))
hash_slots = rounddown_pow_of_two(hash_slots);
- fhb = kvmalloc_array(hash_slots, sizeof(struct futex_hash_bucket), GFP_KERNEL_ACCOUNT);
- if (!fhb)
+ if (unlikely(check_mul_overflow(hash_slots, sizeof(struct futex_hash_bucket),
+ &alloc_size)))
return -ENOMEM;
- current->mm->futex_hash_mask = hash_slots - 1;
+ if (unlikely(check_add_overflow(alloc_size, sizeof(struct futex_private_hash),
+ &alloc_size)))
+ return -ENOMEM;
+
+ hb_p = kvmalloc(alloc_size, GFP_KERNEL_ACCOUNT);
+ if (!hb_p)
+ return -ENOMEM;
+
+ rcuref_init(&hb_p->users, 1);
+ hb_p->initial_ref_dropped = false;
+ hb_p->released = false;
+ hb_p->hash_mask = hash_slots - 1;
for (i = 0; i < hash_slots; i++)
- futex_hash_bucket_init(&fhb[i]);
+ futex_hash_bucket_init(&hb_p->queues[i], i + 1);
- current->mm->futex_hash_bucket = fhb;
+ scoped_guard(mutex, &mm->futex_hash_lock) {
+ if (mm->futex_phash_new) {
+ if (mm->futex_phash_new->hash_mask <= hb_p->hash_mask) {
+ hb_tofree = mm->futex_phash_new;
+ } else {
+ hb_tofree = hb_p;
+ hb_p = mm->futex_phash_new;
+ }
+ mm->futex_phash_new = NULL;
+ }
+ futex_assign_new_hash(hb_p, mm);
+ }
+ kvfree(hb_tofree);
return 0;
}
@@ -1261,8 +1459,12 @@ int futex_hash_allocate_default(void)
static int futex_hash_get_slots(void)
{
- if (current->mm->futex_hash_bucket)
- return current->mm->futex_hash_mask + 1;
+ struct futex_private_hash *hb_p;
+
+ guard(rcu)();
+ hb_p = rcu_dereference(current->mm->futex_phash);
+ if (hb_p)
+ return hb_p->hash_mask + 1;
return 0;
}
@@ -1304,7 +1506,7 @@ static int __init futex_init(void)
futex_hashsize = 1UL << futex_shift;
for (i = 0; i < futex_hashsize; i++)
- futex_hash_bucket_init(&futex_queues[i]);
+ futex_hash_bucket_init(&futex_queues[i], 0);
return 0;
}
diff --git a/kernel/futex/futex.h b/kernel/futex/futex.h
index d6fa6f663d9ad..127f333d3b0d5 100644
--- a/kernel/futex/futex.h
+++ b/kernel/futex/futex.h
@@ -115,6 +115,7 @@ static inline bool should_fail_futex(bool fshared)
*/
struct futex_hash_bucket {
atomic_t waiters;
+ unsigned int hb_slot;
spinlock_t lock;
struct plist_head chain;
} ____cacheline_aligned_in_smp;
diff --git a/kernel/futex/requeue.c b/kernel/futex/requeue.c
index 167204e856fec..eb506428c9574 100644
--- a/kernel/futex/requeue.c
+++ b/kernel/futex/requeue.c
@@ -87,6 +87,11 @@ void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1,
futex_hb_waiters_inc(hb2);
plist_add(&q->list, &hb2->chain);
q->lock_ptr = &hb2->lock;
+ /*
+ * hb1 and hb2 belong to the same futex_private_hash
+ * because if we managed to get a reference on hb1 then it can't be
+ * replaced. Therefore we avoid put(hb1)+get(hb2) here.
+ */
}
q->key = *key2;
}
diff --git a/kernel/futex/waitwake.c b/kernel/futex/waitwake.c
index 419af848deadb..828e2dbf92c2b 100644
--- a/kernel/futex/waitwake.c
+++ b/kernel/futex/waitwake.c
@@ -173,8 +173,10 @@ int futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
hb = futex_hash(&key);
/* Make sure we really have tasks to wakeup */
- if (!futex_hb_waiters_pending(hb))
+ if (!futex_hb_waiters_pending(hb)) {
+ futex_hash_put(hb);
return ret;
+ }
spin_lock(&hb->lock);
--
2.47.2