Message-ID: <20241218111618.268028-11-bigeasy@linutronix.de>
Date: Wed, 18 Dec 2024 12:09:48 +0100
From: Sebastian Andrzej Siewior <bigeasy@...utronix.de>
To: linux-kernel@...r.kernel.org
Cc: André Almeida <andrealmeid@...lia.com>,
Darren Hart <dvhart@...radead.org>,
Davidlohr Bueso <dave@...olabs.net>,
Ingo Molnar <mingo@...hat.com>,
Juri Lelli <juri.lelli@...hat.com>,
Peter Zijlstra <peterz@...radead.org>,
Thomas Gleixner <tglx@...utronix.de>,
Valentin Schneider <vschneid@...hat.com>,
Waiman Long <longman@...hat.com>,
Sebastian Andrzej Siewior <bigeasy@...utronix.de>
Subject: [PATCH v6 10/15] futex: Allow to re-allocate the private local hash.
The mm_struct::futex_hash_lock guards the futex_hash_bucket assignment/
replacement. The futex_hash_allocate()/PR_FUTEX_HASH_SET_SLOTS operation
can now be invoked at runtime to resize an already existing internal
private futex_hash_bucket to another size.

The reallocation is based on an idea by Thomas Gleixner: The initial
allocation of struct futex_hash_bucket_private sets the reference count
to one. Every user acquires a reference on the local hash before using
it and drops it after it has enqueued itself on the hash bucket. There
is no reference held while the task is scheduled out while waiting for
the wakeup.

The resize allocates a new struct futex_hash_bucket_private and drops
the initial reference under the mm_struct::futex_hash_lock. If the
reference drop results in destruction of the object then the users
currently queued on the local hash are requeued on the new local hash.
At the end mm_struct::futex_hash_bucket is updated, the old pointer is
RCU freed and the mutex is dropped.

If the reference drop does not result in destruction of the object then
the new pointer is saved as mm_struct::futex_hash_new and the
replacement is delayed. The user dropping the last reference is not
always the best choice to perform the replacement. For instance,
futex_wait_queue() drops the reference after setting its task state,
and that state would be modified again while acquiring the
futex_hash_lock. Therefore the replacement is delayed until the next
task that tries to acquire a reference on the current local hash.

This scheme preserves the requirement that during a lock/unlock
operation all waiters block on the same futex_hash_bucket::lock.
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@...utronix.de>
---
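For reference, a rough userspace sketch (not part of the patch) that
exercises the runtime resize while waiters are queued, i.e. the path in
which futex_rehash_current_users() moves them over to the new buckets.
The PR_FUTEX_HASH/PR_FUTEX_HASH_SET_SLOTS prctl() constants and the
argument layout are assumed to come from the uapi additions earlier in
this series; build with -pthread against headers providing them:

#define _GNU_SOURCE
#include <limits.h>
#include <linux/futex.h>
#include <pthread.h>
#include <stdio.h>
#include <sys/prctl.h>
#include <sys/syscall.h>
#include <unistd.h>

static unsigned int futex_word;

static long sys_futex(unsigned int *uaddr, int op, unsigned int val)
{
	return syscall(SYS_futex, uaddr, op, val, NULL, NULL, 0);
}

static void *waiter(void *arg)
{
	/* Enqueue on the process-private local hash and sleep. */
	sys_futex(&futex_word, FUTEX_WAIT_PRIVATE, 0);
	return NULL;
}

int main(void)
{
	pthread_t thr[8];
	int i;

	for (i = 0; i < 8; i++)
		pthread_create(&thr[i], NULL, waiter, NULL);
	sleep(1);	/* crude, but lets the waiters enqueue themselves */

#if defined(PR_FUTEX_HASH) && defined(PR_FUTEX_HASH_SET_SLOTS)
	/* Resize the private hash while the waiters are still queued. */
	if (prctl(PR_FUTEX_HASH, PR_FUTEX_HASH_SET_SLOTS, 64, 0, 0))
		perror("prctl(PR_FUTEX_HASH)");
#endif

	__atomic_store_n(&futex_word, 1, __ATOMIC_SEQ_CST);
	sys_futex(&futex_word, FUTEX_WAKE_PRIVATE, INT_MAX);
	for (i = 0; i < 8; i++)
		pthread_join(thr[i], NULL);
	return 0;
}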
include/linux/futex.h | 3 +-
include/linux/mm_types.h | 7 +-
kernel/futex/core.c | 230 +++++++++++++++++++++++++++++++++++----
kernel/futex/futex.h | 1 +
kernel/futex/requeue.c | 5 +
kernel/futex/waitwake.c | 4 +-
6 files changed, 222 insertions(+), 28 deletions(-)
diff --git a/include/linux/futex.h b/include/linux/futex.h
index bad377c30de5e..3ced01a9c5218 100644
--- a/include/linux/futex.h
+++ b/include/linux/futex.h
@@ -83,7 +83,8 @@ void futex_hash_free(struct mm_struct *mm);
static inline void futex_mm_init(struct mm_struct *mm)
{
- mm->futex_hash_bucket = NULL;
+ rcu_assign_pointer(mm->futex_hash_bucket, NULL);
+ mutex_init(&mm->futex_hash_lock);
}
static inline bool futex_hash_requires_allocation(void)
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 2337a2e481fd0..62fe872b381f8 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -30,7 +30,7 @@
#define INIT_PASID 0
struct address_space;
-struct futex_hash_bucket;
+struct futex_hash_bucket_private;
struct mem_cgroup;
/*
@@ -904,8 +904,9 @@ struct mm_struct {
#endif
#ifdef CONFIG_FUTEX
- unsigned int futex_hash_mask;
- struct futex_hash_bucket *futex_hash_bucket;
+ struct mutex futex_hash_lock;
+ struct futex_hash_bucket_private __rcu *futex_hash_bucket;
+ struct futex_hash_bucket_private *futex_hash_new;
#endif
unsigned long hiwater_rss; /* High-watermark of RSS usage */
diff --git a/kernel/futex/core.c b/kernel/futex/core.c
index e8214656a66b6..44e16f033a4dd 100644
--- a/kernel/futex/core.c
+++ b/kernel/futex/core.c
@@ -40,6 +40,7 @@
#include <linux/fault-inject.h>
#include <linux/slab.h>
#include <linux/prctl.h>
+#include <linux/rcuref.h>
#include "futex.h"
#include "../locking/rtmutex_common.h"
@@ -56,6 +57,13 @@ static struct {
#define futex_queues (__futex_data.queues)
#define futex_hashsize (__futex_data.hashsize)
+struct futex_hash_bucket_private {
+ rcuref_t users;
+ unsigned int hash_mask;
+ struct rcu_head rcu;
+ bool initial_ref_dropped;
+ struct futex_hash_bucket queues[];
+};
/*
* Fault injections for futexes.
@@ -129,6 +137,122 @@ static struct futex_hash_bucket *futex_hash_private(union futex_key *key,
return &fhb[hash & hash_mask];
}
+static void futex_rehash_current_users(struct futex_hash_bucket_private *old,
+ struct futex_hash_bucket_private *new)
+{
+ struct futex_hash_bucket *hb_old, *hb_new;
+ unsigned int slots = old->hash_mask + 1;
+ u32 hash_mask = new->hash_mask;
+ unsigned int i;
+
+ for (i = 0; i < slots; i++) {
+ struct futex_q *this, *tmp;
+
+ hb_old = &old->queues[i];
+
+ spin_lock(&hb_old->lock);
+ plist_for_each_entry_safe(this, tmp, &hb_old->chain, list) {
+
+ plist_del(&this->list, &hb_old->chain);
+ futex_hb_waiters_dec(hb_old);
+
+ WARN_ON_ONCE(this->lock_ptr != &hb_old->lock);
+
+ hb_new = futex_hash_private(&this->key, new->queues, hash_mask);
+ futex_hb_waiters_inc(hb_new);
+ /*
+ * The new pointer isn't published yet but an already
+ * moved user can be unqueued due to timeout or signal.
+ */
+ spin_lock_nested(&hb_new->lock, SINGLE_DEPTH_NESTING);
+ plist_add(&this->list, &hb_new->chain);
+ this->lock_ptr = &hb_new->lock;
+ spin_unlock(&hb_new->lock);
+ }
+ spin_unlock(&hb_old->lock);
+ }
+}
+
+static void __futex_assign_new_hb(struct futex_hash_bucket_private *hb_p_new,
+ struct mm_struct *mm)
+{
+ struct futex_hash_bucket_private *hb_p;
+ bool drop_init_ref = hb_p_new != NULL;
+
+ if (!hb_p_new) {
+ hb_p_new = mm->futex_hash_new;
+ mm->futex_hash_new = NULL;
+ }
+ /* Someone was quicker, the current mask is valid */
+ if (!hb_p_new)
+ return;
+
+ hb_p = rcu_dereference_check(mm->futex_hash_bucket,
+ lockdep_is_held(&mm->futex_hash_lock));
+ if (hb_p) {
+ if (hb_p->hash_mask >= hb_p_new->hash_mask) {
+ /* It was increased again while we were waiting */
+ kvfree(hb_p_new);
+ return;
+ }
+ /*
+ * If the caller started the resize then the initial reference
+ * needs needs to be dropped. If the object can not be
+ * deconstructed we save hb_p_new for later and ensure the
+ * reference counter is not dropped again.
+ */
+ if (drop_init_ref &&
+ (hb_p->initial_ref_dropped || !rcuref_put(&hb_p->users))) {
+ mm->futex_hash_new = hb_p_new;
+ hb_p->initial_ref_dropped = true;
+ return;
+ }
+
+ futex_rehash_current_users(hb_p, hb_p_new);
+ }
+ rcu_assign_pointer(mm->futex_hash_bucket, hb_p_new);
+ kvfree_rcu(hb_p, rcu);
+}
+
+static void futex_assign_new_hb(struct futex_hash_bucket_private *hb_p_new)
+{
+ struct mm_struct *mm = current->mm;
+
+ scoped_guard(mutex, &mm->futex_hash_lock)
+ __futex_assign_new_hb(hb_p_new, mm);
+}
+
+static struct futex_hash_bucket_private *futex_get_private_hb(union futex_key *key)
+{
+ struct mm_struct *mm = current->mm;
+
+ if (!futex_key_is_private(key))
+ return NULL;
+ /*
+ * Ideally we don't loop. If there is a replacement in progress
+ * then a new local hash is already prepared. We fail to obtain
+ * a reference only after the last user returned its reference.
+ * In that case futex_assign_new_hb() blocks on the futex_hash_lock
+ * and we either have to perform the replacement ourselves or wait
+ * while someone else is doing the job. Either way, after we
+ * return we can acquire a reference on the new local hash
+ * (unless it is replaced again).
+ */
+again:
+ scoped_guard(rcu) {
+ struct futex_hash_bucket_private *hb_p;
+
+ hb_p = rcu_dereference(mm->futex_hash_bucket);
+ if (!hb_p)
+ return NULL;
+
+ if (rcuref_get(&hb_p->users))
+ return hb_p;
+ }
+ futex_assign_new_hb(NULL);
+ goto again;
+}
+
/**
* futex_hash - Return the hash bucket in the global hash
* @key: Pointer to the futex key for which the hash is calculated
@@ -139,12 +263,12 @@ static struct futex_hash_bucket *futex_hash_private(union futex_key *key,
*/
struct futex_hash_bucket *futex_hash(union futex_key *key)
{
- struct futex_hash_bucket *fhb;
+ struct futex_hash_bucket_private *hb_p = NULL;
u32 hash;
- fhb = current->mm->futex_hash_bucket;
- if (fhb && futex_key_is_private(key))
- return futex_hash_private(key, fhb, current->mm->futex_hash_mask);
+ hb_p = futex_get_private_hb(key);
+ if (hb_p)
+ return futex_hash_private(key, hb_p->queues, hb_p->hash_mask);
hash = jhash2((u32 *)key,
offsetof(typeof(*key), both.offset) / 4,
@@ -154,6 +278,16 @@ struct futex_hash_bucket *futex_hash(union futex_key *key)
void futex_hash_put(struct futex_hash_bucket *hb)
{
+ struct futex_hash_bucket_private *hb_p;
+
+ if (hb->hb_slot == 0)
+ return;
+ hb_p = container_of(hb, struct futex_hash_bucket_private,
+ queues[hb->hb_slot - 1]);
+
+ if (!rcuref_put(&hb_p->users))
+ return;
+ /* If hb_p is due for destruction, the replacement is deferred to futex_hash() */
}
/**
@@ -601,6 +735,8 @@ int futex_unqueue(struct futex_q *q)
spinlock_t *lock_ptr;
int ret = 0;
+ /* RCU so lock_ptr is not going away during locking. */
+ guard(rcu)();
/* In the common case we don't take the spinlock, which is nice. */
retry:
/*
@@ -1008,10 +1144,23 @@ static void compat_exit_robust_list(struct task_struct *curr)
static void exit_pi_state_list(struct task_struct *curr)
{
struct list_head *next, *head = &curr->pi_state_list;
+ struct futex_hash_bucket_private *hb_p;
struct futex_pi_state *pi_state;
struct futex_hash_bucket *hb;
union futex_key key = FUTEX_KEY_INIT;
+ /*
+ * Take the futex_hash_lock to ensure that the local hash remains unchanged.
+ * This is important so we can invoke futex_hash() under the pi_lock.
+ */
+ guard(mutex)(&curr->mm->futex_hash_lock);
+ hb_p = rcu_dereference_check(curr->mm->futex_hash_bucket,
+ lockdep_is_held(&curr->mm->futex_hash_lock));
+ if (hb_p) {
+ if (rcuref_read(&hb_p->users) == 0)
+ __futex_assign_new_hb(NULL, curr->mm);
+ }
+
/*
* We are a ZOMBIE and nobody can enqueue itself on
* pi_state_list anymore, but we have to be careful
@@ -1037,6 +1186,7 @@ static void exit_pi_state_list(struct task_struct *curr)
if (!refcount_inc_not_zero(&pi_state->refcount)) {
raw_spin_unlock_irq(&curr->pi_lock);
cpu_relax();
+ futex_hash_put(hb);
raw_spin_lock_irq(&curr->pi_lock);
continue;
}
@@ -1052,7 +1202,7 @@ static void exit_pi_state_list(struct task_struct *curr)
if (head->next != next) {
/* retain curr->pi_lock for the loop invariant */
raw_spin_unlock(&pi_state->pi_mutex.wait_lock);
- spin_unlock(&hb->lock);
+ futex_hb_unlock_put(hb);
put_pi_state(pi_state);
continue;
}
@@ -1064,7 +1214,7 @@ static void exit_pi_state_list(struct task_struct *curr)
raw_spin_unlock(&curr->pi_lock);
raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
- spin_unlock(&hb->lock);
+ futex_hb_unlock_put(hb);
rt_mutex_futex_unlock(&pi_state->pi_mutex);
put_pi_state(pi_state);
@@ -1185,8 +1335,9 @@ void futex_exit_release(struct task_struct *tsk)
futex_cleanup_end(tsk, FUTEX_STATE_DEAD);
}
-static void futex_hash_bucket_init(struct futex_hash_bucket *fhb)
+static void futex_hash_bucket_init(struct futex_hash_bucket *fhb, unsigned int slot)
{
+ fhb->hb_slot = slot;
atomic_set(&fhb->waiters, 0);
plist_head_init(&fhb->chain);
spin_lock_init(&fhb->lock);
@@ -1194,20 +1345,27 @@ static void futex_hash_bucket_init(struct futex_hash_bucket *fhb)
void futex_hash_free(struct mm_struct *mm)
{
- kvfree(mm->futex_hash_bucket);
+ struct futex_hash_bucket_private *hb_p;
+
+ /* We are the last one and we hold the initial reference */
+ hb_p = rcu_dereference_check(mm->futex_hash_bucket, true);
+ if (!hb_p)
+ return;
+
+ kvfree(mm->futex_hash_new);
+ if (WARN_ON(!rcuref_put(&hb_p->users)))
+ return;
+
+ kvfree(hb_p);
}
static int futex_hash_allocate(unsigned int hash_slots)
{
- struct futex_hash_bucket *fhb;
+ struct futex_hash_bucket_private *hb_p, *hb_tofree = NULL;
+ struct mm_struct *mm = current->mm;
+ size_t alloc_size;
int i;
- if (current->mm->futex_hash_bucket)
- return -EALREADY;
-
- if (!thread_group_leader(current))
- return -EINVAL;
-
if (hash_slots == 0)
hash_slots = 16;
if (hash_slots < 2)
@@ -1217,16 +1375,38 @@ static int futex_hash_allocate(unsigned int hash_slots)
if (!is_power_of_2(hash_slots))
hash_slots = rounddown_pow_of_two(hash_slots);
- fhb = kvmalloc_array(hash_slots, sizeof(struct futex_hash_bucket), GFP_KERNEL_ACCOUNT);
- if (!fhb)
+ if (unlikely(check_mul_overflow(hash_slots, sizeof(struct futex_hash_bucket),
+ &alloc_size)))
return -ENOMEM;
- current->mm->futex_hash_mask = hash_slots - 1;
+ if (unlikely(check_add_overflow(alloc_size, sizeof(struct futex_hash_bucket_private),
+ &alloc_size)))
+ return -ENOMEM;
+
+ hb_p = kvmalloc(alloc_size, GFP_KERNEL_ACCOUNT);
+ if (!hb_p)
+ return -ENOMEM;
+
+ rcuref_init(&hb_p->users, 1);
+ hb_p->initial_ref_dropped = false;
+ hb_p->hash_mask = hash_slots - 1;
for (i = 0; i < hash_slots; i++)
- futex_hash_bucket_init(&fhb[i]);
+ futex_hash_bucket_init(&hb_p->queues[i], i + 1);
- current->mm->futex_hash_bucket = fhb;
+ scoped_guard(mutex, &mm->futex_hash_lock) {
+ if (mm->futex_hash_new) {
+ if (mm->futex_hash_new->hash_mask <= hb_p->hash_mask) {
+ hb_tofree = mm->futex_hash_new;
+ } else {
+ hb_tofree = hb_p;
+ hb_p = mm->futex_hash_new;
+ }
+ mm->futex_hash_new = NULL;
+ }
+ __futex_assign_new_hb(hb_p, mm);
+ }
+ kvfree(hb_tofree);
return 0;
}
@@ -1237,8 +1417,12 @@ int futex_hash_allocate_default(void)
static int futex_hash_get_slots(void)
{
- if (current->mm->futex_hash_bucket)
- return current->mm->futex_hash_mask + 1;
+ struct futex_hash_bucket_private *hb_p;
+
+ guard(rcu)();
+ hb_p = rcu_dereference(current->mm->futex_hash_bucket);
+ if (hb_p)
+ return hb_p->hash_mask + 1;
return 0;
}
@@ -1280,7 +1464,7 @@ static int __init futex_init(void)
futex_hashsize = 1UL << futex_shift;
for (i = 0; i < futex_hashsize; i++)
- futex_hash_bucket_init(&futex_queues[i]);
+ futex_hash_bucket_init(&futex_queues[i], 0);
return 0;
}
diff --git a/kernel/futex/futex.h b/kernel/futex/futex.h
index 3c78126d4079e..8f6ff83f9a499 100644
--- a/kernel/futex/futex.h
+++ b/kernel/futex/futex.h
@@ -115,6 +115,7 @@ static inline bool should_fail_futex(bool fshared)
*/
struct futex_hash_bucket {
atomic_t waiters;
+ unsigned int hb_slot;
spinlock_t lock;
struct plist_head chain;
} ____cacheline_aligned_in_smp;
diff --git a/kernel/futex/requeue.c b/kernel/futex/requeue.c
index db27fbf68521c..1c1c43251120e 100644
--- a/kernel/futex/requeue.c
+++ b/kernel/futex/requeue.c
@@ -87,6 +87,11 @@ void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1,
futex_hb_waiters_inc(hb2);
plist_add(&q->list, &hb2->chain);
q->lock_ptr = &hb2->lock;
+ /*
+ * hb1 and hb2 belong to the same futex_hash_bucket_private
+ * because if we managed to get a reference on hb1 then it can't be
+ * replaced. Therefore we avoid put(hb1)+get(hb2) here.
+ */
}
q->key = *key2;
}
diff --git a/kernel/futex/waitwake.c b/kernel/futex/waitwake.c
index 03e2f546967d8..f5ee140886a50 100644
--- a/kernel/futex/waitwake.c
+++ b/kernel/futex/waitwake.c
@@ -173,8 +173,10 @@ int futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
hb = futex_hash(&key);
/* Make sure we really have tasks to wakeup */
- if (!futex_hb_waiters_pending(hb))
+ if (!futex_hb_waiters_pending(hb)) {
+ futex_hash_put(hb);
return ret;
+ }
spin_lock(&hb->lock);
--
2.45.2