Message-ID: <20250312151848.RlB_XuHA@linutronix.de>
Date: Wed, 12 Mar 2025 16:18:48 +0100
From: Sebastian Andrzej Siewior <bigeasy@...utronix.de>
To: linux-kernel@...r.kernel.org
Cc: André Almeida <andrealmeid@...lia.com>,
Darren Hart <dvhart@...radead.org>,
Davidlohr Bueso <dave@...olabs.net>, Ingo Molnar <mingo@...hat.com>,
Juri Lelli <juri.lelli@...hat.com>,
Peter Zijlstra <peterz@...radead.org>,
Thomas Gleixner <tglx@...utronix.de>,
Valentin Schneider <vschneid@...hat.com>,
Waiman Long <longman@...hat.com>
Subject: Re: [PATCH v10 00/21] futex: Add support for task-local hash maps,
 FUTEX2_NUMA and FUTEX2_MPOL
On 2025-03-12 16:16:13 [+0100], To linux-kernel@...r.kernel.org wrote:
> The complete tree is at
> https://git.kernel.org/pub/scm/linux/kernel/git/bigeasy/staging.git/log/?h=futex_local_v10
> https://git.kernel.org/pub/scm/linux/kernel/git/bigeasy/staging.git futex_local_v10
>
> v9…v10: https://lore.kernel.org/all/20250225170914.289358-1-bigeasy@linutronix.de/
The exact diff vs peterz/locking/futex:
diff --git a/include/linux/futex.h b/include/linux/futex.h
index 0cdd5882e89c1..19c37afa0432a 100644
--- a/include/linux/futex.h
+++ b/include/linux/futex.h
@@ -82,12 +82,7 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
u32 __user *uaddr2, u32 val2, u32 val3);
int futex_hash_prctl(unsigned long arg2, unsigned long arg3);
-#ifdef CONFIG_BASE_SMALL
-static inline int futex_hash_allocate_default(void) { return 0; }
-static inline void futex_hash_free(struct mm_struct *mm) { }
-static inline void futex_mm_init(struct mm_struct *mm) { }
-#else /* !CONFIG_BASE_SMALL */
-
+#ifdef CONFIG_FUTEX_PRIVATE_HASH
int futex_hash_allocate_default(void);
void futex_hash_free(struct mm_struct *mm);
@@ -97,7 +92,11 @@ static inline void futex_mm_init(struct mm_struct *mm)
mutex_init(&mm->futex_hash_lock);
}
-#endif /* CONFIG_BASE_SMALL */
+#else /* !CONFIG_FUTEX_PRIVATE_HASH */
+static inline int futex_hash_allocate_default(void) { return 0; }
+static inline void futex_hash_free(struct mm_struct *mm) { }
+static inline void futex_mm_init(struct mm_struct *mm) { }
+#endif /* CONFIG_FUTEX_PRIVATE_HASH */
#else /* !CONFIG_FUTEX */
static inline void futex_init_task(struct task_struct *tsk) { }
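The flipped #ifdef makes the stubs, not the real functions, the fallback,
so generic mm setup/teardown paths can call these helpers unconditionally.
A minimal sketch of that usage pattern (the call-site names below are
illustrative, not part of this diff):

	/*
	 * Hedged sketch: no CONFIG_FUTEX_PRIVATE_HASH conditionals are
	 * needed at the call sites; the stubs compile away.
	 */
	static void example_mm_setup(struct mm_struct *mm)
	{
		futex_mm_init(mm);	/* mutex_init() or no-op stub */
	}

	static void example_mm_teardown(struct mm_struct *mm)
	{
		futex_hash_free(mm);	/* frees the private hash or no-op */
	}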
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 9399ee7d40201..e0e8adbe66bdd 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -938,7 +938,7 @@ struct mm_struct {
*/
seqcount_t mm_lock_seq;
#endif
-#if defined(CONFIG_FUTEX) && !defined(CONFIG_BASE_SMALL)
+#ifdef CONFIG_FUTEX_PRIVATE_HASH
struct mutex futex_hash_lock;
struct futex_private_hash __rcu *futex_phash;
struct futex_private_hash *futex_phash_new;
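For orientation, the roles of the three fields, annotated based on the
core.c hunks below (the comments are mine, not part of mm_types.h):

	struct mutex			futex_hash_lock;  /* serializes hash replacement */
	struct futex_private_hash __rcu	*futex_phash;	  /* current hash, RCU-published */
	struct futex_private_hash	*futex_phash_new; /* parked replacement, installed
							   * once the old hash is DEAD */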
diff --git a/include/linux/rcuref.h b/include/linux/rcuref.h
index 6322d8c1c6b42..2fb2af6d98249 100644
--- a/include/linux/rcuref.h
+++ b/include/linux/rcuref.h
@@ -30,7 +30,11 @@ static inline void rcuref_init(rcuref_t *ref, unsigned int cnt)
* rcuref_read - Read the number of held reference counts of a rcuref
* @ref: Pointer to the reference count
*
- * Return: The number of held references (0 ... N)
+ * Return: The number of held references (0 ... N). The value 0 does not
+ * indicate that it is safe to schedule the object, protected by this reference
+ * counter, for deconstruction.
+ * If you want to know if the reference counter has been marked DEAD (as
+ * signaled by rcuref_put()) please use rcuref_is_dead().
*/
static inline unsigned int rcuref_read(rcuref_t *ref)
{
@@ -40,6 +44,22 @@ static inline unsigned int rcuref_read(rcuref_t *ref)
return c >= RCUREF_RELEASED ? 0 : c + 1;
}
+/**
+ * rcuref_is_dead - Check if the rcuref has been already marked dead
+ * @ref: Pointer to the reference count
+ *
+ * Return: True if the object has been marked DEAD. This signals that a previous
+ * invocation of rcuref_put() returned true on this reference counter meaning
+ * the protected object can safely be scheduled for deconstruction.
+ * Otherwise, returns false.
+ */
+static inline bool rcuref_is_dead(rcuref_t *ref)
+{
+ unsigned int c = atomic_read(&ref->refcnt);
+
+ return (c >= RCUREF_RELEASED) && (c < RCUREF_NOREF);
+}
+
extern __must_check bool rcuref_get_slowpath(rcuref_t *ref);
/**
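The distinction between the two helpers matters here: rcuref_read()
returning 0 only means there are no current holders, while
rcuref_is_dead() means a final rcuref_put() has happened and every
subsequent rcuref_get() will fail. A hedged lookup-side sketch of the
protocol this enables (simplified, not the literal futex_private_hash()
from the futex code below):

	static struct futex_private_hash *fph_try_get(struct mm_struct *mm)
	{
		struct futex_private_hash *fph;

		guard(rcu)();
		fph = rcu_dereference(mm->futex_phash);
		/*
		 * A failing rcuref_get() means the hash was marked DEAD by
		 * the final rcuref_put(); a replacement will be installed
		 * under mm->futex_hash_lock and the caller retries.
		 */
		if (fph && rcuref_get(&fph->users))
			return fph;
		return NULL;
	}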
diff --git a/init/Kconfig b/init/Kconfig
index a0ea04c177842..a4502a9077e03 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1683,6 +1683,16 @@ config FUTEX_PI
depends on FUTEX && RT_MUTEXES
default y
+config FUTEX_PRIVATE_HASH
+ bool
+ depends on FUTEX && !BASE_SMALL && MMU
+ default y
+
+config FUTEX_MPOL
+ bool
+ depends on FUTEX && NUMA
+ default y
+
config EPOLL
bool "Enable eventpoll support" if EXPERT
default y
diff --git a/kernel/futex/core.c b/kernel/futex/core.c
index 976a487bf3ad5..65523f3cfe32e 100644
--- a/kernel/futex/core.c
+++ b/kernel/futex/core.c
@@ -136,7 +136,7 @@ static inline bool futex_key_is_private(union futex_key *key)
static struct futex_hash_bucket *
__futex_hash(union futex_key *key, struct futex_private_hash *fph);
-#ifndef CONFIG_BASE_SMALL
+#ifdef CONFIG_FUTEX_PRIVATE_HASH
static struct futex_hash_bucket *
__futex_hash_private(union futex_key *key, struct futex_private_hash *fph)
{
@@ -196,12 +196,12 @@ static bool __futex_pivot_hash(struct mm_struct *mm,
{
struct futex_private_hash *fph;
- lockdep_assert_held(&mm->futex_hash_lock);
WARN_ON_ONCE(mm->futex_phash_new);
- fph = mm->futex_phash;
+ fph = rcu_dereference_protected(mm->futex_phash,
+ lockdep_is_held(&mm->futex_hash_lock));
if (fph) {
- if (rcuref_read(&fph->users) != 0) {
+ if (!rcuref_is_dead(&fph->users)) {
mm->futex_phash_new = new;
return false;
}
@@ -262,6 +262,10 @@ bool futex_private_hash_get(struct futex_private_hash *fph)
void futex_private_hash_put(struct futex_private_hash *fph)
{
+ /*
+ * Ignore the result; the DEAD state is observed later, either when
+ * rcuref_get() starts failing or explicitly via rcuref_is_dead().
+ */
if (rcuref_put(&fph->users))
wake_up_var(fph->mm);
}
@@ -301,7 +305,7 @@ void futex_hash_put(struct futex_hash_bucket *hb)
futex_private_hash_put(fph);
}
-#else
+#else /* !CONFIG_FUTEX_PRIVATE_HASH */
static inline struct futex_hash_bucket *
__futex_hash_private(union futex_key *key, struct futex_private_hash *fph)
@@ -314,8 +318,9 @@ struct futex_hash_bucket *futex_hash(union futex_key *key)
return __futex_hash(key, NULL);
}
-#endif /* CONFIG_BASE_SMALL */
+#endif /* CONFIG_FUTEX_PRIVATE_HASH */
+#ifdef CONFIG_FUTEX_MPOL
static int __futex_key_to_node(struct mm_struct *mm, unsigned long addr)
{
struct vm_area_struct *vma = vma_lookup(mm, addr);
@@ -325,7 +330,7 @@ static int __futex_key_to_node(struct mm_struct *mm, unsigned long addr)
if (!vma)
return FUTEX_NO_NODE;
- mpol = vma->vm_policy;
+ mpol = vma_policy(vma);
if (!mpol)
return FUTEX_NO_NODE;
@@ -373,6 +378,14 @@ static int futex_mpol(struct mm_struct *mm, unsigned long addr)
guard(mmap_read_lock)(mm);
return __futex_key_to_node(mm, addr);
}
+#else /* !CONFIG_FUTEX_MPOL */
+
+static int futex_mpol(struct mm_struct *mm, unsigned long addr)
+{
+ return FUTEX_NO_NODE;
+}
+
+#endif /* CONFIG_FUTEX_MPOL */
/**
* futex_hash - Return the hash bucket in the global hash
@@ -420,7 +433,6 @@ __futex_hash(union futex_key *key, struct futex_private_hash *fph)
return &futex_queues[node][hash & futex_hashmask];
}
-
/**
* futex_setup_timer - set up the sleeping hrtimer.
* @time: ptr to the given timeout value
@@ -932,9 +944,6 @@ int futex_unqueue(struct futex_q *q)
void futex_q_lockptr_lock(struct futex_q *q)
{
-#if 0
- struct futex_hash_bucket *hb;
-#endif
spinlock_t *lock_ptr;
/*
@@ -949,18 +958,6 @@ void futex_q_lockptr_lock(struct futex_q *q)
spin_unlock(lock_ptr);
goto retry;
}
-#if 0
- hb = container_of(lock_ptr, struct futex_hash_bucket, lock);
- /*
- * The caller needs to either hold a reference on the hash (to ensure
- * that the hash is not resized) _or_ be enqueued on the hash. This
- * ensures that futex_q::lock_ptr is updated while moved to the new
- * hash during resize.
- * Once the hash bucket is locked the resize operation, which might be
- * in progress, will block on the lock.
- */
- return hb;
-#endif
}
/*
@@ -1497,7 +1494,7 @@ void futex_exit_release(struct task_struct *tsk)
static void futex_hash_bucket_init(struct futex_hash_bucket *fhb,
struct futex_private_hash *fph)
{
-#ifndef CONFIG_BASE_SMALL
+#ifdef CONFIG_FUTEX_PRIVATE_HASH
fhb->priv = fph;
#endif
atomic_set(&fhb->waiters, 0);
@@ -1505,21 +1502,30 @@ static void futex_hash_bucket_init(struct futex_hash_bucket *fhb,
spin_lock_init(&fhb->lock);
}
-#ifndef CONFIG_BASE_SMALL
+#ifdef CONFIG_FUTEX_PRIVATE_HASH
void futex_hash_free(struct mm_struct *mm)
{
+ struct futex_private_hash *fph;
+
kvfree(mm->futex_phash_new);
- kvfree(mm->futex_phash);
+ fph = rcu_dereference_raw(mm->futex_phash);
+ if (fph) {
+ WARN_ON_ONCE(rcuref_read(&fph->users) > 1);
+ kvfree(fph);
+ }
}
static bool futex_pivot_pending(struct mm_struct *mm)
{
+ struct futex_private_hash *fph;
+
guard(rcu)();
if (!mm->futex_phash_new)
return false;
- return !rcuref_read(&mm->futex_phash->users);
+ fph = rcu_dereference(mm->futex_phash);
+ return !rcuref_read(&fph->users);
}
static bool futex_hash_less(struct futex_private_hash *a,
@@ -1560,7 +1566,7 @@ static int futex_hash_allocate(unsigned int hash_slots, bool custom)
*/
scoped_guard (rcu) {
fph = rcu_dereference(mm->futex_phash);
- if (fph && !mm->futex_phash->hash_mask) {
+ if (fph && !fph->hash_mask) {
if (custom)
return -EBUSY;
return 0;
@@ -1591,7 +1597,8 @@ static int futex_hash_allocate(unsigned int hash_slots, bool custom)
struct futex_private_hash *free __free(kvfree) = NULL;
struct futex_private_hash *cur, *new;
- cur = mm->futex_phash;
+ cur = rcu_dereference_protected(mm->futex_phash,
+ lockdep_is_held(&mm->futex_hash_lock));
new = mm->futex_phash_new;
mm->futex_phash_new = NULL;
@@ -1602,7 +1609,7 @@ static int futex_hash_allocate(unsigned int hash_slots, bool custom)
* allocated a replacement hash, drop the initial
* reference on the existing hash.
*/
- futex_private_hash_put(mm->futex_phash);
+ futex_private_hash_put(cur);
}
if (new) {
@@ -1683,7 +1690,7 @@ static int futex_hash_get_slots(void)
static int futex_hash_allocate(unsigned int hash_slots, bool custom)
{
- return 0;
+ return -EINVAL;
}
static int futex_hash_get_slots(void)
@@ -1723,6 +1730,7 @@ static int __init futex_init(void)
#else
hashsize = 256 * num_possible_cpus();
hashsize /= num_possible_nodes();
+ hashsize = max(4, hashsize);
hashsize = roundup_pow_of_two(hashsize);
#endif
futex_hashshift = ilog2(hashsize);
@@ -1740,12 +1748,15 @@ static int __init futex_init(void)
BUG_ON(!table);
for (i = 0; i < hashsize; i++)
- futex_hash_bucket_init(&table[i], 0);
+ futex_hash_bucket_init(&table[i], NULL);
futex_queues[n] = table;
}
futex_hashmask = hashsize - 1;
+ pr_info("futex hash table entries: %lu (%lu bytes on %d NUMA nodes, total %lu KiB, %s).\n",
+ hashsize, size, num_possible_nodes(), size * num_possible_nodes() / 1024,
+ order > MAX_PAGE_ORDER ? "vmalloc" : "linear");
return 0;
}
core_initcall(futex_init);
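For reference, the sizing math in futex_init() with the new clamp, as a
userspace re-computation (the 8-CPU/2-node topology is an assumed
example; pow2_roundup() stands in for the kernel's roundup_pow_of_two()):

	#include <stdio.h>

	static unsigned long pow2_roundup(unsigned long n)
	{
		unsigned long p = 1;

		while (p < n)
			p <<= 1;
		return p;
	}

	int main(void)
	{
		unsigned long cpus = 8, nodes = 2;
		unsigned long hashsize = 256 * cpus / nodes;

		if (hashsize < 4)
			hashsize = 4;	/* the new max(4, hashsize) clamp */
		hashsize = pow2_roundup(hashsize);
		printf("%lu buckets per node\n", hashsize);	/* 1024 */
		return 0;
	}

The max(4, ...) clamp guards configurations where num_possible_nodes()
is large relative to the possible CPUs, where the division could
otherwise yield a degenerate (even zero-sized) per-node table.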
diff --git a/kernel/futex/futex.h b/kernel/futex/futex.h
index 40f06523a3565..52e9c0c4b6c87 100644
--- a/kernel/futex/futex.h
+++ b/kernel/futex/futex.h
@@ -223,14 +223,15 @@ futex_setup_timer(ktime_t *time, struct hrtimer_sleeper *timeout,
extern struct futex_hash_bucket *futex_hash(union futex_key *key);
-#ifndef CONFIG_BASE_SMALL
+#ifdef CONFIG_FUTEX_PRIVATE_HASH
extern void futex_hash_get(struct futex_hash_bucket *hb);
extern void futex_hash_put(struct futex_hash_bucket *hb);
extern struct futex_private_hash *futex_private_hash(void);
extern bool futex_private_hash_get(struct futex_private_hash *fph);
extern void futex_private_hash_put(struct futex_private_hash *fph);
-#else
+
+#else /* !CONFIG_FUTEX_PRIVATE_HASH */
static inline void futex_hash_get(struct futex_hash_bucket *hb) { }
static inline void futex_hash_put(struct futex_hash_bucket *hb) { }
diff --git a/mm/nommu.c b/mm/nommu.c
index baa79abdaf037..d04e601a8f4d7 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -209,6 +209,11 @@ EXPORT_SYMBOL(vmalloc_noprof);
void *vmalloc_huge_noprof(unsigned long size, gfp_t gfp_mask) __weak __alias(__vmalloc_noprof);
+void *vmalloc_huge_node_noprof(unsigned long size, gfp_t gfp_mask, int node)
+{
+ return vmalloc_huge_noprof(size, gfp_mask);
+}
+
/*
* vzalloc - allocate virtually contiguous memory with zero fill
*
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 39fe43183a64f..69247b46413ca 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -3968,9 +3968,9 @@ EXPORT_SYMBOL_GPL(vmalloc_huge_noprof);
void *vmalloc_huge_node_noprof(unsigned long size, gfp_t gfp_mask, int node)
{
- return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END,
- gfp_mask, PAGE_KERNEL, VM_ALLOW_HUGE_VMAP,
- node, __builtin_return_address(0));
+ return __vmalloc_node_range_noprof(size, 1, VMALLOC_START, VMALLOC_END,
+ gfp_mask, PAGE_KERNEL, VM_ALLOW_HUGE_VMAP,
+ node, __builtin_return_address(0));
}
/**
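The vmalloc.c change switches to the _noprof variant because
vmalloc_huge_node_noprof() is itself wrapped by alloc_hooks(); calling
the profiled __vmalloc_node_range() from inside it would attribute the
allocation to mm/vmalloc.c instead of the real caller. A hedged sketch
of the consumer in futex_init(), matching the "vmalloc"/"linear" strings
in the pr_info() above (assumed shape, not a hunk from this diff):

	static void *alloc_node_table(int node, size_t size)
	{
		unsigned int order = get_order(size);

		if (order > MAX_PAGE_ORDER)
			return vmalloc_huge_node(size, GFP_KERNEL, node);	/* "vmalloc" */
		return alloc_pages_exact_nid(node, size, GFP_KERNEL);		/* "linear" */
	}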
Sebastian