Message-ID: <20250715102912.GQ1613200@noisy.programming.kicks-ass.net>
Date: Tue, 15 Jul 2025 12:29:12 +0200
From: Peter Zijlstra <peterz@...radead.org>
To: Steven Rostedt <rostedt@...nel.org>
Cc: linux-kernel@...r.kernel.org, linux-trace-kernel@...r.kernel.org,
bpf@...r.kernel.org, x86@...nel.org,
Masami Hiramatsu <mhiramat@...nel.org>,
Mathieu Desnoyers <mathieu.desnoyers@...icios.com>,
Josh Poimboeuf <jpoimboe@...nel.org>,
Ingo Molnar <mingo@...nel.org>, Jiri Olsa <jolsa@...nel.org>,
Namhyung Kim <namhyung@...nel.org>,
Thomas Gleixner <tglx@...utronix.de>,
Andrii Nakryiko <andrii@...nel.org>,
Indu Bhagat <indu.bhagat@...cle.com>,
"Jose E. Marchesi" <jemarch@....org>,
Beau Belgrave <beaub@...ux.microsoft.com>,
Jens Remus <jremus@...ux.ibm.com>,
Linus Torvalds <torvalds@...ux-foundation.org>,
Andrew Morton <akpm@...ux-foundation.org>,
Jens Axboe <axboe@...nel.dk>, Florian Weimer <fweimer@...hat.com>,
Sam James <sam@...too.org>
Subject: Re: [PATCH v13 10/14] unwind: Clear unwind_mask on exit back to user
space
On Mon, Jul 07, 2025 at 09:22:49PM -0400, Steven Rostedt wrote:
> diff --git a/include/linux/unwind_deferred.h b/include/linux/unwind_deferred.h
> index 12bffdb0648e..587e120c0fd6 100644
> --- a/include/linux/unwind_deferred.h
> +++ b/include/linux/unwind_deferred.h
> @@ -18,6 +18,14 @@ struct unwind_work {
>
> #ifdef CONFIG_UNWIND_USER
>
> +#define UNWIND_PENDING_BIT (BITS_PER_LONG - 1)
> +#define UNWIND_PENDING BIT(UNWIND_PENDING_BIT)
Since it really didn't matter what bit you took, why not take bit 0?
> +
> +enum {
> + UNWIND_ALREADY_PENDING = 1,
> + UNWIND_ALREADY_EXECUTED = 2,
> +};
> +
> void unwind_task_init(struct task_struct *task);
> void unwind_task_free(struct task_struct *task);
>
> @@ -29,15 +37,26 @@ void unwind_deferred_cancel(struct unwind_work *work);
>
> static __always_inline void unwind_reset_info(void)
> {
> - if (unlikely(current->unwind_info.id.id))
> + struct unwind_task_info *info = &current->unwind_info;
> + unsigned long bits;
> +
> + /* Was there any unwinding? */
> + if (unlikely(info->unwind_mask)) {
> + bits = info->unwind_mask;
> + do {
> + /* Is a task_work going to run again before going back */
> + if (bits & UNWIND_PENDING)
> + return;
> + } while (!try_cmpxchg(&info->unwind_mask, &bits, 0UL));
> current->unwind_info.id.id = 0;
> + }
> /*
> * As unwind_user_faultable() can be called directly and
> * depends on nr_entries being cleared on exit to user,
> * this needs to be a separate conditional.
> */
> - if (unlikely(current->unwind_info.cache))
> - current->unwind_info.cache->nr_entries = 0;
> + if (unlikely(info->cache))
> + info->cache->nr_entries = 0;
> }
>
> #else /* !CONFIG_UNWIND_USER */
> diff --git a/kernel/unwind/deferred.c b/kernel/unwind/deferred.c
> index 9aed9866f460..256458f3eafe 100644
> --- a/kernel/unwind/deferred.c
> +++ b/kernel/unwind/deferred.c
> @@ -47,6 +47,11 @@ static LIST_HEAD(callbacks);
> static unsigned long unwind_mask;
> DEFINE_STATIC_SRCU(unwind_srcu);
>
> +static inline bool unwind_pending(struct unwind_task_info *info)
> +{
> + return test_bit(UNWIND_PENDING_BIT, &info->unwind_mask);
> +}
> +
> /*
> * This is a unique percpu identifier for a given task entry context.
> * Conceptually, it's incremented every time the CPU enters the kernel from
> @@ -143,14 +148,17 @@ static void unwind_deferred_task_work(struct callback_head *head)
> struct unwind_task_info *info = container_of(head, struct unwind_task_info, work);
> struct unwind_stacktrace trace;
> struct unwind_work *work;
> + unsigned long bits;
> u64 cookie;
> int idx;
>
> - if (WARN_ON_ONCE(!local_read(&info->pending)))
> + if (WARN_ON_ONCE(!unwind_pending(info)))
> return;
>
> - /* Allow work to come in again */
> - local_set(&info->pending, 0);
> + /* Clear pending bit but make sure to have the current bits */
> + bits = READ_ONCE(info->unwind_mask);
> + while (!try_cmpxchg(&info->unwind_mask, &bits, bits & ~UNWIND_PENDING))
> + ;
We have:
bits = atomic_long_fetch_andnot(UNWIND_PENDING, &info->unwind_mask);
for that. A fair number of architectures can actually do this better than
a cmpxchg loop.
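For illustration, a minimal userspace sketch (C11 atomics, not the kernel
primitives; the names and values are made up for the example) showing that
both forms return the same old bits, while the fetch-and-clear is a single
RMW that needs no retry:

#include <stdatomic.h>
#include <stdio.h>

#define UNWIND_PENDING	(1UL << 0)

/* cmpxchg-loop form, as open-coded in the posted patch */
static unsigned long clear_pending_cmpxchg(atomic_ulong *mask)
{
	unsigned long bits = atomic_load(mask);

	while (!atomic_compare_exchange_weak(mask, &bits,
					     bits & ~UNWIND_PENDING))
		;
	return bits;
}

/* single-RMW form, analogous to atomic_long_fetch_andnot() */
static unsigned long clear_pending_fetch_andnot(atomic_ulong *mask)
{
	return atomic_fetch_and(mask, ~UNWIND_PENDING);
}

int main(void)
{
	atomic_ulong a = UNWIND_PENDING | (1UL << 3);
	atomic_ulong b = UNWIND_PENDING | (1UL << 3);

	printf("cmpxchg loop  old bits: %#lx\n", clear_pending_cmpxchg(&a));
	printf("fetch_andnot  old bits: %#lx\n", clear_pending_fetch_andnot(&b));
	return 0;
}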
>
> /*
> * From here on out, the callback must always be called, even if it's
> @@ -166,10 +174,8 @@ static void unwind_deferred_task_work(struct callback_head *head)
> idx = srcu_read_lock(&unwind_srcu);
> list_for_each_entry_srcu(work, &callbacks, list,
> srcu_read_lock_held(&unwind_srcu)) {
> - if (test_bit(work->bit, &info->unwind_mask)) {
> + if (test_bit(work->bit, &bits))
> work->func(work, &trace, cookie);
> - clear_bit(work->bit, &info->unwind_mask);
> - }
> }
> srcu_read_unlock(&unwind_srcu, idx);
> }
> @@ -194,15 +200,17 @@ static void unwind_deferred_task_work(struct callback_head *head)
> * because it has already been previously called for the same entry context,
> * it will be called again with the same stack trace and cookie.
> *
> - * Return: 1 if the the callback was already queued.
> - * 0 if the callback successfully was queued.
> + * Return: 0 if the callback successfully was queued.
> + * UNWIND_ALREADY_PENDING if the the callback was already queued.
> + * UNWIND_ALREADY_EXECUTED if the callback was already called
> + * (and will not be called again)
> * Negative if there's an error.
> * @cookie holds the cookie of the first request by any user
> */
Lots of babbling in the Changelog, but no real elucidation as to why you
need this second return value.
AFAICT it serves no real purpose; the users of this function should not
care. The only difference is that the unwind reference (your cookie)
becomes a backward reference instead of a forward reference. But why
would anybody care?
Whatever tool is ultimately in charge of gluing humpty^Wstacktraces back
together again should have no problem with this.
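As a toy illustration (the record layout and helpers below are made up, not
perf's or ftrace's actual formats): a consumer that keys deferred user stack
traces by cookie handles a trace record arriving either before or after the
events that reference it with the same merge logic.

#include <stdio.h>
#include <stdint.h>

#define MAX_TRACES 64

struct utrace { uint64_t cookie; const char *stack; };

static struct utrace traces[MAX_TRACES];
static int nr_traces;

/* a deferred user stack trace record was seen in the stream */
static void on_user_trace(uint64_t cookie, const char *stack)
{
	if (nr_traces < MAX_TRACES)
		traces[nr_traces++] = (struct utrace){ cookie, stack };
}

/* look up the user stack for an event's cookie, if already seen */
static const char *lookup_trace(uint64_t cookie)
{
	for (int i = 0; i < nr_traces; i++)
		if (traces[i].cookie == cookie)
			return traces[i].stack;
	return "<not seen yet>";
}

int main(void)
{
	/* backward reference: trace record precedes the event */
	on_user_trace(42, "main->do_work->syscall");
	printf("event cookie=42: %s\n", lookup_trace(42));

	/* forward reference: event first, trace record attached later */
	printf("event cookie=43: %s\n", lookup_trace(43));
	on_user_trace(43, "main->other_path->syscall");
	printf("event cookie=43 (after trace): %s\n", lookup_trace(43));
	return 0;
}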
> int unwind_deferred_request(struct unwind_work *work, u64 *cookie)
> {
> struct unwind_task_info *info = &current->unwind_info;
> - long pending;
> + unsigned long old, bits;
> int bit;
> int ret;
>
> @@ -225,32 +233,52 @@ int unwind_deferred_request(struct unwind_work *work, u64 *cookie)
>
> *cookie = get_cookie(info);
>
> - /* This is already queued */
> - if (test_bit(bit, &info->unwind_mask))
> - return 1;
> + old = READ_ONCE(info->unwind_mask);
> +
> + /* Is this already queued */
> + if (test_bit(bit, &old)) {
> + /*
> + * If pending is not set, it means this work's callback
> + * was already called.
> + */
> + return old & UNWIND_PENDING ? UNWIND_ALREADY_PENDING :
> + UNWIND_ALREADY_EXECUTED;
> + }
>
> - /* callback already pending? */
> - pending = local_read(&info->pending);
> - if (pending)
> + if (unwind_pending(info))
> goto out;
>
> + /*
> + * This is the first to enable another task_work for this task since
> + * the task entered the kernel, or had already called the callbacks.
> + * Set only the bit for this work and clear all others as they have
> + * already had their callbacks called, and do not need to call them
> + * again because of this work.
> + */
> + bits = UNWIND_PENDING | BIT(bit);
> +
> + /*
> + * If the cmpxchg() fails, it means that an NMI came in and set
> + * the pending bit as well as cleared the other bits. Just
> + * jump to setting the bit for this work.
> + */
> if (CAN_USE_IN_NMI) {
> - /* Claim the work unless an NMI just now swooped in to do so. */
> - if (!local_try_cmpxchg(&info->pending, &pending, 1))
> + if (!try_cmpxchg(&info->unwind_mask, &old, bits))
> goto out;
> } else {
> - local_set(&info->pending, 1);
> + info->unwind_mask = bits;
> }
>
> /* The work has been claimed, now schedule it. */
> ret = task_work_add(current, &info->work, TWA_RESUME);
> - if (WARN_ON_ONCE(ret)) {
> - local_set(&info->pending, 0);
> - return ret;
> - }
>
> + if (WARN_ON_ONCE(ret))
> + WRITE_ONCE(info->unwind_mask, 0);
> +
> + return ret;
> out:
> - return test_and_set_bit(bit, &info->unwind_mask);
> + return test_and_set_bit(bit, &info->unwind_mask) ?
> + UNWIND_ALREADY_PENDING : 0;
> }
This is some of the most horrifyingly confused code I've seen in a
while.
Please just slow down and think for a minute.
The below is the last four patches rolled into one. Not been near a
compiler.
---
--- a/include/linux/srcu.h
+++ b/include/linux/srcu.h
@@ -524,4 +524,8 @@ DEFINE_LOCK_GUARD_1(srcu, struct srcu_st
srcu_read_unlock(_T->lock, _T->idx),
int idx)
+DEFINE_LOCK_GUARD_1(srcu_lite, struct srcu_struct,
+ _T->idx = srcu_read_lock_lite(_T->lock),
+ srcu_read_unlock_lite(_T->lock, _T->idx),
+ int idx)
#endif
--- a/include/linux/unwind_deferred.h
+++ b/include/linux/unwind_deferred.h
@@ -13,10 +13,14 @@ typedef void (*unwind_callback_t)(struct
struct unwind_work {
struct list_head list;
unwind_callback_t func;
+ int bit;
};
#ifdef CONFIG_UNWIND_USER
+#define UNWIND_PENDING_BIT 0
+#define UNWIND_PENDING BIT(UNWIND_PENDING_BIT)
+
void unwind_task_init(struct task_struct *task);
void unwind_task_free(struct task_struct *task);
@@ -28,15 +32,26 @@ void unwind_deferred_cancel(struct unwin
static __always_inline void unwind_reset_info(void)
{
- if (unlikely(current->unwind_info.id.id))
+ struct unwind_task_info *info = &current->unwind_info;
+ unsigned long bits;
+
+ /* Was there any unwinding? */
+ if (unlikely(info->unwind_mask)) {
+ bits = raw_atomic_long_read(&info->unwind_mask);
+ do {
+ /* Is a task_work going to run again before going back */
+ if (bits & UNWIND_PENDING)
+ return;
+ } while (!raw_atomic_long_try_cmpxchg(&info->unwind_mask, &bits, 0UL));
current->unwind_info.id.id = 0;
+ }
/*
* As unwind_user_faultable() can be called directly and
* depends on nr_entries being cleared on exit to user,
* this needs to be a separate conditional.
*/
- if (unlikely(current->unwind_info.cache))
- current->unwind_info.cache->nr_entries = 0;
+ if (unlikely(info->cache))
+ info->cache->nr_entries = 0;
}
#else /* !CONFIG_UNWIND_USER */
--- a/include/linux/unwind_deferred_types.h
+++ b/include/linux/unwind_deferred_types.h
@@ -2,6 +2,8 @@
#ifndef _LINUX_UNWIND_USER_DEFERRED_TYPES_H
#define _LINUX_UNWIND_USER_DEFERRED_TYPES_H
+#include <linux/atomic.h>
+
struct unwind_cache {
unsigned int nr_entries;
unsigned long entries[];
@@ -19,8 +21,8 @@ union unwind_task_id {
struct unwind_task_info {
struct unwind_cache *cache;
struct callback_head work;
+ atomic_long_t unwind_mask;
union unwind_task_id id;
- int pending;
};
#endif /* _LINUX_UNWIND_USER_DEFERRED_TYPES_H */
--- a/kernel/unwind/deferred.c
+++ b/kernel/unwind/deferred.c
@@ -12,13 +12,39 @@
#include <linux/slab.h>
#include <linux/mm.h>
+/*
+ * For requesting a deferred user space stack trace from NMI context
+ * the architecture must support a safe cmpxchg in NMI context.
+ * Architectures that do not have that cannot request a deferred user
+ * space stack trace from NMI context; if one does anyway, the request
+ * fails with -EINVAL.
+ */
+#ifdef CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG
+#define UNWIND_NMI_SAFE 1
+static inline bool try_assign_cnt(struct unwind_task_info *info, u32 cnt)
+{
+ u32 zero = 0;
+ return try_cmpxchg(&info->id.cnt, &zero, cnt);
+}
+#else
+#define UNWIND_NMI_SAFE 0
+/* When NMIs are not allowed, this always succeeds */
+static inline bool try_assign_cnt(struct unwind_task_info *info, u32 cnt)
+{
+ info->id.cnt = cnt;
+ return true;
+}
+#endif /* CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG */
+
/* Make the cache fit in a 4K page */
#define UNWIND_MAX_ENTRIES \
((SZ_4K - sizeof(struct unwind_cache)) / sizeof(long))
-/* Guards adding to and reading the list of callbacks */
+/* Guards adding to or removing from the list of callbacks */
static DEFINE_MUTEX(callback_mutex);
static LIST_HEAD(callbacks);
+static unsigned long unwind_mask;
+DEFINE_STATIC_SRCU(unwind_srcu);
/*
* This is a unique percpu identifier for a given task entry context.
@@ -41,21 +67,16 @@ static DEFINE_PER_CPU(u32, unwind_ctx_ct
*/
static u64 get_cookie(struct unwind_task_info *info)
{
- u32 cpu_cnt;
- u32 cnt;
- u32 old = 0;
+ u32 cnt = 1;
if (info->id.cpu)
return info->id.id;
- cpu_cnt = __this_cpu_read(unwind_ctx_ctr);
- cpu_cnt += 2;
- cnt = cpu_cnt | 1; /* Always make non zero */
-
- if (try_cmpxchg(&info->id.cnt, &old, cnt)) {
- /* Update the per cpu counter */
- __this_cpu_write(unwind_ctx_ctr, cpu_cnt);
- }
+ /* LSB is always set to ensure 0 is an invalid value. */
+ cnt |= __this_cpu_read(unwind_ctx_ctr) + 2;
+ if (try_assign_cnt(info, cnt))
+ __this_cpu_write(unwind_ctx_ctr, cnt);
+
/* Interrupts are disabled, the CPU will always be same */
info->id.cpu = smp_processor_id() + 1; /* Must be non zero */
@@ -117,13 +138,13 @@ static void unwind_deferred_task_work(st
struct unwind_task_info *info = container_of(head, struct unwind_task_info, work);
struct unwind_stacktrace trace;
struct unwind_work *work;
+ unsigned long bits;
u64 cookie;
- if (WARN_ON_ONCE(!info->pending))
+ if (WARN_ON_ONCE(!unwind_pending(info)))
return;
- /* Allow work to come in again */
- WRITE_ONCE(info->pending, 0);
+ bits = atomic_long_fetch_andnot(UNWIND_PENDING, &info->unwind_mask);
/*
* From here on out, the callback must always be called, even if it's
@@ -136,9 +157,11 @@ static void unwind_deferred_task_work(st
cookie = info->id.id;
- guard(mutex)(&callback_mutex);
- list_for_each_entry(work, &callbacks, list) {
- work->func(work, &trace, cookie);
+ guard(srcu_lite)(&unwind_srcu);
+ list_for_each_entry_srcu(work, &callbacks, list,
+ srcu_read_lock_held(&unwind_srcu)) {
+ if (test_bit(work->bit, &bits))
+ work->func(work, &trace, cookie);
}
}
@@ -162,7 +185,7 @@ static void unwind_deferred_task_work(st
* because it has already been previously called for the same entry context,
* it will be called again with the same stack trace and cookie.
*
- * Return: 1 if the the callback was already queued.
+ * Return: 1 if the callback was already queued.
* 0 if the callback successfully was queued.
* Negative if there's an error.
* @cookie holds the cookie of the first request by any user
@@ -170,41 +193,62 @@ static void unwind_deferred_task_work(st
int unwind_deferred_request(struct unwind_work *work, u64 *cookie)
{
struct unwind_task_info *info = &current->unwind_info;
- int ret;
+ unsigned long bits, mask;
+ int bit, ret;
*cookie = 0;
- if (WARN_ON_ONCE(in_nmi()))
- return -EINVAL;
-
if ((current->flags & (PF_KTHREAD | PF_EXITING)) ||
!user_mode(task_pt_regs(current)))
return -EINVAL;
+ /* NMI requires having safe cmpxchg operations */
+ if (WARN_ON_ONCE(!UNWIND_NMI_SAFE && in_nmi()))
+ return -EINVAL;
+
+ /* Do not allow cancelled works to request again */
+ bit = READ_ONCE(work->bit);
+ if (WARN_ON_ONCE(bit < 0))
+ return -EINVAL;
+
guard(irqsave)();
*cookie = get_cookie(info);
- /* callback already pending? */
- if (info->pending)
+ bits = UNWIND_PENDING | BIT(bit);
+ mask = atomic_long_fetch_or(bits, &info->unwind_mask);
+ if (mask & bits)
return 1;
/* The work has been claimed, now schedule it. */
ret = task_work_add(current, &info->work, TWA_RESUME);
if (WARN_ON_ONCE(ret))
- return ret;
-
- info->pending = 1;
- return 0;
+ atomic_long_set(&info->unwind_mask, 0);
+
+ return ret;
}
void unwind_deferred_cancel(struct unwind_work *work)
{
+ struct task_struct *g, *t;
+ int bit;
+
if (!work)
return;
guard(mutex)(&callback_mutex);
- list_del(&work->list);
+ list_del_rcu(&work->list);
+ bit = work->bit;
+
+ /* Do not allow any more requests and prevent callbacks */
+ work->bit = -1;
+
+ __clear_bit(bit, &unwind_mask);
+
+ synchronize_srcu(&unwind_srcu);
+
+ guard(rcu)();
+ /* Clear this bit from all threads */
+ for_each_process_thread(g, t)
+ atomic_long_andnot(BIT(bit), &t->unwind_info.unwind_mask);
}
int unwind_deferred_init(struct unwind_work *work, unwind_callback_t func)
@@ -212,7 +256,15 @@ int unwind_deferred_init(struct unwind_w
memset(work, 0, sizeof(*work));
guard(mutex)(&callback_mutex);
- list_add(&work->list, &callbacks);
+
+ /* See if there's a bit in the mask available */
+ if (unwind_mask == ~UNWIND_PENDING)
+ return -EBUSY;
+
+ work->bit = ffz(unwind_mask);
+ __set_bit(work->bit, &unwind_mask);
+
+ list_add_rcu(&work->list, &callbacks);
work->func = func;
return 0;
}