[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20250619083415.GZ1613376@noisy.programming.kicks-ass.net>
Date: Thu, 19 Jun 2025 10:34:15 +0200
From: Peter Zijlstra <peterz@...radead.org>
To: Steven Rostedt <rostedt@...dmis.org>
Cc: linux-kernel@...r.kernel.org, linux-trace-kernel@...r.kernel.org,
bpf@...r.kernel.org, x86@...nel.org,
Masami Hiramatsu <mhiramat@...nel.org>,
Mathieu Desnoyers <mathieu.desnoyers@...icios.com>,
Josh Poimboeuf <jpoimboe@...nel.org>,
Ingo Molnar <mingo@...nel.org>, Jiri Olsa <jolsa@...nel.org>,
Namhyung Kim <namhyung@...nel.org>,
Thomas Gleixner <tglx@...utronix.de>,
Andrii Nakryiko <andrii@...nel.org>,
Indu Bhagat <indu.bhagat@...cle.com>,
"Jose E. Marchesi" <jemarch@....org>,
Beau Belgrave <beaub@...ux.microsoft.com>,
Jens Remus <jremus@...ux.ibm.com>,
Linus Torvalds <torvalds@...ux-foundation.org>,
Andrew Morton <akpm@...ux-foundation.org>
Subject: Re: [PATCH v10 07/14] unwind_user/deferred: Make unwind deferral
requests NMI-safe
On Tue, Jun 10, 2025 at 08:54:28PM -0400, Steven Rostedt wrote:
> From: Josh Poimboeuf <jpoimboe@...nel.org>
>
> Make unwind_deferred_request() NMI-safe so tracers in NMI context can
> call it and safely request a user space stacktrace when the task exits.
>
> A "nmi_timestamp" is added to the unwind_task_info that gets updated by
> NMIs to not race with setting the info->timestamp.
I feel this is missing something... or I am.
> Signed-off-by: Josh Poimboeuf <jpoimboe@...nel.org>
> Signed-off-by: Steven Rostedt (Google) <rostedt@...dmis.org>
> ---
> Changes since v9: https://lore.kernel.org/linux-trace-kernel/20250513223552.636076711@goodmis.org/
>
> - Check for ret < 0 instead of just ret != 0 from return code of
> task_work_add(). Don't want to just assume it's less than zero as it
> needs to return a negative on error.
>
> include/linux/unwind_deferred_types.h | 1 +
> kernel/unwind/deferred.c | 91 ++++++++++++++++++++++++---
> 2 files changed, 84 insertions(+), 8 deletions(-)
>
> diff --git a/include/linux/unwind_deferred_types.h b/include/linux/unwind_deferred_types.h
> index 5df264cf81ad..ae27a02234b8 100644
> --- a/include/linux/unwind_deferred_types.h
> +++ b/include/linux/unwind_deferred_types.h
> @@ -11,6 +11,7 @@ struct unwind_task_info {
> struct unwind_cache *cache;
> struct callback_head work;
> u64 timestamp;
> + u64 nmi_timestamp;
> int pending;
> };
>
> diff --git a/kernel/unwind/deferred.c b/kernel/unwind/deferred.c
> index b76c704ddc6d..88c867c32c01 100644
> --- a/kernel/unwind/deferred.c
> +++ b/kernel/unwind/deferred.c
> @@ -25,8 +25,27 @@ static u64 get_timestamp(struct unwind_task_info *info)
> {
> lockdep_assert_irqs_disabled();
>
> - if (!info->timestamp)
> - info->timestamp = local_clock();
> + /*
> + * Note, the timestamp is generated on the first request.
> + * If it exists here, then the timestamp is earlier than
> + * this request and it means that this request will be
> + * valid for the stracktrace.
> + */
> + if (!info->timestamp) {
> + WRITE_ONCE(info->timestamp, local_clock());
> + barrier();
> + /*
> + * If an NMI came in and set a timestamp, it means that
> + * it happened before this timestamp was set (otherwise
> + * the NMI would have used this one). Use the NMI timestamp
> + * instead.
> + */
> + if (unlikely(info->nmi_timestamp)) {
> + WRITE_ONCE(info->timestamp, info->nmi_timestamp);
> + barrier();
> + WRITE_ONCE(info->nmi_timestamp, 0);
> + }
> + }
>
> return info->timestamp;
> }
> @@ -103,6 +122,13 @@ static void unwind_deferred_task_work(struct callback_head *head)
>
> unwind_deferred_trace(&trace);
>
> + /* Check if the timestamp was only set by NMI */
> + if (info->nmi_timestamp) {
> + WRITE_ONCE(info->timestamp, info->nmi_timestamp);
> + barrier();
> + WRITE_ONCE(info->nmi_timestamp, 0);
> + }
> +
> timestamp = info->timestamp;
>
> guard(mutex)(&callback_mutex);
> @@ -111,6 +137,48 @@ static void unwind_deferred_task_work(struct callback_head *head)
> }
> }
>
> +static int unwind_deferred_request_nmi(struct unwind_work *work, u64 *timestamp)
> +{
> + struct unwind_task_info *info = ¤t->unwind_info;
> + bool inited_timestamp = false;
> + int ret;
> +
> + /* Always use the nmi_timestamp first */
> + *timestamp = info->nmi_timestamp ? : info->timestamp;
> +
> + if (!*timestamp) {
> + /*
> + * This is the first unwind request since the most recent entry
> + * from user space. Initialize the task timestamp.
> + *
> + * Don't write to info->timestamp directly, otherwise it may race
> + * with an interruption of get_timestamp().
> + */
> + info->nmi_timestamp = local_clock();
> + *timestamp = info->nmi_timestamp;
> + inited_timestamp = true;
> + }
> +
> + if (info->pending)
> + return 1;
> +
> + ret = task_work_add(current, &info->work, TWA_NMI_CURRENT);
> + if (ret < 0) {
> + /*
> + * If this set nmi_timestamp and is not using it,
> + * there's no guarantee that it will be used.
> + * Set it back to zero.
> + */
> + if (inited_timestamp)
> + info->nmi_timestamp = 0;
> + return ret;
> + }
> +
> + info->pending = 1;
> +
> + return 0;
> +}
So what's the actual problem here, something like this:
if (!info->timestamp)
<NMI>
if (!info->timestamp)
info->timestamp = local_clock(); /* Ta */
</NMI>
info->timestamp = local_clock(); /* Tb */
And now info has Tb which is after Ta, which was recorded for the NMI
request?
Why can't we cmpxchg_local() the thing and avoid this horrible stuff?
static u64 get_timestamp(struct unwind_task_info *info)
{
u64 new, old = info->timestamp;
if (old)
return old;
new = local_clock();
old = cmpxchg_local(&info->timestamp, old, new);
if (old)
return old;
return new;
}
Seems simple enough; what's wrong with it?
Powered by blists - more mailing lists