[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20250509165155.292241900@goodmis.org>
Date: Fri, 09 May 2025 12:45:34 -0400
From: Steven Rostedt <rostedt@...dmis.org>
To: linux-kernel@...r.kernel.org,
linux-trace-kernel@...r.kernel.org,
bpf@...r.kernel.org,
x86@...nel.org
Cc: Masami Hiramatsu <mhiramat@...nel.org>,
Mathieu Desnoyers <mathieu.desnoyers@...icios.com>,
Josh Poimboeuf <jpoimboe@...nel.org>,
Peter Zijlstra <peterz@...radead.org>,
Ingo Molnar <mingo@...nel.org>,
Jiri Olsa <jolsa@...nel.org>,
Namhyung Kim <namhyung@...nel.org>
Subject: [PATCH v8 10/18] unwind_user/deferred: Make unwind deferral requests NMI-safe
From: Josh Poimboeuf <jpoimboe@...nel.org>
Make unwind_deferred_request() NMI-safe so tracers in NMI context can
call it and safely request a user space stacktrace when the task exits.
An "nmi_timestamp" field is added to unwind_task_info. NMIs update it
instead of info->timestamp, so that they do not race with code that is
in the middle of setting info->timestamp.
Signed-off-by: Josh Poimboeuf <jpoimboe@...nel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@...dmis.org>
---
Changes since v7: https://lore.kernel.org/20250502165009.069806229@goodmis.org
- Updated to use timestamp instead of cookie
include/linux/unwind_deferred_types.h | 1 +
kernel/unwind/deferred.c | 91 ++++++++++++++++++++++++---
2 files changed, 84 insertions(+), 8 deletions(-)
diff --git a/include/linux/unwind_deferred_types.h b/include/linux/unwind_deferred_types.h
index 5df264cf81ad..ae27a02234b8 100644
--- a/include/linux/unwind_deferred_types.h
+++ b/include/linux/unwind_deferred_types.h
@@ -11,6 +11,7 @@ struct unwind_task_info {
struct unwind_cache *cache;
struct callback_head work;
u64 timestamp;
+ u64 nmi_timestamp;
int pending;
};
diff --git a/kernel/unwind/deferred.c b/kernel/unwind/deferred.c
index b76c704ddc6d..238cd97079ec 100644
--- a/kernel/unwind/deferred.c
+++ b/kernel/unwind/deferred.c
@@ -25,8 +25,27 @@ static u64 get_timestamp(struct unwind_task_info *info)
{
lockdep_assert_irqs_disabled();
- if (!info->timestamp)
- info->timestamp = local_clock();
+ /*
+ * Note, the timestamp is generated on the first request.
+ * If it exists here, then the timestamp is earlier than
+ * this request and it means that this request will be
+ * valid for the stacktrace.
+ */
+ if (!info->timestamp) {
+ WRITE_ONCE(info->timestamp, local_clock());
+ barrier();
+ /*
+ * If an NMI came in and set a timestamp, it means that
+ * it happened before this timestamp was set (otherwise
+ * the NMI would have used this one). Use the NMI timestamp
+ * instead.
+ */
+ if (unlikely(info->nmi_timestamp)) {
+ WRITE_ONCE(info->timestamp, info->nmi_timestamp);
+ barrier();
+ WRITE_ONCE(info->nmi_timestamp, 0);
+ }
+ }
return info->timestamp;
}
@@ -103,6 +122,13 @@ static void unwind_deferred_task_work(struct callback_head *head)
unwind_deferred_trace(&trace);
+ /* Check if the timestamp was only set by NMI */
+ if (info->nmi_timestamp) {
+ WRITE_ONCE(info->timestamp, info->nmi_timestamp);
+ barrier();
+ WRITE_ONCE(info->nmi_timestamp, 0);
+ }
+
timestamp = info->timestamp;
guard(mutex)(&callback_mutex);
@@ -111,6 +137,48 @@ static void unwind_deferred_task_work(struct callback_head *head)
}
}
+static int unwind_deferred_request_nmi(struct unwind_work *work, u64 *timestamp)
+{
+ struct unwind_task_info *info = &current->unwind_info;
+ bool inited_timestamp = false;
+ int ret;
+
+ /* Always use the nmi_timestamp first */
+ *timestamp = info->nmi_timestamp ? : info->timestamp;
+
+ if (!*timestamp) {
+ /*
+ * This is the first unwind request since the most recent entry
+ * from user space. Initialize the task timestamp.
+ *
+ * Don't write to info->timestamp directly, otherwise it may race
+ * with an interruption of get_timestamp().
+ */
+ info->nmi_timestamp = local_clock();
+ *timestamp = info->nmi_timestamp;
+ inited_timestamp = true;
+ }
+
+ if (info->pending)
+ return 1;
+
+ ret = task_work_add(current, &info->work, TWA_NMI_CURRENT);
+ if (ret) {
+ /*
+ * If this call set nmi_timestamp but the task work could
+ * not be queued, there's no guarantee that it will be used.
+ * Set it back to zero.
+ */
+ if (inited_timestamp)
+ info->nmi_timestamp = 0;
+ return ret;
+ }
+
+ info->pending = 1;
+
+ return 0;
+}
+
/**
* unwind_deferred_request - Request a user stacktrace on task exit
* @work: Unwind descriptor requesting the trace
@@ -139,31 +207,38 @@ static void unwind_deferred_task_work(struct callback_head *head)
int unwind_deferred_request(struct unwind_work *work, u64 *timestamp)
{
struct unwind_task_info *info = &current->unwind_info;
+ int pending;
int ret;
*timestamp = 0;
- if (WARN_ON_ONCE(in_nmi()))
- return -EINVAL;
-
if ((current->flags & (PF_KTHREAD | PF_EXITING)) ||
!user_mode(task_pt_regs(current)))
return -EINVAL;
+ if (in_nmi())
+ return unwind_deferred_request_nmi(work, timestamp);
+
guard(irqsave)();
*timestamp = get_timestamp(info);
/* callback already pending? */
- if (info->pending)
+ pending = READ_ONCE(info->pending);
+ if (pending)
+ return 1;
+
+ /* Claim the work unless an NMI just now swooped in to do so. */
+ if (!try_cmpxchg(&info->pending, &pending, 1))
return 1;
/* The work has been claimed, now schedule it. */
ret = task_work_add(current, &info->work, TWA_RESUME);
- if (WARN_ON_ONCE(ret))
+ if (WARN_ON_ONCE(ret)) {
+ WRITE_ONCE(info->pending, 0);
return ret;
+ }
- info->pending = 1;
return 0;
}
--
2.47.2
Powered by blists - more mailing lists