lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [day] [month] [year] [list]
Message-ID: <20250521125048.4d572d08@gandalf.local.home>
Date: Wed, 21 May 2025 12:50:48 -0400
From: Steven Rostedt <rostedt@...dmis.org>
To: "Masami Hiramatsu (Google)" <mhiramat@...nel.org>
Cc: Namhyung Kim <namhyung@...nel.org>, linux-kernel@...r.kernel.org,
 linux-trace-kernel@...r.kernel.org, bpf@...r.kernel.org, x86@...nel.org,
 Mathieu Desnoyers <mathieu.desnoyers@...icios.com>, Josh Poimboeuf
 <jpoimboe@...nel.org>, Peter Zijlstra <peterz@...radead.org>, Ingo Molnar
 <mingo@...nel.org>, Jiri Olsa <jolsa@...nel.org>, Thomas Gleixner
 <tglx@...utronix.de>, Borislav Petkov <bp@...en8.de>, Dave Hansen
 <dave.hansen@...ux.intel.com>, "H. Peter Anvin" <hpa@...or.com>, Andrii
 Nakryiko <andrii@...nel.org>
Subject: Re: [PATCH v9 00/13] unwind_user: x86: Deferred unwinding
 infrastructure

On Tue, 20 May 2025 19:55:49 -0400
Steven Rostedt <rostedt@...dmis.org> wrote:

> There's a proposal to move trace_sched_process_exit() to before exit_mm().
> If that happens, we could make that tracepoint a "faultable" tracepoint and
> then the unwind infrastructure could attach to it and do the unwinding from
> that tracepoint.

The below patch does work. It's just a PoC and would need to be broken up
and also cleaned up.

I created a TRACE_EVENT_FAULTABLE() that is basically just a
TRACE_EVENT_SYSCALL(), and used that for the sched_process_exit tracepoint.

I then had the unwinder attach to that tracepoint when the first unwind
callback is registered.

I had to change the check in unwind_deferred_trace() from testing
PF_EXITING to just testing whether current->mm is NULL.

But this does work for the exiting of a task:

-- Steve

diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h
index a351763e6965..eb98bb61126e 100644
--- a/include/linux/tracepoint.h
+++ b/include/linux/tracepoint.h
@@ -617,6 +617,8 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p)
 #define TRACE_EVENT_SYSCALL(name, proto, args, struct, assign,	\
 			    print, reg, unreg)			\
 	DECLARE_TRACE_SYSCALL(name, PARAMS(proto), PARAMS(args))
+#define TRACE_EVENT_FAULTABLE(name, proto, args, struct, assign, print)	\
+	DECLARE_TRACE_SYSCALL(name, PARAMS(proto), PARAMS(args))
 
 #define TRACE_EVENT_FLAGS(event, flag)
 
diff --git a/include/trace/define_trace.h b/include/trace/define_trace.h
index ed52d0506c69..b228424744fd 100644
--- a/include/trace/define_trace.h
+++ b/include/trace/define_trace.h
@@ -50,6 +50,10 @@
 #define TRACE_EVENT_SYSCALL(name, proto, args, struct, assign, print, reg, unreg) \
 	DEFINE_TRACE_SYSCALL(name, reg, unreg, PARAMS(proto), PARAMS(args))
 
+#undef TRACE_EVENT_FAULTABLE
+#define TRACE_EVENT_FAULTABLE(name, proto, args, struct, assign, print) \
+	DEFINE_TRACE_SYSCALL(name, NULL, NULL, PARAMS(proto), PARAMS(args))
+
 #undef TRACE_EVENT_NOP
 #define TRACE_EVENT_NOP(name, proto, args, struct, assign, print)
 
@@ -125,6 +129,7 @@
 #undef TRACE_EVENT_FN
 #undef TRACE_EVENT_FN_COND
 #undef TRACE_EVENT_SYSCALL
+#undef TRACE_EVENT_FAULTABLE
 #undef TRACE_EVENT_CONDITION
 #undef TRACE_EVENT_NOP
 #undef DEFINE_EVENT_NOP
diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index 3bec9fb73a36..c6d7894970e3 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -326,13 +326,13 @@ DEFINE_EVENT(sched_process_template, sched_process_free,
 	     TP_ARGS(p));
 
 /*
- * Tracepoint for a task exiting.
+ * Tracepoint for a task exiting (allows faulting)
  * Note, it's a superset of sched_process_template and should be kept
  * compatible as much as possible. sched_process_exits has an extra
  * `group_dead` argument, so sched_process_template can't be used,
  * unfortunately, just like sched_migrate_task above.
  */
-TRACE_EVENT(sched_process_exit,
+TRACE_EVENT_FAULTABLE(sched_process_exit,
 
 	TP_PROTO(struct task_struct *p, bool group_dead),
 
diff --git a/include/trace/trace_events.h b/include/trace/trace_events.h
index 4f22136fd465..0ed57e7906d1 100644
--- a/include/trace/trace_events.h
+++ b/include/trace/trace_events.h
@@ -55,6 +55,16 @@
 			     PARAMS(print));		       \
 	DEFINE_EVENT(name, name, PARAMS(proto), PARAMS(args));
 
+#undef TRACE_EVENT_FAULTABLE
+#define TRACE_EVENT_FAULTABLE(name, proto, args, tstruct, assign, print) \
+	DECLARE_EVENT_SYSCALL_CLASS(name,		       \
+			     PARAMS(proto),		       \
+			     PARAMS(args),		       \
+			     PARAMS(tstruct),		       \
+			     PARAMS(assign),		       \
+			     PARAMS(print));		       \
+	DEFINE_EVENT(name, name, PARAMS(proto), PARAMS(args));
+
 #include "stages/stage1_struct_define.h"
 
 #undef DECLARE_EVENT_CLASS
diff --git a/kernel/unwind/deferred.c b/kernel/unwind/deferred.c
index 63d0237bad3e..7aad471f2887 100644
--- a/kernel/unwind/deferred.c
+++ b/kernel/unwind/deferred.c
@@ -11,6 +11,8 @@
 #include <linux/slab.h>
 #include <linux/mm.h>
 
+#include <trace/events/sched.h>
+
 #define UNWIND_MAX_ENTRIES 512
 
 /* Guards adding to or removing from the list of callbacks */
@@ -77,7 +79,7 @@ int unwind_deferred_trace(struct unwind_stacktrace *trace)
 	/* Should always be called from faultable context */
 	might_fault();
 
-	if (current->flags & PF_EXITING)
+	if (!current->mm)
 		return -EINVAL;
 
 	if (!info->cache) {
@@ -107,14 +109,14 @@ int unwind_deferred_trace(struct unwind_stacktrace *trace)
 	return 0;
 }
 
-static void unwind_deferred_task_work(struct callback_head *head)
+static void process_unwind_deferred(void)
 {
-	struct unwind_task_info *info = container_of(head, struct unwind_task_info, work);
+	struct task_struct *task = current;
+	struct unwind_task_info *info = &task->unwind_info;
 	struct unwind_stacktrace trace;
 	struct unwind_work *work;
 	unsigned long bits;
 	u64 timestamp;
-	struct task_struct *task = current;
 	int idx;
 
 	if (WARN_ON_ONCE(!unwind_pending(task)))
@@ -152,6 +155,21 @@ static void unwind_deferred_task_work(struct callback_head *head)
 	srcu_read_unlock(&unwind_srcu, idx);
 }
 
+static void unwind_deferred_task_work(struct callback_head *head)
+{
+	process_unwind_deferred();
+}
+
+static void unwind_deferred_callback(void *data, struct task_struct *p, bool group_dead)
+{
+	if (!unwind_pending(p))
+		return;
+
+	process_unwind_deferred();
+
+	task_work_cancel(p, &p->unwind_info.work);
+}
+
 static int unwind_deferred_request_nmi(struct unwind_work *work, u64 *timestamp)
 {
 	struct unwind_task_info *info = &current->unwind_info;
@@ -329,6 +347,10 @@ void unwind_deferred_cancel(struct unwind_work *work)
 	for_each_process_thread(g, t) {
 		clear_bit(bit, &t->unwind_mask);
 	}
+
+	/* Is this the last registered unwinding? */
+	if (!unwind_mask)
+		unregister_trace_sched_process_exit(unwind_deferred_callback, NULL);
 }
 
 int unwind_deferred_init(struct unwind_work *work, unwind_callback_t func)
@@ -341,6 +363,15 @@ int unwind_deferred_init(struct unwind_work *work, unwind_callback_t func)
 	if (unwind_mask == ~(UNWIND_PENDING))
 		return -EBUSY;
 
+	/* Is this the first registered unwinding? */
+	if (!unwind_mask) {
+		int ret;
+
+		ret = register_trace_sched_process_exit(unwind_deferred_callback, NULL);
+		if (ret < 0)
+			return ret;
+	}
+
 	work->bit = ffz(unwind_mask);
 	unwind_mask |= 1UL << work->bit;
 


Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ