[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20251024104119.GJ4068168@noisy.programming.kicks-ass.net>
Date: Fri, 24 Oct 2025 12:41:19 +0200
From: Peter Zijlstra <peterz@...radead.org>
To: Steven Rostedt <rostedt@...nel.org>, jremus@...ux.ibm.com
Cc: linux-kernel@...r.kernel.org, linux-trace-kernel@...r.kernel.org,
bpf@...r.kernel.org, x86@...nel.org,
Masami Hiramatsu <mhiramat@...nel.org>,
Mathieu Desnoyers <mathieu.desnoyers@...icios.com>,
Josh Poimboeuf <jpoimboe@...nel.org>,
Ingo Molnar <mingo@...nel.org>, Jiri Olsa <jolsa@...nel.org>,
Arnaldo Carvalho de Melo <acme@...nel.org>,
Namhyung Kim <namhyung@...nel.org>,
Thomas Gleixner <tglx@...utronix.de>,
Andrii Nakryiko <andrii@...nel.org>,
Indu Bhagat <indu.bhagat@...cle.com>,
"Jose E. Marchesi" <jemarch@....org>,
Beau Belgrave <beaub@...ux.microsoft.com>,
Jens Remus <jremus@...ux.ibm.com>,
Linus Torvalds <torvalds@...ux-foundation.org>,
Andrew Morton <akpm@...ux-foundation.org>,
Florian Weimer <fweimer@...hat.com>, Sam James <sam@...too.org>,
Kees Cook <kees@...nel.org>, Carlos O'Donell <codonell@...hat.com>
Subject: Re: [PATCH v16 0/4] perf: Support the deferred unwinding
infrastructure
On Fri, Oct 24, 2025 at 11:29:26AM +0200, Peter Zijlstra wrote:
> On Thu, Oct 23, 2025 at 05:00:02PM +0200, Peter Zijlstra wrote:
>
> > Trouble is, pretty much every unwind is 510 entries long -- this cannot
> > be right. I'm sure there's a silly mistake in unwind/user.c but I'm too
> > tired to find it just now. I'll try again tomorrow.
>
> PEBKAC
Anyway, while staring at this, I noted that the perf userspace unwind
code has a few bits that are missing from the new shiny thing.
How about something like so? This add an optional arch specific unwinder
at the very highest priority (bit 0) and uses that to do a few extra
bits before disabling itself and falling back to whatever lower prio
unwinder to do the actual unwinding.
---
arch/x86/events/core.c | 40 ---------------------------
arch/x86/include/asm/unwind_user.h | 4 ++
arch/x86/include/asm/uprobes.h | 9 ++++++
arch/x86/kernel/unwind_user.c | 53 +++++++++++++++++++++++++++++++++++++
arch/x86/kernel/uprobes.c | 32 ++++++++++++++++++++++
include/linux/unwind_user_types.h | 5 ++-
kernel/unwind/user.c | 7 ++++
7 files changed, 109 insertions(+), 41 deletions(-)
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -2845,46 +2845,6 @@ static unsigned long get_segment_base(un
return get_desc_base(desc);
}
-#ifdef CONFIG_UPROBES
-/*
- * Heuristic-based check if uprobe is installed at the function entry.
- *
- * Under assumption of user code being compiled with frame pointers,
- * `push %rbp/%ebp` is a good indicator that we indeed are.
- *
- * Similarly, `endbr64` (assuming 64-bit mode) is also a common pattern.
- * If we get this wrong, captured stack trace might have one extra bogus
- * entry, but the rest of stack trace will still be meaningful.
- */
-static bool is_uprobe_at_func_entry(struct pt_regs *regs)
-{
- struct arch_uprobe *auprobe;
-
- if (!current->utask)
- return false;
-
- auprobe = current->utask->auprobe;
- if (!auprobe)
- return false;
-
- /* push %rbp/%ebp */
- if (auprobe->insn[0] == 0x55)
- return true;
-
- /* endbr64 (64-bit only) */
- if (user_64bit_mode(regs) && is_endbr((u32 *)auprobe->insn))
- return true;
-
- return false;
-}
-
-#else
-static bool is_uprobe_at_func_entry(struct pt_regs *regs)
-{
- return false;
-}
-#endif /* CONFIG_UPROBES */
-
#ifdef CONFIG_IA32_EMULATION
#include <linux/compat.h>
--- a/arch/x86/include/asm/unwind_user.h
+++ b/arch/x86/include/asm/unwind_user.h
@@ -8,4 +8,8 @@
.fp_off = -2*(ws), \
.use_fp = true,
+#define HAVE_UNWIND_USER_ARCH 1
+
+extern int unwind_user_next_arch(struct unwind_user_state *state);
+
#endif /* _ASM_X86_UNWIND_USER_H */
--- a/arch/x86/include/asm/uprobes.h
+++ b/arch/x86/include/asm/uprobes.h
@@ -62,4 +62,13 @@ struct arch_uprobe_task {
unsigned int saved_tf;
};
+#ifdef CONFIG_UPROBES
+extern bool is_uprobe_at_func_entry(struct pt_regs *regs);
+#else
+static bool is_uprobe_at_func_entry(struct pt_regs *regs)
+{
+ return false;
+}
+#endif /* CONFIG_UPROBES */
+
#endif /* _ASM_UPROBES_H */
--- /dev/null
+++ b/arch/x86/kernel/unwind_user.c
@@ -0,0 +1,53 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#include <linux/unwind_user.h>
+#include <linux/uprobes.h>
+#include <linux/uaccess.h>
+#include <linux/sched/task_stack.h>
+#include <asm/processor.h>
+#include <asm/tlbflush.h>
+
+int unwind_user_next_arch(struct unwind_user_state *state)
+{
+ struct pt_regs *regs = task_pt_regs(current);
+
+ /* only once, on the first iteration */
+ state->available_types &= ~UNWIND_USER_TYPE_ARCH;
+
+ /* We don't know how to unwind VM86 stacks. */
+ if (regs->flags & X86_VM_MASK) {
+ state->done = true;
+ return 0;
+ }
+
+ /*
+ * If we are called from uprobe handler, and we are indeed at the very
+ * entry to user function (which is normally a `push %rbp` instruction,
+ * under assumption of application being compiled with frame pointers),
+ * we should read return address from *regs->sp before proceeding
+ * to follow frame pointers, otherwise we'll skip immediate caller
+ * as %rbp is not yet setup.
+ */
+ if (!is_uprobe_at_func_entry(regs))
+ return -EINVAL;
+
+#ifdef CONFIG_COMPAT
+ if (state->ws == sizeof(int)) {
+ unsigned int retaddr;
+ int ret = get_user(retaddr, (unsigned int __user *)regs->sp);
+ if (ret)
+ return ret;
+
+ state->ip = retaddr;
+ return 0;
+ }
+#endif
+ unsigned long retaddr;
+ int ret = get_user(retaddr, (unsigned long __user *)regs->sp);
+ if (ret)
+ return ret;
+
+ state->ip = retaddr;
+ return 0;
+}
+
--- a/arch/x86/kernel/uprobes.c
+++ b/arch/x86/kernel/uprobes.c
@@ -1791,3 +1791,35 @@ bool arch_uretprobe_is_alive(struct retu
else
return regs->sp <= ret->stack;
}
+
+/*
+ * Heuristic-based check if uprobe is installed at the function entry.
+ *
+ * Under assumption of user code being compiled with frame pointers,
+ * `push %rbp/%ebp` is a good indicator that we indeed are.
+ *
+ * Similarly, `endbr64` (assuming 64-bit mode) is also a common pattern.
+ * If we get this wrong, captured stack trace might have one extra bogus
+ * entry, but the rest of stack trace will still be meaningful.
+ */
+bool is_uprobe_at_func_entry(struct pt_regs *regs)
+{
+ struct arch_uprobe *auprobe;
+
+ if (!current->utask)
+ return false;
+
+ auprobe = current->utask->auprobe;
+ if (!auprobe)
+ return false;
+
+ /* push %rbp/%ebp */
+ if (auprobe->insn[0] == 0x55)
+ return true;
+
+ /* endbr64 (64-bit only) */
+ if (user_64bit_mode(regs) && is_endbr((u32 *)auprobe->insn))
+ return true;
+
+ return false;
+}
--- a/include/linux/unwind_user_types.h
+++ b/include/linux/unwind_user_types.h
@@ -3,13 +3,15 @@
#define _LINUX_UNWIND_USER_TYPES_H
#include <linux/types.h>
+#include <linux/bits.h>
/*
* Unwind types, listed in priority order: lower numbers are attempted first if
* available.
*/
enum unwind_user_type_bits {
- UNWIND_USER_TYPE_FP_BIT = 0,
+ UNWIND_USER_TYPE_ARCH_BIT = 0,
+ UNWIND_USER_TYPE_FP_BIT,
NR_UNWIND_USER_TYPE_BITS,
};
@@ -17,6 +19,7 @@ enum unwind_user_type_bits {
enum unwind_user_type {
/* Type "none" for the start of stack walk iteration. */
UNWIND_USER_TYPE_NONE = 0,
+ UNWIND_USER_TYPE_ARCH = BIT(UNWIND_USER_TYPE_ARCH_BIT),
UNWIND_USER_TYPE_FP = BIT(UNWIND_USER_TYPE_FP_BIT),
};
--- a/kernel/unwind/user.c
+++ b/kernel/unwind/user.c
@@ -79,6 +79,10 @@ static int unwind_user_next(struct unwin
state->current_type = type;
switch (type) {
+ case UNWIND_USER_TYPE_ARCH:
+ if (!unwind_user_next_arch(state))
+ return 0;
+ continue;
case UNWIND_USER_TYPE_FP:
if (!unwind_user_next_fp(state))
return 0;
@@ -107,6 +111,9 @@ static int unwind_user_start(struct unwi
return -EINVAL;
}
+ if (HAVE_UNWIND_USER_ARCH)
+ state->available_types |= UNWIND_USER_TYPE_ARCH;
+
if (IS_ENABLED(CONFIG_HAVE_UNWIND_USER_FP))
state->available_types |= UNWIND_USER_TYPE_FP;
Powered by blists - more mailing lists