Message-ID: <20250814071754.193265-4-namhyung@kernel.org>
Date: Thu, 14 Aug 2025 00:17:52 -0700
From: Namhyung Kim <namhyung@...nel.org>
To: Arnaldo Carvalho de Melo <acme@...nel.org>,
Ian Rogers <irogers@...gle.com>,
Kan Liang <kan.liang@...ux.intel.com>
Cc: Jiri Olsa <jolsa@...nel.org>,
Adrian Hunter <adrian.hunter@...el.com>,
Peter Zijlstra <peterz@...radead.org>,
Ingo Molnar <mingo@...nel.org>,
LKML <linux-kernel@...r.kernel.org>,
linux-perf-users@...r.kernel.org,
bpf@...r.kernel.org,
Song Liu <song@...nel.org>,
Howard Chu <howardchu95@...il.com>
Subject: [PATCH 3/5] perf trace: Do not return 0 from syscall tracepoint BPF

Howard reported that returning 0 from the BPF program ended up
suppressing the syscall tracepoints globally, for every consumer on
the system.  What we want is to drop the syscall output only in the
current perf session, so we need a different approach.
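
For reference, a minimal sketch of the problem (a hypothetical
standalone program, assuming the usual vmlinux.h/libbpf setup; not
part of this patch):

  #include "vmlinux.h"
  #include <bpf/bpf_helpers.h>

  SEC("tp/raw_syscalls/sys_enter")
  int sys_enter_example(struct trace_event_raw_sys_enter *args)
  {
          /*
           * Returning 0 here suppresses the tracepoint event for every
           * consumer attached to raw_syscalls:sys_enter, system-wide,
           * not just for this program's perf session.
           */
          return 1;
  }

  char _license[] SEC("license") = "GPL";
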
Currently perf trace uses the bpf-output event for augmented arguments
and the raw_syscalls:sys_{enter,exit} tracepoint events for normal
arguments.  But I think we can just use bpf-output in both cases and
drop the tracepoint events.
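
The bpf-output path boils down to bpf_perf_event_output() on a
BPF_MAP_TYPE_PERF_EVENT_ARRAY map, roughly as in the existing skeleton
(augmented__output() is taken from the diff below; the map declaration
here is abbreviated):

  struct {
          __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
          __uint(key_size, sizeof(int));
          __uint(value_size, sizeof(u32));
  } __augmented_syscalls__ SEC(".maps");

  static inline int augmented__output(void *ctx, void *args, int len)
  {
          /* Shows up in userspace as a sample of the bpf-output event */
          return bpf_perf_event_output(ctx, &__augmented_syscalls__,
                                       BPF_F_CURRENT_CPU, args, len);
  }
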
Then it needs to distinguish whether the bpf-output data is for enter
or exit.  Repurpose struct trace_entry.type, which is common to both
the syscall entry and exit tracepoints, to carry that information.
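
Concretely, assuming the usual trace_entry layout from the kernel:

  struct trace_entry {
          unsigned short  type;   /* repurposed as the enter/exit tag */
          unsigned char   flags;
          unsigned char   preempt_count;
          int             pid;
  };

The BPF side tags the payload before emitting it:

  augmented_args->args.ent.type = SYSCALL_TRACE_ENTER;

and userspace dispatches on the first two bytes of the raw data:

  short *event_type = sample->raw_data;
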
Closes: https://lore.kernel.org/r/20250529065537.529937-1-howardchu95@gmail.com
Suggested-by: Howard Chu <howardchu95@...il.com>
Signed-off-by: Namhyung Kim <namhyung@...nel.org>
---
tools/perf/builtin-trace.c | 119 ++++++++++++++----
.../bpf_skel/augmented_raw_syscalls.bpf.c | 37 ++++--
tools/perf/util/bpf_skel/perf_trace_u.h | 14 +++
3 files changed, 133 insertions(+), 37 deletions(-)
create mode 100644 tools/perf/util/bpf_skel/perf_trace_u.h
diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c
index 1bc912273af2db66..e1caa82bc427b68b 100644
--- a/tools/perf/builtin-trace.c
+++ b/tools/perf/builtin-trace.c
@@ -22,6 +22,7 @@
#include <bpf/btf.h>
#endif
#include "util/bpf_map.h"
+#include "util/bpf_skel/perf_trace_u.h"
#include "util/rlimit.h"
#include "builtin.h"
#include "util/cgroup.h"
@@ -535,6 +536,61 @@ static struct evsel *perf_evsel__raw_syscall_newtp(const char *direction, void *
return NULL;
}
+static struct syscall_tp sys_enter_tp;
+static struct syscall_tp sys_exit_tp;
+
+static int evsel__init_bpf_output_tp(struct evsel *evsel)
+{
+ struct tep_event *event;
+ struct tep_format_field *field;
+ struct syscall_tp *sc;
+
+ if (evsel == NULL)
+ return 0;
+
+ event = trace_event__tp_format("raw_syscalls", "sys_enter");
+ if (IS_ERR(event))
+ event = trace_event__tp_format("syscalls", "sys_enter");
+ if (IS_ERR(event))
+ return PTR_ERR(event);
+
+ field = tep_find_field(event, "id");
+ if (field == NULL)
+ return -EINVAL;
+
+ tp_field__init_uint(&sys_enter_tp.id, field, evsel->needs_swap);
+ __tp_field__init_ptr(&sys_enter_tp.args, sys_enter_tp.id.offset + sizeof(u64));
+
+ /* ID is at the same offset, use evsel sc for convenience */
+ sc = evsel__syscall_tp(evsel);
+ if (sc == NULL)
+ return -ENOMEM;
+
+ event = trace_event__tp_format("raw_syscalls", "sys_exit");
+ if (IS_ERR(event))
+ event = trace_event__tp_format("syscalls", "sys_exit");
+ if (IS_ERR(event))
+ return PTR_ERR(event);
+
+ field = tep_find_field(event, "id");
+ if (field == NULL)
+ return -EINVAL;
+
+ tp_field__init_uint(&sys_exit_tp.id, field, evsel->needs_swap);
+
+ field = tep_find_field(event, "ret");
+ if (field == NULL)
+ return -EINVAL;
+
+ tp_field__init_uint(&sys_exit_tp.ret, field, evsel->needs_swap);
+
+ /* Save the common part to the evsel sc */
+ BUG_ON(sys_enter_tp.id.offset != sys_exit_tp.id.offset);
+ sc->id = sys_enter_tp.id;
+
+ return 0;
+}
+
#define perf_evsel__sc_tp_uint(evsel, name, sample) \
({ struct syscall_tp *fields = __evsel__syscall_tp(evsel); \
fields->name.integer(&fields->name, sample); })
@@ -2777,7 +2833,10 @@ static int trace__sys_enter(struct trace *trace, struct evsel *evsel,
trace__fprintf_sample(trace, evsel, sample, thread);
- args = perf_evsel__sc_tp_ptr(evsel, args, sample);
+ if (evsel == trace->syscalls.events.bpf_output)
+ args = sys_enter_tp.args.pointer(&sys_enter_tp.args, sample);
+ else
+ args = perf_evsel__sc_tp_ptr(evsel, args, sample);
if (ttrace->entry_str == NULL) {
ttrace->entry_str = malloc(trace__entry_str_size);
@@ -2797,8 +2856,10 @@ static int trace__sys_enter(struct trace *trace, struct evsel *evsel,
* thinking that the extra 2 u64 args are the augmented filename, so just check
* here and avoid using augmented syscalls when the evsel is the raw_syscalls one.
*/
- if (evsel != trace->syscalls.events.sys_enter)
- augmented_args = syscall__augmented_args(sc, sample, &augmented_args_size, trace->raw_augmented_syscalls_args_size);
+ if (evsel == trace->syscalls.events.bpf_output) {
+ augmented_args = syscall__augmented_args(sc, sample, &augmented_args_size,
+ trace->raw_augmented_syscalls_args_size);
+ }
ttrace->entry_time = sample->time;
msg = ttrace->entry_str;
printed += scnprintf(msg + printed, trace__entry_str_size - printed, "%s(", sc->name);
@@ -2922,7 +2983,10 @@ static int trace__sys_exit(struct trace *trace, struct evsel *evsel,
trace__fprintf_sample(trace, evsel, sample, thread);
- ret = perf_evsel__sc_tp_uint(evsel, ret, sample);
+ if (evsel == trace->syscalls.events.bpf_output)
+ ret = sys_exit_tp.ret.integer(&sys_exit_tp.ret, sample);
+ else
+ ret = perf_evsel__sc_tp_uint(evsel, ret, sample);
if (trace->summary)
thread__update_stats(thread, ttrace, id, sample, ret, trace);
@@ -3252,6 +3316,17 @@ static int trace__event_handler(struct trace *trace, struct evsel *evsel,
}
}
+ if (evsel == trace->syscalls.events.bpf_output) {
+ short *event_type = sample->raw_data;
+
+ if (*event_type == SYSCALL_TRACE_ENTER)
+ trace__sys_enter(trace, evsel, event, sample);
+ else
+ trace__sys_exit(trace, evsel, event, sample);
+
+ goto printed;
+ }
+
trace__printf_interrupted_entry(trace);
trace__fprintf_tstamp(trace, sample->time, trace->output);
@@ -3261,25 +3336,6 @@ static int trace__event_handler(struct trace *trace, struct evsel *evsel,
if (thread)
trace__fprintf_comm_tid(trace, thread, trace->output);
- if (evsel == trace->syscalls.events.bpf_output) {
- int id = perf_evsel__sc_tp_uint(evsel, id, sample);
- int e_machine = thread ? thread__e_machine(thread, trace->host) : EM_HOST;
- struct syscall *sc = trace__syscall_info(trace, evsel, e_machine, id);
-
- if (sc) {
- fprintf(trace->output, "%s(", sc->name);
- trace__fprintf_sys_enter(trace, evsel, sample);
- fputc(')', trace->output);
- goto newline;
- }
-
- /*
- * XXX: Not having the associated syscall info or not finding/adding
- * the thread should never happen, but if it does...
- * fall thru and print it as a bpf_output event.
- */
- }
-
fprintf(trace->output, "%s(", evsel->name);
if (evsel__is_bpf_output(evsel)) {
@@ -3299,7 +3355,6 @@ static int trace__event_handler(struct trace *trace, struct evsel *evsel,
}
}
-newline:
fprintf(trace->output, ")\n");
if (callchain_ret > 0)
@@ -3307,6 +3362,7 @@ static int trace__event_handler(struct trace *trace, struct evsel *evsel,
else if (callchain_ret < 0)
pr_err("Problem processing %s callchain, skipping...\n", evsel__name(evsel));
+printed:
++trace->nr_events_printed;
if (evsel->max_events != ULONG_MAX && ++evsel->nr_events_printed == evsel->max_events) {
@@ -4527,7 +4583,7 @@ static int trace__run(struct trace *trace, int argc, const char **argv)
trace->multiple_threads = perf_thread_map__pid(evlist->core.threads, 0) == -1 ||
perf_thread_map__nr(evlist->core.threads) > 1 ||
- evlist__first(evlist)->core.attr.inherit;
+ !trace->opts.no_inherit;
/*
* Now that we already used evsel->core.attr to ask the kernel to setup the
@@ -5552,8 +5608,6 @@ int cmd_trace(int argc, const char **argv)
if (err < 0)
goto skip_augmentation;
- trace__add_syscall_newtp(&trace);
-
err = augmented_syscalls__create_bpf_output(trace.evlist);
if (err == 0)
trace.syscalls.events.bpf_output = evlist__last(trace.evlist);
@@ -5589,6 +5643,7 @@ int cmd_trace(int argc, const char **argv)
if (trace.evlist->core.nr_entries > 0) {
bool use_btf = false;
+ struct evsel *augmented = trace.syscalls.events.bpf_output;
evlist__set_default_evsel_handler(trace.evlist, trace__event_handler);
if (evlist__set_syscall_tp_fields(trace.evlist, &use_btf)) {
@@ -5598,6 +5653,16 @@ int cmd_trace(int argc, const char **argv)
if (use_btf)
trace__load_vmlinux_btf(&trace);
+
+ if (augmented) {
+ if (evsel__init_bpf_output_tp(augmented) < 0) {
+ perror("failed to initialize bpf output fields\n");
+ goto out;
+ }
+ trace.raw_augmented_syscalls_args_size = sys_enter_tp.id.offset;
+ trace.raw_augmented_syscalls_args_size += (6 + 1) * sizeof(long);
+ trace.raw_augmented_syscalls = true;
+ }
}
/*
diff --git a/tools/perf/util/bpf_skel/augmented_raw_syscalls.bpf.c b/tools/perf/util/bpf_skel/augmented_raw_syscalls.bpf.c
index 0016deb321fe0d97..979d60d7dce6565b 100644
--- a/tools/perf/util/bpf_skel/augmented_raw_syscalls.bpf.c
+++ b/tools/perf/util/bpf_skel/augmented_raw_syscalls.bpf.c
@@ -7,6 +7,7 @@
*/
#include "vmlinux.h"
+#include "perf_trace_u.h"
#include <bpf/bpf_helpers.h>
#include <linux/limits.h>
@@ -140,7 +141,7 @@ static inline struct augmented_args_payload *augmented_args_payload(void)
return bpf_map_lookup_elem(&augmented_args_tmp, &key);
}
-static inline int augmented__output(void *ctx, struct augmented_args_payload *args, int len)
+static inline int augmented__output(void *ctx, void *args, int len)
{
/* If perf_event_output fails, return non-zero so that it gets recorded unaugmented */
return bpf_perf_event_output(ctx, &__augmented_syscalls__, BPF_F_CURRENT_CPU, args, len);
@@ -182,12 +183,20 @@ unsigned int augmented_arg__read_str(struct augmented_arg *augmented_arg, const
SEC("tp/raw_syscalls/sys_enter")
int sys_enter_unaugmented(struct trace_event_raw_sys_enter *args)
{
+ struct augmented_args_payload *augmented_args = augmented_args_payload();
+
+ if (augmented_args)
+ augmented__output(args, &augmented_args->args, sizeof(*args));
return 1;
}
SEC("tp/raw_syscalls/sys_exit")
int sys_exit_unaugmented(struct trace_event_raw_sys_exit *args)
{
+ struct augmented_args_payload *augmented_args = augmented_args_payload();
+
+ if (augmented_args)
+ augmented__output(args, &augmented_args->args, sizeof(*args));
return 1;
}
@@ -450,6 +459,7 @@ static int augment_sys_enter(void *ctx, struct trace_event_raw_sys_enter *args)
/* copy the sys_enter header, which has the id */
__builtin_memcpy(&payload->args, args, sizeof(*args));
+ payload->args.ent.type = SYSCALL_TRACE_ENTER;
/*
* Determine what type of argument and how many bytes to read from user space, using the
@@ -532,13 +542,14 @@ int sys_enter(struct trace_event_raw_sys_enter *args)
*/
if (pid_filter__has(&pids_filtered, getpid()))
- return 0;
+ return 1;
augmented_args = augmented_args_payload();
if (augmented_args == NULL)
return 1;
bpf_probe_read_kernel(&augmented_args->args, sizeof(augmented_args->args), args);
+ augmented_args->args.ent.type = SYSCALL_TRACE_ENTER;
/*
* Jump to syscall specific augmenter, even if the default one,
@@ -548,29 +559,35 @@ int sys_enter(struct trace_event_raw_sys_enter *args)
if (augment_sys_enter(args, &augmented_args->args))
bpf_tail_call(args, &syscalls_sys_enter, augmented_args->args.id);
- // If not found on the PROG_ARRAY syscalls map, then we're filtering it:
- return 0;
+ return 1;
}
SEC("tp/raw_syscalls/sys_exit")
int sys_exit(struct trace_event_raw_sys_exit *args)
{
- struct trace_event_raw_sys_exit exit_args;
+ struct augmented_args_payload *augmented_args;
if (pid_filter__has(&pids_filtered, getpid()))
- return 0;
+ return 1;
+
+ augmented_args = augmented_args_payload();
+ if (augmented_args == NULL)
+ return 1;
+
+ bpf_probe_read_kernel(&augmented_args->args, sizeof(*args), args);
+ augmented_args->args.ent.type = SYSCALL_TRACE_EXIT;
- bpf_probe_read_kernel(&exit_args, sizeof(exit_args), args);
/*
* Jump to syscall specific return augmenter, even if the default one,
* "!raw_syscalls:unaugmented" that will just return 1 to return the
* unaugmented tracepoint payload.
*/
- bpf_tail_call(args, &syscalls_sys_exit, exit_args.id);
+ bpf_tail_call(args, &syscalls_sys_exit, args->id);
/*
- * If not found on the PROG_ARRAY syscalls map, then we're filtering it:
+ * If not found on the PROG_ARRAY syscalls map, then we're filtering it
+ * by not emitting a bpf-output event.
*/
- return 0;
+ return 1;
}
char _license[] SEC("license") = "GPL";
diff --git a/tools/perf/util/bpf_skel/perf_trace_u.h b/tools/perf/util/bpf_skel/perf_trace_u.h
new file mode 100644
index 0000000000000000..5b41afa734331d89
--- /dev/null
+++ b/tools/perf/util/bpf_skel/perf_trace_u.h
@@ -0,0 +1,14 @@
+// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+// Copyright (c) 2025 Google
+
+// This file is shared between BPF and userspace.
+
+#ifndef __PERF_TRACE_U_H
+#define __PERF_TRACE_U_H
+
+enum syscall_trace_type {
+ SYSCALL_TRACE_ENTER = 0,
+ SYSCALL_TRACE_EXIT,
+};
+
+#endif /* __PERF_TRACE_U_H */
--
2.51.0.rc1.167.g924127e9c0-goog