Message-ID: <20251118003531.805883570@kernel.org>
Date: Mon, 17 Nov 2025 19:29:52 -0500
From: Steven Rostedt <rostedt@...nel.org>
To: linux-kernel@...r.kernel.org,
linux-trace-kernel@...r.kernel.org
Cc: Masami Hiramatsu <mhiramat@...nel.org>,
Mark Rutland <mark.rutland@....com>,
Mathieu Desnoyers <mathieu.desnoyers@...icios.com>,
Andrew Morton <akpm@...ux-foundation.org>,
Peter Zijlstra <peterz@...radead.org>,
Thomas Gleixner <tglx@...utronix.de>,
Ian Rogers <irogers@...gle.com>,
Namhyung Kim <namhyung@...nel.org>,
Arnaldo Carvalho de Melo <acme@...nel.org>,
Jiri Olsa <jolsa@...nel.org>,
Douglas Raillard <douglas.raillard@....com>
Subject: [POC][RFC][PATCH 2/3] ftrace: Add perf counters to function tracing
From: Steven Rostedt <rostedt@...dmis.org>
Add options to record perf event counters alongside function tracing.
Two new options are added: func-cpu-cycles and func-cache-misses.
# cd /sys/kernel/tracing
# echo 1 > options/func-cache-misses
# echo function > current_tracer
# cat trace
[..]
sshd-session-1014 [005] ..... 327.836708: __x64_sys_read <-do_syscall_64
sshd-session-1014 [005] ..... 327.836708: cache_misses: 741719054
sshd-session-1014 [005] ..... 327.836712: ksys_read <-do_syscall_64
sshd-session-1014 [005] ..... 327.836713: cache_misses: 741720271
sshd-session-1014 [005] ..... 327.836716: fdget_pos <-ksys_read
sshd-session-1014 [005] ..... 327.836717: cache_misses: 741721483
sshd-session-1014 [005] ..... 327.836720: vfs_read <-ksys_read
sshd-session-1014 [005] ..... 327.836721: cache_misses: 741722726
sshd-session-1014 [005] ..... 327.836724: rw_verify_area <-vfs_read
sshd-session-1014 [005] ..... 327.836725: cache_misses: 741723940
sshd-session-1014 [005] ..... 327.836728: security_file_permission <-rw_verify_area
sshd-session-1014 [005] ..... 327.836729: cache_misses: 741725151
The option causes the perf event counter to be read and recorded after
every function that is traced.
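Note that the values shown are the raw counter reads (assuming a
cumulative per-CPU total), so the cost of a given stretch of code is the
difference between consecutive samples. From the output above, roughly
741720271 - 741719054 = 1217 cache misses occurred between the
__x64_sys_read and ksys_read events.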
If func-cpu-cycles is also enabled:
# echo 1 > options/func-cpu-cycles
# cat trace
[..]
sshd-session-1014 [005] b..1. 536.844538: preempt_count_sub <-_raw_spin_unlock
sshd-session-1014 [005] b..1. 536.844539: cpu_cycles: 1919425978 cache_misses: 3431216952
sshd-session-1014 [005] b.... 536.844545: validate_xmit_skb_list <-sch_direct_xmit
sshd-session-1014 [005] b.... 536.844545: cpu_cycles: 1919429935 cache_misses: 3431218535
sshd-session-1014 [005] b.... 536.844551: validate_xmit_skb.isra.0 <-validate_xmit_skb_list
sshd-session-1014 [005] b.... 536.844552: cpu_cycles: 1919433763 cache_misses: 3431220112
sshd-session-1014 [005] b.... 536.844557: netif_skb_features <-validate_xmit_skb.isra.0
sshd-session-1014 [005] b.... 536.844558: cpu_cycles: 1919437574 cache_misses: 3431221688
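Read the same way (again assuming cumulative counter values), the first
two samples above differ by 1919429935 - 1919425978 = 3957 cycles and
3431218535 - 3431216952 = 1583 cache misses over the roughly 6
microseconds between preempt_count_sub and validate_xmit_skb_list.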
Signed-off-by: Steven Rostedt (Google) <rostedt@...dmis.org>
---
include/linux/trace_recursion.h | 5 +-
kernel/trace/trace.c | 58 ++++++++++++---
kernel/trace/trace.h | 6 ++
kernel/trace/trace_functions.c | 124 ++++++++++++++++++++++++++++++--
4 files changed, 178 insertions(+), 15 deletions(-)
diff --git a/include/linux/trace_recursion.h b/include/linux/trace_recursion.h
index ae04054a1be3..c42d86d81afa 100644
--- a/include/linux/trace_recursion.h
+++ b/include/linux/trace_recursion.h
@@ -132,9 +132,12 @@ static __always_inline int trace_test_and_set_recursion(unsigned long ip, unsign
* will think a recursion occurred, and the event will be dropped.
* Let a single instance happen via the TRANSITION_BIT to
* not drop those events.
+ *
+ * When ip is zero, the caller is purposely causing
+ * recursion. Don't record it.
*/
bit = TRACE_CTX_TRANSITION + start;
- if (val & (1 << bit)) {
+ if ((val & (1 << bit)) && ip) {
do_ftrace_record_recursion(ip, pip);
return -1;
}
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 64d966a3ec8b..42bf1c046de1 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -2915,21 +2915,18 @@ void trace_event_buffer_commit(struct trace_event_buffer *fbuffer)
EXPORT_SYMBOL_GPL(trace_event_buffer_commit);
#ifdef CONFIG_PERF_EVENTS
-static inline void record_perf_event(struct trace_array *tr,
- struct trace_buffer *buffer,
- unsigned int trace_ctx)
+static inline void trace_perf_event(struct trace_array *tr,
+ struct trace_buffer *buffer,
+ int entries, u64 flags,
+ unsigned int trace_ctx)
{
struct ring_buffer_event *event;
struct perf_event_entry *entry;
- int entries = READ_ONCE(tr->perf_events);
struct trace_array_cpu *data;
u64 *value;
int size;
int cpu;
- if (!entries)
- return;
-
guard(preempt_notrace)();
cpu = smp_processor_id();
@@ -2949,12 +2946,12 @@ static inline void record_perf_event(struct trace_array *tr,
entry = ring_buffer_event_data(event);
value = entry->values;
- if (tr->trace_flags & TRACE_ITER(PERF_CYCLES)) {
+ if (flags & TRACE_ITER(PERF_CYCLES)) {
*value++ = TRACE_PERF_VALUE(PERF_TRACE_CYCLES);
entries--;
}
- if (entries && tr->trace_flags & TRACE_ITER(PERF_CACHE)) {
+ if (entries && flags & TRACE_ITER(PERF_CACHE)) {
*value++ = TRACE_PERF_VALUE(PERF_TRACE_CACHE);
entries--;
}
@@ -2968,6 +2965,49 @@ static inline void record_perf_event(struct trace_array *tr,
local_dec(&data->disabled);
}
+static inline void record_perf_event(struct trace_array *tr,
+ struct trace_buffer *buffer,
+ unsigned int trace_ctx)
+{
+ int entries = READ_ONCE(tr->perf_events);
+
+ if (!entries)
+ return;
+
+ trace_perf_event(tr, buffer, entries, tr->trace_flags, trace_ctx);
+}
+
+#ifdef CONFIG_FUNCTION_TRACER
+void ftrace_perf_events(struct trace_array *tr, int perf_events,
+ u64 perf_mask, unsigned int trace_ctx)
+{
+ struct trace_buffer *buffer;
+ int bit;
+
+ /*
+ * Prevent any ftrace recursion.
+ * The ftrace_test_recursion_trylock() allows one nested loop
+ * to handle the case where an interrupt comes in and traces
+ * before the preempt_count is updated to the new context.
+ * This one instance allows that function to still be traced.
+ *
+ * The trace_perf_cache_misses() will call functions that function
+ * tracing will want to trace. Prevent this one loop from happening
+ * by taking the lock again. If an interrupt comes in now,
+ * it may still be dropped, but there's really nothing that can
+ * be done about that until all those locations get fixed.
+ */
+ bit = ftrace_test_recursion_trylock(0, 0);
+
+ buffer = tr->array_buffer.buffer;
+ trace_perf_event(tr, buffer, perf_events, perf_mask, trace_ctx);
+
+ /* bit < 0 means the trylock failed and does not need to be unlocked */
+ if (bit >= 0)
+ ftrace_test_recursion_unlock(bit);
+}
+#endif
+
static int handle_perf_event(struct trace_array *tr, u64 mask, int enabled)
{
int ret = 0;
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 094a156b0c70..bb764a2255c7 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -365,6 +365,8 @@ struct trace_array {
int buffer_disabled;
int perf_events;
+ int ftrace_perf_events;
+ u64 ftrace_perf_mask;
struct trace_pid_list __rcu *filtered_pids;
struct trace_pid_list __rcu *filtered_no_pids;
@@ -1402,6 +1404,10 @@ extern int trace_get_user(struct trace_parser *parser, const char __user *ubuf,
u64 do_trace_perf_event(int type);
int trace_perf_event_enable(int type);
void trace_perf_event_disable(int type);
+#ifdef CONFIG_FUNCTION_TRACER
+void ftrace_perf_events(struct trace_array *tr, int perf_events,
+ u64 perf_mask, unsigned int trace_ctx);
+#endif
#else
# define PERF_FLAGS
static inline u64 do_trace_perf_event(int type) { return 0; }
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index c12795c2fb39..97f46ac7ef21 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -47,8 +47,12 @@ enum {
TRACE_FUNC_OPT_NO_REPEATS = 0x2,
TRACE_FUNC_OPT_ARGS = 0x4,
- /* Update this to next highest bit. */
- TRACE_FUNC_OPT_HIGHEST_BIT = 0x8
+ /* Update this to next highest function bit. */
+ TRACE_FUNC_OPT_HIGHEST_BIT = 0x8,
+
+ /* These are just other options */
+ TRACE_FUNC_OPT_PERF_CYCLES = 0x10,
+ TRACE_FUNC_OPT_PERF_CACHE = 0x20,
};
#define TRACE_FUNC_OPT_MASK (TRACE_FUNC_OPT_HIGHEST_BIT - 1)
@@ -143,6 +147,105 @@ static bool handle_func_repeats(struct trace_array *tr, u32 flags_val)
return true;
}
+#ifdef CONFIG_PERF_EVENTS
+static inline void
+do_trace_function(struct trace_array *tr, unsigned long ip,
+ unsigned long parent_ip, unsigned int trace_ctx,
+ struct ftrace_regs *fregs)
+{
+ trace_function(tr, ip, parent_ip, trace_ctx, fregs);
+
+ if (likely(!tr->ftrace_perf_events))
+ return;
+
+ ftrace_perf_events(tr, tr->ftrace_perf_events, tr->ftrace_perf_mask, trace_ctx);
+}
+
+static bool handle_perf_event_flag(struct trace_array *tr, int bit, int set, int *err)
+{
+ u64 mask;
+ int type;
+
+ *err = 0;
+
+ switch (bit) {
+ case TRACE_FUNC_OPT_PERF_CYCLES:
+ mask = TRACE_ITER(PERF_CYCLES);
+ type = PERF_TRACE_CYCLES;
+ break;
+
+ case TRACE_FUNC_OPT_PERF_CACHE:
+ mask = TRACE_ITER(PERF_CACHE);
+ type = PERF_TRACE_CACHE;
+ break;
+
+ default:
+ return 0;
+ }
+
+ if (set)
+ *err = trace_perf_event_enable(type);
+ else
+ trace_perf_event_disable(type);
+
+ if (*err < 0)
+ return 1;
+
+ if (set) {
+ tr->ftrace_perf_events++;
+ tr->ftrace_perf_mask |= mask;
+ } else {
+ tr->ftrace_perf_mask &= ~mask;
+ tr->ftrace_perf_events--;
+ }
+ return 1;
+}
+
+static void ftrace_perf_enable(struct trace_array *tr, int bit)
+{
+ int err;
+
+ if (!(tr->current_trace_flags->val & bit))
+ return;
+
+ handle_perf_event_flag(tr, bit, 1, &err);
+ if (err < 0)
+ tr->current_trace_flags->val &= ~bit;
+}
+
+static void ftrace_perf_disable(struct trace_array *tr, int bit)
+{
+ int err;
+
+ /* Only disable if it was enabled */
+ if (!(tr->current_trace_flags->val & bit))
+ return;
+
+ handle_perf_event_flag(tr, bit, 0, &err);
+}
+
+static void ftrace_perf_init(struct trace_array *tr)
+{
+ ftrace_perf_enable(tr, TRACE_FUNC_OPT_PERF_CYCLES);
+ ftrace_perf_enable(tr, TRACE_FUNC_OPT_PERF_CACHE);
+}
+
+static void ftrace_perf_reset(struct trace_array *tr)
+{
+ ftrace_perf_disable(tr, TRACE_FUNC_OPT_PERF_CYCLES);
+ ftrace_perf_disable(tr, TRACE_FUNC_OPT_PERF_CACHE);
+}
+#else
+#define do_trace_function trace_function
+static inline bool handle_perf_event_flag(struct trace_array *tr, int bit,
+ int set, int *err)
+{
+ return 0;
+}
+static inline void ftrace_perf_init(struct trace_array *tr) { }
+static inline void ftrace_perf_reset(struct trace_array *tr) { }
+#endif /* CONFIG_PERF_EVENTS */
+
static int function_trace_init(struct trace_array *tr)
{
ftrace_func_t func;
@@ -165,6 +268,8 @@ static int function_trace_init(struct trace_array *tr)
tr->array_buffer.cpu = raw_smp_processor_id();
+ ftrace_perf_init(tr);
+
tracing_start_cmdline_record();
tracing_start_function_trace(tr);
return 0;
@@ -172,6 +277,7 @@ static int function_trace_init(struct trace_array *tr)
static void function_trace_reset(struct trace_array *tr)
{
+ ftrace_perf_reset(tr);
tracing_stop_function_trace(tr);
tracing_stop_cmdline_record();
ftrace_reset_array_ops(tr);
@@ -223,7 +329,7 @@ function_trace_call(unsigned long ip, unsigned long parent_ip,
trace_ctx = tracing_gen_ctx_dec();
- trace_function(tr, ip, parent_ip, trace_ctx, NULL);
+ do_trace_function(tr, ip, parent_ip, trace_ctx, NULL);
ftrace_test_recursion_unlock(bit);
}
@@ -245,7 +351,7 @@ function_args_trace_call(unsigned long ip, unsigned long parent_ip,
trace_ctx = tracing_gen_ctx();
- trace_function(tr, ip, parent_ip, trace_ctx, fregs);
+ do_trace_function(tr, ip, parent_ip, trace_ctx, fregs);
ftrace_test_recursion_unlock(bit);
}
@@ -372,7 +478,7 @@ function_no_repeats_trace_call(unsigned long ip, unsigned long parent_ip,
trace_ctx = tracing_gen_ctx_dec();
process_repeats(tr, ip, parent_ip, last_info, trace_ctx);
- trace_function(tr, ip, parent_ip, trace_ctx, NULL);
+ do_trace_function(tr, ip, parent_ip, trace_ctx, NULL);
out:
ftrace_test_recursion_unlock(bit);
@@ -428,6 +534,10 @@ static struct tracer_opt func_opts[] = {
{ TRACER_OPT(func-no-repeats, TRACE_FUNC_OPT_NO_REPEATS) },
#ifdef CONFIG_FUNCTION_TRACE_ARGS
{ TRACER_OPT(func-args, TRACE_FUNC_OPT_ARGS) },
+#endif
+#ifdef CONFIG_PERF_EVENTS
+ { TRACER_OPT(func-cpu-cycles, TRACE_FUNC_OPT_PERF_CYCLES) },
+ { TRACER_OPT(func-cache-misses, TRACE_FUNC_OPT_PERF_CACHE) },
#endif
{ } /* Always set a last empty entry */
};
@@ -457,6 +567,7 @@ func_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set)
{
ftrace_func_t func;
u32 new_flags;
+ int err;
/* Do nothing if already set. */
if (!!set == !!(tr->current_trace_flags->val & bit))
@@ -466,6 +577,9 @@ func_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set)
if (tr->current_trace != &function_trace)
return 0;
+ if (handle_perf_event_flag(tr, bit, set, &err))
+ return err;
+
new_flags = (tr->current_trace_flags->val & ~bit) | (set ? bit : 0);
func = select_trace_function(new_flags);
if (!func)
--
2.51.0