[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <CAP-5=fVrpBjsJ7=BZQmhXKcaN+OYTY5_gOVj-Qs+33cH0gft7Q@mail.gmail.com>
Date: Fri, 14 Nov 2025 09:52:41 -0800
From: Ian Rogers <irogers@...gle.com>
To: Namhyung Kim <namhyung@...nel.org>
Cc: Arnaldo Carvalho de Melo <acme@...nel.org>, James Clark <james.clark@...aro.org>,
Jiri Olsa <jolsa@...nel.org>, Adrian Hunter <adrian.hunter@...el.com>,
Peter Zijlstra <peterz@...radead.org>, Ingo Molnar <mingo@...nel.org>,
LKML <linux-kernel@...r.kernel.org>, linux-perf-users@...r.kernel.org,
Steven Rostedt <rostedt@...dmis.org>, Josh Poimboeuf <jpoimboe@...nel.org>,
Indu Bhagat <indu.bhagat@...cle.com>, Jens Remus <jremus@...ux.ibm.com>,
Mathieu Desnoyers <mathieu.desnoyers@...icios.com>, linux-trace-kernel@...r.kernel.org,
bpf@...r.kernel.org
Subject: Re: [PATCH v3 2/5] perf tools: Minimal DEFERRED_CALLCHAIN support
On Thu, Nov 13, 2025 at 11:00 PM Namhyung Kim <namhyung@...nel.org> wrote:
>
> Add a new event type for deferred callchains and a new callback for the
> struct perf_tool. For now it doesn't actually handle the deferred
> callchains but it just marks the sample if it has the
> PERF_CONTEXT_USER_DEFERRED marker in the callchain array.
>
> At least, perf report can dump the raw data with this change. Actually
> this requires the next commit to enable attr.defer_callchain, but if you
> already have a data file, it'll show the following result.
>
> $ perf report -D
> ...
> 0x2158@...f.data [0x40]: event: 22
> .
> . ... raw event: size 64 bytes
> . 0000: 16 00 00 00 02 00 40 00 06 00 00 00 0b 00 00 00 ......@.........
> . 0010: 03 00 00 00 00 00 00 00 a7 7f 33 fe 18 7f 00 00 ..........3.....
> . 0020: 0f 0e 33 fe 18 7f 00 00 48 14 33 fe 18 7f 00 00 ..3.....H.3.....
> . 0030: 08 09 00 00 08 09 00 00 e6 7a e7 35 1c 00 00 00 .........z.5....
>
> 121163447014 0x2158 [0x40]: PERF_RECORD_CALLCHAIN_DEFERRED(IP, 0x2): 2312/2312: 0xb00000006
> ... FP chain: nr:3
> ..... 0: 00007f18fe337fa7
> ..... 1: 00007f18fe330e0f
> ..... 2: 00007f18fe331448
> : unhandled!
>
> Signed-off-by: Namhyung Kim <namhyung@...nel.org>
> ---
> tools/lib/perf/include/perf/event.h | 8 ++++++++
> tools/perf/util/event.c | 1 +
> tools/perf/util/evsel.c | 19 +++++++++++++++++++
> tools/perf/util/machine.c | 1 +
> tools/perf/util/perf_event_attr_fprintf.c | 2 ++
> tools/perf/util/sample.h | 2 ++
> tools/perf/util/session.c | 20 ++++++++++++++++++++
> tools/perf/util/tool.c | 1 +
> tools/perf/util/tool.h | 3 ++-
> 9 files changed, 56 insertions(+), 1 deletion(-)
>
> diff --git a/tools/lib/perf/include/perf/event.h b/tools/lib/perf/include/perf/event.h
> index aa1e91c97a226e1a..769bc48ca85c0eb8 100644
> --- a/tools/lib/perf/include/perf/event.h
> +++ b/tools/lib/perf/include/perf/event.h
> @@ -151,6 +151,13 @@ struct perf_record_switch {
> __u32 next_prev_tid;
> };
>
> +struct perf_record_callchain_deferred {
> + struct perf_event_header header;
> + __u64 cookie;
Could we add a comment that this value is used to match user and
kernel stack traces together? I don't believe that intent is
immediately obvious from the word "cookie".
> + __u64 nr;
> + __u64 ips[];
> +};
> +
> struct perf_record_header_attr {
> struct perf_event_header header;
> struct perf_event_attr attr;
> @@ -523,6 +530,7 @@ union perf_event {
> struct perf_record_read read;
> struct perf_record_throttle throttle;
> struct perf_record_sample sample;
> + struct perf_record_callchain_deferred callchain_deferred;
> struct perf_record_bpf_event bpf;
> struct perf_record_ksymbol ksymbol;
> struct perf_record_text_poke_event text_poke;
> diff --git a/tools/perf/util/event.c b/tools/perf/util/event.c
> index fcf44149feb20c35..4c92cc1a952c1d9f 100644
> --- a/tools/perf/util/event.c
> +++ b/tools/perf/util/event.c
> @@ -61,6 +61,7 @@ static const char *perf_event__names[] = {
> [PERF_RECORD_CGROUP] = "CGROUP",
> [PERF_RECORD_TEXT_POKE] = "TEXT_POKE",
> [PERF_RECORD_AUX_OUTPUT_HW_ID] = "AUX_OUTPUT_HW_ID",
> + [PERF_RECORD_CALLCHAIN_DEFERRED] = "CALLCHAIN_DEFERRED",
> [PERF_RECORD_HEADER_ATTR] = "ATTR",
> [PERF_RECORD_HEADER_EVENT_TYPE] = "EVENT_TYPE",
> [PERF_RECORD_HEADER_TRACING_DATA] = "TRACING_DATA",
> diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c
> index 989c56d4a23f74f4..244b3e44d090d413 100644
> --- a/tools/perf/util/evsel.c
> +++ b/tools/perf/util/evsel.c
> @@ -3089,6 +3089,20 @@ int evsel__parse_sample(struct evsel *evsel, union perf_event *event,
> data->data_src = PERF_MEM_DATA_SRC_NONE;
> data->vcpu = -1;
>
> + if (event->header.type == PERF_RECORD_CALLCHAIN_DEFERRED) {
> + const u64 max_callchain_nr = UINT64_MAX / sizeof(u64);
> +
> + data->callchain = (struct ip_callchain *)&event->callchain_deferred.nr;
> + if (data->callchain->nr > max_callchain_nr)
> + return -EFAULT;
> +
> + data->deferred_cookie = event->callchain_deferred.cookie;
> +
> + if (evsel->core.attr.sample_id_all)
> + perf_evsel__parse_id_sample(evsel, event, data);
> + return 0;
> + }
> +
> if (event->header.type != PERF_RECORD_SAMPLE) {
> if (!evsel->core.attr.sample_id_all)
> return 0;
> @@ -3219,6 +3233,11 @@ int evsel__parse_sample(struct evsel *evsel, union perf_event *event,
> if (data->callchain->nr > max_callchain_nr)
> return -EFAULT;
> sz = data->callchain->nr * sizeof(u64);
> + if (evsel->core.attr.defer_callchain && data->callchain->nr >= 2 &&
> + data->callchain->ips[data->callchain->nr - 2] == PERF_CONTEXT_USER_DEFERRED) {
> + data->deferred_cookie = data->callchain->ips[data->callchain->nr - 1];
> + data->deferred_callchain = true;
> + }
It'd be nice to have a comment saying what is going on here. I can see
that if there are at least 2 stack slots and the second-to-last one is
the magic value PERF_CONTEXT_USER_DEFERRED, then the last one should be
read as the "cookie". At first glance this code is difficult to parse,
so a comment would add value.
Thanks,
Ian
> OVERFLOW_CHECK(array, sz, max_size);
> array = (void *)array + sz;
> }
> diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c
> index b5dd42588c916d91..841b711d970e9457 100644
> --- a/tools/perf/util/machine.c
> +++ b/tools/perf/util/machine.c
> @@ -2124,6 +2124,7 @@ static int add_callchain_ip(struct thread *thread,
> *cpumode = PERF_RECORD_MISC_KERNEL;
> break;
> case PERF_CONTEXT_USER:
> + case PERF_CONTEXT_USER_DEFERRED:
> *cpumode = PERF_RECORD_MISC_USER;
> break;
> default:
> diff --git a/tools/perf/util/perf_event_attr_fprintf.c b/tools/perf/util/perf_event_attr_fprintf.c
> index 66b666d9ce649dd7..741c3d657a8b6ae7 100644
> --- a/tools/perf/util/perf_event_attr_fprintf.c
> +++ b/tools/perf/util/perf_event_attr_fprintf.c
> @@ -343,6 +343,8 @@ int perf_event_attr__fprintf(FILE *fp, struct perf_event_attr *attr,
> PRINT_ATTRf(inherit_thread, p_unsigned);
> PRINT_ATTRf(remove_on_exec, p_unsigned);
> PRINT_ATTRf(sigtrap, p_unsigned);
> + PRINT_ATTRf(defer_callchain, p_unsigned);
> + PRINT_ATTRf(defer_output, p_unsigned);
>
> PRINT_ATTRn("{ wakeup_events, wakeup_watermark }", wakeup_events, p_unsigned, false);
> PRINT_ATTRf(bp_type, p_unsigned);
> diff --git a/tools/perf/util/sample.h b/tools/perf/util/sample.h
> index fae834144ef42105..a8307b20a9ea8066 100644
> --- a/tools/perf/util/sample.h
> +++ b/tools/perf/util/sample.h
> @@ -107,6 +107,8 @@ struct perf_sample {
> /** @weight3: On x86 holds retire_lat, on powerpc holds p_stage_cyc. */
> u16 weight3;
> bool no_hw_idx; /* No hw_idx collected in branch_stack */
> + bool deferred_callchain; /* Has deferred user callchains */
> + u64 deferred_cookie;
> char insn[MAX_INSN];
> void *raw_data;
> struct ip_callchain *callchain;
> diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c
> index 4b0236b2df2913e1..361e15c1f26a96d0 100644
> --- a/tools/perf/util/session.c
> +++ b/tools/perf/util/session.c
> @@ -720,6 +720,7 @@ static perf_event__swap_op perf_event__swap_ops[] = {
> [PERF_RECORD_CGROUP] = perf_event__cgroup_swap,
> [PERF_RECORD_TEXT_POKE] = perf_event__text_poke_swap,
> [PERF_RECORD_AUX_OUTPUT_HW_ID] = perf_event__all64_swap,
> + [PERF_RECORD_CALLCHAIN_DEFERRED] = perf_event__all64_swap,
> [PERF_RECORD_HEADER_ATTR] = perf_event__hdr_attr_swap,
> [PERF_RECORD_HEADER_EVENT_TYPE] = perf_event__event_type_swap,
> [PERF_RECORD_HEADER_TRACING_DATA] = perf_event__tracing_data_swap,
> @@ -854,6 +855,9 @@ static void callchain__printf(struct evsel *evsel,
> for (i = 0; i < callchain->nr; i++)
> printf("..... %2d: %016" PRIx64 "\n",
> i, callchain->ips[i]);
> +
> + if (sample->deferred_callchain)
> + printf("...... (deferred)\n");
> }
>
> static void branch_stack__printf(struct perf_sample *sample,
> @@ -1123,6 +1127,19 @@ static void dump_sample(struct evsel *evsel, union perf_event *event,
> sample_read__printf(sample, evsel->core.attr.read_format);
> }
>
> +static void dump_deferred_callchain(struct evsel *evsel, union perf_event *event,
> + struct perf_sample *sample)
> +{
> + if (!dump_trace)
> + return;
> +
> + printf("(IP, 0x%x): %d/%d: %#" PRIx64 "\n",
> + event->header.misc, sample->pid, sample->tid, sample->deferred_cookie);
> +
> + if (evsel__has_callchain(evsel))
> + callchain__printf(evsel, sample);
> +}
> +
> static void dump_read(struct evsel *evsel, union perf_event *event)
> {
> struct perf_record_read *read_event = &event->read;
> @@ -1353,6 +1370,9 @@ static int machines__deliver_event(struct machines *machines,
> return tool->text_poke(tool, event, sample, machine);
> case PERF_RECORD_AUX_OUTPUT_HW_ID:
> return tool->aux_output_hw_id(tool, event, sample, machine);
> + case PERF_RECORD_CALLCHAIN_DEFERRED:
> + dump_deferred_callchain(evsel, event, sample);
> + return tool->callchain_deferred(tool, event, sample, evsel, machine);
> default:
> ++evlist->stats.nr_unknown_events;
> return -1;
> diff --git a/tools/perf/util/tool.c b/tools/perf/util/tool.c
> index 22a8a4ffe05f778e..f732d33e7f895ed4 100644
> --- a/tools/perf/util/tool.c
> +++ b/tools/perf/util/tool.c
> @@ -287,6 +287,7 @@ void perf_tool__init(struct perf_tool *tool, bool ordered_events)
> tool->read = process_event_sample_stub;
> tool->throttle = process_event_stub;
> tool->unthrottle = process_event_stub;
> + tool->callchain_deferred = process_event_sample_stub;
> tool->attr = process_event_synth_attr_stub;
> tool->event_update = process_event_synth_event_update_stub;
> tool->tracing_data = process_event_synth_tracing_data_stub;
> diff --git a/tools/perf/util/tool.h b/tools/perf/util/tool.h
> index 88337cee1e3e2be3..9b9f0a8cbf3de4b5 100644
> --- a/tools/perf/util/tool.h
> +++ b/tools/perf/util/tool.h
> @@ -44,7 +44,8 @@ enum show_feature_header {
>
> struct perf_tool {
> event_sample sample,
> - read;
> + read,
> + callchain_deferred;
> event_op mmap,
> mmap2,
> comm,
> --
> 2.52.0.rc1.455.g30608eb744-goog
>
Powered by blists - more mailing lists