[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <Zun9r1TAAG1slUSA@google.com>
Date: Tue, 17 Sep 2024 15:07:43 -0700
From: Namhyung Kim <namhyung@...nel.org>
To: Josh Poimboeuf <jpoimboe@...nel.org>
Cc: x86@...nel.org, Peter Zijlstra <peterz@...radead.org>,
Steven Rostedt <rostedt@...dmis.org>,
Ingo Molnar <mingo@...nel.org>,
Arnaldo Carvalho de Melo <acme@...nel.org>,
linux-kernel@...r.kernel.org, Indu Bhagat <indu.bhagat@...cle.com>,
Mark Rutland <mark.rutland@....com>,
Alexander Shishkin <alexander.shishkin@...ux.intel.com>,
Jiri Olsa <jolsa@...nel.org>, Ian Rogers <irogers@...gle.com>,
Adrian Hunter <adrian.hunter@...el.com>,
linux-perf-users@...r.kernel.org, Mark Brown <broonie@...nel.org>,
linux-toolchains@...r.kernel.org, Jordan Rome <jordalgo@...a.com>,
Sam James <sam@...too.org>
Subject: Re: [PATCH v2 09/11] perf: Introduce deferred user callchains
On Sat, Sep 14, 2024 at 01:02:11AM +0200, Josh Poimboeuf wrote:
> Instead of attempting to unwind user space from the NMI handler, defer
> it to run in task context by sending a self-IPI and then scheduling the
> unwind to run in the IRQ's exit task work before returning to user space.
>
> This allows the user stack page to be paged in if needed, avoids
> duplicate unwinds for kernel-bound workloads, and prepares for SFrame
> unwinding (so .sframe sections can be paged in on demand).
>
> Suggested-by: Steven Rostedt <rostedt@...dmis.org>
> Suggested-by: Peter Zijlstra <peterz@...radead.org>
> Signed-off-by: Josh Poimboeuf <jpoimboe@...nel.org>
> ---
[SNIP]
> diff --git a/kernel/events/core.c b/kernel/events/core.c
> index 19fd7bd38ecf..5fc7c5156287 100644
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -6854,11 +6860,70 @@ static void perf_pending_irq(struct irq_work *entry)
> perf_swevent_put_recursion_context(rctx);
> }
>
> +struct perf_callchain_deferred_event {
> + struct perf_event_header header;
> + struct perf_callchain_entry callchain;
> +};
> +
> +#define PERF_CALLCHAIN_DEFERRED_EVENT_SIZE \
> + sizeof(struct perf_callchain_deferred_event) + \
> + (sizeof(__u64) * 1) + /* PERF_CONTEXT_USER */ \
> + (sizeof(__u64) * PERF_MAX_STACK_DEPTH)
> +
> +static void perf_event_callchain_deferred(struct perf_event *event)
> +{
> + struct pt_regs *regs = task_pt_regs(current);
> + struct perf_callchain_entry *callchain;
> + struct perf_output_handle handle;
> + struct perf_sample_data data;
> + unsigned char buf[PERF_CALLCHAIN_DEFERRED_EVENT_SIZE];
> + struct perf_callchain_entry_ctx ctx;
> + struct perf_callchain_deferred_event *deferred_event;
> +
> + deferred_event = (void *)&buf;
> +
> + callchain = &deferred_event->callchain;
> + callchain->nr = 0;
> +
> + ctx.entry = callchain;
> + ctx.max_stack = MIN(event->attr.sample_max_stack,
> + PERF_MAX_STACK_DEPTH);
> + ctx.nr = 0;
> + ctx.contexts = 0;
> + ctx.contexts_maxed = false;
> +
> + perf_callchain_store_context(&ctx, PERF_CONTEXT_USER);
> + perf_callchain_user_deferred(&ctx, regs);
> +
> + deferred_event->header.type = PERF_RECORD_CALLCHAIN_DEFERRED;
> + deferred_event->header.misc = 0;
I think we can use PERF_RECORD_MISC_USER here as it's about user
callchains.
> + deferred_event->header.size = sizeof(*deferred_event) +
> + (callchain->nr * sizeof(u64));
> +
> + perf_event_header__init_id(&deferred_event->header, &data, event);
> +
> + if (perf_output_begin(&handle, &data, event,
> + deferred_event->header.size))
> + return;
> +
> + perf_output_copy(&handle, deferred_event, deferred_event->header.size);
You should not copy the whole event size here, because that size also
includes the id_sample part, which is written separately just below. Maybe something like this instead?
perf_output_put(&handle, *deferred_event);
__output_copy(&handle, callchain->ip, callchain->nr * sizeof(u64));
Thanks,
Namhyung
> + perf_event__output_id_sample(event, &handle, &data);
> + perf_output_end(&handle);
> +}
> +
> static void perf_pending_task(struct callback_head *head)
> {
> struct perf_event *event = container_of(head, struct perf_event, pending_task);
> int rctx;
>
> + if (!is_software_event(event)) {
> + if (event->pending_callchain) {
> + perf_event_callchain_deferred(event);
> + event->pending_callchain = 0;
> + }
> + return;
> + }
> +
> /*
> * All accesses to the event must belong to the same implicit RCU read-side
> * critical section as the ->pending_work reset. See comment in
> @@ -7688,6 +7753,8 @@ perf_callchain(struct perf_event *event, struct pt_regs *regs)
> bool user = !event->attr.exclude_callchain_user;
> const u32 max_stack = event->attr.sample_max_stack;
> struct perf_callchain_entry *callchain;
> + bool defer_user = IS_ENABLED(CONFIG_HAVE_PERF_CALLCHAIN_DEFERRED) &&
> + event->attr.defer_callchain;
>
> /* Disallow cross-task user callchains. */
> user &= !event->ctx->task || event->ctx->task == current;
> @@ -7695,7 +7762,14 @@ perf_callchain(struct perf_event *event, struct pt_regs *regs)
> if (!kernel && !user)
> return &__empty_callchain;
>
> - callchain = get_perf_callchain(regs, kernel, user, max_stack, true);
> + callchain = get_perf_callchain(regs, kernel, user, max_stack, true,
> + defer_user);
> +
> + if (user && defer_user && !event->pending_callchain) {
> + event->pending_callchain = 1;
> + irq_work_queue(&event->pending_irq);
> + }
> +
> return callchain ?: &__empty_callchain;
> }
>
> --
> 2.46.0
>
Powered by blists - more mailing lists