[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <CAEf4BzY3xJ=W2qPD8i6UbSB=zNqpiA1gSd+SC3wKxQAJWjeHhA@mail.gmail.com>
Date: Tue, 29 Oct 2024 16:32:46 -0700
From: Andrii Nakryiko <andrii.nakryiko@...il.com>
To: Josh Poimboeuf <jpoimboe@...nel.org>
Cc: x86@...nel.org, Peter Zijlstra <peterz@...radead.org>,
Steven Rostedt <rostedt@...dmis.org>, Ingo Molnar <mingo@...nel.org>,
Arnaldo Carvalho de Melo <acme@...nel.org>, linux-kernel@...r.kernel.org,
Indu Bhagat <indu.bhagat@...cle.com>, Mark Rutland <mark.rutland@....com>,
Alexander Shishkin <alexander.shishkin@...ux.intel.com>, Jiri Olsa <jolsa@...nel.org>,
Namhyung Kim <namhyung@...nel.org>, Ian Rogers <irogers@...gle.com>,
Adrian Hunter <adrian.hunter@...el.com>, linux-perf-users@...r.kernel.org,
Mark Brown <broonie@...nel.org>, linux-toolchains@...r.kernel.org,
Jordan Rome <jordalgo@...a.com>, Sam James <sam@...too.org>, linux-trace-kernel@...r.kernel.org,
Jens Remus <jremus@...ux.ibm.com>, Mathieu Desnoyers <mathieu.desnoyers@...icios.com>,
Florian Weimer <fweimer@...hat.com>, Andy Lutomirski <luto@...nel.org>
Subject: Re: [PATCH v3 11/19] unwind: Add deferred user space unwinding API
On Mon, Oct 28, 2024 at 2:48 PM Josh Poimboeuf <jpoimboe@...nel.org> wrote:
>
> Add unwind_user_deferred() which allows callers to schedule task work to
> unwind the user space stack before returning to user space. This solves
> several problems for its callers:
>
> - Ensure the unwind happens in task context even if the caller may
> be running in interrupt context.
>
> - Only do the unwind once, even if called multiple times either by the
> same caller or multiple callers.
>
> - Create a "context cookie" which allows trace post-processing
> to correlate kernel unwinds/traces with the user unwind.
>
> Signed-off-by: Josh Poimboeuf <jpoimboe@...nel.org>
> ---
> include/linux/entry-common.h | 3 +
> include/linux/sched.h | 5 +
> include/linux/unwind_user.h | 56 ++++++++++
> kernel/fork.c | 4 +
> kernel/unwind/user.c | 199 +++++++++++++++++++++++++++++++++++
> 5 files changed, 267 insertions(+)
>
> diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h
> index 1e50cdb83ae5..efbe8f964f31 100644
> --- a/include/linux/entry-common.h
> +++ b/include/linux/entry-common.h
> @@ -12,6 +12,7 @@
> #include <linux/resume_user_mode.h>
> #include <linux/tick.h>
> #include <linux/kmsan.h>
> +#include <linux/unwind_user.h>
>
> #include <asm/entry-common.h>
>
> @@ -111,6 +112,8 @@ static __always_inline void enter_from_user_mode(struct pt_regs *regs)
> CT_WARN_ON(__ct_state() != CT_STATE_USER);
> user_exit_irqoff();
>
> + unwind_enter_from_user_mode();
> +
> instrumentation_begin();
> kmsan_unpoison_entry_regs(regs);
> trace_hardirqs_off_finish();
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 5007a8e2d640..31b6f1d763ef 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -47,6 +47,7 @@
> #include <linux/livepatch_sched.h>
> #include <linux/uidgid_types.h>
> #include <asm/kmap_size.h>
> +#include <linux/unwind_user.h>
>
> /* task_struct member predeclarations (sorted alphabetically): */
> struct audit_context;
> @@ -1592,6 +1593,10 @@ struct task_struct {
> struct user_event_mm *user_event_mm;
> #endif
>
> +#ifdef CONFIG_UNWIND_USER
> + struct unwind_task_info unwind_task_info;
this is quite a lot of memory to pay on each task, many of which a)
might not have sframe and b) might not need stack unwinding during
their lifetime.
It could be a pointer and allocated in copy_process(), no? Though
ideally this should be initialized lazily, if possible.
> +#endif
> +
> /*
> * New fields for task_struct should be added above here, so that
> * they are included in the randomized portion of task_struct.
[...]
> +static void unwind_user_task_work(struct callback_head *head)
> +{
> + struct unwind_task_info *info = container_of(head, struct unwind_task_info, work);
> + struct task_struct *task = container_of(info, struct task_struct, unwind_task_info);
> + void *privs[UNWIND_MAX_CALLBACKS];
> + struct unwind_stacktrace trace;
> + unsigned long pending;
> + u64 cookie = 0;
> + int i;
> +
> + BUILD_BUG_ON(UNWIND_MAX_CALLBACKS > 32);
> +
> + if (WARN_ON_ONCE(task != current))
> + return;
> +
> + if (WARN_ON_ONCE(!info->ctx_cookie || !info->pending_callbacks))
> + return;
> +
> + scoped_guard(irqsave) {
> + pending = info->pending_callbacks;
> + cookie = info->ctx_cookie;
> +
> + info->pending_callbacks = 0;
> + info->ctx_cookie = 0;
> + memcpy(privs, info->privs, sizeof(void *) * UNWIND_MAX_CALLBACKS);
> + }
> +
> + if (!info->entries) {
> + info->entries = kmalloc(UNWIND_MAX_ENTRIES * sizeof(long),
> + GFP_KERNEL);
> + if (!info->entries)
> + return;
uhm... can we notify callbacks that stack capture failed? otherwise
we'd need some extra timeouts and other complications if we are
*waiting* for this callback to be called
> + }
> +
> + trace.entries = info->entries;
> + trace.nr = 0;
> + unwind_user(&trace, UNWIND_MAX_ENTRIES);
> +
[...]
Powered by blists - more mailing lists