[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <CAEf4BzY_rGszo9O9i3xhB2VFC-BOcqoZ3KGpKT+Hf4o-0W2BAQ@mail.gmail.com>
Date: Tue, 29 Oct 2024 16:32:40 -0700
From: Andrii Nakryiko <andrii.nakryiko@...il.com>
To: Josh Poimboeuf <jpoimboe@...nel.org>
Cc: x86@...nel.org, Peter Zijlstra <peterz@...radead.org>,
Steven Rostedt <rostedt@...dmis.org>, Ingo Molnar <mingo@...nel.org>,
Arnaldo Carvalho de Melo <acme@...nel.org>, linux-kernel@...r.kernel.org,
Indu Bhagat <indu.bhagat@...cle.com>, Mark Rutland <mark.rutland@....com>,
Alexander Shishkin <alexander.shishkin@...ux.intel.com>, Jiri Olsa <jolsa@...nel.org>,
Namhyung Kim <namhyung@...nel.org>, Ian Rogers <irogers@...gle.com>,
Adrian Hunter <adrian.hunter@...el.com>, linux-perf-users@...r.kernel.org,
Mark Brown <broonie@...nel.org>, linux-toolchains@...r.kernel.org,
Jordan Rome <jordalgo@...a.com>, Sam James <sam@...too.org>, linux-trace-kernel@...r.kerne.org,
Jens Remus <jremus@...ux.ibm.com>, Mathieu Desnoyers <mathieu.desnoyers@...icios.com>,
Florian Weimer <fweimer@...hat.com>, Andy Lutomirski <luto@...nel.org>
Subject: Re: [PATCH v3 09/19] unwind: Introduce sframe user space unwinding
On Mon, Oct 28, 2024 at 2:48 PM Josh Poimboeuf <jpoimboe@...nel.org> wrote:
>
> Some distros have started compiling frame pointers into all their
> packages to enable the kernel to do system-wide profiling of user space.
> Unfortunately that creates a runtime performance penalty across the
> entire system. Using DWARF instead isn't feasible due to the complexity
> it would add to the kernel.
>
> For in-kernel unwinding we solved this problem with the creation of the
> ORC unwinder for x86_64. Similarly, for user space the GNU assembler
> has created the sframe format starting with binutils 2.41 for sframe v2.
> Sframe is a simpler version of .eh_frame. It gets placed in the .sframe
> section.
>
> Add support for unwinding user space using sframe.
>
> More information about sframe can be found here:
>
> - https://lwn.net/Articles/932209/
> - https://lwn.net/Articles/940686/
> - https://sourceware.org/binutils/docs/sframe-spec.html
>
> Glibc support is needed to implement the prctl() calls to tell the
> kernel where the .sframe segments are.
>
> Signed-off-by: Josh Poimboeuf <jpoimboe@...nel.org>
> ---
> arch/Kconfig | 4 +
> arch/x86/include/asm/mmu.h | 2 +-
> fs/binfmt_elf.c | 35 +++-
> include/linux/mm_types.h | 3 +
> include/linux/sframe.h | 41 ++++
> include/linux/unwind_user.h | 2 +
> include/uapi/linux/elf.h | 1 +
> include/uapi/linux/prctl.h | 3 +
> kernel/fork.c | 10 +
> kernel/sys.c | 11 ++
> kernel/unwind/Makefile | 3 +-
> kernel/unwind/sframe.c | 380 ++++++++++++++++++++++++++++++++++++
> kernel/unwind/sframe.h | 215 ++++++++++++++++++++
> kernel/unwind/user.c | 24 ++-
> mm/init-mm.c | 6 +
> 15 files changed, 732 insertions(+), 8 deletions(-)
> create mode 100644 include/linux/sframe.h
> create mode 100644 kernel/unwind/sframe.c
> create mode 100644 kernel/unwind/sframe.h
>
It feels like this patch is trying to do too much. There is both new
UAPI introduction, and SFrame format definition, and unwinder
integration, etc, etc. Do you think it can be split further into more
focused smaller patches?
> @@ -688,7 +692,7 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
> /*
> * Check to see if the section's size will overflow the
> * allowed task size. Note that p_filesz must always be
> - * <= p_memsize so it's only necessary to check p_memsz.
> + * <= p_memsz so it's only necessary to check p_memsz.
> */
> k = load_addr + eppnt->p_vaddr;
> if (BAD_ADDR(k) ||
> @@ -698,9 +702,24 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
> error = -ENOMEM;
> goto out;
> }
> +
> + if ((eppnt->p_flags & PF_X) && k < start_code)
> + start_code = k;
> +
> + if ((eppnt->p_flags & PF_X) && k + eppnt->p_filesz > end_code)
> + end_code = k + eppnt->p_filesz;
> + break;
> + }
> + case PT_GNU_SFRAME:
> + sframe_phdr = eppnt;
if I understand correctly, there has to be only one sframe, is that
right? Should we validate that?
> + break;
> }
> }
>
> + if (sframe_phdr)
> + sframe_add_section(load_addr + sframe_phdr->p_vaddr,
> + start_code, end_code);
> +
no error checking?
> error = load_addr;
> out:
> return error;
> @@ -823,7 +842,7 @@ static int load_elf_binary(struct linux_binprm *bprm)
> int first_pt_load = 1;
> unsigned long error;
> struct elf_phdr *elf_ppnt, *elf_phdata, *interp_elf_phdata = NULL;
> - struct elf_phdr *elf_property_phdata = NULL;
> + struct elf_phdr *elf_property_phdata = NULL, *sframe_phdr = NULL;
> unsigned long elf_brk;
> int retval, i;
> unsigned long elf_entry;
> @@ -931,6 +950,10 @@ static int load_elf_binary(struct linux_binprm *bprm)
> executable_stack = EXSTACK_DISABLE_X;
> break;
>
> + case PT_GNU_SFRAME:
> + sframe_phdr = elf_ppnt;
> + break;
> +
> case PT_LOPROC ... PT_HIPROC:
> retval = arch_elf_pt_proc(elf_ex, elf_ppnt,
> bprm->file, false,
> @@ -1321,6 +1344,10 @@ static int load_elf_binary(struct linux_binprm *bprm)
> task_pid_nr(current), retval);
> }
>
> + if (sframe_phdr)
> + sframe_add_section(load_bias + sframe_phdr->p_vaddr,
> + start_code, end_code);
> +
error checking missing?
> regs = current_pt_regs();
> #ifdef ELF_PLAT_INIT
> /*
> diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
> index 381d22eba088..6e7561c1a5fc 100644
> --- a/include/linux/mm_types.h
> +++ b/include/linux/mm_types.h
> @@ -1052,6 +1052,9 @@ struct mm_struct {
> #endif
> } lru_gen;
> #endif /* CONFIG_LRU_GEN_WALKS_MMU */
> +#ifdef CONFIG_HAVE_UNWIND_USER_SFRAME
> + struct maple_tree sframe_mt;
> +#endif
> } __randomize_layout;
>
> /*
> diff --git a/include/linux/sframe.h b/include/linux/sframe.h
> new file mode 100644
> index 000000000000..d167e01817c4
> --- /dev/null
> +++ b/include/linux/sframe.h
> @@ -0,0 +1,41 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +#ifndef _LINUX_SFRAME_H
> +#define _LINUX_SFRAME_H
> +
> +#include <linux/mm_types.h>
> +
> +struct unwind_user_frame;
> +
> +#ifdef CONFIG_HAVE_UNWIND_USER_SFRAME
> +
> +#define INIT_MM_SFRAME .sframe_mt = MTREE_INIT(sframe_mt, 0),
> +
> +extern void sframe_free_mm(struct mm_struct *mm);
> +
> +/* text_start, text_end, file_name are optional */
what file_name? was that an extra argument that got removed?
> +extern int sframe_add_section(unsigned long sframe_addr, unsigned long text_start,
> + unsigned long text_end);
> +
> +extern int sframe_remove_section(unsigned long sframe_addr);
> +extern int sframe_find(unsigned long ip, struct unwind_user_frame *frame);
> +
> +static inline bool current_has_sframe(void)
> +{
> + struct mm_struct *mm = current->mm;
> +
> + return mm && !mtree_empty(&mm->sframe_mt);
> +}
> +
[...]
> diff --git a/kernel/sys.c b/kernel/sys.c
> index 4da31f28fda8..7d05a67727db 100644
> --- a/kernel/sys.c
> +++ b/kernel/sys.c
> @@ -64,6 +64,7 @@
> #include <linux/rcupdate.h>
> #include <linux/uidgid.h>
> #include <linux/cred.h>
> +#include <linux/sframe.h>
>
> #include <linux/nospec.h>
>
> @@ -2784,6 +2785,16 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
> case PR_RISCV_SET_ICACHE_FLUSH_CTX:
> error = RISCV_SET_ICACHE_FLUSH_CTX(arg2, arg3);
> break;
> + case PR_ADD_SFRAME:
> + if (arg5)
> + return -EINVAL;
> + error = sframe_add_section(arg2, arg3, arg4);
wouldn't it be better to make this interface extendable from the get
go? Instead of passing 3 arguments with fixed meaning, why not pass a
pointer to an extendable binary struct like seems to be the trend
nowadays with nicely extensible APIs. See [0] for one such example
(specifically, struct procmap_query). Seems more prudent, as we'll
most probably will be adding flags, options, extra information, etc)
[0] https://lore.kernel.org/linux-mm/20240627170900.1672542-3-andrii@kernel.org/
> + break;
> + case PR_REMOVE_SFRAME:
> + if (arg3 || arg4 || arg5)
> + return -EINVAL;
> + error = sframe_remove_section(arg2);
> + break;
> default:
> error = -EINVAL;
> break;
[...]
> +static unsigned char fre_type_to_size(unsigned char fre_type)
> +{
> + if (fre_type > 2)
> + return 0;
> + return 1 << fre_type;
> +}
> +
> +static unsigned char offset_size_enum_to_size(unsigned char off_size)
> +{
> + if (off_size > 2)
> + return 0;
> + return 1 << off_size;
> +}
> +
> +static int find_fde(struct sframe_section *sec, unsigned long ip,
> + struct sframe_fde *fde)
> +{
> + struct sframe_fde __user *first, *last, *found = NULL;
> + u32 ip_off, func_off_low = 0, func_off_high = -1;
> +
> + ip_off = ip - sec->sframe_addr;
what if ip_off is larger than 4GB? ELF section can be bigger than 4GB, right?
and also, does it mean that SFrame doesn't support executables with
text bigger than 4GB?
> +
> + first = (void __user *)sec->fdes_addr;
> + last = first + sec->fdes_nr;
> + while (first <= last) {
> + struct sframe_fde __user *mid;
> + u32 func_off;
> +
> + mid = first + ((last - first) / 2);
> +
> + if (get_user(func_off, (s32 __user *)mid))
> + return -EFAULT;
> +
> + if (ip_off >= func_off) {
> + /* validate sort order */
> + if (func_off < func_off_low)
> + return -EINVAL;
> +
> + func_off_low = func_off;
> +
> + found = mid;
> + first = mid + 1;
> + } else {
> + /* validate sort order */
> + if (func_off > func_off_high)
> + return -EINVAL;
> +
> + func_off_high = func_off;
> +
> + last = mid - 1;
> + }
> + }
> +
> + if (!found)
> + return -EINVAL;
> +
> + if (copy_from_user(fde, found, sizeof(*fde)))
> + return -EFAULT;
> +
> + /* check for gaps */
> + if (ip_off < fde->start_addr || ip_off >= fde->start_addr + fde->size)
> + return -EINVAL;
> +
> + return 0;
> +}
> +
[...]
> +int sframe_add_section(unsigned long sframe_addr, unsigned long text_start,
> + unsigned long text_end)
> +{
> + struct mm_struct *mm = current->mm;
> + struct vm_area_struct *sframe_vma;
> +
> + mmap_read_lock(mm);
> +
> + sframe_vma = vma_lookup(mm, sframe_addr);
> + if (!sframe_vma)
> + goto err_unlock;
> +
> + if (text_start && text_end) {
> + struct vm_area_struct *text_vma;
> +
> + text_vma = vma_lookup(mm, text_start);
> + if (!(text_vma->vm_flags & VM_EXEC))
> + goto err_unlock;
> +
> + if (PAGE_ALIGN(text_end) != text_vma->vm_end)
> + goto err_unlock;
> + } else {
> + struct vm_area_struct *vma, *text_vma = NULL;
> + VMA_ITERATOR(vmi, mm, 0);
> +
> + for_each_vma(vmi, vma) {
> + if (vma->vm_file != sframe_vma->vm_file ||
> + !(vma->vm_flags & VM_EXEC))
> + continue;
> +
> + if (text_vma) {
> + pr_warn_once("%s[%d]: multiple EXEC segments unsupported\n",
> + current->comm, current->pid);
is this just something that fundamentally can't be supported by SFrame
format? Or just an implementation simplification?
It's not illegal to have an executable with multiple VM_EXEC segments,
no? Should this be a pr_warn_once() then?
> + goto err_unlock;
> + }
> +
> + text_vma = vma;
> + }
> +
> + if (!text_vma)
> + goto err_unlock;
> +
> + text_start = text_vma->vm_start;
> + text_end = text_vma->vm_end;
> + }
> +
> + mmap_read_unlock(mm);
> +
> + return __sframe_add_section(sframe_addr, text_start, text_end);
> +
[...]
Powered by blists - more mailing lists