[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <c967505c-fbdc-4a46-a5b6-d164fe79e2e3@linux.ibm.com>
Date: Wed, 23 Oct 2024 15:59:53 +0200
From: Jens Remus <jremus@...ux.ibm.com>
To: Josh Poimboeuf <jpoimboe@...nel.org>
Cc: Peter Zijlstra <peterz@...radead.org>,
Steven Rostedt <rostedt@...dmis.org>, Ingo Molnar <mingo@...nel.org>,
Arnaldo Carvalho de Melo <acme@...nel.org>,
linux-kernel@...r.kernel.org, Indu Bhagat <indu.bhagat@...cle.com>,
Mark Rutland <mark.rutland@....com>,
Alexander Shishkin <alexander.shishkin@...ux.intel.com>,
Jiri Olsa <jolsa@...nel.org>, Namhyung Kim <namhyung@...nel.org>,
Ian Rogers <irogers@...gle.com>,
Adrian Hunter <adrian.hunter@...el.com>,
linux-perf-users@...r.kernel.org, Mark Brown <broonie@...nel.org>,
linux-toolchains@...r.kernel.org, Jordan Rome <jordalgo@...a.com>,
Sam James <sam@...too.org>, x86@...nel.org,
Heiko Carstens <hca@...ux.ibm.com>,
Ilya Leoshkevich <iii@...ux.ibm.com>,
Vasily Gorbik <gor@...ux.ibm.com>
Subject: Re: [PATCH v2 03/11] unwind: Introduce SFrame user space unwinding
Hello Josh!
On 14.09.2024 01:02, Josh Poimboeuf wrote:
> Some distros have started compiling frame pointers into all their
> packages to enable the kernel to do system-wide profiling of user space.
> Unfortunately that creates a runtime performance penalty across the
> entire system. Using DWARF instead isn't feasible due to the complexity
> it would add to the kernel.
>
> For in-kernel unwinding we solved this problem with the creation of the
> ORC unwinder for x86_64. Similarly, for user space the GNU assembler
> has created the SFrame format starting with binutils 2.41 for SFrame v2.
> SFrame is a simpler version of .eh_frame which gets placed in the
> .sframe section.
>
> Add support for unwinding user space using SFrame.
>
> More information about SFrame can be found here:
>
> - https://lwn.net/Articles/932209/
> - https://lwn.net/Articles/940686/
> - https://sourceware.org/binutils/docs/sframe-spec.html
>
> Signed-off-by: Josh Poimboeuf <jpoimboe@...nel.org>
> ---
> arch/Kconfig | 3 +
> fs/binfmt_elf.c | 47 +++-
> include/linux/mm_types.h | 3 +
> include/linux/sframe.h | 46 ++++
> include/linux/user_unwind.h | 1 +
> include/uapi/linux/elf.h | 1 +
> include/uapi/linux/prctl.h | 3 +
> kernel/fork.c | 10 +
> kernel/sys.c | 11 +
> kernel/unwind/Makefile | 1 +
> kernel/unwind/sframe.c | 420 ++++++++++++++++++++++++++++++++++++
> kernel/unwind/sframe.h | 215 ++++++++++++++++++
> kernel/unwind/user.c | 14 ++
> mm/init-mm.c | 4 +-
> 14 files changed, 774 insertions(+), 5 deletions(-)
> create mode 100644 include/linux/sframe.h
> create mode 100644 kernel/unwind/sframe.c
> create mode 100644 kernel/unwind/sframe.h
>
> diff --git a/arch/Kconfig b/arch/Kconfig
> index b1002b2da331..ff5d5bc5f947 100644
> --- a/arch/Kconfig
> +++ b/arch/Kconfig
> @@ -428,6 +428,9 @@ config HAVE_HARDLOCKUP_DETECTOR_ARCH
> config HAVE_USER_UNWIND
> bool
>
> +config HAVE_USER_UNWIND_SFRAME
> + bool
As far as I can see, the SFrame user space unwinder depends on your
generic user space unwind framework. Does this warrant adding:
depends on HAVE_USER_UNWIND
> +
> config HAVE_PERF_REGS
> bool
> help
> diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
> index 19fa49cd9907..923aed390f2e 100644
> --- a/fs/binfmt_elf.c
> +++ b/fs/binfmt_elf.c
> @@ -47,6 +47,7 @@
> #include <linux/dax.h>
> #include <linux/uaccess.h>
> #include <linux/rseq.h>
> +#include <linux/sframe.h>
> #include <asm/param.h>
> #include <asm/page.h>
>
> @@ -633,11 +634,13 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
> unsigned long no_base, struct elf_phdr *interp_elf_phdata,
> struct arch_elf_state *arch_state)
> {
> - struct elf_phdr *eppnt;
> + struct elf_phdr *eppnt, *sframe_phdr = NULL;
> unsigned long load_addr = 0;
> int load_addr_set = 0;
> unsigned long error = ~0UL;
> unsigned long total_size;
> + unsigned long start_code = ~0UL;
> + unsigned long end_code = 0;
> int i;
>
> /* First of all, some simple consistency checks */
> @@ -659,7 +662,8 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
>
> eppnt = interp_elf_phdata;
> for (i = 0; i < interp_elf_ex->e_phnum; i++, eppnt++) {
> - if (eppnt->p_type == PT_LOAD) {
> + switch (eppnt->p_type) {
> + case PT_LOAD: {
> int elf_type = MAP_PRIVATE;
> int elf_prot = make_prot(eppnt->p_flags, arch_state,
> true, true);
> @@ -688,7 +692,7 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
> /*
> * Check to see if the section's size will overflow the
> * allowed task size. Note that p_filesz must always be
> - * <= p_memsize so it's only necessary to check p_memsz.
> + * <= p_memsz so it's only necessary to check p_memsz.
> */
> k = load_addr + eppnt->p_vaddr;
> if (BAD_ADDR(k) ||
> @@ -698,7 +702,28 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
> error = -ENOMEM;
> goto out;
> }
> +
> + if ((eppnt->p_flags & PF_X) && k < start_code)
> + start_code = k;
> +
> + if ((eppnt->p_flags & PF_X) && k + eppnt->p_filesz > end_code)
> + end_code = k + eppnt->p_filesz;
> + break;
> }
> + case PT_GNU_SFRAME:
> + sframe_phdr = eppnt;
> + break;
> + }
> + }
> +
> + if (sframe_phdr) {
> + struct sframe_file sfile = {
> + .sframe_addr = load_addr + sframe_phdr->p_vaddr,
> + .text_start = start_code,
> + .text_end = end_code,
> + };
> +
> + __sframe_add_section(&sfile);
> }
>
> error = load_addr;
> @@ -823,7 +848,7 @@ static int load_elf_binary(struct linux_binprm *bprm)
> int first_pt_load = 1;
> unsigned long error;
> struct elf_phdr *elf_ppnt, *elf_phdata, *interp_elf_phdata = NULL;
> - struct elf_phdr *elf_property_phdata = NULL;
> + struct elf_phdr *elf_property_phdata = NULL, *sframe_phdr = NULL;
> unsigned long elf_brk;
> int retval, i;
> unsigned long elf_entry;
> @@ -931,6 +956,10 @@ static int load_elf_binary(struct linux_binprm *bprm)
> executable_stack = EXSTACK_DISABLE_X;
> break;
>
> + case PT_GNU_SFRAME:
> + sframe_phdr = elf_ppnt;
> + break;
> +
> case PT_LOPROC ... PT_HIPROC:
> retval = arch_elf_pt_proc(elf_ex, elf_ppnt,
> bprm->file, false,
> @@ -1316,6 +1345,16 @@ static int load_elf_binary(struct linux_binprm *bprm)
> MAP_FIXED | MAP_PRIVATE, 0);
> }
>
> + if (sframe_phdr) {
> + struct sframe_file sfile = {
> + .sframe_addr = load_bias + sframe_phdr->p_vaddr,
> + .text_start = start_code,
> + .text_end = end_code,
> + };
> +
> + __sframe_add_section(&sfile);
> + }
> +
> regs = current_pt_regs();
> #ifdef ELF_PLAT_INIT
> /*
> diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
> index 485424979254..1aee78cbea33 100644
> --- a/include/linux/mm_types.h
> +++ b/include/linux/mm_types.h
> @@ -1019,6 +1019,9 @@ struct mm_struct {
> #endif
> } lru_gen;
> #endif /* CONFIG_LRU_GEN_WALKS_MMU */
> +#ifdef CONFIG_HAVE_USER_UNWIND_SFRAME
> + struct maple_tree sframe_mt;
> +#endif
> } __randomize_layout;
>
> /*
> diff --git a/include/linux/sframe.h b/include/linux/sframe.h
> new file mode 100644
> index 000000000000..3a44f76929e2
> --- /dev/null
> +++ b/include/linux/sframe.h
> @@ -0,0 +1,46 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +#ifndef _LINUX_SFRAME_H
> +#define _LINUX_SFRAME_H
> +
> +#include <linux/mm_types.h>
> +
> +struct sframe_file {
> + unsigned long sframe_addr, text_start, text_end;
> +};
> +
> +struct user_unwind_frame;
> +
> +#ifdef CONFIG_HAVE_USER_UNWIND_SFRAME
> +
> +#define INIT_MM_SFRAME .sframe_mt = MTREE_INIT(sframe_mt, 0)
> +
> +extern void sframe_free_mm(struct mm_struct *mm);
> +
> +extern int __sframe_add_section(struct sframe_file *file);
> +extern int sframe_add_section(unsigned long sframe_addr, unsigned long text_start, unsigned long text_end);
> +extern int sframe_remove_section(unsigned long sframe_addr);
> +extern int sframe_find(unsigned long ip, struct user_unwind_frame *frame);
> +
> +static inline bool current_has_sframe(void)
> +{
> + struct mm_struct *mm = current->mm;
> +
> + return mm && !mtree_empty(&mm->sframe_mt);
> +}
> +
> +#else /* !CONFIG_HAVE_USER_UNWIND_SFRAME */
> +
> +#define INIT_MM_SFRAME
> +
> +static inline void sframe_free_mm(struct mm_struct *mm) {}
> +
> +static inline int __sframe_add_section(struct sframe_file *file) { return -EINVAL; }
> +static inline int sframe_add_section(unsigned long sframe_addr, unsigned long text_start, unsigned long text_end) { return -EINVAL; }
> +static inline int sframe_remove_section(unsigned long sframe_addr) { return -EINVAL; }
> +static inline int sframe_find(unsigned long ip, struct user_unwind_frame *frame) { return -EINVAL; }
> +
> +static inline bool current_has_sframe(void) { return false; }
> +
> +#endif /* CONFIG_HAVE_USER_UNWIND_SFRAME */
> +
> +#endif /* _LINUX_SFRAME_H */
> diff --git a/include/linux/user_unwind.h b/include/linux/user_unwind.h
> index 0a19ac6c92b2..8003f9d35405 100644
> --- a/include/linux/user_unwind.h
> +++ b/include/linux/user_unwind.h
> @@ -7,6 +7,7 @@
> enum user_unwind_type {
> USER_UNWIND_TYPE_AUTO,
> USER_UNWIND_TYPE_FP,
> + USER_UNWIND_TYPE_SFRAME,
> };
>
> struct user_unwind_frame {
> diff --git a/include/uapi/linux/elf.h b/include/uapi/linux/elf.h
> index b54b313bcf07..b2aca31e1a49 100644
> --- a/include/uapi/linux/elf.h
> +++ b/include/uapi/linux/elf.h
> @@ -39,6 +39,7 @@ typedef __s64 Elf64_Sxword;
> #define PT_GNU_STACK (PT_LOOS + 0x474e551)
> #define PT_GNU_RELRO (PT_LOOS + 0x474e552)
> #define PT_GNU_PROPERTY (PT_LOOS + 0x474e553)
> +#define PT_GNU_SFRAME (PT_LOOS + 0x474e554)
>
>
> /* ARM MTE memory tag segment type */
> diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h
> index 35791791a879..69511077c910 100644
> --- a/include/uapi/linux/prctl.h
> +++ b/include/uapi/linux/prctl.h
> @@ -328,4 +328,7 @@ struct prctl_mm_map {
> # define PR_PPC_DEXCR_CTRL_CLEAR_ONEXEC 0x10 /* Clear the aspect on exec */
> # define PR_PPC_DEXCR_CTRL_MASK 0x1f
>
> +#define PR_ADD_SFRAME 74
> +#define PR_REMOVE_SFRAME 75
> +
> #endif /* _LINUX_PRCTL_H */
> diff --git a/kernel/fork.c b/kernel/fork.c
> index cc760491f201..a216f091edfb 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -104,6 +104,7 @@
> #include <linux/rseq.h>
> #include <uapi/linux/pidfd.h>
> #include <linux/pidfs.h>
> +#include <linux/sframe.h>
>
> #include <asm/pgalloc.h>
> #include <linux/uaccess.h>
> @@ -923,6 +924,7 @@ void __mmdrop(struct mm_struct *mm)
> mm_pasid_drop(mm);
> mm_destroy_cid(mm);
> percpu_counter_destroy_many(mm->rss_stat, NR_MM_COUNTERS);
> + sframe_free_mm(mm);
>
> free_mm(mm);
> }
> @@ -1249,6 +1251,13 @@ static void mm_init_uprobes_state(struct mm_struct *mm)
> #endif
> }
>
> +static void mm_init_sframe(struct mm_struct *mm)
> +{
> +#ifdef CONFIG_HAVE_USER_UNWIND_SFRAME
> + mt_init(&mm->sframe_mt);
> +#endif
> +}
> +
> static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
> struct user_namespace *user_ns)
> {
> @@ -1280,6 +1289,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
> mm->pmd_huge_pte = NULL;
> #endif
> mm_init_uprobes_state(mm);
> + mm_init_sframe(mm);
> hugetlb_count_init(mm);
>
> if (current->mm) {
> diff --git a/kernel/sys.c b/kernel/sys.c
> index 3a2df1bd9f64..e4d2b64f4ae4 100644
> --- a/kernel/sys.c
> +++ b/kernel/sys.c
> @@ -64,6 +64,7 @@
> #include <linux/rcupdate.h>
> #include <linux/uidgid.h>
> #include <linux/cred.h>
> +#include <linux/sframe.h>
>
> #include <linux/nospec.h>
>
> @@ -2782,6 +2783,16 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
> case PR_RISCV_SET_ICACHE_FLUSH_CTX:
> error = RISCV_SET_ICACHE_FLUSH_CTX(arg2, arg3);
> break;
> + case PR_ADD_SFRAME:
> + if (arg5)
> + return -EINVAL;
> + error = sframe_add_section(arg2, arg3, arg4);
> + break;
> + case PR_REMOVE_SFRAME:
> + if (arg3 || arg4 || arg5)
> + return -EINVAL;
> + error = sframe_remove_section(arg2);
> + break;
> default:
> error = -EINVAL;
> break;
> diff --git a/kernel/unwind/Makefile b/kernel/unwind/Makefile
> index eb466d6a3295..6f202c5840cf 100644
> --- a/kernel/unwind/Makefile
> +++ b/kernel/unwind/Makefile
> @@ -1 +1,2 @@
> obj-$(CONFIG_HAVE_USER_UNWIND) += user.o
> +obj-$(CONFIG_HAVE_USER_UNWIND_SFRAME) += sframe.o
> diff --git a/kernel/unwind/sframe.c b/kernel/unwind/sframe.c
> new file mode 100644
> index 000000000000..3e4d29e737a1
> --- /dev/null
> +++ b/kernel/unwind/sframe.c
> @@ -0,0 +1,420 @@
> +// SPDX-License-Identifier: GPL-2.0
> +
> +#include <linux/sched.h>
> +#include <linux/slab.h>
> +#include <linux/srcu.h>
> +#include <linux/uaccess.h>
> +#include <linux/mm.h>
> +#include <linux/sframe.h>
> +#include <linux/user_unwind.h>
> +
> +#include "sframe.h"
> +
> +#define SFRAME_FILENAME_LEN 32
> +
> +struct sframe_section {
> + struct rcu_head rcu;
> +
> + unsigned long sframe_addr;
> + unsigned long text_addr;
> +
> + unsigned long fdes_addr;
> + unsigned long fres_addr;
> + unsigned int fdes_nr;
> + signed char ra_off, fp_off;
> +};
> +
> +DEFINE_STATIC_SRCU(sframe_srcu);
> +
> +#define __SFRAME_GET_USER(out, user_ptr, type) \
> +({ \
> + type __tmp; \
> + if (get_user(__tmp, (type *)user_ptr)) \
> + return -EFAULT; \
> + user_ptr += sizeof(__tmp); \
> + out = __tmp; \
> +})
> +
> +#define SFRAME_GET_USER_SIGNED(out, user_ptr, size) \
> +({ \
> + switch (size) { \
> + case 1: \
> + __SFRAME_GET_USER(out, user_ptr, s8); \
> + break; \
> + case 2: \
> + __SFRAME_GET_USER(out, user_ptr, s16); \
> + break; \
> + case 4: \
> + __SFRAME_GET_USER(out, user_ptr, s32); \
> + break; \
> + default: \
> + return -EINVAL; \
> + } \
> +})
> +
> +#define SFRAME_GET_USER_UNSIGNED(out, user_ptr, size) \
> +({ \
> + switch (size) { \
> + case 1: \
> + __SFRAME_GET_USER(out, user_ptr, u8); \
> + break; \
> + case 2: \
> + __SFRAME_GET_USER(out, user_ptr, u16); \
> + break; \
> + case 4: \
> + __SFRAME_GET_USER(out, user_ptr, u32); \
> + break; \
> + default: \
> + return -EINVAL; \
> + } \
> +})
> +
> +static unsigned char fre_type_to_size(unsigned char fre_type)
> +{
> + if (fre_type > 2)
> + return 0;
> + return 1 << fre_type;
> +}
> +
> +static unsigned char offset_size_enum_to_size(unsigned char off_size)
> +{
> + if (off_size > 2)
> + return 0;
> + return 1 << off_size;
> +}
> +
> +static int find_fde(struct sframe_section *sec, unsigned long ip,
> + struct sframe_fde *fde)
> +{
> + s32 func_off, ip_off;
> + struct sframe_fde __user *first, *last, *mid, *found;
> +
> + ip_off = ip - sec->sframe_addr;
> +
> + first = (void *)sec->fdes_addr;
> + last = first + sec->fdes_nr;
> + while (first <= last) {
> + mid = first + ((last - first) / 2);
> + if (get_user(func_off, (s32 *)mid))
> + return -EFAULT;
> + if (ip_off >= func_off) {
> + found = mid;
> + first = mid + 1;
> + } else
> + last = mid - 1;
> + }
> +
> + if (!found)
> + return -EINVAL;
> +
> + if (copy_from_user(fde, found, sizeof(*fde)))
> + return -EFAULT;
> +
> + return 0;
> +}
> +
> +static int find_fre(struct sframe_section *sec, struct sframe_fde *fde,
> + unsigned long ip, struct user_unwind_frame *frame)
> +{
> + unsigned char fde_type = SFRAME_FUNC_FDE_TYPE(fde->info);
> + unsigned char fre_type = SFRAME_FUNC_FRE_TYPE(fde->info);
> + s32 fre_ip_off, cfa_off, ra_off, fp_off, ip_off;
Doesn't fre_ip_off need to be u32 (see also below)? The SFrame format
specification states the FRE sfre_start_address is either u8, u16, or u32:
https://sourceware.org/binutils/docs/sframe-spec.html#The-SFrame-FRE-Types
> + unsigned char offset_count, offset_size;
> + unsigned char addr_size;
> + void __user *f, *last_f;
> + u8 fre_info;
> + int i;
> +
> + addr_size = fre_type_to_size(fre_type);
> + if (!addr_size)
> + return -EINVAL;
> +
> + ip_off = ip - sec->sframe_addr - fde->start_addr;
> +
> + f = (void *)sec->fres_addr + fde->fres_off;
> +
> + for (i = 0; i < fde->fres_num; i++) {
> +
> + SFRAME_GET_USER_UNSIGNED(fre_ip_off, f, addr_size);
You already use SFRAME_GET_USER_UNSIGNED() to read it.
> +
> + if (fde_type == SFRAME_FDE_TYPE_PCINC) {
> + if (fre_ip_off > ip_off)
> + break;
> + } else {
> + /* SFRAME_FDE_TYPE_PCMASK */
> + if (ip_off % fde->rep_size < fre_ip_off)
> + break;
> + }
> +
> + SFRAME_GET_USER_UNSIGNED(fre_info, f, 1);
> +
> + offset_count = SFRAME_FRE_OFFSET_COUNT(fre_info);
> + offset_size = offset_size_enum_to_size(SFRAME_FRE_OFFSET_SIZE(fre_info));
> +
> + if (!offset_count || !offset_size)
> + return -EINVAL;
> +
> + last_f = f;
> + f += offset_count * offset_size;
> + }
> +
> + if (!last_f)
> + return -EINVAL;
> +
> + f = last_f;
> +
> + SFRAME_GET_USER_UNSIGNED(cfa_off, f, offset_size);
As far as I know, the CFA offset from the CFA base register is signed in
the SFrame file format. See Binutils include/sframe-api.h,
sframe_fre_get_cfa_offset(). Therefore use SFRAME_GET_USER_SIGNED().
Both cfa_off and struct user_unwind_frame cfa_off are already defined as
s32.
> + offset_count--;
> +
> + ra_off = sec->ra_off;
> + if (!ra_off) {
On s390 there is no fixed RA offset from CFA ...
> + if (!offset_count--)
> + return -EINVAL;
... and the RA is not necessarily saved (e.g. at function entry).
But we can address this when sending patches for s390 support.
> + SFRAME_GET_USER_SIGNED(ra_off, f, offset_size);
> + }
> +
> + fp_off = sec->fp_off;
> + if (!fp_off && offset_count) {
> + offset_count--;
> + SFRAME_GET_USER_SIGNED(fp_off, f, offset_size);
> + }
> +
> + if (offset_count)
> + return -EINVAL;
> +
> + frame->cfa_off = cfa_off;
> + frame->ra_off = ra_off;
> + frame->fp_off = fp_off;
> + frame->use_fp = SFRAME_FRE_CFA_BASE_REG_ID(fre_info) == SFRAME_BASE_REG_FP;
> +
> + return 0;
> +}
> +
> +int sframe_find(unsigned long ip, struct user_unwind_frame *frame)
> +{
> + struct mm_struct *mm = current->mm;
> + struct sframe_section *sec;
> + struct sframe_fde fde;
> + int srcu_idx;
> + int ret = -EINVAL;
> +
> + srcu_idx = srcu_read_lock(&sframe_srcu);
> +
> + sec = mtree_load(&mm->sframe_mt, ip);
> + if (!sec) {
> + srcu_read_unlock(&sframe_srcu, srcu_idx);
> + return -EINVAL;
> + }
> +
> +
> + ret = find_fde(sec, ip, &fde);
> + if (ret)
> + goto err_unlock;
> +
> + ret = find_fre(sec, &fde, ip, frame);
> + if (ret)
> + goto err_unlock;
> +
> + srcu_read_unlock(&sframe_srcu, srcu_idx);
> + return 0;
> +
> +err_unlock:
> + srcu_read_unlock(&sframe_srcu, srcu_idx);
> + return ret;
> +}
> +
> +static int get_sframe_file(unsigned long sframe_addr, struct sframe_file *file)
> +{
> + struct mm_struct *mm = current->mm;
> + struct vm_area_struct *sframe_vma, *text_vma, *vma;
> + VMA_ITERATOR(vmi, mm, 0);
> +
> + mmap_read_lock(mm);
> +
> + sframe_vma = vma_lookup(mm, sframe_addr);
> + if (!sframe_vma || !sframe_vma->vm_file)
> + goto err_unlock;
> +
> + text_vma = NULL;
> +
> + for_each_vma(vmi, vma) {
> + if (vma->vm_file != sframe_vma->vm_file)
> + continue;
> + if (vma->vm_flags & VM_EXEC) {
> + if (text_vma) {
> + /*
> + * Multiple EXEC segments in a single file
> + * aren't currently supported, is that a thing?
> + */
> + mmap_read_unlock(mm);
> + pr_warn_once("unsupported multiple EXEC segments in task %s[%d]\n",
> + current->comm, current->pid);
> + return -EINVAL;
> + }
> + text_vma = vma;
> + }
> + }
> +
> + file->sframe_addr = sframe_addr;
> + file->text_start = text_vma->vm_start;
> + file->text_end = text_vma->vm_end;
> +
> + mmap_read_unlock(mm);
> + return 0;
> +
> +err_unlock:
> + mmap_read_unlock(mm);
> + return -EINVAL;
> +}
> +
> +static int validate_sframe_addrs(struct sframe_file *file)
> +{
> + struct mm_struct *mm = current->mm;
> + struct vm_area_struct *text_vma;
> +
> + mmap_read_lock(mm);
> +
> + if (!vma_lookup(mm, file->sframe_addr))
> + goto err_unlock;
> +
> + text_vma = vma_lookup(mm, file->text_start);
> + if (!(text_vma->vm_flags & VM_EXEC))
> + goto err_unlock;
> +
> + if (vma_lookup(mm, file->text_end-1) != text_vma)
> + goto err_unlock;
> +
> + mmap_read_unlock(mm);
> + return 0;
> +
> +err_unlock:
> + mmap_read_unlock(mm);
> + return -EINVAL;
> +}
> +
> +int __sframe_add_section(struct sframe_file *file)
> +{
> + struct maple_tree *sframe_mt = &current->mm->sframe_mt;
> + struct sframe_section *sec;
> + struct sframe_header shdr;
> + unsigned long header_end;
> + int ret;
> +
> + if (copy_from_user(&shdr, (void *)file->sframe_addr, sizeof(shdr)))
> + return -EFAULT;
> +
> + if (shdr.preamble.magic != SFRAME_MAGIC ||
> + shdr.preamble.version != SFRAME_VERSION_2 ||
> + !(shdr.preamble.flags & SFRAME_F_FDE_SORTED) ||
> + shdr.auxhdr_len || !shdr.num_fdes || !shdr.num_fres ||
> + shdr.fdes_off > shdr.fres_off) {
> + /*
> + * Either binutils < 2.41, corrupt sframe header, or
> + * unsupported feature.
> + * */
> + pr_warn_once("bad sframe header in task %s[%d]\n",
> + current->comm, current->pid);
> + return -EINVAL;
> + }
> +
> + header_end = file->sframe_addr + SFRAME_HDR_SIZE(shdr);
> +
> + sec = kmalloc(sizeof(*sec), GFP_KERNEL);
> + if (!sec)
> + return -ENOMEM;
> +
> + sec->sframe_addr = file->sframe_addr;
> + sec->text_addr = file->text_start;
> + sec->fdes_addr = header_end + shdr.fdes_off;
> + sec->fres_addr = header_end + shdr.fres_off;
> + sec->fdes_nr = shdr.num_fdes;
> + sec->ra_off = shdr.cfa_fixed_ra_offset;
> + sec->fp_off = shdr.cfa_fixed_fp_offset;
> +
> + ret = mtree_insert_range(sframe_mt, file->text_start, file->text_end,
> + sec, GFP_KERNEL);
> + if (ret) {
> + kfree(sec);
> + return ret;
> + }
> +
> + return 0;
> +}
> +
> +int sframe_add_section(unsigned long sframe_addr, unsigned long text_start, unsigned long text_end)
> +{
> + struct sframe_file file;
> + int ret;
> +
> + if (!text_start || !text_end) {
> + ret = get_sframe_file(sframe_addr, &file);
> + if (ret)
> + return ret;
> + } else {
> + /*
> + * This is mainly for generated code, for which the text isn't
> + * file-backed so the user has to give the text bounds.
> + */
> + file.sframe_addr = sframe_addr;
> + file.text_start = text_start;
> + file.text_end = text_end;
> + ret = validate_sframe_addrs(&file);
> + if (ret)
> + return ret;
> + }
> +
> + return __sframe_add_section(&file);
> +}
> +
> +static void sframe_free_rcu(struct rcu_head *rcu)
> +{
> + struct sframe_section *sec = container_of(rcu, struct sframe_section, rcu);
> +
> + kfree(sec);
> +}
> +
> +static int __sframe_remove_section(struct mm_struct *mm,
> + struct sframe_section *sec)
> +{
> + struct sframe_section *s;
> +
> + s = mtree_erase(&mm->sframe_mt, sec->text_addr);
> + if (!s || WARN_ON_ONCE(s != sec))
> + return -EINVAL;
> +
> + call_srcu(&sframe_srcu, &sec->rcu, sframe_free_rcu);
> +
> + return 0;
> +}
> +
> +int sframe_remove_section(unsigned long sframe_addr)
> +{
> + struct mm_struct *mm = current->mm;
> + struct sframe_section *sec;
> + unsigned long index = 0;
> +
> + sec = mtree_load(&mm->sframe_mt, sframe_addr);
> + if (!sec)
> + return -EINVAL;
> +
> + mt_for_each(&mm->sframe_mt, sec, index, ULONG_MAX) {
> + if (sec->sframe_addr == sframe_addr)
> + return __sframe_remove_section(mm, sec);
> + }
> +
> + return -EINVAL;
> +}
> +
> +void sframe_free_mm(struct mm_struct *mm)
> +{
> + struct sframe_section *sec;
> + unsigned long index = 0;
> +
> + if (!mm)
> + return;
> +
> + mt_for_each(&mm->sframe_mt, sec, index, ULONG_MAX)
> + kfree(sec);
> +
> + mtree_destroy(&mm->sframe_mt);
> +}
> diff --git a/kernel/unwind/sframe.h b/kernel/unwind/sframe.h
> new file mode 100644
> index 000000000000..aa468d6f1f4a
> --- /dev/null
> +++ b/kernel/unwind/sframe.h
> @@ -0,0 +1,215 @@
> +/* SPDX-License-Identifier: GPL-2.0-or-later */
> +/*
> + * Copyright (C) 2023, Oracle and/or its affiliates.
> + *
> + * This file contains definitions for the SFrame stack tracing format, which is
> + * documented at https://sourceware.org/binutils/docs
> + */
> +#ifndef _SFRAME_H
> +#define _SFRAME_H
> +
> +#include <linux/types.h>
> +
> +#define SFRAME_VERSION_1 1
> +#define SFRAME_VERSION_2 2
> +#define SFRAME_MAGIC 0xdee2
> +
> +/* Function Descriptor Entries are sorted on PC. */
> +#define SFRAME_F_FDE_SORTED 0x1
> +/* Frame-pointer based stack tracing. Defined, but not set. */
> +#define SFRAME_F_FRAME_POINTER 0x2
> +
> +#define SFRAME_CFA_FIXED_FP_INVALID 0
> +#define SFRAME_CFA_FIXED_RA_INVALID 0
> +
> +/* Supported ABIs/Arch. */
> +#define SFRAME_ABI_AARCH64_ENDIAN_BIG 1 /* AARCH64 big endian. */
> +#define SFRAME_ABI_AARCH64_ENDIAN_LITTLE 2 /* AARCH64 little endian. */
> +#define SFRAME_ABI_AMD64_ENDIAN_LITTLE 3 /* AMD64 little endian. */
> +
> +/* SFrame FRE types. */
> +#define SFRAME_FRE_TYPE_ADDR1 0
> +#define SFRAME_FRE_TYPE_ADDR2 1
> +#define SFRAME_FRE_TYPE_ADDR4 2
> +
> +/*
> + * SFrame Function Descriptor Entry types.
> + *
> + * The SFrame format has two possible representations for functions. The
> + * choice of which type to use is made according to the instruction patterns
> + * in the relevant program stub.
> + */
> +
> +/* Unwinders perform a (PC >= FRE_START_ADDR) to look up a matching FRE. */
> +#define SFRAME_FDE_TYPE_PCINC 0
> +/*
> + * Unwinders perform a (PC & FRE_START_ADDR_AS_MASK >= FRE_START_ADDR_AS_MASK)
> + * to look up a matching FRE. Typical usecases are pltN entries, trampolines
> + * etc.
> + */
> +#define SFRAME_FDE_TYPE_PCMASK 1
> +
> +/**
> + * struct sframe_preamble - SFrame Preamble.
> + * @magic: Magic number (SFRAME_MAGIC).
> + * @version: Format version number (SFRAME_VERSION).
> + * @flags: Various flags.
> + */
> +struct sframe_preamble {
> + u16 magic;
> + u8 version;
> + u8 flags;
> +} __packed;
> +
> +/**
> + * struct sframe_header - SFrame Header.
> + * @preamble: SFrame preamble.
> + * @abi_arch: Identify the arch (including endianness) and ABI.
> + * @cfa_fixed_fp_offset: Offset for the Frame Pointer (FP) from CFA may be
> + * fixed for some ABIs ((e.g, in AMD64 when -fno-omit-frame-pointer is
Nit: Two consecutive spaces in "fixed for".
> + * used). When fixed, this field specifies the fixed stack frame offset
> + * and the individual FREs do not need to track it. When not fixed, it
> + * is set to SFRAME_CFA_FIXED_FP_INVALID, and the individual FREs may
> + * provide the applicable stack frame offset, if any.
> + * @cfa_fixed_ra_offset: Offset for the Return Address from CFA is fixed for
> + * some ABIs. When fixed, this field specifies the fixed stack frame
> + * offset and the individual FREs do not need to track it. When not
> + * fixed, it is set to SFRAME_CFA_FIXED_FP_INVALID.
> + * @auxhdr_len: Number of bytes making up the auxiliary header, if any.
> + * Some ABI/arch, in the future, may use this space for extending the
> + * information in SFrame header. Auxiliary header is contained in bytes
> + * sequentially following the sframe_header.
> + * @num_fdes: Number of SFrame FDEs in this SFrame section.
> + * @num_fres: Number of SFrame Frame Row Entries.
> + * @fre_len: Number of bytes in the SFrame Frame Row Entry section.
> + * @fdes_off: Offset of SFrame Function Descriptor Entry section.
> + * @fres_off: Offset of SFrame Frame Row Entry section.
> + */
> +struct sframe_header {
> + struct sframe_preamble preamble;
> + u8 abi_arch;
> + s8 cfa_fixed_fp_offset;
> + s8 cfa_fixed_ra_offset;
> + u8 auxhdr_len;
> + u32 num_fdes;
> + u32 num_fres;
> + u32 fre_len;
> + u32 fdes_off;
> + u32 fres_off;
> +} __packed;
> +
> +#define SFRAME_HDR_SIZE(sframe_hdr) \
> + ((sizeof(struct sframe_header) + (sframe_hdr).auxhdr_len))
> +
> +/* Two possible keys for executable (instruction) pointers signing. */
> +#define SFRAME_AARCH64_PAUTH_KEY_A 0 /* Key A. */
> +#define SFRAME_AARCH64_PAUTH_KEY_B 1 /* Key B. */
> +
> +/**
> + * struct sframe_fde - SFrame Function Descriptor Entry.
> + * @start_addr: Function start address. Encoded as a signed offset,
> + * relative to the current FDE.
> + * @size: Size of the function in bytes.
> + * @fres_off: Offset of the first SFrame Frame Row Entry of the function,
> + * relative to the beginning of the SFrame Frame Row Entry sub-section.
> + * @fres_num: Number of frame row entries for the function.
> + * @info: Additional information for deciphering the stack trace
> + * information for the function. Contains information about SFrame FRE
> + * type, SFrame FDE type, PAC authorization A/B key, etc.
> + * @rep_size: Block size for SFRAME_FDE_TYPE_PCMASK
> + * @padding: Unused
> + */
> +struct sframe_fde {
> + s32 start_addr;
> + u32 size;
> + u32 fres_off;
> + u32 fres_num;
> + u8 info;
> + u8 rep_size;
> + u16 padding;
> +} __packed;
> +
> +/*
> + * 'func_info' in SFrame FDE contains additional information for deciphering
> + * the stack trace information for the function. In V1, the information is
> + * organized as follows:
> + * - 4-bits: Identify the FRE type used for the function.
> + * - 1-bit: Identify the FDE type of the function - mask or inc.
> + * - 1-bit: PAC authorization A/B key (aarch64).
> + * - 2-bits: Unused.
> + * ---------------------------------------------------------------------
> + * | Unused | PAC auth A/B key (aarch64) | FDE type | FRE type |
> + * | | Unused (amd64) | | |
> + * ---------------------------------------------------------------------
> + * 8 6 5 4 0
> + */
> +
> +/* Note: Set PAC auth key to SFRAME_AARCH64_PAUTH_KEY_A by default. */
> +#define SFRAME_FUNC_INFO(fde_type, fre_enc_type) \
> + (((SFRAME_AARCH64_PAUTH_KEY_A & 0x1) << 5) | \
> + (((fde_type) & 0x1) << 4) | ((fre_enc_type) & 0xf))
> +
> +#define SFRAME_FUNC_FRE_TYPE(data) ((data) & 0xf)
> +#define SFRAME_FUNC_FDE_TYPE(data) (((data) >> 4) & 0x1)
> +#define SFRAME_FUNC_PAUTH_KEY(data) (((data) >> 5) & 0x1)
> +
> +/*
> + * Size of stack frame offsets in an SFrame Frame Row Entry. A single
> + * SFrame FRE has all offsets of the same size. Offset size may vary
> + * across frame row entries.
> + */
> +#define SFRAME_FRE_OFFSET_1B 0
> +#define SFRAME_FRE_OFFSET_2B 1
> +#define SFRAME_FRE_OFFSET_4B 2
> +
> +/* An SFrame Frame Row Entry can be SP or FP based. */
> +#define SFRAME_BASE_REG_FP 0
> +#define SFRAME_BASE_REG_SP 1
> +
> +/*
> + * The index at which a specific offset is presented in the variable length
> + * bytes of an FRE.
> + */
> +#define SFRAME_FRE_CFA_OFFSET_IDX 0
> +/*
> + * The RA stack offset, if present, will always be at index 1 in the variable
> + * length bytes of the FRE.
> + */
> +#define SFRAME_FRE_RA_OFFSET_IDX 1
> +/*
> + * The FP stack offset may appear at offset 1 or 2, depending on the ABI as RA
> + * may or may not be tracked.
> + */
> +#define SFRAME_FRE_FP_OFFSET_IDX 2
> +
> +/*
> + * 'fre_info' in SFrame FRE contains information about:
> + * - 1 bit: base reg for CFA
> + * - 4 bits: Number of offsets (N). A value of up to 3 is allowed to track
> + * all three of CFA, FP and RA (fixed implicit order).
> + * - 2 bits: information about size of the offsets (S) in bytes.
> + * Valid values are SFRAME_FRE_OFFSET_1B, SFRAME_FRE_OFFSET_2B,
> + * SFRAME_FRE_OFFSET_4B
> + * - 1 bit: Mangled RA state bit (aarch64 only).
> + * ---------------------------------------------------------------
> + * | Mangled-RA (aarch64) | Size of | Number of | base_reg |
> + * | Unused (amd64) | offsets | offsets | |
> + * ---------------------------------------------------------------
> + * 8 7 5 1 0
> + */
> +
> +/* Note: Set mangled_ra_p to zero by default. */
> +#define SFRAME_FRE_INFO(base_reg_id, offset_num, offset_size) \
> + (((0 & 0x1) << 7) | (((offset_size) & 0x3) << 5) | \
> + (((offset_num) & 0xf) << 1) | ((base_reg_id) & 0x1))
> +
> +/* Set the mangled_ra_p bit as indicated. */
> +#define SFRAME_FRE_INFO_UPDATE_MANGLED_RA_P(mangled_ra_p, fre_info) \
> + ((((mangled_ra_p) & 0x1) << 7) | ((fre_info) & 0x7f))
> +
> +#define SFRAME_FRE_CFA_BASE_REG_ID(data) ((data) & 0x1)
> +#define SFRAME_FRE_OFFSET_COUNT(data) (((data) >> 1) & 0xf)
> +#define SFRAME_FRE_OFFSET_SIZE(data) (((data) >> 5) & 0x3)
> +#define SFRAME_FRE_MANGLED_RA_P(data) (((data) >> 7) & 0x1)
> +
> +#endif /* _SFRAME_H */
> diff --git a/kernel/unwind/user.c b/kernel/unwind/user.c
> index 5d16f9604a61..3a7b14cf522b 100644
> --- a/kernel/unwind/user.c
> +++ b/kernel/unwind/user.c
> @@ -8,6 +8,7 @@
> #include <linux/sched.h>
> #include <linux/sched/task_stack.h>
> #include <linux/user_unwind.h>
> +#include <linux/sframe.h>
> #include <linux/uaccess.h>
> #include <asm/user_unwind.h>
>
> @@ -29,6 +30,11 @@ int user_unwind_next(struct user_unwind_state *state)
> case USER_UNWIND_TYPE_FP:
> frame = &fp_frame;
> break;
> + case USER_UNWIND_TYPE_SFRAME:
> + ret = sframe_find(state->ip, frame);
> + if (ret)
> + goto the_end;
> + break;
> default:
> BUG();
> }
> @@ -57,6 +63,7 @@ int user_unwind_start(struct user_unwind_state *state,
> enum user_unwind_type type)
> {
> struct pt_regs *regs = task_pt_regs(current);
> + bool sframe_possible = current_has_sframe();
>
> memset(state, 0, sizeof(*state));
>
> @@ -67,6 +74,13 @@ int user_unwind_start(struct user_unwind_state *state,
>
> switch (type) {
> case USER_UNWIND_TYPE_AUTO:
> + state->type = sframe_possible ? USER_UNWIND_TYPE_SFRAME :
> + USER_UNWIND_TYPE_FP;
> + break;
> + case USER_UNWIND_TYPE_SFRAME:
> + if (!sframe_possible)
> + return -EINVAL;
> + break;
> case USER_UNWIND_TYPE_FP:
> break;
> default:
> diff --git a/mm/init-mm.c b/mm/init-mm.c
> index 24c809379274..c4c6af046778 100644
> --- a/mm/init-mm.c
> +++ b/mm/init-mm.c
> @@ -11,6 +11,7 @@
> #include <linux/atomic.h>
> #include <linux/user_namespace.h>
> #include <linux/iommu.h>
> +#include <linux/sframe.h>
> #include <asm/mmu.h>
>
> #ifndef INIT_MM_CONTEXT
> @@ -44,7 +45,8 @@ struct mm_struct init_mm = {
> #endif
> .user_ns = &init_user_ns,
> .cpu_bitmap = CPU_BITS_NONE,
> - INIT_MM_CONTEXT(init_mm)
> + INIT_MM_CONTEXT(init_mm),
> + INIT_MM_SFRAME,
This does not compile on s390, as INIT_MM_CONTEXT() is defined with a
trailing comma, leading to two consecutive commas. Already reported by
the kernel test robot for other architectures.
Likewise, if INIT_MM_SFRAME expands to nothing, there would be two
consecutive commas.
> };
>
> void setup_initial_init_mm(void *start_code, void *end_code,
Regards,
Jens
--
Jens Remus
Linux on Z Development (D3303) and z/VSE Support
+49-7031-16-1128 Office
jremus@...ibm.com
IBM
IBM Deutschland Research & Development GmbH; Vorsitzender des
Aufsichtsrats: Wolfgang Wendt; Geschäftsführung: David Faller; Sitz der
Gesellschaft: Böblingen; Registergericht: Amtsgericht Stuttgart, HRB 243294
IBM Data Privacy Statement: https://www.ibm.com/privacy/
Powered by blists - more mailing lists