[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <24e46e1c-e9fa-4d44-97f2-068bda6e54b4@oracle.com>
Date: Thu, 9 Nov 2023 11:31:59 -0800
From: Indu Bhagat <indu.bhagat@...cle.com>
To: Josh Poimboeuf <jpoimboe@...nel.org>,
Peter Zijlstra <peterz@...radead.org>,
Steven Rostedt <rostedt@...dmis.org>,
Ingo Molnar <mingo@...nel.org>,
Arnaldo Carvalho de Melo <acme@...nel.org>
Cc: linux-kernel@...r.kernel.org, x86@...nel.org,
Mark Rutland <mark.rutland@....com>,
Alexander Shishkin <alexander.shishkin@...ux.intel.com>,
Jiri Olsa <jolsa@...nel.org>,
Namhyung Kim <namhyung@...nel.org>,
Ian Rogers <irogers@...gle.com>,
Adrian Hunter <adrian.hunter@...el.com>,
linux-perf-users@...r.kernel.org, Mark Brown <broonie@...nel.org>,
linux-toolchains@...r.kernel.org
Subject: Re: [PATCH RFC 09/10] unwind: Introduce SFrame user space unwinding
On 11/8/23 16:41, Josh Poimboeuf wrote:
> Some distros have started compiling frame pointers into all their
> packages to enable the kernel to do system-wide profiling of user space.
> Unfortunately that creates a runtime performance penalty across the
> entire system. Using DWARF instead isn't feasible due to complexity and
> slowness.
>
> For in-kernel unwinding we solved this problem with the creation of the
> ORC unwinder for x86_64. Similarly, for user space the GNU assembler
> has created the SFrame format starting with binutils 2.40. SFrame is a
> simpler version of .eh_frame which gets placed in the .sframe section.
>
> Add support for unwinding user space using SFrame.
>
> More information about SFrame can be found here:
>
> - https://lwn.net/Articles/932209/
> - https://lwn.net/Articles/940686/
> - https://sourceware.org/binutils/docs/sframe-spec.html
>
> Signed-off-by: Josh Poimboeuf <jpoimboe@...nel.org>
> ---
> arch/Kconfig | 3 +
> arch/x86/include/asm/mmu.h | 2 +-
> fs/binfmt_elf.c | 46 +++-
> include/linux/mm_types.h | 3 +
> include/linux/sframe.h | 46 ++++
> include/linux/user_unwind.h | 1 +
> include/uapi/linux/elf.h | 1 +
> include/uapi/linux/prctl.h | 3 +
> kernel/fork.c | 10 +
> kernel/sys.c | 11 +
> kernel/unwind/Makefile | 1 +
> kernel/unwind/sframe.c | 414 ++++++++++++++++++++++++++++++++++++
> kernel/unwind/sframe.h | 217 +++++++++++++++++++
> kernel/unwind/user.c | 15 +-
> mm/init-mm.c | 2 +
> 15 files changed, 768 insertions(+), 7 deletions(-)
> create mode 100644 include/linux/sframe.h
> create mode 100644 kernel/unwind/sframe.c
> create mode 100644 kernel/unwind/sframe.h
>
Thanks for working on this.
> diff --git a/kernel/unwind/sframe.c b/kernel/unwind/sframe.c
> new file mode 100644
> index 000000000000..b167c19497e5
> --- /dev/null
> +++ b/kernel/unwind/sframe.c
> @@ -0,0 +1,414 @@
> +// SPDX-License-Identifier: GPL-2.0
> +
> +#include <linux/sched.h>
> +#include <linux/slab.h>
> +#include <linux/srcu.h>
> +#include <linux/uaccess.h>
> +#include <linux/mm.h>
> +#include <linux/sframe.h>
> +#include <linux/user_unwind.h>
> +
> +#include "sframe.h"
> +
> +#define SFRAME_FILENAME_LEN 32
> +
> +struct sframe_section {
> + struct rcu_head rcu;
> +
> + unsigned long sframe_addr;
> + unsigned long text_addr;
> +
> + unsigned long fdes_addr;
> + unsigned long fres_addr;
> + unsigned int fdes_num;
> + signed char ra_off, fp_off;
> +};
> +
> +DEFINE_STATIC_SRCU(sframe_srcu);
> +
> +#define __SFRAME_GET_USER(out, user_ptr, type) \
> +({ \
> + type __tmp; \
> + if (get_user(__tmp, (type *)user_ptr)) \
> + return -EFAULT; \
> + user_ptr += sizeof(__tmp); \
> + out = __tmp; \
> +})
> +
> +#define SFRAME_GET_USER_SIGNED(out, user_ptr, size) \
> +({ \
> + switch (size) { \
> + case 1: \
> + __SFRAME_GET_USER(out, user_ptr, s8); \
> + break; \
> + case 2: \
> + __SFRAME_GET_USER(out, user_ptr, s16); \
> + break; \
> + case 4: \
> + __SFRAME_GET_USER(out, user_ptr, s32); \
> + break; \
> + default: \
> + return -EINVAL; \
> + } \
> +})
> +
> +#define SFRAME_GET_USER_UNSIGNED(out, user_ptr, size) \
> +({ \
> + switch (size) { \
> + case 1: \
> + __SFRAME_GET_USER(out, user_ptr, u8); \
> + break; \
> + case 2: \
> + __SFRAME_GET_USER(out, user_ptr, u16); \
> + break; \
> + case 4: \
> + __SFRAME_GET_USER(out, user_ptr, u32); \
> + break; \
> + default: \
> + return -EINVAL; \
> + } \
> +})
> +
> +static unsigned char fre_type_to_size(unsigned char fre_type)
> +{
> + if (fre_type > 2)
> + return 0;
> + return 1 << fre_type;
> +}
> +
> +static unsigned char offset_size_enum_to_size(unsigned char off_size)
> +{
> + if (off_size > 2)
> + return 0;
> + return 1 << off_size;
> +}
> +
> +static int find_fde(struct sframe_section *sec, unsigned long ip,
> + struct sframe_fde *fde)
> +{
> + s32 func_off, ip_off;
> + struct sframe_fde __user *first, *last, *mid, *found;
> +
> + ip_off = ip - sec->sframe_addr;
> +
> + first = (void *)sec->fdes_addr;
> + last = first + sec->fdes_num;
> + while (first <= last) {
> + mid = first + ((last - first) / 2);
> + if (get_user(func_off, (s32 *)mid))
> + return -EFAULT;
> + if (ip_off >= func_off) {
> + found = mid;
> + first = mid + 1;
> + } else
> + last = mid - 1;
> + }
> +
> + if (!found)
> + return -EINVAL;
> +
> + if (copy_from_user(fde, found, sizeof(*fde)))
> + return -EFAULT;
> +
> + return 0;
> +}
> +
> +static int find_fre(struct sframe_section *sec, struct sframe_fde *fde,
> + unsigned long ip, struct user_unwind_frame *frame)
> +{
> + unsigned char fde_type = SFRAME_FUNC_FDE_TYPE(fde->info);
> + unsigned char fre_type = SFRAME_FUNC_FRE_TYPE(fde->info);
> + s32 fre_ip_off, cfa_off, ra_off, fp_off, ip_off;
> + unsigned char offset_count, offset_size;
> + unsigned char addr_size;
> + void __user *f, *last_f;
> + u8 fre_info;
> + int i;
> +
> + addr_size = fre_type_to_size(fre_type);
> + if (!addr_size)
> + return -EINVAL;
> +
> + ip_off = ip - sec->sframe_addr - fde->start_addr;
> +
> + f = (void *)sec->fres_addr + fde->fres_off;
> +
> + for (i = 0; i < fde->fres_num; i++) {
> +
> + SFRAME_GET_USER_UNSIGNED(fre_ip_off, f, addr_size);
> +
> + if (fde_type == SFRAME_FDE_TYPE_PCINC) {
> + if (fre_ip_off > ip_off)
> + break;
> + } else {
> + /* SFRAME_FDE_TYPE_PCMASK */
> +#if 0 /* sframe v2 */
> + if (ip_off % fde->rep_size < fre_ip_off)
> + break;
> +#endif
> + }
> +
> + SFRAME_GET_USER_UNSIGNED(fre_info, f, 1);
> +
> + offset_count = SFRAME_FRE_OFFSET_COUNT(fre_info);
> + offset_size = offset_size_enum_to_size(SFRAME_FRE_OFFSET_SIZE(fre_info));
> +
> + if (!offset_count || !offset_size)
> + return -EINVAL;
> +
> + last_f = f;
> + f += offset_count * offset_size;
> + }
> +
> + if (!last_f)
> + return -EINVAL;
> +
> + f = last_f;
> +
> + SFRAME_GET_USER_UNSIGNED(cfa_off, f, offset_size);
> + offset_count--;
> +
> + ra_off = sec->ra_off;
> + if (!ra_off) {
> + if (!offset_count--)
> + return -EINVAL;
> + SFRAME_GET_USER_SIGNED(ra_off, f, offset_size);
> + }
> +
> + fp_off = sec->fp_off;
> + if (!fp_off && offset_count) {
> + offset_count--;
> + SFRAME_GET_USER_SIGNED(fp_off, f, offset_size);
> + }
> +
> + if (offset_count)
> + return -EINVAL;
> +
> + frame->cfa_off = cfa_off;
> + frame->ra_off = ra_off;
> + frame->fp_off = fp_off;
> + frame->use_fp = SFRAME_FRE_CFA_BASE_REG_ID(fre_info) == SFRAME_BASE_REG_FP;
> +
> + return 0;
> +}
> +
> +int sframe_find(unsigned long ip, struct user_unwind_frame *frame)
> +{
> + struct mm_struct *mm = current->mm;
> + struct sframe_section *sec;
> + struct sframe_fde fde;
> + int srcu_idx;
> + int ret = -EINVAL;
> +
> + srcu_idx = srcu_read_lock(&sframe_srcu);
> +
> + sec = mtree_load(&mm->sframe_mt, ip);
> + if (!sec) {
> + srcu_read_unlock(&sframe_srcu, srcu_idx);
> + return -EINVAL;
> + }
> +
> +
> + ret = find_fde(sec, ip, &fde);
> + if (ret)
> + goto err_unlock;
> +
> + ret = find_fre(sec, &fde, ip, frame);
> + if (ret)
> + goto err_unlock;
> +
> + srcu_read_unlock(&sframe_srcu, srcu_idx);
> + return 0;
> +
> +err_unlock:
> + srcu_read_unlock(&sframe_srcu, srcu_idx);
> + return ret;
> +}
> +
> +static int get_sframe_file(unsigned long sframe_addr, struct sframe_file *file)
> +{
> + struct mm_struct *mm = current->mm;
> + struct vm_area_struct *sframe_vma, *text_vma, *vma;
> + VMA_ITERATOR(vmi, mm, 0);
> +
> + mmap_read_lock(mm);
> +
> + sframe_vma = vma_lookup(mm, sframe_addr);
> + if (!sframe_vma || !sframe_vma->vm_file)
> + goto err_unlock;
> +
> + text_vma = NULL;
> +
> + for_each_vma(vmi, vma) {
> + if (vma->vm_file != sframe_vma->vm_file)
> + continue;
> + if (vma->vm_flags & VM_EXEC) {
> + if (text_vma) {
> + /*
> + * Multiple EXEC segments in a single file
> + * aren't currently supported, is that a thing?
> + */
> + WARN_ON_ONCE(1);
> + goto err_unlock;
> + }
> + text_vma = vma;
> + }
> + }
> +
> + file->sframe_addr = sframe_addr;
> + file->text_start = text_vma->vm_start;
> + file->text_end = text_vma->vm_end;
> +
> + mmap_read_unlock(mm);
> + return 0;
> +
> +err_unlock:
> + mmap_read_unlock(mm);
> + return -EINVAL;
> +}
> +
> +static int validate_sframe_addrs(struct sframe_file *file)
> +{
> + struct mm_struct *mm = current->mm;
> + struct vm_area_struct *text_vma;
> +
> + mmap_read_lock(mm);
> +
> + if (!vma_lookup(mm, file->sframe_addr))
> + goto err_unlock;
> +
> + text_vma = vma_lookup(mm, file->text_start);
> + if (!(text_vma->vm_flags & VM_EXEC))
> + goto err_unlock;
> +
> + if (vma_lookup(mm, file->text_end-1) != text_vma)
> + goto err_unlock;
> +
> + mmap_read_unlock(mm);
> + return 0;
> +
> +err_unlock:
> + mmap_read_unlock(mm);
> + return -EINVAL;
> +}
> +
> +int __sframe_add_section(struct sframe_file *file)
> +{
> + struct maple_tree *sframe_mt = ¤t->mm->sframe_mt;
> + struct sframe_section *sec;
> + struct sframe_header shdr;
> + unsigned long header_end;
> + int ret;
> +
> + if (copy_from_user(&shdr, (void *)file->sframe_addr, sizeof(shdr)))
> + return -EFAULT;
> +
> + if (shdr.preamble.magic != SFRAME_MAGIC ||
> + shdr.preamble.version != SFRAME_VERSION_1 ||
> + (!shdr.preamble.flags & SFRAME_F_FDE_SORTED) ||
> + shdr.auxhdr_len || !shdr.num_fdes || !shdr.num_fres ||
> + shdr.fdes_off > shdr.fres_off) {
> + return -EINVAL;
> + }
> +
I would say that it will be ideal to start the support with
SFRAME_VERSION_2 onwards, if we have a choice.
The structure SFrame FDE in SFRAME_VERSION_1 was unaligned on-disk. We
fixed that in SFRAME_VERSION_2 (Binutils 2.41) by adding some padding as
you have already noted. For x86_64, its not an issue though, yes.
Powered by blists - more mailing lists