Message-ID: <15D381AB-87EE-4056-A5E1-9BF588710801@fb.com>
Date:   Wed, 14 Mar 2018 17:36:21 +0000
From:   Song Liu <songliubraving@...com>
To:     "netdev@...r.kernel.org" <netdev@...r.kernel.org>,
        "ast@...nel.org" <ast@...nel.org>,
        "peterz@...radead.org" <peterz@...radead.org>,
        "daniel@...earbox.net" <daniel@...earbox.net>
CC:     Kernel Team <Kernel-team@...com>,
        "hannes@...xchg.org" <hannes@...xchg.org>,
        Teng Qin <qinteng@...com>
Subject: Re: [PATCH bpf-next v6 1/2] bpf: extend stackmap to save
 binary_build_id+offset instead of address



> On Mar 14, 2018, at 10:23 AM, Song Liu <songliubraving@...com> wrote:
> 
> Currently, the bpf stackmap stores an address for each entry in the call
> trace. To map these addresses to user space files, it is necessary to
> maintain a mapping from these virtual addresses to symbols in the binary.
> Usually, a user space profiler (such as perf) has to scan /proc/pid/maps
> at the beginning of profiling and monitor mmap2() calls afterwards. Given
> the cost of maintaining the address map, this solution is not practical
> for system-wide profiling that is always on.
> 
> This patch tries to solve this problem with a variation of stackmap,
> enabled by the flag BPF_F_STACK_BUILD_ID. Instead of storing addresses,
> this variation stores ELF file build_id + offset.
> 
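> For reference, creating such a map from user space looks roughly like
> the sketch below (illustrative only, not part of this patch; 127 stands
> in for the default sysctl_perf_event_max_stack, 10000 is an arbitrary
> max_entries, and error handling is omitted):
> 
>  /* needs <linux/bpf.h>, <sys/syscall.h>, <unistd.h> */
>  union bpf_attr attr = {};
>  int map_fd;
> 
>  attr.map_type    = BPF_MAP_TYPE_STACK_TRACE;
>  attr.key_size    = sizeof(__u32);
>  attr.value_size  = 127 * sizeof(struct bpf_stack_build_id);
>  attr.max_entries = 10000;
>  attr.map_flags   = BPF_F_STACK_BUILD_ID;
> 
>  map_fd = syscall(__NR_bpf, BPF_MAP_CREATE, &attr, sizeof(attr));
> 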
> Build_id is only meaningful for user stacks. If a kernel stack is added
> to a stackmap with BPF_F_STACK_BUILD_ID, it will automatically fall back
> to only storing the ip (status == BPF_STACK_BUILD_ID_IP).

I forgot to delete the paragraph above. Please let me know if I should 
resend (or it can be removed when you apply it).

Thanks,
Song 


> Build ID is a 20-byte unique identifier for ELF files. The following
> command shows the Build ID of /bin/bash:
> 
>  [user@]$ readelf -n /bin/bash
>  ...
>    Build ID: XXXXXXXXXX
>  ...
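> 
> For reference, the note shown by readelf (and searched for by the
> kernel-side parser below) is a PT_NOTE entry with owner "GNU" and type
> NT_GNU_BUILD_ID (3); the sketch below assumes the common 20-byte SHA-1
> build_id:
> 
>  Elf64_Nhdr {             /* Elf32_Nhdr has the identical layout */
>          n_namesz = 4,    /* strlen("GNU") + 1 */
>          n_descsz = 20,   /* BPF_BUILD_ID_SIZE */
>          n_type   = 3,    /* NT_GNU_BUILD_ID */
>  }
>  "GNU\0"                  /* owner name, padded to 4-byte alignment */
>  <20-byte build_id>       /* descriptor */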
> 
> With BPF_F_STACK_BUILD_ID, bpf_get_stackid() tries to parse the Build ID
> for each entry in the call trace and translate it into the following
> struct:
> 
>  struct bpf_stack_build_id {
>          __s32           status;
>          unsigned char   build_id[BPF_BUILD_ID_SIZE];
>          union {
>                  __u64   offset;
>                  __u64   ip;
>          };
>  };
> 
> The search for the build_id is limited to the first page of the file,
> and this page should be in the page cache. Otherwise, we fall back to
> storing the ip for this entry (the ip field in struct
> bpf_stack_build_id). This requires the build_id to be stored in the
> first page. A quick survey of binary and dynamic library files on a few
> different systems shows that almost all of them have the build_id in the
> first page.
> 
> Build_id is only meaningful for user stacks. If a kernel stack is added
> to a stackmap with BPF_F_STACK_BUILD_ID, it will automatically fall back
> to only storing the ip (status == BPF_STACK_BUILD_ID_IP). Similarly, if
> the build_id lookup fails for some reason, it will also fall back to
> storing the ip.
> 
> User space can access struct bpf_stack_build_id with the bpf syscall
> BPF_MAP_LOOKUP_ELEM. It is necessary for user space to maintain a
> mapping from build_id to binary files. This mostly static mapping is
> much easier to maintain than per-process address maps.
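> 
> As a rough sketch of the user space side (not part of this patch; map_fd
> is the stackmap fd, stack_id is an id previously returned to the BPF
> program by bpf_get_stackid(), and 127 matches the value_size used at map
> creation):
> 
>  struct bpf_stack_build_id trace[127];
>  union bpf_attr attr = {};
>  __u32 stack_id;  /* filled in with an id reported by the BPF program */
>  int i;
> 
>  attr.map_fd = map_fd;
>  attr.key    = (__u64)(unsigned long)&stack_id;
>  attr.value  = (__u64)(unsigned long)trace;
> 
>  if (syscall(__NR_bpf, BPF_MAP_LOOKUP_ELEM, &attr, sizeof(attr)) == 0) {
>          for (i = 0; i < 127; i++) {
>                  if (trace[i].status == BPF_STACK_BUILD_ID_EMPTY)
>                          break;  /* zeroed entry marks end of the trace */
>                  if (trace[i].status == BPF_STACK_BUILD_ID_VALID) {
>                          /* look up trace[i].build_id in the mostly static
>                           * build_id -> binary mapping, then symbolize at
>                           * trace[i].offset
>                           */
>                  } else {
>                          /* BPF_STACK_BUILD_ID_IP: only raw trace[i].ip */
>                  }
>          }
>  }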
> 
> Note: a stackmap with build_id only works in non-nmi context at this
> time. This is because we need to take mm->mmap_sem for find_vma(). If
> this changes, we would like to allow build_id lookup in nmi context.
> 
> Signed-off-by: Song Liu <songliubraving@...com>
> ---
> include/uapi/linux/bpf.h |  22 ++++
> kernel/bpf/stackmap.c    | 257 +++++++++++++++++++++++++++++++++++++++++++----
> 2 files changed, 257 insertions(+), 22 deletions(-)
> 
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index 2a66769..1e15d17 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -231,6 +231,28 @@ enum bpf_attach_type {
> #define BPF_F_RDONLY		(1U << 3)
> #define BPF_F_WRONLY		(1U << 4)
> 
> +/* Flag for stack_map, store build_id+offset instead of pointer */
> +#define BPF_F_STACK_BUILD_ID	(1U << 5)
> +
> +enum bpf_stack_build_id_status {
> +	/* user space needs an empty entry to identify the end of a trace */
> +	BPF_STACK_BUILD_ID_EMPTY = 0,
> +	/* with valid build_id and offset */
> +	BPF_STACK_BUILD_ID_VALID = 1,
> +	/* couldn't get build_id, fallback to ip */
> +	BPF_STACK_BUILD_ID_IP = 2,
> +};
> +
> +#define BPF_BUILD_ID_SIZE 20
> +struct bpf_stack_build_id {
> +	__s32		status;
> +	unsigned char	build_id[BPF_BUILD_ID_SIZE];
> +	union {
> +		__u64	offset;
> +		__u64	ip;
> +	};
> +};
> +
> union bpf_attr {
> 	struct { /* anonymous struct used by BPF_MAP_CREATE command */
> 		__u32	map_type;	/* one of enum bpf_map_type */
> diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
> index b0ecf43..57eeb12 100644
> --- a/kernel/bpf/stackmap.c
> +++ b/kernel/bpf/stackmap.c
> @@ -9,16 +9,19 @@
> #include <linux/filter.h>
> #include <linux/stacktrace.h>
> #include <linux/perf_event.h>
> +#include <linux/elf.h>
> +#include <linux/pagemap.h>
> #include "percpu_freelist.h"
> 
> -#define STACK_CREATE_FLAG_MASK \
> -	(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
> +#define STACK_CREATE_FLAG_MASK					\
> +	(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY |	\
> +	 BPF_F_STACK_BUILD_ID)
> 
> struct stack_map_bucket {
> 	struct pcpu_freelist_node fnode;
> 	u32 hash;
> 	u32 nr;
> -	u64 ip[];
> +	u64 data[];
> };
> 
> struct bpf_stack_map {
> @@ -29,6 +32,17 @@ struct bpf_stack_map {
> 	struct stack_map_bucket *buckets[];
> };
> 
> +static inline bool stack_map_use_build_id(struct bpf_map *map)
> +{
> +	return (map->map_flags & BPF_F_STACK_BUILD_ID);
> +}
> +
> +static inline int stack_map_data_size(struct bpf_map *map)
> +{
> +	return stack_map_use_build_id(map) ?
> +		sizeof(struct bpf_stack_build_id) : sizeof(u64);
> +}
> +
> static int prealloc_elems_and_freelist(struct bpf_stack_map *smap)
> {
> 	u32 elem_size = sizeof(struct stack_map_bucket) + smap->map.value_size;
> @@ -68,8 +82,16 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
> 
> 	/* check sanity of attributes */
> 	if (attr->max_entries == 0 || attr->key_size != 4 ||
> -	    value_size < 8 || value_size % 8 ||
> -	    value_size / 8 > sysctl_perf_event_max_stack)
> +	    value_size < 8 || value_size % 8)
> +		return ERR_PTR(-EINVAL);
> +
> +	BUILD_BUG_ON(sizeof(struct bpf_stack_build_id) % sizeof(u64));
> +	if (attr->map_flags & BPF_F_STACK_BUILD_ID) {
> +		if (value_size % sizeof(struct bpf_stack_build_id) ||
> +		    value_size / sizeof(struct bpf_stack_build_id)
> +		    > sysctl_perf_event_max_stack)
> +			return ERR_PTR(-EINVAL);
> +	} else if (value_size / 8 > sysctl_perf_event_max_stack)
> 		return ERR_PTR(-EINVAL);
> 
> 	/* hash table size must be power of 2 */
> @@ -114,13 +136,184 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
> 	return ERR_PTR(err);
> }
> 
> +#define BPF_BUILD_ID 3
> +/*
> + * Parse build id from the note segment. This logic can be shared between
> + * 32-bit and 64-bit system, because Elf32_Nhdr and Elf64_Nhdr are
> + * identical.
> + */
> +static inline int stack_map_parse_build_id(void *page_addr,
> +					   unsigned char *build_id,
> +					   void *note_start,
> +					   Elf32_Word note_size)
> +{
> +	Elf32_Word note_offs = 0, new_offs;
> +
> +	/* check for overflow */
> +	if (note_start < page_addr || note_start + note_size < note_start)
> +		return -EINVAL;
> +
> +	/* only supports note that fits in the first page */
> +	if (note_start + note_size > page_addr + PAGE_SIZE)
> +		return -EINVAL;
> +
> +	while (note_offs + sizeof(Elf32_Nhdr) < note_size) {
> +		Elf32_Nhdr *nhdr = (Elf32_Nhdr *)(note_start + note_offs);
> +
> +		if (nhdr->n_type == BPF_BUILD_ID &&
> +		    nhdr->n_namesz == sizeof("GNU") &&
> +		    nhdr->n_descsz == BPF_BUILD_ID_SIZE) {
> +			memcpy(build_id,
> +			       note_start + note_offs +
> +			       ALIGN(sizeof("GNU"), 4) + sizeof(Elf32_Nhdr),
> +			       BPF_BUILD_ID_SIZE);
> +			return 0;
> +		}
> +		new_offs = note_offs + sizeof(Elf32_Nhdr) +
> +			ALIGN(nhdr->n_namesz, 4) + ALIGN(nhdr->n_descsz, 4);
> +		if (new_offs <= note_offs)  /* overflow */
> +			break;
> +		note_offs = new_offs;
> +	}
> +	return -EINVAL;
> +}
> +
> +/* Parse build ID from 32-bit ELF */
> +static int stack_map_get_build_id_32(void *page_addr,
> +				     unsigned char *build_id)
> +{
> +	Elf32_Ehdr *ehdr = (Elf32_Ehdr *)page_addr;
> +	Elf32_Phdr *phdr;
> +	int i;
> +
> +	/* only supports phdr that fits in one page */
> +	if (ehdr->e_phnum >
> +	    (PAGE_SIZE - sizeof(Elf32_Ehdr)) / sizeof(Elf32_Phdr))
> +		return -EINVAL;
> +
> +	phdr = (Elf32_Phdr *)(page_addr + sizeof(Elf32_Ehdr));
> +
> +	for (i = 0; i < ehdr->e_phnum; ++i)
> +		if (phdr[i].p_type == PT_NOTE)
> +			return stack_map_parse_build_id(page_addr, build_id,
> +					page_addr + phdr[i].p_offset,
> +					phdr[i].p_filesz);
> +	return -EINVAL;
> +}
> +
> +/* Parse build ID from 64-bit ELF */
> +static int stack_map_get_build_id_64(void *page_addr,
> +				     unsigned char *build_id)
> +{
> +	Elf64_Ehdr *ehdr = (Elf64_Ehdr *)page_addr;
> +	Elf64_Phdr *phdr;
> +	int i;
> +
> +	/* only supports phdr that fits in one page */
> +	if (ehdr->e_phnum >
> +	    (PAGE_SIZE - sizeof(Elf64_Ehdr)) / sizeof(Elf64_Phdr))
> +		return -EINVAL;
> +
> +	phdr = (Elf64_Phdr *)(page_addr + sizeof(Elf64_Ehdr));
> +
> +	for (i = 0; i < ehdr->e_phnum; ++i)
> +		if (phdr[i].p_type == PT_NOTE)
> +			return stack_map_parse_build_id(page_addr, build_id,
> +					page_addr + phdr[i].p_offset,
> +					phdr[i].p_filesz);
> +	return -EINVAL;
> +}
> +
> +/* Parse build ID of ELF file mapped to vma */
> +static int stack_map_get_build_id(struct vm_area_struct *vma,
> +				  unsigned char *build_id)
> +{
> +	Elf32_Ehdr *ehdr;
> +	struct page *page;
> +	void *page_addr;
> +	int ret;
> +
> +	/* only works for page backed storage  */
> +	if (!vma->vm_file)
> +		return -EINVAL;
> +
> +	page = find_get_page(vma->vm_file->f_mapping, 0);
> +	if (!page)
> +		return -EFAULT;	/* page not mapped */
> +
> +	ret = -EINVAL;
> +	page_addr = page_address(page);
> +	ehdr = (Elf32_Ehdr *)page_addr;
> +
> +	/* compare magic x7f "ELF" */
> +	if (memcmp(ehdr->e_ident, ELFMAG, SELFMAG) != 0)
> +		goto out;
> +
> +	/* only support executable file and shared object file */
> +	if (ehdr->e_type != ET_EXEC && ehdr->e_type != ET_DYN)
> +		goto out;
> +
> +	if (ehdr->e_ident[EI_CLASS] == ELFCLASS32)
> +		ret = stack_map_get_build_id_32(page_addr, build_id);
> +	else if (ehdr->e_ident[EI_CLASS] == ELFCLASS64)
> +		ret = stack_map_get_build_id_64(page_addr, build_id);
> +out:
> +	put_page(page);
> +	return ret;
> +}
> +
> +static void stack_map_get_build_id_offset(struct bpf_map *map,
> +					  struct stack_map_bucket *bucket,
> +					  u64 *ips, u32 trace_nr, bool user)
> +{
> +	int i;
> +	struct vm_area_struct *vma;
> +	struct bpf_stack_build_id *id_offs;
> +
> +	bucket->nr = trace_nr;
> +	id_offs = (struct bpf_stack_build_id *)bucket->data;
> +
> +	/*
> +	 * We cannot do up_read() in nmi context, so build_id lookup is
> +	 * only supported for non-nmi events. If at some point, it is
> +	 * possible to run find_vma() without taking the semaphore, we
> +	 * would like to allow build_id lookup in nmi context.
> +	 *
> +	 * Same fallback is used for kernel stack (!user) on a stackmap
> +	 * with build_id.
> +	 */
> +	if (!user || !current || !current->mm || in_nmi() ||
> +	    down_read_trylock(&current->mm->mmap_sem) == 0) {
> +		/* cannot access current->mm, fall back to ips */
> +		for (i = 0; i < trace_nr; i++) {
> +			id_offs[i].status = BPF_STACK_BUILD_ID_IP;
> +			id_offs[i].ip = ips[i];
> +		}
> +		return;
> +	}
> +
> +	for (i = 0; i < trace_nr; i++) {
> +		vma = find_vma(current->mm, ips[i]);
> +		if (!vma || stack_map_get_build_id(vma, id_offs[i].build_id)) {
> +			/* per entry fall back to ips */
> +			id_offs[i].status = BPF_STACK_BUILD_ID_IP;
> +			id_offs[i].ip = ips[i];
> +			continue;
> +		}
> +		id_offs[i].offset = (vma->vm_pgoff << PAGE_SHIFT) + ips[i]
> +			- vma->vm_start;
> +		id_offs[i].status = BPF_STACK_BUILD_ID_VALID;
> +	}
> +	up_read(&current->mm->mmap_sem);
> +}
> +
> BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
> 	   u64, flags)
> {
> 	struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map);
> 	struct perf_callchain_entry *trace;
> 	struct stack_map_bucket *bucket, *new_bucket, *old_bucket;
> -	u32 max_depth = map->value_size / 8;
> +	u32 max_depth = map->value_size / stack_map_data_size(map);
> 	/* stack_map_alloc() checks that max_depth <= sysctl_perf_event_max_stack */
> 	u32 init_nr = sysctl_perf_event_max_stack - max_depth;
> 	u32 skip = flags & BPF_F_SKIP_FIELD_MASK;
> @@ -128,6 +321,7 @@ BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
> 	bool user = flags & BPF_F_USER_STACK;
> 	bool kernel = !user;
> 	u64 *ips;
> +	bool hash_matches;
> 
> 	if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK |
> 			       BPF_F_FAST_STACK_CMP | BPF_F_REUSE_STACKID)))
> @@ -156,24 +350,43 @@ BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
> 	id = hash & (smap->n_buckets - 1);
> 	bucket = READ_ONCE(smap->buckets[id]);
> 
> -	if (bucket && bucket->hash == hash) {
> -		if (flags & BPF_F_FAST_STACK_CMP)
> +	hash_matches = bucket && bucket->hash == hash;
> +	/* fast cmp */
> +	if (hash_matches && flags & BPF_F_FAST_STACK_CMP)
> +		return id;
> +
> +	if (stack_map_use_build_id(map)) {
> +		/* for build_id+offset, pop a bucket before slow cmp */
> +		new_bucket = (struct stack_map_bucket *)
> +			pcpu_freelist_pop(&smap->freelist);
> +		if (unlikely(!new_bucket))
> +			return -ENOMEM;
> +		stack_map_get_build_id_offset(map, new_bucket, ips,
> +					      trace_nr, user);
> +		trace_len = trace_nr * sizeof(struct bpf_stack_build_id);
> +		if (hash_matches && bucket->nr == trace_nr &&
> +		    memcmp(bucket->data, new_bucket->data, trace_len) == 0) {
> +			pcpu_freelist_push(&smap->freelist, &new_bucket->fnode);
> 			return id;
> -		if (bucket->nr == trace_nr &&
> -		    memcmp(bucket->ip, ips, trace_len) == 0)
> +		}
> +		if (bucket && !(flags & BPF_F_REUSE_STACKID)) {
> +			pcpu_freelist_push(&smap->freelist, &new_bucket->fnode);
> +			return -EEXIST;
> +		}
> +	} else {
> +		if (hash_matches && bucket->nr == trace_nr &&
> +		    memcmp(bucket->data, ips, trace_len) == 0)
> 			return id;
> +		if (bucket && !(flags & BPF_F_REUSE_STACKID))
> +			return -EEXIST;
> +
> +		new_bucket = (struct stack_map_bucket *)
> +			pcpu_freelist_pop(&smap->freelist);
> +		if (unlikely(!new_bucket))
> +			return -ENOMEM;
> +		memcpy(new_bucket->data, ips, trace_len);
> 	}
> 
> -	/* this call stack is not in the map, try to add it */
> -	if (bucket && !(flags & BPF_F_REUSE_STACKID))
> -		return -EEXIST;
> -
> -	new_bucket = (struct stack_map_bucket *)
> -		pcpu_freelist_pop(&smap->freelist);
> -	if (unlikely(!new_bucket))
> -		return -ENOMEM;
> -
> -	memcpy(new_bucket->ip, ips, trace_len);
> 	new_bucket->hash = hash;
> 	new_bucket->nr = trace_nr;
> 
> @@ -212,8 +425,8 @@ int bpf_stackmap_copy(struct bpf_map *map, void *key, void *value)
> 	if (!bucket)
> 		return -ENOENT;
> 
> -	trace_len = bucket->nr * sizeof(u64);
> -	memcpy(value, bucket->ip, trace_len);
> +	trace_len = bucket->nr * stack_map_data_size(map);
> +	memcpy(value, bucket->data, trace_len);
> 	memset(value + trace_len, 0, map->value_size - trace_len);
> 
> 	old_bucket = xchg(&smap->buckets[id], bucket);
> -- 
> 2.9.5
> 
