Message-ID: <e532530c-d5ea-a889-6467-1d8dd2b4f098@fb.com>
Date: Sun, 8 Apr 2018 20:34:07 -0700
From: Alexei Starovoitov <ast@...com>
To: Yonghong Song <yhs@...com>, <daniel@...earbox.net>,
<netdev@...r.kernel.org>
CC: <kernel-team@...com>
Subject: Re: [RFC PATCH bpf-next 2/6] bpf: add bpf_get_stack helper
On 4/6/18 2:48 PM, Yonghong Song wrote:
> Currently, the stackmap and the bpf_get_stackid helper are provided
> for bpf programs to get the stack trace. This approach has
> a limitation though: if two stack traces have the same hash,
> only one will get stored in the stackmap table,
> so some stack traces are missing from the user's perspective.
>
> This patch implements a new helper, bpf_get_stack, which sends
> stack traces directly to the bpf program. The bpf program
> is then able to see all stack traces, and can do in-kernel
> processing or send the stack traces to user space through
> a shared map or bpf_perf_event_output.
>
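For illustration, usage from a tracing program would look roughly like
this (a minimal sketch against the uapi proposed in this patch; the
"events" map, the probed function and the helper header are made up
for the example):

	#include <linux/bpf.h>
	#include "bpf_helpers.h"	/* samples/selftests helper header */

	struct bpf_map_def SEC("maps") events = {
		.type = BPF_MAP_TYPE_PERF_EVENT_ARRAY,
		.key_size = sizeof(int),
		.value_size = sizeof(__u32),
		.max_entries = 64,	/* >= number of possible cpus */
	};

	SEC("kprobe/some_kernel_func")
	int trace_stack(struct pt_regs *ctx)
	{
		__u64 stack[64];
		int len;

		/* returns the number of bytes copied into stack[],
		 * or a negative error
		 */
		len = bpf_get_stack(ctx, stack, sizeof(stack), 0);
		if (len < 0)
			return 0;

		/* ship the raw stack trace to user space */
		bpf_perf_event_output(ctx, &events, BPF_F_CURRENT_CPU,
				      stack, len);
		return 0;
	}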
> Signed-off-by: Yonghong Song <yhs@...com>
> ---
> include/linux/bpf.h | 1 +
> include/linux/filter.h | 3 ++-
> include/uapi/linux/bpf.h | 17 +++++++++++++--
> kernel/bpf/stackmap.c | 56 ++++++++++++++++++++++++++++++++++++++++++++++++
> kernel/bpf/syscall.c | 12 ++++++++++-
> kernel/bpf/verifier.c | 3 +++
> kernel/trace/bpf_trace.c | 50 +++++++++++++++++++++++++++++++++++++++++-
> 7 files changed, 137 insertions(+), 5 deletions(-)
>
> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index 95a7abd..72ccb9a 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -676,6 +676,7 @@ extern const struct bpf_func_proto bpf_get_current_comm_proto;
> extern const struct bpf_func_proto bpf_skb_vlan_push_proto;
> extern const struct bpf_func_proto bpf_skb_vlan_pop_proto;
> extern const struct bpf_func_proto bpf_get_stackid_proto;
> +extern const struct bpf_func_proto bpf_get_stack_proto;
> extern const struct bpf_func_proto bpf_sock_map_update_proto;
>
> /* Shared helpers among cBPF and eBPF. */
> diff --git a/include/linux/filter.h b/include/linux/filter.h
> index fc4e8f9..9b64f63 100644
> --- a/include/linux/filter.h
> +++ b/include/linux/filter.h
> @@ -467,7 +467,8 @@ struct bpf_prog {
> dst_needed:1, /* Do we need dst entry? */
> blinded:1, /* Was blinded */
> is_func:1, /* program is a bpf function */
> - kprobe_override:1; /* Do we override a kprobe? */
> + kprobe_override:1, /* Do we override a kprobe? */
> + need_callchain_buf:1; /* Needs callchain buffer? */
> enum bpf_prog_type type; /* Type of BPF program */
> enum bpf_attach_type expected_attach_type; /* For some prog types */
> u32 len; /* Number of filter blocks */
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index c5ec897..a4ff5b7 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -517,6 +517,17 @@ union bpf_attr {
> * other bits - reserved
> * Return: >= 0 stackid on success or negative error
> *
> + * int bpf_get_stack(ctx, buf, size, flags)
> + * walk user or kernel stack and store the ips in buf
> + * @ctx: struct pt_regs*
> + * @buf: user buffer to fill stack
> + * @size: the buf size
> * @flags: bits 0-7 - number of stack frames to skip
> + * bit 8 - collect user stack instead of kernel
> + * bit 11 - get build-id as well if user stack
> + * other bits - reserved
> + * Return: >= 0 size copied on success or negative error
> + *
> * s64 bpf_csum_diff(from, from_size, to, to_size, seed)
> * calculate csum diff
> * @from: raw from buffer
> @@ -821,7 +832,8 @@ union bpf_attr {
> FN(msg_apply_bytes), \
> FN(msg_cork_bytes), \
> FN(msg_pull_data), \
> - FN(bind),
> + FN(bind), \
> + FN(get_stack),
>
> /* integer value in 'imm' field of BPF_CALL instruction selects which helper
> * function eBPF program intends to call
> @@ -855,11 +867,12 @@ enum bpf_func_id {
> /* BPF_FUNC_skb_set_tunnel_key and BPF_FUNC_skb_get_tunnel_key flags. */
> #define BPF_F_TUNINFO_IPV6 (1ULL << 0)
>
> -/* BPF_FUNC_get_stackid flags. */
> +/* BPF_FUNC_get_stackid and BPF_FUNC_get_stack flags. */
> #define BPF_F_SKIP_FIELD_MASK 0xffULL
> #define BPF_F_USER_STACK (1ULL << 8)
> #define BPF_F_FAST_STACK_CMP (1ULL << 9)
> #define BPF_F_REUSE_STACKID (1ULL << 10)
> +#define BPF_F_USER_BUILD_ID (1ULL << 11)
The comment above is not quite correct:
this new flag is only accepted by the new helper,
not by bpf_get_stackid.
>
> /* BPF_FUNC_skb_set_tunnel_key flags. */
> #define BPF_F_ZERO_CSUM_TX (1ULL << 1)
> diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
> index 04f6ec1..371c72e 100644
> --- a/kernel/bpf/stackmap.c
> +++ b/kernel/bpf/stackmap.c
> @@ -402,6 +402,62 @@ const struct bpf_func_proto bpf_get_stackid_proto = {
> .arg3_type = ARG_ANYTHING,
> };
>
> +BPF_CALL_4(bpf_get_stack, struct pt_regs *, regs, void *, buf, u32, size,
> + u64, flags)
> +{
> + u32 init_nr, trace_nr, copy_len, elem_size, num_elem;
> + bool user_build_id = flags & BPF_F_USER_BUILD_ID;
> + u32 skip = flags & BPF_F_SKIP_FIELD_MASK;
> + bool user = flags & BPF_F_USER_STACK;
> + struct perf_callchain_entry *trace;
> + bool kernel = !user;
> + u64 *ips;
> +
> + if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK |
> + BPF_F_USER_BUILD_ID)))
> + return -EINVAL;
> +
> + elem_size = (user && user_build_id) ? sizeof(struct bpf_stack_build_id)
> + : sizeof(u64);
> + if (unlikely(size % elem_size))
> + return -EINVAL;
> +
> + num_elem = size / elem_size;
> + if (sysctl_perf_event_max_stack < num_elem)
> + init_nr = 0;
The prog's buffer should be zero padded in this case, since it
points to uninitialized memory (arg2 is ARG_PTR_TO_UNINIT_MEM).
Something like the sketch below, after the copy:
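	/* sketch: clear the part of the buffer that the collected
	 * trace did not fill, so no uninitialized bytes are exposed
	 * to the program
	 */
	if (size > copy_len)
		memset(buf + copy_len, 0, size - copy_len);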
> + else
> + init_nr = sysctl_perf_event_max_stack - num_elem;
> + trace = get_perf_callchain(regs, init_nr, kernel, user,
> + sysctl_perf_event_max_stack, false, false);
> + if (unlikely(!trace))
> + return -EFAULT;
> +
> + trace_nr = trace->nr - init_nr;
> + if (trace_nr <= skip)
> + return -EFAULT;
> +
> + trace_nr -= skip;
> + trace_nr = (trace_nr <= num_elem) ? trace_nr : num_elem;
> + copy_len = trace_nr * elem_size;
> + ips = trace->ip + skip + init_nr;
> + if (user && user_build_id)
The combination of a kernel stack + user_build_id should probably be
rejected earlier with -EINVAL, e.g.:
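	/* sketch: reject build-id for kernel stacks, next to the
	 * existing flags check at the top of the helper
	 */
	if (unlikely(kernel && user_build_id))
		return -EINVAL;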
> + stack_map_get_build_id_offset(buf, ips, trace_nr, user);
> + else
> + memcpy(buf, ips, copy_len);
> +
> + return copy_len;
> +}
> +
> +const struct bpf_func_proto bpf_get_stack_proto = {
> + .func = bpf_get_stack,
> + .gpl_only = true,
> + .ret_type = RET_INTEGER,
> + .arg1_type = ARG_PTR_TO_CTX,
> + .arg2_type = ARG_PTR_TO_UNINIT_MEM,
> + .arg3_type = ARG_CONST_SIZE_OR_ZERO,
Why allow zero size?
I'm not sure the helper will work correctly when size == 0:
as written, it looks like it falls into the trace_nr <= skip
-EFAULT path.
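If size == 0 is kept, it probably needs an explicit early exit,
e.g. (sketch):

	/* make size == 0 a well-defined no-op instead of returning
	 * -EFAULT as a side effect of the skip check
	 */
	if (unlikely(size == 0))
		return 0;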
> + .arg4_type = ARG_ANYTHING,
> +};
> +
> /* Called from eBPF program */
> static void *stack_map_lookup_elem(struct bpf_map *map, void *key)
> {
> diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
> index 0244973..2aa3a65 100644
> --- a/kernel/bpf/syscall.c
> +++ b/kernel/bpf/syscall.c
> @@ -984,10 +984,13 @@ void bpf_prog_free_id(struct bpf_prog *prog, bool do_idr_lock)
> static void __bpf_prog_put_rcu(struct rcu_head *rcu)
> {
> struct bpf_prog_aux *aux = container_of(rcu, struct bpf_prog_aux, rcu);
> + bool need_callchain_buf = aux->prog->need_callchain_buf;
>
> free_used_maps(aux);
> bpf_prog_uncharge_memlock(aux->prog);
> security_bpf_prog_free(aux);
> + if (need_callchain_buf)
> + put_callchain_buffers();
> bpf_prog_free(aux->prog);
> }
>
> @@ -1004,7 +1007,8 @@ static void __bpf_prog_put(struct bpf_prog *prog, bool do_idr_lock)
> bpf_prog_kallsyms_del(prog->aux->func[i]);
> bpf_prog_kallsyms_del(prog);
>
> - call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu);
> + synchronize_rcu();
> + __bpf_prog_put_rcu(&prog->aux->rcu);
There should have been a lockdep splat here.
We cannot call synchronize_rcu() here, since bpf_prog_put() can be
called from contexts that cannot sleep in some cases.
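One possible direction (a sketch only, not tested): keep the call_rcu()
and move the sleepable put_callchain_buffers() into the workqueue path
that already backs bpf_prog_free(), since bpf_prog_free_deferred() runs
in process context:

	/* in kernel/bpf/core.c; other cleanup elided */
	static void bpf_prog_free_deferred(struct work_struct *work)
	{
		struct bpf_prog_aux *aux;

		aux = container_of(work, struct bpf_prog_aux, work);
		if (aux->prog->need_callchain_buf)
			put_callchain_buffers();
		bpf_jit_free(aux->prog);
	}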