Message-ID: <625de1bb-7cb2-5f9e-01c3-a863cd27b0e6@fb.com>
Date: Sun, 8 Apr 2018 21:53:49 -0700
From: Yonghong Song <yhs@...com>
To: Alexei Starovoitov <ast@...com>, <daniel@...earbox.net>,
<netdev@...r.kernel.org>
CC: <kernel-team@...com>
Subject: Re: [RFC PATCH bpf-next 2/6] bpf: add bpf_get_stack helper
On 4/8/18 8:34 PM, Alexei Starovoitov wrote:
> On 4/6/18 2:48 PM, Yonghong Song wrote:
>> Currently, the stackmap and the bpf_get_stackid helper are provided
>> for bpf programs to get the stack trace. This approach has
>> a limitation though: if two stack traces have the same hash,
>> only one will get stored in the stackmap table,
>> so some stack traces are missing from the user's perspective.
>>
>> This patch implements a new helper, bpf_get_stack, which sends
>> stack traces directly to the bpf program. The bpf program
>> is then able to see all stack traces, and can do in-kernel
>> processing or send stack traces to user space through
>> a shared map or bpf_perf_event_output.
>>
>> Signed-off-by: Yonghong Song <yhs@...com>
>> ---
>> include/linux/bpf.h | 1 +
>> include/linux/filter.h | 3 ++-
>> include/uapi/linux/bpf.h | 17 +++++++++++++--
>> kernel/bpf/stackmap.c | 56 ++++++++++++++++++++++++++++++++++++++++++++++++
>> kernel/bpf/syscall.c | 12 ++++++++++-
>> kernel/bpf/verifier.c | 3 +++
>> kernel/trace/bpf_trace.c | 50 +++++++++++++++++++++++++++++++++++++++++-
>> 7 files changed, 137 insertions(+), 5 deletions(-)
>>
>> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
>> index 95a7abd..72ccb9a 100644
>> --- a/include/linux/bpf.h
>> +++ b/include/linux/bpf.h
>> @@ -676,6 +676,7 @@ extern const struct bpf_func_proto bpf_get_current_comm_proto;
>> extern const struct bpf_func_proto bpf_skb_vlan_push_proto;
>> extern const struct bpf_func_proto bpf_skb_vlan_pop_proto;
>> extern const struct bpf_func_proto bpf_get_stackid_proto;
>> +extern const struct bpf_func_proto bpf_get_stack_proto;
>> extern const struct bpf_func_proto bpf_sock_map_update_proto;
>>
>> /* Shared helpers among cBPF and eBPF. */
>> diff --git a/include/linux/filter.h b/include/linux/filter.h
>> index fc4e8f9..9b64f63 100644
>> --- a/include/linux/filter.h
>> +++ b/include/linux/filter.h
>> @@ -467,7 +467,8 @@ struct bpf_prog {
>> dst_needed:1, /* Do we need dst entry? */
>> blinded:1, /* Was blinded */
>> is_func:1, /* program is a bpf function */
>> - kprobe_override:1; /* Do we override a kprobe? */
>> + kprobe_override:1, /* Do we override a kprobe? */
>> + need_callchain_buf:1; /* Needs callchain buffer? */
>> enum bpf_prog_type type; /* Type of BPF program */
>> enum bpf_attach_type expected_attach_type; /* For some prog types */
>> u32 len; /* Number of filter blocks */
>> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
>> index c5ec897..a4ff5b7 100644
>> --- a/include/uapi/linux/bpf.h
>> +++ b/include/uapi/linux/bpf.h
>> @@ -517,6 +517,17 @@ union bpf_attr {
>> * other bits - reserved
>> * Return: >= 0 stackid on success or negative error
>> *
>> + * int bpf_get_stack(ctx, buf, size, flags)
>> + * walk user or kernel stack and store the ips in buf
>> + * @ctx: struct pt_regs*
>> + * @buf: user buffer to fill stack
>> + * @size: the buf size
>> + * @flags: bits 0-7 - number of stack frames to skip
>> + * bit 8 - collect user stack instead of kernel
>> + * bit 11 - get build-id as well if user stack
>> + * other bits - reserved
>> + * Return: >= 0 size copied on success or negative error
>> + *
>> * s64 bpf_csum_diff(from, from_size, to, to_size, seed)
>> * calculate csum diff
>> * @from: raw from buffer
>> @@ -821,7 +832,8 @@ union bpf_attr {
>> FN(msg_apply_bytes), \
>> FN(msg_cork_bytes), \
>> FN(msg_pull_data), \
>> - FN(bind),
>> + FN(bind), \
>> + FN(get_stack),
>>
>> /* integer value in 'imm' field of BPF_CALL instruction selects which
>> helper
>> * function eBPF program intends to call
>> @@ -855,11 +867,12 @@ enum bpf_func_id {
>> /* BPF_FUNC_skb_set_tunnel_key and BPF_FUNC_skb_get_tunnel_key flags. */
>> #define BPF_F_TUNINFO_IPV6 (1ULL << 0)
>>
>> -/* BPF_FUNC_get_stackid flags. */
>> +/* BPF_FUNC_get_stackid and BPF_FUNC_get_stack flags. */
>> #define BPF_F_SKIP_FIELD_MASK 0xffULL
>> #define BPF_F_USER_STACK (1ULL << 8)
>> #define BPF_F_FAST_STACK_CMP (1ULL << 9)
>> #define BPF_F_REUSE_STACKID (1ULL << 10)
>> +#define BPF_F_USER_BUILD_ID (1ULL << 11)
>
> the comment above is not quite correct.
> This new flag is only available for the new helper.
Right, some flags are used for both helpers and some are only used for
one of them. Will make it clear in the next revision.
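Concretely, the comment split in the next revision could look roughly
like this (just a sketch of the comment placement; names and values
unchanged):

  /* BPF_FUNC_get_stackid and BPF_FUNC_get_stack flags. */
  #define BPF_F_SKIP_FIELD_MASK	0xffULL
  #define BPF_F_USER_STACK	(1ULL << 8)
  /* flags used by BPF_FUNC_get_stackid only. */
  #define BPF_F_FAST_STACK_CMP	(1ULL << 9)
  #define BPF_F_REUSE_STACKID	(1ULL << 10)
  /* flags used by BPF_FUNC_get_stack only. */
  #define BPF_F_USER_BUILD_ID	(1ULL << 11)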
>
>>
>> /* BPF_FUNC_skb_set_tunnel_key flags. */
>> #define BPF_F_ZERO_CSUM_TX (1ULL << 1)
>> diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
>> index 04f6ec1..371c72e 100644
>> --- a/kernel/bpf/stackmap.c
>> +++ b/kernel/bpf/stackmap.c
>> @@ -402,6 +402,62 @@ const struct bpf_func_proto bpf_get_stackid_proto = {
>> .arg3_type = ARG_ANYTHING,
>> };
>>
>> +BPF_CALL_4(bpf_get_stack, struct pt_regs *, regs, void *, buf, u32, size,
>> +	   u64, flags)
>> +{
>> + u32 init_nr, trace_nr, copy_len, elem_size, num_elem;
>> + bool user_build_id = flags & BPF_F_USER_BUILD_ID;
>> + u32 skip = flags & BPF_F_SKIP_FIELD_MASK;
>> + bool user = flags & BPF_F_USER_STACK;
>> + struct perf_callchain_entry *trace;
>> + bool kernel = !user;
>> + u64 *ips;
>> +
>> + if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK |
>> + BPF_F_USER_BUILD_ID)))
>> + return -EINVAL;
>> +
>> +	elem_size = (user && user_build_id) ? sizeof(struct bpf_stack_build_id)
>> +					      : sizeof(u64);
>> + if (unlikely(size % elem_size))
>> + return -EINVAL;
>> +
>> + num_elem = size / elem_size;
>> + if (sysctl_perf_event_max_stack < num_elem)
>> + init_nr = 0;
>
> prog's buffer should be zero padded in this case since it
> points to uninit_mem.
Will make the change in the next revision.
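E.g., roughly this at the end of the helper (a sketch on top of the
hunk above, reusing its buf, size and copy_len):

  /* zero the tail of the prog's buffer that the callchain did
   * not fill; it is ARG_PTR_TO_UNINIT_MEM, so the verifier does
   * not guarantee the program initialized it
   */
  if (size > copy_len)
          memset(buf + copy_len, 0, size - copy_len);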
>
>> + else
>> + init_nr = sysctl_perf_event_max_stack - num_elem;
>> + trace = get_perf_callchain(regs, init_nr, kernel, user,
>> + sysctl_perf_event_max_stack, false, false);
>> + if (unlikely(!trace))
>> + return -EFAULT;
>> +
>> + trace_nr = trace->nr - init_nr;
>> + if (trace_nr <= skip)
>> + return -EFAULT;
>> +
>> + trace_nr -= skip;
>> + trace_nr = (trace_nr <= num_elem) ? trace_nr : num_elem;
>> + copy_len = trace_nr * elem_size;
>> + ips = trace->ip + skip + init_nr;
>> + if (user && user_build_id)
>
> the combination of kern + user_build_id should probably be rejected
> earlier with einval.
Right, I missed this case. It should be rejected.
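I.e., something like this next to the existing flags check (sketch):

  /* build-id translation is only meaningful for user stacks */
  if (kernel && user_build_id)
          return -EINVAL;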
>
>> + stack_map_get_build_id_offset(buf, ips, trace_nr, user);
>> + else
>> + memcpy(buf, ips, copy_len);
>> +
>> + return copy_len;
>> +}
>> +
>> +const struct bpf_func_proto bpf_get_stack_proto = {
>> + .func = bpf_get_stack,
>> + .gpl_only = true,
>> + .ret_type = RET_INTEGER,
>> + .arg1_type = ARG_PTR_TO_CTX,
>> + .arg2_type = ARG_PTR_TO_UNINIT_MEM,
>> + .arg3_type = ARG_CONST_SIZE_OR_ZERO,
>
> why allow zero size?
> I'm not sure the helper will work correctly when size=0
The only reason I had is to make bpf programs easier to write,
so they do not need to test for zero size, similar to
bpf_probe_read/bpf_perf_event_output.
I have double checked my implementation and it should
handle zero size properly.
Let me double check whether disallowing zero still permits
reasonable bpf program code that passes the verifier.
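The caller pattern I want to keep simple is roughly the following
fragment inside a program (sketch; get_dynamic_size() is a stand-in
for however the program computes the size at runtime):

  u64 buf[16];
  u32 usize = get_dynamic_size();

  /* with ARG_CONST_SIZE_OR_ZERO a single upper-bound check is
   * enough for the verifier; with plain ARG_CONST_SIZE it would
   * also require proving usize > 0 before the call
   */
  if (usize <= sizeof(buf))
          bpf_get_stack(ctx, buf, usize, 0);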
>
>> + .arg4_type = ARG_ANYTHING,
>> +};
>> +
>> /* Called from eBPF program */
>> static void *stack_map_lookup_elem(struct bpf_map *map, void *key)
>> {
>> diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
>> index 0244973..2aa3a65 100644
>> --- a/kernel/bpf/syscall.c
>> +++ b/kernel/bpf/syscall.c
>> @@ -984,10 +984,13 @@ void bpf_prog_free_id(struct bpf_prog *prog, bool do_idr_lock)
>> static void __bpf_prog_put_rcu(struct rcu_head *rcu)
>> {
>> struct bpf_prog_aux *aux = container_of(rcu, struct bpf_prog_aux, rcu);
>> + bool need_callchain_buf = aux->prog->need_callchain_buf;
>>
>> free_used_maps(aux);
>> bpf_prog_uncharge_memlock(aux->prog);
>> security_bpf_prog_free(aux);
>> + if (need_callchain_buf)
>> + put_callchain_buffers();
>> bpf_prog_free(aux->prog);
>> }
>>
>> @@ -1004,7 +1007,8 @@ static void __bpf_prog_put(struct bpf_prog *prog, bool do_idr_lock)
>> bpf_prog_kallsyms_del(prog->aux->func[i]);
>> bpf_prog_kallsyms_del(prog);
>>
>> - call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu);
>> + synchronize_rcu();
>> + __bpf_prog_put_rcu(&prog->aux->rcu);
>
> there should have been a lockdep splat.
> We cannot call synchronize_rcu() here, since we cannot sleep
> in some cases.
Let me double check this. The following is the reason
why I am using synchronize_rcu().
With call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu),
__bpf_prog_put_rcu() calls put_callchain_buffers(), which
takes a mutex, so with CONFIG_DEBUG_ATOMIC_SLEEP=y the runtime
will complain, since a potential sleep inside an RCU callback
is not allowed.
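If synchronize_rcu() really cannot be used here, one alternative might
be to keep call_rcu() and drop the callchain buffers from the existing
workqueue path instead, where sleeping is fine. A sketch of the added
lines (against bpf_prog_free_deferred() in kernel/bpf/core.c, reusing
this patch's need_callchain_buf bit; the rest of that function stays
as is):

  /* in bpf_prog_free_deferred(), which runs from a workqueue
   * and thus may sleep, so taking the mutex inside
   * put_callchain_buffers() is safe here
   */
  if (aux->prog->need_callchain_buf)
          put_callchain_buffers();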