Message-ID: <1df36069-4b7c-5244-8d49-37ad831a298e@gmail.com>
Date: Tue, 16 Jan 2018 21:49:16 -0800
From: John Fastabend <john.fastabend@...il.com>
To: Alexei Starovoitov <alexei.starovoitov@...il.com>
Cc: borkmann@...earbox.net, ast@...nel.org, netdev@...r.kernel.org,
kafai@...com
Subject: Re: [bpf-next PATCH 5/7] bpf: create tcp_bpf_ulp allowing BPF to
monitor socket TX/RX data
On 01/16/2018 06:25 PM, Alexei Starovoitov wrote:
> On Fri, Jan 12, 2018 at 10:11:11AM -0800, John Fastabend wrote:
>> This implements a BPF ULP layer to allow policy enforcement and
>> monitoring at the socket layer. In order to support this a new
>> program type BPF_PROG_TYPE_SK_MSG is used to run the policy at
>> the sendmsg/sendpage hook. To attach the policy to sockets a
>> sockmap is used with a new program attach type BPF_SK_MSG_VERDICT.
[...]
>
> overall design looks clean. imo huge improvement from first version.
>
Great, thanks for the review.
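For anyone following the thread, attaching works like the existing
sockmap verdict/parser hooks, just with the new attach type. A minimal
userspace sketch (assuming the tools/lib/bpf bpf_prog_attach() wrapper;
the names follow this series and could still change before merge):

	#include "bpf.h"	/* tools/lib/bpf wrapper, assumed in the include path */

	/* Run a BPF_PROG_TYPE_SK_MSG program on every sendmsg()/sendpage()
	 * of the sockets added to the given sockmap.
	 */
	static int attach_msg_verdict(int sk_msg_prog_fd, int sockmap_fd)
	{
		return bpf_prog_attach(sk_msg_prog_fd, sockmap_fd,
				       BPF_SK_MSG_VERDICT, 0);
	}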
> Few nits:
>
[...]
>> +
>> +static int bpf_tcp_push(struct sock *sk, struct scatterlist *sg,
>> + int *sg_end, int flags, bool charge)
>> +{
>> + int sendpage_flags = flags | MSG_SENDPAGE_NOTLAST;
>> + int offset, ret = 0;
>> + struct page *p;
>> + size_t size;
>> +
>> + size = sg->length;
>> + offset = sg->offset;
>> +
>> + while (1) {
>> + if (sg_is_last(sg))
>> + sendpage_flags = flags;
>> +
>> + tcp_rate_check_app_limited(sk);
>> + p = sg_page(sg);
>> +retry:
>> + ret = do_tcp_sendpages(sk, p, offset, size, sendpage_flags);
>> + if (ret != size) {
>> + if (ret > 0) {
>> + offset += ret;
>> + size -= ret;
>> + goto retry;
>> + }
>> +
>> + if (charge)
>> + sk_mem_uncharge(sk,
>> + sg->length - size - sg->offset);
>
> should the bool argument be called 'uncharge' instead ?
>
Agreed, that would be a better name; will update.
>> +
>> + sg->offset = offset;
>> + sg->length = size;
>> + return ret;
>> + }
>> +
>> + put_page(p);
>> + if (charge)
>> + sk_mem_uncharge(sk, sg->length);
>> + *sg_end += 1;
>> + sg = sg_next(sg);
>> + if (!sg)
>> + break;
>> +
>> + offset = sg->offset;
>> + size = sg->length;
>> + }
>> +
>> + return 0;
>> +}
[...]
>> +static int bpf_tcp_sendmsg_do_redirect(struct scatterlist *sg, int sg_num,
>> + struct sk_msg_buff *md, int flags)
>> +{
>> + int i, sg_curr = 0, err, free;
>> + struct smap_psock *psock;
>> + struct sock *sk;
>> +
>> + rcu_read_lock();
>> + sk = do_msg_redirect_map(md);
>> + if (unlikely(!sk))
>> + goto out_rcu;
>> +
>> + psock = smap_psock_sk(sk);
>> + if (unlikely(!psock))
>> + goto out_rcu;
>> +
>> + if (!refcount_inc_not_zero(&psock->refcnt))
>> + goto out_rcu;
>> +
>> + rcu_read_unlock();
>> + lock_sock(sk);
>> + err = bpf_tcp_push(sk, sg, &sg_curr, flags, false);
>> + if (unlikely(err))
>> + goto out;
>> + release_sock(sk);
>> + smap_release_sock(psock, sk);
>> + return 0;
>> +out_rcu:
>> + rcu_read_unlock();
>> +out:
>> + for (i = sg_curr; i < sg_num; ++i) {
>> + free += sg[i].length;
>> + put_page(sg_page(&sg[i]));
>> + }
>> + return free;
>
> error path keeps rcu_lock and sk locked?
>
Yep, although it looks like the rcu_read_unlock() is OK because it is
released above the call, but the

	if (unlikely(err))
		goto out;

check needs to be moved below the smap_release_sock(). Thanks!
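i.e. roughly the following (just a sketch, the v2 ordering may end up
slightly different):

	rcu_read_unlock();
	lock_sock(sk);
	err = bpf_tcp_push(sk, sg, &sg_curr, flags, false);
	release_sock(sk);
	smap_release_sock(psock, sk);
	if (unlikely(err))
		goto out;	/* sk and psock already released, only free the sg */
	return 0;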
>> +}
>> +
[...]
>> + while (msg_data_left(msg)) {
>> + int sg_curr;
>> +
>> + if (sk->sk_err) {
>> + err = sk->sk_err;
>> + goto out_err;
>> + }
>> +
>> + copy = msg_data_left(msg);
>> + if (!sk_stream_memory_free(sk))
>> + goto wait_for_sndbuf;
>> +
>> + /* sg_size indicates bytes already allocated and sg_num
>> + * is last sg element used. This is used when alloc_sg
>> + * partially allocates a scatterlist and then is sent
>> + * to wait for memory. In normal case (no memory pressure)
>> + * both sg_num and sg_size are zero.
>> + */
>> + copy = copy - sg_size;
>> + err = sk_alloc_sg(sk, copy, sg, &sg_num, &sg_size, 0);
>> + if (err) {
>> + if (err != -ENOSPC)
>> + goto wait_for_memory;
>> + copy = sg_size;
>> + }
>> +
>> + err = memcopy_from_iter(sk, sg, sg_num, &msg->msg_iter, copy);
>> + if (err < 0) {
>> + free_sg(sk, sg, 0, sg_num);
>> + goto out_err;
>> + }
>> +
>> + copied += copy;
>> +
>> + /* If msg is larger than MAX_SKB_FRAGS we can send multiple
>> + * scatterlists per msg. However BPF decisions apply to the
>> + * entire msg.
>> + */
>> + if (eval == __SK_NONE)
>> + eval = smap_do_tx_msg(sk, psock, &md);
>
> it seems sk_alloc_sg() will put 64k bytes into sg_data,
Yep, up to 64k will be copied from the msg into sg data.
> but this program will see only first SG ?
Correct, to read further into the msg we would need a helper or some
other way to catch reads/writes past the first 4k and walk to the next
sg. We have the same limitation in cls_bpf.
I have started a patch on top of this series with the current working
title msg_apply_bytes(int bytes). This would let us apply a verdict to
a set number of bytes instead of the entire msg. By calling
msg_apply_bytes(data_end - data), a user who needs to read an entire msg
could do so in 4k chunks until the entire msg has passed through the
bpf prog.
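As a rough illustration of the idea (the helper name and signature are
only a working title, and the context struct is assumed to expose data
and data_end the same way the kernel side quoted above sets them up):

	SEC("sk_msg")
	int msg_verdict_prog(struct sk_msg_md *msg)
	{
		void *data = (void *)(long)msg->data;
		void *data_end = (void *)(long)msg->data_end;

		/* working-title helper from the follow-up patch: apply this
		 * verdict only to the bytes we can actually see, so the rest
		 * of a large msg comes back through the prog in further chunks.
		 */
		msg_apply_bytes(data_end - data);

		return SK_PASS;
	}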
> and it's typically going to be one page only ?
yep
> then what's the value of waiting for MAX_SKB_FRAGS ?
>
It's not waiting for MAX_SKB_FRAGS, it's simply copying up to MAX_SKB_FRAGS
pages in one call if possible. It seems better to me to run this loop
over as much data as we can.
>> +
>> + switch (eval) {
>> + case __SK_PASS:
>> + sg_mark_end(sg + sg_num - 1);
>> + err = bpf_tcp_push(sk, sg, &sg_curr, flags, true);
>> + if (unlikely(err)) {
>> + copied -= free_sg(sk, sg, sg_curr, sg_num);
>> + goto out_err;
>> + }
>> + break;
>> + case __SK_REDIRECT:
>> + sg_mark_end(sg + sg_num - 1);
>> + goto do_redir;
>> + case __SK_DROP:
>> + default:
>> + copied -= free_sg(sk, sg, 0, sg_num);
>> + goto out_err;
>> + }
[...]
>> + /* Calculate pkt data pointers and run BPF program */
>> + md.data = page_address(page) + offset;
>> + md.data_end = md.data + size;
>> + _rc = (*prog->bpf_func)(&md, prog->insnsi);
>> +
>> +verdict:
>> + rcu_read_unlock();
>> + preempt_enable();
>> +
>> + /* Moving return codes from UAPI namespace into internal namespace */
>> + rc = ((_rc == SK_PASS) ? __SK_PASS : __SK_DROP);
>> +
>> + switch (rc) {
>> + case __SK_PASS:
>> + lock_sock(sk);
>> + rc = tcp_sendpage_locked(sk, page, offset, size, flags);
>> + release_sock(sk);
>> + break;
>> + case __SK_REDIRECT:
>> + smap_release_sock(psock, sk);
>> + rc = bpf_tcp_sendpage_do_redirect(page, offset, size, flags,
>> + &md);
>
> looks like this path wasn't tested,
:/ yep, adding a test now.
> since above rc = ...; line cannot return REDIRECT...
> probably should be common helper for both tcp_bpf_*() funcs
> to call into bpf and convert rc.
>
Will do in v2, thanks.
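Something like this, perhaps (just a sketch; how the redirect target is
recorded in the md is an assumption here, so that field name is made up):

	/* Hypothetical common helper for bpf_tcp_sendmsg()/bpf_tcp_sendpage():
	 * run the attached SK_MSG program and convert the UAPI return code
	 * into the internal __SK_* namespace, including redirect.
	 */
	static int bpf_tcp_msg_verdict(struct bpf_prog *prog, struct sk_msg_buff *md)
	{
		int _rc = (*prog->bpf_func)(md, prog->insnsi);

		switch (_rc) {
		case SK_PASS:
			/* assumes the redirect helper stashed its target map
			 * in the md, e.g. md->map
			 */
			return md->map ? __SK_REDIRECT : __SK_PASS;
		case SK_DROP:
		default:
			return __SK_DROP;
		}
	}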
>> + break;
>> + case __SK_DROP:
>> + default:
>> + rc = -EACCES;
>> + }
>> +
>> + return rc;
>> +}
>> +
>> +static int bpf_tcp_msg_add(struct smap_psock *psock,
>> + struct sock *sk,
>> + struct bpf_prog *tx_msg)
>> +{
>> + struct bpf_prog *orig_tx_msg;
>> +
>> + orig_tx_msg = xchg(&psock->bpf_tx_msg, tx_msg);
>> + if (orig_tx_msg)
>> + bpf_prog_put(orig_tx_msg);
>
> the function is replacing the program. why is it called bpf_tcp_msg_add ?
>
Bad name, will rename.
>> +
>> + return tcp_set_ulp_id(sk, TCP_ULP_BPF);
>> +}
>> +
[...]
>> +static int bpf_tcp_ulp_register(void)
>> +{
>> + tcp_bpf_proto = tcp_prot;
>> + tcp_bpf_proto.sendmsg = bpf_tcp_sendmsg;
>> + tcp_bpf_proto.sendpage = bpf_tcp_sendpage;
>> + return tcp_register_ulp(&bpf_tcp_ulp_ops);
>
> I don't see corresponding tcp_unregister_ulp().
>
There is none. tcp_register_ulp() adds the bpf_tcp_ulp to the list of
available ULPs and never removes it. To remove it we would have to
keep a ref count on the reg/unreg calls. This would require a couple
more patches to the ULP infra, and I'm not sure it hurts to leave the
ULP reference around...
Maybe a follow-on patch? Or else it could be a patch in front of this
patch.
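If we did want to drop it again, the minimal shape would probably be a
use count around register/unregister, roughly like the sketch below
(assuming a tcp_unregister_ulp() on the ULP side; racy as written, which
is exactly the extra ULP-infra work mentioned above):

	static atomic_t bpf_tcp_ulp_users = ATOMIC_INIT(0);

	static int bpf_tcp_ulp_register(void)
	{
		/* only the first user actually registers the ULP */
		if (atomic_inc_return(&bpf_tcp_ulp_users) > 1)
			return 0;

		tcp_bpf_proto = tcp_prot;
		tcp_bpf_proto.sendmsg = bpf_tcp_sendmsg;
		tcp_bpf_proto.sendpage = bpf_tcp_sendpage;
		return tcp_register_ulp(&bpf_tcp_ulp_ops);
	}

	static void bpf_tcp_ulp_unregister(void)
	{
		/* last user removes the ULP from the available list again */
		if (atomic_dec_and_test(&bpf_tcp_ulp_users))
			tcp_unregister_ulp(&bpf_tcp_ulp_ops);
	}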
>> @@ -710,6 +1151,7 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops,
>> */
>> verdict = READ_ONCE(stab->bpf_verdict);
>> parse = READ_ONCE(stab->bpf_parse);
>> + tx_msg = READ_ONCE(stab->bpf_tx_msg);
>>
>> if (parse && verdict) {
>> /* bpf prog refcnt may be zero if a concurrent attach operation
>> @@ -728,6 +1170,17 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops,
>> }
>> }
>>
>> + if (tx_msg) {
>> + tx_msg = bpf_prog_inc_not_zero(stab->bpf_tx_msg);
>
> prog_inc_not_zero() looks scary here.
> Why 'not_zero' is necessary ?
>
Same reason we use the inc_not_zero variants with the verdict/parse
programs:
/* bpf prog refcnt may be zero if a concurrent attach operation
* removes the program after the above READ_ONCE() but before
* we increment the refcnt. If this is the case abort with an
* error.
*/
>> + if (IS_ERR(tx_msg)) {
>> + if (verdict)
>> + bpf_prog_put(verdict);
>> + if (parse)
>> + bpf_prog_put(parse);
>> + return PTR_ERR(tx_msg);
>> + }
>> + }
>> +
Thanks!