Message-ID: <6be60772-4d2a-30b0-5ebb-f857db31c037@fb.com>
Date: Tue, 29 Jun 2021 10:27:17 -0700
From: Yonghong Song <yhs@...com>
To: Martin KaFai Lau <kafai@...com>, <bpf@...r.kernel.org>
CC: Alexei Starovoitov <ast@...nel.org>,
Daniel Borkmann <daniel@...earbox.net>,
Eric Dumazet <edumazet@...gle.com>, <kernel-team@...com>,
Neal Cardwell <ncardwell@...gle.com>, <netdev@...r.kernel.org>,
Yuchung Cheng <ycheng@...gle.com>
Subject: Re: [PATCH bpf-next 6/8] bpf: tcp: bpf iter batching and lock_sock
On 6/25/21 1:05 PM, Martin KaFai Lau wrote:
> This patch does batching and lock_sock for the bpf tcp iter.
> It does not affect the proc fs iteration.
>
> With bpf-tcp-cc, new algo rollout happens more often. Instead of
> restarting the application to pick up the new tcp-cc, the next patch
> will allow bpf iter with CAP_NET_ADMIN to do setsockopt(TCP_CONGESTION).
> This requires locking the sock.
>
> Also, unlike the proc iteration (cat /proc/net/tcp[6]), the bpf iter
> can inspect all fields of a tcp_sock. It will be useful to have a
> consistent view on some of the fields (e.g. the ones reported in
> tcp_get_info() that also acquires the sock lock).
>
> Double lock: locking the bucket first and then locking the sock could
> lead to deadlock. This patch takes a batching approach similar to
> inet_diag. While holding the bucket lock, it batches a number of sockets
> into an array first and then unlocks the bucket. Before doing show(),
> it then calls lock_sock_fast().
>
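Just to make sure I understand the flow, the batch-then-lock pattern is
roughly like below (only my own sketch of the idea, not the code in this
patch; "bucket" and its lock/head fields are placeholders):

	/* sketch: batch under the bucket lock, then lock each sock only
	 * after the bucket lock is released, so the two locks are never
	 * held at the same time.
	 */
	spin_lock_bh(&bucket->lock);
	sk_for_each(sk, &bucket->head) {
		sock_hold(sk);	/* keep sk alive after the unlock below */
		iter->batch[iter->end_sk++] = sk;
	}
	spin_unlock_bh(&bucket->lock);

	for (i = 0; i < iter->end_sk; i++) {
		struct sock *sk = iter->batch[i];
		bool slow = lock_sock_fast(sk);

		/* ... show()/setsockopt() under the sock lock ... */
		unlock_sock_fast(sk, slow);
		sock_put(sk);
	}
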
> In a machine with ~400k connections, the maximum number of
> sk in a bucket of the established hashtable is 7. 0.02% of
> the established connections fall into this bucket size.
>
> For listen hash (port+addr lhash2), the bucket is usually very
> small also, except for the SO_REUSEPORT use case where userspace
> could have one SO_REUSEPORT socket per thread.
>
> While batching is used, it can also minimize the chance of missing
> a sock in the setsockopt use case if the whole bucket is batched.
> This patch starts with a batch array of INIT_BATCH_SZ (16),
> which will be enough for the most common cases. bpf_iter_tcp_batch()
> will try to realloc to a larger array to handle the exceptional case
> (e.g. the SO_REUSEPORT case in the lhash2).
>
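And when a bucket turns out to be larger than max_sk, I assume the
caller grows the batch and re-reads the bucket, roughly like this
(again just a sketch to check my understanding, not the elided
bpf_iter_tcp_batch() below; "bucket_sks" and the "again" label are
made-up names):

	/* sketch: the whole bucket did not fit, so grow with some
	 * headroom and batch the bucket again from the start.
	 */
	if (bucket_sks > iter->max_sk) {
		if (bpf_iter_tcp_realloc_batch(iter, bucket_sks * 2))
			return ERR_PTR(-ENOMEM);
		goto again;
	}
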
> Signed-off-by: Martin KaFai Lau <kafai@...com>
> ---
> net/ipv4/tcp_ipv4.c | 236 ++++++++++++++++++++++++++++++++++++++++++--
> 1 file changed, 230 insertions(+), 6 deletions(-)
>
> diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
> index 0d851289a89e..856144d33f52 100644
> --- a/net/ipv4/tcp_ipv4.c
> +++ b/net/ipv4/tcp_ipv4.c
> @@ -2687,6 +2687,15 @@ static int tcp4_seq_show(struct seq_file *seq, void *v)
> }
>
> #ifdef CONFIG_BPF_SYSCALL
> +struct bpf_tcp_iter_state {
> + struct tcp_iter_state state;
> + unsigned int cur_sk;
> + unsigned int end_sk;
> + unsigned int max_sk;
> + struct sock **batch;
> + bool st_bucket_done;
> +};
> +
> struct bpf_iter__tcp {
> __bpf_md_ptr(struct bpf_iter_meta *, meta);
> __bpf_md_ptr(struct sock_common *, sk_common);
> @@ -2705,16 +2714,203 @@ static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
> return bpf_iter_run_prog(prog, &ctx);
> }
>
> +static void bpf_iter_tcp_put_batch(struct bpf_tcp_iter_state *iter)
> +{
> + while (iter->cur_sk < iter->end_sk)
> + sock_put(iter->batch[iter->cur_sk++]);
> +}
> +
> +static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter,
> + unsigned int new_batch_sz)
> +{
> + struct sock **new_batch;
> +
> + new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz, GFP_USER);
Since we return -ENOMEM below, should we add __GFP_NOWARN to the
kvmalloc flags?
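i.e. something like this (just illustrating the suggestion):

	new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
			     GFP_USER | __GFP_NOWARN);
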
> + if (!new_batch)
> + return -ENOMEM;
> +
> + bpf_iter_tcp_put_batch(iter);
> + kvfree(iter->batch);
> + iter->batch = new_batch;
> + iter->max_sk = new_batch_sz;
> +
> + return 0;
> +}
> +
[...]
> +
> static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
> {
> struct bpf_iter_meta meta;
> struct bpf_prog *prog;
> struct sock *sk = v;
> + bool slow;
> uid_t uid;
> + int ret;
>
> if (v == SEQ_START_TOKEN)
> return 0;
>
> + if (sk_fullsock(sk))
> + slow = lock_sock_fast(sk);
> +
> + if (unlikely(sk_unhashed(sk))) {
> + ret = SEQ_SKIP;
> + goto unlock;
> + }
I am not a tcp expert, so maybe a dumb question:
Is it possible to do setsockopt() on a listening socket?
What will happen if the listening sock is unhashed after the
above check?
> +
> if (sk->sk_state == TCP_TIME_WAIT) {
> uid = 0;
> } else if (sk->sk_state == TCP_NEW_SYN_RECV) {
> @@ -2728,11 +2924,18 @@ static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
>
> meta.seq = seq;
> prog = bpf_iter_get_info(&meta, false);
> - return tcp_prog_seq_show(prog, &meta, v, uid);
> + ret = tcp_prog_seq_show(prog, &meta, v, uid);
> +
> +unlock:
> + if (sk_fullsock(sk))
> + unlock_sock_fast(sk, slow);
> + return ret;
> +
> }
>
> static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v)
> {
> + struct bpf_tcp_iter_state *iter = seq->private;
> struct bpf_iter_meta meta;
> struct bpf_prog *prog;
>
> @@ -2743,13 +2946,16 @@ static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v)
> (void)tcp_prog_seq_show(prog, &meta, v, 0);
> }
>
> - tcp_seq_stop(seq, v);
> + if (iter->cur_sk < iter->end_sk) {
> + bpf_iter_tcp_put_batch(iter);
> + iter->st_bucket_done = false;
> + }
> }
>
[...]