Message-ID: <88507f09-b422-4991-90a8-1b8cedc07d86@linux.dev>
Date: Tue, 8 Jul 2025 16:16:20 -0700
From: Martin KaFai Lau <martin.lau@...ux.dev>
To: Jordan Rife <jordan@...fe.io>
Cc: Daniel Borkmann <daniel@...earbox.net>,
Willem de Bruijn <willemdebruijn.kernel@...il.com>,
Kuniyuki Iwashima <kuniyu@...gle.com>,
Alexei Starovoitov <alexei.starovoitov@...il.com>,
Stanislav Fomichev <stfomichev@...il.com>, netdev@...r.kernel.org,
bpf@...r.kernel.org
Subject: Re: [PATCH v4 bpf-next 02/12] bpf: tcp: Make sure iter->batch always
contains a full bucket snapshot
On 7/7/25 8:50 AM, Jordan Rife wrote:
> static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq,
> - struct sock *start_sk)
> + struct sock **start_sk)
> {
> - struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
> struct bpf_tcp_iter_state *iter = seq->private;
> - struct tcp_iter_state *st = &iter->state;
> struct hlist_nulls_node *node;
> unsigned int expected = 1;
> struct sock *sk;
>
> - sock_hold(start_sk);
> - iter->batch[iter->end_sk++] = start_sk;
> + sock_hold(*start_sk);
> + iter->batch[iter->end_sk++] = *start_sk;
>
> - sk = sk_nulls_next(start_sk);
> + sk = sk_nulls_next(*start_sk);
> + *start_sk = NULL;
> sk_nulls_for_each_from(sk, node) {
> if (seq_sk_match(seq, sk)) {
> if (iter->end_sk < iter->max_sk) {
> sock_hold(sk);
> iter->batch[iter->end_sk++] = sk;
> + } else if (!*start_sk) {
> + /* Remember where we left off. */
> + *start_sk = sk;
> }
> expected++;
> }
> }
> - spin_unlock(&hinfo->lhash2[st->bucket].lock);
>
> return expected;
> }
>
[ ... ]
> static struct sock *bpf_iter_tcp_batch(struct seq_file *seq)
> {
> struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
> struct bpf_tcp_iter_state *iter = seq->private;
> struct tcp_iter_state *st = &iter->state;
> unsigned int expected;
> - bool resized = false;
> struct sock *sk;
> + int err;
>
> /* The st->bucket is done. Directly advance to the next
> * bucket instead of having the tcp_seek_last_pos() to skip
> @@ -3145,33 +3171,52 @@ static struct sock *bpf_iter_tcp_batch(struct seq_file *seq)
> }
> }
>
> -again:
> - /* Get a new batch */
> iter->cur_sk = 0;
> iter->end_sk = 0;
> - iter->st_bucket_done = false;
> + iter->st_bucket_done = true;
>
> sk = tcp_seek_last_pos(seq);
> if (!sk)
> return NULL; /* Done */
>
> - if (st->state == TCP_SEQ_STATE_LISTENING)
> - expected = bpf_iter_tcp_listening_batch(seq, sk);
> - else
> - expected = bpf_iter_tcp_established_batch(seq, sk);
> + expected = bpf_iter_fill_batch(seq, &sk);
> + if (likely(iter->end_sk == expected))
> + goto done;
>
> - if (iter->end_sk == expected) {
> - iter->st_bucket_done = true;
> - return sk;
> - }
> + /* Batch size was too small. */
> + bpf_iter_tcp_unlock_bucket(seq);
> + bpf_iter_tcp_put_batch(iter);
> + err = bpf_iter_tcp_realloc_batch(iter, expected * 3 / 2,
> + GFP_USER);
> + if (err)
> + return ERR_PTR(err);
> +
> + iter->cur_sk = 0;
> + iter->end_sk = 0;
> +
> + sk = tcp_seek_last_pos(seq);
> + if (!sk)
> + return NULL; /* Done */
> +
> + expected = bpf_iter_fill_batch(seq, &sk);
A nit.
The next start_sk is stored back in &sk. It took me a while to see how it
is used and why it needs the new "struct sock **start_sk" argument.
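For other readers: iiuc, on overflow bpf_iter_tcp_listening_batch() writes
the first sock that did not fit back through the pointer, roughly:

	expected = bpf_iter_fill_batch(seq, &sk);
	/* If the whole bucket did not fit, sk now points at the first
	 * unbatched sock, i.e. the resume point for the retry below.
	 */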
> + if (likely(iter->end_sk == expected))
> + goto done;
>
> - if (!resized && !bpf_iter_tcp_realloc_batch(iter, expected * 3 / 2,
> - GFP_USER)) {
> - resized = true;
> - goto again;
> + /* Batch size was still too small. Hold onto the lock while we try
> + * again with a larger batch to make sure the current bucket's size
> + * does not change in the meantime.
> + */
> + err = bpf_iter_tcp_realloc_batch(iter, expected, GFP_NOWAIT);
> + if (err) {
> + bpf_iter_tcp_unlock_bucket(seq);
> + return ERR_PTR(err);
> }
>
> - return sk;
> + expected = bpf_iter_fill_batch(seq, &sk);
iiuc, the stored "&sk" is only useful in this special GFP_NOWAIT case.
How about directly figuring out the next start_sk here? The next
start_sk should be sk_nulls_next() of iter->batch[iter->end_sk - 1]?
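Something like this untested sketch, which would also let
bpf_iter_fill_batch() keep taking a plain "struct sock *". It assumes
bpf_iter_fill_batch() can cope with a first sk that fails seq_sk_match(),
since sk_nulls_next() may land on a non-matching sock:

	/* Resume from the sock after the last one already batched,
	 * instead of threading the resume point out through a
	 * "struct sock **start_sk" argument.
	 */
	sk = sk_nulls_next(iter->batch[iter->end_sk - 1]);
	expected = bpf_iter_fill_batch(seq, sk);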
> +done:
> + WARN_ON_ONCE(iter->end_sk != expected);
nit. I would move this WARN_ON_ONCE before the "done:" label. It seems to make
sense only for the GFP_NOWAIT case.
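i.e. something like (untested):

	expected = bpf_iter_fill_batch(seq, &sk);
	/* The two likely() paths only jump to "done" when the counts
	 * already match, so the check is only meaningful here, after
	 * the locked GFP_NOWAIT retry.
	 */
	WARN_ON_ONCE(iter->end_sk != expected);
done:
	bpf_iter_tcp_unlock_bucket(seq);
	return iter->batch[0];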
> + bpf_iter_tcp_unlock_bucket(seq);
> + return iter->batch[0];
> }
>
> static void *bpf_iter_tcp_seq_start(struct seq_file *seq, loff_t *pos)