Message-ID: <aFMJHoasszw3x2kX@mini-arch>
Date: Wed, 18 Jun 2025 11:44:46 -0700
From: Stanislav Fomichev <stfomichev@...il.com>
To: Jordan Rife <jordan@...fe.io>
Cc: netdev@...r.kernel.org, bpf@...r.kernel.org,
        Daniel Borkmann <daniel@...earbox.net>,
        Martin KaFai Lau <martin.lau@...ux.dev>,
        Willem de Bruijn <willemdebruijn.kernel@...il.com>,
        Kuniyuki Iwashima <kuniyu@...zon.com>,
        Alexei Starovoitov <alexei.starovoitov@...il.com>
Subject: Re: [RESEND PATCH v2 bpf-next 02/12] bpf: tcp: Make sure iter->batch
        always contains a full bucket snapshot

On 06/18, Jordan Rife wrote:
> Require that iter->batch always contains a full bucket snapshot. This
> invariant is important to avoid skipping or repeating sockets during
> iteration when combined with the next few patches. Before, there were
> two cases where a call to bpf_iter_tcp_batch may only capture part of a
> bucket:
>
> 1. When bpf_iter_tcp_realloc_batch() returns -ENOMEM.
> 2. When more sockets are added to the bucket while calling
> bpf_iter_tcp_realloc_batch(), making the updated batch size
> insufficient.
>
> In cases where the batch size only covers part of a bucket, it is
> possible to forget which sockets were already visited, especially if we
> have to process a bucket in more than two batches. This forces us to
> choose between repeating or skipping sockets, so don't allow this:
>
> 1. Stop iteration and propagate -ENOMEM up to userspace if reallocation
> fails instead of continuing with a partial batch.
> 2. Try bpf_iter_tcp_realloc_batch() with GFP_USER just as before, but if
> we still aren't able to capture the full bucket, call
> bpf_iter_tcp_realloc_batch() again while holding the bucket lock to
> guarantee the bucket does not change. On the second attempt use
> GFP_NOWAIT since we hold onto the spin lock.
>
> I did some manual testing to exercise the code paths where GFP_NOWAIT is
> used and where ERR_PTR(err) is returned. I used the realloc test cases
> included later in this series to trigger a scenario where a realloc
> happens inside bpf_iter_tcp_batch and made a small code tweak to force
> the first realloc attempt to allocate a too-small batch, thus requiring
> another attempt with GFP_NOWAIT. Some printks showed both reallocs with
> the tests passing:
>
> May 09 18:18:55 crow kernel: resize batch TCP_SEQ_STATE_LISTENING
> May 09 18:18:55 crow kernel: again GFP_USER
> May 09 18:18:55 crow kernel: resize batch TCP_SEQ_STATE_LISTENING
> May 09 18:18:55 crow kernel: again GFP_NOWAIT
> May 09 18:18:57 crow kernel: resize batch TCP_SEQ_STATE_ESTABLISHED
> May 09 18:18:57 crow kernel: again GFP_USER
> May 09 18:18:57 crow kernel: resize batch TCP_SEQ_STATE_ESTABLISHED
> May 09 18:18:57 crow kernel: again GFP_NOWAIT
>
> With this setup, I also forced each of the bpf_iter_tcp_realloc_batch
> calls to return -ENOMEM to ensure that iteration ends and that the
> read() in userspace fails.
>
> Signed-off-by: Jordan Rife <jordan@...fe.io>
> Reviewed-by: Kuniyuki Iwashima <kuniyu@...zon.com>
> ---
> net/ipv4/tcp_ipv4.c | 96 ++++++++++++++++++++++++++++++++-------------
> 1 file changed, 68 insertions(+), 28 deletions(-)
>
> diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
> index 2e40af6aff37..69c976a07434 100644
> --- a/net/ipv4/tcp_ipv4.c
> +++ b/net/ipv4/tcp_ipv4.c
> @@ -3057,7 +3057,10 @@ static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter,
> if (!new_batch)
> return -ENOMEM;
>
> - bpf_iter_tcp_put_batch(iter);
> + if (flags != GFP_NOWAIT)
> + bpf_iter_tcp_put_batch(iter);
> +
> + memcpy(new_batch, iter->batch, sizeof(*iter->batch) * iter->end_sk);
> kvfree(iter->batch);
> iter->batch = new_batch;
> iter->max_sk = new_batch_sz;
> @@ -3066,69 +3069,85 @@ static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter,
> }
>
> static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq,
> - struct sock *start_sk)
> + struct sock **start_sk)
> {
> - struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
> struct bpf_tcp_iter_state *iter = seq->private;
> - struct tcp_iter_state *st = &iter->state;
> struct hlist_nulls_node *node;
> unsigned int expected = 1;
> struct sock *sk;
>
> - sock_hold(start_sk);
> - iter->batch[iter->end_sk++] = start_sk;
> + sock_hold(*start_sk);
> + iter->batch[iter->end_sk++] = *start_sk;
>
> - sk = sk_nulls_next(start_sk);
> + sk = sk_nulls_next(*start_sk);
> + *start_sk = NULL;
> sk_nulls_for_each_from(sk, node) {
> if (seq_sk_match(seq, sk)) {
> if (iter->end_sk < iter->max_sk) {
> sock_hold(sk);
> iter->batch[iter->end_sk++] = sk;
> + } else if (!*start_sk) {
> + /* Remember where we left off. */
> + *start_sk = sk;
> }
> expected++;
> }
> }
> - spin_unlock(&hinfo->lhash2[st->bucket].lock);
>
> return expected;
> }
>
> static unsigned int bpf_iter_tcp_established_batch(struct seq_file *seq,
> - struct sock *start_sk)
> + struct sock **start_sk)
> {
> - struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
> struct bpf_tcp_iter_state *iter = seq->private;
> - struct tcp_iter_state *st = &iter->state;
> struct hlist_nulls_node *node;
> unsigned int expected = 1;
> struct sock *sk;
>
> - sock_hold(start_sk);
> - iter->batch[iter->end_sk++] = start_sk;
> + sock_hold(*start_sk);
> + iter->batch[iter->end_sk++] = *start_sk;
>
> - sk = sk_nulls_next(start_sk);
> + sk = sk_nulls_next(*start_sk);
> + *start_sk = NULL;
> sk_nulls_for_each_from(sk, node) {
> if (seq_sk_match(seq, sk)) {
> if (iter->end_sk < iter->max_sk) {
> sock_hold(sk);
> iter->batch[iter->end_sk++] = sk;
> + } else if (!*start_sk) {
> + /* Remember where we left off. */
> + *start_sk = sk;
> }
> expected++;
> }
> }
> - spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
>
> return expected;
> }
>
> +static void bpf_iter_tcp_unlock_bucket(struct seq_file *seq)
> +{
> + struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
> + struct bpf_tcp_iter_state *iter = seq->private;
> + struct tcp_iter_state *st = &iter->state;
> +
> + if (st->state == TCP_SEQ_STATE_LISTENING)
> + spin_unlock(&hinfo->lhash2[st->bucket].lock);
> + else
> + spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
> +}
> +
> static struct sock *bpf_iter_tcp_batch(struct seq_file *seq)
> {
> struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
> struct bpf_tcp_iter_state *iter = seq->private;
> struct tcp_iter_state *st = &iter->state;
> + int prev_bucket, prev_state;
> unsigned int expected;
> - bool resized = false;
> + int resizes = 0;
> struct sock *sk;
> + int err;
>
> /* The st->bucket is done. Directly advance to the next
> * bucket instead of having the tcp_seek_last_pos() to skip
> @@ -3149,29 +3168,50 @@ static struct sock *bpf_iter_tcp_batch(struct seq_file *seq)
> /* Get a new batch */
> iter->cur_sk = 0;
> iter->end_sk = 0;
> - iter->st_bucket_done = false;
> + iter->st_bucket_done = true;
>
> + prev_bucket = st->bucket;
> + prev_state = st->state;
> sk = tcp_seek_last_pos(seq);
> if (!sk)
> return NULL; /* Done */
> + if (st->bucket != prev_bucket || st->state != prev_state)
> + resizes = 0;
> + expected = 0;
>
> +fill_batch:
> if (st->state == TCP_SEQ_STATE_LISTENING)
> - expected = bpf_iter_tcp_listening_batch(seq, sk);
> + expected += bpf_iter_tcp_listening_batch(seq, &sk);
> else
> - expected = bpf_iter_tcp_established_batch(seq, sk);
> + expected += bpf_iter_tcp_established_batch(seq, &sk);
>
> - if (iter->end_sk == expected) {
> - iter->st_bucket_done = true;
> - return sk;
> - }
[..]
> + if (unlikely(resizes <= 1 && iter->end_sk != expected)) {
> + resizes++;
> +
> + if (resizes == 1) {
> + bpf_iter_tcp_unlock_bucket(seq);
>
> - if (!resized && !bpf_iter_tcp_realloc_batch(iter, expected * 3 / 2,
> - GFP_USER)) {
> - resized = true;
> - goto again;
> + err = bpf_iter_tcp_realloc_batch(iter, expected * 3 / 2,
> + GFP_USER);
> + if (err)
> + return ERR_PTR(err);
> + goto again;
> + }
> +
> + err = bpf_iter_tcp_realloc_batch(iter, expected, GFP_NOWAIT);
> + if (err) {
> + bpf_iter_tcp_unlock_bucket(seq);
> + return ERR_PTR(err);
> + }
> +
> + expected = iter->end_sk;
> + goto fill_batch;

Can we try to unroll this? Add new helpers to hide the repeating parts,
and store extra state in iter if needed.

AFAIU, we want the following:

1. find sk, try to fill the batch, if it fits -> bail out
2. try to allocate a new batch with GFP_USER, try to fill again -> bail out
3. otherwise, attempt GFP_NOWAIT and do that dance where you copy over
   the previous partial copy

The conditional call to bpf_iter_tcp_put_batch in bpf_iter_tcp_realloc_batch
does not look nice :-( Same for the unconditional memcpy (which, if I
understand correctly, is only needed for the GFP_NOWAIT case). I'm 99% sure
your current version works, but it's a bit hard to follow :-(

Untested code to illustrate the idea below. Any reason it won't work?

/* fast path */
sk = tcp_seek_last_pos(seq);
if (!sk)
        return NULL;

fits = bpf_iter_tcp_fill_batch(...);
bpf_iter_tcp_unlock_bucket(iter);
if (fits)
        return sk;

/* not enough space to store full batch, try to reallocate with GFP_USER */
bpf_iter_tcp_free_batch(iter);
if (bpf_iter_tcp_alloc_batch(iter, GFP_USER)) {
        /* allocated 'expected' size, try to fill again */
        sk = tcp_seek_last_pos(seq);
        if (!sk)
                return NULL;

        fits = bpf_iter_tcp_fill_batch(...);
        if (fits) {
                bpf_iter_tcp_unlock_bucket(iter);
                return sk;
        }
}

/* the bucket is still locked here, sk points to the correct one,
 * we have a partial result in iter->batch
 */
old_batch = iter->batch;
if (!bpf_iter_tcp_alloc_batch(iter, GFP_NOWAIT)) {
        /* no luck, bail out */
        bpf_iter_tcp_unlock_bucket(iter);
        bpf_iter_tcp_free_batch(iter); /* or put? */
        return ERR_PTR(-ENOMEM);
}

if (old_batch) {
        /* copy partial result from the previous run if needed? */
        memcpy(iter->batch, old_batch, ...);
        kvfree(old_batch);
}

/* TODO: somehow fill the remainder */
bpf_iter_tcp_unlock_bucket(iter);
return ..;

....

bool bpf_iter_tcp_fill_batch(...)
{
        if (st->state == TCP_SEQ_STATE_LISTENING)
                expected = bpf_iter_tcp_listening_batch(seq, sk);
        else
                expected = bpf_iter_tcp_established_batch(seq, sk);

        /* TODO: store expected into the iter for future resizing */
        /* TODO: make bpf_iter_tcp_xxx_batch store start_sk in iter */

        if (iter->end_sk == expected) {
                iter->st_bucket_done = true;
                return true;
        }

        return false;
}

void bpf_iter_tcp_free_batch(...)
{
        bpf_iter_tcp_put_batch(iter);
        kvfree(iter->batch);
        iter->batch = NULL;
}
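
(For completeness: I'm assuming bpf_iter_tcp_alloc_batch above is roughly
the existing bpf_iter_tcp_realloc_batch minus the put/memcpy/kvfree of the
old batch, i.e. it only allocates and swaps in the new array and leaves the
old one to the caller. The iter->expected field below is made up -- that's
the "store expected into the iter" TODO. Rough sketch, untested as well:)

static bool bpf_iter_tcp_alloc_batch(struct bpf_tcp_iter_state *iter,
                                     gfp_t flags)
{
        /* iter->expected is hypothetical: whatever bucket size the
         * last fill attempt recorded.
         */
        unsigned int new_batch_sz = iter->expected;
        struct sock **new_batch;

        new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz, flags);
        if (!new_batch)
                return false;

        /* Don't touch the old batch here; in the GFP_USER path it was
         * already freed by the caller, in the GFP_NOWAIT path the
         * caller still holds it in old_batch and copies it over.
         */
        iter->batch = new_batch;
        iter->max_sk = new_batch_sz;
        return true;
}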