[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20250428180036.369192-3-jordan@jrife.io>
Date: Mon, 28 Apr 2025 11:00:26 -0700
From: Jordan Rife <jordan@...fe.io>
To: netdev@...r.kernel.org,
bpf@...r.kernel.org
Cc: Jordan Rife <jordan@...fe.io>,
Aditi Ghag <aditi.ghag@...valent.com>,
Daniel Borkmann <daniel@...earbox.net>,
Martin KaFai Lau <martin.lau@...ux.dev>,
Willem de Bruijn <willemdebruijn.kernel@...il.com>,
Kuniyuki Iwashima <kuniyu@...zon.com>,
Alexei Starovoitov <alexei.starovoitov@...il.com>
Subject: [PATCH v6 bpf-next 2/7] bpf: udp: Make sure iter->batch always contains a full bucket snapshot
Require that iter->batch always contains a full bucket snapshot. This
invariant is important to avoid skipping or repeating sockets during
iteration when combined with the next few patches. Before, there were
two cases where a call to bpf_iter_udp_batch may only capture part of a
bucket:
1. When bpf_iter_udp_realloc_batch() returns -ENOMEM [1].
2. When more sockets are added to the bucket while calling
bpf_iter_udp_realloc_batch(), making the updated batch size
insufficient [2].
In cases where the batch size only covers part of a bucket, it is
possible to forget which sockets were already visited, especially if we
have to process a bucket in more than two batches. This forces us to
choose between repeating or skipping sockets, so don't allow this:
1. Stop iteration and propagate -ENOMEM up to userspace if reallocation
fails instead of continuing with a partial batch.
2. Try bpf_iter_udp_realloc_batch() with GFP_USER just as before, but if
we still aren't able to capture the full bucket, call
bpf_iter_udp_realloc_batch() again while holding the bucket lock to
guarantee the bucket does not change. On the second attempt use
GFP_NOWAIT since we hold onto the spin lock.
Introduce the udp_portaddr_for_each_entry_from macro and use it instead
of udp_portaddr_for_each_entry to make it possible to continue iteration
from an arbitrary socket. This is required for this patch in the
GFP_NOWAIT case to allow us to fill the rest of a batch starting from
the middle of a bucket and the later patch which skips sockets that were
already seen.
Testing all scenarios directly is a bit difficult, but I did some manual
testing to exercise the code paths where GFP_NOWAIT is used and where
ERR_PTR(err) is returned. I used the realloc test case included later
in this series to trigger a scenario where a realloc happens inside
bpf_iter_udp_batch and made a small code tweak to force the first
realloc attempt to allocate a too-small batch, thus requiring
another attempt with GFP_NOWAIT. Some printks showed both reallocs with
the tests passing:
Apr 25 23:16:24 crow kernel: go again GFP_USER
Apr 25 23:16:24 crow kernel: go again GFP_NOWAIT
With this setup, I also forced each of the bpf_iter_udp_realloc_batch
calls to return -ENOMEM to ensure that iteration ends and that the
read() in userspace fails.
[1]: https://lore.kernel.org/bpf/CABi4-ogUtMrH8-NVB6W8Xg_F_KDLq=yy-yu-tKr2udXE2Mu1Lg@mail.gmail.com/
[2]: https://lore.kernel.org/bpf/7ed28273-a716-4638-912d-f86f965e54bb@linux.dev/
Signed-off-by: Jordan Rife <jordan@...fe.io>
---
include/linux/udp.h | 3 ++
net/ipv4/udp.c | 81 ++++++++++++++++++++++++++++++---------------
2 files changed, 58 insertions(+), 26 deletions(-)
diff --git a/include/linux/udp.h b/include/linux/udp.h
index 0807e21cfec9..a69da9c4c1c5 100644
--- a/include/linux/udp.h
+++ b/include/linux/udp.h
@@ -209,6 +209,9 @@ static inline void udp_allow_gso(struct sock *sk)
#define udp_portaddr_for_each_entry(__sk, list) \
hlist_for_each_entry(__sk, list, __sk_common.skc_portaddr_node)
+#define udp_portaddr_for_each_entry_from(__sk) \
+ hlist_for_each_entry_from(__sk, __sk_common.skc_portaddr_node)
+
#define udp_portaddr_for_each_entry_rcu(__sk, list) \
hlist_for_each_entry_rcu(__sk, list, __sk_common.skc_portaddr_node)
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 6a3c351aa06e..5fe22f4f43d7 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -3410,8 +3410,9 @@ static struct sock *bpf_iter_udp_batch(struct seq_file *seq)
int resume_bucket, resume_offset;
struct udp_table *udptable;
unsigned int batch_sks = 0;
- bool resized = false;
+ int resizes = 0;
struct sock *sk;
+ int err = 0;
resume_bucket = state->bucket;
resume_offset = iter->offset;
@@ -3432,18 +3433,21 @@ static struct sock *bpf_iter_udp_batch(struct seq_file *seq)
*/
iter->cur_sk = 0;
iter->end_sk = 0;
- iter->st_bucket_done = false;
+ iter->st_bucket_done = true;
batch_sks = 0;
for (; state->bucket <= udptable->mask; state->bucket++) {
struct udp_hslot *hslot2 = &udptable->hash2[state->bucket].hslot;
if (hlist_empty(&hslot2->head))
- continue;
+ goto next_bucket;
iter->offset = 0;
spin_lock_bh(&hslot2->lock);
- udp_portaddr_for_each_entry(sk, &hslot2->head) {
+ sk = hlist_entry_safe(hslot2->head.first, struct sock,
+ __sk_common.skc_portaddr_node);
+fill_batch:
+ udp_portaddr_for_each_entry_from(sk) {
if (seq_sk_match(seq, sk)) {
/* Resume from the last iterated socket at the
* offset in the bucket before iterator was stopped.
@@ -3460,33 +3464,55 @@ static struct sock *bpf_iter_udp_batch(struct seq_file *seq)
batch_sks++;
}
}
+
+ /* Allocate a larger batch and try again. */
+ if (unlikely(resizes <= 1 && iter->end_sk &&
+ iter->end_sk != batch_sks)) {
+ resizes++;
+
+ /* First, try with GFP_USER to maximize the chances of
+ * grabbing more memory.
+ */
+ if (resizes == 1) {
+ spin_unlock_bh(&hslot2->lock);
+ err = bpf_iter_udp_realloc_batch(iter,
+ batch_sks * 3 / 2,
+ GFP_USER);
+ if (err)
+ return ERR_PTR(err);
+ /* Start over. */
+ goto again;
+ }
+
+ /* Next, hold onto the lock, so the bucket doesn't
+ * change while we get the rest of the sockets.
+ */
+ err = bpf_iter_udp_realloc_batch(iter, batch_sks,
+ GFP_NOWAIT);
+ if (err) {
+ spin_unlock_bh(&hslot2->lock);
+ return ERR_PTR(err);
+ }
+
+ /* Pick up where we left off. */
+ sk = iter->batch[iter->end_sk - 1];
+ sk = hlist_entry_safe(sk->__sk_common.skc_portaddr_node.next,
+ struct sock,
+ __sk_common.skc_portaddr_node);
+ batch_sks = iter->end_sk;
+ goto fill_batch;
+ }
+
spin_unlock_bh(&hslot2->lock);
if (iter->end_sk)
break;
+next_bucket:
+ resizes = 0;
}
- /* All done: no batch made. */
- if (!iter->end_sk)
- return NULL;
-
- if (iter->end_sk == batch_sks) {
- /* Batching is done for the current bucket; return the first
- * socket to be iterated from the batch.
- */
- iter->st_bucket_done = true;
- goto done;
- }
- if (!resized && !bpf_iter_udp_realloc_batch(iter, batch_sks * 3 / 2,
- GFP_USER)) {
- resized = true;
- /* After allocating a larger batch, retry one more time to grab
- * the whole bucket.
- */
- goto again;
- }
-done:
- return iter->batch[0];
+ WARN_ON_ONCE(iter->end_sk != batch_sks);
+ return iter->end_sk ? iter->batch[0] : NULL;
}
static void *bpf_iter_udp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
@@ -3841,7 +3867,10 @@ static int bpf_iter_udp_realloc_batch(struct bpf_udp_iter_state *iter,
if (!new_batch)
return -ENOMEM;
- bpf_iter_udp_put_batch(iter);
+ if (flags != GFP_NOWAIT)
+ bpf_iter_udp_put_batch(iter);
+
+ memcpy(new_batch, iter->batch, sizeof(*iter->batch) * iter->end_sk);
kvfree(iter->batch);
iter->batch = new_batch;
iter->max_sk = new_batch_sz;
--
2.43.0
Powered by blists - more mailing lists