[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <9f3697c1-ed15-4a3d-9113-c4437f421bb3@linux.dev>
Date: Wed, 20 Dec 2023 20:45:08 -0800
From: Martin KaFai Lau <martin.lau@...ux.dev>
To: Daniel Borkmann <daniel@...earbox.net>
Cc: 'Alexei Starovoitov ' <ast@...nel.org>,
'Andrii Nakryiko ' <andrii@...nel.org>, netdev@...r.kernel.org,
kernel-team@...a.com, Aditi Ghag <aditi.ghag@...valent.com>,
bpf@...r.kernel.org
Subject: Re: [PATCH bpf 1/2] bpf: Avoid iter->offset making backward progress
in bpf_iter_udp
On 12/20/23 11:10 AM, Martin KaFai Lau wrote:
> Good catch. It will unnecessarily skip sockets in the following batch/bucket if there are
> changes in the current batch/bucket.
>
> From looking at the loop again, I think it is better not to change the
> iter->offset during the for loop. Only update iter->offset after the for loop
> has concluded.
>
> The non-zero iter->offset is only useful for the first bucket, so do a test on
> the first bucket (state->bucket == bucket) before skipping sockets. Something
> like this:
>
> diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
> index 89e5a806b82e..a993f364d6ae 100644
> --- a/net/ipv4/udp.c
> +++ b/net/ipv4/udp.c
> @@ -3139,6 +3139,7 @@ static struct sock *bpf_iter_udp_batch(struct seq_file *seq)
> struct net *net = seq_file_net(seq);
> struct udp_table *udptable;
> unsigned int batch_sks = 0;
> + int bucket, bucket_offset;
> bool resized = false;
> struct sock *sk;
>
> @@ -3162,14 +3163,14 @@ static struct sock *bpf_iter_udp_batch(struct seq_file
> *seq)
> iter->end_sk = 0;
> iter->st_bucket_done = false;
> batch_sks = 0;
> + bucket = state->bucket;
> + bucket_offset = 0;
>
> for (; state->bucket <= udptable->mask; state->bucket++) {
> struct udp_hslot *hslot2 = &udptable->hash2[state->bucket];
>
> - if (hlist_empty(&hslot2->head)) {
> - iter->offset = 0;
> + if (hlist_empty(&hslot2->head))
> continue;
> - }
>
> spin_lock_bh(&hslot2->lock);
> udp_portaddr_for_each_entry(sk, &hslot2->head) {
> @@ -3177,8 +3178,9 @@ static struct sock *bpf_iter_udp_batch(struct seq_file *seq)
> /* Resume from the last iterated socket at the
> * offset in the bucket before iterator was stopped.
> */
> - if (iter->offset) {
> - --iter->offset;
> + if (state->bucket == bucket &&
> + bucket_offset < iter->offset) {
> + ++bucket_offset;
> continue;
> }
> if (iter->end_sk < iter->max_sk) {
> @@ -3192,10 +3194,10 @@ static struct sock *bpf_iter_udp_batch(struct seq_file
> *seq)
>
> if (iter->end_sk)
> break;
> + }
>
> - /* Reset the current bucket's offset before moving to the next bucket. */
> + if (state->bucket != bucket)
> iter->offset = 0;
> - }
>
> /* All done: no batch made. */
> if (!iter->end_sk)
I think I found another bug in the current bpf_iter_udp_batch(). The
"state->bucket--;" at the end of the batch() function is also wrong. It does not
need to go back to the previous bucket. After realloc with a larger batch array,
it should retry on the "state->bucket" as is. I tried forcing bind() to use
bucket 0 and binding a large so_reuseport set (24 sockets). The
WARN_ON(state->bucket < 0) triggered.
Going back to this bug (backward progress on --iter->offset), I think it is a
bit cleaner to always reset iter->offset to 0 and advance iter->offset to the
resume_offset only when needed. Something like this:
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 89e5a806b82e..184aa966a006 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -3137,16 +3137,18 @@ static struct sock *bpf_iter_udp_batch(struct seq_file *seq)
struct bpf_udp_iter_state *iter = seq->private;
struct udp_iter_state *state = &iter->state;
struct net *net = seq_file_net(seq);
+ int resume_bucket, resume_offset;
struct udp_table *udptable;
unsigned int batch_sks = 0;
bool resized = false;
struct sock *sk;
+ resume_bucket = state->bucket;
+ resume_offset = iter->offset;
+
/* The current batch is done, so advance the bucket. */
- if (iter->st_bucket_done) {
+ if (iter->st_bucket_done)
state->bucket++;
- iter->offset = 0;
- }
udptable = udp_get_table_seq(seq, net);
@@ -3166,10 +3168,9 @@ static struct sock *bpf_iter_udp_batch(struct seq_file *seq)
for (; state->bucket <= udptable->mask; state->bucket++) {
struct udp_hslot *hslot2 = &udptable->hash2[state->bucket];
- if (hlist_empty(&hslot2->head)) {
- iter->offset = 0;
+ iter->offset = 0;
+ if (hlist_empty(&hslot2->head))
continue;
- }
spin_lock_bh(&hslot2->lock);
udp_portaddr_for_each_entry(sk, &hslot2->head) {
@@ -3177,8 +3178,9 @@ static struct sock *bpf_iter_udp_batch(struct seq_file *seq)
/* Resume from the last iterated socket at the
* offset in the bucket before iterator was stopped.
*/
- if (iter->offset) {
- --iter->offset;
+ if (state->bucket == resume_bucket &&
+ iter->offset < resume_offset) {
+ ++iter->offset;
continue;
}
if (iter->end_sk < iter->max_sk) {
@@ -3192,9 +3194,6 @@ static struct sock *bpf_iter_udp_batch(struct seq_file *seq)
if (iter->end_sk)
break;
-
- /* Reset the current bucket's offset before moving to the next bucket. */
- iter->offset = 0;
}
/* All done: no batch made. */
@@ -3210,10 +3209,6 @@ static struct sock *bpf_iter_udp_batch(struct seq_file *seq)
}
if (!resized && !bpf_iter_udp_realloc_batch(iter, batch_sks * 3 / 2)) {
resized = true;
- /* After allocating a larger batch, retry one more time to grab
- * the whole bucket.
- */
- state->bucket--;
goto again;
}
done:
Powered by blists - more mailing lists