Message-ID: <fa0a7936-145a-4f47-a858-66e46b829486@linux.dev>
Date: Tue, 22 Apr 2025 13:25:28 -0700
From: Martin KaFai Lau <martin.lau@...ux.dev>
To: Jordan Rife <jordan@...fe.io>
Cc: Aditi Ghag <aditi.ghag@...valent.com>,
Daniel Borkmann <daniel@...earbox.net>,
Willem de Bruijn <willemdebruijn.kernel@...il.com>,
Kuniyuki Iwashima <kuniyu@...zon.com>, netdev@...r.kernel.org,
bpf@...r.kernel.org
Subject: Re: [PATCH v4 bpf-next 2/6] bpf: udp: Make sure iter->batch always
contains a full bucket snapshot
On 4/22/25 11:02 AM, Jordan Rife wrote:
>> I found the "if (lock)" changes and their related changes make the code harder
>> to follow. This change is mostly there to handle one special case: avoiding
>> a release of the lock when "resizes" reaches the limit.
>>
>> Can this one case be specifically handled in the "for(bucket)" loop?
>>
>> With this special case, it can allocate exactly "batch_sks" entries with
>> GFP_ATOMIC. It does not need to put the sk or get the cookie; it can
>> continue directly from "iter->batch[iter->end_sk - 1].sock".
>>
>> Something like this on top of this set. I reset "resizes" on each new bucket,
>> removed the existing "done" label, and avoided getting the cookie in the last attempt.
>>
>> Untested code and likely still buggy. wdyt?
>
> Overall I like it, and at a glance, it seems correct. The code before
Thanks for checking!
> (with the if (lock) stuff) made it harder to verify that the
> lock was always released on every code path. This structure makes it
> clearer. I'll adopt this for the next version of the series and do
> a bit more testing to make sure everything's sound.
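Right. Keeping the special case inside the bucket loop means there are only two
unlock sites to audit. This is just a condensed view of the code below, nothing
new:

	spin_lock_bh(&hslot2->lock);
	...
	if (unlikely(resizes == MAX_REALLOC_ATTEMPTS) &&
	    iter->end_sk && iter->end_sk != batch_sks) {
		err = bpf_iter_udp_realloc_batch(iter, batch_sks, GFP_ATOMIC);
		if (err) {
			spin_unlock_bh(&hslot2->lock);	/* error path */
			return ERR_PTR(err);
		}
		...
		goto last_realloc_retry;	/* lock intentionally still held */
	}

	spin_unlock_bh(&hslot2->lock);		/* every other path */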
>
>>
>> #define MAX_REALLOC_ATTEMPTS 2
>>
>> static struct sock *bpf_iter_udp_batch(struct seq_file *seq)
>> {
>> 	struct bpf_udp_iter_state *iter = seq->private;
>> 	struct udp_iter_state *state = &iter->state;
>> 	unsigned int find_cookie, end_cookie = 0;
>> 	struct net *net = seq_file_net(seq);
>> 	struct udp_table *udptable;
>> 	unsigned int batch_sks = 0;
>> 	int resume_bucket;
>> 	struct sock *sk;
>> 	int resizes = 0;
>> 	int err = 0;
>>
>> 	resume_bucket = state->bucket;
>>
>> 	/* The current batch is done, so advance the bucket. */
>> 	if (iter->st_bucket_done)
>> 		state->bucket++;
>>
>> 	udptable = udp_get_table_seq(seq, net);
>>
>> again:
>> 	/* New batch for the next bucket.
>> 	 * Iterate over the hash table to find a bucket with sockets matching
>> 	 * the iterator attributes, and return the first matching socket from
>> 	 * the bucket. The remaining matched sockets from the bucket are batched
>> 	 * before releasing the bucket lock. This allows BPF programs that are
>> 	 * called in seq_show to acquire the bucket lock if needed.
>> 	 */
>> 	find_cookie = iter->cur_sk;
>> 	end_cookie = iter->end_sk;
>> 	iter->cur_sk = 0;
>> 	iter->end_sk = 0;
>> 	iter->st_bucket_done = false;
>> 	batch_sks = 0;
>>
>> 	for (; state->bucket <= udptable->mask; state->bucket++) {
>> 		struct udp_hslot *hslot2 = &udptable->hash2[state->bucket].hslot;
>>
>> 		if (hlist_empty(&hslot2->head)) {
>> 			resizes = 0;
>> 			continue;
>> 		}
>>
>> 		spin_lock_bh(&hslot2->lock);
>>
>> 		/* Initialize sk to the first socket in hslot2. */
>> 		sk = hlist_entry_safe(hslot2->head.first, struct sock,
>> 				      __sk_common.skc_portaddr_node);
>> 		/* Resume from the first (in iteration order) unseen socket from
>> 		 * the last batch that still exists in resume_bucket. Most of
>> 		 * the time this will just be where the last iteration left off
>> 		 * in resume_bucket unless that socket disappeared between
>> 		 * reads.
>> 		 *
>> 		 * Skip this if end_cookie isn't set; this is the first
>> 		 * batch, we're on bucket zero, and we want to start from the
>> 		 * beginning.
>> 		 */
>> 		if (state->bucket == resume_bucket && end_cookie)
>> 			sk = bpf_iter_udp_resume(sk,
>> 						 &iter->batch[find_cookie],
>> 						 end_cookie - find_cookie);
>> last_realloc_retry:
>> 		udp_portaddr_for_each_entry_from(sk) {
>> 			if (seq_sk_match(seq, sk)) {
>> 				if (iter->end_sk < iter->max_sk) {
>> 					sock_hold(sk);
>> 					iter->batch[iter->end_sk++].sock = sk;
>> 				}
>> 				batch_sks++;
>> 			}
>> 		}
>>
>> 		if (unlikely(resizes == MAX_REALLOC_ATTEMPTS) &&
>> 		    iter->end_sk && iter->end_sk != batch_sks) {
>
> While iter->end_sk == batch_sks should always be true here after goto
> last_realloc_retry, I wonder if it's worth adding a sanity check:
> WARN_*ing and bailing out if we hit this condition twice? Not sure if
> I'm being overly paranoid here.
hmm... I usually wouldn't go with a WARN if the case is impossible, but the code
is quite tricky here, so I think it is ok to have a WARN_ON_ONCE.
Maybe also increment "resizes" in this special case so that it stops hitting
this case again for the same bucket, and WARN outside of the for loop. Like:
>
>> 			/* last realloc attempt to batch the whole
>> 			 * bucket. Keep holding the lock to ensure the
>> 			 * bucket will not be changed.
>> 			 */
>> 			err = bpf_iter_udp_realloc_batch(iter, batch_sks, GFP_ATOMIC);
>> 			if (err) {
>> 				spin_unlock_bh(&hslot2->lock);
>> 				return ERR_PTR(err);
>> 			}
>> 			sk = iter->batch[iter->end_sk - 1].sock;
>> 			sk = hlist_entry_safe(sk->__sk_common.skc_portaddr_node.next,
>> 					      struct sock, __sk_common.skc_portaddr_node);
>> 			batch_sks = iter->end_sk;
			/* and then WARN outside of the for loop */
			resizes++;
>> 			goto last_realloc_retry;
>> 		}
>>
>> 		spin_unlock_bh(&hslot2->lock);
>>
>> 		if (iter->end_sk)
>> 			break;
>>
>> 		/* Got an empty bucket after taking the lock */
>> 		resizes = 0;
>> 	}
>>
>> 	/* All done: no batch made. */
>> 	if (!iter->end_sk)
>> 		return NULL;
>>
>> 	if (iter->end_sk == batch_sks) {
>> 		/* Batching is done for the current bucket; return the first
>> 		 * socket to be iterated from the batch.
>> 		 */
>> 		iter->st_bucket_done = true;
>> 		return iter->batch[0].sock;
>> 	}
	if (WARN_ON_ONCE(resizes > MAX_REALLOC_ATTEMPTS))
		return iter->batch[0].sock;
>>
>> 	err = bpf_iter_udp_realloc_batch(iter, batch_sks * 3 / 2, GFP_USER);
>> 	if (err)
>> 		return ERR_PTR(err);
>>
>> 	resizes++;
>> 	goto again;
>> }
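btw, bpf_iter_udp_resume() above is from your set and is unchanged here. For
anyone reading along, conceptually it scans the bucket for the first socket
among the not-yet-visited cookies that is still present, roughly like the
sketch below (not the exact code from your patch; sock_gen_cookie() is used
here just for illustration):

static struct sock *bpf_iter_udp_resume(struct sock *first_sk,
					union bpf_udp_iter_batch_item *cookies,
					int n_cookies)
{
	struct sock *sk;
	int i;

	/* Find the first unseen socket from the last batch that is still
	 * in the bucket.
	 */
	for (i = 0; i < n_cookies; i++) {
		sk = first_sk;
		udp_portaddr_for_each_entry_from(sk)
			if (sock_gen_cookie(sk) == cookies[i].cookie)
				return sk;
	}

	/* All of them are gone; fall back to the head of the bucket. */
	return first_sk;
}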
>> static int bpf_iter_udp_realloc_batch(struct bpf_udp_iter_state *iter,
>> 				      unsigned int new_batch_sz, int flags)
>> {
>> 	union bpf_udp_iter_batch_item *new_batch;
>>
>> 	new_batch = kvmalloc_array(new_batch_sz, sizeof(*new_batch),
>> 				   flags | __GFP_NOWARN);
>> 	if (!new_batch)
>> 		return -ENOMEM;
>>
>> 	if (flags != GFP_ATOMIC)
>> 		bpf_iter_udp_put_batch(iter);
>>
>> 	/* Make sure the new batch has the cookies of the sockets we haven't
>> 	 * visited yet.
>> 	 */
>> 	memcpy(new_batch, iter->batch, sizeof(*iter->batch) * iter->end_sk);
>> 	kvfree(iter->batch);
>> 	iter->batch = new_batch;
>> 	iter->max_sk = new_batch_sz;
>>
>> 	return 0;
>> }
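Note the "flags != GFP_ATOMIC" check: in the GFP_ATOMIC case the bucket lock is
still held and the batching continues directly from
"iter->batch[iter->end_sk - 1].sock", so the sock references must be kept. Only
the GFP_USER path releases the batch first, swapping each not-yet-visited sock
for its cookie before the refs are dropped, which is what the memcpy comment
above is about. Roughly, again a sketch of what your set does rather than the
exact code (sock_gen_cookie() for illustration):

static void bpf_iter_udp_put_batch(struct bpf_udp_iter_state *iter)
{
	union bpf_udp_iter_batch_item *item;
	unsigned int cur_sk = iter->cur_sk;
	__u64 cookie;

	/* Remember the cookies of the sockets we haven't visited yet, so we
	 * can pick up where we left off next time around.
	 */
	while (cur_sk < iter->end_sk) {
		item = &iter->batch[cur_sk++];
		cookie = sock_gen_cookie(item->sock);
		sock_put(item->sock);
		item->cookie = cookie;
	}
}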
>
> Jordan