Message-ID: <CAMB2axNT04=cc7-7XaDqhVKRbHm4zivW5kmwRpLbrFTVYwmm3g@mail.gmail.com>
Date: Mon, 12 Jan 2026 13:17:56 -0800
From: Amery Hung <ameryhung@...il.com>
To: Kumar Kartikeya Dwivedi <memxor@...il.com>
Cc: bpf@...r.kernel.org, netdev@...r.kernel.org, alexei.starovoitov@...il.com,
andrii@...nel.org, daniel@...earbox.net, martin.lau@...nel.org,
kpsingh@...nel.org, yonghong.song@...ux.dev, song@...nel.org,
haoluo@...gle.com, kernel-team@...a.com
Subject: Re: [PATCH bpf-next v3 10/16] bpf: Support lockless unlink when
freeing map or local storage
On Mon, Jan 12, 2026 at 7:49 AM Kumar Kartikeya Dwivedi
<memxor@...il.com> wrote:
>
> On Mon, 12 Jan 2026 at 16:36, Kumar Kartikeya Dwivedi <memxor@...il.com> wrote:
> >
> > On Thu, 18 Dec 2025 at 18:56, Amery Hung <ameryhung@...il.com> wrote:
> > >
> > > Introduce bpf_selem_unlink_lockless() to properly handle errors returned
> > > from rqspinlock in bpf_local_storage_map_free() and
> > > bpf_local_storage_destroy() where the operation must succeed.
> > >
> > > The idea of bpf_selem_unlink_lockless() is to allow an selem to be
> > > partially linked and use refcount to determine when and who can free the
> > > selem. An selem initially is fully linked to a map and a local storage
> > > and therefore selem->link_cnt is set to 2. Under normal circumstances,
> > > bpf_selem_unlink_lockless() will be able to grab locks and unlink
> > > an selem from map and local storage in sequence, just like
> > > bpf_selem_unlink(), and then add it to a local to_free list provided by
> > > the caller. However, if any of the lock attempts fails, it will
> > > only clear SDATA(selem)->smap or selem->local_storage depending on the
> > > caller and decrement link_cnt to signal that the corresponding data
> > > structure holding a reference to the selem is gone. Then, only when both
> > > map and local storage are gone, an selem can be freed by the last caller
> > > that brings link_cnt to 0.
> > >
> > > To make sure bpf_obj_free_fields() is done only once and when map is
> > > still present, it is called when unlinking an selem from b->list under
> > > b->lock.
> > >
> > > To make sure uncharging memory is only done once and when owner is still
> > > present, only unlink selem from local_storage->list in
> > > bpf_local_storage_destroy() and return the amount of memory to uncharge
> > > to the caller (i.e., owner) since the map associated with an selem may
> > > already be gone and map->ops->map_local_storage_uncharge can no longer
> > > be referenced.
> > >
> > > Finally, accesses to selem, SDATA(selem)->smap and selem->local_storage
> > > are racy. Callers will protect these fields with RCU.
> > >
> > > Co-developed-by: Martin KaFai Lau <martin.lau@...nel.org>
> > > Signed-off-by: Martin KaFai Lau <martin.lau@...nel.org>
> > > Signed-off-by: Amery Hung <ameryhung@...il.com>
> > > ---
> > > include/linux/bpf_local_storage.h | 2 +-
> > > kernel/bpf/bpf_local_storage.c | 77 +++++++++++++++++++++++++++++--
> > > 2 files changed, 74 insertions(+), 5 deletions(-)
> > >
> > > diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h
> > > index 20918c31b7e5..1fd908c44fb6 100644
> > > --- a/include/linux/bpf_local_storage.h
> > > +++ b/include/linux/bpf_local_storage.h
> > > @@ -80,9 +80,9 @@ struct bpf_local_storage_elem {
> > > * after raw_spin_unlock
> > > */
> > > };
> > > + atomic_t link_cnt;
> > > u16 size;
> > > bool use_kmalloc_nolock;
> > > - /* 4 bytes hole */
> > > /* The data is stored in another cacheline to minimize
> > > * the number of cachelines access during a cache hit.
> > > */
> > > diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c
> > > index 62201552dca6..4c682d5aef7f 100644
> > > --- a/kernel/bpf/bpf_local_storage.c
> > > +++ b/kernel/bpf/bpf_local_storage.c
> > > @@ -97,6 +97,7 @@ bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner,
> > > if (swap_uptrs)
> > > bpf_obj_swap_uptrs(smap->map.record, SDATA(selem)->data, value);
> > > }
> > > + atomic_set(&selem->link_cnt, 2);
> > > selem->size = smap->elem_size;
> > > selem->use_kmalloc_nolock = smap->use_kmalloc_nolock;
> > > return selem;
> > > @@ -200,9 +201,11 @@ static void bpf_selem_free_rcu(struct rcu_head *rcu)
> > > /* The bpf_local_storage_map_free will wait for rcu_barrier */
> > > smap = rcu_dereference_check(SDATA(selem)->smap, 1);
> > >
> > > - migrate_disable();
> > > - bpf_obj_free_fields(smap->map.record, SDATA(selem)->data);
> > > - migrate_enable();
> > > + if (smap) {
> > > + migrate_disable();
> > > + bpf_obj_free_fields(smap->map.record, SDATA(selem)->data);
> > > + migrate_enable();
> > > + }
> > > kfree_nolock(selem);
> > > }
> > >
> > > @@ -227,7 +230,8 @@ void bpf_selem_free(struct bpf_local_storage_elem *selem,
> > > * is only supported in task local storage, where
> > > * smap->use_kmalloc_nolock == true.
> > > */
> > > - bpf_obj_free_fields(smap->map.record, SDATA(selem)->data);
> > > + if (smap)
> > > + bpf_obj_free_fields(smap->map.record, SDATA(selem)->data);
> > > __bpf_selem_free(selem, reuse_now);
> > > return;
> > > }
> > > @@ -419,6 +423,71 @@ int bpf_selem_unlink(struct bpf_local_storage_elem *selem, bool reuse_now)
> > > return err;
> > > }
> > >
> > > +/* Callers of bpf_selem_unlink_lockless() */
> > > +#define BPF_LOCAL_STORAGE_MAP_FREE 0
> > > +#define BPF_LOCAL_STORAGE_DESTROY 1
> > > +
> > > +/*
> > > + * Unlink an selem from map and local storage with lockless fallback if callers
> > > + * are racing or rqspinlock returns error. It should only be called by
> > > + * bpf_local_storage_destroy() or bpf_local_storage_map_free().
> > > + */
> > > +static void bpf_selem_unlink_lockless(struct bpf_local_storage_elem *selem,
> > > + struct hlist_head *to_free, int caller)
> > > +{
> > > + struct bpf_local_storage *local_storage;
> > > + struct bpf_local_storage_map_bucket *b;
> > > + struct bpf_local_storage_map *smap;
> > > + unsigned long flags;
> > > + int err, unlink = 0;
> > > +
> > > + local_storage = rcu_dereference_check(selem->local_storage, bpf_rcu_lock_held());
> > > + smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held());
> > > +
> > > + /*
> > > + * Free special fields immediately as SDATA(selem)->smap will be cleared.
> > > + * No BPF program should be reading the selem.
> > > + */
> > > + if (smap) {
> > > + b = select_bucket(smap, selem);
> > > + err = raw_res_spin_lock_irqsave(&b->lock, flags);
> >
> > Please correct me if I'm wrong with any of this here. I think this is
> > the only potential problem I see, i.e. if we assume that this call can
> > fail for map_free.
> > map_free fails here, goes to the bottom with unlink == 0, and moves
> > refcnt from 2 to 1.
> > Before it restarts its loop, destroy() which was going in parallel and
> > caused the failure already succeeded in smap removal and local storage
> > removal, has unlink == 2, so proceeds with bpf_selem_free_list.
> > It frees the selem with RCU gp.
> > Meanwhile our loop races around cond_resched_rcu(), which restarts the
> > read section so the element is freed before we restart the while loop.
> > Would we do UAF?
> >
> > 1. map_free() fails b->lock, link_cnt 2->1, map_node still linked
> > 2. destroy() succeeds (unlink == 2), calls bpf_selem_free_list(),
> > does RCU free
>
> The ordering here is probably a bit messed up, but map_free would need
> to wrap around to the start of its loop on the other side right before
> destroy() does hlist_del_init_rcu(), and then let it free the node
> before proceeding.
> At that point I guess it would still wait for our newly started read
> section, but we'd probably still observe the refcount as 0 and end up
> underflowing.
> So we may not need any change to cond_resched_rcu() but just a
> dec_not_zero to make things safe.
>
Thanks for bringing up the corner case!
I am not sure how underflowing can happen with atomic_dec_and_test().
However, if map_free() visits the same selem twice and fails to
acquire b->lock both times, we might double-free the selem, since
map_free() would drop the refcount twice and also try to free it.
I think we need to be more precise about when we drop refcnt.
Something like below:
bool drop_refcnt = false;

...

drop_refcnt = (smap && caller == BPF_LOCAL_STORAGE_MAP_FREE) ||
              (local_storage && caller == BPF_LOCAL_STORAGE_DESTROY);

if (unlink == 2 || (drop_refcnt && atomic_dec_and_test(&selem->link_cnt)))
        hlist_add_head(&selem->free_node, to_free);
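
To make it concrete, here is an untested sketch of the whole function with
that change folded in. The code outside the drop_refcnt part is taken from
the patch above (comments shortened), so this only shows where the new check
would sit:

static void bpf_selem_unlink_lockless(struct bpf_local_storage_elem *selem,
                                      struct hlist_head *to_free, int caller)
{
        struct bpf_local_storage *local_storage;
        struct bpf_local_storage_map_bucket *b;
        struct bpf_local_storage_map *smap;
        bool drop_refcnt = false;
        unsigned long flags;
        int err, unlink = 0;

        local_storage = rcu_dereference_check(selem->local_storage, bpf_rcu_lock_held());
        smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held());

        /* Unchanged: unlink from b->list and free special fields if b->lock
         * can be taken; on failure in map_free(), only clear the smap pointer.
         */
        if (smap) {
                b = select_bucket(smap, selem);
                err = raw_res_spin_lock_irqsave(&b->lock, flags);
                if (!err) {
                        if (likely(selem_linked_to_map(selem))) {
                                hlist_del_init_rcu(&selem->map_node);
                                bpf_obj_free_fields(smap->map.record, SDATA(selem)->data);
                                RCU_INIT_POINTER(SDATA(selem)->smap, NULL);
                                unlink++;
                        }
                        raw_res_spin_unlock_irqrestore(&b->lock, flags);
                } else if (caller == BPF_LOCAL_STORAGE_MAP_FREE) {
                        RCU_INIT_POINTER(SDATA(selem)->smap, NULL);
                }
        }

        /* Unchanged: only destroy() unlinks from local_storage->list. */
        if (local_storage && caller == BPF_LOCAL_STORAGE_DESTROY) {
                err = raw_res_spin_lock_irqsave(&local_storage->lock, flags);
                if (!err) {
                        hlist_del_init_rcu(&selem->snode);
                        unlink++;
                        raw_res_spin_unlock_irqrestore(&local_storage->lock, flags);
                }
                RCU_INIT_POINTER(selem->local_storage, NULL);
        }

        /*
         * New: a caller may drop a reference only if the side it is
         * responsible for (smap for map_free(), local_storage for destroy())
         * was still set when it entered.  If map_free() revisits the same
         * selem after a failed b->lock attempt, it already cleared
         * SDATA(selem)->smap on the first pass, reads it back as NULL on the
         * second pass, and does not decrement link_cnt again, so the double
         * free described above cannot happen.
         */
        drop_refcnt = (smap && caller == BPF_LOCAL_STORAGE_MAP_FREE) ||
                      (local_storage && caller == BPF_LOCAL_STORAGE_DESTROY);

        if (unlink == 2 ||
            (drop_refcnt && atomic_dec_and_test(&selem->link_cnt)))
                hlist_add_head(&selem->free_node, to_free);
}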
> That said none of it feels worth it when compared to just warning on
> an error taking the bucket lock in map_free(), unless there are other
> concerns I missed.
I feel this is the same as the original "can we assume the lock
always succeeds and use WARN_ON" discussion. So, I think it is better
to be able to handle the error than to use WARN_ON. This also doesn't
add too much complexity IMO.
>
>
> > 3. map_free()'s cond_resched_rcu() releases rcu_read_lock()
> > 4. RCU grace period completes, selem is freed
> > 5. map_free() re-acquires rcu_read_lock(), hlist_first_rcu() returns
> > freed memory
> >
> > I think the fix is that we might want to unlink from map_node in
> > bpf_selem_free_list and do dec_not_zero instead, and avoid the
> > cond_resched_rcu().
> > If we race with destroy(), and end up doing another iteration on the
> > same element, we will keep our RCU gp so not access freed memory, and
> > avoid moving refcount < 0 due to dec_not_zero. By the time we restart
> > third time we will no longer see the element.
> >
> > But removing cond_resched_rcu() doesn't seem attractive (I don't think
> > there's a variant that does cond_resched() while holding the RCU read
> > lock).
> >
> > Things become much simpler if we assume map_free() cannot fail when
> > acquiring the bucket lock. It seems to me that we should make that
> > assumption, since destroy() in task context is the only racing
> > invocation.
> > If we are getting timeouts something is seriously wrong.
> > WARN_ON_ONCE(err && caller == BPF_LOCAL_STORAGE_MAP_FREE).
> > Then remove else if branch.
> > The converse (assuming it will always succeed for destroy()) is not
> > true, since BPF programs can cause deadlocks. But the problem is only
> > around map_node unlinking.
> >
> >
> > > + if (!err) {
> > > + if (likely(selem_linked_to_map(selem))) {
> > > + hlist_del_init_rcu(&selem->map_node);
> > > + bpf_obj_free_fields(smap->map.record, SDATA(selem)->data);
> > > + RCU_INIT_POINTER(SDATA(selem)->smap, NULL);
> > > + unlink++;
> > > + }
> > > + raw_res_spin_unlock_irqrestore(&b->lock, flags);
> > > + } else if (caller == BPF_LOCAL_STORAGE_MAP_FREE) {
> > > + RCU_INIT_POINTER(SDATA(selem)->smap, NULL);
> > > + }
> > > + }
> > > +
> > > + /*
> > > + * Only let destroy() unlink from local_storage->list and do mem_uncharge
> > > + * as owner is guaranteed to be valid in destroy().
> > > + */
> > > + if (local_storage && caller == BPF_LOCAL_STORAGE_DESTROY) {
> > > + err = raw_res_spin_lock_irqsave(&local_storage->lock, flags);
> > > + if (!err) {
> > > + hlist_del_init_rcu(&selem->snode);
> > > + unlink++;
> > > + raw_res_spin_unlock_irqrestore(&local_storage->lock, flags);
> > > + }
> > > + RCU_INIT_POINTER(selem->local_storage, NULL);
> > > + }
> > > +
> > > + /*
> > > + * Normally, an selem can be unlink under local_storage->lock and b->lock, and
> > > + * then added to a local to_free list. However, if destroy() and map_free() are
> > > + * racing or rqspinlock returns errors in unlikely situations (unlink != 2), free
> > > + * the selem only after both map_free() and destroy() drop the refcnt.
> > > + */
> > > + if (unlink == 2 || atomic_dec_and_test(&selem->link_cnt))
> > > + hlist_add_head(&selem->free_node, to_free);
> > > +}
> > > +
> > > void __bpf_local_storage_insert_cache(struct bpf_local_storage *local_storage,
> > > struct bpf_local_storage_map *smap,
> > > struct bpf_local_storage_elem *selem)
> > > --
> > > 2.47.3
> > >