[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <74fa8337-b0cb-42fb-af8a-fdf6877e558d@linux.dev>
Date: Thu, 8 Jan 2026 12:29:00 -0800
From: Martin KaFai Lau <martin.lau@...ux.dev>
To: Amery Hung <ameryhung@...il.com>
Cc: bpf@...r.kernel.org, netdev@...r.kernel.org,
alexei.starovoitov@...il.com, andrii@...nel.org, daniel@...earbox.net,
memxor@...il.com, martin.lau@...nel.org, kpsingh@...nel.org,
yonghong.song@...ux.dev, song@...nel.org, haoluo@...gle.com,
kernel-team@...a.com
Subject: Re: [PATCH bpf-next v3 01/16] bpf: Convert bpf_selem_unlink_map to
failable
On 12/18/25 9:56 AM, Amery Hung wrote:
> To prepare for changing bpf_local_storage_map_bucket::lock to rqspinlock,
> convert bpf_selem_unlink_map() to failable. It still always succeeds and
> returns 0 for now.
>
> Since some operations updating local storage cannot fail in the middle,
> open-code bpf_selem_unlink_map() to take the b->lock before the
> operation. There are two such locations:
>
> - bpf_local_storage_alloc()
>
> The first selem will be unlinked from smap if cmpxchg owner_storage_ptr
> fails, which should not fail. Therefore, hold b->lock when linking
> until allocation complete. Helpers that assume b->lock is held by
> callers are introduced: bpf_selem_link_map_nolock() and
> bpf_selem_unlink_map_nolock().
>
> - bpf_local_storage_update()
>
> The three step update process: link_map(new_selem),
> link_storage(new_selem), and unlink_map(old_selem) should not fail in
> the middle.
>
> In bpf_selem_unlink(), bpf_selem_unlink_map() and
> bpf_selem_unlink_storage() should either all succeed or fail as a whole
> instead of failing in the middle. So, return if unlink_map() failed.
>
> In bpf_local_storage_destroy(), since it cannot deadlock with itself or
> bpf_local_storage_map_free() who the function might be racing with,
> retry if bpf_selem_unlink_map() fails due to rqspinlock returning
> errors.
>
> Signed-off-by: Amery Hung <ameryhung@...il.com>
> ---
> kernel/bpf/bpf_local_storage.c | 64 +++++++++++++++++++++++++++++-----
> 1 file changed, 55 insertions(+), 9 deletions(-)
>
> diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c
> index e2fe6c32822b..4e3f227fd634 100644
> --- a/kernel/bpf/bpf_local_storage.c
> +++ b/kernel/bpf/bpf_local_storage.c
> @@ -347,7 +347,7 @@ void bpf_selem_link_storage_nolock(struct bpf_local_storage *local_storage,
> hlist_add_head_rcu(&selem->snode, &local_storage->list);
> }
>
> -static void bpf_selem_unlink_map(struct bpf_local_storage_elem *selem)
> +static int bpf_selem_unlink_map(struct bpf_local_storage_elem *selem)
This will end up being used only by bpf_selem_unlink(). It may as well
be removed and open coded in bpf_selem_unlink(). I think it
may depend on how patch 10 goes and also on whether it makes sense to remove
bpf_selem_"link"_map and bpf_selem_unlink_map_nolock also, so treat it
as a nit note for now.
> {
> struct bpf_local_storage_map *smap;
> struct bpf_local_storage_map_bucket *b;
> @@ -355,7 +355,7 @@ static void bpf_selem_unlink_map(struct bpf_local_storage_elem *selem)
>
> if (unlikely(!selem_linked_to_map_lockless(selem)))
In the later patch where both local_storage's and map-bucket's locks
must be acquired, will this check still be needed if there is an earlier
check that ensures the selem is still linked to the local_storage? It
does not matter in terms of perf, but I think it will help code reading
in the future for the common code path (i.e. the code paths other than
bpf_local_storage_destroy and bpf_local_storage_map_free).
> /* selem has already be unlinked from smap */
> - return;
> + return 0;
>
> smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held());
> b = select_bucket(smap, selem);
> @@ -363,6 +363,14 @@ static void bpf_selem_unlink_map(struct bpf_local_storage_elem *selem)
> if (likely(selem_linked_to_map(selem)))
> hlist_del_init_rcu(&selem->map_node);
> raw_spin_unlock_irqrestore(&b->lock, flags);
> +
> + return 0;
> +}
> +
> +static void bpf_selem_unlink_map_nolock(struct bpf_local_storage_elem *selem)
> +{
> + if (likely(selem_linked_to_map(selem)))
Take this chance to remove the selem_linked_to_map() check.
hlist_del_init_rcu has the same check.
> + hlist_del_init_rcu(&selem->map_node);
> }
>
> void bpf_selem_link_map(struct bpf_local_storage_map *smap,
> @@ -376,13 +384,26 @@ void bpf_selem_link_map(struct bpf_local_storage_map *smap,
> raw_spin_unlock_irqrestore(&b->lock, flags);
> }
>
> +static void bpf_selem_link_map_nolock(struct bpf_local_storage_map *smap,
> + struct bpf_local_storage_elem *selem,
> + struct bpf_local_storage_map_bucket *b)
> +{
> + RCU_INIT_POINTER(SDATA(selem)->smap, smap);
Is it needed? bpf_selem_alloc should have initialized SDATA(selem)->smap.
> + hlist_add_head_rcu(&selem->map_node, &b->list);
> +}
> +
[ ... ]
> @@ -574,20 +603,37 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
> goto unlock;
> }
>
> + b = select_bucket(smap, selem);
> +
> + if (old_sdata) {
> + old_b = select_bucket(smap, SELEM(old_sdata));
> + old_b = old_b == b ? NULL : old_b;
> + }
> +
> + raw_spin_lock_irqsave(&b->lock, b_flags);
> +
> + if (old_b)
> + raw_spin_lock_irqsave(&old_b->lock, old_b_flags);
This will deadlock because of the lock ordering of b and old_b.
Replacing it with res_spin_lock in the later patch can detect it and
break it more gracefully. imo, we should not introduce a known deadlock
logic in the kernel code in the syscall code path and ask the current
user to retry the map_update_elem syscall.
What happened to the patch in the earlier revision that uses the
local_storage (or owner) for select_bucket?
[ will continue with the rest of the patches a bit later ]
> +
> alloc_selem = NULL;
> /* First, link the new selem to the map */
> - bpf_selem_link_map(smap, selem);
> + bpf_selem_link_map_nolock(smap, selem, b);
>
> /* Second, link (and publish) the new selem to local_storage */
> bpf_selem_link_storage_nolock(local_storage, selem);
>
> /* Third, remove old selem, SELEM(old_sdata) */
> if (old_sdata) {
> - bpf_selem_unlink_map(SELEM(old_sdata));
> + bpf_selem_unlink_map_nolock(SELEM(old_sdata));
> bpf_selem_unlink_storage_nolock(local_storage, SELEM(old_sdata),
> &old_selem_free_list);
> }
>
> + if (old_b)
> + raw_spin_unlock_irqrestore(&old_b->lock, old_b_flags);
> +
> + raw_spin_unlock_irqrestore(&b->lock, b_flags);
> +
> unlock:
> raw_spin_unlock_irqrestore(&local_storage->lock, flags);
> bpf_selem_free_list(&old_selem_free_list, false);
> @@ -679,7 +725,7 @@ void bpf_local_storage_destroy(struct bpf_local_storage *local_storage)
> /* Always unlink from map before unlinking from
> * local_storage.
> */
> - bpf_selem_unlink_map(selem);
> + WARN_ON(bpf_selem_unlink_map(selem));
> /* If local_storage list has only one element, the
> * bpf_selem_unlink_storage_nolock() will return true.
> * Otherwise, it will return false. The current loop iteration
Powered by blists - more mailing lists