[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20260131050920.2574084-11-ameryhung@gmail.com>
Date: Fri, 30 Jan 2026 21:09:13 -0800
From: Amery Hung <ameryhung@...il.com>
To: bpf@...r.kernel.org
Cc: netdev@...r.kernel.org,
alexei.starovoitov@...il.com,
andrii@...nel.org,
daniel@...earbox.net,
memxor@...il.com,
martin.lau@...nel.org,
kpsingh@...nel.org,
yonghong.song@...ux.dev,
song@...nel.org,
haoluo@...gle.com,
ameryhung@...il.com,
kernel-team@...a.com
Subject: [PATCH bpf-next v4 10/16] bpf: Support lockless unlink when freeing map or local storage
Introduce bpf_selem_unlink_nofail() to properly handle errors returned
from rqspinlock in bpf_local_storage_map_free() and
bpf_local_storage_destroy() where the operation must succeed.
The idea of bpf_selem_unlink_nofail() is to allow a selem to be
partially linked and use refcount to determine when and who can free the
selem if any unlink under lock fails. A selem initially is fully linked
to a map and a local storage and therefore selem->link_cnt is set to 2.
Under normal circumstances, bpf_selem_unlink_nofail() will be able to
grab locks and unlink a selem from map and local storage in sequence,
just like bpf_selem_unlink(), and then free it after an RCU grace period.
However, if any of the lock attempts fails, it will only clear
SDATA(selem)->smap or selem->local_storage depending on the caller and
decrement link_cnt to signal that the corresponding data structure
holding a reference to the selem is gone. Then, only when both map and
local storage are gone, an selem can be freed by the last caller that
turns link_cnt to 0.
To make sure bpf_obj_free_fields() is done only once and when map is
still present, it is called when unlinking an selem from b->list under
b->lock.
To make sure uncharging memory is done only when the owner is still
present in map_free(), block destroy() from returning until there is no
pending map_free().
Later bpf_local_storage_destroy() will return the remaining amount of
memory charge tracked by selems_size to the owner.
Finally, accesses to selem, SDATA(selem)->smap and selem->local_storage
are racy. Callers will protect these fields with RCU.
Co-developed-by: Martin KaFai Lau <martin.lau@...nel.org>
Signed-off-by: Martin KaFai Lau <martin.lau@...nel.org>
Signed-off-by: Amery Hung <ameryhung@...il.com>
---
include/linux/bpf_local_storage.h | 5 +-
kernel/bpf/bpf_local_storage.c | 109 ++++++++++++++++++++++++++++--
2 files changed, 109 insertions(+), 5 deletions(-)
diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h
index ece32f756d86..4e1ebfb3b9e8 100644
--- a/include/linux/bpf_local_storage.h
+++ b/include/linux/bpf_local_storage.h
@@ -80,8 +80,10 @@ struct bpf_local_storage_elem {
* after raw_spin_unlock
*/
};
+ /* Used by map_free() and destroy() when rqspinlock returns err */
+ atomic_t link_cnt;
bool use_kmalloc_nolock;
- /* 7 bytes hole */
+ /* 3 bytes hole */
/* The data is stored in another cacheline to minimize
* the number of cachelines access during a cache hit.
*/
@@ -98,6 +100,7 @@ struct bpf_local_storage {
struct rcu_head rcu;
rqspinlock_t lock; /* Protect adding/removing from the "list" */
u64 selems_size; /* Total selem size. Protected by "lock" */
+ refcount_t owner_refcnt;
bool use_kmalloc_nolock;
};
diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c
index b8f146d41ffe..54d106ebbfe5 100644
--- a/kernel/bpf/bpf_local_storage.c
+++ b/kernel/bpf/bpf_local_storage.c
@@ -93,6 +93,7 @@ bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner,
if (swap_uptrs)
bpf_obj_swap_uptrs(smap->map.record, SDATA(selem)->data, value);
}
+ atomic_set(&selem->link_cnt, 2);
return selem;
}
@@ -194,9 +195,11 @@ static void bpf_selem_free_rcu(struct rcu_head *rcu)
/* The bpf_local_storage_map_free will wait for rcu_barrier */
smap = rcu_dereference_check(SDATA(selem)->smap, 1);
- migrate_disable();
- bpf_obj_free_fields(smap->map.record, SDATA(selem)->data);
- migrate_enable();
+ if (smap) {
+ migrate_disable();
+ bpf_obj_free_fields(smap->map.record, SDATA(selem)->data);
+ migrate_enable();
+ }
kfree_nolock(selem);
}
@@ -221,7 +224,8 @@ void bpf_selem_free(struct bpf_local_storage_elem *selem,
* is only supported in task local storage, where
* smap->use_kmalloc_nolock == true.
*/
- bpf_obj_free_fields(smap->map.record, SDATA(selem)->data);
+ if (smap)
+ bpf_obj_free_fields(smap->map.record, SDATA(selem)->data);
__bpf_selem_free(selem, reuse_now);
return;
}
@@ -421,6 +425,96 @@ int bpf_selem_unlink(struct bpf_local_storage_elem *selem, bool reuse_now)
return err;
}
+/*
+ * Unlink an selem from map and local storage with lockless fallback if callers
+ * are racing or rqspinlock returns error. It should only be called by
+ * bpf_local_storage_destroy() or bpf_local_storage_map_free().
+ */
+static void bpf_selem_unlink_nofail(struct bpf_local_storage_elem *selem,
+ struct bpf_local_storage_map_bucket *b)
+{
+ struct bpf_local_storage *local_storage;
+ struct bpf_local_storage_map *smap;
+ bool in_map_free = !!b;
+ unsigned long flags;
+ int err, unlink = 0;
+
+ local_storage = rcu_dereference_check(selem->local_storage, bpf_rcu_lock_held());
+ smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held());
+
+ /*
+ * map_free() and destroy() each hold a link_cnt on an selem. Prevent being called
+ * twice from the same caller on the same selem.
+ */
+ if ((!smap && in_map_free) || (!local_storage && !in_map_free))
+ return;
+
+ if (smap) {
+ b = b ? : select_bucket(smap, local_storage);
+ err = raw_res_spin_lock_irqsave(&b->lock, flags);
+ if (!err) {
+ /*
+ * Call bpf_obj_free_fields() under b->lock to make sure it is done
+ * exactly once for an selem. Safe to free special fields immediately
+ * as no BPF program should be referencing the selem.
+ */
+ if (likely(selem_linked_to_map(selem))) {
+ hlist_del_init_rcu(&selem->map_node);
+ bpf_obj_free_fields(smap->map.record, SDATA(selem)->data);
+ unlink++;
+ }
+ raw_res_spin_unlock_irqrestore(&b->lock, flags);
+ }
+ /*
+ * Highly unlikely scenario: resource leak
+ *
+ * When map_free(selem1), destroy(selem1) and destroy(selem2) are racing
+ * and both selems belong to the same bucket, if destroy(selem2) acquired
+ * b->lock and blocks for too long, neither map_free(selem1) nor
+ * destroy(selem1) will be able to free the special fields associated
+ * with selem1 as raw_res_spin_lock_irqsave() returns -ETIMEDOUT.
+ */
+ WARN_ON_ONCE(err && in_map_free);
+ if (!err || in_map_free)
+ RCU_INIT_POINTER(SDATA(selem)->smap, NULL);
+ }
+
+ if (local_storage) {
+ err = raw_res_spin_lock_irqsave(&local_storage->lock, flags);
+ if (!err) {
+ /*
+ * In the common path, map_free() can call mem_uncharge() if
+ * destroy() is not about to return to owner, which can then go
+ * away immediately. Otherwise, the charge of the selem will stay
+ * accounted in local_storage->selems_size and uncharged during
+ * destroy().
+ */
+ if (likely(selem_linked_to_storage(selem))) {
+ hlist_del_init_rcu(&selem->snode);
+ if (smap && in_map_free &&
+ refcount_inc_not_zero(&local_storage->owner_refcnt)) {
+ mem_uncharge(smap, local_storage->owner, smap->elem_size);
+ local_storage->selems_size -= smap->elem_size;
+ refcount_dec(&local_storage->owner_refcnt);
+ }
+ unlink++;
+ }
+ raw_res_spin_unlock_irqrestore(&local_storage->lock, flags);
+ }
+ if (!err || !in_map_free)
+ RCU_INIT_POINTER(selem->local_storage, NULL);
+ }
+
+ /*
+ * Normally, an selem can be unlinked under local_storage->lock and b->lock, and
+ * then added to a local to_free list. However, if destroy() and map_free() are
+ * racing or rqspinlock returns errors in unlikely situations (unlink != 2), free
+ * the selem only after both map_free() and destroy() drop the refcnt.
+ */
+ if (unlink == 2 || atomic_dec_and_test(&selem->link_cnt))
+ bpf_selem_free(selem, false);
+}
+
void __bpf_local_storage_insert_cache(struct bpf_local_storage *local_storage,
struct bpf_local_storage_map *smap,
struct bpf_local_storage_elem *selem)
@@ -487,6 +581,7 @@ int bpf_local_storage_alloc(void *owner,
raw_res_spin_lock_init(&storage->lock);
storage->owner = owner;
storage->use_kmalloc_nolock = smap->use_kmalloc_nolock;
+ refcount_set(&storage->owner_refcnt, 1);
bpf_selem_link_storage_nolock(storage, first_selem);
@@ -754,6 +849,12 @@ void bpf_local_storage_destroy(struct bpf_local_storage *local_storage)
if (free_storage)
bpf_local_storage_free(local_storage, true);
+
+ if (!refcount_dec_and_test(&local_storage->owner_refcnt)) {
+ while (refcount_read(&local_storage->owner_refcnt))
+ cpu_relax();
+ smp_rmb(); /* pair with refcount_dec in bpf_selem_unlink_nofail */
+ }
}
u64 bpf_local_storage_map_mem_usage(const struct bpf_map *map)
--
2.47.3
Powered by blists - more mailing lists