[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <CAEf4BzY+1PAE94PfoE=3VQVEYHWAiJP5btkx+u+UBjaZt_k==A@mail.gmail.com>
Date: Tue, 4 Nov 2025 15:20:52 -0800
From: Andrii Nakryiko <andrii.nakryiko@...il.com>
To: Amery Hung <ameryhung@...il.com>
Cc: bpf@...r.kernel.org, netdev@...r.kernel.org, alexei.starovoitov@...il.com,
andrii@...nel.org, daniel@...earbox.net, tj@...nel.org, martin.lau@...nel.org,
kernel-team@...a.com
Subject: Re: [PATCH bpf-next v5 3/7] bpf: Pin associated struct_ops when
registering async callback
On Tue, Nov 4, 2025 at 9:27 AM Amery Hung <ameryhung@...il.com> wrote:
>
> Take a refcount of the associated struct_ops map to prevent the map from
> being freed when an async callback scheduled from a struct_ops program
> runs.
>
> Since struct_ops programs do not take refcounts on the struct_ops map,
> it is possible for a struct_ops map to be freed when an async callback
> scheduled from it runs. To prevent this, take a refcount on prog->aux->
> st_ops_assoc and save it in a newly created struct bpf_async_res for
> every async mechanism. The reference needs to be preserved in
> bpf_async_res since prog->aux->st_ops_assoc can be poisoned anytime
> and reference leak could happen.
>
> bpf_async_res will contain a async callback's BPF program and resources
> related to the BPF program. The resources will be acquired when
> registering a callback and released when cancelled or when the map
> associated with the callback is freed.
>
> Also rename drop_prog_refcnt to bpf_async_cb_reset to better reflect
> what it now does.
>
> Signed-off-by: Amery Hung <ameryhung@...il.com>
> ---
> kernel/bpf/helpers.c | 105 +++++++++++++++++++++++++++++--------------
> 1 file changed, 72 insertions(+), 33 deletions(-)
>
> diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
> index 930e132f440f..5c081cd604d5 100644
> --- a/kernel/bpf/helpers.c
> +++ b/kernel/bpf/helpers.c
> @@ -1092,9 +1092,14 @@ static void *map_key_from_value(struct bpf_map *map, void *value, u32 *arr_idx)
> return (void *)value - round_up(map->key_size, 8);
> }
>
> +struct bpf_async_res {
"res" has a strong "result" meaning, which is a distraction here.
Maybe "bpf_async_ctx"? And then we can use prep and put (reset?)
terminology?
> + struct bpf_prog *prog;
> + struct bpf_map *st_ops_assoc;
> +};
> +
> struct bpf_async_cb {
> struct bpf_map *map;
> - struct bpf_prog *prog;
> + struct bpf_async_res res;
> void __rcu *callback_fn;
> void *value;
> union {
> @@ -1299,8 +1304,8 @@ static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u
> break;
> }
> cb->map = map;
> - cb->prog = NULL;
> cb->flags = flags;
> + memset(&cb->res, 0, sizeof(cb->res));
> rcu_assign_pointer(cb->callback_fn, NULL);
>
> WRITE_ONCE(async->cb, cb);
> @@ -1351,11 +1356,47 @@ static const struct bpf_func_proto bpf_timer_init_proto = {
> .arg3_type = ARG_ANYTHING,
> };
>
> +static void bpf_async_res_put(struct bpf_async_res *res)
> +{
> + bpf_prog_put(res->prog);
> +
> + if (res->st_ops_assoc)
> + bpf_map_put(res->st_ops_assoc);
> +}
> +
> +static int bpf_async_res_get(struct bpf_async_res *res, struct bpf_prog *prog)
> +{
> + struct bpf_map *st_ops_assoc = NULL;
> + int err;
> +
> + prog = bpf_prog_inc_not_zero(prog);
> + if (IS_ERR(prog))
> + return PTR_ERR(prog);
> +
> + st_ops_assoc = READ_ONCE(prog->aux->st_ops_assoc);
I think in about a month we'll forget why we inc_not_zero only for
STRUCT_OPS programs, so I'd add comment here that non-struct_ops
programs have explicit refcount on st_ops_assoc, so as long as we have
that inc_not_zero(prog) above, we don't need to also bump map refcount
> + if (prog->type == BPF_PROG_TYPE_STRUCT_OPS &&
> + st_ops_assoc && st_ops_assoc != BPF_PTR_POISON) {
> + st_ops_assoc = bpf_map_inc_not_zero(st_ops_assoc);
> + if (IS_ERR(st_ops_assoc)) {
> + err = PTR_ERR(st_ops_assoc);
> + goto put_prog;
nit: might be a bit premature to structure code with goto put_prog. As of now:
bpf_prog_put(prog);
return PTR_ERR(st_ops_assoc);
is short and sweet and good enough?
> + }
> + }
> +
> + res->prog = prog;
> + res->st_ops_assoc = st_ops_assoc;
question: do we want to assign BPF_PTR_POISON to res->st_ops_assoc or
is it better to keep it as NULL in such a case? I'm not sure, just
bringing up the possibility
> + return 0;
> +put_prog:
> + bpf_prog_put(prog);
> + return err;
> +}
> +
> static int __bpf_async_set_callback(struct bpf_async_kern *async, void *callback_fn,
> struct bpf_prog_aux *aux, unsigned int flags,
> enum bpf_async_type type)
> {
> struct bpf_prog *prev, *prog = aux->prog;
> + struct bpf_async_res res;
> struct bpf_async_cb *cb;
> int ret = 0;
>
> @@ -1376,20 +1417,18 @@ static int __bpf_async_set_callback(struct bpf_async_kern *async, void *callback
> ret = -EPERM;
> goto out;
> }
> - prev = cb->prog;
> + prev = cb->res.prog;
> if (prev != prog) {
> - /* Bump prog refcnt once. Every bpf_timer_set_callback()
> + /* Get prog and related resources once. Every bpf_timer_set_callback()
> * can pick different callback_fn-s within the same prog.
> */
> - prog = bpf_prog_inc_not_zero(prog);
> - if (IS_ERR(prog)) {
> - ret = PTR_ERR(prog);
> + ret = bpf_async_res_get(&res, prog);
> + if (ret)
> goto out;
> - }
> if (prev)
> - /* Drop prev prog refcnt when swapping with new prog */
> - bpf_prog_put(prev);
> - cb->prog = prog;
> + /* Put prev prog and related resources when swapping with new prog */
> + bpf_async_res_put(&cb->res);
> + cb->res = res;
> }
we discussed this offline, but I'll summarize here:
I think we need to abstract this away as bpf_async_ctx_update(), which
will accept a new prog pointer, and internally will deal with
necessary ref count inc/put as necessary, depending on whether prog
changed or not. With st_ops_assoc, prog pointer may not change, but
the underlying st_ops_assoc might (it can go from NULL to valid
assoc). And the implementation will be careful to leave previous async
ctx as it was if anything goes wrong (just like existing code
behaves).
> rcu_assign_pointer(cb->callback_fn, callback_fn);
> out:
> @@ -1423,7 +1462,7 @@ BPF_CALL_3(bpf_timer_start, struct bpf_async_kern *, timer, u64, nsecs, u64, fla
> return -EINVAL;
> __bpf_spin_lock_irqsave(&timer->lock);
> t = timer->timer;
> - if (!t || !t->cb.prog) {
> + if (!t || !t->cb.res.prog) {
> ret = -EINVAL;
> goto out;
> }
> @@ -1451,14 +1490,14 @@ static const struct bpf_func_proto bpf_timer_start_proto = {
> .arg3_type = ARG_ANYTHING,
> };
>
> -static void drop_prog_refcnt(struct bpf_async_cb *async)
> +static void bpf_async_cb_reset(struct bpf_async_cb *cb)
> {
> - struct bpf_prog *prog = async->prog;
> + struct bpf_prog *prog = cb->res.prog;
>
> if (prog) {
> - bpf_prog_put(prog);
> - async->prog = NULL;
> - rcu_assign_pointer(async->callback_fn, NULL);
> + bpf_async_res_put(&cb->res);
> + memset(&cb->res, 0, sizeof(cb->res));
shouldn't bpf_async_res_put() leave cb->res in zeroed out state? why
extra memset(0)?
> + rcu_assign_pointer(cb->callback_fn, NULL);
> }
> }
>
[...]
Powered by blists - more mailing lists