[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <CAMB2axMNzemRgxQfNLi2GrTYJdmgchSH+ND6+QaFQM2m9ygajQ@mail.gmail.com>
Date: Wed, 5 Nov 2025 15:03:09 -0800
From: Amery Hung <ameryhung@...il.com>
To: Andrii Nakryiko <andrii.nakryiko@...il.com>
Cc: bpf@...r.kernel.org, netdev@...r.kernel.org, alexei.starovoitov@...il.com,
andrii@...nel.org, daniel@...earbox.net, tj@...nel.org, martin.lau@...nel.org,
kernel-team@...a.com
Subject: Re: [PATCH bpf-next v5 3/7] bpf: Pin associated struct_ops when
registering async callback
On Tue, Nov 4, 2025 at 3:21 PM Andrii Nakryiko
<andrii.nakryiko@...il.com> wrote:
>
> On Tue, Nov 4, 2025 at 9:27 AM Amery Hung <ameryhung@...il.com> wrote:
> >
> > Take a refcount of the associated struct_ops map to prevent the map from
> > being freed when an async callback scheduled from a struct_ops program
> > runs.
> >
> > Since struct_ops programs do not take refcounts on the struct_ops map,
> > it is possible for a struct_ops map to be freed when an async callback
> > scheduled from it runs. To prevent this, take a refcount on prog->aux->
> > st_ops_assoc and save it in a newly created struct bpf_async_res for
> > every async mechanism. The reference needs to be preserved in
> > bpf_async_res since prog->aux->st_ops_assoc can be poisoned anytime
> > and reference leak could happen.
> >
> > bpf_async_res will contain an async callback's BPF program and resources
> > related to the BPF program. The resources will be acquired when
> > registering a callback and released when cancelled or when the map
> > associated with the callback is freed.
> >
> > Also rename drop_prog_refcnt to bpf_async_cb_reset to better reflect
> > what it now does.
> >
> > Signed-off-by: Amery Hung <ameryhung@...il.com>
> > ---
> > kernel/bpf/helpers.c | 105 +++++++++++++++++++++++++++++--------------
> > 1 file changed, 72 insertions(+), 33 deletions(-)
> >
> > diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
> > index 930e132f440f..5c081cd604d5 100644
> > --- a/kernel/bpf/helpers.c
> > +++ b/kernel/bpf/helpers.c
> > @@ -1092,9 +1092,14 @@ static void *map_key_from_value(struct bpf_map *map, void *value, u32 *arr_idx)
> > return (void *)value - round_up(map->key_size, 8);
> > }
> >
> > +struct bpf_async_res {
>
> "res" has a strong "result" meaning, which is a distraction here.
> Maybe "bpf_async_ctx"? And then we can use prep and put (reset?)
> terminology?
>
> > + struct bpf_prog *prog;
> > + struct bpf_map *st_ops_assoc;
> > +};
> > +
> > struct bpf_async_cb {
> > struct bpf_map *map;
> > - struct bpf_prog *prog;
> > + struct bpf_async_res res;
> > void __rcu *callback_fn;
> > void *value;
> > union {
> > @@ -1299,8 +1304,8 @@ static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u
> > break;
> > }
> > cb->map = map;
> > - cb->prog = NULL;
> > cb->flags = flags;
> > + memset(&cb->res, 0, sizeof(cb->res));
> > rcu_assign_pointer(cb->callback_fn, NULL);
> >
> > WRITE_ONCE(async->cb, cb);
> > @@ -1351,11 +1356,47 @@ static const struct bpf_func_proto bpf_timer_init_proto = {
> > .arg3_type = ARG_ANYTHING,
> > };
> >
> > +static void bpf_async_res_put(struct bpf_async_res *res)
> > +{
> > + bpf_prog_put(res->prog);
> > +
> > + if (res->st_ops_assoc)
> > + bpf_map_put(res->st_ops_assoc);
> > +}
> > +
> > +static int bpf_async_res_get(struct bpf_async_res *res, struct bpf_prog *prog)
> > +{
> > + struct bpf_map *st_ops_assoc = NULL;
> > + int err;
> > +
> > + prog = bpf_prog_inc_not_zero(prog);
> > + if (IS_ERR(prog))
> > + return PTR_ERR(prog);
> > +
> > + st_ops_assoc = READ_ONCE(prog->aux->st_ops_assoc);
>
> I think in about a month we'll forget why we inc_not_zero only for
> STRUCT_OPS programs, so I'd add comment here that non-struct_ops
> programs have explicit refcount on st_ops_assoc, so as long as we have
> that inc_not_zero(prog) above, we don't need to also bump map refcount
Will document this in the comment.
>
> > + if (prog->type == BPF_PROG_TYPE_STRUCT_OPS &&
> > + st_ops_assoc && st_ops_assoc != BPF_PTR_POISON) {
> > + st_ops_assoc = bpf_map_inc_not_zero(st_ops_assoc);
> > + if (IS_ERR(st_ops_assoc)) {
> > + err = PTR_ERR(st_ops_assoc);
> > + goto put_prog;
>
> nit: might be a bit premature to structure code with goto put_prog. As of now:
>
>
> bpf_prog_put(prog);
> return PTR_ERR(st_ops_assoc);
>
> is short and sweet and good enough?
Yes. We can change it to this style once there are more fields in bpf_async_ctx.
>
> > + }
> > + }
> > +
> > + res->prog = prog;
> > + res->st_ops_assoc = st_ops_assoc;
>
> question: do we want to assign BPF_PTR_POISON to res->st_ops_assoc or
> is it better to keep it as NULL in such a case? I'm not sure, just
> bringing up the possibility
As this doesn't make a difference in what
bpf_prog_get_assoc_struct_ops() returns, I'd keep it as NULL to
simplify things.
>
> > + return 0;
> > +put_prog:
> > + bpf_prog_put(prog);
> > + return err;
> > +}
> > +
> > static int __bpf_async_set_callback(struct bpf_async_kern *async, void *callback_fn,
> > struct bpf_prog_aux *aux, unsigned int flags,
> > enum bpf_async_type type)
> > {
> > struct bpf_prog *prev, *prog = aux->prog;
> > + struct bpf_async_res res;
> > struct bpf_async_cb *cb;
> > int ret = 0;
> >
> > @@ -1376,20 +1417,18 @@ static int __bpf_async_set_callback(struct bpf_async_kern *async, void *callback
> > ret = -EPERM;
> > goto out;
> > }
> > - prev = cb->prog;
> > + prev = cb->res.prog;
> > if (prev != prog) {
> > - /* Bump prog refcnt once. Every bpf_timer_set_callback()
> > + /* Get prog and related resources once. Every bpf_timer_set_callback()
> > * can pick different callback_fn-s within the same prog.
> > */
> > - prog = bpf_prog_inc_not_zero(prog);
> > - if (IS_ERR(prog)) {
> > - ret = PTR_ERR(prog);
> > + ret = bpf_async_res_get(&res, prog);
> > + if (ret)
> > goto out;
> > - }
> > if (prev)
> > - /* Drop prev prog refcnt when swapping with new prog */
> > - bpf_prog_put(prev);
> > - cb->prog = prog;
> > + /* Put prev prog and related resources when swapping with new prog */
> > + bpf_async_res_put(&cb->res);
> > + cb->res = res;
> > }
>
> we discussed this offline, but I'll summarize here:
>
> I think we need to abstract this away as bpf_async_ctx_update(), which
> will accept a new prog pointer, and internally will deal with
> necessary ref count inc/put as necessary, depending on whether prog
> changed or not. With st_ops_assoc, prog pointer may not change, but
> the underlying st_ops_assoc might (it can go from NULL to valid
> assoc). And the implementation will be careful to leave previous async
> ctx as it was if anything goes wrong (just like existing code
> behaves).
How about three APIs like below? First, we just bump refcounts
unconditionally with prepare(). Then, xchg the local bpf_async_ctx
with the one embedded in callbacks with update(), and drop refcounts in
cleanup().
This will have some overhead as there are unnecessary atomic ops, but
it can make update() much more straightforward.
static void bpf_async_ctx_cleanup(struct bpf_async_ctx *ctx)
{
	bpf_prog_put(ctx->prog);
	if (ctx->st_ops_assoc)
		bpf_map_put(ctx->st_ops_assoc);
	memset(ctx, 0, sizeof(*ctx));
}
static int bpf_async_ctx_prepare(struct bpf_async_ctx *ctx, struct bpf_prog *prog)
{
	struct bpf_map *st_ops_assoc = NULL;
	int err;

	prog = bpf_prog_inc_not_zero(prog);
	if (IS_ERR(prog))
		return PTR_ERR(prog);

	st_ops_assoc = READ_ONCE(prog->aux->st_ops_assoc);
	if (prog->type == BPF_PROG_TYPE_STRUCT_OPS &&
	    st_ops_assoc && st_ops_assoc != BPF_PTR_POISON) {
		st_ops_assoc = bpf_map_inc_not_zero(st_ops_assoc);
		if (IS_ERR(st_ops_assoc)) {
			bpf_prog_put(prog);
			return PTR_ERR(st_ops_assoc);
		}
	}

	ctx->prog = prog;
	ctx->st_ops_assoc = st_ops_assoc;
	return 0;
}
static void bpf_async_ctx_update(struct bpf_async_ctx *ctx, struct bpf_async_ctx *new)
{
	struct bpf_async_ctx old;

	old = *ctx;
	*ctx = *new;
	bpf_async_ctx_cleanup(&old);
}
>
> > rcu_assign_pointer(cb->callback_fn, callback_fn);
> > out:
> > @@ -1423,7 +1462,7 @@ BPF_CALL_3(bpf_timer_start, struct bpf_async_kern *, timer, u64, nsecs, u64, fla
> > return -EINVAL;
> > __bpf_spin_lock_irqsave(&timer->lock);
> > t = timer->timer;
> > - if (!t || !t->cb.prog) {
> > + if (!t || !t->cb.res.prog) {
> > ret = -EINVAL;
> > goto out;
> > }
> > @@ -1451,14 +1490,14 @@ static const struct bpf_func_proto bpf_timer_start_proto = {
> > .arg3_type = ARG_ANYTHING,
> > };
> >
> > -static void drop_prog_refcnt(struct bpf_async_cb *async)
> > +static void bpf_async_cb_reset(struct bpf_async_cb *cb)
> > {
> > - struct bpf_prog *prog = async->prog;
> > + struct bpf_prog *prog = cb->res.prog;
> >
> > if (prog) {
> > - bpf_prog_put(prog);
> > - async->prog = NULL;
> > - rcu_assign_pointer(async->callback_fn, NULL);
> > + bpf_async_res_put(&cb->res);
> > + memset(&cb->res, 0, sizeof(cb->res));
>
> shouldn't bpf_async_res_put() leave cb->res in zeroed out state? why
> extra memset(0)?
Will move memset(0) into bpf_async_res_put().
>
> > + rcu_assign_pointer(cb->callback_fn, NULL);
> > }
> > }
> >
>
> [...]
Powered by blists - more mailing lists