Message-ID: <269f6e6a-362f-4ccf-926f-2e2e0c1ea014@kernel.dk>
Date: Tue, 27 Jan 2026 09:41:00 -0700
From: Jens Axboe <axboe@...nel.dk>
To: Christian Brauner <brauner@...nel.org>
Cc: io-uring@...r.kernel.org, jannh@...gle.com, kees@...nel.org,
linux-kernel@...r.kernel.org
Subject: Re: [PATCH 1/7] io_uring: add support for BPF filtering for opcode
restrictions
On 1/27/26 3:06 AM, Christian Brauner wrote:
>> +int __io_uring_run_bpf_filters(struct io_restriction *res, struct io_kiocb *req)
>> +{
>> +	struct io_bpf_filter *filter;
>> +	struct io_uring_bpf_ctx bpf_ctx;
>> +	int ret;
>> +
>> +	/* Fast check for existence of filters outside of RCU */
>> +	if (!rcu_access_pointer(res->bpf_filters->filters[req->opcode]))
>> +		return 0;
>> +
>> +	/*
>> +	 * req->opcode has already been validated to be within the range
>> +	 * of what we expect, io_init_req() does this.
>> +	 */
>> +	rcu_read_lock();
>> +	filter = rcu_dereference(res->bpf_filters->filters[req->opcode]);
>> +	if (!filter) {
>> +		ret = 1;
>> +		goto out;
>> +	} else if (filter == &dummy_filter) {
>> +		ret = 0;
>> +		goto out;
>> +	}
>> +
>> +	io_uring_populate_bpf_ctx(&bpf_ctx, req);
>> +
>> +	/*
>> +	 * Iterate registered filters. The opcode is allowed IFF all filters
>> +	 * return 1. If any filter returns denied, opcode will be denied.
>> +	 */
>> +	do {
>> +		if (filter == &dummy_filter)
>> +			ret = 0;
>> +		else
>> +			ret = bpf_prog_run(filter->prog, &bpf_ctx);
>> +		if (!ret)
>> +			break;
>> +		filter = filter->next;
>> +	} while (filter);
>> +out:
>> +	rcu_read_unlock();
>> +	return ret ? 0 : -EACCES;
>> +}
>
> Maybe we can write this a little nicer?:
>
> int __io_uring_run_bpf_filters(struct io_restriction *res, struct io_kiocb *req)
> {
> 	struct io_bpf_filter *filter;
> 	struct io_uring_bpf_ctx bpf_ctx;
>
> 	/* Fast check for existence of filters outside of RCU */
> 	if (!rcu_access_pointer(res->bpf_filters->filters[req->opcode]))
> 		return 0;
>
> 	/*
> 	 * req->opcode has already been validated to be within the range
> 	 * of what we expect, io_init_req() does this.
> 	 */
> 	guard(rcu)();
> 	filter = rcu_dereference(res->bpf_filters->filters[req->opcode]);
> 	if (!filter)
> 		return 0;
>
> 	if (filter == &dummy_filter)
> 		return -EACCES;
>
> 	io_uring_populate_bpf_ctx(&bpf_ctx, req);
>
> 	/*
> 	 * Iterate registered filters. The opcode is allowed IFF all filters
> 	 * return 1. If any filter returns denied, opcode will be denied.
> 	 */
> 	for (; filter ; filter = filter->next) {
> 		int ret;
>
> 		if (filter == &dummy_filter)
> 			return -EACCES;
>
> 		ret = bpf_prog_run(filter->prog, &bpf_ctx);
> 		if (!ret)
> 			return -EACCES;
> 	}
>
> 	return 0;
> }
Did a variant based on this, and I agree it looks nicer with guard for
this one.
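
Roughly what that variant ends up looking like - untested sketch typed
here rather than the verbatim committed version, with the dummy filter
check folded into the loop:

int __io_uring_run_bpf_filters(struct io_restriction *res, struct io_kiocb *req)
{
	struct io_bpf_filter *filter;
	struct io_uring_bpf_ctx bpf_ctx;

	/* Fast check for existence of filters outside of RCU, as before */
	if (!rcu_access_pointer(res->bpf_filters->filters[req->opcode]))
		return 0;

	guard(rcu)();
	filter = rcu_dereference(res->bpf_filters->filters[req->opcode]);
	if (!filter)
		return 0;

	io_uring_populate_bpf_ctx(&bpf_ctx, req);

	/*
	 * Allowed IFF every filter returns 1. A dummy_filter entry or any
	 * filter returning 0 denies the opcode.
	 */
	for (; filter; filter = filter->next) {
		if (filter == &dummy_filter ||
		    !bpf_prog_run(filter->prog, &bpf_ctx))
			return -EACCES;
	}
	return 0;
}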
>> +static void io_free_bpf_filters(struct rcu_head *head)
>> +{
>> +	struct io_bpf_filter __rcu **filter;
>> +	struct io_bpf_filters *filters;
>> +	int i;
>> +
>> +	filters = container_of(head, struct io_bpf_filters, rcu_head);
>> +	spin_lock(&filters->lock);
>> +	filter = filters->filters;
>> +	if (!filter) {
>> +		spin_unlock(&filters->lock);
>> +		return;
>> +	}
>> +	spin_unlock(&filters->lock);
>
> This is minor but I prefer:
>
> 	scoped_guard(spinlock)(&filters->lock) {
> 		filters = container_of(head, struct io_bpf_filters, rcu_head);
> 		filter = filters->filters;
> 		if (!filter)
> 			return;
> 	}
The reason I tend to never do that is that I always have to grep for the
syntax... And case in point, you need to do:
scoped_guard(spinlock, &filters->lock) {
so I guess I'm not the only one :-). But it does read better that way, so
I made this change too.
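
For reference, that hunk then ends up looking something like the below
(sketch, with the rest of the teardown elided):

static void io_free_bpf_filters(struct rcu_head *head)
{
	struct io_bpf_filter __rcu **filter;
	struct io_bpf_filters *filters;

	filters = container_of(head, struct io_bpf_filters, rcu_head);
	scoped_guard(spinlock, &filters->lock) {
		filter = filters->filters;
		if (!filter)
			return;
	}

	/* ... free the per-opcode filters and the array, as before ... */
}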
>> +static struct io_bpf_filters *io_new_bpf_filters(void)
>> +{
>> +	struct io_bpf_filters *filters;
>> +
>> +	filters = kzalloc(sizeof(*filters), GFP_KERNEL_ACCOUNT);
>> +	if (!filters)
>> +		return ERR_PTR(-ENOMEM);
>> +
>> +	filters->filters = kcalloc(IORING_OP_LAST,
>> +				   sizeof(struct io_bpf_filter *),
>> +				   GFP_KERNEL_ACCOUNT);
>> +	if (!filters->filters) {
>> +		kfree(filters);
>> +		return ERR_PTR(-ENOMEM);
>> +	}
>> +
>> +	refcount_set(&filters->refs, 1);
>> +	spin_lock_init(&filters->lock);
>> +	return filters;
>> +}
>
> static struct io_bpf_filters *io_new_bpf_filters(void)
> {
> 	struct io_bpf_filters *filters __free(kfree) = NULL;
>
> 	filters = kzalloc(sizeof(*filters), GFP_KERNEL_ACCOUNT);
> 	if (!filters)
> 		return ERR_PTR(-ENOMEM);
>
> 	filters->filters = kcalloc(IORING_OP_LAST,
> 				   sizeof(struct io_bpf_filter *),
> 				   GFP_KERNEL_ACCOUNT);
> 	if (!filters->filters)
> 		return ERR_PTR(-ENOMEM);
>
> 	refcount_set(&filters->refs, 1);
> 	spin_lock_init(&filters->lock);
> 	return no_free_ptr(filters);
> }
Adopted as well, thanks.
>> +/*
>> + * Validate classic BPF filter instructions. Only allow a safe subset of
>> + * operations - no packet data access, just context field loads and basic
>> + * ALU/jump operations.
>> + */
>> +static int io_uring_check_cbpf_filter(struct sock_filter *filter,
>> +				      unsigned int flen)
>> +{
>> +	int pc;
>
> Seems fine to me but I can't meaningfully review this.
Yeah, seriously... It's just the seccomp filter code modified for this
use case, so it should presumably be vetted already.
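
To give an idea of the shape without pasting the whole thing, it's the
same pattern as seccomp_check_filter(): walk every instruction and
whitelist it. Abbreviated sketch, the real check whitelists the full set
of ALU/JMP ops:

static int io_uring_check_cbpf_filter(struct sock_filter *filter,
				      unsigned int flen)
{
	int pc;

	for (pc = 0; pc < flen; pc++) {
		struct sock_filter *ftest = &filter[pc];

		switch (ftest->code) {
		case BPF_LD | BPF_W | BPF_ABS:
			/* only aligned loads inside the bpf ctx */
			if (ftest->k & 3 ||
			    ftest->k >= sizeof(struct io_uring_bpf_ctx))
				return -EINVAL;
			break;
		case BPF_RET | BPF_K:
		case BPF_RET | BPF_A:
		case BPF_ALU | BPF_ADD | BPF_K:
		case BPF_ALU | BPF_AND | BPF_K:
		case BPF_JMP | BPF_JA:
		case BPF_JMP | BPF_JEQ | BPF_K:
		case BPF_JMP | BPF_JGE | BPF_K:
			/* plain ALU/jump/return ops, more cases elided */
			break;
		default:
			/* no packet data, no scratch memory, nothing else */
			return -EINVAL;
		}
	}
	return 0;
}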
>> +int io_register_bpf_filter(struct io_restriction *res,
>> +			   struct io_uring_bpf __user *arg)
>> +{
>> +	struct io_bpf_filter *filter, *old_filter;
>> +	struct io_bpf_filters *filters;
>> +	struct io_uring_bpf reg;
>> +	struct bpf_prog *prog;
>> +	struct sock_fprog fprog;
>> +	int ret;
>> +
>> +	if (copy_from_user(&reg, arg, sizeof(reg)))
>> +		return -EFAULT;
>> +	if (reg.cmd_type != IO_URING_BPF_CMD_FILTER)
>> +		return -EINVAL;
>> +	if (reg.cmd_flags || reg.resv)
>> +		return -EINVAL;
>> +
>> +	if (reg.filter.opcode >= IORING_OP_LAST)
>> +		return -EINVAL;
>
> So you only support per-op-code filtering with cbpf. I assume that you
> would argue that people can use the existing io_uring restrictions. But
> that's not inherited, right? So then this forces users to have a bpf
> program for all opcodes that io_uring on their system supports.
The existing restrictions ARE inherited now, and can be set on a
per-task basis as well. That's in the last patch. And since the classic
"deny this opcode" filtering is way cheaper than running a BPF prog, I
do think that's the right approach.
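
For reference, on the userspace side that's just the existing restriction
registration. Rough sketch with liburing (the per-task variant from the
last patch uses the same restriction structs, just registered with
fd == -1):

#include <liburing.h>

/* ring must have been set up with IORING_SETUP_R_DISABLED */
static int allow_only_rw(struct io_uring *ring)
{
	struct io_uring_restriction res[] = {
		{ .opcode = IORING_RESTRICTION_SQE_OP, .sqe_op = IORING_OP_READ },
		{ .opcode = IORING_RESTRICTION_SQE_OP, .sqe_op = IORING_OP_WRITE },
	};
	int ret;

	ret = io_uring_register_restrictions(ring, res, 2);
	if (ret)
		return ret;
	return io_uring_enable_rings(ring);
}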
> I think that this is a bit unfortunate and wasteful for both userspace
> and io_uring. Can't we do a combined thing where we also allow filters
> to attach to all op-codes. Then userspace could start with an allow-list
> or deny-list filter and then attach further per-op-code bpf programs to
> the op-codes they want to manage specifically. Then you also get
> inheritance of the restrictions per-task.
>
> That would be nicer imho.
I consider this one moot given the above: IORING_REGISTER_RESTRICTIONS
with fd == -1 sets per-task restrictions using the classic non-bpf
filtering, and those are inherited as well. You only need the cBPF
filters if you want to deny an opcode based on aux data for that opcode.
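
As a trivial example of that kind of filter (purely illustrative, the ctx
field/offset below is made up - see the uapi header in this patch for the
real layout):

	struct sock_filter f[] = {
		/* load a 32-bit aux word from the bpf ctx (offset is made up) */
		BPF_STMT(BPF_LD | BPF_W | BPF_ABS, 8),
		/* zero -> allow (return 1), nonzero -> deny (return 0) */
		BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, 0, 0, 1),
		BPF_STMT(BPF_RET | BPF_K, 1),
		BPF_STMT(BPF_RET | BPF_K, 0),
	};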
--
Jens Axboe