[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <8aa7b962-40a6-4bbc-8646-86dd7ce3380e@gmail.com>
Date: Thu, 12 Jun 2025 14:26:58 +0100
From: Pavel Begunkov <asml.silence@...il.com>
To: Alexei Starovoitov <alexei.starovoitov@...il.com>
Cc: io-uring@...r.kernel.org, Martin KaFai Lau <martin.lau@...ux.dev>,
bpf <bpf@...r.kernel.org>, LKML <linux-kernel@...r.kernel.org>
Subject: Re: [RFC v2 5/5] io_uring/bpf: add basic kfunc helpers
On 6/12/25 03:47, Alexei Starovoitov wrote:
> On Fri, Jun 6, 2025 at 6:58 AM Pavel Begunkov <asml.silence@...il.com> wrote:
...>> +__bpf_kfunc
>> +struct io_uring_cqe *bpf_io_uring_extract_next_cqe(struct io_ring_ctx *ctx)
>> +{
>> + struct io_rings *rings = ctx->rings;
>> + unsigned int mask = ctx->cq_entries - 1;
>> + unsigned head = rings->cq.head;
>> + struct io_uring_cqe *cqe;
>> +
>> + /* TODO CQE32 */
>> + if (head == rings->cq.tail)
>> + return NULL;
>> +
>> + cqe = &rings->cqes[head & mask];
>> + rings->cq.head++;
>> + return cqe;
>> +}
>> +
>> +__bpf_kfunc_end_defs();
>> +
>> +BTF_KFUNCS_START(io_uring_kfunc_set)
>> +BTF_ID_FLAGS(func, bpf_io_uring_submit_sqes, KF_SLEEPABLE);
>> +BTF_ID_FLAGS(func, bpf_io_uring_post_cqe, KF_SLEEPABLE);
>> +BTF_ID_FLAGS(func, bpf_io_uring_queue_sqe, KF_SLEEPABLE);
>> +BTF_ID_FLAGS(func, bpf_io_uring_get_cqe, 0);
>> +BTF_ID_FLAGS(func, bpf_io_uring_extract_next_cqe, KF_RET_NULL);
>> +BTF_KFUNCS_END(io_uring_kfunc_set)
>
> This is not safe in general.
> The verifier doesn't enforce argument safety here.
> As a minimum you need to add KF_TRUSTED_ARGS flag to all kfunc.
> And once you do that you'll see that the verifier
> doesn't recognize the cqe returned from bpf_io_uring_get_cqe*()
> as trusted.
Thanks, will add it. If I read it right, without the flag the
program can, for example, create a struct io_ring_ctx on the stack,
fill it with nonsense, and pass it to kfuncs. Is that right?
> Looking at your example:
> https://github.com/axboe/liburing/commit/706237127f03e15b4cc9c7c31c16d34dbff37cdc
> it doesn't care about contents of cqe and doesn't pass it further.
> So sort-of ok-ish right now,
> but if you need to pass cqe to another kfunc
> you would need to add an open coded iterator for cqe-s
> with appropriate KF_ITER* flags
> or maybe add acquire/release semantics for cqe.
> Like, get_cqe will be KF_ACQUIRE, and you'd need
> matching KF_RELEASE kfunc,
> so that 'cqe' is not lost.
> Then 'cqe' will be trusted and you can pass it as actual 'cqe'
> into another kfunc.
> Without KF_ACQUIRE the verifier sees that get_cqe*() kfuncs
> return 'struct io_uring_cqe *' and it's ok for tracing
> or passing into kfuncs like bpf_io_uring_queue_sqe()
> that don't care about a particular type,
> but not ok for full tracking of objects.
I don't need type safety for SQEs / CQEs; they're supposed to be simple
memory blobs containing userspace data only. The SQ / CQ are shared with
userspace, and the kfuncs can leak the content of a passed CQE / SQE to
userspace. But I'd like to find a way to reject programs stashing
kernel pointers / data into them.
BPF_PROG(name, struct io_ring_ctx *io_ring)
{
struct io_uring_cqe *cqe = ...;
cqe->user_data = io_ring;
cqe->res = io_ring->private_field;
}
And I mentioned in the message, I rather want to get rid of half of the
kfuncs, and give BPF direct access to the SQ/CQ instead. Schematically
it should look like this:
BPF_PROG(name, struct io_ring_ctx *ring)
{
struct io_uring_sqe *sqes = get_SQ(ring);
sqes[ring->sq_tail].opcode = OP_NOP;
bpf_kfunc_submit_sqes(ring, 1);
struct io_uring_cqe *cqes = get_CQ(ring);
print_cqe(&cqes[ring->cq_head]);
}
I hacked up RET_PTR_TO_MEM for kfuncs, the diff is below, but it'd be
great to get rid of the constness of the size argument. I need to
digest arenas first as conceptually they look very close.
> For next revision please post all selftest, examples,
> and bpf progs on the list,
> so people don't need to search github.
Did the link in the cover letter not work for you? I'm confused
since it's all in a branch in my tree, but you linked to the same
patches but in Jens' tree, and I have zero clue what they're
doing there or how you found them.
diff --git a/io_uring/bpf.c b/io_uring/bpf.c
index 9494e4289605..400a06a74b5d 100644
--- a/io_uring/bpf.c
+++ b/io_uring/bpf.c
@@ -2,6 +2,7 @@
#include <linux/bpf_verifier.h>
#include "io_uring.h"
+#include "memmap.h"
#include "bpf.h"
#include "register.h"
@@ -72,6 +73,14 @@ struct io_uring_cqe *bpf_io_uring_extract_next_cqe(struct io_ring_ctx *ctx)
return cqe;
}
+__bpf_kfunc
+void *bpf_io_uring_get_region(struct io_ring_ctx *ctx, u64 size__retsz)
+{
+ if (size__retsz > ((u64)ctx->ring_region.nr_pages << PAGE_SHIFT))
+ return NULL;
+ return io_region_get_ptr(&ctx->ring_region);
+}
+
__bpf_kfunc_end_defs();
BTF_KFUNCS_START(io_uring_kfunc_set)
@@ -80,6 +89,7 @@ BTF_ID_FLAGS(func, bpf_io_uring_post_cqe, KF_SLEEPABLE);
BTF_ID_FLAGS(func, bpf_io_uring_queue_sqe, KF_SLEEPABLE);
BTF_ID_FLAGS(func, bpf_io_uring_get_cqe, 0);
BTF_ID_FLAGS(func, bpf_io_uring_extract_next_cqe, KF_RET_NULL);
+BTF_ID_FLAGS(func, bpf_io_uring_get_region, KF_RET_NULL);
BTF_KFUNCS_END(io_uring_kfunc_set)
static const struct btf_kfunc_id_set bpf_io_uring_kfunc_set = {
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 54c6953a8b84..ac4803b5933c 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -343,6 +343,7 @@ struct bpf_kfunc_call_arg_meta {
int uid;
} map;
u64 mem_size;
+ bool mem_size_found;
};
struct btf *btf_vmlinux;
@@ -11862,6 +11863,11 @@ static bool is_kfunc_arg_ignore(const struct btf *btf, const struct btf_param *a
return btf_param_match_suffix(btf, arg, "__ign");
}
+static bool is_kfunc_arg_ret_size(const struct btf *btf, const struct btf_param *arg)
+{
+ return btf_param_match_suffix(btf, arg, "__retsz");
+}
+
static bool is_kfunc_arg_map(const struct btf *btf, const struct btf_param *arg)
{
return btf_param_match_suffix(btf, arg, "__map");
@@ -12912,7 +12918,21 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
return -EINVAL;
}
- if (is_kfunc_arg_constant(meta->btf, &args[i])) {
+ if (is_kfunc_arg_ret_size(btf, &args[i])) {
+ if (!tnum_is_const(reg->var_off)) {
+ verbose(env, "R%d must be a known constant\n", regno);
+ return -EINVAL;
+ }
+ if (meta->mem_size_found) {
+ verbose(env, "Only one return size argument is permitted\n");
+ return -EINVAL;
+ }
+ meta->mem_size = reg->var_off.value;
+ meta->mem_size_found = true;
+ ret = mark_chain_precision(env, regno);
+ if (ret)
+ return ret;
+ } else if (is_kfunc_arg_constant(meta->btf, &args[i])) {
if (meta->arg_constant.found) {
verbose(env, "verifier internal error: only one constant argument permitted\n");
return -EFAULT;
@@ -13816,6 +13836,12 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
} else if (btf_type_is_void(ptr_type)) {
/* kfunc returning 'void *' is equivalent to returning scalar */
mark_reg_unknown(env, regs, BPF_REG_0);
+
+ if (meta.mem_size_found) {
+ mark_reg_known_zero(env, regs, BPF_REG_0);
+ regs[BPF_REG_0].type = PTR_TO_MEM;
+ regs[BPF_REG_0].mem_size = meta.mem_size;
+ }
} else if (!__btf_type_is_struct(ptr_type)) {
if (!meta.r0_size) {
__u32 sz;
--
Pavel Begunkov
Powered by blists - more mailing lists