[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <2439336d-b6ae-4d08-a1e8-2372fc6df383@kernel.dk>
Date: Wed, 15 Jan 2025 10:18:29 -0700
From: Jens Axboe <axboe@...nel.dk>
To: Jann Horn <jannh@...gle.com>, Pavel Begunkov <asml.silence@...il.com>
Cc: io-uring@...r.kernel.org, linux-kernel@...r.kernel.org
Subject: Re: [PATCH] io_uring/rsrc: Simplify buffer cloning by locking both
rings
On 1/15/25 9:25 AM, Jann Horn wrote:
> The locking in the buffer cloning code is somewhat complex because it goes
> back and forth between locking the source ring and the destination ring.
>
> Make it easier to reason about by locking both rings at the same time.
> To avoid ABBA deadlocks, lock the rings in ascending kernel address order,
> just like in lock_two_nondirectories().
>
> Signed-off-by: Jann Horn <jannh@...gle.com>
> ---
> Just an idea for how I think io_clone_buffers() could be changed so it
> becomes slightly easier to reason about.
> I left the out_unlock jump label with its current name for now, though
> I guess that should probably be adjusted.
Looks pretty clean to me, and does make it easier to reason about. Only
thing that stuck out to me was:
> @@ -1067,7 +1060,18 @@ int io_register_clone_buffers(struct io_ring_ctx *ctx, void __user *arg)
> file = io_uring_register_get_file(buf.src_fd, registered_src);
> if (IS_ERR(file))
> return PTR_ERR(file);
> - ret = io_clone_buffers(ctx, file->private_data, &buf);
> + src_ctx = file->private_data;
> + if (src_ctx == ctx) {
> + ret = -ELOOP;
> + goto out_put;
> + }
which is a change, as previously it would've been legal to do something ala:
struct io_uring ring;
struct iovec vecs[2];
vecs[0] = real_buffer;
vecs[1] = sparse_buffer;
io_uring_register_buffers(&ring, vecs, 2);
io_uring_clone_buffers_offset(&ring, &ring, 1, 0, 1, IORING_REGISTER_DST_REPLACE);
and clone vecs[0] into slot 1. With the patch, that'll return -ELOOP instead.
Maybe something like the below incremental, to just make the unlock +
double lock depending on whether they are different or not? And also
cleaning up the label naming at the same time.
diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
index 4b030382ad03..a1c7c8db5545 100644
--- a/io_uring/rsrc.c
+++ b/io_uring/rsrc.c
@@ -938,6 +938,9 @@ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx
int i, ret, off, nr;
unsigned int nbufs;
+ lockdep_assert_held(&ctx->uring_lock);
+ lockdep_assert_held(&src_ctx->uring_lock);
+
/*
* Accounting state is shared between the two rings; that only works if
* both rings are accounted towards the same counters.
@@ -979,17 +982,17 @@ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx
ret = -ENXIO;
nbufs = src_ctx->buf_table.nr;
if (!nbufs)
- goto out_unlock;
+ goto out_free;
ret = -EINVAL;
if (!arg->nr)
arg->nr = nbufs;
else if (arg->nr > nbufs)
- goto out_unlock;
+ goto out_free;
ret = -EOVERFLOW;
if (check_add_overflow(arg->nr, arg->src_off, &off))
- goto out_unlock;
+ goto out_free;
if (off > nbufs)
- goto out_unlock;
+ goto out_free;
off = arg->dst_off;
i = arg->src_off;
@@ -1004,7 +1007,7 @@ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx
dst_node = io_rsrc_node_alloc(ctx, IORING_RSRC_BUFFER);
if (!dst_node) {
ret = -ENOMEM;
- goto out_unlock;
+ goto out_free;
}
refcount_inc(&src_node->buf->refs);
@@ -1027,11 +1030,11 @@ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx
* copied to a ring that does not have buffers yet (checked at function
* entry).
*/
- WARN_ON(ctx->buf_table.nr);
+ WARN_ON_ONCE(ctx->buf_table.nr);
ctx->buf_table = data;
return 0;
-out_unlock:
+out_free:
io_rsrc_data_free(ctx, &data);
return ret;
}
@@ -1064,18 +1067,18 @@ int io_register_clone_buffers(struct io_ring_ctx *ctx, void __user *arg)
file = io_uring_register_get_file(buf.src_fd, registered_src);
if (IS_ERR(file))
return PTR_ERR(file);
+
src_ctx = file->private_data;
- if (src_ctx == ctx) {
- ret = -ELOOP;
- goto out_put;
+ if (src_ctx != ctx) {
+ mutex_unlock(&ctx->uring_lock);
+ lock_two_rings(ctx, src_ctx);
}
- mutex_unlock(&ctx->uring_lock);
- lock_two_rings(ctx, src_ctx);
ret = io_clone_buffers(ctx, src_ctx, &buf);
- mutex_unlock(&src_ctx->uring_lock);
-out_put:
+ if (src_ctx != ctx)
+ mutex_unlock(&src_ctx->uring_lock);
+
if (!registered_src)
fput(file);
return ret;
--
Jens Axboe
Powered by blists - more mailing lists