Message-ID: <20251125233928.3962947-5-csander@purestorage.com>
Date: Tue, 25 Nov 2025 16:39:28 -0700
From: Caleb Sander Mateos <csander@...estorage.com>
To: Jens Axboe <axboe@...nel.dk>
Cc: io-uring@...r.kernel.org,
linux-kernel@...r.kernel.org,
Caleb Sander Mateos <csander@...estorage.com>
Subject: [PATCH v3 4/4] io_uring: avoid uring_lock for IORING_SETUP_SINGLE_ISSUER

io_ring_ctx's mutex uring_lock can be quite expensive in high-IOPS
workloads. Even when only one thread pinned to a single CPU is accessing
the io_ring_ctx, the atomic CASes required to lock and unlock the mutex
are very hot instructions. The mutex's primary purpose is to prevent
concurrent io_uring system calls on the same io_ring_ctx. However, there
is already a flag IORING_SETUP_SINGLE_ISSUER that promises only one
task will make io_uring_enter() and io_uring_register() system calls on
the io_ring_ctx once it's enabled.
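
For context, a minimal userspace sketch (liburing; the entries value and
helper name are illustrative, not part of this patch) of a ring that opts
into that promise:

#include <liburing.h>

/* Illustrative only: the creating application promises that a single task
 * will call io_uring_enter()/io_uring_register() on this ring. */
static int setup_single_issuer_ring(struct io_uring *ring)
{
	return io_uring_queue_init(64, ring, IORING_SETUP_SINGLE_ISSUER);
}
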
So if the io_ring_ctx is set up with IORING_SETUP_SINGLE_ISSUER, skip the
uring_lock mutex_lock() and mutex_unlock() on the submitter_task. When any
other task needs to acquire the ctx uring lock, it instead schedules a task
work item that suspends the submitter_task for the duration of the critical
section.
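
To illustrate the handshake (an informal sketch of the helpers added below;
the interleaving shown is just one possible schedule):

    contending task                       submitter_task
    ---------------                       --------------
    io_ring_ctx_lock():
      task_work_add(submitter_task, ...)
      wait_for_completion(&suspend_start)
                                          io_ring_suspend_work():
                                            state->suspend_end = &suspend_end
                                            complete(&suspend_start)
                                            wait_for_completion(&suspend_end)
      ...critical section...
    io_ring_ctx_unlock():
      complete(state->suspend_end)
                                          task work returns,
                                          submitter_task resumes
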
In io_uring_register(), continue to always acquire the uring_lock mutex.
io_uring_register() can be called on a disabled io_ring_ctx (indeed,
it's required to enable it), when submitter_task isn't set yet. After
submitter_task is set, io_uring_register() is only permitted on
submitter_task, so uring_lock suffices to exclude all other users.
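
For reference, a hedged userspace sketch (liburing; names and values are
illustrative) of that flow: the ring is created disabled, and the task that
later enables it via io_uring_register() becomes the single issuer:

#include <liburing.h>

/* Illustrative only: io_uring_enable_rings() issues
 * io_uring_register(IORING_REGISTER_ENABLE_RINGS); until then,
 * submitter_task is not set, yet io_uring_register() must still work. */
static int create_disabled_then_enable(struct io_uring *ring)
{
	int ret = io_uring_queue_init(64, ring,
				      IORING_SETUP_SINGLE_ISSUER |
				      IORING_SETUP_R_DISABLED);
	if (ret)
		return ret;

	/* called from the task that will submit on this ring */
	return io_uring_enable_rings(ring);
}
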
Signed-off-by: Caleb Sander Mateos <csander@...estorage.com>
---
io_uring/io_uring.c | 11 +++++
io_uring/io_uring.h | 101 ++++++++++++++++++++++++++++++++++++++++++--
2 files changed, 109 insertions(+), 3 deletions(-)
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index e05e56a840f9..64e4e57e2c11 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -363,10 +363,21 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
xa_destroy(&ctx->io_bl_xa);
kfree(ctx);
return NULL;
}
+void io_ring_suspend_work(struct callback_head *cb_head)
+{
+ struct io_ring_suspend_work *suspend_work =
+ container_of(cb_head, struct io_ring_suspend_work, cb_head);
+ DECLARE_COMPLETION_ONSTACK(suspend_end);
+
+ suspend_work->lock_state->suspend_end = &suspend_end;
+ complete(&suspend_work->suspend_start);
+ wait_for_completion(&suspend_end);
+}
+
static void io_clean_op(struct io_kiocb *req)
{
if (unlikely(req->flags & REQ_F_BUFFER_SELECTED))
io_kbuf_drop_legacy(req);
diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h
index 23dae0af530b..262971224cc6 100644
--- a/io_uring/io_uring.h
+++ b/io_uring/io_uring.h
@@ -1,8 +1,9 @@
#ifndef IOU_CORE_H
#define IOU_CORE_H
+#include <linux/completion.h>
#include <linux/errno.h>
#include <linux/lockdep.h>
#include <linux/resume_user_mode.h>
#include <linux/kasan.h>
#include <linux/poll.h>
@@ -195,36 +196,130 @@ void io_queue_next(struct io_kiocb *req);
void io_task_refs_refill(struct io_uring_task *tctx);
bool __io_alloc_req_refill(struct io_ring_ctx *ctx);
void io_activate_pollwq(struct io_ring_ctx *ctx);
+/*
+ * The ctx uring lock protects most of the mutable struct io_ring_ctx state
+ * accessed in the struct io_kiocb issue path. In the I/O path, it is typically
+ * acquired in the io_uring_enter() syscall and io_handle_tw_list(). For
+ * IORING_SETUP_SQPOLL, it's acquired by io_sq_thread() instead. io_kiocb's
+ * issued with IO_URING_F_UNLOCKED in issue_flags (e.g. by io_wq_submit_work())
+ * acquire and release the ctx uring lock whenever they must touch io_ring_ctx
+ * state. io_uring_register() also acquires the ctx uring lock because most
+ * opcodes mutate io_ring_ctx state accessed in the issue path.
+ *
+ * For !IORING_SETUP_SINGLE_ISSUER io_ring_ctx's, acquiring the ctx uring lock
+ * is always done via mutex_(try)lock(&ctx->uring_lock).
+ *
+ * However, for IORING_SETUP_SINGLE_ISSUER, we can avoid the mutex_lock() +
+ * mutex_unlock() overhead on submitter_task because a single thread can't race
+ * with itself. In the uncommon case where the ctx uring lock is needed on
+ * another thread, it must suspend submitter_task by scheduling a task work item
+ * on it. io_ring_ctx_lock() returns once the task work item has started.
+ * submitter_task is unblocked once io_ring_ctx_unlock() is called.
+ *
+ * io_uring_register() requires special treatment for IORING_SETUP_SINGLE_ISSUER
+ * since it's allowed on an IORING_SETUP_R_DISABLED io_ring_ctx, where
+ * submitter_task isn't set yet. Hence the io_ring_register_ctx_*() family
+ * of helpers. They unconditionally acquire the uring_lock mutex, which always
+ * works to exclude other ctx uring lock users:
+ * - For !IORING_SETUP_SINGLE_ISSUER, all users acquire the ctx uring lock via
+ * the uring_lock mutex
+ * - For IORING_SETUP_SINGLE_ISSUER and IORING_SETUP_R_DISABLED, only
+ * io_uring_register() is allowed before the io_ring_ctx is enabled.
+ * So again, all ctx uring lock users acquire the uring_lock mutex.
+ * - For IORING_SETUP_SINGLE_ISSUER and !IORING_SETUP_R_DISABLED,
+ * io_uring_register() is only permitted on submitter_task, which is always
+ * granted the ctx uring lock unless suspended.
+ * Acquiring the uring_lock mutex is unnecessary but still correct.
+ */
+
struct io_ring_ctx_lock_state {
+ struct completion *suspend_end;
};
+struct io_ring_suspend_work {
+ struct callback_head cb_head;
+ struct completion suspend_start;
+ struct io_ring_ctx_lock_state *lock_state;
+};
+
+void io_ring_suspend_work(struct callback_head *cb_head);
+
/* Acquire the ctx uring lock */
static inline void io_ring_ctx_lock(struct io_ring_ctx *ctx,
struct io_ring_ctx_lock_state *state)
{
- mutex_lock(&ctx->uring_lock);
+ struct io_ring_suspend_work suspend_work;
+ struct task_struct *submitter_task;
+
+ if (!(ctx->flags & IORING_SETUP_SINGLE_ISSUER)) {
+ mutex_lock(&ctx->uring_lock);
+ return;
+ }
+
+ submitter_task = ctx->submitter_task;
+ /*
+ * Not suitable for use while IORING_SETUP_R_DISABLED.
+ * Must use io_ring_register_ctx_lock() in that case.
+ */
+ WARN_ON_ONCE(!submitter_task);
+ if (likely(current == submitter_task))
+ return;
+
+ /* Use task work to suspend submitter_task */
+ init_task_work(&suspend_work.cb_head, io_ring_suspend_work);
+ init_completion(&suspend_work.suspend_start);
+ suspend_work.lock_state = state;
+ /* If task_work_add() fails, task is exiting, so no need to suspend */
+ if (unlikely(task_work_add(submitter_task, &suspend_work.cb_head,
+ TWA_SIGNAL))) {
+ state->suspend_end = NULL;
+ return;
+ }
+
+ wait_for_completion(&suspend_work.suspend_start);
}
/* Attempt to acquire the ctx uring lock without blocking */
static inline bool io_ring_ctx_trylock(struct io_ring_ctx *ctx)
{
- return mutex_trylock(&ctx->uring_lock);
+ if (!(ctx->flags & IORING_SETUP_SINGLE_ISSUER))
+ return mutex_trylock(&ctx->uring_lock);
+
+ /* Not suitable for use while IORING_SETUP_R_DISABLED */
+ WARN_ON_ONCE(!ctx->submitter_task);
+ return current == ctx->submitter_task;
}
/* Release the ctx uring lock */
static inline void io_ring_ctx_unlock(struct io_ring_ctx *ctx,
struct io_ring_ctx_lock_state *state)
{
- mutex_unlock(&ctx->uring_lock);
+ if (!(ctx->flags & IORING_SETUP_SINGLE_ISSUER)) {
+ mutex_unlock(&ctx->uring_lock);
+ return;
+ }
+
+ if (likely(current == ctx->submitter_task))
+ return;
+
+ if (likely(state->suspend_end))
+ complete(state->suspend_end);
}
/* Assert (if CONFIG_LOCKDEP) that the ctx uring lock is held */
static inline void io_ring_ctx_assert_locked(const struct io_ring_ctx *ctx)
{
+ /*
+ * No straightforward way to check that submitter_task is suspended
+ * without access to struct io_ring_ctx_lock_state
+ */
+ if (ctx->flags & IORING_SETUP_SINGLE_ISSUER)
+ return;
+
lockdep_assert_held(&ctx->uring_lock);
}
/* Acquire the ctx uring lock during the io_uring_register() syscall */
static inline void io_ring_register_ctx_lock(struct io_ring_ctx *ctx)
--
2.45.2