Message-ID: <20251202164121.3612929-6-csander@purestorage.com>
Date: Tue,  2 Dec 2025 09:41:21 -0700
From: Caleb Sander Mateos <csander@...estorage.com>
To: Jens Axboe <axboe@...nel.dk>
Cc: io-uring@...r.kernel.org,
	linux-kernel@...r.kernel.org,
	Caleb Sander Mateos <csander@...estorage.com>,
	syzbot@...kaller.appspotmail.com
Subject: [PATCH v4 5/5] io_uring: avoid uring_lock for IORING_SETUP_SINGLE_ISSUER

io_ring_ctx's mutex uring_lock can be quite expensive in high-IOPS
workloads. Even when only one thread pinned to a single CPU is accessing
the io_ring_ctx, the atomic CASes required to lock and unlock the mutex
are very hot instructions. The mutex's primary purpose is to prevent
concurrent io_uring system calls on the same io_ring_ctx. However, there
is already a flag IORING_SETUP_SINGLE_ISSUER that promises only one
task will make io_uring_enter() and io_uring_register() system calls on
the io_ring_ctx once it's enabled.
So if the io_ring_ctx is setup with IORING_SETUP_SINGLE_ISSUER, skip the
uring_lock mutex_lock() and mutex_unlock() on the submitter_task. On
other tasks acquiring the ctx uring lock, use a task work item to
suspend the submitter_task for the critical section.
If the io_ring_ctx is IORING_SETUP_R_DISABLED (possible during
io_uring_setup(), io_uring_register(), or io_uring exit), submitter_task
may be set concurrently, so acquire the uring_lock before checking it.
If submitter_task isn't set yet, the uring_lock suffices to provide
mutual exclusion.
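
For illustration, a minimal userspace sketch (assuming liburing) of a ring
that opts into this mutex-free path; the IORING_SETUP_R_DISABLED portion
shows the window in which the kernel still falls back to uring_lock:

/*
 * Sketch only, assuming liburing. The ring is created with
 * IORING_SETUP_SINGLE_ISSUER, promising that only this task will call
 * io_uring_enter()/io_uring_register() on it.
 */
#include <liburing.h>
#include <stdio.h>

int main(void)
{
	struct io_uring ring;
	struct io_uring_sqe *sqe;
	struct io_uring_cqe *cqe;
	int ret;

	ret = io_uring_queue_init(64, &ring,
				  IORING_SETUP_SINGLE_ISSUER |
				  IORING_SETUP_R_DISABLED);
	if (ret < 0) {
		fprintf(stderr, "queue_init: %d\n", ret);
		return 1;
	}

	/*
	 * While the ring is still R_DISABLED, locking falls back to
	 * uring_lock (submitter_task may not be set yet). Enabling the
	 * rings from this task lets the single-issuer fast path apply.
	 */
	ret = io_uring_enable_rings(&ring);
	if (ret < 0)
		return 1;

	sqe = io_uring_get_sqe(&ring);
	io_uring_prep_nop(sqe);
	io_uring_submit(&ring);		/* mutex-free path on this task */
	io_uring_wait_cqe(&ring, &cqe);
	io_uring_cqe_seen(&ring, cqe);
	io_uring_queue_exit(&ring);
	return 0;
}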

Signed-off-by: Caleb Sander Mateos <csander@...estorage.com>
Tested-by: syzbot@...kaller.appspotmail.com
---
 io_uring/io_uring.c |  12 +++++
 io_uring/io_uring.h | 114 ++++++++++++++++++++++++++++++++++++++++++--
 2 files changed, 123 insertions(+), 3 deletions(-)

diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 8d934bba21fa..054667880bfb 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -363,10 +363,22 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
 	xa_destroy(&ctx->io_bl_xa);
 	kfree(ctx);
 	return NULL;
 }
 
+void io_ring_suspend_work(struct callback_head *cb_head)
+{
+	struct io_ring_suspend_work *suspend_work =
+		container_of(cb_head, struct io_ring_suspend_work, cb_head);
+	DECLARE_COMPLETION_ONSTACK(suspend_end);
+
+	*suspend_work->suspend_end = &suspend_end;
+	complete(&suspend_work->suspend_start);
+
+	wait_for_completion(&suspend_end);
+}
+
 static void io_clean_op(struct io_kiocb *req)
 {
 	if (unlikely(req->flags & REQ_F_BUFFER_SELECTED))
 		io_kbuf_drop_legacy(req);
 
diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h
index 57c3eef26a88..2b08d0ddab30 100644
--- a/io_uring/io_uring.h
+++ b/io_uring/io_uring.h
@@ -1,8 +1,9 @@
 #ifndef IOU_CORE_H
 #define IOU_CORE_H
 
+#include <linux/completion.h>
 #include <linux/errno.h>
 #include <linux/lockdep.h>
 #include <linux/resume_user_mode.h>
 #include <linux/kasan.h>
 #include <linux/poll.h>
@@ -195,19 +196,85 @@ void io_queue_next(struct io_kiocb *req);
 void io_task_refs_refill(struct io_uring_task *tctx);
 bool __io_alloc_req_refill(struct io_ring_ctx *ctx);
 
 void io_activate_pollwq(struct io_ring_ctx *ctx);
 
+/*
+ * The ctx uring lock protects most of the mutable struct io_ring_ctx state
+ * accessed in the struct io_kiocb issue path. In the I/O path, it is typically
+ * acquired in the io_uring_enter() syscall and in io_handle_tw_list(). For
+ * IORING_SETUP_SQPOLL, it's acquired by io_sq_thread() instead. io_kiocb's
+ * issued with IO_URING_F_UNLOCKED in issue_flags (e.g. by io_wq_submit_work())
+ * acquire and release the ctx uring lock whenever they must touch io_ring_ctx
+ * state. io_uring_register() also acquires the ctx uring lock because most
+ * opcodes mutate io_ring_ctx state accessed in the issue path.
+ *
+ * For !IORING_SETUP_SINGLE_ISSUER io_ring_ctx's, acquiring the ctx uring lock
+ * is done via mutex_(try)lock(&ctx->uring_lock).
+ *
+ * However, for IORING_SETUP_SINGLE_ISSUER, we can avoid the mutex_lock() +
+ * mutex_unlock() overhead on submitter_task because a single thread can't race
+ * with itself. In the uncommon case where the ctx uring lock is needed on
+ * another thread, it must suspend submitter_task by scheduling a task work item
+ * on it. io_ring_ctx_lock() returns once the task work item has started.
+ * io_ring_ctx_unlock() allows the task work item to complete.
+ * If io_ring_ctx_lock() is called while the ctx is IORING_SETUP_R_DISABLED
+ * (e.g. during ctx create or exit), io_ring_ctx_lock() must acquire uring_lock
+ * because submitter_task isn't set yet. submitter_task can be accessed once
+ * uring_lock is held. If submitter_task exists, we do the same thing as in the
+ * non-IORING_SETUP_R_DISABLED case (except with uring_lock also held). If
+ * submitter_task isn't set, all other io_ring_ctx_lock() callers will also
+ * acquire uring_lock, so it suffices for mutual exclusion.
+ */
+
+struct io_ring_suspend_work {
+	struct callback_head cb_head;
+	struct completion suspend_start;
+	struct completion **suspend_end;
+};
+
+void io_ring_suspend_work(struct callback_head *cb_head);
+
 struct io_ring_ctx_lock_state {
+	bool need_mutex;
+	struct completion *suspend_end;
 };
 
 /* Acquire the ctx uring lock with the given nesting level */
 static inline void io_ring_ctx_lock_nested(struct io_ring_ctx *ctx,
 					   unsigned int subclass,
 					   struct io_ring_ctx_lock_state *state)
 {
-	mutex_lock_nested(&ctx->uring_lock, subclass);
+	struct io_ring_suspend_work suspend_work;
+
+	if (!(ctx->flags & IORING_SETUP_SINGLE_ISSUER)) {
+		mutex_lock_nested(&ctx->uring_lock, subclass);
+		return;
+	}
+
+	state->suspend_end = NULL;
+	state->need_mutex =
+		!!(smp_load_acquire(&ctx->flags) & IORING_SETUP_R_DISABLED);
+	if (unlikely(state->need_mutex)) {
+		mutex_lock_nested(&ctx->uring_lock, subclass);
+		if (likely(!ctx->submitter_task))
+			return;
+	}
+
+	if (likely(current == ctx->submitter_task))
+		return;
+
+	/* Use task work to suspend submitter_task */
+	init_task_work(&suspend_work.cb_head, io_ring_suspend_work);
+	init_completion(&suspend_work.suspend_start);
+	suspend_work.suspend_end = &state->suspend_end;
+	/* If task_work_add() fails, task is exiting, so no need to suspend */
+	if (unlikely(task_work_add(ctx->submitter_task, &suspend_work.cb_head,
+				   TWA_SIGNAL)))
+		return;
+
+	wait_for_completion(&suspend_work.suspend_start);
 }
 
 /* Acquire the ctx uring lock */
 static inline void io_ring_ctx_lock(struct io_ring_ctx *ctx,
 				    struct io_ring_ctx_lock_state *state)
@@ -217,29 +284,70 @@ static inline void io_ring_ctx_lock(struct io_ring_ctx *ctx,
 
 /* Attempt to acquire the ctx uring lock without blocking */
 static inline bool io_ring_ctx_trylock(struct io_ring_ctx *ctx,
 				       struct io_ring_ctx_lock_state *state)
 {
-	return mutex_trylock(&ctx->uring_lock);
+	if (!(ctx->flags & IORING_SETUP_SINGLE_ISSUER))
+		return mutex_trylock(&ctx->uring_lock);
+
+	state->suspend_end = NULL;
+	state->need_mutex =
+		!!(smp_load_acquire(&ctx->flags) & IORING_SETUP_R_DISABLED);
+	if (unlikely(state->need_mutex)) {
+		if (!mutex_trylock(&ctx->uring_lock))
+			return false;
+		if (likely(!ctx->submitter_task))
+			return true;
+	}
+
+	if (unlikely(current != ctx->submitter_task))
+		goto unlock;
+
+	return true;
+
+unlock:
+	if (unlikely(state->need_mutex))
+		mutex_unlock(&ctx->uring_lock);
+	return false;
 }
 
 /* Release the ctx uring lock */
 static inline void io_ring_ctx_unlock(struct io_ring_ctx *ctx,
 				      struct io_ring_ctx_lock_state *state)
 {
-	mutex_unlock(&ctx->uring_lock);
+	if (!(ctx->flags & IORING_SETUP_SINGLE_ISSUER)) {
+		mutex_unlock(&ctx->uring_lock);
+		return;
+	}
+
+	if (unlikely(state->need_mutex))
+		mutex_unlock(&ctx->uring_lock);
+	if (unlikely(state->suspend_end))
+		complete(state->suspend_end);
 }
 
 /* Return (if CONFIG_LOCKDEP) whether the ctx uring lock is held */
 static inline bool io_ring_ctx_lock_held(const struct io_ring_ctx *ctx)
 {
+	/*
+	 * No straightforward way to check that submitter_task is suspended
+	 * without access to struct io_ring_ctx_lock_state
+	 */
+	if (ctx->flags & IORING_SETUP_SINGLE_ISSUER &&
+	    !(ctx->flags & IORING_SETUP_R_DISABLED))
+		return true;
+
 	return lockdep_is_held(&ctx->uring_lock);
 }
 
 /* Assert (if CONFIG_LOCKDEP) that the ctx uring lock is held */
 static inline void io_ring_ctx_assert_locked(const struct io_ring_ctx *ctx)
 {
+	if (ctx->flags & IORING_SETUP_SINGLE_ISSUER &&
+	    !(ctx->flags & IORING_SETUP_R_DISABLED))
+		return;
+
 	lockdep_assert_held(&ctx->uring_lock);
 }
 
 static inline void io_lockdep_assert_cq_locked(struct io_ring_ctx *ctx)
 {
-- 
2.45.2
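
For orientation only, not part of the patch: a rough userspace analogue of
the suspend handshake implemented by io_ring_suspend_work() and
io_ring_ctx_lock_nested() above. Pthreads and POSIX semaphores stand in for
task work and completions, and the issuer's polling check stands in for
TWA_SIGNAL delivery; build with -pthread.

/*
 * Sketch only: an "issuer" thread touches shared state without a lock;
 * a foreign thread gains exclusive access by parking the issuer via a
 * queued work item, mirroring suspend_start/suspend_end above.
 */
#include <pthread.h>
#include <semaphore.h>
#include <stdatomic.h>
#include <stdio.h>
#include <unistd.h>

struct suspend_work {
	sem_t start;			/* posted by the issuer once parked */
	sem_t end;			/* posted by the foreign thread when done */
};

static _Atomic(struct suspend_work *) pending;
static atomic_int stop;
static long counter;			/* "ctx state", issuer-only on the fast path */

static void *issuer(void *arg)
{
	(void)arg;
	while (!atomic_load(&stop)) {
		/* stands in for task work delivered via TWA_SIGNAL */
		struct suspend_work *w = atomic_exchange(&pending, NULL);

		if (w) {
			sem_post(&w->start);	/* "lock" granted to the other thread */
			sem_wait(&w->end);	/* parked until "unlock" */
		}
		counter++;			/* mutex-free fast path */
	}
	return NULL;
}

int main(void)
{
	struct suspend_work w;
	pthread_t t;

	sem_init(&w.start, 0, 0);
	sem_init(&w.end, 0, 0);
	pthread_create(&t, NULL, issuer, NULL);
	usleep(1000);

	atomic_store(&pending, &w);		/* analogous to io_ring_ctx_lock() */
	sem_wait(&w.start);			/* issuer is now suspended */
	printf("exclusive access, counter=%ld\n", counter);
	sem_post(&w.end);			/* analogous to io_ring_ctx_unlock() */

	atomic_store(&stop, 1);
	pthread_join(t, NULL);
	return 0;
}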

