Message-ID: <20250221041927.8470-3-minhquangbui99@gmail.com>
Date: Fri, 21 Feb 2025 11:19:26 +0700
From: Bui Quang Minh <minhquangbui99@...il.com>
To: io-uring@...r.kernel.org
Cc: Jens Axboe <axboe@...nel.dk>,
	Pavel Begunkov <asml.silence@...il.com>,
	linux-kernel@...r.kernel.org,
	Bui Quang Minh <minhquangbui99@...il.com>
Subject: [RFC PATCH 2/2] io_uring/io-wq: try to batch multiple free works

Currently, when IORING_SETUP_DEFER_TASKRUN is not used, the io worker
has to add a task work every time it frees a work item, which creates
contention on tctx->task_list. With this commit, the io worker queues
freed requests on a local list and adds them as task work in a single
batched call once the number of entries on the local list reaches
IO_REQ_ALLOC_BATCH, or earlier when the next freed request belongs to
a different io_ring_ctx.

Signed-off-by: Bui Quang Minh <minhquangbui99@...il.com>
---
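Not part of the patch, just an illustration for reviewers: below is a
simplified userspace approximation of the batching logic this series
adds to io_worker_handle_work(). It appends to the tail of a plain
singly-linked list instead of using the kernel's llist helpers, and
every identifier in it (struct fake_req, fake_task_work_add(),
BATCH_SIZE) is made up for the sketch only.

/*
 * Standalone userspace sketch (not kernel code): collect "freed"
 * requests on a local list and hand them over in one batched call,
 * flushing early when the batch is full or when the owning context
 * changes, mirroring the two flush conditions in the patch.
 */
#include <stdio.h>

#define BATCH_SIZE 8	/* plays the role of IO_REQ_ALLOC_BATCH */

struct fake_req {
	int ctx_id;			/* stands in for req->ctx */
	struct fake_req *next;		/* stands in for io_task_work.node */
};

/* Stands in for io_req_normal_work_add(): consumes a whole chain at once. */
static void fake_task_work_add(struct fake_req *head)
{
	int n = 0;

	for (; head; head = head->next)
		n++;
	printf("batched task_work add of %d request(s)\n", n);
}

int main(void)
{
	struct fake_req reqs[20];
	struct fake_req *head = NULL, **tailp = &head;
	int last_ctx = -1, count = 0, i;

	/* Requests 0-11 belong to ctx 1, the rest to ctx 2. */
	for (i = 0; i < 20; i++)
		reqs[i].ctx_id = (i < 12) ? 1 : 2;

	for (i = 0; i < 20; i++) {
		/* Flush when the context changes: one batch per ctx. */
		if (count && reqs[i].ctx_id != last_ctx) {
			*tailp = NULL;
			fake_task_work_add(head);
			head = NULL;
			tailp = &head;
			count = 0;
		}

		*tailp = &reqs[i];
		tailp = &reqs[i].next;
		last_ctx = reqs[i].ctx_id;

		/* Flush when the batch threshold is reached. */
		if (++count == BATCH_SIZE) {
			*tailp = NULL;
			fake_task_work_add(head);
			head = NULL;
			tailp = &head;
			count = 0;
		}
	}

	/* Flush whatever is left, like the final flush_req_free_list(). */
	if (count) {
		*tailp = NULL;
		fake_task_work_add(head);
	}
	return 0;
}

With the request layout above this prints batches of 8, 4 and 8
requests: the first flush is triggered by the threshold, the second by
the ctx change, the third by the threshold again.
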
 io_uring/io-wq.c    | 62 +++++++++++++++++++++++++++++++++++++++++++--
 io_uring/io-wq.h    |  4 ++-
 io_uring/io_uring.c | 23 ++++++++++++++---
 io_uring/io_uring.h |  6 ++++-
 4 files changed, 87 insertions(+), 8 deletions(-)

diff --git a/io_uring/io-wq.c b/io_uring/io-wq.c
index 5d0928f37471..096711707db9 100644
--- a/io_uring/io-wq.c
+++ b/io_uring/io-wq.c
@@ -544,6 +544,20 @@ static void io_assign_current_work(struct io_worker *worker,
 	raw_spin_unlock(&worker->lock);
 }
 
+static void flush_req_free_list(struct llist_head *free_list,
+				struct llist_node *tail)
+{
+	struct io_kiocb *first_req, *last_req;
+
+	first_req = container_of(free_list->first, struct io_kiocb,
+				 io_task_work.node);
+	last_req = container_of(tail, struct io_kiocb,
+				io_task_work.node);
+
+	io_req_normal_work_add(first_req, last_req);
+	init_llist_head(free_list);
+}
+
 /*
  * Called with acct->lock held, drops it before returning
  */
@@ -553,6 +567,10 @@ static void io_worker_handle_work(struct io_wq_acct *acct,
 {
 	struct io_wq *wq = worker->wq;
 	bool do_kill = test_bit(IO_WQ_BIT_EXIT, &wq->state);
+	LLIST_HEAD(free_list);
+	int free_req = 0;
+	struct llist_node *tail = NULL;
+	struct io_ring_ctx *last_added_ctx = NULL;
 
 	do {
 		struct io_wq_work *work;
@@ -592,6 +610,9 @@ static void io_worker_handle_work(struct io_wq_acct *acct,
 		do {
 			struct io_wq_work *next_hashed, *linked;
 			unsigned int hash = io_get_work_hash(work);
+			struct io_kiocb *req = container_of(work,
+						struct io_kiocb, work);
+			bool did_free = false;
 
 			next_hashed = wq_next_work(work);
 
@@ -601,7 +622,41 @@ static void io_worker_handle_work(struct io_wq_acct *acct,
 			wq->do_work(work);
 			io_assign_current_work(worker, NULL);
 
-			linked = wq->free_work(work);
+			/*
+			 * All requests in free list must have the same
+			 * io_ring_ctx.
+			 */
+			if (last_added_ctx && last_added_ctx != req->ctx) {
+				flush_req_free_list(&free_list, tail);
+				tail = NULL;
+				last_added_ctx = NULL;
+				free_req = 0;
+			}
+
+			/*
+			 * Try to batch free work when
+			 * !IORING_SETUP_DEFER_TASKRUN to reduce contention
+			 * on tctx->task_list.
+			 */
+			if (req->ctx->flags & IORING_SETUP_DEFER_TASKRUN)
+				linked = wq->free_work(work, NULL, NULL);
+			else
+				linked = wq->free_work(work, &free_list, &did_free);
+
+			if (did_free) {
+				if (!tail)
+					tail = free_list.first;
+
+				last_added_ctx = req->ctx;
+				free_req++;
+				if (free_req == IO_REQ_ALLOC_BATCH) {
+					flush_req_free_list(&free_list, tail);
+					tail = NULL;
+					last_added_ctx = NULL;
+					free_req = 0;
+				}
+			}
+
 			work = next_hashed;
 			if (!work && linked && !io_wq_is_hashed(linked)) {
 				work = linked;
@@ -626,6 +681,9 @@ static void io_worker_handle_work(struct io_wq_acct *acct,
 			break;
 		raw_spin_lock(&acct->lock);
 	} while (1);
+
+	if (free_list.first)
+		flush_req_free_list(&free_list, tail);
 }
 
 static int io_wq_worker(void *data)
@@ -899,7 +957,7 @@ static void io_run_cancel(struct io_wq_work *work, struct io_wq *wq)
 	do {
 		atomic_or(IO_WQ_WORK_CANCEL, &work->flags);
 		wq->do_work(work);
-		work = wq->free_work(work);
+		work = wq->free_work(work, NULL, NULL);
 	} while (work);
 }
 
diff --git a/io_uring/io-wq.h b/io_uring/io-wq.h
index b3b004a7b625..4f1674d3d68e 100644
--- a/io_uring/io-wq.h
+++ b/io_uring/io-wq.h
@@ -21,7 +21,9 @@ enum io_wq_cancel {
 	IO_WQ_CANCEL_NOTFOUND,	/* work not found */
 };
 
-typedef struct io_wq_work *(free_work_fn)(struct io_wq_work *);
+typedef struct io_wq_work *(free_work_fn)(struct io_wq_work *,
+					  struct llist_head *,
+					  bool *did_free);
 typedef void (io_wq_work_fn)(struct io_wq_work *);
 
 struct io_wq_hash {
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 0c111f7d7832..0343c9ec7271 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -120,7 +120,6 @@
 #define IO_TCTX_REFS_CACHE_NR	(1U << 10)
 
 #define IO_COMPL_BATCH			32
-#define IO_REQ_ALLOC_BATCH		8
 #define IO_LOCAL_TW_DEFAULT_MAX		20
 
 struct io_defer_entry {
@@ -985,13 +984,18 @@ __cold bool __io_alloc_req_refill(struct io_ring_ctx *ctx)
 	return true;
 }
 
-__cold void io_free_req(struct io_kiocb *req)
+static __cold void io_set_req_free(struct io_kiocb *req)
 {
 	/* refs were already put, restore them for io_req_task_complete() */
 	req->flags &= ~REQ_F_REFCOUNT;
 	/* we only want to free it, don't post CQEs */
 	req->flags |= REQ_F_CQE_SKIP;
 	req->io_task_work.func = io_req_task_complete;
+}
+
+__cold void io_free_req(struct io_kiocb *req)
+{
+	io_set_req_free(req);
 	io_req_task_work_add(req);
 }
 
@@ -1772,16 +1776,27 @@ int io_poll_issue(struct io_kiocb *req, struct io_tw_state *ts)
 				 IO_URING_F_COMPLETE_DEFER);
 }
 
-struct io_wq_work *io_wq_free_work(struct io_wq_work *work)
+struct io_wq_work *io_wq_free_work(struct io_wq_work *work,
+				   struct llist_head *free_list,
+				   bool *did_free)
 {
 	struct io_kiocb *req = container_of(work, struct io_kiocb, work);
 	struct io_kiocb *nxt = NULL;
+	bool free_req = false;
 
 	if (req_ref_put_and_test(req)) {
 		if (req->flags & IO_REQ_LINK_FLAGS)
 			nxt = io_req_find_next(req);
-		io_free_req(req);
+		io_set_req_free(req);
+		if (free_list)
+			__llist_add(&req->io_task_work.node, free_list);
+		else
+			io_req_task_work_add(req);
+		free_req = true;
 	}
+	if (did_free)
+		*did_free = free_req;
+
 	return nxt ? &nxt->work : NULL;
 }
 
diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h
index bdd6407c14d0..dc050bc44b65 100644
--- a/io_uring/io_uring.h
+++ b/io_uring/io_uring.h
@@ -54,6 +54,8 @@ struct io_wait_queue {
 #endif
 };
 
+#define IO_REQ_ALLOC_BATCH	8
+
 static inline bool io_should_wake(struct io_wait_queue *iowq)
 {
 	struct io_ring_ctx *ctx = iowq->ctx;
@@ -111,7 +113,9 @@ int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr);
 int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin);
 void __io_submit_flush_completions(struct io_ring_ctx *ctx);
 
-struct io_wq_work *io_wq_free_work(struct io_wq_work *work);
+struct io_wq_work *io_wq_free_work(struct io_wq_work *work,
+				   struct llist_head *free_req,
+				   bool *did_free);
 void io_wq_submit_work(struct io_wq_work *work);
 
 void io_free_req(struct io_kiocb *req);
-- 
2.43.0

