lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Date:	Mon,  3 Dec 2012 12:58:41 -0800
From:	Kent Overstreet <koverstreet@...gle.com>
To:	linux-kernel@...r.kernel.org, linux-aio@...ck.org,
	linux-fsdevel@...r.kernel.org
Cc:	zab@...hat.com, bcrl@...ck.org, jmoyer@...hat.com, axboe@...nel.dk,
	viro@...iv.linux.org.uk, Kent Overstreet <koverstreet@...gle.com>
Subject: [PATCH 25/26] aio: use xchg() instead of completion_lock

So, for sticking kiocb completions on the kioctx ringbuffer, we need a
lock - it unfortunately can't be lockless.

When the kioctx is shared between threads on different cpus and the rate
of completions is high, this lock sees quite a bit of contention - in
terms of cacheline contention it's the hottest thing in the aio
subsystem.

That means, with a regular spinlock, we're going to take a cache miss
to grab the lock, then another cache miss when we touch the data the
lock protects - if it's on the same cacheline as the lock, other cpus
spinning on the lock are going to be pulling it out from under us as
we're using it.

So, we use an old trick to get rid of this second forced cache miss -
make the data the lock protects be the lock itself, so we grab them both
at once.

Signed-off-by: Kent Overstreet <koverstreet@...gle.com>
---
 fs/aio.c | 24 +++++++++++++-----------
 1 file changed, 13 insertions(+), 11 deletions(-)

diff --git a/fs/aio.c b/fs/aio.c
index 931606b..f44f21d 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -101,11 +101,11 @@ struct kioctx {
 
 	struct {
 		struct mutex	ring_lock;
+		unsigned	shadow_tail;
 	} ____cacheline_aligned;
 
 	struct {
 		unsigned	tail;
-		spinlock_t	completion_lock;
 	} ____cacheline_aligned;
 
 	struct {
@@ -311,9 +311,9 @@ static void free_ioctx(struct kioctx *ctx)
 	kunmap_atomic(ring);
 
 	while (atomic_read(&ctx->reqs_available) < ctx->nr) {
-		wait_event(ctx->wait, head != ctx->tail);
+		wait_event(ctx->wait, head != ctx->shadow_tail);
 
-		avail = (head < ctx->tail ? ctx->tail : ctx->nr) - head;
+		avail = (head < ctx->shadow_tail ? ctx->shadow_tail : ctx->nr) - head;
 
 		atomic_add(avail, &ctx->reqs_available);
 		head += avail;
@@ -378,7 +378,6 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
 	rcu_read_unlock();
 
 	spin_lock_init(&ctx->ctx_lock);
-	spin_lock_init(&ctx->completion_lock);
 	mutex_init(&ctx->ring_lock);
 	init_waitqueue_head(&ctx->wait);
 
@@ -686,9 +685,10 @@ void aio_complete(struct kiocb *iocb, long res, long res2)
 	 * ctx->ctx_lock to prevent other code from messing with the tail
 	 * pointer since we might be called from irq context.
 	 */
-	spin_lock_irqsave(&ctx->completion_lock, flags);
+	local_irq_save(flags);
+	while ((tail = xchg(&ctx->tail, UINT_MAX)) == UINT_MAX)
+		cpu_relax();
 
-	tail = ctx->tail;
 	pos = tail + AIO_EVENTS_OFFSET;
 
 	if (++tail >= ctx->nr)
@@ -714,14 +714,16 @@ void aio_complete(struct kiocb *iocb, long res, long res2)
 	 */
 	smp_wmb();	/* make event visible before updating tail */
 
-	ctx->tail = tail;
+	ctx->shadow_tail = tail;
 
 	ring = kmap_atomic(ctx->ring_pages[0]);
 	ring->tail = tail;
 	kunmap_atomic(ring);
 	flush_dcache_page(ctx->ring_pages[0]);
 
-	spin_unlock_irqrestore(&ctx->completion_lock, flags);
+	smp_wmb();
+	ctx->tail = tail;
+	local_irq_restore(flags);
 
 	pr_debug("added to ring %p at [%u]\n", iocb, tail);
 
@@ -766,11 +768,11 @@ static int aio_read_events(struct kioctx *ctx, struct io_event __user *event,
 	pr_debug("h%u t%u m%u\n", *head, ctx->tail, ctx->nr);
 
 	while (ret < nr) {
-		unsigned i = (*head < ctx->tail ? ctx->tail : ctx->nr) - *head;
+		unsigned i = (*head < ctx->shadow_tail ? ctx->shadow_tail : ctx->nr) - *head;
 		struct io_event *ev;
 		struct page *page;
 
-		if (*head == ctx->tail)
+		if (*head == ctx->shadow_tail)
 			break;
 
 		i = min_t(int, i, nr - ret);
@@ -860,7 +862,7 @@ retry:
 		prepare_to_wait_exclusive(&ctx->wait, &wait,
 					  TASK_INTERRUPTIBLE);
 
-		if (head != ctx->tail) {
+		if (head != ctx->shadow_tail) {
 			__set_current_state(TASK_RUNNING);
 			goto retry;
 		}
-- 
1.7.12

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ