linux-kernel - [PATCH 1/2] aio: clean up aio_migratepage() and related code much

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [day] [month] [year] [list]
Message-ID: <532A80AC.5000709@cn.fujitsu.com>
Date:	Thu, 20 Mar 2014 13:46:20 +0800
From:	Gu Zheng <guz.fnst@...fujitsu.com>
To:	Benjamin <bcrl@...ck.org>
CC:	Al Viro <viro@...iv.linux.org.uk>, jmoyer@...hat.com,
	kosaki.motohiro@...fujitsu.com,
	KAMEZAWA Hiroyuki <kamezawa.hiroyu@...fujitsu.com>,
	Yasuaki Ishimatsu <isimatu.yasuaki@...fujitsu.com>,
	tangchen <tangchen@...fujitsu.com>, miaox@...fujitsu.com,
	Andrew Morton <akpm@...ux-foundation.org>, linux-aio@...ck.org,
	fsdevel <linux-fsdevel@...r.kernel.org>,
	linux-kernel <linux-kernel@...r.kernel.org>
Subject: [PATCH 1/2] aio: clean up aio_migratepage() and related code much

As the page migration framework holds lock_page() to protect the pages
(both old and new) while migrating, so while the page migrating, both
of old page and new page are locked. And the aio context teardown
routine will call *truncate*(in put_aio_ring_file()) to truncate
pagecache which needs to acquire page_lock() for each page one by one.
So there is a native mutual exclusion between *migrate page* v.s. truncate().

If put_aio_ring_file() is called at first of the context teardown flow
(aio_free_ring). Then, page migration and ctx freeing will have mutual
execution guarded by lock_page() v.s. truncate(). Once a page is removed
from radix-tree, it will not be migrated. On the other hand, the context
can not be freed while the page migration are ongoing.

aio_free_ring
-put_aio_ring_file()		  |
 |-truncate_setsize()             |migrate_pages()
 | |-truncate_inode_pages_range() | |-__unmap_and_move()
 |  |-trylock_page(page)          |  |-lock_page(old)
 |-i_mapping->private_data = NULL |  ...
 |-ctx->aio_ring_file=NULL        |   |-move_to_new_page()
 |                                |    |-trylock_page(newpage)
			          |     |-aio_migratepage()
So that, the additional spinlock(address_space's private_lock) used to
protect use and updates of the mapping's private_data, and the sane check
of context is needless, so remove it here.

Besides, we also move filling ring_pages[] array and ctx->nr_pages into the
page_lock protection in aio_setup_ring to keep the semantic unanimous.

Moreover, after migrate_page_move_mapping() success, page migration should
never fail, so here we merge the flow in one routine.

Thanks KAMEZAWA Hiroyuki very much for giving directions on this.

ps.It is applied on linux-next(3-18).

Signed-off-by: Gu Zheng <guz.fnst@...fujitsu.com>
---
 fs/aio.c |   92 ++++++++++++++++++++-----------------------------------------
 1 files changed, 30 insertions(+), 62 deletions(-)

diff --git a/fs/aio.c b/fs/aio.c
index 4133ba9..88ad40c 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -229,11 +229,8 @@ static void put_aio_ring_file(struct kioctx *ctx)
 	if (aio_ring_file) {
 		truncate_setsize(aio_ring_file->f_inode, 0);
 
-		/* Prevent further access to the kioctx from migratepages */
-		spin_lock(&aio_ring_file->f_inode->i_mapping->private_lock);
 		aio_ring_file->f_inode->i_mapping->private_data = NULL;
 		ctx->aio_ring_file = NULL;
-		spin_unlock(&aio_ring_file->f_inode->i_mapping->private_lock);
 
 		fput(aio_ring_file);
 	}
@@ -243,6 +240,8 @@ static void aio_free_ring(struct kioctx *ctx)
 {
 	int i;
 
+	put_aio_ring_file(ctx);
+
 	for (i = 0; i < ctx->nr_pages; i++) {
 		struct page *page;
 		pr_debug("pid(%d) [%d] page->count=%d\n", current->pid, i,
@@ -254,8 +253,6 @@ static void aio_free_ring(struct kioctx *ctx)
 		put_page(page);
 	}
 
-	put_aio_ring_file(ctx);
-
 	if (ctx->ring_pages && ctx->ring_pages != ctx->internal_pages) {
 		kfree(ctx->ring_pages);
 		ctx->ring_pages = NULL;
@@ -283,32 +280,22 @@ static int aio_migratepage(struct address_space *mapping, struct page *new,
 {
 	struct kioctx *ctx;
 	unsigned long flags;
-	int rc;
-
-	rc = 0;
+	pgoff_t index;
+	int rc = MIGRATEPAGE_SUCCESS;
 
-	/* Make sure the old page hasn't already been changed */
-	spin_lock(&mapping->private_lock);
+	/*
+	 * Because old page is *locked*, if we see mapping, the page isn't
+	 * truncated, andmapping , mapping->private_data etc...are all valid.
+	 */
 	ctx = mapping->private_data;
-	if (ctx) {
-		pgoff_t idx;
-		spin_lock_irqsave(&ctx->completion_lock, flags);
-		idx = old->index;
-		if (idx < (pgoff_t)ctx->nr_pages) {
-			if (ctx->ring_pages[idx] != old)
-				rc = -EAGAIN;
-		} else
-			rc = -EINVAL;
-		spin_unlock_irqrestore(&ctx->completion_lock, flags);
-	} else
-		rc = -EINVAL;
-	spin_unlock(&mapping->private_lock);
 
-	if (rc != 0)
-		return rc;
+	index = old->index;
+	BUG_ON(index >= (pgoff_t)ctx->nr_pages);
+	VM_BUG_ON(ctx->ring_pages[index] != old);
 
 	/* Writeback must be complete */
 	BUG_ON(PageWriteback(old));
+	/* Extra ref cnt for rind_pages[] array */
 	get_page(new);
 
 	rc = migrate_page_move_mapping(mapping, new, old, NULL, mode, 1);
@@ -317,42 +304,22 @@ static int aio_migratepage(struct address_space *mapping, struct page *new,
 		return rc;
 	}
 
-	/* We can potentially race against kioctx teardown here.  Use the
-	 * address_space's private data lock to protect the mapping's
-	 * private_data.
-	 */
-	spin_lock(&mapping->private_lock);
-	ctx = mapping->private_data;
-	if (ctx) {
-		pgoff_t idx;
-		spin_lock_irqsave(&ctx->completion_lock, flags);
-		migrate_page_copy(new, old);
+	spin_lock_irqsave(&ctx->completion_lock, flags);
 
-		/*
-		 * Ensure memory copy is finished before updating
-		 * ctx->ring_pages[]. Otherwise other processes may access to
-		 * new ring pages which are not fully initialized.
-		 */
-		smp_wmb();
+	/* Migration should not fail if migrate_page_move_mapping SUCCESS */
+	migrate_page_copy(new, old);
 
-		idx = old->index;
-		if (idx < (pgoff_t)ctx->nr_pages) {
-			/* And only do the move if things haven't changed */
-			if (ctx->ring_pages[idx] == old)
-				ctx->ring_pages[idx] = new;
-			else
-				rc = -EAGAIN;
-		} else
-			rc = -EINVAL;
-		spin_unlock_irqrestore(&ctx->completion_lock, flags);
-	} else
-		rc = -EBUSY;
-	spin_unlock(&mapping->private_lock);
+	/*
+	 * Ensure memory copy is finished before updating
+	 * ctx->ring_pages[]. Otherwise other processes may access to
+	 * new ring pages which are not fully initialized.
+	 */
+	smp_wmb();
 
-	if (rc == MIGRATEPAGE_SUCCESS)
-		put_page(old);
-	else
-		put_page(new);
+	ctx->ring_pages[old->index] = new;
+	spin_unlock_irqrestore(&ctx->completion_lock, flags);
+
+	put_page(old);
 
 	return rc;
 }
@@ -405,6 +372,8 @@ static int aio_setup_ring(struct kioctx *ctx)
 		}
 	}
 
+	ctx->nr_pages = 0;
+
 	for (i = 0; i < nr_pages; i++) {
 		struct page *page;
 		page = find_or_create_page(file->f_inode->i_mapping,
@@ -415,13 +384,12 @@ static int aio_setup_ring(struct kioctx *ctx)
 			 current->pid, i, page_count(page));
 		SetPageUptodate(page);
 		SetPageDirty(page);
-		unlock_page(page);
-
 		ctx->ring_pages[i] = page;
+		ctx->nr_pages++;
+		unlock_page(page);
 	}
-	ctx->nr_pages = i;
 
-	if (unlikely(i != nr_pages)) {
+	if (unlikely(ctx->nr_pages != nr_pages)) {
 		aio_free_ring(ctx);
 		return -EAGAIN;
 	}
-- 
1.7.7

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/