Message-Id: <11940529702778-git-send-email-ezk@cs.sunysb.edu>
Date:	Fri,  2 Nov 2007 21:22:45 -0400
From:	Erez Zadok <ezk@...sunysb.edu>
To:	akpm@...ux-foundation.org
Cc:	linux-kernel@...r.kernel.org, linux-fsdevel@...r.kernel.org,
	viro@....linux.org.uk, hch@...radead.org,
	Hugh Dickins <hugh@...itas.com>,
	Pekka Enberg <penberg@...helsinki.fi>,
	Erez Zadok <ezk@...sunysb.edu>
Subject: [PATCH 4/8] Unionfs: mmap updates

Rewrite unionfs_writepage to minimize dependence on AOP_WRITEPAGE_ACTIVATE,
handle memory pressure better, and update documentation.  Remove
unionfs_sync_page because it's not needed.
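
For reference, here is a condensed, illustration-only sketch of the new
writepage flow (the function name below is made up for this summary;
page-uptodate bookkeeping and attribute-time copying are elided, see the
full diff for the real code):

static int unionfs_writepage_sketch(struct page *page,
				    struct writeback_control *wbc)
{
	struct inode *inode = page->mapping->host;
	struct address_space *lower_mapping =
		unionfs_lower_inode(inode)->i_mapping;
	/* clear __GFP_FS so the lookup cannot recurse into the fs */
	gfp_t mask = mapping_gfp_mask(lower_mapping) & ~__GFP_FS;
	struct page *lower_page;
	int err = 0;

	lower_page = find_or_create_page(lower_mapping, page->index, mask);
	if (!lower_page) {
		set_page_dirty(page);	/* redirty; the VM retries later */
		goto out;
	}

	copy_highpage(lower_page, page);	/* upper -> lower */

	if (wbc->for_reclaim) {
		/* reclaim only wants our page; defer the lower write */
		set_page_dirty(lower_page);
		unlock_page(lower_page);
	} else {
		clear_page_dirty_for_io(lower_page);
		err = lower_mapping->a_ops->writepage(lower_page, wbc);
		if (err == AOP_WRITEPAGE_ACTIVATE) {
			unlock_page(lower_page);  /* lower kept it dirty */
			err = 0;
		}
	}
	page_cache_release(lower_page);
out:
	unlock_page(page);	/* we never return AOP_WRITEPAGE_ACTIVATE */
	return err;
}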

CC: Hugh Dickins <hugh@...itas.com>
CC: Pekka Enberg <penberg@...helsinki.fi>

Signed-off-by: Erez Zadok <ezk@...sunysb.edu>
---
 fs/unionfs/mmap.c |  156 ++++++++++++++++++++--------------------------------
 1 files changed, 60 insertions(+), 96 deletions(-)

diff --git a/fs/unionfs/mmap.c b/fs/unionfs/mmap.c
index bed11c3..4b00b98 100644
--- a/fs/unionfs/mmap.c
+++ b/fs/unionfs/mmap.c
@@ -25,83 +25,91 @@ static int unionfs_writepage(struct page *page, struct writeback_control *wbc)
 	struct inode *inode;
 	struct inode *lower_inode;
 	struct page *lower_page;
-	char *kaddr, *lower_kaddr;
+	struct address_space *lower_mapping; /* lower inode mapping */
+	gfp_t mask;
 
 	inode = page->mapping->host;
 	lower_inode = unionfs_lower_inode(inode);
+	lower_mapping = lower_inode->i_mapping;
 
 	/*
 	 * find lower page (returns a locked page)
 	 *
-	 * NOTE: we used to call grab_cache_page(), but that was unnecessary
-	 * as it would have tried to create a new lower page if it didn't
-	 * exist, leading to deadlocks (esp. under memory-pressure
-	 * conditions, when it is really a bad idea to *consume* more
-	 * memory).  Instead, we assume the lower page exists, and if we can
-	 * find it, then we ->writepage on it; if we can't find it, then it
-	 * couldn't have disappeared unless the kernel already flushed it,
-	 * in which case we're still OK.  This is especially correct if
-	 * wbc->sync_mode is WB_SYNC_NONE (as per
-	 * Documentation/filesystems/vfs.txt).  If we can't flush our page
-	 * because we can't find a lower page, then at least we re-mark our
-	 * page as dirty, and return AOP_WRITEPAGE_ACTIVATE as the VFS
-	 * expects us to.  (Note, if in the future it'd turn out that we
-	 * have to find a lower page no matter what, then we'd have to
-	 * resort to RAIF's page pointer flipping trick.)
+	 * We turn off __GFP_FS while we look for or create a new lower
+	 * page.  This prevents a recursion into the file system code, which
+	 * under memory pressure conditions could lead to a deadlock.  This
+	 * is similar to how the loop driver behaves (see loop_set_fd in
+	 * drivers/block/loop.c).  If we can't find the lower page, we
+	 * redirty our page and return "success" so that the VM will call us
+	 * again in the (hopefully near) future.
 	 */
-	lower_page = find_lock_page(lower_inode->i_mapping, page->index);
+	mask = mapping_gfp_mask(lower_mapping) & ~(__GFP_FS);
+	lower_page = find_or_create_page(lower_mapping, page->index, mask);
 	if (!lower_page) {
-		err = AOP_WRITEPAGE_ACTIVATE;
+		err = 0;
 		set_page_dirty(page);
 		goto out;
 	}
 
-	/* get page address, and encode it */
-	kaddr = kmap(page);
-	lower_kaddr = kmap(lower_page);
+	/* copy page data from our upper page to the lower page */
+	copy_highpage(lower_page, page);
 
-	memcpy(lower_kaddr, kaddr, PAGE_CACHE_SIZE);
-
-	kunmap(page);
-	kunmap(lower_page);
-
-	BUG_ON(!lower_inode->i_mapping->a_ops->writepage);
-
-	/* call lower writepage (expects locked page) */
-	clear_page_dirty_for_io(lower_page); /* emulate VFS behavior */
-	err = lower_inode->i_mapping->a_ops->writepage(lower_page, wbc);
-
-	/* b/c find_lock_page locked it and ->writepage unlocks on success */
-	if (err)
+	/*
+	 * Call lower writepage (expects locked page).  However, if we are
+	 * called with wbc->for_reclaim, then the VFS/VM just wants to
+	 * reclaim our page.  Therefore, we don't need to call the lower
+	 * ->writepage: just copy our data to the lower page (already done
+	 * above), then mark the lower page dirty and unlock it, and return
+	 * success.
+	 */
+	if (wbc->for_reclaim) {
+		set_page_dirty(lower_page);
 		unlock_page(lower_page);
-	/* b/c grab_cache_page increased refcnt */
-	page_cache_release(lower_page);
-
+		goto out_release;
+	}
+	BUG_ON(!lower_mapping->a_ops->writepage);
+	clear_page_dirty_for_io(lower_page); /* emulate VFS behavior */
+	err = lower_mapping->a_ops->writepage(lower_page, wbc);
 	if (err < 0) {
 		ClearPageUptodate(page);
-		goto out;
+		goto out_release;
 	}
+
+	/*
+	 * Lower file systems such as ramfs and tmpfs may return
+	 * AOP_WRITEPAGE_ACTIVATE so that the VM won't try to (pointlessly)
+	 * write the page again for a while.  But those lower file systems
+	 * also set the page dirty bit back again.  Since we successfully
+	 * copied our page data to the lower page, the VM will come
+	 * back to the lower page (directly) and try to flush it.  So we can
+	 * save the VM the hassle of coming back to our page and trying to
+	 * flush too.  Therefore, we don't re-dirty our own page, and we
+	 * never return AOP_WRITEPAGE_ACTIVATE back to the VM (we consider
+	 * this a success).
+	 *
+	 * We also unlock the lower page if the lower ->writepage returned
+	 * AOP_WRITEPAGE_ACTIVATE.  (This "anomalous" behaviour may be
+	 * addressed in future shmem/VM code.)
+	 */
 	if (err == AOP_WRITEPAGE_ACTIVATE) {
-		/*
-		 * Lower file systems such as ramfs and tmpfs, may return
-		 * AOP_WRITEPAGE_ACTIVATE so that the VM won't try to
-		 * (pointlessly) write the page again for a while.  But
-		 * those lower file systems also set the page dirty bit back
-		 * again.  So we mimic that behaviour here.
-		 */
-		if (PageDirty(lower_page))
-			set_page_dirty(page);
-		goto out;
+		err = 0;
+		unlock_page(lower_page);
 	}
 
 	/* all is well */
 	SetPageUptodate(page);
-	/* lower mtimes has changed: update ours */
+	/* lower mtimes have changed: update ours */
 	unionfs_copy_attr_times(inode);
 
-	unlock_page(page);
-
+out_release:
+	/* b/c find_or_create_page increased refcnt */
+	page_cache_release(lower_page);
 out:
+	/*
+	 * We unlock our page unconditionally, because we never return
+	 * AOP_WRITEPAGE_ACTIVATE.
+	 */
+	unlock_page(page);
 	return err;
 }
 
@@ -119,9 +127,8 @@ static int unionfs_writepages(struct address_space *mapping,
 	if (!mapping_cap_writeback_dirty(lower_inode->i_mapping))
 		goto out;
 
-	/* Note: generic_writepages may return AOP_WRITEPAGE_ACTIVATE */
 	err = generic_writepages(mapping, wbc);
-	if (err == 0)
+	if (!err)
 		unionfs_copy_attr_times(inode);
 out:
 	return err;
@@ -313,53 +320,10 @@ out:
 	return err;		/* assume all is ok */
 }
 
-static void unionfs_sync_page(struct page *page)
-{
-	struct inode *inode;
-	struct inode *lower_inode;
-	struct page *lower_page;
-	struct address_space *mapping;
-
-	inode = page->mapping->host;
-	lower_inode = unionfs_lower_inode(inode);
-
-	/*
-	 * Find lower page (returns a locked page).
-	 *
-	 * NOTE: we used to call grab_cache_page(), but that was unnecessary
-	 * as it would have tried to create a new lower page if it didn't
-	 * exist, leading to deadlocks.  All our sync_page method needs to
-	 * do is ensure that pending I/O gets done.
-	 */
-	lower_page = find_lock_page(lower_inode->i_mapping, page->index);
-	if (!lower_page) {
-		printk(KERN_ERR "unionfs: find_lock_page failed\n");
-		goto out;
-	}
-
-	/* do the actual sync */
-	mapping = lower_page->mapping;
-	/*
-	 * XXX: can we optimize ala RAIF and set the lower page to be
-	 * discarded after a successful sync_page?
-	 */
-	if (mapping && mapping->a_ops && mapping->a_ops->sync_page)
-		mapping->a_ops->sync_page(lower_page);
-
-	/* b/c find_lock_page locked it */
-	unlock_page(lower_page);
-	/* b/c find_lock_page increased refcnt */
-	page_cache_release(lower_page);
-
-out:
-	return;
-}
-
 struct address_space_operations unionfs_aops = {
 	.writepage	= unionfs_writepage,
 	.writepages	= unionfs_writepages,
 	.readpage	= unionfs_readpage,
 	.prepare_write	= unionfs_prepare_write,
 	.commit_write	= unionfs_commit_write,
-	.sync_page	= unionfs_sync_page,
 };
-- 
1.5.2.2

