lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite for Android: free password hash cracker in your pocket
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <1369301187-24934-3-git-send-email-mgorman@suse.de>
Date:	Thu, 23 May 2013 10:26:27 +0100
From:	Mel Gorman <mgorman@...e.de>
To:	Andrew Morton <akpm@...ux-foundation.org>
Cc:	Jiri Slaby <jslaby@...e.cz>,
	Valdis Kletnieks <Valdis.Kletnieks@...edu>,
	Rik van Riel <riel@...hat.com>,
	Zlatko Calusic <zcalusic@...sync.net>,
	Johannes Weiner <hannes@...xchg.org>,
	dormando <dormando@...ia.net>, Michal Hocko <mhocko@...e.cz>,
	Jan Kara <jack@...e.cz>, Dave Chinner <david@...morbit.com>,
	Kamezawa Hiroyuki <kamezawa.hiroyu@...fujitsu.com>,
	Linux-FSDevel <linux-fsdevel@...r.kernel.org>,
	Linux-MM <linux-mm@...ck.org>,
	LKML <linux-kernel@...r.kernel.org>, Mel Gorman <mgorman@...e.de>
Subject: [PATCH 2/2] mm: vmscan: Take page buffers dirty and locked state into account

Page reclaim keeps track of dirty and under writeback pages and uses it to
determine if wait_iff_congested() should stall or if kswapd should begin
writing back pages. This fails to account for buffer pages that can be
under writeback but not PageWriteback which is the case for filesystems
like ext3. Furthermore, PageDirty buffer pages can have all the buffers
clean and writepage does no IO so it should not be accounted as congested.

This patch adds an address_space operation that filesystems may
optionally use to check if a page is really dirty or really under
writeback. An implementation is provided for filesystems that use
buffer_heads. By default, the page flags are obeyed.

Credit goes to Jan Kara for identifying that the page flags alone are
not sufficient for ext3 and sanity checking a number of ideas on how
the problem could be addressed.

Signed-off-by: Mel Gorman <mgorman@...e.de>
---
 fs/buffer.c                 | 34 ++++++++++++++++++++++++++++++++++
 fs/ext2/inode.c             |  1 +
 fs/ext3/inode.c             |  3 +++
 fs/ext4/inode.c             |  2 ++
 fs/gfs2/aops.c              |  2 ++
 fs/ntfs/aops.c              |  1 +
 fs/ocfs2/aops.c             |  1 +
 fs/xfs/xfs_aops.c           |  1 +
 include/linux/buffer_head.h |  3 +++
 include/linux/fs.h          |  1 +
 mm/vmscan.c                 | 33 +++++++++++++++++++++++++++++++--
 11 files changed, 80 insertions(+), 2 deletions(-)

diff --git a/fs/buffer.c b/fs/buffer.c
index 1aa0836..4247aa9 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -91,6 +91,40 @@ void unlock_buffer(struct buffer_head *bh)
 EXPORT_SYMBOL(unlock_buffer);
 
 /*
+ * Returns if the page has dirty or writeback buffers. If all the buffers
+ * are unlocked and clean then the PageDirty information is stale. If
+ * any of the pages are locked, it is assumed they are locked for IO.
+ */
+void buffer_check_dirty_writeback(struct page *page,
+				     bool *dirty, bool *writeback)
+{
+	struct buffer_head *head, *bh;
+	*dirty = false;
+	*writeback = false;
+
+	BUG_ON(!PageLocked(page));
+
+	if (!page_has_buffers(page))
+		return;
+
+	if (PageWriteback(page))
+		*writeback = true;
+
+	head = page_buffers(page);
+	bh = head;
+	do {
+		if (buffer_locked(bh))
+			*writeback = true;
+
+		if (buffer_dirty(bh))
+			*dirty = true;
+
+		bh = bh->b_this_page;
+	} while (bh != head);
+}
+EXPORT_SYMBOL(buffer_check_dirty_writeback);
+
+/*
  * Block until a buffer comes unlocked.  This doesn't stop it
  * from becoming locked again - you have to lock it yourself
  * if you want to preserve its state.
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 0a87bb1..2fc3593 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -880,6 +880,7 @@ const struct address_space_operations ext2_aops = {
 	.writepages		= ext2_writepages,
 	.migratepage		= buffer_migrate_page,
 	.is_partially_uptodate	= block_is_partially_uptodate,
+	.is_dirty_writeback	= buffer_check_dirty_writeback,
 	.error_remove_page	= generic_error_remove_page,
 };
 
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index 23c7128..14494fc 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -1984,6 +1984,7 @@ static const struct address_space_operations ext3_ordered_aops = {
 	.direct_IO		= ext3_direct_IO,
 	.migratepage		= buffer_migrate_page,
 	.is_partially_uptodate  = block_is_partially_uptodate,
+	.is_dirty_writeback	= buffer_check_dirty_writeback,
 	.error_remove_page	= generic_error_remove_page,
 };
 
@@ -1999,6 +2000,7 @@ static const struct address_space_operations ext3_writeback_aops = {
 	.direct_IO		= ext3_direct_IO,
 	.migratepage		= buffer_migrate_page,
 	.is_partially_uptodate  = block_is_partially_uptodate,
+	.is_dirty_writeback	= buffer_check_dirty_writeback,
 	.error_remove_page	= generic_error_remove_page,
 };
 
@@ -2013,6 +2015,7 @@ static const struct address_space_operations ext3_journalled_aops = {
 	.invalidatepage		= ext3_invalidatepage,
 	.releasepage		= ext3_releasepage,
 	.is_partially_uptodate  = block_is_partially_uptodate,
+	.is_dirty_writeback	= buffer_check_dirty_writeback,
 	.error_remove_page	= generic_error_remove_page,
 };
 
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 0723774..7af746a 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3309,6 +3309,7 @@ static const struct address_space_operations ext4_aops = {
 	.direct_IO		= ext4_direct_IO,
 	.migratepage		= buffer_migrate_page,
 	.is_partially_uptodate  = block_is_partially_uptodate,
+	.is_dirty_writeback	= buffer_check_dirty_writeback,
 	.error_remove_page	= generic_error_remove_page,
 };
 
@@ -3340,6 +3341,7 @@ static const struct address_space_operations ext4_da_aops = {
 	.direct_IO		= ext4_direct_IO,
 	.migratepage		= buffer_migrate_page,
 	.is_partially_uptodate  = block_is_partially_uptodate,
+	.is_dirty_writeback	= buffer_check_dirty_writeback,
 	.error_remove_page	= generic_error_remove_page,
 };
 
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 0bad69e..027b8ea 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -1112,6 +1112,7 @@ static const struct address_space_operations gfs2_writeback_aops = {
 	.direct_IO = gfs2_direct_IO,
 	.migratepage = buffer_migrate_page,
 	.is_partially_uptodate = block_is_partially_uptodate,
+	.is_dirty_writeback = buffer_check_dirty_writeback,
 	.error_remove_page = generic_error_remove_page,
 };
 
@@ -1129,6 +1130,7 @@ static const struct address_space_operations gfs2_ordered_aops = {
 	.direct_IO = gfs2_direct_IO,
 	.migratepage = buffer_migrate_page,
 	.is_partially_uptodate = block_is_partially_uptodate,
+	.is_dirty_writeback = buffer_check_dirty_writeback,
 	.error_remove_page = generic_error_remove_page,
 };
 
diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c
index fa9c05f..eb85ac1 100644
--- a/fs/ntfs/aops.c
+++ b/fs/ntfs/aops.c
@@ -1549,6 +1549,7 @@ const struct address_space_operations ntfs_aops = {
 	.migratepage	= buffer_migrate_page,	/* Move a page cache page from
 						   one physical page to an
 						   other. */
+	.is_dirty_writeback = buffer_check_dirty_writeback,
 	.error_remove_page = generic_error_remove_page,
 };
 
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 20dfec7..191af11 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -2096,5 +2096,6 @@ const struct address_space_operations ocfs2_aops = {
 	.releasepage		= ocfs2_releasepage,
 	.migratepage		= buffer_migrate_page,
 	.is_partially_uptodate	= block_is_partially_uptodate,
+	.is_dirty_writeback	= buffer_check_dirty_writeback,
 	.error_remove_page	= generic_error_remove_page,
 };
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index f64ee71..1aada1c 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -1656,5 +1656,6 @@ const struct address_space_operations xfs_address_space_operations = {
 	.direct_IO		= xfs_vm_direct_IO,
 	.migratepage		= buffer_migrate_page,
 	.is_partially_uptodate  = block_is_partially_uptodate,
+	.is_dirty_writeback	= buffer_check_dirty_writeback,
 	.error_remove_page	= generic_error_remove_page,
 };
diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index 6d9f5a2..d458880 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -139,6 +139,9 @@ BUFFER_FNS(Prio, prio)
 	})
 #define page_has_buffers(page)	PagePrivate(page)
 
+void buffer_check_dirty_writeback(struct page *page,
+				     bool *dirty, bool *writeback);
+
 /*
  * Declarations
  */
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 0a9a6766..96f857f 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -380,6 +380,7 @@ struct address_space_operations {
 	int (*launder_page) (struct page *);
 	int (*is_partially_uptodate) (struct page *, read_descriptor_t *,
 					unsigned long);
+	void (*is_dirty_writeback) (struct page *, bool *, bool *);
 	int (*error_remove_page)(struct address_space *, struct page *);
 
 	/* swapfile support */
diff --git a/mm/vmscan.c b/mm/vmscan.c
index f3315c6..d9213d8 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -669,6 +669,25 @@ static enum page_references page_check_references(struct page *page,
 	return PAGEREF_RECLAIM;
 }
 
+/* Check if a page is dirty or under writeback */
+static void page_check_dirty_writeback(struct page *page,
+				       bool *dirty, bool *writeback)
+{
+	struct address_space *mapping;
+
+	/* By default assume that the page flags are accurate */
+	*dirty = PageDirty(page);
+	*writeback = PageWriteback(page);
+
+	/* Verify dirty/writeback state if the filesystem supports it */
+	if (!page_has_private(page))
+		return;
+
+	mapping = page_mapping(page);
+	if (mapping && mapping->a_ops->is_dirty_writeback)
+		mapping->a_ops->is_dirty_writeback(page, dirty, writeback);
+}
+
 /*
  * shrink_page_list() returns the number of reclaimed pages
  */
@@ -839,9 +858,19 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 		}
 
 		if (PageDirty(page)) {
-			nr_dirty++;
+			bool dirty, writeback;
+
+			/*
+			 * The number of dirty pages determines if a zone is
+			 * marked zone_is_reclaim_congested which affects
+			 * wait_iff_congested. The number of unqueued dirty
+			 * pages affects if kswapd will start writing pages.
+			 */
+			page_check_dirty_writeback(page, &dirty, &writeback);
+			if (dirty || writeback)
+				nr_dirty++;
 
-			if (!PageWriteback(page))
+			if (dirty && !writeback)
 				nr_unqueued_dirty++;
 
 			/*
-- 
1.8.1.4

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ