lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <4C3694A5.6080103@ds.jp.nec.com>
Date:	Thu, 08 Jul 2010 23:16:53 -0400
From:	Munehiro Ikeda <m-ikeda@...jp.nec.com>
To:	linux-kernel@...r.kernel.org, jens.axboe@...cle.com,
	Vivek Goyal <vgoyal@...hat.com>
CC:	Munehiro Ikeda <m-ikeda@...jp.nec.com>,
	Ryo Tsuruta <ryov@...inux.co.jp>, taka@...inux.co.jp,
	kamezawa.hiroyu@...fujitsu.com,
	Andrea Righi <righi.andrea@...il.com>,
	Gui Jianfeng <guijianfeng@...fujitsu.com>,
	akpm@...ux-foundation.org, balbir@...ux.vnet.ibm.com
Subject: [RFC][PATCH 04/11] blkiocg async: block_commit_write not to record
 process info

When a mmap(2)'d page is written back, which means the page doesn't
have buffer_head, ext4 prepares buffer_heads and calls
block_commit_write() from ext4_writepage().
This results to call mark_buffer_dirty() and the page's dirty flag
is set.  In this case, current process marking page dirty is (almost)
flush kernel thread, so the original info of a process which dirtied
this page is lost.

To prevent this issue, this patch introduces
block_commit_write_noiotrack() which is same as block_commit_write()
but runs through a code path not to record current process info.

The portion calling block_commit_write() in ext4 will be modified
in the following patch.

Signed-off-by: Munehiro "Muuhh" Ikeda <m-ikeda@...jp.nec.com>
---
 fs/buffer.c                 |   70 ++++++++++++++++++++++++++++++++-----------
 include/linux/buffer_head.h |    2 +
 2 files changed, 54 insertions(+), 18 deletions(-)

diff --git a/fs/buffer.c b/fs/buffer.c
index c418fdf..61ebf94 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -660,15 +660,17 @@ EXPORT_SYMBOL(mark_buffer_dirty_inode);
  *
  * If warn is true, then emit a warning if the page is not uptodate and has
  * not been truncated.
+ * If track is true, dirtying process info is recorded for iotrack.
  */
 static void __set_page_dirty(struct page *page,
-		struct address_space *mapping, int warn)
+		struct address_space *mapping, int warn, int track)
 {
 	spin_lock_irq(&mapping->tree_lock);
 	if (page->mapping) {	/* Race with truncate? */
 		WARN_ON_ONCE(warn && !PageUptodate(page));
 		account_page_dirtied(page, mapping);
-		blk_iotrack_reset_owner_pagedirty(page, current->mm);
+		if (track)
+			blk_iotrack_reset_owner_pagedirty(page, current->mm);
 		radix_tree_tag_set(&mapping->page_tree,
 				page_index(page), PAGECACHE_TAG_DIRTY);
 	}
@@ -723,7 +725,7 @@ int __set_page_dirty_buffers(struct page *page)
 	spin_unlock(&mapping->private_lock);
 
 	if (newly_dirty)
-		__set_page_dirty(page, mapping, 1);
+		__set_page_dirty(page, mapping, 1, 1);
 	return newly_dirty;
 }
 EXPORT_SYMBOL(__set_page_dirty_buffers);
@@ -1137,18 +1139,11 @@ __getblk_slow(struct block_device *bdev, sector_t block, int size)
  */
 
 /**
- * mark_buffer_dirty - mark a buffer_head as needing writeout
+ * __mark_buffer_dirty - helper function for mark_buffer_dirty*
  * @bh: the buffer_head to mark dirty
- *
- * mark_buffer_dirty() will set the dirty bit against the buffer, then set its
- * backing page dirty, then tag the page as dirty in its address_space's radix
- * tree and then attach the address_space's inode to its superblock's dirty
- * inode list.
- *
- * mark_buffer_dirty() is atomic.  It takes bh->b_page->mapping->private_lock,
- * mapping->tree_lock and the global inode_lock.
+ * @track: if true, dirtying process info will be recorded for iotrack
  */
-void mark_buffer_dirty(struct buffer_head *bh)
+static void __mark_buffer_dirty(struct buffer_head *bh, int track)
 {
 	WARN_ON_ONCE(!buffer_uptodate(bh));
 
@@ -1169,12 +1164,40 @@ void mark_buffer_dirty(struct buffer_head *bh)
 		if (!TestSetPageDirty(page)) {
 			struct address_space *mapping = page_mapping(page);
 			if (mapping)
-				__set_page_dirty(page, mapping, 0);
+				__set_page_dirty(page, mapping, 0, track);
 		}
 	}
 }
+
+/**
+ * mark_buffer_dirty - mark a buffer_head as needing writeout
+ * @bh: the buffer_head to mark dirty
+ *
+ * mark_buffer_dirty() will set the dirty bit against the buffer, then set its
+ * backing page dirty, then tag the page as dirty in its address_space's radix
+ * tree and then attach the address_space's inode to its superblock's dirty
+ * inode list.
+ *
+ * mark_buffer_dirty() is atomic.  It takes bh->b_page->mapping->private_lock,
+ * mapping->tree_lock and the global inode_lock.
+ */
+void mark_buffer_dirty(struct buffer_head *bh)
+{
+	__mark_buffer_dirty(bh, 1);
+}
 EXPORT_SYMBOL(mark_buffer_dirty);
 
+/**
+ * mark_buffer_dirty_noiotrack
+ * - same as mark_buffer_dirty but doesn't record dirtying process info
+ * @bh: the buffer_head to mark dirty
+ */
+void mark_buffer_dirty_noiotrack(struct buffer_head *bh)
+{
+	__mark_buffer_dirty(bh, 0);
+}
+EXPORT_SYMBOL(mark_buffer_dirty_noiotrack);
+
 /*
  * Decrement a buffer_head's reference count.  If all buffers against a page
  * have zero reference count, are clean and unlocked, and if the page is clean
@@ -1916,7 +1939,7 @@ static int __block_prepare_write(struct inode *inode, struct page *page,
 }
 
 static int __block_commit_write(struct inode *inode, struct page *page,
-		unsigned from, unsigned to)
+		unsigned from, unsigned to, int track)
 {
 	unsigned block_start, block_end;
 	int partial = 0;
@@ -1934,7 +1957,10 @@ static int __block_commit_write(struct inode *inode, struct page *page,
 				partial = 1;
 		} else {
 			set_buffer_uptodate(bh);
-			mark_buffer_dirty(bh);
+			if (track)
+				mark_buffer_dirty(bh);
+			else
+				mark_buffer_dirty_noiotrack(bh);
 		}
 		clear_buffer_new(bh);
 	}
@@ -2067,7 +2093,7 @@ int block_write_end(struct file *file, struct address_space *mapping,
 	flush_dcache_page(page);
 
 	/* This could be a short (even 0-length) commit */
-	__block_commit_write(inode, page, start, start+copied);
+	__block_commit_write(inode, page, start, start+copied, 1);
 
 	return copied;
 }
@@ -2414,11 +2440,19 @@ EXPORT_SYMBOL(block_prepare_write);
 int block_commit_write(struct page *page, unsigned from, unsigned to)
 {
 	struct inode *inode = page->mapping->host;
-	__block_commit_write(inode,page,from,to);
+	__block_commit_write(inode, page, from, to, 1);
 	return 0;
 }
 EXPORT_SYMBOL(block_commit_write);
 
+int block_commit_write_noiotrack(struct page *page, unsigned from, unsigned to)
+{
+	struct inode *inode = page->mapping->host;
+	__block_commit_write(inode, page, from, to, 0);
+	return 0;
+}
+EXPORT_SYMBOL(block_commit_write_noiotrack);
+
 /*
  * block_page_mkwrite() is not allowed to change the file size as it gets
  * called from a page fault handler when a page is first dirtied. Hence we must
diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index 1b9ba19..9d7e0b0 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -145,6 +145,7 @@ BUFFER_FNS(Unwritten, unwritten)
  */
 
 void mark_buffer_dirty(struct buffer_head *bh);
+void mark_buffer_dirty_noiotrack(struct buffer_head *bh);
 void init_buffer(struct buffer_head *, bh_end_io_t *, void *);
 void set_bh_page(struct buffer_head *bh,
 		struct page *page, unsigned long offset);
@@ -225,6 +226,7 @@ int cont_write_begin(struct file *, struct address_space *, loff_t,
 			get_block_t *, loff_t *);
 int generic_cont_expand_simple(struct inode *inode, loff_t size);
 int block_commit_write(struct page *page, unsigned from, unsigned to);
+int block_commit_write_noiotrack(struct page *page, unsigned from, unsigned to);
 int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
 				get_block_t get_block);
 void block_sync_page(struct page *);
-- 
1.6.2.5
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ