Date:	Wed, 30 Jul 2008 17:46:51 +0530
From:	"Aneesh Kumar K.V" <aneesh.kumar@...ux.vnet.ibm.com>
To:	Mingming Cao <cmm@...ibm.com>
Cc:	tytso <tytso@....edu>, Shehjar Tikoo <shehjart@....unsw.edu.au>,
	linux-ext4@...r.kernel.org, Andreas Dilger <adilger@....com>
Subject: Re: [PATCH v3]Ext4: journal credits reservation fixes for DIO,
	fallocate and delalloc writepages

Hi Mingming,

This is the patch I was working on. It is still not complete, but I am
sending it early so that I can get some feedback.

commit a4546f0034fcfb0a20991378fe4a3cf6c157ad72
Author: Aneesh Kumar K.V <aneesh.kumar@...ux.vnet.ibm.com>
Date:   Wed Jul 30 17:34:25 2008 +0530

    ext4: Rework the ext4_da_writepages
    
    With the changes below we reserve only the credits needed to insert a
    single extent resulting from one get_block call. That makes sure we
    don't take too many journal credits during writeout. We also no longer
    limit the number of pages to write: we loop through the dirty pages
    building the largest possible contiguous block request, then issue a
    single get_block request. We may get fewer blocks than we requested;
    if so, some of the buffer_heads end up unmapped and stay marked
    delayed, and the pages are redirtied later in the writepage callback
    via __mpage_writepage.
    
    
    TODO:
    a) Some pages are left locked (leaked without unlock), so fsstress
       deadlocks on lock_page.
    b) range_start is not handled correctly in the loop.
    
    Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@...ux.vnet.ibm.com>
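
To make the approach easier to review, here is a minimal userspace sketch
(plain C with made-up names, not kernel code) of the extent collection
described above: contiguous dirty blocks are merged into one run, and a
discontiguity flushes the run with a single mapping call, the way the
patch issues one get_block call per extent. In the patch the flush is
driven by the MPAGE_DA_EXTENT_TAIL return value rather than done inline
as here.

#include <stdio.h>
#include <stddef.h>

/* Illustrative stand-in for the in-kernel extent bookkeeping */
struct extent {
	long start;		/* first logical block of the extent */
	size_t nblocks;		/* number of contiguous blocks collected */
};

#define EXTENT_TAIL 0x01	/* mirrors MPAGE_DA_EXTENT_TAIL */

/*
 * Loosely mirrors mpage_add_bh_to_extent(): merge a dirty block into the
 * current extent, or report that the extent is complete so the caller
 * can map the whole run with one get_block call.
 */
static int add_block_to_extent(struct extent *ext, long logical)
{
	if (ext->nblocks == 0) {	/* first block starts the extent */
		ext->start = logical;
		ext->nblocks = 1;
		return 0;
	}
	if (logical == ext->start + (long)ext->nblocks) {
		ext->nblocks++;		/* contiguous: merge */
		return 0;
	}
	return EXTENT_TAIL;		/* discontiguous: flush first */
}

int main(void)
{
	long dirty[] = { 10, 11, 12, 20, 21, 30 };
	struct extent ext = { 0, 0 };
	size_t i;

	for (i = 0; i < sizeof(dirty) / sizeof(dirty[0]); i++) {
		if (add_block_to_extent(&ext, dirty[i]) == EXTENT_TAIL) {
			printf("map extent: start=%ld len=%zu (one get_block)\n",
			       ext.start, ext.nblocks);
			ext.nblocks = 0;	/* start a new extent */
			add_block_to_extent(&ext, dirty[i]);
		}
	}
	if (ext.nblocks)
		printf("map extent: start=%ld len=%zu (one get_block)\n",
		       ext.start, ext.nblocks);
	return 0;
}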

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 5a394c8..23f55cf 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -41,6 +41,8 @@
 #include "acl.h"
 #include "ext4_extents.h"
 
+#define MPAGE_DA_EXTENT_TAIL 0x01
+
 static inline int ext4_begin_ordered_truncate(struct inode *inode,
 					      loff_t new_size)
 {
@@ -1604,9 +1606,10 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd)
 		.get_block = mpd->get_block,
 		.use_writepage = 1,
 	};
-	int ret = 0, err, nr_pages, i;
+	int err, nr_pages, i;
 	unsigned long index, end;
 	struct pagevec pvec;
+	int written_pages = 0;
 
 	BUG_ON(mpd->next_page <= mpd->first_page);
 
@@ -1628,21 +1631,20 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd)
 			index++;
 
 			err = __mpage_writepage(page, mpd->wbc, &mpd_pp);
-
+			if (!err)
+				written_pages++;
 			/*
 			 * In error case, we have to continue because
 			 * remaining pages are still locked
 			 * XXX: unlock and re-dirty them?
 			 */
-			if (ret == 0)
-				ret = err;
 		}
 		pagevec_release(&pvec);
 	}
 	if (mpd_pp.bio)
 		mpage_bio_submit(WRITE, mpd_pp.bio);
 
-	return ret;
+	return written_pages;
 }
 
 /*
@@ -1745,10 +1747,10 @@ static inline void __unmap_underlying_blocks(struct inode *inode,
  * The function ignores errors ->get_block() returns, thus real
  * error handling is postponed to __mpage_writepage()
  */
-static void mpage_da_map_blocks(struct mpage_da_data *mpd)
+static int mpage_da_map_blocks(struct mpage_da_data *mpd)
 {
+	int err = 0;
 	struct buffer_head *lbh = &mpd->lbh;
-	int err = 0, remain = lbh->b_size;
 	sector_t next = lbh->b_blocknr;
 	struct buffer_head new;
 
@@ -1756,37 +1758,33 @@ static void mpage_da_map_blocks(struct mpage_da_data *mpd)
 	 * We consider only non-mapped and non-allocated blocks
 	 */
 	if (buffer_mapped(lbh) && !buffer_delay(lbh))
-		return;
-
-	while (remain) {
-		new.b_state = lbh->b_state;
-		new.b_blocknr = 0;
-		new.b_size = remain;
-		err = mpd->get_block(mpd->inode, next, &new, 1);
-		if (err) {
-			/*
-			 * Rather than implement own error handling
-			 * here, we just leave remaining blocks
-			 * unallocated and try again with ->writepage()
-			 */
-			break;
-		}
-		BUG_ON(new.b_size == 0);
-
-		if (buffer_new(&new))
-			__unmap_underlying_blocks(mpd->inode, &new);
+		return 0;
 
+	new.b_state = lbh->b_state;
+	new.b_blocknr = 0;
+	new.b_size = lbh->b_size;
+	err = mpd->get_block(mpd->inode, next, &new, 1);
+	if (err) {
 		/*
-		 * If blocks are delayed marked, we need to
-		 * put actual blocknr and drop delayed bit
+		 * We failed to do block allocation. All
+		 * the pages will be redirtied later in
+		 * ext4_da_writepage
 		 */
-		if (buffer_delay(lbh))
-			mpage_put_bnr_to_bhs(mpd, next, &new);
-
-		/* go for the remaining blocks */
-		next += new.b_size >> mpd->inode->i_blkbits;
-		remain -= new.b_size;
+		return 0;
 	}
+	BUG_ON(new.b_size == 0);
+
+	if (buffer_new(&new))
+		__unmap_underlying_blocks(mpd->inode, &new);
+
+	/*
+	 * If blocks are delayed marked, we need to
+	 * put actual blocknr and drop delayed bit
+	 */
+	if (buffer_delay(lbh))
+		mpage_put_bnr_to_bhs(mpd, next, &new);
+
+	return new.b_size;
 }
 
 #define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | (1 << BH_Delay))
@@ -1800,7 +1798,7 @@ static void mpage_da_map_blocks(struct mpage_da_data *mpd)
  *
  * the function is used to collect contig. blocks in same state
  */
-static void mpage_add_bh_to_extent(struct mpage_da_data *mpd,
+static int mpage_add_bh_to_extent(struct mpage_da_data *mpd,
 				   sector_t logical, struct buffer_head *bh)
 {
 	struct buffer_head *lbh = &mpd->lbh;
@@ -1815,7 +1813,7 @@ static void mpage_add_bh_to_extent(struct mpage_da_data *mpd,
 		lbh->b_blocknr = logical;
 		lbh->b_size = bh->b_size;
 		lbh->b_state = bh->b_state & BH_FLAGS;
-		return;
+		return 0;
 	}
 
 	/*
@@ -1823,21 +1821,14 @@ static void mpage_add_bh_to_extent(struct mpage_da_data *mpd,
 	 */
 	if (logical == next && (bh->b_state & BH_FLAGS) == lbh->b_state) {
 		lbh->b_size += bh->b_size;
-		return;
+		return 0;
 	}
 
 	/*
 	 * We couldn't merge the block to our extent, so we
 	 * need to flush current  extent and start new one
 	 */
-	mpage_da_map_blocks(mpd);
-
-	/*
-	 * Now start a new extent
-	 */
-	lbh->b_size = bh->b_size;
-	lbh->b_state = bh->b_state & BH_FLAGS;
-	lbh->b_blocknr = logical;
+	return MPAGE_DA_EXTENT_TAIL;
 }
 
 /*
@@ -1852,6 +1843,7 @@ static void mpage_add_bh_to_extent(struct mpage_da_data *mpd,
 static int __mpage_da_writepage(struct page *page,
 				struct writeback_control *wbc, void *data)
 {
+	int ret = 0;
 	struct mpage_da_data *mpd = data;
 	struct inode *inode = mpd->inode;
 	struct buffer_head *bh, *head, fake;
@@ -1866,12 +1858,13 @@ static int __mpage_da_writepage(struct page *page,
 		 * and start IO on them using __mpage_writepage()
 		 */
 		if (mpd->next_page != mpd->first_page) {
-			mpage_da_map_blocks(mpd);
-			mpage_da_submit_io(mpd);
+			unlock_page(page);
+			ret = MPAGE_DA_EXTENT_TAIL;
+			goto finish_extent;
 		}
 
 		/*
-		 * Start next extent of pages ...
+		 * Start extent of pages ...
 		 */
 		mpd->first_page = page->index;
 
@@ -1897,7 +1890,7 @@ static int __mpage_da_writepage(struct page *page,
 		bh->b_state = 0;
 		set_buffer_dirty(bh);
 		set_buffer_uptodate(bh);
-		mpage_add_bh_to_extent(mpd, logical, bh);
+		ret = mpage_add_bh_to_extent(mpd, logical, bh);
 	} else {
 		/*
 		 * Page with regular buffer heads, just add all dirty ones
@@ -1906,13 +1899,17 @@ static int __mpage_da_writepage(struct page *page,
 		bh = head;
 		do {
 			BUG_ON(buffer_locked(bh));
-			if (buffer_dirty(bh))
-				mpage_add_bh_to_extent(mpd, logical, bh);
+			if (buffer_dirty(bh)) {
+				ret = mpage_add_bh_to_extent(mpd, logical, bh);
+				if (ret == MPAGE_DA_EXTENT_TAIL)
+					goto finish_extent;
+			}
 			logical++;
 		} while ((bh = bh->b_this_page) != head);
 	}
 
-	return 0;
+finish_extent:
+	return ret;
 }
 
 /*
@@ -1941,8 +1938,8 @@ static int mpage_da_writepages(struct address_space *mapping,
 			       struct writeback_control *wbc,
 			       get_block_t get_block)
 {
+	int ret, to_write, written_pages;
 	struct mpage_da_data mpd;
-	int ret;
 
 	if (!get_block)
 		return generic_writepages(mapping, wbc);
@@ -1956,16 +1953,20 @@ static int mpage_da_writepages(struct address_space *mapping,
 	mpd.next_page = 0;
 	mpd.get_block = get_block;
 
+	to_write = wbc->nr_to_write;
+
 	ret = write_cache_pages(mapping, wbc, __mpage_da_writepage, &mpd);
 
 	/*
-	 * Handle last extent of pages
+	 * Allocate blocks for the dirty delayed
+	 * pages found
 	 */
 	if (mpd.next_page != mpd.first_page) {
 		mpage_da_map_blocks(&mpd);
-		mpage_da_submit_io(&mpd);
+		written_pages = mpage_da_submit_io(&mpd);
+		/* update nr_to_write correctly */
+		wbc->nr_to_write = to_write - written_pages;
 	}
-
 	return ret;
 }
 
@@ -2118,9 +2119,11 @@ static int ext4_da_writepage(struct page *page,
 			 * If we don't have mapping block we just ignore
 			 * them. We can also reach here via shrink_page_list
 			 */
+			start_pdflush = 1;
 			redirty_page_for_writepage(wbc, page);
 			unlock_page(page);
-			return 0;
+			ret = 0;
+			goto finish_ret;
 		}
 	} else {
 		/*
@@ -2143,9 +2146,11 @@ static int ext4_da_writepage(struct page *page,
 			/* check whether all are mapped and non delay */
 			if (walk_page_buffers(NULL, page_bufs, 0, len, NULL,
 						ext4_bh_unmapped_or_delay)) {
+				start_pdflush = 1;
 				redirty_page_for_writepage(wbc, page);
 				unlock_page(page);
-				return 0;
+				ret = 0;
+				goto finish_ret;
 			}
 		} else {
 			/*
@@ -2153,9 +2158,11 @@ static int ext4_da_writepage(struct page *page,
 			 * so just redity the page and unlock
 			 * and return
 			 */
+			start_pdflush = 1;
 			redirty_page_for_writepage(wbc, page);
 			unlock_page(page);
-			return 0;
+			ret = 0;
+			goto finish_ret;
 		}
 	}
 
@@ -2165,7 +2172,7 @@ static int ext4_da_writepage(struct page *page,
 		ret = block_write_full_page(page,
 						ext4_normal_get_block_write,
 						wbc);
-
+finish_ret:
 	return ret;
 }
 
@@ -2201,19 +2208,14 @@ static int ext4_da_writepages(struct address_space *mapping,
 	}
 
 	while (!ret && to_write) {
-		/*
-		 * set the max dirty pages could be write at a time
-		 * to fit into the reserved transaction credits
-		 */
-		if (wbc->nr_to_write > EXT4_MAX_WRITEBACK_PAGES)
-			wbc->nr_to_write = EXT4_MAX_WRITEBACK_PAGES;
 
 		/*
-		 * Estimate the worse case needed credits to write out
-		 * to_write pages
+		 * We write only a single extent per loop iteration,
+		 * so allocate the credits needed to write a single
+		 * extent. Journalled mode is not supported.
 		 */
-		needed_blocks = ext4_writepages_trans_blocks(inode,
-							     wbc->nr_to_write);
+		BUG_ON(ext4_should_journal_data(inode));
+		needed_blocks = EXT4_DATA_TRANS_BLOCKS(inode->i_sb);
 
 		/* start a new transaction*/
 		handle = ext4_journal_start(inode, needed_blocks);
@@ -2239,7 +2241,19 @@ static int ext4_da_writepages(struct address_space *mapping,
 		ret = mpage_da_writepages(mapping, wbc,
 						ext4_da_get_block_write);
 		ext4_journal_stop(handle);
-		if (wbc->nr_to_write) {
+		if (ret == MPAGE_DA_EXTENT_TAIL) {
+			/*
+			 * Got one extent; now try with
+			 * the rest of the pages.
+			 */
+			to_write += wbc->nr_to_write;
+			/*
+			 * Try to write from the start.
+			 * The pages already written will
+			 * not be tagged dirty.
+			 */
+			//wbc->range_start = range_start;
+		} else if (wbc->nr_to_write) {
 			/*
 			 * There is no more writeout needed
 			 * or we requested for a noblocking writeout
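
For reference, a hedged userspace sketch of the retry loop this patch
introduces in ext4_da_writepages(): each transaction reserves only the
credits needed for a single extent, and as long as mpage_da_writepages()
reports MPAGE_DA_EXTENT_TAIL there are more extents to write, so a new
small transaction is started and the loop runs again. write_one_extent()
and its page accounting are made up for illustration; the real code
brackets mpage_da_writepages() with ext4_journal_start() and
ext4_journal_stop().

#include <stdio.h>

#define EXTENT_TAIL 1	/* stands in for MPAGE_DA_EXTENT_TAIL */

/*
 * Hypothetical stand-in: pretend each call writes out one contiguous
 * extent of up to 4 pages and reports whether dirty pages remain.
 */
static int write_one_extent(int *pages_left)
{
	int extent_pages = *pages_left >= 4 ? 4 : *pages_left;

	*pages_left -= extent_pages;
	printf("transaction sized for ONE extent: wrote %d pages\n",
	       extent_pages);
	return *pages_left ? EXTENT_TAIL : 0;
}

int main(void)
{
	int pages_left = 10;
	int ret;

	/*
	 * Mirrors the loop in the patch: start a transaction sized for a
	 * single extent, write that extent, stop the transaction, and go
	 * around again while the writeout reports an extent tail.
	 */
	do {
		/* ext4_journal_start(inode, needed_blocks) would go here */
		ret = write_one_extent(&pages_left);
		/* ext4_journal_stop(handle) would go here */
	} while (ret == EXTENT_TAIL);

	return 0;
}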