Message-Id: <1218491404.6766.22.camel@mingming-laptop>
Date:	Mon, 11 Aug 2008 14:50:04 -0700
From:	Mingming Cao <cmm@...ibm.com>
To:	"Aneesh Kumar K.V" <aneesh.kumar@...ux.vnet.ibm.com>
Cc:	tytso@....edu, sandeen@...hat.com, linux-ext4@...r.kernel.org
Subject: Re: [PATCH -v2] ext4: Rework the ext4_da_writepages


On Mon, 2008-08-11 at 15:53 +0530, Aneesh Kumar K.V wrote:
> With the changes below we reserve only the credits needed to insert the one
> extent that results from a single get_block call. That makes sure we don't
> take too many journal credits during writeout. We also no longer limit the
> number of pages to write: we loop through the dirty pages building the
> largest possible contiguous block request and then issue a single get_block
> request. We may get fewer blocks than we requested; if so, some of the
> buffer_heads end up unmapped and remain marked delayed. Later, in the
> writepage callback via __mpage_writepage, we redirty those pages.
> 
> We should also not limit/throttle wbc->nr_to_write in the filesystem
> writepages callback, since that causes wrong behaviour in
> generic_sync_sb_inodes once wbc->nr_to_write drops to <= 0.
> 
> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@...ux.vnet.ibm.com>

Updated in patch queue.
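
To make the write-out strategy described above easier to follow, here is a
minimal, self-contained userspace sketch of the idea: accumulate the largest
contiguous run of delayed blocks, ask the allocator once for the whole run,
write out whatever gets mapped, and leave the unmapped tail dirty and delayed
for a later pass. Names such as fake_get_block and struct block are
hypothetical stand-ins; this is only an illustration, not the ext4 code.

#include <stdio.h>

#define NR_BLOCKS 16

struct block {
	int dirty;	/* needs writeout */
	int delayed;	/* space reserved, but no disk block assigned yet */
};

/* Pretend allocator: maps at most max_alloc blocks per call. */
static int fake_get_block(int requested, int max_alloc)
{
	return requested < max_alloc ? requested : max_alloc;
}

int main(void)
{
	struct block blk[NR_BLOCKS];
	int i, start, len, mapped;
	int max_alloc = 4;	/* allocator limit per call, for illustration */

	for (i = 0; i < NR_BLOCKS; i++) {
		blk[i].dirty = (i % 5 != 0);	/* some blocks dirty, some clean */
		blk[i].delayed = blk[i].dirty;
	}

	i = 0;
	while (i < NR_BLOCKS) {
		if (!blk[i].dirty || !blk[i].delayed) {
			i++;
			continue;
		}
		/* Build the largest contiguous delayed extent. */
		start = i;
		len = 0;
		while (i < NR_BLOCKS && blk[i].dirty && blk[i].delayed) {
			len++;
			i++;
		}
		/* One allocation request for the whole extent. */
		mapped = fake_get_block(len, max_alloc);
		printf("extent [%d..%d): requested %d, mapped %d\n",
		       start, start + len, len, mapped);
		/* Mapped blocks get written out and cleaned ... */
		for (int j = start; j < start + mapped; j++) {
			blk[j].delayed = 0;
			blk[j].dirty = 0;
		}
		/*
		 * ... while the unmapped tail stays dirty and delayed,
		 * i.e. it is effectively redirtied for a later pass.
		 */
	}
	return 0;
}

Compiled standalone, it just prints how each extent request splits between
mapped blocks and blocks left delayed.
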
> ---
>  fs/ext4/inode.c |  200 ++++++++++++++++++++++++++++--------------------------
>  1 files changed, 104 insertions(+), 96 deletions(-)
> 
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index 7a66bba..26e30ed 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -41,6 +41,8 @@
>  #include "acl.h"
>  #include "ext4_extents.h"
> 
> +#define MPAGE_DA_EXTENT_TAIL 0x01
> +
>  static inline int ext4_begin_ordered_truncate(struct inode *inode,
>  					      loff_t new_size)
>  {
> @@ -1604,11 +1606,13 @@ static void ext4_da_page_release_reservation(struct page *page,
>  	unsigned long first_page, next_page;	/* extent of pages */
>  	get_block_t *get_block;
>  	struct writeback_control *wbc;
> +	int io_done;
> +	long pages_written;
>  };
> 
>  /*
>   * mpage_da_submit_io - walks through extent of pages and try to write
> - * them with __mpage_writepage()
> + * them with writepage() call back
>   *
>   * @mpd->inode: inode
>   * @mpd->first_page: first page of the extent
> @@ -1623,18 +1627,11 @@ static void ext4_da_page_release_reservation(struct page *page,
>  static int mpage_da_submit_io(struct mpage_da_data *mpd)
>  {
>  	struct address_space *mapping = mpd->inode->i_mapping;
> -	struct mpage_data mpd_pp = {
> -		.bio = NULL,
> -		.last_block_in_bio = 0,
> -		.get_block = mpd->get_block,
> -		.use_writepage = 1,
> -	};
>  	int ret = 0, err, nr_pages, i;
>  	unsigned long index, end;
>  	struct pagevec pvec;
> 
>  	BUG_ON(mpd->next_page <= mpd->first_page);
> -
>  	pagevec_init(&pvec, 0);
>  	index = mpd->first_page;
>  	end = mpd->next_page - 1;
> @@ -1652,8 +1649,9 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd)
>  				break;
>  			index++;
> 
> -			err = __mpage_writepage(page, mpd->wbc, &mpd_pp);
> -
> +			err = mapping->a_ops->writepage(page, mpd->wbc);
> +			if (!err)
> +				mpd->pages_written++;
>  			/*
>  			 * In error case, we have to continue because
>  			 * remaining pages are still locked
> @@ -1664,9 +1662,6 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd)
>  		}
>  		pagevec_release(&pvec);
>  	}
> -	if (mpd_pp.bio)
> -		mpage_bio_submit(WRITE, mpd_pp.bio);
> -
>  	return ret;
>  }
> 
> @@ -1689,7 +1684,7 @@ static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical,
>  	int blocks = exbh->b_size >> inode->i_blkbits;
>  	sector_t pblock = exbh->b_blocknr, cur_logical;
>  	struct buffer_head *head, *bh;
> -	unsigned long index, end;
> +	pgoff_t index, end;
>  	struct pagevec pvec;
>  	int nr_pages, i;
> 
> @@ -1774,13 +1769,11 @@ static inline void __unmap_underlying_blocks(struct inode *inode,
>   *
>   * The function skips space we know is already mapped to disk blocks.
>   *
> - * The function ignores errors ->get_block() returns, thus real
> - * error handling is postponed to __mpage_writepage()
>   */
>  static void mpage_da_map_blocks(struct mpage_da_data *mpd)
>  {
> +	int err = 0;
>  	struct buffer_head *lbh = &mpd->lbh;
> -	int err = 0, remain = lbh->b_size;
>  	sector_t next = lbh->b_blocknr;
>  	struct buffer_head new;
> 
> @@ -1790,35 +1783,32 @@ static void mpage_da_map_blocks(struct mpage_da_data *mpd)
>  	if (buffer_mapped(lbh) && !buffer_delay(lbh))
>  		return;
> 
> -	while (remain) {
> -		new.b_state = lbh->b_state;
> -		new.b_blocknr = 0;
> -		new.b_size = remain;
> -		err = mpd->get_block(mpd->inode, next, &new, 1);
> -		if (err) {
> -			/*
> -			 * Rather than implement own error handling
> -			 * here, we just leave remaining blocks
> -			 * unallocated and try again with ->writepage()
> -			 */
> -			break;
> -		}
> -		BUG_ON(new.b_size == 0);
> +	new.b_state = lbh->b_state;
> +	new.b_blocknr = 0;
> +	new.b_size = lbh->b_size;
> +
> +	/*
> +	 * If we didn't accumulate anything
> +	 * to write simply return
> +	 */
> +	if (!new.b_size)
> +		return;
> +	err = mpd->get_block(mpd->inode, next, &new, 1);
> +	if (err)
> +		return;
> +	BUG_ON(new.b_size == 0);
> 
> -		if (buffer_new(&new))
> -			__unmap_underlying_blocks(mpd->inode, &new);
> +	if (buffer_new(&new))
> +		__unmap_underlying_blocks(mpd->inode, &new);
> 
> -		/*
> -		 * If blocks are delayed marked, we need to
> -		 * put actual blocknr and drop delayed bit
> -		 */
> -		if (buffer_delay(lbh) || buffer_unwritten(lbh))
> -			mpage_put_bnr_to_bhs(mpd, next, &new);
> +	/*
> +	 * If blocks are delayed marked, we need to
> +	 * put actual blocknr and drop delayed bit
> +	 */
> +	if (buffer_delay(lbh) || buffer_unwritten(lbh))
> +		mpage_put_bnr_to_bhs(mpd, next, &new);
> 
> -		/* go for the remaining blocks */
> -		next += new.b_size >> mpd->inode->i_blkbits;
> -		remain -= new.b_size;
> -	}
> +	return;
>  }
> 
>  #define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | \
> @@ -1864,13 +1854,9 @@ static void mpage_add_bh_to_extent(struct mpage_da_data *mpd,
>  	 * need to flush current  extent and start new one
>  	 */
>  	mpage_da_map_blocks(mpd);
> -
> -	/*
> -	 * Now start a new extent
> -	 */
> -	lbh->b_size = bh->b_size;
> -	lbh->b_state = bh->b_state & BH_FLAGS;
> -	lbh->b_blocknr = logical;
> +	mpage_da_submit_io(mpd);
> +	mpd->io_done = 1;
> +	return;
>  }
> 
>  /*
> @@ -1890,17 +1876,35 @@ static int __mpage_da_writepage(struct page *page,
>  	struct buffer_head *bh, *head, fake;
>  	sector_t logical;
> 
> +	if (mpd->io_done) {
> +		/*
> +		 * Rest of the pages in the page_vec:
> +		 * redirty them and skip them. We will
> +		 * try to write them again after
> +		 * starting a new transaction
> +		 */
> +		redirty_page_for_writepage(wbc, page);
> +		unlock_page(page);
> +		return MPAGE_DA_EXTENT_TAIL;
> +	}
>  	/*
>  	 * Can we merge this page to current extent?
>  	 */
>  	if (mpd->next_page != page->index) {
>  		/*
>  		 * Nope, we can't. So, we map non-allocated blocks
> -		 * and start IO on them using __mpage_writepage()
> +		 * and start IO on them using writepage()
>  		 */
>  		if (mpd->next_page != mpd->first_page) {
>  			mpage_da_map_blocks(mpd);
>  			mpage_da_submit_io(mpd);
> +			/*
> +			 * skip rest of the page in the page_vec
> +			 */
> +			mpd->io_done = 1;
> +			redirty_page_for_writepage(wbc, page);
> +			unlock_page(page);
> +			return MPAGE_DA_EXTENT_TAIL;
>  		}
> 
>  		/*
> @@ -1931,6 +1935,8 @@ static int __mpage_da_writepage(struct page *page,
>  		set_buffer_dirty(bh);
>  		set_buffer_uptodate(bh);
>  		mpage_add_bh_to_extent(mpd, logical, bh);
> +		if (mpd->io_done)
> +			return MPAGE_DA_EXTENT_TAIL;
>  	} else {
>  		/*
>  		 * Page with regular buffer heads, just add all dirty ones
> @@ -1939,8 +1945,12 @@ static int __mpage_da_writepage(struct page *page,
>  		bh = head;
>  		do {
>  			BUG_ON(buffer_locked(bh));
> -			if (buffer_dirty(bh))
> +			if (buffer_dirty(bh) &&
> +				(!buffer_mapped(bh) || buffer_delay(bh))) {
>  				mpage_add_bh_to_extent(mpd, logical, bh);
> +				if (mpd->io_done)
> +					return MPAGE_DA_EXTENT_TAIL;
> +			}
>  			logical++;
>  		} while ((bh = bh->b_this_page) != head);
>  	}
> @@ -1959,22 +1969,13 @@ static int __mpage_da_writepage(struct page *page,
>   *
>   * This is a library function, which implements the writepages()
>   * address_space_operation.
> - *
> - * In order to avoid duplication of logic that deals with partial pages,
> - * multiple bio per page, etc, we find non-allocated blocks, allocate
> - * them with minimal calls to ->get_block() and re-use __mpage_writepage()
> - *
> - * It's important that we call __mpage_writepage() only once for each
> - * involved page, otherwise we'd have to implement more complicated logic
> - * to deal with pages w/o PG_lock or w/ PG_writeback and so on.
> - *
> - * See comments to mpage_writepages()
>   */
>  static int mpage_da_writepages(struct address_space *mapping,
>  			       struct writeback_control *wbc,
>  			       get_block_t get_block)
>  {
>  	struct mpage_da_data mpd;
> +	long to_write;
>  	int ret;
> 
>  	if (!get_block)
> @@ -1988,17 +1989,22 @@ static int mpage_da_writepages(struct address_space *mapping,
>  	mpd.first_page = 0;
>  	mpd.next_page = 0;
>  	mpd.get_block = get_block;
> +	mpd.io_done = 0;
> +	mpd.pages_written = 0;
> +
> +	to_write = wbc->nr_to_write;
> 
>  	ret = write_cache_pages(mapping, wbc, __mpage_da_writepage, &mpd);
> 
>  	/*
>  	 * Handle last extent of pages
>  	 */
> -	if (mpd.next_page != mpd.first_page) {
> +	if (!mpd.io_done && mpd.next_page != mpd.first_page) {
>  		mpage_da_map_blocks(&mpd);
>  		mpage_da_submit_io(&mpd);
>  	}
> 
> +	wbc->nr_to_write = to_write - mpd.pages_written;
>  	return ret;
>  }
> 
> @@ -2210,10 +2216,7 @@ static int ext4_da_writepages(struct address_space *mapping,
>  	int ret = 0;
>  	long to_write;
>  	loff_t range_start = 0;
> -	int blocks_per_page = PAGE_CACHE_SIZE >> inode->i_blkbits;
> -	int max_credit_blocks = ext4_journal_max_transaction_buffers(inode);
> -	int need_credits_per_page =  ext4_writepages_trans_blocks(inode, 1);
> -	int max_writeback_pages = (max_credit_blocks / blocks_per_page) / need_credits_per_page;
> +	long pages_skipped = 0;
> 
>  	/*
>  	 * No pages to write? This is mainly a kludge to avoid starting
> @@ -2223,45 +2226,35 @@ static int ext4_da_writepages(struct address_space *mapping,
>  	if (!mapping->nrpages || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
>  		return 0;
> 
> -	if (wbc->nr_to_write > mapping->nrpages)
> -		wbc->nr_to_write = mapping->nrpages;
> -
> -	to_write = wbc->nr_to_write;
> -
> -	if (!wbc->range_cyclic) {
> +	if (!wbc->range_cyclic)
>  		/*
>  		 * If range_cyclic is not set force range_cont
>  		 * and save the old writeback_index
>  		 */
>  		wbc->range_cont = 1;
> -		range_start =  wbc->range_start;
> -	}
> 
> -	while (!ret && to_write) {
> -		/*
> -		 * set the max dirty pages could be write at a time
> -		 * to fit into the reserved transaction credits
> -		 */
> -		if (wbc->nr_to_write > max_writeback_pages)
> -			wbc->nr_to_write = max_writeback_pages;
> +	range_start =  wbc->range_start;
> +	pages_skipped = wbc->pages_skipped;
> +
> +restart_loop:
> +	to_write = wbc->nr_to_write;
> +	while (!ret && to_write > 0) {
> 
>  		/*
> -		 * Estimate the worse case needed credits to write out
> -		 * to_write pages
> +		 * We insert one extent at a time, so we need
> +		 * the credits for a single extent allocation.
> +		 * Journalled mode is currently not supported
> +		 * by delalloc.
>  		 */
> -		needed_blocks = ext4_writepages_trans_blocks(inode,
> -							     wbc->nr_to_write);
> -		while (needed_blocks > max_credit_blocks) {
> -			wbc->nr_to_write--;
> -			needed_blocks = ext4_writepages_trans_blocks(inode,
> -							     wbc->nr_to_write);
> -		}
> +		BUG_ON(ext4_should_journal_data(inode));
> +		needed_blocks = EXT4_DATA_TRANS_BLOCKS(inode->i_sb);
> +
>  		/* start a new transaction*/
>  		handle = ext4_journal_start(inode, needed_blocks);
>  		if (IS_ERR(handle)) {
>  			ret = PTR_ERR(handle);
> -			printk(KERN_EMERG "ext4_da_writepages: jbd2_start: "
> -			       "%ld pages, ino %lu; err %d\n",
> +			printk(KERN_EMERG "%s: jbd2_start: "
> +			       "%ld pages, ino %lu; err %d\n", __func__,
>  			       wbc->nr_to_write, inode->i_ino, ret);
>  			dump_stack();
>  			goto out_writepages;
> @@ -2284,7 +2277,14 @@ static int ext4_da_writepages(struct address_space *mapping,
>  		ret = mpage_da_writepages(mapping, wbc,
>  						ext4_da_get_block_write);
>  		ext4_journal_stop(handle);
> -		if (wbc->nr_to_write) {
> +		if (ret == MPAGE_DA_EXTENT_TAIL) {
> +			/*
> +			 * got one extent now try with
> +			 * rest of the pages
> +			 */
> +			to_write += wbc->nr_to_write;
> +			ret = 0;
> +		} else if (wbc->nr_to_write) {
>  			/*
>  			 * There is no more writeout needed
>  			 * or we requested for a noblocking writeout
> @@ -2296,10 +2296,18 @@ static int ext4_da_writepages(struct address_space *mapping,
>  		wbc->nr_to_write = to_write;
>  	}
> 
> +	if (wbc->range_cont && (pages_skipped != wbc->pages_skipped)) {
> +		/* We skipped pages in this loop */
> +		wbc->range_start = range_start;
> +		wbc->nr_to_write = to_write +
> +					wbc->pages_skipped - pages_skipped;
> +		wbc->pages_skipped = pages_skipped;
> +		goto restart_loop;
> +	}
> +
>  out_writepages:
>  	wbc->nr_to_write = to_write;
> -	if (range_start)
> -		wbc->range_start = range_start;
> +	wbc->range_start = range_start;
>  	return ret;
>  }
> 
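
For what it's worth, the control flow of the reworked outer loop in
ext4_da_writepages (reserve journal credits for a single extent per
transaction, then retry while the nr_to_write budget lasts) can be modelled
in userspace roughly as below. fake_da_writepages, EXTENT_TAIL and the page
counts are made-up stand-ins, not the kernel interfaces.

#include <stdio.h>

#define EXTENT_TAIL 1	/* stand-in for MPAGE_DA_EXTENT_TAIL */

static int total_dirty = 10;	/* pretend number of dirty pages */

/* Pretend writepages pass: writes at most 3 pages, then reports a tail. */
static int fake_da_writepages(long *budget)
{
	long written = total_dirty < 3 ? total_dirty : 3;

	total_dirty -= written;
	*budget -= written;
	return total_dirty ? EXTENT_TAIL : 0;
}

int main(void)
{
	long budget = 8;	/* plays the role of wbc->nr_to_write */
	int ret = 0;

	while (!ret && budget > 0) {
		/* "start a transaction" with credits for one extent only */
		ret = fake_da_writepages(&budget);
		/* "stop the transaction" */
		if (ret == EXTENT_TAIL)
			ret = 0;	/* got one extent, retry with the rest */
		printf("budget left %ld, dirty pages left %d\n",
		       budget, total_dirty);
	}
	return 0;
}

The loop keeps going one extent at a time until the budget is spent or
nothing is left to write, which mirrors how the patch replenishes to_write
after each MPAGE_DA_EXTENT_TAIL return.
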
