[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <1218491404.6766.22.camel@mingming-laptop>
Date: Mon, 11 Aug 2008 14:50:04 -0700
From: Mingming Cao <cmm@...ibm.com>
To: "Aneesh Kumar K.V" <aneesh.kumar@...ux.vnet.ibm.com>
Cc: tytso@....edu, sandeen@...hat.com, linux-ext4@...r.kernel.org
Subject: Re: [PATCH -v2] ext4: Rework the ext4_da_writepages
在 2008-08-11一的 15:53 +0530,Aneesh Kumar K.V写道:
> With the below changes we reserve credit needed to insert only one extent
> resulting from a call to single get_block. That make sure we don't take
> too much journal credits during writeout. We also don't limit the pages
> to write. That means we loop through the dirty pages building largest
> possible contiguous block request. Then we issue a single get_block request.
> We may get less block that we requested. If so we would end up not mapping
> some of the buffer_heads. That means those buffer_heads are still marked delay.
> Later in the writepage callback via __mpage_writepage we redirty those pages.
>
> We should also not limit/throttle wbc->nr_to_write in the filesystem writepages
> callback. That cause wrong behaviour in generic_sync_sb_inodes caused by
> wbc->nr_to_write being <= 0
>
> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@...ux.vnet.ibm.com>
Updated in patch queue.
> ---
> fs/ext4/inode.c | 200 ++++++++++++++++++++++++++++--------------------------
> 1 files changed, 104 insertions(+), 96 deletions(-)
>
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index 7a66bba..26e30ed 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -41,6 +41,8 @@
> #include "acl.h"
> #include "ext4_extents.h"
>
> +#define MPAGE_DA_EXTENT_TAIL 0x01
> +
> static inline int ext4_begin_ordered_truncate(struct inode *inode,
> loff_t new_size)
> {
> @@ -1604,11 +1606,13 @@ static void ext4_da_page_release_reservation(struct page *page,
> unsigned long first_page, next_page; /* extent of pages */
> get_block_t *get_block;
> struct writeback_control *wbc;
> + int io_done;
> + long pages_written;
> };
>
> /*
> * mpage_da_submit_io - walks through extent of pages and try to write
> - * them with __mpage_writepage()
> + * them with writepage() call back
> *
> * @mpd->inode: inode
> * @mpd->first_page: first page of the extent
> @@ -1623,18 +1627,11 @@ static void ext4_da_page_release_reservation(struct page *page,
> static int mpage_da_submit_io(struct mpage_da_data *mpd)
> {
> struct address_space *mapping = mpd->inode->i_mapping;
> - struct mpage_data mpd_pp = {
> - .bio = NULL,
> - .last_block_in_bio = 0,
> - .get_block = mpd->get_block,
> - .use_writepage = 1,
> - };
> int ret = 0, err, nr_pages, i;
> unsigned long index, end;
> struct pagevec pvec;
>
> BUG_ON(mpd->next_page <= mpd->first_page);
> -
> pagevec_init(&pvec, 0);
> index = mpd->first_page;
> end = mpd->next_page - 1;
> @@ -1652,8 +1649,9 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd)
> break;
> index++;
>
> - err = __mpage_writepage(page, mpd->wbc, &mpd_pp);
> -
> + err = mapping->a_ops->writepage(page, mpd->wbc);
> + if (!err)
> + mpd->pages_written++;
> /*
> * In error case, we have to continue because
> * remaining pages are still locked
> @@ -1664,9 +1662,6 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd)
> }
> pagevec_release(&pvec);
> }
> - if (mpd_pp.bio)
> - mpage_bio_submit(WRITE, mpd_pp.bio);
> -
> return ret;
> }
>
> @@ -1689,7 +1684,7 @@ static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical,
> int blocks = exbh->b_size >> inode->i_blkbits;
> sector_t pblock = exbh->b_blocknr, cur_logical;
> struct buffer_head *head, *bh;
> - unsigned long index, end;
> + pgoff_t index, end;
> struct pagevec pvec;
> int nr_pages, i;
>
> @@ -1774,13 +1769,11 @@ static inline void __unmap_underlying_blocks(struct inode *inode,
> *
> * The function skips space we know is already mapped to disk blocks.
> *
> - * The function ignores errors ->get_block() returns, thus real
> - * error handling is postponed to __mpage_writepage()
> */
> static void mpage_da_map_blocks(struct mpage_da_data *mpd)
> {
> + int err = 0;
> struct buffer_head *lbh = &mpd->lbh;
> - int err = 0, remain = lbh->b_size;
> sector_t next = lbh->b_blocknr;
> struct buffer_head new;
>
> @@ -1790,35 +1783,32 @@ static void mpage_da_map_blocks(struct mpage_da_data *mpd)
> if (buffer_mapped(lbh) && !buffer_delay(lbh))
> return;
>
> - while (remain) {
> - new.b_state = lbh->b_state;
> - new.b_blocknr = 0;
> - new.b_size = remain;
> - err = mpd->get_block(mpd->inode, next, &new, 1);
> - if (err) {
> - /*
> - * Rather than implement own error handling
> - * here, we just leave remaining blocks
> - * unallocated and try again with ->writepage()
> - */
> - break;
> - }
> - BUG_ON(new.b_size == 0);
> + new.b_state = lbh->b_state;
> + new.b_blocknr = 0;
> + new.b_size = lbh->b_size;
> +
> + /*
> + * If we didn't accumulate anything
> + * to write simply return
> + */
> + if (!new.b_size)
> + return;
> + err = mpd->get_block(mpd->inode, next, &new, 1);
> + if (err)
> + return;
> + BUG_ON(new.b_size == 0);
>
> - if (buffer_new(&new))
> - __unmap_underlying_blocks(mpd->inode, &new);
> + if (buffer_new(&new))
> + __unmap_underlying_blocks(mpd->inode, &new);
>
> - /*
> - * If blocks are delayed marked, we need to
> - * put actual blocknr and drop delayed bit
> - */
> - if (buffer_delay(lbh) || buffer_unwritten(lbh))
> - mpage_put_bnr_to_bhs(mpd, next, &new);
> + /*
> + * If blocks are delayed marked, we need to
> + * put actual blocknr and drop delayed bit
> + */
> + if (buffer_delay(lbh) || buffer_unwritten(lbh))
> + mpage_put_bnr_to_bhs(mpd, next, &new);
>
> - /* go for the remaining blocks */
> - next += new.b_size >> mpd->inode->i_blkbits;
> - remain -= new.b_size;
> - }
> + return;
> }
>
> #define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | \
> @@ -1864,13 +1854,9 @@ static void mpage_add_bh_to_extent(struct mpage_da_data *mpd,
> * need to flush current extent and start new one
> */
> mpage_da_map_blocks(mpd);
> -
> - /*
> - * Now start a new extent
> - */
> - lbh->b_size = bh->b_size;
> - lbh->b_state = bh->b_state & BH_FLAGS;
> - lbh->b_blocknr = logical;
> + mpage_da_submit_io(mpd);
> + mpd->io_done = 1;
> + return;
> }
>
> /*
> @@ -1890,17 +1876,35 @@ static int __mpage_da_writepage(struct page *page,
> struct buffer_head *bh, *head, fake;
> sector_t logical;
>
> + if (mpd->io_done) {
> + /*
> + * Rest of the page in the page_vec
> + * redirty then and skip then. We will
> + * try to to write them again after
> + * starting a new transaction
> + */
> + redirty_page_for_writepage(wbc, page);
> + unlock_page(page);
> + return MPAGE_DA_EXTENT_TAIL;
> + }
> /*
> * Can we merge this page to current extent?
> */
> if (mpd->next_page != page->index) {
> /*
> * Nope, we can't. So, we map non-allocated blocks
> - * and start IO on them using __mpage_writepage()
> + * and start IO on them using writepage()
> */
> if (mpd->next_page != mpd->first_page) {
> mpage_da_map_blocks(mpd);
> mpage_da_submit_io(mpd);
> + /*
> + * skip rest of the page in the page_vec
> + */
> + mpd->io_done = 1;
> + redirty_page_for_writepage(wbc, page);
> + unlock_page(page);
> + return MPAGE_DA_EXTENT_TAIL;
> }
>
> /*
> @@ -1931,6 +1935,8 @@ static int __mpage_da_writepage(struct page *page,
> set_buffer_dirty(bh);
> set_buffer_uptodate(bh);
> mpage_add_bh_to_extent(mpd, logical, bh);
> + if (mpd->io_done)
> + return MPAGE_DA_EXTENT_TAIL;
> } else {
> /*
> * Page with regular buffer heads, just add all dirty ones
> @@ -1939,8 +1945,12 @@ static int __mpage_da_writepage(struct page *page,
> bh = head;
> do {
> BUG_ON(buffer_locked(bh));
> - if (buffer_dirty(bh))
> + if (buffer_dirty(bh) &&
> + (!buffer_mapped(bh) || buffer_delay(bh))) {
> mpage_add_bh_to_extent(mpd, logical, bh);
> + if (mpd->io_done)
> + return MPAGE_DA_EXTENT_TAIL;
> + }
> logical++;
> } while ((bh = bh->b_this_page) != head);
> }
> @@ -1959,22 +1969,13 @@ static int __mpage_da_writepage(struct page *page,
> *
> * This is a library function, which implements the writepages()
> * address_space_operation.
> - *
> - * In order to avoid duplication of logic that deals with partial pages,
> - * multiple bio per page, etc, we find non-allocated blocks, allocate
> - * them with minimal calls to ->get_block() and re-use __mpage_writepage()
> - *
> - * It's important that we call __mpage_writepage() only once for each
> - * involved page, otherwise we'd have to implement more complicated logic
> - * to deal with pages w/o PG_lock or w/ PG_writeback and so on.
> - *
> - * See comments to mpage_writepages()
> */
> static int mpage_da_writepages(struct address_space *mapping,
> struct writeback_control *wbc,
> get_block_t get_block)
> {
> struct mpage_da_data mpd;
> + long to_write;
> int ret;
>
> if (!get_block)
> @@ -1988,17 +1989,22 @@ static int mpage_da_writepages(struct address_space *mapping,
> mpd.first_page = 0;
> mpd.next_page = 0;
> mpd.get_block = get_block;
> + mpd.io_done = 0;
> + mpd.pages_written = 0;
> +
> + to_write = wbc->nr_to_write;
>
> ret = write_cache_pages(mapping, wbc, __mpage_da_writepage, &mpd);
>
> /*
> * Handle last extent of pages
> */
> - if (mpd.next_page != mpd.first_page) {
> + if (!mpd.io_done && mpd.next_page != mpd.first_page) {
> mpage_da_map_blocks(&mpd);
> mpage_da_submit_io(&mpd);
> }
>
> + wbc->nr_to_write = to_write - mpd.pages_written;
> return ret;
> }
>
> @@ -2210,10 +2216,7 @@ static int ext4_da_writepages(struct address_space *mapping,
> int ret = 0;
> long to_write;
> loff_t range_start = 0;
> - int blocks_per_page = PAGE_CACHE_SIZE >> inode->i_blkbits;
> - int max_credit_blocks = ext4_journal_max_transaction_buffers(inode);
> - int need_credits_per_page = ext4_writepages_trans_blocks(inode, 1);
> - int max_writeback_pages = (max_credit_blocks / blocks_per_page) / need_credits_per_page;
> + long pages_skipped = 0;
>
> /*
> * No pages to write? This is mainly a kludge to avoid starting
> @@ -2223,45 +2226,35 @@ static int ext4_da_writepages(struct address_space *mapping,
> if (!mapping->nrpages || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
> return 0;
>
> - if (wbc->nr_to_write > mapping->nrpages)
> - wbc->nr_to_write = mapping->nrpages;
> -
> - to_write = wbc->nr_to_write;
> -
> - if (!wbc->range_cyclic) {
> + if (!wbc->range_cyclic)
> /*
> * If range_cyclic is not set force range_cont
> * and save the old writeback_index
> */
> wbc->range_cont = 1;
> - range_start = wbc->range_start;
> - }
>
> - while (!ret && to_write) {
> - /*
> - * set the max dirty pages could be write at a time
> - * to fit into the reserved transaction credits
> - */
> - if (wbc->nr_to_write > max_writeback_pages)
> - wbc->nr_to_write = max_writeback_pages;
> + range_start = wbc->range_start;
> + pages_skipped = wbc->pages_skipped;
> +
> +restart_loop:
> + to_write = wbc->nr_to_write;
> + while (!ret && to_write > 0) {
>
> /*
> - * Estimate the worse case needed credits to write out
> - * to_write pages
> + * we insert one extent at a time. So we need
> + * credit needed for single extent allocation.
> + * journalled mode is currently not supported
> + * by delalloc
> */
> - needed_blocks = ext4_writepages_trans_blocks(inode,
> - wbc->nr_to_write);
> - while (needed_blocks > max_credit_blocks) {
> - wbc->nr_to_write--;
> - needed_blocks = ext4_writepages_trans_blocks(inode,
> - wbc->nr_to_write);
> - }
> + BUG_ON(ext4_should_journal_data(inode));
> + needed_blocks = EXT4_DATA_TRANS_BLOCKS(inode->i_sb);
> +
> /* start a new transaction*/
> handle = ext4_journal_start(inode, needed_blocks);
> if (IS_ERR(handle)) {
> ret = PTR_ERR(handle);
> - printk(KERN_EMERG "ext4_da_writepages: jbd2_start: "
> - "%ld pages, ino %lu; err %d\n",
> + printk(KERN_EMERG "%s: jbd2_start: "
> + "%ld pages, ino %lu; err %d\n", __func__,
> wbc->nr_to_write, inode->i_ino, ret);
> dump_stack();
> goto out_writepages;
> @@ -2284,7 +2277,14 @@ static int ext4_da_writepages(struct address_space *mapping,
> ret = mpage_da_writepages(mapping, wbc,
> ext4_da_get_block_write);
> ext4_journal_stop(handle);
> - if (wbc->nr_to_write) {
> + if (ret == MPAGE_DA_EXTENT_TAIL) {
> + /*
> + * got one extent now try with
> + * rest of the pages
> + */
> + to_write += wbc->nr_to_write;
> + ret = 0;
> + } else if (wbc->nr_to_write) {
> /*
> * There is no more writeout needed
> * or we requested for a noblocking writeout
> @@ -2296,10 +2296,18 @@ static int ext4_da_writepages(struct address_space *mapping,
> wbc->nr_to_write = to_write;
> }
>
> + if (wbc->range_cont && (pages_skipped != wbc->pages_skipped)) {
> + /* We skipped pages in this loop */
> + wbc->range_start = range_start;
> + wbc->nr_to_write = to_write +
> + wbc->pages_skipped - pages_skipped;
> + wbc->pages_skipped = pages_skipped;
> + goto restart_loop;
> + }
> +
> out_writepages:
> wbc->nr_to_write = to_write;
> - if (range_start)
> - wbc->range_start = range_start;
> + wbc->range_start = range_start;
> return ret;
> }
>
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Powered by blists - more mailing lists