Message-ID: <20130508034857.GA32131@gmail.com>
Date: Wed, 8 May 2013 11:48:57 +0800
From: Zheng Liu <gnehzuil.liu@...il.com>
To: Jan Kara <jack@...e.cz>
Cc: Ted Tso <tytso@....edu>, linux-ext4@...r.kernel.org
Subject: Re: [PATCH 18/29] ext4: Restructure writeback path
On Mon, Apr 08, 2013 at 11:32:23PM +0200, Jan Kara wrote:
> There are two issues with the current writeback path in ext4. For one,
> when blocksize < pagesize we don't necessarily map complete pages, and
> thus may not do any writeback in a given iteration. We always map some
> blocks, though, so we will eventually finish mapping the page; but if
> writeback races with other operations on the file, forward progress is
> not really guaranteed. The second problem is that the current code
> structure makes it
> hard to associate all the bios to some range of pages with one io_end
> structure so that unwritten extents can be converted after all the bios
> are finished. This will be especially difficult later when io_end will
> be associated with reserved transaction handle.
>
> We restructure the writeback path into a relatively simple loop which
> first prepares an extent of pages, then maps one or more extents so
> that no page is left partially mapped, and once a page is fully mapped
> it is submitted for IO. We keep all the mapping and IO submission
> information in the mpage_da_data structure to somewhat reduce stack
> usage. The resulting code is somewhat shorter than the old one and
> hopefully also easier to read.
>
> Signed-off-by: Jan Kara <jack@...e.cz>
One nit below. Otherwise the patch looks good to me.
Reviewed-by: Zheng Liu <wenqing.lz@...bao.com>
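For anyone else following the thread, the restructured main loop boils
down to roughly the following (my simplified sketch of the flow in this
patch; error handling and the cyclic-retry logic are elided, names as in
the patch):

	while (!done && mpd.first_page <= mpd.last_page) {
		/* each extent of pages gets its own io_end */
		mpd.io_submit.io_end = ext4_init_io_end(inode, GFP_KERNEL);
		handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE,
					    needed_blocks);

		/* find & lock a contiguous range of dirty pages */
		ret = mpage_prepare_extent_to_map(&mpd);
		if (!ret) {
			if (mpd.map.m_len)
				/* map it so no page stays partially mapped */
				ret = mpage_map_and_submit_extent(handle, &mpd);
			else
				/* nothing left that needs mapping */
				done = true;
		}
		ext4_journal_stop(handle);

		ext4_io_submit(&mpd.io_submit);	/* submit prepared bio */
		mpage_release_unused_pages(&mpd, false);
		ext4_put_io_end(mpd.io_submit.io_end);	/* drop init ref */
	}

which is indeed much easier to follow than the old
write_cache_pages_da() dance.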
> ---
> fs/ext4/ext4.h | 15 -
> fs/ext4/inode.c | 978 +++++++++++++++++++++----------------------
> include/trace/events/ext4.h | 64 ++-
> 3 files changed, 508 insertions(+), 549 deletions(-)
...
> /*
> - * write_cache_pages_da - walk the list of dirty pages of the given
> - * address space and accumulate pages that need writing, and call
> - * mpage_da_map_and_submit to map a single contiguous memory region
> - * and then write them.
> + * mpage_prepare_extent_to_map - find & lock contiguous range of dirty pages
> + * and underlying extent to map
> + *
> + * @mpd - where to look for pages
> + *
> + * Walk dirty pages in the mapping while they are contiguous and lock them.
> + * As long as pages are fully mapped, submit them for IO. When we find a page
> + * which isn't fully mapped we start accumulating an extent of buffers
> + * underlying these pages that needs mapping (formed by either delayed or
> + * unwritten buffers). The extent found is returned in the @mpd structure
> + * (starting at mpd->lblk with length mpd->len blocks).
> */
> -static int write_cache_pages_da(handle_t *handle,
> - struct address_space *mapping,
> - struct writeback_control *wbc,
> - struct mpage_da_data *mpd,
> - pgoff_t *done_index)
> +static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd)
> {
> - struct buffer_head *bh, *head;
> - struct inode *inode = mapping->host;
> - struct pagevec pvec;
> - unsigned int nr_pages;
> - sector_t logical;
> - pgoff_t index, end;
> - long nr_to_write = wbc->nr_to_write;
> - int i, tag, ret = 0;
> -
> - memset(mpd, 0, sizeof(struct mpage_da_data));
> - mpd->wbc = wbc;
> - mpd->inode = inode;
> - pagevec_init(&pvec, 0);
> - index = wbc->range_start >> PAGE_CACHE_SHIFT;
> - end = wbc->range_end >> PAGE_CACHE_SHIFT;
> + struct address_space *mapping = mpd->inode->i_mapping;
> + struct pagevec pvec;
> + unsigned int nr_pages;
> + pgoff_t index = mpd->first_page;
> + pgoff_t end = mpd->last_page;
> + bool first_page_found = false;
> + int tag;
> + int i, err = 0;
> + int blkbits = mpd->inode->i_blkbits;
> + ext4_lblk_t lblk;
> + struct buffer_head *head;
>
> - if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
> + if (mpd->wbc->sync_mode == WB_SYNC_ALL || mpd->wbc->tagged_writepages)
> tag = PAGECACHE_TAG_TOWRITE;
> else
> tag = PAGECACHE_TAG_DIRTY;
>
> - *done_index = index;
> + mpd->map.m_len = 0;
> + mpd->next_page = index;
Forgot to call pagevec_init(&pvec, 0) here.
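I.e. presumably just

	pagevec_init(&pvec, 0);

before the while loop, as the old write_cache_pages_da() did, so the
on-stack pvec is not used uninitialized.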
Regards,
- Zheng
> while (index <= end) {
> nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
> min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
> if (nr_pages == 0)
> - return 0;
> + goto out;
>
> for (i = 0; i < nr_pages; i++) {
> struct page *page = pvec.pages[i];
> @@ -2145,31 +2138,21 @@ static int write_cache_pages_da(handle_t *handle,
> if (page->index > end)
> goto out;
>
> - *done_index = page->index + 1;
> -
> - /*
> - * If we can't merge this page, and we have
> - * accumulated an contiguous region, write it
> - */
> - if ((mpd->next_page != page->index) &&
> - (mpd->next_page != mpd->first_page)) {
> - mpage_da_map_and_submit(mpd);
> - goto ret_extent_tail;
> - }
> + /* If we can't merge this page, we are done. */
> + if (first_page_found && mpd->next_page != page->index)
> + goto out;
>
> lock_page(page);
> -
> /*
> - * If the page is no longer dirty, or its
> - * mapping no longer corresponds to inode we
> - * are writing (which means it has been
> - * truncated or invalidated), or the page is
> - * already under writeback and we are not
> - * doing a data integrity writeback, skip the page
> + * If the page is no longer dirty, or its mapping no
> + * longer corresponds to inode we are writing (which
> + * means it has been truncated or invalidated), or the
> + * page is already under writeback and we are not doing
> + * a data integrity writeback, skip the page
> */
> if (!PageDirty(page) ||
> (PageWriteback(page) &&
> - (wbc->sync_mode == WB_SYNC_NONE)) ||
> + (mpd->wbc->sync_mode == WB_SYNC_NONE)) ||
> unlikely(page->mapping != mapping)) {
> unlock_page(page);
> continue;
> @@ -2178,101 +2161,60 @@ static int write_cache_pages_da(handle_t *handle,
> wait_on_page_writeback(page);
> BUG_ON(PageWriteback(page));
>
> - /*
> - * If we have inline data and arrive here, it means that
> - * we will soon create the block for the 1st page, so
> - * we'd better clear the inline data here.
> - */
> - if (ext4_has_inline_data(inode)) {
> - BUG_ON(ext4_test_inode_state(inode,
> - EXT4_STATE_MAY_INLINE_DATA));
> - ext4_destroy_inline_data(handle, inode);
> - }
> -
> - if (mpd->next_page != page->index)
> + if (!first_page_found) {
> mpd->first_page = page->index;
> + first_page_found = true;
> + }
> mpd->next_page = page->index + 1;
> - logical = (sector_t) page->index <<
> - (PAGE_CACHE_SHIFT - inode->i_blkbits);
> + lblk = ((ext4_lblk_t)page->index) <<
> + (PAGE_CACHE_SHIFT - blkbits);
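(Just a note for readers, not a problem: with 4k pages and 1k blocks,
PAGE_CACHE_SHIFT - blkbits == 2, so page index N starts at logical block
4 * N, i.e. each page covers four buffer heads.)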
>
> /* Add all dirty buffers to mpd */
> head = page_buffers(page);
> - bh = head;
> - do {
> - BUG_ON(buffer_locked(bh));
> - /*
> - * We need to try to allocate unmapped blocks
> - * in the same page. Otherwise we won't make
> - * progress with the page in ext4_writepage
> - */
> - if (ext4_bh_delay_or_unwritten(NULL, bh)) {
> - mpage_add_bh_to_extent(mpd, logical,
> - bh->b_state);
> - if (mpd->io_done)
> - goto ret_extent_tail;
> - } else if (buffer_dirty(bh) &&
> - buffer_mapped(bh)) {
> - /*
> - * mapped dirty buffer. We need to
> - * update the b_state because we look
> - * at b_state in mpage_da_map_blocks.
> - * We don't update b_size because if we
> - * find an unmapped buffer_head later
> - * we need to use the b_state flag of
> - * that buffer_head.
> - */
> - if (mpd->b_size == 0)
> - mpd->b_state =
> - bh->b_state & BH_FLAGS;
> - }
> - logical++;
> - } while ((bh = bh->b_this_page) != head);
> -
> - if (nr_to_write > 0) {
> - nr_to_write--;
> - if (nr_to_write == 0 &&
> - wbc->sync_mode == WB_SYNC_NONE)
> - /*
> - * We stop writing back only if we are
> - * not doing integrity sync. In case of
> - * integrity sync we have to keep going
> - * because someone may be concurrently
> - * dirtying pages, and we might have
> - * synced a lot of newly appeared dirty
> - * pages, but have not synced all of the
> - * old dirty pages.
> - */
> + if (!add_page_bufs_to_extent(mpd, head, head, lblk))
> + goto out;
> + /* So far everything mapped? Submit the page for IO. */
> + if (mpd->map.m_len == 0) {
> + err = mpage_submit_page(mpd, page);
> + if (err < 0)
> goto out;
> }
> +
> + /*
> + * Accumulated enough dirty pages? This doesn't apply
> + * to WB_SYNC_ALL mode. For integrity sync we have to
> + * keep going because someone may be concurrently
> + * dirtying pages, and we might have synced a lot of
> + * newly appeared dirty pages, but have not synced all
> + * of the old dirty pages.
> + */
> + if (mpd->wbc->sync_mode == WB_SYNC_NONE &&
> + mpd->next_page - mpd->first_page >=
> + mpd->wbc->nr_to_write)
> + goto out;
> }
> pagevec_release(&pvec);
> cond_resched();
> }
> return 0;
> -ret_extent_tail:
> - ret = MPAGE_DA_EXTENT_TAIL;
> out:
> pagevec_release(&pvec);
> - cond_resched();
> - return ret;
> + return err;
> }
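(Nice and simple contract, by the way: on return either mpd->map.m_len
is 0, meaning every page scanned was already fully mapped and has been
submitted, or @mpd carries the extent that still needs mapping.)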
>
> -
> static int ext4_da_writepages(struct address_space *mapping,
> struct writeback_control *wbc)
> {
> - pgoff_t index;
> + pgoff_t writeback_index = 0;
> + long nr_to_write = wbc->nr_to_write;
> int range_whole = 0;
> + int cycled = 1;
> handle_t *handle = NULL;
> struct mpage_da_data mpd;
> struct inode *inode = mapping->host;
> - int pages_written = 0;
> - int range_cyclic, cycled = 1, io_done = 0;
> int needed_blocks, ret = 0;
> - loff_t range_start = wbc->range_start;
> struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
> - pgoff_t done_index = 0;
> - pgoff_t end;
> + bool done;
> struct blk_plug plug;
>
> trace_ext4_da_writepages(inode, wbc);
> @@ -2298,40 +2240,65 @@ static int ext4_da_writepages(struct address_space *mapping,
> if (unlikely(sbi->s_mount_flags & EXT4_MF_FS_ABORTED))
> return -EROFS;
>
> + /*
> + * If we have inline data and arrive here, it means that
> + * we will soon create the block for the 1st page, so
> + * we'd better clear the inline data here.
> + */
> + if (ext4_has_inline_data(inode)) {
> + /* Just inode will be modified... */
> + handle = ext4_journal_start(inode, EXT4_HT_INODE, 1);
> + if (IS_ERR(handle)) {
> + ret = PTR_ERR(handle);
> + goto out_writepages;
> + }
> + BUG_ON(ext4_test_inode_state(inode,
> + EXT4_STATE_MAY_INLINE_DATA));
> + ext4_destroy_inline_data(handle, inode);
> + ext4_journal_stop(handle);
> + }
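(Side note: the inline-data conversion used to live inside
write_cache_pages_da(); lifting it out of the per-page loop and giving
it its own small EXT4_HT_INODE handle reads much cleaner than
piggybacking on the writeback transaction.)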
> +
> if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
> range_whole = 1;
>
> - range_cyclic = wbc->range_cyclic;
> if (wbc->range_cyclic) {
> - index = mapping->writeback_index;
> - if (index)
> + writeback_index = mapping->writeback_index;
> + if (writeback_index)
> cycled = 0;
> - wbc->range_start = index << PAGE_CACHE_SHIFT;
> - wbc->range_end = LLONG_MAX;
> - wbc->range_cyclic = 0;
> - end = -1;
> + mpd.first_page = writeback_index;
> + mpd.last_page = -1;
> } else {
> - index = wbc->range_start >> PAGE_CACHE_SHIFT;
> - end = wbc->range_end >> PAGE_CACHE_SHIFT;
> + mpd.first_page = wbc->range_start >> PAGE_CACHE_SHIFT;
> + mpd.last_page = wbc->range_end >> PAGE_CACHE_SHIFT;
> }
>
> + mpd.inode = inode;
> + mpd.wbc = wbc;
> + ext4_io_submit_init(&mpd.io_submit, wbc);
> retry:
> if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
> - tag_pages_for_writeback(mapping, index, end);
> -
> + tag_pages_for_writeback(mapping, mpd.first_page, mpd.last_page);
> + done = false;
> blk_start_plug(&plug);
> - while (!ret && wbc->nr_to_write > 0) {
> + while (!done && mpd.first_page <= mpd.last_page) {
> + /* For each extent of pages we use new io_end */
> + mpd.io_submit.io_end = ext4_init_io_end(inode, GFP_KERNEL);
> + if (!mpd.io_submit.io_end) {
> + ret = -ENOMEM;
> + break;
> + }
>
> /*
> - * we insert one extent at a time. So we need
> - * credit needed for single extent allocation.
> - * journalled mode is currently not supported
> - * by delalloc
> + * We have two constraints: We find one extent to map and we
> + * must always write out whole page (makes a difference when
> + * blocksize < pagesize) so that we don't block on IO when we
> + * try to write out the rest of the page. Journalled mode is
> + * not supported by delalloc.
> */
> BUG_ON(ext4_should_journal_data(inode));
> needed_blocks = ext4_da_writepages_trans_blocks(inode);
>
> - /* start a new transaction*/
> + /* start a new transaction */
> handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE,
> needed_blocks);
> if (IS_ERR(handle)) {
> @@ -2339,76 +2306,67 @@ retry:
> ext4_msg(inode->i_sb, KERN_CRIT, "%s: jbd2_start: "
> "%ld pages, ino %lu; err %d", __func__,
> wbc->nr_to_write, inode->i_ino, ret);
> - blk_finish_plug(&plug);
> - goto out_writepages;
> + /* Release allocated io_end */
> + ext4_put_io_end(mpd.io_submit.io_end);
> + break;
> }
>
> - /*
> - * Now call write_cache_pages_da() to find the next
> - * contiguous region of logical blocks that need
> - * blocks to be allocated by ext4 and submit them.
> - */
> - ret = write_cache_pages_da(handle, mapping,
> - wbc, &mpd, &done_index);
> - /*
> - * If we have a contiguous extent of pages and we
> - * haven't done the I/O yet, map the blocks and submit
> - * them for I/O.
> - */
> - if (!mpd.io_done && mpd.next_page != mpd.first_page) {
> - mpage_da_map_and_submit(&mpd);
> - ret = MPAGE_DA_EXTENT_TAIL;
> + trace_ext4_da_write_pages(inode, mpd.first_page, mpd.wbc);
> + ret = mpage_prepare_extent_to_map(&mpd);
> + if (!ret) {
> + if (mpd.map.m_len)
> + ret = mpage_map_and_submit_extent(handle, &mpd);
> + else {
> + /*
> + * We scanned the whole range (or exhausted
> + * nr_to_write), submitted what was mapped and
> + * didn't find anything needing mapping. We are
> + * done.
> + */
> + done = true;
> + }
> }
> - trace_ext4_da_write_pages(inode, &mpd);
> - wbc->nr_to_write -= mpd.pages_written;
> -
> ext4_journal_stop(handle);
> -
> - if ((mpd.retval == -ENOSPC) && sbi->s_journal) {
> - /* commit the transaction which would
> + /* Submit prepared bio */
> + ext4_io_submit(&mpd.io_submit);
> + /* Unlock pages we didn't use */
> + mpage_release_unused_pages(&mpd, false);
> + /* Drop our io_end reference we got from init */
> + ext4_put_io_end(mpd.io_submit.io_end);
> +
> + if (ret == -ENOSPC && sbi->s_journal) {
> + /*
> + * Commit the transaction which would
> * free blocks released in the transaction
> * and try again
> */
> jbd2_journal_force_commit_nested(sbi->s_journal);
> ret = 0;
> - } else if (ret == MPAGE_DA_EXTENT_TAIL) {
> - /*
> - * Got one extent now try with rest of the pages.
> - * If mpd.retval is set -EIO, journal is aborted.
> - * So we don't need to write any more.
> - */
> - pages_written += mpd.pages_written;
> - ret = mpd.retval;
> - io_done = 1;
> - } else if (wbc->nr_to_write)
> - /*
> - * There is no more writeout needed
> - * or we requested for a noblocking writeout
> - * and we found the device congested
> - */
> + continue;
> + }
> + /* Fatal error - ENOMEM, EIO... */
> + if (ret)
> break;
> }
> blk_finish_plug(&plug);
> - if (!io_done && !cycled) {
> + if (!ret && !cycled) {
> cycled = 1;
> - index = 0;
> - wbc->range_start = index << PAGE_CACHE_SHIFT;
> - wbc->range_end = mapping->writeback_index - 1;
> + mpd.last_page = writeback_index - 1;
> + mpd.first_page = 0;
> goto retry;
> }
>
> /* Update index */
> - wbc->range_cyclic = range_cyclic;
> if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
> /*
> - * set the writeback_index so that range_cyclic
> + * Set the writeback_index so that range_cyclic
> * mode will write it back later
> */
> - mapping->writeback_index = done_index;
> + mapping->writeback_index = mpd.first_page;
>
> out_writepages:
> - wbc->range_start = range_start;
> - trace_ext4_da_writepages_result(inode, wbc, ret, pages_written);
> + trace_ext4_da_writepages_result(inode, wbc, ret,
> + nr_to_write - wbc->nr_to_write);
> return ret;
> }
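(Also worth noting: the range_cyclic handling now mirrors generic
write_cache_pages(): the first pass runs from mapping->writeback_index
to the end of the file, and only if it completes without error do we
wrap around and retry pages 0 through writeback_index - 1.)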
>
> diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h
> index a601bb3..203dcd5 100644
> --- a/include/trace/events/ext4.h
> +++ b/include/trace/events/ext4.h
> @@ -332,43 +332,59 @@ TRACE_EVENT(ext4_da_writepages,
> );
>
> TRACE_EVENT(ext4_da_write_pages,
> - TP_PROTO(struct inode *inode, struct mpage_da_data *mpd),
> + TP_PROTO(struct inode *inode, pgoff_t first_page,
> + struct writeback_control *wbc),
>
> - TP_ARGS(inode, mpd),
> + TP_ARGS(inode, first_page, wbc),
>
> TP_STRUCT__entry(
> __field( dev_t, dev )
> __field( ino_t, ino )
> - __field( __u64, b_blocknr )
> - __field( __u32, b_size )
> - __field( __u32, b_state )
> - __field( unsigned long, first_page )
> - __field( int, io_done )
> - __field( int, pages_written )
> - __field( int, sync_mode )
> + __field( pgoff_t, first_page )
> + __field( long, nr_to_write )
> + __field( int, sync_mode )
> ),
>
> TP_fast_assign(
> __entry->dev = inode->i_sb->s_dev;
> __entry->ino = inode->i_ino;
> - __entry->b_blocknr = mpd->b_blocknr;
> - __entry->b_size = mpd->b_size;
> - __entry->b_state = mpd->b_state;
> - __entry->first_page = mpd->first_page;
> - __entry->io_done = mpd->io_done;
> - __entry->pages_written = mpd->pages_written;
> - __entry->sync_mode = mpd->wbc->sync_mode;
> + __entry->first_page = first_page;
> + __entry->nr_to_write = wbc->nr_to_write;
> + __entry->sync_mode = wbc->sync_mode;
> ),
>
> - TP_printk("dev %d,%d ino %lu b_blocknr %llu b_size %u b_state 0x%04x "
> - "first_page %lu io_done %d pages_written %d sync_mode %d",
> + TP_printk("dev %d,%d ino %lu first_page %lu nr_to_write %ld "
> + "sync_mode %d",
> MAJOR(__entry->dev), MINOR(__entry->dev),
> - (unsigned long) __entry->ino,
> - __entry->b_blocknr, __entry->b_size,
> - __entry->b_state, __entry->first_page,
> - __entry->io_done, __entry->pages_written,
> - __entry->sync_mode
> - )
> + (unsigned long) __entry->ino, __entry->first_page,
> + __entry->nr_to_write, __entry->sync_mode)
> +);
> +
> +TRACE_EVENT(ext4_da_write_pages_extent,
> + TP_PROTO(struct inode *inode, struct ext4_map_blocks *map),
> +
> + TP_ARGS(inode, map),
> +
> + TP_STRUCT__entry(
> + __field( dev_t, dev )
> + __field( ino_t, ino )
> + __field( __u64, lblk )
> + __field( __u32, len )
> + __field( __u32, flags )
> + ),
> +
> + TP_fast_assign(
> + __entry->dev = inode->i_sb->s_dev;
> + __entry->ino = inode->i_ino;
> + __entry->lblk = map->m_lblk;
> + __entry->len = map->m_len;
> + __entry->flags = map->m_flags;
> + ),
> +
> + TP_printk("dev %d,%d ino %lu lblk %llu len %u flags 0x%04x",
> + MAJOR(__entry->dev), MINOR(__entry->dev),
> + (unsigned long) __entry->ino, __entry->lblk, __entry->len,
> + __entry->flags)
> );
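This one should be handy for debugging, too; with the patch applied the
extents being picked can be watched via

	# echo 1 > /sys/kernel/debug/tracing/events/ext4/ext4_da_write_pages_extent/enable
	# cat /sys/kernel/debug/tracing/trace_pipe

(assuming debugfs is mounted in the usual place).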
>
> TRACE_EVENT(ext4_da_writepages_result,
> --
> 1.7.1
>