[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <1213398082.27507.27.camel@BVR-FS.beaverton.ibm.com>
Date: Fri, 13 Jun 2008 16:01:22 -0700
From: Mingming <cmm@...ibm.com>
To: "Aneesh Kumar K.V" <aneesh.kumar@...ux.vnet.ibm.com>
Cc: tytso@....edu, sandeen@...hat.com, linux-ext4@...r.kernel.org
Subject: Re: [RFC PATCH] ext4: Add ordered mode support for delalloc
Thanks, some comments below...
On Thu, 2008-06-12 at 20:55 +0530, Aneesh Kumar K.V wrote:
> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@...ux.vnet.ibm.com>
> ---
> fs/ext4/inode.c | 169 +++++++++++++++++++++++++++++++++++++++++++++++++++--
> fs/jbd2/commit.c | 41 ++++++++++++--
> 2 files changed, 198 insertions(+), 12 deletions(-)
>
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index 63355ab..7d87641 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -1606,13 +1606,12 @@ static int ext4_bh_unmapped_or_delay(handle_t *handle, struct buffer_head *bh)
> return !buffer_mapped(bh) || buffer_delay(bh);
> }
>
> -/* FIXME!! only support data=writeback mode */
> /*
> * get called vi ext4_da_writepages after taking page lock
> * We may end up doing block allocation here in case
> * mpage_da_map_blocks failed to allocate blocks.
> */
> -static int ext4_da_writepage(struct page *page,
> +static int ext4_da_writeback_writepage(struct page *page,
> struct writeback_control *wbc)
> {
> int ret = 0;
> @@ -1660,6 +1659,61 @@ static int ext4_da_writepage(struct page *page,
> return ret;
> }
>
> +/*
> + * get called vi ext4_da_writepages after taking page lock
> + * We may end up doing block allocation here in case
> + * mpage_da_map_blocks failed to allocate blocks.
> + *
> + * We also get called via journal_submit_inode_data_buffers
> + */
> +static int ext4_da_ordered_writepage(struct page *page,
> + struct writeback_control *wbc)
> +{
> + int ret = 0;
> + loff_t size;
> + unsigned long len;
> + handle_t *handle = NULL;
> + struct buffer_head *page_bufs;
> + struct inode *inode = page->mapping->host;
> +
> + handle = ext4_journal_current_handle();
> + if (!handle) {
> + /*
> + * This can happen when we aren't called via
> + * ext4_da_writepages() but directly (shrink_page_list).
> + * We cannot easily start a transaction here so we just skip
> + * writing the page in case we would have to do so.
> + */
> + size = i_size_read(inode);
> +
> + page_bufs = page_buffers(page);
> + if (page->index == size >> PAGE_CACHE_SHIFT)
> + len = size & ~PAGE_CACHE_MASK;
> + else
> + len = PAGE_CACHE_SIZE;
> +
> + if (walk_page_buffers(NULL, page_bufs, 0,
> + len, NULL, ext4_bh_unmapped_or_delay)) {
> + /*
> + * We can't do block allocation under
> + * page lock without a handle . So redirty
> + * the page and return.
> + * We may reach here when we do a journal commit
> + * via journal_submit_inode_data_buffers.
> + * If we don't have mapping block we just ignore
> + * them
> + *
> + */
> + redirty_page_for_writepage(wbc, page);
> + unlock_page(page);
> + return 0;
> + }
> + }
> +
It seems we forgot to file the inode on the journal's inode list before
calling block_write_full_page(). This is needed because
block_write_full_page() could do block allocation.
Something like this?
+ if (ext4_should_order_data(inode))
+ ret = ext4_jbd2_file_inode(handle, inode);
+ if (ret) {
+ ext4_journal_stop(handle);
+ return ret;
+ }
> + ret = block_write_full_page(page, ext4_da_get_block_write, wbc);
> +
> + return ret;
> +}
>
It seems this code is duplicated from
ext4_da_writeback_writepage() (except for filing the inode to keep the
ordering). Is there any reason not to make it one function for both
ordered mode and writeback mode?
> /*
> * For now just follow the DIO way to estimate the max credits
> @@ -1745,19 +1799,99 @@ static int ext4_da_writepages(struct address_space *mapping,
> return ret;
> }
>
> +static int ext4_da_ordered_writepages(struct address_space *mapping,
> + struct writeback_control *wbc)
> +{
> + struct inode *inode = mapping->host;
> + handle_t *handle = NULL;
> + int needed_blocks;
> + int ret = 0;
> + long to_write;
> + loff_t range_start = 0;
> +
> +
> + /*
> + * No pages to write? This is mainly a kludge to avoid starting
> + * a transaction for special inodes like journal inode on last iput()
> + * because that could violate lock ordering on umount
> + */
> + if (!mapping->nrpages)
> + return 0;
> +
> + /*
> + * Estimate the worse case needed credits to write out
> + * EXT4_MAX_BUF_BLOCKS pages
> + */
> + needed_blocks = EXT4_MAX_WRITEBACK_CREDITS;
> +
> + to_write = wbc->nr_to_write;
> + if (!wbc->range_cyclic) {
> + /*
> + * If range_cyclic is not set force range_cont
> + * and save the old writeback_index
> + */
> + wbc->range_cont = 1;
> + range_start = wbc->range_start;
> + }
> +
> + while (!ret && to_write) {
> + /* start a new transaction*/
> + handle = ext4_journal_start(inode, needed_blocks);
> + if (IS_ERR(handle)) {
> + ret = PTR_ERR(handle);
> + goto out_writepages;
> + }
> +
> + ret = ext4_jbd2_file_inode(handle, inode);
> + if (ret) {
> + ext4_journal_stop(handle);
> + goto out_writepages;
> + }
> + /*
> + * set the max dirty pages could be write at a time
> + * to fit into the reserved transaction credits
> + */
> + if (wbc->nr_to_write > EXT4_MAX_WRITEBACK_PAGES)
> + wbc->nr_to_write = EXT4_MAX_WRITEBACK_PAGES;
> +
> + to_write -= wbc->nr_to_write;
> + ret = mpage_da_writepages(mapping, wbc,
> + ext4_da_get_block_write);
> + ext4_journal_stop(handle);
> + if (wbc->nr_to_write) {
> + /*
> + * There is no more writeout needed
> + * or we requested for a noblocking writeout
> + * and we found the device congested
> + */
> + to_write += wbc->nr_to_write;
> + break;
> + }
> + wbc->nr_to_write = to_write;
> + }
> +
> +out_writepages:
> + wbc->nr_to_write = to_write;
> + if (range_start)
> + wbc->range_start = range_start;
> + return ret;
> +}
> +
It seems this code is also duplicated, this time from the existing
ext4_da_writepages(). The only difference is that in
ordered mode we need to file the inode on the journal list to keep the
ordering. I think we could use the existing ext4_da_writepages() function
for both ordered mode and writeback mode as well.
> static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
> loff_t pos, unsigned len, unsigned flags,
> struct page **pagep, void **fsdata)
> {
> - int ret;
> + int ret, retries = 0;
> struct page *page;
> pgoff_t index;
> unsigned from, to;
> + struct inode *inode = mapping->host;
>
> index = pos >> PAGE_CACHE_SHIFT;
> from = pos & (PAGE_CACHE_SIZE - 1);
> to = from + len;
>
> +retry:
> page = __grab_cache_page(mapping, index);
> if (!page)
> return -ENOMEM;
> @@ -1770,6 +1904,9 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
> page_cache_release(page);
> }
>
> + if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
> + goto retry;
> +
> return ret;
> }
>
In case of ENOSPC, instead of going back and trying the reservation again
(which may overestimate the total number of metadata blocks to reserve), I
think we should skip delayed allocation and instead call the real
get_block() function to attempt the real block allocation.
Just to clarify, this is not part of the ordered mode support; I think
we should make a separate patch for this kind of improvement.
> @@ -2224,10 +2361,10 @@ static int ext4_journalled_set_page_dirty(struct page *page)
> .releasepage = ext4_releasepage,
> };
>
> -static const struct address_space_operations ext4_da_aops = {
> +static const struct address_space_operations ext4_da_writeback_aops = {
> .readpage = ext4_readpage,
> .readpages = ext4_readpages,
> - .writepage = ext4_da_writepage,
> + .writepage = ext4_da_writeback_writepage,
> .writepages = ext4_da_writepages,
> .sync_page = block_sync_page,
> .write_begin = ext4_da_write_begin,
> @@ -2239,13 +2376,31 @@ static int ext4_journalled_set_page_dirty(struct page *page)
> .migratepage = buffer_migrate_page,
> };
>
> +static const struct address_space_operations ext4_da_ordered_aops = {
> + .readpage = ext4_readpage,
> + .readpages = ext4_readpages,
> + .writepage = ext4_da_ordered_writepage,
> + .writepages = ext4_da_ordered_writepages,
> + .sync_page = block_sync_page,
> + .write_begin = ext4_da_write_begin,
> + .write_end = generic_write_end,
> + .bmap = ext4_bmap,
> + .invalidatepage = ext4_da_invalidatepage,
> + .releasepage = ext4_releasepage,
> + .direct_IO = ext4_direct_IO,
> + .migratepage = buffer_migrate_page,
> +};
> +
With the new ordered mode, we could share the same address space
operations for delayed allocation in both writeback and ordered mode.
> void ext4_set_aops(struct inode *inode)
> {
> - if (ext4_should_order_data(inode))
> + if (ext4_should_order_data(inode) &&
> + test_opt(inode->i_sb, DELALLOC))
> + inode->i_mapping->a_ops = &ext4_da_ordered_aops;
> + else if (ext4_should_order_data(inode))
> inode->i_mapping->a_ops = &ext4_ordered_aops;
> else if (ext4_should_writeback_data(inode) &&
> test_opt(inode->i_sb, DELALLOC))
> - inode->i_mapping->a_ops = &ext4_da_aops;
> + inode->i_mapping->a_ops = &ext4_da_writeback_aops;
> else if (ext4_should_writeback_data(inode))
> inode->i_mapping->a_ops = &ext4_writeback_aops;
> else
> diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
> index 483183d..32ca3c3 100644
> --- a/fs/jbd2/commit.c
> +++ b/fs/jbd2/commit.c
> @@ -22,6 +22,8 @@
> #include <linux/pagemap.h>
> #include <linux/jiffies.h>
> #include <linux/crc32.h>
> +#include <linux/writeback.h>
> +#include <linux/backing-dev.h>
>
> /*
> * Default IO end handler for temporary BJ_IO buffer_heads.
> @@ -185,6 +187,30 @@ static int journal_wait_on_commit_record(struct buffer_head *bh)
> }
>
> /*
> + * write the filemap data using writepage() address_space_operations.
> + * We don't do block allocation here even for delalloc. We don't
> + * use writepages() because with dealyed allocation we may be doing
> + * block allocation in writepages().
> + */
> +static int journal_submit_inode_data_buffers(struct address_space *mapping)
> +{
> + int ret;
> + struct writeback_control wbc = {
> + .sync_mode = WB_SYNC_ALL,
> + .nr_to_write = mapping->nrpages * 2,
> + .range_start = 0,
> + .range_end = i_size_read(mapping->host),
> + .for_writepages = 1,
> + };
> +
> + if (!mapping_cap_writeback_dirty(mapping))
> + return 0;
> +
> + ret = generic_writepages(mapping, &wbc);
> + return ret;
> +}
> +
> +/*
> * Submit all the data buffers of inode associated with the transaction to
> * disk.
> *
> @@ -192,7 +218,7 @@ static int journal_wait_on_commit_record(struct buffer_head *bh)
> * our inode list. We use JI_COMMIT_RUNNING flag to protect inode we currently
> * operate on from being released while we write out pages.
> */
> -static int journal_submit_inode_data_buffers(journal_t *journal,
> +static int journal_submit_data_buffers(journal_t *journal,
> transaction_t *commit_transaction)
> {
> struct jbd2_inode *jinode;
> @@ -204,8 +230,13 @@ static int journal_submit_inode_data_buffers(journal_t *journal,
> mapping = jinode->i_vfs_inode->i_mapping;
> jinode->i_flags |= JI_COMMIT_RUNNING;
> spin_unlock(&journal->j_list_lock);
> - err = filemap_fdatawrite_range(mapping, 0,
> - i_size_read(jinode->i_vfs_inode));
> + /*
> + * submit the inode data buffers. We use writepage
> + * instead of writepages. Because writepages can do
> + * block allocation with delalloc. We need to write
> + * only allocated blocks here.
> + */
Hmm, when writepage()->ext4_da_ordered_writepage() is called from here,
is the handle expected to be NULL? Otherwise block_write_full_page()
could do block allocation, which is against the lock ordering... :(
> + err = journal_submit_inode_data_buffers(mapping);
> if (!ret)
> ret = err;
> spin_lock(&journal->j_list_lock);
> @@ -228,7 +259,7 @@ static int journal_finish_inode_data_buffers(journal_t *journal,
> struct jbd2_inode *jinode, *next_i;
> int err, ret = 0;
>
> - /* For locking, see the comment in journal_submit_inode_data_buffers() */
> + /* For locking, see the comment in journal_submit_data_buffers() */
> spin_lock(&journal->j_list_lock);
> list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
> jinode->i_flags |= JI_COMMIT_RUNNING;
> @@ -431,7 +462,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
> * Now start flushing things to disk, in the order they appear
> * on the transaction lists. Data blocks go first.
> */
> - err = journal_submit_inode_data_buffers(journal, commit_transaction);
> + err = journal_submit_data_buffers(journal, commit_transaction);
> if (err)
> jbd2_journal_abort(journal, err);
>
Regards,
Mingming
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Powered by blists - more mailing lists