[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-id: <A70619A1-1E84-425A-956E-7C0DE156F046@sun.com>
Date: Mon, 23 Nov 2009 12:22:29 -0700
From: Andreas Dilger <adilger@....com>
To: "Theodore Ts'o" <tytso@....edu>
Cc: Ext4 Developers List <linux-ext4@...r.kernel.org>,
"Aneesh Kumar K.V" <aneesh.kumar@...ux.vnet.ibm.com>,
Curt Wohlgemuth <curtw@...gle.com>
Subject: Re: [PATCH 5/8] ext4: call ext4_forget() from ext4_free_blocks()
On 2009-11-22, at 19:18, Theodore Ts'o wrote:
> Add the facility for ext4_forget() to be called form
> ext4_free_blocks(). This simplifies the code in a large number of
> places, and centralizes most of the work of calling ext4_forget() into
> a single place.
>
> Also fix a bug in the extents migration code; it wasn't calling
> ext4_forget() when releasing the indirect blocks during the
> conversion.
Would this also solve Curt's problem, mentioned in "Bug in extent
zeroout: blocks not marked as new" where bforget was not being called
when the blocks are freed?
> As a result, if the system cashed during or shortly after the
> extents migration, and the released indirect blocks get reused as
> data blocks, the journal replay would corrupt the data blocks.
> With this new patch, fixing this bug was as simple as adding the
> EXT4_FREE_BLOCKS_FORGET flags to the call to ext4_free_blocks().
>
> Signed-off-by: "Theodore Ts'o" <tytso@....edu>
> Cc: "Aneesh Kumar K.V" <aneesh.kumar@...ux.vnet.ibm.com>
> ---
> fs/ext4/ext4.h | 10 +++++-
> fs/ext4/extents.c | 24 ++++++---------
> fs/ext4/inode.c | 67 +++++++++++++++
> +--------------------------
> fs/ext4/mballoc.c | 49 +++++++++++++++++++++++--------
> fs/ext4/migrate.c | 23 ++++++++++----
> fs/ext4/xattr.c | 8 +++--
> include/trace/events/ext4.h | 16 ++++++----
> 7 files changed, 109 insertions(+), 88 deletions(-)
>
> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> index 210e1b5..4cfc2f0 100644
> --- a/fs/ext4/ext4.h
> +++ b/fs/ext4/ext4.h
> @@ -376,6 +376,12 @@ struct ext4_new_group_data {
> EXT4_GET_BLOCKS_DIO_CREATE_EXT)
>
> /*
> + * Flags used by ext4_free_blocks
> + */
> +#define EXT4_FREE_BLOCKS_METADATA 0x0001
> +#define EXT4_FREE_BLOCKS_FORGET 0x0002
> +
> +/*
> * ioctl commands
> */
> #define EXT4_IOC_GETFLAGS FS_IOC_GETFLAGS
> @@ -1384,8 +1390,8 @@ extern void ext4_discard_preallocations(struct
> inode *);
> extern int __init init_ext4_mballoc(void);
> extern void exit_ext4_mballoc(void);
> extern void ext4_free_blocks(handle_t *handle, struct inode *inode,
> - ext4_fsblk_t block, unsigned long count,
> - int metadata);
> + struct buffer_head *bh, ext4_fsblk_t block,
> + unsigned long count, int flags);
> extern int ext4_mb_add_groupinfo(struct super_block *sb,
> ext4_group_t i, struct ext4_group_desc *desc);
> extern int ext4_mb_get_buddy_cache_lock(struct super_block *,
> ext4_group_t);
> diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
> index 74dcff8..2c4a932 100644
> --- a/fs/ext4/extents.c
> +++ b/fs/ext4/extents.c
> @@ -1007,7 +1007,8 @@ cleanup:
> for (i = 0; i < depth; i++) {
> if (!ablocks[i])
> continue;
> - ext4_free_blocks(handle, inode, ablocks[i], 1, 1);
> + ext4_free_blocks(handle, inode, 0, ablocks[i], 1,
> + EXT4_FREE_BLOCKS_METADATA);
> }
> }
> kfree(ablocks);
> @@ -1957,7 +1958,6 @@ errout:
> static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode,
> struct ext4_ext_path *path)
> {
> - struct buffer_head *bh;
> int err;
> ext4_fsblk_t leaf;
>
> @@ -1973,9 +1973,8 @@ static int ext4_ext_rm_idx(handle_t *handle,
> struct inode *inode,
> if (err)
> return err;
> ext_debug("index is empty, remove it, free block %llu\n", leaf);
> - bh = sb_find_get_block(inode->i_sb, leaf);
> - ext4_forget(handle, 1, inode, bh, leaf);
> - ext4_free_blocks(handle, inode, leaf, 1, 1);
> + ext4_free_blocks(handle, inode, 0, leaf, 1,
> + EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET);
> return err;
> }
>
> @@ -2042,12 +2041,11 @@ static int ext4_remove_blocks(handle_t
> *handle, struct inode *inode,
> struct ext4_extent *ex,
> ext4_lblk_t from, ext4_lblk_t to)
> {
> - struct buffer_head *bh;
> unsigned short ee_len = ext4_ext_get_actual_len(ex);
> - int i, metadata = 0;
> + int flags = EXT4_FREE_BLOCKS_FORGET;
>
> if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
> - metadata = 1;
> + flags |= EXT4_FREE_BLOCKS_METADATA;
> #ifdef EXTENTS_STATS
> {
> struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
> @@ -2072,11 +2070,7 @@ static int ext4_remove_blocks(handle_t
> *handle, struct inode *inode,
> num = le32_to_cpu(ex->ee_block) + ee_len - from;
> start = ext_pblock(ex) + ee_len - num;
> ext_debug("free last %u blocks starting %llu\n", num, start);
> - for (i = 0; i < num; i++) {
> - bh = sb_find_get_block(inode->i_sb, start + i);
> - ext4_forget(handle, metadata, inode, bh, start + i);
> - }
> - ext4_free_blocks(handle, inode, start, num, metadata);
> + ext4_free_blocks(handle, inode, 0, start, num, flags);
> } else if (from == le32_to_cpu(ex->ee_block)
> && to <= le32_to_cpu(ex->ee_block) + ee_len - 1) {
> printk(KERN_INFO "strange request: removal %u-%u from %u:%u\n",
> @@ -3319,8 +3313,8 @@ int ext4_ext_get_blocks(handle_t *handle,
> struct inode *inode,
> /* not a good idea to call discard here directly,
> * but otherwise we'd need to call it every free() */
> ext4_discard_preallocations(inode);
> - ext4_free_blocks(handle, inode, ext_pblock(&newex),
> - ext4_ext_get_actual_len(&newex), 0);
> + ext4_free_blocks(handle, inode, 0, ext_pblock(&newex),
> + ext4_ext_get_actual_len(&newex), 0);
> goto out2;
> }
>
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index 72c6943..3b28e1f 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -669,7 +669,7 @@ allocated:
> return ret;
> failed_out:
> for (i = 0; i < index; i++)
> - ext4_free_blocks(handle, inode, new_blocks[i], 1, 0);
> + ext4_free_blocks(handle, inode, 0, new_blocks[i], 1, 0);
> return ret;
> }
>
> @@ -765,20 +765,20 @@ static int ext4_alloc_branch(handle_t *handle,
> struct inode *inode,
> return err;
> failed:
> /* Allocation failed, free what we already allocated */
> + ext4_free_blocks(handle, inode, 0, new_blocks[0], 1, 0);
> for (i = 1; i <= n ; i++) {
> - BUFFER_TRACE(branch[i].bh, "call jbd2_journal_forget");
> /*
> - * Note: is_metadata is 0 because branch[i].bh is
> - * newly allocated, so there is no need to revoke the
> - * block. If we do, it's harmless, but not necessary.
> + * branch[i].bh is newly allocated, so there is no
> + * need to revoke the block, which is why we don't
> + * need to set EXT4_FREE_BLOCKS_METADATA.
> */
> - ext4_forget(handle, 0, inode, branch[i].bh,
> - branch[i].bh->b_blocknr);
> + ext4_free_blocks(handle, inode, 0, new_blocks[i], 1,
> + EXT4_FREE_BLOCKS_FORGET);
> }
> - for (i = 0; i < indirect_blks; i++)
> - ext4_free_blocks(handle, inode, new_blocks[i], 1, 0);
> + for (i = n+1; i < indirect_blks; i++)
> + ext4_free_blocks(handle, inode, 0, new_blocks[i], 1, 0);
>
> - ext4_free_blocks(handle, inode, new_blocks[i], num, 0);
> + ext4_free_blocks(handle, inode, 0, new_blocks[i], num, 0);
>
> return err;
> }
> @@ -857,18 +857,16 @@ static int ext4_splice_branch(handle_t
> *handle, struct inode *inode,
>
> err_out:
> for (i = 1; i <= num; i++) {
> - BUFFER_TRACE(where[i].bh, "call jbd2_journal_forget");
> /*
> - * Note: is_metadata is 0 because branch[i].bh is
> - * newly allocated, so there is no need to revoke the
> - * block. If we do, it's harmless, but not necessary.
> + * branch[i].bh is newly allocated, so there is no
> + * need to revoke the block, which is why we don't
> + * need to set EXT4_FREE_BLOCKS_METADATA.
> */
> - ext4_forget(handle, 0, inode, where[i].bh,
> - where[i].bh->b_blocknr);
> - ext4_free_blocks(handle, inode,
> - le32_to_cpu(where[i-1].key), 1, 0);
> + ext4_free_blocks(handle, inode, where[i].bh, 0, 1,
> + EXT4_FREE_BLOCKS_FORGET);
> }
> - ext4_free_blocks(handle, inode, le32_to_cpu(where[num].key), blks,
> 0);
> + ext4_free_blocks(handle, inode, 0, le32_to_cpu(where[num].key),
> + blks, 0);
>
> return err;
> }
> @@ -4080,7 +4078,10 @@ static void ext4_clear_blocks(handle_t
> *handle, struct inode *inode,
> __le32 *last)
> {
> __le32 *p;
> - int is_metadata = S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode);
> + int flags = EXT4_FREE_BLOCKS_FORGET;
> +
> + if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
> + flags |= EXT4_FREE_BLOCKS_METADATA;
>
> if (try_to_extend_transaction(handle, inode)) {
> if (bh) {
> @@ -4096,27 +4097,10 @@ static void ext4_clear_blocks(handle_t
> *handle, struct inode *inode,
> }
> }
>
> - /*
> - * Any buffers which are on the journal will be in memory. We
> - * find them on the hash table so jbd2_journal_revoke() will
> - * run jbd2_journal_forget() on them. We've already detached
> - * each block from the file, so bforget() in
> - * jbd2_journal_forget() should be safe.
> - *
> - * AKPM: turn on bforget in jbd2_journal_forget()!!!
> - */
> - for (p = first; p < last; p++) {
> - u32 nr = le32_to_cpu(*p);
> - if (nr) {
> - struct buffer_head *tbh;
> -
> - *p = 0;
> - tbh = sb_find_get_block(inode->i_sb, nr);
> - ext4_forget(handle, is_metadata, inode, tbh, nr);
> - }
> - }
> + for (p = first; p < last; p++)
> + *p = 0;
>
> - ext4_free_blocks(handle, inode, block_to_free, count, is_metadata);
> + ext4_free_blocks(handle, inode, 0, block_to_free, count, flags);
> }
>
> /**
> @@ -4304,7 +4288,8 @@ static void ext4_free_branches(handle_t
> *handle, struct inode *inode,
> blocks_for_truncate(inode));
> }
>
> - ext4_free_blocks(handle, inode, nr, 1, 1);
> + ext4_free_blocks(handle, inode, 0, nr, 1,
> + EXT4_FREE_BLOCKS_METADATA);
>
> if (parent_bh) {
> /*
> diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
> index 0dca90b..78de5d3 100644
> --- a/fs/ext4/mballoc.c
> +++ b/fs/ext4/mballoc.c
> @@ -4436,8 +4436,8 @@ ext4_mb_free_metadata(handle_t *handle, struct
> ext4_buddy *e4b,
> * @metadata: Are these metadata blocks
> */
> void ext4_free_blocks(handle_t *handle, struct inode *inode,
> - ext4_fsblk_t block, unsigned long count,
> - int metadata)
> + struct buffer_head *bh, ext4_fsblk_t block,
> + unsigned long count, int flags)
> {
> struct buffer_head *bitmap_bh = NULL;
> struct super_block *sb = inode->i_sb;
> @@ -4454,15 +4454,12 @@ void ext4_free_blocks(handle_t *handle,
> struct inode *inode,
> int err = 0;
> int ret;
>
> - /*
> - * We need to make sure we don't reuse the freed block until
> - * after the transaction is committed, which we can do by
> - * treating the block as metadata, below. We make an
> - * exception if the inode is to be written in writeback mode
> - * since writeback mode has weak data consistency guarantees.
> - */
> - if (!ext4_should_writeback_data(inode))
> - metadata = 1;
> + if (bh) {
> + if (block)
> + BUG_ON(block != bh->b_blocknr);
> + else
> + block = bh->b_blocknr;
> + }
>
> sbi = EXT4_SB(sb);
> es = EXT4_SB(sb)->s_es;
> @@ -4476,7 +4473,32 @@ void ext4_free_blocks(handle_t *handle,
> struct inode *inode,
> }
>
> ext4_debug("freeing block %llu\n", block);
> - trace_ext4_free_blocks(inode, block, count, metadata);
> + trace_ext4_free_blocks(inode, block, count, flags);
> +
> + if (flags & EXT4_FREE_BLOCKS_FORGET) {
> + struct buffer_head *tbh = bh;
> + int i;
> +
> + BUG_ON(bh && (count > 1));
> +
> + for (i = 0; i < count; i++) {
> + if (!bh)
> + tbh = sb_find_get_block(inode->i_sb,
> + block + i);
> + ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA,
> + inode, tbh, block + i);
> + }
> + }
> +
> + /*
> + * We need to make sure we don't reuse the freed block until
> + * after the transaction is committed, which we can do by
> + * treating the block as metadata, below. We make an
> + * exception if the inode is to be written in writeback mode
> + * since writeback mode has weak data consistency guarantees.
> + */
> + if (!ext4_should_writeback_data(inode))
> + flags |= EXT4_FREE_BLOCKS_METADATA;
>
> ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
> if (ac) {
> @@ -4552,7 +4574,8 @@ do_more:
> err = ext4_mb_load_buddy(sb, block_group, &e4b);
> if (err)
> goto error_return;
> - if (metadata && ext4_handle_valid(handle)) {
> +
> + if ((flags & EXT4_FREE_BLOCKS_METADATA) &&
> ext4_handle_valid(handle)) {
> struct ext4_free_data *new_entry;
> /*
> * blocks being freed are metadata. these blocks shouldn't
> diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
> index a93d5b8..d641e13 100644
> --- a/fs/ext4/migrate.c
> +++ b/fs/ext4/migrate.c
> @@ -262,13 +262,17 @@ static int free_dind_blocks(handle_t *handle,
> for (i = 0; i < max_entries; i++) {
> if (tmp_idata[i]) {
> extend_credit_for_blkdel(handle, inode);
> - ext4_free_blocks(handle, inode,
> - le32_to_cpu(tmp_idata[i]), 1, 1);
> + ext4_free_blocks(handle, inode, 0,
> + le32_to_cpu(tmp_idata[i]), 1,
> + EXT4_FREE_BLOCKS_METADATA |
> + EXT4_FREE_BLOCKS_FORGET);
> }
> }
> put_bh(bh);
> extend_credit_for_blkdel(handle, inode);
> - ext4_free_blocks(handle, inode, le32_to_cpu(i_data), 1, 1);
> + ext4_free_blocks(handle, inode, 0, le32_to_cpu(i_data), 1,
> + EXT4_FREE_BLOCKS_METADATA |
> + EXT4_FREE_BLOCKS_FORGET);
> return 0;
> }
>
> @@ -297,7 +301,9 @@ static int free_tind_blocks(handle_t *handle,
> }
> put_bh(bh);
> extend_credit_for_blkdel(handle, inode);
> - ext4_free_blocks(handle, inode, le32_to_cpu(i_data), 1, 1);
> + ext4_free_blocks(handle, inode, 0, le32_to_cpu(i_data), 1,
> + EXT4_FREE_BLOCKS_METADATA |
> + EXT4_FREE_BLOCKS_FORGET);
> return 0;
> }
>
> @@ -308,8 +314,10 @@ static int free_ind_block(handle_t *handle,
> struct inode *inode, __le32 *i_data)
> /* ei->i_data[EXT4_IND_BLOCK] */
> if (i_data[0]) {
> extend_credit_for_blkdel(handle, inode);
> - ext4_free_blocks(handle, inode,
> - le32_to_cpu(i_data[0]), 1, 1);
> + ext4_free_blocks(handle, inode, 0,
> + le32_to_cpu(i_data[0]), 1,
> + EXT4_FREE_BLOCKS_METADATA |
> + EXT4_FREE_BLOCKS_FORGET);
> }
>
> /* ei->i_data[EXT4_DIND_BLOCK] */
> @@ -419,7 +427,8 @@ static int free_ext_idx(handle_t *handle, struct
> inode *inode,
> }
> put_bh(bh);
> extend_credit_for_blkdel(handle, inode);
> - ext4_free_blocks(handle, inode, block, 1, 1);
> + ext4_free_blocks(handle, inode, 0, block, 1,
> + EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET);
> return retval;
> }
>
> diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
> index 0257019..910bf9a 100644
> --- a/fs/ext4/xattr.c
> +++ b/fs/ext4/xattr.c
> @@ -482,9 +482,10 @@ ext4_xattr_release_block(handle_t *handle,
> struct inode *inode,
> ea_bdebug(bh, "refcount now=0; freeing");
> if (ce)
> mb_cache_entry_free(ce);
> - ext4_free_blocks(handle, inode, bh->b_blocknr, 1, 1);
> get_bh(bh);
> - ext4_forget(handle, 1, inode, bh, bh->b_blocknr);
> + ext4_free_blocks(handle, inode, bh, 0, 1,
> + EXT4_FREE_BLOCKS_METADATA |
> + EXT4_FREE_BLOCKS_FORGET);
> } else {
> le32_add_cpu(&BHDR(bh)->h_refcount, -1);
> error = ext4_handle_dirty_metadata(handle, inode, bh);
> @@ -832,7 +833,8 @@ inserted:
> new_bh = sb_getblk(sb, block);
> if (!new_bh) {
> getblk_failed:
> - ext4_free_blocks(handle, inode, block, 1, 1);
> + ext4_free_blocks(handle, inode, 0, block, 1,
> + EXT4_FREE_BLOCKS_METADATA);
> error = -EIO;
> goto cleanup;
> }
> diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h
> index b390e1f..74f628b 100644
> --- a/include/trace/events/ext4.h
> +++ b/include/trace/events/ext4.h
> @@ -650,30 +650,32 @@ TRACE_EVENT(ext4_allocate_blocks,
>
> TRACE_EVENT(ext4_free_blocks,
> TP_PROTO(struct inode *inode, __u64 block, unsigned long count,
> - int metadata),
> + int flags),
>
> - TP_ARGS(inode, block, count, metadata),
> + TP_ARGS(inode, block, count, flags),
>
> TP_STRUCT__entry(
> __field( dev_t, dev )
> __field( ino_t, ino )
> + __field( umode_t, mode )
> __field( __u64, block )
> __field( unsigned long, count )
> - __field( int, metadata )
> -
> + __field( int, flags )
> ),
>
> TP_fast_assign(
> __entry->dev = inode->i_sb->s_dev;
> __entry->ino = inode->i_ino;
> + __entry->mode = inode->i_mode;
> __entry->block = block;
> __entry->count = count;
> - __entry->metadata = metadata;
> + __entry->flags = flags;
> ),
>
> - TP_printk("dev %s ino %lu block %llu count %lu metadata %d",
> + TP_printk("dev %s ino %lu mode 0%o block %llu count %lu flags %d",
> jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino,
> - __entry->block, __entry->count, __entry->metadata)
> + __entry->mode, __entry->block, __entry->count,
> + __entry->flags)
> );
>
> TRACE_EVENT(ext4_sync_file,
> --
> 1.6.5.216.g5288a.dirty
Cheers, Andreas
--
Andreas Dilger
Sr. Staff Engineer, Lustre Group
Sun Microsystems of Canada, Inc.
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Powered by blists - more mailing lists