lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Date:   Fri, 9 Oct 2020 22:44:03 +0530
From:   Ritesh Harjani <riteshh@...ux.ibm.com>
To:     Harshad Shirwadkar <harshadshirwadkar@...il.com>,
        linux-ext4@...r.kernel.org
Cc:     tytso@....edu
Subject: Re: [PATCH v9 7/9] ext4: fast commit recovery path



On 9/19/20 6:24 AM, Harshad Shirwadkar wrote:
> This patch adds fast commit recovery path support for Ext4 file
> system. We add several helper functions that are similar in spirit to
> e2fsprogs journal recovery path handlers. Example of such functions
> include - a simple block allocator, idempotent block bitmap update
> function etc. Using these routines and the fast commit log in the fast
> commit area, the recovery path (ext4_fc_replay()) performs fast commit
> log recovery.
> 
> Signed-off-by: Harshad Shirwadkar <harshadshirwadkar@...il.com>
> ---
>   fs/ext4/balloc.c            |   7 +-
>   fs/ext4/ext4.h              |  26 ++
>   fs/ext4/ext4_jbd2.c         |   2 +-
>   fs/ext4/extents.c           | 261 +++++++++++
>   fs/ext4/extents_status.c    |  24 +
>   fs/ext4/fast_commit.c       | 881 +++++++++++++++++++++++++++++++++++-
>   fs/ext4/fast_commit.h       |  40 ++
>   fs/ext4/ialloc.c            | 165 ++++++-
>   fs/ext4/inode.c             |  89 ++--
>   fs/ext4/ioctl.c             |   6 +-
>   fs/ext4/mballoc.c           | 208 ++++++++-
>   fs/ext4/namei.c             | 149 +++---
>   fs/ext4/super.c             |  21 +
>   include/trace/events/ext4.h |  56 ++-
>   14 files changed, 1804 insertions(+), 131 deletions(-)
> 
> diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
> index 48c3df47748d..77108c99ae90 100644
> --- a/fs/ext4/balloc.c
> +++ b/fs/ext4/balloc.c
> @@ -368,7 +368,12 @@ static int ext4_validate_block_bitmap(struct super_block *sb,
>   				      struct buffer_head *bh)
>   {
>   	ext4_fsblk_t	blk;
> -	struct ext4_group_info *grp = ext4_get_group_info(sb, block_group);
> +	struct ext4_group_info *grp;
> +
> +	if (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY)
> +		return 0;
> +
> +	grp = ext4_get_group_info(sb, block_group);
> 
>   	if (buffer_verified(bh))
>   		return 0;
> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> index 27d48d166e5d..372a38292ed1 100644
> --- a/fs/ext4/ext4.h
> +++ b/fs/ext4/ext4.h
> @@ -1167,6 +1167,7 @@ struct ext4_inode_info {
>   #define EXT4_FC_COMMITTING		0x0010	/* File system underoing a fast
>   						 * commit.
>   						 */
> +#define EXT4_FC_REPLAY			0x0020	/* Fast commit replay ongoing */
> 
>   /*
>    * Misc. filesystem flags
> @@ -1658,6 +1659,10 @@ struct ext4_sb_info {
>   	struct buffer_head *s_fc_bh;
>   	struct ext4_fc_stats s_fc_stats;
>   	u64 s_fc_avg_commit_time;
> +#ifdef CONFIG_EXT4_DEBUG
> +	int s_fc_debug_max_replay;
> +#endif
> +	struct ext4_fc_replay_state s_fc_replay_state;
>   };
> 
>   static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb)
> @@ -2700,6 +2705,7 @@ extern int ext4fs_dirhash(const struct inode *dir, const char *name, int len,
>   			  struct dx_hash_info *hinfo);
> 
>   /* ialloc.c */
> +extern int ext4_mark_inode_used(struct super_block *sb, int ino);
>   extern struct inode *__ext4_new_inode(handle_t *, struct inode *, umode_t,
>   				      const struct qstr *qstr, __u32 goal,
>   				      uid_t *owner, __u32 i_flags,
> @@ -2741,6 +2747,8 @@ void ext4_fc_stop_ineligible(struct super_block *sb);
>   void ext4_fc_start_update(struct inode *inode);
>   void ext4_fc_stop_update(struct inode *inode);
>   void ext4_fc_del(struct inode *inode);
> +bool ext4_fc_replay_check_excluded(struct super_block *sb, ext4_fsblk_t block);
> +void ext4_fc_replay_cleanup(struct super_block *sb);
>   int ext4_fc_commit(journal_t *journal, tid_t commit_tid);
>   int __init ext4_fc_init_dentry_cache(void);
> 
> @@ -2773,8 +2781,12 @@ extern int ext4_group_add_blocks(handle_t *handle, struct super_block *sb,
>   				ext4_fsblk_t block, unsigned long count);
>   extern int ext4_trim_fs(struct super_block *, struct fstrim_range *);
>   extern void ext4_process_freed_data(struct super_block *sb, tid_t commit_tid);
> +extern void ext4_mb_mark_bb(struct super_block *sb, ext4_fsblk_t block,
> +		       int len, int state);
> 
>   /* inode.c */
> +void ext4_inode_csum_set(struct inode *inode, struct ext4_inode *raw,
> +			 struct ext4_inode_info *ei);
>   int ext4_inode_is_fast_symlink(struct inode *inode);
>   struct buffer_head *ext4_getblk(handle_t *, struct inode *, ext4_lblk_t, int);
>   struct buffer_head *ext4_bread(handle_t *, struct inode *, ext4_lblk_t, int);
> @@ -2821,6 +2833,8 @@ extern int  ext4_sync_inode(handle_t *, struct inode *);
>   extern void ext4_dirty_inode(struct inode *, int);
>   extern int ext4_change_inode_journal_flag(struct inode *, int);
>   extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *);
> +extern int ext4_get_fc_inode_loc(struct super_block *sb, unsigned long ino,
> +			  struct ext4_iloc *iloc);
>   extern int ext4_inode_attach_jinode(struct inode *inode);
>   extern int ext4_can_truncate(struct inode *inode);
>   extern int ext4_truncate(struct inode *);
> @@ -2854,12 +2868,15 @@ extern int ext4_ind_remove_space(handle_t *handle, struct inode *inode,
>   /* ioctl.c */
>   extern long ext4_ioctl(struct file *, unsigned int, unsigned long);
>   extern long ext4_compat_ioctl(struct file *, unsigned int, unsigned long);
> +extern void ext4_reset_inode_seed(struct inode *inode);
> 
>   /* migrate.c */
>   extern int ext4_ext_migrate(struct inode *);
>   extern int ext4_ind_migrate(struct inode *inode);
> 
>   /* namei.c */
> +extern int ext4_init_new_dir(handle_t *handle, struct inode *dir,
> +			     struct inode *inode);
>   extern int ext4_dirblock_csum_verify(struct inode *inode,
>   				     struct buffer_head *bh);
>   extern int ext4_orphan_add(handle_t *, struct inode *);
> @@ -3426,6 +3443,10 @@ extern int ext4_handle_dirty_dirblock(handle_t *handle, struct inode *inode,
>   extern int ext4_ci_compare(const struct inode *parent,
>   			   const struct qstr *fname,
>   			   const struct qstr *entry, bool quick);
> +extern int __ext4_unlink(struct inode *dir, const struct qstr *d_name,
> +			 struct inode *inode);
> +extern int __ext4_link(struct inode *dir, struct inode *inode,
> +		       struct dentry *dentry);
> 
>   #define S_SHIFT 12
>   static const unsigned char ext4_type_by_mode[(S_IFMT >> S_SHIFT) + 1] = {
> @@ -3526,6 +3547,11 @@ extern int ext4_clu_mapped(struct inode *inode, ext4_lblk_t lclu);
>   extern int ext4_datasem_ensure_credits(handle_t *handle, struct inode *inode,
>   				       int check_cred, int restart_cred,
>   				       int revoke_cred);
> +extern void ext4_ext_replay_shrink_inode(struct inode *inode, ext4_lblk_t end);
> +extern int ext4_ext_replay_set_iblocks(struct inode *inode);
> +extern int ext4_ext_replay_update_ex(struct inode *inode, ext4_lblk_t start,
> +		int len, int unwritten, ext4_fsblk_t pblk);
> +extern int ext4_ext_clear_bb(struct inode *inode);
> 
> 
>   /* move_extent.c */
> diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
> index 760b9ee49dc0..0fd0c42a4f7d 100644
> --- a/fs/ext4/ext4_jbd2.c
> +++ b/fs/ext4/ext4_jbd2.c
> @@ -100,7 +100,7 @@ handle_t *__ext4_journal_start_sb(struct super_block *sb, unsigned int line,
>   		return ERR_PTR(err);
> 
>   	journal = EXT4_SB(sb)->s_journal;
> -	if (!journal)
> +	if (!journal || (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
>   		return ext4_get_nojournal();
>   	return jbd2__journal_start(journal, blocks, rsv_blocks, revoke_creds,
>   				   GFP_NOFS, type, line);
> diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
> index 8de236fedade..29945f1172fc 100644
> --- a/fs/ext4/extents.c
> +++ b/fs/ext4/extents.c
> @@ -5804,3 +5804,264 @@ int ext4_clu_mapped(struct inode *inode, ext4_lblk_t lclu)
> 
>   	return err ? err : mapped;
>   }
> +
> +/*
> + * Updates physical block address and unwritten status of extent starting at
> + * lblk start and of len. If such an extent doesn't exist, this function
> + * splits the extent tree appropriately to create an extent like this.
> + * This function is called in Ext4 fast commit replay path. Returns 0 on success
> + * and error on failure.
> + */
> +int ext4_ext_replay_update_ex(struct inode *inode, ext4_lblk_t start,
> +		int len, int unwritten, ext4_fsblk_t pblk)
> +{
> +	struct ext4_ext_path *path = NULL, *ppath;
> +	struct ext4_extent *ex;
> +	int ret;
> +
> +	path = ext4_find_extent(inode, start, NULL, 0);
> +	if (!path)
> +		return -EINVAL;
> +	ex = path[path->p_depth].p_ext;
> +	if (!ex) {
> +		ret = -EFSCORRUPTED;
> +		goto out;
> +	}
> +
> +	if (le32_to_cpu(ex->ee_block) != start ||
> +		ext4_ext_get_actual_len(ex) != len) {
> +		/* We need to split this extent to match our extent first */
> +		ppath = path;
> +		down_write(&EXT4_I(inode)->i_data_sem);
> +		ret = ext4_force_split_extent_at(NULL, inode, &ppath, start, 1);
> +		up_write(&EXT4_I(inode)->i_data_sem);
> +		if (ret)
> +			goto out;
> +		kfree(path);
> +		path = ext4_find_extent(inode, start, NULL, 0);
> +		if (IS_ERR(path))
> +			return -1;
> +		ppath = path;
> +		ex = path[path->p_depth].p_ext;
> +		WARN_ON(le32_to_cpu(ex->ee_block) != start);
> +		if (ext4_ext_get_actual_len(ex) != len) {
> +			down_write(&EXT4_I(inode)->i_data_sem);
> +			ret = ext4_force_split_extent_at(NULL, inode, &ppath,
> +							 start + len, 1);
> +			up_write(&EXT4_I(inode)->i_data_sem);
> +			if (ret)
> +				goto out;
> +			kfree(path);
> +			path = ext4_find_extent(inode, start, NULL, 0);
> +			if (IS_ERR(path))
> +				return -EINVAL;
> +			ex = path[path->p_depth].p_ext;
> +		}
> +	}
> +	if (unwritten)
> +		ext4_ext_mark_unwritten(ex);
> +	else
> +		ext4_ext_mark_initialized(ex);
> +	ext4_ext_store_pblock(ex, pblk);
> +	down_write(&EXT4_I(inode)->i_data_sem);
> +	ret = ext4_ext_dirty(NULL, inode, &path[path->p_depth]);
> +	up_write(&EXT4_I(inode)->i_data_sem);
> +out:
> +	ext4_ext_drop_refs(path);
> +	kfree(path);
> +	ext4_mark_inode_dirty(NULL, inode);
> +	return ret;
> +}
> +
> +/* Try to shrink the extent tree */
> +void ext4_ext_replay_shrink_inode(struct inode *inode, ext4_lblk_t end)
> +{
> +	struct ext4_ext_path *path = NULL;
> +	struct ext4_extent *ex;
> +	ext4_lblk_t old_cur, cur = 0;
> +
> +	while (cur < end) {
> +		path = ext4_find_extent(inode, cur, NULL, 0);
> +		if (IS_ERR(path))
> +			return;
> +		ex = path[path->p_depth].p_ext;
> +		if (!ex) {
> +			ext4_ext_drop_refs(path);
> +			kfree(path);
> +			ext4_mark_inode_dirty(NULL, inode);
> +			return;
> +		}
> +		old_cur = cur;
> +		cur = le32_to_cpu(ex->ee_block) + ext4_ext_get_actual_len(ex);
> +		if (cur <= old_cur)
> +			cur = old_cur + 1;
> +		ext4_ext_try_to_merge(NULL, inode, path, ex);
> +		down_write(&EXT4_I(inode)->i_data_sem);
> +		ext4_ext_dirty(NULL, inode, &path[path->p_depth]);
> +		up_write(&EXT4_I(inode)->i_data_sem);
> +		ext4_mark_inode_dirty(NULL, inode);
> +		ext4_ext_drop_refs(path);
> +		kfree(path);
> +	}
> +}
> +
> +/* Check if *cur is a hole and if it is, skip it */
> +static void skip_hole(struct inode *inode, ext4_lblk_t *cur)
> +{
> +	int ret;
> +	struct ext4_map_blocks map;
> +
> +	map.m_lblk = *cur;
> +	map.m_len = ((inode->i_size) >> inode->i_sb->s_blocksize_bits) - *cur;
> +
> +	ret = ext4_map_blocks(NULL, inode, &map, 0);
> +	if (ret != 0)
> +		return;
> +	*cur = *cur + map.m_len;
> +}
> +
> +/* Count number of blocks used by this inode and update i_blocks */
> +int ext4_ext_replay_set_iblocks(struct inode *inode)
> +{
> +	struct ext4_ext_path *path = NULL, *path2 = NULL;
> +	struct ext4_extent *ex;
> +	ext4_lblk_t cur = 0, end;
> +	int numblks = 0, i, ret = 0;
> +	ext4_fsblk_t cmp1, cmp2;
> +	struct ext4_map_blocks map;
> +
> +	/* Determin the size of the file first */
> +	path = ext4_find_extent(inode, EXT_MAX_BLOCKS - 1, NULL,
> +					EXT4_EX_NOCACHE);
> +	if (IS_ERR(path))
> +		return PTR_ERR(path);
> +	ex = path[path->p_depth].p_ext;
> +	if (!ex) {
> +		ext4_ext_drop_refs(path);
> +		kfree(path);
> +		goto out;
> +	}
> +	end = le32_to_cpu(ex->ee_block) + ext4_ext_get_actual_len(ex);
> +	ext4_ext_drop_refs(path);
> +	kfree(path);
> +
> +	/* Count the number of data blocks */
> +	cur = 0;
> +	while (cur < end) {
> +		map.m_lblk = cur;
> +		map.m_len = end - cur;
> +		ret = ext4_map_blocks(NULL, inode, &map, 0);
> +		if (ret < 0)
> +			break;
> +		if (ret > 0)
> +			numblks += ret;
> +		cur = cur + map.m_len;
> +	}
> +
> +	/*
> +	 * Count the number of extent tree blocks. We do it by looking up
> +	 * two successive extents and determining the difference between
> +	 * their paths. When path is different for 2 successive extents
> +	 * we compare the blocks in the path at each level and increment
> +	 * iblocks by total number of differences found.
> +	 */
> +	cur = 0;
> +	skip_hole(inode, &cur);
> +	path = ext4_find_extent(inode, cur, NULL, 0);
> +	if (IS_ERR(path))
> +		goto out;
> +	numblks += path->p_depth;
> +	ext4_ext_drop_refs(path);
> +	kfree(path);
> +	while (cur < end) {
> +		path = ext4_find_extent(inode, cur, NULL, 0);
> +		if (IS_ERR(path))
> +			break;
> +		ex = path[path->p_depth].p_ext;
> +		if (!ex) {
> +			ext4_ext_drop_refs(path);
> +			kfree(path);
> +			return 0;
> +		}
> +		cur = max(cur + 1, le32_to_cpu(ex->ee_block) +
> +					ext4_ext_get_actual_len(ex));
> +		skip_hole(inode, &cur);
> +
> +		path2 = ext4_find_extent(inode, cur, NULL, 0);
> +		if (IS_ERR(path2)) {
> +			ext4_ext_drop_refs(path);
> +			kfree(path);
> +			break;
> +		}
> +		ex = path2[path2->p_depth].p_ext;
> +		for (i = 0; i <= max(path->p_depth, path2->p_depth); i++) {
> +			cmp1 = cmp2 = 0;
> +			if (i <= path->p_depth)
> +				cmp1 = path[i].p_bh ?
> +					path[i].p_bh->b_blocknr : 0;
> +			if (i <= path2->p_depth)
> +				cmp2 = path2[i].p_bh ?
> +					path2[i].p_bh->b_blocknr : 0;
> +			if (cmp1 != cmp2 && cmp2 != 0)
> +				numblks++;
> +		}
> +		ext4_ext_drop_refs(path);
> +		ext4_ext_drop_refs(path2);
> +		kfree(path);
> +		kfree(path2);
> +	}
> +
> +out:
> +	inode->i_blocks = numblks << (inode->i_sb->s_blocksize_bits - 9);
> +	ext4_mark_inode_dirty(NULL, inode);
> +	return 0;
> +}
> +
> +int ext4_ext_clear_bb(struct inode *inode)
> +{
> +	struct ext4_ext_path *path = NULL;
> +	struct ext4_extent *ex;
> +	ext4_lblk_t cur = 0, end;
> +	int j, ret = 0;
> +	struct ext4_map_blocks map;
> +
> +	/* Determin the size of the file first */
> +	path = ext4_find_extent(inode, EXT_MAX_BLOCKS - 1, NULL,
> +					EXT4_EX_NOCACHE);
> +	if (IS_ERR(path))
> +		return PTR_ERR(path);
> +	ex = path[path->p_depth].p_ext;
> +	if (!ex) {
> +		ext4_ext_drop_refs(path);
> +		kfree(path);
> +		return 0;
> +	}
> +	end = le32_to_cpu(ex->ee_block) + ext4_ext_get_actual_len(ex);
> +	ext4_ext_drop_refs(path);
> +	kfree(path);
> +
> +	cur = 0;
> +	while (cur < end) {
> +		map.m_lblk = cur;
> +		map.m_len = end - cur;
> +		ret = ext4_map_blocks(NULL, inode, &map, 0);
> +		if (ret < 0)
> +			break;
> +		if (ret > 0) {
> +			path = ext4_find_extent(inode, map.m_lblk, NULL, 0);
> +			if (!IS_ERR_OR_NULL(path)) {
> +				for (j = 0; j < path->p_depth; j++) {
> +
> +					ext4_mb_mark_bb(inode->i_sb,
> +							path[j].p_block, 1, 0);
> +				}
> +				ext4_ext_drop_refs(path);
> +				kfree(path);
> +			}
> +			ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
> +		}
> +		cur = cur + map.m_len;
> +	}
> +
> +	return 0;
> +}
> diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
> index e75171535375..0a729027322d 100644
> --- a/fs/ext4/extents_status.c
> +++ b/fs/ext4/extents_status.c
> @@ -311,6 +311,9 @@ void ext4_es_find_extent_range(struct inode *inode,
>   			       ext4_lblk_t lblk, ext4_lblk_t end,
>   			       struct extent_status *es)
>   {
> +	if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
> +		return;
> +
>   	trace_ext4_es_find_extent_range_enter(inode, lblk);
> 
>   	read_lock(&EXT4_I(inode)->i_es_lock);
> @@ -361,6 +364,9 @@ bool ext4_es_scan_range(struct inode *inode,
>   {
>   	bool ret;
> 
> +	if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
> +		return false;
> +
>   	read_lock(&EXT4_I(inode)->i_es_lock);
>   	ret = __es_scan_range(inode, matching_fn, lblk, end);
>   	read_unlock(&EXT4_I(inode)->i_es_lock);
> @@ -404,6 +410,9 @@ bool ext4_es_scan_clu(struct inode *inode,
>   {
>   	bool ret;
> 
> +	if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
> +		return false;
> +
>   	read_lock(&EXT4_I(inode)->i_es_lock);
>   	ret = __es_scan_clu(inode, matching_fn, lblk);
>   	read_unlock(&EXT4_I(inode)->i_es_lock);
> @@ -812,6 +821,9 @@ int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
>   	int err = 0;
>   	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
> 
> +	if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
> +		return 0;
> +
>   	es_debug("add [%u/%u) %llu %x to extent status tree of inode %lu\n",
>   		 lblk, len, pblk, status, inode->i_ino);
> 
> @@ -873,6 +885,9 @@ void ext4_es_cache_extent(struct inode *inode, ext4_lblk_t lblk,
>   	struct extent_status newes;
>   	ext4_lblk_t end = lblk + len - 1;
> 
> +	if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
> +		return;
> +
>   	newes.es_lblk = lblk;
>   	newes.es_len = len;
>   	ext4_es_store_pblock_status(&newes, pblk, status);
> @@ -908,6 +923,9 @@ int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk,
>   	struct rb_node *node;
>   	int found = 0;
> 
> +	if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
> +		return 0;
> +
>   	trace_ext4_es_lookup_extent_enter(inode, lblk);
>   	es_debug("lookup extent in block %u\n", lblk);
> 
> @@ -1419,6 +1437,9 @@ int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
>   	int err = 0;
>   	int reserved = 0;
> 
> +	if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
> +		return 0;
> +
>   	trace_ext4_es_remove_extent(inode, lblk, len);
>   	es_debug("remove [%u/%u) from extent status tree of inode %lu\n",
>   		 lblk, len, inode->i_ino);
> @@ -1969,6 +1990,9 @@ int ext4_es_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk,
>   	struct extent_status newes;
>   	int err = 0;
> 
> +	if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
> +		return 0;
> +
>   	es_debug("add [%u/1) delayed to extent status tree of inode %lu\n",
>   		 lblk, inode->i_ino);
> 
> diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c
> index 6e251b5682b4..63429076ad59 100644
> --- a/fs/ext4/fast_commit.c
> +++ b/fs/ext4/fast_commit.c
> @@ -170,7 +170,8 @@ void ext4_fc_start_update(struct inode *inode)
>   {
>   	struct ext4_inode_info *ei = EXT4_I(inode);
> 
> -	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT))
> +	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
> +	    (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
>   		return;
> 
>   restart:
> @@ -209,7 +210,8 @@ void ext4_fc_stop_update(struct inode *inode)
>   {
>   	struct ext4_inode_info *ei = EXT4_I(inode);
> 
> -	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT))
> +	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
> +	    (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
>   		return;
> 
>   	if (atomic_dec_and_test(&ei->i_fc_updates))
> @@ -224,11 +226,8 @@ void ext4_fc_del(struct inode *inode)
>   {
>   	struct ext4_inode_info *ei = EXT4_I(inode);
> 
> -	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT))
> -		return;
> -
> -
> -	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT))
> +	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
> +	    (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
>   		return;
> 
>   restart:
> @@ -270,6 +269,10 @@ void ext4_fc_mark_ineligible(struct super_block *sb, int reason)
>   {
>   	struct ext4_sb_info *sbi = EXT4_SB(sb);
> 
> +	if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
> +	    (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
> +		return;
> +
>   	sbi->s_mount_state |= EXT4_FC_INELIGIBLE;
>   	WARN_ON(reason >= EXT4_FC_REASON_MAX);
>   	sbi->s_fc_stats.fc_ineligible_reason_count[reason]++;
> @@ -283,6 +286,10 @@ void ext4_fc_start_ineligible(struct super_block *sb, int reason)
>   {
>   	struct ext4_sb_info *sbi = EXT4_SB(sb);
> 
> +	if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
> +	    (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
> +		return;
> +
>   	WARN_ON(reason >= EXT4_FC_REASON_MAX);
>   	sbi->s_fc_stats.fc_ineligible_reason_count[reason]++;
>   	atomic_inc(&sbi->s_fc_ineligible_updates);
> @@ -295,6 +302,10 @@ void ext4_fc_start_ineligible(struct super_block *sb, int reason)
>    */
>   void ext4_fc_stop_ineligible(struct super_block *sb)
>   {
> +	if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
> +	    (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
> +		return;
> +
>   	EXT4_SB(sb)->s_mount_state |= EXT4_FC_INELIGIBLE;
>   	atomic_dec(&EXT4_SB(sb)->s_fc_ineligible_updates);
>   }
> @@ -325,7 +336,8 @@ static int ext4_fc_track_template(
>   	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
>   	int ret;
> 
> -	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT))
> +	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
> +	    (sbi->s_mount_state & EXT4_FC_REPLAY))
>   		return -EOPNOTSUPP;
> 
>   	if (ext4_fc_is_ineligible(inode->i_sb))
> @@ -1214,13 +1226,864 @@ static void ext4_fc_cleanup(journal_t *journal, int full)
>   	trace_ext4_fc_stats(sb);
>   }
> 
> +/* Ext4 Replay Path Routines */
> +
> +/* Get length of a particular tlv */
> +static inline int ext4_fc_tag_len(struct ext4_fc_tl *tl)
> +{
> +	return le16_to_cpu(tl->fc_len);
> +}
> +
> +/* Get a pointer to "value" of a tlv */
> +static inline u8 *ext4_fc_tag_val(struct ext4_fc_tl *tl)
> +{
> +	return (u8 *)tl + sizeof(*tl);
> +}
> +
> +/* Helper struct for dentry replay routines */
> +struct dentry_info_args {
> +	int parent_ino, dname_len, ino, inode_len;
> +	char *dname;
> +};
> +
> +static inline void tl_to_darg(struct dentry_info_args *darg,
> +				struct  ext4_fc_tl *tl)
> +{
> +	struct ext4_fc_dentry_info *fcd;
> +
> +	fcd = (struct ext4_fc_dentry_info *)ext4_fc_tag_val(tl);
> +
> +	darg->parent_ino = le32_to_cpu(fcd->fc_parent_ino);
> +	darg->ino = le32_to_cpu(fcd->fc_ino);
> +	darg->dname = fcd->fc_dname;
> +	darg->dname_len = ext4_fc_tag_len(tl) -
> +			sizeof(struct ext4_fc_dentry_info);
> +}
> +
> +/* Unlink replay function */
> +static int ext4_fc_replay_unlink(struct super_block *sb, struct ext4_fc_tl *tl)
> +{
> +	struct inode *inode, *old_parent;
> +	struct qstr entry;
> +	struct dentry_info_args darg;
> +	int ret = 0;
> +
> +	tl_to_darg(&darg, tl);
> +
> +	trace_ext4_fc_replay(sb, EXT4_FC_TAG_UNLINK, darg.ino,
> +			darg.parent_ino, darg.dname_len);
> +
> +	entry.name = darg.dname;
> +	entry.len = darg.dname_len;
> +	inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
> +
> +	if (IS_ERR_OR_NULL(inode)) {
> +		jbd_debug(1, "Inode %d not found", darg.ino);
> +		return 0;
> +	}
> +
> +	old_parent = ext4_iget(sb, darg.parent_ino,
> +				EXT4_IGET_NORMAL);
> +	if (IS_ERR_OR_NULL(old_parent)) {
> +		jbd_debug(1, "Dir with inode  %d not found", darg.parent_ino);
> +		iput(inode);
> +		return 0;
> +	}
> +
> +	ret = __ext4_unlink(old_parent, &entry, inode);
> +	/* -ENOENT ok coz it might not exist anymore. */
> +	if (ret == -ENOENT)
> +		ret = 0;
> +	iput(old_parent);
> +	iput(inode);
> +	return ret;
> +}
> +
> +static int ext4_fc_replay_link_internal(struct super_block *sb,
> +				struct dentry_info_args *darg,
> +				struct inode *inode)
> +{
> +	struct inode *dir = NULL;
> +	struct dentry *dentry_dir = NULL, *dentry_inode = NULL;
> +	struct qstr qstr_dname = QSTR_INIT(darg->dname, darg->dname_len);
> +	int ret = 0;
> +
> +	dir = ext4_iget(sb, darg->parent_ino, EXT4_IGET_NORMAL);
> +	if (IS_ERR(dir)) {
> +		jbd_debug(1, "Dir with inode %d not found.", darg->parent_ino);
> +		dir = NULL;
> +		goto out;
> +	}
> +
> +	dentry_dir = d_obtain_alias(dir);
> +	if (IS_ERR(dentry_dir)) {
> +		jbd_debug(1, "Failed to obtain dentry");
> +		dentry_dir = NULL;
> +		goto out;
> +	}
> +
> +	dentry_inode = d_alloc(dentry_dir, &qstr_dname);
> +	if (!dentry_inode) {
> +		jbd_debug(1, "Inode dentry not created.");
> +		ret = -ENOMEM;
> +		goto out;
> +	}
> +
> +	ret = __ext4_link(dir, inode, dentry_inode);
> +	/*
> +	 * It's possible that link already existed since data blocks
> +	 * for the dir in question got persisted before we crashed OR
> +	 * we replayed this tag and crashed before the entire replay
> +	 * could complete.
> +	 */
> +	if (ret && ret != -EEXIST) {
> +		jbd_debug(1, "Failed to link\n");
> +		goto out;
> +	}
> +
> +	ret = 0;
> +out:
> +	if (dentry_dir) {
> +		d_drop(dentry_dir);
> +		dput(dentry_dir);
> +	} else if (dir) {
> +		iput(dir);
> +	}
> +	if (dentry_inode) {
> +		d_drop(dentry_inode);
> +		dput(dentry_inode);
> +	}
> +
> +	return ret;
> +}
> +
> +/* Link replay function */
> +static int ext4_fc_replay_link(struct super_block *sb, struct ext4_fc_tl *tl)
> +{
> +	struct inode *inode;
> +	struct dentry_info_args darg;
> +	int ret = 0;
> +
> +	tl_to_darg(&darg, tl);
> +	trace_ext4_fc_replay(sb, EXT4_FC_TAG_LINK, darg.ino,
> +			darg.parent_ino, darg.dname_len);
> +
> +	inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
> +	if (IS_ERR_OR_NULL(inode)) {
> +		jbd_debug(1, "Inode not found.");
> +		return 0;
> +	}
> +
> +	ret = ext4_fc_replay_link_internal(sb, &darg, inode);
> +	iput(inode);
> +	return ret;
> +}
> +
> +/*
> + * Record all the modified inodes during replay. We use this later to setup
> + * block bitmaps correctly.
> + */
> +static int ext4_fc_record_modified_inode(struct super_block *sb, int ino)
> +{
> +	struct ext4_fc_replay_state *state;
> +	int i;
> +
> +	state = &EXT4_SB(sb)->s_fc_replay_state;
> +	for (i = 0; i < state->fc_modified_inodes_used; i++)
> +		if (state->fc_modified_inodes[i] == ino)
> +			return 0;
> +	if (state->fc_modified_inodes_used == state->fc_modified_inodes_size) {
> +		state->fc_modified_inodes_size +=
> +			EXT4_FC_REPLAY_REALLOC_INCREMENT;
> +		state->fc_modified_inodes = krealloc(
> +					state->fc_modified_inodes, sizeof(int) *
> +					state->fc_modified_inodes_size,
> +					GFP_KERNEL);
> +		if (!state->fc_modified_inodes)
> +			return -ENOMEM;
> +	}
> +	state->fc_modified_inodes[state->fc_modified_inodes_used++] = ino;
> +	return 0;
> +}
> +
> +/*
> + * Inode replay function
> + *
> + * If the tag is EXT4_FC_TAG_INODE_FULL, copy the entire inode to its location.
> + * If the tag is EXT4_FC_TAG_INODE_PARTIAL, copy everything except i_block.
> + * This is useful if i_block has been modified due to previous ADD_RANGE /
> + * DEL_RANGE tags.
> + */
> +static int ext4_fc_replay_inode(struct super_block *sb, struct ext4_fc_tl *tl)
> +{
> +	struct ext4_fc_inode *fc_inode;
> +	u8 *raw_fc_inode;
> +	struct inode *inode = NULL;
> +	struct ext4_iloc iloc;
> +	int inode_len, ino, ret, tag = le16_to_cpu(tl->fc_tag);
> +
> +	fc_inode = (struct ext4_fc_inode *)ext4_fc_tag_val(tl);
> +
> +	ino = le32_to_cpu(fc_inode->fc_ino);
> +	trace_ext4_fc_replay(sb, tag, ino, 0, 0);
> +
> +	inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
> +	if (!IS_ERR_OR_NULL(inode)) {
> +		ext4_ext_clear_bb(inode);
> +		iput(inode);
> +	}
> +
> +	ext4_fc_record_modified_inode(sb, ino);
> +
> +	raw_fc_inode = fc_inode->fc_raw_inode;
> +	ret = ext4_get_fc_inode_loc(sb, ino, &iloc);
> +	if (ret)
> +		goto out;
> +
> +	inode_len = ext4_fc_tag_len(tl) - sizeof(struct ext4_fc_inode);
> +
> +	if (tag == EXT4_FC_TAG_INODE_FULL) {
> +		memcpy(ext4_raw_inode(&iloc), raw_fc_inode, inode_len);
> +	} else {
> +		memcpy(ext4_raw_inode(&iloc), raw_fc_inode,
> +			offsetof(struct ext4_inode, i_block));
> +		memcpy(&ext4_raw_inode(&iloc)->i_generation,
> +			&((struct ext4_inode *)(raw_fc_inode))->i_generation,
> +			inode_len -
> +			offsetof(struct ext4_inode, i_generation));
> +	}
> +
> +	/* Immediately update the inode on disk. */
> +	ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
> +	sync_dirty_buffer(iloc.bh);
> +
> +	ret = ext4_mark_inode_used(sb, ino);
> +	if (ret)
> +		goto out;
> +
> +	/* Given that we just wrote the inode on disk, this SHOULD succeed. */
> +	inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
> +	if (IS_ERR_OR_NULL(inode)) {
> +		jbd_debug(1, "Inode not found.");
> +		return -EFSCORRUPTED;
> +	}
> +
> +	/*
> +	 * Our allocator could have made different decisions than before
> +	 * crashing. This should be fixed but until then, we calculate
> +	 * the number of blocks the inode.
> +	 */
> +	if (tag == EXT4_FC_TAG_INODE_PARTIAL)
> +		ext4_ext_replay_set_iblocks(inode);
> +
> +	inode->i_generation = le32_to_cpu(ext4_raw_inode(&iloc)->i_generation);
> +	ext4_reset_inode_seed(inode);
> +
> +	ext4_inode_csum_set(inode, ext4_raw_inode(&iloc), EXT4_I(inode));
> +	ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
> +	sync_dirty_buffer(iloc.bh);
> +	brelse(iloc.bh);
> +out:
> +	iput(inode);
> +	if (!ret)
> +		blkdev_issue_flush(sb->s_bdev, GFP_KERNEL);
> +
> +	return 0;
> +}
> +
> +/*
> + * Dentry create replay function.
> + *
> + * EXT4_FC_TAG_CREAT is preceded by EXT4_FC_TAG_INODE_FULL. Which means, the
> + * inode for which we are trying to create a dentry here, should already have
> + * been replayed before we start here.
> + */
> +static int ext4_fc_replay_create(struct super_block *sb, struct ext4_fc_tl *tl)
> +{
> +	int ret = 0;
> +	struct inode *inode = NULL;
> +	struct inode *dir = NULL;
> +	struct dentry_info_args darg;
> +
> +	tl_to_darg(&darg, tl);
> +
> +	trace_ext4_fc_replay(sb, EXT4_FC_TAG_CREAT, darg.ino,
> +			darg.parent_ino, darg.dname_len);
> +
> +	/* This takes care of update group descriptor and other metadata */
> +	ret = ext4_mark_inode_used(sb, darg.ino);
> +	if (ret)
> +		goto out;
> +
> +	inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
> +	if (IS_ERR_OR_NULL(inode)) {
> +		jbd_debug(1, "inode %d not found.", darg.ino);
> +		inode = NULL;
> +		ret = -EINVAL;
> +		goto out;
> +	}
> +
> +	if (S_ISDIR(inode->i_mode)) {
> +		/*
> +		 * If we are creating a directory, we need to make sure that the
> +		 * dot and dot dot dirents are setup properly.
> +		 */
> +		dir = ext4_iget(sb, darg.parent_ino, EXT4_IGET_NORMAL);
> +		if (IS_ERR_OR_NULL(dir)) {
> +			jbd_debug(1, "Dir %d not found.", darg.ino);
> +			goto out;
> +		}
> +		ret = ext4_init_new_dir(NULL, dir, inode);
> +		iput(dir);
> +		if (ret) {
> +			ret = 0;
> +			goto out;
> +		}
> +	}
> +	ret = ext4_fc_replay_link_internal(sb, &darg, inode);
> +	if (ret)
> +		goto out;
> +	set_nlink(inode, 1);
> +	ext4_mark_inode_dirty(NULL, inode);
> +out:
> +	if (inode)
> +		iput(inode);
> +	return ret;
> +}
> +
> +/*
> + * Record physical disk regions which are in use as per fast commit area. Our
> + * simple replay phase allocator excludes these regions from allocation.
> + */
> +static int ext4_fc_record_regions(struct super_block *sb, int ino,
> +		ext4_lblk_t lblk, ext4_fsblk_t pblk, int len)
> +{
> +	struct ext4_fc_replay_state *state;
> +	struct ext4_fc_alloc_region *region;
> +
> +	state = &EXT4_SB(sb)->s_fc_replay_state;
> +	if (state->fc_regions_used == state->fc_regions_size) {
> +		state->fc_regions_size +=
> +			EXT4_FC_REPLAY_REALLOC_INCREMENT;
> +		state->fc_regions = krealloc(
> +					state->fc_regions,
> +					state->fc_regions_size *
> +					sizeof(struct ext4_fc_alloc_region),
> +					GFP_KERNEL);
> +		if (!state->fc_regions)
> +			return -ENOMEM;
> +	}
> +	region = &state->fc_regions[state->fc_regions_used++];
> +	region->ino = ino;
> +	region->lblk = lblk;
> +	region->pblk = pblk;
> +	region->len = len;
> +
> +	return 0;
> +}
> +
> +/* Replay add range tag */
> +static int ext4_fc_replay_add_range(struct super_block *sb,
> +				struct ext4_fc_tl *tl)
> +{
> +	struct ext4_fc_add_range *fc_add_ex;
> +	struct ext4_extent newex, *ex;
> +	struct inode *inode;
> +	ext4_lblk_t start, cur;
> +	int remaining, len;
> +	ext4_fsblk_t start_pblk;
> +	struct ext4_map_blocks map;
> +	struct ext4_ext_path *path = NULL;
> +	int ret;
> +
> +	fc_add_ex = (struct ext4_fc_add_range *)ext4_fc_tag_val(tl);
> +	ex = (struct ext4_extent *)&fc_add_ex->fc_ex;
> +
> +	trace_ext4_fc_replay(sb, EXT4_FC_TAG_ADD_RANGE,
> +		le32_to_cpu(fc_add_ex->fc_ino), le32_to_cpu(ex->ee_block),
> +		ext4_ext_get_actual_len(ex));
> +
> +	inode = ext4_iget(sb, le32_to_cpu(fc_add_ex->fc_ino),
> +				EXT4_IGET_NORMAL);
> +	if (IS_ERR_OR_NULL(inode)) {
> +		jbd_debug(1, "Inode not found.");
> +		return 0;
> +	}
> +
> +	ret = ext4_fc_record_modified_inode(sb, inode->i_ino);
> +
> +	start = le32_to_cpu(ex->ee_block);
> +	start_pblk = ext4_ext_pblock(ex);
> +	len = ext4_ext_get_actual_len(ex);
> +
> +	cur = start;
> +	remaining = len;
> +	jbd_debug(1, "ADD_RANGE, lblk %d, pblk %lld, len %d, unwritten %d, inode %ld\n",
> +		  start, start_pblk, len, ext4_ext_is_unwritten(ex),
> +		  inode->i_ino);
> +
> +	while (remaining > 0) {
> +		map.m_lblk = cur;
> +		map.m_len = remaining;
> +		map.m_pblk = 0;
> +		ret = ext4_map_blocks(NULL, inode, &map, 0);
> +
> +		if (ret < 0) {
> +			iput(inode);
> +			return 0;
> +		}
> +
> +		if (ret == 0) {
> +			/* Range not mapped */
> +			path = ext4_find_extent(inode, cur, NULL, 0);
> +			if (!path)
> +				continue;
> +			memset(&newex, 0, sizeof(newex));
> +			newex.ee_block = cpu_to_le32(cur);
> +			ext4_ext_store_pblock(
> +				&newex, start_pblk + cur - start);
> +			newex.ee_len = cpu_to_le16(map.m_len);
> +			if (ext4_ext_is_unwritten(ex))
> +				ext4_ext_mark_unwritten(&newex);
> +			down_write(&EXT4_I(inode)->i_data_sem);
> +			ret = ext4_ext_insert_extent(
> +				NULL, inode, &path, &newex, 0);
> +			up_write((&EXT4_I(inode)->i_data_sem));
> +			ext4_ext_drop_refs(path);
> +			kfree(path);
> +			if (ret) {
> +				iput(inode);
> +				return 0;
> +			}
> +			goto next;
> +		}
> +
> +		if (start_pblk + cur - start != map.m_pblk) { > +			/* Logical to physical mapping changed */


Sorry I am not sure if I understand this correctly. Can we pls put more
comments on when and how can this condition happen?
I am sure I am mising something.

Also what about if the mapping changed and the start pblk is different
but it's still an overlapping mapping?
Do we take care of that case here? why I ask this, because we are
clearing the block bitmaps for map.m_len below.

> +			ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
> +					ext4_ext_is_unwritten(ex),
> +					start_pblk + cur - start);
> +			if (ret) {
> +				iput(inode);
> +				return 0;
> +			}
> +			ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
> +			goto next;
> +		}
> +
> +		/* Range is mapped and needs a state change */
> +		jbd_debug(1, "Converting from %d to %d %lld",
> +				map.m_flags & EXT4_MAP_UNWRITTEN,
> +			ext4_ext_is_unwritten(ex), map.m_pblk);
> +		ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
> +					ext4_ext_is_unwritten(ex), map.m_pblk);
> +		if (ret) {
> +			iput(inode);
> +			return 0;
> +		}
> +		/*
> +		 * We may have split the extent tree while toggling the state.
> +		 * Try to shrink the exten tree now.

s/exten/extent



> +		 */
> +		ext4_ext_replay_shrink_inode(inode, start + len);
> +next:
> +		cur += map.m_len;
> +		remaining -= map.m_len;
> +	}
> +	ext4_ext_replay_shrink_inode(inode, i_size_read(inode) >>
> +					sb->s_blocksize_bits);
> +	iput(inode);
> +	return 0;
> +}
> +
> +/* Replay DEL_RANGE tag */
> +static int
> +ext4_fc_replay_del_range(struct super_block *sb, struct ext4_fc_tl *tl)
> +{
> +	struct inode *inode;
> +	struct ext4_fc_del_range *lrange;
> +	struct ext4_map_blocks map;
> +	ext4_lblk_t cur, remaining;
> +	int ret;
> +
> +	lrange = (struct ext4_fc_del_range *)ext4_fc_tag_val(tl);
> +	cur = le32_to_cpu(lrange->fc_lblk);
> +	remaining = le32_to_cpu(lrange->fc_len);
> +
> +	trace_ext4_fc_replay(sb, EXT4_FC_TAG_DEL_RANGE,
> +		le32_to_cpu(lrange->fc_ino), cur, remaining);
> +
> +	inode = ext4_iget(sb, le32_to_cpu(lrange->fc_ino), EXT4_IGET_NORMAL);
> +	if (IS_ERR_OR_NULL(inode)) {
> +		jbd_debug(1, "Inode %d not found", le32_to_cpu(lrange->fc_ino));
> +		return 0;
> +	}
> +
> +	ret = ext4_fc_record_modified_inode(sb, inode->i_ino);
> +
> +	jbd_debug(1, "DEL_RANGE, inode %ld, lblk %d, len %d\n",
> +			inode->i_ino, le32_to_cpu(lrange->fc_lblk),
> +			le32_to_cpu(lrange->fc_len));
> +	while (remaining > 0) {
> +		map.m_lblk = cur;
> +		map.m_len = remaining;
> +
> +		ret = ext4_map_blocks(NULL, inode, &map, 0);
> +		if (ret < 0) {
> +			iput(inode);
> +			return 0;
> +		}
> +		if (ret > 0) {
> +			remaining -= ret;
> +			cur += ret;
> +			ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
> +		} else {
> +			remaining -= map.m_len;
> +			cur += map.m_len;
> +		}
> +	}
> +
> +	ret = ext4_punch_hole(inode,
> +		le32_to_cpu(lrange->fc_lblk) << sb->s_blocksize_bits,
> +		le32_to_cpu(lrange->fc_len) <<  sb->s_blocksize_bits);
> +	if (ret)
> +		jbd_debug(1, "ext4_punch_hole returned %d", ret);
> +	ext4_ext_replay_shrink_inode(inode,
> +		i_size_read(inode) >> sb->s_blocksize_bits);
> +	ext4_mark_inode_dirty(NULL, inode);
> +	iput(inode);
> +
> +	return 0;
> +}
> +
> +static inline const char *tag2str(u16 tag)
> +{
> +	switch (tag) {
> +	case EXT4_FC_TAG_LINK:
> +		return "TAG_ADD_ENTRY";
> +	case EXT4_FC_TAG_UNLINK:
> +		return "TAG_DEL_ENTRY";
> +	case EXT4_FC_TAG_ADD_RANGE:
> +		return "TAG_ADD_RANGE";
> +	case EXT4_FC_TAG_CREAT:
> +		return "TAG_CREAT_DENTRY";
> +	case EXT4_FC_TAG_DEL_RANGE:
> +		return "TAG_DEL_RANGE";
> +	case EXT4_FC_TAG_INODE_FULL:
> +		return "TAG_INODE_FULL";
> +	case EXT4_FC_TAG_INODE_PARTIAL:
> +		return "TAG_INODE_PARTIAL";
> +	case EXT4_FC_TAG_PAD:
> +		return "TAG_PAD";
> +	case EXT4_FC_TAG_TAIL:
> +		return "TAG_TAIL";
> +	case EXT4_FC_TAG_HEAD:
> +		return "TAG_HEAD";
> +	default:
> +		return "TAG_ERROR";
> +	}
> +}
> +
> +void ext4_fc_set_bitmaps_and_counters(struct super_block *sb)

static ?

> +{
> +	struct ext4_fc_replay_state *state;
> +	struct inode *inode;
> +	struct ext4_ext_path *path = NULL;
> +	struct ext4_map_blocks map;
> +	int i, ret, j;
> +	ext4_lblk_t cur, end;
> +
> +	state = &EXT4_SB(sb)->s_fc_replay_state;
> +	for (i = 0; i < state->fc_modified_inodes_used; i++) {
> +		inode = ext4_iget(sb, state->fc_modified_inodes[i],
> +			EXT4_IGET_NORMAL);
> +		if (IS_ERR_OR_NULL(inode)) {
> +			jbd_debug(1, "Inode %d not found.",
> +				state->fc_modified_inodes[i]);
> +			continue;
> +		}
> +		cur = 0;
> +		end = EXT_MAX_BLOCKS;
> +		while (cur < end) {
> +			map.m_lblk = cur;
> +			map.m_len = end - cur;
> +
> +			ret = ext4_map_blocks(NULL, inode, &map, 0);
> +			if (ret < 0)
> +				break;
> +
> +			if (ret > 0) {
> +				path = ext4_find_extent(inode, map.m_lblk, NULL, 0);
> +				if (!IS_ERR_OR_NULL(path)) {
> +					for (j = 0; j < path->p_depth; j++)
> +						ext4_mb_mark_bb(inode->i_sb,
> +							path[j].p_block, 1, 1);
> +					ext4_ext_drop_refs(path);
> +					kfree(path);
> +				}
> +				cur += ret;
> +				ext4_mb_mark_bb(inode->i_sb, map.m_pblk,
> +							map.m_len, 1);
> +			} else {
> +				cur = cur + (map.m_len ? map.m_len : 1);
> +			}
> +		}
> +		iput(inode);
> +	}
> +}
> +
> +/*
> + * Check if block is in excluded regions for block allocation. The simple
> + * allocator that runs during replay phase is calls this function to see
> + * if it is okay to use a block.
> + */
> +bool ext4_fc_replay_check_excluded(struct super_block *sb, ext4_fsblk_t blk)
> +{
> +	int i;
> +	struct ext4_fc_replay_state *state;
> +
> +	state = &EXT4_SB(sb)->s_fc_replay_state;
> +	for (i = 0; i < state->fc_regions_valid; i++) {
> +		if (state->fc_regions[i].ino == 0 ||
> +			state->fc_regions[i].len == 0)
> +			continue;
> +		if (blk >= state->fc_regions[i].pblk &&
> +		    blk < state->fc_regions[i].pblk + state->fc_regions[i].len)
> +			return true;
> +	}
> +	return false;
> +}
> +
> +/* Cleanup function called after replay */
> +void ext4_fc_replay_cleanup(struct super_block *sb)
> +{
> +	struct ext4_sb_info *sbi = EXT4_SB(sb);
> +
> +	sbi->s_mount_state &= ~EXT4_FC_REPLAY;
> +	kfree(sbi->s_fc_replay_state.fc_regions);
> +	kfree(sbi->s_fc_replay_state.fc_modified_inodes);
> +}
> +
> +/*
> + * Recovery Scan phase handler
> + *
> + * This function is called during the scan phase and is responsible
> + * for doing following things:
> + * - Make sure the fast commit area has valid tags for replay
> + * - Count number of tags that need to be replayed by the replay handler
> + * - Verify CRC
> + * - Create a list of excluded blocks for allocation during replay phase
> + *
> + * This function returns JBD2_FC_REPLAY_CONTINUE to indicate that SCAN is
> + * incomplete and JBD2 should send more blocks. It returns JBD2_FC_REPLAY_STOP
> + * to indicate that scan has finished and JBD2 can now start replay phase.
> + * It returns a negative error to indicate that there was an error. At the end
> + * of a successful scan phase, sbi->s_fc_replay_state.fc_replay_num_tags is set
> + * to indicate the number of tags that need to replayed during the replay phase.
> + */
> +static int ext4_fc_replay_scan(journal_t *journal,
> +				struct buffer_head *bh, int off,
> +				tid_t expected_tid)
> +{
> +	struct super_block *sb = journal->j_private;
> +	struct ext4_sb_info *sbi = EXT4_SB(sb);
> +	struct ext4_fc_replay_state *state;
> +	int ret = JBD2_FC_REPLAY_CONTINUE;
> +	struct ext4_fc_add_range *ext;
> +	struct ext4_fc_tl *tl;
> +	struct ext4_fc_tail *tail;
> +	__u8 *start, *end;
> +	struct ext4_fc_head *head;
> +	struct ext4_extent *ex;
> +
> +	state = &sbi->s_fc_replay_state;
> +
> +	start = (u8 *)bh->b_data;
> +	end = (__u8 *)bh->b_data + journal->j_blocksize - 1;
> +
> +	if (state->fc_replay_expected_off == 0) {
> +		state->fc_cur_tag = 0;
> +		state->fc_replay_num_tags = 0;
> +		state->fc_crc = 0;
> +		state->fc_regions = NULL;
> +		state->fc_regions_valid = state->fc_regions_used =
> +			state->fc_regions_size = 0;
> +		/* Check if we can stop early */
> +		if (le16_to_cpu(((struct ext4_fc_tl *)start)->fc_tag)
> +			!= EXT4_FC_TAG_HEAD)
> +			return 0;
> +	}
> +
> +	if (off != state->fc_replay_expected_off) {
> +		ret = -EFSCORRUPTED;
> +		goto out_err;
> +	}
> +
> +	state->fc_replay_expected_off++;
> +	fc_for_each_tl(start, end, tl) {
> +		jbd_debug(3, "Scan phase, tag:%s, blk %lld\n",
> +			  tag2str(le16_to_cpu(tl->fc_tag)), bh->b_blocknr);
> +		switch (le16_to_cpu(tl->fc_tag)) {
> +		case EXT4_FC_TAG_ADD_RANGE:
> +			ext = (struct ext4_fc_add_range *)ext4_fc_tag_val(tl);
> +			ex = (struct ext4_extent *)&ext->fc_ex;
> +			ret = ext4_fc_record_regions(sb,
> +				le32_to_cpu(ext->fc_ino),
> +				le32_to_cpu(ex->ee_block), ext4_ext_pblock(ex),
> +				ext4_ext_get_actual_len(ex));
> +			if (ret < 0)
> +				break;
> +			ret = JBD2_FC_REPLAY_CONTINUE;
> +			fallthrough;
> +		case EXT4_FC_TAG_DEL_RANGE:
> +		case EXT4_FC_TAG_LINK:
> +		case EXT4_FC_TAG_UNLINK:
> +		case EXT4_FC_TAG_CREAT:
> +		case EXT4_FC_TAG_INODE_FULL:
> +		case EXT4_FC_TAG_INODE_PARTIAL:
> +		case EXT4_FC_TAG_PAD:
> +			state->fc_cur_tag++;
> +			state->fc_crc = ext4_chksum(sbi, state->fc_crc, tl,
> +					sizeof(*tl) + ext4_fc_tag_len(tl));
> +			break;
> +		case EXT4_FC_TAG_TAIL:
> +			state->fc_cur_tag++;
> +			tail = (struct ext4_fc_tail *)ext4_fc_tag_val(tl);
> +			state->fc_crc = ext4_chksum(sbi, state->fc_crc, tl,
> +						sizeof(*tl) +
> +						offsetof(struct ext4_fc_tail,
> +						fc_crc));
> +			if (le32_to_cpu(tail->fc_tid) == expected_tid &&
> +				le32_to_cpu(tail->fc_crc) == state->fc_crc) {
> +				state->fc_replay_num_tags = state->fc_cur_tag;
> +				state->fc_regions_valid =
> +					state->fc_regions_used;
> +			} else {
> +				ret = state->fc_replay_num_tags ?
> +					JBD2_FC_REPLAY_STOP : -EFSBADCRC;
> +			}
> +			state->fc_crc = 0;
> +			break;
> +		case EXT4_FC_TAG_HEAD:
> +			head = (struct ext4_fc_head *)ext4_fc_tag_val(tl);
> +			if (le32_to_cpu(head->fc_features) &
> +				~EXT4_FC_SUPPORTED_FEATURES) {
> +				ret = -EOPNOTSUPP;
> +				break;
> +			}
> +			if (le32_to_cpu(head->fc_tid) != expected_tid) {
> +				ret = JBD2_FC_REPLAY_STOP;
> +				break;
> +			}
> +			state->fc_cur_tag++;
> +			state->fc_crc = ext4_chksum(sbi, state->fc_crc, tl,
> +					sizeof(*tl) + ext4_fc_tag_len(tl));


why do we need to calculate state->fc_crc for HEAD?
I don't see we comparing this anywhere right? anything I missed?

> +			break;
> +		default:
> +			ret = state->fc_replay_num_tags ?
> +				JBD2_FC_REPLAY_STOP : -ECANCELED;
> +		}
> +		if (ret < 0 || ret == JBD2_FC_REPLAY_STOP)
> +			break;
> +	}
> +
> +out_err:
> +	trace_ext4_fc_replay_scan(sb, ret, off);
> +	return ret;
> +}
> +
>   /*
>    * Main recovery path entry point.
> + * The meaning of return codes is similar as above.
>    */
>   static int ext4_fc_replay(journal_t *journal, struct buffer_head *bh,
>   				enum passtype pass, int off, tid_t expected_tid)
>   {
> -	return 0;
> +	struct super_block *sb = journal->j_private;
> +	struct ext4_sb_info *sbi = EXT4_SB(sb);
> +	struct ext4_fc_tl *tl;
> +	__u8 *start, *end;
> +	int ret = JBD2_FC_REPLAY_CONTINUE;
> +	struct ext4_fc_replay_state *state = &sbi->s_fc_replay_state;
> +	struct ext4_fc_tail *tail;
> +
> +	if (pass == PASS_SCAN) {
> +		state->fc_current_pass = PASS_SCAN;
> +		return ext4_fc_replay_scan(journal, bh, off, expected_tid);
> +	}
> +
> +	if (state->fc_current_pass != pass) {
> +		state->fc_current_pass = pass;
> +		sbi->s_mount_state |= EXT4_FC_REPLAY;
> +	}
> +	if (!sbi->s_fc_replay_state.fc_replay_num_tags) {
> +		jbd_debug(1, "Replay stops\n");
> +		ext4_fc_set_bitmaps_and_counters(sb);
> +		return 0;
> +	}
> +
> +#ifdef CONFIG_EXT4_DEBUG
> +	if (sbi->s_fc_debug_max_replay && off >= sbi->s_fc_debug_max_replay) {
> +		pr_warn("Dropping fc block %d because max_replay set\n", off);
> +		return -EINVAL;
> +	}
> +#endif
> +
> +	start = (u8 *)bh->b_data;
> +	end = (__u8 *)bh->b_data + journal->j_blocksize - 1;
> +
> +	fc_for_each_tl(start, end, tl) {
> +		if (state->fc_replay_num_tags == 0) {
> +			ret = JBD2_FC_REPLAY_STOP;
> +			ext4_fc_set_bitmaps_and_counters(sb);
> +			break;
> +		}
> +		jbd_debug(3, "Replay phase, tag:%s\n",
> +				tag2str(le16_to_cpu(tl->fc_tag)));
> +		state->fc_replay_num_tags--;
> +		switch (le16_to_cpu(tl->fc_tag)) {
> +		case EXT4_FC_TAG_LINK:
> +			ret = ext4_fc_replay_link(sb, tl);
> +			break;
> +		case EXT4_FC_TAG_UNLINK:
> +			ret = ext4_fc_replay_unlink(sb, tl);
> +			break;
> +		case EXT4_FC_TAG_ADD_RANGE:
> +			ret = ext4_fc_replay_add_range(sb, tl);
> +			break;
> +		case EXT4_FC_TAG_CREAT:
> +			ret = ext4_fc_replay_create(sb, tl);
> +			break;
> +		case EXT4_FC_TAG_DEL_RANGE:
> +			ret = ext4_fc_replay_del_range(sb, tl);
> +			break;
> +		case EXT4_FC_TAG_INODE_PARTIAL:
> +		case EXT4_FC_TAG_INODE_FULL:
> +			ret = ext4_fc_replay_inode(sb, tl);
> +			break;
> +		case EXT4_FC_TAG_PAD:
> +			trace_ext4_fc_replay(sb, EXT4_FC_TAG_PAD, 0,
> +				ext4_fc_tag_len(tl), 0);
> +			break;
> +		case EXT4_FC_TAG_TAIL:
> +			trace_ext4_fc_replay(sb, EXT4_FC_TAG_TAIL, 0,
> +				ext4_fc_tag_len(tl), 0);
> +			tail = (struct ext4_fc_tail *)ext4_fc_tag_val(tl);
> +			WARN_ON(le32_to_cpu(tail->fc_tid) != expected_tid);
> +			break;
> +		case EXT4_FC_TAG_HEAD:
> +			break;
> +		default:
> +			trace_ext4_fc_replay(sb, le16_to_cpu(tl->fc_tag), 0,
> +				ext4_fc_tag_len(tl), 0);
> +			ret = -ECANCELED;
> +			break;
> +		}
> +		if (ret < 0)
> +			break;
> +		ret = JBD2_FC_REPLAY_CONTINUE;
> +	}
> +	return ret;
>   }
> 
>   void ext4_fc_init(struct super_block *sb, journal_t *journal)
> diff --git a/fs/ext4/fast_commit.h b/fs/ext4/fast_commit.h
> index a541d2bbe24b..cf4d8772d055 100644
> --- a/fs/ext4/fast_commit.h
> +++ b/fs/ext4/fast_commit.h
> @@ -117,4 +117,44 @@ struct ext4_fc_stats {
>   	int fc_numblks;
>   };
> 
> +#define EXT4_FC_REPLAY_REALLOC_INCREMENT	4
> +
> +/*
> + * Physical block regions added to different inodes due to fast commit
> + * recovery. These are set during the SCAN phase. During the replay phase,
> + * our allocator excludes these from its allocation. This ensures that
> + * we don't accidentally allocating a block that is going to be used by
> + * another inode.
> + */
> +struct ext4_fc_alloc_region {
> +	ext4_lblk_t lblk;
> +	ext4_fsblk_t pblk;
> +	int ino, len;
> +};
> +
> +/*
> + * Fast commit replay state.
> + */
> +struct ext4_fc_replay_state {
> +	int fc_replay_num_tags;
> +	int fc_replay_expected_off;
> +	int fc_current_pass;
> +	int fc_cur_tag;
> +	int fc_crc;
> +	struct ext4_fc_alloc_region *fc_regions;
> +	int fc_regions_size, fc_regions_used, fc_regions_valid;
> +	int *fc_modified_inodes;
> +	int fc_modified_inodes_used, fc_modified_inodes_size;
> +};
> +
> +#define region_last(__region) (((__region)->lblk) + ((__region)->len) - 1)
> +
> +#define fc_for_each_tl(__start, __end, __tl)				\
> +	for (tl = (struct ext4_fc_tl *)start;				\
> +		(u8 *)tl < (u8 *)end;					\
> +		tl = (struct ext4_fc_tl *)((u8 *)tl +			\
> +					sizeof(struct ext4_fc_tl) +	\
> +					+ le16_to_cpu(tl->fc_len)))
> +
> +
>   #endif /* __FAST_COMMIT_H__ */
> diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
> index df25d38d6539..db9b9eeb9560 100644
> --- a/fs/ext4/ialloc.c
> +++ b/fs/ext4/ialloc.c
> @@ -82,7 +82,12 @@ static int ext4_validate_inode_bitmap(struct super_block *sb,
>   				      struct buffer_head *bh)
>   {
>   	ext4_fsblk_t	blk;
> -	struct ext4_group_info *grp = ext4_get_group_info(sb, block_group);
> +	struct ext4_group_info *grp;
> +
> +	if (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY)
> +		return 0;
> +
> +	grp = ext4_get_group_info(sb, block_group);
> 
>   	if (buffer_verified(bh))
>   		return 0;
> @@ -284,15 +289,17 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
>   	bit = (ino - 1) % EXT4_INODES_PER_GROUP(sb);
>   	bitmap_bh = ext4_read_inode_bitmap(sb, block_group);
>   	/* Don't bother if the inode bitmap is corrupt. */
> -	grp = ext4_get_group_info(sb, block_group);
>   	if (IS_ERR(bitmap_bh)) {
>   		fatal = PTR_ERR(bitmap_bh);
>   		bitmap_bh = NULL;
>   		goto error_return;
>   	}
> -	if (unlikely(EXT4_MB_GRP_IBITMAP_CORRUPT(grp))) {
> -		fatal = -EFSCORRUPTED;
> -		goto error_return;
> +	if (!(sbi->s_mount_state & EXT4_FC_REPLAY)) {
> +		grp = ext4_get_group_info(sb, block_group);
> +		if (unlikely(EXT4_MB_GRP_IBITMAP_CORRUPT(grp))) {
> +			fatal = -EFSCORRUPTED;
> +			goto error_return;
> +		}
>   	}
> 
>   	BUFFER_TRACE(bitmap_bh, "get_write_access");
> @@ -742,6 +749,119 @@ static int find_inode_bit(struct super_block *sb, ext4_group_t group,
>   	return 1;
>   }
> 
> +int ext4_mark_inode_used(struct super_block *sb, int ino)
> +{
> +	unsigned long max_ino = le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count);
> +	struct buffer_head *inode_bitmap_bh = NULL, *group_desc_bh = NULL;
> +	struct ext4_group_desc *gdp;
> +	ext4_group_t group;
> +	int bit;
> +	int err = -EFSCORRUPTED;
> +
> +	if (ino < EXT4_FIRST_INO(sb) || ino > max_ino)
> +		goto out;
> +
> +	group = (ino - 1) / EXT4_INODES_PER_GROUP(sb);
> +	bit = (ino - 1) % EXT4_INODES_PER_GROUP(sb);
> +	inode_bitmap_bh = ext4_read_inode_bitmap(sb, group);
> +	if (IS_ERR(inode_bitmap_bh))
> +		return PTR_ERR(inode_bitmap_bh);
> +
> +	if (ext4_test_bit(bit, inode_bitmap_bh->b_data)) {
> +		err = 0;
> +		goto out;
> +	}
> +
> +	gdp = ext4_get_group_desc(sb, group, &group_desc_bh);
> +	if (!gdp || !group_desc_bh) {
> +		err = -EINVAL;
> +		goto out;
> +	}
> +
> +	ext4_set_bit(bit, inode_bitmap_bh->b_data);
> +
> +	BUFFER_TRACE(inode_bitmap_bh, "call ext4_handle_dirty_metadata");
> +	err = ext4_handle_dirty_metadata(NULL, NULL, inode_bitmap_bh);
> +	if (err) {
> +		ext4_std_error(sb, err);
> +		goto out;
> +	}
> +	sync_dirty_buffer(inode_bitmap_bh);

Shouldn't we handle error from sync_dirty_buffer()?

> +	BUFFER_TRACE(group_desc_bh, "get_write_access");

The above BUFFER_TRACE() is not correct. We should remove it from here.



> +
> +	/* We may have to initialize the block bitmap if it isn't already */
> +	if (ext4_has_group_desc_csum(sb) &&
> +	    gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
> +		struct buffer_head *block_bitmap_bh;
> +
> +		block_bitmap_bh = ext4_read_block_bitmap(sb, group);
> +		if (IS_ERR(block_bitmap_bh)) {
> +			err = PTR_ERR(block_bitmap_bh);
> +			goto out;
> +		}
> +
> +		BUFFER_TRACE(block_bitmap_bh, "dirty block bitmap");
> +		err = ext4_handle_dirty_metadata(NULL, NULL, block_bitmap_bh);
> +		sync_dirty_buffer(block_bitmap_bh);
> +
> +		/* recheck and clear flag under lock if we still need to */
> +		ext4_lock_group(sb, group);
> +		if (ext4_has_group_desc_csum(sb) &&
> +		    (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) {
> +			gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
> +			ext4_free_group_clusters_set(sb, gdp,
> +				ext4_free_clusters_after_init(sb, group, gdp));
> +			ext4_block_bitmap_csum_set(sb, group, gdp,
> +						   block_bitmap_bh);
> +			ext4_group_desc_csum_set(sb, group, gdp);
> +		}
> +		ext4_unlock_group(sb, group);
> +		brelse(block_bitmap_bh);
> +
> +		if (err) {
> +			ext4_std_error(sb, err);
> +			goto out;
> +		}
> +	}
> +
> +	/* Update the relevant bg descriptor fields */
> +	if (ext4_has_group_desc_csum(sb)) {
> +		int free;
> +
> +		ext4_lock_group(sb, group); /* while we modify the bg desc */
> +		free = EXT4_INODES_PER_GROUP(sb) -
> +			ext4_itable_unused_count(sb, gdp);
> +		if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) {
> +			gdp->bg_flags &= cpu_to_le16(~EXT4_BG_INODE_UNINIT);
> +			free = 0;
> +		}
> +
> +		/*
> +		 * Check the relative inode number against the last used
> +		 * relative inode number in this group. if it is greater
> +		 * we need to update the bg_itable_unused count
> +		 */
> +		if (bit >= free)
> +			ext4_itable_unused_set(sb, gdp,
> +					(EXT4_INODES_PER_GROUP(sb) - bit - 1));
> +	} else {
> +		ext4_lock_group(sb, group);
> +	}
> +
> +	ext4_free_inodes_set(sb, gdp, ext4_free_inodes_count(sb, gdp) - 1);
> +	if (ext4_has_group_desc_csum(sb)) {
> +		ext4_inode_bitmap_csum_set(sb, group, gdp, inode_bitmap_bh,
> +					   EXT4_INODES_PER_GROUP(sb) / 8);
> +		ext4_group_desc_csum_set(sb, group, gdp);
> +	}
> +
> +	ext4_unlock_group(sb, group);
> +	err = ext4_handle_dirty_metadata(NULL, NULL, group_desc_bh);
> +	sync_dirty_buffer(group_desc_bh);
> +out:
> +	return err;
> +}
> +
>   /*
>    * There are two policies for allocating an inode.  If the new inode is
>    * a directory, then a forward search is made for a block group with both
> @@ -771,7 +891,7 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir,
>   	struct inode *ret;
>   	ext4_group_t i;
>   	ext4_group_t flex_group;
> -	struct ext4_group_info *grp;
> +	struct ext4_group_info *grp = NULL;
>   	int encrypt = 0;
> 
>   	/* Cannot create files in a deleted directory */
> @@ -909,15 +1029,21 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir,
>   		if (ext4_free_inodes_count(sb, gdp) == 0)
>   			goto next_group;
> 
> -		grp = ext4_get_group_info(sb, group);
> -		/* Skip groups with already-known suspicious inode tables */
> -		if (EXT4_MB_GRP_IBITMAP_CORRUPT(grp))
> -			goto next_group;
> +		if (!(sbi->s_mount_state & EXT4_FC_REPLAY)) {
> +			grp = ext4_get_group_info(sb, group);
> +			/*
> +			 * Skip groups with already-known suspicious inode
> +			 * tables
> +			 */
> +			if (EXT4_MB_GRP_IBITMAP_CORRUPT(grp))
> +				goto next_group;
> +		}
> 
>   		brelse(inode_bitmap_bh);
>   		inode_bitmap_bh = ext4_read_inode_bitmap(sb, group);
>   		/* Skip groups with suspicious inode tables */
> -		if (EXT4_MB_GRP_IBITMAP_CORRUPT(grp) ||
> +		if (((!(sbi->s_mount_state & EXT4_FC_REPLAY))
> +		     && EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) ||
>   		    IS_ERR(inode_bitmap_bh)) {
>   			inode_bitmap_bh = NULL;
>   			goto next_group;
> @@ -936,7 +1062,7 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir,
>   			goto next_group;
>   		}
> 
> -		if (!handle) {
> +		if ((!(sbi->s_mount_state & EXT4_FC_REPLAY)) && !handle) {
>   			BUG_ON(nblocks <= 0);
>   			handle = __ext4_journal_start_sb(dir->i_sb, line_no,
>   				 handle_type, nblocks, 0,
> @@ -1040,9 +1166,15 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir,
>   	/* Update the relevant bg descriptor fields */
>   	if (ext4_has_group_desc_csum(sb)) {
>   		int free;
> -		struct ext4_group_info *grp = ext4_get_group_info(sb, group);
> -
> -		down_read(&grp->alloc_sem); /* protect vs itable lazyinit */
> +		struct ext4_group_info *grp = NULL;
> +
> +		if (!(sbi->s_mount_state & EXT4_FC_REPLAY)) {
> +			grp = ext4_get_group_info(sb, group);
> +			down_read(&grp->alloc_sem); /*
> +						     * protect vs itable
> +						     * lazyinit
> +						     */
> +		}
>   		ext4_lock_group(sb, group); /* while we modify the bg desc */
>   		free = EXT4_INODES_PER_GROUP(sb) -
>   			ext4_itable_unused_count(sb, gdp);
> @@ -1058,7 +1190,8 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir,
>   		if (ino > free)
>   			ext4_itable_unused_set(sb, gdp,
>   					(EXT4_INODES_PER_GROUP(sb) - ino));
> -		up_read(&grp->alloc_sem);
> +		if (!(sbi->s_mount_state & EXT4_FC_REPLAY))
> +			up_read(&grp->alloc_sem);
>   	} else {
>   		ext4_lock_group(sb, group);
>   	}
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index 26eed76812f9..9dce088171cc 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -101,8 +101,8 @@ static int ext4_inode_csum_verify(struct inode *inode, struct ext4_inode *raw,
>   	return provided == calculated;
>   }
> 
> -static void ext4_inode_csum_set(struct inode *inode, struct ext4_inode *raw,
> -				struct ext4_inode_info *ei)
> +void ext4_inode_csum_set(struct inode *inode, struct ext4_inode *raw,
> +			 struct ext4_inode_info *ei)
>   {
>   	__u32 csum;
> 
> @@ -514,7 +514,8 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
>   		return -EFSCORRUPTED;
> 
>   	/* Lookup extent status tree firstly */
> -	if (ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) {
> +	if (!(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) &&
> +	    ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) {
>   		if (ext4_es_is_written(&es) || ext4_es_is_unwritten(&es)) {
>   			map->m_pblk = ext4_es_pblock(&es) +
>   					map->m_lblk - es.es_lblk;
> @@ -827,7 +828,8 @@ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
>   	int create = map_flags & EXT4_GET_BLOCKS_CREATE;
>   	int err;
> 
> -	J_ASSERT(handle != NULL || create == 0);
> +	J_ASSERT((EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
> +		 || handle != NULL || create == 0);
> 
>   	map.m_lblk = block;
>   	map.m_len = 1;
> @@ -843,7 +845,8 @@ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
>   		return ERR_PTR(-ENOMEM);
>   	if (map.m_flags & EXT4_MAP_NEW) {
>   		J_ASSERT(create != 0);
> -		J_ASSERT(handle != NULL);
> +		J_ASSERT((EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
> +			 || (handle != NULL));
> 
>   		/*
>   		 * Now that we do not always journal data, we should
> @@ -4255,22 +4258,22 @@ int ext4_truncate(struct inode *inode)
>    * data in memory that is needed to recreate the on-disk version of this
>    * inode.
>    */
> -static int __ext4_get_inode_loc(struct inode *inode,
> -				struct ext4_iloc *iloc, int in_mem)
> +static int __ext4_get_inode_loc(struct super_block *sb, unsigned long ino,
> +				struct ext4_iloc *iloc, int in_mem,
> +				ext4_fsblk_t *ret_block)
>   {
>   	struct ext4_group_desc	*gdp;
>   	struct buffer_head	*bh;
> -	struct super_block	*sb = inode->i_sb;
>   	ext4_fsblk_t		block;
>   	struct blk_plug		plug;
>   	int			inodes_per_block, inode_offset;
> 
>   	iloc->bh = NULL;
> -	if (inode->i_ino < EXT4_ROOT_INO ||
> -	    inode->i_ino > le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count))
> +	if (ino < EXT4_ROOT_INO ||
> +	    ino > le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count))
>   		return -EFSCORRUPTED;
> 
> -	iloc->block_group = (inode->i_ino - 1) / EXT4_INODES_PER_GROUP(sb);
> +	iloc->block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb);
>   	gdp = ext4_get_group_desc(sb, iloc->block_group, NULL);
>   	if (!gdp)
>   		return -EIO;
> @@ -4279,7 +4282,7 @@ static int __ext4_get_inode_loc(struct inode *inode,
>   	 * Figure out the offset within the block group inode table
>   	 */
>   	inodes_per_block = EXT4_SB(sb)->s_inodes_per_block;
> -	inode_offset = ((inode->i_ino - 1) %
> +	inode_offset = ((ino - 1) %
>   			EXT4_INODES_PER_GROUP(sb));
>   	block = ext4_inode_table(sb, gdp) + (inode_offset / inodes_per_block);
>   	iloc->offset = (inode_offset % inodes_per_block) * EXT4_INODE_SIZE(sb);
> @@ -4380,7 +4383,7 @@ static int __ext4_get_inode_loc(struct inode *inode,
>   		 * has in-inode xattrs, or we don't have this inode in memory.
>   		 * Read the block from disk.
>   		 */
> -		trace_ext4_load_inode(inode);
> +		trace_ext4_load_inode(sb, ino);
>   		get_bh(bh);
>   		bh->b_end_io = end_buffer_read_sync;
>   		submit_bh(REQ_OP_READ, REQ_META | REQ_PRIO, bh);
> @@ -4388,8 +4391,8 @@ static int __ext4_get_inode_loc(struct inode *inode,
>   		wait_on_buffer(bh);
>   		if (!buffer_uptodate(bh)) {
>   		simulate_eio:
> -			ext4_error_inode_block(inode, block, EIO,
> -					       "unable to read itable block");
> +			if (ret_block)
> +				*ret_block = block;
>   			brelse(bh);
>   			return -EIO;
>   		}
> @@ -4399,11 +4402,43 @@ static int __ext4_get_inode_loc(struct inode *inode,
>   	return 0;
>   }
> 
> +static int __ext4_get_inode_loc_noinmem(struct inode *inode,
> +					struct ext4_iloc *iloc)
> +{
> +	ext4_fsblk_t err_blk;
> +	int ret;
> +
> +	ret = __ext4_get_inode_loc(inode->i_sb, inode->i_ino, iloc, 0,
> +					&err_blk);
> +
> +	if (ret == -EIO)
> +		ext4_error_inode_block(inode, err_blk, EIO,
> +					"unable to read itable block");
> +
> +	return ret;
> +}
> +
>   int ext4_get_inode_loc(struct inode *inode, struct ext4_iloc *iloc)
>   {
> +	ext4_fsblk_t err_blk;
> +	int ret;
> +
>   	/* We have all inode data except xattrs in memory here. */
> -	return __ext4_get_inode_loc(inode, iloc,
> -		!ext4_test_inode_state(inode, EXT4_STATE_XATTR));
> +	ret = __ext4_get_inode_loc(inode->i_sb, inode->i_ino, iloc,
> +		!ext4_test_inode_state(inode, EXT4_STATE_XATTR), &err_blk);
> +
> +	if (ret == -EIO)
> +		ext4_error_inode_block(inode, err_blk, EIO,
> +					"unable to read itable block");
> +
> +	return ret;
> +}
> +
> +
> +int ext4_get_fc_inode_loc(struct super_block *sb, unsigned long ino,
> +			  struct ext4_iloc *iloc)
> +{
> +	return __ext4_get_inode_loc(sb, ino, iloc, 0, NULL);
>   }
> 
>   static bool ext4_should_enable_dax(struct inode *inode)
> @@ -4569,7 +4604,7 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
>   	ei = EXT4_I(inode);
>   	iloc.bh = NULL;
> 
> -	ret = __ext4_get_inode_loc(inode, &iloc, 0);
> +	ret = __ext4_get_inode_loc_noinmem(inode, &iloc);
>   	if (ret < 0)
>   		goto bad_inode;
>   	raw_inode = ext4_raw_inode(&iloc);
> @@ -4615,10 +4650,11 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
>   					      sizeof(gen));
>   	}
> 
> -	if (!ext4_inode_csum_verify(inode, raw_inode, ei) ||
> -	    ext4_simulate_fail(sb, EXT4_SIM_INODE_CRC)) {
> -		ext4_error_inode_err(inode, function, line, 0, EFSBADCRC,
> -				     "iget: checksum invalid");
> +	if ((!ext4_inode_csum_verify(inode, raw_inode, ei) ||
> +	    ext4_simulate_fail(sb, EXT4_SIM_INODE_CRC)) &&
> +	     (!(EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))) {
> +		ext4_error_inode_err(inode, function, line, 0,
> +				EFSBADCRC, "iget: checksum invalid");
>   		ret = -EFSBADCRC;
>   		goto bad_inode;
>   	}
> @@ -4772,9 +4808,10 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
>   		goto bad_inode;
>   	} else if (!ext4_has_inline_data(inode)) {
>   		/* validate the block references in the inode */
> -		if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
> -		   (S_ISLNK(inode->i_mode) &&
> -		    !ext4_inode_is_fast_symlink(inode))) {
> +		if (!(EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY) &&
> +			(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
> +			(S_ISLNK(inode->i_mode) &&
> +			!ext4_inode_is_fast_symlink(inode)))) {
>   			if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
>   				ret = ext4_ext_check_inode(inode);
>   			else
> @@ -5158,7 +5195,7 @@ int ext4_write_inode(struct inode *inode, struct writeback_control *wbc)
>   	} else {
>   		struct ext4_iloc iloc;
> 
> -		err = __ext4_get_inode_loc(inode, &iloc, 0);
> +		err = __ext4_get_inode_loc_noinmem(inode, &iloc);
>   		if (err)
>   			return err;
>   		/*
> diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
> index d2f8f50deef6..f0381876a7e5 100644
> --- a/fs/ext4/ioctl.c
> +++ b/fs/ext4/ioctl.c
> @@ -86,7 +86,7 @@ static void swap_inode_data(struct inode *inode1, struct inode *inode2)
>   	i_size_write(inode2, isize);
>   }
> 
> -static void reset_inode_seed(struct inode *inode)
> +void ext4_reset_inode_seed(struct inode *inode)
>   {
>   	struct ext4_inode_info *ei = EXT4_I(inode);
>   	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
> @@ -200,8 +200,8 @@ static long swap_inode_boot_loader(struct super_block *sb,
> 
>   	inode->i_generation = prandom_u32();
>   	inode_bl->i_generation = prandom_u32();
> -	reset_inode_seed(inode);
> -	reset_inode_seed(inode_bl);
> +	ext4_reset_inode_seed(inode);
> +	ext4_reset_inode_seed(inode_bl);
> 
>   	ext4_discard_preallocations(inode, 0);
> 
> diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
> index 132c118d12e1..ea894529118a 100644
> --- a/fs/ext4/mballoc.c
> +++ b/fs/ext4/mballoc.c
> @@ -1508,14 +1508,16 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
> 
>   		blocknr = ext4_group_first_block_no(sb, e4b->bd_group);
>   		blocknr += EXT4_C2B(sbi, block);
> -		ext4_grp_locked_error(sb, e4b->bd_group,
> -				      inode ? inode->i_ino : 0,
> -				      blocknr,
> -				      "freeing already freed block "
> -				      "(bit %u); block bitmap corrupt.",
> -				      block);
> -		ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group,
> +		if (!(sbi->s_mount_state & EXT4_FC_REPLAY)) {
> +			ext4_grp_locked_error(sb, e4b->bd_group,
> +					      inode ? inode->i_ino : 0,
> +					      blocknr,
> +					      "freeing already freed block (bit %u); block bitmap corrupt.",
> +					      block);
> +			ext4_mark_group_bitmap_corrupted(
> +				sb, e4b->bd_group,
>   				EXT4_GROUP_INFO_BBITMAP_CORRUPT);
> +		}
>   		mb_regenerate_buddy(e4b);
>   		goto done;
>   	}
> @@ -3302,6 +3304,86 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
>   	return err;
>   }
> 
> +/*
> + * Idempotent helper for Ext4 fast commit replay path to set the state of
> + * blocks in bitmaps and update counters.
> + */
> +void ext4_mb_mark_bb(struct super_block *sb, ext4_fsblk_t block,
> +			int len, int state)
> +{
> +	struct buffer_head *bitmap_bh = NULL;
> +	struct ext4_group_desc *gdp;
> +	struct buffer_head *gdp_bh;
> +	struct ext4_sb_info *sbi = EXT4_SB(sb);
> +	ext4_group_t group;
> +	ext4_fsblk_t cluster;

I guess we never use this variable cluster. We can as well drop it.

-ritesh

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ