linux-kernel - Re: [PATCH v2 6/8] ext4: Refactor zeroout path and handle all cases

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <aWnNvttsuSIoq3WA@li-dc0c254c-257c-11b2-a85c-98b6c1322444.ibm.com>
Date: Fri, 16 Jan 2026 11:05:44 +0530
From: Ojaswin Mujoo <ojaswin@...ux.ibm.com>
To: Jan Kara <jack@...e.cz>
Cc: linux-ext4@...r.kernel.org, "Theodore Ts'o" <tytso@....edu>,
        Ritesh Harjani <ritesh.list@...il.com>, Zhang Yi <yi.zhang@...wei.com>,
        libaokun1@...wei.com, linux-kernel@...r.kernel.org
Subject: Re: [PATCH v2 6/8] ext4: Refactor zeroout path and handle all cases

On Thu, Jan 15, 2026 at 01:01:14PM +0100, Jan Kara wrote:
> On Wed 14-01-26 20:27:50, Ojaswin Mujoo wrote:
> > Currently, zeroout is used as a fallback in case we fail to
> > split/convert extents in the "traditional" modify-the-extent-tree way.
> > This is essential to mitigate failures in critical paths like extent
> > splitting during endio. However, the logic is very messy and not easy to
> > follow. Further, the fragile use of various flags has made it prone to
> > errors.
> > 
> > Refactor zeroout logic by moving it up to ext4_split_extent().
> > Further, zeroout correctly based on the type of conversion we want, i.e.:
> > - unwritten to written: Zeroout everything around the mapped range.
> > - written to unwritten: Zeroout only the mapped range.
> > 
> > Also, ext4_ext_convert_to_initialized() now passes
> > EXT4_GET_BLOCKS_CONVERT to make the intention clear.
> > 
> > Signed-off-by: Ojaswin Mujoo <ojaswin@...ux.ibm.com>
> 
> Overall looks nice. Feel free to add:
> 
> Reviewed-by: Jan Kara <jack@...e.cz>

Thanks for the review Jan, I'll make the changes you suggested in v3.

Regards,
ojaswin
> 
> A few nits below:
> 
> > +static int ext4_split_extent_zeroout(handle_t *handle, struct inode *inode,
> > +				     struct ext4_ext_path *path,
> > +				     struct ext4_map_blocks *map, int flags)
> > +{
> > +	struct ext4_extent *ex;
> > +	unsigned int ee_len, depth;
> > +	ext4_lblk_t ee_block;
> > +	uint64_t lblk, pblk, len;
> > +	int is_unwrit;
> > +	int err = 0;
> > +
> > +	depth = ext_depth(inode);
> > +	ex = path[depth].p_ext;
> > +	ee_block = le32_to_cpu(ex->ee_block);
> > +	ee_len = ext4_ext_get_actual_len(ex);
> > +	is_unwrit = ext4_ext_is_unwritten(ex);
> >  
> > +	if (flags & EXT4_GET_BLOCKS_CONVERT) {
> >  		/*
> > -		 * The first half contains partially valid data, the splitting
> > -		 * of this extent has not been completed, fix extent length
> > -		 * and ext4_split_extent() split will the first half again.
> > +		 * EXT4_GET_BLOCKS_CONVERT: Caller wants the range specified by
> > +		 * map to be initialized. Zeroout everything except the map
> > +		 * range.
> >  		 */
> > -		if (split_flag & EXT4_EXT_DATA_PARTIAL_VALID1) {
> > -			/*
> > -			 * Drop extent cache to prevent stale unwritten
> > -			 * extents remaining after zeroing out.
> > -			 */
> > -			ext4_es_remove_extent(inode,
> > -					le32_to_cpu(zero_ex.ee_block),
> > -					ext4_ext_get_actual_len(&zero_ex));
> > -			goto fix_extent_len;
> > +
> > +		loff_t map_end = (loff_t) map->m_lblk + map->m_len;
> > +		loff_t ex_end = (loff_t) ee_block + ee_len;
> > +
> > +		if (!is_unwrit)
> > +			/* Shouldn't happen. Just exit */
> > +			return -EINVAL;
> > +
> > +		/* zeroout left */
> > +		if (map->m_lblk > ee_block) {
> > +			lblk = ee_block;
> > +			len = map->m_lblk - ee_block;
> > +			pblk = ext4_ext_pblock(ex);
> > +			err = ext4_issue_zeroout(inode, lblk, pblk, len);
> > +			if (err)
> > +				/* ZEROOUT failed, just return original error */
> > +				return err;
> >  		}
> >  
> > -		/* update the extent length and mark as initialized */
> > -		ex->ee_len = cpu_to_le16(ee_len);
> > -		ext4_ext_try_to_merge(handle, inode, path, ex);
> > -		err = ext4_ext_dirty(handle, inode, path + path->p_depth);
> > -		if (!err)
> > -			/* update extent status tree */
> > -			ext4_zeroout_es(inode, &zero_ex);
> > +		/* zeroout right */
> > +		if (map->m_lblk + map->m_len < ee_block + ee_len) {
> 
> Use map_end and ex_end in the above condition when we have it?
> 
> ...
> > @@ -3382,11 +3428,13 @@ static struct ext4_ext_path *ext4_split_extent(handle_t *handle,
> >  					       int split_flag, int flags,
> >  					       unsigned int *allocated)
> >  {
> > -	ext4_lblk_t ee_block;
> > +	ext4_lblk_t ee_block, orig_ee_block;
> >  	struct ext4_extent *ex;
> > -	unsigned int ee_len, depth;
> > -	int unwritten;
> > -	int split_flag1, flags1;
> > +	unsigned int ee_len, orig_ee_len, depth;
> > +	int unwritten, orig_unwritten;
> > +	int split_flag1 = 0, flags1 = 0;
> > +	int  orig_err = 0;
> 	   ^^ extra space
> 
> > +	int orig_flags = flags;
> >  
> >  	depth = ext_depth(inode);
> >  	ex = path[depth].p_ext;
> > @@ -3394,30 +3442,31 @@ static struct ext4_ext_path *ext4_split_extent(handle_t *handle,
> >  	ee_len = ext4_ext_get_actual_len(ex);
> >  	unwritten = ext4_ext_is_unwritten(ex);
> >  
> > +	orig_ee_block = ee_block;
> > +	orig_ee_len = ee_len;
> > +	orig_unwritten = unwritten;
> > +
> >  	/* Do not cache extents that are in the process of being modified. */
> >  	flags |= EXT4_EX_NOCACHE;
> >  
> >  	if (map->m_lblk + map->m_len < ee_block + ee_len) {
> > -		split_flag1 = split_flag & EXT4_EXT_MAY_ZEROOUT;
> >  		flags1 = flags | EXT4_GET_BLOCKS_SPLIT_NOMERGE;
> >  		if (unwritten)
> >  			split_flag1 |= EXT4_EXT_MARK_UNWRIT1 |
> >  				       EXT4_EXT_MARK_UNWRIT2;
> > -		if (split_flag & EXT4_EXT_DATA_VALID2)
> > -			split_flag1 |= map->m_lblk > ee_block ?
> > -				       EXT4_EXT_DATA_PARTIAL_VALID1 :
> > -				       EXT4_EXT_DATA_ENTIRE_VALID1;
> >  		path = ext4_split_extent_at(handle, inode, path,
> >  				map->m_lblk + map->m_len, split_flag1, flags1);
> >  		if (IS_ERR(path))
> > -			return path;
> > +			goto try_zeroout;
> > +
> >  		/*
> >  		 * Update path is required because previous ext4_split_extent_at
> >  		 * may result in split of original leaf or extent zeroout.
> >  		 */
> >  		path = ext4_find_extent(inode, map->m_lblk, path, flags);
> >  		if (IS_ERR(path))
> > -			return path;
> > +			goto try_zeroout;
> > +
> >  		depth = ext_depth(inode);
> >  		ex = path[depth].p_ext;
> >  		if (!ex) {
> > @@ -3426,22 +3475,64 @@ static struct ext4_ext_path *ext4_split_extent(handle_t *handle,
> >  			ext4_free_ext_path(path);
> >  			return ERR_PTR(-EFSCORRUPTED);
> >  		}
> > -		unwritten = ext4_ext_is_unwritten(ex);
> >  	}
> >  
> >  	if (map->m_lblk >= ee_block) {
> > -		split_flag1 = split_flag & EXT4_EXT_DATA_VALID2;
> > +		split_flag1 = 0;
> >  		if (unwritten) {
> >  			split_flag1 |= EXT4_EXT_MARK_UNWRIT1;
> > -			split_flag1 |= split_flag & (EXT4_EXT_MAY_ZEROOUT |
> > -						     EXT4_EXT_MARK_UNWRIT2);
> > +			split_flag1 |= split_flag & EXT4_EXT_MARK_UNWRIT2;
> >  		}
> > -		path = ext4_split_extent_at(handle, inode, path,
> > -				map->m_lblk, split_flag1, flags);
> > +		path = ext4_split_extent_at(handle, inode, path, map->m_lblk,
> > +					    split_flag1, flags);
> >  		if (IS_ERR(path))
> > -			return path;
> > +			goto try_zeroout;
> >  	}
> >  
> > +	goto success;
> > +
> > +try_zeroout:
> > +	/*
> > +	 * There was an error in splitting the extent. So instead, just zeroout
> > +	 * unwritten portions and convert it to initialized as a last resort. If
> > +	 * there is any failure here we just return the original error.
> > +	 */
> > +
> > +	orig_err = PTR_ERR(path);
> > +	if (orig_err != -ENOSPC && orig_err != -EDQUOT && orig_err != -ENOMEM)
> > +		goto out_orig_err;
> > +
> > +	if (!(split_flag & EXT4_EXT_MAY_ZEROOUT))
> > +		/* There's an error and we can't zeroout, just return the
> > +		 * original err
> > +		 */
> 
> I'd put this before the if and just write:
> 
> 	/* We can't zeroout? Just return the original error */
> 
> so that the comment fits on a single line :)
> 
> > +		goto out_orig_err;
> > +
> > +	path = ext4_find_extent(inode, map->m_lblk, NULL, flags);
> > +	if (IS_ERR(path))
> > +		goto out_orig_err;
> > +
> > +	depth = ext_depth(inode);
> > +	ex = path[depth].p_ext;
> > +	ee_block = le32_to_cpu(ex->ee_block);
> > +	ee_len = ext4_ext_get_actual_len(ex);
> > +	unwritten = ext4_ext_is_unwritten(ex);
> > +
> > +	if (WARN_ON(ee_block != orig_ee_block || ee_len != orig_ee_len ||
> > +		    unwritten != orig_unwritten))
> > +		/*
> > +		 * The extent to zeroout should have been unchanged
> > +		 * but it's not.
> > +		 */
> > +		goto out_free_path;
> > +
> > +	if (ext4_split_extent_zeroout(handle, inode, path, map, orig_flags))
> > +		/*
> > +		 * Something went wrong in zeroout
> > +		 */
> 
> I think this comment isn't really useful...
> 
> 								Honza
> -- 
> Jan Kara <jack@...e.com>
> SUSE Labs, CR