[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <aWnNvttsuSIoq3WA@li-dc0c254c-257c-11b2-a85c-98b6c1322444.ibm.com>
Date: Fri, 16 Jan 2026 11:05:44 +0530
From: Ojaswin Mujoo <ojaswin@...ux.ibm.com>
To: Jan Kara <jack@...e.cz>
Cc: linux-ext4@...r.kernel.org, "Theodore Ts'o" <tytso@....edu>,
Ritesh Harjani <ritesh.list@...il.com>, Zhang Yi <yi.zhang@...wei.com>,
libaokun1@...wei.com, linux-kernel@...r.kernel.org
Subject: Re: [PATCH v2 6/8] ext4: Refactor zeroout path and handle all cases
On Thu, Jan 15, 2026 at 01:01:14PM +0100, Jan Kara wrote:
> On Wed 14-01-26 20:27:50, Ojaswin Mujoo wrote:
> > Currently, zeroout is used as a fallback in case we fail to
> > split/convert extents in the "traditional" modify-the-extent-tree way.
> > This is essential to mitigate failures in critical paths like extent
> > splitting during endio. However, the logic is very messy and not easy to
> > follow. Further, the fragile use of various flags has made it prone to
> > errors.
> >
> > Refactor the zeroout logic by moving it up to ext4_split_extent().
> > Further, zeroout correctly based on the type of conversion we want, i.e.:
> > - unwritten to written: Zeroout everything around the mapped range.
> > - written to unwritten: Zeroout only the mapped range.
> >
> > Also, ext4_ext_convert_to_initialized() now passes
> > EXT4_GET_BLOCKS_CONVERT to make the intention clear.
> >
> > Signed-off-by: Ojaswin Mujoo <ojaswin@...ux.ibm.com>
>
> Overall looks nice. Feel free to add:
>
> Reviewed-by: Jan Kara <jack@...e.cz>
Thanks for the review, Jan. I'll make the changes you suggested in v3.
Regards,
ojaswin
>
> A few nits below:
>
> > +static int ext4_split_extent_zeroout(handle_t *handle, struct inode *inode,
> > + struct ext4_ext_path *path,
> > + struct ext4_map_blocks *map, int flags)
> > +{
> > + struct ext4_extent *ex;
> > + unsigned int ee_len, depth;
> > + ext4_lblk_t ee_block;
> > + uint64_t lblk, pblk, len;
> > + int is_unwrit;
> > + int err = 0;
> > +
> > + depth = ext_depth(inode);
> > + ex = path[depth].p_ext;
> > + ee_block = le32_to_cpu(ex->ee_block);
> > + ee_len = ext4_ext_get_actual_len(ex);
> > + is_unwrit = ext4_ext_is_unwritten(ex);
> >
> > + if (flags & EXT4_GET_BLOCKS_CONVERT) {
> > /*
> > - * The first half contains partially valid data, the splitting
> > - * of this extent has not been completed, fix extent length
> > - * and ext4_split_extent() split will the first half again.
> > + * EXT4_GET_BLOCKS_CONVERT: Caller wants the range specified by
> > + * map to be initialized. Zeroout everything except the map
> > + * range.
> > */
> > - if (split_flag & EXT4_EXT_DATA_PARTIAL_VALID1) {
> > - /*
> > - * Drop extent cache to prevent stale unwritten
> > - * extents remaining after zeroing out.
> > - */
> > - ext4_es_remove_extent(inode,
> > - le32_to_cpu(zero_ex.ee_block),
> > - ext4_ext_get_actual_len(&zero_ex));
> > - goto fix_extent_len;
> > +
> > + loff_t map_end = (loff_t) map->m_lblk + map->m_len;
> > + loff_t ex_end = (loff_t) ee_block + ee_len;
> > +
> > + if (!is_unwrit)
> > + /* Shouldn't happen. Just exit */
> > + return -EINVAL;
> > +
> > + /* zeroout left */
> > + if (map->m_lblk > ee_block) {
> > + lblk = ee_block;
> > + len = map->m_lblk - ee_block;
> > + pblk = ext4_ext_pblock(ex);
> > + err = ext4_issue_zeroout(inode, lblk, pblk, len);
> > + if (err)
> > + /* ZEROOUT failed, just return original error */
> > + return err;
> > }
> >
> > - /* update the extent length and mark as initialized */
> > - ex->ee_len = cpu_to_le16(ee_len);
> > - ext4_ext_try_to_merge(handle, inode, path, ex);
> > - err = ext4_ext_dirty(handle, inode, path + path->p_depth);
> > - if (!err)
> > - /* update extent status tree */
> > - ext4_zeroout_es(inode, &zero_ex);
> > + /* zeroout right */
> > + if (map->m_lblk + map->m_len < ee_block + ee_len) {
>
> Use map_end and ex_end in the above condition, since we already have them?
>
> ...
> > @@ -3382,11 +3428,13 @@ static struct ext4_ext_path *ext4_split_extent(handle_t *handle,
> > int split_flag, int flags,
> > unsigned int *allocated)
> > {
> > - ext4_lblk_t ee_block;
> > + ext4_lblk_t ee_block, orig_ee_block;
> > struct ext4_extent *ex;
> > - unsigned int ee_len, depth;
> > - int unwritten;
> > - int split_flag1, flags1;
> > + unsigned int ee_len, orig_ee_len, depth;
> > + int unwritten, orig_unwritten;
> > + int split_flag1 = 0, flags1 = 0;
> > + int orig_err = 0;
> ^^ extra space
>
> > + int orig_flags = flags;
> >
> > depth = ext_depth(inode);
> > ex = path[depth].p_ext;
> > @@ -3394,30 +3442,31 @@ static struct ext4_ext_path *ext4_split_extent(handle_t *handle,
> > ee_len = ext4_ext_get_actual_len(ex);
> > unwritten = ext4_ext_is_unwritten(ex);
> >
> > + orig_ee_block = ee_block;
> > + orig_ee_len = ee_len;
> > + orig_unwritten = unwritten;
> > +
> > /* Do not cache extents that are in the process of being modified. */
> > flags |= EXT4_EX_NOCACHE;
> >
> > if (map->m_lblk + map->m_len < ee_block + ee_len) {
> > - split_flag1 = split_flag & EXT4_EXT_MAY_ZEROOUT;
> > flags1 = flags | EXT4_GET_BLOCKS_SPLIT_NOMERGE;
> > if (unwritten)
> > split_flag1 |= EXT4_EXT_MARK_UNWRIT1 |
> > EXT4_EXT_MARK_UNWRIT2;
> > - if (split_flag & EXT4_EXT_DATA_VALID2)
> > - split_flag1 |= map->m_lblk > ee_block ?
> > - EXT4_EXT_DATA_PARTIAL_VALID1 :
> > - EXT4_EXT_DATA_ENTIRE_VALID1;
> > path = ext4_split_extent_at(handle, inode, path,
> > map->m_lblk + map->m_len, split_flag1, flags1);
> > if (IS_ERR(path))
> > - return path;
> > + goto try_zeroout;
> > +
> > /*
> > * Update path is required because previous ext4_split_extent_at
> > * may result in split of original leaf or extent zeroout.
> > */
> > path = ext4_find_extent(inode, map->m_lblk, path, flags);
> > if (IS_ERR(path))
> > - return path;
> > + goto try_zeroout;
> > +
> > depth = ext_depth(inode);
> > ex = path[depth].p_ext;
> > if (!ex) {
> > @@ -3426,22 +3475,64 @@ static struct ext4_ext_path *ext4_split_extent(handle_t *handle,
> > ext4_free_ext_path(path);
> > return ERR_PTR(-EFSCORRUPTED);
> > }
> > - unwritten = ext4_ext_is_unwritten(ex);
> > }
> >
> > if (map->m_lblk >= ee_block) {
> > - split_flag1 = split_flag & EXT4_EXT_DATA_VALID2;
> > + split_flag1 = 0;
> > if (unwritten) {
> > split_flag1 |= EXT4_EXT_MARK_UNWRIT1;
> > - split_flag1 |= split_flag & (EXT4_EXT_MAY_ZEROOUT |
> > - EXT4_EXT_MARK_UNWRIT2);
> > + split_flag1 |= split_flag & EXT4_EXT_MARK_UNWRIT2;
> > }
> > - path = ext4_split_extent_at(handle, inode, path,
> > - map->m_lblk, split_flag1, flags);
> > + path = ext4_split_extent_at(handle, inode, path, map->m_lblk,
> > + split_flag1, flags);
> > if (IS_ERR(path))
> > - return path;
> > + goto try_zeroout;
> > }
> >
> > + goto success;
> > +
> > +try_zeroout:
> > + /*
> > + * There was an error in splitting the extent. So instead, just zeroout
> > + * unwritten portions and convert it to initialize as a last resort. If
> > + * there is any failure here we just return the original error
> > + */
> > +
> > + orig_err = PTR_ERR(path);
> > + if (orig_err != -ENOSPC && orig_err != -EDQUOT && orig_err != -ENOMEM)
> > + goto out_orig_err;
> > +
> > + if (!(split_flag & EXT4_EXT_MAY_ZEROOUT))
> > + /* There's an error and we can't zeroout, just return the
> > + * original err
> > + */
>
> I'd put this before the "if" and just write:
>
> /* We can't zeroout? Just return the original error */
>
> so that the comment fits on a single line :)
>
> > + goto out_orig_err;
> > +
> > + path = ext4_find_extent(inode, map->m_lblk, NULL, flags);
> > + if (IS_ERR(path))
> > + goto out_orig_err;
> > +
> > + depth = ext_depth(inode);
> > + ex = path[depth].p_ext;
> > + ee_block = le32_to_cpu(ex->ee_block);
> > + ee_len = ext4_ext_get_actual_len(ex);
> > + unwritten = ext4_ext_is_unwritten(ex);
> > +
> > + if (WARN_ON(ee_block != orig_ee_block || ee_len != orig_ee_len ||
> > + unwritten != orig_unwritten))
> > + /*
> > + * The extent to zeroout should have been unchanged
> > + * but its not.
> > + */
> > + goto out_free_path;
> > +
> > + if (ext4_split_extent_zeroout(handle, inode, path, map, orig_flags))
> > + /*
> > + * Something went wrong in zeroout
> > + */
>
> I think this comment isn't really useful...
>
> Honza
> --
> Jan Kara <jack@...e.com>
> SUSE Labs, CR
Powered by blists - more mailing lists