[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <utxx6yngpfntc5qn7iv6a6be2hgpoubkkhdxkrfbcdnbmiiv5j@ftxbfofhybj2>
Date: Wed, 8 Oct 2025 13:26:57 +0200
From: Jan Kara <jack@...e.cz>
To: Zhang Yi <yi.zhang@...weicloud.com>
Cc: linux-ext4@...r.kernel.org, linux-fsdevel@...r.kernel.org,
linux-kernel@...r.kernel.org, tytso@....edu, adilger.kernel@...ger.ca, jack@...e.cz,
yi.zhang@...wei.com, libaokun1@...wei.com, yukuai3@...wei.com, yangerkun@...wei.com
Subject: Re: [PATCH v2 03/13] ext4: introduce seq counter for the extent
status entry
On Thu 25-09-25 17:25:59, Zhang Yi wrote:
> From: Zhang Yi <yi.zhang@...wei.com>
>
> In iomap_write_iter(), the iomap buffered write framework does not hold
> any locks between querying the inode extent mapping info and performing
> page cache writes. As a result, the extent mapping can be changed by
> concurrent I/O in flight. Similarly, in iomap_writepage_map(), the
> write-back process faces the same problem: concurrent changes can
> invalidate the extent mapping before the I/O is submitted.
>
> Therefore, both of these processes must recheck the mapping info after
> acquiring the folio lock. To address this, similar to XFS, we propose
> introducing an extent sequence number to serve as a validity cookie for
> the extent. After commit 24b7a2331fcd ("ext4: clairfy the rules for
> modifying extents"), we can be sure that extent information is always
> processed through the extent status tree, and that the extent status tree
> is always up to date under i_rwsem, invalidate_lock or the folio lock, so
> it is safe to introduce this sequence number. The sequence number will be
> increased whenever the extent status tree changes, preparing for the
> buffered write iomap conversion.
>
> Besides, this mechanism is also applicable for the moving extents case.
> In move_extent_per_page(), it also needs to reacquire data_sem and check
> the mapping info again under the folio lock.
>
> Signed-off-by: Zhang Yi <yi.zhang@...wei.com>
Looks good. Feel free to add:
Reviewed-by: Jan Kara <jack@...e.cz>
Honza
> ---
> fs/ext4/ext4.h | 2 ++
> fs/ext4/extents_status.c | 21 +++++++++++++++++----
> fs/ext4/super.c | 1 +
> include/trace/events/ext4.h | 23 +++++++++++++++--------
> 4 files changed, 35 insertions(+), 12 deletions(-)
>
> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> index 01a6e2de7fc3..7b37a661dd37 100644
> --- a/fs/ext4/ext4.h
> +++ b/fs/ext4/ext4.h
> @@ -1138,6 +1138,8 @@ struct ext4_inode_info {
> ext4_lblk_t i_es_shrink_lblk; /* Offset where we start searching for
> extents to shrink. Protected by
> i_es_lock */
> + u64 i_es_seq; /* Change counter for extents.
> + Protected by i_es_lock */
>
> /* ialloc */
> ext4_group_t i_last_alloc_group;
> diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
> index 31dc0496f8d0..62886e18e2a3 100644
> --- a/fs/ext4/extents_status.c
> +++ b/fs/ext4/extents_status.c
> @@ -235,6 +235,13 @@ static inline ext4_lblk_t ext4_es_end(struct extent_status *es)
> return es->es_lblk + es->es_len - 1;
> }
>
> +static inline void ext4_es_inc_seq(struct inode *inode)
> +{
> + struct ext4_inode_info *ei = EXT4_I(inode);
> +
> + WRITE_ONCE(ei->i_es_seq, ei->i_es_seq + 1);
> +}
> +
> /*
> * search through the tree for an delayed extent with a given offset. If
> * it can't be found, try to find next extent.
> @@ -906,7 +913,6 @@ void ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
> newes.es_lblk = lblk;
> newes.es_len = len;
> ext4_es_store_pblock_status(&newes, pblk, status);
> - trace_ext4_es_insert_extent(inode, &newes);
>
> ext4_es_insert_extent_check(inode, &newes);
>
> @@ -955,6 +961,7 @@ void ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
> }
> pending = err3;
> }
> + ext4_es_inc_seq(inode);
> error:
> write_unlock(&EXT4_I(inode)->i_es_lock);
> /*
> @@ -981,6 +988,7 @@ void ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
> if (err1 || err2 || err3 < 0)
> goto retry;
>
> + trace_ext4_es_insert_extent(inode, &newes);
> ext4_es_print_tree(inode);
> return;
> }
> @@ -1550,7 +1558,6 @@ void ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
> if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
> return;
>
> - trace_ext4_es_remove_extent(inode, lblk, len);
> es_debug("remove [%u/%u) from extent status tree of inode %lu\n",
> lblk, len, inode->i_ino);
>
> @@ -1570,16 +1577,21 @@ void ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
> */
> write_lock(&EXT4_I(inode)->i_es_lock);
> err = __es_remove_extent(inode, lblk, end, &reserved, es);
> + if (err)
> + goto error;
> /* Free preallocated extent if it didn't get used. */
> if (es) {
> if (!es->es_len)
> __es_free_extent(es);
> es = NULL;
> }
> + ext4_es_inc_seq(inode);
> +error:
> write_unlock(&EXT4_I(inode)->i_es_lock);
> if (err)
> goto retry;
>
> + trace_ext4_es_remove_extent(inode, lblk, len);
> ext4_es_print_tree(inode);
> ext4_da_release_space(inode, reserved);
> }
> @@ -2140,8 +2152,6 @@ void ext4_es_insert_delayed_extent(struct inode *inode, ext4_lblk_t lblk,
> newes.es_lblk = lblk;
> newes.es_len = len;
> ext4_es_store_pblock_status(&newes, ~0, EXTENT_STATUS_DELAYED);
> - trace_ext4_es_insert_delayed_extent(inode, &newes, lclu_allocated,
> - end_allocated);
>
> ext4_es_insert_extent_check(inode, &newes);
>
> @@ -2196,11 +2206,14 @@ void ext4_es_insert_delayed_extent(struct inode *inode, ext4_lblk_t lblk,
> pr2 = NULL;
> }
> }
> + ext4_es_inc_seq(inode);
> error:
> write_unlock(&EXT4_I(inode)->i_es_lock);
> if (err1 || err2 || err3 < 0)
> goto retry;
>
> + trace_ext4_es_insert_delayed_extent(inode, &newes, lclu_allocated,
> + end_allocated);
> ext4_es_print_tree(inode);
> ext4_print_pending_tree(inode);
> return;
> diff --git a/fs/ext4/super.c b/fs/ext4/super.c
> index 699c15db28a8..30682df3eeef 100644
> --- a/fs/ext4/super.c
> +++ b/fs/ext4/super.c
> @@ -1397,6 +1397,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
> ei->i_es_all_nr = 0;
> ei->i_es_shk_nr = 0;
> ei->i_es_shrink_lblk = 0;
> + ei->i_es_seq = 0;
> ei->i_reserved_data_blocks = 0;
> spin_lock_init(&(ei->i_block_reservation_lock));
> ext4_init_pending_tree(&ei->i_pending_tree);
> diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h
> index a374e7ea7e57..6a0754d38acf 100644
> --- a/include/trace/events/ext4.h
> +++ b/include/trace/events/ext4.h
> @@ -2210,7 +2210,8 @@ DECLARE_EVENT_CLASS(ext4__es_extent,
> __field( ext4_lblk_t, lblk )
> __field( ext4_lblk_t, len )
> __field( ext4_fsblk_t, pblk )
> - __field( char, status )
> + __field( char, status )
> + __field( u64, seq )
> ),
>
> TP_fast_assign(
> @@ -2220,13 +2221,15 @@ DECLARE_EVENT_CLASS(ext4__es_extent,
> __entry->len = es->es_len;
> __entry->pblk = ext4_es_show_pblock(es);
> __entry->status = ext4_es_status(es);
> + __entry->seq = EXT4_I(inode)->i_es_seq;
> ),
>
> - TP_printk("dev %d,%d ino %lu es [%u/%u) mapped %llu status %s",
> + TP_printk("dev %d,%d ino %lu es [%u/%u) mapped %llu status %s seq %llu",
> MAJOR(__entry->dev), MINOR(__entry->dev),
> (unsigned long) __entry->ino,
> __entry->lblk, __entry->len,
> - __entry->pblk, show_extent_status(__entry->status))
> + __entry->pblk, show_extent_status(__entry->status),
> + __entry->seq)
> );
>
> DEFINE_EVENT(ext4__es_extent, ext4_es_insert_extent,
> @@ -2251,6 +2254,7 @@ TRACE_EVENT(ext4_es_remove_extent,
> __field( ino_t, ino )
> __field( loff_t, lblk )
> __field( loff_t, len )
> + __field( u64, seq )
> ),
>
> TP_fast_assign(
> @@ -2258,12 +2262,13 @@ TRACE_EVENT(ext4_es_remove_extent,
> __entry->ino = inode->i_ino;
> __entry->lblk = lblk;
> __entry->len = len;
> + __entry->seq = EXT4_I(inode)->i_es_seq;
> ),
>
> - TP_printk("dev %d,%d ino %lu es [%lld/%lld)",
> + TP_printk("dev %d,%d ino %lu es [%lld/%lld) seq %llu",
> MAJOR(__entry->dev), MINOR(__entry->dev),
> (unsigned long) __entry->ino,
> - __entry->lblk, __entry->len)
> + __entry->lblk, __entry->len, __entry->seq)
> );
>
> TRACE_EVENT(ext4_es_find_extent_range_enter,
> @@ -2523,6 +2528,7 @@ TRACE_EVENT(ext4_es_insert_delayed_extent,
> __field( char, status )
> __field( bool, lclu_allocated )
> __field( bool, end_allocated )
> + __field( u64, seq )
> ),
>
> TP_fast_assign(
> @@ -2534,15 +2540,16 @@ TRACE_EVENT(ext4_es_insert_delayed_extent,
> __entry->status = ext4_es_status(es);
> __entry->lclu_allocated = lclu_allocated;
> __entry->end_allocated = end_allocated;
> + __entry->seq = EXT4_I(inode)->i_es_seq;
> ),
>
> - TP_printk("dev %d,%d ino %lu es [%u/%u) mapped %llu status %s "
> - "allocated %d %d",
> + TP_printk("dev %d,%d ino %lu es [%u/%u) mapped %llu status %s allocated %d %d seq %llu",
> MAJOR(__entry->dev), MINOR(__entry->dev),
> (unsigned long) __entry->ino,
> __entry->lblk, __entry->len,
> __entry->pblk, show_extent_status(__entry->status),
> - __entry->lclu_allocated, __entry->end_allocated)
> + __entry->lclu_allocated, __entry->end_allocated,
> + __entry->seq)
> );
>
> /* fsmap traces */
> --
> 2.46.1
>
--
Jan Kara <jack@...e.com>
SUSE Labs, CR
Powered by blists - more mailing lists