lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <utxx6yngpfntc5qn7iv6a6be2hgpoubkkhdxkrfbcdnbmiiv5j@ftxbfofhybj2>
Date: Wed, 8 Oct 2025 13:26:57 +0200
From: Jan Kara <jack@...e.cz>
To: Zhang Yi <yi.zhang@...weicloud.com>
Cc: linux-ext4@...r.kernel.org, linux-fsdevel@...r.kernel.org, 
	linux-kernel@...r.kernel.org, tytso@....edu, adilger.kernel@...ger.ca, jack@...e.cz, 
	yi.zhang@...wei.com, libaokun1@...wei.com, yukuai3@...wei.com, yangerkun@...wei.com
Subject: Re: [PATCH v2 03/13] ext4: introduce seq counter for the extent
 status entry

On Thu 25-09-25 17:25:59, Zhang Yi wrote:
> From: Zhang Yi <yi.zhang@...wei.com>
> 
> In iomap_write_iter(), the iomap buffered write framework does not hold
> any locks between querying the inode extent mapping info and performing
> page cache writes. As a result, the extent mapping can be changed due to
> concurrent I/O in flight. Similarly, in the iomap_writepage_map(), the
> write-back process faces a similar problem: concurrent changes can
> invalidate the extent mapping before the I/O is submitted.
> 
> Therefore, both of these processes must recheck the mapping info after
> acquiring the folio lock. To address this, similar to XFS, we propose
> introducing an extent sequence number to serve as a validity cookie for
> the extent. After commit 24b7a2331fcd ("ext4: clairfy the rules for
> modifying extents"), we can ensure that extent information is always
> processed through the extent status tree, and that the extent status tree
> is always up to date under i_rwsem, invalidate_lock or the folio lock, so
> it's safe to introduce this sequence number. The sequence number will be
> increased whenever the extent status tree changes, preparing for the
> buffered write iomap conversion.
> 
> Besides, this mechanism also applies to the extent-moving case:
> move_extent_per_page() likewise needs to reacquire data_sem and recheck
> the mapping info under the folio lock.
> 
> Signed-off-by: Zhang Yi <yi.zhang@...wei.com>

Looks good. Feel free to add:

Reviewed-by: Jan Kara <jack@...e.cz>

								Honza

> ---
>  fs/ext4/ext4.h              |  2 ++
>  fs/ext4/extents_status.c    | 21 +++++++++++++++++----
>  fs/ext4/super.c             |  1 +
>  include/trace/events/ext4.h | 23 +++++++++++++++--------
>  4 files changed, 35 insertions(+), 12 deletions(-)
> 
> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> index 01a6e2de7fc3..7b37a661dd37 100644
> --- a/fs/ext4/ext4.h
> +++ b/fs/ext4/ext4.h
> @@ -1138,6 +1138,8 @@ struct ext4_inode_info {
>  	ext4_lblk_t i_es_shrink_lblk;	/* Offset where we start searching for
>  					   extents to shrink. Protected by
>  					   i_es_lock  */
> +	u64 i_es_seq;			/* Change counter for extents.
> +					   Protected by i_es_lock */
>  
>  	/* ialloc */
>  	ext4_group_t	i_last_alloc_group;
> diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
> index 31dc0496f8d0..62886e18e2a3 100644
> --- a/fs/ext4/extents_status.c
> +++ b/fs/ext4/extents_status.c
> @@ -235,6 +235,13 @@ static inline ext4_lblk_t ext4_es_end(struct extent_status *es)
>  	return es->es_lblk + es->es_len - 1;
>  }
>  
> +static inline void ext4_es_inc_seq(struct inode *inode)
> +{
> +	struct ext4_inode_info *ei = EXT4_I(inode);
> +
> +	WRITE_ONCE(ei->i_es_seq, ei->i_es_seq + 1);
> +}
> +
>  /*
>   * search through the tree for an delayed extent with a given offset.  If
>   * it can't be found, try to find next extent.
> @@ -906,7 +913,6 @@ void ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
>  	newes.es_lblk = lblk;
>  	newes.es_len = len;
>  	ext4_es_store_pblock_status(&newes, pblk, status);
> -	trace_ext4_es_insert_extent(inode, &newes);
>  
>  	ext4_es_insert_extent_check(inode, &newes);
>  
> @@ -955,6 +961,7 @@ void ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
>  		}
>  		pending = err3;
>  	}
> +	ext4_es_inc_seq(inode);
>  error:
>  	write_unlock(&EXT4_I(inode)->i_es_lock);
>  	/*
> @@ -981,6 +988,7 @@ void ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
>  	if (err1 || err2 || err3 < 0)
>  		goto retry;
>  
> +	trace_ext4_es_insert_extent(inode, &newes);
>  	ext4_es_print_tree(inode);
>  	return;
>  }
> @@ -1550,7 +1558,6 @@ void ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
>  	if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
>  		return;
>  
> -	trace_ext4_es_remove_extent(inode, lblk, len);
>  	es_debug("remove [%u/%u) from extent status tree of inode %lu\n",
>  		 lblk, len, inode->i_ino);
>  
> @@ -1570,16 +1577,21 @@ void ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
>  	 */
>  	write_lock(&EXT4_I(inode)->i_es_lock);
>  	err = __es_remove_extent(inode, lblk, end, &reserved, es);
> +	if (err)
> +		goto error;
>  	/* Free preallocated extent if it didn't get used. */
>  	if (es) {
>  		if (!es->es_len)
>  			__es_free_extent(es);
>  		es = NULL;
>  	}
> +	ext4_es_inc_seq(inode);
> +error:
>  	write_unlock(&EXT4_I(inode)->i_es_lock);
>  	if (err)
>  		goto retry;
>  
> +	trace_ext4_es_remove_extent(inode, lblk, len);
>  	ext4_es_print_tree(inode);
>  	ext4_da_release_space(inode, reserved);
>  }
> @@ -2140,8 +2152,6 @@ void ext4_es_insert_delayed_extent(struct inode *inode, ext4_lblk_t lblk,
>  	newes.es_lblk = lblk;
>  	newes.es_len = len;
>  	ext4_es_store_pblock_status(&newes, ~0, EXTENT_STATUS_DELAYED);
> -	trace_ext4_es_insert_delayed_extent(inode, &newes, lclu_allocated,
> -					    end_allocated);
>  
>  	ext4_es_insert_extent_check(inode, &newes);
>  
> @@ -2196,11 +2206,14 @@ void ext4_es_insert_delayed_extent(struct inode *inode, ext4_lblk_t lblk,
>  			pr2 = NULL;
>  		}
>  	}
> +	ext4_es_inc_seq(inode);
>  error:
>  	write_unlock(&EXT4_I(inode)->i_es_lock);
>  	if (err1 || err2 || err3 < 0)
>  		goto retry;
>  
> +	trace_ext4_es_insert_delayed_extent(inode, &newes, lclu_allocated,
> +					    end_allocated);
>  	ext4_es_print_tree(inode);
>  	ext4_print_pending_tree(inode);
>  	return;
> diff --git a/fs/ext4/super.c b/fs/ext4/super.c
> index 699c15db28a8..30682df3eeef 100644
> --- a/fs/ext4/super.c
> +++ b/fs/ext4/super.c
> @@ -1397,6 +1397,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
>  	ei->i_es_all_nr = 0;
>  	ei->i_es_shk_nr = 0;
>  	ei->i_es_shrink_lblk = 0;
> +	ei->i_es_seq = 0;
>  	ei->i_reserved_data_blocks = 0;
>  	spin_lock_init(&(ei->i_block_reservation_lock));
>  	ext4_init_pending_tree(&ei->i_pending_tree);
> diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h
> index a374e7ea7e57..6a0754d38acf 100644
> --- a/include/trace/events/ext4.h
> +++ b/include/trace/events/ext4.h
> @@ -2210,7 +2210,8 @@ DECLARE_EVENT_CLASS(ext4__es_extent,
>  		__field(	ext4_lblk_t,	lblk		)
>  		__field(	ext4_lblk_t,	len		)
>  		__field(	ext4_fsblk_t,	pblk		)
> -		__field(	char, status	)
> +		__field(	char,		status		)
> +		__field(	u64,		seq		)
>  	),
>  
>  	TP_fast_assign(
> @@ -2220,13 +2221,15 @@ DECLARE_EVENT_CLASS(ext4__es_extent,
>  		__entry->len	= es->es_len;
>  		__entry->pblk	= ext4_es_show_pblock(es);
>  		__entry->status	= ext4_es_status(es);
> +		__entry->seq	= EXT4_I(inode)->i_es_seq;
>  	),
>  
> -	TP_printk("dev %d,%d ino %lu es [%u/%u) mapped %llu status %s",
> +	TP_printk("dev %d,%d ino %lu es [%u/%u) mapped %llu status %s seq %llu",
>  		  MAJOR(__entry->dev), MINOR(__entry->dev),
>  		  (unsigned long) __entry->ino,
>  		  __entry->lblk, __entry->len,
> -		  __entry->pblk, show_extent_status(__entry->status))
> +		  __entry->pblk, show_extent_status(__entry->status),
> +		  __entry->seq)
>  );
>  
>  DEFINE_EVENT(ext4__es_extent, ext4_es_insert_extent,
> @@ -2251,6 +2254,7 @@ TRACE_EVENT(ext4_es_remove_extent,
>  		__field(	ino_t,	ino			)
>  		__field(	loff_t,	lblk			)
>  		__field(	loff_t,	len			)
> +		__field(	u64,	seq			)
>  	),
>  
>  	TP_fast_assign(
> @@ -2258,12 +2262,13 @@ TRACE_EVENT(ext4_es_remove_extent,
>  		__entry->ino	= inode->i_ino;
>  		__entry->lblk	= lblk;
>  		__entry->len	= len;
> +		__entry->seq	= EXT4_I(inode)->i_es_seq;
>  	),
>  
> -	TP_printk("dev %d,%d ino %lu es [%lld/%lld)",
> +	TP_printk("dev %d,%d ino %lu es [%lld/%lld) seq %llu",
>  		  MAJOR(__entry->dev), MINOR(__entry->dev),
>  		  (unsigned long) __entry->ino,
> -		  __entry->lblk, __entry->len)
> +		  __entry->lblk, __entry->len, __entry->seq)
>  );
>  
>  TRACE_EVENT(ext4_es_find_extent_range_enter,
> @@ -2523,6 +2528,7 @@ TRACE_EVENT(ext4_es_insert_delayed_extent,
>  		__field(	char,		status		)
>  		__field(	bool,		lclu_allocated	)
>  		__field(	bool,		end_allocated	)
> +		__field(	u64,		seq		)
>  	),
>  
>  	TP_fast_assign(
> @@ -2534,15 +2540,16 @@ TRACE_EVENT(ext4_es_insert_delayed_extent,
>  		__entry->status		= ext4_es_status(es);
>  		__entry->lclu_allocated	= lclu_allocated;
>  		__entry->end_allocated	= end_allocated;
> +		__entry->seq		= EXT4_I(inode)->i_es_seq;
>  	),
>  
> -	TP_printk("dev %d,%d ino %lu es [%u/%u) mapped %llu status %s "
> -		  "allocated %d %d",
> +	TP_printk("dev %d,%d ino %lu es [%u/%u) mapped %llu status %s allocated %d %d seq %llu",
>  		  MAJOR(__entry->dev), MINOR(__entry->dev),
>  		  (unsigned long) __entry->ino,
>  		  __entry->lblk, __entry->len,
>  		  __entry->pblk, show_extent_status(__entry->status),
> -		  __entry->lclu_allocated, __entry->end_allocated)
> +		  __entry->lclu_allocated, __entry->end_allocated,
> +		  __entry->seq)
>  );
>  
>  /* fsmap traces */
> -- 
> 2.46.1
> 
-- 
Jan Kara <jack@...e.com>
SUSE Labs, CR

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ