linux-ext4 - Re: [PATCH v3 4/7] ext4: Add support for EXT4_GET_BLOCKS_QUERY_LEAF

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20250514161603.GG25655@frogsfrogsfrogs>
Date: Wed, 14 May 2025 09:16:03 -0700
From: "Darrick J. Wong" <djwong@...nel.org>
To: "Ritesh Harjani (IBM)" <ritesh.list@...il.com>
Cc: linux-ext4@...r.kernel.org, Theodore Ts'o <tytso@....edu>,
	Jan Kara <jack@...e.cz>, John Garry <john.g.garry@...cle.com>,
	Ojaswin Mujoo <ojaswin@...ux.ibm.com>,
	linux-fsdevel@...r.kernel.org
Subject: Re: [PATCH v3 4/7] ext4: Add support for
 EXT4_GET_BLOCKS_QUERY_LEAF_BLOCKS

On Fri, May 09, 2025 at 02:20:34AM +0530, Ritesh Harjani (IBM) wrote:
> There can be a case where there are contiguous extents on the adjacent
> leaf nodes of on-disk extent trees. So when someone tries to write to
> this contiguous range, ext4_map_blocks() call will split by returning
> 1 extent at a time if this is not already cached in extent_status tree
> cache (where if these extents when cached can get merged since they are
> contiguous).
> 
> This is fine for a normal write however in case of atomic writes, it
> can't afford to break the write into two. Now this is also something
> that will only happen in the slow write case where we call
> ext4_map_blocks() for each of these extents spread across different leaf
> nodes. However, there is no guarantee that these extent status cache
> cannot be reclaimed before the last call to ext4_map_blocks() in
> ext4_map_blocks_atomic_write_slow().

Can you have two physically and logically contiguous mappings within a
single leaf node?  Or is the key idea here that the extent status tree
will merge adjacent mappings from the same leaf block, just not between
leaf blocks?

> Hence this patch adds support of EXT4_GET_BLOCKS_QUERY_LEAF_BLOCKS.
> This flag checks if the requested range can be fully found in extent
> status cache and return. If not, it looks up in on-disk extent
> tree via ext4_map_query_blocks(). If the found extent is the last entry
> in the leaf node, then it goes and queries the next lblk to see if there
> is an adjacent contiguous extent in the adjacent leaf node of the
> on-disk extent tree.
> 
> Even though there can be a case where there are multiple adjacent extent
> entries spread across multiple leaf nodes. But we only read an adjacent
> leaf block i.e. in total of 2 extent entries spread across 2 leaf nodes.
> The reason for this is that we are mostly only going to support atomic
> writes with upto 64KB or maybe max upto 1MB of atomic write support.
> 
> Co-developed-by: Ojaswin Mujoo <ojaswin@...ux.ibm.com>
> Signed-off-by: Ojaswin Mujoo <ojaswin@...ux.ibm.com>
> Signed-off-by: Ritesh Harjani (IBM) <ritesh.list@...il.com>
> ---
>  fs/ext4/ext4.h    | 18 ++++++++-
>  fs/ext4/extents.c | 12 ++++++
>  fs/ext4/inode.c   | 97 +++++++++++++++++++++++++++++++++++++++++------
>  3 files changed, 115 insertions(+), 12 deletions(-)
> 
> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> index e2b36a3c1b0f..b4bbe2837423 100644
> --- a/fs/ext4/ext4.h
> +++ b/fs/ext4/ext4.h
> @@ -256,9 +256,19 @@ struct ext4_allocation_request {
>  #define EXT4_MAP_UNWRITTEN	BIT(BH_Unwritten)
>  #define EXT4_MAP_BOUNDARY	BIT(BH_Boundary)
>  #define EXT4_MAP_DELAYED	BIT(BH_Delay)
> +/*
> + * This is for use in ext4_map_query_blocks() for a special case where we can
> + * have a physically and logically contiguous blocks explit across two leaf

s/explit/split/ ?

--D

> + * nodes instead of a single extent. This is required in case of atomic writes
> + * to know whether the returned extent is last in leaf. If yes, then lookup for
> + * next in leaf block in ext4_map_query_blocks_next_in_leaf().
> + * - This is never going to be added to any buffer head state.
> + * - We use the next available bit after BH_BITMAP_UPTODATE.
> + */
> +#define EXT4_MAP_QUERY_LAST_IN_LEAF	BIT(BH_BITMAP_UPTODATE + 1)
>  #define EXT4_MAP_FLAGS		(EXT4_MAP_NEW | EXT4_MAP_MAPPED |\
>  				 EXT4_MAP_UNWRITTEN | EXT4_MAP_BOUNDARY |\
> -				 EXT4_MAP_DELAYED)
> +				 EXT4_MAP_DELAYED | EXT4_MAP_QUERY_LAST_IN_LEAF)
>  
>  struct ext4_map_blocks {
>  	ext4_fsblk_t m_pblk;
> @@ -725,6 +735,12 @@ enum {
>  #define EXT4_GET_BLOCKS_IO_SUBMIT		0x0400
>  	/* Caller is in the atomic contex, find extent if it has been cached */
>  #define EXT4_GET_BLOCKS_CACHED_NOWAIT		0x0800
> +/*
> + * Atomic write caller needs this to query in the slow path of mixed mapping
> + * case, when a contiguous extent can be split across two adjacent leaf nodes.
> + * Look EXT4_MAP_QUERY_LAST_IN_LEAF.
> + */
> +#define EXT4_GET_BLOCKS_QUERY_LAST_IN_LEAF	0x1000
>  
>  /*
>   * The bit position of these flags must not overlap with any of the
> diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
> index c616a16a9f36..fa850f188d46 100644
> --- a/fs/ext4/extents.c
> +++ b/fs/ext4/extents.c
> @@ -4433,6 +4433,18 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
>  	allocated = map->m_len;
>  	ext4_ext_show_leaf(inode, path);
>  out:
> +	/*
> +	 * We never use EXT4_GET_BLOCKS_QUERY_LAST_IN_LEAF with CREATE flag.
> +	 * So we know that the depth used here is correct, since there was no
> +	 * block allocation done if EXT4_GET_BLOCKS_QUERY_LAST_IN_LEAF is set.
> +	 * If tomorrow we start using this QUERY flag with CREATE, then we will
> +	 * need to re-calculate the depth as it might have changed due to block
> +	 * allocation.
> +	 */
> +	if (flags & EXT4_GET_BLOCKS_QUERY_LAST_IN_LEAF)
> +		if (!err && ex && (ex == EXT_LAST_EXTENT(path[depth].p_hdr)))
> +			map->m_flags |= EXT4_MAP_QUERY_LAST_IN_LEAF;
> +
>  	ext4_free_ext_path(path);
>  
>  	trace_ext4_ext_map_blocks_exit(inode, flags, map,
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index 2f99b087a5d8..8b86b1a29bdc 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -459,14 +459,71 @@ static void ext4_map_blocks_es_recheck(handle_t *handle,
>  }
>  #endif /* ES_AGGRESSIVE_TEST */
>  
> +static int ext4_map_query_blocks_next_in_leaf(handle_t *handle,
> +			struct inode *inode, struct ext4_map_blocks *map,
> +			unsigned int orig_mlen)
> +{
> +	struct ext4_map_blocks map2;
> +	unsigned int status, status2;
> +	int retval;
> +
> +	status = map->m_flags & EXT4_MAP_UNWRITTEN ?
> +		EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
> +
> +	WARN_ON_ONCE(!(map->m_flags & EXT4_MAP_QUERY_LAST_IN_LEAF));
> +	WARN_ON_ONCE(orig_mlen <= map->m_len);
> +
> +	/* Prepare map2 for lookup in next leaf block */
> +	map2.m_lblk = map->m_lblk + map->m_len;
> +	map2.m_len = orig_mlen - map->m_len;
> +	map2.m_flags = 0;
> +	retval = ext4_ext_map_blocks(handle, inode, &map2, 0);
> +
> +	if (retval <= 0) {
> +		ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
> +				      map->m_pblk, status, false);
> +		return map->m_len;
> +	}
> +
> +	if (unlikely(retval != map2.m_len)) {
> +		ext4_warning(inode->i_sb,
> +			     "ES len assertion failed for inode "
> +			     "%lu: retval %d != map->m_len %d",
> +			     inode->i_ino, retval, map2.m_len);
> +		WARN_ON(1);
> +	}
> +
> +	status2 = map2.m_flags & EXT4_MAP_UNWRITTEN ?
> +		EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
> +
> +	/*
> +	 * If map2 is contiguous with map, then let's insert it as a single
> +	 * extent in es cache and return the combined length of both the maps.
> +	 */
> +	if (map->m_pblk + map->m_len == map2.m_pblk &&
> +			status == status2) {
> +		ext4_es_insert_extent(inode, map->m_lblk,
> +				      map->m_len + map2.m_len, map->m_pblk,
> +				      status, false);
> +		map->m_len += map2.m_len;
> +	} else {
> +		ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
> +				      map->m_pblk, status, false);
> +	}
> +
> +	return map->m_len;
> +}
> +
>  static int ext4_map_query_blocks(handle_t *handle, struct inode *inode,
> -				 struct ext4_map_blocks *map)
> +				 struct ext4_map_blocks *map, int flags)
>  {
>  	unsigned int status;
>  	int retval;
> +	unsigned int orig_mlen = map->m_len;
> +	unsigned int query_flags = flags & EXT4_GET_BLOCKS_QUERY_LAST_IN_LEAF;
>  
>  	if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
> -		retval = ext4_ext_map_blocks(handle, inode, map, 0);
> +		retval = ext4_ext_map_blocks(handle, inode, map, query_flags);
>  	else
>  		retval = ext4_ind_map_blocks(handle, inode, map, 0);
>  
> @@ -481,11 +538,23 @@ static int ext4_map_query_blocks(handle_t *handle, struct inode *inode,
>  		WARN_ON(1);
>  	}
>  
> -	status = map->m_flags & EXT4_MAP_UNWRITTEN ?
> -			EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
> -	ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
> -			      map->m_pblk, status, false);
> -	return retval;
> +	/*
> +	 * No need to query next in leaf:
> +	 * - if returned extent is not last in leaf or
> +	 * - if the last in leaf is the full requested range
> +	 */
> +	if (!(map->m_flags & EXT4_MAP_QUERY_LAST_IN_LEAF) ||
> +			((map->m_flags & EXT4_MAP_QUERY_LAST_IN_LEAF) &&
> +			 (map->m_len == orig_mlen))) {
> +		status = map->m_flags & EXT4_MAP_UNWRITTEN ?
> +				EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
> +		ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
> +				      map->m_pblk, status, false);
> +		return retval;
> +	}
> +
> +	return ext4_map_query_blocks_next_in_leaf(handle, inode, map,
> +						  orig_mlen);
>  }
>  
>  static int ext4_map_create_blocks(handle_t *handle, struct inode *inode,
> @@ -599,6 +668,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
>  	struct extent_status es;
>  	int retval;
>  	int ret = 0;
> +	unsigned int orig_mlen = map->m_len;
>  #ifdef ES_AGGRESSIVE_TEST
>  	struct ext4_map_blocks orig_map;
>  
> @@ -650,7 +720,12 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
>  		ext4_map_blocks_es_recheck(handle, inode, map,
>  					   &orig_map, flags);
>  #endif
> -		goto found;
> +		if (!(flags & EXT4_GET_BLOCKS_QUERY_LAST_IN_LEAF) ||
> +				orig_mlen == map->m_len)
> +			goto found;
> +
> +		if (flags & EXT4_GET_BLOCKS_QUERY_LAST_IN_LEAF)
> +			map->m_len = orig_mlen;
>  	}
>  	/*
>  	 * In the query cache no-wait mode, nothing we can do more if we
> @@ -664,7 +739,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
>  	 * file system block.
>  	 */
>  	down_read(&EXT4_I(inode)->i_data_sem);
> -	retval = ext4_map_query_blocks(handle, inode, map);
> +	retval = ext4_map_query_blocks(handle, inode, map, flags);
>  	up_read((&EXT4_I(inode)->i_data_sem));
>  
>  found:
> @@ -1802,7 +1877,7 @@ static int ext4_da_map_blocks(struct inode *inode, struct ext4_map_blocks *map)
>  	if (ext4_has_inline_data(inode))
>  		retval = 0;
>  	else
> -		retval = ext4_map_query_blocks(NULL, inode, map);
> +		retval = ext4_map_query_blocks(NULL, inode, map, 0);
>  	up_read(&EXT4_I(inode)->i_data_sem);
>  	if (retval)
>  		return retval < 0 ? retval : 0;
> @@ -1825,7 +1900,7 @@ static int ext4_da_map_blocks(struct inode *inode, struct ext4_map_blocks *map)
>  			goto found;
>  		}
>  	} else if (!ext4_has_inline_data(inode)) {
> -		retval = ext4_map_query_blocks(NULL, inode, map);
> +		retval = ext4_map_query_blocks(NULL, inode, map, 0);
>  		if (retval) {
>  			up_write(&EXT4_I(inode)->i_data_sem);
>  			return retval < 0 ? retval : 0;
> -- 
> 2.49.0
> 
>