lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <64e865aa8e53151e681eee0332c2a2e956c852eb.1745987268.git.ritesh.list@gmail.com>
Date: Wed, 30 Apr 2025 10:50:42 +0530
From: "Ritesh Harjani (IBM)" <ritesh.list@...il.com>
To: Theodore Ts'o <tytso@....edu>,
	Jan Kara <jack@...e.cz>
Cc: John Garry <john.g.garry@...cle.com>,
	djwong@...nel.org,
	Ojaswin Mujoo <ojaswin@...ux.ibm.com>,
	linux-ext4@...r.kernel.org,
	"Ritesh Harjani (IBM)" <ritesh.list@...il.com>
Subject: [RFC v2 2/2] ext4: Add support for EXT4_GET_BLOCKS_QUERY_LEAF_BLOCKS

There can be a case where there are contiguous extents on the adjacent
leaf nodes of on-disk extent trees. So when someone tries to write to
this contiguous range, ext4_map_blocks() call will split by returning
1 extent at a time if this is not already cached in extent_status tree
cache (where if these extents when cached can get merged since they are
contiguous).

This is fine for a normal write however in case of atomic writes, it
can't afford to break the write into two. Now this is also something
that will only happen in the slow write case where we call
ext4_map_blocks() for each of these extents spread across different leaf
nodes. However, there is no guarantee that these extent status cache
cannot be reclaimed before the last call to ext4_map_blocks() in
ext4_map_blocks_atomic_write_slow().

Hence this patch adds support of EXT4_GET_BLOCKS_QUERY_LEAF_BLOCKS.
This flag checks if the requested range can be fully found in extent
status cache and return. If not, it looks up in on-disk extent
tree via ext4_map_query_blocks(). If the found extent is the last entry
in the leaf node, then it goes and queries the next lblk to see if there
is an adjacent contiguous extent in the adjacent leaf node of the
on-disk extent tree.

Even though there can be a case where there are multiple adjacent extent
entries spread across multiple leaf nodes. But we only read an adjacent
leaf block i.e. in total of 2 extent entries spread across 2 leaf nodes.
The reason for this is that we are mostly only going to support atomic
writes with upto 64KB or maybe max upto 1MB of atomic write support.

Co-developed-by: Ojaswin Mujoo <ojaswin@...ux.ibm.com>
Signed-off-by: Ojaswin Mujoo <ojaswin@...ux.ibm.com>
Signed-off-by: Ritesh Harjani (IBM) <ritesh.list@...il.com>
---
 fs/ext4/ext4.h    |  6 ++++
 fs/ext4/extents.c |  9 +++--
 fs/ext4/inode.c   | 91 ++++++++++++++++++++++++++++++++++++++++-------
 3 files changed, 92 insertions(+), 14 deletions(-)

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 589d51389327..38f75d33d67f 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -256,6 +256,7 @@ struct ext4_allocation_request {
 #define EXT4_MAP_UNWRITTEN	BIT(BH_Unwritten)
 #define EXT4_MAP_BOUNDARY	BIT(BH_Boundary)
 #define EXT4_MAP_DELAYED	BIT(BH_Delay)
+#define EXT4_MAP_LAST_IN_LEAF	BIT(BH_PrivateStart)
 #define EXT4_MAP_FLAGS		(EXT4_MAP_NEW | EXT4_MAP_MAPPED |\
 				 EXT4_MAP_UNWRITTEN | EXT4_MAP_BOUNDARY |\
 				 EXT4_MAP_DELAYED)
@@ -725,6 +726,11 @@ enum {
 #define EXT4_GET_BLOCKS_IO_SUBMIT		0x0400
 	/* Caller is in the atomic contex, find extent if it has been cached */
 #define EXT4_GET_BLOCKS_CACHED_NOWAIT		0x0800
+/*
+ * Atomic write caller needs this to query in the slow path of mixed mapping
+ * case, when a contiguous extent can be split across two adjacent leaf nodes.
+ */
+#define EXT4_GET_BLOCKS_QUERY_LEAF_BLOCKS	0x1000
 
 /*
  * The bit position of these flags must not overlap with any of the
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 0e00b78b521c..12fae8d70f46 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -4433,6 +4433,10 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
 	allocated = map->m_len;
 	ext4_ext_show_leaf(inode, path);
 out:
+	if (flags & EXT4_GET_BLOCKS_QUERY_LEAF_BLOCKS)
+		if (ex && (ex == EXT_LAST_EXTENT(path[depth].p_hdr)))
+			map->m_flags |= EXT4_MAP_LAST_IN_LEAF;
+
 	ext4_free_ext_path(path);
 
 	trace_ext4_ext_map_blocks_exit(inode, flags, map,
@@ -4788,7 +4792,8 @@ int ext4_convert_unwritten_extents_atomic(handle_t *handle, struct inode *inode,
 	struct ext4_map_blocks map;
 	unsigned int blkbits = inode->i_blkbits;
 	unsigned int credits = 0;
-	int flags = EXT4_GET_BLOCKS_IO_CONVERT_EXT;
+	int flags = EXT4_GET_BLOCKS_IO_CONVERT_EXT |
+			EXT4_GET_BLOCKS_QUERY_LEAF_BLOCKS;
 
 	map.m_lblk = offset >> blkbits;
 	max_blocks = EXT4_MAX_BLOCKS(len, offset, blkbits);
@@ -4815,7 +4820,7 @@ int ext4_convert_unwritten_extents_atomic(handle_t *handle, struct inode *inode,
 		map.m_lblk += ret;
 		map.m_len = (max_blocks -= ret);
 		ret = ext4_map_blocks(handle, inode, &map, flags);
-		if (ret != max_blocks)
+		if (!(map.m_flags & EXT4_MAP_LAST_IN_LEAF) && ret != max_blocks)
 			ext4_warning(inode->i_sb,
 				     "inode #%lu: block %u: len %u: "
 				     "split block mapping found for atomic write,"
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 27235a76c2d1..f5c8c4b8cd16 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -459,14 +459,69 @@ static void ext4_map_blocks_es_recheck(handle_t *handle,
 }
 #endif /* ES_AGGRESSIVE_TEST */
 
+static int ext4_map_query_last_in_leaf_blocks(handle_t *handle,
+			struct inode *inode, struct ext4_map_blocks *map,
+			unsigned int orig_mlen)
+{
+	struct ext4_map_blocks map2;
+	unsigned int status, status2;
+	int retval;
+
+	status = map->m_flags & EXT4_MAP_UNWRITTEN ?
+		EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
+
+	WARN_ON_ONCE(!(map->m_flags & EXT4_MAP_LAST_IN_LEAF));
+
+	map2.m_lblk = map->m_lblk + map->m_len;
+	map2.m_len = INT_MAX;
+	map2.m_flags = 0;
+	retval = ext4_ext_map_blocks(handle, inode, &map2, 0);
+
+	if (retval <= 0) {
+		ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
+				      map->m_pblk, status, false);
+		return map->m_len;
+
+	}
+
+	if (unlikely(retval != map2.m_len)) {
+		ext4_warning(inode->i_sb,
+			     "ES len assertion failed for inode "
+			     "%lu: retval %d != map->m_len %d",
+			     inode->i_ino, retval, map2.m_len);
+		WARN_ON(1);
+	}
+
+	status2 = map2.m_flags & EXT4_MAP_UNWRITTEN ?
+		EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
+
+	if (map->m_pblk + map->m_len == map2.m_pblk &&
+			status == status2) {
+		ext4_es_insert_extent(inode, map->m_lblk,
+				      map->m_len + map2.m_len, map->m_pblk,
+				      status, false);
+		if (in_range(orig_mlen, map->m_len,
+					map->m_len + map2.m_len + 1))
+			map->m_len = orig_mlen;
+	} else {
+		ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
+				      map->m_pblk, status, false);
+	}
+
+	return map->m_len;
+}
+
 static int ext4_map_query_blocks(handle_t *handle, struct inode *inode,
-				 struct ext4_map_blocks *map)
+				 struct ext4_map_blocks *map, int flags)
 {
 	unsigned int status;
 	int retval;
+	unsigned int orig_mlen = map->m_len;
+
+	flags = flags & EXT4_GET_BLOCKS_QUERY_LEAF_BLOCKS;
 
 	if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
-		retval = ext4_ext_map_blocks(handle, inode, map, 0);
+		retval = ext4_ext_map_blocks(handle, inode, map, flags);
 	else
 		retval = ext4_ind_map_blocks(handle, inode, map, 0);
 
@@ -481,11 +536,16 @@ static int ext4_map_query_blocks(handle_t *handle, struct inode *inode,
 		WARN_ON(1);
 	}
 
-	status = map->m_flags & EXT4_MAP_UNWRITTEN ?
-			EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
-	ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
-			      map->m_pblk, status, false);
-	return retval;
+	if (!(map->m_flags & EXT4_MAP_LAST_IN_LEAF)) {
+		status = map->m_flags & EXT4_MAP_UNWRITTEN ?
+				EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
+		ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
+				      map->m_pblk, status, false);
+		return retval;
+	}
+
+	return ext4_map_query_last_in_leaf_blocks(handle, inode, map,
+						  orig_mlen);
 }
 
 static int ext4_map_create_blocks(handle_t *handle, struct inode *inode,
@@ -599,6 +659,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
 	struct extent_status es;
 	int retval;
 	int ret = 0;
+	unsigned int orig_mlen = map->m_len;
 #ifdef ES_AGGRESSIVE_TEST
 	struct ext4_map_blocks orig_map;
 
@@ -650,7 +711,12 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
 		ext4_map_blocks_es_recheck(handle, inode, map,
 					   &orig_map, flags);
 #endif
-		goto found;
+		if (!(flags & EXT4_GET_BLOCKS_QUERY_LEAF_BLOCKS) ||
+				orig_mlen == map->m_len)
+			goto found;
+
+		if (flags & EXT4_GET_BLOCKS_QUERY_LEAF_BLOCKS)
+			map->m_len = orig_mlen;
 	}
 	/*
 	 * In the query cache no-wait mode, nothing we can do more if we
@@ -664,7 +730,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
 	 * file system block.
 	 */
 	down_read(&EXT4_I(inode)->i_data_sem);
-	retval = ext4_map_query_blocks(handle, inode, map);
+	retval = ext4_map_query_blocks(handle, inode, map, flags);
 	up_read((&EXT4_I(inode)->i_data_sem));
 
 found:
@@ -1802,7 +1868,7 @@ static int ext4_da_map_blocks(struct inode *inode, struct ext4_map_blocks *map)
 	if (ext4_has_inline_data(inode))
 		retval = 0;
 	else
-		retval = ext4_map_query_blocks(NULL, inode, map);
+		retval = ext4_map_query_blocks(NULL, inode, map, 0);
 	up_read(&EXT4_I(inode)->i_data_sem);
 	if (retval)
 		return retval < 0 ? retval : 0;
@@ -1825,7 +1891,7 @@ static int ext4_da_map_blocks(struct inode *inode, struct ext4_map_blocks *map)
 			goto found;
 		}
 	} else if (!ext4_has_inline_data(inode)) {
-		retval = ext4_map_query_blocks(NULL, inode, map);
+		retval = ext4_map_query_blocks(NULL, inode, map, 0);
 		if (retval) {
 			up_write(&EXT4_I(inode)->i_data_sem);
 			return retval < 0 ? retval : 0;
@@ -3372,7 +3438,8 @@ static int ext4_map_blocks_atomic_write_slow(handle_t *handle,
 	map->m_lblk = m_lblk;
 	map->m_len = m_len;
 
-	ret = ext4_map_blocks(handle, inode, map, 0);
+	ret = ext4_map_blocks(handle, inode, map,
+			      EXT4_GET_BLOCKS_QUERY_LEAF_BLOCKS);
 	if (ret != m_len) {
 		ext4_warning_inode(inode, "allocation failed for atomic write request pos:%u, len:%u\n",
 			m_lblk, m_len);
-- 
2.49.0


Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ