lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [day] [month] [year] [list]
Message-ID: <20240125200032.4c38a9ae@x390.bzzz77.ru>
Date: Thu, 25 Jan 2024 20:01:55 +0300
From: Alex Zhuravlev <bzzz@...mcloud.com>
To: linux-ext4@...r.kernel.org
Subject: [PATCH] merge extent blocks when possible

Hi,

Please have a look at the patch attempting to handle the problem with deep extent tree.

Thanks, Alex

In some cases when a lot of extents are created initially by sparse file writes, they
get merged over time, but there is no way to merge blocks in different indexes.

For example, if a file is written synchronously, all even blocks first, then odd blocks.
The resulting extents tree looks like the following in "debugfs stat" output, often
with only a single block in each index/leaf:

   EXTENTS:
   (ETB0):33796
   (ETB1):33795
   (0-677):2588672-2589349
   (ETB1):2590753
   (678):2589350
   (ETB1):2590720
   (679-1357):2589351-2590029
   (ETB1):2590752
   (1358):2590030
   (ETB1):2590721
   (1359-2037):2590031-2590709
   (ETB1):2590751
   (2038):2590710
   (ETB1):2590722
   :
   :

With the patch applied the index and lead blocks are properly merged (0.6% slower
under this random sync write workload, but later read IOPS are greatly reduced):

   EXTENTS:
   (ETB0):33796
   (ETB1):2590736
   (0-2047):2588672-2590719
   (2048-11999):2592768-2602719

Originally the problem was hit with a real application operating on huge datasets and with just
27371 extents "inode has invalid extent depth: 6" problem occurred.
With the patch applied the application succeeded having finally 73637 in 3-level tree.

Signed-off-by: Alex Zhuravlev <bzzz@...mcloud.com>
---
 fs/ext4/extents.c     | 185 ++++++++++++++++++++++++++++++++++++++++--
 fs/jbd2/transaction.c |   1 +
 2 files changed, 180 insertions(+), 6 deletions(-)

diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 01299b55a567..0fd9fbd65711 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -1885,7 +1885,7 @@ static void ext4_ext_try_to_merge_up(handle_t *handle,
  * This function tries to merge the @ex extent to neighbours in the tree, then
  * tries to collapse the extent tree into the inode.
  */
-static void ext4_ext_try_to_merge(handle_t *handle,
+static int ext4_ext_try_to_merge(handle_t *handle,
 				  struct inode *inode,
 				  struct ext4_ext_path *path,
 				  struct ext4_extent *ex)
@@ -1902,9 +1902,178 @@ static void ext4_ext_try_to_merge(handle_t *handle,
 		merge_done = ext4_ext_try_to_merge_right(inode, path, ex - 1);
 
 	if (!merge_done)
-		(void) ext4_ext_try_to_merge_right(inode, path, ex);
+		merge_done = ext4_ext_try_to_merge_right(inode, path, ex);
 
 	ext4_ext_try_to_merge_up(handle, inode, path);
+
+	return merge_done;
+}
+
+/*
+ * This function tries to merge blocks from @path into @npath
+ */
+static int ext4_ext_merge_blocks(handle_t *handle,
+				struct inode *inode,
+				struct ext4_ext_path *path,
+				struct ext4_ext_path *npath)
+{
+	unsigned int depth = ext_depth(inode);
+	int used, nused, free, i, k, err;
+	ext4_fsblk_t next;
+
+	if (path[depth].p_hdr == npath[depth].p_hdr)
+		return 0;
+
+	used = le16_to_cpu(path[depth].p_hdr->eh_entries);
+	free = le16_to_cpu(npath[depth].p_hdr->eh_max) -
+		le16_to_cpu(npath[depth].p_hdr->eh_entries);
+	if (free < used)
+		return 0;
+
+	err = ext4_ext_get_access(handle, inode, path + depth);
+	if (err)
+		return err;
+	err = ext4_ext_get_access(handle, inode, npath + depth);
+	if (err)
+		return err;
+
+	/* move entries from the current leave to the next one */
+	nused = le16_to_cpu(npath[depth].p_hdr->eh_entries);
+	memmove(EXT_FIRST_EXTENT(npath[depth].p_hdr) + used,
+		EXT_FIRST_EXTENT(npath[depth].p_hdr),
+		nused * sizeof(struct ext4_extent));
+	memcpy(EXT_FIRST_EXTENT(npath[depth].p_hdr),
+		EXT_FIRST_EXTENT(path[depth].p_hdr),
+		used * sizeof(struct ext4_extent));
+	le16_add_cpu(&npath[depth].p_hdr->eh_entries, used);
+	le16_add_cpu(&path[depth].p_hdr->eh_entries, -used);
+	ext4_ext_try_to_merge_right(inode, npath,
+					EXT_FIRST_EXTENT(npath[depth].p_hdr));
+
+	err = ext4_ext_dirty(handle, inode, path + depth);
+	if (err)
+		return err;
+	err = ext4_ext_dirty(handle, inode, npath + depth);
+	if (err)
+		return err;
+
+	/* otherwise the index won't get corrected */
+	npath[depth].p_ext = EXT_FIRST_EXTENT(npath[depth].p_hdr);
+	err = ext4_ext_correct_indexes(handle, inode, npath);
+	if (err)
+		return err;
+
+	for (i = depth - 1; i >= 0; i--) {
+
+		next = ext4_idx_pblock(path[i].p_idx);
+		ext4_free_blocks(handle, inode, NULL, next, 1,
+				EXT4_FREE_BLOCKS_METADATA |
+				EXT4_FREE_BLOCKS_FORGET);
+		err = ext4_ext_get_access(handle, inode, path + i);
+		if (err)
+			return err;
+		le16_add_cpu(&path[i].p_hdr->eh_entries, -1);
+		if (le16_to_cpu(path[i].p_hdr->eh_entries) == 0) {
+			/* whole index block collapsed, go up */
+			continue;
+		}
+		/* remove index pointer */
+		used = EXT_LAST_INDEX(path[i].p_hdr) - path[i].p_idx + 1;
+		memmove(path[i].p_idx, path[i].p_idx + 1,
+			used * sizeof(struct ext4_extent_idx));
+
+		err = ext4_ext_dirty(handle, inode, path + i);
+		if (err)
+			return err;
+
+		if (path[i].p_hdr == npath[i].p_hdr)
+			break;
+
+		/* try to move index pointers */
+		used = le16_to_cpu(path[i].p_hdr->eh_entries);
+		free = le16_to_cpu(npath[i].p_hdr->eh_max) -
+			le16_to_cpu(npath[i].p_hdr->eh_entries);
+		if (used > free)
+			break;
+		err = ext4_ext_get_access(handle, inode, npath + i);
+		if (err)
+			return err;
+		memmove(EXT_FIRST_INDEX(npath[i].p_hdr) + used,
+			EXT_FIRST_INDEX(npath[i].p_hdr),
+			npath[i].p_hdr->eh_entries * sizeof(struct ext4_extent_idx));
+		memcpy(EXT_FIRST_INDEX(npath[i].p_hdr), EXT_FIRST_INDEX(path[i].p_hdr),
+			used * sizeof(struct ext4_extent_idx));
+		le16_add_cpu(&path[i].p_hdr->eh_entries, -used);
+		le16_add_cpu(&npath[i].p_hdr->eh_entries, used);
+		err = ext4_ext_dirty(handle, inode, path + i);
+		if (err)
+			return err;
+		err = ext4_ext_dirty(handle, inode, npath + i);
+		if (err)
+			return err;
+
+		/* correct index above */
+		for (k = i; k > 0; k--) {
+			err = ext4_ext_get_access(handle, inode, npath + k - 1);
+			if (err)
+				return err;
+			npath[k-1].p_idx->ei_block =
+				EXT_FIRST_INDEX(npath[k].p_hdr)->ei_block;
+			err = ext4_ext_dirty(handle, inode, npath + k - 1);
+			if (err)
+				return err;
+		}
+	}
+
+	/*
+	 * TODO: given we've got two paths, it should be possible to
+	 * collapse those two blocks into the root one in some cases
+	 */
+	return 1;
+}
+
+static int ext4_ext_try_to_merge_blocks(handle_t *handle,
+		struct inode *inode,
+		struct ext4_ext_path *path)
+{
+	struct ext4_ext_path *npath = NULL;
+	unsigned int depth = ext_depth(inode);
+	ext4_lblk_t next;
+	int used, rc = 0;
+
+	if (depth == 0)
+		return 0;
+
+	used = le16_to_cpu(path[depth].p_hdr->eh_entries);
+	/* don't be too agressive as checking space in
+	 * the next block is not free */
+	if (used > ext4_ext_space_block(inode, 0) / 4)
+		return 0;
+
+	/* try to merge to the next block */
+	next = ext4_ext_next_leaf_block(path);
+	if (next == EXT_MAX_BLOCKS)
+		return 0;
+	npath = ext4_find_extent(inode, next, NULL, 0);
+	if (IS_ERR(npath))
+		return 0;
+	rc = ext4_ext_merge_blocks(handle, inode, path, npath);
+	ext4_ext_drop_refs(npath);
+	kfree(npath);
+	if (rc)
+		return rc > 0 ? 0 : rc;
+
+	/* try to merge with the previous block */
+	if (EXT_FIRST_EXTENT(path[depth].p_hdr)->ee_block == 0)
+		return 0;
+	next = EXT_FIRST_EXTENT(path[depth].p_hdr)->ee_block - 1;
+	npath = ext4_find_extent(inode, next, NULL, 0);
+	if (IS_ERR(npath))
+		return 0;
+	rc = ext4_ext_merge_blocks(handle, inode, npath, path);
+	ext4_ext_drop_refs(npath);
+	kfree(npath);
+	return rc > 0 ? 0 : rc;
 }
 
 /*
@@ -1976,6 +2145,7 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
 	int depth, len, err;
 	ext4_lblk_t next;
 	int mb_flags = 0, unwritten;
+	int merged = 0;
 
 	if (gb_flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
 		mb_flags |= EXT4_MB_DELALLOC_RESERVED;
@@ -2167,8 +2337,7 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
 merge:
 	/* try to merge extents */
 	if (!(gb_flags & EXT4_GET_BLOCKS_PRE_IO))
-		ext4_ext_try_to_merge(handle, inode, path, nearex);
-
+		merged = ext4_ext_try_to_merge(handle, inode, path, nearex);
 
 	/* time to correct all indexes above */
 	err = ext4_ext_correct_indexes(handle, inode, path);
@@ -2176,6 +2345,8 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
 		goto cleanup;
 
 	err = ext4_ext_dirty(handle, inode, path + path->p_depth);
+	if (!err && merged)
+		err = ext4_ext_try_to_merge_blocks(handle, inode, path);
 
 cleanup:
 	ext4_free_ext_path(npath);
@@ -3765,7 +3936,8 @@ static int ext4_convert_unwritten_extents_endio(handle_t *handle,
 	/* note: ext4_ext_correct_indexes() isn't needed here because
 	 * borders are not changed
 	 */
-	ext4_ext_try_to_merge(handle, inode, path, ex);
+	if (ext4_ext_try_to_merge(handle, inode, path, ex))
+		ext4_ext_try_to_merge_blocks(handle, inode, path);
 
 	/* Mark modified extent as dirty */
 	err = ext4_ext_dirty(handle, inode, path + path->p_depth);
@@ -3828,7 +4000,8 @@ convert_initialized_extent(handle_t *handle, struct inode *inode,
 	/* note: ext4_ext_correct_indexes() isn't needed here because
 	 * borders are not changed
 	 */
-	ext4_ext_try_to_merge(handle, inode, path, ex);
+	if (ext4_ext_try_to_merge(handle, inode, path, ex))
+		ext4_ext_try_to_merge_blocks(handle, inode, path);
 
 	/* Mark modified extent as dirty */
 	err = ext4_ext_dirty(handle, inode, path + path->p_depth);
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index cb0b8d6fc0c6..4cd738fa408e 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -513,6 +513,7 @@ handle_t *jbd2__journal_start(journal_t *journal, int nblocks, int rsv_blocks,
 		}
 		rsv_handle->h_reserved = 1;
 		rsv_handle->h_journal = journal;
+		rsv_handle->h_revoke_credits = revoke_records;
 		handle->h_rsv_handle = rsv_handle;
 	}
 	handle->h_revoke_credits = revoke_records;
-- 
2.43.0


Powered by blists - more mailing lists