[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20250530062858.458039-2-yi.zhang@huaweicloud.com>
Date: Fri, 30 May 2025 14:28:54 +0800
From: Zhang Yi <yi.zhang@...weicloud.com>
To: linux-ext4@...r.kernel.org
Cc: linux-fsdevel@...r.kernel.org,
linux-kernel@...r.kernel.org,
tytso@....edu,
adilger.kernel@...ger.ca,
jack@...e.cz,
ojaswin@...ux.ibm.com,
yi.zhang@...wei.com,
yi.zhang@...weicloud.com,
libaokun1@...wei.com,
yukuai3@...wei.com,
yangerkun@...wei.com
Subject: [PATCH 1/5] ext4: restart handle if credits are insufficient during writepages
From: Zhang Yi <yi.zhang@...wei.com>
After large folios are supported on ext4, writing back a sufficiently
large and discontinuous folio may consume a significant number of
journal credits, placing considerable strain on the journal. For
example, in a 20GB filesystem with 1K block size and 1MB journal size,
writing back a 2MB folio could require thousands of credits in the
worst-case scenario (when each block is discontinuous and distributed
across different block groups), potentially exceeding the journal size.
Fix this by making the write-back process first reserve credits for one
page and attempt to extend the transaction if the credits are
insufficient. In particular, if the credits for a transaction reach
their upper limit, stop the handle and initiate a new transaction.
Note that since we do not support partial folio writeouts, some blocks
within this folio may have been allocated. These allocated extents are
submitted through the current transaction, but the folio itself is not
submitted. To prevent stale data and potential deadlocks in ordered
mode, only the dioread_nolock mode supports this solution, as it always
allocates unwritten extents.
Suggested-by: Jan Kara <jack@...e.cz>
Signed-off-by: Zhang Yi <yi.zhang@...wei.com>
---
fs/ext4/inode.c | 57 +++++++++++++++++++++++++++++++++++++++++++------
1 file changed, 51 insertions(+), 6 deletions(-)
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index be9a4cba35fd..5ef34c0c5633 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1680,6 +1680,7 @@ struct mpage_da_data {
unsigned int do_map:1;
unsigned int scanned_until_end:1;
unsigned int journalled_more_data:1;
+ unsigned int continue_map:1;
};
static void mpage_release_unused_pages(struct mpage_da_data *mpd,
@@ -2367,6 +2368,8 @@ static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd)
*
* @handle - handle for journal operations
* @mpd - extent to map
+ * @needed_blocks - journal credits needed for one writepages iteration
+ * @check_blocks - journal credits needed to map one extent
* @give_up_on_write - we set this to true iff there is a fatal error and there
* is no hope of writing the data. The caller should discard
* dirty pages to avoid infinite loops.
@@ -2383,6 +2386,7 @@ static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd)
*/
static int mpage_map_and_submit_extent(handle_t *handle,
struct mpage_da_data *mpd,
+ int needed_blocks, int check_blocks,
bool *give_up_on_write)
{
struct inode *inode = mpd->inode;
@@ -2393,6 +2397,8 @@ static int mpage_map_and_submit_extent(handle_t *handle,
ext4_io_end_t *io_end = mpd->io_submit.io_end;
struct ext4_io_end_vec *io_end_vec;
+ mpd->continue_map = 0;
+
io_end_vec = ext4_alloc_io_end_vec(io_end);
if (IS_ERR(io_end_vec))
return PTR_ERR(io_end_vec);
@@ -2439,6 +2445,34 @@ static int mpage_map_and_submit_extent(handle_t *handle,
err = mpage_map_and_submit_buffers(mpd);
if (err < 0)
goto update_disksize;
+ if (!map->m_len)
+ goto update_disksize;
+
+ /*
+ * For mapping a folio that is sufficiently large and
+ * discontinuous, the current handle credits may be
+ * insufficient, try to extend the handle.
+ */
+ err = __ext4_journal_ensure_credits(handle, check_blocks,
+ needed_blocks, 0);
+ if (err < 0)
+ goto update_disksize;
+ /*
+ * The credits for the current handle and transaction have
+ * reached their upper limit, stop the handle and initiate a
+ * new transaction. Note that some blocks in this folio may
+ * have been allocated, and these allocated extents are
+ * submitted through the current transaction, but the folio
+ * itself is not submitted. To prevent stale data and
+ * potential deadlock in ordered mode, only the
+ * dioread_nolock mode supports this.
+ */
+ if (err > 0) {
+ WARN_ON_ONCE(!ext4_should_dioread_nolock(inode));
+ mpd->continue_map = 1;
+ err = 0;
+ goto update_disksize;
+ }
} while (map->m_len);
update_disksize:
@@ -2467,6 +2501,9 @@ static int mpage_map_and_submit_extent(handle_t *handle,
if (!err)
err = err2;
}
+ if (!err && mpd->continue_map)
+ ext4_get_io_end(io_end);
+
return err;
}
@@ -2703,7 +2740,7 @@ static int ext4_do_writepages(struct mpage_da_data *mpd)
handle_t *handle = NULL;
struct inode *inode = mpd->inode;
struct address_space *mapping = inode->i_mapping;
- int needed_blocks, rsv_blocks = 0, ret = 0;
+ int needed_blocks, check_blocks, rsv_blocks = 0, ret = 0;
struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
struct blk_plug plug;
bool give_up_on_write = false;
@@ -2825,10 +2862,13 @@ static int ext4_do_writepages(struct mpage_da_data *mpd)
while (!mpd->scanned_until_end && wbc->nr_to_write > 0) {
/* For each extent of pages we use new io_end */
- mpd->io_submit.io_end = ext4_init_io_end(inode, GFP_KERNEL);
if (!mpd->io_submit.io_end) {
- ret = -ENOMEM;
- break;
+ mpd->io_submit.io_end =
+ ext4_init_io_end(inode, GFP_KERNEL);
+ if (!mpd->io_submit.io_end) {
+ ret = -ENOMEM;
+ break;
+ }
}
WARN_ON_ONCE(!mpd->can_map);
@@ -2841,10 +2881,13 @@ static int ext4_do_writepages(struct mpage_da_data *mpd)
*/
BUG_ON(ext4_should_journal_data(inode));
needed_blocks = ext4_da_writepages_trans_blocks(inode);
+ check_blocks = ext4_chunk_trans_blocks(inode,
+ MAX_WRITEPAGES_EXTENT_LEN);
/* start a new transaction */
handle = ext4_journal_start_with_reserve(inode,
- EXT4_HT_WRITE_PAGE, needed_blocks, rsv_blocks);
+ EXT4_HT_WRITE_PAGE, needed_blocks,
+ mpd->continue_map ? 0 : rsv_blocks);
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
ext4_msg(inode->i_sb, KERN_CRIT, "%s: jbd2_start: "
@@ -2861,6 +2904,7 @@ static int ext4_do_writepages(struct mpage_da_data *mpd)
ret = mpage_prepare_extent_to_map(mpd);
if (!ret && mpd->map.m_len)
ret = mpage_map_and_submit_extent(handle, mpd,
+ needed_blocks, check_blocks,
&give_up_on_write);
/*
* Caution: If the handle is synchronous,
@@ -2894,7 +2938,8 @@ static int ext4_do_writepages(struct mpage_da_data *mpd)
ext4_journal_stop(handle);
} else
ext4_put_io_end(mpd->io_submit.io_end);
- mpd->io_submit.io_end = NULL;
+ if (ret || !mpd->continue_map)
+ mpd->io_submit.io_end = NULL;
if (ret == -ENOSPC && sbi->s_journal) {
/*
--
2.46.1
Powered by blists - more mailing lists