linux-ext4 - [PATCH] jbd2: recheck chechpointing non-dirty buffer

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives

Hash Suite: Windows password security audit tool. GUI, reports in PDF.

[<prev] [next>] [thread-next>] [day] [month] [year] [list]

Message-Id: <20230426131041.1004383-1-yi.zhang@huaweicloud.com>
Date:   Wed, 26 Apr 2023 21:10:41 +0800
From:   Zhang Yi <yi.zhang@...weicloud.com>
To:     linux-ext4@...r.kernel.org
Cc:     tytso@....edu, adilger.kernel@...ger.ca, jack@...e.cz,
        yi.zhang@...wei.com, yi.zhang@...weicloud.com, yukuai3@...wei.com,
        chengzhihao1@...wei.com
Subject: [PATCH] jbd2: recheck chechpointing non-dirty buffer

From: Zhang Yi <yi.zhang@...wei.com>

There is a long-standing metadata corruption issue that happens from
time to time, but it is very difficult to reproduce and analyse. Thanks
to the JBD2_CYCLE_RECORD option, we found that the problem is that the
checkpointing process fails to write out some buffers that are raced by
another do_get_write_access(). See below for details.

jbd2_log_do_checkpoint() //transaction X
 //buffer A is dirty and does not belong to any transaction
 __buffer_relink_io() //move it to the IO list
 __flush_batch()
  write_dirty_buffer()
                             do_get_write_access()
                             clear_buffer_dirty
                             __jbd2_journal_file_buffer()
                             //add buffer A to a new transaction Y
   lock_buffer(bh)
   //doesn't write out
 __jbd2_journal_remove_checkpoint()
 //finish checkpoint except buffer A
 //filesystem is corrupted if the new transaction Y isn't fully written out.

The fix is subtle because we can't trust the checkpointing buffers and
transactions once we release the j_list_lock: they could have been
written back and checkpointed by someone else, or they could have been
added to a new transaction. So if they are clean and don't need to be
written out, we have to re-add them to the checkpoint list and recheck
their status.

Cc: stable@...r.kernel.org
Signed-off-by: Zhang Yi <yi.zhang@...wei.com>
Tested-by: Zhihao Cheng <chengzhihao1@...wei.com>
---
 fs/jbd2/checkpoint.c | 52 +++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 49 insertions(+), 3 deletions(-)

diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index 51bd38da21cd..1aca860eb0f6 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -77,8 +77,31 @@ static inline void __buffer_relink_io(struct journal_head *jh)
 		jh->b_cpnext->b_cpprev = jh;
 	}
 	transaction->t_checkpoint_io_list = jh;
+	transaction->t_chp_stats.cs_written++;
 }
 
+/*
+ * Move a buffer from the checkpoint io list back to the checkpoint list
+ *
+ * Called with j_list_lock held
+ */
+static inline void __buffer_relink_cp(struct journal_head *jh)
+{
+	transaction_t *transaction = jh->b_cp_transaction;
+
+	__buffer_unlink(jh);
+
+	if (!transaction->t_checkpoint_list) {
+		jh->b_cpnext = jh->b_cpprev = jh;
+	} else {
+		jh->b_cpnext = transaction->t_checkpoint_list;
+		jh->b_cpprev = transaction->t_checkpoint_list->b_cpprev;
+		jh->b_cpprev->b_cpnext = jh;
+		jh->b_cpnext->b_cpprev = jh;
+	}
+	transaction->t_checkpoint_list = jh;
+	transaction->t_chp_stats.cs_written--;
+}
 /*
  * Check a checkpoint buffer could be release or not.
  *
@@ -175,8 +198,31 @@ __flush_batch(journal_t *journal, int *batch_count)
 	struct blk_plug plug;
 
 	blk_start_plug(&plug);
-	for (i = 0; i < *batch_count; i++)
-		write_dirty_buffer(journal->j_chkpt_bhs[i], REQ_SYNC);
+	for (i = 0; i < *batch_count; i++) {
+		struct buffer_head *bh = journal->j_chkpt_bhs[i];
+		struct journal_head *jh = bh2jh(bh);
+
+		lock_buffer(bh);
+		/*
+		 * This buffer isn't dirty; a new transaction may have taken
+		 * write access to it again. Re-add it to the checkpoint
+		 * list if it still needs to be checkpointed, and wait
+		 * until that transaction has finished writing it out.
+		 */
+		if (!test_clear_buffer_dirty(bh)) {
+			unlock_buffer(bh);
+			spin_lock(&journal->j_list_lock);
+			if (jh->b_cp_transaction)
+				__buffer_relink_cp(jh);
+			spin_unlock(&journal->j_list_lock);
+			jbd2_journal_put_journal_head(jh);
+			continue;
+		}
+		jbd2_journal_put_journal_head(jh);
+		bh->b_end_io = end_buffer_write_sync;
+		get_bh(bh);
+		submit_bh(REQ_OP_WRITE | REQ_SYNC, bh);
+	}
 	blk_finish_plug(&plug);
 
 	for (i = 0; i < *batch_count; i++) {
@@ -303,9 +349,9 @@ int jbd2_log_do_checkpoint(journal_t *journal)
 		BUFFER_TRACE(bh, "queue");
 		get_bh(bh);
 		J_ASSERT_BH(bh, !buffer_jwrite(bh));
+		jbd2_journal_grab_journal_head(bh);
 		journal->j_chkpt_bhs[batch_count++] = bh;
 		__buffer_relink_io(jh);
-		transaction->t_chp_stats.cs_written++;
 		if ((batch_count == JBD2_NR_BATCH) ||
 		    need_resched() ||
 		    spin_needbreak(&journal->j_list_lock))
-- 
2.31.1