From 759062a21176c46a8fc1fa4d95e20c4450a10b9a Mon Sep 17 00:00:00 2001 From: Manfred Spraul Date: Thu, 30 Dec 2021 08:22:00 +0100 Subject: [PATCH] jbd2/recovery.c: Continue on csum failures for commit record Noticed with simulated power failures, i.e. not on real hardware: The failure model is: - 512-byte writes are atomic. - Larger writes are not atomic. - Everything is written in order. Since the JBD2 block size can be larger than the block size of the physical drive, it may happen that a (JBD2) block starts with the expected magic/block type==JBD2_COMMIT_BLOCK/sequence number/ commit time, but nevertheless the csum verification fails because jbd2_commit_block_csum_verify() calculates a checksum over the complete JBD2 block. Thus: Just end the scan on a csum failure. Note: The change is most likely incomplete. There are probably more situations where the code assumes that an incorrect csum is always a corruption. --- fs/jbd2/recovery.c | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c index 8ca3527189f8..f6d59bc204a1 100644 --- a/fs/jbd2/recovery.c +++ b/fs/jbd2/recovery.c @@ -709,6 +709,20 @@ static int do_one_pass(journal_t *journal, /* How to differentiate between interrupted commit * and journal corruption ? * + * Assume: Physical block size 512 bytes, + * j->j_blocksize=1024 + * If the 1st physical block of a commit block is + * written, then the correct magic/block type/ + * sequence number/commit time will be there. + * If the 2nd block is not written, then the csum + * verification will fail, because the csum is + * calculated over the whole JBD2 block. + * + * Thus: Only with async_commit, when the n-th transaction + * fails the csum check and the (n+1)th transaction passes + * it, is it a journal corruption. Everything else could be + * just an interrupted write. 
+ * * {nth transaction} * Checksum Verification Failed * | @@ -717,7 +731,7 @@ static int do_one_pass(journal_t *journal, * async_commit sync_commit * | | * | GO TO NEXT "Journal Corruption" - * | TRANSACTION + * | TRANSACTION or "Interrupted Commit" * | * {(n+1)th transanction} * | @@ -806,8 +820,9 @@ static int do_one_pass(journal_t *journal, info->end_transaction = next_commit_ID; if (!jbd2_has_feature_async_commit(journal)) { - journal->j_failed_commit = - next_commit_ID; + /* Interrupted commit or corrupt + * journal. Assume interrupted commit. + */ brelse(bh); break; } -- 2.33.1