linux-kernel - Re: [RFC, PATCH 4/6] jbd: refactor nested journal log recovery loop into separate functions

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20080311143550.GB6544@atrey.karlin.mff.cuni.cz>
Date:	Tue, 11 Mar 2008 15:35:50 +0100
From:	Jan Kara <jack@...e.cz>
To:	Duane Griffin <duaneg@...da.com>
Cc:	linux-ext4@...r.kernel.org, linux-kernel@...r.kernel.org,
	Theodore Tso <tytso@....edu>, sct@...hat.com,
	akpm@...ux-foundation.org, adilger@...sterfs.com
Subject: Re: [RFC, PATCH 4/6] jbd: refactor nested journal log recovery loop into separate functions

> The do_one_pass function iterates through the jbd log operating on blocks
> depending on the pass. During the replay pass descriptor blocks are processed
> by an inner loop which replays them. The nesting makes the code difficult to
> follow or to modify. Splitting the inner loop and replay code into separate
> functions simplifies matters.
> 
> The refactored code no longer unnecessarily reads revoked data blocks, so
> potential read errors from such blocks will cause differing behaviour. Aside
> from that there should be no functional effect.
  Hmm, if I read your patch correctly, previously we aborted journal
replay when we found a block with unknown block magic but now we
continue replaying. Why have you done such change? And similarly when
some error happened when parsing revoke records block...

									Honza

> 
> Signed-off-by: Duane Griffin <duaneg@...da.com>
> ---
> 
> Note the TODO before the block read in the outer loop below. We could possibly
> attempt to continue after a failed read as we do when replaying descriptor
> blocks. However if it was a descriptor block we'd likely bomb out on the next
> iteration anyway, so I'm not sure it would be worth it. Thoughts?
> 
>  fs/jbd/recovery.c |  243 ++++++++++++++++++++++++++++-------------------------
>  1 files changed, 127 insertions(+), 116 deletions(-)
> 
> diff --git a/fs/jbd/recovery.c b/fs/jbd/recovery.c
> index 2b8edf4..e2ac78f 100644
> --- a/fs/jbd/recovery.c
> +++ b/fs/jbd/recovery.c
> @@ -307,6 +307,101 @@ int journal_skip_recovery(journal_t *journal)
>  	return err;
>  }
>  
> +static int replay_data_block(
> +	journal_t *journal, struct buffer_head *obh, char *data,
> +	int flags, unsigned long blocknr)
> +{
> +	struct buffer_head *nbh;
> +
> +	/* Find a buffer for the new data being restored */
> +	nbh = __getblk(journal->j_fs_dev, blocknr, journal->j_blocksize);
> +	if (nbh == NULL)
> +		return -ENOMEM;
> +
> +	lock_buffer(nbh);
> +	memcpy(nbh->b_data, obh->b_data, journal->j_blocksize);
> +	if (flags & JFS_FLAG_ESCAPE)
> +		*((__be32 *)data) = cpu_to_be32(JFS_MAGIC_NUMBER);
> +
> +	BUFFER_TRACE(nbh, "marking dirty");
> +	set_buffer_uptodate(nbh);
> +	mark_buffer_dirty(nbh);
> +	BUFFER_TRACE(nbh, "marking uptodate");
> +	/* ll_rw_block(WRITE, 1, &nbh); */
> +	unlock_buffer(nbh);
> +	brelse(nbh);
> +
> +	return 0;
> +}
> +
> +/* Replay a descriptor block: write all of the data blocks. */
> +static int replay_descriptor_block(
> +	journal_t *journal, char *data,
> +	unsigned int next_commit_ID,
> +	unsigned long *next_log_block,
> +	struct recovery_info *info)
> +{
> +	int err;
> +	int success = 0;
> +	char *tagp;
> +	unsigned long io_block;
> +	struct buffer_head *obh;
> +	unsigned long next = *next_log_block;
> +
> +	tagp = &data[sizeof(journal_header_t)];
> +	while ((tagp - data + sizeof(journal_block_tag_t)) <=
> +	       journal->j_blocksize) {
> +		unsigned long blocknr;
> +		journal_block_tag_t *tag = (journal_block_tag_t *) tagp;
> +		int flags = be32_to_cpu(tag->t_flags);
> +
> +		io_block = next++;
> +		wrap(journal, next);
> +		blocknr = be32_to_cpu(tag->t_blocknr);
> +
> +		/* If the block has been revoked, then we're all done here. */
> +		if (journal_test_revoke(journal, blocknr, next_commit_ID)) {
> +			++info->nr_revoke_hits;
> +			goto skip_write;
> +		}
> +
> +		err = jread(&obh, journal, io_block);
> +		if (err) {
> +
> +			/* Recover what we can, but
> +			 * report failure at the end. */
> +			success = err;
> +			printk(KERN_ERR "JBD: IO error %d recovering "
> +			       "block %ld in log\n", err, io_block);
> +			continue;
> +		}
> +		J_ASSERT(obh != NULL);
> +
> +		err = replay_data_block(journal, obh, data, flags, blocknr);
> +		if (err) {
> +			brelse(obh);
> +			printk(KERN_ERR "JBD: Out of memory during recovery\n");
> +			success = err;
> +			goto failed;
> +		}
> +
> +		++info->nr_replays;
> +		brelse(obh);
> +
> +skip_write:
> +		tagp += sizeof(journal_block_tag_t);
> +		if (!(flags & JFS_FLAG_SAME_UUID))
> +			tagp += 16;
> +
> +		if (flags & JFS_FLAG_LAST_TAG)
> +			break;
> +	}
> +
> +	*next_log_block = next;
> +failed:
> +	return success;
> +}
> +
>  static int do_one_pass(journal_t *journal,
>  			struct recovery_info *info, enum passtype pass)
>  {
> @@ -316,8 +411,6 @@ static int do_one_pass(journal_t *journal,
>  	journal_superblock_t *	sb;
>  	journal_header_t *	tmp;
>  	struct buffer_head *	bh;
> -	unsigned int		sequence;
> -	int			blocktype;
>  
>  	/* Precompute the maximum metadata descriptors in a descriptor block */
>  	int			MAX_BLOCKS_PER_DESC;
> @@ -348,11 +441,8 @@ static int do_one_pass(journal_t *journal,
>  	 */
>  
>  	while (1) {
> -		int			flags;
> -		char *			tagp;
> -		journal_block_tag_t *	tag;
> -		struct buffer_head *	obh;
> -		struct buffer_head *	nbh;
> +		int blocktype;
> +		unsigned int sequence;
>  
>  		cond_resched();
>  
> @@ -371,10 +461,13 @@ static int do_one_pass(journal_t *journal,
>  		 * either the next descriptor block or the final commit
>  		 * record. */
>  
> +		/* TODO: Should we continue on error? */
>  		jbd_debug(3, "JBD: checking block %ld\n", next_log_block);
>  		err = jread(&bh, journal, next_log_block);
> -		if (err)
> +		if (err) {
> +			success = err;
>  			goto failed;
> +		}
>  
>  		next_log_block++;
>  		wrap(journal, next_log_block);
> @@ -406,137 +499,57 @@ static int do_one_pass(journal_t *journal,
>  		 * all of the sequence number checks.  What are we going
>  		 * to do with it?  That depends on the pass... */
>  
> -		switch(blocktype) {
> +		switch (blocktype) {
>  		case JFS_DESCRIPTOR_BLOCK:
>  			/* If it is a valid descriptor block, replay it
>  			 * in pass REPLAY; otherwise, just skip over the
>  			 * blocks it describes. */
> -			if (pass != PASS_REPLAY) {
> +			if (pass == PASS_REPLAY) {
> +				err = replay_descriptor_block(
> +					journal,
> +					bh->b_data,
> +					next_commit_ID,
> +					&next_log_block,
> +					info);
> +			} else {
>  				next_log_block +=
>  					count_tags(bh, journal->j_blocksize);
>  				wrap(journal, next_log_block);
> -				brelse(bh);
> -				continue;
>  			}
> -
> -			/* A descriptor block: we can now write all of
> -			 * the data blocks.  Yay, useful work is finally
> -			 * getting done here! */
> -
> -			tagp = &bh->b_data[sizeof(journal_header_t)];
> -			while ((tagp - bh->b_data +sizeof(journal_block_tag_t))
> -			       <= journal->j_blocksize) {
> -				unsigned long io_block;
> -
> -				tag = (journal_block_tag_t *) tagp;
> -				flags = be32_to_cpu(tag->t_flags);
> -
> -				io_block = next_log_block++;
> -				wrap(journal, next_log_block);
> -				err = jread(&obh, journal, io_block);
> -				if (err) {
> -					/* Recover what we can, but
> -					 * report failure at the end. */
> -					success = err;
> -					printk (KERN_ERR
> -						"JBD: IO error %d recovering "
> -						"block %ld in log\n",
> -						err, io_block);
> -				} else {
> -					unsigned long blocknr;
> -
> -					J_ASSERT(obh != NULL);
> -					blocknr = be32_to_cpu(tag->t_blocknr);
> -
> -					/* If the block has been
> -					 * revoked, then we're all done
> -					 * here. */
> -					if (journal_test_revoke
> -					    (journal, blocknr,
> -					     next_commit_ID)) {
> -						brelse(obh);
> -						++info->nr_revoke_hits;
> -						goto skip_write;
> -					}
> -
> -					/* Find a buffer for the new
> -					 * data being restored */
> -					nbh = __getblk(journal->j_fs_dev,
> -							blocknr,
> -							journal->j_blocksize);
> -					if (nbh == NULL) {
> -						printk(KERN_ERR
> -						       "JBD: Out of memory "
> -						       "during recovery.\n");
> -						err = -ENOMEM;
> -						brelse(bh);
> -						brelse(obh);
> -						goto failed;
> -					}
> -
> -					lock_buffer(nbh);
> -					memcpy(nbh->b_data, obh->b_data,
> -							journal->j_blocksize);
> -					if (flags & JFS_FLAG_ESCAPE) {
> -						*((__be32 *)bh->b_data) =
> -						cpu_to_be32(JFS_MAGIC_NUMBER);
> -					}
> -
> -					BUFFER_TRACE(nbh, "marking dirty");
> -					set_buffer_uptodate(nbh);
> -					mark_buffer_dirty(nbh);
> -					BUFFER_TRACE(nbh, "marking uptodate");
> -					++info->nr_replays;
> -					/* ll_rw_block(WRITE, 1, &nbh); */
> -					unlock_buffer(nbh);
> -					brelse(obh);
> -					brelse(nbh);
> -				}
> -
> -			skip_write:
> -				tagp += sizeof(journal_block_tag_t);
> -				if (!(flags & JFS_FLAG_SAME_UUID))
> -					tagp += 16;
> -
> -				if (flags & JFS_FLAG_LAST_TAG)
> -					break;
> -			}
> -
> -			brelse(bh);
> -			continue;
> +			break;
>  
>  		case JFS_COMMIT_BLOCK:
>  			/* Found an expected commit block: not much to
>  			 * do other than move on to the next sequence
>  			 * number. */
> -			brelse(bh);
>  			next_commit_ID++;
> -			continue;
> +			break;
>  
>  		case JFS_REVOKE_BLOCK:
>  			/* If we aren't in the REVOKE pass, then we can
>  			 * just skip over this block. */
> -			if (pass != PASS_REVOKE) {
> -				brelse(bh);
> -				continue;
> +			if (pass == PASS_REVOKE) {
> +				err = scan_revoke_records(
> +					journal, bh, next_commit_ID, info);
>  			}
> -
> -			err = scan_revoke_records(journal, bh,
> -						  next_commit_ID, info);
> -			brelse(bh);
> -			if (err)
> -				goto failed;
> -			continue;
> +			break;
>  
>  		default:
>  			jbd_debug(3, "Unrecognised magic %d, end of scan.\n",
>  				  blocktype);
> -			brelse(bh);
> -			goto done;
> +			break;
>  		}
> +
> +		brelse(bh);
> +
> +		/* Immediately fail on OOM; on other errors try to recover
> +		 * as much as we can, but report the failure at the end. */
> +		if (err)
> +			success = err;
> +		if (err == -ENOMEM)
> +			goto failed;
>  	}
>  
> - done:
>  	/*
>  	 * We broke out of the log scan loop: either we came to the
>  	 * known end of the log or we found an unexpected block in the
> @@ -558,10 +571,8 @@ static int do_one_pass(journal_t *journal,
>  		}
>  	}
>  
> -	return success;
> -
>   failed:
> -	return err;
> +	return success;
>  }
>  
>  
> -- 
> 1.5.3.7
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
> the body of a message to majordomo@...r.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
-- 
Jan Kara <jack@...e.cz>
SuSE CR Labs
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/