linux-kernel - Re: [CALL FOR TESTING] Make Ext3 fsck way faster [2.6.23.13, updated patch]

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [day] [month] [year] [list]
Message-Id: <200801141421.22169.abhishekrai@google.com>
Date:	Mon, 14 Jan 2008 14:21:21 -0500
From:	Abhishek Rai <abhishekrai@...gle.com>
To:	linux-kernel@...r.kernel.org
Subject: Re: [CALL FOR TESTING] Make Ext3 fsck way faster [2.6.23.13, updated patch]


Here's an updated patch for 2.6.23.13 that fixes a problem introduced due to
patching that I discovered while testing the 2.6.23.13 change I sent out earlier
today (the -mm tree patch does not suffer from this problem).

Thanks,
Abhishek


Signed-off-by: Abhishek Rai <abhishekrai@...gle.com>

diff -rupdN linux-2.6.23.13-clean/fs/ext3/balloc.c linux-2.6.23.13-ext3mc/fs/ext3/balloc.c
--- linux-2.6.23.13-clean/fs/ext3/balloc.c	2008-01-09 12:18:17.000000000 -0500
+++ linux-2.6.23.13-ext3mc/fs/ext3/balloc.c	2008-01-12 23:59:36.000000000 -0500
@@ -33,6 +33,29 @@
  * super block.  Each descriptor contains the number of the bitmap block and
  * the free blocks count in the block.  The descriptors are loaded in memory
  * when a file system is mounted (see ext3_fill_super).
+ *
+ * A note on ext3 metaclustering:
+ *
+ * 	Start of						End of
+ * 	block group						block group
+ * 	 ________________________________________________________________
+ * 	|	NON-MC REGION			|	MC REGION	 |
+ * 	|					|Overflow		 |
+ * 	|Data blocks and			|data		Indirect |
+ * 	|overflow indirect blocks		|blocks		blocks	 |
+ * 	|----------> 				|------->	<--------|
+ * 	|________________________________________________________________|
+ *
+ * 	Every block group has at its end a semi-reserved region called the
+ * 	metacluster mostly used for allocating indirect blocks. Under normal
+ * 	circumstances, the metacluster is used only for allocating indirect
+ * 	blocks which are allocated in decreasing order of block numbers.
+ * 	The non-metacluster region is used for data blocks, which are
+ * 	allocated in increasing order of block numbers. However, when the MC
+ * 	runs out of space, indirect blocks can be allocated in the non-MC
+ * 	region along with the data blocks in the forward direction. Similarly,
+ * 	when non-MC runs out of space, new data blocks are allocated in MC but
+ * 	in the forward direction.
  */
 
 
@@ -108,6 +131,88 @@ read_block_bitmap(struct super_block *sb
 error_out:
 	return bh;
 }
+
+
+/*
+ * Count the free blocks of a block group that lie outside its metacluster
+ * region, unless bgi_free_nonmc_blocks_count already holds a count (>= 0).
+ */
+static void
+ext3_init_grp_free_nonmc_blocks(struct super_block *sb,
+				struct buffer_head *bitmap_bh,
+				unsigned long block_group)
+{
+	struct ext3_sb_info *sbi = EXT3_SB(sb);
+	struct ext3_bg_info *bgi = &sbi->s_bginfo[block_group];
+
+	BUG_ON(!test_opt(sb, METACLUSTER));
+
+	spin_lock(sb_bgl_lock(sbi, block_group));
+	if (bgi->bgi_free_nonmc_blocks_count >= 0)
+		goto out;
+	/* Free bits in the first s_nonmc_blocks_per_group/8 bitmap bytes. */
+	bgi->bgi_free_nonmc_blocks_count =
+		ext3_count_free(bitmap_bh, sbi->s_nonmc_blocks_per_group/8);
+
+out:
+	spin_unlock(sb_bgl_lock(sbi, block_group));
+	BUG_ON(bgi->bgi_free_nonmc_blocks_count >
+		sbi->s_nonmc_blocks_per_group);
+}
+
+/*
+ * ext3_update_nonmc_block_count:
+ *	Update bgi_free_nonmc_blocks_count for block group 'group_no' following
+ *	an allocation or deallocation.
+ *	@sbi:		in-memory superblock info holding s_bginfo[]
+ *	@group_no:	affected block group
+ *	@start:		start of the [de]allocated range
+ *	@count:		number of blocks [de]allocated
+ *	@allocation:	1 if blocks were allocated, 0 otherwise.
+ */
+static inline void
+ext3_update_nonmc_block_count(struct ext3_sb_info *sbi, unsigned long group_no,
+				ext3_grpblk_t start, unsigned long count,
+				int allocation)
+{
+	struct ext3_bg_info *bginfo = &sbi->s_bginfo[group_no];
+	ext3_grpblk_t change;
+
+	BUG_ON(bginfo->bgi_free_nonmc_blocks_count < 0);
+	BUG_ON(start >= sbi->s_nonmc_blocks_per_group);
+	/* Only the part of the range outside the metacluster is counted. */
+	change = min_t(ext3_grpblk_t, start + count,
+			sbi->s_nonmc_blocks_per_group) - start;
+
+	spin_lock(sb_bgl_lock(sbi, group_no));
+	BUG_ON(bginfo->bgi_free_nonmc_blocks_count >
+		sbi->s_nonmc_blocks_per_group);
+	BUG_ON(allocation && bginfo->bgi_free_nonmc_blocks_count < change);
+
+	bginfo->bgi_free_nonmc_blocks_count += (allocation ? -change : change);
+
+	BUG_ON(bginfo->bgi_free_nonmc_blocks_count >
+		sbi->s_nonmc_blocks_per_group);
+	spin_unlock(sb_bgl_lock(sbi, group_no));
+}
+
+/*
+ * allow_mc_alloc:
+ * 	Check if block 'blk' of a block group may be used for general
+ * 	allocation. Returns 0 only if 'blk' lies in the metacluster region
+ * 	while bgi_free_nonmc_blocks_count >= 8. Ideally the limit would be 0,
+ * 	but sometimes a small number of blocks don't get allocated in the
+ * 	first pass, and there is no point breaking our file at the
+ * 	metacluster boundary because of that, so we relax the limit to 8.
+ */
+static inline int allow_mc_alloc(struct ext3_sb_info *sbi,
+					struct ext3_bg_info *bgi,
+					ext3_grpblk_t blk)
+{
+	return !(blk >= 0 && blk >= sbi->s_nonmc_blocks_per_group &&
+		bgi->bgi_free_nonmc_blocks_count >= 8);
+}
+
 /*
  * The reservation window structure operations
  * --------------------------------------------
@@ -424,6 +529,7 @@ void ext3_free_blocks_sb(handle_t *handl
 	struct ext3_group_desc * desc;
 	struct ext3_super_block * es;
 	struct ext3_sb_info *sbi;
+	struct ext3_bg_info *bgi;
 	int err = 0, ret;
 	ext3_grpblk_t group_freed;
 
@@ -463,6 +569,13 @@ do_more:
 	if (!desc)
 		goto error_return;
 
+	if (test_opt(sb, METACLUSTER)) {
+		bgi = &sbi->s_bginfo[block_group];
+		if (bgi->bgi_free_nonmc_blocks_count < 0)
+			ext3_init_grp_free_nonmc_blocks(sb, bitmap_bh,
+							block_group);
+	}
+
 	if (in_range (le32_to_cpu(desc->bg_block_bitmap), block, count) ||
 	    in_range (le32_to_cpu(desc->bg_inode_bitmap), block, count) ||
 	    in_range (block, le32_to_cpu(desc->bg_inode_table),
@@ -582,6 +695,9 @@ do_more:
 	if (!err) err = ret;
 	*pdquot_freed_blocks += group_freed;
 
+	if (test_opt(sb, METACLUSTER) && bit < sbi->s_nonmc_blocks_per_group)
+		ext3_update_nonmc_block_count(sbi, block_group, bit, count, 0);
+
 	if (overflow && !err) {
 		block += count;
 		count = overflow;
@@ -687,6 +803,50 @@ bitmap_search_next_usable_block(ext3_grp
 	return -1;
 }
 
+/*
+ * Scan bytes down from the one holding 'start' (not below 'lowest'), skipping
+ * full ('\377') bytes; return the lowest zero bit of the first hit, else -1.
+ */
+static ext3_grpblk_t
+bitmap_find_prev_zero_bit(char *map, ext3_grpblk_t start, ext3_grpblk_t lowest)
+{
+	ext3_grpblk_t k, blk;
+
+	for (k = start & ~7; lowest <= k; k -= 8) {
+		if (map[k/8] != '\377' &&
+		    (blk = ext3_find_next_zero_bit(map, k + 8, k)) < k + 8)
+			return blk;
+	}
+	return -1;
+}
+
+static ext3_grpblk_t
+bitmap_search_prev_usable_block(ext3_grpblk_t start, struct buffer_head *bh,
+					ext3_grpblk_t lowest)
+{
+	ext3_grpblk_t next;
+	struct journal_head *jh = bh2jh(bh);
+
+	/*
+	 * Search backward, alternating between the actual bitmap and the
+	 * last-committed copy, until a bit is free in both; -1 if none.
+	 * NOTE(review): confirm 'start' always makes progress between passes.
+	 */
+	while (start >= lowest) {
+		next = bitmap_find_prev_zero_bit(bh->b_data, start, lowest);
+		if (next < lowest)
+			return -1;
+		if (ext3_test_allocatable(next, bh))
+			return next;
+		jbd_lock_bh_state(bh);
+		if (jh->b_committed_data)
+			start = bitmap_find_prev_zero_bit(jh->b_committed_data,
+								next, lowest);
+		jbd_unlock_bh_state(bh);
+	}
+	return -1;
+}
+
 /**
  * find_next_usable_block()
  * @start:		the starting block (group relative) to find next
@@ -794,19 +954,27 @@ claim_block(spinlock_t *lock, ext3_grpbl
  *	file's own reservation window;
  *	Otherwise, the allocation range starts from the give goal block, ends at
  *	the block group's last block.
- *
- * If we failed to allocate the desired block then we may end up crossing to a
- * new bitmap.  In that case we must release write access to the old one via
- * ext3_journal_release_buffer(), else we'll run out of credits.
  */
 static ext3_grpblk_t
 ext3_try_to_allocate(struct super_block *sb, handle_t *handle, int group,
 			struct buffer_head *bitmap_bh, ext3_grpblk_t grp_goal,
 			unsigned long *count, struct ext3_reserve_window *my_rsv)
 {
+	struct ext3_sb_info *sbi = EXT3_SB(sb);
+	struct ext3_group_desc *gdp;
+	struct ext3_bg_info *bgi = NULL;
+	struct buffer_head *gdp_bh;
 	ext3_fsblk_t group_first_block;
 	ext3_grpblk_t start, end;
 	unsigned long num = 0;
+	const int metaclustering = test_opt(sb, METACLUSTER);
+
+	if (metaclustering)
+		bgi = &sbi->s_bginfo[group];
+
+	gdp = ext3_get_group_desc(sb, group, &gdp_bh);
+	if (!gdp)
+		goto fail_access;
 
 	/* we do allocation within the reservation window if we have a window */
 	if (my_rsv) {
@@ -851,8 +1019,10 @@ repeat:
 	}
 	start = grp_goal;
 
-	if (!claim_block(sb_bgl_lock(EXT3_SB(sb), group),
-		grp_goal, bitmap_bh)) {
+	if (metaclustering && !allow_mc_alloc(sbi, bgi, grp_goal))
+		goto fail_access;
+
+	if (!claim_block(sb_bgl_lock(sbi, group), grp_goal, bitmap_bh)) {
 		/*
 		 * The block was allocated by another thread, or it was
 		 * allocated and then freed by another thread
@@ -867,8 +1037,8 @@ repeat:
 	grp_goal++;
 	while (num < *count && grp_goal < end
 		&& ext3_test_allocatable(grp_goal, bitmap_bh)
-		&& claim_block(sb_bgl_lock(EXT3_SB(sb), group),
-				grp_goal, bitmap_bh)) {
+		&& (!metaclustering || allow_mc_alloc(sbi, bgi, grp_goal))
+		&& claim_block(sb_bgl_lock(sbi, group), grp_goal, bitmap_bh)) {
 		num++;
 		grp_goal++;
 	}
@@ -1099,7 +1269,9 @@ static int alloc_new_reservation(struct 
 
 	/*
 	 * find_next_reservable_window() simply finds a reservable window
-	 * inside the given range(start_block, group_end_block).
+	 * inside the given range(start_block, group_end_block). The
+	 * reservation window must have a reservable free bit inside it for our
+	 * callers to work correctly.
 	 *
 	 * To make sure the reservation window has a free bit inside it, we
 	 * need to check the bitmap after we found a reservable window.
@@ -1131,10 +1303,17 @@ retry:
 			my_rsv->rsv_start - group_first_block,
 			bitmap_bh, group_end_block - group_first_block + 1);
 
-	if (first_free_block < 0) {
+	if (first_free_block < 0 ||
+		(test_opt(sb, METACLUSTER)
+		 && !allow_mc_alloc(EXT3_SB(sb), &EXT3_SB(sb)->s_bginfo[group],
+					first_free_block))) {
 		/*
-		 * no free block left on the bitmap, no point
-		 * to reserve the space. return failed.
+		 * No free block left on the bitmap, no point to reserve space,
+		 * return failed. We also fail here if metaclustering is enabled
+		 * and the first free block in the window lies in the
+		 * metacluster while there are free non-mc blocks in the block
+		 * group, such a window or any window following it is not useful
+		 * to us.
 		 */
 		spin_lock(rsv_lock);
 		if (!rsv_is_empty(&my_rsv->rsv_window))
@@ -1237,25 +1416,17 @@ ext3_try_to_allocate_with_rsv(struct sup
 			unsigned int group, struct buffer_head *bitmap_bh,
 			ext3_grpblk_t grp_goal,
 			struct ext3_reserve_window_node * my_rsv,
-			unsigned long *count, int *errp)
+			unsigned long *count)
 {
+	struct ext3_bg_info *bgi;
 	ext3_fsblk_t group_first_block, group_last_block;
 	ext3_grpblk_t ret = 0;
-	int fatal;
 	unsigned long num = *count;
 
-	*errp = 0;
-
-	/*
-	 * Make sure we use undo access for the bitmap, because it is critical
-	 * that we do the frozen_data COW on bitmap buffers in all cases even
-	 * if the buffer is in BJ_Forget state in the committing transaction.
-	 */
-	BUFFER_TRACE(bitmap_bh, "get undo access for new block");
-	fatal = ext3_journal_get_undo_access(handle, bitmap_bh);
-	if (fatal) {
-		*errp = fatal;
-		return -1;
+	if (test_opt(sb, METACLUSTER)) {
+		bgi = &EXT3_SB(sb)->s_bginfo[group];
+		if (bgi->bgi_free_nonmc_blocks_count < 0)
+			ext3_init_grp_free_nonmc_blocks(sb, bitmap_bh, group);
 	}
 
 	/*
@@ -1331,19 +1502,6 @@ ext3_try_to_allocate_with_rsv(struct sup
 		num = *count;
 	}
 out:
-	if (ret >= 0) {
-		BUFFER_TRACE(bitmap_bh, "journal_dirty_metadata for "
-					"bitmap block");
-		fatal = ext3_journal_dirty_metadata(handle, bitmap_bh);
-		if (fatal) {
-			*errp = fatal;
-			return -1;
-		}
-		return ret;
-	}
-
-	BUFFER_TRACE(bitmap_bh, "journal_release_buffer");
-	ext3_journal_release_buffer(handle, bitmap_bh);
 	return ret;
 }
 
@@ -1389,22 +1547,149 @@ int ext3_should_retry_alloc(struct super
 	return journal_force_commit_nested(EXT3_SB(sb)->s_journal);
 }
 
+/*
+ * ext3_alloc_indirect_blocks:
+ * 	Helper function for ext3_new_blocks. Claims up to 'indirect_blks'
+ * 	metacluster blocks, stores them in new_blocks[], returns the count.
+ */
+int ext3_alloc_indirect_blocks(struct super_block *sb,
+			struct buffer_head *bitmap_bh,
+			struct ext3_group_desc *gdp,
+			int group_no, unsigned long indirect_blks,
+			ext3_fsblk_t new_blocks[])
+{
+	struct ext3_bg_info *bgi = &EXT3_SB(sb)->s_bginfo[group_no];
+	ext3_grpblk_t blk = EXT3_BLOCKS_PER_GROUP(sb) - 1;
+	ext3_grpblk_t mc_start = EXT3_SB(sb)->s_nonmc_blocks_per_group;
+	ext3_fsblk_t group_first_block;
+	int allocated = 0;
+
+	BUG_ON(!test_opt(sb, METACLUSTER));
+
+	/* This check is racy but that wouldn't harm us. */
+	if (bgi->bgi_free_nonmc_blocks_count >=
+		le16_to_cpu(gdp->bg_free_blocks_count))
+		return 0;
+
+	group_first_block = ext3_group_first_block_no(sb, group_no);
+	while (allocated < indirect_blks && blk >= mc_start) {
+		if (!ext3_test_allocatable(blk, bitmap_bh)) {
+			blk = bitmap_search_prev_usable_block(blk, bitmap_bh,
+								mc_start);
+			continue;
+		}
+		if (claim_block(sb_bgl_lock(EXT3_SB(sb), group_no), blk,
+				bitmap_bh)) {
+			new_blocks[allocated++] = group_first_block + blk;
+		} else {
+			/*
+			 * The block was allocated by another thread, or it
+			 * was allocated and then freed by another thread
+			 */
+			cpu_relax();
+		}
+		if (allocated < indirect_blks)
+			blk = bitmap_search_prev_usable_block(blk, bitmap_bh,
+								mc_start);
+	}
+	return allocated;
+}
+
+/*
+ * check_allocated_blocks:
+ * 	Helper function for ext3_new_blocks. Checks newly allocated block
+ * 	numbers; returns 1 if they run past the end of the filesystem, else 0.
+ */
+int check_allocated_blocks(ext3_fsblk_t blk, unsigned long num,
+				struct super_block *sb, int group_no,
+				struct ext3_group_desc *gdp,
+				struct buffer_head *bitmap_bh)
+{
+	struct ext3_super_block *es = EXT3_SB(sb)->s_es;
+	struct ext3_sb_info *sbi = EXT3_SB(sb);
+	ext3_grpblk_t grp_blk = blk - ext3_group_first_block_no(sb, group_no);
+
+	if (in_range(le32_to_cpu(gdp->bg_block_bitmap), blk, num) ||
+		in_range(le32_to_cpu(gdp->bg_inode_bitmap), blk, num) ||
+		in_range(blk, le32_to_cpu(gdp->bg_inode_table),
+				EXT3_SB(sb)->s_itb_per_group) ||
+		in_range(blk + num - 1, le32_to_cpu(gdp->bg_inode_table),
+				EXT3_SB(sb)->s_itb_per_group))
+		ext3_error(sb, "ext3_new_blocks",
+				"Allocating block in system zone - "
+				"blocks from "E3FSBLK", length %lu",
+				blk, num);
+
+#ifdef CONFIG_JBD_DEBUG
+	{
+		struct buffer_head *debug_bh;
+
+		/* Record bitmap buffer state in the newly allocated block */
+		debug_bh = sb_find_get_block(sb, blk);
+		if (debug_bh) {
+			BUFFER_TRACE(debug_bh, "state when allocated");
+			BUFFER_TRACE2(debug_bh, bitmap_bh, "bitmap state");
+			brelse(debug_bh);
+		}
+	}
+	jbd_lock_bh_state(bitmap_bh);
+	spin_lock(sb_bgl_lock(sbi, group_no));
+	if (buffer_jbd(bitmap_bh) && bh2jh(bitmap_bh)->b_committed_data) {
+		int i;
+
+		for (i = 0; i < num; i++) {
+			if (ext3_test_bit(grp_blk+i,
+					bh2jh(bitmap_bh)->b_committed_data))
+				printk(KERN_ERR "%s: block was unexpectedly set"
+					" in b_committed_data\n", __FUNCTION__);
+		}
+	}
+	ext3_debug("found bit %d\n", grp_blk);
+	spin_unlock(sb_bgl_lock(sbi, group_no));
+	jbd_unlock_bh_state(bitmap_bh);
+#endif
+
+	if (blk + num - 1 >= le32_to_cpu(es->s_blocks_count)) {
+		ext3_error(sb, "ext3_new_blocks",
+				"block("E3FSBLK") >= blocks count(%d) - "
+				"block_group = %d, es == %p ", blk,
+				le32_to_cpu(es->s_blocks_count), group_no, es);
+		return 1;
+	}
+
+	return 0;
+}
+
 /**
- * ext3_new_blocks() -- core block(s) allocation function
- * @handle:		handle to this transaction
- * @inode:		file inode
- * @goal:		given target block(filesystem wide)
- * @count:		target number of blocks to allocate
- * @errp:		error code
+ * ext3_new_blocks - allocate indirect blocks and direct blocks.
+ *	@handle:	handle to this transaction
+ *	@inode:		file inode
+ *	@goal:		given target block(filesystem wide)
+ *	@indirect_blks:	number of indirect blocks to allocate
+ *	@blks:		number of direct blocks to allocate
+ *	@new_blocks:	receives the indirect and direct block numbers
+ *	@errp:		error code
  *
- * ext3_new_blocks uses a goal block to assist allocation.  It tries to
- * allocate block(s) from the block group contains the goal block first. If that
- * fails, it will try to allocate block(s) from other block groups without
- * any specific goal block.
+ * 	returns the number of direct blocks allocated. Fewer than requested
+ * 	number of direct blocks may be allocated but all requested indirect
+ * 	blocks must be allocated in order to return success.
  *
+ *	Without metaclustering, ext3_new_blocks allocates all blocks using a
+ *	goal block to assist allocation.  It tries to allocate block(s) from
+ *	the block group containing the goal block first. If that fails, it
+ *	will try to allocate block(s) from other block groups without any
+ *	specific goal block.
+ *
+ *	With metaclustering, the only difference is that indirect block
+ *	allocation is first attempted in the metacluster region of the same
+ *	block group, failing which they are allocated along with direct blocks.
+ *
+ *	This function also updates quota and i_blocks field.
  */
-ext3_fsblk_t ext3_new_blocks(handle_t *handle, struct inode *inode,
-			ext3_fsblk_t goal, unsigned long *count, int *errp)
+int ext3_new_blocks(handle_t *handle, struct inode *inode,
+			ext3_fsblk_t goal, int indirect_blks, int blks,
+			ext3_fsblk_t new_blocks[4], int *errp)
+
 {
 	struct buffer_head *bitmap_bh = NULL;
 	struct buffer_head *gdp_bh;
@@ -1413,10 +1698,16 @@ ext3_fsblk_t ext3_new_blocks(handle_t *h
 	ext3_grpblk_t grp_target_blk;	/* blockgroup relative goal block */
 	ext3_grpblk_t grp_alloc_blk;	/* blockgroup-relative allocated block*/
 	ext3_fsblk_t ret_block;		/* filesyetem-wide allocated block */
+	ext3_fsblk_t group_first_block; /* first block in the group */
 	int bgi;			/* blockgroup iteration index */
 	int fatal = 0, err;
 	int performed_allocation = 0;
 	ext3_grpblk_t free_blocks;	/* number of free blocks in a group */
+	unsigned long ngroups;
+	unsigned long grp_mc_alloc;/* blocks allocated from mc in a group */
+	unsigned long grp_alloc;   /* blocks allocated outside mc in a group */
+	int indirect_blks_done = 0;/* total ind blocks allocated so far */
+	int blks_done = 0;	   /* total direct blocks allocated */
 	struct super_block *sb;
 	struct ext3_group_desc *gdp;
 	struct ext3_super_block *es;
@@ -1424,23 +1715,23 @@ ext3_fsblk_t ext3_new_blocks(handle_t *h
 	struct ext3_reserve_window_node *my_rsv = NULL;
 	struct ext3_block_alloc_info *block_i;
 	unsigned short windowsz = 0;
+	int i;
 #ifdef EXT3FS_DEBUG
 	static int goal_hits, goal_attempts;
 #endif
-	unsigned long ngroups;
-	unsigned long num = *count;
 
 	*errp = -ENOSPC;
 	sb = inode->i_sb;
 	if (!sb) {
-		printk("ext3_new_block: nonexistent device");
+		printk(KERN_INFO "ext3_new_blocks: nonexistent device");
+		*errp = -ENODEV;
 		return 0;
 	}
 
 	/*
 	 * Check quota for allocation of this block.
 	 */
-	if (DQUOT_ALLOC_BLOCK(inode, num)) {
+	if (DQUOT_ALLOC_BLOCK(inode, indirect_blks + blks)) {
 		*errp = -EDQUOT;
 		return 0;
 	}
@@ -1474,73 +1765,194 @@ ext3_fsblk_t ext3_new_blocks(handle_t *h
 	group_no = (goal - le32_to_cpu(es->s_first_data_block)) /
 			EXT3_BLOCKS_PER_GROUP(sb);
 	goal_group = group_no;
-retry_alloc:
-	gdp = ext3_get_group_desc(sb, group_no, &gdp_bh);
-	if (!gdp)
-		goto io_error;
-
-	free_blocks = le16_to_cpu(gdp->bg_free_blocks_count);
-	/*
-	 * if there is not enough free blocks to make a new resevation
-	 * turn off reservation for this allocation
-	 */
-	if (my_rsv && (free_blocks < windowsz)
-		&& (rsv_is_empty(&my_rsv->rsv_window)))
-		my_rsv = NULL;
-
-	if (free_blocks > 0) {
-		grp_target_blk = ((goal - le32_to_cpu(es->s_first_data_block)) %
-				EXT3_BLOCKS_PER_GROUP(sb));
-		bitmap_bh = read_block_bitmap(sb, group_no);
-		if (!bitmap_bh)
-			goto io_error;
-		grp_alloc_blk = ext3_try_to_allocate_with_rsv(sb, handle,
-					group_no, bitmap_bh, grp_target_blk,
-					my_rsv,	&num, &fatal);
-		if (fatal)
-			goto out;
-		if (grp_alloc_blk >= 0)
-			goto allocated;
-	}
 
+retry_alloc:
+	grp_target_blk = ((goal - le32_to_cpu(es->s_first_data_block)) %
+			EXT3_BLOCKS_PER_GROUP(sb));
 	ngroups = EXT3_SB(sb)->s_groups_count;
 	smp_rmb();
 
 	/*
-	 * Now search the rest of the groups.  We assume that
-	 * i and gdp correctly point to the last group visited.
+	 * Iterate over successive block groups for allocating (any) indirect
+	 * blocks and direct blocks until at least one direct block has been
+	 * allocated. If metaclustering is enabled, we try allocating indirect
+	 * blocks first in the metacluster region and then in the general
+	 * region and if that fails too, we repeat the same algorithm in the
+	 * next block group and so on. This not only keeps the indirect blocks
+	 * together in the metacluster, but also keeps them in close proximity
+	 * to their corresponding direct blocks.
+	 *
+	 * The search begins and ends at the goal group, though the second time
+	 * we are at the goal group we try allocating without a goal.
 	 */
-	for (bgi = 0; bgi < ngroups; bgi++) {
-		group_no++;
+	bgi = 0;
+	while (bgi < ngroups + 1) {
+		grp_mc_alloc = 0;
+
 		if (group_no >= ngroups)
 			group_no = 0;
+
 		gdp = ext3_get_group_desc(sb, group_no, &gdp_bh);
 		if (!gdp)
 			goto io_error;
+
 		free_blocks = le16_to_cpu(gdp->bg_free_blocks_count);
-		/*
-		 * skip this group if the number of
-		 * free blocks is less than half of the reservation
-		 * window size.
-		 */
-		if (free_blocks <= (windowsz/2))
-			continue;
+		if (group_no == goal_group) {
+			if (my_rsv && (free_blocks < windowsz)
+				&& (rsv_is_empty(&my_rsv->rsv_window)))
+				my_rsv = NULL;
+			if (free_blocks == 0)
+				goto next;
+		} else if (free_blocks <= windowsz/2)
+			goto next;
 
-		brelse(bitmap_bh);
 		bitmap_bh = read_block_bitmap(sb, group_no);
 		if (!bitmap_bh)
 			goto io_error;
+
 		/*
-		 * try to allocate block(s) from this group, without a goal(-1).
+		 * Make sure we use undo access for the bitmap, because it is
+		 * critical that we do the frozen_data COW on bitmap buffers in
+		 * all cases even if the buffer is in BJ_Forget state in the
+		 * committing transaction.
+		 */
+		BUFFER_TRACE(bitmap_bh, "get undo access for new block");
+		fatal = ext3_journal_get_undo_access(handle, bitmap_bh);
+		if (fatal)
+			goto out;
+
+		/*
+		 * If metaclustering is enabled, first try to allocate indirect
+		 * blocks in the metacluster.
 		 */
+		if (test_opt(sb, METACLUSTER) &&
+			indirect_blks_done < indirect_blks)
+			grp_mc_alloc = ext3_alloc_indirect_blocks(sb,
+					bitmap_bh, gdp, group_no,
+					indirect_blks - indirect_blks_done,
+					new_blocks + indirect_blks_done);
+
+		/* Allocate data blocks and any leftover indirect blocks. */
+		grp_alloc = indirect_blks + blks
+				- (indirect_blks_done + grp_mc_alloc);
 		grp_alloc_blk = ext3_try_to_allocate_with_rsv(sb, handle,
-					group_no, bitmap_bh, -1, my_rsv,
-					&num, &fatal);
+					group_no, bitmap_bh, grp_target_blk,
+					my_rsv, &grp_alloc);
+		if (grp_alloc_blk < 0)
+			grp_alloc = 0;
+
+		/*
+		 * If we couldn't allocate anything, there is nothing more to
+		 * do with this block group, so move over to the next. But
+		 * before that we must release write access to the old one via
+		 * ext3_journal_release_buffer(), else we'll run out of credits.
+		 */
+		if (grp_mc_alloc == 0 && grp_alloc == 0) {
+			BUFFER_TRACE(bitmap_bh, "journal_release_buffer");
+			ext3_journal_release_buffer(handle, bitmap_bh);
+			goto next;
+		}
+
+		BUFFER_TRACE(bitmap_bh, "journal_dirty_metadata for "
+					"bitmap block");
+		fatal = ext3_journal_dirty_metadata(handle, bitmap_bh);
 		if (fatal)
 			goto out;
-		if (grp_alloc_blk >= 0)
+
+		ext3_debug("using block group %d(%d)\n",
+				group_no, gdp->bg_free_blocks_count);
+
+		BUFFER_TRACE(gdp_bh, "get_write_access");
+		fatal = ext3_journal_get_write_access(handle, gdp_bh);
+		if (fatal)
+			goto out;
+
+		/* Should this be called before ext3_journal_dirty_metadata? */
+		for (i = 0; i < grp_mc_alloc; i++) {
+			if (check_allocated_blocks(
+				new_blocks[indirect_blks_done + i], 1, sb,
+				group_no, gdp, bitmap_bh))
+				goto out;
+		}
+		if (grp_alloc > 0) {
+			ret_block = ext3_group_first_block_no(sb, group_no) +
+				grp_alloc_blk;
+			if (check_allocated_blocks(ret_block, grp_alloc, sb,
+						group_no, gdp, bitmap_bh))
+				goto out;
+		}
+
+		indirect_blks_done += grp_mc_alloc;
+		performed_allocation = 1;
+
+		/* The caller will add the new buffer to the journal. */
+		if (grp_alloc > 0)
+			ext3_debug("allocating block %lu. "
+					"Goal hits %d of %d.\n",
+					ret_block, goal_hits, goal_attempts);
+
+		spin_lock(sb_bgl_lock(sbi, group_no));
+		gdp->bg_free_blocks_count =
+			cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count) -
+					(grp_mc_alloc + grp_alloc));
+		spin_unlock(sb_bgl_lock(sbi, group_no));
+		percpu_counter_mod(&sbi->s_freeblocks_counter,
+				 -(grp_mc_alloc + grp_alloc));
+
+		BUFFER_TRACE(gdp_bh, "journal_dirty_metadata for "
+				"group descriptor");
+		err = ext3_journal_dirty_metadata(handle, gdp_bh);
+		if (!fatal)
+			fatal = err;
+
+		sb->s_dirt = 1;
+		if (fatal)
+			goto out;
+
+		brelse(bitmap_bh);
+		bitmap_bh = NULL;
+
+		if (grp_alloc == 0)
+			goto next;
+
+		/* Update block group non-mc block count since we used some. */
+		if (test_opt(sb, METACLUSTER) &&
+			grp_alloc_blk < sbi->s_nonmc_blocks_per_group)
+			ext3_update_nonmc_block_count(sbi, group_no,
+				grp_alloc_blk, grp_alloc, 1);
+
+		/*
+		 * Assign all the non-mc blocks that we allocated from this
+		 * block group.
+		 */
+		group_first_block = ext3_group_first_block_no(sb, group_no);
+		while (grp_alloc > 0 && indirect_blks_done < indirect_blks) {
+			new_blocks[indirect_blks_done++] =
+				group_first_block + grp_alloc_blk;
+			grp_alloc_blk++;
+			grp_alloc--;
+		}
+
+		if (grp_alloc > 0) {
+			blks_done = grp_alloc;
+			new_blocks[indirect_blks_done] =
+				group_first_block + grp_alloc_blk;
 			goto allocated;
+		}
+
+		/*
+		 * If we allocated something but not the minimum required,
+		 * it's OK to retry in this group as it might have more free
+		 * blocks.
+		 */
+		continue;
+
+next:
+		bgi++;
+		group_no++;
+		grp_target_blk = -1;
 	}
+
 	/*
 	 * We may end up a bogus ealier ENOSPC error due to
 	 * filesystem is "full" of reservations, but
@@ -1559,96 +1971,11 @@ retry_alloc:
 	goto out;
 
 allocated:
-
-	ext3_debug("using block group %d(%d)\n",
-			group_no, gdp->bg_free_blocks_count);
-
-	BUFFER_TRACE(gdp_bh, "get_write_access");
-	fatal = ext3_journal_get_write_access(handle, gdp_bh);
-	if (fatal)
-		goto out;
-
-	ret_block = grp_alloc_blk + ext3_group_first_block_no(sb, group_no);
-
-	if (in_range(le32_to_cpu(gdp->bg_block_bitmap), ret_block, num) ||
-	    in_range(le32_to_cpu(gdp->bg_inode_bitmap), ret_block, num) ||
-	    in_range(ret_block, le32_to_cpu(gdp->bg_inode_table),
-		      EXT3_SB(sb)->s_itb_per_group) ||
-	    in_range(ret_block + num - 1, le32_to_cpu(gdp->bg_inode_table),
-		      EXT3_SB(sb)->s_itb_per_group))
-		ext3_error(sb, "ext3_new_block",
-			    "Allocating block in system zone - "
-			    "blocks from "E3FSBLK", length %lu",
-			     ret_block, num);
-
-	performed_allocation = 1;
-
-#ifdef CONFIG_JBD_DEBUG
-	{
-		struct buffer_head *debug_bh;
-
-		/* Record bitmap buffer state in the newly allocated block */
-		debug_bh = sb_find_get_block(sb, ret_block);
-		if (debug_bh) {
-			BUFFER_TRACE(debug_bh, "state when allocated");
-			BUFFER_TRACE2(debug_bh, bitmap_bh, "bitmap state");
-			brelse(debug_bh);
-		}
-	}
-	jbd_lock_bh_state(bitmap_bh);
-	spin_lock(sb_bgl_lock(sbi, group_no));
-	if (buffer_jbd(bitmap_bh) && bh2jh(bitmap_bh)->b_committed_data) {
-		int i;
-
-		for (i = 0; i < num; i++) {
-			if (ext3_test_bit(grp_alloc_blk+i,
-					bh2jh(bitmap_bh)->b_committed_data)) {
-				printk("%s: block was unexpectedly set in "
-					"b_committed_data\n", __FUNCTION__);
-			}
-		}
-	}
-	ext3_debug("found bit %d\n", grp_alloc_blk);
-	spin_unlock(sb_bgl_lock(sbi, group_no));
-	jbd_unlock_bh_state(bitmap_bh);
-#endif
-
-	if (ret_block + num - 1 >= le32_to_cpu(es->s_blocks_count)) {
-		ext3_error(sb, "ext3_new_block",
-			    "block("E3FSBLK") >= blocks count(%d) - "
-			    "block_group = %d, es == %p ", ret_block,
-			le32_to_cpu(es->s_blocks_count), group_no, es);
-		goto out;
-	}
-
-	/*
-	 * It is up to the caller to add the new buffer to a journal
-	 * list of some description.  We don't know in advance whether
-	 * the caller wants to use it as metadata or data.
-	 */
-	ext3_debug("allocating block %lu. Goal hits %d of %d.\n",
-			ret_block, goal_hits, goal_attempts);
-
-	spin_lock(sb_bgl_lock(sbi, group_no));
-	gdp->bg_free_blocks_count =
-			cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count)-num);
-	spin_unlock(sb_bgl_lock(sbi, group_no));
-	percpu_counter_mod(&sbi->s_freeblocks_counter, -num);
-
-	BUFFER_TRACE(gdp_bh, "journal_dirty_metadata for group descriptor");
-	err = ext3_journal_dirty_metadata(handle, gdp_bh);
-	if (!fatal)
-		fatal = err;
-
-	sb->s_dirt = 1;
-	if (fatal)
-		goto out;
-
 	*errp = 0;
-	brelse(bitmap_bh);
-	DQUOT_FREE_BLOCK(inode, *count-num);
-	*count = num;
-	return ret_block;
+	DQUOT_FREE_BLOCK(inode,
+			indirect_blks + blks - indirect_blks_done - blks_done);
+
+	return blks_done;
 
 io_error:
 	*errp = -EIO;
@@ -1661,7 +1988,13 @@ out:
 	 * Undo the block allocation
 	 */
 	if (!performed_allocation)
-		DQUOT_FREE_BLOCK(inode, *count);
+		DQUOT_FREE_BLOCK(inode, indirect_blks + blks);
+	/*
+	 * Free any indirect blocks we allocated already. If the transaction
+	 * has been aborted this is essentially a no-op.
+	 */
+	for (i = 0; i < indirect_blks_done; i++)
+		ext3_free_blocks(handle, inode, new_blocks[i], 1);
 	brelse(bitmap_bh);
 	return 0;
 }
@@ -1669,9 +2002,13 @@ out:
 ext3_fsblk_t ext3_new_block(handle_t *handle, struct inode *inode,
 			ext3_fsblk_t goal, int *errp)
 {
-	unsigned long count = 1;
+	ext3_fsblk_t new_blocks[4];
 
-	return ext3_new_blocks(handle, inode, goal, &count, errp);
+	ext3_new_blocks(handle, inode, goal, 0, 1, new_blocks, errp);
+	if (*errp)
+		return 0;
+
+	return new_blocks[0];
 }
 
 /**
diff -rupdN linux-2.6.23.13-clean/fs/ext3/bitmap.c linux-2.6.23.13-ext3mc/fs/ext3/bitmap.c
--- linux-2.6.23.13-clean/fs/ext3/bitmap.c	2008-01-09 12:18:17.000000000 -0500
+++ linux-2.6.23.13-ext3mc/fs/ext3/bitmap.c	2008-01-12 22:30:19.000000000 -0500
@@ -11,8 +11,6 @@
 #include <linux/jbd.h>
 #include <linux/ext3_fs.h>
 
-#ifdef EXT3FS_DEBUG
-
 static int nibblemap[] = {4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0};
 
 unsigned long ext3_count_free (struct buffer_head * map, unsigned int numchars)
@@ -27,6 +25,3 @@ unsigned long ext3_count_free (struct bu
 			nibblemap[(map->b_data[i] >> 4) & 0xf];
 	return (sum);
 }
-
-#endif  /*  EXT3FS_DEBUG  */
-
diff -rupdN linux-2.6.23.13-clean/fs/ext3/inode.c linux-2.6.23.13-ext3mc/fs/ext3/inode.c
--- linux-2.6.23.13-clean/fs/ext3/inode.c	2008-01-09 12:18:17.000000000 -0500
+++ linux-2.6.23.13-ext3mc/fs/ext3/inode.c	2008-01-14 14:12:13.000000000 -0500
@@ -36,10 +36,33 @@
 #include <linux/mpage.h>
 #include <linux/uio.h>
 #include <linux/bio.h>
+#include <linux/sort.h>
 #include "xattr.h"
 #include "acl.h"
 
+typedef struct {
+	__le32	*p;
+	__le32	key;
+	struct buffer_head *bh;
+} Indirect;
+
+struct ext3_ind_read_info {
+	int                     count;
+	int                     seq_prefetch;
+	long                    size;
+	struct buffer_head      *bh[0];
+};
+
+# define EXT3_IND_READ_INFO_SIZE(_c)        \
+	(sizeof(struct ext3_ind_read_info) + \
+	 sizeof(struct buffer_head *) * (_c))
+
+# define EXT3_IND_READ_MAX     	(32)
+
 static int ext3_writepage_trans_blocks(struct inode *inode);
+static Indirect *ext3_read_indblocks(struct inode *inode, int iblock,
+				     int depth, int offsets[4],
+				     Indirect chain[4], int *err);
 
 /*
  * Test whether an inode is a fast symlink.
@@ -233,12 +256,6 @@ no_delete:
 	clear_inode(inode);	/* We must guarantee clearing of inode... */
 }
 
-typedef struct {
-	__le32	*p;
-	__le32	key;
-	struct buffer_head *bh;
-} Indirect;
-
 static inline void add_chain(Indirect *p, struct buffer_head *bh, __le32 *v)
 {
 	p->key = *(p->p = v);
@@ -352,18 +369,21 @@ static int ext3_block_to_path(struct ino
  *	the whole chain, all way to the data (returns %NULL, *err == 0).
  */
 static Indirect *ext3_get_branch(struct inode *inode, int depth, int *offsets,
-				 Indirect chain[4], int *err)
+				 Indirect chain[4], int ind_readahead, int *err)
 {
 	struct super_block *sb = inode->i_sb;
 	Indirect *p = chain;
 	struct buffer_head *bh;
+	int index;
 
 	*err = 0;
 	/* i_data is not going away, no lock needed */
 	add_chain (chain, NULL, EXT3_I(inode)->i_data + *offsets);
 	if (!p->key)
 		goto no_block;
-	while (--depth) {
+	for (index = 0; index < depth - 1; index++) {
+		if (ind_readahead && depth > 2 && index == depth - 2)
+			break;
 		bh = sb_bread(sb, le32_to_cpu(p->key));
 		if (!bh)
 			goto failure;
@@ -396,7 +416,11 @@ no_block:
  *	It is used when heuristic for sequential allocation fails.
  *	Rules are:
  *	  + if there is a block to the left of our position - allocate near it.
- *	  + if pointer will live in indirect block - allocate near that block.
+ *	  + If METACLUSTER option is not specified, allocate the data
+ *	  block close to the metadata block. Otherwise, if pointer will live in
+ *	  indirect block, we cannot allocate near the indirect block since
+ *	  indirect blocks are allocated in the metacluster, just put in the same
+ *	  cylinder group as the inode.
  *	  + if pointer will live in inode - allocate in the same
  *	    cylinder group.
  *
@@ -421,9 +445,11 @@ static ext3_fsblk_t ext3_find_near(struc
 			return le32_to_cpu(*p);
 	}
 
-	/* No such thing, so let's try location of indirect block */
-	if (ind->bh)
-		return ind->bh->b_blocknr;
+	if (!test_opt(inode->i_sb, METACLUSTER)) {
+		/* No such thing, so let's try location of indirect block */
+		if (ind->bh)
+			return ind->bh->b_blocknr;
+	}
 
 	/*
 	 * It is going to be referred to from the inode itself? OK, just put it
@@ -475,8 +501,7 @@ static ext3_fsblk_t ext3_find_goal(struc
  *	@blks: number of data blocks to be mapped.
  *	@blocks_to_boundary:  the offset in the indirect block
  *
- *	return the total number of blocks to be allocate, including the
- *	direct and indirect blocks.
+ *	return the total number of direct blocks to be allocated.
  */
 static int ext3_blks_to_allocate(Indirect *branch, int k, unsigned long blks,
 		int blocks_to_boundary)
@@ -505,75 +530,18 @@ static int ext3_blks_to_allocate(Indirec
 }
 
 /**
- *	ext3_alloc_blocks: multiple allocate blocks needed for a branch
- *	@indirect_blks: the number of blocks need to allocate for indirect
- *			blocks
- *
- *	@new_blocks: on return it will store the new block numbers for
- *	the indirect blocks(if needed) and the first direct block,
- *	@blks:	on return it will store the total number of allocated
- *		direct blocks
- */
-static int ext3_alloc_blocks(handle_t *handle, struct inode *inode,
-			ext3_fsblk_t goal, int indirect_blks, int blks,
-			ext3_fsblk_t new_blocks[4], int *err)
-{
-	int target, i;
-	unsigned long count = 0;
-	int index = 0;
-	ext3_fsblk_t current_block = 0;
-	int ret = 0;
-
-	/*
-	 * Here we try to allocate the requested multiple blocks at once,
-	 * on a best-effort basis.
-	 * To build a branch, we should allocate blocks for
-	 * the indirect blocks(if not allocated yet), and at least
-	 * the first direct block of this branch.  That's the
-	 * minimum number of blocks need to allocate(required)
-	 */
-	target = blks + indirect_blks;
-
-	while (1) {
-		count = target;
-		/* allocating blocks for indirect blocks and direct blocks */
-		current_block = ext3_new_blocks(handle,inode,goal,&count,err);
-		if (*err)
-			goto failed_out;
-
-		target -= count;
-		/* allocate blocks for indirect blocks */
-		while (index < indirect_blks && count) {
-			new_blocks[index++] = current_block++;
-			count--;
-		}
-
-		if (count > 0)
-			break;
-	}
-
-	/* save the new block number for the first direct block */
-	new_blocks[index] = current_block;
-
-	/* total number of blocks allocated for direct blocks */
-	ret = count;
-	*err = 0;
-	return ret;
-failed_out:
-	for (i = 0; i <index; i++)
-		ext3_free_blocks(handle, inode, new_blocks[i], 1);
-	return ret;
-}
-
-/**
  *	ext3_alloc_branch - allocate and set up a chain of blocks.
  *	@inode: owner
  *	@indirect_blks: number of allocated indirect blocks
  *	@blks: number of allocated direct blocks
+ *	@goal: goal for allocation
  *	@offsets: offsets (in the blocks) to store the pointers to next.
  *	@branch: place to store the chain in.
  *
- *	This function allocates blocks, zeroes out all but the last one,
+ *	returns error and number of direct blocks allocated via *blks
+ *
+ *	This function allocates indirect_blks + *blks, zeroes out all
+ *	indirect blocks,
  *	links them into chain and (if we are synchronous) writes them to disk.
  *	In other words, it prepares a branch that can be spliced onto the
  *	inode. It stores the information about that chain in the branch[], in
@@ -602,7 +570,7 @@ static int ext3_alloc_branch(handle_t *h
 	ext3_fsblk_t new_blocks[4];
 	ext3_fsblk_t current_block;
 
-	num = ext3_alloc_blocks(handle, inode, goal, indirect_blks,
+	num = ext3_new_blocks(handle, inode, goal, indirect_blks,
 				*blks, new_blocks, &err);
 	if (err)
 		return err;
@@ -799,17 +767,21 @@ int ext3_get_blocks_handle(handle_t *han
 	int blocks_to_boundary = 0;
 	int depth;
 	struct ext3_inode_info *ei = EXT3_I(inode);
-	int count = 0;
+	int count = 0, ind_readahead;
 	ext3_fsblk_t first_block = 0;
 
-
 	J_ASSERT(handle != NULL || create == 0);
 	depth = ext3_block_to_path(inode,iblock,offsets,&blocks_to_boundary);
 
 	if (depth == 0)
 		goto out;
 
-	partial = ext3_get_branch(inode, depth, offsets, chain, &err);
+	ind_readahead = !create && depth > 2;
+	partial = ext3_get_branch(inode, depth, offsets, chain,
+				  ind_readahead, &err);
+	if (!partial && ind_readahead)
+		partial = ext3_read_indblocks(inode, iblock, depth,
+					      offsets, chain, &err);
 
 	/* Simplest case - block found, no allocation needed */
 	if (!partial) {
@@ -844,7 +816,7 @@ int ext3_get_blocks_handle(handle_t *han
 	}
 
 	/* Next simple case - plain lookup or failed read of indirect block */
-	if (!create || err == -EIO)
+	if (!create || (err && err != -EAGAIN))
 		goto cleanup;
 
 	mutex_lock(&ei->truncate_mutex);
@@ -866,7 +838,8 @@ int ext3_get_blocks_handle(handle_t *han
 			brelse(partial->bh);
 			partial--;
 		}
-		partial = ext3_get_branch(inode, depth, offsets, chain, &err);
+		partial = ext3_get_branch(inode, depth, offsets, chain, 0,
+					&err);
 		if (!partial) {
 			count++;
 			mutex_unlock(&ei->truncate_mutex);
@@ -1907,7 +1880,7 @@ static Indirect *ext3_find_shared(struct
 	/* Make k index the deepest non-null offest + 1 */
 	for (k = depth; k > 1 && !offsets[k-1]; k--)
 		;
-	partial = ext3_get_branch(inode, k, offsets, chain, &err);
+	partial = ext3_get_branch(inode, k, offsets, chain, 0, &err);
 	/* Writer: pointers */
 	if (!partial)
 		partial = chain + k-1;
@@ -3230,3 +3203,561 @@ int ext3_change_inode_journal_flag(struc
 
 	return err;
 }
+
+/*
+ * ext3_ind_read_end_bio --
+ *
+ * 	bio callback for read IO issued from ext3_read_indblocks.
+ * 	May be called multiple times until the whole I/O completes at
+ * 	which point bio->bi_size = 0 and it frees read_info and bio.
+ * 	The first time it is called, first_bh is unlocked so that any sync
+ * 	waiter can unblock.
+ */
+static int ext3_ind_read_end_bio(struct bio *bio, unsigned int bytes_done,
+				 int err)
+{
+	struct ext3_ind_read_info *read_info = bio->bi_private;
+	struct buffer_head *bh;
+	int uptodate = !err && test_bit(BIO_UPTODATE, &bio->bi_flags);
+	int i;
+
+	if (err == -EOPNOTSUPP)
+		set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
+
+	/* Wait for all buffers to finish - is this needed? */
+	if (bio->bi_size)
+		return 1;
+
+	for (i = 0; i < read_info->count; i++) {
+		bh = read_info->bh[i];
+		if (err == -EOPNOTSUPP)
+			set_bit(BH_Eopnotsupp, &bh->b_state);
+
+		if (uptodate) {
+			BUG_ON(buffer_uptodate(bh));
+			BUG_ON(ext3_buffer_prefetch(bh));
+			set_buffer_uptodate(bh);
+			if (read_info->seq_prefetch)
+				ext3_set_buffer_prefetch(bh);
+		}
+
+		unlock_buffer(bh);
+		brelse(bh);
+	}
+
+	kfree(read_info);
+	bio_put(bio);
+	return 0;
+}
+
+/*
+ * ext3_get_max_read --
+ * 	@inode: inode of file.
+ * 	@block: block number in file (starting from zero).
+ * 	@offset_in_dind_block: offset of the indirect block inside its
+ * 	parent doubly-indirect block.
+ *
+ *      Compute the maximum no. of indirect blocks that can be read
+ *      satisfying following constraints:
+ *              - Don't read indirect blocks beyond the end of current
+ *              doubly-indirect block.
+ *              - Don't read beyond eof.
+ */
+static inline unsigned long ext3_get_max_read(const struct inode *inode,
+						  int block,
+						  int offset_in_dind_block)
+{
+	const struct super_block *sb = inode->i_sb;
+	unsigned long max_read;
+	unsigned long ptrs = EXT3_ADDR_PER_BLOCK(inode->i_sb);
+	unsigned long ptrs_bits = EXT3_ADDR_PER_BLOCK_BITS(inode->i_sb);
+	unsigned long blocks_in_file =
+		(inode->i_size + sb->s_blocksize - 1) >> sb->s_blocksize_bits;
+	unsigned long remaining_ind_blks_in_dind =
+		(ptrs >= offset_in_dind_block) ? (ptrs - offset_in_dind_block)
+					       : 0;
+	unsigned long remaining_ind_blks_before_eof =
+		((blocks_in_file - EXT3_NDIR_BLOCKS + ptrs - 1) >> ptrs_bits) -
+		((block - EXT3_NDIR_BLOCKS) >> ptrs_bits);
+
+	BUG_ON(block >= blocks_in_file);
+
+	max_read = min_t(unsigned long, remaining_ind_blks_in_dind,
+			 remaining_ind_blks_before_eof);
+
+	BUG_ON(max_read < 1);
+
+	return max_read;
+}
+
+static void ext3_read_indblocks_submit(struct bio **pbio,
+					struct ext3_ind_read_info **pread_info,
+					int *read_cnt, int seq_prefetch)
+{
+	struct bio *bio = *pbio;
+	struct ext3_ind_read_info *read_info = *pread_info;
+
+	BUG_ON(*read_cnt < 1);
+
+	read_info->seq_prefetch = seq_prefetch;
+	read_info->count = *read_cnt;
+	read_info->size = bio->bi_size;
+	bio->bi_private = read_info;
+	bio->bi_end_io = ext3_ind_read_end_bio;
+	submit_bio(READ, bio);
+
+	*pbio = NULL;
+	*pread_info = NULL;
+	*read_cnt = 0;
+}
+
+struct ind_block_info {
+	ext3_fsblk_t		blockno;
+	struct buffer_head	*bh;
+};
+
+static int ind_info_cmp(const void *a, const void *b)
+{
+	struct ind_block_info *info_a = (struct ind_block_info *)a;
+	struct ind_block_info *info_b = (struct ind_block_info *)b;
+
+	return info_a->blockno - info_b->blockno;
+}
+
+static void ind_info_swap(void *a, void *b, int size)
+{
+	struct ind_block_info *info_a = (struct ind_block_info *)a;
+	struct ind_block_info *info_b = (struct ind_block_info *)b;
+	struct ind_block_info tmp;
+
+	tmp = *info_a;
+	*info_a = *info_b;
+	*info_b = tmp;
+}
+
+/*
+ * ext3_read_indblocks_async --
+ *      @sb:            super block
+ *      @ind_blocks[]:  array of indirect block numbers on disk
+ *      @count:         maximum number of indirect blocks to read
+ *      @first_bh:      buffer_head for indirect block ind_blocks[0], may be
+ *                      NULL
+ *      @seq_prefetch:  if this is part of a sequential prefetch and buffers'
+ *                      prefetch bit must be set.
+ *      @blocks_done:   number of blocks considered for prefetching.
+ *
+ *      Issue a single bio request to read up to count buffers identified in
+ *      ind_blocks[]. Fewer than count buffers may be read in some cases:
+ *      - If a buffer is found to be uptodate and its prefetch bit is set, we
+ *      don't look at any more buffers as they will most likely be in the cache.
+ *      - We skip buffers we cannot lock without blocking (except for first_bh
+ *      if specified).
+ *      - We skip buffers beyond a certain range on disk.
+ *
+ *      This function must issue read on first_bh if specified unless of course
+ *      it's already uptodate.
+ */
+static int ext3_read_indblocks_async(struct super_block *sb,
+				     const __le32 ind_blocks[], int count,
+				     struct buffer_head *first_bh,
+				     int seq_prefetch,
+				     unsigned long *blocks_done)
+{
+	struct buffer_head *bh;
+	struct bio *bio = NULL;
+	struct ext3_ind_read_info *read_info = NULL;
+	int read_cnt = 0, blk;
+	ext3_fsblk_t prev_blk = 0, io_start_blk = 0, curr;
+	struct ind_block_info *ind_info = NULL;
+	int err = 0, ind_info_count = 0;
+
+	BUG_ON(count < 1);
+	/* Don't move this to ext3_get_max_read() since callers often need to
+	 * trim the count returned by that function. So this bound must only
+	 * be imposed at the last moment. */
+	count = min_t(unsigned long, count, EXT3_IND_READ_MAX);
+	*blocks_done = 0UL;
+
+	if (count == 1 && first_bh) {
+		lock_buffer(first_bh);
+		get_bh(first_bh);
+		first_bh->b_end_io = end_buffer_read_sync;
+		submit_bh(READ, first_bh);
+		*blocks_done = 1UL;
+		return 0;
+	}
+
+	ind_info = kmalloc(count * sizeof(*ind_info), GFP_KERNEL);
+	if (unlikely(!ind_info))
+		return -ENOMEM;
+
+	/*
+	 * First pass: sort block numbers for all indirect blocks that we'll
+	 * read. This allows us to scan blocks in sequential order during the
+	 * second pass which helps coalesce requests to contiguous blocks.
+	 * Since we sort block numbers here instead of assuming any specific
+	 * layout on the disk, we have some protection against different
+	 * indirect block layout strategies as long as they keep all indirect
+	 * blocks close by.
+	 */
+	for (blk = 0; blk < count; blk++) {
+		curr = le32_to_cpu(ind_blocks[blk]);
+		if (!curr)
+			continue;
+
+		/*
+		 * Skip this block if it lies too far from blocks we have
+		 * already decided to read. "Too far" should typically indicate
+		 * lying on a different track on the disk. EXT3_IND_READ_MAX
+		 * seems reasonable for most disks.
+		 */
+		if (io_start_blk > 0 &&
+			(max(io_start_blk, curr) - min(io_start_blk, curr) >=
+				EXT3_IND_READ_MAX))
+			continue;
+
+		if (blk == 0 && first_bh) {
+			bh = first_bh;
+			get_bh(first_bh);
+		} else {
+			bh = sb_getblk(sb, curr);
+			if (unlikely(!bh)) {
+				err = -ENOMEM;
+				goto failure;
+			}
+		}
+
+		if (buffer_uptodate(bh)) {
+			if (ext3_buffer_prefetch(bh)) {
+				brelse(bh);
+				break;
+			}
+			brelse(bh);
+			continue;
+		}
+
+		if (io_start_blk == 0)
+			io_start_blk = curr;
+
+		ind_info[ind_info_count].blockno = curr;
+		ind_info[ind_info_count].bh = bh;
+		ind_info_count++;
+	}
+	*blocks_done = blk;
+
+	sort(ind_info, ind_info_count, sizeof(*ind_info),
+		ind_info_cmp, ind_info_swap);
+
+	/* Second pass: compose bio requests and issue them. */
+	for (blk = 0; blk < ind_info_count; blk++) {
+		bh = ind_info[blk].bh;
+		curr = ind_info[blk].blockno;
+
+		if (prev_blk > 0 && curr != prev_blk + 1) {
+			ext3_read_indblocks_submit(&bio, &read_info,
+						&read_cnt, seq_prefetch);
+			prev_blk = 0;
+		}
+
+		/* Lock the buffer without blocking, skipping any buffers
+		 * which would require us to block. first_bh when specified is
+		 * an exception as caller typically wants it to be read for
+		 * sure (e.g., ext3_read_indblocks_sync).
+		 */
+		if (bh == first_bh) {
+			lock_buffer(bh);
+		} else if (test_set_buffer_locked(bh)) {
+			brelse(bh);
+			continue;
+		}
+
+		/* Check again with the buffer locked. */
+		if (buffer_uptodate(bh)) {
+			if (ext3_buffer_prefetch(bh)) {
+				unlock_buffer(bh);
+				brelse(bh);
+				break;
+			}
+			unlock_buffer(bh);
+			brelse(bh);
+			continue;
+		}
+
+		if (read_cnt == 0) {
+			/* read_info freed in ext3_ind_read_end_bio(). */
+			read_info = kmalloc(EXT3_IND_READ_INFO_SIZE(count),
+					    GFP_KERNEL);
+			if (unlikely(!read_info)) {
+				err = -ENOMEM;
+				goto failure;
+			}
+
+			bio = bio_alloc(GFP_KERNEL, count);
+			if (unlikely(!bio)) {
+				err = -ENOMEM;
+				goto failure;
+			}
+			bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9);
+			bio->bi_bdev = bh->b_bdev;
+		}
+
+		if (bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh))
+				< bh->b_size) {
+			brelse(bh);
+			if (read_cnt == 0)
+				goto failure;
+
+			break;
+		}
+
+		read_info->bh[read_cnt++] = bh;
+		prev_blk = curr;
+	}
+
+	if (read_cnt == 0)
+		goto done;
+
+	ext3_read_indblocks_submit(&bio, &read_info, &read_cnt, seq_prefetch);
+
+	kfree(ind_info);
+	return 0;
+
+failure:
+	while (--read_cnt >= 0) {
+		unlock_buffer(read_info->bh[read_cnt]);
+		brelse(read_info->bh[read_cnt]);
+	}
+	*blocks_done = 0UL;
+
+done:
+	kfree(read_info);
+
+	if (bio)
+		bio_put(bio);
+
+	kfree(ind_info);
+	return err;
+}
+
+/*
+ * ext3_read_indblocks_sync --
+ *      @sb:            super block
+ *      @ind_blocks[]:  array of indirect block numbers on disk
+ *      @count:         maximum number of indirect blocks to read
+ *      @first_bh:      buffer_head for indirect block ind_blocks[0], must be
+ *                      non-NULL.
+ *      @seq_prefetch:  set prefetch bit of buffers, used when this is part of
+ *                      a sequential prefetch.
+ *      @blocks_done:   number of blocks considered for prefetching.
+ *
+ *      Synchronously read at most count indirect blocks listed in
+ *      ind_blocks[]. This function calls ext3_read_indblocks_async() to do all
+ *      the hard work. It waits for read to complete on first_bh before
+ *      returning.
+ */
+
+static int ext3_read_indblocks_sync(struct super_block *sb,
+				    const __le32 ind_blocks[], int count,
+				    struct buffer_head *first_bh,
+				    int seq_prefetch,
+				    unsigned long *blocks_done)
+{
+	int err;
+
+	BUG_ON(count < 1);
+	BUG_ON(!first_bh);
+
+	err = ext3_read_indblocks_async(sb, ind_blocks, count, first_bh,
+					seq_prefetch, blocks_done);
+	if (err)
+		return err;
+
+	wait_on_buffer(first_bh);
+	if (!buffer_uptodate(first_bh))
+		err = -EIO;
+
+	/* if seq_prefetch != 0, ext3_read_indblocks_async() sets prefetch bit
+	 * for all buffers, but the first buffer for sync IO is never a prefetch
+	 * buffer since it's needed presently so mark it so.
+	 */
+	if (seq_prefetch)
+		ext3_clear_buffer_prefetch(first_bh);
+
+	BUG_ON(ext3_buffer_prefetch(first_bh));
+
+	return err;
+}
+
+/*
+ * ext3_read_indblocks --
+ *
+ * 	@inode: inode of file
+ * 	@iblock: block number inside file (starting from 0).
+ * 	@depth: depth of path from inode to data block.
+ * 	@offsets: array of offsets within blocks identified in 'chain'.
+ * 	@chain: array of Indirect with info about all levels of blocks until
+ * 	the data block.
+ * 	@err: error pointer.
+ *
+ * 	This function is called after reading all metablocks leading to 'iblock'
+ * 	except the (singly) indirect block. It reads the indirect block if not
+ * 	already in the cache and may also prefetch next few indirect blocks.
+ * 	It uses a combination of synchronous and asynchronous requests to
+ * 	accomplish this. We do prefetching even for random reads by reading
+ * 	ahead one indirect block since reads of size >=512KB have at least 12%
+ * 	chance of spanning two indirect blocks.
+ */
+
+static Indirect *ext3_read_indblocks(struct inode *inode, int iblock,
+				     int depth, int offsets[4],
+				     Indirect chain[4], int *err)
+{
+	struct super_block *sb = inode->i_sb;
+	struct buffer_head *first_bh, *prev_bh;
+	unsigned long max_read, blocks_done = 0;
+	__le32 *ind_blocks;
+
+	/* Must have doubly indirect block for prefetching indirect blocks. */
+	BUG_ON(depth <= 2);
+	BUG_ON(!chain[depth-2].key);
+
+	*err = 0;
+
+	/* Handle first block */
+	ind_blocks = chain[depth-2].p;
+	first_bh = sb_getblk(sb, le32_to_cpu(ind_blocks[0]));
+	if (unlikely(!first_bh)) {
+		printk(KERN_ERR "Failed to get block %u for sb %p\n",
+		       le32_to_cpu(ind_blocks[0]), sb);
+		goto failure;
+	}
+
+	BUG_ON(first_bh->b_size != sb->s_blocksize);
+
+	if (buffer_uptodate(first_bh)) {
+		/* Found the buffer in cache, either it was accessed recently or
+		 * it was prefetched while reading previous indirect block(s).
+		 * We need to figure out if we need to prefetch the following
+		 * indirect blocks.
+		 */
+		if (!ext3_buffer_prefetch(first_bh)) {
+			/* Either we've seen this indirect block before while
+			 * accessing another data block, or this is a random
+			 * read. In the former case, we must have done the
+			 * needful the first time we had a cache hit on this
+			 * indirect block, in the latter case we obviously
+			 * don't need to do any prefetching.
+			 */
+			goto done;
+		}
+
+		max_read = ext3_get_max_read(inode, iblock,
+					     offsets[depth-2]);
+
+		/* This indirect block is in the cache due to prefetching and
+		 * this is its first cache hit, clear the prefetch bit and
+		 * make sure the following blocks are also prefetched.
+		 */
+		ext3_clear_buffer_prefetch(first_bh);
+
+		if (max_read >= 2) {
+			/* ext3_read_indblocks_async() stops at the first
+			 * indirect block which has the prefetch bit set which
+			 * will most likely be the very next indirect block.
+			 */
+			ext3_read_indblocks_async(sb, &ind_blocks[1],
+						  max_read - 1,
+						  NULL, 1, &blocks_done);
+		}
+
+	} else {
+		/* Buffer is not in memory, we need to read it. If we are
+		 * reading sequentially from the previous indirect block, we
+		 * have just detected a sequential read and we must prefetch
+		 * some indirect blocks for future.
+		 */
+
+		max_read = ext3_get_max_read(inode, iblock,
+					     offsets[depth-2]);
+
+		if ((ind_blocks - (__le32 *)chain[depth-2].bh->b_data) >= 1) {
+			prev_bh = sb_getblk(sb, le32_to_cpu(ind_blocks[-1]));
+			if (buffer_uptodate(prev_bh) &&
+			    !ext3_buffer_prefetch(prev_bh)) {
+				/* Detected sequential read. */
+				brelse(prev_bh);
+
+				/* Sync read indirect block, also read the next
+				 * few indirect blocks.
+				 */
+				*err = ext3_read_indblocks_sync(sb, ind_blocks,
+							 max_read, first_bh, 1,
+							 &blocks_done);
+
+				if (*err)
+					goto out;
+
+				/* In case the very next indirect block is
+				 * discontiguous by a non-trivial amount,
+				 * ext3_read_indblocks_sync() above won't
+				 * prefetch it (indicated by blocks_done < 2).
+				 * So to help sequential read, schedule an
+				 * async request for reading the next
+				 * contiguous indirect block range (which
+				 * in metaclustering case would be the next
+				 * metacluster, without metaclustering it
+				 * would be the next indirect block). This is
+				 * expected to benefit the non-metaclustering
+				 * case.
+				 */
+				if (max_read >= 2 && blocks_done < 2)
+					ext3_read_indblocks_async(sb,
+							&ind_blocks[1],
+							max_read - 1,
+							NULL, 1, &blocks_done);
+
+				goto done;
+			}
+			brelse(prev_bh);
+		}
+
+		/* Either random read, or sequential detection failed above.
+		 * We always prefetch the next indirect block in this case
+		 * whenever possible.
+		 * This is because for random reads of size ~512KB, there is
+		 * >12% chance that a read will span two indirect blocks.
+		 */
+		*err = ext3_read_indblocks_sync(sb, ind_blocks,
+						(max_read >= 2) ? 2 : 1,
+						first_bh, 0, &blocks_done);
+		if (*err)
+			goto out;
+	}
+
+done:
+	/* Reader: pointers */
+	if (!verify_chain(chain, &chain[depth - 2])) {
+		brelse(first_bh);
+		goto changed;
+	}
+	add_chain(&chain[depth - 1], first_bh,
+		  (__le32 *)first_bh->b_data + offsets[depth - 1]);
+	/* Reader: end */
+	if (!chain[depth - 1].key)
+		goto out;
+
+	BUG_ON(!buffer_uptodate(first_bh));
+	return NULL;
+
+changed:
+	*err = -EAGAIN;
+	goto out;
+failure:
+	*err = -EIO;
+out:
+	if (*err) {
+		ext3_debug("Error %d reading indirect blocks\n", *err);
+		return &chain[depth - 2];
+	} else
+		return &chain[depth - 1];
+}
diff -rupdN linux-2.6.23.13-clean/fs/ext3/super.c linux-2.6.23.13-ext3mc/fs/ext3/super.c
--- linux-2.6.23.13-clean/fs/ext3/super.c	2008-01-09 12:18:17.000000000 -0500
+++ linux-2.6.23.13-ext3mc/fs/ext3/super.c	2008-01-12 22:30:19.000000000 -0500
@@ -556,6 +556,9 @@ static int ext3_show_options(struct seq_
 	else if (test_opt(sb, DATA_FLAGS) == EXT3_MOUNT_WRITEBACK_DATA)
 		seq_puts(seq, ",data=writeback");
 
+	if (test_opt(sb, METACLUSTER))
+		seq_puts(seq, ",metacluster");
+
 	ext3_show_quota_options(seq, sb);
 
 	return 0;
@@ -684,7 +687,7 @@ enum {
 	Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
 	Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota,
 	Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota,
-	Opt_grpquota
+	Opt_grpquota, Opt_metacluster
 };
 
 static match_table_t tokens = {
@@ -734,6 +737,7 @@ static match_table_t tokens = {
 	{Opt_quota, "quota"},
 	{Opt_usrquota, "usrquota"},
 	{Opt_barrier, "barrier=%u"},
+	{Opt_metacluster, "metacluster"},
 	{Opt_err, NULL},
 	{Opt_resize, "resize"},
 };
@@ -1066,6 +1070,9 @@ clear_qf_name:
 		case Opt_bh:
 			clear_opt(sbi->s_mount_opt, NOBH);
 			break;
+		case Opt_metacluster:
+			set_opt(sbi->s_mount_opt, METACLUSTER);
+			break;
 		default:
 			printk (KERN_ERR
 				"EXT3-fs: Unrecognized mount option \"%s\" "
@@ -1594,6 +1601,13 @@ static int ext3_fill_super (struct super
 	}
 	sbi->s_frags_per_block = 1;
 	sbi->s_blocks_per_group = le32_to_cpu(es->s_blocks_per_group);
+	if (test_opt(sb, METACLUSTER)) {
+		sbi->s_nonmc_blocks_per_group = sbi->s_blocks_per_group -
+			sbi->s_blocks_per_group / 12;
+		sbi->s_nonmc_blocks_per_group &= ~7;
+	} else
+		sbi->s_nonmc_blocks_per_group = sbi->s_blocks_per_group;
+
 	sbi->s_frags_per_group = le32_to_cpu(es->s_frags_per_group);
 	sbi->s_inodes_per_group = le32_to_cpu(es->s_inodes_per_group);
 	if (EXT3_INODE_SIZE(sb) == 0)
@@ -1695,6 +1709,18 @@ static int ext3_fill_super (struct super
 	sbi->s_rsv_window_head.rsv_goal_size = 0;
 	ext3_rsv_window_add(sb, &sbi->s_rsv_window_head);
 
+	if (test_opt(sb, METACLUSTER)) {
+		sbi->s_bginfo = kmalloc(sbi->s_groups_count *
+					sizeof(*sbi->s_bginfo), GFP_KERNEL);
+		if (!sbi->s_bginfo) {
+			printk(KERN_ERR "EXT3-fs: not enough memory\n");
+			goto failed_mount3;
+		}
+		for (i = 0; i < sbi->s_groups_count; i++)
+			sbi->s_bginfo[i].bgi_free_nonmc_blocks_count = -1;
+	} else
+		sbi->s_bginfo = NULL;
+
 	/*
 	 * set up enough so that it can read an inode
 	 */
@@ -1720,16 +1746,16 @@ static int ext3_fill_super (struct super
 	if (!test_opt(sb, NOLOAD) &&
 	    EXT3_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_HAS_JOURNAL)) {
 		if (ext3_load_journal(sb, es, journal_devnum))
-			goto failed_mount3;
+			goto failed_mount4;
 	} else if (journal_inum) {
 		if (ext3_create_journal(sb, es, journal_inum))
-			goto failed_mount3;
+			goto failed_mount4;
 	} else {
 		if (!silent)
 			printk (KERN_ERR
 				"ext3: No journal on filesystem on %s\n",
 				sb->s_id);
-		goto failed_mount3;
+		goto failed_mount4;
 	}
 
 	/* We have now updated the journal if required, so we can
@@ -1752,7 +1778,7 @@ static int ext3_fill_super (struct super
 		    (sbi->s_journal, 0, 0, JFS_FEATURE_INCOMPAT_REVOKE)) {
 			printk(KERN_ERR "EXT3-fs: Journal does not support "
 			       "requested data journaling mode\n");
-			goto failed_mount4;
+			goto failed_mount5;
 		}
 	default:
 		break;
@@ -1775,13 +1801,13 @@ static int ext3_fill_super (struct super
 	if (!sb->s_root) {
 		printk(KERN_ERR "EXT3-fs: get root inode failed\n");
 		iput(root);
-		goto failed_mount4;
+		goto failed_mount5;
 	}
 	if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) {
 		dput(sb->s_root);
 		sb->s_root = NULL;
 		printk(KERN_ERR "EXT3-fs: corrupt root inode, run e2fsck\n");
-		goto failed_mount4;
+		goto failed_mount5;
 	}
 
 	ext3_setup_super (sb, es, sb->s_flags & MS_RDONLY);
@@ -1813,8 +1839,10 @@ cantfind_ext3:
 		       sb->s_id);
 	goto failed_mount;
 
-failed_mount4:
+failed_mount5:
 	journal_destroy(sbi->s_journal);
+failed_mount4:
+	kfree(sbi->s_bginfo);
 failed_mount3:
 	percpu_counter_destroy(&sbi->s_freeblocks_counter);
 	percpu_counter_destroy(&sbi->s_freeinodes_counter);
diff -rupdN linux-2.6.23.13-clean/include/linux/ext3_fs.h linux-2.6.23.13-ext3mc/include/linux/ext3_fs.h
--- linux-2.6.23.13-clean/include/linux/ext3_fs.h	2008-01-09 12:18:17.000000000 -0500
+++ linux-2.6.23.13-ext3mc/include/linux/ext3_fs.h	2008-01-12 22:30:19.000000000 -0500
@@ -384,6 +384,7 @@ struct ext3_inode {
 #define EXT3_MOUNT_QUOTA		0x80000 /* Some quota option set */
 #define EXT3_MOUNT_USRQUOTA		0x100000 /* "old" user quota */
 #define EXT3_MOUNT_GRPQUOTA		0x200000 /* "old" group quota */
+#define EXT3_MOUNT_METACLUSTER		0x400000 /* Indirect block clustering */
 
 /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */
 #ifndef _LINUX_EXT2_FS_H
@@ -497,6 +498,7 @@ struct ext3_super_block {
 #ifdef __KERNEL__
 #include <linux/ext3_fs_i.h>
 #include <linux/ext3_fs_sb.h>
+#include <linux/buffer_head.h>
 static inline struct ext3_sb_info * EXT3_SB(struct super_block *sb)
 {
 	return sb->s_fs_info;
@@ -732,6 +734,11 @@ struct dir_private_info {
 	__u32		next_hash;
 };
 
+/* Special bh flag used by the metacluster readahead logic. */
+enum ext3_bh_state_bits {
+	EXT3_BH_PREFETCH = BH_JBD_Sentinel,
+};
+
 /* calculate the first block number of the group */
 static inline ext3_fsblk_t
 ext3_group_first_block_no(struct super_block *sb, unsigned long group_no)
@@ -740,6 +747,24 @@ ext3_group_first_block_no(struct super_b
 		le32_to_cpu(EXT3_SB(sb)->s_es->s_first_data_block);
 }
 
+static inline void
+ext3_set_buffer_prefetch(struct buffer_head *bh)
+{
+	set_bit(EXT3_BH_PREFETCH, &bh->b_state);
+}
+
+static inline void
+ext3_clear_buffer_prefetch(struct buffer_head *bh)
+{
+	clear_bit(EXT3_BH_PREFETCH, &bh->b_state);
+}
+
+static inline int
+ext3_buffer_prefetch(struct buffer_head *bh)
+{
+	return test_bit(EXT3_BH_PREFETCH, &bh->b_state);
+}
+
 /*
  * Special error return code only used by dx_probe() and its callers.
  */
@@ -762,8 +787,9 @@ extern int ext3_bg_has_super(struct supe
 extern unsigned long ext3_bg_num_gdb(struct super_block *sb, int group);
 extern ext3_fsblk_t ext3_new_block (handle_t *handle, struct inode *inode,
 			ext3_fsblk_t goal, int *errp);
-extern ext3_fsblk_t ext3_new_blocks (handle_t *handle, struct inode *inode,
-			ext3_fsblk_t goal, unsigned long *count, int *errp);
+extern int ext3_new_blocks(handle_t *handle, struct inode *inode,
+			ext3_fsblk_t goal, int indirect_blks, int blks,
+			ext3_fsblk_t new_blocks[], int *errp);
 extern void ext3_free_blocks (handle_t *handle, struct inode *inode,
 			ext3_fsblk_t block, unsigned long count);
 extern void ext3_free_blocks_sb (handle_t *handle, struct super_block *sb,
diff -rupdN linux-2.6.23.13-clean/include/linux/ext3_fs_sb.h linux-2.6.23.13-ext3mc/include/linux/ext3_fs_sb.h
--- linux-2.6.23.13-clean/include/linux/ext3_fs_sb.h	2008-01-09 12:18:17.000000000 -0500
+++ linux-2.6.23.13-ext3mc/include/linux/ext3_fs_sb.h	2008-01-12 22:30:19.000000000 -0500
@@ -24,6 +24,8 @@
 #endif
 #include <linux/rbtree.h>
 
+struct ext3_bg_info;
+
 /*
  * third extended-fs super-block data in memory
  */
@@ -33,6 +35,7 @@ struct ext3_sb_info {
 	unsigned long s_inodes_per_block;/* Number of inodes per block */
 	unsigned long s_frags_per_group;/* Number of fragments in a group */
 	unsigned long s_blocks_per_group;/* Number of blocks in a group */
+	unsigned long s_nonmc_blocks_per_group;/* Number of non-metacluster blocks in a group */
 	unsigned long s_inodes_per_group;/* Number of inodes in a group */
 	unsigned long s_itb_per_group;	/* Number of inode table blocks per group */
 	unsigned long s_gdb_count;	/* Number of group descriptor blocks */
@@ -66,6 +69,9 @@ struct ext3_sb_info {
 	struct rb_root s_rsv_window_root;
 	struct ext3_reserve_window_node s_rsv_window_head;
 
+	/* array of per-bg in-memory info */
+	struct ext3_bg_info *s_bginfo;
+
 	/* Journaling */
 	struct inode * s_journal_inode;
 	struct journal_s * s_journal;
@@ -82,4 +88,11 @@ struct ext3_sb_info {
 #endif
 };
 
+/*
+ * in-memory data associated with each block group.
+ */
+struct ext3_bg_info {
+	int bgi_free_nonmc_blocks_count;/* Number of free non-metacluster blocks in group */
+};
+
 #endif	/* _LINUX_EXT3_FS_SB */
diff -rupdN linux-2.6.23.13-clean/include/linux/jbd.h linux-2.6.23.13-ext3mc/include/linux/jbd.h
--- linux-2.6.23.13-clean/include/linux/jbd.h	2008-01-09 12:18:17.000000000 -0500
+++ linux-2.6.23.13-ext3mc/include/linux/jbd.h	2008-01-12 22:30:19.000000000 -0500
@@ -307,6 +307,7 @@ enum jbd_state_bits {
 	BH_State,		/* Pins most journal_head state */
 	BH_JournalHead,		/* Pins bh->b_private and jh->b_bh */
 	BH_Unshadow,		/* Dummy bit, for BJ_Shadow wakeup filtering */
+	BH_JBD_Sentinel,	/* Start bit for clients of jbd */
 };
 
 BUFFER_FNS(JBD, jbd)
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/