linux-ext4 - Re: ext3 metaclustering performance numbers and updated patch

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [day] [month] [year] [list]
Message-ID: <d9885f0f0711151205v1d891fa6k45740c14a7d6cbbe@mail.gmail.com>
Date:	Thu, 15 Nov 2007 12:05:58 -0800
From:	"Abhishek Rai" <abhishekrai@...gle.com>
To:	"Abhishek Rai" <abhishekrai@...gle.com>, linux-ext4@...r.kernel.org
Subject: Re: ext3 metaclustering performance numbers and updated patch

Please ignore previous patch, it has an issue with operator precedence
in ext3_get_grp_metacluster(), try this instead:

Signed-off-by: Abhishek Rai <abhishekrai@...gle.com>

diff -uprdN linux-2.6.23mm1-clean/fs/ext3/balloc.c
linux-2.6.23mm1-ext3mc/fs/ext3/balloc.c
--- linux-2.6.23mm1-clean/fs/ext3/balloc.c	2007-10-17 18:31:42.000000000 -0700
+++ linux-2.6.23mm1-ext3mc/fs/ext3/balloc.c	2007-11-15 11:23:51.000000000 -0800
@@ -711,6 +711,7 @@ bitmap_search_next_usable_block(ext3_grp
 	ext3_grpblk_t next;
 	struct journal_head *jh = bh2jh(bh);

+	BUG_ON(start > maxblocks);
 	while (start < maxblocks) {
 		next = ext3_find_next_zero_bit(bh->b_data, maxblocks, start);
 		if (next >= maxblocks)
@@ -841,10 +842,12 @@ claim_block(spinlock_t *lock, ext3_grpbl
 static ext3_grpblk_t
 ext3_try_to_allocate(struct super_block *sb, handle_t *handle, int group,
 			struct buffer_head *bitmap_bh, ext3_grpblk_t grp_goal,
-			unsigned long *count, struct ext3_reserve_window *my_rsv)
+			int use_metacluster, unsigned long *count,
+			struct ext3_reserve_window *my_rsv)
 {
 	ext3_fsblk_t group_first_block;
 	ext3_grpblk_t start, end;
+	ext3_grpblk_t mc_start, mc_end, start2 = -1, end2 = -1;
 	unsigned long num = 0;

 	/* we do allocation within the reservation window if we have a window */
@@ -872,12 +875,48 @@ ext3_try_to_allocate(struct super_block
 	}

 	BUG_ON(start > EXT3_BLOCKS_PER_GROUP(sb));
+	/* start must have been set to grp_goal if one still exists. */
+	BUG_ON(grp_goal >= 0 && start != grp_goal);
+
+	if (test_opt(sb, METACLUSTER) && !use_metacluster) {
+		ext3_get_grp_metacluster(sb, &mc_start, &mc_end);
+
+		/*
+	 	 * If there is an overlap with metacluster range, adjust our
+		 * range to remove overlap, splitting our range into two if
+		 * needed.
+	 	 */
+		if (mc_end > mc_start) {
+			if (mc_start <= start)
+				start = max_t(ext3_grpblk_t, start, mc_end);
+			else if (mc_end >= end)
+				end = min_t(ext3_grpblk_t, end, mc_start);
+			else {
+				start2 = mc_end;
+				end2 = end;
+				end = mc_start;
+			}
+		}
+	}
+
+	if (start >= end)
+		goto fail_access;
+
+	if (grp_goal > 0)
+		grp_goal = start;

 repeat:
 	if (grp_goal < 0 || !ext3_test_allocatable(grp_goal, bitmap_bh)) {
 		grp_goal = find_next_usable_block(start, bitmap_bh, end);
-		if (grp_goal < 0)
+		if (grp_goal < 0) {
+			if (start2 >= 0) {
+				start = start2;
+				end = end2;
+				start2 = -1;
+				goto repeat;
+			}
 			goto fail_access;
+		}
 		if (!my_rsv) {
 			int i;

@@ -898,8 +937,15 @@ repeat:
 		 */
 		start++;
 		grp_goal++;
-		if (start >= end)
-			goto fail_access;
+		if (start >= end) {
+			if (start2 < 0)
+				goto fail_access;
+
+			start = start2;
+			end = end2;
+			start2 = -1;
+			grp_goal = -1;
+		}
 		goto repeat;
 	}
 	num++;
@@ -1084,6 +1130,7 @@ static int alloc_new_reservation(struct
 	unsigned long size;
 	int ret;
 	spinlock_t *rsv_lock = &EXT3_SB(sb)->s_rsv_window_lock;
+	ext3_grpblk_t mc_start, mc_end;

 	group_first_block = ext3_group_first_block_no(sb, group);
 	group_end_block = group_first_block + (EXT3_BLOCKS_PER_GROUP(sb) - 1);
@@ -1143,6 +1190,7 @@ static int alloc_new_reservation(struct
 	 * To make sure the reservation window has a free bit inside it, we
 	 * need to check the bitmap after we found a reservable window.
 	 */
+	ext3_get_grp_metacluster(sb, &mc_start, &mc_end);
 retry:
 	ret = find_next_reservable_window(search_head, my_rsv, sb,
 						start_block, group_end_block);
@@ -1170,6 +1218,11 @@ retry:
 			my_rsv->rsv_start - group_first_block,
 			bitmap_bh, group_end_block - group_first_block + 1);

+	if (first_free_block >= mc_start && first_free_block < mc_end) {
+		start_block = mc_end;
+		goto next;
+	}
+
 	if (first_free_block < 0) {
 		/*
 		 * no free block left on the bitmap, no point
@@ -1195,6 +1248,7 @@ retry:
 	 * start from where the free block is,
 	 * we also shift the list head to where we stopped last time
 	 */
+next:
 	search_head = my_rsv;
 	spin_lock(rsv_lock);
 	goto retry;
@@ -1223,12 +1277,18 @@ static void try_to_extend_reservation(st
 	struct ext3_reserve_window_node *next_rsv;
 	struct rb_node *next;
 	spinlock_t *rsv_lock = &EXT3_SB(sb)->s_rsv_window_lock;
+	ext3_grpblk_t mc_start, mc_end;

 	if (!spin_trylock(rsv_lock))
 		return;

 	next = rb_next(&my_rsv->rsv_node);

+	ext3_get_grp_metacluster(sb, &mc_start, &mc_end);
+
+	if (my_rsv->rsv_end >= mc_start && my_rsv->rsv_end < mc_end)
+		size += mc_end - 1 - my_rsv->rsv_end;
+
 	if (!next)
 		my_rsv->rsv_end += size;
 	else {
@@ -1274,7 +1334,7 @@ static void try_to_extend_reservation(st
 static ext3_grpblk_t
 ext3_try_to_allocate_with_rsv(struct super_block *sb, handle_t *handle,
 			unsigned int group, struct buffer_head *bitmap_bh,
-			ext3_grpblk_t grp_goal,
+			ext3_grpblk_t grp_goal, int use_metacluster,
 			struct ext3_reserve_window_node * my_rsv,
 			unsigned long *count, int *errp)
 {
@@ -1305,7 +1365,8 @@ ext3_try_to_allocate_with_rsv(struct sup
 	 */
 	if (my_rsv == NULL ) {
 		ret = ext3_try_to_allocate(sb, handle, group, bitmap_bh,
-						grp_goal, count, NULL);
+						grp_goal, use_metacluster,
+						count, NULL);
 		goto out;
 	}
 	/*
@@ -1361,7 +1422,8 @@ ext3_try_to_allocate_with_rsv(struct sup
 			BUG();
 		}
 		ret = ext3_try_to_allocate(sb, handle, group, bitmap_bh,
-					   grp_goal, &num, &my_rsv->rsv_window);
+						grp_goal, use_metacluster,
+						&num, &my_rsv->rsv_window);
 		if (ret >= 0) {
 			my_rsv->rsv_alloc_hit += num;
 			*count = num;
@@ -1455,6 +1517,7 @@ ext3_fsblk_t ext3_new_blocks(handle_t *h
 	int bgi;			/* blockgroup iteration index */
 	int fatal = 0, err;
 	int performed_allocation = 0;
+	int use_metacluster = 0;
 	ext3_grpblk_t free_blocks;	/* number of free blocks in a group */
 	struct super_block *sb;
 	struct ext3_group_desc *gdp;
@@ -1473,6 +1536,7 @@ ext3_fsblk_t ext3_new_blocks(handle_t *h
 	sb = inode->i_sb;
 	if (!sb) {
 		printk("ext3_new_block: nonexistent device");
+		*errp = -ENODEV;
 		return 0;
 	}

@@ -1487,6 +1551,11 @@ ext3_fsblk_t ext3_new_blocks(handle_t *h
 	sbi = EXT3_SB(sb);
 	es = EXT3_SB(sb)->s_es;
 	ext3_debug("goal=%lu.\n", goal);
+
+	/* Caller should ensure this. */
+	BUG_ON(goal < le32_to_cpu(es->s_first_data_block) ||
+	       goal >= le32_to_cpu(es->s_blocks_count));
+
 	/*
 	 * Allocate a block from reservation only when
 	 * filesystem is mounted with reservation(default,-o reservation), and
@@ -1507,9 +1576,6 @@ ext3_fsblk_t ext3_new_blocks(handle_t *h
 	/*
 	 * First, test whether the goal block is free.
 	 */
-	if (goal < le32_to_cpu(es->s_first_data_block) ||
-	    goal >= le32_to_cpu(es->s_blocks_count))
-		goal = le32_to_cpu(es->s_first_data_block);
 	group_no = (goal - le32_to_cpu(es->s_first_data_block)) /
 			EXT3_BLOCKS_PER_GROUP(sb);
 	goal_group = group_no;
@@ -1535,7 +1601,7 @@ retry_alloc:
 			goto io_error;
 		grp_alloc_blk = ext3_try_to_allocate_with_rsv(sb, handle,
 					group_no, bitmap_bh, grp_target_blk,
-					my_rsv,	&num, &fatal);
+					use_metacluster, my_rsv, &num, &fatal);
 		if (fatal)
 			goto out;
 		if (grp_alloc_blk >= 0)
@@ -1573,8 +1639,8 @@ retry_alloc:
 		 * try to allocate block(s) from this group, without a goal(-1).
 		 */
 		grp_alloc_blk = ext3_try_to_allocate_with_rsv(sb, handle,
-					group_no, bitmap_bh, -1, my_rsv,
-					&num, &fatal);
+					group_no, bitmap_bh, -1,
+					use_metacluster, my_rsv, &num, &fatal);
 		if (fatal)
 			goto out;
 		if (grp_alloc_blk >= 0)
@@ -1593,6 +1659,10 @@ retry_alloc:
 		group_no = goal_group;
 		goto retry_alloc;
 	}
+	if (test_opt(sb, METACLUSTER) && use_metacluster == 0) {
+		use_metacluster = 1;
+		goto retry_alloc;
+	}
 	/* No space left on the device */
 	*errp = -ENOSPC;
 	goto out;
@@ -1713,6 +1783,161 @@ ext3_fsblk_t ext3_new_block(handle_t *ha
 	return ext3_new_blocks(handle, inode, goal, &count, errp);
 }

+/*
+ * ext3_new_indirect_blocks() -- allocate indirect blocks for inode.
+ * @inode:		file inode
+ * @count:		target number of indirect blocks to allocate
+ * @new_blocks[]:       used for returning block numbers allocated
+ *
+ * return: 0 on success, appropriate error code otherwise. Upon return, *count
+ * contains the number of blocks successfully allocated which is non-zero only
+ * in the success case.
+ *
+ * Allocate maximum of *count indirect blocks from the indirect block metadata
+ * area of inode's group and store the block numbers in new_blocksp[]. Since
+ * the allocation is in a predetermined region of the block group, caller just
+ * needs to pass a group number here which is where the goal and/or the
+ * reservation window may fall.
+ */
+int ext3_new_indirect_blocks(handle_t *handle, struct inode *inode,
+			unsigned long group_no, unsigned long *count,
+			ext3_fsblk_t new_blocks[])
+{
+	struct super_block *sb;
+	struct ext3_sb_info *sbi;
+	struct buffer_head *bitmap_bh = NULL;
+	struct buffer_head *gdp_bh;
+	struct ext3_group_desc *gdp;
+	ext3_grpblk_t group_first_block;      /* first block in the group */
+	ext3_grpblk_t free_blocks;	/* number of free blocks in the group */
+	ext3_grpblk_t mc_start, mc_end;
+	int blk, done = 0;
+	int err = 0;
+
+	BUG_ON(*count > 3);
+
+	sb = inode->i_sb;
+	if (!sb) {
+		printk(KERN_INFO "ext3_new_indirect_blocks: "
+			"nonexistent device");
+		return -ENODEV;
+	}
+	BUG_ON(!test_opt(sb, METACLUSTER));
+	sbi = EXT3_SB(sb);
+
+	if (DQUOT_ALLOC_BLOCK(inode, *count))
+		return -EDQUOT;
+
+	if (!ext3_has_free_blocks(sbi)) {
+		err = -ENOSPC;
+		goto out;
+	}
+
+	gdp = ext3_get_group_desc(sb, group_no, &gdp_bh);
+	if (!gdp) {
+		err = -EIO;
+		goto out;
+	}
+
+	free_blocks = le16_to_cpu(gdp->bg_free_blocks_count);
+	if (free_blocks == 0) {
+		err = -ENOSPC;
+		goto out;
+	}
+
+	bitmap_bh = read_block_bitmap(sb, group_no);
+	if (!bitmap_bh) {
+		err = -EIO;
+		goto out;
+	}
+
+	/*
+	 * Make sure we use undo access for the bitmap, because it is critical
+	 * that we do the frozen_data COW on bitmap buffers in all cases even
+	 * if the buffer is in BJ_Forget state in the committing transaction.
+	 */
+	BUFFER_TRACE(bitmap_bh, "get undo access for new indirect block");
+	err = ext3_journal_get_undo_access(handle, bitmap_bh);
+	if (err)
+		goto out;
+
+	err = -ENOSPC;
+	group_first_block = ext3_group_first_block_no(sb, group_no);
+	ext3_get_grp_metacluster(sb, &mc_start, &mc_end);
+	blk = mc_start;
+
+	while (done < *count && blk < mc_end) {
+		if (!ext3_test_allocatable(blk, bitmap_bh)) {
+			/*
+			 * Don't use find_next_usable_block() here as it may
+			 * skip free blocks that are not close to the goal.
+			 * Since our goal is always fixed (mc_start), we may
+			 * be trying to allocate slightly far from it and that
+			 * will be a problem.
+			 */
+			blk = bitmap_search_next_usable_block(blk, bitmap_bh,
+								mc_end);
+			continue;
+		}
+		if (claim_block(sb_bgl_lock(sbi, group_no), blk,
+				bitmap_bh)) {
+			new_blocks[done++] = group_first_block + blk;
+		} else {
+			/*
+		 	 * The block was allocated by another thread, or it
+			 * was allocated and then freed by another thread
+		 	 */
+			cpu_relax();
+		}
+		blk++;
+	}
+
+	if (!done) {
+		BUFFER_TRACE(bitmap_bh, "journal_release_buffer");
+		ext3_journal_release_buffer(handle, bitmap_bh);
+		goto out;
+	}
+
+	BUFFER_TRACE(bitmap_bh, "journal_dirty_metadata for bitmap block");
+	err = ext3_journal_dirty_metadata(handle, bitmap_bh);
+	if (err)
+		goto out;
+
+	BUFFER_TRACE(gdp_bh, "get_write_access");
+	err = ext3_journal_get_write_access(handle, gdp_bh);
+	if (err)
+		goto out;
+
+	/*
+	 * Caller is responsible for adding the new indirect block buffers
+	 * to the journal list.
+	 */
+
+	spin_lock(sb_bgl_lock(sbi, group_no));
+	gdp->bg_free_blocks_count =
+		cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count) - done);
+	spin_unlock(sb_bgl_lock(sbi, group_no));
+	percpu_counter_sub(&sbi->s_freeblocks_counter, done);
+
+	BUFFER_TRACE(gdp_bh, "journal_dirty_metadata for group descriptor");
+	err = ext3_journal_dirty_metadata(handle, gdp_bh);
+	sb->s_dirt = 1;
+	if (err)
+		goto out;
+
+out:
+	if (bitmap_bh)
+		brelse(bitmap_bh);
+
+	DQUOT_FREE_BLOCK(inode, *count - done);
+	*count = done;
+
+	if (err && err != -ENOSPC)
+		ext3_error(sb, "ext3_new_indirect_blocks", "error %d", err);
+
+	return err;
+}
+
 /**
  * ext3_count_free_blocks() -- count filesystem free blocks
  * @sb:		superblock
diff -uprdN linux-2.6.23mm1-clean/fs/ext3/inode.c
linux-2.6.23mm1-ext3mc/fs/ext3/inode.c
--- linux-2.6.23mm1-clean/fs/ext3/inode.c	2007-10-17 18:31:42.000000000 -0700
+++ linux-2.6.23mm1-ext3mc/fs/ext3/inode.c	2007-11-15 11:21:07.000000000 -0800
@@ -39,7 +39,29 @@
 #include "xattr.h"
 #include "acl.h"

+typedef struct {
+	__le32	*p;
+	__le32	key;
+	struct buffer_head *bh;
+} Indirect;
+
+struct ext3_ind_read_info {
+	int                     count;
+	int                     seq_prefetch;
+	long                    size;
+	struct buffer_head      *bh[0];
+};
+
+# define EXT3_IND_READ_INFO_SIZE(_c)        \
+	(sizeof(struct ext3_ind_read_info) + \
+	 sizeof(struct buffer_head *) * (_c))
+
+# define EXT3_IND_READ_MAX     	(32)
+
 static int ext3_writepage_trans_blocks(struct inode *inode);
+static Indirect *ext3_read_indblocks(struct inode *inode, int iblock,
+					int depth, int offsets[4],
+					Indirect chain[4], int *err);

 /*
  * Test whether an inode is a fast symlink.
@@ -233,12 +255,6 @@ no_delete:
 	clear_inode(inode);	/* We must guarantee clearing of inode... */
 }

-typedef struct {
-	__le32	*p;
-	__le32	key;
-	struct buffer_head *bh;
-} Indirect;
-
 static inline void add_chain(Indirect *p, struct buffer_head *bh, __le32 *v)
 {
 	p->key = *(p->p = v);
@@ -352,18 +368,21 @@ static int ext3_block_to_path(struct ino
  *	the whole chain, all way to the data (returns %NULL, *err == 0).
  */
 static Indirect *ext3_get_branch(struct inode *inode, int depth, int *offsets,
-				 Indirect chain[4], int *err)
+				 Indirect chain[4], int ind_readahead, int *err)
 {
 	struct super_block *sb = inode->i_sb;
 	Indirect *p = chain;
 	struct buffer_head *bh;
+	int index;

 	*err = 0;
 	/* i_data is not going away, no lock needed */
 	add_chain (chain, NULL, EXT3_I(inode)->i_data + *offsets);
 	if (!p->key)
 		goto no_block;
-	while (--depth) {
+	for (index = 0; index < depth - 1; index++) {
+		if (ind_readahead && depth > 2 && index == depth - 2)
+			break;
 		bh = sb_bread(sb, le32_to_cpu(p->key));
 		if (!bh)
 			goto failure;
@@ -396,7 +415,15 @@ no_block:
  *	It is used when heuristic for sequential allocation fails.
  *	Rules are:
  *	  + if there is a block to the left of our position - allocate near it.
- *	  + if pointer will live in indirect block - allocate near that block.
+ *	  + If METACLUSTER options is not specified, allocate the data
+ *	  block close to the metadata block. Otherwise, if pointer will live in
+ *	  indirect block, we cannot allocate near the indirect block since
+ *	  indirect blocks are allocated in a reserved area. Even if we allocate
+ *	  this block right after the preceding logical file block, we'll still
+ *	  have to incur extra seek due to the indirect block (unless we
+ *	  prefetch the indirect block separately). So for now (until
+ *	  prefetching is turned on), it's OK not to return a sequential goal -
+ *	  just put in the same cylinder group as the inode.
  *	  + if pointer will live in inode - allocate in the same
  *	    cylinder group.
  *
@@ -421,9 +448,11 @@ static ext3_fsblk_t ext3_find_near(struc
 			return le32_to_cpu(*p);
 	}

-	/* No such thing, so let's try location of indirect block */
-	if (ind->bh)
-		return ind->bh->b_blocknr;
+	if (!test_opt(inode->i_sb, METACLUSTER)) {
+		/* No such thing, so let's try location of indirect block */
+		if (ind->bh)
+			return ind->bh->b_blocknr;
+	}

 	/*
 	 * It is going to be referred to from the inode itself? OK, just put it
@@ -475,8 +504,7 @@ static ext3_fsblk_t ext3_find_goal(struc
  *	@blks: number of data blocks to be mapped.
  *	@blocks_to_boundary:  the offset in the indirect block
  *
- *	return the total number of blocks to be allocate, including the
- *	direct and indirect blocks.
+ *	return the total number of direct blocks to be allocated.
  */
 static int ext3_blks_to_allocate(Indirect *branch, int k, unsigned long blks,
 		int blocks_to_boundary)
@@ -508,22 +536,39 @@ static int ext3_blks_to_allocate(Indirec
  *	ext3_alloc_blocks: multiple allocate blocks needed for a branch
  *	@indirect_blks: the number of blocks need to allocate for indirect
  *			blocks
- *
+ *	@blks: the number of direct blocks to be allocated
  *	@new_blocks: on return it will store the new block numbers for
  *	the indirect blocks(if needed) and the first direct block,
- *	@blks:	on return it will store the total number of allocated
- *		direct blocks
+ *
+ *	returns the number of direct blocks allocated, error via *err, and
+ *	new block numbers via new_blocks[]
  */
 static int ext3_alloc_blocks(handle_t *handle, struct inode *inode,
 			ext3_fsblk_t goal, int indirect_blks, int blks,
 			ext3_fsblk_t new_blocks[4], int *err)
 {
+	struct super_block *sb;
+	struct ext3_super_block *es;
 	int target, i;
-	unsigned long count = 0;
+	unsigned long count = 0, goal_group;
 	int index = 0;
 	ext3_fsblk_t current_block = 0;
 	int ret = 0;

+	BUG_ON(blks <= 0);
+
+	sb = inode->i_sb;
+	if (!sb) {
+		printk(KERN_INFO "ext3_alloc_blocks: nonexistent device");
+		*err = -ENODEV;
+		return 0;
+	}
+	es = EXT3_SB(sb)->s_es;
+
+	if (goal < le32_to_cpu(es->s_first_data_block) ||
+	    goal >= le32_to_cpu(es->s_blocks_count))
+		goal = le32_to_cpu(es->s_first_data_block);
+
 	/*
 	 * Here we try to allocate the requested multiple blocks at once,
 	 * on a best-effort basis.
@@ -534,6 +579,41 @@ static int ext3_alloc_blocks(handle_t *h
 	 */
 	target = blks + indirect_blks;

+	/*
+	 * Try to allocate indirect blocks in the metacluster region of block
+	 * group in which goal falls. This should not only give us clustered
+	 * metablock allocation, but also allocate new metablocks close to the
+	 * corresponding data blocks (by putting them in the same block group).
+	 * Note that allocation of indirect blocks is only guided by goal and
+	 * not by reservation window since the goal mostly falls within the
+	 * reservation window for sequential allocation.
+	 * If the indirect blocks could not be allocated in this block group,
+	 * we fall back to sequential allocation of indirect block alongside
+	 * the data block instead of trying other block groups as that can
+	 * separate indirect and data blocks too far out.
+	 */
+	if (test_opt(sb, METACLUSTER) && indirect_blks) {
+		count = indirect_blks;
+		goal_group = (goal - le32_to_cpu(es->s_first_data_block)) /
+				EXT3_BLOCKS_PER_GROUP(sb);
+		*err = ext3_new_indirect_blocks(handle, inode, goal_group,
+						&count, new_blocks + index);
+		if (*err && *err != -ENOSPC) {
+			printk(KERN_ERR "ext3_alloc_blocks failed to allocate "
+				"indirect blocks: %d", *err);
+			goto failed_out;
+		} else if (*err == 0) {
+			BUG_ON(count == 0);
+		}
+		*err = 0;
+
+		if (count > 0) {
+			index += count;
+			target -= count;
+			BUG_ON(index > indirect_blks);
+		}
+	}
+
 	while (1) {
 		count = target;
 		/* allocating blocks for indirect blocks and direct blocks */
@@ -542,7 +622,7 @@ static int ext3_alloc_blocks(handle_t *h
 			goto failed_out;

 		target -= count;
-		/* allocate blocks for indirect blocks */
+		/* store indirect block numbers we just allocated */
 		while (index < indirect_blks && count) {
 			new_blocks[index++] = current_block++;
 			count--;
@@ -570,10 +650,14 @@ failed_out:
  *	@inode: owner
  *	@indirect_blks: number of allocated indirect blocks
  *	@blks: number of allocated direct blocks
+ *	@goal: goal for allocation
  *	@offsets: offsets (in the blocks) to store the pointers to next.
  *	@branch: place to store the chain in.
  *
- *	This function allocates blocks, zeroes out all but the last one,
+ *	returns error and number of direct blocks allocated via *blks
+ *
+ *	This function allocates indirect_blks + *blks, zeroes out all
+ *	indirect blocks,
  *	links them into chain and (if we are synchronous) writes them to disk.
  *	In other words, it prepares a branch that can be spliced onto the
  *	inode. It stores the information about that chain in the branch[], in
@@ -799,17 +883,24 @@ int ext3_get_blocks_handle(handle_t *han
 	int blocks_to_boundary = 0;
 	int depth;
 	struct ext3_inode_info *ei = EXT3_I(inode);
-	int count = 0;
+	int count = 0, ind_readahead;
 	ext3_fsblk_t first_block = 0;

-
+	BUG_ON(!create &&
+		iblock >= (inode->i_size + inode->i_sb->s_blocksize - 1) >>
+					inode->i_sb->s_blocksize_bits);
 	J_ASSERT(handle != NULL || create == 0);
 	depth = ext3_block_to_path(inode,iblock,offsets,&blocks_to_boundary);

 	if (depth == 0)
 		goto out;

-	partial = ext3_get_branch(inode, depth, offsets, chain, &err);
+	ind_readahead = !create && depth > 2;
+	partial = ext3_get_branch(inode, depth, offsets, chain,
+				  ind_readahead, &err);
+	if (!partial && ind_readahead)
+		partial = ext3_read_indblocks(inode, iblock, depth,
+					      offsets, chain, &err);

 	/* Simplest case - block found, no allocation needed */
 	if (!partial) {
@@ -844,7 +935,7 @@ int ext3_get_blocks_handle(handle_t *han
 	}

 	/* Next simple case - plain lookup or failed read of indirect block */
-	if (!create || err == -EIO)
+	if (!create || (err && err != -EAGAIN))
 		goto cleanup;

 	mutex_lock(&ei->truncate_mutex);
@@ -866,7 +957,8 @@ int ext3_get_blocks_handle(handle_t *han
 			brelse(partial->bh);
 			partial--;
 		}
-		partial = ext3_get_branch(inode, depth, offsets, chain, &err);
+		partial = ext3_get_branch(inode, depth, offsets, chain, 0,
+					&err);
 		if (!partial) {
 			count++;
 			mutex_unlock(&ei->truncate_mutex);
@@ -1974,7 +2066,7 @@ static Indirect *ext3_find_shared(struct
 	/* Make k index the deepest non-null offest + 1 */
 	for (k = depth; k > 1 && !offsets[k-1]; k--)
 		;
-	partial = ext3_get_branch(inode, k, offsets, chain, &err);
+	partial = ext3_get_branch(inode, k, offsets, chain, 0, &err);
 	/* Writer: pointers */
 	if (!partial)
 		partial = chain + k-1;
@@ -3297,3 +3389,508 @@ int ext3_change_inode_journal_flag(struc

 	return err;
 }
+
+/*
+ * ext3_ind_read_end_bio --
+ *
+ * 	bio callback for read IO issued from ext3_read_indblocks.
+ * 	Will be called only once, when all I/O has completed.
+ * 	Frees read_info and bio.
+ */
+static void ext3_ind_read_end_bio(struct bio *bio, int err)
+{
+	struct ext3_ind_read_info *read_info = bio->bi_private;
+	struct buffer_head *bh;
+	int uptodate = !err && test_bit(BIO_UPTODATE, &bio->bi_flags);
+	int i;
+
+	BUG_ON(read_info->count <= 0);
+
+	if (err == -EOPNOTSUPP)
+		set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
+
+	for (i = 0; i < read_info->count; i++) {
+		bh = read_info->bh[i];
+		BUG_ON(bh == NULL);
+
+		if (err == -EOPNOTSUPP)
+			set_bit(BH_Eopnotsupp, &bh->b_state);
+
+		if (uptodate) {
+			BUG_ON(buffer_uptodate(bh));
+			BUG_ON(ext3_buffer_prefetch(bh));
+			set_buffer_uptodate(bh);
+			if (read_info->seq_prefetch)
+				ext3_set_buffer_prefetch(bh);
+		}
+
+		unlock_buffer(bh);
+		brelse(bh);
+	}
+
+	kfree(read_info);
+	bio_put(bio);
+}
+
+/*
+ * ext3_get_max_read --
+ * 	@inode: inode of file.
+ * 	@block: block number in file (starting from zero).
+ * 	@offset_in_dind_block: offset of the indirect block inside it's
+ * 	parent doubly-indirect block.
+ *
+ *      Compute the maximum no. of indirect blocks that can be read
+ *      satisfying following constraints:
+ *              - Don't read indirect blocks beyond the end of current
+ *              doubly-indirect block.
+ *              - Don't read beyond eof.
+ */
+static inline unsigned long ext3_get_max_read(const struct inode *inode,
+						  int block,
+						  int offset_in_dind_block)
+{
+	const struct super_block *sb = inode->i_sb;
+	unsigned long max_read;
+	unsigned long ptrs = EXT3_ADDR_PER_BLOCK(inode->i_sb);
+	unsigned long ptrs_bits = EXT3_ADDR_PER_BLOCK_BITS(inode->i_sb);
+	unsigned long blocks_in_file =
+		(inode->i_size + sb->s_blocksize - 1) >> sb->s_blocksize_bits;
+	unsigned long remaining_ind_blks_in_dind =
+		(ptrs >= offset_in_dind_block) ? (ptrs - offset_in_dind_block)
+					       : 0;
+	unsigned long remaining_ind_blks_before_eof =
+		((blocks_in_file - EXT3_NDIR_BLOCKS + ptrs - 1) >> ptrs_bits) -
+		((block - EXT3_NDIR_BLOCKS) >> ptrs_bits);
+
+	BUG_ON(block >= blocks_in_file);
+
+	max_read = min_t(unsigned long, remaining_ind_blks_in_dind,
+			 remaining_ind_blks_before_eof);
+
+	BUG_ON(max_read < 1);
+
+	return max_read;
+}
+
+static void ext3_read_indblocks_submit(struct bio **pbio,
+					struct ext3_ind_read_info **pread_info,
+					int *read_cnt, int seq_prefetch)
+{
+	struct bio *bio = *pbio;
+	struct ext3_ind_read_info *read_info = *pread_info;
+
+	BUG_ON(*read_cnt < 1);
+
+	read_info->seq_prefetch = seq_prefetch;
+	read_info->count = *read_cnt;
+	read_info->size = bio->bi_size;
+	bio->bi_private = read_info;
+	bio->bi_end_io = ext3_ind_read_end_bio;
+	submit_bio(READ, bio);
+
+	*pbio = NULL;
+	*pread_info = NULL;
+	*read_cnt = 0;
+}
+
+/*
+ * ext3_read_indblocks_async --
+ *      @sb:            super block
+ *      @ind_blocks[]:  array of indirect block numbers on disk
+ *      @count:         maximum number of indirect blocks to read
+ *      @first_bh:      buffer_head for indirect block ind_blocks[0], may be
+ *                      NULL
+ *      @seq_prefetch:  if this is part of a sequential prefetch and buffers'
+ *                      prefetch bit must be set.
+ *      @blocks_done:   number of blocks considered for prefetching.
+ *
+ *      Issue a single bio request to read upto count buffers identified in
+ *      ind_blocks[]. Fewer than count buffers may be read in some cases:
+ *      - If a buffer is found to be uptodate and it's prefetch bit is set, we
+ *      don't look at any more buffers as they will most likely be in
the cache.
+ *      - We skip buffers we cannot lock without blocking (except for first_bh
+ *			read_info->seq_prefetch = seq_prefetch;
+			read_info->count = read_cnt;
+			read_info->size = bio->bi_size;
+			bio->bi_private = read_info;
+			bio->bi_end_io = ext3_ind_read_end_bio;
+			submit_bio(READ, bio);
+      if specified).
+ *      - We skip buffers beyond a certain range on disk.
+ *
+ *      This function must issue read on first_bh if specified unless of course
+ *      it's already uptodate.
+ */
+static int ext3_read_indblocks_async(struct super_block *sb,
+				     __le32 ind_blocks[], int count,
+				     struct buffer_head *first_bh,
+				     int seq_prefetch,
+				     unsigned long *blocks_done)
+{
+	struct buffer_head *bh;
+	struct bio *bio = NULL;
+	struct ext3_ind_read_info *read_info = NULL;
+	int read_cnt = 0, blk;
+	ext3_fsblk_t prev_blk = 0, io_start_blk = 0, curr;
+	int err = 0;
+
+	BUG_ON(count < 1);
+	/* Don't move this to ext3_get_max_read() since callers often need to
+	 * trim the count returned by that function. So this bound must only
+	 * be imposed at the last moment. */
+	count = min_t(unsigned long, count, EXT3_IND_READ_MAX);
+	*blocks_done = 0UL;
+
+	if (count == 1 && first_bh) {
+		lock_buffer(first_bh);
+		get_bh(first_bh);
+		first_bh->b_end_io = end_buffer_read_sync;
+		submit_bh(READ, first_bh);
+		*blocks_done = 1UL;
+		return 0;
+	}
+
+	for (blk = 0; blk < count; blk++) {
+		curr = le32_to_cpu(ind_blocks[blk]);
+
+		if (!curr)
+			continue;
+
+		if (io_start_blk > 0) {
+			if (max(io_start_blk, curr) - min(io_start_blk, curr) >=
+					EXT3_IND_READ_MAX)
+				continue;
+		}
+
+		if (prev_blk > 0 && curr != prev_blk + 1) {
+			ext3_read_indblocks_submit(&bio, &read_info,
+						&read_cnt, seq_prefetch);
+			prev_blk = 0;
+			break;
+		}
+
+		if (blk == 0 && first_bh) {
+			bh = first_bh;
+			get_bh(first_bh);
+		} else {
+			bh = sb_getblk(sb, curr);
+			if (unlikely(!bh)) {
+				err = -ENOMEM;
+				goto failure;
+			}
+		}
+
+		if (buffer_uptodate(bh)) {
+			if (ext3_buffer_prefetch(bh)) {
+				brelse(bh);
+				break;
+			}
+			brelse(bh);
+			continue;
+		}
+
+		/* Lock the buffer without blocking, skipping any buffers
+		 * which would require us to block. first_bh when specified is
+		 * an exception as caller typically wants it to be read for
+		 * sure (e.g., ext3_read_indblocks_sync).
+		 */
+		if (bh == first_bh) {
+			lock_buffer(bh);
+		} else if (test_set_buffer_locked(bh)) {
+			brelse(bh);
+			continue;
+		}
+
+		/* Check again with the buffer locked. */
+		if (buffer_uptodate(bh)) {
+			if (ext3_buffer_prefetch(bh)) {
+				unlock_buffer(bh);
+				brelse(bh);
+				break;
+			}
+			unlock_buffer(bh);
+			brelse(bh);
+			continue;
+		}
+
+		if (read_cnt == 0) {
+			/* read_info freed in ext3_ind_read_end_bio(). */
+			read_info = kmalloc(EXT3_IND_READ_INFO_SIZE(count),
+					    GFP_KERNEL);
+			if (unlikely(!read_info)) {
+				err = -ENOMEM;
+				goto failure;
+			}
+
+			bio = bio_alloc(GFP_KERNEL, count);
+			if (unlikely(!bio)) {
+				err = -ENOMEM;
+				goto failure;
+			}
+			bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9);
+			bio->bi_bdev = bh->b_bdev;
+		}
+
+		if (bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh))
+				< bh->b_size) {
+			brelse(bh);
+			if (read_cnt == 0)
+				goto failure;
+
+			break;
+		}
+
+		read_info->bh[read_cnt++] = bh;
+
+		prev_blk = curr;
+		if (io_start_blk == 0)
+			io_start_blk = curr;
+	}
+
+	if (read_cnt == 0)
+		goto done;
+
+	ext3_read_indblocks_submit(&bio, &read_info, &read_cnt, seq_prefetch);
+
+	*blocks_done = blk;
+	return 0;
+
+failure:
+	while (--read_cnt >= 0) {
+		unlock_buffer(read_info->bh[read_cnt]);
+		brelse(read_info->bh[read_cnt]);
+	}
+
+done:
+	if (read_info)
+		kfree(read_info);
+
+	if (bio)
+		bio_put(bio);
+
+	return err;
+}
+
+/*
+ * ext3_read_indblocks_sync --
+ *      @sb:            super block
+ *      @ind_blocks[]:  array of indirect block numbers on disk
+ *      @count:         maximum number of indirect blocks to read
+ *      @first_bh:      buffer_head for indirect block ind_blocks[0], must be
+ *                      non-NULL.
+ *      @seq_prefetch:  set prefetch bit of buffers, used when this is part of
+ *                      a sequential prefetch.
+ *      @blocks_done:   number of blocks considered for prefetching.
+ *
+ *      Synchronously read at most count indirect blocks listed in
+ *      ind_blocks[]. This function calls ext3_read_indblocks_async() to do all
+ *      the hard work. It waits for read to complete on first_bh before
+ *      returning.
+ */
+
+static int ext3_read_indblocks_sync(struct super_block *sb,
+				    __le32 ind_blocks[], int count,
+				    struct buffer_head *first_bh,
+				    int seq_prefetch,
+				    unsigned long *blocks_done)
+{
+	int err;
+
+	BUG_ON(count < 1);
+	BUG_ON(!first_bh);
+
+	err = ext3_read_indblocks_async(sb, ind_blocks, count, first_bh,
+					seq_prefetch, blocks_done);
+	if (err)
+		return err;
+
+	wait_on_buffer(first_bh);
+	if (!buffer_uptodate(first_bh))
+		err = -EIO;
+
+	/* if seq_prefetch != 0, ext3_read_indblocks_async() sets prefetch bit
+	 * for all buffers, but the first buffer for sync IO is never a prefetch
+	 * buffer since it's needed presently so mark it so.
+	 */
+	if (seq_prefetch)
+		ext3_clear_buffer_prefetch(first_bh);
+
+	BUG_ON(ext3_buffer_prefetch(first_bh));
+
+	return err;
+}
+
+/*
+ * ext3_read_indblocks --
+ *
+ * 	@inode: inode of file
+ * 	@iblock: block number inside file (starting from 0).
+ * 	@depth: depth of path from inode to data block.
+ * 	@offsets: array of offsets within blocks identified in 'chain'.
+ * 	@chain: array of Indirect with info about all levels of blocks until
+ * 	the data block.
+ * 	@err: error pointer.
+ *
+ * 	This function is called after reading all metablocks leading to 'iblock'
+ * 	except the (singly) indirect block. It reads the indirect block if not
+ * 	already in the cache and may also prefetch next few indirect blocks.
+ * 	It uses a combination of synchronous and asynchronous requests to
+ * 	accomplish this. We do prefetching even for random reads by reading
+ * 	ahead one indirect block since reads of size >=512KB have at least 12%
+ * 	chance of spanning two indirect blocks.
+ */
+
+static Indirect *ext3_read_indblocks(struct inode *inode, int iblock,
+				     int depth, int offsets[4],
+				     Indirect chain[4], int *err)
+{
+	struct super_block *sb = inode->i_sb;
+	struct buffer_head *first_bh, *prev_bh;
+	unsigned long max_read, blocks_done = 0;
+	__le32 *ind_blocks;
+
+	/* Must have doubly indirect block for prefetching indirect blocks. */
+	BUG_ON(depth <= 2);
+	BUG_ON(!chain[depth-2].key);
+
+	*err = 0;
+
+	/* Handle first block */
+	ind_blocks = chain[depth-2].p;
+	first_bh = sb_getblk(sb, le32_to_cpu(ind_blocks[0]));
+	if (unlikely(!first_bh)) {
+		printk(KERN_ERR "Failed to get block %u for sb %p\n",
+		       le32_to_cpu(ind_blocks[0]), sb);
+		goto failure;
+	}
+
+	BUG_ON(first_bh->b_size != sb->s_blocksize);
+
+	if (buffer_uptodate(first_bh)) {
+		/* Found the buffer in cache, either it was accessed recently or
+		 * it was prefetched while reading previous indirect block(s).
+		 * We need to figure out if we need to prefetch the following
+		 * indirect blocks.
+		 */
+		if (!ext3_buffer_prefetch(first_bh)) {
+			/* Either we've seen this indirect block before while
+			 * accessing another data block, or this is a random
+			 * read. In the former case, we must have done the
+			 * needful the first time we had a cache hit on this
+			 * indirect block, in the latter case we obviously
+			 * don't need to do any prefetching.
+			 */
+			goto done;
+		}
+
+		max_read = ext3_get_max_read(inode, iblock,
+					     offsets[depth-2]);
+
+		/* This indirect block is in the cache due to prefetching and
+		 * this is its first cache hit, clear the prefetch bit and
+		 * make sure the following blocks are also prefetched.
+		 */
+		ext3_clear_buffer_prefetch(first_bh);
+
+		if (max_read >= 2) {
+			/* ext3_read_indblocks_async() stops at the first
+			 * indirect block which has the prefetch bit set which
+			 * will most likely be the very next indirect block.
+			 */
+			ext3_read_indblocks_async(sb, &ind_blocks[1],
+						  max_read - 1,
+						  NULL, 1, &blocks_done);
+		}
+
+	} else {
+		/* Buffer is not in memory, we need to read it. If we are
+		 * reading sequentially from the previous indirect block, we
+		 * have just detected a sequential read and we must prefetch
+		 * some indirect blocks for future.
+		 */
+
+		max_read = ext3_get_max_read(inode, iblock,
+					     offsets[depth-2]);
+
+		if ((ind_blocks - (__le32 *)chain[depth-2].bh->b_data) >= 1) {
+			prev_bh = sb_getblk(sb, le32_to_cpu(ind_blocks[-1]));
+			if (buffer_uptodate(prev_bh) &&
+			    !ext3_buffer_prefetch(prev_bh)) {
+				/* Detected sequential read. */
+				brelse(prev_bh);
+
+				/* Sync read indirect block, also read the next
+				 * few indirect blocks.
+				 */
+				*err = ext3_read_indblocks_sync(sb, ind_blocks,
+							 max_read, first_bh, 1,
+							 &blocks_done);
+
+				if (*err)
+					goto out;
+
+				/* In case the very next indirect block is
+				 * discontiguous by a non-trivial amount,
+				 * ext3_read_indblocks_sync() above won't
+				 * prefetch it (indicated by blocks_done < 2).
+				 * So to help sequential read, schedule an
+				 * async request for reading the next
+				 * contiguous indirect block range (which
+				 * in metaclustering case would be the next
+				 * metacluster, without metaclustering it
+				 * would be the next indirect block). This is
+				 * expected to benefit the non-metaclustering
+				 * case.
+				 */
+				if (max_read >= 2 && blocks_done < 2)
+					ext3_read_indblocks_async(sb,
+							&ind_blocks[1],
+							max_read - 1,
+							NULL, 1, &blocks_done);
+
+				goto done;
+			}
+			brelse(prev_bh);
+		}
+
+		/* Either random read, or sequential detection failed above.
+		 * We always prefetch the next indirect block in this case
+		 * whenever possible.
+		 * This is because for random reads of size ~512KB, there is
+		 * >12% chance that a read will span two indirect blocks.
+		 */
+		*err = ext3_read_indblocks_sync(sb, ind_blocks,
+						(max_read >= 2) ? 2 : 1,
+						first_bh, 0, &blocks_done);
+		if (*err)
+			goto out;
+	}
+
+done:
+	/* Reader: pointers */
+	if (!verify_chain(chain, &chain[depth - 2])) {
+		brelse(first_bh);
+		goto changed;
+	}
+	add_chain(&chain[depth - 1], first_bh,
+		  (__le32*)first_bh->b_data + offsets[depth - 1]);
+	/* Reader: end */
+	if (!chain[depth - 1].key)
+		goto out;
+
+	BUG_ON(!buffer_uptodate(first_bh));
+	return NULL;
+
+changed:
+	*err = -EAGAIN;
+	goto out;
+failure:
+	*err = -EIO;
+out:
+	if (*err) {
+		ext3_debug("Error %d reading indirect blocks\n", *err);
+		return &chain[depth - 2];
+	} else
+		return &chain[depth - 1];
+}
+
diff -uprdN linux-2.6.23mm1-clean/fs/ext3/super.c
linux-2.6.23mm1-ext3mc/fs/ext3/super.c
--- linux-2.6.23mm1-clean/fs/ext3/super.c	2007-10-17 18:31:42.000000000 -0700
+++ linux-2.6.23mm1-ext3mc/fs/ext3/super.c	2007-11-09 16:46:29.000000000 -0800
@@ -625,6 +625,9 @@ static int ext3_show_options(struct seq_
 	else if (test_opt(sb, DATA_FLAGS) == EXT3_MOUNT_WRITEBACK_DATA)
 		seq_puts(seq, ",data=writeback");

+	if (test_opt(sb, METACLUSTER))
+		seq_puts(seq, ",metacluster");
+
 	ext3_show_quota_options(seq, sb);

 	return 0;
@@ -758,7 +761,7 @@ enum {
 	Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
 	Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota,
 	Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota,
-	Opt_grpquota
+	Opt_grpquota, Opt_metacluster
 };

 static match_table_t tokens = {
@@ -808,6 +811,7 @@ static match_table_t tokens = {
 	{Opt_quota, "quota"},
 	{Opt_usrquota, "usrquota"},
 	{Opt_barrier, "barrier=%u"},
+	{Opt_metacluster, "metacluster"},
 	{Opt_err, NULL},
 	{Opt_resize, "resize"},
 };
@@ -1140,6 +1144,9 @@ clear_qf_name:
 		case Opt_bh:
 			clear_opt(sbi->s_mount_opt, NOBH);
 			break;
+		case Opt_metacluster:
+			set_opt(sbi->s_mount_opt, METACLUSTER);
+			break;
 		default:
 			printk (KERN_ERR
 				"EXT3-fs: Unrecognized mount option \"%s\" "
diff -uprdN linux-2.6.23mm1-clean/include/linux/ext3_fs.h
linux-2.6.23mm1-ext3mc/include/linux/ext3_fs.h
--- linux-2.6.23mm1-clean/include/linux/ext3_fs.h	2007-10-17
18:31:43.000000000 -0700
+++ linux-2.6.23mm1-ext3mc/include/linux/ext3_fs.h	2007-11-15
12:03:48.000000000 -0800
@@ -380,6 +380,7 @@ struct ext3_inode {
 #define EXT3_MOUNT_QUOTA		0x80000 /* Some quota option set */
 #define EXT3_MOUNT_USRQUOTA		0x100000 /* "old" user quota */
 #define EXT3_MOUNT_GRPQUOTA		0x200000 /* "old" group quota */
+#define EXT3_MOUNT_METACLUSTER		0x400000 /* Indirect block clustering */

 /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */
 #ifndef _LINUX_EXT2_FS_H
@@ -493,6 +494,7 @@ struct ext3_super_block {
 #ifdef __KERNEL__
 #include <linux/ext3_fs_i.h>
 #include <linux/ext3_fs_sb.h>
+#include <linux/buffer_head.h>
 static inline struct ext3_sb_info * EXT3_SB(struct super_block *sb)
 {
 	return sb->s_fs_info;
@@ -722,6 +724,11 @@ struct dir_private_info {
 	__u32		next_hash;
 };

+/* Special bh flag used by the metacluster readahead logic. */
+enum ext3_bh_state_bits {
+	EXT3_BH_PREFETCH = BH_JBD_Sentinel,
+};
+
 /* calculate the first block number of the group */
 static inline ext3_fsblk_t
 ext3_group_first_block_no(struct super_block *sb, unsigned long group_no)
@@ -730,6 +737,24 @@ ext3_group_first_block_no(struct super_b
 		le32_to_cpu(EXT3_SB(sb)->s_es->s_first_data_block);
 }

+static inline void
+ext3_set_buffer_prefetch(struct buffer_head *bh)
+{
+	set_bit(EXT3_BH_PREFETCH, &bh->b_state);
+}
+
+static inline void
+ext3_clear_buffer_prefetch(struct buffer_head *bh)
+{
+	clear_bit(EXT3_BH_PREFETCH, &bh->b_state);
+}
+
+static inline int
+ext3_buffer_prefetch(struct buffer_head *bh)
+{
+	return test_bit(EXT3_BH_PREFETCH, &bh->b_state);
+}
+
 /*
  * Special error return code only used by dx_probe() and its callers.
  */
@@ -752,6 +777,9 @@ extern int ext3_bg_has_super(struct supe
 extern unsigned long ext3_bg_num_gdb(struct super_block *sb, int group);
 extern ext3_fsblk_t ext3_new_block (handle_t *handle, struct inode *inode,
 			ext3_fsblk_t goal, int *errp);
+extern int ext3_new_indirect_blocks(handle_t *handle, struct inode *,
+				unsigned long group_no, unsigned long *,
+				ext3_fsblk_t new_blocks[]);
 extern ext3_fsblk_t ext3_new_blocks (handle_t *handle, struct inode *inode,
 			ext3_fsblk_t goal, unsigned long *count, int *errp);
 extern void ext3_free_blocks (handle_t *handle, struct inode *inode,
@@ -870,6 +898,31 @@ extern const struct inode_operations ext
 extern const struct inode_operations ext3_symlink_inode_operations;
 extern const struct inode_operations ext3_fast_symlink_inode_operations;

+/*
+ * ext3_get_grp_metacluster:
+ *
+ * 	Determines metacluster block range for all block groups of the file
+ * 	system.
+ *
+ * 	Number of metacluster blocks = blocks_per_group/128. This allows us
+ * 	to fit all indirect blocks in a block group with average file size of
+ * 	256KB into the group's metacluster. We want to avoid having large
+ * 	metaclusters because then we'll run of data blocks sooner and when
+ * 	out of data blocks metaclustering goes for a toss.
+ * 	
+ */
+static inline void
+ext3_get_grp_metacluster(struct super_block *sb,
+				ext3_grpblk_t *mc_start,
+				ext3_grpblk_t *mc_end)	/* exclusive */
+{
+	*mc_start = EXT3_BLOCKS_PER_GROUP(sb) / 2;
+	if (test_opt(sb, METACLUSTER)) {
+		*mc_end = *mc_start + (EXT3_BLOCKS_PER_GROUP(sb) >> 7);
+	} else {
+		*mc_end = *mc_start;
+	}
+}

 #endif	/* __KERNEL__ */

diff -uprdN linux-2.6.23mm1-clean/include/linux/jbd.h
linux-2.6.23mm1-ext3mc/include/linux/jbd.h
--- linux-2.6.23mm1-clean/include/linux/jbd.h	2007-10-17
18:31:43.000000000 -0700
+++ linux-2.6.23mm1-ext3mc/include/linux/jbd.h	2007-11-09
16:46:29.000000000 -0800
@@ -294,6 +294,7 @@ enum jbd_state_bits {
 	BH_State,		/* Pins most journal_head state */
 	BH_JournalHead,		/* Pins bh->b_private and jh->b_bh */
 	BH_Unshadow,		/* Dummy bit, for BJ_Shadow wakeup filtering */
+	BH_JBD_Sentinel,	/* Start bit for clients of jbd */
 };

 BUFFER_FNS(JBD, jbd)
-
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html