linux-ext4 - [PATCH] Threaded e2fsck proof of concept

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [day] [month] [year] [list]
Date:	Wed, 26 Sep 2007 12:25:44 -0600
From:	Valerie Henson <val@....edu>
To:	linux-ext4@...r.kernel.org, linux-fsdevel@...r.kernel.org
Cc:	"Theodore Ts'o" <tytso@....edu>,
	Andreas Dilger <adilger@...sterfs.com>,
	Ric Wheeler <ric@....com>
Subject: [PATCH] Threaded e2fsck proof of concept

The below patch is a proof of concept that e2fsck can get a
performance improvement on file systems with more than one disk
underneath.  On my test case, a 500GB file system with 150GB in use
and 10+1 RAID underneath, elapsed time is reduced by 40-50%.  I see no
performance improvement in the single disk case.  Only the reading of
inode tables and indirect blocks in pass 1 is multi-threaded; most
likely multithreading passes 2 and 5 will help too.  The actual data
processing is still all single-threaded, which is convenient.

Designing multithreaded readahead for the long term is another
question.  The Lustre folks are working on a sys_readahead() based
patch.  True aio() is the obvious solution, but won't work for older
kernels.  Pthreads works for all kernels but is clumsy.  Coming up
with a design for readahead that allows these different
implementations is probably a good idea.

Finally, if you are planning on testing these patches:

* Use -n!  You are crazy to let this write to your file system.
* Use about 2 * number_disks threads. (Doesn't work without -A <n> of
  some sort.)
* The striping logic is probably bogus.  Try something like -z 100000.

Thanks to EMC for making this patch possible.  Share and enjoy!

-VAL

--- e2fsprogs-1.40.2.orig/e2fsck/Makefile.in
+++ e2fsprogs-1.40.2/e2fsck/Makefile.in
@@ -119,16 +119,16 @@ e2fsck: e2fsck.@...SCK_TYPE@
 e2fsck.static: $(OBJS)  $(STATIC_DEPLIBS)
 	@echo "	LD $@"
 	@$(LD) $(ALL_LDFLAGS) $(LDFLAG_STATIC) -o e2fsck.static $(OBJS) \
-		$(STATIC_LIBS) 
+		$(STATIC_LIBS) -lpthread
 
 e2fsck.shared: $(OBJS)  $(DEPLIBS)
 	@echo "	LD $@"
-	@$(LD) $(ALL_LDFLAGS) -o e2fsck.shared $(OBJS) $(LIBS) 
+	@$(LD) $(ALL_LDFLAGS) -o e2fsck.shared $(OBJS) $(LIBS)  -lpthread
 
 e2fsck.profiled: $(PROFILED_OBJS)  $(PROFILED_DEPLIBS)
 	@echo "	LD $@"
 	@$(LD) $(ALL_LDFLAGS) -g -pg -o e2fsck.profiled $(PROFILED_OBJS) \
-		$(PROFILED_LIBS) 
+		$(PROFILED_LIBS)  -lpthread
 
 tst_refcount: ea_refcount.c
 	@echo "	LD $@"
--- e2fsprogs-1.40.2.orig/e2fsck/e2fsck.h
+++ e2fsprogs-1.40.2/e2fsck/e2fsck.h
@@ -25,6 +25,7 @@
 #ifdef HAVE_SETJMP_H
 #include <setjmp.h>
 #endif
+#include <pthread.h>
 
 #if EXT2_FLAT_INCLUDES
 #include "ext2_fs.h"
@@ -334,6 +335,17 @@ struct e2fsck_struct {
 	profile_t	profile;
 
 	/*
+	 * Multithreaded readahead variables
+	 */
+	unsigned int read_threads;
+	unsigned int stripe_size;
+	struct readahead_state *readahead;
+	/* Used to signal the main thread when a bg is ready */
+	pthread_mutex_t mutex_ready;
+	pthread_cond_t buffer_ready;
+	/* Have to count groups left at the ctx level, not scan level */
+	dgrp_t groups_left;
+	/*
 	 * For the use of callers of the e2fsck functions; not used by
 	 * e2fsck functions themselves.
 	 */
--- e2fsprogs-1.40.2.orig/e2fsck/pass1.c
+++ e2fsprogs-1.40.2/e2fsck/pass1.c
@@ -96,9 +96,64 @@ struct process_inode_block {
 	struct ext2_inode inode;
 };
 
-struct scan_callback_struct {
+/*
+ * XXX Complete and total interface violation
+ *
+ * We need to skip around between block groups based on when they're
+ * done with readahead, rather than processing them sequentially.
+ * Probably just using the fs->get_blocks hook or something similar
+ * will work and will cut a few hundred lines of code.  For now, mess
+ * around with libext2fs's private structures.
+ *
+ */
+
+struct ext2_struct_inode_scan {
+	errcode_t		magic;
+	ext2_filsys		fs;
+	ext2_ino_t		current_inode;
+	blk_t			current_block;
+	dgrp_t			current_group;
+	ext2_ino_t		inodes_left;
+	blk_t			blocks_left;
+	dgrp_t			groups_left;
+	blk_t			inode_buffer_blocks;
+	char *			inode_buffer;
+	int			inode_size;
+	char *			ptr;
+	int			bytes_left;
+	char			*temp_buffer;
+	errcode_t		(*done_group)(ext2_filsys fs,
+					      ext2_inode_scan scan,
+					      dgrp_t group,
+					      void * priv_data);
+	void *			done_group_data;
+	int			bad_block_ptr;
+	int			scan_flags;
+	int			reserved[6];
+};
+
+/*
+ * Per thread readahead state.
+ */
+
+struct readahead_state {
+	ext2_filsys	fs;
+	ext2_inode_scan	scan;
 	e2fsck_t	ctx;
-	char		*block_buf;
+	pthread_t	pthread;
+	unsigned int	thread;
+	int		bg_readahead_done;
+	pthread_mutex_t	mutex;
+	pthread_cond_t	pause;
+	blk_t		*ind_blks_queue;
+	unsigned int	ind_blks_count;
+	char		*ind_blks_bufs[3];
+};
+
+struct scan_callback_struct {
+	e2fsck_t		ctx;
+	char			*block_buf;
+	struct readahead_state	*readahead;
 };
 
 /*
@@ -107,6 +162,373 @@ struct scan_callback_struct {
 static struct process_inode_block *inodes_to_process;
 static int process_inode_count;
 
+/*
+ * For the indirect block readahead queue.
+ */
+
+static unsigned int ind_blks_queue_size = 1024; /* Should be an option */
+static unsigned int bad_ind_blk_count;
+
+/*
+ * Minimal sanity check on indirect block addresses during readahead.
+ */
+
+static int
+check_sanity(struct readahead_state *readahead, blk_t blk)
+{
+	e2fsck_t ctx = readahead->ctx;
+	if (blk >= ctx->fs->super->s_blocks_count ||
+	    blk < ctx->fs->super->s_first_data_block) {
+		/*
+		 * Out of range: count it and return 1 so the caller skips the
+		 * block.  fsck must survive corrupt pointers, so do not crash.
+		 * NOTE(review): counter is bumped by threads without locking.
+		 */
+		bad_ind_blk_count++;
+		return 1;
+	}
+	return 0;
+}
+
+static void
+ind_block_readahead(struct readahead_state *readahead, blk_t blk, char *buf)
+{
+	if (check_sanity(readahead, blk))	/* non-zero means the block number is out of range */
+		return;
+#if 0
+	printf("Pre read of ind blk %u\n", blk);
+#endif
+	/* Read one block via this thread's fs handle and hope it sticks in buffer cache; the result is not checked */
+	io_channel_read_blk(readahead->fs->io, blk, 1, buf);
+}
+
+/* qsort() comparator: ascending block numbers; returns -1, 0 or 1. */
+static EXT2_QSORT_TYPE process_ind_blks_cmp(const void *a, const void *b)
+{
+	const blk_t blk_a = *(const blk_t *) a, blk_b = *(const blk_t *) b;
+	/* Compare, don't subtract: a truncated difference can flip sign */
+	return (blk_a > blk_b) - (blk_a < blk_b);
+}
+
+static void
+process_ind_blks(struct readahead_state *readahead)
+{
+	blk_t *queue = readahead->ind_blks_queue;
+	unsigned int count = readahead->ind_blks_count;
+	char *buf = readahead->ind_blks_bufs[0];
+	unsigned int i;	/* same type as count: no signed/unsigned compare */
+	/* Flush the queue: sort the pending blocks, pre-read each, then reset */
+	if (count == 0)
+		return;
+	qsort(queue, count, sizeof (queue[0]), process_ind_blks_cmp);
+	for (i = 0; i < count; i++)
+		ind_block_readahead(readahead, queue[i], buf);
+	readahead->ind_blks_count = 0;	/* queue is empty again */
+}
+
+/*
+ * Add indirect blocks to the queue of to-be-read blocks.
+ *
+ * The queue is the obvious performance optimization - sort the blocks
+ * to be read by address.  The double/triples are read immediately and
+ * the singles put in the queue.  I wrote a version where the
+ * doubles/triples had their own queues and were sorted and read
+ * independently, but that went slower.
+ *
+ * This effectively does the exact same optimization as the inode
+ * sorting, just simplified since we're not doing the inode checks at
+ * the same time.
+ */
+
+static void
+add_to_queue(struct readahead_state *readahead, blk_t blk, int level)
+{
+	blk_t *queue = readahead->ind_blks_queue;
+	char *buf = readahead->ind_blks_bufs[level];
+	e2fsck_t ctx = readahead->ctx;
+	int limit = ctx->fs->blocksize >> 2;	/* entries walked per block */
+	blk_t *blk_ptrs;
+	int i;
+	/* level: 0 = single, 1 = double, 2 = triple indirect; blk 0 is ignored */
+	if (blk == 0)
+		return;
+	if (check_sanity(readahead, blk))
+		return;
+
+	if (level == 0) {
+		/* Single indirect block: flush first when the queue is full */
+		if (readahead->ind_blks_count == ind_blks_queue_size)
+			process_ind_blks(readahead);
+		/* Use the live count: the flush above resets it to zero */
+		queue[readahead->ind_blks_count] = blk;
+		readahead->ind_blks_count++;
+	} else {
+		/* Double or triple - read it and rerun */
+		ind_block_readahead(readahead, blk, buf);
+		blk_ptrs = (blk_t *) buf;
+		for (i = 0; i < limit; i++)
+			add_to_queue(readahead, blk_ptrs[i], level - 1);
+	}
+}
+
+/*
+ * Do readahead on inodes in a block group.
+ */
+
+static void
+readahead_ind_blocks(struct readahead_state *readahead)
+{
+	ext2_inode_scan scan = readahead->scan;
+	char *raw = scan->inode_buffer;	/* inode table just read for this group */
+	struct ext2_inode *inode;
+	int num_inodes = scan->inodes_left;
+	int i;
+	/* Step by scan->inode_size (assumed set when the scan was opened), not sizeof */
+	for (i = 0; i < num_inodes; i++, raw += scan->inode_size) {
+		inode = (struct ext2_inode *) raw;
+		if (ext2fs_inode_has_valid_blocks(inode)) {
+			/* add_to_queue deals with zero pointers, etc. */
+			add_to_queue(readahead, inode->i_block[EXT2_IND_BLOCK], 0);
+			add_to_queue(readahead, inode->i_block[EXT2_DIND_BLOCK], 1);
+			add_to_queue(readahead, inode->i_block[EXT2_TIND_BLOCK], 2);
+		}
+	}
+	/* Finish off the queue */
+	process_ind_blks(readahead);
+}
+
+/*
+ * Find the next block group number in our stripe.
+ *
+ * Currently readahead is done on block group boundaries.  It may make
+ * more sense to ignore block group boundaries, since block groups can
+ * straddle stripes.  This should probably interact with the stripe
+ * size setting used when creating the file system.
+ */
+
+static dgrp_t
+next_bg_in_stripe(struct readahead_state *readahead)
+{
+	ext2_inode_scan scan = readahead->scan;
+	dgrp_t new_grp = scan->current_group + 1;
+	unsigned long long new_block;
+	unsigned long long new_byte;
+	unsigned long long stripe;
+	unsigned int thread;
+	/* Returns the next group owned by this thread, or group_desc_count if none */
+	for (new_grp = scan->current_group + 1;
+	     new_grp < readahead->ctx->fs->group_desc_count;
+	     new_grp++) {
+		/* Get the block offset of the inode table */
+		new_block = scan->fs->group_desc[new_grp].bg_inode_table;
+		/* Convert to bytes - stripe size is in bytes */
+		new_byte = new_block * scan->fs->blocksize;
+		/* Stripe number; with no stripe size set (-z), each group is its own stripe */
+		stripe = readahead->ctx->stripe_size ?
+			new_byte / readahead->ctx->stripe_size : new_grp;
+		/* Modulo number of threads to get thread number */
+		thread = stripe % readahead->ctx->read_threads;
+#if 0
+		printf("block %llu byte %llu stripe %llu thread %u read_threads %u\n",
+		       new_block, new_byte, stripe, thread, readahead->ctx->read_threads);
+#endif
+		if (thread == readahead->thread)
+			break;
+	}
+	printf("Thread %u chooses bg %d\n", readahead->thread, new_grp);
+	return new_grp;
+}
+
+/*
+ * Stolen from inode.c and modified for multi-threaded I/O.
+ *
+ * This function is called by each readahead thread (readahead_bg_loop)
+ * to read in the inode table of the next block group in its stripe.
+ *
+ * This is, of course, a gross violation of the interface and has to be
+ * fixed.
+ */
+
+static int get_next_blocks_threaded(struct readahead_state *readahead)
+{
+	ext2_inode_scan scan = readahead->scan;
+	blk_t		num_blocks;
+	/* Returns 0 when this thread has no block group left, 1 after loading one */
+	scan->current_group = next_bg_in_stripe(readahead);
+	if (scan->current_group >= scan->fs->group_desc_count)
+		return 0;
+
+	scan->current_block = scan->fs->
+		group_desc[scan->current_group].bg_inode_table;
+
+	scan->current_inode = scan->current_group *
+		EXT2_INODES_PER_GROUP(scan->fs->super);
+
+	scan->inodes_left = EXT2_INODES_PER_GROUP(scan->fs->super);
+	scan->blocks_left = scan->fs->inode_blocks_per_group;
+	/*
+	 * Read the group's inode table with a single request of
+	 * inode_buffer_blocks blocks into the scan's inode buffer.
+	 */
+	num_blocks = scan->inode_buffer_blocks;
+
+	/* Ignore bad blocks for now: the read's return value is not checked. */
+	io_channel_read_blk(readahead->fs->io,
+			    scan->current_block,
+			    (int) num_blocks,
+			    scan->inode_buffer);
+	scan->ptr = scan->inode_buffer;
+	scan->bytes_left = num_blocks * scan->fs->blocksize;
+	/*
+	 * The end result is that blocks_left is 0.  This is because
+	 * it is used to decide how many more blocks in this block
+	 * group are left to read in.
+	 */
+	scan->blocks_left -= num_blocks;
+	if (scan->current_block)
+		scan->current_block += num_blocks;
+	return 1;
+}
+
+/*
+ * Read block groups in our stripe until there are no more.
+ */
+
+static void *
+readahead_bg_loop(void *arg)
+{
+	struct readahead_state *readahead = arg;	/* this thread's private state */
+	printf("Thread %u starting\n", readahead->thread);
+	/* Read in the inode table of the next block group in our stripe */
+	while (get_next_blocks_threaded(readahead) != 0) {
+		/* Pre-read the indirect blocks of the inodes just loaded */
+		readahead_ind_blocks(readahead);
+		/*
+		 * All done!  Once we set the bg_readahead_done flag, the
+		 * main thread could come in, eat our buffer, and send
+		 * us the wake up signal at any point.  Hold the lock
+		 * across all this so we don't miss the signal.
+		 */
+		pthread_mutex_lock(&readahead->mutex);
+		readahead->bg_readahead_done = 1;
+		/* Signal main thread that we are done with the bg */
+		pthread_mutex_lock(&readahead->ctx->mutex_ready);
+		pthread_cond_signal(&readahead->ctx->buffer_ready);
+		pthread_mutex_unlock(&readahead->ctx->mutex_ready);
+		/* Sleep until main thread has used our buffer; NOTE(review): no predicate is re-checked after the wait */
+		printf("Thread %d sleeping\n", readahead->thread);
+		pthread_cond_wait(&readahead->pause, &readahead->mutex);
+		pthread_mutex_unlock(&readahead->mutex);
+	}
+	printf("Thread %u exiting\n", readahead->thread);
+	pthread_exit(0);
+}
+
+static void
+readahead_start_thread(struct readahead_state *readahead, unsigned int thread)
+{
+	readahead->thread = thread;	/* index compared against the stripe's thread number */
+	pthread_mutex_init(&readahead->mutex, NULL);
+	pthread_cond_init(&readahead->pause, NULL);
+	readahead->bg_readahead_done = 0;	/* no block group has been read yet */
+	pthread_create(&readahead->pthread, NULL, readahead_bg_loop, readahead);	/* return value is not checked */
+}
+
+static void
+readahead_shutdown(e2fsck_t ctx)
+{
+	int threads = ctx->read_threads, i;
+	struct readahead_state *readahead;
+	/* Wake, join and tear down every readahead thread in turn */
+	for (i = 0; i < threads; i++) {
+		readahead = &ctx->readahead[i];
+		pthread_mutex_lock(&readahead->mutex);
+		pthread_cond_signal(&readahead->pause);
+		pthread_mutex_unlock(&readahead->mutex);
+		printf("Shutting down thread %d... ", i);
+		pthread_join(readahead->pthread, NULL);
+		printf("done\n");
+		ext2fs_free_mem(&readahead->ind_blks_bufs[0]);
+		ext2fs_free_mem(&readahead->ind_blks_bufs[1]);
+		ext2fs_free_mem(&readahead->ind_blks_bufs[2]);
+		ext2fs_free_mem(&readahead->ind_blks_queue);
+		ext2fs_close_inode_scan(readahead->scan);
+		ext2fs_close(readahead->fs);	/* per-thread handle was never released */
+	}
+}
+
+/*
+ * Find a block group that's done with readahead.
+ */
+
+static void
+get_ready_blockgroup(struct readahead_state **readaheadp)
+{
+	e2fsck_t ctx = (*readaheadp)->ctx;	/* *readaheadp must already point at a valid state */
+	static int last_index = 0;	/* thread picked by the previous call */
+	struct readahead_state *readahead;
+	int i;
+	int index;
+	/* Blocks until some thread has a block group ready; stores that thread in *readaheadp */
+	pthread_mutex_lock(&ctx->mutex_ready);
+ restart:
+	printf("Main thread: ");
+	for (i = 0; i < ctx->read_threads; i++) {
+		/* Start from the last thread + 1 to evenly spread the workload */
+		index = (last_index + 1 + i) % ctx->read_threads;
+		printf("%d ", index);
+		readahead = &ctx->readahead[index];
+		if (readahead->bg_readahead_done) {	/* NOTE(review): the thread sets this under its own mutex, not mutex_ready */
+			/* Found! Claim the buffer and hand it to the caller */
+			printf("\nPicked thread %d bg %d (%d/%d left)\n",
+			       index, readahead->scan->current_group,
+			       ctx->groups_left, ctx->fs->group_desc_count);
+			ctx->groups_left--;
+			last_index = index;
+			readahead->bg_readahead_done = 0;
+			*readaheadp = readahead;
+			pthread_mutex_unlock(&ctx->mutex_ready);
+			return;
+		}
+	} 
+	printf("...nothing\n");
+	/*
+	 * No readahead threads are ready, go to sleep and wait for
+	 * one to finish.  This is going to happen a lot unless you're
+	 * by some miracle not I/O bound.
+	 */
+	pthread_cond_wait(&ctx->buffer_ready, &ctx->mutex_ready);
+	goto restart;	/* rescan every thread after each wakeup */
+}
+
+/*
+ * Called when we're done with the current block group.
+ */
+
+static int
+get_next_blockgroup_threaded(struct readahead_state **readaheadp, ext2_ino_t *ino)
+{
+	int retval;
+	ext2_inode_scan scan = (*readaheadp)->scan;
+	e2fsck_t ctx = (*readaheadp)->ctx;
+	/* Returns the callback's non-zero error, else 0 after selecting the next ready group (if any) */
+	if (scan->done_group) {
+		retval = (scan->done_group)	/* end-of-group callback stored on the scan */
+			(scan->fs, scan, scan->current_group,
+			 scan->done_group_data);
+		if (retval) {
+			printf("*** retval %d\n", retval);
+			return retval;
+		}
+		if (ctx->groups_left <= 0) {	/* NOTE(review): only tested when a callback is set */
+			*ino = 0;	/* nothing left: report inode 0, *readaheadp unchanged */
+			return 0;
+		}
+	}
+	get_ready_blockgroup(readaheadp);
+	return 0;
+}
+
 static __u64 ext2_max_sizes[EXT2_MAX_BLOCK_LOG_SIZE -
 			    EXT2_MIN_BLOCK_LOG_SIZE + 1];
 
@@ -483,7 +905,8 @@ void e2fsck_pass1(e2fsck_t ctx)
 	int		imagic_fs;
 	int		busted_fs_time = 0;
 	int		inode_size;
-	
+	struct readahead_state *readahead;
+
 #ifdef RESOURCE_TRACK
 	init_resource_track(&rtrack);
 #endif
@@ -596,22 +1019,65 @@ void e2fsck_pass1(e2fsck_t ctx)
 	block_buf = (char *) e2fsck_allocate_memory(ctx, fs->blocksize * 3,
 						    "block interate buffer");
 	e2fsck_use_inode_shortcuts(ctx, 1);
-	old_op = ehandler_operation(_("opening inode scan"));
-	pctx.errcode = ext2fs_open_inode_scan(fs, ctx->inode_buffer_blocks, 
-					      &scan);
-	ehandler_operation(old_op);
-	if (pctx.errcode) {
-		fix_problem(ctx, PR_1_ISCAN_ERROR, &pctx);
-		ctx->flags |= E2F_FLAG_ABORT;
-		ext2fs_free_mem(&block_buf);
-		ext2fs_free_mem(&inode);
-		return;
-	}
-	ext2fs_inode_scan_flags(scan, EXT2_SF_SKIP_MISSING_ITABLE, 0);
-	ctx->stashed_inode = inode;
 	scan_struct.ctx = ctx;
 	scan_struct.block_buf = block_buf;
 	ext2fs_set_inode_callback(scan, scan_callback, &scan_struct);
+	/* Set up readahead threads */
+	/* XXX free this mem on error */
+	ctx->readahead = e2fsck_allocate_memory(ctx, sizeof (struct readahead_state) *
+					    ctx->read_threads, "multi-threaded readahead state");
+	pthread_mutex_init(&ctx->mutex_ready, NULL);
+	pthread_cond_init(&ctx->buffer_ready, NULL);
+	ctx->groups_left = ctx->fs->group_desc_count;
+	for(i = 0; i < ctx->read_threads; i++) {
+		readahead = &ctx->readahead[i];
+		/* Each thread needs its own fd to avoid an lseek/read race */
+		if (ext2fs_open2(ctx->filesystem_name, ctx->io_options, 
+				 0, ctx->superblock, ctx->blocksize,
+				 fs->io->manager, &readahead->fs)) {
+			/* XXX better error handling */
+			com_err(ctx->program_name, errno, "reopen for readahead failed\n");
+			ctx->flags |= E2F_FLAG_ABORT;
+			ext2fs_free_mem(&block_buf);
+			ext2fs_free_mem(&inode);
+			return;
+		}
+		readahead->fs->priv_data = ctx;
+		readahead->fs->now = ctx->now;
+		old_op = ehandler_operation(_("opening inode scan"));
+		/* XXX should be ctx->inode_buffer_blocks but want whole bg for simplicity */
+		pctx.errcode = ext2fs_open_inode_scan(fs, fs->inode_blocks_per_group,
+						      &readahead->scan);
+		scan = readahead->scan;
+		ehandler_operation(old_op);
+		if (pctx.errcode) {
+			fix_problem(ctx, PR_1_ISCAN_ERROR, &pctx);
+			ctx->flags |= E2F_FLAG_ABORT;
+			ext2fs_free_mem(&block_buf);
+			ext2fs_free_mem(&inode);
+			return;
+		}
+		ext2fs_inode_scan_flags(scan, EXT2_SF_SKIP_MISSING_ITABLE, 0);
+		ext2fs_set_inode_callback(scan, scan_callback, &scan_struct);
+		readahead->ctx = ctx;
+		/* Queue and buffer for indirect blocks */
+		/* XXX free this mem on error */
+		readahead->ind_blks_bufs[0] = (char *) e2fsck_allocate_memory(ctx, fs->blocksize,
+						    "indirect block readahead buffer");
+		readahead->ind_blks_bufs[1] = (char *) e2fsck_allocate_memory(ctx, fs->blocksize,
+						    "indirect block readahead buffer");
+		readahead->ind_blks_bufs[2] = (char *) e2fsck_allocate_memory(ctx, fs->blocksize,
+						    "indirect block readahead buffer");
+		readahead->ind_blks_queue = (blk_t *) e2fsck_allocate_memory(ctx,
+						     sizeof (blk_t) * ind_blks_queue_size,
+						    "indirect block readahead block list");
+		readahead->ind_blks_count = 0;
+		/* Barf.  Must be a better way to signal no scan has started. */
+		scan->current_group = -1;
+		readahead_start_thread(readahead, i);
+	}
+	ctx->stashed_inode = inode;
+	
 	if (ctx->progress)
 		if ((ctx->progress)(ctx, 1, 0, ctx->fs->group_desc_count))
 			return;
@@ -619,7 +1085,23 @@ void e2fsck_pass1(e2fsck_t ctx)
 	    (fs->super->s_mtime < fs->super->s_inodes_count))
 		busted_fs_time = 1;
 
+	/*
+	 * Find a blockgroup that's already been read in.
+	 */
+	get_ready_blockgroup(&readahead);
+	scan_struct.readahead = readahead;
+	scan = readahead->scan;
 	while (1) {
+		/* Usurp libext2fs's role in refilling inode buffers */
+		if (scan->inodes_left <= 0) {
+			if (ctx->groups_left == 0)
+				break;
+			if (get_next_blockgroup_threaded(&readahead, &ino))
+				break;
+			scan_struct.readahead = readahead;
+			scan = readahead->scan;
+		}
+
 		old_op = ehandler_operation(_("getting next inode from scan"));
 		pctx.errcode = ext2fs_get_next_inode_full(scan, &ino, 
 							  inode, inode_size);
@@ -934,8 +1416,9 @@ void e2fsck_pass1(e2fsck_t ctx)
 		}
 	}
 	process_inodes(ctx, block_buf);
+	readahead_shutdown(ctx);
 	ext2fs_close_inode_scan(scan);
-
+	printf("Bad ind blks %u\n", bad_ind_blk_count);
 	/*
 	 * If any extended attribute blocks' reference counts need to
 	 * be adjusted, either up (ctx->refcount_extra), or down
@@ -1009,6 +1492,7 @@ endit:
 	
 	ext2fs_free_mem(&block_buf);
 	ext2fs_free_mem(&inode);
+	ext2fs_free_mem(&ctx->readahead);
 
 #ifdef RESOURCE_TRACK
 	if (ctx->options & E2F_OPT_TIME2) {
@@ -1020,20 +1504,25 @@ endit:
 
 /*
  * When the inode_scan routines call this callback at the end of the
- * glock group, call process_inodes.
+ * block group, call process_inodes.
  */
 static errcode_t scan_callback(ext2_filsys fs, 
 			       ext2_inode_scan scan EXT2FS_ATTR((unused)),
 			       dgrp_t group, void * priv_data)
 {
-	struct scan_callback_struct *scan_struct;
-	e2fsck_t ctx;
+	struct scan_callback_struct *scan_struct =
+		(struct scan_callback_struct *) priv_data;
+	struct readahead_state *readahead = scan_struct->readahead;
+	e2fsck_t ctx = scan_struct->ctx;
 
-	scan_struct = (struct scan_callback_struct *) priv_data;
-	ctx = scan_struct->ctx;
-	
 	process_inodes((e2fsck_t) fs->priv_data, scan_struct->block_buf);
 
+	pthread_mutex_lock(&readahead->mutex);
+	/* Wake up the sleeping readahead thread for the bg we just finished */
+	printf("Waking thread %d\n", readahead->thread);
+	pthread_cond_signal(&readahead->pause);
+	pthread_mutex_unlock(&readahead->mutex);
+
 	if (ctx->progress)
 		if ((ctx->progress)(ctx, 1, group+1,
 				    ctx->fs->group_desc_count))
@@ -1054,21 +1543,19 @@ static void process_inodes(e2fsck_t ctx,
 	char			buf[80];
 	struct problem_context	pctx;
 	
-#if 0
-	printf("begin process_inodes: ");
-#endif
 	if (process_inode_count == 0)
 		return;
+#if 1
+	printf("begin process_inodes: curr %d\n", process_inode_count);
+#endif
 	old_operation = ehandler_operation(0);
 	old_stashed_inode = ctx->stashed_inode;
 	old_stashed_ino = ctx->stashed_ino;
-	qsort(inodes_to_process, process_inode_count,
-		      sizeof(struct process_inode_block), process_inode_cmp);
 	clear_problem_context(&pctx);
 	for (i=0; i < process_inode_count; i++) {
 		pctx.inode = ctx->stashed_inode = &inodes_to_process[i].inode;
 		pctx.ino = ctx->stashed_ino = inodes_to_process[i].ino;
-		
+
 #if 0
 		printf("%u ", pctx.ino);
 #endif
@@ -1082,7 +1569,7 @@ static void process_inodes(e2fsck_t ctx,
 	ctx->stashed_inode = old_stashed_inode;
 	ctx->stashed_ino = old_stashed_ino;
 	process_inode_count = 0;
-#if 0
+#if 1
 	printf("end process inodes\n");
 #endif
 	ehandler_operation(old_operation);
--- e2fsprogs-1.40.2.orig/e2fsck/unix.c
+++ e2fsprogs-1.40.2/e2fsck/unix.c
@@ -74,6 +74,7 @@ static void usage(e2fsck_t ctx)
 		_("Usage: %s [-panyrcdfvstDFSV] [-b superblock] [-B blocksize]\n"
 		"\t\t[-I inode_buffer_blocks] [-P process_inode_size]\n"
 		"\t\t[-l|-L bad_blocks_file] [-C fd] [-j external_journal]\n"
+		"\t\t[-A readahead_streams] [-z stripe_size]\n"
 		"\t\t[-E extended-options] device\n"),
 		ctx->program_name);
 
@@ -610,8 +611,11 @@ static errcode_t PRS(int argc, char *arg
 		ctx->program_name = *argv;
 	else
 		ctx->program_name = "e2fsck";
-	while ((c = getopt (argc, argv, "panyrcC:B:dE:fvtFVM:b:I:j:P:l:L:N:SsDk")) != EOF)
+	while ((c = getopt (argc, argv, "paA:nyrcC:B:dE:fvtFVM:b:I:j:P:l:L:N:SsDkz:")) != EOF)
 		switch (c) {
+		case 'A':
+			ctx->read_threads = atoi(optarg);
+			break;
 		case 'C':
 			ctx->progress = e2fsck_update_progress;
 			ctx->progress_fd = atoi(optarg);
@@ -734,6 +738,9 @@ static errcode_t PRS(int argc, char *arg
 		case 'k':
 			keep_bad_blocks++;
 			break;
+		case 'z':
+			ctx->stripe_size = atoi(optarg);
+			break;
 		default:
 			usage(ctx);
 		}
--- e2fsprogs-1.40.2.orig/lib/ext2fs/ind_block.c
+++ e2fsprogs-1.40.2/lib/ext2fs/ind_block.c
@@ -30,6 +30,9 @@ errcode_t ext2fs_read_ind_block(ext2_fil
 	    (fs->io != fs->image_io))
 		memset(buf, 0, fs->blocksize);
 	else {
+#if 0
+		printf("Final read of ind blk %d\n", blk);
+#endif
 		retval = io_channel_read_blk(fs->io, blk, 1, buf);
 		if (retval)
 			return retval;
-
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html