Message-ID: <20090422231816.GA7066@shell>
Date: Wed, 22 Apr 2009 19:18:16 -0400
From: Valerie Aurora Henson <vaurora@...hat.com>
To: Ric Wheeler <rwheeler@...hat.com>
Cc: nicholas.dokos@...com, linux-ext4@...r.kernel.org
Subject: Re: 32TB ext4 fsck times
On Tue, Apr 21, 2009 at 03:31:18PM -0400, Ric Wheeler wrote:
> Nick Dokos wrote:
> >Now that 64-bit e2fsck can run to completion on a (newly-minted, never
> >mounted) filesystem, here are some numbers. They must be taken with
> >a large grain of salt, of course, given the unrealistic situation, but
> >they might be reasonable lower bounds of what one might expect.
> >
> >First, the disks are 300GB SCSI 15K rpm - there are 28 disks per RAID
> >controller and they are striped into 2TiB volumes (that's a limitation
> >of the hardware). 16 of these volumes are striped together using LVM, to
> >make a 32TiB volume.
> >
> >The machine is a four-slot quad core AMD box with 128GB of memory and
> >dual-port FC adapters.
> >
> Certainly a great configuration for this test....
I've been thinking about resurrecting my parallel e2fsck patches
(below) - but I would much prefer that someone else do so. :)
Back to something useful in the short term... Was the file system created
with uninitialized block groups and lazy inode table initialization?
-VAL
The patch is against e2fsprogs 1.40.4.
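A quick note on usage: readahead stays off unless explicitly enabled
on the command line. The patch adds -A (maximum number of concurrent
ios) and -m (readahead cache budget, currently counted in file system
blocks - see the XXX in readahead.c). A hypothetical invocation, with
made-up device name and numbers:

  # allow 64 concurrent ios, cap the cache at 262144 blocks (1GiB at 4KiB)
  e2fsck -f -A 64 -m 262144 /dev/bigvol

As the comments in readahead.c explain, -m should be a bit smaller
than the buffer cache maximum (the "buff" field in vmstat).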
 e2fsck/Makefile.in      |    6
 e2fsck/e2fsck.h         |    5
 e2fsck/pass1.c          |   36
 e2fsck/pass2.c          |   12
 e2fsck/unix.c           |   18
 lib/ext2fs/readahead.c  | 1607 ++++++++++++++++++++++++++++++++
 lib/ext2fs/Makefile.in  |    1
 lib/ext2fs/block.c      |   20
 lib/ext2fs/dosio.c      |    2
 lib/ext2fs/ext2_io.h    |   10
 lib/ext2fs/ext2fs.h     |   42
 lib/ext2fs/inode.c      |   70 +
 lib/ext2fs/inode_io.c   |    2
 lib/ext2fs/io_manager.c |   12
 lib/ext2fs/nt_io.c      |    2
 lib/ext2fs/test_io.c    |    2
 lib/ext2fs/unix_io.c    |   41
 17 files changed, 1873 insertions(+), 15 deletions(-)
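Before the patch itself, here is a minimal standalone sketch (not
part of the patch; device path and sizes are invented) of the "poor
man's aio" pattern the readahead thread is built on:
posix_fadvise(POSIX_FADV_WILLNEED) starts an io in the background, a
later blocking read completes it, and POSIX_FADV_DONTNEED drops the
blocks once the main thread is done with them.

  #define _XOPEN_SOURCE 600
  #define _FILE_OFFSET_BITS 64
  #include <fcntl.h>
  #include <stdlib.h>
  #include <unistd.h>

  int main(void)
  {
          off_t offset = 0;
          size_t len = 256 * 1024;        /* one maximum-sized io */
          char *scratch = malloc(len);    /* throwaway scratch buffer */
          int fd = open("/dev/bigvol", O_RDONLY); /* invented device */

          if (fd < 0 || !scratch)
                  return 1;
          /* Start io: hint the kernel to read the range ahead */
          posix_fadvise(fd, offset, len, POSIX_FADV_WILLNEED);
          /* ... queue up and issue more ios here ... */
          /* Complete io: a blocking read guarantees the data is cached */
          pread(fd, scratch, len, offset);
          /* Release: the point was only to populate the buffer cache */
          posix_fadvise(fd, offset, len, POSIX_FADV_DONTNEED);
          free(scratch);
          close(fd);
          return 0;
  }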
--- e2fsprogs-1.40.4.orig/lib/ext2fs/ext2_io.h
+++ e2fsprogs-1.40.4/lib/ext2fs/ext2_io.h
@@ -63,6 +63,10 @@ struct struct_io_manager {
errcode_t (*set_blksize)(io_channel channel, int blksize);
errcode_t (*read_blk)(io_channel channel, unsigned long block,
int count, void *data);
+ errcode_t (*readahead)(io_channel channel, unsigned long block,
+ int count);
+ errcode_t (*cache_release)(io_channel channel, unsigned long block,
+ int count);
errcode_t (*write_blk)(io_channel channel, unsigned long block,
int count, const void *data);
errcode_t (*flush)(io_channel channel);
@@ -82,6 +86,8 @@ struct struct_io_manager {
#define io_channel_close(c) ((c)->manager->close((c)))
#define io_channel_set_blksize(c,s) ((c)->manager->set_blksize((c),s))
#define io_channel_read_blk(c,b,n,d) ((c)->manager->read_blk((c),b,n,d))
+#define io_channel_readahead(c,b,n) ((c)->manager->readahead((c),b,n))
+#define io_channel_cache_release(c,b,n) ((c)->manager->cache_release((c),b,n))
#define io_channel_write_blk(c,b,n,d) ((c)->manager->write_blk((c),b,n,d))
#define io_channel_flush(c) ((c)->manager->flush((c)))
#define io_channel_bumpcount(c) ((c)->refcount++)
@@ -92,6 +98,10 @@ extern errcode_t io_channel_set_options(
extern errcode_t io_channel_write_byte(io_channel channel,
unsigned long offset,
int count, const void *data);
+extern errcode_t readahead_noop(io_channel channel, unsigned long block,
+ int count);
+extern errcode_t cache_release_noop(io_channel channel, unsigned long block,
+ int count);
/* unix_io.c */
extern io_manager unix_io_manager;
--- e2fsprogs-1.40.4.orig/lib/ext2fs/unix_io.c
+++ e2fsprogs-1.40.4/lib/ext2fs/unix_io.c
@@ -17,6 +17,10 @@
#define _LARGEFILE_SOURCE
#define _LARGEFILE64_SOURCE
+/* Below needed for fadvise */
+#define _XOPEN_SOURCE 600
+/* Without this, fadvise goes 32 bit? */
+#define _FILE_OFFSET_BITS 64
#include <stdio.h>
#include <string.h>
@@ -77,6 +81,10 @@ static errcode_t unix_close(io_channel c
static errcode_t unix_set_blksize(io_channel channel, int blksize);
static errcode_t unix_read_blk(io_channel channel, unsigned long block,
int count, void *data);
+static errcode_t unix_readahead(io_channel channel, unsigned long block,
+ int count);
+static errcode_t unix_cache_release(io_channel channel, unsigned long block,
+ int count);
static errcode_t unix_write_blk(io_channel channel, unsigned long block,
int count, const void *data);
static errcode_t unix_flush(io_channel channel);
@@ -103,6 +111,8 @@ static struct struct_io_manager struct_u
unix_close,
unix_set_blksize,
unix_read_blk,
+ unix_readahead,
+ unix_cache_release,
unix_write_blk,
unix_flush,
#ifdef NEED_BOUNCE_BUFFER
@@ -587,6 +597,37 @@ static errcode_t unix_read_blk(io_channe
#endif /* NO_IO_CACHE */
}
+static errcode_t unix_readahead(io_channel channel, unsigned long block,
+ int count)
+{
+ struct unix_private_data *data;
+
+ data = (struct unix_private_data *)channel->private_data;
+#if DEBUG
+ printf(" RA: %s: readahead %lu, %d blocks\n", __FUNCTION__,
+ block, count);
+#endif
+ posix_fadvise(data->dev, (ext2_loff_t)block * channel->block_size,
+ (ext2_loff_t)count * channel->block_size,
+ POSIX_FADV_WILLNEED);
+ return 0;
+}
+
+static errcode_t unix_cache_release(io_channel channel, unsigned long block,
+ int count)
+{
+ struct unix_private_data *data;
+
+ data = (struct unix_private_data *)channel->private_data;
+#if DEBUG
+ printf("MT: %s: release %lu, %d blocks\n", __FUNCTION__, block, count);
+#endif
+ posix_fadvise(data->dev, (ext2_loff_t)block * channel->block_size,
+ (ext2_loff_t)count * channel->block_size,
+ POSIX_FADV_DONTNEED);
+ return 0;
+}
+
static errcode_t unix_write_blk(io_channel channel, unsigned long block,
int count, const void *buf)
{
--- e2fsprogs-1.40.4.orig/lib/ext2fs/inode_io.c
+++ e2fsprogs-1.40.4/lib/ext2fs/inode_io.c
@@ -64,6 +64,8 @@ static struct struct_io_manager struct_i
inode_close,
inode_set_blksize,
inode_read_blk,
+ readahead_noop,
+ cache_release_noop,
inode_write_blk,
inode_flush,
inode_write_byte
--- e2fsprogs-1.40.4.orig/lib/ext2fs/dosio.c
+++ e2fsprogs-1.40.4/lib/ext2fs/dosio.c
@@ -64,6 +64,8 @@ static struct struct_io_manager struct_d
dos_close,
dos_set_blksize,
dos_read_blk,
+ readahead_noop,
+ cache_release_noop,
dos_write_blk,
dos_flush
};
--- e2fsprogs-1.40.4.orig/lib/ext2fs/nt_io.c
+++ e2fsprogs-1.40.4/lib/ext2fs/nt_io.c
@@ -236,6 +236,8 @@ static struct struct_io_manager struct_n
nt_close,
nt_set_blksize,
nt_read_blk,
+ readahead_noop,
+ cache_release_noop,
nt_write_blk,
nt_flush
};
--- e2fsprogs-1.40.4.orig/lib/ext2fs/test_io.c
+++ e2fsprogs-1.40.4/lib/ext2fs/test_io.c
@@ -74,6 +74,8 @@ static struct struct_io_manager struct_t
test_close,
test_set_blksize,
test_read_blk,
+ readahead_noop,
+ cache_release_noop,
test_write_blk,
test_flush,
test_write_byte,
--- e2fsprogs-1.40.4.orig/lib/ext2fs/io_manager.c
+++ e2fsprogs-1.40.4/lib/ext2fs/io_manager.c
@@ -67,3 +67,15 @@ errcode_t io_channel_write_byte(io_chann
return EXT2_ET_UNIMPLEMENTED;
}
+
+errcode_t readahead_noop(io_channel channel, unsigned long block,
+ int count)
+{
+ return 0;
+}
+
+errcode_t cache_release_noop(io_channel channel, unsigned long block,
+ int count)
+{
+ return 0;
+}
--- e2fsprogs-1.40.4.orig/e2fsck/Makefile.in
+++ e2fsprogs-1.40.4/e2fsck/Makefile.in
@@ -119,16 +119,16 @@ e2fsck: e2fsck.@E2FSCK_TYPE@
e2fsck.static: $(OBJS) $(STATIC_DEPLIBS)
@echo " LD $@"
@$(LD) $(ALL_LDFLAGS) $(LDFLAG_STATIC) -o e2fsck.static $(OBJS) \
- $(STATIC_LIBS)
+ $(STATIC_LIBS) -lpthread
e2fsck.shared: $(OBJS) $(DEPLIBS)
@echo " LD $@"
- @$(LD) $(ALL_LDFLAGS) -o e2fsck.shared $(OBJS) $(LIBS)
+ @$(LD) $(ALL_LDFLAGS) -o e2fsck.shared $(OBJS) $(LIBS) -lpthread
e2fsck.profiled: $(PROFILED_OBJS) $(PROFILED_DEPLIBS)
@echo " LD $@"
@$(LD) $(ALL_LDFLAGS) -g -pg -o e2fsck.profiled $(PROFILED_OBJS) \
- $(PROFILED_LIBS)
+ $(PROFILED_LIBS) -lpthread
tst_refcount: ea_refcount.c
@echo " LD $@"
--- e2fsprogs-1.40.4.orig/e2fsck/e2fsck.h
+++ e2fsprogs-1.40.4/e2fsck/e2fsck.h
@@ -334,6 +334,11 @@ struct e2fsck_struct {
profile_t profile;
int blocks_per_page;
+ /*
+ * Readahead state
+ */
+ struct readahead_state *ra;
+
/*
* For the use of callers of the e2fsck functions; not used by
* e2fsck functions themselves.
--- e2fsprogs-1.40.4.orig/e2fsck/pass1.c
+++ e2fsprogs-1.40.4/e2fsck/pass1.c
@@ -463,6 +463,25 @@ extern void e2fsck_setup_tdb_icount(e2fs
*ret = 0;
}
+/*
+ * Should we submit this inode for readahead?
+ */
+
+static int readahead_this_inode(struct ext2_inode *inode)
+{
+ /* Does this inode have valid blocks (i.e., not a fast symlink)? */
+ if (ext2fs_inode_has_valid_blocks(inode) &&
+ /* Is it a directory or regular file? */
+ (LINUX_S_ISDIR(inode->i_mode) ||
+ LINUX_S_ISREG(inode->i_mode)) &&
+ /* And does it have indirect blocks? */
+ (inode->i_block[EXT2_IND_BLOCK] ||
+ inode->i_block[EXT2_DIND_BLOCK] ||
+ inode->i_block[EXT2_TIND_BLOCK]))
+ return 1;
+ return 0;
+}
+
void e2fsck_pass1(e2fsck_t ctx)
{
int i;
@@ -483,7 +502,7 @@ void e2fsck_pass1(e2fsck_t ctx)
int imagic_fs;
int busted_fs_time = 0;
int inode_size;
-
+
#ifdef RESOURCE_TRACK
init_resource_track(&rtrack);
#endif
@@ -498,6 +517,8 @@ void e2fsck_pass1(e2fsck_t ctx)
ctx->dirs_to_hash = 0;
}
+ ext2fs_readahead_inode_init(ctx->ra, readahead_this_inode);
+
#ifdef MTRACE
mtrace_print("Pass 1");
#endif
@@ -619,6 +640,8 @@ void e2fsck_pass1(e2fsck_t ctx)
(fs->super->s_mtime < fs->super->s_inodes_count))
busted_fs_time = 1;
+ ext2fs_readahead_inode_start(ctx->ra);
+
while (1) {
old_op = ehandler_operation(_("getting next inode from scan"));
pctx.errcode = ext2fs_get_next_inode_full(scan, &ino,
@@ -687,6 +710,8 @@ void e2fsck_pass1(e2fsck_t ctx)
}
ext2fs_mark_inode_bitmap(ctx->inode_used_map, ino);
clear_problem_context(&pctx);
+ /* Inodes are normally released in check_blocks except for this one */
+ ext2fs_readahead_inode_release(ctx->ra, fs, ino, block_buf);
continue;
} else if (ino == EXT2_ROOT_INO) {
/*
@@ -935,6 +960,7 @@ void e2fsck_pass1(e2fsck_t ctx)
}
process_inodes(ctx, block_buf);
ext2fs_close_inode_scan(scan);
+ ext2fs_readahead_inode_exit(ctx->ra);
/*
* If any extended attribute blocks' reference counts need to
@@ -1032,6 +1058,8 @@ static errcode_t scan_callback(ext2_fils
scan_struct = (struct scan_callback_struct *) priv_data;
ctx = scan_struct->ctx;
+ ext2fs_readahead_inode_table_release(ctx->ra, fs, group);
+
process_inodes((e2fsck_t) fs->priv_data, scan_struct->block_buf);
if (ctx->progress)
@@ -1515,10 +1543,14 @@ static void check_blocks(e2fsck_t ctx, s
if (inode->i_file_acl && check_ext_attr(ctx, pctx, block_buf))
pb.num_blocks++;
- if (ext2fs_inode_has_valid_blocks(inode))
+ if (ext2fs_inode_has_valid_blocks(inode)) {
pctx->errcode = ext2fs_block_iterate2(fs, ino,
pb.is_dir ? BLOCK_FLAG_HOLE : 0,
block_buf, process_block, &pb);
+ /* Release it if we read it in the first place */
+ if (readahead_this_inode(inode))
+ ext2fs_readahead_inode_release(ctx->ra, fs, ino, block_buf);
+ }
end_problem_latch(ctx, PR_LATCH_BLOCK);
end_problem_latch(ctx, PR_LATCH_TOOBIG);
if (ctx->flags & E2F_FLAG_SIGNAL_MASK)
--- e2fsprogs-1.40.4.orig/e2fsck/unix.c
+++ e2fsprogs-1.40.4/e2fsck/unix.c
@@ -57,6 +57,8 @@ static int normalize_swapfs;
static int cflag; /* check disk */
static int show_version_only;
static int verbose;
+static unsigned int max_ios;
+static unsigned int cache_max;
static int replace_bad_blocks;
static int keep_bad_blocks;
@@ -74,6 +76,7 @@ static void usage(e2fsck_t ctx)
_("Usage: %s [-panyrcdfvstDFSV] [-b superblock] [-B blocksize]\n"
"\t\t[-I inode_buffer_blocks] [-P process_inode_size]\n"
"\t\t[-l|-L bad_blocks_file] [-C fd] [-j external_journal]\n"
+ "\t\t[-A maximum_concurrent_ios] [-m maximum_memory]\n"
"\t\t[-E extended-options] device\n"),
ctx->program_name);
@@ -619,8 +622,11 @@ static errcode_t PRS(int argc, char *arg
ctx->program_name = *argv;
else
ctx->program_name = "e2fsck";
- while ((c = getopt (argc, argv, "panyrcC:B:dE:fvtFVM:b:I:j:P:l:L:N:SsDk")) != EOF)
+ while ((c = getopt (argc, argv, "paA:nyrcC:B:dE:fvtFVm:M:b:I:j:P:l:L:N:SsDk")) != EOF)
switch (c) {
+ case 'A':
+ max_ios = atoi(optarg);
+ break;
case 'C':
ctx->progress = e2fsck_update_progress;
res = sscanf(optarg, "%d", &ctx->progress_fd);
@@ -727,6 +733,10 @@ static errcode_t PRS(int argc, char *arg
case 'V':
show_version_only = 1;
break;
+ case 'm':
+ /* XXX should handle 3M and 54kb type stuff */
+ cache_max = atoi(optarg);
+ break;
#ifdef MTRACE
case 'M':
mallwatch = (void *) strtol(optarg, NULL, 0);
@@ -1244,7 +1254,13 @@ restart:
else
journal_size = -1;
+ if (ext2fs_readahead_init(ctx->fs, max_ios, cache_max, &ctx->ra))
+ printf(_("Readahead failed to start, disabled.\n"));
+
run_result = e2fsck_run(ctx);
+
+ ext2fs_readahead_exit(&ctx->ra);
+
e2fsck_clear_progbar(ctx);
if (ctx->flags & E2F_FLAG_JOURNAL_INODE) {
--- e2fsprogs-1.40.4.orig/lib/ext2fs/Makefile.in
+++ e2fsprogs-1.40.4/lib/ext2fs/Makefile.in
@@ -59,6 +59,7 @@ OBJS= $(DEBUGFS_LIB_OBJS) $(RESIZE_LIB_O
openfs.o \
read_bb.o \
read_bb_file.o \
+ readahead.o \
res_gdt.o \
rw_bitmaps.o \
swapfs.o \
--- e2fsprogs-1.40.4.orig/lib/ext2fs/block.c
+++ e2fsprogs-1.40.4/lib/ext2fs/block.c
@@ -59,6 +59,9 @@ static int block_iterate_ind(blk_t *ind_
ret |= BLOCK_ERROR;
return ret;
}
+ if (ctx->flags & BLOCK_FLAG_IND_ONLY)
+ return ret;
+
ctx->errcode = ext2fs_read_ind_block(ctx->fs, *ind_block,
ctx->ind_buf);
if (ctx->errcode) {
@@ -259,7 +262,6 @@ static int block_iterate_tind(blk_t *tin
ret |= (*ctx->func)(ctx->fs, tind_block,
BLOCK_COUNT_TIND, ref_block,
ref_offset, ctx->priv_data);
-
return ret;
}
@@ -342,14 +344,17 @@ errcode_t ext2fs_block_iterate2(ext2_fil
/*
* Iterate over normal data blocks
*/
- for (i = 0; i < EXT2_NDIR_BLOCKS ; i++, ctx.bcount++) {
- if (blocks[i] || (flags & BLOCK_FLAG_APPEND)) {
- ret |= (*ctx.func)(fs, &blocks[i],
- ctx.bcount, 0, i, priv_data);
- if (ret & BLOCK_ABORT)
- goto abort_exit;
+ if (!(flags & BLOCK_FLAG_IND_ONLY)) {
+ for (i = 0; i < EXT2_NDIR_BLOCKS ; i++, ctx.bcount++) {
+ if (blocks[i] || (flags & BLOCK_FLAG_APPEND)) {
+ ret |= (*ctx.func)(fs, &blocks[i],
+ ctx.bcount, 0, i, priv_data);
+ if (ret & BLOCK_ABORT)
+ goto abort_exit;
+ }
}
}
+
if (*(blocks + EXT2_IND_BLOCK) || (flags & BLOCK_FLAG_APPEND)) {
ret |= block_iterate_ind(blocks + EXT2_IND_BLOCK,
0, EXT2_IND_BLOCK, &ctx);
@@ -434,4 +439,3 @@ errcode_t ext2fs_block_iterate(ext2_fils
return ext2fs_block_iterate2(fs, ino, BLOCK_FLAG_NO_LARGE | flags,
block_buf, xlate_func, &xl);
}
-
--- e2fsprogs-1.40.4.orig/lib/ext2fs/ext2fs.h
+++ e2fsprogs-1.40.4/lib/ext2fs/ext2fs.h
@@ -278,6 +278,9 @@ struct struct_ext2_filsys {
* BLOCK_FLAG_DATA_ONLY indicates that the iterator function should be
* called for data blocks only.
*
+ * BLOCK_FLAG_IND_ONLY is the opposite of BLOCK_FLAG_DATA_ONLY - only
+ * call the iterate function for indirect blocks.
+ *
* BLOCK_FLAG_NO_LARGE is for internal use only. It informs
* ext2fs_block_iterate2 that large files won't be accepted.
*/
@@ -285,6 +288,7 @@ struct struct_ext2_filsys {
#define BLOCK_FLAG_HOLE 1
#define BLOCK_FLAG_DEPTH_TRAVERSE 2
#define BLOCK_FLAG_DATA_ONLY 4
+#define BLOCK_FLAG_IND_ONLY 8
#define BLOCK_FLAG_NO_LARGE 0x1000
@@ -808,6 +812,8 @@ extern errcode_t ext2fs_get_next_inode_f
ext2_ino_t *ino,
struct ext2_inode *inode,
int bufsize);
+errcode_t ext2fs_get_next_inode_ptr(ext2_inode_scan scan, ext2_ino_t *ino,
+ struct ext2_inode **inode);
extern errcode_t ext2fs_open_inode_scan(ext2_filsys fs, int buffer_blocks,
ext2_inode_scan *ret_scan);
extern void ext2fs_close_inode_scan(ext2_inode_scan scan);
@@ -907,6 +913,42 @@ errcode_t ext2fs_link(ext2_filsys fs, ex
errcode_t ext2fs_unlink(ext2_filsys fs, ext2_ino_t dir, const char *name,
ext2_ino_t ino, int flags);
+/* readahead.c */
+
+struct readahead_state;
+
+/* Generic readahead start/stop functions */
+
+errcode_t ext2fs_readahead_init(ext2_filsys fs, unsigned int max_ios,
+ unsigned int max_mem,
+ struct readahead_state **ret_ra);
+void ext2fs_readahead_exit(struct readahead_state **ret_ra);
+void ext2fs_readahead_kill(struct readahead_state *ra);
+
+/* Inode readahead (pass 1) */
+
+void ext2fs_readahead_inode_init(struct readahead_state *ra,
+ int (*should_read_inode)(struct ext2_inode *));
+void ext2fs_readahead_inode_start(struct readahead_state *ra);
+void ext2fs_readahead_inode_exit(struct readahead_state *ra);
+void ext2fs_readahead_inode_release(struct readahead_state *ra, ext2_filsys fs,
+ ext2_ino_t ino, char *block_buf);
+void ext2fs_readahead_inode_table_release(struct readahead_state *ra,
+ ext2_filsys fs, dgrp_t group);
+/* Directory block readahead (pass 2) */
+
+void ext2fs_readahead_dblist_init(struct readahead_state *ra,
+ ext2_dblist dblist);
+void ext2fs_readahead_dblist_start(struct readahead_state *ra);
+void ext2fs_readahead_dblist_exit(struct readahead_state *ra);
+void ext2fs_readahead_dblist_release(struct readahead_state *ra,
+ ext2_filsys fs, blk_t blk);
+
+/* Private to readahead implementation */
+
+void * readahead_inode_loop(void *arg);
+void * readahead_dblist(void *arg);
+
/* read_bb.c */
extern errcode_t ext2fs_read_bb_inode(ext2_filsys fs,
ext2_badblocks_list *bb_list);
--- /dev/null
+++ e2fsprogs-1.40.4/lib/ext2fs/readahead.c
@@ -0,0 +1,1607 @@
+/*
+ * readahead.c --- Fast, parallel readahead of file system structures
+ *
+ * Copyright (C) 2007-2008 Valerie Henson <val@....edu>
+ *
+ * %Begin-Header%
+ * This file may be redistributed under the terms of the GNU Public
+ * License.
+ * %End-Header%
+ *
+ * Generic readahead functions for users of ext2 lib.
+ *
+ * The general theory of readahead is that the main thread (fsck in
+ * most cases) spawns a second thread, the readahead thread, to go
+ * preread the file system structures. The readahead thread relies on
+ * the buffer cache to store the blocks it has pre-read - it's sort of
+ * managing the buffer cache blindfolded. Similarly, the readahead
+ * thread also keeps track of how many ios it has submitted to the
+ * device which have not yet completed, so it can avoid overflowing
+ * the device io queue. This is also done without any real
+ * communication with the actual device queue; we're just guessing for
+ * the most part.
+ *
+ * This file is divided into calls made by the readahead thread and
+ * calls made by the main thread.
+ *
+ * Readahead is turned off by default, as it gives minimal or zero
+ * performance improvement on single disk systems, which make up the
+ * majority of file systems in Linux.
+ *
+ * Cache management:
+ *
+ * The total amount of cache to use is set by a command-line option.
+ * This should be a little bit smaller than the maximum memory
+ * available for buffer cache - i.e., not all of free memory, since
+ * buffer cache is usually limited to some fraction of all memory.
+ * Setting the maximum cache to greater than the buffer cache will
+ * approximately double elapsed time, since all of the file system
+ * data will be read from disk twice - once by readahead, and after it
+ * has been evicted by other pre-read blocks, a second time by the
+ * main thread. This is bad. Currently, I figure out what the buffer
+ * cache maximum is by running vmstat and reading the "buff" field.
+ *
+ * The *_readahead functions are called by the readahead thread and
+ * queue up ios to be pre-read. When the io is actually issued and
+ * completed, the readahead thread increases the number of blocks
+ * available in cache. The main thread (the client) calls the
+ * ext2fs_readahead_*_release functions (later in this file),
+ * reducing the number of blocks marked as in the cache and marking
+ * them as DONTNEED using fadvise (probably unnecessary and a no-op,
+ * as normal block aging should evict them from cache as new blocks
+ * are read).
+ *
+ * The threads sleep and wake according to the size of the cache, with
+ * the readahead thread as the producer and the main thread as the
+ * consumer. The readahead thread wakes the main thread whenever a
+ * significant amount of cache is available, and wakes and waits on
+ * the main thread if the cache is full. The main thread wakes the
+ * readahead thread whenever a significant amount of cache becomes
+ * free, and wakes and waits on the readahead thread whenever the
+ * cache becomes empty. The threads only signal each other when a
+ * significant amount of cache becomes free or available because
+ * otherwise the threads can get locked in a tight wake/sleep cycle in
+ * which the readahead thread never has the opportunity to issue
+ * multiple ios in parallel. Another small modification is that the
+ * readahead thread records how much cache it needs to complete the
+ * next io before it goes to sleep, so the main thread won't wake it
+ * unless there is enough cache to satisfy the next io.
+ *
+ * One major pitfall of this approach is that it requires very careful
+ * accounting of blocks read. The main thread must call the release
+ * function for every block that the readahead thread adds to the
+ * cache. In particular, the main thread must release the indirect
+ * blocks for every inode that the readahead thread walks. The main
+ * thread may not want to readahead every inode in the file system, so
+ * the interface allows the client to pass a function used to decide
+ * which inodes to readahead.
+ *
+ * Note that the main thread is allowed to pull slightly ahead of the
+ * readahead thread (and allow the cache used accounting to go
+ * negative) because it's hard to predict how many blocks the main
+ * thread will read when it's checking an inode. Checking before
+ * every block read would be a lot of overhead, so checking and
+ * blocking if necessary is done when the main thread is entirely done
+ * with an inode.
+ *
+ * We try to keep the granularity of adding or releasing things to the
+ * cache large, so that we don't grab locks and send signals and
+ * switch threads for every block that enters or leaves the cache.
+ *
+ * Asynchronous IO implementation:
+ *
+ * Asynchronous, parallel IO is implemented using primitives that are
+ * available, well-tested, and actually do something in most deployed
+ * Linux systems:
+ *
+ * Start io: posix_fadvise (WILLNEED)
+ * Complete io: read() into scratch buffer
+ *
+ * Amazingly, this works - and it goes fast. At this point in time,
+ * the only other serious alternative is to spawn multiple threads and
+ * do synchronous ios from the thread. No other forms of aio for
+ * Linux satisfy the above three criteria; in particular, POSIX aio is
+ * implemented as synchronous IO on most systems.
+ *
+ * Device queue management:
+ *
+ * As with the cache, we assume we have pretty much exclusive use of
+ * the device containing the file system, and try to manage the queue
+ * blind. The io queue size is currently passed in from the user, but
+ * could be found programmatically with some effort. Ios are
+ * considered outstanding after we start readahead until we read the
+ * actual data requested. We try to collect at least max_ios before
+ * sorting and issuing them, to get optimal parallelization and head
+ * movement.
+ *
+ * This code replicates some portion of the in-kernel device queue
+ * management - queue plugging, merging, sorting, etc. Why not just
+ * let the kernel handle it? The answer is that we can predict what
+ * blocks will be needed, and we know about block dependencies. The
+ * block layer has to guess what will happen next, but we have more
+ * information than we can pass down through the system call
+ * interface. That said, it's certainly possible that the performance
+ * difference will be negligible on some systems.
+ *
+ * Thread management:
+ *
+ * Each pass of readahead (inode, direct block, etc.) creates and
+ * destroys its own thread. The thread isn't created and doesn't
+ * start until the *_start() function is called, so the main thread
+ * controls when readahead starts working. Some generic helper
+ * functions are provided; the per-pass initialization only has to
+ * initialize its own private variables.
+ *
+ * TODO:
+ *
+ * - acls and xattrs are ignored, but should also be read.
+ * - The user interface is less than stellar, esp. for maximum cache memory
+ * - autoconf the thread stuff and all of readahead
+ * - Implement main thread timeout for safety
+ * - Replace dummy block hack to keep main thread from hanging
+ * - Use pthread_cleanup handlers when holding locks to prevent
+ * deadlock in the case of the thread dying while holding a lock
+ * - mincore() instead of read()?
+ * - parallelize inode table readahead
+ *
+ */
+
+#include <stdio.h>
+#include <string.h>
+#if HAVE_UNISTD_H
+#include <unistd.h>
+#endif
+#include <pthread.h>
+#include <error.h>
+#include <errno.h>
+
+#include "ext2_fs.h"
+#include "ext2fs.h"
+#include "ext2fsP.h"
+
+/* XXX should use ext2 debug routines? */
+
+/* #define EXT2FS_READAHEAD_DEBUG */
+
+#ifdef EXT2FS_READAHEAD_DEBUG
+#define dprintf printf
+#else
+#define dprintf(args...) do { } while (0)
+#endif
+
+struct io_desc {
+ blk_t blk;
+ unsigned int count;
+};
+
+struct readahead_state {
+ ext2_filsys fs;
+ int enabled;
+
+ /*
+ * Pass-independent IO and cache management.
+ */
+
+ /* Maximum number of simultaneous ios disk can handle */
+ unsigned int max_ios;
+ /* Maximum io size - to prevent deadlocks waiting for cache */
+ unsigned int max_io_size;
+ /* Kick off outstanding ios after this many msecs idle */
+ unsigned int timeout;
+ /* Should be at least max_ios in size */
+ unsigned int io_queue_size;
+ /* Blocks to read queue */
+ struct io_desc *io_queue;
+ /* Ios in the queue */
+ unsigned int ios_queued;
+ /* Blocks in the queue (but not issued) */
+ unsigned int blocks_queued;
+ /* All cache size variables are in units of file system blocks */
+ /* Maximum buffer cache to use */
+ int cache_max;
+ /* Buffer cache used by pre-read blocks */
+ int cache_used;
+ /* Cache reserved for in-progress readaheads */
+ int cache_pending;
+ /* Cache blocks needed for waiting readahead thread to progress */
+ int cache_needed;
+ /* Wake readahead when cache used gets this low */
+ unsigned int cache_restart_ra;
+ /* Wake main thread when cache used gets this high */
+ unsigned int cache_restart_main;
+ /* Protects shared cache variables */
+ pthread_mutex_t cache_mutex;
+ /* Condition variable for thread wake/sleep */
+ pthread_cond_t cache_wake;
+ /* Is readahead done? Then don't wait on zero cache */
+ int cache_readahead_done;
+ /*
+ * Scratch buffer for reading a single throwaway block. Never
+ * ever read the contents.
+ */
+ char *scratch_buf;
+ /* Readahead thread struct; only need one of them */
+ pthread_t thread;
+ /* Pass-specific exit handler */
+ void (*thread_exit)(struct readahead_state *);
+ /* Should we exit now? Check frequently */
+ int should_exit;
+
+ /*
+ * Inode readahead related state
+ */
+
+ /*
+ * Indirect blocks are traversed recursively and need one
+ * block buffer for double and triple blocks (singles use the
+ * scratch_buf).
+ */
+ char *double_buf;
+ char *triple_buf;
+ /* We do our own internal inode scan */
+ ext2_inode_scan scan;
+ /* Caller provided function to decide which inodes to read ahead */
+ int (*should_read_inode)(struct ext2_inode *);
+
+ /*
+ * Directory block readahead related state
+ */
+
+ /* List of directory blocks */
+ ext2_dblist dblist;
+};
+
+static void readahead_test_exit(struct readahead_state *ra);
+static void readahead_thread_exit_now(struct readahead_state *ra);
+
+/*
+ * Cache management routines.
+ */
+
+/*
+ * Sanity check cache variables and kill readahead if things look
+ * wonky. Caller must not hold the cache mutex.
+ */
+
+static void check_cache(struct readahead_state *ra)
+{
+ if (ra->cache_used + ra->cache_pending > ra->cache_max) {
+ fprintf(stderr, "Bug! cache_used (%d) + cache_pending (%d) is "
+ "greater than cache_max (%d)\n",
+ ra->cache_used, ra->cache_pending, ra->cache_max);
+#ifdef EXT2FS_READAHEAD_DEBUG
+ abort();
+#endif
+ readahead_thread_exit_now(ra);
+ }
+}
+
+/*
+ * Do we have room in the cache for this io?
+ */
+
+static int cache_full(struct readahead_state *ra, unsigned int blks)
+{
+ /* No lock needed - main thread can only decrease cache_used */
+ if (ra->cache_used + ra->cache_pending + blks > ra->cache_max) {
+ dprintf(" RA: %s: cache_used %d cache_pending %d blks %u\n",
+ __FUNCTION__, ra->cache_used, ra->cache_pending, blks);
+ return 1;
+ }
+ return 0;
+}
+
+/*
+ * Wait until cache becomes available.
+ */
+
+static void wait_for_cache(struct readahead_state *ra, unsigned int blks)
+{
+ check_cache(ra);
+ pthread_mutex_lock(&ra->cache_mutex);
+ /*
+ * The main thread could have freed up cache since the call to
+ * cache_full(), check to see if we already have space
+ * available.
+ */
+ if (ra->cache_used + ra->cache_pending + blks <= ra->cache_max) {
+ pthread_mutex_unlock(&ra->cache_mutex);
+ return;
+ }
+ /* Guess not. Wait for it. */
+ ra->cache_needed = blks;
+ dprintf(" RA: %s: Need cache, waking main thread, used %d needed %d\n",
+ __FUNCTION__, ra->cache_used, ra->cache_needed);
+ pthread_cond_signal(&ra->cache_wake);
+ dprintf(" RA: %s: Waiting for main thread to free cache...\n",
+ __FUNCTION__);
+ pthread_cond_wait(&ra->cache_wake, &ra->cache_mutex);
+ /* cache_needed is reset by the main thread */
+ pthread_mutex_unlock(&ra->cache_mutex);
+ /* See if we were actually woken by request to exit */
+ readahead_test_exit(ra);
+}
+
+/*
+ * Track blocks read into buffer cache and ready for the main thread
+ * to use.
+ */
+
+static void blocks_ready(struct readahead_state *ra, unsigned int count)
+{
+ check_cache(ra);
+ pthread_mutex_lock(&ra->cache_mutex);
+ dprintf(" RA: %s: cache_pending %d cache_used %d count %u\n",
+ __FUNCTION__, ra->cache_pending, ra->cache_used, count);
+ ra->cache_pending -= count;
+ ra->cache_used += count;
+ /* Wake main thread if we crossed the boundary */
+ if ((ra->cache_used - count < ra->cache_restart_main) &&
+ (ra->cache_used >= ra->cache_restart_main)) {
+ dprintf(" RA: %s: Cache filling up, waking main thread "
+ "(used %d)\n", __FUNCTION__, ra->cache_used);
+ pthread_cond_signal(&ra->cache_wake);
+ }
+ pthread_mutex_unlock(&ra->cache_mutex);
+}
+
+/*
+ * IO queue management routines.
+ */
+
+#ifdef EXT2FS_READAHEAD_DEBUG
+
+/*
+ * Sanity check the io queue.
+ */
+
+static void check_queue(struct readahead_state *ra)
+{
+ struct io_desc *io;
+ int i;
+
+ if (ra->ios_queued > ra->io_queue_size) {
+ fprintf(stderr, "Bug! IOs queued %u > IO queue size %u\n",
+ ra->ios_queued, ra->io_queue_size);
+ goto out;
+ }
+
+ for (i = 0; i < ra->io_queue_size; i++) {
+ io = &ra->io_queue[i];
+ /* Only print non-zero ios for brevity */
+ if (io->blk)
+ dprintf(" io[%d] blk %u count %u\n", i, io->blk,
+ io->count);
+ if ((io->blk || io->count) &&
+ (i >= ra->ios_queued)) {
+ fprintf(stderr, "Bug! io[%d] lost! "
+ "(blk %u count %u)\n", i, io->blk, io->count);
+ goto out;
+ }
+ }
+
+ return;
+
+ out:
+#ifdef EXT2FS_READAHEAD_DEBUG
+ abort();
+#endif
+ readahead_thread_exit_now(ra);
+}
+#else
+#define check_queue(x) do { } while (0)
+#endif
+
+/*
+ * Compare function for sorting the io queue. We sort unused (zero)
+ * ios created by merge_ios() to the end.
+ */
+
+static EXT2_QSORT_TYPE io_compare(const void *a, const void *b)
+{
+ struct io_desc *io_a = (struct io_desc *) a;
+ struct io_desc *io_b = (struct io_desc *) b;
+
+ if ((io_a->blk == 0) &&
+ (io_b->blk == 0))
+ /* Don't care */
+ return 0;
+ /*
+ * Make zeroes bigger than everything else so they move to the
+ * end of the queue.
+ */
+ if (io_a->blk == 0)
+ return 1;
+
+ if (io_b->blk == 0)
+ return -1;
+
+ return (io_a->blk > io_b->blk) - (io_a->blk < io_b->blk);
+}
+
+/*
+ * Merge ios if they are contiguous while respecting maximum IO size.
+ */
+
+static void merge_ios(struct readahead_state *ra)
+{
+ /* merge_ios must be called with at least one io queued */
+ unsigned int new_ios = 1;
+ struct io_desc *last_io;
+ struct io_desc *new_io;
+ blk_t last_blk;
+ blk_t new_last_blk;
+ int i;
+
+ qsort(ra->io_queue, ra->ios_queued, sizeof(ra->io_queue[0]),
+ io_compare);
+
+ /* Set last_io to the first io... */
+ last_io = &ra->io_queue[0];
+ new_ios = 1;
+ /* Then start merging at second io */
+ for (i = 1; i < ra->ios_queued; i++) {
+ dprintf(" RA: %s: io[%d]\n", __FUNCTION__, i);
+ new_io = &ra->io_queue[i];
+ last_blk = last_io->blk + last_io->count;
+ /*
+ * Merge ios if both (1) the start of the new io is <=
+ * to the end of the last io, and (2) the new io would
+ * not be bigger than the maximum allowed io.
+ */
+ if ((new_io->blk <= last_blk) &&
+ ((new_io->count + last_io->count) <= ra->max_io_size)) {
+ dprintf(" RA: %s: io merged! [%u,%u] + [%u,%u] = ",
+ __FUNCTION__, last_io->blk, last_io->count,
+ new_io->blk, new_io->count);
+ new_last_blk = new_io->blk + new_io->count;
+ /*
+ * Be careful. The end of this io could be
+ * less than the end of last io.
+ */
+ if (new_last_blk > last_blk)
+ last_io->count = new_last_blk - last_io->blk;
+ /* Clear the merged io */
+ new_io->blk = 0;
+ new_io->count = 0;
+ dprintf("[%u,%u]\n", last_io->blk, last_io->count);
+ } else {
+ dprintf(" RA: Not merged: [%u,%u] and [%u,%u]\n",
+ last_io->blk, last_io->count,
+ new_io->blk, new_io->count);
+ /* This is our new last io */
+ last_io = new_io;
+ new_ios++;
+ }
+ }
+
+ /* Resort to move zeroes to the end */
+ qsort(ra->io_queue, ra->ios_queued, sizeof(ra->io_queue[0]),
+ io_compare);
+
+ check_queue(ra);
+
+ ra->ios_queued = new_ios;
+}
+
+/*
+ * Ios can be bad for at least two reasons:
+ *
+ * - corrupted on-disk block pointer (can't prevent this)
+ * - programming error (a.k.a. "bug")
+ *
+ * Check for and reject obviously bad ios before they are issued or
+ * cache is reserved for them.
+ */
+
+static int reject_io(struct readahead_state *ra, blk_t blk, unsigned int count)
+{
+ blk_t max_blk = ra->fs->super->s_blocks_count - 1;
+
+ /* Is the blk number within the bounds of the file system? */
+ if ((blk == 0) || (blk + count - 1 > max_blk)) {
+ fprintf(stderr, "Bad io: blk %u count %u\n", blk, count);
+#ifdef EXT2FS_READAHEAD_DEBUG
+ abort();
+#endif
+ return 1;
+ }
+
+ return 0;
+}
+
+/*
+ * Complete issued ios to get them out of the device's io queue.
+ */
+
+static void reap_ios(struct readahead_state *ra, unsigned int start,
+ unsigned int count)
+{
+ unsigned int blocks_reaped = 0;
+ struct io_desc *io;
+ int i;
+
+ dprintf(" RA: %s: start %u count %u\n", __FUNCTION__, start, count);
+ for (i = start; i < start + count; i++) {
+ io = &ra->io_queue[i];
+ if (reject_io(ra, io->blk, io->count))
+ continue;
+ /*
+ * Read blocks into the scratch buffer one io at a
+ * time and throw them away (scratch buffer is big
+ * enough for the maximum possible io). Wastes memory
+ * bandwidth, but I sure hope that's not our
+ * bottleneck. Also, passing the memory back to the
+ * main thread is painful, and keeping the blocks in
+ * memory would result in double-buffering for
+ * everything in the cache anyway.
+ */
+ io_channel_read_blk(ra->fs->io, io->blk, io->count,
+ ra->scratch_buf);
+ blocks_reaped += io->count;
+ /*
+ * Only release blocks to the cache when we have a lot
+ * of them to avoid excessive lock and signal
+ * overhead.
+ */
+ /* XXX should be based on high/low watermarks */
+ if (blocks_reaped > (ra->cache_max / 10)) {
+ blocks_ready(ra, blocks_reaped);
+ blocks_reaped = 0;
+ }
+ /* Clear the io descriptor for debugging */
+ io->blk = 0;
+ io->count = 0;
+ /*
+ * Frequent checking for an exit request is especially
+ * important when doing IO.
+ */
+ readahead_test_exit(ra);
+ }
+ blocks_ready(ra, blocks_reaped);
+ blocks_reaped = 0;
+}
+
+/*
+ * Given an array of ios, sort, issue, and wait for them.
+ *
+ * Device queue management is important. We want to avoid overflowing
+ * the device queue and getting our readahead requests thrown away, so
+ * we need to know whether the previous readahead ios have completed.
+ * At the same time, we want to issue max_ios number of sorted IOs in
+ * parallel. So we collect ios at user level and wait until (1) the
+ * queue is full, (2) we need a synchronous read (as for double/triple
+ * indirect blocks or inode tables), or (3) we timeout. Then we issue
+ * the readahead requests, up to max_ios at a time. We then do a
+ * synchronous read of all the outstanding ios to make sure they have
+ * completed before issuing the next set of ios. Poor man's aio.
+ */
+
+static void issue_ios(struct readahead_state *ra)
+{
+ unsigned int ios_in_flight = 0;
+ struct io_desc *io;
+ int ios_finished = 0;
+ int i;
+
+ merge_ios(ra);
+
+ dprintf(" RA: %s: ios_queued %u blocks_queued %u after merge\n",
+ __FUNCTION__, ra->ios_queued, ra->blocks_queued);
+ /*
+ * Issue up to max ios at a time. If necessary, wake the main
+ * thread and wait for cache to become available.
+ */
+ for (i = 0; i < ra->ios_queued; i++) {
+ io = &ra->io_queue[i];
+ /* Check io and cache limits */
+ if ((ios_in_flight == ra->max_ios) ||
+ cache_full(ra, io->count)) {
+ /* Cache won't overflow if we issue current ios */
+ reap_ios(ra, ios_finished, ios_in_flight);
+ ios_in_flight = 0;
+ ios_finished = i;
+ }
+ /* Check that we have enough cache */
+ wait_for_cache(ra, io->count);
+ /* Now we have enough cache and a free io slot */
+ dprintf(" RA: %s: issuing io %d blk %u count %d\n",
+ __FUNCTION__, i, io->blk, io->count);
+ io_channel_readahead(ra->fs->io, io->blk, io->count);
+ /* Reserve cache for this io */
+ ra->cache_pending += io->count;
+ ios_in_flight++;
+ readahead_test_exit(ra);
+ }
+ /* Finish off the last of the ios */
+ if (ios_in_flight)
+ reap_ios(ra, ios_finished, ios_in_flight);
+ ra->ios_queued = 0;
+ ra->blocks_queued = 0;
+}
+
+/*
+ * Issue any outstanding ios and wake main thread one more time.
+ * Called before thread exits.
+ */
+
+static void finish_ios(struct readahead_state *ra)
+{
+ /* Kick off any left-over ios */
+ if (ra->ios_queued)
+ issue_ios(ra);
+
+ /* Wake main thread one more time in case it's waiting on us */
+ pthread_mutex_lock(&ra->cache_mutex);
+ /*
+ * XXX HACK: Prevent main thread hanging on zero cache by
+ * adding a dummy block to the cache. Will deadlock if we
+ * have a cache accounting bug that makes cache_used go
+ * negative.
+ */
+ ra->cache_used += 1;
+ dprintf(" RA: %s: Readahead finished, waking main thread "
+ "cache_used %d\n", __FUNCTION__, ra->cache_used);
+ pthread_cond_signal(&ra->cache_wake);
+ pthread_mutex_unlock(&ra->cache_mutex);
+}
+
+/*
+ * Check if the io queue is full. If it is, try to squish the ios
+ * together to free up io slots.
+ */
+
+static int queue_full(struct readahead_state *ra)
+{
+ if (ra->ios_queued == ra->io_queue_size)
+ merge_ios(ra);
+ if (ra->ios_queued == ra->io_queue_size)
+ return 1;
+
+ return 0;
+}
+
+/*
+ * Add an io to the readahead queue, issuing ios if necessary to make
+ * space in the queue.
+ */
+
+static void queue_blks(struct readahead_state *ra, blk_t blk,
+ unsigned int count)
+{
+ unsigned int blks_to_queue;
+ struct io_desc *io;
+
+ dprintf(" RA: %s: queue %u, blk %u, count %d\n", __FUNCTION__,
+ ra->ios_queued, blk, count);
+
+ if (reject_io(ra, blk, count))
+ return;
+
+ /* Break down large requests into max_io_size chunks */
+ while (count) {
+ if (queue_full(ra))
+ issue_ios(ra);
+ io = &ra->io_queue[ra->ios_queued];
+ blks_to_queue = count > ra->max_io_size ?
+ ra->max_io_size : count;
+ io->blk = blk;
+ io->count = blks_to_queue;
+ /* printf("%s: queued blk %u count %u\n", __FUNCTION__, io->blk,
+ io->count); */
+ ra->blocks_queued += blks_to_queue;
+ ra->ios_queued++;
+ count -= blks_to_queue;
+ blk += blks_to_queue;
+ }
+}
+
+/*
+ * Read requested blocks, issuing any other waiting ios while we're at
+ * it.
+ */
+
+static void read_blks(struct readahead_state *ra, blk_t blk,
+ unsigned int count, char *buf)
+{
+ dprintf(" RA: %s: blk %u count %u\n", __FUNCTION__, blk, count);
+
+ if (reject_io(ra, blk, count))
+ return;
+
+ /* Add to readahead queue */
+ queue_blks(ra, blk, count);
+ /*
+ * Since we have to go to disk anyway, kick off and complete
+ * all outstanding ios.
+ */
+ issue_ios(ra);
+ /*
+ * Retrieve specific data requested. The cache used by this
+ * block is already accounted for.
+ */
+ io_channel_read_blk(ra->fs->io, blk, count, buf);
+}
+
+/*
+ * Inode table and indirect block readahead routines.
+ */
+
+/*
+ * Given a triple, double, or single indirect block, recursively
+ * traverse the indirect block tree. Read triples and doubles
+ * immediately, but just add singles to the main block queue and read
+ * them at our convenience.
+ */
+
+static void readahead_ind_blk(struct readahead_state *ra, blk_t blk,
+ int level, int print_header)
+{
+ unsigned int limit = ra->fs->blocksize >> 2;
+ blk_t *blk_ptr;
+ char *buf;
+ int i;
+
+ if (print_header)
+ dprintf(" RA: %*s%s: blk %u level %d\n", level * 4, "",
+ __FUNCTION__, blk, level);
+ else if (level != 0)
+ /* Start a new line */
+ dprintf("\n");
+
+ if (reject_io(ra, blk, 1))
+ return;
+
+ /* Single indirect block? Just queue it up and return. */
+ if (level == 0) {
+ queue_blks(ra, blk, 1);
+ return;
+ }
+
+ /* Read double/triple immediately */
+ if (level == 1)
+ buf = ra->double_buf;
+ else
+ buf = ra->triple_buf;
+
+ read_blks(ra, blk, 1, buf);
+
+ /* Iterate through the block pointers */
+ blk_ptr = (blk_t *) buf;
+ dprintf(" RA: %*s%s(%d): read block %u into buf %p\n",
+ level * 4, "", __FUNCTION__, level, blk, buf);
+
+ for (i = 0; i < limit; i++) {
+ if (blk_ptr[i] != 0)
+ readahead_ind_blk(ra, blk_ptr[i], level - 1, 0);
+ }
+}
+
+/*
+ * Perform breadth-first traversal on the indirect blocks of the inode.
+ */
+
+static void readahead_inode(struct readahead_state *ra,
+ struct ext2_inode *inode,
+ ext2_ino_t ino)
+{
+ dprintf("%s: %d\n", __FUNCTION__, ino);
+
+ /* XXX Read acls and xattrs too */
+
+ if (inode->i_block[EXT2_IND_BLOCK])
+ readahead_ind_blk(ra, inode->i_block[EXT2_IND_BLOCK], 0, 1);
+ if (inode->i_block[EXT2_DIND_BLOCK])
+ readahead_ind_blk(ra, inode->i_block[EXT2_DIND_BLOCK], 1, 1);
+ if (inode->i_block[EXT2_TIND_BLOCK])
+ readahead_ind_blk(ra, inode->i_block[EXT2_TIND_BLOCK], 2, 1);
+}
+
+/*
+ * Start readahead for a set of inode tables.
+ *
+ * XXX Inode table readahead really, really ought to be parallelized,
+ * but the cache and io slot management is pretty hairy. Why?
+ *
+ * - You can't just add the blocks to the cache for more than one
+ * inode table at once because the main thread will think the indirect
+ * blocks for the current inode table are ready and go read them -
+ * which will kill performance because reading indirect blocks out of
+ * order is pretty fatally slow.
+ *
+ * - You have to reserve some cache for indirect block reads for the
+ * current block group to complete or else readahead won't be able to
+ * progress, since all the cache will be locked up as pending and it
+ * will never make any of it available to the main thread. To merely
+ * prevent deadlocks, you must reserve at least enough blocks for the
+ * maximum io size. To make it perform well, you have to reserve more
+ * than that. How much is enough? Don't know.
+ *
+ * - To make sure you can read inode tables in parallel, you want to
+ * make sure you have enough io slots free. Again, how many? Should
+ * they be always reserved or can indirect block reads be able to use
+ * them? Should you just issue all outstanding ios when you start
+ * reading a block group? But then you have to change issue_ios so it
+ * doesn't mark the blocks as available in cache.
+ *
+ * Possible solutions:
+ *
+ * - Track inode table blocks and indirect blocks separately. Messes
+ * up nice clean generic cache interface.
+ *
+ * - Track current inode number and don't let main thread process
+ * beyond that. Have to worry about cache/inode progress deadlocks.
+ *
+ * - Reserve cache/io for inode table readaheads. Danger of deadlocks
+ * waiting for cache waiting for io waiting for cache, etc.. How much
+ * to reserve not clear.
+ */
+
+static void readahead_inode_tables(struct readahead_state *ra, dgrp_t group,
+ int count)
+{
+ int i;
+
+ dprintf(" RA: %s: Starting groups %u-%u cache_used %u\n",
+ __FUNCTION__, group, group + count - 1, ra->cache_used);
+
+ for (i = 0; i < count; i++)
+ queue_blks(ra, ra->fs->group_desc[group + i].bg_inode_table,
+ ra->fs->inode_blocks_per_group);
+ /*
+ * Kick off the io immediately since we'll need the blocks
+ * right away.
+ */
+ issue_ios(ra);
+}
+
+/*
+ * At the end of each block group, issue readahead for the next block
+ * group(s).
+ */
+
+static errcode_t readahead_scan_callback(ext2_filsys fs,
+ ext2_inode_scan scan EXT2FS_ATTR((unused)),
+ dgrp_t group, void * priv_data)
+{
+ struct readahead_state *ra = (struct readahead_state *) priv_data;
+ dgrp_t next_group = group + 1;
+
+ dprintf(" RA: %s: Finished group %u cache_used %d\n", __FUNCTION__,
+ group, ra->cache_used);
+
+ /* Any more blockgroups? */
+ if (next_group == fs->group_desc_count)
+ return 0;
+
+ /* Start readahead for next inode table(s) */
+ /* XXX count is always 1 for now - see comment */
+ readahead_inode_tables(ra, next_group, 1);
+ return 0;
+}
+
+/*
+ * Theory of readahead thread exit/kill:
+ *
+ * The readahead thread will end for one of two reasons: (1) This
+ * particular pass is successfully completed, (2) The main thread
+ * wants to kill readahead entirely because some pass was aborted.
+ * When either of these things happens, we need to exit the readahead
+ * thread safely and then run the exit function for this pass in the
+ * main thread's context. The nasty bit is doing this while making
+ * sure we don't leave the cache mutex locked. We really have to
+ * avoid the main thread blocking in any situation so that readahead
+ * doesn't make fsck *less* stable. Pthreads doesn't offer any
+ * reasonable facilities for doing this, though.
+ *
+ * The solution is exactly the same one as the main fsck thread: Block
+ * cancellation and occasionally check at predetermined safe places if
+ * we should exit now.
+ *
+ * So the interface for cleaning up properly after passes goes like
+ * this:
+ *
+ * In the pass-specific init routine, set the thread_exit pointer to
+ * your exit function - after completing all setup that will be torn
+ * down on exit.
+ *
+ * Call readahead_thread_setup at start of the pass-specific main
+ * routine to disable cancellation.
+ *
+ * When the pass completes normally, it should call
+ * readahead_thread_exit(). This will exit the thread. Pass-specific
+ * clean up will not happen until the main thread calls the pass exit
+ * function (clean up should be done in the same context as
+ * initialization).
+ *
+ * If something goes drastically wrong, the pass has to be restarted,
+ * or readahead needs to be disabled immediately for any reason, call
+ * ext2fs_readahead_kill(). This will set the "exit and cleanup"
+ * variable. The readahead thread will eventually check it and
+ * correctly clean up and kill the current readahead thread. From
+ * this point on, readahead is disabled and no future readahead
+ * requests will actually start. If you want readahead to start
+ * again, start over again with ext2fs_readahead_init().
+ *
+ * Throughout the pass-specific readahead, use readahead_test_exit()
+ * to frequently check for the exit condition (e.g., don't do too
+ * much IO without checking for exit).
+ */
+
+/*
+ * Generic post-thread create setup. Run it first thing as soon as
+ * the thread is created (in the new thread's context).
+ */
+
+static void readahead_thread_setup(struct readahead_state *ra)
+{
+ /*
+ * Disable cancellation so that we can exit only at safe
+ * points.
+ */
+ pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, NULL);
+}
+
+/*
+ * Normal pass completion exit handler.
+ */
+
+static void readahead_thread_exit(struct readahead_state *ra)
+{
+ finish_ios(ra);
+}
+
+/*
+ * Check if readahead should exit entirely. Currently it's just a
+ * check and an exit, but if anything is added, note that we do not
+ * want to do anything fancy because we're trying to kill all
+ * readahead activity as soon as possible without doing anything
+ * dangerous. All pass-specific cleanup is done in the context of the
+ * caller.
+ *
+ * This function may not be called with the cache mutex held.
+ */
+
+static void readahead_test_exit(struct readahead_state *ra)
+{
+ if (!ra->should_exit)
+ return;
+
+ fprintf(stderr, "Readahead thread dying an untimely death.\n");
+
+ ra->enabled = 0;
+ pthread_exit(NULL);
+}
+
+/*
+ * Exit now. Initiated from the readahead thread, rather than the
+ * main thread as in ext2fs_readahead_kill().
+ *
+ * This function may not be called with the cache mutex held.
+ */
+
+static void readahead_thread_exit_now(struct readahead_state *ra)
+{
+ ra->should_exit = 1;
+ readahead_test_exit(ra);
+}
+
+/*
+ * The inode readahead main function iterates over all the inodes in
+ * the file system, reading the indirect blocks if necessary, to get
+ * data in cache as efficiently as possible. The main thread issues
+ * IOs synchronously and as-needed; we can do better.
+ */
+
+void * readahead_inode_loop(void *arg)
+{
+ struct readahead_state *ra = (struct readahead_state *) arg;
+ struct ext2_inode *inode;
+ ext2_ino_t ino = 0;
+
+ readahead_thread_setup(ra);
+
+ dprintf(" RA: %s\n", __FUNCTION__);
+
+ ext2fs_set_inode_callback(ra->scan, readahead_scan_callback, ra);
+
+ /* Start first bg readahead, rest are triggered by callback */
+ readahead_inode_tables(ra, 0, 1);
+
+ do {
+ ext2fs_get_next_inode_ptr(ra->scan, &ino, &inode);
+ /* Is it interesting? Readahead its indirect blocks */
+ if (ino && (!ra->should_read_inode || ra->should_read_inode(inode)))
+ readahead_inode(ra, inode, ino);
+ /*
+ * Check after every inode for exit request; we can
+ * process a lot of inodes before hitting any other
+ * exit points.
+ */
+ readahead_test_exit(ra);
+ } while (ino);
+
+ readahead_thread_exit(ra);
+ /* Return value ignored */
+ return NULL;
+}
+
+/*
+ * Helper function for ext2fs_dblist_iterate.
+ */
+
+static int iter_dblist(ext2_filsys fs, struct ext2_db_entry *db,
+ void *priv_data)
+{
+ struct readahead_state *ra = (struct readahead_state *) priv_data;
+
+ queue_blks(ra, db->blk, 1);
+
+ return 0;
+}
+
+/*
+ * Do the directory block readahead.
+ */
+
+void * readahead_dblist(void *arg)
+{
+ struct readahead_state *ra = (struct readahead_state *) arg;
+ errcode_t err;
+
+ readahead_thread_setup(ra);
+
+ err = ext2fs_dblist_iterate(ra->dblist, iter_dblist, ra);
+
+ if (err)
+ fprintf(stderr, "Error traversing directory blocks\n");
+
+ readahead_thread_exit(ra);
+
+ /* Return value ignored */
+ return NULL;
+}
+
+/*
+ * The client routines for readahead are called only by the main
+ * thread. The rest of the readahead functions are called only by the
+ * readahead thread. The client initializes, starts, and stops the
+ * readahead thread. It informs the readahead thread when it has
+ * finished reading blocks from the cache.
+ *
+ * The distinction between the two threads is especially important
+ * when it comes to lseek()/read()/write() on the device file
+ * descriptor. The two threads must never share an fd or the lseek()s
+ * will race with each other. The readahead thread's fd is in the
+ * readahead struct; the main thread's fd is in the context struct
+ * (both are inside the ext2_filsys structure). The main symptom of
+ * this bug is getting back data which is from somewhere on disk, just
+ * not the somewhere you wanted.
+ *
+ */
+
+static int readahead_disabled(struct readahead_state *ra)
+{
+ if ((ra == NULL) || !ra->enabled)
+ return 1;
+
+ return 0;
+}
+
+/*
+ * Put the cache and associated locks into the right state to start a
+ * readahead thread.
+ */
+
+static void readahead_cache_init(struct readahead_state *ra)
+{
+ pthread_mutex_init(&ra->cache_mutex, NULL);
+ pthread_cond_init(&ra->cache_wake, NULL);
+
+ ra->cache_used = 0;
+ ra->cache_needed = 0;
+ ra->cache_readahead_done = 0;
+}
+
+errcode_t ext2fs_readahead_init(ext2_filsys fs, unsigned int max_ios,
+ unsigned int cache_max,
+ struct readahead_state **ret_ra)
+{
+ struct readahead_state *ra;
+ int retval;
+
+ dprintf("MT: %s\n", __FUNCTION__);
+
+ *ret_ra = NULL;
+
+ /*
+ * Don't initialize readahead unless explicitly enabled by the
+ * user passing the max ios argument.
+ */
+
+ if (max_ios == 0)
+ return 0;
+
+ retval = ext2fs_get_mem(sizeof(*ra), &ra);
+ if (retval)
+ return retval;
+
+ /* Reopen the device so we don't share the file pointer */
+ retval = ext2fs_open2(fs->device_name, NULL,
+ /* Don't ask for exclusive open because
+ * we definitely won't get it. */
+ (fs->flags & ~IO_FLAG_EXCLUSIVE), 0, 0,
+ fs->io->manager, &ra->fs);
+ if (retval)
+ goto out;
+
+ /* Thread exit/kill management */
+ ra->should_exit = 0;
+ ra->thread_exit = NULL;
+
+ /* IO queue set up */
+ ra->max_ios = max_ios;
+ dprintf("MT: %s: Maximum ios %u\n", __FUNCTION__, ra->max_ios);
+ ra->timeout = 100; /* milliseconds, currently not used */
+ /* Not sure how big to make this, but must be at least max_ios */
+ ra->io_queue_size = ra->max_ios * 10;
+ dprintf("MT: %s: IO queue size %u\n", __FUNCTION__, ra->io_queue_size);
+ ra->ios_queued = 0;
+ retval = ext2fs_get_mem(sizeof(ra->io_queue[0]) * ra->io_queue_size,
+ &ra->io_queue);
+ if (retval)
+ goto out_close;
+ /* Clear the queue for debugging purposes */
+ memset(ra->io_queue, 0, sizeof(ra->io_queue[0]) * ra->io_queue_size);
+
+ /* Cache set up */
+
+ /*
+ * Max memory must be at least as big as an inode table. Make
+ * the minimum four times that so we can get more than one
+ * block group ahead and read some indirect blocks too.
+ */
+ ra->cache_max = cache_max;
+ /* XXX currently measuring cache_max in file system blocks, yuck */
+ if (ra->cache_max < ra->fs->inode_blocks_per_group) {
+ ra->cache_max = 4 * ra->fs->inode_blocks_per_group;
+ if (cache_max != 0)
+ /* XXX use ext2 printing routines */
+ fprintf(stderr, "Maximum cache %u blocks too small; "
+ "raised to %u blocks", cache_max,
+ ra->cache_max);
+ }
+ dprintf("MT: %s: Maximum cache %u\n", __FUNCTION__, ra->cache_max);
+
+ readahead_cache_init(ra);
+ /*
+ * When the cache gets full, the readahead thread sleeps.
+ * When the cache gets empty, the main thread sleeps. If we
+ * wake the threads at empty/full, then we end up with an
+ * inefficient ping-pong effect where the readahead thread
+ * only issues one io before going back to sleep. The ideal
+ * pattern is where the readahead thread caches a bunch of
+ * blocks, then goes back to sleep until a lot of the cache is
+ * free again, so it can issue lots of ios in parallel. When the
+ * main thread wakes/sleeps is less important, other than
+ * avoiding a lot of signal/wake traffic if it's near the
+ * readahead thread wake point or near the cache boundaries.
+ * Wait to wake it up until the cache is close to full.
+ *
+ * XXX Should wake main thread sooner?
+ */
+
+ /* Restart readahead when the cache gets this low */
+ ra->cache_restart_ra = ra->cache_max / 3;
+ dprintf("MT: %s: Readahead restarts at %u\n", __FUNCTION__,
+ ra->cache_restart_ra);
+ ra->cache_restart_main = (ra->cache_max * 2) / 3;
+ dprintf("MT: %s: Main thread restarts at %u\n", __FUNCTION__,
+ ra->cache_restart_main);
+
+ /* Maximum io size (in blocks) should be less than the cache size */
+ /* XXX should be command line option */
+ ra->max_io_size = (256 * 1024) / ra->fs->blocksize;
+ if (ra->max_io_size > ra->cache_max) {
+ dprintf("MT: %s: Maximum io size %u larger than cache %u\n",
+ __FUNCTION__, ra->max_io_size, ra->cache_max);
+ ra->max_io_size = ra->cache_max / 4;
+ }
+ dprintf("MT: %s: Maximum io size %u\n", __FUNCTION__, ra->max_io_size);
+
+ /* Make scratch buffer big enough for largest ios */
+ retval = ext2fs_get_mem(ra->max_io_size * ra->fs->blocksize,
+ &ra->scratch_buf);
+ if (retval)
+ goto out_free_queue;
+
+ ra->enabled = 1;
+ *ret_ra = ra;
+ return 0;
+
+ out_free_queue:
+ ext2fs_free_mem(&ra->io_queue);
+ out_close:
+ ext2fs_close(ra->fs);
+ out:
+ ext2fs_free_mem(&ra);
+ fprintf(stderr, "Readahead failed to initialize.\n");
+ return retval;
+}
+
+void ext2fs_readahead_exit(struct readahead_state **ret_ra)
+{
+ struct readahead_state *ra = *ret_ra;
+
+ dprintf("MT: %s\n", __FUNCTION__);
+
+ if (!ra)
+ return;
+
+ pthread_mutex_destroy(&ra->cache_mutex);
+ pthread_cond_destroy(&ra->cache_wake);
+ ext2fs_free_mem(&ra->scratch_buf);
+ ext2fs_free_mem(&ra->io_queue);
+ ext2fs_close(ra->fs);
+ ext2fs_free_mem(&ra);
+ *ret_ra = NULL;
+}
+
+/*
+ * Release blocks that have already been read. Primitive for the
+ * various release functions.
+ */
+
+static void blocks_release(struct readahead_state *ra, ext2_filsys fs,
+ blk_t blk, int count)
+{
+ dprintf("MT: %s: cache_used %d releasing %d\n", __FUNCTION__,
+ ra->cache_used, count);
+ pthread_mutex_lock(&ra->cache_mutex);
+ ra->cache_used -= count;
+ /*
+ * Did we get ahead of readahead?
+ *
+ * XXX Main thread hangs on cache_used == 0; current
+ * workaround is to add a bogus block to the cache on exit.
+ */
+ if (ra->cache_used <= 0) {
+ dprintf("MT: %s: Cache empty, waking readahead thread\n",
+ __FUNCTION__);
+ pthread_cond_signal(&ra->cache_wake);
+ dprintf("MT: %s: Waiting for readahead to populate cache...\n",
+ __FUNCTION__);
+ /* XXX use timeout to avoid deadlock on fatal bugs */
+ pthread_cond_wait(&ra->cache_wake, &ra->cache_mutex);
+ dprintf("MT: %s: Continuing, cache_used %d\n", __FUNCTION__,
+ ra->cache_used);
+ }
+ /*
+ * Wake readahead thread if we have enough free cache to issue
+ * io efficiently.
+ */
+ if ((ra->cache_used + count) > ra->cache_restart_ra &&
+ (ra->cache_used <= ra->cache_restart_ra)) {
+ /* Do we have enough cache to satisfy any specific need? */
+ if ((ra->cache_used + ra->cache_needed <= ra->cache_max)) {
+ dprintf("MT: %s: Cache available, waking readahead "
+ "(needed %u used %d)\n", __FUNCTION__,
+ ra->cache_needed, ra->cache_used);
+ pthread_cond_signal(&ra->cache_wake);
+ /* Don't keep waking it over and over... */
+ ra->cache_needed = 0;
+ }
+ }
+ pthread_mutex_unlock(&ra->cache_mutex);
+	/* Optional but nice: let the VM know we don't need these any more */
+ io_channel_cache_release(fs->io, blk, count);
+}
+
+/*
+ * ext2fs_block_iterate2 helper function for releasing indirect
+ * blocks.
+ */
+
+static int ind_block_release(ext2_filsys fs, blk_t *block_nr,
+ e2_blkcnt_t blockcnt,
+ blk_t ref_block EXT2FS_ATTR((unused)),
+ int ref_offset EXT2FS_ATTR((unused)),
+ void *priv_data)
+{
+ struct readahead_state *ra = (struct readahead_state *) priv_data;
+
+ /*
+ * The blockcnt arg is the total number of data blocks (?)
+ * traversed so far for this inode, not the number of blocks
+ * to release.
+ */
+ blocks_release(ra, fs, *block_nr, 1);
+
+ return 0;
+}
+
+/*
+ * Mark all cached blocks from this inode as released.
+ */
+
+void ext2fs_readahead_inode_release(struct readahead_state *ra,
+ ext2_filsys fs, ext2_ino_t ino, char *block_buf)
+{
+ errcode_t err;
+
+ dprintf("MT: %s: %u\n", __FUNCTION__, ino);
+
+ if (readahead_disabled(ra))
+ return;
+
+ /*
+ * Don't use ra->fs - that contains the readahead thread's fd
+ * - using the same fd results in exciting lseek()/read()
+ * races.
+ */
+ err = ext2fs_block_iterate2(fs, ino, BLOCK_FLAG_IND_ONLY,
+ block_buf, ind_block_release, ra);
+
+ /* Can't return this error, but can notify user */
+ if (err)
+ fprintf(stderr, "%s: Error %d traversing indirect blocks "
+ "for ino %u\n", __FUNCTION__, (int) err, ino);
+}
+
+/*
+ * Mark all cached blocks from this inode table as released.
+ */
+
+void ext2fs_readahead_inode_table_release(struct readahead_state *ra,
+ ext2_filsys fs, dgrp_t group)
+{
+ if (readahead_disabled(ra))
+ return;
+
+ dprintf("MT: %s: Finished group %u cache_used %u\n",
+ __FUNCTION__, group, ra->cache_used);
+
+ blocks_release(ra, fs, ra->fs->group_desc[group].bg_inode_table,
+ ra->fs->inode_blocks_per_group);
+}
+
+/*
+ * Generic thread start. Pass it the function for starting this pass
+ * of readahead.
+ */
+
+static void readahead_thread_start(struct readahead_state *ra,
+ void * (*thread_start)(void *))
+{
+ if (pthread_create(&ra->thread, NULL, thread_start, ra)) {
+ fprintf(stderr, "Thread failed to start.\n");
+ ra->enabled = 0;
+ return;
+ }
+
+ /*
+ * Wait for readahead to put something in the cache. The
+ * readahead thread might have already read something into the
+ * cache and signaled us to wake, so only wait if nothing is
+ * in the cache already.
+ */
+ pthread_mutex_lock(&ra->cache_mutex);
+ if (ra->cache_used == 0)
+ pthread_cond_wait(&ra->cache_wake, &ra->cache_mutex);
+ pthread_mutex_unlock(&ra->cache_mutex);
+}
+
+/*
+ * Signal the current readahead thread to stop. Call it before
+ * freeing any resources that the readahead thread might access.
+ *
+ * This function must still complete if the thread has already
+ * exited, or exits while it is running.
+ */
+
+static void readahead_thread_stop(struct readahead_state *ra)
+{
+ ra->should_exit = 1;
+ /*
+ * Wake readahead thread in case it's waiting on cache. The
+ * readahead thread must test for exit after every
+ * pthread_cond_wait().
+ */
+ pthread_mutex_lock(&ra->cache_mutex);
+ pthread_cond_signal(&ra->cache_wake);
+ pthread_mutex_unlock(&ra->cache_mutex);
+ /*
+ * At this point, the readahead thread has either (a) already
+ * exited, or (b) is running but will call
+ * readahead_test_exit() shortly and exit then.
+ */
+ pthread_join(ra->thread, NULL);
+ /* Now the readahead thread is dead for sure */
+
+ /* Run the pass-specific clean up */
+ if (ra->thread_exit)
+ ra->thread_exit(ra);
+ ra->thread_exit = NULL;
+
+ readahead_cache_init(ra);
+ ra->should_exit = 0;
+
+ /*
+ * Check to see if the readahead thread encountered some
+ * problem and disabled readahead. In that case, close down
+ * readahead entirely. To restart, call
+ * ext2fs_readahead_init().
+ */
+ if (readahead_disabled(ra))
+ ext2fs_readahead_exit(&ra);
+}
+
+/*
+ * Use this to kill readahead entirely and turn it off until
+ * ext2fs_readahead_init() is called again. It is safe to call any
+ * ext2fs_readahead_* function after this since readahead will be
+ * disabled.
+ */
+
+void ext2fs_readahead_kill(struct readahead_state *ra)
+{
+ dprintf("MT: %s\n", __FUNCTION__);
+
+ if (!ra)
+ return;
+
+ readahead_thread_stop(ra);
+
+ ra->enabled = 0;
+}
+
+/*
+ * Clean up after inode readahead.
+ */
+
+static void readahead_inode_exit(struct readahead_state *ra)
+{
+ ext2fs_close_inode_scan(ra->scan);
+ ext2fs_free_mem(&ra->triple_buf);
+ ext2fs_free_mem(&ra->double_buf);
+ ra->should_read_inode = NULL;
+}
+
+/*
+ * Initialize inode readahead state.
+ */
+
+void ext2fs_readahead_inode_init(struct readahead_state *ra,
+ int (*should_read_inode)(struct ext2_inode *))
+{
+ if (readahead_disabled(ra))
+ return;
+
+ dprintf("MT: %s\n", __FUNCTION__);
+
+ /*
+ * Allocate buffers for saving triple and double indirect
+ * blocks while recursively queueing and issuing ios for the
+ * block pointers they contain. Don't need any for single
+ * indirect blocks because we don't have to read the data
+ * blocks they point to.
+ */
+
+ if (ext2fs_get_mem(ra->fs->blocksize, &ra->double_buf))
+ goto out;
+
+ if (ext2fs_get_mem(ra->fs->blocksize, &ra->triple_buf))
+ goto out_free_double;
+
+ if (ext2fs_open_inode_scan(ra->fs, 8, &ra->scan)) {
+ fprintf(stderr, "Open inode scan failed, disabling "
+ "readahead\n");
+ goto out_free_double;
+ }
+
+ ra->should_read_inode = should_read_inode;
+ ra->thread_exit = readahead_inode_exit;
+
+ return;
+
+ out_free_double:
+ ext2fs_free_mem(&ra->double_buf);
+ out:
+ fprintf(stderr, "Inode readahead failed to initialize.\n");
+ ra->enabled = 0;
+}
+
+/*
+ * Kick off inode readahead.
+ */
+
+void ext2fs_readahead_inode_start(struct readahead_state *ra)
+{
+ dprintf("MT: %s\n", __FUNCTION__);
+
+ if (readahead_disabled(ra))
+ return;
+
+ readahead_thread_start(ra, readahead_inode_loop);
+}
+
+/*
+ * Stop the inode readahead thread. readahead_thread_stop() does all
+ * the actual work, including running the thread_exit function.
+ */
+
+void ext2fs_readahead_inode_exit(struct readahead_state *ra)
+{
+ dprintf("MT: %s\n", __FUNCTION__);
+
+ if (readahead_disabled(ra))
+ return;
+
+ readahead_thread_stop(ra);
+}
+
+/*
+ * Release a block from the directory block list.
+ */
+
+void ext2fs_readahead_dblist_release(struct readahead_state *ra,
+ ext2_filsys fs, blk_t blk)
+{
+ if (readahead_disabled(ra))
+ return;
+
+ blocks_release(ra, fs, blk, 1);
+}
+
+/*
+ * Clean up after directory block readahead.
+ */
+
+static void readahead_dblist_exit(struct readahead_state *ra)
+{
+ ra->dblist = NULL;
+}
+
+/*
+ * Set up the directory block readahead thread.
+ */
+
+void ext2fs_readahead_dblist_init(struct readahead_state *ra,
+				  ext2_dblist dblist)
+{
+ dprintf("MT: %s\n", __FUNCTION__);
+
+ if (readahead_disabled(ra))
+ return;
+
+ ra->dblist = dblist;
+ ra->thread_exit = readahead_dblist_exit;
+}
+
+/*
+ * Start the directory block readahead thread.
+ *
+ * Don't alter the dblist (i.e., sort it) after starting the dblist
+ * readahead, or you'll have some fabulous race conditions. Note that
+ * ext2_dblist_iterate will sort the dblist if it's not already
+ * sorted. Sort the dblist BEFORE you call this function.
+ */
+
+void ext2fs_readahead_dblist_start(struct readahead_state *ra)
+{
+ dprintf("MT: %s\n", __FUNCTION__);
+
+ if (readahead_disabled(ra))
+ return;
+
+ readahead_thread_start(ra, readahead_dblist);
+}
+
+void ext2fs_readahead_dblist_exit(struct readahead_state *ra)
+{
+ dprintf("MT: %s\n", __FUNCTION__);
+
+ if (readahead_disabled(ra))
+ return;
+
+ readahead_thread_stop(ra);
+}
--- e2fsprogs-1.40.4.orig/e2fsck/pass2.c
+++ e2fsprogs-1.40.4/e2fsck/pass2.c
@@ -148,9 +148,17 @@ void e2fsck_pass2(e2fsck_t ctx)
if (fs->super->s_feature_compat & EXT2_FEATURE_COMPAT_DIR_INDEX)
ext2fs_dblist_sort(fs->dblist, special_dir_block_cmp);
-
+ else
+ ext2fs_dblist_sort(fs->dblist, 0);
+
+ /* Start readahead after the dblist has been sorted */
+ ext2fs_readahead_dblist_init(ctx->ra, fs->dblist);
+ ext2fs_readahead_dblist_start(ctx->ra);
+
cd.pctx.errcode = ext2fs_dblist_iterate(fs->dblist, check_dir_block,
&cd);
+ ext2fs_readahead_dblist_exit(ctx->ra);
+
if (ctx->flags & E2F_FLAG_SIGNAL_MASK)
return;
if (cd.pctx.errcode) {
@@ -778,6 +786,8 @@ static int check_dir_block(ext2_filsys f
old_op = ehandler_operation(_("reading directory block"));
cd->pctx.errcode = ext2fs_read_dir_block(fs, block_nr, buf);
+ /* Release the block now - it is already in our memory */
+ ext2fs_readahead_dblist_release(ctx->ra, fs, block_nr);
ehandler_operation(0);
if (cd->pctx.errcode == EXT2_ET_DIR_CORRUPTED)
cd->pctx.errcode = 0; /* We'll handle this ourselves */
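Condensed, the pass 2 hook below is just a bracket around the
existing dblist walk.  A sketch of the calling convention (error
handling elided; ctx->ra as in the patch):

	/* Sort first - ext2fs_dblist_iterate() sorts an unsorted
	 * dblist, which would race with the readahead thread. */
	ext2fs_dblist_sort(fs->dblist, 0);
	ext2fs_readahead_dblist_init(ctx->ra, fs->dblist);
	ext2fs_readahead_dblist_start(ctx->ra);

	cd.pctx.errcode = ext2fs_dblist_iterate(fs->dblist,
						check_dir_block, &cd);
	/*
	 * check_dir_block() releases each block right after reading it:
	 *	ext2fs_readahead_dblist_release(ctx->ra, fs, block_nr);
	 */
	ext2fs_readahead_dblist_exit(ctx->ra);
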
--- e2fsprogs-1.40.4.orig/lib/ext2fs/inode.c
+++ e2fsprogs-1.40.4/lib/ext2fs/inode.c
@@ -491,6 +491,76 @@ errcode_t ext2fs_get_next_inode_full(ext
return retval;
}
+errcode_t ext2fs_get_next_inode_ptr(ext2_inode_scan scan, ext2_ino_t *ino,
+ struct ext2_inode **inode)
+{
+	errcode_t retval = 0;	/* returned as-is on the common path */
+ int extra_bytes = 0;
+
+ EXT2_CHECK_MAGIC(scan, EXT2_ET_MAGIC_INODE_SCAN);
+
+ /*
+ * Do we need to start reading a new block group?
+ */
+ if (scan->inodes_left <= 0) {
+ force_new_group:
+ if (scan->done_group) {
+ retval = (scan->done_group)
+ (scan->fs, scan, scan->current_group,
+ scan->done_group_data);
+ if (retval)
+ return retval;
+ }
+ if (scan->groups_left <= 0) {
+ *ino = 0;
+ return 0;
+ }
+ retval = get_next_blockgroup(scan);
+ if (retval)
+ return retval;
+ }
+ /*
+ * These checks are done outside the above if statement so
+ * they can be done for block group #0.
+ */
+ if ((scan->scan_flags & EXT2_SF_DO_LAZY) &&
+ (scan->fs->group_desc[scan->current_group].bg_flags &
+ EXT2_BG_INODE_UNINIT))
+ goto force_new_group;
+ if (scan->current_block == 0) {
+ if (scan->scan_flags & EXT2_SF_SKIP_MISSING_ITABLE) {
+ goto force_new_group;
+ } else
+ return EXT2_ET_MISSING_INODE_TABLE;
+ }
+
+ /*
+ * Have we run out of space in the inode buffer? If so, we
+ * need to read in more blocks.
+ */
+ if (scan->bytes_left < scan->inode_size) {
+ memcpy(scan->temp_buffer, scan->ptr, scan->bytes_left);
+ extra_bytes = scan->bytes_left;
+
+ retval = get_next_blocks(scan);
+ if (retval)
+ return retval;
+ }
+
+ /* XXX ignoring extra_bytes logic */
+ /* Return a pointer directly to the inode buffer... Don't write it. */
+ *inode = (struct ext2_inode *) scan->ptr;
+ scan->ptr += scan->inode_size;
+ scan->bytes_left -= scan->inode_size;
+ if (scan->scan_flags & EXT2_SF_BAD_INODE_BLK)
+ retval = EXT2_ET_BAD_BLOCK_IN_INODE_TABLE;
+
+ scan->inodes_left--;
+ scan->current_inode++;
+ *ino = scan->current_inode;
+ return retval;
+}
+
errcode_t ext2fs_get_next_inode(ext2_inode_scan scan, ext2_ino_t *ino,
struct ext2_inode *inode)
{