[<prev] [next>] [<thread-prev] [day] [month] [year] [list]
Message-ID: <20140130235058.31064.21096.stgit@birch.djwong.org>
Date: Thu, 30 Jan 2014 15:50:58 -0800
From: "Darrick J. Wong" <darrick.wong@...cle.com>
To: tytso@....edu, darrick.wong@...cle.com
Cc: linux-ext4@...r.kernel.org
Subject: [PATCH 2/2] libext2fs/e2fsck: implement metadata prefetching
Use threads with our new mmap IO manager to prefetch metadata. This
results in a major e2fsck run time speedup. There's also a stupider
multiprocess version that works with the good old UNIX IO manager to
get pages into the page cache.
Signed-off-by: Darrick J. Wong <darrick.wong@...cle.com>
---
e2fsck/unix.c | 13 +
lib/ext2fs/Makefile.in | 8 +
lib/ext2fs/ext2fs.h | 13 +
lib/ext2fs/prefetch.c | 456 ++++++++++++++++++++++++++++++++++++++++++++++++
4 files changed, 490 insertions(+)
create mode 100644 lib/ext2fs/prefetch.c
diff --git a/e2fsck/unix.c b/e2fsck/unix.c
index eeeef7c..33afc06 100644
--- a/e2fsck/unix.c
+++ b/e2fsck/unix.c
@@ -1181,6 +1181,7 @@ int main (int argc, char *argv[])
__u32 features[3];
char *cp;
int qtype = -99; /* quota type */
+ struct ext2fs_prefetch_handle *h = NULL;
clear_problem_context(&pctx);
sigcatcher_setup();
@@ -1638,9 +1639,21 @@ print_unsupp_features:
quota_init_context(&ctx->qctx, ctx->fs, qtype);
}
+ if (getenv("PREFETCH")) {
+ int flags = PREFETCH_INODES | PREFETCH_DIRS | PREFETCH_THREADED;
+ if (getenv("PREFETCH_WAIT"))
+ flags &= ~PREFETCH_THREADED;
+ retval = ext2fs_prefetch(fs, flags, &h);
+ if (retval)
+ com_err(ctx->program_name, retval, "prefetching");
+ }
+
run_result = e2fsck_run(ctx);
e2fsck_clear_progbar(ctx);
+ if (h)
+ ext2fs_prefetch_free(&h);
+
if (ctx->flags & E2F_FLAG_JOURNAL_INODE) {
if (fix_problem(ctx, PR_6_RECREATE_JOURNAL, &pctx)) {
if (journal_size < 1024)
diff --git a/lib/ext2fs/Makefile.in b/lib/ext2fs/Makefile.in
index a1b5a01..9fbf2b5 100644
--- a/lib/ext2fs/Makefile.in
+++ b/lib/ext2fs/Makefile.in
@@ -73,6 +73,7 @@ OBJS= $(DEBUGFS_LIB_OBJS) $(RESIZE_LIB_OBJS) $(E2IMAGE_LIB_OBJS) \
native.o \
newdir.o \
openfs.o \
+ prefetch.o \
progress.o \
punch.o \
qcow2.o \
@@ -150,6 +151,7 @@ SRCS= ext2_err.c \
$(srcdir)/native.c \
$(srcdir)/newdir.c \
$(srcdir)/openfs.c \
+ $(srcdir)/prefetch.c \
$(srcdir)/progress.c \
$(srcdir)/punch.c \
$(srcdir)/qcow2.c \
@@ -863,6 +865,12 @@ openfs.o: $(srcdir)/openfs.c $(top_builddir)/lib/config.h \
$(srcdir)/ext2_fs.h $(srcdir)/ext3_extents.h $(top_srcdir)/lib/et/com_err.h \
$(srcdir)/ext2_io.h $(top_builddir)/lib/ext2fs/ext2_err.h \
$(srcdir)/ext2_ext_attr.h $(srcdir)/bitops.h $(srcdir)/e2image.h
+prefetch.o: $(srcdir)/prefetch.c $(top_builddir)/lib/config.h \
+ $(top_builddir)/lib/dirpaths.h $(srcdir)/ext2fs.h \
+ $(top_builddir)/lib/ext2fs/ext2_types.h $(srcdir)/ext2_fs.h \
+ $(srcdir)/ext3_extents.h $(top_srcdir)/lib/et/com_err.h $(srcdir)/ext2_io.h \
+ $(top_builddir)/lib/ext2fs/ext2_err.h $(srcdir)/ext2_ext_attr.h \
+ $(srcdir)/bitops.h $(srcdir)/ext2fsP.h
progress.o: $(srcdir)/progress.c $(top_builddir)/lib/config.h \
$(top_builddir)/lib/dirpaths.h $(srcdir)/ext2fs.h \
$(top_builddir)/lib/ext2fs/ext2_types.h $(srcdir)/ext2_fs.h \
diff --git a/lib/ext2fs/ext2fs.h b/lib/ext2fs/ext2fs.h
index ba5c388..e634d2c 100644
--- a/lib/ext2fs/ext2fs.h
+++ b/lib/ext2fs/ext2fs.h
@@ -1522,6 +1522,19 @@ errcode_t ext2fs_mmp_update2(ext2_filsys fs, int immediately);
errcode_t ext2fs_mmp_stop(ext2_filsys fs);
unsigned ext2fs_mmp_new_seq(void);
+/* prefetch.c */
+/*
+ * Flags for ext2fs_prefetch():
+ *   PREFETCH_THREADED    - prefetch in the background (thread or child
+ *                          process) instead of finishing before returning
+ *   PREFETCH_ERROR_ABORT - stop prefetching on an error instead of ignoring it
+ *   PREFETCH_BITMAPS     - load the bitmaps
+ *   PREFETCH_INODES      - read the inode tables
+ *   PREFETCH_MAPS        - walk the block maps of inodes
+ *   PREFETCH_DIRS        - read directory blocks
+ * NOTE(review): not every implementation in prefetch.c consults every flag;
+ * confirm against the code path selected there.
+ */
+#define PREFETCH_THREADED (0x00000001)
+#define PREFETCH_ERROR_ABORT (0x00000002)
+#define PREFETCH_BITMAPS (0x00000004)
+#define PREFETCH_INODES (0x00000008)
+#define PREFETCH_MAPS (0x00000010)
+#define PREFETCH_DIRS (0x00000020)
+/* Opaque handle returned through @h; release it with ext2fs_prefetch_free(). */
+struct ext2fs_prefetch_handle;
+errcode_t ext2fs_prefetch(ext2_filsys fs, int flags,
+ struct ext2fs_prefetch_handle **h);
+errcode_t ext2fs_prefetch_wait(struct ext2fs_prefetch_handle *h);
+errcode_t ext2fs_prefetch_free(struct ext2fs_prefetch_handle **h);
+
/* read_bb.c */
extern errcode_t ext2fs_read_bb_inode(ext2_filsys fs,
ext2_badblocks_list *bb_list);
diff --git a/lib/ext2fs/prefetch.c b/lib/ext2fs/prefetch.c
new file mode 100644
index 0000000..022af41
--- /dev/null
+++ b/lib/ext2fs/prefetch.c
@@ -0,0 +1,456 @@
+/*
+ * prefetch.c --- Prefetch filesystem metadata.
+ *
+ * Copyright (C) 2014 by Oracle, Darrick J. Wong.
+ *
+ * %Begin-Header%
+ * This file may be redistributed under the terms of the GNU Library
+ * General Public License, version 2.
+ * %End-Header%
+ */
+
+#define _LARGEFILE_SOURCE
+#define _LARGEFILE64_SOURCE
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE
+#endif
+
+#include <errno.h>
+#include <fcntl.h>
+#include <pthread.h>
+#include <signal.h>
+#include <stdlib.h>
+#include <time.h>
+#include <unistd.h>
+#include <sys/stat.h>
+#include <sys/syscall.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+
+#include "config.h"
+#include "ext2_fs.h"
+#include "ext2fs.h"
+
+#define USE_THREADS 1
+#define USE_SUPER 1
+
+/*
+ * State of one prefetch request.  Allocated zeroed by ext2fs_prefetch() and
+ * released by ext2fs_prefetch_free().
+ */
+struct ext2fs_prefetch_handle {
+ /* filesystem whose metadata is being prefetched */
+ ext2_filsys fs;
+ /* PREFETCH_* flags supplied by the caller */
+ int flags;
+ /* set to 1 once prefetching has completed or has been waited for */
+ int done;
+ /* in the parent: pid of the fork()ed prefetch process, else 0 */
+ pid_t pid;
+#ifdef USE_THREADS
+ /* background thread started with pthread_create(), if any */
+ pthread_t tid;
+#endif
+};
+
+/*
+ * Block-iterator callback that does nothing and returns 0.  It is passed to
+ * ext2fs_block_iterate3() when only the iteration itself is wanted and the
+ * visited blocks do not need to be read.
+ */
+static int ignore_block(ext2_filsys fs, blk64_t *blocknr, e2_blkcnt_t blockcnt,
+ blk64_t ref_blk, int ref_offset, void *priv_data)
+{
+ return 0;
+}
+
+/* Private data handed to the block-iterator callbacks via priv_data. */
+struct dirent_iterate {
+ /* scratch buffer that dirent_block() reads one block into */
+ void *buf;
+ /* PREFETCH_* flags; dirent_block() checks PREFETCH_ERROR_ABORT */
+ int flags;
+};
+
+/*
+ * Block-iterator callback used for directories: read the visited block into
+ * the scratch buffer carried in priv_data.  The data itself is discarded; the
+ * read is done only for its side effect.  Returns BLOCK_ABORT if the read
+ * fails and PREFETCH_ERROR_ABORT is set, otherwise 0.
+ */
+static int dirent_block(ext2_filsys fs, blk64_t *blocknr, e2_blkcnt_t blockcnt,
+			blk64_t ref_blk, int ref_offset, void *priv_data)
+{
+	struct dirent_iterate *ctx = priv_data;
+	errcode_t rc = io_channel_read_blk64(fs->io, *blocknr, 1, ctx->buf);
+
+	if (!rc)
+		return 0;
+
+	return (ctx->flags & PREFETCH_ERROR_ABORT) ? BLOCK_ABORT : 0;
+}
+
+/*
+ * First dumb prefetch implementation: Separate process, just read data to
+ * get it into the page cache, at least.
+ *
+ * Depending on @flags this loads the bitmaps (PREFETCH_BITMAPS), bulk-reads
+ * the inode tables (PREFETCH_INODES without PREFETCH_MAPS/PREFETCH_DIRS), or
+ * scans every inode (PREFETCH_MAPS and/or PREFETCH_DIRS), iterating its
+ * blocks, reading directory blocks and reading the inode's ACL block.
+ * Everything is read into a scratch buffer and discarded.  With
+ * PREFETCH_ERROR_ABORT the first failed read stops the prefetch; otherwise
+ * most errors are ignored.
+ */
+static void do_ext2fs_prefetch(ext2_filsys fs, int flags)
+{
+	void *buf;
+	blk64_t blk;
+	dgrp_t i;
+	ext2_inode_scan scan;
+	ext2_ino_t ino;
+	errcode_t err;
+	struct ext2_inode inode;
+	struct dirent_iterate di;
+	const int iter_flags = BLOCK_FLAG_READ_ONLY | BLOCK_FLAG_DATA_ONLY;
+	unsigned int blocks_to_read;
+
+	/* Scratch buffer large enough for one group's whole inode table. */
+	err = ext2fs_get_array(fs->blocksize, fs->inode_blocks_per_group, &buf);
+	if (err)
+		return;
+
+	/* load bitmaps */
+	if (flags & PREFETCH_BITMAPS) {
+		err = ext2fs_read_bitmaps(fs);
+		if (err && (flags & PREFETCH_ERROR_ABORT))
+			goto out;
+	}
+
+	/* load inode tables, unless the inode scan below is going to run */
+	if ((flags & PREFETCH_INODES) &&
+	    !(flags & (PREFETCH_MAPS | PREFETCH_DIRS))) {
+		for (i = 0; i < fs->group_desc_count; i++) {
+			if (ext2fs_bg_flags_test(fs, i, EXT2_BG_INODE_UNINIT))
+				continue;
+
+			blocks_to_read = fs->inode_blocks_per_group;
+			if (ext2fs_has_group_desc_csum(fs)) {
+				/* Only the used part of the table. */
+				unsigned int num_inodes =
+					fs->super->s_inodes_per_group -
+					ext2fs_bg_itable_unused(fs, i);
+				blocks_to_read = (num_inodes *
+					EXT2_INODE_SIZE(fs->super)) /
+					fs->blocksize;
+			}
+
+			blk = ext2fs_inode_table_loc(fs, i);
+			err = io_channel_read_blk64(fs->io, blk,
+						    blocks_to_read, buf);
+			if (err && (flags & PREFETCH_ERROR_ABORT))
+				goto out;
+		}
+	}
+
+	/* load inodes */
+	if (!(flags & (PREFETCH_MAPS | PREFETCH_DIRS)))
+		goto out;
+
+	/*
+	 * Without a valid scan handle nothing below can work, so bail out on
+	 * any failure here regardless of PREFETCH_ERROR_ABORT; continuing
+	 * would use (and later close) an uninitialized handle.
+	 */
+	err = ext2fs_open_inode_scan(fs, 0, &scan);
+	if (err)
+		goto out;
+
+	di.buf = buf;
+	di.flags = flags;
+	for (;;) {
+		err = ext2fs_get_next_inode_full(scan, &ino, &inode,
+						 sizeof(inode));
+		if (err || !ino)
+			break;
+		/*
+		 * NOTE(review): this function only loads the bitmaps when
+		 * PREFETCH_BITMAPS is set; confirm fs->inode_map is valid
+		 * for callers that do not pass that flag.
+		 */
+		if (!ext2fs_test_inode_bitmap2(fs->inode_map, ino))
+			continue;
+
+		/*
+		 * Directories get their blocks read; everything else is only
+		 * iterated.  NOTE(review): as in the original three-way
+		 * dispatch, PREFETCH_MAPS/PREFETCH_DIRS do not restrict which
+		 * inode types are walked here.
+		 */
+		err = ext2fs_block_iterate3(fs, ino, iter_flags, NULL,
+					    LINUX_S_ISDIR(inode.i_mode) ?
+					    dirent_block : ignore_block, &di);
+		if (err && (flags & PREFETCH_ERROR_ABORT))
+			break;
+
+		/* Also read the inode's ACL block, if it has one. */
+		blk = ext2fs_file_acl_block(fs, &inode);
+		if (!blk)
+			continue;
+		err = io_channel_read_blk64(fs->io, blk, 1, buf);
+		if (err && (flags & PREFETCH_ERROR_ABORT))
+			break;
+	}
+
+	ext2fs_close_inode_scan(scan);
+
+out:
+	ext2fs_free_mem(&buf);
+}
+
+/*
+ * pthread entry point: run the simple prefetcher on the filesystem and flags
+ * recorded in the handle passed as @data.  Always returns NULL.
+ */
+static void *prefetch_thread(void *data)
+{
+	struct ext2fs_prefetch_handle *handle = data;
+	ext2_filsys target = handle->fs;
+	int prefetch_flags = handle->flags;
+
+	do_ext2fs_prefetch(target, prefetch_flags);
+
+	return NULL;
+}
+
+/*
+ * Second, less dumb prefetch: Use threads to preload metadata in group order.
+ */
+/* One block group selected for prefetching by super_prefetch(). */
+struct super_entry {
+ /* block group number */
+ dgrp_t group;
+ /* number of inodes of this group to read (unused tail excluded) */
+ ext2_ino_t num_inodes;
+};
+
+/* Per-worker state for super_func(); filled in by super_prefetch(). */
+struct super_thread {
+ /* this worker's own filesystem handle (from ext2fs_dup_handle()) */
+ ext2_filsys fs;
+ /* thread id from pthread_create() */
+ pthread_t tid;
+ /* PREFETCH_* flags */
+ int flags;
+ /* first entry for this worker, and one past the last entry overall */
+ struct super_entry *start, *end;
+ /* stride between the entries this worker handles (the thread count) */
+ unsigned int skip_factor;
+ /* scratch buffer of blocksize * inode_blocks_per_group bytes */
+ void *buf;
+};
+
+/*
+ * Worker thread body for super_prefetch().
+ *
+ * @data is a struct super_thread.  The worker handles every skip_factor'th
+ * entry starting at t->start: first it reads those groups' inode tables into
+ * the scratch buffer, then it reads each inode, iterates its blocks (reading
+ * directory blocks) and reads the inode's ACL block.  All data is discarded.
+ * Always returns NULL.
+ */
+static void *super_func(void *data)
+{
+	struct super_thread *t = data;
+	ext2_filsys fs = t->fs;
+	int flags = t->flags;
+	void *buf = t->buf;
+	struct super_entry *e;
+	unsigned int blocks_to_read;
+	blk64_t blk;
+	ext2_ino_t i, ino;
+	struct dirent_iterate di;
+	struct ext2_inode inode;
+	const int iter_flags = BLOCK_FLAG_READ_ONLY | BLOCK_FLAG_DATA_ONLY;
+	errcode_t err;
+
+	/* Read the inode tables */
+	for (e = t->start; e < t->end; e += t->skip_factor) {
+		blocks_to_read = (e->num_inodes *
+				  EXT2_INODE_SIZE(fs->super)) / fs->blocksize;
+		blk = ext2fs_inode_table_loc(fs, e->group);
+		err = io_channel_read_blk64(fs->io, blk, blocks_to_read, buf);
+		/*
+		 * Honour PREFETCH_ERROR_ABORT: stop this worker.  (A
+		 * "continue" here was a no-op at the end of the loop body.)
+		 */
+		if (err && (flags & PREFETCH_ERROR_ABORT))
+			return NULL;
+	}
+
+	/* Scan inodes for extent/dir blocks */
+	di.buf = buf;
+	di.flags = flags;
+	for (e = t->start; e < t->end; e += t->skip_factor) {
+		for (i = 0; i < e->num_inodes; i++) {
+			/*
+			 * Inode numbers are 1-based: the first inode of
+			 * group g is g * s_inodes_per_group + 1.
+			 */
+			ino = e->group * fs->super->s_inodes_per_group + i + 1;
+			err = ext2fs_read_inode(fs, ino, &inode);
+			if (err)
+				continue;
+			/* Skip unlinked or unknown-type inodes */
+			if (!inode.i_links_count ||
+			    (inode.i_mode & 0xF000) == 0)
+				continue;
+
+			/*
+			 * Directories get their blocks read; everything else
+			 * is only iterated.  NOTE(review): as in the original
+			 * three-way dispatch, PREFETCH_MAPS/PREFETCH_DIRS do
+			 * not restrict which inode types are walked; errors
+			 * from the iteration are ignored here.
+			 */
+			err = ext2fs_block_iterate3(fs, ino, iter_flags, NULL,
+					LINUX_S_ISDIR(inode.i_mode) ?
+					dirent_block : ignore_block, &di);
+
+			/* Also read the inode's ACL block, if it has one. */
+			blk = ext2fs_file_acl_block(fs, &inode);
+			if (!blk)
+				continue;
+			err = io_channel_read_blk64(fs->io, blk, 1, buf);
+		}
+	}
+
+	return NULL;
+}
+
+/*
+ * Coordinator for the threaded prefetcher; usable directly or as a pthread
+ * entry point (@data is a struct ext2fs_prefetch_handle).
+ *
+ * Builds a list of block groups that have initialized, in-use inodes, starts
+ * worker threads (one per online CPU, or PREFETCH_THREADS from the
+ * environment) that each take every N'th group, waits for them, and sets
+ * pd->done when all workers were started and have finished.  Always returns
+ * NULL; failures simply end the prefetch early.
+ */
+static void *super_prefetch(void *data)
+{
+	struct ext2fs_prefetch_handle *pd = data;
+	ext2_filsys fs = pd->fs;
+	int flags = pd->flags;
+	void *b = NULL, *r;
+	struct super_thread *threads = NULL, *t;
+	long ncpus = sysconf(_SC_NPROCESSORS_ONLN);
+	/* sysconf() returns -1 on failure; fall back to a single worker. */
+	unsigned int num_threads = ncpus > 0 ? (unsigned int)ncpus : 1;
+	unsigned int num_started = 0;
+	unsigned int j;
+	struct super_entry *entries = NULL, *e = NULL;
+	unsigned int num_entries = 0;
+	size_t buf_size = (size_t)fs->blocksize * fs->inode_blocks_per_group;
+	const char *env;
+	dgrp_t i;
+	ext2_ino_t num_inodes;
+	errcode_t err;
+
+	/* Find all non-empty groups */
+	for (i = 0; i < fs->group_desc_count; i++) {
+		if (ext2fs_bg_flags_test(fs, i, EXT2_BG_INODE_UNINIT))
+			continue;
+
+		num_inodes = fs->super->s_inodes_per_group;
+		if (ext2fs_bg_free_inodes_count(fs, i) == num_inodes)
+			continue;
+		if (ext2fs_has_group_desc_csum(fs))
+			num_inodes -= ext2fs_bg_itable_unused(fs, i);
+		/* Grow the entry array 32 slots at a time. */
+		if (e == entries + num_entries) {
+			r = realloc(entries, (num_entries + 32) *
+				    sizeof(*entries));
+			if (r == NULL)
+				goto out;
+			entries = r;
+			e = entries + num_entries;
+			num_entries += 32;
+		}
+		e->group = i;
+		e->num_inodes = num_inodes;
+		e++;
+	}
+	num_entries = e - entries;
+
+	/* Nothing to prefetch: trivially complete. */
+	if (num_entries == 0) {
+		pd->done = 1;
+		goto out;
+	}
+
+	/* Set up the threads */
+	env = getenv("PREFETCH_THREADS");
+	if (env) {
+		/* Parse as signed so a negative value is rejected. */
+		int requested = atoi(env);
+
+		if (requested > 0)
+			num_threads = requested;
+	}
+	/* More workers than groups would only produce idle threads. */
+	if (num_threads > num_entries)
+		num_threads = num_entries;
+
+	/*
+	 * One allocation: num_threads scratch buffers followed by the
+	 * num_threads super_thread structures.
+	 */
+	err = ext2fs_get_arrayzero(num_threads, sizeof(*threads) + buf_size,
+				   &b);
+	if (err)
+		goto out;
+	threads = (struct super_thread *)((char *)b + buf_size * num_threads);
+
+	for (j = 0, t = threads; j < num_threads; j++, t++) {
+		t->flags = flags;
+		t->start = entries + j;
+		t->end = entries + num_entries;
+		t->skip_factor = num_threads;
+		t->buf = (char *)b + buf_size * j;
+		/* Each worker gets its own filesystem handle. */
+		err = ext2fs_dup_handle(fs, &t->fs);
+		if (err)
+			break;
+		t->fs->icache = NULL;
+		if (pthread_create(&t->tid, NULL, super_func, t))
+			break;
+		num_started++;
+	}
+
+	/*
+	 * Wait for every worker that was actually started -- also on the
+	 * error path, because they use buffers inside 'b' and 'entries',
+	 * which are freed below.
+	 */
+	for (j = 0, t = threads; j < num_started; j++, t++)
+		pthread_join(t->tid, NULL);
+	if (num_started == num_threads)
+		pd->done = 1;
+
+	/*
+	 * NOTE(review): the handles from ext2fs_dup_handle() are never
+	 * released here (as in the original); confirm how they should be
+	 * freed given that icache was reset above.
+	 */
+	ext2fs_free_mem(&b);
+out:
+	free(entries);
+	return NULL;
+}
+
+/*
+ * Overlay used by ext2fs_prefetch() to store a freshly opened file descriptor
+ * into the IO channel's private data in the forked child.
+ * NOTE(review): presumably mirrors the leading fields of the unix IO
+ * manager's private data structure -- confirm the layouts stay in sync.
+ */
+struct unix_private_data_hack {
+ int magic;
+ /* file descriptor; replaced with the child's own open() result */
+ int dev;
+};
+
+/*
+ * Start prefetching metadata of @fs.
+ *
+ * @flags is a mask of PREFETCH_* values.  With PREFETCH_THREADED the work is
+ * done in the background: in a thread when the filesystem uses the mmap IO
+ * manager, or in a fork()ed child when it uses the unix IO manager (with any
+ * other IO manager nothing is started).  Without PREFETCH_THREADED the
+ * prefetch runs to completion before this function returns.
+ *
+ * On success 0 is returned and *@h receives a handle that the caller must
+ * release with ext2fs_prefetch_free().  On failure an error code is returned
+ * and *@h is left untouched.
+ */
+errcode_t ext2fs_prefetch(ext2_filsys fs, int flags,
+			  struct ext2fs_prefetch_handle **h)
+{
+	struct ext2fs_prefetch_handle *pd;
+	errcode_t err;
+
+	err = ext2fs_get_memzero(sizeof(*pd), &pd);
+	if (err)
+		return err;
+	pd->fs = fs;
+	pd->flags = flags;
+
+	/* Load the rest */
+	if (flags & PREFETCH_THREADED) {
+		if (fs->io->manager == mmap_io_manager) {
+#if USE_SUPER
+			struct timespec ts;
+			err = pthread_create(&pd->tid, NULL, super_prefetch,
+					     pd);
+			if (err)
+				goto errout;
+			/* Give the prefetcher a 0.5ms head start. */
+			ts.tv_sec = 0; ts.tv_nsec = 500000;
+			nanosleep(&ts, NULL);
+#elif USE_THREADS
+			err = pthread_create(&pd->tid, NULL, prefetch_thread,
+					     pd);
+			if (err)
+				goto errout;
+#else
+			goto single_thread;
+#endif
+		} else if (fs->io->manager == unix_io_manager) {
+			pd->pid = fork();
+			if (pd->pid < 0) {
+				err = errno;
+				goto errout;
+			} else if (pd->pid == 0) {
+				/*
+				 * Child: reopen the device (presumably so the
+				 * file offset is not shared with the parent),
+				 * prefetch, and leave with _exit() so that the
+				 * parent's atexit handlers and inherited stdio
+				 * buffers are not run/flushed a second time.
+				 */
+				struct unix_private_data_hack *m =
+					fs->io->private_data;
+				int fd = open(fs->device_name, O_RDONLY);
+
+				if (fd < 0)
+					_exit(1);
+				m->dev = fd;
+				do_ext2fs_prefetch(fs, flags);
+				_exit(0);
+			}
+		}
+	} else {
+single_thread:
+#if USE_SUPER
+		super_prefetch(pd);
+#else
+		do_ext2fs_prefetch(fs, flags);
+#endif
+		pd->done = 1;
+	}
+	*h = pd;
+
+	return 0;
+errout:
+	ext2fs_free_mem(&pd);
+	return err;
+}
+
+errcode_t ext2fs_prefetch_wait(struct ext2fs_prefetch_handle *h)
+{
+ pid_t ret;
+ int status;
+
+ if (h->flags & PREFETCH_THREADED && h->done != 0) {
+ if (h->tid)
+ ret = pthread_join(h->tid, NULL);
+ if (h->pid) {
+ ret = waitpid(h->pid, &status, WNOHANG);
+ if (ret == 0)
+ kill(h->pid, SIGKILL);
+ waitpid(h->pid, NULL, 0);
+ }
+ }
+ h->done = 1;
+ return 0;
+}
+
+/*
+ * Wait for any background prefetch belonging to *@h to end, then free the
+ * handle.  A NULL @h or NULL *@h is accepted and treated as a no-op.
+ * Returns 0 for the no-op case, otherwise the result of ext2fs_free_mem().
+ */
+errcode_t ext2fs_prefetch_free(struct ext2fs_prefetch_handle **h)
+{
+	if (!h || !*h)
+		return 0;
+
+	ext2fs_prefetch_wait(*h);
+	return ext2fs_free_mem(h);
+}
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Powered by blists - more mailing lists