[<prev] [next>] [thread-next>] [day] [month] [year] [list]
Message-ID: <20100405220220.GT29604@tux1.beaverton.ibm.com>
Date: Mon, 5 Apr 2010 15:02:20 -0700
From: "Darrick J. Wong" <djwong@...ibm.com>
To: "Theodore Ts'o" <tytso@....edu>
Cc: linux-ext4 <linux-ext4@...r.kernel.org>
Subject: EXT4_IOC_MOVE_EXT file corruption!
Hi all,
I wrote a program called e4frag that deliberately tries to fragment an ext4
filesystem via EXT4_IOC_MOVE_EXT so that I could run e4defrag through its
paces. While running e4frag and e4defrag concurrently on a kernel source tree,
I discovered ongoing file corruption. It appears that if e4frag and e4defrag
hit the same file at the same time, the file ends up with a 4K data block from
somewhere else. "Somewhere else" seems to be a small chunk of binary gibberish
followed by contents from other files(!) Obviously this isn't a good thing to
see, since today it's header files but tomorrow it could be the credit card/SSN
database. :)
Ted asked me to send out a copy of the program ASAP, so the test program source
code is at the end of this message. To build it, run:
$ gcc -o e4frag -O2 -Wall e4frag.c
and then to run it:
(unpack something in /path/to/files)
$ cp -pRdu /path/to/files /path/to/intact_files
$ while true; do e4defrag /path/to/files & done
$ while true; do ./e4frag -m 500 -s random /path/to/files & done
$ while true; do diff -Naurp /path/to/intact_files /path/to/files; done
...and wait for diff to cough up differences. This seems to happen on
2.6.34-rc3, and only if e4frag and e4defrag are running concurrently. Running
e4frag or e4defrag in a serial loop doesn't produce this corruption, so I think
it's purely a concurrent access problem.
On a lark, I ran fsck afterwards:
# fsck -C -f -y /dev/sda
fsck from util-linux-ng 2.16
e2fsck 1.41.9 (22-Aug-2009)
Pass 1: Checking inodes, blocks, and sizes
Pass 2: Checking directory structure
Pass 3: Checking directory connectivity
Pass 4: Checking reference counts
Pass 5: Checking group summary information
Inode bitmap differences: -534593 -534654 -534744 -534768 -534947 -662276
-662438 -1058789 -1058850 -1059026 -1059219 -1318193 -1583270 -1583378 -1583422
-2234673 -2631973 -3156444 -3156632 -3680888 -3680950 -4204922 -4205252
-4205286
Fix? yes
/dev/sda: ***** FILE SYSTEM WAS MODIFIED *****
/dev/sda: 291596/107143168 files (4.6% non-contiguous), 7829819/428544000 blocks
Is this a sign that the extent tree is getting corrupted somehow? Ted thought
that it might have something to do with an ialloc mutex, I think.
--D
/*
* Try to fragment files.
* Copyright (C) 2010 IBM. All rights reserved.
*
* This program is licensed under the GPLv2.
* Signed-off-by: Darrick J. Wong <djwong@...ibm.com>
*/
#define _FILE_OFFSET_BITS 64
#define _XOPEN_SOURCE 600
#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>
#include <ftw.h>
#include <sys/vfs.h>
#include <sys/statfs.h>
#include <assert.h>
#include <sys/statvfs.h>
#include <errno.h>
#include <linux/magic.h>
#include <fcntl.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/param.h>
#include <unistd.h>
#include <stdlib.h>
#include <asm-generic/int-l64.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
/* Default for the -m option; a value of 0 places no user cap on the donor count. */
#define DEFAULT_MAX_DONOR_FILES 0
/* Terminator for transient status output: a carriage return rather than a newline. */
#define STATUS_NEWLINE "\r"
/* Banner printed by main() when -v is given. */
#define PROGRAM "e4frag v0.2"
/* Per-file progress state shared by the status helpers and the profile callbacks. */
struct fragment_context {
/* Path of the file being fragmented; prefixes every status line. */
const char *fpath;
/* Total number of inc_status() calls expected; set by the profile's prepare(). */
off_t max_progress;
/* Number of inc_status() calls made so far. */
off_t current_progress;
/* Last percentage handled by inc_status(); generic_frag_file() starts it at -1. */
int old_pct;
};
/* A fragmentation strategy: a name selectable with -s plus its two callbacks. */
struct fragment_profile {
/* Strategy name compared against the -s argument; NULL terminates the profiles table. */
const char *name;
/*
 * Called once per source block by generic_frag_file(); returns an open
 * donor file descriptor, or a negative value on failure.
 */
int (*get_donor_fd)(struct fragment_context *fc, off_t max_files, off_t num_blocks);
/*
 * Called once per source file before any block is moved; returns 0 on
 * success, non-zero to abort the file.
 */
int (*prepare)(struct fragment_context *fc, off_t max_files, off_t num_blocks);
};
/* User-supplied donor file cap (-m); values <= 0 are ignored by calculate_max_files(). */
static int max_donor_files = DEFAULT_MAX_DONOR_FILES;
/* statvfs() result for the path currently being processed; filled in by main(). */
static struct statvfs statvfsbuf;
/* printf-style template for donor file names, rebuilt for every source file. */
static char donor_file_template[PATH_MAX];
/* Number of donor files currently accounted for; must be 0 between source files. */
static off_t donor_files; /* expect as many donor files as blocks */
/* Strategy in use; main() points it at an entry of the profiles table. */
static struct fragment_profile *profile;
/* Non-zero when -v was given; enables the status output helpers. */
static int verbose = 0;
/*
 * Shamelessly stolen from e4defrag.c
 *
 * Argument block for EXT4_IOC_MOVE_EXT. The ioctl is issued on the original
 * file's descriptor; the donor is named by donor_fd.
 */
struct move_extent {
__s32 reserved; /* never set explicitly here; zeroed by the memset in generic_frag_file() */
__u32 donor_fd; /* donor file descriptor */
__u64 orig_start; /* logical start offset in block for orig */
__u64 donor_start; /* logical start offset in block for donor */
__u64 len; /* block length to be moved */
__u64 moved_len; /* moved block length; reset to 0 before every call (presumably written back by the kernel, as the ioctl is _IOWR) */
};
/* Fallback definition for when the included headers do not provide the ioctl number. */
#ifndef EXT4_IOC_MOVE_EXT
#define EXT4_IOC_MOVE_EXT _IOWR('f', 15, struct move_extent)
#endif
/* end stuff from e4defrag */
/*
 * Print a permanent, newline-terminated status line of the form
 * "<path>: <str>" and flush it. Does nothing unless verbose is set.
 */
void print_status(struct fragment_context *fc, const char *str)
{
	if (verbose) {
		printf("%s: %s\n", fc->fpath, str);
		fflush(stdout);
	}
}
/*
 * Print a transient status line of the form "<path>: <str>" terminated by
 * STATUS_NEWLINE instead of a newline, then flush it. Does nothing unless
 * verbose is set.
 */
void emit_status(struct fragment_context *fc, const char *str)
{
	if (verbose) {
		printf("%s: %s" STATUS_NEWLINE, fc->fpath, str);
		fflush(stdout);
	}
}
/*
 * Record one unit of completed work for fc and, when the integer percentage
 * changes, print it (verbose mode only) and flush stdout.
 *
 * fc->max_progress starts out as 0 in generic_frag_file() and only becomes
 * meaningful once the profile's prepare() has run, so a non-positive total is
 * treated as "no total known yet": the tick is still counted but no
 * percentage is computed, which avoids a division by zero.
 */
void inc_status(struct fragment_context *fc)
{
	int pct;

	fc->current_progress++;
	if (fc->max_progress <= 0)
		return;

	pct = 100 * fc->current_progress / fc->max_progress;
	if (pct != fc->old_pct) {
		if (verbose)
			printf("%s: %d%%" STATUS_NEWLINE, fc->fpath, pct);
		fflush(stdout);
		fc->old_pct = pct;
	}
}
/*
 * Unlink every donor file currently accounted for in donor_files, counting
 * down from the highest index, and tick the progress counter once per file.
 *
 * fc            progress context passed through to inc_status().
 * report_errors when non-zero, the first unlink() failure is reported with
 *               perror() and its return value is returned immediately; when
 *               zero, unlink() failures are deliberately ignored (best-effort
 *               cleanup).
 *
 * Returns 0 when the loop ran to completion, otherwise the failing unlink()
 * return value.
 */
int cleanup_donor_files(struct fragment_context *fc, int report_errors)
{
	int ret;
	char tmp_inode_name[PATH_MAX];

	while (donor_files) {
		donor_files--;
		/*
		 * The template carries a %lu conversion; off_t may be wider than
		 * unsigned long (32-bit builds with _FILE_OFFSET_BITS 64), so
		 * convert explicitly instead of passing a mismatched type.
		 */
		snprintf(tmp_inode_name, PATH_MAX, donor_file_template,
			 (unsigned long)donor_files);
		ret = unlink(tmp_inode_name);
		if (report_errors && ret) {
			perror(tmp_inode_name);
			return ret;
		}
		inc_status(fc);
	}
	return 0;
}
/*
 * Work out how many donor files of num_blocks blocks each may be used.
 *
 * The upper bound is the number of such files that fit into the blocks
 * reported available by statvfsbuf. A positive max_donor_files lowers that
 * bound, but never raises it.
 */
off_t calculate_max_files(off_t num_blocks)
{
	off_t fits_in_free_space = statvfsbuf.f_bavail / num_blocks;

	/* The user's limit only applies when it is set and space allows more. */
	if (max_donor_files <= 0 || fits_in_free_space <= max_donor_files)
		return fits_in_free_space;
	return max_donor_files;
}
/*
 * Build the printf-style donor name template "<fpath>.%lu.defrag" in
 * donor_file_template. Every '%' in fpath is doubled so that a file name
 * containing a percent sign cannot be misread as a conversion specifier when
 * the template is later handed to snprintf().
 *
 * Returns 0 on success, or -1 with errno set to ENAMETOOLONG when the
 * template does not fit.
 */
static int build_donor_template(const char *fpath)
{
	static const char suffix[] = ".%lu.defrag";
	size_t used = 0;
	const char *p;

	for (p = fpath; *p; p++) {
		size_t need = (*p == '%') ? 2 : 1;

		/* sizeof(suffix) includes the terminating NUL. */
		if (used + need + sizeof(suffix) > sizeof(donor_file_template)) {
			errno = ENAMETOOLONG;
			return -1;
		}
		if (*p == '%')
			donor_file_template[used++] = '%';
		donor_file_template[used++] = *p;
	}
	memcpy(donor_file_template + used, suffix, sizeof(suffix));
	return 0;
}

/*
 * Fragment one file using the strategy fp.
 *
 * Every logical block of fpath, from the last to the first, is swapped via
 * EXT4_IOC_MOVE_EXT with the same logical block of a donor file obtained from
 * fp->get_donor_fd(). Directories, other non-regular files and files shorter
 * than two blocks are skipped.
 *
 * fpath  path of the file to fragment.
 * sb     stat information for fpath.
 * fp     strategy providing the prepare() and get_donor_fd() callbacks.
 *
 * Returns 0 on success or when the file was skipped, non-zero on failure
 * (-errno when the file cannot be opened, otherwise the failing callback's or
 * system call's return value). On every exit path taken after the donor
 * bookkeeping was reset, the donor files accounted for in donor_files are
 * unlinked again so that the next file starts from a clean state.
 */
int generic_frag_file(const char *fpath, const struct stat *sb, struct fragment_profile *fp)
{
	struct fragment_context fc;
	struct move_extent move_data;
	off_t num_blocks, block, max_files;
	int ret, donor_fd, fd;

	fc.fpath = fpath;
	fc.max_progress = 0;
	fc.current_progress = 0;
	fc.old_pct = -1;

	/* Screen out non-files or single-block files. */
	if (!S_ISREG(sb->st_mode))
		return 0;
	num_blocks = sb->st_size / statvfsbuf.f_bsize;
	if (sb->st_size % statvfsbuf.f_bsize)
		num_blocks++;
	if (num_blocks < 2)
		return 0;

	fd = open(fpath, O_RDWR);
	if (fd < 0) {
		/* Capture errno first: perror() is allowed to change it. */
		ret = -errno;
		perror(fpath);
		return ret;
	}

	/* Kernel can return -ENODATA if we don't sync the source file first. */
	emit_status(&fc, "syncing...");
	fsync(fd);
	emit_status(&fc, " ");

	/* Prepare for donor files */
	assert(!donor_files);
	donor_files = 0;
	if (build_donor_template(fpath)) {
		perror(fpath);
		ret = -1;
		goto out_close;
	}

	/* Figure out the maximum donor file count for this file */
	max_files = calculate_max_files(num_blocks);
	ret = fp->prepare(&fc, max_files, num_blocks);
	if (ret)
		goto out_cleanup;

	/* Start moving blocks */
	memset(&move_data, 0, sizeof(move_data));
	move_data.len = 1;
	for (block = num_blocks - 1; block >= 0; block--) {
		donor_fd = fp->get_donor_fd(&fc, max_files, num_blocks);
		if (donor_fd < 0) {
			/* Propagate the failure instead of a stale 0. */
			ret = donor_fd;
			goto out_cleanup;
		}

		/* Swap blocks */
		/* NB: Source and donor logical block must be the same. */
		move_data.donor_fd = donor_fd;
		move_data.orig_start = move_data.donor_start = block;
		move_data.moved_len = 0;
		ret = ioctl(fd, EXT4_IOC_MOVE_EXT, &move_data);
		if (ret < 0) {
			perror(fpath);
			close(donor_fd);
			goto out_cleanup;
		}

		ret = close(donor_fd);
		if (ret) {
			perror("closing donor file");
			goto out_cleanup;
		}
		inc_status(&fc);
	}
	ret = 0;

out_cleanup:
	/* Best effort: unlink failures are ignored here. */
	cleanup_donor_files(&fc, 0);
	if (!ret)
		print_status(&fc, "Done.");
out_close:
	close(fd);
	return ret;
}
/*
 * The "reverse" strategy: every source block gets a donor file of its own
 * and is swapped with it, and the accumulated donor files are flushed out
 * from time to time. Because the source file is walked from its last block
 * to its first, the hope is that as many extent blocks as possible end up
 * scattered all over the filesystem.
 *
 * Preparation only sets the progress budget: three ticks per block (donor
 * creation, block move, donor removal). Always returns 0.
 */
int reverse_prepare(struct fragment_context *fc, off_t max_files, off_t num_blocks)
{
	(void)max_files;	/* not needed by this strategy */

	fc->max_progress = num_blocks * 3;
	return 0;
}
/*
 * Create a fresh donor file for the next block of the "reverse" strategy.
 *
 * Once more than max_files donors have piled up they are unlinked first.
 * The new donor is created exclusively and preallocated to the full length
 * of the source file (num_blocks blocks), since the swap uses the same
 * logical block number in both files.
 *
 * Returns an open, writable descriptor for the donor, or -1 on failure with
 * errno describing the cause (EBUSY when the donor name already exists or
 * cannot be created).
 */
int reverse_get_donor_fd(struct fragment_context *fc, off_t max_files, off_t num_blocks)
{
	char tmp_inode_name[PATH_MAX];
	int donor_fd, ret;

	/* Clean out donor files */
	if (donor_files > max_files) {
		ret = cleanup_donor_files(fc, 1);
		if (ret)
			return -1;
	}

	/* Create hidden donor inode */
	snprintf(tmp_inode_name, PATH_MAX, donor_file_template,
		 (unsigned long)donor_files);
	donor_fd = open(tmp_inode_name, O_WRONLY | O_CREAT | O_EXCL, S_IRUSR);
	if (donor_fd < 0) {
		perror(tmp_inode_name);
		fprintf(stderr, "Is the fragmenter already running?\n");
		errno = EBUSY;
		return -1;
	}
	/*
	 * Count the file only now that it is known to be ours, so that a later
	 * cleanup never unlinks a file this process did not create.
	 */
	donor_files++;

	/* Allocate space in the donor file */
	ret = posix_fallocate(donor_fd, 0, num_blocks * statvfsbuf.f_bsize);
	if (ret) {
		/*
		 * posix_fallocate() returns the error number and leaves errno
		 * alone; it must not be handed back as if it were a descriptor.
		 */
		errno = ret;
		perror(tmp_inode_name);
		close(donor_fd);
		errno = ret;
		return -1;
	}

	inc_status(fc);
	return donor_fd;
}
/*
 * So, to "randomize" the source logical block numbers, create a bunch
 * of donor files. For each block, pick a donor file at random and
 * swap blocks with it.
 *
 * This is the preparation step: it sets the progress budget (one tick per
 * block moved plus two per donor, for creation and removal) and creates
 * max_files donor files, each preallocated to num_blocks blocks.
 *
 * Donors are created owner-readable AND owner-writable: random_get_donor_fd()
 * reopens them with O_WRONLY, which an unprivileged user cannot do on a file
 * created with read permission only.
 *
 * Returns 0 on success, -1 on failure. On failure donor_files holds the
 * number of donors created so far, so the caller can clean them up; a donor
 * whose preallocation failed is removed here because it is not counted.
 */
int random_prepare(struct fragment_context *fc, off_t max_files, off_t num_blocks)
{
	int donor_fd, ret;
	char tmp_inode_name[PATH_MAX];

	fc->max_progress = num_blocks + (2 * max_files);

	/* Allocate the donor files */
	for (donor_files = 0; donor_files < max_files; donor_files++) {
		/* Create donor inode */
		snprintf(tmp_inode_name, PATH_MAX, donor_file_template,
			 (unsigned long)donor_files);
		donor_fd = open(tmp_inode_name, O_WRONLY | O_CREAT | O_EXCL,
				S_IRUSR | S_IWUSR);
		if (donor_fd < 0) {
			perror(tmp_inode_name);
			fprintf(stderr, "Is a fragmenter already running?\n");
			return -1;
		}

		/* Allocate space in the donor file */
		ret = posix_fallocate(donor_fd, 0, num_blocks * statvfsbuf.f_bsize);
		if (ret) {
			/* posix_fallocate() returns the error; errno is not set. */
			errno = ret;
			perror(tmp_inode_name);
			close(donor_fd);
			unlink(tmp_inode_name);
			return -1;
		}

		close(donor_fd);
		inc_status(fc);
	}
	return 0;
}
int random_get_donor_fd(struct fragment_context *fc, off_t max_files, off_t num_blocks)
{
char tmp_inode_name[PATH_MAX];
int donor_fd;
off_t donor = random() * max_files / RAND_MAX;
/* Reopen donor inode */
snprintf(tmp_inode_name, PATH_MAX, donor_file_template, donor);
donor_fd = open(tmp_inode_name, O_WRONLY, S_IRUSR);
if (donor_fd < 0) {
perror(tmp_inode_name);
errno = EBUSY;
return -1;
}
return donor_fd;
}
/*
 * Table of available strategies, terminated by an entry whose name is NULL.
 * The first entry is the default used when -s is not given.
 */
static struct fragment_profile profiles[] = {
	{
		.name = "random",
		.get_donor_fd = random_get_donor_fd,
		.prepare = random_prepare,
	},
	{
		.name = "reverse",
		.get_donor_fd = reverse_get_donor_fd,
		.prepare = reverse_prepare,
	},
	{ .name = NULL },
};
/*
 * nftw()-compatible callback: fragment fpath with the currently selected
 * strategy. The type flag and walk state are not used. Returns whatever
 * generic_frag_file() returns.
 */
int fragment_file(const char *fpath, const struct stat *sb, int typeflag,
		  struct FTW *ftwbuf)
{
	int result;

	(void)typeflag;
	(void)ftwbuf;

	result = generic_frag_file(fpath, sb, profile);
	return result;
}
/* Write the usage summary and the description of each option to stdout. */
void print_help(char *progname)
{
	printf("Usage: %s [-m max_files] [-s random|reverse] [-v] pathspec [pathspecs...]\n", progname);
	fputs("-m Number of donor files to create while fragmenting. 0 = automatic\n", stdout);
	fputs("-s Set fragmentation strategy. (\"reverse\" or \"random\" (default))\n", stdout);
	fputs("-v Print progress indicators.\n", stdout);
}
/*
 * Entry point: parse the options, then fragment every pathspec in turn.
 *
 * Options:
 *   -m N     cap on the number of donor files (must be a number)
 *   -s NAME  strategy name, looked up in the profiles table
 *   -v       enable progress output
 *
 * Directories are walked with nftw() (staying on one filesystem and not
 * following symlinks); anything else is handed to fragment_file() directly.
 * Processing stops at the first pathspec that cannot be examined or that is
 * not on a filesystem with the ext2/3/4 magic number.
 *
 * Returns 0 when everything succeeded (or when run without any arguments,
 * which just prints the help), and 1 on a usage error or when any pathspec
 * failed.
 */
int main(int argc, char *argv[])
{
	struct fragment_profile *fp;
	struct statfs statfsbuf;
	struct stat statbuf;
	char *end;
	long val;
	int i, ret, opt;
	int status = 0;

	profile = profiles;
	if (argc < 2) {
		print_help(argv[0]);
		return 0;
	}

	while ((opt = getopt(argc, argv, "vm:s:")) != -1) {
		switch (opt) {
		case 'm':
			/* strtol() instead of atoi() so that garbage is rejected. */
			errno = 0;
			val = strtol(optarg, &end, 10);
			max_donor_files = (int)val;
			if (end == optarg || *end != '\0' || errno == ERANGE ||
			    (long)max_donor_files != val) {
				fprintf(stderr, "%s: invalid donor file count \"%s\"\n",
					argv[0], optarg);
				print_help(argv[0]);
				return 1;
			}
			break;
		case 's':
			fp = profiles;
			while (fp->name) {
				if (!strcmp(fp->name, optarg)) {
					profile = fp;
					break;
				}
				fp++;
			}
			if (!fp->name) {
				print_help(argv[0]);
				return 1;
			}
			break;
		case 'v':
			verbose = 1;
			break;
		default:
			print_help(argv[0]);
			return 1;
		}
	}

	/* Options without anything to work on is a usage error. */
	if (optind >= argc) {
		print_help(argv[0]);
		return 1;
	}

	if (verbose)
		printf(PROGRAM ", strategy \"%s\" max donors %d.\n", profile->name, max_donor_files);

	for (i = optind; i < argc; i++) {
		/* ignore files on non-ext4 filesystems */
		ret = statfs(argv[i], &statfsbuf);
		if (ret) {
			perror(argv[i]);
			status = 1;
			break;
		}
		if (statfsbuf.f_type != EXT3_SUPER_MAGIC) {
			fprintf(stderr, "%s: Ignoring file on non-ext2/3/4 filesystem.\n", argv[i]);
			status = 1;
			break;
		}
		ret = stat(argv[i], &statbuf);
		if (ret) {
			perror(argv[i]);
			status = 1;
			break;
		}
		ret = statvfs(argv[i], &statvfsbuf);
		if (ret) {
			perror(argv[i]);
			status = 1;
			break;
		}

		if (S_ISDIR(statbuf.st_mode))
			ret = nftw(argv[i], fragment_file, 64, FTW_MOUNT | FTW_PHYS);
		else
			ret = fragment_file(argv[i], &statbuf, 0, NULL);
		/* Remember the failure so it reaches the exit status. */
		if (ret)
			status = 1;
	}

	sync();
	return status;
}
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Powered by blists - more mailing lists