lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [thread-next>] [day] [month] [year] [list]
Message-ID: <20100405220220.GT29604@tux1.beaverton.ibm.com>
Date:	Mon, 5 Apr 2010 15:02:20 -0700
From:	"Darrick J. Wong" <djwong@...ibm.com>
To:	"Theodore Ts'o" <tytso@....edu>
Cc:	linux-ext4 <linux-ext4@...r.kernel.org>
Subject: EXT4_IOC_MOVE_EXT file corruption!

Hi all,

I wrote a program called e4frag that deliberately tries to fragment an ext4
filesystem via EXT4_IOC_MOVE_EXT so that I could run e4defrag through its
paces.  While running e4frag and e4defrag concurrently on a kernel source tree,
I discovered ongoing file corruption.  It appears that if e4frag and e4defrag
hit the same file at the same time, the file ends up with a 4K data block from
somewhere else.  "Somewhere else" seems to be a small chunk of binary gibberish
followed by contents from other files(!)  Obviously this isn't a good thing to
see, since today it's header files but tomorrow it could be the credit card/SSN
database. :)

Ted asked me to send out a copy of the program ASAP, so the test program source
code is at the end of this message.  To build it, run:

$ gcc -o e4frag -O2 -Wall e4frag.c

and then to run it:

(unpack something in /path/to/files)
$ cp -pRdu /path/to/files /path/to/intact_files
$ while true; do e4defrag /path/to/files & done
$ while true; do ./e4frag -m 500 -s random /path/to/files & done
$ while true; do diff -Naurp /path/to/intact_files /path/to/files; done

...and wait for diff to cough up differences.  This seems to happen on
2.6.34-rc3, and only if e4frag and e4defrag are running concurrently.  Running
e4frag or e4defrag in a serial loop doesn't produce this corruption, so I think
it's purely a concurrent access problem.

On a lark, I ran fsck afterwards:

# fsck -C -f -y /dev/sda
fsck from util-linux-ng 2.16
e2fsck 1.41.9 (22-Aug-2009)
Pass 1: Checking inodes, blocks, and sizes
Pass 2: Checking directory structure                                           
Pass 3: Checking directory connectivity                                        
Pass 4: Checking reference counts
Pass 5: Checking group summary information                                     
Inode bitmap differences:  -534593 -534654 -534744 -534768 -534947 -662276
-662438 -1058789 -1058850 -1059026 -1059219 -1318193 -1583270 -1583378 -1583422
-2234673 -2631973 -3156444 -3156632 -3680888 -3680950 -4204922 -4205252
-4205286
Fix? yes

                                                                               
/dev/sda: ***** FILE SYSTEM WAS MODIFIED *****
/dev/sda: 291596/107143168 files (4.6% non-contiguous), 7829819/428544000 blocks

Is this a sign that the extent tree is getting corrupted somehow?  Ted thought
that it might have something to do with an ialloc mutex, I think.

--D

/*
 * Try to fragment files.
 * Copyright (C) 2010 IBM.  All rights reserved.
 *
 * This program is licensed under the GPLv2.
 * Signed-off-by: Darrick J. Wong <djwong@...ibm.com>
 */
#define _FILE_OFFSET_BITS	64
#define _XOPEN_SOURCE		600
#define _GNU_SOURCE

#include <stdio.h>
#include <string.h>
#include <ftw.h>
#include <sys/vfs.h>
#include <sys/statfs.h>
#include <assert.h>
#include <sys/statvfs.h>
#include <errno.h>
#include <limits.h>
#include <linux/magic.h>
#include <fcntl.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/param.h>
#include <unistd.h>
#include <stdlib.h>
#include <asm-generic/int-l64.h>
#include <sys/ioctl.h>
#include <sys/mman.h>

#define DEFAULT_MAX_DONOR_FILES	0
#define STATUS_NEWLINE		"\r"
#define PROGRAM			"e4frag v0.2"

/* Per-file progress state handed to the status helpers and profile callbacks. */
struct fragment_context {
	/* Path of the file being processed; used as the prefix of status output. */
	const char *fpath;
	/* Total number of steps expected; set by the profile's prepare() callback. */
	off_t max_progress;
	/* Steps completed so far; incremented by inc_status(). */
	off_t current_progress;
	/* Last percentage computed by inc_status(); starts at -1. */
	int old_pct;
};

/* A fragmentation strategy: a name (matched against -s) plus two callbacks. */
struct fragment_profile {
	/* Strategy name compared against the -s argument; NULL ends the table. */
	const char *name;
	/*
	 * Called once per block by generic_frag_file(); returns an open donor
	 * file descriptor, which the caller closes, or a negative value on failure.
	 */
	int (*get_donor_fd)(struct fragment_context *fc, off_t max_files, off_t num_blocks);
	/*
	 * Called once per file before any block is moved; a non-zero return
	 * makes generic_frag_file() give up on the file.
	 */
	int (*prepare)(struct fragment_context *fc, off_t max_files, off_t num_blocks);
};

/* Upper bound on donor files from -m; values <= 0 leave the bound to free space. */
static int max_donor_files = DEFAULT_MAX_DONOR_FILES;
/* statvfs() result for the path currently being processed (block size, free blocks). */
static struct statvfs statvfsbuf;
/* printf-style template for donor file names; rebuilt for every source file. */
static char donor_file_template[PATH_MAX];
/* Number of donor files currently tracked for the file being processed. */
static off_t donor_files; /* expect as many donor files as blocks */
/* Strategy selected with -s; main() points it at the first profiles[] entry by default. */
static struct fragment_profile *profile;
/* Non-zero when -v was given; gates all progress output. */
static int verbose = 0;

/* Shamelessly stolen from e4defrag.c */

/*
 * Argument block for the EXT4_IOC_MOVE_EXT ioctl.  This program zeroes the
 * whole structure, passes the original file as the ioctl descriptor, and
 * fills in donor_fd, the two start offsets and len before each call.
 */
struct move_extent {
	__s32 reserved;		/* left zero by this program */
	__u32 donor_fd;		/* donor file descriptor */
	__u64 orig_start;       /* logical start offset in block for orig */
	__u64 donor_start;      /* logical start offset in block for donor */
	__u64 len;		/* block length to be moved */
	__u64 moved_len;	/* moved block length */
};

/* Fallback definition of the ioctl request for headers that do not provide it. */
#ifndef EXT4_IOC_MOVE_EXT
#define EXT4_IOC_MOVE_EXT      _IOWR('f', 15, struct move_extent)
#endif

/* end stuff from e4defrag */

/*
 * Print "<path>: <str>" followed by a newline and flush stdout.
 * Does nothing unless verbose output was requested.
 */
void print_status(struct fragment_context *fc, const char *str)
{
	if (verbose) {
		printf("%s: %s\n", fc->fpath, str);
		fflush(stdout);
	}
}

/*
 * Print "<path>: <str>" terminated by STATUS_NEWLINE instead of a newline,
 * then flush stdout.  Does nothing unless verbose output was requested.
 */
void emit_status(struct fragment_context *fc, const char *str)
{
	if (verbose) {
		printf("%s: %s" STATUS_NEWLINE, fc->fpath, str);
		fflush(stdout);
	}
}

/*
 * Record one completed step for the current file.  When the integer
 * percentage changes it is remembered in fc->old_pct and, in verbose mode,
 * printed as "<path>: <pct>%" terminated by STATUS_NEWLINE.
 */
void inc_status(struct fragment_context *fc)
{
	int pct;

	fc->current_progress++;

	/*
	 * max_progress starts out as 0 and is only filled in by the profile's
	 * prepare() callback; never divide by an unset total.
	 */
	if (fc->max_progress <= 0)
		return;

	pct = 100 * fc->current_progress / fc->max_progress;
	if (pct == fc->old_pct)
		return;

	if (verbose) {
		printf("%s: %d%%" STATUS_NEWLINE, fc->fpath, pct);
		fflush(stdout);
	}
	fc->old_pct = pct;
}

/*
 * Unlink the tracked donor files, highest index first, decrementing
 * donor_files as it goes and counting one progress step per file.
 *
 * fc:            progress context for the file being processed.
 * report_errors: when non-zero, the first unlink() failure is reported with
 *                perror() and returned immediately (donor_files then still
 *                counts the remaining files); when zero, failures are ignored
 *                and the loop runs until donor_files is 0.
 *
 * Returns 0 on success or the failing unlink() return value.
 */
int cleanup_donor_files(struct fragment_context *fc, int report_errors)
{
	int ret;
	char tmp_inode_name[PATH_MAX];

	while (donor_files) {
		donor_files--;
		/*
		 * The template's conversion is %lu, so pass exactly an
		 * unsigned long; off_t may be wider than long.
		 */
		snprintf(tmp_inode_name, PATH_MAX, donor_file_template,
			 (unsigned long)donor_files);
		ret = unlink(tmp_inode_name);
		if (report_errors && ret) {
			perror(tmp_inode_name);
			return ret;
		}
		inc_status(fc);
	}

	return 0;
}

/*
 * Work out how many donor files to use for a file of num_blocks blocks.
 *
 * The free-space figure is statvfsbuf.f_bavail divided by num_blocks.  A
 * positive max_donor_files caps the result, but only when the free-space
 * figure is larger than that cap.
 */
off_t calculate_max_files(off_t num_blocks)
{
	off_t fit = statvfsbuf.f_bavail / num_blocks;

	/* No user limit, or the limit is not below what free space allows. */
	if (max_donor_files <= 0 || fit <= max_donor_files)
		return fit;

	return max_donor_files;
}

/*
 * Build the printf-style donor name template "<fpath>.%lu.defrag" in dst.
 * Every '%' in fpath is doubled so that the path itself can never be read
 * as a conversion when the template is later used as a format string.
 *
 * Returns 0 on success or -ENAMETOOLONG when the template does not fit.
 */
static int build_donor_template(char *dst, size_t dst_len, const char *fpath)
{
	static const char suffix[] = ".%lu.defrag";
	size_t used = 0;

	for (; *fpath; fpath++) {
		size_t need = (*fpath == '%') ? 2 : 1;

		/* sizeof(suffix) already accounts for the terminating NUL. */
		if (used + need + sizeof(suffix) > dst_len)
			return -ENAMETOOLONG;
		if (*fpath == '%')
			dst[used++] = '%';
		dst[used++] = *fpath;
	}
	memcpy(dst + used, suffix, sizeof(suffix));

	return 0;
}

/*
 * Fragment one file using the strategy in fp.
 *
 * Anything that is not a regular file, or that occupies fewer than two
 * filesystem blocks, is skipped.  Otherwise the file is synced, the
 * strategy's prepare() callback runs, and then every block, from the last
 * to the first, is swapped with the same logical block of a donor file
 * obtained from get_donor_fd() via EXT4_IOC_MOVE_EXT.
 *
 * fpath: path of the file to fragment.
 * sb:    stat information for fpath.
 * fp:    strategy callbacks.
 *
 * Returns 0 when the file was skipped or fully processed, non-zero on
 * failure.  On every failure path the donor files created so far are
 * unlinked, so donor_files is back at 0 for the next file.
 */
int generic_frag_file(const char *fpath, const struct stat *sb, struct fragment_profile *fp)
{
	struct fragment_context fc;
	struct move_extent move_data;
	off_t num_blocks, block, max_files;
	int ret, donor_fd, fd;

	fc.fpath = fpath;
	fc.max_progress = 0;
	fc.current_progress = 0;
	fc.old_pct = -1;

	/* Screen out non-files or single-block files. */
	if (!S_ISREG(sb->st_mode))
		return 0;

	num_blocks = sb->st_size / statvfsbuf.f_bsize;
	if (sb->st_size % statvfsbuf.f_bsize)
		num_blocks++;

	if (num_blocks < 2)
		return 0;

	fd = open(fpath, O_RDWR);
	if (fd < 0) {
		/* Capture errno before perror() gets a chance to change it. */
		ret = -errno;
		perror(fpath);
		return ret;
	}

	/* Kernel can return -ENODATA if we don't sync the source file first. */
	emit_status(&fc, "syncing...");
	fsync(fd);
	emit_status(&fc, "          ");

	/* Prepare for donor files */
	assert(!donor_files);
	donor_files = 0;
	ret = build_donor_template(donor_file_template, PATH_MAX, fpath);
	if (ret) {
		fprintf(stderr, "%s: path too long for donor file names\n", fpath);
		goto err;
	}

	/* Figure out the maximum donor file count for this file */
	max_files = calculate_max_files(num_blocks);

	ret = fp->prepare(&fc, max_files, num_blocks);
	if (ret)
		goto err_cleanup;

	/* Start moving blocks */
	memset(&move_data, 0, sizeof(move_data));
	move_data.len = 1;
	for (block = num_blocks - 1; block >= 0; block--) {
		donor_fd = fp->get_donor_fd(&fc, max_files, num_blocks);
		if (donor_fd < 0) {
			/* ret still holds 0 from the previous step; report the failure. */
			ret = donor_fd;
			goto err_cleanup;
		}

		/* Swap blocks */
		/* NB: Source and donor logical block must be the same. */
		move_data.donor_fd = donor_fd;
		move_data.orig_start = move_data.donor_start = block;
		move_data.moved_len = 0;
		ret = ioctl(fd, EXT4_IOC_MOVE_EXT, &move_data);
		if (ret < 0) {
			perror(fpath);
			close(donor_fd);
			goto err_cleanup;
		}

		ret = close(donor_fd);
		if (ret) {
			perror("closing donor file");
			goto err_cleanup;
		}

		inc_status(&fc);
	}

	cleanup_donor_files(&fc, 0);
	print_status(&fc, "Done.");
	close(fd);
	return 0;

err_cleanup:
	/* Leave no donor files (and no stale count) behind for the next file. */
	cleanup_donor_files(&fc, 0);
err:
	close(fd);
	return ret;
}

/*
 * "reverse" strategy: every block of the source file gets its own donor
 * file to swap with, and the donor files are flushed out from time to
 * time.  The source blocks are visited back to front, hoping to maximize
 * the number of extent blocks that end up scattered over the filesystem.
 *
 * The prepare step creates nothing; it only sizes the progress counter.
 */
int reverse_prepare(struct fragment_context *fc, off_t max_files, off_t num_blocks)
{
	/* Per block: one donor creation, one swap, one donor removal. */
	fc->max_progress = num_blocks * 3;

	return 0;
}

/*
 * Create a fresh donor file for the next block swap ("reverse" strategy).
 *
 * When more than max_files donor files are being tracked they are unlinked
 * first.  A new donor file is then created exclusively and space for
 * num_blocks blocks is allocated in it.
 *
 * Returns the open donor descriptor (the caller closes it) or a negative
 * value on failure.  donor_files is only incremented once the donor file
 * is fully set up, so a file this call did not create is never counted.
 */
int reverse_get_donor_fd(struct fragment_context *fc, off_t max_files, off_t num_blocks)
{
	char tmp_inode_name[PATH_MAX];
	int donor_fd, ret;

	/* Clean out donor files */
	if (donor_files > max_files) {
		ret = cleanup_donor_files(fc, 1);
		if (ret)
			return ret;
	}

	/* Create hidden donor inode; the template's conversion is %lu. */
	snprintf(tmp_inode_name, PATH_MAX, donor_file_template,
		 (unsigned long)donor_files);
	donor_fd = open(tmp_inode_name, O_WRONLY | O_CREAT | O_EXCL, S_IRUSR);
	if (donor_fd < 0) {
		perror(tmp_inode_name);
		fprintf(stderr, "Is the fragmenter already running?\n");
		errno = EBUSY;
		return -1;
	}

	/*
	 * Allocate space in the donor file.  posix_fallocate() returns the
	 * error number directly and does not set errno, so report it with
	 * strerror() and never hand the positive code back as if it were a
	 * file descriptor.
	 */
	ret = posix_fallocate(donor_fd, 0, num_blocks * statvfsbuf.f_bsize);
	if (ret) {
		fprintf(stderr, "%s: %s\n", tmp_inode_name, strerror(ret));
		close(donor_fd);
		unlink(tmp_inode_name);
		errno = ret;
		return -1;
	}

	donor_files++;
	inc_status(fc);

	return donor_fd;
}

/*
 * "random" strategy: create a pool of donor files up front; each block of
 * the source file is later swapped with a donor picked at random.
 *
 * Creates max_files donor files, each with space for num_blocks blocks
 * allocated, and sets the progress total to num_blocks + 2 * max_files.
 *
 * Returns 0 on success and -1 on failure.  On failure donor_files holds
 * the number of donor files that were completely set up; a donor file
 * whose allocation failed is removed here and is not counted.
 */
int random_prepare(struct fragment_context *fc, off_t max_files, off_t num_blocks)
{
	int donor_fd, ret;
	char tmp_inode_name[PATH_MAX];

	fc->max_progress = num_blocks + (2 * max_files);

	/* Allocate the donor files */
	for (donor_files = 0; donor_files < max_files; donor_files++) {
		/* Create donor inode; the template's conversion is %lu. */
		snprintf(tmp_inode_name, PATH_MAX, donor_file_template,
			 (unsigned long)donor_files);
		donor_fd = open(tmp_inode_name, O_WRONLY | O_CREAT | O_EXCL, S_IRUSR);
		if (donor_fd < 0) {
			perror(tmp_inode_name);
			fprintf(stderr, "Is a fragmenter already running?\n");
			return -1;
		}

		/*
		 * Allocate space in the donor file.  posix_fallocate()
		 * returns the error number and does not set errno, hence
		 * strerror() rather than perror().
		 */
		ret = posix_fallocate(donor_fd, 0, num_blocks * statvfsbuf.f_bsize);
		if (ret) {
			fprintf(stderr, "%s: %s\n", tmp_inode_name, strerror(ret));
			close(donor_fd);
			/* Not counted in donor_files yet, so remove it here. */
			unlink(tmp_inode_name);
			return -1;
		}

		close(donor_fd);
		inc_status(fc);
	}

	return 0;
}

int random_get_donor_fd(struct fragment_context *fc, off_t max_files, off_t num_blocks)
{
	char tmp_inode_name[PATH_MAX];
	int donor_fd;
	off_t donor = random() * max_files / RAND_MAX;

	/* Reopen donor inode */
	snprintf(tmp_inode_name, PATH_MAX, donor_file_template, donor);
	donor_fd = open(tmp_inode_name, O_WRONLY, S_IRUSR);
	if (donor_fd < 0) {
		perror(tmp_inode_name);
		errno = EBUSY;
		return -1;
	}

	return donor_fd;
}

/*
 * Table of available strategies, terminated by an entry whose name is
 * NULL.  The first entry is the one main() selects by default.
 */
static struct fragment_profile profiles[] = {
	{
		.name = "random",
		.get_donor_fd = random_get_donor_fd,
		.prepare = random_prepare,
	},
	{
		.name = "reverse",
		.get_donor_fd = reverse_get_donor_fd,
		.prepare = reverse_prepare,
	},
	{ .name = NULL },
};

/*
 * nftw()-compatible callback: fragment fpath with the currently selected
 * profile.  typeflag and ftwbuf are not used.  Returns whatever
 * generic_frag_file() returns.
 */
int fragment_file(const char *fpath, const struct stat *sb, int typeflag,
		  struct FTW *ftwbuf)
{
	return generic_frag_file(fpath, sb, profile);
}

/* Write the usage line and a description of each option to stdout. */
void print_help(char *progname)
{
	static const char *const option_lines[] = {
		"-m	Number of donor files to create while fragmenting.  0 = automatic\n",
		"-s	Set fragmentation strategy. (\"reverse\" or \"random\" (default))\n",
		"-v	Print progress indicators.\n",
	};
	size_t i;

	printf("Usage: %s [-m max_files] [-s random|reverse] [-v] pathspec [pathspecs...]\n", progname);
	for (i = 0; i < sizeof(option_lines) / sizeof(option_lines[0]); i++)
		fputs(option_lines[i], stdout);
}

/*
 * Entry point: parse the options, then fragment every pathspec.
 *
 * Options:
 *   -m N     upper bound on donor files (0 = automatic); must be a
 *            non-negative decimal integer
 *   -s NAME  strategy name, looked up in profiles[]
 *   -v       print progress indicators
 *
 * Directories are walked with nftw(); any other path is handed to
 * fragment_file() directly.
 *
 * Returns 0 when everything succeeded (or when called without arguments,
 * which only prints the help), 1 on a usage error or when any pathspec
 * could not be processed.
 */
int main(int argc, char *argv[])
{
	struct fragment_profile *fp;
	struct statfs statfsbuf;
	struct stat statbuf;
	char *end;
	long parsed;
	int i, ret, opt;
	int failed = 0;

	profile = profiles;

	if (argc < 2) {
		print_help(argv[0]);
		return 0;
	}

	while ((opt = getopt(argc, argv, "vm:s:")) != -1) {
		switch (opt) {
		case 'm':
			/* Reject garbage instead of silently treating it as 0. */
			errno = 0;
			parsed = strtol(optarg, &end, 10);
			if (end == optarg || *end != '\0' || errno == ERANGE ||
			    parsed < 0 || parsed > INT_MAX) {
				fprintf(stderr, "%s: invalid donor file count \"%s\"\n",
					argv[0], optarg);
				print_help(argv[0]);
				return 1;
			}
			max_donor_files = (int)parsed;
			break;
		case 's':
			fp = profiles;
			while (fp->name) {
				if (!strcmp(fp->name, optarg)) {
					profile = fp;
					break;
				}
				fp++;
			}

			/* Ran off the end of the table: unknown strategy. */
			if (!fp->name) {
				print_help(argv[0]);
				return 1;
			}
			break;
		case 'v':
			verbose = 1;
			break;
		default:
			print_help(argv[0]);
			return 1;
		}
	}

	/* Options but no pathspec: nothing to do, so say how to use it. */
	if (optind >= argc) {
		print_help(argv[0]);
		return 1;
	}

	if (verbose)
		printf(PROGRAM ", strategy \"%s\" max donors %d.\n", profile->name, max_donor_files);

	for (i = optind; i < argc; i++) {
		/* Only paths whose filesystem reports the ext2/3/4 magic are processed. */
		ret = statfs(argv[i], &statfsbuf);
		if (ret) {
			perror(argv[i]);
			failed = 1;
			break;
		}

		if (statfsbuf.f_type != EXT3_SUPER_MAGIC) {
			fprintf(stderr, "%s: Ignoring file on non-ext2/3/4 filesystem.\n", argv[i]);
			failed = 1;
			break;
		}

		ret = stat(argv[i], &statbuf);
		if (ret) {
			perror(argv[i]);
			failed = 1;
			break;
		}

		ret = statvfs(argv[i], &statvfsbuf);
		if (ret) {
			perror(argv[i]);
			failed = 1;
			break;
		}

		if (S_ISDIR(statbuf.st_mode))
			ret = nftw(argv[i], fragment_file, 64, FTW_MOUNT | FTW_PHYS);
		else
			ret = fragment_file(argv[i], &statbuf, 0, NULL);

		/* Remember the failure but keep going with the next pathspec. */
		if (ret)
			failed = 1;
	}

	sync();

	return failed;
}
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ