linux-ext4 - Re: [PATCH] mke2fs: add support to align hugefiles relative to beginning of the disk

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20140711182558.GG10417@birch.djwong.org>
Date:	Fri, 11 Jul 2014 11:25:58 -0700
From:	"Darrick J. Wong" <darrick.wong@...cle.com>
To:	"Theodore Ts'o" <tytso@....edu>
Cc:	Ext4 Developers List <linux-ext4@...r.kernel.org>
Subject: Re: [PATCH] mke2fs: add support to align hugefiles relative to
 beginning of the disk

On Tue, Jul 08, 2014 at 08:44:51PM -0400, Theodore Ts'o wrote:
> Add the mke2fs.conf configuration option which causes the hugefiles to
> be aligned to the beginning of the disk.  This is important if the the
> reason for aligning the hugefiles is to support hard-drive specific
> features such as Shingled Magnetic Recording (SMR).
> 
> Signed-off-by: Theodore Ts'o <tytso@....edu>
> ---
>  misc/mk_hugefiles.c   | 154 ++++++++++++++++++++++++++++++++++++++++++++++++--
>  misc/mke2fs.c         |   2 +-
>  misc/mke2fs.conf.5.in |   7 +++
>  misc/mke2fs.h         |   2 +-
>  4 files changed, 157 insertions(+), 8 deletions(-)
> 
> diff --git a/misc/mk_hugefiles.c b/misc/mk_hugefiles.c
> index b7a9840..ea42b6c 100644
> --- a/misc/mk_hugefiles.c
> +++ b/misc/mk_hugefiles.c
> @@ -3,9 +3,11 @@
>   */
>  
>  #define _XOPEN_SOURCE 600 /* for inclusion of PATH_MAX in Solaris */
> +#define _BSD_SOURCE	  /* for makedev() and major() */
>  
>  #include "config.h"
>  #include <stdio.h>
> +#include <stdarg.h>
>  #include <string.h>
>  #include <strings.h>
>  #include <fcntl.h>
> @@ -60,6 +62,141 @@ static char *fn_buf;
>  static char *fn_numbuf;
>  int zero_hugefile = 1;
>  
> +#define SYSFS_PATH_LEN 256
> +typedef char sysfs_path_t[SYSFS_PATH_LEN];
> +
> +#ifndef HAVE_SNPRINTF
> +/*
> + * We are very careful to avoid needing to worry about buffer
> + * overflows, so we don't really need to use snprintf() except as an
> + * additional safety check.  So if snprintf() is not present, it's
> + * safe to fall back to vsprintf().  This provides portability since
> + * vsprintf() is guaranteed by C89, while snprintf() is only
> + * guaranteed by C99 --- which for example, Microsoft Visual Studio
> + * has *still* not bothered to implement.  :-/  (Not that I expect
> + * mke2fs to be ported to MS Visual Studio any time soon, but
> + * libext2fs *does* get built on Microsoft platforms, and we might
> + * want to move this into libext2fs some day.)
> + */
> +static int my_snprintf(char *str, size_t size, const char *format, ...)
> +{
> +	va_list	ap;
> +	int ret;
> +
> +	va_start(ap, format);
> +	ret = vsprintf(str, format, ap);
> +	va_end(ap);
> +	return ret;
> +}
> +
> +#define snprintf my_snprintf
> +#endif
> +
> +/*
> + * Fall back to Linux's definitions of makedev and major are needed.
> + * The search_sysfs_block() function is highly unlikely to work on
> + * non-Linux systems anyway.
> + */
> +#ifndef makedev
> +#define makedev(maj, min) (((maj) << 8) + (min))
> +#endif
> +
> +static char *search_sysfs_block(dev_t devno, sysfs_path_t ret_path)
> +{
> +	struct dirent	*de, *p_de;
> +	DIR		*dir = NULL, *p_dir = NULL;
> +	FILE		*f;
> +	sysfs_path_t	path, p_path;
> +	unsigned int	major, minor;
> +	char		*ret = ret_path;
> +
> +	ret_path[0] = 0;
> +	if ((dir = opendir("/sys/block")) == NULL)
> +		return NULL;
> +	while ((de = readdir(dir)) != NULL) {
> +		if (!strcmp(de->d_name, ".") || !strcmp(de->d_name, "..") ||
> +		    strlen(de->d_name) > sizeof(path)-32)
> +			continue;
> +		snprintf(path, SYSFS_PATH_LEN,
> +			 "/sys/block/%s/dev", de->d_name);
> +		f = fopen(path, "r");
> +		if (f &&
> +		    (fscanf(f, "%u:%u", &major, &minor) == 2)) {
> +			fclose(f); f = NULL;
> +			if (makedev(major, minor) == devno) {
> +				snprintf(ret_path, SYSFS_PATH_LEN,
> +					 "/sys/block/%s", de->d_name);
> +				goto success;
> +			}
> +#ifdef major
> +			if (major(devno) != major)
> +				continue;
> +#endif
> +		}
> +		if (f)
> +			fclose(f);
> +
> +		snprintf(path, SYSFS_PATH_LEN, "/sys/block/%s", de->d_name);
> +
> +		if (p_dir)
> +			closedir(p_dir);
> +		if ((p_dir = opendir(path)) == NULL)
> +			continue;
> +		while ((p_de = readdir(p_dir)) != NULL) {
> +			if (!strcmp(p_de->d_name, ".") ||
> +			    !strcmp(p_de->d_name, "..") ||
> +			    (strlen(p_de->d_name) >
> +			     SYSFS_PATH_LEN - strlen(path) - 32))
> +				continue;
> +			snprintf(p_path, SYSFS_PATH_LEN, "%s/%s/dev",
> +				 path, p_de->d_name);
> +
> +			f = fopen(p_path, "r");
> +			if (f &&
> +			    (fscanf(f, "%u:%u", &major, &minor) == 2) &&
> +			    (((major << 8) + minor) == devno)) {
> +				fclose(f);
> +				snprintf(ret_path, SYSFS_PATH_LEN, "%s/%s",
> +					 path, p_de->d_name);
> +				goto success;
> +			}
> +			if (f)
> +				fclose(f);
> +		}
> +	}
> +	ret = NULL;
> +success:
> +	if (dir)
> +		closedir(dir);
> +	if (p_dir)
> +		closedir(p_dir);
> +	return ret;
> +}
> +
> +static blk64_t get_partition_start(const char *device_name)
> +{
> +	unsigned long long start;
> +	sysfs_path_t	path;
> +	struct stat	st;
> +	FILE		*f;
> +	char		*cp;
> +	int		n;
> +
> +	if ((stat(device_name, &st) < 0) || !S_ISBLK(st.st_mode))
> +		return 0;
> +
> +	cp = search_sysfs_block(st.st_rdev, path);
> +	if (!cp)
> +		return 0;
> +	strncat(path, "/start", SYSFS_PATH_LEN);

The third argument is the maximum number of bytes to concatenate from the
second argument ("/start").  Though it's unlikely that we'll ever find anything
in /sys/block approaching 255 characters, we might as well guard against stack
corruption:

	if (strlen(path) > SYSFS_PATH_LEN - strlen("/start") - 1)
		return 0;
	strcat(path, "/start");

Oh, I guess coverity is complaining about this too.

Though FWIW, "find /sys | while read f; do echo "$f" | wc -c; done | sort -g |
tail -n 5" spits out "133" as the longest sysfs path ever.  I guess that could
be much longer on some multi-node NUMA box or whatever.

<shrug> /me codes up a fix, tosses it on the patch pile.

--D

> +	f = fopen(path, "r");
> +	if (!f)
> +		return 0;
> +	n = fscanf(f, "%llu", &start);
> +	fclose(f);
> +	return (n == 1) ? start : 0;
> +}
> +
>  static errcode_t create_directory(ext2_filsys fs, char *dir,
>  				  ext2_ino_t *ret_ino)
>  
> @@ -310,24 +447,26 @@ static blk64_t get_start_block(ext2_filsys fs, blk64_t slack)
>  	return blk;
>  }
>  
> -static blk64_t round_up_align(blk64_t b, unsigned long align)
> +static blk64_t round_up_align(blk64_t b, unsigned long align,
> +			      blk64_t part_offset)
>  {
>  	unsigned long m;
>  
>  	if (align == 0)
>  		return b;
> -	m = b % align;
> +	part_offset = part_offset % align;
> +	m = (b + part_offset) % align;
>  	if (m)
>  		b += align - m;
>  	return b;
>  }
>  
> -errcode_t mk_hugefiles(ext2_filsys fs)
> +errcode_t mk_hugefiles(ext2_filsys fs, const char *device_name)
>  {
>  	unsigned long	i;
>  	ext2_ino_t	dir;
>  	errcode_t	retval;
> -	blk64_t		fs_blocks;
> +	blk64_t		fs_blocks, part_offset;
>  	unsigned long	align;
>  	int		d, dsize;
>  	char		*t;
> @@ -348,7 +487,10 @@ errcode_t mk_hugefiles(ext2_filsys fs)
>  	t = get_string_from_profile(fs_types, "hugefiles_align", "0");
>  	align = parse_num_blocks2(t, fs->super->s_log_block_size);
>  	free(t);
> -	num_blocks = round_up_align(num_blocks, align);
> +	if (get_bool_from_profile(fs_types, "hugefiles_align_disk", 0))
> +		part_offset = get_partition_start(device_name) /
> +			(fs->blocksize / 512);
> +	num_blocks = round_up_align(num_blocks, align, 0);
>  	zero_hugefile = get_bool_from_profile(fs_types, "zero_hugefiles",
>  					      zero_hugefile);
>  
> @@ -400,7 +542,7 @@ errcode_t mk_hugefiles(ext2_filsys fs)
>  	num_slack += calc_overhead(fs, num_blocks) * num_files;
>  	num_slack += (num_files / 16) + 1; /* space for dir entries */
>  	goal = get_start_block(fs, num_slack);
> -	goal = round_up_align(goal, align);
> +	goal = round_up_align(goal, align, part_offset);
>  
>  	if ((num_blocks ? num_blocks : fs_blocks) >
>  	    (0x80000000UL / fs->blocksize))
> diff --git a/misc/mke2fs.c b/misc/mke2fs.c
> index ecd47e6..da77e3a 100644
> --- a/misc/mke2fs.c
> +++ b/misc/mke2fs.c
> @@ -2913,7 +2913,7 @@ no_journal:
>  				       EXT4_FEATURE_RO_COMPAT_QUOTA))
>  		create_quota_inodes(fs);
>  
> -	retval = mk_hugefiles(fs);
> +	retval = mk_hugefiles(fs, device_name);
>  	if (retval)
>  		com_err(program_name, retval, "while creating huge files");
>  
> diff --git a/misc/mke2fs.conf.5.in b/misc/mke2fs.conf.5.in
> index 8e25892..19458ac 100644
> --- a/misc/mke2fs.conf.5.in
> +++ b/misc/mke2fs.conf.5.in
> @@ -480,6 +480,13 @@ files.  It also forces the size of huge files to be a multiple of the
>  requested alignment.  If this relation is not specified, no alignment
>  requirement will be imposed on the huge files.
>  .TP
> +.I hugefiles_align_disk
> +Thie relations specifies whether the alignment should be relative to the
> +beginning of the hard drive (assuming that the starting offset of the
> +partition is available to mke2fs).  The default value is false, which
> +if will cause hugefile alignment to be relative to the beginning of the
> +file system.
> +.TP
>  .I hugefiles_name
>  This relation specifies the base file name for the huge files.
>  .TP
> diff --git a/misc/mke2fs.h b/misc/mke2fs.h
> index 9fa6bfe..ce72cb3 100644
> --- a/misc/mke2fs.h
> +++ b/misc/mke2fs.h
> @@ -24,7 +24,7 @@ extern int get_bool_from_profile(char **types, const char *opt, int def_val);
>  extern int int_log10(unsigned long long arg);
>  
>  /* mk_hugefiles.c */
> -extern errcode_t mk_hugefiles(ext2_filsys fs);
> +extern errcode_t mk_hugefiles(ext2_filsys fs, const char *device_name);
>  
>  
>  
> -- 
> 2.0.0
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
> the body of a message to majordomo@...r.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html