linux-ext4 - Re: [PATCH v2 4/4] mke2fs: set overhead in super block for bigalloc

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [day] [month] [year] [list]
Message-ID: <82090b26ca1b80e757048e0043d4489e5b3e6508.camel@ddn.com>
Date:   Mon, 26 Aug 2019 05:53:33 +0000
From:   Dongyang Li <dongyangli@....com>
To:     "adilger@...ger.ca" <adilger@...ger.ca>
CC:     "linux-ext4@...r.kernel.org" <linux-ext4@...r.kernel.org>
Subject: Re: [PATCH v2 4/4] mke2fs: set overhead in super block for bigalloc

On Sun, 2019-08-25 at 21:29 -0600, Andreas Dilger wrote:
> On Aug 22, 2019, at 2:26 AM, Dongyang Li <dongyangli@....com> wrote:
> > If overhead is not recorded in the super block, it is calculated
> > during mount in the kernel; for bigalloc file systems this takes
> > O(groups**2) time.
> > For a 1PB device with 32K cluster size it takes ~12 mins to
> > mount, with most of the time spent on figuring out overhead.
> > 
> > While we can not improve the overhead algorithm in kernel
> > due to the nature of bigalloc, we can work out the overhead
> > during mke2fs and set it in the super block, avoiding calculating
> > it every time when it mounts.
> 
> It would also be good to get an ext4 patch to save the calculated
> overhead to s_overhead_clusters if the kernel finds it unset?
> That isn't any less accurate than recomputing it each time, and
> avoids extra overhead on each mount for filesystems that did not
> get it set at mke2fs time.
Sounds good, we also need to update the overhead when resize happens.
> 
> > Overhead is s_first_data_block plus internal journal blocks plus
> > the block and inode bitmaps, inode table, super block backups and
> > group descriptor blocks for every group. This patch introduces
> > ext2fs_count_used_clusters(), which calculates the clusters used
> > in the block bitmap for the given range.
> > 
> > When bad blocks are involved, it gets tricky because the blocks
> > counted as overhead and the bad blocks can end up in the same
> > allocation cluster.
> 
> On the other hand, would it be wrong if the bad blocks are stored
> in "s_overhead_clusters"?
IMHO the bad blocks are considered used blocks, whereas overhead is the
filesystem structures, so they are different.
Someone please correct me if I'm wrong; considering bad blocks as
overhead would make this heaps easier.
> 
> > In this case we will unmark the bad blocks from
> > the block bitmap, covert to cluster bitmap and get the overhead,
> 
> (typo) "convert"
> 
> > then mark the bad blocks back in the cluster bitmap.
> 
> In this case, should the bad block numbers be converted to
> clusters during the second iteration?
ext2fs_mark_generic_bmap() will do that for us.
> 
> > Signed-off-by: Li Dongyang <dongyangli@....com>
> > ---
> > lib/ext2fs/ext2fs.h       |  2 ++
> > lib/ext2fs/gen_bitmap64.c | 35 +++++++++++++++++++++++++++
> > misc/mke2fs.c             | 50
> > ++++++++++++++++++++++++++++++++++++++-
> > 3 files changed, 86 insertions(+), 1 deletion(-)
> > 
> > diff --git a/lib/ext2fs/ext2fs.h b/lib/ext2fs/ext2fs.h
> > index 59fd9742..a8ddb9e4 100644
> > --- a/lib/ext2fs/ext2fs.h
> > +++ b/lib/ext2fs/ext2fs.h
> > @@ -1437,6 +1437,8 @@ errcode_t
> > ext2fs_set_generic_bmap_range(ext2fs_generic_bitmap bmap,
> > 					void *in);
> > errcode_t ext2fs_convert_subcluster_bitmap(ext2_filsys fs,
> > 					   ext2fs_block_bitmap
> > *bitmap);
> > +errcode_t ext2fs_count_used_clusters(ext2_filsys fs, blk64_t
> > start,
> > +				     blk64_t end, blk64_t *out);
> > 
> > /* get_num_dirs.c */
> > extern errcode_t ext2fs_get_num_dirs(ext2_filsys fs, ext2_ino_t
> > *ret_num_dirs);
> > diff --git a/lib/ext2fs/gen_bitmap64.c b/lib/ext2fs/gen_bitmap64.c
> > index f1dd1891..b2370667 100644
> > --- a/lib/ext2fs/gen_bitmap64.c
> > +++ b/lib/ext2fs/gen_bitmap64.c
> > @@ -940,3 +940,38 @@ errcode_t
> > ext2fs_find_first_set_generic_bmap(ext2fs_generic_bitmap bitmap,
> > 
> > 	return ENOENT;
> > }
> > +
> > +errcode_t ext2fs_count_used_clusters(ext2_filsys fs, blk64_t
> > start,
> > +				     blk64_t end, blk64_t *out)
> > +{
> > +	blk64_t		next;
> > +	blk64_t		tot_set = 0;
> > +	errcode_t	retval;
> > +
> > +	while (start < end) {
> > +		retval = ext2fs_find_first_set_block_bitmap2(fs-
> > >block_map,
> > +							start, end,
> > &next);
> > +		if (retval) {
> > +			if (retval == ENOENT)
> > +				retval = 0;
> > +			break;
> > +		}
> > +		start = next;
> > +
> > +		retval = ext2fs_find_first_zero_block_bitmap2(fs-
> > >block_map,
> > +							start, end,
> > &next);
> > +		if (retval == 0) {
> > +			tot_set += next - start;
> > +			start  = next + 1;
> > +		} else if (retval == ENOENT) {
> > +			retval = 0;
> > +			tot_set += end - start + 1;
> > +			break;
> > +		} else
> > +			break;
> > +	}
> > +
> > +	if (!retval)
> > +		*out = EXT2FS_NUM_B2C(fs, tot_set);
> > +	return retval;
> > +}
> > diff --git a/misc/mke2fs.c b/misc/mke2fs.c
> > index 30e353d3..1928c9bf 100644
> > --- a/misc/mke2fs.c
> > +++ b/misc/mke2fs.c
> > @@ -2912,6 +2912,8 @@ int main (int argc, char *argv[])
> > 	errcode_t	retval = 0;
> > 	ext2_filsys	fs;
> > 	badblocks_list	bb_list = 0;
> > +	badblocks_iterate	bb_iter;
> > +	blk_t		blk;
> > 	unsigned int	journal_blocks = 0;
> > 	unsigned int	i, checkinterval;
> > 	int		max_mnt_count;
> > @@ -2922,6 +2924,7 @@ int main (int argc, char *argv[])
> > 	char		opt_string[40];
> > 	char		*hash_alg_str;
> > 	int		itable_zeroed = 0;
> > +	blk64_t		overhead;
> > 
> > #ifdef ENABLE_NLS
> > 	setlocale(LC_MESSAGES, "");
> > @@ -3213,6 +3216,23 @@ int main (int argc, char *argv[])
> > 	if (!quiet)
> > 		printf("%s", _("done                            \n"));
> > 
> > +	/*
> > +	 * Unmark bad blocks to calculate overhead, because metadata
> > + 	 * blocks and bad blocks can land on the same allocation
> > cluster.
> > + 	 */
> > +	if (bb_list) {
> > +		retval = ext2fs_badblocks_list_iterate_begin(bb_list,
> > +							     &bb_iter);
> > +		if (retval) {
> > +			com_err("ext2fs_badblocks_list_iterate_begin",
> > retval,
> > +				"%s", _("while unmarking bad blocks"));
> > +			exit(1);
> > +		}
> > +		while (ext2fs_badblocks_list_iterate(bb_iter, &blk))
> > +			ext2fs_unmark_block_bitmap2(fs->block_map,
> > blk);
> > +		ext2fs_badblocks_list_iterate_end(bb_iter);
> > +	}
> > +
> > 	retval = ext2fs_convert_subcluster_bitmap(fs, &fs->block_map);
> > 	if (retval) {
> > 		com_err(program_name, retval, "%s",
> > @@ -3220,6 +3240,28 @@ int main (int argc, char *argv[])
> > 		exit(1);
> > 	}
> > 
> > +	retval = ext2fs_count_used_clusters(fs, fs->super-
> > >s_first_data_block,
> > +					ext2fs_blocks_count(fs->super)
> > - 1,
> > +					&overhead);
> > +	if (retval) {
> > +		com_err(program_name, retval, "%s",
> > +			_("while calculating overhead"));
> > +		exit(1);
> > +	}
> > +
> > +	if (bb_list) {
> > +		retval = ext2fs_badblocks_list_iterate_begin(bb_list,
> > +							     &bb_iter);
> > +		if (retval) {
> > +			com_err("ext2fs_badblocks_list_iterate_begin",
> > retval,
> > +				"%s", _("while marking bad blocks as
> > used"));
> > +			exit(1);
> > +		}
> > +		while (ext2fs_badblocks_list_iterate(bb_iter, &blk))
> > +			ext2fs_mark_block_bitmap2(fs->block_map, blk);
> > +		ext2fs_badblocks_list_iterate_end(bb_iter);
> > +	}
> > +
> > 	if (super_only) {
> > 		check_plausibility(device_name, CHECK_FS_EXIST, NULL);
> > 		printf(_("%s may be further corrupted by superblock
> > rewrite\n"),
> > @@ -3317,6 +3359,7 @@ int main (int argc, char *argv[])
> > 		free(journal_device);
> > 	} else if ((journal_size) ||
> > 		   ext2fs_has_feature_journal(&fs_param)) {
> > +		overhead += EXT2FS_NUM_B2C(fs, journal_blocks);
> > 		if (super_only) {
> > 			printf("%s", _("Skipping journal creation in
> > super-only mode\n"));
> > 			fs->super->s_journal_inum = EXT2_JOURNAL_INO;
> > @@ -3359,8 +3402,13 @@ no_journal:
> > 			       fs->super->s_mmp_update_interval);
> > 	}
> > 
> > -	if (ext2fs_has_feature_bigalloc(&fs_param))
> > +	overhead += fs->super->s_first_data_block;
> > +
> > +	if (ext2fs_has_feature_bigalloc(&fs_param)) {
> > +		if (!super_only)
> > +			fs->super->s_overhead_clusters = overhead;
> > 		fix_cluster_bg_counts(fs);
> > +	}
> 
> Should we consider always storing the overhead value in the
> superblock, regardless of whether bigalloc is enabled or not?
> 
> Cheers, Andreas
> 
> 
> 
> 
>