lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <alpine.LFD.2.00.1011251512020.2871@dhcp-lab-213.englab.brq.redhat.com>
Date:	Thu, 25 Nov 2010 15:27:09 +0100 (CET)
From:	Lukas Czerner <lczerner@...hat.com>
To:	Jan Kara <jack@...e.cz>
cc:	linux-ext4@...r.kernel.org, lczerner@...hat.com,
	greg.freemyer@...il.com, linux-fsdevel@...r.kernel.org
Subject: Re: [PATCH 1/2] ext3: Add batched discard support for ext3

On Wed, 24 Nov 2010, Jan Kara wrote:

> From: Lukas Czerner <lczerner@...hat.com>
> 
> Walk through allocation groups and trim all free extents. It can be
> invoked through FITRIM ioctl on the file system. The main idea is to
> provide a way to trim the whole file system if needed, since some SSD's
> may suffer from performance loss after the whole device was filled (it
> does not mean that fs is full!).
> 
> It search for free extents in allocation groups specified by Byte range
> start -> start+len. When the free extent is within this range, blocks are
> marked as used and then trimmed. Afterwards these blocks are marked as
> free in per-group bitmap.
> 
> [JK: Fixed up error handling]
> 
> Signed-off-by: Lukas Czerner <lczerner@...hat.com>
> Reviewed-by: Jan Kara <jack@...e.cz>
> Reviewed-by: Dmitry Monakhov <dmonakhov@...nvz.org>
> Signed-off-by: Jan Kara <jack@...e.cz>
> ---
>  fs/ext3/balloc.c        |  257 +++++++++++++++++++++++++++++++++++++++++++++++
>  include/linux/ext3_fs.h |    1 +
>  2 files changed, 258 insertions(+), 0 deletions(-)
> 
> diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c
> index b3db226..ae88960 100644
> --- a/fs/ext3/balloc.c
> +++ b/fs/ext3/balloc.c
> @@ -20,6 +20,7 @@
>  #include <linux/ext3_jbd.h>
>  #include <linux/quotaops.h>
>  #include <linux/buffer_head.h>
> +#include <linux/blkdev.h>
>  
>  /*
>   * balloc.c contains the blocks allocation and deallocation routines
> @@ -39,6 +40,23 @@
>  
>  #define in_range(b, first, len)	((b) >= (first) && (b) <= (first) + (len) - 1)
>  
> +/*
> + * Calculate the block group number and offset, given a block number
> + */
> +void ext3_get_group_no_and_offset(struct super_block *sb, ext3_fsblk_t blocknr,
> +		unsigned long *blockgrpp, ext3_grpblk_t *offsetp)
> +{
> +	struct ext3_super_block *es = EXT3_SB(sb)->s_es;
> +	ext3_grpblk_t offset;
> +
> +	blocknr = blocknr - le32_to_cpu(es->s_first_data_block);
> +	offset = do_div(blocknr, EXT3_BLOCKS_PER_GROUP(sb));
> +	if (offsetp)
> +		*offsetp = offset;
> +	if (blockgrpp)
> +		*blockgrpp = blocknr;
> +}
> +
>  /**
>   * ext3_get_group_desc() -- load group descriptor from disk
>   * @sb:			super block
> @@ -1885,3 +1903,242 @@ unsigned long ext3_bg_num_gdb(struct super_block *sb, int group)
>  	return ext3_bg_num_gdb_meta(sb,group);
>  
>  }
> +
> +/**
> + * ext3_trim_all_free -- function to trim all free space in alloc. group
> + * @sb:			super block for file system
> + * @group:		allocation group to trim
> + * @start:		first group block to examine
> + * @max:		last group block to examine
> + * @gdp:		allocation group description structure
> + * @minblocks:		minimum extent block count
> + *
> + * ext3_trim_all_free walks through group's block bitmap searching for free
> + * blocks. When the free block is found, it tries to allocate this block and
> + * consequent free block to get the biggest free extent possible, until it
> + * reaches any used block. Then issue a TRIM command on this extent and free
> + * the extent in the block bitmap. This is done until whole group is scanned.
> + */
> +ext3_grpblk_t ext3_trim_all_free(struct super_block *sb, unsigned int group,
> +				ext3_grpblk_t start, ext3_grpblk_t max,
> +				ext3_grpblk_t minblocks)
> +{
> +	handle_t *handle;
> +	ext3_grpblk_t next, free_blocks, bit, freed, count = 0;
> +	ext3_fsblk_t discard_block;
> +	struct ext3_sb_info *sbi;
> +	struct buffer_head *gdp_bh, *bitmap_bh = NULL;
> +	struct ext3_group_desc *gdp;
> +	int err = 0, ret = 0;
> +
> +	/*
> +	 * We will update one block bitmap, and one group descriptor
> +	 */
> +	handle = ext3_journal_start_sb(sb, 2);
> +	if (IS_ERR(handle))
> +		return PTR_ERR(handle);
> +
> +	bitmap_bh = read_block_bitmap(sb, group);
> +	if (!bitmap_bh) {
> +		err = -EIO;
> +		goto err_out;
> +	}
> +
> +	BUFFER_TRACE(bitmap_bh, "getting undo access");
> +	err = ext3_journal_get_undo_access(handle, bitmap_bh);
> +	if (err)
> +		goto err_out;
> +
> +	gdp = ext3_get_group_desc(sb, group, &gdp_bh);
> +	if (!gdp) {
> +		err = -EIO;
> +		goto err_out;
> +	}
> +
> +	BUFFER_TRACE(gdp_bh, "get_write_access");
> +	err = ext3_journal_get_write_access(handle, gdp_bh);
> +	if (err)
> +		goto err_out;
> +
> +	free_blocks = le16_to_cpu(gdp->bg_free_blocks_count);
> +	sbi = EXT3_SB(sb);
> +
> +	 /* Walk through the whole group */
> +	while (start < max) {
> +		start = bitmap_search_next_usable_block(start, bitmap_bh, max);
> +		if (start < 0)
> +			break;
> +		next = start;
> +
> +		/*
> +		 * Allocate contiguous free extents by setting bits in the
> +		 * block bitmap
> +		 */
> +		while (next < max
> +			&& claim_block(sb_bgl_lock(sbi, group),
> +					next, bitmap_bh)) {
> +			next++;
> +		}
> +
> +		 /* We did not claim any blocks */
> +		if (next == start)
> +			continue;
> +
> +		discard_block = (ext3_fsblk_t)start +
> +				ext3_group_first_block_no(sb, group);
> +
> +		/* Update counters */
> +		spin_lock(sb_bgl_lock(sbi, group));
> +		le16_add_cpu(&gdp->bg_free_blocks_count, start - next);
> +		spin_unlock(sb_bgl_lock(sbi, group));
> +		percpu_counter_sub(&sbi->s_freeblocks_counter, next - start);
> +
> +		/* Do not issue a TRIM on extents smaller than minblocks */
> +		if ((next - start) < minblocks)
> +			goto free_extent;
> +
> +		 /* Send the TRIM command down to the device */
> +		err = sb_issue_discard(sb, discard_block, next - start,
> +				       GFP_NOFS, 0);
> +		count += (next - start);
> +free_extent:
> +		freed = 0;
> +
> +		/*
> +		 * Clear bits in the bitmap
> +		 */
> +		for (bit = start; bit < next; bit++) {
> +			BUFFER_TRACE(bitmap_bh, "clear bit");
> +			if (!ext3_clear_bit_atomic(sb_bgl_lock(sbi, group),
> +						bit, bitmap_bh->b_data)) {
> +				ext3_error(sb, __func__,
> +					"bit already cleared for block "E3FSBLK,
> +					 (unsigned long)bit);
> +				BUFFER_TRACE(bitmap_bh, "bit already cleared");
> +			} else {
> +				freed++;
> +			}
> +		}
> +
> +		/* Update couters */
> +		spin_lock(sb_bgl_lock(sbi, group));
> +		le16_add_cpu(&gdp->bg_free_blocks_count, freed);
> +		spin_unlock(sb_bgl_lock(sbi, group));
> +		percpu_counter_add(&sbi->s_freeblocks_counter, freed);
> +
> +		start = next;
> +		if (err < 0) {
> +			if (err != -EOPNOTSUPP)
> +				ext3_warning(sb, __func__, "Discard command "
> +					     "returned error %d\n", err);
> +			break;
> +		}
> +
> +		if (fatal_signal_pending(current)) {
> +			err = -ERESTARTSYS;
> +			break;
> +		}
> +
> +		cond_resched();
> +
> +		/* No more suitable extents */
> +		if ((free_blocks - count) < minblocks)
> +			break;
> +	}
> +
> +	/* We dirtied the bitmap block */
> +	BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
> +	err = ext3_journal_dirty_metadata(handle, bitmap_bh);
> +
> +	/* And the group descriptor block */
> +	BUFFER_TRACE(gdp_bh, "dirtied group descriptor block");
> +	ret = ext3_journal_dirty_metadata(handle, gdp_bh);
> +	if (!err)
> +		err = ret;
> +
> +	ext3_debug("trimmed %d blocks in the group %d\n",
> +		count, group);
> +
> +err_out:
> +	if (err)
> +		count = err;
> +	ext3_journal_stop(handle);
> +	brelse(bitmap_bh);
> +
> +	return count;
> +}
> +
> +/**
> + * ext3_trim_fs() -- trim ioctl handle function
> + * @sb:			superblock for filesystem
> + * @start:		First Byte to trim
> + * @len:		number of Bytes to trim from start
> + * @minlen:		minimum extent length in Bytes
> + *
> + * ext3_trim_fs goes through all allocation groups containing Bytes from
> + * start to start+len. For each such a group ext3_trim_all_free function
> + * is invoked to trim all free space.
> + */
> +int ext3_trim_fs(struct super_block *sb, struct fstrim_range *range)
> +{
> +	ext3_grpblk_t last_block, first_block, free_blocks;
> +	unsigned long first_group, last_group;
> +	unsigned long group, ngroups;
> +	struct ext3_group_desc *gdp;
> +	struct ext3_super_block *es;
> +	uint64_t start, len, minlen, trimmed;
> +	int ret = 0;
We probably need to add this:

ext3_fsblk_t blocks_count = le32_to_cpu(EXT3_SB(sb)->s_es->s_blocks_count);

> +
> +	start = range->start >> sb->s_blocksize_bits;
> +	len = range->len >> sb->s_blocksize_bits;
> +	minlen = range->minlen >> sb->s_blocksize_bits;
> +	trimmed = 0;
and this:

	if (len > blocks_count)
		len = blocks_count - start;

Because when determining last group through ext3_get_group_no_and_offset()
the result may be wrong in cases when range->start and range-len are too
big, because it may overflow when summing up those two numbers.


> +
> +	if (unlikely(minlen > EXT3_BLOCKS_PER_GROUP(sb)))
> +		return -EINVAL;
> +
> +	es = EXT3_SB(sb)->s_es;
> +	ngroups = EXT3_SB(sb)->s_groups_count;
> +	smp_rmb();
> +
> +	/* Determine first and last group to examine based on start and len */
> +	ext3_get_group_no_and_offset(sb, (ext3_fsblk_t) start,
> +				     &first_group, &first_block);
> +	ext3_get_group_no_and_offset(sb, (ext3_fsblk_t) (start + len),
							^^^^^^^^^^^^^
							here
overflow may occur.

> +				     &last_group, &last_block);
> +	last_group = (last_group > ngroups - 1) ? ngroups - 1 : last_group;
> +	last_block = EXT3_BLOCKS_PER_GROUP(sb);
> +
> +	if (first_group > last_group)
> +		return -EINVAL;
> +
> +	for (group = first_group; group <= last_group; group++) {
> +		gdp = ext3_get_group_desc(sb, group, NULL);
> +		if (!gdp)
> +			break;
> +
> +		free_blocks = le16_to_cpu(gdp->bg_free_blocks_count);
> +		if (free_blocks < minlen)
> +			continue;
> +
> +		if (len >= EXT3_BLOCKS_PER_GROUP(sb))
> +			len -= (EXT3_BLOCKS_PER_GROUP(sb) - first_block);
> +		else
> +			last_block = len;
> +
> +		ret = ext3_trim_all_free(sb, group, first_block,
> +					last_block, minlen);
> +		if (ret < 0)
> +			break;
> +
> +		trimmed += ret;
> +		first_block = 0;
> +	}
> +
> +	if (ret >= 0)
> +		ret = 0;
> +
> +	range->len = trimmed * sb->s_blocksize;
> +
> +	return ret;
> +}
> diff --git a/include/linux/ext3_fs.h b/include/linux/ext3_fs.h
> index 6ce1bca..a443965 100644
> --- a/include/linux/ext3_fs.h
> +++ b/include/linux/ext3_fs.h
> @@ -856,6 +856,7 @@ extern struct ext3_group_desc * ext3_get_group_desc(struct super_block * sb,
>  extern int ext3_should_retry_alloc(struct super_block *sb, int *retries);
>  extern void ext3_init_block_alloc_info(struct inode *);
>  extern void ext3_rsv_window_add(struct super_block *sb, struct ext3_reserve_window_node *rsv);
> +extern int ext3_trim_fs(struct super_block *sb, struct fstrim_range *range);
>  
>  /* dir.c */
>  extern int ext3_check_dir_entry(const char *, struct inode *,
> 

-- 
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ