lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Date:	Thu, 22 Oct 2015 11:11:44 -0700
From:	Jaegeuk Kim <jaegeuk@...nel.org>
To:	Chao Yu <chao2.yu@...sung.com>
Cc:	linux-f2fs-devel@...ts.sourceforge.net,
	linux-kernel@...r.kernel.org
Subject: Re: [PATCH] f2fs: support file defragment

Hi,

On Thu, Oct 22, 2015 at 07:59:14PM +0800, Chao Yu wrote:
> This patch introduces a new ioctl F2FS_IOC_DEFRAGMENT to support file
> defragment in a specified range of regular file.
> 
> This ioctl can be used in very limited workload: if user expects high
> sequential read performance in randomly written file, this interface
> can be used for defragmentation, after that file can be written as
> continuous as possible in the device.
> 
> Meanwhile, it has side-effect, it will make holes in segments where
> blocks located originally, so it's better to trigger GC to eliminate
> fragment in segments.
> 
> Signed-off-by: Chao Yu <chao2.yu@...sung.com>
> ---
>  fs/f2fs/data.c |   6 +-
>  fs/f2fs/f2fs.h |   8 +++
>  fs/f2fs/file.c | 200 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
>  3 files changed, 213 insertions(+), 1 deletion(-)
> 
> diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
> index 972eab7..5bb375a 100644
> --- a/fs/f2fs/data.c
> +++ b/fs/f2fs/data.c
> @@ -566,7 +566,7 @@ out:
>   *     b. do not use extent cache for better performance
>   *     c. give the block addresses to blockdev
>   */
> -static int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map,
> +int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map,
>  						int create, int flag)
>  {
>  	unsigned int maxblocks = map->m_len;
> @@ -1354,6 +1354,10 @@ static int f2fs_write_data_pages(struct address_space *mapping,
>  			available_free_memory(sbi, DIRTY_DENTS))
>  		goto skip_write;
>  
> +	/* skip writing during file defragment */
> +	if (is_inode_flag_set(F2FS_I(inode), FI_DO_DEFRAG))
> +		goto skip_write;
> +
>  	/* during POR, we don't need to trigger writepage at all. */
>  	if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING)))
>  		goto skip_write;
> diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
> index 9db5500..068813c 100644
> --- a/fs/f2fs/f2fs.h
> +++ b/fs/f2fs/f2fs.h
> @@ -234,6 +234,7 @@ static inline bool __has_cursum_space(struct f2fs_summary_block *sum, int size,
>  #define F2FS_IOC_ABORT_VOLATILE_WRITE	_IO(F2FS_IOCTL_MAGIC, 5)
>  #define F2FS_IOC_GARBAGE_COLLECT	_IO(F2FS_IOCTL_MAGIC, 6)
>  #define F2FS_IOC_WRITE_CHECKPOINT	_IO(F2FS_IOCTL_MAGIC, 7)
> +#define F2FS_IOC_DEFRAGMENT		_IO(F2FS_IOCTL_MAGIC, 8)
>  
>  #define F2FS_IOC_SET_ENCRYPTION_POLICY					\
>  		_IOR('f', 19, struct f2fs_encryption_policy)
> @@ -260,6 +261,11 @@ static inline bool __has_cursum_space(struct f2fs_summary_block *sum, int size,
>  #define F2FS_IOC32_SETFLAGS             FS_IOC32_SETFLAGS
>  #endif
>  
> +struct f2fs_defragment {
> +	u64 start;
> +	u64 len;
> +};
> +
>  /*
>   * For INODE and NODE manager
>   */
> @@ -1416,6 +1422,7 @@ enum {
>  	FI_DROP_CACHE,		/* drop dirty page cache */
>  	FI_DATA_EXIST,		/* indicate data exists */
>  	FI_INLINE_DOTS,		/* indicate inline dot dentries */
> +	FI_DO_DEFRAG,		/* indicate defragment is running */
>  };
>  
>  static inline void set_inode_flag(struct f2fs_inode_info *fi, int flag)
> @@ -1847,6 +1854,7 @@ struct page *find_data_page(struct inode *, pgoff_t);
>  struct page *get_lock_data_page(struct inode *, pgoff_t, bool);
>  struct page *get_new_data_page(struct inode *, struct page *, pgoff_t, bool);
>  int do_write_data_page(struct f2fs_io_info *);
> +int f2fs_map_blocks(struct inode *, struct f2fs_map_blocks *, int, int);
>  int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *, u64, u64);
>  void f2fs_invalidate_page(struct page *, unsigned int, unsigned int);
>  int f2fs_release_page(struct page *, gfp_t);
> diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
> index a197215..ad59694 100644
> --- a/fs/f2fs/file.c
> +++ b/fs/f2fs/file.c
> @@ -1646,6 +1646,204 @@ static int f2fs_ioc_write_checkpoint(struct file *filp, unsigned long arg)
>  	return 0;
>  }
>  
> +static int f2fs_defragment_range(struct f2fs_sb_info *sbi,
> +					struct file *filp,
> +					struct f2fs_defragment *range)
> +{
> +	struct inode *inode = file_inode(filp);
> +	struct f2fs_map_blocks map;
> +	struct extent_info ei;
> +	pgoff_t pg_start, pg_end;
> +	unsigned int blk_per_seg = 1 << sbi->log_blocks_per_seg;
> +	unsigned int total = 0, sec_num;
> +	unsigned int pages_per_sec = sbi->segs_per_sec *
> +					(1 << sbi->log_blocks_per_seg);
> +	block_t blk_end = 0;
> +	bool fragmented = false;
> +	int err = 0;
> +
> +	pg_start = range->start >> PAGE_CACHE_SHIFT;
> +	pg_end = (range->start + range->len) >> PAGE_CACHE_SHIFT;
> +
> +	f2fs_balance_fs(sbi);
> +
> +	mutex_lock(&inode->i_mutex);
> +
> +	/* writeback all dirty pages in the range */
> +	err = filemap_write_and_wait_range(inode->i_mapping, range->start,
> +						range->start + range->len);
> +	if (err)
> +		goto out;
> +
> +	/*
> +	 * lookup mapping info in extent cache, skip defragmenting if physical
> +	 * block addresses are continuous.
> +	 */
> +	if (f2fs_lookup_extent_cache(inode, pg_start, &ei)) {
> +		if (ei.fofs + ei.len >= pg_end)
> +			goto out;
> +	}
> +
> +	map.m_lblk = pg_start;
> +	map.m_len = pg_end - pg_start;
> +
> +	/*
> +	 * lookup mapping info in dnode page cache, skip defragmenting if all
> +	 * physical block addresses are continuous even if there are hole(s)
> +	 * in logical blocks.
> +	 */
> +	while (map.m_lblk < pg_end) {
> +		map.m_flags = 0;
> +		err = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_READ);

How about using f2fs_fiemap to get the extent information?

> +		if (err)
> +			goto out;
> +
> +		if (!(map.m_flags & F2FS_MAP_FLAGS)) {
> +			map.m_lblk++;
> +			map.m_len--;
> +			continue;
> +		}
> +
> +		if (blk_end && blk_end != map.m_pblk) {
> +			fragmented = true;
> +			break;
> +		}
> +		blk_end = map.m_pblk + map.m_len;
> +
> +		map.m_lblk += map.m_len;
> +		map.m_len = pg_end - map.m_lblk;
> +	}
> +
> +	if (!fragmented)
> +		goto out;
> +
> +	map.m_lblk = pg_start;
> +	map.m_len = pg_end - pg_start;
> +
> +	sec_num = (map.m_len + pages_per_sec - 1) / pages_per_sec;
> +
> +	if (has_not_enough_free_secs(sbi, sec_num))

Later, ->writepage will handle this?

> +		goto out;
> +
> +	while (map.m_lblk < pg_end) {
> +		pgoff_t idx;
> +		int cnt = 0;

What about this?

	for_each_extents(extent_info) {
		page = get_lock_data_page(inode, idx, true);

		set_page_dirty(page);
	}
	filemap_fdatawrite();

Thanks,

> +
> +do_map:
> +		map.m_flags = 0;
> +		err = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_READ);
> +		if (err)
> +			goto out;
> +
> +		if (!(map.m_flags & F2FS_MAP_FLAGS)) {
> +			map.m_lblk++;
> +			continue;
> +		}
> +
> +		set_inode_flag(F2FS_I(inode), FI_DO_DEFRAG);
> +
> +		idx = map.m_lblk;
> +		while (idx < map.m_lblk + map.m_len && cnt < blk_per_seg) {
> +			struct address_space *mapping = inode->i_mapping;
> +			struct page *page;
> +
> +			page = find_or_create_page(mapping, idx, GFP_NOFS);
> +			if (!page) {
> +				err = -ENOMEM;
> +				goto out;
> +			}
> +
> +			f2fs_wait_on_page_writeback(page, DATA);
> +
> +			if (!PageUptodate(page)) {
> +				err = mapping->a_ops->readpage(filp, page);
> +				if (unlikely(err)) {
> +					f2fs_put_page(page, 0);
> +					goto out;
> +				}
> +
> +				lock_page_killable(page);
> +
> +				if (!PageUptodate(page)) {
> +					f2fs_put_page(page, 1);
> +					err = -EIO;
> +					goto out;
> +				}
> +			}
> +			set_page_dirty(page);
> +			f2fs_put_page(page, 1);
> +
> +			idx++;
> +			cnt++;
> +			total++;
> +		}
> +
> +		map.m_lblk = idx;
> +		map.m_len = pg_end - idx;
> +
> +		if (idx < pg_end && cnt < blk_per_seg)
> +			goto do_map;
> +
> +		clear_inode_flag(F2FS_I(inode), FI_DO_DEFRAG);
> +
> +		err = filemap_fdatawrite(inode->i_mapping);
> +		if (err)
> +			goto out;
> +	}
> +out:
> +	mutex_unlock(&inode->i_mutex);
> +	if (!err)
> +		range->len = (u64)total << PAGE_CACHE_SHIFT;
> +	return err;
> +}
> +
> +static int f2fs_ioc_defragment(struct file *filp, unsigned long arg)
> +{
> +	struct inode *inode = file_inode(filp);
> +	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
> +	struct f2fs_defragment range;
> +	int err;
> +
> +	if (!capable(CAP_SYS_ADMIN))
> +		return -EPERM;
> +
> +	if (!S_ISREG(inode->i_mode))
> +		return -EINVAL;
> +
> +	err = mnt_want_write_file(filp);
> +	if (err)
> +		return err;
> +
> +	if (f2fs_readonly(sbi->sb)) {
> +		err = -EROFS;
> +		goto out;
> +	}
> +
> +	if (copy_from_user(&range, (struct f2fs_defragment __user *)arg,
> +							sizeof(range))) {
> +		err = -EFAULT;
> +		goto out;
> +	}
> +
> +	/* verify alignment of offset & size */
> +	if (range.start & (F2FS_BLKSIZE - 1) ||
> +		range.len & (F2FS_BLKSIZE - 1)) {
> +		err = -EINVAL;
> +		goto out;
> +	}
> +
> +	err = f2fs_defragment_range(sbi, filp, &range);
> +	if (err < 0)
> +		goto out;
> +
> +	if (copy_to_user((struct f2fs_defragment __user *)arg, &range,
> +							sizeof(range)))
> +		err = -EFAULT;
> +out:
> +	mnt_drop_write_file(filp);
> +	return err;
> +}
> +
>  long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
>  {
>  	switch (cmd) {
> @@ -1679,6 +1877,8 @@ long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
>  		return f2fs_ioc_gc(filp, arg);
>  	case F2FS_IOC_WRITE_CHECKPOINT:
>  		return f2fs_ioc_write_checkpoint(filp, arg);
> +	case F2FS_IOC_DEFRAGMENT:
> +		return f2fs_ioc_defragment(filp, arg);
>  	default:
>  		return -ENOTTY;
>  	}
> -- 
> 2.6.1
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ