lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20221214170125.bixz46ybm76rtbzf@quack3>
Date:   Wed, 14 Dec 2022 18:01:25 +0100
From:   Jan Kara <jack@...e.cz>
To:     Zhang Yi <yi.zhang@...wei.com>
Cc:     linux-ext4@...r.kernel.org, tytso@....edu,
        adilger.kernel@...ger.ca, jack@...e.cz, yi.zhang@...weicloud.com,
        yukuai3@...wei.com
Subject: Re: [RFC PATCH] ext4: dio take shared inode lock when overwriting
 preallocated blocks

On Sat 03-12-22 18:39:56, Zhang Yi wrote:
> In the dio write path, we only take shared inode lock for the case of
> aligned overwriting initialized blocks inside EOF. But for overwriting
> preallocated blocks, it may only need to split unwritten extents, this
> procedure has been protected under i_data_sem lock, it's safe to
> release the exclusive inode lock and take shared inode lock.
> 
> This could give a significant speed up for multi-threaded writes. Test
> on Intel Xeon Gold 6140 and nvme SSD with below fio parameters.
> 
>  direct=1
>  ioengine=libaio
>  iodepth=10
>  numjobs=10
>  runtime=60
>  rw=randwrite
>  size=100G
> 
> And the test result are:
> Before:
>  bs=4k       IOPS=11.1k, BW=43.2MiB/s
>  bs=16k      IOPS=11.1k, BW=173MiB/s
>  bs=64k      IOPS=11.2k, BW=697MiB/s
> 
> After:
>  bs=4k       IOPS=41.4k, BW=162MiB/s
>  bs=16k      IOPS=41.3k, BW=646MiB/s
>  bs=64k      IOPS=13.5k, BW=843MiB/s
> 
> Signed-off-by: Zhang Yi <yi.zhang@...wei.com>
> ---
>  It passed xfstests auto mode with 1k and 4k blocksize.

Besides some naming nits (see below) I think this should work. But I have
to say I'm a bit uneasy about this because we will now be changing block
mapping from unwritten to written only with shared i_rwsem. OTOH that
happens during writeback as well so we should be fine and the gain is very
nice.

Out of curiosity do you have a real usecase for this?

>  fs/ext4/file.c | 34 ++++++++++++++++++++++------------
>  1 file changed, 22 insertions(+), 12 deletions(-)
> 
> diff --git a/fs/ext4/file.c b/fs/ext4/file.c
> index a7a597c727e6..7edac94025ac 100644
> --- a/fs/ext4/file.c
> +++ b/fs/ext4/file.c
> @@ -202,8 +202,9 @@ ext4_extending_io(struct inode *inode, loff_t offset, size_t len)
>  	return false;
>  }
>  
> -/* Is IO overwriting allocated and initialized blocks? */
> -static bool ext4_overwrite_io(struct inode *inode, loff_t pos, loff_t len)
> +/* Is IO overwriting allocated or initialized blocks? */
> +static bool ext4_overwrite_io(struct inode *inode,
> +			      loff_t pos, loff_t len, bool *inited)

'inited' just sounds weird. Correct is 'initialized' which is a bit long.
Maybe just negate the meaning and call this 'unwritten'?

>  {
>  	struct ext4_map_blocks map;
>  	unsigned int blkbits = inode->i_blkbits;
> @@ -217,12 +218,15 @@ static bool ext4_overwrite_io(struct inode *inode, loff_t pos, loff_t len)
>  	blklen = map.m_len;
>  
>  	err = ext4_map_blocks(NULL, inode, &map, 0);
> +	if (err != blklen)
> +		return false;
>  	/*
>  	 * 'err==len' means that all of the blocks have been preallocated,
> -	 * regardless of whether they have been initialized or not. To exclude
> -	 * unwritten extents, we need to check m_flags.
> +	 * regardless of whether they have been initialized or not. We need to
> +	 * check m_flags to distinguish the unwritten extents.
>  	 */
> -	return err == blklen && (map.m_flags & EXT4_MAP_MAPPED);
> +	*inited = !!(map.m_flags & EXT4_MAP_MAPPED);
> +	return true;
>  }
>  
>  static ssize_t ext4_generic_write_checks(struct kiocb *iocb,
> @@ -431,11 +435,16 @@ static const struct iomap_dio_ops ext4_dio_write_ops = {
>   * - For extending writes case we don't take the shared lock, since it requires
>   *   updating inode i_disksize and/or orphan handling with exclusive lock.
>   *
> - * - shared locking will only be true mostly with overwrites. Otherwise we will
> - *   switch to exclusive i_rwsem lock.
> + * - shared locking will only be true mostly with overwrites, include
> + *   initialized blocks and unwritten blocks. For overwrite unwritten blocks
> + *   we protects splitting extents by i_data_sem in ext4_inode_info, so we can
> + *   also release exclusive i_rwsem lock.
> + *
> + * - Otherwise we will switch to exclusive i_rwsem lock.
>   */
>  static ssize_t ext4_dio_write_checks(struct kiocb *iocb, struct iov_iter *from,
> -				     bool *ilock_shared, bool *extend)
> +				     bool *ilock_shared, bool *extend,
> +				     bool *overwrite)

And it would be good to preserve the argument name in other functions as
well - i.e., keep using 'unwritten' here as well.

>  {
>  	struct file *file = iocb->ki_filp;
>  	struct inode *inode = file_inode(file);
> @@ -459,7 +468,7 @@ static ssize_t ext4_dio_write_checks(struct kiocb *iocb, struct iov_iter *from,
>  	 * in file_modified().
>  	 */
>  	if (*ilock_shared && (!IS_NOSEC(inode) || *extend ||
> -	     !ext4_overwrite_io(inode, offset, count))) {
> +	     !ext4_overwrite_io(inode, offset, count, overwrite))) {
>  		if (iocb->ki_flags & IOCB_NOWAIT) {
>  			ret = -EAGAIN;
>  			goto out;
> @@ -491,7 +500,7 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
>  	loff_t offset = iocb->ki_pos;
>  	size_t count = iov_iter_count(from);
>  	const struct iomap_ops *iomap_ops = &ext4_iomap_ops;
> -	bool extend = false, unaligned_io = false;
> +	bool extend = false, unaligned_io = false, overwrite = false;
>  	bool ilock_shared = true;
>  
>  	/*
> @@ -534,7 +543,8 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
>  		return ext4_buffered_write_iter(iocb, from);
>  	}
>  
> -	ret = ext4_dio_write_checks(iocb, from, &ilock_shared, &extend);
> +	ret = ext4_dio_write_checks(iocb, from,
> +				    &ilock_shared, &extend, &overwrite);
>  	if (ret <= 0)
>  		return ret;
>  
> @@ -582,7 +592,7 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
>  		ext4_journal_stop(handle);
>  	}
>  
> -	if (ilock_shared)
> +	if (overwrite)
>  		iomap_ops = &ext4_iomap_overwrite_ops;
>  	ret = iomap_dio_rw(iocb, from, iomap_ops, &ext4_dio_write_ops,
>  			   (unaligned_io || extend) ? IOMAP_DIO_FORCE_WAIT : 0,
-- 
Jan Kara <jack@...e.com>
SUSE Labs, CR

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ