lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [day] [month] [year] [list]
Message-ID: <2d9fc63c-091d-4224-b5dd-968bceed72ee@189.cn>
Date: Wed, 23 Jul 2025 13:44:19 +0800
From: Song Chen <chensong_2000@....cn>
To: tytso@....edu, adilger.kernel@...ger.ca
Cc: linux-ext4@...r.kernel.org, linux-kernel@...r.kernel.org
Subject: Re: [PATCH RFC] ext4: fallback unaligned part of dio to buffered IO

Dear maintainers,

This is kindly ping, i would appreciate it if you could have a look and 
give me your feedback.

many thanks.

BR

Song

在 2025/7/10 16:59, chensong_2000@....cn 写道:
> From: Song Chen <chensong_2000@....cn>
> 
> When I was trying to read a big file in direct IO mode, if the
> file was not page aligned, ext4 rejected the request in
> iomap_dio_bio_iter which checks alignments of pos, addr and length
> before submitting bio.
> 
> As my understanding, pos and addr must be block size aligned, but length
> doesn't have to be. Instead of rejecting entire request which is so
> frastrating to upper layer, this patch splits length into aligned part
> and unaligned part. For the aligned part, still uses direct IO with
> no problem, for the rest unaligned part, falls back to buffered IO.
> This way looks more friendly to apps.
> 
> Please have a look at the patch in [1], it has to reopen the file
> to read the unaligned part in upper layer, which doen't look
> gracefully.
> 
> I guess I'm not the first one who brings it up, there must be a reason
> to stop this porblem being addressed. unaligned write seems to be
> addressed in [2] and [3]. Side effects or complexity, I would like to know.
> 
> This is just a draft of RFC, I haven't taken care of ubuf properly yet,
> please let me know if you like this idea or not, then I can drop it or
> go further, like introduce helpers to split iov_iter in lib/iov_iters.c
> 
> [1]:https://lore.kernel.org/all/20240730075755.10941-4-link@vivo.com/
> [2]:https://lore.kernel.org/linux-ext4/20230314130759.642710-1-bfoster
> @redhat.com/
> [3]:https://lore.kernel.org/linux-ext4/20230810165559.946222-1-bfoster
> @redhat.com/
> 
> Signed-off-by: Song Chen <chensong_2000@....cn>
> ---
>   fs/ext4/file.c | 90 ++++++++++++++++++++++++++++++++++++++++++++++----
>   1 file changed, 83 insertions(+), 7 deletions(-)
> 
> diff --git a/fs/ext4/file.c b/fs/ext4/file.c
> index 21df81347147..a059985d0b16 100644
> --- a/fs/ext4/file.c
> +++ b/fs/ext4/file.c
> @@ -36,6 +36,12 @@
>   #include "acl.h"
>   #include "truncate.h"
>   
> +enum {
> +	SHOULD_NOT_DIO,
> +	SHOULD_DIO,
> +	SHOULD_PARTIAL_DIO,
> +};
> +
>   /*
>    * Returns %true if the given DIO request should be attempted with DIO, or
>    * %false if it should fall back to buffered I/O.
> @@ -52,23 +58,89 @@
>    *
>    * This function implements the traditional ext4 behavior in all these cases.
>    */
> -static bool ext4_should_use_dio(struct kiocb *iocb, struct iov_iter *iter)
> +static int ext4_should_use_dio(struct kiocb *iocb, struct iov_iter *iter)
>   {
>   	struct inode *inode = file_inode(iocb->ki_filp);
> +	unsigned int len_mask = i_blocksize(inode) - 1;
> +	unsigned int addr_mask = bdev_dma_alignment(inode->i_sb->s_bdev);
>   	u32 dio_align = ext4_dio_alignment(inode);
>   
> +	/* inode doesn't support dio, fall back to buffered IO*/
>   	if (dio_align == 0)
> -		return false;
> +		return SHOULD_NOT_DIO;
> +
> +	/* addr is misaligned, fall back to buffered IO*/
> +	if (!iov_iter_is_aligned(iter, addr_mask, 0))
> +		return SHOULD_NOT_DIO;
> +
> +	/* pos is misaligned, fall back to buffered IO*/
> +	if (!IS_ALIGNED(iocb->ki_pos, len_mask))
> +		return SHOULD_NOT_DIO;
> +
> +	/* length is misaligned*/
> +	if (!iov_iter_is_aligned(iter, 0, len_mask)) {
> +		/* if length is less than a block, fall back to buffered IO*/
> +		if (iov_iter_count(iter) < i_blocksize(inode))
> +			return SHOULD_NOT_DIO;
> +		/*direct IO for aligned part, buffered IO for misaligned part*/
> +		return SHOULD_PARTIAL_DIO;
> +	}
>   
> -	if (dio_align == 1)
> -		return true;
> +	return SHOULD_DIO;
> +}
>   
> -	return IS_ALIGNED(iocb->ki_pos | iov_iter_alignment(iter), dio_align);
> +/*
> + * First of all, truncate the length to block size aligned and start
> + * a direct IO. If it goes well in iomap_dio_rw, fall back the rest
> + * unaligned part to buffered IO.
> + *
> + * At the end, return the sum bytes of direct IO and buffered IO.
> + */
> +static ssize_t ext4_mixed_read_iter(struct kiocb *iocb, struct iov_iter *to)
> +{
> +	struct inode *inode = file_inode(iocb->ki_filp);
> +	struct iov_iter to_misaligned = *to;
> +	struct iovec iov;
> +	ssize_t ret, ret_dio, ret_generic;
> +
> +	/* truncate iter->count to blocksize aligned and start direct IO */
> +	iov_iter_truncate(to, ALIGN_DOWN(to->count, i_blocksize(inode)));
> +	ret_dio = iomap_dio_rw(iocb, to, &ext4_iomap_ops, NULL, 0, NULL, 0);
> +
> +	if (ret_dio <= 0) {
> +		ret = ret_dio;
> +		goto out;
> +	}
> +
> +	/* set up iter to misaligned part and start buffered IO*/
> +	iov.iov_base = to->__iov->iov_base +  ret_dio;
> +	iov.iov_len	 = to->__iov->iov_len -  ret_dio;
> +
> +	to_misaligned.__iov = &iov;
> +	iov_iter_truncate(&to_misaligned, iov.iov_len);
> +
> +	iocb->ki_flags &= ~IOCB_DIRECT;
> +	ret_generic = generic_file_read_iter(iocb, &to_misaligned);
> +
> +	if (ret_generic <= 0) {
> +		ret  = ret_generic;
> +		goto out;
> +	}
> +
> +	ret = ret_dio + ret_generic;
> +
> +out:
> +	iocb->ki_flags |= IOCB_DIRECT;
> +	inode_unlock_shared(inode);
> +	file_accessed(iocb->ki_filp);
> +
> +	return ret;
>   }
>   
>   static ssize_t ext4_dio_read_iter(struct kiocb *iocb, struct iov_iter *to)
>   {
>   	ssize_t ret;
> +	int dio_supported;
>   	struct inode *inode = file_inode(iocb->ki_filp);
>   
>   	if (iocb->ki_flags & IOCB_NOWAIT) {
> @@ -78,7 +150,8 @@ static ssize_t ext4_dio_read_iter(struct kiocb *iocb, struct iov_iter *to)
>   		inode_lock_shared(inode);
>   	}
>   
> -	if (!ext4_should_use_dio(iocb, to)) {
> +	dio_supported = ext4_should_use_dio(iocb, to);
> +	if (dio_supported == SHOULD_NOT_DIO) {
>   		inode_unlock_shared(inode);
>   		/*
>   		 * Fallback to buffered I/O if the operation being performed on
> @@ -91,6 +164,9 @@ static ssize_t ext4_dio_read_iter(struct kiocb *iocb, struct iov_iter *to)
>   		return generic_file_read_iter(iocb, to);
>   	}
>   
> +	if (dio_supported == SHOULD_PARTIAL_DIO)
> +		return ext4_mixed_read_iter(iocb, to);
> +
>   	ret = iomap_dio_rw(iocb, to, &ext4_iomap_ops, NULL, 0, NULL, 0);
>   	inode_unlock_shared(inode);
>   
> @@ -537,7 +613,7 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
>   	}
>   
>   	/* Fallback to buffered I/O if the inode does not support direct I/O. */
> -	if (!ext4_should_use_dio(iocb, from)) {
> +	if (ext4_should_use_dio(iocb, from) != SHOULD_DIO) {
>   		if (ilock_shared)
>   			inode_unlock_shared(inode);
>   		else

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ