[<prev] [next>] [<thread-prev] [day] [month] [year] [list]
Message-ID: <2d9fc63c-091d-4224-b5dd-968bceed72ee@189.cn>
Date: Wed, 23 Jul 2025 13:44:19 +0800
From: Song Chen <chensong_2000@....cn>
To: tytso@....edu, adilger.kernel@...ger.ca
Cc: linux-ext4@...r.kernel.org, linux-kernel@...r.kernel.org
Subject: Re: [PATCH RFC] ext4: fallback unaligned part of dio to buffered IO
Dear maintainers,
This is kindly ping, i would appreciate it if you could have a look and
give me your feedback.
many thanks.
BR
Song
在 2025/7/10 16:59, chensong_2000@....cn 写道:
> From: Song Chen <chensong_2000@....cn>
>
> When I was trying to read a big file in direct IO mode, if the
> file was not page aligned, ext4 rejected the request in
> iomap_dio_bio_iter which checks alignments of pos, addr and length
> before submitting bio.
>
> As my understanding, pos and addr must be block size aligned, but length
> doesn't have to be. Instead of rejecting entire request which is so
> frastrating to upper layer, this patch splits length into aligned part
> and unaligned part. For the aligned part, still uses direct IO with
> no problem, for the rest unaligned part, falls back to buffered IO.
> This way looks more friendly to apps.
>
> Please have a look at the patch in [1], it has to reopen the file
> to read the unaligned part in upper layer, which doen't look
> gracefully.
>
> I guess I'm not the first one who brings it up, there must be a reason
> to stop this porblem being addressed. unaligned write seems to be
> addressed in [2] and [3]. Side effects or complexity, I would like to know.
>
> This is just a draft of RFC, I haven't taken care of ubuf properly yet,
> please let me know if you like this idea or not, then I can drop it or
> go further, like introduce helpers to split iov_iter in lib/iov_iters.c
>
> [1]:https://lore.kernel.org/all/20240730075755.10941-4-link@vivo.com/
> [2]:https://lore.kernel.org/linux-ext4/20230314130759.642710-1-bfoster
> @redhat.com/
> [3]:https://lore.kernel.org/linux-ext4/20230810165559.946222-1-bfoster
> @redhat.com/
>
> Signed-off-by: Song Chen <chensong_2000@....cn>
> ---
> fs/ext4/file.c | 90 ++++++++++++++++++++++++++++++++++++++++++++++----
> 1 file changed, 83 insertions(+), 7 deletions(-)
>
> diff --git a/fs/ext4/file.c b/fs/ext4/file.c
> index 21df81347147..a059985d0b16 100644
> --- a/fs/ext4/file.c
> +++ b/fs/ext4/file.c
> @@ -36,6 +36,12 @@
> #include "acl.h"
> #include "truncate.h"
>
> +enum {
> + SHOULD_NOT_DIO,
> + SHOULD_DIO,
> + SHOULD_PARTIAL_DIO,
> +};
> +
> /*
> * Returns %true if the given DIO request should be attempted with DIO, or
> * %false if it should fall back to buffered I/O.
> @@ -52,23 +58,89 @@
> *
> * This function implements the traditional ext4 behavior in all these cases.
> */
> -static bool ext4_should_use_dio(struct kiocb *iocb, struct iov_iter *iter)
> +static int ext4_should_use_dio(struct kiocb *iocb, struct iov_iter *iter)
> {
> struct inode *inode = file_inode(iocb->ki_filp);
> + unsigned int len_mask = i_blocksize(inode) - 1;
> + unsigned int addr_mask = bdev_dma_alignment(inode->i_sb->s_bdev);
> u32 dio_align = ext4_dio_alignment(inode);
>
> + /* inode doesn't support dio, fall back to buffered IO*/
> if (dio_align == 0)
> - return false;
> + return SHOULD_NOT_DIO;
> +
> + /* addr is misaligned, fall back to buffered IO*/
> + if (!iov_iter_is_aligned(iter, addr_mask, 0))
> + return SHOULD_NOT_DIO;
> +
> + /* pos is misaligned, fall back to buffered IO*/
> + if (!IS_ALIGNED(iocb->ki_pos, len_mask))
> + return SHOULD_NOT_DIO;
> +
> + /* length is misaligned*/
> + if (!iov_iter_is_aligned(iter, 0, len_mask)) {
> + /* if length is less than a block, fall back to buffered IO*/
> + if (iov_iter_count(iter) < i_blocksize(inode))
> + return SHOULD_NOT_DIO;
> + /*direct IO for aligned part, buffered IO for misaligned part*/
> + return SHOULD_PARTIAL_DIO;
> + }
>
> - if (dio_align == 1)
> - return true;
> + return SHOULD_DIO;
> +}
>
> - return IS_ALIGNED(iocb->ki_pos | iov_iter_alignment(iter), dio_align);
> +/*
> + * First of all, truncate the length to block size aligned and start
> + * a direct IO. If it goes well in iomap_dio_rw, fall back the rest
> + * unaligned part to buffered IO.
> + *
> + * At the end, return the sum bytes of direct IO and buffered IO.
> + */
> +static ssize_t ext4_mixed_read_iter(struct kiocb *iocb, struct iov_iter *to)
> +{
> + struct inode *inode = file_inode(iocb->ki_filp);
> + struct iov_iter to_misaligned = *to;
> + struct iovec iov;
> + ssize_t ret, ret_dio, ret_generic;
> +
> + /* truncate iter->count to blocksize aligned and start direct IO */
> + iov_iter_truncate(to, ALIGN_DOWN(to->count, i_blocksize(inode)));
> + ret_dio = iomap_dio_rw(iocb, to, &ext4_iomap_ops, NULL, 0, NULL, 0);
> +
> + if (ret_dio <= 0) {
> + ret = ret_dio;
> + goto out;
> + }
> +
> + /* set up iter to misaligned part and start buffered IO*/
> + iov.iov_base = to->__iov->iov_base + ret_dio;
> + iov.iov_len = to->__iov->iov_len - ret_dio;
> +
> + to_misaligned.__iov = &iov;
> + iov_iter_truncate(&to_misaligned, iov.iov_len);
> +
> + iocb->ki_flags &= ~IOCB_DIRECT;
> + ret_generic = generic_file_read_iter(iocb, &to_misaligned);
> +
> + if (ret_generic <= 0) {
> + ret = ret_generic;
> + goto out;
> + }
> +
> + ret = ret_dio + ret_generic;
> +
> +out:
> + iocb->ki_flags |= IOCB_DIRECT;
> + inode_unlock_shared(inode);
> + file_accessed(iocb->ki_filp);
> +
> + return ret;
> }
>
> static ssize_t ext4_dio_read_iter(struct kiocb *iocb, struct iov_iter *to)
> {
> ssize_t ret;
> + int dio_supported;
> struct inode *inode = file_inode(iocb->ki_filp);
>
> if (iocb->ki_flags & IOCB_NOWAIT) {
> @@ -78,7 +150,8 @@ static ssize_t ext4_dio_read_iter(struct kiocb *iocb, struct iov_iter *to)
> inode_lock_shared(inode);
> }
>
> - if (!ext4_should_use_dio(iocb, to)) {
> + dio_supported = ext4_should_use_dio(iocb, to);
> + if (dio_supported == SHOULD_NOT_DIO) {
> inode_unlock_shared(inode);
> /*
> * Fallback to buffered I/O if the operation being performed on
> @@ -91,6 +164,9 @@ static ssize_t ext4_dio_read_iter(struct kiocb *iocb, struct iov_iter *to)
> return generic_file_read_iter(iocb, to);
> }
>
> + if (dio_supported == SHOULD_PARTIAL_DIO)
> + return ext4_mixed_read_iter(iocb, to);
> +
> ret = iomap_dio_rw(iocb, to, &ext4_iomap_ops, NULL, 0, NULL, 0);
> inode_unlock_shared(inode);
>
> @@ -537,7 +613,7 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
> }
>
> /* Fallback to buffered I/O if the inode does not support direct I/O. */
> - if (!ext4_should_use_dio(iocb, from)) {
> + if (ext4_should_use_dio(iocb, from) != SHOULD_DIO) {
> if (ilock_shared)
> inode_unlock_shared(inode);
> else
Powered by blists - more mailing lists