linux-ext4 - [PATCH RFC] ext4: fallback unaligned part of dio to buffered IO

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [day] [month] [year] [list]
Message-Id: <20250710085910.123168-1-chensong_2000@189.cn>
Date: Thu, 10 Jul 2025 16:59:10 +0800
From: chensong_2000@....cn
To: tytso@....edu,
	adilger.kernel@...ger.ca
Cc: linux-ext4@...r.kernel.org,
	linux-kernel@...r.kernel.org,
	Song Chen <chensong_2000@....cn>
Subject: [PATCH RFC] ext4: fallback unaligned part of dio to buffered IO

From: Song Chen <chensong_2000@....cn>

When I was trying to read a big file in direct IO mode, if the
file was not page aligned, ext4 rejected the request in
iomap_dio_bio_iter which checks alignments of pos, addr and length
before submitting bio.

As my understanding, pos and addr must be block size aligned, but length
doesn't have to be. Instead of rejecting entire request which is so
frastrating to upper layer, this patch splits length into aligned part
and unaligned part. For the aligned part, still uses direct IO with
no problem, for the rest unaligned part, falls back to buffered IO.
This way looks more friendly to apps.

Please have a look at the patch in [1], it has to reopen the file
to read the unaligned part in upper layer, which doen't look
gracefully.

I guess I'm not the first one who brings it up, there must be a reason
to stop this porblem being addressed. unaligned write seems to be
addressed in [2] and [3]. Side effects or complexity, I would like to know.

This is just a draft of RFC, I haven't taken care of ubuf properly yet,
please let me know if you like this idea or not, then I can drop it or
go further, like introduce helpers to split iov_iter in lib/iov_iters.c

[1]:https://lore.kernel.org/all/20240730075755.10941-4-link@vivo.com/
[2]:https://lore.kernel.org/linux-ext4/20230314130759.642710-1-bfoster
@redhat.com/
[3]:https://lore.kernel.org/linux-ext4/20230810165559.946222-1-bfoster
@redhat.com/

Signed-off-by: Song Chen <chensong_2000@....cn>
---
 fs/ext4/file.c | 90 ++++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 83 insertions(+), 7 deletions(-)

diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 21df81347147..a059985d0b16 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -36,6 +36,12 @@
 #include "acl.h"
 #include "truncate.h"
 
+enum {
+	SHOULD_NOT_DIO,
+	SHOULD_DIO,
+	SHOULD_PARTIAL_DIO,
+};
+
 /*
  * Returns %true if the given DIO request should be attempted with DIO, or
  * %false if it should fall back to buffered I/O.
@@ -52,23 +58,89 @@
  *
  * This function implements the traditional ext4 behavior in all these cases.
  */
-static bool ext4_should_use_dio(struct kiocb *iocb, struct iov_iter *iter)
+static int ext4_should_use_dio(struct kiocb *iocb, struct iov_iter *iter)
 {
 	struct inode *inode = file_inode(iocb->ki_filp);
+	unsigned int len_mask = i_blocksize(inode) - 1;
+	unsigned int addr_mask = bdev_dma_alignment(inode->i_sb->s_bdev);
 	u32 dio_align = ext4_dio_alignment(inode);
 
+	/* inode doesn't support dio, fall back to buffered IO*/
 	if (dio_align == 0)
-		return false;
+		return SHOULD_NOT_DIO;
+
+	/* addr is misaligned, fall back to buffered IO*/
+	if (!iov_iter_is_aligned(iter, addr_mask, 0))
+		return SHOULD_NOT_DIO;
+
+	/* pos is misaligned, fall back to buffered IO*/
+	if (!IS_ALIGNED(iocb->ki_pos, len_mask))
+		return SHOULD_NOT_DIO;
+
+	/* length is misaligned*/
+	if (!iov_iter_is_aligned(iter, 0, len_mask)) {
+		/* if length is less than a block, fall back to buffered IO*/
+		if (iov_iter_count(iter) < i_blocksize(inode))
+			return SHOULD_NOT_DIO;
+		/*direct IO for aligned part, buffered IO for misaligned part*/
+		return SHOULD_PARTIAL_DIO;
+	}
 
-	if (dio_align == 1)
-		return true;
+	return SHOULD_DIO;
+}
 
-	return IS_ALIGNED(iocb->ki_pos | iov_iter_alignment(iter), dio_align);
+/*
+ * First of all, truncate the length to block size aligned and start
+ * a direct IO. If it goes well in iomap_dio_rw, fall back the rest
+ * unaligned part to buffered IO.
+ *
+ * At the end, return the sum bytes of direct IO and buffered IO.
+ */
+static ssize_t ext4_mixed_read_iter(struct kiocb *iocb, struct iov_iter *to)
+{
+	struct inode *inode = file_inode(iocb->ki_filp);
+	struct iov_iter to_misaligned = *to;
+	struct iovec iov;
+	ssize_t ret, ret_dio, ret_generic;
+
+	/* truncate iter->count to blocksize aligned and start direct IO */
+	iov_iter_truncate(to, ALIGN_DOWN(to->count, i_blocksize(inode)));
+	ret_dio = iomap_dio_rw(iocb, to, &ext4_iomap_ops, NULL, 0, NULL, 0);
+
+	if (ret_dio <= 0) {
+		ret = ret_dio;
+		goto out;
+	}
+
+	/* set up iter to misaligned part and start buffered IO*/
+	iov.iov_base = to->__iov->iov_base +  ret_dio;
+	iov.iov_len	 = to->__iov->iov_len -  ret_dio;
+
+	to_misaligned.__iov = &iov;
+	iov_iter_truncate(&to_misaligned, iov.iov_len);
+
+	iocb->ki_flags &= ~IOCB_DIRECT;
+	ret_generic = generic_file_read_iter(iocb, &to_misaligned);
+
+	if (ret_generic <= 0) {
+		ret  = ret_generic;
+		goto out;
+	}
+
+	ret = ret_dio + ret_generic;
+
+out:
+	iocb->ki_flags |= IOCB_DIRECT;
+	inode_unlock_shared(inode);
+	file_accessed(iocb->ki_filp);
+
+	return ret;
 }
 
 static ssize_t ext4_dio_read_iter(struct kiocb *iocb, struct iov_iter *to)
 {
 	ssize_t ret;
+	int dio_supported;
 	struct inode *inode = file_inode(iocb->ki_filp);
 
 	if (iocb->ki_flags & IOCB_NOWAIT) {
@@ -78,7 +150,8 @@ static ssize_t ext4_dio_read_iter(struct kiocb *iocb, struct iov_iter *to)
 		inode_lock_shared(inode);
 	}
 
-	if (!ext4_should_use_dio(iocb, to)) {
+	dio_supported = ext4_should_use_dio(iocb, to);
+	if (dio_supported == SHOULD_NOT_DIO) {
 		inode_unlock_shared(inode);
 		/*
 		 * Fallback to buffered I/O if the operation being performed on
@@ -91,6 +164,9 @@ static ssize_t ext4_dio_read_iter(struct kiocb *iocb, struct iov_iter *to)
 		return generic_file_read_iter(iocb, to);
 	}
 
+	if (dio_supported == SHOULD_PARTIAL_DIO)
+		return ext4_mixed_read_iter(iocb, to);
+
 	ret = iomap_dio_rw(iocb, to, &ext4_iomap_ops, NULL, 0, NULL, 0);
 	inode_unlock_shared(inode);
 
@@ -537,7 +613,7 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
 	}
 
 	/* Fallback to buffered I/O if the inode does not support direct I/O. */
-	if (!ext4_should_use_dio(iocb, from)) {
+	if (ext4_should_use_dio(iocb, from) != SHOULD_DIO) {
 		if (ilock_shared)
 			inode_unlock_shared(inode);
 		else
-- 
2.25.1