linux-kernel - [PATCH 2/2] iomap: align writeback to RAID stripe boundaries

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [thread-next>] [day] [month] [year] [list]
Message-ID: <55deda1d-967d-4d68-a9ba-4d5139374a37@cybernetics.com>
Date: Tue, 29 Jul 2025 12:13:42 -0400
From: Tony Battersby <tonyb@...ernetics.com>
To: Song Liu <song@...nel.org>, Yu Kuai <yukuai3@...wei.com>,
 Christian Brauner <brauner@...nel.org>, "Darrick J. Wong"
 <djwong@...nel.org>, "Matthew Wilcox (Oracle)" <willy@...radead.org>
Cc: linux-raid@...r.kernel.org, linux-xfs@...r.kernel.org,
 linux-fsdevel@...r.kernel.org, linux-kernel@...r.kernel.org
Subject: [PATCH 2/2] iomap: align writeback to RAID stripe boundaries

Improve writeback performance on RAID-4/5/6 by aligning writes to stripe
boundaries: each bio is clipped at the next io_opt boundary and the
remainder is issued in a new bio.  This relies on io_opt being set to the
stripe size (or a multiple of it) when
BLK_FEAT_RAID_PARTIAL_STRIPES_EXPENSIVE is set.

Benchmark of sequential writing to a large file on XFS using
io_uring with 8-disk md-raid6:
Before:      601.0 MB/s
After:       614.5 MB/s
Improvement: +2.3%

Signed-off-by: Tony Battersby <tonyb@...ernetics.com>
---
 fs/iomap/buffered-io.c | 175 +++++++++++++++++++++++++----------------
 1 file changed, 106 insertions(+), 69 deletions(-)

diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index fb4519158f3a..f9020f916268 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -1685,81 +1685,118 @@ static int iomap_add_to_ioend(struct iomap_writepage_ctx *wpc,
 		struct inode *inode, loff_t pos, loff_t end_pos,
 		unsigned len)
 {
-	struct iomap_folio_state *ifs = folio->private;
-	size_t poff = offset_in_folio(folio, pos);
-	unsigned int ioend_flags = 0;
-	int error;
-
-	if (wpc->iomap.type == IOMAP_UNWRITTEN)
-		ioend_flags |= IOMAP_IOEND_UNWRITTEN;
-	if (wpc->iomap.flags & IOMAP_F_SHARED)
-		ioend_flags |= IOMAP_IOEND_SHARED;
-	if (folio_test_dropbehind(folio))
-		ioend_flags |= IOMAP_IOEND_DONTCACHE;
-	if (pos == wpc->iomap.offset && (wpc->iomap.flags & IOMAP_F_BOUNDARY))
-		ioend_flags |= IOMAP_IOEND_BOUNDARY;
+	struct queue_limits *lim = bdev_limits(wpc->iomap.bdev);
+	unsigned int io_align =
+		(lim->features & BLK_FEAT_RAID_PARTIAL_STRIPES_EXPENSIVE) ?
+		lim->io_opt >> SECTOR_SHIFT : 0;
 
-	if (!wpc->ioend || !iomap_can_add_to_ioend(wpc, pos, ioend_flags)) {
+	do {
+		struct iomap_folio_state *ifs = folio->private;
+		size_t poff = offset_in_folio(folio, pos);
+		unsigned int ioend_flags = 0;
+		unsigned int rem_len = 0;
+		int error;
+
+		if (wpc->iomap.type == IOMAP_UNWRITTEN)
+			ioend_flags |= IOMAP_IOEND_UNWRITTEN;
+		if (wpc->iomap.flags & IOMAP_F_SHARED)
+			ioend_flags |= IOMAP_IOEND_SHARED;
+		if (folio_test_dropbehind(folio))
+			ioend_flags |= IOMAP_IOEND_DONTCACHE;
+		if (pos == wpc->iomap.offset &&
+		    (wpc->iomap.flags & IOMAP_F_BOUNDARY))
+			ioend_flags |= IOMAP_IOEND_BOUNDARY;
+
+		if (!wpc->ioend ||
+		    !iomap_can_add_to_ioend(wpc, pos, ioend_flags)) {
 new_ioend:
-		error = iomap_submit_ioend(wpc, 0);
-		if (error)
-			return error;
-		wpc->ioend = iomap_alloc_ioend(wpc, wbc, inode, pos,
-				ioend_flags);
-	}
+			error = iomap_submit_ioend(wpc, 0);
+			if (error)
+				return error;
+			wpc->ioend = iomap_alloc_ioend(wpc, wbc, inode, pos,
+					ioend_flags);
+		}
 
-	if (!bio_add_folio(&wpc->ioend->io_bio, folio, len, poff))
-		goto new_ioend;
+		/* Align writes to io_align if given. */
+		if (io_align && !(wpc->iomap.flags & IOMAP_F_ANON_WRITE)) {
+			sector_t lba = bio_end_sector(&wpc->ioend->io_bio);
+			unsigned int mod = lba % io_align;
+			unsigned int max_len;
 
-	if (ifs)
-		atomic_add(len, &ifs->write_bytes_pending);
+			/*
+			 * If the end sector is already aligned and the bio is
+			 * nonempty, then start a new bio for the remainder.
+			 */
+			if (!mod && wpc->ioend->io_bio.bi_iter.bi_size)
+				goto new_ioend;
 
-	/*
-	 * Clamp io_offset and io_size to the incore EOF so that ondisk
-	 * file size updates in the ioend completion are byte-accurate.
-	 * This avoids recovering files with zeroed tail regions when
-	 * writeback races with appending writes:
-	 *
-	 *    Thread 1:                  Thread 2:
-	 *    ------------               -----------
-	 *    write [A, A+B]
-	 *    update inode size to A+B
-	 *    submit I/O [A, A+BS]
-	 *                               write [A+B, A+B+C]
-	 *                               update inode size to A+B+C
-	 *    <I/O completes, updates disk size to min(A+B+C, A+BS)>
-	 *    <power failure>
-	 *
-	 *  After reboot:
-	 *    1) with A+B+C < A+BS, the file has zero padding in range
-	 *       [A+B, A+B+C]
-	 *
-	 *    |<     Block Size (BS)   >|
-	 *    |DDDDDDDDDDDD0000000000000|
-	 *    ^           ^        ^
-	 *    A          A+B     A+B+C
-	 *                       (EOF)
-	 *
-	 *    2) with A+B+C > A+BS, the file has zero padding in range
-	 *       [A+B, A+BS]
-	 *
-	 *    |<     Block Size (BS)   >|<     Block Size (BS)    >|
-	 *    |DDDDDDDDDDDD0000000000000|00000000000000000000000000|
-	 *    ^           ^             ^           ^
-	 *    A          A+B           A+BS       A+B+C
-	 *                             (EOF)
-	 *
-	 *    D = Valid Data
-	 *    0 = Zero Padding
-	 *
-	 * Note that this defeats the ability to chain the ioends of
-	 * appending writes.
-	 */
-	wpc->ioend->io_size += len;
-	if (wpc->ioend->io_offset + wpc->ioend->io_size > end_pos)
-		wpc->ioend->io_size = end_pos - wpc->ioend->io_offset;
+			/*
+			 * Clip the end of the bio to the alignment boundary.
+			 */
+			max_len = (io_align - mod) << SECTOR_SHIFT;
+			if (len > max_len) {
+				rem_len = len - max_len;
+				len = max_len;
+			}
+		}
+
+		if (!bio_add_folio(&wpc->ioend->io_bio, folio, len, poff))
+			goto new_ioend;
+
+		if (ifs)
+			atomic_add(len, &ifs->write_bytes_pending);
+
+		/*
+		 * Clamp io_offset and io_size to the incore EOF so that ondisk
+		 * file size updates in the ioend completion are byte-accurate.
+		 * This avoids recovering files with zeroed tail regions when
+		 * writeback races with appending writes:
+		 *
+		 *    Thread 1:                  Thread 2:
+		 *    ------------               -----------
+		 *    write [A, A+B]
+		 *    update inode size to A+B
+		 *    submit I/O [A, A+BS]
+		 *                               write [A+B, A+B+C]
+		 *                               update inode size to A+B+C
+		 *    <I/O completes, updates disk size to min(A+B+C, A+BS)>
+		 *    <power failure>
+		 *
+		 *  After reboot:
+		 *    1) with A+B+C < A+BS, the file has zero padding in range
+		 *       [A+B, A+B+C]
+		 *
+		 *    |<     Block Size (BS)   >|
+		 *    |DDDDDDDDDDDD0000000000000|
+		 *    ^           ^        ^
+		 *    A          A+B     A+B+C
+		 *                       (EOF)
+		 *
+		 *    2) with A+B+C > A+BS, the file has zero padding in range
+		 *       [A+B, A+BS]
+		 *
+		 *    |<     Block Size (BS)   >|<     Block Size (BS)    >|
+		 *    |DDDDDDDDDDDD0000000000000|00000000000000000000000000|
+		 *    ^           ^             ^           ^
+		 *    A          A+B           A+BS       A+B+C
+		 *                             (EOF)
+		 *
+		 *    D = Valid Data
+		 *    0 = Zero Padding
+		 *
+		 * Note that this defeats the ability to chain the ioends of
+		 * appending writes.
+		 */
+		wpc->ioend->io_size += len;
+		if (wpc->ioend->io_offset + wpc->ioend->io_size > end_pos)
+			wpc->ioend->io_size = end_pos - wpc->ioend->io_offset;
+
+		wbc_account_cgroup_owner(wbc, folio, len);
+
+		pos += len;
+		len = rem_len;
+	} while (len);
 
-	wbc_account_cgroup_owner(wbc, folio, len);
 	return 0;
 }
 
-- 
2.43.0