[<prev] [next>] [thread-next>] [day] [month] [year] [list]
Message-ID: <55deda1d-967d-4d68-a9ba-4d5139374a37@cybernetics.com>
Date: Tue, 29 Jul 2025 12:13:42 -0400
From: Tony Battersby <tonyb@...ernetics.com>
To: Song Liu <song@...nel.org>, Yu Kuai <yukuai3@...wei.com>,
Christian Brauner <brauner@...nel.org>, "Darrick J. Wong"
<djwong@...nel.org>, "Matthew Wilcox (Oracle)" <willy@...radead.org>
Cc: linux-raid@...r.kernel.org, linux-xfs@...r.kernel.org,
linux-fsdevel@...r.kernel.org, linux-kernel@...r.kernel.org
Subject: [PATCH 2/2] iomap: align writeback to RAID stripe boundaries
Improve writeback performance to RAID-4/5/6 by aligning writes to stripe
boundaries. This relies on io_opt being set to the stripe size (or
a multiple) when BLK_FEAT_RAID_PARTIAL_STRIPES_EXPENSIVE is set.
Benchmark of sequential writing to a large file on XFS using
io_uring with 8-disk md-raid6:
Before: 601.0 MB/s
After: 614.5 MB/s
Improvement: +2.2%
Signed-off-by: Tony Battersby <tonyb@...ernetics.com>
---
fs/iomap/buffered-io.c | 175 +++++++++++++++++++++++++----------------
1 file changed, 106 insertions(+), 69 deletions(-)
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index fb4519158f3a..f9020f916268 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -1685,81 +1685,118 @@ static int iomap_add_to_ioend(struct iomap_writepage_ctx *wpc,
struct inode *inode, loff_t pos, loff_t end_pos,
unsigned len)
{
- struct iomap_folio_state *ifs = folio->private;
- size_t poff = offset_in_folio(folio, pos);
- unsigned int ioend_flags = 0;
- int error;
-
- if (wpc->iomap.type == IOMAP_UNWRITTEN)
- ioend_flags |= IOMAP_IOEND_UNWRITTEN;
- if (wpc->iomap.flags & IOMAP_F_SHARED)
- ioend_flags |= IOMAP_IOEND_SHARED;
- if (folio_test_dropbehind(folio))
- ioend_flags |= IOMAP_IOEND_DONTCACHE;
- if (pos == wpc->iomap.offset && (wpc->iomap.flags & IOMAP_F_BOUNDARY))
- ioend_flags |= IOMAP_IOEND_BOUNDARY;
+ struct queue_limits *lim = bdev_limits(wpc->iomap.bdev);
+ unsigned int io_align =
+ (lim->features & BLK_FEAT_RAID_PARTIAL_STRIPES_EXPENSIVE) ?
+ lim->io_opt >> SECTOR_SHIFT : 0;
- if (!wpc->ioend || !iomap_can_add_to_ioend(wpc, pos, ioend_flags)) {
+ do {
+ struct iomap_folio_state *ifs = folio->private;
+ size_t poff = offset_in_folio(folio, pos);
+ unsigned int ioend_flags = 0;
+ unsigned int rem_len = 0;
+ int error;
+
+ if (wpc->iomap.type == IOMAP_UNWRITTEN)
+ ioend_flags |= IOMAP_IOEND_UNWRITTEN;
+ if (wpc->iomap.flags & IOMAP_F_SHARED)
+ ioend_flags |= IOMAP_IOEND_SHARED;
+ if (folio_test_dropbehind(folio))
+ ioend_flags |= IOMAP_IOEND_DONTCACHE;
+ if (pos == wpc->iomap.offset &&
+ (wpc->iomap.flags & IOMAP_F_BOUNDARY))
+ ioend_flags |= IOMAP_IOEND_BOUNDARY;
+
+ if (!wpc->ioend ||
+ !iomap_can_add_to_ioend(wpc, pos, ioend_flags)) {
new_ioend:
- error = iomap_submit_ioend(wpc, 0);
- if (error)
- return error;
- wpc->ioend = iomap_alloc_ioend(wpc, wbc, inode, pos,
- ioend_flags);
- }
+ error = iomap_submit_ioend(wpc, 0);
+ if (error)
+ return error;
+ wpc->ioend = iomap_alloc_ioend(wpc, wbc, inode, pos,
+ ioend_flags);
+ }
- if (!bio_add_folio(&wpc->ioend->io_bio, folio, len, poff))
- goto new_ioend;
+ /* Align writes to io_align if given. */
+ if (io_align && !(wpc->iomap.flags & IOMAP_F_ANON_WRITE)) {
+ sector_t lba = bio_end_sector(&wpc->ioend->io_bio);
+ unsigned int mod = lba % io_align;
+ unsigned int max_len;
- if (ifs)
- atomic_add(len, &ifs->write_bytes_pending);
+ /*
+ * If the end sector is already aligned and the bio is
+ * nonempty, then start a new bio for the remainder.
+ */
+ if (!mod && wpc->ioend->io_bio.bi_iter.bi_size)
+ goto new_ioend;
- /*
- * Clamp io_offset and io_size to the incore EOF so that ondisk
- * file size updates in the ioend completion are byte-accurate.
- * This avoids recovering files with zeroed tail regions when
- * writeback races with appending writes:
- *
- * Thread 1: Thread 2:
- * ------------ -----------
- * write [A, A+B]
- * update inode size to A+B
- * submit I/O [A, A+BS]
- * write [A+B, A+B+C]
- * update inode size to A+B+C
- * <I/O completes, updates disk size to min(A+B+C, A+BS)>
- * <power failure>
- *
- * After reboot:
- * 1) with A+B+C < A+BS, the file has zero padding in range
- * [A+B, A+B+C]
- *
- * |< Block Size (BS) >|
- * |DDDDDDDDDDDD0000000000000|
- * ^ ^ ^
- * A A+B A+B+C
- * (EOF)
- *
- * 2) with A+B+C > A+BS, the file has zero padding in range
- * [A+B, A+BS]
- *
- * |< Block Size (BS) >|< Block Size (BS) >|
- * |DDDDDDDDDDDD0000000000000|00000000000000000000000000|
- * ^ ^ ^ ^
- * A A+B A+BS A+B+C
- * (EOF)
- *
- * D = Valid Data
- * 0 = Zero Padding
- *
- * Note that this defeats the ability to chain the ioends of
- * appending writes.
- */
- wpc->ioend->io_size += len;
- if (wpc->ioend->io_offset + wpc->ioend->io_size > end_pos)
- wpc->ioend->io_size = end_pos - wpc->ioend->io_offset;
+ /*
+ * Clip the end of the bio to the alignment boundary.
+ */
+ max_len = (io_align - mod) << SECTOR_SHIFT;
+ if (len > max_len) {
+ rem_len = len - max_len;
+ len = max_len;
+ }
+ }
+
+ if (!bio_add_folio(&wpc->ioend->io_bio, folio, len, poff))
+ goto new_ioend;
+
+ if (ifs)
+ atomic_add(len, &ifs->write_bytes_pending);
+
+ /*
+ * Clamp io_offset and io_size to the incore EOF so that ondisk
+ * file size updates in the ioend completion are byte-accurate.
+ * This avoids recovering files with zeroed tail regions when
+ * writeback races with appending writes:
+ *
+ * Thread 1: Thread 2:
+ * ------------ -----------
+ * write [A, A+B]
+ * update inode size to A+B
+ * submit I/O [A, A+BS]
+ * write [A+B, A+B+C]
+ * update inode size to A+B+C
+ * <I/O completes, updates disk size to min(A+B+C, A+BS)>
+ * <power failure>
+ *
+ * After reboot:
+ * 1) with A+B+C < A+BS, the file has zero padding in range
+ * [A+B, A+B+C]
+ *
+ * |< Block Size (BS) >|
+ * |DDDDDDDDDDDD0000000000000|
+ * ^ ^ ^
+ * A A+B A+B+C
+ * (EOF)
+ *
+ * 2) with A+B+C > A+BS, the file has zero padding in range
+ * [A+B, A+BS]
+ *
+ * |< Block Size (BS) >|< Block Size (BS) >|
+ * |DDDDDDDDDDDD0000000000000|00000000000000000000000000|
+ * ^ ^ ^ ^
+ * A A+B A+BS A+B+C
+ * (EOF)
+ *
+ * D = Valid Data
+ * 0 = Zero Padding
+ *
+ * Note that this defeats the ability to chain the ioends of
+ * appending writes.
+ */
+ wpc->ioend->io_size += len;
+ if (wpc->ioend->io_offset + wpc->ioend->io_size > end_pos)
+ wpc->ioend->io_size = end_pos - wpc->ioend->io_offset;
+
+ wbc_account_cgroup_owner(wbc, folio, len);
+
+ pos += len;
+ len = rem_len;
+ } while (len);
- wbc_account_cgroup_owner(wbc, folio, len);
return 0;
}
--
2.43.0
Powered by blists - more mailing lists