Message-ID: <8229fb9bcd2504b80caf0e763b1984d7ee6178b0.1762945505.git.ojaswin@linux.ibm.com>
Date: Wed, 12 Nov 2025 16:36:07 +0530
From: Ojaswin Mujoo <ojaswin@...ux.ibm.com>
To: Christian Brauner <brauner@...nel.org>, djwong@...nel.org,
ritesh.list@...il.com, john.g.garry@...cle.com, tytso@....edu,
willy@...radead.org, dchinner@...hat.com, hch@....de
Cc: linux-xfs@...r.kernel.org, linux-kernel@...r.kernel.org,
linux-ext4@...r.kernel.org, linux-fsdevel@...r.kernel.org,
linux-mm@...ck.org, jack@...e.cz, nilay@...ux.ibm.com,
martin.petersen@...cle.com, rostedt@...dmis.org, axboe@...nel.dk,
linux-block@...r.kernel.org, linux-trace-kernel@...r.kernel.org
Subject: [RFC PATCH 4/8] iomap: buffered atomic write support

Add special handling of the PG_atomic flag to the iomap buffered write
path. To flag an iomap iter for an atomic write, set IOMAP_ATOMIC. For a
folio associated with a write which has IOMAP_ATOMIC set, set PG_atomic;
when IOMAP_ATOMIC is unset, clear PG_atomic.

This means that an "atomic" folio which has not yet been written back
loses its "atomicity" if it is written to again without RWF_ATOMIC. So if
userspace issues a write with RWF_ATOMIC set and then another write to
the same folio with RWF_ATOMIC unset, that folio is not written back
atomically. Such a scenario is considered a userspace usage error.

To ensure that a buffered atomic write has been written back atomically
by the time the write syscall returns, RWF_SYNC or similar needs to be
used in conjunction with RWF_ATOMIC, as in the sketch below.
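
For example, a durable buffered atomic write could be issued from
userspace roughly as follows. This is an illustrative sketch only, not
part of this patch; the function name is made up, and the hard-coded
RWF_ATOMIC value is just a fallback mirroring the uapi definition for
libc headers that do not yet expose it:

  #define _GNU_SOURCE
  #include <sys/types.h>
  #include <sys/uio.h>

  #ifndef RWF_ATOMIC
  #define RWF_ATOMIC  0x00000040  /* uapi value; fallback for older headers */
  #endif

  /* Write one unit atomically and persist it before the call returns. */
  static ssize_t atomic_write_sync(int fd, void *buf, size_t len, off_t pos)
  {
      struct iovec iov = { .iov_base = buf, .iov_len = len };

      return pwritev2(fd, &iov, 1, pos, RWF_ATOMIC | RWF_SYNC);
  }
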
Only a single BIO should ever be submitted for an atomic write. So
modify iomap_add_to_ioend() to ensure that we don't try to write back an
atomic folio as part of a larger mixed-atomicity BIO.

In iomap_alloc_ioend(), handle an atomic write by setting REQ_ATOMIC for
the allocated BIO. When a folio is written back, again clear PG_atomic,
as it is no longer required.

Currently, RWF_ATOMIC with buffered IO is limited to single block size
writes and has two main restrictions:

1. Only blocksize == pagesize is supported.
2. Writes where the user buffer is not aligned to PAGE_SIZE are not
   supported.

For more details, refer to the comment in generic_atomic_write_valid().
A sketch of a write satisfying both restrictions is shown below.
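
For illustration only (again not part of this patch, reusing the headers
and RWF_ATOMIC fallback from the sketch above, and assuming a 4k
block/page size), such a write could look like:

  #include <stdlib.h>
  #include <string.h>

  /* One filesystem block, assuming blocksize == pagesize == 4096. */
  #define ATOMIC_LEN  4096UL

  /* 'pos' is assumed to already be aligned to ATOMIC_LEN. */
  static int write_one_atomic_block(int fd, off_t pos)
  {
      struct iovec iov;
      ssize_t ret;
      void *buf;

      /* Restriction 2: the user buffer must be PAGE_SIZE aligned. */
      if (posix_memalign(&buf, 4096, ATOMIC_LEN))
          return -1;
      memset(buf, 0xab, ATOMIC_LEN);

      /* Restriction 1: a single write of exactly one block. */
      iov.iov_base = buf;
      iov.iov_len = ATOMIC_LEN;
      ret = pwritev2(fd, &iov, 1, pos, RWF_ATOMIC | RWF_SYNC);

      free(buf);
      return ret == (ssize_t)ATOMIC_LEN ? 0 : -1;
  }
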
Co-developed-by: John Garry <john.g.garry@...cle.com>
Signed-off-by: John Garry <john.g.garry@...cle.com>
Signed-off-by: Ojaswin Mujoo <ojaswin@...ux.ibm.com>
---
fs/iomap/buffered-io.c | 48 ++++++++++++++++++++++++++++++++++++------
fs/iomap/ioend.c | 18 ++++++++++++----
fs/read_write.c | 34 ++++++++++++++++++++++++++++--
include/linux/iomap.h | 2 ++
4 files changed, 89 insertions(+), 13 deletions(-)
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index f099c086cbe8..947c76c2688a 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -850,11 +850,13 @@ static int iomap_write_begin(struct iomap_iter *iter,
{
const struct iomap *srcmap = iomap_iter_srcmap(iter);
loff_t pos;
- u64 len = min_t(u64, SIZE_MAX, iomap_length(iter));
+ u64 orig_len = min_t(u64, SIZE_MAX, iomap_length(iter));
+ u64 len;
struct folio *folio;
int status = 0;
+ bool is_atomic = iter->flags & IOMAP_ATOMIC;
- len = min_not_zero(len, *plen);
+ len = min_not_zero(orig_len, *plen);
*foliop = NULL;
*plen = 0;
@@ -922,6 +924,11 @@ static int iomap_write_begin(struct iomap_iter *iter,
if (unlikely(status))
goto out_unlock;
+ if (is_atomic && (len != orig_len)) {
+ status = -EINVAL;
+ goto out_unlock;
+ }
+
*foliop = folio;
*plen = len;
return 0;
@@ -931,7 +938,7 @@ static int iomap_write_begin(struct iomap_iter *iter,
return status;
}
-static bool __iomap_write_end(struct inode *inode, loff_t pos, size_t len,
+static bool __iomap_write_end(struct iomap_iter *iter, loff_t pos, size_t len,
size_t copied, struct folio *folio)
{
flush_dcache_folio(folio);
@@ -951,7 +958,27 @@ static bool __iomap_write_end(struct inode *inode, loff_t pos, size_t len,
return false;
iomap_set_range_uptodate(folio, offset_in_folio(folio, pos), len);
iomap_set_range_dirty(folio, offset_in_folio(folio, pos), copied);
- filemap_dirty_folio(inode->i_mapping, folio);
+ filemap_dirty_folio(iter->inode->i_mapping, folio);
+
+ /*
+ * Policy: a non-atomic write over a previously atomic range makes the
+ * range non-atomic. Handle this here.
+ */
+ if (iter->flags & IOMAP_ATOMIC) {
+ if (copied < len) {
+ /*
+ * A short atomic write is only okay as long as nothing
+ * is written at all. If we have a partial write, there
+ * is a bug in our code.
+ */
+ WARN_ON_ONCE(copied != 0);
+
+ return false;
+ }
+ folio_set_atomic(folio);
+ } else
+ folio_clear_atomic(folio);
+
return true;
}
@@ -997,7 +1024,7 @@ static bool iomap_write_end(struct iomap_iter *iter, size_t len, size_t copied,
return bh_written == copied;
}
- return __iomap_write_end(iter->inode, pos, len, copied, folio);
+ return __iomap_write_end(iter, pos, len, copied, folio);
}
static int iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i,
@@ -1124,6 +1151,8 @@ iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *i,
iter.flags |= IOMAP_NOWAIT;
if (iocb->ki_flags & IOCB_DONTCACHE)
iter.flags |= IOMAP_DONTCACHE;
+ if (iocb->ki_flags & IOCB_ATOMIC)
+ iter.flags |= IOMAP_ATOMIC;
while ((ret = iomap_iter(&iter, ops)) > 0)
iter.status = iomap_write_iter(&iter, i, write_ops);
@@ -1588,6 +1617,7 @@ static int iomap_folio_mkwrite_iter(struct iomap_iter *iter,
} else {
WARN_ON_ONCE(!folio_test_uptodate(folio));
folio_mark_dirty(folio);
+ folio_clear_atomic(folio);
}
return iomap_iter_advance(iter, length);
@@ -1642,8 +1672,10 @@ void iomap_finish_folio_write(struct inode *inode, struct folio *folio,
WARN_ON_ONCE(i_blocks_per_folio(inode, folio) > 1 && !ifs);
WARN_ON_ONCE(ifs && atomic_read(&ifs->write_bytes_pending) <= 0);
- if (!ifs || atomic_sub_and_test(len, &ifs->write_bytes_pending))
+ if (!ifs || atomic_sub_and_test(len, &ifs->write_bytes_pending)) {
+ folio_clear_atomic(folio);
folio_end_writeback(folio);
+ }
}
EXPORT_SYMBOL_GPL(iomap_finish_folio_write);
@@ -1807,8 +1839,10 @@ int iomap_writeback_folio(struct iomap_writepage_ctx *wpc, struct folio *folio)
if (atomic_dec_and_test(&ifs->write_bytes_pending))
folio_end_writeback(folio);
} else {
- if (!wb_pending)
+ if (!wb_pending) {
+ folio_clear_atomic(folio);
folio_end_writeback(folio);
+ }
}
mapping_set_error(inode->i_mapping, error);
return error;
diff --git a/fs/iomap/ioend.c b/fs/iomap/ioend.c
index b49fa75eab26..c129a695ceca 100644
--- a/fs/iomap/ioend.c
+++ b/fs/iomap/ioend.c
@@ -98,13 +98,17 @@ int iomap_ioend_writeback_submit(struct iomap_writepage_ctx *wpc, int error)
EXPORT_SYMBOL_GPL(iomap_ioend_writeback_submit);
static struct iomap_ioend *iomap_alloc_ioend(struct iomap_writepage_ctx *wpc,
- loff_t pos, u16 ioend_flags)
+ loff_t pos, u16 ioend_flags,
+ bool atomic)
{
struct bio *bio;
+ blk_opf_t opf = REQ_OP_WRITE | wbc_to_write_flags(wpc->wbc);
+
+ if (atomic)
+ opf |= REQ_ATOMIC;
bio = bio_alloc_bioset(wpc->iomap.bdev, BIO_MAX_VECS,
- REQ_OP_WRITE | wbc_to_write_flags(wpc->wbc),
- GFP_NOFS, &iomap_ioend_bioset);
+ opf, GFP_NOFS, &iomap_ioend_bioset);
bio->bi_iter.bi_sector = iomap_sector(&wpc->iomap, pos);
bio->bi_write_hint = wpc->inode->i_write_hint;
wbc_init_bio(wpc->wbc, bio);
@@ -122,6 +126,9 @@ static bool iomap_can_add_to_ioend(struct iomap_writepage_ctx *wpc, loff_t pos,
if ((ioend_flags & IOMAP_IOEND_NOMERGE_FLAGS) !=
(ioend->io_flags & IOMAP_IOEND_NOMERGE_FLAGS))
return false;
+ if ((ioend_flags & IOMAP_IOEND_ATOMIC) ||
+ (ioend->io_flags & IOMAP_IOEND_ATOMIC))
+ return false;
if (pos != ioend->io_offset + ioend->io_size)
return false;
if (!(wpc->iomap.flags & IOMAP_F_ANON_WRITE) &&
@@ -156,6 +163,7 @@ ssize_t iomap_add_to_ioend(struct iomap_writepage_ctx *wpc, struct folio *folio,
unsigned int ioend_flags = 0;
unsigned int map_len = min_t(u64, dirty_len,
wpc->iomap.offset + wpc->iomap.length - pos);
+ bool is_atomic = folio_test_atomic(folio);
int error;
trace_iomap_add_to_ioend(wpc->inode, pos, dirty_len, &wpc->iomap);
@@ -180,6 +188,8 @@ ssize_t iomap_add_to_ioend(struct iomap_writepage_ctx *wpc, struct folio *folio,
ioend_flags |= IOMAP_IOEND_DONTCACHE;
if (pos == wpc->iomap.offset && (wpc->iomap.flags & IOMAP_F_BOUNDARY))
ioend_flags |= IOMAP_IOEND_BOUNDARY;
+ if (is_atomic)
+ ioend_flags |= IOMAP_IOEND_ATOMIC;
if (!ioend || !iomap_can_add_to_ioend(wpc, pos, ioend_flags)) {
new_ioend:
@@ -188,7 +198,7 @@ ssize_t iomap_add_to_ioend(struct iomap_writepage_ctx *wpc, struct folio *folio,
if (error)
return error;
}
- wpc->wb_ctx = ioend = iomap_alloc_ioend(wpc, pos, ioend_flags);
+ wpc->wb_ctx = ioend = iomap_alloc_ioend(wpc, pos, ioend_flags, is_atomic);
}
if (!bio_add_folio(&ioend->io_bio, folio, map_len, poff))
diff --git a/fs/read_write.c b/fs/read_write.c
index 833bae068770..37546aa40f0d 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -1802,6 +1802,8 @@ int generic_file_rw_checks(struct file *file_in, struct file *file_out)
int generic_atomic_write_valid(struct kiocb *iocb, struct iov_iter *iter)
{
+ struct super_block *sb = iocb->ki_filp->f_mapping->host->i_sb;
+
size_t len = iov_iter_count(iter);
if (!iter_is_ubuf(iter))
@@ -1813,8 +1815,36 @@ int generic_atomic_write_valid(struct kiocb *iocb, struct iov_iter *iter)
if (!IS_ALIGNED(iocb->ki_pos, len))
return -EINVAL;
- if (!(iocb->ki_flags & IOCB_DIRECT))
- return -EOPNOTSUPP;
+ if (!(iocb->ki_flags & IOCB_DIRECT)) {
+ /* Some restrictions for buffered IO */
+
+ /*
+ * We only support block size == page size
+ * right now. This is to avoid the following:
+ * 1. A 4kb block atomic write marks the complete 64kb folio as
+ * atomic.
+ * 2. Other writes dirty the whole 64kb folio.
+ * 3. Writeback sees the whole folio dirty and atomic and tries
+ * to send a 64kb atomic write, which might exceed the
+ * allowed size and fail.
+ *
+ * Once we support sub-page atomic write tracking, we can remove
+ * this restriction.
+ */
+ if (sb->s_blocksize != PAGE_SIZE)
+ return -EOPNOTSUPP;
+
+ /*
+ * If the user buffer of an atomic write crosses a page boundary,
+ * there's a possibility of a short write, e.g. if one user page
+ * could not be faulted in or got reclaimed before the copy
+ * operation. For now, disallow such a scenario by requiring the
+ * user buffer to be page aligned.
+ */
+ if (!PAGE_ALIGNED(iov_iter_alignment(iter)))
+ return -EOPNOTSUPP;
+
+ }
return 0;
}
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index 8b1ac08c7474..693f3e5ad03c 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -390,6 +390,8 @@ sector_t iomap_bmap(struct address_space *mapping, sector_t bno,
#define IOMAP_IOEND_DIRECT (1U << 3)
/* is DONTCACHE I/O */
#define IOMAP_IOEND_DONTCACHE (1U << 4)
+/* is atomic I/O. These are never merged */
+#define IOMAP_IOEND_ATOMIC (1U << 5)
/*
* Flags that if set on either ioend prevent the merge of two ioends.
--
2.51.0