Message-ID: <20251223223018.3295372-2-sashal@kernel.org>
Date: Tue, 23 Dec 2025 17:30:17 -0500
From: Sasha Levin <sashal@...nel.org>
To: joannelkoong@...il.com
Cc: willy@...radead.org,
linux-fsdevel@...r.kernel.org,
linux-kernel@...r.kernel.org,
Sasha Levin <sashal@...nel.org>
Subject: [RFC PATCH 1/1] iomap: fix race between iomap_set_range_uptodate and folio_end_read

When a folio spans more than one filesystem block (as with large
folios), iomap tracks per-block uptodate state via iomap_folio_state
(ifs). A race can leave the ifs uptodate bits inconsistent with the
folio's uptodate flag.

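For reference, the per-block bookkeeping looks roughly like this
(simplified from fs/iomap/buffered-io.c; the exact layout varies
between kernel versions):

    struct iomap_folio_state {
            spinlock_t      state_lock;
            unsigned int    read_bytes_pending;
            atomic_t        write_bytes_pending;
            /* uptodate bits, then dirty bits, one bit per block */
            unsigned long   state[];
    };
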
The race occurs because folio_end_read() uses XOR semantics to atomically
set the uptodate bit and clear the locked bit:

  Thread A (read completion):           Thread B (concurrent write):
  --------------------------------      --------------------------------
  iomap_finish_folio_read()
    spin_lock(state_lock)
    ifs_set_range_uptodate() -> true
    spin_unlock(state_lock)
                                        iomap_set_range_uptodate()
                                          spin_lock(state_lock)
                                          ifs_set_range_uptodate() -> true
                                          spin_unlock(state_lock)
                                          folio_mark_uptodate(folio)
  folio_end_read(folio, true)
    folio_xor_flags()   // XOR CLEARS uptodate!

Result: folio is NOT uptodate, but ifs says all blocks ARE uptodate.
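
To see why the flag flips back: folio_end_read() clears PG_locked and
sets PG_uptodate with a single atomic XOR. A condensed sketch (based
on mm/filemap.c; details vary by kernel version):

    void folio_end_read(struct folio *folio, bool success)
    {
            /* Clear the lock bit ... */
            unsigned long mask = 1 << PG_locked;

            /* ... and toggle the uptodate bit in the same atomic op. */
            if (likely(success))
                    mask |= 1 << PG_uptodate;

            /*
             * The XOR assumes PG_uptodate is still clear here. If
             * another path already set it, this toggles it back to 0.
             */
            if (folio_xor_flags_has_waiters(folio, mask))
                    folio_wake_bit(folio, PG_locked);
    }

Debug builds also assert VM_BUG_ON_FOLIO(folio_test_uptodate(folio),
folio) on entry, which encodes exactly the precondition this race
violates.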

Fix this by checking read_bytes_pending in iomap_set_range_uptodate()
under the state_lock. If a read is in progress, skip calling
folio_mark_uptodate() - the read completion path will set the flag via
folio_end_read().

The resulting inconsistency triggered the uptodate consistency warning
in ifs_free() during unmount of a FUSE-based filesystem (e.g. NTFS-3G)
while the LTP writev03 test was running:

WARNING: fs/iomap/buffered-io.c at ifs_free
Call trace:
ifs_free
iomap_invalidate_folio
truncate_cleanup_folio
truncate_inode_pages_range
truncate_inode_pages_final
fuse_evict_inode
...
fuse_kill_sb_blk
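
The warning comes from the consistency check in ifs_free(), which looks
roughly like this (simplified):

    static void ifs_free(struct folio *folio)
    {
            struct iomap_folio_state *ifs = folio_detach_private(folio);

            if (!ifs)
                    return;
            WARN_ON_ONCE(atomic_read(&ifs->write_bytes_pending));
            /* Fires when the ifs bits disagree with the folio flag: */
            WARN_ON_ONCE(ifs_is_fully_uptodate(folio, ifs) !=
                            folio_test_uptodate(folio));
            kfree(ifs);
    }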

Fixes: 7a4847e54cc1 ("iomap: use folio_end_read()")
Assisted-by: claude-opus-4-5-20251101
Signed-off-by: Sasha Levin <sashal@...nel.org>
---
fs/fuse/dev.c | 3 +-
fs/fuse/file.c | 6 ++--
fs/iomap/buffered-io.c | 65 +++++++++++++++++++++++++++++++++++++++---
include/linux/iomap.h | 2 ++
4 files changed, 68 insertions(+), 8 deletions(-)
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 6d59cbc877c6..50e84e913589 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -11,6 +11,7 @@
#include "fuse_dev_i.h"
#include <linux/init.h>
+#include <linux/iomap.h>
#include <linux/module.h>
#include <linux/poll.h>
#include <linux/sched/signal.h>
@@ -1820,7 +1821,7 @@ static int fuse_notify_store(struct fuse_conn *fc, unsigned int size,
if (!folio_test_uptodate(folio) && !err && offset == 0 &&
(nr_bytes == folio_size(folio) || file_size == end)) {
folio_zero_segment(folio, nr_bytes, folio_size(folio));
- folio_mark_uptodate(folio);
+ iomap_set_range_uptodate(folio, 0, folio_size(folio));
}
folio_unlock(folio);
folio_put(folio);
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 01bc894e9c2b..3abe38416199 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -1216,13 +1216,13 @@ static ssize_t fuse_send_write_pages(struct fuse_io_args *ia,
struct folio *folio = ap->folios[i];
if (err) {
- folio_clear_uptodate(folio);
+ iomap_clear_folio_uptodate(folio);
} else {
if (count >= folio_size(folio) - offset)
count -= folio_size(folio) - offset;
else {
if (short_write)
- folio_clear_uptodate(folio);
+ iomap_clear_folio_uptodate(folio);
count = 0;
}
offset = 0;
@@ -1305,7 +1305,7 @@ static ssize_t fuse_fill_write_pages(struct fuse_io_args *ia,
/* If we copied full folio, mark it uptodate */
if (tmp == folio_size(folio))
- folio_mark_uptodate(folio);
+ iomap_set_range_uptodate(folio, 0, folio_size(folio));
if (folio_test_uptodate(folio)) {
folio_unlock(folio);
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index e5c1ca440d93..7ceda24cf6a7 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -74,8 +74,7 @@ static bool ifs_set_range_uptodate(struct folio *folio,
return ifs_is_fully_uptodate(folio, ifs);
}
-static void iomap_set_range_uptodate(struct folio *folio, size_t off,
- size_t len)
+void iomap_set_range_uptodate(struct folio *folio, size_t off, size_t len)
{
struct iomap_folio_state *ifs = folio->private;
unsigned long flags;
@@ -87,12 +86,50 @@ static void iomap_set_range_uptodate(struct folio *folio, size_t off,
if (ifs) {
spin_lock_irqsave(&ifs->state_lock, flags);
uptodate = ifs_set_range_uptodate(folio, ifs, off, len);
+ /*
+ * If a read is in progress, we must NOT call folio_mark_uptodate
+ * here. The read completion path (iomap_finish_folio_read or
+ * iomap_read_end) will call folio_end_read() which uses XOR
+ * semantics to set the uptodate bit. If we set it here, the XOR
+ * in folio_end_read() will clear it, leaving the folio not
+ * uptodate while the ifs says all blocks are uptodate.
+ */
+ if (uptodate && ifs->read_bytes_pending)
+ uptodate = false;
spin_unlock_irqrestore(&ifs->state_lock, flags);
}
if (uptodate)
folio_mark_uptodate(folio);
}
+EXPORT_SYMBOL_GPL(iomap_set_range_uptodate);
+
+void iomap_clear_folio_uptodate(struct folio *folio)
+{
+ struct iomap_folio_state *ifs = folio->private;
+
+ if (ifs) {
+ struct inode *inode = folio->mapping->host;
+ unsigned int nr_blocks = i_blocks_per_folio(inode, folio);
+ unsigned long flags;
+
+ spin_lock_irqsave(&ifs->state_lock, flags);
+ /*
+ * If a read is in progress, don't clear the uptodate state.
+ * The read completion path will handle the folio state, and
+ * clearing here would race with iomap_finish_folio_read()
+ * potentially causing ifs/folio uptodate state mismatch.
+ */
+ if (ifs->read_bytes_pending) {
+ spin_unlock_irqrestore(&ifs->state_lock, flags);
+ return;
+ }
+ bitmap_clear(ifs->state, 0, nr_blocks);
+ spin_unlock_irqrestore(&ifs->state_lock, flags);
+ }
+ folio_clear_uptodate(folio);
+}
+EXPORT_SYMBOL_GPL(iomap_clear_folio_uptodate);
/*
* Find the next dirty block in the folio. end_blk is inclusive.
@@ -399,8 +436,17 @@ void iomap_finish_folio_read(struct folio *folio, size_t off, size_t len,
spin_unlock_irqrestore(&ifs->state_lock, flags);
}
- if (finished)
+ if (finished) {
+ /*
+ * If uptodate is true but the folio is already marked uptodate,
+ * folio_end_read's XOR semantics would clear the uptodate bit.
+ * This should never happen because iomap_set_range_uptodate()
+ * skips calling folio_mark_uptodate() when read_bytes_pending
+ * is non-zero, ensuring only the read completion path sets it.
+ */
+ WARN_ON_ONCE(uptodate && folio_test_uptodate(folio));
folio_end_read(folio, uptodate);
+ }
}
EXPORT_SYMBOL_GPL(iomap_finish_folio_read);
@@ -481,8 +527,19 @@ static void iomap_read_end(struct folio *folio, size_t bytes_submitted)
if (end_read)
uptodate = ifs_is_fully_uptodate(folio, ifs);
spin_unlock_irq(&ifs->state_lock);
- if (end_read)
+ if (end_read) {
+ /*
+ * If uptodate is true but the folio is already marked
+ * uptodate, folio_end_read's XOR semantics would clear
+ * the uptodate bit. This should never happen because
+ * iomap_set_range_uptodate() skips calling
+ * folio_mark_uptodate() when read_bytes_pending is
+ * non-zero, ensuring only the read completion path
+ * sets it.
+ */
+ WARN_ON_ONCE(uptodate && folio_test_uptodate(folio));
folio_end_read(folio, uptodate);
+ }
} else if (!bytes_submitted) {
/*
* If there were no bytes submitted, this means we are
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index 520e967cb501..3c2ad88d16b6 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -345,6 +345,8 @@ void iomap_read_folio(const struct iomap_ops *ops,
void iomap_readahead(const struct iomap_ops *ops,
struct iomap_read_folio_ctx *ctx);
bool iomap_is_partially_uptodate(struct folio *, size_t from, size_t count);
+void iomap_set_range_uptodate(struct folio *folio, size_t off, size_t len);
+void iomap_clear_folio_uptodate(struct folio *folio);
struct folio *iomap_get_folio(struct iomap_iter *iter, loff_t pos, size_t len);
bool iomap_release_folio(struct folio *folio, gfp_t gfp_flags);
void iomap_invalidate_folio(struct folio *folio, size_t offset, size_t len);
--
2.51.0