lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20251223223018.3295372-2-sashal@kernel.org>
Date: Tue, 23 Dec 2025 17:30:17 -0500
From: Sasha Levin <sashal@...nel.org>
To: joannelkoong@...il.com
Cc: willy@...radead.org,
	linux-fsdevel@...r.kernel.org,
	linux-kernel@...r.kernel.org,
	Sasha Levin <sashal@...nel.org>
Subject: [RFC PATCH 1/1] iomap: fix race between iomap_set_range_uptodate and folio_end_read

When iomap uses large folios, per-block uptodate tracking is managed via
iomap_folio_state (ifs). A race condition can cause the ifs uptodate bits
to become inconsistent with the folio's uptodate flag.

The race occurs because folio_end_read() uses XOR semantics to atomically
set the uptodate bit and clear the locked bit:

  Thread A (read completion):          Thread B (concurrent write):
  --------------------------------     --------------------------------
  iomap_finish_folio_read()
    spin_lock(state_lock)
    ifs_set_range_uptodate() -> true
    spin_unlock(state_lock)
                                       iomap_set_range_uptodate()
                                         spin_lock(state_lock)
                                         ifs_set_range_uptodate() -> true
                                         spin_unlock(state_lock)
                                         folio_mark_uptodate(folio)
    folio_end_read(folio, true)
      folio_xor_flags()  // XOR CLEARS uptodate!

Result: folio is NOT uptodate, but ifs says all blocks ARE uptodate.

Fix by checking read_bytes_pending in iomap_set_range_uptodate() under the
ifs state_lock. If a read is in progress, skip calling folio_mark_uptodate();
the read completion path will set the flag itself via folio_end_read().

The warning was triggered during unmount of a FUSE-based filesystem
(e.g., NTFS-3G) while the LTP writev03 test was running:

  WARNING: fs/iomap/buffered-io.c at ifs_free
  Call trace:
   ifs_free
   iomap_invalidate_folio
   truncate_cleanup_folio
   truncate_inode_pages_range
   truncate_inode_pages_final
   fuse_evict_inode
   ...
   fuse_kill_sb_blk

Fixes: 7a4847e54cc1 ("iomap: use folio_end_read()")
Assisted-by: claude-opus-4-5-20251101
Signed-off-by: Sasha Levin <sashal@...nel.org>
---
 fs/fuse/dev.c          |  3 +-
 fs/fuse/file.c         |  6 ++--
 fs/iomap/buffered-io.c | 65 +++++++++++++++++++++++++++++++++++++++---
 include/linux/iomap.h  |  2 ++
 4 files changed, 68 insertions(+), 8 deletions(-)

diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 6d59cbc877c6..50e84e913589 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -11,6 +11,7 @@
 #include "fuse_dev_i.h"
 
 #include <linux/init.h>
+#include <linux/iomap.h>
 #include <linux/module.h>
 #include <linux/poll.h>
 #include <linux/sched/signal.h>
@@ -1820,7 +1821,7 @@ static int fuse_notify_store(struct fuse_conn *fc, unsigned int size,
 		if (!folio_test_uptodate(folio) && !err && offset == 0 &&
 		    (nr_bytes == folio_size(folio) || file_size == end)) {
 			folio_zero_segment(folio, nr_bytes, folio_size(folio));
-			folio_mark_uptodate(folio);
+			iomap_set_range_uptodate(folio, 0, folio_size(folio));
 		}
 		folio_unlock(folio);
 		folio_put(folio);
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 01bc894e9c2b..3abe38416199 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -1216,13 +1216,13 @@ static ssize_t fuse_send_write_pages(struct fuse_io_args *ia,
 		struct folio *folio = ap->folios[i];
 
 		if (err) {
-			folio_clear_uptodate(folio);
+			iomap_clear_folio_uptodate(folio);
 		} else {
 			if (count >= folio_size(folio) - offset)
 				count -= folio_size(folio) - offset;
 			else {
 				if (short_write)
-					folio_clear_uptodate(folio);
+					iomap_clear_folio_uptodate(folio);
 				count = 0;
 			}
 			offset = 0;
@@ -1305,7 +1305,7 @@ static ssize_t fuse_fill_write_pages(struct fuse_io_args *ia,
 
 		/* If we copied full folio, mark it uptodate */
 		if (tmp == folio_size(folio))
-			folio_mark_uptodate(folio);
+			iomap_set_range_uptodate(folio, 0, folio_size(folio));
 
 		if (folio_test_uptodate(folio)) {
 			folio_unlock(folio);
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index e5c1ca440d93..7ceda24cf6a7 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -74,8 +74,7 @@ static bool ifs_set_range_uptodate(struct folio *folio,
 	return ifs_is_fully_uptodate(folio, ifs);
 }
 
-static void iomap_set_range_uptodate(struct folio *folio, size_t off,
-		size_t len)
+void iomap_set_range_uptodate(struct folio *folio, size_t off, size_t len)
 {
 	struct iomap_folio_state *ifs = folio->private;
 	unsigned long flags;
@@ -87,12 +86,50 @@ static void iomap_set_range_uptodate(struct folio *folio, size_t off,
 	if (ifs) {
 		spin_lock_irqsave(&ifs->state_lock, flags);
 		uptodate = ifs_set_range_uptodate(folio, ifs, off, len);
+		/*
+		 * If a read is in progress, we must NOT call folio_mark_uptodate
+		 * here. The read completion path (iomap_finish_folio_read or
+		 * iomap_read_end) will call folio_end_read() which uses XOR
+		 * semantics to set the uptodate bit. If we set it here, the XOR
+		 * in folio_end_read() will clear it, leaving the folio not
+		 * uptodate while the ifs says all blocks are uptodate.
+		 */
+		if (uptodate && ifs->read_bytes_pending)
+			uptodate = false;
 		spin_unlock_irqrestore(&ifs->state_lock, flags);
 	}
 
 	if (uptodate)
 		folio_mark_uptodate(folio);
 }
+EXPORT_SYMBOL_GPL(iomap_set_range_uptodate);
+
+void iomap_clear_folio_uptodate(struct folio *folio)
+{
+	struct iomap_folio_state *ifs = folio->private;
+
+	if (ifs) {
+		struct inode *inode = folio->mapping->host;
+		unsigned int nr_blocks = i_blocks_per_folio(inode, folio);
+		unsigned long flags;
+
+		spin_lock_irqsave(&ifs->state_lock, flags);
+		/*
+		 * If a read is in progress, don't clear the uptodate state.
+		 * The read completion path will handle the folio state, and
+		 * clearing here would race with iomap_finish_folio_read()
+		 * potentially causing ifs/folio uptodate state mismatch.
+		 */
+		if (ifs->read_bytes_pending) {
+			spin_unlock_irqrestore(&ifs->state_lock, flags);
+			return;
+		}
+		bitmap_clear(ifs->state, 0, nr_blocks);
+		spin_unlock_irqrestore(&ifs->state_lock, flags);
+	}
+	folio_clear_uptodate(folio);
+}
+EXPORT_SYMBOL_GPL(iomap_clear_folio_uptodate);
 
 /*
  * Find the next dirty block in the folio. end_blk is inclusive.
@@ -399,8 +436,17 @@ void iomap_finish_folio_read(struct folio *folio, size_t off, size_t len,
 		spin_unlock_irqrestore(&ifs->state_lock, flags);
 	}
 
-	if (finished)
+	if (finished) {
+		/*
+		 * If uptodate is true but the folio is already marked uptodate,
+		 * folio_end_read's XOR semantics would clear the uptodate bit.
+		 * This should never happen because iomap_set_range_uptodate()
+		 * skips calling folio_mark_uptodate() when read_bytes_pending
+		 * is non-zero, ensuring only the read completion path sets it.
+		 */
+		WARN_ON_ONCE(uptodate && folio_test_uptodate(folio));
 		folio_end_read(folio, uptodate);
+	}
 }
 EXPORT_SYMBOL_GPL(iomap_finish_folio_read);
 
@@ -481,8 +527,19 @@ static void iomap_read_end(struct folio *folio, size_t bytes_submitted)
 		if (end_read)
 			uptodate = ifs_is_fully_uptodate(folio, ifs);
 		spin_unlock_irq(&ifs->state_lock);
-		if (end_read)
+		if (end_read) {
+			/*
+			 * If uptodate is true but the folio is already marked
+			 * uptodate, folio_end_read's XOR semantics would clear
+			 * the uptodate bit. This should never happen because
+			 * iomap_set_range_uptodate() skips calling
+			 * folio_mark_uptodate() when read_bytes_pending is
+			 * non-zero, ensuring only the read completion path
+			 * sets it.
+			 */
+			WARN_ON_ONCE(uptodate && folio_test_uptodate(folio));
 			folio_end_read(folio, uptodate);
+		}
 	} else if (!bytes_submitted) {
 		/*
 		 * If there were no bytes submitted, this means we are
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index 520e967cb501..3c2ad88d16b6 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -345,6 +345,8 @@ void iomap_read_folio(const struct iomap_ops *ops,
 void iomap_readahead(const struct iomap_ops *ops,
 		struct iomap_read_folio_ctx *ctx);
 bool iomap_is_partially_uptodate(struct folio *, size_t from, size_t count);
+void iomap_set_range_uptodate(struct folio *folio, size_t off, size_t len);
+void iomap_clear_folio_uptodate(struct folio *folio);
 struct folio *iomap_get_folio(struct iomap_iter *iter, loff_t pos, size_t len);
 bool iomap_release_folio(struct folio *folio, gfp_t gfp_flags);
 void iomap_invalidate_folio(struct folio *folio, size_t offset, size_t len);
-- 
2.51.0


Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ