Message-ID: <20251017141536.577466-1-kirill@shutemov.name>
Date: Fri, 17 Oct 2025 15:15:36 +0100
From: Kiryl Shutsemau <kirill@...temov.name>
To: Andrew Morton <akpm@...ux-foundation.org>,
	David Hildenbrand <david@...hat.com>,
	Matthew Wilcox <willy@...radead.org>,
	Linus Torvalds <torvalds@...ux-foundation.org>,
	Alexander Viro <viro@...iv.linux.org.uk>,
	Christian Brauner <brauner@...nel.org>,
	Jan Kara <jack@...e.cz>
Cc: linux-mm@...ck.org,
	linux-fsdevel@...r.kernel.org,
	linux-kernel@...r.kernel.org,
	Kiryl Shutsemau <kas@...nel.org>
Subject: [PATCH] mm/filemap: Implement fast short reads

From: Kiryl Shutsemau <kas@...nel.org>

The protocol for page cache lookup is as follows:

  1. Locate a folio in XArray.
  2. Obtain a reference on the folio using folio_try_get().
  3. If successful, verify that the folio still belongs to
     the mapping and has not been truncated or reclaimed.
  4. Perform operations on the folio, such as copying data
     to userspace.
  5. Release the reference.
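
For reference, steps 1-3 correspond roughly to the lookup loop in
filemap_get_entry(); a condensed, illustrative sketch (error handling
and corner cases omitted):

  XA_STATE(xas, &mapping->i_pages, index);
  struct folio *folio;

  rcu_read_lock();
repeat:
  folio = xas_load(&xas);                       /* 1. locate in XArray */
  if (xas_retry(&xas, folio))
          goto repeat;
  if (!folio || xa_is_value(folio))
          goto out;
  if (!folio_try_get(folio))                    /* 2. take a reference */
          goto repeat;
  if (unlikely(folio != xas_reload(&xas))) {    /* 3. recheck mapping */
          folio_put(folio);
          goto repeat;
  }
out:
  rcu_read_unlock();
  /* 4. copy data to userspace, then 5. folio_put() */

The folio_try_get()/folio_put() pair is where the atomic refcount
traffic comes from.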

For short reads, the overhead of these atomic refcount operations can be
significant, particularly when multiple tasks access the same folio and
the refcount cache line bounces between CPUs.

To address this issue, introduce i_pages_delete_seqcnt, which is
incremented each time a folio is deleted from the page cache, and
implement a modified page cache lookup protocol for short reads:

  1. Locate a folio in XArray.
  2. Take note of the i_pages_delete_seqcnt.
  3. Copy the data to a local buffer on the stack.
  4. Verify that the i_pages_delete_seqcnt has not changed.
  5. Copy the data from the local buffer to the iterator.

If anything goes wrong in the fast path, fall back to the slow path,
which relies on the refcount to stabilize the folio.
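
In essence, this pairs the writer in page_cache_delete() with a
lockless reader via the new seqcount; condensed from the diff below:

  /* writer: page_cache_delete(), under the xa_lock */
  write_seqcount_begin(&mapping->i_pages_delete_seqcnt);
  xas_store(&xas, shadow);
  xas_init_marks(&xas);
  write_seqcount_end(&mapping->i_pages_delete_seqcnt);

  /* reader: under rcu_read_lock(), no reference taken */
  if (!raw_seqcount_try_begin(&mapping->i_pages_delete_seqcnt, seq))
          return 0;               /* delete in progress: use slow path */
  size = memcpy_from_file_folio(buffer, folio, pos, size);
  if (read_seqcount_retry(&mapping->i_pages_delete_seqcnt, seq))
          return 0;               /* raced with a delete: use slow path */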

The new approach requires a local buffer on the stack. The size of the
buffer determines which read requests can be served by the fast path.
Set the buffer size to 1k, which seems to be a reasonable amount of
stack usage for the function at the bottom of the call stack.

The fast read approach demonstrates significant performance
improvements, particularly in contended cases.

16 threads, reads from 4k file(s), mean MiB/s (StdDev)

 -------------------------------------------------------------
| Block |  Baseline  |  Baseline   |  Patched   |  Patched    |
| size  |  same file |  diff files |  same file | diff files  |
 -------------------------------------------------------------
|     1 |    10.96   |     27.56   |    30.42   |     30.4    |
|       |    (0.497) |     (0.114) |    (0.130) |     (0.158) |
|    32 |   350.8    |    886.2    |   980.6    |    981.8    |
|       |   (13.64)  |     (2.863) |    (3.361) |     (1.303) |
|   256 |  2798      |   7009.6    |  7641.4    |   7653.6    |
|       |  (103.9)   |    (28.00)  |   (33.26)  |    (25.50)  |
|  1024 | 10780      |  27040      | 29280      |  29320      |
|       |  (389.8)   |    (89.44)  |  (130.3)   |    (83.66)  |
|  4096 | 43700      | 103800      | 48420      | 102000      |
|       | (1953)     |    (447.2)  | (2012)     |     (0)     |
 -------------------------------------------------------------

16 threads, reads from 1M file(s), mean MiB/s (StdDev)

 --------------------------------------------------------------
| Block |  Baseline   |  Baseline   |  Patched    |  Patched   |
| size  |  same file  |  diff files |  same file  | diff files |
 --------------------------------------------------------------
|     1 |     26.38   |     27.34   |     30.38   |    30.36   |
|       |     (0.998) |     (0.114) |     (0.083) |    (0.089) |
|    32 |    824.4    |    877.2    |    977.8    |   975.8    |
|       |    (15.78)  |     (3.271) |     (2.683) |    (1.095) |
|   256 |   6494      |   6992.8    |   7619.8    |   7625     |
|       |   (116.0)   |    (32.39)  |    (10.66)  |    (28.19) |
|  1024 |  24960      |  26840      |  29100      |  29180     |
|       |   (606.6)   |   (151.6)   |   (122.4)   |    (83.66) |
|  4096 |  94420      | 100520      |  95260      |  99760     |
|       |  (3144)     |   (672.3)   |  (2874)     |   (134.1)  |
| 32768 | 386000      | 402400      | 368600      | 397400     |
|       | (36599)     | (10526)     | (47188)     |  (6107)    |
 --------------------------------------------------------------

There's also an improvement in kernel build time:

Baseline:  61.3462 +- 0.0597 seconds time elapsed  ( +-  0.10% )
Patched:   60.6106 +- 0.0759 seconds time elapsed  ( +-  0.13% )

Co-developed-by: Linus Torvalds <torvalds@...ux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@...ux-foundation.org>
Signed-off-by: Kiryl Shutsemau <kas@...nel.org>
---
 fs/inode.c         |   2 +
 include/linux/fs.h |   1 +
 mm/filemap.c       | 150 ++++++++++++++++++++++++++++++++++++++-------
 3 files changed, 130 insertions(+), 23 deletions(-)

diff --git a/fs/inode.c b/fs/inode.c
index ec9339024ac3..52163d28d630 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -482,6 +482,8 @@ EXPORT_SYMBOL(inc_nlink);
 static void __address_space_init_once(struct address_space *mapping)
 {
 	xa_init_flags(&mapping->i_pages, XA_FLAGS_LOCK_IRQ | XA_FLAGS_ACCOUNT);
+	seqcount_spinlock_init(&mapping->i_pages_delete_seqcnt,
+			       &mapping->i_pages.xa_lock);
 	init_rwsem(&mapping->i_mmap_rwsem);
 	INIT_LIST_HEAD(&mapping->i_private_list);
 	spin_lock_init(&mapping->i_private_lock);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index c895146c1444..c9588d555f73 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -523,6 +523,7 @@ struct address_space {
 	struct list_head	i_private_list;
 	struct rw_semaphore	i_mmap_rwsem;
 	void *			i_private_data;
+	seqcount_spinlock_t	i_pages_delete_seqcnt;
 } __attribute__((aligned(sizeof(long)))) __randomize_layout;
 	/*
 	 * On most architectures that alignment is already the case; but
diff --git a/mm/filemap.c b/mm/filemap.c
index 13f0259d993c..51689c4f3773 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -138,8 +138,10 @@ static void page_cache_delete(struct address_space *mapping,
 
 	VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
 
+	write_seqcount_begin(&mapping->i_pages_delete_seqcnt);
 	xas_store(&xas, shadow);
 	xas_init_marks(&xas);
+	write_seqcount_end(&mapping->i_pages_delete_seqcnt);
 
 	folio->mapping = NULL;
 	/* Leave folio->index set: truncation lookup relies upon it */
@@ -2695,21 +2697,98 @@ static void filemap_end_dropbehind_read(struct folio *folio)
 	}
 }
 
-/**
- * filemap_read - Read data from the page cache.
- * @iocb: The iocb to read.
- * @iter: Destination for the data.
- * @already_read: Number of bytes already read by the caller.
- *
- * Copies data from the page cache.  If the data is not currently present,
- * uses the readahead and read_folio address_space operations to fetch it.
- *
- * Return: Total number of bytes copied, including those already read by
- * the caller.  If an error happens before any bytes are copied, returns
- * a negative error number.
- */
-ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter,
-		ssize_t already_read)
+static inline unsigned long filemap_read_fast_rcu(struct address_space *mapping,
+						  loff_t pos, char *buffer,
+						  size_t size)
+{
+	XA_STATE(xas, &mapping->i_pages, pos >> PAGE_SHIFT);
+	struct folio *folio;
+	loff_t file_size;
+	unsigned int seq;
+
+	lockdep_assert_in_rcu_read_lock();
+
+	/* Give up and go to slow path if raced with page_cache_delete() */
+	if (!raw_seqcount_try_begin(&mapping->i_pages_delete_seqcnt, seq))
+		return 0;
+
+	folio = xas_load(&xas);
+	if (xas_retry(&xas, folio))
+		return 0;
+
+	if (!folio || xa_is_value(folio))
+		return 0;
+
+	if (!folio_test_uptodate(folio))
+		return 0;
+
+	/* No fast-case if readahead is supposed to be started */
+	if (folio_test_readahead(folio))
+		return 0;
+	/* ..or if we would need to mark the folio accessed */
+	if (!folio_test_referenced(folio))
+		return 0;
+
+	/* i_size check must be after folio_test_uptodate() */
+	file_size = i_size_read(mapping->host);
+	if (unlikely(pos >= file_size))
+		return 0;
+	if (size > file_size - pos)
+		size = file_size - pos;
+
+	/* Do the data copy */
+	size = memcpy_from_file_folio(buffer, folio, pos, size);
+	if (!size)
+		return 0;
+
+	/* Give up and go to slow path if raced with page_cache_delete() */
+	if (read_seqcount_retry(&mapping->i_pages_delete_seqcnt, seq))
+		return 0;
+
+	return size;
+}
+
+#define FAST_READ_BUF_SIZE 1024
+
+static noinline bool filemap_read_fast(struct kiocb *iocb, struct iov_iter *iter,
+				       ssize_t *already_read)
+{
+	struct address_space *mapping = iocb->ki_filp->f_mapping;
+	struct file_ra_state *ra = &iocb->ki_filp->f_ra;
+	char buffer[FAST_READ_BUF_SIZE];
+	size_t count;
+
+	if (ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE)
+		return false;
+
+	if (iov_iter_count(iter) > sizeof(buffer))
+		return false;
+
+	count = iov_iter_count(iter);
+
+	/* Let's see if we can just do the read under RCU */
+	rcu_read_lock();
+	count = filemap_read_fast_rcu(mapping, iocb->ki_pos, buffer, count);
+	rcu_read_unlock();
+
+	if (!count)
+		return false;
+
+	count = copy_to_iter(buffer, count, iter);
+	if (unlikely(!count))
+		return false;
+
+	iocb->ki_pos += count;
+	ra->prev_pos = iocb->ki_pos;
+	file_accessed(iocb->ki_filp);
+	*already_read += count;
+
+	return !iov_iter_count(iter);
+}
+
+static noinline ssize_t filemap_read_slow(struct kiocb *iocb,
+					  struct iov_iter *iter,
+					  ssize_t already_read)
 {
 	struct file *filp = iocb->ki_filp;
 	struct file_ra_state *ra = &filp->f_ra;
@@ -2721,14 +2800,6 @@ ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter,
 	loff_t isize, end_offset;
 	loff_t last_pos = ra->prev_pos;
 
-	if (unlikely(iocb->ki_pos < 0))
-		return -EINVAL;
-	if (unlikely(iocb->ki_pos >= inode->i_sb->s_maxbytes))
-		return 0;
-	if (unlikely(!iov_iter_count(iter)))
-		return 0;
-
-	iov_iter_truncate(iter, inode->i_sb->s_maxbytes - iocb->ki_pos);
 	folio_batch_init(&fbatch);
 
 	do {
@@ -2821,6 +2892,39 @@ ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter,
 	ra->prev_pos = last_pos;
 	return already_read ? already_read : error;
 }
+
+/**
+ * filemap_read - Read data from the page cache.
+ * @iocb: The iocb to read.
+ * @iter: Destination for the data.
+ * @already_read: Number of bytes already read by the caller.
+ *
+ * Copies data from the page cache.  If the data is not currently present,
+ * uses the readahead and read_folio address_space operations to fetch it.
+ *
+ * Return: Total number of bytes copied, including those already read by
+ * the caller.  If an error happens before any bytes are copied, returns
+ * a negative error number.
+ */
+ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter,
+		     ssize_t already_read)
+{
+	struct inode *inode = iocb->ki_filp->f_mapping->host;
+
+	if (unlikely(iocb->ki_pos < 0))
+		return -EINVAL;
+	if (unlikely(iocb->ki_pos >= inode->i_sb->s_maxbytes))
+		return 0;
+	if (unlikely(!iov_iter_count(iter)))
+		return 0;
+
+	iov_iter_truncate(iter, inode->i_sb->s_maxbytes - iocb->ki_pos);
+
+	if (filemap_read_fast(iocb, iter, &already_read))
+		return already_read;
+
+	return filemap_read_slow(iocb, iter, already_read);
+}
 EXPORT_SYMBOL_GPL(filemap_read);
 
 int kiocb_write_and_wait(struct kiocb *iocb, size_t count)
-- 
2.50.1

