Message-ID: <20250430145920.3748738-5-ryan.roberts@arm.com>
Date: Wed, 30 Apr 2025 15:59:17 +0100
From: Ryan Roberts <ryan.roberts@....com>
To: Andrew Morton <akpm@...ux-foundation.org>,
	"Matthew Wilcox (Oracle)" <willy@...radead.org>,
	Alexander Viro <viro@...iv.linux.org.uk>,
	Christian Brauner <brauner@...nel.org>,
	Jan Kara <jack@...e.cz>,
	David Hildenbrand <david@...hat.com>,
	Dave Chinner <david@...morbit.com>,
	Catalin Marinas <catalin.marinas@....com>,
	Will Deacon <will@...nel.org>,
	Kalesh Singh <kaleshsingh@...gle.com>,
	Zi Yan <ziy@...dia.com>
Cc: Ryan Roberts <ryan.roberts@....com>,
	linux-arm-kernel@...ts.infradead.org,
	linux-kernel@...r.kernel.org,
	linux-fsdevel@...r.kernel.org,
	linux-mm@...ck.org
Subject: [RFC PATCH v4 4/5] mm/readahead: Store folio order in struct file_ra_state

Previously the folio order of the previous readahead request was
inferred from the folio whose readahead marker was hit. But because we
sometimes have to round to non-natural boundaries, this first folio in
the readahead block is often smaller than the preferred order for that
request. This means that for cases where the initial sync readahead is
poorly aligned, the folio order ramps up much more slowly.
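
For illustration only (the orders used here are hypothetical; the
snippets are simplified from the page_cache_async_ra() hunks below): if
the previous request preferred order-4 folios but alignment left the
readahead marker on an order-2 folio, the old scheme restarts the ramp
from 2 rather than 4:

	/* Before: order re-derived from the folio carrying the marker. */
	unsigned int order = folio_order(folio);	/* 2, not the preferred 4 */
	...
	order += 2;					/* ramps only to 4 */
	page_cache_ra_order(ractl, ra, order);

	/* After: the preferred order is remembered in file_ra_state. */
	ra->order += 2;					/* 4 -> 6 */
	page_cache_ra_order(ractl, ra);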

So instead, let's store the order in struct file_ra_state so we are not
affected by any required alignment. We previously made enough room in
the struct for a 16-bit order field. This should be plenty big enough
since we are limited to MAX_PAGECACHE_ORDER anyway, which is certainly
never larger than ~20.
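
(If we ever wanted to make that bound explicit, a compile-time check
along the following lines would do; it is illustrative only and not
part of this patch:)

	static_assert(MAX_PAGECACHE_ORDER <= USHRT_MAX);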

Since we now pass order in struct file_ra_state, page_cache_ra_order()
no longer needs its new_order parameter, so let's remove that.
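
In other words, callers now follow this pattern (shown for illustration
only; it matches the hunks below):

	ra->order = new_order;		/* stash the preferred order */
	page_cache_ra_order(ractl, ra);

rather than passing new_order as a third argument.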

Worked example:

Here we are touching pages 17-256 sequentially, just as we did in the
previous commit, but now that we remember the preferred order
explicitly, we no longer have the slow ramp-up problem. Note
specifically that we no longer have 2 rounds (2x ~128K) of order-2
folios:

TYPE    STARTOFFS     ENDOFFS        SIZE  STARTPG    ENDPG   NRPG  ORDER  RA
-----  ----------  ----------  ----------  -------  -------  -----  -----  --
HOLE   0x00000000  0x00001000        4096        0        1      1
FOLIO  0x00001000  0x00002000        4096        1        2      1      0
FOLIO  0x00002000  0x00003000        4096        2        3      1      0
FOLIO  0x00003000  0x00004000        4096        3        4      1      0
FOLIO  0x00004000  0x00005000        4096        4        5      1      0
FOLIO  0x00005000  0x00006000        4096        5        6      1      0
FOLIO  0x00006000  0x00007000        4096        6        7      1      0
FOLIO  0x00007000  0x00008000        4096        7        8      1      0
FOLIO  0x00008000  0x00009000        4096        8        9      1      0
FOLIO  0x00009000  0x0000a000        4096        9       10      1      0
FOLIO  0x0000a000  0x0000b000        4096       10       11      1      0
FOLIO  0x0000b000  0x0000c000        4096       11       12      1      0
FOLIO  0x0000c000  0x0000d000        4096       12       13      1      0
FOLIO  0x0000d000  0x0000e000        4096       13       14      1      0
FOLIO  0x0000e000  0x0000f000        4096       14       15      1      0
FOLIO  0x0000f000  0x00010000        4096       15       16      1      0
FOLIO  0x00010000  0x00011000        4096       16       17      1      0
FOLIO  0x00011000  0x00012000        4096       17       18      1      0
FOLIO  0x00012000  0x00013000        4096       18       19      1      0
FOLIO  0x00013000  0x00014000        4096       19       20      1      0
FOLIO  0x00014000  0x00015000        4096       20       21      1      0
FOLIO  0x00015000  0x00016000        4096       21       22      1      0
FOLIO  0x00016000  0x00017000        4096       22       23      1      0
FOLIO  0x00017000  0x00018000        4096       23       24      1      0
FOLIO  0x00018000  0x00019000        4096       24       25      1      0
FOLIO  0x00019000  0x0001a000        4096       25       26      1      0
FOLIO  0x0001a000  0x0001b000        4096       26       27      1      0
FOLIO  0x0001b000  0x0001c000        4096       27       28      1      0
FOLIO  0x0001c000  0x0001d000        4096       28       29      1      0
FOLIO  0x0001d000  0x0001e000        4096       29       30      1      0
FOLIO  0x0001e000  0x0001f000        4096       30       31      1      0
FOLIO  0x0001f000  0x00020000        4096       31       32      1      0
FOLIO  0x00020000  0x00021000        4096       32       33      1      0
FOLIO  0x00021000  0x00022000        4096       33       34      1      0
FOLIO  0x00022000  0x00024000        8192       34       36      2      1
FOLIO  0x00024000  0x00028000       16384       36       40      4      2
FOLIO  0x00028000  0x0002c000       16384       40       44      4      2
FOLIO  0x0002c000  0x00030000       16384       44       48      4      2
FOLIO  0x00030000  0x00034000       16384       48       52      4      2
FOLIO  0x00034000  0x00038000       16384       52       56      4      2
FOLIO  0x00038000  0x0003c000       16384       56       60      4      2
FOLIO  0x0003c000  0x00040000       16384       60       64      4      2
FOLIO  0x00040000  0x00050000       65536       64       80     16      4
FOLIO  0x00050000  0x00060000       65536       80       96     16      4
FOLIO  0x00060000  0x00080000      131072       96      128     32      5
FOLIO  0x00080000  0x000a0000      131072      128      160     32      5
FOLIO  0x000a0000  0x000c0000      131072      160      192     32      5
FOLIO  0x000c0000  0x000e0000      131072      192      224     32      5
FOLIO  0x000e0000  0x00100000      131072      224      256     32      5
FOLIO  0x00100000  0x00120000      131072      256      288     32      5
FOLIO  0x00120000  0x00140000      131072      288      320     32      5  Y
HOLE   0x00140000  0x00800000     7077888      320     2048   1728

Signed-off-by: Ryan Roberts <ryan.roberts@....com>
---
 include/linux/fs.h |  2 ++
 mm/filemap.c       |  6 ++++--
 mm/internal.h      |  3 +--
 mm/readahead.c     | 18 +++++++++++-------
 4 files changed, 18 insertions(+), 11 deletions(-)

diff --git a/include/linux/fs.h b/include/linux/fs.h
index 44362bef0010..cde482a7270a 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1031,6 +1031,7 @@ struct fown_struct {
  *      and so were/are genuinely "ahead".  Start next readahead when
  *      the first of these pages is accessed.
  * @ra_pages: Maximum size of a readahead request, copied from the bdi.
+ * @order: Preferred folio order used for most recent readahead.
  * @mmap_miss: How many mmap accesses missed in the page cache.
  * @prev_pos: The last byte in the most recent read request.
  *
@@ -1042,6 +1043,7 @@ struct file_ra_state {
 	unsigned int size;
 	unsigned int async_size;
 	unsigned int ra_pages;
+	unsigned short order;
 	unsigned short mmap_miss;
 	loff_t prev_pos;
 };
diff --git a/mm/filemap.c b/mm/filemap.c
index fa129ecfd80f..e61f374068d4 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -3222,7 +3222,8 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf)
 		if (!(vm_flags & VM_RAND_READ))
 			ra->size *= 2;
 		ra->async_size = HPAGE_PMD_NR;
-		page_cache_ra_order(&ractl, ra, HPAGE_PMD_ORDER);
+		ra->order = HPAGE_PMD_ORDER;
+		page_cache_ra_order(&ractl, ra);
 		return fpin;
 	}
 #endif
@@ -3258,8 +3259,9 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf)
 	ra->start = max_t(long, 0, vmf->pgoff - ra->ra_pages / 2);
 	ra->size = ra->ra_pages;
 	ra->async_size = ra->ra_pages / 4;
+	ra->order = 0;
 	ractl._index = ra->start;
-	page_cache_ra_order(&ractl, ra, 0);
+	page_cache_ra_order(&ractl, ra);
 	return fpin;
 }
 
diff --git a/mm/internal.h b/mm/internal.h
index 40464f755092..437c7738668d 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -442,8 +442,7 @@ void zap_page_range_single_batched(struct mmu_gather *tlb,
 int folio_unmap_invalidate(struct address_space *mapping, struct folio *folio,
 			   gfp_t gfp);
 
-void page_cache_ra_order(struct readahead_control *, struct file_ra_state *,
-		unsigned int order);
+void page_cache_ra_order(struct readahead_control *, struct file_ra_state *);
 void force_page_cache_ra(struct readahead_control *, unsigned long nr);
 static inline void force_page_cache_readahead(struct address_space *mapping,
 		struct file *file, pgoff_t index, unsigned long nr_to_read)
diff --git a/mm/readahead.c b/mm/readahead.c
index 82f9f623f2d7..18972bc34861 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -457,7 +457,7 @@ static inline int ra_alloc_folio(struct readahead_control *ractl, pgoff_t index,
 }
 
 void page_cache_ra_order(struct readahead_control *ractl,
-		struct file_ra_state *ra, unsigned int new_order)
+		struct file_ra_state *ra)
 {
 	struct address_space *mapping = ractl->mapping;
 	pgoff_t start = readahead_index(ractl);
@@ -469,6 +469,7 @@ void page_cache_ra_order(struct readahead_control *ractl,
 	int err = 0;
 	gfp_t gfp = readahead_gfp_mask(mapping);
 	unsigned int min_ra_size = max(4, mapping_min_folio_nrpages(mapping));
+	unsigned int new_order = ra->order;
 
 	/*
 	 * Fallback when size < min_nrpages as each folio should be
@@ -483,6 +484,8 @@ void page_cache_ra_order(struct readahead_control *ractl,
 	new_order = min_t(unsigned int, new_order, ilog2(ra->size));
 	new_order = max(new_order, min_order);
 
+	ra->order = new_order;
+
 	/* See comment in page_cache_ra_unbounded() */
 	nofs = memalloc_nofs_save();
 	filemap_invalidate_lock_shared(mapping);
@@ -525,6 +528,7 @@ void page_cache_ra_order(struct readahead_control *ractl,
 	 * ->readahead() may have updated readahead window size so we have to
 	 * check there's still something to read.
 	 */
+	ra->order = 0;
 	if (ra->size > index - start)
 		do_page_cache_ra(ractl, ra->size - (index - start),
 				 ra->async_size);
@@ -614,8 +618,9 @@ void page_cache_sync_ra(struct readahead_control *ractl,
 	ra->size = min(contig_count + req_count, max_pages);
 	ra->async_size = 1;
 readit:
+	ra->order = 0;
 	ractl->_index = ra->start;
-	page_cache_ra_order(ractl, ra, 0);
+	page_cache_ra_order(ractl, ra);
 }
 EXPORT_SYMBOL_GPL(page_cache_sync_ra);
 
@@ -626,7 +631,6 @@ void page_cache_async_ra(struct readahead_control *ractl,
 	struct file_ra_state *ra = ractl->ra;
 	pgoff_t index = readahead_index(ractl);
 	pgoff_t expected, start, end, aligned_end;
-	unsigned int order = folio_order(folio);
 
 	/* no readahead */
 	if (!ra->ra_pages)
@@ -649,7 +653,7 @@ void page_cache_async_ra(struct readahead_control *ractl,
 	 * Ramp up sizes, and push forward the readahead window.
 	 */
 	expected = round_down(ra->start + ra->size - ra->async_size,
-			1UL << order);
+			1UL << folio_order(folio));
 	if (index == expected) {
 		ra->start += ra->size;
 		/*
@@ -678,14 +682,14 @@ void page_cache_async_ra(struct readahead_control *ractl,
 	ra->size += req_count;
 	ra->size = get_next_ra_size(ra, max_pages);
 readit:
-	order += 2;
+	ra->order += 2;
 	end = ra->start + ra->size;
-	aligned_end = round_down(end, 1UL << order);
+	aligned_end = round_down(end, 1UL << ra->order);
 	if (aligned_end > ra->start)
 		ra->size -= end - aligned_end;
 	ra->async_size = ra->size;
 	ractl->_index = ra->start;
-	page_cache_ra_order(ractl, ra, order);
+	page_cache_ra_order(ractl, ra);
 }
 EXPORT_SYMBOL_GPL(page_cache_async_ra);
 
-- 
2.43.0

