Message-Id: <20260206135648.38164-1-dev.jain@arm.com>
Date: Fri,  6 Feb 2026 19:26:48 +0530
From: Dev Jain <dev.jain@....com>
To: akpm@...ux-foundation.org,
	david@...nel.org
Cc: lorenzo.stoakes@...cle.com,
	Liam.Howlett@...cle.com,
	vbabka@...e.cz,
	rppt@...nel.org,
	surenb@...gle.com,
	mhocko@...e.com,
	linux-mm@...ck.org,
	linux-kernel@...r.kernel.org,
	ryan.roberts@....com,
	anshuman.khandual@....com,
	kirill@...temov.name,
	willy@...radead.org,
	Dev Jain <dev.jain@....com>
Subject: [PATCH] mm: map as many pages as possible in finish_fault

Suppose a 64K VMA maps a file or shmem file at index 0, and the pagecache
at index 0 contains an order-9 folio. If do_fault_around is able to find
this folio, filemap_map_pages ultimately maps the first 64K of the folio
into the pagetable, thus reducing the number of future page faults. If
fault-around fails to satisfy the fault, or if it is a write fault, then we
use vma->vm_ops->fault to find/create the folio, followed by finish_fault
to map the folio into the pagetable. On encountering such a large folio
crossing the VMA boundary, finish_fault currently falls back to mapping
only a single page.

Align finish_fault with filemap_map_pages, and map as many pages as
possible, without crossing VMA/PMD/file boundaries.
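
As an illustration of the clamping, here is a minimal userspace sketch
mirroring the max3()/min3() logic in the patch (not the kernel code: the
4K page size, 2M PMD size and the example addresses are made up, and the
i_size clamp applied to non-shmem mappings is omitted):

#include <stdio.h>

#define EX_PAGE_SIZE  (1UL << 12)
#define EX_PMD_SIZE   (1UL << 21)

static unsigned long max3u(unsigned long a, unsigned long b, unsigned long c)
{
    unsigned long m = a > b ? a : b;
    return m > c ? m : c;
}

static unsigned long min3u(unsigned long a, unsigned long b, unsigned long c)
{
    unsigned long m = a < b ? a : b;
    return m < c ? m : c;
}

int main(void)
{
    /* Made-up example: a 64K VMA mapping file index 0, write fault on its
     * third page, pagecache holding an order-9 (2M) folio at index 0. */
    unsigned long vm_start = 0x20000, vm_end = 0x30000;
    unsigned long fault_addr = 0x22000;
    unsigned long folio_pages = 512;

    /* VA at which page 0 of the folio sits, and the folio's end VA */
    unsigned long folio_start = vm_start;
    unsigned long folio_end = folio_start + folio_pages * EX_PAGE_SIZE;

    /* PMD-sized pagetable window covering the faulting address */
    unsigned long pmd_start = fault_addr & ~(EX_PMD_SIZE - 1);
    unsigned long pmd_end = pmd_start + EX_PMD_SIZE;

    /* Largest range staying within the VMA, the PMD and the folio */
    unsigned long start = max3u(vm_start, pmd_start, folio_start);
    unsigned long end = min3u(vm_end, pmd_end, folio_end);

    printf("map [%#lx, %#lx): %lu pages in one go\n",
           start, end, (end - start) / EX_PAGE_SIZE);
    return 0;
}

This prints "map [0x20000, 0x30000): 16 pages in one go", i.e. the whole
64K VMA gets mapped by the single fault instead of one page.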

Commit 19773df031bc ("mm/fault: try to map the entire file folio in
finish_fault()") argues that doing per-page faults only avoids the RSS
accounting, not the RSS inflation itself. Combined with the improvements
below, it makes sense to map as many pages as possible.

We test the patch with the userspace program below. A 2M shmem VMA is
created and faulted in, with the sysfs knob
hugepages-2048kB/shmem_enabled set to "always", so that the pagecache is
populated with a 2M folio. A 64K VMA is then created and we fault on each
of its pages, after which MADV_DONTNEED zaps the pagetable so that the
next iteration faults again. We measure the time accumulated while
faulting the VMA.

On arm64,

without patch:
Total time taken by inner loop: 4701721766 ns

with patch:
Total time taken by inner loop: 516043507 ns

giving a 9x improvement.

To rule out arm64 contpte interference (contpte actually worsens the
execution time here: the mapped memory is never accessed afterwards, so we
only pay the overhead of painting the PTEs with the cont bit), we can
change the program to map a 32K VMA and take the 8 faults there. For this
case:

without patch:
Total time taken by inner loop: 2081356415 ns

with patch:
Total time taken by inner loop: 408755218 ns

giving a ~5x improvement as well.
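
One way to obtain the 32K variant from the program below (assuming the
fault loop bound is kept at MTHPSIZE / PAGESIZE) is to change only:

#define MTHPSIZE    (1UL << 15)    /* 32K VMA: the loop then faults 8 pages */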

#define _GNU_SOURCE
#include <stdio.h>
#include <unistd.h>
#include <stdlib.h>
#include <sys/mman.h>
#include <assert.h>
#include <time.h>
#include <errno.h>
#include <string.h>
#include <sys/syscall.h>
#include <linux/memfd.h>
#include <fcntl.h>

#define PMDSIZE     (1UL << 21)
#define PAGESIZE    (1UL << 12)
#define MTHPSIZE    (1UL << 16)

#define ITERATIONS  1000000

static int xmemfd_create(const char *name, unsigned int flags)
{
#ifdef SYS_memfd_create
    return syscall(SYS_memfd_create, name, flags);
#else
    (void)name; (void)flags;
    errno = ENOSYS;
    return -1;
#endif
}

static void die(const char *msg)
{
    fprintf(stderr, "%s: %s (errno=%d)\n", msg, strerror(errno), errno);
    exit(1);
}

int main(void)
{
    /* Create a shmem-backed "file" (anonymous, RAM/swap-backed) */
    int fd = xmemfd_create("willitscale-shmem", MFD_CLOEXEC);
    if (fd < 0)
        die("memfd_create");

    if (ftruncate(fd, PMDSIZE) != 0)
        die("ftruncate");

    char *c = mmap(NULL, PMDSIZE, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
    if (c == MAP_FAILED)
        die("mmap PMDSIZE");

    assert(!((unsigned long)c & (PMDSIZE - 1)));

    /* allocate PMD shmem folio */
    c[0] = 0;

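    /* Map a small window over the same file; write faults here go through finish_fault */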
    char *ptr = mmap(NULL, MTHPSIZE, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
    if (ptr == MAP_FAILED)
        die("mmap MTHPSIZE");

    assert(!((unsigned long)ptr & (MTHPSIZE - 1)));

    long total_time_ns = 0;

    for (int i = 0; i < ITERATIONS; ++i) {
        struct timespec start, end;
        if (clock_gettime(CLOCK_MONOTONIC, &start) != 0)
            die("clock_gettime start");

        /* Write-fault each page of the small VMA */
        for (unsigned long j = 0; j < MTHPSIZE / PAGESIZE; ++j) {
            ptr[j * PAGESIZE] = 3;
        }

        if (clock_gettime(CLOCK_MONOTONIC, &end) != 0)
            die("clock_gettime end");

        long elapsed_ns =
            (end.tv_sec - start.tv_sec) * 1000000000L +
            (end.tv_nsec - start.tv_nsec);

        total_time_ns += elapsed_ns;

        /* Zap the mappings so the next iteration faults again */
        assert(madvise(ptr, MTHPSIZE, MADV_DONTNEED) == 0);
    }

    printf("Total time taken by inner loop: %ld ns\n", total_time_ns);

    munmap(ptr, MTHPSIZE);
    munmap(c, PMDSIZE);
    close(fd);
    return 0;
}

Signed-off-by: Dev Jain <dev.jain@....com>
---
Based on mm-unstable (6873c4e2723d). mm-selftests pass.

 mm/memory.c | 72 ++++++++++++++++++++++++++++-------------------------
 1 file changed, 38 insertions(+), 34 deletions(-)

diff --git a/mm/memory.c b/mm/memory.c
index 79ba525671c7..b3d951573076 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -5563,11 +5563,14 @@ vm_fault_t finish_fault(struct vm_fault *vmf)
 	bool is_cow = (vmf->flags & FAULT_FLAG_WRITE) &&
 		      !(vma->vm_flags & VM_SHARED);
 	int type, nr_pages;
-	unsigned long addr;
-	bool needs_fallback = false;
+	unsigned long start_addr;
+	bool single_page_fallback = false;
+	bool try_pmd_mapping = true;
+	pgoff_t file_end;
+	struct address_space *mapping = vma->vm_file ? vma->vm_file->f_mapping : NULL;
 
 fallback:
-	addr = vmf->address;
+	start_addr = vmf->address;
 
 	/* Did we COW the page? */
 	if (is_cow)
@@ -5586,25 +5589,22 @@ vm_fault_t finish_fault(struct vm_fault *vmf)
 			return ret;
 	}
 
-	if (!needs_fallback && vma->vm_file) {
-		struct address_space *mapping = vma->vm_file->f_mapping;
-		pgoff_t file_end;
-
+	if (!single_page_fallback && mapping) {
 		file_end = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE);
 
 		/*
-		 * Do not allow to map with PTEs beyond i_size and with PMD
-		 * across i_size to preserve SIGBUS semantics.
+		 * Do not allow to map with PMD across i_size to preserve
+		 * SIGBUS semantics.
 		 *
 		 * Make an exception for shmem/tmpfs that for long time
 		 * intentionally mapped with PMDs across i_size.
 		 */
-		needs_fallback = !shmem_mapping(mapping) &&
-			file_end < folio_next_index(folio);
+		try_pmd_mapping = shmem_mapping(mapping) ||
+			file_end >= folio_next_index(folio);
 	}
 
 	if (pmd_none(*vmf->pmd)) {
-		if (!needs_fallback && folio_test_pmd_mappable(folio)) {
+		if (try_pmd_mapping && folio_test_pmd_mappable(folio)) {
 			ret = do_set_pmd(vmf, folio, page);
 			if (ret != VM_FAULT_FALLBACK)
 				return ret;
@@ -5619,49 +5619,53 @@ vm_fault_t finish_fault(struct vm_fault *vmf)
 	nr_pages = folio_nr_pages(folio);
 
 	/* Using per-page fault to maintain the uffd semantics */
-	if (unlikely(userfaultfd_armed(vma)) || unlikely(needs_fallback)) {
+	if (unlikely(userfaultfd_armed(vma)) || unlikely(single_page_fallback)) {
 		nr_pages = 1;
 	} else if (nr_pages > 1) {
-		pgoff_t idx = folio_page_idx(folio, page);
-		/* The page offset of vmf->address within the VMA. */
-		pgoff_t vma_off = vmf->pgoff - vmf->vma->vm_pgoff;
-		/* The index of the entry in the pagetable for fault page. */
-		pgoff_t pte_off = pte_index(vmf->address);
+
+		/* Ensure mapping stays within VMA and PMD boundaries */
+		unsigned long pmd_boundary_start = ALIGN_DOWN(vmf->address, PMD_SIZE);
+		unsigned long pmd_boundary_end = pmd_boundary_start + PMD_SIZE;
+		unsigned long va_of_folio_start = vmf->address - ((vmf->pgoff - folio->index) * PAGE_SIZE);
+		unsigned long va_of_folio_end = va_of_folio_start + nr_pages * PAGE_SIZE;
+		unsigned long end_addr;
+
+		start_addr = max3(vma->vm_start, pmd_boundary_start, va_of_folio_start);
+		end_addr = min3(vma->vm_end, pmd_boundary_end, va_of_folio_end);
 
 		/*
-		 * Fallback to per-page fault in case the folio size in page
-		 * cache beyond the VMA limits and PMD pagetable limits.
+		 * Do not allow to map with PTEs across i_size to preserve
+		 * SIGBUS semantics.
+		 *
+		 * Make an exception for shmem/tmpfs that for long time
+		 * intentionally mapped with PMDs across i_size.
 		 */
-		if (unlikely(vma_off < idx ||
-			    vma_off + (nr_pages - idx) > vma_pages(vma) ||
-			    pte_off < idx ||
-			    pte_off + (nr_pages - idx)  > PTRS_PER_PTE)) {
-			nr_pages = 1;
-		} else {
-			/* Now we can set mappings for the whole large folio. */
-			addr = vmf->address - idx * PAGE_SIZE;
-			page = &folio->page;
-		}
+		if (mapping && !shmem_mapping(mapping))
+			end_addr = min(end_addr, va_of_folio_start + (file_end - folio->index) * PAGE_SIZE);
+
+		nr_pages = (end_addr - start_addr) >> PAGE_SHIFT;
+		page = folio_page(folio, (start_addr - va_of_folio_start) >> PAGE_SHIFT);
 	}
 
 	vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
-				       addr, &vmf->ptl);
+				       start_addr, &vmf->ptl);
 	if (!vmf->pte)
 		return VM_FAULT_NOPAGE;
 
 	/* Re-check under ptl */
 	if (nr_pages == 1 && unlikely(vmf_pte_changed(vmf))) {
-		update_mmu_tlb(vma, addr, vmf->pte);
+		update_mmu_tlb(vma, start_addr, vmf->pte);
 		ret = VM_FAULT_NOPAGE;
 		goto unlock;
 	} else if (nr_pages > 1 && !pte_range_none(vmf->pte, nr_pages)) {
-		needs_fallback = true;
+		single_page_fallback = true;
+		try_pmd_mapping = false;
 		pte_unmap_unlock(vmf->pte, vmf->ptl);
 		goto fallback;
 	}
 
 	folio_ref_add(folio, nr_pages - 1);
-	set_pte_range(vmf, folio, page, nr_pages, addr);
+	set_pte_range(vmf, folio, page, nr_pages, start_addr);
 	type = is_cow ? MM_ANONPAGES : mm_counter_file(folio);
 	add_mm_counter(vma->vm_mm, type, nr_pages);
 	ret = 0;
-- 
2.34.1

