Message-ID: <726dcb51-82a7-49a7-a8e5-49bc3eb05dcf@redhat.com>
Date: Mon, 27 Oct 2025 22:40:49 +0100
From: David Hildenbrand <david@...hat.com>
To: Dev Jain <dev.jain@....com>, akpm@...ux-foundation.org
Cc: Liam.Howlett@...cle.com, lorenzo.stoakes@...cle.com, vbabka@...e.cz,
jannh@...gle.com, pfalcato@...e.de, linux-mm@...ck.org,
linux-kernel@...r.kernel.org, peterx@...hat.com, ryan.roberts@....com,
mingo@...nel.org, libang.li@...group.com, maobibo@...ngson.cn,
zhengqi.arch@...edance.com, baohua@...nel.org, anshuman.khandual@....com,
willy@...radead.org, ioworker0@...il.com, yang@...amperecomputing.com,
baolin.wang@...ux.alibaba.com, ziy@...dia.com, hughd@...gle.com
Subject: Re: [PATCH v4 2/2] mm: Optimize mremap() by PTE batching
On 10.06.25 05:50, Dev Jain wrote:
> Use folio_pte_batch() to optimize move_ptes(). On arm64, if the ptes
> are painted with the contig bit, then ptep_get() will iterate through all 16
> entries to collect a/d bits. Hence this optimization will result in a 16x
> reduction in the number of ptep_get() calls. Next, ptep_get_and_clear()
> will eventually call contpte_try_unfold() on every contig block, thus
> flushing the TLB for the complete large folio range. Instead, use
> get_and_clear_full_ptes() so as to elide TLBIs on each contig block, and only
> do them on the starting and ending contig block.
>
> For split folios, there will be no pte batching; nr_ptes will be 1. For
> pagetable splitting, the ptes will still point to the same large folio;
> for arm64, this results in the optimization described above, and for other
> arches (including the general case), a minor improvement is expected due to
> a reduction in the number of function calls.
>
> Signed-off-by: Dev Jain <dev.jain@....com>
> ---
>  mm/mremap.c | 39 ++++++++++++++++++++++++++++++++-------
>  1 file changed, 32 insertions(+), 7 deletions(-)
>
> diff --git a/mm/mremap.c b/mm/mremap.c
> index 180b12225368..18b215521ada 100644
> --- a/mm/mremap.c
> +++ b/mm/mremap.c
> @@ -170,6 +170,23 @@ static pte_t move_soft_dirty_pte(pte_t pte)
>  	return pte;
>  }
> 
> +static int mremap_folio_pte_batch(struct vm_area_struct *vma, unsigned long addr,
> +		pte_t *ptep, pte_t pte, int max_nr)
> +{
> +	const fpb_t flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY;
> +	struct folio *folio;
> +
> +	if (max_nr == 1)
> +		return 1;
> +
> +	folio = vm_normal_folio(vma, addr, pte);
> +	if (!folio || !folio_test_large(folio))
> +		return 1;
> +
> +	return folio_pte_batch(folio, addr, ptep, pte, max_nr, flags, NULL,
> +			       NULL, NULL);
> +}
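The hunk above only adds the helper; the rest of the patch reworks the
move_ptes() loop to consume it. Based on the changelog, that loop
presumably ends up looking roughly like the sketch below, where nr_ptes
from mremap_folio_pte_batch() drives one get_and_clear_full_ptes()/set_ptes()
pair per batch instead of per-PTE calls. Variable names are illustrative
and non-present PTEs, TLB flushing and error handling are elided; this is
not the actual hunk.

	/*
	 * Illustrative sketch only: batch-move present PTEs of a large folio.
	 */
	for (; old_addr < old_end; old_ptep += nr_ptes, old_addr += nr_ptes * PAGE_SIZE,
				   new_ptep += nr_ptes, new_addr += nr_ptes * PAGE_SIZE) {
		int max_nr = (old_end - old_addr) >> PAGE_SHIFT;
		pte_t pte = ptep_get(old_ptep);

		nr_ptes = 1;
		if (pte_none(pte))
			continue;

		if (pte_present(pte)) {
			/* How many PTEs of the same large folio can be moved in one go? */
			nr_ptes = mremap_folio_pte_batch(vma, old_addr, old_ptep, pte, max_nr);
			/* One batched clear, accumulating access/dirty bits of the batch. */
			pte = get_and_clear_full_ptes(mm, old_addr, old_ptep, nr_ptes, 0);
			pte = move_soft_dirty_pte(pte);
			/* ... other pte fixups elided ... */
			set_ptes(mm, new_addr, new_ptep, pte, nr_ptes);
		}
	}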
Dev, I think there is another bug hiding in here. That function ignores
the writable bit, which is not what you need here, in particular for
anonymous folios in some cases.
Later set_ptes() could end up marking ptes writable that were not writable
before, which is bad (at least for anonymous folios, maybe also for pagecache
folios).
I think you really must respect the writable bit through something like
FPB_RESPECT_WRITE.
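As a rough sketch, assuming a new FPB_RESPECT_WRITE flag were added next to
the existing FPB_* flags (the name follows the suggestion above and is not
an existing upstream flag), the helper would only have to pass it along, so
that folio_pte_batch() would stop batching at the first PTE whose write bit
differs from the first one:

	/* Sketch only: FPB_RESPECT_WRITE is hypothetical, per the suggestion above. */
	static int mremap_folio_pte_batch(struct vm_area_struct *vma, unsigned long addr,
			pte_t *ptep, pte_t pte, int max_nr)
	{
		const fpb_t flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY |
				    FPB_RESPECT_WRITE;
		struct folio *folio;

		if (max_nr == 1)
			return 1;

		folio = vm_normal_folio(vma, addr, pte);
		if (!folio || !folio_test_large(folio))
			return 1;

		/* With the write bit respected, set_ptes() cannot upgrade protections. */
		return folio_pte_batch(folio, addr, ptep, pte, max_nr, flags, NULL,
				       NULL, NULL);
	}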
I patched out the "pte_batch_hint(ptep, pte) == 1" check we have upstream
to make it reproduce on x86_64, but the following reproducer should likely
reproduce on aarch64 without further kernel modifications.
# ./mremap
BUG: Memory modified
#define _GNU_SOURCE
#include <stdint.h>
#include <string.h>
#include <stdbool.h>
#include <stdlib.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>
#include <errno.h>
#include <signal.h>
#include <fcntl.h>
#include <sys/wait.h>
static size_t pagesize;
static size_t thpsize;
static int pagemap_fd;
static uint64_t pagemap_get_entry(int fd, char *start)
{
	const unsigned long pfn = (unsigned long)start / getpagesize();
	uint64_t entry;
	int ret;

	ret = pread(fd, &entry, sizeof(entry), pfn * sizeof(entry));
	if (ret != sizeof(entry)) {
		perror("reading pagemap failed");
		exit(-1);
	}
	return entry;
}

static bool pagemap_is_populated(int fd, char *start)
{
	return pagemap_get_entry(fd, start) & ((1ULL << 62) | (1ULL << 63));
}

unsigned long pagemap_get_pfn(int fd, char *start)
{
	uint64_t entry = pagemap_get_entry(fd, start);

	/* If present (63rd bit), PFN is at bit 0 -- 54. */
	if (entry & (1ULL << 63))
		return entry & 0x007fffffffffffffull;
	return -1ul;
}
int main(void)
{
	char *mem, *mmap_mem, *tmp, *mremap_mem = MAP_FAILED;
	size_t size, mmap_size;
	int ret;

	pagesize = getpagesize();
	thpsize = 2 * 1024 * 1024ul;

	pagemap_fd = open("/proc/self/pagemap", O_RDONLY);
	if (pagemap_fd < 0) {
		perror("opening pagemap failed");
		return -1;
	}

	/* For alignment purposes, we need twice the thp size. */
	mmap_size = 2 * thpsize;
	mmap_mem = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE,
			MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (mmap_mem == MAP_FAILED) {
		perror("mmap() failed");
		return -1;
	}

	/* We need a THP-aligned memory area. */
	mem = (char *)(((uintptr_t)mmap_mem + thpsize) & ~(thpsize - 1));

	ret = madvise(mem, thpsize, MADV_HUGEPAGE);
	if (ret) {
		perror("MADV_HUGEPAGE failed");
		return -1;
	}

	/*
	 * Try to populate a THP. Touch the first sub-page and test if we get
	 * another sub-page populated automatically.
	 */
	mem[0] = 0;
	if (!pagemap_is_populated(pagemap_fd, mem + pagesize)) {
		perror("Did not get a THP populated");
		return -1;
	}

	/* Share only the first page of the THP. */
	if (madvise(mem, pagesize, MADV_DONTFORK)) {
		perror("MADV_DONTFORK failed");
		return -1;
	}

	ret = fork();
	if (ret < 0) {
		perror("fork() failed");
		return -1;
	} else if (!ret) {
		while (true) {
			char c = *((volatile char *)(mem + pagesize));

			if (c) {
				fprintf(stderr, "BUG: Memory modified\n");
				exit(-2);
			}
		}
	}
	/* Merge VMAs again. */
	if (madvise(mem, pagesize, MADV_DOFORK)) {
		perror("MADV_DOFORK failed");
		return -1;
	}

	/* Mremap multiple pages. */
	mremap_mem = mmap(NULL, 2 * pagesize, PROT_NONE,
			  MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (mremap_mem == MAP_FAILED) {
		perror("mmap() failed");
		return -1;
	}
	tmp = mremap(mem, 2 * pagesize, 2 * pagesize, MREMAP_MAYMOVE | MREMAP_FIXED,
		     mremap_mem);
	if (tmp != mremap_mem) {
		perror("mremap() failed");
		return -1;
	}

	/* Write into both pages. The child should never see these updates. */
	memset(mremap_mem, 1, 2 * pagesize);
	pause();
}
--
Cheers
David / dhildenb