Message-ID: <9f78fd5e-a785-4473-8c6b-8267821db446@redhat.com>
Date: Thu, 24 Apr 2025 14:33:54 +0200
From: David Hildenbrand <david@...hat.com>
To: Lorenzo Stoakes <lorenzo.stoakes@...cle.com>
Cc: linux-kernel@...r.kernel.org, linux-mm@...ck.org, x86@...nel.org,
 Andrew Morton <akpm@...ux-foundation.org>, Ingo Molnar <mingo@...nel.org>,
 Dave Hansen <dave.hansen@...ux.intel.com>, Andy Lutomirski
 <luto@...nel.org>, Peter Zijlstra <peterz@...radead.org>,
 Thomas Gleixner <tglx@...utronix.de>, Borislav Petkov <bp@...en8.de>,
 Rik van Riel <riel@...riel.com>, "H. Peter Anvin" <hpa@...or.com>,
 Linus Torvalds <torvalds@...ux-foundation.org>
Subject: Re: [PATCH v1] kernel/fork: only call untrack_pfn_clear() on VMAs
 duplicated for fork()

>>
>> ... and I think we still have space in vm_area_struct without increasing it
>> beyond 192 bytes.
> 
> Hm, so you're thinking of a general field in the VMA? I thought this would
> belong to the PAT object somehow?

It's glued to a VMA. The only alternative to a VMA field would be storing the
data externally and looking it up per VMA (in an xarray etc.), which ends up
complicating things when there is no need for it right now.
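Very rough sketch of that alternative, just to show the extra moving parts (the
helper below is made up, it is not part of this patch):

static DEFINE_XARRAY(pfnmap_track_ctxs);	/* keyed by VMA address */

static int pfnmap_track_ctx_attach(struct vm_area_struct *vma,
				   struct pfnmap_track_ctx *ctx)
{
	/* xa_store() can fail; dup/free would need their own lookup+removal. */
	return xa_err(xa_store(&pfnmap_track_ctxs, (unsigned long)vma, ctx,
			       GFP_KERNEL));
}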

> 
> Though getting rid of VM_PAT would be fantastic...
> 
> I wonder if a _general_ VMA ref count would be a bit much just for this
> case.

I don't think it would be helpful for this case. It's much more similar to the anon
VMA name (which also has its own kref).
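
For reference, the existing pattern (simplified) next to what gets added below:

struct anon_vma_name {
	struct kref kref;
	char name[];		/* dynamically sized */
};

struct pfnmap_track_ctx {
	struct kref kref;	/* shared by all VMAs duplicated from the original */
	unsigned long pfn;
	unsigned long size;	/* in bytes */
};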

> 
> But maybe I misunderstand your approach :) Happy to obviously look and if
> not like some crazy thing just for PAT (you can understand why I would not
> like this) will be supportive :>)

This is something quick (well, it took longer than I wish it had) that seems to
work. There are smaller PAT-internal cleanups to be had on top of this, and the
new functions still need proper documentation.


Observe how:
* We remove VM_PAT and that weird VM flags manipulation + "locked" flag
* We remove any traces of the nasty tracking handling from mremap+fork code
* Just like anon_vma_name, it hooks into vm_area_dup()/vm_area_free().
* We remove the page table lookup via get_pat_info()->... completely
* We remove the VMA parameter from PAT code completely
* We reduce the track/untrack/sanitize interface to 3 functions (see the example below)
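
Driver-visible behavior is unchanged. A made-up example of the common case
(nothing below is from the patch, it only shows where the new pieces kick in):

static int example_mmap(struct file *file, struct vm_area_struct *vma)
{
	/*
	 * Covers the whole VMA, so remap_pfn_range() performs pfnmap_track()
	 * and attaches a kref'ed pfnmap_track_ctx to the VMA.
	 */
	return remap_pfn_range(vma, vma->vm_start, PHYS_PFN(0xfd000000),
			       vma->vm_end - vma->vm_start, vma->vm_page_prot);
}

fork() then only takes another reference on that ctx in vm_area_dup(); the last
vm_area_free() does the kref_put() that ends up in pfnmap_untrack().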

 From 4cf8b2a2e60220c5b438adf920d75cba3af50ab4 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@...hat.com>
Date: Thu, 24 Apr 2025 12:06:15 +0200
Subject: [PATCH] mm: rewrite pfnmap tracking

Signed-off-by: David Hildenbrand <david@...hat.com>
---
  arch/x86/mm/pat/memtype.c      | 155 ++-------------------------------
  drivers/gpu/drm/i915/i915_mm.c |   4 +-
  include/linux/mm.h             |   4 +-
  include/linux/mm_inline.h      |   2 +
  include/linux/mm_types.h       |  11 +++
  include/linux/pgtable.h        |  71 ++-------------
  include/trace/events/mmflags.h |   4 +-
  kernel/fork.c                  |  54 ++++++++++--
  mm/huge_memory.c               |   7 +-
  mm/io-mapping.c                |   2 +-
  mm/memory.c                    |  93 +++++++++++++++-----
  mm/memremap.c                  |   8 +-
  mm/mremap.c                    |   4 -
  13 files changed, 162 insertions(+), 257 deletions(-)

diff --git a/arch/x86/mm/pat/memtype.c b/arch/x86/mm/pat/memtype.c
index 72d8cbc611583..237c7e5e9d9aa 100644
--- a/arch/x86/mm/pat/memtype.c
+++ b/arch/x86/mm/pat/memtype.c
@@ -932,124 +932,14 @@ static void free_pfn_range(u64 paddr, unsigned long size)
  		memtype_free(paddr, paddr + size);
  }
  
-static int follow_phys(struct vm_area_struct *vma, unsigned long *prot,
-		resource_size_t *phys)
-{
-	struct follow_pfnmap_args args = { .vma = vma, .address = vma->vm_start };
-
-	if (follow_pfnmap_start(&args))
-		return -EINVAL;
-
-	/* Never return PFNs of anon folios in COW mappings. */
-	if (!args.special) {
-		follow_pfnmap_end(&args);
-		return -EINVAL;
-	}
-
-	*prot = pgprot_val(args.pgprot);
-	*phys = (resource_size_t)args.pfn << PAGE_SHIFT;
-	follow_pfnmap_end(&args);
-	return 0;
-}
-
-static int get_pat_info(struct vm_area_struct *vma, resource_size_t *paddr,
-		pgprot_t *pgprot)
-{
-	unsigned long prot;
-
-	VM_WARN_ON_ONCE(!(vma->vm_flags & VM_PAT));
-
-	/*
-	 * We need the starting PFN and cachemode used for track_pfn_remap()
-	 * that covered the whole VMA. For most mappings, we can obtain that
-	 * information from the page tables. For COW mappings, we might now
-	 * suddenly have anon folios mapped and follow_phys() will fail.
-	 *
-	 * Fallback to using vma->vm_pgoff, see remap_pfn_range_notrack(), to
-	 * detect the PFN. If we need the cachemode as well, we're out of luck
-	 * for now and have to fail fork().
-	 */
-	if (!follow_phys(vma, &prot, paddr)) {
-		if (pgprot)
-			*pgprot = __pgprot(prot);
-		return 0;
-	}
-	if (is_cow_mapping(vma->vm_flags)) {
-		if (pgprot)
-			return -EINVAL;
-		*paddr = (resource_size_t)vma->vm_pgoff << PAGE_SHIFT;
-		return 0;
-	}
-	WARN_ON_ONCE(1);
-	return -EINVAL;
-}
-
-int track_pfn_copy(struct vm_area_struct *dst_vma,
-		struct vm_area_struct *src_vma, unsigned long *pfn)
-{
-	const unsigned long vma_size = src_vma->vm_end - src_vma->vm_start;
-	resource_size_t paddr;
-	pgprot_t pgprot;
-	int rc;
-
-	if (!(src_vma->vm_flags & VM_PAT))
-		return 0;
-
-	/*
-	 * Duplicate the PAT information for the dst VMA based on the src
-	 * VMA.
-	 */
-	if (get_pat_info(src_vma, &paddr, &pgprot))
-		return -EINVAL;
-	rc = reserve_pfn_range(paddr, vma_size, &pgprot, 1);
-	if (rc)
-		return rc;
-
-	/* Reservation for the destination VMA succeeded. */
-	vm_flags_set(dst_vma, VM_PAT);
-	*pfn = PHYS_PFN(paddr);
-	return 0;
-}
-
-void untrack_pfn_copy(struct vm_area_struct *dst_vma, unsigned long pfn)
-{
-	untrack_pfn(dst_vma, pfn, dst_vma->vm_end - dst_vma->vm_start, true);
-	/*
-	 * Reservation was freed, any copied page tables will get cleaned
-	 * up later, but without getting PAT involved again.
-	 */
-}
-
-/*
- * prot is passed in as a parameter for the new mapping. If the vma has
- * a linear pfn mapping for the entire range, or no vma is provided,
- * reserve the entire pfn + size range with single reserve_pfn_range
- * call.
- */
-int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot,
-		    unsigned long pfn, unsigned long addr, unsigned long size)
+int pfnmap_sanitize(unsigned long pfn, unsigned long size, pgprot_t *prot)
  {
  	resource_size_t paddr = (resource_size_t)pfn << PAGE_SHIFT;
  	enum page_cache_mode pcm;
  
-	/* reserve the whole chunk starting from paddr */
-	if (!vma || (addr == vma->vm_start
-				&& size == (vma->vm_end - vma->vm_start))) {
-		int ret;
-
-		ret = reserve_pfn_range(paddr, size, prot, 0);
-		if (ret == 0 && vma)
-			vm_flags_set(vma, VM_PAT);
-		return ret;
-	}
-
  	if (!pat_enabled())
  		return 0;
  
-	/*
-	 * For anything smaller than the vma size we set prot based on the
-	 * lookup.
-	 */
  	pcm = lookup_memtype(paddr);
  
  	/* Check memtype for the remaining pages */
@@ -1066,51 +956,18 @@ int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot,
  	return 0;
  }
  
-void track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot, pfn_t pfn)
+int pfnmap_track(unsigned long pfn, unsigned long size, pgprot_t *prot)
  {
-	enum page_cache_mode pcm;
-
-	if (!pat_enabled())
-		return;
+	const resource_size_t paddr = (resource_size_t)pfn << PAGE_SHIFT;
  
-	/* Set prot based on lookup */
-	pcm = lookup_memtype(pfn_t_to_phys(pfn));
-	*prot = __pgprot((pgprot_val(*prot) & (~_PAGE_CACHE_MASK)) |
-			 cachemode2protval(pcm));
+	return reserve_pfn_range(paddr, size, prot, 0);
  }
  
-/*
- * untrack_pfn is called while unmapping a pfnmap for a region.
- * untrack can be called for a specific region indicated by pfn and size or
- * can be for the entire vma (in which case pfn, size are zero).
- */
-void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn,
-		 unsigned long size, bool mm_wr_locked)
+void pfnmap_untrack(unsigned long pfn, unsigned long size)
  {
-	resource_size_t paddr;
+	const resource_size_t paddr = (resource_size_t)pfn << PAGE_SHIFT;
  
-	if (vma && !(vma->vm_flags & VM_PAT))
-		return;
-
-	/* free the chunk starting from pfn or the whole chunk */
-	paddr = (resource_size_t)pfn << PAGE_SHIFT;
-	if (!paddr && !size) {
-		if (get_pat_info(vma, &paddr, NULL))
-			return;
-		size = vma->vm_end - vma->vm_start;
-	}
  	free_pfn_range(paddr, size);
-	if (vma) {
-		if (mm_wr_locked)
-			vm_flags_clear(vma, VM_PAT);
-		else
-			__vm_flags_mod(vma, 0, VM_PAT);
-	}
-}
-
-void untrack_pfn_clear(struct vm_area_struct *vma)
-{
-	vm_flags_clear(vma, VM_PAT);
  }
  
  pgprot_t pgprot_writecombine(pgprot_t prot)
diff --git a/drivers/gpu/drm/i915/i915_mm.c b/drivers/gpu/drm/i915/i915_mm.c
index 76e2801619f09..c33bd3d830699 100644
--- a/drivers/gpu/drm/i915/i915_mm.c
+++ b/drivers/gpu/drm/i915/i915_mm.c
@@ -100,7 +100,7 @@ int remap_io_mapping(struct vm_area_struct *vma,
  
  	GEM_BUG_ON((vma->vm_flags & EXPECTED_FLAGS) != EXPECTED_FLAGS);
  
-	/* We rely on prevalidation of the io-mapping to skip track_pfn(). */
+	/* We rely on prevalidation of the io-mapping to skip pfnmap tracking. */
  	r.mm = vma->vm_mm;
  	r.pfn = pfn;
  	r.prot = __pgprot((pgprot_val(iomap->prot) & _PAGE_CACHE_MASK) |
@@ -140,7 +140,7 @@ int remap_io_sg(struct vm_area_struct *vma,
  	};
  	int err;
  
-	/* We rely on prevalidation of the io-mapping to skip track_pfn(). */
+	/* We rely on prevalidation of the io-mapping to skip pfnmap tracking. */
  	GEM_BUG_ON((vma->vm_flags & EXPECTED_FLAGS) != EXPECTED_FLAGS);
  
  	while (offset >= r.sgt.max >> PAGE_SHIFT) {
diff --git a/include/linux/mm.h b/include/linux/mm.h
index bf55206935c46..1dc7df6ff38e9 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -356,9 +356,7 @@ extern unsigned int kobjsize(const void *objp);
  # define VM_SHADOW_STACK	VM_NONE
  #endif
  
-#if defined(CONFIG_X86)
-# define VM_PAT		VM_ARCH_1	/* PAT reserves whole VMA at once (x86) */
-#elif defined(CONFIG_PPC64)
+#if defined(CONFIG_PPC64)
  # define VM_SAO		VM_ARCH_1	/* Strong Access Ordering (powerpc) */
  #elif defined(CONFIG_PARISC)
  # define VM_GROWSUP	VM_ARCH_1
diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
index f9157a0c42a5c..89b518ff097e6 100644
--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -447,6 +447,8 @@ static inline bool anon_vma_name_eq(struct anon_vma_name *anon_name1,
  
  #endif  /* CONFIG_ANON_VMA_NAME */
  
+void pfnmap_track_ctx_release(struct kref *ref);
+
  static inline void init_tlb_flush_pending(struct mm_struct *mm)
  {
  	atomic_set(&mm->tlb_flush_pending, 0);
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 56d07edd01f91..91124761cfda8 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -764,6 +764,14 @@ struct vma_numab_state {
  	int prev_scan_seq;
  };
  
+#ifdef __HAVE_PFNMAP_TRACKING
+struct pfnmap_track_ctx {
+	struct kref kref;
+	unsigned long pfn;
+	unsigned long size;
+};
+#endif
+
  /*
   * This struct describes a virtual memory area. There is one of these
   * per VM-area/task. A VM area is any part of the process virtual memory
@@ -877,6 +885,9 @@ struct vm_area_struct {
  	struct anon_vma_name *anon_name;
  #endif
  	struct vm_userfaultfd_ctx vm_userfaultfd_ctx;
+#ifdef __HAVE_PFNMAP_TRACKING
+	struct pfnmap_track_ctx *pfnmap_track_ctx;
+#endif
  } __randomize_layout;
  
  #ifdef CONFIG_NUMA
diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index b50447ef1c921..941ef982e1b61 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -1489,82 +1489,25 @@ static inline pmd_t pmd_swp_clear_soft_dirty(pmd_t pmd)
   * vmf_insert_pfn.
   */
  
-/*
- * track_pfn_remap is called when a _new_ pfn mapping is being established
- * by remap_pfn_range() for physical range indicated by pfn and size.
- */
-static inline int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot,
-				  unsigned long pfn, unsigned long addr,
-				  unsigned long size)
+/* Cannot fail if size <= PAGE_SIZE. */
+static inline int pfnmap_sanitize(unsigned long pfn, unsigned long size, pgprot_t *prot)
  {
  	return 0;
  }
  
-/*
- * track_pfn_insert is called when a _new_ single pfn is established
- * by vmf_insert_pfn().
- */
-static inline void track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot,
-				    pfn_t pfn)
-{
-}
-
-/*
- * track_pfn_copy is called when a VM_PFNMAP VMA is about to get the page
- * tables copied during copy_page_range(). Will store the pfn to be
- * passed to untrack_pfn_copy() only if there is something to be untracked.
- * Callers should initialize the pfn to 0.
- */
-static inline int track_pfn_copy(struct vm_area_struct *dst_vma,
-		struct vm_area_struct *src_vma, unsigned long *pfn)
+static inline int pfnmap_track(unsigned long pfn, unsigned long size, pgprot_t *prot)
  {
  	return 0;
  }
  
-/*
- * untrack_pfn_copy is called when a VM_PFNMAP VMA failed to copy during
- * copy_page_range(), but after track_pfn_copy() was already called. Can
- * be called even if track_pfn_copy() did not actually track anything:
- * handled internally.
- */
-static inline void untrack_pfn_copy(struct vm_area_struct *dst_vma,
-		unsigned long pfn)
+static inline void pfnmap_untrack(unsigned long pfn, unsigned long size)
  {
  }
  
-/*
- * untrack_pfn is called while unmapping a pfnmap for a region.
- * untrack can be called for a specific region indicated by pfn and size or
- * can be for the entire vma (in which case pfn, size are zero).
- */
-static inline void untrack_pfn(struct vm_area_struct *vma,
-			       unsigned long pfn, unsigned long size,
-			       bool mm_wr_locked)
-{
-}
-
-/*
- * untrack_pfn_clear is called in the following cases on a VM_PFNMAP VMA:
- *
- * 1) During mremap() on the src VMA after the page tables were moved.
- * 2) During fork() on the dst VMA, immediately after duplicating the src VMA.
- */
-static inline void untrack_pfn_clear(struct vm_area_struct *vma)
-{
-}
  #else
-extern int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot,
-			   unsigned long pfn, unsigned long addr,
-			   unsigned long size);
-extern void track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot,
-			     pfn_t pfn);
-extern int track_pfn_copy(struct vm_area_struct *dst_vma,
-		struct vm_area_struct *src_vma, unsigned long *pfn);
-extern void untrack_pfn_copy(struct vm_area_struct *dst_vma,
-		unsigned long pfn);
-extern void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn,
-			unsigned long size, bool mm_wr_locked);
-extern void untrack_pfn_clear(struct vm_area_struct *vma);
+int pfnmap_sanitize(unsigned long pfn, unsigned long size, pgprot_t *prot);
+int pfnmap_track(unsigned long pfn, unsigned long size, pgprot_t *prot);
+void pfnmap_untrack(unsigned long pfn, unsigned long size);
  #endif
  
  #ifdef CONFIG_MMU
diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h
index 15aae955a10bf..aa441f593e9a6 100644
--- a/include/trace/events/mmflags.h
+++ b/include/trace/events/mmflags.h
@@ -172,9 +172,7 @@ IF_HAVE_PG_ARCH_3(arch_3)
  	__def_pageflag_names						\
  	) : "none"
  
-#if defined(CONFIG_X86)
-#define __VM_ARCH_SPECIFIC_1 {VM_PAT,     "pat"           }
-#elif defined(CONFIG_PPC64)
+#if defined(CONFIG_PPC64)
  #define __VM_ARCH_SPECIFIC_1 {VM_SAO,     "sao"           }
  #elif defined(CONFIG_PARISC)
  #define __VM_ARCH_SPECIFIC_1 {VM_GROWSUP,	"growsup"	}
diff --git a/kernel/fork.c b/kernel/fork.c
index c4b26cd8998b8..a6c54dde5f05c 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -481,7 +481,50 @@ static void vm_area_init_from(const struct vm_area_struct *src,
  #ifdef CONFIG_NUMA
  	dest->vm_policy = src->vm_policy;
  #endif
+#ifdef __HAVE_PFNMAP_TRACKING
+	dest->pfnmap_track_ctx = NULL;
+#endif
+}
+
+#ifdef __HAVE_PFNMAP_TRACKING
+static inline int vma_pfnmap_track_ctx_dup(struct vm_area_struct *orig,
+		struct vm_area_struct *new)
+{
+	struct pfnmap_track_ctx *ctx = orig->pfnmap_track_ctx;
+
+	if (likely(!ctx))
+		return 0;
+
+	/*
+	 * We don't expect to ever hit this. If ever required, we would have
+	 * to duplicate the tracking.
+	 */
+	if (unlikely(kref_read(&ctx->kref) >= REFCOUNT_MAX))
+		return -ENOMEM;
+	kref_get(&ctx->kref);
+	new->pfnmap_track_ctx = ctx;
+	return 0;
+}
+
+static inline void vma_pfnmap_track_ctx_release(struct vm_area_struct *vma)
+{
+	struct pfnmap_track_ctx *ctx = vma->pfnmap_track_ctx;
+
+	if (likely(!ctx))
+		return;
+
+	kref_put(&ctx->kref, pfnmap_track_ctx_release);
+	vma->pfnmap_track_ctx = NULL;
+}
+#else
+static inline int vma_pfnmap_track_ctx_dup(struct vm_area_struct *orig,
+		struct vm_area_struct *new)
+{
+	return 0;
+}
+static inline void vma_pfnmap_track_ctx_release(struct vm_area_struct *vma)
+{
+}
+#endif
  
  struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
  {
@@ -493,15 +536,15 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
  	ASSERT_EXCLUSIVE_WRITER(orig->vm_flags);
  	ASSERT_EXCLUSIVE_WRITER(orig->vm_file);
  	vm_area_init_from(orig, new);
+
+	if (vma_pfnmap_track_ctx_dup(orig, new)) {
+		kmem_cache_free(vm_area_cachep, new);
+		return NULL;
+	}
  	vma_lock_init(new, true);
  	INIT_LIST_HEAD(&new->anon_vma_chain);
  	vma_numab_state_init(new);
  	dup_anon_vma_name(orig, new);
-
-	/* track_pfn_copy() will later take care of copying internal state. */
-	if (unlikely(new->vm_flags & VM_PFNMAP))
-		untrack_pfn_clear(new);
-
  	return new;
  }
  
@@ -511,6 +554,7 @@ void vm_area_free(struct vm_area_struct *vma)
  	vma_assert_detached(vma);
  	vma_numab_state_free(vma);
  	free_anon_vma_name(vma);
+	vma_pfnmap_track_ctx_release(vma);
  	kmem_cache_free(vm_area_cachep, vma);
  }
  
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 2a47682d1ab77..9ad6a0a8f0089 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1456,7 +1456,9 @@ vm_fault_t vmf_insert_pfn_pmd(struct vm_fault *vmf, pfn_t pfn, bool write)
  			return VM_FAULT_OOM;
  	}
  
-	track_pfn_insert(vma, &pgprot, pfn);
+	/* TODO: we should check the whole range and handle errors. */
+	pfnmap_sanitize(pfn_t_to_pfn(pfn), PAGE_SIZE, &pgprot);
+
  	ptl = pmd_lock(vma->vm_mm, vmf->pmd);
  	error = insert_pfn_pmd(vma, addr, vmf->pmd, pfn, pgprot, write,
  			pgtable);
@@ -1578,7 +1580,8 @@ vm_fault_t vmf_insert_pfn_pud(struct vm_fault *vmf, pfn_t pfn, bool write)
  	if (addr < vma->vm_start || addr >= vma->vm_end)
  		return VM_FAULT_SIGBUS;
  
-	track_pfn_insert(vma, &pgprot, pfn);
+	/* TODO: we should check the whole range and handle errors. */
+	pfnmap_sanitize(pfn_t_to_pfn(pfn), PAGE_SIZE, &pgprot);
  
  	ptl = pud_lock(vma->vm_mm, vmf->pud);
  	insert_pfn_pud(vma, addr, vmf->pud, pfn, write);
diff --git a/mm/io-mapping.c b/mm/io-mapping.c
index 01b3627999304..7266441ad0834 100644
--- a/mm/io-mapping.c
+++ b/mm/io-mapping.c
@@ -21,7 +21,7 @@ int io_mapping_map_user(struct io_mapping *iomap, struct vm_area_struct *vma,
  	if (WARN_ON_ONCE((vma->vm_flags & expected_flags) != expected_flags))
  		return -EINVAL;
  
-	/* We rely on prevalidation of the io-mapping to skip track_pfn(). */
+	/* We rely on prevalidation of the io-mapping to skip pfnmap tracking. */
  	return remap_pfn_range_notrack(vma, addr, pfn, size,
  		__pgprot((pgprot_val(iomap->prot) & _PAGE_CACHE_MASK) |
  			 (pgprot_val(vma->vm_page_prot) & ~_PAGE_CACHE_MASK)));
diff --git a/mm/memory.c b/mm/memory.c
index ba3ea0a82f7f7..fdbba7261af4d 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1361,7 +1361,7 @@ copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
  	struct mm_struct *dst_mm = dst_vma->vm_mm;
  	struct mm_struct *src_mm = src_vma->vm_mm;
  	struct mmu_notifier_range range;
-	unsigned long next, pfn = 0;
+	unsigned long next;
  	bool is_cow;
  	int ret;
  
@@ -1371,12 +1371,6 @@ copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
  	if (is_vm_hugetlb_page(src_vma))
  		return copy_hugetlb_page_range(dst_mm, src_mm, dst_vma, src_vma);
  
-	if (unlikely(src_vma->vm_flags & VM_PFNMAP)) {
-		ret = track_pfn_copy(dst_vma, src_vma, &pfn);
-		if (ret)
-			return ret;
-	}
-
  	/*
  	 * We need to invalidate the secondary MMU mappings only when
  	 * there could be a permission downgrade on the ptes of the
@@ -1418,8 +1412,6 @@ copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
  		raw_write_seqcount_end(&src_mm->write_protect_seq);
  		mmu_notifier_invalidate_range_end(&range);
  	}
-	if (ret && unlikely(src_vma->vm_flags & VM_PFNMAP))
-		untrack_pfn_copy(dst_vma, pfn);
  	return ret;
  }
  
@@ -1914,9 +1906,6 @@ static void unmap_single_vma(struct mmu_gather *tlb,
  	if (vma->vm_file)
  		uprobe_munmap(vma, start, end);
  
-	if (unlikely(vma->vm_flags & VM_PFNMAP))
-		untrack_pfn(vma, 0, 0, mm_wr_locked);
-
  	if (start != end) {
  		if (unlikely(is_vm_hugetlb_page(vma))) {
  			/*
@@ -2525,7 +2514,7 @@ vm_fault_t vmf_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr,
  	if (!pfn_modify_allowed(pfn, pgprot))
  		return VM_FAULT_SIGBUS;
  
-	track_pfn_insert(vma, &pgprot, __pfn_to_pfn_t(pfn, PFN_DEV));
+	pfnmap_sanitize(pfn, PAGE_SIZE, &pgprot);
  
  	return insert_pfn(vma, addr, __pfn_to_pfn_t(pfn, PFN_DEV), pgprot,
  			false);
@@ -2588,7 +2577,7 @@ static vm_fault_t __vm_insert_mixed(struct vm_area_struct *vma,
  	if (addr < vma->vm_start || addr >= vma->vm_end)
  		return VM_FAULT_SIGBUS;
  
-	track_pfn_insert(vma, &pgprot, pfn);
+	pfnmap_sanitize(pfn_t_to_pfn(pfn), PAGE_SIZE, &pgprot);
  
  	if (!pfn_modify_allowed(pfn_t_to_pfn(pfn), pgprot))
  		return VM_FAULT_SIGBUS;
@@ -2833,6 +2822,36 @@ int remap_pfn_range_notrack(struct vm_area_struct *vma, unsigned long addr,
  	return error;
  }
  
+#ifdef __HAVE_PFNMAP_TRACKING
+static inline struct pfnmap_track_ctx *pfnmap_track_ctx_alloc(unsigned long pfn,
+		unsigned long size, pgprot_t *prot)
+{
+	struct pfnmap_track_ctx *ctx;
+
+	if (pfnmap_track(pfn, size, prot))
+		return ERR_PTR(-EINVAL);
+
+	ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
+	if (unlikely(!ctx)) {
+		pfnmap_untrack(pfn, size);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	ctx->pfn = pfn;
+	ctx->size = size;
+	kref_init(&ctx->kref);
+	return ctx;
+}
+
+void pfnmap_track_ctx_release(struct kref *ref)
+{
+	struct pfnmap_track_ctx *ctx = container_of(ref, struct pfnmap_track_ctx, kref);
+
+	pfnmap_untrack(ctx->pfn, ctx->size);
+	kfree(ctx);
+}
+#endif /* __HAVE_PFNMAP_TRACKING */
+
  /**
   * remap_pfn_range - remap kernel memory to userspace
   * @vma: user vma to map to
@@ -2845,20 +2864,54 @@ int remap_pfn_range_notrack(struct vm_area_struct *vma, unsigned long addr,
   *
   * Return: %0 on success, negative error code otherwise.
   */
+#ifdef __HAVE_PFNMAP_TRACKING
  int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
  		    unsigned long pfn, unsigned long size, pgprot_t prot)
  {
+	struct pfnmap_track_ctx *ctx = NULL;
  	int err;
  
-	err = track_pfn_remap(vma, &prot, pfn, addr, PAGE_ALIGN(size));
-	if (err)
-		return -EINVAL;
+	size = PAGE_ALIGN(size);
+
+	/*
+	 * If we cover the full VMA, we'll perform actual tracking, and
+	 * remember to untrack when the last reference to our tracking
+	 * context from a VMA goes away.
+	 *
+	 * If we only cover parts of the VMA, we'll only look up the prot
+	 * we can use without tracking.
+	 */
+	if (addr == vma->vm_start && addr + size == vma->vm_end) {
+		if (vma->pfnmap_track_ctx)
+			return -EINVAL;
+		ctx = pfnmap_track_ctx_alloc(pfn, size, &prot);
+		if (IS_ERR(ctx))
+			return PTR_ERR(ctx);
+	} else {
+		err = pfnmap_sanitize(pfn, size, &prot);
+		if (err)
+			return -EINVAL;
+	}
  
  	err = remap_pfn_range_notrack(vma, addr, pfn, size, prot);
-	if (err)
-		untrack_pfn(vma, pfn, PAGE_ALIGN(size), true);
-	return err;
+	if (err) {
+		if (ctx)
+			kref_put(&ctx->kref, pfnmap_track_ctx_release);
+		return err;
+	}
+
+	if (ctx)
+		vma->pfnmap_track_ctx = ctx;
+	return 0;
+}
+
+#else
+int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
+		    unsigned long pfn, unsigned long size, pgprot_t prot)
+{
+	return remap_pfn_range_notrack(vma, addr, pfn, size, prot);
  }
+#endif
  EXPORT_SYMBOL(remap_pfn_range);
  
  /**
diff --git a/mm/memremap.c b/mm/memremap.c
index 2aebc1b192da9..c417c843e9b1f 100644
--- a/mm/memremap.c
+++ b/mm/memremap.c
@@ -130,7 +130,7 @@ static void pageunmap_range(struct dev_pagemap *pgmap, int range_id)
  	}
  	mem_hotplug_done();
  
-	untrack_pfn(NULL, PHYS_PFN(range->start), range_len(range), true);
+	pfnmap_untrack(PHYS_PFN(range->start), range_len(range));
  	pgmap_array_delete(range);
  }
  
@@ -211,8 +211,8 @@ static int pagemap_range(struct dev_pagemap *pgmap, struct mhp_params *params,
  	if (nid < 0)
  		nid = numa_mem_id();
  
-	error = track_pfn_remap(NULL, &params->pgprot, PHYS_PFN(range->start), 0,
-			range_len(range));
+	error = pfnmap_track(PHYS_PFN(range->start), range_len(range),
+			     &params->pgprot);
  	if (error)
  		goto err_pfn_remap;
  
@@ -277,7 +277,7 @@ static int pagemap_range(struct dev_pagemap *pgmap, struct mhp_params *params,
  	if (!is_private)
  		kasan_remove_zero_shadow(__va(range->start), range_len(range));
  err_kasan:
-	untrack_pfn(NULL, PHYS_PFN(range->start), range_len(range), true);
+	pfnmap_untrack(PHYS_PFN(range->start), range_len(range));
  err_pfn_remap:
  	pgmap_array_delete(range);
  	return error;
diff --git a/mm/mremap.c b/mm/mremap.c
index 7db9da609c84f..6e78e02f74bd3 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -1191,10 +1191,6 @@ static int copy_vma_and_data(struct vma_remap_struct *vrm,
  	if (is_vm_hugetlb_page(vma))
  		clear_vma_resv_huge_pages(vma);
  
-	/* Tell pfnmap has moved from this vma */
-	if (unlikely(vma->vm_flags & VM_PFNMAP))
-		untrack_pfn_clear(vma);
-
  	*new_vma_ptr = new_vma;
  	return err;
  }
-- 
2.49.0


-- 
Cheers,

David / dhildenb

