Message-Id: <20250723161827.15802-1-dev.jain@arm.com>
Date: Wed, 23 Jul 2025 21:48:27 +0530
From: Dev Jain <dev.jain@....com>
To: catalin.marinas@....com,
	will@...nel.org
Cc: anshuman.khandual@....com,
	quic_zhenhuah@...cinc.com,
	ryan.roberts@....com,
	kevin.brodsky@....com,
	yangyicong@...ilicon.com,
	joey.gouly@....com,
	linux-arm-kernel@...ts.infradead.org,
	linux-kernel@...r.kernel.org,
	david@...hat.com,
	mark.rutland@....com,
	urezki@...il.com,
	Dev Jain <dev.jain@....com>
Subject: [RESEND PATCH v5] arm64: Enable vmalloc-huge with ptdump

Our goal is to move towards enabling vmalloc-huge by default on arm64 so
as to reduce TLB pressure. Therefore, we need a way to analyze the
proportion of block mappings we actually get in vmalloc space on a
production system; this can be done through ptdump, but currently we
disable vmalloc-huge when CONFIG_PTDUMP_DEBUGFS is on. The reason is that
lazy freeing of kernel pagetables via vmap_try_huge_pxd() may race with
ptdump, so ptdump may dereference a bogus address.

To solve this, we need to synchronize ptdump_walk_pgd() with
pud_free_pmd_page() and pmd_free_pte_page().

Since this race is very unlikely to happen in practice, we do not want to
penalize other code paths taking the init_mm mmap_lock. Therefore, we use
static keys: ptdump enables the static key, and upon observing it, the
vmalloc table-freeing path executes an mmap_read_lock/unlock sequence
patched in via the static branch. A code comment explains in detail how
the combination of the acquire semantics of static_branch_enable() and the
barriers in __flush_tlb_kernel_pgtable() ensures that ptdump will never
get hold of the address of a freed PMD or PTE table.

For an (almost) rigorous proof of correctness, one may look at:
https://lore.kernel.org/all/e1e87f16-1c48-481b-8f7c-9333ac5d13e7@arm.com/

Reviewed-by: Ryan Roberts <ryan.roberts@....com>
Signed-off-by: Dev Jain <dev.jain@....com>
---
Rebased on 6.16-rc7.

mm selftests pass. No issues were observed while running test_vmalloc.sh
(which stresses the vmalloc subsystem) in parallel with dumping the kernel
pagetables through sysfs in a loop.
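
As an aside, for anyone who wants to experiment with the ordering outside
the kernel, below is a rough userspace model of the scheme (illustration
only, not part of the patch): an atomic flag plus seq_cst fences stand in
for the static key and for the ordering provided by static_branch_enable()
and __flush_tlb_kernel_pgtable(), and a pthread rwlock stands in for the
init_mm mmap_lock.

/* Illustration only: userspace model of the ptdump vs. table-freeing
 * synchronization. Build with: gcc -O2 -pthread model.c
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static atomic_int key;			/* models arm64_ptdump_lock_key    */
static atomic_int pud = 1;		/* 1: PMD table linked, 0: cleared */
static pthread_rwlock_t mmap_lock = PTHREAD_RWLOCK_INITIALIZER;

/* Models arm64_ptdump_walk_pgd() -> ptdump_walk_pgd() (thread T1). */
static void *walker(void *arg)
{
	atomic_store(&key, 1);				/* static_branch_enable()  */
	atomic_thread_fence(memory_order_seq_cst);
	pthread_rwlock_wrlock(&mmap_lock);		/* mmap_write_lock()       */
	if (atomic_load(&pud))				/* walk sees a live table  */
		printf("walker: PMD table still reachable\n");
	pthread_rwlock_unlock(&mmap_lock);		/* mmap_write_unlock()     */
	atomic_store(&key, 0);				/* static_branch_disable() */
	return NULL;
}

/* Models pud_free_pmd_page() (thread T2). */
static void *freer(void *arg)
{
	atomic_store(&pud, 0);				/* pud_clear()                  */
	atomic_thread_fence(memory_order_seq_cst);	/* __flush_tlb_kernel_pgtable() */
	if (atomic_load(&key)) {			/* static_branch_unlikely()     */
		pthread_rwlock_rdlock(&mmap_lock);	/* wait out any walker in CS    */
		pthread_rwlock_unlock(&mmap_lock);
	}
	/* Safe to free the PMD table here: any walker has either left its
	 * critical section or will observe the cleared "pud".
	 */
	return NULL;
}

int main(void)
{
	pthread_t t1, t2;

	pthread_create(&t1, NULL, walker, NULL);
	pthread_create(&t2, NULL, freer, NULL);
	pthread_join(t1, NULL);
	pthread_join(t2, NULL);
	return 0;
}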

v4->v5:
 - The arch_vmap_pxd_supported() changes were dropped by mistake in between
   the revisions, add them back (Anshuman)
 - Rewrite cover letter, drop stray change, add arm64_ptdump_walk_pgd()
   helper, rename ptdump_lock_key -> arm64_ptdump_lock_key, add comment
   above __pmd_free_pte_page() to explain when the lock won't
   be taken (Anshuman)
 - Rewrite the comment explaining the synchronization logic (Catalin)

v3->v4:
 - Lock-unlock immediately
 - Simplify includes

v2->v3:
 - Use static key mechanism

v1->v2:
 - Take lock only when CONFIG_PTDUMP_DEBUGFS is on
 - In case of pud_free_pmd_page(), isolate the PMD table to avoid taking
   the lock 512 times again via pmd_free_pte_page()
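
The Case 2 argument in the comment added to pud_free_pmd_page() below is
essentially the store-buffering pattern. Purely as an illustration (again,
not part of the patch), it can be written as a tools/memory-model style
litmus test, with smp_mb() standing in for the heavier ordering actually
provided on each side; the "exists" outcome (ptdump sees a live PUD even
though the freeing path missed the key) is forbidden:

C arm64-ptdump-pud-free

(*
 * P0 models ptdump (T1): publish the static key, then read the PUD
 * during the walk. P1 models pud_free_pmd_page() (T2): clear the PUD,
 * then read the static key to decide whether to take the mmap_lock.
 *)

{
	pud=1;
	key=0;
}

P0(int *pud, int *key)
{
	int r0;

	WRITE_ONCE(*key, 1);
	smp_mb();
	r0 = READ_ONCE(*pud);
}

P1(int *pud, int *key)
{
	int r1;

	WRITE_ONCE(*pud, 0);
	smp_mb();
	r1 = READ_ONCE(*key);
}

exists (0:r0=1 /\ 1:r1=0)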

---
 arch/arm64/include/asm/ptdump.h  |  2 +
 arch/arm64/include/asm/vmalloc.h |  6 +--
 arch/arm64/mm/mmu.c              | 86 ++++++++++++++++++++++++++++++--
 arch/arm64/mm/ptdump.c           | 11 +++-
 4 files changed, 95 insertions(+), 10 deletions(-)

diff --git a/arch/arm64/include/asm/ptdump.h b/arch/arm64/include/asm/ptdump.h
index fded5358641f..baff24004459 100644
--- a/arch/arm64/include/asm/ptdump.h
+++ b/arch/arm64/include/asm/ptdump.h
@@ -7,6 +7,8 @@
 
 #include <linux/ptdump.h>
 
+DECLARE_STATIC_KEY_FALSE(arm64_ptdump_lock_key);
+
 #ifdef CONFIG_PTDUMP
 
 #include <linux/mm_types.h>
diff --git a/arch/arm64/include/asm/vmalloc.h b/arch/arm64/include/asm/vmalloc.h
index 12f534e8f3ed..e835fd437ae0 100644
--- a/arch/arm64/include/asm/vmalloc.h
+++ b/arch/arm64/include/asm/vmalloc.h
@@ -12,15 +12,13 @@ static inline bool arch_vmap_pud_supported(pgprot_t prot)
 	/*
 	 * SW table walks can't handle removal of intermediate entries.
 	 */
-	return pud_sect_supported() &&
-	       !IS_ENABLED(CONFIG_PTDUMP_DEBUGFS);
+	return pud_sect_supported();
 }
 
 #define arch_vmap_pmd_supported arch_vmap_pmd_supported
 static inline bool arch_vmap_pmd_supported(pgprot_t prot)
 {
-	/* See arch_vmap_pud_supported() */
-	return !IS_ENABLED(CONFIG_PTDUMP_DEBUGFS);
+	return true;
 }
 
 #define arch_vmap_pte_range_map_size arch_vmap_pte_range_map_size
diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index 00ab1d648db6..49932c1dd34e 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -46,6 +46,8 @@
 #define NO_CONT_MAPPINGS	BIT(1)
 #define NO_EXEC_MAPPINGS	BIT(2)	/* assumes FEAT_HPDS is not used */
 
+DEFINE_STATIC_KEY_FALSE(arm64_ptdump_lock_key);
+
 enum pgtable_type {
 	TABLE_PTE,
 	TABLE_PMD,
@@ -1267,7 +1269,12 @@ int pmd_clear_huge(pmd_t *pmdp)
 	return 1;
 }
 
-int pmd_free_pte_page(pmd_t *pmdp, unsigned long addr)
+/*
+ * If the PMD table has been isolated via pud_free_pmd_page(), ptdump
+ * cannot get hold of it, so there is no need to serialize with the
+ * mmap_lock; lock is passed as false here. Otherwise, lock is true.
+ */
+static int __pmd_free_pte_page(pmd_t *pmdp, unsigned long addr, bool lock)
 {
 	pte_t *table;
 	pmd_t pmd;
@@ -1279,13 +1286,24 @@ int pmd_free_pte_page(pmd_t *pmdp, unsigned long addr)
 		return 1;
 	}
 
+	/* See comment in pud_free_pmd_page for static key logic */
 	table = pte_offset_kernel(pmdp, addr);
 	pmd_clear(pmdp);
 	__flush_tlb_kernel_pgtable(addr);
+	if (static_branch_unlikely(&arm64_ptdump_lock_key) && lock) {
+		mmap_read_lock(&init_mm);
+		mmap_read_unlock(&init_mm);
+	}
+
 	pte_free_kernel(NULL, table);
 	return 1;
 }
 
+int pmd_free_pte_page(pmd_t *pmdp, unsigned long addr)
+{
+	return __pmd_free_pte_page(pmdp, addr, true);
+}
+
 int pud_free_pmd_page(pud_t *pudp, unsigned long addr)
 {
 	pmd_t *table;
@@ -1301,16 +1319,76 @@ int pud_free_pmd_page(pud_t *pudp, unsigned long addr)
 	}
 
 	table = pmd_offset(pudp, addr);
+
+	/*
+	 * Our objective is to prevent ptdump from reading a PMD table which has
+	 * been freed.  Assume that ptdump_walk_pgd() (call this thread T1)
+	 * executes completely on CPU1 and pud_free_pmd_page() (call this thread
+	 * T2) executes completely on CPU2. Let the region sandwiched by the
+	 * mmap_write_lock/unlock in T1 be called CS (the critical section).
+	 *
+	 * Claim: The CS of T1 will never operate on a freed PMD table.
+	 *
+	 * Proof:
+	 *
+	 * Case 1: The static branch is visible to T2.
+	 *
+	 * Case 1 (a): T1 acquires the lock before T2 can.
+	 * T2 will block until T1 drops the lock, so pmd_free() will only be
+	 * executed after T1 exits CS.
+	 *
+	 * Case 1 (b): T2 acquires the lock before T1 can.
+	 * The sequence of barriers issued in __flush_tlb_kernel_pgtable()
+	 * ensures that an empty PUD (via pud_clear()) is visible to T1 before
+	 * T1 can enter CS, therefore it is impossible for the CS to get hold
+	 * of the address of the isolated PMD table.
+	 *
+	 * Case 2: The static branch is not visible to T2.
+	 *
+	 * Since static_branch_enable() (via dmb(ish)) and mmap_write_lock()
+	 * have acquire semantics, it is guaranteed that the static branch
+	 * will be visible to all CPUs before T1 can enter CS. The static
+	 * branch not being visible to T2 therefore guarantees that T1 has
+	 * not yet entered CS .... (i)
+	 * The sequence of barriers via __flush_tlb_kernel_pgtable() in T2
+	 * implies that if the invisibility of the static branch has been
+	 * observed by T2 (i.e. static_branch_unlikely() is observed as false),
+	 * then all CPUs will have observed an empty PUD ... (ii)
+	 * Combining (i) and (ii), we conclude that T1 observes an empty PUD
+	 * before entering CS => it is impossible for the CS to get hold of
+	 * the address of the isolated PMD table. Q.E.D.
+	 *
+	 * We have proven that the claim is true on the assumption that
+	 * there is no context switch for T1 and T2. Note that the reasoning
+	 * of the proof uses barriers operating on the inner shareable domain,
+	 * which means that they will affect all CPUs, and also a context switch
+	 * will insert extra barriers into the code paths => the claim will
+	 * stand true even if we drop the assumption.
+	 *
+	 * It is also worth reasoning whether something can go wrong via
+	 * pud_free_pmd_page() -> __pmd_free_pte_page(), since the latter
+	 * will be called locklessly on this code path.
+	 *
+	 * For Case 1 (a), T2 will block until CS is finished, so we are safe.
+	 * For Case 1 (b) and Case 2, the PMD table will be isolated before
+	 * T1 can enter CS, therefore it is safe for T2 to operate on the
+	 * PMD table locklessly.
+	 */
+	pud_clear(pudp);
+	__flush_tlb_kernel_pgtable(addr);
+	if (static_branch_unlikely(&arm64_ptdump_lock_key)) {
+		mmap_read_lock(&init_mm);
+		mmap_read_unlock(&init_mm);
+	}
+
 	pmdp = table;
 	next = addr;
 	end = addr + PUD_SIZE;
 	do {
 		if (pmd_present(pmdp_get(pmdp)))
-			pmd_free_pte_page(pmdp, next);
+			__pmd_free_pte_page(pmdp, next, false);
 	} while (pmdp++, next += PMD_SIZE, next != end);
 
-	pud_clear(pudp);
-	__flush_tlb_kernel_pgtable(addr);
 	pmd_free(NULL, table);
 	return 1;
 }
diff --git a/arch/arm64/mm/ptdump.c b/arch/arm64/mm/ptdump.c
index 421a5de806c6..65335c7ba482 100644
--- a/arch/arm64/mm/ptdump.c
+++ b/arch/arm64/mm/ptdump.c
@@ -283,6 +283,13 @@ void note_page_flush(struct ptdump_state *pt_st)
 	note_page(pt_st, 0, -1, pte_val(pte_zero));
 }
 
+static void arm64_ptdump_walk_pgd(struct ptdump_state *st, struct mm_struct *mm)
+{
+	static_branch_enable(&arm64_ptdump_lock_key);
+	ptdump_walk_pgd(st, mm, NULL);
+	static_branch_disable(&arm64_ptdump_lock_key);
+}
+
 void ptdump_walk(struct seq_file *s, struct ptdump_info *info)
 {
 	unsigned long end = ~0UL;
@@ -311,7 +318,7 @@ void ptdump_walk(struct seq_file *s, struct ptdump_info *info)
 		}
 	};
 
-	ptdump_walk_pgd(&st.ptdump, info->mm, NULL);
+	arm64_ptdump_walk_pgd(&st.ptdump, info->mm);
 }
 
 static void __init ptdump_initialize(void)
@@ -353,7 +360,7 @@ bool ptdump_check_wx(void)
 		}
 	};
 
-	ptdump_walk_pgd(&st.ptdump, &init_mm, NULL);
+	arm64_ptdump_walk_pgd(&st.ptdump, &init_mm);
 
 	if (st.wx_pages || st.uxn_pages) {
 		pr_warn("Checked W+X mappings: FAILED, %lu W+X pages found, %lu non-UXN pages found\n",
-- 
2.30.2

