Message-ID: <20250612105100.59144-4-p.raghav@samsung.com>
Date: Thu, 12 Jun 2025 12:50:58 +0200
From: Pankaj Raghav <p.raghav@...sung.com>
To: Suren Baghdasaryan <surenb@...gle.com>,
	Ryan Roberts <ryan.roberts@....com>,
	Mike Rapoport <rppt@...nel.org>,
	Michal Hocko <mhocko@...e.com>,
	Thomas Gleixner <tglx@...utronix.de>,
	Nico Pache <npache@...hat.com>,
	Dev Jain <dev.jain@....com>,
	Baolin Wang <baolin.wang@...ux.alibaba.com>,
	Borislav Petkov <bp@...en8.de>,
	Ingo Molnar <mingo@...hat.com>,
	"H . Peter Anvin" <hpa@...or.com>,
	Vlastimil Babka <vbabka@...e.cz>,
	Zi Yan <ziy@...dia.com>,
	Dave Hansen <dave.hansen@...ux.intel.com>,
	David Hildenbrand <david@...hat.com>,
	Lorenzo Stoakes <lorenzo.stoakes@...cle.com>,
	Andrew Morton <akpm@...ux-foundation.org>,
	"Liam R . Howlett" <Liam.Howlett@...cle.com>,
	Jens Axboe <axboe@...nel.dk>
Cc: linux-kernel@...r.kernel.org,
	linux-mm@...ck.org,
	willy@...radead.org,
	x86@...nel.org,
	linux-block@...r.kernel.org,
	linux-fsdevel@...r.kernel.org,
	"Darrick J . Wong" <djwong@...nel.org>,
	mcgrof@...nel.org,
	gost.dev@...sung.com,
	kernel@...kajraghav.com,
	hch@....de,
	Pankaj Raghav <p.raghav@...sung.com>
Subject: [PATCH 3/5] mm: add static PMD zero page

There are many places in the kernel where we need to zero out larger
chunks, but the maximum segment we can zero out at a time with
ZERO_PAGE is limited to PAGE_SIZE.

This is especially annoying in block devices and filesystems, where we
end up attaching multiple ZERO_PAGEs to the bio in different bvecs.
With multi-page bvec support in the block layer, it is much more
efficient to send out a larger zero page as part of a single bvec.

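As a rough sketch (illustrative only, not part of this patch;
bio_add_page()/bio_add_folio() are the existing block layer helpers),
the difference looks like this:

  /* today: one 4K ZERO_PAGE per bvec to zero a PMD-sized range */
  for (i = 0; i < PMD_SIZE / PAGE_SIZE; i++)
          bio_add_page(bio, ZERO_PAGE(0), PAGE_SIZE, 0);

  /* with a PMD-sized zero folio, a single bvec covers the range */
  bio_add_folio(bio, huge_zero_folio, PMD_SIZE, 0);
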
This concern was raised during the review of adding LBS support to
XFS[1][2].

Usually huge_zero_folio is allocated on demand, and it is deallocated
by the shrinker once there are no users of it left.

Add a config option STATIC_PMD_ZERO_PAGE that will always allocate
huge_zero_folio in .bss, so that it is never freed. This makes it
possible to use huge_zero_folio without having to pass an mm struct
around or to call mm_put_huge_zero_folio() in a destructor.

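For comparison, what a caller sees with each variant (a minimal
sketch using the helpers touched by this patch; surrounding code
elided):

  /* dynamic path: refcounted per mm, may fail, needs a put later */
  folio = mm_get_huge_zero_folio(mm);
  ...
  mm_put_huge_zero_folio(mm);

  /* static path: always present in .bss, nothing to release */
  folio = READ_ONCE(huge_zero_folio);
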
As STATIC_PMD_ZERO_PAGE does not depend on THP, declare huge_zero_folio
and huge_zero_pfn outside of the THP ifdef.

For now it can only be enabled on x86_64, but it is an optional
config. We could extend it to more architectures in the future.

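With this series applied, enabling it should only need the following
(the ARCH_HAS_STATIC_PMD_ZERO_PAGE dependency is selected
automatically on x86_64):

  CONFIG_STATIC_PMD_ZERO_PAGE=y
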
[1] https://lore.kernel.org/linux-xfs/20231027051847.GA7885@lst.de/
[2] https://lore.kernel.org/linux-xfs/ZitIK5OnR7ZNY0IG@infradead.org/

Suggested-by: David Hildenbrand <david@...hat.com>
Signed-off-by: Pankaj Raghav <p.raghav@...sung.com>
---
Questions:
- Can we call __split_huge_zero_page_pmd() on the static PMD page?
  (It appears to only rewrite the page table so the range is mapped
  by PTEs of the 4K zero page, without ever freeing the folio itself,
  so it looks safe, but this needs confirming.)

 arch/x86/Kconfig               |  1 +
 arch/x86/include/asm/pgtable.h |  8 ++++++++
 arch/x86/kernel/head_64.S      |  8 ++++++++
 include/linux/mm.h             | 16 +++++++++++++++-
 mm/Kconfig                     | 13 +++++++++++++
 mm/huge_memory.c               | 24 ++++++++++++++++++++----
 mm/memory.c                    | 19 +++++++++++++++++++
 7 files changed, 84 insertions(+), 5 deletions(-)

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 340e5468980e..c3a9d136ec0a 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -153,6 +153,7 @@ config X86
 	select ARCH_WANT_OPTIMIZE_HUGETLB_VMEMMAP	if X86_64
 	select ARCH_WANT_HUGETLB_VMEMMAP_PREINIT if X86_64
 	select ARCH_WANTS_THP_SWAP		if X86_64
+	select ARCH_HAS_STATIC_PMD_ZERO_PAGE	if X86_64
 	select ARCH_HAS_PARANOID_L1D_FLUSH
 	select ARCH_WANT_IRQS_OFF_ACTIVATE_MM
 	select BUILDTIME_TABLE_SORT
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 774430c3abff..7013a7d26da5 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -47,6 +47,14 @@ void ptdump_walk_user_pgd_level_checkwx(void);
 #define debug_checkwx_user()	do { } while (0)
 #endif
 
+#ifdef CONFIG_STATIC_PMD_ZERO_PAGE
+/*
+ * PMD_ZERO_PAGE is a global shared PMD page that is always zero.
+ */
+extern unsigned long empty_pmd_zero_page[(PMD_SIZE) / sizeof(unsigned long)]
+	__visible;
+#endif
+
 /*
  * ZERO_PAGE is a global shared page that is always zero: used
  * for zero-mapped memory areas etc..
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 3e9b3a3bd039..86aaa53fd619 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -714,6 +714,14 @@ EXPORT_SYMBOL(phys_base)
 #include "../xen/xen-head.S"
 
 	__PAGE_ALIGNED_BSS
+
+#ifdef CONFIG_STATIC_PMD_ZERO_PAGE
+SYM_DATA_START_PAGE_ALIGNED(empty_pmd_zero_page)
+	.skip PMD_SIZE
+SYM_DATA_END(empty_pmd_zero_page)
+EXPORT_SYMBOL(empty_pmd_zero_page)
+#endif
+
 SYM_DATA_START_PAGE_ALIGNED(empty_zero_page)
 	.skip PAGE_SIZE
 SYM_DATA_END(empty_zero_page)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index c8fbeaacf896..b20d60d68b3c 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -4018,10 +4018,10 @@ static inline bool vma_is_special_huge(const struct vm_area_struct *vma)
 
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */
 
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
 extern struct folio *huge_zero_folio;
 extern unsigned long huge_zero_pfn;
 
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
 static inline bool is_huge_zero_folio(const struct folio *folio)
 {
 	return READ_ONCE(huge_zero_folio) == folio;
@@ -4032,9 +4032,23 @@ static inline bool is_huge_zero_pmd(pmd_t pmd)
 	return pmd_present(pmd) && READ_ONCE(huge_zero_pfn) == pmd_pfn(pmd);
 }
 
+#ifdef CONFIG_STATIC_PMD_ZERO_PAGE
+static inline struct folio *mm_get_huge_zero_folio(struct mm_struct *mm)
+{
+	return READ_ONCE(huge_zero_folio);
+}
+
+static inline void mm_put_huge_zero_folio(struct mm_struct *mm)
+{
+	return;
+}
+
+#else
 struct folio *mm_get_huge_zero_folio(struct mm_struct *mm);
 void mm_put_huge_zero_folio(struct mm_struct *mm);
 
+#endif /* CONFIG_STATIC_PMD_ZERO_PAGE */
+
 #else
 static inline bool is_huge_zero_folio(const struct folio *folio)
 {
diff --git a/mm/Kconfig b/mm/Kconfig
index 781be3240e21..fd1c51995029 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -826,6 +826,19 @@ config ARCH_WANTS_THP_SWAP
 config MM_ID
 	def_bool n
 
+config ARCH_HAS_STATIC_PMD_ZERO_PAGE
+	def_bool n
+
+config STATIC_PMD_ZERO_PAGE
+	bool "Allocate a PMD page for zeroing"
+	depends on ARCH_HAS_STATIC_PMD_ZERO_PAGE
+	help
+	  Typically huge_zero_folio, which is a PMD-sized page of zeroes, is
+	  allocated on demand and deallocated when it is no longer in use.
+	  With this option, a PMD-sized zero page is allocated in .bss
+	  instead, and huge_zero_folio will use it rather than allocating
+	  one dynamically. Not suitable for memory-constrained systems.
+
 menuconfig TRANSPARENT_HUGEPAGE
 	bool "Transparent Hugepage Support"
 	depends on HAVE_ARCH_TRANSPARENT_HUGEPAGE && !PREEMPT_RT
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 101b67ab2eb6..c12ca7134e88 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -75,9 +75,6 @@ static unsigned long deferred_split_scan(struct shrinker *shrink,
 					 struct shrink_control *sc);
 static bool split_underused_thp = true;
 
-static atomic_t huge_zero_refcount;
-struct folio *huge_zero_folio __read_mostly;
-unsigned long huge_zero_pfn __read_mostly = ~0UL;
 unsigned long huge_anon_orders_always __read_mostly;
 unsigned long huge_anon_orders_madvise __read_mostly;
 unsigned long huge_anon_orders_inherit __read_mostly;
@@ -208,6 +205,23 @@ unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma,
 	return orders;
 }
 
+#ifdef CONFIG_STATIC_PMD_ZERO_PAGE
+static int huge_zero_page_shrinker_init(void)
+{
+	return 0;
+}
+
+static void huge_zero_page_shrinker_exit(void)
+{
+	return;
+}
+#else
+
+static struct shrinker *huge_zero_page_shrinker;
+static atomic_t huge_zero_refcount;
+struct folio *huge_zero_folio __read_mostly;
+unsigned long huge_zero_pfn __read_mostly = ~0UL;
+
 static bool get_huge_zero_page(void)
 {
 	struct folio *zero_folio;
@@ -288,7 +302,6 @@ static unsigned long shrink_huge_zero_page_scan(struct shrinker *shrink,
 	return 0;
 }
 
-static struct shrinker *huge_zero_page_shrinker;
 static int huge_zero_page_shrinker_init(void)
 {
 	huge_zero_page_shrinker = shrinker_alloc(0, "thp-zero");
@@ -307,6 +320,7 @@ static void huge_zero_page_shrinker_exit(void)
 	return;
 }
 
+#endif
 
 #ifdef CONFIG_SYSFS
 static ssize_t enabled_show(struct kobject *kobj,
@@ -2843,6 +2857,8 @@ static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
 	pte_t *pte;
 	int i;
 
+	/* FIXME: can this be called with the static zero page? */
+	VM_BUG_ON(IS_ENABLED(CONFIG_STATIC_PMD_ZERO_PAGE));
 	/*
 	 * Leave pmd empty until pte is filled note that it is fine to delay
 	 * notification until mmu_notifier_invalidate_range_end() as we are
diff --git a/mm/memory.c b/mm/memory.c
index 8eba595056fe..77721f5ae043 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -159,6 +159,25 @@ static int __init init_zero_pfn(void)
 }
 early_initcall(init_zero_pfn);
 
+#ifdef CONFIG_STATIC_PMD_ZERO_PAGE
+struct folio *huge_zero_folio __read_mostly;
+unsigned long huge_zero_pfn __read_mostly = ~0UL;
+
+static int __init init_pmd_zero_pfn(void)
+{
+	huge_zero_folio = virt_to_folio(empty_pmd_zero_page);
+	huge_zero_pfn = page_to_pfn(virt_to_page(empty_pmd_zero_page));
+
+	__folio_set_head(huge_zero_folio);
+	prep_compound_head((struct page *)huge_zero_folio, PMD_ORDER);
+	/* Ensure zero folio won't have large_rmappable flag set. */
+	folio_clear_large_rmappable(huge_zero_folio);
+
+	return 0;
+}
+early_initcall(init_pmd_zero_pfn);
+#endif
+
 void mm_trace_rss_stat(struct mm_struct *mm, int member)
 {
 	trace_rss_stat(mm, member);
-- 
2.49.0

