lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20250516101054.676046-2-p.raghav@samsung.com>
Date: Fri, 16 May 2025 12:10:52 +0200
From: Pankaj Raghav <p.raghav@...sung.com>
To: "Darrick J . Wong" <djwong@...nel.org>,
	hch@....de,
	willy@...radead.org
Cc: linux-kernel@...r.kernel.org,
	linux-mm@...ck.org,
	David Hildenbrand <david@...hat.com>,
	linux-fsdevel@...r.kernel.org,
	mcgrof@...nel.org,
	gost.dev@...sung.com,
	Andrew Morton <akpm@...ux-foundation.org>,
	kernel@...kajraghav.com,
	Pankaj Raghav <p.raghav@...sung.com>
Subject: [RFC 1/3] mm: add large zero page for efficient zeroing of larger segments

Introduce LARGE_ZERO_PAGE of size 2M as an alternative to ZERO_PAGE of
size PAGE_SIZE.

There are many places in the kernel where we need to zero out larger
chunks, but the maximum segment we can zero out at a time is limited by
PAGE_SIZE.

This is especially annoying in block devices and filesystems where we
attach multiple ZERO_PAGEs to the bio in different bvecs. With multipage
bvec support in the block layer, it is much more efficient to send out
larger zero pages as part of a single bvec.

While there are other options such as huge_zero_page, they can fail
under system memory pressure, requiring a fallback to ZERO_PAGE[3].

This idea (but not the implementation) was suggested during the review of
adding LBS support to XFS[1][2].

LARGE_ZERO_PAGE is added behind a config option so that systems that are
constrained by memory are not forced to use it.

[1] https://lore.kernel.org/linux-xfs/20231027051847.GA7885@lst.de/
[2] https://lore.kernel.org/linux-xfs/ZitIK5OnR7ZNY0IG@infradead.org/
[3] https://lore.kernel.org/linux-xfs/3pqmgrlewo6ctcwakdvbvjqixac5en6irlipe5aiz6vkylfyni@2luhrs36ke5r/

Suggested-by: Christoph Hellwig <hch@....de>
Signed-off-by: Pankaj Raghav <p.raghav@...sung.com>
---
 arch/Kconfig                   |  8 ++++++++
 arch/x86/include/asm/pgtable.h | 20 +++++++++++++++++++-
 arch/x86/kernel/head_64.S      |  9 ++++++++-
 3 files changed, 35 insertions(+), 2 deletions(-)

diff --git a/arch/Kconfig b/arch/Kconfig
index b0adb665041f..aefa519cb211 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -218,6 +218,14 @@ config USER_RETURN_NOTIFIER
 	  Provide a kernel-internal notification when a cpu is about to
 	  switch to user mode.
 
+config LARGE_ZERO_PAGE
+	bool "Large zero pages"
+	def_bool n
+	help
+	  2M sized zero pages for zeroing. This will reserve 2M sized
+	  physical pages for zeroing. Not suitable for memory constrained
+	  systems.
+
 config HAVE_IOREMAP_PROT
 	bool
 
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 3f59d7a16010..78eb83f2da34 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -17,6 +17,7 @@
 
 #ifndef __ASSEMBLER__
 #include <linux/spinlock.h>
+#include <linux/sizes.h>
 #include <asm/x86_init.h>
 #include <asm/pkru.h>
 #include <asm/fpu/api.h>
@@ -47,14 +48,31 @@ void ptdump_walk_user_pgd_level_checkwx(void);
 #define debug_checkwx_user()	do { } while (0)
 #endif
 
+#ifdef CONFIG_LARGE_ZERO_PAGE
+/*
+ * LARGE_ZERO_PAGE is a global shared page that is always zero: used
+ * for zero-mapped memory areas etc..
+ */
+extern unsigned long empty_large_zero_page[(SZ_2M) / sizeof(unsigned long)]
+	__visible;
+#define ZERO_LARGE_PAGE(vaddr) ((void)(vaddr),virt_to_page(empty_large_zero_page))
+
+#define ZERO_PAGE(vaddr) ZERO_LARGE_PAGE(vaddr)
+#define ZERO_LARGE_PAGE_SIZE SZ_2M
+#else
 /*
  * ZERO_PAGE is a global shared page that is always zero: used
  * for zero-mapped memory areas etc..
  */
-extern unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)]
+extern unsigned long empty_zero_page[(PAGE_SIZE) / sizeof(unsigned long)]
 	__visible;
 #define ZERO_PAGE(vaddr) ((void)(vaddr),virt_to_page(empty_zero_page))
 
+#define ZERO_LARGE_PAGE(vaddr) ZERO_PAGE(vaddr)
+
+#define ZERO_LARGE_PAGE_SIZE PAGE_SIZE
+#endif
+
 extern spinlock_t pgd_lock;
 extern struct list_head pgd_list;
 
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index fefe2a25cf02..ebcd12f72966 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -14,6 +14,7 @@
 #include <linux/threads.h>
 #include <linux/init.h>
 #include <linux/pgtable.h>
+#include <linux/sizes.h>
 #include <asm/segment.h>
 #include <asm/page.h>
 #include <asm/msr.h>
@@ -708,8 +709,14 @@ EXPORT_SYMBOL(phys_base)
 #include "../xen/xen-head.S"
 
 	__PAGE_ALIGNED_BSS
+#ifdef CONFIG_LARGE_ZERO_PAGE
+SYM_DATA_START_PAGE_ALIGNED(empty_large_zero_page)
+	.skip SZ_2M
+SYM_DATA_END(empty_large_zero_page)
+EXPORT_SYMBOL(empty_large_zero_page)
+#else
 SYM_DATA_START_PAGE_ALIGNED(empty_zero_page)
 	.skip PAGE_SIZE
 SYM_DATA_END(empty_zero_page)
 EXPORT_SYMBOL(empty_zero_page)
-
+#endif
-- 
2.47.2


Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ