Message-ID: <20250527050452.817674-3-p.raghav@samsung.com>
Date: Tue, 27 May 2025 07:04:51 +0200
From: Pankaj Raghav <p.raghav@...sung.com>
To: Suren Baghdasaryan <surenb@...gle.com>,
Ryan Roberts <ryan.roberts@....com>,
Vlastimil Babka <vbabka@...e.cz>,
Baolin Wang <baolin.wang@...ux.alibaba.com>,
Borislav Petkov <bp@...en8.de>,
Ingo Molnar <mingo@...hat.com>,
"H . Peter Anvin" <hpa@...or.com>,
Zi Yan <ziy@...dia.com>,
Mike Rapoport <rppt@...nel.org>,
Dave Hansen <dave.hansen@...ux.intel.com>,
Michal Hocko <mhocko@...e.com>,
David Hildenbrand <david@...hat.com>,
Lorenzo Stoakes <lorenzo.stoakes@...cle.com>,
Andrew Morton <akpm@...ux-foundation.org>,
Thomas Gleixner <tglx@...utronix.de>,
Nico Pache <npache@...hat.com>,
Dev Jain <dev.jain@....com>,
"Liam R . Howlett" <Liam.Howlett@...cle.com>,
Jens Axboe <axboe@...nel.dk>
Cc: linux-kernel@...r.kernel.org,
linux-mm@...ck.org,
linux-block@...r.kernel.org,
willy@...radead.org,
x86@...nel.org,
linux-fsdevel@...r.kernel.org,
"Darrick J . Wong" <djwong@...nel.org>,
mcgrof@...nel.org,
gost.dev@...sung.com,
kernel@...kajraghav.com,
hch@....de,
Pankaj Raghav <p.raghav@...sung.com>
Subject: [RFC 2/3] mm: add STATIC_PMD_ZERO_PAGE config option
There are many places in the kernel where we need to zero out larger
chunks, but the maximum segment we can zero out at a time with ZERO_PAGE
is limited to PAGE_SIZE.
This is especially annoying in block devices and filesystems, where we
attach multiple ZERO_PAGEs to the bio in different bvecs. With multipage
bvec support in the block layer, it is much more efficient to send out
a larger zero page as part of a single bvec.
This concern was raised during the review of adding LBS support to
XFS[1][2].
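As an illustration (not part of this patch): zeroing a 2M range today
takes one bvec per ZERO_PAGE, whereas a PMD zero folio fits in a single
multipage bvec. A rough sketch, assuming the bio has already been
allocated with enough vecs:

	unsigned int i;

	/* today: one bvec per PAGE_SIZE chunk of zeroes */
	for (i = 0; i < SZ_2M / PAGE_SIZE; i++)
		bio_add_page(bio, ZERO_PAGE(0), PAGE_SIZE, 0);

	/* with a PMD zero folio: a single multipage bvec */
	bio_add_folio(bio, huge_zero_folio, SZ_2M, 0);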
Usually, the huge_zero_folio is allocated on demand, and it is
deallocated by the shrinker once there are no users of it left.
Add a config option STATIC_PMD_ZERO_PAGE that will always allocate
the huge_zero_folio, which will then never be freed. This makes it
possible to use the huge_zero_folio without passing any mm struct and
without calling put_folio in the destructor.
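With the option enabled, a caller could then look roughly like the
following (hypothetical user, not part of this patch):

	struct folio *zero_folio = mm_get_huge_zero_folio(NULL);

	if (zero_folio)
		bio_add_folio(bio, zero_folio, folio_size(zero_folio), 0);
	/* no mm_put_huge_zero_folio() needed on the teardown path */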
We can enable it by default on x86_64, where the PMD size is 2M; this
is a good compromise between memory usage and efficiency. As a PMD zero
page might be wasteful on architectures with larger page sizes (with
64K base pages, for example, a PMD spans 512M), let's not enable it
for them.
[1] https://lore.kernel.org/linux-xfs/20231027051847.GA7885@lst.de/
[2] https://lore.kernel.org/linux-xfs/ZitIK5OnR7ZNY0IG@infradead.org/
Suggested-by: David Hildenbrand <david@...hat.com>
Signed-off-by: Pankaj Raghav <p.raghav@...sung.com>
---
 arch/x86/Kconfig |  1 +
 mm/Kconfig       | 12 ++++++++++++
 mm/memory.c      | 30 ++++++++++++++++++++++++++----
 3 files changed, 39 insertions(+), 4 deletions(-)
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 055204dc211d..96f99b4f96ea 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -152,6 +152,7 @@ config X86
 	select ARCH_WANT_OPTIMIZE_HUGETLB_VMEMMAP	if X86_64
 	select ARCH_WANT_HUGETLB_VMEMMAP_PREINIT	if X86_64
 	select ARCH_WANTS_THP_SWAP		if X86_64
+	select ARCH_WANTS_STATIC_PMD_ZERO_PAGE	if X86_64
 	select ARCH_HAS_PARANOID_L1D_FLUSH
 	select BUILDTIME_TABLE_SORT
 	select CLKEVT_I8253
diff --git a/mm/Kconfig b/mm/Kconfig
index bd08e151fa1b..8f50f5c3f7a7 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -826,6 +826,18 @@ config ARCH_WANTS_THP_SWAP
 config MM_ID
 	def_bool n
 
+config ARCH_WANTS_STATIC_PMD_ZERO_PAGE
+	bool
+
+config STATIC_PMD_ZERO_PAGE
+	def_bool y
+	depends on ARCH_WANTS_STATIC_PMD_ZERO_PAGE
+	help
+	  Typically huge_zero_folio, which is a PMD page of zeroes, is
+	  allocated on demand and deallocated when not in use. This option
+	  will always allocate the huge_zero_folio for zeroing, and it will
+	  never be deallocated. Not suitable for memory-constrained systems.
+
 menuconfig TRANSPARENT_HUGEPAGE
 	bool "Transparent Hugepage Support"
 	depends on HAVE_ARCH_TRANSPARENT_HUGEPAGE && !PREEMPT_RT
diff --git a/mm/memory.c b/mm/memory.c
index 11edc4d66e74..ab8c16d04307 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -203,9 +203,17 @@ static void put_huge_zero_page(void)
 	BUG_ON(atomic_dec_and_test(&huge_zero_refcount));
 }
 
+/*
+ * If STATIC_PMD_ZERO_PAGE is enabled, @mm can be NULL, i.e., the
+ * huge_zero_folio is not associated with any mm_struct.
+ */
 struct folio *mm_get_huge_zero_folio(struct mm_struct *mm)
 {
-	if (test_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
+	if (!IS_ENABLED(CONFIG_STATIC_PMD_ZERO_PAGE) && !mm)
+		return NULL;
+
+	if (IS_ENABLED(CONFIG_STATIC_PMD_ZERO_PAGE) ||
+	    test_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
 		return READ_ONCE(huge_zero_folio);
 
 	if (!get_huge_zero_page())
@@ -219,6 +227,9 @@ struct folio *mm_get_huge_zero_folio(struct mm_struct *mm)
 
 void mm_put_huge_zero_folio(struct mm_struct *mm)
 {
+	if (IS_ENABLED(CONFIG_STATIC_PMD_ZERO_PAGE))
+		return;
+
 	if (test_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
 		put_huge_zero_page();
 }
@@ -246,15 +257,26 @@ static unsigned long shrink_huge_zero_page_scan(struct shrinker *shrink,
 
 static int __init init_huge_zero_page(void)
 {
+	int ret = 0;
+
+	if (IS_ENABLED(CONFIG_STATIC_PMD_ZERO_PAGE)) {
+		if (!get_huge_zero_page())
+			ret = -ENOMEM;
+		goto out;
+	}
+
 	huge_zero_page_shrinker = shrinker_alloc(0, "thp-zero");
-	if (!huge_zero_page_shrinker)
-		return -ENOMEM;
+	if (!huge_zero_page_shrinker) {
+		ret = -ENOMEM;
+		goto out;
+	}
 
 	huge_zero_page_shrinker->count_objects = shrink_huge_zero_page_count;
 	huge_zero_page_shrinker->scan_objects = shrink_huge_zero_page_scan;
 	shrinker_register(huge_zero_page_shrinker);
 
-	return 0;
+out:
+	return ret;
 }
 early_initcall(init_huge_zero_page);
--
2.47.2