Message-ID: <46bf07b6-633f-43b8-8e2b-b08d437494b9@amd.com>
Date: Fri, 26 Dec 2025 14:54:17 +0530
From: Raghavendra K T <raghavendra.kt@....com>
To: 李喆 <lizhe.67@...edance.com>, muchun.song@...ux.dev,
osalvador@...e.de, david@...nel.org, akpm@...ux-foundation.org,
fvdl@...gle.com
Cc: linux-mm@...ck.org, linux-kernel@...r.kernel.org
Subject: Re: [PATCH 1/8] mm/hugetlb: add pre-zeroed framework
On 12/25/2025 1:50 PM, 李喆 wrote:
> From: Li Zhe <lizhe.67@...edance.com>
>
> This patch establishes a pre-zeroing framework by introducing two new
> hugetlb page flags and extending the code at every point where these
> flags may later be needed. The roles of the two flags are as follows.
>
> (1) HPG_zeroed – indicates that the huge folio has already been zeroed
> (2) HPG_zeroing – marks that the huge folio is currently being zeroed
>
> No functional change, as nothing sets the flags yet.
>
> Co-developed-by: Frank van der Linden <fvdl@...gle.com>
> Signed-off-by: Frank van der Linden <fvdl@...gle.com>
> Signed-off-by: Li Zhe <lizhe.67@...edance.com>
> ---
> fs/hugetlbfs/inode.c | 3 +-
> include/linux/hugetlb.h | 26 +++++++++
> mm/hugetlb.c | 113 +++++++++++++++++++++++++++++++++++++---
> 3 files changed, 133 insertions(+), 9 deletions(-)
>
> diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
> index 3b4c152c5c73..be6b32ab3ca8 100644
> --- a/fs/hugetlbfs/inode.c
> +++ b/fs/hugetlbfs/inode.c
> @@ -828,8 +828,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset,
> error = PTR_ERR(folio);
> goto out;
> }
> - folio_zero_user(folio, addr);
> - __folio_mark_uptodate(folio);
> + hugetlb_zero_folio(folio, addr);
> error = hugetlb_add_to_page_cache(folio, mapping, index);
> if (unlikely(error)) {
> restore_reserve_on_error(h, &pseudo_vma, addr, folio);
> diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
> index 019a1c5281e4..2daf4422a17d 100644
> --- a/include/linux/hugetlb.h
> +++ b/include/linux/hugetlb.h
> @@ -584,6 +584,17 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
> * HPG_vmemmap_optimized - Set when the vmemmap pages of the page are freed.
> * HPG_raw_hwp_unreliable - Set when the hugetlb page has a hwpoison sub-page
> * that is not tracked by raw_hwp_page list.
> + * HPG_zeroed - page was pre-zeroed.
> + * Synchronization: hugetlb_lock held when set by pre-zero thread.
> + * Only valid to read outside hugetlb_lock once the page is off
> + * the freelist, and HPG_zeroing is clear. Always cleared when a
> + * page is put (back) on the freelist.
> + * HPG_zeroing - page is being zeroed by the pre-zero thread.
> + * Synchronization: set and cleared by the pre-zero thread with
> + * hugetlb_lock held. Access by others is read-only. Once the page
> + * is off the freelist, this can only change from set -> clear,
> + * which the new page owner must wait for. Always cleared
> + * when a page is put (back) on the freelist.
> */
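The documented rules look consistent to me. For other readers, spelling
out the expected new-owner sequence, using only helpers introduced in
this patch (a sketch of my reading, not necessarily the exact call chain):

	spin_lock_irq(&hugetlb_lock);
	folio = dequeue_hugetlb_folio_node_exact(h, nid, gfp_mask);
	spin_unlock_irq(&hugetlb_lock);

	/* HPG_zeroing can only go set -> clear now; wait that out */
	hpage_wait_zeroing(h, folio);

	/* HPG_zeroed is stable here; skips folio_zero_user() if set */
	hugetlb_zero_folio(folio, address);
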
> enum hugetlb_page_flags {
> HPG_restore_reserve = 0,
> @@ -593,6 +604,8 @@ enum hugetlb_page_flags {
> HPG_vmemmap_optimized,
> HPG_raw_hwp_unreliable,
> HPG_cma,
> + HPG_zeroed,
> + HPG_zeroing,
> __NR_HPAGEFLAGS,
> };
>
> @@ -653,6 +666,8 @@ HPAGEFLAG(Freed, freed)
> HPAGEFLAG(VmemmapOptimized, vmemmap_optimized)
> HPAGEFLAG(RawHwpUnreliable, raw_hwp_unreliable)
> HPAGEFLAG(Cma, cma)
> +HPAGEFLAG(Zeroed, zeroed)
> +HPAGEFLAG(Zeroing, zeroing)
>
> #ifdef CONFIG_HUGETLB_PAGE
>
> @@ -678,6 +693,12 @@ struct hstate {
> unsigned int nr_huge_pages_node[MAX_NUMNODES];
> unsigned int free_huge_pages_node[MAX_NUMNODES];
> unsigned int surplus_huge_pages_node[MAX_NUMNODES];
> +
> + unsigned int free_huge_pages_zero_node[MAX_NUMNODES];
> +
> + /* Queue to wait for a hugetlb folio that is being pre-zeroed */
> + wait_queue_head_t dqzero_wait[MAX_NUMNODES];
> +
> char name[HSTATE_NAME_LEN];
> };
>
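One thing I could not spot in the hunks here: where dqzero_wait[] gets
initialized (quite possibly in a part of the patch I trimmed). If it is
indeed missing, something along these lines would be needed; placing it
in hugetlb_add_hstate() is my assumption:

	int i;

	for (i = 0; i < MAX_NUMNODES; i++)
		init_waitqueue_head(&h->dqzero_wait[i]);
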
> @@ -711,6 +732,7 @@ int hugetlb_add_to_page_cache(struct folio *folio, struct address_space *mapping
> pgoff_t idx);
> void restore_reserve_on_error(struct hstate *h, struct vm_area_struct *vma,
> unsigned long address, struct folio *folio);
> +void hugetlb_zero_folio(struct folio *folio, unsigned long address);
>
> /* arch callback */
> int __init __alloc_bootmem_huge_page(struct hstate *h, int nid);
> @@ -1303,6 +1325,10 @@ static inline bool hugetlb_bootmem_allocated(void)
> {
> return false;
> }
> +
> +static inline void hugetlb_zero_folio(struct folio *folio, unsigned long address)
> +{
> +}
> #endif /* CONFIG_HUGETLB_PAGE */
>
> static inline spinlock_t *huge_pte_lock(struct hstate *h,
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index 51273baec9e5..d20614b1c927 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -93,6 +93,8 @@ static int hugetlb_param_index __initdata;
> static __init int hugetlb_add_param(char *s, int (*setup)(char *val));
> static __init void hugetlb_parse_params(void);
>
> +static void hpage_wait_zeroing(struct hstate *h, struct folio *folio);
> +
> #define hugetlb_early_param(str, func) \
> static __init int func##args(char *s) \
> { \
> @@ -1292,21 +1294,33 @@ void clear_vma_resv_huge_pages(struct vm_area_struct *vma)
> hugetlb_dup_vma_private(vma);
> }
>
> +/*
> + * Clear flags for either a fresh page or one that is being
> + * added to the free list.
> + */
> +static inline void prep_clear_zeroed(struct folio *folio)
> +{
> + folio_clear_hugetlb_zeroed(folio);
> + folio_clear_hugetlb_zeroing(folio);
> +}
> +
> static void enqueue_hugetlb_folio(struct hstate *h, struct folio *folio)
> {
> int nid = folio_nid(folio);
>
> lockdep_assert_held(&hugetlb_lock);
> VM_BUG_ON_FOLIO(folio_ref_count(folio), folio);
> + VM_WARN_ON_FOLIO(folio_test_hugetlb_zeroing(folio), folio);
>
> list_move(&folio->lru, &h->hugepage_freelists[nid]);
> h->free_huge_pages++;
> h->free_huge_pages_node[nid]++;
> + prep_clear_zeroed(folio);
> folio_set_hugetlb_freed(folio);
> }
>
> -static struct folio *dequeue_hugetlb_folio_node_exact(struct hstate *h,
> - int nid)
> +static struct folio *dequeue_hugetlb_folio_node_exact(struct hstate *h, int nid,
> + gfp_t gfp_mask)
> {
> struct folio *folio;
> bool pin = !!(current->flags & PF_MEMALLOC_PIN);
> @@ -1316,6 +1330,16 @@ static struct folio *dequeue_hugetlb_folio_node_exact(struct hstate *h,
> if (pin && !folio_is_longterm_pinnable(folio))
> continue;
>
> + /*
> + * This shouldn't happen, as hugetlb pages are never allocated
> + * with GFP_ATOMIC. But be paranoid and check for it, as
> + * a page with HPG_zeroing set might cause a sleep later in
> + * hpage_wait_zeroing().
> + */
> + if (WARN_ON_ONCE(folio_test_hugetlb_zeroing(folio) &&
> + !gfpflags_allow_blocking(gfp_mask)))
> + continue;
> +
> if (folio_test_hwpoison(folio))
> continue;
>
> @@ -1327,6 +1351,10 @@ static struct folio *dequeue_hugetlb_folio_node_exact(struct hstate *h,
> folio_clear_hugetlb_freed(folio);
> h->free_huge_pages--;
> h->free_huge_pages_node[nid]--;
> + if (folio_test_hugetlb_zeroed(folio) ||
> + folio_test_hugetlb_zeroing(folio))
> + h->free_huge_pages_zero_node[nid]--;
> +
> return folio;
> }
>
> @@ -1363,7 +1391,7 @@ static struct folio *dequeue_hugetlb_folio_nodemask(struct hstate *h, gfp_t gfp_
> continue;
> node = zone_to_nid(zone);
>
> - folio = dequeue_hugetlb_folio_node_exact(h, node);
> + folio = dequeue_hugetlb_folio_node_exact(h, node, gfp_mask);
> if (folio)
> return folio;
> }
> @@ -1490,7 +1518,15 @@ void remove_hugetlb_folio(struct hstate *h, struct folio *folio,
> folio_clear_hugetlb_freed(folio);
> h->free_huge_pages--;
> h->free_huge_pages_node[nid]--;
> }
> + /*
> + * Adjust the zero page counters now. Note that
> + * if a page is currently being zeroed, that
> + * will be waited for in update_and_free_hugetlb_folio().
> + */
> + if (folio_test_hugetlb_zeroed(folio) ||
> + folio_test_hugetlb_zeroing(folio))
> + h->free_huge_pages_zero_node[nid]--;
> if (adjust_surplus) {
> h->surplus_huge_pages--;
> h->surplus_huge_pages_node[nid]--;
> @@ -1543,6 +1580,8 @@ static void __update_and_free_hugetlb_folio(struct hstate *h,
> {
> bool clear_flag = folio_test_hugetlb_vmemmap_optimized(folio);
>
> + VM_WARN_ON_FOLIO(folio_test_hugetlb_zeroing(folio), folio);
> +
> if (hstate_is_gigantic_no_runtime(h))
> return;
>
> @@ -1627,6 +1666,7 @@ static void free_hpage_workfn(struct work_struct *work)
> */
> h = size_to_hstate(folio_size(folio));
>
> + hpage_wait_zeroing(h, folio);
> __update_and_free_hugetlb_folio(h, folio);
>
> cond_resched();
> @@ -1643,7 +1683,8 @@ static inline void flush_free_hpage_work(struct hstate *h)
> static void update_and_free_hugetlb_folio(struct hstate *h, struct folio *folio,
> bool atomic)
> {
> - if (!folio_test_hugetlb_vmemmap_optimized(folio) || !atomic) {
> + if ((!folio_test_hugetlb_zeroing(folio) &&
> + !folio_test_hugetlb_vmemmap_optimized(folio)) || !atomic) {
> __update_and_free_hugetlb_folio(h, folio);
> return;
> }
> @@ -1840,6 +1881,13 @@ static void account_new_hugetlb_folio(struct hstate *h, struct folio *folio)
> h->nr_huge_pages_node[folio_nid(folio)]++;
> }
>
> +static void prep_new_hugetlb_folio(struct folio *folio)
> +{
> + lockdep_assert_held(&hugetlb_lock);
> + folio_clear_hugetlb_freed(folio);
> + prep_clear_zeroed(folio);
> +}
> +
> void init_new_hugetlb_folio(struct folio *folio)
> {
> __folio_set_hugetlb(folio);
> @@ -1964,6 +2012,7 @@ void prep_and_add_allocated_folios(struct hstate *h,
> /* Add all new pool pages to free lists in one lock cycle */
> spin_lock_irqsave(&hugetlb_lock, flags);
> list_for_each_entry_safe(folio, tmp_f, folio_list, lru) {
> + prep_new_hugetlb_folio(folio);
> account_new_hugetlb_folio(h, folio);
> enqueue_hugetlb_folio(h, folio);
> }
> @@ -2171,6 +2220,7 @@ static struct folio *alloc_surplus_hugetlb_folio(struct hstate *h,
> return NULL;
>
> spin_lock_irq(&hugetlb_lock);
> + prep_new_hugetlb_folio(folio);
> /*
> * nr_huge_pages needs to be adjusted within the same lock cycle
> * as surplus_pages, otherwise it might confuse
> @@ -2214,6 +2264,7 @@ static struct folio *alloc_migrate_hugetlb_folio(struct hstate *h, gfp_t gfp_mas
> return NULL;
>
> spin_lock_irq(&hugetlb_lock);
> + prep_new_hugetlb_folio(folio);
> account_new_hugetlb_folio(h, folio);
> spin_unlock_irq(&hugetlb_lock);
>
> @@ -2289,6 +2340,13 @@ struct folio *alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid,
> preferred_nid, nmask);
> if (folio) {
> spin_unlock_irq(&hugetlb_lock);
> + /*
> + * The contents of this page will be completely
> + * overwritten immediately, as it's a migration
> + * target, so no clearing is needed. Do wait in
> + * case the pre-zero thread was working on it, though.
> + */
> + hpage_wait_zeroing(h, folio);
> return folio;
> }
> }
> @@ -2779,6 +2837,7 @@ static int alloc_and_dissolve_hugetlb_folio(struct folio *old_folio,
> */
> remove_hugetlb_folio(h, old_folio, false);
>
> + prep_new_hugetlb_folio(new_folio);
> /*
> * Ref count on new_folio is already zero as it was dropped
> * earlier. It can be directly added to the pool free list.
> @@ -2999,6 +3058,8 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma,
>
> spin_unlock_irq(&hugetlb_lock);
>
> + hpage_wait_zeroing(h, folio);
> +
> hugetlb_set_folio_subpool(folio, spool);
>
> if (map_chg != MAP_CHG_ENFORCED) {
> @@ -3257,6 +3318,7 @@ static void __init prep_and_add_bootmem_folios(struct hstate *h,
> hugetlb_bootmem_init_migratetype(folio, h);
> /* Subdivide locks to achieve better parallel performance */
> spin_lock_irqsave(&hugetlb_lock, flags);
> + prep_new_hugetlb_folio(folio);
> account_new_hugetlb_folio(h, folio);
> enqueue_hugetlb_folio(h, folio);
> spin_unlock_irqrestore(&hugetlb_lock, flags);
> @@ -4190,6 +4252,42 @@ bool __init __attribute((weak)) arch_hugetlb_valid_size(unsigned long size)
> return size == HPAGE_SIZE;
> }
>
> +/*
> + * Zero a hugetlb page.
> + *
> + * The caller has already made sure that the page is not
> + * being actively zeroed out in the background.
> + *
> + * If it wasn't zeroed out, do it ourselves.
> + */
> +void hugetlb_zero_folio(struct folio *folio, unsigned long address)
> +{
> + if (!folio_test_hugetlb_zeroed(folio))
> + folio_zero_user(folio, address);
> +
> + __folio_mark_uptodate(folio);
> +}
> +
> +/*
> + * Once a page has been taken off the freelist, the new page owner
> + * must wait for the pre-zero thread to finish if it happens
> + * to be working on this page (which should be rare).
> + */
> +static void hpage_wait_zeroing(struct hstate *h, struct folio *folio)
> +{
> + if (!folio_test_hugetlb_zeroing(folio))
> + return;
> +
> + spin_lock_irq(&hugetlb_lock);
> +
> + wait_event_cmd(h->dqzero_wait[folio_nid(folio)],
> + !folio_test_hugetlb_zeroing(folio),
> + spin_unlock_irq(&hugetlb_lock),
> + spin_lock_irq(&hugetlb_lock));
> +
> + spin_unlock_irq(&hugetlb_lock);
> +}
> +
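Since nothing sets HPG_zeroing yet in this patch, a note on what the
waker side is expected to look like might help review. My assumption,
purely from the synchronization rules documented in hugetlb.h (the real
pre-zero thread presumably lands later in this series):

	spin_lock_irq(&hugetlb_lock);
	folio_set_hugetlb_zeroed(folio);
	folio_clear_hugetlb_zeroing(folio);
	spin_unlock_irq(&hugetlb_lock);
	wake_up(&h->dqzero_wait[folio_nid(folio)]);
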
nit:
Maybe hpage_wait_zeroing() above is a simple enough chunk to introduce
guard(); rough sketch below.
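Something like this (untested; assuming the manual unlock/relock inside
wait_event_cmd() stays as-is and the guard only replaces the trailing
spin_unlock_irq()):

	static void hpage_wait_zeroing(struct hstate *h, struct folio *folio)
	{
		if (!folio_test_hugetlb_zeroing(folio))
			return;

		guard(spinlock_irq)(&hugetlb_lock);

		wait_event_cmd(h->dqzero_wait[folio_nid(folio)],
			       !folio_test_hugetlb_zeroing(folio),
			       spin_unlock_irq(&hugetlb_lock),
			       spin_lock_irq(&hugetlb_lock));
	}
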
[...]
Regards
- Raghu