Message-ID: <46bf07b6-633f-43b8-8e2b-b08d437494b9@amd.com>
Date: Fri, 26 Dec 2025 14:54:17 +0530
From: Raghavendra K T <raghavendra.kt@....com>
To: 李喆 <lizhe.67@...edance.com>, muchun.song@...ux.dev,
osalvador@...e.de, david@...nel.org, akpm@...ux-foundation.org,
fvdl@...gle.com
Cc: linux-mm@...ck.org, linux-kernel@...r.kernel.org
Subject: Re: [PATCH 1/8] mm/hugetlb: add pre-zeroed framework
On 12/25/2025 1:50 PM, 李喆 wrote:
> From: Li Zhe <lizhe.67@...edance.com>
>
> This patch establishes a pre-zeroing framework by introducing two new
> hugetlb page flags and extending the code at every point where these
> flags may later be needed. The roles of the two flags are as follows.
>
> (1) HPG_zeroed – indicates that the huge folio has already been zeroed
> (2) HPG_zeroing – marks that the huge folio is currently being zeroed
>
> No functional change, as nothing sets the flags yet.
>
> Co-developed-by: Frank van der Linden <fvdl@...gle.com>
> Signed-off-by: Frank van der Linden <fvdl@...gle.com>
> Signed-off-by: Li Zhe <lizhe.67@...edance.com>
> ---
> fs/hugetlbfs/inode.c | 3 +-
> include/linux/hugetlb.h | 26 +++++++++
> mm/hugetlb.c | 113 +++++++++++++++++++++++++++++++++++++---
> 3 files changed, 133 insertions(+), 9 deletions(-)
>
> diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
> index 3b4c152c5c73..be6b32ab3ca8 100644
> --- a/fs/hugetlbfs/inode.c
> +++ b/fs/hugetlbfs/inode.c
> @@ -828,8 +828,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset,
> error = PTR_ERR(folio);
> goto out;
> }
> - folio_zero_user(folio, addr);
> - __folio_mark_uptodate(folio);
> + hugetlb_zero_folio(folio, addr);
> error = hugetlb_add_to_page_cache(folio, mapping, index);
> if (unlikely(error)) {
> restore_reserve_on_error(h, &pseudo_vma, addr, folio);
> diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
> index 019a1c5281e4..2daf4422a17d 100644
> --- a/include/linux/hugetlb.h
> +++ b/include/linux/hugetlb.h
> @@ -584,6 +584,17 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
> * HPG_vmemmap_optimized - Set when the vmemmap pages of the page are freed.
> * HPG_raw_hwp_unreliable - Set when the hugetlb page has a hwpoison sub-page
> * that is not tracked by raw_hwp_page list.
> + * HPG_zeroed - page was pre-zeroed.
> + * Synchronization: hugetlb_lock held when set by pre-zero thread.
> + * Only valid to read outside hugetlb_lock once the page is off
> + * the freelist, and HPG_zeroing is clear. Always cleared when a
> + * page is put (back) on the freelist.
> + * HPG_zeroing - page is being zeroed by the pre-zero thread.
> + * Synchronization: set and cleared by the pre-zero thread with
> + * hugetlb_lock held. Access by others is read-only. Once the page
> + * is off the freelist, this can only change from set -> clear,
> + * which the new page owner must wait for. Always cleared
> + * when a page is put (back) on the freelist.
> */
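The documented rules look consistent to me. For other readers, spelling
out the expected new-owner sequence, using only helpers introduced in
this patch (a sketch of my reading, not necessarily the exact call chain):

	spin_lock_irq(&hugetlb_lock);
	folio = dequeue_hugetlb_folio_node_exact(h, nid, gfp_mask);
	spin_unlock_irq(&hugetlb_lock);

	/* HPG_zeroing can only go set -> clear now; wait that out */
	hpage_wait_zeroing(h, folio);

	/* HPG_zeroed is stable here; skips folio_zero_user() if set */
	hugetlb_zero_folio(folio, address);
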
> enum hugetlb_page_flags {
> HPG_restore_reserve = 0,
> @@ -593,6 +604,8 @@ enum hugetlb_page_flags {
> HPG_vmemmap_optimized,
> HPG_raw_hwp_unreliable,
> HPG_cma,
> + HPG_zeroed,
> + HPG_zeroing,
> __NR_HPAGEFLAGS,
> };
>
> @@ -653,6 +666,8 @@ HPAGEFLAG(Freed, freed)
> HPAGEFLAG(VmemmapOptimized, vmemmap_optimized)
> HPAGEFLAG(RawHwpUnreliable, raw_hwp_unreliable)
> HPAGEFLAG(Cma, cma)
> +HPAGEFLAG(Zeroed, zeroed)
> +HPAGEFLAG(Zeroing, zeroing)
>
> #ifdef CONFIG_HUGETLB_PAGE
>
> @@ -678,6 +693,12 @@ struct hstate {
> unsigned int nr_huge_pages_node[MAX_NUMNODES];
> unsigned int free_huge_pages_node[MAX_NUMNODES];
> unsigned int surplus_huge_pages_node[MAX_NUMNODES];
> +
> + unsigned int free_huge_pages_zero_node[MAX_NUMNODES];
> +
> + /* Queue to wait for a hugetlb folio that is being pre-zeroed */
> + wait_queue_head_t dqzero_wait[MAX_NUMNODES];
> +
> char name[HSTATE_NAME_LEN];
> };
>
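One thing I could not spot in the hunks here: where dqzero_wait[] gets
initialized (quite possibly in a part of the patch I trimmed). If it is
indeed missing, something along these lines would be needed; placing it
in hugetlb_add_hstate() is my assumption:

	int i;

	for (i = 0; i < MAX_NUMNODES; i++)
		init_waitqueue_head(&h->dqzero_wait[i]);
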
> @@ -711,6 +732,7 @@ int hugetlb_add_to_page_cache(struct folio *folio, struct address_space *mapping
> pgoff_t idx);
> void restore_reserve_on_error(struct hstate *h, struct vm_area_struct *vma,
> unsigned long address, struct folio *folio);
> +void hugetlb_zero_folio(struct folio *folio, unsigned long address);
>
> /* arch callback */
> int __init __alloc_bootmem_huge_page(struct hstate *h, int nid);
> @@ -1303,6 +1325,10 @@ static inline bool hugetlb_bootmem_allocated(void)
> {
> return false;
> }
> +
> +static inline void hugetlb_zero_folio(struct folio *folio, unsigned long address)
> +{
> +}
> #endif /* CONFIG_HUGETLB_PAGE */
>
> static inline spinlock_t *huge_pte_lock(struct hstate *h,
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index 51273baec9e5..d20614b1c927 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -93,6 +93,8 @@ static int hugetlb_param_index __initdata;
> static __init int hugetlb_add_param(char *s, int (*setup)(char *val));
> static __init void hugetlb_parse_params(void);
>
> +static void hpage_wait_zeroing(struct hstate *h, struct folio *folio);
> +
> #define hugetlb_early_param(str, func) \
> static __init int func##args(char *s) \
> { \
> @@ -1292,21 +1294,33 @@ void clear_vma_resv_huge_pages(struct vm_area_struct *vma)
> hugetlb_dup_vma_private(vma);
> }
>
> +/*
> + * Clear flags for either a fresh page or one that is being
> + * added to the free list.
> + */
> +static inline void prep_clear_zeroed(struct folio *folio)
> +{
> + folio_clear_hugetlb_zeroed(folio);
> + folio_clear_hugetlb_zeroing(folio);
> +}
> +
> static void enqueue_hugetlb_folio(struct hstate *h, struct folio *folio)
> {
> int nid = folio_nid(folio);
>
> lockdep_assert_held(&hugetlb_lock);
> VM_BUG_ON_FOLIO(folio_ref_count(folio), folio);
> + VM_WARN_ON_FOLIO(folio_test_hugetlb_zeroing(folio), folio);
>
> list_move(&folio->lru, &h->hugepage_freelists[nid]);
> h->free_huge_pages++;
> h->free_huge_pages_node[nid]++;
> + prep_clear_zeroed(folio);
> folio_set_hugetlb_freed(folio);
> }
>
> -static struct folio *dequeue_hugetlb_folio_node_exact(struct hstate *h,
> - int nid)
> +static struct folio *dequeue_hugetlb_folio_node_exact(struct hstate *h, int nid,
> + gfp_t gfp_mask)
> {
> struct folio *folio;
> bool pin = !!(current->flags & PF_MEMALLOC_PIN);
> @@ -1316,6 +1330,16 @@ static struct folio *dequeue_hugetlb_folio_node_exact(struct hstate *h,
> if (pin && !folio_is_longterm_pinnable(folio))
> continue;
>
> + /*
> + * This shouldn't happen, as hugetlb pages are never allocated
> + * with GFP_ATOMIC. But be paranoid and check for it, as
> + * a page with HPG_zeroing set might cause a sleep later in
> + * hpage_wait_zeroing().
> + */
> + if (WARN_ON_ONCE(folio_test_hugetlb_zeroing(folio) &&
> + !gfpflags_allow_blocking(gfp_mask)))
> + continue;
> +
> if (folio_test_hwpoison(folio))
> continue;
>
> @@ -1327,6 +1351,10 @@ static struct folio *dequeue_hugetlb_folio_node_exact(struct hstate *h,
> folio_clear_hugetlb_freed(folio);
> h->free_huge_pages--;
> h->free_huge_pages_node[nid]--;
> + if (folio_test_hugetlb_zeroed(folio) ||
> + folio_test_hugetlb_zeroing(folio))
> + h->free_huge_pages_zero_node[nid]--;
> +
> return folio;
> }
>
> @@ -1363,7 +1391,7 @@ static struct folio *dequeue_hugetlb_folio_nodemask(struct hstate *h, gfp_t gfp_
> continue;
> node = zone_to_nid(zone);
>
> - folio = dequeue_hugetlb_folio_node_exact(h, node);
> + folio = dequeue_hugetlb_folio_node_exact(h, node, gfp_mask);
> if (folio)
> return folio;
> }
> @@ -1490,7 +1518,15 @@ void remove_hugetlb_folio(struct hstate *h, struct folio *folio,
> folio_clear_hugetlb_freed(folio);
> h->free_huge_pages--;
> h->free_huge_pages_node[nid]--;
> }
> + /*
> + * Adjust the zero page counters now. Note that
> + * if a page is currently being zeroed, that
> + * will be waited for in update_and_free_hugetlb_folio().
> + */
> + if (folio_test_hugetlb_zeroed(folio) ||
> + folio_test_hugetlb_zeroing(folio))
> + h->free_huge_pages_zero_node[nid]--;
> if (adjust_surplus) {
> h->surplus_huge_pages--;
> h->surplus_huge_pages_node[nid]--;
> @@ -1543,6 +1580,8 @@ static void __update_and_free_hugetlb_folio(struct hstate *h,
> {
> bool clear_flag = folio_test_hugetlb_vmemmap_optimized(folio);
>
> + VM_WARN_ON_FOLIO(folio_test_hugetlb_zeroing(folio), folio);
> +
> if (hstate_is_gigantic_no_runtime(h))
> return;
>
> @@ -1627,6 +1666,7 @@ static void free_hpage_workfn(struct work_struct *work)
> */
> h = size_to_hstate(folio_size(folio));
>
> + hpage_wait_zeroing(h, folio);
> __update_and_free_hugetlb_folio(h, folio);
>
> cond_resched();
> @@ -1643,7 +1683,8 @@ static inline void flush_free_hpage_work(struct hstate *h)
> static void update_and_free_hugetlb_folio(struct hstate *h, struct folio *folio,
> bool atomic)
> {
> - if (!folio_test_hugetlb_vmemmap_optimized(folio) || !atomic) {
> + if ((!folio_test_hugetlb_zeroing(folio) &&
> + !folio_test_hugetlb_vmemmap_optimized(folio)) || !atomic) {
> __update_and_free_hugetlb_folio(h, folio);
> return;
> }
> @@ -1840,6 +1881,13 @@ static void account_new_hugetlb_folio(struct hstate *h, struct folio *folio)
> h->nr_huge_pages_node[folio_nid(folio)]++;
> }
>
> +static void prep_new_hugetlb_folio(struct folio *folio)
> +{
> + lockdep_assert_held(&hugetlb_lock);
> + folio_clear_hugetlb_freed(folio);
> + prep_clear_zeroed(folio);
> +}
> +
> void init_new_hugetlb_folio(struct folio *folio)
> {
> __folio_set_hugetlb(folio);
> @@ -1964,6 +2012,7 @@ void prep_and_add_allocated_folios(struct hstate *h,
> /* Add all new pool pages to free lists in one lock cycle */
> spin_lock_irqsave(&hugetlb_lock, flags);
> list_for_each_entry_safe(folio, tmp_f, folio_list, lru) {
> + prep_new_hugetlb_folio(folio);
> account_new_hugetlb_folio(h, folio);
> enqueue_hugetlb_folio(h, folio);
> }
> @@ -2171,6 +2220,7 @@ static struct folio *alloc_surplus_hugetlb_folio(struct hstate *h,
> return NULL;
>
> spin_lock_irq(&hugetlb_lock);
> + prep_new_hugetlb_folio(folio);
> /*
> * nr_huge_pages needs to be adjusted within the same lock cycle
> * as surplus_pages, otherwise it might confuse
> @@ -2214,6 +2264,7 @@ static struct folio *alloc_migrate_hugetlb_folio(struct hstate *h, gfp_t gfp_mas
> return NULL;
>
> spin_lock_irq(&hugetlb_lock);
> + prep_new_hugetlb_folio(folio);
> account_new_hugetlb_folio(h, folio);
> spin_unlock_irq(&hugetlb_lock);
>
> @@ -2289,6 +2340,13 @@ struct folio *alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid,
> preferred_nid, nmask);
> if (folio) {
> spin_unlock_irq(&hugetlb_lock);
> + /*
> + * The contents of this page will be completely
> + * overwritten immediately, as it's a migration
> + * target, so no clearing is needed. Do wait in
> + * case the pre-zero thread was working on it, though.
> + */
> + hpage_wait_zeroing(h, folio);
> return folio;
> }
> }
> @@ -2779,6 +2837,7 @@ static int alloc_and_dissolve_hugetlb_folio(struct folio *old_folio,
> */
> remove_hugetlb_folio(h, old_folio, false);
>
> + prep_new_hugetlb_folio(new_folio);
> /*
> * Ref count on new_folio is already zero as it was dropped
> * earlier. It can be directly added to the pool free list.
> @@ -2999,6 +3058,8 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma,
>
> spin_unlock_irq(&hugetlb_lock);
>
> + hpage_wait_zeroing(h, folio);
> +
> hugetlb_set_folio_subpool(folio, spool);
>
> if (map_chg != MAP_CHG_ENFORCED) {
> @@ -3257,6 +3318,7 @@ static void __init prep_and_add_bootmem_folios(struct hstate *h,
> hugetlb_bootmem_init_migratetype(folio, h);
> /* Subdivide locks to achieve better parallel performance */
> spin_lock_irqsave(&hugetlb_lock, flags);
> + prep_new_hugetlb_folio(folio);
> account_new_hugetlb_folio(h, folio);
> enqueue_hugetlb_folio(h, folio);
> spin_unlock_irqrestore(&hugetlb_lock, flags);
> @@ -4190,6 +4252,42 @@ bool __init __attribute((weak)) arch_hugetlb_valid_size(unsigned long size)
> return size == HPAGE_SIZE;
> }
>
> +/*
> + * Zero a hugetlb page.
> + *
> + * The caller has already made sure that the page is not
> + * being actively zeroed out in the background.
> + *
> + * If it wasn't zeroed out, do it ourselves.
> + */
> +void hugetlb_zero_folio(struct folio *folio, unsigned long address)
> +{
> + if (!folio_test_hugetlb_zeroed(folio))
> + folio_zero_user(folio, address);
> +
> + __folio_mark_uptodate(folio);
> +}
> +
> +/*
> + * Once a page has been taken off the freelist, the new page owner
> + * must wait for the pre-zero thread to finish if it happens
> + * to be working on this page (which should be rare).
> + */
> +static void hpage_wait_zeroing(struct hstate *h, struct folio *folio)
> +{
> + if (!folio_test_hugetlb_zeroing(folio))
> + return;
> +
> + spin_lock_irq(&hugetlb_lock);
> +
> + wait_event_cmd(h->dqzero_wait[folio_nid(folio)],
> + !folio_test_hugetlb_zeroing(folio),
> + spin_unlock_irq(&hugetlb_lock),
> + spin_lock_irq(&hugetlb_lock));
> +
> + spin_unlock_irq(&hugetlb_lock);
> +}
> +
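Since nothing sets HPG_zeroing yet in this patch, a note on what the
waker side is expected to look like might help review. My assumption,
purely from the synchronization rules documented in hugetlb.h (the real
pre-zero thread presumably lands later in this series):

	spin_lock_irq(&hugetlb_lock);
	folio_set_hugetlb_zeroed(folio);
	folio_clear_hugetlb_zeroing(folio);
	spin_unlock_irq(&hugetlb_lock);
	wake_up(&h->dqzero_wait[folio_nid(folio)]);
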
nit:
Maybe hpage_wait_zeroing() above is a simple enough chunk to introduce
guard(); rough sketch below.
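Something like this (untested; assuming the manual unlock/relock inside
wait_event_cmd() stays as-is and the guard only replaces the trailing
spin_unlock_irq()):

	static void hpage_wait_zeroing(struct hstate *h, struct folio *folio)
	{
		if (!folio_test_hugetlb_zeroing(folio))
			return;

		guard(spinlock_irq)(&hugetlb_lock);

		wait_event_cmd(h->dqzero_wait[folio_nid(folio)],
			       !folio_test_hugetlb_zeroing(folio),
			       spin_unlock_irq(&hugetlb_lock),
			       spin_lock_irq(&hugetlb_lock));
	}
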
[...]
Regards
- Raghu