lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite for Android: free password hash cracker in your pocket
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <3ed10ea4-347f-4d01-82aa-1d92d2804ced@gmail.com>
Date: Sat, 6 Dec 2025 16:42:30 +0000
From: Usama Arif <usamaarif642@...il.com>
To: Kiryl Shutsemau <kas@...nel.org>,
 Andrew Morton <akpm@...ux-foundation.org>,
 Muchun Song <muchun.song@...ux.dev>
Cc: David Hildenbrand <david@...nel.org>, Oscar Salvador <osalvador@...e.de>,
 Mike Rapoport <rppt@...nel.org>, Vlastimil Babka <vbabka@...e.cz>,
 Lorenzo Stoakes <lorenzo.stoakes@...cle.com>,
 Matthew Wilcox <willy@...radead.org>, Zi Yan <ziy@...dia.com>,
 Baoquan He <bhe@...hat.com>, Michal Hocko <mhocko@...e.com>,
 Johannes Weiner <hannes@...xchg.org>, Jonathan Corbet <corbet@....net>,
 kernel-team@...a.com, linux-mm@...ck.org, linux-kernel@...r.kernel.org,
 linux-doc@...r.kernel.org
Subject: Re: [PATCH 05/11] mm/hugetlb: Refactor code around vmemmap_walk



On 05/12/2025 19:43, Kiryl Shutsemau wrote:
> To prepare for removing fake head pages, the vmemmap_walk code is being reworked.
> 
> The reuse_page and reuse_addr variables are being eliminated. There will
> no longer be an expectation regarding the reuse address in relation to
> the operated range. Instead, the caller will provide head and tail
> vmemmap pages, along with the vmemmap_start address where the head page
> is located.
> 
> Currently, vmemmap_head and vmemmap_tail are set to the same page, but
> this will change in the future.
> 
> The only functional change is that __hugetlb_vmemmap_optimize_folio()
> will abandon optimization if memory allocation fails.
> 
> Signed-off-by: Kiryl Shutsemau <kas@...nel.org>
> ---
>  mm/hugetlb_vmemmap.c | 184 ++++++++++++++++++-------------------------
>  1 file changed, 77 insertions(+), 107 deletions(-)
> 
> diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
> index ba0fb1b6a5a8..f5ee499b8563 100644
> --- a/mm/hugetlb_vmemmap.c
> +++ b/mm/hugetlb_vmemmap.c
> @@ -24,8 +24,9 @@
>   *
>   * @remap_pte:		called for each lowest-level entry (PTE).
>   * @nr_walked:		the number of walked pte.
> - * @reuse_page:		the page which is reused for the tail vmemmap pages.
> - * @reuse_addr:		the virtual address of the @reuse_page page.
> + * @vmemmap_start:	the start of vmemmap range, where head page is located
> + * @vmemmap_head:	the page to be installed as first in the vmemmap range
> + * @vmemmap_tail:	the page to be installed as non-first in the vmemmap range
>   * @vmemmap_pages:	the list head of the vmemmap pages that can be freed
>   *			or is mapped from.
>   * @flags:		used to modify behavior in vmemmap page table walking
> @@ -34,11 +35,14 @@
>  struct vmemmap_remap_walk {
>  	void			(*remap_pte)(pte_t *pte, unsigned long addr,
>  					     struct vmemmap_remap_walk *walk);
> +
>  	unsigned long		nr_walked;
> -	struct page		*reuse_page;
> -	unsigned long		reuse_addr;
> +	unsigned long		vmemmap_start;
> +	struct page		*vmemmap_head;
> +	struct page		*vmemmap_tail;
>  	struct list_head	*vmemmap_pages;
>  
> +
>  /* Skip the TLB flush when we split the PMD */
>  #define VMEMMAP_SPLIT_NO_TLB_FLUSH	BIT(0)
>  /* Skip the TLB flush when we remap the PTE */
> @@ -140,14 +144,7 @@ static int vmemmap_pte_entry(pte_t *pte, unsigned long addr,
>  {
>  	struct vmemmap_remap_walk *vmemmap_walk = walk->private;
>  
> -	/*
> -	 * The reuse_page is found 'first' in page table walking before
> -	 * starting remapping.
> -	 */
> -	if (!vmemmap_walk->reuse_page)
> -		vmemmap_walk->reuse_page = pte_page(ptep_get(pte));
> -	else
> -		vmemmap_walk->remap_pte(pte, addr, vmemmap_walk);
> +	vmemmap_walk->remap_pte(pte, addr, vmemmap_walk);
>  	vmemmap_walk->nr_walked++;
>  
>  	return 0;
> @@ -207,18 +204,12 @@ static void free_vmemmap_page_list(struct list_head *list)
>  static void vmemmap_remap_pte(pte_t *pte, unsigned long addr,
>  			      struct vmemmap_remap_walk *walk)
>  {
> -	/*
> -	 * Remap the tail pages as read-only to catch illegal write operation
> -	 * to the tail pages.
> -	 */
> -	pgprot_t pgprot = PAGE_KERNEL_RO;
>  	struct page *page = pte_page(ptep_get(pte));
>  	pte_t entry;
>  
>  	/* Remapping the head page requires r/w */
> -	if (unlikely(addr == walk->reuse_addr)) {
> -		pgprot = PAGE_KERNEL;
> -		list_del(&walk->reuse_page->lru);
> +	if (unlikely(addr == walk->vmemmap_start)) {
> +		list_del(&walk->vmemmap_head->lru);
>  
>  		/*
>  		 * Makes sure that preceding stores to the page contents from
> @@ -226,9 +217,16 @@ static void vmemmap_remap_pte(pte_t *pte, unsigned long addr,
>  		 * write.
>  		 */
>  		smp_wmb();
> +
> +		entry = mk_pte(walk->vmemmap_head, PAGE_KERNEL);
> +	} else {
> +		/*
> +		 * Remap the tail pages as read-only to catch illegal write
> +		 * operation to the tail pages.
> +		 */
> +		entry = mk_pte(walk->vmemmap_tail, PAGE_KERNEL_RO);
>  	}
>  
> -	entry = mk_pte(walk->reuse_page, pgprot);
>  	list_add(&page->lru, walk->vmemmap_pages);
>  	set_pte_at(&init_mm, addr, pte, entry);
>  }
> @@ -255,16 +253,13 @@ static inline void reset_struct_pages(struct page *start)
>  static void vmemmap_restore_pte(pte_t *pte, unsigned long addr,
>  				struct vmemmap_remap_walk *walk)
>  {
> -	pgprot_t pgprot = PAGE_KERNEL;
>  	struct page *page;
>  	void *to;
>  
> -	BUG_ON(pte_page(ptep_get(pte)) != walk->reuse_page);
> -
>  	page = list_first_entry(walk->vmemmap_pages, struct page, lru);
>  	list_del(&page->lru);
>  	to = page_to_virt(page);
> -	copy_page(to, (void *)walk->reuse_addr);
> +	copy_page(to, (void *)walk->vmemmap_start);
>  	reset_struct_pages(to);
>  
>  	/*
> @@ -272,7 +267,7 @@ static void vmemmap_restore_pte(pte_t *pte, unsigned long addr,
>  	 * before the set_pte_at() write.
>  	 */
>  	smp_wmb();
> -	set_pte_at(&init_mm, addr, pte, mk_pte(page, pgprot));
> +	set_pte_at(&init_mm, addr, pte, mk_pte(page, PAGE_KERNEL));
>  }
>  
>  /**
> @@ -282,22 +277,17 @@ static void vmemmap_restore_pte(pte_t *pte, unsigned long addr,
>   *             to remap.
>   * @end:       end address of the vmemmap virtual address range that we want to
>   *             remap.
> - * @reuse:     reuse address.
> - *
>   * Return: %0 on success, negative error code otherwise.
>   */
> -static int vmemmap_remap_split(unsigned long start, unsigned long end,
> -			       unsigned long reuse)
> +static int vmemmap_remap_split(unsigned long start, unsigned long end)
>  {
>  	struct vmemmap_remap_walk walk = {
>  		.remap_pte	= NULL,
> +		.vmemmap_start	= start,
>  		.flags		= VMEMMAP_SPLIT_NO_TLB_FLUSH,
>  	};
>  
> -	/* See the comment in the vmemmap_remap_free(). */
> -	BUG_ON(start - reuse != PAGE_SIZE);
> -
> -	return vmemmap_remap_range(reuse, end, &walk);
> +	return vmemmap_remap_range(start, end, &walk);
>  }
>  
>  /**
> @@ -308,7 +298,8 @@ static int vmemmap_remap_split(unsigned long start, unsigned long end,
>   *		to remap.
>   * @end:	end address of the vmemmap virtual address range that we want to
>   *		remap.
> - * @reuse:	reuse address.
> + * @vmemmap_head: the page to be installed as first in the vmemmap range
> + * @vmemmap_tail: the page to be installed as non-first in the vmemmap range
>   * @vmemmap_pages: list to deposit vmemmap pages to be freed.  It is callers
>   *		responsibility to free pages.
>   * @flags:	modifications to vmemmap_remap_walk flags
> @@ -316,69 +307,40 @@ static int vmemmap_remap_split(unsigned long start, unsigned long end,
>   * Return: %0 on success, negative error code otherwise.
>   */
>  static int vmemmap_remap_free(unsigned long start, unsigned long end,
> -			      unsigned long reuse,
> +			      struct page *vmemmap_head,
> +			      struct page *vmemmap_tail,
>  			      struct list_head *vmemmap_pages,
>  			      unsigned long flags)

The doc comment above vmemmap_remap_free() needs to be fixed, as it mentions reuse.
>  {
>  	int ret;
>  	struct vmemmap_remap_walk walk = {
>  		.remap_pte	= vmemmap_remap_pte,
> -		.reuse_addr	= reuse,
> +		.vmemmap_start	= start,
> +		.vmemmap_head	= vmemmap_head,
> +		.vmemmap_tail	= vmemmap_tail,
>  		.vmemmap_pages	= vmemmap_pages,
>  		.flags		= flags,
>  	};
> -	int nid = page_to_nid((struct page *)reuse);
> -	gfp_t gfp_mask = GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN;
> +
> +	ret = vmemmap_remap_range(start, end, &walk);
> +	if (!ret || !walk.nr_walked)
> +		return ret;
> +
> +	end = start + walk.nr_walked * PAGE_SIZE;
>  
>  	/*
> -	 * Allocate a new head vmemmap page to avoid breaking a contiguous
> -	 * block of struct page memory when freeing it back to page allocator
> -	 * in free_vmemmap_page_list(). This will allow the likely contiguous
> -	 * struct page backing memory to be kept contiguous and allowing for
> -	 * more allocations of hugepages. Fallback to the currently
> -	 * mapped head page in case should it fail to allocate.
> +	 * vmemmap_pages contains pages from the previous vmemmap_remap_range()
> +	 * call which failed.  These are pages which were removed from
> +	 * the vmemmap. They will be restored in the following call.
>  	 */
> -	walk.reuse_page = alloc_pages_node(nid, gfp_mask, 0);
> -	if (walk.reuse_page) {
> -		copy_page(page_to_virt(walk.reuse_page),
> -			  (void *)walk.reuse_addr);
> -		list_add(&walk.reuse_page->lru, vmemmap_pages);
> -		memmap_pages_add(1);
> -	}
> +	walk = (struct vmemmap_remap_walk) {
> +		.remap_pte	= vmemmap_restore_pte,
> +		.vmemmap_start	= start,
> +		.vmemmap_pages	= vmemmap_pages,
> +		.flags		= 0,
> +	};
>  
> -	/*
> -	 * In order to make remapping routine most efficient for the huge pages,
> -	 * the routine of vmemmap page table walking has the following rules
> -	 * (see more details from the vmemmap_pte_range()):
> -	 *
> -	 * - The range [@start, @end) and the range [@reuse, @reuse + PAGE_SIZE)
> -	 *   should be continuous.
> -	 * - The @reuse address is part of the range [@reuse, @end) that we are
> -	 *   walking which is passed to vmemmap_remap_range().
> -	 * - The @reuse address is the first in the complete range.
> -	 *
> -	 * So we need to make sure that @start and @reuse meet the above rules.
> -	 */
> -	BUG_ON(start - reuse != PAGE_SIZE);
> -
> -	ret = vmemmap_remap_range(reuse, end, &walk);
> -	if (ret && walk.nr_walked) {
> -		end = reuse + walk.nr_walked * PAGE_SIZE;
> -		/*
> -		 * vmemmap_pages contains pages from the previous
> -		 * vmemmap_remap_range call which failed.  These
> -		 * are pages which were removed from the vmemmap.
> -		 * They will be restored in the following call.
> -		 */
> -		walk = (struct vmemmap_remap_walk) {
> -			.remap_pte	= vmemmap_restore_pte,
> -			.reuse_addr	= reuse,
> -			.vmemmap_pages	= vmemmap_pages,
> -			.flags		= 0,
> -		};
> -
> -		vmemmap_remap_range(reuse, end, &walk);
> -	}
> +	vmemmap_remap_range(start + PAGE_SIZE, end, &walk);


I think this should be vmemmap_remap_range(start, end, &walk)? Otherwise, if start failed to remap,
you won't restore it?

>  
>  	return ret;
>  }
> @@ -415,29 +377,27 @@ static int alloc_vmemmap_page_list(unsigned long start, unsigned long end,
>   *		to remap.
>   * @end:	end address of the vmemmap virtual address range that we want to
>   *		remap.
> - * @reuse:	reuse address.
>   * @flags:	modifications to vmemmap_remap_walk flags
>   *
>   * Return: %0 on success, negative error code otherwise.
>   */
>  static int vmemmap_remap_alloc(unsigned long start, unsigned long end,
> -			       unsigned long reuse, unsigned long flags)
> +			       unsigned long flags)
>  {
>  	LIST_HEAD(vmemmap_pages);
>  	struct vmemmap_remap_walk walk = {
>  		.remap_pte	= vmemmap_restore_pte,
> -		.reuse_addr	= reuse,
> +		.vmemmap_start	= start,
>  		.vmemmap_pages	= &vmemmap_pages,
>  		.flags		= flags,
>  	};
>  
> -	/* See the comment in the vmemmap_remap_free(). */
> -	BUG_ON(start - reuse != PAGE_SIZE);
> +	start += HUGETLB_VMEMMAP_RESERVE_SIZE;
>  
>  	if (alloc_vmemmap_page_list(start, end, &vmemmap_pages))
>  		return -ENOMEM;
>  
> -	return vmemmap_remap_range(reuse, end, &walk);
> +	return vmemmap_remap_range(start, end, &walk);
>  }
>  
>  DEFINE_STATIC_KEY_FALSE(hugetlb_optimize_vmemmap_key);
> @@ -454,8 +414,7 @@ static int __hugetlb_vmemmap_restore_folio(const struct hstate *h,
>  					   struct folio *folio, unsigned long flags)
>  {
>  	int ret;
> -	unsigned long vmemmap_start = (unsigned long)&folio->page, vmemmap_end;
> -	unsigned long vmemmap_reuse;
> +	unsigned long vmemmap_start, vmemmap_end;
>  
>  	VM_WARN_ON_ONCE_FOLIO(!folio_test_hugetlb(folio), folio);
>  	VM_WARN_ON_ONCE_FOLIO(folio_ref_count(folio), folio);
> @@ -466,9 +425,8 @@ static int __hugetlb_vmemmap_restore_folio(const struct hstate *h,
>  	if (flags & VMEMMAP_SYNCHRONIZE_RCU)
>  		synchronize_rcu();
>  
> +	vmemmap_start	= (unsigned long)folio;
>  	vmemmap_end	= vmemmap_start + hugetlb_vmemmap_size(h);
> -	vmemmap_reuse	= vmemmap_start;
> -	vmemmap_start	+= HUGETLB_VMEMMAP_RESERVE_SIZE;
>  
>  	/*
>  	 * The pages which the vmemmap virtual address range [@vmemmap_start,
> @@ -477,7 +435,7 @@ static int __hugetlb_vmemmap_restore_folio(const struct hstate *h,
>  	 * When a HugeTLB page is freed to the buddy allocator, previously
>  	 * discarded vmemmap pages must be allocated and remapping.
>  	 */
> -	ret = vmemmap_remap_alloc(vmemmap_start, vmemmap_end, vmemmap_reuse, flags);
> +	ret = vmemmap_remap_alloc(vmemmap_start, vmemmap_end, flags);
>  	if (!ret) {
>  		folio_clear_hugetlb_vmemmap_optimized(folio);
>  		static_branch_dec(&hugetlb_optimize_vmemmap_key);
> @@ -565,9 +523,9 @@ static int __hugetlb_vmemmap_optimize_folio(const struct hstate *h,
>  					    struct list_head *vmemmap_pages,
>  					    unsigned long flags)
>  {
> -	int ret = 0;
> -	unsigned long vmemmap_start = (unsigned long)&folio->page, vmemmap_end;
> -	unsigned long vmemmap_reuse;
> +	unsigned long vmemmap_start, vmemmap_end;
> +	struct page *vmemmap_head, *vmemmap_tail;
> +	int nid, ret = 0;
>  
>  	VM_WARN_ON_ONCE_FOLIO(!folio_test_hugetlb(folio), folio);
>  	VM_WARN_ON_ONCE_FOLIO(folio_ref_count(folio), folio);
> @@ -592,9 +550,21 @@ static int __hugetlb_vmemmap_optimize_folio(const struct hstate *h,
>  	 */
>  	folio_set_hugetlb_vmemmap_optimized(folio);
>  
> +	nid = folio_nid(folio);
> +	vmemmap_head = alloc_pages_node(nid, GFP_KERNEL, 0);

Should we add __GFP_NORETRY | __GFP_NOWARN here? It was there in the previous code. I am guessing
that it was there in the previous code as it's an optimization, and if it fails it's not a big issue.


> +
> +	if (!vmemmap_head) {
> +		ret = -ENOMEM;
> +		goto out;
> +	}
> +
> +	copy_page(page_to_virt(vmemmap_head), folio);
> +	list_add(&vmemmap_head->lru, vmemmap_pages);
> +	memmap_pages_add(1);
> +
> +	vmemmap_tail	= vmemmap_head;
> +	vmemmap_start	= (unsigned long)folio;
>  	vmemmap_end	= vmemmap_start + hugetlb_vmemmap_size(h);
> -	vmemmap_reuse	= vmemmap_start;
> -	vmemmap_start	+= HUGETLB_VMEMMAP_RESERVE_SIZE;
>  
>  	/*
>  	 * Remap the vmemmap virtual address range [@vmemmap_start, @vmemmap_end)
> @@ -602,8 +572,10 @@ static int __hugetlb_vmemmap_optimize_folio(const struct hstate *h,
>  	 * mapping the range to vmemmap_pages list so that they can be freed by
>  	 * the caller.
>  	 */
> -	ret = vmemmap_remap_free(vmemmap_start, vmemmap_end, vmemmap_reuse,
> +	ret = vmemmap_remap_free(vmemmap_start, vmemmap_end,
> +				 vmemmap_head, vmemmap_tail,
>  				 vmemmap_pages, flags);

The doc above this also mentions vmemmap_reuse.

> +out:
>  	if (ret) {
>  		static_branch_dec(&hugetlb_optimize_vmemmap_key);
>  		folio_clear_hugetlb_vmemmap_optimized(folio);
> @@ -632,21 +604,19 @@ void hugetlb_vmemmap_optimize_folio(const struct hstate *h, struct folio *folio)
>  
>  static int hugetlb_vmemmap_split_folio(const struct hstate *h, struct folio *folio)
>  {
> -	unsigned long vmemmap_start = (unsigned long)&folio->page, vmemmap_end;
> -	unsigned long vmemmap_reuse;
> +	unsigned long vmemmap_start, vmemmap_end;
>  
>  	if (!vmemmap_should_optimize_folio(h, folio))
>  		return 0;
>  
> +	vmemmap_start	= (unsigned long)folio;
>  	vmemmap_end	= vmemmap_start + hugetlb_vmemmap_size(h);
> -	vmemmap_reuse	= vmemmap_start;
> -	vmemmap_start	+= HUGETLB_VMEMMAP_RESERVE_SIZE;
>  
>  	/*
>  	 * Split PMDs on the vmemmap virtual address range [@vmemmap_start,
>  	 * @vmemmap_end]
>  	 */
> -	return vmemmap_remap_split(vmemmap_start, vmemmap_end, vmemmap_reuse);
> +	return vmemmap_remap_split(vmemmap_start, vmemmap_end);
>  }
>  
>  static void __hugetlb_vmemmap_optimize_folios(struct hstate *h,


Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ