lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <cogzomaems6f4v2ihijwd3qoq5mb6g7lr37xrrwynbdmjjgy7y@5dytmftheuhz>
Date: Mon, 22 Dec 2025 15:00:35 +0000
From: Kiryl Shutsemau <kas@...nel.org>
To: Muchun Song <muchun.song@...ux.dev>
Cc: Oscar Salvador <osalvador@...e.de>, Mike Rapoport <rppt@...nel.org>, 
	Vlastimil Babka <vbabka@...e.cz>, Lorenzo Stoakes <lorenzo.stoakes@...cle.com>, 
	Zi Yan <ziy@...dia.com>, Baoquan He <bhe@...hat.com>, Michal Hocko <mhocko@...e.com>, 
	Johannes Weiner <hannes@...xchg.org>, Jonathan Corbet <corbet@....net>, kernel-team@...a.com, 
	linux-mm@...ck.org, linux-kernel@...r.kernel.org, linux-doc@...r.kernel.org, 
	Andrew Morton <akpm@...ux-foundation.org>, David Hildenbrand <david@...nel.org>, 
	Matthew Wilcox <willy@...radead.org>, Usama Arif <usamaarif642@...il.com>, 
	Frank van der Linden <fvdl@...gle.com>
Subject: Re: [PATCHv2 08/14] mm/hugetlb: Refactor code around vmemmap_walk

On Mon, Dec 22, 2025 at 01:54:47PM +0800, Muchun Song wrote:
> 
> 
> On 2025/12/18 23:09, Kiryl Shutsemau wrote:
> > To prepare for removing fake head pages, the vmemmap_walk code is being reworked.
> > 
> > The reuse_page and reuse_addr variables are being eliminated. There will
> > no longer be an expectation regarding the reuse address in relation to
> > the operated range. Instead, the caller will provide head and tail
> > vmemmap pages, along with the vmemmap_start address where the head page
> > is located.
> > 
> > Currently, vmemmap_head and vmemmap_tail are set to the same page, but
> > this will change in the future.
> > 
> > The only functional change is that __hugetlb_vmemmap_optimize_folio()
> > will abandon optimization if memory allocation fails.
> > 
> > Signed-off-by: Kiryl Shutsemau <kas@...nel.org>
> > ---
> >   mm/hugetlb_vmemmap.c | 198 ++++++++++++++++++-------------------------
> >   1 file changed, 83 insertions(+), 115 deletions(-)
> > 
> > diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
> > index ba0fb1b6a5a8..d18e7475cf95 100644
> > --- a/mm/hugetlb_vmemmap.c
> > +++ b/mm/hugetlb_vmemmap.c
> > @@ -24,8 +24,9 @@
> >    *
> >    * @remap_pte:		called for each lowest-level entry (PTE).
> >    * @nr_walked:		the number of walked pte.
> > - * @reuse_page:		the page which is reused for the tail vmemmap pages.
> > - * @reuse_addr:		the virtual address of the @reuse_page page.
> > + * @vmemmap_start:	the start of vmemmap range, where head page is located
> > + * @vmemmap_head:	the page to be installed as first in the vmemmap range
> > + * @vmemmap_tail:	the page to be installed as non-first in the vmemmap range
> >    * @vmemmap_pages:	the list head of the vmemmap pages that can be freed
> >    *			or is mapped from.
> >    * @flags:		used to modify behavior in vmemmap page table walking
> > @@ -34,11 +35,14 @@
> >   struct vmemmap_remap_walk {
> >   	void			(*remap_pte)(pte_t *pte, unsigned long addr,
> >   					     struct vmemmap_remap_walk *walk);
> > +
> >   	unsigned long		nr_walked;
> > -	struct page		*reuse_page;
> > -	unsigned long		reuse_addr;
> > +	unsigned long		vmemmap_start;
> > +	struct page		*vmemmap_head;
> > +	struct page		*vmemmap_tail;
> >   	struct list_head	*vmemmap_pages;
> > +
> >   /* Skip the TLB flush when we split the PMD */
> >   #define VMEMMAP_SPLIT_NO_TLB_FLUSH	BIT(0)
> >   /* Skip the TLB flush when we remap the PTE */
> > @@ -140,14 +144,7 @@ static int vmemmap_pte_entry(pte_t *pte, unsigned long addr,
> >   {
> >   	struct vmemmap_remap_walk *vmemmap_walk = walk->private;
> > -	/*
> > -	 * The reuse_page is found 'first' in page table walking before
> > -	 * starting remapping.
> > -	 */
> > -	if (!vmemmap_walk->reuse_page)
> > -		vmemmap_walk->reuse_page = pte_page(ptep_get(pte));
> > -	else
> > -		vmemmap_walk->remap_pte(pte, addr, vmemmap_walk);
> > +	vmemmap_walk->remap_pte(pte, addr, vmemmap_walk);
> >   	vmemmap_walk->nr_walked++;
> >   	return 0;
> > @@ -207,18 +204,12 @@ static void free_vmemmap_page_list(struct list_head *list)
> >   static void vmemmap_remap_pte(pte_t *pte, unsigned long addr,
> >   			      struct vmemmap_remap_walk *walk)
> >   {
> > -	/*
> > -	 * Remap the tail pages as read-only to catch illegal write operation
> > -	 * to the tail pages.
> > -	 */
> > -	pgprot_t pgprot = PAGE_KERNEL_RO;
> >   	struct page *page = pte_page(ptep_get(pte));
> >   	pte_t entry;
> >   	/* Remapping the head page requires r/w */
> > -	if (unlikely(addr == walk->reuse_addr)) {
> > -		pgprot = PAGE_KERNEL;
> > -		list_del(&walk->reuse_page->lru);
> > +	if (unlikely(addr == walk->vmemmap_start)) {
> > +		list_del(&walk->vmemmap_head->lru);
> >   		/*
> >   		 * Makes sure that preceding stores to the page contents from
> > @@ -226,9 +217,16 @@ static void vmemmap_remap_pte(pte_t *pte, unsigned long addr,
> >   		 * write.
> >   		 */
> >   		smp_wmb();
> > +
> > +		entry = mk_pte(walk->vmemmap_head, PAGE_KERNEL);
> > +	} else {
> > +		/*
> > +		 * Remap the tail pages as read-only to catch illegal write
> > +		 * operation to the tail pages.
> > +		 */
> > +		entry = mk_pte(walk->vmemmap_tail, PAGE_KERNEL_RO);
> >   	}
> > -	entry = mk_pte(walk->reuse_page, pgprot);
> >   	list_add(&page->lru, walk->vmemmap_pages);
> >   	set_pte_at(&init_mm, addr, pte, entry);
> >   }
> > @@ -255,16 +253,13 @@ static inline void reset_struct_pages(struct page *start)
> >   static void vmemmap_restore_pte(pte_t *pte, unsigned long addr,
> >   				struct vmemmap_remap_walk *walk)
> >   {
> > -	pgprot_t pgprot = PAGE_KERNEL;
> >   	struct page *page;
> >   	void *to;
> > -	BUG_ON(pte_page(ptep_get(pte)) != walk->reuse_page);
> > -
> >   	page = list_first_entry(walk->vmemmap_pages, struct page, lru);
> >   	list_del(&page->lru);
> >   	to = page_to_virt(page);
> > -	copy_page(to, (void *)walk->reuse_addr);
> > +	copy_page(to, (void *)walk->vmemmap_start);
> >   	reset_struct_pages(to);
> >   	/*
> > @@ -272,7 +267,7 @@ static void vmemmap_restore_pte(pte_t *pte, unsigned long addr,
> >   	 * before the set_pte_at() write.
> >   	 */
> >   	smp_wmb();
> > -	set_pte_at(&init_mm, addr, pte, mk_pte(page, pgprot));
> > +	set_pte_at(&init_mm, addr, pte, mk_pte(page, PAGE_KERNEL));
> >   }
> >   /**
> > @@ -282,33 +277,29 @@ static void vmemmap_restore_pte(pte_t *pte, unsigned long addr,
> >    *             to remap.
> >    * @end:       end address of the vmemmap virtual address range that we want to
> >    *             remap.
> > - * @reuse:     reuse address.
> > - *
> >    * Return: %0 on success, negative error code otherwise.
> >    */
> > -static int vmemmap_remap_split(unsigned long start, unsigned long end,
> > -			       unsigned long reuse)
> > +static int vmemmap_remap_split(unsigned long start, unsigned long end)
> >   {
> >   	struct vmemmap_remap_walk walk = {
> >   		.remap_pte	= NULL,
> > +		.vmemmap_start	= start,
> >   		.flags		= VMEMMAP_SPLIT_NO_TLB_FLUSH,
> >   	};
> > -	/* See the comment in the vmemmap_remap_free(). */
> > -	BUG_ON(start - reuse != PAGE_SIZE);
> > -
> > -	return vmemmap_remap_range(reuse, end, &walk);
> > +	return vmemmap_remap_range(start, end, &walk);
> >   }
> >   /**
> >    * vmemmap_remap_free - remap the vmemmap virtual address range [@start, @end)
> > - *			to the page which @reuse is mapped to, then free vmemmap
> > - *			which the range are mapped to.
> > + *			to use @vmemmap_head/tail, then free vmemmap which
> > + *			the range are mapped to.
> >    * @start:	start address of the vmemmap virtual address range that we want
> >    *		to remap.
> >    * @end:	end address of the vmemmap virtual address range that we want to
> >    *		remap.
> > - * @reuse:	reuse address.
> > + * @vmemmap_head: the page to be installed as first in the vmemmap range
> > + * @vmemmap_tail: the page to be installed as non-first in the vmemmap range
> >    * @vmemmap_pages: list to deposit vmemmap pages to be freed.  It is callers
> >    *		responsibility to free pages.
> >    * @flags:	modifications to vmemmap_remap_walk flags
> > @@ -316,69 +307,40 @@ static int vmemmap_remap_split(unsigned long start, unsigned long end,
> >    * Return: %0 on success, negative error code otherwise.
> >    */
> >   static int vmemmap_remap_free(unsigned long start, unsigned long end,
> > -			      unsigned long reuse,
> > +			      struct page *vmemmap_head,
> > +			      struct page *vmemmap_tail,
> >   			      struct list_head *vmemmap_pages,
> >   			      unsigned long flags)
> >   {
> >   	int ret;
> >   	struct vmemmap_remap_walk walk = {
> >   		.remap_pte	= vmemmap_remap_pte,
> > -		.reuse_addr	= reuse,
> > +		.vmemmap_start	= start,
> > +		.vmemmap_head	= vmemmap_head,
> > +		.vmemmap_tail	= vmemmap_tail,
> >   		.vmemmap_pages	= vmemmap_pages,
> >   		.flags		= flags,
> >   	};
> > -	int nid = page_to_nid((struct page *)reuse);
> > -	gfp_t gfp_mask = GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN;
> > +
> > +	ret = vmemmap_remap_range(start, end, &walk);
> > +	if (!ret || !walk.nr_walked)
> > +		return ret;
> > +
> > +	end = start + walk.nr_walked * PAGE_SIZE;
> >   	/*
> > -	 * Allocate a new head vmemmap page to avoid breaking a contiguous
> > -	 * block of struct page memory when freeing it back to page allocator
> > -	 * in free_vmemmap_page_list(). This will allow the likely contiguous
> > -	 * struct page backing memory to be kept contiguous and allowing for
> > -	 * more allocations of hugepages. Fallback to the currently
> > -	 * mapped head page in case should it fail to allocate.
> > +	 * vmemmap_pages contains pages from the previous vmemmap_remap_range()
> > +	 * call which failed.  These are pages which were removed from
> > +	 * the vmemmap. They will be restored in the following call.
> >   	 */
> > -	walk.reuse_page = alloc_pages_node(nid, gfp_mask, 0);
> > -	if (walk.reuse_page) {
> > -		copy_page(page_to_virt(walk.reuse_page),
> > -			  (void *)walk.reuse_addr);
> > -		list_add(&walk.reuse_page->lru, vmemmap_pages);
> > -		memmap_pages_add(1);
> > -	}
> > +	walk = (struct vmemmap_remap_walk) {
> > +		.remap_pte	= vmemmap_restore_pte,
> > +		.vmemmap_start	= start,
> > +		.vmemmap_pages	= vmemmap_pages,
> > +		.flags		= 0,
> > +	};
> > -	/*
> > -	 * In order to make remapping routine most efficient for the huge pages,
> > -	 * the routine of vmemmap page table walking has the following rules
> > -	 * (see more details from the vmemmap_pte_range()):
> > -	 *
> > -	 * - The range [@start, @end) and the range [@reuse, @reuse + PAGE_SIZE)
> > -	 *   should be continuous.
> > -	 * - The @reuse address is part of the range [@reuse, @end) that we are
> > -	 *   walking which is passed to vmemmap_remap_range().
> > -	 * - The @reuse address is the first in the complete range.
> > -	 *
> > -	 * So we need to make sure that @start and @reuse meet the above rules.
> > -	 */
> > -	BUG_ON(start - reuse != PAGE_SIZE);
> > -
> > -	ret = vmemmap_remap_range(reuse, end, &walk);
> > -	if (ret && walk.nr_walked) {
> > -		end = reuse + walk.nr_walked * PAGE_SIZE;
> > -		/*
> > -		 * vmemmap_pages contains pages from the previous
> > -		 * vmemmap_remap_range call which failed.  These
> > -		 * are pages which were removed from the vmemmap.
> > -		 * They will be restored in the following call.
> > -		 */
> > -		walk = (struct vmemmap_remap_walk) {
> > -			.remap_pte	= vmemmap_restore_pte,
> > -			.reuse_addr	= reuse,
> > -			.vmemmap_pages	= vmemmap_pages,
> > -			.flags		= 0,
> > -		};
> > -
> > -		vmemmap_remap_range(reuse, end, &walk);
> > -	}
> > +	vmemmap_remap_range(start + PAGE_SIZE, end, &walk);
> 
> The reason we previously passed the "start" address
> was to perform a TLB flush within that address range.
> So the start address is still necessary.

Good catch.

> >   	return ret;
> >   }
> > @@ -415,29 +377,27 @@ static int alloc_vmemmap_page_list(unsigned long start, unsigned long end,
> >    *		to remap.
> >    * @end:	end address of the vmemmap virtual address range that we want to
> >    *		remap.
> > - * @reuse:	reuse address.
> >    * @flags:	modifications to vmemmap_remap_walk flags
> >    *
> >    * Return: %0 on success, negative error code otherwise.
> >    */
> >   static int vmemmap_remap_alloc(unsigned long start, unsigned long end,
> > -			       unsigned long reuse, unsigned long flags)
> > +			       unsigned long flags)
> >   {
> >   	LIST_HEAD(vmemmap_pages);
> >   	struct vmemmap_remap_walk walk = {
> >   		.remap_pte	= vmemmap_restore_pte,
> > -		.reuse_addr	= reuse,
> > +		.vmemmap_start	= start,
> >   		.vmemmap_pages	= &vmemmap_pages,
> >   		.flags		= flags,
> >   	};
> > -	/* See the comment in the vmemmap_remap_free(). */
> > -	BUG_ON(start - reuse != PAGE_SIZE);
> > +	start += HUGETLB_VMEMMAP_RESERVE_SIZE;
> >   	if (alloc_vmemmap_page_list(start, end, &vmemmap_pages))
> >   		return -ENOMEM;
> > -	return vmemmap_remap_range(reuse, end, &walk);
> > +	return vmemmap_remap_range(start, end, &walk);
> >   }
> >   DEFINE_STATIC_KEY_FALSE(hugetlb_optimize_vmemmap_key);
> > @@ -454,8 +414,7 @@ static int __hugetlb_vmemmap_restore_folio(const struct hstate *h,
> >   					   struct folio *folio, unsigned long flags)
> >   {
> >   	int ret;
> > -	unsigned long vmemmap_start = (unsigned long)&folio->page, vmemmap_end;
> > -	unsigned long vmemmap_reuse;
> > +	unsigned long vmemmap_start, vmemmap_end;
> >   	VM_WARN_ON_ONCE_FOLIO(!folio_test_hugetlb(folio), folio);
> >   	VM_WARN_ON_ONCE_FOLIO(folio_ref_count(folio), folio);
> > @@ -466,18 +425,16 @@ static int __hugetlb_vmemmap_restore_folio(const struct hstate *h,
> >   	if (flags & VMEMMAP_SYNCHRONIZE_RCU)
> >   		synchronize_rcu();
> > +	vmemmap_start	= (unsigned long)folio;
> >   	vmemmap_end	= vmemmap_start + hugetlb_vmemmap_size(h);
> > -	vmemmap_reuse	= vmemmap_start;
> > -	vmemmap_start	+= HUGETLB_VMEMMAP_RESERVE_SIZE;
> >   	/*
> >   	 * The pages which the vmemmap virtual address range [@vmemmap_start,
> > -	 * @vmemmap_end) are mapped to are freed to the buddy allocator, and
> > -	 * the range is mapped to the page which @vmemmap_reuse is mapped to.
> > +	 * @vmemmap_end) are mapped to are freed to the buddy allocator.
> >   	 * When a HugeTLB page is freed to the buddy allocator, previously
> >   	 * discarded vmemmap pages must be allocated and remapping.
> >   	 */
> > -	ret = vmemmap_remap_alloc(vmemmap_start, vmemmap_end, vmemmap_reuse, flags);
> > +	ret = vmemmap_remap_alloc(vmemmap_start, vmemmap_end, flags);
> >   	if (!ret) {
> >   		folio_clear_hugetlb_vmemmap_optimized(folio);
> >   		static_branch_dec(&hugetlb_optimize_vmemmap_key);
> > @@ -565,9 +522,9 @@ static int __hugetlb_vmemmap_optimize_folio(const struct hstate *h,
> >   					    struct list_head *vmemmap_pages,
> >   					    unsigned long flags)
> >   {
> > -	int ret = 0;
> > -	unsigned long vmemmap_start = (unsigned long)&folio->page, vmemmap_end;
> > -	unsigned long vmemmap_reuse;
> > +	unsigned long vmemmap_start, vmemmap_end;
> > +	struct page *vmemmap_head, *vmemmap_tail;
> > +	int nid, ret = 0;
> >   	VM_WARN_ON_ONCE_FOLIO(!folio_test_hugetlb(folio), folio);
> >   	VM_WARN_ON_ONCE_FOLIO(folio_ref_count(folio), folio);
> > @@ -592,18 +549,31 @@ static int __hugetlb_vmemmap_optimize_folio(const struct hstate *h,
> >   	 */
> >   	folio_set_hugetlb_vmemmap_optimized(folio);
> > +	nid = folio_nid(folio);
> > +	vmemmap_head = alloc_pages_node(nid, GFP_KERNEL, 0);
> 
> Why did you choose to change the gfp mask (previously
> GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN)?

Because I removed the fallback for allocation failure. Trying harder and
warning if the allocation fails is justified without the fallback path.

> > +
> > +	if (!vmemmap_head) {
> > +		ret = -ENOMEM;
> 
> Why did you choose to change the allocation-failure
> behavior? Replacing the head page isn’t mandatory;
> it’s only nice-to-have.

Keeping the fallback would require extracting the vmemmap_head page from
the page tables, which I found to be a useless complication that will
never get executed and therefore never get tested.

If we fail to allocate a single page here, we are in OOM territory. It
is not the time to play with huge page allocation.

> > +		goto out;
> > +	}
> > +
> > +	copy_page(page_to_virt(vmemmap_head), folio);
> > +	list_add(&vmemmap_head->lru, vmemmap_pages);
> > +	memmap_pages_add(1);
> > +
> > +	vmemmap_tail	= vmemmap_head;
> > +	vmemmap_start	= (unsigned long)folio;
> >   	vmemmap_end	= vmemmap_start + hugetlb_vmemmap_size(h);
> > -	vmemmap_reuse	= vmemmap_start;
> > -	vmemmap_start	+= HUGETLB_VMEMMAP_RESERVE_SIZE;
> >   	/*
> > -	 * Remap the vmemmap virtual address range [@vmemmap_start, @vmemmap_end)
> > -	 * to the page which @vmemmap_reuse is mapped to.  Add pages previously
> > -	 * mapping the range to vmemmap_pages list so that they can be freed by
> > -	 * the caller.
> > +	 * Remap the vmemmap virtual address range [@vmemmap_start, @vmemmap_end).
> > +	 * Add pages previously mapping the range to vmemmap_pages list so that
> > +	 * they can be freed by the caller.
> >   	 */
> > -	ret = vmemmap_remap_free(vmemmap_start, vmemmap_end, vmemmap_reuse,
> > +	ret = vmemmap_remap_free(vmemmap_start, vmemmap_end,
> > +				 vmemmap_head, vmemmap_tail,
> >   				 vmemmap_pages, flags);
> > +out:
> >   	if (ret) {
> >   		static_branch_dec(&hugetlb_optimize_vmemmap_key);
> >   		folio_clear_hugetlb_vmemmap_optimized(folio);
> > @@ -632,21 +602,19 @@ void hugetlb_vmemmap_optimize_folio(const struct hstate *h, struct folio *folio)
> >   static int hugetlb_vmemmap_split_folio(const struct hstate *h, struct folio *folio)
> >   {
> > -	unsigned long vmemmap_start = (unsigned long)&folio->page, vmemmap_end;
> > -	unsigned long vmemmap_reuse;
> > +	unsigned long vmemmap_start, vmemmap_end;
> >   	if (!vmemmap_should_optimize_folio(h, folio))
> >   		return 0;
> > +	vmemmap_start	= (unsigned long)folio;
> >   	vmemmap_end	= vmemmap_start + hugetlb_vmemmap_size(h);
> > -	vmemmap_reuse	= vmemmap_start;
> > -	vmemmap_start	+= HUGETLB_VMEMMAP_RESERVE_SIZE;
> >   	/*
> >   	 * Split PMDs on the vmemmap virtual address range [@vmemmap_start,
> >   	 * @vmemmap_end]
> >   	 */
> > -	return vmemmap_remap_split(vmemmap_start, vmemmap_end, vmemmap_reuse);
> > +	return vmemmap_remap_split(vmemmap_start, vmemmap_end);
> >   }
> >   static void __hugetlb_vmemmap_optimize_folios(struct hstate *h,
> 

-- 
  Kiryl Shutsemau / Kirill A. Shutemov

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ