Message-Id: <DC02711B-70F3-47B8-B8BE-1C0F3FE43780@linux.dev>
Date: Mon, 22 Dec 2025 23:11:46 +0800
From: Muchun Song <muchun.song@...ux.dev>
To: Kiryl Shutsemau <kas@...nel.org>
Cc: Oscar Salvador <osalvador@...e.de>, Mike Rapoport <rppt@...nel.org>,
Vlastimil Babka <vbabka@...e.cz>,
Lorenzo Stoakes <lorenzo.stoakes@...cle.com>, Zi Yan <ziy@...dia.com>,
Baoquan He <bhe@...hat.com>, Michal Hocko <mhocko@...e.com>,
Johannes Weiner <hannes@...xchg.org>, Jonathan Corbet <corbet@....net>,
kernel-team@...a.com, linux-mm@...ck.org, linux-kernel@...r.kernel.org,
linux-doc@...r.kernel.org, Andrew Morton <akpm@...ux-foundation.org>,
David Hildenbrand <david@...nel.org>, Matthew Wilcox <willy@...radead.org>,
Usama Arif <usamaarif642@...il.com>,
Frank van der Linden <fvdl@...gle.com>
Subject: Re: [PATCHv2 08/14] mm/hugetlb: Refactor code around vmemmap_walk
> On Dec 22, 2025, at 23:03, Kiryl Shutsemau <kas@...nel.org> wrote:
>
> On Mon, Dec 22, 2025 at 01:54:47PM +0800, Muchun Song wrote:
>>
>>
>>> On 2025/12/18 23:09, Kiryl Shutsemau wrote:
>>> To prepare for removing fake head pages, the vmemmap_walk code is being reworked.
>>>
>>> The reuse_page and reuse_addr variables are being eliminated. There will
>>> no longer be an expectation regarding the reuse address in relation to
>>> the operated range. Instead, the caller will provide head and tail
>>> vmemmap pages, along with the vmemmap_start address where the head page
>>> is located.
>>>
>>> Currently, vmemmap_head and vmemmap_tail are set to the same page, but
>>> this will change in the future.
>>>
>>> The only functional change is that __hugetlb_vmemmap_optimize_folio()
>>> will abandon optimization if memory allocation fails.
>>>
>>> Signed-off-by: Kiryl Shutsemau <kas@...nel.org>
>>> ---
>>> mm/hugetlb_vmemmap.c | 198 ++++++++++++++++++-------------------------
>>> 1 file changed, 83 insertions(+), 115 deletions(-)
>>>
>>> diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
>>> index ba0fb1b6a5a8..d18e7475cf95 100644
>>> --- a/mm/hugetlb_vmemmap.c
>>> +++ b/mm/hugetlb_vmemmap.c
>>> @@ -24,8 +24,9 @@
>>> *
>>> * @remap_pte: called for each lowest-level entry (PTE).
>>> * @nr_walked: the number of walked pte.
>>> - * @reuse_page: the page which is reused for the tail vmemmap pages.
>>> - * @reuse_addr: the virtual address of the @reuse_page page.
>>> + * @vmemmap_start: the start of vmemmap range, where head page is located
>>> + * @vmemmap_head: the page to be installed as first in the vmemmap range
>>> + * @vmemmap_tail: the page to be installed as non-first in the vmemmap range
>>> * @vmemmap_pages: the list head of the vmemmap pages that can be freed
>>> * or is mapped from.
>>> * @flags: used to modify behavior in vmemmap page table walking
>>> @@ -34,11 +35,14 @@
>>> struct vmemmap_remap_walk {
>>> void (*remap_pte)(pte_t *pte, unsigned long addr,
>>> struct vmemmap_remap_walk *walk);
>>> +
>>> unsigned long nr_walked;
>>> - struct page *reuse_page;
>>> - unsigned long reuse_addr;
>>> + unsigned long vmemmap_start;
>>> + struct page *vmemmap_head;
>>> + struct page *vmemmap_tail;
>>> struct list_head *vmemmap_pages;
>>> +
>>> /* Skip the TLB flush when we split the PMD */
>>> #define VMEMMAP_SPLIT_NO_TLB_FLUSH BIT(0)
>>> /* Skip the TLB flush when we remap the PTE */
>>> @@ -140,14 +144,7 @@ static int vmemmap_pte_entry(pte_t *pte, unsigned long addr,
>>> {
>>> struct vmemmap_remap_walk *vmemmap_walk = walk->private;
>>> - /*
>>> - * The reuse_page is found 'first' in page table walking before
>>> - * starting remapping.
>>> - */
>>> - if (!vmemmap_walk->reuse_page)
>>> - vmemmap_walk->reuse_page = pte_page(ptep_get(pte));
>>> - else
>>> - vmemmap_walk->remap_pte(pte, addr, vmemmap_walk);
>>> + vmemmap_walk->remap_pte(pte, addr, vmemmap_walk);
>>> vmemmap_walk->nr_walked++;
>>> return 0;
>>> @@ -207,18 +204,12 @@ static void free_vmemmap_page_list(struct list_head *list)
>>> static void vmemmap_remap_pte(pte_t *pte, unsigned long addr,
>>> struct vmemmap_remap_walk *walk)
>>> {
>>> - /*
>>> - * Remap the tail pages as read-only to catch illegal write operation
>>> - * to the tail pages.
>>> - */
>>> - pgprot_t pgprot = PAGE_KERNEL_RO;
>>> struct page *page = pte_page(ptep_get(pte));
>>> pte_t entry;
>>> /* Remapping the head page requires r/w */
>>> - if (unlikely(addr == walk->reuse_addr)) {
>>> - pgprot = PAGE_KERNEL;
>>> - list_del(&walk->reuse_page->lru);
>>> + if (unlikely(addr == walk->vmemmap_start)) {
>>> + list_del(&walk->vmemmap_head->lru);
>>> /*
>>> * Makes sure that preceding stores to the page contents from
>>> @@ -226,9 +217,16 @@ static void vmemmap_remap_pte(pte_t *pte, unsigned long addr,
>>> * write.
>>> */
>>> smp_wmb();
>>> +
>>> + entry = mk_pte(walk->vmemmap_head, PAGE_KERNEL);
>>> + } else {
>>> + /*
>>> + * Remap the tail pages as read-only to catch illegal write
>>> + * operation to the tail pages.
>>> + */
>>> + entry = mk_pte(walk->vmemmap_tail, PAGE_KERNEL_RO);
>>> }
>>> - entry = mk_pte(walk->reuse_page, pgprot);
>>> list_add(&page->lru, walk->vmemmap_pages);
>>> set_pte_at(&init_mm, addr, pte, entry);
>>> }
>>> @@ -255,16 +253,13 @@ static inline void reset_struct_pages(struct page *start)
>>> static void vmemmap_restore_pte(pte_t *pte, unsigned long addr,
>>> struct vmemmap_remap_walk *walk)
>>> {
>>> - pgprot_t pgprot = PAGE_KERNEL;
>>> struct page *page;
>>> void *to;
>>> - BUG_ON(pte_page(ptep_get(pte)) != walk->reuse_page);
>>> -
>>> page = list_first_entry(walk->vmemmap_pages, struct page, lru);
>>> list_del(&page->lru);
>>> to = page_to_virt(page);
>>> - copy_page(to, (void *)walk->reuse_addr);
>>> + copy_page(to, (void *)walk->vmemmap_start);
>>> reset_struct_pages(to);
>>> /*
>>> @@ -272,7 +267,7 @@ static void vmemmap_restore_pte(pte_t *pte, unsigned long addr,
>>> * before the set_pte_at() write.
>>> */
>>> smp_wmb();
>>> - set_pte_at(&init_mm, addr, pte, mk_pte(page, pgprot));
>>> + set_pte_at(&init_mm, addr, pte, mk_pte(page, PAGE_KERNEL));
>>> }
>>> /**
>>> @@ -282,33 +277,29 @@ static void vmemmap_restore_pte(pte_t *pte, unsigned long addr,
>>> * to remap.
>>> * @end: end address of the vmemmap virtual address range that we want to
>>> * remap.
>>> - * @reuse: reuse address.
>>> - *
>>> * Return: %0 on success, negative error code otherwise.
>>> */
>>> -static int vmemmap_remap_split(unsigned long start, unsigned long end,
>>> - unsigned long reuse)
>>> +static int vmemmap_remap_split(unsigned long start, unsigned long end)
>>> {
>>> struct vmemmap_remap_walk walk = {
>>> .remap_pte = NULL,
>>> + .vmemmap_start = start,
>>> .flags = VMEMMAP_SPLIT_NO_TLB_FLUSH,
>>> };
>>> - /* See the comment in the vmemmap_remap_free(). */
>>> - BUG_ON(start - reuse != PAGE_SIZE);
>>> -
>>> - return vmemmap_remap_range(reuse, end, &walk);
>>> + return vmemmap_remap_range(start, end, &walk);
>>> }
>>> /**
>>> * vmemmap_remap_free - remap the vmemmap virtual address range [@start, @end)
>>> - * to the page which @reuse is mapped to, then free vmemmap
>>> - * which the range are mapped to.
>>> + * to use @vmemmap_head/tail, then free vmemmap which
>>> + * the range are mapped to.
>>> * @start: start address of the vmemmap virtual address range that we want
>>> * to remap.
>>> * @end: end address of the vmemmap virtual address range that we want to
>>> * remap.
>>> - * @reuse: reuse address.
>>> + * @vmemmap_head: the page to be installed as first in the vmemmap range
>>> + * @vmemmap_tail: the page to be installed as non-first in the vmemmap range
>>> * @vmemmap_pages: list to deposit vmemmap pages to be freed. It is callers
>>> * responsibility to free pages.
>>> * @flags: modifications to vmemmap_remap_walk flags
>>> @@ -316,69 +307,40 @@ static int vmemmap_remap_split(unsigned long start, unsigned long end,
>>> * Return: %0 on success, negative error code otherwise.
>>> */
>>> static int vmemmap_remap_free(unsigned long start, unsigned long end,
>>> - unsigned long reuse,
>>> + struct page *vmemmap_head,
>>> + struct page *vmemmap_tail,
>>> struct list_head *vmemmap_pages,
>>> unsigned long flags)
>>> {
>>> int ret;
>>> struct vmemmap_remap_walk walk = {
>>> .remap_pte = vmemmap_remap_pte,
>>> - .reuse_addr = reuse,
>>> + .vmemmap_start = start,
>>> + .vmemmap_head = vmemmap_head,
>>> + .vmemmap_tail = vmemmap_tail,
>>> .vmemmap_pages = vmemmap_pages,
>>> .flags = flags,
>>> };
>>> - int nid = page_to_nid((struct page *)reuse);
>>> - gfp_t gfp_mask = GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN;
>>> +
>>> + ret = vmemmap_remap_range(start, end, &walk);
>>> + if (!ret || !walk.nr_walked)
>>> + return ret;
>>> +
>>> + end = start + walk.nr_walked * PAGE_SIZE;
>>> /*
>>> - * Allocate a new head vmemmap page to avoid breaking a contiguous
>>> - * block of struct page memory when freeing it back to page allocator
>>> - * in free_vmemmap_page_list(). This will allow the likely contiguous
>>> - * struct page backing memory to be kept contiguous and allowing for
>>> - * more allocations of hugepages. Fallback to the currently
>>> - * mapped head page in case should it fail to allocate.
>>> + * vmemmap_pages contains pages from the previous vmemmap_remap_range()
>>> + * call which failed. These are pages which were removed from
>>> + * the vmemmap. They will be restored in the following call.
>>> */
>>> - walk.reuse_page = alloc_pages_node(nid, gfp_mask, 0);
>>> - if (walk.reuse_page) {
>>> - copy_page(page_to_virt(walk.reuse_page),
>>> - (void *)walk.reuse_addr);
>>> - list_add(&walk.reuse_page->lru, vmemmap_pages);
>>> - memmap_pages_add(1);
>>> - }
>>> + walk = (struct vmemmap_remap_walk) {
>>> + .remap_pte = vmemmap_restore_pte,
>>> + .vmemmap_start = start,
>>> + .vmemmap_pages = vmemmap_pages,
>>> + .flags = 0,
>>> + };
>>> - /*
>>> - * In order to make remapping routine most efficient for the huge pages,
>>> - * the routine of vmemmap page table walking has the following rules
>>> - * (see more details from the vmemmap_pte_range()):
>>> - *
>>> - * - The range [@start, @end) and the range [@reuse, @reuse + PAGE_SIZE)
>>> - * should be continuous.
>>> - * - The @reuse address is part of the range [@reuse, @end) that we are
>>> - * walking which is passed to vmemmap_remap_range().
>>> - * - The @reuse address is the first in the complete range.
>>> - *
>>> - * So we need to make sure that @start and @reuse meet the above rules.
>>> - */
>>> - BUG_ON(start - reuse != PAGE_SIZE);
>>> -
>>> - ret = vmemmap_remap_range(reuse, end, &walk);
>>> - if (ret && walk.nr_walked) {
>>> - end = reuse + walk.nr_walked * PAGE_SIZE;
>>> - /*
>>> - * vmemmap_pages contains pages from the previous
>>> - * vmemmap_remap_range call which failed. These
>>> - * are pages which were removed from the vmemmap.
>>> - * They will be restored in the following call.
>>> - */
>>> - walk = (struct vmemmap_remap_walk) {
>>> - .remap_pte = vmemmap_restore_pte,
>>> - .reuse_addr = reuse,
>>> - .vmemmap_pages = vmemmap_pages,
>>> - .flags = 0,
>>> - };
>>> -
>>> - vmemmap_remap_range(reuse, end, &walk);
>>> - }
>>> + vmemmap_remap_range(start + PAGE_SIZE, end, &walk);
>>
>> The reason we previously passed the "start" address
>> was to perform a TLB flush within that address range.
>> So the start address is still necessary.
>
> Good catch.
>
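To spell out what I meant, a rough sketch (illustrative only; whether the
flush stays inside vmemmap_remap_range() or becomes an explicit
flush_tlb_kernel_range() call as below is an open question):

	/*
	 * Sketch: restore the tail mappings as before, but also invalidate
	 * the stale translation of the head page at @start, since the failed
	 * free pass has already rewritten that PTE.
	 */
	vmemmap_remap_range(start + PAGE_SIZE, end, &walk);
	flush_tlb_kernel_range(start, end);
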
>>> return ret;
>>> }
>>> @@ -415,29 +377,27 @@ static int alloc_vmemmap_page_list(unsigned long start, unsigned long end,
>>> * to remap.
>>> * @end: end address of the vmemmap virtual address range that we want to
>>> * remap.
>>> - * @reuse: reuse address.
>>> * @flags: modifications to vmemmap_remap_walk flags
>>> *
>>> * Return: %0 on success, negative error code otherwise.
>>> */
>>> static int vmemmap_remap_alloc(unsigned long start, unsigned long end,
>>> - unsigned long reuse, unsigned long flags)
>>> + unsigned long flags)
>>> {
>>> LIST_HEAD(vmemmap_pages);
>>> struct vmemmap_remap_walk walk = {
>>> .remap_pte = vmemmap_restore_pte,
>>> - .reuse_addr = reuse,
>>> + .vmemmap_start = start,
>>> .vmemmap_pages = &vmemmap_pages,
>>> .flags = flags,
>>> };
>>> - /* See the comment in the vmemmap_remap_free(). */
>>> - BUG_ON(start - reuse != PAGE_SIZE);
>>> + start += HUGETLB_VMEMMAP_RESERVE_SIZE;
>>> if (alloc_vmemmap_page_list(start, end, &vmemmap_pages))
>>> return -ENOMEM;
>>> - return vmemmap_remap_range(reuse, end, &walk);
>>> + return vmemmap_remap_range(start, end, &walk);
>>> }
>>> DEFINE_STATIC_KEY_FALSE(hugetlb_optimize_vmemmap_key);
>>> @@ -454,8 +414,7 @@ static int __hugetlb_vmemmap_restore_folio(const struct hstate *h,
>>> struct folio *folio, unsigned long flags)
>>> {
>>> int ret;
>>> - unsigned long vmemmap_start = (unsigned long)&folio->page, vmemmap_end;
>>> - unsigned long vmemmap_reuse;
>>> + unsigned long vmemmap_start, vmemmap_end;
>>> VM_WARN_ON_ONCE_FOLIO(!folio_test_hugetlb(folio), folio);
>>> VM_WARN_ON_ONCE_FOLIO(folio_ref_count(folio), folio);
>>> @@ -466,18 +425,16 @@ static int __hugetlb_vmemmap_restore_folio(const struct hstate *h,
>>> if (flags & VMEMMAP_SYNCHRONIZE_RCU)
>>> synchronize_rcu();
>>> + vmemmap_start = (unsigned long)folio;
>>> vmemmap_end = vmemmap_start + hugetlb_vmemmap_size(h);
>>> - vmemmap_reuse = vmemmap_start;
>>> - vmemmap_start += HUGETLB_VMEMMAP_RESERVE_SIZE;
>>> /*
>>> * The pages which the vmemmap virtual address range [@vmemmap_start,
>>> - * @vmemmap_end) are mapped to are freed to the buddy allocator, and
>>> - * the range is mapped to the page which @vmemmap_reuse is mapped to.
>>> + * @vmemmap_end) are mapped to are freed to the buddy allocator.
>>> * When a HugeTLB page is freed to the buddy allocator, previously
>>> * discarded vmemmap pages must be allocated and remapping.
>>> */
>>> - ret = vmemmap_remap_alloc(vmemmap_start, vmemmap_end, vmemmap_reuse, flags);
>>> + ret = vmemmap_remap_alloc(vmemmap_start, vmemmap_end, flags);
>>> if (!ret) {
>>> folio_clear_hugetlb_vmemmap_optimized(folio);
>>> static_branch_dec(&hugetlb_optimize_vmemmap_key);
>>> @@ -565,9 +522,9 @@ static int __hugetlb_vmemmap_optimize_folio(const struct hstate *h,
>>> struct list_head *vmemmap_pages,
>>> unsigned long flags)
>>> {
>>> - int ret = 0;
>>> - unsigned long vmemmap_start = (unsigned long)&folio->page, vmemmap_end;
>>> - unsigned long vmemmap_reuse;
>>> + unsigned long vmemmap_start, vmemmap_end;
>>> + struct page *vmemmap_head, *vmemmap_tail;
>>> + int nid, ret = 0;
>>> VM_WARN_ON_ONCE_FOLIO(!folio_test_hugetlb(folio), folio);
>>> VM_WARN_ON_ONCE_FOLIO(folio_ref_count(folio), folio);
>>> @@ -592,18 +549,31 @@ static int __hugetlb_vmemmap_optimize_folio(const struct hstate *h,
>>> */
>>> folio_set_hugetlb_vmemmap_optimized(folio);
>>> + nid = folio_nid(folio);
>>> + vmemmap_head = alloc_pages_node(nid, GFP_KERNEL, 0);
>>
>> Why did you choose to change the gfp mask (previously it was
>> GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN)?
>
> Because I removed the fallback for allocation failure. Trying harder and
> warning if the allocation fails is justified without the fallback path.
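For the record, the practical difference between the two, roughly (a sketch,
not the exact call site):

	/* Old: give up after one reclaim pass and stay silent on failure. */
	page = alloc_pages_node(nid, GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN, 0);

	/* New: let reclaim retry and warn if even one page cannot be allocated. */
	page = alloc_pages_node(nid, GFP_KERNEL, 0);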
>
>>> +
>>> + if (!vmemmap_head) {
>>> + ret = -ENOMEM;
>>
>> Why did you choose to change the allocation-failure
>> behavior? Replacing the head page isn’t mandatory;
>> it’s only nice-to-have.
>
> It would require extracting the vmemmap_head page from the page tables, which
> I found to be a useless complication that would never get executed and
> therefore never tested.
>
> If we fail to allocate a single page here, we are in OOM territory. It
> is not the time to play with huge page allocation.
Alright, you’ve convinced me.
>
>>> + goto out;
>>> + }
>>> +
>>> + copy_page(page_to_virt(vmemmap_head), folio);
>>> + list_add(&vmemmap_head->lru, vmemmap_pages);
>>> + memmap_pages_add(1);
>>> +
>>> + vmemmap_tail = vmemmap_head;
>>> + vmemmap_start = (unsigned long)folio;
>>> vmemmap_end = vmemmap_start + hugetlb_vmemmap_size(h);
>>> - vmemmap_reuse = vmemmap_start;
>>> - vmemmap_start += HUGETLB_VMEMMAP_RESERVE_SIZE;
>>> /*
>>> - * Remap the vmemmap virtual address range [@vmemmap_start, @vmemmap_end)
>>> - * to the page which @vmemmap_reuse is mapped to. Add pages previously
>>> - * mapping the range to vmemmap_pages list so that they can be freed by
>>> - * the caller.
>>> + * Remap the vmemmap virtual address range [@vmemmap_start, @vmemmap_end).
>>> + * Add pages previously mapping the range to vmemmap_pages list so that
>>> + * they can be freed by the caller.
>>> */
>>> - ret = vmemmap_remap_free(vmemmap_start, vmemmap_end, vmemmap_reuse,
>>> + ret = vmemmap_remap_free(vmemmap_start, vmemmap_end,
>>> + vmemmap_head, vmemmap_tail,
>>> vmemmap_pages, flags);
>>> +out:
>>> if (ret) {
>>> static_branch_dec(&hugetlb_optimize_vmemmap_key);
>>> folio_clear_hugetlb_vmemmap_optimized(folio);
>>> @@ -632,21 +602,19 @@ void hugetlb_vmemmap_optimize_folio(const struct hstate *h, struct folio *folio)
>>> static int hugetlb_vmemmap_split_folio(const struct hstate *h, struct folio *folio)
>>> {
>>> - unsigned long vmemmap_start = (unsigned long)&folio->page, vmemmap_end;
>>> - unsigned long vmemmap_reuse;
>>> + unsigned long vmemmap_start, vmemmap_end;
>>> if (!vmemmap_should_optimize_folio(h, folio))
>>> return 0;
>>> + vmemmap_start = (unsigned long)folio;
>>> vmemmap_end = vmemmap_start + hugetlb_vmemmap_size(h);
>>> - vmemmap_reuse = vmemmap_start;
>>> - vmemmap_start += HUGETLB_VMEMMAP_RESERVE_SIZE;
>>> /*
>>> * Split PMDs on the vmemmap virtual address range [@vmemmap_start,
>>> * @vmemmap_end]
>>> */
>>> - return vmemmap_remap_split(vmemmap_start, vmemmap_end, vmemmap_reuse);
>>> + return vmemmap_remap_split(vmemmap_start, vmemmap_end);
>>> }
>>> static void __hugetlb_vmemmap_optimize_folios(struct hstate *h,
>>
>
> --
> Kiryl Shutsemau / Kirill A. Shutemov