Message-Id: <DC02711B-70F3-47B8-B8BE-1C0F3FE43780@linux.dev>
Date: Mon, 22 Dec 2025 23:11:46 +0800
From: Muchun Song <muchun.song@...ux.dev>
To: Kiryl Shutsemau <kas@...nel.org>
Cc: Oscar Salvador <osalvador@...e.de>, Mike Rapoport <rppt@...nel.org>,
 Vlastimil Babka <vbabka@...e.cz>,
 Lorenzo Stoakes <lorenzo.stoakes@...cle.com>, Zi Yan <ziy@...dia.com>,
 Baoquan He <bhe@...hat.com>, Michal Hocko <mhocko@...e.com>,
 Johannes Weiner <hannes@...xchg.org>, Jonathan Corbet <corbet@....net>,
 kernel-team@...a.com, linux-mm@...ck.org, linux-kernel@...r.kernel.org,
 linux-doc@...r.kernel.org, Andrew Morton <akpm@...ux-foundation.org>,
 David Hildenbrand <david@...nel.org>, Matthew Wilcox <willy@...radead.org>,
 Usama Arif <usamaarif642@...il.com>,
 Frank van der Linden <fvdl@...gle.com>
Subject: Re: [PATCHv2 08/14] mm/hugetlb: Refactor code around vmemmap_walk



> On Dec 22, 2025, at 23:03, Kiryl Shutsemau <kas@...nel.org> wrote:
> 
> On Mon, Dec 22, 2025 at 01:54:47PM +0800, Muchun Song wrote:
>> 
>> 
>>> On 2025/12/18 23:09, Kiryl Shutsemau wrote:
>>> To prepare for removing fake head pages, the vmemmap_walk code is being reworked.
>>> 
>>> The reuse_page and reuse_addr variables are being eliminated. There will
>>> no longer be an expectation regarding the reuse address in relation to
>>> the operated range. Instead, the caller will provide head and tail
>>> vmemmap pages, along with the vmemmap_start address where the head page
>>> is located.
>>> 
>>> Currently, vmemmap_head and vmemmap_tail are set to the same page, but
>>> this will change in the future.
>>> 
>>> The only functional change is that __hugetlb_vmemmap_optimize_folio()
>>> will abandon optimization if memory allocation fails.
>>> 
>>> Signed-off-by: Kiryl Shutsemau <kas@...nel.org>
>>> ---
>>>  mm/hugetlb_vmemmap.c | 198 ++++++++++++++++++-------------------------
>>>  1 file changed, 83 insertions(+), 115 deletions(-)
>>> 
>>> diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
>>> index ba0fb1b6a5a8..d18e7475cf95 100644
>>> --- a/mm/hugetlb_vmemmap.c
>>> +++ b/mm/hugetlb_vmemmap.c
>>> @@ -24,8 +24,9 @@
>>>   *
>>>   * @remap_pte:        called for each lowest-level entry (PTE).
>>>   * @nr_walked:        the number of walked pte.
>>> - * @reuse_page:        the page which is reused for the tail vmemmap pages.
>>> - * @reuse_addr:        the virtual address of the @reuse_page page.
>>> + * @vmemmap_start:    the start of vmemmap range, where head page is located
>>> + * @vmemmap_head:    the page to be installed as first in the vmemmap range
>>> + * @vmemmap_tail:    the page to be installed as non-first in the vmemmap range
>>>   * @vmemmap_pages:    the list head of the vmemmap pages that can be freed
>>>   *            or is mapped from.
>>>   * @flags:        used to modify behavior in vmemmap page table walking
>>> @@ -34,11 +35,14 @@
>>>  struct vmemmap_remap_walk {
>>>      void            (*remap_pte)(pte_t *pte, unsigned long addr,
>>>                           struct vmemmap_remap_walk *walk);
>>> +
>>>      unsigned long        nr_walked;
>>> -    struct page        *reuse_page;
>>> -    unsigned long        reuse_addr;
>>> +    unsigned long        vmemmap_start;
>>> +    struct page        *vmemmap_head;
>>> +    struct page        *vmemmap_tail;
>>>      struct list_head    *vmemmap_pages;
>>> +
>>>  /* Skip the TLB flush when we split the PMD */
>>>  #define VMEMMAP_SPLIT_NO_TLB_FLUSH    BIT(0)
>>>  /* Skip the TLB flush when we remap the PTE */
>>> @@ -140,14 +144,7 @@ static int vmemmap_pte_entry(pte_t *pte, unsigned long addr,
>>>  {
>>>      struct vmemmap_remap_walk *vmemmap_walk = walk->private;
>>> -    /*
>>> -     * The reuse_page is found 'first' in page table walking before
>>> -     * starting remapping.
>>> -     */
>>> -    if (!vmemmap_walk->reuse_page)
>>> -        vmemmap_walk->reuse_page = pte_page(ptep_get(pte));
>>> -    else
>>> -        vmemmap_walk->remap_pte(pte, addr, vmemmap_walk);
>>> +    vmemmap_walk->remap_pte(pte, addr, vmemmap_walk);
>>>      vmemmap_walk->nr_walked++;
>>>      return 0;
>>> @@ -207,18 +204,12 @@ static void free_vmemmap_page_list(struct list_head *list)
>>>  static void vmemmap_remap_pte(pte_t *pte, unsigned long addr,
>>>                    struct vmemmap_remap_walk *walk)
>>>  {
>>> -    /*
>>> -     * Remap the tail pages as read-only to catch illegal write operation
>>> -     * to the tail pages.
>>> -     */
>>> -    pgprot_t pgprot = PAGE_KERNEL_RO;
>>>      struct page *page = pte_page(ptep_get(pte));
>>>      pte_t entry;
>>>      /* Remapping the head page requires r/w */
>>> -    if (unlikely(addr == walk->reuse_addr)) {
>>> -        pgprot = PAGE_KERNEL;
>>> -        list_del(&walk->reuse_page->lru);
>>> +    if (unlikely(addr == walk->vmemmap_start)) {
>>> +        list_del(&walk->vmemmap_head->lru);
>>>          /*
>>>           * Makes sure that preceding stores to the page contents from
>>> @@ -226,9 +217,16 @@ static void vmemmap_remap_pte(pte_t *pte, unsigned long addr,
>>>           * write.
>>>           */
>>>          smp_wmb();
>>> +
>>> +        entry = mk_pte(walk->vmemmap_head, PAGE_KERNEL);
>>> +    } else {
>>> +        /*
>>> +         * Remap the tail pages as read-only to catch illegal write
>>> +         * operation to the tail pages.
>>> +         */
>>> +        entry = mk_pte(walk->vmemmap_tail, PAGE_KERNEL_RO);
>>>      }
>>> -    entry = mk_pte(walk->reuse_page, pgprot);
>>>      list_add(&page->lru, walk->vmemmap_pages);
>>>      set_pte_at(&init_mm, addr, pte, entry);
>>>  }
>>> @@ -255,16 +253,13 @@ static inline void reset_struct_pages(struct page *start)
>>>  static void vmemmap_restore_pte(pte_t *pte, unsigned long addr,
>>>                  struct vmemmap_remap_walk *walk)
>>>  {
>>> -    pgprot_t pgprot = PAGE_KERNEL;
>>>      struct page *page;
>>>      void *to;
>>> -    BUG_ON(pte_page(ptep_get(pte)) != walk->reuse_page);
>>> -
>>>      page = list_first_entry(walk->vmemmap_pages, struct page, lru);
>>>      list_del(&page->lru);
>>>      to = page_to_virt(page);
>>> -    copy_page(to, (void *)walk->reuse_addr);
>>> +    copy_page(to, (void *)walk->vmemmap_start);
>>>      reset_struct_pages(to);
>>>      /*
>>> @@ -272,7 +267,7 @@ static void vmemmap_restore_pte(pte_t *pte, unsigned long addr,
>>>       * before the set_pte_at() write.
>>>       */
>>>      smp_wmb();
>>> -    set_pte_at(&init_mm, addr, pte, mk_pte(page, pgprot));
>>> +    set_pte_at(&init_mm, addr, pte, mk_pte(page, PAGE_KERNEL));
>>>  }
>>>  /**
>>> @@ -282,33 +277,29 @@ static void vmemmap_restore_pte(pte_t *pte, unsigned long addr,
>>>   *             to remap.
>>>   * @end:       end address of the vmemmap virtual address range that we want to
>>>   *             remap.
>>> - * @reuse:     reuse address.
>>> - *
>>>   * Return: %0 on success, negative error code otherwise.
>>>   */
>>> -static int vmemmap_remap_split(unsigned long start, unsigned long end,
>>> -                   unsigned long reuse)
>>> +static int vmemmap_remap_split(unsigned long start, unsigned long end)
>>>  {
>>>      struct vmemmap_remap_walk walk = {
>>>          .remap_pte    = NULL,
>>> +        .vmemmap_start    = start,
>>>          .flags        = VMEMMAP_SPLIT_NO_TLB_FLUSH,
>>>      };
>>> -    /* See the comment in the vmemmap_remap_free(). */
>>> -    BUG_ON(start - reuse != PAGE_SIZE);
>>> -
>>> -    return vmemmap_remap_range(reuse, end, &walk);
>>> +    return vmemmap_remap_range(start, end, &walk);
>>>  }
>>>  /**
>>>   * vmemmap_remap_free - remap the vmemmap virtual address range [@start, @end)
>>> - *            to the page which @reuse is mapped to, then free vmemmap
>>> - *            which the range are mapped to.
>>> + *            to use @vmemmap_head/tail, then free vmemmap which
>>> + *            the range are mapped to.
>>>   * @start:    start address of the vmemmap virtual address range that we want
>>>   *        to remap.
>>>   * @end:    end address of the vmemmap virtual address range that we want to
>>>   *        remap.
>>> - * @reuse:    reuse address.
>>> + * @vmemmap_head: the page to be installed as first in the vmemmap range
>>> + * @vmemmap_tail: the page to be installed as non-first in the vmemmap range
>>>   * @vmemmap_pages: list to deposit vmemmap pages to be freed.  It is callers
>>>   *        responsibility to free pages.
>>>   * @flags:    modifications to vmemmap_remap_walk flags
>>> @@ -316,69 +307,40 @@ static int vmemmap_remap_split(unsigned long start, unsigned long end,
>>>   * Return: %0 on success, negative error code otherwise.
>>>   */
>>>  static int vmemmap_remap_free(unsigned long start, unsigned long end,
>>> -                  unsigned long reuse,
>>> +                  struct page *vmemmap_head,
>>> +                  struct page *vmemmap_tail,
>>>                    struct list_head *vmemmap_pages,
>>>                    unsigned long flags)
>>>  {
>>>      int ret;
>>>      struct vmemmap_remap_walk walk = {
>>>          .remap_pte    = vmemmap_remap_pte,
>>> -        .reuse_addr    = reuse,
>>> +        .vmemmap_start    = start,
>>> +        .vmemmap_head    = vmemmap_head,
>>> +        .vmemmap_tail    = vmemmap_tail,
>>>          .vmemmap_pages    = vmemmap_pages,
>>>          .flags        = flags,
>>>      };
>>> -    int nid = page_to_nid((struct page *)reuse);
>>> -    gfp_t gfp_mask = GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN;
>>> +
>>> +    ret = vmemmap_remap_range(start, end, &walk);
>>> +    if (!ret || !walk.nr_walked)
>>> +        return ret;
>>> +
>>> +    end = start + walk.nr_walked * PAGE_SIZE;
>>>      /*
>>> -     * Allocate a new head vmemmap page to avoid breaking a contiguous
>>> -     * block of struct page memory when freeing it back to page allocator
>>> -     * in free_vmemmap_page_list(). This will allow the likely contiguous
>>> -     * struct page backing memory to be kept contiguous and allowing for
>>> -     * more allocations of hugepages. Fallback to the currently
>>> -     * mapped head page in case should it fail to allocate.
>>> +     * vmemmap_pages contains pages from the previous vmemmap_remap_range()
>>> +     * call which failed.  These are pages which were removed from
>>> +     * the vmemmap. They will be restored in the following call.
>>>       */
>>> -    walk.reuse_page = alloc_pages_node(nid, gfp_mask, 0);
>>> -    if (walk.reuse_page) {
>>> -        copy_page(page_to_virt(walk.reuse_page),
>>> -              (void *)walk.reuse_addr);
>>> -        list_add(&walk.reuse_page->lru, vmemmap_pages);
>>> -        memmap_pages_add(1);
>>> -    }
>>> +    walk = (struct vmemmap_remap_walk) {
>>> +        .remap_pte    = vmemmap_restore_pte,
>>> +        .vmemmap_start    = start,
>>> +        .vmemmap_pages    = vmemmap_pages,
>>> +        .flags        = 0,
>>> +    };
>>> -    /*
>>> -     * In order to make remapping routine most efficient for the huge pages,
>>> -     * the routine of vmemmap page table walking has the following rules
>>> -     * (see more details from the vmemmap_pte_range()):
>>> -     *
>>> -     * - The range [@start, @end) and the range [@reuse, @reuse + PAGE_SIZE)
>>> -     *   should be continuous.
>>> -     * - The @reuse address is part of the range [@reuse, @end) that we are
>>> -     *   walking which is passed to vmemmap_remap_range().
>>> -     * - The @reuse address is the first in the complete range.
>>> -     *
>>> -     * So we need to make sure that @start and @reuse meet the above rules.
>>> -     */
>>> -    BUG_ON(start - reuse != PAGE_SIZE);
>>> -
>>> -    ret = vmemmap_remap_range(reuse, end, &walk);
>>> -    if (ret && walk.nr_walked) {
>>> -        end = reuse + walk.nr_walked * PAGE_SIZE;
>>> -        /*
>>> -         * vmemmap_pages contains pages from the previous
>>> -         * vmemmap_remap_range call which failed.  These
>>> -         * are pages which were removed from the vmemmap.
>>> -         * They will be restored in the following call.
>>> -         */
>>> -        walk = (struct vmemmap_remap_walk) {
>>> -            .remap_pte    = vmemmap_restore_pte,
>>> -            .reuse_addr    = reuse,
>>> -            .vmemmap_pages    = vmemmap_pages,
>>> -            .flags        = 0,
>>> -        };
>>> -
>>> -        vmemmap_remap_range(reuse, end, &walk);
>>> -    }
>>> +    vmemmap_remap_range(start + PAGE_SIZE, end, &walk);
>> 
>> The reason we previously passed the "start" address
>> was to perform a TLB flush within that address range.
>> So the start address is still necessary.
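>> 
>> (From memory, not the exact code: the TLB flush at the end of
>> vmemmap_remap_range() only covers the range that was actually walked,
>> roughly
>> 
>> 	if (walk->remap_pte && !(walk->flags & VMEMMAP_REMAP_NO_TLB_FLUSH))
>> 		flush_tlb_kernel_range(start, end);
>> 
>> so if the restore walk begins at start + PAGE_SIZE, the remapped PTE at
>> vmemmap_start never gets flushed.)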
> 
> Good catch.
> 
>>>      return ret;
>>>  }
>>> @@ -415,29 +377,27 @@ static int alloc_vmemmap_page_list(unsigned long start, unsigned long end,
>>>   *        to remap.
>>>   * @end:    end address of the vmemmap virtual address range that we want to
>>>   *        remap.
>>> - * @reuse:    reuse address.
>>>   * @flags:    modifications to vmemmap_remap_walk flags
>>>   *
>>>   * Return: %0 on success, negative error code otherwise.
>>>   */
>>>  static int vmemmap_remap_alloc(unsigned long start, unsigned long end,
>>> -                   unsigned long reuse, unsigned long flags)
>>> +                   unsigned long flags)
>>>  {
>>>      LIST_HEAD(vmemmap_pages);
>>>      struct vmemmap_remap_walk walk = {
>>>          .remap_pte    = vmemmap_restore_pte,
>>> -        .reuse_addr    = reuse,
>>> +        .vmemmap_start    = start,
>>>          .vmemmap_pages    = &vmemmap_pages,
>>>          .flags        = flags,
>>>      };
>>> -    /* See the comment in the vmemmap_remap_free(). */
>>> -    BUG_ON(start - reuse != PAGE_SIZE);
>>> +    start += HUGETLB_VMEMMAP_RESERVE_SIZE;
>>>      if (alloc_vmemmap_page_list(start, end, &vmemmap_pages))
>>>          return -ENOMEM;
>>> -    return vmemmap_remap_range(reuse, end, &walk);
>>> +    return vmemmap_remap_range(start, end, &walk);
>>>  }
>>>  DEFINE_STATIC_KEY_FALSE(hugetlb_optimize_vmemmap_key);
>>> @@ -454,8 +414,7 @@ static int __hugetlb_vmemmap_restore_folio(const struct hstate *h,
>>>                         struct folio *folio, unsigned long flags)
>>>  {
>>>      int ret;
>>> -    unsigned long vmemmap_start = (unsigned long)&folio->page, vmemmap_end;
>>> -    unsigned long vmemmap_reuse;
>>> +    unsigned long vmemmap_start, vmemmap_end;
>>>      VM_WARN_ON_ONCE_FOLIO(!folio_test_hugetlb(folio), folio);
>>>      VM_WARN_ON_ONCE_FOLIO(folio_ref_count(folio), folio);
>>> @@ -466,18 +425,16 @@ static int __hugetlb_vmemmap_restore_folio(const struct hstate *h,
>>>      if (flags & VMEMMAP_SYNCHRONIZE_RCU)
>>>          synchronize_rcu();
>>> +    vmemmap_start    = (unsigned long)folio;
>>>      vmemmap_end    = vmemmap_start + hugetlb_vmemmap_size(h);
>>> -    vmemmap_reuse    = vmemmap_start;
>>> -    vmemmap_start    += HUGETLB_VMEMMAP_RESERVE_SIZE;
>>>      /*
>>>       * The pages which the vmemmap virtual address range [@vmemmap_start,
>>> -     * @vmemmap_end) are mapped to are freed to the buddy allocator, and
>>> -     * the range is mapped to the page which @vmemmap_reuse is mapped to.
>>> +     * @vmemmap_end) are mapped to are freed to the buddy allocator.
>>>       * When a HugeTLB page is freed to the buddy allocator, previously
>>>       * discarded vmemmap pages must be allocated and remapping.
>>>       */
>>> -    ret = vmemmap_remap_alloc(vmemmap_start, vmemmap_end, vmemmap_reuse, flags);
>>> +    ret = vmemmap_remap_alloc(vmemmap_start, vmemmap_end, flags);
>>>      if (!ret) {
>>>          folio_clear_hugetlb_vmemmap_optimized(folio);
>>>          static_branch_dec(&hugetlb_optimize_vmemmap_key);
>>> @@ -565,9 +522,9 @@ static int __hugetlb_vmemmap_optimize_folio(const struct hstate *h,
>>>                          struct list_head *vmemmap_pages,
>>>                          unsigned long flags)
>>>  {
>>> -    int ret = 0;
>>> -    unsigned long vmemmap_start = (unsigned long)&folio->page, vmemmap_end;
>>> -    unsigned long vmemmap_reuse;
>>> +    unsigned long vmemmap_start, vmemmap_end;
>>> +    struct page *vmemmap_head, *vmemmap_tail;
>>> +    int nid, ret = 0;
>>>      VM_WARN_ON_ONCE_FOLIO(!folio_test_hugetlb(folio), folio);
>>>      VM_WARN_ON_ONCE_FOLIO(folio_ref_count(folio), folio);
>>> @@ -592,18 +549,31 @@ static int __hugetlb_vmemmap_optimize_folio(const struct hstate *h,
>>>       */
>>>      folio_set_hugetlb_vmemmap_optimized(folio);
>>> +    nid = folio_nid(folio);
>>> +    vmemmap_head = alloc_pages_node(nid, GFP_KERNEL, 0);
>> 
>> Why did you choose to change the gfp mask (previously it was
>> GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN)?
> 
> Because I removed the fallback for allocation failure. Trying harder and
> warning if the allocation fails is justified without the fallback path.
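> 
> I.e. the old code did (with a fallback to the currently mapped head page):
> 
> 	gfp_t gfp_mask = GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN;
> 
> 	walk.reuse_page = alloc_pages_node(nid, gfp_mask, 0);
> 
> while the new code has no fallback path, so it uses plain GFP_KERNEL and
> gives up with -ENOMEM on failure:
> 
> 	vmemmap_head = alloc_pages_node(nid, GFP_KERNEL, 0);
> 	if (!vmemmap_head) {
> 		ret = -ENOMEM;
> 		goto out;
> 	}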
> 
>>> +
>>> +    if (!vmemmap_head) {
>>> +        ret = -ENOMEM;
>> 
>> Why did you choose to change the allocation-failure
>> behavior? Replacing the head page isn’t mandatory;
>> it’s only nice-to-have.
> 
> It would require extracting the vmemmap_head page from the page tables,
> which I found to be a useless complication that would never get executed
> and therefore never tested.
> 
> If we fail to allocate a single page here, we are in OOM territory. That
> is not the time to play with huge page allocation.

Alright, you’ve convinced me.

> 
>>> +        goto out;
>>> +    }
>>> +
>>> +    copy_page(page_to_virt(vmemmap_head), folio);
>>> +    list_add(&vmemmap_head->lru, vmemmap_pages);
>>> +    memmap_pages_add(1);
>>> +
>>> +    vmemmap_tail    = vmemmap_head;
>>> +    vmemmap_start    = (unsigned long)folio;
>>>      vmemmap_end    = vmemmap_start + hugetlb_vmemmap_size(h);
>>> -    vmemmap_reuse    = vmemmap_start;
>>> -    vmemmap_start    += HUGETLB_VMEMMAP_RESERVE_SIZE;
>>>      /*
>>> -     * Remap the vmemmap virtual address range [@vmemmap_start, @vmemmap_end)
>>> -     * to the page which @vmemmap_reuse is mapped to.  Add pages previously
>>> -     * mapping the range to vmemmap_pages list so that they can be freed by
>>> -     * the caller.
>>> +     * Remap the vmemmap virtual address range [@vmemmap_start, @vmemmap_end).
>>> +     * Add pages previously mapping the range to vmemmap_pages list so that
>>> +     * they can be freed by the caller.
>>>       */
>>> -    ret = vmemmap_remap_free(vmemmap_start, vmemmap_end, vmemmap_reuse,
>>> +    ret = vmemmap_remap_free(vmemmap_start, vmemmap_end,
>>> +                 vmemmap_head, vmemmap_tail,
>>>                   vmemmap_pages, flags);
>>> +out:
>>>      if (ret) {
>>>          static_branch_dec(&hugetlb_optimize_vmemmap_key);
>>>          folio_clear_hugetlb_vmemmap_optimized(folio);
>>> @@ -632,21 +602,19 @@ void hugetlb_vmemmap_optimize_folio(const struct hstate *h, struct folio *folio)
>>>  static int hugetlb_vmemmap_split_folio(const struct hstate *h, struct folio *folio)
>>>  {
>>> -    unsigned long vmemmap_start = (unsigned long)&folio->page, vmemmap_end;
>>> -    unsigned long vmemmap_reuse;
>>> +    unsigned long vmemmap_start, vmemmap_end;
>>>      if (!vmemmap_should_optimize_folio(h, folio))
>>>          return 0;
>>> +    vmemmap_start    = (unsigned long)folio;
>>>      vmemmap_end    = vmemmap_start + hugetlb_vmemmap_size(h);
>>> -    vmemmap_reuse    = vmemmap_start;
>>> -    vmemmap_start    += HUGETLB_VMEMMAP_RESERVE_SIZE;
>>>      /*
>>>       * Split PMDs on the vmemmap virtual address range [@vmemmap_start,
>>>       * @vmemmap_end]
>>>       */
>>> -    return vmemmap_remap_split(vmemmap_start, vmemmap_end, vmemmap_reuse);
>>> +    return vmemmap_remap_split(vmemmap_start, vmemmap_end);
>>>  }
>>>  static void __hugetlb_vmemmap_optimize_folios(struct hstate *h,
>> 
> 
> --
>  Kiryl Shutsemau / Kirill A. Shutemov
