lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [day] [month] [year] [list]
Message-ID: <CAA1CXcA4DXA875+C3-pHjZMtS332Jdwe2eWQbxOqQD80F43rVQ@mail.gmail.com>
Date: Sat, 7 Jun 2025 06:48:27 -0600
From: Nico Pache <npache@...hat.com>
To: Dev Jain <dev.jain@....com>
Cc: Baolin Wang <baolin.wang@...ux.alibaba.com>, linux-mm@...ck.org, 
	linux-doc@...r.kernel.org, linux-kernel@...r.kernel.org, 
	linux-trace-kernel@...r.kernel.org, akpm@...ux-foundation.org, corbet@....net, 
	rostedt@...dmis.org, mhiramat@...nel.org, mathieu.desnoyers@...icios.com, 
	david@...hat.com, baohua@...nel.org, ryan.roberts@....com, 
	willy@...radead.org, peterx@...hat.com, ziy@...dia.com, 
	wangkefeng.wang@...wei.com, usamaarif642@...il.com, sunnanyong@...wei.com, 
	vishal.moola@...il.com, thomas.hellstrom@...ux.intel.com, 
	yang@...amperecomputing.com, kirill.shutemov@...ux.intel.com, 
	aarcange@...hat.com, raquini@...hat.com, anshuman.khandual@....com, 
	catalin.marinas@....com, tiwai@...e.de, will@...nel.org, 
	dave.hansen@...ux.intel.com, jack@...e.cz, cl@...two.org, jglisse@...gle.com, 
	surenb@...gle.com, zokeefe@...gle.com, hannes@...xchg.org, 
	rientjes@...gle.com, mhocko@...e.com, rdunlap@...radead.org, 
	lorenzo.stoakes@...cle.com, Liam.Howlett@...cle.com
Subject: Re: [PATCH v5 06/12] khugepaged: introduce khugepaged_scan_bitmap for
 mTHP support

On Fri, Jun 6, 2025 at 10:38 AM Dev Jain <dev.jain@....com> wrote:
>
>
> On 01/05/25 12:26 am, Nico Pache wrote:
> > On Wed, Apr 30, 2025 at 4:08 AM Baolin Wang
> > <baolin.wang@...ux.alibaba.com> wrote:
> >>
> >>
> >> On 2025/4/29 02:12, Nico Pache wrote:
> >>> khugepaged scans anon PMD ranges for potential collapse to a hugepage.
> >>> To add mTHP support we use this scan to instead record chunks of utilized
> >>> sections of the PMD.
> >>>
> >>> khugepaged_scan_bitmap uses a stack struct to recursively scan a bitmap
> >>> that represents chunks of utilized regions. We can then determine what
> >>> mTHP size fits best and in the following patch, we set this bitmap while
> >>> scanning the anon PMD.
> >>>
> >>> max_ptes_none is used as a scale to determine how "full" an order must
> >>> be before being considered for collapse.
> >>>
> >>> When attempting to collapse an order that has its order set to "always"
> >>> let's always collapse to that order in a greedy manner without
> >>> considering the number of bits set.
> >>>
> >>> Signed-off-by: Nico Pache <npache@...hat.com>
> >>> ---
> >>>    include/linux/khugepaged.h |  4 ++
> >>>    mm/khugepaged.c            | 94 ++++++++++++++++++++++++++++++++++----
> >>>    2 files changed, 89 insertions(+), 9 deletions(-)
> >>>
> >>> diff --git a/include/linux/khugepaged.h b/include/linux/khugepaged.h
> >>> index 1f46046080f5..18fe6eb5051d 100644
> >>> --- a/include/linux/khugepaged.h
> >>> +++ b/include/linux/khugepaged.h
> >>> @@ -1,6 +1,10 @@
> >>>    /* SPDX-License-Identifier: GPL-2.0 */
> >>>    #ifndef _LINUX_KHUGEPAGED_H
> >>>    #define _LINUX_KHUGEPAGED_H
> >>> +#define KHUGEPAGED_MIN_MTHP_ORDER    2
> >> Still better to add some comments to explain explicitly why 2 is chosen as
> >> the MIN_MTHP_ORDER.
> > OK, I'll add a note that explicitly states that the min order of anon mTHPs is 2.
> >>> +#define KHUGEPAGED_MIN_MTHP_NR       (1<<KHUGEPAGED_MIN_MTHP_ORDER)
> >>> +#define MAX_MTHP_BITMAP_SIZE  (1 << (ilog2(MAX_PTRS_PER_PTE) - KHUGEPAGED_MIN_MTHP_ORDER))
> >>> +#define MTHP_BITMAP_SIZE  (1 << (HPAGE_PMD_ORDER - KHUGEPAGED_MIN_MTHP_ORDER))
> >>>
> >>>    extern unsigned int khugepaged_max_ptes_none __read_mostly;
> >>>    #ifdef CONFIG_TRANSPARENT_HUGEPAGE
> >>> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> >>> index e21998a06253..6e67db86409a 100644
> >>> --- a/mm/khugepaged.c
> >>> +++ b/mm/khugepaged.c
> >>> @@ -94,6 +94,11 @@ static DEFINE_READ_MOSTLY_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS);
> >>>
> >>>    static struct kmem_cache *mm_slot_cache __ro_after_init;
> >>>
> >>> +struct scan_bit_state {
> >>> +     u8 order;
> >>> +     u16 offset;
> >>> +};
> >>> +
> >>>    struct collapse_control {
> >>>        bool is_khugepaged;
> >>>
> >>> @@ -102,6 +107,18 @@ struct collapse_control {
> >>>
> >>>        /* nodemask for allocation fallback */
> >>>        nodemask_t alloc_nmask;
> >>> +
> >>> +     /*
> >>> +      * bitmap used to collapse mTHP sizes.
> >>> +      * 1bit = order KHUGEPAGED_MIN_MTHP_ORDER mTHP
> >>> +      */
> >>> +     DECLARE_BITMAP(mthp_bitmap, MAX_MTHP_BITMAP_SIZE);
> >>> +     DECLARE_BITMAP(mthp_bitmap_temp, MAX_MTHP_BITMAP_SIZE);
> >>> +     struct scan_bit_state mthp_bitmap_stack[MAX_MTHP_BITMAP_SIZE];
> >>> +};
> >>> +
> >>> +struct collapse_control khugepaged_collapse_control = {
> >>> +     .is_khugepaged = true,
> >>>    };
> >>>
> >>>    /**
> >>> @@ -851,10 +868,6 @@ static void khugepaged_alloc_sleep(void)
> >>>        remove_wait_queue(&khugepaged_wait, &wait);
> >>>    }
> >>>
> >>> -struct collapse_control khugepaged_collapse_control = {
> >>> -     .is_khugepaged = true,
> >>> -};
> >>> -
> >>>    static bool khugepaged_scan_abort(int nid, struct collapse_control *cc)
> >>>    {
> >>>        int i;
> >>> @@ -1118,7 +1131,8 @@ static int alloc_charge_folio(struct folio **foliop, struct mm_struct *mm,
> >>>
> >>>    static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
> >>>                              int referenced, int unmapped,
> >>> -                           struct collapse_control *cc)
> >>> +                           struct collapse_control *cc, bool *mmap_locked,
> >>> +                               u8 order, u16 offset)
> >>>    {
> >>>        LIST_HEAD(compound_pagelist);
> >>>        pmd_t *pmd, _pmd;
> >>> @@ -1137,8 +1151,12 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
> >>>         * The allocation can take potentially a long time if it involves
> >>>         * sync compaction, and we do not need to hold the mmap_lock during
> >>>         * that. We will recheck the vma after taking it again in write mode.
> >>> +      * If collapsing mTHPs we may have already released the read_lock.
> >>>         */
> >>> -     mmap_read_unlock(mm);
> >>> +     if (*mmap_locked) {
> >>> +             mmap_read_unlock(mm);
> >>> +             *mmap_locked = false;
> >>> +     }
> >>>
> >>>        result = alloc_charge_folio(&folio, mm, cc, HPAGE_PMD_ORDER);
> >>>        if (result != SCAN_SUCCEED)
> >>> @@ -1273,12 +1291,72 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
> >>>    out_up_write:
> >>>        mmap_write_unlock(mm);
> >>>    out_nolock:
> >>> +     *mmap_locked = false;
> >>>        if (folio)
> >>>                folio_put(folio);
> >>>        trace_mm_collapse_huge_page(mm, result == SCAN_SUCCEED, result);
> >>>        return result;
> >>>    }
> >>>
> >>> +// Recursive function to consume the bitmap
> >> Nit: please use '/* Xxxx */' for comments in this patch.
> >>
> >>> +static int khugepaged_scan_bitmap(struct mm_struct *mm, unsigned long address,
> >>> +                     int referenced, int unmapped, struct collapse_control *cc,
> >>> +                     bool *mmap_locked, unsigned long enabled_orders)
> >>> +{
> >>> +     u8 order, next_order;
> >>> +     u16 offset, mid_offset;
> >>> +     int num_chunks;
> >>> +     int bits_set, threshold_bits;
> >>> +     int top = -1;
> >>> +     int collapsed = 0;
> >>> +     int ret;
> >>> +     struct scan_bit_state state;
> >>> +     bool is_pmd_only = (enabled_orders == (1 << HPAGE_PMD_ORDER));
> >>> +
> >>> +     cc->mthp_bitmap_stack[++top] = (struct scan_bit_state)
> >>> +             { HPAGE_PMD_ORDER - KHUGEPAGED_MIN_MTHP_ORDER, 0 };
> >>> +
> >>> +     while (top >= 0) {
> >>> +             state = cc->mthp_bitmap_stack[top--];
> >>> +             order = state.order + KHUGEPAGED_MIN_MTHP_ORDER;
> >>> +             offset = state.offset;
> >>> +             num_chunks = 1 << (state.order);
> >>> +             // Skip mTHP orders that are not enabled
> >>> +             if (!test_bit(order, &enabled_orders))
> >>> +                     goto next;
> >>> +
> >>> +             // copy the relevant section to a new bitmap
> >>> +             bitmap_shift_right(cc->mthp_bitmap_temp, cc->mthp_bitmap, offset,
> >>> +                               MTHP_BITMAP_SIZE);
> >>> +
> >>> +             bits_set = bitmap_weight(cc->mthp_bitmap_temp, num_chunks);
> >>> +             threshold_bits = (HPAGE_PMD_NR - khugepaged_max_ptes_none - 1)
> >>> +                             >> (HPAGE_PMD_ORDER - state.order);
> >>> +
> >>> +             //Check if the region is "almost full" based on the threshold
> >>> +             if (bits_set > threshold_bits || is_pmd_only
> >>> +                     || test_bit(order, &huge_anon_orders_always)) {
> >> When testing this patch, I disabled the PMD-sized THP and enabled
> >> 64K-sized mTHP, but it still attempts to collapse into a PMD-sized THP
> >> (since bits_set > threshold_bits is true). This doesn't seem reasonable?
> > We are still required to have PMD enabled for mTHP collapse to work.
> > It's a limitation of the current khugepaged code (it currently only
> > adds mm_slots when PMD is enabled).
> > We've discussed this in the past and are looking for a proper way
> > forward, but the solution becomes tricky.
>
> Not sure if this is still a problem, but does this patch solve
> it?
>
> https://lore.kernel.org/all/20250211111326.14295-12-dev.jain@arm.com/

Hi Dev,

Baolin sent out a patch to do something similar to what you did here
based on my changes. I was going to keep the original behavior of
activating khugepaged only if the PMD size is enabled, and make that
change separately (outside this series), but I've gone ahead and
applied/tested Baolin's patch.

Sorry, I had forgotten you already had a solution for this.

Cheers,
-- Nico
>
> >
> > However I'm surprised that it still collapses due to the code below.
> > I'll test this out later today.
> >      +             if (!test_bit(order, &enabled_orders))
> >      +                     goto next;
> >>> +                     ret = collapse_huge_page(mm, address, referenced, unmapped, cc,
> >>> +                                     mmap_locked, order, offset * KHUGEPAGED_MIN_MTHP_NR);
> >>> +                     if (ret == SCAN_SUCCEED) {
> >>> +                             collapsed += (1 << order);
> >>> +                             continue;
> >>> +                     }
> >>> +             }
> >>> +
> >>> +next:
> >>> +             if (state.order > 0) {
> >>> +                     next_order = state.order - 1;
> >>> +                     mid_offset = offset + (num_chunks / 2);
> >>> +                     cc->mthp_bitmap_stack[++top] = (struct scan_bit_state)
> >>> +                             { next_order, mid_offset };
> >>> +                     cc->mthp_bitmap_stack[++top] = (struct scan_bit_state)
> >>> +                             { next_order, offset };
> >>> +                     }
> >>> +     }
> >>> +     return collapsed;
> >>> +}
> >>> +
> >>>    static int khugepaged_scan_pmd(struct mm_struct *mm,
> >>>                                   struct vm_area_struct *vma,
> >>>                                   unsigned long address, bool *mmap_locked,
> >>> @@ -1445,9 +1523,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
> >>>        pte_unmap_unlock(pte, ptl);
> >>>        if (result == SCAN_SUCCEED) {
> >>>                result = collapse_huge_page(mm, address, referenced,
> >>> -                                         unmapped, cc);
> >>> -             /* collapse_huge_page will return with the mmap_lock released */
> >>> -             *mmap_locked = false;
> >>> +                                         unmapped, cc, mmap_locked, HPAGE_PMD_ORDER, 0);
> >>>        }
> >>>    out:
> >>>        trace_mm_khugepaged_scan_pmd(mm, &folio->page, writable, referenced,
>


Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ