Message-ID: <CAK1f24m1LAK-+GKeTSGANXXfk3KahmpN7KvO8FCkA+2cdNtfzw@mail.gmail.com>
Date: Tue, 14 May 2024 21:36:34 +0800
From: Lance Yang <ioworker0@...il.com>
To: Baolin Wang <baolin.wang@...ux.alibaba.com>
Cc: akpm@...ux-foundation.org, hughd@...gle.com, willy@...radead.org,
david@...hat.com, wangkefeng.wang@...wei.com, ying.huang@...el.com,
21cnbao@...il.com, ryan.roberts@....com, shy828301@...il.com, ziy@...dia.com,
linux-mm@...ck.org, linux-kernel@...r.kernel.org
Subject: Re: [PATCH v2 5/7] mm: shmem: add mTHP support for anonymous shmem
Hi Baolin,
On Mon, May 13, 2024 at 1:08 PM Baolin Wang
<baolin.wang@...ux.alibaba.com> wrote:
>
> Commit 19eaf44954df added multi-size THP (mTHP) for anonymous pages,
> allowing THP to be configured through the sysfs interface located at
> '/sys/kernel/mm/transparent_hugepage/hugepage-XXkb/enabled'.
>
> However, anonymous shared pages ignore the anonymous mTHP rules
> configured through the sysfs interface and can only use PMD-mapped
> THP, which is not reasonable. Users expect the mTHP rules to apply to
> all anonymous pages, including anonymous shared pages, so that they
> can enjoy the benefits of mTHP: for example, lower latency and smaller
> memory bloat than PMD-mapped THP, and contiguous PTEs on ARM
> architectures to reduce TLB misses.
>
> The primary strategy is similar to the one used for anonymous mTHP.
> Introduce a new interface
> '/sys/kernel/mm/transparent_hugepage/hugepage-XXkb/shmem_enabled',
> which accepts all the same values as the top-level
> '/sys/kernel/mm/transparent_hugepage/shmem_enabled', plus a new
> "inherit" option. By default all sizes are set to "never" except PMD
> size, which is set to "inherit". This preserves backward compatibility
> with the top-level anonymous shmem setting, while also allowing
> independent control of anonymous shmem enablement for each mTHP size.
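
As a practical aside (not part of the patch): here is a quick userspace
sketch I find handy for eyeballing the resulting defaults. It just dumps
every per-size shmem_enabled knob, assuming the per-size directories keep
the existing 'hugepage-<size>kB' naming:

#include <glob.h>
#include <stdio.h>

int main(void)
{
        glob_t g;
        size_t i;

        /* One shmem_enabled file per supported mTHP size. */
        if (glob("/sys/kernel/mm/transparent_hugepage/hugepage-*/shmem_enabled",
                 0, NULL, &g))
                return 1;

        for (i = 0; i < g.gl_pathc; i++) {
                char buf[128] = "";
                FILE *f = fopen(g.gl_pathv[i], "r");

                if (!f)
                        continue;
                if (fgets(buf, sizeof(buf), f))
                        printf("%s: %s", g.gl_pathv[i], buf);
                fclose(f);
        }
        globfree(&g);
        return 0;
}
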
>
> Signed-off-by: Baolin Wang <baolin.wang@...ux.alibaba.com>
> ---
>  include/linux/huge_mm.h |  10 +++
>  mm/shmem.c              | 179 +++++++++++++++++++++++++++++++++-------
>  2 files changed, 161 insertions(+), 28 deletions(-)
>
> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
> index 1fce6fee7766..b5339210268d 100644
> --- a/include/linux/huge_mm.h
> +++ b/include/linux/huge_mm.h
> @@ -583,6 +583,16 @@ static inline bool thp_migration_supported(void)
>  {
>          return false;
>  }
> +
> +static inline int highest_order(unsigned long orders)
> +{
> +        return 0;
> +}
> +
> +static inline int next_order(unsigned long *orders, int prev)
> +{
> +        return 0;
> +}
>  #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
>
>  static inline int split_folio_to_list_to_order(struct folio *folio,
> diff --git a/mm/shmem.c b/mm/shmem.c
> index 59cc26d44344..b50ddf013e37 100644
> --- a/mm/shmem.c
> +++ b/mm/shmem.c
> @@ -1611,6 +1611,106 @@ static gfp_t limit_gfp_mask(gfp_t huge_gfp, gfp_t limit_gfp)
>          return result;
>  }
>
> +#ifdef CONFIG_TRANSPARENT_HUGEPAGE
> +static unsigned long anon_shmem_allowable_huge_orders(struct inode *inode,
> +                                struct vm_area_struct *vma, pgoff_t index,
> +                                bool global_huge)
> +{
> +        unsigned long mask = READ_ONCE(huge_anon_shmem_orders_always);
> +        unsigned long within_size_orders = READ_ONCE(huge_anon_shmem_orders_within_size);
> +        unsigned long vm_flags = vma->vm_flags;
> +        /*
> +         * Check all the (large) orders below HPAGE_PMD_ORDER + 1 that
> +         * are enabled for this vma.
> +         */
> +        unsigned long orders = BIT(PMD_ORDER + 1) - 1;
> +        loff_t i_size;
> +        int order;
> +
> +        if ((vm_flags & VM_NOHUGEPAGE) ||
> +            test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags))
> +                return 0;
> +
> +        /* If the hardware/firmware marked hugepage support disabled. */
> +        if (transparent_hugepage_flags & (1 << TRANSPARENT_HUGEPAGE_UNSUPPORTED))
> +                return 0;
> +
> +        /*
> +         * Following the 'deny' semantics of the top level, force the huge
> +         * option off from all mounts.
> +         */
> +        if (shmem_huge == SHMEM_HUGE_DENY)
> +                return 0;
> +        /*
> +         * Only allow inherit orders if the top-level value is 'force', which
> +         * means non-PMD sized THP can not override 'huge' mount option now.
> +         */
> +        if (shmem_huge == SHMEM_HUGE_FORCE)
> +                return READ_ONCE(huge_anon_shmem_orders_inherit);
> +
> +        /* Allow mTHP that will be fully within i_size. */
> +        order = highest_order(within_size_orders);
> +        while (within_size_orders) {
> +                index = round_up(index + 1, 1 << order);
> +                i_size = round_up(i_size_read(inode), PAGE_SIZE);
> +                if (i_size >> PAGE_SHIFT >= index) {
> +                        mask |= within_size_orders;
> +                        break;
> +                }
> +
> +                order = next_order(&within_size_orders, order);
> +        }
> +
> +        if (vm_flags & VM_HUGEPAGE)
> +                mask |= READ_ONCE(huge_anon_shmem_orders_madvise);
> +
> +        if (global_huge)
> +                mask |= READ_ONCE(huge_anon_shmem_orders_inherit);
> +
> +        return orders & mask;
> +}
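
For readers following along: the mask returned here is consumed with the
highest_order()/next_order() helpers, which simply walk the set bits from
high to low. A minimal userspace sketch of that walk (mine; it assumes the
helpers keep their fls()-style kernel semantics):

#include <stdio.h>

static int highest_order(unsigned long orders)
{
        /* Index of the most significant set bit, like fls_long() - 1. */
        return 8 * sizeof(orders) - 1 - __builtin_clzl(orders);
}

static int next_order(unsigned long *orders, int prev)
{
        *orders &= ~(1UL << prev);
        return *orders ? highest_order(*orders) : 0;
}

int main(void)
{
        /* e.g. PMD order 9 plus two smaller mTHP orders enabled. */
        unsigned long orders = (1UL << 9) | (1UL << 4) | (1UL << 2);
        int order = highest_order(orders);

        while (orders) {
                printf("order %d (%lu pages)\n", order, 1UL << order);
                order = next_order(&orders, order);
        }
        return 0;
}
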
> +
> +static unsigned long anon_shmem_suitable_orders(struct inode *inode, struct vm_fault *vmf,
> +                                struct address_space *mapping, pgoff_t index,
> +                                unsigned long orders)
> +{
> +        struct vm_area_struct *vma = vmf->vma;
> +        unsigned long pages;
> +        int order;
> +
> +        orders = thp_vma_suitable_orders(vma, vmf->address, orders);
> +        if (!orders)
> +                return 0;
> +
> +        /* Find the highest order that can add into the page cache */
> +        order = highest_order(orders);
> +        while (orders) {
> +                pages = 1UL << order;
> +                index = round_down(index, pages);
> +                if (!xa_find(&mapping->i_pages, &index,
> +                             index + pages - 1, XA_PRESENT))
> +                        break;
> +                order = next_order(&orders, order);
> +        }
> +
> +        return orders;
> +}
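
Side note: the round_down() plus xa_find() pairing above probes exactly the
aligned index range an order-N folio would occupy. A tiny standalone
illustration of the range computation (my example; the values are made up):

#include <stdio.h>

int main(void)
{
        unsigned long index = 1234;     /* faulting page index */
        int order;

        for (order = 9; order >= 2; order -= 3) {
                unsigned long pages = 1UL << order;
                unsigned long start = index & ~(pages - 1);     /* round_down */

                printf("order %d: check [%lu, %lu] in the page cache\n",
                       order, start, start + pages - 1);
        }
        return 0;
}
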
> +#else
> +static unsigned long anon_shmem_allowable_huge_orders(struct inode *inode,
> +                                struct vm_area_struct *vma, pgoff_t index,
> +                                bool global_huge)
> +{
> +        return 0;
> +}
> +
> +static unsigned long anon_shmem_suitable_orders(struct inode *inode, struct vm_fault *vmf,
> +                                struct address_space *mapping, pgoff_t index,
> +                                unsigned long orders)
> +{
> +        return 0;
> +}
> +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
> +
>  static struct folio *shmem_alloc_hugefolio(gfp_t gfp,
>                  struct shmem_inode_info *info, pgoff_t index, int order)
>  {
> @@ -1639,38 +1739,55 @@ static struct folio *shmem_alloc_folio(gfp_t gfp,
>          return (struct folio *)page;
>  }
>
> -static struct folio *shmem_alloc_and_add_folio(gfp_t gfp,
> -                struct inode *inode, pgoff_t index,
> -                struct mm_struct *fault_mm, bool huge)
> +static struct folio *shmem_alloc_and_add_folio(struct vm_fault *vmf,
> +                gfp_t gfp, struct inode *inode, pgoff_t index,
> +                struct mm_struct *fault_mm, bool huge, unsigned long orders)
IMO, it might be cleaner to drop the 'huge' parameter and just set 'orders'
to BIT(HPAGE_PMD_ORDER); then we only need the 'orders' check :)
Likely:

if (orders > 0) {
        if (vma && vma_is_anon_shmem(vma)) {
                ...
        } else if (orders & BIT(HPAGE_PMD_ORDER)) {
                ...
        }
}
>  {
>          struct address_space *mapping = inode->i_mapping;
>          struct shmem_inode_info *info = SHMEM_I(inode);
> -        struct folio *folio;
> +        struct vm_area_struct *vma = vmf ? vmf->vma : NULL;
> +        unsigned long suitable_orders;
> +        struct folio *folio = NULL;
>          long pages;
> -        int error;
> +        int error, order;
>
>          if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
>                  huge = false;
Currently, if THP is disabled, 'huge' falls back to order 0, but 'orders'
does not, IIUC. How about making both consistent when THP is disabled?
if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) {
        huge = false;
        orders = 0;
}
Thanks,
Lance
>
> -        if (huge) {
> -                pages = HPAGE_PMD_NR;
> -                index = round_down(index, HPAGE_PMD_NR);
> +        if (huge || orders > 0) {
> +                if (vma && vma_is_anon_shmem(vma) && orders) {
> +                        suitable_orders = anon_shmem_suitable_orders(inode, vmf,
> +                                                        mapping, index, orders);
> +                } else {
> +                        pages = HPAGE_PMD_NR;
> +                        suitable_orders = BIT(HPAGE_PMD_ORDER);
> +                        index = round_down(index, HPAGE_PMD_NR);
>
> -                /*
> -                 * Check for conflict before waiting on a huge allocation.
> -                 * Conflict might be that a huge page has just been allocated
> -                 * and added to page cache by a racing thread, or that there
> -                 * is already at least one small page in the huge extent.
> -                 * Be careful to retry when appropriate, but not forever!
> -                 * Elsewhere -EEXIST would be the right code, but not here.
> -                 */
> -                if (xa_find(&mapping->i_pages, &index,
> +                        /*
> +                         * Check for conflict before waiting on a huge allocation.
> +                         * Conflict might be that a huge page has just been allocated
> +                         * and added to page cache by a racing thread, or that there
> +                         * is already at least one small page in the huge extent.
> +                         * Be careful to retry when appropriate, but not forever!
> +                         * Elsewhere -EEXIST would be the right code, but not here.
> +                         */
> +                        if (xa_find(&mapping->i_pages, &index,
>                                  index + HPAGE_PMD_NR - 1, XA_PRESENT))
> -                        return ERR_PTR(-E2BIG);
> +                                return ERR_PTR(-E2BIG);
> +                }
>
> -                folio = shmem_alloc_hugefolio(gfp, info, index, HPAGE_PMD_ORDER);
> -                if (!folio && pages == HPAGE_PMD_NR)
> -                        count_vm_event(THP_FILE_FALLBACK);
> +                order = highest_order(suitable_orders);
> +                while (suitable_orders) {
> +                        pages = 1 << order;
> +                        index = round_down(index, pages);
> +                        folio = shmem_alloc_hugefolio(gfp, info, index, order);
> +                        if (folio)
> +                                goto allocated;
> +
> +                        if (pages == HPAGE_PMD_NR)
> +                                count_vm_event(THP_FILE_FALLBACK);
> +                        order = next_order(&suitable_orders, order);
> +                }
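
The new loop above is a straight fallback walk: try the largest remaining
order, drop to the next enabled order on failure, and count
THP_FILE_FALLBACK only for the PMD-sized attempt. A self-contained
userspace sketch of the same pattern (my illustration; try_alloc() is a
made-up stand-in for shmem_alloc_hugefolio()):

#include <stdio.h>
#include <stdlib.h>

#define PMD_ORDER       9

/* Stand-in allocator: pretend anything above order 4 fails. */
static void *try_alloc(int order)
{
        return order > 4 ? NULL : malloc((1UL << order) * 4096);
}

int main(void)
{
        unsigned long suitable_orders = (1UL << PMD_ORDER) | (1UL << 4) | (1UL << 2);
        void *p = NULL;
        int order;

        /* Walk the bitmask from the highest set order downwards. */
        for (order = PMD_ORDER; order >= 0 && !p; order--) {
                if (!(suitable_orders & (1UL << order)))
                        continue;
                p = try_alloc(order);
                if (p)
                        printf("allocated order %d\n", order);
                else if (order == PMD_ORDER)
                        printf("would count THP_FILE_FALLBACK\n");
        }
        free(p);
        return 0;
}
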
>          } else {
>                  pages = 1;
>                  folio = shmem_alloc_folio(gfp, info, index);
> @@ -1678,6 +1795,7 @@ static struct folio *shmem_alloc_and_add_folio(gfp_t gfp,
>          if (!folio)
>                  return ERR_PTR(-ENOMEM);
>
> +allocated:
>          __folio_set_locked(folio);
>          __folio_set_swapbacked(folio);
>
> @@ -1972,7 +2090,8 @@ static int shmem_get_folio_gfp(struct inode *inode, pgoff_t index,
>          struct mm_struct *fault_mm;
>          struct folio *folio;
>          int error;
> -        bool alloced;
> +        bool alloced, huge;
> +        unsigned long orders = 0;
>
>          if (WARN_ON_ONCE(!shmem_mapping(inode->i_mapping)))
>                  return -EINVAL;
> @@ -2044,14 +2163,18 @@ static int shmem_get_folio_gfp(struct inode *inode, pgoff_t index,
>                  return 0;
>          }
>
> -        if (shmem_is_huge(inode, index, false, fault_mm,
> -                          vma ? vma->vm_flags : 0)) {
> +        huge = shmem_is_huge(inode, index, false, fault_mm,
> +                             vma ? vma->vm_flags : 0);
> +        /* Find hugepage orders that are allowed for anonymous shmem. */
> +        if (vma && vma_is_anon_shmem(vma))
> +                orders = anon_shmem_allowable_huge_orders(inode, vma, index, huge);
> +        if (huge || orders > 0) {
>                  gfp_t huge_gfp;
>
>                  huge_gfp = vma_thp_gfp_mask(vma);
>                  huge_gfp = limit_gfp_mask(huge_gfp, gfp);
> -                folio = shmem_alloc_and_add_folio(huge_gfp,
> -                                inode, index, fault_mm, true);
> +                folio = shmem_alloc_and_add_folio(vmf, huge_gfp,
> +                                inode, index, fault_mm, true, orders);
>                  if (!IS_ERR(folio)) {
>                          if (folio_test_pmd_mappable(folio))
>                                  count_vm_event(THP_FILE_ALLOC);
> @@ -2061,7 +2184,7 @@ static int shmem_get_folio_gfp(struct inode *inode, pgoff_t index,
>                          goto repeat;
>          }
>
> -        folio = shmem_alloc_and_add_folio(gfp, inode, index, fault_mm, false);
> +        folio = shmem_alloc_and_add_folio(vmf, gfp, inode, index, fault_mm, false, 0);
>          if (IS_ERR(folio)) {
>                  error = PTR_ERR(folio);
>                  if (error == -EEXIST)
> @@ -2072,7 +2195,7 @@ static int shmem_get_folio_gfp(struct inode *inode, pgoff_t index,
>
>  alloced:
>          alloced = true;
> -        if (folio_test_pmd_mappable(folio) &&
> +        if (folio_test_large(folio) &&
>              DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE) <
>                          folio_next_index(folio) - 1) {
>                  struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
> --
> 2.39.3
>