Message-ID: <CAK1f24m1LAK-+GKeTSGANXXfk3KahmpN7KvO8FCkA+2cdNtfzw@mail.gmail.com>
Date: Tue, 14 May 2024 21:36:34 +0800
From: Lance Yang <ioworker0@...il.com>
To: Baolin Wang <baolin.wang@...ux.alibaba.com>
Cc: akpm@...ux-foundation.org, hughd@...gle.com, willy@...radead.org,
david@...hat.com, wangkefeng.wang@...wei.com, ying.huang@...el.com,
21cnbao@...il.com, ryan.roberts@....com, shy828301@...il.com, ziy@...dia.com,
linux-mm@...ck.org, linux-kernel@...r.kernel.org
Subject: Re: [PATCH v2 5/7] mm: shmem: add mTHP support for anonymous shmem
Hi Baolin,
On Mon, May 13, 2024 at 1:08 PM Baolin Wang
<baolin.wang@...ux.alibaba.com> wrote:
>
> Commit 19eaf44954df added multi-size THP (mTHP) for anonymous pages,
> allowing THP to be configured through the sysfs interface located at
> '/sys/kernel/mm/transparent_hugepage/hugepage-XXkb/enabled'.
>
> However, anonymous shared pages ignore the anonymous mTHP rules
> configured through the sysfs interface and can only use PMD-mapped
> THP, which is not reasonable. Users expect the mTHP rules to apply to
> all anonymous pages, including anonymous shared pages, so that they
> can enjoy the benefits of mTHP: for example, lower latency and smaller
> memory bloat than PMD-mapped THP, and contiguous PTEs on ARM
> architectures to reduce TLB misses.
>
> The primary strategy is similar to the one used for anonymous mTHP.
> Introduce a new interface
> '/sys/kernel/mm/transparent_hugepage/hugepage-XXkb/shmem_enabled',
> which accepts all the same values as the top-level
> '/sys/kernel/mm/transparent_hugepage/shmem_enabled', plus a new
> "inherit" option. By default all sizes are set to "never" except PMD
> size, which is set to "inherit". This preserves backward compatibility
> with the top-level anonymous shmem setting, while also allowing
> independent control of anonymous shmem enablement for each mTHP size.
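
As a practical aside (not part of the patch): here is a quick userspace
sketch I find handy for eyeballing the resulting defaults. It just dumps
every per-size shmem_enabled knob, assuming the per-size directories keep
the existing 'hugepage-<size>kB' naming:

#include <glob.h>
#include <stdio.h>

int main(void)
{
        glob_t g;
        size_t i;

        /* One shmem_enabled file per supported mTHP size. */
        if (glob("/sys/kernel/mm/transparent_hugepage/hugepage-*/shmem_enabled",
                 0, NULL, &g))
                return 1;

        for (i = 0; i < g.gl_pathc; i++) {
                char buf[128] = "";
                FILE *f = fopen(g.gl_pathv[i], "r");

                if (!f)
                        continue;
                if (fgets(buf, sizeof(buf), f))
                        printf("%s: %s", g.gl_pathv[i], buf);
                fclose(f);
        }
        globfree(&g);
        return 0;
}
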
>
> Signed-off-by: Baolin Wang <baolin.wang@...ux.alibaba.com>
> ---
>  include/linux/huge_mm.h |  10 +++
>  mm/shmem.c              | 179 +++++++++++++++++++++++++++++++++-------
>  2 files changed, 161 insertions(+), 28 deletions(-)
>
> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
> index 1fce6fee7766..b5339210268d 100644
> --- a/include/linux/huge_mm.h
> +++ b/include/linux/huge_mm.h
> @@ -583,6 +583,16 @@ static inline bool thp_migration_supported(void)
>  {
>          return false;
>  }
> +
> +static inline int highest_order(unsigned long orders)
> +{
> +        return 0;
> +}
> +
> +static inline int next_order(unsigned long *orders, int prev)
> +{
> +        return 0;
> +}
>  #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
>
>  static inline int split_folio_to_list_to_order(struct folio *folio,
> diff --git a/mm/shmem.c b/mm/shmem.c
> index 59cc26d44344..b50ddf013e37 100644
> --- a/mm/shmem.c
> +++ b/mm/shmem.c
> @@ -1611,6 +1611,106 @@ static gfp_t limit_gfp_mask(gfp_t huge_gfp, gfp_t limit_gfp)
>          return result;
>  }
>
> +#ifdef CONFIG_TRANSPARENT_HUGEPAGE
> +static unsigned long anon_shmem_allowable_huge_orders(struct inode *inode,
> +                                struct vm_area_struct *vma, pgoff_t index,
> +                                bool global_huge)
> +{
> +        unsigned long mask = READ_ONCE(huge_anon_shmem_orders_always);
> +        unsigned long within_size_orders = READ_ONCE(huge_anon_shmem_orders_within_size);
> +        unsigned long vm_flags = vma->vm_flags;
> +        /*
> +         * Check all the (large) orders below HPAGE_PMD_ORDER + 1 that
> +         * are enabled for this vma.
> +         */
> +        unsigned long orders = BIT(PMD_ORDER + 1) - 1;
> +        loff_t i_size;
> +        int order;
> +
> +        if ((vm_flags & VM_NOHUGEPAGE) ||
> +            test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags))
> +                return 0;
> +
> +        /* If the hardware/firmware marked hugepage support disabled. */
> +        if (transparent_hugepage_flags & (1 << TRANSPARENT_HUGEPAGE_UNSUPPORTED))
> +                return 0;
> +
> +        /*
> +         * Following the 'deny' semantics of the top level, force the huge
> +         * option off from all mounts.
> +         */
> +        if (shmem_huge == SHMEM_HUGE_DENY)
> +                return 0;
> +        /*
> +         * Only allow inherit orders if the top-level value is 'force', which
> +         * means non-PMD sized THP can not override 'huge' mount option now.
> +         */
> +        if (shmem_huge == SHMEM_HUGE_FORCE)
> +                return READ_ONCE(huge_anon_shmem_orders_inherit);
> +
> +        /* Allow mTHP that will be fully within i_size. */
> +        order = highest_order(within_size_orders);
> +        while (within_size_orders) {
> +                index = round_up(index + 1, 1 << order);
> +                i_size = round_up(i_size_read(inode), PAGE_SIZE);
> +                if (i_size >> PAGE_SHIFT >= index) {
> +                        mask |= within_size_orders;
> +                        break;
> +                }
> +
> +                order = next_order(&within_size_orders, order);
> +        }
> +
> +        if (vm_flags & VM_HUGEPAGE)
> +                mask |= READ_ONCE(huge_anon_shmem_orders_madvise);
> +
> +        if (global_huge)
> +                mask |= READ_ONCE(huge_anon_shmem_orders_inherit);
> +
> +        return orders & mask;
> +}
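
For readers following along: the mask returned here is consumed with the
highest_order()/next_order() helpers, which simply walk the set bits from
high to low. A minimal userspace sketch of that walk (mine; it assumes the
helpers keep their fls()-style kernel semantics):

#include <stdio.h>

static int highest_order(unsigned long orders)
{
        /* Index of the most significant set bit, like fls_long() - 1. */
        return 8 * sizeof(orders) - 1 - __builtin_clzl(orders);
}

static int next_order(unsigned long *orders, int prev)
{
        *orders &= ~(1UL << prev);
        return *orders ? highest_order(*orders) : 0;
}

int main(void)
{
        /* e.g. PMD order 9 plus two smaller mTHP orders enabled. */
        unsigned long orders = (1UL << 9) | (1UL << 4) | (1UL << 2);
        int order = highest_order(orders);

        while (orders) {
                printf("order %d (%lu pages)\n", order, 1UL << order);
                order = next_order(&orders, order);
        }
        return 0;
}
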
> +
> +static unsigned long anon_shmem_suitable_orders(struct inode *inode, struct vm_fault *vmf,
> +                                struct address_space *mapping, pgoff_t index,
> +                                unsigned long orders)
> +{
> +        struct vm_area_struct *vma = vmf->vma;
> +        unsigned long pages;
> +        int order;
> +
> +        orders = thp_vma_suitable_orders(vma, vmf->address, orders);
> +        if (!orders)
> +                return 0;
> +
> +        /* Find the highest order that can add into the page cache */
> +        order = highest_order(orders);
> +        while (orders) {
> +                pages = 1UL << order;
> +                index = round_down(index, pages);
> +                if (!xa_find(&mapping->i_pages, &index,
> +                             index + pages - 1, XA_PRESENT))
> +                        break;
> +                order = next_order(&orders, order);
> +        }
> +
> +        return orders;
> +}
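
Side note: the round_down() plus xa_find() pairing above probes exactly the
aligned index range an order-N folio would occupy. A tiny standalone
illustration of the range computation (my example; the values are made up):

#include <stdio.h>

int main(void)
{
        unsigned long index = 1234;     /* faulting page index */
        int order;

        for (order = 9; order >= 2; order -= 3) {
                unsigned long pages = 1UL << order;
                unsigned long start = index & ~(pages - 1);     /* round_down */

                printf("order %d: check [%lu, %lu] in the page cache\n",
                       order, start, start + pages - 1);
        }
        return 0;
}
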
> +#else
> +static unsigned long anon_shmem_allowable_huge_orders(struct inode *inode,
> +                                struct vm_area_struct *vma, pgoff_t index,
> +                                bool global_huge)
> +{
> +        return 0;
> +}
> +
> +static unsigned long anon_shmem_suitable_orders(struct inode *inode, struct vm_fault *vmf,
> +                                struct address_space *mapping, pgoff_t index,
> +                                unsigned long orders)
> +{
> +        return 0;
> +}
> +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
> +
>  static struct folio *shmem_alloc_hugefolio(gfp_t gfp,
>                  struct shmem_inode_info *info, pgoff_t index, int order)
>  {
> @@ -1639,38 +1739,55 @@ static struct folio *shmem_alloc_folio(gfp_t gfp,
>          return (struct folio *)page;
>  }
>
> -static struct folio *shmem_alloc_and_add_folio(gfp_t gfp,
> -                struct inode *inode, pgoff_t index,
> -                struct mm_struct *fault_mm, bool huge)
> +static struct folio *shmem_alloc_and_add_folio(struct vm_fault *vmf,
> +                gfp_t gfp, struct inode *inode, pgoff_t index,
> +                struct mm_struct *fault_mm, bool huge, unsigned long orders)
IMO, it might be cleaner to drop the 'huge' parameter and just set 'orders'
to BIT(HPAGE_PMD_ORDER); then we only need the 'orders' check :)
Likely:

if (orders > 0) {
        if (vma && vma_is_anon_shmem(vma)) {
                ...
        } else if (orders & BIT(HPAGE_PMD_ORDER)) {
                ...
        }
}
>  {
>          struct address_space *mapping = inode->i_mapping;
>          struct shmem_inode_info *info = SHMEM_I(inode);
> -        struct folio *folio;
> +        struct vm_area_struct *vma = vmf ? vmf->vma : NULL;
> +        unsigned long suitable_orders;
> +        struct folio *folio = NULL;
>          long pages;
> -        int error;
> +        int error, order;
>
>          if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
>                  huge = false;
Currently, if THP is disabled, 'huge' falls back to order 0, but 'orders'
does not, IIUC. How about making both consistent when THP is disabled?
if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) {
        huge = false;
        orders = 0;
}
Thanks,
Lance
>
> -        if (huge) {
> -                pages = HPAGE_PMD_NR;
> -                index = round_down(index, HPAGE_PMD_NR);
> +        if (huge || orders > 0) {
> +                if (vma && vma_is_anon_shmem(vma) && orders) {
> +                        suitable_orders = anon_shmem_suitable_orders(inode, vmf,
> +                                                        mapping, index, orders);
> +                } else {
> +                        pages = HPAGE_PMD_NR;
> +                        suitable_orders = BIT(HPAGE_PMD_ORDER);
> +                        index = round_down(index, HPAGE_PMD_NR);
>
> -                /*
> -                 * Check for conflict before waiting on a huge allocation.
> -                 * Conflict might be that a huge page has just been allocated
> -                 * and added to page cache by a racing thread, or that there
> -                 * is already at least one small page in the huge extent.
> -                 * Be careful to retry when appropriate, but not forever!
> -                 * Elsewhere -EEXIST would be the right code, but not here.
> -                 */
> -                if (xa_find(&mapping->i_pages, &index,
> +                        /*
> +                         * Check for conflict before waiting on a huge allocation.
> +                         * Conflict might be that a huge page has just been allocated
> +                         * and added to page cache by a racing thread, or that there
> +                         * is already at least one small page in the huge extent.
> +                         * Be careful to retry when appropriate, but not forever!
> +                         * Elsewhere -EEXIST would be the right code, but not here.
> +                         */
> +                        if (xa_find(&mapping->i_pages, &index,
>                                  index + HPAGE_PMD_NR - 1, XA_PRESENT))
> -                        return ERR_PTR(-E2BIG);
> +                                return ERR_PTR(-E2BIG);
> +                }
>
> -                folio = shmem_alloc_hugefolio(gfp, info, index, HPAGE_PMD_ORDER);
> -                if (!folio && pages == HPAGE_PMD_NR)
> -                        count_vm_event(THP_FILE_FALLBACK);
> +                order = highest_order(suitable_orders);
> +                while (suitable_orders) {
> +                        pages = 1 << order;
> +                        index = round_down(index, pages);
> +                        folio = shmem_alloc_hugefolio(gfp, info, index, order);
> +                        if (folio)
> +                                goto allocated;
> +
> +                        if (pages == HPAGE_PMD_NR)
> +                                count_vm_event(THP_FILE_FALLBACK);
> +                        order = next_order(&suitable_orders, order);
> +                }
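
The new loop above is a straight fallback walk: try the largest remaining
order, drop to the next enabled order on failure, and count
THP_FILE_FALLBACK only for the PMD-sized attempt. A self-contained
userspace sketch of the same pattern (my illustration; try_alloc() is a
made-up stand-in for shmem_alloc_hugefolio()):

#include <stdio.h>
#include <stdlib.h>

#define PMD_ORDER       9

/* Stand-in allocator: pretend anything above order 4 fails. */
static void *try_alloc(int order)
{
        return order > 4 ? NULL : malloc((1UL << order) * 4096);
}

int main(void)
{
        unsigned long suitable_orders = (1UL << PMD_ORDER) | (1UL << 4) | (1UL << 2);
        void *p = NULL;
        int order;

        /* Walk the bitmask from the highest set order downwards. */
        for (order = PMD_ORDER; order >= 0 && !p; order--) {
                if (!(suitable_orders & (1UL << order)))
                        continue;
                p = try_alloc(order);
                if (p)
                        printf("allocated order %d\n", order);
                else if (order == PMD_ORDER)
                        printf("would count THP_FILE_FALLBACK\n");
        }
        free(p);
        return 0;
}
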
>          } else {
>                  pages = 1;
>                  folio = shmem_alloc_folio(gfp, info, index);
> @@ -1678,6 +1795,7 @@ static struct folio *shmem_alloc_and_add_folio(gfp_t gfp,
>          if (!folio)
>                  return ERR_PTR(-ENOMEM);
>
> +allocated:
>          __folio_set_locked(folio);
>          __folio_set_swapbacked(folio);
>
> @@ -1972,7 +2090,8 @@ static int shmem_get_folio_gfp(struct inode *inode, pgoff_t index,
>          struct mm_struct *fault_mm;
>          struct folio *folio;
>          int error;
> -        bool alloced;
> +        bool alloced, huge;
> +        unsigned long orders = 0;
>
>          if (WARN_ON_ONCE(!shmem_mapping(inode->i_mapping)))
>                  return -EINVAL;
> @@ -2044,14 +2163,18 @@ static int shmem_get_folio_gfp(struct inode *inode, pgoff_t index,
>                  return 0;
>          }
>
> -        if (shmem_is_huge(inode, index, false, fault_mm,
> -                          vma ? vma->vm_flags : 0)) {
> +        huge = shmem_is_huge(inode, index, false, fault_mm,
> +                             vma ? vma->vm_flags : 0);
> +        /* Find hugepage orders that are allowed for anonymous shmem. */
> +        if (vma && vma_is_anon_shmem(vma))
> +                orders = anon_shmem_allowable_huge_orders(inode, vma, index, huge);
> +        if (huge || orders > 0) {
>                  gfp_t huge_gfp;
>
>                  huge_gfp = vma_thp_gfp_mask(vma);
>                  huge_gfp = limit_gfp_mask(huge_gfp, gfp);
> -                folio = shmem_alloc_and_add_folio(huge_gfp,
> -                                inode, index, fault_mm, true);
> +                folio = shmem_alloc_and_add_folio(vmf, huge_gfp,
> +                                inode, index, fault_mm, true, orders);
>                  if (!IS_ERR(folio)) {
>                          if (folio_test_pmd_mappable(folio))
>                                  count_vm_event(THP_FILE_ALLOC);
> @@ -2061,7 +2184,7 @@ static int shmem_get_folio_gfp(struct inode *inode, pgoff_t index,
>                          goto repeat;
>          }
>
> -        folio = shmem_alloc_and_add_folio(gfp, inode, index, fault_mm, false);
> +        folio = shmem_alloc_and_add_folio(vmf, gfp, inode, index, fault_mm, false, 0);
>          if (IS_ERR(folio)) {
>                  error = PTR_ERR(folio);
>                  if (error == -EEXIST)
> @@ -2072,7 +2195,7 @@ static int shmem_get_folio_gfp(struct inode *inode, pgoff_t index,
>
>  alloced:
>          alloced = true;
> -        if (folio_test_pmd_mappable(folio) &&
> +        if (folio_test_large(folio) &&
>              DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE) <
>                          folio_next_index(folio) - 1) {
>                  struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
> --
> 2.39.3
>