linux-kernel - Re: [PATCH 1/1] mm/vmscan: avoid split PMD-mapped THP during shrink_folio

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <2062c2d1-4ebb-4a40-89f9-3083e6912301@redhat.com>
Date: Wed, 17 Apr 2024 17:02:19 +0200
From: David Hildenbrand <david@...hat.com>
To: Lance Yang <ioworker0@...il.com>, akpm@...ux-foundation.org
Cc: willy@...radead.org, maskray@...gle.com, ziy@...dia.com,
 ryan.roberts@....com, 21cnbao@...il.com, mhocko@...e.com,
 fengwei.yin@...el.com, zokeefe@...gle.com, shy828301@...il.com,
 xiehuan09@...il.com, wangkefeng.wang@...wei.com, songmuchun@...edance.com,
 peterx@...hat.com, minchan@...nel.org, linux-mm@...ck.org,
 linux-kernel@...r.kernel.org
Subject: Re: [PATCH 1/1] mm/vmscan: avoid split PMD-mapped THP during
 shrink_folio_list()

On 17.04.24 16:11, Lance Yang wrote:
> When the user no longer requires the pages, they would use madvise(madv_free)
> to mark the pages as lazy free. IMO, they would not typically rewrite to the
> given range.
> 
> At present, a PMD-mapped THP marked as lazyfree during shrink_folio_list()
> is unconditionally split, which may be unnecessary. If the THP is exclusively
> mapped and clean, and the PMD associated with it is also clean, then we can
> attempt to remove the PMD mapping from it. This change will improve the
> efficiency of memory reclamation in this case.
> 
> On an Intel i5 CPU, reclaiming 1GiB of PMD-mapped THPs using
> mem_cgroup_force_empty() results in the following runtimes in seconds
> (shorter is better):
> 
> --------------------------------------------
> |     Old       |      New       |  Change  |
> --------------------------------------------
> |   0.683426    |    0.049197    |  -92.80% |
> --------------------------------------------
> 
> Signed-off-by: Lance Yang <ioworker0@...il.com>
> ---
>   include/linux/huge_mm.h |  1 +
>   include/linux/rmap.h    |  1 +
>   mm/huge_memory.c        |  2 +-
>   mm/rmap.c               | 81 +++++++++++++++++++++++++++++++++++++++++
>   mm/vmscan.c             |  7 ++++
>   5 files changed, 91 insertions(+), 1 deletion(-)
> 
> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
> index 7cd07b83a3d0..02a71c05f68a 100644
> --- a/include/linux/huge_mm.h
> +++ b/include/linux/huge_mm.h
> @@ -36,6 +36,7 @@ bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
>   int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
>   		    pmd_t *pmd, unsigned long addr, pgprot_t newprot,
>   		    unsigned long cp_flags);
> +inline void zap_deposited_table(struct mm_struct *mm, pmd_t *pmd);
>   
>   vm_fault_t vmf_insert_pfn_pmd(struct vm_fault *vmf, pfn_t pfn, bool write);
>   vm_fault_t vmf_insert_pfn_pud(struct vm_fault *vmf, pfn_t pfn, bool write);
> diff --git a/include/linux/rmap.h b/include/linux/rmap.h
> index 0f906dc6d280..8c2f45713351 100644
> --- a/include/linux/rmap.h
> +++ b/include/linux/rmap.h
> @@ -100,6 +100,7 @@ enum ttu_flags {
>   					 * do a final flush if necessary */
>   	TTU_RMAP_LOCKED		= 0x80,	/* do not grab rmap lock:
>   					 * caller holds it */
> +	TTU_LAZYFREE_THP	= 0x100, /* avoid split PMD-mapped THP */
>   };
>   
>   #ifdef CONFIG_MMU
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index 58f2c4745d80..309fba9624c2 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -1801,7 +1801,7 @@ bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
>   	return ret;
>   }
>   
> -static inline void zap_deposited_table(struct mm_struct *mm, pmd_t *pmd)
> +inline void zap_deposited_table(struct mm_struct *mm, pmd_t *pmd)
>   {
>   	pgtable_t pgtable;
>   
> diff --git a/mm/rmap.c b/mm/rmap.c
> index 2608c40dffad..4994f9e402d4 100644
> --- a/mm/rmap.c
> +++ b/mm/rmap.c
> @@ -77,6 +77,7 @@
>   #include <linux/mm_inline.h>
>   
>   #include <asm/tlbflush.h>
> +#include <asm/tlb.h>
>   
>   #define CREATE_TRACE_POINTS
>   #include <trace/events/tlb.h>
> @@ -1606,6 +1607,80 @@ void folio_remove_rmap_pmd(struct folio *folio, struct page *page,
>   #endif
>   }
>   
> +static bool __try_to_unmap_lazyfree_thp(struct vm_area_struct *vma,
> +					     unsigned long address,
> +					     struct folio *folio)
> +{
> +	spinlock_t *ptl;
> +	pmd_t *pmdp, orig_pmd;
> +	struct mmu_notifier_range range;
> +	struct mmu_gather tlb;
> +	struct mm_struct *mm = vma->vm_mm;
> +	struct page *page;
> +	bool ret = false;
> +
> +	VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio);
> +	VM_WARN_ON_FOLIO(folio_test_swapbacked(folio), folio);
> +	VM_WARN_ON_FOLIO(!folio_test_pmd_mappable(folio), folio);
> +	VM_WARN_ON_FOLIO(!folio_test_locked(folio), folio);
> +
> +	/*
> +	 * If we encounter a PMD-mapped THP that marked as lazyfree, we
> +	 * will try to unmap it without splitting.
> +	 *
> +	 * The folio exclusively mapped should only have two refs:
> +	 * one from the isolation and one from the rmap.
> +	 */
> +	if (folio_entire_mapcount(folio) != 1 || folio_test_dirty(folio) ||
> +	    folio_ref_count(folio) != 2)

folio_mapcount() == 1 is a bit nicer. But I assume you can drop that 
completely and only check the refcount?

> +		return false;
> +
> +	pmdp = mm_find_pmd(mm, address);
> +	if (unlikely(!pmdp))
> +		return false;
> +	if (pmd_dirty(*pmdp))
> +		return false;
> +
> +	tlb_gather_mmu(&tlb, mm);
> +	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm,
> +				address & HPAGE_PMD_MASK,
> +				(address & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE);
> +	mmu_notifier_invalidate_range_start(&range);
> +
> +	ptl = pmd_lock(mm, pmdp);
> +	orig_pmd = *pmdp;
> +	if (unlikely(!pmd_present(orig_pmd) || !pmd_trans_huge(orig_pmd)))
> +		goto out;
> +
> +	page = pmd_page(orig_pmd);
> +	if (unlikely(page_folio(page) != folio))
> +		goto out;
> +
> +	orig_pmd = pmdp_huge_get_and_clear(mm, address, pmdp);
> +	tlb_remove_pmd_tlb_entry(&tlb, pmdp, address);

Until this point, the page could have been pinned (including GUP-fast) 
and we might be in trouble if we drop it.

-- 
Cheers,

David / dhildenb