linux-kernel - Re: [PATCH] thp: Simplify splitting PMD mapping huge zero page

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives

Hash Suite: Windows password security audit tool. GUI, reports in PDF.

[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]

Message-ID: <1437F525-1510-45CD-A67F-13DF525083B4@nvidia.com>
Date:   Fri, 27 Mar 2020 13:23:07 -0400
From:   Zi Yan <ziy@...dia.com>
To:     "Kirill A. Shutemov" <kirill@...temov.name>
CC:     <akpm@...ux-foundation.org>, <linux-mm@...ck.org>,
        <linux-kernel@...r.kernel.org>,
        "Kirill A. Shutemov" <kirill.shutemov@...ux.intel.com>
Subject: Re: [PATCH] thp: Simplify splitting PMD mapping huge zero page

On 27 Mar 2020, at 13:03, Kirill A. Shutemov wrote:

> Splitting PMD mapping huge zero page can be simplified a lot: we can
> just unmap it and fallback to PTE handling.

So we will have an extra page fault for the first read to each subpage, but nothing changes if the first access to a subpage is a write, right? BTW, what is the motivation for this code simplification?

Thanks.

>
> Signed-off-by: Kirill A. Shutemov <kirill.shutemov@...ux.intel.com>
> ---
>  mm/huge_memory.c | 57 ++++--------------------------------------------
>  1 file changed, 4 insertions(+), 53 deletions(-)
>
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index 42407e16bd80..ef6a6bcb291f 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -2114,40 +2114,6 @@ void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud,
>  }
>  #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
>
> -static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
> -		unsigned long haddr, pmd_t *pmd)
> -{
> -	struct mm_struct *mm = vma->vm_mm;
> -	pgtable_t pgtable;
> -	pmd_t _pmd;
> -	int i;
> -
> -	/*
> -	 * Leave pmd empty until pte is filled note that it is fine to delay
> -	 * notification until mmu_notifier_invalidate_range_end() as we are
> -	 * replacing a zero pmd write protected page with a zero pte write
> -	 * protected page.
> -	 *
> -	 * See Documentation/vm/mmu_notifier.rst
> -	 */
> -	pmdp_huge_clear_flush(vma, haddr, pmd);
> -
> -	pgtable = pgtable_trans_huge_withdraw(mm, pmd);
> -	pmd_populate(mm, &_pmd, pgtable);
> -
> -	for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
> -		pte_t *pte, entry;
> -		entry = pfn_pte(my_zero_pfn(haddr), vma->vm_page_prot);
> -		entry = pte_mkspecial(entry);
> -		pte = pte_offset_map(&_pmd, haddr);
> -		VM_BUG_ON(!pte_none(*pte));
> -		set_pte_at(mm, haddr, pte, entry);
> -		pte_unmap(pte);
> -	}
> -	smp_wmb(); /* make pte visible before pmd */
> -	pmd_populate(mm, pmd, pgtable);
> -}
> -
>  static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
>  		unsigned long haddr, bool freeze)
>  {
> @@ -2167,7 +2133,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
>
>  	count_vm_event(THP_SPLIT_PMD);
>
> -	if (!vma_is_anonymous(vma)) {
> +	if (!vma_is_anonymous(vma) || is_huge_zero_pmd(*pmd)) {
>  		_pmd = pmdp_huge_clear_flush_notify(vma, haddr, pmd);
>  		/*
>  		 * We are going to unmap this huge page. So
> @@ -2175,7 +2141,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
>  		 */
>  		if (arch_needs_pgtable_deposit())
>  			zap_deposited_table(mm, pmd);
> -		if (vma_is_dax(vma))
> +		if (vma_is_dax(vma) || is_huge_zero_pmd(*pmd))
>  			return;
>  		page = pmd_page(_pmd);
>  		if (!PageDirty(page) && pmd_dirty(_pmd))
> @@ -2186,17 +2152,6 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
>  		put_page(page);
>  		add_mm_counter(mm, mm_counter_file(page), -HPAGE_PMD_NR);
>  		return;
> -	} else if (is_huge_zero_pmd(*pmd)) {
> -		/*
> -		 * FIXME: Do we want to invalidate secondary mmu by calling
> -		 * mmu_notifier_invalidate_range() see comments below inside
> -		 * __split_huge_pmd() ?
> -		 *
> -		 * We are going from a zero huge page write protected to zero
> -		 * small page also write protected so it does not seems useful
> -		 * to invalidate secondary mmu at this time.
> -		 */
> -		return __split_huge_zero_page_pmd(vma, haddr, pmd);
>  	}
>
>  	/*
> @@ -2339,13 +2294,9 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
>  	spin_unlock(ptl);
>  	/*
>  	 * No need to double call mmu_notifier->invalidate_range() callback.
> -	 * They are 3 cases to consider inside __split_huge_pmd_locked():
> +	 * They are 2 cases to consider inside __split_huge_pmd_locked():
>  	 *  1) pmdp_huge_clear_flush_notify() call invalidate_range() obvious
> -	 *  2) __split_huge_zero_page_pmd() read only zero page and any write
> -	 *    fault will trigger a flush_notify before pointing to a new page
> -	 *    (it is fine if the secondary mmu keeps pointing to the old zero
> -	 *    page in the meantime)
> -	 *  3) Split a huge pmd into pte pointing to the same page. No need
> +	 *  2) Split a huge pmd into pte pointing to the same page. No need
>  	 *     to invalidate secondary tlb entry they are all still valid.
>  	 *     any further changes to individual pte will notify. So no need
>  	 *     to call mmu_notifier->invalidate_range()
> -- 
> 2.26.0


—
Best Regards,
Yan Zi

Download attachment "signature.asc" of type "application/pgp-signature" (855 bytes)