[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <097741c8-b2ea-4869-8792-f9663019137c@arm.com>
Date: Tue, 17 Dec 2024 19:24:17 +0000
From: Ryan Roberts <ryan.roberts@....com>
To: Dev Jain <dev.jain@....com>, akpm@...ux-foundation.org, david@...hat.com,
willy@...radead.org, kirill.shutemov@...ux.intel.com
Cc: anshuman.khandual@....com, catalin.marinas@....com, cl@...two.org,
vbabka@...e.cz, mhocko@...e.com, apopple@...dia.com,
dave.hansen@...ux.intel.com, will@...nel.org, baohua@...nel.org,
jack@...e.cz, srivatsa@...il.mit.edu, haowenchao22@...il.com,
hughd@...gle.com, aneesh.kumar@...nel.org, yang@...amperecomputing.com,
peterx@...hat.com, ioworker0@...il.com, wangkefeng.wang@...wei.com,
ziy@...dia.com, jglisse@...gle.com, surenb@...gle.com,
vishal.moola@...il.com, zokeefe@...gle.com, zhengqi.arch@...edance.com,
jhubbard@...dia.com, 21cnbao@...il.com, linux-mm@...ck.org,
linux-kernel@...r.kernel.org
Subject: Re: [RFC PATCH 08/12] khugepaged: Abstract PMD-THP collapse
On 16/12/2024 16:51, Dev Jain wrote:
> Abstract away taking the mmap_lock exclusively, copying page contents, and
> setting the PMD, into vma_collapse_anon_folio_pmd().
>
> Signed-off-by: Dev Jain <dev.jain@....com>
> ---
> mm/khugepaged.c | 119 +++++++++++++++++++++++++++---------------------
> 1 file changed, 66 insertions(+), 53 deletions(-)
>
> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> index 078794aa3335..88beebef773e 100644
> --- a/mm/khugepaged.c
> +++ b/mm/khugepaged.c
> @@ -1111,58 +1111,17 @@ static int alloc_charge_folio(struct folio **foliop, struct mm_struct *mm,
> return SCAN_SUCCEED;
> }
>
> -static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
> - int referenced, int unmapped, int order,
> - struct collapse_control *cc)
> +static int vma_collapse_anon_folio_pmd(struct mm_struct *mm, unsigned long address,
> + struct vm_area_struct *vma, struct collapse_control *cc, pmd_t *pmd,
> + struct folio *folio)
> {
> + struct mmu_notifier_range range;
> + spinlock_t *pmd_ptl, *pte_ptl;
> LIST_HEAD(compound_pagelist);
> - pmd_t *pmd, _pmd;
> - pte_t *pte;
> pgtable_t pgtable;
> - struct folio *folio;
> - spinlock_t *pmd_ptl, *pte_ptl;
> - int result = SCAN_FAIL;
> - struct vm_area_struct *vma;
> - struct mmu_notifier_range range;
> -
> - VM_BUG_ON(address & ~HPAGE_PMD_MASK);
> -
> - /*
> - * Before allocating the hugepage, release the mmap_lock read lock.
> - * The allocation can take potentially a long time if it involves
> - * sync compaction, and we do not need to hold the mmap_lock during
> - * that. We will recheck the vma after taking it again in write mode.
> - */
> - mmap_read_unlock(mm);
> -
> - result = alloc_charge_folio(&folio, mm, order, cc);
> - if (result != SCAN_SUCCEED)
> - goto out_nolock;
> -
> - mmap_read_lock(mm);
> - result = hugepage_vma_revalidate(mm, address, true, &vma, order, cc);
> - if (result != SCAN_SUCCEED) {
> - mmap_read_unlock(mm);
> - goto out_nolock;
> - }
> -
> - result = find_pmd_or_thp_or_none(mm, address, &pmd);
> - if (result != SCAN_SUCCEED) {
> - mmap_read_unlock(mm);
> - goto out_nolock;
> - }
> -
> - if (unmapped) {
> - /*
> - * __collapse_huge_page_swapin will return with mmap_lock
> - * released when it fails. So we jump out_nolock directly in
> - * that case. Continuing to collapse causes inconsistency.
> - */
> - result = __collapse_huge_page_swapin(mm, vma, address, pmd,
> - referenced, order);
> - if (result != SCAN_SUCCEED)
> - goto out_nolock;
> - }
> + int result;
> + pmd_t _pmd;
> + pte_t *pte;
>
> mmap_read_unlock(mm);
> /*
> @@ -1174,7 +1133,8 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
> * mmap_lock.
> */
> mmap_write_lock(mm);
> - result = hugepage_vma_revalidate(mm, address, true, &vma, order, cc);
> +
> + result = hugepage_vma_revalidate(mm, address, true, &vma, HPAGE_PMD_ORDER, cc);
> if (result != SCAN_SUCCEED)
> goto out_up_write;
> /* check if the pmd is still valid */
> @@ -1206,7 +1166,7 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
> pte = pte_offset_map_lock(mm, &_pmd, address, &pte_ptl);
> if (pte) {
> result = __collapse_huge_page_isolate(vma, address, pte, cc,
> - &compound_pagelist, order);
> + &compound_pagelist, HPAGE_PMD_ORDER);
> spin_unlock(pte_ptl);
> } else {
> result = SCAN_PMD_NULL;
> @@ -1262,11 +1222,64 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
> deferred_split_folio(folio, false);
> spin_unlock(pmd_ptl);
>
> - folio = NULL;
> -
> result = SCAN_SUCCEED;
> out_up_write:
> mmap_write_unlock(mm);
> + return result;
> +}
> +
> +static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
> + int referenced, int unmapped, int order,
> + struct collapse_control *cc)
> +{
> + struct vm_area_struct *vma;
> + int result = SCAN_FAIL;
> + struct folio *folio;
> + pmd_t *pmd;
> +
> + /*
> + * Before allocating the hugepage, release the mmap_lock read lock.
> + * The allocation can take potentially a long time if it involves
> + * sync compaction, and we do not need to hold the mmap_lock during
> + * that. We will recheck the vma after taking it again in write mode.
> + */
> + mmap_read_unlock(mm);
> +
> + result = alloc_charge_folio(&folio, mm, order, cc);
> + if (result != SCAN_SUCCEED)
> + goto out_nolock;
> +
> + mmap_read_lock(mm);
> + result = hugepage_vma_revalidate(mm, address, true, &vma, order, cc);
> + if (result != SCAN_SUCCEED) {
> + mmap_read_unlock(mm);
> + goto out_nolock;
> + }
> +
> + result = find_pmd_or_thp_or_none(mm, address, &pmd);
> + if (result != SCAN_SUCCEED) {
> + mmap_read_unlock(mm);
> + goto out_nolock;
> + }
> +
> + if (unmapped) {
> + /*
> + * __collapse_huge_page_swapin will return with mmap_lock
> + * released when it fails. So we jump out_nolock directly in
> + * that case. Continuing to collapse causes inconsistency.
> + */
> + result = __collapse_huge_page_swapin(mm, vma, address, pmd,
> + referenced, order);
> + if (result != SCAN_SUCCEED)
> + goto out_nolock;
> + }
> +
> + if (order == HPAGE_PMD_ORDER)
> + result = vma_collapse_anon_folio_pmd(mm, address, vma, cc, pmd, folio);
I think the locking is broken here? collapse_huge_page() used to enter with the
mmamp read lock and exit without the lock held at all. After the change, this is
only true for order == HPAGE_PMD_ORDER. For other orders, you exit with the mmap
read lock still held. Perhaps:
else
mmap_read_unlock(mm);
> +
> + if (result == SCAN_SUCCEED)
> + folio = NULL;
> +
> out_nolock:
> if (folio)
> folio_put(folio);
Powered by blists - more mailing lists