[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <999a569d-9424-41b0-b0c8-d028aee2985f@arm.com>
Date: Wed, 18 Dec 2024 14:56:35 +0530
From: Dev Jain <dev.jain@....com>
To: Ryan Roberts <ryan.roberts@....com>, akpm@...ux-foundation.org,
david@...hat.com, willy@...radead.org, kirill.shutemov@...ux.intel.com
Cc: anshuman.khandual@....com, catalin.marinas@....com, cl@...two.org,
vbabka@...e.cz, mhocko@...e.com, apopple@...dia.com,
dave.hansen@...ux.intel.com, will@...nel.org, baohua@...nel.org,
jack@...e.cz, srivatsa@...il.mit.edu, haowenchao22@...il.com,
hughd@...gle.com, aneesh.kumar@...nel.org, yang@...amperecomputing.com,
peterx@...hat.com, ioworker0@...il.com, wangkefeng.wang@...wei.com,
ziy@...dia.com, jglisse@...gle.com, surenb@...gle.com,
vishal.moola@...il.com, zokeefe@...gle.com, zhengqi.arch@...edance.com,
jhubbard@...dia.com, 21cnbao@...il.com, linux-mm@...ck.org,
linux-kernel@...r.kernel.org
Subject: Re: [RFC PATCH 08/12] khugepaged: Abstract PMD-THP collapse
On 18/12/24 12:54 am, Ryan Roberts wrote:
> On 16/12/2024 16:51, Dev Jain wrote:
>> Abstract away taking the mmap_lock exclusively, copying page contents, and
>> setting the PMD, into vma_collapse_anon_folio_pmd().
>>
>> Signed-off-by: Dev Jain <dev.jain@....com>
>> ---
>> mm/khugepaged.c | 119 +++++++++++++++++++++++++++---------------------
>> 1 file changed, 66 insertions(+), 53 deletions(-)
>>
>> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
>> index 078794aa3335..88beebef773e 100644
>> --- a/mm/khugepaged.c
>> +++ b/mm/khugepaged.c
>> @@ -1111,58 +1111,17 @@ static int alloc_charge_folio(struct folio **foliop, struct mm_struct *mm,
>> return SCAN_SUCCEED;
>> }
>>
>> -static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
>> - int referenced, int unmapped, int order,
>> - struct collapse_control *cc)
>> +static int vma_collapse_anon_folio_pmd(struct mm_struct *mm, unsigned long address,
>> + struct vm_area_struct *vma, struct collapse_control *cc, pmd_t *pmd,
>> + struct folio *folio)
>> {
>> + struct mmu_notifier_range range;
>> + spinlock_t *pmd_ptl, *pte_ptl;
>> LIST_HEAD(compound_pagelist);
>> - pmd_t *pmd, _pmd;
>> - pte_t *pte;
>> pgtable_t pgtable;
>> - struct folio *folio;
>> - spinlock_t *pmd_ptl, *pte_ptl;
>> - int result = SCAN_FAIL;
>> - struct vm_area_struct *vma;
>> - struct mmu_notifier_range range;
>> -
>> - VM_BUG_ON(address & ~HPAGE_PMD_MASK);
>> -
>> - /*
>> - * Before allocating the hugepage, release the mmap_lock read lock.
>> - * The allocation can take potentially a long time if it involves
>> - * sync compaction, and we do not need to hold the mmap_lock during
>> - * that. We will recheck the vma after taking it again in write mode.
>> - */
>> - mmap_read_unlock(mm);
>> -
>> - result = alloc_charge_folio(&folio, mm, order, cc);
>> - if (result != SCAN_SUCCEED)
>> - goto out_nolock;
>> -
>> - mmap_read_lock(mm);
>> - result = hugepage_vma_revalidate(mm, address, true, &vma, order, cc);
>> - if (result != SCAN_SUCCEED) {
>> - mmap_read_unlock(mm);
>> - goto out_nolock;
>> - }
>> -
>> - result = find_pmd_or_thp_or_none(mm, address, &pmd);
>> - if (result != SCAN_SUCCEED) {
>> - mmap_read_unlock(mm);
>> - goto out_nolock;
>> - }
>> -
>> - if (unmapped) {
>> - /*
>> - * __collapse_huge_page_swapin will return with mmap_lock
>> - * released when it fails. So we jump out_nolock directly in
>> - * that case. Continuing to collapse causes inconsistency.
>> - */
>> - result = __collapse_huge_page_swapin(mm, vma, address, pmd,
>> - referenced, order);
>> - if (result != SCAN_SUCCEED)
>> - goto out_nolock;
>> - }
>> + int result;
>> + pmd_t _pmd;
>> + pte_t *pte;
>>
>> mmap_read_unlock(mm);
>> /*
>> @@ -1174,7 +1133,8 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
>> * mmap_lock.
>> */
>> mmap_write_lock(mm);
>> - result = hugepage_vma_revalidate(mm, address, true, &vma, order, cc);
>> +
>> + result = hugepage_vma_revalidate(mm, address, true, &vma, HPAGE_PMD_ORDER, cc);
>> if (result != SCAN_SUCCEED)
>> goto out_up_write;
>> /* check if the pmd is still valid */
>> @@ -1206,7 +1166,7 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
>> pte = pte_offset_map_lock(mm, &_pmd, address, &pte_ptl);
>> if (pte) {
>> result = __collapse_huge_page_isolate(vma, address, pte, cc,
>> - &compound_pagelist, order);
>> + &compound_pagelist, HPAGE_PMD_ORDER);
>> spin_unlock(pte_ptl);
>> } else {
>> result = SCAN_PMD_NULL;
>> @@ -1262,11 +1222,64 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
>> deferred_split_folio(folio, false);
>> spin_unlock(pmd_ptl);
>>
>> - folio = NULL;
>> -
>> result = SCAN_SUCCEED;
>> out_up_write:
>> mmap_write_unlock(mm);
>> + return result;
>> +}
>> +
>> +static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
>> + int referenced, int unmapped, int order,
>> + struct collapse_control *cc)
>> +{
>> + struct vm_area_struct *vma;
>> + int result = SCAN_FAIL;
>> + struct folio *folio;
>> + pmd_t *pmd;
>> +
>> + /*
>> + * Before allocating the hugepage, release the mmap_lock read lock.
>> + * The allocation can take potentially a long time if it involves
>> + * sync compaction, and we do not need to hold the mmap_lock during
>> + * that. We will recheck the vma after taking it again in write mode.
>> + */
>> + mmap_read_unlock(mm);
>> +
>> + result = alloc_charge_folio(&folio, mm, order, cc);
>> + if (result != SCAN_SUCCEED)
>> + goto out_nolock;
>> +
>> + mmap_read_lock(mm);
>> + result = hugepage_vma_revalidate(mm, address, true, &vma, order, cc);
>> + if (result != SCAN_SUCCEED) {
>> + mmap_read_unlock(mm);
>> + goto out_nolock;
>> + }
>> +
>> + result = find_pmd_or_thp_or_none(mm, address, &pmd);
>> + if (result != SCAN_SUCCEED) {
>> + mmap_read_unlock(mm);
>> + goto out_nolock;
>> + }
>> +
>> + if (unmapped) {
>> + /*
>> + * __collapse_huge_page_swapin will return with mmap_lock
>> + * released when it fails. So we jump out_nolock directly in
>> + * that case. Continuing to collapse causes inconsistency.
>> + */
>> + result = __collapse_huge_page_swapin(mm, vma, address, pmd,
>> + referenced, order);
>> + if (result != SCAN_SUCCEED)
>> + goto out_nolock;
>> + }
>> +
>> + if (order == HPAGE_PMD_ORDER)
>> + result = vma_collapse_anon_folio_pmd(mm, address, vma, cc, pmd, folio);
> I think the locking is broken here? collapse_huge_page() used to enter with the
> mmamp read lock and exit without the lock held at all. After the change, this is
> only true for order == HPAGE_PMD_ORDER. For other orders, you exit with the mmap
> read lock still held. Perhaps:
>
> else
> mmap_read_unlock(mm);
I have completely messed up sequential building of patches :) I will take care next time.
>
>> +
>> + if (result == SCAN_SUCCEED)
>> + folio = NULL;
>> +
>> out_nolock:
>> if (folio)
>> folio_put(folio);
Powered by blists - more mailing lists