linux-kernel - Re: [RFC PATCH 08/12] khugepaged: Abstract PMD-THP collapse

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <999a569d-9424-41b0-b0c8-d028aee2985f@arm.com>
Date: Wed, 18 Dec 2024 14:56:35 +0530
From: Dev Jain <dev.jain@....com>
To: Ryan Roberts <ryan.roberts@....com>, akpm@...ux-foundation.org,
 david@...hat.com, willy@...radead.org, kirill.shutemov@...ux.intel.com
Cc: anshuman.khandual@....com, catalin.marinas@....com, cl@...two.org,
 vbabka@...e.cz, mhocko@...e.com, apopple@...dia.com,
 dave.hansen@...ux.intel.com, will@...nel.org, baohua@...nel.org,
 jack@...e.cz, srivatsa@...il.mit.edu, haowenchao22@...il.com,
 hughd@...gle.com, aneesh.kumar@...nel.org, yang@...amperecomputing.com,
 peterx@...hat.com, ioworker0@...il.com, wangkefeng.wang@...wei.com,
 ziy@...dia.com, jglisse@...gle.com, surenb@...gle.com,
 vishal.moola@...il.com, zokeefe@...gle.com, zhengqi.arch@...edance.com,
 jhubbard@...dia.com, 21cnbao@...il.com, linux-mm@...ck.org,
 linux-kernel@...r.kernel.org
Subject: Re: [RFC PATCH 08/12] khugepaged: Abstract PMD-THP collapse


On 18/12/24 12:54 am, Ryan Roberts wrote:
> On 16/12/2024 16:51, Dev Jain wrote:
>> Abstract away taking the mmap_lock exclusively, copying page contents, and
>> setting the PMD, into vma_collapse_anon_folio_pmd().
>>
>> Signed-off-by: Dev Jain <dev.jain@....com>
>> ---
>>   mm/khugepaged.c | 119 +++++++++++++++++++++++++++---------------------
>>   1 file changed, 66 insertions(+), 53 deletions(-)
>>
>> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
>> index 078794aa3335..88beebef773e 100644
>> --- a/mm/khugepaged.c
>> +++ b/mm/khugepaged.c
>> @@ -1111,58 +1111,17 @@ static int alloc_charge_folio(struct folio **foliop, struct mm_struct *mm,
>>   	return SCAN_SUCCEED;
>>   }
>>   
>> -static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
>> -			      int referenced, int unmapped, int order,
>> -			      struct collapse_control *cc)
>> +static int vma_collapse_anon_folio_pmd(struct mm_struct *mm, unsigned long address,
>> +		struct vm_area_struct *vma, struct collapse_control *cc, pmd_t *pmd,
>> +		struct folio *folio)
>>   {
>> +	struct mmu_notifier_range range;
>> +	spinlock_t *pmd_ptl, *pte_ptl;
>>   	LIST_HEAD(compound_pagelist);
>> -	pmd_t *pmd, _pmd;
>> -	pte_t *pte;
>>   	pgtable_t pgtable;
>> -	struct folio *folio;
>> -	spinlock_t *pmd_ptl, *pte_ptl;
>> -	int result = SCAN_FAIL;
>> -	struct vm_area_struct *vma;
>> -	struct mmu_notifier_range range;
>> -
>> -	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
>> -
>> -	/*
>> -	 * Before allocating the hugepage, release the mmap_lock read lock.
>> -	 * The allocation can take potentially a long time if it involves
>> -	 * sync compaction, and we do not need to hold the mmap_lock during
>> -	 * that. We will recheck the vma after taking it again in write mode.
>> -	 */
>> -	mmap_read_unlock(mm);
>> -
>> -	result = alloc_charge_folio(&folio, mm, order, cc);
>> -	if (result != SCAN_SUCCEED)
>> -		goto out_nolock;
>> -
>> -	mmap_read_lock(mm);
>> -	result = hugepage_vma_revalidate(mm, address, true, &vma, order, cc);
>> -	if (result != SCAN_SUCCEED) {
>> -		mmap_read_unlock(mm);
>> -		goto out_nolock;
>> -	}
>> -
>> -	result = find_pmd_or_thp_or_none(mm, address, &pmd);
>> -	if (result != SCAN_SUCCEED) {
>> -		mmap_read_unlock(mm);
>> -		goto out_nolock;
>> -	}
>> -
>> -	if (unmapped) {
>> -		/*
>> -		 * __collapse_huge_page_swapin will return with mmap_lock
>> -		 * released when it fails. So we jump out_nolock directly in
>> -		 * that case.  Continuing to collapse causes inconsistency.
>> -		 */
>> -		result = __collapse_huge_page_swapin(mm, vma, address, pmd,
>> -						     referenced, order);
>> -		if (result != SCAN_SUCCEED)
>> -			goto out_nolock;
>> -	}
>> +	int result;
>> +	pmd_t _pmd;
>> +	pte_t *pte;
>>   
>>   	mmap_read_unlock(mm);
>>   	/*
>> @@ -1174,7 +1133,8 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
>>   	 * mmap_lock.
>>   	 */
>>   	mmap_write_lock(mm);
>> -	result = hugepage_vma_revalidate(mm, address, true, &vma, order, cc);
>> +
>> +	result = hugepage_vma_revalidate(mm, address, true, &vma, HPAGE_PMD_ORDER, cc);
>>   	if (result != SCAN_SUCCEED)
>>   		goto out_up_write;
>>   	/* check if the pmd is still valid */
>> @@ -1206,7 +1166,7 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
>>   	pte = pte_offset_map_lock(mm, &_pmd, address, &pte_ptl);
>>   	if (pte) {
>>   		result = __collapse_huge_page_isolate(vma, address, pte, cc,
>> -						      &compound_pagelist, order);
>> +						      &compound_pagelist, HPAGE_PMD_ORDER);
>>   		spin_unlock(pte_ptl);
>>   	} else {
>>   		result = SCAN_PMD_NULL;
>> @@ -1262,11 +1222,64 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
>>   	deferred_split_folio(folio, false);
>>   	spin_unlock(pmd_ptl);
>>   
>> -	folio = NULL;
>> -
>>   	result = SCAN_SUCCEED;
>>   out_up_write:
>>   	mmap_write_unlock(mm);
>> +	return result;
>> +}
>> +
>> +static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
>> +			      int referenced, int unmapped, int order,
>> +			      struct collapse_control *cc)
>> +{
>> +	struct vm_area_struct *vma;
>> +	int result = SCAN_FAIL;
>> +	struct folio *folio;
>> +	pmd_t *pmd;
>> +
>> +	/*
>> +	 * Before allocating the hugepage, release the mmap_lock read lock.
>> +	 * The allocation can take potentially a long time if it involves
>> +	 * sync compaction, and we do not need to hold the mmap_lock during
>> +	 * that. We will recheck the vma after taking it again in write mode.
>> +	 */
>> +	mmap_read_unlock(mm);
>> +
>> +	result = alloc_charge_folio(&folio, mm, order, cc);
>> +	if (result != SCAN_SUCCEED)
>> +		goto out_nolock;
>> +
>> +	mmap_read_lock(mm);
>> +	result = hugepage_vma_revalidate(mm, address, true, &vma, order, cc);
>> +	if (result != SCAN_SUCCEED) {
>> +		mmap_read_unlock(mm);
>> +		goto out_nolock;
>> +	}
>> +
>> +	result = find_pmd_or_thp_or_none(mm, address, &pmd);
>> +	if (result != SCAN_SUCCEED) {
>> +		mmap_read_unlock(mm);
>> +		goto out_nolock;
>> +	}
>> +
>> +	if (unmapped) {
>> +		/*
>> +		 * __collapse_huge_page_swapin will return with mmap_lock
>> +		 * released when it fails. So we jump out_nolock directly in
>> +		 * that case.  Continuing to collapse causes inconsistency.
>> +		 */
>> +		result = __collapse_huge_page_swapin(mm, vma, address, pmd,
>> +						     referenced, order);
>> +		if (result != SCAN_SUCCEED)
>> +			goto out_nolock;
>> +	}
>> +
>> +	if (order == HPAGE_PMD_ORDER)
>> +		result = vma_collapse_anon_folio_pmd(mm, address, vma, cc, pmd, folio);
> I think the locking is broken here? collapse_huge_page() used to enter with the
> mmap read lock and exit without the lock held at all. After the change, this is
> only true for order == HPAGE_PMD_ORDER. For other orders, you exit with the mmap
> read lock still held. Perhaps:
>
> 	else
> 		mmap_read_unlock(mm);

I have completely messed up the sequential building of the patches :) I will take more care next time.

>
>> +
>> +	if (result == SCAN_SUCCEED)
>> +		folio = NULL;
>> +
>>   out_nolock:
>>   	if (folio)
>>   		folio_put(folio);