Message-ID: <8e7c7078-7757-44d2-b44a-aa8cf17cabd0@linux.dev>
Date: Wed, 15 Oct 2025 09:48:05 +0800
From: Lance Yang <lance.yang@...ux.dev>
To: Lorenzo Stoakes <lorenzo.stoakes@...cle.com>
Cc: akpm@...ux-foundation.org, david@...hat.com, ziy@...dia.com,
 baolin.wang@...ux.alibaba.com, Liam.Howlett@...cle.com, npache@...hat.com,
 ryan.roberts@....com, dev.jain@....com, baohua@...nel.org,
 ioworker0@...il.com, richard.weiyang@...il.com,
 linux-kernel@...r.kernel.org, linux-mm@...ck.org
Subject: Re: [PATCH mm-new v3 3/3] mm/khugepaged: merge PTE scanning logic
 into a new helper



On 2025/10/15 01:41, Lorenzo Stoakes wrote:
> OK so this patch _is_ based on [0] already being applied then?
> 
> I don't see any is_swap_pte() here, so presumably so, right?
> 
> Can we just respin these series and put them together? Because now I've
> reviewed a bunch of stuff in [0], which this series depends upon, and
> you're saying we should now review on this instead of that and it's a bit
> of a mess.
> 
> Once review here is dealt with can you combine [0] into this series please?

Got it, thanks!

> 
> For [0] I would like you to reinstate the is_swap_pte() conditional as
> (copiously :) discussed over there.
> 
> [0]: https://lore.kernel.org/all/20251008032657.72406-1-lance.yang@linux.dev/
> 
> Thanks, Lorenzo
> 
> On Wed, Oct 08, 2025 at 12:37:48PM +0800, Lance Yang wrote:
>> From: Lance Yang <lance.yang@...ux.dev>
>>
>> As David suggested, the PTE scanning logic in hpage_collapse_scan_pmd()
>> and __collapse_huge_page_isolate() was almost duplicated.
>>
>> This patch cleans things up by moving all the common PTE checking logic
>> into a new shared helper, thp_collapse_check_pte(). While at it, we use
>> vm_normal_folio() instead of vm_normal_page().
>>
>> Suggested-by: David Hildenbrand <david@...hat.com>
>> Suggested-by: Dev Jain <dev.jain@....com>
>> Signed-off-by: Lance Yang <lance.yang@...ux.dev>
>> ---
>>   mm/khugepaged.c | 243 ++++++++++++++++++++++++++----------------------
>>   1 file changed, 130 insertions(+), 113 deletions(-)
>>
>> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
>> index b5c0295c3414..7116caae1fa4 100644
>> --- a/mm/khugepaged.c
>> +++ b/mm/khugepaged.c
>> @@ -61,6 +61,12 @@ enum scan_result {
>>   	SCAN_PAGE_FILLED,
>>   };
>>
>> +enum pte_check_result {
>> +	PTE_CHECK_SUCCEED,
>> +	PTE_CHECK_CONTINUE,
>> +	PTE_CHECK_FAIL,
>> +};
>> +
>>   #define CREATE_TRACE_POINTS
>>   #include <trace/events/huge_memory.h>
>>
>> @@ -533,62 +539,139 @@ static void release_pte_pages(pte_t *pte, pte_t *_pte,
>>   	}
>>   }
>>
>> +/*
>> + * thp_collapse_check_pte - Check if a PTE is suitable for THP collapse
>> + * @pte:           The PTE to check
>> + * @vma:           The VMA the PTE belongs to
>> + * @addr:          The virtual address corresponding to this PTE
>> + * @foliop:        On success, used to return a pointer to the folio
>> + *                 Must be non-NULL
>> + * @none_or_zero:  Counter for none/zero PTEs. Must be non-NULL
>> + * @unmapped:      Counter for swap PTEs. Can be NULL if not scanning swaps
>> + * @shared:        Counter for shared pages. Must be non-NULL
>> + * @scan_result:   Used to return the failure reason (SCAN_*) on a
>> + *                 PTE_CHECK_FAIL return. Must be non-NULL
>> + * @cc:            Collapse control settings
>> + *
>> + * Returns:
>> + *   PTE_CHECK_SUCCEED  - PTE is suitable, proceed with further checks
>> + *   PTE_CHECK_CONTINUE - Skip this PTE and continue scanning
>> + *   PTE_CHECK_FAIL     - Abort collapse scan
>> + */
>> +static inline int thp_collapse_check_pte(pte_t pte, struct vm_area_struct *vma,
>> +		unsigned long addr, struct folio **foliop, int *none_or_zero,
>> +		int *unmapped, int *shared, int *scan_result,
>> +		struct collapse_control *cc)
>> +{
>> +	struct folio *folio = NULL;
>> +
>> +	if (pte_none(pte) || is_zero_pfn(pte_pfn(pte))) {
>> +		(*none_or_zero)++;
>> +		if (!userfaultfd_armed(vma) &&
>> +		    (!cc->is_khugepaged ||
>> +		     *none_or_zero <= khugepaged_max_ptes_none)) {
>> +			return PTE_CHECK_CONTINUE;
>> +		} else {
>> +			*scan_result = SCAN_EXCEED_NONE_PTE;
>> +			count_vm_event(THP_SCAN_EXCEED_NONE_PTE);
>> +			return PTE_CHECK_FAIL;
>> +		}
>> +	} else if (!pte_present(pte)) {
>> +		if (!unmapped) {
>> +			*scan_result = SCAN_PTE_NON_PRESENT;
>> +			return PTE_CHECK_FAIL;
>> +		}
>> +
>> +		if (non_swap_entry(pte_to_swp_entry(pte))) {
>> +			*scan_result = SCAN_PTE_NON_PRESENT;
>> +			return PTE_CHECK_FAIL;
>> +		}
>> +
>> +		(*unmapped)++;
>> +		if (!cc->is_khugepaged ||
>> +		    *unmapped <= khugepaged_max_ptes_swap) {
>> +			/*
>> +			 * Always be strict with uffd-wp enabled swap
>> +			 * entries. Please see comment below for
>> +			 * pte_uffd_wp().
>> +			 */
>> +			if (pte_swp_uffd_wp(pte)) {
>> +				*scan_result = SCAN_PTE_UFFD_WP;
>> +				return PTE_CHECK_FAIL;
>> +			}
>> +			return PTE_CHECK_CONTINUE;
>> +		} else {
>> +			*scan_result = SCAN_EXCEED_SWAP_PTE;
>> +			count_vm_event(THP_SCAN_EXCEED_SWAP_PTE);
>> +			return PTE_CHECK_FAIL;
>> +		}
>> +	} else if (pte_uffd_wp(pte)) {
>> +		/*
>> +		 * Don't collapse the page if any of the small PTEs are
>> +		 * armed with uffd write protection. Here we can also mark
>> +		 * the new huge pmd as write protected if any of the small
>> +		 * ones is marked but that could bring unknown userfault
>> +		 * messages that falls outside of the registered range.
>> +		 * So, just be simple.
>> +		 */
>> +		*scan_result = SCAN_PTE_UFFD_WP;
>> +		return PTE_CHECK_FAIL;
>> +	}
>> +
>> +	folio = vm_normal_folio(vma, addr, pte);
>> +	if (unlikely(!folio) || unlikely(folio_is_zone_device(folio))) {
>> +		*scan_result = SCAN_PAGE_NULL;
>> +		return PTE_CHECK_FAIL;
>> +	}
>> +
>> +	if (!folio_test_anon(folio)) {
>> +		VM_WARN_ON_FOLIO(true, folio);
>> +		*scan_result = SCAN_PAGE_ANON;
>> +		return PTE_CHECK_FAIL;
>> +	}
>> +
>> +	/*
>> +	 * We treat a single page as shared if any part of the THP
>> +	 * is shared.
>> +	 */
>> +	if (folio_maybe_mapped_shared(folio)) {
>> +		(*shared)++;
>> +		if (cc->is_khugepaged && *shared > khugepaged_max_ptes_shared) {
>> +			*scan_result = SCAN_EXCEED_SHARED_PTE;
>> +			count_vm_event(THP_SCAN_EXCEED_SHARED_PTE);
>> +			return PTE_CHECK_FAIL;
>> +		}
>> +	}
>> +
>> +	*foliop = folio;
>> +
>> +	return PTE_CHECK_SUCCEED;
>> +}
>> +
>>   static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
>>   					unsigned long start_addr,
>>   					pte_t *pte,
>>   					struct collapse_control *cc,
>>   					struct list_head *compound_pagelist)
>>   {
>> -	struct page *page = NULL;
>>   	struct folio *folio = NULL;
>>   	unsigned long addr = start_addr;
>>   	pte_t *_pte;
>>   	int none_or_zero = 0, shared = 0, result = SCAN_FAIL, referenced = 0;
>> +	int pte_check_res;
>>
>>   	for (_pte = pte; _pte < pte + HPAGE_PMD_NR;
>>   	     _pte++, addr += PAGE_SIZE) {
>>   		pte_t pteval = ptep_get(_pte);
>> -		if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
>> -			++none_or_zero;
>> -			if (!userfaultfd_armed(vma) &&
>> -			    (!cc->is_khugepaged ||
>> -			     none_or_zero <= khugepaged_max_ptes_none)) {
>> -				continue;
>> -			} else {
>> -				result = SCAN_EXCEED_NONE_PTE;
>> -				count_vm_event(THP_SCAN_EXCEED_NONE_PTE);
>> -				goto out;
>> -			}
>> -		} else if (!pte_present(pteval)) {
>> -			result = SCAN_PTE_NON_PRESENT;
>> -			goto out;
>> -		} else if (pte_uffd_wp(pteval)) {
>> -			result = SCAN_PTE_UFFD_WP;
>> -			goto out;
>> -		}
>> -		page = vm_normal_page(vma, addr, pteval);
>> -		if (unlikely(!page) || unlikely(is_zone_device_page(page))) {
>> -			result = SCAN_PAGE_NULL;
>> -			goto out;
>> -		}
>>
>> -		folio = page_folio(page);
>> -		if (!folio_test_anon(folio)) {
>> -			VM_WARN_ON_FOLIO(true, folio);
>> -			result = SCAN_PAGE_ANON;
>> -			goto out;
>> -		}
>> +		pte_check_res = thp_collapse_check_pte(pteval, vma, addr,
>> +					&folio, &none_or_zero, NULL, &shared,
>> +					&result, cc);
>>
>> -		/* See hpage_collapse_scan_pmd(). */
>> -		if (folio_maybe_mapped_shared(folio)) {
>> -			++shared;
>> -			if (cc->is_khugepaged &&
>> -			    shared > khugepaged_max_ptes_shared) {
>> -				result = SCAN_EXCEED_SHARED_PTE;
>> -				count_vm_event(THP_SCAN_EXCEED_SHARED_PTE);
>> -				goto out;
>> -			}
>> -		}
>> +		if (pte_check_res == PTE_CHECK_CONTINUE)
>> +			continue;
>> +		else if (pte_check_res == PTE_CHECK_FAIL)
>> +			goto out;
>>
>>   		if (folio_test_large(folio)) {
>>   			struct folio *f;
>> @@ -1264,11 +1347,11 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm,
>>   	pte_t *pte, *_pte;
>>   	int result = SCAN_FAIL, referenced = 0;
>>   	int none_or_zero = 0, shared = 0;
>> -	struct page *page = NULL;
>>   	struct folio *folio = NULL;
>>   	unsigned long addr;
>>   	spinlock_t *ptl;
>>   	int node = NUMA_NO_NODE, unmapped = 0;
>> +	int pte_check_res;
>>
>>   	VM_BUG_ON(start_addr & ~HPAGE_PMD_MASK);
>>
>> @@ -1287,81 +1370,15 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm,
>>   	for (addr = start_addr, _pte = pte; _pte < pte + HPAGE_PMD_NR;
>>   	     _pte++, addr += PAGE_SIZE) {
>>   		pte_t pteval = ptep_get(_pte);
>> -		if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
>> -			++none_or_zero;
>> -			if (!userfaultfd_armed(vma) &&
>> -			    (!cc->is_khugepaged ||
>> -			     none_or_zero <= khugepaged_max_ptes_none)) {
>> -				continue;
>> -			} else {
>> -				result = SCAN_EXCEED_NONE_PTE;
>> -				count_vm_event(THP_SCAN_EXCEED_NONE_PTE);
>> -				goto out_unmap;
>> -			}
>> -		} else if (!pte_present(pteval)) {
>> -			if (non_swap_entry(pte_to_swp_entry(pteval))) {
>> -				result = SCAN_PTE_NON_PRESENT;
>> -				goto out_unmap;
>> -			}
>>
>> -			++unmapped;
>> -			if (!cc->is_khugepaged ||
>> -			    unmapped <= khugepaged_max_ptes_swap) {
>> -				/*
>> -				 * Always be strict with uffd-wp
>> -				 * enabled swap entries.  Please see
>> -				 * comment below for pte_uffd_wp().
>> -				 */
>> -				if (pte_swp_uffd_wp(pteval)) {
>> -					result = SCAN_PTE_UFFD_WP;
>> -					goto out_unmap;
>> -				}
>> -				continue;
>> -			} else {
>> -				result = SCAN_EXCEED_SWAP_PTE;
>> -				count_vm_event(THP_SCAN_EXCEED_SWAP_PTE);
>> -				goto out_unmap;
>> -			}
>> -		} else if (pte_uffd_wp(pteval)) {
>> -			/*
>> -			 * Don't collapse the page if any of the small
>> -			 * PTEs are armed with uffd write protection.
>> -			 * Here we can also mark the new huge pmd as
>> -			 * write protected if any of the small ones is
>> -			 * marked but that could bring unknown
>> -			 * userfault messages that falls outside of
>> -			 * the registered range.  So, just be simple.
>> -			 */
>> -			result = SCAN_PTE_UFFD_WP;
>> -			goto out_unmap;
>> -		}
>> +		pte_check_res = thp_collapse_check_pte(pteval, vma, addr,
>> +					&folio, &none_or_zero, &unmapped,
>> +					&shared, &result, cc);
>>
>> -		page = vm_normal_page(vma, addr, pteval);
>> -		if (unlikely(!page) || unlikely(is_zone_device_page(page))) {
>> -			result = SCAN_PAGE_NULL;
>> -			goto out_unmap;
>> -		}
>> -		folio = page_folio(page);
>> -
>> -		if (!folio_test_anon(folio)) {
>> -			VM_WARN_ON_FOLIO(true, folio);
>> -			result = SCAN_PAGE_ANON;
>> +		if (pte_check_res == PTE_CHECK_CONTINUE)
>> +			continue;
>> +		else if (pte_check_res == PTE_CHECK_FAIL)
>>   			goto out_unmap;
>> -		}
>> -
>> -		/*
>> -		 * We treat a single page as shared if any part of the THP
>> -		 * is shared.
>> -		 */
>> -		if (folio_maybe_mapped_shared(folio)) {
>> -			++shared;
>> -			if (cc->is_khugepaged &&
>> -			    shared > khugepaged_max_ptes_shared) {
>> -				result = SCAN_EXCEED_SHARED_PTE;
>> -				count_vm_event(THP_SCAN_EXCEED_SHARED_PTE);
>> -				goto out_unmap;
>> -			}
>> -		}
>>
>>   		/*
>>   		 * Record which node the original page is from and save this
>> --
>> 2.49.0
>>
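
To summarize how the new helper is meant to be driven at both call
sites (a sketch only, mirroring the diff above; `unmapped_or_null' is a
placeholder for NULL in __collapse_huge_page_isolate() or &unmapped in
hpage_collapse_scan_pmd(), and `out' stands in for each caller's error
label):

	/* Inside each scan loop, per PTE (illustrative, not a new hunk): */
	int res = thp_collapse_check_pte(pteval, vma, addr, &folio,
					 &none_or_zero, unmapped_or_null,
					 &shared, &result, cc);
	if (res == PTE_CHECK_CONTINUE)
		continue;	/* none/zero or counted swap PTE: skip it */
	if (res == PTE_CHECK_FAIL)
		goto out;	/* 'result' already holds the SCAN_* reason */
	/* PTE_CHECK_SUCCEED: 'folio' is set, per-folio checks follow */

When @unmapped is NULL, any non-present PTE fails the check with
SCAN_PTE_NON_PRESENT; with &unmapped passed, swap PTEs are instead
counted against khugepaged_max_ptes_swap.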

