Message-ID: <iqcrqzvoqt7dcmboh75u5g5cwgzaevlota3izpsfmaeotdnnzw@f7udsv2gghwa>
Date: Thu, 15 May 2025 11:31:10 +0200
From: Klara Modin <klarasmodin@...il.com>
To: Kairui Song <kasong@...cent.com>
Cc: linux-mm@...ck.org, Andrew Morton <akpm@...ux-foundation.org>,
Matthew Wilcox <willy@...radead.org>, Hugh Dickins <hughd@...gle.com>, Chris Li <chrisl@...nel.org>,
David Hildenbrand <david@...hat.com>, Yosry Ahmed <yosryahmed@...gle.com>,
"Huang, Ying" <ying.huang@...ux.alibaba.com>, Nhat Pham <nphamcs@...il.com>,
Johannes Weiner <hannes@...xchg.org>, Baolin Wang <baolin.wang@...ux.alibaba.com>,
Baoquan He <bhe@...hat.com>, Barry Song <baohua@...nel.org>,
Kalesh Singh <kaleshsingh@...gle.com>, Kemeng Shi <shikemeng@...weicloud.com>,
Tim Chen <tim.c.chen@...ux.intel.com>, Ryan Roberts <ryan.roberts@....com>,
linux-kernel@...r.kernel.org
Subject: Re: [PATCH 11/28] mm, swap: clean up and consolidate helper for mTHP swapin check

Hi,
On 2025-05-15 04:17:11 +0800, Kairui Song wrote:
> From: Kairui Song <kasong@...cent.com>
>
> Move all mTHP swapin checks into can_swapin_thp and use it for both the
> pre-IO and post-IO checks. This way the code is more consolidated and
> later commits are easier to maintain.
From what I can see, can_swapin_thp() is gated behind
CONFIG_TRANSPARENT_HUGEPAGE, so this patch fails to build when that
option is not enabled.
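
Just to illustrate what I mean, I think either the helper needs to move
out of the CONFIG_TRANSPARENT_HUGEPAGE block (the pre-patch code did
these checks unconditionally in do_swap_page()), or a !THP stub is
needed, roughly along these lines (untested sketch; returning false
means a !THP build would always take the check_folio fallback for large
swapcache folios):

#else /* !CONFIG_TRANSPARENT_HUGEPAGE */
static inline bool can_swapin_thp(struct vm_fault *vmf, pte_t *ptep,
                                  unsigned long addr, unsigned int nr_pages)
{
        /* No mTHP swap-in without THP; let the caller fall back to order 0. */
        return false;
}
#endif
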
>
> Also clean up the comments while at it. The current comment on
> non_swapcache_batch is not correct: a swap-in that bypasses the swap
> cache won't reach the swap device as long as the entry is cached,
> because it still sets the SWAP_HAS_CACHE flag. If the folio is already
> in the swap cache, a raced swap-in will either fail with -EEXIST from
> swapcache_prepare, or see the cached folio.
>
> The real reason non_swapcache_batch is needed is that if a smaller folio
> is in the swap cache but not mapped, mTHP swap-in will be blocked forever,
> as it won't see the folio due to the index offset, nor can it set the
> SWAP_HAS_CACHE bit, so it has to fall back to an order 0 swap-in.
>
> Signed-off-by: Kairui Song <kasong@...cent.com>
> ---
> mm/memory.c | 90 ++++++++++++++++++++++++-----------------------------
> 1 file changed, 41 insertions(+), 49 deletions(-)
>
> diff --git a/mm/memory.c b/mm/memory.c
> index f2897d9059f2..1b6e192de6ec 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -4319,12 +4319,6 @@ static inline int non_swapcache_batch(swp_entry_t entry, int max_nr)
> pgoff_t offset = swp_offset(entry);
> int i;
>
> - /*
> - * While allocating a large folio and doing swap_read_folio, which is
> - * the case the being faulted pte doesn't have swapcache. We need to
> - * ensure all PTEs have no cache as well, otherwise, we might go to
> - * swap devices while the content is in swapcache.
> - */
> for (i = 0; i < max_nr; i++) {
> if ((si->swap_map[offset + i] & SWAP_HAS_CACHE))
> return i;
> @@ -4334,34 +4328,30 @@ static inline int non_swapcache_batch(swp_entry_t entry, int max_nr)
> }
>
> /*
> - * Check if the PTEs within a range are contiguous swap entries
> - * and have consistent swapcache, zeromap.
> + * Check if the page table is still suitable for large folio swap in.
> + * @vmf: The fault triggering the swap-in.
> + * @ptep: Pointer to the PTE that should be the head of the swapped-in folio.
> + * @addr: The address corresponding to the PTE.
> + * @nr_pages: Number of pages of the folio that is supposed to be swapped in.
> */
> -static bool can_swapin_thp(struct vm_fault *vmf, pte_t *ptep, int nr_pages)
> +static bool can_swapin_thp(struct vm_fault *vmf, pte_t *ptep,
> + unsigned long addr, unsigned int nr_pages)
> {
> - unsigned long addr;
> - swp_entry_t entry;
> - int idx;
> - pte_t pte;
> -
> - addr = ALIGN_DOWN(vmf->address, nr_pages * PAGE_SIZE);
> - idx = (vmf->address - addr) / PAGE_SIZE;
> - pte = ptep_get(ptep);
> + pte_t pte = ptep_get(ptep);
> + unsigned long addr_end = addr + (PAGE_SIZE * nr_pages);
> + unsigned long pte_offset = (vmf->address - addr) / PAGE_SIZE;
>
> - if (!pte_same(pte, pte_move_swp_offset(vmf->orig_pte, -idx)))
> + VM_WARN_ON_ONCE(!IS_ALIGNED(addr, PAGE_SIZE) ||
> + addr > vmf->address || addr_end <= vmf->address);
> + if (unlikely(addr < max(addr & PMD_MASK, vmf->vma->vm_start) ||
> + addr_end > pmd_addr_end(addr, vmf->vma->vm_end)))
> return false;
> - entry = pte_to_swp_entry(pte);
> - if (swap_pte_batch(ptep, nr_pages, pte) != nr_pages)
> - return false;
> -
> /*
> - * swap_read_folio() can't handle the case a large folio is hybridly
> - * from different backends. And they are likely corner cases. Similar
> - * things might be added once zswap support large folios.
> + * All swap entries must be from the same swap device, in the same
> + * cgroup, with the same exclusiveness, and differ only in offset.
> */
> - if (unlikely(swap_zeromap_batch(entry, nr_pages, NULL) != nr_pages))
> - return false;
> - if (unlikely(non_swapcache_batch(entry, nr_pages) != nr_pages))
> + if (!pte_same(pte, pte_move_swp_offset(vmf->orig_pte, -pte_offset)) ||
> + swap_pte_batch(ptep, nr_pages, pte) != nr_pages)
> return false;
>
> return true;
> @@ -4441,13 +4431,24 @@ static struct folio *alloc_swap_folio(struct vm_fault *vmf)
> * completely swap entries with contiguous swap offsets.
> */
> order = highest_order(orders);
> - while (orders) {
> - addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order);
> - if (can_swapin_thp(vmf, pte + pte_index(addr), 1 << order))
> - break;
> - order = next_order(&orders, order);
> + for (; orders; order = next_order(&orders, order)) {
> + unsigned long nr_pages = 1 << order;
> + swp_entry_t swap_entry = { .val = ALIGN_DOWN(entry.val, nr_pages) };
> + addr = ALIGN_DOWN(vmf->address, nr_pages * PAGE_SIZE);
> + if (!can_swapin_thp(vmf, pte + pte_index(addr), addr, nr_pages))
> + continue;
> + /*
> + * If there is already a smaller folio in cache, it will
> + * conflict with the larger folio in the swap cache layer
> + * and block the swap in.
> + */
> + if (unlikely(non_swapcache_batch(swap_entry, nr_pages) != nr_pages))
> + continue;
> + /* Zero map doesn't work with large folio yet. */
> + if (unlikely(swap_zeromap_batch(swap_entry, nr_pages, NULL) != nr_pages))
> + continue;
> + break;
> }
> -
> pte_unmap_unlock(pte, ptl);
>
> /* Try allocating the highest of the remaining orders. */
> @@ -4731,27 +4732,18 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
> page_idx = 0;
> address = vmf->address;
> ptep = vmf->pte;
> +
> if (folio_test_large(folio) && folio_test_swapcache(folio)) {
> - int nr = folio_nr_pages(folio);
> + unsigned long nr = folio_nr_pages(folio);
> unsigned long idx = folio_page_idx(folio, page);
> - unsigned long folio_start = address - idx * PAGE_SIZE;
> - unsigned long folio_end = folio_start + nr * PAGE_SIZE;
> - pte_t *folio_ptep;
> - pte_t folio_pte;
> + unsigned long folio_address = address - idx * PAGE_SIZE;
> + pte_t *folio_ptep = vmf->pte - idx;
>
> - if (unlikely(folio_start < max(address & PMD_MASK, vma->vm_start)))
> - goto check_folio;
> - if (unlikely(folio_end > pmd_addr_end(address, vma->vm_end)))
> - goto check_folio;
> -
> - folio_ptep = vmf->pte - idx;
> - folio_pte = ptep_get(folio_ptep);
> - if (!pte_same(folio_pte, pte_move_swp_offset(vmf->orig_pte, -idx)) ||
> - swap_pte_batch(folio_ptep, nr, folio_pte) != nr)
> + if (!can_swapin_thp(vmf, folio_ptep, folio_address, nr))
> goto check_folio;
At this point we're outside the CONFIG_TRANSPARENT_HUGEPAGE #ifdef, so
when THP is not enabled there is no definition of can_swapin_thp() for
this call and the build fails.
>
> page_idx = idx;
> - address = folio_start;
> + address = folio_address;
> ptep = folio_ptep;
> nr_pages = nr;
> entry = folio->swap;
> --
> 2.49.0
>
Regards,
Klara Modin