Message-ID: <iqcrqzvoqt7dcmboh75u5g5cwgzaevlota3izpsfmaeotdnnzw@f7udsv2gghwa>
Date: Thu, 15 May 2025 11:31:10 +0200
From: Klara Modin <klarasmodin@...il.com>
To: Kairui Song <kasong@...cent.com>
Cc: linux-mm@...ck.org, Andrew Morton <akpm@...ux-foundation.org>,
Matthew Wilcox <willy@...radead.org>, Hugh Dickins <hughd@...gle.com>, Chris Li <chrisl@...nel.org>,
David Hildenbrand <david@...hat.com>, Yosry Ahmed <yosryahmed@...gle.com>,
"Huang, Ying" <ying.huang@...ux.alibaba.com>, Nhat Pham <nphamcs@...il.com>,
Johannes Weiner <hannes@...xchg.org>, Baolin Wang <baolin.wang@...ux.alibaba.com>,
Baoquan He <bhe@...hat.com>, Barry Song <baohua@...nel.org>,
Kalesh Singh <kaleshsingh@...gle.com>, Kemeng Shi <shikemeng@...weicloud.com>,
Tim Chen <tim.c.chen@...ux.intel.com>, Ryan Roberts <ryan.roberts@....com>,
linux-kernel@...r.kernel.org
Subject: Re: [PATCH 11/28] mm, swap: clean up and consolidate helper for mTHP swapin check

Hi,
On 2025-05-15 04:17:11 +0800, Kairui Song wrote:
> From: Kairui Song <kasong@...cent.com>
>
> Move all mTHP swapin checks into can_swapin_thp and use it for both the
> pre-IO and post-IO checks. This way the code is more consolidated and
> later commits are easier to maintain.
From what I can see, can_swapin_thp() is gated behind
CONFIG_TRANSPARENT_HUGEPAGE, so this patch fails to build when that
option is not enabled.
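
Just to illustrate what I mean, I think either the helper needs to move
out of the CONFIG_TRANSPARENT_HUGEPAGE block (the pre-patch code did
these checks unconditionally in do_swap_page()), or a !THP stub is
needed, roughly along these lines (untested sketch; returning false
means a !THP build would always take the check_folio fallback for large
swapcache folios):

#else /* !CONFIG_TRANSPARENT_HUGEPAGE */
static inline bool can_swapin_thp(struct vm_fault *vmf, pte_t *ptep,
                                  unsigned long addr, unsigned int nr_pages)
{
        /* No mTHP swap-in without THP; let the caller fall back to order 0. */
        return false;
}
#endif
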
>
> Also clean up the comments while at it. The current comment on
> non_swapcache_batch is not correct: a swap-in that bypasses the swap
> cache won't reach the swap device as long as the entry is cached,
> because it still sets the SWAP_HAS_CACHE flag. If the folio is already
> in the swap cache, a raced swap-in will either fail with -EEXIST from
> swapcache_prepare, or see the cached folio.
>
> The real reason non_swapcache_batch is needed is that if a smaller folio
> is in the swap cache but not mapped, mTHP swap-in will be blocked forever,
> as it won't see the folio due to the index offset, nor can it set the
> SWAP_HAS_CACHE bit, so it has to fall back to an order 0 swap-in.
>
> Signed-off-by: Kairui Song <kasong@...cent.com>
> ---
> mm/memory.c | 90 ++++++++++++++++++++++++-----------------------------
> 1 file changed, 41 insertions(+), 49 deletions(-)
>
> diff --git a/mm/memory.c b/mm/memory.c
> index f2897d9059f2..1b6e192de6ec 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -4319,12 +4319,6 @@ static inline int non_swapcache_batch(swp_entry_t entry, int max_nr)
> pgoff_t offset = swp_offset(entry);
> int i;
>
> - /*
> - * While allocating a large folio and doing swap_read_folio, which is
> - * the case the being faulted pte doesn't have swapcache. We need to
> - * ensure all PTEs have no cache as well, otherwise, we might go to
> - * swap devices while the content is in swapcache.
> - */
> for (i = 0; i < max_nr; i++) {
> if ((si->swap_map[offset + i] & SWAP_HAS_CACHE))
> return i;
> @@ -4334,34 +4328,30 @@ static inline int non_swapcache_batch(swp_entry_t entry, int max_nr)
> }
>
> /*
> - * Check if the PTEs within a range are contiguous swap entries
> - * and have consistent swapcache, zeromap.
> + * Check if the page table is still suitable for large folio swap in.
> + * @vmf: The fault triggering the swap-in.
> + * @ptep: Pointer to the PTE that should be the head of the swapped-in folio.
> + * @addr: The address corresponding to the PTE.
> + * @nr_pages: Number of pages of the folio that is supposed to be swapped in.
> */
> -static bool can_swapin_thp(struct vm_fault *vmf, pte_t *ptep, int nr_pages)
> +static bool can_swapin_thp(struct vm_fault *vmf, pte_t *ptep,
> + unsigned long addr, unsigned int nr_pages)
> {
> - unsigned long addr;
> - swp_entry_t entry;
> - int idx;
> - pte_t pte;
> -
> - addr = ALIGN_DOWN(vmf->address, nr_pages * PAGE_SIZE);
> - idx = (vmf->address - addr) / PAGE_SIZE;
> - pte = ptep_get(ptep);
> + pte_t pte = ptep_get(ptep);
> + unsigned long addr_end = addr + (PAGE_SIZE * nr_pages);
> + unsigned long pte_offset = (vmf->address - addr) / PAGE_SIZE;
>
> - if (!pte_same(pte, pte_move_swp_offset(vmf->orig_pte, -idx)))
> + VM_WARN_ON_ONCE(!IS_ALIGNED(addr, PAGE_SIZE) ||
> + addr > vmf->address || addr_end <= vmf->address);
> + if (unlikely(addr < max(addr & PMD_MASK, vmf->vma->vm_start) ||
> + addr_end > pmd_addr_end(addr, vmf->vma->vm_end)))
> return false;
> - entry = pte_to_swp_entry(pte);
> - if (swap_pte_batch(ptep, nr_pages, pte) != nr_pages)
> - return false;
> -
> /*
> - * swap_read_folio() can't handle the case a large folio is hybridly
> - * from different backends. And they are likely corner cases. Similar
> - * things might be added once zswap support large folios.
> + * All swap entries must be from the same swap device, in the same
> + * cgroup, with the same exclusiveness, and differ only in offset.
> */
> - if (unlikely(swap_zeromap_batch(entry, nr_pages, NULL) != nr_pages))
> - return false;
> - if (unlikely(non_swapcache_batch(entry, nr_pages) != nr_pages))
> + if (!pte_same(pte, pte_move_swp_offset(vmf->orig_pte, -pte_offset)) ||
> + swap_pte_batch(ptep, nr_pages, pte) != nr_pages)
> return false;
>
> return true;
> @@ -4441,13 +4431,24 @@ static struct folio *alloc_swap_folio(struct vm_fault *vmf)
> * completely swap entries with contiguous swap offsets.
> */
> order = highest_order(orders);
> - while (orders) {
> - addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order);
> - if (can_swapin_thp(vmf, pte + pte_index(addr), 1 << order))
> - break;
> - order = next_order(&orders, order);
> + for (; orders; order = next_order(&orders, order)) {
> + unsigned long nr_pages = 1 << order;
> + swp_entry_t swap_entry = { .val = ALIGN_DOWN(entry.val, nr_pages) };
> + addr = ALIGN_DOWN(vmf->address, nr_pages * PAGE_SIZE);
> + if (!can_swapin_thp(vmf, pte + pte_index(addr), addr, nr_pages))
> + continue;
> + /*
> + * If there is already a smaller folio in cache, it will
> + * conflict with the larger folio in the swap cache layer
> + * and block the swap in.
> + */
> + if (unlikely(non_swapcache_batch(swap_entry, nr_pages) != nr_pages))
> + continue;
> + /* Zero map doesn't work with large folio yet. */
> + if (unlikely(swap_zeromap_batch(swap_entry, nr_pages, NULL) != nr_pages))
> + continue;
> + break;
> }
> -
> pte_unmap_unlock(pte, ptl);
>
> /* Try allocating the highest of the remaining orders. */
> @@ -4731,27 +4732,18 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
> page_idx = 0;
> address = vmf->address;
> ptep = vmf->pte;
> +
> if (folio_test_large(folio) && folio_test_swapcache(folio)) {
> - int nr = folio_nr_pages(folio);
> + unsigned long nr = folio_nr_pages(folio);
> unsigned long idx = folio_page_idx(folio, page);
> - unsigned long folio_start = address - idx * PAGE_SIZE;
> - unsigned long folio_end = folio_start + nr * PAGE_SIZE;
> - pte_t *folio_ptep;
> - pte_t folio_pte;
> + unsigned long folio_address = address - idx * PAGE_SIZE;
> + pte_t *folio_ptep = vmf->pte - idx;
>
> - if (unlikely(folio_start < max(address & PMD_MASK, vma->vm_start)))
> - goto check_folio;
> - if (unlikely(folio_end > pmd_addr_end(address, vma->vm_end)))
> - goto check_folio;
> -
> - folio_ptep = vmf->pte - idx;
> - folio_pte = ptep_get(folio_ptep);
> - if (!pte_same(folio_pte, pte_move_swp_offset(vmf->orig_pte, -idx)) ||
> - swap_pte_batch(folio_ptep, nr, folio_pte) != nr)
> + if (!can_swapin_thp(vmf, folio_ptep, folio_address, nr))
> goto check_folio;
At this point we're outside the CONFIG_TRANSPARENT_HUGEPAGE #ifdef, so
when THP is not enabled there is no definition of can_swapin_thp() for
this call and the build fails.
>
> page_idx = idx;
> - address = folio_start;
> + address = folio_address;
> ptep = folio_ptep;
> nr_pages = nr;
> entry = folio->swap;
> --
> 2.49.0
>
Regards,
Klara Modin