Message-ID: <20250514201729.48420-17-ryncsn@gmail.com>
Date: Thu, 15 May 2025 04:17:16 +0800
From: Kairui Song <ryncsn@...il.com>
To: linux-mm@...ck.org
Cc: Andrew Morton <akpm@...ux-foundation.org>,
Matthew Wilcox <willy@...radead.org>,
Hugh Dickins <hughd@...gle.com>,
Chris Li <chrisl@...nel.org>,
David Hildenbrand <david@...hat.com>,
Yosry Ahmed <yosryahmed@...gle.com>,
"Huang, Ying" <ying.huang@...ux.alibaba.com>,
Nhat Pham <nphamcs@...il.com>,
Johannes Weiner <hannes@...xchg.org>,
Baolin Wang <baolin.wang@...ux.alibaba.com>,
Baoquan He <bhe@...hat.com>,
Barry Song <baohua@...nel.org>,
Kalesh Singh <kaleshsingh@...gle.com>,
Kemeng Shi <shikemeng@...weicloud.com>,
Tim Chen <tim.c.chen@...ux.intel.com>,
Ryan Roberts <ryan.roberts@....com>,
linux-kernel@...r.kernel.org,
Kairui Song <kasong@...cent.com>
Subject: [PATCH 16/28] mm, swap: use the swap cache as the swap-in synchronization layer
From: Kairui Song <kasong@...cent.com>
Current swap synchronization is mostly based on the swap_map's
SWAP_HAS_CACHE bit. Whoever sets the bit first does the actual work to
swap in a folio.
This has been causing many issues, as it is essentially a poor
implementation of a bit lock built on a busy loop. The busy loop is
relaxed with a schedule_timeout_uninterruptible(1), which is ugly and
causes long-tail latency and other performance issues. Besides, the
abuse of SWAP_HAS_CACHE has been causing trouble for maintenance.
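For reference, the old claim-and-poll flow looked roughly like the
sketch below. This is only a condensed, illustrative model of the loop
this patch removes from __swapin_cache_add_prepare(); the wrapper name
is made up and error handling is trimmed:

/*
 * Old scheme (simplified): SWAP_HAS_CACHE acts as a bit lock. The
 * loser of the race has nothing to sleep on, so it polls the swap
 * cache and sleeps a tick between attempts.
 */
static struct folio *old_swapin_claim_sketch(swp_entry_t entry,
					     struct folio *folio)
{
	struct folio *exist;
	int err;

	for (;;) {
		err = swapcache_prepare(entry, folio_nr_pages(folio));
		if (!err)
			return folio;	/* won the race, we do the IO */
		if (err != -EEXIST)
			return NULL;	/* the entry is gone */

		exist = swap_cache_get_folio(entry);
		if (exist)
			return exist;	/* winner's folio is visible now */

		/* SWAP_HAS_CACHE is set but the folio is not inserted yet */
		schedule_timeout_uninterruptible(1);
	}
}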
We have just removed all swap-in paths that bypass the swap cache,
so now swap-in synchronization can be resolved by the swap cache
layer directly (similar to the page cache). Whoever adds a folio into
the swap cache first takes care of the real IO. Racing threads will
see the newly inserted folio and can simply wait on its folio lock.
This way, racing swap-ins are synchronized with a proper lock.
This both simplifies the logic and should improve performance, and it
eliminates issues like the workaround in commit 01626a1823024
("mm: avoid unconditional one-tick sleep when swapcache_prepare fails")
and the "skip_if_exists" flag from commit a65b0e7607ccb ("zswap: make
shrinking memcg-aware").
Signed-off-by: Kairui Song <kasong@...cent.com>
---
include/linux/swap.h | 6 ---
mm/swap.h | 17 ++++--
mm/swap_state.c | 120 +++++++++++++++++--------------------------
mm/swapfile.c | 32 ++++++------
mm/vmscan.c | 1 -
mm/zswap.c | 2 +-
6 files changed, 76 insertions(+), 102 deletions(-)
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 58230f3e15e6..2da769cdc663 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -443,7 +443,6 @@ void put_swap_folio(struct folio *folio, swp_entry_t entry);
extern swp_entry_t get_swap_page_of_type(int);
extern int add_swap_count_continuation(swp_entry_t, gfp_t);
extern int swap_duplicate_nr(swp_entry_t entry, int nr);
-extern int swapcache_prepare(swp_entry_t entry, int nr);
extern void swap_free_nr(swp_entry_t entry, int nr_pages);
extern void free_swap_and_cache_nr(swp_entry_t entry, int nr);
int swap_type_of(dev_t device, sector_t offset);
@@ -502,11 +501,6 @@ static inline int swap_duplicate_nr(swp_entry_t swp, int nr_pages)
return 0;
}
-static inline int swapcache_prepare(swp_entry_t swp, int nr)
-{
- return 0;
-}
-
static inline void swap_free_nr(swp_entry_t entry, int nr_pages)
{
}
diff --git a/mm/swap.h b/mm/swap.h
index cad24a3abda8..2abfb40fc7ec 100644
--- a/mm/swap.h
+++ b/mm/swap.h
@@ -135,6 +135,13 @@ static inline void swap_unlock_cluster_irq(struct swap_cluster_info *ci)
spin_unlock_irq(&ci->lock);
}
+extern int __swap_cache_set_entry(struct swap_info_struct *si,
+ struct swap_cluster_info *ci,
+ unsigned long offset);
+extern void __swap_cache_put_entries(struct swap_info_struct *si,
+ struct swap_cluster_info *ci,
+ swp_entry_t entry, unsigned int size);
+
/* linux/mm/page_io.c */
int sio_pool_init(void);
struct swap_iocb;
@@ -158,8 +165,8 @@ static inline struct address_space *swap_address_space(swp_entry_t entry)
/* Below helpers requires the caller to pin the swap device. */
extern struct folio *swap_cache_get_folio(swp_entry_t entry);
-extern int swap_cache_add_folio(swp_entry_t entry,
- struct folio *folio, void **shadow);
+extern struct folio *swap_cache_add_folio(swp_entry_t entry, struct folio *folio,
+ void **shadow, bool swapin);
extern void *swap_cache_get_shadow(swp_entry_t entry);
/* Below helpers requires the caller to lock the swap cluster. */
extern void __swap_cache_del_folio(swp_entry_t entry,
@@ -211,8 +218,7 @@ struct folio *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
struct vm_area_struct *vma, unsigned long addr,
struct swap_iocb **plug);
struct folio *__swapin_cache_alloc(swp_entry_t entry, gfp_t gfp_flags,
- struct mempolicy *mpol, pgoff_t ilx, bool *new_page_allocated,
- bool skip_if_exists);
+ struct mempolicy *mpol, pgoff_t ilx, bool *new_page_allocated);
struct folio *swap_cluster_readahead(swp_entry_t entry, gfp_t flag,
struct mempolicy *mpol, pgoff_t ilx);
struct folio *swapin_readahead(swp_entry_t entry, gfp_t flag,
@@ -324,7 +330,8 @@ static inline struct folio *swap_cache_get_folio(swp_entry_t entry)
return NULL;
}
-static inline int swap_cache_add_folio(swp_entry_t end, struct folio *folio, void **shadow)
+static inline int swap_cache_add_folio(swp_entry_t end, struct folio *folio,
+ void **shadow, bool swapin)
{
return -EINVAL;
}
diff --git a/mm/swap_state.c b/mm/swap_state.c
index d68687295f52..715aff5aca57 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -110,12 +110,18 @@ int __swap_cache_replace_folio(struct swap_cluster_info *ci, swp_entry_t entry,
return 0;
}
-int swap_cache_add_folio(swp_entry_t entry, struct folio *folio,
- void **shadow)
+/*
+ * Return the folio being added on success, or return the existing folio
+ * with conflicting index on failure.
+ */
+struct folio *swap_cache_add_folio(swp_entry_t entry, struct folio *folio,
+ void **shadow, bool swapin)
{
swp_te_t exist;
pgoff_t end, start, offset;
+ struct swap_info_struct *si;
struct swap_cluster_info *ci;
+ struct folio *existing = NULL;
unsigned long nr_pages = folio_nr_pages(folio);
start = swp_offset(entry);
@@ -124,12 +130,18 @@ int swap_cache_add_folio(swp_entry_t entry, struct folio *folio,
VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
VM_BUG_ON_FOLIO(folio_test_swapcache(folio), folio);
VM_BUG_ON_FOLIO(!folio_test_swapbacked(folio), folio);
-
+again:
offset = start;
- ci = swap_lock_cluster(swp_info(entry), offset);
+ existing = NULL;
+ si = swp_info(entry);
+ ci = swap_lock_cluster(si, offset);
do {
exist = __swap_table_get(ci, offset);
- if (unlikely(swp_te_is_folio(exist)))
+ if (unlikely(swp_te_is_folio(exist))) {
+ existing = swp_te_folio(exist);
+ goto out_failed;
+ }
+ if (swapin && __swap_cache_set_entry(si, ci, offset))
goto out_failed;
if (shadow && swp_te_is_shadow(exist))
*shadow = swp_te_shadow(exist);
@@ -144,18 +156,27 @@ int swap_cache_add_folio(swp_entry_t entry, struct folio *folio,
node_stat_mod_folio(folio, NR_FILE_PAGES, nr_pages);
lruvec_stat_mod_folio(folio, NR_SWAPCACHE, nr_pages);
- return 0;
+ return folio;
out_failed:
/*
- * We may lose shadow due to raced swapin, which should be
- * fine, caller better keep the previous returned shadow.
+ * We may lose shadow here due to raced swapin, which is rare and OK,
+ * caller better keep the previous returned shadow.
*/
- while (offset-- > start)
+ while (offset-- > start) {
__swap_table_set_shadow(ci, offset, NULL);
+ __swap_cache_put_entries(si, ci, swp_entry(si->type, offset), 1);
+ }
swap_unlock_cluster(ci);
- return -EEXIST;
+ /*
+ * Need to grab the conflicting folio before return. If it's
+ * already gone, just try insert again.
+ */
+ if (existing && !folio_try_get(existing))
+ goto again;
+
+ return existing;
}
/*
@@ -192,6 +213,7 @@ void __swap_cache_del_folio(swp_entry_t entry,
folio_clear_swapcache(folio);
node_stat_mod_folio(folio, NR_FILE_PAGES, -nr_pages);
lruvec_stat_mod_folio(folio, NR_SWAPCACHE, -nr_pages);
+ __swap_cache_put_entries(si, ci, entry, nr_pages);
}
void delete_from_swap_cache(struct folio *folio)
@@ -203,7 +225,6 @@ void delete_from_swap_cache(struct folio *folio)
__swap_cache_del_folio(entry, folio, NULL);
swap_unlock_cluster(ci);
- put_swap_folio(folio, entry);
folio_ref_sub(folio, folio_nr_pages(folio));
}
@@ -354,59 +375,18 @@ void swap_update_readahead(struct folio *folio,
}
static struct folio *__swapin_cache_add_prepare(swp_entry_t entry,
- struct folio *folio,
- bool skip_if_exists)
+ struct folio *folio)
{
- int nr_pages = folio_nr_pages(folio);
- struct folio *exist;
void *shadow = NULL;
- int err;
+ struct folio *swapcache = NULL;
- for (;;) {
- /*
- * Caller should have checked swap cache and swap count
- * already, try prepare the swap map directly, it will still
- * fail with -ENOENT or -EEXIST if the entry is gone or raced.
- */
- err = swapcache_prepare(entry, nr_pages);
- if (!err)
- break;
- else if (err != -EEXIST)
- return NULL;
-
- /*
- * Protect against a recursive call to __swapin_cache_alloc()
- * on the same entry waiting forever here because SWAP_HAS_CACHE
- * is set but the folio is not the swap cache yet. This can
- * happen today if mem_cgroup_swapin_charge_folio() below
- * triggers reclaim through zswap, which may call
- * __swapin_cache_alloc() in the writeback path.
- */
- if (skip_if_exists)
- return NULL;
-
- exist = swap_cache_get_folio(entry);
- if (exist)
- return exist;
-
- /*
- * We might race against __swap_cache_del_folio(), and
- * stumble across a swap_map entry whose SWAP_HAS_CACHE
- * has not yet been cleared. Or race against another
- * __swapin_cache_alloc(), which has set SWAP_HAS_CACHE
- * in swap_map, but not yet added its folio to swap cache.
- */
- schedule_timeout_uninterruptible(1);
- }
-
- /*
- * The swap entry is ours to swap in. Prepare the new folio.
- */
__folio_set_locked(folio);
__folio_set_swapbacked(folio);
-
- if (swap_cache_add_folio(entry, folio, &shadow))
- goto fail_unlock;
+ swapcache = swap_cache_add_folio(entry, folio, &shadow, true);
+ if (swapcache != folio) {
+ folio_unlock(folio);
+ return swapcache;
+ }
memcg1_swapin(entry, 1);
@@ -416,16 +396,10 @@ static struct folio *__swapin_cache_add_prepare(swp_entry_t entry,
/* Caller will initiate read into locked new_folio */
folio_add_lru(folio);
return folio;
-
-fail_unlock:
- put_swap_folio(folio, entry);
- folio_unlock(folio);
- return NULL;
}
struct folio *__swapin_cache_alloc(swp_entry_t entry, gfp_t gfp_mask,
- struct mempolicy *mpol, pgoff_t ilx, bool *new_page_allocated,
- bool skip_if_exists)
+ struct mempolicy *mpol, pgoff_t ilx, bool *new_page_allocated)
{
struct swap_info_struct *si = swp_info(entry);
struct folio *swapcache = NULL, *folio = NULL;
@@ -457,7 +431,7 @@ struct folio *__swapin_cache_alloc(swp_entry_t entry, gfp_t gfp_mask,
if (mem_cgroup_swapin_charge_folio(folio, NULL, gfp_mask, entry))
goto out;
- swapcache = __swapin_cache_add_prepare(entry, folio, skip_if_exists);
+ swapcache = __swapin_cache_add_prepare(entry, folio);
out:
if (swapcache && swapcache == folio) {
*new_page_allocated = true;
@@ -491,7 +465,7 @@ struct folio *swapin_entry(swp_entry_t entry, struct folio *folio)
VM_WARN_ON_ONCE(nr_pages > SWAPFILE_CLUSTER);
entry = swp_entry(swp_type(entry), ALIGN_DOWN(offset, nr_pages));
- swapcache = __swapin_cache_add_prepare(entry, folio, false);
+ swapcache = __swapin_cache_add_prepare(entry, folio);
if (swapcache == folio)
swap_read_folio(folio, NULL);
return swapcache;
@@ -523,7 +497,7 @@ struct folio *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
mpol = get_vma_policy(vma, addr, 0, &ilx);
folio = __swapin_cache_alloc(entry, gfp_mask, mpol, ilx,
- &page_allocated, false);
+ &page_allocated);
mpol_cond_put(mpol);
if (page_allocated)
@@ -642,7 +616,7 @@ struct folio *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask,
/* Ok, do the async read-ahead now */
folio = __swapin_cache_alloc(
swp_entry(swp_type(entry), offset),
- gfp_mask, mpol, ilx, &page_allocated, false);
+ gfp_mask, mpol, ilx, &page_allocated);
if (!folio)
continue;
if (page_allocated) {
@@ -660,7 +634,7 @@ struct folio *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask,
skip:
/* The page was likely read above, so no need for plugging here */
folio = __swapin_cache_alloc(entry, gfp_mask, mpol, ilx,
- &page_allocated, false);
+ &page_allocated);
if (unlikely(page_allocated))
swap_read_folio(folio, NULL);
return folio;
@@ -755,7 +729,7 @@ static struct folio *swap_vma_readahead(swp_entry_t targ_entry, gfp_t gfp_mask,
pte_unmap(pte);
pte = NULL;
folio = __swapin_cache_alloc(entry, gfp_mask, mpol, ilx,
- &page_allocated, false);
+ &page_allocated);
if (!folio)
continue;
if (page_allocated) {
@@ -775,7 +749,7 @@ static struct folio *swap_vma_readahead(swp_entry_t targ_entry, gfp_t gfp_mask,
skip:
/* The folio was likely read above, so no need for plugging here */
folio = __swapin_cache_alloc(targ_entry, gfp_mask, mpol, targ_ilx,
- &page_allocated, false);
+ &page_allocated);
if (unlikely(page_allocated))
swap_read_folio(folio, NULL);
return folio;
diff --git a/mm/swapfile.c b/mm/swapfile.c
index d01dc0646db9..8909d1655432 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1283,7 +1283,7 @@ int folio_alloc_swap(struct folio *folio, gfp_t gfp)
if (!entry.val)
return -ENOMEM;
- if (swap_cache_add_folio(entry, folio, NULL))
+ if (WARN_ON(swap_cache_add_folio(entry, folio, NULL, false) != folio))
goto out_free;
atomic_long_sub(size, &nr_swap_pages);
@@ -1556,6 +1556,17 @@ void swap_free_nr(swp_entry_t entry, int nr_pages)
}
}
+void __swap_cache_put_entries(struct swap_info_struct *si,
+ struct swap_cluster_info *ci,
+ swp_entry_t entry, unsigned int size)
+{
+ if (swap_only_has_cache(si, swp_offset(entry), size))
+ swap_entries_free(si, ci, entry, size);
+ else
+ for (int i = 0; i < size; i++, entry.val++)
+ swap_entry_put_locked(si, ci, entry, SWAP_HAS_CACHE);
+}
+
/*
* Called after dropping swapcache to decrease refcnt to swap entries.
*/
@@ -1571,11 +1582,7 @@ void put_swap_folio(struct folio *folio, swp_entry_t entry)
return;
ci = swap_lock_cluster(si, offset);
- if (swap_only_has_cache(si, offset, size))
- swap_entries_free(si, ci, entry, size);
- else
- for (int i = 0; i < size; i++, entry.val++)
- swap_entry_put_locked(si, ci, entry, SWAP_HAS_CACHE);
+ __swap_cache_put_entries(si, ci, entry, size);
swap_unlock_cluster(ci);
}
@@ -3597,17 +3604,10 @@ int swap_duplicate_nr(swp_entry_t entry, int nr)
return err;
}
-/*
- * @entry: first swap entry from which we allocate nr swap cache.
- *
- * Called when allocating swap cache for existing swap entries,
- * This can return error codes. Returns 0 at success.
- * -EEXIST means there is a swap cache.
- * Note: return code is different from swap_duplicate().
- */
-int swapcache_prepare(swp_entry_t entry, int nr)
+int __swap_cache_set_entry(struct swap_info_struct *si,
+ struct swap_cluster_info *ci, unsigned long offset)
{
- return __swap_duplicate(entry, SWAP_HAS_CACHE, nr);
+ return swap_dup_entries(si, ci, offset, SWAP_HAS_CACHE, 1);
}
/*
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 7b5f41b4147b..8b5498cae0d5 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -798,7 +798,6 @@ static int __remove_mapping(struct address_space *mapping, struct folio *folio,
__swap_cache_del_folio(swap, folio, shadow);
memcg1_swapout(folio, swap);
swap_unlock_cluster_irq(ci);
- put_swap_folio(folio, swap);
} else {
void (*free_folio)(struct folio *);
diff --git a/mm/zswap.c b/mm/zswap.c
index 87aebeee11ef..65c1aff5c4a4 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -1085,7 +1085,7 @@ static int zswap_writeback_entry(struct zswap_entry *entry,
mpol = get_task_policy(current);
folio = __swapin_cache_alloc(swpentry, GFP_KERNEL, mpol,
- NO_INTERLEAVE_INDEX, &folio_was_allocated, true);
+ NO_INTERLEAVE_INDEX, &folio_was_allocated);
put_swap_device(si);
if (!folio)
return -ENOMEM;
--
2.49.0