Message-Id: <20260131125454.3187546-6-youngjun.park@lge.com>
Date: Sat, 31 Jan 2026 21:54:54 +0900
From: Youngjun Park <youngjun.park@....com>
To: akpm@...ux-foundation.org
Cc: chrisl@...nel.org,
kasong@...cent.com,
hannes@...xchg.org,
mhocko@...nel.org,
roman.gushchin@...ux.dev,
shakeel.butt@...ux.dev,
muchun.song@...ux.dev,
shikemeng@...weicloud.com,
nphamcs@...il.com,
bhe@...hat.com,
baohua@...nel.org,
cgroups@...r.kernel.org,
linux-mm@...ck.org,
linux-kernel@...r.kernel.org,
gunho.lee@....com,
youngjun.park@....com,
taejoon.song@....com
Subject: [RFC PATCH v3 5/5] mm, swap: introduce percpu swap device cache to avoid fragmentation

In the previous commit that introduced per-device percpu clusters,
the allocation logic caused swap device rotation on every allocation
when multiple swap devices share the same priority. This led to
cluster fragmentation on every allocation attempt.

To address this issue, this patch introduces a per-cpu swap device
cache, restoring the allocation behavior to closely match the
traditional fastpath and slowpath flow.

With swap tiers, cluster fragmentation can still occur when a CPU's
cached swap device does not belong to the tier required by the current
allocation; this is the intended behavior for tier-based allocation.

With swap tiers and same-priority swap devices, the slow path
triggers device rotation and causes initial cluster fragmentation.
However, once a cluster is allocated, subsequent allocations
continue using that cluster until it is exhausted, preventing repeated
fragmentation. While this may not be severe, there is room for future
optimization.

Signed-off-by: Youngjun Park <youngjun.park@....com>
---
include/linux/swap.h | 1 -
mm/swapfile.c | 87 +++++++++++++++++++++++++++++++++++---------
2 files changed, 69 insertions(+), 19 deletions(-)
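
A minimal userspace sketch of the caching scheme, for illustration only
(not kernel code): the fast path reuses the per-CPU cached device when it
still matches the requested tier, and the slow path scans the device list
and refreshes the cache on success. All names below (struct swap_dev,
struct dev_cache, alloc_fast, alloc_slow, tier_ok) are made up for the
sketch and are not kernel symbols.

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct swap_dev {
	int tier_mask;		/* tiers this device belongs to */
	int free_slots;		/* remaining capacity */
};

struct dev_cache {
	struct swap_dev *dev;	/* last device that satisfied an allocation */
};

static bool tier_ok(struct swap_dev *dev, int want_mask)
{
	return dev && (dev->tier_mask & want_mask);
}

/* Fast path: reuse the cached device if it still fits the request. */
static bool alloc_fast(struct dev_cache *cache, int want_mask)
{
	struct swap_dev *dev = cache->dev;

	if (!tier_ok(dev, want_mask) || dev->free_slots == 0)
		return false;
	dev->free_slots--;
	return true;
}

/* Slow path: scan all devices and cache the one that worked. */
static bool alloc_slow(struct dev_cache *cache, struct swap_dev **devs,
		       size_t n, int want_mask)
{
	for (size_t i = 0; i < n; i++) {
		if (tier_ok(devs[i], want_mask) && devs[i]->free_slots > 0) {
			devs[i]->free_slots--;
			cache->dev = devs[i];	/* refresh the cache */
			return true;
		}
	}
	return false;
}

int main(void)
{
	struct swap_dev a = { .tier_mask = 0x1, .free_slots = 2 };
	struct swap_dev b = { .tier_mask = 0x2, .free_slots = 2 };
	struct swap_dev *devs[] = { &a, &b };
	struct dev_cache cache = { NULL };

	/* The first request misses the cache and takes the slow path. */
	if (!alloc_fast(&cache, 0x2))
		alloc_slow(&cache, devs, 2, 0x2);
	/* The second request for the same tier hits the cached device. */
	printf("fast path hit: %d\n", alloc_fast(&cache, 0x2));
	return 0;
}
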
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 6921e22b14d3..ac634a21683a 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -253,7 +253,6 @@ enum {
* throughput.
*/
struct percpu_cluster {
- local_lock_t lock; /* Protect the percpu_cluster above */
unsigned int next[SWAP_NR_ORDERS]; /* Likely next allocation offset */
};
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 4708014c96c4..fc1f64eaa8fe 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -106,6 +106,16 @@ PLIST_HEAD(swap_active_head);
static PLIST_HEAD(swap_avail_head);
static DEFINE_SPINLOCK(swap_avail_lock);
+struct percpu_swap_device {
+ struct swap_info_struct *si[SWAP_NR_ORDERS];
+ local_lock_t lock;
+};
+
+static DEFINE_PER_CPU(struct percpu_swap_device, percpu_swap_device) = {
+ .si = { NULL },
+ .lock = INIT_LOCAL_LOCK(lock),
+};
+
struct swap_info_struct *swap_info[MAX_SWAPFILES];
static struct kmem_cache *swap_table_cachep;
@@ -465,10 +475,8 @@ swap_cluster_alloc_table(struct swap_info_struct *si,
* Swap allocator uses percpu clusters and holds the local lock.
*/
lockdep_assert_held(&ci->lock);
- if (si->flags & SWP_SOLIDSTATE)
- lockdep_assert_held(this_cpu_ptr(&si->percpu_cluster->lock));
- else
- lockdep_assert_held(&si->global_cluster->lock);
+ lockdep_assert_held(this_cpu_ptr(&percpu_swap_device.lock));
+
/* The cluster must be free and was just isolated from the free list. */
VM_WARN_ON_ONCE(ci->flags || !cluster_is_empty(ci));
@@ -484,10 +492,7 @@ swap_cluster_alloc_table(struct swap_info_struct *si,
* the potential recursive allocation is limited.
*/
spin_unlock(&ci->lock);
- if (si->flags & SWP_SOLIDSTATE)
- local_unlock(&si->percpu_cluster->lock);
- else
- spin_unlock(&si->global_cluster->lock);
+ local_unlock(&percpu_swap_device.lock);
table = swap_table_alloc(__GFP_HIGH | __GFP_NOMEMALLOC | GFP_KERNEL);
@@ -499,7 +504,7 @@ swap_cluster_alloc_table(struct swap_info_struct *si,
* could happen with ignoring the percpu cluster is fragmentation,
* which is acceptable since this fallback and race is rare.
*/
- local_lock(&si->percpu_cluster->lock);
+ local_lock(&percpu_swap_device.lock);
if (!(si->flags & SWP_SOLIDSTATE))
spin_lock(&si->global_cluster->lock);
spin_lock(&ci->lock);
@@ -944,9 +949,10 @@ static unsigned int alloc_swap_scan_cluster(struct swap_info_struct *si,
out:
relocate_cluster(si, ci);
swap_cluster_unlock(ci);
- if (si->flags & SWP_SOLIDSTATE)
+ if (si->flags & SWP_SOLIDSTATE) {
this_cpu_write(si->percpu_cluster->next[order], next);
- else
+ this_cpu_write(percpu_swap_device.si[order], si);
+ } else
si->global_cluster->next[order] = next;
return found;
@@ -1044,7 +1050,6 @@ static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si,
if (si->flags & SWP_SOLIDSTATE) {
/* Fast path using per CPU cluster */
- local_lock(&si->percpu_cluster->lock);
offset = __this_cpu_read(si->percpu_cluster->next[order]);
} else {
/* Serialize HDD SWAP allocation for each device. */
@@ -1122,9 +1127,7 @@ static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si,
goto done;
}
done:
- if (si->flags & SWP_SOLIDSTATE)
- local_unlock(&si->percpu_cluster->lock);
- else
+ if (!(si->flags & SWP_SOLIDSTATE))
spin_unlock(&si->global_cluster->lock);
return found;
@@ -1306,8 +1309,29 @@ static bool get_swap_device_info(struct swap_info_struct *si)
return true;
}
+static bool swap_alloc_fast(struct folio *folio)
+{
+ unsigned int order = folio_order(folio);
+ struct swap_info_struct *si;
+ int mask = folio_tier_effective_mask(folio);
+
+ /*
+ * Once allocated, swap_info_struct will never be completely freed,
+ * so checking its liveness by get_swap_device_info() is enough.
+ */
+ si = this_cpu_read(percpu_swap_device.si[order]);
+ if (!si || !swap_tiers_mask_test(si->tier_mask, mask) ||
+ !get_swap_device_info(si))
+ return false;
+
+ cluster_alloc_swap_entry(si, folio);
+ put_swap_device(si);
+
+ return folio_test_swapcache(folio);
+}
+
/* Rotate the device and switch to a new cluster */
-static void swap_alloc_entry(struct folio *folio)
+static void swap_alloc_slow(struct folio *folio)
{
struct swap_info_struct *si, *next;
int mask = folio_tier_effective_mask(folio);
@@ -1484,7 +1508,11 @@ int folio_alloc_swap(struct folio *folio)
}
again:
- swap_alloc_entry(folio);
+ local_lock(&percpu_swap_device.lock);
+ if (!swap_alloc_fast(folio))
+ swap_alloc_slow(folio);
+ local_unlock(&percpu_swap_device.lock);
+
if (!order && unlikely(!folio_test_swapcache(folio))) {
if (swap_sync_discard())
goto again;
@@ -1903,7 +1931,9 @@ swp_entry_t swap_alloc_hibernation_slot(int type)
* Grab the local lock to be compliant
* with swap table allocation.
*/
+ local_lock(&percpu_swap_device.lock);
offset = cluster_alloc_swap_entry(si, NULL);
+ local_unlock(&percpu_swap_device.lock);
if (offset)
entry = swp_entry(si->type, offset);
}
@@ -2707,6 +2737,27 @@ static void free_cluster_info(struct swap_cluster_info *cluster_info,
kvfree(cluster_info);
}
+/*
+ * Called after the swap device's reference count is dead, so
+ * neither scan nor allocation will use it.
+ */
+static void flush_percpu_swap_device(struct swap_info_struct *si)
+{
+ int cpu, i;
+ struct swap_info_struct **pcp_si;
+
+ for_each_possible_cpu(cpu) {
+ pcp_si = per_cpu_ptr(percpu_swap_device.si, cpu);
+ /*
+ * Invalidate the percpu swap device cache, si->users
+ * is dead, so no new user will point to it, just flush
+ * any existing user.
+ */
+ for (i = 0; i < SWAP_NR_ORDERS; i++)
+ cmpxchg(&pcp_si[i], si, NULL);
+ }
+}
+
SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
{
struct swap_info_struct *p = NULL;
@@ -2790,6 +2841,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
flush_work(&p->discard_work);
flush_work(&p->reclaim_work);
+ flush_percpu_swap_device(p);
destroy_swap_extents(p);
if (p->flags & SWP_CONTINUED)
@@ -3224,7 +3276,6 @@ static struct swap_cluster_info *setup_clusters(struct swap_info_struct *si,
cluster = per_cpu_ptr(si->percpu_cluster, cpu);
for (i = 0; i < SWAP_NR_ORDERS; i++)
cluster->next[i] = SWAP_ENTRY_INVALID;
- local_lock_init(&cluster->lock);
}
} else {
si->global_cluster = kmalloc(sizeof(*si->global_cluster),
--
2.34.1