Message-ID: <20250514201729.48420-2-ryncsn@gmail.com>
Date: Thu, 15 May 2025 04:17:01 +0800
From: Kairui Song <ryncsn@...il.com>
To: linux-mm@...ck.org
Cc: Andrew Morton <akpm@...ux-foundation.org>,
	Matthew Wilcox <willy@...radead.org>,
	Hugh Dickins <hughd@...gle.com>,
	Chris Li <chrisl@...nel.org>,
	David Hildenbrand <david@...hat.com>,
	Yosry Ahmed <yosryahmed@...gle.com>,
	"Huang, Ying" <ying.huang@...ux.alibaba.com>,
	Nhat Pham <nphamcs@...il.com>,
	Johannes Weiner <hannes@...xchg.org>,
	Baolin Wang <baolin.wang@...ux.alibaba.com>,
	Baoquan He <bhe@...hat.com>,
	Barry Song <baohua@...nel.org>,
	Kalesh Singh <kaleshsingh@...gle.com>,
	Kemeng Shi <shikemeng@...weicloud.com>,
	Tim Chen <tim.c.chen@...ux.intel.com>,
	Ryan Roberts <ryan.roberts@....com>,
	linux-kernel@...r.kernel.org,
	Kairui Song <kasong@...cent.com>
Subject: [PATCH 01/28] mm, swap: don't scan every fragment cluster

From: Kairui Song <kasong@...cent.com>

Fragment clusters were already failing high-order allocation; the
reason we still scan them is that a swap entry may get freed without
releasing its swap cache, so the swap map entry ends up in HAS_CACHE
only status, and the cluster won't be moved back to the non-full or
free cluster list.
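
As an illustration (a simplified sketch only, not part of this patch;
the helper name is made up here), such a slot is one whose swap map
entry holds exactly SWAP_HAS_CACHE:

	/*
	 * Sketch: a slot whose swap count dropped to zero while the
	 * folio is still in the swap cache holds SWAP_HAS_CACHE with
	 * a zero count.  Clusters made of such slots linger on the
	 * frag list until the cache is reclaimed.
	 */
	static inline bool swap_slot_cache_only(struct swap_info_struct *si,
						unsigned long offset)
	{
		return READ_ONCE(si->swap_map[offset]) == SWAP_HAS_CACHE;
	}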

The chance of this is low, and it only happens when device usage is
low (!vm_swap_full()). The scan is especially unhelpful for
SWP_SYNCHRONOUS_IO devices, as the swap cache almost always gets freed
when the count reaches zero for these devices.
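
(For reference, a simplified sketch of the swap cache bypass on such
devices, modeled on the check in do_swap_page(); not part of this
patch:)

	/*
	 * Sketch: on synchronous devices a folio with a single swap
	 * count is read directly and its entry is freed right away,
	 * so cache-only (HAS_CACHE) slots are rare there.
	 */
	if (data_race(si->flags & SWP_SYNCHRONOUS_IO) &&
	    __swap_count(entry) == 1) {
		/* read the folio synchronously, bypassing the swap cache */
	}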

Besides, high-order allocation failure isn't a critical issue, while
the scan can actually slow down mTHP allocation by a lot when the
fragment cluster list is long.

The HAS_CACHE issue will be properly fixed later, so drop this
fragment cluster scanning design.

Signed-off-by: Kairui Song <kasong@...cent.com>
---
 include/linux/swap.h |  1 -
 mm/swapfile.c        | 32 +++++++++-----------------------
 2 files changed, 9 insertions(+), 24 deletions(-)

diff --git a/include/linux/swap.h b/include/linux/swap.h
index bc0e1c275fc0..817e427a47d2 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -310,7 +310,6 @@ struct swap_info_struct {
 					/* list of cluster that contains at least one free slot */
 	struct list_head frag_clusters[SWAP_NR_ORDERS];
 					/* list of cluster that are fragmented or contented */
-	atomic_long_t frag_cluster_nr[SWAP_NR_ORDERS];
 	unsigned int pages;		/* total of usable pages of swap */
 	atomic_long_t inuse_pages;	/* number of those currently in use */
 	struct swap_sequential_cluster *global_cluster; /* Use one global cluster for rotating device */
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 026090bf3efe..34188714479f 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -470,11 +470,6 @@ static void move_cluster(struct swap_info_struct *si,
 	else
 		list_move_tail(&ci->list, list);
 	spin_unlock(&si->lock);
-
-	if (ci->flags == CLUSTER_FLAG_FRAG)
-		atomic_long_dec(&si->frag_cluster_nr[ci->order]);
-	else if (new_flags == CLUSTER_FLAG_FRAG)
-		atomic_long_inc(&si->frag_cluster_nr[ci->order]);
 	ci->flags = new_flags;
 }
 
@@ -926,32 +921,25 @@ static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int o
 		swap_reclaim_full_clusters(si, false);
 
 	if (order < PMD_ORDER) {
-		unsigned int frags = 0, frags_existing;
-
 		while ((ci = isolate_lock_cluster(si, &si->nonfull_clusters[order]))) {
 			found = alloc_swap_scan_cluster(si, ci, cluster_offset(si, ci),
 							order, usage);
 			if (found)
 				goto done;
-			/* Clusters failed to allocate are moved to frag_clusters */
-			frags++;
 		}
 
-		frags_existing = atomic_long_read(&si->frag_cluster_nr[order]);
-		while (frags < frags_existing &&
-		       (ci = isolate_lock_cluster(si, &si->frag_clusters[order]))) {
-			atomic_long_dec(&si->frag_cluster_nr[order]);
-			/*
-			 * Rotate the frag list to iterate, they were all
-			 * failing high order allocation or moved here due to
-			 * per-CPU usage, but they could contain newly released
-			 * reclaimable (eg. lazy-freed swap cache) slots.
-			 */
+		/*
+		 * Scanning only one fragment cluster is good enough: order 0
+		 * allocation will surely succeed, mTHP allocation failure is
+		 * not critical, and scanning one cluster still keeps the list
+		 * rotated and scanned (for reclaiming HAS_CACHE).
+		 */
+		ci = isolate_lock_cluster(si, &si->frag_clusters[order]);
+		if (ci) {
 			found = alloc_swap_scan_cluster(si, ci, cluster_offset(si, ci),
-							order, usage);
+					order, usage);
 			if (found)
 				goto done;
-			frags++;
 		}
 	}
 
@@ -973,7 +961,6 @@ static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int o
 		 * allocation, but reclaim may drop si->lock and race with another user.
 		 */
 		while ((ci = isolate_lock_cluster(si, &si->frag_clusters[o]))) {
-			atomic_long_dec(&si->frag_cluster_nr[o]);
 			found = alloc_swap_scan_cluster(si, ci, cluster_offset(si, ci),
 							0, usage);
 			if (found)
@@ -3234,7 +3221,6 @@ static struct swap_cluster_info *setup_clusters(struct swap_info_struct *si,
 	for (i = 0; i < SWAP_NR_ORDERS; i++) {
 		INIT_LIST_HEAD(&si->nonfull_clusters[i]);
 		INIT_LIST_HEAD(&si->frag_clusters[i]);
-		atomic_long_set(&si->frag_cluster_nr[i], 0);
 	}
 
 	/*
-- 
2.49.0

