linux-kernel - [RFC PATCH v1 4/5] mm: swap: Scan for free swap entries in allocated clusters

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20240618232648.4090299-5-ryan.roberts@arm.com>
Date: Wed, 19 Jun 2024 00:26:44 +0100
From: Ryan Roberts <ryan.roberts@....com>
To: Andrew Morton <akpm@...ux-foundation.org>,
	Chris Li <chrisl@...nel.org>,
	Kairui Song <kasong@...cent.com>,
	"Huang, Ying" <ying.huang@...el.com>,
	Kalesh Singh <kaleshsingh@...gle.com>,
	Barry Song <baohua@...nel.org>,
	Hugh Dickins <hughd@...gle.com>,
	David Hildenbrand <david@...hat.com>
Cc: Ryan Roberts <ryan.roberts@....com>,
	linux-kernel@...r.kernel.org,
	linux-mm@...ck.org
Subject: [RFC PATCH v1 4/5] mm: swap: Scan for free swap entries in allocated clusters

Previously an mTHP would only be swapped out whole if a CPU could
allocate itself a free cluster from which to allocate mTHP-sized
contiguous swap entry blocks. But for a system making heavy use of swap,
after a while fragmentation ensures there are no free clusters
available, so the swap entry allocation fails and forces the mTHP to be
split into base pages, which then get swap entries allocated by scanning
the swap file for free individual pages.

But when swap entries are freed, this makes holes in the clusters, and
often it would be possible to allocate new mTHP swap entries in those
holes.

So if we fail to allocate a free cluster, scan through the clusters
until we find one that is in use and contains swap entries of the order
we require. Then scan it until we find a suitably sized and aligned
hole. We keep a per-order "next cluster to scan" pointer so that future
scanning can be picked up from where we last left off. And if we scan
through all clusters without finding a suitable hole, we give up to
prevent livelock.

Running the test case provided by Barry Song at the link below, I can
see that the swpout fallback rate, which was previously 100% after a few
iterations, falls to 0% and stays there for all 100 iterations. This is
also the case when sprinkling in some non-mTHP allocations ("-s").

Signed-off-by: Ryan Roberts <ryan.roberts@....com>
Link: https://lore.kernel.org/linux-mm/20240615084714.37499-1-21cnbao@gmail.com/
---
 include/linux/swap.h |  2 +
 mm/swapfile.c        | 90 ++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 92 insertions(+)

diff --git a/include/linux/swap.h b/include/linux/swap.h
index 2a40fe02d281..34ec4668a5c9 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -310,6 +310,8 @@ struct swap_info_struct {
 	unsigned int cluster_nr;	/* countdown to next cluster search */
 	unsigned int __percpu *cluster_next_cpu; /*percpu index for next allocation */
 	struct percpu_cluster __percpu *percpu_cluster; /* per cpu's swap location */
+	struct swap_cluster_info *next_order_scan[SWAP_NR_ORDERS];
+					/* Start cluster for next order-based scan */
 	struct rb_root swap_extent_root;/* root of the swap extent rbtree */
 	struct block_device *bdev;	/* swap device or bdev of swap file */
 	struct file *swap_file;		/* seldom referenced */
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 7b13f02a7ac2..24db03db8830 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -644,6 +644,84 @@ static inline bool swap_range_empty(char *swap_map, unsigned int start,
 	return true;
 }

+static inline
+struct swap_cluster_info *offset_to_cluster(struct swap_info_struct *si,
+					    unsigned int offset)
+{
+	VM_WARN_ON(!si->cluster_info);	/* warn if si has no cluster array */
+	return si->cluster_info + (offset / SWAPFILE_CLUSTER); /* cluster covering offset */
+}
+
+static inline
+unsigned int cluster_to_offset(struct swap_info_struct *si,
+			       struct swap_cluster_info *ci)
+{
+	VM_WARN_ON(!si->cluster_info);	/* warn if si has no cluster array */
+	return (ci - si->cluster_info) * SWAPFILE_CLUSTER; /* first offset in ci */
+}
+
+static inline
+struct swap_cluster_info *next_cluster_circular(struct swap_info_struct *si,
+						struct swap_cluster_info *ci)
+{
+	struct swap_cluster_info *last;
+
+	/*
+	 * Advance one cluster, wrapping to the first after the last whole one;
+	 * the final partial cluster is skipped as users need a whole cluster.
+	 */
+	last = offset_to_cluster(si, si->max) - 1;	/* last whole cluster */
+	return ci == last ? si->cluster_info : ++ci;
+}
+
+static inline
+struct swap_cluster_info *prev_cluster_circular(struct swap_info_struct *si,
+						struct swap_cluster_info *ci)
+{
+	struct swap_cluster_info *last;
+
+	/*
+	 * Step back one cluster, wrapping from the first to the last whole one;
+	 * the final partial cluster is skipped as users need a whole cluster.
+	 */
+	last = offset_to_cluster(si, si->max) - 1;	/* last whole cluster */
+	return ci == si->cluster_info ? last : --ci;
+}
+
+/*
+ * Returns the offset of the next non-free cluster holding entries of `order`,
+ * starting from si->next_order_scan[order] and wrapping circularly. On the
+ * first call *stop must be SWAP_NEXT_INVALID; it is then set to the cluster
+ * preceding the start and bounds later calls (that cluster is not examined).
+ * Advances si->next_order_scan[order]. Returns SWAP_NEXT_INVALID when the scan
+ * reaches *stop without a match. Must be called with si lock held.
+ */
+static unsigned int next_cluster_for_scan(struct swap_info_struct *si,
+					  int order, unsigned int *stop)
+{
+	struct swap_cluster_info *ci;
+	struct swap_cluster_info *end;
+
+	ci = si->next_order_scan[order];
+	if (*stop == SWAP_NEXT_INVALID)	/* first call: fix the stop point */
+		*stop = cluster_to_offset(si, prev_cluster_circular(si, ci));
+	end = offset_to_cluster(si, *stop);
+
+	while (ci != end) {
+		if ((ci->flags & CLUSTER_FLAG_FREE) == 0 && ci->order == order)
+			break;
+		ci = next_cluster_circular(si, ci);
+	}
+
+	if (ci == end) {	/* reached the stop cluster: no candidate left */
+		si->next_order_scan[order] = ci; /* next fresh scan starts here */
+		return SWAP_NEXT_INVALID;
+	}
+
+	si->next_order_scan[order] = next_cluster_circular(si, ci);
+	return cluster_to_offset(si, ci);
+}
+
 /*
  * Try to get swap entries with specified order from current cpu's swap entry
  * pool (a cluster). This might involve allocating a new cluster for current CPU
@@ -656,6 +734,7 @@ static bool scan_swap_map_try_ssd_cluster(struct swap_info_struct *si,
 	struct percpu_cluster *cluster;
 	struct swap_cluster_info *ci;
 	unsigned int tmp, max;
+	unsigned int stop = SWAP_NEXT_INVALID;

 new_cluster:
 	cluster = this_cpu_ptr(si->percpu_cluster);
@@ -674,6 +753,15 @@ static bool scan_swap_map_try_ssd_cluster(struct swap_info_struct *si,
 			*scan_base = this_cpu_read(*si->cluster_next_cpu);
 			*offset = *scan_base;
 			goto new_cluster;
+		} else if (nr_pages < SWAPFILE_CLUSTER) {
+			/*
+			 * There is no point in scanning for free areas the same
+			 * size as the cluster, since the cluster would have
+			 * already been freed in that case.
+			 */
+			tmp = next_cluster_for_scan(si, order, &stop);
+			if (tmp == SWAP_NEXT_INVALID)
+				return false;
 		} else
 			return false;
 	}
@@ -2392,6 +2480,8 @@ static void setup_swap_info(struct swap_info_struct *p, int prio,
 	}
 	p->swap_map = swap_map;
 	p->cluster_info = cluster_info;
+	for (i = 0; i < SWAP_NR_ORDERS; i++)
+		p->next_order_scan[i] = cluster_info;
 }

 static void _enable_swap_info(struct swap_info_struct *p)
--
2.43.0