lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20241022192451.38138-13-ryncsn@gmail.com>
Date: Wed, 23 Oct 2024 03:24:50 +0800
From: Kairui Song <ryncsn@...il.com>
To: linux-mm@...ck.org
Cc: Andrew Morton <akpm@...ux-foundation.org>,
	Chris Li <chrisl@...nel.org>,
	Barry Song <v-songbaohua@...o.com>,
	Ryan Roberts <ryan.roberts@....com>,
	Hugh Dickins <hughd@...gle.com>,
	Yosry Ahmed <yosryahmed@...gle.com>,
	"Huang, Ying" <ying.huang@...el.com>,
	Tim Chen <tim.c.chen@...ux.intel.com>,
	Nhat Pham <nphamcs@...il.com>,
	linux-kernel@...r.kernel.org,
	Kairui Song <kasong@...cent.com>
Subject: [PATCH 12/13] mm, swap: use a global swap cluster for non-rotation device

From: Kairui Song <kasong@...cent.com>

Non-rotational devices (SSD / ZRAM) can tolerate fragmentation, so the goal
of the swap allocator for them is to avoid contention on clusters. It
therefore uses a per-CPU cluster design, where each CPU uses a different
cluster as much as possible.

But HDDs are very sensitive to fragmentation; contention is trivial compared
to this. So just use one global cluster instead. This ensures each order
will be writing to the same cluster as much as possible, which helps to make
the IO more continuous.

This ensures the performance of the cluster allocator is as good as that of
the old allocator. Test results after this commit, compared to before this
series:

make -j32 with tinyconfig, using 1G memcg limit and HDD swap:

Before this series:
114.44user 29.11system 39:42.90elapsed 6%CPU (0avgtext+0avgdata 157284maxresident)k
2901232inputs+0outputs (238877major+4227640minor)pagefaults

After this commit:
113.90user 23.81system 38:11.77elapsed 6%CPU (0avgtext+0avgdata 157260maxresident)k
2548728inputs+0outputs (235471major+4238110minor)pagefaults

Suggested-by: Chris Li <chrisl@...nel.org>
Signed-off-by: Kairui Song <kasong@...cent.com>
---
 include/linux/swap.h |  2 ++
 mm/swapfile.c        | 48 ++++++++++++++++++++++++++++++++------------
 2 files changed, 37 insertions(+), 13 deletions(-)

diff --git a/include/linux/swap.h b/include/linux/swap.h
index 0e6c6bb385f0..9898b1881d4d 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -319,6 +319,8 @@ struct swap_info_struct {
 	unsigned int pages;		/* total of usable pages of swap */
 	atomic_long_t inuse_pages;	/* number of those currently in use */
 	struct percpu_cluster __percpu *percpu_cluster; /* per cpu's swap location */
+	struct percpu_cluster *global_cluster; /* Use one global cluster for rotating device */
+	spinlock_t global_cluster_lock;	/* Serialize usage of global cluster */
 	struct rb_root swap_extent_root;/* root of the swap extent rbtree */
 	struct block_device *bdev;	/* swap device or bdev of swap file */
 	struct file *swap_file;		/* seldom referenced */
diff --git a/mm/swapfile.c b/mm/swapfile.c
index f25d697f6736..6eb298a222c0 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -798,7 +798,10 @@ static unsigned int alloc_swap_scan_cluster(struct swap_info_struct *si,
 out:
 	relocate_cluster(si, ci);
 	unlock_cluster(ci);
-	__this_cpu_write(si->percpu_cluster->next[order], next);
+	if (si->flags & SWP_SOLIDSTATE)
+		__this_cpu_write(si->percpu_cluster->next[order], next);
+	else
+		si->global_cluster->next[order] = next;
 	return found;
 }
 
@@ -860,8 +863,14 @@ static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int o
 	unsigned int offset, found = 0;
 
 	/* Fast path using per CPU cluster */
-	local_lock(&si->percpu_cluster->lock);
-	offset = __this_cpu_read(si->percpu_cluster->next[order]);
+	if (si->flags & SWP_SOLIDSTATE) {
+		local_lock(&si->percpu_cluster->lock);
+		offset = __this_cpu_read(si->percpu_cluster->next[order]);
+	} else {
+		spin_lock(&si->global_cluster_lock);
+		offset = si->global_cluster->next[order];
+	}
+
 	if (offset) {
 		ci = lock_cluster(si, offset);
 		/* Cluster could have been used by another order */
@@ -960,8 +969,10 @@ static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int o
 		}
 	}
 done:
-	local_unlock(&si->percpu_cluster->lock);
-
+	if (si->flags & SWP_SOLIDSTATE)
+		local_unlock(&si->percpu_cluster->lock);
+	else
+		spin_unlock(&si->global_cluster_lock);
 	return found;
 }
 
@@ -2737,6 +2748,8 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
 	mutex_unlock(&swapon_mutex);
 	free_percpu(p->percpu_cluster);
 	p->percpu_cluster = NULL;
+	kfree(p->global_cluster);
+	p->global_cluster = NULL;
 	vfree(swap_map);
 	kvfree(zeromap);
 	kvfree(cluster_info);
@@ -3142,17 +3155,24 @@ static struct swap_cluster_info *setup_clusters(struct swap_info_struct *si,
 	for (i = 0; i < nr_clusters; i++)
 		spin_lock_init(&cluster_info[i].lock);
 
-	si->percpu_cluster = alloc_percpu(struct percpu_cluster);
-	if (!si->percpu_cluster)
-		goto err_free;
+	if (si->flags & SWP_SOLIDSTATE) {
+		si->percpu_cluster = alloc_percpu(struct percpu_cluster);
+		if (!si->percpu_cluster)
+			goto err_free;
 
-	for_each_possible_cpu(cpu) {
-		struct percpu_cluster *cluster;
+		for_each_possible_cpu(cpu) {
+			struct percpu_cluster *cluster;
 
-		cluster = per_cpu_ptr(si->percpu_cluster, cpu);
+			cluster = per_cpu_ptr(si->percpu_cluster, cpu);
+			for (i = 0; i < SWAP_NR_ORDERS; i++)
+				cluster->next[i] = SWAP_ENTRY_INVALID;
+			local_lock_init(&cluster->lock);
+		}
+	} else {
+		si->global_cluster = kmalloc(sizeof(*si->global_cluster), GFP_KERNEL);
 		for (i = 0; i < SWAP_NR_ORDERS; i++)
-			cluster->next[i] = SWAP_ENTRY_INVALID;
-		local_lock_init(&cluster->lock);
+			si->global_cluster->next[i] = SWAP_ENTRY_INVALID;
+		spin_lock_init(&si->global_cluster_lock);
 	}
 
 	/*
@@ -3426,6 +3446,8 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
 bad_swap:
 	free_percpu(si->percpu_cluster);
 	si->percpu_cluster = NULL;
+	kfree(si->global_cluster);
+	si->global_cluster = NULL;
 	inode = NULL;
 	destroy_swap_extents(si);
 	swap_cgroup_swapoff(si->type);
-- 
2.47.0


Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ