linux-kernel - [PATCH v2 2/2] mm: vmscan: reclaim contended folios asynchronously instead of promoting them

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <20240308031126.750-3-lipeifeng@oppo.com>
Date: Fri,  8 Mar 2024 11:11:26 +0800
From: lipeifeng@...o.com
To: lipeifeng@...o.com,
	21cnbao@...il.com,
	akpm@...ux-foundation.org,
	david@...hat.com,
	osalvador@...e.de,
	willy@...radead.org
Cc: linux-mm@...ck.org,
	linux-kernel@...r.kernel.org,
	Barry Song <v-songbaohua@...o.com>
Subject: [PATCH v2 2/2] mm: vmscan: reclaim contended folios asynchronously instead of promoting them

From: Peifeng Li <lipeifeng@...o.com>

Commit 6d4675e60135 ("mm: don't be stuck to rmap lock on reclaim path")
prevents the reclaim path from becoming stuck on the rmap lock. However,
it reinserts those folios at the head of the LRU during shrink_folio_list,
even if those folios are very cold.

While running an Android phone with 6GiB memory for 2 hours, I observed
that 321728 folios can be incorrectly placed back to the inactive head
of the LRU due to lock contention, which amounts to approximately 44
folios per second. Similarly, the same test conducted on 4GiB phones
shows that 106 folios are improperly promoted per second. This can
have a detrimental effect on performance by increasing refaults and
the likelihood of OOM (Out of Memory) killing.

For this reason, the patch introduces a separate list for contended folios
and wakes up a new kernel thread, kshrinkd, to reclaim them asynchronously,
thus avoiding these LRU-ordering violations. The new thread sets rw_try_lock
to 0, so rmap_walk always waits until it holds the lock.

Below is some data collected from two phones running monkey for two
hours (lower is better):

Phone with 6GiB memory:
                      w/o patch          w/patch              delta
workingset_refault   1451043114         1408015823            -2.9%
lmkd count             9231              9009                 -2.4%

Phone with 4GiB memory:
                      w/o patch          w/patch              delta
workingset_refault    2674649801         2581150132           -3.4%
lmkd count             13800             13061                -5.3%

The Monkey is a program that runs on your emulator or device and generates
pseudo-random streams of user events such as clicks, touches, or gestures,
as well as a number of system-level events.

The Android low memory killer daemon (lmkd) process monitors the memory
state of a running Android system and reacts to high memory pressure by
killing the least essential processes to keep the system performing at
acceptable levels.

Signed-off-by: Peifeng Li <lipeifeng@...o.com>
Signed-off-by: Barry Song <v-songbaohua@...o.com>
---
 include/linux/mmzone.h        |   6 ++
 include/linux/swap.h          |   3 +
 include/linux/vm_event_item.h |   2 +
 mm/memory_hotplug.c           |   2 +
 mm/vmscan.c                   | 189 +++++++++++++++++++++++++++++++++-
 mm/vmstat.c                   |   2 +
 6 files changed, 201 insertions(+), 3 deletions(-)

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index c11b7cde81ef..19acacf92cc9 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -1332,6 +1332,12 @@ typedef struct pglist_data {
 
 	int kswapd_failures;		/* Number of 'reclaimed == 0' runs */
 
+	struct list_head kshrinkd_folios; /* rmap_walk contended folios list*/
+	spinlock_t kf_lock; /* Protect kshrinkd_folios list*/
+
+	struct task_struct *kshrinkd; /* reclaim kshrinkd_folios*/
+	wait_queue_head_t kshrinkd_wait;
+
 #ifdef CONFIG_COMPACTION
 	int kcompactd_max_order;
 	enum zone_type kcompactd_highest_zoneidx;
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 2955f7a78d8d..6d15b577b6a3 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -438,6 +438,9 @@ void check_move_unevictable_folios(struct folio_batch *fbatch);
 extern void __meminit kswapd_run(int nid);
 extern void __meminit kswapd_stop(int nid);
 
+extern void kshrinkd_run(int nid);
+extern void kshrinkd_stop(int nid);
+
 #ifdef CONFIG_SWAP
 
 int add_swap_extent(struct swap_info_struct *sis, unsigned long start_page,
diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h
index 747943bc8cc2..ee95ab138c87 100644
--- a/include/linux/vm_event_item.h
+++ b/include/linux/vm_event_item.h
@@ -38,9 +38,11 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
 		PGLAZYFREED,
 		PGREFILL,
 		PGREUSE,
+		PGSTEAL_KSHRINKD,
 		PGSTEAL_KSWAPD,
 		PGSTEAL_DIRECT,
 		PGSTEAL_KHUGEPAGED,
+		PGSCAN_KSHRINKD,
 		PGSCAN_KSWAPD,
 		PGSCAN_DIRECT,
 		PGSCAN_KHUGEPAGED,
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index a444e2d7dd2b..5e1c326a8bde 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1218,6 +1218,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages,
 
 	kswapd_run(nid);
 	kcompactd_run(nid);
+	kshrinkd_run(nid);
 
 	writeback_set_ratelimit();
 
@@ -2098,6 +2099,7 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages,
 	}
 
 	if (arg.status_change_nid >= 0) {
+		kshrinkd_stop(node);
 		kcompactd_stop(node);
 		kswapd_stop(node);
 	}
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 509b5e0dffd3..ef540a520b47 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -150,6 +150,9 @@ struct scan_control {
 	/* if try_lock in rmap_walk */
 	unsigned int rw_try_lock:1;
 
+	/* need kshrinkd to reclaim if rwc trylock contended*/
+	unsigned int need_kshrinkd:1;
+
 	/* Allocation order */
 	s8 order;
 
@@ -201,6 +204,17 @@ struct scan_control {
  */
 int vm_swappiness = 60;
 
+/*
+ * Wake kshrinkd to reclaim the folios that hit rmap_walk lock contention in
+ * shrink_folio_list, instead of putting them back at the head of the LRU.
+ */
+static void wakeup_kshrinkd(struct pglist_data *pgdat)
+{
+	if (unlikely(!pgdat->kshrinkd))
+		return;
+	wake_up_interruptible(&pgdat->kshrinkd_wait);
+}
+
 #ifdef CONFIG_MEMCG
 
 /* Returns true for reclaim through cgroup limits or cgroup interfaces. */
@@ -844,6 +858,7 @@ enum folio_references {
 	FOLIOREF_RECLAIM_CLEAN,
 	FOLIOREF_KEEP,
 	FOLIOREF_ACTIVATE,
+	FOLIOREF_LOCK_CONTENDED,
 };
 
 static enum folio_references folio_check_references(struct folio *folio,
@@ -864,8 +879,12 @@ static enum folio_references folio_check_references(struct folio *folio,
 		return FOLIOREF_ACTIVATE;
 
 	/* rmap lock contention: rotate */
-	if (referenced_ptes == -1)
-		return FOLIOREF_KEEP;
+	if (referenced_ptes == -1) {
+		if (sc->need_kshrinkd && folio_pgdat(folio)->kshrinkd)
+			return FOLIOREF_LOCK_CONTENDED;
+		else
+			return FOLIOREF_KEEP;
+	}
 
 	if (referenced_ptes) {
 		/*
@@ -1035,6 +1054,7 @@ static unsigned int shrink_folio_list(struct list_head *folio_list,
 	struct folio_batch free_folios;
 	LIST_HEAD(ret_folios);
 	LIST_HEAD(demote_folios);
+	LIST_HEAD(contended_folios);
 	unsigned int nr_reclaimed = 0;
 	unsigned int pgactivate = 0;
 	bool do_demote_pass;
@@ -1052,6 +1072,7 @@ static unsigned int shrink_folio_list(struct list_head *folio_list,
 		enum folio_references references = FOLIOREF_RECLAIM;
 		bool dirty, writeback;
 		unsigned int nr_pages;
+		bool lock_contended = false;
 
 		cond_resched();
 
@@ -1193,6 +1214,9 @@ static unsigned int shrink_folio_list(struct list_head *folio_list,
 		case FOLIOREF_KEEP:
 			stat->nr_ref_keep += nr_pages;
 			goto keep_locked;
+		case FOLIOREF_LOCK_CONTENDED:
+			lock_contended = true;
+			goto keep_locked;
 		case FOLIOREF_RECLAIM:
 		case FOLIOREF_RECLAIM_CLEAN:
 			; /* try to reclaim the folio below */
@@ -1470,7 +1494,10 @@ static unsigned int shrink_folio_list(struct list_head *folio_list,
 keep_locked:
 		folio_unlock(folio);
 keep:
-		list_add(&folio->lru, &ret_folios);
+		if (unlikely(lock_contended))
+			list_add(&folio->lru, &contended_folios);
+		else
+			list_add(&folio->lru, &ret_folios);
 		VM_BUG_ON_FOLIO(folio_test_lru(folio) ||
 				folio_test_unevictable(folio), folio);
 	}
@@ -1512,6 +1539,14 @@ static unsigned int shrink_folio_list(struct list_head *folio_list,
 	free_unref_folios(&free_folios);
 
 	list_splice(&ret_folios, folio_list);
+
+	if (!list_empty(&contended_folios)) {
+		spin_lock_irq(&pgdat->kf_lock);
+		list_splice(&contended_folios, &pgdat->kshrinkd_folios);
+		spin_unlock_irq(&pgdat->kf_lock);
+		wakeup_kshrinkd(pgdat);
+	}
+
 	count_vm_events(PGACTIVATE, pgactivate);
 
 	if (plug)
@@ -1526,6 +1561,7 @@ unsigned int reclaim_clean_pages_from_list(struct zone *zone,
 		.gfp_mask = GFP_KERNEL,
 		.may_unmap = 1,
 		.rw_try_lock = 1,
+		.need_kshrinkd = 0,
 	};
 	struct reclaim_stat stat;
 	unsigned int nr_reclaimed;
@@ -2119,6 +2155,7 @@ static unsigned int reclaim_folio_list(struct list_head *folio_list,
 		.may_swap = 1,
 		.no_demotion = 1,
 		.rw_try_lock = 1,
+		.need_kshrinkd = 0,
 	};
 
 	nr_reclaimed = shrink_folio_list(folio_list, pgdat, &sc, &dummy_stat, ignore_references);
@@ -5465,6 +5502,7 @@ static ssize_t lru_gen_seq_write(struct file *file, const char __user *src,
 		.reclaim_idx = MAX_NR_ZONES - 1,
 		.gfp_mask = GFP_KERNEL,
 		.rw_try_lock = 1,
+		.need_kshrinkd = 0,
 	};
 
 	buf = kvmalloc(len + 1, GFP_KERNEL);
@@ -6443,6 +6481,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
 		.may_unmap = 1,
 		.may_swap = 1,
 		.rw_try_lock = 1,
+		.need_kshrinkd = 1,
 	};
 
 	/*
@@ -6489,6 +6528,7 @@ unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg,
 		.reclaim_idx = MAX_NR_ZONES - 1,
 		.may_swap = !noswap,
 		.rw_try_lock = 1,
+		.need_kshrinkd = 0,
 	};
 
 	WARN_ON_ONCE(!current->reclaim_state);
@@ -6536,6 +6576,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
 		.may_swap = !!(reclaim_options & MEMCG_RECLAIM_MAY_SWAP),
 		.proactive = !!(reclaim_options & MEMCG_RECLAIM_PROACTIVE),
 		.rw_try_lock = 1,
+		.need_kshrinkd = 0,
 	};
 	/*
 	 * Traverse the ZONELIST_FALLBACK zonelist of the current node to put
@@ -6798,6 +6839,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int highest_zoneidx)
 		.order = order,
 		.may_unmap = 1,
 		.rw_try_lock = 1,
+		.need_kshrinkd = 1,
 	};
 
 	set_task_reclaim_state(current, &sc.reclaim_state);
@@ -7268,6 +7310,7 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
 		.may_swap = 1,
 		.hibernation_mode = 1,
 		.rw_try_lock = 1,
+		.need_kshrinkd = 0,
 	};
 	struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask);
 	unsigned long nr_reclaimed;
@@ -7338,6 +7381,145 @@ static int __init kswapd_init(void)
 
 module_init(kswapd_init)
 
+/* Returns nonzero when pgdat->kshrinkd_folios has folios queued on it. */
+static int kshrinkd_should_run(pg_data_t *pgdat)
+{
+	int pending;
+
+	spin_lock_irq(&pgdat->kf_lock);
+	pending = list_empty(&pgdat->kshrinkd_folios) ? 0 : 1;
+	spin_unlock_irq(&pgdat->kf_lock);
+	return pending;
+}
+
+/*
+ * Reclaim @folio_list on behalf of kshrinkd, with rw_try_lock cleared.
+ * Returns the shrink_folio_list() result, or 0 when the list is empty.
+ */
+static unsigned long kshrinkd_reclaim_folios(struct list_head *folio_list,
+				struct pglist_data *pgdat)
+{
+	struct reclaim_stat unused_stat;
+	struct scan_control sc = {
+		.gfp_mask = GFP_KERNEL,
+		.may_writepage = 1,
+		.may_unmap = 1,
+		.may_swap = 1,
+		.no_demotion = 1,
+		.rw_try_lock = 0,
+		.need_kshrinkd = 0,
+	};
+
+	if (list_empty(folio_list))
+		return 0;
+	return shrink_folio_list(folio_list, pgdat, &sc, &unused_stat, false);
+}
+
+/*
+ * The background kshrink daemon: one kernel thread per node, started
+ * by kshrinkd_run().
+ *
+ * kshrinkd reclaims the folios that shrink_folio_list() skipped because
+ * the rmap_walk lock was contended, instead of putting them back at the
+ * head of the LRU, which would break LRU ordering.
+ */
+static int kshrinkd(void *p)
+{
+	pg_data_t *pgdat = p;
+	LIST_HEAD(tmp_contended_folios);
+
+	current->flags |= PF_MEMALLOC | PF_KSWAPD;
+	set_freezable();
+
+	while (!kthread_should_stop()) {
+		unsigned long nr_reclaimed = 0;
+		unsigned long nr_putback = 0;
+
+		/*
+		 * kthread_stop() wakes us without queueing any folios, so the
+		 * stop flag must be part of the condition or we sleep forever.
+		 */
+		wait_event_freezable(pgdat->kshrinkd_wait,
+			kshrinkd_should_run(pgdat) || kthread_should_stop());
+
+		/* take over the queued rmap_walk contended folios */
+		spin_lock_irq(&pgdat->kf_lock);
+		list_splice_init(&pgdat->kshrinkd_folios, &tmp_contended_folios);
+		spin_unlock_irq(&pgdat->kf_lock);
+
+		/* reclaim them; IRQs are on here, so no __count_vm_events() */
+		nr_reclaimed = kshrinkd_reclaim_folios(&tmp_contended_folios, pgdat);
+		count_vm_events(PGSTEAL_KSHRINKD, nr_reclaimed);
+
+		/* put back to the LRU the folios which failed to reclaim */
+		while (!list_empty(&tmp_contended_folios)) {
+			struct folio *folio = lru_to_folio(&tmp_contended_folios);
+
+			nr_putback += folio_nr_pages(folio);
+			list_del(&folio->lru);
+			folio_putback_lru(folio);
+		}
+
+		count_vm_events(PGSCAN_KSHRINKD, nr_reclaimed + nr_putback);
+	}
+
+	current->flags &= ~(PF_MEMALLOC | PF_KSWAPD);
+
+	return 0;
+}
+
+/*
+ * Start kshrinkd for node @nid; called by init and on node hot-add.
+ */
+void kshrinkd_run(int nid)
+{
+	pg_data_t *pgdat = NODE_DATA(nid);
+
+	if (pgdat->kshrinkd)
+		return;
+
+	pgdat->kshrinkd = kthread_run(kshrinkd, pgdat, "kshrinkd%d", nid);
+	if (!IS_ERR(pgdat->kshrinkd))
+		return;
+	/* the thread could not be started: report it and clear the pointer */
+	WARN_ON_ONCE(system_state < SYSTEM_RUNNING);
+	pr_err("Failed to start kshrinkd on node %d\n", nid);
+	pgdat->kshrinkd = NULL;
+}
+
+/*
+ * Stop the node's kshrinkd thread, if any.  Called by memory hotplug when all
+ * memory in a node is offlined; caller holds mem_hotplug_begin/done().
+ */
+void kshrinkd_stop(int nid)
+{
+	pg_data_t *pgdat = NODE_DATA(nid);
+
+	if (!pgdat->kshrinkd)
+		return;
+	kthread_stop(pgdat->kshrinkd);
+	pgdat->kshrinkd = NULL;
+}
+
+/* NOTE(review): nodes first populated by hotplug seem to skip this init - confirm */
+static int __init kshrinkd_init(void)
+{
+	int nid;
+
+	for_each_node_state(nid, N_MEMORY) {
+		pg_data_t *pgdat = NODE_DATA(nid);
+
+		INIT_LIST_HEAD(&pgdat->kshrinkd_folios);
+		spin_lock_init(&pgdat->kf_lock);
+		init_waitqueue_head(&pgdat->kshrinkd_wait);
+		kshrinkd_run(nid);
+	}
+
+	return 0;
+}
+
+module_init(kshrinkd_init)
+
 #ifdef CONFIG_NUMA
 /*
  * Node reclaim mode
@@ -7427,6 +7609,7 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in
 		.may_swap = 1,
 		.reclaim_idx = gfp_zone(gfp_mask),
 		.rw_try_lock = 1,
+		.need_kshrinkd = 1,
 	};
 	unsigned long pflags;
 
diff --git a/mm/vmstat.c b/mm/vmstat.c
index db79935e4a54..76d8a3b2d1a8 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1279,9 +1279,11 @@ const char * const vmstat_text[] = {
 
 	"pgrefill",
 	"pgreuse",
+	"pgsteal_kshrinkd",
 	"pgsteal_kswapd",
 	"pgsteal_direct",
 	"pgsteal_khugepaged",
+	"pgscan_kshrinkd",
 	"pgscan_kswapd",
 	"pgscan_direct",
 	"pgscan_khugepaged",
-- 
2.34.1