Message-ID: <20251011062043.772549-1-mawupeng1@huawei.com>
Date: Sat, 11 Oct 2025 14:20:43 +0800
From: Wupeng Ma <mawupeng1@...wei.com>
To: <akpm@...ux-foundation.org>, <david@...hat.com>, <jackmanb@...gle.com>,
	<hannes@...xchg.org>, <zhengqi.arch@...edance.com>, <shakeel.butt@...ux.dev>
CC: <mawupeng1@...wei.com>, <linux-mm@...ck.org>,
	<linux-kernel@...r.kernel.org>
Subject: [RFC PATCH] mm: vmscan: wakeup kswapd during node_reclaim

During testing, we observed that with node_reclaim_mode enabled, large
allocations become extremely slow on a node whose free memory is mostly
occupied by clean page cache.
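
For context, node reclaim is controlled by the vm.zone_reclaim_mode
sysctl. A minimal setup to reproduce this (assumed test environment,
not part of this patch) is to enable node-local reclaim and then fill
the target node with clean page cache, e.g. by reading a large file:

	# bit 0 enables node reclaim; bits 1/2 would additionally
	# set RECLAIM_WRITE / RECLAIM_UNMAP
	echo 1 > /proc/sys/vm/zone_reclaim_mode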

The slowness arises because node reclaim only performs direct
reclaim-like scanning, reclaiming as few as 32 pages per attempt (see
the abridged scan_control below), and never wakes kswapd, even when the
watermark levels and alloc_flags already satisfy the conditions for
waking it.
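
The 32-page figure is the floor that node reclaim's scan_control puts
on each attempt: the target is 1 << order pages, but at least
SWAP_CLUSTER_MAX, so an order-0 allocation reclaims just 32 pages per
pass. Abridged from the existing code in mm/vmscan.c:

	struct scan_control sc = {
		/* 1 << order, but no less than SWAP_CLUSTER_MAX (32) */
		.nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
		...
	};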

This patch wakes kswapd during node reclaim, allowing background
reclaim to bring free memory up to the high watermark and avoiding
excessive node reclaim overhead on subsequent allocations.
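
One way to verify that kswapd is now doing the bulk of the work (an
assumed verification step, not part of this patch) is to watch the
kswapd reclaim counters in /proc/vmstat while the test runs:

	grep -E 'pgscan_kswapd|pgsteal_kswapd' /proc/vmstat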

Signed-off-by: Wupeng Ma <mawupeng1@...wei.com>
---
 mm/internal.h   | 14 ++++++++------
 mm/page_alloc.c |  6 +++++-
 mm/vmscan.c     | 19 +++++++++++++++++--
 3 files changed, 30 insertions(+), 9 deletions(-)

diff --git a/mm/internal.h b/mm/internal.h
index 1561fc2ff5b8..5303123dd0a8 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -1196,21 +1196,23 @@ static inline void mminit_verify_zonelist(void)
 }
 #endif /* CONFIG_DEBUG_MEMORY_INIT */
 
-#define NODE_RECLAIM_NOSCAN	-2
-#define NODE_RECLAIM_FULL	-1
-#define NODE_RECLAIM_SOME	0
-#define NODE_RECLAIM_SUCCESS	1
+#define NODE_RECLAIM_NOSCAN		-2
+#define NODE_RECLAIM_FULL		-1
+#define NODE_RECLAIM_SOME		0
+#define NODE_RECLAIM_SUCCESS		1
+#define NODE_RECLAIM_KSWAPD_SUCCESS	2
 
 #ifdef CONFIG_NUMA
 extern int node_reclaim_mode;
 
-extern int node_reclaim(struct pglist_data *, gfp_t, unsigned int);
+int node_reclaim(struct pglist_data *pgdat, gfp_t mask, unsigned int order,
+		 int alloc_flags, struct zone *zone);
 extern int find_next_best_node(int node, nodemask_t *used_node_mask);
 #else
 #define node_reclaim_mode 0
 
 static inline int node_reclaim(struct pglist_data *pgdat, gfp_t mask,
-				unsigned int order)
+		unsigned int order, int alloc_flags, struct zone *zone)
 {
 	return NODE_RECLAIM_NOSCAN;
 }
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 600d9e981c23..2472000cab78 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3859,7 +3859,8 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
 			    !zone_allows_reclaim(zonelist_zone(ac->preferred_zoneref), zone))
 				continue;
 
-			ret = node_reclaim(zone->zone_pgdat, gfp_mask, order);
+			ret = node_reclaim(zone->zone_pgdat, gfp_mask, order,
+					   alloc_flags, zone);
 			switch (ret) {
 			case NODE_RECLAIM_NOSCAN:
 				/* did not scan */
@@ -3867,6 +3868,9 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
 			case NODE_RECLAIM_FULL:
 				/* scanned but unreclaimable */
 				continue;
+			case NODE_RECLAIM_KSWAPD_SUCCESS:
+				/* kswapd reclaimed enough */
+				goto try_this_zone;
 			default:
 				/* did we reclaim enough */
 				if (zone_watermark_ok(zone, order, mark,
diff --git a/mm/vmscan.c b/mm/vmscan.c
index b2fc8b626d3d..ebee8b6330a8 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -7680,9 +7680,11 @@ static unsigned long __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask,
 	return sc->nr_reclaimed;
 }
 
-int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order)
+int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order,
+		 int alloc_flags, struct zone *zone)
 {
 	int ret;
+	enum zone_type highest_zoneidx = gfp_zone(gfp_mask);
 	/* Minimum pages needed in order to stay on node */
 	const unsigned long nr_pages = 1 << order;
 	struct scan_control sc = {
@@ -7693,7 +7695,7 @@ int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order)
 		.may_writepage = !!(node_reclaim_mode & RECLAIM_WRITE),
 		.may_unmap = !!(node_reclaim_mode & RECLAIM_UNMAP),
 		.may_swap = 1,
-		.reclaim_idx = gfp_zone(gfp_mask),
+		.reclaim_idx = highest_zoneidx,
 	};
 
 	/*
@@ -7729,6 +7731,19 @@ int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order)
 	if (test_and_set_bit_lock(PGDAT_RECLAIM_LOCKED, &pgdat->flags))
 		return NODE_RECLAIM_NOSCAN;
 
+	if (alloc_flags & ALLOC_KSWAPD) {
+		unsigned long mark;
+
+		wakeup_kswapd(zone, gfp_mask, order, highest_zoneidx);
+
+		mark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK);
+		if (zone_watermark_ok(zone, order, mark, highest_zoneidx,
+					alloc_flags)) {
+			clear_bit_unlock(PGDAT_RECLAIM_LOCKED, &pgdat->flags);
+			return NODE_RECLAIM_KSWAPD_SUCCESS;
+		}
+	}
+
 	ret = __node_reclaim(pgdat, gfp_mask, nr_pages, &sc) >= nr_pages;
 	clear_bit_unlock(PGDAT_RECLAIM_LOCKED, &pgdat->flags);
 
-- 
2.43.0
