Message-Id: <20230418191313.268131-21-hannes@cmpxchg.org>
Date:   Tue, 18 Apr 2023 15:13:07 -0400
From:   Johannes Weiner <hannes@...xchg.org>
To:     linux-mm@...ck.org
Cc:     Kaiyang Zhao <kaiyang2@...cmu.edu>,
        Mel Gorman <mgorman@...hsingularity.net>,
        Vlastimil Babka <vbabka@...e.cz>,
        David Rientjes <rientjes@...gle.com>,
        linux-kernel@...r.kernel.org, kernel-team@...com
Subject: [RFC PATCH 20/26] mm: vmscan: use compaction_suitable() check in kswapd

Kswapd currently bails on higher-order allocations with an open-coded
check for whether it's reclaimed the compaction gap.

compaction_suitable() is the customary interface to coordinate reclaim
with compaction.
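
To illustrate the two approaches side by side (a simplified sketch, not
the exact kernel code; the helper names are made up for illustration,
and locking, tracing and the rest of the reclaim loop are omitted).
compact_gap(order) is twice the allocation size, i.e. 2UL << order
pages:

	/* Before: kswapd_shrink_node() open-codes the bailout by
	 * comparing the pages reclaimed so far against the gap that
	 * compaction is assumed to need. */
	static bool enough_reclaimed_open_coded(struct scan_control *sc)
	{
		return sc->order && sc->nr_reclaimed >= compact_gap(sc->order);
	}

	/* After: pgdat_balanced() asks compaction directly whether the
	 * zone has enough free order-0 pages for compaction to proceed
	 * on behalf of the requested order. */
	static bool enough_reclaimed_for_compaction(struct zone *zone,
						    int order,
						    int highest_zoneidx)
	{
		return compaction_suitable(zone, order,
					   highest_zoneidx) == COMPACT_CONTINUE;
	}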

Signed-off-by: Johannes Weiner <hannes@...xchg.org>
---
 mm/vmscan.c | 67 ++++++++++++++++++-----------------------------------
 1 file changed, 23 insertions(+), 44 deletions(-)

diff --git a/mm/vmscan.c b/mm/vmscan.c
index ee8c8ca2e7b5..723705b9e4d9 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -6872,12 +6872,18 @@ static bool pgdat_balanced(pg_data_t *pgdat, int order, int highest_zoneidx)
 		if (!managed_zone(zone))
 			continue;
 
+		/* Allocation can succeed in any zone, done */
 		if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING)
 			mark = wmark_pages(zone, WMARK_PROMO);
 		else
 			mark = high_wmark_pages(zone);
 		if (zone_watermark_ok_safe(zone, order, mark, highest_zoneidx))
 			return true;
+
+		/* Allocation can't succeed, but enough order-0 to compact */
+		if (compaction_suitable(zone, order,
+					highest_zoneidx) == COMPACT_CONTINUE)
+			return true;
 	}
 
 	/*
@@ -6968,16 +6974,6 @@ static bool kswapd_shrink_node(pg_data_t *pgdat,
 	 */
 	shrink_node(pgdat, sc);
 
-	/*
-	 * Fragmentation may mean that the system cannot be rebalanced for
-	 * high-order allocations. If twice the allocation size has been
-	 * reclaimed then recheck watermarks only at order-0 to prevent
-	 * excessive reclaim. Assume that a process requested a high-order
-	 * can direct reclaim/compact.
-	 */
-	if (sc->order && sc->nr_reclaimed >= compact_gap(sc->order))
-		sc->order = 0;
-
 	return sc->nr_scanned >= sc->nr_to_reclaim;
 }
 
@@ -7018,15 +7014,13 @@ clear_reclaim_active(pg_data_t *pgdat, int highest_zoneidx)
  * that are eligible for use by the caller until at least one zone is
  * balanced.
  *
- * Returns the order kswapd finished reclaiming at.
- *
  * kswapd scans the zones in the highmem->normal->dma direction.  It skips
  * zones which have free_pages > high_wmark_pages(zone), but once a zone is
  * found to have free_pages <= high_wmark_pages(zone), any page in that zone
  * or lower is eligible for reclaim until at least one usable zone is
  * balanced.
  */
-static int balance_pgdat(pg_data_t *pgdat, int order, int highest_zoneidx)
+static void balance_pgdat(pg_data_t *pgdat, int order, int highest_zoneidx)
 {
 	int i;
 	unsigned long nr_soft_reclaimed;
@@ -7226,14 +7220,6 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int highest_zoneidx)
 	__fs_reclaim_release(_THIS_IP_);
 	psi_memstall_leave(&pflags);
 	set_task_reclaim_state(current, NULL);
-
-	/*
-	 * Return the order kswapd stopped reclaiming at as
-	 * prepare_kswapd_sleep() takes it into account. If another caller
-	 * entered the allocator slow path while kswapd was awake, order will
-	 * remain at the higher level.
-	 */
-	return sc.order;
 }
 
 /*
@@ -7251,7 +7237,7 @@ static enum zone_type kswapd_highest_zoneidx(pg_data_t *pgdat,
 	return curr_idx == MAX_NR_ZONES ? prev_highest_zoneidx : curr_idx;
 }
 
-static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_order,
+static void kswapd_try_to_sleep(pg_data_t *pgdat, int order,
 				unsigned int highest_zoneidx)
 {
 	long remaining = 0;
@@ -7269,7 +7255,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_o
 	 * eligible zone balanced that it's also unlikely that compaction will
 	 * succeed.
 	 */
-	if (prepare_kswapd_sleep(pgdat, reclaim_order, highest_zoneidx)) {
+	if (prepare_kswapd_sleep(pgdat, order, highest_zoneidx)) {
 		/*
 		 * Compaction records what page blocks it recently failed to
 		 * isolate pages from and skips them in the future scanning.
@@ -7282,7 +7268,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_o
 		 * We have freed the memory, now we should compact it to make
 		 * allocation of the requested order possible.
 		 */
-		wakeup_kcompactd(pgdat, alloc_order, highest_zoneidx);
+		wakeup_kcompactd(pgdat, order, highest_zoneidx);
 
 		remaining = schedule_timeout(HZ/10);
 
@@ -7296,8 +7282,8 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_o
 					kswapd_highest_zoneidx(pgdat,
 							highest_zoneidx));
 
-			if (READ_ONCE(pgdat->kswapd_order) < reclaim_order)
-				WRITE_ONCE(pgdat->kswapd_order, reclaim_order);
+			if (READ_ONCE(pgdat->kswapd_order) < order)
+				WRITE_ONCE(pgdat->kswapd_order, order);
 		}
 
 		finish_wait(&pgdat->kswapd_wait, &wait);
@@ -7308,8 +7294,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_o
 	 * After a short sleep, check if it was a premature sleep. If not, then
 	 * go fully to sleep until explicitly woken up.
 	 */
-	if (!remaining &&
-	    prepare_kswapd_sleep(pgdat, reclaim_order, highest_zoneidx)) {
+	if (!remaining && prepare_kswapd_sleep(pgdat, order, highest_zoneidx)) {
 		trace_mm_vmscan_kswapd_sleep(pgdat->node_id);
 
 		/*
@@ -7350,8 +7335,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_o
  */
 static int kswapd(void *p)
 {
-	unsigned int alloc_order, reclaim_order;
-	unsigned int highest_zoneidx = MAX_NR_ZONES - 1;
+	unsigned int order, highest_zoneidx;
 	pg_data_t *pgdat = (pg_data_t *)p;
 	struct task_struct *tsk = current;
 	const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
@@ -7374,22 +7358,20 @@ static int kswapd(void *p)
 	tsk->flags |= PF_MEMALLOC | PF_KSWAPD;
 	set_freezable();
 
-	WRITE_ONCE(pgdat->kswapd_order, 0);
+	order = 0;
+	highest_zoneidx = MAX_NR_ZONES - 1;
+	WRITE_ONCE(pgdat->kswapd_order, order);
 	WRITE_ONCE(pgdat->kswapd_highest_zoneidx, MAX_NR_ZONES);
+
 	atomic_set(&pgdat->nr_writeback_throttled, 0);
+
 	for ( ; ; ) {
 		bool ret;
 
-		alloc_order = reclaim_order = READ_ONCE(pgdat->kswapd_order);
-		highest_zoneidx = kswapd_highest_zoneidx(pgdat,
-							highest_zoneidx);
-
-kswapd_try_sleep:
-		kswapd_try_to_sleep(pgdat, alloc_order, reclaim_order,
-					highest_zoneidx);
+		kswapd_try_to_sleep(pgdat, order, highest_zoneidx);
 
 		/* Read the new order and highest_zoneidx */
-		alloc_order = READ_ONCE(pgdat->kswapd_order);
+		order = READ_ONCE(pgdat->kswapd_order);
 		highest_zoneidx = kswapd_highest_zoneidx(pgdat,
 							highest_zoneidx);
 		WRITE_ONCE(pgdat->kswapd_order, 0);
@@ -7415,11 +7397,8 @@ static int kswapd(void *p)
 		 * request (alloc_order).
 		 */
 		trace_mm_vmscan_kswapd_wake(pgdat->node_id, highest_zoneidx,
-						alloc_order);
-		reclaim_order = balance_pgdat(pgdat, alloc_order,
-						highest_zoneidx);
-		if (reclaim_order < alloc_order)
-			goto kswapd_try_sleep;
+					    order);
+		balance_pgdat(pgdat, order, highest_zoneidx);
 	}
 
 	tsk->flags &= ~(PF_MEMALLOC | PF_KSWAPD);
-- 
2.39.2
