[<prev] [next>] [<thread-prev] [day] [month] [year] [list]
Message-Id: <20260120123033.b2f0dec292fba02d5c8aafab@linux-foundation.org>
Date: Tue, 20 Jan 2026 12:30:33 -0800
From: Andrew Morton <akpm@...ux-foundation.org>
To: Jiayuan Chen <jiayuan.chen@...ux.dev>
Cc: linux-mm@...ck.org, David Hildenbrand <david@...nel.org>, Lorenzo
Stoakes <lorenzo.stoakes@...cle.com>, "Liam R. Howlett"
<Liam.Howlett@...cle.com>, Vlastimil Babka <vbabka@...e.cz>, Mike Rapoport
<rppt@...nel.org>, Suren Baghdasaryan <surenb@...gle.com>, Michal Hocko
<mhocko@...e.com>, Axel Rasmussen <axelrasmussen@...gle.com>, Yuanchu Xie
<yuanchu@...gle.com>, Wei Xu <weixugc@...gle.com>, Steven Rostedt
<rostedt@...dmis.org>, Masami Hiramatsu <mhiramat@...nel.org>, Mathieu
Desnoyers <mathieu.desnoyers@...icios.com>, Brendan Jackman
<jackmanb@...gle.com>, Johannes Weiner <hannes@...xchg.org>, Zi Yan
<ziy@...dia.com>, Qi Zheng <zhengqi.arch@...edance.com>, Shakeel Butt
<shakeel.butt@...ux.dev>, Jiayuan Chen <jiayuan.chen@...pee.com>,
linux-kernel@...r.kernel.org, linux-trace-kernel@...r.kernel.org
Subject: Re: [PATCH v4 0/2] mm/vmscan: mitigate spurious kswapd_failures
reset and add tracepoints
On Tue, 20 Jan 2026 10:43:47 +0800 Jiayuan Chen <jiayuan.chen@...ux.dev> wrote:
> == Problem ==
>
> We observed an issue in production on a multi-NUMA system where kswapd
> runs endlessly, causing sustained heavy IO READ pressure across the
> entire system.
>
> The root cause is that direct reclaim triggered by cgroup memory.high
> keeps resetting kswapd_failures to 0, even when the node cannot be
> balanced. This prevents kswapd from ever stopping after reaching
> MAX_RECLAIM_RETRIES.
>
Updated, thanks.
> v3 -> v4: https://lore.kernel.org/linux-mm/20260114074049.229935-1-jiayuan.chen@linux.dev/
> - Add Acked-by tags
> - Some modifications suggested by Johannes Weiner
Here's how v4 altered mm.git:
include/linux/mmzone.h | 26 ++++++++-----
include/trace/events/vmscan.h | 24 ++++++------
mm/memory-tiers.c | 2 -
mm/page_alloc.c | 4 +-
mm/show_mem.c | 3 -
mm/vmscan.c | 60 +++++++++++++++++---------------
mm/vmstat.c | 2 -
7 files changed, 64 insertions(+), 57 deletions(-)
--- a/include/linux/mmzone.h~b
+++ a/include/linux/mmzone.h
@@ -1531,26 +1531,30 @@ static inline unsigned long pgdat_end_pf
return pgdat->node_start_pfn + pgdat->node_spanned_pages;
}
-enum reset_kswapd_failures_reason {
- RESET_KSWAPD_FAILURES_OTHER = 0,
- RESET_KSWAPD_FAILURES_KSWAPD,
- RESET_KSWAPD_FAILURES_DIRECT,
- RESET_KSWAPD_FAILURES_PCP,
-};
-
-void pgdat_reset_kswapd_failures(pg_data_t *pgdat, enum reset_kswapd_failures_reason reason);
-
#include <linux/memory_hotplug.h>
void build_all_zonelists(pg_data_t *pgdat);
-void wakeup_kswapd(struct zone *zone, gfp_t gfp_mask, int order,
- enum zone_type highest_zoneidx);
bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
int highest_zoneidx, unsigned int alloc_flags,
long free_pages);
bool zone_watermark_ok(struct zone *z, unsigned int order,
unsigned long mark, int highest_zoneidx,
unsigned int alloc_flags);
+
+enum kswapd_clear_hopeless_reason {
+ KSWAPD_CLEAR_HOPELESS_OTHER = 0,
+ KSWAPD_CLEAR_HOPELESS_KSWAPD,
+ KSWAPD_CLEAR_HOPELESS_DIRECT,
+ KSWAPD_CLEAR_HOPELESS_PCP,
+};
+
+void wakeup_kswapd(struct zone *zone, gfp_t gfp_mask, int order,
+ enum zone_type highest_zoneidx);
+void kswapd_try_clear_hopeless(struct pglist_data *pgdat,
+ unsigned int order, int highest_zoneidx);
+void kswapd_clear_hopeless(pg_data_t *pgdat, enum kswapd_clear_hopeless_reason reason);
+bool kswapd_test_hopeless(pg_data_t *pgdat);
+
/*
* Memory initialization context, use to differentiate memory added by
* the platform statically or via memory hotplug interface.
--- a/include/trace/events/vmscan.h~b
+++ a/include/trace/events/vmscan.h
@@ -40,16 +40,16 @@
{_VMSCAN_THROTTLE_CONGESTED, "VMSCAN_THROTTLE_CONGESTED"} \
) : "VMSCAN_THROTTLE_NONE"
-TRACE_DEFINE_ENUM(RESET_KSWAPD_FAILURES_OTHER);
-TRACE_DEFINE_ENUM(RESET_KSWAPD_FAILURES_KSWAPD);
-TRACE_DEFINE_ENUM(RESET_KSWAPD_FAILURES_DIRECT);
-TRACE_DEFINE_ENUM(RESET_KSWAPD_FAILURES_PCP);
-
-#define reset_kswapd_src \
- {RESET_KSWAPD_FAILURES_KSWAPD, "KSWAPD"}, \
- {RESET_KSWAPD_FAILURES_DIRECT, "DIRECT"}, \
- {RESET_KSWAPD_FAILURES_PCP, "PCP"}, \
- {RESET_KSWAPD_FAILURES_OTHER, "OTHER"}
+TRACE_DEFINE_ENUM(KSWAPD_CLEAR_HOPELESS_OTHER);
+TRACE_DEFINE_ENUM(KSWAPD_CLEAR_HOPELESS_KSWAPD);
+TRACE_DEFINE_ENUM(KSWAPD_CLEAR_HOPELESS_DIRECT);
+TRACE_DEFINE_ENUM(KSWAPD_CLEAR_HOPELESS_PCP);
+
+#define kswapd_clear_hopeless_reason_ops \
+ {KSWAPD_CLEAR_HOPELESS_KSWAPD, "KSWAPD"}, \
+ {KSWAPD_CLEAR_HOPELESS_DIRECT, "DIRECT"}, \
+ {KSWAPD_CLEAR_HOPELESS_PCP, "PCP"}, \
+ {KSWAPD_CLEAR_HOPELESS_OTHER, "OTHER"}
#define trace_reclaim_flags(file) ( \
(file ? RECLAIM_WB_FILE : RECLAIM_WB_ANON) | \
@@ -566,7 +566,7 @@ TRACE_EVENT(mm_vmscan_kswapd_reclaim_fai
__entry->nid, __entry->failures)
);
-TRACE_EVENT(mm_vmscan_reset_kswapd_failures,
+TRACE_EVENT(mm_vmscan_kswapd_clear_hopeless,
TP_PROTO(int nid, int reason),
@@ -584,7 +584,7 @@ TRACE_EVENT(mm_vmscan_reset_kswapd_failu
TP_printk("nid=%d reason=%s",
__entry->nid,
- __print_symbolic(__entry->reason, reset_kswapd_src))
+ __print_symbolic(__entry->reason, kswapd_clear_hopeless_reason_ops))
);
#endif /* _TRACE_VMSCAN_H */
--- a/mm/memory-tiers.c~b
+++ a/mm/memory-tiers.c
@@ -955,7 +955,7 @@ static ssize_t demotion_enabled_store(st
struct pglist_data *pgdat;
for_each_online_pgdat(pgdat)
- pgdat_reset_kswapd_failures(pgdat, RESET_KSWAPD_FAILURES_OTHER);
+ kswapd_clear_hopeless(pgdat, KSWAPD_CLEAR_HOPELESS_OTHER);
}
return count;
--- a/mm/page_alloc.c~b
+++ a/mm/page_alloc.c
@@ -2945,9 +2945,9 @@ static bool free_frozen_page_commit(stru
* 'hopeless node' to stay in that state for a while. Let
* kswapd work again by resetting kswapd_failures.
*/
- if (atomic_read(&pgdat->kswapd_failures) >= MAX_RECLAIM_RETRIES &&
+ if (kswapd_test_hopeless(pgdat) &&
next_memory_node(pgdat->node_id) < MAX_NUMNODES)
- pgdat_reset_kswapd_failures(pgdat, RESET_KSWAPD_FAILURES_PCP);
+ kswapd_clear_hopeless(pgdat, KSWAPD_CLEAR_HOPELESS_PCP);
}
return ret;
}
--- a/mm/show_mem.c~b
+++ a/mm/show_mem.c
@@ -278,8 +278,7 @@ static void show_free_areas(unsigned int
#endif
K(node_page_state(pgdat, NR_PAGETABLE)),
K(node_page_state(pgdat, NR_SECONDARY_PAGETABLE)),
- str_yes_no(atomic_read(&pgdat->kswapd_failures) >=
- MAX_RECLAIM_RETRIES),
+ str_yes_no(kswapd_test_hopeless(pgdat)),
K(node_page_state(pgdat, NR_BALLOON_PAGES)));
}
--- a/mm/vmscan.c~b
+++ a/mm/vmscan.c
@@ -506,7 +506,7 @@ static bool skip_throttle_noprogress(pg_
* If kswapd is disabled, reschedule if necessary but do not
* throttle as the system is likely near OOM.
*/
- if (atomic_read(&pgdat->kswapd_failures) >= MAX_RECLAIM_RETRIES)
+ if (kswapd_test_hopeless(pgdat))
return true;
/*
@@ -2647,28 +2647,6 @@ static bool can_age_anon_pages(struct lr
lruvec_memcg(lruvec));
}
-void pgdat_reset_kswapd_failures(pg_data_t *pgdat, enum reset_kswapd_failures_reason reason)
-{
- /* Only trace actual resets, not redundant zero-to-zero */
- if (atomic_xchg(&pgdat->kswapd_failures, 0))
- trace_mm_vmscan_reset_kswapd_failures(pgdat->node_id, reason);
-}
-
-/*
- * Reset kswapd_failures only when the node is balanced. Without this
- * check, successful direct reclaim (e.g., from cgroup memory.high
- * throttling) can keep resetting kswapd_failures even when the node
- * cannot be balanced, causing kswapd to run endlessly.
- */
-static bool pgdat_balanced(pg_data_t *pgdat, int order, int highest_zoneidx);
-static inline void pgdat_try_reset_kswapd_failures(struct pglist_data *pgdat,
- struct scan_control *sc)
-{
- if (pgdat_balanced(pgdat, sc->order, sc->reclaim_idx))
- pgdat_reset_kswapd_failures(pgdat, current_is_kswapd() ?
- RESET_KSWAPD_FAILURES_KSWAPD : RESET_KSWAPD_FAILURES_DIRECT);
-}
-
#ifdef CONFIG_LRU_GEN
#ifdef CONFIG_LRU_GEN_ENABLED
@@ -5086,7 +5064,7 @@ static void lru_gen_shrink_node(struct p
blk_finish_plug(&plug);
done:
if (sc->nr_reclaimed > reclaimed)
- pgdat_try_reset_kswapd_failures(pgdat, sc);
+ kswapd_try_clear_hopeless(pgdat, sc->order, sc->reclaim_idx);
}
/******************************************************************************
@@ -6153,7 +6131,7 @@ again:
* successful direct reclaim run will revive a dormant kswapd.
*/
if (reclaimable)
- pgdat_try_reset_kswapd_failures(pgdat, sc);
+ kswapd_try_clear_hopeless(pgdat, sc->order, sc->reclaim_idx);
else if (sc->cache_trim_mode)
sc->cache_trim_mode_failed = 1;
}
@@ -6458,7 +6436,7 @@ static bool allow_direct_reclaim(pg_data
int i;
bool wmark_ok;
- if (atomic_read(&pgdat->kswapd_failures) >= MAX_RECLAIM_RETRIES)
+ if (kswapd_test_hopeless(pgdat))
return true;
for_each_managed_zone_pgdat(zone, pgdat, i, ZONE_NORMAL) {
@@ -6867,7 +6845,7 @@ static bool prepare_kswapd_sleep(pg_data
wake_up_all(&pgdat->pfmemalloc_wait);
/* Hopeless node, leave it to direct reclaim */
- if (atomic_read(&pgdat->kswapd_failures) >= MAX_RECLAIM_RETRIES)
+ if (kswapd_test_hopeless(pgdat))
return true;
if (pgdat_balanced(pgdat, order, highest_zoneidx)) {
@@ -7395,7 +7373,7 @@ void wakeup_kswapd(struct zone *zone, gf
return;
/* Hopeless node, leave it to direct reclaim if possible */
- if (atomic_read(&pgdat->kswapd_failures) >= MAX_RECLAIM_RETRIES ||
+ if (kswapd_test_hopeless(pgdat) ||
(pgdat_balanced(pgdat, order, highest_zoneidx) &&
!pgdat_watermark_boosted(pgdat, highest_zoneidx))) {
/*
@@ -7415,6 +7393,32 @@ void wakeup_kswapd(struct zone *zone, gf
wake_up_interruptible(&pgdat->kswapd_wait);
}
+void kswapd_clear_hopeless(pg_data_t *pgdat, enum kswapd_clear_hopeless_reason reason)
+{
+ /* Only trace actual resets, not redundant zero-to-zero */
+ if (atomic_xchg(&pgdat->kswapd_failures, 0))
+ trace_mm_vmscan_kswapd_clear_hopeless(pgdat->node_id, reason);
+}
+
+/*
+ * Reset kswapd_failures only when the node is balanced. Without this
+ * check, successful direct reclaim (e.g., from cgroup memory.high
+ * throttling) can keep resetting kswapd_failures even when the node
+ * cannot be balanced, causing kswapd to run endlessly.
+ */
+void kswapd_try_clear_hopeless(struct pglist_data *pgdat,
+ unsigned int order, int highest_zoneidx)
+{
+ if (pgdat_balanced(pgdat, order, highest_zoneidx))
+ kswapd_clear_hopeless(pgdat, current_is_kswapd() ?
+ KSWAPD_CLEAR_HOPELESS_KSWAPD : KSWAPD_CLEAR_HOPELESS_DIRECT);
+}
+
+bool kswapd_test_hopeless(pg_data_t *pgdat)
+{
+ return atomic_read(&pgdat->kswapd_failures) >= MAX_RECLAIM_RETRIES;
+}
+
#ifdef CONFIG_HIBERNATION
/*
* Try to free `nr_to_reclaim' of memory, system-wide, and return the number of
--- a/mm/vmstat.c~b
+++ a/mm/vmstat.c
@@ -1840,7 +1840,7 @@ static void zoneinfo_show_print(struct s
"\n start_pfn: %lu"
"\n reserved_highatomic: %lu"
"\n free_highatomic: %lu",
- atomic_read(&pgdat->kswapd_failures) >= MAX_RECLAIM_RETRIES,
+ kswapd_test_hopeless(pgdat),
zone->zone_start_pfn,
zone->nr_reserved_highatomic,
zone->nr_free_highatomic);
_
Powered by blists - more mailing lists