Message-Id: <20260120134256.2271710-2-chenridong@huaweicloud.com>
Date: Tue, 20 Jan 2026 13:42:50 +0000
From: Chen Ridong <chenridong@...weicloud.com>
To: akpm@...ux-foundation.org,
axelrasmussen@...gle.com,
yuanchu@...gle.com,
weixugc@...gle.com,
david@...nel.org,
lorenzo.stoakes@...cle.com,
Liam.Howlett@...cle.com,
vbabka@...e.cz,
rppt@...nel.org,
surenb@...gle.com,
mhocko@...e.com,
corbet@....net,
skhan@...uxfoundation.org,
hannes@...xchg.org,
roman.gushchin@...ux.dev,
shakeel.butt@...ux.dev,
muchun.song@...ux.dev,
zhengqi.arch@...edance.com
Cc: linux-mm@...ck.org,
linux-doc@...r.kernel.org,
linux-kernel@...r.kernel.org,
cgroups@...r.kernel.org,
lujialin4@...wei.com,
chenridong@...weicloud.com,
ryncsn@...il.com
Subject: [RFC PATCH -next 1/7] vmscan: add memcg heat level for reclaim

From: Chen Ridong <chenridong@...wei.com>

The memcg LRU was originally introduced to improve scalability during
global reclaim. However, it is complex and only works with gen LRU
global reclaim. Moreover, its implementation complexity has led to
performance regressions when handling a large number of memory cgroups [1].

This patch introduces a per-memcg heat level for reclaim, aiming to unify
gen LRU and traditional LRU global reclaim. The core idea is to track
per-node, per-memcg reclaim state consisting of heat, last_decay, and
last_refault. The last_refault field snapshots the lruvec's refault
statistics (workingset activations) at the previous reclaim of this memcg.
The last_decay field is a timestamp: the heat level decays over time if
the memcg is not reclaimed again. Both last_decay and last_refault are
used to derive the current heat level when reclaim starts.
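
As a rough illustration of the bookkeeping, here is a userspace model
only (plain longs and time_t stand in for the kernel's atomic_long_t
and jiffies; the constants match the ones added to mm/vmscan.c below):

#include <time.h>

#define MEMCG_HEAT_WARM           4
#define MEMCG_HEAT_HOT            8
#define MEMCG_HEAT_MAX           12
#define MEMCG_HEAT_DECAY_STEP     1
#define MEMCG_HEAT_DECAY_INTERVAL 1   /* one second, standing in for 1 * HZ */

struct heat_model {
        long   heat;          /* clamped to [0, MEMCG_HEAT_MAX] */
        time_t last_decay;    /* last time the heat was decayed */
        long   last_refault;  /* refault counters seen at the last reclaim */
};

/* Cool the memcg by one step, at most once per decay interval. */
void decay_heat(struct heat_model *hm, time_t now)
{
        if (now - hm->last_decay < MEMCG_HEAT_DECAY_INTERVAL)
                return;
        hm->last_decay = now;
        if (hm->heat >= MEMCG_HEAT_DECAY_STEP)
                hm->heat -= MEMCG_HEAT_DECAY_STEP;
        else
                hm->heat = 0;
}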

Three reclaim heat levels are defined: cold, warm, and hot. Cold memcgs
are reclaimed first; warm memcgs become eligible only if reclaiming the
cold ones does not free enough pages, and hot memcgs are reclaimed last.
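
Continuing the sketch above, the level policy boils down to the
following (heat_to_level() mirrors the thresholds added below;
reclaim_until() and its scan_one_level callback are illustrative
stand-ins for the per-memcg walk done in shrink_node_memcgs()):

enum scan_level { LEVEL_COLD, LEVEL_WARM, LEVEL_HOT, LEVEL_MAX };

/* Map a heat value to a level: 0..3 cold, 4..7 warm, 8..12 hot. */
enum scan_level heat_to_level(long heat)
{
        if (heat >= MEMCG_HEAT_HOT)
                return LEVEL_HOT;
        if (heat >= MEMCG_HEAT_WARM)
                return LEVEL_WARM;
        return LEVEL_COLD;
}

/*
 * Escalation: walk the memcgs once per level, coldest first, and stop
 * as soon as the reclaim target has been met.  (The patch additionally
 * moves to the next level only when a hotter memcg was actually
 * skipped, and caps non-root reclaim at the warm level.)
 */
long reclaim_until(long target, long (*scan_one_level)(enum scan_level))
{
        long reclaimed = 0;
        enum scan_level level;

        for (level = LEVEL_COLD; level < LEVEL_MAX; level++) {
                reclaimed += scan_one_level(level);
                if (reclaimed >= target)
                        break;
        }
        return reclaimed;
}
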
While this design can be applied to all memcg reclaim scenarios, this patch
is conservative and only introduces heat levels for traditional LRU global
reclaim. Subsequent patches will replace the memcg LRU with
heat-level-based reclaim.

Based on the test provided by Yu Zhao, traditional LRU global reclaim
shows a significant performance improvement with heat-level reclaim
enabled. The results below are from a 2-hour run of the test [2].

Throughput (number of requests)     before      after    Change
  Total                            1734169    2353717      +35%

Tail latency (number of requests)   before      after    Change
  [128s, inf)                          1231       1057      -14%
  [64s, 128s)                           586        444      -24%
  [32s, 64s)                           1658       1061      -36%
  [16s, 32s)                           4611       2863      -38%

[1] https://lore.kernel.org/r/20251126171513.GC135004@cmpxchg.org
[2] https://lore.kernel.org/all/20221220214923.1229538-1-yuzhao@google.com/

Signed-off-by: Chen Ridong <chenridong@...wei.com>
---
 include/linux/memcontrol.h |   7 ++
 mm/memcontrol.c            |   3 +
 mm/vmscan.c                | 227 +++++++++++++++++++++++++++++--------
 3 files changed, 192 insertions(+), 45 deletions(-)

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index af352cabedba..b293caf70034 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -76,6 +76,12 @@ struct memcg_vmstats;
struct lruvec_stats_percpu;
struct lruvec_stats;
+struct memcg_reclaim_state {
+ atomic_long_t heat;
+ unsigned long last_decay;
+ atomic_long_t last_refault;
+};
+
struct mem_cgroup_reclaim_iter {
struct mem_cgroup *position;
/* scan generation, increased every round-trip */
@@ -114,6 +120,7 @@ struct mem_cgroup_per_node {
CACHELINE_PADDING(_pad2_);
unsigned long lru_zone_size[MAX_NR_ZONES][NR_LRU_LISTS];
struct mem_cgroup_reclaim_iter iter;
+ struct memcg_reclaim_state reclaim;
#ifdef CONFIG_MEMCG_NMI_SAFETY_REQUIRES_ATOMIC
/* slab stats for nmi context */
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index f2b87e02574e..675d49ad7e2c 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -3713,6 +3713,9 @@ static bool alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
lruvec_init(&pn->lruvec);
pn->memcg = memcg;
+ atomic_long_set(&pn->reclaim.heat, 0);
+ pn->reclaim.last_decay = jiffies;
+ atomic_long_set(&pn->reclaim.last_refault, 0);
memcg->nodeinfo[node] = pn;
return true;
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 4aa73f125772..3759cd52c336 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -5978,6 +5978,124 @@ static inline bool should_continue_reclaim(struct pglist_data *pgdat,
return inactive_lru_pages > pages_for_compaction;
}
+enum memcg_scan_level {
+ MEMCG_LEVEL_COLD,
+ MEMCG_LEVEL_WARM,
+ MEMCG_LEVEL_HOT,
+ MEMCG_LEVEL_MAX,
+};
+
+#define MEMCG_HEAT_WARM 4
+#define MEMCG_HEAT_HOT 8
+#define MEMCG_HEAT_MAX 12
+#define MEMCG_HEAT_DECAY_STEP 1
+#define MEMCG_HEAT_DECAY_INTERVAL (1 * HZ)
+
+static void memcg_adjust_heat(struct mem_cgroup_per_node *pn, long delta)
+{
+ long heat, new_heat;
+
+ if (mem_cgroup_is_root(pn->memcg))
+ return;
+
+ heat = atomic_long_read(&pn->reclaim.heat);
+ do {
+ new_heat = clamp_t(long, heat + delta, 0, MEMCG_HEAT_MAX);
+ if (atomic_long_cmpxchg(&pn->reclaim.heat, heat, new_heat) == heat)
+ break;
+ heat = atomic_long_read(&pn->reclaim.heat);
+ } while (1);
+}
+
+static void memcg_decay_heat(struct mem_cgroup_per_node *pn)
+{
+ unsigned long last;
+ unsigned long now = jiffies;
+
+ if (mem_cgroup_is_root(pn->memcg))
+ return;
+
+ last = READ_ONCE(pn->reclaim.last_decay);
+ if (!time_after(now, last + MEMCG_HEAT_DECAY_INTERVAL))
+ return;
+
+ if (cmpxchg(&pn->reclaim.last_decay, last, now) != last)
+ return;
+
+ memcg_adjust_heat(pn, -MEMCG_HEAT_DECAY_STEP);
+}
+
+static int memcg_heat_level(struct mem_cgroup_per_node *pn)
+{
+ long heat;
+
+ if (mem_cgroup_is_root(pn->memcg))
+ return MEMCG_LEVEL_COLD;
+
+ memcg_decay_heat(pn);
+ heat = atomic_long_read(&pn->reclaim.heat);
+
+ if (heat >= MEMCG_HEAT_HOT)
+ return MEMCG_LEVEL_HOT;
+ if (heat >= MEMCG_HEAT_WARM)
+ return MEMCG_LEVEL_WARM;
+ return MEMCG_LEVEL_COLD;
+}
+
+static void memcg_record_reclaim_result(struct mem_cgroup_per_node *pn,
+ struct lruvec *lruvec,
+ unsigned long scanned,
+ unsigned long reclaimed)
+{
+ long delta;
+
+ if (mem_cgroup_is_root(pn->memcg))
+ return;
+
+ memcg_decay_heat(pn);
+
+ /*
+ * Memory cgroup heat adjustment algorithm:
+ * - If scanned == 0: mark as hottest (+MAX_HEAT)
+ * - If reclaimed >= 50% * scanned: strong cool (-2)
+ * - If reclaimed >= 25% * scanned: mild cool (-1)
+ * - Otherwise: warm up (+1)
+ */
+ if (!scanned)
+ delta = MEMCG_HEAT_MAX;
+ else if (reclaimed * 2 >= scanned)
+ delta = -2;
+ else if (reclaimed * 4 >= scanned)
+ delta = -1;
+ else
+ delta = 1;
+
+ /*
+ * Refault-based heat adjustment:
+ * - If refault increase > reclaimed pages: heat up (more cautious reclaim)
+ * - If no refaults and currently warm: cool down (allow more reclaim)
+ * This prevents thrashing by backing off when refaults indicate over-reclaim.
+ */
+ if (lruvec) {
+ unsigned long total_refaults;
+ unsigned long prev;
+ long refault_delta;
+
+ total_refaults = lruvec_page_state(lruvec, WORKINGSET_ACTIVATE_ANON);
+ total_refaults += lruvec_page_state(lruvec, WORKINGSET_ACTIVATE_FILE);
+
+ prev = atomic_long_xchg(&pn->reclaim.last_refault, total_refaults);
+ refault_delta = total_refaults - prev;
+
+ if (refault_delta > reclaimed)
+ delta++;
+ else if (!refault_delta && delta > 0)
+ delta--;
+ }
+
+ memcg_adjust_heat(pn, delta);
+}
+
static void shrink_node_memcgs(pg_data_t *pgdat, struct scan_control *sc)
{
struct mem_cgroup *target_memcg = sc->target_mem_cgroup;
@@ -5986,7 +6104,8 @@ static void shrink_node_memcgs(pg_data_t *pgdat, struct scan_control *sc)
};
struct mem_cgroup_reclaim_cookie *partial = &reclaim;
struct mem_cgroup *memcg;
-
+ int level;
+ int max_level = root_reclaim(sc) ? MEMCG_LEVEL_MAX : MEMCG_LEVEL_WARM;
/*
* In most cases, direct reclaimers can do partial walks
* through the cgroup tree, using an iterator state that
@@ -5999,62 +6118,80 @@ static void shrink_node_memcgs(pg_data_t *pgdat, struct scan_control *sc)
if (current_is_kswapd() || sc->memcg_full_walk)
partial = NULL;
- memcg = mem_cgroup_iter(target_memcg, NULL, partial);
- do {
- struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
- unsigned long reclaimed;
- unsigned long scanned;
-
- /*
- * This loop can become CPU-bound when target memcgs
- * aren't eligible for reclaim - either because they
- * don't have any reclaimable pages, or because their
- * memory is explicitly protected. Avoid soft lockups.
- */
- cond_resched();
+ for (level = MEMCG_LEVEL_COLD; level < max_level; level++) {
+ bool need_next_level = false;
- mem_cgroup_calculate_protection(target_memcg, memcg);
+ memcg = mem_cgroup_iter(target_memcg, NULL, partial);
+ do {
+ struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
+ unsigned long reclaimed;
+ unsigned long scanned;
+ struct mem_cgroup_per_node *pn = memcg->nodeinfo[pgdat->node_id];
- if (mem_cgroup_below_min(target_memcg, memcg)) {
- /*
- * Hard protection.
- * If there is no reclaimable memory, OOM.
- */
- continue;
- } else if (mem_cgroup_below_low(target_memcg, memcg)) {
/*
- * Soft protection.
- * Respect the protection only as long as
- * there is an unprotected supply
- * of reclaimable memory from other cgroups.
+ * This loop can become CPU-bound when target memcgs
+ * aren't eligible for reclaim - either because they
+ * don't have any reclaimable pages, or because their
+ * memory is explicitly protected. Avoid soft lockups.
*/
- if (!sc->memcg_low_reclaim) {
- sc->memcg_low_skipped = 1;
+ cond_resched();
+
+ mem_cgroup_calculate_protection(target_memcg, memcg);
+
+ if (mem_cgroup_below_min(target_memcg, memcg)) {
+ /*
+ * Hard protection.
+ * If there is no reclaimable memory, OOM.
+ */
continue;
+ } else if (mem_cgroup_below_low(target_memcg, memcg)) {
+ /*
+ * Soft protection.
+ * Respect the protection only as long as
+ * there is an unprotected supply
+ * of reclaimable memory from other cgroups.
+ */
+ if (!sc->memcg_low_reclaim) {
+ sc->memcg_low_skipped = 1;
+ continue;
+ }
+ memcg_memory_event(memcg, MEMCG_LOW);
}
- memcg_memory_event(memcg, MEMCG_LOW);
- }
- reclaimed = sc->nr_reclaimed;
- scanned = sc->nr_scanned;
+ if (root_reclaim(sc) && memcg_heat_level(pn) > level) {
+ need_next_level = true;
+ continue;
+ }
- shrink_lruvec(lruvec, sc);
+ reclaimed = sc->nr_reclaimed;
+ scanned = sc->nr_scanned;
- shrink_slab(sc->gfp_mask, pgdat->node_id, memcg,
- sc->priority);
+ shrink_lruvec(lruvec, sc);
+ if (!memcg || memcg_page_state(memcg, NR_SLAB_RECLAIMABLE_B))
+ shrink_slab(sc->gfp_mask, pgdat->node_id, memcg,
+ sc->priority);
- /* Record the group's reclaim efficiency */
- if (!sc->proactive)
- vmpressure(sc->gfp_mask, memcg, false,
- sc->nr_scanned - scanned,
- sc->nr_reclaimed - reclaimed);
+ if (root_reclaim(sc))
+ memcg_record_reclaim_result(pn, lruvec,
+ sc->nr_scanned - scanned,
+ sc->nr_reclaimed - reclaimed);
- /* If partial walks are allowed, bail once goal is reached */
- if (partial && sc->nr_reclaimed >= sc->nr_to_reclaim) {
- mem_cgroup_iter_break(target_memcg, memcg);
+ /* Record the group's reclaim efficiency */
+ if (!sc->proactive)
+ vmpressure(sc->gfp_mask, memcg, false,
+ sc->nr_scanned - scanned,
+ sc->nr_reclaimed - reclaimed);
+
+ /* If partial walks are allowed, bail once goal is reached */
+ if (partial && sc->nr_reclaimed >= sc->nr_to_reclaim) {
+ mem_cgroup_iter_break(target_memcg, memcg);
+ break;
+ }
+ } while ((memcg = mem_cgroup_iter(target_memcg, memcg, partial)));
+
+ if (!need_next_level)
break;
- }
- } while ((memcg = mem_cgroup_iter(target_memcg, memcg, partial)));
+ }
}
static void shrink_node(pg_data_t *pgdat, struct scan_control *sc)
--
2.34.1