Message-ID: <20251223212032.665731-1-bingjiao@google.com>
Date: Tue, 23 Dec 2025 21:19:59 +0000
From: Bing Jiao <bingjiao@...gle.com>
To: linux-mm@...ck.org
Cc: linux-kernel@...r.kernel.org, akpm@...ux-foundation.org, gourry@...rry.net,
longman@...hat.com, hannes@...xchg.org, mhocko@...nel.org,
roman.gushchin@...ux.dev, shakeel.butt@...ux.dev, muchun.song@...ux.dev,
tj@...nel.org, mkoutny@...e.com, david@...nel.org, zhengqi.arch@...edance.com,
lorenzo.stoakes@...cle.com, axelrasmussen@...gle.com,
chenridong@...weicloud.com, yuanchu@...gle.com, weixugc@...gle.com,
cgroups@...r.kernel.org
Subject: [PATCH v3] mm/vmscan: fix demotion targets checks in reclaim/demotion

Fix two bugs in demote_folio_list() and can_demote() caused by
incorrect demotion-target checks in reclaim/demotion.

Commit 7d709f49babc ("vmscan,cgroup: apply mems_effective to reclaim")
introduced the cpuset.mems_effective check and applied it to
can_demote(). However:

1. It does not apply this check in demote_folio_list(), so pages can
   be demoted to nodes that are explicitly excluded from the task's
   cpuset.mems.

2. In can_demote(), it checks only the immediate next demotion target
   rather than all allowed demotion targets. Pages are then never
   demoted if the next-tier nodes are not set in mems_effective, even
   when farther-tier nodes are allowed.

These bugs break the resource isolation provided by cpuset.mems. On
multi-tier memory systems this is visible from userspace: pages either
fail to be demoted entirely or are demoted to nodes that are not
allowed.
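
For reference, the pre-patch logic (paraphrased from the code shown in
the diff below) boils down to:

  /* can_demote(): only the immediate next target is consulted */
  demotion_nid = next_demotion_node(nid);
  if (demotion_nid == NUMA_NO_NODE)
          return false;
  if (!mem_cgroup_node_allowed(memcg, demotion_nid))
          return false;                   /* -> bug 2 */

  /* demote_folio_list(): targets are never checked against cpuset */
  node_get_allowed_targets(pgdat, &allowed_mask);
  migrate_pages(demote_folios, ...);      /* -> bug 1 */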

To address these bugs, convert cpuset_node_allowed() and
mem_cgroup_node_allowed() into cpuset_node_get_allowed() and
mem_cgroup_node_get_allowed(), which return the effective_mems
nodemask so it can be ANDed directly against the demotion targets.
Update can_demote() and demote_folio_list() accordingly.
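
In effect, both call sites now reduce to the same nodemask
intersection (a simplified sketch; the full logic, including the
watermark walk in can_demote(), is in the diff below):

  nodemask_t allowed_mask, allowed_mems;

  node_get_allowed_targets(pgdat, &allowed_mask);     /* all demotion targets */
  allowed_mems = mem_cgroup_node_get_allowed(memcg);  /* cpuset effective_mems */
  nodes_and(allowed_mask, allowed_mask, allowed_mems);
  if (nodes_empty(allowed_mask))
          return ...;                                 /* nothing allowed: skip demotion */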

Reproduce Bug 1:

Assume a system with 4 nodes, where nodes 0-1 are top-tier and
nodes 2-3 are far-tier memory. All nodes have equal capacity.

Test script:

  echo 1 > /sys/kernel/mm/numa/demotion_enabled
  mkdir /sys/fs/cgroup/test
  echo +cpuset > /sys/fs/cgroup/cgroup.subtree_control
  echo "0-2" > /sys/fs/cgroup/test/cpuset.mems
  echo $$ > /sys/fs/cgroup/test/cgroup.procs
  swapoff -a

  # Expectation: allocations and demotions stay within nodes 0-2.
  # Observation: node 3 shows significant allocation (its MemFree drops).
  stress-ng --oomable --vm 1 --vm-bytes 150% --mbind 0,1

Reproduce Bug 2:

Assume a system with 6 nodes, where nodes 0-2 are top-tier,
node 3 is a far-tier node, and nodes 4-5 are the farthest-tier nodes.
All nodes have equal capacity.

Test script:

  echo 1 > /sys/kernel/mm/numa/demotion_enabled
  mkdir /sys/fs/cgroup/test
  echo +cpuset > /sys/fs/cgroup/cgroup.subtree_control
  echo "0-2,4-5" > /sys/fs/cgroup/test/cpuset.mems
  echo $$ > /sys/fs/cgroup/test/cgroup.procs
  swapoff -a

  # Expectation: pages are demoted to nodes 4-5.
  # Observation: no pages are demoted before OOM.
  stress-ng --oomable --vm 1 --vm-bytes 150% --mbind 0,1,2

Fixes: 7d709f49babc ("vmscan,cgroup: apply mems_effective to reclaim")
Cc: <stable@...r.kernel.org>
Signed-off-by: Bing Jiao <bingjiao@...gle.com>
---
include/linux/cpuset.h | 6 +++---
include/linux/memcontrol.h | 6 +++---
kernel/cgroup/cpuset.c | 16 ++++++++--------
mm/memcontrol.c | 6 ++++--
mm/vmscan.c | 35 +++++++++++++++++++++++------------
5 files changed, 41 insertions(+), 28 deletions(-)
diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
index a98d3330385c..eb358c3aa9c0 100644
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -174,7 +174,7 @@ static inline void set_mems_allowed(nodemask_t nodemask)
task_unlock(current);
}
-extern bool cpuset_node_allowed(struct cgroup *cgroup, int nid);
+extern nodemask_t cpuset_node_get_allowed(struct cgroup *cgroup);
#else /* !CONFIG_CPUSETS */
static inline bool cpusets_enabled(void) { return false; }
@@ -301,9 +301,9 @@ static inline bool read_mems_allowed_retry(unsigned int seq)
return false;
}
-static inline bool cpuset_node_allowed(struct cgroup *cgroup, int nid)
+static inline nodemask_t cpuset_node_get_allowed(struct cgroup *cgroup)
{
- return true;
+ return node_possible_map;
}
#endif /* !CONFIG_CPUSETS */
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index fd400082313a..f9463d853bba 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -1740,7 +1740,7 @@ static inline void count_objcg_events(struct obj_cgroup *objcg,
rcu_read_unlock();
}
-bool mem_cgroup_node_allowed(struct mem_cgroup *memcg, int nid);
+nodemask_t mem_cgroup_node_get_allowed(struct mem_cgroup *memcg);
void mem_cgroup_show_protected_memory(struct mem_cgroup *memcg);
@@ -1811,9 +1811,9 @@ static inline ino_t page_cgroup_ino(struct page *page)
return 0;
}
-static inline bool mem_cgroup_node_allowed(struct mem_cgroup *memcg, int nid)
+static inline nodemask_t mem_cgroup_node_get_allowed(struct mem_cgroup *memcg)
{
- return true;
+ return node_possible_map;
}
static inline void mem_cgroup_show_protected_memory(struct mem_cgroup *memcg)
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 6e6eb09b8db6..abb9afb64205 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -4416,23 +4416,23 @@ bool cpuset_current_node_allowed(int node, gfp_t gfp_mask)
return allowed;
}
-bool cpuset_node_allowed(struct cgroup *cgroup, int nid)
+nodemask_t cpuset_node_get_allowed(struct cgroup *cgroup)
{
+ nodemask_t nodes = node_possible_map;
struct cgroup_subsys_state *css;
struct cpuset *cs;
- bool allowed;
/*
* In v1, mem_cgroup and cpuset are unlikely in the same hierarchy
* and mems_allowed is likely to be empty even if we could get to it,
- * so return true to avoid taking a global lock on the empty check.
+ * so return directly to avoid taking a global lock on the empty check.
*/
- if (!cpuset_v2())
- return true;
+ if (!cgroup || !cpuset_v2())
+ return nodes;
css = cgroup_get_e_css(cgroup, &cpuset_cgrp_subsys);
if (!css)
- return true;
+ return nodes;
/*
* Normally, accessing effective_mems would require the cpuset_mutex
@@ -4447,9 +4447,9 @@ bool cpuset_node_allowed(struct cgroup *cgroup, int nid)
* cannot make strong isolation guarantees, so this is acceptable.
*/
cs = container_of(css, struct cpuset, css);
- allowed = node_isset(nid, cs->effective_mems);
+ nodes_copy(nodes, cs->effective_mems);
css_put(css);
- return allowed;
+ return nodes;
}
/**
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 75fc22a33b28..c2f4ac50d5c2 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -5597,9 +5597,11 @@ subsys_initcall(mem_cgroup_swap_init);
#endif /* CONFIG_SWAP */
-bool mem_cgroup_node_allowed(struct mem_cgroup *memcg, int nid)
+nodemask_t mem_cgroup_node_get_allowed(struct mem_cgroup *memcg)
{
- return memcg ? cpuset_node_allowed(memcg->css.cgroup, nid) : true;
+ if (memcg)
+ return cpuset_node_get_allowed(memcg->css.cgroup);
+ return node_possible_map;
}
void mem_cgroup_show_protected_memory(struct mem_cgroup *memcg)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index a4b308a2f9ad..711a04baf258 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -345,18 +345,24 @@ static bool can_demote(int nid, struct scan_control *sc,
struct mem_cgroup *memcg)
{
int demotion_nid;
+ struct pglist_data *pgdat = NODE_DATA(nid);
+ nodemask_t allowed_mask, allowed_mems;
- if (!numa_demotion_enabled)
+ if (!pgdat || !numa_demotion_enabled)
return false;
if (sc && sc->no_demotion)
return false;
- demotion_nid = next_demotion_node(nid);
- if (demotion_nid == NUMA_NO_NODE)
+ node_get_allowed_targets(pgdat, &allowed_mask);
+ if (nodes_empty(allowed_mask))
+ return false;
+
+ allowed_mems = mem_cgroup_node_get_allowed(memcg);
+ nodes_and(allowed_mask, allowed_mask, allowed_mems);
+ if (nodes_empty(allowed_mask))
return false;
- /* If demotion node isn't in the cgroup's mems_allowed, fall back */
- if (mem_cgroup_node_allowed(memcg, demotion_nid)) {
+ for_each_node_mask(demotion_nid, allowed_mask) {
int z;
struct zone *zone;
struct pglist_data *pgdat = NODE_DATA(demotion_nid);
@@ -1029,11 +1035,12 @@ static struct folio *alloc_demote_folio(struct folio *src,
* Folios which are not demoted are left on @demote_folios.
*/
static unsigned int demote_folio_list(struct list_head *demote_folios,
- struct pglist_data *pgdat)
+ struct pglist_data *pgdat,
+ struct mem_cgroup *memcg)
{
int target_nid = next_demotion_node(pgdat->node_id);
unsigned int nr_succeeded;
- nodemask_t allowed_mask;
+ nodemask_t allowed_mask, allowed_mems;
struct migration_target_control mtc = {
/*
@@ -1043,7 +1050,6 @@ static unsigned int demote_folio_list(struct list_head *demote_folios,
*/
.gfp_mask = (GFP_HIGHUSER_MOVABLE & ~__GFP_RECLAIM) |
__GFP_NOMEMALLOC | GFP_NOWAIT,
- .nid = target_nid,
.nmask = &allowed_mask,
.reason = MR_DEMOTION,
};
@@ -1051,10 +1057,15 @@ static unsigned int demote_folio_list(struct list_head *demote_folios,
if (list_empty(demote_folios))
return 0;
- if (target_nid == NUMA_NO_NODE)
- return 0;
-
node_get_allowed_targets(pgdat, &allowed_mask);
+ allowed_mems = mem_cgroup_node_get_allowed(memcg);
+ nodes_and(allowed_mask, allowed_mask, allowed_mems);
+ if (nodes_empty(allowed_mask))
+ return 0;
+
+ if (target_nid == NUMA_NO_NODE || !node_isset(target_nid, allowed_mask))
+ target_nid = node_random(&allowed_mask);
+ mtc.nid = target_nid;
/* Demotion ignores all cpuset and mempolicy settings */
migrate_pages(demote_folios, alloc_demote_folio, NULL,
@@ -1576,7 +1587,7 @@ static unsigned int shrink_folio_list(struct list_head *folio_list,
/* 'folio_list' is always empty here */
/* Migrate folios selected for demotion */
- nr_demoted = demote_folio_list(&demote_folios, pgdat);
+ nr_demoted = demote_folio_list(&demote_folios, pgdat, memcg);
nr_reclaimed += nr_demoted;
stat->nr_demoted += nr_demoted;
/* Folios that could not be demoted are still in @demote_folios */
--
2.52.0.358.g0dd7633a29-goog