lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20251221233635.3761887-2-bingjiao@google.com>
Date: Sun, 21 Dec 2025 23:36:34 +0000
From: Bing Jiao <bingjiao@...gle.com>
To: linux-mm@...ck.org
Cc: linux-kernel@...r.kernel.org, stable@...r.kernel.org, 
	akpm@...ux-foundation.org, gourry@...rry.net, longman@...hat.com, 
	hannes@...xchg.org, mhocko@...nel.org, roman.gushchin@...ux.dev, 
	shakeel.butt@...ux.dev, muchun.song@...ux.dev, tj@...nel.org, 
	mkoutny@...e.com, david@...nel.org, zhengqi.arch@...edance.com, 
	lorenzo.stoakes@...cle.com, axelrasmussen@...gle.com, yuanchu@...gle.com, 
	weixugc@...gle.com, cgroups@...r.kernel.org, Bing Jiao <bingjiao@...gle.com>
Subject: [PATCH v2 1/2] mm/vmscan: respect mems_effective in demote_folio_list()

Commit 7d709f49babc ("vmscan,cgroup: apply mems_effective to reclaim")
introduces the cpuset.mems_effective check and applies it to
can_demote(). However, it does not apply this check in
demote_folio_list().

This omission leads to situations where pages are demoted to nodes
that are explicitly excluded from the task's cpuset.mems.
The impact is two-fold:

  1. Resource Isolation: This bug breaks resource isolation provided
     by cpuset.mems. It allows pages to be demoted to nodes that are
     dedicated to other tasks or are intended for hot-unplugging.

  2. Performance Issue: In multi-tier systems, users use cpuset.mems
     to bind tasks to memory tiers with different performance
     characteristics (e.g., avoiding the slowest tiers for
     latency-sensitive data). This bug can cause unexpected latency
     spikes if pages are demoted to the farthest nodes.

To address the bug, implement a new function
mem_cgroup_filter_mems_allowed() to filter out nodes that are not
set in mems_effective, and update demote_folio_list() to utilize
this filtering logic. This ensures that demotion targets respect
the task's memory placement constraints.

Fixes: 7d709f49babc ("vmscan,cgroup: apply mems_effective to reclaim")
Signed-off-by: Bing Jiao <bingjiao@...gle.com>
---
 include/linux/cpuset.h     |  6 ++++++
 include/linux/memcontrol.h |  7 +++++++
 kernel/cgroup/cpuset.c     | 18 ++++++++++++++++++
 mm/memcontrol.c            |  6 ++++++
 mm/vmscan.c                | 13 ++++++++++---
 5 files changed, 47 insertions(+), 3 deletions(-)

diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
index a98d3330385c..0e94548e2d24 100644
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -175,6 +175,7 @@ static inline void set_mems_allowed(nodemask_t nodemask)
 }
 
 extern bool cpuset_node_allowed(struct cgroup *cgroup, int nid);
+extern void cpuset_node_filter_allowed(struct cgroup *cgroup, nodemask_t *mask);
 #else /* !CONFIG_CPUSETS */
 
 static inline bool cpusets_enabled(void) { return false; }
@@ -305,6 +306,11 @@ static inline bool cpuset_node_allowed(struct cgroup *cgroup, int nid)
 {
 	return true;
 }
+
+static inline void cpuset_node_filter_allowed(struct cgroup *cgroup,
+					      nodemask_t *mask)
+{
+}
 #endif /* !CONFIG_CPUSETS */
 
 #endif /* _LINUX_CPUSET_H */
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index fd400082313a..7cfd71c57caa 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -1742,6 +1742,8 @@ static inline void count_objcg_events(struct obj_cgroup *objcg,
 
 bool mem_cgroup_node_allowed(struct mem_cgroup *memcg, int nid);
 
+void mem_cgroup_filter_mems_allowed(struct mem_cgroup *memcg, nodemask_t *mask);
+
 void mem_cgroup_show_protected_memory(struct mem_cgroup *memcg);
 
 static inline bool memcg_is_dying(struct mem_cgroup *memcg)
@@ -1816,6 +1818,11 @@ static inline bool mem_cgroup_node_allowed(struct mem_cgroup *memcg, int nid)
 	return true;
 }
 
+static inline void mem_cgroup_filter_mems_allowed(struct mem_cgroup *memcg,
+						  nodemask_t *mask)
+{
+}
+
 static inline void mem_cgroup_show_protected_memory(struct mem_cgroup *memcg)
 {
 }
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 6e6eb09b8db6..2925bd6bca91 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -4452,6 +4452,24 @@ bool cpuset_node_allowed(struct cgroup *cgroup, int nid)
 	return allowed;
 }
 
+void cpuset_node_filter_allowed(struct cgroup *cgroup, nodemask_t *mask)
+{
+	struct cgroup_subsys_state *css;
+	struct cpuset *cs;
+
+	if (!cpuset_v2())
+		return;
+
+	css = cgroup_get_e_css(cgroup, &cpuset_cgrp_subsys);
+	if (!css)
+		return;
+
+	/* Follows the same assumption in cpuset_node_allowed() */
+	cs = container_of(css, struct cpuset, css);
+	nodes_and(*mask, *mask, cs->effective_mems);
+	css_put(css);
+}
+
 /**
  * cpuset_spread_node() - On which node to begin search for a page
  * @rotor: round robin rotor
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 75fc22a33b28..f414653867de 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -5602,6 +5602,12 @@ bool mem_cgroup_node_allowed(struct mem_cgroup *memcg, int nid)
 	return memcg ? cpuset_node_allowed(memcg->css.cgroup, nid) : true;
 }
 
+void mem_cgroup_filter_mems_allowed(struct mem_cgroup *memcg, nodemask_t *mask)
+{
+	if (memcg)
+		cpuset_node_filter_allowed(memcg->css.cgroup, mask);
+}
+
 void mem_cgroup_show_protected_memory(struct mem_cgroup *memcg)
 {
 	if (mem_cgroup_disabled() || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 453d654727c1..4d23c491e914 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1018,7 +1018,8 @@ static struct folio *alloc_demote_folio(struct folio *src,
  * Folios which are not demoted are left on @demote_folios.
  */
 static unsigned int demote_folio_list(struct list_head *demote_folios,
-				     struct pglist_data *pgdat)
+				      struct pglist_data *pgdat,
+				      struct mem_cgroup *memcg)
 {
 	int target_nid = next_demotion_node(pgdat->node_id);
 	unsigned int nr_succeeded;
@@ -1032,7 +1033,6 @@ static unsigned int demote_folio_list(struct list_head *demote_folios,
 		 */
 		.gfp_mask = (GFP_HIGHUSER_MOVABLE & ~__GFP_RECLAIM) |
 			__GFP_NOMEMALLOC | GFP_NOWAIT,
-		.nid = target_nid,
 		.nmask = &allowed_mask,
 		.reason = MR_DEMOTION,
 	};
@@ -1044,6 +1044,13 @@ static unsigned int demote_folio_list(struct list_head *demote_folios,
 		return 0;
 
 	node_get_allowed_targets(pgdat, &allowed_mask);
+	/* Restrict allowed_mask to nodes in cpuset.mems.effective */
+	mem_cgroup_filter_mems_allowed(memcg, &allowed_mask);
+	if (nodes_empty(allowed_mask))
+		return 0;
+	if (!node_isset(target_nid, allowed_mask))
+		target_nid = node_random(&allowed_mask);
+	mtc.nid = target_nid;
 
 	/* Demotion ignores all cpuset and mempolicy settings */
 	migrate_pages(demote_folios, alloc_demote_folio, NULL,
@@ -1565,7 +1572,7 @@ static unsigned int shrink_folio_list(struct list_head *folio_list,
 	/* 'folio_list' is always empty here */
 
 	/* Migrate folios selected for demotion */
-	nr_demoted = demote_folio_list(&demote_folios, pgdat);
+	nr_demoted = demote_folio_list(&demote_folios, pgdat, memcg);
 	nr_reclaimed += nr_demoted;
 	stat->nr_demoted += nr_demoted;
 	/* Folios that could not be demoted are still in @demote_folios */
-- 
2.52.0.351.gbe84eed79e-goog


Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ