[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20251107224956.477056-6-gourry@gourry.net>
Date: Fri, 7 Nov 2025 17:49:50 -0500
From: Gregory Price <gourry@...rry.net>
To: linux-mm@...ck.org
Cc: linux-cxl@...r.kernel.org,
linux-kernel@...r.kernel.org,
nvdimm@...ts.linux.dev,
linux-fsdevel@...r.kernel.org,
cgroups@...r.kernel.org,
dave@...olabs.net,
jonathan.cameron@...wei.com,
dave.jiang@...el.com,
alison.schofield@...el.com,
vishal.l.verma@...el.com,
ira.weiny@...el.com,
dan.j.williams@...el.com,
longman@...hat.com,
akpm@...ux-foundation.org,
david@...hat.com,
lorenzo.stoakes@...cle.com,
Liam.Howlett@...cle.com,
vbabka@...e.cz,
rppt@...nel.org,
surenb@...gle.com,
mhocko@...e.com,
osalvador@...e.de,
ziy@...dia.com,
matthew.brost@...el.com,
joshua.hahnjy@...il.com,
rakie.kim@...com,
byungchul@...com,
gourry@...rry.net,
ying.huang@...ux.alibaba.com,
apopple@...dia.com,
mingo@...hat.com,
peterz@...radead.org,
juri.lelli@...hat.com,
vincent.guittot@...aro.org,
dietmar.eggemann@....com,
rostedt@...dmis.org,
bsegall@...gle.com,
mgorman@...e.de,
vschneid@...hat.com,
tj@...nel.org,
hannes@...xchg.org,
mkoutny@...e.com,
kees@...nel.org,
muchun.song@...ux.dev,
roman.gushchin@...ux.dev,
shakeel.butt@...ux.dev,
rientjes@...gle.com,
jackmanb@...gle.com,
cl@...two.org,
harry.yoo@...cle.com,
axelrasmussen@...gle.com,
yuanchu@...gle.com,
weixugc@...gle.com,
zhengqi.arch@...edance.com,
yosry.ahmed@...ux.dev,
nphamcs@...il.com,
chengming.zhou@...ux.dev,
fabio.m.de.francesco@...ux.intel.com,
rrichter@....com,
ming.li@...omail.com,
usamaarif642@...il.com,
brauner@...nel.org,
oleg@...hat.com,
namcao@...utronix.de,
escape@...ux.alibaba.com,
dongjoo.seo1@...sung.com
Subject: [RFC PATCH 5/9] cpuset: introduce cpuset.mems.default
mems_default is the intersection of effective_mems and
default_sysram_nodes. This allows hotplugged memory nodes to be marked
"protected". A protected node's memory is not default-allocable via
standard methods (basic page faults, mempolicies, etc.).
When checking whether a node is allowed, test for __GFP_PROTECTED to
determine whether the check should be made against mems_allowed (flag
set) or mems_default (flag clear), since mems_default only contains
sysram nodes.
Signed-off-by: Gregory Price <gourry@...rry.net>
---
include/linux/cpuset.h | 8 ++--
kernel/cgroup/cpuset-internal.h | 8 ++++
kernel/cgroup/cpuset-v1.c | 7 +++
kernel/cgroup/cpuset.c | 83 ++++++++++++++++++++++++++-------
mm/memcontrol.c | 2 +-
mm/mempolicy.c | 8 ++--
mm/migrate.c | 4 +-
7 files changed, 93 insertions(+), 27 deletions(-)
diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
index 4db08c580cc3..7f683e4cf6c3 100644
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -77,7 +77,7 @@ extern void cpuset_unlock(void);
extern void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask);
extern bool cpuset_cpus_allowed_fallback(struct task_struct *p);
extern bool cpuset_cpu_is_isolated(int cpu);
-extern nodemask_t cpuset_mems_allowed(struct task_struct *p);
+extern nodemask_t cpuset_mems_default(struct task_struct *p);
#define cpuset_current_mems_default (current->mems_default)
void cpuset_init_current_mems_default(void);
int cpuset_nodemask_valid_mems_default(const nodemask_t *nodemask);
@@ -173,7 +173,7 @@ static inline void set_mems_allowed(nodemask_t nodemask)
task_unlock(current);
}
-extern bool cpuset_node_allowed(struct cgroup *cgroup, int nid);
+extern bool cpuset_node_default(struct cgroup *cgroup, int nid);
#else /* !CONFIG_CPUSETS */
static inline bool cpusets_enabled(void) { return false; }
@@ -211,7 +211,7 @@ static inline bool cpuset_cpu_is_isolated(int cpu)
return false;
}
-static inline nodemask_t cpuset_mems_allowed(struct task_struct *p)
+static inline nodemask_t cpuset_mems_default(struct task_struct *p)
{
return node_possible_map;
}
@@ -294,7 +294,7 @@ static inline bool read_mems_allowed_retry(unsigned int seq)
return false;
}
-static inline bool cpuset_node_allowed(struct cgroup *cgroup, int nid)
+static inline bool cpuset_node_default(struct cgroup *cgroup, int nid)
{
return true;
}
diff --git a/kernel/cgroup/cpuset-internal.h b/kernel/cgroup/cpuset-internal.h
index 337608f408ce..6978e04477b2 100644
--- a/kernel/cgroup/cpuset-internal.h
+++ b/kernel/cgroup/cpuset-internal.h
@@ -55,6 +55,7 @@ typedef enum {
FILE_MEMLIST,
FILE_EFFECTIVE_CPULIST,
FILE_EFFECTIVE_MEMLIST,
+ FILE_MEMS_DEFAULT,
FILE_SUBPARTS_CPULIST,
FILE_EXCLUSIVE_CPULIST,
FILE_EFFECTIVE_XCPULIST,
@@ -104,6 +105,13 @@ struct cpuset {
cpumask_var_t effective_cpus;
nodemask_t effective_mems;
+ /*
+ * Default Memory Nodes for tasks.
+ * This is the intersection of effective_mems and default_sysram_nodes.
+ * Tasks will have their mems_default set to this value.
+ */
+ nodemask_t mems_default;
+
/*
* Exclusive CPUs dedicated to current cgroup (default hierarchy only)
*
diff --git a/kernel/cgroup/cpuset-v1.c b/kernel/cgroup/cpuset-v1.c
index 12e76774c75b..a06f2b032e0d 100644
--- a/kernel/cgroup/cpuset-v1.c
+++ b/kernel/cgroup/cpuset-v1.c
@@ -293,6 +293,7 @@ void cpuset1_hotplug_update_tasks(struct cpuset *cs,
cpumask_copy(cs->effective_cpus, new_cpus);
cs->mems_allowed = *new_mems;
cs->effective_mems = *new_mems;
+ cpuset_update_mems_default(cs);
cpuset_callback_unlock_irq();
/*
@@ -532,6 +533,12 @@ struct cftype cpuset1_files[] = {
.private = FILE_EFFECTIVE_MEMLIST,
},
+ {
+ .name = "mems_default",
+ .seq_show = cpuset_common_seq_show,
+ .private = FILE_MEMS_DEFAULT,
+ },
+
{
.name = "cpu_exclusive",
.read_u64 = cpuset_read_u64,
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index b05c07489a4d..ea5ca1a05cf5 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -29,6 +29,7 @@
#include <linux/mempolicy.h>
#include <linux/mm.h>
#include <linux/memory.h>
+#include <linux/memory-tiers.h>
#include <linux/export.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>
@@ -430,9 +431,9 @@ static void guarantee_active_cpus(struct task_struct *tsk,
*/
static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask)
{
- while (!nodes_intersects(cs->effective_mems, node_states[N_MEMORY]))
+ while (!nodes_intersects(cs->mems_default, node_states[N_MEMORY]))
cs = parent_cs(cs);
- nodes_and(*pmask, cs->effective_mems, node_states[N_MEMORY]);
+ nodes_and(*pmask, cs->mems_default, node_states[N_MEMORY]);
}
/**
@@ -2748,7 +2749,7 @@ void cpuset_update_tasks_nodemask(struct cpuset *cs)
migrate = is_memory_migrate(cs);
- mpol_rebind_mm(mm, &cs->mems_allowed);
+ mpol_rebind_mm(mm, &cs->mems_default);
if (migrate)
cpuset_migrate_mm(mm, &cs->old_mems_allowed, &newmems);
else
@@ -2808,6 +2809,9 @@ static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems)
spin_lock_irq(&callback_lock);
cp->effective_mems = *new_mems;
+ if (!nodes_empty(default_sysram_nodelist))
+ nodes_and(cp->mems_default, cp->effective_mems,
+ default_sysram_nodelist);
spin_unlock_irq(&callback_lock);
WARN_ON(!is_in_v2_mode() &&
@@ -3234,7 +3238,7 @@ static void cpuset_attach(struct cgroup_taskset *tset)
* by skipping the task iteration and update.
*/
if (cpuset_v2() && !cpus_updated && !mems_updated) {
- cpuset_attach_nodemask_to = cs->effective_mems;
+ cpuset_attach_nodemask_to = cs->mems_default;
goto out;
}
@@ -3249,7 +3253,7 @@ static void cpuset_attach(struct cgroup_taskset *tset)
* if there is no change in effective_mems and CS_MEMORY_MIGRATE is
* not set.
*/
- cpuset_attach_nodemask_to = cs->effective_mems;
+ cpuset_attach_nodemask_to = cs->mems_default;
if (!is_memory_migrate(cs) && !mems_updated)
goto out;
@@ -3371,6 +3375,9 @@ int cpuset_common_seq_show(struct seq_file *sf, void *v)
case FILE_EFFECTIVE_MEMLIST:
seq_printf(sf, "%*pbl\n", nodemask_pr_args(&cs->effective_mems));
break;
+ case FILE_MEMS_DEFAULT:
+ seq_printf(sf, "%*pbl\n", nodemask_pr_args(&cs->mems_default));
+ break;
case FILE_EXCLUSIVE_CPULIST:
seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->exclusive_cpus));
break;
@@ -3482,6 +3489,12 @@ static struct cftype dfl_files[] = {
.private = FILE_EFFECTIVE_MEMLIST,
},
+ {
+ .name = "mems.default",
+ .seq_show = cpuset_common_seq_show,
+ .private = FILE_MEMS_DEFAULT,
+ },
+
{
.name = "cpus.partition",
.seq_show = cpuset_partition_show,
@@ -3585,6 +3598,9 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
if (is_in_v2_mode()) {
cpumask_copy(cs->effective_cpus, parent->effective_cpus);
cs->effective_mems = parent->effective_mems;
+ if (!nodes_empty(default_sysram_nodelist))
+ nodes_and(cs->mems_default, cs->effective_mems,
+ default_sysram_nodelist);
}
spin_unlock_irq(&callback_lock);
@@ -3616,6 +3632,9 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
spin_lock_irq(&callback_lock);
cs->mems_allowed = parent->mems_allowed;
cs->effective_mems = parent->mems_allowed;
+ if (!nodes_empty(default_sysram_nodelist))
+ nodes_and(cs->mems_default, cs->effective_mems,
+ default_sysram_nodelist);
cpumask_copy(cs->cpus_allowed, parent->cpus_allowed);
cpumask_copy(cs->effective_cpus, parent->cpus_allowed);
spin_unlock_irq(&callback_lock);
@@ -3818,6 +3837,9 @@ int __init cpuset_init(void)
cpumask_setall(top_cpuset.effective_xcpus);
cpumask_setall(top_cpuset.exclusive_cpus);
nodes_setall(top_cpuset.effective_mems);
+ if (!nodes_empty(default_sysram_nodelist))
+ nodes_and(top_cpuset.mems_default, top_cpuset.effective_mems,
+ default_sysram_nodelist);
fmeter_init(&top_cpuset.fmeter);
INIT_LIST_HEAD(&remote_children);
@@ -3848,6 +3870,9 @@ hotplug_update_tasks(struct cpuset *cs,
spin_lock_irq(&callback_lock);
cpumask_copy(cs->effective_cpus, new_cpus);
cs->effective_mems = *new_mems;
+ if (!nodes_empty(default_sysram_nodelist))
+ nodes_and(cs->mems_default, cs->effective_mems,
+ default_sysram_nodelist);
spin_unlock_irq(&callback_lock);
if (cpus_updated)
@@ -4039,6 +4064,10 @@ static void cpuset_handle_hotplug(void)
if (!on_dfl)
top_cpuset.mems_allowed = new_mems;
top_cpuset.effective_mems = new_mems;
+ if (!nodes_empty(default_sysram_nodelist))
+ nodes_and(top_cpuset.mems_default,
+ top_cpuset.effective_mems,
+ default_sysram_nodelist);
spin_unlock_irq(&callback_lock);
cpuset_update_tasks_nodemask(&top_cpuset);
}
@@ -4109,6 +4138,9 @@ void __init cpuset_init_smp(void)
cpumask_copy(top_cpuset.effective_cpus, cpu_active_mask);
top_cpuset.effective_mems = node_states[N_MEMORY];
+ if (!nodes_empty(default_sysram_nodelist))
+ nodes_and(top_cpuset.mems_default, top_cpuset.effective_mems,
+ default_sysram_nodelist);
hotplug_node_notifier(cpuset_track_online_nodes, CPUSET_CALLBACK_PRI);
@@ -4205,22 +4237,27 @@ bool cpuset_cpus_allowed_fallback(struct task_struct *tsk)
return changed;
}
+/*
+ * At this point in time, no hotplug nodes can have been added, so just set
+ * the mems_default of the init task to the set of N_MEMORY nodes.
+ */
void __init cpuset_init_current_mems_default(void)
{
- nodes_setall(current->mems_default);
+ nodes_clear(current->mems_default);
+ nodes_or(current->mems_default, current->mems_default, node_states[N_MEMORY]);
}
/**
- * cpuset_mems_allowed - return mems_allowed mask from a tasks cpuset.
- * @tsk: pointer to task_struct from which to obtain cpuset->mems_allowed.
+ * cpuset_mems_default - return mems_default mask from a tasks cpuset.
+ * @tsk: pointer to task_struct from which to obtain cpuset->mems_default.
*
- * Description: Returns the nodemask_t mems_allowed of the cpuset
+ * Description: Returns the nodemask_t mems_default of the cpuset
* attached to the specified @tsk. Guaranteed to return some non-empty
* subset of node_states[N_MEMORY], even if this means going outside the
* tasks cpuset.
**/
-nodemask_t cpuset_mems_allowed(struct task_struct *tsk)
+nodemask_t cpuset_mems_default(struct task_struct *tsk)
{
nodemask_t mask;
unsigned long flags;
@@ -4295,17 +4332,29 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
* tsk_is_oom_victim - any node ok
* GFP_KERNEL - any node in enclosing hardwalled cpuset ok
* GFP_USER - only nodes in current tasks mems allowed ok.
+ * GFP_PROTECTED - allow non-sysram nodes in mems_allowed
*/
bool cpuset_current_node_allowed(int node, gfp_t gfp_mask)
{
struct cpuset *cs; /* current cpuset ancestors */
bool allowed; /* is allocation in zone z allowed? */
unsigned long flags;
+ bool protected_node = gfp_mask & __GFP_PROTECTED;
if (in_interrupt())
return true;
- if (node_isset(node, current->mems_default))
- return true;
+
+ if (protected_node) {
+ rcu_read_lock();
+ cs = task_cs(current);
+ allowed = node_isset(node, cs->mems_allowed);
+ rcu_read_unlock();
+ } else if (node_isset(node, current->mems_default))
+ allowed = true;
+
+ if (allowed)
+ return allowed;
+
/*
* Allow tasks that have access to memory reserves because they have
* been OOM killed to get memory anywhere.
@@ -4322,13 +4371,15 @@ bool cpuset_current_node_allowed(int node, gfp_t gfp_mask)
spin_lock_irqsave(&callback_lock, flags);
cs = nearest_hardwall_ancestor(task_cs(current));
- allowed = node_isset(node, cs->mems_allowed);
+ allowed = node_isset(node, cs->mems_allowed); /* include protected */
+ if (!protected_node && !nodes_empty(default_sysram_nodelist))
+ allowed &= node_isset(node, default_sysram_nodelist);
spin_unlock_irqrestore(&callback_lock, flags);
return allowed;
}
-bool cpuset_node_allowed(struct cgroup *cgroup, int nid)
+bool cpuset_node_default(struct cgroup *cgroup, int nid)
{
struct cgroup_subsys_state *css;
struct cpuset *cs;
@@ -4347,7 +4398,7 @@ bool cpuset_node_allowed(struct cgroup *cgroup, int nid)
return true;
/*
- * Normally, accessing effective_mems would require the cpuset_mutex
+ * Normally, accessing mems_default would require the cpuset_mutex
* or callback_lock - but node_isset is atomic and the reference
* taken via cgroup_get_e_css is sufficient to protect css.
*
@@ -4359,7 +4410,7 @@ bool cpuset_node_allowed(struct cgroup *cgroup, int nid)
* cannot make strong isolation guarantees, so this is acceptable.
*/
cs = container_of(css, struct cpuset, css);
- allowed = node_isset(nid, cs->effective_mems);
+ allowed = node_isset(nid, cs->mems_default);
css_put(css);
return allowed;
}
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 4deda33625f4..a25584cb281e 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -5599,5 +5599,5 @@ subsys_initcall(mem_cgroup_swap_init);
bool mem_cgroup_node_allowed(struct mem_cgroup *memcg, int nid)
{
- return memcg ? cpuset_node_allowed(memcg->css.cgroup, nid) : true;
+ return memcg ? cpuset_node_default(memcg->css.cgroup, nid) : true;
}
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 6225d4d23010..5360333dc06d 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1831,14 +1831,14 @@ static int kernel_migrate_pages(pid_t pid, unsigned long maxnode,
}
rcu_read_unlock();
- task_nodes = cpuset_mems_allowed(task);
+ task_nodes = cpuset_mems_default(task);
/* Is the user allowed to access the target nodes? */
if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) {
err = -EPERM;
goto out_put;
}
- task_nodes = cpuset_mems_allowed(current);
+ task_nodes = cpuset_mems_default(current);
nodes_and(*new, *new, task_nodes);
if (nodes_empty(*new))
goto out_put;
@@ -2738,7 +2738,7 @@ int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst)
/*
* If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
* rebinds the mempolicy its copying by calling mpol_rebind_policy()
- * with the mems_default returned by cpuset_mems_allowed(). This
+ * with the mems_default returned by cpuset_mems_default(). This
* keeps mempolicies cpuset relative after its cpuset moves. See
* further kernel/cpuset.c update_nodemask().
*
@@ -2763,7 +2763,7 @@ struct mempolicy *__mpol_dup(struct mempolicy *old)
*new = *old;
if (current_cpuset_is_being_rebound()) {
- nodemask_t mems = cpuset_mems_allowed(current);
+ nodemask_t mems = cpuset_mems_default(current);
mpol_rebind_policy(new, &mems);
}
atomic_set(&new->refcnt, 1);
diff --git a/mm/migrate.c b/mm/migrate.c
index c0e9f15be2a2..f9a910b43a9f 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -2526,7 +2526,7 @@ static struct mm_struct *find_mm_struct(pid_t pid, nodemask_t *mem_nodes)
*/
if (!pid) {
mmget(current->mm);
- *mem_nodes = cpuset_mems_allowed(current);
+ *mem_nodes = cpuset_mems_default(current);
return current->mm;
}
@@ -2547,7 +2547,7 @@ static struct mm_struct *find_mm_struct(pid_t pid, nodemask_t *mem_nodes)
mm = ERR_PTR(security_task_movememory(task));
if (IS_ERR(mm))
goto out;
- *mem_nodes = cpuset_mems_allowed(task);
+ *mem_nodes = cpuset_mems_default(task);
mm = get_task_mm(task);
out:
put_task_struct(task);
--
2.51.1
Powered by blists - more mailing lists