Message-ID: <20251015172835.497158969@linutronix.de>
Date: Wed, 15 Oct 2025 19:30:00 +0200 (CEST)
From: Thomas Gleixner <tglx@...utronix.de>
To: LKML <linux-kernel@...r.kernel.org>
Cc: Peter Zijlstra <peterz@...radead.org>,
Gabriele Monaco <gmonaco@...hat.com>,
Mathieu Desnoyers <mathieu.desnoyers@...icios.com>,
Michael Jeanson <mjeanson@...icios.com>,
Jens Axboe <axboe@...nel.dk>,
"Paul E. McKenney" <paulmck@...nel.org>,
"Gautham R. Shenoy" <gautham.shenoy@....com>,
Florian Weimer <fweimer@...hat.com>,
Tim Chen <tim.c.chen@...el.com>,
TCMalloc Team <tcmalloc-eng@...gle.com>
Subject: [patch 19/19] sched/mmcid: Switch over to the new mechanism
Now that all pieces are in place, change the implementations of
sched_mm_cid_fork() and sched_mm_cid_exit() to adhere to the new strict
ownership scheme and switch context_switch() over to use the new
mm_cid_schedin() functionality.
In the common case no mode change is required, so fork() and exit() just
update the user count and the constraints.
In case a new user would exceed the CID space limit, the fork() context
handles the transition to per CPU mode with mm::mm_cid::mutex held. exit()
handles the transition back to per task mode when the user count drops
below the switch back threshold. fork() might also be forced to handle a
deferred switch back to per task mode when an affinity change increased the
number of allowed CPUs sufficiently.
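
For illustration only, a minimal user space model of the mode switch
policy described above. This is a sketch: the struct, the helper names
and the switch back threshold of half the CID space limit are assumptions
made for the example; the actual transitions are performed by the fixup
helpers in the diff below with mm::mm_cid::mutex held.

#include <stdbool.h>
#include <stdio.h>

struct mm_cid_model {
        unsigned int users;      /* tasks sharing the MM */
        unsigned int cid_limit;  /* CID space limit */
        bool percpu;             /* current ownership mode */
};

/* fork(): add a user, switch to per CPU mode when the limit is exceeded */
static void model_fork(struct mm_cid_model *m)
{
        m->users++;
        if (!m->percpu && m->users > m->cid_limit)
                m->percpu = true;        /* fixup tasks -> CPUs */
}

/* exit(): drop a user, switch back below an assumed hysteresis threshold */
static void model_exit(struct mm_cid_model *m)
{
        m->users--;
        if (m->percpu && m->users < m->cid_limit / 2)
                m->percpu = false;       /* fixup CPUs -> tasks */
}

int main(void)
{
        struct mm_cid_model m = { .users = 1, .cid_limit = 4 };

        for (int i = 0; i < 6; i++)
                model_fork(&m);
        printf("users=%u percpu=%d\n", m.users, m.percpu);

        for (int i = 0; i < 6; i++)
                model_exit(&m);
        printf("users=%u percpu=%d\n", m.users, m.percpu);
        return 0;
}
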
Signed-off-by: Thomas Gleixner <tglx@...utronix.de>
---
include/linux/rseq.h | 19 -------
include/linux/rseq_types.h | 8 +--
kernel/fork.c | 1 -
kernel/sched/core.c | 109 ++++++++++++++++++++++++++++++++++++++-------
kernel/sched/sched.h | 78 --------------------------------
5 files changed, 99 insertions(+), 116 deletions(-)
--- a/include/linux/rseq.h
+++ b/include/linux/rseq.h
@@ -82,24 +82,6 @@ static __always_inline void rseq_sched_s
t->rseq.event.ids_changed = true;
}
-/*
- * Invoked from switch_mm_cid() in context switch when the task gets a MM
- * CID assigned.
- *
- * This does not raise TIF_NOTIFY_RESUME as that happens in
- * rseq_sched_switch_event().
- */
-static __always_inline void rseq_sched_set_task_mm_cid(struct task_struct *t, unsigned int cid)
-{
- /*
- * Requires a comparison as the switch_mm_cid() code does not
- * provide a conditional for it readily. So avoid excessive updates
- * when nothing changes.
- */
- if (t->rseq.ids.mm_cid != cid)
- t->rseq.event.ids_changed = true;
-}
-
/* Enforce a full update after RSEQ registration and when execve() failed */
static inline void rseq_force_update(void)
{
@@ -177,7 +159,6 @@ static inline void rseq_handle_slowpath(
static inline void rseq_signal_deliver(struct ksignal *ksig, struct pt_regs *regs) { }
static inline void rseq_sched_switch_event(struct task_struct *t) { }
static inline void rseq_sched_set_ids_changed(struct task_struct *t) { }
-static inline void rseq_sched_set_task_mm_cid(struct task_struct *t, unsigned int cid) { }
static inline void rseq_force_update(void) { }
static inline void rseq_virt_userspace_exit(void) { }
static inline void rseq_fork(struct task_struct *t, unsigned long clone_flags) { }
--- a/include/linux/rseq_types.h
+++ b/include/linux/rseq_types.h
@@ -100,18 +100,18 @@ struct rseq_data { };
/**
* struct sched_mm_cid - Storage for per task MM CID data
* @active: MM CID is active for the task
- * @cid: The CID associated to the task
- * @last_cid: The last CID associated to the task
+ * @cid: The CID associated with the task, either permanently or
+ * borrowed from the CPU
*/
struct sched_mm_cid {
unsigned int active;
unsigned int cid;
- unsigned int last_cid;
};
/**
* struct mm_cid_pcpu - Storage for per CPU MM_CID data
- * @cid: The CID associated to the CPU
+ * @cid: The CID associated with the CPU, either permanently or
+ * while a task with a CID is running
*/
struct mm_cid_pcpu {
unsigned int cid;
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -956,7 +956,6 @@ static struct task_struct *dup_task_stru
#ifdef CONFIG_SCHED_MM_CID
tsk->mm_cid.cid = MM_CID_UNSET;
- tsk->mm_cid.last_cid = MM_CID_UNSET;
tsk->mm_cid.active = 0;
#endif
return tsk;
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5339,7 +5339,7 @@ context_switch(struct rq *rq, struct tas
}
}
- switch_mm_cid(prev, next);
+ mm_cid_schedin(next);
/*
* Tell rseq that the task was scheduled in. Must be after
@@ -10632,7 +10632,7 @@ static bool mm_cid_fixup_task_to_cpu(str
return true;
}
-static void __maybe_unused mm_cid_fixup_tasks_to_cpus(void)
+static void mm_cid_fixup_tasks_to_cpus(void)
{
struct mm_struct *mm = current->mm;
struct task_struct *p, *t;
@@ -10682,14 +10682,42 @@ static bool sched_mm_cid_add_user(struct
void sched_mm_cid_fork(struct task_struct *t)
{
struct mm_struct *mm = t->mm;
+ bool percpu;
WARN_ON_ONCE(!mm || t->mm_cid.cid != MM_CID_UNSET);
guard(mutex)(&mm->mm_cid.mutex);
- scoped_guard(raw_spinlock, &mm->mm_cid.lock) {
- sched_mm_cid_add_user(t, mm);
- /* Preset last_cid for mm_cid_select() */
- t->mm_cid.last_cid = mm->mm_cid.max_cids - 1;
+ scoped_guard(raw_spinlock_irq, &mm->mm_cid.lock) {
+ struct mm_cid_pcpu *pcp = this_cpu_ptr(mm->mm_cid.pcpu);
+
+ /* First user? */
+ if (!mm->mm_cid.users) {
+ sched_mm_cid_add_user(t, mm);
+ t->mm_cid.cid = mm_get_cid(mm);
+ /* Required for execve() */
+ pcp->cid = t->mm_cid.cid;
+ return;
+ }
+
+ if (!sched_mm_cid_add_user(t, mm)) {
+ if (!mm->mm_cid.percpu)
+ t->mm_cid.cid = mm_get_cid(mm);
+ return;
+ }
+
+ /* Handle the mode change and transfer current's CID */
+ percpu = !!mm->mm_cid.percpu;
+ if (!percpu)
+ mm_cid_transfer_to_task(current, pcp);
+ else
+ mm_cid_transfer_to_cpu(current, pcp);
+ }
+
+ if (percpu) {
+ mm_cid_fixup_tasks_to_cpus();
+ } else {
+ mm_cid_fixup_cpus_to_tasks(mm);
+ t->mm_cid.cid = mm_get_cid(mm);
}
}
@@ -10701,6 +10729,30 @@ static bool sched_mm_cid_remove_user(str
return mm_update_max_cids(t->mm);
}
+static bool __sched_mm_cid_exit(struct task_struct *t)
+{
+ struct mm_struct *mm = t->mm;
+
+ if (!sched_mm_cid_remove_user(t))
+ return false;
+ /*
+ * Contrary to fork() this only deals with a switch back to per task
+ * mode, either because the removal above decreased the number of users
+ * or because an affinity change increased the number of allowed CPUs
+ * and the deferred fixup has not run yet.
+ */
+ if (WARN_ON_ONCE(mm->mm_cid.percpu))
+ return false;
+ /*
+ * A failed fork(2) cleanup never gets here, so @current must have
+ * the same MM as @t. That's true for exit() and the failed
+ * pthread_create() cleanup case.
+ */
+ if (WARN_ON_ONCE(current->mm != mm))
+ return false;
+ return true;
+}
+
/*
* When a task exits, the MM CID held by the task is no longer required as
* the task cannot return to user space.
@@ -10711,10 +10763,43 @@ void sched_mm_cid_exit(struct task_struc
if (!mm || !t->mm_cid.active)
return;
+ /*
+ * Ensure that only one instance is doing MM CID operations within
+ * a MM. The common case is uncontended. The rare fixup case adds
+ * some overhead.
+ */
+ scoped_guard(mutex, &mm->mm_cid.mutex) {
+ /* mm_cid::mutex is sufficient to protect mm_cid::users */
+ if (likely(mm->mm_cid.users > 1)) {
+ scoped_guard(raw_spinlock_irq, &mm->mm_cid.lock) {
+ if (!__sched_mm_cid_exit(t))
+ return;
+ /* Mode change required. Transfer current's CID */
+ mm_cid_transfer_to_task(current, this_cpu_ptr(mm->mm_cid.pcpu));
+ }
+ mm_cid_fixup_cpus_to_tasks(mm);
+ return;
+ }
+ /* Last user */
+ scoped_guard(raw_spinlock_irq, &mm->mm_cid.lock) {
+ /* Required across execve() */
+ if (t == current)
+ mm_cid_transfer_to_task(t, this_cpu_ptr(mm->mm_cid.pcpu));
+ /* Ignore mode change. There is nothing to do. */
+ sched_mm_cid_remove_user(t);
+ }
+ }
- guard(mutex)(&mm->mm_cid.mutex);
- scoped_guard(raw_spinlock, &mm->mm_cid.lock)
- sched_mm_cid_remove_user(t);
+ /*
+ * As this is the last user (execve(), process exit or failed
+ * fork(2)) there is no concurrency anymore.
+ *
+ * Synchronize any pending work to ensure that there are no
+ * dangling references left. mm->mm_cid.users is zero so nothing
+ * can queue this work anymore.
+ */
+ irq_work_sync(&mm->mm_cid.irq_work);
+ cancel_work_sync(&mm->mm_cid.work);
}
/* Deactivate MM CID allocation across execve() */
@@ -10727,18 +10812,12 @@ void sched_mm_cid_before_execve(struct t
void sched_mm_cid_after_execve(struct task_struct *t)
{
sched_mm_cid_fork(t);
- guard(preempt)();
- mm_cid_select(t);
}
static void mm_cid_work_fn(struct work_struct *work)
{
struct mm_struct *mm = container_of(work, struct mm_struct, mm_cid.work);
- /* Make it compile, but not functional yet */
- if (!IS_ENABLED(CONFIG_NEW_MM_CID))
- return;
-
guard(mutex)(&mm->mm_cid.mutex);
/* Did the last user task exit already? */
if (!mm->mm_cid.users)
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -3704,84 +3704,8 @@ static __always_inline void mm_cid_sched
else
mm_cid_from_cpu(next, cpu_cid);
}
-
-/* Active implementation */
-static inline void init_sched_mm_cid(struct task_struct *t)
-{
- struct mm_struct *mm = t->mm;
- unsigned int max_cid;
-
- if (!mm)
- return;
-
- /* Preset last_mm_cid */
- max_cid = min_t(int, READ_ONCE(mm->mm_cid.nr_cpus_allowed), atomic_read(&mm->mm_users));
- t->mm_cid.last_cid = max_cid - 1;
-}
-
-static inline bool __mm_cid_get(struct task_struct *t, unsigned int cid, unsigned int max_cids)
-{
- struct mm_struct *mm = t->mm;
-
- if (cid >= max_cids)
- return false;
- if (test_and_set_bit(cid, mm_cidmask(mm)))
- return false;
- t->mm_cid.cid = t->mm_cid.last_cid = cid;
- __this_cpu_write(mm->mm_cid.pcpu->cid, cid);
- return true;
-}
-
-static inline bool mm_cid_get(struct task_struct *t)
-{
- struct mm_struct *mm = t->mm;
- unsigned int max_cids;
-
- max_cids = READ_ONCE(mm->mm_cid.max_cids);
-
- /* Try to reuse the last CID of this task */
- if (__mm_cid_get(t, t->mm_cid.last_cid, max_cids))
- return true;
-
- /* Try to reuse the last CID of this mm on this CPU */
- if (__mm_cid_get(t, __this_cpu_read(mm->mm_cid.pcpu->cid), max_cids))
- return true;
-
- /* Try the first zero bit in the cidmask. */
- return __mm_cid_get(t, find_first_zero_bit(mm_cidmask(mm), nr_cpu_ids), max_cids);
-}
-
-static inline void mm_cid_select(struct task_struct *t)
-{
- /*
- * mm_cid_get() can fail when the maximum CID, which is determined
- * by min(mm->nr_cpus_allowed, mm->mm_users) changes concurrently.
- * That's a transient failure as there cannot be more tasks
- * concurrently on a CPU (or about to be scheduled in) than that.
- */
- for (;;) {
- if (mm_cid_get(t))
- break;
- }
-}
-
-static inline void switch_mm_cid(struct task_struct *prev, struct task_struct *next)
-{
- if (prev->mm_cid.active) {
- if (prev->mm_cid.cid != MM_CID_UNSET)
- clear_bit(prev->mm_cid.cid, mm_cidmask(prev->mm));
- prev->mm_cid.cid = MM_CID_UNSET;
- }
-
- if (next->mm_cid.active) {
- mm_cid_select(next);
- rseq_sched_set_task_mm_cid(next, next->mm_cid.cid);
- }
-}
-
#else /* !CONFIG_SCHED_MM_CID: */
-static inline void mm_cid_select(struct task_struct *t) { }
-static inline void switch_mm_cid(struct task_struct *prev, struct task_struct *next) { }
+static inline void mm_cid_schedin(struct task_struct *next) { }
#endif /* !CONFIG_SCHED_MM_CID */
extern u64 avg_vruntime(struct cfs_rq *cfs_rq);