Message-ID: <20251015172835.497158969@linutronix.de>
Date: Wed, 15 Oct 2025 19:30:00 +0200 (CEST)
From: Thomas Gleixner <tglx@...utronix.de>
To: LKML <linux-kernel@...r.kernel.org>
Cc: Peter Zijlstra <peterz@...radead.org>,
Gabriele Monaco <gmonaco@...hat.com>,
Mathieu Desnoyers <mathieu.desnoyers@...icios.com>,
Michael Jeanson <mjeanson@...icios.com>,
Jens Axboe <axboe@...nel.dk>,
"Paul E. McKenney" <paulmck@...nel.org>,
"Gautham R. Shenoy" <gautham.shenoy@....com>,
Florian Weimer <fweimer@...hat.com>,
Tim Chen <tim.c.chen@...el.com>,
TCMalloc Team <tcmalloc-eng@...gle.com>
Subject: [patch 19/19] sched/mmcid: Switch over to the new mechanism
Now that all pieces are in place, change the implementations of
sched_mm_cid_fork() and sched_mm_cid_exit() to adhere to the new strict
ownership scheme and switch context_switch() over to use the new
mm_cid_schedin() functionality.
In the common case no mode change is required, so fork() and exit() just
update the user count and the constraints.
In case a new user would exceed the CID space limit, the fork() context
handles the transition to per CPU mode with mm::mm_cid::mutex held. exit()
handles the transition back to per task mode when the user count drops
below the switch back threshold. fork() might also be forced to handle a
deferred switch back to per task mode when an affinity change increased the
number of allowed CPUs sufficiently.
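
For illustration only, a minimal user space model of the mode switch
policy described above. This is a sketch: the struct, the helper names
and the switch back threshold of half the CID space limit are assumptions
made for the example; the actual transitions are performed by the fixup
helpers in the diff below with mm::mm_cid::mutex held.

#include <stdbool.h>
#include <stdio.h>

struct mm_cid_model {
        unsigned int users;      /* tasks sharing the MM */
        unsigned int cid_limit;  /* CID space limit */
        bool percpu;             /* current ownership mode */
};

/* fork(): add a user, switch to per CPU mode when the limit is exceeded */
static void model_fork(struct mm_cid_model *m)
{
        m->users++;
        if (!m->percpu && m->users > m->cid_limit)
                m->percpu = true;        /* fixup tasks -> CPUs */
}

/* exit(): drop a user, switch back below an assumed hysteresis threshold */
static void model_exit(struct mm_cid_model *m)
{
        m->users--;
        if (m->percpu && m->users < m->cid_limit / 2)
                m->percpu = false;       /* fixup CPUs -> tasks */
}

int main(void)
{
        struct mm_cid_model m = { .users = 1, .cid_limit = 4 };

        for (int i = 0; i < 6; i++)
                model_fork(&m);
        printf("users=%u percpu=%d\n", m.users, m.percpu);

        for (int i = 0; i < 6; i++)
                model_exit(&m);
        printf("users=%u percpu=%d\n", m.users, m.percpu);
        return 0;
}
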
Signed-off-by: Thomas Gleixner <tglx@...utronix.de>
---
include/linux/rseq.h | 19 -------
include/linux/rseq_types.h | 8 +--
kernel/fork.c | 1 -
kernel/sched/core.c | 109 ++++++++++++++++++++++++++++++++++++++-------
kernel/sched/sched.h | 78 --------------------------------
5 files changed, 99 insertions(+), 116 deletions(-)
--- a/include/linux/rseq.h
+++ b/include/linux/rseq.h
@@ -82,24 +82,6 @@ static __always_inline void rseq_sched_s
t->rseq.event.ids_changed = true;
}
-/*
- * Invoked from switch_mm_cid() in context switch when the task gets a MM
- * CID assigned.
- *
- * This does not raise TIF_NOTIFY_RESUME as that happens in
- * rseq_sched_switch_event().
- */
-static __always_inline void rseq_sched_set_task_mm_cid(struct task_struct *t, unsigned int cid)
-{
- /*
- * Requires a comparison as the switch_mm_cid() code does not
- * provide a conditional for it readily. So avoid excessive updates
- * when nothing changes.
- */
- if (t->rseq.ids.mm_cid != cid)
- t->rseq.event.ids_changed = true;
-}
-
/* Enforce a full update after RSEQ registration and when execve() failed */
static inline void rseq_force_update(void)
{
@@ -177,7 +159,6 @@ static inline void rseq_handle_slowpath(
static inline void rseq_signal_deliver(struct ksignal *ksig, struct pt_regs *regs) { }
static inline void rseq_sched_switch_event(struct task_struct *t) { }
static inline void rseq_sched_set_ids_changed(struct task_struct *t) { }
-static inline void rseq_sched_set_task_mm_cid(struct task_struct *t, unsigned int cid) { }
static inline void rseq_force_update(void) { }
static inline void rseq_virt_userspace_exit(void) { }
static inline void rseq_fork(struct task_struct *t, unsigned long clone_flags) { }
--- a/include/linux/rseq_types.h
+++ b/include/linux/rseq_types.h
@@ -100,18 +100,18 @@ struct rseq_data { };
/**
* struct sched_mm_cid - Storage for per task MM CID data
* @active: MM CID is active for the task
- * @cid: The CID associated to the task
- * @last_cid: The last CID associated to the task
+ * @cid: The CID associated with the task, either permanently or
+ * borrowed from the CPU
*/
struct sched_mm_cid {
unsigned int active;
unsigned int cid;
- unsigned int last_cid;
};
/**
* struct mm_cid_pcpu - Storage for per CPU MM_CID data
- * @cid: The CID associated to the CPU
+ * @cid: The CID associated with the CPU, either permanently or
+ * while a task with a CID is running
*/
struct mm_cid_pcpu {
unsigned int cid;
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -956,7 +956,6 @@ static struct task_struct *dup_task_stru
#ifdef CONFIG_SCHED_MM_CID
tsk->mm_cid.cid = MM_CID_UNSET;
- tsk->mm_cid.last_cid = MM_CID_UNSET;
tsk->mm_cid.active = 0;
#endif
return tsk;
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5339,7 +5339,7 @@ context_switch(struct rq *rq, struct tas
}
}
- switch_mm_cid(prev, next);
+ mm_cid_schedin(next);
/*
* Tell rseq that the task was scheduled in. Must be after
@@ -10632,7 +10632,7 @@ static bool mm_cid_fixup_task_to_cpu(str
return true;
}
-static void __maybe_unused mm_cid_fixup_tasks_to_cpus(void)
+static void mm_cid_fixup_tasks_to_cpus(void)
{
struct mm_struct *mm = current->mm;
struct task_struct *p, *t;
@@ -10682,14 +10682,42 @@ static bool sched_mm_cid_add_user(struct
void sched_mm_cid_fork(struct task_struct *t)
{
struct mm_struct *mm = t->mm;
+ bool percpu;
WARN_ON_ONCE(!mm || t->mm_cid.cid != MM_CID_UNSET);
guard(mutex)(&mm->mm_cid.mutex);
- scoped_guard(raw_spinlock, &mm->mm_cid.lock) {
- sched_mm_cid_add_user(t, mm);
- /* Preset last_cid for mm_cid_select() */
- t->mm_cid.last_cid = mm->mm_cid.max_cids - 1;
+ scoped_guard(raw_spinlock_irq, &mm->mm_cid.lock) {
+ struct mm_cid_pcpu *pcp = this_cpu_ptr(mm->mm_cid.pcpu);
+
+ /* First user? */
+ if (!mm->mm_cid.users) {
+ sched_mm_cid_add_user(t, mm);
+ t->mm_cid.cid = mm_get_cid(mm);
+ /* Required for execve() */
+ pcp->cid = t->mm_cid.cid;
+ return;
+ }
+
+ if (!sched_mm_cid_add_user(t, mm)) {
+ if (!mm->mm_cid.percpu)
+ t->mm_cid.cid = mm_get_cid(mm);
+ return;
+ }
+
+ /* Handle the mode change and transfer current's CID */
+ percpu = !!mm->mm_cid.percpu;
+ if (!percpu)
+ mm_cid_transfer_to_task(current, pcp);
+ else
+ mm_cid_transfer_to_cpu(current, pcp);
+ }
+
+ if (percpu) {
+ mm_cid_fixup_tasks_to_cpus();
+ } else {
+ mm_cid_fixup_cpus_to_tasks(mm);
+ t->mm_cid.cid = mm_get_cid(mm);
}
}
@@ -10701,6 +10729,30 @@ static bool sched_mm_cid_remove_user(str
return mm_update_max_cids(t->mm);
}
+static bool __sched_mm_cid_exit(struct task_struct *t)
+{
+ struct mm_struct *mm = t->mm;
+
+ if (!sched_mm_cid_remove_user(t))
+ return false;
+ /*
+ * Contrary to fork() this only deals with a switch back to per task
+ * mode, either because the removal above decreased the number of users
+ * or because an affinity change increased the number of allowed CPUs
+ * and the deferred fixup has not run yet.
+ */
+ if (WARN_ON_ONCE(mm->mm_cid.percpu))
+ return false;
+ /*
+ * A failed fork(2) cleanup never gets here, so @current must have
+ * the same MM as @t. That's true for exit() and the failed
+ * pthread_create() cleanup case.
+ */
+ if (WARN_ON_ONCE(current->mm != mm))
+ return false;
+ return true;
+}
+
/*
* When a task exits, the MM CID held by the task is no longer required as
* the task cannot return to user space.
@@ -10711,10 +10763,43 @@ void sched_mm_cid_exit(struct task_struc
if (!mm || !t->mm_cid.active)
return;
+ /*
+ * Ensure that only one instance is doing MM CID operations within
+ * a MM. The common case is uncontended. The rare fixup case adds
+ * some overhead.
+ */
+ scoped_guard(mutex, &mm->mm_cid.mutex) {
+ /* mm_cid::mutex is sufficient to protect mm_cid::users */
+ if (likely(mm->mm_cid.users > 1)) {
+ scoped_guard(raw_spinlock_irq, &mm->mm_cid.lock) {
+ if (!__sched_mm_cid_exit(t))
+ return;
+ /* Mode change required. Transfer current's CID */
+ mm_cid_transfer_to_task(current, this_cpu_ptr(mm->mm_cid.pcpu));
+ }
+ mm_cid_fixup_cpus_to_tasks(mm);
+ return;
+ }
+ /* Last user */
+ scoped_guard(raw_spinlock_irq, &mm->mm_cid.lock) {
+ /* Required across execve() */
+ if (t == current)
+ mm_cid_transfer_to_task(t, this_cpu_ptr(mm->mm_cid.pcpu));
+ /* Ignore mode change. There is nothing to do. */
+ sched_mm_cid_remove_user(t);
+ }
+ }
- guard(mutex)(&mm->mm_cid.mutex);
- scoped_guard(raw_spinlock, &mm->mm_cid.lock)
- sched_mm_cid_remove_user(t);
+ /*
+ * As this is the last user (execve(), process exit or failed
+ * fork(2)) there is no concurrency anymore.
+ *
+ * Synchronize any pending work to ensure that there are no
+ * dangling references left. mm->mm_cid.users is zero so nothing
+ * can queue this work anymore.
+ */
+ irq_work_sync(&mm->mm_cid.irq_work);
+ cancel_work_sync(&mm->mm_cid.work);
}
/* Deactivate MM CID allocation across execve() */
@@ -10727,18 +10812,12 @@ void sched_mm_cid_before_execve(struct t
void sched_mm_cid_after_execve(struct task_struct *t)
{
sched_mm_cid_fork(t);
- guard(preempt)();
- mm_cid_select(t);
}
static void mm_cid_work_fn(struct work_struct *work)
{
struct mm_struct *mm = container_of(work, struct mm_struct, mm_cid.work);
- /* Make it compile, but not functional yet */
- if (!IS_ENABLED(CONFIG_NEW_MM_CID))
- return;
-
guard(mutex)(&mm->mm_cid.mutex);
/* Did the last user task exit already? */
if (!mm->mm_cid.users)
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -3704,84 +3704,8 @@ static __always_inline void mm_cid_sched
else
mm_cid_from_cpu(next, cpu_cid);
}
-
-/* Active implementation */
-static inline void init_sched_mm_cid(struct task_struct *t)
-{
- struct mm_struct *mm = t->mm;
- unsigned int max_cid;
-
- if (!mm)
- return;
-
- /* Preset last_mm_cid */
- max_cid = min_t(int, READ_ONCE(mm->mm_cid.nr_cpus_allowed), atomic_read(&mm->mm_users));
- t->mm_cid.last_cid = max_cid - 1;
-}
-
-static inline bool __mm_cid_get(struct task_struct *t, unsigned int cid, unsigned int max_cids)
-{
- struct mm_struct *mm = t->mm;
-
- if (cid >= max_cids)
- return false;
- if (test_and_set_bit(cid, mm_cidmask(mm)))
- return false;
- t->mm_cid.cid = t->mm_cid.last_cid = cid;
- __this_cpu_write(mm->mm_cid.pcpu->cid, cid);
- return true;
-}
-
-static inline bool mm_cid_get(struct task_struct *t)
-{
- struct mm_struct *mm = t->mm;
- unsigned int max_cids;
-
- max_cids = READ_ONCE(mm->mm_cid.max_cids);
-
- /* Try to reuse the last CID of this task */
- if (__mm_cid_get(t, t->mm_cid.last_cid, max_cids))
- return true;
-
- /* Try to reuse the last CID of this mm on this CPU */
- if (__mm_cid_get(t, __this_cpu_read(mm->mm_cid.pcpu->cid), max_cids))
- return true;
-
- /* Try the first zero bit in the cidmask. */
- return __mm_cid_get(t, find_first_zero_bit(mm_cidmask(mm), nr_cpu_ids), max_cids);
-}
-
-static inline void mm_cid_select(struct task_struct *t)
-{
- /*
- * mm_cid_get() can fail when the maximum CID, which is determined
- * by min(mm->nr_cpus_allowed, mm->mm_users) changes concurrently.
- * That's a transient failure as there cannot be more tasks
- * concurrently on a CPU (or about to be scheduled in) than that.
- */
- for (;;) {
- if (mm_cid_get(t))
- break;
- }
-}
-
-static inline void switch_mm_cid(struct task_struct *prev, struct task_struct *next)
-{
- if (prev->mm_cid.active) {
- if (prev->mm_cid.cid != MM_CID_UNSET)
- clear_bit(prev->mm_cid.cid, mm_cidmask(prev->mm));
- prev->mm_cid.cid = MM_CID_UNSET;
- }
-
- if (next->mm_cid.active) {
- mm_cid_select(next);
- rseq_sched_set_task_mm_cid(next, next->mm_cid.cid);
- }
-}
-
#else /* !CONFIG_SCHED_MM_CID: */
-static inline void mm_cid_select(struct task_struct *t) { }
-static inline void switch_mm_cid(struct task_struct *prev, struct task_struct *next) { }
+static inline void mm_cid_schedin(struct task_struct *next) { }
#endif /* !CONFIG_SCHED_MM_CID */
extern u64 avg_vruntime(struct cfs_rq *cfs_rq);