Date: Fri,  2 Feb 2024 09:09:17 +0100
From: Valentin Schneider <vschneid@...hat.com>
To: linux-kernel@...r.kernel.org
Cc: Benjamin Segall <bsegall@...gle.com>,
	Ingo Molnar <mingo@...hat.com>,
	Peter Zijlstra <peterz@...radead.org>,
	Juri Lelli <juri.lelli@...hat.com>,
	Vincent Guittot <vincent.guittot@...aro.org>,
	Dietmar Eggemann <dietmar.eggemann@....com>,
	Steven Rostedt <rostedt@...dmis.org>,
	Mel Gorman <mgorman@...e.de>,
	Daniel Bristot de Oliveira <bristot@...hat.com>,
	Phil Auld <pauld@...hat.com>,
	Clark Williams <williams@...hat.com>,
	Tomas Glozar <tglozar@...hat.com>
Subject: [RFC PATCH v2 2/5] sched: Note schedule() invocations at return-to-user with SM_USER

task_struct.in_return_to_user is currently updated via atomic operations in
schedule_usermode().

However, one can note:
o .in_return_to_user is only updated for the current task
o There are no remote (smp_processor_id() != task_cpu(p)) accesses to
  .in_return_to_user

Add schedule_with_mode() to factor out the body of schedule() so that
different flags can be passed down to __schedule_loop().

Add SM_USER to denote schedule() calls from return-to-userspace points.

Update .in_return_to_user from within the preemption-disabled, rq_lock-held
part of __schedule().
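
Condensed, the resulting flow for the CFS_BANDWIDTH case looks as follows
(intermediate frames elided; taken from the hunks below):

  schedule_usermode()
    schedule_with_mode(SM_USER)
      __schedule_loop(SM_USER)
        __schedule(SM_USER)
          rq_lock(rq, &rf)
          prev->in_return_to_user = true  /* current task only, rq lock held */
          sched_mode &= ~SM_USER          /* consumed before SM_MASK_PREEMPT  */
          ...
          context_switch()
            ...
              finish_lock_switch()        /* runs as the incoming task */
                current->in_return_to_user = false

In other words, a task marks itself just before being switched out at a
return-to-user point, and the mark is cleared the next time the task is
switched back in, in both cases from under the rq lock.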

Suggested-by: Benjamin Segall <bsegall@...gle.com>
Signed-off-by: Valentin Schneider <vschneid@...hat.com>
---
 include/linux/sched.h |  2 +-
 kernel/sched/core.c   | 43 ++++++++++++++++++++++++++++++++-----------
 kernel/sched/fair.c   | 17 ++++++++++++++++-
 3 files changed, 49 insertions(+), 13 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 4a0105d1eaa21..1b6f17b2150a6 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1544,7 +1544,7 @@ struct task_struct {
 #endif
 
 #ifdef CONFIG_CFS_BANDWIDTH
-	atomic_t			in_return_to_user;
+	int				in_return_to_user;
 #endif
 	/*
 	 * New fields for task_struct should be added above here, so that
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index a7c028fad5a89..54e6690626b13 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4531,7 +4531,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
 #endif
 #ifdef CONFIG_CFS_BANDWIDTH
 	INIT_LIST_HEAD(&p->se.kernel_node);
-	atomic_set(&p->in_return_to_user, 0);
+	p->in_return_to_user            = false;
 #endif
 
 #ifdef CONFIG_SCHEDSTATS
@@ -5147,6 +5147,9 @@ prepare_lock_switch(struct rq *rq, struct task_struct *next, struct rq_flags *rf
 
 static inline void finish_lock_switch(struct rq *rq)
 {
+#ifdef CONFIG_CFS_BANDWIDTH
+	current->in_return_to_user = false;
+#endif
 	/*
 	 * If we are tracking spinlock dependencies then we have to
 	 * fix up the runqueue lock - which gets 'carried over' from
@@ -6562,6 +6565,18 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
 #define SM_PREEMPT		0x1
 #define SM_RTLOCK_WAIT		0x2
 
+/*
+ * Special case for CFS_BANDWIDTH where we need to know if the call to
+ * __schedule() directly precedes an entry into userspace.
+ * The flag is cleared from the mode argument as soon as it has been consumed,
+ * so as not to defeat the SM_MASK_PREEMPT optimisation below.
+ */
+#ifdef CONFIG_CFS_BANDWIDTH
+# define SM_USER                0x4
+#else
+# define SM_USER                SM_NONE
+#endif
+
 #ifndef CONFIG_PREEMPT_RT
 # define SM_MASK_PREEMPT	(~0U)
 #else
@@ -6646,6 +6661,14 @@ static void __sched notrace __schedule(unsigned int sched_mode)
 	rq_lock(rq, &rf);
 	smp_mb__after_spinlock();
 
+#ifdef CONFIG_CFS_BANDWIDTH
+	if (sched_mode & SM_USER) {
+		prev->in_return_to_user = true;
+		sched_mode &= ~SM_USER;
+	}
+#endif
+	SCHED_WARN_ON(sched_mode & SM_USER);
+
 	/* Promote REQ to ACT */
 	rq->clock_update_flags <<= 1;
 	update_rq_clock(rq);
@@ -6807,7 +6830,7 @@ static __always_inline void __schedule_loop(unsigned int sched_mode)
 	} while (need_resched());
 }
 
-asmlinkage __visible void __sched schedule(void)
+static __always_inline void schedule_with_mode(unsigned int sched_mode)
 {
 	struct task_struct *tsk = current;
 
@@ -6817,22 +6840,20 @@ asmlinkage __visible void __sched schedule(void)
 
 	if (!task_is_running(tsk))
 		sched_submit_work(tsk);
-	__schedule_loop(SM_NONE);
+	__schedule_loop(sched_mode);
 	sched_update_worker(tsk);
 }
+
+asmlinkage __visible void __sched schedule(void)
+{
+	schedule_with_mode(SM_NONE);
+}
 EXPORT_SYMBOL(schedule);
 
 asmlinkage __visible void __sched schedule_usermode(void)
 {
 #ifdef CONFIG_CFS_BANDWIDTH
-	/*
-	 * This is only atomic because of this simple implementation. We could
-	 * do something with an SM_USER to avoid other-cpu scheduler operations
-	 * racing against these writes.
-	 */
-	atomic_set(&current->in_return_to_user, true);
-	schedule();
-	atomic_set(&current->in_return_to_user, false);
+	schedule_with_mode(SM_USER);
 #else
 	schedule();
 #endif
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index a1808459a5acc..96504be6ee14a 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6631,7 +6631,22 @@ static void enqueue_kernel(struct cfs_rq *cfs_rq, struct sched_entity *se, int c
 
 static bool is_kernel_task(struct task_struct *p)
 {
-	return sysctl_sched_cfs_bandwidth_kernel_bypass && !atomic_read(&p->in_return_to_user);
+	/*
+	 * The flag is updated within __schedule() with preemption disabled,
+	 * under the rq lock, and only when the task is current.
+	 *
+	 * Holding the rq lock for that task's CPU is thus sufficient for the
+	 * value to be stable, if the task is enqueued.
+	 *
+	 * If the task is dequeued, then task_cpu(p) *can* change, but this
+	 * check is so far only done from enqueue_task_fair(), which means either:
+	 * - the task is being activated and its CPU was set previously in ttwu()
+	 * - the task is going through a "change" cycle (e.g. sched_move_task()),
+	 *   in which case the pi_lock is also held, so the CPU is stable.
+	 */
+	lockdep_assert_rq_held(cpu_rq(task_cpu(p)));
+
+	return sysctl_sched_cfs_bandwidth_kernel_bypass && !p->in_return_to_user;
 }
 
 /*
-- 
2.43.0

