lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20250220093257.9380-4-kprateek.nayak@amd.com>
Date: Thu, 20 Feb 2025 09:32:38 +0000
From: K Prateek Nayak <kprateek.nayak@....com>
To: Peter Zijlstra <peterz@...radead.org>, Ingo Molnar <mingo@...hat.com>,
	Juri Lelli <juri.lelli@...hat.com>, Vincent Guittot
	<vincent.guittot@...aro.org>, Valentin Schneider <vschneid@...hat.com>, "Ben
 Segall" <bsegall@...gle.com>, Thomas Gleixner <tglx@...utronix.de>, "Andy
 Lutomirski" <luto@...nel.org>, <linux-kernel@...r.kernel.org>
CC: Dietmar Eggemann <dietmar.eggemann@....com>, Steven Rostedt
	<rostedt@...dmis.org>, Mel Gorman <mgorman@...e.de>, "Sebastian Andrzej
 Siewior" <bigeasy@...utronix.de>, Clark Williams <clrkwllms@...nel.org>,
	<linux-rt-devel@...ts.linux.dev>, Tejun Heo <tj@...nel.org>, "Frederic
 Weisbecker" <frederic@...nel.org>, Barret Rhoden <brho@...gle.com>, "Petr
 Mladek" <pmladek@...e.com>, Josh Don <joshdon@...gle.com>, Qais Yousef
	<qyousef@...alina.io>, "Paul E. McKenney" <paulmck@...nel.org>, David Vernet
	<dvernet@...a.com>, K Prateek Nayak <kprateek.nayak@....com>, "Gautham R.
 Shenoy" <gautham.shenoy@....com>, Swapnil Sapkal <swapnil.sapkal@....com>
Subject: [RFC PATCH 03/22] [PoC] kernel/entry/common: Mark syscall as a kernel critical section

Mark the syscall boundary as a kernel critical section. Use a per-task
"kernel_cs_count" to track task's entry from userspace and exit to
userspace. When "kernel_cs_count" is non-zero, the task is executing in
kernel mode.

For this Proof-of-Concept, "kernel_cs_count" can only be 1 or 0 for a
task and the implementation will run with the same assumption. The
critical section is defined as an integer count to allow fine grained
control in the future where certain boundaries within the kernel can be
marked as resource holding critical sections.

For the sake of simplicity, the whole kernel mode is marked as a
critical section in this PoC. For future extensibility,
sched_notify_critical_section_{entry,exit}() helpers are defined to mark
the boundaries of a kernel critical section, similar to the
preempt_count() mechanism.

Signed-off-by: K Prateek Nayak <kprateek.nayak@....com>
---
 include/linux/sched.h | 19 ++++++++++++++++++-
 kernel/entry/common.c |  7 +++++++
 kernel/entry/common.h |  4 ++++
 kernel/sched/fair.c   | 20 ++++++++++++++++++++
 4 files changed, 49 insertions(+), 1 deletion(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 34862d904ea3..63f3f235a5c1 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -577,7 +577,24 @@ struct sched_entity {
 	/* cached value of my_q->h_nr_running */
 	unsigned int			runnable_weight;
 	int				depth;
-#endif
+
+#ifdef CONFIG_CFS_BANDWIDTH
+	/*
+	 * Keep track of tasks, and cfs_rq(s) that contain tasks
+	 * running in kernel mode. Any throttling event for the
+	 * cfs_rq will be deferred until this count hits 0.
+	 *
+	 * Semantics:
+	 *
+	 * - For task: It indicates whether the task is currently
+	 *   running in kernel mode. It is always 0 or 1.
+	 *
+	 * TODO: Describe for sched_entity when implementing.
+	 */
+	int				kernel_cs_count;
+					/* hole */
+#endif /* CONFIG_CFS_BANDWIDTH */
+#endif /* CONFIG_FAIR_GROUP_SCHED */
 
 #ifdef CONFIG_SMP
 	/*
diff --git a/kernel/entry/common.c b/kernel/entry/common.c
index cc93cdcc36d0..b132b96e2b96 100644
--- a/kernel/entry/common.c
+++ b/kernel/entry/common.c
@@ -83,6 +83,8 @@ __always_inline long syscall_enter_from_user_mode_work(struct pt_regs *regs, lon
 {
 	unsigned long work = READ_ONCE(current_thread_info()->syscall_work);
 
+	sched_notify_critical_section_entry();
+
 	if (work & SYSCALL_WORK_ENTER)
 		syscall = syscall_trace_enter(regs, syscall, work);
 
@@ -214,6 +216,11 @@ static __always_inline void __syscall_exit_to_user_mode_work(struct pt_regs *reg
 {
 	syscall_exit_to_user_mode_prepare(regs);
 	local_irq_disable_exit_to_user();
+	/*
+	 * Notify scheduler that the task is exiting to userspace after a
+	 * syscall. Must be called before checking for NEED_RESCHED work.
+	 */
+	sched_notify_critical_section_exit();
 	exit_to_user_mode_prepare(regs);
 }
 
diff --git a/kernel/entry/common.h b/kernel/entry/common.h
index f6e6d02f07fe..73e699a4c3e9 100644
--- a/kernel/entry/common.h
+++ b/kernel/entry/common.h
@@ -4,4 +4,8 @@
 
 bool syscall_user_dispatch(struct pt_regs *regs);
 
+/* sched notifiers for CFS bandwidth deferral */
+extern void sched_notify_critical_section_entry(void);
+extern void sched_notify_critical_section_exit(void);
+
 #endif
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 857808da23d8..becf2d35f35a 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -58,6 +58,8 @@
 #include "stats.h"
 #include "autogroup.h"
 
+#include "../entry/common.h" /* critical section entry / exit notifiers */
+
 /*
  * The initial- and re-scaling of tunables is configurable
  *
@@ -6704,6 +6706,20 @@ bool cfs_task_bw_constrained(struct task_struct *p)
 	return false;
 }
 
+__always_inline void sched_notify_critical_section_entry(void)
+{
+	current->se.kernel_cs_count++;
+	/*
+	 * From this point on, the task is considered to be in a kernel
+	 * critical section and will defer bandwidth throttling.
+	 */
+}
+
+__always_inline void sched_notify_critical_section_exit(void)
+{
+	current->se.kernel_cs_count--;
+}
+
 #ifdef CONFIG_NO_HZ_FULL
 /* called from pick_next_task_fair() */
 static void sched_fair_update_stop_tick(struct rq *rq, struct task_struct *p)
@@ -6772,6 +6788,10 @@ bool cfs_task_bw_constrained(struct task_struct *p)
 	return false;
 }
 #endif
+
+__always_inline void sched_notify_critical_section_entry(void) {}
+__always_inline void sched_notify_critical_section_exit(void) {}
+
 #endif /* CONFIG_CFS_BANDWIDTH */
 
 #if !defined(CONFIG_CFS_BANDWIDTH) || !defined(CONFIG_NO_HZ_FULL)
-- 
2.43.0


Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ