Message-ID: <20250220093257.9380-6-kprateek.nayak@amd.com>
Date: Thu, 20 Feb 2025 09:32:40 +0000
From: K Prateek Nayak <kprateek.nayak@....com>
To: Peter Zijlstra <peterz@...radead.org>, Ingo Molnar <mingo@...hat.com>,
Juri Lelli <juri.lelli@...hat.com>, Vincent Guittot
<vincent.guittot@...aro.org>, Valentin Schneider <vschneid@...hat.com>, "Ben
Segall" <bsegall@...gle.com>, Thomas Gleixner <tglx@...utronix.de>, "Andy
Lutomirski" <luto@...nel.org>, <linux-kernel@...r.kernel.org>
CC: Dietmar Eggemann <dietmar.eggemann@....com>, Steven Rostedt
<rostedt@...dmis.org>, Mel Gorman <mgorman@...e.de>, "Sebastian Andrzej
Siewior" <bigeasy@...utronix.de>, Clark Williams <clrkwllms@...nel.org>,
<linux-rt-devel@...ts.linux.dev>, Tejun Heo <tj@...nel.org>, "Frederic
Weisbecker" <frederic@...nel.org>, Barret Rhoden <brho@...gle.com>, "Petr
Mladek" <pmladek@...e.com>, Josh Don <joshdon@...gle.com>, Qais Yousef
<qyousef@...alina.io>, "Paul E. McKenney" <paulmck@...nel.org>, David Vernet
<dvernet@...a.com>, K Prateek Nayak <kprateek.nayak@....com>, "Gautham R.
Shenoy" <gautham.shenoy@....com>, Swapnil Sapkal <swapnil.sapkal@....com>
Subject: [RFC PATCH 05/22] sched/fair: Track EEVDF stats for entities preempted in kernel mode
With throttle deferral, picking from a throttled hierarchy will only have to
choose among the kernel mode preempted entities queued on it.
Track the EEVDF stats of kernel mode preempted entities in avg_kcs_vruntime
and avg_kcs_load. These mirror avg_vruntime and avg_load respectively, but
only account for the kernel mode preempted entities queued on the rbtree.
Since all the eligibility checks are entity_key() based, and entity_key() is
relative to the cfs_rq's min_vruntime, also update avg_kcs_vruntime when the
min_vruntime of the cfs_rq changes.
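For illustration only (not part of this patch): a consumer of these stats
could derive the load-weighted average vruntime of kernel mode preempted
entities by mirroring the existing avg_vruntime() helper in
kernel/sched/fair.c. The helper name avg_kcs_vruntime_est() and the choice
to fold in cfs_rq->curr are assumptions made for the sketch below:

	/*
	 * Hypothetical sketch, mirroring avg_vruntime(): compute the
	 * load-weighted average vruntime of kernel mode preempted entities
	 * from the avg_kcs_* stats introduced in this patch.
	 */
	static u64 avg_kcs_vruntime_est(struct cfs_rq *cfs_rq)
	{
		struct sched_entity *curr = cfs_rq->curr;
		s64 avg = cfs_rq->avg_kcs_vruntime;
		long load = cfs_rq->avg_kcs_load;

		/* Fold in curr if it is queued and inside a kernel critical section */
		if (curr && curr->on_rq && se_in_kernel(curr)) {
			unsigned long weight = scale_load_down(curr->load.weight);

			avg += entity_key(cfs_rq, curr) * weight;
			load += weight;
		}

		if (load) {
			/* sign flips effective floor / ceiling */
			if (avg < 0)
				avg -= (load - 1);
			avg = div_s64(avg, load);
		}

		return cfs_rq->min_vruntime + avg;
	}

As with avg_vruntime(), the division is only done when the average is
actually needed; the hot enqueue/dequeue paths only maintain the keyed sums.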
Signed-off-by: K Prateek Nayak <kprateek.nayak@....com>
---
kernel/sched/fair.c | 62 ++++++++++++++++++++++++++++++++++++++++++++
kernel/sched/sched.h | 6 +++++
2 files changed, 68 insertions(+)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index becf2d35f35a..cbb7a227afe7 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -523,6 +523,9 @@ static int se_is_idle(struct sched_entity *se)
static __always_inline
void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec);
+static __always_inline void avg_kcs_vruntime_add(struct cfs_rq *cfs_rq, struct sched_entity *se);
+static __always_inline void avg_kcs_vruntime_sub(struct cfs_rq *cfs_rq, struct sched_entity *se);
+static __always_inline void avg_kcs_vruntime_update(struct cfs_rq *cfs_rq, s64 delta);
/**************************************************************
* Scheduling class tree data structure manipulation methods:
@@ -630,6 +633,7 @@ avg_vruntime_add(struct cfs_rq *cfs_rq, struct sched_entity *se)
cfs_rq->avg_vruntime += key * weight;
cfs_rq->avg_load += weight;
+ avg_kcs_vruntime_add(cfs_rq, se);
}
static void
@@ -640,6 +644,7 @@ avg_vruntime_sub(struct cfs_rq *cfs_rq, struct sched_entity *se)
cfs_rq->avg_vruntime -= key * weight;
cfs_rq->avg_load -= weight;
+ avg_kcs_vruntime_sub(cfs_rq, se);
}
static inline
@@ -649,6 +654,7 @@ void avg_vruntime_update(struct cfs_rq *cfs_rq, s64 delta)
* v' = v + d ==> avg_vruntime' = avg_runtime - d*avg_load
*/
cfs_rq->avg_vruntime -= cfs_rq->avg_load * delta;
+ avg_kcs_vruntime_update(cfs_rq, delta);
}
/*
@@ -6720,6 +6726,58 @@ __always_inline void sched_notify_critical_section_exit(void)
current->se.kernel_cs_count--;
}
+static inline int se_in_kernel(struct sched_entity *se)
+{
+ return se->kernel_cs_count;
+}
+
+/*
+ * Same as avg_vruntime_add() except avg_kcs_vruntime_add() only adjusts avg_kcs_vruntime
+ * and avg_kcs_load when a kernel mode preempted entity joins the rbtree.
+ */
+static __always_inline void avg_kcs_vruntime_add(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ unsigned long weight;
+ s64 key;
+
+ if (!se_in_kernel(se))
+ return;
+
+ weight = scale_load_down(se->load.weight);
+ key = entity_key(cfs_rq, se);
+
+ cfs_rq->avg_kcs_vruntime += key * weight;
+ cfs_rq->avg_kcs_load += weight;
+}
+
+/*
+ * Same as avg_vruntime_sub() except avg_kcs_vruntime_sub() only adjusts avg_kcs_vruntime
+ * and avg_kcs_load when a kernel mode preempted entity leaves the rbtree.
+ */
+static __always_inline void avg_kcs_vruntime_sub(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ unsigned long weight;
+ s64 key;
+
+ if (!se_in_kernel(se))
+ return;
+
+ weight = scale_load_down(se->load.weight);
+ key = entity_key(cfs_rq, se);
+
+ cfs_rq->avg_kcs_vruntime -= key * weight;
+ cfs_rq->avg_kcs_load -= weight;
+}
+
+/*
+ * Same as avg_vruntime_update() except it adjusts avg_kcs_vruntime based on avg_kcs_load
+ * when min_vruntime of the cfs_rq changes.
+ */
+static __always_inline void avg_kcs_vruntime_update(struct cfs_rq *cfs_rq, s64 delta)
+{
+ cfs_rq->avg_kcs_vruntime -= cfs_rq->avg_kcs_load * delta;
+}
+
#ifdef CONFIG_NO_HZ_FULL
/* called from pick_next_task_fair() */
static void sched_fair_update_stop_tick(struct rq *rq, struct task_struct *p)
@@ -6792,6 +6850,10 @@ bool cfs_task_bw_constrained(struct task_struct *p)
__always_inline void sched_notify_critical_section_entry(void) {}
__always_inline void sched_notify_critical_section_exit(void) {}
+static __always_inline void avg_kcs_vruntime_add(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
+static __always_inline void avg_kcs_vruntime_sub(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
+static __always_inline void avg_kcs_vruntime_update(struct cfs_rq *cfs_rq, s64 delta) {}
+
#endif /* CONFIG_CFS_BANDWIDTH */
#if !defined(CONFIG_CFS_BANDWIDTH) || !defined(CONFIG_NO_HZ_FULL)
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index ab16d3d0e51c..22567d236f82 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -658,6 +658,12 @@ struct cfs_rq {
s64 avg_vruntime;
u64 avg_load;
+#ifdef CONFIG_CFS_BANDWIDTH
+ /* EEVDF stats of entities preempted in kernel mode */
+ s64 avg_kcs_vruntime;
+ u64 avg_kcs_load;
+#endif
+
u64 min_vruntime;
#ifdef CONFIG_SCHED_CORE
unsigned int forceidle_seq;
--
2.43.0