Message-ID: <20250220093257.9380-8-kprateek.nayak@amd.com>
Date: Thu, 20 Feb 2025 09:32:42 +0000
From: K Prateek Nayak <kprateek.nayak@....com>
To: Peter Zijlstra <peterz@...radead.org>, Ingo Molnar <mingo@...hat.com>,
	Juri Lelli <juri.lelli@...hat.com>, Vincent Guittot
	<vincent.guittot@...aro.org>, Valentin Schneider <vschneid@...hat.com>, "Ben
 Segall" <bsegall@...gle.com>, Thomas Gleixner <tglx@...utronix.de>, "Andy
 Lutomirski" <luto@...nel.org>, <linux-kernel@...r.kernel.org>
CC: Dietmar Eggemann <dietmar.eggemann@....com>, Steven Rostedt
	<rostedt@...dmis.org>, Mel Gorman <mgorman@...e.de>, "Sebastian Andrzej
 Siewior" <bigeasy@...utronix.de>, Clark Williams <clrkwllms@...nel.org>,
	<linux-rt-devel@...ts.linux.dev>, Tejun Heo <tj@...nel.org>, "Frederic
 Weisbecker" <frederic@...nel.org>, Barret Rhoden <brho@...gle.com>, "Petr
 Mladek" <pmladek@...e.com>, Josh Don <joshdon@...gle.com>, Qais Yousef
	<qyousef@...alina.io>, "Paul E. McKenney" <paulmck@...nel.org>, David Vernet
	<dvernet@...a.com>, K Prateek Nayak <kprateek.nayak@....com>, "Gautham R.
 Shenoy" <gautham.shenoy@....com>, Swapnil Sapkal <swapnil.sapkal@....com>
Subject: [RFC PATCH 07/22] sched/fair: Propagate preempted entity information up cgroup hierarchy

Use "kernel_cs_count" in cfs_rq's sched entity to track the number of
preempted entities queued on the cfs_rq. Since the current task can
frequently switch in and out of kernel mode, "cfs_rq->curr" and the
whole sched entity hierarchy of the current task is treated special.

This is similar to "runnable_sum" except "kernel_cs_count" does not
have a corresponding count in the in struct cfs_rq.
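
As a rough, standalone illustration (user-space only, not kernel code;
the names below are made up for this example), a group level's count is
just the sum over its queued children with the current one left out:

  #include <assert.h>

  static int group_kcs_count(const int child_count[], const int child_is_curr[], int nr)
  {
  	int i, sum = 0;

  	for (i = 0; i < nr; i++) {
  		if (child_is_curr[i])
  			continue;	/* curr is accounted only at key decision points */
  		sum += child_count[i];
  	}
  	return sum;
  }

  int main(void)
  {
  	/* Three children: preempted in kernel, current (in kernel), in user mode */
  	int count[]   = { 1, 1, 0 };
  	int is_curr[] = { 0, 1, 0 };

  	/* Only the first child contributes to the group's kernel_cs_count */
  	assert(group_kcs_count(count, is_curr, 3) == 1);
  	return 0;
  }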

The counts are managed at enqueue, dequeue, and task pick boundaries
since the entities on the rbtree then have a stable "kernel_cs_count".
Use min_vruntime_cb_propagate() generated from RB_DECLARE_CALLBACKS()
to propagate the adjustments up to the root of the rbtree.

Since the propagation requires knowing whether the task being enqueued /
dequeued / put / set is in kernel mode, expose se_in_kernel() to generic
code to streamline the propagation.
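
For example, a compressed user-space sketch of the boundary accounting
(again illustrative only; in the patch the per-level adjustment is done
by account_kcs_enqueue()/account_kcs_dequeue() from the existing
hierarchy walks at these boundaries, not by a loop inside the helper):

  #include <assert.h>

  /* Toy entity: a task or a group, linked to the group one level up */
  struct demo_se {
  	int kernel_cs_count;
  	struct demo_se *parent;
  };

  /* At an enqueue boundary, every group level above the task gains a count */
  static void demo_kcs_enqueue(struct demo_se *se, int in_kernel)
  {
  	if (!in_kernel)
  		return;
  	for (se = se->parent; se; se = se->parent)
  		se->kernel_cs_count++;
  }

  /* At a dequeue (or pick/put/set) boundary, every level gives it back */
  static void demo_kcs_dequeue(struct demo_se *se, int in_kernel)
  {
  	if (!in_kernel)
  		return;
  	for (se = se->parent; se; se = se->parent) {
  		se->kernel_cs_count--;
  		assert(se->kernel_cs_count >= 0);	/* mirrors the SCHED_WARN_ON()s */
  	}
  }

  int main(void)
  {
  	struct demo_se root  = { 0 };
  	struct demo_se group = { .parent = &root };
  	struct demo_se task  = { .kernel_cs_count = 1, .parent = &group };	/* in kernel mode */

  	demo_kcs_enqueue(&task, task.kernel_cs_count);
  	assert(group.kernel_cs_count == 1 && root.kernel_cs_count == 1);

  	/* The task is picked to run: its hierarchy is discounted again */
  	demo_kcs_dequeue(&task, task.kernel_cs_count);
  	assert(group.kernel_cs_count == 0 && root.kernel_cs_count == 0);
  	return 0;
  }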

Signed-off-by: K Prateek Nayak <kprateek.nayak@....com>
---
 include/linux/sched.h |   7 ++-
 kernel/sched/fair.c   | 121 ++++++++++++++++++++++++++++++++++++++++--
 2 files changed, 124 insertions(+), 4 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 4bb7e45758f4..48115de839a7 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -589,7 +589,12 @@ struct sched_entity {
 	 * - For task: It represents if the task is currently
 	 *   running in kernel mode. It is always 0 or 1.
 	 *
-	 * TODO: Describe for sched_entity when implementing.
+	 * - For cfs_rq: It represents the sum of the
+	 *   kernel_cs_count of the entities queued below it
+	 *   except for the "current". Since the current task can
+	 *   frequently switch in and out of kernel mode, its
+	 *   hierarchy is treated specially and its contribution
+	 *   is accounted at key decision points.
 	 */
 	int				kernel_cs_count;
 					/* hole */
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index ba1bd60ce433..9f28624e4442 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6732,7 +6732,7 @@ __always_inline void sched_notify_critical_section_exit(void)
 	current->se.kernel_cs_count--;
 }
 
-static inline int se_in_kernel(struct sched_entity *se)
+static __always_inline int se_in_kernel(struct sched_entity *se)
 {
 	return se->kernel_cs_count;
 }
@@ -6817,6 +6817,76 @@ static inline bool min_kcs_vruntime_update(struct sched_entity *se)
 	return se->min_kcs_vruntime == old_min_kcs_vruntime;
 }
 
+static inline void account_kcs_enqueue(struct cfs_rq *gcfs_rq, bool in_kernel)
+{
+	struct sched_entity *se;
+	struct cfs_rq *cfs_rq;
+
+	if (!in_kernel)
+		return;
+
+	se = gcfs_rq->tg->se[cpu_of(rq_of(gcfs_rq))];
+	if (!se)
+		return;
+
+	cfs_rq = cfs_rq_of(se);
+	se->kernel_cs_count++;
+
+	/* se has transitioned into a kernel mode preempted entity */
+	if (se->kernel_cs_count == 1 && se != cfs_rq->curr && se->on_rq) {
+		/*
+		 * Must be done after "kernel_cs_count" has been
+		 * incremented since avg_kcs_vruntime_add() only
+		 * adjusts the stats if it believes the entity is in
+		 * the kernel mode.
+		 */
+		avg_kcs_vruntime_add(cfs_rq, se);
+
+		/* Propagate min_kcs_vruntime_update() till rb_root */
+		min_vruntime_cb_propagate(&se->run_node, NULL);
+	}
+
+	/* Sanity check */
+	SCHED_WARN_ON(se->kernel_cs_count > gcfs_rq->h_nr_queued);
+}
+
+static inline void account_kcs_dequeue(struct cfs_rq *gcfs_rq, bool in_kernel)
+{
+	struct sched_entity *se;
+	struct cfs_rq *cfs_rq;
+	bool transition_out;
+
+	if (!in_kernel)
+		return;
+
+	se = gcfs_rq->tg->se[cpu_of(rq_of(gcfs_rq))];
+	if (!se)
+		return;
+
+	cfs_rq = cfs_rq_of(se);
+	transition_out = se->kernel_cs_count == 1;
+
+	/*
+	 * Discount the load and avg_kcs_vruntime contribution if the
+	 * entity is transitioning out. Must be done before
+	 * "kernel_cs_count" is decremented as avg_kcs_vruntime_sub()
+	 * should still consider it to be in kernel mode to adjust
+	 * the stats.
+	 */
+	if (transition_out && se != cfs_rq->curr && se->on_rq)
+		avg_kcs_vruntime_sub(cfs_rq, se);
+
+	se->kernel_cs_count--;
+
+	/* Propagate min_kcs_vruntime_update() till rb_root */
+	if (transition_out && se != cfs_rq->curr && se->on_rq)
+		min_vruntime_cb_propagate(&se->run_node, NULL);
+
+	/* Sanity checks */
+	SCHED_WARN_ON(se->kernel_cs_count > gcfs_rq->h_nr_queued);
+	SCHED_WARN_ON(se->kernel_cs_count < 0);
+}
+
 #ifdef CONFIG_NO_HZ_FULL
 /* called from pick_next_task_fair() */
 static void sched_fair_update_stop_tick(struct rq *rq, struct task_struct *p)
@@ -6889,6 +6959,11 @@ bool cfs_task_bw_constrained(struct task_struct *p)
 __always_inline void sched_notify_critical_section_entry(void) {}
 __always_inline void sched_notify_critical_section_exit(void) {}
 
+static __always_inline int se_in_kernel(struct sched_entity *se)
+{
+	return false;
+}
+
 static __always_inline void avg_kcs_vruntime_add(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
 static __always_inline void avg_kcs_vruntime_sub(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
 static __always_inline void avg_kcs_vruntime_update(struct cfs_rq *cfs_rq, s64 delta) {}
@@ -6899,6 +6974,9 @@ static inline bool min_kcs_vruntime_update(struct sched_entity *se)
 	return true;
 }
 
+static inline void account_kcs_enqueue(struct cfs_rq *gcfs_rq, bool in_kernel) {}
+static inline void account_kcs_dequeue(struct cfs_rq *gcfs_rq, bool in_kernel) {}
+
 #endif /* CONFIG_CFS_BANDWIDTH */
 
 #if !defined(CONFIG_CFS_BANDWIDTH) || !defined(CONFIG_NO_HZ_FULL)
@@ -7056,6 +7134,8 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 {
 	struct cfs_rq *cfs_rq;
 	struct sched_entity *se = &p->se;
+	/* A task on CPU has its in-kernel stats discounted */
+	bool task_in_kernel = !task_on_cpu(rq, p) && se_in_kernel(se);
 	int h_nr_idle = task_has_idle_policy(p);
 	int h_nr_runnable = 1;
 	int task_new = !(flags & ENQUEUE_WAKEUP);
@@ -7110,6 +7190,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 		cfs_rq->h_nr_runnable += h_nr_runnable;
 		cfs_rq->h_nr_queued++;
 		cfs_rq->h_nr_idle += h_nr_idle;
+		account_kcs_enqueue(cfs_rq, task_in_kernel);
 
 		if (cfs_rq_is_idle(cfs_rq))
 			h_nr_idle = 1;
@@ -7136,6 +7217,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 		cfs_rq->h_nr_runnable += h_nr_runnable;
 		cfs_rq->h_nr_queued++;
 		cfs_rq->h_nr_idle += h_nr_idle;
+		account_kcs_enqueue(cfs_rq, task_in_kernel);
 
 		if (cfs_rq_is_idle(cfs_rq))
 			h_nr_idle = 1;
@@ -7196,6 +7278,7 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
 	bool task_sleep = flags & DEQUEUE_SLEEP;
 	bool task_delayed = flags & DEQUEUE_DELAYED;
 	struct task_struct *p = NULL;
+	bool task_in_kernel = false;
 	int h_nr_idle = 0;
 	int h_nr_queued = 0;
 	int h_nr_runnable = 0;
@@ -7205,6 +7288,8 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
 	if (entity_is_task(se)) {
 		p = task_of(se);
 		h_nr_queued = 1;
+		/* A task on CPU has its in-kernel stats discounted */
+		task_in_kernel = !task_on_cpu(rq, p) && se_in_kernel(se);
 		h_nr_idle = task_has_idle_policy(p);
 		if (task_sleep || task_delayed || !se->sched_delayed)
 			h_nr_runnable = 1;
@@ -7226,6 +7311,7 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
 		cfs_rq->h_nr_runnable -= h_nr_runnable;
 		cfs_rq->h_nr_queued -= h_nr_queued;
 		cfs_rq->h_nr_idle -= h_nr_idle;
+		account_kcs_dequeue(cfs_rq, task_in_kernel);
 
 		if (cfs_rq_is_idle(cfs_rq))
 			h_nr_idle = h_nr_queued;
@@ -7267,6 +7353,7 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
 		cfs_rq->h_nr_runnable -= h_nr_runnable;
 		cfs_rq->h_nr_queued -= h_nr_queued;
 		cfs_rq->h_nr_idle -= h_nr_idle;
+		account_kcs_dequeue(cfs_rq, task_in_kernel);
 
 		if (cfs_rq_is_idle(cfs_rq))
 			h_nr_idle = h_nr_queued;
@@ -9029,6 +9116,8 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf
 	 */
 	if (prev != p) {
 		struct sched_entity *pse = &prev->se;
+		bool prev_in_kernel = task_on_rq_queued(prev) && se_in_kernel(pse);
+		bool next_in_kernel = se_in_kernel(se);
 		struct cfs_rq *cfs_rq;
 
 		while (!(cfs_rq = is_same_group(se, pse))) {
@@ -9036,18 +9125,35 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf
 			int pse_depth = pse->depth;
 
 			if (se_depth <= pse_depth) {
-				put_prev_entity(cfs_rq_of(pse), pse);
+				cfs_rq = cfs_rq_of(pse);
+
+				account_kcs_enqueue(cfs_rq, prev_in_kernel);
+				put_prev_entity(cfs_rq, pse);
 				pse = parent_entity(pse);
 			}
 			if (se_depth >= pse_depth) {
-				set_next_entity(cfs_rq_of(se), se);
+				cfs_rq = cfs_rq_of(se);
+
+				set_next_entity(cfs_rq, se);
 				se = parent_entity(se);
+				account_kcs_dequeue(cfs_rq, next_in_kernel);
 			}
 		}
 
 		put_prev_entity(cfs_rq, pse);
 		set_next_entity(cfs_rq, se);
 
+		if (prev_in_kernel != next_in_kernel) {
+			for_each_sched_entity(se) {
+				cfs_rq = cfs_rq_of(se);
+
+				if (prev_in_kernel)
+					account_kcs_enqueue(cfs_rq, prev_in_kernel);
+				else
+					account_kcs_dequeue(cfs_rq, next_in_kernel);
+			}
+		}
+
 		__set_next_task_fair(rq, p, true);
 	}
 
@@ -9114,10 +9220,17 @@ void fair_server_init(struct rq *rq)
 static void put_prev_task_fair(struct rq *rq, struct task_struct *prev, struct task_struct *next)
 {
 	struct sched_entity *se = &prev->se;
+	/*
+	 * When next is NULL, it is a save-restore operation. If the task is no
+	 * longer queued on the rq, the stats have already been discounted at
+	 * pick and should not be adjusted here.
+	 */
+	bool task_in_kernel = next && task_on_rq_queued(prev) && se_in_kernel(se);
 	struct cfs_rq *cfs_rq;
 
 	for_each_sched_entity(se) {
 		cfs_rq = cfs_rq_of(se);
+		account_kcs_enqueue(cfs_rq, task_in_kernel);
 		put_prev_entity(cfs_rq, se);
 	}
 }
@@ -13424,11 +13537,13 @@ static void __set_next_task_fair(struct rq *rq, struct task_struct *p, bool firs
 static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first)
 {
 	struct sched_entity *se = &p->se;
+	bool task_in_kernel = !task_on_cpu(rq, p) && se_in_kernel(se);
 
 	for_each_sched_entity(se) {
 		struct cfs_rq *cfs_rq = cfs_rq_of(se);
 
 		set_next_entity(cfs_rq, se);
+		account_kcs_dequeue(cfs_rq, task_in_kernel);
 		/* ensure bandwidth has been allocated on our new cfs_rq */
 		account_cfs_rq_runtime(cfs_rq, 0);
 	}
-- 
2.43.0

