Message-ID: <20250220093257.9380-22-kprateek.nayak@amd.com>
Date: Thu, 20 Feb 2025 09:32:56 +0000
From: K Prateek Nayak <kprateek.nayak@....com>
To: Peter Zijlstra <peterz@...radead.org>, Ingo Molnar <mingo@...hat.com>,
Juri Lelli <juri.lelli@...hat.com>, Vincent Guittot
<vincent.guittot@...aro.org>, Valentin Schneider <vschneid@...hat.com>, "Ben
Segall" <bsegall@...gle.com>, Thomas Gleixner <tglx@...utronix.de>, "Andy
Lutomirski" <luto@...nel.org>, <linux-kernel@...r.kernel.org>
CC: Dietmar Eggemann <dietmar.eggemann@....com>, Steven Rostedt
<rostedt@...dmis.org>, Mel Gorman <mgorman@...e.de>, "Sebastian Andrzej
Siewior" <bigeasy@...utronix.de>, Clark Williams <clrkwllms@...nel.org>,
<linux-rt-devel@...ts.linux.dev>, Tejun Heo <tj@...nel.org>, "Frederic
Weisbecker" <frederic@...nel.org>, Barret Rhoden <brho@...gle.com>, "Petr
Mladek" <pmladek@...e.com>, Josh Don <joshdon@...gle.com>, Qais Yousef
<qyousef@...alina.io>, "Paul E. McKenney" <paulmck@...nel.org>, David Vernet
<dvernet@...a.com>, K Prateek Nayak <kprateek.nayak@....com>, "Gautham R.
Shenoy" <gautham.shenoy@....com>, Swapnil Sapkal <swapnil.sapkal@....com>
Subject: [RFC PATCH 21/22] [MAYBE BUGFIX] sched/fair: Group all the se->min_* members together for propagation
At some point while working on the prototype, min_kcs_vruntime heap
integrity issues were encountered. Many bugs have since been ironed out,
but when auditing rb_{add,erase}_augmented_cached(), it was noticed that
only "min_vruntime" was passed as RBAUGMENTED to RB_DECLARE_CALLBACKS().

The generated augment->copy() callback only copies the RBAUGMENTED
member. It is very likely that augment->propagate() is always called
after augment->copy() - similar to the convention for augment->rotate() -
and that it reconstructs all of min_{vruntime,slice,kcs_vruntime} before
comparing whether the values have changed. However, this author was not
sure whether augment->propagate() indeed reaches the copied node for
Case 2 in __rb_erase_augmented().

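For reference, with "min_vruntime" as RBAUGMENTED, the copy callback
generated by RB_DECLARE_CALLBACKS() roughly expands to the following
(an illustrative sketch, not the verbatim macro expansion):

	static inline void
	min_vruntime_cb_copy(struct rb_node *rb_old, struct rb_node *rb_new)
	{
		struct sched_entity *old = rb_entry(rb_old, struct sched_entity, run_node);
		struct sched_entity *new = rb_entry(rb_new, struct sched_entity, run_node);

		/* Only the single RBAUGMENTED member is carried over ... */
		new->min_vruntime = old->min_vruntime;

		/*
		 * ... min_slice and min_kcs_vruntime of the new node are
		 * left untouched unless a subsequent propagate()/rotate()
		 * recomputes them via min_vruntime_update().
		 */
	}
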
Group all the min_* members that need propagation to the rb_root into a
new "min" struct and pass that struct as RBAUGMENTED to ease the
author's paranoia.

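With the grouped struct as RBAUGMENTED, the same generated copy callback
turns into a plain struct assignment, so all three members travel
together (again, only a sketch of the expansion):

	/* new->RBAUGMENTED = old->RBAUGMENTED now copies the whole struct */
	new->min = old->min;	/* vruntime, slice and kcs_vruntime */
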
XXX: Reverting this bit reintroduces the heap integrity issue. It does
not happen immediately into the run, and the traces are too overwhelming
to analyze which series of rbtree operations leads to it, but with this
patch I haven't run into the heap integrity issue so far. The cover
letter should have more details.

Signed-off-by: K Prateek Nayak <kprateek.nayak@....com>
---
include/linux/sched.h | 20 +++++++++-------
kernel/sched/debug.c | 2 +-
kernel/sched/fair.c | 54 +++++++++++++++++++++----------------------
3 files changed, 40 insertions(+), 36 deletions(-)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 200cc086e121..6fd5cee3a0f5 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -549,8 +549,18 @@ struct sched_entity {
struct load_weight load;
struct rb_node run_node;
u64 deadline;
- u64 min_vruntime;
- u64 min_slice;
+ struct {
+ u64 vruntime;
+ u64 slice;
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ /*
+ * min_vruntime of the kernel mode preempted
+ * entities in the subtree of this sched entity.
+ */
+ s64 kcs_vruntime;
+#endif
+ } min;
struct list_head group_node;
unsigned char on_rq;
@@ -601,12 +611,6 @@ struct sched_entity {
/* Entity picked on a throttled hierarchy */
unsigned char sched_throttled;
/* hole */
-
- /*
- * min_vruntime of the kernel mode preempted entities
- * in the subtree of this sched entity.
- */
- s64 min_kcs_vruntime;
#endif /* CONFIG_CFS_BANDWIDTH */
#endif /* CONFIG_FAIR_GROUP_SCHED */
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index ef047add7f9e..37bca53f109f 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -821,7 +821,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
raw_spin_rq_lock_irqsave(rq, flags);
root = __pick_root_entity(cfs_rq);
if (root)
- left_vruntime = root->min_vruntime;
+ left_vruntime = root->min.vruntime;
first = __pick_first_entity(cfs_rq);
if (first)
left_deadline = first->deadline;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 39c7e8f548ca..97566a043398 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -768,7 +768,7 @@ int pick_subtree(struct cfs_rq *cfs_rq, struct sched_entity *se, bool h_throttle
if (unlikely(h_throttled))
return pick_subtree_on_throttled(cfs_rq, se);
- return vruntime_eligible(cfs_rq, se->min_vruntime);
+ return vruntime_eligible(cfs_rq, se->min.vruntime);
}
static u64 __update_min_vruntime(struct cfs_rq *cfs_rq, u64 vruntime)
@@ -800,9 +800,9 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq)
if (se) {
if (!curr)
- vruntime = se->min_vruntime;
+ vruntime = se->min.vruntime;
else
- vruntime = min_vruntime(vruntime, se->min_vruntime);
+ vruntime = min_vruntime(vruntime, se->min.vruntime);
}
/* ensure we never gain time by being placed backwards. */
@@ -819,7 +819,7 @@ static inline u64 cfs_rq_min_slice(struct cfs_rq *cfs_rq)
min_slice = curr->slice;
if (root)
- min_slice = min(min_slice, root->min_slice);
+ min_slice = min(min_slice, root->min.slice);
return min_slice;
}
@@ -835,8 +835,8 @@ static inline void __min_vruntime_update(struct sched_entity *se, struct rb_node
{
if (node) {
struct sched_entity *rse = __node_2_se(node);
- if (vruntime_gt(min_vruntime, se, rse))
- se->min_vruntime = rse->min_vruntime;
+ if (vruntime_gt(min.vruntime, se, rse))
+ se->min.vruntime = rse->min.vruntime;
}
}
@@ -844,8 +844,8 @@ static inline void __min_slice_update(struct sched_entity *se, struct rb_node *n
{
if (node) {
struct sched_entity *rse = __node_2_se(node);
- if (rse->min_slice < se->min_slice)
- se->min_slice = rse->min_slice;
+ if (rse->min.slice < se->min.slice)
+ se->min.slice = rse->min.slice;
}
}
@@ -853,30 +853,30 @@ static __always_inline void init_se_kcs_stats(struct sched_entity *se);
static inline bool min_kcs_vruntime_update(struct sched_entity *se);
/*
- * se->min_vruntime = min(se->vruntime, {left,right}->min_vruntime)
+ * se->min.vruntime = min(se->vruntime, {left,right}->min_vruntime)
*/
static inline bool min_vruntime_update(struct sched_entity *se, bool exit)
{
- u64 old_min_vruntime = se->min_vruntime;
- u64 old_min_slice = se->min_slice;
+ u64 old_min_vruntime = se->min.vruntime;
+ u64 old_min_slice = se->min.slice;
struct rb_node *node = &se->run_node;
bool kcs_stats_unchanged = min_kcs_vruntime_update(se);
- se->min_vruntime = se->vruntime;
+ se->min.vruntime = se->vruntime;
__min_vruntime_update(se, node->rb_right);
__min_vruntime_update(se, node->rb_left);
- se->min_slice = se->slice;
+ se->min.slice = se->slice;
__min_slice_update(se, node->rb_right);
__min_slice_update(se, node->rb_left);
- return se->min_vruntime == old_min_vruntime &&
- se->min_slice == old_min_slice &&
+ return se->min.vruntime == old_min_vruntime &&
+ se->min.slice == old_min_slice &&
kcs_stats_unchanged;
}
RB_DECLARE_CALLBACKS(static, min_vruntime_cb, struct sched_entity,
- run_node, min_vruntime, min_vruntime_update);
+ run_node, min, min_vruntime_update);
/*
* Enqueue an entity into the rb-tree:
@@ -885,8 +885,8 @@ static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
avg_vruntime_add(cfs_rq, se);
init_se_kcs_stats(se);
- se->min_vruntime = se->vruntime;
- se->min_slice = se->slice;
+ se->min.vruntime = se->vruntime;
+ se->min.slice = se->slice;
rb_add_augmented_cached(&se->run_node, &cfs_rq->tasks_timeline,
__entity_less, &min_vruntime_cb);
}
@@ -953,7 +953,7 @@ static inline void cancel_protect_slice(struct sched_entity *se)
* tree keeps the entries sorted on deadline, but also functions as a
* heap based on the vruntime by keeping:
*
- * se->min_vruntime = min(se->vruntime, se->{left,right}->min_vruntime)
+ * se->min.vruntime = min(se->vruntime, se->{left,right}->min_vruntime)
*
* Which allows tree pruning through eligibility.
*/
@@ -7018,7 +7018,7 @@ static __always_inline void init_se_kcs_stats(struct sched_entity *se)
* the upper bound to differentiate the case where no kernel mode preempted
* entities are queued on the subtree.
*/
- se->min_kcs_vruntime = (se_in_kernel(se)) ? se->vruntime : LLONG_MAX;
+ se->min.kcs_vruntime = (se_in_kernel(se)) ? se->vruntime : LLONG_MAX;
}
/*
@@ -7096,11 +7096,11 @@ static __always_inline
int pick_subtree_on_throttled(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
/* There are no kernel mode preempted entities in the subtree. */
- if (se->min_kcs_vruntime == LLONG_MAX)
+ if (se->min.kcs_vruntime == LLONG_MAX)
return false;
return throttled_vruntime_eligible(cfs_rq,
- se->min_kcs_vruntime,
+ se->min.kcs_vruntime,
curr_h_is_throttled(cfs_rq));
}
@@ -7109,21 +7109,21 @@ static inline void __min_kcs_vruntime_update(struct sched_entity *se, struct rb_
if (node) {
struct sched_entity *rse = __node_2_se(node);
- if (rse->min_kcs_vruntime < se->min_kcs_vruntime)
- se->min_kcs_vruntime = rse->min_kcs_vruntime;
+ if (rse->min.kcs_vruntime < se->min.kcs_vruntime)
+ se->min.kcs_vruntime = rse->min.kcs_vruntime;
}
}
static inline bool min_kcs_vruntime_update(struct sched_entity *se)
{
- u64 old_min_kcs_vruntime = se->min_kcs_vruntime;
+ u64 old_min_kcs_vruntime = se->min.kcs_vruntime;
struct rb_node *node = &se->run_node;
init_se_kcs_stats(se);
__min_kcs_vruntime_update(se, node->rb_right);
__min_kcs_vruntime_update(se, node->rb_left);
- return se->min_kcs_vruntime == old_min_kcs_vruntime;
+ return se->min.kcs_vruntime == old_min_kcs_vruntime;
}
static inline void account_kcs_enqueue(struct cfs_rq *gcfs_rq, bool in_kernel)
@@ -7341,7 +7341,7 @@ static __always_inline int pick_se_on_throttled(struct cfs_rq *cfs_rq, struct sc
static __always_inline
int pick_subtree_on_throttled(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- return vruntime_eligible(cfs_rq, se->min_vruntime);
+ return vruntime_eligible(cfs_rq, se->min.vruntime);
}
static inline bool min_kcs_vruntime_update(struct sched_entity *se)
--
2.43.0