[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20260114130528.GB831285@noisy.programming.kicks-ass.net>
Date: Wed, 14 Jan 2026 14:05:28 +0100
From: Peter Zijlstra <peterz@...radead.org>
To: K Prateek Nayak <kprateek.nayak@....com>
Cc: Pierre Gondois <pierre.gondois@....com>, tj@...nel.org,
linux-kernel@...r.kernel.org, mingo@...nel.org,
juri.lelli@...hat.com, vincent.guittot@...aro.org,
dietmar.eggemann@....com, rostedt@...dmis.org, bsegall@...gle.com,
mgorman@...e.de, vschneid@...hat.com, longman@...hat.com,
hannes@...xchg.org, mkoutny@...e.com, void@...ifault.com,
arighi@...dia.com, changwoo@...lia.com, cgroups@...r.kernel.org,
sched-ext@...ts.linux.dev, liuwenfang@...or.com, tglx@...utronix.de,
Christian Loehle <christian.loehle@....com>,
luca.abeni@...tannapisa.it
Subject: Re: [PATCH 05/12] sched: Move sched_class::prio_changed() into the
change pattern
On Wed, Jan 14, 2026 at 11:23:36AM +0100, Peter Zijlstra wrote:
> Juri, Luca, I'm tempted to suggest to simply remove the replenish on
> RESTORE entirely -- that would allow the task to continue as it had
> been, irrespective of it being 'late'.
>
> Something like so -- what would this break?
>
> --- a/kernel/sched/deadline.c
> +++ b/kernel/sched/deadline.c
> @@ -2214,10 +2214,6 @@ enqueue_dl_entity(struct sched_dl_entity
> update_dl_entity(dl_se);
> } else if (flags & ENQUEUE_REPLENISH) {
> replenish_dl_entity(dl_se);
> - } else if ((flags & ENQUEUE_RESTORE) &&
> - !is_dl_boosted(dl_se) &&
> - dl_time_before(dl_se->deadline, rq_clock(rq_of_dl_se(dl_se)))) {
> - setup_new_dl_entity(dl_se);
> }
>
> /*
Ah, this is de-boost, right? Boosting allows one to break the CBS rules
and then we have to rein in the excesses.
But we have {DE,EN}QUEUE_MOVE for this, that explicitly allows priority
to change and is set for rt_mutex_setprio() (among others).
So I'm doing s/RESTORE/MOVE/ above.
The corollary to all this is that everybody that sets MOVE must be able
to deal with balance callbacks, so audit that too.
This then gives something like so... it builds and boots for me, but
clearly I haven't been able to trigger these funny cases.
---
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4969,9 +4969,13 @@ struct balance_callback *splice_balance_
return __splice_balance_callbacks(rq, true);
}
-static void __balance_callbacks(struct rq *rq)
+void __balance_callbacks(struct rq *rq, struct rq_flags *rf)
{
+ if (rf)
+ rq_unpin_lock(rq, rf);
do_balance_callbacks(rq, __splice_balance_callbacks(rq, false));
+ if (rf)
+ rq_repin_lock(rq, rf);
}
void balance_callbacks(struct rq *rq, struct balance_callback *head)
@@ -5018,7 +5022,7 @@ static inline void finish_lock_switch(st
* prev into current:
*/
spin_acquire(&__rq_lockp(rq)->dep_map, 0, 0, _THIS_IP_);
- __balance_callbacks(rq);
+ __balance_callbacks(rq, NULL);
raw_spin_rq_unlock_irq(rq);
}
@@ -6901,7 +6905,7 @@ static void __sched notrace __schedule(i
proxy_tag_curr(rq, next);
rq_unpin_lock(rq, &rf);
- __balance_callbacks(rq);
+ __balance_callbacks(rq, NULL);
raw_spin_rq_unlock_irq(rq);
}
trace_sched_exit_tp(is_switch);
@@ -7350,7 +7354,7 @@ void rt_mutex_setprio(struct task_struct
trace_sched_pi_setprio(p, pi_task);
oldprio = p->prio;
- if (oldprio == prio)
+ if (oldprio == prio && !dl_prio(prio))
queue_flag &= ~DEQUEUE_MOVE;
prev_class = p->sched_class;
@@ -7396,9 +7400,7 @@ void rt_mutex_setprio(struct task_struct
out_unlock:
/* Caller holds task_struct::pi_lock, IRQs are still disabled */
- rq_unpin_lock(rq, &rf);
- __balance_callbacks(rq);
- rq_repin_lock(rq, &rf);
+ __balance_callbacks(rq, &rf);
__task_rq_unlock(rq, p, &rf);
}
#endif /* CONFIG_RT_MUTEXES */
@@ -9167,6 +9169,8 @@ void sched_move_task(struct task_struct
if (resched)
resched_curr(rq);
+
+ __balance_callbacks(rq, &rq_guard.rf);
}
static struct cgroup_subsys_state *
@@ -10891,6 +10895,9 @@ void sched_change_end(struct sched_chang
resched_curr(rq);
}
} else {
+ /*
+ * XXX validate prio only really changed when ENQUEUE_MOVE is set.
+ */
p->sched_class->prio_changed(rq, p, ctx->prio);
}
}
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -2214,9 +2214,14 @@ enqueue_dl_entity(struct sched_dl_entity
update_dl_entity(dl_se);
} else if (flags & ENQUEUE_REPLENISH) {
replenish_dl_entity(dl_se);
- } else if ((flags & ENQUEUE_RESTORE) &&
+ } else if ((flags & ENQUEUE_MOVE) &&
!is_dl_boosted(dl_se) &&
dl_time_before(dl_se->deadline, rq_clock(rq_of_dl_se(dl_se)))) {
+ /*
+ * Deals with the de-boost case, and ENQUEUE_MOVE explicitly
+ * allows us to change priority. Callers are expected to deal
+ * with balance_callbacks.
+ */
setup_new_dl_entity(dl_se);
}
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -545,6 +545,7 @@ static void scx_task_iter_start(struct s
static void __scx_task_iter_rq_unlock(struct scx_task_iter *iter)
{
if (iter->locked_task) {
+ __balance_callbacks(iter->rq, &iter->rf);
task_rq_unlock(iter->rq, iter->locked_task, &iter->rf);
iter->locked_task = NULL;
}
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2430,7 +2430,8 @@ extern const u32 sched_prio_to_wmult[40
* should preserve as much state as possible.
*
* MOVE - paired with SAVE/RESTORE, explicitly does not preserve the location
- * in the runqueue.
+ * in the runqueue. IOW the priority is allowed to change. Callers
+ * must expect to deal with balance callbacks.
*
* NOCLOCK - skip the update_rq_clock() (avoids double updates)
*
@@ -4019,6 +4020,8 @@ extern void enqueue_task(struct rq *rq,
extern bool dequeue_task(struct rq *rq, struct task_struct *p, int flags);
extern struct balance_callback *splice_balance_callbacks(struct rq *rq);
+
+extern void __balance_callbacks(struct rq *rq, struct rq_flags *rf);
extern void balance_callbacks(struct rq *rq, struct balance_callback *head);
/*
--- a/kernel/sched/syscalls.c
+++ b/kernel/sched/syscalls.c
@@ -639,7 +639,7 @@ int __sched_setscheduler(struct task_str
* itself.
*/
newprio = rt_effective_prio(p, newprio);
- if (newprio == oldprio)
+ if (newprio == oldprio && !dl_prio(newprio))
queue_flags &= ~DEQUEUE_MOVE;
}
Powered by blists - more mailing lists