Message-ID: <20260114130528.GB831285@noisy.programming.kicks-ass.net>
Date: Wed, 14 Jan 2026 14:05:28 +0100
From: Peter Zijlstra <peterz@...radead.org>
To: K Prateek Nayak <kprateek.nayak@....com>
Cc: Pierre Gondois <pierre.gondois@....com>, tj@...nel.org,
	linux-kernel@...r.kernel.org, mingo@...nel.org,
	juri.lelli@...hat.com, vincent.guittot@...aro.org,
	dietmar.eggemann@....com, rostedt@...dmis.org, bsegall@...gle.com,
	mgorman@...e.de, vschneid@...hat.com, longman@...hat.com,
	hannes@...xchg.org, mkoutny@...e.com, void@...ifault.com,
	arighi@...dia.com, changwoo@...lia.com, cgroups@...r.kernel.org,
	sched-ext@...ts.linux.dev, liuwenfang@...or.com, tglx@...utronix.de,
	Christian Loehle <christian.loehle@....com>,
	luca.abeni@...tannapisa.it
Subject: Re: [PATCH 05/12] sched: Move sched_class::prio_changed() into the
 change pattern

On Wed, Jan 14, 2026 at 11:23:36AM +0100, Peter Zijlstra wrote:

> Juri, Luca, I'm tempted to suggest to simply remove the replenish on
> RESTORE entirely -- that would allow the task to continue as it had
> been, irrespective of it being 'late'.
> 
> Something like so -- what would this break?
> 
> --- a/kernel/sched/deadline.c
> +++ b/kernel/sched/deadline.c
> @@ -2214,10 +2214,6 @@ enqueue_dl_entity(struct sched_dl_entity
>  		update_dl_entity(dl_se);
>  	} else if (flags & ENQUEUE_REPLENISH) {
>  		replenish_dl_entity(dl_se);
> -	} else if ((flags & ENQUEUE_RESTORE) &&
> -		   !is_dl_boosted(dl_se) &&
> -		   dl_time_before(dl_se->deadline, rq_clock(rq_of_dl_se(dl_se)))) {
> -		setup_new_dl_entity(dl_se);
>  	}
>  
>  	/*

Ah, this is de-boost, right? Boosting allows one to break the CBS
(Constant Bandwidth Server) rules and then we have to rein in the
excesses.
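
(For reference, the overdue test in the hunk above is a wrap-safe u64
comparison; a minimal sketch, where dl_deadline_overdue() is a
hypothetical helper -- the kernel open-codes the condition in
enqueue_dl_entity():)

	static inline bool dl_time_before(u64 a, u64 b)
	{
		return (s64)(a - b) < 0;	/* signed diff handles wrap */
	}

	static inline bool dl_deadline_overdue(struct sched_dl_entity *dl_se)
	{
		/* deadline already behind the current rq clock? */
		return dl_time_before(dl_se->deadline,
				      rq_clock(rq_of_dl_se(dl_se)));
	}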

But we have {DE,EN}QUEUE_MOVE for this, which explicitly allows the
priority to change and is set for rt_mutex_setprio() (among others).
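
For context, a simplified sketch of the change pattern those flags feed
(modelled on rt_mutex_setprio(); error handling and the clock update
elided):

	int queue_flag = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
	bool queued = task_on_rq_queued(p);
	bool running = task_current(rq, p);

	if (queued)
		dequeue_task(rq, p, queue_flag);
	if (running)
		put_prev_task(rq, p);

	/* change p->prio / p->sched_class here */

	if (queued)
		enqueue_task(rq, p, queue_flag);	/* SAVE/RESTORE + MOVE */
	if (running)
		set_next_task(rq, p);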

So doing s/RESTORE/MOVE/ above.

The corollary to all this is that every caller that sets MOVE must be
able to deal with balance callbacks, so audit that too.
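
To recap the mechanism being audited: classes queue deferred work on
the rq while holding its lock, and that work must run once the lock can
be dropped. Roughly:

	struct balance_callback {
		struct balance_callback *next;
		void (*func)(struct rq *rq);
	};

	/* e.g. deadline.c queues a push while holding rq->lock: */
	queue_balance_callback(rq, &per_cpu(dl_push_head, rq->cpu),
			       push_dl_tasks);

	/* ...so a site that set MOVE must flush before unlocking: */
	__balance_callbacks(rq, &rf);		/* see below */
	task_rq_unlock(rq, p, &rf);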

This then gives something like so... which builds and boots for me, but
clearly I haven't been able to trigger these funny cases.

---
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4969,9 +4969,13 @@ struct balance_callback *splice_balance_
 	return __splice_balance_callbacks(rq, true);
 }
 
-static void __balance_callbacks(struct rq *rq)
+void __balance_callbacks(struct rq *rq, struct rq_flags *rf)
 {
+	if (rf)
+		rq_unpin_lock(rq, rf);
 	do_balance_callbacks(rq, __splice_balance_callbacks(rq, false));
+	if (rf)
+		rq_repin_lock(rq, rf);
 }
 
 void balance_callbacks(struct rq *rq, struct balance_callback *head)
@@ -5018,7 +5022,7 @@ static inline void finish_lock_switch(st
 	 * prev into current:
 	 */
 	spin_acquire(&__rq_lockp(rq)->dep_map, 0, 0, _THIS_IP_);
-	__balance_callbacks(rq);
+	__balance_callbacks(rq, NULL);
 	raw_spin_rq_unlock_irq(rq);
 }
 
@@ -6901,7 +6905,7 @@ static void __sched notrace __schedule(i
 			proxy_tag_curr(rq, next);
 
 		rq_unpin_lock(rq, &rf);
-		__balance_callbacks(rq);
+		__balance_callbacks(rq, NULL);
 		raw_spin_rq_unlock_irq(rq);
 	}
 	trace_sched_exit_tp(is_switch);
@@ -7350,7 +7354,7 @@ void rt_mutex_setprio(struct task_struct
 	trace_sched_pi_setprio(p, pi_task);
 	oldprio = p->prio;
 
-	if (oldprio == prio)
+	if (oldprio == prio && !dl_prio(prio))
 		queue_flag &= ~DEQUEUE_MOVE;
 
 	prev_class = p->sched_class;
@@ -7396,9 +7400,7 @@ void rt_mutex_setprio(struct task_struct
 out_unlock:
 	/* Caller holds task_struct::pi_lock, IRQs are still disabled */
 
-	rq_unpin_lock(rq, &rf);
-	__balance_callbacks(rq);
-	rq_repin_lock(rq, &rf);
+	__balance_callbacks(rq, &rf);
 	__task_rq_unlock(rq, p, &rf);
 }
 #endif /* CONFIG_RT_MUTEXES */
@@ -9167,6 +9169,8 @@ void sched_move_task(struct task_struct
 
 	if (resched)
 		resched_curr(rq);
+
+	__balance_callbacks(rq, &rq_guard.rf);
 }
 
 static struct cgroup_subsys_state *
@@ -10891,6 +10895,9 @@ void sched_change_end(struct sched_chang
 				resched_curr(rq);
 		}
 	} else {
+		/*
+		 * XXX validate prio only really changed when ENQUEUE_MOVE is set.
+		 */
 		p->sched_class->prio_changed(rq, p, ctx->prio);
 	}
 }
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -2214,9 +2214,14 @@ enqueue_dl_entity(struct sched_dl_entity
 		update_dl_entity(dl_se);
 	} else if (flags & ENQUEUE_REPLENISH) {
 		replenish_dl_entity(dl_se);
-	} else if ((flags & ENQUEUE_RESTORE) &&
+	} else if ((flags & ENQUEUE_MOVE) &&
 		   !is_dl_boosted(dl_se) &&
 		   dl_time_before(dl_se->deadline, rq_clock(rq_of_dl_se(dl_se)))) {
+		/*
+		 * Deals with the de-boost case, and ENQUEUE_MOVE explicitly
+		 * allows us to change priority. Callers are expected to deal
+		 * with balance_callbacks.
+		 */
 		setup_new_dl_entity(dl_se);
 	}
 
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -545,6 +545,7 @@ static void scx_task_iter_start(struct s
 static void __scx_task_iter_rq_unlock(struct scx_task_iter *iter)
 {
 	if (iter->locked_task) {
+		__balance_callbacks(iter->rq, &iter->rf);
 		task_rq_unlock(iter->rq, iter->locked_task, &iter->rf);
 		iter->locked_task = NULL;
 	}
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2430,7 +2430,8 @@ extern const u32		sched_prio_to_wmult[40
  *                should preserve as much state as possible.
  *
  * MOVE - paired with SAVE/RESTORE, explicitly does not preserve the location
- *        in the runqueue.
+ *        in the runqueue. IOW the priority is allowed to change. Callers
+ *        must expect to deal with balance callbacks.
  *
  * NOCLOCK - skip the update_rq_clock() (avoids double updates)
  *
@@ -4019,6 +4020,8 @@ extern void enqueue_task(struct rq *rq,
 extern bool dequeue_task(struct rq *rq, struct task_struct *p, int flags);
 
 extern struct balance_callback *splice_balance_callbacks(struct rq *rq);
+
+extern void __balance_callbacks(struct rq *rq, struct rq_flags *rf);
 extern void balance_callbacks(struct rq *rq, struct balance_callback *head);
 
 /*
--- a/kernel/sched/syscalls.c
+++ b/kernel/sched/syscalls.c
@@ -639,7 +639,7 @@ int __sched_setscheduler(struct task_str
 		 * itself.
 		 */
 		newprio = rt_effective_prio(p, newprio);
-		if (newprio == oldprio)
+		if (newprio == oldprio && !dl_prio(newprio))
 			queue_flags &= ~DEQUEUE_MOVE;
 	}
 
