linux-kernel - Re: [PATCH 05/12] sched: Move sched_class::prio_changed() into the change pattern

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite for Android: free password hash cracker in your pocket
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <aWemQDHyF2FpNU2P@jlelli-thinkpadt14gen4.remote.csb>
Date: Wed, 14 Jan 2026 15:20:48 +0100
From: Juri Lelli <juri.lelli@...hat.com>
To: Peter Zijlstra <peterz@...radead.org>
Cc: K Prateek Nayak <kprateek.nayak@....com>,
	Pierre Gondois <pierre.gondois@....com>, tj@...nel.org,
	linux-kernel@...r.kernel.org, mingo@...nel.org,
	vincent.guittot@...aro.org, dietmar.eggemann@....com,
	rostedt@...dmis.org, bsegall@...gle.com, mgorman@...e.de,
	vschneid@...hat.com, longman@...hat.com, hannes@...xchg.org,
	mkoutny@...e.com, void@...ifault.com, arighi@...dia.com,
	changwoo@...lia.com, cgroups@...r.kernel.org,
	sched-ext@...ts.linux.dev, liuwenfang@...or.com, tglx@...utronix.de,
	Christian Loehle <christian.loehle@....com>,
	luca.abeni@...tannapisa.it
Subject: Re: [PATCH 05/12] sched: Move sched_class::prio_changed() into the
 change pattern

On 14/01/26 14:05, Peter Zijlstra wrote:
> On Wed, Jan 14, 2026 at 11:23:36AM +0100, Peter Zijlstra wrote:
> 
> > Juri, Luca, I'm tempted to suggest to simply remove the replenish on
> > RESTORE entirely -- that would allow the task to continue as it had
> > been, irrespective of it being 'late'.
> > 
> > Something like so -- what would this break?
> > 
> > --- a/kernel/sched/deadline.c
> > +++ b/kernel/sched/deadline.c
> > @@ -2214,10 +2214,6 @@ enqueue_dl_entity(struct sched_dl_entity
> >  		update_dl_entity(dl_se);
> >  	} else if (flags & ENQUEUE_REPLENISH) {
> >  		replenish_dl_entity(dl_se);
> > -	} else if ((flags & ENQUEUE_RESTORE) &&
> > -		   !is_dl_boosted(dl_se) &&
> > -		   dl_time_before(dl_se->deadline, rq_clock(rq_of_dl_se(dl_se)))) {
> > -		setup_new_dl_entity(dl_se);
> >  	}
> >  
> >  	/*
> 
> Ah, this is de-boost, right? Boosting allows one to break the CBS rules
> and then we have to rein in the excesses.
> 
> But we have {DE,EN}QUEUE_MOVE for this, that explicitly allows priority
> to change and is set for rt_mutex_setprio() (among others).
> 
> So doing s/RESTORE/MOVE/ above.
> 
> The corollary to all this is that everybody that sets MOVE must be able
> to deal with balance callbacks, so audit that too.
> 
> This then gives something like so.. which builds and boots for me, but
> clearly I haven't been able to trigger these funny cases.
> 
> ---
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -4969,9 +4969,13 @@ struct balance_callback *splice_balance_
>  	return __splice_balance_callbacks(rq, true);
>  }
>  
> -static void __balance_callbacks(struct rq *rq)
> +void __balance_callbacks(struct rq *rq, struct rq_flags *rf)
>  {
> +	if (rf)
> +		rq_unpin_lock(rq, rf);
>  	do_balance_callbacks(rq, __splice_balance_callbacks(rq, false));
> +	if (rf)
> +		rq_repin_lock(rq, rf);
>  }
>  
>  void balance_callbacks(struct rq *rq, struct balance_callback *head)
> @@ -5018,7 +5022,7 @@ static inline void finish_lock_switch(st
>  	 * prev into current:
>  	 */
>  	spin_acquire(&__rq_lockp(rq)->dep_map, 0, 0, _THIS_IP_);
> -	__balance_callbacks(rq);
> +	__balance_callbacks(rq, NULL);
>  	raw_spin_rq_unlock_irq(rq);
>  }
>  
> @@ -6901,7 +6905,7 @@ static void __sched notrace __schedule(i
>  			proxy_tag_curr(rq, next);
>  
>  		rq_unpin_lock(rq, &rf);
> -		__balance_callbacks(rq);
> +		__balance_callbacks(rq, NULL);
>  		raw_spin_rq_unlock_irq(rq);
>  	}
>  	trace_sched_exit_tp(is_switch);
> @@ -7350,7 +7354,7 @@ void rt_mutex_setprio(struct task_struct
>  	trace_sched_pi_setprio(p, pi_task);
>  	oldprio = p->prio;
>  
> -	if (oldprio == prio)
> +	if (oldprio == prio && !dl_prio(prio))
>  		queue_flag &= ~DEQUEUE_MOVE;
>  
>  	prev_class = p->sched_class;
> @@ -7396,9 +7400,7 @@ void rt_mutex_setprio(struct task_struct
>  out_unlock:
>  	/* Caller holds task_struct::pi_lock, IRQs are still disabled */
>  
> -	rq_unpin_lock(rq, &rf);
> -	__balance_callbacks(rq);
> -	rq_repin_lock(rq, &rf);
> +	__balance_callbacks(rq, &rf);
>  	__task_rq_unlock(rq, p, &rf);
>  }
>  #endif /* CONFIG_RT_MUTEXES */
> @@ -9167,6 +9169,8 @@ void sched_move_task(struct task_struct
>  
>  	if (resched)
>  		resched_curr(rq);
> +
> +	__balance_callbacks(rq, &rq_guard.rf);
>  }
>  
>  static struct cgroup_subsys_state *
> @@ -10891,6 +10895,9 @@ void sched_change_end(struct sched_chang
>  				resched_curr(rq);
>  		}
>  	} else {
> +		/*
> +		 * XXX validate prio only really changed when ENQUEUE_MOVE is set.
> +		 */
>  		p->sched_class->prio_changed(rq, p, ctx->prio);
>  	}
>  }
> --- a/kernel/sched/deadline.c
> +++ b/kernel/sched/deadline.c
> @@ -2214,9 +2214,14 @@ enqueue_dl_entity(struct sched_dl_entity
>  		update_dl_entity(dl_se);
>  	} else if (flags & ENQUEUE_REPLENISH) {
>  		replenish_dl_entity(dl_se);
> -	} else if ((flags & ENQUEUE_RESTORE) &&
> +	} else if ((flags & ENQUEUE_MOVE) &&
>  		   !is_dl_boosted(dl_se) &&
>  		   dl_time_before(dl_se->deadline, rq_clock(rq_of_dl_se(dl_se)))) {
> +		/*
> +		 * Deals with the de-boost case, and ENQUEUE_MOVE explicitly
> +		 * allows us to change priority. Callers are expected to deal
> +		 * with balance_callbacks.
> +		 */
>  		setup_new_dl_entity(dl_se);
>  	}
>  
> --- a/kernel/sched/ext.c
> +++ b/kernel/sched/ext.c
> @@ -545,6 +545,7 @@ static void scx_task_iter_start(struct s
>  static void __scx_task_iter_rq_unlock(struct scx_task_iter *iter)
>  {
>  	if (iter->locked_task) {
> +		__balance_callbacks(iter->rq, &iter->rf);
>  		task_rq_unlock(iter->rq, iter->locked_task, &iter->rf);
>  		iter->locked_task = NULL;
>  	}
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -2430,7 +2430,8 @@ extern const u32		sched_prio_to_wmult[40
>   *                should preserve as much state as possible.
>   *
>   * MOVE - paired with SAVE/RESTORE, explicitly does not preserve the location
> - *        in the runqueue.
> + *        in the runqueue. IOW the priority is allowed to change. Callers
> + *        must expect to deal with balance callbacks.
>   *
>   * NOCLOCK - skip the update_rq_clock() (avoids double updates)
>   *
> @@ -4019,6 +4020,8 @@ extern void enqueue_task(struct rq *rq,
>  extern bool dequeue_task(struct rq *rq, struct task_struct *p, int flags);
>  
>  extern struct balance_callback *splice_balance_callbacks(struct rq *rq);
> +
> +extern void __balance_callbacks(struct rq *rq, struct rq_flags *rf);
>  extern void balance_callbacks(struct rq *rq, struct balance_callback *head);
>  
>  /*
> --- a/kernel/sched/syscalls.c
> +++ b/kernel/sched/syscalls.c
> @@ -639,7 +639,7 @@ int __sched_setscheduler(struct task_str
>  		 * itself.
>  		 */
>  		newprio = rt_effective_prio(p, newprio);
> -		if (newprio == oldprio)
> +		if (newprio == oldprio && !dl_prio(newprio))
>  			queue_flags &= ~DEQUEUE_MOVE;
>  	}

We have been using (improperly?) ENQUEUE_RESTORE also to know when a new
entity gets setscheduled to DEADLINE (or its parameters are changed) and
it looks like this keeps that happening with DEQUEUE_MOVE. So, from a
quick first look, it does sound good to me.

Thanks!
Juri