linux-kernel - Re: [RFC PATCH v3 11/24] sched/rt: Add rt-cgroups' dl-servers operations.

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <aOY8YpZrMXn8iB5f@jlelli-thinkpadt14gen4.remote.csb>
Date: Wed, 8 Oct 2025 12:26:42 +0200
From: Juri Lelli <juri.lelli@...hat.com>
To: Yuri Andriaccio <yurand2000@...il.com>
Cc: Ingo Molnar <mingo@...hat.com>, Peter Zijlstra <peterz@...radead.org>,
	Vincent Guittot <vincent.guittot@...aro.org>,
	Dietmar Eggemann <dietmar.eggemann@....com>,
	Steven Rostedt <rostedt@...dmis.org>,
	Ben Segall <bsegall@...gle.com>, Mel Gorman <mgorman@...e.de>,
	Valentin Schneider <vschneid@...hat.com>,
	linux-kernel@...r.kernel.org,
	Luca Abeni <luca.abeni@...tannapisa.it>,
	Yuri Andriaccio <yuri.andriaccio@...tannapisa.it>
Subject: Re: [RFC PATCH v3 11/24] sched/rt: Add rt-cgroups' dl-servers
 operations.

Hello,

On 29/09/25 11:22, Yuri Andriaccio wrote:
> Implement the servers' functions that pick the next eligible task to run.
> Enable/Disable dl-servers on task enqueue/dequeue when necessary.
> Update dl-servers on task update.
> Account the number of active rt-tasks in the cgroups' specific runqueue.
> Account the number of active rt-tasks on the global counter of active tasks when
> a cgroup is enqueued/dequeued (dl-server started/stopped).
> Update rq's cpupri only if the cgroup is the root control group.
> Record which dl_server is managing a task when it changes runqueue.

The changelog looks a little dry. Claude suggests a rewrite along these lines:

---
sched/rt: Implement dl-server operations for rt-cgroups

Implement the dl-server backend that enables rt-cgroups to run as
deadline servers. This allows RT tasks within a cgroup to be scheduled
according to the cgroup's allocated bandwidth using deadline scheduling.

The implementation consists of three main parts:

1) Server task selection callbacks:
   - rt_server_has_tasks(): Check if the rt_rq has runnable tasks
   - rt_server_pick(): Pick and set the next RT task from the cgroup's
     rt_rq when the server gets CPU time

2) Server lifecycle management:
   - Start the dl-server when the first RT task enqueues to an idle
     rt-cgroup
   - Stop the dl-server when the last RT task dequeues from an rt-cgroup
   - Update the server's consumed runtime in update_curr_rt() via
     dl_server_update()

3) Per-cpu priority and nr_running accounting:
   - Only update rq->cpupri for the root rt_rq (not for cgroup rt_rqs)
     since cgroups are scheduled via their dl-server priority
   - For cgroup rt_rqs, update global nr_running only when the dl-server
     is active (not throttled), as the server acts as the runnable entity
   - Bulk update nr_running when the server starts/stops based on the
     rt_rq's current rt_nr_running count

The rt.parent field is removed as the new implementation doesn't use
hierarchical RT scheduling entities. Instead, tasks record their dl_rq
to track which dl-server manages them.
---

That seems to correspond to what this patch does. If that's the case,
however, I wonder whether we are doing too many things at once.

> Co-developed-by: Alessio Balsini <a.balsini@...up.it>
> Signed-off-by: Alessio Balsini <a.balsini@...up.it>
> Co-developed-by: Andrea Parri <parri.andrea@...il.com>
> Signed-off-by: Andrea Parri <parri.andrea@...il.com>
> Co-developed-by: luca abeni <luca.abeni@...tannapisa.it>
> Signed-off-by: luca abeni <luca.abeni@...tannapisa.it>
> Signed-off-by: Yuri Andriaccio <yurand2000@...il.com>
> ---
>  kernel/sched/deadline.c | 16 ++++++---
>  kernel/sched/rt.c       | 79 ++++++++++++++++++++++++++++++++++++-----
>  kernel/sched/sched.h    |  3 +-
>  3 files changed, 85 insertions(+), 13 deletions(-)
> 
> diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
> index 754bfe231b4..1293b9a252b 100644
> --- a/kernel/sched/deadline.c
> +++ b/kernel/sched/deadline.c
> @@ -1869,9 +1869,13 @@ void inc_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
>  	u64 deadline = dl_se->deadline;
>  
>  	dl_rq->dl_nr_running++;
> -
> -	if (!dl_server(dl_se))
> +	if (!dl_server(dl_se)) {
>  		add_nr_running(rq_of_dl_rq(dl_rq), 1);
> +	} else if (dl_se != &rq_of_dl_rq(dl_rq)->fair_server) {

I fear this condition might get unwieldy with the addition of new
servers (e.g. sched_scx).

> +		struct rt_rq *rt_rq = &dl_se->my_q->rt;
> +
> +		add_nr_running(rq_of_dl_rq(dl_rq), rt_rq->rt_nr_running);
> +	}
>  
>  	inc_dl_deadline(dl_rq, deadline);
>  }
> @@ -1881,9 +1885,13 @@ void dec_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
>  {
>  	WARN_ON(!dl_rq->dl_nr_running);
>  	dl_rq->dl_nr_running--;
> -
> -	if (!dl_server(dl_se))
> +	if (!dl_server(dl_se)) {
>  		sub_nr_running(rq_of_dl_rq(dl_rq), 1);
> +	} else if (dl_se != &rq_of_dl_rq(dl_rq)->fair_server) {

Ditto.

> +		struct rt_rq *rt_rq = &dl_se->my_q->rt;
> +
> +		sub_nr_running(rq_of_dl_rq(dl_rq), rt_rq->rt_nr_running);
> +	}
>  
>  	dec_dl_deadline(dl_rq, dl_se->deadline);
>  }
> diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
> index 3094f59d0c8..d9442f64c6b 100644
> --- a/kernel/sched/rt.c
> +++ b/kernel/sched/rt.c
> @@ -144,14 +144,27 @@ void init_tg_rt_entry(struct task_group *tg, struct rq *served_rq,
>  	tg->dl_se[cpu] = dl_se;
>  }
>  
> +static struct task_struct *_pick_next_task_rt(struct rt_rq *rt_rq);
> +static inline void set_next_task_rt(struct rq *rq, struct task_struct *p, bool first);
> +
>  static bool rt_server_has_tasks(struct sched_dl_entity *dl_se)
>  {
> -	return false;
> +	return !!dl_se->my_q->rt.rt_nr_running;
>  }
>  
>  static struct task_struct *rt_server_pick(struct sched_dl_entity *dl_se)
>  {
> -	return NULL;
> +	struct rt_rq *rt_rq = &dl_se->my_q->rt;
> +	struct rq *rq = rq_of_rt_rq(rt_rq);
> +	struct task_struct *p;
> +
> +	if (dl_se->my_q->rt.rt_nr_running == 0)

Can't we use rt_server_has_tasks()?

> +		return NULL;
> +
> +	p = _pick_next_task_rt(rt_rq);
> +	set_next_task_rt(rq, p, true);
> +
> +	return p;

...

> @@ -953,9 +1011,14 @@ static struct sched_rt_entity *pick_next_rt_entity(struct rt_rq *rt_rq)
>  	return next;
>  }
>  
> -static struct task_struct *_pick_next_task_rt(struct rq *rq)
> +static struct task_struct *_pick_next_task_rt(struct rt_rq *rt_rq)
>  {
> -	return NULL;
> +	struct sched_rt_entity *rt_se;
> +
> +	rt_se = pick_next_rt_entity(rt_rq);
> +	BUG_ON(!rt_se);

Can we WARN and recover somehow?

> +
> +	return rt_task_of(rt_se);
>  }
>  
>  static struct task_struct *pick_task_rt(struct rq *rq)
> @@ -965,7 +1028,7 @@ static struct task_struct *pick_task_rt(struct rq *rq)
>  	if (!sched_rt_runnable(rq))
>  		return NULL;
>  
> -	p = _pick_next_task_rt(rq);
> +	p = _pick_next_task_rt(&rq->rt);
>  
>  	return p;
>  }
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index 9853f321363..b2c87541257 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -2170,7 +2170,7 @@ static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
>  	if (!rt_group_sched_enabled())
>  		tg = &root_task_group;
>  	p->rt.rt_rq  = tg->rt_rq[cpu];
> -	p->rt.parent = tg->rt_se[cpu];
> +	p->dl.dl_rq  = &cpu_rq(cpu)->dl;

Guess rt.parent is then removed in a subsequent patch? Do we want to
consolidate the cleanup?

>  #endif /* CONFIG_RT_GROUP_SCHED */
>  }
>  
> @@ -2726,6 +2726,7 @@ static inline void add_nr_running(struct rq *rq, unsigned count)
>  
>  static inline void sub_nr_running(struct rq *rq, unsigned count)
>  {
> +	BUG_ON(rq->nr_running < count);

Can we WARN and recover somehow?

>  	rq->nr_running -= count;
>  	if (trace_sched_update_nr_running_tp_enabled()) {
>  		call_trace_sched_update_nr_running(rq, -count);

Thanks,
Juri