Message-ID: <20250731105543.40832-20-yurand2000@gmail.com>
Date: Thu, 31 Jul 2025 12:55:37 +0200
From: Yuri Andriaccio <yurand2000@...il.com>
To: Ingo Molnar <mingo@...hat.com>,
Peter Zijlstra <peterz@...radead.org>,
Juri Lelli <juri.lelli@...hat.com>,
Vincent Guittot <vincent.guittot@...aro.org>,
Dietmar Eggemann <dietmar.eggemann@....com>,
Steven Rostedt <rostedt@...dmis.org>,
Ben Segall <bsegall@...gle.com>,
Mel Gorman <mgorman@...e.de>,
Valentin Schneider <vschneid@...hat.com>
Cc: linux-kernel@...r.kernel.org,
Luca Abeni <luca.abeni@...tannapisa.it>,
Yuri Andriaccio <yuri.andriaccio@...tannapisa.it>
Subject: [RFC PATCH v2 19/25] sched/deadline: Allow deeper hierarchies of RT cgroups
From: luca abeni <luca.abeni@...tannapisa.it>
Allow the creation of cgroup hierarchies with depth greater than two.

Add a check to prevent attaching tasks to a child cgroup of an active
cgroup (i.e. one with a running FIFO/RR task).

Add a check to prevent attaching tasks to cgroups which have children
with non-zero runtime.

Update the allocated-bandwidth accounting of rt-cgroups to handle nested
cgroup hierarchies.
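
As an illustrative sketch only (not part of the patch), the kind of
nested setup this enables could be configured through the cgroup-v1
"cpu" controller. The mount point and group names below are assumptions
for the example, and CONFIG_RT_GROUP_SCHED must be enabled:

	/* Sketch: build a three-level RT hierarchy (previously rejected). */
	#include <stdio.h>
	#include <sys/stat.h>

	static void set_rt_runtime(const char *cg, long runtime_us)
	{
		char path[256];
		FILE *f;

		snprintf(path, sizeof(path), "%s/cpu.rt_runtime_us", cg);
		f = fopen(path, "w");
		if (!f)
			return;
		fprintf(f, "%ld\n", runtime_us);
		fclose(f);
	}

	int main(void)
	{
		mkdir("/sys/fs/cgroup/cpu/rt-a", 0755);
		mkdir("/sys/fs/cgroup/cpu/rt-a/rt-b", 0755);

		/* Parent: 400ms per 1s period; nested child: 200ms per 1s. */
		set_rt_runtime("/sys/fs/cgroup/cpu/rt-a", 400000);
		set_rt_runtime("/sys/fs/cgroup/cpu/rt-a/rt-b", 200000);
		return 0;
	}

With the checks below, RT tasks can then be attached to rt-a/rt-b (the
active leaf) but not to rt-a itself, since rt-a has a child with
non-zero runtime.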
Co-developed-by: Yuri Andriaccio <yurand2000@...il.com>
Signed-off-by: Yuri Andriaccio <yurand2000@...il.com>
Signed-off-by: luca abeni <luca.abeni@...tannapisa.it>
---
kernel/sched/core.c | 6 -----
kernel/sched/deadline.c | 51 +++++++++++++++++++++++++++++++++++++----
kernel/sched/rt.c | 25 +++++++++++++++++---
kernel/sched/sched.h | 2 +-
4 files changed, 70 insertions(+), 14 deletions(-)
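
A side note on the accounting touched below: dl_init_tg() turns the
(runtime, period) pair into a fixed-point utilization via to_ratio().
A minimal user-space sketch of that arithmetic (BW_SHIFT is 20 in
kernel/sched/sched.h; the kernel performs the division with div64_u64()):

	#include <stdint.h>
	#include <stdio.h>

	#define BW_SHIFT 20	/* fixed-point shift used for bandwidth */

	static uint64_t to_ratio(uint64_t period, uint64_t runtime)
	{
		if (period == 0)
			return 0;
		return (runtime << BW_SHIFT) / period;
	}

	int main(void)
	{
		/* 400ms every 1s -> 0.4 CPU: 419430 out of 1048576. */
		printf("%llu\n",
		       (unsigned long long)to_ratio(1000000000ULL,
						    400000000ULL));
		return 0;
	}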
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 3ac65c6af70..eb9de8c7b1f 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -9277,12 +9277,6 @@ cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
return &root_task_group.css;
}
- /* Do not allow cpu_cgroup hierachies with depth greater than 2. */
-#ifdef CONFIG_RT_GROUP_SCHED
- if (parent != &root_task_group)
- return ERR_PTR(-EINVAL);
-#endif
-
tg = sched_create_group(parent);
if (IS_ERR(tg))
return ERR_PTR(-ENOMEM);
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 7b131630743..e263abcdc04 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -406,11 +406,42 @@ int dl_check_tg(unsigned long total)
return 1;
}
-void dl_init_tg(struct sched_dl_entity *dl_se, u64 rt_runtime, u64 rt_period)
+static inline bool is_active_sched_group(struct task_group *tg)
{
+	struct task_group *child;
+	bool is_active = true;
+
+	/* If no child has runtime > 0, this is a leaf group, thus active. */
+	list_for_each_entry_rcu(child, &tg->children, siblings) {
+		if (child->dl_bandwidth.dl_runtime > 0) {
+			is_active = false;
+		}
+	}
+	return is_active;
+}
+
+static inline bool sched_group_has_active_siblings(struct task_group *tg)
+{
+	struct task_group *child;
+	bool has_active_siblings = false;
+
+	/* Check whether any sibling group has been given runtime > 0. */
+	list_for_each_entry_rcu(child, &tg->parent->children, siblings) {
+		if (child != tg && child->dl_bandwidth.dl_runtime > 0) {
+			has_active_siblings = true;
+		}
+	}
+	return has_active_siblings;
+}
+
+void dl_init_tg(struct task_group *tg, int cpu, u64 rt_runtime, u64 rt_period)
+{
+ struct sched_dl_entity *dl_se = tg->dl_se[cpu];
struct rq *rq = container_of(dl_se->dl_rq, struct rq, dl);
- int is_active;
- u64 new_bw;
+ int is_active, is_active_group;
+ u64 old_runtime, new_bw;
+
+ is_active_group = is_active_sched_group(tg);
raw_spin_rq_lock_irq(rq);
is_active = dl_se->my_q->rt.rt_nr_running > 0;
@@ -418,8 +449,10 @@ void dl_init_tg(struct sched_dl_entity *dl_se, u64 rt_runtime, u64 rt_period)
update_rq_clock(rq);
dl_server_stop(dl_se);
+ old_runtime = dl_se->dl_runtime;
new_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime);
- dl_rq_change_utilization(rq, dl_se, new_bw);
+ if (is_active_group)
+ dl_rq_change_utilization(rq, dl_se, new_bw);
dl_se->dl_runtime = rt_runtime;
dl_se->dl_deadline = rt_period;
@@ -431,6 +464,16 @@ void dl_init_tg(struct sched_dl_entity *dl_se, u64 rt_runtime, u64 rt_period)
dl_se->dl_bw = new_bw;
dl_se->dl_density = new_bw;
+	/* Add or remove the parent's bandwidth as this group becomes inactive or active. */
+	if (tg->parent && tg->parent != &root_task_group) {
+		if (rt_runtime == 0 && old_runtime != 0 && !sched_group_has_active_siblings(tg)) {
+			__add_rq_bw(tg->parent->dl_se[cpu]->dl_bw, dl_se->dl_rq);
+		} else if (rt_runtime != 0 && old_runtime == 0 && !sched_group_has_active_siblings(tg)) {
+			__sub_rq_bw(tg->parent->dl_se[cpu]->dl_bw, dl_se->dl_rq);
+		}
+	}
+
if (is_active)
dl_server_start(dl_se);
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 75a6860c2e2..29b51251fdc 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -107,7 +107,8 @@ void free_rt_sched_group(struct task_group *tg)
* Fix this issue by changing the group runtime
* to 0 immediately before freeing it.
*/
- dl_init_tg(tg->dl_se[i], 0, tg->dl_se[i]->dl_period);
+ if (tg->dl_se[i]->dl_runtime)
+ dl_init_tg(tg, i, 0, tg->dl_se[i]->dl_period);
raw_spin_rq_lock_irqsave(cpu_rq(i), flags);
BUG_ON(tg->rt_rq[i]->rt_nr_running);
raw_spin_rq_unlock_irqrestore(cpu_rq(i), flags);
@@ -2122,6 +2123,14 @@ static int tg_set_rt_bandwidth(struct task_group *tg,
static DEFINE_MUTEX(rt_constraints_mutex);
int i, err = 0;
+	/*
+	 * Do not allow setting an RT runtime > 0 if the parent group has
+	 * RT tasks (and is not the root group).
+	 */
+	if (rt_runtime && tg != &root_task_group &&
+	    tg->parent != &root_task_group && tg_has_rt_tasks(tg->parent))
+		return -EINVAL;
+
/* No period doesn't make any sense. */
if (rt_period == 0)
return -EINVAL;
@@ -2145,7 +2154,7 @@ static int tg_set_rt_bandwidth(struct task_group *tg,
return 0;
for_each_possible_cpu(i) {
- dl_init_tg(tg->dl_se[i], rt_runtime, rt_period);
+ dl_init_tg(tg, i, rt_runtime, rt_period);
}
return 0;
@@ -2208,6 +2217,9 @@ static int sched_rt_global_constraints(void)
int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
{
+ struct task_group *child;
+ int can_attach = 1;
+
/* Allow executing in the root cgroup regardless of allowed bandwidth */
if (tg == &root_task_group)
return 1;
@@ -2216,7 +2228,14 @@ int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
if (rt_group_sched_enabled() && tg->dl_bandwidth.dl_runtime == 0)
return 0;
- return 1;
+	/* If any child has runtime > 0, RT tasks cannot be attached. */
+	list_for_each_entry_rcu(child, &tg->children, siblings) {
+		if (child->dl_bandwidth.dl_runtime)
+			can_attach = 0;
+	}
+
+ return can_attach;
}
#else /* !CONFIG_RT_GROUP_SCHED: */
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 4a1bbda3720..3dd2ede6d35 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -386,7 +386,7 @@ extern void dl_server_init(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq,
dl_server_pick_f pick_task);
extern void sched_init_dl_servers(void);
extern int dl_check_tg(unsigned long total);
-extern void dl_init_tg(struct sched_dl_entity *dl_se, u64 rt_runtime, u64 rt_period);
+extern void dl_init_tg(struct task_group *tg, int cpu, u64 rt_runtime, u64 rt_period);
extern void dl_server_update_idle_time(struct rq *rq,
struct task_struct *p);
--
2.50.1