Message-ID: <20250605071412.139240-10-yurand2000@gmail.com>
Date: Thu,  5 Jun 2025 09:14:12 +0200
From: Yuri Andriaccio <yurand2000@...il.com>
To: Ingo Molnar <mingo@...hat.com>,
	Peter Zijlstra <peterz@...radead.org>,
	Juri Lelli <juri.lelli@...hat.com>,
	Vincent Guittot <vincent.guittot@...aro.org>,
	Dietmar Eggemann <dietmar.eggemann@....com>,
	Steven Rostedt <rostedt@...dmis.org>,
	Ben Segall <bsegall@...gle.com>,
	Mel Gorman <mgorman@...e.de>,
	Valentin Schneider <vschneid@...hat.com>
Cc: linux-kernel@...r.kernel.org,
	Luca Abeni <luca.abeni@...tannapisa.it>,
	Yuri Andriaccio <yuri.andriaccio@...tannapisa.it>
Subject: [RFC PATCH 9/9] sched/deadline: Allow deeper hierarchies of RT cgroups

From: luca abeni <luca.abeni@...tannapisa.it>

Allow the creation of cgroup hierarchies with depth greater than two.
Add a check to prevent attaching tasks to a child cgroup of an active
cgroup (i.e. one with a running FIFO/RR task).
Add a check to prevent attaching tasks to cgroups which have children
with non-zero runtime.
Update the allocated-bandwidth accounting of rt-cgroups for nested
cgroup hierarchies: when a child group is given non-zero runtime, the
parent stops being directly schedulable, so the parent's bandwidth is
removed from the runqueue accounting; it is added back when the runtime
of the last active child returns to zero.

Co-developed-by: Yuri Andriaccio <yurand2000@...il.com>
Signed-off-by: Yuri Andriaccio <yurand2000@...il.com>
Signed-off-by: luca abeni <luca.abeni@...tannapisa.it>
---
 kernel/sched/core.c     |  6 ----
 kernel/sched/deadline.c | 69 ++++++++++++++++++++++++++++++++++-------
 kernel/sched/rt.c       | 25 +++++++++++++--
 kernel/sched/sched.h    |  2 +-
 kernel/sched/syscalls.c |  4 +++
 5 files changed, 84 insertions(+), 22 deletions(-)
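
Not for merging, for reviewers' convenience only: a minimal user-space
sketch of the configuration this series is meant to enable. It assumes
a cgroup-v1 "cpu" controller mounted at /sys/fs/cgroup/cpu and
CONFIG_RT_GROUP_SCHED=y; the group names and budget values are made up
for illustration (with a period of 1000000 us, a runtime of 250000 us
reserves 25% of a CPU, i.e. the fraction computed by
to_ratio(1000000, 250000)).

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/stat.h>

#define CPU_CG "/sys/fs/cgroup/cpu"

static void make_group(const char *group)
{
	char path[256];

	snprintf(path, sizeof(path), CPU_CG "/%s", group);
	if (mkdir(path, 0755) && errno != EEXIST) {
		perror(path);
		exit(EXIT_FAILURE);
	}
}

static void set_rt_runtime(const char *group, long runtime_us)
{
	char path[256];
	FILE *f;

	snprintf(path, sizeof(path), CPU_CG "/%s/cpu.rt_runtime_us", group);
	f = fopen(path, "w");
	if (!f || fprintf(f, "%ld\n", runtime_us) < 0 || fclose(f)) {
		perror(path);
		exit(EXIT_FAILURE);
	}
}

int main(void)
{
	/* Three levels below the root: rejected by cpu_cgroup_css_alloc()
	 * before this patch. */
	make_group("rt-a");
	make_group("rt-a/rt-b");
	make_group("rt-a/rt-b/rt-c");

	/* Budgets are assigned top-down; each child's bandwidth is
	 * accounted against its parent's allocation. */
	set_rt_runtime("rt-a", 500000);
	set_rt_runtime("rt-a/rt-b", 250000);
	set_rt_runtime("rt-a/rt-b/rt-c", 100000);

	/* From here on, attaching a FIFO/RR task to rt-a should be
	 * refused (its child rt-b has non-zero runtime), while rt-c,
	 * as a leaf, can receive RT tasks. */
	return 0;
}

Assigning the budgets bottom-up, or giving rt-b non-zero runtime while
rt-a already runs FIFO/RR tasks, is expected to fail with -EINVAL.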

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 9c8bc9728..c02cdeccf 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -9127,12 +9127,6 @@ cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
 		return &root_task_group.css;
 	}
 
-	/* Do not allow cpu_cgroup hierachies with depth greater than 2. */
-#ifdef CONFIG_RT_GROUP_SCHED
-	if (parent != &root_task_group)
-		return ERR_PTR(-EINVAL);
-#endif
-
 	tg = sched_create_group(parent);
 	if (IS_ERR(tg))
 		return ERR_PTR(-ENOMEM);
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index b07abbb60..b405b0724 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -414,10 +414,39 @@ int dl_check_tg(unsigned long total)
 	return 1;
 }
 
-int dl_init_tg(struct sched_dl_entity *dl_se, u64 rt_runtime, u64 rt_period)
+static inline bool is_active_sched_group(struct task_group *tg)
 {
+	struct task_group *child;
+
+	/*
+	 * A group is active iff no child has runtime (a leaf is always active).
+	 */
+	list_for_each_entry_rcu(child, &tg->children, siblings) {
+		if (child->dl_bandwidth.dl_runtime > 0)
+			return false;
+	}
+	return true;
+}
+
+static inline bool sched_group_has_active_siblings(struct task_group *tg)
+{
+	struct task_group *child;
+
+	/*
+	 * Check whether any sibling of tg has non-zero runtime allocated.
+	 */
+	list_for_each_entry_rcu(child, &tg->parent->children, siblings) {
+		if (child != tg && child->dl_bandwidth.dl_runtime > 0)
+			return true;
+	}
+	return false;
+}
+
+int dl_init_tg(struct task_group *tg, int cpu, u64 rt_runtime, u64 rt_period)
+{
+	struct sched_dl_entity *dl_se = tg->dl_se[cpu];
 	struct rq *rq = container_of(dl_se->dl_rq, struct rq, dl);
-	int is_active;
+	int is_active, is_active_group;
 	u64 old_runtime;
 
 	/*
@@ -434,24 +463,40 @@ int dl_init_tg(struct sched_dl_entity *dl_se, u64 rt_runtime, u64 rt_period)
 	if (rt_period & (1ULL << 63))
 		return 0;
 
+	is_active_group = is_active_sched_group(tg);
+
 	raw_spin_rq_lock_irq(rq);
 	is_active = dl_se->my_q->rt.rt_nr_running > 0;
 	old_runtime = dl_se->dl_runtime;
 	dl_se->dl_runtime  = rt_runtime;
 	dl_se->dl_period   = rt_period;
 	dl_se->dl_deadline = dl_se->dl_period;
-	if (is_active) {
-		sub_running_bw(dl_se, dl_se->dl_rq);
-	} else if (dl_se->dl_non_contending) {
-		sub_running_bw(dl_se, dl_se->dl_rq);
-		dl_se->dl_non_contending = 0;
-		hrtimer_try_to_cancel(&dl_se->inactive_timer);
+	if (is_active_group) {
+		if (is_active) {
+			sub_running_bw(dl_se, dl_se->dl_rq);
+		} else if (dl_se->dl_non_contending) {
+			sub_running_bw(dl_se, dl_se->dl_rq);
+			dl_se->dl_non_contending = 0;
+			hrtimer_try_to_cancel(&dl_se->inactive_timer);
+		}
+		__sub_rq_bw(dl_se->dl_bw, dl_se->dl_rq);
+		dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime);
+		__add_rq_bw(dl_se->dl_bw, dl_se->dl_rq);
+	} else {
+		dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime);
+	}
+
+	/*
+	 * Re-add or remove the parent's bandwidth as tg deactivates/activates.
+	 */
+	if (tg->parent && tg->parent != &root_task_group) {
+		if (rt_runtime == 0 && old_runtime != 0 && !sched_group_has_active_siblings(tg))
+			__add_rq_bw(tg->parent->dl_se[cpu]->dl_bw, dl_se->dl_rq);
+		else if (rt_runtime != 0 && old_runtime == 0 && !sched_group_has_active_siblings(tg))
+			__sub_rq_bw(tg->parent->dl_se[cpu]->dl_bw, dl_se->dl_rq);
 	}
-	__sub_rq_bw(dl_se->dl_bw, dl_se->dl_rq);
-	dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime);
-	__add_rq_bw(dl_se->dl_bw, dl_se->dl_rq);
 
-	if (is_active)
+	if (is_active_group && is_active)
 		add_running_bw(dl_se, dl_se->dl_rq);
 
 	raw_spin_rq_unlock_irq(rq);
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index ce3320f12..225684450 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -106,7 +106,8 @@ void free_rt_sched_group(struct task_group *tg)
 			 * Fix this issue by changing the group runtime
 			 * to 0 immediately before freeing it.
 			 */
-			BUG_ON(!dl_init_tg(tg->dl_se[i], 0, tg->dl_se[i]->dl_period));
+			if (tg->dl_se[i]->dl_runtime)
+				BUG_ON(!dl_init_tg(tg, i, 0, tg->dl_se[i]->dl_period));
 			raw_spin_rq_lock_irqsave(cpu_rq(i), flags);
 			BUG_ON(tg->rt_rq[i]->rt_nr_running);
 			raw_spin_rq_unlock_irqrestore(cpu_rq(i), flags);
@@ -2197,6 +2198,14 @@ static int tg_set_rt_bandwidth(struct task_group *tg,
 {
 	int i, err = 0;
 
+	/*
+	 * Do not allow setting an RT runtime > 0 if the parent (other than
+	 * the root group) has RT tasks.
+	 */
+	if (rt_runtime && tg != &root_task_group &&
+	    tg->parent != &root_task_group && tg_has_rt_tasks(tg->parent))
+		return -EINVAL;
+
 	/* No period doesn't make any sense. */
 	if (rt_period == 0)
 		return -EINVAL;
@@ -2220,7 +2229,7 @@ static int tg_set_rt_bandwidth(struct task_group *tg,
 		goto unlock_bandwidth;
 
 	for_each_possible_cpu(i) {
-		if (!dl_init_tg(tg->dl_se[i], rt_runtime, rt_period)) {
+		if (!dl_init_tg(tg, i, rt_runtime, rt_period)) {
 			err = -EINVAL;
 			break;
 		}
@@ -2290,6 +2299,9 @@ static int sched_rt_global_constraints(void)
 
 int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
 {
+	struct task_group *child;
+	int can_attach = 1;
+
 	/* Allow executing in the root cgroup regardless of allowed bandwidth */
 	if (tg == &root_task_group)
 		return 1;
@@ -2298,7 +2310,14 @@ int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
 	if (rt_group_sched_enabled() && tg->dl_bandwidth.dl_runtime == 0)
 		return 0;
 
-	return 1;
+	/* If any child has runtime > 0, RT tasks cannot be attached. */
+	list_for_each_entry_rcu(child, &tg->children, siblings) {
+		if (child->dl_bandwidth.dl_runtime) {
+			can_attach = 0;
+			break;
+		}
+	}
+	return can_attach;
 }
 
 #else /* !CONFIG_RT_GROUP_SCHED */
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 686578666..fde133f9c 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -384,7 +384,7 @@ extern void dl_server_init(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq,
 		    dl_server_has_tasks_f has_tasks,
 		    dl_server_pick_f pick_task);
 int dl_check_tg(unsigned long total);
-int dl_init_tg(struct sched_dl_entity *dl_se, u64 rt_runtime, u64 rt_period);
+int dl_init_tg(struct task_group *tg, int cpu, u64 rt_runtime, u64 rt_period);
 
 extern void dl_server_update_idle_time(struct rq *rq,
 		    struct task_struct *p);
diff --git a/kernel/sched/syscalls.c b/kernel/sched/syscalls.c
index 45a38fe5e..7e5e6de92 100644
--- a/kernel/sched/syscalls.c
+++ b/kernel/sched/syscalls.c
@@ -630,6 +630,10 @@ int __sched_setscheduler(struct task_struct *p,
 
 	if (user) {
 #ifdef CONFIG_RT_GROUP_SCHED
+		if (dl_bandwidth_enabled() && rt_policy(policy) && !sched_rt_can_attach(task_group(p), p)) {
+			retval = -EPERM;
+			goto unlock;
+		}
 		/*
 		 * Do not allow real-time tasks into groups that have no runtime
 		 * assigned.
-- 
2.49.0

