Message-ID: <20250731105543.40832-21-yurand2000@gmail.com>
Date: Thu, 31 Jul 2025 12:55:38 +0200
From: Yuri Andriaccio <yurand2000@...il.com>
To: Ingo Molnar <mingo@...hat.com>,
Peter Zijlstra <peterz@...radead.org>,
Juri Lelli <juri.lelli@...hat.com>,
Vincent Guittot <vincent.guittot@...aro.org>,
Dietmar Eggemann <dietmar.eggemann@....com>,
Steven Rostedt <rostedt@...dmis.org>,
Ben Segall <bsegall@...gle.com>,
Mel Gorman <mgorman@...e.de>,
Valentin Schneider <vschneid@...hat.com>
Cc: linux-kernel@...r.kernel.org,
Luca Abeni <luca.abeni@...tannapisa.it>,
Yuri Andriaccio <yuri.andriaccio@...tannapisa.it>
Subject: [RFC PATCH v2 20/25] sched/rt: Add rt-cgroup migration
From: luca abeni <luca.abeni@...tannapisa.it>
When the runtime of an RT cgroup is exhausted, the scheduler checks for
another non-throttled runqueue of the same group and, if one is
available, migrates the tasks to it.

The bandwidth (runtime/period) assigned to a cgroup is replicated on
every core of the system; therefore, on an SMP system with M cores, the
total bandwidth available to the group is runtime/period multiplied by M.
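
As a rough illustration of the arithmetic (not part of this patch; the
helper name below is made up for the example), the total bandwidth made
available to a group could be expressed as:

    /*
     * Total group bandwidth, in percent of one CPU, under the per-CPU
     * replication described above.
     * Example: runtime=25ms, period=100ms, nr_cpus=4 -> 100, i.e. one
     * full CPU's worth of bandwidth for the whole group.
     */
    static unsigned int example_total_group_bw_pct(unsigned long runtime_us,
                                                   unsigned long period_us,
                                                   unsigned int nr_cpus)
    {
            return (unsigned int)((100ULL * runtime_us * nr_cpus) / period_us);
    }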
Co-developed-by: Alessio Balsini <a.balsini@...up.it>
Signed-off-by: Alessio Balsini <a.balsini@...up.it>
Co-developed-by: Andrea Parri <parri.andrea@...il.com>
Signed-off-by: Andrea Parri <parri.andrea@...il.com>
Co-developed-by: Yuri Andriaccio <yurand2000@...il.com>
Signed-off-by: Yuri Andriaccio <yurand2000@...il.com>
Signed-off-by: luca abeni <luca.abeni@...tannapisa.it>
---
kernel/sched/rt.c | 471 ++++++++++++++++++++++++++++++++++++++++++-
kernel/sched/sched.h | 5 +
2 files changed, 468 insertions(+), 8 deletions(-)
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 29b51251fdc..2fdb2657554 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -84,6 +85,8 @@ void init_rt_rq(struct rt_rq *rt_rq)
plist_head_init(&rt_rq->pushable_tasks);
}
+static void group_pull_rt_task(struct rt_rq *this_rt_rq);
+
#ifdef CONFIG_RT_GROUP_SCHED
void unregister_rt_sched_group(struct task_group *tg)
@@ -289,6 +292,45 @@ static inline void rt_queue_pull_task(struct rq *rq)
queue_balance_callback(rq, &per_cpu(rt_pull_head, rq->cpu), pull_rt_task);
}
+#ifdef CONFIG_RT_GROUP_SCHED
+static DEFINE_PER_CPU(struct balance_callback, rt_group_push_head);
+static DEFINE_PER_CPU(struct balance_callback, rt_group_pull_head);
+static void push_group_rt_tasks(struct rq *);
+static void pull_group_rt_tasks(struct rq *);
+
+static void rt_queue_push_from_group(struct rq *rq, struct rt_rq *rt_rq)
+{
+ BUG_ON(rt_rq == NULL);
+ BUG_ON(rt_rq->rq != rq);
+
+ if (rq->rq_to_push_from)
+ return;
+
+ rq->rq_to_push_from = container_of(rt_rq, struct rq, rt);
+ queue_balance_callback(rq, &per_cpu(rt_group_push_head, rq->cpu),
+ push_group_rt_tasks);
+}
+
+static void rt_queue_pull_to_group(struct rq *rq, struct rt_rq *rt_rq)
+{
+ struct sched_dl_entity *dl_se = dl_group_of(rt_rq);
+
+ BUG_ON(rt_rq == NULL);
+ BUG_ON(!is_dl_group(rt_rq));
+ BUG_ON(rt_rq->rq != rq);
+
+ if (dl_se->dl_throttled || rq->rq_to_pull_to)
+ return;
+
+ rq->rq_to_pull_to = container_of(rt_rq, struct rq, rt);
+ queue_balance_callback(rq, &per_cpu(rt_group_pull_head, rq->cpu),
+ pull_group_rt_tasks);
+}
+#else
+static inline void rt_queue_push_from_group(struct rq *rq, struct rt_rq *rt_rq) { }
+static inline void rt_queue_pull_to_group(struct rq *rq, struct rt_rq *rt_rq) { }
+#endif
+
static void enqueue_pushable_task(struct rt_rq *rt_rq, struct task_struct *p)
{
plist_del(&p->pushable_tasks, &rt_rq->pushable_tasks);
@@ -1277,6 +1319,8 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
*/
static int push_rt_task(struct rq *rq, bool pull)
{
+ BUG_ON(is_dl_group(&rq->rt));
+
struct task_struct *next_task;
struct rq *lowest_rq;
int ret = 0;
@@ -1573,6 +1617,8 @@ void rto_push_irq_work_func(struct irq_work *work)
static void pull_rt_task(struct rq *this_rq)
{
+ BUG_ON(is_dl_group(&this_rq->rt));
+
int this_cpu = this_rq->cpu, cpu;
bool resched = false;
struct task_struct *p, *push_task;
@@ -1683,27 +1729,436 @@ static void pull_rt_task(struct rq *this_rq)
}
#ifdef CONFIG_RT_GROUP_SCHED
-static int group_push_rt_task(struct rt_rq *rt_rq)
+/*
+ * Find the lowest priority runqueue among the runqueues of the same
+ * task group. Unlike find_lowest_rq(), the CPU returned here is not
+ * necessarily running tasks from this group's runqueue.
+ */
+static int group_find_lowest_rt_rq(struct task_struct *task, struct rt_rq *task_rt_rq)
+{
+ struct sched_domain *sd;
+ struct cpumask mask, *lowest_mask = &mask;
+ struct sched_dl_entity *dl_se;
+ struct rt_rq *rt_rq;
+ int prio, lowest_prio;
+ int cpu, this_cpu = smp_processor_id();
+
+ BUG_ON(task->sched_task_group != task_rt_rq->tg);
+
+ if (task->nr_cpus_allowed == 1)
+ return -1; /* No other targets possible */
+
+ lowest_prio = task->prio - 1;
+ cpumask_clear(lowest_mask);
+ for_each_cpu_and(cpu, cpu_online_mask, task->cpus_ptr) {
+ dl_se = task_rt_rq->tg->dl_se[cpu];
+ rt_rq = &dl_se->my_q->rt;
+ prio = rt_rq->highest_prio.curr;
+
+ /*
+ * Skip throttled runqueues. If we're on an asym system, also ensure
+ * we consider the different capacities of the CPUs when searching
+ * for the lowest_mask.
+ */
+ if (dl_se->dl_throttled || !rt_task_fits_capacity(task, cpu))
+ continue;
+
+ if (prio >= lowest_prio) {
+ if (prio > lowest_prio) {
+ cpumask_clear(lowest_mask);
+ lowest_prio = prio;
+ }
+
+ cpumask_set_cpu(cpu, lowest_mask);
+ }
+ }
+
+ if (cpumask_empty(lowest_mask))
+ return -1;
+
+ /*
+ * At this point we have built a mask of CPUs representing the
+ * lowest priority tasks in the system. Now we want to elect
+ * the best one based on our affinity and topology.
+ *
+ * We prioritize the last CPU that the task executed on since
+ * it is most likely cache-hot in that location.
+ */
+ cpu = task_cpu(task);
+ if (cpumask_test_cpu(cpu, lowest_mask))
+ return cpu;
+
+ /*
+ * Otherwise, we consult the sched_domains span maps to figure
+ * out which CPU is logically closest to our hot cache data.
+ */
+ if (!cpumask_test_cpu(this_cpu, lowest_mask))
+ this_cpu = -1; /* Skip this_cpu opt if not among lowest */
+
+ rcu_read_lock();
+ for_each_domain(cpu, sd) {
+ if (sd->flags & SD_WAKE_AFFINE) {
+ int best_cpu;
+
+ /*
+ * "this_cpu" is cheaper to preempt than a
+ * remote processor.
+ */
+ if (this_cpu != -1 &&
+ cpumask_test_cpu(this_cpu, sched_domain_span(sd))) {
+ rcu_read_unlock();
+ return this_cpu;
+ }
+
+ best_cpu = cpumask_any_and_distribute(lowest_mask,
+ sched_domain_span(sd));
+ if (best_cpu < nr_cpu_ids) {
+ rcu_read_unlock();
+ return best_cpu;
+ }
+ }
+ }
+ rcu_read_unlock();
+
+ /*
+ * And finally, if there were no matches within the domains
+ * just give the caller *something* to work with from the compatible
+ * locations.
+ */
+ if (this_cpu != -1)
+ return this_cpu;
+
+ cpu = cpumask_any_distribute(lowest_mask);
+ if (cpu < nr_cpu_ids)
+ return cpu;
+
+ return -1;
+}
+
+/*
+ * Find and lock the lowest priority runqueue among the runqueues
+ * of the same task group. Unlike find_lock_lowest_rq(), the CPU returned
+ * here is not necessarily running tasks from this group's runqueue.
+ */
+static struct rt_rq *group_find_lock_lowest_rt_rq(struct task_struct *task, struct rt_rq *rt_rq)
+{
+ struct rq *rq = rq_of_rt_rq(rt_rq);
+ struct rq *lowest_rq;
+ struct rt_rq *lowest_rt_rq = NULL;
+ struct sched_dl_entity *lowest_dl_se;
+ int tries, cpu;
+
+ BUG_ON(task->sched_task_group != rt_rq->tg);
+
+ for (tries = 0; tries < RT_MAX_TRIES; tries++) {
+ cpu = group_find_lowest_rt_rq(task, rt_rq);
+
+ if ((cpu == -1) || (cpu == rq->cpu))
+ break;
+
+ lowest_dl_se = rt_rq->tg->dl_se[cpu];
+ lowest_rt_rq = &lowest_dl_se->my_q->rt;
+ lowest_rq = cpu_rq(cpu);
+
+ if (lowest_rt_rq->highest_prio.curr <= task->prio) {
+ /*
+ * Target rq has tasks of equal or higher priority,
+ * retrying does not release any lock and is unlikely
+ * to yield a different result.
+ */
+ lowest_rt_rq = NULL;
+ break;
+ }
+
+ /* if the prio of this runqueue changed, try again */
+ if (double_lock_balance(rq, lowest_rq)) {
+ /*
+ * We had to unlock the run queue. In
+ * the mean time, task could have
+ * migrated already or had its affinity changed.
+ * Also make sure that it wasn't scheduled on its rq.
+ * It is possible the task was scheduled, set
+ * "migrate_disabled" and then got preempted, so we must
+ * check the task migration disable flag here too.
+ */
+ if (unlikely(is_migration_disabled(task) ||
+ lowest_dl_se->dl_throttled ||
+ !cpumask_test_cpu(lowest_rq->cpu, &task->cpus_mask) ||
+ task != pick_next_pushable_task(rt_rq))) {
+
+ double_unlock_balance(rq, lowest_rq);
+ lowest_rt_rq = NULL;
+ break;
+ }
+ }
+
+ /* If this rq is still suitable use it. */
+ if (lowest_rt_rq->highest_prio.curr > task->prio)
+ break;
+
+ /* try again */
+ double_unlock_balance(rq, lowest_rq);
+ lowest_rt_rq = NULL;
+ }
+
+ return lowest_rt_rq;
+}
+
+static int group_push_rt_task(struct rt_rq *rt_rq, bool pull)
{
+ BUG_ON(!is_dl_group(rt_rq));
+
struct rq *rq = rq_of_rt_rq(rt_rq);
+ struct task_struct *next_task;
+ struct rq *lowest_rq;
+ struct rt_rq *lowest_rt_rq;
+ int ret = 0;
+
+ if (!rt_rq->overloaded)
+ return 0;
+
+ next_task = pick_next_pushable_task(rt_rq);
+ if (!next_task)
+ return 0;
+
+retry:
+ if (is_migration_disabled(next_task)) {
+ struct task_struct *push_task = NULL;
+ int cpu;
+
+ if (!pull || rq->push_busy)
+ return 0;
+
+ /*
+ * If the current task does not belong to the same task group
+ * we cannot push it away.
+ */
+ if (rq->curr->sched_task_group != rt_rq->tg)
+ return 0;
+
+ /*
+ * Invoking group_find_lowest_rt_rq() on anything but an RT task doesn't
+ * make sense. Per the above priority check, curr has to
+ * be of higher priority than next_task, so no need to
+ * reschedule when bailing out.
+ *
+ * Note that the stoppers are masqueraded as SCHED_FIFO
+ * (cf. sched_set_stop_task()), so we can't rely on rt_task().
+ */
+ if (rq->curr->sched_class != &rt_sched_class)
+ return 0;
+
+ cpu = group_find_lowest_rt_rq(rq->curr, rt_rq);
+ if (cpu == -1 || cpu == rq->cpu)
+ return 0;
+
+ /*
+ * Given we found a CPU with lower priority than @next_task,
+ * therefore it should be running. However we cannot migrate it
+ * to this other CPU, instead attempt to push the current
+ * running task on this CPU away.
+ */
+ push_task = get_push_task(rq);
+ if (push_task) {
+ preempt_disable();
+ raw_spin_rq_unlock(rq);
+ stop_one_cpu_nowait(rq->cpu, push_cpu_stop,
+ push_task, &rq->push_work);
+ preempt_enable();
+ raw_spin_rq_lock(rq);
+ }
- if (is_dl_group(rt_rq))
return 0;
+ }
+
+ if (WARN_ON(next_task == rq->curr))
+ return 0;
+
+ /* We might release rq lock */
+ get_task_struct(next_task);
+
+ /* group_find_lock_lowest_rq locks the rq if found */
+ lowest_rt_rq = group_find_lock_lowest_rt_rq(next_task, rt_rq);
+ if (!lowest_rt_rq) {
+ struct task_struct *task;
+ /*
+ * group_find_lock_lowest_rt_rq releases rq->lock
+ * so it is possible that next_task has migrated.
+ *
+ * We need to make sure that the task is still on the same
+ * run-queue and is also still the next task eligible for
+ * pushing.
+ */
+ task = pick_next_pushable_task(rt_rq);
+ if (task == next_task) {
+ /*
+ * The task hasn't migrated, and is still the next
+ * eligible task, but we failed to find a run-queue
+ * to push it to. Do not retry in this case, since
+ * other CPUs will pull from us when ready.
+ */
+ goto out;
+ }
+
+ if (!task)
+ /* No more tasks, just exit */
+ goto out;
+
+ /*
+ * Something has shifted, try again.
+ */
+ put_task_struct(next_task);
+ next_task = task;
+ goto retry;
+ }
+
+ lowest_rq = rq_of_rt_rq(lowest_rt_rq);
+
+ move_queued_task_locked(rq, lowest_rq, next_task);
+ resched_curr(lowest_rq);
+ ret = 1;
+
+ double_unlock_balance(rq, lowest_rq);
+out:
+ put_task_struct(next_task);
+
+ return ret;
+}
+
+static void group_pull_rt_task(struct rt_rq *this_rt_rq)
+{
+ BUG_ON(!is_dl_group(this_rt_rq));
+
+ struct rq *this_rq = rq_of_rt_rq(this_rt_rq);
+ int this_cpu = this_rq->cpu, cpu;
+ bool resched = false;
+ struct task_struct *p, *push_task = NULL;
+ struct rt_rq *src_rt_rq;
+ struct rq *src_rq;
+ struct sched_dl_entity *src_dl_se;
+
+ for_each_online_cpu(cpu) {
+ if (this_cpu == cpu)
+ continue;
- return push_rt_task(rq, false);
+ src_dl_se = this_rt_rq->tg->dl_se[cpu];
+ src_rt_rq = &src_dl_se->my_q->rt;
+
+ if (src_rt_rq->rt_nr_running <= 1 && !src_dl_se->dl_throttled)
+ continue;
+
+ src_rq = rq_of_rt_rq(src_rt_rq);
+
+ /*
+ * Don't bother taking the src_rq->lock if the next highest
+ * task is known to be lower-priority than our current task.
+ * This may look racy, but if this value is about to go
+ * logically higher, the src_rq will push this task away.
+ * And if it's going logically lower, we do not care
+ */
+ if (src_rt_rq->highest_prio.next >=
+ this_rt_rq->highest_prio.curr)
+ continue;
+
+ /*
+ * We can potentially drop this_rq's lock in
+ * double_lock_balance, and another CPU could
+ * alter this_rq
+ */
+ push_task = NULL;
+ double_lock_balance(this_rq, src_rq);
+
+ /*
+ * We can pull only a task, which is pushable
+ * on its rq, and no others.
+ */
+ p = pick_highest_pushable_task(src_rt_rq, this_cpu);
+
+ /*
+ * Do we have an RT task that preempts
+ * the to-be-scheduled task?
+ */
+ if (p && (p->prio < this_rt_rq->highest_prio.curr)) {
+ WARN_ON(p == src_rq->curr);
+ WARN_ON(!task_on_rq_queued(p));
+
+ /*
+ * There's a chance that p is higher in priority
+ * than what's currently running on its CPU.
+ * This is just that p is waking up and hasn't
+ * had a chance to schedule. We only pull
+ * p if it is lower in priority than the
+ * current task on the run queue
+ */
+ if (p->prio < src_rq->curr->prio)
+ goto skip;
+
+ if (is_migration_disabled(p)) {
+ /*
+ * If the current task does not belong to the same task group
+ * we cannot push it away.
+ */
+ if (src_rq->curr->sched_task_group != this_rt_rq->tg)
+ goto skip;
+
+ push_task = get_push_task(src_rq);
+ } else {
+ move_queued_task_locked(src_rq, this_rq, p);
+ resched = true;
+ }
+ /*
+ * We continue with the search, just in
+ * case there's an even higher prio task
+ * in another runqueue. (low likelihood
+ * but possible)
+ */
+ }
+skip:
+ double_unlock_balance(this_rq, src_rq);
+
+ if (push_task) {
+ preempt_disable();
+ raw_spin_rq_unlock(this_rq);
+ stop_one_cpu_nowait(src_rq->cpu, push_cpu_stop,
+ push_task, &src_rq->push_work);
+ preempt_enable();
+ raw_spin_rq_lock(this_rq);
+ }
+ }
+
+ if (resched)
+ resched_curr(this_rq);
}
static void group_push_rt_tasks(struct rt_rq *rt_rq)
{
- while (group_push_rt_task(rt_rq))
+ while (group_push_rt_task(rt_rq, false))
;
}
-#else
-static void group_push_rt_tasks(struct rt_rq *rt_rq)
+
+static void push_group_rt_tasks(struct rq *rq)
{
- push_rt_tasks(rq_of_rt_rq(rt_rq));
+ BUG_ON(rq->rq_to_push_from == NULL);
+
+ if ((rq->rq_to_push_from->rt.rt_nr_running > 1) ||
+ (dl_group_of(&rq->rq_to_push_from->rt)->dl_throttled == 1)) {
+ group_push_rt_task(&rq->rq_to_push_from->rt, false);
+ }
+
+ rq->rq_to_push_from = NULL;
}
-#endif
+
+static void pull_group_rt_tasks(struct rq *rq)
+{
+ BUG_ON(rq->rq_to_pull_to == NULL);
+ BUG_ON(rq->rq_to_pull_to->rt.rq != rq);
+
+ group_pull_rt_task(&rq->rq_to_pull_to->rt);
+ rq->rq_to_pull_to = NULL;
+}
+#else /* CONFIG_RT_GROUP_SCHED */
+static void group_pull_rt_task(struct rt_rq *this_rt_rq) { }
+static void group_push_rt_tasks(struct rt_rq *rt_rq) { }
+#endif /* CONFIG_RT_GROUP_SCHED */
/*
* If we are not running and we are not going to reschedule soon, we should
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 3dd2ede6d35..10e29f37f9b 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1282,6 +1282,11 @@ struct rq {
call_single_data_t cfsb_csd;
struct list_head cfsb_csd_list;
#endif
+
+#ifdef CONFIG_RT_GROUP_SCHED
+ struct rq *rq_to_push_from;
+ struct rq *rq_to_pull_to;
+#endif
};
#ifdef CONFIG_FAIR_GROUP_SCHED
--
2.50.1