Message-ID: <20251119124449.1149616-11-sshegde@linux.ibm.com>
Date: Wed, 19 Nov 2025 18:14:42 +0530
From: Shrikanth Hegde <sshegde@...ux.ibm.com>
To: linux-kernel@...r.kernel.org, linuxppc-dev@...ts.ozlabs.org
Cc: sshegde@...ux.ibm.com, mingo@...hat.com, peterz@...radead.org,
juri.lelli@...hat.com, vincent.guittot@...aro.org, tglx@...utronix.de,
yury.norov@...il.com, maddy@...ux.ibm.com, srikar@...ux.ibm.com,
gregkh@...uxfoundation.org, pbonzini@...hat.com, seanjc@...gle.com,
kprateek.nayak@....com, vschneid@...hat.com, iii@...ux.ibm.com,
huschle@...ux.ibm.com, rostedt@...dmis.org, dietmar.eggemann@....com,
christophe.leroy@...roup.eu
Subject: [PATCH 10/17] sched/core: Push current task from paravirt CPU
Actively push out RT/CFS tasks running on a paravirt CPU. Since the task
is running on the CPU, we need to stop the CPU and push the task out.
However, if the task is pinned only to paravirt CPUs, it will continue
running there.
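
For context, a minimal sketch of the accessors this patch relies on.
cpu_paravirt() and cpu_paravirt_mask match the usage below; the
paravirt_set_cpu() setter name is a hypothetical illustration (the real
helpers are presumably introduced earlier in this series):

/*
 * Sketch only: accessors over __cpu_paravirt_mask, which this patch
 * exports. paravirt_set_cpu() is an assumed name for illustration.
 */
#define cpu_paravirt_mask ((const struct cpumask *)&__cpu_paravirt_mask)

static inline bool cpu_paravirt(int cpu)
{
	return cpumask_test_cpu(cpu, cpu_paravirt_mask);
}

static inline void paravirt_set_cpu(int cpu, bool contended)
{
	if (contended)
		cpumask_set_cpu(cpu, &__cpu_paravirt_mask);
	else
		cpumask_clear_cpu(cpu, &__cpu_paravirt_mask);
}
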
Though the code is almost the same as __balance_push_cpu_stop() and quite
close to push_cpu_stop(), it provides a cleaner implementation w.r.t. the
PARAVIRT config.

Add a push_task_work_done flag to protect the pv_push_task_work buffer.

This currently works only for FAIR and RT.
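
As an illustrative sketch of the race the flag closes (assuming a second
tick can arrive on the same CPU before the queued stopper has run):

/*
 * tick 1: push_current_from_paravirt_cpu()
 *           -> sets rq->push_task_work_done, then
 *              stop_one_cpu_nowait(cpu, paravirt_push_cpu_stop, p,
 *                                  &pv_push_task_work)
 * tick 2: arrives before the stopper thread has run; without the flag
 *         it would queue the same per-CPU cpu_stop_work again while it
 *         is still pending on the stopper's list.
 *
 * The flag is set under the rq lock before queueing and cleared by
 * paravirt_push_cpu_stop() once it runs, so the second tick bails out.
 */
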
Signed-off-by: Shrikanth Hegde <sshegde@...ux.ibm.com>
---
kernel/sched/core.c | 83 ++++++++++++++++++++++++++++++++++++++++++++
kernel/sched/sched.h | 9 +++++
2 files changed, 92 insertions(+)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 73d1d49a3c72..65c247c24191 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5521,6 +5521,10 @@ void sched_tick(void)
unsigned long hw_pressure;
u64 resched_latency;
+ /* Push the current task out if this is a paravirt CPU */
+ if (cpu_paravirt(cpu))
+ push_current_from_paravirt_cpu(rq);
+
if (housekeeping_cpu(cpu, HK_TYPE_KERNEL_NOISE))
arch_scale_freq_tick();
@@ -10869,4 +10873,83 @@ void sched_change_end(struct sched_change_ctx *ctx)
#ifdef CONFIG_PARAVIRT
struct cpumask __cpu_paravirt_mask __read_mostly;
EXPORT_SYMBOL(__cpu_paravirt_mask);
+
+static DEFINE_PER_CPU(struct cpu_stop_work, pv_push_task_work);
+
+static int paravirt_push_cpu_stop(void *arg)
+{
+ struct task_struct *p = arg;
+ struct rq *rq = this_rq();
+ struct rq_flags rf;
+ int cpu;
+
+ raw_spin_lock_irq(&p->pi_lock);
+ rq_lock(rq, &rf);
+ rq->push_task_work_done = 0;
+
+ update_rq_clock(rq);
+
+ if (task_rq(p) == rq && task_on_rq_queued(p)) {
+ cpu = select_fallback_rq(rq->cpu, p);
+ rq = __migrate_task(rq, &rf, p, cpu);
+ }
+
+ rq_unlock(rq, &rf);
+ raw_spin_unlock_irq(&p->pi_lock);
+ put_task_struct(p);
+
+ return 0;
+}
+
+/*
+ * A CPU is marked as paravirt when there is contention for the
+ * underlying physical CPU, and using it will lead to hypervisor
+ * preemptions. It is better not to use such a CPU.
+ *
+ * If a task is scheduled on such a CPU, move it out: select_fallback_rq()
+ * will choose a non-paravirt CPU, and the task shouldn't come back.
+ */
+void push_current_from_paravirt_cpu(struct rq *rq)
+{
+ struct task_struct *push_task = rq->curr;
+ unsigned long flags;
+ struct rq_flags rf;
+
+ if (!cpu_paravirt(rq->cpu))
+ return;
+
+ /* Idle task can't be pushed out */
+ if (rq->curr == rq->idle)
+ return;
+
+ /* Only do this for SCHED_NORMAL and RT for now */
+ if (push_task->sched_class != &fair_sched_class &&
+ push_task->sched_class != &rt_sched_class)
+ return;
+
+ if (kthread_is_per_cpu(push_task) ||
+ is_migration_disabled(push_task))
+ return;
+
+ /* Is it affined only to paravirt CPUs? */
+ if (cpumask_subset(push_task->cpus_ptr, cpu_paravirt_mask))
+ return;
+
+ /* There is already a stopper thread for this. Don't race with it */
+ if (rq->push_task_work_done == 1)
+ return;
+
+ local_irq_save(flags);
+
+ get_task_struct(push_task);
+ schedstat_inc(push_task->stats.nr_migrations_paravirt);
+
+ rq_lock(rq, &rf);
+ rq->push_task_work_done = 1;
+ rq_unlock(rq, &rf);
+
+ stop_one_cpu_nowait(rq->cpu, paravirt_push_cpu_stop, push_task,
+ this_cpu_ptr(&pv_push_task_work));
+ local_irq_restore(flags);
+}
#endif
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index b419a4d98461..42984a65384c 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1214,6 +1214,9 @@ struct rq {
unsigned char nohz_idle_balance;
unsigned char idle_balance;
+#ifdef CONFIG_PARAVIRT
+ bool push_task_work_done;
+#endif
unsigned long misfit_task_load;
/* For active balancing */
@@ -4017,6 +4020,12 @@ extern bool dequeue_task(struct rq *rq, struct task_struct *p, int flags);
extern struct balance_callback *splice_balance_callbacks(struct rq *rq);
extern void balance_callbacks(struct rq *rq, struct balance_callback *head);
+#ifdef CONFIG_PARAVIRT
+extern void push_current_from_paravirt_cpu(struct rq *rq);
+#else
+static inline void push_current_from_paravirt_cpu(struct rq *rq) { }
+#endif
+
/*
* The 'sched_change' pattern is the safe, easy and slow way of changing a
* task's scheduling properties. It dequeues a task, such that the scheduler
--
2.47.3