Message-ID: <20251128131954.324423-1-sieberf@amazon.com>
Date: Fri, 28 Nov 2025 15:19:54 +0200
From: Fernand Sieber <sieberf@...zon.com>
To: <mingo@...hat.com>, <peterz@...radead.org>, <vincent.guittot@...aro.org>
CC: <sieberf@...zon.com>, <abusse@...zon.de>, <dwmw@...zon.co.uk>,
<gmazz@...zon.de>, <jschoenh@...zon.de>, <kprateek.nayak@....com>,
<linux-kernel@...r.kernel.org>, <liuyuxua@...zon.com>, <rkagan@...zon.de>,
<vineethr@...ux.ibm.com>, <nh-open-source@...zon.com>
Subject: [PATCH] sched/core: Push tasks on force idle

When a CPU enters force idle, it will

1) try to steal cookie matching tasks from other CPUs
2) do the newidle balance

If the stealing fails, we are out of options to get out of force idle
properly. The newidle balance might decide to pull other tasks, but they
won't necessarily match the cookie anyway.

Introduce a step in between where we try to push the runnable tasks that
are blocked in force idle to a more suitable CPU.
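
For illustration, here is a simplified sketch of the resulting flow on a
force idled CPU. The function below is illustrative only and not part of
the patch; it mirrors sched_core_balance() from the diff, with the
locking and need_resched() checks omitted:

	static void sched_core_balance_sketch(struct rq *rq)
	{
		struct sched_domain *sd;
		int cpu = cpu_of(rq);

		/* 1) Existing step: try to pull a cookie matching task. */
		for_each_domain(cpu, sd)
			if (steal_cookie_task(cpu, sd))
				return;

		/*
		 * 2) New step: stealing failed, so try to push the runnable
		 *    tasks stuck behind the force idle to an idle CPU outside
		 *    this SMT core, where they are allowed to run.
		 */
		for_each_domain(cpu, sd)
			if (forceidle_push_tasks(cpu, sd))
				return;

		/* 3) The newidle balance still runs separately, as before. */
	}
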
=== Testing setup ===
Similar setup as in:
https://lore.kernel.org/lkml/20251127202719.963766-1-sieberf@amazon.com

Testing is aimed at measuring perceived guest noise on a hypervisor system
in time-shared scenarios.

The setup is a system where the load is nearing 100%, which should allow
no steal time. The system has 64 CPUs and runs 8 VMs, each VM using core
scheduling with 8 vCPUs, time-shared.

7 VMs are running stressors (`stress-ng --cpu 0`) while the last VM is
running the hwlat tracer with a width of 100ms, a period of 300ms, and
a threshold of 100us. Each VM runs a cookied non-vCPU VMM process that
adds a light level of noise, which forces some level of load balancing.

The test scenario is run 10 times for 60s each and the average noise is
measured (we use breaches, scaled up by period/width, to estimate noise).

=== Testing results ===
Baseline noise: 1.20%
After patch noise: 0.66% (-45%)
Signed-off-by: Fernand Sieber <sieberf@...zon.com>
---
kernel/sched/core.c | 88 +++++++++++++++++++++++++++++++++++++++++++-
kernel/sched/fair.c | 11 ++++++
kernel/sched/sched.h | 1 +
3 files changed, 98 insertions(+), 2 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index f754a60de848..852863eda8b8 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6338,6 +6338,81 @@ static bool steal_cookie_task(int cpu, struct sched_domain *sd)
return false;
}
+static bool forceidle_try_push_task(int this, int that)
+{
+ struct rq *dst = cpu_rq(that), *src = cpu_rq(this);
+ struct task_struct *p;
+ int cpu;
+ bool cookie_check = false;
+ bool success = false;
+ const struct sched_class *class;
+
+ if (!available_idle_cpu(that))
+ return false;
+
+ if (sched_core_enabled(dst)) {
+ for_each_cpu(cpu, cpu_smt_mask(that)) {
+ if (cpu == that)
+ continue;
+ if (!available_idle_cpu(cpu)) {
+ cookie_check = true;
+ break;
+ }
+ }
+ }
+
+ guard(irq)();
+ double_rq_lock(dst, src);
+
+ for_each_class(class) {
+ if (!class->select_next_task_push)
+ continue;
+
+ p = class->select_next_task_push(src, NULL);
+ while (p) {
+ if (!is_cpu_allowed(p, that))
+ goto next;
+
+ if (sched_task_is_throttled(p, that))
+ goto next;
+
+ if (cookie_check && dst->core->core_cookie != p->core_cookie)
+ goto next;
+
+ deactivate_task(src, p, 0);
+ set_task_cpu(p, that);
+ activate_task(dst, p, 0);
+ wakeup_preempt(dst, p, 0);
+
+ success = true;
+ break;
+
+next:
+ p = class->select_next_task_push(src, p);
+ }
+ }
+
+ double_rq_unlock(dst, src);
+ return success;
+}
+
+static bool forceidle_push_tasks(int cpu, struct sched_domain *sd)
+{
+ int i;
+
+ for_each_cpu_wrap(i, sched_domain_span(sd), cpu + 1) {
+ if (cpumask_test_cpu(i, cpu_smt_mask(cpu)))
+ continue;
+
+ if (need_resched())
+ break;
+
+ if (forceidle_try_push_task(cpu, i))
+ return true;
+ }
+ return false;
+}
+
static void sched_core_balance(struct rq *rq)
{
struct sched_domain *sd;
@@ -6349,11 +6424,20 @@ static void sched_core_balance(struct rq *rq)
raw_spin_rq_unlock_irq(rq);
for_each_domain(cpu, sd) {
if (need_resched())
- break;
+ goto out;
if (steal_cookie_task(cpu, sd))
- break;
+ goto out;
+ }
+ for_each_domain(cpu, sd) {
+ if (need_resched())
+ goto out;
+
+ if (forceidle_push_tasks(cpu, sd))
+ goto out;
}
+
+out:
raw_spin_rq_lock_irq(rq);
}
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 7c86a67762d1..a50cec23458c 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -13113,6 +13113,16 @@ static int task_is_throttled_fair(struct task_struct *p, int cpu)
#endif
return throttled_hierarchy(cfs_rq);
}
+
+static struct task_struct *select_next_task_push_fair(struct rq *rq, struct task_struct *p)
+{
+ p = list_prepare_entry(p, &rq->cfs_tasks, se.group_node);
+ list_for_each_entry_continue_reverse(p, &rq->cfs_tasks, se.group_node) {
+ return p;
+ }
+ return NULL;
+}
+
#else /* !CONFIG_SCHED_CORE: */
static inline void task_tick_core(struct rq *rq, struct task_struct *curr) {}
#endif /* !CONFIG_SCHED_CORE */
@@ -13674,6 +13684,7 @@ DEFINE_SCHED_CLASS(fair) = {
#ifdef CONFIG_SCHED_CORE
.task_is_throttled = task_is_throttled_fair,
+ .select_next_task_push = select_next_task_push_fair,
#endif
#ifdef CONFIG_UCLAMP_TASK
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index fdee101b1a66..bdcea16fca54 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2477,6 +2477,7 @@ struct sched_class {
#ifdef CONFIG_SCHED_CORE
int (*task_is_throttled)(struct task_struct *p, int cpu);
+ struct task_struct* (*select_next_task_push)(struct rq *rq, struct task_struct *p);
#endif
};
--
2.43.0
Amazon Development Centre (South Africa) (Proprietary) Limited
29 Gogosoa Street, Observatory, Cape Town, Western Cape, 7925, South Africa
Registration Number: 2004 / 034463 / 07