Message-ID: <CAKfTPtCFAmh6JF3VEjBKtb7GAvQf0A30esedqwyXCOBEhq3Cxw@mail.gmail.com>
Date: Fri, 28 Nov 2025 14:55:36 +0100
From: Vincent Guittot <vincent.guittot@...aro.org>
To: Fernand Sieber <sieberf@...zon.com>
Cc: mingo@...hat.com, peterz@...radead.org, linux-kernel@...r.kernel.org,
juri.lelli@...hat.com, dietmar.eggemann@....com, rostedt@...dmis.org,
bsegall@...gle.com, mgorman@...e.de, vschneid@...hat.com,
kprateek.nayak@....com, dwmw@...zon.co.uk, jschoenh@...zon.de,
liuyuxua@...zon.com, abusse@...zon.com, gmazz@...zon.com, rkagan@...zon.com
Subject: Re: [PATCH] sched/fair: Force idle aware load balancing
On Thu, 27 Nov 2025 at 21:28, Fernand Sieber <sieberf@...zon.com> wrote:
>
> Account for the capacity wasted by force idle when computing whether a
> group is idle or overloaded. We use a rather crude mechanism based on the
> current force idle state of the rq. It may be preferable to use a decaying
> average, similar to other load metrics, to avoid jitter.
>
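
For illustration only, such a decaying average could look roughly like the
untested sketch below; the names (forceidle_avg, forceidle_avg_update), the
fixed point scale and the 1/8 decay factor are invented here and are not
part of the patch:

#include <stdbool.h>

#define FORCEIDLE_SCALE	1024U	/* fixed point: 1024 == always force idle */

struct forceidle_avg {
	unsigned int avg;	/* decayed force idle ratio, 0..FORCEIDLE_SCALE */
};

/* Feed the instantaneous state (e.g. rq_in_forceidle()) on each update. */
static inline void forceidle_avg_update(struct forceidle_avg *fa, bool forceidle)
{
	unsigned int sample = forceidle ? FORCEIDLE_SCALE : 0;

	/*
	 * new = 7/8 * old + 1/8 * sample, a simple geometric decay;
	 * integer truncation leaves small residues, fine for a sketch.
	 */
	fa->avg = fa->avg - (fa->avg >> 3) + (sample >> 3);
}
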
> If the busiest group has force idle, treat the balance as a task migration.
> This way we try to move one task regardless of the load. There are still
> subsequent checks later on to verify that this doesn't cause more force
> idle on the destination.
>
> ===
>
> Testing
>
> Testing aims at measuring perceived guest noise on a hypervisor system
> in time shared scenarios.
>
> The setup is a system where the load is nearing 100%, which should
> result in no steal time. The system has 64 CPUs and 8 VMs, each VM using
> core scheduling with 8 vCPUs, time shared.
>
> 7 VMs run stressors (`stress-ng --cpu 0`) while the last VM runs the
> hwlat tracer with a width of 100ms, a period of 300ms, and a threshold
> of 100us. Each VM also runs a cookied non-vCPU VMM process that adds a
> light level of noise, which forces some level of load balancing.
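
As a side note for reproducibility, the hwlat parameters above map onto the
tracefs knobs (values in usecs), if I recall the knob names correctly. The
untested helper below only makes that mapping concrete and assumes tracefs
is mounted at /sys/kernel/tracing:

#include <stdio.h>

static int write_knob(const char *path, const char *val)
{
	FILE *f = fopen(path, "w");

	if (!f) {
		perror(path);
		return -1;
	}
	fprintf(f, "%s\n", val);
	return fclose(f);
}

int main(void)
{
	write_knob("/sys/kernel/tracing/hwlat_detector/width", "100000");	/* 100ms sample */
	write_knob("/sys/kernel/tracing/hwlat_detector/window", "300000");	/* 300ms period */
	write_knob("/sys/kernel/tracing/tracing_thresh", "100");		/* 100us threshold */
	write_knob("/sys/kernel/tracing/current_tracer", "hwlat");
	return 0;
}
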
>
> The test scenario is run 10x60s and the average noise is measured.
>
> At baseline, we measure about 1.20% of noise (computed from hwlat
> breaches). With the proposed patch, the noise drops to 0.63%.
>
> Signed-off-by: Fernand Sieber <sieberf@...zon.com>
> ---
> kernel/sched/fair.c | 40 +++++++++++++++++++++++++++-------------
> kernel/sched/sched.h | 12 ++++++++++++
> 2 files changed, 39 insertions(+), 13 deletions(-)
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 5b752324270b..ab8c9aa09107 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -9932,6 +9932,7 @@ struct sg_lb_stats {
> unsigned int nr_numa_running;
> unsigned int nr_preferred_running;
> #endif
> + unsigned int forceidle_weight;
> };
>
> /*
> @@ -10135,15 +10136,15 @@ static inline int sg_imbalanced(struct sched_group *group)
> static inline bool
> group_has_capacity(unsigned int imbalance_pct, struct sg_lb_stats *sgs)
> {
> - if (sgs->sum_nr_running < sgs->group_weight)
> + if (sgs->sum_nr_running < (sgs->group_weight - sgs->forceidle_weight))
> return true;
>
> - if ((sgs->group_capacity * imbalance_pct) <
> - (sgs->group_runnable * 100))
> + if ((sgs->group_capacity * imbalance_pct * (sgs->group_weight - sgs->forceidle_weight)) <
> + (sgs->group_runnable * 100 * sgs->group_weight))
So you apply a ratio to the group capacity based on the number of forced
idle CPUs, but what about heterogeneous systems?
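
On asymmetric capacity systems a forced idle big CPU removes more capacity
than a forced idle little one, so counting CPUs may not be enough. Just to
illustrate that concern with an untested, standalone sketch (all names
invented, not a proposal from this thread), the accounting could be done in
capacity units instead of CPU counts:

struct sg_stats_example {
	unsigned long group_capacity;		/* sum of capacity_of() over the group */
	unsigned long forceidle_capacity;	/* capacity of the forced idle CPUs */
	unsigned long group_runnable;
};

/* Same shape as the existing runnable check, but in capacity units. */
static inline int group_overloaded_on_runnable(const struct sg_stats_example *sgs,
					       unsigned int imbalance_pct)
{
	unsigned long usable = sgs->group_capacity - sgs->forceidle_capacity;

	return usable * imbalance_pct < sgs->group_runnable * 100;
}
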
> return false;
>
> - if ((sgs->group_capacity * 100) >
> - (sgs->group_util * imbalance_pct))
> + if ((sgs->group_capacity * 100 * (sgs->group_weight - sgs->forceidle_weight)) >
> + (sgs->group_util * imbalance_pct * sgs->group_weight))
> return true;
>
> return false;
> @@ -10160,15 +10161,15 @@ group_has_capacity(unsigned int imbalance_pct, struct sg_lb_stats *sgs)
> static inline bool
> group_is_overloaded(unsigned int imbalance_pct, struct sg_lb_stats *sgs)
> {
> - if (sgs->sum_nr_running <= sgs->group_weight)
> + if (sgs->sum_nr_running <= (sgs->group_weight - sgs->forceidle_weight))
> return false;
>
> - if ((sgs->group_capacity * 100) <
> - (sgs->group_util * imbalance_pct))
> + if ((sgs->group_capacity * 100 * (sgs->group_weight - sgs->forceidle_weight)) <
> + (sgs->group_util * imbalance_pct * sgs->group_weight))
> return true;
>
> - if ((sgs->group_capacity * imbalance_pct) <
> - (sgs->group_runnable * 100))
> + if ((sgs->group_capacity * imbalance_pct * (sgs->group_weight - sgs->forceidle_weight)) <
> + (sgs->group_runnable * 100 * sgs->group_weight))
> return true;
>
> return false;
> @@ -10371,13 +10372,19 @@ static inline void update_sg_lb_stats(struct lb_env *env,
> nr_running = rq->nr_running;
> sgs->sum_nr_running += nr_running;
>
> + /*
> + * Ignore force idle if we are balancing within the SMT mask
> + */
> + if (rq_in_forceidle(rq) && !(env->sd->flags & SD_SHARE_CPUCAPACITY))
> + sgs->forceidle_weight++;
> +
> if (cpu_overutilized(i))
> *sg_overutilized = 1;
>
> /*
> * No need to call idle_cpu() if nr_running is not 0
> */
> - if (!nr_running && idle_cpu(i)) {
> + if (!rq_in_forceidle(rq) && !nr_running && idle_cpu(i)) {
> sgs->idle_cpus++;
> /* Idle cpu can't have misfit task */
> continue;
> @@ -10691,10 +10698,16 @@ static inline void update_sg_wakeup_stats(struct sched_domain *sd,
> nr_running = rq->nr_running - local;
> sgs->sum_nr_running += nr_running;
>
> + /*
> + * Ignore force idle if we are balancing within the SMT mask
> + */
> + if (rq_in_forceidle(rq) && !(sd->flags & SD_SHARE_CPUCAPACITY))
> + sgs->forceidle_weight++;
> +
> /*
> * No need to call idle_cpu_without() if nr_running is not 0
> */
> - if (!nr_running && idle_cpu_without(i, p))
> + if (!rq_in_forceidle(rq) && !nr_running && idle_cpu_without(i, p))
> sgs->idle_cpus++;
>
> /* Check if task fits in the CPU */
> @@ -11123,7 +11136,8 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
> return;
> }
>
> - if (busiest->group_type == group_smt_balance) {
> + if (busiest->group_type == group_smt_balance ||
> + busiest->forceidle_weight) {
> /* Reduce number of tasks sharing CPU capacity */
> env->migration_type = migrate_task;
> env->imbalance = 1;
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index adfb6e3409d7..fdee101b1a66 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -1468,6 +1468,13 @@ static inline bool sched_core_enqueued(struct task_struct *p)
> return !RB_EMPTY_NODE(&p->core_node);
> }
>
> +static inline bool rq_in_forceidle(struct rq *rq)
> +{
> + return rq->core->core_forceidle_count > 0 &&
> + rq->nr_running &&
> + rq->curr == rq->idle;
> +}
> +
> extern void sched_core_enqueue(struct rq *rq, struct task_struct *p);
> extern void sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags);
>
> @@ -1513,6 +1520,11 @@ static inline bool sched_group_cookie_match(struct rq *rq,
> return true;
> }
>
> +static inline bool rq_in_forceidle(struct rq *rq)
> +{
> + return false;
> +}
> +
> #endif /* !CONFIG_SCHED_CORE */
>
> #ifdef CONFIG_RT_GROUP_SCHED
> --
> 2.43.0
>
>
>
>
> Amazon Development Centre (South Africa) (Proprietary) Limited
> 29 Gogosoa Street, Observatory, Cape Town, Western Cape, 7925, South Africa
> Registration Number: 2004 / 034463 / 07
>