linux-kernel - Re: [PATCH] sched/fair: Force idle aware load balancing

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <CAKfTPtCFAmh6JF3VEjBKtb7GAvQf0A30esedqwyXCOBEhq3Cxw@mail.gmail.com>
Date: Fri, 28 Nov 2025 14:55:36 +0100
From: Vincent Guittot <vincent.guittot@...aro.org>
To: Fernand Sieber <sieberf@...zon.com>
Cc: mingo@...hat.com, peterz@...radead.org, linux-kernel@...r.kernel.org, 
	juri.lelli@...hat.com, dietmar.eggemann@....com, rostedt@...dmis.org, 
	bsegall@...gle.com, mgorman@...e.de, vschneid@...hat.com, 
	kprateek.nayak@....com, dwmw@...zon.co.uk, jschoenh@...zon.de, 
	liuyuxua@...zon.com, abusse@...zon.com, gmazz@...zon.com, rkagan@...zon.com
Subject: Re: [PATCH] sched/fair: Force idle aware load balancing

On Thu, 27 Nov 2025 at 21:28, Fernand Sieber <sieberf@...zon.com> wrote:
>
> Consider force idle wasted capacity when computing if a group is idle or
> overloaded. We use a rather crude mechanism based on the current force idle
> state of the rq. It may be preferable to use a decaying average, similar
> to other load metrics, to avoid jittering.
>
> If the busiest group has force idle, make it a task migration. This way we
> will try to move one task regardless of the load. There are still
> subsequent checks later on to verify that this doesn't cause more force
> idle on the destination.
>
> ===
>
> Testing
>
> Testing is aimed at measuring perceived guest noise on hypervisor system
> with time shared scenarios.
>
> Setup is on system where the load is nearing 100% which should allow no
> steal time. The system has 64 CPUs, with 8 VMs, each VM using core
> scheduling with 8 vCPUs per VM, time shared.
>
> 7 VMs are running stressors (`stress-ng --cpu 0`) while the last VM is
> running the hwlat tracer with a width of 100ms, a period of 300ms, and
> a threshold of 100us. Each VM runs a cookied non vCPU VMM process that
> adds a light level of noise which forces some level of load balancing.
>
> Signed-off-by: Fernand Sieber <sieberf@...zon.com>
>
> The test scenario is ran 10x60s and the average noise is measured.
>
> At baseline, we measure about 1.20% of noise (computed from hwlat
> breaches). With the proposed patch, the noise drops to 0.63%.
> ---
>  kernel/sched/fair.c  | 40 +++++++++++++++++++++++++++-------------
>  kernel/sched/sched.h | 12 ++++++++++++
>  2 files changed, 39 insertions(+), 13 deletions(-)
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 5b752324270b..ab8c9aa09107 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -9932,6 +9932,7 @@ struct sg_lb_stats {
>         unsigned int nr_numa_running;
>         unsigned int nr_preferred_running;
>  #endif
> +       unsigned int forceidle_weight;
>  };
>
>  /*
> @@ -10135,15 +10136,15 @@ static inline int sg_imbalanced(struct sched_group *group)
>  static inline bool
>  group_has_capacity(unsigned int imbalance_pct, struct sg_lb_stats *sgs)
>  {
> -       if (sgs->sum_nr_running < sgs->group_weight)
> +       if (sgs->sum_nr_running < (sgs->group_weight - sgs->forceidle_weight))
>                 return true;
>
> -       if ((sgs->group_capacity * imbalance_pct) <
> -                       (sgs->group_runnable * 100))
> +       if ((sgs->group_capacity * imbalance_pct * (sgs->group_weight - sgs->forceidle_weight)) <
> +                       (sgs->group_runnable * 100 * sgs->group_weight))

so you apply a ratio on group capacity based on the number of forced
idle but what if you have heterogeneous systems ?

>                 return false;
>
> -       if ((sgs->group_capacity * 100) >
> -                       (sgs->group_util * imbalance_pct))
> +       if ((sgs->group_capacity * 100 * (sgs->group_weight - sgs->forceidle_weight)) >
> +                       (sgs->group_util * imbalance_pct * sgs->group_weight))
>                 return true;
>
>         return false;
> @@ -10160,15 +10161,15 @@ group_has_capacity(unsigned int imbalance_pct, struct sg_lb_stats *sgs)
>  static inline bool
>  group_is_overloaded(unsigned int imbalance_pct, struct sg_lb_stats *sgs)
>  {
> -       if (sgs->sum_nr_running <= sgs->group_weight)
> +       if (sgs->sum_nr_running <= (sgs->group_weight - sgs->forceidle_weight))
>                 return false;
>
> -       if ((sgs->group_capacity * 100) <
> -                       (sgs->group_util * imbalance_pct))
> +       if ((sgs->group_capacity * 100 * (sgs->group_weight - sgs->forceidle_weight)) <
> +                       (sgs->group_util * imbalance_pct * sgs->group_weight))
>                 return true;
>
> -       if ((sgs->group_capacity * imbalance_pct) <
> -                       (sgs->group_runnable * 100))
> +       if ((sgs->group_capacity * imbalance_pct * (sgs->group_weight - sgs->forceidle_weight)) <
> +                       (sgs->group_runnable * 100 * sgs->group_weight))
>                 return true;
>
>         return false;
> @@ -10371,13 +10372,19 @@ static inline void update_sg_lb_stats(struct lb_env *env,
>                 nr_running = rq->nr_running;
>                 sgs->sum_nr_running += nr_running;
>
> +               /*
> +                * Ignore force idle if we are balancing within the SMT mask
> +                */
> +               if (rq_in_forceidle(rq) && !(env->sd->flags & SD_SHARE_CPUCAPACITY))
> +                       sgs->forceidle_weight++;
> +
>                 if (cpu_overutilized(i))
>                         *sg_overutilized = 1;
>
>                 /*
>                  * No need to call idle_cpu() if nr_running is not 0
>                  */
> -               if (!nr_running && idle_cpu(i)) {
> +               if (!rq_in_forceidle(rq) && !nr_running && idle_cpu(i)) {
>                         sgs->idle_cpus++;
>                         /* Idle cpu can't have misfit task */
>                         continue;
> @@ -10691,10 +10698,16 @@ static inline void update_sg_wakeup_stats(struct sched_domain *sd,
>                 nr_running = rq->nr_running - local;
>                 sgs->sum_nr_running += nr_running;
>
> +               /*
> +                * Ignore force idle if we are balancing within the SMT mask
> +                */
> +               if (rq_in_forceidle(rq) && !(sd->flags & SD_SHARE_CPUCAPACITY))
> +                       sgs->forceidle_weight++;
> +
>                 /*
>                  * No need to call idle_cpu_without() if nr_running is not 0
>                  */
> -               if (!nr_running && idle_cpu_without(i, p))
> +               if (!rq_in_forceidle(rq) && !nr_running && idle_cpu_without(i, p))
>                         sgs->idle_cpus++;
>
>                 /* Check if task fits in the CPU */
> @@ -11123,7 +11136,8 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
>                 return;
>         }
>
> -       if (busiest->group_type == group_smt_balance) {
> +       if (busiest->group_type == group_smt_balance ||
> +           busiest->forceidle_weight) {
>                 /* Reduce number of tasks sharing CPU capacity */
>                 env->migration_type = migrate_task;
>                 env->imbalance = 1;
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index adfb6e3409d7..fdee101b1a66 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -1468,6 +1468,13 @@ static inline bool sched_core_enqueued(struct task_struct *p)
>         return !RB_EMPTY_NODE(&p->core_node);
>  }
>
> +static inline bool rq_in_forceidle(struct rq *rq)
> +{
> +       return rq->core->core_forceidle_count > 0 &&
> +               rq->nr_running &&
> +               rq->curr == rq->idle;
> +}
> +
>  extern void sched_core_enqueue(struct rq *rq, struct task_struct *p);
>  extern void sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags);
>
> @@ -1513,6 +1520,11 @@ static inline bool sched_group_cookie_match(struct rq *rq,
>         return true;
>  }
>
> +static inline bool rq_in_forceidle(struct rq *rq)
> +{
> +       return false;
> +}
> +
>  #endif /* !CONFIG_SCHED_CORE */
>
>  #ifdef CONFIG_RT_GROUP_SCHED
> --
> 2.43.0
>
>
>
>
> Amazon Development Centre (South Africa) (Proprietary) Limited
> 29 Gogosoa Street, Observatory, Cape Town, Western Cape, 7925, South Africa
> Registration Number: 2004 / 034463 / 07
>