Message-ID: <CAKfTPtDx3vVK1ZgBwicTeP82wL=wGOKdxheuBHCBjzM6mSDPOQ@mail.gmail.com>
Date: Thu, 27 Feb 2025 08:34:34 +0100
From: Vincent Guittot <vincent.guittot@...aro.org>
To: Fernand Sieber <sieberf@...zon.com>
Cc: Ingo Molnar <mingo@...hat.com>, Peter Zijlstra <peterz@...radead.org>,
Paolo Bonzini <pbonzini@...hat.com>, x86@...nel.org, kvm@...r.kernel.org,
linux-kernel@...r.kernel.org, nh-open-source@...zon.com
Subject: Re: [RFC PATCH 3/3] sched,x86: Make the scheduler guest unhalted aware
On Tue, 18 Feb 2025 at 21:27, Fernand Sieber <sieberf@...zon.com> wrote:
>
> With guest hlt/mwait/pause pass-through, the scheduler has no visibility into
> real vCPU activity, as it sees them all as 100% active. As such, load balancing
> cannot make informed decisions about where it is preferable to co-locate
> tasks when necessary. I.e., as far as the load balancer is concerned, a
> halted vCPU and an idle polling vCPU look exactly the same, so it may decide
> that either should be preempted when in reality it would be preferable to
> preempt the idle one.
>
> This commit enlightens the scheduler to real guest activity in this
> situation. Leveraging unhalted gtime, it adds a hook for KVM to communicate
> to the scheduler the duration that a vCPU spends halted. This is then used in
> PELT accounting to discount halted time from real activity, resulting in
> better placement and an overall reduction in steal time.
NAK. PELT accounts for the time an se spends on the CPU. If your thread/vcpu
does nothing but burn cycles, find another way to report that to
the host.

Furthermore, this breaks all the hierarchy dependencies.
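
For context, PELT tracks the time an entity actually occupies the CPU as a
geometrically decaying sum (y^32 = 0.5 over 1024us periods), so a vCPU that
polls in the guest and one doing real work accumulate the same utilization.
A minimal user-space sketch of that accounting (illustrative only, not the
kernel implementation; the decay constant is approximate):

/*
 * Illustrative model of PELT accumulation, not the kernel code.
 * Each 1024us period the running sum decays by y (y^32 == 0.5) and
 * the time spent on the CPU is added, so a guest vcpu that polls
 * instead of halting looks identical to one doing real work.
 */
#include <stdio.h>

static const double Y = 0.97857206;	/* y such that y^32 == 0.5 */

int main(void)
{
	double sum = 0.0;

	/* a vcpu occupying its CPU for 300 consecutive 1024us periods */
	for (int i = 0; i < 300; i++)
		sum = sum * Y + 1024.0;

	/* converges to 1024 / (1 - Y), i.e. "fully busy" */
	printf("steady-state sum: %.0f\n", sum);
	return 0;
}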
>
> This initial implementation assumes that non-idle CPUs are ticking, as it
> hooks the unhalted time into the PELT decaying load accounting. As such, it
> doesn't work well if PELT is updated infrequently with large chunks of
> halted time. This is not a fundamental limitation, but more complex
> accounting is needed to generalize the use case to nohz_full.
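
For intuition, the discount the pelt.c hunks below implement can be modeled
as a two-step update: first decay and accumulate only the active part of the
elapsed window, then run a decay-only pass over the halted remainder. A
simplified user-space sketch (my helper name and constants; real PELT decays
each contribution per 1024us period rather than in one shot):

#include <math.h>

#define PELT_PERIOD_US	1024.0
static const double Y = 0.97857206;	/* y such that y^32 == 0.5 */

/*
 * Hypothetical model of the patch's two-step update: halted time is
 * carved out of the window, so it decays the sum but never accumulates
 * as running time.
 */
double update_with_halt(double sum, double delta_us, double halted_us)
{
	double active_us = delta_us - halted_us;

	/* step 1: decay across the active span and accumulate it */
	sum = sum * pow(Y, active_us / PELT_PERIOD_US) + active_us;
	/* step 2: decay-only update across the halted span */
	return sum * pow(Y, halted_us / PELT_PERIOD_US);
}

With tick-sized windows the halted remainder stays small; with one large
halted chunk (nohz_full), step 2 decays the whole sum at once, which is why
the commit message above restricts this to ticking CPUs.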
> ---
> arch/x86/kvm/x86.c | 8 ++++++--
> include/linux/sched.h | 4 ++++
> kernel/sched/core.c | 1 +
> kernel/sched/fair.c | 25 +++++++++++++++++++++++++
> kernel/sched/pelt.c | 42 +++++++++++++++++++++++++++++++++++-------
> kernel/sched/sched.h | 2 ++
> 6 files changed, 73 insertions(+), 9 deletions(-)
>
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 46975b0a63a5..156cf05b325f 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -10712,6 +10712,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
> int r;
> unsigned long long cycles, cycles_start = 0;
> unsigned long long unhalted_cycles, unhalted_cycles_start = 0;
> + unsigned long long halted_cycles_ns = 0;
> bool req_int_win =
> dm_request_for_irq_injection(vcpu) &&
> kvm_cpu_accept_dm_intr(vcpu);
> @@ -11083,8 +11084,11 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
> cycles = get_cycles() - cycles_start;
> unhalted_cycles = get_unhalted_cycles() -
> unhalted_cycles_start;
> - if (likely(cycles > unhalted_cycles))
> - current->gtime_halted += cycles2ns(cycles - unhalted_cycles);
> + if (likely(cycles > unhalted_cycles)) {
> + halted_cycles_ns = cycles2ns(cycles - unhalted_cycles);
> + current->gtime_halted += halted_cycles_ns;
> + sched_account_gtime_halted(current, halted_cycles_ns);
> + }
> }
>
> local_irq_enable();
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 5f6745357e20..5409fac152c9 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -367,6 +367,8 @@ struct vtime {
> u64 gtime;
> };
>
> +extern void sched_account_gtime_halted(struct task_struct *p, u64 gtime_halted);
> +
> /*
> * Utilization clamp constraints.
> * @UCLAMP_MIN: Minimum utilization
> @@ -588,6 +590,8 @@ struct sched_entity {
> */
> struct sched_avg avg;
> #endif
> +
> + u64 gtime_halted;
> };
>
> struct sched_rt_entity {
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 9aecd914ac69..1f3ced2b2636 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -4487,6 +4487,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
> p->se.nr_migrations = 0;
> p->se.vruntime = 0;
> p->se.vlag = 0;
> + p->se.gtime_halted = 0;
> INIT_LIST_HEAD(&p->se.group_node);
>
> /* A delayed task cannot be in clone(). */
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 1c0ef435a7aa..5ff52711d459 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -13705,4 +13705,29 @@ __init void init_sched_fair_class(void)
> #endif
> #endif /* SMP */
>
> +
> }
> +
> +#ifdef CONFIG_NO_HZ_FULL
> +void sched_account_gtime_halted(struct task_struct *p, u64 gtime_halted)
> +{
> }
> +#else
> +/*
> + * The implementation hooking into PELT requires regular updates of
> + * gtime_halted. This is guaranteed unless we run on CONFIG_NO_HZ_FULL.
> + */
> +void sched_account_gtime_halted(struct task_struct *p, u64 gtime_halted)
> +{
> + struct sched_entity *se = &p->se;
> +
> + if (unlikely(!gtime_halted))
> + return;
> +
> + for_each_sched_entity(se) {
> + se->gtime_halted += gtime_halted;
> + se->cfs_rq->gtime_halted += gtime_halted;
> + }
> +}
> +#endif
> +EXPORT_SYMBOL(sched_account_gtime_halted);
> diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c
> index 7a8534a2deff..9f96b7c46c00 100644
> --- a/kernel/sched/pelt.c
> +++ b/kernel/sched/pelt.c
> @@ -305,10 +305,23 @@ int __update_load_avg_blocked_se(u64 now, struct sched_entity *se)
>
> int __update_load_avg_se(u64 now, struct cfs_rq *cfs_rq, struct sched_entity *se)
> {
> - if (___update_load_sum(now, &se->avg, !!se->on_rq, se_runnable(se),
> - cfs_rq->curr == se)) {
> + int ret = 0;
> + u64 delta = now - se->avg.last_update_time;
> + u64 gtime_halted = min(delta, se->gtime_halted);
>
> - ___update_load_avg(&se->avg, se_weight(se));
> + ret = ___update_load_sum(now - gtime_halted, &se->avg, !!se->on_rq, se_runnable(se),
> + cfs_rq->curr == se);
> +
> + if (gtime_halted) {
> + ret |= ___update_load_sum(now, &se->avg, 0, 0, 0);
> + se->gtime_halted -= gtime_halted;
> +
> + /* decay residual halted time */
> + if (ret && se->gtime_halted)
> + se->gtime_halted = decay_load(se->gtime_halted, delta / 1024);
> + }
> +
> + if (ret) {
> + ___update_load_avg(&se->avg, se_weight(se));
> cfs_se_util_change(&se->avg);
> trace_pelt_se_tp(se);
> return 1;
> @@ -319,10 +332,25 @@ int __update_load_avg_se(u64 now, struct cfs_rq *cfs_rq, struct sched_entity *se
>
> int __update_load_avg_cfs_rq(u64 now, struct cfs_rq *cfs_rq)
> {
> - if (___update_load_sum(now, &cfs_rq->avg,
> - scale_load_down(cfs_rq->load.weight),
> - cfs_rq->h_nr_runnable,
> - cfs_rq->curr != NULL)) {
> + int ret = 0;
> + u64 delta = now - cfs_rq->avg.last_update_time;
> + u64 gtime_halted = min(delta, cfs_rq->gtime_halted);
> +
> + ret = ___update_load_sum(now - gtime_halted, &cfs_rq->avg,
> + scale_load_down(cfs_rq->load.weight),
> + cfs_rq->h_nr_runnable,
> + cfs_rq->curr != NULL);
> +
> + if (gtime_halted) {
> + ret |= ___update_load_sum(now, &cfs_rq->avg, 0, 0, 0);
> + cfs_rq->gtime_halted -= gtime_halted;
> +
> + /* decay any residual halted time */
> + if (ret && cfs_rq->gtime_halted)
> + cfs_rq->gtime_halted = decay_load(cfs_rq->gtime_halted, delta / 1024);
> + }
> +
> + if (ret) {
>
> ___update_load_avg(&cfs_rq->avg, 1);
> trace_pelt_cfs_tp(cfs_rq);
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index b93c8c3dc05a..79b1166265bf 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -744,6 +744,8 @@ struct cfs_rq {
> struct list_head throttled_csd_list;
> #endif /* CONFIG_CFS_BANDWIDTH */
> #endif /* CONFIG_FAIR_GROUP_SCHED */
> +
> + u64 gtime_halted;
> };
>
> #ifdef CONFIG_SCHED_CLASS_EXT
> --
> 2.43.0
>
> Amazon Development Centre (South Africa) (Proprietary) Limited
> 29 Gogosoa Street, Observatory, Cape Town, Western Cape, 7925, South Africa
> Registration Number: 2004 / 034463 / 07
>