Message-ID: <CAKfTPtDx3vVK1ZgBwicTeP82wL=wGOKdxheuBHCBjzM6mSDPOQ@mail.gmail.com>
Date: Thu, 27 Feb 2025 08:34:34 +0100
From: Vincent Guittot <vincent.guittot@...aro.org>
To: Fernand Sieber <sieberf@...zon.com>
Cc: Ingo Molnar <mingo@...hat.com>, Peter Zijlstra <peterz@...radead.org>,
Paolo Bonzini <pbonzini@...hat.com>, x86@...nel.org, kvm@...r.kernel.org,
linux-kernel@...r.kernel.org, nh-open-source@...zon.com
Subject: Re: [RFC PATCH 3/3] sched,x86: Make the scheduler guest unhalted aware
On Tue, 18 Feb 2025 at 21:27, Fernand Sieber <sieberf@...zon.com> wrote:
>
> With guest hlt/mwait/pause pass-through, the scheduler has no visibility into
> real vCPU activity, as it sees them all as 100% active. As such, load balancing
> cannot make informed decisions about where it is preferable to co-locate
> tasks when necessary. I.e., as far as the load balancer is concerned, a
> halted vCPU and an idle polling vCPU look exactly the same, so it may decide
> that either should be preempted when in reality it would be preferable to
> preempt the idle one.
>
> This commit enlightens the scheduler to real guest activity in this
> situation. Leveraging unhalted gtime, it adds a hook for KVM to communicate
> to the scheduler the duration that a vCPU spends halted. This is then used in
> PELT accounting to discount halted time from real activity, resulting in
> better placement and an overall reduction in steal time.
NAK. PELT accounts for the time an se spends on the CPU. If your thread/vcpu
does nothing but burn cycles, find another way to report that to
the host.

Furthermore, this breaks all the hierarchy dependencies.
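
For context, PELT tracks the time an entity actually occupies the CPU as a
geometrically decaying sum (y^32 = 0.5 over 1024us periods), so a vCPU that
polls in the guest and one doing real work accumulate the same utilization.
A minimal user-space sketch of that accounting (illustrative only, not the
kernel implementation; the decay constant is approximate):

/*
 * Illustrative model of PELT accumulation, not the kernel code.
 * Each 1024us period the running sum decays by y (y^32 == 0.5) and
 * the time spent on the CPU is added, so a guest vcpu that polls
 * instead of halting looks identical to one doing real work.
 */
#include <stdio.h>

static const double Y = 0.97857206;	/* y such that y^32 == 0.5 */

int main(void)
{
	double sum = 0.0;

	/* a vcpu occupying its CPU for 300 consecutive 1024us periods */
	for (int i = 0; i < 300; i++)
		sum = sum * Y + 1024.0;

	/* converges to 1024 / (1 - Y), i.e. "fully busy" */
	printf("steady-state sum: %.0f\n", sum);
	return 0;
}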
>
> This initial implementation assumes that non-idle CPUs are ticking, as it
> hooks the unhalted time into the PELT decaying load accounting. As such, it
> doesn't work well if PELT is updated infrequently with large chunks of
> halted time. This is not a fundamental limitation, but more complex
> accounting is needed to generalize the use case to nohz_full.
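
For intuition, the discount the pelt.c hunks below implement can be modeled
as a two-step update: first decay and accumulate only the active part of the
elapsed window, then run a decay-only pass over the halted remainder. A
simplified user-space sketch (my helper name and constants; real PELT decays
each contribution per 1024us period rather than in one shot):

#include <math.h>

#define PELT_PERIOD_US	1024.0
static const double Y = 0.97857206;	/* y such that y^32 == 0.5 */

/*
 * Hypothetical model of the patch's two-step update: halted time is
 * carved out of the window, so it decays the sum but never accumulates
 * as running time.
 */
double update_with_halt(double sum, double delta_us, double halted_us)
{
	double active_us = delta_us - halted_us;

	/* step 1: decay across the active span and accumulate it */
	sum = sum * pow(Y, active_us / PELT_PERIOD_US) + active_us;
	/* step 2: decay-only update across the halted span */
	return sum * pow(Y, halted_us / PELT_PERIOD_US);
}

With tick-sized windows the halted remainder stays small; with one large
halted chunk (nohz_full), step 2 decays the whole sum at once, which is why
the commit message above restricts this to ticking CPUs.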
> ---
> arch/x86/kvm/x86.c | 8 ++++++--
> include/linux/sched.h | 4 ++++
> kernel/sched/core.c | 1 +
> kernel/sched/fair.c | 25 +++++++++++++++++++++++++
> kernel/sched/pelt.c | 42 +++++++++++++++++++++++++++++++++++-------
> kernel/sched/sched.h | 2 ++
> 6 files changed, 73 insertions(+), 9 deletions(-)
>
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 46975b0a63a5..156cf05b325f 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -10712,6 +10712,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
> int r;
> unsigned long long cycles, cycles_start = 0;
> unsigned long long unhalted_cycles, unhalted_cycles_start = 0;
> + unsigned long long halted_cycles_ns = 0;
> bool req_int_win =
> dm_request_for_irq_injection(vcpu) &&
> kvm_cpu_accept_dm_intr(vcpu);
> @@ -11083,8 +11084,11 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
> cycles = get_cycles() - cycles_start;
> unhalted_cycles = get_unhalted_cycles() -
> unhalted_cycles_start;
> - if (likely(cycles > unhalted_cycles))
> - current->gtime_halted += cycles2ns(cycles - unhalted_cycles);
> + if (likely(cycles > unhalted_cycles)) {
> + halted_cycles_ns = cycles2ns(cycles - unhalted_cycles);
> + current->gtime_halted += halted_cycles_ns;
> + sched_account_gtime_halted(current, halted_cycles_ns);
> + }
> }
>
> local_irq_enable();
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 5f6745357e20..5409fac152c9 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -367,6 +367,8 @@ struct vtime {
> u64 gtime;
> };
>
> +extern void sched_account_gtime_halted(struct task_struct *p, u64 gtime_halted);
> +
> /*
> * Utilization clamp constraints.
> * @UCLAMP_MIN: Minimum utilization
> @@ -588,6 +590,8 @@ struct sched_entity {
> */
> struct sched_avg avg;
> #endif
> +
> + u64 gtime_halted;
> };
>
> struct sched_rt_entity {
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 9aecd914ac69..1f3ced2b2636 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -4487,6 +4487,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
> p->se.nr_migrations = 0;
> p->se.vruntime = 0;
> p->se.vlag = 0;
> + p->se.gtime_halted = 0;
> INIT_LIST_HEAD(&p->se.group_node);
>
> /* A delayed task cannot be in clone(). */
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 1c0ef435a7aa..5ff52711d459 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -13705,4 +13705,29 @@ __init void init_sched_fair_class(void)
> #endif
> #endif /* SMP */
>
> +
> }
> +
> +#ifdef CONFIG_NO_HZ_FULL
> +void sched_account_gtime_halted(struct task_struct *p, u64 gtime_halted)
> +{
> }
> +#else
> +/*
> + * The implementation hooking into PELT requires regular updates of
> + * gtime_halted. This is guaranteed unless we run on CONFIG_NO_HZ_FULL.
> + */
> +void sched_account_gtime_halted(struct task_struct *p, u64 gtime_halted)
> +{
> + struct sched_entity *se = &p->se;
> +
> + if (unlikely(!gtime_halted))
> + return;
> +
> + for_each_sched_entity(se) {
> + se->gtime_halted += gtime_halted;
> + se->cfs_rq->gtime_halted += gtime_halted;
> + }
> +}
> +#endif
> +EXPORT_SYMBOL(sched_account_gtime_halted);
> diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c
> index 7a8534a2deff..9f96b7c46c00 100644
> --- a/kernel/sched/pelt.c
> +++ b/kernel/sched/pelt.c
> @@ -305,10 +305,23 @@ int __update_load_avg_blocked_se(u64 now, struct sched_entity *se)
>
> int __update_load_avg_se(u64 now, struct cfs_rq *cfs_rq, struct sched_entity *se)
> {
> - if (___update_load_sum(now, &se->avg, !!se->on_rq, se_runnable(se),
> - cfs_rq->curr == se)) {
> + int ret = 0;
> + u64 delta = now - se->avg.last_update_time;
> + u64 gtime_halted = min(delta, se->gtime_halted);
>
> - ___update_load_avg(&se->avg, se_weight(se));
> + ret = ___update_load_sum(now - gtime_halted, &se->avg, !!se->on_rq, se_runnable(se),
> + cfs_rq->curr == se);
> +
> + if (gtime_halted) {
> + ret |= ___update_load_sum(now, &se->avg, 0, 0, 0);
> + se->gtime_halted -= gtime_halted;
> +
> + /* decay residual halted time */
> + if (ret && se->gtime_halted)
> + se->gtime_halted = decay_load(se->gtime_halted, delta / 1024);
> + }
> +
> + if (ret) {
> + ___update_load_avg(&se->avg, se_weight(se));
> cfs_se_util_change(&se->avg);
> trace_pelt_se_tp(se);
> return 1;
> @@ -319,10 +332,25 @@ int __update_load_avg_se(u64 now, struct cfs_rq *cfs_rq, struct sched_entity *se
>
> int __update_load_avg_cfs_rq(u64 now, struct cfs_rq *cfs_rq)
> {
> - if (___update_load_sum(now, &cfs_rq->avg,
> - scale_load_down(cfs_rq->load.weight),
> - cfs_rq->h_nr_runnable,
> - cfs_rq->curr != NULL)) {
> + int ret = 0;
> + u64 delta = now - cfs_rq->avg.last_update_time;
> + u64 gtime_halted = min(delta, cfs_rq->gtime_halted);
> +
> + ret = ___update_load_sum(now - gtime_halted, &cfs_rq->avg,
> + scale_load_down(cfs_rq->load.weight),
> + cfs_rq->h_nr_runnable,
> + cfs_rq->curr != NULL);
> +
> + if (gtime_halted) {
> + ret |= ___update_load_sum(now, &cfs_rq->avg, 0, 0, 0);
> + cfs_rq->gtime_halted -= gtime_halted;
> +
> + /* decay any residual halted time */
> + if (ret && cfs_rq->gtime_halted)
> + cfs_rq->gtime_halted = decay_load(cfs_rq->gtime_halted, delta / 1024);
> + }
> +
> + if (ret) {
>
> ___update_load_avg(&cfs_rq->avg, 1);
> trace_pelt_cfs_tp(cfs_rq);
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index b93c8c3dc05a..79b1166265bf 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -744,6 +744,8 @@ struct cfs_rq {
> struct list_head throttled_csd_list;
> #endif /* CONFIG_CFS_BANDWIDTH */
> #endif /* CONFIG_FAIR_GROUP_SCHED */
> +
> + u64 gtime_halted;
> };
>
> #ifdef CONFIG_SCHED_CLASS_EXT
> --
> 2.43.0
>
> Amazon Development Centre (South Africa) (Proprietary) Limited
> 29 Gogosoa Street, Observatory, Cape Town, Western Cape, 7925, South Africa
> Registration Number: 2004 / 034463 / 07
>