Message-ID: <Z8NCGFcH9H14VOV-@char.us.oracle.com>
Date: Sat, 1 Mar 2025 12:21:28 -0500
From: Konrad Rzeszutek Wilk <konrad.wilk@...cle.com>
To: Suleiman Souhlal <suleiman@...gle.com>
Cc: Paolo Bonzini <pbonzini@...hat.com>,
Sean Christopherson <seanjc@...gle.com>,
Thomas Gleixner <tglx@...utronix.de>, Ingo Molnar <mingo@...hat.com>,
Borislav Petkov <bp@...en8.de>,
Dave Hansen <dave.hansen@...ux.intel.com>, x86@...nel.org,
"H. Peter Anvin" <hpa@...or.com>, Chao Gao <chao.gao@...el.com>,
David Woodhouse <dwmw2@...radead.org>,
Sergey Senozhatsky <senozhatsky@...omium.org>, kvm@...r.kernel.org,
linux-kernel@...r.kernel.org, ssouhlal@...ebsd.org
Subject: Re: [PATCH v4 2/2] KVM: x86: Include host suspended time in steal
time
On Fri, Feb 21, 2025 at 02:39:27PM +0900, Suleiman Souhlal wrote:
> When the host resumes from a suspend, the guest thinks any task
> that was running during the suspend ran for a long time, even though
> the effective run time was much shorter, which can end up having
> negative effects with scheduling.
>
> To mitigate this issue, the time that the host was suspended is included
> in steal time, which lets the guest can subtract the duration from the
s/can//
> tasks' runtime.
>
> In order to implement this behavior, once the suspend notifier fires,
> vCPUs trying to run block until the resume notifier finishes. This is
s/run/run will/
> because the freezing of userspace tasks happens between these two points,
Full stop at the end of that ^
> which means that vCPUs could otherwise run and get their suspend steal
> time misaccounted, particularly if a vCPU would run after resume before
> the resume notifier.
s/notifier/notifier fires/
> Incidentally, doing this also addresses a potential race with the
> suspend notifier setting PVCLOCK_GUEST_STOPPED, which could then get
> cleared before the suspend actually happened.
>
> One potential caveat is that in the case of a suspend happening during
> a VM migration, the suspend time might not be accounted.
s/accounted/accounted for./
> A workaround would be for the VMM to ensure that the guest is entered
> with KVM_RUN after resuming from suspend.
So... does that mean there is a QEMU patch as well?
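For concreteness, that workaround would amount to something like this
on the VMM side (a minimal, untested sketch; on_host_resume() and the
vcpu_fd plumbing are hypothetical, KVM_RUN itself is the standard
ioctl):

	#include <linux/kvm.h>
	#include <stdio.h>
	#include <sys/ioctl.h>

	/*
	 * Called for each vCPU after the host resumes: re-entering the
	 * guest lets KVM fold the suspended time into steal time before
	 * the guest scheduler looks at task runtimes again.
	 */
	static void on_host_resume(int vcpu_fd)
	{
		if (ioctl(vcpu_fd, KVM_RUN, 0) < 0)
			perror("KVM_RUN");
	}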
>
> Signed-off-by: Suleiman Souhlal <suleiman@...gle.com>
> ---
> Documentation/virt/kvm/x86/msr.rst | 10 ++++--
> arch/x86/include/asm/kvm_host.h | 6 ++++
> arch/x86/kvm/x86.c | 51 ++++++++++++++++++++++++++++++
> 3 files changed, 65 insertions(+), 2 deletions(-)
>
> diff --git a/Documentation/virt/kvm/x86/msr.rst b/Documentation/virt/kvm/x86/msr.rst
> index 3aecf2a70e7b43..48f2a8ca519548 100644
> --- a/Documentation/virt/kvm/x86/msr.rst
> +++ b/Documentation/virt/kvm/x86/msr.rst
> @@ -294,8 +294,14 @@ data:
>
> steal:
> the amount of time in which this vCPU did not run, in
> - nanoseconds. Time during which the vcpu is idle, will not be
> - reported as steal time.
> + nanoseconds. This includes the time during which the host is
> + suspended. Time during which the vcpu is idle, might not be
> + reported as steal time. The case where the host suspends
> + during a VM migration might not be accounted if VCPUs aren't
> + entered post-resume, because KVM does not currently support
> + suspend/resuming the associated metadata. A workaround would
> + be for the VMM to ensure that the guest is entered with
> + KVM_RUN after resuming from suspend.
>
> preempted:
> indicate the vCPU who owns this struct is running or
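As an aside, a guest would consume this field with the usual
pvclock-style versioned read, roughly like the following (kernel-style
sketch only; real guests go through their paravirt steal clock support,
e.g. steal_clock() in arch/x86/kernel/kvm.c):

	/*
	 * version is odd while the host is mid-update, so retry until
	 * we observe a stable, even version around the read.
	 */
	static u64 read_steal_ns(struct kvm_steal_time *st)
	{
		u32 version;
		u64 steal;

		do {
			version = READ_ONCE(st->version);
			smp_rmb();
			steal = READ_ONCE(st->steal);
			smp_rmb();
		} while ((version & 1) || READ_ONCE(st->version) != version);

		return steal;
	}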
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index 452dd0204609af..007656ceac9a71 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -124,6 +124,7 @@
> #define KVM_REQ_HV_TLB_FLUSH \
> KVM_ARCH_REQ_FLAGS(32, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
> #define KVM_REQ_UPDATE_PROTECTED_GUEST_STATE KVM_ARCH_REQ(34)
> +#define KVM_REQ_WAIT_FOR_RESUME KVM_ARCH_REQ(35)
>
> #define CR0_RESERVED_BITS \
> (~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \
> @@ -916,8 +917,13 @@ struct kvm_vcpu_arch {
>
> struct {
> u8 preempted;
> + bool host_suspended;
> u64 msr_val;
> u64 last_steal;
> + u64 last_suspend;
> + u64 suspend_ns;
> + u64 last_suspend_ns;
> + wait_queue_head_t resume_waitq;
> struct gfn_to_hva_cache cache;
> } st;
>
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 06464ec0d1c8d2..f34edcf77cca0a 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -3717,6 +3717,8 @@ static void record_steal_time(struct kvm_vcpu *vcpu)
> steal += current->sched_info.run_delay -
> vcpu->arch.st.last_steal;
> vcpu->arch.st.last_steal = current->sched_info.run_delay;
> + steal += vcpu->arch.st.suspend_ns - vcpu->arch.st.last_suspend_ns;
> + vcpu->arch.st.last_suspend_ns = vcpu->arch.st.suspend_ns;
> unsafe_put_user(steal, &st->steal, out);
>
> version += 1;
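To make the accounting concrete: if the host spent 3 seconds suspended
since the last update, suspend_ns - last_suspend_ns is 3 * NSEC_PER_SEC
at this point, so the guest sees steal jump by that amount and can
discount it from the running task's runtime. Snapshotting
last_suspend_ns here means each suspended interval gets reported
exactly once.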
> @@ -6930,6 +6932,19 @@ long kvm_arch_vm_compat_ioctl(struct file *filp, unsigned int ioctl,
> }
> #endif
>
> +static void wait_for_resume(struct kvm_vcpu *vcpu)
> +{
> + wait_event_interruptible(vcpu->arch.st.resume_waitq,
> + vcpu->arch.st.host_suspended == 0);
> +
> + /*
> + * This might happen if we blocked here before the freezing of tasks
> + * and we get woken up by the freezer.
> + */
> + if (vcpu->arch.st.host_suspended)
> + kvm_make_request(KVM_REQ_WAIT_FOR_RESUME, vcpu);
> +}
> +
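Re-raising the request turns a spurious freezer wakeup into a retry:
vcpu_enter_guest() will see KVM_REQ_WAIT_FOR_RESUME again on the next
entry attempt and block here once more, until the resume notifier
clears host_suspended.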
> #ifdef CONFIG_HAVE_KVM_PM_NOTIFIER
> static int kvm_arch_suspend_notifier(struct kvm *kvm)
> {
> @@ -6939,6 +6954,19 @@ static int kvm_arch_suspend_notifier(struct kvm *kvm)
>
> mutex_lock(&kvm->lock);
> kvm_for_each_vcpu(i, vcpu, kvm) {
> + vcpu->arch.st.last_suspend = ktime_get_boottime_ns();
> + /*
> + * Tasks get thawed before the resume notifier has been called
> + * so we need to block vCPUs until the resume notifier has run.
> + * Otherwise, suspend steal time might get applied too late,
> + * and get accounted to the wrong guest task.
> + * This also ensures that the guest paused bit set below
> + * doesn't get checked and cleared before the host actually
> + * suspends.
> + */
> + vcpu->arch.st.host_suspended = 1;
> + kvm_make_request(KVM_REQ_WAIT_FOR_RESUME, vcpu);
> +
> if (!vcpu->arch.pv_time.active)
> continue;
>
> @@ -6954,12 +6982,32 @@ static int kvm_arch_suspend_notifier(struct kvm *kvm)
> return ret ? NOTIFY_BAD : NOTIFY_DONE;
> }
>
> +static int kvm_arch_resume_notifier(struct kvm *kvm)
> +{
> + struct kvm_vcpu *vcpu;
> + unsigned long i;
> +
> + mutex_lock(&kvm->lock);
> + kvm_for_each_vcpu(i, vcpu, kvm) {
> + vcpu->arch.st.host_suspended = 0;
> + vcpu->arch.st.suspend_ns += ktime_get_boottime_ns() -
> + vcpu->arch.st.last_suspend;
> + wake_up_interruptible(&vcpu->arch.st.resume_waitq);
> + }
> + mutex_unlock(&kvm->lock);
> +
> + return NOTIFY_DONE;
> +}
> +
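Worth noting that ktime_get_boottime_ns() is what makes this work:
CLOCK_BOOTTIME keeps advancing while the system is suspended, unlike
CLOCK_MONOTONIC, so the delta from last_suspend covers the whole
suspended interval.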
> int kvm_arch_pm_notifier(struct kvm *kvm, unsigned long state)
> {
> switch (state) {
> case PM_HIBERNATION_PREPARE:
> case PM_SUSPEND_PREPARE:
> return kvm_arch_suspend_notifier(kvm);
> + case PM_POST_HIBERNATION:
> + case PM_POST_SUSPEND:
> + return kvm_arch_resume_notifier(kvm);
> }
>
> return NOTIFY_DONE;
> @@ -10813,6 +10861,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
> r = 1;
> goto out;
> }
> + if (kvm_check_request(KVM_REQ_WAIT_FOR_RESUME, vcpu))
> + wait_for_resume(vcpu);
> if (kvm_check_request(KVM_REQ_STEAL_UPDATE, vcpu))
> record_steal_time(vcpu);
> if (kvm_check_request(KVM_REQ_PMU, vcpu))
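Checking this ahead of KVM_REQ_STEAL_UPDATE looks deliberate: by the
time record_steal_time() runs, the vCPU has already waited out the
resume notifier, so suspend_ns is up to date.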
> @@ -12341,6 +12391,7 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
> if (r)
> goto free_guest_fpu;
>
> + init_waitqueue_head(&vcpu->arch.st.resume_waitq);
> kvm_xen_init_vcpu(vcpu);
> vcpu_load(vcpu);
> kvm_vcpu_after_set_cpuid(vcpu);
> --
> 2.48.1.601.g30ceb7b040-goog
>
>