linux-kernel - Re: [PATCH v2 3/4] KVM: LAPIC: Expose per-vCPU timer adavance information to userspace

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20190515172132.GE5875@linux.intel.com>
Date:   Wed, 15 May 2019 10:21:32 -0700
From:   Sean Christopherson <sean.j.christopherson@...el.com>
To:     Wanpeng Li <kernellwp@...il.com>
Cc:     linux-kernel@...r.kernel.org, kvm@...r.kernel.org,
        Paolo Bonzini <pbonzini@...hat.com>,
        Radim Krčmář <rkrcmar@...hat.com>,
        Liran Alon <liran.alon@...cle.com>
Subject: Re: [PATCH v2 3/4] KVM: LAPIC: Expose per-vCPU timer adavance
 information to userspace

On Wed, May 15, 2019 at 12:11:53PM +0800, Wanpeng Li wrote:
> From: Wanpeng Li <wanpengli@...cent.com>
> 
> Expose the per-vCPU advancement information to the user via per-vCPU debugfs 
> entry. wait_lapic_expire() call was moved above guest_enter_irqoff() because 
> of its tracepoint, which violated the RCU extended quiescent state invoked 
> by guest_enter_irqoff()[1][2]. This patch simply removes the tracepoint, 
> which would allow moving wait_lapic_expire(). Sean pointed out:
> 
> | Now that the advancement time is tracked per-vCPU, realizing a change 
> | in the advancement time requires creating a new VM. For all intents 
> | and purposes this makes it impractical to hand tune the advancement 
> | in real time using the tracepoint as the feedback mechanism.
> 
> [1] Commit 8b89fe1f6c43 ("kvm: x86: move tracepoints outside extended quiescent state")
> [2] https://patchwork.kernel.org/patch/7821111/
> 
> Cc: Paolo Bonzini <pbonzini@...hat.com>
> Cc: Radim Krčmář <rkrcmar@...hat.com>
> Cc: Sean Christopherson <sean.j.christopherson@...el.com>
> Cc: Liran Alon <liran.alon@...cle.com>
> Signed-off-by: Wanpeng Li <wanpengli@...cent.com>
> ---
>  arch/x86/kvm/debugfs.c | 16 ++++++++++++++++
>  arch/x86/kvm/lapic.c   | 16 ++++++++--------
>  arch/x86/kvm/lapic.h   |  1 +
>  arch/x86/kvm/trace.h   | 20 --------------------
>  4 files changed, 25 insertions(+), 28 deletions(-)
> 
> diff --git a/arch/x86/kvm/debugfs.c b/arch/x86/kvm/debugfs.c
> index c19c7ed..8cf542e 100644
> --- a/arch/x86/kvm/debugfs.c
> +++ b/arch/x86/kvm/debugfs.c
> @@ -9,12 +9,22 @@
>   */
>  #include <linux/kvm_host.h>
>  #include <linux/debugfs.h>
> +#include "lapic.h"
>  
>  bool kvm_arch_has_vcpu_debugfs(void)
>  {
>  	return true;
>  }
>  
> +static int vcpu_get_timer_expire_delta(void *data, u64 *val)
> +{
> +	struct kvm_vcpu *vcpu = (struct kvm_vcpu *) data;
> +	*val = vcpu->arch.apic->lapic_timer.advance_expire_delta;
> +	return 0;
> +}
> +
> +DEFINE_SIMPLE_ATTRIBUTE(vcpu_timer_expire_delta_fops, vcpu_get_timer_expire_delta, NULL, "%lld\n");
> +
>  static int vcpu_get_tsc_offset(void *data, u64 *val)
>  {
>  	struct kvm_vcpu *vcpu = (struct kvm_vcpu *) data;
> @@ -51,6 +61,12 @@ int kvm_arch_create_vcpu_debugfs(struct kvm_vcpu *vcpu)
>  	if (!ret)
>  		return -ENOMEM;
>  
> +	ret = debugfs_create_file("advance_expire_delta", 0444,
> +							vcpu->debugfs_dentry,
> +							vcpu, &vcpu_timer_expire_delta_fops);

I was thinking we would expose 'kvm_timer.timer_advance_ns', not the
delta, the idea being that being able to query the auto-adjusted value
is now the desired behavior.  But rethinking things, that enhancement is
orthogonal to removing the tracepoint.

Back to the tracepoint, an alternative solution would be to add
kvm_timer.advance_expire_delta as you did, but rather than add a new
debugfs entry, simply move the tracepoint below guest_exit_irqoff()
in vcpu_enter_guest().  I.e. snapshot the delta before VM-Enter, but
trace it after VM-Exit.

If we want to continue supporting hand tuning the advancement, then a
tracepoint is much easier for userspace to consume, e.g. it allows the
user to monitor the history of the delta while adjusting the advancement
time.  Manually approximating that behavior by sampling the value from
debugfs would be quite cumbersome.

> +	if (!ret)
> +		return -ENOMEM;
> +
>  	if (kvm_has_tsc_control) {
>  		ret = debugfs_create_file("tsc-scaling-ratio", 0444,
>  							vcpu->debugfs_dentry,
> diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
> index 2f364fe..af38ece 100644
> --- a/arch/x86/kvm/lapic.c
> +++ b/arch/x86/kvm/lapic.c
> @@ -1502,27 +1502,27 @@ static inline void __wait_lapic_expire(struct kvm_vcpu *vcpu, u64 guest_cycles)
>  }
>  
>  static inline void adaptive_tune_timer_advancement(struct kvm_vcpu *vcpu,
> -				u64 guest_tsc, u64 tsc_deadline)
> +				s64 advance_expire_delta)
>  {
>  	struct kvm_lapic *apic = vcpu->arch.apic;
>  	u32 timer_advance_ns = apic->lapic_timer.timer_advance_ns;
>  	u64 ns;
>  
>  	/* too early */
> -	if (guest_tsc < tsc_deadline) {
> -		ns = (tsc_deadline - guest_tsc) * 1000000ULL;
> +	if (advance_expire_delta < 0) {
> +		ns = -advance_expire_delta * 1000000ULL;
>  		do_div(ns, vcpu->arch.virtual_tsc_khz);
>  		timer_advance_ns -= min((u32)ns,
>  			timer_advance_ns / LAPIC_TIMER_ADVANCE_ADJUST_STEP);
>  	} else {
>  	/* too late */
> -		ns = (guest_tsc - tsc_deadline) * 1000000ULL;
> +		ns = advance_expire_delta * 1000000ULL;
>  		do_div(ns, vcpu->arch.virtual_tsc_khz);
>  		timer_advance_ns += min((u32)ns,
>  			timer_advance_ns / LAPIC_TIMER_ADVANCE_ADJUST_STEP);
>  	}
>  
> -	if (abs(guest_tsc - tsc_deadline) < LAPIC_TIMER_ADVANCE_ADJUST_DONE)
> +	if (abs(advance_expire_delta) < LAPIC_TIMER_ADVANCE_ADJUST_DONE)
>  		apic->lapic_timer.timer_advance_adjust_done = true;
>  	if (unlikely(timer_advance_ns > 5000)) {
>  		timer_advance_ns = 0;
> @@ -1545,13 +1545,13 @@ void wait_lapic_expire(struct kvm_vcpu *vcpu)
>  	tsc_deadline = apic->lapic_timer.expired_tscdeadline;
>  	apic->lapic_timer.expired_tscdeadline = 0;
>  	guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc());
> -	trace_kvm_wait_lapic_expire(vcpu->vcpu_id, guest_tsc - tsc_deadline);
> +	apic->lapic_timer.advance_expire_delta = guest_tsc - tsc_deadline;
>  
> -	if (guest_tsc < tsc_deadline)
> +	if (apic->lapic_timer.advance_expire_delta < 0)
>  		__wait_lapic_expire(vcpu, tsc_deadline - guest_tsc);
>  
>  	if (unlikely(!apic->lapic_timer.timer_advance_adjust_done))
> -		adaptive_tune_timer_advancement(vcpu, guest_tsc, tsc_deadline);
> +		adaptive_tune_timer_advancement(vcpu, apic->lapic_timer.advance_expire_delta);
>  }
>  
>  static void start_sw_tscdeadline(struct kvm_lapic *apic)
> diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
> index d6d049b..3e72a25 100644
> --- a/arch/x86/kvm/lapic.h
> +++ b/arch/x86/kvm/lapic.h
> @@ -32,6 +32,7 @@ struct kvm_timer {
>  	u64 tscdeadline;
>  	u64 expired_tscdeadline;
>  	u32 timer_advance_ns;
> +	s64 advance_expire_delta;
>  	atomic_t pending;			/* accumulated triggered timers */
>  	bool hv_timer_in_use;
>  	bool timer_advance_adjust_done;
> diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
> index 4d47a26..3f9bc62 100644
> --- a/arch/x86/kvm/trace.h
> +++ b/arch/x86/kvm/trace.h
> @@ -953,26 +953,6 @@ TRACE_EVENT(kvm_pvclock_update,
>  		  __entry->flags)
>  );
>  
> -TRACE_EVENT(kvm_wait_lapic_expire,
> -	TP_PROTO(unsigned int vcpu_id, s64 delta),
> -	TP_ARGS(vcpu_id, delta),
> -
> -	TP_STRUCT__entry(
> -		__field(	unsigned int,	vcpu_id		)
> -		__field(	s64,		delta		)
> -	),
> -
> -	TP_fast_assign(
> -		__entry->vcpu_id	   = vcpu_id;
> -		__entry->delta             = delta;
> -	),
> -
> -	TP_printk("vcpu %u: delta %lld (%s)",
> -		  __entry->vcpu_id,
> -		  __entry->delta,
> -		  __entry->delta < 0 ? "early" : "late")
> -);
> -
>  TRACE_EVENT(kvm_enter_smm,
>  	TP_PROTO(unsigned int vcpu_id, u64 smbase, bool entering),
>  	TP_ARGS(vcpu_id, smbase, entering),
> -- 
> 2.7.4
>