lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20190515172132.GE5875@linux.intel.com>
Date:   Wed, 15 May 2019 10:21:32 -0700
From:   Sean Christopherson <sean.j.christopherson@...el.com>
To:     Wanpeng Li <kernellwp@...il.com>
Cc:     linux-kernel@...r.kernel.org, kvm@...r.kernel.org,
        Paolo Bonzini <pbonzini@...hat.com>,
        Radim Krčmář <rkrcmar@...hat.com>,
        Liran Alon <liran.alon@...cle.com>
Subject: Re: [PATCH v2 3/4] KVM: LAPIC: Expose per-vCPU timer adavance
 information to userspace

On Wed, May 15, 2019 at 12:11:53PM +0800, Wanpeng Li wrote:
> From: Wanpeng Li <wanpengli@...cent.com>
> 
> Expose the per-vCPU advancement information to the user via per-vCPU debugfs 
> entry. wait_lapic_expire() call was moved above guest_enter_irqoff() because 
> of its tracepoint, which violated the RCU extended quiescent state invoked 
> by guest_enter_irqoff()[1][2]. This patch simply removes the tracepoint, 
> which would allow moving wait_lapic_expire(). Sean pointed out:
> 
> | Now that the advancement time is tracked per-vCPU, realizing a change 
> | in the advancement time requires creating a new VM. For all intents 
> | and purposes this makes it impractical to hand tune the advancement 
> | in real time using the tracepoint as the feedback mechanism.
> 
> [1] Commit 8b89fe1f6c43 ("kvm: x86: move tracepoints outside extended quiescent state")
> [2] https://patchwork.kernel.org/patch/7821111/
> 
> Cc: Paolo Bonzini <pbonzini@...hat.com>
> Cc: Radim Krčmář <rkrcmar@...hat.com>
> Cc: Sean Christopherson <sean.j.christopherson@...el.com>
> Cc: Liran Alon <liran.alon@...cle.com>
> Signed-off-by: Wanpeng Li <wanpengli@...cent.com>
> ---
>  arch/x86/kvm/debugfs.c | 16 ++++++++++++++++
>  arch/x86/kvm/lapic.c   | 16 ++++++++--------
>  arch/x86/kvm/lapic.h   |  1 +
>  arch/x86/kvm/trace.h   | 20 --------------------
>  4 files changed, 25 insertions(+), 28 deletions(-)
> 
> diff --git a/arch/x86/kvm/debugfs.c b/arch/x86/kvm/debugfs.c
> index c19c7ed..8cf542e 100644
> --- a/arch/x86/kvm/debugfs.c
> +++ b/arch/x86/kvm/debugfs.c
> @@ -9,12 +9,22 @@
>   */
>  #include <linux/kvm_host.h>
>  #include <linux/debugfs.h>
> +#include "lapic.h"
>  
>  bool kvm_arch_has_vcpu_debugfs(void)
>  {
>  	return true;
>  }
>  
> +static int vcpu_get_timer_expire_delta(void *data, u64 *val)
> +{
> +	struct kvm_vcpu *vcpu = (struct kvm_vcpu *) data;
> +	*val = vcpu->arch.apic->lapic_timer.advance_expire_delta;
> +	return 0;
> +}
> +
> +DEFINE_SIMPLE_ATTRIBUTE(vcpu_timer_expire_delta_fops, vcpu_get_timer_expire_delta, NULL, "%lld\n");
> +
>  static int vcpu_get_tsc_offset(void *data, u64 *val)
>  {
>  	struct kvm_vcpu *vcpu = (struct kvm_vcpu *) data;
> @@ -51,6 +61,12 @@ int kvm_arch_create_vcpu_debugfs(struct kvm_vcpu *vcpu)
>  	if (!ret)
>  		return -ENOMEM;
>  
> +	ret = debugfs_create_file("advance_expire_delta", 0444,
> +							vcpu->debugfs_dentry,
> +							vcpu, &vcpu_timer_expire_delta_fops);

I was thinking we would expose 'kvm_timer.timer_advance_ns', not the
delta, the idea being that being able to query the auto-adjusted value
is now the desired behavior.  But rethinking things, that enhancement is
orthogonal to removing the tracepoint.

Back to the tracepoint, an alternative solution would be to add
kvm_timer.advance_expire_delta as you did, but rather than add a new
debugfs entry, simply move the tracepoint below guest_exit_irqoff()
in vcpu_enter_guest().  I.e. snapshot the delta before VM-Enter, but
trace it after VM-Exit.

If we want to continue supporting hand tuning the advancement, then a
tracepoint is much easier for userspace to consume, e.g. it allows the
user to monitor the history of the delta while adjusting the advancement
time.  Manually approximating that behavior by sampling the value from
debugfs would be quite cumbersome.

> +	if (!ret)
> +		return -ENOMEM;
> +
>  	if (kvm_has_tsc_control) {
>  		ret = debugfs_create_file("tsc-scaling-ratio", 0444,
>  							vcpu->debugfs_dentry,
> diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
> index 2f364fe..af38ece 100644
> --- a/arch/x86/kvm/lapic.c
> +++ b/arch/x86/kvm/lapic.c
> @@ -1502,27 +1502,27 @@ static inline void __wait_lapic_expire(struct kvm_vcpu *vcpu, u64 guest_cycles)
>  }
>  
>  static inline void adaptive_tune_timer_advancement(struct kvm_vcpu *vcpu,
> -				u64 guest_tsc, u64 tsc_deadline)
> +				s64 advance_expire_delta)
>  {
>  	struct kvm_lapic *apic = vcpu->arch.apic;
>  	u32 timer_advance_ns = apic->lapic_timer.timer_advance_ns;
>  	u64 ns;
>  
>  	/* too early */
> -	if (guest_tsc < tsc_deadline) {
> -		ns = (tsc_deadline - guest_tsc) * 1000000ULL;
> +	if (advance_expire_delta < 0) {
> +		ns = -advance_expire_delta * 1000000ULL;
>  		do_div(ns, vcpu->arch.virtual_tsc_khz);
>  		timer_advance_ns -= min((u32)ns,
>  			timer_advance_ns / LAPIC_TIMER_ADVANCE_ADJUST_STEP);
>  	} else {
>  	/* too late */
> -		ns = (guest_tsc - tsc_deadline) * 1000000ULL;
> +		ns = advance_expire_delta * 1000000ULL;
>  		do_div(ns, vcpu->arch.virtual_tsc_khz);
>  		timer_advance_ns += min((u32)ns,
>  			timer_advance_ns / LAPIC_TIMER_ADVANCE_ADJUST_STEP);
>  	}
>  
> -	if (abs(guest_tsc - tsc_deadline) < LAPIC_TIMER_ADVANCE_ADJUST_DONE)
> +	if (abs(advance_expire_delta) < LAPIC_TIMER_ADVANCE_ADJUST_DONE)
>  		apic->lapic_timer.timer_advance_adjust_done = true;
>  	if (unlikely(timer_advance_ns > 5000)) {
>  		timer_advance_ns = 0;
> @@ -1545,13 +1545,13 @@ void wait_lapic_expire(struct kvm_vcpu *vcpu)
>  	tsc_deadline = apic->lapic_timer.expired_tscdeadline;
>  	apic->lapic_timer.expired_tscdeadline = 0;
>  	guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc());
> -	trace_kvm_wait_lapic_expire(vcpu->vcpu_id, guest_tsc - tsc_deadline);
> +	apic->lapic_timer.advance_expire_delta = guest_tsc - tsc_deadline;
>  
> -	if (guest_tsc < tsc_deadline)
> +	if (apic->lapic_timer.advance_expire_delta < 0)
>  		__wait_lapic_expire(vcpu, tsc_deadline - guest_tsc);
>  
>  	if (unlikely(!apic->lapic_timer.timer_advance_adjust_done))
> -		adaptive_tune_timer_advancement(vcpu, guest_tsc, tsc_deadline);
> +		adaptive_tune_timer_advancement(vcpu, apic->lapic_timer.advance_expire_delta);
>  }
>  
>  static void start_sw_tscdeadline(struct kvm_lapic *apic)
> diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
> index d6d049b..3e72a25 100644
> --- a/arch/x86/kvm/lapic.h
> +++ b/arch/x86/kvm/lapic.h
> @@ -32,6 +32,7 @@ struct kvm_timer {
>  	u64 tscdeadline;
>  	u64 expired_tscdeadline;
>  	u32 timer_advance_ns;
> +	s64 advance_expire_delta;
>  	atomic_t pending;			/* accumulated triggered timers */
>  	bool hv_timer_in_use;
>  	bool timer_advance_adjust_done;
> diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
> index 4d47a26..3f9bc62 100644
> --- a/arch/x86/kvm/trace.h
> +++ b/arch/x86/kvm/trace.h
> @@ -953,26 +953,6 @@ TRACE_EVENT(kvm_pvclock_update,
>  		  __entry->flags)
>  );
>  
> -TRACE_EVENT(kvm_wait_lapic_expire,
> -	TP_PROTO(unsigned int vcpu_id, s64 delta),
> -	TP_ARGS(vcpu_id, delta),
> -
> -	TP_STRUCT__entry(
> -		__field(	unsigned int,	vcpu_id		)
> -		__field(	s64,		delta		)
> -	),
> -
> -	TP_fast_assign(
> -		__entry->vcpu_id	   = vcpu_id;
> -		__entry->delta             = delta;
> -	),
> -
> -	TP_printk("vcpu %u: delta %lld (%s)",
> -		  __entry->vcpu_id,
> -		  __entry->delta,
> -		  __entry->delta < 0 ? "early" : "late")
> -);
> -
>  TRACE_EVENT(kvm_enter_smm,
>  	TP_PROTO(unsigned int vcpu_id, u64 smbase, bool entering),
>  	TP_ARGS(vcpu_id, smbase, entering),
> -- 
> 2.7.4
> 

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ