linux-kernel - Re: [PATCH 3/4] KVM: x86: introduce get_kvmclock

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20160902135225.3ynmrv564rbgaca6@rkaganb.sw.ru>
Date:   Fri, 2 Sep 2016 16:52:26 +0300
From:   Roman Kagan <rkagan@...tuozzo.com>
To:     Paolo Bonzini <pbonzini@...hat.com>
CC:     <linux-kernel@...r.kernel.org>, <kvm@...r.kernel.org>,
        <peterhornyack@...gle.com>, <rkrcmar@...hat.com>, <den@...nvz.org>
Subject: Re: [PATCH 3/4] KVM: x86: introduce get_kvmclock_ns

On Thu, Sep 01, 2016 at 05:26:14PM +0200, Paolo Bonzini wrote:
> Introduce a function that reads the exact nanoseconds value that is
> provided to the guest in kvmclock.  This crystallizes the notion of
> kvmclock as a thin veneer over a stable TSC, that the guest will
> (hopefully) convert with NTP.  In other words, kvmclock is *not* a
> paravirtualized host-to-guest NTP.
> 
> Drop the get_kernel_ns() function, that was used both to get the base
> value of the master clock and to get the current value of kvmclock.
> The former use is replaced by ktime_get_boot_ns(), the latter is
> the purpose of get_kernel_ns().
> 
> This also allows KVM to provide a Hyper-V time reference counter that
> is synchronized with the time that is computed from the TSC page.
> 
> Signed-off-by: Paolo Bonzini <pbonzini@...hat.com>
> ---
>  arch/x86/entry/vdso/vclock_gettime.c |  2 +-
>  arch/x86/include/asm/pvclock.h       |  5 ++--
>  arch/x86/kernel/pvclock.c            |  2 +-
>  arch/x86/kvm/hyperv.c                |  2 +-
>  arch/x86/kvm/x86.c                   | 48 +++++++++++++++++++++++++++---------
>  arch/x86/kvm/x86.h                   |  6 +----
>  6 files changed, 43 insertions(+), 22 deletions(-)
> 
> diff --git a/arch/x86/entry/vdso/vclock_gettime.c b/arch/x86/entry/vdso/vclock_gettime.c
> index 94d54d0defa7..02223cb4bcfd 100644
> --- a/arch/x86/entry/vdso/vclock_gettime.c
> +++ b/arch/x86/entry/vdso/vclock_gettime.c
> @@ -129,7 +129,7 @@ static notrace cycle_t vread_pvclock(int *mode)
>  			return 0;
>  		}
>  
> -		ret = __pvclock_read_cycles(pvti);
> +		ret = __pvclock_read_cycles(pvti, rdtsc_ordered());
>  	} while (pvclock_read_retry(pvti, version));
>  
>  	/* refer to vread_tsc() comment for rationale */
> diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h
> index d019f0cc80ec..3ad741b84072 100644
> --- a/arch/x86/include/asm/pvclock.h
> +++ b/arch/x86/include/asm/pvclock.h
> @@ -87,9 +87,10 @@ static inline u64 pvclock_scale_delta(u64 delta, u32 mul_frac, int shift)
>  }
>  
>  static __always_inline
> -cycle_t __pvclock_read_cycles(const struct pvclock_vcpu_time_info *src)
> +cycle_t __pvclock_read_cycles(const struct pvclock_vcpu_time_info *src,
> +			      u64 tsc)
>  {
> -	u64 delta = rdtsc_ordered() - src->tsc_timestamp;
> +	u64 delta = tsc - src->tsc_timestamp;
>  	cycle_t offset = pvclock_scale_delta(delta, src->tsc_to_system_mul,
>  					     src->tsc_shift);
>  	return src->system_time + offset;
> diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c
> index 3599404e3089..5b2cc889ce34 100644
> --- a/arch/x86/kernel/pvclock.c
> +++ b/arch/x86/kernel/pvclock.c
> @@ -80,7 +80,7 @@ cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src)
>  
>  	do {
>  		version = pvclock_read_begin(src);
> -		ret = __pvclock_read_cycles(src);
> +		ret = __pvclock_read_cycles(src, rdtsc_ordered());
>  		flags = src->flags;
>  	} while (pvclock_read_retry(src, version));
>  

Perhaps adding an argument to __pvclock_read_cycles deserves a separate
patch, to get timekeeping folks' ack on it?

> diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
> index 01bd7b7a6866..ed5b77f39ffb 100644
> --- a/arch/x86/kvm/hyperv.c
> +++ b/arch/x86/kvm/hyperv.c
> @@ -386,7 +386,7 @@ static void synic_init(struct kvm_vcpu_hv_synic *synic)
>  
>  static u64 get_time_ref_counter(struct kvm *kvm)
>  {
> -	return div_u64(get_kernel_ns() + kvm->arch.kvmclock_offset, 100);
> +	return div_u64(get_kvmclock_ns(kvm), 100);

Since this does slightly different calculation than the real hyperv tsc
ref page clock is supposed to, I wonder if we are safe WRT precision
errors leading to occasional monotonicity violations?

>  }
>  
>  static void stimer_mark_pending(struct kvm_vcpu_hv_stimer *stimer,
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index b5853b86b67d..2edcfa054cbe 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -1425,7 +1425,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr)
>  
>  	raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
>  	offset = kvm_compute_tsc_offset(vcpu, data);
> -	ns = get_kernel_ns();
> +	ns = ktime_get_boot_ns();
>  	elapsed = ns - kvm->arch.last_tsc_nsec;
>  
>  	if (vcpu->arch.virtual_tsc_khz) {
> @@ -1716,6 +1716,34 @@ static void kvm_gen_update_masterclock(struct kvm *kvm)
>  #endif
>  }
>  
> +static u64 __get_kvmclock_ns(struct kvm *kvm)
> +{
> +	struct kvm_vcpu *vcpu = kvm_get_vcpu(kvm, 0);
> +	struct kvm_arch *ka = &kvm->arch;
> +	s64 ns;
> +
> +	if (vcpu->arch.hv_clock.flags & PVCLOCK_TSC_STABLE_BIT) {
> +		u64 tsc = kvm_read_l1_tsc(vcpu, rdtsc());
> +		ns = __pvclock_read_cycles(&vcpu->arch.hv_clock, tsc);
> +	} else {
> +		ns = ktime_get_boot_ns() + ka->kvmclock_offset;
> +	}
> +
> +	return ns;
> +}
> +
> +u64 get_kvmclock_ns(struct kvm *kvm)
> +{
> +	unsigned long flags;
> +	s64 ns;
> +
> +	local_irq_save(flags);
> +	ns = __get_kvmclock_ns(kvm);
> +	local_irq_restore(flags);
> +
> +	return ns;
> +}
> +
>  static void kvm_setup_pvclock_page(struct kvm_vcpu *v)
>  {
>  	struct kvm_vcpu_arch *vcpu = &v->arch;
> @@ -1805,7 +1833,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
>  	}
>  	if (!use_master_clock) {
>  		host_tsc = rdtsc();
> -		kernel_ns = get_kernel_ns();
> +		kernel_ns = ktime_get_boot_ns();
>  	}
>  
>  	tsc_timestamp = kvm_read_l1_tsc(v, host_tsc);
> @@ -4048,7 +4076,6 @@ long kvm_arch_vm_ioctl(struct file *filp,
>  	case KVM_SET_CLOCK: {
>  		struct kvm_clock_data user_ns;
>  		u64 now_ns;
> -		s64 delta;
>  
>  		r = -EFAULT;
>  		if (copy_from_user(&user_ns, argp, sizeof(user_ns)))
> @@ -4060,10 +4087,9 @@ long kvm_arch_vm_ioctl(struct file *filp,
>  
>  		r = 0;
>  		local_irq_disable();
> -		now_ns = get_kernel_ns();
> -		delta = user_ns.clock - now_ns;
> +		now_ns = __get_kvmclock_ns(kvm);
> +		kvm->arch.kvmclock_offset += user_ns.clock - now_ns;
>  		local_irq_enable();
> -		kvm->arch.kvmclock_offset = delta;
>  		kvm_gen_update_masterclock(kvm);
>  		break;
>  	}

I'm curious why explicit irq disable/enable is left here, unlike the
next hunk where it's moved into get_kvmclock_ns?

> @@ -4071,10 +4097,8 @@ long kvm_arch_vm_ioctl(struct file *filp,
>  		struct kvm_clock_data user_ns;
>  		u64 now_ns;
>  
> -		local_irq_disable();
> -		now_ns = get_kernel_ns();
> -		user_ns.clock = kvm->arch.kvmclock_offset + now_ns;
> -		local_irq_enable();
> +		now_ns = get_kvmclock_ns(kvm);
> +		user_ns.clock = now_ns;

Nitpick: now_ns is even less useful now than it was before the patch.

>  		user_ns.flags = 0;
>  		memset(&user_ns.pad, 0, sizeof(user_ns.pad));
>  
> @@ -7539,7 +7563,7 @@ int kvm_arch_hardware_enable(void)
>  	 * before any KVM threads can be running.  Unfortunately, we can't
>  	 * bring the TSCs fully up to date with real time, as we aren't yet far
>  	 * enough into CPU bringup that we know how much real time has actually
> -	 * elapsed; our helper function, get_kernel_ns() will be using boot
> +	 * elapsed; our helper function, ktime_get_boot_ns() will be using boot
>  	 * variables that haven't been updated yet.
>  	 *
>  	 * So we simply find the maximum observed TSC above, then record the
> @@ -7774,7 +7798,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
>  	mutex_init(&kvm->arch.apic_map_lock);
>  	spin_lock_init(&kvm->arch.pvclock_gtod_sync_lock);
>  
> -	kvm->arch.kvmclock_offset = -get_kernel_ns();
> +	kvm->arch.kvmclock_offset = -ktime_get_boot_ns();
>  	pvclock_update_vm_gtod_copy(kvm);
>  
>  	INIT_DELAYED_WORK(&kvm->arch.kvmclock_update_work, kvmclock_update_fn);
> diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
> index a82ca466b62e..e8ff3e4ce38a 100644
> --- a/arch/x86/kvm/x86.h
> +++ b/arch/x86/kvm/x86.h
> @@ -148,11 +148,6 @@ static inline void kvm_register_writel(struct kvm_vcpu *vcpu,
>  	return kvm_register_write(vcpu, reg, val);
>  }
>  
> -static inline u64 get_kernel_ns(void)
> -{
> -	return ktime_get_boot_ns();
> -}
> -
>  static inline bool kvm_check_has_quirk(struct kvm *kvm, u64 quirk)
>  {
>  	return !(kvm->arch.disabled_quirks & quirk);
> @@ -164,6 +159,7 @@ void kvm_set_pending_timer(struct kvm_vcpu *vcpu);
>  int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip);
>  
>  void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr);
> +u64 get_kvmclock_ns(struct kvm *kvm);
>  
>  int kvm_read_guest_virt(struct x86_emulate_ctxt *ctxt,
>  	gva_t addr, void *val, unsigned int bytes,
> -- 
> 1.8.3.1
> 
>