lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <87bmj5k2pa.fsf@vitty.brq.redhat.com>
Date:   Mon, 11 Dec 2017 10:56:33 +0100
From:   Vitaly Kuznetsov <vkuznets@...hat.com>
To:     Roman Kagan <rkagan@...tuozzo.com>
Cc:     kvm@...r.kernel.org, x86@...nel.org,
        Paolo Bonzini <pbonzini@...hat.com>,
        Radim Krčmář <rkrcmar@...hat.com>,
        Thomas Gleixner <tglx@...utronix.de>,
        Ingo Molnar <mingo@...hat.com>,
        "H. Peter Anvin" <hpa@...or.com>,
        "K. Y. Srinivasan" <kys@...rosoft.com>,
        Haiyang Zhang <haiyangz@...rosoft.com>,
        Stephen Hemminger <sthemmin@...rosoft.com>,
        "Michael Kelley \(EOSG\)" <Michael.H.Kelley@...rosoft.com>,
        Andy Lutomirski <luto@...nel.org>,
        Mohammed Gamal <mmorsy@...hat.com>,
        Cathy Avery <cavery@...hat.com>, linux-kernel@...r.kernel.org,
        devel@...uxdriverproject.org
Subject: Re: [PATCH 3/6] x86/hyper-v: reenlightenment notifications support

Roman Kagan <rkagan@...tuozzo.com> writes:

> On Fri, Dec 08, 2017 at 11:49:57AM +0100, Vitaly Kuznetsov wrote:
>> Hyper-V supports Live Migration notification. This is supposed to be used
>> in conjunction with TSC emulation: when we are migrated to a host with
>> different TSC frequency for some short period host emulates our accesses
>> to TSC and sends us an interrupt to notify about the event. When we're
>> done updating everything we can disable TSC emulation and everything will
>> start working fast again.
>> 
>> We didn't need these notifications before as Hyper-V guests are not
>> supposed to use TSC as a clocksource: in Linux we even mark it as unstable
>> on boot. Guests normally use 'tsc page' clocksouce and host updates its
>> values on migrations automatically.
>> 
>> Things change when we want to run nested virtualization: even when we pass
>> through PV clocksources (kvm-clock or tsc page) to our guests we need to
>> know TSC frequency and when it changes.
>> 
>> Hyper-V Top Level Functional Specification (as of v5.0b) wrongly specifies
>> EAX:BIT(12) of CPUID:0x40000009 as the feature identification bit. The
>> right one to check is EAX:BIT(13) of CPUID:0x40000003. I was assured that
>> the fix in on the way.
>> 
>> Signed-off-by: Vitaly Kuznetsov <vkuznets@...hat.com>
>> ---
>> RFC -> v1:
>> - #include <asm/apic.h> [kbuild test robot]
>> - use div64_u64() [kbuild test robot]
>> - DECLARE_WORK -> DECLARE_DELAYED_WORK as testing showed there's some bug
>>   in Hyper-V hypervisor and disabling emulation after receiving interrupt
>>   may screw TSC counters.
>
> Looks kinda fragile...

I believe this is temporary and Microsoft will fix things up host side.

>
>> ---
>>  arch/x86/entry/entry_64.S          |  4 +++
>>  arch/x86/hyperv/hv_init.c          | 71 ++++++++++++++++++++++++++++++++++++++
>>  arch/x86/include/asm/entry_arch.h  |  4 +++
>>  arch/x86/include/asm/hw_irq.h      |  1 +
>>  arch/x86/include/asm/irq_vectors.h |  7 +++-
>>  arch/x86/include/asm/mshyperv.h    |  8 +++++
>>  arch/x86/include/uapi/asm/hyperv.h | 27 +++++++++++++++
>>  arch/x86/kernel/idt.c              |  3 ++
>>  8 files changed, 124 insertions(+), 1 deletion(-)
>> 
>> diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
>> index f81d50d7ceac..a32730b260bc 100644
>> --- a/arch/x86/entry/entry_64.S
>> +++ b/arch/x86/entry/entry_64.S
>> @@ -826,6 +826,10 @@ apicinterrupt SPURIOUS_APIC_VECTOR		spurious_interrupt		smp_spurious_interrupt
>>  apicinterrupt IRQ_WORK_VECTOR			irq_work_interrupt		smp_irq_work_interrupt
>>  #endif
>>  
>> +#if IS_ENABLED(CONFIG_HYPERV)
>> +apicinterrupt HYPERV_REENLIGHTENMENT_VECTOR	hyperv_reenlightenment_intr	smp_hyperv_reenlightenment_intr
>> +#endif
>> +
>>  /*
>>   * Exception entry points.
>>   */
>> diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c
>> index 1a6c63f721bc..bb62839ede81 100644
>> --- a/arch/x86/hyperv/hv_init.c
>> +++ b/arch/x86/hyperv/hv_init.c
>> @@ -18,6 +18,7 @@
>>   */
>>  
>>  #include <linux/types.h>
>> +#include <asm/apic.h>
>>  #include <asm/hypervisor.h>
>>  #include <asm/hyperv.h>
>>  #include <asm/mshyperv.h>
>> @@ -102,6 +103,76 @@ static int hv_cpu_init(unsigned int cpu)
>>  	return 0;
>>  }
>>  
>> +static void (*hv_reenlightenment_cb)(void);
>> +
>> +static void hv_reenlightenment_notify(struct work_struct *dummy)
>> +{
>> +	if (hv_reenlightenment_cb)
>> +		hv_reenlightenment_cb();
>> +}
>> +static DECLARE_DELAYED_WORK(hv_reenlightenment_work, hv_reenlightenment_notify);
>> +
>> +void hyperv_stop_tsc_emulation(void)
>> +{
>> +	u64 freq;
>> +	struct hv_tsc_emulation_status emu_status;
>> +
>> +	rdmsrl(HV_X64_MSR_TSC_EMULATION_STATUS, *(u64 *)&emu_status);
>> +	emu_status.inprogress = 0;
>> +	wrmsrl(HV_X64_MSR_TSC_EMULATION_STATUS, *(u64 *)&emu_status);
>> +
>> +	rdmsrl(HV_X64_MSR_TSC_FREQUENCY, freq);
>
> IIRC the availability of this msr is not guaranteed (I guess in reality
> it's present if the reenlightenment is supported, but I'd rather check).
>

Will do.

>> +	tsc_khz = div64_u64(freq, 1000);
>> +}
>> +EXPORT_SYMBOL_GPL(hyperv_stop_tsc_emulation);
>> +
>> +void register_hv_tsc_update(void (*cb)(void))
>> +{
>
> The function name seems unfortunate.  IMHO such a name suggests
> registering a callback on a notifier chain (rather than unconditionally
> replacing the old callback), and having no other side effects.

I see, any suggestion? register_hv_reenlightenment_cb? register_hv_tscchange_cb?

>
>> +	struct hv_reenlightenment_control re_ctrl = {
>> +		.vector = HYPERV_REENLIGHTENMENT_VECTOR,
>> +		.enabled = 1,
>> +		.target_vp = hv_vp_index[smp_processor_id()]
>> +	};
>> +	struct hv_tsc_emulation_control emu_ctrl = {.enabled = 1};
>> +
>> +	if (!(ms_hyperv.features & HV_X64_ACCESS_REENLIGHTENMENT))
>> +		return;
>
> What happens then?  L2 guests keep running with their clocks ticking at
> a different speed?
>

In reallity this never happens -- in case nested virtualization is
supported reenlightenment is also available. In theory, L0 can emulate
TSC acceess for forever after migration.

>> +
>> +	hv_reenlightenment_cb = cb;
>> +
>> +	/* Make sure callback is registered before we write to MSRs */
>> +	wmb();
>> +
>> +	wrmsrl(HV_X64_MSR_REENLIGHTENMENT_CONTROL, *((u64 *)&re_ctrl));
>> +	wrmsrl(HV_X64_MSR_TSC_EMULATION_CONTROL, *((u64 *)&emu_ctrl));
>> +}
>> +EXPORT_SYMBOL_GPL(register_hv_tsc_update);
>> +
>> +void unregister_hv_tsc_update(void)
>> +{
>> +	struct hv_reenlightenment_control re_ctrl;
>> +
>> +	if (!(ms_hyperv.features & HV_X64_ACCESS_REENLIGHTENMENT))
>> +		return;
>> +
>> +	rdmsrl(HV_X64_MSR_REENLIGHTENMENT_CONTROL, *(u64 *)&re_ctrl);
>> +	re_ctrl.enabled = 0;
>> +	wrmsrl(HV_X64_MSR_REENLIGHTENMENT_CONTROL, *(u64 *)&re_ctrl);
>> +
>> +	hv_reenlightenment_cb = NULL;
>> +}
>> +EXPORT_SYMBOL_GPL(unregister_hv_tsc_update);
>> +
>> +asmlinkage __visible void
>> +__irq_entry smp_hyperv_reenlightenment_intr(struct pt_regs *regs)
>> +{
>> +	entering_ack_irq();
>> +
>> +	schedule_delayed_work(&hv_reenlightenment_work, HZ/10);
>> +
>> +	exiting_irq();
>> +}
>> +
>>  /*
>>   * This function is to be invoked early in the boot sequence after the
>>   * hypervisor has been detected.
>> diff --git a/arch/x86/include/asm/entry_arch.h b/arch/x86/include/asm/entry_arch.h
>> index 416422762845..eb936cc49b62 100644
>> --- a/arch/x86/include/asm/entry_arch.h
>> +++ b/arch/x86/include/asm/entry_arch.h
>> @@ -54,3 +54,7 @@ BUILD_INTERRUPT(threshold_interrupt,THRESHOLD_APIC_VECTOR)
>>  BUILD_INTERRUPT(deferred_error_interrupt, DEFERRED_ERROR_VECTOR)
>>  #endif
>>  #endif
>> +
>> +#if IS_ENABLED(CONFIG_HYPERV)
>> +BUILD_INTERRUPT(hyperv_reenlightenment_intr, HYPERV_REENLIGHTENMENT_VECTOR)
>> +#endif
>> diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
>> index 2851077b6051..c65193dac7d9 100644
>> --- a/arch/x86/include/asm/hw_irq.h
>> +++ b/arch/x86/include/asm/hw_irq.h
>> @@ -36,6 +36,7 @@ extern asmlinkage void kvm_posted_intr_wakeup_ipi(void);
>>  extern asmlinkage void kvm_posted_intr_nested_ipi(void);
>>  extern asmlinkage void error_interrupt(void);
>>  extern asmlinkage void irq_work_interrupt(void);
>> +extern asmlinkage void hyperv_reenlightenment_intr(void);
>>  
>>  extern asmlinkage void spurious_interrupt(void);
>>  extern asmlinkage void thermal_interrupt(void);
>> diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h
>> index 67421f649cfa..e71c1120426b 100644
>> --- a/arch/x86/include/asm/irq_vectors.h
>> +++ b/arch/x86/include/asm/irq_vectors.h
>> @@ -103,7 +103,12 @@
>>  #endif
>>  
>>  #define MANAGED_IRQ_SHUTDOWN_VECTOR	0xef
>> -#define LOCAL_TIMER_VECTOR		0xee
>> +
>> +#if IS_ENABLED(CONFIG_HYPERV)
>> +#define HYPERV_REENLIGHTENMENT_VECTOR	0xee
>> +#endif
>> +
>> +#define LOCAL_TIMER_VECTOR		0xed
>>  
>>  #define NR_VECTORS			 256
>>  
>> diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h
>> index a0b34994f453..43164b097585 100644
>> --- a/arch/x86/include/asm/mshyperv.h
>> +++ b/arch/x86/include/asm/mshyperv.h
>> @@ -314,11 +314,19 @@ void hyper_alloc_mmu(void);
>>  void hyperv_report_panic(struct pt_regs *regs, long err);
>>  bool hv_is_hypercall_page_setup(void);
>>  void hyperv_cleanup(void);
>> +
>> +asmlinkage void smp_hyperv_reenlightenment_intr(struct pt_regs *regs);
>> +void register_hv_tsc_update(void (*cb)(void));
>> +void unregister_hv_tsc_update(void);
>> +void hyperv_stop_tsc_emulation(void);
>>  #else /* CONFIG_HYPERV */
>>  static inline void hyperv_init(void) {}
>>  static inline bool hv_is_hypercall_page_setup(void) { return false; }
>>  static inline void hyperv_cleanup(void) {}
>>  static inline void hyperv_setup_mmu_ops(void) {}
>> +static inline void register_hv_tsc_update(void (*cb)(void)) {}
>> +static inline void unregister_hv_tsc_update(void) {}
>> +static inline void hyperv_stop_tsc_emulation(void) {};
>>  #endif /* CONFIG_HYPERV */
>>  
>>  #ifdef CONFIG_HYPERV_TSCPAGE
>> diff --git a/arch/x86/include/uapi/asm/hyperv.h b/arch/x86/include/uapi/asm/hyperv.h
>> index 1a5bfead93b4..197c2e6c7376 100644
>> --- a/arch/x86/include/uapi/asm/hyperv.h
>> +++ b/arch/x86/include/uapi/asm/hyperv.h
>> @@ -40,6 +40,9 @@
>>   */
>>  #define HV_X64_ACCESS_FREQUENCY_MSRS		(1 << 11)
>>  
>> +/* AccessReenlightenmentControls privilege */
>> +#define HV_X64_ACCESS_REENLIGHTENMENT		BIT(13)
>> +
>>  /*
>>   * Basic SynIC MSRs (HV_X64_MSR_SCONTROL through HV_X64_MSR_EOM
>>   * and HV_X64_MSR_SINT0 through HV_X64_MSR_SINT15) available
>> @@ -234,6 +237,30 @@
>>  #define HV_X64_MSR_CRASH_PARAMS		\
>>  		(1 + (HV_X64_MSR_CRASH_P4 - HV_X64_MSR_CRASH_P0))
>>  
>> +/* TSC emulation after migration */
>> +#define HV_X64_MSR_REENLIGHTENMENT_CONTROL	0x40000106
>> +
>> +struct hv_reenlightenment_control {
>> +	u64 vector:8;
>> +	u64 reserved1:8;
>> +	u64 enabled:1;
>> +	u64 reserved2:15;
>> +	u64 target_vp:32;
>> +};
>> +
>> +#define HV_X64_MSR_TSC_EMULATION_CONTROL	0x40000107
>> +#define HV_X64_MSR_TSC_EMULATION_STATUS		0x40000108
>> +
>> +struct hv_tsc_emulation_control {
>> +	u64 enabled:1;
>> +	u64 reserved:63;
>> +};
>> +
>> +struct hv_tsc_emulation_status {
>> +	u64 inprogress:1;
>> +	u64 reserved:63;
>> +};
>> +
>>  #define HV_X64_MSR_HYPERCALL_ENABLE		0x00000001
>>  #define HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_SHIFT	12
>>  #define HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_MASK	\
>> diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c
>> index d985cef3984f..5b8512d48aa3 100644
>> --- a/arch/x86/kernel/idt.c
>> +++ b/arch/x86/kernel/idt.c
>> @@ -140,6 +140,9 @@ static const __initdata struct idt_data apic_idts[] = {
>>  # ifdef CONFIG_IRQ_WORK
>>  	INTG(IRQ_WORK_VECTOR,		irq_work_interrupt),
>>  # endif
>> +#if IS_ENABLED(CONFIG_HYPERV)
>> +	INTG(HYPERV_REENLIGHTENMENT_VECTOR, hyperv_reenlightenment_intr),
>> +#endif
>>  	INTG(SPURIOUS_APIC_VECTOR,	spurious_interrupt),
>>  	INTG(ERROR_APIC_VECTOR,		error_interrupt),
>>  #endif
>
> Roman.

-- 
  Vitaly

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ