[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20171208181059.GB4777@rkaganb.sw.ru>
Date: Fri, 8 Dec 2017 21:10:59 +0300
From: Roman Kagan <rkagan@...tuozzo.com>
To: Vitaly Kuznetsov <vkuznets@...hat.com>
Cc: kvm@...r.kernel.org, x86@...nel.org,
Paolo Bonzini <pbonzini@...hat.com>,
Radim Krčmář <rkrcmar@...hat.com>,
Thomas Gleixner <tglx@...utronix.de>,
Ingo Molnar <mingo@...hat.com>,
"H. Peter Anvin" <hpa@...or.com>,
"K. Y. Srinivasan" <kys@...rosoft.com>,
Haiyang Zhang <haiyangz@...rosoft.com>,
Stephen Hemminger <sthemmin@...rosoft.com>,
"Michael Kelley (EOSG)" <Michael.H.Kelley@...rosoft.com>,
Andy Lutomirski <luto@...nel.org>,
Mohammed Gamal <mmorsy@...hat.com>,
Cathy Avery <cavery@...hat.com>, linux-kernel@...r.kernel.org,
devel@...uxdriverproject.org
Subject: Re: [PATCH 3/6] x86/hyper-v: reenlightenment notifications support
On Fri, Dec 08, 2017 at 11:49:57AM +0100, Vitaly Kuznetsov wrote:
> Hyper-V supports Live Migration notification. This is supposed to be used
> in conjunction with TSC emulation: when we are migrated to a host with
> different TSC frequency for some short period host emulates our accesses
> to TSC and sends us an interrupt to notify about the event. When we're
> done updating everything we can disable TSC emulation and everything will
> start working fast again.
>
> We didn't need these notifications before as Hyper-V guests are not
> supposed to use TSC as a clocksource: in Linux we even mark it as unstable
> on boot. Guests normally use 'tsc page' clocksouce and host updates its
> values on migrations automatically.
>
> Things change when we want to run nested virtualization: even when we pass
> through PV clocksources (kvm-clock or tsc page) to our guests we need to
> know TSC frequency and when it changes.
>
> Hyper-V Top Level Functional Specification (as of v5.0b) wrongly specifies
> EAX:BIT(12) of CPUID:0x40000009 as the feature identification bit. The
> right one to check is EAX:BIT(13) of CPUID:0x40000003. I was assured that
> the fix in on the way.
>
> Signed-off-by: Vitaly Kuznetsov <vkuznets@...hat.com>
> ---
> RFC -> v1:
> - #include <asm/apic.h> [kbuild test robot]
> - use div64_u64() [kbuild test robot]
> - DECLARE_WORK -> DECLARE_DELAYED_WORK as testing showed there's some bug
> in Hyper-V hypervisor and disabling emulation after receiving interrupt
> may screw TSC counters.
Looks kinda fragile...
> ---
> arch/x86/entry/entry_64.S | 4 +++
> arch/x86/hyperv/hv_init.c | 71 ++++++++++++++++++++++++++++++++++++++
> arch/x86/include/asm/entry_arch.h | 4 +++
> arch/x86/include/asm/hw_irq.h | 1 +
> arch/x86/include/asm/irq_vectors.h | 7 +++-
> arch/x86/include/asm/mshyperv.h | 8 +++++
> arch/x86/include/uapi/asm/hyperv.h | 27 +++++++++++++++
> arch/x86/kernel/idt.c | 3 ++
> 8 files changed, 124 insertions(+), 1 deletion(-)
>
> diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
> index f81d50d7ceac..a32730b260bc 100644
> --- a/arch/x86/entry/entry_64.S
> +++ b/arch/x86/entry/entry_64.S
> @@ -826,6 +826,10 @@ apicinterrupt SPURIOUS_APIC_VECTOR spurious_interrupt smp_spurious_interrupt
> apicinterrupt IRQ_WORK_VECTOR irq_work_interrupt smp_irq_work_interrupt
> #endif
>
> +#if IS_ENABLED(CONFIG_HYPERV)
> +apicinterrupt HYPERV_REENLIGHTENMENT_VECTOR hyperv_reenlightenment_intr smp_hyperv_reenlightenment_intr
> +#endif
> +
> /*
> * Exception entry points.
> */
> diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c
> index 1a6c63f721bc..bb62839ede81 100644
> --- a/arch/x86/hyperv/hv_init.c
> +++ b/arch/x86/hyperv/hv_init.c
> @@ -18,6 +18,7 @@
> */
>
> #include <linux/types.h>
> +#include <asm/apic.h>
> #include <asm/hypervisor.h>
> #include <asm/hyperv.h>
> #include <asm/mshyperv.h>
> @@ -102,6 +103,76 @@ static int hv_cpu_init(unsigned int cpu)
> return 0;
> }
>
> +static void (*hv_reenlightenment_cb)(void);
> +
> +static void hv_reenlightenment_notify(struct work_struct *dummy)
> +{
> + if (hv_reenlightenment_cb)
> + hv_reenlightenment_cb();
> +}
> +static DECLARE_DELAYED_WORK(hv_reenlightenment_work, hv_reenlightenment_notify);
> +
> +void hyperv_stop_tsc_emulation(void)
> +{
> + u64 freq;
> + struct hv_tsc_emulation_status emu_status;
> +
> + rdmsrl(HV_X64_MSR_TSC_EMULATION_STATUS, *(u64 *)&emu_status);
> + emu_status.inprogress = 0;
> + wrmsrl(HV_X64_MSR_TSC_EMULATION_STATUS, *(u64 *)&emu_status);
> +
> + rdmsrl(HV_X64_MSR_TSC_FREQUENCY, freq);
IIRC the availability of this msr is not guaranteed (I guess in reality
it's present if the reenlightenment is supported, but I'd rather check).
> + tsc_khz = div64_u64(freq, 1000);
> +}
> +EXPORT_SYMBOL_GPL(hyperv_stop_tsc_emulation);
> +
> +void register_hv_tsc_update(void (*cb)(void))
> +{
The function name seems unfortunate. IMHO such a name suggests
registering a callback on a notifier chain (rather than unconditionally
replacing the old callback), and having no other side effects.
> + struct hv_reenlightenment_control re_ctrl = {
> + .vector = HYPERV_REENLIGHTENMENT_VECTOR,
> + .enabled = 1,
> + .target_vp = hv_vp_index[smp_processor_id()]
> + };
> + struct hv_tsc_emulation_control emu_ctrl = {.enabled = 1};
> +
> + if (!(ms_hyperv.features & HV_X64_ACCESS_REENLIGHTENMENT))
> + return;
What happens then? L2 guests keep running with their clocks ticking at
a different speed?
> +
> + hv_reenlightenment_cb = cb;
> +
> + /* Make sure callback is registered before we write to MSRs */
> + wmb();
> +
> + wrmsrl(HV_X64_MSR_REENLIGHTENMENT_CONTROL, *((u64 *)&re_ctrl));
> + wrmsrl(HV_X64_MSR_TSC_EMULATION_CONTROL, *((u64 *)&emu_ctrl));
> +}
> +EXPORT_SYMBOL_GPL(register_hv_tsc_update);
> +
> +void unregister_hv_tsc_update(void)
> +{
> + struct hv_reenlightenment_control re_ctrl;
> +
> + if (!(ms_hyperv.features & HV_X64_ACCESS_REENLIGHTENMENT))
> + return;
> +
> + rdmsrl(HV_X64_MSR_REENLIGHTENMENT_CONTROL, *(u64 *)&re_ctrl);
> + re_ctrl.enabled = 0;
> + wrmsrl(HV_X64_MSR_REENLIGHTENMENT_CONTROL, *(u64 *)&re_ctrl);
> +
> + hv_reenlightenment_cb = NULL;
> +}
> +EXPORT_SYMBOL_GPL(unregister_hv_tsc_update);
> +
> +asmlinkage __visible void
> +__irq_entry smp_hyperv_reenlightenment_intr(struct pt_regs *regs)
> +{
> + entering_ack_irq();
> +
> + schedule_delayed_work(&hv_reenlightenment_work, HZ/10);
> +
> + exiting_irq();
> +}
> +
> /*
> * This function is to be invoked early in the boot sequence after the
> * hypervisor has been detected.
> diff --git a/arch/x86/include/asm/entry_arch.h b/arch/x86/include/asm/entry_arch.h
> index 416422762845..eb936cc49b62 100644
> --- a/arch/x86/include/asm/entry_arch.h
> +++ b/arch/x86/include/asm/entry_arch.h
> @@ -54,3 +54,7 @@ BUILD_INTERRUPT(threshold_interrupt,THRESHOLD_APIC_VECTOR)
> BUILD_INTERRUPT(deferred_error_interrupt, DEFERRED_ERROR_VECTOR)
> #endif
> #endif
> +
> +#if IS_ENABLED(CONFIG_HYPERV)
> +BUILD_INTERRUPT(hyperv_reenlightenment_intr, HYPERV_REENLIGHTENMENT_VECTOR)
> +#endif
> diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
> index 2851077b6051..c65193dac7d9 100644
> --- a/arch/x86/include/asm/hw_irq.h
> +++ b/arch/x86/include/asm/hw_irq.h
> @@ -36,6 +36,7 @@ extern asmlinkage void kvm_posted_intr_wakeup_ipi(void);
> extern asmlinkage void kvm_posted_intr_nested_ipi(void);
> extern asmlinkage void error_interrupt(void);
> extern asmlinkage void irq_work_interrupt(void);
> +extern asmlinkage void hyperv_reenlightenment_intr(void);
>
> extern asmlinkage void spurious_interrupt(void);
> extern asmlinkage void thermal_interrupt(void);
> diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h
> index 67421f649cfa..e71c1120426b 100644
> --- a/arch/x86/include/asm/irq_vectors.h
> +++ b/arch/x86/include/asm/irq_vectors.h
> @@ -103,7 +103,12 @@
> #endif
>
> #define MANAGED_IRQ_SHUTDOWN_VECTOR 0xef
> -#define LOCAL_TIMER_VECTOR 0xee
> +
> +#if IS_ENABLED(CONFIG_HYPERV)
> +#define HYPERV_REENLIGHTENMENT_VECTOR 0xee
> +#endif
> +
> +#define LOCAL_TIMER_VECTOR 0xed
>
> #define NR_VECTORS 256
>
> diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h
> index a0b34994f453..43164b097585 100644
> --- a/arch/x86/include/asm/mshyperv.h
> +++ b/arch/x86/include/asm/mshyperv.h
> @@ -314,11 +314,19 @@ void hyper_alloc_mmu(void);
> void hyperv_report_panic(struct pt_regs *regs, long err);
> bool hv_is_hypercall_page_setup(void);
> void hyperv_cleanup(void);
> +
> +asmlinkage void smp_hyperv_reenlightenment_intr(struct pt_regs *regs);
> +void register_hv_tsc_update(void (*cb)(void));
> +void unregister_hv_tsc_update(void);
> +void hyperv_stop_tsc_emulation(void);
> #else /* CONFIG_HYPERV */
> static inline void hyperv_init(void) {}
> static inline bool hv_is_hypercall_page_setup(void) { return false; }
> static inline void hyperv_cleanup(void) {}
> static inline void hyperv_setup_mmu_ops(void) {}
> +static inline void register_hv_tsc_update(void (*cb)(void)) {}
> +static inline void unregister_hv_tsc_update(void) {}
> +static inline void hyperv_stop_tsc_emulation(void) {};
> #endif /* CONFIG_HYPERV */
>
> #ifdef CONFIG_HYPERV_TSCPAGE
> diff --git a/arch/x86/include/uapi/asm/hyperv.h b/arch/x86/include/uapi/asm/hyperv.h
> index 1a5bfead93b4..197c2e6c7376 100644
> --- a/arch/x86/include/uapi/asm/hyperv.h
> +++ b/arch/x86/include/uapi/asm/hyperv.h
> @@ -40,6 +40,9 @@
> */
> #define HV_X64_ACCESS_FREQUENCY_MSRS (1 << 11)
>
> +/* AccessReenlightenmentControls privilege */
> +#define HV_X64_ACCESS_REENLIGHTENMENT BIT(13)
> +
> /*
> * Basic SynIC MSRs (HV_X64_MSR_SCONTROL through HV_X64_MSR_EOM
> * and HV_X64_MSR_SINT0 through HV_X64_MSR_SINT15) available
> @@ -234,6 +237,30 @@
> #define HV_X64_MSR_CRASH_PARAMS \
> (1 + (HV_X64_MSR_CRASH_P4 - HV_X64_MSR_CRASH_P0))
>
> +/* TSC emulation after migration */
> +#define HV_X64_MSR_REENLIGHTENMENT_CONTROL 0x40000106
> +
> +struct hv_reenlightenment_control {
> + u64 vector:8;
> + u64 reserved1:8;
> + u64 enabled:1;
> + u64 reserved2:15;
> + u64 target_vp:32;
> +};
> +
> +#define HV_X64_MSR_TSC_EMULATION_CONTROL 0x40000107
> +#define HV_X64_MSR_TSC_EMULATION_STATUS 0x40000108
> +
> +struct hv_tsc_emulation_control {
> + u64 enabled:1;
> + u64 reserved:63;
> +};
> +
> +struct hv_tsc_emulation_status {
> + u64 inprogress:1;
> + u64 reserved:63;
> +};
> +
> #define HV_X64_MSR_HYPERCALL_ENABLE 0x00000001
> #define HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_SHIFT 12
> #define HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_MASK \
> diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c
> index d985cef3984f..5b8512d48aa3 100644
> --- a/arch/x86/kernel/idt.c
> +++ b/arch/x86/kernel/idt.c
> @@ -140,6 +140,9 @@ static const __initdata struct idt_data apic_idts[] = {
> # ifdef CONFIG_IRQ_WORK
> INTG(IRQ_WORK_VECTOR, irq_work_interrupt),
> # endif
> +#if IS_ENABLED(CONFIG_HYPERV)
> + INTG(HYPERV_REENLIGHTENMENT_VECTOR, hyperv_reenlightenment_intr),
> +#endif
> INTG(SPURIOUS_APIC_VECTOR, spurious_interrupt),
> INTG(ERROR_APIC_VECTOR, error_interrupt),
> #endif
Roman.
Powered by blists - more mailing lists