linux-kernel - Re: [PATCH v5 untested] kvm: better MWAIT emulation for guests

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20170316161612.GE4085@HEDWIG.INI.CMU.EDU>
Date:   Thu, 16 Mar 2017 12:16:13 -0400
From:   "Gabriel L. Somlo" <gsomlo@...il.com>
To:     Radim Krčmář <rkrcmar@...hat.com>
Cc:     "Michael S. Tsirkin" <mst@...hat.com>,
        linux-kernel@...r.kernel.org, Paolo Bonzini <pbonzini@...hat.com>,
        Jonathan Corbet <corbet@....net>,
        Thomas Gleixner <tglx@...utronix.de>,
        Ingo Molnar <mingo@...hat.com>,
        "H. Peter Anvin" <hpa@...or.com>, x86@...nel.org,
        Joerg Roedel <joro@...tes.org>, kvm@...r.kernel.org,
        linux-doc@...r.kernel.org
Subject: Re: [PATCH v5 untested] kvm: better MWAIT emulation for guests

On Thu, Mar 16, 2017 at 04:35:18PM +0100, Radim Krčmář wrote:
> 2017-03-16 10:58-0400, Gabriel L. Somlo:
> > On Thu, Mar 16, 2017 at 04:04:12PM +0200, Michael S. Tsirkin wrote:
> > > On Thu, Mar 16, 2017 at 09:24:27AM -0400, Gabriel L. Somlo wrote:
> > > > After studying your patch a bit more carefully (sorry, it's crazy
> > > > around here right now :) ) I realized you're simply trying to
> > > > (selectively) decide when to exit L1 and emulate as NOP vs. when to
> > > > just allow L1 to execute MONITOR & MWAIT natively.
> > > > 
> > > > Is that right ? Because if so, the issues I saw on my MacPro1,1 are
> > > > weird and inexplicable, given that allowing L>=1 to run MONITOR/MWAIT
> > > > natively was one of the options Alex Graf and Rene Rebe used back in
> > > > the very early days of OS X on QEMU, at the time I got involved with
> > > > that project. Here's part of an out of tree patch against 3.4 which did
> > > > just that, and worked as far as I remember on *any* MWAIT capable
> > > > intel chip I had access to back in 2010:
> > > > 
> > > > ##############################################################################
> > > > # 99-mwait.patch.kvm-kmod (Rene Rebe <rene@...ctcode.de>) 2010-04-27
> > > > ##############################################################################
> > > > diff -pNarU5 linux-3.4/arch/x86/kvm/cpuid.c linux-3.4-mac/arch/x86/kvm/cpuid.c
> > > > --- linux-3.4/arch/x86/kvm/cpuid.c	2012-05-20 18:29:13.000000000 -0400
> > > > +++ linux-3.4-mac/arch/x86/kvm/cpuid.c	2012-10-09 11:42:59.921215750 -0400
> > > > @@ -222,11 +222,11 @@ static int do_cpuid_ent(struct kvm_cpuid
> > > >  		f_nx | 0 /* Reserved */ | F(MMXEXT) | F(MMX) |
> > > >  		F(FXSR) | F(FXSR_OPT) | f_gbpages | f_rdtscp |
> > > >  		0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW);
> > > >  	/* cpuid 1.ecx */
> > > >  	const u32 kvm_supported_word4_x86_features =
> > > > -		F(XMM3) | F(PCLMULQDQ) | 0 /* DTES64, MONITOR */ |
> > > > +		F(XMM3) | F(PCLMULQDQ) | F(MWAIT) /* DTES64, MONITOR */ |
> > > >  		0 /* DS-CPL, VMX, SMX, EST */ |
> > > >  		0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ |
> > > >  		F(FMA) | F(CX16) | 0 /* xTPR Update, PDCM */ |
> > > >  		0 /* Reserved, DCA */ | F(XMM4_1) |
> > > >  		F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) |
> > > > diff -pNarU5 linux-3.4/arch/x86/kvm/svm.c linux-3.4-mac/arch/x86/kvm/svm.c
> > > > --- linux-3.4/arch/x86/kvm/svm.c	2012-05-20 18:29:13.000000000 -0400
> > > > +++ linux-3.4-mac/arch/x86/kvm/svm.c	2012-10-09 11:44:41.598997481 -0400
> > > > @@ -1102,12 +1102,10 @@ static void init_vmcb(struct vcpu_svm *s
> > > >  	set_intercept(svm, INTERCEPT_VMSAVE);
> > > >  	set_intercept(svm, INTERCEPT_STGI);
> > > >  	set_intercept(svm, INTERCEPT_CLGI);
> > > >  	set_intercept(svm, INTERCEPT_SKINIT);
> > > >  	set_intercept(svm, INTERCEPT_WBINVD);
> > > > -	set_intercept(svm, INTERCEPT_MONITOR);
> > > > -	set_intercept(svm, INTERCEPT_MWAIT);
> > > >  	set_intercept(svm, INTERCEPT_XSETBV);
> > > >  
> > > >  	control->iopm_base_pa = iopm_base;
> > > >  	control->msrpm_base_pa = __pa(svm->msrpm);
> > > >  	control->int_ctl = V_INTR_MASKING_MASK;
> > > > diff -pNarU5 linux-3.4/arch/x86/kvm/vmx.c linux-3.4-mac/arch/x86/kvm/vmx.c
> > > > --- linux-3.4/arch/x86/kvm/vmx.c	2012-05-20 18:29:13.000000000 -0400
> > > > +++ linux-3.4-mac/arch/x86/kvm/vmx.c	2012-10-09 11:42:59.925215977 -0400
> > > > @@ -1938,11 +1938,11 @@ static __init void nested_vmx_setup_ctls
> > > >  		nested_vmx_procbased_ctls_low, nested_vmx_procbased_ctls_high);
> > > >  	nested_vmx_procbased_ctls_low = 0;
> > > >  	nested_vmx_procbased_ctls_high &=
> > > >  		CPU_BASED_VIRTUAL_INTR_PENDING | CPU_BASED_USE_TSC_OFFSETING |
> > > >  		CPU_BASED_HLT_EXITING | CPU_BASED_INVLPG_EXITING |
> > > > -		CPU_BASED_MWAIT_EXITING | CPU_BASED_CR3_LOAD_EXITING |
> > > > +		CPU_BASED_CR3_LOAD_EXITING |
> > > >  		CPU_BASED_CR3_STORE_EXITING |
> > > >  #ifdef CONFIG_X86_64
> > > >  		CPU_BASED_CR8_LOAD_EXITING | CPU_BASED_CR8_STORE_EXITING |
> > > >  #endif
> > > >  		CPU_BASED_MOV_DR_EXITING | CPU_BASED_UNCOND_IO_EXITING |
> > > > @@ -2404,12 +2404,10 @@ static __init int setup_vmcs_config(stru
> > > >  	      CPU_BASED_CR3_LOAD_EXITING |
> > > >  	      CPU_BASED_CR3_STORE_EXITING |
> > > >  	      CPU_BASED_USE_IO_BITMAPS |
> > > >  	      CPU_BASED_MOV_DR_EXITING |
> > > >  	      CPU_BASED_USE_TSC_OFFSETING |
> > > > -	      CPU_BASED_MWAIT_EXITING |
> > > > -	      CPU_BASED_MONITOR_EXITING |
> > > >  	      CPU_BASED_INVLPG_EXITING |
> > > >  	      CPU_BASED_RDPMC_EXITING;
> > > >  
> > > >  	opt = CPU_BASED_TPR_SHADOW |
> > > >  	      CPU_BASED_USE_MSR_BITMAPS |
> > > > 
> > > > If all you're trying to do is (selectively) revert to this behavior,
> > > > that "shouldn't" mess it up for the MacPro either, so I'm thoroughly
> > > > confused at this point :)
> > > 
> > > Yes.  Me too. Want to try that other patch and see what happens?
> > 
> > You mean the old 3.4 patch against current KVM ? I'll try to do that,
> > might take me a while :)
> 
> Michael's patch already did most of that, you just need to add
> 
> diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
> index efde6cc50875..b12f07d4ce17 100644
> --- a/arch/x86/kvm/cpuid.c
> +++ b/arch/x86/kvm/cpuid.c
> @@ -348,7 +348,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
>  	const u32 kvm_cpuid_1_ecx_x86_features =
>  		/* NOTE: MONITOR (and MWAIT) are emulated as NOP,
>  		 * but *not* advertised to guests via CPUID ! */
> -		F(XMM3) | F(PCLMULQDQ) | 0 /* DTES64, MONITOR */ |
> +		F(XMM3) | F(PCLMULQDQ) | F(MWAIT) /* DTES64, MONITOR */ |
>  		0 /* DS-CPL, VMX, SMX, EST */ |
>  		0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ |
>  		F(FMA) | F(CX16) | 0 /* xTPR Update, PDCM */ |
> 
> Note: this will never be upstream, because mwait isn't what we want by
> default. :)

But since OS X doesn't check CPUID and simply runs MONITOR & MWAIT
assuming they're present, the above one-liner would make no
difference. If everything else in the old patch I quoted is identical
to what Michael does, then I don't know -- maybe the MacPro1,1 has
really broken L>=1 MWAIT, and it only ever worked with vmexit and
emulation on the host side.

> >> > Back in 2010, running MWAIT in L>=1  behaved 100% exactly like a NOP,
> >> > didn't power down the physical CPU, just immediately moved on to the
> >> > next instruction. As such, there was no power saving and no
> >> > opportunity to yield to another L0 thread either, unlike with NOP
> >> > emulation at L0.
> >> > 
> >> > Did that change on newer Intel chips (i.e., is guest-mode MWAIT now
> >> > doing something smarter than just acting as a guest-mode NOP) ?
> >> > 
> >> > Thanks,
> >> > --Gabriel
> >> 
> >> Interesting.  What it seems to say is this:
> >> 
> >> MWAIT. Behavior of the MWAIT instruction (which always causes an invalid-
> >> opcode exception—#UD—if CPL > 0) is determined by the setting of the “MWAIT
> >> exiting” VM-execution control:
> >> — If the “MWAIT exiting” VM-execution control is 1, MWAIT causes a VM exit
> >> (see Section 22.1.3).
> >> — If the “MWAIT exiting” VM-execution control is 0, MWAIT operates normally if
> >> any of the following is true: (1) the “interrupt-window exiting” VM-execution
> >> control is 0; (2) ECX[0] is 0; or (3) RFLAGS.IF = 1.
> >> — If the “MWAIT exiting” VM-execution control is 0, the “interrupt-window
> >> exiting” VM-execution control is 1, ECX[0] = 1, and RFLAGS.IF = 0, MWAIT
> >> does not cause the processor to enter an implementation-dependent
> >> optimized state; instead, control passes to the instruction following the
> >> MWAIT instruction.
> >> 
> >> 
> >> And since interrupt-window exiting is 0 most of the time for KVM,
> >> I would expect MWAIT to behave normally.
> > 
> > The intel manual said the same thing back in 2010 as well. However,
> > regardless of how any flags were set, interrupt-window exiting or not,
> > "normal" L1 MWAIT behavior was that it woke up immediately regardless.
> > Remember, never going to sleep is still correct ("normal" ?) behavior
> > per the ISA definition of MWAIT :)
> 
> I'll write a simple kvm-unit-test to better understand why it is broken
> for you ...
> 
> > Also, when I tested your patch on the macbook air (where it worked),
> > not only was the host reporting 400% CPU for qemu (which is to be
> > expected), but the thermal fan/cooling thing also shifted up into high
> > gear, which means the physical CPU got hot, which it shouldn't have if
> > the guest-mode MWAIT actually did put the host CPU into low power.
> 
> I tested MWAIT with basically the same kernel patch and the qemu patch
> with Linux guest on Haswell and Nehalem.  Running the guest took 100% of
> the host CPUs, but it still had the same temperature as when the host
> was idle.
> 
> That reminds me that you to pass '-cpu host' for QEMU reasons.

For OS X to boot, one needs '-cpu core2duo' for <= 10.11, and
'-cpu Penryn' for 10.12. I never managed to get it working with any
other settings.

So I'm ready to write off the MacPro1,1 (unless you want me run more
tests and report back for you, which I'm happy to do in any case).

But please please, so at least I walk away from this having learned
something :) help me understand the use case:

	- By careful setting of vmx flags, and/or on newer, sanely
	  built Intel hardware, L1 MWAIT actually powers down the
	  physical host core (while I couldn't get it to stay cool
	  on my end, I totally believe you managed to pull it off)

	- We never admit to supporting MWAIT to guests, but when they
	  do anyway (either because they're old/grumpy/careless OS X
	  versions, or some newfangled custom-built Linux kernel which
	  is hacked to ignore CPUID on purpose), we now allow the
	  guest to:
		- keep its alloted time slice
		- but "waste" it by powering down the host CPU 
	  instead of
		- vmexit to the host OS at L0
		- yield the host core to another L0 runnable thread

Since newer OS X actually checks CPUID, I don't have a major stake in
one way vs. the other, but I'm really really curious:

Are we trying to save power assuming the host is unlikely to have
enough runnable L0 threads for when the L0-emulated NOP yields? So
we're better off letting the guest keep the CPU but also keep it cool
while at it (assuming the guest isn't totally hostile and didn't pick
a setting where L1 MWAIT actually works as L1 NOP, in which case we
don't even get to stay cool)?

Man, I wish I had the cycles to resurrect my attempt at acually emulating
MWAIT with something like a condition queue (below, just for reference).

Thanks much,
--Gabriel


##############################################################################
# kvm-mwait-emu.patch (Gabriel Somlo <somlo@....edu> 2014/02/05)
#   -- based on an idea suggested by Alex Graf --
# GLS: emulate MONITOR and MWAIT at page-level granularity by write-protecting
#       the page containing a monitored location and appropriately handling
#       subsequent write faults.
#       After debugging the SMP issue, we'll need a way to trigger a
#       periodic cleanup that will switch write-protected monitored pages
#       back to read-write, once they've stayed unused for "long enough"
##############################################################################
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index fdf83af..7ca9b51 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -337,6 +337,16 @@ struct kvm_pmu {
 	u64 reprogram_pmi;
 };
 
+/*
+ * mwait-monitored page list element type
+ */
+struct kvm_mwait_pg {
+	gpa_t gpa;
+	struct list_head vcpu_list; /* VCPUs monitoring (armed on) this page */
+	struct list_head link;      /* links mwait-pages within a KVM */
+	unsigned accessed;
+};
+
 struct kvm_vcpu_arch {
 	/*
 	 * rip and regs accesses must go through
@@ -528,6 +538,10 @@ struct kvm_vcpu_arch {
 	struct {
 		bool pv_unhalted;
 	} pv;
+
+	/* MONITOR/MWAIT support */
+	struct kvm_mwait_pg *mwp;	/* page monitored by this VCPU */
+	struct list_head mw_link;	/* all VCPUs monitoring the same page */
 };
 
 struct kvm_lpage_info {
@@ -607,6 +621,10 @@ struct kvm_arch {
 	u64 hv_hypercall;
 	u64 hv_tsc_page;
 
+	/* MONITOR/MWAIT support */
+	struct mutex mwait_lock;
+	struct list_head mwait_pg_list;	/* monitored pages within this KVM */
+
 	#ifdef CONFIG_KVM_MMU_AUDIT
 	int audit_point;
 	#endif
@@ -854,6 +872,8 @@ int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port);
 void kvm_emulate_cpuid(struct kvm_vcpu *vcpu);
 int kvm_emulate_halt(struct kvm_vcpu *vcpu);
 int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu);
+int kvm_emulate_monitor(struct kvm_vcpu *vcpu);
+int kvm_emulate_mwait(struct kvm_vcpu *vcpu);
 
 void kvm_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg);
 int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, int seg);
@@ -915,6 +935,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 		       const u8 *new, int bytes);
 int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn);
 int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva);
+int kvm_mmu_protect_page(struct kvm *kvm, gfn_t gfn);
 void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu);
 int kvm_mmu_load(struct kvm_vcpu *vcpu);
 void kvm_mmu_unload(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index c697625..7d4f1ca 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -279,6 +279,14 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 		0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW);
 	/* cpuid 1.ecx */
 	const u32 kvm_supported_word4_x86_features =
+		/* OS X does not check CPUID before using MONITOR/MWAIT from its
+		 * power-optimized idle loop (AppleIntelPowerManagement.kext).
+		 * For now, we don't advertise MWAIT support below, but attempt
+		 * to emulate them instead of issuing an invalid opcode fault
+		 * if a misbehaving guest calls them anyway. Removing the above
+		 * mentioned kext from OS X will cause it to fall back to a
+		 * HLT-based idle loop, as an optional guest optimization step.
+		 */
 		F(XMM3) | F(PCLMULQDQ) | 0 /* DTES64, MONITOR */ |
 		0 /* DS-CPL, VMX, SMX, EST */ |
 		0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ |
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index e50425d..bc02ebd 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2283,6 +2283,20 @@ int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
 }
 EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page);
 
+int kvm_mmu_protect_page(struct kvm *kvm, gfn_t gfn)
+{
+	int r;
+
+	spin_lock(&kvm->mmu_lock);
+	r = rmap_write_protect(kvm, gfn);
+	if (r)
+		kvm_flush_remote_tlbs(kvm);
+	spin_unlock(&kvm->mmu_lock);
+
+	return r;
+}
+EXPORT_SYMBOL_GPL(kvm_mmu_protect_page);
+
 /*
  * The function is based on mtrr_type_lookup() in
  * arch/x86/kernel/cpu/mtrr/generic.c
@@ -4146,12 +4160,68 @@ static bool is_mmio_page_fault(struct kvm_vcpu *vcpu, gva_t addr)
 	return vcpu_match_mmio_gva(vcpu, addr);
 }
 
+// try to handle fault caused by write to monitored (mwait) page
+// FIXME: aim for better integration between this and FNAME(page_fault)() and
+// kvm_mmu_page_fault() below. For now, this is proof-of-concept code.
+static bool handle_mwait_write_fault(struct kvm_vcpu *vcpu, gva_t gva,
+					void *in, int in_len)
+{
+	gpa_t gpa;
+	struct kvm_mwait_pg *p, *mwp = NULL;
+	struct kvm_vcpu_arch *v, *u;
+	bool r = false;
+
+	gpa = kvm_mmu_gva_to_gpa_system(vcpu, gva, NULL);
+	if (gpa == UNMAPPED_GVA)
+		goto ul_out;
+
+	mutex_lock(&vcpu->kvm->arch.mwait_lock);
+
+	/* is gpa matching a monitored (mwait) page? */
+	list_for_each_entry(p, &vcpu->kvm->arch.mwait_pg_list, link)
+		if (p->gpa == gpa) {
+			mwp = p;
+			break;
+		}
+	if (mwp == NULL)
+		goto out;
+
+	mwp->accessed = 1;
+
+	if (x86_emulate_instruction(vcpu, gva,
+				    EMULTYPE_RETRY, in, in_len) != EMULATE_DONE)
+		goto out;
+
+	/* disarm all VCPUs monitoring this page, waking them if needed */
+	list_for_each_entry_safe(v, u, &mwp->vcpu_list, mw_link) {
+		list_del(&v->mw_link);
+		v->mwp = NULL;
+		if (v->mp_state == KVM_MP_STATE_MWAIT)
+			v->mp_state = KVM_MP_STATE_RUNNABLE;
+	}
+
+	// What if the mwait is woken up by an interrupt instead of a write ?
+	// It might remain "armed" on its old mwait page, but any subsequent
+	// MONITOR instruction would replace that, so I don't think we need
+	// to worry about it...
+
+	r = true;
+out:
+	mutex_unlock(&vcpu->kvm->arch.mwait_lock);
+ul_out:
+	return r;
+}
+
 int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code,
 		       void *insn, int insn_len)
 {
 	int r, emulation_type = EMULTYPE_RETRY;
 	enum emulation_result er;
 
+	/* writing to MONITORed memory area ? */
+	if (handle_mwait_write_fault(vcpu, cr2, insn, insn_len))
+		return 1;
+
 	r = vcpu->arch.mmu.page_fault(vcpu, cr2, error_code, false);
 	if (r < 0)
 		goto out;
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index e81df8f..638704c 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -3262,6 +3262,18 @@ static int pause_interception(struct vcpu_svm *svm)
 	return 1;
 }
 
+static int monitor_interception(struct vcpu_svm *svm)
+{
+	skip_emulated_instruction(&(svm->vcpu));
+	return kvm_emulate_monitor(&(svm->vcpu));
+}
+
+static int mwait_interception(struct vcpu_svm *svm)
+{
+	skip_emulated_instruction(&(svm->vcpu));
+	return kvm_emulate_mwait(&(svm->vcpu));
+}
+
 static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = {
 	[SVM_EXIT_READ_CR0]			= cr_interception,
 	[SVM_EXIT_READ_CR3]			= cr_interception,
@@ -3319,8 +3331,8 @@ static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = {
 	[SVM_EXIT_CLGI]				= clgi_interception,
 	[SVM_EXIT_SKINIT]			= skinit_interception,
 	[SVM_EXIT_WBINVD]                       = emulate_on_interception,
-	[SVM_EXIT_MONITOR]			= invalid_op_interception,
-	[SVM_EXIT_MWAIT]			= invalid_op_interception,
+	[SVM_EXIT_MONITOR]			= monitor_interception,
+	[SVM_EXIT_MWAIT]			= mwait_interception,
 	[SVM_EXIT_XSETBV]			= xsetbv_interception,
 	[SVM_EXIT_NPF]				= pf_interception,
 };
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index a06f101..a7382e1 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -5603,6 +5603,18 @@ static int handle_invalid_op(struct kvm_vcpu *vcpu)
 	return 1;
 }
 
+static int handle_monitor(struct kvm_vcpu *vcpu)
+{
+	skip_emulated_instruction(vcpu);
+	return kvm_emulate_monitor(vcpu);
+}
+
+static int handle_mwait(struct kvm_vcpu *vcpu)
+{
+	skip_emulated_instruction(vcpu);
+	return kvm_emulate_mwait(vcpu);
+}
+
 /*
  * To run an L2 guest, we need a vmcs02 based on the L1-specified vmcs12.
  * We could reuse a single VMCS for all the L2 guests, but we also want the
@@ -6483,8 +6495,8 @@ static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
 	[EXIT_REASON_EPT_VIOLATION]	      = handle_ept_violation,
 	[EXIT_REASON_EPT_MISCONFIG]           = handle_ept_misconfig,
 	[EXIT_REASON_PAUSE_INSTRUCTION]       = handle_pause,
-	[EXIT_REASON_MWAIT_INSTRUCTION]	      = handle_invalid_op,
-	[EXIT_REASON_MONITOR_INSTRUCTION]     = handle_invalid_op,
+	[EXIT_REASON_MWAIT_INSTRUCTION]	      = handle_mwait,
+	[EXIT_REASON_MONITOR_INSTRUCTION]     = handle_monitor,
 	[EXIT_REASON_INVEPT]                  = handle_invept,
 };
 
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 39c28f09..8edc1be 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -5592,6 +5592,70 @@ int kvm_emulate_halt(struct kvm_vcpu *vcpu)
 }
 EXPORT_SYMBOL_GPL(kvm_emulate_halt);
 
+int kvm_emulate_monitor(struct kvm_vcpu *vcpu)
+{
+	gva_t gva;
+	gpa_t gpa;
+	struct kvm_mwait_pg *p;
+
+	/* emulate as NOP if no-kvm-irqchip */
+	if (!irqchip_in_kernel(vcpu->kvm))
+		return 1;
+
+	mutex_lock(&vcpu->kvm->arch.mwait_lock);
+
+	/* relinguish any previously monitored mwait page */
+	if (vcpu->arch.mwp != NULL) {
+		list_del(&vcpu->arch.mw_link);
+		vcpu->arch.mwp->accessed = 1;
+		vcpu->arch.mwp = NULL;
+	}
+
+	gva = kvm_register_read(vcpu, VCPU_REGS_RAX);
+	gpa = kvm_mmu_gva_to_gpa_system(vcpu, gva, NULL);
+	if (gpa == UNMAPPED_GVA)
+		goto out;       /* let some write op map the page first */
+
+	/* does the mwait page we're looking for already exist? */
+	list_for_each_entry(p, &vcpu->kvm->arch.mwait_pg_list, link)
+		if (p->gpa == gpa) {
+			vcpu->arch.mwp = p;
+			break;
+		}
+	if (vcpu->arch.mwp == NULL) { /* no, add new mwait page */
+		if (!kvm_mmu_protect_page(vcpu->kvm, gpa_to_gfn(gpa)))
+			goto out;
+		p = kmalloc(sizeof(struct kvm_mwait_pg), GFP_KERNEL);
+		p->gpa = gpa;
+		INIT_LIST_HEAD(&p->vcpu_list);
+		list_add(&p->link, &vcpu->kvm->arch.mwait_pg_list);
+
+		vcpu->arch.mwp = p;
+	}
+
+	/* link this VCPU into list of VCPUs monitoring this mwait page */
+	list_add(&vcpu->arch.mw_link, &vcpu->arch.mwp->vcpu_list);
+
+out:
+	mutex_unlock(&vcpu->kvm->arch.mwait_lock);
+	return 1;
+}
+EXPORT_SYMBOL_GPL(kvm_emulate_monitor);
+
+int kvm_emulate_mwait(struct kvm_vcpu *vcpu)
+{
+	/* emulate as NOP if no-kvm-irqchip */
+	if (!irqchip_in_kernel(vcpu->kvm))
+		return 1;
+
+	mutex_lock(&vcpu->kvm->arch.mwait_lock);
+	if (vcpu->arch.mwp != NULL)
+		vcpu->arch.mp_state = KVM_MP_STATE_MWAIT;
+	mutex_unlock(&vcpu->kvm->arch.mwait_lock);
+	return 1;
+}
+EXPORT_SYMBOL_GPL(kvm_emulate_mwait);
+
 int kvm_hv_hypercall(struct kvm_vcpu *vcpu)
 {
 	u64 param, ingpa, outgpa, ret;
@@ -6077,6 +6141,7 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
 			if (kvm_check_request(KVM_REQ_UNHALT, vcpu)) {
 				kvm_apic_accept_events(vcpu);
 				switch(vcpu->arch.mp_state) {
+				case KVM_MP_STATE_MWAIT:
 				case KVM_MP_STATE_HALTED:
 					vcpu->arch.pv.pv_unhalted = false;
 					vcpu->arch.mp_state =
@@ -6961,6 +7026,8 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
 	kvm_async_pf_hash_reset(vcpu);
 	kvm_pmu_init(vcpu);
 
+	vcpu->arch.mwp = NULL;
+
 	return 0;
 fail_free_wbinvd_dirty_mask:
 	free_cpumask_var(vcpu->arch.wbinvd_dirty_mask);
@@ -7013,6 +7080,9 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
 
 	pvclock_update_vm_gtod_copy(kvm);
 
+	mutex_init(&kvm->arch.mwait_lock);
+	INIT_LIST_HEAD(&kvm->arch.mwait_pg_list);
+
 	return 0;
 }
 
@@ -7254,8 +7324,10 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
 		|| kvm_apic_has_events(vcpu)
 		|| vcpu->arch.pv.pv_unhalted
 		|| atomic_read(&vcpu->arch.nmi_queued) ||
-		(kvm_arch_interrupt_allowed(vcpu) &&
-		 kvm_cpu_has_interrupt(vcpu));
+		(kvm_cpu_has_interrupt(vcpu) &&
+		 (kvm_arch_interrupt_allowed(vcpu) ||
+		  (vcpu->arch.mp_state == KVM_MP_STATE_MWAIT &&
+		   kvm_register_read(vcpu, VCPU_REGS_RCX) & 0x01)));
 }
 
 int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu)
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 932d7f2..a4925fc 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -398,6 +398,7 @@ struct kvm_vapic_addr {
 #define KVM_MP_STATE_INIT_RECEIVED     2
 #define KVM_MP_STATE_HALTED            3
 #define KVM_MP_STATE_SIPI_RECEIVED     4
+#define KVM_MP_STATE_MWAIT             5
 
 struct kvm_mp_state {
 	__u32 mp_state;