Message-ID: <c7629b0b246593ccf71661f2f8989e3a3acc3cbe.camel@redhat.com>
Date:   Sun, 04 Dec 2022 19:18:55 +0200
From:   Maxim Levitsky <mlevitsk@...hat.com>
To:     kvm@...r.kernel.org
Cc:     Sandipan Das <sandipan.das@....com>,
        Paolo Bonzini <pbonzini@...hat.com>,
        Jim Mattson <jmattson@...gle.com>,
        Peter Zijlstra <peterz@...radead.org>,
        Dave Hansen <dave.hansen@...ux.intel.com>,
        Borislav Petkov <bp@...en8.de>,
        Pawan Gupta <pawan.kumar.gupta@...ux.intel.com>,
        Thomas Gleixner <tglx@...utronix.de>,
        Ingo Molnar <mingo@...hat.com>,
        Josh Poimboeuf <jpoimboe@...nel.org>,
        Daniel Sneddon <daniel.sneddon@...ux.intel.com>,
        Jiaxi Chen <jiaxi.chen@...ux.intel.com>,
        Babu Moger <babu.moger@....com>, linux-kernel@...r.kernel.org,
        Jing Liu <jing2.liu@...el.com>,
        Wyes Karny <wyes.karny@....com>, x86@...nel.org,
        "H. Peter Anvin" <hpa@...or.com>,
        Sean Christopherson <seanjc@...gle.com>,
        Santosh Shukla <santosh.shukla@....com>
Subject: Re: [PATCH v2 10/11] KVM: SVM: implement support for vNMI

On Tue, 2022-11-29 at 21:37 +0200, Maxim Levitsky wrote:
> This patch implements support for injecting pending
> NMIs via the new .kvm_x86_set_hw_nmi_pending callback,
> using AMD's vNMI feature.
> 
> Note that the vNMI can't cause a VM exit, while a VM exit
> is needed when a nested guest intercepts NMIs.
> 
> Therefore, to avoid breaking nesting, the vNMI is inhibited while
> a nested guest is running, and the legacy NMI window
> detection and delivery method is used instead.
> 
> While it is possible to pass the vNMI through when a nested guest
> doesn't intercept NMIs, such usage is very uncommon, and it's
> not worth optimizing for.
> 
> Signed-off-by: Santosh Shukla <santosh.shukla@....com>
> Signed-off-by: Maxim Levitsky <mlevitsk@...hat.com>
> ---
>  arch/x86/kvm/svm/nested.c |  42 +++++++++++++++
>  arch/x86/kvm/svm/svm.c    | 111 ++++++++++++++++++++++++++++++--------
>  arch/x86/kvm/svm/svm.h    |  10 ++++
>  3 files changed, 140 insertions(+), 23 deletions(-)
> 
> diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
> index e891318595113e..5bea672bf8b12d 100644
> --- a/arch/x86/kvm/svm/nested.c
> +++ b/arch/x86/kvm/svm/nested.c
> @@ -623,6 +623,42 @@ static bool is_evtinj_nmi(u32 evtinj)
>  	return type == SVM_EVTINJ_TYPE_NMI;
>  }


A bit of self-review: I noticed a few mistakes.

>  
> +static void nested_svm_save_vnmi(struct vcpu_svm *svm)
> +{
> +	struct vmcb *vmcb01 = svm->vmcb01.ptr;
> +
> +	/*
> +	 * Copy the vNMI state back to software NMI tracking state
> +	 * for the duration of the nested run
> +	 */
> +
> +	svm->nmi_masked = vmcb01->control.int_ctl & V_NMI_MASK;
> +	svm->vcpu.arch.nmi_pending += vmcb01->control.int_ctl & V_NMI_PENDING;
Sorry for a mistake here: nmi_pending is a counter, while V_NMI_PENDING is a single mask bit, so the bit should be converted to a boolean first, something like this:


if (vmcb01->control.int_ctl & V_NMI_PENDING)
	svm->vcpu.arch.nmi_pending++;
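
With that, the whole helper would read something like this (just a sketch of
the same code with the fix folded in):

static void nested_svm_save_vnmi(struct vcpu_svm *svm)
{
	struct vmcb *vmcb01 = svm->vmcb01.ptr;

	/*
	 * Copy the vNMI state back to the software NMI tracking state
	 * for the duration of the nested run.
	 */
	svm->nmi_masked = vmcb01->control.int_ctl & V_NMI_MASK;

	/*
	 * V_NMI_PENDING is a single status bit while nmi_pending is a
	 * counter, so test the bit and bump the counter by one.
	 */
	if (vmcb01->control.int_ctl & V_NMI_PENDING)
		svm->vcpu.arch.nmi_pending++;
}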



> +}
> +
> +static void nested_svm_restore_vnmi(struct vcpu_svm *svm)
> +{
> +	struct kvm_vcpu *vcpu = &svm->vcpu;
> +	struct vmcb *vmcb01 = svm->vmcb01.ptr;
> +
> +	/*
> +	 * Restore the vNMI state from the software NMI tracking state
> +	 * after a nested run
> +	 */
> +
> +	if (svm->nmi_masked)
> +		vmcb01->control.int_ctl |= V_NMI_MASK;
> +	else
> +		vmcb01->control.int_ctl &= ~V_NMI_MASK;
> +
> +	if (vcpu->arch.nmi_pending) {
> +		vcpu->arch.nmi_pending--;
> +		vmcb01->control.int_ctl |= V_NMI_PENDING;
> +	} else
> +		vmcb01->control.int_ctl &= ~V_NMI_PENDING;
> +}
> +
> +
>  static void nested_vmcb02_prepare_control(struct vcpu_svm *svm,
>  					  unsigned long vmcb12_rip,
>  					  unsigned long vmcb12_csbase)
> @@ -646,6 +682,9 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm,
>  	else
>  		int_ctl_vmcb01_bits |= (V_GIF_MASK | V_GIF_ENABLE_MASK);
>  
> +	if (vnmi)
> +		nested_svm_save_vnmi(svm);
> +
>  	/* Copied from vmcb01.  msrpm_base can be overwritten later.  */
>  	vmcb02->control.nested_ctl = vmcb01->control.nested_ctl;
>  	vmcb02->control.iopm_base_pa = vmcb01->control.iopm_base_pa;
> @@ -1049,6 +1088,9 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
>  		svm_update_lbrv(vcpu);
>  	}
>  
> +	if (vnmi)
> +		nested_svm_restore_vnmi(svm);
> +
>  	/*
>  	 * On vmexit the  GIF is set to false and
>  	 * no event can be injected in L1.
> diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
> index cfed6ab29c839a..bf10adcf3170a8 100644
> --- a/arch/x86/kvm/svm/svm.c
> +++ b/arch/x86/kvm/svm/svm.c
> @@ -230,6 +230,8 @@ module_param(dump_invalid_vmcb, bool, 0644);
>  bool intercept_smi = true;
>  module_param(intercept_smi, bool, 0444);
>  
> +bool vnmi = true;
> +module_param(vnmi, bool, 0444);
>  
>  static bool svm_gp_erratum_intercept = true;
>  
> @@ -1299,6 +1301,9 @@ static void init_vmcb(struct kvm_vcpu *vcpu)
>  	if (kvm_vcpu_apicv_active(vcpu))
>  		avic_init_vmcb(svm, vmcb);
>  
> +	if (vnmi)
> +		svm->vmcb->control.int_ctl |= V_NMI_ENABLE;
> +
>  	if (vgif) {
>  		svm_clr_intercept(svm, INTERCEPT_STGI);
>  		svm_clr_intercept(svm, INTERCEPT_CLGI);
> @@ -3487,6 +3492,39 @@ static void svm_inject_nmi(struct kvm_vcpu *vcpu)
>  	++vcpu->stat.nmi_injections;
>  }
>  
> +
> +static bool svm_get_hw_nmi_pending(struct kvm_vcpu *vcpu)
> +{
> +	struct vcpu_svm *svm = to_svm(vcpu);
> +
> +	if (!is_vnmi_enabled(svm))
> +		return false;
> +
> +	return !!(svm->vmcb->control.int_ctl & V_NMI_MASK);
> +}
> +
> +static bool svm_set_hw_nmi_pending(struct kvm_vcpu *vcpu)
> +{
> +	struct vcpu_svm *svm = to_svm(vcpu);
> +
> +	if (!is_vnmi_enabled(svm))
> +		return false;
> +
> +	if (svm->vmcb->control.int_ctl & V_NMI_PENDING)
> +		return false;
> +
> +	svm->vmcb->control.int_ctl |= V_NMI_PENDING;
> +	vmcb_mark_dirty(svm->vmcb, VMCB_INTR);

I think vmcb_mark_dirty() is no longer needed here, as KVM always leaves the
VMCB_INTR dirty bit set due to the vTPR update.
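
If so, the helper would reduce to something like this (a sketch, assuming the
rest of the hunk stays as posted):

static bool svm_set_hw_nmi_pending(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);

	if (!is_vnmi_enabled(svm))
		return false;

	if (svm->vmcb->control.int_ctl & V_NMI_PENDING)
		return false;

	/* No vmcb_mark_dirty(): VMCB_INTR is assumed to be dirty already */
	svm->vmcb->control.int_ctl |= V_NMI_PENDING;

	/*
	 * The NMI isn't technically injected yet, but this rough
	 * estimate should be good enough.
	 */
	++vcpu->stat.nmi_injections;

	return true;
}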


Best regards,
	Maxim Levitsky

> +
> +	/*
> +	 * NMI isn't yet technically injected but
> +	 * this rough estimation should be good enough
> +	 */
> +	++vcpu->stat.nmi_injections;
> +
> +	return true;
> +}
> +
>  static void svm_inject_irq(struct kvm_vcpu *vcpu, bool reinjected)
>  {
>  	struct vcpu_svm *svm = to_svm(vcpu);
> @@ -3582,11 +3620,38 @@ static void svm_update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
>  		svm_set_intercept(svm, INTERCEPT_CR8_WRITE);
>  }
>  
> +static bool svm_get_nmi_mask(struct kvm_vcpu *vcpu)
> +{
> +	struct vcpu_svm *svm = to_svm(vcpu);
> +
> +	if (is_vnmi_enabled(svm))
> +		return svm->vmcb->control.int_ctl & V_NMI_MASK;
> +	else
> +		return svm->nmi_masked;
> +}
> +
> +static void svm_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
> +{
> +	struct vcpu_svm *svm = to_svm(vcpu);
> +
> +	if (is_vnmi_enabled(svm)) {
> +		if (masked)
> +			svm->vmcb->control.int_ctl |= V_NMI_MASK;
> +		else
> +			svm->vmcb->control.int_ctl &= ~V_NMI_MASK;
> +	} else {
> +		svm->nmi_masked = masked;
> +		if (masked)
> +			svm_enable_iret_interception(svm);
> +		else
> +			svm_disable_iret_interception(svm);
> +	}
> +}
> +
>  bool svm_nmi_blocked(struct kvm_vcpu *vcpu)
>  {
>  	struct vcpu_svm *svm = to_svm(vcpu);
>  	struct vmcb *vmcb = svm->vmcb;
> -	bool ret;
>  
>  	if (!gif_set(svm))
>  		return true;
> @@ -3594,10 +3659,10 @@ bool svm_nmi_blocked(struct kvm_vcpu *vcpu)
>  	if (is_guest_mode(vcpu) && nested_exit_on_nmi(svm))
>  		return false;
>  
> -	ret = (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) ||
> -	      (svm->nmi_masked);
> +	if (svm_get_nmi_mask(vcpu))
> +		return true;
>  
> -	return ret;
> +	return vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK;
>  }
>  
>  static int svm_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
> @@ -3615,24 +3680,6 @@ static int svm_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
>  	return 1;
>  }
>  
> -static bool svm_get_nmi_mask(struct kvm_vcpu *vcpu)
> -{
> -	return to_svm(vcpu)->nmi_masked;
> -}
> -
> -static void svm_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
> -{
> -	struct vcpu_svm *svm = to_svm(vcpu);
> -
> -	if (masked) {
> -		svm->nmi_masked = true;
> -		svm_enable_iret_interception(svm);
> -	} else {
> -		svm->nmi_masked = false;
> -		svm_disable_iret_interception(svm);
> -	}
> -}
> -
>  bool svm_interrupt_blocked(struct kvm_vcpu *vcpu)
>  {
>  	struct vcpu_svm *svm = to_svm(vcpu);
> @@ -3725,10 +3772,16 @@ static void svm_enable_nmi_window(struct kvm_vcpu *vcpu)
>  	/*
>  	 * Something prevents NMI from been injected. Single step over possible
>  	 * problem (IRET or exception injection or interrupt shadow)
> +	 *
> +	 * With vNMI we should never need an NMI window
> +	 * (we can always inject vNMI either by setting VNMI_PENDING or by EVENTINJ)
>  	 */
> +	if (WARN_ON_ONCE(is_vnmi_enabled(svm)))
> +		return;
> +
>  	svm->nmi_singlestep_guest_rflags = svm_get_rflags(vcpu);
> -	svm->nmi_singlestep = true;
>  	svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF);
> +	svm->nmi_singlestep = true;
>  }
>  
>  static void svm_flush_tlb_current(struct kvm_vcpu *vcpu)
> @@ -4770,6 +4823,8 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
>  	.patch_hypercall = svm_patch_hypercall,
>  	.inject_irq = svm_inject_irq,
>  	.inject_nmi = svm_inject_nmi,
> +	.get_hw_nmi_pending = svm_get_hw_nmi_pending,
> +	.set_hw_nmi_pending = svm_set_hw_nmi_pending,
>  	.inject_exception = svm_inject_exception,
>  	.cancel_injection = svm_cancel_injection,
>  	.interrupt_allowed = svm_interrupt_allowed,
> @@ -5058,6 +5113,16 @@ static __init int svm_hardware_setup(void)
>  			pr_info("Virtual GIF supported\n");
>  	}
>  
> +
> +	vnmi = vgif && vnmi && boot_cpu_has(X86_FEATURE_AMD_VNMI);
> +	if (vnmi)
> +		pr_info("Virtual NMI enabled\n");
> +
> +	if (!vnmi) {
> +		svm_x86_ops.get_hw_nmi_pending = NULL;
> +		svm_x86_ops.set_hw_nmi_pending = NULL;
> +	}
> +
>  	if (lbrv) {
>  		if (!boot_cpu_has(X86_FEATURE_LBRV))
>  			lbrv = false;
> diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
> index 587ddc150f9f34..0b7e1790fadde1 100644
> --- a/arch/x86/kvm/svm/svm.h
> +++ b/arch/x86/kvm/svm/svm.h
> @@ -35,6 +35,7 @@ extern u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly;
>  extern bool npt_enabled;
>  extern int vgif;
>  extern bool intercept_smi;
> +extern bool vnmi;
>  
>  enum avic_modes {
>  	AVIC_MODE_NONE = 0,
> @@ -553,6 +554,15 @@ static inline bool is_x2apic_msrpm_offset(u32 offset)
>  	       (msr < (APIC_BASE_MSR + 0x100));
>  }
>  
> +static inline bool is_vnmi_enabled(struct vcpu_svm *svm)
> +{
> +	/* L1's vNMI is inhibited while nested guest is running */
> +	if (is_guest_mode(&svm->vcpu))
> +		return false;
> +
> +	return !!(svm->vmcb01.ptr->control.int_ctl & V_NMI_ENABLE);
> +}
> +
>  /* svm.c */
>  #define MSR_INVALID				0xffffffffU
>  

