lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Date:   Fri, 25 Feb 2022 19:19:32 +0200
From:   Maxim Levitsky <mlevitsk@...hat.com>
To:     Zeng Guang <guang.zeng@...el.com>,
        Paolo Bonzini <pbonzini@...hat.com>,
        Sean Christopherson <seanjc@...gle.com>,
        Vitaly Kuznetsov <vkuznets@...hat.com>,
        Wanpeng Li <wanpengli@...cent.com>,
        Jim Mattson <jmattson@...gle.com>,
        Joerg Roedel <joro@...tes.org>, kvm@...r.kernel.org,
        Dave Hansen <dave.hansen@...ux.intel.com>,
        Tony Luck <tony.luck@...el.com>,
        Kan Liang <kan.liang@...ux.intel.com>,
        Thomas Gleixner <tglx@...utronix.de>,
        Ingo Molnar <mingo@...hat.com>, Borislav Petkov <bp@...en8.de>,
        "H. Peter Anvin" <hpa@...or.com>,
        Kim Phillips <kim.phillips@....com>,
        Jarkko Sakkinen <jarkko@...nel.org>,
        Jethro Beekman <jethro@...tanix.com>,
        Kai Huang <kai.huang@...el.com>
Cc:     x86@...nel.org, linux-kernel@...r.kernel.org,
        Robert Hu <robert.hu@...el.com>, Gao Chao <chao.gao@...el.com>
Subject: Re: [PATCH v6 7/9] KVM: VMX: enable IPI virtualization

On Fri, 2022-02-25 at 16:22 +0800, Zeng Guang wrote:
> From: Gao Chao <chao.gao@...el.com>
> 
> With IPI virtualization enabled, the processor emulates writes to
> APIC registers that would send IPIs. The processor sets the bit
> corresponding to the vector in target vCPU's PIR and may send a
> notification (IPI) specified by NDST and NV fields in target vCPU's
> Posted-Interrupt Descriptor (PID). It is similar to what IOMMU
> engine does when dealing with posted interrupt from devices.
> 
> A PID-pointer table is used by the processor to locate the PID of a
> vCPU with the vCPU's APIC ID.
> 
> Like VT-d PI, if a vCPU goes to blocked state, VMM needs to switch its
> notification vector to wakeup vector. This can ensure that when an IPI
> for blocked vCPUs arrives, VMM can get control and wake up blocked
> vCPUs. And if a VCPU is preempted, its posted interrupt notification
> is suppressed.
> 
> Note that IPI virtualization can only virualize physical-addressing,
> flat mode, unicast IPIs. Sending other IPIs would still cause a
> trap-like APIC-write VM-exit and need to be handled by VMM.
> 
> Signed-off-by: Gao Chao <chao.gao@...el.com>
> Signed-off-by: Zeng Guang <guang.zeng@...el.com>
> ---
>  arch/x86/include/asm/vmx.h         |  8 ++++
>  arch/x86/include/asm/vmxfeatures.h |  2 +
>  arch/x86/kvm/vmx/capabilities.h    |  6 +++
>  arch/x86/kvm/vmx/posted_intr.c     | 12 ++++-
>  arch/x86/kvm/vmx/vmx.c             | 74 +++++++++++++++++++++++++++---
>  arch/x86/kvm/vmx/vmx.h             |  3 ++
>  6 files changed, 97 insertions(+), 8 deletions(-)
> 
> diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
> index 8c929596a299..b79b6438acaa 100644
> --- a/arch/x86/include/asm/vmx.h
> +++ b/arch/x86/include/asm/vmx.h
> @@ -76,6 +76,11 @@
>  #define SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE	VMCS_CONTROL_BIT(USR_WAIT_PAUSE)
>  #define SECONDARY_EXEC_BUS_LOCK_DETECTION	VMCS_CONTROL_BIT(BUS_LOCK_DETECTION)
>  
> +/*
> + * Definitions of Tertiary Processor-Based VM-Execution Controls.
> + */
> +#define TERTIARY_EXEC_IPI_VIRT			VMCS_CONTROL_BIT(IPI_VIRT)
> +
>  #define PIN_BASED_EXT_INTR_MASK                 VMCS_CONTROL_BIT(INTR_EXITING)
>  #define PIN_BASED_NMI_EXITING                   VMCS_CONTROL_BIT(NMI_EXITING)
>  #define PIN_BASED_VIRTUAL_NMIS                  VMCS_CONTROL_BIT(VIRTUAL_NMIS)
> @@ -159,6 +164,7 @@ static inline int vmx_misc_mseg_revid(u64 vmx_misc)
>  enum vmcs_field {
>  	VIRTUAL_PROCESSOR_ID            = 0x00000000,
>  	POSTED_INTR_NV                  = 0x00000002,
> +	LAST_PID_POINTER_INDEX		= 0x00000008,
>  	GUEST_ES_SELECTOR               = 0x00000800,
>  	GUEST_CS_SELECTOR               = 0x00000802,
>  	GUEST_SS_SELECTOR               = 0x00000804,
> @@ -224,6 +230,8 @@ enum vmcs_field {
>  	TSC_MULTIPLIER_HIGH             = 0x00002033,
>  	TERTIARY_VM_EXEC_CONTROL	= 0x00002034,
>  	TERTIARY_VM_EXEC_CONTROL_HIGH	= 0x00002035,
> +	PID_POINTER_TABLE		= 0x00002042,
> +	PID_POINTER_TABLE_HIGH		= 0x00002043,
>  	GUEST_PHYSICAL_ADDRESS          = 0x00002400,
>  	GUEST_PHYSICAL_ADDRESS_HIGH     = 0x00002401,
>  	VMCS_LINK_POINTER               = 0x00002800,
> diff --git a/arch/x86/include/asm/vmxfeatures.h b/arch/x86/include/asm/vmxfeatures.h
> index ff20776dc83b..7ce616af2db2 100644
> --- a/arch/x86/include/asm/vmxfeatures.h
> +++ b/arch/x86/include/asm/vmxfeatures.h
> @@ -86,4 +86,6 @@
>  #define VMX_FEATURE_ENCLV_EXITING	( 2*32+ 28) /* "" VM-Exit on ENCLV (leaf dependent) */
>  #define VMX_FEATURE_BUS_LOCK_DETECTION	( 2*32+ 30) /* "" VM-Exit when bus lock caused */
>  
> +/* Tertiary Processor-Based VM-Execution Controls, word 3 */
> +#define VMX_FEATURE_IPI_VIRT		(3*32 +  4) /* "" Enable IPI virtualization */
>  #endif /* _ASM_X86_VMXFEATURES_H */
> diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h
> index 31f3d88b3e4d..5f656c9e33be 100644
> --- a/arch/x86/kvm/vmx/capabilities.h
> +++ b/arch/x86/kvm/vmx/capabilities.h
> @@ -13,6 +13,7 @@ extern bool __read_mostly enable_ept;
>  extern bool __read_mostly enable_unrestricted_guest;
>  extern bool __read_mostly enable_ept_ad_bits;
>  extern bool __read_mostly enable_pml;
> +extern bool __read_mostly enable_ipiv;
>  extern int __read_mostly pt_mode;
>  
>  #define PT_MODE_SYSTEM		0
> @@ -283,6 +284,11 @@ static inline bool cpu_has_vmx_apicv(void)
>  		cpu_has_vmx_posted_intr();
>  }
>  
> +static inline bool cpu_has_vmx_ipiv(void)
> +{
> +	return vmcs_config.cpu_based_3rd_exec_ctrl & TERTIARY_EXEC_IPI_VIRT;
> +}
> +
>  static inline bool cpu_has_vmx_flexpriority(void)
>  {
>  	return cpu_has_vmx_tpr_shadow() &&
> diff --git a/arch/x86/kvm/vmx/posted_intr.c b/arch/x86/kvm/vmx/posted_intr.c
> index aa1fe9085d77..90124a30c074 100644
> --- a/arch/x86/kvm/vmx/posted_intr.c
> +++ b/arch/x86/kvm/vmx/posted_intr.c
> @@ -177,11 +177,21 @@ static void pi_enable_wakeup_handler(struct kvm_vcpu *vcpu)
>  	local_irq_restore(flags);
>  }
>  
> +static bool vmx_can_use_ipiv_pi(struct kvm *kvm)
> +{
> +	return irqchip_in_kernel(kvm) && enable_ipiv;
> +}
> +
> +static bool vmx_can_use_posted_interrupts(struct kvm *kvm)
> +{
> +	return vmx_can_use_ipiv_pi(kvm) || vmx_can_use_vtd_pi(kvm);

It took me a while to figure that out.
 
vmx_can_use_vtd_pi returns true when the VM can be targeted by posted
interrupts from the IOMMU, which leads to
 
1. update of the NV vector and SN bit on vcpu_load/vcpu_put to let
IOMMU knows where the vCPU really runs.
 
2. in vmx_pi_update_irte to configure the posted interrupts.
 
 
Now IPIv will also use the same NV vector and SN bit for IPI virtualization,
thus they have to be kept up to date on vcpu load/put.
 
I would appreciate a comment about this in vmx_can_use_posted_interrupts
because posted interrupts can mean too many things, like a host->guest
posted interrupt which is sent by just interrupt.
 
Maybe also rename the function to something like
 
vmx_need_up_to_date_nv_sn(). Sounds silly to me so
maybe something else.


> +}
> +
>  void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu)
>  {
>  	struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
>  
> -	if (!vmx_can_use_vtd_pi(vcpu->kvm))
> +	if (!vmx_can_use_posted_interrupts(vcpu->kvm))
I see here it is used.
>  		return;
>  
>  	if (kvm_vcpu_is_blocking(vcpu) && !vmx_interrupt_blocked(vcpu))
> diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
> index 7beba7a9f247..0cb141c277ef 100644
> --- a/arch/x86/kvm/vmx/vmx.c
> +++ b/arch/x86/kvm/vmx/vmx.c
> @@ -105,6 +105,9 @@ module_param(fasteoi, bool, S_IRUGO);
>  
>  module_param(enable_apicv, bool, S_IRUGO);
>  
> +bool __read_mostly enable_ipiv = true;
> +module_param(enable_ipiv, bool, 0444);
> +
>  /*
>   * If nested=1, nested virtualization is supported, i.e., guests may use
>   * VMX and be a hypervisor for its own guests. If nested=0, guests may not
> @@ -227,6 +230,11 @@ static const struct {
>  };
>  
>  #define L1D_CACHE_ORDER 4
> +
> +/* PID(Posted-Interrupt Descriptor)-pointer table entry is 64-bit long */
> +#define MAX_PID_TABLE_ORDER get_order(KVM_MAX_VCPU_IDS * sizeof(u64))
> +#define PID_TABLE_ENTRY_VALID 1
> +
>  static void *vmx_l1d_flush_pages;
>  
>  static int vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf)
> @@ -2543,7 +2551,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf,
>  	}
>  
>  	if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_TERTIARY_CONTROLS) {
> -		u64 opt3 = 0;
> +		u64 opt3 = TERTIARY_EXEC_IPI_VIRT;
>  		u64 min3 = 0;
>  
>  		if (adjust_vmx_controls_64(min3, opt3,
> @@ -3898,6 +3906,8 @@ static void vmx_update_msr_bitmap_x2apic(struct kvm_vcpu *vcpu)
>  		vmx_enable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_TMCCT), MSR_TYPE_RW);
>  		vmx_disable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_EOI), MSR_TYPE_W);
>  		vmx_disable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_SELF_IPI), MSR_TYPE_W);
> +		if (enable_ipiv)
> +			vmx_disable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_ICR),MSR_TYPE_RW);
>  	}
>  }
>  
> @@ -4219,14 +4229,21 @@ static void vmx_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
>  
>  	pin_controls_set(vmx, vmx_pin_based_exec_ctrl(vmx));
>  	if (cpu_has_secondary_exec_ctrls()) {
> -		if (kvm_vcpu_apicv_active(vcpu))
> +		if (kvm_vcpu_apicv_active(vcpu)) {
>  			secondary_exec_controls_setbit(vmx,
>  				      SECONDARY_EXEC_APIC_REGISTER_VIRT |
>  				      SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
> -		else
> +			if (cpu_has_tertiary_exec_ctrls() && enable_ipiv)
> +				tertiary_exec_controls_setbit(vmx,
> +						TERTIARY_EXEC_IPI_VIRT);
> +		} else {
>  			secondary_exec_controls_clearbit(vmx,
>  					SECONDARY_EXEC_APIC_REGISTER_VIRT |
>  					SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
> +			if (cpu_has_tertiary_exec_ctrls())
> +				tertiary_exec_controls_clearbit(vmx,
> +						TERTIARY_EXEC_IPI_VIRT);
> +		}

Why check for cpu_has_tertiary_exec_ctrls()? wouldn't it be always true
(enable_ipiv has to be turned to false if CPU doesn't support IPIv,
and if it does it will support tertiary exec controls).

I don't mind this as a precaution + consistency with other code.


>  	}
>  
>  	vmx_update_msr_bitmap_x2apic(vcpu);
> @@ -4260,7 +4277,16 @@ static u32 vmx_exec_control(struct vcpu_vmx *vmx)
>  
>  static u64 vmx_tertiary_exec_control(struct vcpu_vmx *vmx)
>  {
> -	return vmcs_config.cpu_based_3rd_exec_ctrl;
> +	u64 exec_control = vmcs_config.cpu_based_3rd_exec_ctrl;
> +
> +	/*
> +	 * IPI virtualization relies on APICv. Disable IPI
> +	 * virtualization if APICv is inhibited.
> +	 */
> +	if (!enable_ipiv || !kvm_vcpu_apicv_active(&vmx->vcpu))
> +		exec_control &= ~TERTIARY_EXEC_IPI_VIRT;

I am not 100% sure, but kvm_vcpu_apicv_active might not be the
best thing to check here, as it reflects per-cpu dynamic APICv inhibit.

It probably works, but it might be better to use enable_apicv
here and rely on normal APICv inhibit, and there inibit IPIv  as well
as you do in vmx_refresh_apicv_exec_ctrl/



> +
> +	return exec_control;
>  }
>  
>  /*
> @@ -4412,6 +4438,9 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx)
>  
>  static void init_vmcs(struct vcpu_vmx *vmx)
>  {
> +	struct kvm_vcpu *vcpu = &vmx->vcpu;
> +	struct kvm_vmx *kvm_vmx = to_kvm_vmx(vcpu->kvm);
> +
>  	if (nested)
>  		nested_vmx_set_vmcs_shadowing_bitmap();
>  
> @@ -4431,7 +4460,7 @@ static void init_vmcs(struct vcpu_vmx *vmx)
>  	if (cpu_has_tertiary_exec_ctrls())
>  		tertiary_exec_controls_set(vmx, vmx_tertiary_exec_control(vmx));
>  
> -	if (kvm_vcpu_apicv_active(&vmx->vcpu)) {
> +	if (kvm_vcpu_apicv_active(vcpu)) {

here too (pre-existing), I also not 100% sure that kvm_vcpu_apicv_active
should be used. I haven't studied APICv code that much to be 100% sure.


>  		vmcs_write64(EOI_EXIT_BITMAP0, 0);
>  		vmcs_write64(EOI_EXIT_BITMAP1, 0);
>  		vmcs_write64(EOI_EXIT_BITMAP2, 0);
> @@ -4441,6 +4470,13 @@ static void init_vmcs(struct vcpu_vmx *vmx)
>  
>  		vmcs_write16(POSTED_INTR_NV, POSTED_INTR_VECTOR);
>  		vmcs_write64(POSTED_INTR_DESC_ADDR, __pa((&vmx->pi_desc)));
> +
> +		if (enable_ipiv) {
> +			WRITE_ONCE(kvm_vmx->pid_table[vcpu->vcpu_id],
> +				__pa(&vmx->pi_desc) | PID_TABLE_ENTRY_VALID);
> +			vmcs_write64(PID_POINTER_TABLE, __pa(kvm_vmx->pid_table));
> +			vmcs_write16(LAST_PID_POINTER_INDEX, kvm_vmx->pid_last_index);
> +		}
>  	}
>  
>  	if (!kvm_pause_in_guest(vmx->vcpu.kvm)) {
> @@ -4492,7 +4528,7 @@ static void init_vmcs(struct vcpu_vmx *vmx)
>  		vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
>  	}
>  
> -	vmx_write_encls_bitmap(&vmx->vcpu, NULL);
> +	vmx_write_encls_bitmap(vcpu, NULL);

I might have separated the refactoring of using vcpu instead of &vmx->vcpu
in a separate patch, but I don't mind that that much.

>  
>  	if (vmx_pt_mode_is_host_guest()) {
>  		memset(&vmx->pt_desc, 0, sizeof(vmx->pt_desc));
> @@ -4508,7 +4544,7 @@ static void init_vmcs(struct vcpu_vmx *vmx)
>  
>  	if (cpu_has_vmx_tpr_shadow()) {
>  		vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0);
> -		if (cpu_need_tpr_shadow(&vmx->vcpu))
> +		if (cpu_need_tpr_shadow(vcpu))
>  			vmcs_write64(VIRTUAL_APIC_PAGE_ADDR,
>  				     __pa(vmx->vcpu.arch.apic->regs));
>  		vmcs_write32(TPR_THRESHOLD, 0);
> @@ -7165,6 +7201,18 @@ static int vmx_vm_init(struct kvm *kvm)
>  			break;
>  		}
>  	}
> +
> +	if (enable_ipiv) {
> +		struct page *pages;
> +
> +		pages = alloc_pages(GFP_KERNEL | __GFP_ZERO, MAX_PID_TABLE_ORDER);
> +		if (!pages)
> +			return -ENOMEM;
> +
> +		to_kvm_vmx(kvm)->pid_table = (void *)page_address(pages);
> +		to_kvm_vmx(kvm)->pid_last_index = KVM_MAX_VCPU_IDS - 1;
> +	}
> +
>  	return 0;
>  }
>  
> @@ -7756,6 +7804,14 @@ static bool vmx_check_apicv_inhibit_reasons(ulong bit)
>  	return supported & BIT(bit);
>  }
>  
> +static void vmx_vm_destroy(struct kvm *kvm)
> +{
> +	struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm);
> +
> +	if (kvm_vmx->pid_table)
> +		free_pages((unsigned long)kvm_vmx->pid_table, MAX_PID_TABLE_ORDER);

Maybe add a warning checking that ipiv was actually enabled.
Maybe this is overkill.


> +}
> +
>  static struct kvm_x86_ops vmx_x86_ops __initdata = {
>  	.name = "kvm_intel",
>  
> @@ -7768,6 +7824,7 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
>  
>  	.vm_size = sizeof(struct kvm_vmx),
>  	.vm_init = vmx_vm_init,
> +	.vm_destroy = vmx_vm_destroy,
>  
>  	.vcpu_create = vmx_create_vcpu,
>  	.vcpu_free = vmx_free_vcpu,
> @@ -8022,6 +8079,9 @@ static __init int hardware_setup(void)
>  	if (!enable_apicv)
>  		vmx_x86_ops.sync_pir_to_irr = NULL;
>  
> +	if (!enable_apicv || !cpu_has_vmx_ipiv())
> +		enable_ipiv = false;
> +
>  	if (cpu_has_vmx_tsc_scaling()) {
>  		kvm_has_tsc_control = true;
>  		kvm_max_tsc_scaling_ratio = KVM_VMX_TSC_MULTIPLIER_MAX;
> diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
> index d4a647d3ed4a..e7b0c00c9d43 100644
> --- a/arch/x86/kvm/vmx/vmx.h
> +++ b/arch/x86/kvm/vmx/vmx.h
> @@ -365,6 +365,9 @@ struct kvm_vmx {
>  	unsigned int tss_addr;
>  	bool ept_identity_pagetable_done;
>  	gpa_t ept_identity_map_addr;
> +	/* PID table for IPI virtualization */
> +	u64 *pid_table;
> +	u16 pid_last_index;
>  };
>  
>  bool nested_vmx_allowed(struct kvm_vcpu *vcpu);


I might have missed something, but overall looks good.

Best regards,
	Maxim Levitsky

Powered by blists - more mailing lists