lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Date: Fri, 26 Jan 2024 20:54:20 +0800
From: "Yang, Weijiang" <weijiang.yang@...el.com>
To: Chao Gao <chao.gao@...el.com>
CC: <seanjc@...gle.com>, <pbonzini@...hat.com>, <dave.hansen@...el.com>,
	<kvm@...r.kernel.org>, <linux-kernel@...r.kernel.org>, <x86@...nel.org>,
	<yuan.yao@...ux.intel.com>, <peterz@...radead.org>,
	<rick.p.edgecombe@...el.com>, <mlevitsk@...hat.com>, <john.allen@....com>
Subject: Re: [PATCH v9 24/27] KVM: x86: Enable CET virtualization for VMX and
 advertise to userspace

On 1/26/2024 3:50 PM, Chao Gao wrote:
> On Tue, Jan 23, 2024 at 06:41:57PM -0800, Yang Weijiang wrote:
>> Expose CET features to guest if KVM/host can support them, clear CPUID
>> feature bits if KVM/host cannot support.
>>
>> Set CPUID feature bits so that CET features are available in guest CPUID.
>> Add CR4.CET bit support in order to allow guest set CET master control
>> bit.
>>
>> Disable KVM CET feature if unrestricted_guest is unsupported/disabled as
>> KVM does not support emulating CET.
>>
>> The CET load-bits in VM_ENTRY/VM_EXIT control fields should be set to make
>> guest CET xstates isolated from host's.
>>
>> On platforms with VMX_BASIC[bit56] == 0, inject #CP at VMX entry with error
>> code will fail, and if VMX_BASIC[bit56] == 1, #CP injection with or without
>> error code is allowed. Disable CET feature bits if the MSR bit is cleared
>> so that nested VMM can inject #CP if and only if VMX_BASIC[bit56] == 1.
>>
>> Don't expose CET feature if either of {U,S}_CET xstate bits is cleared
>> in host XSS or if XSAVES isn't supported.
>>
>> CET MSR contents after reset, power-up and INIT are set to 0s, clears the
>> guest fpstate fields so that the guest MSRs are reset to 0s after the events.
>>
>> Signed-off-by: Yang Weijiang <weijiang.yang@...el.com>
>> Reviewed-by: Maxim Levitsky <mlevitsk@...hat.com>
>> ---
>> arch/x86/include/asm/kvm_host.h  |  2 +-
>> arch/x86/include/asm/msr-index.h |  1 +
>> arch/x86/kvm/cpuid.c             | 25 ++++++++++++++++++++-----
>> arch/x86/kvm/vmx/capabilities.h  |  6 ++++++
>> arch/x86/kvm/vmx/vmx.c           | 28 +++++++++++++++++++++++++++-
>> arch/x86/kvm/vmx/vmx.h           |  6 ++++--
>> arch/x86/kvm/x86.c               | 31 +++++++++++++++++++++++++++++--
>> arch/x86/kvm/x86.h               |  3 +++
>> 8 files changed, 91 insertions(+), 11 deletions(-)
>>
>> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
>> index 6efaaaa15945..161d0552be5f 100644
>> --- a/arch/x86/include/asm/kvm_host.h
>> +++ b/arch/x86/include/asm/kvm_host.h
>> @@ -134,7 +134,7 @@
>> 			  | X86_CR4_OSXSAVE | X86_CR4_SMEP | X86_CR4_FSGSBASE \
>> 			  | X86_CR4_OSXMMEXCPT | X86_CR4_LA57 | X86_CR4_VMXE \
>> 			  | X86_CR4_SMAP | X86_CR4_PKE | X86_CR4_UMIP \
>> -			  | X86_CR4_LAM_SUP))
>> +			  | X86_CR4_LAM_SUP | X86_CR4_CET))
>>
>> #define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR)
>>
>> diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
>> index 1d51e1850ed0..233e00c01e62 100644
>> --- a/arch/x86/include/asm/msr-index.h
>> +++ b/arch/x86/include/asm/msr-index.h
>> @@ -1102,6 +1102,7 @@
>> #define VMX_BASIC_MEM_TYPE_MASK	0x003c000000000000LLU
>> #define VMX_BASIC_MEM_TYPE_WB	6LLU
>> #define VMX_BASIC_INOUT		0x0040000000000000LLU
>> +#define VMX_BASIC_NO_HW_ERROR_CODE_CC	0x0100000000000000LLU
>>
>> /* Resctrl MSRs: */
>> /* - Intel: */
>> diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
>> index 95233b0879a3..fddc54991cd4 100644
>> --- a/arch/x86/kvm/cpuid.c
>> +++ b/arch/x86/kvm/cpuid.c
>> @@ -150,14 +150,14 @@ static int kvm_check_cpuid(struct kvm_vcpu *vcpu,
>> 			return -EINVAL;
>> 	}
>> 	/*
>> -	 * Prevent 32-bit guest launch if shadow stack is exposed as SSP
>> -	 * state is not defined for 32-bit SMRAM.
>> +	 * CET is not supported for 32-bit guest, prevent guest launch if
>> +	 * shadow stack or IBT is enabled for 32-bit guest.
>> 	 */
>> 	best = cpuid_entry2_find(entries, nent, 0x80000001,
>> 				 KVM_CPUID_INDEX_NOT_SIGNIFICANT);
>> 	if (best && !(best->edx & F(LM))) {
>> 		best = cpuid_entry2_find(entries, nent, 0x7, 0);
>> -		if (best && (best->ecx & F(SHSTK)))
>> +		if (best && ((best->ecx & F(SHSTK)) || (best->edx & F(IBT))))
> IBT has nothing to do with SSP. why bother to do this?

The intent is to prevent userspace from launching an x32 guest with CET bits
advertised in CPUIDs as CET won't be supported for x32 guest.

>
>> 			return -EINVAL;
>> 	}
>>
>> @@ -665,7 +665,7 @@ void kvm_set_cpu_caps(void)
>> 		F(AVX512_VPOPCNTDQ) | F(UMIP) | F(AVX512_VBMI2) | F(GFNI) |
>> 		F(VAES) | F(VPCLMULQDQ) | F(AVX512_VNNI) | F(AVX512_BITALG) |
>> 		F(CLDEMOTE) | F(MOVDIRI) | F(MOVDIR64B) | 0 /*WAITPKG*/ |
>> -		F(SGX_LC) | F(BUS_LOCK_DETECT)
>> +		F(SGX_LC) | F(BUS_LOCK_DETECT) | F(SHSTK)
>> 	);
>> 	/* Set LA57 based on hardware capability. */
>> 	if (cpuid_ecx(7) & F(LA57))
>> @@ -683,7 +683,8 @@ void kvm_set_cpu_caps(void)
>> 		F(SPEC_CTRL_SSBD) | F(ARCH_CAPABILITIES) | F(INTEL_STIBP) |
>> 		F(MD_CLEAR) | F(AVX512_VP2INTERSECT) | F(FSRM) |
>> 		F(SERIALIZE) | F(TSXLDTRK) | F(AVX512_FP16) |
>> -		F(AMX_TILE) | F(AMX_INT8) | F(AMX_BF16) | F(FLUSH_L1D)
>> +		F(AMX_TILE) | F(AMX_INT8) | F(AMX_BF16) | F(FLUSH_L1D) |
>> +		F(IBT)
>> 	);
>>
>> 	/* TSC_ADJUST and ARCH_CAPABILITIES are emulated in software. */
>> @@ -696,6 +697,20 @@ void kvm_set_cpu_caps(void)
>> 		kvm_cpu_cap_set(X86_FEATURE_INTEL_STIBP);
>> 	if (boot_cpu_has(X86_FEATURE_AMD_SSBD))
>> 		kvm_cpu_cap_set(X86_FEATURE_SPEC_CTRL_SSBD);
>> +	/*
>> +	 * Don't use boot_cpu_has() to check availability of IBT because the
>> +	 * feature bit is cleared in boot_cpu_data when ibt=off is applied
>> +	 * in host cmdline.
>> +	 *
>> +	 * As currently there's no HW bug which requires disabling IBT feature
>> +	 * while CPU can enumerate it, host cmdline option ibt=off is most
>> +	 * likely due to administrative reason on host side, so KVM refers to
>> +	 * CPU CPUID enumeration to enable the feature. In future if there's
>> +	 * actually some bug clobbered ibt=off option, then enforce additional
>> +	 * check here to disable the support in KVM.
>> +	 */
>> +	if (cpuid_edx(7) & F(IBT))
>> +		kvm_cpu_cap_set(X86_FEATURE_IBT);
> This can be done in a separate patch.

This can be put in a separate patch but IMHO it's still related to CET enabling
setup, so put it in this patch is somewhat reasonable :-)

>
> And we don't know whether IBT is cleared due to ibt=off. It could be due to
> lack of IBT on some CPUs; advertising IBT in this case is incorrect.

cpuid_edx(7) returns bootup CPUID available on platform, if it's there KVM
can enable the support. Currently KVM checks the minimum correlation with
host CET enabling status before exposes the features to guest. Unless there're
some HW bugs or the xsave dependency is broken, it exposes the features to
guest if platform can support them.

>> 	kvm_cpu_cap_mask(CPUID_7_1_EAX,
>> 		F(AVX_VNNI) | F(AVX512_BF16) | F(CMPCCXADD) |
>> diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h
>> index ee8938818c8a..e12bc233d88b 100644
>> --- a/arch/x86/kvm/vmx/capabilities.h
>> +++ b/arch/x86/kvm/vmx/capabilities.h
>> @@ -79,6 +79,12 @@ static inline bool cpu_has_vmx_basic_inout(void)
>> 	return	(((u64)vmcs_config.basic_cap << 32) & VMX_BASIC_INOUT);
>> }
>>
>> +static inline bool cpu_has_vmx_basic_no_hw_errcode(void)
>> +{
>> +	return	((u64)vmcs_config.basic_cap << 32) &
>> +		 VMX_BASIC_NO_HW_ERROR_CODE_CC;
>> +}
>> +
>> static inline bool cpu_has_virtual_nmis(void)
>> {
>> 	return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS &&
>> diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
>> index 3993afbacd51..ef7aca954228 100644
>> --- a/arch/x86/kvm/vmx/vmx.c
>> +++ b/arch/x86/kvm/vmx/vmx.c
>> @@ -2609,6 +2609,7 @@ static int setup_vmcs_config(struct vmcs_config *vmcs_conf,
>> 		{ VM_ENTRY_LOAD_IA32_EFER,		VM_EXIT_LOAD_IA32_EFER },
>> 		{ VM_ENTRY_LOAD_BNDCFGS,		VM_EXIT_CLEAR_BNDCFGS },
>> 		{ VM_ENTRY_LOAD_IA32_RTIT_CTL,		VM_EXIT_CLEAR_IA32_RTIT_CTL },
>> +		{ VM_ENTRY_LOAD_CET_STATE,		VM_EXIT_LOAD_CET_STATE },
>> 	};
>>
>> 	memset(vmcs_conf, 0, sizeof(*vmcs_conf));
>> @@ -4934,6 +4935,14 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
>>
>> 	vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);  /* 22.2.1 */
>>
>> +	if (kvm_cpu_cap_has(X86_FEATURE_SHSTK))
> ..
>> +		vmcs_writel(GUEST_SSP, 0);
>> +	if (kvm_cpu_cap_has(X86_FEATURE_SHSTK) ||
>> +	    kvm_cpu_cap_has(X86_FEATURE_IBT))
>> +		vmcs_writel(GUEST_S_CET, 0);
>> +	if (kvm_cpu_cap_has(X86_FEATURE_SHSTK))
>> +		vmcs_writel(GUEST_INTR_SSP_TABLE, 0);
> At least this can be merged with the first if-statement.
>
> how about:
> 	if (kvm_cpu_cap_has(X86_FEATURE_SHSTK)) {
> 		vmcs_writel(GUEST_SSP, 0);
> 		vmcs_writel(GUEST_S_CET, 0);
> 		vmcs_writel(GUEST_INTR_SSP_TABLE, 0);
> 	} else if (kvm_cpu_cap_has(X86_FEATURE_IBT)) {
> 		vmcs_writel(GUEST_S_CET, 0);
> 	}

It looks better, I'll apply it later. Thanks!

>> 	kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
>>
>> 	vpid_sync_context(vmx->vpid);
>> @@ -6353,6 +6362,12 @@ void dump_vmcs(struct kvm_vcpu *vcpu)
>> 	if (vmcs_read32(VM_EXIT_MSR_STORE_COUNT) > 0)
>> 		vmx_dump_msrs("guest autostore", &vmx->msr_autostore.guest);
>>
>> +	if (vmentry_ctl & VM_ENTRY_LOAD_CET_STATE) {
>> +		pr_err("S_CET = 0x%016lx\n", vmcs_readl(GUEST_S_CET));
>> +		pr_err("SSP = 0x%016lx\n", vmcs_readl(GUEST_SSP));
>> +		pr_err("INTR SSP TABLE = 0x%016lx\n",
>> +		       vmcs_readl(GUEST_INTR_SSP_TABLE));
> how about merging them into one line?

OK, will do it.

>> +	}
>> 	pr_err("*** Host State ***\n");
>> 	pr_err("RIP = 0x%016lx  RSP = 0x%016lx\n",
>> 	       vmcs_readl(HOST_RIP), vmcs_readl(HOST_RSP));
>> @@ -6430,6 +6445,12 @@ void dump_vmcs(struct kvm_vcpu *vcpu)
>> 	if (secondary_exec_control & SECONDARY_EXEC_ENABLE_VPID)
>> 		pr_err("Virtual processor ID = 0x%04x\n",
>> 		       vmcs_read16(VIRTUAL_PROCESSOR_ID));
>> +	if (vmexit_ctl & VM_EXIT_LOAD_CET_STATE) {
>> +		pr_err("S_CET = 0x%016lx\n", vmcs_readl(HOST_S_CET));
>> +		pr_err("SSP = 0x%016lx\n", vmcs_readl(HOST_SSP));
>> +		pr_err("INTR SSP TABLE = 0x%016lx\n",
>> +		       vmcs_readl(HOST_INTR_SSP_TABLE));
> ditto.
>
>> +	}
>> }
>>
>> /*
>> @@ -7965,7 +7986,6 @@ static __init void vmx_set_cpu_caps(void)
>> 		kvm_cpu_cap_set(X86_FEATURE_UMIP);
>>
>> 	/* CPUID 0xD.1 */
>> -	kvm_caps.supported_xss = 0;
>> 	if (!cpu_has_vmx_xsaves())
>> 		kvm_cpu_cap_clear(X86_FEATURE_XSAVES);
>>
>> @@ -7977,6 +7997,12 @@ static __init void vmx_set_cpu_caps(void)
>>
>> 	if (cpu_has_vmx_waitpkg())
>> 		kvm_cpu_cap_check_and_set(X86_FEATURE_WAITPKG);
>> +
>> +	if (!cpu_has_load_cet_ctrl() || !enable_unrestricted_guest ||
>> +	    !cpu_has_vmx_basic_no_hw_errcode()) {
> Can you add a comment here? This way, readers won't need to dig through git
> history to understand the reason.

OK, I've put some explanation in commit log, let me move them here.

>
>> +		kvm_cpu_cap_clear(X86_FEATURE_SHSTK);
>> +		kvm_cpu_cap_clear(X86_FEATURE_IBT);
>> +	}
>> }
>>
>> static void vmx_request_immediate_exit(struct kvm_vcpu *vcpu)
>> diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
>> index e3b0985bb74a..d0cad2624564 100644
>> --- a/arch/x86/kvm/vmx/vmx.h
>> +++ b/arch/x86/kvm/vmx/vmx.h
>> @@ -484,7 +484,8 @@ static inline u8 vmx_get_rvi(void)
>> 	 VM_ENTRY_LOAD_IA32_EFER |					\
>> 	 VM_ENTRY_LOAD_BNDCFGS |					\
>> 	 VM_ENTRY_PT_CONCEAL_PIP |					\
>> -	 VM_ENTRY_LOAD_IA32_RTIT_CTL)
>> +	 VM_ENTRY_LOAD_IA32_RTIT_CTL |					\
>> +	 VM_ENTRY_LOAD_CET_STATE)
>>
>> #define __KVM_REQUIRED_VMX_VM_EXIT_CONTROLS				\
>> 	(VM_EXIT_SAVE_DEBUG_CONTROLS |					\
>> @@ -506,7 +507,8 @@ static inline u8 vmx_get_rvi(void)
>> 	       VM_EXIT_LOAD_IA32_EFER |					\
>> 	       VM_EXIT_CLEAR_BNDCFGS |					\
>> 	       VM_EXIT_PT_CONCEAL_PIP |					\
>> -	       VM_EXIT_CLEAR_IA32_RTIT_CTL)
>> +	       VM_EXIT_CLEAR_IA32_RTIT_CTL |				\
>> +	       VM_EXIT_LOAD_CET_STATE)
>>
>> #define KVM_REQUIRED_VMX_PIN_BASED_VM_EXEC_CONTROL			\
>> 	(PIN_BASED_EXT_INTR_MASK |					\
>> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
>> index 9596763fae8d..eb531823447a 100644
>> --- a/arch/x86/kvm/x86.c
>> +++ b/arch/x86/kvm/x86.c
>> @@ -231,7 +231,8 @@ static struct kvm_user_return_msrs __percpu *user_return_msrs;
>> 				| XFEATURE_MASK_BNDCSR | XFEATURE_MASK_AVX512 \
>> 				| XFEATURE_MASK_PKRU | XFEATURE_MASK_XTILE)
>>
>> -#define KVM_SUPPORTED_XSS     0
>> +#define KVM_SUPPORTED_XSS	(XFEATURE_MASK_CET_USER | \
>> +				 XFEATURE_MASK_CET_KERNEL)
>>
>> u64 __read_mostly host_efer;
>> EXPORT_SYMBOL_GPL(host_efer);
>> @@ -9921,6 +9922,20 @@ static int __kvm_x86_vendor_init(struct kvm_x86_init_ops *ops)
>> 	if (!kvm_cpu_cap_has(X86_FEATURE_XSAVES))
>> 		kvm_caps.supported_xss = 0;
>>
>> +	if (!kvm_cpu_cap_has(X86_FEATURE_SHSTK) &&
>> +	    !kvm_cpu_cap_has(X86_FEATURE_IBT))
>> +		kvm_caps.supported_xss &= ~(XFEATURE_MASK_CET_USER |
>> +					    XFEATURE_MASK_CET_KERNEL);
>> +
>
>> +	if ((kvm_caps.supported_xss & (XFEATURE_MASK_CET_USER |
>> +	     XFEATURE_MASK_CET_KERNEL)) !=
>> +	    (XFEATURE_MASK_CET_USER | XFEATURE_MASK_CET_KERNEL)) {
>> +		kvm_cpu_cap_clear(X86_FEATURE_SHSTK);
>> +		kvm_cpu_cap_clear(X86_FEATURE_IBT);
>> +		kvm_caps.supported_xss &= ~(XFEATURE_MASK_CET_USER |
>> +					    XFEATURE_MASK_CET_KERNEL);
>> +	}
> I am not sure why this is necessary. Could you please explain?

It's necessary to put the check here as it guarantees when guest CET is available
a) the critical host xsave dependency is there, b) guest can play with CET in user
and supervisor mode at the same time.



Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ