linux-kernel - [RFC PATCH v3 2/2] x86: kvm: svm: advertise ERAPS (larger RSB) support to guests

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20241128132834.15126-3-amit@kernel.org>
Date: Thu, 28 Nov 2024 14:28:34 +0100
From: Amit Shah <amit@...nel.org>
To: linux-kernel@...r.kernel.org,
	kvm@...r.kernel.org,
	x86@...nel.org,
	linux-doc@...r.kernel.org
Cc: amit.shah@....com,
	thomas.lendacky@....com,
	bp@...en8.de,
	tglx@...utronix.de,
	peterz@...radead.org,
	jpoimboe@...nel.org,
	pawan.kumar.gupta@...ux.intel.com,
	corbet@....net,
	mingo@...hat.com,
	dave.hansen@...ux.intel.com,
	hpa@...or.com,
	seanjc@...gle.com,
	pbonzini@...hat.com,
	daniel.sneddon@...ux.intel.com,
	kai.huang@...el.com,
	sandipan.das@....com,
	boris.ostrovsky@...cle.com,
	Babu.Moger@....com,
	david.kaplan@....com,
	dwmw@...zon.co.uk,
	andrew.cooper3@...rix.com
Subject: [RFC PATCH v3 2/2] x86: kvm: svm: advertise ERAPS (larger RSB) support to guests

From: Amit Shah <amit.shah@....com>

AMD CPUs with the ERAPS feature (Zen5+) have a larger RSB (aka RAP).
While the new default RSB size is used on the host without any software
modification necessary, the RSB size for guests is limited to the older
value (32 entries) for backwards compatibility.  With this patch, KVM
enables guest mode to also use the default number of entries by setting
the new ALLOW_LARGER_RAP bit in the VMCB.

The two cases for backward compatibility that need special handling are
nested guests, and guests using shadow paging (or when NPT is disabled):

For nested guests: the ERAPS feature adds host/guest tagging to entries
in the RSB, but does not distinguish between ASIDs.  On a nested exit,
the L0 hypervisor instructs the hardware (via another new VMCB bit,
FLUSH_RAP_ON_VMRUN) to flush the RSB on the next VMRUN to prevent RSB
poisoning attacks from an L2 guest to an L1 guest.  With that in place,
this feature can be exposed to guests.

For shadow paging guests: do not expose this feature to guests; only
expose if nested paging is enabled, to ensure a context switch within
a guest triggers a context switch on the CPU -- thereby ensuring guest
context switches flush guest RSB entries.  For shadow paging, the CPU's
CR3 is not used for guest processes, and hence cannot benefit from this
feature.

Signed-off-by: Amit Shah <amit.shah@....com>
---
 arch/x86/include/asm/svm.h |  6 +++++-
 arch/x86/kvm/cpuid.c       | 18 ++++++++++++++++--
 arch/x86/kvm/svm/svm.c     | 29 +++++++++++++++++++++++++++++
 arch/x86/kvm/svm/svm.h     | 15 +++++++++++++++
 4 files changed, 65 insertions(+), 3 deletions(-)

diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h
index 2b59b9951c90..f8584a63c859 100644
--- a/arch/x86/include/asm/svm.h
+++ b/arch/x86/include/asm/svm.h
@@ -129,7 +129,8 @@ struct __attribute__ ((__packed__)) vmcb_control_area {
 	u64 tsc_offset;
 	u32 asid;
 	u8 tlb_ctl;
-	u8 reserved_2[3];
+	u8 erap_ctl;
+	u8 reserved_2[2];
 	u32 int_ctl;
 	u32 int_vector;
 	u32 int_state;
@@ -175,6 +176,9 @@ struct __attribute__ ((__packed__)) vmcb_control_area {
 #define TLB_CONTROL_FLUSH_ASID 3
 #define TLB_CONTROL_FLUSH_ASID_LOCAL 7
 
+#define ERAP_CONTROL_ALLOW_LARGER_RAP 0
+#define ERAP_CONTROL_FLUSH_RAP 1
+
 #define V_TPR_MASK 0x0f
 
 #define V_IRQ_SHIFT 8
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 097bdc022d0f..dd589670a716 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -803,6 +803,8 @@ void kvm_set_cpu_caps(void)
 		F(WRMSR_XX_BASE_NS)
 	);
 
+	if (tdp_enabled)
+		kvm_cpu_cap_check_and_set(X86_FEATURE_ERAPS);
 	kvm_cpu_cap_check_and_set(X86_FEATURE_SBPB);
 	kvm_cpu_cap_check_and_set(X86_FEATURE_IBPB_BRTYPE);
 	kvm_cpu_cap_check_and_set(X86_FEATURE_SRSO_NO);
@@ -1362,10 +1364,22 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
 	case 0x80000020:
 		entry->eax = entry->ebx = entry->ecx = entry->edx = 0;
 		break;
-	case 0x80000021:
-		entry->ebx = entry->ecx = entry->edx = 0;
+	case 0x80000021: {
+		unsigned int ebx_mask = 0;
+
+		entry->ecx = entry->edx = 0;
 		cpuid_entry_override(entry, CPUID_8000_0021_EAX);
+
+		/*
+		 * Bits 23:16 in EBX indicate the size of the RSB.
+		 * Expose the value in the hardware to the guest.
+		 */
+		if (kvm_cpu_cap_has(X86_FEATURE_ERAPS))
+			ebx_mask |= GENMASK(23, 16);
+
+		entry->ebx &= ebx_mask;
 		break;
+	}
 	/* AMD Extended Performance Monitoring and Debug */
 	case 0x80000022: {
 		union cpuid_0x80000022_ebx ebx;
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index dd15cc635655..9b055de079cb 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -1360,6 +1360,13 @@ static void init_vmcb(struct kvm_vcpu *vcpu)
 	if (boot_cpu_has(X86_FEATURE_V_SPEC_CTRL))
 		set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SPEC_CTRL, 1, 1);
 
+	/*
+	 * If the hardware has a larger RSB, use it in the guest context as
+	 * well.
+	 */
+	if (cpu_feature_enabled(X86_FEATURE_ERAPS) && npt_enabled)
+		vmcb_set_larger_rap(svm->vmcb);
+
 	if (kvm_vcpu_apicv_active(vcpu))
 		avic_init_vmcb(svm, vmcb);
 
@@ -3395,6 +3402,7 @@ static void dump_vmcb(struct kvm_vcpu *vcpu)
 	pr_err("%-20s%016llx\n", "tsc_offset:", control->tsc_offset);
 	pr_err("%-20s%d\n", "asid:", control->asid);
 	pr_err("%-20s%d\n", "tlb_ctl:", control->tlb_ctl);
+	pr_err("%-20s%d\n", "erap_ctl:", control->erap_ctl);
 	pr_err("%-20s%08x\n", "int_ctl:", control->int_ctl);
 	pr_err("%-20s%08x\n", "int_vector:", control->int_vector);
 	pr_err("%-20s%08x\n", "int_state:", control->int_state);
@@ -3561,6 +3569,27 @@ static int svm_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
 
 		trace_kvm_nested_vmexit(vcpu, KVM_ISA_SVM);
 
+		if (boot_cpu_has(X86_FEATURE_ERAPS)
+		    && vmcb_is_larger_rap(svm->vmcb01.ptr)) {
+			/*
+			 * XXX a few further optimizations can be made:
+			 *
+			 * 1. In pre_svm_run() we can reset this bit when a hw
+			 * TLB flush has happened - any context switch on a
+			 * CPU (which causes a TLB flush) auto-flushes the RSB
+			 * - eg when this vCPU is scheduled on a different
+			 * pCPU.
+			 *
+			 * 2. This is also not needed in the case where the
+			 * vCPU is being scheduled on the same pCPU, but there
+			 * was a context switch between the #VMEXIT and VMRUN.
+			 *
+			 * 3. If the guest returns to L2 again after this
+			 * #VMEXIT, there's no need to flush the RSB.
+			 */
+			vmcb_set_flush_rap(svm->vmcb01.ptr);
+		}
+
 		vmexit = nested_svm_exit_special(svm);
 
 		if (vmexit == NESTED_EXIT_CONTINUE)
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index 43fa6a16eb19..8a7877f46dc5 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -500,6 +500,21 @@ static inline bool svm_is_intercept(struct vcpu_svm *svm, int bit)
 	return vmcb_is_intercept(&svm->vmcb->control, bit);
 }
 
+static inline void vmcb_set_flush_rap(struct vmcb *vmcb)
+{
+	__set_bit(ERAP_CONTROL_FLUSH_RAP, (unsigned long *)&vmcb->control.erap_ctl);
+}
+
+static inline void vmcb_set_larger_rap(struct vmcb *vmcb)
+{
+	__set_bit(ERAP_CONTROL_ALLOW_LARGER_RAP, (unsigned long *)&vmcb->control.erap_ctl);
+}
+
+static inline bool vmcb_is_larger_rap(struct vmcb *vmcb)
+{
+	return test_bit(ERAP_CONTROL_ALLOW_LARGER_RAP, (unsigned long *)&vmcb->control.erap_ctl);
+}
+
 static inline bool nested_vgif_enabled(struct vcpu_svm *svm)
 {
 	return guest_can_use(&svm->vcpu, X86_FEATURE_VGIF) &&
-- 
2.47.0