Message-ID: <20250110-asi-rfc-v2-v2-21-8419288bc805@google.com>
Date: Fri, 10 Jan 2025 18:40:47 +0000
From: Brendan Jackman <jackmanb@...gle.com>
To: Thomas Gleixner <tglx@...utronix.de>, Ingo Molnar <mingo@...hat.com>, Borislav Petkov <bp@...en8.de>, 
	Dave Hansen <dave.hansen@...ux.intel.com>, "H. Peter Anvin" <hpa@...or.com>, 
	Andy Lutomirski <luto@...nel.org>, Peter Zijlstra <peterz@...radead.org>, 
	Richard Henderson <richard.henderson@...aro.org>, Matt Turner <mattst88@...il.com>, 
	Vineet Gupta <vgupta@...nel.org>, Russell King <linux@...linux.org.uk>, 
	Catalin Marinas <catalin.marinas@....com>, Will Deacon <will@...nel.org>, Guo Ren <guoren@...nel.org>, 
	Brian Cain <bcain@...cinc.com>, Huacai Chen <chenhuacai@...nel.org>, 
	WANG Xuerui <kernel@...0n.name>, Geert Uytterhoeven <geert@...ux-m68k.org>, 
	Michal Simek <monstr@...str.eu>, Thomas Bogendoerfer <tsbogend@...ha.franken.de>, 
	Dinh Nguyen <dinguyen@...nel.org>, Jonas Bonn <jonas@...thpole.se>, 
	Stefan Kristiansson <stefan.kristiansson@...nalahti.fi>, Stafford Horne <shorne@...il.com>, 
	"James E.J. Bottomley" <James.Bottomley@...senPartnership.com>, Helge Deller <deller@....de>, 
	Michael Ellerman <mpe@...erman.id.au>, Nicholas Piggin <npiggin@...il.com>, 
	Christophe Leroy <christophe.leroy@...roup.eu>, Naveen N Rao <naveen@...nel.org>, 
	Madhavan Srinivasan <maddy@...ux.ibm.com>, Paul Walmsley <paul.walmsley@...ive.com>, 
	Palmer Dabbelt <palmer@...belt.com>, Albert Ou <aou@...s.berkeley.edu>, 
	Heiko Carstens <hca@...ux.ibm.com>, Vasily Gorbik <gor@...ux.ibm.com>, 
	Alexander Gordeev <agordeev@...ux.ibm.com>, Christian Borntraeger <borntraeger@...ux.ibm.com>, 
	Sven Schnelle <svens@...ux.ibm.com>, Yoshinori Sato <ysato@...rs.sourceforge.jp>, 
	Rich Felker <dalias@...c.org>, John Paul Adrian Glaubitz <glaubitz@...sik.fu-berlin.de>, 
	"David S. Miller" <davem@...emloft.net>, Andreas Larsson <andreas@...sler.com>, 
	Richard Weinberger <richard@....at>, Anton Ivanov <anton.ivanov@...bridgegreys.com>, 
	Johannes Berg <johannes@...solutions.net>, Chris Zankel <chris@...kel.net>, 
	Max Filippov <jcmvbkbc@...il.com>, Arnd Bergmann <arnd@...db.de>, 
	Andrew Morton <akpm@...ux-foundation.org>, Juri Lelli <juri.lelli@...hat.com>, 
	Vincent Guittot <vincent.guittot@...aro.org>, Dietmar Eggemann <dietmar.eggemann@....com>, 
	Steven Rostedt <rostedt@...dmis.org>, Ben Segall <bsegall@...gle.com>, Mel Gorman <mgorman@...e.de>, 
	Valentin Schneider <vschneid@...hat.com>, Uladzislau Rezki <urezki@...il.com>, 
	Christoph Hellwig <hch@...radead.org>, Masami Hiramatsu <mhiramat@...nel.org>, 
	Mathieu Desnoyers <mathieu.desnoyers@...icios.com>, Mike Rapoport <rppt@...nel.org>, 
	Arnaldo Carvalho de Melo <acme@...nel.org>, Namhyung Kim <namhyung@...nel.org>, 
	Mark Rutland <mark.rutland@....com>, 
	Alexander Shishkin <alexander.shishkin@...ux.intel.com>, Jiri Olsa <jolsa@...nel.org>, 
	Ian Rogers <irogers@...gle.com>, Adrian Hunter <adrian.hunter@...el.com>, 
	Dennis Zhou <dennis@...nel.org>, Tejun Heo <tj@...nel.org>, Christoph Lameter <cl@...ux.com>, 
	Sean Christopherson <seanjc@...gle.com>, Paolo Bonzini <pbonzini@...hat.com>, 
	Ard Biesheuvel <ardb@...nel.org>, Josh Poimboeuf <jpoimboe@...nel.org>, 
	Pawan Gupta <pawan.kumar.gupta@...ux.intel.com>
Cc: x86@...nel.org, linux-kernel@...r.kernel.org, linux-alpha@...r.kernel.org, 
	linux-snps-arc@...ts.infradead.org, linux-arm-kernel@...ts.infradead.org, 
	linux-csky@...r.kernel.org, linux-hexagon@...r.kernel.org, 
	loongarch@...ts.linux.dev, linux-m68k@...ts.linux-m68k.org, 
	linux-mips@...r.kernel.org, linux-openrisc@...r.kernel.org, 
	linux-parisc@...r.kernel.org, linuxppc-dev@...ts.ozlabs.org, 
	linux-riscv@...ts.infradead.org, linux-s390@...r.kernel.org, 
	linux-sh@...r.kernel.org, sparclinux@...r.kernel.org, 
	linux-um@...ts.infradead.org, linux-arch@...r.kernel.org, linux-mm@...ck.org, 
	linux-trace-kernel@...r.kernel.org, linux-perf-users@...r.kernel.org, 
	kvm@...r.kernel.org, linux-efi@...r.kernel.org, 
	Brendan Jackman <jackmanb@...gle.com>
Subject: [PATCH RFC v2 21/29] KVM: x86: asi: Restricted address space for VM execution

An ASI restricted address space is added for KVM. This protects
userspace from attack by the guest, and the guest from attack by other
processes. It doesn't attempt to protect the guest from attack by the
current process.

This change incorporates an extra asi_exit() at the end of vcpu_run()
(see the sketch below the fold). We expect later iterations of ASI to
drop that call as we gain the ability to context switch within the ASI
domain.

Signed-off-by: Brendan Jackman <jackmanb@...gle.com>
---
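To make that lifecycle concrete, here is a rough sketch of how the
critical section nests inside the vcpu run path after this change.
asi_enter(), asi_relax() and asi_exit() are the API from earlier
patches in this series; the two functions below and their structure
are simplified stand-ins, not real KVM code:

static void vcpu_enter_exit_sketch(struct kvm_vcpu *vcpu)
{
	/* Switch to the VM's restricted address space. */
	asi_enter(vcpu->kvm->arch.asi);

	/* ... VM-Enter, guest executes, VM-Exit ... */

	/*
	 * End the critical section. The address space stays restricted,
	 * so cheap exits can be handled without a CR3 switch back.
	 */
	asi_relax();
}

static int vcpu_run_sketch(struct kvm_vcpu *vcpu)
{
	int r = 0;

	/* Real code loops here through many enter/relax cycles. */
	vcpu_enter_exit_sketch(vcpu);

	/*
	 * No direct restricted-to-user transition exists yet, hence
	 * the unconditional asi_exit() after vcpu_run() in this patch.
	 */
	asi_exit();
	return r;
}
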
 arch/x86/include/asm/kvm_host.h |  3 ++
 arch/x86/kvm/svm/svm.c          |  2 ++
 arch/x86/kvm/vmx/vmx.c          | 38 ++++++++++++--------
 arch/x86/kvm/x86.c              | 77 ++++++++++++++++++++++++++++++++++++++++-
 4 files changed, 105 insertions(+), 15 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 6d9f763a7bb9d5db422ea5625b2c28420bd14f26..00cda452dd6ca6ec57ff85ca194ee4aeb6af3be7 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -37,6 +37,7 @@
 #include <asm/kvm_vcpu_regs.h>
 #include <asm/hyperv-tlfs.h>
 #include <asm/reboot.h>
+#include <asm/asi.h>
 
 #define __KVM_HAVE_ARCH_VCPU_DEBUGFS
 
@@ -1535,6 +1536,8 @@ struct kvm_arch {
 	 */
 #define SPLIT_DESC_CACHE_MIN_NR_OBJECTS (SPTE_ENT_PER_PAGE + 1)
 	struct kvm_mmu_memory_cache split_desc_cache;
+
+	struct asi *asi;
 };
 
 struct kvm_vm_stat {
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 9df3e1e5ae81a1346409632edd693cb7e0740f72..f2c3154292b4f6c960b490b0773f53bea43897bb 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -4186,6 +4186,7 @@ static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu, bool spec_ctrl_in
 	guest_state_enter_irqoff();
 
 	amd_clear_divider();
+	asi_enter(vcpu->kvm->arch.asi);
 
 	if (sev_es_guest(vcpu->kvm))
 		__svm_sev_es_vcpu_run(svm, spec_ctrl_intercepted,
@@ -4193,6 +4194,7 @@ static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu, bool spec_ctrl_in
 	else
 		__svm_vcpu_run(svm, spec_ctrl_intercepted);
 
+	asi_relax();
 	guest_state_exit_irqoff();
 }
 
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index d28618e9277ede83ad2edc1b1778ea44123aa797..181d230b1c057fed33f7b29b7b0e378dbdfeb174 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -49,6 +49,7 @@
 #include <asm/mwait.h>
 #include <asm/spec-ctrl.h>
 #include <asm/vmx.h>
+#include <asm/asi.h>
 
 #include <trace/events/ipi.h>
 
@@ -7282,14 +7283,34 @@ static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu,
 					unsigned int flags)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	unsigned long cr3;
 
 	guest_state_enter_irqoff();
+	asi_enter(vcpu->kvm->arch.asi);
+
+	/*
+	 * Refresh vmcs.HOST_CR3 if necessary.  This must be done immediately
+	 * prior to VM-Enter, as the kernel may load a new ASID (PCID) any time
+	 * it switches back to the current->mm, which can occur in KVM context
+	 * when switching to a temporary mm to patch kernel code, e.g. if KVM
+	 * toggles a static key while handling a VM-Exit.
+	 * Also, this must be done after asi_enter(), since asi_enter()
+	 * changes CR3 when it switches address spaces.
+	 */
+	cr3 = __get_current_cr3_fast();
+	if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) {
+		vmcs_writel(HOST_CR3, cr3);
+		vmx->loaded_vmcs->host_state.cr3 = cr3;
+	}
 
 	/*
 	 * L1D Flush includes CPU buffer clear to mitigate MDS, but VERW
 	 * mitigation for MDS is done late in VMentry and is still
 	 * executed in spite of L1D Flush. This is because an extra VERW
 	 * should not matter much after the big hammer L1D Flush.
+	 *
+	 * This is placed after asi_enter() only for performance reasons.
+	 * RFC: This also needs to be integrated with ASI's tainting model.
 	 */
 	if (static_branch_unlikely(&vmx_l1d_should_flush))
 		vmx_l1d_flush(vcpu);
@@ -7310,6 +7331,8 @@ static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu,
 
 	vmx->idt_vectoring_info = 0;
 
+	asi_relax();
+
 	vmx_enable_fb_clear(vmx);
 
 	if (unlikely(vmx->fail)) {
@@ -7338,7 +7361,7 @@ static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu,
 fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
-	unsigned long cr3, cr4;
+	unsigned long cr4;
 
 	/* Record the guest's net vcpu time for enforced NMI injections. */
 	if (unlikely(!enable_vnmi &&
@@ -7381,19 +7404,6 @@ fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit)
 		vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]);
 	vcpu->arch.regs_dirty = 0;
 
-	/*
-	 * Refresh vmcs.HOST_CR3 if necessary.  This must be done immediately
-	 * prior to VM-Enter, as the kernel may load a new ASID (PCID) any time
-	 * it switches back to the current->mm, which can occur in KVM context
-	 * when switching to a temporary mm to patch kernel code, e.g. if KVM
-	 * toggles a static key while handling a VM-Exit.
-	 */
-	cr3 = __get_current_cr3_fast();
-	if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) {
-		vmcs_writel(HOST_CR3, cr3);
-		vmx->loaded_vmcs->host_state.cr3 = cr3;
-	}
-
 	cr4 = cr4_read_shadow();
 	if (unlikely(cr4 != vmx->loaded_vmcs->host_state.cr4)) {
 		vmcs_writel(HOST_CR4, cr4);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 83fe0a78146fc198115aba0e76ba57ecfb1dd8d9..3e0811eb510650abc601e4adce1ce4189835a730 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -85,6 +85,7 @@
 #include <asm/emulate_prefix.h>
 #include <asm/sgx.h>
 #include <clocksource/hyperv_timer.h>
+#include <asm/asi.h>
 
 #define CREATE_TRACE_POINTS
 #include "trace.h"
@@ -9674,6 +9675,55 @@ static void kvm_x86_check_cpu_compat(void *ret)
 	*(int *)ret = kvm_x86_check_processor_compatibility();
 }
 
+#ifdef CONFIG_MITIGATION_ADDRESS_SPACE_ISOLATION
+static inline int kvm_x86_init_asi_class(void)
+{
+	static struct asi_taint_policy policy = {
+		/*
+		 * Prevent going to the guest with sensitive data potentially
+		 * left in sidechannels by code running in the unrestricted
+		 * address space, or another MM.
+		 */
+		.protect_data =  ASI_TAINT_KERNEL_DATA | ASI_TAINT_OTHER_MM_DATA,
+		/*
+		 * Prevent going to the guest with branch predictor state
+		 * influenced by other processes. Note this bit is about
+		 * protecting the guest from other parts of the system, while
+		 * protect_data is about protecting other parts of the system
+		 * from the guest.
+		 */
+		.prevent_control = ASI_TAINT_OTHER_MM_CONTROL,
+		.set = ASI_TAINT_GUEST_DATA,
+	};
+
+	/*
+	 * Inform ASI that the guest will gain control of the branch predictor,
+	 * unless we're just unconditionally blasting it after VM Exit.
+	 *
+	 * RFC: This is a bit simplified - on some configurations we could avoid
+	 * a duplicated RSB-fill if we had a separate taint specifically for the
+	 * RSB.
+	 */
+	if (!cpu_feature_enabled(X86_FEATURE_IBPB_ON_VMEXIT) ||
+	    !IS_ENABLED(CONFIG_MITIGATION_RETPOLINE) ||
+	    !cpu_feature_enabled(X86_FEATURE_RSB_VMEXIT))
+		policy.set |= ASI_TAINT_GUEST_CONTROL;
+
+	/*
+	 * And the same for data left behind by code in the userspace domain
+	 * (i.e. the VMM itself, plus kernel code serving its syscalls etc).
+	 * This should eventually be configurable: users whose VMMs contain
+	 * no secrets can disable it to avoid paying a mitigation cost on
+	 * transition between their guest and userspace.
+	 */
+	policy.protect_data |= ASI_TAINT_USER_DATA;
+
+	return asi_init_class(ASI_CLASS_KVM, &policy);
+}
+#else /* CONFIG_MITIGATION_ADDRESS_SPACE_ISOLATION */
+static inline int kvm_x86_init_asi_class(void) { return 0; }
+#endif /* CONFIG_MITIGATION_ADDRESS_SPACE_ISOLATION */
+
 int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops)
 {
 	u64 host_pat;
@@ -9737,6 +9787,10 @@ int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops)
 	kvm_caps.supported_vm_types = BIT(KVM_X86_DEFAULT_VM);
 	kvm_caps.supported_mce_cap = MCG_CTL_P | MCG_SER_P;
 
+	r = kvm_x86_init_asi_class();
+	if (r < 0)
+		goto out_mmu_exit;
+
 	if (boot_cpu_has(X86_FEATURE_XSAVE)) {
 		kvm_host.xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
 		kvm_caps.supported_xcr0 = kvm_host.xcr0 & KVM_SUPPORTED_XCR0;
@@ -9754,7 +9808,7 @@ int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops)
 
 	r = ops->hardware_setup();
 	if (r != 0)
-		goto out_mmu_exit;
+		goto out_asi_uninit;
 
 	kvm_ops_update(ops);
 
@@ -9810,6 +9864,8 @@ int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops)
 out_unwind_ops:
 	kvm_x86_ops.enable_virtualization_cpu = NULL;
 	kvm_x86_call(hardware_unsetup)();
+out_asi_uninit:
+	asi_uninit_class(ASI_CLASS_KVM);
 out_mmu_exit:
 	kvm_mmu_vendor_module_exit();
 out_free_percpu:
@@ -9841,6 +9897,7 @@ void kvm_x86_vendor_exit(void)
 	cancel_work_sync(&pvclock_gtod_work);
 #endif
 	kvm_x86_call(hardware_unsetup)();
+	asi_uninit_class(ASI_CLASS_KVM);
 	kvm_mmu_vendor_module_exit();
 	free_percpu(user_return_msrs);
 	kmem_cache_destroy(x86_emulator_cache);
@@ -11574,6 +11631,13 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
 
 	r = vcpu_run(vcpu);
 
+	/*
+	 * At present ASI doesn't have the capability to transition directly
+	 * from the restricted address space to the user address space. So we
+	 * just return to the unrestricted address space in between.
+	 */
+	asi_exit();
+
 out:
 	kvm_put_guest_fpu(vcpu);
 	if (kvm_run->kvm_valid_regs)
@@ -12705,6 +12769,14 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
 	if (ret)
 		goto out_uninit_mmu;
 
+	ret = asi_init(kvm->mm, ASI_CLASS_KVM, &kvm->arch.asi);
+	if (ret)
+		goto out_uninit_mmu;
+
+	ret = static_call(kvm_x86_vm_init)(kvm);
+	if (ret)
+		goto out_asi_destroy;
+
 	INIT_HLIST_HEAD(&kvm->arch.mask_notifier_list);
 	atomic_set(&kvm->arch.noncoherent_dma_count, 0);
 
@@ -12742,6 +12814,8 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
 
 	return 0;
 
+out_asi_destroy:
+	asi_destroy(kvm->arch.asi);
 out_uninit_mmu:
 	kvm_mmu_uninit_vm(kvm);
 	kvm_page_track_cleanup(kvm);
@@ -12883,6 +12957,7 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
 	kvm_destroy_vcpus(kvm);
 	kvfree(rcu_dereference_check(kvm->arch.apic_map, 1));
 	kfree(srcu_dereference_check(kvm->arch.pmu_event_filter, &kvm->srcu, 1));
+	asi_destroy(kvm->arch.asi);
 	kvm_mmu_uninit_vm(kvm);
 	kvm_page_track_cleanup(kvm);
 	kvm_xen_destroy_vm(kvm);
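
Since the taint policy in kvm_x86_init_asi_class() is the crux of this
patch, here is roughly how I think of the three policy fields. This
paraphrases the model from earlier patches in the series; the helpers
and the standalone function below are hypothetical, the real logic
lives in asi_enter() and friends:

/* Illustration only: approximate effect of a struct asi_taint_policy. */
static void asi_enter_sketch(const struct asi_taint_policy *p,
			     unsigned int cpu_taints)
{
	/*
	 * protect_data: sidechannel state left behind by the
	 * unrestricted address space or by another MM must be flushed
	 * before the guest runs and can probe it.
	 */
	if (cpu_taints & p->protect_data)
		flush_sidechannel_state();	/* hypothetical helper */

	/*
	 * prevent_control: don't enter the guest with branch predictor
	 * state planted by another MM; scrub it first.
	 */
	if (cpu_taints & p->prevent_control)
		scrub_branch_predictor();	/* hypothetical helper */

	/*
	 * set: record that running this domain leaves guest data (and,
	 * unless the predictor is scrubbed on exit, guest-controlled
	 * predictor state) behind for later domains to worry about.
	 */
	record_cpu_taints(p->set);		/* hypothetical helper */
}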

-- 
2.47.1.613.gc27f4b7a9f-goog

