Message-Id: <20240226143630.33643-21-jiangshanlai@gmail.com>
Date: Mon, 26 Feb 2024 22:35:37 +0800
From: Lai Jiangshan <jiangshanlai@...il.com>
To: linux-kernel@...r.kernel.org
Cc: Lai Jiangshan <jiangshan.ljs@...group.com>,
Hou Wenlong <houwenlong.hwl@...group.com>,
Linus Torvalds <torvalds@...ux-foundation.org>,
Peter Zijlstra <peterz@...radead.org>,
Sean Christopherson <seanjc@...gle.com>,
Thomas Gleixner <tglx@...utronix.de>,
Borislav Petkov <bp@...en8.de>,
Ingo Molnar <mingo@...hat.com>,
kvm@...r.kernel.org,
Paolo Bonzini <pbonzini@...hat.com>,
x86@...nel.org,
Kees Cook <keescook@...omium.org>,
Juergen Gross <jgross@...e.com>,
Dave Hansen <dave.hansen@...ux.intel.com>,
"H. Peter Anvin" <hpa@...or.com>
Subject: [RFC PATCH 20/73] KVM: x86/PVM: Implement vcpu_load()/vcpu_put() related callbacks
From: Lai Jiangshan <jiangshan.ljs@...group.com>
When preparing to switch to the guest, some guest states that only
matter to userspace can be loaded ahead of VM entry. In PVM, the guest
segment registers and the user return MSRs are loaded into hardware at
that point. Since the LDT and the I/O bitmap are not supported for PVM
guests, they are cleared as well. When preparing to switch back to the
host in vcpu_put(), the host states are restored.
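
For reference (not part of the patch), a minimal sketch of the user return
MSR usage described above, assuming the registration order established in
pvm_setup_user_return_msrs() below; kvm_add_user_return_msr() and
kvm_set_user_return_msr() are KVM's existing helpers, and the host values
are written back by KVM's user return notifier once the vCPU thread
returns to userspace:

	/* Illustrative sketch only; mirrors the patch, error handling omitted. */
	static __init void example_setup_user_return_msrs(void)
	{
		/* The registration order defines the slot indexes used below. */
		kvm_add_user_return_msr(MSR_LSTAR);	/* slot 0 */
		kvm_add_user_return_msr(MSR_TSC_AUX);	/* slot 1 */
	}

	static void example_load_guest_uret_msrs(struct vcpu_pvm *pvm)
	{
		/*
		 * Load the guest values into hardware now; the saved host
		 * values are restored lazily on the next return to userspace.
		 */
		kvm_set_user_return_msr(0, (u64)entry_SYSCALL_64_switcher, -1ull);
		kvm_set_user_return_msr(1, pvm->msr_tsc_aux, -1ull);
	}
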
Signed-off-by: Lai Jiangshan <jiangshan.ljs@...group.com>
Signed-off-by: Hou Wenlong <houwenlong.hwl@...group.com>
---
arch/x86/kvm/pvm/pvm.c | 235 +++++++++++++++++++++++++++++++++++++++++
arch/x86/kvm/pvm/pvm.h | 5 +
2 files changed, 240 insertions(+)
diff --git a/arch/x86/kvm/pvm/pvm.c b/arch/x86/kvm/pvm/pvm.c
index d4cc52bf6b3f..52b3b47ffe42 100644
--- a/arch/x86/kvm/pvm/pvm.c
+++ b/arch/x86/kvm/pvm/pvm.c
@@ -13,6 +13,8 @@
#include <linux/module.h>
+#include <asm/gsseg.h>
+#include <asm/io_bitmap.h>
#include <asm/pvm_para.h>
#include "cpuid.h"
@@ -26,6 +28,211 @@ static bool __read_mostly is_intel;
static unsigned long host_idt_base;
+static inline void __save_gs_base(struct vcpu_pvm *pvm)
+{
+ // switcher will do a real hw swapgs, so use hw MSR_KERNEL_GS_BASE
+ rdmsrl(MSR_KERNEL_GS_BASE, pvm->segments[VCPU_SREG_GS].base);
+}
+
+static inline void __load_gs_base(struct vcpu_pvm *pvm)
+{
+ // switcher will do a real hw swapgs, so use hw MSR_KERNEL_GS_BASE
+ wrmsrl(MSR_KERNEL_GS_BASE, pvm->segments[VCPU_SREG_GS].base);
+}
+
+static inline void __save_fs_base(struct vcpu_pvm *pvm)
+{
+ rdmsrl(MSR_FS_BASE, pvm->segments[VCPU_SREG_FS].base);
+}
+
+static inline void __load_fs_base(struct vcpu_pvm *pvm)
+{
+ wrmsrl(MSR_FS_BASE, pvm->segments[VCPU_SREG_FS].base);
+}
+
+/*
+ * Test whether DS, ES, FS and GS need to be reloaded.
+ *
+ * Reading them only returns the selectors, but writing them (if
+ * nonzero) loads the full descriptor from the GDT or LDT.
+ *
+ * We therefore need to write new values to the segment registers
+ * on every host-guest state switch unless both the new and old
+ * values are zero.
+ */
+static inline bool need_reload_sel(u16 sel1, u16 sel2)
+{
+ return unlikely(sel1 | sel2);
+}
+
+/*
+ * Save the host DS/ES/FS/GS selectors, FS base and inactive GS base,
+ * then load the guest DS/ES/FS/GS selectors, FS base and GS base.
+ *
+ * Note: when the guest state is loaded and the CPU is in the hypervisor,
+ * the guest GS base lives in the hardware MSR_KERNEL_GS_BASE; when the
+ * guest state is NOT loaded, that MSR holds the host's inactive GS base.
+ */
+static void segments_save_host_and_switch_to_guest(struct vcpu_pvm *pvm)
+{
+ u16 pvm_ds_sel, pvm_es_sel, pvm_fs_sel, pvm_gs_sel;
+
+ /* Save host segments */
+ savesegment(ds, pvm->host_ds_sel);
+ savesegment(es, pvm->host_es_sel);
+ current_save_fsgs();
+
+ /* Load guest segments */
+ pvm_ds_sel = pvm->segments[VCPU_SREG_DS].selector;
+ pvm_es_sel = pvm->segments[VCPU_SREG_ES].selector;
+ pvm_fs_sel = pvm->segments[VCPU_SREG_FS].selector;
+ pvm_gs_sel = pvm->segments[VCPU_SREG_GS].selector;
+
+ if (need_reload_sel(pvm_ds_sel, pvm->host_ds_sel))
+ loadsegment(ds, pvm_ds_sel);
+ if (need_reload_sel(pvm_es_sel, pvm->host_es_sel))
+ loadsegment(es, pvm_es_sel);
+ if (need_reload_sel(pvm_fs_sel, current->thread.fsindex))
+ loadsegment(fs, pvm_fs_sel);
+ if (need_reload_sel(pvm_gs_sel, current->thread.gsindex))
+ load_gs_index(pvm_gs_sel);
+
+ __load_gs_base(pvm);
+ __load_fs_base(pvm);
+}
+
+/*
+ * Save the guest DS/ES/FS/GS selectors, FS base and GS base,
+ * then load the host DS/ES/FS/GS selectors, FS base and inactive GS base.
+ */
+static void segments_save_guest_and_switch_to_host(struct vcpu_pvm *pvm)
+{
+ u16 pvm_ds_sel, pvm_es_sel, pvm_fs_sel, pvm_gs_sel;
+
+ /* Save guest segments */
+ savesegment(ds, pvm_ds_sel);
+ savesegment(es, pvm_es_sel);
+ savesegment(fs, pvm_fs_sel);
+ savesegment(gs, pvm_gs_sel);
+ pvm->segments[VCPU_SREG_DS].selector = pvm_ds_sel;
+ pvm->segments[VCPU_SREG_ES].selector = pvm_es_sel;
+ pvm->segments[VCPU_SREG_FS].selector = pvm_fs_sel;
+ pvm->segments[VCPU_SREG_GS].selector = pvm_gs_sel;
+
+ __save_fs_base(pvm);
+ __save_gs_base(pvm);
+
+ /* Load host segments */
+ if (need_reload_sel(pvm_ds_sel, pvm->host_ds_sel))
+ loadsegment(ds, pvm->host_ds_sel);
+ if (need_reload_sel(pvm_es_sel, pvm->host_es_sel))
+ loadsegment(es, pvm->host_es_sel);
+ if (need_reload_sel(pvm_fs_sel, current->thread.fsindex))
+ loadsegment(fs, current->thread.fsindex);
+ if (need_reload_sel(pvm_gs_sel, current->thread.gsindex))
+ load_gs_index(current->thread.gsindex);
+
+ wrmsrl(MSR_KERNEL_GS_BASE, current->thread.gsbase);
+ wrmsrl(MSR_FS_BASE, current->thread.fsbase);
+}
+
+static void pvm_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_pvm *pvm = to_pvm(vcpu);
+
+ if (pvm->loaded_cpu_state)
+ return;
+
+ pvm->loaded_cpu_state = 1;
+
+#ifdef CONFIG_X86_IOPL_IOPERM
+ /*
+ * PVM doesn't load guest I/O bitmap into hardware. Invalidate I/O
+ * bitmap if the current task is using it. This prevents any possible
+ * leakage of an active I/O bitmap to the guest and forces I/O
+ * instructions in guest to be trapped and emulated.
+ *
+ * The I/O bitmap will be restored when the current task exits to
+ * user mode in arch_exit_to_user_mode_prepare().
+ */
+ if (test_thread_flag(TIF_IO_BITMAP))
+ native_tss_invalidate_io_bitmap();
+#endif
+
+#ifdef CONFIG_MODIFY_LDT_SYSCALL
+ /* PVM doesn't support LDT. */
+ if (unlikely(current->mm->context.ldt))
+ clear_LDT();
+#endif
+
+ segments_save_host_and_switch_to_guest(pvm);
+
+ kvm_set_user_return_msr(0, (u64)entry_SYSCALL_64_switcher, -1ull);
+ kvm_set_user_return_msr(1, pvm->msr_tsc_aux, -1ull);
+ if (ia32_enabled()) {
+ if (is_intel)
+ kvm_set_user_return_msr(2, GDT_ENTRY_INVALID_SEG, -1ull);
+ else
+ kvm_set_user_return_msr(2, (u64)entry_SYSCALL32_ignore, -1ull);
+ }
+}
+
+static void pvm_prepare_switch_to_host(struct vcpu_pvm *pvm)
+{
+ if (!pvm->loaded_cpu_state)
+ return;
+
+ ++pvm->vcpu.stat.host_state_reload;
+
+#ifdef CONFIG_MODIFY_LDT_SYSCALL
+ if (unlikely(current->mm->context.ldt))
+ kvm_load_ldt(GDT_ENTRY_LDT*8);
+#endif
+
+ segments_save_guest_and_switch_to_host(pvm);
+ pvm->loaded_cpu_state = 0;
+}
+
+/*
+ * Set all hardware states back to the host, except for the user
+ * return MSRs, which are restored lazily on return to userspace.
+ */
+static void pvm_switch_to_host(struct vcpu_pvm *pvm)
+{
+ preempt_disable();
+ pvm_prepare_switch_to_host(pvm);
+ preempt_enable();
+}
+
+DEFINE_PER_CPU(struct vcpu_pvm *, active_pvm_vcpu);
+
+/*
+ * Switches to specified vcpu, until a matching vcpu_put(), but assumes
+ * vcpu mutex is already taken.
+ */
+static void pvm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
+{
+ struct vcpu_pvm *pvm = to_pvm(vcpu);
+
+ if (__this_cpu_read(active_pvm_vcpu) == pvm && vcpu->cpu == cpu)
+ return;
+
+ __this_cpu_write(active_pvm_vcpu, pvm);
+
+ indirect_branch_prediction_barrier();
+}
+
+static void pvm_vcpu_put(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_pvm *pvm = to_pvm(vcpu);
+
+ pvm_prepare_switch_to_host(pvm);
+}
+
+static void pvm_sched_in(struct kvm_vcpu *vcpu, int cpu)
+{
+}
+
static void pvm_setup_mce(struct kvm_vcpu *vcpu)
{
}
@@ -100,6 +307,8 @@ static void pvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
struct vcpu_pvm *pvm = to_pvm(vcpu);
int i;
+ pvm_switch_to_host(pvm);
+
kvm_gpc_deactivate(&pvm->pvcs_gpc);
if (!init_event)
@@ -183,6 +392,24 @@ static int pvm_check_processor_compat(void)
return 0;
}
+/*
+ * When in PVM mode, the hardware MSR_LSTAR is set to the entry point
+ * provided by the host entry code (the switcher), and the hypervisor
+ * may also change the hardware MSR_TSC_AUX to emulate the guest
+ * MSR_TSC_AUX, so they are registered as user return MSRs to be
+ * restored when returning to userspace.
+ */
+static __init void pvm_setup_user_return_msrs(void)
+{
+ kvm_add_user_return_msr(MSR_LSTAR);
+ kvm_add_user_return_msr(MSR_TSC_AUX);
+ if (ia32_enabled()) {
+ if (is_intel)
+ kvm_add_user_return_msr(MSR_IA32_SYSENTER_CS);
+ else
+ kvm_add_user_return_msr(MSR_CSTAR);
+ }
+}
+
static __init void pvm_set_cpu_caps(void)
{
if (boot_cpu_has(X86_FEATURE_NX))
@@ -253,6 +480,8 @@ static __init int hardware_setup(void)
store_idt(&dt);
host_idt_base = dt.address;
+ pvm_setup_user_return_msrs();
+
pvm_set_cpu_caps();
kvm_configure_mmu(false, 0, 0, 0);
@@ -287,8 +516,14 @@ static struct kvm_x86_ops pvm_x86_ops __initdata = {
.vcpu_free = pvm_vcpu_free,
.vcpu_reset = pvm_vcpu_reset,
+ .prepare_switch_to_guest = pvm_prepare_switch_to_guest,
+ .vcpu_load = pvm_vcpu_load,
+ .vcpu_put = pvm_vcpu_put,
+
.vcpu_after_set_cpuid = pvm_vcpu_after_set_cpuid,
+ .sched_in = pvm_sched_in,
+
.nested_ops = &pvm_nested_ops,
.setup_mce = pvm_setup_mce,
diff --git a/arch/x86/kvm/pvm/pvm.h b/arch/x86/kvm/pvm/pvm.h
index 599bbbb284dc..6584314487bc 100644
--- a/arch/x86/kvm/pvm/pvm.h
+++ b/arch/x86/kvm/pvm/pvm.h
@@ -30,13 +30,18 @@ struct vcpu_pvm {
unsigned long switch_flags;
+ u16 host_ds_sel, host_es_sel;
+
u32 hw_cs, hw_ss;
+ int loaded_cpu_state;
int int_shadow;
bool nmi_mask;
struct gfn_to_pfn_cache pvcs_gpc;
+ // emulated x86 MSRs
+ u64 msr_tsc_aux;
/*
* Only bits masked by msr_ia32_feature_control_valid_bits can be set in
* msr_ia32_feature_control. FEAT_CTL_LOCKED is always included
--
2.19.1.6.gb485710b