[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <e75da0ce-670a-4a01-b68a-81201babe1bf@linux.intel.com>
Date: Tue, 3 Jun 2025 11:52:54 +0800
From: "Mi, Dapeng" <dapeng1.mi@...ux.intel.com>
To: Sean Christopherson <seanjc@...gle.com>,
Paolo Bonzini <pbonzini@...hat.com>
Cc: kvm@...r.kernel.org, linux-kernel@...r.kernel.org,
Borislav Petkov <bp@...en8.de>, Xin Li <xin@...or.com>,
Chao Gao <chao.gao@...el.com>
Subject: Re: [PATCH 16/28] KVM: VMX: Manually recalc all MSR intercepts on
userspace MSR filter change
On 5/30/2025 7:40 AM, Sean Christopherson wrote:
> On a userspace MSR filter change, recalculate all MSR intercepts using the
> filter-agnostic logic instead of maintaining a "shadow copy" of KVM's
> desired intercepts. The shadow bitmaps add yet another point of failure,
> are confusing (e.g. what does "handled specially" mean!?!?), an eyesore,
> and a maintenance burden.
>
> Given that KVM *must* be able to recalculate the correct intercepts at any
> given time, and that MSR filter updates are not hot paths, there is zero
> benefit to maintaining the shadow bitmaps.
>
> Link: https://lore.kernel.org/all/aCdPbZiYmtni4Bjs@google.com
> Link: https://lore.kernel.org/all/20241126180253.GAZ0YNTdXH1UGeqsu6@fat_crate.local
> Cc: Borislav Petkov <bp@...en8.de>
> Cc: Xin Li <xin@...or.com>
> Cc: Chao Gao <chao.gao@...el.com>
> Cc: Dapeng Mi <dapeng1.mi@...ux.intel.com>
> Signed-off-by: Sean Christopherson <seanjc@...gle.com>
> ---
> arch/x86/kvm/vmx/vmx.c | 184 +++++++++++------------------------------
> arch/x86/kvm/vmx/vmx.h | 7 --
> 2 files changed, 47 insertions(+), 144 deletions(-)
>
> diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
> index 8f7fe04a1998..6ffa2b2b85ce 100644
> --- a/arch/x86/kvm/vmx/vmx.c
> +++ b/arch/x86/kvm/vmx/vmx.c
> @@ -166,31 +166,6 @@ module_param(allow_smaller_maxphyaddr, bool, S_IRUGO);
> RTIT_STATUS_ERROR | RTIT_STATUS_STOPPED | \
> RTIT_STATUS_BYTECNT))
>
> -/*
> - * List of MSRs that can be directly passed to the guest.
> - * In addition to these x2apic, PT and LBR MSRs are handled specially.
> - */
> -static u32 vmx_possible_passthrough_msrs[MAX_POSSIBLE_PASSTHROUGH_MSRS] = {
> - MSR_IA32_SPEC_CTRL,
> - MSR_IA32_PRED_CMD,
> - MSR_IA32_FLUSH_CMD,
> - MSR_IA32_TSC,
> -#ifdef CONFIG_X86_64
> - MSR_FS_BASE,
> - MSR_GS_BASE,
> - MSR_KERNEL_GS_BASE,
> - MSR_IA32_XFD,
> - MSR_IA32_XFD_ERR,
> -#endif
> - MSR_IA32_SYSENTER_CS,
> - MSR_IA32_SYSENTER_ESP,
> - MSR_IA32_SYSENTER_EIP,
> - MSR_CORE_C1_RES,
> - MSR_CORE_C3_RESIDENCY,
> - MSR_CORE_C6_RESIDENCY,
> - MSR_CORE_C7_RESIDENCY,
> -};
> -
> /*
> * These 2 parameters are used to config the controls for Pause-Loop Exiting:
> * ple_gap: upper bound on the amount of time between two successive
> @@ -672,40 +647,6 @@ static inline bool cpu_need_virtualize_apic_accesses(struct kvm_vcpu *vcpu)
> return flexpriority_enabled && lapic_in_kernel(vcpu);
> }
>
> -static int vmx_get_passthrough_msr_slot(u32 msr)
> -{
> - int i;
> -
> - switch (msr) {
> - case 0x800 ... 0x8ff:
> - /* x2APIC MSRs. These are handled in vmx_update_msr_bitmap_x2apic() */
> - return -ENOENT;
> - case MSR_IA32_RTIT_STATUS:
> - case MSR_IA32_RTIT_OUTPUT_BASE:
> - case MSR_IA32_RTIT_OUTPUT_MASK:
> - case MSR_IA32_RTIT_CR3_MATCH:
> - case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B:
> - /* PT MSRs. These are handled in pt_update_intercept_for_msr() */
> - case MSR_LBR_SELECT:
> - case MSR_LBR_TOS:
> - case MSR_LBR_INFO_0 ... MSR_LBR_INFO_0 + 31:
> - case MSR_LBR_NHM_FROM ... MSR_LBR_NHM_FROM + 31:
> - case MSR_LBR_NHM_TO ... MSR_LBR_NHM_TO + 31:
> - case MSR_LBR_CORE_FROM ... MSR_LBR_CORE_FROM + 8:
> - case MSR_LBR_CORE_TO ... MSR_LBR_CORE_TO + 8:
> - /* LBR MSRs. These are handled in vmx_update_intercept_for_lbr_msrs() */
> - return -ENOENT;
> - }
> -
> - for (i = 0; i < ARRAY_SIZE(vmx_possible_passthrough_msrs); i++) {
> - if (vmx_possible_passthrough_msrs[i] == msr)
> - return i;
> - }
> -
> - WARN(1, "Invalid MSR %x, please adapt vmx_possible_passthrough_msrs[]", msr);
> - return -ENOENT;
> -}
> -
> struct vmx_uret_msr *vmx_find_uret_msr(struct vcpu_vmx *vmx, u32 msr)
> {
> int i;
> @@ -4015,25 +3956,12 @@ void vmx_disable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type)
> {
> struct vcpu_vmx *vmx = to_vmx(vcpu);
> unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap;
> - int idx;
>
> if (!cpu_has_vmx_msr_bitmap())
> return;
>
> vmx_msr_bitmap_l01_changed(vmx);
>
> - /*
> - * Mark the desired intercept state in shadow bitmap, this is needed
> - * for resync when the MSR filters change.
> - */
> - idx = vmx_get_passthrough_msr_slot(msr);
> - if (idx >= 0) {
> - if (type & MSR_TYPE_R)
> - __clear_bit(idx, vmx->shadow_msr_intercept.read);
> - if (type & MSR_TYPE_W)
> - __clear_bit(idx, vmx->shadow_msr_intercept.write);
> - }
> -
The patch looks good to me. I have only a minor refinement suggestion for
vmx_disable_intercept_for_msr().
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 9a83d5b174c8..e898ff296fd5 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -4041,23 +4041,19 @@ void vmx_disable_intercept_for_msr(struct kvm_vcpu
*vcpu, u32 msr, int type)
clear_bit(idx, vmx->shadow_msr_intercept.write);
}
- if ((type & MSR_TYPE_R) &&
- !kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_READ)) {
- vmx_set_msr_bitmap_read(msr_bitmap, msr);
- type &= ~MSR_TYPE_R;
+ if (type & MSR_TYPE_R) {
+ if (!kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_READ))
+ vmx_set_msr_bitmap_read(msr_bitmap, msr);
+ else
+ vmx_clear_msr_bitmap_read(msr_bitmap, msr);
}
- if ((type & MSR_TYPE_W) &&
- !kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_WRITE)) {
- vmx_set_msr_bitmap_write(msr_bitmap, msr);
- type &= ~MSR_TYPE_W;
+ if (type & MSR_TYPE_W) {
+ if (!kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_WRITE))
+ vmx_set_msr_bitmap_write(msr_bitmap, msr);
+ else
+ vmx_clear_msr_bitmap_write(msr_bitmap, msr);
}
-
- if (type & MSR_TYPE_R)
- vmx_clear_msr_bitmap_read(msr_bitmap, msr);
-
- if (type & MSR_TYPE_W)
- vmx_clear_msr_bitmap_write(msr_bitmap, msr);
}
This version looks simpler and is easier to understand.
Reviewed-by: Dapeng Mi <dapeng1.mi@...ux.intel.com>
> if ((type & MSR_TYPE_R) &&
> !kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_READ)) {
> vmx_set_msr_bitmap_read(msr_bitmap, msr);
> @@ -4057,25 +3985,12 @@ void vmx_enable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type)
> {
> struct vcpu_vmx *vmx = to_vmx(vcpu);
> unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap;
> - int idx;
>
> if (!cpu_has_vmx_msr_bitmap())
> return;
>
> vmx_msr_bitmap_l01_changed(vmx);
>
> - /*
> - * Mark the desired intercept state in shadow bitmap, this is needed
> - * for resync when the MSR filter changes.
> - */
> - idx = vmx_get_passthrough_msr_slot(msr);
> - if (idx >= 0) {
> - if (type & MSR_TYPE_R)
> - __set_bit(idx, vmx->shadow_msr_intercept.read);
> - if (type & MSR_TYPE_W)
> - __set_bit(idx, vmx->shadow_msr_intercept.write);
> - }
> -
> if (type & MSR_TYPE_R)
> vmx_set_msr_bitmap_read(msr_bitmap, msr);
>
> @@ -4159,35 +4074,59 @@ void pt_update_intercept_for_msr(struct kvm_vcpu *vcpu)
> }
> }
>
> -void vmx_msr_filter_changed(struct kvm_vcpu *vcpu)
> +static void vmx_recalc_msr_intercepts(struct kvm_vcpu *vcpu)
> {
> - struct vcpu_vmx *vmx = to_vmx(vcpu);
> - u32 i;
> -
> if (!cpu_has_vmx_msr_bitmap())
> return;
>
> - /*
> - * Redo intercept permissions for MSRs that KVM is passing through to
> - * the guest. Disabling interception will check the new MSR filter and
> - * ensure that KVM enables interception if usersepace wants to filter
> - * the MSR. MSRs that KVM is already intercepting don't need to be
> - * refreshed since KVM is going to intercept them regardless of what
> - * userspace wants.
> - */
> - for (i = 0; i < ARRAY_SIZE(vmx_possible_passthrough_msrs); i++) {
> - u32 msr = vmx_possible_passthrough_msrs[i];
> -
> - if (!test_bit(i, vmx->shadow_msr_intercept.read))
> - vmx_disable_intercept_for_msr(vcpu, msr, MSR_TYPE_R);
> -
> - if (!test_bit(i, vmx->shadow_msr_intercept.write))
> - vmx_disable_intercept_for_msr(vcpu, msr, MSR_TYPE_W);
> + vmx_disable_intercept_for_msr(vcpu, MSR_IA32_TSC, MSR_TYPE_R);
> +#ifdef CONFIG_X86_64
> + vmx_disable_intercept_for_msr(vcpu, MSR_FS_BASE, MSR_TYPE_RW);
> + vmx_disable_intercept_for_msr(vcpu, MSR_GS_BASE, MSR_TYPE_RW);
> + vmx_disable_intercept_for_msr(vcpu, MSR_KERNEL_GS_BASE, MSR_TYPE_RW);
> +#endif
> + vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_CS, MSR_TYPE_RW);
> + vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_ESP, MSR_TYPE_RW);
> + vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_EIP, MSR_TYPE_RW);
> + if (kvm_cstate_in_guest(vcpu->kvm)) {
> + vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C1_RES, MSR_TYPE_R);
> + vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C3_RESIDENCY, MSR_TYPE_R);
> + vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C6_RESIDENCY, MSR_TYPE_R);
> + vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C7_RESIDENCY, MSR_TYPE_R);
> }
>
> /* PT MSRs can be passed through iff PT is exposed to the guest. */
> if (vmx_pt_mode_is_host_guest())
> pt_update_intercept_for_msr(vcpu);
> +
> + if (vcpu->arch.xfd_no_write_intercept)
> + vmx_disable_intercept_for_msr(vcpu, MSR_IA32_XFD, MSR_TYPE_RW);
> +
> +
> + vmx_set_intercept_for_msr(vcpu, MSR_IA32_SPEC_CTRL, MSR_TYPE_RW,
> + !to_vmx(vcpu)->spec_ctrl);
> +
> + if (kvm_cpu_cap_has(X86_FEATURE_XFD))
> + vmx_set_intercept_for_msr(vcpu, MSR_IA32_XFD_ERR, MSR_TYPE_R,
> + !guest_cpu_cap_has(vcpu, X86_FEATURE_XFD));
> +
> + if (boot_cpu_has(X86_FEATURE_IBPB))
> + vmx_set_intercept_for_msr(vcpu, MSR_IA32_PRED_CMD, MSR_TYPE_W,
> + !guest_has_pred_cmd_msr(vcpu));
> +
> + if (boot_cpu_has(X86_FEATURE_FLUSH_L1D))
> + vmx_set_intercept_for_msr(vcpu, MSR_IA32_FLUSH_CMD, MSR_TYPE_W,
> + !guest_cpu_cap_has(vcpu, X86_FEATURE_FLUSH_L1D));
> +
> + /*
> + * x2APIC and LBR MSR intercepts are modified on-demand and cannot be
> + * filtered by userspace.
> + */
> +}
> +
> +void vmx_msr_filter_changed(struct kvm_vcpu *vcpu)
> +{
> + vmx_recalc_msr_intercepts(vcpu);
> }
>
> static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu,
> @@ -7537,26 +7476,6 @@ int vmx_vcpu_create(struct kvm_vcpu *vcpu)
> evmcs->hv_enlightenments_control.msr_bitmap = 1;
> }
>
> - /* The MSR bitmap starts with all ones */
> - bitmap_fill(vmx->shadow_msr_intercept.read, MAX_POSSIBLE_PASSTHROUGH_MSRS);
> - bitmap_fill(vmx->shadow_msr_intercept.write, MAX_POSSIBLE_PASSTHROUGH_MSRS);
> -
> - vmx_disable_intercept_for_msr(vcpu, MSR_IA32_TSC, MSR_TYPE_R);
> -#ifdef CONFIG_X86_64
> - vmx_disable_intercept_for_msr(vcpu, MSR_FS_BASE, MSR_TYPE_RW);
> - vmx_disable_intercept_for_msr(vcpu, MSR_GS_BASE, MSR_TYPE_RW);
> - vmx_disable_intercept_for_msr(vcpu, MSR_KERNEL_GS_BASE, MSR_TYPE_RW);
> -#endif
> - vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_CS, MSR_TYPE_RW);
> - vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_ESP, MSR_TYPE_RW);
> - vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_EIP, MSR_TYPE_RW);
> - if (kvm_cstate_in_guest(vcpu->kvm)) {
> - vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C1_RES, MSR_TYPE_R);
> - vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C3_RESIDENCY, MSR_TYPE_R);
> - vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C6_RESIDENCY, MSR_TYPE_R);
> - vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C7_RESIDENCY, MSR_TYPE_R);
> - }
> -
> vmx->loaded_vmcs = &vmx->vmcs01;
>
> if (cpu_need_virtualize_apic_accesses(vcpu)) {
> @@ -7842,18 +7761,6 @@ void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
> }
> }
>
> - if (kvm_cpu_cap_has(X86_FEATURE_XFD))
> - vmx_set_intercept_for_msr(vcpu, MSR_IA32_XFD_ERR, MSR_TYPE_R,
> - !guest_cpu_cap_has(vcpu, X86_FEATURE_XFD));
> -
> - if (boot_cpu_has(X86_FEATURE_IBPB))
> - vmx_set_intercept_for_msr(vcpu, MSR_IA32_PRED_CMD, MSR_TYPE_W,
> - !guest_has_pred_cmd_msr(vcpu));
> -
> - if (boot_cpu_has(X86_FEATURE_FLUSH_L1D))
> - vmx_set_intercept_for_msr(vcpu, MSR_IA32_FLUSH_CMD, MSR_TYPE_W,
> - !guest_cpu_cap_has(vcpu, X86_FEATURE_FLUSH_L1D));
> -
> set_cr4_guest_host_mask(vmx);
>
> vmx_write_encls_bitmap(vcpu, NULL);
> @@ -7869,6 +7776,9 @@ void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
> vmx->msr_ia32_feature_control_valid_bits &=
> ~FEAT_CTL_SGX_LC_ENABLED;
>
> + /* Recalc MSR interception to account for feature changes. */
> + vmx_recalc_msr_intercepts(vcpu);
> +
> /* Refresh #PF interception to account for MAXPHYADDR changes. */
> vmx_update_exception_bitmap(vcpu);
> }
> diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
> index 0afe97e3478f..a26fe3d9e1d2 100644
> --- a/arch/x86/kvm/vmx/vmx.h
> +++ b/arch/x86/kvm/vmx/vmx.h
> @@ -294,13 +294,6 @@ struct vcpu_vmx {
> struct pt_desc pt_desc;
> struct lbr_desc lbr_desc;
>
> - /* Save desired MSR intercept (read: pass-through) state */
> -#define MAX_POSSIBLE_PASSTHROUGH_MSRS 16
> - struct {
> - DECLARE_BITMAP(read, MAX_POSSIBLE_PASSTHROUGH_MSRS);
> - DECLARE_BITMAP(write, MAX_POSSIBLE_PASSTHROUGH_MSRS);
> - } shadow_msr_intercept;
> -
> /* ve_info must be page aligned. */
> struct vmx_ve_information *ve_info;
> };
Powered by blists - more mailing lists