linux-kernel - Re: [PATCH 16/28] KVM: VMX: Manually recalc all MSR intercepts on userspace MSR filter change

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <e75da0ce-670a-4a01-b68a-81201babe1bf@linux.intel.com>
Date: Tue, 3 Jun 2025 11:52:54 +0800
From: "Mi, Dapeng" <dapeng1.mi@...ux.intel.com>
To: Sean Christopherson <seanjc@...gle.com>,
 Paolo Bonzini <pbonzini@...hat.com>
Cc: kvm@...r.kernel.org, linux-kernel@...r.kernel.org,
 Borislav Petkov <bp@...en8.de>, Xin Li <xin@...or.com>,
 Chao Gao <chao.gao@...el.com>
Subject: Re: [PATCH 16/28] KVM: VMX: Manually recalc all MSR intercepts on
 userspace MSR filter change


On 5/30/2025 7:40 AM, Sean Christopherson wrote:
> On a userspace MSR filter change, recalculate all MSR intercepts using the
> filter-agnostic logic instead of maintaining a "shadow copy" of KVM's
> desired intercepts.  The shadow bitmaps add yet another point of failure,
> are confusing (e.g. what does "handled specially" mean!?!?), an eyesore,
> and a maintenance burden.
>
> Given that KVM *must* be able to recalculate the correct intercepts at any
> given time, and that MSR filter updates are not hot paths, there is zero
> benefit to maintaining the shadow bitmaps.
>
> Link: https://lore.kernel.org/all/aCdPbZiYmtni4Bjs@google.com
> Link: https://lore.kernel.org/all/20241126180253.GAZ0YNTdXH1UGeqsu6@fat_crate.local
> Cc: Borislav Petkov <bp@...en8.de>
> Cc: Xin Li <xin@...or.com>
> Cc: Chao Gao <chao.gao@...el.com>
> Cc: Dapeng Mi <dapeng1.mi@...ux.intel.com>
> Signed-off-by: Sean Christopherson <seanjc@...gle.com>
> ---
>  arch/x86/kvm/vmx/vmx.c | 184 +++++++++++------------------------------
>  arch/x86/kvm/vmx/vmx.h |   7 --
>  2 files changed, 47 insertions(+), 144 deletions(-)
>
> diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
> index 8f7fe04a1998..6ffa2b2b85ce 100644
> --- a/arch/x86/kvm/vmx/vmx.c
> +++ b/arch/x86/kvm/vmx/vmx.c
> @@ -166,31 +166,6 @@ module_param(allow_smaller_maxphyaddr, bool, S_IRUGO);
>  	RTIT_STATUS_ERROR | RTIT_STATUS_STOPPED | \
>  	RTIT_STATUS_BYTECNT))
>  
> -/*
> - * List of MSRs that can be directly passed to the guest.
> - * In addition to these x2apic, PT and LBR MSRs are handled specially.
> - */
> -static u32 vmx_possible_passthrough_msrs[MAX_POSSIBLE_PASSTHROUGH_MSRS] = {
> -	MSR_IA32_SPEC_CTRL,
> -	MSR_IA32_PRED_CMD,
> -	MSR_IA32_FLUSH_CMD,
> -	MSR_IA32_TSC,
> -#ifdef CONFIG_X86_64
> -	MSR_FS_BASE,
> -	MSR_GS_BASE,
> -	MSR_KERNEL_GS_BASE,
> -	MSR_IA32_XFD,
> -	MSR_IA32_XFD_ERR,
> -#endif
> -	MSR_IA32_SYSENTER_CS,
> -	MSR_IA32_SYSENTER_ESP,
> -	MSR_IA32_SYSENTER_EIP,
> -	MSR_CORE_C1_RES,
> -	MSR_CORE_C3_RESIDENCY,
> -	MSR_CORE_C6_RESIDENCY,
> -	MSR_CORE_C7_RESIDENCY,
> -};
> -
>  /*
>   * These 2 parameters are used to config the controls for Pause-Loop Exiting:
>   * ple_gap:    upper bound on the amount of time between two successive
> @@ -672,40 +647,6 @@ static inline bool cpu_need_virtualize_apic_accesses(struct kvm_vcpu *vcpu)
>  	return flexpriority_enabled && lapic_in_kernel(vcpu);
>  }
>  
> -static int vmx_get_passthrough_msr_slot(u32 msr)
> -{
> -	int i;
> -
> -	switch (msr) {
> -	case 0x800 ... 0x8ff:
> -		/* x2APIC MSRs. These are handled in vmx_update_msr_bitmap_x2apic() */
> -		return -ENOENT;
> -	case MSR_IA32_RTIT_STATUS:
> -	case MSR_IA32_RTIT_OUTPUT_BASE:
> -	case MSR_IA32_RTIT_OUTPUT_MASK:
> -	case MSR_IA32_RTIT_CR3_MATCH:
> -	case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B:
> -		/* PT MSRs. These are handled in pt_update_intercept_for_msr() */
> -	case MSR_LBR_SELECT:
> -	case MSR_LBR_TOS:
> -	case MSR_LBR_INFO_0 ... MSR_LBR_INFO_0 + 31:
> -	case MSR_LBR_NHM_FROM ... MSR_LBR_NHM_FROM + 31:
> -	case MSR_LBR_NHM_TO ... MSR_LBR_NHM_TO + 31:
> -	case MSR_LBR_CORE_FROM ... MSR_LBR_CORE_FROM + 8:
> -	case MSR_LBR_CORE_TO ... MSR_LBR_CORE_TO + 8:
> -		/* LBR MSRs. These are handled in vmx_update_intercept_for_lbr_msrs() */
> -		return -ENOENT;
> -	}
> -
> -	for (i = 0; i < ARRAY_SIZE(vmx_possible_passthrough_msrs); i++) {
> -		if (vmx_possible_passthrough_msrs[i] == msr)
> -			return i;
> -	}
> -
> -	WARN(1, "Invalid MSR %x, please adapt vmx_possible_passthrough_msrs[]", msr);
> -	return -ENOENT;
> -}
> -
>  struct vmx_uret_msr *vmx_find_uret_msr(struct vcpu_vmx *vmx, u32 msr)
>  {
>  	int i;
> @@ -4015,25 +3956,12 @@ void vmx_disable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type)
>  {
>  	struct vcpu_vmx *vmx = to_vmx(vcpu);
>  	unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap;
> -	int idx;
>  
>  	if (!cpu_has_vmx_msr_bitmap())
>  		return;
>  
>  	vmx_msr_bitmap_l01_changed(vmx);
>  
> -	/*
> -	 * Mark the desired intercept state in shadow bitmap, this is needed
> -	 * for resync when the MSR filters change.
> -	 */
> -	idx = vmx_get_passthrough_msr_slot(msr);
> -	if (idx >= 0) {
> -		if (type & MSR_TYPE_R)
> -			__clear_bit(idx, vmx->shadow_msr_intercept.read);
> -		if (type & MSR_TYPE_W)
> -			__clear_bit(idx, vmx->shadow_msr_intercept.write);
> -	}
> -

The patch looks good to me. I have only one minor refinement to suggest for
vmx_disable_intercept_for_msr():

diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 9a83d5b174c8..e898ff296fd5 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -4041,23 +4041,19 @@ void vmx_disable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type)
                        clear_bit(idx, vmx->shadow_msr_intercept.write);
        }

-       if ((type & MSR_TYPE_R) &&
-           !kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_READ)) {
-               vmx_set_msr_bitmap_read(msr_bitmap, msr);
-               type &= ~MSR_TYPE_R;
+       if (type & MSR_TYPE_R) {
+               if (!kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_READ))
+                       vmx_set_msr_bitmap_read(msr_bitmap, msr);
+               else
+                       vmx_clear_msr_bitmap_read(msr_bitmap, msr);
        }

-       if ((type & MSR_TYPE_W) &&
-           !kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_WRITE)) {
-               vmx_set_msr_bitmap_write(msr_bitmap, msr);
-               type &= ~MSR_TYPE_W;
+       if (type & MSR_TYPE_W) {
+               if (!kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_WRITE))
+                       vmx_set_msr_bitmap_write(msr_bitmap, msr);
+               else
+                       vmx_clear_msr_bitmap_write(msr_bitmap, msr);
        }
-
-       if (type & MSR_TYPE_R)
-               vmx_clear_msr_bitmap_read(msr_bitmap, msr);
-
-       if (type & MSR_TYPE_W)
-               vmx_clear_msr_bitmap_write(msr_bitmap, msr);
 }

This version looks simpler and is easier to understand.

Reviewed-by: Dapeng Mi <dapeng1.mi@...ux.intel.com>



>  	if ((type & MSR_TYPE_R) &&
>  	    !kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_READ)) {
>  		vmx_set_msr_bitmap_read(msr_bitmap, msr);
> @@ -4057,25 +3985,12 @@ void vmx_enable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type)
>  {
>  	struct vcpu_vmx *vmx = to_vmx(vcpu);
>  	unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap;
> -	int idx;
>  
>  	if (!cpu_has_vmx_msr_bitmap())
>  		return;
>  
>  	vmx_msr_bitmap_l01_changed(vmx);
>  
> -	/*
> -	 * Mark the desired intercept state in shadow bitmap, this is needed
> -	 * for resync when the MSR filter changes.
> -	 */
> -	idx = vmx_get_passthrough_msr_slot(msr);
> -	if (idx >= 0) {
> -		if (type & MSR_TYPE_R)
> -			__set_bit(idx, vmx->shadow_msr_intercept.read);
> -		if (type & MSR_TYPE_W)
> -			__set_bit(idx, vmx->shadow_msr_intercept.write);
> -	}
> -
>  	if (type & MSR_TYPE_R)
>  		vmx_set_msr_bitmap_read(msr_bitmap, msr);
>  
> @@ -4159,35 +4074,59 @@ void pt_update_intercept_for_msr(struct kvm_vcpu *vcpu)
>  	}
>  }
>  
> -void vmx_msr_filter_changed(struct kvm_vcpu *vcpu)
> +static void vmx_recalc_msr_intercepts(struct kvm_vcpu *vcpu)
>  {
> -	struct vcpu_vmx *vmx = to_vmx(vcpu);
> -	u32 i;
> -
>  	if (!cpu_has_vmx_msr_bitmap())
>  		return;
>  
> -	/*
> -	 * Redo intercept permissions for MSRs that KVM is passing through to
> -	 * the guest.  Disabling interception will check the new MSR filter and
> -	 * ensure that KVM enables interception if usersepace wants to filter
> -	 * the MSR.  MSRs that KVM is already intercepting don't need to be
> -	 * refreshed since KVM is going to intercept them regardless of what
> -	 * userspace wants.
> -	 */
> -	for (i = 0; i < ARRAY_SIZE(vmx_possible_passthrough_msrs); i++) {
> -		u32 msr = vmx_possible_passthrough_msrs[i];
> -
> -		if (!test_bit(i, vmx->shadow_msr_intercept.read))
> -			vmx_disable_intercept_for_msr(vcpu, msr, MSR_TYPE_R);
> -
> -		if (!test_bit(i, vmx->shadow_msr_intercept.write))
> -			vmx_disable_intercept_for_msr(vcpu, msr, MSR_TYPE_W);
> +	vmx_disable_intercept_for_msr(vcpu, MSR_IA32_TSC, MSR_TYPE_R);
> +#ifdef CONFIG_X86_64
> +	vmx_disable_intercept_for_msr(vcpu, MSR_FS_BASE, MSR_TYPE_RW);
> +	vmx_disable_intercept_for_msr(vcpu, MSR_GS_BASE, MSR_TYPE_RW);
> +	vmx_disable_intercept_for_msr(vcpu, MSR_KERNEL_GS_BASE, MSR_TYPE_RW);
> +#endif
> +	vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_CS, MSR_TYPE_RW);
> +	vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_ESP, MSR_TYPE_RW);
> +	vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_EIP, MSR_TYPE_RW);
> +	if (kvm_cstate_in_guest(vcpu->kvm)) {
> +		vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C1_RES, MSR_TYPE_R);
> +		vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C3_RESIDENCY, MSR_TYPE_R);
> +		vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C6_RESIDENCY, MSR_TYPE_R);
> +		vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C7_RESIDENCY, MSR_TYPE_R);
>  	}
>  
>  	/* PT MSRs can be passed through iff PT is exposed to the guest. */
>  	if (vmx_pt_mode_is_host_guest())
>  		pt_update_intercept_for_msr(vcpu);
> +
> +	if (vcpu->arch.xfd_no_write_intercept)
> +		vmx_disable_intercept_for_msr(vcpu, MSR_IA32_XFD, MSR_TYPE_RW);
> +
> +
> +	vmx_set_intercept_for_msr(vcpu, MSR_IA32_SPEC_CTRL, MSR_TYPE_RW,
> +				  !to_vmx(vcpu)->spec_ctrl);
> +
> +	if (kvm_cpu_cap_has(X86_FEATURE_XFD))
> +		vmx_set_intercept_for_msr(vcpu, MSR_IA32_XFD_ERR, MSR_TYPE_R,
> +					  !guest_cpu_cap_has(vcpu, X86_FEATURE_XFD));
> +
> +	if (boot_cpu_has(X86_FEATURE_IBPB))
> +		vmx_set_intercept_for_msr(vcpu, MSR_IA32_PRED_CMD, MSR_TYPE_W,
> +					  !guest_has_pred_cmd_msr(vcpu));
> +
> +	if (boot_cpu_has(X86_FEATURE_FLUSH_L1D))
> +		vmx_set_intercept_for_msr(vcpu, MSR_IA32_FLUSH_CMD, MSR_TYPE_W,
> +					  !guest_cpu_cap_has(vcpu, X86_FEATURE_FLUSH_L1D));
> +
> +	/*
> +	 * x2APIC and LBR MSR intercepts are modified on-demand and cannot be
> +	 * filtered by userspace.
> +	 */
> +}
> +
> +void vmx_msr_filter_changed(struct kvm_vcpu *vcpu)
> +{
> +	vmx_recalc_msr_intercepts(vcpu);
>  }
>  
>  static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu,
> @@ -7537,26 +7476,6 @@ int vmx_vcpu_create(struct kvm_vcpu *vcpu)
>  		evmcs->hv_enlightenments_control.msr_bitmap = 1;
>  	}
>  
> -	/* The MSR bitmap starts with all ones */
> -	bitmap_fill(vmx->shadow_msr_intercept.read, MAX_POSSIBLE_PASSTHROUGH_MSRS);
> -	bitmap_fill(vmx->shadow_msr_intercept.write, MAX_POSSIBLE_PASSTHROUGH_MSRS);
> -
> -	vmx_disable_intercept_for_msr(vcpu, MSR_IA32_TSC, MSR_TYPE_R);
> -#ifdef CONFIG_X86_64
> -	vmx_disable_intercept_for_msr(vcpu, MSR_FS_BASE, MSR_TYPE_RW);
> -	vmx_disable_intercept_for_msr(vcpu, MSR_GS_BASE, MSR_TYPE_RW);
> -	vmx_disable_intercept_for_msr(vcpu, MSR_KERNEL_GS_BASE, MSR_TYPE_RW);
> -#endif
> -	vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_CS, MSR_TYPE_RW);
> -	vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_ESP, MSR_TYPE_RW);
> -	vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_EIP, MSR_TYPE_RW);
> -	if (kvm_cstate_in_guest(vcpu->kvm)) {
> -		vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C1_RES, MSR_TYPE_R);
> -		vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C3_RESIDENCY, MSR_TYPE_R);
> -		vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C6_RESIDENCY, MSR_TYPE_R);
> -		vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C7_RESIDENCY, MSR_TYPE_R);
> -	}
> -
>  	vmx->loaded_vmcs = &vmx->vmcs01;
>  
>  	if (cpu_need_virtualize_apic_accesses(vcpu)) {
> @@ -7842,18 +7761,6 @@ void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
>  		}
>  	}
>  
> -	if (kvm_cpu_cap_has(X86_FEATURE_XFD))
> -		vmx_set_intercept_for_msr(vcpu, MSR_IA32_XFD_ERR, MSR_TYPE_R,
> -					  !guest_cpu_cap_has(vcpu, X86_FEATURE_XFD));
> -
> -	if (boot_cpu_has(X86_FEATURE_IBPB))
> -		vmx_set_intercept_for_msr(vcpu, MSR_IA32_PRED_CMD, MSR_TYPE_W,
> -					  !guest_has_pred_cmd_msr(vcpu));
> -
> -	if (boot_cpu_has(X86_FEATURE_FLUSH_L1D))
> -		vmx_set_intercept_for_msr(vcpu, MSR_IA32_FLUSH_CMD, MSR_TYPE_W,
> -					  !guest_cpu_cap_has(vcpu, X86_FEATURE_FLUSH_L1D));
> -
>  	set_cr4_guest_host_mask(vmx);
>  
>  	vmx_write_encls_bitmap(vcpu, NULL);
> @@ -7869,6 +7776,9 @@ void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
>  		vmx->msr_ia32_feature_control_valid_bits &=
>  			~FEAT_CTL_SGX_LC_ENABLED;
>  
> +	/* Recalc MSR interception to account for feature changes. */
> +	vmx_recalc_msr_intercepts(vcpu);
> +
>  	/* Refresh #PF interception to account for MAXPHYADDR changes. */
>  	vmx_update_exception_bitmap(vcpu);
>  }
> diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
> index 0afe97e3478f..a26fe3d9e1d2 100644
> --- a/arch/x86/kvm/vmx/vmx.h
> +++ b/arch/x86/kvm/vmx/vmx.h
> @@ -294,13 +294,6 @@ struct vcpu_vmx {
>  	struct pt_desc pt_desc;
>  	struct lbr_desc lbr_desc;
>  
> -	/* Save desired MSR intercept (read: pass-through) state */
> -#define MAX_POSSIBLE_PASSTHROUGH_MSRS	16
> -	struct {
> -		DECLARE_BITMAP(read, MAX_POSSIBLE_PASSTHROUGH_MSRS);
> -		DECLARE_BITMAP(write, MAX_POSSIBLE_PASSTHROUGH_MSRS);
> -	} shadow_msr_intercept;
> -
>  	/* ve_info must be page aligned. */
>  	struct vmx_ve_information *ve_info;
>  };