[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <66bc75f6-58c5-c67f-f268-220d371022a2@redhat.com>
Date: Wed, 17 Mar 2021 14:17:34 +0100
From: Paolo Bonzini <pbonzini@...hat.com>
To: Sean Christopherson <seanjc@...gle.com>
Cc: Vitaly Kuznetsov <vkuznets@...hat.com>,
Wanpeng Li <wanpengli@...cent.com>,
Jim Mattson <jmattson@...gle.com>,
Joerg Roedel <joro@...tes.org>, kvm@...r.kernel.org,
linux-kernel@...r.kernel.org, Alexander Graf <graf@...zon.com>,
Yuan Yao <yaoyuan0329os@...il.com>
Subject: Re: [PATCH 2/4] KVM: nVMX: Handle dynamic MSR intercept toggling
On 16/03/21 19:44, Sean Christopherson wrote:
> Always check vmcs01's MSR bitmap when merging L0 and L1 bitmaps for L2,
> and always update the relevant bits in vmcs02. This fixes two distinct,
> but intertwined bugs related to dynamic MSR bitmap modifications.
>
> The first issue is that KVM fails to enable MSR interception in vmcs02
> for the FS/GS base MSRs if L1 first runs L2 with interception disabled,
> and later enables interception.
>
> The second issue is that KVM fails to honor userspace MSR filtering when
> preparing vmcs02.
>
> Fix both issues simultaneously, as fixing only one of the issues (doesn't
> matter which) would create a mess that no one should have to bisect.
> Fixing only the first bug would exacerbate the MSR filtering issue as
> userspace would see inconsistent behavior depending on the whims of L1.
> Fixing only the second bug (MSR filtering) effectively requires fixing
> the first, as the nVMX code only knows how to transition vmcs02's
> bitmap from 1->0.
>
> Move the various accessor/mutators buried in vmx.c into vmx.h so that
> they can be shared by the nested code.
>
> Fixes: 1a155254ff93 ("KVM: x86: Introduce MSR filtering")
> Fixes: d69129b4e46a ("KVM: nVMX: Disable intercept for FS/GS base MSRs in vmcs02 when possible")
> Cc: stable@...r.kernel.org
> Cc: Alexander Graf <graf@...zon.com>
> Signed-off-by: Sean Christopherson <seanjc@...gle.com>
> ---
> arch/x86/kvm/vmx/nested.c | 108 +++++++++++++++++---------------------
> arch/x86/kvm/vmx/vmx.c | 67 ++---------------------
> arch/x86/kvm/vmx/vmx.h | 63 ++++++++++++++++++++++
> 3 files changed, 115 insertions(+), 123 deletions(-)
>
> diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
> index fd334e4aa6db..aff41a432a56 100644
> --- a/arch/x86/kvm/vmx/nested.c
> +++ b/arch/x86/kvm/vmx/nested.c
> @@ -475,29 +475,6 @@ static int nested_vmx_check_tpr_shadow_controls(struct kvm_vcpu *vcpu,
> return 0;
> }
>
> -/*
> - * Check if MSR is intercepted for L01 MSR bitmap.
> - */
> -static bool msr_write_intercepted_l01(struct kvm_vcpu *vcpu, u32 msr)
> -{
> - unsigned long *msr_bitmap;
> - int f = sizeof(unsigned long);
> -
> - if (!cpu_has_vmx_msr_bitmap())
> - return true;
> -
> - msr_bitmap = to_vmx(vcpu)->vmcs01.msr_bitmap;
> -
> - if (msr <= 0x1fff) {
> - return !!test_bit(msr, msr_bitmap + 0x800 / f);
> - } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
> - msr &= 0x1fff;
> - return !!test_bit(msr, msr_bitmap + 0xc00 / f);
> - }
> -
> - return true;
> -}
> -
> /*
> * If a msr is allowed by L0, we should check whether it is allowed by L1.
> * The corresponding bit will be cleared unless both of L0 and L1 allow it.
> @@ -551,6 +528,34 @@ static inline void enable_x2apic_msr_intercepts(unsigned long *msr_bitmap)
> }
> }
>
> +#define BUILD_NVMX_MSR_INTERCEPT_HELPER(rw) \
> +static inline \
> +void nested_vmx_set_msr_##rw##_intercept(struct vcpu_vmx *vmx, \
> + unsigned long *msr_bitmap_l1, \
> + unsigned long *msr_bitmap_l0, u32 msr) \
> +{ \
> + if (vmx_test_msr_bitmap_##rw(vmx->vmcs01.msr_bitmap, msr) || \
> + vmx_test_msr_bitmap_##rw(msr_bitmap_l1, msr)) \
> + vmx_set_msr_bitmap_##rw(msr_bitmap_l0, msr); \
> + else \
> + vmx_clear_msr_bitmap_##rw(msr_bitmap_l0, msr); \
> +}
> +BUILD_NVMX_MSR_INTERCEPT_HELPER(read)
> +BUILD_NVMX_MSR_INTERCEPT_HELPER(write)
> +
> +static inline void nested_vmx_set_intercept_for_msr(struct vcpu_vmx *vmx,
> + unsigned long *msr_bitmap_l1,
> + unsigned long *msr_bitmap_l0,
> + u32 msr, int types)
> +{
> + if (types & MSR_TYPE_R)
> + nested_vmx_set_msr_read_intercept(vmx, msr_bitmap_l1,
> + msr_bitmap_l0, msr);
> + if (types & MSR_TYPE_W)
> + nested_vmx_set_msr_write_intercept(vmx, msr_bitmap_l1,
> + msr_bitmap_l0, msr);
> +}
> +
> /*
> * Merge L0's and L1's MSR bitmap, return false to indicate that
> * we do not use the hardware.
> @@ -558,10 +563,11 @@ static inline void enable_x2apic_msr_intercepts(unsigned long *msr_bitmap)
> static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
> struct vmcs12 *vmcs12)
> {
> + struct vcpu_vmx *vmx = to_vmx(vcpu);
> int msr;
> unsigned long *msr_bitmap_l1;
> - unsigned long *msr_bitmap_l0 = to_vmx(vcpu)->nested.vmcs02.msr_bitmap;
> - struct kvm_host_map *map = &to_vmx(vcpu)->nested.msr_bitmap_map;
> + unsigned long *msr_bitmap_l0 = vmx->nested.vmcs02.msr_bitmap;
> + struct kvm_host_map *map = &vmx->nested.msr_bitmap_map;
>
> /* Nothing to do if the MSR bitmap is not in use. */
> if (!cpu_has_vmx_msr_bitmap() ||
> @@ -612,42 +618,26 @@ static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
> }
> }
>
> - /* KVM unconditionally exposes the FS/GS base MSRs to L1. */
> - nested_vmx_disable_intercept_for_msr(msr_bitmap_l1, msr_bitmap_l0,
> - MSR_FS_BASE, MSR_TYPE_RW);
> -
> - nested_vmx_disable_intercept_for_msr(msr_bitmap_l1, msr_bitmap_l0,
> - MSR_GS_BASE, MSR_TYPE_RW);
> -
> - nested_vmx_disable_intercept_for_msr(msr_bitmap_l1, msr_bitmap_l0,
> - MSR_KERNEL_GS_BASE, MSR_TYPE_RW);
> -
> /*
> - * Checking the L0->L1 bitmap is trying to verify two things:
> - *
> - * 1. L0 gave a permission to L1 to actually passthrough the MSR. This
> - * ensures that we do not accidentally generate an L02 MSR bitmap
> - * from the L12 MSR bitmap that is too permissive.
> - * 2. That L1 or L2s have actually used the MSR. This avoids
> - * unnecessarily merging of the bitmap if the MSR is unused. This
> - * works properly because we only update the L01 MSR bitmap lazily.
> - * So even if L0 should pass L1 these MSRs, the L01 bitmap is only
> - * updated to reflect this when L1 (or its L2s) actually write to
> - * the MSR.
> + * Always check vmcs01's bitmap to honor userspace MSR filters and any
> + * other runtime changes to vmcs01's bitmap, e.g. dynamic pass-through.
> */
> - if (!msr_write_intercepted_l01(vcpu, MSR_IA32_SPEC_CTRL))
> - nested_vmx_disable_intercept_for_msr(
> - msr_bitmap_l1, msr_bitmap_l0,
> - MSR_IA32_SPEC_CTRL,
> - MSR_TYPE_R | MSR_TYPE_W);
> -
> - if (!msr_write_intercepted_l01(vcpu, MSR_IA32_PRED_CMD))
> - nested_vmx_disable_intercept_for_msr(
> - msr_bitmap_l1, msr_bitmap_l0,
> - MSR_IA32_PRED_CMD,
> - MSR_TYPE_W);
> -
> - kvm_vcpu_unmap(vcpu, &to_vmx(vcpu)->nested.msr_bitmap_map, false);
> + nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
> + MSR_FS_BASE, MSR_TYPE_RW);
> +
> + nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
> + MSR_GS_BASE, MSR_TYPE_RW);
> +
> + nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
> + MSR_KERNEL_GS_BASE, MSR_TYPE_RW);
> +
> + nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
> + MSR_IA32_SPEC_CTRL, MSR_TYPE_RW);
> +
> + nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
> + MSR_IA32_PRED_CMD, MSR_TYPE_W);
> +
> + kvm_vcpu_unmap(vcpu, &vmx->nested.msr_bitmap_map, false);
>
> return true;
> }
> diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
> index c8a4a548e96b..9972e5d1c44e 100644
> --- a/arch/x86/kvm/vmx/vmx.c
> +++ b/arch/x86/kvm/vmx/vmx.c
> @@ -879,29 +879,6 @@ void vmx_update_exception_bitmap(struct kvm_vcpu *vcpu)
> vmcs_write32(EXCEPTION_BITMAP, eb);
> }
>
> -/*
> - * Check if MSR is intercepted for currently loaded MSR bitmap.
> - */
> -static bool msr_write_intercepted(struct kvm_vcpu *vcpu, u32 msr)
> -{
> - unsigned long *msr_bitmap;
> - int f = sizeof(unsigned long);
> -
> - if (!cpu_has_vmx_msr_bitmap())
> - return true;
> -
> - msr_bitmap = to_vmx(vcpu)->loaded_vmcs->msr_bitmap;
> -
> - if (msr <= 0x1fff) {
> - return !!test_bit(msr, msr_bitmap + 0x800 / f);
> - } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
> - msr &= 0x1fff;
> - return !!test_bit(msr, msr_bitmap + 0xc00 / f);
> - }
> -
> - return true;
> -}
> -
> static void clear_atomic_switch_msr_special(struct vcpu_vmx *vmx,
> unsigned long entry, unsigned long exit)
> {
> @@ -3709,46 +3686,6 @@ void free_vpid(int vpid)
> spin_unlock(&vmx_vpid_lock);
> }
>
> -static void vmx_clear_msr_bitmap_read(ulong *msr_bitmap, u32 msr)
> -{
> - int f = sizeof(unsigned long);
> -
> - if (msr <= 0x1fff)
> - __clear_bit(msr, msr_bitmap + 0x000 / f);
> - else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff))
> - __clear_bit(msr & 0x1fff, msr_bitmap + 0x400 / f);
> -}
> -
> -static void vmx_clear_msr_bitmap_write(ulong *msr_bitmap, u32 msr)
> -{
> - int f = sizeof(unsigned long);
> -
> - if (msr <= 0x1fff)
> - __clear_bit(msr, msr_bitmap + 0x800 / f);
> - else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff))
> - __clear_bit(msr & 0x1fff, msr_bitmap + 0xc00 / f);
> -}
> -
> -static void vmx_set_msr_bitmap_read(ulong *msr_bitmap, u32 msr)
> -{
> - int f = sizeof(unsigned long);
> -
> - if (msr <= 0x1fff)
> - __set_bit(msr, msr_bitmap + 0x000 / f);
> - else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff))
> - __set_bit(msr & 0x1fff, msr_bitmap + 0x400 / f);
> -}
> -
> -static void vmx_set_msr_bitmap_write(ulong *msr_bitmap, u32 msr)
> -{
> - int f = sizeof(unsigned long);
> -
> - if (msr <= 0x1fff)
> - __set_bit(msr, msr_bitmap + 0x800 / f);
> - else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff))
> - __set_bit(msr & 0x1fff, msr_bitmap + 0xc00 / f);
> -}
> -
> static __always_inline void vmx_disable_intercept_for_msr(struct kvm_vcpu *vcpu,
> u32 msr, int type)
> {
> @@ -6722,7 +6659,9 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu)
> * If the L02 MSR bitmap does not intercept the MSR, then we need to
> * save it.
> */
> - if (unlikely(!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL)))
> + if (unlikely(cpu_has_vmx_msr_bitmap() &&
> + vmx_test_msr_bitmap_write(vmx->loaded_vmcs->msr_bitmap,
> + MSR_IA32_SPEC_CTRL)))
> vmx->spec_ctrl = native_read_msr(MSR_IA32_SPEC_CTRL);
>
> x86_spec_ctrl_restore_host(vmx->spec_ctrl, 0);
> diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
> index 0fb3236b0283..a6000c91b897 100644
> --- a/arch/x86/kvm/vmx/vmx.h
> +++ b/arch/x86/kvm/vmx/vmx.h
> @@ -393,6 +393,69 @@ void vmx_set_intercept_for_msr(struct kvm_vcpu *vcpu,
> u32 msr, int type, bool value);
> void vmx_update_cpu_dirty_logging(struct kvm_vcpu *vcpu);
>
> +static inline bool vmx_test_msr_bitmap_read(ulong *msr_bitmap, u32 msr)
> +{
> + int f = sizeof(unsigned long);
> +
> + if (msr <= 0x1fff)
> + return test_bit(msr, msr_bitmap + 0x000 / f);
> + else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff))
> + return test_bit(msr & 0x1fff, msr_bitmap + 0x400 / f);
> + return true;
> +}
> +
> +static inline bool vmx_test_msr_bitmap_write(ulong *msr_bitmap, u32 msr)
> +{
> + int f = sizeof(unsigned long);
> +
> + if (msr <= 0x1fff)
> + return test_bit(msr, msr_bitmap + 0x800 / f);
> + else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff))
> + return test_bit(msr & 0x1fff, msr_bitmap + 0xc00 / f);
> + return true;
> +}
> +
> +static inline void vmx_clear_msr_bitmap_read(ulong *msr_bitmap, u32 msr)
> +{
> + int f = sizeof(unsigned long);
> +
> + if (msr <= 0x1fff)
> + __clear_bit(msr, msr_bitmap + 0x000 / f);
> + else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff))
> + __clear_bit(msr & 0x1fff, msr_bitmap + 0x400 / f);
> +}
> +
> +static inline void vmx_clear_msr_bitmap_write(ulong *msr_bitmap, u32 msr)
> +{
> + int f = sizeof(unsigned long);
> +
> + if (msr <= 0x1fff)
> + __clear_bit(msr, msr_bitmap + 0x800 / f);
> + else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff))
> + __clear_bit(msr & 0x1fff, msr_bitmap + 0xc00 / f);
> +}
> +
> +static inline void vmx_set_msr_bitmap_read(ulong *msr_bitmap, u32 msr)
> +{
> + int f = sizeof(unsigned long);
> +
> + if (msr <= 0x1fff)
> + __set_bit(msr, msr_bitmap + 0x000 / f);
> + else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff))
> + __set_bit(msr & 0x1fff, msr_bitmap + 0x400 / f);
> +}
> +
> +static inline void vmx_set_msr_bitmap_write(ulong *msr_bitmap, u32 msr)
> +{
> + int f = sizeof(unsigned long);
> +
> + if (msr <= 0x1fff)
> + __set_bit(msr, msr_bitmap + 0x800 / f);
> + else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff))
> + __set_bit(msr & 0x1fff, msr_bitmap + 0xc00 / f);
> +}
> +
> +
> static inline u8 vmx_get_rvi(void)
> {
> return vmcs_read16(GUEST_INTR_STATUS) & 0xff;
>
Feel free to squash patch 3 into this one or reorder it before this one; it
makes sense to make them macros when you go from 4 to 6 functions.
Paolo
Powered by blists - more mailing lists