[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <86f9b2f0-533c-478d-ac9a-dbee11537dac@linux.intel.com>
Date: Wed, 7 May 2025 13:17:04 +0800
From: "Mi, Dapeng" <dapeng1.mi@...ux.intel.com>
To: mlevitsk@...hat.com, kvm@...r.kernel.org
Cc: Thomas Gleixner <tglx@...utronix.de>, Borislav Petkov <bp@...en8.de>,
Paolo Bonzini <pbonzini@...hat.com>, x86@...nel.org,
Sean Christopherson <seanjc@...gle.com>,
Dave Hansen <dave.hansen@...ux.intel.com>, Ingo Molnar <mingo@...hat.com>,
linux-kernel@...r.kernel.org, "H. Peter Anvin" <hpa@...or.com>
Subject: Re: [PATCH 1/3] x86: KVM: VMX: Wrap GUEST_IA32_DEBUGCTL read/write
with access functions
On 5/2/2025 4:34 AM, mlevitsk@...hat.com wrote:
> On Wed, 2025-04-23 at 17:51 +0800, Mi, Dapeng wrote:
>> The shortlog "x86: KVM: VMX: Wrap GUEST_IA32_DEBUGCTL read/write with
>> access functions" doesn't follow Sean's suggestion
>> (https://github.com/kvm-x86/linux/blob/next/Documentation/process/maintainer-kvm-x86.rst#shortlog).
>> Please modify. Thanks.
>>
>>
>> On 4/16/2025 8:25 AM, Maxim Levitsky wrote:
>>> Instead of reading and writing GUEST_IA32_DEBUGCTL vmcs field directly,
>>> wrap the logic with get/set functions.
>>>
>>> Also move the checks that the guest's supplied value is valid to the new
>>> 'set' function.
>>>
>>> In particular, the above change fixes a minor security issue in which L1
>>> hypervisor could set the GUEST_IA32_DEBUGCTL, and eventually the host's
>>> MSR_IA32_DEBUGCTL to any value by performing a VM entry to L2 with
>>> VM_ENTRY_LOAD_DEBUG_CONTROLS set.
>>>
>>> Signed-off-by: Maxim Levitsky <mlevitsk@...hat.com>
>>> ---
>>> arch/x86/kvm/vmx/nested.c | 15 +++++++---
>>> arch/x86/kvm/vmx/pmu_intel.c | 9 +++---
>>> arch/x86/kvm/vmx/vmx.c | 58 +++++++++++++++++++++++-------------
>>> arch/x86/kvm/vmx/vmx.h | 3 ++
>>> 4 files changed, 57 insertions(+), 28 deletions(-)
>>>
>>> diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
>>> index e073e3008b16..b7686569ee09 100644
>>> --- a/arch/x86/kvm/vmx/nested.c
>>> +++ b/arch/x86/kvm/vmx/nested.c
>>> @@ -2641,6 +2641,7 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
>>> struct vcpu_vmx *vmx = to_vmx(vcpu);
>>> struct hv_enlightened_vmcs *evmcs = nested_vmx_evmcs(vmx);
>>> bool load_guest_pdptrs_vmcs12 = false;
>>> + u64 new_debugctl;
>>>
>>> if (vmx->nested.dirty_vmcs12 || nested_vmx_is_evmptr12_valid(vmx)) {
>>> prepare_vmcs02_rare(vmx, vmcs12);
>>> @@ -2653,11 +2654,17 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
>>> if (vmx->nested.nested_run_pending &&
>>> (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) {
>>> kvm_set_dr(vcpu, 7, vmcs12->guest_dr7);
>>> - vmcs_write64(GUEST_IA32_DEBUGCTL, vmcs12->guest_ia32_debugctl);
>>> + new_debugctl = vmcs12->guest_ia32_debugctl;
>>> } else {
>>> kvm_set_dr(vcpu, 7, vcpu->arch.dr7);
>>> - vmcs_write64(GUEST_IA32_DEBUGCTL, vmx->nested.pre_vmenter_debugctl);
>>> + new_debugctl = vmx->nested.pre_vmenter_debugctl;
>>> }
>>> +
>>> + if (CC(!vmx_set_guest_debugctl(vcpu, new_debugctl, false))) {
>>> + *entry_failure_code = ENTRY_FAIL_DEFAULT;
>>> + return -EINVAL;
>>> + }
>>> +
>>> if (kvm_mpx_supported() && (!vmx->nested.nested_run_pending ||
>>> !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)))
>>> vmcs_write64(GUEST_BNDCFGS, vmx->nested.pre_vmenter_bndcfgs);
>>> @@ -3520,7 +3527,7 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,
>>>
>>> if (!vmx->nested.nested_run_pending ||
>>> !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS))
>>> - vmx->nested.pre_vmenter_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
>>> + vmx->nested.pre_vmenter_debugctl = vmx_get_guest_debugctl(vcpu);
>>> if (kvm_mpx_supported() &&
>>> (!vmx->nested.nested_run_pending ||
>>> !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)))
>>> @@ -4788,7 +4795,7 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
>>> __vmx_set_segment(vcpu, &seg, VCPU_SREG_LDTR);
>>>
>>> kvm_set_dr(vcpu, 7, 0x400);
>>> - vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
>>> + vmx_set_guest_debugctl(vcpu, 0, false);
>>>
>>> if (nested_vmx_load_msr(vcpu, vmcs12->vm_exit_msr_load_addr,
>>> vmcs12->vm_exit_msr_load_count))
>>> diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c
>>> index 8a94b52c5731..f6f448adfb80 100644
>>> --- a/arch/x86/kvm/vmx/pmu_intel.c
>>> +++ b/arch/x86/kvm/vmx/pmu_intel.c
>>> @@ -19,6 +19,7 @@
>>> #include "lapic.h"
>>> #include "nested.h"
>>> #include "pmu.h"
>>> +#include "vmx.h"
>>> #include "tdx.h"
>>>
>>> /*
>>> @@ -652,11 +653,11 @@ static void intel_pmu_reset(struct kvm_vcpu *vcpu)
>>> */
>>> static void intel_pmu_legacy_freezing_lbrs_on_pmi(struct kvm_vcpu *vcpu)
>>> {
>>> - u64 data = vmcs_read64(GUEST_IA32_DEBUGCTL);
>>> + u64 data = vmx_get_guest_debugctl(vcpu);
>>>
>>> if (data & DEBUGCTLMSR_FREEZE_LBRS_ON_PMI) {
>>> data &= ~DEBUGCTLMSR_LBR;
>>> - vmcs_write64(GUEST_IA32_DEBUGCTL, data);
>>> + vmx_set_guest_debugctl(vcpu, data, true);
>> Two questions.
>>
>> 1. Why call vmx_set_guest_debugctl() to do the extra check? Currently the
>> IA32_DEBUGCTL MSR is always intercepted and the value is already checked in
>> vmx_set_msr(), so it seems unnecessary to check it here again.
> Hi,
>
>
> I wanted this to be consistent. KVM has plenty of functions that can be both
> guest triggered and internally triggered. For example kvm_set_cr4()
>
> Besides, vmx_set_guest_debugctl also records the value the guest wrote
> so that it can be returned to the guest if we choose to override some
> bits of the MSR, so it made sense to have one common function to set the MSR.
>
> Do you think that can affect performance?
Hmm, since only the DEBUGCTLMSR_LBR bit is changed here, it's safe to skip this
check and write the guest DEBUGCTL value directly. I have no idea how much
performance impact this check would have at a high sampling frequency, but
why not eliminate it if we can?
>
>
>> 2. Why is the argument "host_initiated" true? It looks like the data is not
>> from the host.
> This is my mistake.
>
>>
>>> }
>>> }
>>>
>>> @@ -729,7 +730,7 @@ void vmx_passthrough_lbr_msrs(struct kvm_vcpu *vcpu)
>>>
>>> if (!lbr_desc->event) {
>>> vmx_disable_lbr_msrs_passthrough(vcpu);
>>> - if (vmcs_read64(GUEST_IA32_DEBUGCTL) & DEBUGCTLMSR_LBR)
>>> + if (vmx_get_guest_debugctl(vcpu) & DEBUGCTLMSR_LBR)
>>> goto warn;
>>> if (test_bit(INTEL_PMC_IDX_FIXED_VLBR, pmu->pmc_in_use))
>>> goto warn;
>>> @@ -751,7 +752,7 @@ void vmx_passthrough_lbr_msrs(struct kvm_vcpu *vcpu)
>>>
>>> static void intel_pmu_cleanup(struct kvm_vcpu *vcpu)
>>> {
>>> - if (!(vmcs_read64(GUEST_IA32_DEBUGCTL) & DEBUGCTLMSR_LBR))
>>> + if (!(vmx_get_guest_debugctl(vcpu) & DEBUGCTLMSR_LBR))
>>> intel_pmu_release_guest_lbr_event(vcpu);
>>> }
>>>
>>> diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
>>> index ef2d7208dd20..4237422dc4ed 100644
>>> --- a/arch/x86/kvm/vmx/vmx.c
>>> +++ b/arch/x86/kvm/vmx/vmx.c
>>> @@ -2154,7 +2154,7 @@ int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
>>> msr_info->data = vmx->pt_desc.guest.addr_a[index / 2];
>>> break;
>>> case MSR_IA32_DEBUGCTLMSR:
>>> - msr_info->data = vmcs_read64(GUEST_IA32_DEBUGCTL);
>>> + msr_info->data = vmx_get_guest_debugctl(vcpu);
>>> break;
>>> default:
>>> find_uret_msr:
>>> @@ -2194,6 +2194,41 @@ static u64 vmx_get_supported_debugctl(struct kvm_vcpu *vcpu, bool host_initiated
>>> return debugctl;
>>> }
>>>
>>> +u64 vmx_get_guest_debugctl(struct kvm_vcpu *vcpu)
>>> +{
>>> + return vmcs_read64(GUEST_IA32_DEBUGCTL);
>>> +}
>>> +
>>> +static void __vmx_set_guest_debugctl(struct kvm_vcpu *vcpu, u64 data)
>>> +{
>>> + vmcs_write64(GUEST_IA32_DEBUGCTL, data);
>>> +}
>> IMO, it seems unnecessary to add these 2 wrappers since the original code
>> is already intuitive enough and simple. But if you want, please add
>> "inline" before these 2 wrappers.
> The __vmx_set_guest_debugctl in the next patch will store the written value in
> a field, this is why I did it this way.
>
> The vmx_get_guest_debugctl will read this value instead, also in the next patch.
>
> I thought it would be cleaner to first introduce the trivial wrappers and then
> extend them.
>
>>
>>> +
>>> +bool vmx_set_guest_debugctl(struct kvm_vcpu *vcpu, u64 data, bool host_initiated)
>> Since most of code in this function checks guest debugctl, better to rename
>> it to "vmx_check_and_set_guest_debugctl".
> I don't mind doing so.
>
>>
>>> +{
>>> + u64 invalid = data & ~vmx_get_supported_debugctl(vcpu, host_initiated);
>>> +
>>> + if (invalid & (DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR)) {
>>> + kvm_pr_unimpl_wrmsr(vcpu, MSR_IA32_DEBUGCTLMSR, data);
>>> + data &= ~(DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR);
>>> + invalid &= ~(DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR);
>> Add space around above 3 "|".
> I copied this code "as is" from the wrmsr code. I can add this though.
>
> Best regards,
> Maxim Levitsky
>
>>
>>> + }
>>> +
>>> + if (invalid)
>>> + return false;
>>> +
>>> + if (is_guest_mode(vcpu) && (get_vmcs12(vcpu)->vm_exit_controls &
>>> + VM_EXIT_SAVE_DEBUG_CONTROLS))
>>> + get_vmcs12(vcpu)->guest_ia32_debugctl = data;
>>> +
>>> + if (intel_pmu_lbr_is_enabled(vcpu) && !to_vmx(vcpu)->lbr_desc.event &&
>>> + (data & DEBUGCTLMSR_LBR))
>>> + intel_pmu_create_guest_lbr_event(vcpu);
>>> +
>>> + __vmx_set_guest_debugctl(vcpu, data);
>>> + return true;
>>> +}
>>> +
>>> /*
>>> * Writes msr value into the appropriate "register".
>>> * Returns 0 on success, non-0 otherwise.
>>> @@ -2263,26 +2298,9 @@ int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
>>> vmcs_writel(GUEST_SYSENTER_ESP, data);
>>> break;
>>> case MSR_IA32_DEBUGCTLMSR: {
>>> - u64 invalid;
>>> -
>>> - invalid = data & ~vmx_get_supported_debugctl(vcpu, msr_info->host_initiated);
>>> - if (invalid & (DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR)) {
>>> - kvm_pr_unimpl_wrmsr(vcpu, msr_index, data);
>>> - data &= ~(DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR);
>>> - invalid &= ~(DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR);
>>> - }
>>> -
>>> - if (invalid)
>>> + if (!vmx_set_guest_debugctl(vcpu, data, msr_info->host_initiated))
>>> return 1;
>>>
>>> - if (is_guest_mode(vcpu) && get_vmcs12(vcpu)->vm_exit_controls &
>>> - VM_EXIT_SAVE_DEBUG_CONTROLS)
>>> - get_vmcs12(vcpu)->guest_ia32_debugctl = data;
>>> -
>>> - vmcs_write64(GUEST_IA32_DEBUGCTL, data);
>>> - if (intel_pmu_lbr_is_enabled(vcpu) && !to_vmx(vcpu)->lbr_desc.event &&
>>> - (data & DEBUGCTLMSR_LBR))
>>> - intel_pmu_create_guest_lbr_event(vcpu);
>>> return 0;
>>> }
>>> case MSR_IA32_BNDCFGS:
>>> @@ -4795,7 +4813,7 @@ static void init_vmcs(struct vcpu_vmx *vmx)
>>> vmcs_write32(GUEST_SYSENTER_CS, 0);
>>> vmcs_writel(GUEST_SYSENTER_ESP, 0);
>>> vmcs_writel(GUEST_SYSENTER_EIP, 0);
>>> - vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
>>> + __vmx_set_guest_debugctl(&vmx->vcpu, 0);
>>>
>>> if (cpu_has_vmx_tpr_shadow()) {
>>> vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0);
>>> diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
>>> index 6d1e40ecc024..8ac46fb47abd 100644
>>> --- a/arch/x86/kvm/vmx/vmx.h
>>> +++ b/arch/x86/kvm/vmx/vmx.h
>>> @@ -404,6 +404,9 @@ u64 vmx_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu);
>>>
>>> gva_t vmx_get_untagged_addr(struct kvm_vcpu *vcpu, gva_t gva, unsigned int flags);
>>>
>>> +bool vmx_set_guest_debugctl(struct kvm_vcpu *vcpu, u64 value, bool host_initiated);
>>> +u64 vmx_get_guest_debugctl(struct kvm_vcpu *vcpu);
>>> +
>>> static inline void vmx_set_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr,
>>> int type, bool value)
>>> {
Powered by blists - more mailing lists