linux-kernel - Re: [PATCH 1/3] x86: KVM: VMX: Wrap GUEST_IA32_DEBUGCTL read/write with access functions

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <86f9b2f0-533c-478d-ac9a-dbee11537dac@linux.intel.com>
Date: Wed, 7 May 2025 13:17:04 +0800
From: "Mi, Dapeng" <dapeng1.mi@...ux.intel.com>
To: mlevitsk@...hat.com, kvm@...r.kernel.org
Cc: Thomas Gleixner <tglx@...utronix.de>, Borislav Petkov <bp@...en8.de>,
 Paolo Bonzini <pbonzini@...hat.com>, x86@...nel.org,
 Sean Christopherson <seanjc@...gle.com>,
 Dave Hansen <dave.hansen@...ux.intel.com>, Ingo Molnar <mingo@...hat.com>,
 linux-kernel@...r.kernel.org, "H. Peter Anvin" <hpa@...or.com>
Subject: Re: [PATCH 1/3] x86: KVM: VMX: Wrap GUEST_IA32_DEBUGCTL read/write
 with access functions


On 5/2/2025 4:34 AM, mlevitsk@...hat.com wrote:
> On Wed, 2025-04-23 at 17:51 +0800, Mi, Dapeng wrote:
>> The shortlog "x86: KVM: VMX: Wrap GUEST_IA32_DEBUGCTL read/write with
>> access functions" doesn't follow Sean's suggestion
>> (https://github.com/kvm-x86/linux/blob/next/Documentation/process/maintainer-kvm-x86.rst#shortlog).
>> Please modify. Thanks.
>>
>>
>> On 4/16/2025 8:25 AM, Maxim Levitsky wrote:
>>> Instead of reading and writing GUEST_IA32_DEBUGCTL vmcs field directly,
>>> wrap the logic with get/set functions.
>>>
>>> Also move the checks that the guest's supplied value is valid to the new
>>> 'set' function.
>>>
>>> In particular, the above change fixes a minor security issue in which L1
>>> hypervisor could set the GUEST_IA32_DEBUGCTL, and eventually the host's
>>> MSR_IA32_DEBUGCTL to any value by performing a VM entry to L2 with
>>> VM_ENTRY_LOAD_DEBUG_CONTROLS set.
>>>
>>> Signed-off-by: Maxim Levitsky <mlevitsk@...hat.com>
>>> ---
>>>  arch/x86/kvm/vmx/nested.c    | 15 +++++++---
>>>  arch/x86/kvm/vmx/pmu_intel.c |  9 +++---
>>>  arch/x86/kvm/vmx/vmx.c       | 58 +++++++++++++++++++++++-------------
>>>  arch/x86/kvm/vmx/vmx.h       |  3 ++
>>>  4 files changed, 57 insertions(+), 28 deletions(-)
>>>
>>> diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
>>> index e073e3008b16..b7686569ee09 100644
>>> --- a/arch/x86/kvm/vmx/nested.c
>>> +++ b/arch/x86/kvm/vmx/nested.c
>>> @@ -2641,6 +2641,7 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
>>>  	struct vcpu_vmx *vmx = to_vmx(vcpu);
>>>  	struct hv_enlightened_vmcs *evmcs = nested_vmx_evmcs(vmx);
>>>  	bool load_guest_pdptrs_vmcs12 = false;
>>> +	u64 new_debugctl;
>>>  
>>>  	if (vmx->nested.dirty_vmcs12 || nested_vmx_is_evmptr12_valid(vmx)) {
>>>  		prepare_vmcs02_rare(vmx, vmcs12);
>>> @@ -2653,11 +2654,17 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
>>>  	if (vmx->nested.nested_run_pending &&
>>>  	    (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) {
>>>  		kvm_set_dr(vcpu, 7, vmcs12->guest_dr7);
>>> -		vmcs_write64(GUEST_IA32_DEBUGCTL, vmcs12->guest_ia32_debugctl);
>>> +		new_debugctl = vmcs12->guest_ia32_debugctl;
>>>  	} else {
>>>  		kvm_set_dr(vcpu, 7, vcpu->arch.dr7);
>>> -		vmcs_write64(GUEST_IA32_DEBUGCTL, vmx->nested.pre_vmenter_debugctl);
>>> +		new_debugctl = vmx->nested.pre_vmenter_debugctl;
>>>  	}
>>> +
>>> +	if (CC(!vmx_set_guest_debugctl(vcpu, new_debugctl, false))) {
>>> +		*entry_failure_code = ENTRY_FAIL_DEFAULT;
>>> +		return -EINVAL;
>>> +	}
>>> +
>>>  	if (kvm_mpx_supported() && (!vmx->nested.nested_run_pending ||
>>>  	    !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)))
>>>  		vmcs_write64(GUEST_BNDCFGS, vmx->nested.pre_vmenter_bndcfgs);
>>> @@ -3520,7 +3527,7 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,
>>>  
>>>  	if (!vmx->nested.nested_run_pending ||
>>>  	    !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS))
>>> -		vmx->nested.pre_vmenter_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
>>> +		vmx->nested.pre_vmenter_debugctl = vmx_get_guest_debugctl(vcpu);
>>>  	if (kvm_mpx_supported() &&
>>>  	    (!vmx->nested.nested_run_pending ||
>>>  	     !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)))
>>> @@ -4788,7 +4795,7 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
>>>  	__vmx_set_segment(vcpu, &seg, VCPU_SREG_LDTR);
>>>  
>>>  	kvm_set_dr(vcpu, 7, 0x400);
>>> -	vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
>>> +	vmx_set_guest_debugctl(vcpu, 0, false);
>>>  
>>>  	if (nested_vmx_load_msr(vcpu, vmcs12->vm_exit_msr_load_addr,
>>>  				vmcs12->vm_exit_msr_load_count))
>>> diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c
>>> index 8a94b52c5731..f6f448adfb80 100644
>>> --- a/arch/x86/kvm/vmx/pmu_intel.c
>>> +++ b/arch/x86/kvm/vmx/pmu_intel.c
>>> @@ -19,6 +19,7 @@
>>>  #include "lapic.h"
>>>  #include "nested.h"
>>>  #include "pmu.h"
>>> +#include "vmx.h"
>>>  #include "tdx.h"
>>>  
>>>  /*
>>> @@ -652,11 +653,11 @@ static void intel_pmu_reset(struct kvm_vcpu *vcpu)
>>>   */
>>>  static void intel_pmu_legacy_freezing_lbrs_on_pmi(struct kvm_vcpu *vcpu)
>>>  {
>>> -	u64 data = vmcs_read64(GUEST_IA32_DEBUGCTL);
>>> +	u64 data = vmx_get_guest_debugctl(vcpu);
>>>  
>>>  	if (data & DEBUGCTLMSR_FREEZE_LBRS_ON_PMI) {
>>>  		data &= ~DEBUGCTLMSR_LBR;
>>> -		vmcs_write64(GUEST_IA32_DEBUGCTL, data);
>>> +		vmx_set_guest_debugctl(vcpu, data, true);
>> Two questions. 
>>
>> 1. why to call vmx_set_guest_debugctl() to do the extra check? currently
>> IA32_DEBUGCTL MSR is always intercepted and it's already checked at
>> vmx_set_msr() and seems unnecessary to check here again.
> Hi,
>
>
> I wanted this to be consistent. KVM has plenty of functions that can be both
> guest triggered and internally triggered. For example kvm_set_cr4()
>
> Besides the vmx_set_guest_debugctl also notes the value the guest wrote
> to be able to return it back to the guest if we choose to overide some
> bits of the MSR, so it made sense to have one common function to set the msr.
>
> Do you think that can affect performance? 

hmm, since only DEBUGCTLMSR_LBR bit is changed here, it's safe to skip this
check and write guest debug_ctrl directly. I have no idea how much
performance impact this check would bring in high sampling frequency, but
why not to eliminate it if it can?


>
>
>> 2. why the argument "host_initiated" is true? It looks the data is not from
>> host.
> This is my mistake.
>
>>
>>>  	}
>>>  }
>>>  
>>> @@ -729,7 +730,7 @@ void vmx_passthrough_lbr_msrs(struct kvm_vcpu *vcpu)
>>>  
>>>  	if (!lbr_desc->event) {
>>>  		vmx_disable_lbr_msrs_passthrough(vcpu);
>>> -		if (vmcs_read64(GUEST_IA32_DEBUGCTL) & DEBUGCTLMSR_LBR)
>>> +		if (vmx_get_guest_debugctl(vcpu) & DEBUGCTLMSR_LBR)
>>>  			goto warn;
>>>  		if (test_bit(INTEL_PMC_IDX_FIXED_VLBR, pmu->pmc_in_use))
>>>  			goto warn;
>>> @@ -751,7 +752,7 @@ void vmx_passthrough_lbr_msrs(struct kvm_vcpu *vcpu)
>>>  
>>>  static void intel_pmu_cleanup(struct kvm_vcpu *vcpu)
>>>  {
>>> -	if (!(vmcs_read64(GUEST_IA32_DEBUGCTL) & DEBUGCTLMSR_LBR))
>>> +	if (!(vmx_get_guest_debugctl(vcpu) & DEBUGCTLMSR_LBR))
>>>  		intel_pmu_release_guest_lbr_event(vcpu);
>>>  }
>>>  
>>> diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
>>> index ef2d7208dd20..4237422dc4ed 100644
>>> --- a/arch/x86/kvm/vmx/vmx.c
>>> +++ b/arch/x86/kvm/vmx/vmx.c
>>> @@ -2154,7 +2154,7 @@ int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
>>>  			msr_info->data = vmx->pt_desc.guest.addr_a[index / 2];
>>>  		break;
>>>  	case MSR_IA32_DEBUGCTLMSR:
>>> -		msr_info->data = vmcs_read64(GUEST_IA32_DEBUGCTL);
>>> +		msr_info->data = vmx_get_guest_debugctl(vcpu);
>>>  		break;
>>>  	default:
>>>  	find_uret_msr:
>>> @@ -2194,6 +2194,41 @@ static u64 vmx_get_supported_debugctl(struct kvm_vcpu *vcpu, bool host_initiated
>>>  	return debugctl;
>>>  }
>>>  
>>> +u64 vmx_get_guest_debugctl(struct kvm_vcpu *vcpu)
>>> +{
>>> +	return vmcs_read64(GUEST_IA32_DEBUGCTL);
>>> +}
>>> +
>>> +static void __vmx_set_guest_debugctl(struct kvm_vcpu *vcpu, u64 data)
>>> +{
>>> +	vmcs_write64(GUEST_IA32_DEBUGCTL, data);
>>> +}
>> IMO,  it seems unnecessary to add these 2  wrappers since the original code
>> is already intuitive enough and simple. But if you want, please add
>> "inline" before these 2 wrappers.
> The __vmx_set_guest_debugctl in the next patch will store the written value in
> a field, this is why I did it this way.
>
> The vmx_get_guest_debugctl will read this value instead, also in the next patch.
>
> I thought it would be cleaner to first introduce the trivial wrappers and then
> extend them.
>
>>
>>> +
>>> +bool vmx_set_guest_debugctl(struct kvm_vcpu *vcpu, u64 data, bool host_initiated)
>> Since most of code in this function checks guest debugctl, better to rename
>> it to "vmx_check_and_set_guest_debugctl".
> I don't mind doing so.
>
>>
>>> +{
>>> +	u64 invalid = data & ~vmx_get_supported_debugctl(vcpu, host_initiated);
>>> +
>>> +	if (invalid & (DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR)) {
>>> +		kvm_pr_unimpl_wrmsr(vcpu, MSR_IA32_DEBUGCTLMSR, data);
>>> +		data &= ~(DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR);
>>> +		invalid &= ~(DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR);
>> Add space around above 3 "|".
> I copied this code "as is" from the wrmsr code. I can add this though.
>
> Best regards,
> 	Maxim Levitsky
>
>>
>>> +	}
>>> +
>>> +	if (invalid)
>>> +		return false;
>>> +
>>> +	if (is_guest_mode(vcpu) && (get_vmcs12(vcpu)->vm_exit_controls &
>>> +					VM_EXIT_SAVE_DEBUG_CONTROLS))
>>> +		get_vmcs12(vcpu)->guest_ia32_debugctl = data;
>>> +
>>> +	if (intel_pmu_lbr_is_enabled(vcpu) && !to_vmx(vcpu)->lbr_desc.event &&
>>> +	    (data & DEBUGCTLMSR_LBR))
>>> +		intel_pmu_create_guest_lbr_event(vcpu);
>>> +
>>> +	__vmx_set_guest_debugctl(vcpu, data);
>>> +	return true;
>>> +}
>>> +
>>>  /*
>>>   * Writes msr value into the appropriate "register".
>>>   * Returns 0 on success, non-0 otherwise.
>>> @@ -2263,26 +2298,9 @@ int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
>>>  		vmcs_writel(GUEST_SYSENTER_ESP, data);
>>>  		break;
>>>  	case MSR_IA32_DEBUGCTLMSR: {
>>> -		u64 invalid;
>>> -
>>> -		invalid = data & ~vmx_get_supported_debugctl(vcpu, msr_info->host_initiated);
>>> -		if (invalid & (DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR)) {
>>> -			kvm_pr_unimpl_wrmsr(vcpu, msr_index, data);
>>> -			data &= ~(DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR);
>>> -			invalid &= ~(DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR);
>>> -		}
>>> -
>>> -		if (invalid)
>>> +		if (!vmx_set_guest_debugctl(vcpu, data, msr_info->host_initiated))
>>>  			return 1;
>>>  
>>> -		if (is_guest_mode(vcpu) && get_vmcs12(vcpu)->vm_exit_controls &
>>> -						VM_EXIT_SAVE_DEBUG_CONTROLS)
>>> -			get_vmcs12(vcpu)->guest_ia32_debugctl = data;
>>> -
>>> -		vmcs_write64(GUEST_IA32_DEBUGCTL, data);
>>> -		if (intel_pmu_lbr_is_enabled(vcpu) && !to_vmx(vcpu)->lbr_desc.event &&
>>> -		    (data & DEBUGCTLMSR_LBR))
>>> -			intel_pmu_create_guest_lbr_event(vcpu);
>>>  		return 0;
>>>  	}
>>>  	case MSR_IA32_BNDCFGS:
>>> @@ -4795,7 +4813,7 @@ static void init_vmcs(struct vcpu_vmx *vmx)
>>>  	vmcs_write32(GUEST_SYSENTER_CS, 0);
>>>  	vmcs_writel(GUEST_SYSENTER_ESP, 0);
>>>  	vmcs_writel(GUEST_SYSENTER_EIP, 0);
>>> -	vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
>>> +	__vmx_set_guest_debugctl(&vmx->vcpu, 0);
>>>  
>>>  	if (cpu_has_vmx_tpr_shadow()) {
>>>  		vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0);
>>> diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
>>> index 6d1e40ecc024..8ac46fb47abd 100644
>>> --- a/arch/x86/kvm/vmx/vmx.h
>>> +++ b/arch/x86/kvm/vmx/vmx.h
>>> @@ -404,6 +404,9 @@ u64 vmx_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu);
>>>  
>>>  gva_t vmx_get_untagged_addr(struct kvm_vcpu *vcpu, gva_t gva, unsigned int flags);
>>>  
>>> +bool vmx_set_guest_debugctl(struct kvm_vcpu *vcpu, u64 value, bool host_initiated);
>>> +u64 vmx_get_guest_debugctl(struct kvm_vcpu *vcpu);
>>> +
>>>  static inline void vmx_set_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr,
>>>  					     int type, bool value)
>>>  {