Message-ID: <a700ab4c-0e8d-499d-be71-f24c4a6439cf@amd.com>
Date: Mon, 26 May 2025 11:45:24 +0530
From: Sandipan Das <sandipan.das@....com>
To: Mingwei Zhang <mizhang@...gle.com>, Peter Zijlstra
<peterz@...radead.org>, Ingo Molnar <mingo@...hat.com>,
Arnaldo Carvalho de Melo <acme@...nel.org>,
Namhyung Kim <namhyung@...nel.org>, Sean Christopherson <seanjc@...gle.com>,
Paolo Bonzini <pbonzini@...hat.com>
Cc: Mark Rutland <mark.rutland@....com>,
Alexander Shishkin <alexander.shishkin@...ux.intel.com>,
Jiri Olsa <jolsa@...nel.org>, Ian Rogers <irogers@...gle.com>,
Adrian Hunter <adrian.hunter@...el.com>, "Liang, Kan" <kan.liang@...ux.intel.com>,
"H. Peter Anvin" <hpa@...or.com>,
linux-perf-users@...r.kernel.org, linux-kernel@...r.kernel.org,
kvm@...r.kernel.org, linux-kselftest@...r.kernel.org,
Yongwei Ma <yongwei.ma@...el.com>,
Xiong Zhang <xiong.y.zhang@...ux.intel.com>,
Dapeng Mi <dapeng1.mi@...ux.intel.com>, Jim Mattson <jmattson@...gle.com>,
Zide Chen <zide.chen@...el.com>, Eranian Stephane <eranian@...gle.com>,
Shukla Manali <Manali.Shukla@....com>,
Nikunj Dadhania <nikunj.dadhania@....com>
Subject: Re: [PATCH v4 20/38] KVM: x86/pmu: Check if mediated vPMU can
intercept rdpmc

On 3/24/2025 11:01 PM, Mingwei Zhang wrote:
> From: Dapeng Mi <dapeng1.mi@...ux.intel.com>
>
> Check whether rdpmc needs to be intercepted for the mediated vPMU. Simply
> speaking, if the guest owns all PMU counters in the mediated vPMU, rdpmc
> interception should be disabled to mitigate the performance impact; otherwise
> rdpmc has to be intercepted to prevent the guest from reading host counter
> data via the rdpmc instruction.
>
> Co-developed-by: Mingwei Zhang <mizhang@...gle.com>
> Signed-off-by: Mingwei Zhang <mizhang@...gle.com>
> Co-developed-by: Sandipan Das <sandipan.das@....com>
> Signed-off-by: Sandipan Das <sandipan.das@....com>
> Signed-off-by: Dapeng Mi <dapeng1.mi@...ux.intel.com>
> ---
> arch/x86/include/asm/msr-index.h | 1 +
> arch/x86/kvm/pmu.c | 34 ++++++++++++++++++++++++++++++++
> arch/x86/kvm/pmu.h | 19 ++++++++++++++++++
> arch/x86/kvm/svm/pmu.c | 14 ++++++++++++-
> arch/x86/kvm/vmx/pmu_intel.c | 18 ++++++++---------
> 5 files changed, 76 insertions(+), 10 deletions(-)
>
> diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
> index ca70846ffd55..337f4b0a2998 100644
> --- a/arch/x86/include/asm/msr-index.h
> +++ b/arch/x86/include/asm/msr-index.h
> @@ -312,6 +312,7 @@
> #define PERF_CAP_PEBS_FORMAT 0xf00
> #define PERF_CAP_FW_WRITES BIT_ULL(13)
> #define PERF_CAP_PEBS_BASELINE BIT_ULL(14)
> +#define PERF_CAP_PERF_METRICS BIT_ULL(15)
> #define PERF_CAP_PEBS_MASK (PERF_CAP_PEBS_TRAP | PERF_CAP_ARCH_REG | \
> PERF_CAP_PEBS_FORMAT | PERF_CAP_PEBS_BASELINE)
>
> diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
> index 92c742ead663..6ad71752be4b 100644
> --- a/arch/x86/kvm/pmu.c
> +++ b/arch/x86/kvm/pmu.c
> @@ -604,6 +604,40 @@ int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned idx, u64 *data)
> return 0;
> }
>
> +inline bool kvm_rdpmc_in_guest(struct kvm_vcpu *vcpu)
> +{
> + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
> +
> + if (!kvm_mediated_pmu_enabled(vcpu))
> + return false;
> +
> + /*
> +	 * VMware allows access to these Pseudo-PMCs even when read via RDPMC
> + * in Ring3 when CR4.PCE=0.
> + */
> + if (enable_vmware_backdoor)
> + return false;
> +
> +	/*
> +	 * FIXME: In theory, perf metrics is always paired with fixed
> +	 * counter 3, so comparing the guest and host fixed counter
> +	 * counts would be sufficient and an explicit perf metrics check
> +	 * wouldn't be needed. However, kvm_pmu_cap.num_counters_fixed is
> +	 * currently capped at KVM_MAX_NR_FIXED_COUNTERS (3) because fixed
> +	 * counter 3 isn't supported yet, so perf metrics still has to be
> +	 * checked explicitly here. Once fixed counter 3 is supported, this
> +	 * perf metrics check can be removed.
> +	 */
> + return pmu->nr_arch_gp_counters == kvm_pmu_cap.num_counters_gp &&
> + pmu->nr_arch_fixed_counters == kvm_pmu_cap.num_counters_fixed &&
> + vcpu_has_perf_metrics(vcpu) == kvm_host_has_perf_metrics() &&
> + pmu->counter_bitmask[KVM_PMC_GP] ==
> + (BIT_ULL(kvm_pmu_cap.bit_width_gp) - 1) &&
> + pmu->counter_bitmask[KVM_PMC_FIXED] ==
> + (BIT_ULL(kvm_pmu_cap.bit_width_fixed) - 1);
> +}
> +EXPORT_SYMBOL_GPL(kvm_rdpmc_in_guest);
> +
> void kvm_pmu_deliver_pmi(struct kvm_vcpu *vcpu)
> {
> if (lapic_in_kernel(vcpu)) {
> diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h
> index e1d0096f249b..509c995b7871 100644
> --- a/arch/x86/kvm/pmu.h
> +++ b/arch/x86/kvm/pmu.h
> @@ -271,6 +271,24 @@ static inline bool pmc_is_globally_enabled(struct kvm_pmc *pmc)
> return test_bit(pmc->idx, (unsigned long *)&pmu->global_ctrl);
> }
>
> +static inline u64 vcpu_get_perf_capabilities(struct kvm_vcpu *vcpu)
> +{
> + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_PDCM))
> + return 0;
> +
> + return vcpu->arch.perf_capabilities;
> +}
> +
> +static inline bool vcpu_has_perf_metrics(struct kvm_vcpu *vcpu)
> +{
> + return !!(vcpu_get_perf_capabilities(vcpu) & PERF_CAP_PERF_METRICS);
> +}
> +
> +static inline bool kvm_host_has_perf_metrics(void)
> +{
> + return !!(kvm_host.perf_capabilities & PERF_CAP_PERF_METRICS);
> +}
> +
> void kvm_pmu_deliver_pmi(struct kvm_vcpu *vcpu);
> void kvm_pmu_handle_event(struct kvm_vcpu *vcpu);
> int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned pmc, u64 *data);
> @@ -287,6 +305,7 @@ void kvm_pmu_trigger_event(struct kvm_vcpu *vcpu, u64 eventsel);
> bool vcpu_pmu_can_enable(struct kvm_vcpu *vcpu);
>
> bool is_vmware_backdoor_pmc(u32 pmc_idx);
> +bool kvm_rdpmc_in_guest(struct kvm_vcpu *vcpu);
>
> extern struct kvm_pmu_ops intel_pmu_ops;
> extern struct kvm_pmu_ops amd_pmu_ops;
> diff --git a/arch/x86/kvm/svm/pmu.c b/arch/x86/kvm/svm/pmu.c
> index c8b9fd9b5350..153972e944eb 100644
> --- a/arch/x86/kvm/svm/pmu.c
> +++ b/arch/x86/kvm/svm/pmu.c
> @@ -173,7 +173,7 @@ static int amd_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
> return 1;
> }
>
> -static void amd_pmu_refresh(struct kvm_vcpu *vcpu)
> +static void __amd_pmu_refresh(struct kvm_vcpu *vcpu)
> {
> struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
> union cpuid_0x80000022_ebx ebx;
> @@ -212,6 +212,18 @@ static void amd_pmu_refresh(struct kvm_vcpu *vcpu)
> bitmap_set(pmu->all_valid_pmc_idx, 0, pmu->nr_arch_gp_counters);
> }
>
> +static void amd_pmu_refresh(struct kvm_vcpu *vcpu)
> +{
> + struct vcpu_svm *svm = to_svm(vcpu);
> +
> + __amd_pmu_refresh(vcpu);
> +
> + if (kvm_rdpmc_in_guest(vcpu))
> + svm_clr_intercept(svm, INTERCEPT_RDPMC);
> + else
> + svm_set_intercept(svm, INTERCEPT_RDPMC);
> +}
> +

After putting a kprobe on kvm_pmu_rdpmc(), I noticed that RDPMC instructions were
still getting intercepted on the secondary vCPUs. This happens because when a
secondary vCPU comes up, kvm_vcpu_reset() is called after the guest CPUID has
already been updated. So while RDPMC interception is initially disabled in the
kvm_pmu_refresh() path, it gets re-enabled in the kvm_vcpu_reset() path because
svm_vcpu_reset() calls init_vmcb(). We should consider the following change to
avoid that:

diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 6f9142063cc4..1c9c183092f3 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -1354,7 +1354,6 @@ static void init_vmcb(struct kvm_vcpu *vcpu)
svm_set_intercept(svm, INTERCEPT_SMI);
svm_set_intercept(svm, INTERCEPT_SELECTIVE_CR0);
- svm_set_intercept(svm, INTERCEPT_RDPMC);
svm_set_intercept(svm, INTERCEPT_CPUID);
svm_set_intercept(svm, INTERCEPT_INVD);
svm_set_intercept(svm, INTERCEPT_INVLPG);
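
Alternatively, if dropping the unconditional intercept from init_vmcb() is not
desirable, the decision there could be made to match kvm_rdpmc_in_guest(). This
is only an untested sketch (and assumes the PMU helpers are visible from svm.c):

	/* in init_vmcb(), instead of removing the line entirely */
	if (kvm_rdpmc_in_guest(vcpu))
		svm_clr_intercept(svm, INTERCEPT_RDPMC);
	else
		svm_set_intercept(svm, INTERCEPT_RDPMC);

That keeps RDPMC intercepted by default on vCPUs whose PMU hasn't been refreshed
yet, at the cost of init_vmcb() depending on the PMU code.
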
> static void amd_pmu_init(struct kvm_vcpu *vcpu)
> {
> struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
> diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c
> index fc017e9a6a0c..2a5f79206b02 100644
> --- a/arch/x86/kvm/vmx/pmu_intel.c
> +++ b/arch/x86/kvm/vmx/pmu_intel.c
> @@ -108,14 +108,6 @@ static struct kvm_pmc *intel_rdpmc_ecx_to_pmc(struct kvm_vcpu *vcpu,
> return &counters[array_index_nospec(idx, num_counters)];
> }
>
> -static inline u64 vcpu_get_perf_capabilities(struct kvm_vcpu *vcpu)
> -{
> - if (!guest_cpu_cap_has(vcpu, X86_FEATURE_PDCM))
> - return 0;
> -
> - return vcpu->arch.perf_capabilities;
> -}
> -
> static inline bool fw_writes_is_enabled(struct kvm_vcpu *vcpu)
> {
> return (vcpu_get_perf_capabilities(vcpu) & PERF_CAP_FW_WRITES) != 0;
> @@ -456,7 +448,7 @@ static void intel_pmu_enable_fixed_counter_bits(struct kvm_pmu *pmu, u64 bits)
> pmu->fixed_ctr_ctrl_rsvd &= ~intel_fixed_bits_by_idx(i, bits);
> }
>
> -static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
> +static void __intel_pmu_refresh(struct kvm_vcpu *vcpu)
> {
> struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
> struct lbr_desc *lbr_desc = vcpu_to_lbr_desc(vcpu);
> @@ -564,6 +556,14 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
> }
> }
>
> +static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
> +{
> + __intel_pmu_refresh(vcpu);
> +
> + exec_controls_changebit(to_vmx(vcpu), CPU_BASED_RDPMC_EXITING,
> + !kvm_rdpmc_in_guest(vcpu));
> +}
> +
> static void intel_pmu_init(struct kvm_vcpu *vcpu)
> {
> int i;