linux-kernel - Re: [PATCH v4 29/38] KVM: x86/pmu: Switch host/guest PMU context at vm-exit/vm-entry

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <aCYWXXpFcx33uVPi@google.com>
Date: Thu, 15 May 2025 09:29:17 -0700
From: Sean Christopherson <seanjc@...gle.com>
To: Mingwei Zhang <mizhang@...gle.com>
Cc: Peter Zijlstra <peterz@...radead.org>, Ingo Molnar <mingo@...hat.com>, 
	Arnaldo Carvalho de Melo <acme@...nel.org>, Namhyung Kim <namhyung@...nel.org>, 
	Paolo Bonzini <pbonzini@...hat.com>, Mark Rutland <mark.rutland@....com>, 
	Alexander Shishkin <alexander.shishkin@...ux.intel.com>, Jiri Olsa <jolsa@...nel.org>, 
	Ian Rogers <irogers@...gle.com>, Adrian Hunter <adrian.hunter@...el.com>, Liang@...gle.com, 
	Kan <kan.liang@...ux.intel.com>, "H. Peter Anvin" <hpa@...or.com>, 
	linux-perf-users@...r.kernel.org, linux-kernel@...r.kernel.org, 
	kvm@...r.kernel.org, linux-kselftest@...r.kernel.org, 
	Yongwei Ma <yongwei.ma@...el.com>, Xiong Zhang <xiong.y.zhang@...ux.intel.com>, 
	Dapeng Mi <dapeng1.mi@...ux.intel.com>, Jim Mattson <jmattson@...gle.com>, 
	Sandipan Das <sandipan.das@....com>, Zide Chen <zide.chen@...el.com>, 
	Eranian Stephane <eranian@...gle.com>, Shukla Manali <Manali.Shukla@....com>, 
	Nikunj Dadhania <nikunj.dadhania@....com>
Subject: Re: [PATCH v4 29/38] KVM: x86/pmu: Switch host/guest PMU context at vm-exit/vm-entry

On Mon, Mar 24, 2025, Mingwei Zhang wrote:
> diff --git a/arch/x86/include/asm/kvm-x86-pmu-ops.h b/arch/x86/include/asm/kvm-x86-pmu-ops.h
> index 9159bf1a4730..35f27366c277 100644
> --- a/arch/x86/include/asm/kvm-x86-pmu-ops.h
> +++ b/arch/x86/include/asm/kvm-x86-pmu-ops.h
> @@ -22,6 +22,8 @@ KVM_X86_PMU_OP(init)
>  KVM_X86_PMU_OP_OPTIONAL(reset)
>  KVM_X86_PMU_OP_OPTIONAL(deliver_pmi)
>  KVM_X86_PMU_OP_OPTIONAL(cleanup)
> +KVM_X86_PMU_OP(put_guest_context)
> +KVM_X86_PMU_OP(load_guest_context)

For KVM, the "guest_context" part is largely superfluous, as KVM always operates
on guest state, e.g. kvm_fpu_{load,put}().

I do think we should squeeze in "mediated" somewhere, otherwise it's hard to
see that these are specific to the mediated PMU.

So probably mediated_{load,put}()?

>  #undef KVM_X86_PMU_OP
>  #undef KVM_X86_PMU_OP_OPTIONAL
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index 7ee74bbbb0aa..4117a382739a 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -568,6 +568,10 @@ struct kvm_pmu {
>  	u64 raw_event_mask;
>  	struct kvm_pmc gp_counters[KVM_MAX_NR_GP_COUNTERS];
>  	struct kvm_pmc fixed_counters[KVM_MAX_NR_FIXED_COUNTERS];
> +	u32 gp_eventsel_base;
> +	u32 gp_counter_base;
> +	u32 fixed_base;
> +	u32 cntr_shift;

Gah, my bad, "shift" was a terrible suggestion.  It should be "stride".

> @@ -306,6 +313,10 @@ void kvm_pmu_destroy(struct kvm_vcpu *vcpu);
>  int kvm_vm_ioctl_set_pmu_event_filter(struct kvm *kvm, void __user *argp);
>  void kvm_pmu_trigger_event(struct kvm_vcpu *vcpu, u64 eventsel);
>  bool vcpu_pmu_can_enable(struct kvm_vcpu *vcpu);
> +void kvm_pmu_put_guest_pmcs(struct kvm_vcpu *vcpu);
> +void kvm_pmu_load_guest_pmcs(struct kvm_vcpu *vcpu);
> +void kvm_pmu_put_guest_context(struct kvm_vcpu *vcpu);
> +void kvm_pmu_load_guest_context(struct kvm_vcpu *vcpu);
>  
>  bool is_vmware_backdoor_pmc(u32 pmc_idx);
>  bool kvm_rdpmc_in_guest(struct kvm_vcpu *vcpu);
> diff --git a/arch/x86/kvm/svm/pmu.c b/arch/x86/kvm/svm/pmu.c
> index 1a7e3a897fdf..7e0d84d50b74 100644
> --- a/arch/x86/kvm/svm/pmu.c
> +++ b/arch/x86/kvm/svm/pmu.c
> @@ -175,6 +175,22 @@ static int amd_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
>  	return 1;
>  }
>  
> +static inline void amd_update_msr_base(struct kvm_vcpu *vcpu)
> +{
> +	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
> +
> +	if (kvm_pmu_has_perf_global_ctrl(pmu) ||
> +	    guest_cpu_cap_has(vcpu, X86_FEATURE_PERFCTR_CORE)) {
> +		pmu->gp_eventsel_base = MSR_F15H_PERF_CTL0;
> +		pmu->gp_counter_base = MSR_F15H_PERF_CTR0;
> +		pmu->cntr_shift = 2;
> +	} else {
> +		pmu->gp_eventsel_base = MSR_K7_EVNTSEL0;
> +		pmu->gp_counter_base = MSR_K7_PERFCTR0;
> +		pmu->cntr_shift = 1;
> +	}
> +}

Moving quoted text around to organize responses...

> diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c
> index 796b7bc4affe..ed17ab198dfb 100644
> --- a/arch/x86/kvm/vmx/pmu_intel.c
> +++ b/arch/x86/kvm/vmx/pmu_intel.c
> @@ -460,6 +460,17 @@ static void intel_pmu_enable_fixed_counter_bits(struct kvm_pmu *pmu, u64 bits)
>  		pmu->fixed_ctr_ctrl_rsvd &= ~intel_fixed_bits_by_idx(i, bits);
>  }
>  
> +static inline void intel_update_msr_base(struct kvm_vcpu *vcpu)
> +{
> +	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
> +
> +	pmu->gp_eventsel_base = MSR_P6_EVNTSEL0;
> +	pmu->gp_counter_base = fw_writes_is_enabled(vcpu) ?
> +			       MSR_IA32_PMC0 : MSR_IA32_PERFCTR0;

This is wrong.  And I unintentionally proved that it's wrong, by goofing when I
fixed up this code and using MSR_IA32_PERFCTR0 instead of MSR_IA32_PMC0.

Whether or not the guest supports full-width writes is irrelevant, because support
for FW writes doesn't change the width of the counters.  Just because the *guest* 
can't directly write all e.g. 48 bits doesn't mean clobbering bits 47:32 is ok.

Similarly, on the AMD side, using the legacy interface in KVM is unnecessary.
The guest may be limited to those MSRs, but KVM has a hard dependency on PMU v2,
so just unconditionally use MSR_F15H_PERF_CTR0 (and for the record, because I
had to look it up, the newfangled MSRs on AMD are aliased to the legacy MSRs for
0..3).

Very happily, that means the MSRs don't need to be per-PMU, and they don't even
need to be configured at runtime for a given vendor.  Simply require FW writes
on Intel to enable the mediated PMU, and then hardcode the GP base to MSR_IA32_PMC0.

> +static void amd_put_guest_context(struct kvm_vcpu *vcpu)
> +{
> +	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
> +
> +	rdmsrl(MSR_AMD64_PERF_CNTR_GLOBAL_CTL, pmu->global_ctrl);
> +	wrmsrl(MSR_AMD64_PERF_CNTR_GLOBAL_CTL, 0);
> +	rdmsrl(MSR_AMD64_PERF_CNTR_GLOBAL_STATUS, pmu->global_status);
> +
> +	/* Clear global status bits if non-zero */
> +	if (pmu->global_status)
> +		wrmsrl(MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR, pmu->global_status);
> +
> +	kvm_pmu_put_guest_pmcs(vcpu);
> +}
> +
> +static void amd_load_guest_context(struct kvm_vcpu *vcpu)
> +{
> +	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
> +	u64 global_status;
> +
> +	wrmsrl(MSR_AMD64_PERF_CNTR_GLOBAL_CTL, 0);

Back when I suggested we give up on trying to handle PMCs and eventsels in common
x86, this WRMSR didn't exist.  Now that it does, I don't see anything that prevents
invoking kvm_pmu_{load,put}_guest_pmcs() from common x86, KVM just needs to clear
GLOBAL_CTRL before setting eventsels and PMCs.

For the load path:

	/*
	 * Disable all counters before loading event selectors and PMCs so that
	 * KVM doesn't enable or load guest counters while host events are
	 * active.  VMX will enable/disable counters at VM-Enter/VM-Exit by
	 * atomically loading PERF_GLOBAL_CONTROL.  SVM effectively performs
	 * the switch by configuring all events to be GUEST_ONLY.
	 */
	wrmsrl(kvm_pmu_ops.PERF_GLOBAL_CTRL, 0);

	kvm_pmu_load_guest_pmcs(vcpu);

	kvm_pmu_call(mediated_load)(vcpu);

And for the put path, just reverse the ordering:

	/*
	 * Defer handling of PERF_GLOBAL_CTRL to vendor code.  On Intel, it's
	 * atomically cleared on VM-Exit, i.e. doesn't need to be cleared here.
	 */
	kvm_pmu_call(mediated_put)(vcpu);

	kvm_pmu_put_guest_pmcs(vcpu);

	perf_put_guest_context();

On Intel, PERF_GLOBAL_CTRL is cleared on VM-Exit, and on AMD, the vendor hook
will clear it.  The fact that vendor code sets other MSRs is irrelevant; what
matters is that all counters are quiesced.

I think it's still worth having helpers, but they can be static locals.

> +
> +	kvm_pmu_load_guest_pmcs(vcpu);
> +
> +	rdmsrl(MSR_AMD64_PERF_CNTR_GLOBAL_STATUS, global_status);
> +	/* Clear host global_status MSR if non-zero. */
> +	if (global_status)
> +		wrmsrl(MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR, global_status);
> +
> +	wrmsrl(MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_SET, pmu->global_status);
> +	wrmsrl(MSR_AMD64_PERF_CNTR_GLOBAL_CTL, pmu->global_ctrl);
> +}
> +
>  static void intel_pmu_update_msr_intercepts(struct kvm_vcpu *vcpu)
> @@ -809,6 +822,50 @@ void intel_pmu_cross_mapped_check(struct kvm_pmu *pmu)
>  	}
>  }
>  
> +static void intel_put_guest_context(struct kvm_vcpu *vcpu)
> +{
> +	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
> +
> +	/* Global ctrl register is already saved at VM-exit. */
> +	rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, pmu->global_status);
> +
> +	/* Clear hardware MSR_CORE_PERF_GLOBAL_STATUS MSR, if non-zero. */
> +	if (pmu->global_status)
> +		wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, pmu->global_status);
> +
> +	rdmsrl(MSR_CORE_PERF_FIXED_CTR_CTRL, pmu->fixed_ctr_ctrl_hw);
> +
> +	/*
> +	 * Clear hardware FIXED_CTR_CTRL MSR to avoid information leakage and
> +	 * also avoid these guest fixed counters get accidentially enabled
> +	 * during host running when host enable global ctrl.
> +	 */
> +	if (pmu->fixed_ctr_ctrl_hw)
> +		wrmsrl(MSR_CORE_PERF_FIXED_CTR_CTRL, 0);
> +
> +	kvm_pmu_put_guest_pmcs(vcpu);
> +}
> +
> +static void intel_load_guest_context(struct kvm_vcpu *vcpu)
> +{
> +	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
> +	u64 global_status, toggle;
> +
> +	/* Clear host global_ctrl MSR if non-zero. */
> +	wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
> +
> +	rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, global_status);
> +	toggle = pmu->global_status ^ global_status;
> +	if (global_status & toggle)
> +		wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, global_status & toggle);
> +	if (pmu->global_status & toggle)
> +		wrmsrl(MSR_CORE_PERF_GLOBAL_STATUS_SET, pmu->global_status & toggle);
> +
> +	wrmsrl(MSR_CORE_PERF_FIXED_CTR_CTRL, pmu->fixed_ctr_ctrl_hw);
> +
> +	kvm_pmu_load_guest_pmcs(vcpu);
> +}