linux-kernel - [PATCH] KVM: X86: Emulate APERF/MPERF to report actual VCPU frequency

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [thread-next>] [day] [month] [year] [list]
Message-Id: <20200623063530.81917-1-like.xu@linux.intel.com>
Date:   Tue, 23 Jun 2020 14:35:30 +0800
From:   Like Xu <like.xu@...ux.intel.com>
To:     Paolo Bonzini <pbonzini@...hat.com>, kvm@...r.kernel.org
Cc:     Sean Christopherson <sean.j.christopherson@...el.com>,
        Vitaly Kuznetsov <vkuznets@...hat.com>,
        Wanpeng Li <wanpengli@...cent.com>,
        Jim Mattson <jmattson@...gle.com>,
        Joerg Roedel <joro@...tes.org>, wei.huang2@....com,
        Peter Zijlstra <peterz@...radead.org>,
        Thomas Gleixner <tglx@...utronix.de>,
        linux-kernel@...r.kernel.org, Like Xu <like.xu@...ux.intel.com>,
        Li RongQing <lirongqing@...du.com>,
        Chai Wen <chaiwen@...du.com>, Jia Lina <jialina01@...du.com>
Subject: [PATCH] KVM: X86: Emulate APERF/MPERF to report actual VCPU frequency

The aperf/mperf are used to report current CPU frequency after 7d5905dc14a
"x86 / CPU: Always show current CPU frequency in /proc/cpuinfo". But guest
kernel always reports a fixed VCPU frequency in the /proc/cpuinfo, which
may confuse users especially when turbo is enabled on the host.

Emulate guest APERF/MPERF capability based their values on the host.

Co-developed-by: Li RongQing <lirongqing@...du.com>
Signed-off-by: Li RongQing <lirongqing@...du.com>
Reviewed-by: Chai Wen <chaiwen@...du.com>
Reviewed-by: Jia Lina <jialina01@...du.com>
Signed-off-by: Like Xu <like.xu@...ux.intel.com>
---
 arch/x86/include/asm/kvm_host.h | 12 ++++++
 arch/x86/kvm/cpuid.c            |  8 +++-
 arch/x86/kvm/x86.c              | 76 ++++++++++++++++++++++++++++++++-
 3 files changed, 94 insertions(+), 2 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index f852ee350beb..c48b9a0a086e 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -539,6 +539,16 @@ struct kvm_vcpu_hv_stimer {
 	bool msg_pending;
 };
 
+/* vCPU thermal and power context */
+struct kvm_vcpu_hwp {
+	/* Hardware Coordination Feedback Capability (Presence of APERF/MPERF) */
+	bool hw_coord_fb_cap;
+	/* MPERF increases with a fixed frequency */
+	u64 mperf;
+	/* APERF increases with the current/actual frequency */
+	u64 aperf;
+};
+
 /* Hyper-V synthetic interrupt controller (SynIC)*/
 struct kvm_vcpu_hv_synic {
 	u64 version;
@@ -829,6 +839,8 @@ struct kvm_vcpu_arch {
 
 	/* AMD MSRC001_0015 Hardware Configuration */
 	u64 msr_hwcr;
+
+	struct kvm_vcpu_hwp hwp;
 };
 
 struct kvm_lpage_info {
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 8a294f9747aa..7057809e7cfd 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -78,6 +78,11 @@ int kvm_update_cpuid(struct kvm_vcpu *vcpu)
 			apic->lapic_timer.timer_mode_mask = 1 << 17;
 	}
 
+	best = kvm_find_cpuid_entry(vcpu, 0x6, 0);
+	if (best && best->function == 0x6 &&
+	    boot_cpu_has(X86_FEATURE_APERFMPERF) && (best->ecx & 0x1))
+		vcpu->arch.hwp.hw_coord_fb_cap = true;
+
 	best = kvm_find_cpuid_entry(vcpu, 7, 0);
 	if (best && boot_cpu_has(X86_FEATURE_PKU) && best->function == 0x7)
 		cpuid_entry_change(best, X86_FEATURE_OSPKE,
@@ -561,7 +566,8 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
 	case 6: /* Thermal management */
 		entry->eax = 0x4; /* allow ARAT */
 		entry->ebx = 0;
-		entry->ecx = 0;
+		/* allow aperf/mperf to report the true VCPU frequency. */
+		entry->ecx = boot_cpu_has(X86_FEATURE_APERFMPERF) ? 0x1 : 0;
 		entry->edx = 0;
 		break;
 	/* function 7 has additional index. */
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 00c88c2f34e4..d220d9cc904a 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3056,6 +3056,16 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 			return 1;
 		vcpu->arch.msr_misc_features_enables = data;
 		break;
+	case MSR_IA32_MPERF:
+		if (!msr_info->host_initiated && !vcpu->arch.hwp.hw_coord_fb_cap)
+			return 1;
+		vcpu->arch.hwp.mperf = 0;
+		return 0;
+	case MSR_IA32_APERF:
+		if (!msr_info->host_initiated && !vcpu->arch.hwp.hw_coord_fb_cap)
+			return 1;
+		vcpu->arch.hwp.aperf = 0;
+		return 0;
 	default:
 		if (msr && (msr == vcpu->kvm->arch.xen_hvm_config.msr))
 			return xen_hvm_config(vcpu, data);
@@ -3323,6 +3333,16 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 	case MSR_K7_HWCR:
 		msr_info->data = vcpu->arch.msr_hwcr;
 		break;
+	case MSR_IA32_MPERF:
+		if (!msr_info->host_initiated && !vcpu->arch.hwp.hw_coord_fb_cap)
+			return 1;
+		msr_info->data = vcpu->arch.hwp.mperf;
+		break;
+	case MSR_IA32_APERF:
+		if (!msr_info->host_initiated && !vcpu->arch.hwp.hw_coord_fb_cap)
+			return 1;
+		msr_info->data = vcpu->arch.hwp.aperf;
+		break;
 	default:
 		if (kvm_pmu_is_valid_msr(vcpu, msr_info->index))
 			return kvm_pmu_get_msr(vcpu, msr_info);
@@ -8300,6 +8320,50 @@ void __kvm_request_immediate_exit(struct kvm_vcpu *vcpu)
 }
 EXPORT_SYMBOL_GPL(__kvm_request_immediate_exit);
 
+static inline void get_host_amperf(u64 *mperf, u64 *aperf)
+{
+	rdmsrl(MSR_IA32_MPERF, *mperf);
+	rdmsrl(MSR_IA32_APERF, *aperf);
+}
+
+static inline u64 get_amperf_delta(u64 enter, u64 exit)
+{
+	return (exit >= enter) ? (exit - enter) : (ULONG_MAX - enter + exit);
+}
+
+static inline void vcpu_update_amperf(struct kvm_vcpu *vcpu, u64 adelta, u64 mdelta)
+{
+	u64 aperf_left, mperf_left, delta, tmp;
+
+	aperf_left = ULONG_MAX - vcpu->arch.hwp.aperf;
+	mperf_left = ULONG_MAX - vcpu->arch.hwp.mperf;
+
+	/* fast path when neither MSR overflows */
+	if (adelta <= aperf_left && mdelta <= mperf_left) {
+		vcpu->arch.hwp.aperf += adelta;
+		vcpu->arch.hwp.mperf += mdelta;
+		return;
+	}
+
+	/* when either MSR overflows, both MSRs are reset to zero and continue to increment. */
+	delta = min(adelta, mdelta);
+	if (delta > aperf_left || delta > mperf_left) {
+		tmp = max(vcpu->arch.hwp.aperf, vcpu->arch.hwp.mperf);
+		tmp = delta - (ULONG_MAX - tmp) - 1;
+		vcpu->arch.hwp.aperf = tmp + adelta - delta;
+		vcpu->arch.hwp.mperf = tmp + mdelta - delta;
+		return;
+	}
+
+	if (mdelta > adelta && mdelta > aperf_left) {
+		vcpu->arch.hwp.mperf = mdelta - mperf_left - 1;
+		vcpu->arch.hwp.aperf = 0;
+	} else {
+		vcpu->arch.hwp.mperf = 0;
+		vcpu->arch.hwp.aperf = adelta - aperf_left - 1;
+	}
+}
+
 /*
  * Returns 1 to let vcpu_run() continue the guest execution loop without
  * exiting to the userspace.  Otherwise, the value will be returned to the
@@ -8312,7 +8376,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 		dm_request_for_irq_injection(vcpu) &&
 		kvm_cpu_accept_dm_intr(vcpu);
 	fastpath_t exit_fastpath;
-
+	u64 enter_mperf = 0, enter_aperf = 0, exit_mperf = 0, exit_aperf = 0;
 	bool req_immediate_exit = false;
 
 	if (kvm_request_pending(vcpu)) {
@@ -8516,8 +8580,17 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 		vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_RELOAD;
 	}
 
+	if (unlikely(vcpu->arch.hwp.hw_coord_fb_cap))
+		get_host_amperf(&enter_mperf, &enter_aperf);
+
 	exit_fastpath = kvm_x86_ops.run(vcpu);
 
+	if (unlikely(vcpu->arch.hwp.hw_coord_fb_cap)) {
+		get_host_amperf(&exit_mperf, &exit_aperf);
+		vcpu_update_amperf(vcpu, get_amperf_delta(enter_aperf, exit_aperf),
+			get_amperf_delta(enter_mperf, exit_mperf));
+	}
+
 	/*
 	 * Do this here before restoring debug registers on the host.  And
 	 * since we do this before handling the vmexit, a DR access vmexit
@@ -9482,6 +9555,7 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
 
 	vcpu->arch.pending_external_vector = -1;
 	vcpu->arch.preempted_in_kernel = false;
+	vcpu->arch.hwp.hw_coord_fb_cap = false;
 
 	kvm_hv_vcpu_init(vcpu);
 
-- 
2.21.3