Message-Id: <20240226143630.33643-38-jiangshanlai@gmail.com>
Date: Mon, 26 Feb 2024 22:35:54 +0800
From: Lai Jiangshan <jiangshanlai@...il.com>
To: linux-kernel@...r.kernel.org
Cc: Lai Jiangshan <jiangshan.ljs@...group.com>,
	Hou Wenlong <houwenlong.hwl@...group.com>,
	Linus Torvalds <torvalds@...ux-foundation.org>,
	Peter Zijlstra <peterz@...radead.org>,
	Sean Christopherson <seanjc@...gle.com>,
	Thomas Gleixner <tglx@...utronix.de>,
	Borislav Petkov <bp@...en8.de>,
	Ingo Molnar <mingo@...hat.com>,
	kvm@...r.kernel.org,
	Paolo Bonzini <pbonzini@...hat.com>,
	x86@...nel.org,
	Kees Cook <keescook@...omium.org>,
	Juergen Gross <jgross@...e.com>,
	Dave Hansen <dave.hansen@...ux.intel.com>,
	"H. Peter Anvin" <hpa@...or.com>
Subject: [RFC PATCH 37/73] KVM: x86/PVM: Use host PCID to reduce guest TLB flushing

From: Lai Jiangshan <jiangshan.ljs@...group.com>

Since the host does not use all PCIDs, PVM can utilize host PCIDs to
reduce guest TLB flushing. The PCID allocation algorithm in PVM is
similar to that of the host.
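
For reference, a compact sketch (illustrative only, not part of the
patch; the real logic is in pvm_set_host_cr3_for_guest_with_host_pcid()
below) of how the final CR3 value is composed once a host pcid has been
allocated for the guest pgd, assuming the usual meaning of CR3 bit 63
(X86_CR3_PCID_NOFLUSH):

	/*
	 * Sketch: when the pgd<->pcid association is found again on
	 * this pCPU, setting bit 63 on the CR3 write tells the CPU to
	 * keep the TLB entries tagged with this PCID, i.e. no flush.
	 */
	static u64 compose_guest_cr3(u64 root_hpa, u32 host_pcid, bool flush)
	{
		u64 hw_cr3 = root_hpa | host_pcid;	/* PCID lives in CR3[11:0] */

		if (!flush)
			hw_cr3 |= 1ULL << 63;		/* X86_CR3_PCID_NOFLUSH */
		return hw_cr3;
	}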

Signed-off-by: Lai Jiangshan <jiangshan.ljs@...group.com>
Signed-off-by: Hou Wenlong <houwenlong.hwl@...group.com>
---
 arch/x86/kvm/pvm/pvm.c | 228 ++++++++++++++++++++++++++++++++++++++++-
 arch/x86/kvm/pvm/pvm.h |   5 +
 2 files changed, 232 insertions(+), 1 deletion(-)
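
A standalone illustration (not part of the patch) of why the guest pcid
range chosen in pvm.h is safe: the host's dynamic ASID allocator only
hands out low PCIDs (kern_pcid() maps ASIDs 0..TLB_NR_DYN_ASIDS-1 to
PCIDs 1..TLB_NR_DYN_ASIDS), and with KPTI the user PCIDs have
X86_CR3_PTI_PCID_USER_BIT (bit 11) set. The two compile-time checks
added below encode exactly that:

	#define TLB_NR_DYN_ASIDS		6	/* from arch/x86/mm/tlb.c */
	#define X86_CR3_PTI_PCID_USER_BIT	11

	#define MIN_HOST_PCID_FOR_GUEST		32
	#define NUM_HOST_PCID_FOR_GUEST		32

	/* Guest PCIDs [32, 64) sit above the host's dynamic ASID PCIDs... */
	_Static_assert(MIN_HOST_PCID_FOR_GUEST > TLB_NR_DYN_ASIDS, "");
	/* ...and below PTI's user-PCID tag bit (1 << 11 == 2048). */
	_Static_assert(MIN_HOST_PCID_FOR_GUEST + NUM_HOST_PCID_FOR_GUEST <
		       (1 << X86_CR3_PTI_PCID_USER_BIT), "");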

diff --git a/arch/x86/kvm/pvm/pvm.c b/arch/x86/kvm/pvm/pvm.c
index 242c355fda8f..2d3785e7f2f3 100644
--- a/arch/x86/kvm/pvm/pvm.c
+++ b/arch/x86/kvm/pvm/pvm.c
@@ -349,6 +349,211 @@ static void pvm_switch_to_host(struct vcpu_pvm *pvm)
 	preempt_enable();
 }
 
+struct host_pcid_one {
+	/*
+	 * This is the owning struct vcpu_pvm *, but it must not be
+	 * dereferenced, since the vCPU might already have been freed.
+	 */
+	void *pvm;
+	u64 root_hpa;
+};
+
+struct host_pcid_state {
+	struct host_pcid_one pairs[NUM_HOST_PCID_FOR_GUEST];
+	int evict_next_round_robin;
+};
+
+static DEFINE_PER_CPU(struct host_pcid_state, pvm_tlb_state);
+
+static void host_pcid_flush_all(struct vcpu_pvm *pvm)
+{
+	struct host_pcid_state *tlb_state = this_cpu_ptr(&pvm_tlb_state);
+	int i;
+
+	for (i = 0; i < NUM_HOST_PCID_FOR_GUEST; i++) {
+		if (tlb_state->pairs[i].pvm == pvm)
+			tlb_state->pairs[i].pvm = NULL;
+	}
+}
+
+static inline unsigned int host_pcid_to_index(unsigned int host_pcid)
+{
+	return host_pcid & ~HOST_PCID_TAG_FOR_GUEST;
+}
+
+static inline int index_to_host_pcid(int index)
+{
+	return index | HOST_PCID_TAG_FOR_GUEST;
+}
+
+/*
+ * Free host PCIDs whose guest pgd is no longer cached by the MMU
+ * (neither in mmu->root nor in mmu->prev_roots), so that the next
+ * allocation does not have to evict a slot that is still in use.
+ *
+ * It would be better if kvm.ko notified us when a root_pgd is freed
+ * from the cache.
+ *
+ * Returns a freed index or -1 if nothing is freed.
+ */
+static int host_pcid_free_uncached(struct vcpu_pvm *pvm)
+{
+	/* Doing nothing here is a valid implementation. */
+	return -1;
+}
+
+/*
+ * Get a host pcid on the current pCPU for the given guest pgd.
+ * The PVM vTLB is tagged by guest pgd.
+ */
+static int host_pcid_get(struct vcpu_pvm *pvm, u64 root_hpa, bool *flush)
+{
+	struct host_pcid_state *tlb_state = this_cpu_ptr(&pvm_tlb_state);
+	int i, j = -1;
+
+	/* Check whether a host pcid is already associated with this pgd. */
+	for (i = 0; i < NUM_HOST_PCID_FOR_GUEST; i++) {
+		struct host_pcid_one *tlb = &tlb_state->pairs[i];
+
+		if (tlb->root_hpa == root_hpa && tlb->pvm == pvm)
+			return index_to_host_pcid(i);
+
+		/* Remember a free slot in case the pgd is not found. */
+		if (!tlb->pvm)
+			j = i;
+	}
+
+	/*
+	 * Fall back, in order, to:
+	 *    the free slot recorded in the loop above;
+	 *    a slot released by host_pcid_free_uncached();
+	 *    evicting a slot (which might still be in use) round-robin.
+	 */
+	if (j < 0)
+		j = host_pcid_free_uncached(pvm);
+	if (j < 0) {
+		j = tlb_state->evict_next_round_robin;
+		if (++tlb_state->evict_next_round_robin == NUM_HOST_PCID_FOR_GUEST)
+			tlb_state->evict_next_round_robin = 0;
+	}
+
+	/* Associate the host pcid with the guest pgd. */
+	tlb_state->pairs[j].pvm = pvm;
+	tlb_state->pairs[j].root_hpa = root_hpa;
+
+	*flush = true;
+	return index_to_host_pcid(j);
+}
+
+static void host_pcid_free(struct vcpu_pvm *pvm, u64 root_hpa)
+{
+	struct host_pcid_state *tlb_state = this_cpu_ptr(&pvm_tlb_state);
+	int i;
+
+	for (i = 0; i < NUM_HOST_PCID_FOR_GUEST; i++) {
+		struct host_pcid_one *tlb = &tlb_state->pairs[i];
+
+		if (tlb->root_hpa == root_hpa && tlb->pvm == pvm) {
+			tlb->pvm = NULL;
+			return;
+		}
+	}
+}
+
+static inline void *host_pcid_owner(int host_pcid)
+{
+	return this_cpu_read(pvm_tlb_state.pairs[host_pcid_to_index(host_pcid)].pvm);
+}
+
+static inline u64 host_pcid_root(int host_pcid)
+{
+	return this_cpu_read(pvm_tlb_state.pairs[host_pcid_to_index(host_pcid)].root_hpa);
+}
+
+static void __pvm_hwtlb_flush_all(struct vcpu_pvm *pvm)
+{
+	if (static_cpu_has(X86_FEATURE_PCID))
+		host_pcid_flush_all(pvm);
+}
+
+static void pvm_flush_hwtlb(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_pvm *pvm = to_pvm(vcpu);
+
+	get_cpu();
+	__pvm_hwtlb_flush_all(pvm);
+	put_cpu();
+}
+
+static void pvm_flush_hwtlb_guest(struct kvm_vcpu *vcpu)
+{
+	/*
+	 * The hardware TLB would only need to be flushed for the guest
+	 * when:
+	 *	the shadow page table changes, or
+	 *	a used (guest) pcid is reused.
+	 * A change to the shadow page table always results in a hardware
+	 * TLB flush, and PVM uses a pgd-tagged vTLB.
+	 *
+	 * So no hardware TLB needs to be flushed here.
+	 */
+}
+
+static void pvm_flush_hwtlb_current(struct kvm_vcpu *vcpu)
+{
+	/* No flush required if the current context is invalid. */
+	if (!VALID_PAGE(vcpu->arch.mmu->root.hpa))
+		return;
+
+	if (static_cpu_has(X86_FEATURE_PCID)) {
+		get_cpu();
+		host_pcid_free(to_pvm(vcpu), vcpu->arch.mmu->root.hpa);
+		put_cpu();
+	}
+}
+
+static void pvm_flush_hwtlb_gva(struct kvm_vcpu *vcpu, gva_t addr)
+{
+	struct vcpu_pvm *pvm = to_pvm(vcpu);
+	int max = MIN_HOST_PCID_FOR_GUEST + NUM_HOST_PCID_FOR_GUEST;
+	int i;
+
+	if (!static_cpu_has(X86_FEATURE_PCID))
+		return;
+
+	get_cpu();
+	if (!this_cpu_has(X86_FEATURE_INVPCID)) {
+		host_pcid_flush_all(pvm);
+		put_cpu();
+		return;
+	}
+
+	host_pcid_free_uncached(pvm);
+	for (i = MIN_HOST_PCID_FOR_GUEST; i < max; i++) {
+		if (host_pcid_owner(i) == pvm)
+			invpcid_flush_one(i, addr);
+	}
+
+	put_cpu();
+}
+
+static void pvm_set_host_cr3_for_guest_with_host_pcid(struct vcpu_pvm *pvm)
+{
+	u64 root_hpa = pvm->vcpu.arch.mmu->root.hpa;
+	bool flush = false;
+	u32 host_pcid = host_pcid_get(pvm, root_hpa, &flush);
+	u64 hw_cr3 = root_hpa | host_pcid;
+
+	if (!flush)
+		hw_cr3 |= CR3_NOFLUSH;
+	this_cpu_write(cpu_tss_rw.tss_ex.enter_cr3, hw_cr3);
+}
+
+static void pvm_set_host_cr3_for_guest_without_host_pcid(struct vcpu_pvm *pvm)
+{
+	u64 root_hpa = pvm->vcpu.arch.mmu->root.hpa;
+
+	this_cpu_write(cpu_tss_rw.tss_ex.enter_cr3, root_hpa);
+}
+
 static void pvm_set_host_cr3_for_hypervisor(struct vcpu_pvm *pvm)
 {
 	unsigned long cr3;
@@ -365,7 +570,11 @@ static void pvm_set_host_cr3_for_hypervisor(struct vcpu_pvm *pvm)
 static void pvm_set_host_cr3(struct vcpu_pvm *pvm)
 {
 	pvm_set_host_cr3_for_hypervisor(pvm);
-	this_cpu_write(cpu_tss_rw.tss_ex.enter_cr3, pvm->vcpu.arch.mmu->root.hpa);
+
+	if (static_cpu_has(X86_FEATURE_PCID))
+		pvm_set_host_cr3_for_guest_with_host_pcid(pvm);
+	else
+		pvm_set_host_cr3_for_guest_without_host_pcid(pvm);
 }
 
 static void pvm_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa,
@@ -391,6 +600,9 @@ static void pvm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 
 	__this_cpu_write(active_pvm_vcpu, pvm);
 
+	if (vcpu->cpu != cpu)
+		__pvm_hwtlb_flush_all(pvm);
+
 	indirect_branch_prediction_barrier();
 }
 
@@ -398,6 +610,7 @@ static void pvm_vcpu_put(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_pvm *pvm = to_pvm(vcpu);
 
+	host_pcid_free_uncached(pvm);
 	pvm_prepare_switch_to_host(pvm);
 }
 
@@ -2086,6 +2299,11 @@ static struct kvm_x86_ops pvm_x86_ops __initdata = {
 	.set_rflags = pvm_set_rflags,
 	.get_if_flag = pvm_get_if_flag,
 
+	.flush_tlb_all = pvm_flush_hwtlb,
+	.flush_tlb_current = pvm_flush_hwtlb_current,
+	.flush_tlb_gva = pvm_flush_hwtlb_gva,
+	.flush_tlb_guest = pvm_flush_hwtlb_guest,
+
 	.vcpu_pre_run = pvm_vcpu_pre_run,
 	.vcpu_run = pvm_vcpu_run,
 	.handle_exit = pvm_handle_exit,
@@ -2152,8 +2370,16 @@ static void pvm_exit(void)
 }
 module_exit(pvm_exit);
 
+#define TLB_NR_DYN_ASIDS	6
+
 static int __init hardware_cap_check(void)
 {
+	BUILD_BUG_ON(MIN_HOST_PCID_FOR_GUEST <= TLB_NR_DYN_ASIDS);
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+	BUILD_BUG_ON((MIN_HOST_PCID_FOR_GUEST + NUM_HOST_PCID_FOR_GUEST) >=
+		     (1 << X86_CR3_PTI_PCID_USER_BIT));
+#endif
+
 	/*
 	 * switcher can't be used when KPTI. See the comments above
 	 * SWITCHER_SAVE_AND_SWITCH_TO_HOST_CR3
diff --git a/arch/x86/kvm/pvm/pvm.h b/arch/x86/kvm/pvm/pvm.h
index 4cdcbed1c813..31060831e009 100644
--- a/arch/x86/kvm/pvm/pvm.h
+++ b/arch/x86/kvm/pvm/pvm.h
@@ -28,6 +28,11 @@ extern u64 *host_mmu_root_pgd;
 void host_mmu_destroy(void);
 int host_mmu_init(void);
 
+#define HOST_PCID_TAG_FOR_GUEST			(32)
+
+#define MIN_HOST_PCID_FOR_GUEST			HOST_PCID_TAG_FOR_GUEST
+#define NUM_HOST_PCID_FOR_GUEST			HOST_PCID_TAG_FOR_GUEST
+
 struct vcpu_pvm {
 	struct kvm_vcpu vcpu;
 
-- 
2.19.1.6.gb485710b

