linux-kernel - [RFC PATCH] kvm: nv: Optimize the unmapping of shadow S2-MMU tables.

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [thread-next>] [day] [month] [year] [list]
Message-Id: <20240305054606.13261-1-gankulkarni@os.amperecomputing.com>
Date: Mon,  4 Mar 2024 21:46:06 -0800
From: Ganapatrao Kulkarni <gankulkarni@...amperecomputing.com>
To: kvmarm@...ts.cs.columbia.edu,
	kvm@...r.kernel.org,
	linux-arm-kernel@...ts.infradead.org,
	linux-kernel@...r.kernel.org
Cc: maz@...nel.org,
	oliver.upton@...ux.dev,
	darren@...amperecomputing.com,
	d.scott.phillips@...erecomputing.com,
	gankulkarni@...amperecomputing.com
Subject: [RFC PATCH] kvm: nv: Optimize the unmapping of shadow S2-MMU tables.

As per commit 178a6915434c ("KVM: arm64: nv: Unmap/flush shadow stage 2
page tables"), whenever pages that are mapped to L1 are unmapped, they
are invalidated from both the L1 S2-MMU and from all the active
shadow/L2 S2-MMU tables. Since there is no mapping from a page to the
Shadow S2 IPAs that need to be invalidated, a complete S2-MMU page
table walk and invalidation is done, covering the complete address
space allocated to an L2. This has a performance impact and can even
cause soft lockups when booting NV (L1 and L2) with a high number of
CPUs and large memory.

Add a lookup table that records the mapping between Shadow IPA and
Canonical IPA whenever a page is mapped to any of the L2s. When a page
is unmapped, this lookup is used to unmap it only if it is mapped in
any of the shadow S2-MMU tables. This avoids unnecessarily long
iterations of S2-MMU table walks and invalidation over the
complete address space.

Signed-off-by: Ganapatrao Kulkarni <gankulkarni@...amperecomputing.com>
---
 arch/arm64/include/asm/kvm_emulate.h |   5 ++
 arch/arm64/include/asm/kvm_host.h    |  14 ++++
 arch/arm64/include/asm/kvm_nested.h  |   4 +
 arch/arm64/kvm/mmu.c                 |  19 ++++-
 arch/arm64/kvm/nested.c              | 113 +++++++++++++++++++++++++++
 5 files changed, 152 insertions(+), 3 deletions(-)

diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h
index 5173f8cf2904..f503b2eaedc4 100644
--- a/arch/arm64/include/asm/kvm_emulate.h
+++ b/arch/arm64/include/asm/kvm_emulate.h
@@ -656,4 +656,9 @@ static inline bool kvm_is_shadow_s2_fault(struct kvm_vcpu *vcpu)
 		vcpu->arch.hw_mmu->nested_stage2_enabled);
 }
 
+static inline bool kvm_is_l1_using_shadow_s2(struct kvm_vcpu *vcpu)
+{
+	return (vcpu->arch.hw_mmu != &vcpu->kvm->arch.mmu);
+}
+
 #endif /* __ARM64_KVM_EMULATE_H__ */
diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index 8da3c9a81ae3..f61c674c300a 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -144,6 +144,13 @@ struct kvm_vmid {
 	atomic64_t id;
 };
 
+struct mapipa_node {
+	struct rb_node node;
+	phys_addr_t ipa;
+	phys_addr_t shadow_ipa;
+	long size;
+};
+
 struct kvm_s2_mmu {
 	struct kvm_vmid vmid;
 
@@ -216,6 +223,13 @@ struct kvm_s2_mmu {
 	 * >0: Somebody is actively using this.
 	 */
 	atomic_t refcnt;
+
+	/*
+	 * For a Canonical IPA to Shadow IPA mapping.
+	 */
+	struct rb_root nested_mapipa_root;
+	rwlock_t mmu_lock;
+
 };
 
 static inline bool kvm_s2_mmu_valid(struct kvm_s2_mmu *mmu)
diff --git a/arch/arm64/include/asm/kvm_nested.h b/arch/arm64/include/asm/kvm_nested.h
index da7ebd2f6e24..c31a59a1fdc6 100644
--- a/arch/arm64/include/asm/kvm_nested.h
+++ b/arch/arm64/include/asm/kvm_nested.h
@@ -65,6 +65,9 @@ extern void kvm_init_nested(struct kvm *kvm);
 extern int kvm_vcpu_init_nested(struct kvm_vcpu *vcpu);
 extern void kvm_init_nested_s2_mmu(struct kvm_s2_mmu *mmu);
 extern struct kvm_s2_mmu *lookup_s2_mmu(struct kvm_vcpu *vcpu);
+extern void add_shadow_ipa_map_node(
+		struct kvm_s2_mmu *mmu,
+		phys_addr_t ipa, phys_addr_t shadow_ipa, long size);
 
 union tlbi_info;
 
@@ -123,6 +126,7 @@ extern int kvm_s2_handle_perm_fault(struct kvm_vcpu *vcpu,
 extern int kvm_inject_s2_fault(struct kvm_vcpu *vcpu, u64 esr_el2);
 extern void kvm_nested_s2_wp(struct kvm *kvm);
 extern void kvm_nested_s2_unmap(struct kvm *kvm);
+extern void kvm_nested_s2_unmap_range(struct kvm *kvm, struct kvm_gfn_range *range);
 extern void kvm_nested_s2_flush(struct kvm *kvm);
 int handle_wfx_nested(struct kvm_vcpu *vcpu, bool is_wfe);
 
diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index 61bdd8798f83..3948681426a0 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -1695,6 +1695,13 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 					     memcache,
 					     KVM_PGTABLE_WALK_HANDLE_FAULT |
 					     KVM_PGTABLE_WALK_SHARED);
+		if ((nested || kvm_is_l1_using_shadow_s2(vcpu)) && !ret) {
+			struct kvm_s2_mmu *shadow_s2_mmu;
+
+			ipa &= ~(vma_pagesize - 1);
+			shadow_s2_mmu = lookup_s2_mmu(vcpu);
+			add_shadow_ipa_map_node(shadow_s2_mmu, ipa, fault_ipa, vma_pagesize);
+		}
 	}
 
 	/* Mark the page dirty only if the fault is handled successfully */
@@ -1918,7 +1925,7 @@ bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
 			     (range->end - range->start) << PAGE_SHIFT,
 			     range->may_block);
 
-	kvm_nested_s2_unmap(kvm);
+	kvm_nested_s2_unmap_range(kvm, range);
 	return false;
 }
 
@@ -1953,7 +1960,7 @@ bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
 			       PAGE_SIZE, __pfn_to_phys(pfn),
 			       KVM_PGTABLE_PROT_R, NULL, 0);
 
-	kvm_nested_s2_unmap(kvm);
+	kvm_nested_s2_unmap_range(kvm, range);
 	return false;
 }
 
@@ -2223,12 +2230,18 @@ void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen)
 void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
 				   struct kvm_memory_slot *slot)
 {
+	struct kvm_gfn_range range;
+
 	gpa_t gpa = slot->base_gfn << PAGE_SHIFT;
 	phys_addr_t size = slot->npages << PAGE_SHIFT;
 
+	range.start = gpa;
+	range.end = gpa + size;
+	range.may_block = true;
+
 	write_lock(&kvm->mmu_lock);
 	kvm_unmap_stage2_range(&kvm->arch.mmu, gpa, size);
-	kvm_nested_s2_unmap(kvm);
+	kvm_nested_s2_unmap_range(kvm, &range);
 	write_unlock(&kvm->mmu_lock);
 }
 
diff --git a/arch/arm64/kvm/nested.c b/arch/arm64/kvm/nested.c
index f88d9213c6b3..888ec9fba4a0 100644
--- a/arch/arm64/kvm/nested.c
+++ b/arch/arm64/kvm/nested.c
@@ -565,6 +565,88 @@ void kvm_s2_mmu_iterate_by_vmid(struct kvm *kvm, u16 vmid,
 	write_unlock(&kvm->mmu_lock);
 }
 
+/*
+ * Create a node and add to lookup table, when a page is mapped to
+ * Canonical IPA and also mapped to Shadow IPA.
+ */
+void add_shadow_ipa_map_node(struct kvm_s2_mmu *mmu,
+			phys_addr_t ipa,
+			phys_addr_t shadow_ipa, long size)
+{
+	struct rb_root *ipa_root = &(mmu->nested_mapipa_root);
+	struct rb_node **node = &(ipa_root->rb_node), *parent = NULL;
+	struct mapipa_node *new;
+
+	new = kzalloc(sizeof(struct mapipa_node), GFP_KERNEL);
+	if (!new)
+		return;
+
+	new->shadow_ipa = shadow_ipa;
+	new->ipa = ipa;
+	new->size = size;
+
+	write_lock(&mmu->mmu_lock);
+
+	while (*node) {
+		struct mapipa_node *tmp;
+
+		tmp = container_of(*node, struct mapipa_node, node);
+		parent = *node;
+		if (new->ipa < tmp->ipa) {
+			node = &(*node)->rb_left;
+		} else if (new->ipa > tmp->ipa) {
+			node = &(*node)->rb_right;
+		} else {
+			write_unlock(&mmu->mmu_lock);
+			kfree(new);
+			return;
+		}
+	}
+
+	rb_link_node(&new->node, parent, node);
+	rb_insert_color(&new->node, ipa_root);
+	write_unlock(&mmu->mmu_lock);
+}
+
+/*
+ * Iterate over the lookup table of Canonical IPA to Shadow IPA.
+ * Return Shadow IPA, if the page mapped to Canonical IPA is
+ * also mapped to a Shadow IPA.
+ *
+ */
+bool get_shadow_ipa(struct kvm_s2_mmu *mmu, phys_addr_t ipa, phys_addr_t *shadow_ipa, long *size)
+{
+	struct rb_node *node;
+	struct mapipa_node *tmp = NULL;
+
+	read_lock(&mmu->mmu_lock);
+	node = mmu->nested_mapipa_root.rb_node;
+
+	while (node) {
+		tmp = container_of(node, struct mapipa_node, node);
+
+		if (tmp->ipa == ipa)
+			break;
+		else if (ipa > tmp->ipa)
+			node = node->rb_right;
+		else
+			node = node->rb_left;
+	}
+
+	read_unlock(&mmu->mmu_lock);
+
+	if (tmp && tmp->ipa == ipa) {
+		*shadow_ipa = tmp->shadow_ipa;
+		*size = tmp->size;
+		write_lock(&mmu->mmu_lock);
+		rb_erase(&tmp->node, &mmu->nested_mapipa_root);
+		write_unlock(&mmu->mmu_lock);
+		kfree(tmp);
+		return true;
+	}
+	return false;
+}
+
 /* Must be called with kvm->mmu_lock held */
 struct kvm_s2_mmu *lookup_s2_mmu(struct kvm_vcpu *vcpu)
 {
@@ -674,6 +756,7 @@ void kvm_init_nested_s2_mmu(struct kvm_s2_mmu *mmu)
 	mmu->tlb_vttbr = 1;
 	mmu->nested_stage2_enabled = false;
 	atomic_set(&mmu->refcnt, 0);
+	mmu->nested_mapipa_root = RB_ROOT;
 }
 
 void kvm_vcpu_load_hw_mmu(struct kvm_vcpu *vcpu)
@@ -760,6 +843,36 @@ void kvm_nested_s2_unmap(struct kvm *kvm)
 	}
 }
 
+void kvm_nested_s2_unmap_range(struct kvm *kvm, struct kvm_gfn_range *range)
+{
+	int i;
+	long size;
+	bool ret;
+
+	for (i = 0; i < kvm->arch.nested_mmus_size; i++) {
+		struct kvm_s2_mmu *mmu = &kvm->arch.nested_mmus[i];
+
+		if (kvm_s2_mmu_valid(mmu)) {
+			phys_addr_t shadow_ipa, start, end;
+
+			start = range->start << PAGE_SHIFT;
+			end = range->end << PAGE_SHIFT;
+
+			while (start < end) {
+				size = PAGE_SIZE;
+				/*
+				 * get the Shadow IPA if the page is mapped
+				 * to L1 and also mapped to any of active L2.
+				 */
+				ret = get_shadow_ipa(mmu, start, &shadow_ipa, &size);
+				if (ret)
+					kvm_unmap_stage2_range(mmu, shadow_ipa, size);
+				start += size;
+			}
+		}
+	}
+}
+
 /* expects kvm->mmu_lock to be held */
 void kvm_nested_s2_flush(struct kvm *kvm)
 {
-- 
2.40.1