Message-ID: <20250424030816.470-1-yan.y.zhao@intel.com>
Date: Thu, 24 Apr 2025 11:08:16 +0800
From: Yan Zhao <yan.y.zhao@...el.com>
To: pbonzini@...hat.com,
seanjc@...gle.com
Cc: linux-kernel@...r.kernel.org,
kvm@...r.kernel.org,
x86@...nel.org,
rick.p.edgecombe@...el.com,
dave.hansen@...el.com,
kirill.shutemov@...el.com,
tabba@...gle.com,
ackerleytng@...gle.com,
quic_eberman@...cinc.com,
michael.roth@....com,
david@...hat.com,
vannapurve@...gle.com,
vbabka@...e.cz,
jroedel@...e.de,
thomas.lendacky@....com,
pgonda@...gle.com,
zhiquan1.li@...el.com,
fan.du@...el.com,
jun.miao@...el.com,
ira.weiny@...el.com,
isaku.yamahata@...el.com,
xiaoyao.li@...el.com,
binbin.wu@...ux.intel.com,
chao.p.peng@...el.com,
Yan Zhao <yan.y.zhao@...el.com>
Subject: [RFC PATCH 16/21] KVM: x86/mmu: Introduce kvm_split_boundary_leafs() to split boundary leafs

Introduce kvm_split_boundary_leafs() to split huge boundary leafs within
the mirror root.

Before zapping a specific GFN range in the mirror root, split any huge
leaf that intersects the boundary of that range, so that the subsequent
zap does not affect any GFN outside the specified range. This is crucial
for the mirror root, because the private page table requires the guest's
ACCEPT operation after a GFN is faulted back.
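
For example (illustrative numbers only), a 2MB leaf mapping GFNs
[0x800, 0xa00) must be split before zapping the range [0x900, 0xb00),
since the leaf straddles the start of the zap range:

    leaf_start = 0x800
    leaf_end   = leaf_start + KVM_PAGES_PER_HPAGE(PG_LEVEL_2M) = 0xa00
    contained  = leaf_start >= 0x900 && leaf_end <= 0xb00    /* false */

After splitting the leaf down to 4KB, the zap only removes
[0x900, 0xa00), and the guest does not need to re-ACCEPT
[0x800, 0x900).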

This function must be called while kvm->mmu_lock is held for writing.
The kvm->mmu_lock is temporarily released to allocate memory for the sp
used for splitting. The only expected error is -ENOMEM.
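
A caller is expected to follow roughly the pattern below (illustrative
sketch only; the actual call sites are not part of this patch):

    /* Illustrative caller sketch, with kvm->mmu_lock held for write. */
    r = kvm_split_boundary_leafs(kvm, range);
    if (r < 0)
        return r;
    flush = r;
    flush |= kvm_unmap_gfn_range(kvm, range);
    if (flush)
        kvm_flush_remote_tlbs(kvm);
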
Opportunistically, WARN in tdp_mmu_zap_leafs() if zapping a huge leaf in
the mirror root affects a GFN outside the specified range.
Signed-off-by: Xiaoyao Li <xiaoyao.li@...el.com>
Signed-off-by: Yan Zhao <yan.y.zhao@...el.com>
---
arch/x86/kvm/mmu/mmu.c | 21 +++++++
arch/x86/kvm/mmu/tdp_mmu.c | 116 ++++++++++++++++++++++++++++++++++++-
arch/x86/kvm/mmu/tdp_mmu.h | 1 +
include/linux/kvm_host.h | 1 +
4 files changed, 136 insertions(+), 3 deletions(-)
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 0e227199d73e..0d49c69b6b55 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -1640,6 +1640,27 @@ static bool __kvm_rmap_zap_gfn_range(struct kvm *kvm,
start, end - 1, can_yield, true, flush);
}
+/*
+ * Split large leafs at the boundary of the specified range for the mirror root
+ *
+ * Return value:
+ * 0 : success, no flush is required;
+ * 1 : success, flush is required;
+ * <0: failure.
+ */
+int kvm_split_boundary_leafs(struct kvm *kvm, struct kvm_gfn_range *range)
+{
+ int ret = 0;
+
+ lockdep_assert_once(kvm->mmu_invalidate_in_progress ||
+ lockdep_is_held(&kvm->slots_lock));
+
+ if (tdp_mmu_enabled)
+ ret = kvm_tdp_mmu_gfn_range_split_boundary(kvm, range);
+
+ return ret;
+}
+
bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
{
bool flush = false;
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 0f683753a7bb..d3fba5d11ea2 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -324,6 +324,8 @@ static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
u64 old_spte, u64 new_spte, int level,
bool shared);
+static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter,
+ struct kvm_mmu_page *sp, bool shared);
static struct kvm_mmu_page *tdp_mmu_alloc_sp_for_split(bool mirror);
static void *get_external_spt(gfn_t gfn, u64 new_spte, int level);
@@ -962,6 +964,19 @@ bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
return true;
}
+static inline bool iter_split_required(struct kvm *kvm, struct kvm_mmu_page *root,
+ struct tdp_iter *iter, gfn_t start, gfn_t end)
+{
+ if (!is_mirror_sp(root) || !is_large_pte(iter->old_spte))
+ return false;
+
+ /* Fully contained, no need to split */
+ if (iter->gfn >= start && iter->gfn + KVM_PAGES_PER_HPAGE(iter->level) <= end)
+ return false;
+
+ return true;
+}
+
/*
* If can_yield is true, will release the MMU lock and reschedule if the
* scheduler needs the CPU or there is contention on the MMU lock. If this
@@ -991,6 +1006,8 @@ static bool tdp_mmu_zap_leafs(struct kvm *kvm, struct kvm_mmu_page *root,
!is_last_spte(iter.old_spte, iter.level))
continue;
+ WARN_ON_ONCE(iter_split_required(kvm, root, &iter, start, end));
+
tdp_mmu_iter_set_spte(kvm, &iter, SHADOW_NONPRESENT_VALUE);
/*
@@ -1246,9 +1263,6 @@ static int tdp_mmu_link_sp(struct kvm *kvm, struct tdp_iter *iter,
return 0;
}
-static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter,
- struct kvm_mmu_page *sp, bool shared);
-
/*
* Handle a TDP page fault (NPT/EPT violation/misconfiguration) by installing
* page tables and SPTEs to translate the faulting guest physical address.
@@ -1341,6 +1355,102 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
return ret;
}
+/*
+ * Split large leafs at the boundary of the specified range for the mirror root
+ */
+static int tdp_mmu_split_boundary_leafs(struct kvm *kvm, struct kvm_mmu_page *root,
+ gfn_t start, gfn_t end, bool can_yield, bool *flush)
+{
+ struct kvm_mmu_page *sp = NULL;
+ struct tdp_iter iter;
+
+ WARN_ON_ONCE(!can_yield);
+
+ if (!is_mirror_sp(root))
+ return 0;
+
+ end = min(end, tdp_mmu_max_gfn_exclusive());
+
+ lockdep_assert_held_write(&kvm->mmu_lock);
+
+ rcu_read_lock();
+
+ for_each_tdp_pte_min_level(iter, kvm, root, PG_LEVEL_4K, start, end) {
+retry:
+ if (can_yield &&
+ tdp_mmu_iter_cond_resched(kvm, &iter, *flush, false)) {
+ *flush = false;
+ continue;
+ }
+
+ if (!is_shadow_present_pte(iter.old_spte) ||
+ !is_last_spte(iter.old_spte, iter.level) ||
+ !iter_split_required(kvm, root, &iter, start, end))
+ continue;
+
+ if (!sp) {
+ rcu_read_unlock();
+
+ write_unlock(&kvm->mmu_lock);
+
+ sp = tdp_mmu_alloc_sp_for_split(true);
+
+ write_lock(&kvm->mmu_lock);
+
+ if (!sp) {
+ trace_kvm_mmu_split_huge_page(iter.gfn, iter.old_spte,
+ iter.level, -ENOMEM);
+ return -ENOMEM;
+ }
+ rcu_read_lock();
+
+ iter.yielded = true;
+ continue;
+ }
+ tdp_mmu_init_child_sp(sp, &iter);
+
+ if (tdp_mmu_split_huge_page(kvm, &iter, sp, false))
+ goto retry;
+
+ sp = NULL;
+ /*
+ * Set yielded in case, after splitting to a lower level,
+ * the new leaf requires further splitting.
+ */
+ iter.yielded = true;
+ *flush = true;
+ }
+
+ rcu_read_unlock();
+
+ /* Free the unused sp; this should be impossible for the mirror root */
+ if (sp)
+ tdp_mmu_free_sp(sp);
+ return 0;
+}
+
+int kvm_tdp_mmu_gfn_range_split_boundary(struct kvm *kvm, struct kvm_gfn_range *range)
+{
+ enum kvm_tdp_mmu_root_types types;
+ struct kvm_mmu_page *root;
+ bool flush = false;
+ int ret;
+
+ types = kvm_gfn_range_filter_to_root_types(kvm, range->attr_filter) | KVM_INVALID_ROOTS;
+
+ __for_each_tdp_mmu_root_yield_safe(kvm, root, range->slot->as_id, types) {
+ ret = tdp_mmu_split_boundary_leafs(kvm, root, range->start, range->end,
+ range->may_block, &flush);
+ if (ret < 0) {
+ if (flush)
+ kvm_flush_remote_tlbs(kvm);
+
+ return ret;
+ }
+ }
+ return flush;
+}
+
/* Used by mmu notifier via kvm_unmap_gfn_range() */
bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range,
bool flush)
diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h
index 52acf99d40a0..806a21d4f0e3 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.h
+++ b/arch/x86/kvm/mmu/tdp_mmu.h
@@ -69,6 +69,7 @@ void kvm_tdp_mmu_zap_all(struct kvm *kvm);
void kvm_tdp_mmu_invalidate_roots(struct kvm *kvm,
enum kvm_tdp_mmu_root_types root_types);
void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm, bool shared);
+int kvm_tdp_mmu_gfn_range_split_boundary(struct kvm *kvm, struct kvm_gfn_range *range);
int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault);
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 655d36e1f4db..19d7a577e7ed 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -272,6 +272,7 @@ struct kvm_gfn_range {
bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range);
bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range);
bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range);
+int kvm_split_boundary_leafs(struct kvm *kvm, struct kvm_gfn_range *range);
#endif
enum {
--
2.43.2