linux-kernel - [PATCH v2 2/4] KVM: x86/mmu: Extract out TDP MMU NX huge page recovery code

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20240829191135.2041489-3-vipinsh@google.com>
Date: Thu, 29 Aug 2024 12:11:33 -0700
From: Vipin Sharma <vipinsh@...gle.com>
To: seanjc@...gle.com, pbonzini@...hat.com, dmatlack@...gle.com
Cc: kvm@...r.kernel.org, linux-kernel@...r.kernel.org, 
	Vipin Sharma <vipinsh@...gle.com>
Subject: [PATCH v2 2/4] KVM: x86/mmu: Extract out TDP MMU NX huge page
 recovery code

Create a separate function for TDP MMU NX huge page recovery. In the new
TDP MMU function, remove the code related to the "prepare and commit"
zapping of legacy MMU pages, as there will be no legacy MMU pages.
Similarly, remove the TDP MMU zap-related logic from the legacy MMU NX
huge page recovery code. Extract the dirty logging check into a helper,
as it is common to both. Rename kvm_recover_nx_huge_pages() to
kvm_mmu_recover_nx_huge_pages().

Separating the code allows TDP MMU NX huge page recovery to be changed
independently of the legacy MMU.

Signed-off-by: Vipin Sharma <vipinsh@...gle.com>
---
 arch/x86/kvm/mmu/mmu.c          | 93 ++++++++++++++-------------------
 arch/x86/kvm/mmu/mmu_internal.h |  2 +
 arch/x86/kvm/mmu/tdp_mmu.c      | 68 ++++++++++++++++++++++++
 arch/x86/kvm/mmu/tdp_mmu.h      |  3 ++
 4 files changed, 113 insertions(+), 53 deletions(-)

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 0bda372b13a5..c8c64df979e3 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -925,7 +925,7 @@ void untrack_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp)
 	list_del_init(&sp->possible_nx_huge_page_link);
 }
 
-static void unaccount_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp)
+void unaccount_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp)
 {
 	sp->nx_huge_page_disallowed = false;
 
@@ -7327,26 +7327,44 @@ static int set_nx_huge_pages_recovery_param(const char *val, const struct kernel
 	return err;
 }
 
-static void kvm_recover_nx_huge_pages(struct kvm *kvm,
-				      struct list_head *nx_huge_pages,
-				      unsigned long to_zap)
+bool kvm_mmu_sp_dirty_logging_enabled(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+	struct kvm_memory_slot *slot = NULL;
+
+	/*
+	 * Since gfn_to_memslot() is relatively expensive, it helps to skip it if
+	 * the test cannot possibly return true.  On the other hand, if any
+	 * memslot has logging enabled, chances are good that all of them do, in
+	 * which case unaccount_nx_huge_page() is much cheaper than zapping the
+	 * page.
+	 *
+	 * If a memslot update is in progress, reading an incorrect value of
+	 * kvm->nr_memslots_dirty_logging is not a problem: if it is becoming
+	 * zero, gfn_to_memslot() will be done unnecessarily; if it is becoming
+	 * nonzero, the page will be zapped unnecessarily.  Either way, this only
+	 * affects efficiency in racy situations, and not correctness.
+	 */
+	if (atomic_read(&kvm->nr_memslots_dirty_logging)) {
+		struct kvm_memslots *slots;
+
+		slots = kvm_memslots_for_spte_role(kvm, sp->role);
+		slot = __gfn_to_memslot(slots, sp->gfn);
+		WARN_ON_ONCE(!slot);
+	}
+	return slot && kvm_slot_dirty_track_enabled(slot);
+}
+
+static void kvm_mmu_recover_nx_huge_pages(struct kvm *kvm,
+					  struct list_head *nx_huge_pages,
+					  unsigned long to_zap)
 {
-	struct kvm_memory_slot *slot;
 	int rcu_idx;
 	struct kvm_mmu_page *sp;
 	LIST_HEAD(invalid_list);
-	bool flush = false;
 
 	rcu_idx = srcu_read_lock(&kvm->srcu);
 	write_lock(&kvm->mmu_lock);
 
-	/*
-	 * Zapping TDP MMU shadow pages, including the remote TLB flush, must
-	 * be done under RCU protection, because the pages are freed via RCU
-	 * callback.
-	 */
-	rcu_read_lock();
-
 	for ( ; to_zap; --to_zap) {
 		if (list_empty(nx_huge_pages))
 			break;
@@ -7370,50 +7388,19 @@ static void kvm_recover_nx_huge_pages(struct kvm *kvm,
 		 * back in as 4KiB pages. The NX Huge Pages in this slot will be
 		 * recovered, along with all the other huge pages in the slot,
 		 * when dirty logging is disabled.
-		 *
-		 * Since gfn_to_memslot() is relatively expensive, it helps to
-		 * skip it if it the test cannot possibly return true.  On the
-		 * other hand, if any memslot has logging enabled, chances are
-		 * good that all of them do, in which case unaccount_nx_huge_page()
-		 * is much cheaper than zapping the page.
-		 *
-		 * If a memslot update is in progress, reading an incorrect value
-		 * of kvm->nr_memslots_dirty_logging is not a problem: if it is
-		 * becoming zero, gfn_to_memslot() will be done unnecessarily; if
-		 * it is becoming nonzero, the page will be zapped unnecessarily.
-		 * Either way, this only affects efficiency in racy situations,
-		 * and not correctness.
 		 */
-		slot = NULL;
-		if (atomic_read(&kvm->nr_memslots_dirty_logging)) {
-			struct kvm_memslots *slots;
-
-			slots = kvm_memslots_for_spte_role(kvm, sp->role);
-			slot = __gfn_to_memslot(slots, sp->gfn);
-			WARN_ON_ONCE(!slot);
-		}
-
-		if (slot && kvm_slot_dirty_track_enabled(slot))
+		if (kvm_mmu_sp_dirty_logging_enabled(kvm, sp))
 			unaccount_nx_huge_page(kvm, sp);
-		else if (is_tdp_mmu_page(sp))
-			flush |= kvm_tdp_mmu_zap_sp(kvm, sp);
 		else
 			kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
 		WARN_ON_ONCE(sp->nx_huge_page_disallowed);
 
 		if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
-			kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush);
-			rcu_read_unlock();
-
+			kvm_mmu_commit_zap_page(kvm, &invalid_list);
 			cond_resched_rwlock_write(&kvm->mmu_lock);
-			flush = false;
-
-			rcu_read_lock();
 		}
 	}
-	kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush);
-
-	rcu_read_unlock();
+	kvm_mmu_commit_zap_page(kvm, &invalid_list);
 
 	write_unlock(&kvm->mmu_lock);
 	srcu_read_unlock(&kvm->srcu, rcu_idx);
@@ -7461,16 +7448,16 @@ static int kvm_nx_huge_page_recovery_worker(struct kvm *kvm, uintptr_t data)
 			return 0;
 
 		to_zap = nx_huge_pages_to_zap(kvm);
-		kvm_recover_nx_huge_pages(kvm,
-					  &kvm->arch.possible_nx_huge_pages,
-					  to_zap);
+		kvm_mmu_recover_nx_huge_pages(kvm,
+					      &kvm->arch.possible_nx_huge_pages,
+					      to_zap);
 
 		if (tdp_mmu_enabled) {
 #ifdef CONFIG_X86_64
 			to_zap = kvm_tdp_mmu_nx_huge_pages_to_zap(kvm);
-			kvm_recover_nx_huge_pages(kvm,
-						  &kvm->arch.tdp_mmu_possible_nx_huge_pages,
-						  to_zap);
+			kvm_tdp_mmu_recover_nx_huge_pages(kvm,
+						      &kvm->arch.tdp_mmu_possible_nx_huge_pages,
+						      to_zap);
 #endif
 		}
 	}
diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h
index 8deed808592b..83b165077d97 100644
--- a/arch/x86/kvm/mmu/mmu_internal.h
+++ b/arch/x86/kvm/mmu/mmu_internal.h
@@ -353,6 +353,8 @@ void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc);
 
 void track_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp);
 void untrack_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp);
+void unaccount_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp);
+bool kvm_mmu_sp_dirty_logging_enabled(struct kvm *kvm, struct kvm_mmu_page *sp);
 
 extern unsigned int nx_huge_pages_recovery_ratio;
 #endif /* __KVM_X86_MMU_INTERNAL_H */
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 6415c2c7e936..f0b4341264fd 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -1805,3 +1805,71 @@ unsigned long kvm_tdp_mmu_nx_huge_pages_to_zap(struct kvm *kvm)
 
 	return ratio ? DIV_ROUND_UP(pages, ratio) : 0;
 }
+
+void kvm_tdp_mmu_recover_nx_huge_pages(struct kvm *kvm,
+				   struct list_head *nx_huge_pages,
+				   unsigned long to_zap)
+{
+	int rcu_idx;
+	struct kvm_mmu_page *sp;
+	bool flush = false;
+
+	rcu_idx = srcu_read_lock(&kvm->srcu);
+	write_lock(&kvm->mmu_lock);
+
+	/*
+	 * Zapping TDP MMU shadow pages, including the remote TLB flush, must
+	 * be done under RCU protection, because the pages are freed via RCU
+	 * callback.
+	 */
+	rcu_read_lock();
+
+	for ( ; to_zap; --to_zap) {
+		if (list_empty(nx_huge_pages))
+			break;
+
+		/*
+		 * We use a separate list instead of just using active_mmu_pages
+		 * because the number of shadow pages that can be replaced with
+		 * an NX huge page is expected to be relatively small compared
+		 * to the total number of shadow pages.  And because the TDP MMU
+		 * doesn't use active_mmu_pages.
+		 */
+		sp = list_first_entry(nx_huge_pages,
+				      struct kvm_mmu_page,
+				      possible_nx_huge_page_link);
+		WARN_ON_ONCE(!sp->nx_huge_page_disallowed);
+		WARN_ON_ONCE(!sp->role.direct);
+
+		/*
+		 * Unaccount and do not attempt to recover any NX Huge Pages
+		 * that are being dirty tracked, as they would just be faulted
+		 * back in as 4KiB pages. The NX Huge Pages in this slot will be
+		 * recovered, along with all the other huge pages in the slot,
+		 * when dirty logging is disabled.
+		 */
+		if (kvm_mmu_sp_dirty_logging_enabled(kvm, sp))
+			unaccount_nx_huge_page(kvm, sp);
+		else
+			flush |= kvm_tdp_mmu_zap_sp(kvm, sp);
+		WARN_ON_ONCE(sp->nx_huge_page_disallowed);
+
+		if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
+			if (flush)
+				kvm_flush_remote_tlbs(kvm);
+			rcu_read_unlock();
+
+			cond_resched_rwlock_write(&kvm->mmu_lock);
+			flush = false;
+
+			rcu_read_lock();
+		}
+	}
+
+	if (flush)
+		kvm_flush_remote_tlbs(kvm);
+	rcu_read_unlock();
+
+	write_unlock(&kvm->mmu_lock);
+	srcu_read_unlock(&kvm->srcu, rcu_idx);
+}
diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h
index 95290fd6154e..4036552f40cd 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.h
+++ b/arch/x86/kvm/mmu/tdp_mmu.h
@@ -68,6 +68,9 @@ u64 *kvm_tdp_mmu_fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, gfn_t gfn,
 					u64 *spte);
 
 unsigned long kvm_tdp_mmu_nx_huge_pages_to_zap(struct kvm *kvm);
+void kvm_tdp_mmu_recover_nx_huge_pages(struct kvm *kvm,
+				   struct list_head *nx_huge_pages,
+				   unsigned long to_zap);
 
 #ifdef CONFIG_X86_64
 static inline bool is_tdp_mmu_page(struct kvm_mmu_page *sp) { return sp->tdp_mmu_page; }
-- 
2.46.0.469.g59c65b2a67-goog