Message-ID: <Y6NRJTboZnjKbAL7@google.com>
Date:   Wed, 21 Dec 2022 18:32:05 +0000
From:   Sean Christopherson <seanjc@...gle.com>
To:     David Matlack <dmatlack@...gle.com>
Cc:     Paolo Bonzini <pbonzini@...hat.com>, kvm@...r.kernel.org,
        linux-kernel@...r.kernel.org,
        Robert Hoo <robert.hu@...ux.intel.com>,
        Greg Thelen <gthelen@...gle.com>,
        Ben Gardon <bgardon@...gle.com>,
        Mingwei Zhang <mizhang@...gle.com>
Subject: Re: [PATCH 5/5] KVM: x86/mmu: Move kvm_tdp_mmu_map()'s prolog and
 epilog to its caller

On Tue, Dec 20, 2022, David Matlack wrote:
> On Tue, Dec 13, 2022 at 03:30:30AM +0000, Sean Christopherson wrote:
> > Move the hugepage adjust, tracepoint, and RCU (un)lock logic out of
> > kvm_tdp_mmu_map() and into its sole caller, kvm_tdp_mmu_page_fault(), to
> > eliminate the gotos used to bounce through rcu_read_unlock() when bailing
> > from the walk.
> > 
> > Opportunistically mark kvm_mmu_hugepage_adjust() as static as
> > kvm_tdp_mmu_map() was the only external user.
> > 
> > No functional change intended.
> > 
> > Signed-off-by: Sean Christopherson <seanjc@...gle.com>
> > ---
> >  arch/x86/kvm/mmu/mmu.c          |  9 ++++++++-
> >  arch/x86/kvm/mmu/mmu_internal.h |  1 -
> >  arch/x86/kvm/mmu/tdp_mmu.c      | 22 ++++------------------
> >  3 files changed, 12 insertions(+), 20 deletions(-)
> > 
> > diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> > index 254bc46234e0..99c40617d325 100644
> > --- a/arch/x86/kvm/mmu/mmu.c
> > +++ b/arch/x86/kvm/mmu/mmu.c
> > @@ -3085,7 +3085,8 @@ int kvm_mmu_max_mapping_level(struct kvm *kvm,
> >  	return min(host_level, max_level);
> >  }
> >  
> > -void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
> > +static void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu,
> > +				    struct kvm_page_fault *fault)
> >  {
> >  	struct kvm_memory_slot *slot = fault->slot;
> >  	kvm_pfn_t mask;
> > @@ -4405,7 +4406,13 @@ static int kvm_tdp_mmu_page_fault(struct kvm_vcpu *vcpu,
> >  	if (is_page_fault_stale(vcpu, fault))
> >  		goto out_unlock;
> >  
> > +	kvm_mmu_hugepage_adjust(vcpu, fault);
> 
> Can you also move the call to kvm_mmu_hugepage_adjust() from
> direct_map() to direct_page_fault()? I do think it's worth the
> maintenance burden to keep those functions consistent.

Sure.
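
E.g. a rough, untested sketch of that move (the same hunks show up again in the
larger diff below):

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ ... @@ static int direct_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
 	int ret;
 	gfn_t base_gfn = fault->gfn;
 
-	kvm_mmu_hugepage_adjust(vcpu, fault);
-
-	trace_kvm_mmu_spte_requested(fault);
 	for_each_shadow_entry(vcpu, fault->addr, it) {
@@ ... @@ static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
 	if (r)
 		goto out_unlock;
 
+	kvm_mmu_hugepage_adjust(vcpu, fault);
+
+	trace_kvm_mmu_spte_requested(fault);
+
 	r = direct_map(vcpu, fault);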

> > +	trace_kvm_mmu_spte_requested(fault);
> > +
> > +	rcu_read_lock();
> >  	r = kvm_tdp_mmu_map(vcpu, fault);
> > +	rcu_read_unlock();
> 
> I would prefer to keep these in tdp_mmu.c, to reduce the amount of TDP
> MMU details that bleed into mmu.c (RCU) and for consistency with other
> TDP MMU APIs that don't require the caller to acquire RCU.  This will
> also be helpful for the Common MMU, as the tracepoint and RCU will be
> common.
> 
> e.g.
> 
> static int __kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
> {
> 	...
> }
> 
> int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
> {
> 	int r;
> 
> 	trace_kvm_mmu_spte_requested(fault);
> 
> 	rcu_read_lock();
> 	r = __kvm_tdp_mmu_map(vcpu, fault);
> 	rcu_read_unlock();
> 
> 	return r;
> }

I did that originally, but it felt really silly to have the trivial wrapper, especially
because mmu.c already has TDP MMU details, e.g. kvm_tdp_mmu_page_fault() takes mmu_lock
for read and other flows acquire rcu_read_lock() to protect the TDP MMU.

What about the below (split into multiple patches) instead?  kvm_tdp_mmu_page_fault()
really should live in tdp_mmu.c; the only reason it's in mmu.c is to get at various
helpers, e.g. fast_page_fault() and kvm_faultin_pfn().

Or is that doomed to fail because the TDP MMU will want to add code before
kvm_faultin_pfn()?  (I can't remember what motivated splitting out
kvm_tdp_mmu_page_fault() in the first place.)
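
E.g., purely hypothetically (kvm_tdp_mmu_adjust_fault() is made up and doesn't
exist), a TDP-MMU-only step that has to run before kvm_faultin_pfn() would drag
TDP MMU details right back into the shared prologue:

	r = mmu_topup_memory_caches(vcpu, false);
	if (r)
		return r;

#ifdef CONFIG_X86_64
	/* Hypothetical helper, does not exist today. */
	if (tdp_mmu_enabled)
		kvm_tdp_mmu_adjust_fault(vcpu, fault);
#endif

	r = kvm_faultin_pfn(vcpu, fault, ACC_ALL);
	if (r != RET_PF_CONTINUE)
		return r;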

---
 arch/x86/kvm/mmu/mmu.c          | 132 ++++++++------------------------
 arch/x86/kvm/mmu/mmu_internal.h |  50 ++++++++++++
 arch/x86/kvm/mmu/spte.h         |   7 --
 arch/x86/kvm/mmu/tdp_mmu.c      |  41 ++++++----
 arch/x86/kvm/mmu/tdp_mmu.h      |   8 +-
 5 files changed, 108 insertions(+), 130 deletions(-)

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 254bc46234e0..8203b1dd2753 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -1927,16 +1927,6 @@ static bool kvm_mmu_remote_flush_or_zap(struct kvm *kvm,
 	return true;
 }
 
-static bool is_obsolete_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
-{
-	if (sp->role.invalid)
-		return true;
-
-	/* TDP MMU pages do not use the MMU generation. */
-	return !is_tdp_mmu_page(sp) &&
-	       unlikely(sp->mmu_valid_gen != kvm->arch.mmu_valid_gen);
-}
-
 struct mmu_page_path {
 	struct kvm_mmu_page *parent[PT64_ROOT_MAX_LEVEL];
 	unsigned int idx[PT64_ROOT_MAX_LEVEL];
@@ -3148,9 +3138,6 @@ static int direct_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
 	int ret;
 	gfn_t base_gfn = fault->gfn;
 
-	kvm_mmu_hugepage_adjust(vcpu, fault);
-
-	trace_kvm_mmu_spte_requested(fault);
 	for_each_shadow_entry(vcpu, fault->addr, it) {
 		/*
 		 * We cannot overwrite existing page tables with an NX
@@ -4270,54 +4257,10 @@ static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
 	return RET_PF_CONTINUE;
 }
 
-/*
- * Returns true if the page fault is stale and needs to be retried, i.e. if the
- * root was invalidated by a memslot update or a relevant mmu_notifier fired.
- */
-static bool is_page_fault_stale(struct kvm_vcpu *vcpu,
-				struct kvm_page_fault *fault)
+static int __direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
 {
-	struct kvm_mmu_page *sp = to_shadow_page(vcpu->arch.mmu->root.hpa);
+	int r = RET_PF_RETRY;
 
-	/* Special roots, e.g. pae_root, are not backed by shadow pages. */
-	if (sp && is_obsolete_sp(vcpu->kvm, sp))
-		return true;
-
-	/*
-	 * Roots without an associated shadow page are considered invalid if
-	 * there is a pending request to free obsolete roots.  The request is
-	 * only a hint that the current root _may_ be obsolete and needs to be
-	 * reloaded, e.g. if the guest frees a PGD that KVM is tracking as a
-	 * previous root, then __kvm_mmu_prepare_zap_page() signals all vCPUs
-	 * to reload even if no vCPU is actively using the root.
-	 */
-	if (!sp && kvm_test_request(KVM_REQ_MMU_FREE_OBSOLETE_ROOTS, vcpu))
-		return true;
-
-	return fault->slot &&
-	       mmu_invalidate_retry_hva(vcpu->kvm, fault->mmu_seq, fault->hva);
-}
-
-static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
-{
-	int r;
-
-	if (page_fault_handle_page_track(vcpu, fault))
-		return RET_PF_EMULATE;
-
-	r = fast_page_fault(vcpu, fault);
-	if (r != RET_PF_INVALID)
-		return r;
-
-	r = mmu_topup_memory_caches(vcpu, false);
-	if (r)
-		return r;
-
-	r = kvm_faultin_pfn(vcpu, fault, ACC_ALL);
-	if (r != RET_PF_CONTINUE)
-		return r;
-
-	r = RET_PF_RETRY;
 	write_lock(&vcpu->kvm->mmu_lock);
 
 	if (is_page_fault_stale(vcpu, fault))
@@ -4327,6 +4270,10 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
 	if (r)
 		goto out_unlock;
 
+	kvm_mmu_hugepage_adjust(vcpu, fault);
+
+	trace_kvm_mmu_spte_requested(fault);
+
 	r = direct_map(vcpu, fault);
 
 out_unlock:
@@ -4335,6 +4282,32 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
 	return r;
 }
 
+static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
+{
+	int r;
+
+	if (page_fault_handle_page_track(vcpu, fault))
+		return RET_PF_EMULATE;
+
+	r = fast_page_fault(vcpu, fault);
+	if (r != RET_PF_INVALID)
+		return r;
+
+	r = mmu_topup_memory_caches(vcpu, false);
+	if (r)
+		return r;
+
+	r = kvm_faultin_pfn(vcpu, fault, ACC_ALL);
+	if (r != RET_PF_CONTINUE)
+		return r;
+
+#ifdef CONFIG_X86_64
+	if (tdp_mmu_enabled)
+		return kvm_tdp_mmu_page_fault(vcpu, fault);
+#endif
+	return __direct_page_fault(vcpu, fault);
+}
+
 static int nonpaging_page_fault(struct kvm_vcpu *vcpu,
 				struct kvm_page_fault *fault)
 {
@@ -4378,42 +4351,6 @@ int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code,
 }
 EXPORT_SYMBOL_GPL(kvm_handle_page_fault);
 
-#ifdef CONFIG_X86_64
-static int kvm_tdp_mmu_page_fault(struct kvm_vcpu *vcpu,
-				  struct kvm_page_fault *fault)
-{
-	int r;
-
-	if (page_fault_handle_page_track(vcpu, fault))
-		return RET_PF_EMULATE;
-
-	r = fast_page_fault(vcpu, fault);
-	if (r != RET_PF_INVALID)
-		return r;
-
-	r = mmu_topup_memory_caches(vcpu, false);
-	if (r)
-		return r;
-
-	r = kvm_faultin_pfn(vcpu, fault, ACC_ALL);
-	if (r != RET_PF_CONTINUE)
-		return r;
-
-	r = RET_PF_RETRY;
-	read_lock(&vcpu->kvm->mmu_lock);
-
-	if (is_page_fault_stale(vcpu, fault))
-		goto out_unlock;
-
-	r = kvm_tdp_mmu_map(vcpu, fault);
-
-out_unlock:
-	read_unlock(&vcpu->kvm->mmu_lock);
-	kvm_release_pfn_clean(fault->pfn);
-	return r;
-}
-#endif
-
 int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
 {
 	/*
@@ -4438,11 +4375,6 @@ int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
 		}
 	}
 
-#ifdef CONFIG_X86_64
-	if (tdp_mmu_enabled)
-		return kvm_tdp_mmu_page_fault(vcpu, fault);
-#endif
-
 	return direct_page_fault(vcpu, fault);
 }
 
diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h
index ac00bfbf32f6..2c7c2b49f719 100644
--- a/arch/x86/kvm/mmu/mmu_internal.h
+++ b/arch/x86/kvm/mmu/mmu_internal.h
@@ -133,6 +133,28 @@ struct kvm_mmu_page {
 
 extern struct kmem_cache *mmu_page_header_cache;
 
+static inline struct kvm_mmu_page *to_shadow_page(hpa_t shadow_page)
+{
+	struct page *page = pfn_to_page((shadow_page) >> PAGE_SHIFT);
+
+	return (struct kvm_mmu_page *)page_private(page);
+}
+
+static inline bool is_tdp_mmu_page(struct kvm_mmu_page *sp)
+{
+	return IS_ENABLED(CONFIG_X86_64) && sp->tdp_mmu_page;
+}
+
+static inline bool is_obsolete_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+	if (sp->role.invalid)
+		return true;
+
+	/* TDP MMU pages do not use the MMU generation. */
+	return !is_tdp_mmu_page(sp) &&
+	       unlikely(sp->mmu_valid_gen != kvm->arch.mmu_valid_gen);
+}
+
 static inline int kvm_mmu_role_as_id(union kvm_mmu_page_role role)
 {
 	return role.smm ? 1 : 0;
@@ -314,6 +336,34 @@ static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
 	return r;
 }
 
+/*
+ * Returns true if the page fault is stale and needs to be retried, i.e. if the
+ * root was invalidated by a memslot update or a relevant mmu_notifier fired.
+ */
+static inline bool is_page_fault_stale(struct kvm_vcpu *vcpu,
+				       struct kvm_page_fault *fault)
+{
+	struct kvm_mmu_page *sp = to_shadow_page(vcpu->arch.mmu->root.hpa);
+
+	/* Special roots, e.g. pae_root, are not backed by shadow pages. */
+	if (sp && is_obsolete_sp(vcpu->kvm, sp))
+		return true;
+
+	/*
+	 * Roots without an associated shadow page are considered invalid if
+	 * there is a pending request to free obsolete roots.  The request is
+	 * only a hint that the current root _may_ be obsolete and needs to be
+	 * reloaded, e.g. if the guest frees a PGD that KVM is tracking as a
+	 * previous root, then __kvm_mmu_prepare_zap_page() signals all vCPUs
+	 * to reload even if no vCPU is actively using the root.
+	 */
+	if (!sp && kvm_test_request(KVM_REQ_MMU_FREE_OBSOLETE_ROOTS, vcpu))
+		return true;
+
+	return fault->slot &&
+	       mmu_invalidate_retry_hva(vcpu->kvm, fault->mmu_seq, fault->hva);
+}
+
 int kvm_mmu_max_mapping_level(struct kvm *kvm,
 			      const struct kvm_memory_slot *slot, gfn_t gfn,
 			      int max_level);
diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h
index 1f03701b943a..23e8f8c152b5 100644
--- a/arch/x86/kvm/mmu/spte.h
+++ b/arch/x86/kvm/mmu/spte.h
@@ -219,13 +219,6 @@ static inline int spte_index(u64 *sptep)
  */
 extern u64 __read_mostly shadow_nonpresent_or_rsvd_lower_gfn_mask;
 
-static inline struct kvm_mmu_page *to_shadow_page(hpa_t shadow_page)
-{
-	struct page *page = pfn_to_page((shadow_page) >> PAGE_SHIFT);
-
-	return (struct kvm_mmu_page *)page_private(page);
-}
-
 static inline struct kvm_mmu_page *spte_to_child_sp(u64 spte)
 {
 	return to_shadow_page(spte & SPTE_BASE_ADDR_MASK);
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index cc1fb9a65620..4bb58c0f465b 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -1144,19 +1144,12 @@ static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter,
  * Handle a TDP page fault (NPT/EPT violation/misconfiguration) by installing
  * page tables and SPTEs to translate the faulting guest physical address.
  */
-int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
+static int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
 {
 	struct kvm_mmu *mmu = vcpu->arch.mmu;
 	struct kvm *kvm = vcpu->kvm;
 	struct tdp_iter iter;
 	struct kvm_mmu_page *sp;
-	int ret = RET_PF_RETRY;
-
-	kvm_mmu_hugepage_adjust(vcpu, fault);
-
-	trace_kvm_mmu_spte_requested(fault);
-
-	rcu_read_lock();
 
 	tdp_mmu_for_each_pte(iter, mmu, fault->gfn, fault->gfn + 1) {
 		int r;
@@ -1169,10 +1162,10 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
 		 * retry, avoiding unnecessary page table allocation and free.
 		 */
 		if (is_removed_spte(iter.old_spte))
-			goto retry;
+			return RET_PF_RETRY;
 
 		if (iter.level == fault->goal_level)
-			goto map_target_level;
+			return tdp_mmu_map_handle_target_level(vcpu, fault, &iter);
 
 		/* Step down into the lower level page table if it exists. */
 		if (is_shadow_present_pte(iter.old_spte) &&
@@ -1199,7 +1192,7 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
 		 */
 		if (r) {
 			tdp_mmu_free_sp(sp);
-			goto retry;
+			return RET_PF_RETRY;
 		}
 
 		if (fault->huge_page_disallowed &&
@@ -1216,14 +1209,30 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
 	 * iterator detected an upper level SPTE was frozen during traversal.
 	 */
 	WARN_ON_ONCE(iter.level == fault->goal_level);
-	goto retry;
+	return RET_PF_RETRY;
+}
 
-map_target_level:
-	ret = tdp_mmu_map_handle_target_level(vcpu, fault, &iter);
+int kvm_tdp_mmu_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
+{
+	int r = RET_PF_RETRY;
 
-retry:
+	read_lock(&vcpu->kvm->mmu_lock);
+
+	if (is_page_fault_stale(vcpu, fault))
+		goto out_unlock;
+
+	kvm_mmu_hugepage_adjust(vcpu, fault);
+
+	trace_kvm_mmu_spte_requested(fault);
+
+	rcu_read_lock();
+	r = kvm_tdp_mmu_map(vcpu, fault);
 	rcu_read_unlock();
-	return ret;
+
+out_unlock:
+	read_unlock(&vcpu->kvm->mmu_lock);
+	kvm_release_pfn_clean(fault->pfn);
+	return r;
 }
 
 bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range,
diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h
index 0a63b1afabd3..697dca948d0a 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.h
+++ b/arch/x86/kvm/mmu/tdp_mmu.h
@@ -27,7 +27,7 @@ void kvm_tdp_mmu_zap_all(struct kvm *kvm);
 void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm);
 void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm);
 
-int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault);
+int kvm_tdp_mmu_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault);
 
 bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range,
 				 bool flush);
@@ -70,10 +70,4 @@ int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
 u64 *kvm_tdp_mmu_fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, u64 addr,
 					u64 *spte);
 
-#ifdef CONFIG_X86_64
-static inline bool is_tdp_mmu_page(struct kvm_mmu_page *sp) { return sp->tdp_mmu_page; }
-#else
-static inline bool is_tdp_mmu_page(struct kvm_mmu_page *sp) { return false; }
-#endif
-
 #endif /* __KVM_X86_MMU_TDP_MMU_H */

base-commit: ffc4e70d9855921b740aee9bcbc8503cc753aee2
-- 
