linux-kernel - [PATCH] kvm: x86/mmu: Remove FNAME(is_self_change

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [thread-next>] [day] [month] [year] [list]
Message-Id: <20221213125538.81209-1-jiangshanlai@gmail.com>
Date:   Tue, 13 Dec 2022 20:55:38 +0800
From:   Lai Jiangshan <jiangshanlai@...il.com>
To:     linux-kernel@...r.kernel.org
Cc:     Paolo Bonzini <pbonzini@...hat.com>,
        Sean Christopherson <seanjc@...gle.com>,
        Lai Jiangshan <jiangshan.ljs@...group.com>,
        Thomas Gleixner <tglx@...utronix.de>,
        Ingo Molnar <mingo@...hat.com>, Borislav Petkov <bp@...en8.de>,
        Dave Hansen <dave.hansen@...ux.intel.com>, x86@...nel.org,
        "H. Peter Anvin" <hpa@...or.com>, kvm@...r.kernel.org
Subject: [PATCH] kvm: x86/mmu: Remove FNAME(is_self_change_mapping)

From: Lai Jiangshan <jiangshan.ljs@...group.com>

FNAME(is_self_change_mapping) has two functionalities.

  If the fault is on a huge page but at least one of the pagetable on
  the walk is also on the terminal huge page, disable the huge page
  mapping for the fault.

  If the fault is modifying at least one of the pagetable on the walk,
  set something to tell the emulator.

The first functionality is much better handled by kvm_mmu_hugepage_adjust()
now, and it has a defect that it blindly disables the huge page mapping
rather than trying to reduce the size of the huge page first.

Huang Hang reported that when a guest is writing to a 1G page, but
only a 4K page is mapped because of the first functionality in a case
in which we think a 2M page should be mapped.  The 1G page includes
a pagetable on the pagetable-walk, but the narrowed 2M page doesn't.

To fix the problem, remove FNAME(is_self_change_mapping) for its first
functionality is already and better handled by kvm_mmu_hugepage_adjust(),
and re-implement the second functionality in FNAME(fetch).

Reported-by: Huang Hang <hhuang@...ux.alibaba.com>
Signed-off-by: Lai Jiangshan <jiangshan.ljs@...group.com>
---
 arch/x86/kvm/mmu/paging_tmpl.h | 66 ++++++++--------------------------
 1 file changed, 15 insertions(+), 51 deletions(-)

diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h
index 8b83abf1d8bc..c69e30737cd2 100644
--- a/arch/x86/kvm/mmu/paging_tmpl.h
+++ b/arch/x86/kvm/mmu/paging_tmpl.h
@@ -630,6 +630,13 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
 	top_level = vcpu->arch.mmu->cpu_role.base.level;
 	if (top_level == PT32E_ROOT_LEVEL)
 		top_level = PT32_ROOT_LEVEL;
+
+	/*
+	 * write_fault_to_shadow_pgtable will be set if the fault gfn is
+	 * currently used as its pagetable on the path of the pagetable walk.
+	 */
+	vcpu->arch.write_fault_to_shadow_pgtable = false;
+
 	/*
 	 * Verify that the top-level gpte is still there.  Since the page
 	 * is a root page, it is either write protected (and cannot be
@@ -685,8 +692,15 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
 
 		if (sp != ERR_PTR(-EEXIST))
 			link_shadow_page(vcpu, it.sptep, sp);
+
+		if (fault->write && table_gfn == fault->gfn)
+			vcpu->arch.write_fault_to_shadow_pgtable = true;
 	}
 
+	/*
+	 * Adjust huge page after getting non-direct shadow page which might
+	 * affect the huge page info.
+	 */
 	kvm_mmu_hugepage_adjust(vcpu, fault);
 
 	trace_kvm_mmu_spte_requested(fault);
@@ -733,46 +747,6 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
 	return RET_PF_RETRY;
 }
 
- /*
- * To see whether the mapped gfn can write its page table in the current
- * mapping.
- *
- * It is the helper function of FNAME(page_fault). When guest uses large page
- * size to map the writable gfn which is used as current page table, we should
- * force kvm to use small page size to map it because new shadow page will be
- * created when kvm establishes shadow page table that stop kvm using large
- * page size. Do it early can avoid unnecessary #PF and emulation.
- *
- * @write_fault_to_shadow_pgtable will return true if the fault gfn is
- * currently used as its page table.
- *
- * Note: the PDPT page table is not checked for PAE-32 bit guest. It is ok
- * since the PDPT is always shadowed, that means, we can not use large page
- * size to map the gfn which is used as PDPT.
- */
-static bool
-FNAME(is_self_change_mapping)(struct kvm_vcpu *vcpu,
-			      struct guest_walker *walker, bool user_fault,
-			      bool *write_fault_to_shadow_pgtable)
-{
-	int level;
-	gfn_t mask = ~(KVM_PAGES_PER_HPAGE(walker->level) - 1);
-	bool self_changed = false;
-
-	if (!(walker->pte_access & ACC_WRITE_MASK ||
-	    (!is_cr0_wp(vcpu->arch.mmu) && !user_fault)))
-		return false;
-
-	for (level = walker->level; level <= walker->max_level; level++) {
-		gfn_t gfn = walker->gfn ^ walker->table_gfn[level - 1];
-
-		self_changed |= !(gfn & mask);
-		*write_fault_to_shadow_pgtable |= !gfn;
-	}
-
-	return self_changed;
-}
-
 /*
  * Page fault handler.  There are several causes for a page fault:
  *   - there is no shadow pte for the guest pte
@@ -791,7 +765,6 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
 {
 	struct guest_walker walker;
 	int r;
-	bool is_self_change_mapping;
 
 	pgprintk("%s: addr %lx err %x\n", __func__, fault->addr, fault->error_code);
 	WARN_ON_ONCE(fault->is_tdp);
@@ -816,6 +789,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
 	}
 
 	fault->gfn = walker.gfn;
+	fault->max_level = walker.level;
 	fault->slot = kvm_vcpu_gfn_to_memslot(vcpu, fault->gfn);
 
 	if (page_fault_handle_page_track(vcpu, fault)) {
@@ -827,16 +801,6 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
 	if (r)
 		return r;
 
-	vcpu->arch.write_fault_to_shadow_pgtable = false;
-
-	is_self_change_mapping = FNAME(is_self_change_mapping)(vcpu,
-	      &walker, fault->user, &vcpu->arch.write_fault_to_shadow_pgtable);
-
-	if (is_self_change_mapping)
-		fault->max_level = PG_LEVEL_4K;
-	else
-		fault->max_level = walker.level;
-
 	r = kvm_faultin_pfn(vcpu, fault, walker.pte_access);
 	if (r != RET_PF_CONTINUE)
 		return r;
-- 
2.19.1.6.gb485710b