linux-kernel - [PATCH V2 2/7] KVM: X86/MMU: Add special shadow pages

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <20220503150735.32723-3-jiangshanlai@gmail.com>
Date:   Tue,  3 May 2022 23:07:30 +0800
From:   Lai Jiangshan <jiangshanlai@...il.com>
To:     linux-kernel@...r.kernel.org, kvm@...r.kernel.org,
        Paolo Bonzini <pbonzini@...hat.com>,
        Sean Christopherson <seanjc@...gle.com>
Cc:     Lai Jiangshan <jiangshan.ljs@...group.com>,
        Vitaly Kuznetsov <vkuznets@...hat.com>,
        Wanpeng Li <wanpengli@...cent.com>,
        Jim Mattson <jmattson@...gle.com>,
        Joerg Roedel <joro@...tes.org>,
        Thomas Gleixner <tglx@...utronix.de>,
        Ingo Molnar <mingo@...hat.com>, Borislav Petkov <bp@...en8.de>,
        Dave Hansen <dave.hansen@...ux.intel.com>, x86@...nel.org,
        "H. Peter Anvin" <hpa@...or.com>
Subject: [PATCH V2 2/7] KVM: X86/MMU: Add special shadow pages

From: Lai Jiangshan <jiangshan.ljs@...group.com>

Special pages are pages that hold the PDPTEs for a 32bit guest, or the
higher level pages linked to a special page when shadowing NPT.

The current code uses mmu->pae_root, mmu->pml4_root, and mmu->pml5_root to
set up the special root.  The initialization code is complex and the roots
are not associated with struct kvm_mmu_page, which makes the code more
complex.

Add kvm_mmu_alloc_special_page() and mmu_free_special_root_page() to
allocate and free special shadow pages, in preparation for using special
shadow pages to replace the current logic and share most of the logic with
normal shadow pages.

The code is not activated since using_special_root_page() is false in
the place where it is inserted.

Signed-off-by: Lai Jiangshan <jiangshan.ljs@...group.com>
---
 arch/x86/kvm/mmu/mmu.c | 91 +++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 90 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 7f20796af351..126f0cd07f98 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -1719,6 +1719,58 @@ static bool using_special_root_page(struct kvm_mmu *mmu)
 		return mmu->cpu_role.base.level <= PT32E_ROOT_LEVEL;
 }
 
+/*
+ * Special pages are pages that hold the PAE PDPTEs for a 32bit guest, or the
+ * higher level pages linked to a special page when shadowing NPT.
+ *
+ * Special pages are specially allocated.  If sp->spt needs to be 32bit, it
+ * will use the preallocated mmu->pae_root.
+ *
+ * Special pages are only visible to local VCPU except through rmap from their
+ * children, so they are not in the kvm->arch.active_mmu_pages nor in the hash.
+ *
+ * And they are neither accounted nor write-protected since they don't have a
+ * gfn associated.
+ *
+ * Because of above, special pages can not be freed nor zapped like normal
+ * shadow pages.  They are freed directly when the special root is freed, see
+ * mmu_free_special_root_page().
+ *
+ * A special root page can not be put on mmu->prev_roots because the comparison
+ * must use PDPTEs instead of CR3, and mmu->pae_root can not be shared by
+ * multiple root pages.
+ *
+ * Except for the above limitations, all the other abilities are the same as
+ * for other shadow pages, like link, parent rmap, sync, unsync etc.
+ *
+ * Special pages can be obsoleted but might possibly be reused later.  When
+ * the obsoleting process is done, all the obsoleted shadow pages are unlinked
+ * from the special pages by the help of the parent rmap of the children and
+ * the special pages become theoretically valid again.  If there is no other
+ * event to cause a VCPU to free the root and the VCPU is being preempted by
+ * the host during two obsoleting processes, the VCPU can reuse its special
+ * pages when it is back.
+ */
+static struct kvm_mmu_page *kvm_mmu_alloc_special_page(struct kvm_vcpu *vcpu,
+		union kvm_mmu_page_role role)
+{
+	struct kvm_mmu_page *sp;
+
+	sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
+	sp->gfn = 0;
+	sp->role = role;
+	if (role.level == PT32E_ROOT_LEVEL &&
+	    vcpu->arch.mmu->root_role.level == PT32E_ROOT_LEVEL)
+		sp->spt = vcpu->arch.mmu->pae_root;
+	else
+		sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache);
+	/* sp->gfns is not used for special shadow page */
+	set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
+	sp->mmu_valid_gen = vcpu->kvm->arch.mmu_valid_gen;
+
+	return sp;
+}
+
 static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, int direct)
 {
 	struct kvm_mmu_page *sp;
@@ -2076,6 +2128,9 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
 	if (level <= vcpu->arch.mmu->cpu_role.base.level)
 		role.passthrough = 0;
 
+	if (unlikely(level >= PT32E_ROOT_LEVEL && using_special_root_page(vcpu->arch.mmu)))
+		return kvm_mmu_alloc_special_page(vcpu, role);
+
 	sp_list = &vcpu->kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)];
 	for_each_valid_sp(vcpu->kvm, sp, sp_list) {
 		if (sp->gfn != gfn) {
@@ -3290,6 +3345,37 @@ static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa,
 	*root_hpa = INVALID_PAGE;
 }
 
+static void mmu_free_special_root_page(struct kvm *kvm, struct kvm_mmu *mmu)
+{
+	u64 spte = mmu->root.hpa;
+	struct kvm_mmu_page *sp = to_shadow_page(spte & PT64_BASE_ADDR_MASK);
+	int i;
+
+	/* Free level 5 or 4 roots for shadow NPT for 32 bit L1 */
+	while (sp->role.level > PT32E_ROOT_LEVEL)
+	{
+		spte = sp->spt[0];
+		mmu_page_zap_pte(kvm, sp, sp->spt + 0, NULL);
+		free_page((unsigned long)sp->spt);
+		kmem_cache_free(mmu_page_header_cache, sp);
+		if (!is_shadow_present_pte(spte))
+			return;
+		sp = to_shadow_page(spte & PT64_BASE_ADDR_MASK);
+	}
+
+	if (WARN_ON_ONCE(sp->role.level != PT32E_ROOT_LEVEL))
+		return;
+
+	/* Free PAE roots */
+	for (i = 0; i < 4; i++)
+		mmu_page_zap_pte(kvm, sp, sp->spt + i, NULL);
+
+	if (sp->spt != mmu->pae_root)
+		free_page((unsigned long)sp->spt);
+
+	kmem_cache_free(mmu_page_header_cache, sp);
+}
+
 /* roots_to_free must be some combination of the KVM_MMU_ROOT_* flags */
 void kvm_mmu_free_roots(struct kvm *kvm, struct kvm_mmu *mmu,
 			ulong roots_to_free)
@@ -3323,7 +3409,10 @@ void kvm_mmu_free_roots(struct kvm *kvm, struct kvm_mmu *mmu,
 
 	if (free_active_root) {
 		if (to_shadow_page(mmu->root.hpa)) {
-			mmu_free_root_page(kvm, &mmu->root.hpa, &invalid_list);
+			if (using_special_root_page(mmu))
+				mmu_free_special_root_page(kvm, mmu);
+			else
+				mmu_free_root_page(kvm, &mmu->root.hpa, &invalid_list);
 		} else if (mmu->pae_root) {
 			for (i = 0; i < 4; ++i) {
 				if (!IS_VALID_PAE_ROOT(mmu->pae_root[i]))
-- 
2.19.1.6.gb485710b