[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <20220330132152.4568-4-jiangshanlai@gmail.com>
Date: Wed, 30 Mar 2022 21:21:51 +0800
From: Lai Jiangshan <jiangshanlai@...il.com>
To: linux-kernel@...r.kernel.org, kvm@...r.kernel.org,
Paolo Bonzini <pbonzini@...hat.com>,
Sean Christopherson <seanjc@...gle.com>
Cc: Lai Jiangshan <jiangshan.ljs@...group.com>,
Jonathan Corbet <corbet@....net>,
Vitaly Kuznetsov <vkuznets@...hat.com>,
Wanpeng Li <wanpengli@...cent.com>,
Jim Mattson <jmattson@...gle.com>,
Joerg Roedel <joro@...tes.org>,
Thomas Gleixner <tglx@...utronix.de>,
Ingo Molnar <mingo@...hat.com>, Borislav Petkov <bp@...en8.de>,
Dave Hansen <dave.hansen@...ux.intel.com>, x86@...nel.org,
"H. Peter Anvin" <hpa@...or.com>, linux-doc@...r.kernel.org
Subject: [RFC PATCH V3 3/4] KVM: X86: Alloc role.pae_root shadow page
From: Lai Jiangshan <jiangshan.ljs@...group.com>
Currently pae_root is a special root page. This patch adds a facility that
allows kvm_mmu_get_page() to be used to allocate a pae_root shadow page.
When kvm_mmu_get_page() is called for role.level == PT32E_ROOT_LEVEL and
vcpu->arch.mmu->shadow_root_level == PT32E_ROOT_LEVEL, it will get a
PAE root pagetable and set role.pae_root=1 for freeing.
The role.pae_root bit is needed in the page role because:
o PAE roots must be allocated below 4gb (for kvm_mmu_get_page())
o PAE roots can not be encrypted (for kvm_mmu_get_page())
o Must be re-encrypted when freeing (for kvm_mmu_free_page())
o PAE root's PDPTE is special (for link_shadow_page())
o The decrypted low-address pagetable must not be shared with non-PAE-root
ones, or vice versa. (for kvm_mmu_get_page(), the crucial reason)
The uses of role.pae_root in both link_shadow_page() and kvm_mmu_get_page()
could possibly be changed to use shadow_root_level and role.level instead.
But kvm_mmu_free_page() can't use vcpu->arch.mmu->shadow_root_level.
PAE roots must be allocated below 4gb (CR3 has only 32 bits). So a
cache is introduced (mmu_pae_root_cache).
No functional change, since this code is not activated yet: when
vcpu->arch.mmu->shadow_root_level == PT32E_ROOT_LEVEL, kvm_mmu_get_page()
is currently only called for level == 1 or 2.
Signed-off-by: Lai Jiangshan <jiangshan.ljs@...group.com>
---
Documentation/virt/kvm/mmu.rst | 2 +
arch/x86/include/asm/kvm_host.h | 9 +++-
arch/x86/kvm/mmu/mmu.c | 78 +++++++++++++++++++++++++++++++--
arch/x86/kvm/mmu/paging_tmpl.h | 1 +
4 files changed, 86 insertions(+), 4 deletions(-)
diff --git a/Documentation/virt/kvm/mmu.rst b/Documentation/virt/kvm/mmu.rst
index dee0e96d694a..800f1eba55b3 100644
--- a/Documentation/virt/kvm/mmu.rst
+++ b/Documentation/virt/kvm/mmu.rst
@@ -209,6 +209,8 @@ Shadow pages contain the following information:
top with role.glevel = guest paging level and acks as passthrough sp
and its contents are specially installed rather than the translations
of the corresponding guest pagetable.
+ role.pae_root:
+ Is 1 if it is a PAE root.
gfn:
Either the guest page table containing the translations shadowed by this
page, or the base page frame for linear translations. See role.direct.
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 67e1bccaf472..658c493e7617 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -313,6 +313,11 @@ struct kvm_kernel_irq_routing_entry;
* - on top of this, smep_andnot_wp and smap_andnot_wp are only set if
* cr0_wp=0, therefore these three bits only give rise to 5 possibilities.
*
+ * - pae_root can only be set when level=3, so combinations for level and
+ * pae_root can be seen as 2/3/3-page_root/4/5, a.k.a 5 possibilities.
+ * Combined with cr0_wp, smep_andnot_wp and smap_andnot_wp, it will be
+ * 5X5 = 25 < 2^5.
+ *
* Therefore, the maximum number of possible upper-level shadow pages for a
* single gfn is a bit less than 2^15.
*/
@@ -332,7 +337,8 @@ union kvm_mmu_page_role {
unsigned ad_disabled:1;
unsigned guest_mode:1;
unsigned glevel:4;
- unsigned :2;
+ unsigned pae_root:1;
+ unsigned :1;
/*
* This is left at the top of the word so that
@@ -699,6 +705,7 @@ struct kvm_vcpu_arch {
struct kvm_mmu_memory_cache mmu_shadow_page_cache;
struct kvm_mmu_memory_cache mmu_gfn_array_cache;
struct kvm_mmu_memory_cache mmu_page_header_cache;
+ void *mmu_pae_root_cache;
/*
* QEMU userspace and the guest each have their own FPU state.
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index d53037df8177..81ccaa7c1165 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -694,6 +694,35 @@ static void walk_shadow_page_lockless_end(struct kvm_vcpu *vcpu)
}
}
+static int mmu_topup_pae_root_cache(struct kvm_vcpu *vcpu)
+{
+ struct page *page;
+
+ if (vcpu->arch.mmu->shadow_root_level != PT32E_ROOT_LEVEL)
+ return 0;
+ if (vcpu->arch.mmu_pae_root_cache)
+ return 0;
+
+ page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_DMA32);
+ if (!page)
+ return -ENOMEM;
+ vcpu->arch.mmu_pae_root_cache = page_address(page);
+
+ /*
+ * CR3 is only 32 bits when PAE paging is used, thus it's impossible to
+ * get the CPU to treat the PDPTEs as encrypted. Decrypt the page so
+ * that KVM's writes and the CPU's reads get along. Note, this is
+ * only necessary when using shadow paging, as 64-bit NPT can get at
+ * the C-bit even when shadowing 32-bit NPT, and SME isn't supported
+ * by 32-bit kernels (when KVM itself uses 32-bit NPT).
+ */
+ if (!tdp_enabled)
+ set_memory_decrypted((unsigned long)vcpu->arch.mmu_pae_root_cache, 1);
+ else
+ WARN_ON_ONCE(shadow_me_mask);
+ return 0;
+}
+
static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu, bool maybe_indirect)
{
int r;
@@ -705,6 +734,9 @@ static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu, bool maybe_indirect)
return r;
r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_shadow_page_cache,
PT64_ROOT_MAX_LEVEL);
+ if (r)
+ return r;
+ r = mmu_topup_pae_root_cache(vcpu);
if (r)
return r;
if (maybe_indirect) {
@@ -717,12 +749,23 @@ static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu, bool maybe_indirect)
PT64_ROOT_MAX_LEVEL);
}
+static void mmu_free_pae_root(void *root_pt)
+{
+ if (!tdp_enabled)
+ set_memory_encrypted((unsigned long)root_pt, 1);
+ free_page((unsigned long)root_pt);
+}
+
static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
{
kvm_mmu_free_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache);
kvm_mmu_free_memory_cache(&vcpu->arch.mmu_shadow_page_cache);
kvm_mmu_free_memory_cache(&vcpu->arch.mmu_gfn_array_cache);
kvm_mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache);
+ if (vcpu->arch.mmu_pae_root_cache) {
+ mmu_free_pae_root(vcpu->arch.mmu_pae_root_cache);
+ vcpu->arch.mmu_pae_root_cache = NULL;
+ }
}
static struct pte_list_desc *mmu_alloc_pte_list_desc(struct kvm_vcpu *vcpu)
@@ -1682,7 +1725,10 @@ static void kvm_mmu_free_page(struct kvm_mmu_page *sp)
MMU_WARN_ON(!is_empty_shadow_page(sp->spt));
hlist_del(&sp->hash_link);
list_del(&sp->link);
- free_page((unsigned long)sp->spt);
+ if (sp->role.pae_root)
+ mmu_free_pae_root(sp->spt);
+ else
+ free_page((unsigned long)sp->spt);
free_page((unsigned long)sp->gfns);
kmem_cache_free(mmu_page_header_cache, sp);
}
@@ -1720,7 +1766,12 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, gfn_t gfn,
struct kvm_mmu_page *sp;
sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
- sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache);
+ if (!role.pae_root) {
+ sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache);
+ } else {
+ sp->spt = vcpu->arch.mmu_pae_root_cache;
+ vcpu->arch.mmu_pae_root_cache = NULL;
+ }
if (role.glevel == role.level)
sp->gfns = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_gfn_array_cache);
set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
@@ -2064,6 +2115,8 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
}
if (level < role.glevel)
role.glevel = level;
+ if (level != PT32E_ROOT_LEVEL)
+ role.pae_root = 0;
sp_list = &vcpu->kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)];
for_each_valid_sp(vcpu->kvm, sp, sp_list) {
@@ -2199,14 +2252,26 @@ static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator)
__shadow_walk_next(iterator, *iterator->sptep);
}
+static u64 make_pae_pdpte(u64 *child_pt)
+{
+ /* The only ignore bits in PDPTE are 11:9. */
+ BUILD_BUG_ON(!(GENMASK(11,9) & SPTE_MMU_PRESENT_MASK));
+ return __pa(child_pt) | PT_PRESENT_MASK | SPTE_MMU_PRESENT_MASK |
+ shadow_me_mask;
+}
+
static void link_shadow_page(struct kvm_vcpu *vcpu, u64 *sptep,
struct kvm_mmu_page *sp)
{
+ struct kvm_mmu_page *parent_sp = sptep_to_sp(sptep);
u64 spte;
BUILD_BUG_ON(VMX_EPT_WRITABLE_MASK != PT_WRITABLE_MASK);
- spte = make_nonleaf_spte(sp->spt, sp_ad_disabled(sp));
+ if (!parent_sp->role.pae_root)
+ spte = make_nonleaf_spte(sp->spt, sp_ad_disabled(sp));
+ else
+ spte = make_pae_pdpte(sp->spt);
mmu_spte_set(sptep, spte);
@@ -4782,6 +4847,8 @@ kvm_calc_tdp_mmu_root_page_role(struct kvm_vcpu *vcpu,
role.base.level = kvm_mmu_get_tdp_level(vcpu);
role.base.direct = true;
role.base.has_4_byte_gpte = false;
+ if (role.base.level == PT32E_ROOT_LEVEL)
+ role.base.pae_root = 1;
return role;
}
@@ -4848,6 +4915,9 @@ kvm_calc_shadow_mmu_root_page_role(struct kvm_vcpu *vcpu,
else
role.base.level = PT64_ROOT_4LEVEL;
+ if (role.base.level == PT32E_ROOT_LEVEL)
+ role.base.pae_root = 1;
+
return role;
}
@@ -4893,6 +4963,8 @@ kvm_calc_shadow_npt_root_page_role(struct kvm_vcpu *vcpu,
role.base.direct = false;
role.base.level = kvm_mmu_get_tdp_level(vcpu);
+ if (role.base.level == PT32E_ROOT_LEVEL)
+ role.base.pae_root = 1;
return role;
}
diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h
index 67489a060eba..1015f33e0758 100644
--- a/arch/x86/kvm/mmu/paging_tmpl.h
+++ b/arch/x86/kvm/mmu/paging_tmpl.h
@@ -1043,6 +1043,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
.access = 0x7,
.quadrant = 0x3,
.glevel = 0xf,
+ .pae_root = 0x1,
};
/*
--
2.19.1.6.gb485710b
Powered by blists - more mailing lists