Message-ID: <20260129011517.3545883-21-seanjc@google.com>
Date: Wed, 28 Jan 2026 17:14:52 -0800
From: Sean Christopherson <seanjc@...gle.com>
To: Thomas Gleixner <tglx@...nel.org>, Ingo Molnar <mingo@...hat.com>, Borislav Petkov <bp@...en8.de>,
Dave Hansen <dave.hansen@...ux.intel.com>, x86@...nel.org,
Kiryl Shutsemau <kas@...nel.org>, Sean Christopherson <seanjc@...gle.com>, Paolo Bonzini <pbonzini@...hat.com>
Cc: linux-kernel@...r.kernel.org, linux-coco@...ts.linux.dev,
kvm@...r.kernel.org, Kai Huang <kai.huang@...el.com>,
Rick Edgecombe <rick.p.edgecombe@...el.com>, Yan Zhao <yan.y.zhao@...el.com>,
Vishal Annapurve <vannapurve@...gle.com>, Ackerley Tng <ackerleytng@...gle.com>,
Sagi Shahar <sagis@...gle.com>, Binbin Wu <binbin.wu@...ux.intel.com>,
Xiaoyao Li <xiaoyao.li@...el.com>, Isaku Yamahata <isaku.yamahata@...el.com>
Subject: [RFC PATCH v5 20/45] KVM: x86/mmu: Allocate/free S-EPT pages using tdx_{alloc,free}_control_page()
Now that kvm_mmu_memory_cache supports custom page allocators, wire up the
S-EPT cache to use tdx_{alloc,free}_control_page() (arguably S-EPT pages
aren't "control" pages, but they're not guest pages either). Using the
TDX APIs will make S-EPT pages naturally play nice with Dynamic PAMT, by
virtue of adding/removing PAMT entries when S-EPT pages are allocated and
freed, as opposed to when they are added to/removed from the S-EPT tree.
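For illustration only (not part of this diff), the hooks could plausibly
tie PAMT maintenance to the page lifecycle as sketched below; the function
bodies are guesses, and the tdx_pamt_get()/tdx_pamt_put() helper names and
signatures are assumed from the Dynamic PAMT series:

  /* Hypothetical sketch of the allocator hooks, not the actual tdx.c code. */
  unsigned long tdx_alloc_control_page(gfp_t gfp)
  {
          struct page *page = alloc_page(gfp);

          if (!page)
                  return 0;

          /* Install the backing PAMT_4K entry at allocation time. */
          if (tdx_pamt_get(page)) {
                  __free_page(page);
                  return 0;
          }

          return (unsigned long)page_address(page);
  }

  void tdx_free_control_page(unsigned long addr)
  {
          struct page *page = virt_to_page((void *)addr);

          /* Drop the PAMT entry (refcount) when the page is freed. */
          tdx_pamt_put(page);
          __free_page(page);
  }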
Inserting PAMT entries on allocation does mean KVM will create unnecessary
PAMT entries, e.g. once a vCPU stops faulting in memory, the remaining
pages in the MMU cache will go unused. But in practice, odds are very good
that the containing 2MiB page will have other in-use S-EPT pages, i.e. the
PAMT entries will be created anyway. And _if_ creating PAMT entries on
allocation is problematic for memory consumption, that can be resolved by
tweaking KVM's cache size.
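For reference, the "cache size" mentioned above is the min fill level the
cache is topped up to before each fault; a minimal sketch of that knob,
using a hypothetical wrapper around what mmu_topup_memory_caches() does
today (exact call site and fill level may differ by kernel version):

  /* Hypothetical helper; mirrors the existing top-up of the S-EPT cache. */
  static int topup_external_spt_cache(struct kvm_vcpu *vcpu)
  {
          if (!kvm_has_mirrored_tdp(vcpu->kvm))
                  return 0;

          /* Enough pages for a full S-EPT walk on a single fault. */
          return kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_external_spt_cache,
                                            PT64_ROOT_MAX_LEVEL);
  }

Lowering that fill level, or dropping the cache when a vCPU goes idle, would
bound how many speculative PAMT entries each vCPU can pin.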
Suggested-by: Kai Huang <kai.huang@...el.com>
Signed-off-by: Sean Christopherson <seanjc@...gle.com>
---
arch/x86/include/asm/kvm-x86-ops.h | 2 ++
arch/x86/include/asm/kvm_host.h | 18 +++++++++---------
arch/x86/kvm/mmu/mmu.c | 6 ++++--
arch/x86/kvm/mmu/mmu_internal.h | 11 -----------
arch/x86/kvm/mmu/tdp_mmu.c | 5 +++--
arch/x86/kvm/vmx/tdx.c | 13 ++++++++++++-
6 files changed, 30 insertions(+), 25 deletions(-)
diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h
index c17cedc485c9..17dddada69fc 100644
--- a/arch/x86/include/asm/kvm-x86-ops.h
+++ b/arch/x86/include/asm/kvm-x86-ops.h
@@ -94,6 +94,8 @@ KVM_X86_OP_OPTIONAL_RET0(set_tss_addr)
KVM_X86_OP_OPTIONAL_RET0(set_identity_map_addr)
KVM_X86_OP_OPTIONAL_RET0(get_mt_mask)
KVM_X86_OP(load_mmu_pgd)
+KVM_X86_OP_OPTIONAL(alloc_external_sp)
+KVM_X86_OP_OPTIONAL(free_external_sp)
KVM_X86_OP_OPTIONAL_RET0(set_external_spte)
KVM_X86_OP_OPTIONAL(remove_external_spte)
KVM_X86_OP_OPTIONAL(reclaim_external_sp)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index b35a07ed11fb..6e84dbc89e79 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -867,10 +867,7 @@ struct kvm_vcpu_arch {
struct kvm_mmu_memory_cache mmu_shadow_page_cache;
struct kvm_mmu_memory_cache mmu_shadowed_info_cache;
struct kvm_mmu_memory_cache mmu_page_header_cache;
- /*
- * This cache is to allocate external page table. E.g. private EPT used
- * by the TDX module.
- */
+ /* Used to allocate S-EPT pages (gifted to the TDX-Module). */
struct kvm_mmu_memory_cache mmu_external_spt_cache;
/*
@@ -1853,18 +1850,21 @@ struct kvm_x86_ops {
void (*load_mmu_pgd)(struct kvm_vcpu *vcpu, hpa_t root_hpa,
int root_level);
- /* Update the external page table from spte getting set. */
+ /*
+ * Callbacks to allocate and free external page tables, a.k.a. S-EPT,
+ * and to propagate changes in mirror page tables to the external page
+ * tables.
+ */
+ unsigned long (*alloc_external_sp)(gfp_t gfp);
+ void (*free_external_sp)(unsigned long addr);
int (*set_external_spte)(struct kvm *kvm, gfn_t gfn, enum pg_level level,
u64 mirror_spte);
-
- /* Update external page tables for page table about to be freed. */
void (*reclaim_external_sp)(struct kvm *kvm, gfn_t gfn,
struct kvm_mmu_page *sp);
-
- /* Update external page table from spte getting removed, and flush TLB. */
void (*remove_external_spte)(struct kvm *kvm, gfn_t gfn, enum pg_level level,
u64 mirror_spte);
+
bool (*has_wbinvd_exit)(void);
u64 (*get_l2_tsc_offset)(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 3911ac9bddfd..9b5a6861e2a4 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -6690,11 +6690,13 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu)
vcpu->arch.mmu_page_header_cache.kmem_cache = mmu_page_header_cache;
vcpu->arch.mmu_page_header_cache.gfp_zero = __GFP_ZERO;
- vcpu->arch.mmu_shadow_page_cache.init_value =
- SHADOW_NONPRESENT_VALUE;
+ vcpu->arch.mmu_shadow_page_cache.init_value = SHADOW_NONPRESENT_VALUE;
if (!vcpu->arch.mmu_shadow_page_cache.init_value)
vcpu->arch.mmu_shadow_page_cache.gfp_zero = __GFP_ZERO;
+ vcpu->arch.mmu_external_spt_cache.page_get = kvm_x86_ops.alloc_external_sp;
+ vcpu->arch.mmu_external_spt_cache.page_free = kvm_x86_ops.free_external_sp;
+
vcpu->arch.mmu = &vcpu->arch.root_mmu;
vcpu->arch.walk_mmu = &vcpu->arch.root_mmu;
diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h
index 73cdcbccc89e..6bb97f660793 100644
--- a/arch/x86/kvm/mmu/mmu_internal.h
+++ b/arch/x86/kvm/mmu/mmu_internal.h
@@ -157,17 +157,6 @@ static inline bool is_mirror_sp(const struct kvm_mmu_page *sp)
return sp->role.is_mirror;
}
-static inline void kvm_mmu_alloc_external_spt(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
-{
- /*
- * external_spt is allocated for TDX module to hold private EPT mappings,
- * TDX module will initialize the page by itself.
- * Therefore, KVM does not need to initialize or access external_spt.
- * KVM only interacts with sp->spt for private EPT operations.
- */
- sp->external_spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_external_spt_cache);
-}
-
static inline gfn_t kvm_gfn_root_bits(const struct kvm *kvm, const struct kvm_mmu_page *root)
{
/*
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 18764dbc97ea..01e3e4f4baa5 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -55,7 +55,8 @@ void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
static void tdp_mmu_free_sp(struct kvm_mmu_page *sp)
{
- free_page((unsigned long)sp->external_spt);
+ if (sp->external_spt)
+ kvm_x86_call(free_external_sp)((unsigned long)sp->external_spt);
free_page((unsigned long)sp->spt);
kmem_cache_free(mmu_page_header_cache, sp);
}
@@ -1246,7 +1247,7 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
sp = tdp_mmu_alloc_sp(vcpu);
tdp_mmu_init_child_sp(sp, &iter);
if (is_mirror_sp(sp))
- kvm_mmu_alloc_external_spt(vcpu, sp);
+ sp->external_spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_external_spt_cache);
sp->nx_huge_page_disallowed = fault->huge_page_disallowed;
diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
index 323aae4300a1..0946eba2de23 100644
--- a/arch/x86/kvm/vmx/tdx.c
+++ b/arch/x86/kvm/vmx/tdx.c
@@ -1790,7 +1790,9 @@ static void tdx_sept_reclaim_private_sp(struct kvm *kvm, gfn_t gfn,
* TD's hkid is freed, when the TD is being torn down.
*
* If the S-EPT PTE can't be removed for any reason, intentionally leak
- * the page to prevent the kernel from accessing the encrypted page.
+ * the page to prevent the kernel from accessing the encrypted page,
+ * and if Dynamic PAMT is enabled, to avoid inducing a failure on
+ * removal of the still-used PAMT entry.
*/
if (KVM_BUG_ON(is_hkid_assigned(to_kvm_tdx(kvm)), kvm) ||
tdx_reclaim_page(virt_to_page(sp->external_spt)))
@@ -3600,6 +3602,15 @@ void __init tdx_hardware_setup(void)
*/
vt_x86_ops.vm_size = max_t(unsigned int, vt_x86_ops.vm_size, sizeof(struct kvm_tdx));
+ /*
+ * TDX uses the external_spt cache to allocate S-EPT page table pages,
+ * which (a) don't need to be initialized by KVM as the TDX-Module will
+ * initialize the page (using the guest's encryption key), and (b) need
+ * to use a custom allocator to be compatible with Dynamic PAMT.
+ */
+ vt_x86_ops.alloc_external_sp = tdx_alloc_control_page;
+ vt_x86_ops.free_external_sp = tdx_free_control_page;
+
vt_x86_ops.set_external_spte = tdx_sept_set_private_spte;
vt_x86_ops.reclaim_external_sp = tdx_sept_reclaim_private_sp;
vt_x86_ops.remove_external_spte = tdx_sept_remove_private_spte;
--
2.53.0.rc1.217.geba53bf80e-goog