Message-ID: <20260106102345.25261-1-yan.y.zhao@intel.com>
Date: Tue, 6 Jan 2026 18:23:45 +0800
From: Yan Zhao <yan.y.zhao@...el.com>
To: pbonzini@...hat.com,
seanjc@...gle.com
Cc: linux-kernel@...r.kernel.org,
kvm@...r.kernel.org,
x86@...nel.org,
rick.p.edgecombe@...el.com,
dave.hansen@...el.com,
kas@...nel.org,
tabba@...gle.com,
ackerleytng@...gle.com,
michael.roth@....com,
david@...nel.org,
vannapurve@...gle.com,
sagis@...gle.com,
vbabka@...e.cz,
thomas.lendacky@....com,
nik.borisov@...e.com,
pgonda@...gle.com,
fan.du@...el.com,
jun.miao@...el.com,
francescolavra.fl@...il.com,
jgross@...e.com,
ira.weiny@...el.com,
isaku.yamahata@...el.com,
xiaoyao.li@...el.com,
kai.huang@...el.com,
binbin.wu@...ux.intel.com,
chao.p.peng@...el.com,
chao.gao@...el.com,
yan.y.zhao@...el.com
Subject: [PATCH v3 20/24] KVM: TDX: Implement per-VM external cache for splitting in TDX
Implement the KVM x86 ops for the per-VM external cache used to split the
external page table in TDX.
Since the per-VM external cache for splitting the external page table is
intended to be used outside of vCPU threads, i.e., when the per-vCPU
external_fault_cache is not available, introduce a spinlock,
prealloc_split_cache_lock, in TDX to protect page enqueue/dequeue operations
on the per-VM external split cache.
Cache topup in tdx_topup_vm_split_cache() enqueues pages under
prealloc_split_cache_lock.
Cache dequeuing will be implemented in tdx_sept_split_private_spte() in later
patches, which will also hold prealloc_split_cache_lock.
Checking the need for topup in tdx_need_topup_vm_split_cache() does not take
prealloc_split_cache_lock internally. When tdx_need_topup_vm_split_cache() is
invoked under write mmu_lock, there is no need to additionally acquire
prealloc_split_cache_lock; when it is invoked under read mmu_lock, the check
must be repeated after acquiring prealloc_split_cache_lock for cache
dequeuing.
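For illustration only, a rough sketch (not part of this patch; the helper
name tdx_split_cache_dequeue() is made up) of how a later dequeue path could
repeat the check under prealloc_split_cache_lock when running under read
mmu_lock:

static struct page *tdx_split_cache_dequeue(struct kvm *kvm, enum pg_level level)
{
	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
	struct page *page = NULL;

	spin_lock(&kvm_tdx->prealloc_split_cache_lock);
	/*
	 * The earlier lockless check in tdx_need_topup_vm_split_cache() may
	 * race with concurrent dequeuers under read mmu_lock, so re-check
	 * under the spinlock before taking a page from the cache.
	 */
	if (!tdx_need_topup_vm_split_cache(kvm, level))
		page = get_tdx_prealloc_page(&kvm_tdx->prealloc_split_cache);
	spin_unlock(&kvm_tdx->prealloc_split_cache_lock);

	return page;
}

The actual dequeue path in tdx_sept_split_private_spte() is added in later
patches and may differ in detail.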
Cache freeing does not take prealloc_split_cache_lock because it is intended
to be called only when there is no contention.
Signed-off-by: Yan Zhao <yan.y.zhao@...el.com>
---
v3:
- new patch corresponds to DPAMT v4.
---
arch/x86/kvm/vmx/tdx.c | 61 ++++++++++++++++++++++++++++++++++++++++++
arch/x86/kvm/vmx/tdx.h | 5 ++++
2 files changed, 66 insertions(+)
diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
index c1dc1aaae49d..40cca273d480 100644
--- a/arch/x86/kvm/vmx/tdx.c
+++ b/arch/x86/kvm/vmx/tdx.c
@@ -671,6 +671,9 @@ int tdx_vm_init(struct kvm *kvm)
kvm_tdx->state = TD_STATE_UNINITIALIZED;
+	INIT_LIST_HEAD(&kvm_tdx->prealloc_split_cache.page_list);
+	spin_lock_init(&kvm_tdx->prealloc_split_cache_lock);
+
return 0;
}
@@ -1680,6 +1683,61 @@ static void tdx_free_external_fault_cache(struct kvm_vcpu *vcpu)
__free_page(page);
}
+/*
+ * Need to prepare at least 2 pairs of PAMT pages (i.e., 4 PAMT pages) for
+ * splitting a S-EPT PG_LEVEL_2M mapping when Dynamic PAMT is enabled:
+ * - 1 pair for the new 4KB S-EPT page for splitting, which may be dequeued in
+ *   tdx_sept_split_private_spte() when there are no installed PAMT pages for
+ *   the 2MB physical range of the S-EPT page.
+ * - 1 pair for demoting guest private memory from 2MB to 4KB, which will be
+ *   dequeued in tdh_mem_page_demote().
+ */
+static int tdx_min_split_cache_sz(struct kvm *kvm, int level)
+{
+	KVM_BUG_ON(level != PG_LEVEL_2M, kvm);
+
+	if (!tdx_supports_dynamic_pamt(tdx_sysinfo))
+		return 0;
+
+	return tdx_dpamt_entry_pages() * 2;
+}
+
+static int tdx_topup_vm_split_cache(struct kvm *kvm, enum pg_level level)
+{
+	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
+	struct tdx_prealloc *prealloc = &kvm_tdx->prealloc_split_cache;
+	int cnt = tdx_min_split_cache_sz(kvm, level);
+
+	while (READ_ONCE(prealloc->cnt) < cnt) {
+		struct page *page = alloc_page(GFP_KERNEL);
+
+		if (!page)
+			return -ENOMEM;
+
+		spin_lock(&kvm_tdx->prealloc_split_cache_lock);
+		list_add(&page->lru, &prealloc->page_list);
+		prealloc->cnt++;
+		spin_unlock(&kvm_tdx->prealloc_split_cache_lock);
+	}
+
+	return 0;
+}
+
+static bool tdx_need_topup_vm_split_cache(struct kvm *kvm, enum pg_level level)
+{
+	struct tdx_prealloc *prealloc = &to_kvm_tdx(kvm)->prealloc_split_cache;
+
+	return prealloc->cnt < tdx_min_split_cache_sz(kvm, level);
+}
+
+static void tdx_free_vm_split_cache(struct kvm *kvm)
+{
+	struct page *page;
+
+	while ((page = get_tdx_prealloc_page(&to_kvm_tdx(kvm)->prealloc_split_cache)))
+		__free_page(page);
+}
+
static int tdx_mem_page_aug(struct kvm *kvm, gfn_t gfn,
enum pg_level level, kvm_pfn_t pfn)
{
@@ -3804,4 +3862,7 @@ void __init tdx_hardware_setup(void)
vt_x86_ops.alloc_external_fault_cache = tdx_alloc_external_fault_cache;
vt_x86_ops.topup_external_fault_cache = tdx_topup_external_fault_cache;
vt_x86_ops.free_external_fault_cache = tdx_free_external_fault_cache;
+	vt_x86_ops.topup_external_per_vm_split_cache = tdx_topup_vm_split_cache;
+	vt_x86_ops.need_topup_external_per_vm_split_cache = tdx_need_topup_vm_split_cache;
+	vt_x86_ops.free_external_per_vm_split_cache = tdx_free_vm_split_cache;
}
diff --git a/arch/x86/kvm/vmx/tdx.h b/arch/x86/kvm/vmx/tdx.h
index 43dd295b7fd6..034e3ddfb679 100644
--- a/arch/x86/kvm/vmx/tdx.h
+++ b/arch/x86/kvm/vmx/tdx.h
@@ -48,6 +48,11 @@ struct kvm_tdx {
* Set/unset is protected with kvm->mmu_lock.
*/
bool wait_for_sept_zap;
+
+	/* The per-VM cache for splitting S-EPT */
+	struct tdx_prealloc prealloc_split_cache;
+	/* Protect page enqueuing/dequeuing in prealloc_split_cache */
+	spinlock_t prealloc_split_cache_lock;
};
/* TDX module vCPU states */
--
2.43.2