Message-ID: <20260129011517.3545883-23-seanjc@google.com>
Date: Wed, 28 Jan 2026 17:14:54 -0800
From: Sean Christopherson <seanjc@...gle.com>
To: Thomas Gleixner <tglx@...nel.org>, Ingo Molnar <mingo@...hat.com>, Borislav Petkov <bp@...en8.de>,
Dave Hansen <dave.hansen@...ux.intel.com>, x86@...nel.org,
Kiryl Shutsemau <kas@...nel.org>, Sean Christopherson <seanjc@...gle.com>, Paolo Bonzini <pbonzini@...hat.com>
Cc: linux-kernel@...r.kernel.org, linux-coco@...ts.linux.dev,
kvm@...r.kernel.org, Kai Huang <kai.huang@...el.com>,
Rick Edgecombe <rick.p.edgecombe@...el.com>, Yan Zhao <yan.y.zhao@...el.com>,
Vishal Annapurve <vannapurve@...gle.com>, Ackerley Tng <ackerleytng@...gle.com>,
Sagi Shahar <sagis@...gle.com>, Binbin Wu <binbin.wu@...ux.intel.com>,
Xiaoyao Li <xiaoyao.li@...el.com>, Isaku Yamahata <isaku.yamahata@...el.com>
Subject: [RFC PATCH v5 22/45] KVM: TDX: Get/put PAMT pages when (un)mapping
private memory
From: Kirill A. Shutemov <kirill.shutemov@...ux.intel.com>
Add Dynamic PAMT support to KVM's S-EPT MMU by "getting" a PAMT page when
adding guest memory (PAGE.ADD or PAGE.AUG), and "putting" the page when
removing guest memory (PAGE.REMOVE).
To access the per-vCPU PAMT caches without plumbing @vcpu throughout the
TDP MMU, begrudgingly use kvm_get_running_vcpu() to get the vCPU, and bug
the VM if KVM attempts to set an S-EPT entry without an active vCPU. KVM
only supports creating _new_ mappings in page (pre)fault paths, all of
which require an active vCPU.
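For reference, a sketch of what that boils down to at the top of
tdx_sept_set_private_spte() (it mirrors the tdx.c hunk below; the PAMT
cache helpers come from earlier patches in this series):

	struct kvm_vcpu *vcpu = kvm_get_running_vcpu();
	int ret;

	/* New S-EPT mappings are created only from (pre)fault paths. */
	if (KVM_BUG_ON(!vcpu, kvm))
		return -EINVAL;

	/* Take 4K PAMT backing for the page from the per-vCPU cache. */
	ret = tdx_pamt_get(pfn_to_page(pfn), &to_tdx(vcpu)->pamt_cache);
	if (ret)
		return ret;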
PAMT memory holds metadata for TDX-protected memory. With Dynamic PAMT,
the 4K-granularity part (PAMT_4K) is allocated on demand: the kernel
hands the TDX module a few pages that cover 2M of host physical memory
at a time.
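The per-vCPU PAMT cache that backs those on-demand allocations is topped
up from the common MMU path via the new hook, the idea being that pages
are allocated before mmu_lock is taken (rough call chain, per this patch):

	/*
	 *   mmu_topup_memory_caches()
	 *     kvm_x86_call(topup_external_cache)(vcpu, PT64_ROOT_MAX_LEVEL)
	 *       tdx_topup_external_pamt_cache()  (no-op without Dynamic PAMT)
	 *         tdx_topup_pamt_cache(&to_tdx(vcpu)->pamt_cache, min)
	 */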
PAMT memory can be reclaimed when its last user is gone. That can happen
in a few code paths:

- On TDH.PHYMEM.PAGE.RECLAIM in tdx_reclaim_td_control_pages() and
  tdx_reclaim_page().
- On TDH.MEM.PAGE.REMOVE in tdx_sept_remove_private_spte() (sketched
  below).
- In tdx_sept_zap_private_spte(), for pages that were queued to be added
  with TDH.MEM.PAGE.ADD but never were due to an error.
- In tdx_sept_free_private_spt(), for S-EPT pages.
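In each of those paths the "put" mirrors the "get" above; e.g. the
PAGE.REMOVE case in this patch ends up doing (sketch, matching the
tdx_sept_remove_private_spte() hunk below):

	/* Reset the page contents (TDX quirk), then drop its 4K PAMT backing. */
	tdx_quirk_reset_page(page);
	tdx_pamt_put(page);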
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@...ux.intel.com>
[Minor log tweak]
Signed-off-by: Rick Edgecombe <rick.p.edgecombe@...el.com>
Co-developed-by: Sean Christopherson <seanjc@...gle.com>
Signed-off-by: Sean Christopherson <seanjc@...gle.com>
---
arch/x86/include/asm/kvm-x86-ops.h | 1 +
arch/x86/include/asm/kvm_host.h | 1 +
arch/x86/kvm/mmu/mmu.c | 4 +++
arch/x86/kvm/vmx/tdx.c | 44 ++++++++++++++++++++++++++----
arch/x86/kvm/vmx/tdx.h | 2 ++
5 files changed, 47 insertions(+), 5 deletions(-)
diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h
index 17dddada69fc..394dc29483a7 100644
--- a/arch/x86/include/asm/kvm-x86-ops.h
+++ b/arch/x86/include/asm/kvm-x86-ops.h
@@ -99,6 +99,7 @@ KVM_X86_OP_OPTIONAL(free_external_sp)
KVM_X86_OP_OPTIONAL_RET0(set_external_spte)
KVM_X86_OP_OPTIONAL(remove_external_spte)
KVM_X86_OP_OPTIONAL(reclaim_external_sp)
+KVM_X86_OP_OPTIONAL_RET0(topup_external_cache)
KVM_X86_OP(has_wbinvd_exit)
KVM_X86_OP(get_l2_tsc_offset)
KVM_X86_OP(get_l2_tsc_multiplier)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 6e84dbc89e79..a6e4ab76b1b2 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1863,6 +1863,7 @@ struct kvm_x86_ops {
struct kvm_mmu_page *sp);
void (*remove_external_spte)(struct kvm *kvm, gfn_t gfn, enum pg_level level,
u64 mirror_spte);
+ int (*topup_external_cache)(struct kvm_vcpu *vcpu, int min);
bool (*has_wbinvd_exit)(void);
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 9b5a6861e2a4..4ecbf216d96f 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -605,6 +605,10 @@ static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu, bool maybe_indirect)
PT64_ROOT_MAX_LEVEL);
if (r)
return r;
+
+ r = kvm_x86_call(topup_external_cache)(vcpu, PT64_ROOT_MAX_LEVEL);
+ if (r)
+ return r;
}
r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_shadow_page_cache,
PT64_ROOT_MAX_LEVEL);
diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
index 0946eba2de23..d74a2547e512 100644
--- a/arch/x86/kvm/vmx/tdx.c
+++ b/arch/x86/kvm/vmx/tdx.c
@@ -683,6 +683,8 @@ int tdx_vcpu_create(struct kvm_vcpu *vcpu)
if (!irqchip_split(vcpu->kvm))
return -EINVAL;
+ tdx_init_pamt_cache(&tdx->pamt_cache);
+
fpstate_set_confidential(&vcpu->arch.guest_fpu);
vcpu->arch.apic->guest_apic_protected = true;
INIT_LIST_HEAD(&tdx->vt.pi_wakeup_list);
@@ -868,6 +870,8 @@ void tdx_vcpu_free(struct kvm_vcpu *vcpu)
struct vcpu_tdx *tdx = to_tdx(vcpu);
int i;
+ tdx_free_pamt_cache(&tdx->pamt_cache);
+
if (vcpu->cpu != -1) {
KVM_BUG_ON(tdx->state == VCPU_TD_STATE_INITIALIZED, vcpu->kvm);
tdx_flush_vp_on_cpu(vcpu);
@@ -1615,6 +1619,14 @@ void tdx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, int pgd_level)
td_vmcs_write64(to_tdx(vcpu), SHARED_EPT_POINTER, root_hpa);
}
+static int tdx_topup_external_pamt_cache(struct kvm_vcpu *vcpu, int min)
+{
+ if (!tdx_supports_dynamic_pamt(tdx_sysinfo))
+ return 0;
+
+ return tdx_topup_pamt_cache(&to_tdx(vcpu)->pamt_cache, min);
+}
+
static int tdx_mem_page_add(struct kvm *kvm, gfn_t gfn, enum pg_level level,
kvm_pfn_t pfn)
{
@@ -1696,8 +1708,15 @@ static int tdx_sept_link_private_spt(struct kvm *kvm, gfn_t gfn,
static int tdx_sept_set_private_spte(struct kvm *kvm, gfn_t gfn,
enum pg_level level, u64 mirror_spte)
{
+ struct kvm_vcpu *vcpu = kvm_get_running_vcpu();
struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
kvm_pfn_t pfn = spte_to_pfn(mirror_spte);
+ struct vcpu_tdx *tdx = to_tdx(vcpu);
+ struct page *page = pfn_to_page(pfn);
+ int ret;
+
+ if (KVM_BUG_ON(!vcpu, kvm))
+ return -EINVAL;
if (KVM_BUG_ON(!is_shadow_present_pte(mirror_spte), kvm))
return -EIO;
@@ -1711,6 +1730,10 @@ static int tdx_sept_set_private_spte(struct kvm *kvm, gfn_t gfn,
WARN_ON_ONCE((mirror_spte & VMX_EPT_RWX_MASK) != VMX_EPT_RWX_MASK);
+ ret = tdx_pamt_get(page, &tdx->pamt_cache);
+ if (ret)
+ return ret;
+
/*
* Ensure pre_fault_allowed is read by kvm_arch_vcpu_pre_fault_memory()
* before kvm_tdx->state. Userspace must not be allowed to pre-fault
@@ -1723,14 +1746,17 @@ static int tdx_sept_set_private_spte(struct kvm *kvm, gfn_t gfn,
* If the TD isn't finalized/runnable, then userspace is initializing
* the VM image via KVM_TDX_INIT_MEM_REGION; ADD the page to the TD.
*/
- if (unlikely(kvm_tdx->state != TD_STATE_RUNNABLE))
- return tdx_mem_page_add(kvm, gfn, level, pfn);
+ if (likely(kvm_tdx->state == TD_STATE_RUNNABLE))
+ ret = tdx_mem_page_aug(kvm, gfn, level, pfn);
+ else
+ ret = tdx_mem_page_add(kvm, gfn, level, pfn);
- return tdx_mem_page_aug(kvm, gfn, level, pfn);
+ if (ret)
+ tdx_pamt_put(page);
+
+ return ret;
}
-
-
/*
* Ensure shared and private EPTs to be flushed on all vCPUs.
* tdh_mem_track() is the only caller that increases TD epoch. An increase in
@@ -1847,6 +1873,7 @@ static void tdx_sept_remove_private_spte(struct kvm *kvm, gfn_t gfn,
return;
tdx_quirk_reset_page(page);
+ tdx_pamt_put(page);
}
void tdx_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode,
@@ -3614,5 +3641,12 @@ void __init tdx_hardware_setup(void)
vt_x86_ops.set_external_spte = tdx_sept_set_private_spte;
vt_x86_ops.reclaim_external_sp = tdx_sept_reclaim_private_sp;
vt_x86_ops.remove_external_spte = tdx_sept_remove_private_spte;
+
+ /*
+ * FIXME: Wire up the PAMT hook iff DPAMT is supported, once VMXON is
+ * moved out of KVM and tdx_bringup() is folded into here.
+ */
+ vt_x86_ops.topup_external_cache = tdx_topup_external_pamt_cache;
+
vt_x86_ops.protected_apic_has_interrupt = tdx_protected_apic_has_interrupt;
}
diff --git a/arch/x86/kvm/vmx/tdx.h b/arch/x86/kvm/vmx/tdx.h
index ce2720a028ad..f444fc84d93b 100644
--- a/arch/x86/kvm/vmx/tdx.h
+++ b/arch/x86/kvm/vmx/tdx.h
@@ -73,6 +73,8 @@ struct vcpu_tdx {
u64 map_gpa_next;
u64 map_gpa_end;
+
+ struct tdx_pamt_cache pamt_cache;
};
void tdh_vp_rd_failed(struct vcpu_tdx *tdx, char *uclass, u32 field, u64 err);
--
2.53.0.rc1.217.geba53bf80e-goog