Message-ID: <aXt_L6QKB9CSTZcW@google.com>
Date: Thu, 29 Jan 2026 07:39:27 -0800
From: Sean Christopherson <seanjc@...gle.com>
To: Thomas Gleixner <tglx@...nel.org>, Ingo Molnar <mingo@...hat.com>, Borislav Petkov <bp@...en8.de>,
Dave Hansen <dave.hansen@...ux.intel.com>, x86@...nel.org,
Kiryl Shutsemau <kas@...nel.org>, Paolo Bonzini <pbonzini@...hat.com>, linux-kernel@...r.kernel.org,
linux-coco@...ts.linux.dev, kvm@...r.kernel.org,
Kai Huang <kai.huang@...el.com>, Rick Edgecombe <rick.p.edgecombe@...el.com>,
Yan Zhao <yan.y.zhao@...el.com>, Vishal Annapurve <vannapurve@...gle.com>,
Ackerley Tng <ackerleytng@...gle.com>, Sagi Shahar <sagis@...gle.com>,
Binbin Wu <binbin.wu@...ux.intel.com>, Xiaoyao Li <xiaoyao.li@...el.com>,
Isaku Yamahata <isaku.yamahata@...el.com>
Subject: Re: [RFC PATCH v5 44/45] KVM: x86/mmu: Add support for splitting
S-EPT hugepages on conversion
On Wed, Jan 28, 2026, Sean Christopherson wrote:
> #ifdef CONFIG_HAVE_KVM_ARCH_GMEM_CONVERT
> +static int __tdp_mmu_split_mirror_huge_pages(struct kvm *kvm,
> + struct kvm_mmu_page *root,
> + gfn_t gfn, int target_level)
> +{
> + gfn_t end = gfn + KVM_PAGES_PER_HPAGE(target_level + 1);
> +
> + return tdp_mmu_split_huge_pages_root(kvm, root, gfn, end, target_level, false);
> +}
> +
> +static int tdp_mmu_split_mirror_huge_pages(struct kvm *kvm,
> + struct kvm_mmu_page *root,
> + gfn_t start, gfn_t end, int level)
> +{
> +
> + gfn_t head = gfn_round_for_level(start, level + 1);
> + gfn_t tail = gfn_round_for_level(end, level + 1);
> + int r;
> +
> + if (head != start) {
> + r = __tdp_mmu_split_mirror_huge_pages(kvm, root, head, level);
> + if (r)
> + return r;
> + }
> +
> + if (tail != end && (head != tail || head == start)) {
> + r = __tdp_mmu_split_mirror_huge_pages(kvm, root, tail, level);
> + if (r)
> + return r;
> + }
> +
> + return 0;
> +}
> +
> int kvm_arch_gmem_convert(struct kvm *kvm, gfn_t start, gfn_t end,
> bool to_private)
> {
> + struct kvm_mmu_page *root;
> + int r;
> +
> + /*
> + * When converting from private=>shared, KVM must first split potential
> + * hugepages, as KVM mustn't overzap private mappings for TDX guests,
> + * i.e. must zap _exactly_ [start, end). Split potential hugepages at
> + * the head and tail of the to-be-converted (and thus zapped) range so
> + * that KVM doesn't overzap due to dropping a hugepage that doesn't
> + * fall wholly inside the range.
> + */
> + if (to_private || !kvm_has_mirrored_tdp(kvm))
> + return 0;
> +
> + /*
> + * Acquire the external cache lock, a.k.a. the Dynamic PAMT lock, to
> + * protect the per-VM cache of pre-allocate pages used to populate the
> + * Dynamic PAMT when splitting S-EPT huge pages.
> + */
> + guard(mutex)(&kvm->arch.tdp_mmu_external_cache_lock);
> +
> + guard(write_lock)(&kvm->mmu_lock);
> +
> + /*
> + * TODO: Also split from PG_LEVEL_1G => PG_LEVEL_2M when KVM supports
> + * 1GiB S-EPT pages.
> + */
> + __for_each_tdp_mmu_root_yield_safe(kvm, root, 0, KVM_MIRROR_ROOTS) {
> + r = tdp_mmu_split_mirror_huge_pages(kvm, root, start, end, PG_LEVEL_4K);
> + if (r)
This needs to call kvm_tdp_mmu_put_root() on failure. But if we instead add
kvm_tdp_mmu_mirrors_split_huge_pages() for use in handling mismatched ACCEPT,
this code goes away.
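Something like the below, purely as a rough sketch of the tdp_mmu.c side
(the exact signature and plumbing are guesses, mirroring the walk above;
untested):

int kvm_tdp_mmu_mirrors_split_huge_pages(struct kvm *kvm, gfn_t start,
					 gfn_t end, int target_level)
{
	struct kvm_mmu_page *root;
	int r;

	lockdep_assert_held_write(&kvm->mmu_lock);

	__for_each_tdp_mmu_root_yield_safe(kvm, root, 0, KVM_MIRROR_ROOTS) {
		r = tdp_mmu_split_huge_pages_root(kvm, root, start, end,
						  target_level, false);
		if (r) {
			/* Don't leak the reference taken by the walk. */
			kvm_tdp_mmu_put_root(kvm, root);
			return r;
		}
	}

	return 0;
}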
And then the bulk of this code can live in tdx.c instead of tdp_mmu.c, and the
pamt mutex can live in kvm_tdx instead of kvm_arch.
Compile tested only...
---
From: Sean Christopherson <seanjc@...gle.com>
Date: Thu, 22 Jan 2026 07:36:47 -0800
Subject: [PATCH] KVM: x86/mmu: Add support for splitting S-EPT hugepages on
conversion
Add support for splitting S-EPT hugepages in preparation for converting a
subset of a hugepage to be shared, as KVM must precisely zap/remove S-EPT
entries to avoid clobbering guest memory (the lifetime of guest private
memory is tied to the S-EPT). I.e. KVM needs to first split a hugepage so
that only the to-be-converted small pages can be zapped.
To avoid unnecessary work, e.g. if only the tail/end page of a massive
region isn't aligned to the conversion, explicitly detect unaligned head
and tail pages relative to the max page size supported by KVM, i.e.
head/tail pages that will undergo partial conversion.
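As an illustration (made-up GFNs, assuming 2MiB hugepages, i.e. 512 pages
per hugepage): converting [0x10201, 0x109ff) gives head = 0x10200 != start,
so the head hugepage [0x10200, 0x10400) is split, and tail = 0x10800 != end,
so the tail hugepage [0x10800, 0x10a00) is split, while the fully-covered
middle hugepage [0x10400, 0x10800) is left intact and zapped whole.  The
"head != tail || head == start" check handles both partial ends landing in
the same hugepage: if head == tail and head != start, the head split already
covered the tail, whereas if head == start the head branch was skipped and
the tail still needs splitting.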
To support splitting an S-EPT hugepage without a vCPU, add a per-VM PAMT
cache, along with a mutex to guard the cache. Using a mutex, e.g. versus
a spinlock, is important as it allows KVM to allocate memory *without*
dropping the lock, i.e. so that the PAMT cache can be topped up as needed
without needing to juggle kvm_tdx's pamt_cache_lock.
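Roughly (conceptual call flow, not literal code), the no-vCPU path ends up
looking like:

  tdx_gmem_convert()
    mutex_lock(&kvm_tdx->pamt_cache_lock)       <- held across the whole split
    write_lock(&kvm->mmu_lock)
    tdp_mmu_split_huge_pages_root()
      write_unlock(&kvm->mmu_lock)              <- dropped only around allocation
      tdx_topup_external_pamt_cache(kvm, NULL)  <- routes to kvm_tdx->pamt_cache
      write_lock(&kvm->mmu_lock)
      tdx_sept_split_private_spte()             <- consumes kvm_tdx->pamt_cache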
Signed-off-by: Sean Christopherson <seanjc@...gle.com>
---
arch/x86/include/asm/kvm-x86-ops.h | 1 +
arch/x86/include/asm/kvm_host.h | 3 +-
arch/x86/kvm/mmu/mmu.c | 2 +-
arch/x86/kvm/mmu/tdp_mmu.c | 7 ++-
arch/x86/kvm/vmx/tdx.c | 96 ++++++++++++++++++++++++++++--
arch/x86/kvm/vmx/tdx.h | 3 +
arch/x86/kvm/x86.c | 2 +-
7 files changed, 102 insertions(+), 12 deletions(-)
diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h
index 3ca56fe6b951..6083fb07cd3b 100644
--- a/arch/x86/include/asm/kvm-x86-ops.h
+++ b/arch/x86/include/asm/kvm-x86-ops.h
@@ -149,6 +149,7 @@ KVM_X86_OP_OPTIONAL(alloc_apic_backing_page)
KVM_X86_OP_OPTIONAL_RET0(gmem_prepare)
KVM_X86_OP_OPTIONAL_RET0(gmem_max_mapping_level)
KVM_X86_OP_OPTIONAL(gmem_invalidate)
+KVM_X86_OP_OPTIONAL_RET0(gmem_convert)
#undef KVM_X86_OP
#undef KVM_X86_OP_OPTIONAL
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 385f1cf32d70..cd3e7dc6ab9b 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1861,7 +1861,7 @@ struct kvm_x86_ops {
u64 new_spte, enum pg_level level);
void (*reclaim_external_sp)(struct kvm *kvm, gfn_t gfn,
struct kvm_mmu_page *sp);
- int (*topup_external_cache)(struct kvm_vcpu *vcpu, int min);
+ int (*topup_external_cache)(struct kvm *kvm, struct kvm_vcpu *vcpu, int min);
bool (*has_wbinvd_exit)(void);
@@ -1950,6 +1950,7 @@ struct kvm_x86_ops {
void *(*alloc_apic_backing_page)(struct kvm_vcpu *vcpu);
int (*gmem_prepare)(struct kvm *kvm, kvm_pfn_t pfn, gfn_t gfn, int max_order);
void (*gmem_invalidate)(kvm_pfn_t start, kvm_pfn_t end);
+ int (*gmem_convert)(struct kvm *kvm, gfn_t start, gfn_t end, bool to_private);
int (*gmem_max_mapping_level)(struct kvm *kvm, kvm_pfn_t pfn, bool is_private);
};
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index c2765bfc8492..62bf6bec2df2 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -606,7 +606,7 @@ static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu, bool maybe_indirect)
if (r)
return r;
- r = kvm_x86_call(topup_external_cache)(vcpu, PT64_ROOT_MAX_LEVEL);
+ r = kvm_x86_call(topup_external_cache)(vcpu->kvm, vcpu, PT64_ROOT_MAX_LEVEL);
if (r)
return r;
}
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index a45d8ee91481..a32192c35099 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -1447,7 +1447,8 @@ bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm,
return spte_set;
}
-static struct kvm_mmu_page *tdp_mmu_alloc_sp_for_split(struct tdp_iter *iter)
+static struct kvm_mmu_page *tdp_mmu_alloc_sp_for_split(struct kvm *kvm,
+ struct tdp_iter *iter)
{
struct kvm_mmu_page *sp;
@@ -1464,7 +1465,7 @@ static struct kvm_mmu_page *tdp_mmu_alloc_sp_for_split(struct tdp_iter *iter)
if (!sp->external_spt)
goto err_external_spt;
- if (kvm_x86_call(topup_external_cache)(kvm_get_running_vcpu(), 1))
+ if (kvm_x86_call(topup_external_cache)(kvm, kvm_get_running_vcpu(), 1))
goto err_external_split;
}
@@ -1556,7 +1557,7 @@ static int tdp_mmu_split_huge_pages_root(struct kvm *kvm,
else
write_unlock(&kvm->mmu_lock);
- sp = tdp_mmu_alloc_sp_for_split(&iter);
+ sp = tdp_mmu_alloc_sp_for_split(kvm, &iter);
if (shared)
read_lock(&kvm->mmu_lock);
diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
index 9f2ef46f87b0..c4050d94fb4d 100644
--- a/arch/x86/kvm/vmx/tdx.c
+++ b/arch/x86/kvm/vmx/tdx.c
@@ -607,6 +607,8 @@ void tdx_vm_destroy(struct kvm *kvm)
{
struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
+ tdx_free_pamt_cache(&kvm_tdx->pamt_cache);
+
tdx_reclaim_td_control_pages(kvm);
kvm_tdx->state = TD_STATE_UNINITIALIZED;
@@ -629,6 +631,8 @@ int tdx_vm_init(struct kvm *kvm)
{
struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
+ tdx_init_pamt_cache(&kvm_tdx->pamt_cache);
+
kvm->arch.has_protected_state = true;
/*
* TDX Module doesn't allow the hypervisor to modify the EOI-bitmap,
@@ -1285,6 +1289,66 @@ static int tdx_map_gpa(struct kvm_vcpu *vcpu)
return 1;
}
+static int __tdx_sept_split_huge_pages(struct kvm *kvm, gfn_t gfn, int target_level)
+{
+ gfn_t end = gfn + KVM_PAGES_PER_HPAGE(target_level + 1);
+
+ return kvm_tdp_mmu_mirrors_split_huge_pages(kvm, gfn, end, target_level);
+}
+
+static int tdx_sept_split_huge_pages(struct kvm *kvm, gfn_t start, gfn_t end,
+ int target_level)
+{
+
+ gfn_t head = gfn_round_for_level(start, target_level + 1);
+ gfn_t tail = gfn_round_for_level(end, target_level + 1);
+ int r;
+
+ if (head != start) {
+ r = __tdx_sept_split_huge_pages(kvm, head, target_level);
+ if (r)
+ return r;
+ }
+
+ if (tail != end && (head != tail || head == start)) {
+ r = __tdx_sept_split_huge_pages(kvm, tail, target_level);
+ if (r)
+ return r;
+ }
+
+ return 0;
+}
+
+static int tdx_gmem_convert(struct kvm *kvm, gfn_t start, gfn_t end,
+ bool to_private)
+{
+ /*
+ * When converting from private=>shared, KVM must first split potential
+ * hugepages, as KVM mustn't overzap private mappings for TDX guests,
+ * i.e. must zap _exactly_ [start, end). Split potential hugepages at
+ * the head and tail of the to-be-converted (and thus zapped) range so
+ * that KVM doesn't overzap due to dropping a hugepage that doesn't
+ * fall wholly inside the range.
+ */
+ if (to_private || !kvm_has_mirrored_tdp(kvm))
+ return 0;
+
+ /*
+ * Acquire the external cache lock, a.k.a. the Dynamic PAMT lock, to
+ * protect the per-VM cache of pre-allocated pages used to populate the
+ * Dynamic PAMT when splitting S-EPT huge pages.
+ */
+ guard(mutex)(&to_kvm_tdx(kvm)->pamt_cache_lock);
+
+ guard(write_lock)(&kvm->mmu_lock);
+
+ /*
+ * TODO: Also split from PG_LEVEL_1G => PG_LEVEL_2M when KVM supports
+ * 1GiB S-EPT pages.
+ */
+ return tdx_sept_split_huge_pages(kvm, start, end, PG_LEVEL_4K);
+}
+
static int tdx_report_fatal_error(struct kvm_vcpu *vcpu)
{
struct vcpu_tdx *tdx = to_tdx(vcpu);
@@ -1621,15 +1685,32 @@ void tdx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, int pgd_level)
td_vmcs_write64(to_tdx(vcpu), SHARED_EPT_POINTER, root_hpa);
}
-static int tdx_topup_external_pamt_cache(struct kvm_vcpu *vcpu, int min)
+static struct tdx_pamt_cache *tdx_get_pamt_cache(struct kvm *kvm,
+ struct kvm_vcpu *vcpu)
{
+ if (KVM_BUG_ON(vcpu && vcpu->kvm != kvm, kvm))
+ return NULL;
+
+ if (vcpu)
+ return &to_tdx(vcpu)->pamt_cache;
+
+ lockdep_assert_held(&to_kvm_tdx(kvm)->pamt_cache_lock);
+ return &to_kvm_tdx(kvm)->pamt_cache;
+}
+
+static int tdx_topup_external_pamt_cache(struct kvm *kvm,
+ struct kvm_vcpu *vcpu, int min)
+{
+ struct tdx_pamt_cache *pamt_cache;
+
if (!tdx_supports_dynamic_pamt(tdx_sysinfo))
return 0;
- if (WARN_ON_ONCE(!vcpu))
+ pamt_cache = tdx_get_pamt_cache(kvm, vcpu);
+ if (!pamt_cache)
return -EIO;
- return tdx_topup_pamt_cache(&to_tdx(vcpu)->pamt_cache, min);
+ return tdx_topup_pamt_cache(pamt_cache, min);
}
static int tdx_mem_page_add(struct kvm *kvm, gfn_t gfn, enum pg_level level,
@@ -1792,8 +1873,8 @@ static struct page *tdx_spte_to_external_spt(struct kvm *kvm, gfn_t gfn,
static int tdx_sept_split_private_spte(struct kvm *kvm, gfn_t gfn, u64 old_spte,
u64 new_spte, enum pg_level level)
{
- struct kvm_vcpu *vcpu = kvm_get_running_vcpu();
struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
+ struct tdx_pamt_cache *pamt_cache;
gpa_t gpa = gfn_to_gpa(gfn);
u64 err, entry, level_state;
struct page *external_spt;
@@ -1804,7 +1885,8 @@ static int tdx_sept_split_private_spte(struct kvm *kvm, gfn_t gfn, u64 old_spte,
if (!external_spt)
return -EIO;
- if (KVM_BUG_ON(!vcpu || vcpu->kvm != kvm, kvm))
+ pamt_cache = tdx_get_pamt_cache(kvm, kvm_get_running_vcpu());
+ if (!pamt_cache)
return -EIO;
err = tdh_do_no_vcpus(tdh_mem_range_block, kvm, &kvm_tdx->td, gpa,
@@ -1816,7 +1898,7 @@ static int tdx_sept_split_private_spte(struct kvm *kvm, gfn_t gfn, u64 old_spte,
err = tdh_do_no_vcpus(tdh_mem_page_demote, kvm, &kvm_tdx->td, gpa,
level, spte_to_pfn(old_spte), external_spt,
- &to_tdx(vcpu)->pamt_cache, &entry, &level_state);
+ pamt_cache, &entry, &level_state);
if (TDX_BUG_ON_2(err, TDH_MEM_PAGE_DEMOTE, entry, level_state, kvm))
return -EIO;
@@ -3776,6 +3858,8 @@ void __init tdx_hardware_setup(void)
vt_x86_ops.set_external_spte = tdx_sept_set_private_spte;
vt_x86_ops.reclaim_external_sp = tdx_sept_reclaim_private_sp;
+ vt_x86_ops.gmem_convert = tdx_gmem_convert;
+
/*
* FIXME: Wire up the PAMT hook iff DPAMT is supported, once VMXON is
* moved out of KVM and tdx_bringup() is folded into here.
diff --git a/arch/x86/kvm/vmx/tdx.h b/arch/x86/kvm/vmx/tdx.h
index f444fc84d93b..2bb4604a64ca 100644
--- a/arch/x86/kvm/vmx/tdx.h
+++ b/arch/x86/kvm/vmx/tdx.h
@@ -48,6 +48,9 @@ struct kvm_tdx {
* Set/unset is protected with kvm->mmu_lock.
*/
bool wait_for_sept_zap;
+
+ struct tdx_pamt_cache pamt_cache;
+ struct mutex pamt_cache_lock;
};
/* TDX module vCPU states */
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index c80cc60e7862..c3d71ba9a1dc 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -14061,7 +14061,7 @@ void kvm_arch_gmem_invalidate(kvm_pfn_t start, kvm_pfn_t end)
int kvm_arch_gmem_convert(struct kvm *kvm, gfn_t start, gfn_t end,
bool to_private)
{
- return 0;
+ return kvm_x86_call(gmem_convert)(kvm, start, end, to_private);
}
#endif
#endif
base-commit: b2791d61e9774d8575525816e864d2e09ee9090a
--