[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20260106102413.25294-1-yan.y.zhao@intel.com>
Date: Tue, 6 Jan 2026 18:24:13 +0800
From: Yan Zhao <yan.y.zhao@...el.com>
To: pbonzini@...hat.com,
seanjc@...gle.com
Cc: linux-kernel@...r.kernel.org,
kvm@...r.kernel.org,
x86@...nel.org,
rick.p.edgecombe@...el.com,
dave.hansen@...el.com,
kas@...nel.org,
tabba@...gle.com,
ackerleytng@...gle.com,
michael.roth@....com,
david@...nel.org,
vannapurve@...gle.com,
sagis@...gle.com,
vbabka@...e.cz,
thomas.lendacky@....com,
nik.borisov@...e.com,
pgonda@...gle.com,
fan.du@...el.com,
jun.miao@...el.com,
francescolavra.fl@...il.com,
jgross@...e.com,
ira.weiny@...el.com,
isaku.yamahata@...el.com,
xiaoyao.li@...el.com,
kai.huang@...el.com,
binbin.wu@...ux.intel.com,
chao.p.peng@...el.com,
chao.gao@...el.com,
yan.y.zhao@...el.com
Subject: [PATCH v3 22/24] x86/tdx: Add/Remove DPAMT pages for guest private memory to demote
From: "Kirill A. Shutemov" <kirill.shutemov@...ux.intel.com>
When Dynamic PAMT is enabled and a 2MB mapping is split into 512 4KB
mappings, SEAMCALL TDH.MEM.PAGE.DEMOTE takes the Dynamic PAMT page pair in
registers R12 and R13. The Dynamic PAMT page pair is used to store physical
memory metadata for the 2MB guest private memory after its S-EPT mapping
has been successfully split to 4KB.
Pass prealloc_split_cache (the per-VM split cache) to SEAMCALL wrapper
tdh_mem_page_demote() for dequeuing Dynamic PAMT pages from the cache.
Protect the cache dequeuing in KVM with prealloc_split_cache_lock.
Inside the wrapper tdh_mem_page_demote(), dequeue the Dynamic PAMT pages
into the guest_memory_pamt_page array and copy the page addresses to R12
and R13. Invoke SEAMCALL TDH_MEM_PAGE_DEMOTE using seamcall_saved_ret() to
handle registers above R11.
If SEAMCALL TDH_MEM_PAGE_DEMOTE fails, free the Dynamic PAMT pages, since
the guest private memory is still mapped at the 2MB level.
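Opportunistically, rename dpamt_args_array_ptr() to
dpamt_args_array_ptr_rdx() for tdh_phymem_pamt_{add/remove}, and introduce
dpamt_args_array_ptr_r12() for tdh_mem_page_demote() to populate registers
starting from R12. Also make MAX_TDX_ARG_SIZE() return the number of u64
registers rather than a byte count, since it is now used to size the
guest_memory_pamt_page array.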
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@...ux.intel.com>
Co-developed-by: Yan Zhao <yan.y.zhao@...el.com>
Signed-off-by: Yan Zhao <yan.y.zhao@...el.com>
---
v3:
- Split out as a new patch.
- Get pages from preallocate cache corresponding to DPAMT v4.
---
arch/x86/include/asm/tdx.h | 1 +
arch/x86/kvm/vmx/tdx.c | 5 ++-
arch/x86/virt/vmx/tdx/tdx.c | 76 ++++++++++++++++++++++++++-----------
3 files changed, 59 insertions(+), 23 deletions(-)
diff --git a/arch/x86/include/asm/tdx.h b/arch/x86/include/asm/tdx.h
index abe484045132..5fc7498392fd 100644
--- a/arch/x86/include/asm/tdx.h
+++ b/arch/x86/include/asm/tdx.h
@@ -251,6 +251,7 @@ u64 tdh_mng_create(struct tdx_td *td, u16 hkid);
u64 tdh_vp_create(struct tdx_td *td, struct tdx_vp *vp);
u64 tdh_mng_rd(struct tdx_td *td, u64 field, u64 *data);
u64 tdh_mem_page_demote(struct tdx_td *td, u64 gpa, int level, struct page *new_sept_page,
+ struct tdx_prealloc *prealloc,
u64 *ext_err1, u64 *ext_err2);
u64 tdh_mr_extend(struct tdx_td *td, u64 gpa, u64 *ext_err1, u64 *ext_err2);
u64 tdh_mr_finalize(struct tdx_td *td);
diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
index ec47bd799274..a11ff02a4f30 100644
--- a/arch/x86/kvm/vmx/tdx.c
+++ b/arch/x86/kvm/vmx/tdx.c
@@ -2021,8 +2021,11 @@ static int tdx_sept_split_private_spte(struct kvm *kvm, gfn_t gfn, enum pg_level
if (KVM_BUG_ON(ret, kvm))
return -EIO;
+ spin_lock(&kvm_tdx->prealloc_split_cache_lock);
err = tdh_do_no_vcpus(tdh_mem_page_demote, kvm, &kvm_tdx->td, gpa,
- tdx_level, new_sept_page, &entry, &level_state);
+ tdx_level, new_sept_page,
+ &kvm_tdx->prealloc_split_cache, &entry, &level_state);
+ spin_unlock(&kvm_tdx->prealloc_split_cache_lock);
if (TDX_BUG_ON_2(err, TDH_MEM_PAGE_DEMOTE, entry, level_state, kvm)) {
tdx_pamt_put(new_sept_page);
return -EIO;
diff --git a/arch/x86/virt/vmx/tdx/tdx.c b/arch/x86/virt/vmx/tdx/tdx.c
index 76963c563906..9917e4e7705f 100644
--- a/arch/x86/virt/vmx/tdx/tdx.c
+++ b/arch/x86/virt/vmx/tdx/tdx.c
@@ -1848,25 +1848,69 @@ u64 tdh_mng_rd(struct tdx_td *td, u64 field, u64 *data)
}
EXPORT_SYMBOL_GPL(tdh_mng_rd);
+static int alloc_pamt_array(u64 *pa_array, struct tdx_prealloc *prealloc);
+static void free_pamt_array(u64 *pa_array);
+/*
+ * The TDX spec treats the registers like an array, as they are ordered
+ * in the struct. The array size is limited by the number of registers,
+ * so define the max size it could be for worst case allocations and sanity
+ * checking.
+ */
+#define MAX_TDX_ARG_SIZE(reg) ((sizeof(struct tdx_module_args) - \
+ offsetof(struct tdx_module_args, reg)) / sizeof(u64))
+#define TDX_ARG_INDEX(reg) (offsetof(struct tdx_module_args, reg) / \
+ sizeof(u64))
+/*
+ * Treat the registers in the struct like an array that starts at R12, per
+ * the TDX spec. Do some sanity checks, and return an indexable type.
+ */
+static u64 *dpamt_args_array_ptr_r12(struct tdx_module_array_args *args)
+{
+ WARN_ON_ONCE(tdx_dpamt_entry_pages() > MAX_TDX_ARG_SIZE(r12));
+
+ return &args->args_array[TDX_ARG_INDEX(r12)];
+}
+
u64 tdh_mem_page_demote(struct tdx_td *td, u64 gpa, int level, struct page *new_sept_page,
+ struct tdx_prealloc *prealloc,
u64 *ext_err1, u64 *ext_err2)
{
- struct tdx_module_args args = {
- .rcx = gpa | level,
- .rdx = tdx_tdr_pa(td),
- .r8 = page_to_phys(new_sept_page),
+ bool dpamt = tdx_supports_dynamic_pamt(&tdx_sysinfo) && level == TDX_PS_2M;
+ u64 guest_memory_pamt_page[MAX_TDX_ARG_SIZE(r12)];
+ struct tdx_module_array_args args = {
+ .args.rcx = gpa | level,
+ .args.rdx = tdx_tdr_pa(td),
+ .args.r8 = page_to_phys(new_sept_page),
};
u64 ret;
if (!tdx_supports_demote_nointerrupt(&tdx_sysinfo))
return TDX_SW_ERROR;
+ if (dpamt) {
+ u64 *args_array = dpamt_args_array_ptr_r12(&args);
+
+ if (alloc_pamt_array(guest_memory_pamt_page, prealloc))
+ return TDX_SW_ERROR;
+
+ /*
+ * Copy PAMT page PAs of the guest memory into the struct per the
+ * TDX ABI
+ */
+ memcpy(args_array, guest_memory_pamt_page,
+ tdx_dpamt_entry_pages() * sizeof(*args_array));
+ }
+
/* Flush the new S-EPT page to be added */
tdx_clflush_page(new_sept_page);
- ret = seamcall_ret(TDH_MEM_PAGE_DEMOTE, &args);
- *ext_err1 = args.rcx;
- *ext_err2 = args.rdx;
+ ret = seamcall_saved_ret(TDH_MEM_PAGE_DEMOTE, &args.args);
+
+ *ext_err1 = args.args.rcx;
+ *ext_err2 = args.args.rdx;
+
+ if (dpamt && ret)
+ free_pamt_array(guest_memory_pamt_page);
return ret;
}
@@ -2104,23 +2148,11 @@ static struct page *alloc_dpamt_page(struct tdx_prealloc *prealloc)
return alloc_page(GFP_KERNEL_ACCOUNT);
}
-
-/*
- * The TDX spec treats the registers like an array, as they are ordered
- * in the struct. The array size is limited by the number or registers,
- * so define the max size it could be for worst case allocations and sanity
- * checking.
- */
-#define MAX_TDX_ARG_SIZE(reg) (sizeof(struct tdx_module_args) - \
- offsetof(struct tdx_module_args, reg))
-#define TDX_ARG_INDEX(reg) (offsetof(struct tdx_module_args, reg) / \
- sizeof(u64))
-
/*
* Treat struct the registers like an array that starts at RDX, per
* TDX spec. Do some sanitychecks, and return an indexable type.
*/
-static u64 *dpamt_args_array_ptr(struct tdx_module_array_args *args)
+static u64 *dpamt_args_array_ptr_rdx(struct tdx_module_array_args *args)
{
WARN_ON_ONCE(tdx_dpamt_entry_pages() > MAX_TDX_ARG_SIZE(rdx));
@@ -2188,7 +2220,7 @@ static u64 tdh_phymem_pamt_add(struct page *page, u64 *pamt_pa_array)
struct tdx_module_array_args args = {
.args.rcx = pamt_2mb_arg(page)
};
- u64 *dpamt_arg_array = dpamt_args_array_ptr(&args);
+ u64 *dpamt_arg_array = dpamt_args_array_ptr_rdx(&args);
/* Copy PAMT page PA's into the struct per the TDX ABI */
memcpy(dpamt_arg_array, pamt_pa_array,
@@ -2216,7 +2248,7 @@ static u64 tdh_phymem_pamt_remove(struct page *page, u64 *pamt_pa_array)
struct tdx_module_array_args args = {
.args.rcx = pamt_2mb_arg(page),
};
- u64 *args_array = dpamt_args_array_ptr(&args);
+ u64 *args_array = dpamt_args_array_ptr_rdx(&args);
u64 ret;
ret = seamcall_ret(TDH_PHYMEM_PAMT_REMOVE, &args.args);
--
2.43.2
Powered by blists - more mailing lists