Message-ID: <0ad78fb6ba88e5cac8b67eecf2851e393c58739c.camel@intel.com>
Date: Thu, 31 Oct 2024 18:57:20 +0000
From: "Edgecombe, Rick P" <rick.p.edgecombe@...el.com>
To: "Hansen, Dave" <dave.hansen@...el.com>, "Zhao, Yan Y"
<yan.y.zhao@...el.com>
CC: "sean.j.christopherson@...el.com" <sean.j.christopherson@...el.com>,
"seanjc@...gle.com" <seanjc@...gle.com>, "Huang, Kai" <kai.huang@...el.com>,
"binbin.wu@...ux.intel.com" <binbin.wu@...ux.intel.com>, "Yao, Yuan"
<yuan.yao@...el.com>, "Li, Xiaoyao" <xiaoyao.li@...el.com>,
"isaku.yamahata@...il.com" <isaku.yamahata@...il.com>,
"linux-kernel@...r.kernel.org" <linux-kernel@...r.kernel.org>,
"tony.lindgren@...ux.intel.com" <tony.lindgren@...ux.intel.com>,
"kvm@...r.kernel.org" <kvm@...r.kernel.org>, "pbonzini@...hat.com"
<pbonzini@...hat.com>, "Chatre, Reinette" <reinette.chatre@...el.com>,
"Yamahata, Isaku" <isaku.yamahata@...el.com>
Subject: Re: [PATCH v2 08/25] x86/virt/tdx: Add SEAMCALL wrappers for TDX page
cache management

On Thu, 2024-10-31 at 11:57 +0800, Yan Zhao wrote:
> When TD is tearing down,
> - TD pages are removed and freed when hkid is assigned, so
> tdh_phymem_page_reclaim() will not be called for them.
> - after vt_vm_destroy() releasing the hkid, kvm_arch_destroy_vm() calls
> kvm_destroy_vcpus(), kvm_mmu_uninit_tdp_mmu() and tdx_vm_free() to reclaim
> TDCX/TDVPR/EPT/TDR pages sequentially in a single thread.
>
> So, there should be no contentions expected for current KVM to call
> tdh_phymem_page_reclaim().

This ties into the question of how much of the wrappers should live in KVM code
vs arch/x86. I got the impression Dave would prefer not to see SEAMCALLs simply
get re-wrapped on KVM's side into what KVM really needs. Towards that, it could
be tempting to move tdx_reclaim_page() (see "[PATCH v2 17/25] KVM: TDX:
create/destroy VM structure") into arch/x86 and have arch/x86 handle the
tdx_clear_page() part too. That would also be more symmetric with what arch/x86
already does for clflush on the calls that hand pages to the TDX module.
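
For reference, the calls that hand pages to the TDX module already do the cache
flushing on the arch/x86 side, roughly like this (paraphrasing from memory, the
exact wrapper may differ):

	u64 tdh_mng_addcx(u64 tdr, u64 tdcs)
	{
		struct tdx_module_args args = {
			.rcx = tdcs,
			.rdx = tdr,
		};

		/* Flush dirty cachelines before the TDX module takes the page */
		clflush_cache_range(__va(tdcs), PAGE_SIZE);

		return seamcall(TDH_MNG_ADDCX, &args);
	}

Having reclaim clear the page in arch/x86 would mirror that.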

But the analysis of why we don't need to worry about TDX_OPERAND_BUSY is based
on KVM's current use of tdh_phymem_page_reclaim(). So KVM still has to be the
one to reason about TDX_OPERAND_BUSY, and the more we wrap the low-level
SEAMCALLs, the more brittle and spread out the dance around the TDX module
locks becomes.
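
That is, if KVM ever does need to care about contention here, it would end up
carrying something like this on top of the arch/x86 helper (rough sketch, not
tested, reusing the constants from the loop the diff below deletes):

	static u64 tdx_reclaim_page_retry(u64 pa, bool wbind_global_key)
	{
		u64 err;
		int i;

		for (i = TDX_SEAMCALL_RETRIES; i > 0; i--) {
			err = tdx_reclaim_page(pa, wbind_global_key);
			switch (err) {
			case TDX_OPERAND_BUSY | TDX_OPERAND_ID_RCX:
			case TDX_OPERAND_BUSY | TDX_OPERAND_ID_TDR:
				/* Lost the lock race inside the TDX module, back off and retry */
				cond_resched();
				continue;
			default:
				return err;
			}
		}
		return err;
	}

i.e. the BUSY knowledge stays in KVM even though the SEAMCALL itself moved.
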
I took a look at dropping the retry loop and moving tdx_reclaim_page() into
arch/x86 anyway:

 arch/x86/include/asm/tdx.h  |  3 +--
 arch/x86/kvm/vmx/tdx.c      | 74 ++++----------------------------------------------------------------------
 arch/x86/virt/vmx/tdx/tdx.c | 63 ++++++++++++++++++++++++++++++++++++++++++++++++++-------------
 3 files changed, 55 insertions(+), 85 deletions(-)

diff --git a/arch/x86/include/asm/tdx.h b/arch/x86/include/asm/tdx.h
index 051465261155..790d6d99d895 100644
--- a/arch/x86/include/asm/tdx.h
+++ b/arch/x86/include/asm/tdx.h
@@ -145,13 +145,12 @@ u64 tdh_vp_init(u64 tdvpr, u64 initial_rcx);
u64 tdh_vp_rd(u64 tdvpr, u64 field, u64 *data);
u64 tdh_vp_wr(u64 tdvpr, u64 field, u64 data, u64 mask);
u64 tdh_vp_init_apicid(u64 tdvpr, u64 initial_rcx, u32 x2apicid);
-u64 tdh_phymem_page_reclaim(u64 page, u64 *rcx, u64 *rdx, u64 *r8);
+u64 tdx_reclaim_page(u64 pa, bool wbind_global_key);
u64 tdh_mem_page_remove(u64 tdr, u64 gpa, u64 level, u64 *rcx, u64 *rdx);
u64 tdh_mem_sept_remove(u64 tdr, u64 gpa, u64 level, u64 *rcx, u64 *rdx);
u64 tdh_mem_track(u64 tdr);
u64 tdh_mem_range_unblock(u64 tdr, u64 gpa, u64 level, u64 *rcx, u64 *rdx);
u64 tdh_phymem_cache_wb(bool resume);
-u64 tdh_phymem_page_wbinvd_tdr(u64 tdr);
u64 tdh_phymem_page_wbinvd_hkid(u64 hpa, u64 hkid);
#else
static inline void tdx_init(void) { }
diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
index 0ee8ec86d02a..aca73d942344 100644
--- a/arch/x86/kvm/vmx/tdx.c
+++ b/arch/x86/kvm/vmx/tdx.c
@@ -291,67 +291,6 @@ static inline void tdx_disassociate_vp(struct kvm_vcpu *vcpu)
vcpu->cpu = -1;
}
-static void tdx_clear_page(unsigned long page_pa)
-{
- const void *zero_page = (const void *) __va(page_to_phys(ZERO_PAGE(0)));
- void *page = __va(page_pa);
- unsigned long i;
-
- /*
- * The page could have been poisoned. MOVDIR64B also clears
- * the poison bit so the kernel can safely use the page again.
- */
- for (i = 0; i < PAGE_SIZE; i += 64)
- movdir64b(page + i, zero_page);
- /*
- * MOVDIR64B store uses WC buffer. Prevent following memory reads
- * from seeing potentially poisoned cache.
- */
- __mb();
-}
-
-/* TDH.PHYMEM.PAGE.RECLAIM is allowed only when destroying the TD. */
-static int __tdx_reclaim_page(hpa_t pa)
-{
- u64 err, rcx, rdx, r8;
- int i;
-
- for (i = TDX_SEAMCALL_RETRIES; i > 0; i--) {
- err = tdh_phymem_page_reclaim(pa, &rcx, &rdx, &r8);
-
- /*
- * TDH.PHYMEM.PAGE.RECLAIM is allowed only when TD is shutdown.
- * state. i.e. destructing TD.
- * TDH.PHYMEM.PAGE.RECLAIM requires TDR and target page.
- * Because we're destructing TD, it's rare to contend with TDR.
- */
- switch (err) {
- case TDX_OPERAND_BUSY | TDX_OPERAND_ID_RCX:
- case TDX_OPERAND_BUSY | TDX_OPERAND_ID_TDR:
- cond_resched();
- continue;
- default:
- goto out;
- }
- }
-
-out:
- if (WARN_ON_ONCE(err)) {
- pr_tdx_error_3(TDH_PHYMEM_PAGE_RECLAIM, err, rcx, rdx, r8);
- return -EIO;
- }
- return 0;
-}
-
-static int tdx_reclaim_page(hpa_t pa)
-{
- int r;
-
- r = __tdx_reclaim_page(pa);
- if (!r)
- tdx_clear_page(pa);
- return r;
-}
/*
@@ -365,7 +304,7 @@ static void tdx_reclaim_control_page(unsigned long ctrl_page_pa)
* Leak the page if the kernel failed to reclaim the page.
* The kernel cannot use it safely anymore.
*/
- if (tdx_reclaim_page(ctrl_page_pa))
+ if (tdx_reclaim_page(ctrl_page_pa, false))
return;
free_page((unsigned long)__va(ctrl_page_pa));
@@ -581,20 +520,16 @@ static void tdx_reclaim_td_control_pages(struct kvm *kvm)
if (!kvm_tdx->tdr_pa)
return;
- if (__tdx_reclaim_page(kvm_tdx->tdr_pa))
- return;
-
/*
* Use a SEAMCALL to ask the TDX module to flush the cache based on the
* KeyID. TDX module may access TDR while operating on TD (Especially
* when it is reclaiming TDCS).
*/
- err = tdh_phymem_page_wbinvd_tdr(kvm_tdx->tdr_pa);
+ err = tdx_reclaim_page(kvm_tdx->tdr_pa, true);
if (KVM_BUG_ON(err, kvm)) {
- pr_tdx_error(TDH_PHYMEM_PAGE_WBINVD, err);
+ pr_tdx_error(tdx_reclaim_page, err);
return;
}
- tdx_clear_page(kvm_tdx->tdr_pa);
free_page((unsigned long)__va(kvm_tdx->tdr_pa));
kvm_tdx->tdr_pa = 0;
@@ -1694,7 +1629,6 @@ static int tdx_sept_drop_private_spte(struct kvm *kvm, gfn_t gfn,
pr_tdx_error(TDH_PHYMEM_PAGE_WBINVD, err);
return -EIO;
}
- tdx_clear_page(hpa);
tdx_unpin(kvm, pfn);
return 0;
}
@@ -1805,7 +1739,7 @@ int tdx_sept_free_private_spt(struct kvm *kvm, gfn_t gfn,
* The HKID assigned to this TD was already freed and cache was
* already flushed. We don't have to flush again.
*/
- return tdx_reclaim_page(__pa(private_spt));
+ return tdx_reclaim_page(__pa(private_spt), false);
}
int tdx_sept_remove_private_spte(struct kvm *kvm, gfn_t gfn,
diff --git a/arch/x86/virt/vmx/tdx/tdx.c b/arch/x86/virt/vmx/tdx/tdx.c
index bad83f6a3b0c..bb7cdb867581 100644
--- a/arch/x86/virt/vmx/tdx/tdx.c
+++ b/arch/x86/virt/vmx/tdx/tdx.c
@@ -1892,7 +1892,7 @@ u64 tdh_vp_init_apicid(u64 tdvpr, u64 initial_rcx, u32 x2apicid)
}
EXPORT_SYMBOL_GPL(tdh_vp_init_apicid);
-u64 tdh_phymem_page_reclaim(u64 page, u64 *rcx, u64 *rdx, u64 *r8)
+static u64 tdh_phymem_page_reclaim(u64 page, u64 *rcx, u64 *rdx, u64 *r8)
{
struct tdx_module_args args = {
.rcx = page,
@@ -1914,7 +1914,49 @@ u64 tdh_phymem_page_reclaim(u64 page, u64 *rcx, u64 *rdx, u64 *r8)
return ret;
}
-EXPORT_SYMBOL_GPL(tdh_phymem_page_reclaim);
+
+static void tdx_clear_page(unsigned long page_pa)
+{
+ const void *zero_page = (const void *) __va(page_to_phys(ZERO_PAGE(0)));
+ void *page = __va(page_pa);
+ unsigned long i;
+
+ /*
+ * The page could have been poisoned. MOVDIR64B also clears
+ * the poison bit so the kernel can safely use the page again.
+ */
+ for (i = 0; i < PAGE_SIZE; i += 64)
+ movdir64b(page + i, zero_page);
+ /*
+ * MOVDIR64B store uses WC buffer. Prevent following memory reads
+ * from seeing potentially poisoned cache.
+ */
+ __mb();
+}
+
+/*
+ * tdx_reclaim_page() calls tdh_phymem_page_reclaim() internally. Callers
+ * should be prepared to handle TDX_OPERAND_BUSY.
+ * If the return code is not an error, the page has been cleared with
+ * MOVDIR64B.
+ */
+u64 tdx_reclaim_page(u64 pa, bool wbind_global_key)
+{
+ u64 rcx, rdx, r8;
+ u64 r;
+
+ r = tdh_phymem_page_reclaim(pa, &rcx, &rdx, &r8);
+ if (r)
+ return r;
+
+ /* tdh_phymem_page_wbinvd_hkid() will do tdx_clear_page() */
+ if (wbind_global_key)
+ return tdh_phymem_page_wbinvd_hkid(pa, tdx_global_keyid);
+
+ tdx_clear_page(pa);
+
+ return r;
+}
+EXPORT_SYMBOL_GPL(tdx_reclaim_page);
u64 tdh_mem_page_remove(u64 tdr, u64 gpa, u64 level, u64 *rcx, u64 *rdx)
{
@@ -1987,22 +2029,17 @@ u64 tdh_phymem_cache_wb(bool resume)
}
EXPORT_SYMBOL_GPL(tdh_phymem_cache_wb);
-u64 tdh_phymem_page_wbinvd_tdr(u64 tdr)
-{
- struct tdx_module_args args = {};
-
- args.rcx = tdr | ((u64)tdx_global_keyid << boot_cpu_data.x86_phys_bits);
-
- return seamcall(TDH_PHYMEM_PAGE_WBINVD, &args);
-}
-EXPORT_SYMBOL_GPL(tdh_phymem_page_wbinvd_tdr);
-
u64 tdh_phymem_page_wbinvd_hkid(u64 hpa, u64 hkid)
{
struct tdx_module_args args = {};
+ u64 err;
args.rcx = hpa | (hkid << boot_cpu_data.x86_phys_bits);
- return seamcall(TDH_PHYMEM_PAGE_WBINVD, &args);
+ err = seamcall(TDH_PHYMEM_PAGE_WBINVD, &args);
+ if (!err)
+ tdx_clear_page(hpa);
+
+ return err;
}
EXPORT_SYMBOL_GPL(tdh_phymem_page_wbinvd_hkid);
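
With that, the KVM call sites from the hunks above boil down to:

	/* Control pages: HKID already freed and cache already flushed, no wbinvd */
	if (tdx_reclaim_page(ctrl_page_pa, false))
		return;

	/* TDR: TDX module may have dirtied it with the global KeyID, wbinvd too */
	err = tdx_reclaim_page(kvm_tdx->tdr_pa, true);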