Message-ID: <0ad78fb6ba88e5cac8b67eecf2851e393c58739c.camel@intel.com>
Date: Thu, 31 Oct 2024 18:57:20 +0000
From: "Edgecombe, Rick P" <rick.p.edgecombe@...el.com>
To: "Hansen, Dave" <dave.hansen@...el.com>, "Zhao, Yan Y"
	<yan.y.zhao@...el.com>
CC: "sean.j.christopherson@...el.com" <sean.j.christopherson@...el.com>,
	"seanjc@...gle.com" <seanjc@...gle.com>, "Huang, Kai" <kai.huang@...el.com>,
	"binbin.wu@...ux.intel.com" <binbin.wu@...ux.intel.com>, "Yao, Yuan"
	<yuan.yao@...el.com>, "Li, Xiaoyao" <xiaoyao.li@...el.com>,
	"isaku.yamahata@...il.com" <isaku.yamahata@...il.com>,
	"linux-kernel@...r.kernel.org" <linux-kernel@...r.kernel.org>,
	"tony.lindgren@...ux.intel.com" <tony.lindgren@...ux.intel.com>,
	"kvm@...r.kernel.org" <kvm@...r.kernel.org>, "pbonzini@...hat.com"
	<pbonzini@...hat.com>, "Chatre, Reinette" <reinette.chatre@...el.com>,
	"Yamahata, Isaku" <isaku.yamahata@...el.com>
Subject: Re: [PATCH v2 08/25] x86/virt/tdx: Add SEAMCALL wrappers for TDX page
 cache management

On Thu, 2024-10-31 at 11:57 +0800, Yan Zhao wrote:
> When the TD is being torn down:
> - TD pages are removed and freed while the hkid is still assigned, so
>   tdh_phymem_page_reclaim() will not be called for them.
> - After vt_vm_destroy() releases the hkid, kvm_arch_destroy_vm() calls
>   kvm_destroy_vcpus(), kvm_mmu_uninit_tdp_mmu() and tdx_vm_free() to reclaim
>   the TDCX/TDVPR/EPT/TDR pages sequentially in a single thread.
> 
> So, no contention is expected for current KVM's calls to
> tdh_phymem_page_reclaim().
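
(For reference, the teardown order described above, sketched as illustrative
C; the call names come from the quoted text, the structure and per-call
comments are just my reading of it, not the real call graph:)

	static void td_teardown_order(struct kvm *kvm)
	{
		vt_vm_destroy(kvm);		/* releases the hkid first */

		/* kvm_arch_destroy_vm() then reclaims, in one thread: */
		kvm_destroy_vcpus(kvm);		/* TDVPR/TDCX pages */
		kvm_mmu_uninit_tdp_mmu(kvm);	/* S-EPT pages */
		tdx_vm_free(kvm);		/* remaining TDCS/TDR pages */
	}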

This ties into the question of how much of the wrappers should live in KVM code
vs arch/x86. I got the impression Dave would prefer that the SEAMCALLs not just
get wrapped on KVM's side with whatever KVM really needs. Toward that end, it
could be tempting to move tdx_reclaim_page() (see "[PATCH v2 17/25] KVM: TDX:
create/destroy VM structure") into arch/x86 and have arch/x86 handle the
tdx_clear_page() part too. That would also be more symmetric with what arch/x86
already does for clflush on the calls that hand pages to the TDX module.

But the analysis of why we don't need to worry about TDX_OPERAND_BUSY is based
on KVM's current use of tdh_phymem_page_reclaim(). So KVM still has to be the
one to reason about TDX_OPERAND_BUSY, and the more we wrap the low-level
SEAMCALLs, the more brittle and spread out the dance around the TDX module
locks becomes.
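
If the BUSY handling does need to live on the KVM side with this split, it
could stay a thin retry around the new helper, something like the below
(purely illustrative; kvm_tdx_reclaim_page() is a made-up name, and the
constants are the ones today's loop already uses):

	static int kvm_tdx_reclaim_page(hpa_t pa)
	{
		u64 err;
		int i;

		for (i = TDX_SEAMCALL_RETRIES; i > 0; i--) {
			err = tdx_reclaim_page(pa, false);
			switch (err) {
			case TDX_OPERAND_BUSY | TDX_OPERAND_ID_RCX:
			case TDX_OPERAND_BUSY | TDX_OPERAND_ID_TDR:
				/* Contended TDX module lock, try again. */
				cond_resched();
				continue;
			}
			break;
		}

		return WARN_ON_ONCE(err) ? -EIO : 0;
	}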

I took a look at dropping the retry loop and moving tdx_reclaim_page() into
arch/x86 anyway:

 arch/x86/include/asm/tdx.h  |  3 +--
 arch/x86/kvm/vmx/tdx.c      | 74 ++++----------------------------------------------------------------------
 arch/x86/virt/vmx/tdx/tdx.c | 63 ++++++++++++++++++++++++++++++++++++++++++++++++++-------------
 3 files changed, 55 insertions(+), 85 deletions(-)


diff --git a/arch/x86/include/asm/tdx.h b/arch/x86/include/asm/tdx.h
index 051465261155..790d6d99d895 100644
--- a/arch/x86/include/asm/tdx.h
+++ b/arch/x86/include/asm/tdx.h
@@ -145,13 +145,12 @@ u64 tdh_vp_init(u64 tdvpr, u64 initial_rcx);
 u64 tdh_vp_rd(u64 tdvpr, u64 field, u64 *data);
 u64 tdh_vp_wr(u64 tdvpr, u64 field, u64 data, u64 mask);
 u64 tdh_vp_init_apicid(u64 tdvpr, u64 initial_rcx, u32 x2apicid);
-u64 tdh_phymem_page_reclaim(u64 page, u64 *rcx, u64 *rdx, u64 *r8);
+u64 tdx_reclaim_page(u64 pa, bool wbind_global_key);
 u64 tdh_mem_page_remove(u64 tdr, u64 gpa, u64 level, u64 *rcx, u64 *rdx);
 u64 tdh_mem_sept_remove(u64 tdr, u64 gpa, u64 level, u64 *rcx, u64 *rdx);
 u64 tdh_mem_track(u64 tdr);
 u64 tdh_mem_range_unblock(u64 tdr, u64 gpa, u64 level, u64 *rcx, u64 *rdx);
 u64 tdh_phymem_cache_wb(bool resume);
-u64 tdh_phymem_page_wbinvd_tdr(u64 tdr);
 u64 tdh_phymem_page_wbinvd_hkid(u64 hpa, u64 hkid);
 #else
 static inline void tdx_init(void) { }
diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
index 0ee8ec86d02a..aca73d942344 100644
--- a/arch/x86/kvm/vmx/tdx.c
+++ b/arch/x86/kvm/vmx/tdx.c
@@ -291,67 +291,6 @@ static inline void tdx_disassociate_vp(struct kvm_vcpu *vcpu)
        vcpu->cpu = -1;
 }
 
-static void tdx_clear_page(unsigned long page_pa)
-{
-       const void *zero_page = (const void *) __va(page_to_phys(ZERO_PAGE(0)));
-       void *page = __va(page_pa);
-       unsigned long i;
-
-       /*
-        * The page could have been poisoned.  MOVDIR64B also clears
-        * the poison bit so the kernel can safely use the page again.
-        */
-       for (i = 0; i < PAGE_SIZE; i += 64)
-               movdir64b(page + i, zero_page);
-       /*
-        * MOVDIR64B store uses WC buffer.  Prevent following memory reads
-        * from seeing potentially poisoned cache.
-        */
-       __mb();
-}
-
-/* TDH.PHYMEM.PAGE.RECLAIM is allowed only when destroying the TD. */
-static int __tdx_reclaim_page(hpa_t pa)
-{
-       u64 err, rcx, rdx, r8;
-       int i;
-
-       for (i = TDX_SEAMCALL_RETRIES; i > 0; i--) {
-               err = tdh_phymem_page_reclaim(pa, &rcx, &rdx, &r8);
-
-               /*
-                * TDH.PHYMEM.PAGE.RECLAIM is allowed only when TD is shutdown.
-                * state.  i.e. destructing TD.
-                * TDH.PHYMEM.PAGE.RECLAIM requires TDR and target page.
-                * Because we're destructing TD, it's rare to contend with TDR.
-                */
-               switch (err) {
-               case TDX_OPERAND_BUSY | TDX_OPERAND_ID_RCX:
-               case TDX_OPERAND_BUSY | TDX_OPERAND_ID_TDR:
-                       cond_resched();
-                       continue;
-               default:
-                       goto out;
-               }
-       }
-
-out:
-       if (WARN_ON_ONCE(err)) {
-               pr_tdx_error_3(TDH_PHYMEM_PAGE_RECLAIM, err, rcx, rdx, r8);
-               return -EIO;
-       }
-       return 0;
-}
-
-static int tdx_reclaim_page(hpa_t pa)
-{
-       int r;
-
-       r = __tdx_reclaim_page(pa);
-       if (!r)
-               tdx_clear_page(pa);
-       return r;
-}
 
 
 /*
@@ -365,7 +304,7 @@ static void tdx_reclaim_control_page(unsigned long ctrl_page_pa)
         * Leak the page if the kernel failed to reclaim the page.
         * The kernel cannot use it safely anymore.
         */
-       if (tdx_reclaim_page(ctrl_page_pa))
+       if (tdx_reclaim_page(ctrl_page_pa, false))
                return;
 
        free_page((unsigned long)__va(ctrl_page_pa));
@@ -581,20 +520,16 @@ static void tdx_reclaim_td_control_pages(struct kvm *kvm)
        if (!kvm_tdx->tdr_pa)
                return;
 
-       if (__tdx_reclaim_page(kvm_tdx->tdr_pa))
-               return;
-
        /*
         * Use a SEAMCALL to ask the TDX module to flush the cache based on the
         * KeyID. TDX module may access TDR while operating on TD (Especially
         * when it is reclaiming TDCS).
         */
-       err = tdh_phymem_page_wbinvd_tdr(kvm_tdx->tdr_pa);
+       err = tdx_reclaim_page(kvm_tdx->tdr_pa, true);
        if (KVM_BUG_ON(err, kvm)) {
-               pr_tdx_error(TDH_PHYMEM_PAGE_WBINVD, err);
+               pr_tdx_error(tdx_reclaim_page, err);
                return;
        }
-       tdx_clear_page(kvm_tdx->tdr_pa);
 
        free_page((unsigned long)__va(kvm_tdx->tdr_pa));
        kvm_tdx->tdr_pa = 0;
@@ -1694,7 +1629,6 @@ static int tdx_sept_drop_private_spte(struct kvm *kvm, gfn_t gfn,
                pr_tdx_error(TDH_PHYMEM_PAGE_WBINVD, err);
                return -EIO;
        }
-       tdx_clear_page(hpa);
        tdx_unpin(kvm, pfn);
        return 0;
 }
@@ -1805,7 +1739,7 @@ int tdx_sept_free_private_spt(struct kvm *kvm, gfn_t gfn,
         * The HKID assigned to this TD was already freed and cache was
         * already flushed. We don't have to flush again.
         */
-       return tdx_reclaim_page(__pa(private_spt));
+       return tdx_reclaim_page(__pa(private_spt), false);
 }
 
 int tdx_sept_remove_private_spte(struct kvm *kvm, gfn_t gfn,
diff --git a/arch/x86/virt/vmx/tdx/tdx.c b/arch/x86/virt/vmx/tdx/tdx.c
index bad83f6a3b0c..bb7cdb867581 100644
--- a/arch/x86/virt/vmx/tdx/tdx.c
+++ b/arch/x86/virt/vmx/tdx/tdx.c
@@ -1892,7 +1892,7 @@ u64 tdh_vp_init_apicid(u64 tdvpr, u64 initial_rcx, u32 x2apicid)
 }
 EXPORT_SYMBOL_GPL(tdh_vp_init_apicid);
 
-u64 tdh_phymem_page_reclaim(u64 page, u64 *rcx, u64 *rdx, u64 *r8)
+static u64 tdh_phymem_page_reclaim(u64 page, u64 *rcx, u64 *rdx, u64 *r8)
 {
        struct tdx_module_args args = {
                .rcx = page,
@@ -1914,7 +1914,49 @@ u64 tdh_phymem_page_reclaim(u64 page, u64 *rcx, u64 *rdx, u64 *r8)
 
        return ret;
 }
-EXPORT_SYMBOL_GPL(tdh_phymem_page_reclaim);
+
+static void tdx_clear_page(unsigned long page_pa)
+{
+       const void *zero_page = (const void *) __va(page_to_phys(ZERO_PAGE(0)));
+       void *page = __va(page_pa);
+       unsigned long i;
+
+       /*
+        * The page could have been poisoned.  MOVDIR64B also clears
+        * the poison bit so the kernel can safely use the page again.
+        */
+       for (i = 0; i < PAGE_SIZE; i += 64)
+               movdir64b(page + i, zero_page);
+       /*
+        * MOVDIR64B store uses WC buffer.  Prevent following memory reads
+        * from seeing potentially poisoned cache.
+        */
+       __mb();
+}
+
+/*
+ * tdx_reclaim_page() calls tdh_phymem_page_reclaim() internally. Callers
+ * should be prepared to handle TDX_OPERAND_BUSY.
+ * If the return code is not an error, the page has been cleared with
+ * MOVDIR64B.
+ */
+u64 tdx_reclaim_page(u64 pa, bool wbind_global_key)
+{
+       u64 rcx, rdx, r8;
+       u64 r;
+
+       r = tdh_phymem_page_reclaim(pa, &rcx, &rdx, &r8);
+       if (r)
+               return r;
+
+       /* tdh_phymem_page_wbinvd_hkid() will do tdx_clear_page() */
+       if (wbind_global_key)
+               return tdh_phymem_page_wbinvd_hkid(pa, tdx_global_keyid);
+
+       tdx_clear_page(pa);
+
+       return r;
+}
+EXPORT_SYMBOL_GPL(tdx_reclaim_page);
 
 u64 tdh_mem_page_remove(u64 tdr, u64 gpa, u64 level, u64 *rcx, u64 *rdx)
 {
@@ -1987,22 +2029,17 @@ u64 tdh_phymem_cache_wb(bool resume)
 }
 EXPORT_SYMBOL_GPL(tdh_phymem_cache_wb);
 
-u64 tdh_phymem_page_wbinvd_tdr(u64 tdr)
-{
-       struct tdx_module_args args = {};
-
-       args.rcx = tdr | ((u64)tdx_global_keyid << boot_cpu_data.x86_phys_bits);
-
-       return seamcall(TDH_PHYMEM_PAGE_WBINVD, &args);
-}
-EXPORT_SYMBOL_GPL(tdh_phymem_page_wbinvd_tdr);
-
 u64 tdh_phymem_page_wbinvd_hkid(u64 hpa, u64 hkid)
 {
        struct tdx_module_args args = {};
+       u64 err;
 
        args.rcx = hpa | (hkid << boot_cpu_data.x86_phys_bits);
 
-       return seamcall(TDH_PHYMEM_PAGE_WBINVD, &args);
+       err = seamcall(TDH_PHYMEM_PAGE_WBINVD, &args);
+       if (!err)
+               tdx_clear_page(hpa);
+
+       return err;
 }
 EXPORT_SYMBOL_GPL(tdh_phymem_page_wbinvd_hkid);
