Message-ID: <xjl4nloxtmp7jrfus5rnij6xz6ut6p7riixj7mwt32zlkc7k27@xvallgw2ei7r>
Date: Fri, 23 May 2025 12:42:27 +0300
From: "kirill.shutemov@...ux.intel.com" <kirill.shutemov@...ux.intel.com>
To: "Huang, Kai" <kai.huang@...el.com>
Cc: "pbonzini@...hat.com" <pbonzini@...hat.com>,
"seanjc@...gle.com" <seanjc@...gle.com>, "Edgecombe, Rick P" <rick.p.edgecombe@...el.com>,
"bp@...en8.de" <bp@...en8.de>, "dave.hansen@...ux.intel.com" <dave.hansen@...ux.intel.com>,
"x86@...nel.org" <x86@...nel.org>, "mingo@...hat.com" <mingo@...hat.com>,
"Zhao, Yan Y" <yan.y.zhao@...el.com>, "tglx@...utronix.de" <tglx@...utronix.de>,
"kvm@...r.kernel.org" <kvm@...r.kernel.org>, "linux-coco@...ts.linux.dev" <linux-coco@...ts.linux.dev>,
"Yamahata, Isaku" <isaku.yamahata@...el.com>, "linux-kernel@...r.kernel.org" <linux-kernel@...r.kernel.org>
Subject: Re: [RFC, PATCH 05/12] KVM: TDX: Add tdx_pamt_get()/put() helpers
On Mon, May 05, 2025 at 12:44:26PM +0000, Huang, Kai wrote:
> On Fri, 2025-05-02 at 16:08 +0300, Kirill A. Shutemov wrote:
> > Introduce a pair of helpers to allocate and free PAMT memory for a given
> > 2M range. The range is represented by a struct page for any memory in
> > the range, and the PAMT memory by a list of pages.
> >
> > Use per-2M refcounts to detect when PAMT memory has to be allocated and
> > when it can be freed.
> >
> > The pamt_lock spinlock serializes races between multiple tdx_pamt_add()
> > calls as well as between tdx_pamt_add() and tdx_pamt_put().
>
> Maybe elaborate a little bit on _why_ a spinlock is used?
>
> >
> > Signed-off-by: Kirill A. Shutemov <kirill.shutemov@...ux.intel.com>
> > ---
> > arch/x86/include/asm/tdx.h | 2 +
> > arch/x86/kvm/vmx/tdx.c | 123 +++++++++++++++++++++++++++++++++++
> > arch/x86/kvm/vmx/tdx_errno.h | 1 +
> > 3 files changed, 126 insertions(+)
> >
> > diff --git a/arch/x86/include/asm/tdx.h b/arch/x86/include/asm/tdx.h
> > index 8091bf5b43cc..42449c054938 100644
> > --- a/arch/x86/include/asm/tdx.h
> > +++ b/arch/x86/include/asm/tdx.h
> > @@ -135,6 +135,8 @@ static inline int tdx_nr_pamt_pages(const struct tdx_sys_info *sysinfo)
> > return sysinfo->tdmr.pamt_4k_entry_size * PTRS_PER_PTE / PAGE_SIZE;
> > }
> >
> > +atomic_t *tdx_get_pamt_refcount(unsigned long hpa);
> > +
>
> This at least needs to be in the same patch which exports it. But as replied to
> patch 2, I think we should just move the code in this patch to TDX core code.
>
> > int tdx_guest_keyid_alloc(void);
> > u32 tdx_get_nr_guest_keyids(void);
> > void tdx_guest_keyid_free(unsigned int keyid);
> > diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
> > index b952bc673271..ea7e2d93fb44 100644
> > --- a/arch/x86/kvm/vmx/tdx.c
> > +++ b/arch/x86/kvm/vmx/tdx.c
> > @@ -207,6 +207,10 @@ static bool tdx_operand_busy(u64 err)
> > return (err & TDX_SEAMCALL_STATUS_MASK) == TDX_OPERAND_BUSY;
> > }
> >
> > +static bool tdx_hpa_range_not_free(u64 err)
> > +{
> > + return (err & TDX_SEAMCALL_STATUS_MASK) == TDX_HPA_RANGE_NOT_FREE;
> > +}
> >
> > /*
> > * A per-CPU list of TD vCPUs associated with a given CPU.
> > @@ -276,6 +280,125 @@ static inline void tdx_disassociate_vp(struct kvm_vcpu *vcpu)
> > vcpu->cpu = -1;
> > }
> >
> > +static DEFINE_SPINLOCK(pamt_lock);
> > +
> > +static void tdx_free_pamt_pages(struct list_head *pamt_pages)
> > +{
> > + struct page *page;
> > +
> > + while ((page = list_first_entry_or_null(pamt_pages, struct page, lru))) {
> > + list_del(&page->lru);
> > + __free_page(page);
> > + }
> > +}
> > +
> > +static int tdx_alloc_pamt_pages(struct list_head *pamt_pages)
> > +{
> > + for (int i = 0; i < tdx_nr_pamt_pages(tdx_sysinfo); i++) {
> > + struct page *page = alloc_page(GFP_KERNEL);
> > + if (!page)
> > + goto fail;
> > + list_add(&page->lru, pamt_pages);
> > + }
> > + return 0;
> > +fail:
> > + tdx_free_pamt_pages(pamt_pages);
> > + return -ENOMEM;
> > +}
> > +
> > +static int tdx_pamt_add(atomic_t *pamt_refcount, unsigned long hpa,
> > + struct list_head *pamt_pages)
> > +{
> > + u64 err;
> > +
> > + hpa = ALIGN_DOWN(hpa, SZ_2M);
> > +
> > + spin_lock(&pamt_lock);
>
> Just curious, can the lock be per-2M-range?
>
> > +
> > + /* Lost race to other tdx_pamt_add() */
> > + if (atomic_read(pamt_refcount) != 0) {
> > + atomic_inc(pamt_refcount);
> > + spin_unlock(&pamt_lock);
> > + tdx_free_pamt_pages(pamt_pages);
>
> It's unfortunate that multiple callers of tdx_pamt_add() need to first
> allocate PAMT pages outside the spinlock and then free them here.
>
> I am thinking if we make tdx_pamt_add() return:
>
> * > 0: PAMT pages already added (another tdx_pamt_add() won)
> * = 0: PAMT pages added successfully
> * < 0: error code
>
> .. then we at least could move tdx_free_pamt_pages() to the caller too.
>
> > + return 0;
> > + }
> > +
> > + err = tdh_phymem_pamt_add(hpa | TDX_PS_2M, pamt_pages);
> > +
> > + if (err)
> > + tdx_free_pamt_pages(pamt_pages);
>
> Seems we are calling tdx_free_pamt_pages() within the spinlock, which is not
> consistent with the case above where another tdx_pamt_add() has won the race.
>
> > +
> > + /*
> > + * tdx_hpa_range_not_free() is true if current task won race
> > + * against tdx_pamt_put().
> > + */
> > + if (err && !tdx_hpa_range_not_free(err)) {
> > + spin_unlock(&pamt_lock);
> > + pr_tdx_error(TDH_PHYMEM_PAMT_ADD, err);
> > + return -EIO;
> > + }
>
> I had a hard time figuring out why we need to handle tdx_hpa_range_not_free()
> explicitly. IIUC, it is because atomic_dec_and_test() is used in
> tdx_pamt_put(), in which case the atomic_t can reach 0 outside of the
> spinlock, thus tdh_phymem_pamt_add() can be called while there are still
> PAMT pages populated.
>
> But ...
>
> > +
> > + atomic_set(pamt_refcount, 1);
> > + spin_unlock(&pamt_lock);
> > + return 0;
> > +}
> > +
> > +static int tdx_pamt_get(struct page *page)
> > +{
> > + unsigned long hpa = page_to_phys(page);
> > + atomic_t *pamt_refcount;
> > + LIST_HEAD(pamt_pages);
> > +
> > + if (!tdx_supports_dynamic_pamt(tdx_sysinfo))
> > + return 0;
> > +
> > + pamt_refcount = tdx_get_pamt_refcount(hpa);
> > + WARN_ON_ONCE(atomic_read(pamt_refcount) < 0);
> > +
> > + if (atomic_inc_not_zero(pamt_refcount))
> > + return 0;
>
> ... if we set the initial value of pamt_refcount to -1, and use
> atomic_inc_unless_negetive() here:
>
> if (atomic_inc_unless_negative(pamt_refcount))
> return 0;
>
> if (tdx_alloc_pamt_pages(&pamt_pages))
> return -ENOMEM;
>
> spin_lock(&pamt_lock);
> ret = tdx_pamt_add(hpa, &pamt_pages);
> if (ret >= 0)
> atomic_inc(pamt_refcount, 0);
> spin_unlock(&pamt_lock);
>
> /*
> * If another tdx_pamt_get() won the race, or in case of
> * error, PAMT pages are not used and can be freed.
> */
> if (ret)
> tdx_free_pamt_pages(&pamt_pages);
>
> return ret >= 0 ? 0 : ret;
>
> and ...
>
> > +
> > + if (tdx_alloc_pamt_pages(&pamt_pages))
> > + return -ENOMEM;
> > +
> > + return tdx_pamt_add(pamt_refcount, hpa, &pamt_pages);
> > +}
> > +
> > +static void tdx_pamt_put(struct page *page)
> > +{
> > + unsigned long hpa = page_to_phys(page);
> > + atomic_t *pamt_refcount;
> > + LIST_HEAD(pamt_pages);
> > + u64 err;
> > +
> > + if (!tdx_supports_dynamic_pamt(tdx_sysinfo))
> > + return;
> > +
> > + hpa = ALIGN_DOWN(hpa, SZ_2M);
> > +
> > + pamt_refcount = tdx_get_pamt_refcount(hpa);
> > + if (!atomic_dec_and_test(pamt_refcount))
> > + return;
>
> ... use atomic_dec_if_positive() here, we should be able to avoid the special
> handling of tdx_hpa_range_not_free() in tdx_pamt_get(). Something like:
>
> if (atomic_dec_if_positive(pamt_refcount) >= 0)
> return;
>
> spin_lock(&pamt_lock);
>
> /* tdx_pamt_get() called more than once */
> if (atomic_read(pamt_refcount) > 0) {
This check does nothing to protect against a parallel increment of the
counter: we get here with pamt_refcount == 0, so a parallel
atomic_inc_unless_negative() is free to bump the counter in the fast path,
without taking the lock, right after this check.
So the code below would free PAMT memory while there is still a user.
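
To spell out the interleaving I have in mind (a sketch against the proposal
above, with CPU A in tdx_pamt_put()'s slow path and CPU B in tdx_pamt_get()'s
fast path; the exact call sites are illustrative):

  CPU A: tdx_pamt_put()                 CPU B: tdx_pamt_get()
  ---------------------                 ---------------------
  spin_lock(&pamt_lock);
  atomic_read(pamt_refcount) == 0,
  proceed to remove PAMT
                                        atomic_inc_unless_negative():
                                        0 -> 1, succeeds in the fast
                                        path without taking pamt_lock
  tdh_phymem_pamt_remove(...);
  atomic_set(pamt_refcount, -1);
  spin_unlock(&pamt_lock);
  tdx_free_pamt_pages(&pamt_pages);
                                        CPU B proceeds assuming the 2M
                                        range still has PAMT backing.
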
> spin_unlock(&pamt_lock);
> return;
> }
>
> err = tdh_phymem_pamt_remove(hpa | TDX_PS_2M, &pamt_pages);
> atomic_set(pamt_refcount, -1);
> spin_unlock(&pamt_lock);
>
> tdx_free_pamt_pages(&pamt_pages);
>
> Hmm.. am I missing anything?
>
> > +
> > + spin_lock(&pamt_lock);
> > +
> > + /* Lost race against tdx_pamt_add()? */
> > + if (atomic_read(pamt_refcount) != 0) {
> > + spin_unlock(&pamt_lock);
> > + return;
> > + }
> > +
> > + err = tdh_phymem_pamt_remove(hpa | TDX_PS_2M, &pamt_pages);
> > + spin_unlock(&pamt_lock);
> > +
> > + if (err) {
> > + pr_tdx_error(TDH_PHYMEM_PAMT_REMOVE, err);
> > + return;
> > + }
> > +
> > + tdx_free_pamt_pages(&pamt_pages);
> > +}
> > +
>
--
Kiryl Shutsemau / Kirill A. Shutemov