Message-ID: <aba17e84f56f2a51e7d3cf0e286a898adf66dc46.camel@intel.com>
Date: Wed, 25 Jun 2025 22:38:42 +0000
From: "Edgecombe, Rick P" <rick.p.edgecombe@...el.com>
To: "kirill.shutemov@...ux.intel.com" <kirill.shutemov@...ux.intel.com>,
"pbonzini@...hat.com" <pbonzini@...hat.com>, "seanjc@...gle.com"
<seanjc@...gle.com>, "dave.hansen@...ux.intel.com"
<dave.hansen@...ux.intel.com>
CC: "Gao, Chao" <chao.gao@...el.com>, "bp@...en8.de" <bp@...en8.de>, "Huang,
Kai" <kai.huang@...el.com>, "x86@...nel.org" <x86@...nel.org>,
"mingo@...hat.com" <mingo@...hat.com>, "Zhao, Yan Y" <yan.y.zhao@...el.com>,
"tglx@...utronix.de" <tglx@...utronix.de>, "kvm@...r.kernel.org"
<kvm@...r.kernel.org>, "linux-coco@...ts.linux.dev"
<linux-coco@...ts.linux.dev>, "Yamahata, Isaku" <isaku.yamahata@...el.com>,
"linux-kernel@...r.kernel.org" <linux-kernel@...r.kernel.org>
Subject: Re: [PATCHv2 08/12] KVM: TDX: Handle PAMT allocation in fault path
On Mon, 2025-06-09 at 22:13 +0300, Kirill A. Shutemov wrote:
> There are two distinct cases when the kernel needs to allocate PAMT
> memory in the fault path: for SEPT page tables in tdx_sept_link_private_spt()
> and for leaf pages in tdx_sept_set_private_spte().
>
> These code paths run in atomic context. Use a pre-allocated per-VCPU
> pool for memory allocations.
This log is way too thin. It should explain the design, justify the function
pointer, excuse the export, etc.
>
> Signed-off-by: Kirill A. Shutemov <kirill.shutemov@...ux.intel.com>
> ---
> arch/x86/include/asm/tdx.h | 4 ++++
> arch/x86/kvm/vmx/tdx.c | 40 ++++++++++++++++++++++++++++++++-----
> arch/x86/virt/vmx/tdx/tdx.c | 21 +++++++++++++------
> virt/kvm/kvm_main.c | 1 +
> 4 files changed, 55 insertions(+), 11 deletions(-)
>
> diff --git a/arch/x86/include/asm/tdx.h b/arch/x86/include/asm/tdx.h
> index 47092eb13eb3..39f8dd7e0f06 100644
> --- a/arch/x86/include/asm/tdx.h
> +++ b/arch/x86/include/asm/tdx.h
> @@ -116,6 +116,10 @@ u32 tdx_get_nr_guest_keyids(void);
> void tdx_guest_keyid_free(unsigned int keyid);
>
> int tdx_nr_pamt_pages(void);
> +int tdx_pamt_get(struct page *page, enum pg_level level,
> + struct page *(alloc)(void *data), void *data);
> +void tdx_pamt_put(struct page *page, enum pg_level level);
> +
> struct page *tdx_alloc_page(void);
> void tdx_free_page(struct page *page);
>
> diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
> index 36c3c9f8a62c..bc9bc393f866 100644
> --- a/arch/x86/kvm/vmx/tdx.c
> +++ b/arch/x86/kvm/vmx/tdx.c
> @@ -1537,11 +1537,26 @@ static int tdx_mem_page_record_premap_cnt(struct kvm *kvm, gfn_t gfn,
> return 0;
> }
>
> +static struct page *tdx_alloc_pamt_page_atomic(void *data)
> +{
> + struct kvm_vcpu *vcpu = data;
> + void *p;
> +
> + p = kvm_mmu_memory_cache_alloc(&vcpu->arch.pamt_page_cache);
> + return virt_to_page(p);
> +}
> +
> int tdx_sept_set_private_spte(struct kvm *kvm, gfn_t gfn,
> enum pg_level level, kvm_pfn_t pfn)
> {
> + struct kvm_vcpu *vcpu = kvm_get_running_vcpu();
> struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
> struct page *page = pfn_to_page(pfn);
> + int ret;
> +
> + ret = tdx_pamt_get(page, level, tdx_alloc_pamt_page_atomic, vcpu);
> + if (ret)
> + return ret;
>
> /* TODO: handle large pages. */
> if (KVM_BUG_ON(level != PG_LEVEL_4K, kvm))
If you swap the order of these, level is already known to be PG_LEVEL_4K by
the time tdx_pamt_get() is called. I'm guessing the current order is left
over from an earlier reordering.
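I.e. something like this (untested), so level is validated before taking the
PAMT refcount:

	/* TODO: handle large pages. */
	if (KVM_BUG_ON(level != PG_LEVEL_4K, kvm))
		return -EINVAL;

	ret = tdx_pamt_get(page, level, tdx_alloc_pamt_page_atomic, vcpu);
	if (ret)
		return ret;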
> @@ -1562,10 +1577,16 @@ int tdx_sept_set_private_spte(struct kvm *kvm, gfn_t gfn,
> * barrier in tdx_td_finalize().
> */
> smp_rmb();
> - if (likely(kvm_tdx->state == TD_STATE_RUNNABLE))
> - return tdx_mem_page_aug(kvm, gfn, level, page);
>
> - return tdx_mem_page_record_premap_cnt(kvm, gfn, level, pfn);
> + if (likely(kvm_tdx->state == TD_STATE_RUNNABLE))
> + ret = tdx_mem_page_aug(kvm, gfn, level, page);
> + else
> + ret = tdx_mem_page_record_premap_cnt(kvm, gfn, level, pfn);
tdx_mem_page_record_premap_cnt() doesn't need the PAMT backing, so I think
the tdx_pamt_get() could be skipped on that path if the order is rearranged.
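Rough sketch (untested) of only taking the PAMT refcount on the AUG path:

	if (likely(kvm_tdx->state == TD_STATE_RUNNABLE)) {
		ret = tdx_pamt_get(page, level, tdx_alloc_pamt_page_atomic, vcpu);
		if (ret)
			return ret;

		ret = tdx_mem_page_aug(kvm, gfn, level, page);
		if (ret)
			tdx_pamt_put(page, level);

		return ret;
	}

	return tdx_mem_page_record_premap_cnt(kvm, gfn, level, pfn);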
> +
> + if (ret)
> + tdx_pamt_put(page, level);
> +
> + return ret;
> }
>
> static int tdx_sept_drop_private_spte(struct kvm *kvm, gfn_t gfn,
> @@ -1622,17 +1643,26 @@ int tdx_sept_link_private_spt(struct kvm *kvm, gfn_t gfn,
> enum pg_level level, void *private_spt)
> {
> int tdx_level = pg_level_to_tdx_sept_level(level);
> - gpa_t gpa = gfn_to_gpa(gfn);
> + struct kvm_vcpu *vcpu = kvm_get_running_vcpu();
> struct page *page = virt_to_page(private_spt);
> + gpa_t gpa = gfn_to_gpa(gfn);
> u64 err, entry, level_state;
> + int ret;
> +
> + ret = tdx_pamt_get(page, PG_LEVEL_4K, tdx_alloc_pamt_page_atomic, vcpu);
> + if (ret)
> + return ret;
>
> err = tdh_mem_sept_add(&to_kvm_tdx(kvm)->td, gpa, tdx_level, page, &entry,
> &level_state);
> - if (unlikely(tdx_operand_busy(err)))
> + if (unlikely(tdx_operand_busy(err))) {
> + tdx_pamt_put(page, PG_LEVEL_4K);
> return -EBUSY;
> + }
>
> if (KVM_BUG_ON(err, kvm)) {
> pr_tdx_error_2(TDH_MEM_SEPT_ADD, err, entry, level_state);
> + tdx_pamt_put(page, PG_LEVEL_4K);
> return -EIO;
> }
>
> diff --git a/arch/x86/virt/vmx/tdx/tdx.c b/arch/x86/virt/vmx/tdx/tdx.c
> index 4f9eaba4af4a..d4b50b6428fa 100644
> --- a/arch/x86/virt/vmx/tdx/tdx.c
> +++ b/arch/x86/virt/vmx/tdx/tdx.c
> @@ -2067,10 +2067,16 @@ static void tdx_free_pamt_pages(struct list_head *pamt_pages)
> }
> }
>
> -static int tdx_alloc_pamt_pages(struct list_head *pamt_pages)
> +static int tdx_alloc_pamt_pages(struct list_head *pamt_pages,
> + struct page *(alloc)(void *data), void *data)
> {
> for (int i = 0; i < tdx_nr_pamt_pages(); i++) {
> - struct page *page = alloc_page(GFP_KERNEL);
> + struct page *page;
> +
> + if (alloc)
> + page = alloc(data);
> + else
> + page = alloc_page(GFP_KERNEL);
This isn't great, I think: a branch between a function pointer and
alloc_page(), where there is only ever one non-NULL value for the function
pointer. There has to be a better way?
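E.g. make the callback mandatory and give the sleepable path its own trivial
one (untested, helper name made up):

/* Made-up name: default allocator for non-atomic callers. */
static struct page *tdx_alloc_pamt_page_sleepable(void *data)
{
	return alloc_page(GFP_KERNEL);
}

Then tdx_alloc_page() passes that instead of NULL, and
tdx_alloc_pamt_pages() can call alloc(data) unconditionally.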
> if (!page)
> goto fail;
> list_add(&page->lru, pamt_pages);
> @@ -2115,7 +2121,8 @@ static int tdx_pamt_add(atomic_t *pamt_refcount, unsigned long hpa,
> return 0;
> }
>
> -static int tdx_pamt_get(struct page *page, enum pg_level level)
> +int tdx_pamt_get(struct page *page, enum pg_level level,
> + struct page *(alloc)(void *data), void *data)
> {
> unsigned long hpa = page_to_phys(page);
> atomic_t *pamt_refcount;
> @@ -2134,7 +2141,7 @@ static int tdx_pamt_get(struct page *page, enum pg_level level)
> if (atomic_inc_not_zero(pamt_refcount))
> return 0;
>
> - if (tdx_alloc_pamt_pages(&pamt_pages))
> + if (tdx_alloc_pamt_pages(&pamt_pages, alloc, data))
> return -ENOMEM;
>
> ret = tdx_pamt_add(pamt_refcount, hpa, &pamt_pages);
> @@ -2143,8 +2150,9 @@ static int tdx_pamt_get(struct page *page, enum pg_level level)
>
> return ret >= 0 ? 0 : ret;
> }
> +EXPORT_SYMBOL_GPL(tdx_pamt_get);
>
> -static void tdx_pamt_put(struct page *page, enum pg_level level)
> +void tdx_pamt_put(struct page *page, enum pg_level level)
> {
> unsigned long hpa = page_to_phys(page);
> atomic_t *pamt_refcount;
> @@ -2179,6 +2187,7 @@ static void tdx_pamt_put(struct page *page, enum pg_level level)
>
> tdx_free_pamt_pages(&pamt_pages);
> }
> +EXPORT_SYMBOL_GPL(tdx_pamt_put);
>
> struct page *tdx_alloc_page(void)
> {
> @@ -2188,7 +2197,7 @@ struct page *tdx_alloc_page(void)
> if (!page)
> return NULL;
>
> - if (tdx_pamt_get(page, PG_LEVEL_4K)) {
> + if (tdx_pamt_get(page, PG_LEVEL_4K, NULL, NULL)) {
> __free_page(page);
> return NULL;
> }
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index eec82775c5bf..6add012532a0 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -436,6 +436,7 @@ void *kvm_mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc)
> BUG_ON(!p);
> return p;
> }
> +EXPORT_SYMBOL_GPL(kvm_mmu_memory_cache_alloc);
Did you consider pre-allocating a page and returning it to the cache if it's
not needed? Or moving kvm_mmu_memory_cache_alloc() to a static inline in a
header that core x86 can use?
They all seem bad in different ways.
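For the first one, I think it would need a cache "return" helper that
doesn't exist today, something like (untested, made-up name, assumes a
page-backed cache):

static inline void kvm_mmu_memory_cache_return(struct kvm_mmu_memory_cache *mc,
					       void *p)
{
	/* Hypothetical: hand an unused object back to the cache. */
	if (mc->nobjs < mc->capacity)
		mc->objects[mc->nobjs++] = p;
	else
		free_page((unsigned long)p);
}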
> #endif
>
> static void kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)