[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20250918232224.2202592-7-rick.p.edgecombe@intel.com>
Date: Thu, 18 Sep 2025 16:22:14 -0700
From: Rick Edgecombe <rick.p.edgecombe@...el.com>
To: kas@...nel.org,
bp@...en8.de,
chao.gao@...el.com,
dave.hansen@...ux.intel.com,
isaku.yamahata@...el.com,
kai.huang@...el.com,
kvm@...r.kernel.org,
linux-coco@...ts.linux.dev,
linux-kernel@...r.kernel.org,
mingo@...hat.com,
pbonzini@...hat.com,
seanjc@...gle.com,
tglx@...utronix.de,
x86@...nel.org,
yan.y.zhao@...el.com,
vannapurve@...gle.com
Cc: rick.p.edgecombe@...el.com,
"Kirill A. Shutemov" <kirill.shutemov@...ux.intel.com>
Subject: [PATCH v3 06/16] x86/virt/tdx: Improve PAMT refcounters allocation for sparse memory
From: "Kirill A. Shutemov" <kirill.shutemov@...ux.intel.com>
init_pamt_metadata() allocates PAMT refcounters for all physical memory up
to max_pfn. It might be suboptimal if the physical memory layout is
discontinuous and has large holes.
The refcount allocation is a vmalloc allocation. This is necessary to support a
large allocation size. The virtually contiguous property also makes it
easy to find a specific 2MB range’s refcount since it can simply be
indexed.
Since vmalloc mappings support remapping during normal kernel runtime,
switch to an approach that only populates refcount pages for the vmalloc
mapping when there is actually memory for that range. This means any holes
in the physical address space won’t use actual physical memory.
The validity of this memory optimization is based on a couple of assumptions:
1. Physical holes in the RAM layout are commonly large enough for it to be
worth it.
2. An alternative approach that looks up the refcounts via some more layered
data structure wouldn’t overly complicate the lookups. Or at least
more than the complexity of managing the vmalloc mapping.
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@...ux.intel.com>
[Add feedback, update log]
Signed-off-by: Rick Edgecombe <rick.p.edgecombe@...el.com>
---
v3:
- Split from "x86/virt/tdx: Allocate reference counters for
PAMT memory" (Dave)
- Rename tdx_get_pamt_refcount()->tdx_find_pamt_refcount() (Dave)
- Drop duplicate pte_none() check (Dave)
- Align assignments in alloc_pamt_refcount() (Kai)
- Add comment in pamt_refcount_depopulate() to clarify teardown
logic (Dave)
- Drop __va(PFN_PHYS(pte_pfn(ptep_get()))) pile on for simpler method.
(Dave)
- Improve log
---
arch/x86/virt/vmx/tdx/tdx.c | 120 ++++++++++++++++++++++++++++++++----
1 file changed, 109 insertions(+), 11 deletions(-)
diff --git a/arch/x86/virt/vmx/tdx/tdx.c b/arch/x86/virt/vmx/tdx/tdx.c
index 0ce4181ca352..d4b01656759a 100644
--- a/arch/x86/virt/vmx/tdx/tdx.c
+++ b/arch/x86/virt/vmx/tdx/tdx.c
@@ -194,30 +194,119 @@ int tdx_cpu_enable(void)
}
EXPORT_SYMBOL_GPL(tdx_cpu_enable);
-/*
- * Allocate PAMT reference counters for all physical memory.
- *
- * It consumes 2MiB for every 1TiB of physical memory.
- */
-static int init_pamt_metadata(void)
+/* Find PAMT refcount for a given physical address */
+static atomic_t *tdx_find_pamt_refcount(unsigned long hpa)
{
- size_t size = max_pfn / PTRS_PER_PTE * sizeof(*pamt_refcounts);
+ return &pamt_refcounts[hpa / PMD_SIZE];
+}
- if (!tdx_supports_dynamic_pamt(&tdx_sysinfo))
- return 0;
+/* Map a page into the PAMT refcount vmalloc region */
+static int pamt_refcount_populate(pte_t *pte, unsigned long addr, void *data)
+{
+ struct page *page;
+ pte_t entry;
- pamt_refcounts = vmalloc(size);
- if (!pamt_refcounts)
+ page = alloc_page(GFP_KERNEL | __GFP_ZERO);
+ if (!page)
return -ENOMEM;
+ entry = mk_pte(page, PAGE_KERNEL);
+
+ spin_lock(&init_mm.page_table_lock);
+ /*
+ * PAMT refcount populations can overlap due to rounding of the
+ * start/end pfn. Make sure another PAMT range didn't already
+ * populate it.
+ */
+ if (pte_none(ptep_get(pte)))
+ set_pte_at(&init_mm, addr, pte, entry);
+ else
+ __free_page(page);
+ spin_unlock(&init_mm.page_table_lock);
+
return 0;
}
+/*
+ * Allocate PAMT reference counters for the given PFN range.
+ *
+ * It consumes 2MiB for every 1TiB of physical memory.
+ */
+static int alloc_pamt_refcount(unsigned long start_pfn, unsigned long end_pfn)
+{
+ unsigned long start, end;
+
+ start = (unsigned long)tdx_find_pamt_refcount(PFN_PHYS(start_pfn));
+ end = (unsigned long)tdx_find_pamt_refcount(PFN_PHYS(end_pfn + 1));
+ start = round_down(start, PAGE_SIZE);
+ end = round_up(end, PAGE_SIZE);
+
+ return apply_to_page_range(&init_mm, start, end - start,
+ pamt_refcount_populate, NULL);
+}
+
+/*
+ * Reserve vmalloc range for PAMT reference counters. It covers all physical
+ * address space up to max_pfn. It is going to be populated from
+ * build_tdx_memlist() only for present memory that is available for TDX use.
+ *
+ * It reserves 2MiB of virtual address space for every 1TiB of physical memory.
+ */
+static int init_pamt_metadata(void)
+{
+ struct vm_struct *area;
+ size_t size;
+
+ if (!tdx_supports_dynamic_pamt(&tdx_sysinfo))
+ return 0;
+
+ size = max_pfn / PTRS_PER_PTE * sizeof(*pamt_refcounts);
+ size = round_up(size, PAGE_SIZE);
+
+ area = get_vm_area(size, VM_SPARSE);
+ if (!area)
+ return -ENOMEM;
+
+ pamt_refcounts = area->addr;
+ return 0;
+}
+
+/* Unmap a page from the PAMT refcount vmalloc region */
+static int pamt_refcount_depopulate(pte_t *pte, unsigned long addr, void *data)
+{
+ struct page *page;
+ pte_t entry;
+
+ spin_lock(&init_mm.page_table_lock);
+
+ entry = ptep_get(pte);
+ /* refcount allocation is sparse, may not be populated */
+ if (!pte_none(entry)) {
+ pte_clear(&init_mm, addr, pte);
+ page = pte_page(entry);
+ __free_page(page);
+ }
+
+ spin_unlock(&init_mm.page_table_lock);
+
+ return 0;
+}
+
+/* Unmap all PAMT refcount pages and free vmalloc range */
static void free_pamt_metadata(void)
{
+ size_t size;
+
if (!tdx_supports_dynamic_pamt(&tdx_sysinfo))
return;
+ size = max_pfn / PTRS_PER_PTE * sizeof(*pamt_refcounts);
+ size = round_up(size, PAGE_SIZE);
+
+ apply_to_existing_page_range(&init_mm,
+ (unsigned long)pamt_refcounts,
+ size, pamt_refcount_depopulate,
+ NULL);
vfree(pamt_refcounts);
pamt_refcounts = NULL;
}
@@ -288,10 +377,19 @@ static int build_tdx_memlist(struct list_head *tmb_list)
ret = add_tdx_memblock(tmb_list, start_pfn, end_pfn, nid);
if (ret)
goto err;
+
+ /* Allocate PAMT refcounts for the memblock */
+ ret = alloc_pamt_refcount(start_pfn, end_pfn);
+ if (ret)
+ goto err;
}
return 0;
err:
+ /*
+ * Only free TDX memory blocks here, PAMT refcount pages
+ * will be freed in the init_tdx_module() error path.
+ */
free_tdx_memlist(tmb_list);
return ret;
}
--
2.51.0
Powered by blists - more mailing lists