linux-kernel - Re: [RFC PATCH 04/20] x86/virt/tdx: Introduce a "tdx" subsystem and "tsm" device

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <688bcf40215c3_48e5100d6@dwillia2-xfh.jf.intel.com.notmuch>
Date: Thu, 31 Jul 2025 13:17:04 -0700
From: <dan.j.williams@...el.com>
To: Chao Gao <chao.gao@...el.com>, "Huang, Kai" <kai.huang@...el.com>
CC: "kvm@...r.kernel.org" <kvm@...r.kernel.org>, "linux-coco@...ts.linux.dev"
	<linux-coco@...ts.linux.dev>, "x86@...nel.org" <x86@...nel.org>, "Shutemov,
 Kirill" <kirill.shutemov@...el.com>, "Dong, Eddie" <eddie.dong@...el.com>,
	"Hansen, Dave" <dave.hansen@...el.com>, "dave.hansen@...ux.intel.com"
	<dave.hansen@...ux.intel.com>, "Reshetova, Elena"
	<elena.reshetova@...el.com>, "kirill.shutemov@...ux.intel.com"
	<kirill.shutemov@...ux.intel.com>, "seanjc@...gle.com" <seanjc@...gle.com>,
	"mingo@...hat.com" <mingo@...hat.com>, "pbonzini@...hat.com"
	<pbonzini@...hat.com>, "tglx@...utronix.de" <tglx@...utronix.de>, "Yamahata,
 Isaku" <isaku.yamahata@...el.com>, "linux-kernel@...r.kernel.org"
	<linux-kernel@...r.kernel.org>, "hpa@...or.com" <hpa@...or.com>, "Chen,
 Farrah" <farrah.chen@...el.com>, "Edgecombe, Rick P"
	<rick.p.edgecombe@...el.com>, "bp@...en8.de" <bp@...en8.de>, "Williams, Dan
 J" <dan.j.williams@...el.com>
Subject: Re: [RFC PATCH 04/20] x86/virt/tdx: Introduce a "tdx" subsystem and
 "tsm" device

Chao Gao wrote:
> On Tue, Jun 03, 2025 at 07:44:08AM +0800, Huang, Kai wrote:
> >
> >>  static int init_tdx_module(void)
> >>  {
> >>  	int ret;
> >> @@ -1136,6 +1209,8 @@ static int init_tdx_module(void)
> >>  
> >>  	pr_info("%lu KB allocated for PAMT\n", tdmrs_count_pamt_kb(&tdx_tdmr_list));
> >>  
> >> +	tdx_subsys_init();
> >> +
> >>  out_put_tdxmem:
> >>  	/*
> >>  	 * @tdx_memlist is written here and read at memory hotplug time.
> >
> >The error handling of init_module_module() is already very heavy.  Although
> >tdx_subsys_init() doesn't return any error, I would prefer to putting
> >tdx_subsys_init() to __tdx_enable() (the caller of init_tdx_module()) so that
> >init_tdx_module() can just focus on initializing the TDX module.
> 
> Sounds good. Will do.
> 
> btw, I think we can use guard() to simplify the error-handling a bit, e.g.,

In cleanup.h I wrote:

    Lastly, given that the benefit of cleanup helpers is removal of
    "goto", and that the "goto" statement can jump between scopes, the
    expectation is that usage of "goto" and cleanup helpers is never
    mixed in the same function. I.e. for a given routine, convert all
    resources that need a "goto" cleanup to scope-based cleanup, or
    convert none of them.

...because it leaves a minefield for the next person to keep a mental
map of the cleanup scopes vs goto.

In this case all of the cleanup functions take a pointer argument, so it
is a straightforward conversion to track those individual cleanup steps
in local pointer variables with something like below (compile tested
only).

So either convert the entire function, or none of the function to scope
based cleanup.

-- 8< --
diff --git a/arch/x86/virt/vmx/tdx/tdx.c b/arch/x86/virt/vmx/tdx/tdx.c
index c7a9a087ccaf..029db95982f7 100644
--- a/arch/x86/virt/vmx/tdx/tdx.c
+++ b/arch/x86/virt/vmx/tdx/tdx.c
@@ -28,6 +28,7 @@
 #include <linux/log2.h>
 #include <linux/acpi.h>
 #include <linux/suspend.h>
+#include <linux/cleanup.h>
 #include <linux/idr.h>
 #include <asm/page.h>
 #include <asm/special_insns.h>
@@ -225,7 +226,7 @@ static void free_tdx_memlist(struct list_head *tmb_list)
  * ranges off in a secondary structure because memblock is modified
  * in memory hotplug while TDX memory regions are fixed.
  */
-static int build_tdx_memlist(struct list_head *tmb_list)
+static struct list_head *build_tdx_memlist(struct list_head *tmb_list)
 {
 	unsigned long start_pfn, end_pfn;
 	int i, nid, ret;
@@ -251,10 +252,10 @@ static int build_tdx_memlist(struct list_head *tmb_list)
 			goto err;
 	}
 
-	return 0;
+	return tmb_list;
 err:
 	free_tdx_memlist(tmb_list);
-	return ret;
+	return ERR_PTR(ret);
 }
 
 static int read_sys_metadata_field(u64 field_id, u64 *data)
@@ -306,8 +307,9 @@ static int tdmr_size_single(u16 max_reserved_per_tdmr)
 	return ALIGN(tdmr_sz, TDMR_INFO_ALIGNMENT);
 }
 
-static int alloc_tdmr_list(struct tdmr_info_list *tdmr_list,
-			   struct tdx_sys_info_tdmr *sysinfo_tdmr)
+static struct tdmr_info_list *
+alloc_tdmr_list(struct tdmr_info_list *tdmr_list,
+		struct tdx_sys_info_tdmr *sysinfo_tdmr)
 {
 	size_t tdmr_sz, tdmr_array_sz;
 	void *tdmr_array;
@@ -323,7 +325,7 @@ static int alloc_tdmr_list(struct tdmr_info_list *tdmr_list,
 	tdmr_array = alloc_pages_exact(tdmr_array_sz,
 			GFP_KERNEL | __GFP_ZERO);
 	if (!tdmr_array)
-		return -ENOMEM;
+		return ERR_PTR(-ENOMEM);
 
 	tdmr_list->tdmrs = tdmr_array;
 
@@ -335,7 +337,7 @@ static int alloc_tdmr_list(struct tdmr_info_list *tdmr_list,
 	tdmr_list->max_tdmrs = sysinfo_tdmr->max_tdmrs;
 	tdmr_list->nr_consumed_tdmrs = 0;
 
-	return 0;
+	return tdmr_list;
 }
 
 static void free_tdmr_list(struct tdmr_info_list *tdmr_list)
@@ -888,9 +890,9 @@ static int tdmrs_populate_rsvd_areas_all(struct tdmr_info_list *tdmr_list,
  * to cover all TDX memory regions in @tmb_list based on the TDX module
  * TDMR global information in @sysinfo_tdmr.
  */
-static int construct_tdmrs(struct list_head *tmb_list,
-			   struct tdmr_info_list *tdmr_list,
-			   struct tdx_sys_info_tdmr *sysinfo_tdmr)
+static struct tdmr_info_list *
+construct_tdmrs(struct list_head *tmb_list, struct tdmr_info_list *tdmr_list,
+		struct tdx_sys_info_tdmr *sysinfo_tdmr)
 {
 	u16 pamt_entry_size[TDX_PS_NR] = {
 		sysinfo_tdmr->pamt_4k_entry_size,
@@ -901,11 +903,11 @@ static int construct_tdmrs(struct list_head *tmb_list,
 
 	ret = fill_out_tdmrs(tmb_list, tdmr_list);
 	if (ret)
-		return ret;
+		return ERR_PTR(ret);
 
 	ret = tdmrs_set_up_pamt_all(tdmr_list, tmb_list, pamt_entry_size);
 	if (ret)
-		return ret;
+		return ERR_PTR(ret);
 
 	ret = tdmrs_populate_rsvd_areas_all(tdmr_list, tmb_list,
 			sysinfo_tdmr->max_reserved_per_tdmr);
@@ -919,10 +921,11 @@ static int construct_tdmrs(struct list_head *tmb_list,
 	 */
 	smp_wmb();
 
-	return ret;
+	return tdmr_list;
 }
 
-static int config_tdx_module(struct tdmr_info_list *tdmr_list, u64 global_keyid)
+static struct tdmr_info_list *
+config_tdx_module(struct tdmr_info_list *tdmr_list, u64 global_keyid)
 {
 	struct tdx_module_args args = {};
 	u64 *tdmr_pa_array;
@@ -941,7 +944,7 @@ static int config_tdx_module(struct tdmr_info_list *tdmr_list, u64 global_keyid)
 
 	tdmr_pa_array = kzalloc(array_sz, GFP_KERNEL);
 	if (!tdmr_pa_array)
-		return -ENOMEM;
+		return ERR_PTR(-ENOMEM);
 
 	for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++)
 		tdmr_pa_array[i] = __pa(tdmr_entry(tdmr_list, i));
@@ -954,7 +957,10 @@ static int config_tdx_module(struct tdmr_info_list *tdmr_list, u64 global_keyid)
 	/* Free the array as it is not required anymore. */
 	kfree(tdmr_pa_array);
 
-	return ret;
+	if (ret)
+		return ERR_PTR(ret);
+
+	return tdmr_list;
 }
 
 static int do_global_key_config(void *unused)
@@ -1065,6 +1071,34 @@ static int init_tdmrs(struct tdmr_info_list *tdmr_list)
 	return 0;
 }
 
+DEFINE_FREE(free_tdx_memlist, struct list_head *,
+	    if (!IS_ERR_OR_NULL(_T)) free_tdx_memlist(_T))
+DEFINE_FREE(free_tdmr_list, struct tdmr_info_list *,
+	    if (!IS_ERR_OR_NULL(_T)) free_tdmr_list(_T))
+DEFINE_FREE(free_pamt_all, struct tdmr_info_list *,
+	    if (!IS_ERR_OR_NULL(_T)) tdmrs_free_pamt_all(_T))
+DEFINE_FREE(
+	reset_pamt_all, struct tdmr_info_list *,
+	if (!IS_ERR_OR_NULL(_T)) {
+		/*
+		 * Part of PAMTs may already have been initialized by the
+		 * TDX module.  Flush cache before returning PAMTs back
+		 * to the kernel.
+		 */
+
+		wbinvd_on_all_cpus();
+		/*
+		 * According to the TDX hardware spec, if the platform
+		 * doesn't have the "partial write machine check"
+		 * erratum, any kernel read/write will never cause #MC
+		 * in kernel space, thus it's OK to not convert PAMTs
+		 * back to normal.  But do the conversion anyway here
+		 * as suggested by the TDX spec.
+		 */
+		tdmrs_reset_pamt_all(_T);
+	}
+)
+
 static int init_tdx_module(void)
 {
 	int ret;
@@ -1088,70 +1122,49 @@ static int init_tdx_module(void)
 	 * holding mem_hotplug_lock read-lock as the memory hotplug code
 	 * path reads the @tdx_memlist to reject any new memory.
 	 */
-	get_online_mems();
+	guard(online_mems)();
 
-	ret = build_tdx_memlist(&tdx_memlist);
-	if (ret)
-		goto out_put_tdxmem;
+	struct list_head *memlist __free(free_tdx_memlist) =
+		build_tdx_memlist(&tdx_memlist);
+	if (IS_ERR(memlist))
+		return PTR_ERR(memlist);
 
 	/* Allocate enough space for constructing TDMRs */
-	ret = alloc_tdmr_list(&tdx_tdmr_list, &tdx_sysinfo.tdmr);
-	if (ret)
-		goto err_free_tdxmem;
+	struct tdmr_info_list *tdmrlist __free(free_tdmr_list) =
+		alloc_tdmr_list(&tdx_tdmr_list, &tdx_sysinfo.tdmr);
+	if (IS_ERR(tdmrlist))
+		return PTR_ERR(tdmrlist);
 
 	/* Cover all TDX-usable memory regions in TDMRs */
-	ret = construct_tdmrs(&tdx_memlist, &tdx_tdmr_list, &tdx_sysinfo.tdmr);
-	if (ret)
-		goto err_free_tdmrs;
+	struct tdmr_info_list *tdmrpamt __free(free_pamt_all) = construct_tdmrs(
+		&tdx_memlist, &tdx_tdmr_list, &tdx_sysinfo.tdmr);
+	if (IS_ERR(tdmrpamt))
+		return PTR_ERR(tdmrpamt);
 
 	/* Pass the TDMRs and the global KeyID to the TDX module */
-	ret = config_tdx_module(&tdx_tdmr_list, tdx_global_keyid);
-	if (ret)
-		goto err_free_pamts;
+	struct tdmr_info_list *tdmrconfig __free(reset_pamt_all) =
+		config_tdx_module(&tdx_tdmr_list, tdx_global_keyid);
+	if (IS_ERR(tdmrconfig))
+		return PTR_ERR(tdmrconfig);
 
 	/* Config the key of global KeyID on all packages */
 	ret = config_global_keyid();
 	if (ret)
-		goto err_reset_pamts;
+		return ret;
 
 	/* Initialize TDMRs to complete the TDX module initialization */
 	ret = init_tdmrs(&tdx_tdmr_list);
 	if (ret)
-		goto err_reset_pamts;
+		return ret;
 
-	pr_info("%lu KB allocated for PAMT\n", tdmrs_count_pamt_kb(&tdx_tdmr_list));
+	retain_and_null_ptr(tdmrconfig);
+	retain_and_null_ptr(tdmrpamt);
+	retain_and_null_ptr(tdmrlist);
+	retain_and_null_ptr(memlist);
 
-out_put_tdxmem:
-	/*
-	 * @tdx_memlist is written here and read at memory hotplug time.
-	 * Lock out memory hotplug code while building it.
-	 */
-	put_online_mems();
-	return ret;
+	pr_info("%lu KB allocated for PAMT\n", tdmrs_count_pamt_kb(&tdx_tdmr_list));
 
-err_reset_pamts:
-	/*
-	 * Part of PAMTs may already have been initialized by the
-	 * TDX module.  Flush cache before returning PAMTs back
-	 * to the kernel.
-	 */
-	wbinvd_on_all_cpus();
-	/*
-	 * According to the TDX hardware spec, if the platform
-	 * doesn't have the "partial write machine check"
-	 * erratum, any kernel read/write will never cause #MC
-	 * in kernel space, thus it's OK to not convert PAMTs
-	 * back to normal.  But do the conversion anyway here
-	 * as suggested by the TDX spec.
-	 */
-	tdmrs_reset_pamt_all(&tdx_tdmr_list);
-err_free_pamts:
-	tdmrs_free_pamt_all(&tdx_tdmr_list);
-err_free_tdmrs:
-	free_tdmr_list(&tdx_tdmr_list);
-err_free_tdxmem:
-	free_tdx_memlist(&tdx_memlist);
-	goto out_put_tdxmem;
+	return 0;
 }
 
 static int __tdx_enable(void)
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index eaac5ae8c05c..6d3f997c7fe8 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -6,6 +6,7 @@
 #include <linux/spinlock.h>
 #include <linux/notifier.h>
 #include <linux/bug.h>
+#include <linux/cleanup.h>
 
 struct page;
 struct zone;
@@ -239,6 +240,8 @@ static inline void pgdat_kswapd_unlock(pg_data_t *pgdat) {}
 static inline void pgdat_kswapd_lock_init(pg_data_t *pgdat) {}
 #endif /* ! CONFIG_MEMORY_HOTPLUG */
 
+DEFINE_LOCK_GUARD_0(online_mems, get_online_mems(), put_online_mems())
+
 /*
  * Keep this declaration outside CONFIG_MEMORY_HOTPLUG as some
  * platforms might override and use arch_get_mappable_range()