Message-ID: <20260108203755.1163107-2-gourry@gourry.net>
Date: Thu,  8 Jan 2026 15:37:48 -0500
From: Gregory Price <gourry@...rry.net>
To: linux-mm@...ck.org,
	cgroups@...r.kernel.org,
	linux-cxl@...r.kernel.org
Cc: linux-doc@...r.kernel.org,
	linux-kernel@...r.kernel.org,
	linux-fsdevel@...r.kernel.org,
	kernel-team@...a.com,
	longman@...hat.com,
	tj@...nel.org,
	hannes@...xchg.org,
	mkoutny@...e.com,
	corbet@....net,
	gregkh@...uxfoundation.org,
	rafael@...nel.org,
	dakr@...nel.org,
	dave@...olabs.net,
	jonathan.cameron@...wei.com,
	dave.jiang@...el.com,
	alison.schofield@...el.com,
	vishal.l.verma@...el.com,
	ira.weiny@...el.com,
	dan.j.williams@...el.com,
	akpm@...ux-foundation.org,
	vbabka@...e.cz,
	surenb@...gle.com,
	mhocko@...e.com,
	jackmanb@...gle.com,
	ziy@...dia.com,
	david@...nel.org,
	lorenzo.stoakes@...cle.com,
	Liam.Howlett@...cle.com,
	rppt@...nel.org,
	axelrasmussen@...gle.com,
	yuanchu@...gle.com,
	weixugc@...gle.com,
	yury.norov@...il.com,
	linux@...musvillemoes.dk,
	rientjes@...gle.com,
	shakeel.butt@...ux.dev,
	chrisl@...nel.org,
	kasong@...cent.com,
	shikemeng@...weicloud.com,
	nphamcs@...il.com,
	bhe@...hat.com,
	baohua@...nel.org,
	yosry.ahmed@...ux.dev,
	chengming.zhou@...ux.dev,
	roman.gushchin@...ux.dev,
	muchun.song@...ux.dev,
	osalvador@...e.de,
	matthew.brost@...el.com,
	joshua.hahnjy@...il.com,
	rakie.kim@...com,
	byungchul@...com,
	gourry@...rry.net,
	ying.huang@...ux.alibaba.com,
	apopple@...dia.com,
	cl@...two.org,
	harry.yoo@...cle.com,
	zhengqi.arch@...edance.com,
	Balbir Singh <bsingharora@...il.com>
Subject: [RFC PATCH v3 1/8] numa,memory_hotplug: create N_PRIVATE (Private Nodes)

N_MEMORY nodes are intended to contain general System RAM.  Today, some
device drivers hotplug their memory (marked Specific Purpose or Reserved)
to get access to mm/ services, but don't intend it for general consumption.

This creates reliability issues as there are no isolation guarantees.

Create N_PRIVATE for memory nodes whose memory is not intended for
general consumption.  This state is mutually exclusive with N_MEMORY.
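
Since the two states are mutually exclusive, consumers that walk
N_MEMORY never observe private nodes.  A minimal sketch using the
existing nodemask helpers (nothing here is new to this patch):

	int nid;

	for_each_node_state(nid, N_MEMORY) {
		/* nodes marked N_PRIVATE never appear in this walk */
	}

	for_each_node_state(nid, N_PRIVATE) {
		/* only nodes whose memory is private appear here */
	}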

This will allow existing service code (like page_alloc.c) to manage
N_PRIVATE nodes without exposing N_MEMORY users to that memory.

Add `node_register_private()` for device drivers to call to mark a
node as private and register region callbacks prior to hotplugging
memory.  This fails if the node already has N_MEMORY set, regardless
of its online state.
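
For illustration, a hypothetical driver registration might look like
the sketch below (the callbacks and address range are made up; the
structure fields and calls are the ones added by this patch):

	static int my_page_allocated(struct page *page, void *data)
	{
		/* validate the page before the allocator hands it out */
		return 0;
	}

	static void my_page_freed(struct page *page, void *data)
	{
		/* drop driver-side tracking as the page is returned */
	}

	static struct private_node_ops my_ops = {
		.res_start	= 0x100000000ULL,	/* hypothetical range */
		.res_end	= 0x1ffffffffULL,	/* inclusive end */
		.memtype	= NODE_MEM_ACCELERATOR,
		.page_allocated	= my_page_allocated,
		.page_freed	= my_page_freed,
	};

	/* mark the node private before hotplugging its memory */
	rc = node_register_private(nid, &my_ops);
	if (rc)
		return rc;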

Private nodes must have a memory type so that multiple drivers trying
to online private memory onto the same node can detect conflicts: a
registration whose memtype differs from the node's existing type is
rejected with -EINVAL.
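
For example, if a zswap backend has already claimed a node, a second
driver registering a different type on that node fails (hypothetical
ops structures; return values follow from the code below):

	/* first registration, memtype = NODE_MEM_ZSWAP: returns 0 */
	node_register_private(nid, &zswap_ops);

	/* same node, memtype = NODE_MEM_ACCELERATOR: returns -EINVAL */
	node_register_private(nid, &accel_ops);

	/* same node, memtype = NODE_MEM_ZSWAP again: returns 0 */
	node_register_private(nid, &zswap2_ops);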

Suggested-by: David Hildenbrand <david@...nel.org>
Suggested-by: Balbir Singh <bsingharora@...il.com>
Signed-off-by: Gregory Price <gourry@...rry.net>
---
 drivers/base/node.c      | 199 +++++++++++++++++++++++++++++++++++++++
 include/linux/node.h     |  60 ++++++++++++
 include/linux/nodemask.h |   1 +
 mm/memory_hotplug.c      |   2 +-
 4 files changed, 261 insertions(+), 1 deletion(-)

diff --git a/drivers/base/node.c b/drivers/base/node.c
index 00cf4532f121..b503782ea109 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -861,6 +861,193 @@ void register_memory_blocks_under_node_hotplug(int nid, unsigned long start_pfn,
 			   (void *)&nid, register_mem_block_under_node_hotplug);
 	return;
 }
+
+static enum private_memtype *private_nodes;
+/* Per-node list of private node operations callbacks */
+static struct list_head private_node_ops_list[MAX_NUMNODES];
+static DEFINE_MUTEX(private_node_ops_lock);
+static bool private_node_ops_initialized;
+
+/*
+ * Note: private_node_ops_list is initialized in node_dev_init() before
+ * any calls to node_register_private() can occur.
+ */
+
+/**
+ * node_register_private - Mark a node as private and register ops
+ * @nid: Node identifier
+ * @ops: Callback operations structure (required, but callbacks may be NULL)
+ *
+ * Mark a node as private and register the given ops structure. The ops
+ * structure must have res_start and res_end set to the physical address
+ * range covered by this registration, and memtype set to the private
+ * memory type. Multiple registrations for the same node are allowed as
+ * long as they have the same memtype.
+ *
+ * Returns 0 on success, negative error code on failure.
+ */
+int node_register_private(int nid, struct private_node_ops *ops)
+{
+	int rc = 0;
+	enum private_memtype ctype;
+	enum private_memtype type;
+
+	if (!ops)
+		return -EINVAL;
+
+	type = ops->memtype;
+
+	if (!node_possible(nid) || !private_nodes || type >= NODE_MAX_MEMTYPE)
+		return -EINVAL;
+
+	/* Validate resource bounds */
+	if (ops->res_start > ops->res_end)
+		return -EINVAL;
+
+	mutex_lock(&private_node_ops_lock);
+
+	/* hotplug lock must be held while checking online/node state */
+	mem_hotplug_begin();
+
+	/*
+	 * N_PRIVATE and N_MEMORY are mutually exclusive. Fail if the node
+	 * already has N_MEMORY set, regardless of online state.
+	 */
+	if (node_state(nid, N_MEMORY)) {
+		rc = -EBUSY;
+		goto out;
+	}
+
+	ctype = private_nodes[nid];
+	if (ctype > NODE_MEM_NOTYPE && ctype != type) {
+		rc = -EINVAL;
+		goto out;
+	}
+
+	/* Initialize the ops list entry and add to the node's list */
+	INIT_LIST_HEAD(&ops->list);
+	list_add_tail_rcu(&ops->list, &private_node_ops_list[nid]);
+
+	private_nodes[nid] = type;
+	node_set_state(nid, N_PRIVATE);
+out:
+	mem_hotplug_done();
+	mutex_unlock(&private_node_ops_lock);
+	return rc;
+}
+EXPORT_SYMBOL_GPL(node_register_private);
+
+/**
+ * node_unregister_private - Unregister ops and potentially unmark node as private
+ * @nid: Node identifier
+ * @ops: Callback operations structure to remove
+ *
+ * Remove the given ops structure from the node's ops list. If this is
+ * the last ops structure for the node and the node is offline, the
+ * node is unmarked as private.
+ */
+void node_unregister_private(int nid, struct private_node_ops *ops)
+{
+	if (!node_possible(nid) || !private_nodes || !ops)
+		return;
+
+	mutex_lock(&private_node_ops_lock);
+	mem_hotplug_begin();
+
+	list_del_rcu(&ops->list);
+	/* If list is now empty, clear private state */
+	if (list_empty(&private_node_ops_list[nid])) {
+		private_nodes[nid] = NODE_MEM_NOTYPE;
+		node_clear_state(nid, N_PRIVATE);
+	}
+
+	mem_hotplug_done();
+	mutex_unlock(&private_node_ops_lock);
+	synchronize_rcu();
+}
+EXPORT_SYMBOL_GPL(node_unregister_private);
+
+/**
+ * node_private_allocated - Validate a page allocation from a private node
+ * @page: The allocated page
+ *
+ * Find the ops structure whose region contains the page's physical address
+ * and call its page_allocated callback if one is registered.
+ *
+ * Returns:
+ *   0 if the callback succeeds or no callback is registered for this region
+ *   -ENXIO if the page is not found in any registered region
+ *   Other negative error code if the callback indicates the page is not safe
+ */
+int node_private_allocated(struct page *page)
+{
+	struct private_node_ops *ops;
+	phys_addr_t page_phys;
+	int nid = page_to_nid(page);
+	int ret = -ENXIO;
+
+	if (nid >= MAX_NUMNODES || !node_possible(nid))
+		return -ENXIO;
+
+	if (!private_node_ops_initialized)
+		return -ENXIO;
+
+	page_phys = page_to_phys(page);
+
+	/*
+	 * Use RCU to safely traverse the list without holding locks.
+	 * Writers use list_add_tail_rcu/list_del_rcu with synchronize_rcu()
+	 * to ensure safe concurrent access.
+	 */
+	rcu_read_lock();
+	list_for_each_entry_rcu(ops, &private_node_ops_list[nid], list) {
+		if (page_phys >= ops->res_start && page_phys <= ops->res_end) {
+			if (ops->page_allocated)
+				ret = ops->page_allocated(page, ops->data);
+			else
+				ret = 0;
+			break;
+		}
+	}
+	rcu_read_unlock();
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(node_private_allocated);
+
+/**
+ * node_private_freed - Notify that a page from a private node is being freed
+ * @page: The page being freed
+ *
+ * Find the ops structure whose region contains the page's physical address
+ * and call its page_freed callback if one is registered.
+ */
+void node_private_freed(struct page *page)
+{
+	struct private_node_ops *ops;
+	phys_addr_t page_phys;
+	int nid = page_to_nid(page);
+
+	if (nid >= MAX_NUMNODES || !node_possible(nid))
+		return;
+
+	if (!private_node_ops_initialized)
+		return;
+
+	page_phys = page_to_phys(page);
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(ops, &private_node_ops_list[nid], list) {
+		if (page_phys >= ops->res_start && page_phys <= ops->res_end) {
+			if (ops->page_freed)
+				ops->page_freed(page, ops->data);
+			break;
+		}
+	}
+	rcu_read_unlock();
+}
+EXPORT_SYMBOL_GPL(node_private_freed);
+
 #endif /* CONFIG_MEMORY_HOTPLUG */
 
 /**
@@ -959,6 +1146,7 @@ static struct node_attr node_state_attr[] = {
 	[N_HIGH_MEMORY] = _NODE_ATTR(has_high_memory, N_HIGH_MEMORY),
 #endif
 	[N_MEMORY] = _NODE_ATTR(has_memory, N_MEMORY),
+	[N_PRIVATE] = _NODE_ATTR(has_private_memory, N_PRIVATE),
 	[N_CPU] = _NODE_ATTR(has_cpu, N_CPU),
 	[N_GENERIC_INITIATOR] = _NODE_ATTR(has_generic_initiator,
 					   N_GENERIC_INITIATOR),
@@ -972,6 +1160,7 @@ static struct attribute *node_state_attrs[] = {
 	&node_state_attr[N_HIGH_MEMORY].attr.attr,
 #endif
 	&node_state_attr[N_MEMORY].attr.attr,
+	&node_state_attr[N_PRIVATE].attr.attr,
 	&node_state_attr[N_CPU].attr.attr,
 	&node_state_attr[N_GENERIC_INITIATOR].attr.attr,
 	NULL
@@ -1007,5 +1196,15 @@ void __init node_dev_init(void)
 			panic("%s() failed to add node: %d\n", __func__, ret);
 	}
 
+	private_nodes = kcalloc(MAX_NUMNODES, sizeof(enum private_memtype),
+				GFP_KERNEL);
+	if (!private_nodes)
+		pr_warn("Failed to allocate private_nodes, private node support disabled\n");
+
+	/* Initialize private node ops lists */
+	for (i = 0; i < MAX_NUMNODES; i++)
+		INIT_LIST_HEAD(&private_node_ops_list[i]);
+	private_node_ops_initialized = true;
+
 	register_memory_blocks_under_nodes();
 }
diff --git a/include/linux/node.h b/include/linux/node.h
index 0269b064ba65..53a9fb63b60e 100644
--- a/include/linux/node.h
+++ b/include/linux/node.h
@@ -62,6 +62,47 @@ enum cache_mode {
 	NODE_CACHE_ADDR_MODE_EXTENDED_LINEAR,
 };
 
+enum private_memtype {
+	NODE_MEM_NOTYPE,
+	NODE_MEM_ZSWAP,
+	NODE_MEM_COMPRESSED,
+	NODE_MEM_ACCELERATOR,
+	NODE_MEM_DEMOTE_ONLY,
+	NODE_MAX_MEMTYPE,
+};
+
+/**
+ * struct private_node_ops - Callbacks for private node operations
+ * @list: List node for per-node ops list
+ * @res_start: Start physical address of the memory region
+ * @res_end: End physical address of the memory region (inclusive)
+ * @memtype: Private node memory type for this region
+ * @page_allocated: Called after a page is allocated from this region
+ *                  to validate that the page is safe to use. Returns 0
+ *                  on success, negative error code on failure. If this
+ *                  returns an error, the caller should free the page
+ *                  and try another node. May be NULL if no validation
+ *                  is needed.
+ * @page_freed: Called when a page from this region is being freed.
+ *              Allows the driver to update its internal tracking.
+ *              May be NULL if no notification is needed.
+ * @data: Driver-private data passed to callbacks
+ *
+ * Multiple drivers may register ops for a single private node. Each
+ * registration covers a specific physical memory region. When a page
+ * is allocated, the appropriate ops structure is found by matching
+ * the page's physical address against the registered regions.
+ */
+struct private_node_ops {
+	struct list_head list;
+	resource_size_t res_start;
+	resource_size_t res_end;
+	enum private_memtype memtype;
+	int (*page_allocated)(struct page *page, void *data);
+	void (*page_freed)(struct page *page, void *data);
+	void *data;
+};
+
 /**
  * struct node_cache_attrs - system memory caching attributes
  *
@@ -121,6 +162,10 @@ extern struct node *node_devices[];
 #if defined(CONFIG_MEMORY_HOTPLUG) && defined(CONFIG_NUMA)
 void register_memory_blocks_under_node_hotplug(int nid, unsigned long start_pfn,
 					       unsigned long end_pfn);
+int node_register_private(int nid, struct private_node_ops *ops);
+void node_unregister_private(int nid, struct private_node_ops *ops);
+int node_private_allocated(struct page *page);
+void node_private_freed(struct page *page);
 #else
 static inline void register_memory_blocks_under_node_hotplug(int nid,
 							     unsigned long start_pfn,
@@ -130,6 +175,21 @@ static inline void register_memory_blocks_under_node_hotplug(int nid,
 static inline void register_memory_blocks_under_nodes(void)
 {
 }
+static inline int node_register_private(int nid, struct private_node_ops *ops)
+{
+	return -ENODEV;
+}
+static inline void node_unregister_private(int nid,
+					   struct private_node_ops *ops)
+{
+}
+static inline int node_private_allocated(struct page *page)
+{
+	return -ENXIO;
+}
+static inline void node_private_freed(struct page *page)
+{
+}
 #endif
 
 struct node_notify {
diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h
index bd38648c998d..dac250c6f1a9 100644
--- a/include/linux/nodemask.h
+++ b/include/linux/nodemask.h
@@ -391,6 +391,7 @@ enum node_states {
 	N_HIGH_MEMORY = N_NORMAL_MEMORY,
 #endif
 	N_MEMORY,		/* The node has memory(regular, high, movable) */
+	N_PRIVATE,		/* The node's memory is private */
 	N_CPU,		/* The node has one or more cpus */
 	N_GENERIC_INITIATOR,	/* The node has one or more Generic Initiators */
 	NR_NODE_STATES
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 389989a28abe..57463fcb4021 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1207,7 +1207,7 @@ int online_pages(unsigned long pfn, unsigned long nr_pages,
 	online_pages_range(pfn, nr_pages);
 	adjust_present_page_count(pfn_to_page(pfn), group, nr_pages);
 
-	if (node_arg.nid >= 0)
+	if (node_arg.nid >= 0 && !node_state(nid, N_PRIVATE))
 		node_set_state(nid, N_MEMORY);
 	if (need_zonelists_rebuild)
 		build_all_zonelists(NULL);
-- 
2.52.0

