Message-ID: <20250616133931.206626-4-bharata@amd.com>
Date: Mon, 16 Jun 2025 19:09:30 +0530
From: Bharata B Rao <bharata@....com>
To: <linux-kernel@...r.kernel.org>, <linux-mm@...ck.org>
CC: <Jonathan.Cameron@...wei.com>, <dave.hansen@...el.com>,
	<gourry@...rry.net>, <hannes@...xchg.org>, <mgorman@...hsingularity.net>,
	<mingo@...hat.com>, <peterz@...radead.org>, <raghavendra.kt@....com>,
	<riel@...riel.com>, <rientjes@...gle.com>, <sj@...nel.org>,
	<weixugc@...gle.com>, <willy@...radead.org>, <ying.huang@...ux.alibaba.com>,
	<ziy@...dia.com>, <dave@...olabs.net>, <nifan.cxl@...il.com>,
	<xuezhengchu@...wei.com>, <yiannis@...corp.com>, <akpm@...ux-foundation.org>,
	<david@...hat.com>, <bharata@....com>
Subject: [RFC PATCH v1 3/4] mm: kmigrated - Async kernel migration thread

kmigrated is a per-node kernel thread that migrates, in batches,
the folios that have been marked for migration. Each kmigrated
thread walks the PFN range spanning its node and checks for
potential migration candidates.

It depends on the fields added to the extended page flags to
determine which pages need to be migrated and their target NID.
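
The migrator-specific fields are packed into page_ext->flags right
after PAGE_EXT_MIGRATE_READY: the target NID (10 bits), plus an
access frequency field (3 bits) and a time field (18 bits) intended
for the hot page promotion logic.

As a rough illustration (not part of this patch), a hot page
promotion path could hand a page over to kmigrated as in the
hypothetical helper below; only kmigrated_add_pfn(), KMIGRATE_DELAY
and the flag bits it sets come from this patch:

	/*
	 * Ask kmigrated to move @page to @target_nid later instead
	 * of migrating it synchronously.
	 */
	static void mark_page_for_migration(struct page *page, int target_nid)
	{
		/* Records target_nid and sets PAGE_EXT_MIGRATE_READY */
		if (kmigrated_add_pfn(page_to_pfn(page), target_nid))
			return;	/* no page_ext backing this page */

		/*
		 * PGDAT_KMIGRATED_ACTIVATE is now set on the source
		 * node; since nothing wakes kmigrated_wait explicitly,
		 * kmigrated picks the page up on its next timeout,
		 * i.e. within KMIGRATE_DELAY (one second).
		 */
	}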

Signed-off-by: Bharata B Rao <bharata@....com>
---
 include/linux/mmzone.h   |   5 +
 include/linux/page_ext.h |  17 +++
 mm/Makefile              |   3 +-
 mm/kmigrated.c           | 223 +++++++++++++++++++++++++++++++++++++++
 mm/mm_init.c             |   6 ++
 mm/page_ext.c            |  11 ++
 6 files changed, 264 insertions(+), 1 deletion(-)
 create mode 100644 mm/kmigrated.c

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 283913d42d7b..5d7f0b8d3c91 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -853,6 +853,8 @@ enum zone_type {
 
 };
 
+int kmigrated_add_pfn(unsigned long pfn, int nid);
+
 #ifndef __GENERATING_BOUNDS_H
 
 #define ASYNC_AND_SYNC 2
@@ -1049,6 +1051,7 @@ enum pgdat_flags {
 					 * many pages under writeback
 					 */
 	PGDAT_RECLAIM_LOCKED,		/* prevents concurrent reclaim */
+	PGDAT_KMIGRATED_ACTIVATE,	/* activates kmigrated */
 };
 
 enum zone_flags {
@@ -1493,6 +1496,8 @@ typedef struct pglist_data {
 #ifdef CONFIG_MEMORY_FAILURE
 	struct memory_failure_stats mf_stats;
 #endif
+	struct task_struct *kmigrated;
+	wait_queue_head_t kmigrated_wait;
 } pg_data_t;
 
 #define node_present_pages(nid)	(NODE_DATA(nid)->node_present_pages)
diff --git a/include/linux/page_ext.h b/include/linux/page_ext.h
index 76c817162d2f..4300c9dbafec 100644
--- a/include/linux/page_ext.h
+++ b/include/linux/page_ext.h
@@ -40,8 +40,25 @@ enum page_ext_flags {
 	PAGE_EXT_YOUNG,
 	PAGE_EXT_IDLE,
 #endif
+	/*
+	 * This bit and the 31 bits that follow it are used by the
+	 * migrator (see the PAGE_EXT_MIG_* definitions below).
+	 */
+	PAGE_EXT_MIGRATE_READY,
 };
 
+#define PAGE_EXT_MIG_NID_WIDTH	10
+#define PAGE_EXT_MIG_FREQ_WIDTH	3
+#define PAGE_EXT_MIG_TIME_WIDTH	18
+
+#define PAGE_EXT_MIG_NID_SHIFT	(PAGE_EXT_MIGRATE_READY + 1)
+#define PAGE_EXT_MIG_FREQ_SHIFT	(PAGE_EXT_MIG_NID_SHIFT + PAGE_EXT_MIG_NID_WIDTH)
+#define PAGE_EXT_MIG_TIME_SHIFT	(PAGE_EXT_MIG_FREQ_SHIFT + PAGE_EXT_MIG_FREQ_WIDTH)
+
+#define PAGE_EXT_MIG_NID_MASK	((1UL << PAGE_EXT_MIG_NID_WIDTH) - 1)
+#define PAGE_EXT_MIG_FREQ_MASK	((1UL << PAGE_EXT_MIG_FREQ_WIDTH) - 1)
+#define PAGE_EXT_MIG_TIME_MASK	((1UL << PAGE_EXT_MIG_TIME_WIDTH) - 1)
+
 /*
  * Page Extension can be considered as an extended mem_map.
  * A page_ext page is associated with every page descriptor. The
diff --git a/mm/Makefile b/mm/Makefile
index 1a7a11d4933d..5a382f19105f 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -37,7 +37,8 @@ mmu-y			:= nommu.o
 mmu-$(CONFIG_MMU)	:= highmem.o memory.o mincore.o \
 			   mlock.o mmap.o mmu_gather.o mprotect.o mremap.o \
 			   msync.o page_vma_mapped.o pagewalk.o \
-			   pgtable-generic.o rmap.o vmalloc.o vma.o vma_exec.o
+			   pgtable-generic.o rmap.o vmalloc.o vma.o vma_exec.o \
+			   kmigrated.o
 
 
 ifdef CONFIG_CROSS_MEMORY_ATTACH
diff --git a/mm/kmigrated.c b/mm/kmigrated.c
new file mode 100644
index 000000000000..3caefe4be0e7
--- /dev/null
+++ b/mm/kmigrated.c
@@ -0,0 +1,223 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * kmigrated is a kernel thread that runs for each node that has
+ * memory. It iterates over the node's PFNs and migrates pages
+ * marked for migration to their target nodes.
+ *
+ * kmigrated depends on PAGE_EXTENSION to find out which pages
+ * need to be migrated. In addition to a few fields that could be
+ * used by the hot page promotion logic to store and evaluate page
+ * hotness information, the extended page flags field is extended
+ * to store the target NID for migration.
+ */
+#include <linux/mm.h>
+#include <linux/migrate.h>
+#include <linux/cpuhotplug.h>
+#include <linux/page_ext.h>
+
+#define KMIGRATE_DELAY	MSEC_PER_SEC
+#define KMIGRATE_BATCH	512
+
+static int page_ext_xchg_nid(struct page_ext *page_ext, int nid)
+{
+	unsigned long old_flags, flags;
+	int old_nid;
+
+	old_flags = READ_ONCE(page_ext->flags);
+	do {
+		flags = old_flags;
+		old_nid = (flags >> PAGE_EXT_MIG_NID_SHIFT) & PAGE_EXT_MIG_NID_MASK;
+
+		flags &= ~(PAGE_EXT_MIG_NID_MASK << PAGE_EXT_MIG_NID_SHIFT);
+		flags |= (nid & PAGE_EXT_MIG_NID_MASK) << PAGE_EXT_MIG_NID_SHIFT;
+	} while (unlikely(!try_cmpxchg(&page_ext->flags, &old_flags, flags)));
+
+	return old_nid;
+}
+
+/*
+ * Marks the page as ready for migration.
+ *
+ * @pfn: PFN of the page
+ * @nid: Target NID to which the page needs to be migrated
+ *
+ * The request for migration is noted by setting PAGE_EXT_MIGRATE_READY
+ * in the extended page flags, which the kmigrated thread checks.
+ */
+int kmigrated_add_pfn(unsigned long pfn, int nid)
+{
+	struct page *page;
+	struct page_ext *page_ext;
+
+	page = pfn_to_page(pfn);
+	if (!page)
+		return -EINVAL;
+
+	page_ext = page_ext_get(page);
+	if (unlikely(!page_ext))
+		return -EINVAL;
+
+	page_ext_xchg_nid(page_ext, nid);
+	test_and_set_bit(PAGE_EXT_MIGRATE_READY, &page_ext->flags);
+	page_ext_put(page_ext);
+
+	set_bit(PGDAT_KMIGRATED_ACTIVATE, &page_pgdat(page)->flags);
+	return 0;
+}
+
+/*
+ * If the page has been marked ready for migration, return
+ * the NID to which it needs to be migrated.
+ *
+ * If not, return NUMA_NO_NODE.
+ */
+static int kmigrated_get_nid(struct page *page)
+{
+	struct page_ext *page_ext;
+	int nid = NUMA_NO_NODE;
+
+	page_ext = page_ext_get(page);
+	if (unlikely(!page_ext))
+		return nid;
+
+	if (!test_and_clear_bit(PAGE_EXT_MIGRATE_READY, &page_ext->flags))
+		goto out;
+
+	nid = page_ext_xchg_nid(page_ext, nid);
+out:
+	page_ext_put(page_ext);
+	return nid;
+}
+
+/*
+ * Walks the PFNs of the zone, isolates and migrates them in batches.
+ */
+static void kmigrated_walk_zone(unsigned long start_pfn, unsigned long end_pfn,
+				int src_nid)
+{
+	int nid, cur_nid = NUMA_NO_NODE;
+	LIST_HEAD(migrate_list);
+	int batch_count = 0;
+	struct folio *folio;
+	struct page *page;
+	unsigned long pfn;
+
+	for (pfn = start_pfn; pfn < end_pfn; pfn++) {
+		if (!pfn_valid(pfn))
+			continue;
+
+		page = pfn_to_online_page(pfn);
+		if (!page)
+			continue;
+
+		if (page_to_nid(page) != src_nid)
+			continue;
+
+		/*
+		 * TODO: Advance pfn by folio_nr_pages() so that tail
+		 * pages of large folios are not scanned individually.
+		 */
+		folio = page_folio(page);
+		if (!folio_test_lru(folio))
+			continue;
+
+		nid = kmigrated_get_nid(page);
+		if (nid == NUMA_NO_NODE)
+			continue;
+
+		if (page_to_nid(page) == nid)
+			continue;
+
+		if (migrate_misplaced_folio_prepare(folio, NULL, nid))
+			continue;
+
+		if (cur_nid == NUMA_NO_NODE)
+			cur_nid = nid;
+
+		if (++batch_count >= KMIGRATE_BATCH || cur_nid != nid) {
+			migrate_misplaced_folios_batch(&migrate_list, cur_nid);
+			cur_nid = nid;
+			batch_count = 0;
+			cond_resched();
+		}
+		list_add(&folio->lru, &migrate_list);
+	}
+	if (!list_empty(&migrate_list))
+		migrate_misplaced_folios_batch(&migrate_list, cur_nid);
+}
+
+static void kmigrated_do_work(pg_data_t *pgdat)
+{
+	struct zone *zone;
+	int zone_idx;
+
+	clear_bit(PGDAT_KMIGRATED_ACTIVATE, &pgdat->flags);
+	for (zone_idx = 0; zone_idx < MAX_NR_ZONES; zone_idx++) {
+		zone = &pgdat->node_zones[zone_idx];
+
+		if (!populated_zone(zone))
+			continue;
+
+		if (zone_is_zone_device(zone))
+			continue;
+
+		kmigrated_walk_zone(zone->zone_start_pfn, zone_end_pfn(zone),
+				    pgdat->node_id);
+	}
+}
+
+static inline bool kmigrated_work_requested(pg_data_t *pgdat)
+{
+	return test_bit(PGDAT_KMIGRATED_ACTIVATE, &pgdat->flags);
+}
+
+static void kmigrated_wait_work(pg_data_t *pgdat)
+{
+	long timeout = msecs_to_jiffies(KMIGRATE_DELAY);
+
+	wait_event_timeout(pgdat->kmigrated_wait,
+			   kmigrated_work_requested(pgdat), timeout);
+}
+
+/*
+ * Per-node kthread that iterates over its PFNs and migrates the
+ * pages that have been marked for migration.
+ */
+static int kmigrated(void *p)
+{
+	pg_data_t *pgdat = (pg_data_t *)p;
+
+	while (!kthread_should_stop()) {
+		kmigrated_wait_work(pgdat);
+		kmigrated_do_work(pgdat);
+	}
+	return 0;
+}
+
+static void kmigrated_run(int nid)
+{
+	pg_data_t *pgdat = NODE_DATA(nid);
+
+	if (pgdat->kmigrated)
+		return;
+
+	pgdat->kmigrated = kthread_create(kmigrated, pgdat, "kmigrated%d", nid);
+	if (IS_ERR(pgdat->kmigrated)) {
+		pr_err("Failed to start kmigrated for node %d\n", nid);
+		pgdat->kmigrated = NULL;
+	} else {
+		wake_up_process(pgdat->kmigrated);
+	}
+}
+
+static int __init kmigrated_init(void)
+{
+	int nid;
+
+	for_each_node_state(nid, N_MEMORY)
+		kmigrated_run(nid);
+
+	return 0;
+}
+
+subsys_initcall(kmigrated_init);
diff --git a/mm/mm_init.c b/mm/mm_init.c
index f2944748f526..3a9cfd175366 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -1398,6 +1398,11 @@ static void pgdat_init_kcompactd(struct pglist_data *pgdat)
 static void pgdat_init_kcompactd(struct pglist_data *pgdat) {}
 #endif
 
+static void pgdat_init_kmigrated(struct pglist_data *pgdat)
+{
+	init_waitqueue_head(&pgdat->kmigrated_wait);
+}
+
 static void __meminit pgdat_init_internals(struct pglist_data *pgdat)
 {
 	int i;
@@ -1407,6 +1412,7 @@ static void __meminit pgdat_init_internals(struct pglist_data *pgdat)
 
 	pgdat_init_split_queue(pgdat);
 	pgdat_init_kcompactd(pgdat);
+	pgdat_init_kmigrated(pgdat);
 
 	init_waitqueue_head(&pgdat->kswapd_wait);
 	init_waitqueue_head(&pgdat->pfmemalloc_wait);
diff --git a/mm/page_ext.c b/mm/page_ext.c
index c351fdfe9e9a..546725fffddb 100644
--- a/mm/page_ext.c
+++ b/mm/page_ext.c
@@ -76,6 +76,16 @@ static struct page_ext_operations page_idle_ops __initdata = {
 };
 #endif
 
+static bool need_page_mig(void)
+{
+	return true;
+}
+
+static struct page_ext_operations page_mig_ops __initdata = {
+	.need = need_page_mig,
+	.need_shared_flags = true,
+};
+
 static struct page_ext_operations *page_ext_ops[] __initdata = {
 #ifdef CONFIG_PAGE_OWNER
 	&page_owner_ops,
@@ -89,6 +99,7 @@ static struct page_ext_operations *page_ext_ops[] __initdata = {
 #ifdef CONFIG_PAGE_TABLE_CHECK
 	&page_table_check_ops,
 #endif
+	&page_mig_ops,
 };
 
 unsigned long page_ext_size;
-- 
2.34.1

