Message-ID: <20250521080238.209678-3-bharata@amd.com>
Date: Wed, 21 May 2025 13:32:38 +0530
From: Bharata B Rao <bharata@....com>
To: <linux-kernel@...r.kernel.org>, <linux-mm@...ck.org>
CC: <Jonathan.Cameron@...wei.com>, <dave.hansen@...el.com>,
<gourry@...rry.net>, <hannes@...xchg.org>, <mgorman@...hsingularity.net>,
<mingo@...hat.com>, <peterz@...radead.org>, <raghavendra.kt@....com>,
<riel@...riel.com>, <rientjes@...gle.com>, <sj@...nel.org>,
<weixugc@...gle.com>, <willy@...radead.org>, <ying.huang@...ux.alibaba.com>,
<ziy@...dia.com>, <dave@...olabs.net>, <nifan.cxl@...il.com>,
<joshua.hahnjy@...il.com>, <xuezhengchu@...wei.com>, <yiannis@...corp.com>,
<akpm@...ux-foundation.org>, <david@...hat.com>, Bharata B Rao
<bharata@....com>
Subject: [RFC PATCH v0 2/2] mm: sched: Batch-migrate misplaced pages

Currently, folios identified as misplaced by the NUMA balancing
sub-system are migrated one at a time from the NUMA hint fault
handler, as and when they are identified.

Instead of such single-folio migrations, batch them and migrate
them at once.

Identified misplaced folios are isolated and stored in a per-task
list. A new task_work is queued from the task tick handler to
migrate them in batches. Migration is done periodically, or when
the number of pending isolated folios exceeds a threshold.
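
For clarity, the trigger logic added in the kernel/sched/fair.c hunk
below looks roughly like this (a condensed sketch with added comments;
names and values are those from the patch, error paths elided):

  static void task_check_pending_migrations(struct task_struct *curr)
  {
          struct callback_head *work = &curr->numa_mig_work;

          /* The migration task_work is already queued, nothing to do. */
          if (work->next != work)
                  return;

          /* Arm it when the period expires or the backlog grows large. */
          if (time_after(jiffies, curr->numa_mig_interval) ||
              (curr->migrate_count > NUMAB_BATCH_MIGRATION_THRESHOLD)) {
                  curr->numa_mig_interval = jiffies + HZ;
                  task_work_add(curr, work, TWA_RESUME);
          }
  }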

The PTEs for the isolated folios are restored to the PRESENT state
right after isolation.

The last_cpupid field of isolated folios is used to store the
target_nid to which the folios need to be migrated. This needs
changes to (at least) a couple of places where the last_cpupid
field is updated/reset, which should now happen conditionally.
The update in folio_migrate_flags() isn't handled yet, but the
reset in the write page fault case is handled.
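
Concretely, the stashing and retrieval look like this (condensed from
the mm/memory.c and kernel/sched/fair.c hunks below; locking and error
handling elided):

  /* do_numa_page(): stash the target node and queue the isolated folio */
  folio_xchg_last_cpupid(folio, target_nid);
  list_add_tail(&folio->lru, &task->migrate_list);
  task->migrate_count += nr_pages;

  /* migrate_queued_pages(): recover the target node and reset the field */
  nid = folio_xchg_last_cpupid(folio, -1);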

The failed migration count isn't currently fed back to the scan
period update heuristics.

Signed-off-by: Bharata B Rao <bharata@....com>
---
include/linux/sched.h | 4 +++
init/init_task.c | 2 ++
kernel/sched/fair.c | 64 +++++++++++++++++++++++++++++++++++++++++++
mm/memory.c | 44 +++++++++++++++--------------
4 files changed, 93 insertions(+), 21 deletions(-)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index f96ac1982893..4177ecf53633 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1360,6 +1360,8 @@ struct task_struct {
u64 last_task_numa_placement;
u64 last_sum_exec_runtime;
struct callback_head numa_work;
+ struct callback_head numa_mig_work;
+ unsigned long numa_mig_interval;
/*
* This pointer is only modified for current in syscall and
@@ -1397,6 +1399,8 @@ struct task_struct {
unsigned long numa_faults_locality[3];
unsigned long numa_pages_migrated;
+ struct list_head migrate_list;
+ unsigned long migrate_count;
#endif /* CONFIG_NUMA_BALANCING */
#ifdef CONFIG_RSEQ
diff --git a/init/init_task.c b/init/init_task.c
index e557f622bd90..997af6ab67a7 100644
--- a/init/init_task.c
+++ b/init/init_task.c
@@ -187,6 +187,8 @@ struct task_struct init_task __aligned(L1_CACHE_BYTES) = {
.numa_preferred_nid = NUMA_NO_NODE,
.numa_group = NULL,
.numa_faults = NULL,
+ .migrate_count = 0,
+ .migrate_list = LIST_HEAD_INIT(init_task.migrate_list),
#endif
#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)
.kasan_depth = 1,
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 0fb9bf995a47..d6cbf8be76e1 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -49,6 +49,7 @@
#include <linux/ratelimit.h>
#include <linux/task_work.h>
#include <linux/rbtree_augmented.h>
+#include <linux/migrate.h>
#include <asm/switch_to.h>
@@ -1463,6 +1464,8 @@ unsigned int sysctl_numa_balancing_scan_delay = 1000;
/* The page with hint page fault latency < threshold in ms is considered hot */
unsigned int sysctl_numa_balancing_hot_threshold = MSEC_PER_SEC;
+#define NUMAB_BATCH_MIGRATION_THRESHOLD 512
+
struct numa_group {
refcount_t refcount;
@@ -3297,6 +3300,46 @@ static bool vma_is_accessed(struct mm_struct *mm, struct vm_area_struct *vma)
#define VMA_PID_RESET_PERIOD (4 * sysctl_numa_balancing_scan_delay)
+/*
+ * TODO: Feed failed migration count back to scan period update
+ * mechanism.
+ */
+static void migrate_queued_pages(struct list_head *migrate_list)
+{
+ int cur_nid, nid;
+ struct folio *folio, *tmp;
+ LIST_HEAD(nid_list);
+
+ folio = list_first_entry(migrate_list, struct folio, lru);
+ cur_nid = folio_last_cpupid(folio);
+
+ list_for_each_entry_safe(folio, tmp, migrate_list, lru) {
+ nid = folio_xchg_last_cpupid(folio, -1);
+
+ if (cur_nid != nid) {
+ migrate_misplaced_folio_batch(&nid_list, cur_nid);
+ cur_nid = nid;
+ }
+ list_move(&folio->lru, &nid_list);
+ }
+ migrate_misplaced_folio_batch(&nid_list, cur_nid);
+}
+
+static void task_migration_work(struct callback_head *work)
+{
+ struct task_struct *p = current;
+
+ WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_mig_work));
+
+ work->next = work;
+
+ if (list_empty(&p->migrate_list))
+ return;
+
+ migrate_queued_pages(&p->migrate_list);
+ p->migrate_count = 0;
+}
+
/*
* The expensive part of numa migration is done from task_work context.
* Triggered from task_tick_numa().
@@ -3567,14 +3610,19 @@ void init_numa_balancing(unsigned long clone_flags, struct task_struct *p)
p->numa_migrate_retry = 0;
/* Protect against double add, see task_tick_numa and task_numa_work */
p->numa_work.next = &p->numa_work;
+ p->numa_mig_work.next = &p->numa_mig_work;
+ p->numa_mig_interval = 0;
p->numa_faults = NULL;
p->numa_pages_migrated = 0;
p->total_numa_faults = 0;
RCU_INIT_POINTER(p->numa_group, NULL);
p->last_task_numa_placement = 0;
p->last_sum_exec_runtime = 0;
+ p->migrate_count = 0;
+ INIT_LIST_HEAD(&p->migrate_list);
init_task_work(&p->numa_work, task_numa_work);
+ init_task_work(&p->numa_mig_work, task_migration_work);
/* New address space, reset the preferred nid */
if (!(clone_flags & CLONE_VM)) {
@@ -3596,6 +3644,20 @@ void init_numa_balancing(unsigned long clone_flags, struct task_struct *p)
}
}
+static void task_check_pending_migrations(struct task_struct *curr)
+{
+ struct callback_head *work = &curr->numa_mig_work;
+
+ if (work->next != work)
+ return;
+
+ if (time_after(jiffies, curr->numa_mig_interval) ||
+ (curr->migrate_count > NUMAB_BATCH_MIGRATION_THRESHOLD)) {
+ curr->numa_mig_interval = jiffies + HZ;
+ task_work_add(curr, work, TWA_RESUME);
+ }
+}
+
/*
* Drive the periodic memory faults..
*/
@@ -3610,6 +3672,8 @@ static void task_tick_numa(struct rq *rq, struct task_struct *curr)
if (!curr->mm || (curr->flags & (PF_EXITING | PF_KTHREAD)) || work->next != work)
return;
+ task_check_pending_migrations(curr);
+
/*
* Using runtime rather than walltime has the dual advantage that
* we (mostly) drive the selection from busy threads and that the
diff --git a/mm/memory.c b/mm/memory.c
index 49199410805c..11d07004cb04 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3375,8 +3375,13 @@ static inline void wp_page_reuse(struct vm_fault *vmf, struct folio *folio)
* Clear the folio's cpupid information as the existing
* information potentially belongs to a now completely
* unrelated process.
+ *
+ * If the page is found to be isolated pending migration,
+ * then don't reset as last_cpupid will be holding the
+ * target_nid information.
*/
- folio_xchg_last_cpupid(folio, (1 << LAST_CPUPID_SHIFT) - 1);
+ if (folio_test_lru(folio))
+ folio_xchg_last_cpupid(folio, (1 << LAST_CPUPID_SHIFT) - 1);
}
flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte));
@@ -5766,12 +5771,13 @@ static void numa_rebuild_large_mapping(struct vm_fault *vmf, struct vm_area_stru
static vm_fault_t do_numa_page(struct vm_fault *vmf)
{
+ struct task_struct *task = current;
struct vm_area_struct *vma = vmf->vma;
struct folio *folio = NULL;
int nid = NUMA_NO_NODE;
bool writable = false, ignore_writable = false;
bool pte_write_upgrade = vma_wants_manual_pte_write_upgrade(vma);
- int last_cpupid;
+ int last_cpupid = (-1 & LAST_CPUPID_MASK);
int target_nid;
pte_t pte, old_pte;
int flags = 0, nr_pages;
@@ -5807,6 +5813,13 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
nid = folio_nid(folio);
nr_pages = folio_nr_pages(folio);
+ /*
+ * If it is a non-LRU folio, it has been already
+ * isolated and is in migration list.
+ */
+ if (!folio_test_lru(folio))
+ goto out_map;
+
target_nid = numa_migrate_check(folio, vmf, vmf->address, &flags,
writable, &last_cpupid);
if (target_nid == NUMA_NO_NODE)
@@ -5815,28 +5828,17 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
flags |= TNF_MIGRATE_FAIL;
goto out_map;
}
- /* The folio is isolated and isolation code holds a folio reference. */
- pte_unmap_unlock(vmf->pte, vmf->ptl);
writable = false;
ignore_writable = true;
+ nid = target_nid;
- /* Migrate to the requested node */
- if (!migrate_misplaced_folio(folio, target_nid)) {
- nid = target_nid;
- flags |= TNF_MIGRATED;
- task_numa_fault(last_cpupid, nid, nr_pages, flags);
- return 0;
- }
-
- flags |= TNF_MIGRATE_FAIL;
- vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
- vmf->address, &vmf->ptl);
- if (unlikely(!vmf->pte))
- return 0;
- if (unlikely(!pte_same(ptep_get(vmf->pte), vmf->orig_pte))) {
- pte_unmap_unlock(vmf->pte, vmf->ptl);
- return 0;
- }
+ /*
+ * Store target_nid in last_cpupid field for the isolated
+ * folios.
+ */
+ folio_xchg_last_cpupid(folio, target_nid);
+ list_add_tail(&folio->lru, &task->migrate_list);
+ task->migrate_count += nr_pages;
out_map:
/*
* Make it present again, depending on how arch implements
--
2.34.1