Message-Id: <20250113073050.2811925-3-zhouchuyi@bytedance.com>
Date: Mon, 13 Jan 2025 15:30:49 +0800
From: Chuyi Zhou <zhouchuyi@...edance.com>
To: mingo@...hat.com,
peterz@...radead.org,
juri.lelli@...hat.com,
vincent.guittot@...aro.org,
dietmar.eggemann@....com,
rostedt@...dmis.org,
bsegall@...gle.com,
mgorman@...e.de,
vschneid@...hat.com,
longman@...hat.com,
riel@...riel.com
Cc: chengming.zhou@...ux.dev,
kprateek.nayak@....com,
linux-kernel@...r.kernel.org,
Chuyi Zhou <zhouchuyi@...edance.com>
Subject: [PATCH v3 2/3] sched/fair: Introduce per cpu numa_balance_mask
Introduce a per-CPU cpumask, numa_balance_mask. Similar to select_rq_mask,
it is used as a temporary working mask when searching for candidate CPUs
and updating NUMA stats. This simplifies the next patch, since we no longer
need to re-check whether each candidate CPU is in env->p->cpus_ptr on every
iteration.
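
For readers unfamiliar with the pattern, below is a minimal, self-contained
sketch of how such a per-CPU scratch cpumask is typically defined, allocated
at init time, and used with preemption disabled. The names (scratch_mask,
scratch_mask_init, example_find_cpu) are illustrative only and are not part
of this patch; the real code lives in kernel/sched/fair.c as shown in the
diff below.

#include <linux/cpumask.h>
#include <linux/percpu.h>
#include <linux/preempt.h>
#include <linux/topology.h>
#include <linux/sched.h>
#include <linux/init.h>
#include <linux/gfp.h>

/* One scratch mask per CPU, analogous to numa_balance_mask/select_rq_mask. */
static DEFINE_PER_CPU(cpumask_var_t, scratch_mask);

static int __init scratch_mask_init(void)
{
	int i;

	/* Allocate each CPU's mask on that CPU's node, as in init_sched_fair_class(). */
	for_each_possible_cpu(i)
		zalloc_cpumask_var_node(&per_cpu(scratch_mask, i),
					GFP_KERNEL, cpu_to_node(i));
	return 0;
}

static int example_find_cpu(struct task_struct *p, int nid)
{
	struct cpumask *cpus;
	int cpu, found = -1;

	/* The this_cpu pointer is only stable while preemption is off. */
	preempt_disable();
	cpus = this_cpu_cpumask_var_ptr(scratch_mask);

	/* Restrict the node's CPUs by the task's affinity once, up front... */
	cpumask_and(cpus, cpumask_of_node(nid), p->cpus_ptr);

	/* ...so the search loop no longer re-tests p->cpus_ptr per CPU. */
	for_each_cpu(cpu, cpus) {
		if (idle_cpu(cpu)) {
			found = cpu;
			break;
		}
	}
	preempt_enable();
	return found;
}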
Signed-off-by: Chuyi Zhou <zhouchuyi@...edance.com>
---
kernel/sched/fair.c | 36 +++++++++++++++++++++++++-----------
1 file changed, 25 insertions(+), 11 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index f544012b9320..53fd95129b48 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1448,6 +1448,9 @@ unsigned int sysctl_numa_balancing_scan_delay = 1000;
/* The page with hint page fault latency < threshold in ms is considered hot */
unsigned int sysctl_numa_balancing_hot_threshold = MSEC_PER_SEC;
+/* Working cpumask for task_numa_migrate() */
+static DEFINE_PER_CPU(cpumask_var_t, numa_balance_mask);
+
struct numa_group {
refcount_t refcount;
@@ -2047,6 +2050,7 @@ struct numa_stats {
struct task_numa_env {
struct task_struct *p;
+ struct cpumask *cpus;
int src_cpu, src_nid;
int dst_cpu, dst_nid;
int imb_numa_nr;
@@ -2121,8 +2125,10 @@ static void update_numa_stats(struct task_numa_env *env,
memset(ns, 0, sizeof(*ns));
ns->idle_cpu = -1;
+ cpumask_copy(env->cpus, cpumask_of_node(nid));
+
rcu_read_lock();
- for_each_cpu(cpu, cpumask_of_node(nid)) {
+ for_each_cpu(cpu, env->cpus) {
struct rq *rq = cpu_rq(cpu);
ns->load += cpu_load(rq);
@@ -2144,7 +2150,7 @@ static void update_numa_stats(struct task_numa_env *env,
}
rcu_read_unlock();
- ns->weight = cpumask_weight(cpumask_of_node(nid));
+ ns->weight = cpumask_weight(env->cpus);
ns->node_type = numa_classify(env->imbalance_pct, ns);
@@ -2163,11 +2169,9 @@ static void task_numa_assign(struct task_numa_env *env,
int start = env->dst_cpu;
/* Find alternative idle CPU. */
- for_each_cpu_wrap(cpu, cpumask_of_node(env->dst_nid), start + 1) {
- if (cpu == env->best_cpu || !idle_cpu(cpu) ||
- !cpumask_test_cpu(cpu, env->p->cpus_ptr)) {
+ for_each_cpu_wrap(cpu, env->cpus, start + 1) {
+ if (cpu == env->best_cpu || !idle_cpu(cpu))
continue;
- }
env->dst_cpu = cpu;
rq = cpu_rq(env->dst_cpu);
@@ -2434,6 +2438,8 @@ static void task_numa_find_cpu(struct task_numa_env *env,
bool maymove = false;
int cpu;
+ cpumask_and(env->cpus, cpumask_of_node(env->dst_nid), env->p->cpus_ptr);
+
/*
* If dst node has spare capacity, then check if there is an
* imbalance that would be overruled by the load balancer.
@@ -2475,11 +2481,7 @@ static void task_numa_find_cpu(struct task_numa_env *env,
maymove = !load_too_imbalanced(src_load, dst_load, env);
}
- for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) {
- /* Skip this CPU if the source task cannot migrate */
- if (!cpumask_test_cpu(cpu, env->p->cpus_ptr))
- continue;
-
+ for_each_cpu(cpu, env->cpus) {
env->dst_cpu = cpu;
if (task_numa_compare(env, taskimp, groupimp, maymove))
break;
@@ -2534,6 +2536,12 @@ static void task_numa_migrate(struct task_struct *p)
return;
}
+ /*
+	 * Disable preemption for per-CPU numa_balance_mask and rq->rd->span usage.
+ */
+ preempt_disable();
+
+ env.cpus = this_cpu_cpumask_var_ptr(numa_balance_mask);
env.dst_nid = p->numa_preferred_nid;
dist = env.dist = node_distance(env.src_nid, env.dst_nid);
taskweight = task_weight(p, env.src_nid, dist);
@@ -2579,6 +2587,8 @@ static void task_numa_migrate(struct task_struct *p)
}
}
+ preempt_enable();
+
/*
* If the task is part of a workload that spans multiple NUMA nodes,
* and is migrating into one of the workload's active nodes, remember
@@ -13638,6 +13648,10 @@ __init void init_sched_fair_class(void)
zalloc_cpumask_var_node(&per_cpu(should_we_balance_tmpmask, i),
GFP_KERNEL, cpu_to_node(i));
+#ifdef CONFIG_NUMA_BALANCING
+ zalloc_cpumask_var_node(&per_cpu(numa_balance_mask, i), GFP_KERNEL, cpu_to_node(i));
+#endif
+
#ifdef CONFIG_CFS_BANDWIDTH
INIT_CSD(&cpu_rq(i)->cfsb_csd, __cfsb_csd_unthrottle, cpu_rq(i));
INIT_LIST_HEAD(&cpu_rq(i)->cfsb_csd_list);
--
2.20.1