linux-kernel - [RFC PATCH 03/10] sched: Select a better task to pull across node using iterations

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives

Hash Suite: Windows password security audit tool. GUI, reports in PDF.

[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]

Message-Id: <1375170505-5967-4-git-send-email-srikar@linux.vnet.ibm.com>
Date:	Tue, 30 Jul 2013 13:18:18 +0530
From:	Srikar Dronamraju <srikar@...ux.vnet.ibm.com>
To:	Mel Gorman <mgorman@...e.de>,
	Peter Zijlstra <a.p.zijlstra@...llo.nl>,
	Ingo Molnar <mingo@...nel.org>
Cc:	Andrea Arcangeli <aarcange@...hat.com>,
	Johannes Weiner <hannes@...xchg.org>,
	Linux-MM <linux-mm@...ck.org>,
	LKML <linux-kernel@...r.kernel.org>,
	Preeti U Murthy <preeti@...ux.vnet.ibm.com>,
	Linus Torvalds <torvalds@...ux-foundation.org>,
	Srikar Dronamraju <srikar@...ux.vnet.ibm.com>
Subject: [RFC PATCH 03/10] sched: Select a better task to pull across node using iterations

While selecting a task to pull across a node, try to choose a task that
improves locatity. i.e choose a task that has more affinity to the
destination node.

To achieve this, parse the list of tasks in multiple iterations. For now
choose just two iterations.  In the first iteration, a task is chosen to
move if and only if moving such a task helps improve node locality.  In
the last iteration, choose the default behaviour, i.e, a task is chosen
irrespective of whether it improves node locality or not.(behaviour
before this change). This iteration logic is only for cross node
migration and with CONFIG_NUMA_BALANCING enabled.

So if there are two tasks in a runq, both eligible to be migrated to
another runq belonging to a different node, then this change tries to
chose a task among the two that improves locality.

Similar logic was first used in Peter Zijlstra's numa core.

Signed-off-by: Srikar Dronamraju <srikar@...ux.vnet.ibm.com>
---
 kernel/sched/fair.c |   48 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 files changed, 48 insertions(+), 0 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 3df7f76..8fcbf96 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3906,6 +3906,7 @@ struct lb_env {
 	unsigned int		loop;
 	unsigned int		loop_break;
 	unsigned int		loop_max;
+	unsigned int		iterations;
 };
 
 /*
@@ -4030,6 +4031,21 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
 	return 1;
 }
 
+#ifdef CONFIG_NUMA_BALANCING
+static bool preferred_node(struct task_struct *p, struct lb_env *env)
+{
+	if (!(env->sd->flags & SD_NUMA))
+		return false;
+
+	return (can_numa_migrate_task(p, env->dst_rq, env->src_rq) == 1);
+}
+#else
+static bool preferred_node(struct task_struct *p, struct lb_env *env)
+{
+	return false;
+}
+#endif
+
 /*
  * move_one_task tries to move exactly one task from busiest to this_rq, as
  * part of active balancing operations within "domain".
@@ -4041,7 +4057,11 @@ static int move_one_task(struct lb_env *env)
 {
 	struct task_struct *p, *n;
 
+again:
 	list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) {
+		if (!preferred_node(p, env))
+			continue;
+
 		if (throttled_lb_pair(task_group(p), env->src_rq->cpu, env->dst_cpu))
 			continue;
 
@@ -4049,6 +4069,7 @@ static int move_one_task(struct lb_env *env)
 			continue;
 
 		move_task(p, env);
+
 		/*
 		 * Right now, this is only the second place move_task()
 		 * is called, so we can safely collect move_task()
@@ -4057,6 +4078,9 @@ static int move_one_task(struct lb_env *env)
 		schedstat_inc(env->sd, lb_gained[env->idle]);
 		return 1;
 	}
+	if (!env->iterations++)
+		goto again;
+
 	return 0;
 }
 
@@ -4096,6 +4120,9 @@ static int move_tasks(struct lb_env *env)
 			break;
 		}
 
+		if (!preferred_node(p, env))
+			goto next;
+
 		if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
 			goto next;
 
@@ -5099,6 +5126,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
 		.idle		= idle,
 		.loop_break	= sched_nr_migrate_break,
 		.cpus		= cpus,
+		.iterations	= 1,
 	};
 
 	cpumask_copy(cpus, cpu_active_mask);
@@ -5130,6 +5158,12 @@ static int load_balance(int this_cpu, struct rq *this_rq,
 	ld_moved = 0;
 	lb_iterations = 1;
 	if (busiest->nr_running > 1) {
+#ifdef CONFIG_NUMA_BALANCING
+		if (sd->flags & SD_NUMA) {
+			if (cpu_to_node(env.dst_cpu) != cpu_to_node(env.src_cpu))
+				env.iterations = 0;
+		}
+#endif
 		/*
 		 * Attempt to move tasks. If find_busiest_group has found
 		 * an imbalance but busiest->nr_running <= 1, the group is
@@ -5160,6 +5194,12 @@ static int load_balance(int this_cpu, struct rq *this_rq,
 			goto more_balance;
 		}
 
+		if (!ld_moved && !env.iterations++) {
+			env.loop	 = 0;
+			env.loop_break	 = sched_nr_migrate_break;
+			goto more_balance;
+		}
+
 		/*
 		 * some other cpu did the load balance for us.
 		 */
@@ -5407,8 +5447,16 @@ static int active_load_balance_cpu_stop(void *data)
 			.src_cpu	= busiest_rq->cpu,
 			.src_rq		= busiest_rq,
 			.idle		= CPU_IDLE,
+			.iterations	= 1,
 		};
 
+#ifdef CONFIG_NUMA_BALANCING
+		if ((sd->flags & SD_NUMA)) {
+			if (cpu_to_node(env.dst_cpu) != cpu_to_node(env.src_cpu))
+				env.iterations = 0;
+		}
+#endif
+
 		schedstat_inc(sd, alb_count);
 
 		if (move_one_task(&env))
-- 
1.7.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/