linux-kernel - [PATCH RFC] numa,sched: have fbq_classify

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives

Hash Suite: Windows password security audit tool. GUI, reports in PDF.

[<prev] [next>] [thread-next>] [day] [month] [year] [list]

Message-ID: <20150519105544.68caa726@annuminas.surriel.com>
Date:	Tue, 19 May 2015 10:55:44 -0400
From:	Rik van Riel <riel@...hat.com>
To:	linux-kernel@...r.kernel.org
Cc:	peterz@...radead.org, mgorman@...e.de, jhladky@...hat.com,
	dedekind1@...il.com, mingo@...nel.org
Subject: [PATCH RFC] numa,sched: have fbq_classify_* factor out the current
 task

The load balancer tries to find, through find_busiest_group and
find_busiest_queue, a busy CPU with tasks that are running on
the wrong NUMA node.

However, the load balancer only moves runnable-but-not-running
tasks in most situations. This fails horribly when the current
task on a CPU is on the wrong NUMA node, but the other task(s)
on the run queue are placed correctly.

In that situation, what started out as one misplaced task
quickly turns into two misplaced tasks.

Try to avoid that by factoring out the placement of the current
task, in order to find groups and runqueues with misplaced tasks
that are not currently running.

Signed-off-by: Rik van Riel <riel@...hat.com>
---
 kernel/sched/fair.c | 43 ++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 38 insertions(+), 5 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 7696cbad82e0..265109566dc6 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -810,6 +810,8 @@ unsigned int sysctl_numa_balancing_scan_size = 256;
 /* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
 unsigned int sysctl_numa_balancing_scan_delay = 1000;
 
+void fbq_classify_current(int cpu, struct rq *rq, int *numa, int *remote);
+
 static unsigned int task_nr_scan_windows(struct task_struct *p)
 {
 	unsigned long rss = 0;
@@ -6290,6 +6292,17 @@ static inline void update_sg_lb_stats(struct lb_env *env,
 #ifdef CONFIG_NUMA_BALANCING
 		sgs->nr_numa_running += rq->nr_numa_running;
 		sgs->nr_preferred_running += rq->nr_preferred_running;
+		{
+			int numa, remote;
+			/*
+			 * The current task cannot be moved. Pretend it is
+			 * running on the right NUMA node, without counting
+			 * it twice.
+			 */
+			fbq_classify_current(i, rq, &numa, &remote);
+			sgs->nr_numa_running += numa;
+			sgs->nr_preferred_running += remote;
+		}
 #endif
 		sgs->sum_weighted_load += weighted_cpuload(i);
 		if (idle_cpu(i))
@@ -6368,11 +6381,31 @@ static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
 	return all;
 }
 
-static inline enum fbq_type fbq_classify_rq(struct rq *rq)
+/*
+ * Is the current task running on the desired NUMA node?
+ * Must be called with the rcu_read_lock held.
+ */
+void fbq_classify_current(int cpu, struct rq *rq, int *numa, int *remote)
 {
-	if (rq->nr_running > rq->nr_numa_running)
+	struct task_struct *curr = rq->curr;
+	int curr_node = cpu_to_node(cpu);
+
+	*numa = curr->numa_preferred_nid != -1;
+	*remote = *numa && curr->numa_preferred_nid != curr_node;
+}
+
+static inline enum fbq_type fbq_classify_rq(struct rq *rq, int cpu)
+{
+	int numa, remote;
+	/*
+	 * The current task cannot be moved by the load balancer.
+	 * Pretend it is running on the right NUMA node, but be
+	 * careful not to count it twice.
+	 */
+	fbq_classify_current(cpu, rq, &numa, &remote);
+	if (rq->nr_running > rq->nr_numa_running + numa)
 		return regular;
-	if (rq->nr_running > rq->nr_preferred_running)
+	if (rq->nr_running > rq->nr_preferred_running + remote)
 		return remote;
 	return all;
 }
@@ -6382,7 +6415,7 @@ static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
 	return all;
 }
 
-static inline enum fbq_type fbq_classify_rq(struct rq *rq)
+static inline enum fbq_type fbq_classify_rq(struct rq *rq, int cpu)
 {
 	return regular;
 }
@@ -6773,7 +6806,7 @@ static struct rq *find_busiest_queue(struct lb_env *env,
 		enum fbq_type rt;
 
 		rq = cpu_rq(i);
-		rt = fbq_classify_rq(rq);
+		rt = fbq_classify_rq(rq, i);
 
 		/*
 		 * We classify groups/runqueues into three groups:

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/