linux-kernel - Re: [PATCH] numa,sched: only consider less busy nodes as numa balancing destination

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives

Hash Suite: Windows password security audit tool. GUI, reports in PDF.

[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]

Message-ID: <20150513062906.GJ3007@worktop.Skamania.guest>
Date:	Wed, 13 May 2015 08:29:06 +0200
From:	Peter Zijlstra <peterz@...radead.org>
To:	Rik van Riel <riel@...hat.com>
Cc:	dedekind1@...il.com, linux-kernel@...r.kernel.org, mgorman@...e.de,
	jhladky@...hat.com
Subject: Re: [PATCH] numa,sched: only consider less busy nodes as numa
 balancing destination

On Tue, May 12, 2015 at 11:45:09AM -0400, Rik van Riel wrote:
> I have a few poorly formed ideas on what could be done about that:
> 
> 1) have fbq_classify_rq take the current task on the rq into account,
>    and adjust the fbq classification if all the runnable-but-queued
>    tasks are on the right node

So while looking at this I came up with the below; it treats anything
inside ->active_nodes as a preferred node for balancing purposes.

Would that make sense?

I'll see what I can do about taking the current task into account in the
runqueue type classification.

> 2) ensure that rq->nr_numa_running and rq->nr_preferred_running also
>    get incremented for kernel threads that are bound to a particular
>    CPU - currently CPU-bound kernel threads will cause the NUMA
>    statistics to look like a CPU has tasks that do not belong on that
>    NUMA node

I'm thinking of accounting those to nr_pinned; let me see how that works
out.

---
 include/linux/sched.h |  1 +
 kernel/sched/fair.c   | 58 ++++++++++++++++++++++++++++++++-------------------
 2 files changed, 38 insertions(+), 21 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index cb734861123a..ffebc2e091ad 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1443,6 +1443,7 @@ struct task_struct {
 	unsigned sched_reset_on_fork:1;
 	unsigned sched_contributes_to_load:1;
 	unsigned sched_migrated:1;
+	unsigned sched_preferred:1;
 
 #ifdef CONFIG_MEMCG_KMEM
 	unsigned memcg_kmem_skip_account:1;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 8c1510abeefa..d59adb8e8ef4 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -856,18 +856,6 @@ static unsigned int task_scan_max(struct task_struct *p)
 	return max(smin, smax);
 }
 
-static void account_numa_enqueue(struct rq *rq, struct task_struct *p)
-{
-	rq->nr_numa_running += (p->numa_preferred_nid != -1);
-	rq->nr_preferred_running += (p->numa_preferred_nid == task_node(p));
-}
-
-static void account_numa_dequeue(struct rq *rq, struct task_struct *p)
-{
-	rq->nr_numa_running -= (p->numa_preferred_nid != -1);
-	rq->nr_preferred_running -= (p->numa_preferred_nid == task_node(p));
-}
-
 struct numa_group {
 	atomic_t refcount;
 
@@ -887,6 +875,28 @@ struct numa_group {
 	unsigned long faults[0];
 };
 
+static void account_numa_enqueue(struct rq *rq, struct task_struct *p)
+{
+	int node = task_node(p);
+	bool local;
+
+	rq->nr_numa_running += (p->numa_preferred_nid != -1);
+
+	if (p->numa_group)
+		local = node_isset(node, p->numa_group->active_nodes);
+	else
+		local = (p->numa_preferred_nid == node);
+
+	p->sched_preferred = local;
+	rq->nr_preferred_running += local;
+}
+
+static void account_numa_dequeue(struct rq *rq, struct task_struct *p)
+{
+	rq->nr_numa_running -= (p->numa_preferred_nid != -1);
+	rq->nr_preferred_running -= p->sched_preferred;
+}
+
 /* Shared or private faults. */
 #define NR_NUMA_HINT_FAULT_TYPES 2
 
@@ -1572,9 +1582,10 @@ static void numa_migrate_preferred(struct task_struct *p)
  * are added when they cause over 6/16 of the maximum number of faults, but
  * only removed when they drop below 3/16.
  */
-static void update_numa_active_node_mask(struct numa_group *numa_group)
+static bool update_numa_active_node_mask(struct numa_group *numa_group)
 {
 	unsigned long faults, max_faults = 0;
+	bool update = false;
 	int nid;
 
 	for_each_online_node(nid) {
@@ -1586,11 +1597,17 @@ static void update_numa_active_node_mask(struct numa_group *numa_group)
 	for_each_online_node(nid) {
 		faults = group_faults_cpu(numa_group, nid);
 		if (!node_isset(nid, numa_group->active_nodes)) {
-			if (faults > max_faults * 6 / 16)
+			if (faults > max_faults * 6 / 16) {
 				node_set(nid, numa_group->active_nodes);
-		} else if (faults < max_faults * 3 / 16)
+				update = true;
+			}
+		} else if (faults < max_faults * 3 / 16) {
 			node_clear(nid, numa_group->active_nodes);
+			update = true;
+		}
 	}
+
+	return update;
 }
 
 /*
@@ -1884,16 +1901,15 @@ static void task_numa_placement(struct task_struct *p)
 		update_numa_active_node_mask(p->numa_group);
 		spin_unlock_irq(group_lock);
 		max_nid = preferred_group_nid(p, max_group_nid);
-	}
-
-	if (max_faults) {
+		sched_setnuma(p, max_nid);
+	} else if (max_faults) {
 		/* Set the new preferred node */
 		if (max_nid != p->numa_preferred_nid)
 			sched_setnuma(p, max_nid);
-
-		if (task_node(p) != p->numa_preferred_nid)
-			numa_migrate_preferred(p);
 	}
+
+	if (task_node(p) != p->numa_preferred_nid)
+		numa_migrate_preferred(p);
 }
 
 static inline int get_numa_group(struct numa_group *grp)
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/