lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [thread-next>] [day] [month] [year] [list]
Message-ID: <1572863268-28585-1-git-send-email-zhangshaokun@hisilicon.com>
Date:   Mon, 4 Nov 2019 18:27:48 +0800
From:   Shaokun Zhang <zhangshaokun@...ilicon.com>
To:     <linux-kernel@...r.kernel.org>
CC:     yuqi jin <jinyuqi@...wei.com>,
        Andrew Morton <akpm@...ux-foundation.org>,
        Mike Rapoport <rppt@...ux.ibm.com>,
        Paul Burton <paul.burton@...s.com>,
        Michal Hocko <mhocko@...e.com>,
        Michael Ellerman <mpe@...erman.id.au>,
        Anshuman Khandual <anshuman.khandual@....com>,
        Shaokun Zhang <zhangshaokun@...ilicon.com>
Subject: [PATCH v2] lib: optimize cpumask_local_spread()

From: yuqi jin <jinyuqi@...wei.com>

In the multi-processor and NUMA system, I/O device may have many numa
nodes belonging to multiple cpus. When we get a local numa, it is
better to find the node closest to the local numa node, instead
of choosing any online cpu immediately.

For the current code, it only considers the local NUMA node and it
doesn't compute the distances between different NUMA nodes for the
non-local NUMA nodes. Let's optimize it and find the nearest node
through NUMA distance. The performance will be better if it return
the nearest node than the random node.

When Parameter Server workload is tested using NIC device on Huawei
Kunpeng 920 SoC:
Without the patch, the performance is 22W QPS;
Added this patch, the performance become better and it is 26W QPS.

Cc: Andrew Morton <akpm@...ux-foundation.org>
Cc: Mike Rapoport <rppt@...ux.ibm.com>
Cc: Paul Burton <paul.burton@...s.com>
Cc: Michal Hocko <mhocko@...e.com>
Cc: Michael Ellerman <mpe@...erman.id.au>
Cc: Anshuman Khandual <anshuman.khandual@....com>
Signed-off-by: yuqi jin <jinyuqi@...wei.com>
Signed-off-by: Shaokun Zhang <zhangshaokun@...ilicon.com>
---
 lib/cpumask.c | 93 +++++++++++++++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 81 insertions(+), 12 deletions(-)

diff --git a/lib/cpumask.c b/lib/cpumask.c
index 0cb672eb107c..15d8940f32a8 100644
--- a/lib/cpumask.c
+++ b/lib/cpumask.c
@@ -192,18 +192,39 @@ void __init free_bootmem_cpumask_var(cpumask_var_t mask)
 }
 #endif
 
-/**
- * cpumask_local_spread - select the i'th cpu with local numa cpu's first
- * @i: index number
- * @node: local numa_node
- *
- * This function selects an online CPU according to a numa aware policy;
- * local cpus are returned first, followed by non-local ones, then it
- * wraps around.
- *
- * It's not very efficient, but useful for setup.
- */
-unsigned int cpumask_local_spread(unsigned int i, int node)
+static void calc_node_distance(int *node_dist, int node)
+{
+	int i;
+
+	for (i = 0; i < nr_node_ids; i++)
+		node_dist[i] = node_distance(node, i);
+}
+
+static int find_nearest_node(int *node_dist, bool *used)
+{
+	int i, min_dist = node_dist[0], node_id = -1;
+
+	/* Choose the first unused node to compare */
+	for (i = 0; i < nr_node_ids; i++) {
+		if (used[i] == 0) {
+			min_dist = node_dist[i];
+			node_id = i;
+			break;
+		}
+	}
+
+	/* Compare and return the nearest node */
+	for (i = 0; i < nr_node_ids; i++) {
+		if (node_dist[i] < min_dist && used[i] == 0) {
+			min_dist = node_dist[i];
+			node_id = i;
+		}
+	}
+
+	return node_id;
+}
+
+static unsigned int __cpumask_local_spread(unsigned int i, int node)
 {
 	int cpu;
 
@@ -231,4 +252,52 @@ unsigned int cpumask_local_spread(unsigned int i, int node)
 	}
 	BUG();
 }
+
+/**
+ * cpumask_local_spread - select the i'th cpu with local numa cpu's first
+ * @i: index number
+ * @node: local numa_node
+ *
+ * This function selects an online CPU according to a numa aware policy;
+ * local cpus are returned first, followed by the nearest non-local ones,
+ * then it wraps around.
+ *
+ * It's not very efficient, but useful for setup.
+ */
+unsigned int cpumask_local_spread(unsigned int i, int node)
+{
+	int node_dist[MAX_NUMNODES] = {0};
+	bool used[MAX_NUMNODES] = {0};
+	int cpu, j, id;
+
+	/* Wrap: we always want a cpu. */
+	i %= num_online_cpus();
+
+	if (node == NUMA_NO_NODE) {
+		for_each_cpu(cpu, cpu_online_mask)
+			if (i-- == 0)
+				return cpu;
+	} else {
+		if (nr_node_ids > MAX_NUMNODES)
+			return __cpumask_local_spread(i, node);
+
+		calc_node_distance(node_dist, node);
+		for (j = 0; j < nr_node_ids; j++) {
+			id = find_nearest_node(node_dist, used);
+			if (id < 0)
+				break;
+
+			for_each_cpu_and(cpu, cpumask_of_node(id),
+					 cpu_online_mask)
+				if (i-- == 0)
+					return cpu;
+			used[id] = 1;
+		}
+
+		for_each_cpu(cpu, cpu_online_mask)
+			if (i-- == 0)
+				return cpu;
+	}
+	BUG();
+}
 EXPORT_SYMBOL(cpumask_local_spread);
-- 
2.7.4

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ