lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [day] [month] [year] [list]
Message-ID: <tip-a387e95a49743cf9835c5299ca549232618d8249@git.kernel.org>
Date:	Fri, 24 Dec 2010 01:38:12 GMT
From:	tip-bot for David Rientjes <rientjes@...gle.com>
To:	linux-tip-commits@...r.kernel.org
Cc:	linux-kernel@...r.kernel.org, hpa@...or.com, mingo@...hat.com,
	tglx@...utronix.de, hpa@...ux.intel.com, rientjes@...gle.com
Subject: [tip:x86/numa] x86, numa: Fix cpu to node mapping for sparse node ids

Commit-ID:  a387e95a49743cf9835c5299ca549232618d8249
Gitweb:     http://git.kernel.org/tip/a387e95a49743cf9835c5299ca549232618d8249
Author:     David Rientjes <rientjes@...gle.com>
AuthorDate: Wed, 22 Dec 2010 17:23:56 -0800
Committer:  H. Peter Anvin <hpa@...ux.intel.com>
CommitDate: Thu, 23 Dec 2010 15:27:16 -0800

x86, numa: Fix cpu to node mapping for sparse node ids

NUMA boot code assumes that physical node ids start at 0, but the DIMMs
that the apic id represents may not be reachable.  If this is the case,
node 0 is never online and cpus never end up getting appropriately
assigned to a node.  This causes the cpumask of all online nodes to be
empty and machines crash with kernel code assuming online nodes have
valid cpus.

The fix is to appropriately map all the address ranges for physical nodes
and ensure the cpu to node mapping function checks all possible nodes (up
to MAX_NUMNODES) instead of simply checking nodes 0-N, where N is the
number of physical nodes, for valid address ranges.

This requires no longer "compressing" the address ranges of nodes in the
physical node map from 0-N, but rather leave indices in physnodes[] to
represent the actual node id of the physical node.  Accordingly, the
topology exported by both amd_get_nodes() and acpi_get_nodes() no longer
must return the number of nodes to iterate through; all such iterations
will now be to MAX_NUMNODES.

This change also passes the end address of system RAM (which may be
different from normal operation if mem= is specified on the command line)
before the physnodes[] array is populated.  ACPI parsed nodes are
truncated to fit within the address range that respect the mem=
boundaries and even some physical nodes may become unreachable in such
cases.

When NUMA emulation does succeed, any apicid to node mapping that exists
for unreachable nodes are given default values so that proximity domains
can still be assigned.  This is important for node_distance() to
function as desired.

Signed-off-by: David Rientjes <rientjes@...gle.com>
LKML-Reference: <alpine.DEB.2.00.1012221702090.3701@...no.kir.corp.google.com>
Signed-off-by: H. Peter Anvin <hpa@...ux.intel.com>
---
 arch/x86/include/asm/acpi.h   |    3 ++-
 arch/x86/include/asm/amd_nb.h |    2 +-
 arch/x86/mm/amdtopology_64.c  |    9 +++------
 arch/x86/mm/numa_64.c         |   18 +++---------------
 arch/x86/mm/srat_64.c         |   22 ++++++++++++++++------
 5 files changed, 25 insertions(+), 29 deletions(-)

diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h
index 8288daf..211ca3f 100644
--- a/arch/x86/include/asm/acpi.h
+++ b/arch/x86/include/asm/acpi.h
@@ -185,7 +185,8 @@ struct bootnode;
 
 #ifdef CONFIG_ACPI_NUMA
 extern int acpi_numa;
-extern int acpi_get_nodes(struct bootnode *physnodes);
+extern void acpi_get_nodes(struct bootnode *physnodes, unsigned long start,
+				unsigned long end);
 extern int acpi_scan_nodes(unsigned long start, unsigned long end);
 #define NR_NODE_MEMBLKS (MAX_NUMNODES*2)
 
diff --git a/arch/x86/include/asm/amd_nb.h b/arch/x86/include/asm/amd_nb.h
index 8f6192c..980f225 100644
--- a/arch/x86/include/asm/amd_nb.h
+++ b/arch/x86/include/asm/amd_nb.h
@@ -14,7 +14,7 @@ extern int amd_scan_nodes(void);
 
 #ifdef CONFIG_NUMA_EMU
 extern void amd_fake_nodes(const struct bootnode *nodes, int nr_nodes);
-extern int amd_get_nodes(struct bootnode *nodes);
+extern void amd_get_nodes(struct bootnode *nodes);
 #endif
 
 struct amd_northbridge {
diff --git a/arch/x86/mm/amdtopology_64.c b/arch/x86/mm/amdtopology_64.c
index eb5cbb9..0df2623 100644
--- a/arch/x86/mm/amdtopology_64.c
+++ b/arch/x86/mm/amdtopology_64.c
@@ -187,17 +187,14 @@ static s16 fake_apicid_to_node[MAX_LOCAL_APIC] __initdata = {
 	[0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
 };
 
-int __init amd_get_nodes(struct bootnode *physnodes)
+void __init amd_get_nodes(struct bootnode *physnodes)
 {
 	int i;
-	int ret = 0;
 
 	for_each_node_mask(i, nodes_parsed) {
-		physnodes[ret].start = nodes[i].start;
-		physnodes[ret].end = nodes[i].end;
-		ret++;
+		physnodes[i].start = nodes[i].start;
+		physnodes[i].end = nodes[i].end;
 	}
-	return ret;
 }
 
 static int __init find_node_by_addr(unsigned long addr)
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index dd300c4..3d73201 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -266,25 +266,24 @@ static char *cmdline __initdata;
 static int __init setup_physnodes(unsigned long start, unsigned long end,
 					int acpi, int amd)
 {
-	int nr_nodes = 0;
 	int ret = 0;
 	int i;
 
 	memset(physnodes, 0, sizeof(physnodes));
 #ifdef CONFIG_ACPI_NUMA
 	if (acpi)
-		nr_nodes = acpi_get_nodes(physnodes);
+		acpi_get_nodes(physnodes, start, end);
 #endif
 #ifdef CONFIG_AMD_NUMA
 	if (amd)
-		nr_nodes = amd_get_nodes(physnodes);
+		amd_get_nodes(physnodes);
 #endif
 	/*
 	 * Basic sanity checking on the physical node map: there may be errors
 	 * if the SRAT or AMD code incorrectly reported the topology or the mem=
 	 * kernel parameter is used.
 	 */
-	for (i = 0; i < nr_nodes; i++) {
+	for (i = 0; i < MAX_NUMNODES; i++) {
 		if (physnodes[i].start == physnodes[i].end)
 			continue;
 		if (physnodes[i].start > end) {
@@ -299,17 +298,6 @@ static int __init setup_physnodes(unsigned long start, unsigned long end,
 			physnodes[i].start = start;
 		if (physnodes[i].end > end)
 			physnodes[i].end = end;
-	}
-
-	/*
-	 * Remove all nodes that have no memory or were truncated because of the
-	 * limited address range.
-	 */
-	for (i = 0; i < nr_nodes; i++) {
-		if (physnodes[i].start == physnodes[i].end)
-			continue;
-		physnodes[ret].start = physnodes[i].start;
-		physnodes[ret].end = physnodes[i].end;
 		ret++;
 	}
 
diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c
index c48b443..a756bcf 100644
--- a/arch/x86/mm/srat_64.c
+++ b/arch/x86/mm/srat_64.c
@@ -340,17 +340,16 @@ static int __init nodes_cover_memory(const struct bootnode *nodes)
 void __init acpi_numa_arch_fixup(void) {}
 
 #ifdef CONFIG_NUMA_EMU
-int __init acpi_get_nodes(struct bootnode *physnodes)
+void __init acpi_get_nodes(struct bootnode *physnodes, unsigned long start,
+				unsigned long end)
 {
 	int i;
-	int ret = 0;
 
 	for_each_node_mask(i, nodes_parsed) {
-		physnodes[ret].start = nodes[i].start;
-		physnodes[ret].end = nodes[i].end;
-		ret++;
+		cutoff_node(i, start, end);
+		physnodes[i].start = nodes[i].start;
+		physnodes[i].end = nodes[i].end;
 	}
-	return ret;
 }
 #endif /* CONFIG_NUMA_EMU */
 
@@ -516,6 +515,17 @@ void __init acpi_fake_nodes(const struct bootnode *fake_nodes, int num_nodes)
 			    fake_apicid_to_node[j] == NUMA_NO_NODE)
 				fake_apicid_to_node[j] = i;
 	}
+
+	/*
+	 * If there are apicid-to-node mappings for physical nodes that do not
+	 * have a corresponding emulated node, it should default to a guaranteed
+	 * value.
+	 */
+	for (i = 0; i < MAX_LOCAL_APIC; i++)
+		if (apicid_to_node[i] != NUMA_NO_NODE &&
+		    fake_apicid_to_node[i] == NUMA_NO_NODE)
+			fake_apicid_to_node[i] = 0;
+
 	for (i = 0; i < num_nodes; i++)
 		__acpi_map_pxm_to_node(fake_node_to_pxm_map[i], i);
 	memcpy(apicid_to_node, fake_apicid_to_node, sizeof(apicid_to_node));
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ