[<prev] [next>] [thread-next>] [day] [month] [year] [list]
Message-Id: <1164245696.29844.155.camel@galaxy.corp.google.com>
Date: Wed, 22 Nov 2006 17:34:55 -0800
From: Rohit Seth <rohitseth@...gle.com>
To: Andi Kleen <ak@...e.de>
Cc: linux-kernel <linux-kernel@...r.kernel.org>,
Mel Gorman <mel@....ul.ie>,
David Rientjes <rientjes@...washington.edu>,
Paul Menage <menage@...gle.com>, Andrew Morton <akpm@...l.org>
Subject: [Patch4/4]: fake numa for x86_64 patches
This patch extends the numa=fake kernel command line option so that the
user can specify nodes of different sizes on the command line.
Signed-off-by: David Rientjes <reintjes@...gle.com>
Signed-off-by: Paul Menage <menage@...gle.com>
Signed-off-by: Rohit Seth <rohitseth@...gle.com>
--- linux-2.6.19-rc5-mm2.org/arch/x86_64/mm/numa.c 2006-11-22 16:50:34.000000000 -0800
+++ linux-2.6.19-rc5-mm2/arch/x86_64/mm/numa.c 2006-11-22 16:51:02.000000000 -0800
@@ -39,6 +39,10 @@ cpumask_t node_to_cpumask[MAX_NUMNODES]
int numa_off __initdata;
struct bootnode nodes[MAX_NUMNODES] __initdata;
+#ifdef CONFIG_NUMA_EMU
+char fake_numa[32] __initdata;
+#endif
+
void __init populate_physnode_map(struct bootnode *nodes, int numnodes)
{
int i;
@@ -287,6 +291,22 @@ static int split_nodes_equal(struct boot
}
/*
+ * Splits the remaining system RAM into chunks of sz megabytes each (after
+ * masking with NODE_HASH_MASK). Any memory left over is always assigned to a
+ * final node, which can be of a different size. Returns the number of nodes split.
+ */
+static int split_nodes_size(struct bootnode *nodes, u64 *addr, u64 max_addr,
+ int node_start, u64 sz)
+{
+ int i = node_start;
+ sz = (sz << 20) & NODE_HASH_MASK;
+ while (!setup_node_range(i++, nodes, addr, sz, max_addr))
+ ;
+ return i - node_start;
+}
+
+
+/*
* Sets up the system RAM area from start_pfn to end_pfn according to the
* numa=fake command line.
*/
@@ -294,17 +314,86 @@ static int numa_emulation(unsigned long
{
u64 addr = start_pfn << PAGE_SHIFT;
u64 max_addr = end_pfn << PAGE_SHIFT;
+ u64 sz;
int num_nodes;
+ int coeff_flag;
+ int coeff = -1;
+ int num;
int i;
+ char *temp = fake_numa;
memset(&nodes, 0, sizeof(nodes));
/*
* If the numa=fake command line is just a single number N, split the
* system RAM into N fake nodes.
*/
- num_nodes = split_nodes_equal(nodes, &addr, max_addr, 0, numa_fake);
- if (num_nodes < 0)
- return num_nodes;
+ if (!strchr(temp, '*')) {
+ num_nodes = split_nodes_equal(nodes, &addr, max_addr, 0,
+ simple_strtol(temp, NULL, 0));
+ if (num_nodes < 0)
+ return num_nodes;
+ goto out;
+ }
+
+ /* Parse the command line */
+ for (coeff_flag = num = num_nodes = 0; ; temp++) {
+ if (*temp && isdigit(*temp)) {
+ num = num * 10 + *temp - '0';
+ continue;
+ } else if (*temp == '*') {
+ if (num > 0)
+ coeff = num;
+ coeff_flag = 1;
+ } else if (!*temp) {
+ if (!coeff_flag)
+ coeff = 1;
+ /*
+ * Round down to the nearest 4MB for hash function.
+ * Command line sizes are in megabytes.
+ */
+ sz = ((u64)num << 20) & NODE_HASH_MASK;
+ if (sz)
+ for (i = 0; i < coeff; i++, num_nodes++)
+ if (setup_node_range(num_nodes, nodes,
+ &addr, sz, max_addr) < 0)
+ goto done;
+ if (!*temp)
+ break;
+ coeff = -1;
+ coeff_flag = 0;
+ }
+ num = 0;
+ }
+done:
+ if (!num_nodes)
+ return -1;
+ /* Fill remaining system RAM */
+ if (addr < max_addr) {
+ if (coeff_flag && coeff < 0) {
+ /* Split remaining RAM into num-sized chunks */
+ num_nodes += split_nodes_size(nodes, &addr, max_addr,
+ num_nodes, num);
+ goto out;
+ }
+ switch (*(temp - 1)) {
+ case '*':
+ /* Split remaining RAM into coeff chunks */
+ if (coeff <= 0)
+ break;
+ num_nodes += split_nodes_equal(nodes, &addr, max_addr,
+ num_nodes, coeff);
+ break;
+ case ',':
+ /* Do not allocate remaining system RAM */
+ break;
+ default:
+ /* Give one final node */
+ setup_node_range(num_nodes, nodes, &addr,
+ max_addr - addr, max_addr);
+ num_nodes++;
+ }
+ }
+out:
populate_physnode_map(nodes, num_nodes);
for_each_online_node(i) {
e820_register_active_regions(i, nodes[i].start >> PAGE_SHIFT,
@@ -410,15 +499,18 @@ void __init paging_init(void)
static __init int numa_setup(char *opt)
{
+#ifdef CONFIG_NUMA_EMU
+ char *t;
+#endif
if (!opt)
return -EINVAL;
if (!strncmp(opt,"off",3))
numa_off = 1;
#ifdef CONFIG_NUMA_EMU
if(!strncmp(opt, "fake=", 5)) {
- numa_fake = simple_strtoul(opt+5,NULL,0);
- if (numa_fake >= MAX_NUMNODES)
- numa_fake = MAX_NUMNODES;
+ numa_fake = 1;
+ t = strchr(opt, ' ');
+ strlcpy(fake_numa, opt+5, (t-opt-5)+1);
}
#endif
#ifdef CONFIG_ACPI_NUMA
--- linux-2.6.19-rc5-mm2.org/Documentation/x86_64/boot-options.txt 2006-11-22 12:20:54.000000000 -0800
+++ linux-2.6.19-rc5-mm2/Documentation/x86_64/boot-options.txt 2006-11-20 11:44:05.000000000 -0800
@@ -149,7 +149,17 @@ NUMA
numa=noacpi Don't parse the SRAT table for NUMA setup
- numa=fake=X Fake X nodes and ignore NUMA setup of the actual machine.
+ numa=fake=<cmdline>
+ If a number, fakes <cmdline> nodes and ignores NUMA setup of
+ the actual machine. Otherwise, system memory is configured
+ depending on the sizes and coefficients listed. For example:
+ numa=fake=2*512,1024,4*256,*128
+ gives two 512M nodes, a 1024M node, four 256M nodes, and the
+ rest split into 128M chunks. If the last character of
+ <cmdline> is a *, the remaining memory is divided equally
+ into as many nodes as the coefficient preceding the *:
+ numa=fake=2*512,2*
+ gives two 512M nodes and the rest split into two nodes.
numa=hotadd=percent
Only allow hotadd memory to preallocate page structures upto
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
Powered by blists - more mailing lists