lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <20070426195348.6a4e5652.kamezawa.hiroyu@jp.fujitsu.com>
Date:	Thu, 26 Apr 2007 19:53:48 +0900
From:	KAMEZAWA Hiroyuki <kamezawa.hiroyu@...fujitsu.com>
To:	KAMEZAWA Hiroyuki <kamezawa.hiroyu@...fujitsu.com>
Cc:	ak@...e.de, linux-kernel@...r.kernel.org, linux-mm@...ck.org,
	akpm@...ux-foundation.org, clameter@....com
Subject: [PATCH] change global zonelist order on NUMA v3

Changelog V2 -> V3

- removed zone ordering selection knobs...

much simpler one. just changing zonelist ordering.
tested on ia64 NUMA works well as expected.

-Kame


change zonelist order on NUMA v3.

[Description]
Assume 2 node NUMA, only node(0) has ZONE_DMA.
(ia64's ZONE_DMA is below 4GB...x86_64's ZONE_DMA32)

In this case, current default (node0's) zonelist order is

Node(0)'s NORMAL -> Node(0)'s DMA -> Node(1)"s NORMAL.

This means Node(0)'s DMA will be used before Node(1)'s NORMAL.
This will cause OOM on ZONE_DMA easily.

This patch changes *default* zone order to

Node(0)'s NORMAL -> Node(1)'s NORMAL -> Node(0)'s DMA.

tested ia64 2-Node NUMA. works well.

Signed-Off-By: KAMEZAWA Hiroyuki <kamezawa.hiroyu@...fujitsu.com>

Index: linux-2.6.21-rc7-mm2/mm/page_alloc.c
===================================================================
--- linux-2.6.21-rc7-mm2.orig/mm/page_alloc.c
+++ linux-2.6.21-rc7-mm2/mm/page_alloc.c
@@ -2023,6 +2023,7 @@ void show_free_areas(void)
  *
  * Add all populated zones of a node to the zonelist.
  */
+#ifndef CONFIG_NUMA
 static int __meminit build_zonelists_node(pg_data_t *pgdat,
 			struct zonelist *zonelist, int nr_zones, enum zone_type zone_type)
 {
@@ -2042,6 +2043,7 @@ static int __meminit build_zonelists_nod
 	} while (zone_type);
 	return nr_zones;
 }
+#endif
 
 #ifdef CONFIG_NUMA
 #define MAX_NODE_LOAD (num_online_nodes())
@@ -2106,52 +2108,51 @@ static int __meminit find_next_best_node
 	return best_node;
 }
 
+/*
+ * Build zonelist based on zone priority.
+ */
+static int __meminitdata node_order[MAX_NUMNODES];
 static void __meminit build_zonelists(pg_data_t *pgdat)
 {
-	int j, node, local_node;
-	enum zone_type i;
-	int prev_node, load;
-	struct zonelist *zonelist;
+	int i, j, pos, zone_type, node, load;
 	nodemask_t used_mask;
+	int local_node, prev_node;
+	struct zone *z;
+	struct zonelist *zonelist;
 
-	/* initialize zonelists */
 	for (i = 0; i < MAX_NR_ZONES; i++) {
 		zonelist = pgdat->node_zonelists + i;
 		zonelist->zones[0] = NULL;
 	}
-
-	/* NUMA-aware ordering of nodes */
+	memset(node_order, 0, sizeof(node_order));
 	local_node = pgdat->node_id;
 	load = num_online_nodes();
 	prev_node = local_node;
 	nodes_clear(used_mask);
+	j = 0;
 	while ((node = find_next_best_node(local_node, &used_mask)) >= 0) {
 		int distance = node_distance(local_node, node);
-
-		/*
-		 * If another node is sufficiently far away then it is better
-		 * to reclaim pages in a zone before going off node.
-		 */
 		if (distance > RECLAIM_DISTANCE)
 			zone_reclaim_mode = 1;
-
-		/*
-		 * We don't want to pressure a particular node.
-		 * So adding penalty to the first node in same
-		 * distance group to make it round-robin.
-		 */
-
 		if (distance != node_distance(local_node, prev_node))
-			node_load[node] += load;
+			node_load[node] = load;
+		node_order[j++] = node;
 		prev_node = node;
 		load--;
-		for (i = 0; i < MAX_NR_ZONES; i++) {
-			zonelist = pgdat->node_zonelists + i;
-			for (j = 0; zonelist->zones[j] != NULL; j++);
-
-	 		j = build_zonelists_node(NODE_DATA(node), zonelist, j, i);
-			zonelist->zones[j] = NULL;
+	}
+	/* calculate node order */
+	for (i = 0; i < MAX_NR_ZONES; i++) {
+		zonelist = pgdat->node_zonelists + i;
+		pos = 0;
+		for (zone_type = i; zone_type >= 0; zone_type--) {
+			for (j = 0; j < num_online_nodes(); j++) {
+				node = node_order[j];
+				z = &NODE_DATA(node)->node_zones[zone_type];
+				if (populated_zone(z))
+					zonelist->zones[pos++] = z;
+			}
 		}
+		zonelist->zones[pos] = NULL;
 	}
 }
 
@@ -2239,6 +2240,7 @@ void __meminit build_all_zonelists(void)
 		__build_all_zonelists(NULL);
 		cpuset_init_current_mems_allowed();
 	} else {
+		memset(node_load, 0, sizeof(node_load));
 		/* we have to stop all cpus to guaranntee there is no user
 		   of zonelist */
 		stop_machine_run(__build_all_zonelists, NULL, NR_CPUS);

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ