Message-Id: <1502786736-21585-3-git-send-email-kemi.wang@intel.com>
Date:   Tue, 15 Aug 2017 16:45:36 +0800
From:   Kemi Wang <kemi.wang@...el.com>
To:     Andrew Morton <akpm@...ux-foundation.org>,
        Michal Hocko <mhocko@...e.com>,
        Mel Gorman <mgorman@...hsingularity.net>,
        Johannes Weiner <hannes@...xchg.org>
Cc:     Dave <dave.hansen@...ux.intel.com>,
        Andi Kleen <andi.kleen@...el.com>,
        Jesper Dangaard Brouer <brouer@...hat.com>,
        Ying Huang <ying.huang@...el.com>,
        Aaron Lu <aaron.lu@...el.com>, Tim Chen <tim.c.chen@...el.com>,
        Linux MM <linux-mm@...ck.org>,
        Linux Kernel <linux-kernel@...r.kernel.org>,
        Kemi Wang <kemi.wang@...el.com>
Subject: [PATCH 2/2] mm: Update NUMA counter threshold size

There is significant overhead from cache bouncing caused by updating the
zone counters (NUMA-associated counters) in parallel during
multi-threaded page allocation (suggested by Dave Hansen).

This patch updates the NUMA counter threshold to a fixed size of 32765,
since a small threshold greatly increases the frequency at which the
global counter is updated from the local per-CPU counters. To eliminate
the resulting deviation, the per-CPU NUMA item counts
(vm_numa_stat_diff[]) are added to zone->vm_numa_stat[] when a user
*reads* the value of a NUMA counter (suggested by Ying Huang).

The rationale is that these statistics counters don't need to be read
often, unlike other VM counters, so it is acceptable to use a large
threshold and make readers more expensive.
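
Continuing the sketch above, the read side pays for that choice by
walking every CPU's pending diff (this mirrors the
zone_numa_state_snapshot() introduced below):

	/* Read side: sum every CPU's pending diff on top of the
	 * global counter to get an accurate snapshot. */
	static long demo_read(void)
	{
		long x = atomic_load(&demo_global);
		int cpu;

		for (cpu = 0; cpu < NCPUS; cpu++)
			x += demo_diff[cpu];

		return x;
	}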

With this patchset, we see a 26.6% drop in CPU cycles (537 --> 394) per
single-page allocation and reclaim on Jesper's page_bench03 benchmark.

Benchmark provided by Jesper Dangaard Brouer (loop count increased to 10000000):
https://github.com/netoptimizer/prototype-kernel/tree/master/kernel/mm/bench

 Threshold   CPU cycles    Throughput (88 threads)
     32          799         241760478
     64          640         301628829
     125         537         358906028 <==> system by default (base)
     256         468         412397590
     512         428         450550704
     4096        399         482520943
     20000       394         489009617
     30000       395         488017817
     32765       394(-26.6%) 488932078(+36.2%) <==> with this patchset
     N/A         342(-36.3%) 562900157(+56.8%) <==> disable zone_statistics
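
The bracketed percentages are relative to the default threshold of 125:
(537 - 394) / 537 ~= 26.6% fewer cycles, and
(488932078 - 358906028) / 358906028 ~= 36.2% more throughput.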

Signed-off-by: Kemi Wang <kemi.wang@...el.com>
Suggested-by: Dave Hansen <dave.hansen@...el.com>
Suggested-by: Ying Huang <ying.huang@...el.com>
---
 include/linux/mmzone.h |  4 ++--
 include/linux/vmstat.h |  6 +++++-
 mm/vmstat.c            | 23 ++++++++++-------------
 3 files changed, 17 insertions(+), 16 deletions(-)

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 0b11ba7..7eaf0e8 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -282,8 +282,8 @@ struct per_cpu_pageset {
 	struct per_cpu_pages pcp;
 #ifdef CONFIG_NUMA
 	s8 expire;
-	s8 numa_stat_threshold;
-	s8 vm_numa_stat_diff[NR_VM_ZONE_NUMA_STAT_ITEMS];
+	s16 numa_stat_threshold;
+	s16 vm_numa_stat_diff[NR_VM_ZONE_NUMA_STAT_ITEMS];
 #endif
 #ifdef CONFIG_SMP
 	s8 stat_threshold;
diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index 1e19379..d97cc34 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -125,10 +125,14 @@ static inline unsigned long global_numa_state(enum zone_numa_stat_item item)
 	return x;
 }
 
-static inline unsigned long zone_numa_state(struct zone *zone,
+static inline unsigned long zone_numa_state_snapshot(struct zone *zone,
 					enum zone_numa_stat_item item)
 {
 	long x = atomic_long_read(&zone->vm_numa_stat[item]);
+	int cpu;
+
+	for_each_online_cpu(cpu)
+		x += per_cpu_ptr(zone->pageset, cpu)->vm_numa_stat_diff[item];
 
 	return x;
 }
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 5a7fa30..c7f50ed 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -30,6 +30,8 @@
 
 #include "internal.h"
 
+#define NUMA_STAT_THRESHOLD  32765
+
 #ifdef CONFIG_VM_EVENT_COUNTERS
 DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}};
 EXPORT_PER_CPU_SYMBOL(vm_event_states);
@@ -196,7 +198,7 @@ void refresh_zone_stat_thresholds(void)
 							= threshold;
 #ifdef CONFIG_NUMA
 			per_cpu_ptr(zone->pageset, cpu)->numa_stat_threshold
-							= threshold;
+							= NUMA_STAT_THRESHOLD;
 #endif
 			/* Base nodestat threshold on the largest populated zone. */
 			pgdat_threshold = per_cpu_ptr(pgdat->per_cpu_nodestats, cpu)->stat_threshold;
@@ -231,14 +233,9 @@ void set_pgdat_percpu_threshold(pg_data_t *pgdat,
 			continue;
 
 		threshold = (*calculate_pressure)(zone);
-		for_each_online_cpu(cpu) {
+		for_each_online_cpu(cpu)
 			per_cpu_ptr(zone->pageset, cpu)->stat_threshold
 							= threshold;
-#ifdef CONFIG_NUMA
-			per_cpu_ptr(zone->pageset, cpu)->numa_stat_threshold
-							= threshold;
-#endif
-		}
 	}
 }
 
@@ -872,13 +869,13 @@ void __inc_zone_numa_state(struct zone *zone,
 				 enum zone_numa_stat_item item)
 {
 	struct per_cpu_pageset __percpu *pcp = zone->pageset;
-	s8 __percpu *p = pcp->vm_numa_stat_diff + item;
-	s8 v, t;
+	s16 __percpu *p = pcp->vm_numa_stat_diff + item;
+	s16 v, t;
 
 	v = __this_cpu_inc_return(*p);
 	t = __this_cpu_read(pcp->numa_stat_threshold);
 	if (unlikely(v > t)) {
-		s8 overstep = t >> 1;
+		s16 overstep = t >> 1;
 
 		zone_numa_state_add(v + overstep, zone, item);
 		__this_cpu_write(*p, -overstep);
@@ -914,7 +911,7 @@ unsigned long sum_zone_node_numa_state(int node,
 	unsigned long count = 0;
 
 	for (i = 0; i < MAX_NR_ZONES; i++)
-		count += zone_numa_state(zones + i, item);
+		count += zone_numa_state_snapshot(zones + i, item);
 
 	return count;
 }
@@ -1536,7 +1533,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
 	for (i = 0; i < NR_VM_ZONE_NUMA_STAT_ITEMS; i++)
 		seq_printf(m, "\n      %-12s %lu",
 				vmstat_text[i + NR_VM_ZONE_STAT_ITEMS],
-				zone_numa_state(zone, i));
+				zone_numa_state_snapshot(zone, i));
 #endif
 
 	seq_printf(m, "\n  pagesets");
@@ -1795,7 +1792,7 @@ static bool need_update(int cpu)
 
 		BUILD_BUG_ON(sizeof(p->vm_stat_diff[0]) != 1);
 #ifdef CONFIG_NUMA
-		BUILD_BUG_ON(sizeof(p->vm_numa_stat_diff[0]) != 1);
+		BUILD_BUG_ON(sizeof(p->vm_numa_stat_diff[0]) != 2);
 #endif
 		/*
 		 * The fast way of checking if there are any vmstat diffs.
-- 
2.7.4
