Message-ID: <1488784465-58402-2-git-send-email-tanxiaojun@huawei.com>
Date:   Mon, 6 Mar 2017 15:14:22 +0800
From:   Tan Xiaojun <tanxiaojun@...wei.com>
To:     <alexander.levin@...izon.com>, <peterz@...radead.org>,
        <wangnan0@...wei.com>, <guohanjun@...wei.com>
CC:     <linuxarm@...wei.com>, <linux-kernel@...r.kernel.org>
Subject: [PATCH -stable 4.1 1/4] perf/core: Fix dynamic interrupt throttle

From: Peter Zijlstra <peterz@...radead.org>

There were two problems with the dynamic interrupt throttle mechanism,
both triggered by the same action.

When you (or perf_fuzzer) write a huge value into
/proc/sys/kernel/perf_event_max_sample_rate the computed
perf_sample_allowed_ns becomes 0. This effectively disables the whole
dynamic throttle.
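To see how the limit collapses to 0, here is a minimal user-space
sketch of the old update_perf_cpu_limits() arithmetic (an illustration
only, not kernel code; the constants mirror NSEC_PER_SEC and the
default perf_cpu_time_max_percent of 25, and the huge sysctl value is
a made-up input):

  #include <stdio.h>
  #include <stdint.h>

  #define NSEC_PER_SEC 1000000000ULL

  int main(void)
  {
          uint64_t max_sample_rate = 4000000000ULL;            /* huge sysctl value */
          uint64_t period_ns = NSEC_PER_SEC / max_sample_rate; /* truncates to 0 */
          uint64_t allowed_ns = period_ns * 25 / 100;          /* 25% CPU default */

          /* allowed_ns == 0 is read by perf_sample_event_took() as
           * "throttle disabled", so the safety net silently goes away. */
          printf("period_ns=%llu allowed_ns=%llu\n",
                 (unsigned long long)period_ns,
                 (unsigned long long)allowed_ns);
          return 0;
  }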

This is fixed by ensuring update_perf_cpu_limits() never sets the
value to 0. However, the dynamic throttle can still be disabled
entirely by writing 100 to /proc/sys/kernel/perf_cpu_time_max_percent;
doing so generates a warning in dmesg.

The second problem is that by setting the max_sample_rate to a huge
number, the adaptive process can take many iterations to converge,
since it halves the limit each time. Change that to directly compute a
new value based on the observed duration.
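For illustration, a rough user-space sketch of the two strategies (the
HZ=1000 tick, the numeric inputs, and the loop's stop condition are
assumptions made for the example; the real logic is in the hunks
below):

  #include <stdio.h>
  #include <stdint.h>

  #define TICK_NSEC 1000000ULL	/* assumes HZ=1000, i.e. a 1 ms tick */

  int main(void)
  {
          uint64_t avg_len = 50000;	/* observed: 50 us per interrupt */
          uint32_t percent = 25;	/* perf_cpu_time_max_percent */
          uint32_t old = 100000;	/* absurdly large max_samples_per_tick */
          int halvings = 0;

          /* Old behaviour: each overrun halves the limit, one step at a time. */
          while (old > 1 && (uint64_t)old * avg_len > TICK_NSEC * percent / 100) {
                  old = (old + 1) / 2;	/* mirrors DIV_ROUND_UP(x, 2) */
                  halvings++;
          }

          /* New behaviour: derive the limit directly from the observed
           * duration, padded by 25% as in the patch below. */
          uint64_t padded = avg_len + avg_len / 4;
          uint64_t budget = (TICK_NSEC / 100) * percent;
          uint32_t direct = padded < budget ? (uint32_t)(budget / padded) : 1;

          printf("old path: %u after %d halvings; new path: %u in one step\n",
                 old, halvings, direct);
          return 0;
  }

With these inputs the old path needs 15 separate overruns to halve its
way down to 4 samples per tick, while the new path reaches the same
limit in a single step.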

Signed-off-by: Peter Zijlstra (Intel) <peterz@...radead.org>
Cc: Alexander Shishkin <alexander.shishkin@...ux.intel.com>
Cc: Andy Lutomirski <luto@...capital.net>
Cc: Arnaldo Carvalho de Melo <acme@...hat.com>
Cc: Borislav Petkov <bp@...en8.de>
Cc: Brian Gerst <brgerst@...il.com>
Cc: David Ahern <dsahern@...il.com>
Cc: Denys Vlasenko <dvlasenk@...hat.com>
Cc: H. Peter Anvin <hpa@...or.com>
Cc: Jiri Olsa <jolsa@...hat.com>
Cc: Linus Torvalds <torvalds@...ux-foundation.org>
Cc: Namhyung Kim <namhyung@...nel.org>
Cc: Peter Zijlstra <peterz@...radead.org>
Cc: Stephane Eranian <eranian@...gle.com>
Cc: Thomas Gleixner <tglx@...utronix.de>
Cc: Vince Weaver <vincent.weaver@...ne.edu>
Signed-off-by: Ingo Molnar <mingo@...nel.org>
Signed-off-by: Tan Xiaojun <tanxiaojun@...wei.com>
---
 kernel/events/core.c | 87 ++++++++++++++++++++++++++++++----------------------
 1 file changed, 51 insertions(+), 36 deletions(-)

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 6da64f0..3089a004 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -198,8 +198,11 @@ void update_perf_cpu_limits(void)
 	u64 tmp = perf_sample_period_ns;
 
 	tmp *= sysctl_perf_cpu_time_max_percent;
-	do_div(tmp, 100);
-	ACCESS_ONCE(perf_sample_allowed_ns) = tmp;
+	tmp = div_u64(tmp, 100);
+	if (!tmp)
+		tmp = 1;
+
+	WRITE_ONCE(perf_sample_allowed_ns, tmp);
 }
 
 static int perf_rotate_context(struct perf_cpu_context *cpuctx);
@@ -231,7 +234,13 @@ int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write,
 	if (ret || !write)
 		return ret;
 
-	update_perf_cpu_limits();
+	if (sysctl_perf_cpu_time_max_percent == 100) {
+		printk(KERN_WARNING
+		       "perf: Dynamic interrupt throttling disabled, can hang your system!\n");
+		WRITE_ONCE(perf_sample_allowed_ns, 0);
+	} else {
+		update_perf_cpu_limits();
+	}
 
 	return 0;
 }
@@ -245,62 +254,68 @@ int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write,
 #define NR_ACCUMULATED_SAMPLES 128
 static DEFINE_PER_CPU(u64, running_sample_length);
 
+static u64 __report_avg;
+static u64 __report_allowed;
+
 static void perf_duration_warn(struct irq_work *w)
 {
-	u64 allowed_ns = ACCESS_ONCE(perf_sample_allowed_ns);
-	u64 avg_local_sample_len;
-	u64 local_samples_len;
-
-	local_samples_len = __this_cpu_read(running_sample_length);
-	avg_local_sample_len = local_samples_len/NR_ACCUMULATED_SAMPLES;
-
 	printk_ratelimited(KERN_WARNING
-			"perf interrupt took too long (%lld > %lld), lowering "
-			"kernel.perf_event_max_sample_rate to %d\n",
-			avg_local_sample_len, allowed_ns >> 1,
-			sysctl_perf_event_sample_rate);
+		"perf: interrupt took too long (%lld > %lld), lowering "
+		"kernel.perf_event_max_sample_rate to %d\n",
+		__report_avg, __report_allowed,
+		sysctl_perf_event_sample_rate);
 }
 
 static DEFINE_IRQ_WORK(perf_duration_work, perf_duration_warn);
 
 void perf_sample_event_took(u64 sample_len_ns)
 {
-	u64 allowed_ns = ACCESS_ONCE(perf_sample_allowed_ns);
-	u64 avg_local_sample_len;
-	u64 local_samples_len;
+	u64 max_len = READ_ONCE(perf_sample_allowed_ns);
+	u64 running_len;
+	u64 avg_len;
+	u32 max;
 
-	if (allowed_ns == 0)
+	if (max_len == 0)
 		return;
 
-	/* decay the counter by 1 average sample */
-	local_samples_len = __this_cpu_read(running_sample_length);
-	local_samples_len -= local_samples_len/NR_ACCUMULATED_SAMPLES;
-	local_samples_len += sample_len_ns;
-	__this_cpu_write(running_sample_length, local_samples_len);
+	/* Decay the counter by 1 average sample. */
+	running_len = __this_cpu_read(running_sample_length);
+	running_len -= running_len/NR_ACCUMULATED_SAMPLES;
+	running_len += sample_len_ns;
+	__this_cpu_write(running_sample_length, running_len);
 
 	/*
-	 * note: this will be biased artifically low until we have
-	 * seen NR_ACCUMULATED_SAMPLES.  Doing it this way keeps us
+	 * Note: this will be biased artifically low until we have
+	 * seen NR_ACCUMULATED_SAMPLES. Doing it this way keeps us
 	 * from having to maintain a count.
 	 */
-	avg_local_sample_len = local_samples_len/NR_ACCUMULATED_SAMPLES;
-
-	if (avg_local_sample_len <= allowed_ns)
+	avg_len = running_len/NR_ACCUMULATED_SAMPLES;
+	if (avg_len <= max_len)
 		return;
 
-	if (max_samples_per_tick <= 1)
-		return;
+	__report_avg = avg_len;
+	__report_allowed = max_len;
 
-	max_samples_per_tick = DIV_ROUND_UP(max_samples_per_tick, 2);
-	sysctl_perf_event_sample_rate = max_samples_per_tick * HZ;
-	perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
+	/*
+	 * Compute a throttle threshold 25% below the current duration.
+	 */
+	avg_len += avg_len / 4;
+	max = (TICK_NSEC / 100) * sysctl_perf_cpu_time_max_percent;
+	if (avg_len < max)
+		max /= (u32)avg_len;
+	else
+		max = 1;
 
-	update_perf_cpu_limits();
+	WRITE_ONCE(perf_sample_allowed_ns, avg_len);
+	WRITE_ONCE(max_samples_per_tick, max);
+
+	sysctl_perf_event_sample_rate = max * HZ;
+	perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
 
 	if (!irq_work_queue(&perf_duration_work)) {
-		early_printk("perf interrupt took too long (%lld > %lld), lowering "
+		early_printk("perf: interrupt took too long (%lld > %lld), lowering "
 			     "kernel.perf_event_max_sample_rate to %d\n",
-			     avg_local_sample_len, allowed_ns >> 1,
+			     __report_avg, __report_allowed,
 			     sysctl_perf_event_sample_rate);
 	}
 }
-- 
1.9.1
