lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Date:	Wed, 27 Apr 2016 11:31:11 -0300
From:	Arnaldo Carvalho de Melo <acme@...nel.org>
To:	Ingo Molnar <mingo@...nel.org>
Cc:	linux-kernel@...r.kernel.org,
	Arnaldo Carvalho de Melo <acme@...hat.com>,
	Adrian Hunter <adrian.hunter@...el.com>,
	Alexander Shishkin <alexander.shishkin@...ux.intel.com>,
	He Kuang <hekuang@...wei.com>, Jiri Olsa <jolsa@...hat.com>,
	Linus Torvalds <torvalds@...ux-foundation.org>,
	Masami Hiramatsu <mhiramat@...nel.org>,
	Milian Wolff <milian.wolff@...b.com>,
	Namhyung Kim <namhyung@...nel.org>,
	Peter Zijlstra <peterz@...radead.org>,
	Stephane Eranian <eranian@...gle.com>,
	Thomas Gleixner <tglx@...utronix.de>,
	Vince Weaver <vincent.weaver@...ne.edu>,
	Wang Nan <wangnan0@...wei.com>, Zefan Li <lizefan@...wei.com>
Subject: [PATCH 29/30] perf core: Allow setting up max frame stack depth via sysctl

From: Arnaldo Carvalho de Melo <acme@...hat.com>

The default remains 127, which is good for most cases, and not even hit
most of the time, but then for some cases, as reported by Brendan, 1024+
deep frames are appearing on the radar for things like groovy, ruby.

And in some workloads putting a _lower_ cap on this may make sense. One
that is per event still needs to be put in place tho.

The new file is:

  # cat /proc/sys/kernel/perf_event_max_stack
  127

Chaging it:

  # echo 256 > /proc/sys/kernel/perf_event_max_stack
  # cat /proc/sys/kernel/perf_event_max_stack
  256

But as soon as there is some event using callchains we get:

  # echo 512 > /proc/sys/kernel/perf_event_max_stack
  -bash: echo: write error: Device or resource busy
  #

Because we only allocate the callchain percpu data structures when there
is a user, which allows for changing the max easily, its just a matter
of having no callchain users at that point.

Reported-and-Tested-by: Brendan Gregg <brendan.d.gregg@...il.com>
Reviewed-by: Frederic Weisbecker <fweisbec@...il.com>
Acked-by: Alexei Starovoitov <ast@...nel.org>
Acked-by: David Ahern <dsahern@...il.com>
Cc: Adrian Hunter <adrian.hunter@...el.com>
Cc: Alexander Shishkin <alexander.shishkin@...ux.intel.com>
Cc: He Kuang <hekuang@...wei.com>
Cc: Jiri Olsa <jolsa@...hat.com>
Cc: Linus Torvalds <torvalds@...ux-foundation.org>
Cc: Masami Hiramatsu <mhiramat@...nel.org>
Cc: Milian Wolff <milian.wolff@...b.com>
Cc: Namhyung Kim <namhyung@...nel.org>
Cc: Peter Zijlstra <peterz@...radead.org>
Cc: Stephane Eranian <eranian@...gle.com>
Cc: Thomas Gleixner <tglx@...utronix.de>
Cc: Vince Weaver <vincent.weaver@...ne.edu>
Cc: Wang Nan <wangnan0@...wei.com>
Cc: Zefan Li <lizefan@...wei.com>
Link: http://lkml.kernel.org/r/20160426002928.GB16708@kernel.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@...hat.com>
---
 Documentation/sysctl/kernel.txt    | 14 ++++++++++++++
 arch/arm/kernel/perf_callchain.c   |  2 +-
 arch/arm64/kernel/perf_callchain.c |  4 ++--
 arch/metag/kernel/perf_callchain.c |  2 +-
 arch/mips/kernel/perf_event.c      |  4 ++--
 arch/powerpc/perf/callchain.c      |  4 ++--
 arch/sparc/kernel/perf_event.c     |  6 +++---
 arch/x86/events/core.c             |  4 ++--
 arch/xtensa/kernel/perf_event.c    |  4 ++--
 include/linux/perf_event.h         |  8 ++++++--
 kernel/bpf/stackmap.c              |  8 ++++----
 kernel/events/callchain.c          | 35 +++++++++++++++++++++++++++++++++--
 kernel/sysctl.c                    | 12 ++++++++++++
 13 files changed, 84 insertions(+), 23 deletions(-)

diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt
index 57653a44b128..260cde08e92e 100644
--- a/Documentation/sysctl/kernel.txt
+++ b/Documentation/sysctl/kernel.txt
@@ -60,6 +60,7 @@ show up in /proc/sys/kernel:
 - panic_on_warn
 - perf_cpu_time_max_percent
 - perf_event_paranoid
+- perf_event_max_stack
 - pid_max
 - powersave-nap               [ PPC only ]
 - printk
@@ -654,6 +655,19 @@ users (without CAP_SYS_ADMIN).  The default value is 1.
 
 ==============================================================
 
+perf_event_max_stack:
+
+Controls maximum number of stack frames to copy for (attr.sample_type &
+PERF_SAMPLE_CALLCHAIN) configured events, for instance, when using
+'perf record -g' or 'perf trace --call-graph fp'.
+
+This can only be done when no events are in use that have callchains
+enabled, otherwise writing to this file will return -EBUSY.
+
+The default value is 127.
+
+==============================================================
+
 pid_max:
 
 PID allocation wrap value.  When the kernel's next PID value
diff --git a/arch/arm/kernel/perf_callchain.c b/arch/arm/kernel/perf_callchain.c
index 4e02ae5950ff..27563befa8a2 100644
--- a/arch/arm/kernel/perf_callchain.c
+++ b/arch/arm/kernel/perf_callchain.c
@@ -75,7 +75,7 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
 
 	tail = (struct frame_tail __user *)regs->ARM_fp - 1;
 
-	while ((entry->nr < PERF_MAX_STACK_DEPTH) &&
+	while ((entry->nr < sysctl_perf_event_max_stack) &&
 	       tail && !((unsigned long)tail & 0x3))
 		tail = user_backtrace(tail, entry);
 }
diff --git a/arch/arm64/kernel/perf_callchain.c b/arch/arm64/kernel/perf_callchain.c
index ff4665462a02..32c3c6e70119 100644
--- a/arch/arm64/kernel/perf_callchain.c
+++ b/arch/arm64/kernel/perf_callchain.c
@@ -122,7 +122,7 @@ void perf_callchain_user(struct perf_callchain_entry *entry,
 
 		tail = (struct frame_tail __user *)regs->regs[29];
 
-		while (entry->nr < PERF_MAX_STACK_DEPTH &&
+		while (entry->nr < sysctl_perf_event_max_stack &&
 		       tail && !((unsigned long)tail & 0xf))
 			tail = user_backtrace(tail, entry);
 	} else {
@@ -132,7 +132,7 @@ void perf_callchain_user(struct perf_callchain_entry *entry,
 
 		tail = (struct compat_frame_tail __user *)regs->compat_fp - 1;
 
-		while ((entry->nr < PERF_MAX_STACK_DEPTH) &&
+		while ((entry->nr < sysctl_perf_event_max_stack) &&
 			tail && !((unsigned long)tail & 0x3))
 			tail = compat_user_backtrace(tail, entry);
 #endif
diff --git a/arch/metag/kernel/perf_callchain.c b/arch/metag/kernel/perf_callchain.c
index 315633461a94..252abc12a5a3 100644
--- a/arch/metag/kernel/perf_callchain.c
+++ b/arch/metag/kernel/perf_callchain.c
@@ -65,7 +65,7 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
 
 	--frame;
 
-	while ((entry->nr < PERF_MAX_STACK_DEPTH) && frame)
+	while ((entry->nr < sysctl_perf_event_max_stack) && frame)
 		frame = user_backtrace(frame, entry);
 }
 
diff --git a/arch/mips/kernel/perf_event.c b/arch/mips/kernel/perf_event.c
index c1cf9c6c3f77..5021c546ad07 100644
--- a/arch/mips/kernel/perf_event.c
+++ b/arch/mips/kernel/perf_event.c
@@ -35,7 +35,7 @@ static void save_raw_perf_callchain(struct perf_callchain_entry *entry,
 		addr = *sp++;
 		if (__kernel_text_address(addr)) {
 			perf_callchain_store(entry, addr);
-			if (entry->nr >= PERF_MAX_STACK_DEPTH)
+			if (entry->nr >= sysctl_perf_event_max_stack)
 				break;
 		}
 	}
@@ -59,7 +59,7 @@ void perf_callchain_kernel(struct perf_callchain_entry *entry,
 	}
 	do {
 		perf_callchain_store(entry, pc);
-		if (entry->nr >= PERF_MAX_STACK_DEPTH)
+		if (entry->nr >= sysctl_perf_event_max_stack)
 			break;
 		pc = unwind_stack(current, &sp, pc, &ra);
 	} while (pc);
diff --git a/arch/powerpc/perf/callchain.c b/arch/powerpc/perf/callchain.c
index e04a6752b399..22d9015c1acc 100644
--- a/arch/powerpc/perf/callchain.c
+++ b/arch/powerpc/perf/callchain.c
@@ -247,7 +247,7 @@ static void perf_callchain_user_64(struct perf_callchain_entry *entry,
 	sp = regs->gpr[1];
 	perf_callchain_store(entry, next_ip);
 
-	while (entry->nr < PERF_MAX_STACK_DEPTH) {
+	while (entry->nr < sysctl_perf_event_max_stack) {
 		fp = (unsigned long __user *) sp;
 		if (!valid_user_sp(sp, 1) || read_user_stack_64(fp, &next_sp))
 			return;
@@ -453,7 +453,7 @@ static void perf_callchain_user_32(struct perf_callchain_entry *entry,
 	sp = regs->gpr[1];
 	perf_callchain_store(entry, next_ip);
 
-	while (entry->nr < PERF_MAX_STACK_DEPTH) {
+	while (entry->nr < sysctl_perf_event_max_stack) {
 		fp = (unsigned int __user *) (unsigned long) sp;
 		if (!valid_user_sp(sp, 0) || read_user_stack_32(fp, &next_sp))
 			return;
diff --git a/arch/sparc/kernel/perf_event.c b/arch/sparc/kernel/perf_event.c
index 6596f66ce112..a4b8b5aed21c 100644
--- a/arch/sparc/kernel/perf_event.c
+++ b/arch/sparc/kernel/perf_event.c
@@ -1756,7 +1756,7 @@ void perf_callchain_kernel(struct perf_callchain_entry *entry,
 			}
 		}
 #endif
-	} while (entry->nr < PERF_MAX_STACK_DEPTH);
+	} while (entry->nr < sysctl_perf_event_max_stack);
 }
 
 static inline int
@@ -1790,7 +1790,7 @@ static void perf_callchain_user_64(struct perf_callchain_entry *entry,
 		pc = sf.callers_pc;
 		ufp = (unsigned long)sf.fp + STACK_BIAS;
 		perf_callchain_store(entry, pc);
-	} while (entry->nr < PERF_MAX_STACK_DEPTH);
+	} while (entry->nr < sysctl_perf_event_max_stack);
 }
 
 static void perf_callchain_user_32(struct perf_callchain_entry *entry,
@@ -1822,7 +1822,7 @@ static void perf_callchain_user_32(struct perf_callchain_entry *entry,
 			ufp = (unsigned long)sf.fp;
 		}
 		perf_callchain_store(entry, pc);
-	} while (entry->nr < PERF_MAX_STACK_DEPTH);
+	} while (entry->nr < sysctl_perf_event_max_stack);
 }
 
 void
diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 041e442a3e28..41d93d0e972b 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -2277,7 +2277,7 @@ perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
 
 	fp = compat_ptr(ss_base + regs->bp);
 	pagefault_disable();
-	while (entry->nr < PERF_MAX_STACK_DEPTH) {
+	while (entry->nr < sysctl_perf_event_max_stack) {
 		unsigned long bytes;
 		frame.next_frame     = 0;
 		frame.return_address = 0;
@@ -2337,7 +2337,7 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
 		return;
 
 	pagefault_disable();
-	while (entry->nr < PERF_MAX_STACK_DEPTH) {
+	while (entry->nr < sysctl_perf_event_max_stack) {
 		unsigned long bytes;
 		frame.next_frame	     = NULL;
 		frame.return_address = 0;
diff --git a/arch/xtensa/kernel/perf_event.c b/arch/xtensa/kernel/perf_event.c
index 54f01188c29c..a6b00b3af429 100644
--- a/arch/xtensa/kernel/perf_event.c
+++ b/arch/xtensa/kernel/perf_event.c
@@ -332,14 +332,14 @@ static int callchain_trace(struct stackframe *frame, void *data)
 void perf_callchain_kernel(struct perf_callchain_entry *entry,
 			   struct pt_regs *regs)
 {
-	xtensa_backtrace_kernel(regs, PERF_MAX_STACK_DEPTH,
+	xtensa_backtrace_kernel(regs, sysctl_perf_event_max_stack,
 				callchain_trace, NULL, entry);
 }
 
 void perf_callchain_user(struct perf_callchain_entry *entry,
 			 struct pt_regs *regs)
 {
-	xtensa_backtrace_user(regs, PERF_MAX_STACK_DEPTH,
+	xtensa_backtrace_user(regs, sysctl_perf_event_max_stack,
 			      callchain_trace, entry);
 }
 
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 85749ae8cb5f..a090700cccca 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -58,7 +58,7 @@ struct perf_guest_info_callbacks {
 
 struct perf_callchain_entry {
 	__u64				nr;
-	__u64				ip[PERF_MAX_STACK_DEPTH];
+	__u64				ip[0]; /* /proc/sys/kernel/perf_event_max_stack */
 };
 
 struct perf_raw_record {
@@ -993,9 +993,11 @@ get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user,
 extern int get_callchain_buffers(void);
 extern void put_callchain_buffers(void);
 
+extern int sysctl_perf_event_max_stack;
+
 static inline int perf_callchain_store(struct perf_callchain_entry *entry, u64 ip)
 {
-	if (entry->nr < PERF_MAX_STACK_DEPTH) {
+	if (entry->nr < sysctl_perf_event_max_stack) {
 		entry->ip[entry->nr++] = ip;
 		return 0;
 	} else {
@@ -1017,6 +1019,8 @@ extern int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write,
 		void __user *buffer, size_t *lenp,
 		loff_t *ppos);
 
+int perf_event_max_stack_handler(struct ctl_table *table, int write,
+				 void __user *buffer, size_t *lenp, loff_t *ppos);
 
 static inline bool perf_paranoid_tracepoint_raw(void)
 {
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index 499d9e933f8e..f5a19548be12 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -66,7 +66,7 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
 	/* check sanity of attributes */
 	if (attr->max_entries == 0 || attr->key_size != 4 ||
 	    value_size < 8 || value_size % 8 ||
-	    value_size / 8 > PERF_MAX_STACK_DEPTH)
+	    value_size / 8 > sysctl_perf_event_max_stack)
 		return ERR_PTR(-EINVAL);
 
 	/* hash table size must be power of 2 */
@@ -124,8 +124,8 @@ static u64 bpf_get_stackid(u64 r1, u64 r2, u64 flags, u64 r4, u64 r5)
 	struct perf_callchain_entry *trace;
 	struct stack_map_bucket *bucket, *new_bucket, *old_bucket;
 	u32 max_depth = map->value_size / 8;
-	/* stack_map_alloc() checks that max_depth <= PERF_MAX_STACK_DEPTH */
-	u32 init_nr = PERF_MAX_STACK_DEPTH - max_depth;
+	/* stack_map_alloc() checks that max_depth <= sysctl_perf_event_max_stack */
+	u32 init_nr = sysctl_perf_event_max_stack - max_depth;
 	u32 skip = flags & BPF_F_SKIP_FIELD_MASK;
 	u32 hash, id, trace_nr, trace_len;
 	bool user = flags & BPF_F_USER_STACK;
@@ -143,7 +143,7 @@ static u64 bpf_get_stackid(u64 r1, u64 r2, u64 flags, u64 r4, u64 r5)
 		return -EFAULT;
 
 	/* get_perf_callchain() guarantees that trace->nr >= init_nr
-	 * and trace-nr <= PERF_MAX_STACK_DEPTH, so trace_nr <= max_depth
+	 * and trace-nr <= sysctl_perf_event_max_stack, so trace_nr <= max_depth
 	 */
 	trace_nr = trace->nr - init_nr;
 
diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
index 343c22f5e867..b9325e7dcba1 100644
--- a/kernel/events/callchain.c
+++ b/kernel/events/callchain.c
@@ -18,6 +18,14 @@ struct callchain_cpus_entries {
 	struct perf_callchain_entry	*cpu_entries[0];
 };
 
+int sysctl_perf_event_max_stack __read_mostly = PERF_MAX_STACK_DEPTH;
+
+static inline size_t perf_callchain_entry__sizeof(void)
+{
+	return (sizeof(struct perf_callchain_entry) +
+		sizeof(__u64) * sysctl_perf_event_max_stack);
+}
+
 static DEFINE_PER_CPU(int, callchain_recursion[PERF_NR_CONTEXTS]);
 static atomic_t nr_callchain_events;
 static DEFINE_MUTEX(callchain_mutex);
@@ -73,7 +81,7 @@ static int alloc_callchain_buffers(void)
 	if (!entries)
 		return -ENOMEM;
 
-	size = sizeof(struct perf_callchain_entry) * PERF_NR_CONTEXTS;
+	size = perf_callchain_entry__sizeof() * PERF_NR_CONTEXTS;
 
 	for_each_possible_cpu(cpu) {
 		entries->cpu_entries[cpu] = kmalloc_node(size, GFP_KERNEL,
@@ -147,7 +155,8 @@ static struct perf_callchain_entry *get_callchain_entry(int *rctx)
 
 	cpu = smp_processor_id();
 
-	return &entries->cpu_entries[cpu][*rctx];
+	return (((void *)entries->cpu_entries[cpu]) +
+		(*rctx * perf_callchain_entry__sizeof()));
 }
 
 static void
@@ -215,3 +224,25 @@ exit_put:
 
 	return entry;
 }
+
+int perf_event_max_stack_handler(struct ctl_table *table, int write,
+				 void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	int new_value = sysctl_perf_event_max_stack, ret;
+	struct ctl_table new_table = *table;
+
+	new_table.data = &new_value;
+	ret = proc_dointvec_minmax(&new_table, write, buffer, lenp, ppos);
+	if (ret || !write)
+		return ret;
+
+	mutex_lock(&callchain_mutex);
+	if (atomic_read(&nr_callchain_events))
+		ret = -EBUSY;
+	else
+		sysctl_perf_event_max_stack = new_value;
+
+	mutex_unlock(&callchain_mutex);
+
+	return ret;
+}
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 725587f10667..c8b318663525 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -130,6 +130,9 @@ static int one_thousand = 1000;
 #ifdef CONFIG_PRINTK
 static int ten_thousand = 10000;
 #endif
+#ifdef CONFIG_PERF_EVENTS
+static int six_hundred_forty_kb = 640 * 1024;
+#endif
 
 /* this is needed for the proc_doulongvec_minmax of vm_dirty_bytes */
 static unsigned long dirty_bytes_min = 2 * PAGE_SIZE;
@@ -1144,6 +1147,15 @@ static struct ctl_table kern_table[] = {
 		.extra1		= &zero,
 		.extra2		= &one_hundred,
 	},
+	{
+		.procname	= "perf_event_max_stack",
+		.data		= NULL, /* filled in by handler */
+		.maxlen		= sizeof(sysctl_perf_event_max_stack),
+		.mode		= 0644,
+		.proc_handler	= perf_event_max_stack_handler,
+		.extra1		= &zero,
+		.extra2		= &six_hundred_forty_kb,
+	},
 #endif
 #ifdef CONFIG_KMEMCHECK
 	{
-- 
2.5.5

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ