lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <1391683834-29868-7-git-send-email-alexander.shishkin@linux.intel.com>
Date:	Thu,  6 Feb 2014 12:50:29 +0200
From:	Alexander Shishkin <alexander.shishkin@...ux.intel.com>
To:	Peter Zijlstra <a.p.zijlstra@...llo.nl>
Cc:	Ingo Molnar <mingo@...hat.com>, linux-kernel@...r.kernel.org,
	Frederic Weisbecker <fweisbec@...il.com>,
	Mike Galbraith <efault@....de>,
	Paul Mackerras <paulus@...ba.org>,
	Stephane Eranian <eranian@...gle.com>,
	Andi Kleen <ak@...ux.intel.com>,
	Adrian Hunter <adrian.hunter@...el.com>,
	Matt Fleming <matt.fleming@...el.com>,
	Alexander Shishkin <alexander.shishkin@...ux.intel.com>
Subject: [PATCH v1 06/11] itrace: Add functionality to include traces in process core dumps

Per-thread trace data provided by itrace PMUs can be included in process
core dumps. This is controlled via a new rlimit, RLIMIT_ITRACE, and is
implemented by a per-thread kernel counter that is created when
RLIMIT_ITRACE is set.

The value of RLIMIT_ITRACE indicates the size of the per-thread ELF note
in a core dump and the size of the buffer used to collect the
corresponding trace.

Signed-off-by: Alexander Shishkin <alexander.shishkin@...ux.intel.com>
---
 fs/binfmt_elf.c                     |   6 +
 fs/proc/base.c                      |   1 +
 include/asm-generic/resource.h      |   1 +
 include/linux/itrace.h              |  36 +++++
 include/linux/perf_event.h          |   3 +
 include/uapi/asm-generic/resource.h |   3 +-
 include/uapi/linux/elf.h            |   1 +
 kernel/events/itrace.c              | 289 +++++++++++++++++++++++++++++++++++-
 kernel/exit.c                       |   3 +
 kernel/sys.c                        |   5 +
 10 files changed, 343 insertions(+), 5 deletions(-)

diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 571a423..c7fcd49 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -34,6 +34,7 @@
 #include <linux/utsname.h>
 #include <linux/coredump.h>
 #include <linux/sched.h>
+#include <linux/itrace.h>
 #include <asm/uaccess.h>
 #include <asm/param.h>
 #include <asm/page.h>
@@ -1576,6 +1577,8 @@ static int fill_thread_core_info(struct elf_thread_core_info *t,
 		}
 	}
 
+	*total += itrace_elf_note_size(t->task);
+
 	return 1;
 }
 
@@ -1608,6 +1611,7 @@ static int fill_note_info(struct elfhdr *elf, int phdrs,
 	for (i = 0; i < view->n; ++i)
 		if (view->regsets[i].core_note_type != 0)
 			++info->thread_notes;
+	info->thread_notes++; /* ITRACE */
 
 	/*
 	 * Sanity check.  We rely on regset 0 being in NT_PRSTATUS,
@@ -1710,6 +1714,8 @@ static int write_note_info(struct elf_note_info *info,
 			    !writenote(&t->notes[i], cprm))
 				return 0;
 
+		itrace_elf_note_write(cprm, t->task);
+
 		first = 0;
 		t = t->next;
 	} while (t);
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 03c8d74..69935a9 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -471,6 +471,7 @@ static const struct limit_names lnames[RLIM_NLIMITS] = {
 	[RLIMIT_NICE] = {"Max nice priority", NULL},
 	[RLIMIT_RTPRIO] = {"Max realtime priority", NULL},
 	[RLIMIT_RTTIME] = {"Max realtime timeout", "us"},
+	[RLIMIT_ITRACE] = {"Max ITRACE buffer size", "bytes"},
 };
 
 /* Display limits for a process */
diff --git a/include/asm-generic/resource.h b/include/asm-generic/resource.h
index b4ea8f5..e6e5657 100644
--- a/include/asm-generic/resource.h
+++ b/include/asm-generic/resource.h
@@ -25,6 +25,7 @@
 	[RLIMIT_NICE]		= { 0, 0 },				\
 	[RLIMIT_RTPRIO]		= { 0, 0 },				\
 	[RLIMIT_RTTIME]		= {  RLIM_INFINITY,  RLIM_INFINITY },	\
+	[RLIMIT_ITRACE]		= {              0,  RLIM_INFINITY },	\
 }
 
 #endif
diff --git a/include/linux/itrace.h b/include/linux/itrace.h
index 6adbb32..c1eb6d3 100644
--- a/include/linux/itrace.h
+++ b/include/linux/itrace.h
@@ -22,6 +22,7 @@
 
 #include <linux/perf_event.h>
 #include <linux/file.h>
+#include <linux/coredump.h>
 
 extern struct ring_buffer_ops itrace_rb_ops;
 
@@ -66,6 +67,19 @@ struct itrace_pmu {
 	void			(*sample_output)(struct perf_event *event,
 						 struct perf_output_handle *handle,
 						 struct perf_sample_data *data);
+
+	/*
+	 * Get the PMU-specific part of a core dump note
+	 */
+	size_t			(*core_size)(struct perf_event *event);
+
+	/*
+	 * Write out the core dump note
+	 */
+	void			(*core_output)(struct coredump_params *cprm,
+					       struct perf_event *event,
+					       unsigned long len);
+	u64			coredump_config;
 	char			*name;
 };
 
@@ -95,6 +109,17 @@ extern unsigned long itrace_sampler_trace(struct perf_event *event,
 extern void itrace_sampler_output(struct perf_event *event,
 				  struct perf_output_handle *handle,
 				  struct perf_sample_data *data);
+
+extern int update_itrace_rlimit(struct task_struct *, unsigned long);
+extern void exit_itrace(struct task_struct *);
+
+struct itrace_note {	/* fixed header emitted first in the NT_ITRACE note descriptor */
+	u64	itrace_config;	/* attr.itrace_config of the dumped event */
+};
+
+extern size_t itrace_elf_note_size(struct task_struct *tsk);
+extern void itrace_elf_note_write(struct coredump_params *cprm,
+				  struct task_struct *task);
 #else
 static int itrace_kernel_event(struct perf_event *event,
 			       struct task_struct *task)	{ return 0; }
@@ -121,6 +146,17 @@ static inline void
 itrace_sampler_output(struct perf_event *event,
 		      struct perf_output_handle *handle,
 		      struct perf_sample_data *data)		{}
+
+/* No-op stubs for the #else branch; pre-C23 C requires named parameters in definitions */
+static inline int update_itrace_rlimit(struct task_struct *task,
+				       unsigned long rlim)	{ return -EINVAL; }
+static inline void exit_itrace(struct task_struct *task)	{}
+static inline size_t
+itrace_elf_note_size(struct task_struct *tsk)			{ return 0; }
+static inline void
+itrace_elf_note_write(struct coredump_params *cprm,
+		      struct task_struct *task)			{}
+
 #endif
 
 #endif /* _LINUX_PERF_EVENT_H */
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 11eb133..8353d7f 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -106,6 +106,9 @@ struct event_constraint;
 enum perf_itrace_counter_type {
 	PERF_ITRACE_USER	= BIT(1),
 	PERF_ITRACE_SAMPLING	= BIT(2),
+	PERF_ITRACE_COREDUMP	= BIT(3),
+	PERF_ITRACE_KERNEL	= (PERF_ITRACE_SAMPLING | PERF_ITRACE_COREDUMP),
+	PERF_ITRACE_ANY		= (PERF_ITRACE_KERNEL | PERF_ITRACE_USER),
 };
 
 /**
diff --git a/include/uapi/asm-generic/resource.h b/include/uapi/asm-generic/resource.h
index f863428..073f413 100644
--- a/include/uapi/asm-generic/resource.h
+++ b/include/uapi/asm-generic/resource.h
@@ -45,7 +45,8 @@
 					   0-39 for nice level 19 .. -20 */
 #define RLIMIT_RTPRIO		14	/* maximum realtime priority */
 #define RLIMIT_RTTIME		15	/* timeout for RT tasks in us */
-#define RLIM_NLIMITS		16
+#define RLIMIT_ITRACE		16	/* max itrace size */
+#define RLIM_NLIMITS		17
 
 /*
  * SuS says limits have to be unsigned.
diff --git a/include/uapi/linux/elf.h b/include/uapi/linux/elf.h
index ef6103b..4bfbf66 100644
--- a/include/uapi/linux/elf.h
+++ b/include/uapi/linux/elf.h
@@ -369,6 +369,7 @@ typedef struct elf64_shdr {
 #define NT_PRPSINFO	3
 #define NT_TASKSTRUCT	4
 #define NT_AUXV		6
+#define NT_ITRACE	7
 /*
  * Note to userspace developers: size of NT_SIGINFO note may increase
  * in the future to accomodate more fields, don't assume it is fixed!
diff --git a/kernel/events/itrace.c b/kernel/events/itrace.c
index f003530..1cc9a36 100644
--- a/kernel/events/itrace.c
+++ b/kernel/events/itrace.c
@@ -20,15 +20,21 @@
 #undef DEBUG
 
 #include <linux/kernel.h>
+#include <linux/sched.h>
 #include <linux/perf_event.h>
 #include <linux/itrace.h>
 #include <linux/sizes.h>
+#include <linux/elf.h>
+#include <linux/coredump.h>
 #include <linux/slab.h>
 
 #include "internal.h"
 
 static LIST_HEAD(itrace_pmus);
 static DEFINE_MUTEX(itrace_pmus_mutex);
+static struct itrace_pmu *itrace_pmu_coredump;
+
+#define CORE_OWNER "ITRACE"
 
 struct static_key_deferred itrace_core_events __read_mostly;
 
@@ -91,8 +97,12 @@ bool is_itrace_event(struct perf_event *event)
 
 static void itrace_event_destroy(struct perf_event *event)
 {
+	struct task_struct *task = event->hw.itrace_target;
 	struct ring_buffer *rb = event->rb[PERF_RB_ITRACE];
 
+	if (task && event->hw.counter_type == PERF_ITRACE_COREDUMP)
+		static_key_slow_dec_deferred(&itrace_core_events);
+
 	if (!rb)
 		return;
 
@@ -268,6 +278,10 @@ int itrace_inherit_event(struct perf_event *event, struct task_struct *task)
 	}
 
 	event->hw.counter_type = parent->hw.counter_type;
+	if (event->hw.counter_type == PERF_ITRACE_COREDUMP) {
+		static_key_slow_inc(&itrace_core_events.key);
+		size = task_rlimit(task, RLIMIT_ITRACE);
+	}
 
 	size = roundup_buffer_size(size);
 	rb = rb_alloc(event, size >> PAGE_SHIFT, 0, event->cpu, 0,
@@ -294,10 +308,10 @@ int itrace_kernel_event(struct perf_event *event, struct task_struct *task)
 
 	ipmu = to_itrace_pmu(event->pmu);
 
-	if (!event->attr.itrace_sample_size)
-		return 0;
-
-	size = roundup_buffer_size(event->attr.itrace_sample_size);
+	if (event->attr.itrace_sample_size)
+		size = roundup_buffer_size(event->attr.itrace_sample_size);
+	else
+		size = task_rlimit(task, RLIMIT_ITRACE);
 
 	rb = rb_alloc(event, size >> PAGE_SHIFT, 0, event->cpu, 0,
 		      &itrace_rb_ops);
@@ -325,6 +339,104 @@ void itrace_wake_up(struct perf_event *event)
 	rcu_read_unlock();
 }
 
+/* "coredump" attribute show: prints 1 if this PMU is the core dump PMU, else 0 */
+static ssize_t coredump_show(struct device *dev,
+			     struct device_attribute *attr, char *page)
+{
+	struct pmu *pmu = dev_get_drvdata(dev);
+	struct itrace_pmu *this_pmu = to_itrace_pmu(pmu);
+	int selected;
+
+	/* sample the current selection under itrace_pmus_mutex */
+	mutex_lock(&itrace_pmus_mutex);
+	selected = (itrace_pmu_coredump == this_pmu);
+	mutex_unlock(&itrace_pmus_mutex);
+
+	return snprintf(page, PAGE_SIZE-1, "%d\n", selected);
+}
+
+/* "coredump" attribute store: any write selects this PMU for core dump tracing */
+static ssize_t coredump_store(struct device *dev, struct device_attribute *attr,
+			      const char *buf, size_t count)
+{
+	struct itrace_pmu *ipmu = to_itrace_pmu(dev_get_drvdata(dev));
+	ssize_t ret = -EOPNOTSUPP; /* PMU lacks core_size()/core_output() */
+
+	mutex_lock(&itrace_pmus_mutex);
+	if (ipmu->core_size && ipmu->core_output) {
+		itrace_pmu_coredump = ipmu;
+		ret = count;
+	}
+	mutex_unlock(&itrace_pmus_mutex);
+	return ret;
+}
+static DEVICE_ATTR_RW(coredump);
+
+/* "coredump_config" attribute show: prints coredump_config as 16 hex digits */
+static ssize_t coredump_config_show(struct device *dev,
+				    struct device_attribute *attr, char *page)
+{
+	struct pmu *pmu = dev_get_drvdata(dev);
+	struct itrace_pmu *ipmu = to_itrace_pmu(pmu);
+	u64 cfg = ipmu->coredump_config;
+
+	return snprintf(page, PAGE_SIZE-1, "%016llx\n", cfg);
+}
+
+/* "coredump_config" attribute store: parse @buf with kstrtou64() (base 0) and save it */
+static ssize_t coredump_config_store(struct device *dev,
+				     struct device_attribute *attr,
+				     const char *buf, size_t count)
+{
+	struct pmu *pmu = dev_get_drvdata(dev);
+	struct itrace_pmu *ipmu = to_itrace_pmu(pmu);
+	u64 value;
+	int err = kstrtou64(buf, 0, &value);
+
+	/* leave coredump_config untouched when the input does not parse */
+	if (err)
+		return err;
+
+	ipmu->coredump_config = value;
+
+	return count;
+}
+static DEVICE_ATTR_RW(coredump_config);
+
+static struct attribute *itrace_attrs[] = {	/* NULL-terminated list of the two core dump attributes */
+	&dev_attr_coredump.attr,
+	&dev_attr_coredump_config.attr,
+	NULL,
+};
+
+struct attribute_group itrace_group = {	/* appended to a PMU's groups by itrace_get_attr_groups() */
+	.attrs	= itrace_attrs,
+};
+
+/* Return a new NULL-terminated copy of @pgroups (may be NULL) with itrace_group appended */
+static const struct attribute_group **
+itrace_get_attr_groups(const struct attribute_group **pgroups)
+{
+	const struct attribute_group **groups;
+	int i, ngroups = 2; /* itrace_group + NULL terminator */
+	size_t size;
+
+	for (i = 0; pgroups && pgroups[i]; i++)
+		ngroups++;
+
+	size = sizeof(struct attribute_group *) * ngroups;
+	groups = kzalloc(size, GFP_KERNEL);
+	if (!groups)
+		return pgroups; /* OOM: keep the PMU's own groups, without ours */
+
+	for (i = 0; pgroups && pgroups[i]; i++)
+		groups[i] = pgroups[i];
+
+	groups[i] = &itrace_group;
+
+	return groups;
+}
+
 int itrace_pmu_register(struct itrace_pmu *ipmu)
 {
 	int ret;
@@ -334,6 +446,7 @@ int itrace_pmu_register(struct itrace_pmu *ipmu)
 
 	ipmu->event_init = ipmu->pmu.event_init;
 	ipmu->pmu.event_init = itrace_event_init;
+	ipmu->pmu.attr_groups = itrace_get_attr_groups(ipmu->pmu.attr_groups);
 
 	ret = perf_pmu_register(&ipmu->pmu, ipmu->name, -1);
 	if (ret)
@@ -341,6 +454,8 @@ int itrace_pmu_register(struct itrace_pmu *ipmu)
 
 	mutex_lock(&itrace_pmus_mutex);
 	list_add_tail_rcu(&ipmu->entry, &itrace_pmus);
+	if (ipmu->core_size && ipmu->core_output)
+		itrace_pmu_coredump = ipmu;
 	mutex_unlock(&itrace_pmus_mutex);
 
 	return ret;
@@ -422,3 +537,169 @@ void itrace_sampler_output(struct perf_event *event,
 	ipmu = to_itrace_pmu(tevt->pmu);
 	ipmu->sample_output(tevt, handle, data);
 }
+
+/*
+ * Core dump bits
+ *
+ * Various parts of the kernel will call here:
+ *   + do_prlimit(): to tell us that the user is trying to set RLIMIT_ITRACE
+ *   + various places in binfmt_elf.c: to write out itrace notes
+ *   + do_exit(): to destroy the first core dump counter
+ *   + the rest (copy_process()/do_exit()) is taken care of by perf for us
+ */
+
+static struct perf_event *
+itrace_find_task_event(struct task_struct *task, unsigned type)
+{
+	struct perf_event_context *ctx;
+	struct perf_event *event = NULL;
+	/* Find @task's first cpu == -1 itrace event whose counter_type intersects @type */
+	rcu_read_lock();
+	ctx = rcu_dereference(task->perf_event_ctxp[perf_hw_context]);
+	if (!ctx)
+		goto out;
+	/* walk the hardware context's event list under the RCU read lock */
+	list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
+		if (is_itrace_event(event) &&
+		    event->cpu == -1 &&
+		    !!(event->hw.counter_type & type))
+			goto out;
+	}
+	/* the loop ran to completion: nothing matched, return NULL */
+	event = NULL;
+out:
+	rcu_read_unlock();
+	/* NOTE(review): no reference is taken; the event is returned after rcu_read_unlock() */
+	return event;
+}
+
+/* Rebuild @task's core dump counter after RLIMIT_ITRACE changed to @rlim; returns 0 or -errno */
+int update_itrace_rlimit(struct task_struct *task, unsigned long rlim)
+{
+	struct perf_event_attr attr;
+	struct perf_event *event;
+
+	event = itrace_find_task_event(task, PERF_ITRACE_ANY);
+	if (event) {
+		/* a non-coredump itrace event is already attached to the task */
+		if (event->hw.counter_type != PERF_ITRACE_COREDUMP)
+			return -EINVAL;
+
+		/* NOTE(review): itrace_event_destroy() also decrements the key; confirm no double dec */
+		perf_event_release_kernel(event);
+		static_key_slow_dec_deferred(&itrace_core_events);
+	}
+
+	if (!rlim)
+		return 0;
+
+	/* zeroed attr: config and sample_type stay 0 */
+	memset(&attr, 0, sizeof(attr));
+
+	mutex_lock(&itrace_pmus_mutex);
+	if (!itrace_pmu_coredump) {
+		mutex_unlock(&itrace_pmus_mutex);
+		return -EOPNOTSUPP; /* ENOTSUPP is kernel-internal, not for userspace */
+	}
+
+	attr.type = itrace_pmu_coredump->pmu.type;
+	attr.exclude_kernel = 1;
+	attr.inherit = 1;
+	attr.itrace_config = itrace_pmu_coredump->coredump_config;
+
+	event = perf_event_create_kernel_counter(&attr, -1, task, NULL, NULL);
+	mutex_unlock(&itrace_pmus_mutex);
+	if (IS_ERR(event))
+		return PTR_ERR(event);
+
+	static_key_slow_inc(&itrace_core_events.key);
+	event->hw.counter_type = PERF_ITRACE_COREDUMP;
+	perf_event_enable(event);
+
+	return 0;
+}
+
+/* Release @task's own (non-inherited) core dump counter, if it has one */
+static void itrace_pmu_exit_task(struct task_struct *task)
+{
+	struct perf_event *coredump_evt;
+
+	coredump_evt = itrace_find_task_event(task, PERF_ITRACE_COREDUMP);
+
+	/*
+	 * Only the top-level kernel counter made by update_itrace_rlimit()
+	 * is ours to release here. Inherited counters (->parent set) are
+	 * left to perf_event_exit_task(), and sampling counters are left to
+	 * itrace_sampler_fini().
+	 */
+	if (!coredump_evt || coredump_evt->parent)
+		return;
+
+	perf_event_release_kernel(coredump_evt);
+}
+
+void exit_itrace(struct task_struct *task)	/* called from do_exit() */
+{
+	if (static_key_false(&itrace_core_events.key))	/* only look for a counter while the key is on */
+		itrace_pmu_exit_task(task);
+}
+
+/* Bytes @task's NT_ITRACE note adds to the core file; 0 if it has no core dump counter */
+size_t itrace_elf_note_size(struct task_struct *task)
+{
+	struct perf_event *event;
+	struct itrace_pmu *ipmu;
+	size_t desc;
+
+	event = itrace_find_task_event(task, PERF_ITRACE_COREDUMP);
+	if (!event)
+		return 0;
+
+	perf_event_disable(event); /* presumably freezes the trace before it is dumped */
+	ipmu = to_itrace_pmu(event->pmu);
+	/* PMU-specific part + RLIMIT_ITRACE bytes + PMU name, padded to 4 bytes */
+	desc = ipmu->core_size(event);
+	desc += task_rlimit(task, RLIMIT_ITRACE);
+	desc = roundup(desc + strlen(ipmu->name) + 1, 4);
+	return desc + sizeof(struct itrace_note) + sizeof(struct elf_note) +
+	       roundup(sizeof(CORE_OWNER), 4);
+}
+
+/* Write @task's NT_ITRACE note (ELF header, owner, itrace header, PMU name, PMU data) to @cprm */
+void itrace_elf_note_write(struct coredump_params *cprm,
+			   struct task_struct *task)
+{
+	struct perf_event *event;
+	struct itrace_note note;
+	struct itrace_pmu *ipmu;
+	struct elf_note en;
+	unsigned long rlim;
+	size_t pmu_len;
+
+	event = itrace_find_task_event(task, PERF_ITRACE_COREDUMP);
+	if (!event)
+		return;
+
+	ipmu = to_itrace_pmu(event->pmu);
+	pmu_len = strlen(ipmu->name) + 1;
+	rlim = task_rlimit(task, RLIMIT_ITRACE);
+
+	/* Elf note with name; n_namesz counts the NUL that is emitted below */
+	en.n_namesz = sizeof(CORE_OWNER);
+	en.n_descsz = roundup(ipmu->core_size(event) + rlim + sizeof(note) +
+			      pmu_len, 4);
+	en.n_type = NT_ITRACE;
+	dump_emit(cprm, &en, sizeof(en));
+	dump_align(cprm, 4);
+	dump_emit(cprm, CORE_OWNER, sizeof(CORE_OWNER));
+	dump_align(cprm, 4);
+
+	/* ITRACE header */
+	note.itrace_config = event->attr.itrace_config;
+	dump_emit(cprm, &note, sizeof(note));
+	dump_emit(cprm, ipmu->name, pmu_len);
+
+	/* ITRACE PMU header + payload */
+	ipmu->core_output(cprm, event, rlim);
+	dump_align(cprm, 4);
+}
diff --git a/kernel/exit.c b/kernel/exit.c
index a949819..28138ef 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -48,6 +48,7 @@
 #include <linux/fs_struct.h>
 #include <linux/init_task.h>
 #include <linux/perf_event.h>
+#include <linux/itrace.h>
 #include <trace/events/sched.h>
 #include <linux/hw_breakpoint.h>
 #include <linux/oom.h>
@@ -788,6 +789,8 @@ void do_exit(long code)
 	check_stack_usage();
 	exit_thread();
 
+	exit_itrace(tsk);
+
 	/*
 	 * Flush inherited counters to the parent - before the parent
 	 * gets woken up by child-exit notifications.
diff --git a/kernel/sys.c b/kernel/sys.c
index c723113..7651d6f 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -14,6 +14,7 @@
 #include <linux/fs.h>
 #include <linux/kmod.h>
 #include <linux/perf_event.h>
+#include <linux/itrace.h>
 #include <linux/resource.h>
 #include <linux/kernel.h>
 #include <linux/workqueue.h>
@@ -1402,6 +1403,10 @@ int do_prlimit(struct task_struct *tsk, unsigned int resource,
 		update_rlimit_cpu(tsk, new_rlim->rlim_cur);
 out:
 	read_unlock(&tasklist_lock);
+
+	if (!retval && new_rlim && resource == RLIMIT_ITRACE)
+		retval = update_itrace_rlimit(tsk, new_rlim->rlim_cur);
+
 	return retval;
 }
 
-- 
1.8.5.2

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ