Date:	Thu, 20 Aug 2009 14:00:06 +0200
From:	Peter Zijlstra <peterz@...radead.org>
To:	Ingo Molnar <mingo@...e.hu>
Cc:	Paul Mackerras <paulus@...ba.org>,
	Arnaldo Carvalho de Melo <acme@...hat.com>,
	Frederic Weisbecker <fweisbec@...il.com>,
	Mike Galbraith <efault@....de>, linux-kernel@...r.kernel.org,
	Jens Axboe <jens.axboe@...cle.com>,
	James Morris <jmorris@...ei.org>,
	Linus Torvalds <torvalds@...ux-foundation.org>
Subject: Re: [PATCH 1/4] perf_counter: Default to higher paranoia level

On Wed, 2009-08-19 at 16:07 +0200, Peter Zijlstra wrote:
> On Wed, 2009-08-19 at 11:18 +0200, Peter Zijlstra wrote:
> 
> > +static inline bool perf_paranoid_anon(void)
> > +{
> > +	return !capable(CAP_SYS_ADMIN) && sysctl_perf_counter_paranoid > 1;
> >  }
> >  
> >  static inline bool perf_paranoid_kernel(void)
> >  {
> > -	return sysctl_perf_counter_paranoid > 1;
> > +	return !capable(CAP_SYS_ADMIN) && sysctl_perf_counter_paranoid > 2;
> > +}
> 
> OK, this is buggy:
> 
>  - capable() uses current, which is unlikely to be counter->owner,
>  - but even security_real_capable(counter->owner, ...) wouldn't
>    work, since the ->capable() callback isn't NMI safe
>    (selinux takes locks and does allocations in that path).
> 
> This puts a severe strain on more complex anonymizers, since it's
> basically impossible to tell whether counter->owner has permissions on
> current from NMI context.
> 
> I'll fix up this patch to pre-compute the perf_paranoid_anon_ip() per
> counter based on creation time state, unless somebody has a better idea.
> 
> I could possibly only anonymize IRQ context (SoftIRQ context is
> difficult since in_softirq() means both in-softirq and
> softirq-disabled).
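
For reference -- simplified from include/linux/hardirq.h as it stands, just
to make the SoftIRQ ambiguity above concrete:

	#define softirq_count()	(preempt_count() & SOFTIRQ_MASK)
	#define in_softirq()	(softirq_count())

local_bh_disable() bumps preempt_count() by SOFTIRQ_OFFSET as well, so
in_softirq() is non-zero both while actually servicing softirqs and while
they're merely disabled; the preempt count alone can't tell the two apart.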

Something like the below, maybe... it's 3 patches folded together, and I've
no clue how to adapt the ppc code. What do people think?

Compile-tested on x86-64 only.
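
The short version, as a rough sketch of what the patch below boils down to
(not the actual diff):

	/* at counter creation time, where capable() is still meaningful */
	counter->restricted = perf_paranoid_cpu();

	/* at sample time (IRQ/NMI), only the preempt count is consulted */
	static bool perf_counter_restricted(struct perf_counter *counter)
	{
		if (counter->restricted)
			return hardirq_count() - (in_nmi() ? 0 : HARDIRQ_OFFSET);

		return 0;
	}

Restricted counters that got sampled in hardirq context then have their
sample IP zeroed, and the callchain code always skips exception (NMI)
frames and, for restricted entries, IRQ frames as well, so the trace stops
at the task's own stack.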

---
 arch/x86/include/asm/stacktrace.h  |   14 +++++--
 arch/x86/kernel/cpu/perf_counter.c |   66 +++++++++++++++++++++++++++++++----
 arch/x86/kernel/dumpstack.c        |   10 +++--
 arch/x86/kernel/dumpstack_32.c     |    8 +----
 arch/x86/kernel/dumpstack_64.c     |   13 ++-----
 arch/x86/kernel/stacktrace.c       |    8 +++--
 arch/x86/oprofile/backtrace.c      |    5 ++-
 include/linux/perf_counter.h       |    5 ++-
 kernel/perf_counter.c              |   30 ++++++++++++----
 kernel/trace/trace_sysprof.c       |    5 ++-
 10 files changed, 117 insertions(+), 47 deletions(-)

diff --git a/arch/x86/include/asm/stacktrace.h b/arch/x86/include/asm/stacktrace.h
index cf86a5e..7066caa 100644
--- a/arch/x86/include/asm/stacktrace.h
+++ b/arch/x86/include/asm/stacktrace.h
@@ -3,17 +3,23 @@
 
 extern int kstack_depth_to_print;
 
-int x86_is_stack_id(int id, char *name);
-
 /* Generic stack tracer with callbacks */
 
+enum stack_type {
+	STACK_UNKNOWN	= -1,
+	STACK_PROCESS   = 0,
+	STACK_INTERRUPT = 1,
+	STACK_EXCEPTION = 2,
+};
+
 struct stacktrace_ops {
 	void (*warning)(void *data, char *msg);
 	/* msg must contain %s for the symbol */
 	void (*warning_symbol)(void *data, char *msg, unsigned long symbol);
-	void (*address)(void *data, unsigned long address, int reliable);
+	void (*address)(void *data, unsigned long stack,
+			unsigned long address, int reliable);
 	/* On negative return stop dumping */
-	int (*stack)(void *data, char *name);
+	int (*stack)(void *data, int type, char *name);
 };
 
 void dump_trace(struct task_struct *tsk, struct pt_regs *regs,
diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 396e35d..a6ddb3b 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -2108,7 +2108,7 @@ void callchain_store(struct perf_callchain_entry *entry, u64 ip)
 
 static DEFINE_PER_CPU(struct perf_callchain_entry, irq_entry);
 static DEFINE_PER_CPU(struct perf_callchain_entry, nmi_entry);
-static DEFINE_PER_CPU(int, in_nmi_frame);
+static DEFINE_PER_CPU(int, skip_frame);
 
 
 static void
@@ -2122,21 +2122,49 @@ static void backtrace_warning(void *data, char *msg)
 	/* Ignore warnings */
 }
 
-static int backtrace_stack(void *data, char *name)
+static int backtrace_stack(void *data, int type, char *name)
 {
-	per_cpu(in_nmi_frame, smp_processor_id()) =
-			x86_is_stack_id(NMI_STACK, name);
+	struct perf_callchain_entry *entry = data;
+	int skip = 0;
+
+	/*
+	 * Always skip exception (NMI) context
+	 */
+	if (type == STACK_EXCEPTION)
+		skip = 1;
+
+	/*
+	 * If we're restricted, don't show IRQ context
+	 */
+	if (entry->restricted && type == STACK_INTERRUPT)
+		skip = 1;
+
+	__get_cpu_var(skip_frame) = skip;
 
 	return 0;
 }
 
-static void backtrace_address(void *data, unsigned long addr, int reliable)
+static void backtrace_address(void *data, unsigned long stack,
+			      unsigned long addr, int reliable)
 {
 	struct perf_callchain_entry *entry = data;
 
-	if (per_cpu(in_nmi_frame, smp_processor_id()))
+	if (__get_cpu_var(skip_frame))
 		return;
 
+#ifdef CONFIG_4KSTACKS
+	if (entry->restricted) {
+		struct thread_info *tinfo = current_thread_info();
+		unsigned long task_stack = (unsigned long)tinfo;
+
+		/*
+		 * If it's not on the task stack, don't show it.
+		 */
+		if (stack < task_stack || stack >= task_stack + PAGE_SIZE)
+			return;
+	}
+#endif
+
 	if (reliable)
 		callchain_store(entry, addr);
 }
@@ -2153,8 +2181,28 @@ static const struct stacktrace_ops backtrace_ops = {
 static void
 perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry)
 {
+	int irq = hardirq_count() - (in_nmi() ? 0 : HARDIRQ_OFFSET);
+
 	callchain_store(entry, PERF_CONTEXT_KERNEL);
-	callchain_store(entry, regs->ip);
+	if (!(entry->restricted && irq))
+		callchain_store(entry, regs->ip);
+
+#ifdef CONFIG_X86_32
+	/*
+	 * i386 is a friggin mess, 4KSTACKS makes it somewhat saner but
+	 * still don't have a way to identify NMI entries as they are
+	 * nested onto whatever stack is there.
+	 *
+	 * So until i386 starts using per-context stacks unconditionally
+	 * and fixes up the unwinder (dump_trace behaves differently between
+	 * i386 and x86_64), we'll have to unconditionally truncate kernel
+	 * stacks that have IRQ context bits in them.
+	 */
+#ifndef CONFIG_4KSTACKS
+	if (irq)
+		return;
+#endif
+#endif
 
 	dump_trace(NULL, regs, NULL, 0, &backtrace_ops, entry);
 }
@@ -2255,7 +2303,8 @@ perf_do_callchain(struct pt_regs *regs, struct perf_callchain_entry *entry)
 		perf_callchain_user(regs, entry);
 }
 
-struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
+struct perf_callchain_entry *
+perf_callchain(struct perf_counter *counter, struct pt_regs *regs)
 {
 	struct perf_callchain_entry *entry;
 
@@ -2265,6 +2314,7 @@ struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
 		entry = &__get_cpu_var(irq_entry);
 
 	entry->nr = 0;
+	entry->restricted = counter->restricted;
 
 	perf_do_callchain(regs, entry);
 
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index 2d8a371..978ae89 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -51,7 +51,7 @@ print_ftrace_graph_addr(unsigned long addr, void *data,
 	index -= *graph;
 	ret_addr = task->ret_stack[index].ret;
 
-	ops->address(data, ret_addr, 1);
+	ops->address(data, 0, ret_addr, 1);
 
 	(*graph)++;
 }
@@ -96,12 +96,14 @@ print_context_stack(struct thread_info *tinfo,
 
 		addr = *stack;
 		if (__kernel_text_address(addr)) {
-			if ((unsigned long) stack == bp + sizeof(long)) {
-				ops->address(data, addr, 1);
+			unsigned long stk = (unsigned long)stack;
+
+			if (stk == bp + sizeof(long)) {
+				ops->address(data, stk, addr, 1);
 				frame = frame->next_frame;
 				bp = (unsigned long) frame;
 			} else {
-				ops->address(data, addr, 0);
+				ops->address(data, stk, addr, 0);
 			}
 			print_ftrace_graph_addr(addr, data, ops, tinfo, graph);
 		}
diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c
index bca5fba..be633ed 100644
--- a/arch/x86/kernel/dumpstack_32.c
+++ b/arch/x86/kernel/dumpstack_32.c
@@ -19,12 +19,6 @@
 
 #include "dumpstack.h"
 
-/* Just a stub for now */
-int x86_is_stack_id(int id, char *name)
-{
-	return 0;
-}
-
 void dump_trace(struct task_struct *task, struct pt_regs *regs,
 		unsigned long *stack, unsigned long bp,
 		const struct stacktrace_ops *ops, void *data)
@@ -64,7 +58,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
 		stack = (unsigned long *)context->previous_esp;
 		if (!stack)
 			break;
-		if (ops->stack(data, "IRQ") < 0)
+		if (ops->stack(data, STACK_UNKNOWN, "IRQ") < 0)
 			break;
 		touch_nmi_watchdog();
 	}
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
index 54b0a32..9309c5c 100644
--- a/arch/x86/kernel/dumpstack_64.c
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -32,11 +32,6 @@ static char x86_stack_ids[][8] = {
 #endif
 	};
 
-int x86_is_stack_id(int id, char *name)
-{
-	return x86_stack_ids[id - 1] == name;
-}
-
 static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
 					unsigned *usedp, char **idp)
 {
@@ -155,12 +150,12 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
 						&used, &id);
 
 		if (estack_end) {
-			if (ops->stack(data, id) < 0)
+			if (ops->stack(data, STACK_EXCEPTION, id) < 0)
 				break;
 
 			bp = print_context_stack(tinfo, stack, bp, ops,
 						 data, estack_end, &graph);
-			ops->stack(data, "<EOE>");
+			ops->stack(data, STACK_PROCESS, "<EOE>");
 			/*
 			 * We link to the next stack via the
 			 * second-to-last pointer (index -2 to end) in the
@@ -175,7 +170,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
 				(IRQ_STACK_SIZE - 64) / sizeof(*irq_stack);
 
 			if (stack >= irq_stack && stack < irq_stack_end) {
-				if (ops->stack(data, "IRQ") < 0)
+				if (ops->stack(data, STACK_INTERRUPT, "IRQ") < 0)
 					break;
 				bp = print_context_stack(tinfo, stack, bp,
 					ops, data, irq_stack_end, &graph);
@@ -186,7 +181,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
 				 */
 				stack = (unsigned long *) (irq_stack_end[-1]);
 				irq_stack_end = NULL;
-				ops->stack(data, "EOI");
+				ops->stack(data, STACK_PROCESS, "EOI");
 				continue;
 			}
 		}
diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c
index c3eb207..a1415f4 100644
--- a/arch/x86/kernel/stacktrace.c
+++ b/arch/x86/kernel/stacktrace.c
@@ -18,12 +18,13 @@ save_stack_warning_symbol(void *data, char *msg, unsigned long symbol)
 {
 }
 
-static int save_stack_stack(void *data, char *name)
+static int save_stack_stack(void *data, int type, char *name)
 {
 	return 0;
 }
 
-static void save_stack_address(void *data, unsigned long addr, int reliable)
+static void save_stack_address(void *data, unsigned long stack,
+			       unsigned long addr, int reliable)
 {
 	struct stack_trace *trace = data;
 	if (!reliable)
@@ -37,7 +38,8 @@ static void save_stack_address(void *data, unsigned long addr, int reliable)
 }
 
 static void
-save_stack_address_nosched(void *data, unsigned long addr, int reliable)
+save_stack_address_nosched(void *data, unsigned long stack,
+			   unsigned long addr, int reliable)
 {
 	struct stack_trace *trace = (struct stack_trace *)data;
 	if (!reliable)
diff --git a/arch/x86/oprofile/backtrace.c b/arch/x86/oprofile/backtrace.c
index 044897b..7232bc9 100644
--- a/arch/x86/oprofile/backtrace.c
+++ b/arch/x86/oprofile/backtrace.c
@@ -26,13 +26,14 @@ static void backtrace_warning(void *data, char *msg)
 	/* Ignore warnings */
 }
 
-static int backtrace_stack(void *data, char *name)
+static int backtrace_stack(void *data, int type, char *name)
 {
 	/* Yes, we want all stacks */
 	return 0;
 }
 
-static void backtrace_address(void *data, unsigned long addr, int reliable)
+static void backtrace_address(void *data, unsigned long stack,
+			      unsigned long addr, int reliable)
 {
 	unsigned int *depth = data;
 
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 9ba1822..2b0528f 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -439,6 +439,7 @@ enum perf_callchain_context {
 struct perf_callchain_entry {
 	__u64				nr;
 	__u64				ip[PERF_MAX_STACK_DEPTH];
+	int				restricted;
 };
 
 struct perf_raw_record {
@@ -535,6 +536,7 @@ struct perf_counter {
 	struct list_head		event_entry;
 	struct list_head		sibling_list;
 	int				nr_siblings;
+	int				restricted;
 	struct perf_counter		*group_leader;
 	const struct pmu		*pmu;
 
@@ -754,7 +756,8 @@ static inline void perf_counter_mmap(struct vm_area_struct *vma)
 extern void perf_counter_comm(struct task_struct *tsk);
 extern void perf_counter_fork(struct task_struct *tsk);
 
-extern struct perf_callchain_entry *perf_callchain(struct pt_regs *regs);
+extern struct perf_callchain_entry *
+perf_callchain(struct perf_counter *counter, struct pt_regs *regs);
 
 extern int sysctl_perf_counter_paranoid;
 extern int sysctl_perf_counter_mlock;
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 36f65e2..cc8450e 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -50,16 +50,28 @@ static atomic_t nr_task_counters __read_mostly;
  *  1 - disallow cpu counters to unpriv
  *  2 - disallow kernel profiling to unpriv
  */
-int sysctl_perf_counter_paranoid __read_mostly;
+int sysctl_perf_counter_paranoid __read_mostly = 1;
 
 static inline bool perf_paranoid_cpu(void)
 {
-	return sysctl_perf_counter_paranoid > 0;
+	return !capable(CAP_SYS_ADMIN) && sysctl_perf_counter_paranoid > 0;
 }
 
 static inline bool perf_paranoid_kernel(void)
 {
-	return sysctl_perf_counter_paranoid > 1;
+	return !capable(CAP_SYS_ADMIN) && sysctl_perf_counter_paranoid > 1;
+}
+
+/*
+ * When paranoid == 1 we limit kernel profiling so that it does not include
+ * other users' information; this includes IRQ stack traces.
+ */
+static bool perf_counter_restricted(struct perf_counter *counter)
+{
+	if (counter->restricted)
+		return hardirq_count() - (in_nmi() ? 0 : HARDIRQ_OFFSET);
+
+	return 0;
 }
 
 int sysctl_perf_counter_mlock __read_mostly = 512; /* 'free' kb per user */
@@ -1571,7 +1583,7 @@ static struct perf_counter_context *find_get_context(pid_t pid, int cpu)
 	 */
 	if (cpu != -1) {
 		/* Must be root to operate on a CPU counter: */
-		if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
+		if (perf_paranoid_cpu())
 			return ERR_PTR(-EACCES);
 
 		if (cpu < 0 || cpu > num_possible_cpus())
@@ -2458,7 +2470,8 @@ void perf_counter_do_pending(void)
  * Callchain support -- arch specific
  */
 
-__weak struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
+__weak struct perf_callchain_entry *
+perf_callchain(struct perf_counter *counter, struct pt_regs *regs)
 {
 	return NULL;
 }
@@ -2846,6 +2859,8 @@ void perf_counter_output(struct perf_counter *counter, int nmi,
 
 	if (sample_type & PERF_SAMPLE_IP) {
 		ip = perf_instruction_pointer(data->regs);
+		if (perf_counter_restricted(counter))
+			ip = 0;
 		header.size += sizeof(ip);
 	}
 
@@ -2889,7 +2904,7 @@ void perf_counter_output(struct perf_counter *counter, int nmi,
 		header.size += perf_counter_read_size(counter);
 
 	if (sample_type & PERF_SAMPLE_CALLCHAIN) {
-		callchain = perf_callchain(data->regs);
+		callchain = perf_callchain(counter, data->regs);
 
 		if (callchain) {
 			callchain_size = (1 + callchain->nr) * sizeof(u64);
@@ -4049,6 +4064,7 @@ perf_counter_alloc(struct perf_counter_attr *attr,
 	counter->pmu		= NULL;
 	counter->ctx		= ctx;
 	counter->oncpu		= -1;
+	counter->restricted	= perf_paranoid_cpu();
 
 	counter->parent		= parent_counter;
 
@@ -4231,7 +4247,7 @@ SYSCALL_DEFINE5(perf_counter_open,
 		return ret;
 
 	if (!attr.exclude_kernel) {
-		if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
+		if (perf_paranoid_kernel())
 			return -EACCES;
 	}
 
diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c
index f669396..d7d2d0d 100644
--- a/kernel/trace/trace_sysprof.c
+++ b/kernel/trace/trace_sysprof.c
@@ -71,13 +71,14 @@ static void backtrace_warning(void *data, char *msg)
 	/* Ignore warnings */
 }
 
-static int backtrace_stack(void *data, char *name)
+static int backtrace_stack(void *data, int type, char *name)
 {
 	/* Don't bother with IRQ stacks for now */
 	return -1;
 }
 
-static void backtrace_address(void *data, unsigned long addr, int reliable)
+static void backtrace_address(void *data, unsigned long stack,
+			      unsigned long addr, int reliable)
 {
 	struct backtrace_info *info = data;
 

