lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <20200731025617.16243-2-yao.jin@linux.intel.com>
Date:   Fri, 31 Jul 2020 10:56:17 +0800
From:   Jin Yao <yao.jin@...ux.intel.com>
To:     peterz@...radead.org, mingo@...hat.com, oleg@...hat.com,
        acme@...nel.org, jolsa@...nel.org
Cc:     Linux-kernel@...r.kernel.org, ak@...ux.intel.com,
        kan.liang@...el.com, yao.jin@...el.com,
        alexander.shishkin@...ux.intel.com, mark.rutland@....com,
        Jin Yao <yao.jin@...ux.intel.com>
Subject: [PATCH v1 2/2] perf/core: Fake regs for leaked kernel samples

When doing sampling, for example,

perf record -e cycles:u ...

On workloads that do a lot of kernel entry/exits we see kernel
samples, even though :u is specified. This is due to skid.

This is a potential security issue because it may leak kernel
address even though kernel sampling is disabled.

The commit cc1582c231ea ("perf/core: Drop kernel samples even
though :u is specified") dropped the leaked kernel samples but it
broke rr-project.

Another idea is (inspired by Mark Rutland's original patch), it
doesn't lose the samples, it keeps the samples but fakes the regs
by using the user regs of current task. If the user regs is not
available, uses the instruction_pointer_set to set -1L to ip address
of regs.

For CALLCHAIN, the get_perf_callchain has checked user_mode(regs)
and use task_pt_regs(current) instead in some cases. So actually it
has considered the leaking possibility.

For REGS_USER and STACK_USER, it's similar. The perf_sample_regs_user
has also checked the user_mode(regs). It calls perf_get_regs_user()
for kthread. So we don't need to use "regs_fake" there.

Example:

  perf record -e cycles:u ./div
  perf report --stdio

Before:

  # Overhead  Command  Shared Object     Symbol
  # ........  .......  ................  ........................
  #
      41.08%  div      div               [.] main
      21.04%  div      libc-2.27.so      [.] __random_r
      19.90%  div      libc-2.27.so      [.] __random
       9.86%  div      div               [.] compute_flag
       4.24%  div      libc-2.27.so      [.] rand
       3.88%  div      div               [.] rand@plt
       0.01%  div      [unknown]         [k] 0xffffffffb9601c20
       0.00%  div      libc-2.27.so      [.] _dl_addr
       0.00%  div      ld-2.27.so        [.] __GI___tunables_init
       0.00%  div      [unknown]         [k] 0xffffffffb9601210
       0.00%  div      ld-2.27.so        [.] _start

"0xffffffffb9601c20, 0xffffffffb9601210" are leaked kernel addresses.

After:

  # Overhead  Command  Shared Object  Symbol
  # ........  .......  .............  ........................
  #
      40.86%  div      div            [.] main
      20.67%  div      libc-2.27.so   [.] __random_r
      20.54%  div      libc-2.27.so   [.] __random
       9.65%  div      div            [.] compute_flag
       4.32%  div      libc-2.27.so   [.] rand
       3.97%  div      div            [.] rand@plt
       0.00%  div      ld-2.27.so     [.] __GI___tunables_init
       0.00%  div      ld-2.27.so     [.] _start

Now there is no kernel address leaked.

Inspired-by: Mark Rutland <mark.rutland@....com>
Signed-off-by: Jin Yao <yao.jin@...ux.intel.com>
---
 kernel/events/core.c | 48 +++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 45 insertions(+), 3 deletions(-)

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 7c436d705fbd..52f6d7f0b86b 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -6973,7 +6973,8 @@ static struct perf_callchain_entry __empty_callchain = { .nr = 0, };
 struct perf_callchain_entry *
 perf_callchain(struct perf_event *event, struct pt_regs *regs)
 {
-	bool kernel = !event->attr.exclude_callchain_kernel;
+	bool kernel = !event->attr.exclude_callchain_kernel &&
+		      !event->attr.exclude_kernel;
 	bool user   = !event->attr.exclude_callchain_user;
 	/* Disallow cross-task user callchains. */
 	bool crosstask = event->ctx->task && event->ctx->task != current;
@@ -6988,12 +6989,39 @@ perf_callchain(struct perf_event *event, struct pt_regs *regs)
 	return callchain ?: &__empty_callchain;
 }
 
+static struct pt_regs *leak_check(struct perf_event_header *header,
+				  struct perf_event *event,
+				  struct pt_regs *regs)
+{
+	struct pt_regs *regs_fake = NULL;
+
+	if (event->attr.exclude_kernel && !user_mode(regs)) {
+		if (!(current->flags & PF_KTHREAD)) {
+			regs_fake = task_pt_regs(current);
+			if (!user_mode(regs_fake)) {
+				regs_fake = NULL;
+				instruction_pointer_set(regs, -1L);
+			}
+		} else
+			instruction_pointer_set(regs, -1L);
+
+		if ((header->misc & PERF_RECORD_MISC_CPUMODE_MASK) ==
+		     PERF_RECORD_MISC_KERNEL) {
+			header->misc &= ~PERF_RECORD_MISC_CPUMODE_MASK;
+			header->misc |= PERF_RECORD_MISC_USER;
+		}
+	}
+
+	return regs_fake;
+}
+
 void perf_prepare_sample(struct perf_event_header *header,
 			 struct perf_sample_data *data,
 			 struct perf_event *event,
 			 struct pt_regs *regs)
 {
 	u64 sample_type = event->attr.sample_type;
+	struct pt_regs *regs_fake;
 
 	header->type = PERF_RECORD_SAMPLE;
 	header->size = sizeof(*header) + event->header_size;
@@ -7003,8 +7031,19 @@ void perf_prepare_sample(struct perf_event_header *header,
 
 	__perf_event_header__init_id(header, data, event);
 
+	/*
+	 * Due to interrupt latency (AKA "skid"), we may enter the
+	 * kernel before taking an overflow, even if the PMU is only
+	 * counting user events. To avoid leaking kernel address to
+	 * userspace, we try to fake the regs by using the user regs
+	 * of current task.
+	 */
+	regs_fake = leak_check(header, event, regs);
+
 	if (sample_type & PERF_SAMPLE_IP)
-		data->ip = perf_instruction_pointer(regs);
+		data->ip = (regs_fake) ?
+			perf_instruction_pointer(regs_fake) :
+			perf_instruction_pointer(regs);
 
 	if (sample_type & PERF_SAMPLE_CALLCHAIN) {
 		int size = 1;
@@ -7099,7 +7138,10 @@ void perf_prepare_sample(struct perf_event_header *header,
 		/* regs dump ABI info */
 		int size = sizeof(u64);
 
-		perf_sample_regs_intr(&data->regs_intr, regs);
+		if (regs_fake)
+			perf_sample_regs_intr(&data->regs_intr, regs_fake);
+		else
+			perf_sample_regs_intr(&data->regs_intr, regs);
 
 		if (data->regs_intr.regs) {
 			u64 mask = event->attr.sample_regs_intr;
-- 
2.17.1

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ