lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [thread-next>] [day] [month] [year] [list]
Message-Id: <1502993843-6837-1-git-send-email-kan.liang@intel.com>
Date:   Thu, 17 Aug 2017 14:17:23 -0400
From:   kan.liang@...el.com
To:     peterz@...radead.org, mingo@...hat.com,
        linux-kernel@...r.kernel.org
Cc:     acme@...nel.org, jolsa@...hat.com, tglx@...utronix.de,
        eranian@...gle.com, ak@...ux.intel.com,
        Kan Liang <kan.liang@...el.com>
Subject: [PATCH V5] perf: Add PERF_SAMPLE_PHYS_ADDR

From: Kan Liang <kan.liang@...el.com>

For understanding how the workload maps to memory channels and hardware
behavior, it's very important to collect address maps with physical
addresses. For example, 3D XPoint access can only be found by filtering
the physical address.
However, perf doesn't collect physical address information in sampling.

The load latency/DLA information in PEBS can be used to calculate the
physical address.
For kernel direct mapping addresses, virt_to_phys is used to convert the
virtual addresses from DLA to physical address.
For user virtual addresses, __get_user_pages_fast is used to walk the
pages tables for user physical address.
This does not work for vmalloc addresses. Right now these are not
resolved, but code to do that could be added.
For security, the physical address can only be exposed to root or
privileged user.
A new sample type PERF_SAMPLE_PHYS_ADDR is introduced to expose the
physical addresses.

Signed-off-by: Kan Liang <kan.liang@...el.com>
---

This patch is kernel patch.
The user space patch will be sent out later separately.

Changes since V4
 - Correct PHYS_ADDR placement in perf_output_sample (Stephane)

Changes since V3
 - Move the code dla->phys to separate function (Stephane)
 - Correct PHYS_ADDR misplacement in header file (Stephane)

Changes since V2
 - Only the kernel patch
 - Add example in changelog
 - Include a perf_paranoid_kernel() test (PeterZ)
 - Fix minor complier warning

 arch/x86/events/intel/ds.c      | 32 ++++++++++++++++++++++++++++++++
 arch/x86/events/perf_event.h    |  2 +-
 include/linux/perf_event.h      |  3 +++
 include/uapi/linux/perf_event.h |  4 +++-
 kernel/events/core.c            | 12 ++++++++++++
 5 files changed, 51 insertions(+), 2 deletions(-)

diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
index a322fed..f0e8d9c 100644
--- a/arch/x86/events/intel/ds.c
+++ b/arch/x86/events/intel/ds.c
@@ -1065,6 +1065,35 @@ static inline u64 intel_hsw_transaction(struct pebs_record_skl *pebs)
 	return txn;
 }
 
+static u64 dla_to_phys(u64 dla)
+{
+	u64 phys_addr = 0;
+	struct page *p = NULL;
+
+	if (dla >= TASK_SIZE) {
+		/* If it's vmalloc()d memory, leave phys_addr as 0 */
+		if (virt_addr_valid(dla) &&
+		    !(dla >= VMALLOC_START && dla < VMALLOC_END))
+			phys_addr = (u64)virt_to_phys((void *)(uintptr_t)dla);
+	} else {
+		/*
+		 * Walking the pages tables for user address.
+		 * Interrupts are disabled, so it prevents any tear down
+		 * of the page tables.
+		 * Try IRQ-safe __get_user_pages_fast first.
+		 * If failed, leave phys_addr as 0.
+		 */
+		if ((current->mm != NULL) &&
+		    (__get_user_pages_fast(dla, 1, 0, &p) == 1))
+			phys_addr = page_to_phys(p) + dla % PAGE_SIZE;
+
+		if (p)
+			put_page(p);
+	}
+
+	return phys_addr;
+}
+
 static void setup_pebs_sample_data(struct perf_event *event,
 				   struct pt_regs *iregs, void *__pebs,
 				   struct perf_sample_data *data,
@@ -1179,6 +1208,9 @@ static void setup_pebs_sample_data(struct perf_event *event,
 	    x86_pmu.intel_cap.pebs_format >= 1)
 		data->addr = pebs->dla;
 
+	if ((sample_type & PERF_SAMPLE_PHYS_ADDR) && (data->addr != 0))
+		data->phys_addr = dla_to_phys(data->addr);
+
 	if (x86_pmu.intel_cap.pebs_format >= 2) {
 		/* Only set the TSX weight when no memory weight. */
 		if ((sample_type & PERF_SAMPLE_WEIGHT) && !fll)
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index 476aec3..65bb91e 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -91,7 +91,7 @@ struct amd_nb {
 	(PERF_SAMPLE_IP | PERF_SAMPLE_TID | PERF_SAMPLE_ADDR | \
 	PERF_SAMPLE_ID | PERF_SAMPLE_CPU | PERF_SAMPLE_STREAM_ID | \
 	PERF_SAMPLE_DATA_SRC | PERF_SAMPLE_IDENTIFIER | \
-	PERF_SAMPLE_TRANSACTION)
+	PERF_SAMPLE_TRANSACTION | PERF_SAMPLE_PHYS_ADDR)
 
 /*
  * A debug store configuration.
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index a3b873f..6783c69 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -944,6 +944,8 @@ struct perf_sample_data {
 
 	struct perf_regs		regs_intr;
 	u64				stack_user_size;
+
+	u64				phys_addr;
 } ____cacheline_aligned;
 
 /* default value for data source */
@@ -964,6 +966,7 @@ static inline void perf_sample_data_init(struct perf_sample_data *data,
 	data->weight = 0;
 	data->data_src.val = PERF_MEM_NA;
 	data->txn = 0;
+	data->phys_addr = 0;
 }
 
 extern void perf_output_sample(struct perf_output_handle *handle,
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index 642db5f..cbea02f 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -139,8 +139,9 @@ enum perf_event_sample_format {
 	PERF_SAMPLE_IDENTIFIER			= 1U << 16,
 	PERF_SAMPLE_TRANSACTION			= 1U << 17,
 	PERF_SAMPLE_REGS_INTR			= 1U << 18,
+	PERF_SAMPLE_PHYS_ADDR			= 1U << 19,
 
-	PERF_SAMPLE_MAX = 1U << 19,		/* non-ABI */
+	PERF_SAMPLE_MAX = 1U << 20,		/* non-ABI */
 };
 
 /*
@@ -814,6 +815,7 @@ enum perf_event_type {
 	 *	{ u64			transaction; } && PERF_SAMPLE_TRANSACTION
 	 *	{ u64			abi; # enum perf_sample_regs_abi
 	 *	  u64			regs[weight(mask)]; } && PERF_SAMPLE_REGS_INTR
+	 *	{ u64			phys_addr;} && PERF_SAMPLE_PHYS_ADDR
 	 * };
 	 */
 	PERF_RECORD_SAMPLE			= 9,
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 426c2ff..d5da77f 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1570,6 +1570,9 @@ static void __perf_event_header_size(struct perf_event *event, u64 sample_type)
 	if (sample_type & PERF_SAMPLE_TRANSACTION)
 		size += sizeof(data->txn);
 
+	if (sample_type & PERF_SAMPLE_PHYS_ADDR)
+		size += sizeof(data->phys_addr);
+
 	event->header_size = size;
 }
 
@@ -5972,6 +5975,9 @@ void perf_output_sample(struct perf_output_handle *handle,
 		}
 	}
 
+	if (sample_type & PERF_SAMPLE_PHYS_ADDR)
+		perf_output_put(handle, data->phys_addr);
+
 	if (!event->attr.watermark) {
 		int wakeup_events = event->attr.wakeup_events;
 
@@ -9852,6 +9858,12 @@ SYSCALL_DEFINE5(perf_event_open,
 			return -EINVAL;
 	}
 
+	/* Only privileged users can get kernel addresses */
+	if ((attr.sample_type & PERF_SAMPLE_PHYS_ADDR) &&
+	    perf_paranoid_kernel() &&
+	    !capable(CAP_SYS_ADMIN))
+		return -EACCES;
+
 	if (!attr.sample_max_stack)
 		attr.sample_max_stack = sysctl_perf_event_max_stack;
 
-- 
2.4.3

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ