linux-kernel - [PATCH 10/12] perf/core,x86/cqm: Add read for Cgroup events,per pkg reads.

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <1483739814-23000-11-git-send-email-vikas.shivappa@linux.intel.com>
Date:   Fri,  6 Jan 2017 13:56:52 -0800
From:   Vikas Shivappa <vikas.shivappa@...ux.intel.com>
To:     vikas.shivappa@...el.com, vikas.shivappa@...ux.intel.com
Cc:     linux-kernel@...r.kernel.org, x86@...nel.org, hpa@...or.com,
        tglx@...utronix.de, mingo@...nel.org, peterz@...radead.org,
        ravi.v.shankar@...el.com, tony.luck@...el.com,
        fenghua.yu@...el.com, andi.kleen@...el.com, h.peter.anvin@...el.com
Subject: [PATCH 10/12] perf/core,x86/cqm: Add read for Cgroup events,per pkg reads.

For cqm cgroup events, the events can be read even if the event was not
active on the cpu on which the event is being read. This is because the
RMIDs are per package and hence if we read the llc_occupancy value on a
cpu x, we are really reading the occupancy for the package where cpu x
belongs.

This patch adds a PERF_INACTIVE_CPU_READ_PKG to indicate this behaviour
of cqm and also changes the perf/core to still call the reads even when
the event is inactive on the cpu for cgroup events. The task events have
event->cpu as -1 and hence it does not apply for task events.

Tests: perf stat -C <cpux> would not return a count before this patch to
the perf/core.  After this patch the count of the package is returned to
the perf/core. We still dont see the count in the perf user mode - that
is fixed in next patches.

Patch is based on David Carrillo-Cisneros <davidcc@...gle.com> patches
in cqm2 series.

Signed-off-by: Vikas Shivappa <vikas.shivappa@...ux.intel.com>
---
 arch/x86/events/intel/cqm.c             | 31 ++++++++++++++++++++++++-------
 arch/x86/include/asm/intel_rdt_common.h |  2 +-
 include/linux/perf_event.h              | 19 ++++++++++++++++---
 kernel/events/core.c                    | 16 ++++++++++++----
 4 files changed, 53 insertions(+), 15 deletions(-)

diff --git a/arch/x86/events/intel/cqm.c b/arch/x86/events/intel/cqm.c
index 92efe12..3f5860c 100644
--- a/arch/x86/events/intel/cqm.c
+++ b/arch/x86/events/intel/cqm.c
@@ -111,6 +111,13 @@ bool __rmid_valid(u32 rmid)
 	return true;
 }
 
+static inline bool __rmid_valid_raw(u32 rmid)
+{
+	if (rmid > cqm_max_rmid)
+		return false;
+	return true;
+}
+
 static u64 __rmid_read(u32 rmid)
 {
 	u64 val;
@@ -884,16 +891,16 @@ static u64 intel_cqm_event_count(struct perf_event *event)
 	return __perf_event_count(event);
 }
 
-void alloc_needed_pkg_rmid(u32 *cqm_rmid)
+u32 alloc_needed_pkg_rmid(u32 *cqm_rmid)
 {
 	unsigned long flags;
 	u32 rmid;
 
 	if (WARN_ON(!cqm_rmid))
-		return;
+		return -EINVAL;
 
 	if (cqm_rmid == cqm_rootcginfo.rmid || cqm_rmid[pkg_id])
-		return;
+		return 0;
 
 	raw_spin_lock_irqsave(&cache_lock, flags);
 
@@ -902,6 +909,8 @@ void alloc_needed_pkg_rmid(u32 *cqm_rmid)
 		cqm_rmid[pkg_id] = rmid;
 
 	raw_spin_unlock_irqrestore(&cache_lock, flags);
+
+	return rmid;
 }
 
 static void intel_cqm_event_start(struct perf_event *event, int mode)
@@ -913,10 +922,8 @@ static void intel_cqm_event_start(struct perf_event *event, int mode)
 
 	event->hw.cqm_state &= ~PERF_HES_STOPPED;
 
-	if (is_task_event(event)) {
-		alloc_needed_pkg_rmid(event->hw.cqm_rmid);
+	if (is_task_event(event))
 		state->next_task_rmid = event->hw.cqm_rmid[pkg_id];
-	}
 }
 
 static void intel_cqm_event_stop(struct perf_event *event, int mode)
@@ -932,10 +939,19 @@ static void intel_cqm_event_stop(struct perf_event *event, int mode)
 
 static int intel_cqm_event_add(struct perf_event *event, int mode)
 {
+	u32 rmid;
+
 	event->hw.cqm_state = PERF_HES_STOPPED;
 
-	if ((mode & PERF_EF_START))
+	/*
+	 * If Lazy RMID alloc fails indicate the error to the user.
+	*/
+	if ((mode & PERF_EF_START)) {
+		rmid = alloc_needed_pkg_rmid(event->hw.cqm_rmid);
+		if (!__rmid_valid_raw(rmid))
+			return -EINVAL;
 		intel_cqm_event_start(event, mode);
+	}
 
 	return 0;
 }
@@ -1048,6 +1064,7 @@ static int intel_cqm_event_init(struct perf_event *event)
 	 * cgroup hierarchies.
 	 */
 	event->event_caps |= PERF_EV_CAP_CGROUP_NO_RECURSION;
+	event->event_caps |= PERF_EV_CAP_INACTIVE_CPU_READ_PKG;
 
 	mutex_lock(&cache_mutex);
 
diff --git a/arch/x86/include/asm/intel_rdt_common.h b/arch/x86/include/asm/intel_rdt_common.h
index 544acaa..fcaaaeb 100644
--- a/arch/x86/include/asm/intel_rdt_common.h
+++ b/arch/x86/include/asm/intel_rdt_common.h
@@ -27,7 +27,7 @@ struct intel_pqr_state {
 
 u32 __get_rmid(int domain);
 bool __rmid_valid(u32 rmid);
-void alloc_needed_pkg_rmid(u32 *cqm_rmid);
+u32 alloc_needed_pkg_rmid(u32 *cqm_rmid);
 struct cgrp_cqm_info *cqminfo_from_tsk(struct task_struct *tsk);
 
 extern struct cgrp_cqm_info cqm_rootcginfo;
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 410642a..adfddec 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -525,10 +525,13 @@ typedef void (*perf_overflow_handler_t)(struct perf_event *,
  * PERF_EV_CAP_CGROUP_NO_RECURSION: A cgroup event that handles its own
  * cgroup scoping. It does not need to be enabled for all of its descendants
  * cgroups.
+ * PERF_EV_CAP_INACTIVE_CPU_READ_PKG: A cgroup event where we can read
+ * the package count on any cpu on the pkg even if inactive.
  */
-#define PERF_EV_CAP_SOFTWARE		BIT(0)
-#define PERF_EV_CAP_READ_ACTIVE_PKG	BIT(1)
-#define PERF_EV_CAP_CGROUP_NO_RECURSION	BIT(2)
+#define PERF_EV_CAP_SOFTWARE                    BIT(0)
+#define PERF_EV_CAP_READ_ACTIVE_PKG             BIT(1)
+#define PERF_EV_CAP_CGROUP_NO_RECURSION         BIT(2)
+#define PERF_EV_CAP_INACTIVE_CPU_READ_PKG       BIT(3)
 
 #define SWEVENT_HLIST_BITS		8
 #define SWEVENT_HLIST_SIZE		(1 << SWEVENT_HLIST_BITS)
@@ -722,6 +725,16 @@ struct perf_event {
 #endif /* CONFIG_PERF_EVENTS */
 };
 
+#ifdef CONFIG_PERF_EVENTS
+static inline bool __perf_can_read_inactive(struct perf_event *event)
+{
+	if ((event->group_caps & PERF_EV_CAP_INACTIVE_CPU_READ_PKG))
+		return true;
+
+	return false;
+}
+#endif
+
 /**
  * struct perf_event_context - event context structure
  *
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 229f611..e71ca66 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -3443,9 +3443,13 @@ struct perf_read_data {
 
 static int find_cpu_to_read(struct perf_event *event, int local_cpu)
 {
+	bool active = event->state == PERF_EVENT_STATE_ACTIVE;
 	int event_cpu = event->oncpu;
 	u16 local_pkg, event_pkg;
 
+	if (__perf_can_read_inactive(event) && !active)
+		event_cpu = event->cpu;
+
 	if (event->group_caps & PERF_EV_CAP_READ_ACTIVE_PKG) {
 		event_pkg =  topology_physical_package_id(event_cpu);
 		local_pkg =  topology_physical_package_id(local_cpu);
@@ -3467,6 +3471,7 @@ static void __perf_event_read(void *info)
 	struct perf_event_context *ctx = event->ctx;
 	struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
 	struct pmu *pmu = event->pmu;
+	bool read_inactive = __perf_can_read_inactive(event);
 
 	/*
 	 * If this is a task context, we need to check whether it is
@@ -3475,7 +3480,7 @@ static void __perf_event_read(void *info)
 	 * event->count would have been updated to a recent sample
 	 * when the event was scheduled out.
 	 */
-	if (ctx->task && cpuctx->task_ctx != ctx)
+	if (ctx->task && cpuctx->task_ctx != ctx && !read_inactive)
 		return;
 
 	raw_spin_lock(&ctx->lock);
@@ -3485,7 +3490,7 @@ static void __perf_event_read(void *info)
 	}
 
 	update_event_times(event);
-	if (event->state != PERF_EVENT_STATE_ACTIVE)
+	if (ctx->task && cpuctx->task_ctx != ctx && !read_inactive)
 		goto unlock;
 
 	if (!data->group) {
@@ -3500,7 +3505,8 @@ static void __perf_event_read(void *info)
 
 	list_for_each_entry(sub, &event->sibling_list, group_entry) {
 		update_event_times(sub);
-		if (sub->state == PERF_EVENT_STATE_ACTIVE) {
+		if (sub->state == PERF_EVENT_STATE_ACTIVE ||
+		    __perf_can_read_inactive(sub)) {
 			/*
 			 * Use sibling's PMU rather than @event's since
 			 * sibling could be on different (eg: software) PMU.
@@ -3578,13 +3584,15 @@ u64 perf_event_read_local(struct perf_event *event)
 
 static int perf_event_read(struct perf_event *event, bool group)
 {
+	bool active = event->state == PERF_EVENT_STATE_ACTIVE;
 	int ret = 0, cpu_to_read, local_cpu;
 
 	/*
 	 * If event is enabled and currently active on a CPU, update the
 	 * value in the event structure:
 	 */
-	if (event->state == PERF_EVENT_STATE_ACTIVE) {
+	if (active || __perf_can_read_inactive(event)) {
+
 		struct perf_read_data data = {
 			.event = event,
 			.group = group,
-- 
1.9.1