Message-Id: <1461905018-86355-17-git-send-email-davidcc@google.com>
Date:	Thu, 28 Apr 2016 21:43:22 -0700
From:	David Carrillo-Cisneros <davidcc@...gle.com>
To:	Peter Zijlstra <peterz@...radead.org>,
	Alexander Shishkin <alexander.shishkin@...ux.intel.com>,
	Arnaldo Carvalho de Melo <acme@...nel.org>,
	Ingo Molnar <mingo@...hat.com>
Cc:	Vikas Shivappa <vikas.shivappa@...ux.intel.com>,
	Matt Fleming <matt.fleming@...el.com>,
	Tony Luck <tony.luck@...el.com>,
	Stephane Eranian <eranian@...gle.com>,
	Paul Turner <pjt@...gle.com>,
	David Carrillo-Cisneros <davidcc@...gle.com>, x86@...nel.org,
	linux-kernel@...r.kernel.org
Subject: [PATCH 16/32] perf/x86/intel/cqm: add cgroup support

Create a monr per monitored cgroup and insert it into the monr hierarchy.
Task events become leaves under the monr of their lowest monitored
ancestor cgroup (the lowest cgroup ancestor that has a monr).
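
For illustration (the cgroup names are hypothetical), if only grp_a and
grp_a/grp_b are monitored:

  cgroup hierarchy                   monr hierarchy
  /        (root)                    monr_hrchy_root
  `- grp_a     (monitored)           `- monr(grp_a)
     |- grp_b  (monitored)              |- monr(grp_b)
     `- grp_c  (not monitored)          `- monr(task event in grp_c)

A task event for a task in grp_c becomes a leaf monr under monr(grp_a),
since grp_a is grp_c's lowest monitored ancestor, while grp_c itself
just points to monr(grp_a).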

CQM starts after the cgroup subsystem, and uses the cqm_initialized_key
static key to avoid interfering with the perf cgroup logic until properly
initialized. The cqm_init_mutex protects that initialization.
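
Both the arch cgroup callbacks and the init path serialize on
cqm_init_mutex, and the callbacks stay no-ops until the static key is
flipped at the end of init. A rough sketch of the pattern (error
handling omitted):

  /* perf_cgroup_arch_css_online/offline/released (sketch) */
  mutex_lock(&cqm_init_mutex);
  if (static_branch_unlikely(&cqm_initialized_key)) {
          mutex_lock(&cqm_mutex);
          /* update the monr hierarchy for this css */
          mutex_unlock(&cqm_mutex);
  }
  mutex_unlock(&cqm_init_mutex);

  /* intel_cqm_init() (sketch) */
  mutex_lock(&cqm_init_mutex);
  __start_monitoring_all_cgroups();   /* cgroups created before CQM init */
  perf_pmu_register(&intel_cqm_pmu, "intel_cqm", -1);
  static_branch_enable(&cqm_initialized_key);
  mutex_unlock(&cqm_init_mutex);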

Reviewed-by: Stephane Eranian <eranian@...gle.com>
Signed-off-by: David Carrillo-Cisneros <davidcc@...gle.com>
---
 arch/x86/events/intel/cqm.c       | 594 +++++++++++++++++++++++++++++++++++++-
 arch/x86/events/intel/cqm.h       |  13 +
 arch/x86/include/asm/perf_event.h |  33 +++
 3 files changed, 637 insertions(+), 3 deletions(-)

diff --git a/arch/x86/events/intel/cqm.c b/arch/x86/events/intel/cqm.c
index 98a919f..f000fd0 100644
--- a/arch/x86/events/intel/cqm.c
+++ b/arch/x86/events/intel/cqm.c
@@ -35,10 +35,17 @@ static struct perf_pmu_events_attr event_attr_##v = {				\
 static LIST_HEAD(cache_groups);
 static DEFINE_MUTEX(cqm_mutex);
 
+/*
+ * Synchronizes initialization of cqm with cgroups.
+ */
+static DEFINE_MUTEX(cqm_init_mutex);
+
 struct monr *monr_hrchy_root;
 
 struct pkg_data *cqm_pkgs_data[PQR_MAX_NR_PKGS];
 
+DEFINE_STATIC_KEY_FALSE(cqm_initialized_key);
+
 static inline bool __pmonr__in_istate(struct pmonr *pmonr)
 {
 	lockdep_assert_held(&__pkg_data(pmonr, pkg_data_lock));
@@ -69,6 +76,9 @@ static inline bool __pmonr__in_ustate(struct pmonr *pmonr)
 	return !pmonr->prmid && !pmonr->ancestor_pmonr;
 }
 
+/* Whether the monr is root. Note that a cgroup need not be the root
+ * cgroup in order to point to the root monr.
+ */
 static inline bool monr__is_root(struct monr *monr)
 {
 	return monr_hrchy_root == monr;
@@ -115,6 +125,23 @@ static inline void __monr__clear_mon_active(struct monr *monr)
 	monr->flags &= ~MONR_MON_ACTIVE;
 }
 
+static inline bool monr__is_cgroup_type(struct monr *monr)
+{
+	return monr->mon_cgrp;
+}
+
+static inline bool monr_is_event_type(struct monr *monr)
+{
+	return !monr->mon_cgrp && monr->mon_event_group;
+}
+
+
+static inline struct cgroup_subsys_state *get_root_perf_css(void)
+{
+	/* Get css for root cgroup */
+	return  init_css_set.subsys[perf_event_cgrp_id];
+}
+
 /*
  * Update if enough time has passed since last read.
  *
@@ -725,6 +752,7 @@ static struct monr *monr_alloc(void)
 	monr->parent = NULL;
 	INIT_LIST_HEAD(&monr->children);
 	INIT_LIST_HEAD(&monr->parent_entry);
+	monr->mon_cgrp = NULL;
 	monr->mon_event_group = NULL;
 
 	/* Iterate over all pkgs, even unitialized ones. */
@@ -947,7 +975,7 @@ retry:
 }
 
 /*
- * Wrappers for monr manipulation in events.
+ * Wrappers for monr manipulation in events and cgroups.
  *
  */
 static inline struct monr *monr_from_event(struct perf_event *event)
@@ -960,6 +988,100 @@ static inline void event_set_monr(struct perf_event *event, struct monr *monr)
 	WRITE_ONCE(event->hw.cqm_monr, monr);
 }
 
+#ifdef CONFIG_CGROUP_PERF
+static inline struct monr *monr_from_perf_cgroup(struct perf_cgroup *cgrp)
+{
+	struct monr *monr;
+	struct cgrp_cqm_info *cqm_info;
+
+	cqm_info = (struct cgrp_cqm_info *)READ_ONCE(cgrp->arch_info);
+	WARN_ON_ONCE(!cqm_info);
+	monr = READ_ONCE(cqm_info->monr);
+	return monr;
+}
+
+static inline struct perf_cgroup *monr__get_mon_cgrp(struct monr *monr)
+{
+	WARN_ON_ONCE(!monr);
+	return READ_ONCE(monr->mon_cgrp);
+}
+
+static inline void
+monr__set_mon_cgrp(struct monr *monr, struct perf_cgroup *cgrp)
+{
+	WRITE_ONCE(monr->mon_cgrp, cgrp);
+}
+
+static inline void
+perf_cgroup_set_monr(struct perf_cgroup *cgrp, struct monr *monr)
+{
+	WRITE_ONCE(cgrp_to_cqm_info(cgrp)->monr, monr);
+}
+
+/*
+ * A perf_cgroup is monitored when it is set as a monr's mon_cgrp.
+ * Many perf_cgroups may point to the same monr, but a monr's mon_cgrp
+ * points back to at most one of them. A monitored cgroup is necessarily
+ * referenced back by its monr's mon_cgrp.
+ */
+static inline bool perf_cgroup_is_monitored(struct perf_cgroup *cgrp)
+{
+	struct monr *monr;
+	struct perf_cgroup *monr_cgrp;
+
+	/* monr can be referenced by a cgroup other than the one in its
+	 * mon_cgrp, be careful.
+	 */
+	monr = monr_from_perf_cgroup(cgrp);
+
+	monr_cgrp = monr__get_mon_cgrp(monr);
+	/* The root monr does not have an associated cgroup before
+	 * initialization. mon_cgrp and mon_event_group act as a union, so
+	 * the pointer must be set for all non-root monrs.
+	 */
+	return  monr_cgrp && monr__get_mon_cgrp(monr) == cgrp;
+}
+
+/* Set css's monr to the monr of its lowest monitored ancestor. */
+static inline void __css_set_monr_to_lma(struct cgroup_subsys_state *css)
+{
+	lockdep_assert_held(&cqm_mutex);
+	if (!css->parent) {
+		perf_cgroup_set_monr(css_to_perf_cgroup(css), monr_hrchy_root);
+		return;
+	}
+	perf_cgroup_set_monr(
+		css_to_perf_cgroup(css),
+		monr_from_perf_cgroup(css_to_perf_cgroup(css->parent)));
+}
+
+static inline void
+perf_cgroup_make_monitored(struct perf_cgroup *cgrp, struct monr *monr)
+{
+	monr_hrchy_assert_held_mutexes();
+	perf_cgroup_set_monr(cgrp, monr);
+	/* Make sure that monr is a valid monr for css before it's visible
+	 * to any reader of css.
+	 */
+	smp_wmb();
+	monr__set_mon_cgrp(monr, cgrp);
+}
+
+static inline void
+perf_cgroup_make_unmonitored(struct perf_cgroup *cgrp)
+{
+	struct monr *monr = monr_from_perf_cgroup(cgrp);
+
+	monr_hrchy_assert_held_mutexes();
+	__css_set_monr_to_lma(&cgrp->css);
+	/* Make sure that all readers of the css's monr see the lma's monr
+	 * before monr stops being a valid monr for this css.
+	 */
+	smp_wmb();
+	monr__set_mon_cgrp(monr, NULL);
+}
+#endif
+
 /*
  * Always finds a rmid_entry to schedule. To be called during scheduler.
  * A fast path that only uses read_lock for common case when rmid for current
@@ -1068,6 +1190,286 @@ __monr_hrchy_remove_leaf(struct monr *monr)
 	monr->parent = NULL;
 }
 
+#ifdef CONFIG_CGROUP_PERF
+static struct perf_cgroup *__perf_cgroup_parent(struct perf_cgroup *cgrp)
+{
+	struct cgroup_subsys_state *parent_css = cgrp->css.parent;
+
+	if (parent_css)
+		return css_to_perf_cgroup(parent_css);
+	return NULL;
+}
+
+/* Get cgroup for both task and cgroup event. */
+static inline struct perf_cgroup *
+perf_cgroup_from_event(struct perf_event *event)
+{
+#ifdef CONFIG_LOCKDEP
+	u16 pkg_id = topology_physical_package_id(smp_processor_id());
+	bool rcu_safe = lockdep_is_held(
+		&cqm_pkgs_data[pkg_id]->pkg_data_lock);
+#endif
+
+	if (!(event->attach_state & PERF_ATTACH_TASK))
+		return event->cgrp;
+
+	return container_of(
+		task_css_check(event->hw.target, perf_event_cgrp_id, rcu_safe),
+		struct perf_cgroup, css);
+}
+
+/* Find lowest ancestor that is monitored, not including this cgrp.
+ * Return NULL if no ancestor is monitored.
+ */
+struct perf_cgroup *__cgroup_find_lma(struct perf_cgroup *cgrp)
+{
+	do {
+		cgrp = __perf_cgroup_parent(cgrp);
+	} while (cgrp && !perf_cgroup_is_monitored(cgrp));
+	return cgrp;
+}
+
+/* Similar to css_next_descendant_pre but skips the subtree rooted by pos. */
+struct cgroup_subsys_state *
+css_skip_subtree_pre(struct cgroup_subsys_state *pos,
+		     struct cgroup_subsys_state *root)
+{
+	struct cgroup_subsys_state *next;
+
+	WARN_ON_ONCE(!pos);
+	while (pos != root) {
+		next = css_next_child(pos, pos->parent);
+		if (next)
+			return next;
+		pos = pos->parent;
+	}
+	return NULL;
+}
+
+/* Make the monrs of all of css's descendants depend on new_monr. */
+inline void __css_subtree_update_monrs(struct cgroup_subsys_state *css,
+				       struct monr *new_monr)
+{
+	struct cgroup_subsys_state *pos_css;
+	int i;
+	unsigned long flags;
+
+	lockdep_assert_held(&cqm_mutex);
+	monr_hrchy_assert_held_mutexes();
+
+	rcu_read_lock();
+
+	/* Iterate over descendants of css in pre-order, in a way
+	 * similar to css_for_each_descendant_pre, but skipping the subtrees
+	 * rooted by css's with a monitored cgroup, since the elements
+	 * in those subtrees do not need to be updated.
+	 */
+	pos_css = css_next_descendant_pre(css, css);
+	while (pos_css) {
+		struct perf_cgroup *pos_cgrp = css_to_perf_cgroup(pos_css);
+		struct monr *pos_monr = monr_from_perf_cgroup(pos_cgrp);
+
+		/* Skip css that are not online, sync'ed with cqm_mutex. */
+		if (!(pos_css->flags & CSS_ONLINE)) {
+			pos_css = css_next_descendant_pre(pos_css, css);
+			continue;
+		}
+		/* Update descendant pos's monr pointer to new_monr. */
+		if (!perf_cgroup_is_monitored(pos_cgrp)) {
+			perf_cgroup_set_monr(pos_cgrp, new_monr);
+			pos_css = css_next_descendant_pre(pos_css, css);
+			continue;
+		}
+		monr_hrchy_acquire_raw_spin_locks_irq_save(flags, i);
+		pos_monr->parent = new_monr;
+		list_move_tail(&pos_monr->parent_entry, &new_monr->children);
+		monr_hrchy_release_raw_spin_locks_irq_restore(flags, i);
+		/* Don't go down the subtree of pos_css, since pos_cgrp is the
+		 * lma for all of its descendants.
+		 */
+		pos_css = css_skip_subtree_pre(pos_css, css);
+	}
+	rcu_read_unlock();
+}
+
+static inline int __css_start_monitoring(struct cgroup_subsys_state *css)
+{
+	struct perf_cgroup *cgrp, *cgrp_lma, *pos_cgrp;
+	struct monr *monr, *monr_parent, *pos_monr, *tmp_monr;
+	unsigned long flags;
+	int i;
+
+	lockdep_assert_held(&cqm_mutex);
+
+	/* Hold mutexes to prevent all rotation threads in all packages from
+	 * messing with this.
+	 */
+	monr_hrchy_acquire_mutexes();
+	cgrp = css_to_perf_cgroup(css);
+	if (WARN_ON_ONCE(perf_cgroup_is_monitored(cgrp)))
+		return -1;
+
+	/* When css is root cgroup's css, attach to the pre-existing
+	 * and active root monr.
+	 */
+	cgrp_lma = __cgroup_find_lma(cgrp);
+	if (!cgrp_lma) {
+		/* monr of root cgrp must be monr_hrchy_root. */
+		WARN_ON_ONCE(!monr__is_root(monr_from_perf_cgroup(cgrp)));
+		perf_cgroup_make_monitored(cgrp, monr_hrchy_root);
+		monr_hrchy_release_mutexes();
+		return 0;
+	}
+	/* The monr of the lowest monitored ancestor is the direct parent
+	 * of the new monr in the monr hierarchy.
+	 */
+	monr_parent = monr_from_perf_cgroup(cgrp_lma);
+
+	/* Create new monr. */
+	monr = monr_alloc();
+	if (IS_ERR(monr)) {
+		monr_hrchy_release_mutexes();
+		return PTR_ERR(monr);
+	}
+
+	/* monr has no children yet, so it is inserted into the hierarchy
+	 * with all its pmonrs in (U)state.
+	 * We hold the locks until the monr_hrchy changes are complete, to
+	 * prevent state transitions for the pmonrs in monr while still
+	 * allowing the scheduler path to read the prmid_summary.
+	 */
+	monr_hrchy_acquire_raw_spin_locks_irq_save(flags, i);
+	__monr_hrchy_insert_leaf(monr, monr_parent);
+	monr_hrchy_release_raw_spin_locks_irq_restore(flags, i);
+
+	/* Make sure monr is in hierarchy before attaching monr to cgroup. */
+	barrier();
+
+	perf_cgroup_make_monitored(cgrp, monr);
+	__css_subtree_update_monrs(css, monr);
+
+	monr_hrchy_acquire_raw_spin_locks_irq_save(flags, i);
+	/* Move task-event monrs that are descendant from css's cgroup. */
+	list_for_each_entry_safe(pos_monr, tmp_monr,
+				 &monr_parent->children, parent_entry) {
+		if (!monr_is_event_type(pos_monr))
+			continue;
+		/* All events in an event group must have the same cgroup.
+		 * No RCU read lock is needed for task_css_check since it is
+		 * called inside a critical section.
+		 */
+		pos_cgrp = perf_cgroup_from_event(pos_monr->mon_event_group);
+		if (!cgroup_is_descendant(pos_cgrp->css.cgroup,
+					  cgrp->css.cgroup))
+			continue;
+		pos_monr->parent = monr;
+		list_move_tail(&pos_monr->parent_entry, &monr->children);
+	}
+	/* Make sure monitoring starts after all monrs have moved. */
+	barrier();
+
+	__monr__set_mon_active(monr);
+	monr_hrchy_release_raw_spin_locks_irq_restore(flags, i);
+
+	monr_hrchy_release_mutexes();
+	return 0;
+}
+
+static inline int __css_stop_monitoring(struct cgroup_subsys_state *css)
+{
+	struct perf_cgroup *cgrp, *cgrp_lma;
+	struct monr *monr, *monr_parent, *pos_monr;
+	unsigned long flags;
+	int i;
+
+	lockdep_assert_held(&cqm_mutex);
+
+	monr_hrchy_acquire_mutexes();
+	cgrp = css_to_perf_cgroup(css);
+	if (WARN_ON_ONCE(!perf_cgroup_is_monitored(cgrp)))
+		return -1;
+
+	monr = monr_from_perf_cgroup(cgrp);
+
+	/* When css is root cgroup's css, detach cgroup but do not
+	 * destroy monr.
+	 */
+	cgrp_lma = __cgroup_find_lma(cgrp);
+	if (!cgrp_lma) {
+		/* monr of root cgrp must be monr_hrchy_root. */
+		WARN_ON_ONCE(!monr__is_root(monr_from_perf_cgroup(cgrp)));
+		perf_cgroup_make_unmonitored(cgrp);
+		monr_hrchy_release_mutexes();
+		return 0;
+	}
+	/* The monr of the lowest monitored ancestor is the direct parent
+	 * of monr in the monr hierarchy.
+	 */
+	monr_parent = monr_from_perf_cgroup(cgrp_lma);
+
+	/* Lock together the transition to (U)state and the clearing of
+	 * MONR_MON_ACTIVE to prevent pmonrs from returning to (A)state
+	 * or (I)state in between.
+	 */
+	monr_hrchy_acquire_raw_spin_locks_irq_save(flags, i);
+	cqm_pkg_id_for_each_online(i)
+		__pmonr__to_ustate(monr->pmonrs[i]);
+	barrier();
+	__monr__clear_mon_active(monr);
+	monr_hrchy_release_raw_spin_locks_irq_restore(flags, i);
+
+	__css_subtree_update_monrs(css, monr_parent);
+
+
+	/*
+	 * Move the children monrs that are not of cgroup type.
+	 */
+	monr_hrchy_acquire_raw_spin_locks_irq_save(flags, i);
+
+	list_for_each_entry(pos_monr, &monr->children, parent_entry)
+		pos_monr->parent = monr_parent;
+	list_splice_tail_init(&monr->children, &monr_parent->children);
+	perf_cgroup_make_unmonitored(cgrp);
+	__monr_hrchy_remove_leaf(monr);
+
+	monr_hrchy_release_raw_spin_locks_irq_restore(flags, i);
+
+	monr_hrchy_release_mutexes();
+	monr_dealloc(monr);
+	return 0;
+}
+
+/* Attaching an event to a cgroup starts monitoring in that cgroup.
+ * If the cgroup is already monitored, just use its pre-existing monr.
+ */
+static int __monr_hrchy_attach_cgroup_event(struct perf_event *event,
+					    struct perf_cgroup *perf_cgrp)
+{
+	struct monr *monr;
+	int ret;
+
+	lockdep_assert_held(&cqm_mutex);
+	WARN_ON_ONCE(event->attach_state & PERF_ATTACH_TASK);
+	WARN_ON_ONCE(monr_from_event(event));
+	WARN_ON_ONCE(!perf_cgrp);
+
+	if (!perf_cgroup_is_monitored(perf_cgrp)) {
+		css_get(&perf_cgrp->css);
+		ret = __css_start_monitoring(&perf_cgrp->css);
+		css_put(&perf_cgrp->css);
+		if (ret)
+			return ret;
+	}
+
+	/* At this point, cgrp is always monitored, use its monr. */
+	monr = monr_from_perf_cgroup(perf_cgrp);
+
+	event_set_monr(event, monr);
+	monr->mon_event_group = event;
+	return 0;
+}
+#endif
+
 static int __monr_hrchy_attach_cpu_event(struct perf_event *event)
 {
 	lockdep_assert_held(&cqm_mutex);
@@ -1109,12 +1511,27 @@ static int __monr_hrchy_attach_task_event(struct perf_event *event,
 static int monr_hrchy_attach_event(struct perf_event *event)
 {
 	struct monr *monr_parent;
+#ifdef CONFIG_CGROUP_PERF
+	struct perf_cgroup *perf_cgrp;
+#endif
 
 	if (!event->cgrp && !(event->attach_state & PERF_ATTACH_TASK))
 		return __monr_hrchy_attach_cpu_event(event);
 
+#ifdef CONFIG_CGROUP_PERF
+	/* Task events become leaves, cgroup events reuse the cgroup's monr */
+	if (event->cgrp)
+		return __monr_hrchy_attach_cgroup_event(event, event->cgrp);
+
+	rcu_read_lock();
+	perf_cgrp = perf_cgroup_from_event(event);
+	rcu_read_unlock();
+
+	monr_parent = monr_from_perf_cgroup(perf_cgrp);
+#else
 	/* Two-levels hierarchy: Root and all event monr underneath it. */
 	monr_parent = monr_hrchy_root;
+#endif
 	return __monr_hrchy_attach_task_event(event, monr_parent);
 }
 
@@ -1126,7 +1543,7 @@ static int monr_hrchy_attach_event(struct perf_event *event)
  */
 static bool __match_event(struct perf_event *a, struct perf_event *b)
 {
-	/* Per-cpu and task events don't mix */
+	/* Cgroup and (non-task) per-cpu events don't mix with task events */
 	if ((a->attach_state & PERF_ATTACH_TASK) !=
 	    (b->attach_state & PERF_ATTACH_TASK))
 		return false;
@@ -2185,6 +2602,129 @@ static struct pmu intel_cqm_pmu = {
 	.read		     = intel_cqm_event_read,
 };
 
+#ifdef CONFIG_CGROUP_PERF
+/* XXX: Add hooks to attach/detach a task with a monr to/from a cgroup. */
+inline int perf_cgroup_arch_css_alloc(struct cgroup_subsys_state *parent_css,
+				      struct cgroup_subsys_state *new_css)
+{
+	struct perf_cgroup *new_cgrp;
+	struct cgrp_cqm_info *cqm_info;
+
+	new_cgrp = css_to_perf_cgroup(new_css);
+	cqm_info = kmalloc(sizeof(struct cgrp_cqm_info), GFP_KERNEL);
+	if (!cqm_info)
+		return -ENOMEM;
+	cqm_info->cont_monitoring = false;
+	cqm_info->monr = NULL;
+	new_cgrp->arch_info = cqm_info;
+
+	return 0;
+}
+
+inline void perf_cgroup_arch_css_free(struct cgroup_subsys_state *css)
+{
+	struct perf_cgroup *cgrp = css_to_perf_cgroup(css);
+
+	kfree(cgrp_to_cqm_info(cgrp));
+	cgrp->arch_info = NULL;
+}
+
+/* Do the bulk of arch_css_online. Also called when CQM initializes
+ * after the css has already gone online.
+ */
+static inline int __css_go_online(struct cgroup_subsys_state *css)
+{
+	lockdep_assert_held(&cqm_mutex);
+
+	/* css must not be used in the monr hierarchy before its monr
+	 * has been set in this step.
+	 */
+	__css_set_monr_to_lma(css);
+	/* Root monr is always monitoring. */
+	if (!css->parent)
+		css_to_cqm_info(css)->cont_monitoring = true;
+
+	if (css_to_cqm_info(css)->cont_monitoring)
+		return __css_start_monitoring(css);
+	return 0;
+}
+
+inline int perf_cgroup_arch_css_online(struct cgroup_subsys_state *css)
+{
+	int ret = 0;
+
+	/* use cqm_init_mutex to synchronize with
+	 * __start_monitoring_all_cgroups.
+	 */
+	mutex_lock(&cqm_init_mutex);
+
+	if (static_branch_unlikely(&cqm_initialized_key)) {
+		mutex_lock(&cqm_mutex);
+		ret = __css_go_online(css);
+		mutex_unlock(&cqm_mutex);
+		WARN_ON_ONCE(ret);
+	}
+
+	mutex_unlock(&cqm_init_mutex);
+	return ret;
+}
+
+inline void perf_cgroup_arch_css_offline(struct cgroup_subsys_state *css)
+{
+	int ret = 0;
+	struct monr *monr;
+	struct perf_cgroup *cgrp = css_to_perf_cgroup(css);
+
+	mutex_lock(&cqm_init_mutex);
+
+	if (!static_branch_unlikely(&cqm_initialized_key))
+		goto out;
+
+	mutex_lock(&cqm_mutex);
+
+	monr = monr_from_perf_cgroup(cgrp);
+	if (!perf_cgroup_is_monitored(cgrp))
+		goto out_cqm;
+
+	/* Stop monitoring for the css's monr only if no more events need it.
+	 * If events need the monr, it will be destroyed when the events that
+	 * use it are destroyed.
+	 */
+	if (monr->mon_event_group) {
+		monr_hrchy_acquire_mutexes();
+		perf_cgroup_make_unmonitored(cgrp);
+		monr_hrchy_release_mutexes();
+	} else {
+		ret = __css_stop_monitoring(css);
+		WARN_ON_ONCE(ret);
+	}
+
+out_cqm:
+	mutex_unlock(&cqm_mutex);
+out:
+	mutex_unlock(&cqm_init_mutex);
+	WARN_ON_ONCE(ret);
+}
+
+inline void perf_cgroup_arch_css_released(struct cgroup_subsys_state *css)
+{
+	mutex_lock(&cqm_init_mutex);
+
+	if (static_branch_unlikely(&cqm_initialized_key)) {
+		mutex_lock(&cqm_mutex);
+		/*
+		 * Remove css from monr hierarchy now that css is about to
+		 * leave the cgroup hierarchy.
+		 */
+		perf_cgroup_set_monr(css_to_perf_cgroup(css), NULL);
+		mutex_unlock(&cqm_mutex);
+	}
+
+	mutex_unlock(&cqm_init_mutex);
+}
+
+#endif
+
 static inline void cqm_pick_event_reader(int cpu)
 {
 	u16 pkg_id = topology_physical_package_id(cpu);
@@ -2249,6 +2789,39 @@ static const struct x86_cpu_id intel_cqm_match[] = {
 	{}
 };
 
+#ifdef CONFIG_CGROUP_PERF
+/* Start monitoring for all cgroups in cgroup hierarchy. */
+static int __start_monitoring_all_cgroups(void)
+{
+	int ret;
+	struct cgroup_subsys_state *css, *css_root;
+
+	lockdep_assert_held(&cqm_init_mutex);
+
+	rcu_read_lock();
+	/* Get css for root cgroup */
+	css_root =  get_root_perf_css();
+
+	css_for_each_descendant_pre(css, css_root) {
+		if (!css_tryget_online(css))
+			continue;
+
+		rcu_read_unlock();
+		mutex_lock(&cqm_mutex);
+		ret = __css_go_online(css);
+		mutex_unlock(&cqm_mutex);
+
+		css_put(css);
+		if (ret)
+			return ret;
+
+		rcu_read_lock();
+	}
+	rcu_read_unlock();
+	return 0;
+}
+#endif
+
 static int __init intel_cqm_init(void)
 {
 	char *str, scale[20];
@@ -2324,17 +2897,32 @@ static int __init intel_cqm_init(void)
 
 	__perf_cpu_notifier(intel_cqm_cpu_notifier);
 
+	/* Use cqm_init_mutex to synchronize with css's online/offline. */
+	mutex_lock(&cqm_init_mutex);
+
+#ifdef CONFIG_CGROUP_PERF
+	ret = __start_monitoring_all_cgroups();
+	if (ret)
+		goto error_init_mutex;
+#endif
+
 	ret = perf_pmu_register(&intel_cqm_pmu, "intel_cqm", -1);
 	if (ret)
-		goto error;
+		goto error_init_mutex;
 
 	cpu_notifier_register_done();
 
+	static_branch_enable(&cqm_initialized_key);
+
+	mutex_unlock(&cqm_init_mutex);
+
 	pr_info("Intel CQM monitoring enabled with at least %u rmids per package.\n",
 		min_max_rmid + 1);
 
 	return ret;
 
+error_init_mutex:
+	mutex_unlock(&cqm_init_mutex);
 error:
 	pr_err("Intel CQM perf registration failed: %d\n", ret);
 	cpu_notifier_register_done();
diff --git a/arch/x86/events/intel/cqm.h b/arch/x86/events/intel/cqm.h
index 25646a2..0f3da94 100644
--- a/arch/x86/events/intel/cqm.h
+++ b/arch/x86/events/intel/cqm.h
@@ -313,6 +313,7 @@ struct pkg_data {
  * struct monr: MONitored Resource.
  * @flags:		Flags field for monr (XXX: More flags will be added
  *			with MBM).
+ * @mon_cgrp:		The cgroup associated with this monr, if any
  * @mon_event_group:	The head of event's group that use this monr, if any.
  * @parent:		Parent in monr hierarchy.
  * @children:		List of children in monr hierarchy.
@@ -333,6 +334,7 @@ struct pkg_data {
 struct monr {
 	u16				flags;
 	/* Back reference pointers */
+	struct perf_cgroup		*mon_cgrp;
 	struct perf_event		*mon_event_group;
 
 	struct monr			*parent;
@@ -506,3 +508,14 @@ static unsigned int __cqm_min_progress_rate = CQM_DEFAULT_MIN_PROGRESS_RATE;
  * It's units are bytes must be scaled by cqm_l3_scale to obtain cache lines.
  */
 static unsigned int __intel_cqm_max_threshold;
+
+
+struct cgrp_cqm_info {
+	/* Should the cgroup be continuously monitored? */
+	bool		cont_monitoring;
+	struct monr	*monr;
+};
+
+# define css_to_perf_cgroup(css_) container_of(css_, struct perf_cgroup, css)
+# define cgrp_to_cqm_info(cgrp_) ((struct cgrp_cqm_info *)cgrp_->arch_info)
+# define css_to_cqm_info(css_) cgrp_to_cqm_info(css_to_perf_cgroup(css_))
diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h
index f353061..c22d9e0 100644
--- a/arch/x86/include/asm/perf_event.h
+++ b/arch/x86/include/asm/perf_event.h
@@ -299,4 +299,37 @@ static inline void perf_check_microcode(void) { }
 
 #define arch_perf_out_copy_user copy_from_user_nmi
 
+
+/*
+ * Hooks for architecture specific features of perf_event cgroup.
+ * Currently used by Intel's CQM.
+ */
+#ifdef CONFIG_INTEL_RDT
+#define perf_cgroup_arch_css_alloc \
+	perf_cgroup_arch_css_alloc
+inline int perf_cgroup_arch_css_alloc(struct cgroup_subsys_state *parent_css,
+				      struct cgroup_subsys_state *new_css);
+
+#define perf_cgroup_arch_css_online \
+	perf_cgroup_arch_css_online
+inline int perf_cgroup_arch_css_online(struct cgroup_subsys_state *css);
+
+#define perf_cgroup_arch_css_offline \
+	perf_cgroup_arch_css_offline
+inline void perf_cgroup_arch_css_offline(struct cgroup_subsys_state *css);
+
+#define perf_cgroup_arch_css_released \
+	perf_cgroup_arch_css_released
+inline void perf_cgroup_arch_css_released(struct cgroup_subsys_state *css);
+
+#define perf_cgroup_arch_css_free \
+	perf_cgroup_arch_css_free
+inline void perf_cgroup_arch_css_free(struct cgroup_subsys_state *css);
+
+#else
+
+#define PERF_CGROUP_ARCH_CGRP_SUBSYS_ATTS
+
+#endif
+
 #endif /* _ASM_X86_PERF_EVENT_H */
-- 
2.8.0.rc3.226.g39d4020
