linux-kernel - [PATCH v2 06/32] perf/x86/intel/cqm: add per-package RMIDs, data and locks

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <1463007752-116802-7-git-send-email-davidcc@google.com>
Date:	Wed, 11 May 2016 16:02:06 -0700
From:	David Carrillo-Cisneros <davidcc@...gle.com>
To:	Peter Zijlstra <peterz@...radead.org>,
	Alexander Shishkin <alexander.shishkin@...ux.intel.com>,
	Arnaldo Carvalho de Melo <acme@...nel.org>,
	Ingo Molnar <mingo@...hat.com>
Cc:	Vikas Shivappa <vikas.shivappa@...ux.intel.com>,
	Matt Fleming <matt@...eblueprint.co.uk>,
	Tony Luck <tony.luck@...el.com>,
	Stephane Eranian <eranian@...gle.com>,
	Paul Turner <pjt@...gle.com>,
	David Carrillo-Cisneros <davidcc@...gle.com>, x86@...nel.org,
	linux-kernel@...r.kernel.org
Subject: [PATCH v2 06/32] perf/x86/intel/cqm: add per-package RMIDs, data and locks

Introduce struct pkg_data that contains all per-package CQM data for the new
CQM driver. The per-package data is:
  1) A pool of free prmids (per-package per RMID). Each package may have
  a different number of prmids (different hw max_rmid_index).
  2) A lock and a mutex that protect the prmid pools, changes to the pmonr
  state, and the rotation logic.
  The per-package separation of locks reduces the contention for each
  lock and mutex compared with the previous version that had a system-wide
  mutex and lock.

More per-package data will be added in future patches in this series.

Reviewed-by: Stephane Eranian <eranian@...gle.com>
Signed-off-by: David Carrillo-Cisneros <davidcc@...gle.com>
---
 arch/x86/events/intel/cqm.c | 499 ++++++++++++++++++++++++++++++++++++++++++++
 arch/x86/events/intel/cqm.h |  62 ++++++
 include/linux/perf_event.h  |   7 +
 3 files changed, 568 insertions(+)

diff --git a/arch/x86/events/intel/cqm.c b/arch/x86/events/intel/cqm.c
index 2daee37..54f219f 100644
--- a/arch/x86/events/intel/cqm.c
+++ b/arch/x86/events/intel/cqm.c
@@ -12,6 +12,8 @@
 #define MSR_IA32_QM_CTR		0x0c8e
 #define MSR_IA32_QM_EVTSEL	0x0c8d
 
+static unsigned int cqm_l3_scale; /* supposedly cacheline size */
+
 #define RMID_VAL_ERROR		(1ULL << 63)
 #define RMID_VAL_UNAVAIL	(1ULL << 62)
 
@@ -69,3 +71,500 @@ static inline int __cqm_prmid_update(struct prmid *prmid,
 
 	return 1;
 }
+
+/*
+ * A cache group is a group of perf_events with the same target (thread,
+ * cgroup, CPU or system-wide). Each cache group has one RMID.
+ * Cache groups are protected by cqm_mutex.
+ */
+static LIST_HEAD(cache_groups);
+static DEFINE_MUTEX(cqm_mutex);
+
+/*
+ * Per-package data, indexed by package id. A slot stays NULL until
+ * pkg_data_init_cpu() has run for a CPU of that package.
+ */
+struct pkg_data **cqm_pkgs_data;
+
+/* Return true when @pkg_id is below topology_max_packages(). */
+static inline bool __valid_pkg_id(u16 pkg_id)
+{
+	return topology_max_packages() > pkg_id;
+}
+
+/*
+ * Init cqm pkg_data for @cpu's package.
+ *
+ * Allocates the pkg_data of the package @cpu belongs to and publishes it in
+ * cqm_pkgs_data, unless that package already has one.
+ *
+ * Returns 0 on success or if the package was already initialized, -EINVAL
+ * if the package id is out of range or the CPU reports an occupancy scale
+ * different from cqm_l3_scale, and -ENOMEM on allocation failure.
+ */
+static int pkg_data_init_cpu(int cpu)
+{
+	struct pkg_data *pkg_data;
+	struct cpuinfo_x86 *c = &cpu_data(cpu);
+	u16 pkg_id = topology_physical_package_id(cpu);
+
+	/* cqm_pkgs_data only has topology_max_packages() slots. */
+	if (!__valid_pkg_id(pkg_id))
+		return -EINVAL;
+
+	if (cqm_pkgs_data[pkg_id])
+		return 0;
+
+	pkg_data = kmalloc_node(sizeof(*pkg_data),
+				GFP_KERNEL, cpu_to_node(cpu));
+	if (!pkg_data)
+		return -ENOMEM;
+
+	pkg_data->max_rmid = c->x86_cache_max_rmid;
+
+	/* Does the hardware have more rmids than this driver can handle? */
+	if (WARN_ON(pkg_data->max_rmid >= INVALID_RMID))
+		pkg_data->max_rmid = INVALID_RMID - 1;
+
+	if (c->x86_cache_occ_scale != cqm_l3_scale) {
+		pr_err("Multiple LLC scale values, disabling\n");
+		kfree(pkg_data);
+		return -EINVAL;
+	}
+
+	/* Zeroed so that an rmid without a prmid reads back as NULL. */
+	pkg_data->prmids_by_rmid = kzalloc_node(
+		sizeof(struct prmid *) * (1 + pkg_data->max_rmid),
+		GFP_KERNEL, cpu_to_node(cpu));
+
+	/* Test the array that was just allocated, not pkg_data itself. */
+	if (!pkg_data->prmids_by_rmid) {
+		kfree(pkg_data);
+		return -ENOMEM;
+	}
+
+	INIT_LIST_HEAD(&pkg_data->free_prmids_pool);
+
+	mutex_init(&pkg_data->pkg_data_mutex);
+	raw_spin_lock_init(&pkg_data->pkg_data_lock);
+
+	/* XXX: Chose randomly*/
+	pkg_data->rotation_cpu = cpu;
+
+	cqm_pkgs_data[pkg_id] = pkg_data;
+	return 0;
+}
+
+/*
+ * Allocate one prmid for every rmid in [0, max_rmid] of package @pkg_id and
+ * record it in prmids_by_rmid. Every prmid except the one for RMID 0 is
+ * also placed in the package's free pool.
+ *
+ * Returns 0 on success, -EINVAL if @pkg_id is out of range or the package
+ * has no pkg_data, and -ENOMEM if an allocation fails. On -ENOMEM every
+ * prmid allocated by this call has been released again.
+ */
+static int intel_cqm_setup_pkg_prmid_pools(u16 pkg_id)
+{
+	u32 r;
+	unsigned long flags;
+	struct prmid *prmid;
+	struct pkg_data *pkg_data;
+
+	/* Validate the index before using it on cqm_pkgs_data. */
+	if (!__valid_pkg_id(pkg_id))
+		return -EINVAL;
+
+	pkg_data = cqm_pkgs_data[pkg_id];
+	if (!pkg_data)
+		return -EINVAL;
+
+	for (r = 0; r <= pkg_data->max_rmid; r++) {
+
+		prmid = kmalloc_node(sizeof(struct prmid), GFP_KERNEL,
+				     cpu_to_node(pkg_data->rotation_cpu));
+		if (!prmid)
+			goto fail;
+
+		atomic64_set(&prmid->last_read_value, 0L);
+		atomic64_set(&prmid->last_read_time, 0L);
+		INIT_LIST_HEAD(&prmid->pool_entry);
+		prmid->rmid = r;
+
+		/* Lock needed if called during CPU hotplug. */
+		raw_spin_lock_irqsave_nested(
+			&pkg_data->pkg_data_lock, flags, pkg_id);
+		pkg_data->prmids_by_rmid[r] = prmid;
+
+		/* RMID 0 is special and makes the root of rmid hierarchy. */
+		if (r != 0)
+			list_add_tail(&prmid->pool_entry,
+				      &pkg_data->free_prmids_pool);
+		raw_spin_unlock_irqrestore(&pkg_data->pkg_data_lock, flags);
+	}
+	return 0;
+fail:
+	/*
+	 * prmids for rmids [0, r) were allocated. Walk the rmid map instead
+	 * of the free pool so that RMID 0 (never pooled) is released too and
+	 * each prmid is freed exactly once.
+	 */
+	while (r-- > 0) {
+		raw_spin_lock_irqsave_nested(
+			&pkg_data->pkg_data_lock, flags, pkg_id);
+		prmid = pkg_data->prmids_by_rmid[r];
+		pkg_data->prmids_by_rmid[r] = NULL;
+		if (r != 0)
+			list_del(&prmid->pool_entry);
+		raw_spin_unlock_irqrestore(&pkg_data->pkg_data_lock, flags);
+		kfree(prmid);
+	}
+	return -ENOMEM;
+}
+
+
+/*
+ * Decide whether @a and @b measure the same set of tasks.
+ *
+ * Events that do are meant to share a single RMID.
+ */
+static bool __match_event(struct perf_event *a, struct perf_event *b)
+{
+	bool a_is_task = a->attach_state & PERF_ATTACH_TASK;
+	bool b_is_task = b->attach_state & PERF_ATTACH_TASK;
+
+	/* Per-cpu and task events don't mix */
+	if (a_is_task != b_is_task)
+		return false;
+
+#ifdef CONFIG_CGROUP_PERF
+	if (a->cgrp != b->cgrp)
+		return false;
+#endif
+
+	/* If not a task event, it's a cgroup or a non-task cpu event. */
+	if (!b_is_task)
+		return true;
+
+	/*
+	 * Task events match when both target the same task, or when @a is
+	 * the parent of @b (inherited event).
+	 */
+	return a->hw.target == b->hw.target || b->parent == a;
+}
+
+static struct pmu intel_cqm_pmu;
+
+/*
+ * Look for an existing cache group that @event can join.
+ *
+ * Walks cache_groups and stores the first leader matching @event in
+ * *@group; *@group is left untouched when nothing matches. Always
+ * returns 0.
+ */
+static int
+intel_cqm_setup_event(struct perf_event *event, struct perf_event **group)
+{
+	struct perf_event *leader;
+
+	list_for_each_entry(leader, &cache_groups, hw.cqm_event_groups_entry) {
+		if (!__match_event(leader, event))
+			continue;
+
+		*group = leader;
+		break;
+	}
+
+	return 0;
+}
+
+/*
+ * Intended behavior: read current package immediately and remote pkg
+ * (if any) from cache. The body is still empty here, so this is a no-op.
+ */
+static void intel_cqm_event_read(struct perf_event *event)
+{
+}
+
+/* Clear PERF_HES_STOPPED on @event if it is set; @mode is unused. */
+static void intel_cqm_event_start(struct perf_event *event, int mode)
+{
+	if (event->hw.state & PERF_HES_STOPPED)
+		event->hw.state &= ~PERF_HES_STOPPED;
+}
+
+/* Set PERF_HES_STOPPED on @event unless it is already set; @mode is unused. */
+static void intel_cqm_event_stop(struct perf_event *event, int mode)
+{
+	if (!(event->hw.state & PERF_HES_STOPPED))
+		event->hw.state |= PERF_HES_STOPPED;
+}
+
+/*
+ * Put @event into the stopped state (hw.state = PERF_HES_STOPPED).
+ * @mode is not consulted. Always returns 0.
+ */
+static int intel_cqm_event_add(struct perf_event *event, int mode)
+{
+	event->hw.state = PERF_HES_STOPPED;
+
+	return 0;
+}
+
+/*
+ * An event is a cache group leader when its cqm_event_groups_entry is
+ * linked into a list (i.e. the list head is not empty).
+ */
+static inline bool cqm_group_leader(struct perf_event *event)
+{
+	if (list_empty(&event->hw.cqm_event_groups_entry))
+		return false;
+
+	return true;
+}
+
+/*
+ * Unlink @event from its cache group, under cqm_mutex.
+ *
+ * If @event leads the group, leadership is handed to another member when
+ * one exists; otherwise the group is removed from cache_groups.
+ */
+static void intel_cqm_event_destroy(struct perf_event *event)
+{
+	struct perf_event *group_other = NULL;
+
+	mutex_lock(&cqm_mutex);
+	/*
+	 * If there's another event in this group, remember it and unlink
+	 * @event from the group's member list.
+	 */
+	if (!list_empty(&event->hw.cqm_event_group_entry)) {
+		group_other = list_first_entry(&event->hw.cqm_event_group_entry,
+					       struct perf_event,
+					       hw.cqm_event_group_entry);
+		list_del(&event->hw.cqm_event_group_entry);
+	}
+	/*
+	 * Only a group leader is linked into cache_groups; a plain member
+	 * has nothing more to undo.
+	 */
+	if (!cqm_group_leader(event))
+		goto exit;
+
+	/*
+	 * If there was a group_other, make that the leader, otherwise
+	 * remove the group from cache_groups below.
+	 */
+	if (group_other) {
+		/* group_other takes over @event's position in cache_groups. */
+		list_replace(&event->hw.cqm_event_groups_entry,
+			     &group_other->hw.cqm_event_groups_entry);
+		goto exit;
+	}
+
+	/*
+	 * Event is the only event in cache group.
+	 */
+
+	list_del(&event->hw.cqm_event_groups_entry);
+
+exit:
+	mutex_unlock(&cqm_mutex);
+}
+
+/*
+ * pmu->event_init callback.
+ *
+ * Rejects events of another pmu type (-ENOENT) and events using config
+ * bits outside QOS_EVENT_MASK, any exclude_* filter or sampling (-EINVAL).
+ * An accepted event joins the matching cache group, or becomes the leader
+ * of a new one appended to cache_groups.
+ */
+static int intel_cqm_event_init(struct perf_event *event)
+{
+	struct perf_event *leader = NULL;
+	int err;
+
+	if (event->attr.type != intel_cqm_pmu.type)
+		return -ENOENT;
+
+	if (event->attr.config & ~QOS_EVENT_MASK)
+		return -EINVAL;
+
+	/* unsupported modes and filters */
+	if (event->attr.exclude_user   ||
+	    event->attr.exclude_kernel ||
+	    event->attr.exclude_hv     ||
+	    event->attr.exclude_idle   ||
+	    event->attr.exclude_host   ||
+	    event->attr.exclude_guest  ||
+	    event->attr.sample_period) /* no sampling */
+		return -EINVAL;
+
+	INIT_LIST_HEAD(&event->hw.cqm_event_groups_entry);
+	INIT_LIST_HEAD(&event->hw.cqm_event_group_entry);
+
+	event->destroy = intel_cqm_event_destroy;
+
+	mutex_lock(&cqm_mutex);
+
+	err = intel_cqm_setup_event(event, &leader);
+	if (!err) {
+		if (leader)
+			/* Join the existing group as a member. */
+			list_add_tail(&event->hw.cqm_event_group_entry,
+				      &leader->hw.cqm_event_group_entry);
+		else
+			/* No match: @event leads a new cache group. */
+			list_add_tail(&event->hw.cqm_event_groups_entry,
+				      &cache_groups);
+	}
+
+	mutex_unlock(&cqm_mutex);
+
+	return err;
+}
+
+/*
+ * Attributes of the "events" group. The scale string is NULL here and is
+ * filled in by intel_cqm_init() from cqm_l3_scale.
+ */
+EVENT_ATTR_STR(llc_occupancy, intel_cqm_llc, "event=0x01");
+EVENT_ATTR_STR(llc_occupancy.per-pkg, intel_cqm_llc_pkg, "1");
+EVENT_ATTR_STR(llc_occupancy.unit, intel_cqm_llc_unit, "Bytes");
+EVENT_ATTR_STR(llc_occupancy.scale, intel_cqm_llc_scale, NULL);
+EVENT_ATTR_STR(llc_occupancy.snapshot, intel_cqm_llc_snapshot, "1");
+
+/* NULL-terminated list of the attributes declared above. */
+static struct attribute *intel_cqm_events_attr[] = {
+	EVENT_PTR(intel_cqm_llc),
+	EVENT_PTR(intel_cqm_llc_pkg),
+	EVENT_PTR(intel_cqm_llc_unit),
+	EVENT_PTR(intel_cqm_llc_scale),
+	EVENT_PTR(intel_cqm_llc_snapshot),
+	NULL,
+};
+
+static struct attribute_group intel_cqm_events_group = {
+	.name = "events",
+	.attrs = intel_cqm_events_attr,
+};
+
+/* Format description: the event selector occupies config bits 0-7. */
+PMU_FORMAT_ATTR(event, "config:0-7");
+static struct attribute *intel_cqm_formats_attr[] = {
+	&format_attr_event.attr,
+	NULL,
+};
+
+static struct attribute_group intel_cqm_format_group = {
+	.name = "format",
+	.attrs = intel_cqm_formats_attr,
+};
+
+/* NULL-terminated list of attribute groups referenced by intel_cqm_pmu. */
+static const struct attribute_group *intel_cqm_attr_groups[] = {
+	&intel_cqm_events_group,
+	&intel_cqm_format_group,
+	NULL,
+};
+
+/*
+ * The CQM pmu. hrtimer_interval_ms carries the rotation period. Note that
+ * .del reuses intel_cqm_event_stop(), so deleting an event only marks it
+ * stopped.
+ */
+static struct pmu intel_cqm_pmu = {
+	.hrtimer_interval_ms = CQM_DEFAULT_ROTATION_PERIOD,
+	.attr_groups	     = intel_cqm_attr_groups,
+	.task_ctx_nr	     = perf_sw_context,
+	.event_init	     = intel_cqm_event_init,
+	.add		     = intel_cqm_event_add,
+	.del		     = intel_cqm_event_stop,
+	.start		     = intel_cqm_event_start,
+	.stop		     = intel_cqm_event_stop,
+	.read		     = intel_cqm_event_read,
+};
+
+/*
+ * Make @cpu the rotation/reader CPU of its package.
+ *
+ * NOTE(review): the intent stated below is "pick a reader if there isn't
+ * one already", but the test assigns @cpu whenever rotation_cpu is NOT -1,
+ * i.e. whenever a reader is already set. No visible code stores -1 in
+ * rotation_cpu, so in practice this overwrites the reader on every call.
+ * Confirm which behavior is wanted before changing the condition.
+ */
+static inline void cqm_pick_event_reader(int cpu)
+{
+	u16 pkg_id = topology_physical_package_id(cpu);
+	/* XXX: lock, check if rotation cpu is online, maybe */
+	/*
+	 * Pick a reader if there isn't one already.
+	 */
+	if (cqm_pkgs_data[pkg_id]->rotation_cpu != -1)
+		cqm_pkgs_data[pkg_id]->rotation_cpu = cpu;
+}
+
+/*
+ * Reset the cached PQR state (rmid and closid) of @cpu and warn if its
+ * max rmid or occupancy scale disagree with the values recorded for its
+ * package and for the driver.
+ */
+static void intel_cqm_cpu_starting(unsigned int cpu)
+{
+	u16 pkg_id = topology_physical_package_id(cpu);
+	struct pkg_data *pkg_data = cqm_pkgs_data[pkg_id];
+	struct cpuinfo_x86 *c = &cpu_data(cpu);
+	struct intel_pqr_state *state = &per_cpu(pqr_state, cpu);
+
+	state->rmid = 0;
+	state->closid = 0;
+
+	/* XXX: lock */
+	/* XXX: Make sure this case is handled when hotplug happens. */
+	WARN_ON(pkg_data->max_rmid != c->x86_cache_max_rmid);
+	WARN_ON(cqm_l3_scale != c->x86_cache_occ_scale);
+}
+
+/*
+ * If @cpu is the designated cqm reader of its package, hand the role to
+ * another CPU taken from topology_core_cpumask(@cpu).
+ */
+static void intel_cqm_cpu_exit(unsigned int cpu)
+{
+	u16 pkg_id = topology_physical_package_id(cpu);
+	struct pkg_data *pkg_data = cqm_pkgs_data[pkg_id];
+
+	if (pkg_data->rotation_cpu == cpu) {
+		/* XXX: do remove unused packages */
+		pkg_data->rotation_cpu =
+			cpumask_any_but(topology_core_cpumask(cpu), cpu);
+	}
+}
+
+/*
+ * CPU hotplug notifier.
+ *
+ * CPU_DOWN_PREPARE: move the reader role away from the departing CPU.
+ * CPU_STARTING: set up the CPU's package data (if needed), reset its PQR
+ * state and pick the event reader.
+ *
+ * Always returns NOTIFY_OK.
+ */
+static int intel_cqm_cpu_notifier(struct notifier_block *nb,
+				  unsigned long action, void *hcpu)
+{
+	unsigned int cpu  = (unsigned long)hcpu;
+
+	switch (action & ~CPU_TASKS_FROZEN) {
+	case CPU_DOWN_PREPARE:
+		intel_cqm_cpu_exit(cpu);
+		break;
+	case CPU_STARTING:
+		/*
+		 * The two calls below dereference the package's pkg_data,
+		 * which is still NULL if the setup failed, so skip them.
+		 *
+		 * NOTE(review): pkg_data_init_cpu() allocates with
+		 * GFP_KERNEL; confirm that is permitted in this context.
+		 */
+		if (pkg_data_init_cpu(cpu))
+			break;
+		intel_cqm_cpu_starting(cpu);
+		cqm_pick_event_reader(cpu);
+		break;
+	}
+
+	return NOTIFY_OK;
+}
+
+/* CPUs handled by this driver: Intel parts with X86_FEATURE_CQM_OCCUP_LLC. */
+static const struct x86_cpu_id intel_cqm_match[] = {
+	{ .vendor = X86_VENDOR_INTEL, .feature = X86_FEATURE_CQM_OCCUP_LLC },
+	{}
+};
+
+/*
+ * Driver initialization.
+ *
+ * Allocates the per-package table, initializes the data and prmid pools of
+ * every online package, derives __intel_cqm_max_threshold from the smallest
+ * per-package max rmid, publishes the scale attribute string, registers
+ * the hotplug notifier and finally registers the pmu.
+ *
+ * Returns 0 on success, -ENODEV if the CPU is not supported, or a negative
+ * errno from one of the setup steps. Per-package data allocated before a
+ * failure is not released on the error path.
+ */
+static int __init intel_cqm_init(void)
+{
+	char *str, scale[20];
+	int i, cpu, ret = 0;
+	unsigned int min_max_rmid = UINT_MAX;
+
+	if (!x86_match_cpu(intel_cqm_match))
+		return -ENODEV;
+
+	cqm_l3_scale = boot_cpu_data.x86_cache_occ_scale;
+	if (WARN_ON(cqm_l3_scale == 0))
+		cqm_l3_scale = 1;
+
+	/* Zeroed table: a NULL slot means "package not initialized". */
+	cqm_pkgs_data = kcalloc(topology_max_packages(),
+				sizeof(*cqm_pkgs_data), GFP_KERNEL);
+	if (!cqm_pkgs_data)
+		return -ENOMEM;
+
+	/*
+	 * It's possible that not all resources support the same number
+	 * of RMIDs. Instead of making scheduling much more complicated
+	 * (where we have to match a task's RMID to a cpu that supports
+	 * that many RMIDs) just find the minimum RMIDs supported across
+	 * all cpus.
+	 *
+	 * Also, check that the scales match on all cpus.
+	 */
+	cpu_notifier_register_begin();
+
+	/* XXX: assert all cpus in pkg have same nr rmids (they should). */
+	for_each_online_cpu(cpu) {
+		ret = pkg_data_init_cpu(cpu);
+		if (ret)
+			goto error;
+	}
+
+	/* Select the minimum of the maximum rmids to use as limit for
+	 * threshold. XXX: per-package threshold.
+	 */
+	cqm_pkg_id_for_each_online(i) {
+		if (min_max_rmid > cqm_pkgs_data[i]->max_rmid)
+			min_max_rmid = cqm_pkgs_data[i]->max_rmid;
+
+		/* A package without its prmids cannot be monitored. */
+		ret = intel_cqm_setup_pkg_prmid_pools(i);
+		if (ret)
+			goto error;
+	}
+
+	/* No package seen: min_max_rmid + 1 below would wrap to zero. */
+	if (WARN_ON(min_max_rmid == UINT_MAX)) {
+		ret = -ENODEV;
+		goto error;
+	}
+
+	/*
+	 * A reasonable upper limit on the max threshold is the number
+	 * of lines tagged per RMID if all RMIDs have the same number of
+	 * lines tagged in the LLC.
+	 *
+	 * For a 35MB LLC and 56 RMIDs, this is ~1.8% of the LLC.
+	 */
+	__intel_cqm_max_threshold =
+		boot_cpu_data.x86_cache_size * 1024 / (min_max_rmid + 1);
+
+	snprintf(scale, sizeof(scale), "%u", cqm_l3_scale);
+	str = kstrdup(scale, GFP_KERNEL);
+	if (!str) {
+		ret = -ENOMEM;
+		goto error;
+	}
+
+	event_attr_intel_cqm_llc_scale.event_str = str;
+
+	for_each_online_cpu(i) {
+		intel_cqm_cpu_starting(i);
+		cqm_pick_event_reader(i);
+	}
+
+	__perf_cpu_notifier(intel_cqm_cpu_notifier);
+
+	ret = perf_pmu_register(&intel_cqm_pmu, "intel_cqm", -1);
+	if (ret)
+		goto error;
+
+	cpu_notifier_register_done();
+
+	pr_info("Intel CQM monitoring enabled with at least %u rmids per package.\n",
+		min_max_rmid + 1);
+
+	return ret;
+
+error:
+	pr_err("Intel CQM perf registration failed: %d\n", ret);
+	cpu_notifier_register_done();
+
+	return ret;
+}
+
+device_initcall(intel_cqm_init);
diff --git a/arch/x86/events/intel/cqm.h b/arch/x86/events/intel/cqm.h
index 06964cd..08623b5 100644
--- a/arch/x86/events/intel/cqm.h
+++ b/arch/x86/events/intel/cqm.h
@@ -41,9 +41,71 @@ struct prmid {
 };
 
 /*
+ * struct pkg_data: Per-package CQM data.
+ * @max_rmid:			Max rmid valid for cpus in this package.
+ * @prmids_by_rmid:		Array of max_rmid + 1 pointers mapping an rmid
+ *				value to its prmid.
+ *				XXX: Make it an array of prmids.
+ * @free_prmids_pool:		Free prmids.
+ * @pkg_data_mutex:		Hold for stability when modifying pmonrs
+ *				hierarchy.
+ * @pkg_data_lock:		Hold to protect variables that may be accessed
+ *				during process scheduling. The locks for all
+ *				packages must be held when modifying the monr
+ *				hierarchy.
+ * @rotation_cpu:		CPU to run the rotation work on, it must be in
+ *				the package associated to this instance of
+ *				pkg_data. (No rotation work member exists in
+ *				this struct yet.)
+ */
+struct pkg_data {
+	u32			max_rmid;
+	/* Quick map from rmids to prmids. */
+	struct prmid		**prmids_by_rmid;
+
+	/*
+	 * Pools of prmids used in rotation logic.
+	 */
+	struct list_head	free_prmids_pool;
+
+	struct mutex		pkg_data_mutex;
+	raw_spinlock_t		pkg_data_lock;
+
+	int			rotation_cpu;
+};
+
+extern struct pkg_data **cqm_pkgs_data;
+
+/*
+ * Return the id of the first package after @pkg_id whose cqm_pkgs_data
+ * slot is populated, or a value >= topology_max_packages() if there is
+ * none.
+ */
+static inline u16 __cqm_pkgs_data_next_online(u16 pkg_id)
+{
+	/* Bounds-check first so the table is never read past its end. */
+	while (++pkg_id < topology_max_packages() && !cqm_pkgs_data[pkg_id])
+		;
+	return pkg_id;
+}
+
+/*
+ * Return the lowest package id whose cqm_pkgs_data slot is populated:
+ * 0 if slot 0 is set, otherwise whatever the search after 0 yields.
+ */
+static inline u16 __cqm_pkgs_data_first_online(void)
+{
+	return cqm_pkgs_data[0] ? 0 : __cqm_pkgs_data_next_online(0);
+}
+
+/*
+ * Iterate @pkg_id__ over the ids of all packages that have pkg_data, i.e.
+ * whose cqm_pkgs_data slot is not NULL. @pkg_id__ must be an lvalue.
+ */
+#define cqm_pkg_id_for_each_online(pkg_id__) \
+	for (pkg_id__ = __cqm_pkgs_data_first_online(); \
+	     pkg_id__ < topology_max_packages(); \
+	     pkg_id__ = __cqm_pkgs_data_next_online(pkg_id__))
+
+/*
+ * Access @member of the pkg_data selected by @pmonr->pkg_id. The argument
+ * and the expansion are parenthesized so that any pointer expression can
+ * be passed and the result composes safely with surrounding operators.
+ */
+#define __pkg_data(pmonr, member) (cqm_pkgs_data[(pmonr)->pkg_id]->member)
+
+/*
  * Time between execution of rotation logic. The frequency of execution does
  * not affect the rate at which RMIDs are recycled, except by the delay by the
  * delay updating the prmid's and their pools.
  * The rotation period is stored in pmu->hrtimer_interval_ms.
  */
 #define CQM_DEFAULT_ROTATION_PERIOD 1200	/* ms */
+
+/*
+ * __intel_cqm_max_threshold provides an upper bound on the threshold,
+ * and is measured in bytes because it's exposed to userland.
+ * Since its unit is bytes, it must be scaled by cqm_l3_scale to obtain
+ * cache lines.
+ */
+static unsigned int __intel_cqm_max_threshold;
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 1417d3b..02b8e24 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -118,6 +118,13 @@ struct hw_perf_event {
 			/* for tp_event->class */
 			struct list_head	tp_list;
 		};
+#ifdef CONFIG_INTEL_RDT
+		struct { /* intel_cqm */
+			void			*cqm_monr;
+			struct list_head	cqm_event_group_entry;
+			struct list_head	cqm_event_groups_entry;
+		};
+#endif
 		struct { /* itrace */
 			int			itrace_started;
 		};
-- 
2.8.0.rc3.226.g39d4020