[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <1463007752-116802-7-git-send-email-davidcc@google.com>
Date: Wed, 11 May 2016 16:02:06 -0700
From: David Carrillo-Cisneros <davidcc@...gle.com>
To: Peter Zijlstra <peterz@...radead.org>,
Alexander Shishkin <alexander.shishkin@...ux.intel.com>,
Arnaldo Carvalho de Melo <acme@...nel.org>,
Ingo Molnar <mingo@...hat.com>
Cc: Vikas Shivappa <vikas.shivappa@...ux.intel.com>,
Matt Fleming <matt@...eblueprint.co.uk>,
Tony Luck <tony.luck@...el.com>,
Stephane Eranian <eranian@...gle.com>,
Paul Turner <pjt@...gle.com>,
David Carrillo-Cisneros <davidcc@...gle.com>, x86@...nel.org,
linux-kernel@...r.kernel.org
Subject: [PATCH v2 06/32] perf/x86/intel/cqm: add per-package RMIDs, data and locks
Introduce struct pkg_data, which contains all per-package CQM data for the
new CQM driver. The per-package data is:
1) A pool of free prmids (per-package per RMID). Each package may have a
different number of prmids (different hw max_rmid_index).
2) A lock and a mutex that protect the prmid pools, changes to the pmonr
state, and the rotation logic.
The per-package separation of locks reduces the contention for each
lock and mutex compared with the previous version, which had a system-wide
mutex and lock.
More per-package data will be added in future patches in this series.
Reviewed-by: Stephane Eranian <eranian@...gle.com>
Signed-off-by: David Carrillo-Cisneros <davidcc@...gle.com>
---
arch/x86/events/intel/cqm.c | 499 ++++++++++++++++++++++++++++++++++++++++++++
arch/x86/events/intel/cqm.h | 62 ++++++
include/linux/perf_event.h | 7 +
3 files changed, 568 insertions(+)
diff --git a/arch/x86/events/intel/cqm.c b/arch/x86/events/intel/cqm.c
index 2daee37..54f219f 100644
--- a/arch/x86/events/intel/cqm.c
+++ b/arch/x86/events/intel/cqm.c
@@ -12,6 +12,8 @@
#define MSR_IA32_QM_CTR 0x0c8e
#define MSR_IA32_QM_EVTSEL 0x0c8d
+static unsigned int cqm_l3_scale; /* supposedly cacheline size */
+
#define RMID_VAL_ERROR (1ULL << 63)
#define RMID_VAL_UNAVAIL (1ULL << 62)
@@ -69,3 +71,500 @@ static inline int __cqm_prmid_update(struct prmid *prmid,
return 1;
}
+
+/*
+ * A cache group is a group of perf_events with the same target (thread,
+ * cgroup, CPU or system-wide). Each cache group has one RMID.
+ * Cache groups are protected by cqm_mutex.
+ */
+static LIST_HEAD(cache_groups);
+static DEFINE_MUTEX(cqm_mutex);
+
+struct pkg_data **cqm_pkgs_data;
+
+/* Return true when @pkg_id is below topology_max_packages(). */
+static inline bool __valid_pkg_id(u16 pkg_id)
+{
+	if (pkg_id >= topology_max_packages())
+		return false;
+
+	return true;
+}
+
+/*
+ * Init cqm pkg_data for @cpu's package.
+ *
+ * Allocates the pkg_data and its rmid -> prmid lookup array on @cpu's node
+ * and publishes the result in cqm_pkgs_data[]. Does nothing when the
+ * package already has a pkg_data.
+ *
+ * Returns 0 on success (or when already initialized), -EINVAL when @cpu's
+ * occupancy scale differs from cqm_l3_scale, -ENOMEM on allocation failure.
+ *
+ * NOTE(review): pkg_id is used as an index into cqm_pkgs_data[] without a
+ * range check here -- confirm callers guarantee it is in range.
+ */
+static int pkg_data_init_cpu(int cpu)
+{
+	struct pkg_data *pkg_data;
+	struct cpuinfo_x86 *c = &cpu_data(cpu);
+	u16 pkg_id = topology_physical_package_id(cpu);
+
+	if (cqm_pkgs_data[pkg_id])
+		return 0;
+
+	pkg_data = kmalloc_node(sizeof(*pkg_data),
+				GFP_KERNEL, cpu_to_node(cpu));
+	if (!pkg_data)
+		return -ENOMEM;
+
+	pkg_data->max_rmid = c->x86_cache_max_rmid;
+
+	/* Does hardware have more rmids than this driver can handle? */
+	if (WARN_ON(pkg_data->max_rmid >= INVALID_RMID))
+		pkg_data->max_rmid = INVALID_RMID - 1;
+
+	if (c->x86_cache_occ_scale != cqm_l3_scale) {
+		pr_err("Multiple LLC scale values, disabling\n");
+		kfree(pkg_data);
+		return -EINVAL;
+	}
+
+	/*
+	 * One slot per rmid in [0, max_rmid]. Zeroed so that slots which have
+	 * not been populated yet read as NULL instead of garbage.
+	 */
+	pkg_data->prmids_by_rmid = kzalloc_node(
+		sizeof(struct prmid *) * (1 + pkg_data->max_rmid),
+		GFP_KERNEL, cpu_to_node(cpu));
+
+	/* Check the array that was just allocated, not pkg_data again. */
+	if (!pkg_data->prmids_by_rmid) {
+		kfree(pkg_data);
+		return -ENOMEM;
+	}
+
+	INIT_LIST_HEAD(&pkg_data->free_prmids_pool);
+
+	mutex_init(&pkg_data->pkg_data_mutex);
+	raw_spin_lock_init(&pkg_data->pkg_data_lock);
+
+	/* XXX: Chose randomly */
+	pkg_data->rotation_cpu = cpu;
+
+	cqm_pkgs_data[pkg_id] = pkg_data;
+	return 0;
+}
+
+/*
+ * Allocate one prmid for every rmid in [0, max_rmid] of package @pkg_id and
+ * record each of them in the package's prmids_by_rmid[] map.
+ *
+ * Returns 0 on success, -EINVAL when @pkg_id is out of range or has no
+ * pkg_data, and -ENOMEM when an allocation fails; in the latter case every
+ * prmid allocated by this call is freed again.
+ */
+static int intel_cqm_setup_pkg_prmid_pools(u16 pkg_id)
+{
+	int r;
+	unsigned long flags;
+	struct prmid *prmid;
+	struct pkg_data *pkg_data;
+
+	/* Validate @pkg_id before it is used to index cqm_pkgs_data[]. */
+	if (!__valid_pkg_id(pkg_id))
+		return -EINVAL;
+
+	pkg_data = cqm_pkgs_data[pkg_id];
+	if (!pkg_data)
+		return -EINVAL;
+
+	for (r = 0; r <= pkg_data->max_rmid; r++) {
+		prmid = kmalloc_node(sizeof(*prmid), GFP_KERNEL,
+				     cpu_to_node(pkg_data->rotation_cpu));
+		if (!prmid)
+			goto fail;
+
+		atomic64_set(&prmid->last_read_value, 0L);
+		atomic64_set(&prmid->last_read_time, 0L);
+		INIT_LIST_HEAD(&prmid->pool_entry);
+		prmid->rmid = r;
+
+		/*
+		 * Lock needed if called during CPU hotplug.
+		 *
+		 * RMID 0 is special and makes the root of rmid hierarchy
+		 * (original note; nothing in this function treats it
+		 * differently, and no prmid is added to free_prmids_pool).
+		 */
+		raw_spin_lock_irqsave_nested(
+			&pkg_data->pkg_data_lock, flags, pkg_id);
+		pkg_data->prmids_by_rmid[r] = prmid;
+		raw_spin_unlock_irqrestore(&pkg_data->pkg_data_lock, flags);
+	}
+	return 0;
+
+fail:
+	/*
+	 * Slots [0, r) hold the prmids allocated so far. Unwind through the
+	 * map so that each one is unpublished and freed exactly once.
+	 */
+	while (r--) {
+		raw_spin_lock_irqsave_nested(
+			&pkg_data->pkg_data_lock, flags, pkg_id);
+		prmid = pkg_data->prmids_by_rmid[r];
+		pkg_data->prmids_by_rmid[r] = NULL;
+		raw_spin_unlock_irqrestore(&pkg_data->pkg_data_lock, flags);
+
+		kfree(prmid);
+	}
+	return -ENOMEM;
+}
+
+
+/*
+ * Decide whether @a and @b measure the same set of tasks.
+ *
+ * Events that do are meant to share a single RMID.
+ */
+static bool __match_event(struct perf_event *a, struct perf_event *b)
+{
+	const bool a_task = a->attach_state & PERF_ATTACH_TASK;
+	const bool b_task = b->attach_state & PERF_ATTACH_TASK;
+
+	/* A per-cpu event never matches a task event. */
+	if (a_task != b_task)
+		return false;
+
+#ifdef CONFIG_CGROUP_PERF
+	/* The cgroup pointers have to agree as well. */
+	if (a->cgrp != b->cgrp)
+		return false;
+#endif
+
+	/* Neither is a task event: a cgroup or a non-task cpu event. */
+	if (!b_task)
+		return true;
+
+	/*
+	 * Task events match when they target the same task or when @b is
+	 * an event inherited from @a.
+	 */
+	return a->hw.target == b->hw.target || b->parent == a;
+}
+
+static struct pmu intel_cqm_pmu;
+
+/*
+ * Look for an existing cache group that @event belongs to.
+ *
+ * Walks cache_groups and stores the first entry accepted by
+ * __match_event() in *@group; *@group is left untouched when nothing
+ * matches. Always returns 0.
+ */
+static int
+intel_cqm_setup_event(struct perf_event *event, struct perf_event **group)
+{
+	struct perf_event *leader;
+
+	list_for_each_entry(leader, &cache_groups, hw.cqm_event_groups_entry) {
+		if (!__match_event(leader, event))
+			continue;
+
+		*group = leader;
+		break;
+	}
+
+	return 0;
+}
+
+/*
+ * Read current package immediately and remote pkg (if any) from cache.
+ * The body is still empty here, so calling it has no effect.
+ */
+static void intel_cqm_event_read(struct perf_event *event)
+{
+}
+
+/* Clear PERF_HES_STOPPED on @event if it is set; @mode is unused. */
+static void intel_cqm_event_start(struct perf_event *event, int mode)
+{
+	if (event->hw.state & PERF_HES_STOPPED)
+		event->hw.state &= ~PERF_HES_STOPPED;
+}
+
+/* Set PERF_HES_STOPPED on @event unless it is already set; @mode is unused. */
+static void intel_cqm_event_stop(struct perf_event *event, int mode)
+{
+	if (!(event->hw.state & PERF_HES_STOPPED))
+		event->hw.state |= PERF_HES_STOPPED;
+}
+
+/* Put @event into the stopped state; @mode is unused. Always returns 0. */
+static int intel_cqm_event_add(struct perf_event *event, int mode)
+{
+	struct hw_perf_event *hwc = &event->hw;
+
+	hwc->state = PERF_HES_STOPPED;
+	return 0;
+}
+
+/*
+ * True when @event's cqm_event_groups_entry is linked into a list, which
+ * is how intel_cqm_event_init() records group leaders in cache_groups.
+ */
+static inline bool cqm_group_leader(struct perf_event *event)
+{
+	if (list_empty(&event->hw.cqm_event_groups_entry))
+		return false;
+
+	return true;
+}
+
+/*
+ * Destroy callback: unlink @event from its cache group under cqm_mutex.
+ *
+ * When @event led the group, the lead passes to the next event of the
+ * group; when it was the only member, the group is dropped from
+ * cache_groups.
+ */
+static void intel_cqm_event_destroy(struct perf_event *event)
+{
+	struct perf_event *successor = NULL;
+
+	mutex_lock(&cqm_mutex);
+
+	/* Remember a sibling (if any) before leaving the group's list. */
+	if (!list_empty(&event->hw.cqm_event_group_entry)) {
+		successor = list_first_entry(&event->hw.cqm_event_group_entry,
+					     struct perf_event,
+					     hw.cqm_event_group_entry);
+		list_del(&event->hw.cqm_event_group_entry);
+	}
+
+	if (cqm_group_leader(event)) {
+		if (successor) {
+			/* The sibling takes over the slot in cache_groups. */
+			list_replace(&event->hw.cqm_event_groups_entry,
+				     &successor->hw.cqm_event_groups_entry);
+		} else {
+			/* Event is the only event in cache group. */
+			list_del(&event->hw.cqm_event_groups_entry);
+		}
+	}
+
+	mutex_unlock(&cqm_mutex);
+}
+
+/*
+ * event_init callback: validate @event and attach it to a cache group.
+ *
+ * Returns -ENOENT when the event is not for this PMU, -EINVAL for
+ * unsupported config bits, exclude_* filters or sampling, otherwise the
+ * result of intel_cqm_setup_event(). A matching group gains @event as a
+ * member; without a match @event starts a new group in cache_groups.
+ */
+static int intel_cqm_event_init(struct perf_event *event)
+{
+	struct perf_event_attr *attr = &event->attr;
+	struct perf_event *group = NULL;
+	int ret;
+
+	if (attr->type != intel_cqm_pmu.type)
+		return -ENOENT;
+
+	if (attr->config & ~QOS_EVENT_MASK)
+		return -EINVAL;
+
+	/* unsupported modes and filters */
+	if (attr->exclude_user ||
+	    attr->exclude_kernel ||
+	    attr->exclude_hv ||
+	    attr->exclude_idle ||
+	    attr->exclude_host ||
+	    attr->exclude_guest ||
+	    attr->sample_period) /* no sampling */
+		return -EINVAL;
+
+	INIT_LIST_HEAD(&event->hw.cqm_event_groups_entry);
+	INIT_LIST_HEAD(&event->hw.cqm_event_group_entry);
+
+	event->destroy = intel_cqm_event_destroy;
+
+	mutex_lock(&cqm_mutex);
+
+	ret = intel_cqm_setup_event(event, &group);
+	if (ret)
+		goto unlock;
+
+	if (group)
+		list_add_tail(&event->hw.cqm_event_group_entry,
+			      &group->hw.cqm_event_group_entry);
+	else
+		list_add_tail(&event->hw.cqm_event_groups_entry,
+			      &cache_groups);
+
+unlock:
+	mutex_unlock(&cqm_mutex);
+
+	return ret;
+}
+
+/*
+ * Event strings for llc_occupancy and its .per-pkg, .unit, .scale and
+ * .snapshot companions. The .scale string starts out NULL and is filled in
+ * by intel_cqm_init() from cqm_l3_scale.
+ */
+EVENT_ATTR_STR(llc_occupancy, intel_cqm_llc, "event=0x01");
+EVENT_ATTR_STR(llc_occupancy.per-pkg, intel_cqm_llc_pkg, "1");
+EVENT_ATTR_STR(llc_occupancy.unit, intel_cqm_llc_unit, "Bytes");
+EVENT_ATTR_STR(llc_occupancy.scale, intel_cqm_llc_scale, NULL);
+EVENT_ATTR_STR(llc_occupancy.snapshot, intel_cqm_llc_snapshot, "1");
+
+/* NULL-terminated list of the event attributes declared above. */
+static struct attribute *intel_cqm_events_attr[] = {
+ EVENT_PTR(intel_cqm_llc),
+ EVENT_PTR(intel_cqm_llc_pkg),
+ EVENT_PTR(intel_cqm_llc_unit),
+ EVENT_PTR(intel_cqm_llc_scale),
+ EVENT_PTR(intel_cqm_llc_snapshot),
+ NULL,
+};
+
+/* Attribute group named "events". */
+static struct attribute_group intel_cqm_events_group = {
+ .name = "events",
+ .attrs = intel_cqm_events_attr,
+};
+
+/* The event selector occupies config bits 0-7. */
+PMU_FORMAT_ATTR(event, "config:0-7");
+static struct attribute *intel_cqm_formats_attr[] = {
+ &format_attr_event.attr,
+ NULL,
+};
+
+/* Attribute group named "format". */
+static struct attribute_group intel_cqm_format_group = {
+ .name = "format",
+ .attrs = intel_cqm_formats_attr,
+};
+
+/* NULL-terminated list of groups referenced by intel_cqm_pmu.attr_groups. */
+static const struct attribute_group *intel_cqm_attr_groups[] = {
+ &intel_cqm_events_group,
+ &intel_cqm_format_group,
+ NULL,
+};
+
+/*
+ * PMU description passed to perf_pmu_register() by intel_cqm_init().
+ * Note that .del and .stop both point at intel_cqm_event_stop().
+ */
+static struct pmu intel_cqm_pmu = {
+ .hrtimer_interval_ms = CQM_DEFAULT_ROTATION_PERIOD,
+ .attr_groups = intel_cqm_attr_groups,
+ .task_ctx_nr = perf_sw_context,
+ .event_init = intel_cqm_event_init,
+ .add = intel_cqm_event_add,
+ .del = intel_cqm_event_stop,
+ .start = intel_cqm_event_start,
+ .stop = intel_cqm_event_stop,
+ .read = intel_cqm_event_read,
+};
+
+/*
+ * Make @cpu the rotation cpu of its package if the package has none.
+ *
+ * "None" means any value that is not a valid cpu number: a negative value,
+ * or a value >= nr_cpu_ids such as the one intel_cqm_cpu_exit() stores
+ * when cpumask_any_but() finds no other cpu. An existing reader is left
+ * alone.
+ */
+static inline void cqm_pick_event_reader(int cpu)
+{
+	u16 pkg_id = topology_physical_package_id(cpu);
+	struct pkg_data *pkg_data = cqm_pkgs_data[pkg_id];
+	int reader = pkg_data->rotation_cpu;
+
+	/* XXX: lock, check if rotation cpu is online, maybe */
+	/*
+	 * Pick a reader if there isn't one already. The unsigned cast makes
+	 * negative values compare as out of range too.
+	 */
+	if ((unsigned int)reader >= nr_cpu_ids)
+		pkg_data->rotation_cpu = cpu;
+}
+
+/*
+ * Reset @cpu's pqr_state (rmid and closid to 0) and warn when its max rmid
+ * or occupancy scale disagrees with the package's / driver's values.
+ */
+static void intel_cqm_cpu_starting(unsigned int cpu)
+{
+	u16 pkg_id = topology_physical_package_id(cpu);
+	struct cpuinfo_x86 *c = &cpu_data(cpu);
+	struct intel_pqr_state *state = &per_cpu(pqr_state, cpu);
+
+	state->closid = 0;
+	state->rmid = 0;
+
+	/* XXX: lock */
+	/* XXX: Make sure this case is handled when hotplug happens. */
+	WARN_ON(cqm_pkgs_data[pkg_id]->max_rmid != c->x86_cache_max_rmid);
+	WARN_ON(cqm_l3_scale != c->x86_cache_occ_scale);
+}
+
+/*
+ * When @cpu is its package's rotation cpu, hand that role to some other
+ * cpu taken from topology_core_cpumask(@cpu).
+ */
+static void intel_cqm_cpu_exit(unsigned int cpu)
+{
+	u16 pkg_id = topology_physical_package_id(cpu);
+	struct pkg_data *pkg_data = cqm_pkgs_data[pkg_id];
+
+	/* Only the designated cqm reader needs a replacement. */
+	if (pkg_data->rotation_cpu == cpu) {
+		/* XXX: do remove unused packages */
+		pkg_data->rotation_cpu =
+			cpumask_any_but(topology_core_cpumask(cpu), cpu);
+	}
+}
+
+/*
+ * CPU hotplug notifier.
+ *
+ * CPU_DOWN_PREPARE: give up the rotation-cpu role if @hcpu holds it.
+ * CPU_STARTING: make sure the cpu's package has pkg_data, then reset the
+ * cpu's state and pick a rotation cpu if needed.
+ *
+ * Always returns NOTIFY_OK.
+ *
+ * NOTE(review): pkg_data_init_cpu() allocates with GFP_KERNEL; confirm
+ * that is permitted in the context CPU_STARTING callbacks run in.
+ */
+static int intel_cqm_cpu_notifier(struct notifier_block *nb,
+				  unsigned long action, void *hcpu)
+{
+	unsigned int cpu = (unsigned long)hcpu;
+	int ret;
+
+	switch (action & ~CPU_TASKS_FROZEN) {
+	case CPU_DOWN_PREPARE:
+		intel_cqm_cpu_exit(cpu);
+		break;
+	case CPU_STARTING:
+		ret = pkg_data_init_cpu(cpu);
+		if (ret) {
+			/*
+			 * Without pkg_data the two calls below would
+			 * dereference a NULL cqm_pkgs_data[] slot.
+			 */
+			pr_err("Intel CQM: no package data for cpu %u: %d\n",
+			       cpu, ret);
+			break;
+		}
+		intel_cqm_cpu_starting(cpu);
+		cqm_pick_event_reader(cpu);
+		break;
+	}
+
+	return NOTIFY_OK;
+}
+
+/* CPU match table checked by intel_cqm_init() through x86_match_cpu(). */
+static const struct x86_cpu_id intel_cqm_match[] = {
+ { .vendor = X86_VENDOR_INTEL, .feature = X86_FEATURE_CQM_OCCUP_LLC },
+ {}
+};
+
+/*
+ * Driver init: create the per-package data for every online cpu, derive
+ * the upper bound for the occupancy threshold, publish the scale
+ * attribute string, hook cpu hotplug and register the PMU.
+ *
+ * Returns 0 on success or a negative errno.
+ *
+ * NOTE(review): the error path does not release cqm_pkgs_data, the
+ * per-package data or the scale string. Freeing them is only safe before
+ * the hotplug notifier is registered, because the notifier uses them.
+ */
+static int __init intel_cqm_init(void)
+{
+	char *str, scale[20];
+	int i, cpu, ret = 0;
+	u32 min_max_rmid = U32_MAX;
+
+	if (!x86_match_cpu(intel_cqm_match))
+		return -ENODEV;
+
+	cqm_l3_scale = boot_cpu_data.x86_cache_occ_scale;
+	if (WARN_ON(cqm_l3_scale == 0))
+		cqm_l3_scale = 1;
+
+	/* Zeroed array: a NULL slot means "no data for this package". */
+	cqm_pkgs_data = kcalloc(topology_max_packages(),
+				sizeof(struct pkg_data *), GFP_KERNEL);
+	if (!cqm_pkgs_data)
+		return -ENOMEM;
+
+	/*
+	 * It's possible that not all resources support the same number
+	 * of RMIDs. Instead of making scheduling much more complicated
+	 * (where we have to match a task's RMID to a cpu that supports
+	 * that many RMIDs) just find the minimum RMIDs supported across
+	 * all cpus.
+	 *
+	 * Also, check that the scales match on all cpus.
+	 */
+	cpu_notifier_register_begin();
+
+	/* XXX: assert all cpus in pkg have same nr rmids (they should). */
+	for_each_online_cpu(cpu) {
+		ret = pkg_data_init_cpu(cpu);
+		if (ret)
+			goto error;
+	}
+
+	/*
+	 * Select the minimum of the maximum rmids to use as limit for
+	 * threshold. XXX: per-package threshold.
+	 */
+	cqm_pkg_id_for_each_online(i) {
+		if (cqm_pkgs_data[i]->max_rmid < min_max_rmid)
+			min_max_rmid = cqm_pkgs_data[i]->max_rmid;
+
+		ret = intel_cqm_setup_pkg_prmid_pools(i);
+		if (ret)
+			goto error;
+	}
+
+	/* No package was visited: the division below would be by zero. */
+	if (WARN_ON(min_max_rmid == U32_MAX)) {
+		ret = -ENODEV;
+		goto error;
+	}
+
+	/*
+	 * A reasonable upper limit on the max threshold is the number
+	 * of lines tagged per RMID if all RMIDs have the same number of
+	 * lines tagged in the LLC.
+	 *
+	 * For a 35MB LLC and 56 RMIDs, this is ~1.8% of the LLC.
+	 */
+	__intel_cqm_max_threshold =
+		boot_cpu_data.x86_cache_size * 1024 / (min_max_rmid + 1);
+
+	snprintf(scale, sizeof(scale), "%u", cqm_l3_scale);
+	str = kstrdup(scale, GFP_KERNEL);
+	if (!str) {
+		ret = -ENOMEM;
+		goto error;
+	}
+
+	event_attr_intel_cqm_llc_scale.event_str = str;
+
+	for_each_online_cpu(i) {
+		intel_cqm_cpu_starting(i);
+		cqm_pick_event_reader(i);
+	}
+
+	__perf_cpu_notifier(intel_cqm_cpu_notifier);
+
+	ret = perf_pmu_register(&intel_cqm_pmu, "intel_cqm", -1);
+	if (ret)
+		goto error;
+
+	cpu_notifier_register_done();
+
+	pr_info("Intel CQM monitoring enabled with at least %u rmids per package.\n",
+		min_max_rmid + 1);
+
+	return ret;
+
+error:
+	pr_err("Intel CQM perf registration failed: %d\n", ret);
+	cpu_notifier_register_done();
+
+	return ret;
+}
+
+device_initcall(intel_cqm_init);
diff --git a/arch/x86/events/intel/cqm.h b/arch/x86/events/intel/cqm.h
index 06964cd..08623b5 100644
--- a/arch/x86/events/intel/cqm.h
+++ b/arch/x86/events/intel/cqm.h
@@ -41,9 +41,71 @@ struct prmid {
};
/*
+ * struct pkg_data: Per-package CQM data.
+ * @max_rmid: Max rmid valid for cpus in this package.
+ * @prmids_by_rmid: Utility mapping between rmid values and prmids.
+ * XXX: Make it an array of prmids.
+ * @free_prmids_pool: Free prmids.
+ * @pkg_data_mutex: Hold for stability when modifying pmonrs
+ * hierarchy.
+ * @pkg_data_lock: Hold to protect variables that may be accessed
+ * during process scheduling. The locks for all
+ * packages must be held when modifying the monr
+ * hierarchy.
+ * @rotation_cpu: CPU to run @rotation_work on, it must be in the
+ * package associated to this instance of pkg_data.
+ * (@rotation_work is not a member of this struct.)
+ */
+struct pkg_data {
+ u32 max_rmid;
+ /*
+ * Quick map from rmids to prmids. Indexed by rmid; allocated with
+ * max_rmid + 1 entries by pkg_data_init_cpu().
+ */
+ struct prmid **prmids_by_rmid;
+
+ /*
+ * Pools of prmids used in rotation logic.
+ */
+ struct list_head free_prmids_pool;
+
+ struct mutex pkg_data_mutex;
+ raw_spinlock_t pkg_data_lock;
+
+ int rotation_cpu;
+};
+
+extern struct pkg_data **cqm_pkgs_data;
+
+/*
+ * Return the smallest package id greater than @pkg_id whose slot in
+ * cqm_pkgs_data[] is set, or a value >= topology_max_packages() when there
+ * is none.
+ */
+static inline u16 __cqm_pkgs_data_next_online(u16 pkg_id)
+{
+	/* Bounds-check the index before it is used to read the array. */
+	while (++pkg_id < topology_max_packages() && !cqm_pkgs_data[pkg_id])
+		;
+	return pkg_id;
+}
+
+/*
+ * Return 0 when package 0 has pkg_data, otherwise whatever
+ * __cqm_pkgs_data_next_online(0) finds.
+ */
+static inline u16 __cqm_pkgs_data_first_online(void)
+{
+	return cqm_pkgs_data[0] ? 0 : __cqm_pkgs_data_next_online(0);
+}
+
+/*
+ * Iterate over the ids of all packages that have pkg_data. @pkg_id__ must
+ * be an lvalue; it is parenthesized so any lvalue expression is safe.
+ */
+#define cqm_pkg_id_for_each_online(pkg_id__)				\
+	for ((pkg_id__) = __cqm_pkgs_data_first_online();		\
+	     (pkg_id__) < topology_max_packages();			\
+	     (pkg_id__) = __cqm_pkgs_data_next_online(pkg_id__))
+
+/* Access @member of the pkg_data of the package that @pmonr belongs to. */
+#define __pkg_data(pmonr, member) (cqm_pkgs_data[(pmonr)->pkg_id]->member)
+
+/*
* Time between execution of rotation logic. The frequency of execution does
* not affect the rate at which RMIDs are recycled, except by the delay by the
* delay updating the prmid's and their pools.
* The rotation period is stored in pmu->hrtimer_interval_ms.
*/
#define CQM_DEFAULT_ROTATION_PERIOD 1200 /* ms */
+
+/*
+ * __intel_cqm_max_threshold provides an upper bound on the threshold,
+ * and is measured in bytes because it's exposed to userland.
+ * Its units are bytes; it must be scaled by cqm_l3_scale to obtain cache
+ * lines.
+ */
+static unsigned int __intel_cqm_max_threshold;
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 1417d3b..02b8e24 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -118,6 +118,13 @@ struct hw_perf_event {
/* for tp_event->class */
struct list_head tp_list;
};
+#ifdef CONFIG_INTEL_RDT
+ struct { /* intel_cqm */
+ void *cqm_monr;
+ struct list_head cqm_event_group_entry;
+ struct list_head cqm_event_groups_entry;
+ };
+#endif
struct { /* itrace */
int itrace_started;
};
--
2.8.0.rc3.226.g39d4020
Powered by blists - more mailing lists