[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20140127173420.GA9636@twins.programming.kicks-ass.net>
Date: Mon, 27 Jan 2014 18:34:20 +0100
From: Peter Zijlstra <peterz@...radead.org>
To: "H. Peter Anvin" <hpa@...or.com>
Cc: "Waskiewicz Jr, Peter P" <peter.p.waskiewicz.jr@...el.com>,
Tejun Heo <tj@...nel.org>,
Thomas Gleixner <tglx@...utronix.de>,
Ingo Molnar <mingo@...hat.com>, Li Zefan <lizefan@...wei.com>,
"containers@...ts.linux-foundation.org"
<containers@...ts.linux-foundation.org>,
"cgroups@...r.kernel.org" <cgroups@...r.kernel.org>,
"linux-kernel@...r.kernel.org" <linux-kernel@...r.kernel.org>,
Stephane Eranian <eranian@...gle.com>
Subject: Re: [PATCH 0/4] x86: Add Cache QoS Monitoring (CQM) support
On Tue, Jan 14, 2014 at 09:58:26AM -0800, H. Peter Anvin wrote:
> On 01/12/2014 11:55 PM, Peter Zijlstra wrote:
> >
> > The problem is, since there's a limited number of RMIDs we have to
> > rotate at some point, but since changing RMIDs is nondeterministic we
> > can't.
> >
>
> This is fundamentally the crux here. RMIDs are quite expensive for the
> hardware to implement, so they are limited - but recycling them is
> *very* expensive because you literally have to touch every line in the
> cache.
It's not a problem that changing the task:RMID map is expensive, what is
a problem is that there's no deterministic fashion of doing it.
That said; I think I've got a sort-of workaround for that. See the
largish comment near cache_pmu_rotate().
I've also illustrated how to use perf-cgroup for this.
The below is a rough draft, most if not all XXXs should be
fixed/finished. But given I don't actually have hardware that supports
this stuff (afaik) I couldn't be arsed.
---
include/linux/perf_event.h | 33 +
kernel/events/core.c | 22 -
arch/x86/kernel/cpu/perf_event_intel_cache.c | 687 ++++++++++++++++++++++++++++++++
3 files changed, 725 insertions(+), 17 deletions(-)
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -126,6 +126,14 @@ struct hw_perf_event {
/* for tp_event->class */
struct list_head tp_list;
};
+ struct { /* cache_pmu */
+ struct task_struct *cache_target;
+ int cache_state;
+ int cache_rmid;
+ struct list_head cache_events_entry;
+ struct list_head cache_groups_entry;
+ struct list_head cache_group_entry;
+ };
#ifdef CONFIG_HAVE_HW_BREAKPOINT
struct { /* breakpoint */
/*
@@ -526,6 +534,31 @@ struct perf_output_handle {
int page;
};
+#ifdef CONFIG_CGROUP_PERF
+
+struct perf_cgroup_info;
+
+/* Per-cgroup perf state; exported so PMU drivers can resolve task->cgroup. */
+struct perf_cgroup {
+ struct cgroup_subsys_state css;
+ struct perf_cgroup_info __percpu *info;
+};
+
+/*
+ * Must ensure cgroup is pinned (css_get) before calling
+ * this function. In other words, we cannot call this function
+ * if there is no cgroup event for the current CPU context.
+ *
+ * XXX: it's not safe to use this thing!!!
+ */
+static inline struct perf_cgroup *
+perf_cgroup_from_task(struct task_struct *task)
+{
+ return container_of(task_css(task, perf_subsys_id),
+ struct perf_cgroup, css);
+}
+
+#endif /* CONFIG_CGROUP_PERF */
+
#ifdef CONFIG_PERF_EVENTS
extern int perf_pmu_register(struct pmu *pmu, const char *name, int type);
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -329,23 +329,6 @@ struct perf_cgroup_info {
u64 timestamp;
};
-struct perf_cgroup {
- struct cgroup_subsys_state css;
- struct perf_cgroup_info __percpu *info;
-};
-
-/*
- * Must ensure cgroup is pinned (css_get) before calling
- * this function. In other words, we cannot call this function
- * if there is no cgroup event for the current CPU context.
- */
-static inline struct perf_cgroup *
-perf_cgroup_from_task(struct task_struct *task)
-{
- return container_of(task_css(task, perf_subsys_id),
- struct perf_cgroup, css);
-}
-
static inline bool
perf_cgroup_match(struct perf_event *event)
{
@@ -6711,6 +6694,11 @@ perf_event_alloc(struct perf_event_attr
if (task) {
event->attach_state = PERF_ATTACH_TASK;
+ /*
+ * XXX fix for cache_target, dynamic type won't have an easy test,
+ * maybe move target crap into generic event.
+ */
+
if (attr->type == PERF_TYPE_TRACEPOINT)
event->hw.tp_target = task;
#ifdef CONFIG_HAVE_HW_BREAKPOINT
--- /dev/null
+++ b/arch/x86/kernel/cpu/perf_event_intel_cache.c
@@ -0,0 +1,687 @@
+#include <linux/idr.h>
+#include <linux/perf_event.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+
+#include <asm/processor.h>
+
+
+#define MSR_IA32_PQR_ASSOC 0x0c8f
+#define MSR_IA32_QM_CTR 0x0c8e
+#define MSR_IA32_QM_EVTSEL 0x0c8d
+
+unsigned int max_rmid;
+
+unsigned int l3_scale; /* supposedly cacheline size */
+unsigned int l3_max_rmid;
+
+
+/* Per-cpu RMID scheduling state; guards the contents of MSR_IA32_PQR_ASSOC. */
+struct cache_pmu_state {
+	raw_spinlock_t	lock;	/* raw_spin_lock is a function, the type is raw_spinlock_t */
+	int		rmid;	/* RMID currently programmed into PQR_ASSOC */
+	int		cnt;	/* nr of started events sharing that RMID */
+};
+
+static DEFINE_PER_CPU(struct cache_pmu_state, state);
+
+/*
+ * Protects the global state, hold both for modification, hold either for
+ * stability.
+ *
+ * XXX we modify RMID with only cache_mutex held, racy!
+ */
+static DEFINE_MUTEX(cache_mutex);
+static DEFINE_RAW_SPINLOCK(cache_lock);
+
+static unsigned long *cache_rmid_bitmap;
+
+/*
+ * All events
+ */
+static LIST_HEAD(cache_events);
+
+/*
+ * Groups of events that have the same target(s), one RMID per group.
+ */
+static LIST_HEAD(cache_groups);
+
+/*
+ * The new RMID we must not use until cache_pmu_stable().
+ * See cache_pmu_rotate().
+ */
+static unsigned long *cache_limbo_bitmap;
+
+/*
+ * The spare RMID that make rotation possible; keep out of the
+ * cache_rmid_bitmap to avoid it getting used for new events.
+ */
+static int cache_rotation_rmid;
+
+/*
+ * The freed RMIDs, see cache_pmu_rotate().
+ */
+static int cache_freed_nr;
+static int *cache_freed_rmid;
+
+/*
+ * One online cpu per package, for cache_pmu_stable().
+ */
+static cpumask_t cache_cpus;
+
+/*
+ * Allocate one unused RMID from the global bitmap.
+ *
+ * Returns < 0 on fail. Caller holds cache_mutex.
+ */
+static int __get_rmid(void)
+{
+ return bitmap_find_free_region(cache_rmid_bitmap, max_rmid, 0);
+}
+
+/* Return @rmid to the allocator; only for RMIDs proven untagged in the cache. */
+static void __put_rmid(int rmid)
+{
+ bitmap_release_region(cache_rmid_bitmap, rmid, 0);
+}
+
+/*
+ * Queue @rmid for later reuse; cachelines may still carry its tag.
+ * Needs a quiescent state before __put, see cache_pmu_stabilize().
+ */
+static void __free_rmid(int rmid)
+{
+ cache_freed_rmid[cache_freed_nr++] = rmid;
+}
+
+#define RMID_VAL_ERROR (1ULL << 63)
+#define RMID_VAL_UNAVAIL (1ULL << 62)
+
+/*
+ * Read the L3 occupancy counter for @rmid via QM_EVTSEL/QM_CTR.
+ *
+ * Both MSRs carry 64-bit values, so use the single-value wrmsrl/rdmsrl
+ * accessors; the two-argument wrmsr()/rdmsr() forms take separate lo/hi
+ * halves and do not even compile like this.
+ */
+static u64 __rmid_read(unsigned long rmid)
+{
+	u64 val;
+
+	/*
+	 * Ignore the SDM, this thing is _NOTHING_ like a regular perfcnt,
+	 * it just says that to increase confusion.
+	 *
+	 * Cast before shifting: on 32-bit, rmid << 32 would be undefined.
+	 */
+	wrmsrl(MSR_IA32_QM_EVTSEL, ((u64)rmid << 32) | 1);
+	rdmsrl(MSR_IA32_QM_CTR, val);
+
+	/*
+	 * Aside from the ERROR and UNAVAIL bits, assume this thing returns
+	 * the number of cachelines tagged with @rmid.
+	 */
+	return val;
+}
+
+/*
+ * IPI callback: clear *@info when any freed RMID on this package still
+ * reports tagged cachelines, i.e. quiescence has not been reached yet.
+ */
+static void smp_test_stable(void *info)
+{
+	bool *stable = info;
+	int idx;
+
+	for (idx = 0; idx < cache_freed_nr; idx++) {
+		if (__rmid_read(cache_freed_rmid[idx]))
+			*stable = false;
+	}
+}
+
+/*
+ * Test if the freed RMIDs are unused on every package; see the comment
+ * near cache_pmu_rotate().
+ */
+static bool cache_pmu_is_stable(void)
+{
+	bool used = true;
+
+	/*
+	 * on_each_cpu_mask() also runs the callback on the local CPU when it
+	 * is in the mask; smp_call_function_many() silently skips the local
+	 * CPU and would leave this package untested.
+	 */
+	on_each_cpu_mask(&cache_cpus, smp_test_stable, &used, true);
+
+	return used;
+}
+
+/*
+ * Quiescent state; wait for all the 'freed' RMIDs to become unused. After this
+ * we can reuse them and know that the current set of active RMIDs is
+ * stable.
+ */
+static void cache_pmu_stabilize(void)
+{
+ int i = 0;
+
+ if (!cache_freed_nr)
+ return;
+
+ /*
+ * Now wait until the old RMID drops back to 0 again, this means all
+ * cachelines have acquired a new tag and the new RMID is now stable.
+ */
+ while (!cache_pmu_is_stable()) {
+ /*
+ * XXX adaptive timeout? Ideally the hardware would get us an
+ * interrupt :/
+ */
+ schedule_timeout_uninterruptible(1);
+ }
+
+ bitmap_clear(cache_limbo_bitmap, 0, max_rmid);
+
+ /* Refill the spare rotation RMID first if rotation consumed it. */
+ if (cache_rotation_rmid <= 0) {
+ cache_rotation_rmid = cache_freed_rmid[0];
+ i++;
+ }
+
+ /* The remainder are provably untagged now; back to the allocator. */
+ for (; i < cache_freed_nr; i++)
+ __put_rmid(cache_freed_rmid[i]);
+
+ cache_freed_nr = 0;
+}
+
+/*
+ * Exchange the RMID of a group of events; returns the previous RMID.
+ */
+static unsigned long cache_group_xchg_rmid(struct perf_event *group, unsigned long rmid)
+{
+	unsigned long prev_rmid = group->hw.cache_rmid;
+	struct perf_event *sibling;
+
+	/* The leader and every sibling always share one RMID. */
+	group->hw.cache_rmid = rmid;
+	list_for_each_entry(sibling, &group->hw.cache_group_entry, hw.cache_group_entry)
+		sibling->hw.cache_rmid = rmid;
+
+	return prev_rmid;
+}
+
+/*
+ * Determine if @a and @b measure the same set of tasks.
+ */
+static bool __match_event(struct perf_event *a, struct perf_event *b)
+{
+	/* A per-task event never matches a cgroup/machine-wide one. */
+	if ((a->attach_state & PERF_ATTACH_TASK) !=
+	    (b->attach_state & PERF_ATTACH_TASK))
+		return false;
+
+	/* Two task events match iff they target the same task. */
+	if (a->attach_state & PERF_ATTACH_TASK)
+		return a->hw.cache_target == b->hw.cache_target;
+
+	/* not task */
+
+#ifdef CONFIG_CGROUP_PERF
+	/*
+	 * Differing cgroups -- or one cgroup event vs one machine-wide
+	 * event -- measure different task sets; the old fall-through to
+	 * 'return true' wrongly matched those.
+	 */
+	if (a->cgrp != b->cgrp)
+		return false;
+	if (a->cgrp)
+		return true;
+#endif
+
+	return true; /* if not task or cgroup, we're machine wide */
+}
+
+/* Resolve the cgroup an event observes, directly or through its target task. */
+static struct perf_cgroup *event_to_cgroup(struct perf_event *event)
+{
+	struct perf_cgroup *cgrp = event->cgrp;
+
+	if (!cgrp && (event->attach_state & PERF_ATTACH_TASK)) /* XXX */
+		cgrp = perf_cgroup_from_task(event->hw.cache_target);
+
+	return cgrp;
+}
+
+/*
+ * Determine if @a's tasks intersect with @b's tasks
+ */
+static bool __conflict_event(struct perf_event *a, struct perf_event *b)
+{
+#ifdef CONFIG_CGROUP_PERF
+	struct perf_cgroup *ac, *bc;	/* was 'struct perf_cb', which doesn't exist */
+
+	ac = event_to_cgroup(a);
+	bc = event_to_cgroup(b);
+
+	if (!ac || !bc) {
+		/*
+		 * If either is NULL, its a system wide event and that
+		 * always conflicts with a cgroup one.
+		 *
+		 * If both are system wide, __match_event() should've
+		 * been true and we'll never get here, if we did fail.
+		 */
+		return true;
+	}
+
+	/*
+	 * If one is a parent of the other, we've got an intersection.
+	 */
+	if (cgroup_is_descendant(ac->css.cgroup, bc->css.cgroup) ||
+	    cgroup_is_descendant(bc->css.cgroup, ac->css.cgroup))
+		return true;
+#endif
+
+	/*
+	 * If one of them is not a task, same story as above with cgroups.
+	 */
+	if (!(a->attach_state & PERF_ATTACH_TASK) ||
+	    !(b->attach_state & PERF_ATTACH_TASK))
+		return true;
+
+	/*
+	 * Again, if they're the same __match_event() should've caught us, if not fail.
+	 */
+	if (a->hw.cache_target == b->hw.cache_target)
+		return true;
+
+	/*
+	 * Must be non-overlapping.
+	 */
+	return false;
+}
+
+/*
+ * Attempt to rotate the groups and assign new RMIDs, ought to run from a
+ * delayed work or somesuch.
+ *
+ * Rotating RMIDs is complicated; firstly because the hardware doesn't give us
+ * any clues; secondly because of cgroups.
+ *
+ * There's problems with the hardware interface; when you change the task:RMID
+ * map cachelines retain their 'old' tags, giving a skewed picture. In order to
+ * work around this, we must always keep one free RMID.
+ *
+ * Rotation works by taking away an RMID from a group (the old RMID), and
+ * assigning the free RMID to another group (the new RMID). We must then wait
+ * for the old RMID to not be used (no cachelines tagged). This ensures that
+ * all cachelines are tagged with 'active' RMIDs. At this point we can start
+ * reading values for the new RMID and treat the old RMID as the free RMID for
+ * the next rotation.
+ *
+ * Secondly, since cgroups can nest, we must make sure to not program
+ * conflicting cgroups at the same time. A conflicting cgroup is one that has a
+ * parent<->child relation. After all, a task of the child cgroup will also be
+ * covered by the parent cgroup.
+ *
+ * Therefore, when selecting a new group, we must invalidate all conflicting
+ * groups. Rotation allows us to measure all (conflicting) groups
+ * sequentially.
+ *
+ * XXX there's a further problem in that because we do our own rotation and
+ * cheat with schedulability the event {enabled,running} times are incorrect.
+ *
+ * Returns false when there was nothing to rotate, true otherwise.
+ */
+static bool cache_pmu_rotate(void)
+{
+	struct perf_event *rotor, *group;
+	int rmid;
+
+	mutex_lock(&cache_mutex);
+
+	if (list_empty(&cache_groups))
+		goto unlock_mutex;
+
+	rotor = list_first_entry(&cache_groups, struct perf_event, hw.cache_groups_entry);
+
+	/* Take the rotor's RMID away; it becomes the spare after stabilization. */
+	raw_spin_lock_irq(&cache_lock);
+	list_del(&rotor->hw.cache_groups_entry);
+	rmid = cache_group_xchg_rmid(rotor, -1);
+	WARN_ON_ONCE(rmid <= 0); /* first entry must always have an RMID */
+	__free_rmid(rmid);
+	raw_spin_unlock_irq(&cache_lock);
+
+	/*
+	 * XXX O(n^2) schedulability
+	 */
+
+	list_for_each_entry(group, &cache_groups, hw.cache_groups_entry) {
+		bool conflicts = false;
+		struct perf_event *iter;
+
+		/* Only groups ahead of us in the list can conflict. */
+		list_for_each_entry(iter, &cache_groups, hw.cache_groups_entry) {
+			if (iter == group)
+				break;
+			if (__conflict_event(group, iter)) {
+				conflicts = true;
+				break;
+			}
+		}
+
+		if (conflicts && group->hw.cache_rmid > 0) {
+			rmid = cache_group_xchg_rmid(group, -1);
+			WARN_ON_ONCE(rmid <= 0);
+			__free_rmid(rmid);
+			continue;
+		}
+
+		if (!conflicts && group->hw.cache_rmid <= 0) {
+			rmid = __get_rmid();
+			if (rmid <= 0) {
+				/* Dip into the spare; stabilize refills it. */
+				rmid = cache_rotation_rmid;
+				cache_rotation_rmid = -1;
+			}
+			if (rmid <= 0)
+				break; /* we're out of RMIDs, more next time */
+
+			/*
+			 * The new RMID may still be tagged from a previous
+			 * life; keep it in limbo (unreadable) until
+			 * cache_pmu_stabilize(). Only mark it once we know
+			 * it is valid.
+			 */
+			set_bit(rmid, cache_limbo_bitmap);
+
+			rmid = cache_group_xchg_rmid(group, rmid);
+			WARN_ON_ONCE(rmid > 0);
+			continue;
+		}
+
+		/*
+		 * either we conflict and do not have an RMID -> good,
+		 * or we do not conflict and have an RMID -> also good.
+		 */
+	}
+
+	raw_spin_lock_irq(&cache_lock);
+	list_add_tail(&rotor->hw.cache_groups_entry, &cache_groups);
+	raw_spin_unlock_irq(&cache_lock);
+
+	/*
+	 * XXX force a PMU reprogram here such that the new RMIDs are in
+	 * effect.
+	 */
+
+	cache_pmu_stabilize();
+
+	mutex_unlock(&cache_mutex);
+
+	/*
+	 * XXX reschedule work.
+	 */
+	return true;
+
+unlock_mutex:
+	mutex_unlock(&cache_mutex);
+	return false;
+}
+
+/*
+ * Find an existing group measuring the same target set as @event, or
+ * allocate an RMID for it as a new group leader.
+ *
+ * Called with cache_mutex held.
+ *
+ * Returns the group leader to join, or NULL when @event leads a new group.
+ */
+static struct perf_event *cache_pmu_setup_event(struct perf_event *event)
+{
+ struct perf_event *iter;
+ int rmid = 0; /* unset */
+
+ list_for_each_entry(iter, &cache_groups, hw.cache_groups_entry) {
+ if (__match_event(iter, event)) {
+ event->hw.cache_rmid = iter->hw.cache_rmid;
+ return iter;
+ }
+ if (__conflict_event(iter, event))
+ rmid = -1; /* conflicting rmid */
+ }
+
+ /*
+ * NOTE(review): on conflict, event->hw.cache_rmid is left untouched
+ * (presumably 0 from allocation) instead of being set to -1; this
+ * relies on all users treating rmid <= 0 as "no RMID" -- verify.
+ */
+ if (!rmid) {
+ /* XXX lacks stabilization */
+ event->hw.cache_rmid = __get_rmid();
+ }
+
+ return NULL;
+}
+
+/* Snapshot the occupancy counter for @event into event->count. */
+static void cache_pmu_event_read(struct perf_event *event)
+{
+	int rmid = event->hw.cache_rmid;
+	u64 val = RMID_VAL_UNAVAIL;
+
+	/*
+	 * Rotation can leave us without an RMID (-1); storing that in an
+	 * unsigned long and feeding it to test_bit() would index far out
+	 * of the bitmap, so bail early.
+	 */
+	if (rmid <= 0)
+		return;
+
+	/* Limbo RMIDs may still carry stale tags; don't trust them yet. */
+	if (!test_bit(rmid, cache_limbo_bitmap))
+		val = __rmid_read(rmid);
+
+	/*
+	 * Ignore this reading on error states and do not update the value.
+	 */
+	if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL))
+		return;
+
+	val *= l3_scale; /* cachelines -> bytes */
+
+	local64_set(&event->count, val);
+}
+
+static void cache_pmu_event_start(struct perf_event *event, int mode)
+{
+	/*
+	 * this_cpu_ptr(&state), not __get_cpu_var(&state): the latter takes
+	 * the per-cpu variable itself, and naming the local 'state' would
+	 * shadow it -- hence 'pqr_state'.
+	 */
+	struct cache_pmu_state *pqr_state = this_cpu_ptr(&state);
+	int rmid = event->hw.cache_rmid;	/* was used below but never declared */
+	unsigned long flags;
+
+	if (!(event->hw.cache_state & PERF_HES_STOPPED))
+		return;
+
+	event->hw.cache_state &= ~PERF_HES_STOPPED;
+
+	raw_spin_lock_irqsave(&pqr_state->lock, flags);
+	/* All running events on this CPU must agree on the one PQR RMID. */
+	if (pqr_state->cnt++)
+		WARN_ON_ONCE(pqr_state->rmid != rmid);
+	else
+		WARN_ON_ONCE(pqr_state->rmid);
+	pqr_state->rmid = rmid;
+	wrmsr(MSR_IA32_PQR_ASSOC, pqr_state->rmid, 0);	/* wrmsr() takes lo, hi */
+	raw_spin_unlock_irqrestore(&pqr_state->lock, flags);
+}
+
+static void cache_pmu_event_stop(struct perf_event *event, int mode)
+{
+	struct cache_pmu_state *pqr_state = this_cpu_ptr(&state);
+	unsigned long flags;
+
+	if (event->hw.cache_state & PERF_HES_STOPPED)
+		return;
+
+	event->hw.cache_state |= PERF_HES_STOPPED;
+
+	raw_spin_lock_irqsave(&pqr_state->lock, flags);
+	/* One last reading before we drop the RMID association. */
+	cache_pmu_event_read(event);
+	if (!--pqr_state->cnt) {
+		pqr_state->rmid = 0;
+		wrmsr(MSR_IA32_PQR_ASSOC, 0, 0);
+	} else {
+		WARN_ON_ONCE(!pqr_state->rmid);
+	}
+	/* was unbalanced: the unlock sat inside an unterminated else block */
+	raw_spin_unlock_irqrestore(&pqr_state->lock, flags);
+}
+
+static int cache_pmu_event_add(struct perf_event *event, int mode)
+{
+	unsigned long flags;
+	int rmid;
+
+	raw_spin_lock_irqsave(&cache_lock, flags);
+
+	event->hw.cache_state = PERF_HES_STOPPED;
+	rmid = event->hw.cache_rmid;
+	/* Rotation may have taken our RMID away; nothing to count then. */
+	if (rmid <= 0)
+		goto unlock;
+
+	if (mode & PERF_EF_START)
+		cache_pmu_event_start(event, mode);
+
+unlock:
+	raw_spin_unlock_irqrestore(&cache_lock, flags);
+
+	return 0;
+}
+
+static void cache_pmu_event_del(struct perf_event *event, int mode)
+{
+	unsigned long flags;
+
+	raw_spin_lock_irqsave(&cache_lock, flags);
+	cache_pmu_event_stop(event, mode);
+	raw_spin_unlock_irqrestore(&cache_lock, flags);
+	/* void function: the stray 'return 0;' was a compile error */
+}
+
+/*
+ * Unlink @event from the global event list and from its group; promote a
+ * sibling to group leader or retire the group's RMID when it was the last.
+ *
+ * NOTE(review): the list_empty() tests below require cache_group_entry and
+ * cache_groups_entry to be initialized list heads; a merely zeroed
+ * (kzalloc'ed) list_head is NOT empty by list_empty() -- verify the init
+ * path sets these up.
+ */
+static void cache_pmu_event_destroy(struct perf_event *event)
+{
+ struct perf_event *group_other = NULL;
+
+ mutex_lock(&cache_mutex);
+ raw_spin_lock_irq(&cache_lock);
+
+ list_del(&event->hw.cache_events_entry);
+
+ /*
+ * If there's another event in this group...
+ */
+ if (!list_empty(&event->hw.cache_group_entry)) {
+ group_other = list_first_entry(&event->hw.cache_group_entry,
+ struct perf_event,
+ hw.cache_group_entry);
+ list_del(&event->hw.cache_group_entry);
+ }
+ /*
+ * And we're the group leader..
+ */
+ if (!list_empty(&event->hw.cache_groups_entry)) {
+ /*
+ * If there was a group_other, make that leader, otherwise
+ * destroy the group and return the RMID.
+ */
+ if (group_other) {
+ list_replace(&event->hw.cache_groups_entry,
+ &group_other->hw.cache_groups_entry);
+ } else {
+ int rmid = event->hw.cache_rmid;
+ if (rmid > 0)
+ __put_rmid(rmid);
+ list_del(&event->hw.cache_groups_entry);
+ }
+ }
+
+ raw_spin_unlock_irq(&cache_lock);
+ mutex_unlock(&cache_mutex);
+}
+
+static struct pmu cache_pmu;
+
+/*
+ * Takes non-sampling task, cgroup or machine wide events.
+ *
+ * XXX there's a bit of a problem in that we cannot simply do the one event per
+ * node as one would want, since that one event would only get scheduled on the
+ * one cpu. But we want to 'schedule' the RMID on all CPUs.
+ *
+ * This means we want events for each CPU, however, that generates a lot of
+ * duplicate values out to userspace -- this is not to be helped unless we want
+ * to change the core code in some way.
+ */
+static int cache_pmu_event_init(struct perf_event *event)
+{
+	struct perf_event *group;
+
+	if (event->attr.type != cache_pmu.type)
+		return -ENOENT;
+
+	if (event->attr.config != 0)
+		return -EINVAL;
+
+	if (event->cpu == -1) /* must have per-cpu events; see above */
+		return -EINVAL;
+
+	/* unsupported modes and filters */
+	if (event->attr.exclude_user ||
+	    event->attr.exclude_kernel ||
+	    event->attr.exclude_hv ||
+	    event->attr.exclude_idle ||
+	    event->attr.exclude_host ||
+	    event->attr.exclude_guest ||
+	    event->attr.sample_period) /* no sampling */
+		return -EINVAL;
+
+	event->destroy = cache_pmu_event_destroy;
+
+	/*
+	 * cache_pmu_event_destroy() probes these with list_empty(); a
+	 * zeroed list_head is not 'empty', so they must be properly
+	 * initialized even when they never get linked anywhere.
+	 */
+	INIT_LIST_HEAD(&event->hw.cache_group_entry);
+	INIT_LIST_HEAD(&event->hw.cache_groups_entry);
+
+	mutex_lock(&cache_mutex);
+
+	group = cache_pmu_setup_event(event); /* will also set rmid */
+
+	raw_spin_lock_irq(&cache_lock);
+	if (group) {
+		event->hw.cache_rmid = group->hw.cache_rmid;
+		list_add_tail(&event->hw.cache_group_entry,
+			      &group->hw.cache_group_entry);
+	} else {
+		list_add_tail(&event->hw.cache_groups_entry,
+			      &cache_groups);
+	}
+
+	list_add_tail(&event->hw.cache_events_entry, &cache_events);
+	raw_spin_unlock_irq(&cache_lock);
+
+	mutex_unlock(&cache_mutex);
+
+	return 0;
+}
+
+/*
+ * Counting-only PMU: no sampling, no pmu_{enable,disable} pair; runs in
+ * the software context so event scheduling cannot fail.
+ */
+static struct pmu cache_pmu = {
+ .task_ctx_nr = perf_sw_context, /* we cheat: our add will never fail */
+ .event_init = cache_pmu_event_init,
+ .add = cache_pmu_event_add,
+ .del = cache_pmu_event_del,
+ .start = cache_pmu_event_start,
+ .stop = cache_pmu_event_stop,
+ .read = cache_pmu_event_read,
+};
+
+/* Detect CQM via CPUID, set up RMID bookkeeping and register the PMU. */
+static int __init cache_pmu_init(void)
+{
+	unsigned int eax, ebx, ecx, edx;	/* 'ecd' was a typo; ecx is used below */
+	int i, ret;
+
+	if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
+		return 0;
+
+	if (boot_cpu_data.x86 != 6)
+		return 0;
+
+	cpuid_count(0x07, 0, &eax, &ebx, &ecx, &edx);
+
+	/* CPUID.(EAX=07H, ECX=0).EBX.QOS[bit12] */
+	if (!(ebx & (1 << 12)))
+		return 0;
+
+	cpuid_count(0x0f, 0, &eax, &ebx, &ecx, &edx);
+
+	max_rmid = ebx;
+
+	/*
+	 * We should iterate bits in CPUID(EAX=0FH, ECX=0).EDX
+	 * For now, only support L3 (bit 1).
+	 */
+	if (!(edx & (1 << 1)))
+		return 0;
+
+	cpuid_count(0x0f, 1, &eax, &ebx, &ecx, &edx);
+
+	l3_scale = ebx;
+	l3_max_rmid = ecx;
+
+	if (l3_max_rmid != max_rmid)
+		return 0;
+
+	/* kzalloc: both bitmaps must start out all-clear before first use. */
+	cache_rmid_bitmap = kzalloc(sizeof(long) * BITS_TO_LONGS(max_rmid), GFP_KERNEL);
+	if (!cache_rmid_bitmap)
+		return -ENOMEM;
+
+	cache_limbo_bitmap = kzalloc(sizeof(long) * BITS_TO_LONGS(max_rmid), GFP_KERNEL);
+	if (!cache_limbo_bitmap) {
+		ret = -ENOMEM;
+		goto err_free_rmid_bitmap;
+	}
+
+	cache_freed_rmid = kmalloc(sizeof(int) * max_rmid, GFP_KERNEL);
+	if (!cache_freed_rmid) {
+		ret = -ENOMEM;
+		goto err_free_limbo_bitmap;
+	}
+
+	bitmap_set(cache_rmid_bitmap, 0, 1); /* RMID 0 is special */
+	cache_rotation_rmid = __get_rmid(); /* keep one free RMID for rotation */
+	if (WARN_ON_ONCE(cache_rotation_rmid < 0)) {
+		ret = cache_rotation_rmid;
+		goto err_free_freed_rmid;
+	}
+
+	/*
+	 * XXX hotplug notifiers!
+	 */
+	for_each_possible_cpu(i) {
+		struct cache_pmu_state *pqr_state = &per_cpu(state, i);
+
+		raw_spin_lock_init(&pqr_state->lock);
+		pqr_state->rmid = 0;
+	}
+
+	ret = perf_pmu_register(&cache_pmu, "cache_qos", -1);
+	if (WARN_ON(ret)) {
+		pr_info("Cache QoS detected, registration failed (%d), disabled\n", ret);
+		goto err_free_freed_rmid;
+	}
+
+	return 0;
+
+err_free_freed_rmid:
+	kfree(cache_freed_rmid);
+err_free_limbo_bitmap:
+	kfree(cache_limbo_bitmap);
+err_free_rmid_bitmap:
+	kfree(cache_rmid_bitmap);
+	return ret;
+}
+device_initcall(cache_pmu_init);
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
Powered by blists - more mailing lists