linux-kernel - [PATCH 07/22] 2.6.22-rc3 perfmon2 : event sets and multiplexing support

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [day] [month] [year] [list]
Message-Id: <200705291348.l4TDmLcf019697@frankl.hpl.hp.com>
Date:	Tue, 29 May 2007 06:48:21 -0700
From:	Stephane Eranian <eranian@...nkl.hpl.hp.com>
To:	linux-kernel@...r.kernel.org
Cc:	eranian@....hp.com
Subject: [PATCH 07/22] 2.6.22-rc3 perfmon2 : event sets and multiplexing support

This patch contains the event set and multiplexing support.

On many PMU models, there is not enough counter to collect
certain metric in one run. Even on those that have potentially
lots of counters, e.g. P4 with 18, there are oftentimes constraints
which make measuring certain event together impossible. In those
situation the user has n choice but to measure with multiple
runs which is not always practical and prone to errors.

One way to alleviate the problem is to introduce the notion
of an event set. Each set encapsulates the entire PMU state.
If a PMU has M counters then each set can define M events. 
Multiple sets can be defined. They are then multiplexed onto
the actual PMU such that only one is active at any time.
The collected counts can then be scaled to get an *estimate*
of what they would have been had each event been measured across
the entire run. It is important to note that this remains an
estimate. The faster we can switch, the smaller the blind spots are
but the higher the overhead is.

Sets and set switching can be implemented at the user level. Yet
by having kernel support for it, we can signification improve
performance especially for non self-monitoring per-thread context where
we guarantee switching always occurs in the context of the monitored thread.

By default, any perfmon2 context is created with a default set, i.e., set0.
Set can be dynamically created/deleted with specific system calls. A set
is identified by a simple number (0-65535). The number determines the
position of the set in an ordered list. The order in the list determines
the switch order. Switching occurs in a round-robin fashion.

Switching can be triggered by a timeout or after a certain number of overflows.
The type of switching as well as the timeout is determined per set.
The timeout granularity is determined by that of the timer tick. The actual
timeout value is returned to the user.

The file perfmon_sets.c implements:
	- set-related back-end system calls: __pfm_create_evtsets(), __pfm_delete_evtsets(), __pfm_getinfo_evtsets()
	- set switching: pfm_switch_sets(), __pfm_handle_switch_timeout()




--- linux-2.6.22.base/perfmon/perfmon_sets.c	1969-12-31 16:00:00.000000000 -0800
+++ linux-2.6.22/perfmon/perfmon_sets.c	2007-05-29 03:24:14.000000000 -0700
@@ -0,0 +1,1014 @@
+/*
+ * perfmon_sets.c: perfmon2 event sets and multiplexing functions
+ *
+ * This file implements the perfmon2 interface which
+ * provides access to the hardware performance counters
+ * of the host processor.
+ *
+ * The initial version of perfmon.c was written by
+ * Ganesh Venkitachalam, IBM Corp.
+ *
+ * Then it was modified for perfmon-1.x by Stephane Eranian and
+ * David Mosberger, Hewlett Packard Co.
+ *
+ * Version Perfmon-2.x is a complete rewrite of perfmon-1.x
+ * by Stephane Eranian, Hewlett Packard Co.
+ *
+ * Copyright (c) 1999-2006 Hewlett-Packard Development Company, L.P.
+ * Contributed by Stephane Eranian <eranian@....hp.com>
+ *                David Mosberger-Tang <davidm@....hp.com>
+ *
+ * More information about perfmon available at:
+ * 	http://perfmon2.sf.net
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ * 02111-1307 USA
+  */
+#include <linux/kernel.h>
+#include <linux/perfmon.h>
+#include <linux/pagemap.h>
+
+static struct kmem_cache	*pfm_set_cachep;
+static struct kmem_cache	*pfm_lg_set_cachep;
+
+/*
+ * reload reference overflow switch thresholds
+ */
+static void pfm_reload_switch_thresholds(struct pfm_event_set *set)
+{
+	u64 *mask;
+	u16 i, max_cnt_pmd, first_cnt_pmd;
+
+	mask = set->used_pmds;
+	first_cnt_pmd = pfm_pmu_conf->first_cnt_pmd;
+	max_cnt_pmd = pfm_pmu_conf->max_cnt_pmd;
+
+	for (i = first_cnt_pmd; i< max_cnt_pmd; i++) {
+		if (test_bit(i, ulp(mask))) {
+			set->pmds[i].ovflsw_thres = set->pmds[i].ovflsw_ref_thres;
+			PFM_DBG("set%u pmd%u ovflsw_thres=%llu",
+				set->id,
+				i,
+				(unsigned long long)set->pmds[i].ovflsw_thres);
+		}
+	}
+}
+
+/*
+ * ensures that all id_next sets exists such that the round-robin
+ * will work correctly, i.e., next dangling references.
+ */
+int pfm_prepare_sets(struct pfm_context *ctx, struct pfm_event_set *act_set)
+{
+	struct pfm_event_set *set1, *set2;
+	u16 max_cnt_pmd;
+#define is_last_set(s, c)	((s)->list.next == &(c)->list)
+
+	max_cnt_pmd = pfm_pmu_conf->max_cnt_pmd;
+
+	list_for_each_entry(set1, &ctx->list, list) {
+
+		if (is_last_set(set1, ctx))
+			set2 = list_entry(ctx->list.next,
+					  struct pfm_event_set, list);
+		else
+			set2 = list_entry(set1->list.next,
+					  struct pfm_event_set, list);
+		/*
+		 * switch_next is used during actual switching
+		 * so we prepare its value here. When no explicit next
+		 * is requested, the field is initialized with the address
+		 * of the next element in the ordered list
+		 */
+		if (set1->flags & PFM_SETFL_EXPL_NEXT) {
+			list_for_each_entry(set2, &ctx->list, list) {
+				if (set2->id == set1->id_next)
+					break;
+			}
+			if (set2 == NULL) {
+				PFM_DBG("set%u points to set%u "
+					"which does not exist",
+					set1->id,
+					set1->id_next);
+				return -EINVAL;
+			}
+		}
+		/*
+		 * update field used during actual switching
+		 */
+		set1->sw_next = set2;
+
+		PFM_DBG("set%u sw_next=%u", set1->id, set2->id);
+
+		/*
+		 * cleanup bitvectors
+		 */
+		bitmap_zero(ulp(set1->ovfl_pmds), max_cnt_pmd);
+		bitmap_zero(ulp(set1->povfl_pmds), max_cnt_pmd);
+		set1->npend_ovfls = 0;
+		/*
+		 * we cannot just use plain clear because of arch-specific flags
+		 */
+		set1->priv_flags &= ~(PFM_SETFL_PRIV_MOD_BOTH|PFM_SETFL_PRIV_SWITCH);
+
+		/*
+		 * reset activation and elapsed ns
+		 */
+		set1->duration = 0;
+
+		pfm_modview_begin(set1);
+
+		set1->view->set_runs = 0;
+
+		pfm_modview_end(set1);
+	}
+	/*
+	 * setting PFM_CPUINFO_TIME_SWITCH, triggers
+	 * further checking if __pfm_handle_switch_timeout().
+	 * switch timeout is effectively decremented only when
+	 * monitoring has been activated via pfm_start() or
+	 * any user level equivalent.
+	 */
+	if (act_set->flags & PFM_SETFL_TIME_SWITCH) {
+
+		act_set->timeout_sw_left = act_set->timeout_sw_ref;
+		PFM_DBG("arming timeout for set%u", act_set->id);
+
+		if (ctx->flags.system)
+			__get_cpu_var(pfm_syst_info) = PFM_CPUINFO_TIME_SWITCH;
+
+	} else if (act_set->flags & PFM_SETFL_OVFL_SWITCH)
+		pfm_reload_switch_thresholds(act_set);
+
+	return 0;
+}
+
+/*
+ * called from *_timer_interrupt(). task == current
+ */
+void __pfm_handle_switch_timeout(void)
+{
+	struct pfm_event_set *set;
+	struct pfm_context *ctx;
+	unsigned long flags;
+
+	/*
+	 * The timer tick check is operating on each
+	 * CPU. Not all CPUs have time switching enabled
+	 * hence we need to check.
+	 */
+	ctx  = __get_cpu_var(pmu_ctx);
+	if (ctx == NULL)
+		return;
+
+	spin_lock_irqsave(&ctx->lock, flags);
+
+	set = ctx->active_set;
+	BUG_ON(set == NULL);
+
+	/*
+	 * we decrement only when attached and not masked or zombie
+	 */
+	if (ctx->state != PFM_CTX_LOADED)
+		goto done;
+
+	/*
+	 * do not decrement timeout unless monitoring is active.
+	 */
+	if (!pfm_arch_is_active(ctx))
+		goto done;
+
+	set->timeout_sw_left--;
+
+	__get_cpu_var(pfm_stats).handle_timeout_count++;
+
+	if (!set->timeout_sw_left)
+		pfm_switch_sets(ctx, NULL, PFM_PMD_RESET_SHORT, 0);
+done:
+	spin_unlock_irqrestore(&ctx->lock, flags);
+}
+
+/*
+ *
+ * always operating on the current task
+ * interrupts are masked
+ *
+ * input:
+ * 	- new_set: new set to switch to, if NULL follow normal chain
+ */
+void pfm_switch_sets(struct pfm_context *ctx,
+		    struct pfm_event_set *new_set,
+		    int reset_mode,
+		    int no_restart)
+{
+	struct pfm_event_set *set;
+	u64 switch_count;
+	u64 now, end;
+	unsigned long info = 0;
+	u32 new_flags;
+	int is_system, state, is_active, nn;
+
+	now = sched_clock();
+	set = ctx->active_set;
+	is_active = pfm_arch_is_active(ctx);
+
+	/*
+	 * if no set is explicitly requested,
+	 * use the set_switch_next field
+	 */
+	if (new_set == NULL) {
+		/*
+	 	 * we use round-robin unless the user specified
+		 * a particular set to go to.
+	 	 */
+		new_set = set->sw_next;
+		BUG_ON(new_set == NULL);
+	}
+
+	PFM_DBG("state=%d act=%d cur_set=%u cur_runs=%llu cur_npend=%d next_set=%u "
+		  "next_runs=%llu new_npend=%d reset_mode=%d reset_pmds=%llx",
+		  ctx->state,
+		  is_active,
+		  set->id,
+		  (unsigned long long)set->view->set_runs,
+		  set->npend_ovfls,
+		  new_set->id,
+		  (unsigned long long)new_set->view->set_runs,
+		  new_set->npend_ovfls,
+		  reset_mode,
+		  (unsigned long long)new_set->reset_pmds[0]);
+
+	is_system = ctx->flags.system;
+	state = ctx->state;
+	new_flags = new_set->flags;
+	switch_count = __get_cpu_var(pfm_stats).set_switch_count;
+
+	pfm_modview_begin(set);
+
+	/*
+	 * nothing more to do
+	 */
+	if (new_set == set)
+		goto skip_same_set;
+
+
+	if (is_active) {
+		pfm_arch_stop(current, ctx, set);
+
+		pfm_save_pmds(ctx, set);
+
+		/*
+	 	 * compute elapsed ns for active set
+	 	 */
+		set->duration += now - set->duration_start;
+		set->view->set_status &= ~PFM_SETVFL_ACTIVE;
+
+		/*
+		 * must be done after pfm_arch_stop()
+		 */
+		if (is_system)
+			info = __get_cpu_var(pfm_syst_info);
+	}
+	pfm_modview_end(set);
+
+	pfm_modview_begin(new_set);
+
+	pfm_arch_restore_pmds(ctx, new_set);
+
+	/*
+	 * if masked, we must restore the pmcs such that they
+	 * do not capture anything.
+	 */
+	pfm_arch_restore_pmcs(ctx, new_set);
+
+	if (new_set->npend_ovfls) {
+		pfm_arch_resend_irq();
+		__get_cpu_var(pfm_stats).ovfl_intr_replay_count++;
+	}
+
+	new_set->priv_flags &= ~PFM_SETFL_PRIV_MOD_BOTH;
+
+skip_same_set:
+	switch_count++;
+	new_set->view->set_runs++;
+	/*
+	 * reset switch threshold
+	 */
+	if (new_flags & PFM_SETFL_OVFL_SWITCH)
+		pfm_reload_switch_thresholds(new_set);
+	else if (new_flags & PFM_SETFL_TIME_SWITCH)
+		new_set->timeout_sw_left = new_set->timeout_sw_ref;
+
+	/*
+	 * reset overflowed PMD registers
+	 */
+	nn = bitmap_weight(ulp(new_set->reset_pmds), pfm_pmu_conf->max_pmd);
+	if (nn)
+		pfm_reset_pmds(ctx, new_set, nn, reset_mode);
+	/*
+	 * this is needed when coming from pfm_start()
+	 */
+	if (no_restart)
+		goto skip_restart;
+
+	/*
+	 * reactivate monitoring
+	 */
+	if (is_system) {
+		info  &= ~PFM_CPUINFO_TIME_SWITCH;
+
+		if (new_flags & PFM_SETFL_TIME_SWITCH)
+			info |= PFM_CPUINFO_TIME_SWITCH;
+
+		__get_cpu_var(pfm_syst_info) = info;
+
+		PFM_DBG("new_set=%u info=0x%lx flags=0x%x",
+			new_set->id,
+			info,
+			new_flags);
+	}
+
+	if (is_active) {
+		pfm_arch_start(current, ctx, new_set);
+		new_set->duration_start = now;
+	}
+
+skip_restart:
+	ctx->active_set = new_set;
+	ctx->set_all_runs++;
+
+	new_set->view->set_status |= PFM_SETVFL_ACTIVE;
+
+	pfm_modview_end(new_set);
+	end = sched_clock();
+
+	__get_cpu_var(pfm_stats).set_switch_count = switch_count;
+	__get_cpu_var(pfm_stats).set_switch_ns += end - now;
+}
+
+/*
+ * called from __pfm_overflow_handler() to switch event sets.
+ * monitoring is stopped, task is current, interrupts are masked.
+ * compared to pfm_switch_sets(), this version is simplified because
+ * it know about the call path. There is no need to stop monitoring
+ * because it is already frozen by PMU handler.
+ */
+void pfm_switch_sets_from_intr(struct pfm_context *ctx)
+{
+	struct pfm_event_set *set, *new_set;
+	u64 switch_count;
+	u64 now, end;
+	u32 new_flags;
+	unsigned long info = 0;
+	int is_system, state, n;
+
+	now = sched_clock();
+	set = ctx->active_set;
+	new_set = set->sw_next;
+
+	PFM_DBG_ovfl("state=%d cur_set=%u cur_runs=%llu cur_npend=%d next_set=%u "
+		  "next_runs=%llu new_npend=%d new_r_pmds=%llx",
+		  ctx->state,
+		  set->id,
+		  (unsigned long long)set->view->set_runs,
+		  set->npend_ovfls,
+		  new_set->id,
+		  (unsigned long long)new_set->view->set_runs,
+		  new_set->npend_ovfls,
+		  (unsigned long long)new_set->reset_pmds[0]);
+
+	is_system = ctx->flags.system;
+	state = ctx->state;
+	new_flags = new_set->flags;
+	switch_count = __get_cpu_var(pfm_stats).set_switch_count;
+
+	pfm_modview_begin(set);
+
+	/*
+	 * nothing more to do
+	 */
+	if (new_set == set)
+		goto skip_same_set;
+
+	/*
+	 * when called from PMU intr handler, monitoring
+	 * is already stopped
+	 *
+	 * save current PMD registers, we use a special
+	 * form for performance reason. On some architectures,
+	 * such as x86, the pmds are already saved when entering
+	 * the PMU interrupt handler via pfm-arch_intr_freeze()
+	 * so we don't need to save them again. On the contrary,
+	 * on IA-64, they are not saved by freeze, thus we have to
+	 * to it here.
+	 */
+	pfm_arch_save_pmds_from_intr(ctx, set);
+
+	/*
+	 * compute elapsed ns for active set
+	 */
+	set->view->set_status &= ~PFM_SETVFL_ACTIVE;
+
+	pfm_modview_end(set);
+
+	set->duration += now - set->duration_start;
+
+	if (is_system)
+		info = __get_cpu_var(pfm_syst_info);
+
+	pfm_modview_begin(new_set);
+
+	pfm_arch_restore_pmds(ctx, new_set);
+	__get_cpu_var(pfm_stats).ccnt6 += sched_clock() - now;
+	/*
+	 * must not be restored active as we are still executing in the
+	 * PMU interrupt handler. activation is deferred to unfreeze PMU
+	 */
+	pfm_arch_restore_pmcs(ctx, new_set);
+
+	/*
+	 * check for pending interrupt on incoming set.
+	 * interrupts are masked so handler call deferred
+	 */
+	if (new_set->npend_ovfls) {
+		pfm_arch_resend_irq();
+		__get_cpu_var(pfm_stats).ovfl_intr_replay_count++;
+	}
+	/*
+	 * no need to restore anything, that is already done
+	 */
+	new_set->priv_flags &= ~PFM_SETFL_PRIV_MOD_BOTH;
+	/*
+	 * reset duration counter
+	 */
+	new_set->duration_start = now;
+
+skip_same_set:
+	switch_count++;
+	new_set->view->set_runs++;
+
+	/*
+	 * reset switch threshold
+	 */
+	if (new_flags & PFM_SETFL_OVFL_SWITCH)
+		pfm_reload_switch_thresholds(new_set);
+	else if (new_flags & PFM_SETFL_TIME_SWITCH)
+		new_set->timeout_sw_left = new_set->timeout_sw_ref;
+
+	/*
+	 * reset overflowed PMD registers
+	 */
+	n = bitmap_weight(ulp(new_set->reset_pmds), pfm_pmu_conf->max_pmd);
+	if (n)
+		pfm_reset_pmds(ctx, new_set, n, PFM_PMD_RESET_SHORT);
+
+	/*
+	 * reactivate monitoring
+	 */
+	if (is_system) {
+		info  &= ~PFM_CPUINFO_TIME_SWITCH;
+
+		if (new_flags & PFM_SETFL_TIME_SWITCH)
+			info |= PFM_CPUINFO_TIME_SWITCH;
+
+		__get_cpu_var(pfm_syst_info) = info;
+
+		PFM_DBG("new_set=%u info=0x%lx flags=0x%x",
+			new_set->id,
+			info,
+			new_flags);
+	}
+
+	ctx->active_set = new_set;
+	ctx->set_all_runs++;
+
+	new_set->view->set_status |= PFM_SETVFL_ACTIVE;
+
+	pfm_modview_end(new_set);
+
+	end = sched_clock();
+
+	__get_cpu_var(pfm_stats).set_switch_count = switch_count;
+	__get_cpu_var(pfm_stats).set_switch_ns += end - now;
+}
+
+
+static int pfm_setfl_sane(struct pfm_context *ctx, u32 flags)
+{
+#define PFM_SETFL_BOTH_SWITCH	(PFM_SETFL_OVFL_SWITCH|PFM_SETFL_TIME_SWITCH)
+	int ret;
+
+	ret = pfm_arch_setfl_sane(ctx, flags);
+	if (ret)
+		return ret;
+
+	if ((flags & PFM_SETFL_BOTH_SWITCH) == PFM_SETFL_BOTH_SWITCH) {
+		PFM_DBG("both switch ovfl and switch time are set");
+		return -EINVAL;
+	}
+	return 0;
+}
+
+/*
+ * it is never possible to change the identification of an existing set
+ */
+static int __pfm_change_evtset(struct pfm_context *ctx,
+				  struct pfm_event_set *set,
+				  struct pfarg_setdesc *req)
+{
+	u32 flags;
+	u16 set_id, set_id_next;
+	struct timespec tv;
+	unsigned long ji;
+	int ret;
+
+	BUG_ON(ctx->state == PFM_CTX_LOADED);
+
+	set_id = req->set_id;
+	set_id_next = req->set_id_next;
+	flags = req->set_flags;
+
+	ret = pfm_setfl_sane(ctx, flags);
+	if (ret) {
+		PFM_DBG("invalid flags 0x%x set %u", flags, set_id);
+		return -EINVAL;
+	}
+
+	/*
+	 * commit changes
+	 *
+	 * note that we defer checking the validity of set_id_next until the
+	 * context is actually attached. This is the only moment where we can
+	 * safely assess the sanity of the sets because sets cannot be changed
+	 * or deleted once the context is attached
+	 */
+	set->id = set_id;
+	set->id_next = set_id_next;
+	set->flags = flags;
+	set->priv_flags = 0;
+
+	/*
+	 * reset pointer to next set
+	 */
+	set->sw_next = NULL;
+
+	tv.tv_sec = 0;
+	tv.tv_nsec = req->set_timeout;
+	ji = timespec_to_jiffies(&tv);
+
+	/*
+	 * verify that timeout is not 0
+	 */
+	if (!ji && (flags & PFM_SETFL_TIME_SWITCH) != 0) {
+		PFM_DBG("invalid timeout=0");
+		return -EINVAL;
+	}
+
+	set->timeout_sw_ref = set->timeout_sw_left = ji;
+
+	PFM_DBG("set%u flags=0x%x id_next=%u req_nsec=%llu"
+		" jiffies=%lu HZ=%u TICK_NSEC=%lu eff_nsec=%lu",
+		set_id,
+		flags,
+		set_id_next,
+		(unsigned long long)req->set_timeout,
+		ji,
+		HZ, TICK_NSEC,
+		ji * TICK_NSEC);
+
+	/*
+	 * return actual timeout in nsecs
+	 */
+	jiffies_to_timespec(ji, &tv);
+	req->set_timeout = tv.tv_sec * 1000000000 + tv.tv_nsec;
+
+	return 0;
+}
+
+/*
+ * this function does not modify the next field
+ */
+void pfm_init_evtset(struct pfm_event_set *set)
+{
+	u64 *impl_pmcs;
+	u16 i, max_pmc;
+
+	max_pmc = pfm_pmu_conf->max_pmc;
+	impl_pmcs =  pfm_pmu_conf->impl_pmcs;
+
+	/*
+	 * install default values for all PMC  registers
+	 */
+	for (i=0; i < max_pmc;  i++) {
+		if (test_bit(i, ulp(impl_pmcs))) {
+			set->pmcs[i] = pfm_pmu_conf->pmc_desc[i].dfl_val;
+			PFM_DBG("set%u pmc%u=0x%llx",
+				set->id,
+				i,
+				(unsigned long long)set->pmcs[i]);
+		}
+	}
+
+	/*
+	 * PMD registers are set to 0 when the event set is allocated,
+	 * hence we do not need to explicitly initialize them.
+	 *
+	 * For virtual PMD registers (i.e., those tied to a SW resource)
+	 * their value becomes meaningful once the context is attached.
+	 */
+}
+
+/*
+ * look for an event set using its identification. If the set does not
+ * exist:
+ * 	- if alloc == 0 then return error
+ * 	- if alloc == 1  then allocate set
+ *
+ * alloc is one ONLY when coming from pfm_create_evtsets() which can only
+ * be called when the context is detached, i.e. monitoring is stopped.
+ */
+struct pfm_event_set *pfm_find_set(struct pfm_context *ctx, u16 set_id,
+					  int alloc)
+{
+	struct kmem_cache *cachep;
+	struct pfm_event_set *set, *new_set, *prev;
+	unsigned long offs;
+	size_t view_size;
+	void *view;
+
+	PFM_DBG("looking for set=%u", set_id);
+
+	/*
+	 * shortcut for set0: always exist, cannot be removed
+	 */
+	if (!(set_id || alloc))
+		return list_entry(ctx->list.next, struct pfm_event_set, list);
+
+	prev = NULL;
+	list_for_each_entry(set, &ctx->list, list) {
+		if (set->id == set_id)
+			return set;
+		if (set->id > set_id)
+			break;
+		prev = set;
+	}
+
+	if (!alloc)
+		return NULL;
+
+	/*
+	 * when alloc == 1, context is detached, monitoring is stopped and
+	 * interrupts are enabled but context is locked
+	 */
+
+	cachep = ctx->flags.mapset ? pfm_set_cachep : pfm_lg_set_cachep;
+
+	/*
+	 * we are holding the context spinlock and interrupts
+	 * are unmasked. We must use GFP_ATOMIC as we cannot
+	 * sleep while holding a spin lock.
+	 */
+	new_set = kmem_cache_zalloc(cachep, GFP_ATOMIC);
+	if (!new_set)
+		return NULL;
+
+	if (ctx->flags.mapset) {
+		view_size = PAGE_ALIGN(sizeof(struct pfm_set_view));
+		view      = kmalloc(view_size, GFP_ATOMIC);
+		if (view == NULL) {
+			PFM_DBG("cannot allocate set view");
+			kmem_cache_free(cachep, new_set);
+			return NULL;
+		}
+		offs = PFM_SET_REMAP_BASE
+		     + (set_id*PFM_SET_REMAP_SCALAR);
+	} else {
+		view_size = sizeof(struct pfm_set_view);
+		view = (struct pfm_set_view *)(new_set+1);
+		offs = 0;
+	}
+
+	memset(view, 0, sizeof(struct pfm_set_view));
+
+	new_set->id = set_id;
+	new_set->view = view;
+	new_set->mmap_offset = offs;
+
+	INIT_LIST_HEAD(&new_set->list);
+
+	if (prev == NULL) {
+		list_add(&(new_set->list), &ctx->list);
+	} else {
+		PFM_DBG("add after set=%u", prev->id);
+		list_add(&(new_set->list), &prev->list);
+	}
+	PFM_DBG("set_id=%u size=%zu view=%p remap=%d mmap_offs=%lu",
+			set_id,
+			view_size,
+			view,
+			ctx->flags.mapset,
+			new_set->mmap_offset);
+	return new_set;
+}
+
+/*
+ * context is unloaded for this command. Interrupts are enabled
+ */
+int __pfm_create_evtsets(struct pfm_context *ctx, struct pfarg_setdesc *req,
+			int count)
+{
+	struct pfm_event_set *set;
+	u16 set_id;
+	int i, ret;
+
+	for (i = 0; i < count; i++, req++) {
+		set_id = req->set_id;
+
+		PFM_DBG("set_id=%u", set_id);
+
+		set = pfm_find_set(ctx, set_id, 1);
+		if (set == NULL)
+			goto error_mem;
+
+		ret = __pfm_change_evtset(ctx, set, req);
+		if (ret)
+			goto error_params;
+
+		pfm_init_evtset(set);
+	}
+	return 0;
+error_mem:
+	PFM_DBG("cannot allocate set %u", set_id);
+	pfm_retflag_set(req->set_flags, PFM_REG_RETFL_EINVAL);
+	return -ENOMEM;
+error_params:
+	pfm_retflag_set(req->set_flags, PFM_REG_RETFL_EINVAL);
+	return ret;
+}
+
+int __pfm_getinfo_evtsets(struct pfm_context *ctx, struct pfarg_setinfo *req,
+				 int count)
+{
+	struct pfm_event_set *set;
+	int i, is_system, is_loaded;
+	u16 set_id;
+	int max_cnt_pmd;
+	u64 end;
+
+	end = sched_clock();
+	is_system = ctx->flags.system;
+	is_loaded = ctx->state == PFM_CTX_LOADED;
+	max_cnt_pmd = pfm_pmu_conf->max_cnt_pmd;
+
+	for (i = 0; i < count; i++, req++) {
+
+		set_id = req->set_id;
+
+		list_for_each_entry(set, &ctx->list, list) {
+			if (set->id == set_id)
+				goto found;
+			if (set->id > set_id)
+				goto error;
+		}
+found:
+		/*
+		 * compute leftover timeout
+		 */
+		req->set_flags = set->flags;
+		req->set_timeout = set->timeout_sw_left * TICK_NSEC;
+		req->set_runs = set->view->set_runs;
+		req->set_act_duration = set->duration;
+		req->set_mmap_offset = set->mmap_offset;
+
+		/*
+		 * adjust for active set if needed
+		 */
+		if (is_system && is_loaded && ctx->flags.started
+		    && set == ctx->active_set)
+			req->set_act_duration  += end - set->duration_start;
+
+		/*
+		 * copy the list of pmds which last overflowed for
+		 * the set
+		 */
+		bitmap_copy(ulp(req->set_ovfl_pmds),
+			    ulp(set->ovfl_pmds),
+			    max_cnt_pmd);
+
+		/*
+		 * copy bitmask of available PMU registers
+		 */
+		bitmap_copy(ulp(req->set_avail_pmcs),
+			    ulp(pfm_pmu_conf->impl_pmcs),
+			    pfm_pmu_conf->max_pmc);
+
+		bitmap_copy(ulp(req->set_avail_pmds),
+			    ulp(pfm_pmu_conf->impl_pmds),
+			    pfm_pmu_conf->max_pmd);
+
+		pfm_retflag_set(req->set_flags, 0);
+
+		PFM_DBG("set%u flags=0x%x eff_usec=%llu runs=%llu",
+			set_id,
+			set->flags,
+			(unsigned long long)req->set_timeout,
+			(unsigned long long)set->view->set_runs);
+	}
+	return 0;
+error:
+	PFM_DBG("set%u not found", set_id);
+	pfm_retflag_set(req->set_flags, PFM_REG_RETFL_EINVAL);
+	return -EINVAL;
+}
+
+/*
+ * context is unloaded for this command. Interrupts are enabled
+ */
+int __pfm_delete_evtsets(struct pfm_context *ctx, void *arg, int count)
+{
+	struct pfarg_setdesc *req = arg;
+	struct pfm_event_set *set;
+	struct kmem_cache *cachep;
+	u16 set_id;
+	size_t view_size;
+	int i;
+
+	/* delete operation only works when context is detached */
+	BUG_ON(ctx->state != PFM_CTX_UNLOADED);
+
+	view_size = PAGE_ALIGN(sizeof(struct pfm_set_view));
+
+	if (ctx->flags.mapset)
+		cachep = pfm_set_cachep;
+	else
+		cachep = pfm_lg_set_cachep;
+
+	for (i = 0; i < count; i++, req++) {
+		set_id = req->set_id;
+
+		/*
+		 * cannot remove set 0
+		 */
+		if (!set_id)
+			goto error;
+
+		list_for_each_entry(set, &ctx->list, list) {
+			if (set->id == set_id)
+				goto found;
+			if (set->id > set_id)
+				goto error;
+		}
+		goto error;
+found:
+		/*
+		 * clear active set if necessary.
+		 * will be updated when context is loaded
+		 */
+		if (set == ctx->active_set)
+			ctx->active_set = NULL;
+
+		list_del(&set->list);
+
+		if (ctx->flags.mapset)
+			kfree(set->view);
+		kmem_cache_free(cachep, set);
+
+		pfm_retflag_set(req->set_flags, 0);
+
+		PFM_DBG("set%u deleted", set_id);
+	}
+	return 0;
+error:
+	PFM_DBG("set%u not found or invalid", set_id);
+	pfm_retflag_set(req->set_flags, PFM_REG_RETFL_EINVAL);
+	return -EINVAL;
+}
+
+/*
+ * called from pfm_context_free() to free all sets
+ */
+void pfm_free_sets(struct pfm_context *ctx)
+{
+	struct pfm_event_set *set, *tmp;
+	struct kmem_cache *cachep;
+	int use_remap;
+
+	use_remap = ctx->flags.mapset;
+
+	if (use_remap)
+		cachep = pfm_set_cachep;
+	else
+		cachep = pfm_lg_set_cachep;
+
+	list_for_each_entry_safe(set, tmp, &ctx->list, list) {
+		list_del(&set->list);
+		if (use_remap)
+			kfree(set->view);
+		kmem_cache_free(cachep, set);
+	}
+}
+
+int pfm_sets_init(void)
+{
+
+	pfm_lg_set_cachep = kmem_cache_create("pfm_large_event_set",
+			sizeof(struct pfm_event_set)+sizeof(struct pfm_set_view),
+			SLAB_HWCACHE_ALIGN, 0, NULL, NULL);
+	if (pfm_lg_set_cachep == NULL) {
+		PFM_ERR("cannot initialize large event set slab");
+		return -ENOMEM;
+	}
+
+	pfm_set_cachep = kmem_cache_create("pfm_event_set",
+			sizeof(struct pfm_event_set),
+			SLAB_HWCACHE_ALIGN, 0, NULL, NULL);
+	if (pfm_set_cachep == NULL) {
+		PFM_ERR("cannot initialize event set slab");
+		return -ENOMEM;
+	}
+	return 0;
+}
+
+static struct page *pfm_view_map_pagefault(struct vm_area_struct *vma,
+					   unsigned long address, int *type)
+{
+	void *kaddr;
+	struct page *page;
+
+	kaddr = vma->vm_private_data;
+	if (kaddr == NULL) {
+		PFM_DBG("no view");
+		return NOPAGE_SIGBUS;
+	}
+
+	if ( (address < (unsigned long) vma->vm_start) ||
+	     (address >= (unsigned long) (vma->vm_start + PAGE_SIZE)) )
+		return NOPAGE_SIGBUS;
+
+	kaddr += (address - vma->vm_start);
+
+	if (type)
+		*type = VM_FAULT_MINOR;
+
+	page = virt_to_page(kaddr);
+	get_page(page);
+
+	PFM_DBG("[%d] start=%p ref_count=%d",
+		  current->pid,
+		  kaddr, page_count(page));
+
+	return page;
+}
+struct vm_operations_struct pfm_view_map_vm_ops = {
+	.nopage	= pfm_view_map_pagefault,
+};
+
+int pfm_mmap_set(struct pfm_context *ctx, struct vm_area_struct *vma,
+		 size_t size)
+{
+	struct pfm_event_set *set;
+	u16 set_id;
+
+	if (!ctx->flags.mapset) {
+		PFM_DBG("context does not use set remapping");
+		return -EINVAL;
+	}
+
+	if (vma->vm_pgoff < PFM_SET_REMAP_OFFS
+			|| vma->vm_pgoff >= PFM_SET_REMAP_OFFS_MAX) {
+		PFM_DBG("invalid offset %lu", vma->vm_pgoff);
+		return -EINVAL;
+	}
+
+	if (size != PAGE_SIZE) {
+		PFM_DBG("size %zu must be page size", size);
+		return -EINVAL;
+	}
+
+	set_id = (u16)(vma->vm_pgoff - PFM_SET_REMAP_OFFS);
+	set = pfm_find_set(ctx, set_id, 0);
+	if (set == NULL) {
+		PFM_DBG("set=%u is undefined", set_id);
+		return -EINVAL;
+	}
+
+	PFM_DBG("mmaping set_id=%u", set_id);
+
+	vma->vm_ops = &pfm_view_map_vm_ops;
+	vma->vm_private_data = set->view;
+
+	return 0;
+}
+
+
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/