linux-kernel - [PATCH v2 13/32] perf/x86/intel/cqm: add polled update of RMID's llc

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <1463007752-116802-14-git-send-email-davidcc@google.com>
Date:	Wed, 11 May 2016 16:02:13 -0700
From:	David Carrillo-Cisneros <davidcc@...gle.com>
To:	Peter Zijlstra <peterz@...radead.org>,
	Alexander Shishkin <alexander.shishkin@...ux.intel.com>,
	Arnaldo Carvalho de Melo <acme@...nel.org>,
	Ingo Molnar <mingo@...hat.com>
Cc:	Vikas Shivappa <vikas.shivappa@...ux.intel.com>,
	Matt Fleming <matt@...eblueprint.co.uk>,
	Tony Luck <tony.luck@...el.com>,
	Stephane Eranian <eranian@...gle.com>,
	Paul Turner <pjt@...gle.com>,
	David Carrillo-Cisneros <davidcc@...gle.com>, x86@...nel.org,
	linux-kernel@...r.kernel.org
Subject: [PATCH v2 13/32] perf/x86/intel/cqm: add polled update of RMID's llc_occupancy

To avoid IPIs from IRQ disabled contexts, the occupancy for a RMID in a
remote package (a package other than the one the current cpu belongs to) is
obtained from a cache that is periodically updated.
This removes the need for an IPI when reading occupancy for a task event,
which was the reason to add the problematic pmu::count and dummy
perf_event_read() in the previous CQM version.

The occupancy of all active prmids is updated every
__rmid_timed_update_period ms.

To avoid holding raw_spin_locks on the prmid hierarchy for too long, the
raw rmids to be read are copied to a temporary array list. The array list
is then consumed to perform the wrmsrl and rdmsrl required for each RMID to
read its llc_occupancy.

This decoupling of traversing the RMID hierarchy from reading occupancy is
especially useful due to the high latency of the wrmsrl and rdmsrl for the
llc_occupancy event (thousands of cycles on my test machine).

To avoid unnecessary memory allocations, the objects used to temporarily
store RMIDs are pooled in a per-package list and allocated on demand.

The infrastructure introduced in this patch will be used in future patches
in this series to perform reads on subtrees of a prmid hierarchy.

Reviewed-by: Stephane Eranian <eranian@...gle.com>
Signed-off-by: David Carrillo-Cisneros <davidcc@...gle.com>
---
 arch/x86/events/intel/cqm.c | 251 +++++++++++++++++++++++++++++++++++++++++++-
 arch/x86/events/intel/cqm.h |  36 +++++++
 2 files changed, 286 insertions(+), 1 deletion(-)

diff --git a/arch/x86/events/intel/cqm.c b/arch/x86/events/intel/cqm.c
index a61dd70..523abc6 100644
--- a/arch/x86/events/intel/cqm.c
+++ b/arch/x86/events/intel/cqm.c
@@ -170,6 +170,8 @@ static inline bool __valid_pkg_id(u16 pkg_id)
 	return pkg_id < topology_max_packages();
 }
 
+static int anode_pool__alloc_one(u16 pkg_id);
+
 /* Init cqm pkg_data for @cpu 's package. */
 static int pkg_data_init_cpu(int cpu)
 {
@@ -222,11 +224,19 @@ static int pkg_data_init_cpu(int cpu)
 	mutex_init(&pkg_data->pkg_data_mutex);
 	raw_spin_lock_init(&pkg_data->pkg_data_lock);
 
+	INIT_LIST_HEAD(&pkg_data->anode_pool_head);
+	raw_spin_lock_init(&pkg_data->anode_pool_lock);
+
 	INIT_DELAYED_WORK(
 		&pkg_data->rotation_work, intel_cqm_rmid_rotation_work);
 	/* XXX: Chose randomly*/
 	pkg_data->rotation_cpu = cpu;
 
+	INIT_DELAYED_WORK(
+		&pkg_data->timed_update_work, intel_cqm_timed_update_work);
+	/* XXX: Chose randomly*/
+	pkg_data->timed_update_cpu = cpu;
+
 	cqm_pkgs_data[pkg_id] = pkg_data;
 	return 0;
 }
@@ -731,6 +741,189 @@ static void monr_dealloc(struct monr *monr)
 }
 
 /*
+ * Logic for reading sets of rmids into per-package lists.
+ * These per-package lists can be used to update occupancies without
+ * holding locks in the hierarchies of pmonrs.
+ * @pool: free anodes held by this astack (refilled from the package's pool).
+ */
+struct astack {
+	struct list_head	pool;
+	struct list_head	items;		/* anodes currently storing entries */
+	int			top_idx;	/* last used slot in tail anode; -1 if empty */
+	int			max_idx;	/* largest usable slot index per anode */
+	u16			pkg_id;		/* indexes cqm_pkgs_data[] */
+};
+
+static void astack__init(struct astack *astack, int max_idx, u16 pkg_id)
+{
+	INIT_LIST_HEAD(&astack->items);
+	INIT_LIST_HEAD(&astack->pool);
+	astack->top_idx = -1;	/* no slot in use yet */
+	astack->max_idx = max_idx;
+	astack->pkg_id = pkg_id;
+}
+
+/* Try to enlarge astack->pool with an anode from this package's pool. */
+static int astack__try_add_pool(struct astack *astack)
+{
+	unsigned long flags;
+	int ret = -1;	/* stays -1 when the package's pool is empty */
+	struct pkg_data *pkg_data = cqm_pkgs_data[astack->pkg_id];
+
+	raw_spin_lock_irqsave(&pkg_data->anode_pool_lock, flags);
+
+	if (!list_empty(&pkg_data->anode_pool_head)) {
+		list_move_tail(pkg_data->anode_pool_head.prev, &astack->pool);
+		ret = 0;
+	}
+
+	raw_spin_unlock_irqrestore(&pkg_data->anode_pool_lock, flags);
+	return ret;
+}
+
+static int astack__push(struct astack *astack)
+{
+	if (!list_empty(&astack->items) && astack->top_idx < astack->max_idx) {
+		astack->top_idx++;	/* room left in the tail anode */
+		return 0;
+	}
+
+	if (list_empty(&astack->pool) && astack__try_add_pool(astack))
+		return -1;	/* no free anode here nor in the package's pool */
+	list_move_tail(astack->pool.prev, &astack->items);
+	astack->top_idx = 0;	/* first slot of the new tail anode */
+	return 0;
+}
+
+/* Top slot of @member_ in the tail anode. The astack must be non-empty. */
+# define __astack__top(astack_, member_) \
+	list_last_entry(&(astack_)->items, \
+	struct anode, entry)->member_[(astack_)->top_idx]
+
+static void astack__clear(struct astack *astack)
+{
+	list_splice_tail_init(&astack->items, &astack->pool);	/* keep the anodes */
+	astack->top_idx = -1;
+}
+
+/* Empty @astack and put all of its anodes back into pkg_data's pool. */
+static void astack__release(struct astack *astack)
+{
+	unsigned long flags;
+	struct pkg_data *pkg_data = cqm_pkgs_data[astack->pkg_id];
+
+	astack__clear(astack);	/* items -> astack->pool, so one splice returns all */
+	raw_spin_lock_irqsave(&pkg_data->anode_pool_lock, flags);
+	list_splice_tail_init(&astack->pool, &pkg_data->anode_pool_head);
+	raw_spin_unlock_irqrestore(&pkg_data->anode_pool_lock, flags);
+}
+
+static int anode_pool__alloc_one(u16 pkg_id)
+{
+	unsigned long flags;
+	struct anode *anode;
+	struct pkg_data *pkg_data = cqm_pkgs_data[pkg_id];
+
+	anode = kmalloc_node(sizeof(struct anode), GFP_KERNEL,
+			     cpu_to_node(pkg_data->rotation_cpu));
+	if (!anode)
+		return -ENOMEM;
+	raw_spin_lock_irqsave(&pkg_data->anode_pool_lock, flags);
+	list_add_tail(&anode->entry, &pkg_data->anode_pool_head);	/* grow pool by one */
+	raw_spin_unlock_irqrestore(&pkg_data->anode_pool_lock, flags);
+	return 0;
+}
+
+static int astack__end(struct astack *astack, struct anode *anode, int idx)
+{
+	return list_is_last(&anode->entry, &astack->items) &&
+	       idx > astack->top_idx;	/* past the last used slot of the tail anode */
+}
+
+static int __rmid_fn__cqm_prmid_update(struct prmid *prmid, u64 *val)
+{
+	int ret = cqm_prmid_update(prmid);
+
+	if (ret >= 0)	/* *val is left untouched when the update fails */
+		*val = atomic64_read(&prmid->last_read_value);
+	return ret;
+}
+
+/* Apply @fn to every rmid stored in @astack, summing its results in @total.
+ * On error returns first error in read, zero otherwise.
+ */
+static int astack__rmids_sum_apply(
+	struct astack *astack,
+	u16 pkg_id, int (*fn)(struct prmid *, u64 *), u64 *total)
+{
+	struct prmid *prmid;
+	struct anode *anode;
+	u32 rmid;
+	int i, ret, first_error = 0;
+	u64 count;
+	*total = 0;
+
+	list_for_each_entry(anode, &astack->items, entry) {
+		for (i = 0; i <= astack->max_idx; i++) {
+			/* tail node only has astack->top_idx + 1 elements. */
+			if (astack__end(astack, anode, i))
+				break;
+			rmid = anode->rmids[i];
+			prmid = cqm_pkgs_data[pkg_id]->prmids_by_rmid[rmid];
+			WARN_ON_ONCE(!prmid);	/* NOTE(review): fn() below still gets a NULL prmid - confirm */
+			ret = fn(prmid, &count);
+			if (ret < 0) {
+				if (!first_error)
+					first_error = ret;
+				continue;	/* failed reads add nothing to *total */
+			}
+			*total += count;
+		}
+	}
+	return first_error;
+}
+
+/* Does not need mutex since protected by locks when traversing
+ * astate_pmonrs_lru and updating atomic prmids.
+ */
+static int update_rmids_in_astate_pmonrs_lru(u16 pkg_id)
+{
+	struct astack astack;
+	struct pkg_data *pkg_data;
+	struct pmonr *pmonr;
+	int ret = 0;
+	unsigned long flags;
+	u64 count;
+
+	astack__init(&astack, NR_RMIDS_PER_NODE - 1, pkg_id);
+	pkg_data = cqm_pkgs_data[pkg_id];
+
+retry:
+	if (ret) {
+		anode_pool__alloc_one(pkg_id);	/* NOTE(review): result ignored; a failed alloc just retries - confirm */
+		ret = 0;
+	}
+	raw_spin_lock_irqsave_nested(&pkg_data->pkg_data_lock, flags, pkg_id);
+	list_for_each_entry(pmonr,
+			    &pkg_data->astate_pmonrs_lru, rotation_entry) {
+		ret = astack__push(&astack);
+		if (ret)
+			break;	/* out of anodes; allocate outside the spinlock */
+		__astack__top(&astack, rmids) = pmonr->prmid->rmid;
+	}
+	raw_spin_unlock_irqrestore(&pkg_data->pkg_data_lock, flags);
+	if (ret) {
+		astack__clear(&astack);	/* drop the partial copy and start over */
+		goto retry;
+	}
+	/* count is not used. */
+	ret = astack__rmids_sum_apply(&astack, pkg_id,
+				      &__rmid_fn__cqm_prmid_update, &count);
+	astack__release(&astack);
+	return ret;
+}
+
+/*
  * Wrappers for monr manipulation in events.
  *
  */
@@ -1519,6 +1712,17 @@ exit:
 	mutex_unlock(&pkg_data->pkg_data_mutex);
 }
 
+static void
+__intel_cqm_timed_update(u16 pkg_id)
+{
+	int ret;
+
+	mutex_lock_nested(&cqm_pkgs_data[pkg_id]->pkg_data_mutex, pkg_id);
+	ret = update_rmids_in_astate_pmonrs_lru(pkg_id);
+	mutex_unlock(&cqm_pkgs_data[pkg_id]->pkg_data_mutex);
+	WARN_ON_ONCE(ret);	/* errors are only warned about, not propagated */
+}
+
 static struct pmu intel_cqm_pmu;
 
 /* Rotation only needs to be run when there is any pmonr in (I)state. */
@@ -1541,6 +1745,22 @@ static bool intel_cqm_need_rotation(u16 pkg_id)
 	return need_rot;
 }
 
+static bool intel_cqm_need_timed_update(u16 pkg_id)
+{
+
+	struct pkg_data *pkg_data;
+	bool need_update;
+
+	pkg_data = cqm_pkgs_data[pkg_id];
+
+	mutex_lock_nested(&pkg_data->pkg_data_mutex, pkg_id);
+	/* Update is needed if there is any active prmid. */
+	need_update = !list_empty(&pkg_data->active_prmids_pool);
+	mutex_unlock(&pkg_data->pkg_data_mutex);
+
+	return need_update;
+}
+
 /*
  * Schedule rotation in one package.
  */
@@ -1555,6 +1775,19 @@ static void __intel_cqm_schedule_rotation_for_pkg(u16 pkg_id)
 		pkg_data->rotation_cpu, &pkg_data->rotation_work, delay);
 }
 
+static void __intel_cqm_schedule_timed_update_for_pkg(u16 pkg_id)
+{
+	struct pkg_data *pkg_data;
+	unsigned long delay;
+
+	delay = msecs_to_jiffies(__rmid_timed_update_period);	/* period is in ms */
+	pkg_data = cqm_pkgs_data[pkg_id];
+	schedule_delayed_work_on(
+		pkg_data->timed_update_cpu,
+		&pkg_data->timed_update_work, delay);
+}
+
+
 /*
  * Schedule rotation and rmid's timed update in all packages.
  * Reescheduling will stop when no longer needed.
@@ -1563,8 +1796,10 @@ static void intel_cqm_schedule_work_all_pkgs(void)
 {
 	int pkg_id;
 
-	cqm_pkg_id_for_each_online(pkg_id)
+	cqm_pkg_id_for_each_online(pkg_id) {
 		__intel_cqm_schedule_rotation_for_pkg(pkg_id);
+		__intel_cqm_schedule_timed_update_for_pkg(pkg_id);
+	}
 }
 
 static void intel_cqm_rmid_rotation_work(struct work_struct *work)
@@ -1585,6 +1820,20 @@ static void intel_cqm_rmid_rotation_work(struct work_struct *work)
 		__intel_cqm_schedule_rotation_for_pkg(pkg_id);
 }
 
+static void intel_cqm_timed_update_work(struct work_struct *work)
+{
+	struct pkg_data *pkg_data = container_of(
+		to_delayed_work(work), struct pkg_data, timed_update_work);
+	u16 pkg_id = topology_physical_package_id(pkg_data->timed_update_cpu);
+
+	WARN_ON_ONCE(pkg_data != cqm_pkgs_data[pkg_id]);
+
+	__intel_cqm_timed_update(pkg_id);
+
+	if (intel_cqm_need_timed_update(pkg_id))	/* re-arm only while needed */
+		__intel_cqm_schedule_timed_update_for_pkg(pkg_id);
+}
+
 /*
  * Find a group and setup RMID.
  *
diff --git a/arch/x86/events/intel/cqm.h b/arch/x86/events/intel/cqm.h
index 7e4e37a..0467c52 100644
--- a/arch/x86/events/intel/cqm.h
+++ b/arch/x86/events/intel/cqm.h
@@ -49,6 +49,10 @@ static unsigned int __rmid_min_update_time = RMID_DEFAULT_MIN_UPDATE_TIME;
 
 static inline int cqm_prmid_update(struct prmid *prmid);
 
+#define RMID_DEFAULT_TIMED_UPDATE_PERIOD 100 /* ms between timed updates */
+static unsigned int __rmid_timed_update_period =
+	RMID_DEFAULT_TIMED_UPDATE_PERIOD;
+
 /*
  * union prmid_summary: Machine-size summary of a pmonr's prmid state.
  * @value:		One word accesor.
@@ -213,6 +217,21 @@ struct pmonr {
 	atomic64_t				prmid_summary_atomic;
 };
 
+/* Store all RMIDs that can fit in an anode while keeping sizeof(struct anode)
+ * within one cache line (for performance).
+ */
+#define NR_TYPE_PER_NODE(__type) ((SMP_CACHE_BYTES - (int)sizeof(struct list_head)) / \
+	(int)sizeof(__type))
+
+#define NR_RMIDS_PER_NODE NR_TYPE_PER_NODE(u32)
+
+/* struct anode: Node of an array list used to temporarily store RMIDs. */
+struct anode {
+	/* Last valid RMID is RMID_INVALID */
+	u32			rmids[NR_RMIDS_PER_NODE];
+	struct list_head	entry;	/* links the anode into a pool or items list */
+};
+
 /*
  * struct pkg_data: Per-package CQM data.
  * @max_rmid:			Max rmid valid for cpus in this package.
@@ -242,6 +261,14 @@ struct pmonr {
  * @rotation_work:		Task that performs rotation of prmids.
  * @rotation_cpu:               CPU to run @rotation_work on, it must be in the
  *                              package associated to this instance of pkg_data.
+ * @timed_update_work:		Task that performs periodic updates of values
+ *				for active rmids. These values are used when
+ *				inter-package event read is not available due to
+ *				irqs disabled contexts.
+ * @timed_update_cpu:		CPU to run @timed_update_work on, it must be a
+ *				cpu in this package.
+ * @anode_pool_head:		Pool of unused anodes.
+ * @anode_pool_lock:		Protect @anode_pool_head.
  */
 struct pkg_data {
 	u32			max_rmid;
@@ -271,6 +298,13 @@ struct pkg_data {
 
 	struct delayed_work	rotation_work;
 	int			rotation_cpu;
+
+	struct delayed_work	timed_update_work;
+	int			timed_update_cpu;
+
+	/* Pool of unused anodes and its lock */
+	struct list_head	anode_pool_head;
+	raw_spinlock_t		anode_pool_lock;
 };
 
 /*
@@ -441,6 +475,8 @@ static inline int monr_hrchy_count_held_raw_spin_locks(void)
  */
 static void intel_cqm_rmid_rotation_work(struct work_struct *work);
 
+static void intel_cqm_timed_update_work(struct work_struct *work);
+
 /*
  * Service Level Objectives (SLO) for the rotation logic.
  *
-- 
2.8.0.rc3.226.g39d4020