Date:	Fri, 30 Oct 2015 19:26:35 -0400
From:	Waiman Long <Waiman.Long@....com>
To:	Peter Zijlstra <peterz@...radead.org>,
	Ingo Molnar <mingo@...hat.com>,
	Thomas Gleixner <tglx@...utronix.de>,
	"H. Peter Anvin" <hpa@...or.com>
Cc:	x86@...nel.org, linux-kernel@...r.kernel.org,
	Scott J Norton <scott.norton@....com>,
	Douglas Hatch <doug.hatch@....com>,
	Davidlohr Bueso <dave@...olabs.net>,
	Waiman Long <Waiman.Long@....com>
Subject: [PATCH tip/locking/core v9 4/6] locking/pvqspinlock: Collect slowpath lock statistics

This patch enables the accumulation of kicking and waiting related
PV qspinlock statistics when the new QUEUED_LOCK_STAT configuration
option is selected. It also enables the collection of the data needed
to compute the kicking and wakeup latencies, which depend heavily on
the CPUs being used.

The statistical counters are per-cpu variables to minimize the
performance overhead of updating them. The counters are exported
via sysfs under the /sys/kernel/qlockstat directory. Summing the
per-cpu values and computing the derived data is deferred until the
corresponding sysfs files are read.
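
For illustration, each counter can be read from user space like any other
sysfs attribute; the files are created with mode 0400, so only root can
read them, and writing anything to the reset_counters file (mode 0200)
clears all the counters. A minimal sketch of a hypothetical reader
program (error handling mostly trimmed):

	#include <stdio.h>
	#include <stdlib.h>

	/* Print the number of vCPU kicks issued at unlock time. */
	int main(void)
	{
		unsigned long long kicks;
		FILE *f = fopen("/sys/kernel/qlockstat/pv_kick_unlock", "r");

		if (!f) {
			perror("fopen");
			return EXIT_FAILURE;
		}
		if (fscanf(f, "%llu", &kicks) == 1)
			printf("vCPU kicks at unlock: %llu\n", kicks);
		fclose(f);
		return 0;
	}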

The measured latencies for different CPUs are:

	CPU		Wakeup		Kicking
	---		------		-------
	Haswell-EX	63.6us		 7.4us
	Westmere-EX	67.6us		 9.3us

The measured latencies varied a bit from run to run. The wakeup
latency is much higher than the kicking latency.

A sample of statistics counts after system bootup (with vCPU
overcommit) was:

pv_hash_hops=1.00
pv_kick_unlock=1148
pv_kick_wake=1146
pv_latency_kick=11040
pv_latency_wake=194840
pv_spurious_wakeup=7
pv_wait_again=4
pv_wait_head=23
pv_wait_node=1129
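
Note that pv_hash_hops, pv_latency_kick and pv_latency_wake above are not
raw counts but averages computed at read time by the sysfs show functions
added below, roughly:

	pv_latency_kick = (sum of kick latencies) / pv_kick_unlock
	pv_latency_wake = (sum of wake latencies) / pv_kick_wake
	pv_hash_hops    = (total hash hops) / pv_kick_unlock

so the sample shows an average of about 11us per kick and about 195us from
kick to wakeup under vCPU overcommit.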

Signed-off-by: Waiman Long <Waiman.Long@....com>
---
 arch/x86/Kconfig                    |    8 +
 kernel/locking/qspinlock_paravirt.h |   32 ++++-
 kernel/locking/qspinlock_stat.h     |  291 +++++++++++++++++++++++++++++++++++
 3 files changed, 326 insertions(+), 5 deletions(-)
 create mode 100644 kernel/locking/qspinlock_stat.h

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 4a9b9a9..403bfea 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -688,6 +688,14 @@ config PARAVIRT_SPINLOCKS
 
 	  If you are unsure how to answer this question, answer Y.
 
+config QUEUED_LOCK_STAT
+	bool "Paravirt queued spinlock statistics"
+	depends on PARAVIRT_SPINLOCKS && SYSFS && QUEUED_SPINLOCKS
+	---help---
+	  Enable the collection of statistical data on the slowpath
+	  behavior of paravirtualized queued spinlocks and report
+	  them via sysfs.
+
 source "arch/x86/xen/Kconfig"
 
 config KVM_GUEST
diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h
index 4bd323d..aaeeefb 100644
--- a/kernel/locking/qspinlock_paravirt.h
+++ b/kernel/locking/qspinlock_paravirt.h
@@ -41,6 +41,11 @@ struct pv_node {
 };
 
 /*
+ * Include queued spinlock statistics code
+ */
+#include "qspinlock_stat.h"
+
+/*
  * Lock and MCS node addresses hash table for fast lookup
  *
  * Hashing is done on a per-cacheline basis to minimize the need to access
@@ -100,10 +105,13 @@ static struct qspinlock **pv_hash(struct qspinlock *lock, struct pv_node *node)
 {
 	unsigned long offset, hash = hash_ptr(lock, pv_lock_hash_bits);
 	struct pv_hash_entry *he;
+	int hopcnt = 0;
 
 	for_each_hash_entry(he, offset, hash) {
+		hopcnt++;
 		if (!cmpxchg(&he->lock, NULL, lock)) {
 			WRITE_ONCE(he->node, node);
+			qstat_hop(hopcnt);
 			return &he->lock;
 		}
 	}
@@ -164,9 +172,11 @@ static void pv_init_node(struct mcs_spinlock *node)
 static void pv_wait_node(struct mcs_spinlock *node)
 {
 	struct pv_node *pn = (struct pv_node *)node;
+	int waitcnt = 0;
 	int loop;
 
-	for (;;) {
+	/* waitcnt processing will be compiled out if !QUEUED_LOCK_STAT */
+	for (;; waitcnt++) {
 		for (loop = SPIN_THRESHOLD; loop; loop--) {
 			if (READ_ONCE(node->locked))
 				return;
@@ -184,12 +194,16 @@ static void pv_wait_node(struct mcs_spinlock *node)
 		 */
 		smp_store_mb(pn->state, vcpu_halted);
 
-		if (!READ_ONCE(node->locked))
+		if (!READ_ONCE(node->locked)) {
+			qstat_inc(qstat_pv_wait_node, true);
+			qstat_inc(qstat_pv_wait_again, waitcnt);
 			pv_wait(&pn->state, vcpu_halted);
+		}
 
 		/*
-		 * If pv_kick_node() changed us to vcpu_hashed, retain that value
-		 * so that pv_wait_head() knows to not also try to hash this lock.
+		 * If pv_kick_node() changed us to vcpu_hashed, retain that
+		 * value so that pv_wait_head() knows to not also try to hash
+		 * this lock.
 		 */
 		cmpxchg(&pn->state, vcpu_halted, vcpu_running);
 
@@ -200,6 +214,7 @@ static void pv_wait_node(struct mcs_spinlock *node)
 		 * So it is better to spin for a while in the hope that the
 		 * MCS lock will be released soon.
 		 */
+		qstat_inc(qstat_pv_spurious_wakeup, !READ_ONCE(node->locked));
 	}
 
 	/*
@@ -250,6 +265,7 @@ static void pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *node)
 	struct pv_node *pn = (struct pv_node *)node;
 	struct __qspinlock *l = (void *)lock;
 	struct qspinlock **lp = NULL;
+	int waitcnt = 0;
 	int loop;
 
 	/*
@@ -259,7 +275,7 @@ static void pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *node)
 	if (READ_ONCE(pn->state) == vcpu_hashed)
 		lp = (struct qspinlock **)1;
 
-	for (;;) {
+	for (;; waitcnt++) {
 		for (loop = SPIN_THRESHOLD; loop; loop--) {
 			if (!READ_ONCE(l->locked))
 				return;
@@ -290,14 +306,19 @@ static void pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *node)
 				return;
 			}
 		}
+		qstat_inc(qstat_pv_wait_head, true);
+		qstat_inc(qstat_pv_wait_again, waitcnt);
 		pv_wait(&l->locked, _Q_SLOW_VAL);
 
+		if (!READ_ONCE(l->locked))
+			return;
 		/*
 		 * The unlocker should have freed the lock before kicking the
 		 * CPU. So if the lock is still not free, it is a spurious
 		 * wakeup and so the vCPU should wait again after spinning for
 		 * a while.
 		 */
+		qstat_inc(qstat_pv_spurious_wakeup, true);
 	}
 
 	/*
@@ -352,6 +373,7 @@ __pv_queued_spin_unlock_slowpath(struct qspinlock *lock, u8 locked)
 	 * vCPU is harmless other than the additional latency in completing
 	 * the unlock.
 	 */
+	qstat_inc(qstat_pv_kick_unlock, true);
 	pv_kick(node->cpu);
 }
 
diff --git a/kernel/locking/qspinlock_stat.h b/kernel/locking/qspinlock_stat.h
new file mode 100644
index 0000000..16b84b2
--- /dev/null
+++ b/kernel/locking/qspinlock_stat.h
@@ -0,0 +1,291 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * Authors: Waiman Long <waiman.long@....com>
+ */
+
+/*
+ * When queued spinlock statistics are enabled, the following sysfs files
+ * will be created to hold the statistics counters:
+ *
+ * /sys/kernel/qlockstat/
+ *   pv_hash_hops	- average # of hops per hashing operation
+ *   pv_kick_unlock	- # of vCPU kicks issued at unlock time
+ *   pv_kick_wake	- # of vCPU kicks used for computing pv_latency_wake
+ *   pv_latency_kick	- average latency (ns) of vCPU kick operation
+ *   pv_latency_wake	- average latency (ns) from vCPU kick to wakeup
+ *   pv_spurious_wakeup	- # of spurious wakeups
+ *   pv_wait_again	- # of vCPU waits that happened after a vCPU kick
+ *   pv_wait_head	- # of vCPU waits at the queue head
+ *   pv_wait_node	- # of vCPU waits at a non-head queue node
+ *
+ * Writing to the "reset_counters" file will reset all the above counter
+ * values.
+ *
+ * These statistics counters are implemented as per-cpu variables which are
+ * summed and computed whenever the corresponding sysfs files are read. This
+ * minimizes the added overhead, making the counters usable even in a
+ * production environment.
+ *
+ * There may be a slight difference between pv_kick_wake and pv_kick_unlock.
+ */
+enum qlock_stats {
+	qstat_pv_hash_hops,
+	qstat_pv_kick_unlock,
+	qstat_pv_kick_wake,
+	qstat_pv_latency_kick,
+	qstat_pv_latency_wake,
+	qstat_pv_spurious_wakeup,
+	qstat_pv_wait_again,
+	qstat_pv_wait_head,
+	qstat_pv_wait_node,
+	qstat_num,	/* Total number of statistics counters */
+	qstat_reset_cnts = qstat_num,
+};
+
+#ifdef CONFIG_QUEUED_LOCK_STAT
+/*
+ * Collect pvqspinlock statistics
+ */
+#include <linux/kobject.h>
+#include <linux/sysfs.h>
+#include <linux/sched.h>
+
+static const char * const qstat_names[qstat_num + 1] = {
+	[qstat_pv_hash_hops]	   = "pv_hash_hops",
+	[qstat_pv_kick_unlock]     = "pv_kick_unlock",
+	[qstat_pv_kick_wake]       = "pv_kick_wake",
+	[qstat_pv_spurious_wakeup] = "pv_spurious_wakeup",
+	[qstat_pv_latency_kick]	   = "pv_latency_kick",
+	[qstat_pv_latency_wake]    = "pv_latency_wake",
+	[qstat_pv_wait_again]      = "pv_wait_again",
+	[qstat_pv_wait_head]       = "pv_wait_head",
+	[qstat_pv_wait_node]       = "pv_wait_node",
+	[qstat_reset_cnts]         = "reset_counters",
+};
+
+/*
+ * Per-cpu counters
+ */
+static DEFINE_PER_CPU(unsigned long, qstats[qstat_num]);
+static DEFINE_PER_CPU(u64, pv_kick_time);
+
+/*
+ * Sysfs data structures
+ */
+static struct kobj_attribute qstat_kobj_attrs[qstat_num + 1];
+static struct attribute *attrs[qstat_num + 2];
+static struct kobject *qstat_kobj;
+static struct attribute_group attr_group = {
+	.attrs = attrs,
+};
+
+/*
+ * Function to show the qlock statistics count
+ */
+static ssize_t
+qstat_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
+{
+	int cpu, idx;
+	u64 stat = 0;
+
+	/*
+	 * Compute the index of the kobj_attribute in the array and use it
+	 * as the index into the per-cpu qstats array.
+	 */
+	idx = attr - qstat_kobj_attrs;
+
+	for_each_online_cpu(cpu)
+		stat += per_cpu(qstats[idx], cpu);
+	return sprintf(buf, "%llu\n", stat);
+}
+
+/*
+ * Return the average kick latency (ns) = pv_latency_kick/pv_kick_unlock
+ */
+static ssize_t
+kick_latency_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
+{
+	int cpu;
+	u64 latencies = 0, kicks = 0;
+
+	for_each_online_cpu(cpu) {
+		kicks     += per_cpu(qstats[qstat_pv_kick_unlock],  cpu);
+		latencies += per_cpu(qstats[qstat_pv_latency_kick], cpu);
+	}
+
+	/* Rounded to the nearest ns */
+	return sprintf(buf, "%llu\n", kicks ? (latencies + kicks/2)/kicks : 0);
+}
+
+/*
+ * Return the average wake latency (ns) = pv_latency_wake/pv_kick_wake
+ */
+static ssize_t
+wake_latency_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
+{
+	int cpu;
+	u64 latencies = 0, kicks = 0;
+
+	for_each_online_cpu(cpu) {
+		kicks     += per_cpu(qstats[qstat_pv_kick_wake],    cpu);
+		latencies += per_cpu(qstats[qstat_pv_latency_wake], cpu);
+	}
+
+	/* Rounded to the nearest ns */
+	return sprintf(buf, "%llu\n", kicks ? (latencies + kicks/2)/kicks : 0);
+}
+
+/*
+ * Return the average hops/hash = pv_hash_hops/pv_kick_unlock
+ */
+static ssize_t
+hash_hop_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
+{
+	int cpu;
+	u64 hops = 0, kicks = 0;
+
+	for_each_online_cpu(cpu) {
+		kicks += per_cpu(qstats[qstat_pv_kick_unlock], cpu);
+		hops  += per_cpu(qstats[qstat_pv_hash_hops],   cpu);
+	}
+
+	if (!kicks)
+		return sprintf(buf, "0\n");
+
+	/*
+	 * Return a X.XX decimal number
+	 */
+	return sprintf(buf, "%llu.%02llu\n", hops/kicks,
+		      ((hops%kicks)*100 + kicks/2)/kicks);
+}
+
+/*
+ * Reset all the counter values
+ *
+ * Since the counter updates aren't atomic, the reset is done twice to make
+ * it very likely that all the counters are cleared.
+ */
+static ssize_t
+reset_counters_store(struct kobject *kobj, struct kobj_attribute *attr,
+		     const char *buf, size_t count)
+{
+	int cpu;
+
+	for_each_online_cpu(cpu) {
+		int i;
+		unsigned long *ptr = per_cpu_ptr(qstats, cpu);
+
+		for (i = 0 ; i < qstat_num; i++)
+			WRITE_ONCE(ptr[i], 0);
+		for (i = 0 ; i < qstat_num; i++)
+			WRITE_ONCE(ptr[i], 0);
+	}
+	return count;
+}
+
+/*
+ * Initialize sysfs for the qspinlock statistics
+ */
+static int __init init_qspinlock_stat(void)
+{
+	int i, retval;
+
+	qstat_kobj = kobject_create_and_add("qlockstat", kernel_kobj);
+	if (qstat_kobj == NULL)
+		return -ENOMEM;
+
+	/*
+	 * Initialize the attribute table
+	 *
+	 * As reading from and writing to the stat files can be slow, only
+	 * root is allowed to read or write them, limiting the impact on
+	 * system performance.
+	 */
+	for (i = 0; i <= qstat_num; i++) {
+		qstat_kobj_attrs[i].attr.name = qstat_names[i];
+		qstat_kobj_attrs[i].attr.mode = 0400;
+		qstat_kobj_attrs[i].show      = qstat_show;
+		attrs[i]		      = &qstat_kobj_attrs[i].attr;
+	}
+	qstat_kobj_attrs[qstat_pv_hash_hops].show    = hash_hop_show;
+	qstat_kobj_attrs[qstat_pv_latency_kick].show = kick_latency_show;
+	qstat_kobj_attrs[qstat_pv_latency_wake].show = wake_latency_show;
+
+	/*
+	 * Set attributes for reset_counters
+	 */
+	qstat_kobj_attrs[qstat_reset_cnts].attr.mode = 0200;
+	qstat_kobj_attrs[qstat_reset_cnts].show      = NULL;
+	qstat_kobj_attrs[qstat_reset_cnts].store     = reset_counters_store;
+
+	retval = sysfs_create_group(qstat_kobj, &attr_group);
+	if (retval)
+		kobject_put(qstat_kobj);
+
+	return retval;
+}
+fs_initcall(init_qspinlock_stat);
+
+/*
+ * Increment the PV qspinlock statistics counters
+ */
+static inline void qstat_inc(enum qlock_stats stat, bool cond)
+{
+	if (cond)
+		this_cpu_inc(qstats[stat]);
+}
+
+/*
+ * PV hash hop count
+ */
+static inline void qstat_hop(int hopcnt)
+{
+	this_cpu_add(qstats[qstat_pv_hash_hops], hopcnt);
+}
+
+/*
+ * Replacement function for pv_kick()
+ */
+static inline void __pv_kick(int cpu)
+{
+	u64 start = sched_clock();
+
+	per_cpu(pv_kick_time, cpu) = start;
+	pv_kick(cpu);
+	this_cpu_add(qstats[qstat_pv_latency_kick], sched_clock() - start);
+}
+
+/*
+ * Replacement function for pv_wait()
+ */
+static inline void __pv_wait(u8 *ptr, u8 val)
+{
+	u64 *pkick_time = this_cpu_ptr(&pv_kick_time);
+
+	*pkick_time = 0;
+	pv_wait(ptr, val);
+	if (*pkick_time) {
+		this_cpu_add(qstats[qstat_pv_latency_wake],
+			     sched_clock() - *pkick_time);
+		qstat_inc(qstat_pv_kick_wake, true);
+	}
+}
+
+#define pv_kick(c)	__pv_kick(c)
+#define pv_wait(p, v)	__pv_wait(p, v)
+
+#else /* CONFIG_QUEUED_LOCK_STAT */
+
+static inline void qstat_inc(enum qlock_stats stat, bool cond)	{ }
+static inline void qstat_hop(int hopcnt)			{ }
+
+#endif /* CONFIG_QUEUED_LOCK_STAT */
-- 
1.7.1
