[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <1437595962-21472-4-git-send-email-Waiman.Long@hp.com>
Date: Wed, 22 Jul 2015 16:12:38 -0400
From: Waiman Long <Waiman.Long@...com>
To: Peter Zijlstra <peterz@...radead.org>,
Ingo Molnar <mingo@...hat.com>,
Thomas Gleixner <tglx@...utronix.de>,
"H. Peter Anvin" <hpa@...or.com>
Cc: x86@...nel.org, linux-kernel@...r.kernel.org,
Scott J Norton <scott.norton@...com>,
Douglas Hatch <doug.hatch@...com>,
Davidlohr Bueso <dave@...olabs.net>,
Waiman Long <Waiman.Long@...com>
Subject: [PATCH v3 3/7] locking/pvqspinlock: Collect slowpath lock statistics
This patch enables the accumulation of kicking and waiting related
PV qspinlock statistics when the new QUEUED_LOCK_STAT configuration
option is selected. It also enables the collection of kicking and
wakeup latencies which have a heavy dependency on the CPUs being used.
The measured latencies for different CPUs are:
CPU Wakeup Kicking
--- ------ -------
Haswell-EX 89.8us 7.4us
Westmere-EX 67.6us 9.3us
The measured latencies varied a bit from run-to-run. The wakeup
latency is much higher than the kicking latency.
A sample of statistics counts after a kernel build (no CPU overcommit)
was:
hash_hops_count=43
kick_latencies=5783565492
kick_time_count=640269
lock_kick_count=640238
pending_fail_count=10672
pending_lock_count=2946871
spurious_wakeup=14
unlock_kick_count=38
wait_again_count=4
wait_head_count=41
wait_node_count=640242
wake_latencies=42491684295
Signed-off-by: Waiman Long <Waiman.Long@...com>
---
arch/x86/Kconfig | 7 ++
kernel/locking/qspinlock_paravirt.h | 178 ++++++++++++++++++++++++++++++++++-
2 files changed, 181 insertions(+), 4 deletions(-)
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 55bced1..299a1c4 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -679,6 +679,13 @@ config PARAVIRT_SPINLOCKS
If you are unsure how to answer this question, answer Y.
+config QUEUED_LOCK_STAT
+ bool "Paravirt queued lock statistics"
+ depends on PARAVIRT && DEBUG_FS && QUEUED_SPINLOCKS
+ ---help---
+ Enable the collection of statistical data on the behavior of
+ paravirtualized queued spinlocks and report them on debugfs.
+
source "arch/x86/xen/Kconfig"
config KVM_GUEST
diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h
index d64f054..db8d08b 100644
--- a/kernel/locking/qspinlock_paravirt.h
+++ b/kernel/locking/qspinlock_paravirt.h
@@ -44,6 +44,153 @@ struct pv_node {
};
/*
+ * PV qspinlock statistics
+ */
+enum pv_qlock_stat {
+ pvstat_wait_head,
+ pvstat_wait_node,
+ pvstat_wait_again,
+ pvstat_kick_time,
+ pvstat_lock_kick,
+ pvstat_unlock_kick,
+ pvstat_pend_lock,
+ pvstat_pend_fail,
+ pvstat_spurious,
+ pvstat_hops,
+ pvstat_num /* Total number of statistics counts */
+};
+
+#ifdef CONFIG_QUEUED_LOCK_STAT
+/*
+ * Collect pvqspinlock statiatics
+ */
+#include <linux/debugfs.h>
+#include <linux/sched.h>
+
+static const char * const stat_fsnames[pvstat_num] = {
+ [pvstat_wait_head] = "wait_head_count",
+ [pvstat_wait_node] = "wait_node_count",
+ [pvstat_wait_again] = "wait_again_count",
+ [pvstat_kick_time] = "kick_time_count",
+ [pvstat_lock_kick] = "lock_kick_count",
+ [pvstat_unlock_kick] = "unlock_kick_count",
+ [pvstat_pend_lock] = "pending_lock_count",
+ [pvstat_pend_fail] = "pending_fail_count",
+ [pvstat_spurious] = "spurious_wakeup",
+ [pvstat_hops] = "hash_hops_count",
+};
+
+static atomic_t pvstats[pvstat_num];
+
+/*
+ * pv_kick_latencies = sum of all pv_kick latencies in ns
+ * pv_wake_latencies = sum of all wakeup latencies in ns
+ *
+ * Avg kick latency = pv_kick_latencies/(lock_kick_count + unlock_kick_count)
+ * Avg wake latency = pv_wake_latencies/kick_time_count
+ * Avg # of hops/hash = hash_hops_count/unlock_kick_count
+ */
+static atomic64_t pv_kick_latencies, pv_wake_latencies;
+static DEFINE_PER_CPU(u64, pv_kick_time);
+
+/*
+ * Reset all the statistics counts if set
+ */
+static bool reset_cnts __read_mostly;
+
+/*
+ * Initialize debugfs for the PV qspinlock statistics
+ */
+static int __init pv_qspinlock_debugfs(void)
+{
+ struct dentry *d_pvqlock = debugfs_create_dir("pv-qspinlock", NULL);
+ int i;
+
+ if (!d_pvqlock)
+ pr_warn("Could not create 'pv-qspinlock' debugfs directory\n");
+
+ for (i = 0; i < pvstat_num; i++)
+ debugfs_create_u32(stat_fsnames[i], 0444, d_pvqlock,
+ (u32 *)&pvstats[i]);
+ debugfs_create_u64("kick_latencies", 0444, d_pvqlock,
+ (u64 *)&pv_kick_latencies);
+ debugfs_create_u64("wake_latencies", 0444, d_pvqlock,
+ (u64 *)&pv_wake_latencies);
+ debugfs_create_bool("reset_cnts", 0644, d_pvqlock, (u32 *)&reset_cnts);
+ return 0;
+}
+fs_initcall(pv_qspinlock_debugfs);
+
+/*
+ * Reset all the counts
+ */
+static noinline void pvstat_reset(void)
+{
+ int i;
+
+ for (i = 0; i < pvstat_num; i++)
+ atomic_set(&pvstats[i], 0);
+ atomic64_set(&pv_kick_latencies, 0);
+ atomic64_set(&pv_wake_latencies, 0);
+ reset_cnts = 0;
+}
+
+/*
+ * Increment the PV qspinlock statistics counts
+ */
+static inline void pvstat_inc(enum pv_qlock_stat stat)
+{
+ atomic_inc(&pvstats[stat]);
+ if (unlikely(reset_cnts))
+ pvstat_reset();
+}
+
+/*
+ * PV hash hop count
+ */
+static inline void pvstat_hop(int hopcnt)
+{
+ atomic_add(hopcnt, &pvstats[pvstat_hops]);
+}
+
+/*
+ * Replacement function for pv_kick()
+ */
+static inline void __pv_kick(int cpu)
+{
+ u64 start = sched_clock();
+
+ *per_cpu_ptr(&pv_kick_time, cpu) = start;
+ pv_kick(cpu);
+ atomic64_add(sched_clock() - start, &pv_kick_latencies);
+}
+
+/*
+ * Replacement function for pv_wait()
+ */
+static inline void __pv_wait(u8 *ptr, u8 val)
+{
+ u64 *pkick_time = this_cpu_ptr(&pv_kick_time);
+
+ *pkick_time = 0;
+ pv_wait(ptr, val);
+ if (*pkick_time) {
+ atomic64_add(sched_clock() - *pkick_time, &pv_wake_latencies);
+ pvstat_inc(pvstat_kick_time);
+ }
+}
+
+#define pv_kick(c) __pv_kick(c)
+#define pv_wait(p, v) __pv_wait(p, v)
+
+#else /* CONFIG_QUEUED_LOCK_STAT */
+
+static inline void pvstat_inc(enum pv_qlock_stat stat) { }
+static inline void pvstat_hop(int hopcnt) { }
+
+#endif /* CONFIG_QUEUED_LOCK_STAT */
+
+/*
* Lock and MCS node addresses hash table for fast lookup
*
* Hashing is done on a per-cacheline basis to minimize the need to access
@@ -103,10 +250,13 @@ static struct qspinlock **pv_hash(struct qspinlock *lock, struct pv_node *node)
{
unsigned long offset, hash = hash_ptr(lock, pv_lock_hash_bits);
struct pv_hash_entry *he;
+ int hopcnt = 0;
for_each_hash_entry(he, offset, hash) {
+ hopcnt++;
if (!cmpxchg(&he->lock, NULL, lock)) {
WRITE_ONCE(he->node, node);
+ pvstat_hop(hopcnt);
return &he->lock;
}
}
@@ -209,11 +359,13 @@ static int pv_pending_lock(struct qspinlock *lock, u32 val)
* Clear the pending bit and fall back to queuing
*/
clear_pending(lock);
+ pvstat_inc(pvstat_pend_fail);
queue:
return 0;
gotlock:
+ pvstat_inc(pvstat_pend_lock);
return 1;
}
@@ -224,9 +376,10 @@ gotlock:
static void pv_wait_node(struct mcs_spinlock *node)
{
struct pv_node *pn = (struct pv_node *)node;
+ int waitcnt = 0;
int loop;
- for (;;) {
+ for (;; waitcnt++) {
for (loop = SPIN_THRESHOLD; loop; loop--) {
if (READ_ONCE(node->locked))
return;
@@ -244,14 +397,20 @@ static void pv_wait_node(struct mcs_spinlock *node)
*/
smp_store_mb(pn->state, vcpu_halted);
- if (!READ_ONCE(node->locked))
+ if (!READ_ONCE(node->locked)) {
+ pvstat_inc(pvstat_wait_node);
+ if (waitcnt)
+ pvstat_inc(pvstat_wait_again);
pv_wait(&pn->state, vcpu_halted);
+ }
/*
* Reset the vCPU state to avoid unncessary CPU kicking
*/
WRITE_ONCE(pn->state, vcpu_running);
+ if (READ_ONCE(node->locked))
+ break;
/*
* If the locked flag is still not set after wakeup, it is a
* spurious wakeup and the vCPU should wait again. However,
@@ -259,6 +418,7 @@ static void pv_wait_node(struct mcs_spinlock *node)
* So it is better to spin for a while in the hope that the
* MCS lock will be released soon.
*/
+ pvstat_inc(pvstat_spurious);
}
/*
* By now our node->locked should be 1 and our caller will not actually
@@ -284,8 +444,10 @@ static void pv_kick_node(struct mcs_spinlock *node)
*
* See the comment in pv_wait_node().
*/
- if (xchg(&pn->state, vcpu_running) == vcpu_halted)
+ if (xchg(&pn->state, vcpu_running) == vcpu_halted) {
+ pvstat_inc(pvstat_lock_kick);
pv_kick(pn->cpu);
+ }
}
/*
@@ -297,9 +459,10 @@ static void pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *node)
struct pv_node *pn = (struct pv_node *)node;
struct __qspinlock *l = (void *)lock;
struct qspinlock **lp = NULL;
+ int waitcnt = 0;
int loop;
- for (;;) {
+ for (;; waitcnt++) {
for (loop = SPIN_THRESHOLD; loop; loop--) {
if (!READ_ONCE(l->locked))
return;
@@ -327,14 +490,20 @@ static void pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *node)
return;
}
}
+ pvstat_inc(pvstat_wait_head);
+ if (waitcnt)
+ pvstat_inc(pvstat_wait_again);
pv_wait(&l->locked, _Q_SLOW_VAL);
+ if (!READ_ONCE(l->locked))
+ return;
/*
* The unlocker should have freed the lock before kicking the
* CPU. So if the lock is still not free, it is a spurious
* wakeup and so the vCPU should wait again after spinning for
* a while.
*/
+ pvstat_inc(pvstat_spurious);
}
/*
@@ -386,6 +555,7 @@ __visible void __pv_queued_spin_unlock(struct qspinlock *lock)
* At this point the memory pointed at by lock can be freed/reused,
* however we can still use the pv_node to kick the CPU.
*/
+ pvstat_inc(pvstat_unlock_kick);
pv_kick(node->cpu);
}
/*
--
1.7.1
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
Powered by blists - more mailing lists