linux-kernel - [PATCH v3 5/7] locking/pvqspinlock: Allow vCPUs kick-ahead

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <1437595962-21472-6-git-send-email-Waiman.Long@hp.com>
Date:	Wed, 22 Jul 2015 16:12:40 -0400
From:	Waiman Long <Waiman.Long@...com>
To:	Peter Zijlstra <peterz@...radead.org>,
	Ingo Molnar <mingo@...hat.com>,
	Thomas Gleixner <tglx@...utronix.de>,
	"H. Peter Anvin" <hpa@...or.com>
Cc:	x86@...nel.org, linux-kernel@...r.kernel.org,
	Scott J Norton <scott.norton@...com>,
	Douglas Hatch <doug.hatch@...com>,
	Davidlohr Bueso <dave@...olabs.net>,
	Waiman Long <Waiman.Long@...com>
Subject: [PATCH v3 5/7] locking/pvqspinlock: Allow vCPUs kick-ahead

Frequent CPU halting (vmexit) and CPU kicking (vmenter) lengthens
critical section and block forward progress.  This patch implements
a kick-ahead mechanism where the unlocker will kick the queue head
vCPUs as well as up to four additional vCPUs next to the queue head
if they were halted.  The kickings are done after exiting the critical
section to improve parallelism.

The amount of kick-ahead allowed depends on the number of vCPUs
in the VM guest. Currently it allows up to 1 vCPU kick-ahead per
4 vCPUs available up to a maximum of PV_KICK_AHEAD_MAX (4). There
are diminishing returns in increasing the maximum value. The current
value of 4 is a compromise of getting a nice performance boost without
penalizing too much on the one vCPU that is doing all the kickings.

Linux kernel builds were run in KVM guest on an 8-socket, 4
cores/socket Westmere-EX system and a 4-socket, 8 cores/socket
Haswell-EX system. Both systems are configured to have 32 physical
CPUs. The kernel build times before and after the patch were:

		    Westmere			Haswell
  Patch		32 vCPUs    48 vCPUs	32 vCPUs    48 vCPUs
  -----		--------    --------    --------    --------
  Before patch	 3m42.3s    10m27.5s	 2m00.7s    17m22.2s
  After patch	 3m02.3s     9m35.9s	 1m59.8s    16m57.6s

This improves performance quite substantially on Westmere, but not
so much on Haswell.

Signed-off-by: Waiman Long <Waiman.Long@...com>
---
 kernel/locking/qspinlock_paravirt.h |  108 ++++++++++++++++++++++++++++++++---
 1 files changed, 99 insertions(+), 9 deletions(-)

diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h
index a4986c6..96c3833 100644
--- a/kernel/locking/qspinlock_paravirt.h
+++ b/kernel/locking/qspinlock_paravirt.h
@@ -54,6 +54,7 @@ enum pv_qlock_stat {
 	pvstat_kick_time,
 	pvstat_lock_kick,
 	pvstat_unlock_kick,
+	pvstat_kick_ahead,
 	pvstat_pend_lock,
 	pvstat_pend_fail,
 	pvstat_spurious,
@@ -75,6 +76,7 @@ static const char * const stat_fsnames[pvstat_num] = {
 	[pvstat_kick_time]   = "kick_time_count",
 	[pvstat_lock_kick]   = "lock_kick_count",
 	[pvstat_unlock_kick] = "unlock_kick_count",
+	[pvstat_kick_ahead]  = "kick_ahead_count",
 	[pvstat_pend_lock]   = "pending_lock_count",
 	[pvstat_pend_fail]   = "pending_fail_count",
 	[pvstat_spurious]    = "spurious_wakeup",
@@ -87,7 +89,8 @@ static atomic_t pvstats[pvstat_num];
  * pv_kick_latencies = sum of all pv_kick latencies in ns
  * pv_wake_latencies = sum of all wakeup latencies in ns
  *
- * Avg kick latency   = pv_kick_latencies/(lock_kick_count + unlock_kick_count)
+ * Avg kick latency   = pv_kick_latencies/
+ *			(lock_kick_count + unlock_kick_count + kick_ahead_count)
  * Avg wake latency   = pv_wake_latencies/kick_time_count
  * Avg # of hops/hash = hash_hops_count/unlock_kick_count
  */
@@ -219,6 +222,18 @@ static struct pv_hash_entry *pv_lock_hash;
 static unsigned int pv_lock_hash_bits __read_mostly;
 
 /*
+ * Allow kick-ahead of vCPUs at unlock time
+ *
+ * The pv_kick_ahead value is set by a simple formula that 1 vCPU kick-ahead
+ * is allowed per 4 vCPUs available up to a maximum of PV_KICK_AHEAD_MAX.
+ * There are diminishing returns in increasing PV_KICK_AHEAD_MAX. The current
+ * value of 4 is a good compromise that gives a good performance boost without
+ * penalizing the vCPU that is doing the kicking by too much.
+ */
+#define PV_KICK_AHEAD_MAX	4
+static int pv_kick_ahead __read_mostly;
+
+/*
  * Allocate memory for the PV qspinlock hash buckets
  *
  * This function should be called from the paravirt spinlock initialization
@@ -226,7 +241,8 @@ static unsigned int pv_lock_hash_bits __read_mostly;
  */
 void __init __pv_init_lock_hash(void)
 {
-	int pv_hash_size = ALIGN(4 * num_possible_cpus(), PV_HE_PER_LINE);
+	int ncpus = num_possible_cpus();
+	int pv_hash_size = ALIGN(4 * ncpus, PV_HE_PER_LINE);
 
 	if (pv_hash_size < PV_HE_MIN)
 		pv_hash_size = PV_HE_MIN;
@@ -240,6 +256,13 @@ void __init __pv_init_lock_hash(void)
 					       pv_hash_size, 0, HASH_EARLY,
 					       &pv_lock_hash_bits, NULL,
 					       pv_hash_size, pv_hash_size);
+	/*
+	 * Enable the unlock kick ahead mode according to the number of
+	 * vCPUs available.
+	 */
+	pv_kick_ahead = min(ncpus/4, PV_KICK_AHEAD_MAX);
+	if (pv_kick_ahead)
+		pr_info("PV unlock kick ahead max count = %d\n", pv_kick_ahead);
 }
 
 #define for_each_hash_entry(he, offset, hash)						\
@@ -433,6 +456,28 @@ static void pv_wait_node(struct mcs_spinlock *node)
 }
 
 /*
+ * Helper to get the address of the next kickable node
+ *
+ * The node has to be in the halted state. If the chkonly flag is set,
+ * the CPU state won't be changed. Otherwise, it will be transitioned to
+ * running state by this function. If no kickable node is found, NULL
+ * will be returned.
+ */
+static inline struct pv_node *
+pv_get_kick_node(struct pv_node *node, int chkonly)
+{
+	struct pv_node *next = (struct pv_node *)READ_ONCE(node->mcs.next);
+
+	if (!next || (READ_ONCE(next->state) != vcpu_halted))
+		return NULL;
+
+	if (!chkonly && (xchg(&next->state, vcpu_running) != vcpu_halted))
+		next = NULL;	/* No kicking is needed */
+
+	return next;
+}
+
+/*
  * Called after setting next->locked = 1, used to wake those stuck in
  * pv_wait_node(). Alternatively, it can also defer the kicking to the
  * unlock function.
@@ -445,14 +490,36 @@ static void pv_kick_node(struct qspinlock *lock, struct mcs_spinlock *node)
 		return;
 
 	/*
-	 * Note that because node->locked is already set, this actual
-	 * mcs_spinlock entry could be re-used already.
+	 * Kicking the next node at lock time can actually be a bit faster
+	 * than doing it at unlock time because the critical section time
+	 * overlaps with the wakeup latency of the next node. However, if the
+	 * VM is too overcommmitted, it can happen that we need to kick the
+	 * CPU again at unlock time (double-kick). To avoid that and also to
+	 * fully utilize the kick-ahead functionality at unlock time,
+	 * the kicking will be deferred under either one of the following
+	 * 2 conditions:
+	 * 1) The VM guest has too few vCPUs that kick-ahead is not even
+	 *    enabled. In this case, the chance of double-kick will be
+	 *    higher.
+	 * 2) The node after the next one is also in the halted state.
 	 *
-	 * This should be fine however, kicking people for no reason is
-	 * harmless.
-	 *
-	 * See the comment in pv_wait_node().
+	 * In this case, the state is set to vcpu_hased to indicate that hash
+	 * table has been filled and _Q_SLOW_VAL is set.
 	 */
+	if (!pv_kick_ahead || pv_get_kick_node(pn, 1)) {
+		if (xchg(&pn->state, vcpu_hashed) != vcpu_hashed) {
+			struct __qspinlock *l = (void *)lock;
+
+			/*
+			 * As this is the same vCPU that will check the
+			 * _Q_SLOW_VAL value and the hash table later on at
+			 * unlock time, no atomic instruction is needed.
+			 */
+			WRITE_ONCE(l->locked, _Q_SLOW_VAL);
+			(void)pv_hash(lock, pn);
+		}
+		return;
+	}
 	pvstat_inc(pvstat_lock_kick);
 	pv_kick(pn->cpu);
 }
@@ -534,7 +601,8 @@ static void pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *node)
 __visible void __pv_queued_spin_unlock(struct qspinlock *lock)
 {
 	struct __qspinlock *l = (void *)lock;
-	struct pv_node *node;
+	struct pv_node *node, *next;
+	int i, nr_kick, cpus[PV_KICK_AHEAD_MAX];
 	u8 lockval = cmpxchg(&l->locked, _Q_LOCKED_VAL, 0);
 
 	/*
@@ -560,6 +628,20 @@ __visible void __pv_queued_spin_unlock(struct qspinlock *lock)
 	node = pv_unhash(lock);
 
 	/*
+	 * Implement unlock kick-ahead
+	 *
+	 * Access the next group of nodes, if available, and prepare to kick
+	 * them after releasing the lock if they are in the halted state. This
+	 * should improve performance on an overcommitted system.
+	 */
+	for (nr_kick = 0, next = node; nr_kick < pv_kick_ahead; nr_kick++) {
+		next = pv_get_kick_node(next, 0);
+		if (!next)
+			break;
+		cpus[nr_kick] = next->cpu;
+	}
+
+	/*
 	 * Now that we have a reference to the (likely) blocked pv_node,
 	 * release the lock.
 	 */
@@ -571,6 +653,14 @@ __visible void __pv_queued_spin_unlock(struct qspinlock *lock)
 	 */
 	pvstat_inc(pvstat_unlock_kick);
 	pv_kick(node->cpu);
+
+	/*
+	 * Kick the next group of vCPUs, if available.
+	 */
+	for (i = 0; i < nr_kick; i++) {
+		pvstat_inc(pvstat_kick_ahead);
+		pv_kick(cpus[i]);
+	}
 }
 /*
  * Include the architecture specific callee-save thunk of the
-- 
1.7.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/