linux-kernel - [PATCH v7 4/5] locking/pvqspinlock: Allow 1 lock stealing attempt

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <1442955044-43895-5-git-send-email-Waiman.Long@hpe.com>
Date:	Tue, 22 Sep 2015 16:50:43 -0400
From:	Waiman Long <Waiman.Long@....com>
To:	Peter Zijlstra <peterz@...radead.org>,
	Ingo Molnar <mingo@...hat.com>,
	Thomas Gleixner <tglx@...utronix.de>,
	"H. Peter Anvin" <hpa@...or.com>
Cc:	x86@...nel.org, linux-kernel@...r.kernel.org,
	Scott J Norton <scott.norton@....com>,
	Douglas Hatch <doug.hatch@....com>,
	Davidlohr Bueso <dave@...olabs.net>,
	Waiman Long <Waiman.Long@....com>
Subject: [PATCH v7 4/5] locking/pvqspinlock: Allow 1 lock stealing attempt

This patch allows one attempt for the lock waiter to steal the lock
when entering the PV slowpath.  This helps to reduce the performance
penalty caused by lock waiter preemption while not having much of
the downsides of a real unfair lock.

Linux kernel builds were run in KVM guest on an 8-socket, 4
cores/socket Westmere-EX system and a 4-socket, 8 cores/socket
Haswell-EX system. Both systems are configured to have 32 physical
CPUs. The kernel build times before and after the patch were:

                    Westmere                    Haswell
  Patch         32 vCPUs    48 vCPUs    32 vCPUs    48 vCPUs
  -----         --------    --------    --------    --------
  Before patch   3m15.6s    10m56.1s     1m44.1s     5m29.1s
  After patch    3m02.3s     5m00.2s     1m43.7s     3m03.5s

For the overcommited case (48 vCPUs), this patch is able to reduce
kernel build time by more than 54% for Westmere and 44% for Haswell.

Signed-off-by: Waiman Long <Waiman.Long@....com>
---
 kernel/locking/qspinlock.c          |   19 ++---
 kernel/locking/qspinlock_paravirt.h |  128 ++++++++++++++++++++++++++++-------
 2 files changed, 112 insertions(+), 35 deletions(-)

diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
index d2f3fda..42b20b2 100644
--- a/kernel/locking/qspinlock.c
+++ b/kernel/locking/qspinlock.c
@@ -249,17 +249,15 @@ static __always_inline void set_locked(struct qspinlock *lock)
 
 static __always_inline void __pv_init_node(struct mcs_spinlock *node) { }
 static __always_inline void __pv_wait_node(struct mcs_spinlock *node) { }
-static __always_inline void __pv_kick_node(struct qspinlock *lock,
-					   struct mcs_spinlock *node) { }
-static __always_inline void __pv_wait_head(struct qspinlock *lock,
-					   struct mcs_spinlock *node) { }
-
+static __always_inline bool __pv_wait_head_and_lock(struct qspinlock *lock,
+						    struct mcs_spinlock *node,
+						    u32 tail)
+						    { return false; }
 #define pv_enabled()		false
 
 #define pv_init_node		__pv_init_node
 #define pv_wait_node		__pv_wait_node
-#define pv_kick_node		__pv_kick_node
-#define pv_wait_head		__pv_wait_head
+#define pv_wait_head_and_lock	__pv_wait_head_and_lock
 
 #ifdef CONFIG_PARAVIRT_SPINLOCKS
 #define queued_spin_lock_slowpath	native_queued_spin_lock_slowpath
@@ -417,7 +415,8 @@ queue:
 	 * does not imply a full barrier.
 	 *
 	 */
-	pv_wait_head(lock, node);
+	if (pv_wait_head_and_lock(lock, node, tail))
+		goto release;
 	while ((val = smp_load_acquire(&lock->val.counter)) & _Q_LOCKED_PENDING_MASK)
 		cpu_relax();
 
@@ -454,7 +453,6 @@ queue:
 		cpu_relax();
 
 	arch_mcs_spin_unlock_contended(&next->locked);
-	pv_kick_node(lock, next);
 
 release:
 	/*
@@ -475,8 +473,7 @@ EXPORT_SYMBOL(queued_spin_lock_slowpath);
 
 #undef pv_init_node
 #undef pv_wait_node
-#undef pv_kick_node
-#undef pv_wait_head
+#undef pv_wait_head_and_lock
 
 #undef  queued_spin_lock_slowpath
 #define queued_spin_lock_slowpath	__pv_queued_spin_lock_slowpath
diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h
index ab823b6..66f3964 100644
--- a/kernel/locking/qspinlock_paravirt.h
+++ b/kernel/locking/qspinlock_paravirt.h
@@ -41,6 +41,31 @@ struct pv_node {
 };
 
 /*
+ * Allow one unfair trylock when entering the PV slowpath to reduce the
+ * performance impact of lock waiter preemption (either explicitly via
+ * pv_wait or implicitly via PLE). This function will be called once when
+ * a lock waiter enter the slowpath before being queued.
+ *
+ * A little bit of unfairness here can improve performance without many
+ * of the downsides of a real unfair lock.
+ */
+#define queued_spin_trylock(l)	pv_queued_spin_trylock_unfair(l)
+static inline bool pv_queued_spin_trylock_unfair(struct qspinlock *lock)
+{
+	struct __qspinlock *l = (void *)lock;
+
+	if (READ_ONCE(l->locked))
+		return 0;
+	/*
+	 * Wait a bit here to ensure that an actively spinning queue head vCPU
+	 * has a fair chance of getting the lock.
+	 */
+	cpu_relax();
+
+	return cmpxchg(&l->locked, 0, _Q_LOCKED_VAL) == 0;
+}
+
+/*
  * PV qspinlock statistics
  */
 enum pv_qlock_stat {
@@ -49,6 +74,7 @@ enum pv_qlock_stat {
 	pvstat_wait_again,
 	pvstat_kick_wait,
 	pvstat_kick_unlock,
+	pvstat_steal_lock,
 	pvstat_spurious,
 	pvstat_hops,
 	pvstat_num	/* Total number of statistics counts */
@@ -67,6 +93,7 @@ static const char * const stat_fsnames[pvstat_num] = {
 	[pvstat_wait_again]  = "wait_again_count",
 	[pvstat_kick_wait]   = "kick_wait_count",
 	[pvstat_kick_unlock] = "kick_unlock_count",
+	[pvstat_steal_lock]  = "lock_stealing_count",
 	[pvstat_spurious]    = "spurious_wakeup",
 	[pvstat_hops]	     = "hash_hops_count",
 };
@@ -146,6 +173,19 @@ static inline void pvstat_hop(int hopcnt)
 }
 
 /*
+ * PV unfair trylock count
+ */
+static inline int pvstat_trylock_unfair(struct qspinlock *lock)
+{
+	int ret = pv_queued_spin_trylock_unfair(lock);
+
+	pvstat_inc(pvstat_steal_lock, ret);
+	return ret;
+}
+#undef  queued_spin_trylock
+#define queued_spin_trylock(l)	pvstat_trylock_unfair(l)
+
+/*
  * Replacement function for pv_kick()
  */
 static inline void __pv_kick(int cpu)
@@ -338,8 +378,8 @@ static void pv_wait_node(struct mcs_spinlock *node)
 
 		/*
 		 * If pv_kick_node() changed us to vcpu_hashed, retain that
-		 * value so that pv_wait_head() knows to not also try to hash
-		 * this lock.
+		 * value so that pv_wait_head_and_lock() knows to not also
+		 * try to hash this lock.
 		 */
 		cmpxchg(&pn->state, vcpu_halted, vcpu_running);
 
@@ -363,8 +403,9 @@ static void pv_wait_node(struct mcs_spinlock *node)
 /*
  * Called after setting next->locked = 1 when we're the lock owner.
  *
- * Instead of waking the waiters stuck in pv_wait_node() advance their state such
- * that they're waiting in pv_wait_head(), this avoids a wake/sleep cycle.
+ * Instead of waking the waiters stuck in pv_wait_node() advance their state
+ * such that they're waiting in pv_wait_head_and_lock(), this avoids a
+ * wake/sleep cycle.
  */
 static void pv_kick_node(struct qspinlock *lock, struct mcs_spinlock *node)
 {
@@ -396,13 +437,15 @@ static void pv_kick_node(struct qspinlock *lock, struct mcs_spinlock *node)
  * Wait for l->locked to become clear; halt the vcpu after a short spin.
  * __pv_queued_spin_unlock() will wake us.
  */
-static void pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *node)
+static bool pv_wait_head_and_lock(struct qspinlock *lock,
+				  struct mcs_spinlock *node, u32 tail)
 {
 	struct pv_node *pn = (struct pv_node *)node;
+	struct mcs_spinlock *next;
 	struct __qspinlock *l = (void *)lock;
 	struct qspinlock **lp = NULL;
 	int waitcnt = 0;
-	int loop;
+	int loop, old;
 
 	/*
 	 * If pv_kick_node() already advanced our state, we don't need to
@@ -412,10 +455,23 @@ static void pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *node)
 		lp = (struct qspinlock **)1;
 
 	for (;; waitcnt++) {
-		for (loop = SPIN_THRESHOLD; loop; loop--) {
-			if (!READ_ONCE(l->locked))
-				return;
-			cpu_relax();
+		loop = SPIN_THRESHOLD;
+		while (loop) {
+			/*
+			 * Spin until the lock is free
+			 */
+			for (; loop && READ_ONCE(l->locked); loop--)
+				cpu_relax();
+			/*
+			 * Seeing the lock is free, this queue head vCPU is
+			 * the rightful next owner of the lock. However, the
+			 * lock may have just been stolen by another task which
+			 * has entered the slowpath. So we need to use atomic
+			 * operation to make sure that we really get the lock.
+			 * Otherwise, we have to wait again.
+			 */
+			if (cmpxchg(&l->locked, 0, _Q_LOCKED_VAL) == 0)
+				goto gotlock;
 		}
 
 		if (!lp) { /* ONCE */
@@ -432,36 +488,60 @@ static void pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *node)
 			 *
 			 * Matches the smp_rmb() in __pv_queued_spin_unlock().
 			 */
-			if (!cmpxchg(&l->locked, _Q_LOCKED_VAL, _Q_SLOW_VAL)) {
+			if (xchg(&l->locked, _Q_SLOW_VAL) == 0) {
 				/*
-				 * The lock is free and _Q_SLOW_VAL has never
-				 * been set. Therefore we need to unhash before
-				 * getting the lock.
+				 * The lock was free and now we own the lock.
+				 * Change the lock value back to _Q_LOCKED_VAL
+				 * and unhash the table.
 				 */
+				WRITE_ONCE(l->locked, _Q_LOCKED_VAL);
 				WRITE_ONCE(*lp, NULL);
-				return;
+				goto gotlock;
 			}
 		}
 		pvstat_inc(pvstat_wait_head, true);
 		pvstat_inc(pvstat_wait_again, waitcnt);
 		pv_wait(&l->locked, _Q_SLOW_VAL);
 
-		if (!READ_ONCE(l->locked))
-			return;
 		/*
 		 * The unlocker should have freed the lock before kicking the
 		 * CPU. So if the lock is still not free, it is a spurious
-		 * wakeup and so the vCPU should wait again after spinning for
-		 * a while.
+		 * wakeup or another vCPU has stolen the lock. The current
+		 * vCPU should spin again.
 		 */
-		pvstat_inc(pvstat_spurious, true);
+		pvstat_inc(pvstat_spurious, READ_ONCE(l->locked));
 	}
 
+gotlock:
 	/*
-	 * Lock is unlocked now; the caller will acquire it without waiting.
-	 * As with pv_wait_node() we rely on the caller to do a load-acquire
-	 * for us.
+	 * We now have the lock. We need to either clear the tail code or
+	 * notify the next one in queue as the new queue head.
 	 */
+	old = atomic_read(&lock->val);
+	while ((old & _Q_TAIL_MASK) == tail) {
+		int val;
+		int new = old & ~_Q_TAIL_MASK;
+
+		/*
+		 * We are the only one in the queue, so clear the tail code
+		 * and return.
+		 */
+		val = atomic_cmpxchg(&lock->val, old, new);
+		if (old == val)
+			goto done;
+		old = val;
+	}
+
+	/*
+	 * contended path; wait for next, release.
+	 */
+	while (!(next = READ_ONCE(node->next)))
+		cpu_relax();
+
+	arch_mcs_spin_unlock_contended(&next->locked);
+	pv_kick_node(lock, next);
+done:
+	return 1;
 }
 
 /*
@@ -486,7 +566,7 @@ __pv_queued_spin_unlock_slowpath(struct qspinlock *lock, u8 locked)
 	 * so we need a barrier to order the read of the node data in
 	 * pv_unhash *after* we've read the lock being _Q_SLOW_VAL.
 	 *
-	 * Matches the cmpxchg() in pv_wait_head() setting _Q_SLOW_VAL.
+	 * Matches the cmpxchg() in pv_wait_head_and_lock() setting _Q_SLOW_VAL.
 	 */
 	smp_rmb();
 
-- 
1.7.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/