lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <1486145034-22210-12-git-send-email-longman@redhat.com>
Date:   Fri,  3 Feb 2017 13:03:44 -0500
From:   Waiman Long <longman@...hat.com>
To:     Thomas Gleixner <tglx@...utronix.de>,
        Ingo Molnar <mingo@...nel.org>,
        Peter Zijlstra <peterz@...radead.org>,
        Jonathan Corbet <corbet@....net>
Cc:     linux-kernel@...r.kernel.org, linux-doc@...r.kernel.org,
        Arnaldo Carvalho de Melo <acme@...nel.org>,
        Davidlohr Bueso <dave@...olabs.net>,
        Mike Galbraith <umgwanakikbuti@...il.com>,
        Scott J Norton <scott.norton@....com>,
        Waiman Long <longman@...hat.com>
Subject: [PATCH-tip v5 11/21] TP-futex: Implement lock handoff to prevent lock starvation

The current TP futexes has no guarantee that the top waiter
(serialization mutex owner) can get the lock within a finite time.
As a result, lock starvation can happen.

A lock handoff mechanism is added to the TP futexes to prevent lock
starvation from happening. The idea is that the top waiter can set a
special handoff_pid variable with its own pid. The futex owner will
then check this variable at unlock time to see if it should free the
futex or change its value to the designated pid to transfer the lock
directly to the top waiter.

This change does have the effect of limiting the amount of unfairness
and hence may adversely affect performance depending on the workloads.

Currently the handoff mechanism is triggered when the top waiter fails
to acquire the lock after about 5ms of spinning and sleeping. So the
maximum lock handoff rate is about 200 handoffs per second.

Signed-off-by: Waiman Long <longman@...hat.com>
---
 kernel/futex.c | 59 ++++++++++++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 53 insertions(+), 6 deletions(-)

diff --git a/kernel/futex.c b/kernel/futex.c
index 46a1a4b..348b44c 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -63,6 +63,7 @@
 #include <linux/pid.h>
 #include <linux/nsproxy.h>
 #include <linux/ptrace.h>
+#include <linux/sched/clock.h>
 #include <linux/sched/rt.h>
 #include <linux/sched/wake_q.h>
 #include <linux/sched/mm.h>
@@ -237,6 +238,8 @@ struct futex_state {
 	struct task_struct *mutex_owner;	/* For TP futexes only */
 	atomic_t refcount;
 
+	u32 handoff_pid;			/* For TP futexes only */
+
 	enum futex_type type;
 	union futex_key key;
 };
@@ -914,6 +917,7 @@ static int refill_futex_state_cache(void)
 	/* pi_mutex gets initialized later */
 	state->owner = NULL;
 	state->mutex_owner = NULL;
+	state->handoff_pid = 0;
 	atomic_set(&state->refcount, 1);
 	state->key = FUTEX_KEY_INIT;
 
@@ -3337,8 +3341,9 @@ void exit_robust_list(struct task_struct *curr)
  * and unlocking.
  *
  * The purpose of this FUTEX_WAITERS bit is to make the unlocker wake up the
- * serialization mutex owner. Not having the FUTEX_WAITERS bit set doesn't
- * mean there is no waiter in the kernel.
+ * serialization mutex owner or to hand off the lock directly to the top
+ * waiter. Not having the FUTEX_WAITERS bit set doesn't mean there is no
+ * waiter in the kernel.
  *
  * Like PI futexes, TP futexes are orthogonal to robust futexes can be
  * used together.
@@ -3354,6 +3359,16 @@ void exit_robust_list(struct task_struct *curr)
  * the ownership of the futex.
  */
 
+/*
+ * Timeout value for enabling lock handoff and prevent lock starvation.
+ *
+ * Currently, lock handoff will be enabled if the top waiter can't get the
+ * futex after about 5ms. To reduce time checking overhead, it is only done
+ * after every 128 spins or right after wakeup. As a result, the actual
+ * elapsed time will be a bit longer than the specified value.
+ */
+#define TP_HANDOFF_TIMEOUT	5000000	/* 5ms	*/
+
 /**
  * lookup_futex_state - Looking up the futex state structure.
  * @hb:		 hash bucket
@@ -3433,6 +3448,7 @@ static inline int put_futex_state_unlocked(struct futex_state *state)
  * If waiter is true
  * then
  *   don't preserve the flag bits;
+ *   check for handoff (futex word == own pid)
  * else
  *   preserve the flag bits
  * endif
@@ -3451,6 +3467,9 @@ static inline int futex_trylock(u32 __user *uaddr, const u32 vpid, u32 *puval,
 
 	uval = *puval;
 
+	if (waiter && (uval & FUTEX_TID_MASK) == vpid)
+		return 1;
+
 	if (uval & FUTEX_TID_MASK)
 		return 0;	/* Trylock fails */
 
@@ -3544,10 +3563,12 @@ static int futex_spin_on_owner(u32 __user *uaddr, const u32 vpid,
 	"futex: owner pid %d of TP futex 0x%lx was %s.\n"	\
 	"\tLock is now acquired by pid %d!\n"
 
-	int ret;
+	int ret, loopcnt = 1;
+	bool handoff_set = false;
 	u32 uval;
 	u32 owner_pid = 0;
 	struct task_struct *owner_task = NULL;
+	u64 handoff_time = running_clock() + TP_HANDOFF_TIMEOUT;
 
 	preempt_disable();
 	WRITE_ONCE(state->mutex_owner, current);
@@ -3602,6 +3623,7 @@ static int futex_spin_on_owner(u32 __user *uaddr, const u32 vpid,
 		if (need_resched()) {
 			__set_current_state(TASK_RUNNING);
 			schedule_preempt_disabled();
+			loopcnt = 0;
 			continue;
 		}
 
@@ -3610,6 +3632,23 @@ static int futex_spin_on_owner(u32 __user *uaddr, const u32 vpid,
 			break;
 		}
 
+		/*
+		 * Enable lock handoff if the elapsed time exceed the timeout
+		 * value. We also need to set the FUTEX_WAITERS bit to make
+		 * sure that futex lock holder will initiate the handoff at
+		 * unlock time.
+		 */
+		if (!handoff_set && !(loopcnt++ & 0x7f)) {
+			if (running_clock() > handoff_time) {
+				WARN_ON(READ_ONCE(state->handoff_pid));
+				ret = futex_set_waiters_bit(uaddr, &uval);
+				if (ret)
+					break;
+				WRITE_ONCE(state->handoff_pid, vpid);
+				handoff_set = true;
+			}
+		}
+
 		if (owner_task->on_cpu) {
 			cpu_relax();
 			continue;
@@ -3652,8 +3691,10 @@ static int futex_spin_on_owner(u32 __user *uaddr, const u32 vpid,
 		 * lock stealing happen in between setting the FUTEX_WAITERS
 		 * and setting state to TASK_INTERRUPTIBLE.
 		 */
-		if (!(uval & FUTEX_OWNER_DIED) && (uval & FUTEX_WAITERS))
+		if (!(uval & FUTEX_OWNER_DIED) && (uval & FUTEX_WAITERS)) {
 			schedule_preempt_disabled();
+			loopcnt = 0;
+		}
 		__set_current_state(TASK_RUNNING);
 	}
 
@@ -3675,6 +3716,7 @@ static int futex_spin_on_owner(u32 __user *uaddr, const u32 vpid,
 	 * Cleanup futex state.
 	 */
 	WRITE_ONCE(state->mutex_owner, NULL);
+	WRITE_ONCE(state->handoff_pid, 0);
 
 	preempt_enable();
 	return ret;
@@ -3789,6 +3831,7 @@ static noinline int futex_lock(u32 __user *uaddr, unsigned int flags)
 static int futex_unlock(u32 __user *uaddr, unsigned int flags)
 {
 	u32 uval, vpid = task_pid_vnr(current);
+	u32 newpid = 0;
 	union futex_key key = FUTEX_KEY_INIT;
 	struct futex_hash_bucket *hb;
 	struct futex_state *state = NULL;
@@ -3822,6 +3865,10 @@ static int futex_unlock(u32 __user *uaddr, unsigned int flags)
 		goto out_put_key;
 	}
 
+	newpid = READ_ONCE(state->handoff_pid);
+	if (newpid)
+		WRITE_ONCE(state->handoff_pid, 0);
+
 	owner = READ_ONCE(state->mutex_owner);
 	if (owner)
 		wake_q_add(&wake_q, owner);
@@ -3829,13 +3876,13 @@ static int futex_unlock(u32 __user *uaddr, unsigned int flags)
 	spin_unlock(&hb->fs_lock);
 
 	/*
-	 * Unlock the futex.
+	 * Unlock the futex or handoff to the next owner.
 	 * The flag bits are not preserved to encourage more lock stealing.
 	 */
 	for (;;) {
 		u32 old = uval;
 
-		if (cmpxchg_futex_value(&uval, uaddr, old, 0)) {
+		if (cmpxchg_futex_value(&uval, uaddr, old, newpid)) {
 			ret = -EFAULT;
 			break;
 		}
-- 
1.8.3.1

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ