Message-Id: <218b548b69479baa005af8a7b04a3abbea8ed6fa.1392252790.git.luto@amacapital.net>
Date:	Wed, 12 Feb 2014 17:40:12 -0800
From:	Andy Lutomirski <luto@...capital.net>
To:	Peter Zijlstra <peterz@...radead.org>
Cc:	Thomas Gleixner <tglx@...utronix.de>,
	Mike Galbraith <bitbucket@...ine.de>, X86 ML <x86@...nel.org>,
	linux-kernel@...r.kernel.org, Andy Lutomirski <luto@...capital.net>
Subject: [RFC] sched: Add a new lockless wake-from-idle implementation

This is a strawman proposal to simplify the idle implementation and
eliminate a race.  (A condensed sketch of the new protocol follows the
issue list below.)

Benefits over current code:
 - ttwu_queue_remote doesn't use an IPI unless needed
 - The diffstat should speak for itself :)
 - Less racy.  Spurious IPIs are possible, but only in narrow windows or
   when two wakeups occur in rapid succession.
 - Seems to work (?)

Issues:
 - Am I doing the percpu stuff right?
 - Needs work on non-x86 architectures
 - The !CONFIG_SMP case needs to be checked
 - Is "idlepoll" a good name for the new code?  It doesn't have *that*
   much to do with the idle state.  Maybe cpukick?

If this turns out okay, TIF_NEED_RESCHED could possibly be deleted as well.
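
Here is the protocol, condensed into an illustrative sketch.  The names
match the patch; idle_poll_once() is a made-up wrapper, and the real
callers are cpu_idle_poll() and mwait_idle_with_hints() below:

	/* Waker side: any context, no locks; do not call on the current cpu. */
	void idlepoll_kick_cpu(int cpu)
	{
		/* Mark the target kicked; only IPI if it was not polling at all. */
		if (atomic_xchg(per_cpu_ptr(&idlepoll_state, cpu),
				IDLEPOLL_KICKED) == IDLEPOLL_NOT_POLLING)
			smp_send_reschedule(cpu);
	}

	/* Idle side: wraps whatever the arch uses to wait. */
	static void idle_poll_once(void)
	{
		idlepoll_begin_poll();           /* state := IDLEPOLL_POLLING */
		while (idlepoll_keep_polling())  /* false once kicked or resched needed */
			cpu_relax();             /* or monitor/mwait on idlepoll_poll_ptr() */
		idlepoll_end_poll();             /* xchg back to NOT_POLLING; if we were
						    KICKED, run __idlepoll_kicked() */
	}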

Signed-off-by: Andy Lutomirski <luto@...capital.net>
---
 arch/x86/include/asm/mwait.h       |  22 +++----
 arch/x86/include/asm/thread_info.h |   2 -
 arch/x86/kernel/apm_32.c           |  12 ----
 include/linux/idlepoll.h           |  85 ++++++++++++++++++++++++++
 include/linux/preempt.h            |   5 --
 include/linux/sched.h              | 120 -------------------------------------
 kernel/cpu/idle.c                  |  60 ++++++++++++-------
 kernel/sched/core.c                |  93 +++++++++++-----------------
 8 files changed, 167 insertions(+), 232 deletions(-)
 create mode 100644 include/linux/idlepoll.h

diff --git a/arch/x86/include/asm/mwait.h b/arch/x86/include/asm/mwait.h
index 1da25a5..8addd29 100644
--- a/arch/x86/include/asm/mwait.h
+++ b/arch/x86/include/asm/mwait.h
@@ -2,6 +2,7 @@
 #define _ASM_X86_MWAIT_H
 
 #include <linux/sched.h>
+#include <linux/idlepoll.h>
 
 #define MWAIT_SUBSTATE_MASK		0xf
 #define MWAIT_CSTATE_MASK		0xf
@@ -42,18 +43,17 @@ static inline void __mwait(unsigned long eax, unsigned long ecx)
  */
 static inline void mwait_idle_with_hints(unsigned long eax, unsigned long ecx)
 {
-	if (!current_set_polling_and_test()) {
-		if (static_cpu_has(X86_FEATURE_CLFLUSH_MONITOR)) {
-			mb();
-			clflush((void *)&current_thread_info()->flags);
-			mb();
-		}
-
-		__monitor((void *)&current_thread_info()->flags, 0, 0);
-		if (!need_resched())
-			__mwait(eax, ecx);
+	idlepoll_begin_poll();
+	if (static_cpu_has(X86_FEATURE_CLFLUSH_MONITOR)) {
+		mb();
+		clflush(idlepoll_poll_ptr());
+		mb();
 	}
-	current_clr_polling();
+
+	__monitor(idlepoll_poll_ptr(), 0, 0);
+	if (idlepoll_keep_polling())
+		__mwait(eax, ecx);
+	idlepoll_end_poll();
 }
 
 #endif /* _ASM_X86_MWAIT_H */
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index e1940c0..caa3f63 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -234,8 +234,6 @@ static inline struct thread_info *current_thread_info(void)
  * have to worry about atomic accesses.
  */
 #define TS_COMPAT		0x0002	/* 32bit syscall active (64BIT)*/
-#define TS_POLLING		0x0004	/* idle task polling need_resched,
-					   skip sending interrupt */
 #define TS_RESTORE_SIGMASK	0x0008	/* restore signal mask in do_signal() */
 
 #ifndef __ASSEMBLY__
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index 3ab0343..5848744 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -841,24 +841,12 @@ static int apm_do_idle(void)
 	u32 eax;
 	u8 ret = 0;
 	int idled = 0;
-	int polling;
 	int err = 0;
 
-	polling = !!(current_thread_info()->status & TS_POLLING);
-	if (polling) {
-		current_thread_info()->status &= ~TS_POLLING;
-		/*
-		 * TS_POLLING-cleared state must be visible before we
-		 * test NEED_RESCHED:
-		 */
-		smp_mb();
-	}
 	if (!need_resched()) {
 		idled = 1;
 		ret = apm_bios_call_simple(APM_FUNC_IDLE, 0, 0, &eax, &err);
 	}
-	if (polling)
-		current_thread_info()->status |= TS_POLLING;
 
 	if (!idled)
 		return 0;
diff --git a/include/linux/idlepoll.h b/include/linux/idlepoll.h
new file mode 100644
index 0000000..072bc7e
--- /dev/null
+++ b/include/linux/idlepoll.h
@@ -0,0 +1,85 @@
+/*
+ * idlepoll.h: definitions and inlines for lockless wake-from-idle.
+ *
+ * Copyright (c) 2014 Andy Lutomirski (luto@...capital.net)
+ */
+#ifndef __IDLEPOLL_H
+#define __IDLEPOLL_H
+
+#include <linux/percpu-defs.h>
+#include <asm/atomic.h>
+
+DECLARE_PER_CPU_ALIGNED(atomic_t, idlepoll_state);
+
+#define IDLEPOLL_NOT_POLLING	0  /* wakers must send IPI */
+#define IDLEPOLL_POLLING	1  /* ok to wake using idlepoll_state */
+#define IDLEPOLL_KICKED		2  /* __idlepoll_kicked will be called soon */
+
+void __idlepoll_kicked(void);
+
+/**
+ * idlepoll_poll_ptr - Address to use for hardware idle polling
+ *
+ * On architectures with an idle-until-an-address-is-written operation,
+ * this returns the address to use.
+ */
+static inline void *idlepoll_poll_ptr(void)
+{
+	return this_cpu_ptr(&idlepoll_state);
+}
+
+/**
+ * idlepoll_begin_poll - Mark this cpu as polling
+ *
+ * If an idle implementation polls, call this before polling.
+ */
+static inline void idlepoll_begin_poll(void)
+{
+	BUG_ON(atomic_read(this_cpu_ptr(&idlepoll_state)) == IDLEPOLL_POLLING);
+
+	/*
+	 * Mark us polling.  Note: we don't particularly care what
+	 * the previous value was.  If it was IDLEPOLL_KICKED, then
+	 * an IPI is on its way.
+	 */
+	atomic_set(this_cpu_ptr(&idlepoll_state), IDLEPOLL_POLLING);
+
+	/*
+	 * No barrier here: we assume that whoever calls us has enough
+	 * barriers to notice idlepoll_state changing.
+	 */
+}
+
+/**
+ * idlepoll_end_poll - Finish polling and handle any pending kick
+ *
+ * If an idle implementation polls, call this after polling.
+ */
+static inline void idlepoll_end_poll(void)
+{
+	/*
+	 * It's possible to be in IDLEPOLL_NOT_POLLING: an IPI
+	 * could have come in and beaten us to the punch.
+	 */
+	if (atomic_xchg(this_cpu_ptr(&idlepoll_state), IDLEPOLL_NOT_POLLING) ==
+	    IDLEPOLL_KICKED)
+		__idlepoll_kicked();
+}
+
+/**
+ * idlepoll_keep_polling - Should a polling idle implementation keep polling?
+ *
+ * If an idle implementation polls, it should check this before pausing
+ * or asking hardware to wait.
+ */
+static inline bool idlepoll_keep_polling(void)
+{
+	return atomic_read(this_cpu_ptr(&idlepoll_state)) == IDLEPOLL_POLLING
+		&& !test_preempt_need_resched();
+}
+
+#ifdef CONFIG_SMP
+void idlepoll_kick_cpu(int cpu);
+#endif
+
+#endif
diff --git a/include/linux/preempt.h b/include/linux/preempt.h
index de83b4e..04a3f39 100644
--- a/include/linux/preempt.h
+++ b/include/linux/preempt.h
@@ -138,11 +138,6 @@ do { \
 do { \
 	set_preempt_need_resched(); \
 } while (0)
-#define preempt_fold_need_resched() \
-do { \
-	if (tif_need_resched()) \
-		set_preempt_need_resched(); \
-} while (0)
 
 #ifdef CONFIG_PREEMPT_NOTIFIERS
 
diff --git a/include/linux/sched.h b/include/linux/sched.h
index a781dec..508df16 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2661,126 +2661,6 @@ static inline int spin_needbreak(spinlock_t *lock)
 #endif
 }
 
-/*
- * Idle thread specific functions to determine the need_resched
- * polling state. We have two versions, one based on TS_POLLING in
- * thread_info.status and one based on TIF_POLLING_NRFLAG in
- * thread_info.flags
- */
-#ifdef TS_POLLING
-static inline int tsk_is_polling(struct task_struct *p)
-{
-	return task_thread_info(p)->status & TS_POLLING;
-}
-static inline void __current_set_polling(void)
-{
-	current_thread_info()->status |= TS_POLLING;
-}
-
-static inline bool __must_check current_set_polling_and_test(void)
-{
-	__current_set_polling();
-
-	/*
-	 * Polling state must be visible before we test NEED_RESCHED,
-	 * paired by resched_task()
-	 */
-	smp_mb();
-
-	return unlikely(tif_need_resched());
-}
-
-static inline void __current_clr_polling(void)
-{
-	current_thread_info()->status &= ~TS_POLLING;
-}
-
-static inline bool __must_check current_clr_polling_and_test(void)
-{
-	__current_clr_polling();
-
-	/*
-	 * Polling state must be visible before we test NEED_RESCHED,
-	 * paired by resched_task()
-	 */
-	smp_mb();
-
-	return unlikely(tif_need_resched());
-}
-#elif defined(TIF_POLLING_NRFLAG)
-static inline int tsk_is_polling(struct task_struct *p)
-{
-	return test_tsk_thread_flag(p, TIF_POLLING_NRFLAG);
-}
-
-static inline void __current_set_polling(void)
-{
-	set_thread_flag(TIF_POLLING_NRFLAG);
-}
-
-static inline bool __must_check current_set_polling_and_test(void)
-{
-	__current_set_polling();
-
-	/*
-	 * Polling state must be visible before we test NEED_RESCHED,
-	 * paired by resched_task()
-	 *
-	 * XXX: assumes set/clear bit are identical barrier wise.
-	 */
-	smp_mb__after_clear_bit();
-
-	return unlikely(tif_need_resched());
-}
-
-static inline void __current_clr_polling(void)
-{
-	clear_thread_flag(TIF_POLLING_NRFLAG);
-}
-
-static inline bool __must_check current_clr_polling_and_test(void)
-{
-	__current_clr_polling();
-
-	/*
-	 * Polling state must be visible before we test NEED_RESCHED,
-	 * paired by resched_task()
-	 */
-	smp_mb__after_clear_bit();
-
-	return unlikely(tif_need_resched());
-}
-
-#else
-static inline int tsk_is_polling(struct task_struct *p) { return 0; }
-static inline void __current_set_polling(void) { }
-static inline void __current_clr_polling(void) { }
-
-static inline bool __must_check current_set_polling_and_test(void)
-{
-	return unlikely(tif_need_resched());
-}
-static inline bool __must_check current_clr_polling_and_test(void)
-{
-	return unlikely(tif_need_resched());
-}
-#endif
-
-static inline void current_clr_polling(void)
-{
-	__current_clr_polling();
-
-	/*
-	 * Ensure we check TIF_NEED_RESCHED after we clear the polling bit.
-	 * Once the bit is cleared, we'll get IPIs with every new
-	 * TIF_NEED_RESCHED and the IPI handler, scheduler_ipi(), will also
-	 * fold.
-	 */
-	smp_mb(); /* paired with resched_task() */
-
-	preempt_fold_need_resched();
-}
-
 static __always_inline bool need_resched(void)
 {
 	return unlikely(tif_need_resched());
diff --git a/kernel/cpu/idle.c b/kernel/cpu/idle.c
index 277f494..5f772f5 100644
--- a/kernel/cpu/idle.c
+++ b/kernel/cpu/idle.c
@@ -6,12 +6,14 @@
 #include <linux/tick.h>
 #include <linux/mm.h>
 #include <linux/stackprotector.h>
+#include <linux/idlepoll.h>
 
 #include <asm/tlb.h>
 
 #include <trace/events/power.h>
 
 static int __read_mostly cpu_idle_force_poll;
+DEFINE_PER_CPU(atomic_t, idlepoll_state) ____cacheline_aligned;
 
 void cpu_idle_poll_ctrl(bool enable)
 {
@@ -44,13 +46,39 @@ static inline int cpu_idle_poll(void)
 	rcu_idle_enter();
 	trace_cpu_idle_rcuidle(0, smp_processor_id());
 	local_irq_enable();
-	while (!tif_need_resched())
+
+	idlepoll_begin_poll();
+	while (idlepoll_keep_polling())
 		cpu_relax();
+	idlepoll_end_poll();
+
 	trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());
 	rcu_idle_exit();
 	return 1;
 }
 
+#ifdef CONFIG_SMP
+DEFINE_PER_CPU_ALIGNED(atomic_t, idlepoll_state);
+EXPORT_SYMBOL_GPL(idlepoll_state);
+
+/**
+ * idlepoll_kick_cpu - Cause a target cpu to call __idlepoll_kicked
+ *
+ * This implies a strong enough barrier that any writes before
+ * idlepoll_kick_cpu will be visible before __idlepoll_kicked is called.
+ *
+ * Do NOT call this on the current cpu.  Call from any context: no
+ * locks are required or taken.
+ */
+void idlepoll_kick_cpu(int cpu)
+{
+	if (atomic_xchg(per_cpu_ptr(&idlepoll_state, cpu), IDLEPOLL_KICKED) ==
+	    IDLEPOLL_NOT_POLLING)
+		smp_send_reschedule(cpu);
+}
+EXPORT_SYMBOL_GPL(idlepoll_kick_cpu);
+#endif
+
 /* Weak implementations for optional arch specific functions */
 void __weak arch_cpu_idle_prepare(void) { }
 void __weak arch_cpu_idle_enter(void) { }
@@ -65,12 +93,13 @@ void __weak arch_cpu_idle(void)
 /*
  * Generic idle loop implementation
  */
+
 static void cpu_idle_loop(void)
 {
 	while (1) {
 		tick_nohz_idle_enter();
 
-		while (!need_resched()) {
+		while (!test_preempt_need_resched()) {
 			check_pgt_cache();
 			rmb();
 
@@ -92,30 +121,16 @@ static void cpu_idle_loop(void)
 			if (cpu_idle_force_poll || tick_check_broadcast_expired()) {
 				cpu_idle_poll();
 			} else {
-				if (!current_clr_polling_and_test()) {
-					stop_critical_timings();
-					rcu_idle_enter();
-					arch_cpu_idle();
-					WARN_ON_ONCE(irqs_disabled());
-					rcu_idle_exit();
-					start_critical_timings();
-				} else {
-					local_irq_enable();
-				}
-				__current_set_polling();
+				stop_critical_timings();
+				rcu_idle_enter();
+				arch_cpu_idle();
+				WARN_ON_ONCE(irqs_disabled());
+				rcu_idle_exit();
+				start_critical_timings();
 			}
 			arch_cpu_idle_exit();
 		}
 
-		/*
-		 * Since we fell out of the loop above, we know
-		 * TIF_NEED_RESCHED must be set, propagate it into
-		 * PREEMPT_NEED_RESCHED.
-		 *
-		 * This is required because for polling idle loops we will
-		 * not have had an IPI to fold the state for us.
-		 */
-		preempt_set_need_resched();
 		tick_nohz_idle_exit();
 		schedule_preempt_disabled();
 	}
@@ -138,7 +153,6 @@ void cpu_startup_entry(enum cpuhp_state state)
 	 */
 	boot_init_stack_canary();
 #endif
-	__current_set_polling();
 	arch_cpu_idle_prepare();
 	cpu_idle_loop();
 }
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index b46131e..b771c46 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -73,6 +73,7 @@
 #include <linux/init_task.h>
 #include <linux/binfmts.h>
 #include <linux/context_tracking.h>
+#include <linux/idlepoll.h>
 
 #include <asm/switch_to.h>
 #include <asm/tlb.h>
@@ -504,6 +505,18 @@ static inline void init_hrtick(void)
 }
 #endif	/* CONFIG_SCHED_HRTICK */
 
+void resched_cpu(int cpu)
+{
+#ifdef CONFIG_SMP
+	if (cpu == smp_processor_id())
+		set_preempt_need_resched();
+	else
+		idlepoll_kick_cpu(cpu);
+#else
+	set_preempt_need_resched();
+#endif
+}
+
 /*
  * resched_task - mark a task 'to be rescheduled now'.
  *
@@ -513,36 +526,16 @@ static inline void init_hrtick(void)
  */
 void resched_task(struct task_struct *p)
 {
-	int cpu;
-
 	lockdep_assert_held(&task_rq(p)->lock);
 
+	/* XXX: this is probably unnecessary. */
 	if (test_tsk_need_resched(p))
 		return;
 
+	/* XXX: so is this. */
 	set_tsk_need_resched(p);
 
-	cpu = task_cpu(p);
-	if (cpu == smp_processor_id()) {
-		set_preempt_need_resched();
-		return;
-	}
-
-	/* NEED_RESCHED must be visible before we test polling */
-	smp_mb();
-	if (!tsk_is_polling(p))
-		smp_send_reschedule(cpu);
-}
-
-void resched_cpu(int cpu)
-{
-	struct rq *rq = cpu_rq(cpu);
-	unsigned long flags;
-
-	if (!raw_spin_trylock_irqsave(&rq->lock, flags))
-		return;
-	resched_task(cpu_curr(cpu));
-	raw_spin_unlock_irqrestore(&rq->lock, flags);
+	resched_cpu(task_cpu(p));
 }
 
 #ifdef CONFIG_SMP
@@ -586,32 +579,10 @@ unlock:
  */
 static void wake_up_idle_cpu(int cpu)
 {
-	struct rq *rq = cpu_rq(cpu);
-
 	if (cpu == smp_processor_id())
 		return;
 
-	/*
-	 * This is safe, as this function is called with the timer
-	 * wheel base lock of (cpu) held. When the CPU is on the way
-	 * to idle and has not yet set rq->curr to idle then it will
-	 * be serialized on the timer wheel base lock and take the new
-	 * timer into account automatically.
-	 */
-	if (rq->curr != rq->idle)
-		return;
-
-	/*
-	 * We can set TIF_RESCHED on the idle task of the other CPU
-	 * lockless. The worst case is that the other CPU runs the
-	 * idle task through an additional NOOP schedule()
-	 */
-	set_tsk_need_resched(rq->idle);
-
-	/* NEED_RESCHED must be visible before we test polling */
-	smp_mb();
-	if (!tsk_is_polling(rq->idle))
-		smp_send_reschedule(cpu);
+	idlepoll_kick_cpu(cpu);
 }
 
 static bool wake_up_full_nohz_cpu(int cpu)
@@ -1493,19 +1464,23 @@ static void sched_ttwu_pending(void)
 	raw_spin_unlock(&rq->lock);
 }
 
+void __idlepoll_kicked(void)
+{
+	set_preempt_need_resched();
+	sched_ttwu_pending();
+}
+EXPORT_SYMBOL_GPL(__idlepoll_kicked);
+
 void scheduler_ipi(void)
 {
-	/*
-	 * Fold TIF_NEED_RESCHED into the preempt_count; anybody setting
-	 * TIF_NEED_RESCHED remotely (for the first time) will also send
-	 * this IPI.
-	 */
-	preempt_fold_need_resched();
+	irq_enter();
 
-	if (llist_empty(&this_rq()->wake_list)
-			&& !tick_nohz_full_cpu(smp_processor_id())
-			&& !got_nohz_idle_kick())
-		return;
+	if (atomic_xchg(this_cpu_ptr(&idlepoll_state), IDLEPOLL_NOT_POLLING) ==
+	    IDLEPOLL_KICKED)
+		__idlepoll_kicked();
+
+	if (!tick_nohz_full_cpu(smp_processor_id()) && !got_nohz_idle_kick())
+		goto out;
 
 	/*
 	 * Not all reschedule IPI handlers call irq_enter/irq_exit, since
@@ -1520,9 +1495,7 @@ void scheduler_ipi(void)
 	 * however a fair share of IPIs are still resched only so this would
 	 * somewhat pessimize the simple resched case.
 	 */
-	irq_enter();
 	tick_nohz_full_check();
-	sched_ttwu_pending();
 
 	/*
 	 * Check if someone kicked us for doing the nohz idle load balance.
@@ -1531,13 +1504,15 @@ void scheduler_ipi(void)
 		this_rq()->idle_balance = 1;
 		raise_softirq_irqoff(SCHED_SOFTIRQ);
 	}
+
+out:
 	irq_exit();
 }
 
 static void ttwu_queue_remote(struct task_struct *p, int cpu)
 {
 	if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list))
-		smp_send_reschedule(cpu);
+		idlepoll_kick_cpu(cpu);
 }
 
 bool cpus_share_cache(int this_cpu, int that_cpu)
-- 
1.8.5.3
