Message-Id: <1330122800-834-3-git-send-email-venki@google.com>
Date:	Fri, 24 Feb 2012 14:33:18 -0800
From:	Venkatesh Pallipadi <venki@...gle.com>
To:	Peter Zijlstra <peterz@...radead.org>,
	Thomas Gleixner <tglx@...utronix.de>,
	Ingo Molnar <mingo@...hat.com>,
	"H. Peter Anvin" <hpa@...or.com>
Cc:	Suresh Siddha <suresh.b.siddha@...el.com>,
	Aaron Durbin <adurbin@...gle.com>,
	Paul Turner <pjt@...gle.com>,
	Yong Zhang <yong.zhang0@...il.com>,
	Andi Kleen <andi@...stfloor.org>, linux-kernel@...r.kernel.org,
	Venkatesh Pallipadi <venki@...gle.com>
Subject: [PATCH 2/4] x86: Mwait idle optimization to avoid CAL+RES IPIs -v2

smp_call_function_single() and ttwu_queue_remote() send an unconditional IPI
to the target CPU. However, if the target CPU is in some form of poll-based
idle, we can do an IPI-less wakeup instead; a sketch of how the wakeup works
follows the list of advantages below.
Doing this has certain advantages:
* Lower overhead on the async IPI send path. Measurements on Westmere-based
  systems show savings on "no wait" smp_call_function_single() with an idle
  target CPU (as measured on the sender side):
  local socket smp_call_func cost goes from ~1600 to ~1100 cycles
  remote socket smp_call_func cost goes from ~2000 to ~1800 cycles
* Avoiding the actual interrupts shows a measurable reduction (10%) in
  system non-idle cycles and cache references with a micro-benchmark that
  sends IPIs from one CPU to all the other, mostly idle, CPUs in the system.
* On a mostly idle system, turbostat shows a tiny decrease in C0 (active)
  time and a corresponding increase in C6 time (each row is a 10-minute
  average):
          %c0   %c1   %c6
  Before
  Run 1  1.51  2.93 95.55
  Run 2  1.48  2.86 95.65
  Run 3  1.46  2.78 95.74
  After
  Run 1  1.35  2.63 96.00
  Run 2  1.46  2.78 95.74
  Run 3  1.37  2.63 95.98
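
The wakeup works because mwait is armed on the idle task's
thread_info->flags word: an atomic clear of TIF_IN_IPILESS_IDLE by the
sender dirties the monitored cacheline and brings the target out of mwait
without an interrupt. A minimal sketch of the sender-side fast path,
mirroring try_ipiless_wakeup() from the patch below:

	static int try_ipiless_wakeup(int cpu)
	{
		struct thread_info *idle_ti = task_thread_info(idle_task(cpu));

		/* Target is not in IPI-less idle: fall back to a real IPI. */
		if (!(idle_ti->flags & _TIF_IN_IPILESS_IDLE))
			return 0;

		/*
		 * Atomically steal the flag. The write to the flags word is
		 * the wakeup itself; a zero return means another sender raced
		 * us and already woke the target, so the caller still IPIs.
		 */
		return test_and_clear_bit(TIF_IN_IPILESS_IDLE,
					(unsigned long *)&idle_ti->flags);
	}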

We started looking at this with one of our workloads, where the system is
partially busy, after noticing kernel hotspots in find_next_bit and
default_send_IPI_mask_sequence_phys coming from sched wakeups (futex
wakeups) and networking call functions.

Thanks to Suresh for suggesting TIF flags instead of a new percpu state
variable with complicated update logic.

Notes:
* This only helps when the target CPU is idle. When it is busy, we still
  send an IPI as before.
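
For reference, the target-side ordering in mwait_idle_with_hints() (as
modified by the patch below) is roughly:

	enter_ipiless_idle();		/* set TIF_IN_IPILESS_IDLE */
	if (!is_ipiless_wakeup_pending()) {
		__monitor((void *)&current_thread_info()->flags, 0, 0);
		smp_mb();
		if (!is_ipiless_wakeup_pending())
			__mwait(ax, cx);
	}
	exit_ipiless_idle();	/* flag already stolen => TS_IPILESS_WAKEUP */

The idle loop then calls do_ipiless_pending_work(), which runs the
call-function and scheduler-IPI handlers that would otherwise have run
from the interrupt.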

Signed-off-by: Venkatesh Pallipadi <venki@...gle.com>
---
 arch/x86/include/asm/ipiless_wake.h |   84 +++++++++++++++++++++++++++++++++++
 arch/x86/include/asm/thread_info.h  |    3 +
 arch/x86/kernel/acpi/cstate.c       |    7 ++-
 arch/x86/kernel/process_32.c        |    2 +
 arch/x86/kernel/process_64.c        |    2 +
 arch/x86/kernel/smp.c               |    8 +++
 6 files changed, 104 insertions(+), 2 deletions(-)
 create mode 100644 arch/x86/include/asm/ipiless_wake.h

diff --git a/arch/x86/include/asm/ipiless_wake.h b/arch/x86/include/asm/ipiless_wake.h
new file mode 100644
index 0000000..a490dd3
--- /dev/null
+++ b/arch/x86/include/asm/ipiless_wake.h
@@ -0,0 +1,84 @@
+#ifndef _ASM_X86_IPILESS_WAKE_H
+#define _ASM_X86_IPILESS_WAKE_H
+
+#include <linux/hardirq.h>
+#include <linux/sched.h>
+#include <asm/thread_info.h>
+
+#ifdef CONFIG_SMP
+
+/*
+ * TIF_IN_IPILESS_IDLE: set while the CPU is in an idle state with IPI-less
+ * wakeup capability and has no pending IPIs.
+ * It is conditionally cleared by an IPI source CPU, and that clear
+ * automatically brings the target CPU out of its idle state.
+ *
+ * TS_IPILESS_WAKEUP is only changed by the local CPU and records that
+ * there is pending IPI work to do after the complete idle exit.
+ */
+
+static inline void enter_ipiless_idle(void)
+{
+	set_thread_flag(TIF_IN_IPILESS_IDLE);
+}
+
+static inline void exit_ipiless_idle(void)
+{
+	if (!test_and_clear_thread_flag(TIF_IN_IPILESS_IDLE)) {
+		/*
+		 * Flag was already cleared, indicating that there is
+		 * a pending IPIless wakeup.
+		 * Save that info in status for later use.
+		 */
+		current_thread_info()->status |= TS_IPILESS_WAKEUP;
+	}
+}
+
+static inline int is_ipiless_wakeup_pending(void)
+{
+	return need_resched() ||
+		unlikely(!test_thread_flag(TIF_IN_IPILESS_IDLE));
+}
+
+static inline void do_ipiless_pending_work(void)
+{
+	if (unlikely(current_thread_info()->status & TS_IPILESS_WAKEUP)) {
+		current_thread_info()->status &= ~TS_IPILESS_WAKEUP;
+
+		local_bh_disable();
+		local_irq_disable();
+
+		irq_enter();
+		generic_smp_call_function_single_interrupt();
+		irq_exit();
+
+		scheduler_ipi(); /* Does its own irq enter/exit */
+
+		local_irq_enable();
+		local_bh_enable(); /* Needed for bh handling */
+	}
+}
+
+static inline int try_ipiless_wakeup(int cpu)
+{
+	struct thread_info *idle_ti = task_thread_info(idle_task(cpu));
+
+	if (!(idle_ti->flags & _TIF_IN_IPILESS_IDLE))
+		return 0;
+
+	return test_and_clear_bit(TIF_IN_IPILESS_IDLE,
+					(unsigned long *)&idle_ti->flags);
+}
+
+#else
+static inline void do_ipiless_pending_work(void) { }
+static inline void enter_ipiless_idle(void) { }
+static inline void exit_ipiless_idle(void) { }
+
+static inline int is_ipiless_wakeup_pending(void)
+{
+	return need_resched();
+}
+#endif
+
+#endif /* _ASM_X86_IPILESS_WAKE_H */
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index a4d3888..3c5ae3b 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -96,6 +96,7 @@ struct thread_info {
 #define TIF_LAZY_MMU_UPDATES	27	/* task is updating the mmu lazily */
 #define TIF_SYSCALL_TRACEPOINT	28	/* syscall tracepoint instrumentation */
 #define TIF_POLLING_NRFLAG	29	/* true if poll_idle() is polling TIF_NEED_RESCHED */
+#define TIF_IN_IPILESS_IDLE	30	/* Task in IPIless idle state */
 
 #define _TIF_SYSCALL_TRACE	(1 << TIF_SYSCALL_TRACE)
 #define _TIF_NOTIFY_RESUME	(1 << TIF_NOTIFY_RESUME)
@@ -118,6 +119,7 @@ struct thread_info {
 #define _TIF_LAZY_MMU_UPDATES	(1 << TIF_LAZY_MMU_UPDATES)
 #define _TIF_SYSCALL_TRACEPOINT	(1 << TIF_SYSCALL_TRACEPOINT)
 #define _TIF_POLLING_NRFLAG	(1 << TIF_POLLING_NRFLAG)
+#define _TIF_IN_IPILESS_IDLE	(1 << TIF_IN_IPILESS_IDLE)
 
 /* work to do in syscall_trace_enter() */
 #define _TIF_WORK_SYSCALL_ENTRY	\
@@ -253,6 +255,7 @@ static inline struct thread_info *current_thread_info(void)
 					   this quantum (SMP) */
 #define TS_COMPAT		0x0002	/* 32bit syscall active (64BIT)*/
 #define TS_RESTORE_SIGMASK	0x0004	/* restore signal mask in do_signal() */
+#define TS_IPILESS_WAKEUP	0x0008	/* pending IPI-work on idle exit */
 
 #ifndef __ASSEMBLY__
 #define HAVE_SET_RESTORE_SIGMASK	1
diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c
index f50e7fb..30ab435 100644
--- a/arch/x86/kernel/acpi/cstate.c
+++ b/arch/x86/kernel/acpi/cstate.c
@@ -12,6 +12,7 @@
 #include <linux/sched.h>
 
 #include <acpi/processor.h>
+#include <asm/ipiless_wake.h>
 #include <asm/acpi.h>
 #include <asm/mwait.h>
 
@@ -161,15 +162,17 @@ EXPORT_SYMBOL_GPL(acpi_processor_ffh_cstate_probe);
  */
 void mwait_idle_with_hints(unsigned long ax, unsigned long cx)
 {
-	if (!need_resched()) {
+	enter_ipiless_idle();
+	if (!is_ipiless_wakeup_pending()) {
 		if (this_cpu_has(X86_FEATURE_CLFLUSH_MONITOR))
 			clflush((void *)&current_thread_info()->flags);
 
 		__monitor((void *)&current_thread_info()->flags, 0, 0);
 		smp_mb();
-		if (!need_resched())
+		if (!is_ipiless_wakeup_pending())
 			__mwait(ax, cx);
 	}
+	exit_ipiless_idle();
 }
 
 void acpi_processor_ffh_cstate_enter(struct acpi_processor_cx *cx)
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 5de6bb1..014e26d 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -44,6 +44,7 @@
 #include <asm/system.h>
 #include <asm/ldt.h>
 #include <asm/processor.h>
+#include <asm/ipiless_wake.h>
 #include <asm/i387.h>
 #include <asm/desc.h>
 #ifdef CONFIG_MATH_EMULATION
@@ -116,6 +117,7 @@ void cpu_idle(void)
 			if (cpuidle_idle_call())
 				pm_idle();
 			start_critical_timings();
+			do_ipiless_pending_work();
 		}
 		rcu_idle_exit();
 		tick_nohz_idle_exit();
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 98b1854..777bb7d 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -42,6 +42,7 @@
 #include <asm/pgtable.h>
 #include <asm/system.h>
 #include <asm/processor.h>
+#include <asm/ipiless_wake.h>
 #include <asm/i387.h>
 #include <asm/mmu_context.h>
 #include <asm/prctl.h>
@@ -148,6 +149,7 @@ void cpu_idle(void)
 
 			rcu_idle_exit();
 			start_critical_timings();
+			do_ipiless_pending_work();
 
 			/* In many cases the interrupt that ended idle
 			   has already called exit_idle. But some idle
diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
index 66c74f4..4b44bef 100644
--- a/arch/x86/kernel/smp.c
+++ b/arch/x86/kernel/smp.c
@@ -27,6 +27,7 @@
 #include <asm/mtrr.h>
 #include <asm/tlbflush.h>
 #include <asm/mmu_context.h>
+#include <asm/ipiless_wake.h>
 #include <asm/proto.h>
 #include <asm/apic.h>
 #include <asm/nmi.h>
@@ -120,11 +121,18 @@ static void native_smp_send_reschedule(int cpu)
 		WARN_ON(1);
 		return;
 	}
+
+	if (try_ipiless_wakeup(cpu))
+		return;
+
 	apic->send_IPI_mask(cpumask_of(cpu), RESCHEDULE_VECTOR);
 }
 
 void native_send_call_func_single_ipi(int cpu)
 {
+	if (try_ipiless_wakeup(cpu))
+		return;
+
 	apic->send_IPI_mask(cpumask_of(cpu), CALL_FUNCTION_SINGLE_VECTOR);
 }
 
-- 
1.7.7.3
