Message-Id: <20200718021331.940659-1-joshdon@google.com>
Date: Fri, 17 Jul 2020 19:13:31 -0700
From: Josh Don <joshdon@...gle.com>
To: Thomas Gleixner <tglx@...utronix.de>,
Ingo Molnar <mingo@...hat.com>, Borislav Petkov <bp@...en8.de>
Cc: x86@...nel.org, "H . Peter Anvin" <hpa@...or.com>,
linux-pm@...r.kernel.org, linux-kernel@...r.kernel.org,
"Rafael J . Wysocki" <rjw@...ysocki.net>,
Daniel Lezcano <daniel.lezcano@...aro.org>,
Peter Zijlstra <peterz@...radead.org>,
Juri Lelli <juri.lelli@...hat.com>,
Vincent Guittot <vincent.guittot@...aro.org>,
Dietmar Eggemann <dietmar.eggemann@....com>,
Steven Rostedt <rostedt@...dmis.org>,
Ben Segall <bsegall@...gle.com>, Mel Gorman <mgorman@...e.de>,
Paul Turner <pjt@...gle.com>, Josh Don <joshdon@...gle.com>
Subject: [RFC][PATCH] x86: optimization to avoid CAL+RES IPIs
From: Venkatesh Pallipadi <venki@...gle.com>
smp_call_function_single() and smp_send_reschedule() send an unconditional
IPI to the target CPU. However, if the target CPU is in some form of
poll-based idle, we can do an IPI-less wakeup.

Doing this has certain advantages:
* Lower overhead on the async "no wait" IPI send path.
* Avoiding actual interrupts reduces system non-idle cycles.

Note that this only helps when the target CPU is idle; when it is busy
we still send an IPI as before.
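
To make the handshake concrete, here is a minimal user-space sketch of
the idea (illustrative only, not part of the patch: the pthread, the
IN_IDLE_BIT name, and the busy-poll loop stand in for the idle task,
_TIF_IN_IPILESS_IDLE, and the MONITOR/MWAIT wait used in the diff below):

/*
 * Minimal sketch of the IPI-less wakeup handshake, in user space.
 * All names here (idle_loop, try_ipiless_wakeup, IN_IDLE_BIT) are
 * stand-ins; the kernel version operates on thread_info->flags and
 * waits with MONITOR/MWAIT instead of busy-polling.
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define IN_IDLE_BIT (1UL << 0)	/* stand-in for _TIF_IN_IPILESS_IDLE */

static atomic_ulong ti_flags;	/* stand-in for the idle task's flags */

static void *idle_loop(void *arg)
{
	(void)arg;

	/* enter_ipiless_idle(): advertise that clearing the bit wakes us */
	atomic_fetch_or(&ti_flags, IN_IDLE_BIT);

	/* is_ipiless_wakeup_pending(): a cleared bit means "wake up" */
	while (atomic_load(&ti_flags) & IN_IDLE_BIT)
		;	/* cpu_relax()/__mwait() in the patch */

	printf("idle side: woken without an interrupt\n");
	return NULL;
}

/* try_ipiless_wakeup(): true if clearing the bit was enough to wake */
static bool try_ipiless_wakeup(void)
{
	unsigned long old = atomic_fetch_and(&ti_flags, ~IN_IDLE_BIT);

	return old & IN_IDLE_BIT;	/* bit was set: no IPI required */
}

int main(void)
{
	pthread_t idler;

	pthread_create(&idler, NULL, idle_loop, NULL);

	/* wait for the "idle CPU" to enter its polling state */
	while (!(atomic_load(&ti_flags) & IN_IDLE_BIT))
		;

	if (try_ipiless_wakeup())
		printf("sender side: IPI skipped\n");
	else
		printf("sender side: would send a real IPI\n");

	pthread_join(idler, NULL);
	return 0;
}

The point of the protocol is that the sender's atomic test-and-clear
doubles as the wakeup: the write to the monitored flags word is what
brings the target out of its polling/MWAIT state, so no interrupt
needs to be delivered.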
*** RFC NOTE ***
This patch breaks idle time accounting (and, to a lesser degree, softirq
accounting), because it violates the assumption that softirq can only
run either on the tail of a hard IRQ or inline on a non-idle thread via
local_bh_enable(); with this patch we can now process softirq inline
within the idle loop. These issues can be resolved in a later version
of this patch.
Signed-off-by: Josh Don <joshdon@...gle.com>
---
arch/x86/include/asm/mwait.h | 5 +-
arch/x86/include/asm/processor.h | 1 +
arch/x86/include/asm/thread_info.h | 2 +
arch/x86/kernel/apic/ipi.c | 8 +++
arch/x86/kernel/smpboot.c | 4 ++
drivers/cpuidle/poll_state.c | 5 +-
include/linux/ipiless_wake.h | 93 ++++++++++++++++++++++++++++++
kernel/sched/idle.c | 10 +++-
8 files changed, 124 insertions(+), 4 deletions(-)
create mode 100644 include/linux/ipiless_wake.h
diff --git a/arch/x86/include/asm/mwait.h b/arch/x86/include/asm/mwait.h
index e039a933aca3..aed393f38a39 100644
--- a/arch/x86/include/asm/mwait.h
+++ b/arch/x86/include/asm/mwait.h
@@ -2,6 +2,7 @@
#ifndef _ASM_X86_MWAIT_H
#define _ASM_X86_MWAIT_H
+#include <linux/ipiless_wake.h>
#include <linux/sched.h>
#include <linux/sched/idle.h>
@@ -109,6 +110,7 @@ static inline void __sti_mwait(unsigned long eax, unsigned long ecx)
static inline void mwait_idle_with_hints(unsigned long eax, unsigned long ecx)
{
if (static_cpu_has_bug(X86_BUG_MONITOR) || !current_set_polling_and_test()) {
+ enter_ipiless_idle();
if (static_cpu_has_bug(X86_BUG_CLFLUSH_MONITOR)) {
mb();
clflush((void *)&current_thread_info()->flags);
@@ -116,8 +118,9 @@ static inline void mwait_idle_with_hints(unsigned long eax, unsigned long ecx)
}
__monitor((void *)&current_thread_info()->flags, 0, 0);
- if (!need_resched())
+ if (!is_ipiless_wakeup_pending())
__mwait(eax, ecx);
+ exit_ipiless_idle();
}
current_clr_polling();
}
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 03b7c4ca425a..045fc9bbd095 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -568,6 +568,7 @@ static inline void arch_thread_struct_whitelist(unsigned long *offset,
* have to worry about atomic accesses.
*/
#define TS_COMPAT 0x0002 /* 32bit syscall active (64BIT)*/
+#define TS_IPILESS_WAKEUP 0x0010 /* pending IPI-work on idle exit */
static inline void
native_load_sp0(unsigned long sp0)
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index 8de8ceccb8bc..b6d3fa3c1578 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -98,6 +98,7 @@ struct thread_info {
#define TIF_IO_BITMAP 22 /* uses I/O bitmap */
#define TIF_FORCED_TF 24 /* true if TF in eflags artificially */
#define TIF_BLOCKSTEP 25 /* set when we want DEBUGCTLMSR_BTF */
+#define TIF_IN_IPILESS_IDLE 26 /* task in IPIless idle state */
#define TIF_LAZY_MMU_UPDATES 27 /* task is updating the mmu lazily */
#define TIF_SYSCALL_TRACEPOINT 28 /* syscall tracepoint instrumentation */
#define TIF_ADDR32 29 /* 32-bit address space on 64 bits */
@@ -127,6 +128,7 @@ struct thread_info {
#define _TIF_IO_BITMAP (1 << TIF_IO_BITMAP)
#define _TIF_FORCED_TF (1 << TIF_FORCED_TF)
#define _TIF_BLOCKSTEP (1 << TIF_BLOCKSTEP)
+#define _TIF_IN_IPILESS_IDLE (1 << TIF_IN_IPILESS_IDLE)
#define _TIF_LAZY_MMU_UPDATES (1 << TIF_LAZY_MMU_UPDATES)
#define _TIF_SYSCALL_TRACEPOINT (1 << TIF_SYSCALL_TRACEPOINT)
#define _TIF_ADDR32 (1 << TIF_ADDR32)
diff --git a/arch/x86/kernel/apic/ipi.c b/arch/x86/kernel/apic/ipi.c
index 6ca0f91372fd..6739aea98aee 100644
--- a/arch/x86/kernel/apic/ipi.c
+++ b/arch/x86/kernel/apic/ipi.c
@@ -1,6 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/cpumask.h>
+#include <linux/ipiless_wake.h>
#include <linux/smp.h>
#include "local.h"
@@ -67,11 +68,18 @@ void native_smp_send_reschedule(int cpu)
WARN(1, "sched: Unexpected reschedule of offline CPU#%d!\n", cpu);
return;
}
+
+ if (try_ipiless_wakeup(cpu))
+ return;
+
apic->send_IPI(cpu, RESCHEDULE_VECTOR);
}
void native_send_call_func_single_ipi(int cpu)
{
+ if (try_ipiless_wakeup(cpu))
+ return;
+
apic->send_IPI(cpu, CALL_FUNCTION_SINGLE_VECTOR);
}
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index ffbd9a3d78d8..3e681f0359f7 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -105,6 +105,8 @@ EXPORT_SYMBOL(__max_logical_packages);
static unsigned int logical_packages __read_mostly;
static unsigned int logical_die __read_mostly;
+DEFINE_PER_CPU(unsigned long *, idletask_ti_flags);
+
/* Maximum number of SMT threads on any online core */
int __read_mostly __max_smt_threads = 1;
@@ -1042,6 +1044,7 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle,
unsigned long timeout;
idle->thread.sp = (unsigned long)task_pt_regs(idle);
+ per_cpu(idletask_ti_flags, cpu) = &task_thread_info(idle)->flags;
early_gdt_descr.address = (unsigned long)get_cpu_gdt_rw(cpu);
initial_code = (unsigned long)start_secondary;
initial_stack = idle->thread.sp;
@@ -1405,6 +1408,7 @@ void __init native_smp_prepare_boot_cpu(void)
cpumask_set_cpu(me, cpu_callout_mask);
cpu_set_state_online(me);
native_pv_lock_init();
+ per_cpu(idletask_ti_flags, me) = &task_thread_info(current)->flags;
}
void __init calculate_max_logical_packages(void)
diff --git a/drivers/cpuidle/poll_state.c b/drivers/cpuidle/poll_state.c
index f7e83613ae94..e48cfa8fb15f 100644
--- a/drivers/cpuidle/poll_state.c
+++ b/drivers/cpuidle/poll_state.c
@@ -4,6 +4,7 @@
*/
#include <linux/cpuidle.h>
+#include <linux/ipiless_wake.h>
#include <linux/sched.h>
#include <linux/sched/clock.h>
#include <linux/sched/idle.h>
@@ -24,7 +25,8 @@ static int __cpuidle poll_idle(struct cpuidle_device *dev,
limit = cpuidle_poll_time(drv, dev);
- while (!need_resched()) {
+ enter_ipiless_idle();
+ while (!is_ipiless_wakeup_pending()) {
cpu_relax();
if (loop_count++ < POLL_IDLE_RELAX_COUNT)
continue;
@@ -35,6 +37,7 @@ static int __cpuidle poll_idle(struct cpuidle_device *dev,
break;
}
}
+ exit_ipiless_idle();
}
current_clr_polling();
diff --git a/include/linux/ipiless_wake.h b/include/linux/ipiless_wake.h
new file mode 100644
index 000000000000..3854845a25a0
--- /dev/null
+++ b/include/linux/ipiless_wake.h
@@ -0,0 +1,93 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_IPILESS_WAKE_H
+#define _LINUX_IPILESS_WAKE_H
+
+#include <linux/hardirq.h>
+#include <linux/sched.h>
+#include <linux/thread_info.h>
+
+#if defined(CONFIG_SMP) && defined(TIF_IN_IPILESS_IDLE)
+
+DECLARE_PER_CPU(unsigned long *, idletask_ti_flags);
+/*
+ * TIF_IN_IPILESS_IDLE indicates that the CPU is in an idle state with
+ * IPI-less wakeup capability and has no pending IPIs.
+ * It is conditionally cleared by an IPI source CPU, and that clear
+ * automatically brings the target CPU out of its idle state.
+ *
+ * TS_IPILESS_WAKEUP is only changed by the local CPU and records that
+ * there is pending IPI work to run after the idle exit completes.
+ */
+
+static inline void enter_ipiless_idle(void)
+{
+ set_thread_flag(TIF_IN_IPILESS_IDLE);
+}
+
+static inline void exit_ipiless_idle(void)
+{
+ if (!test_and_clear_thread_flag(TIF_IN_IPILESS_IDLE)) {
+ /*
+ * Flag was already cleared, indicating that there is
+ * a pending IPIless wakeup.
+ * Save that info in status for later use.
+ */
+ current_thread_info()->status |= TS_IPILESS_WAKEUP;
+ }
+}
+
+static inline int is_ipiless_wakeup_pending(void)
+{
+ return need_resched() ||
+ unlikely(!test_thread_flag(TIF_IN_IPILESS_IDLE));
+}
+
+static inline void do_ipiless_pending_work(void)
+{
+ if (unlikely(current_thread_info()->status & TS_IPILESS_WAKEUP)) {
+ current_thread_info()->status &= ~TS_IPILESS_WAKEUP;
+
+ local_bh_disable();
+ local_irq_disable();
+
+ /*
+ * Note: we must be in some form of idle, so no need to perform
+ * a kvm_set_cpu_l1tf_flush_l1d().
+ */
+
+ /* CALL_FUNCTION_SINGLE_VECTOR */
+ irq_enter();
+ generic_smp_call_function_single_interrupt();
+ irq_exit();
+
+ /* RESCHEDULE_VECTOR */
+ scheduler_ipi();
+
+ local_irq_enable();
+ local_bh_enable();
+ }
+}
+
+static inline int try_ipiless_wakeup(int cpu)
+{
+ unsigned long *ti_flags = per_cpu(idletask_ti_flags, cpu);
+
+ if (!(*ti_flags & _TIF_IN_IPILESS_IDLE))
+ return 0;
+
+ return test_and_clear_bit(TIF_IN_IPILESS_IDLE,
+ (unsigned long *)ti_flags);
+}
+
+#else
+static inline void do_ipiless_pending_work(void) { }
+static inline void enter_ipiless_idle(void) { }
+static inline void exit_ipiless_idle(void) { }
+
+static inline int is_ipiless_wakeup_pending(void)
+{
+ return need_resched();
+}
+#endif
+
+#endif /* _LINUX_IPILESS_WAKE_H */
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 1ae95b9150d3..8897721816d5 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -8,6 +8,8 @@
*/
#include "sched.h"
+#include <linux/ipiless_wake.h>
+
#include <trace/events/power.h>
/* Linker adds these: start and end of __cpuidle functions */
@@ -58,10 +60,12 @@ static noinline int __cpuidle cpu_idle_poll(void)
trace_cpu_idle_rcuidle(0, smp_processor_id());
local_irq_enable();
stop_critical_timings();
-
- while (!tif_need_resched() &&
+ /* caller will process ipiless work */
+ enter_ipiless_idle();
+ while (!is_ipiless_wakeup_pending() &&
(cpu_idle_force_poll || tick_check_broadcast_expired()))
cpu_relax();
+ exit_ipiless_idle();
start_critical_timings();
trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());
rcu_idle_exit();
@@ -276,6 +280,8 @@ static void do_idle(void)
cpuidle_idle_call();
}
arch_cpu_idle_exit();
+
+ do_ipiless_pending_work();
}
/*
--
2.28.0.rc0.105.gf9edc3c819-goog