Message-ID: <20260124171546.43398-4-qq570070308@gmail.com>
Date: Sun, 25 Jan 2026 01:15:46 +0800
From: Xie Yuanbin <qq570070308@...il.com>
To: peterz@...radead.org,
tglx@...nel.org,
riel@...riel.com,
segher@...nel.crashing.org,
david@...nel.org,
hpa@...or.com,
arnd@...db.de,
mingo@...hat.com,
juri.lelli@...hat.com,
vincent.guittot@...aro.org,
dietmar.eggemann@....com,
rostedt@...dmis.org,
bsegall@...gle.com,
mgorman@...e.de,
vschneid@...hat.com,
bp@...en8.de,
dave.hansen@...ux.intel.com,
luto@...nel.org,
houwenlong.hwl@...group.com
Cc: linux-kernel@...r.kernel.org,
x86@...nel.org,
Xie Yuanbin <qq570070308@...il.com>
Subject: [PATCH v6 3/3] sched/core: Make finish_task_switch() and its subfunctions always inline

finish_task_switch() is not inlined even at the -O2 optimization level.
Performance testing indicates that this can lead to a significant
performance degradation when certain Spectre vulnerability mitigations
are enabled.

In switch_mm_irqs_off(), some mitigations may clear the branch
prediction history or the instruction cache, such as
arm64_apply_bp_hardening() on arm64, BPIALL/ICIALLU on arm, and
indirect_branch_prediction_barrier() on x86. Because
finish_task_switch() runs right after switch_mm_irqs_off(), its
performance is heavily affected by the cost of the function calls and
branch jumps, which can no longer be predicted after the flush.

__schedule() has the __sched attribute, which places it in the
'.sched.text' section, while finish_task_switch() does not. This leaves
them far apart from each other in vmlinux, which aggravates the
performance degradation.
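
As a side note, the section-placement effect can be seen with a plain
user-space C sketch (illustrative function names and section name only,
not kernel code): the linker keeps differently named text sections
apart, so the two functions below can land at distant addresses, much
like a __sched function in '.sched.text' vs. a plain '.text' function.

/* Hypothetical user-space sketch (not kernel code). */
#include <stdio.h>

__attribute__((noinline, section(".demo.far.text")))
static void far_func(void)
{
}

__attribute__((noinline))
static void near_func(void)
{
}

int main(void)
{
        /*
         * The two functions are kept in separate text sections, so the
         * linker may place them far apart in the final image.
         */
        printf("far_func:  %p\n", (void *)far_func);
        printf("near_func: %p\n", (void *)near_func);
        return 0;
}

Keeping finish_task_switch() inlined into __schedule() avoids both the
call overhead and this cross-section distance.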

Make finish_task_switch() and its subfunctions always inline to improve
performance.
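
In the kernel, 'inline' is only an optimization hint, whereas
__always_inline expands (roughly) to 'inline' plus the always_inline
function attribute and forces inlining. A minimal user-space sketch of
the difference, with made-up function names:

#include <stdio.h>

/* Illustrative only: 'inline' is a hint, always_inline is a demand. */
static inline int add_hint(int a, int b)
{
        return a + b;   /* may still be emitted as an out-of-line call */
}

static inline __attribute__((__always_inline__)) int add_forced(int a, int b)
{
        return a + b;   /* always expanded at the call site */
}

int main(void)
{
        printf("%d\n", add_hint(1, 2) + add_forced(3, 4));
        return 0;
}

The patch applies the same attribute to finish_task_switch() and the
helpers it calls so that the compiler cannot leave them as out-of-line
calls.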

Performance test data - time spent calling finish_task_switch():
1. x86-64: Intel i5-8300h@...z, DDR4@...6mhz; unit: x86 TSC cycles
| test scenario | old | new | delta |
| gcc 15.2 | 27.50 | 25.45 | -2.05 ( -7.5%) |
| gcc 15.2 + spectre_v2_user=on | 46.75 | 25.96 | -20.79 (-44.5%) |
| clang 21.1.7 | 27.25 | 25.45 | -1.80 ( -6.6%) |
| clang 21.1.7 + spectre_v2_user=on | 39.50 | 26.00 | -13.50 (-34.2%) |
2. x86-64: AMD 9600x@...5Ghz, DDR5@...0mhz; unit: x86 TSC cycles
| test scenario | old | new | delta |
| gcc 15.2 | 27.51 | 27.51 | 0 ( 0%) |
| gcc 15.2 + spectre_v2_user=on | 105.21 | 67.89 | -37.32 (-35.5%) |
| clang 21.1.7 | 27.51 | 27.51 | 0 ( 0%) |
| clang 21.1.7 + spectre_v2_user=on | 104.15 | 67.52 | -36.63 (-35.2%) |
3. arm64: Raspberry Pi 3b Rev 1.2, Cortex-A53@...Ghz; unit: cntvct_el0 cycles
| test scenario | old | new | delta |
| gcc 15.2 | 1.453 | 1.115 | -0.338 (-23.3%) |
| clang 21.1.7 | 1.532 | 1.123 | -0.409 (-26.7%) |
4. arm32: Raspberry Pi 3b Rev 1.2, Cortex-A53@...Ghz; unit: cntvct_el0 cycles
| test scenario | old | new | delta |
| gcc 15.2 | 1.421 | 1.187 | -0.234 (-16.5%) |
| clang 21.1.7 | 1.437 | 1.200 | -0.237 (-16.5%) |
Signed-off-by: Xie Yuanbin <qq570070308@...il.com>
Cc: Thomas Gleixner <tglx@...nel.org>
Cc: Rik van Riel <riel@...riel.com>
Cc: Segher Boessenkool <segher@...nel.crashing.org>
Cc: David Hildenbrand (Red Hat) <david@...nel.org>
Cc: Peter Zijlstra <peterz@...radead.org>
Cc: H. Peter Anvin (Intel) <hpa@...or.com>
Cc: Arnd Bergmann <arnd@...db.de>
---
More detailed information about the test can be found in the cover letter:
Link: https://lore.kernel.org/20260124171546.43398-1-qq570070308@gmail.com
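
The numbers in the tables above are raw cycle-counter deltas (TSC on
x86, cntvct_el0 on arm/arm64); the real probes sit inside the scheduler
as described in the cover letter. Purely as a rough user-space sketch of
this kind of measurement, assuming GCC/Clang's __rdtsc() intrinsic on
x86:

/* Rough sketch only; the real probe sits around finish_task_switch(). */
#include <stdio.h>
#include <x86intrin.h>

__attribute__((noinline))
static void region_under_test(void)
{
        /* stand-in for the code being timed */
}

int main(void)
{
        unsigned long long t0, t1;

        t0 = __rdtsc();                 /* read the time-stamp counter */
        region_under_test();
        t1 = __rdtsc();

        printf("cycles: %llu\n", t1 - t0);
        return 0;
}
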
arch/arm/include/asm/mmu_context.h | 2 +-
arch/riscv/include/asm/sync_core.h | 2 +-
arch/s390/include/asm/mmu_context.h | 2 +-
arch/sparc/include/asm/mmu_context_64.h | 2 +-
arch/x86/include/asm/sync_core.h | 2 +-
include/linux/perf_event.h | 2 +-
include/linux/sched/mm.h | 10 +++++-----
include/linux/tick.h | 4 ++--
include/linux/vtime.h | 8 ++++----
kernel/sched/core.c | 12 ++++++------
kernel/sched/sched.h | 24 ++++++++++++------------
11 files changed, 35 insertions(+), 35 deletions(-)
diff --git a/arch/arm/include/asm/mmu_context.h b/arch/arm/include/asm/mmu_context.h
index db2cb06aa8cf..bebde469f81a 100644
--- a/arch/arm/include/asm/mmu_context.h
+++ b/arch/arm/include/asm/mmu_context.h
@@ -80,7 +80,7 @@ static inline void check_and_switch_context(struct mm_struct *mm,
#ifndef MODULE
#define finish_arch_post_lock_switch \
finish_arch_post_lock_switch
-static inline void finish_arch_post_lock_switch(void)
+static __always_inline void finish_arch_post_lock_switch(void)
{
struct mm_struct *mm = current->mm;
diff --git a/arch/riscv/include/asm/sync_core.h b/arch/riscv/include/asm/sync_core.h
index 9153016da8f1..2fe6b7fe6b12 100644
--- a/arch/riscv/include/asm/sync_core.h
+++ b/arch/riscv/include/asm/sync_core.h
@@ -6,7 +6,7 @@
* RISC-V implements return to user-space through an xRET instruction,
* which is not core serializing.
*/
-static inline void sync_core_before_usermode(void)
+static __always_inline void sync_core_before_usermode(void)
{
asm volatile ("fence.i" ::: "memory");
}
diff --git a/arch/s390/include/asm/mmu_context.h b/arch/s390/include/asm/mmu_context.h
index d9b8501bc93d..c124ef6a01b3 100644
--- a/arch/s390/include/asm/mmu_context.h
+++ b/arch/s390/include/asm/mmu_context.h
@@ -97,7 +97,7 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
}
#define finish_arch_post_lock_switch finish_arch_post_lock_switch
-static inline void finish_arch_post_lock_switch(void)
+static __always_inline void finish_arch_post_lock_switch(void)
{
struct task_struct *tsk = current;
struct mm_struct *mm = tsk->mm;
diff --git a/arch/sparc/include/asm/mmu_context_64.h b/arch/sparc/include/asm/mmu_context_64.h
index 78bbacc14d2d..d1967214ef25 100644
--- a/arch/sparc/include/asm/mmu_context_64.h
+++ b/arch/sparc/include/asm/mmu_context_64.h
@@ -160,7 +160,7 @@ static inline void arch_start_context_switch(struct task_struct *prev)
}
#define finish_arch_post_lock_switch finish_arch_post_lock_switch
-static inline void finish_arch_post_lock_switch(void)
+static __always_inline void finish_arch_post_lock_switch(void)
{
/* Restore the state of MCDPER register for the new process
* just switched to.
diff --git a/arch/x86/include/asm/sync_core.h b/arch/x86/include/asm/sync_core.h
index 96bda43538ee..4b55fa353bb5 100644
--- a/arch/x86/include/asm/sync_core.h
+++ b/arch/x86/include/asm/sync_core.h
@@ -93,7 +93,7 @@ static __always_inline void sync_core(void)
* to user-mode. x86 implements return to user-space through sysexit,
* sysrel, and sysretq, which are not core serializing.
*/
-static inline void sync_core_before_usermode(void)
+static __always_inline void sync_core_before_usermode(void)
{
/* With PTI, we unconditionally serialize before running user code. */
if (static_cpu_has(X86_FEATURE_PTI))
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 48d851fbd8ea..7c1dac8da5e5 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -1632,7 +1632,7 @@ static inline void perf_event_task_migrate(struct task_struct *task)
task->sched_migrated = 1;
}
-static inline void perf_event_task_sched_in(struct task_struct *prev,
+static __always_inline void perf_event_task_sched_in(struct task_struct *prev,
struct task_struct *task)
{
if (static_branch_unlikely(&perf_sched_events))
diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
index 95d0040df584..4a279ee2d026 100644
--- a/include/linux/sched/mm.h
+++ b/include/linux/sched/mm.h
@@ -44,7 +44,7 @@ static inline void smp_mb__after_mmgrab(void)
extern void __mmdrop(struct mm_struct *mm);
-static inline void mmdrop(struct mm_struct *mm)
+static __always_inline void mmdrop(struct mm_struct *mm)
{
/*
* The implicit full barrier implied by atomic_dec_and_test() is
@@ -71,14 +71,14 @@ static inline void __mmdrop_delayed(struct rcu_head *rhp)
* Invoked from finish_task_switch(). Delegates the heavy lifting on RT
* kernels via RCU.
*/
-static inline void mmdrop_sched(struct mm_struct *mm)
+static __always_inline void mmdrop_sched(struct mm_struct *mm)
{
/* Provides a full memory barrier. See mmdrop() */
if (atomic_dec_and_test(&mm->mm_count))
call_rcu(&mm->delayed_drop, __mmdrop_delayed);
}
#else
-static inline void mmdrop_sched(struct mm_struct *mm)
+static __always_inline void mmdrop_sched(struct mm_struct *mm)
{
mmdrop(mm);
}
@@ -104,7 +104,7 @@ static inline void mmdrop_lazy_tlb(struct mm_struct *mm)
}
}
-static inline void mmdrop_lazy_tlb_sched(struct mm_struct *mm)
+static __always_inline void mmdrop_lazy_tlb_sched(struct mm_struct *mm)
{
if (IS_ENABLED(CONFIG_MMU_LAZY_TLB_REFCOUNT))
mmdrop_sched(mm);
@@ -532,7 +532,7 @@ enum {
#include <asm/membarrier.h>
#endif
-static inline void membarrier_mm_sync_core_before_usermode(struct mm_struct *mm)
+static __always_inline void membarrier_mm_sync_core_before_usermode(struct mm_struct *mm)
{
/*
* The atomic_read() below prevents CSE. The following should
diff --git a/include/linux/tick.h b/include/linux/tick.h
index ac76ae9fa36d..fce16aa10ba2 100644
--- a/include/linux/tick.h
+++ b/include/linux/tick.h
@@ -175,7 +175,7 @@ extern cpumask_var_t tick_nohz_full_mask;
#ifdef CONFIG_NO_HZ_FULL
extern bool tick_nohz_full_running;
-static inline bool tick_nohz_full_enabled(void)
+static __always_inline bool tick_nohz_full_enabled(void)
{
if (!context_tracking_enabled())
return false;
@@ -299,7 +299,7 @@ static inline void __tick_nohz_task_switch(void) { }
static inline void tick_nohz_full_setup(cpumask_var_t cpumask) { }
#endif
-static inline void tick_nohz_task_switch(void)
+static __always_inline void tick_nohz_task_switch(void)
{
if (tick_nohz_full_enabled())
__tick_nohz_task_switch();
diff --git a/include/linux/vtime.h b/include/linux/vtime.h
index 29dd5b91dd7d..428464bb81b3 100644
--- a/include/linux/vtime.h
+++ b/include/linux/vtime.h
@@ -67,24 +67,24 @@ static __always_inline void vtime_account_guest_exit(void)
* For now vtime state is tied to context tracking. We might want to decouple
* those later if necessary.
*/
-static inline bool vtime_accounting_enabled(void)
+static __always_inline bool vtime_accounting_enabled(void)
{
return context_tracking_enabled();
}
-static inline bool vtime_accounting_enabled_cpu(int cpu)
+static __always_inline bool vtime_accounting_enabled_cpu(int cpu)
{
return context_tracking_enabled_cpu(cpu);
}
-static inline bool vtime_accounting_enabled_this_cpu(void)
+static __always_inline bool vtime_accounting_enabled_this_cpu(void)
{
return context_tracking_enabled_this_cpu();
}
extern void vtime_task_switch_generic(struct task_struct *prev);
-static inline void vtime_task_switch(struct task_struct *prev)
+static __always_inline void vtime_task_switch(struct task_struct *prev)
{
if (vtime_accounting_enabled_this_cpu())
vtime_task_switch_generic(prev);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 12d3c42960f2..d56620c667dd 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4889,7 +4889,7 @@ static inline void prepare_task(struct task_struct *next)
WRITE_ONCE(next->on_cpu, 1);
}
-static inline void finish_task(struct task_struct *prev)
+static __always_inline void finish_task(struct task_struct *prev)
{
/*
* This must be the very last reference to @prev from this CPU. After
@@ -4905,7 +4905,7 @@ static inline void finish_task(struct task_struct *prev)
smp_store_release(&prev->on_cpu, 0);
}
-static void do_balance_callbacks(struct rq *rq, struct balance_callback *head)
+static __always_inline void do_balance_callbacks(struct rq *rq, struct balance_callback *head)
{
void (*func)(struct rq *rq);
struct balance_callback *next;
@@ -4940,7 +4940,7 @@ struct balance_callback balance_push_callback = {
.func = balance_push,
};
-static inline struct balance_callback *
+static __always_inline struct balance_callback *
__splice_balance_callbacks(struct rq *rq, bool split)
{
struct balance_callback *head = rq->balance_callback;
@@ -5014,7 +5014,7 @@ prepare_lock_switch(struct rq *rq, struct task_struct *next, struct rq_flags *rf
__acquire(__rq_lockp(this_rq()));
}
-static inline void finish_lock_switch(struct rq *rq)
+static __always_inline void finish_lock_switch(struct rq *rq)
__releases(__rq_lockp(rq))
{
/*
@@ -5047,7 +5047,7 @@ static inline void kmap_local_sched_out(void)
#endif
}
-static inline void kmap_local_sched_in(void)
+static __always_inline void kmap_local_sched_in(void)
{
#ifdef CONFIG_KMAP_LOCAL
if (unlikely(current->kmap_ctrl.idx))
@@ -5101,7 +5101,7 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev,
* past. 'prev == current' is still correct but we need to recalculate this_rq
* because prev may have moved to another CPU.
*/
-static struct rq *finish_task_switch(struct task_struct *prev)
+static __always_inline struct rq *finish_task_switch(struct task_struct *prev)
__releases(__rq_lockp(this_rq()))
{
struct rq *rq = this_rq();
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 2daa63b760dd..0b259e77ac67 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1427,12 +1427,12 @@ static inline struct cpumask *sched_group_span(struct sched_group *sg);
DECLARE_STATIC_KEY_FALSE(__sched_core_enabled);
-static inline bool sched_core_enabled(struct rq *rq)
+static __always_inline bool sched_core_enabled(struct rq *rq)
{
return static_branch_unlikely(&__sched_core_enabled) && rq->core_enabled;
}
-static inline bool sched_core_disabled(void)
+static __always_inline bool sched_core_disabled(void)
{
return !static_branch_unlikely(&__sched_core_enabled);
}
@@ -1441,7 +1441,7 @@ static inline bool sched_core_disabled(void)
* Be careful with this function; not for general use. The return value isn't
* stable unless you actually hold a relevant rq->__lock.
*/
-static inline raw_spinlock_t *rq_lockp(struct rq *rq)
+static __always_inline raw_spinlock_t *rq_lockp(struct rq *rq)
{
if (sched_core_enabled(rq))
return &rq->core->__lock;
@@ -1449,7 +1449,7 @@ static inline raw_spinlock_t *rq_lockp(struct rq *rq)
return &rq->__lock;
}
-static inline raw_spinlock_t *__rq_lockp(struct rq *rq)
+static __always_inline raw_spinlock_t *__rq_lockp(struct rq *rq)
__returns_ctx_lock(rq_lockp(rq)) /* alias them */
{
if (rq->core_enabled)
@@ -1544,12 +1544,12 @@ static inline bool sched_core_disabled(void)
return true;
}
-static inline raw_spinlock_t *rq_lockp(struct rq *rq)
+static __always_inline raw_spinlock_t *rq_lockp(struct rq *rq)
{
return &rq->__lock;
}
-static inline raw_spinlock_t *__rq_lockp(struct rq *rq)
+static __always_inline raw_spinlock_t *__rq_lockp(struct rq *rq)
__returns_ctx_lock(rq_lockp(rq)) /* alias them */
{
return &rq->__lock;
@@ -1604,33 +1604,33 @@ extern void raw_spin_rq_lock_nested(struct rq *rq, int subclass)
extern bool raw_spin_rq_trylock(struct rq *rq)
__cond_acquires(true, __rq_lockp(rq));
-static inline void raw_spin_rq_lock(struct rq *rq)
+static __always_inline void raw_spin_rq_lock(struct rq *rq)
__acquires(__rq_lockp(rq))
{
raw_spin_rq_lock_nested(rq, 0);
}
-static inline void raw_spin_rq_unlock(struct rq *rq)
+static __always_inline void raw_spin_rq_unlock(struct rq *rq)
__releases(__rq_lockp(rq))
{
raw_spin_unlock(rq_lockp(rq));
}
-static inline void raw_spin_rq_lock_irq(struct rq *rq)
+static __always_inline void raw_spin_rq_lock_irq(struct rq *rq)
__acquires(__rq_lockp(rq))
{
local_irq_disable();
raw_spin_rq_lock(rq);
}
-static inline void raw_spin_rq_unlock_irq(struct rq *rq)
+static __always_inline void raw_spin_rq_unlock_irq(struct rq *rq)
__releases(__rq_lockp(rq))
{
raw_spin_rq_unlock(rq);
local_irq_enable();
}
-static inline unsigned long _raw_spin_rq_lock_irqsave(struct rq *rq)
+static __always_inline unsigned long _raw_spin_rq_lock_irqsave(struct rq *rq)
__acquires(__rq_lockp(rq))
{
unsigned long flags;
@@ -1641,7 +1641,7 @@ static inline unsigned long _raw_spin_rq_lock_irqsave(struct rq *rq)
return flags;
}
-static inline void raw_spin_rq_unlock_irqrestore(struct rq *rq, unsigned long flags)
+static __always_inline void raw_spin_rq_unlock_irqrestore(struct rq *rq, unsigned long flags)
__releases(__rq_lockp(rq))
{
raw_spin_rq_unlock(rq);
--
2.51.0