[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20250821093807.49750-3-dongml2@chinatelecom.cn>
Date: Thu, 21 Aug 2025 17:38:06 +0800
From: Menglong Dong <menglong8.dong@...il.com>
To: peterz@...radead.org
Cc: mingo@...hat.com,
juri.lelli@...hat.com,
vincent.guittot@...aro.org,
dietmar.eggemann@....com,
rostedt@...dmis.org,
bsegall@...gle.com,
mgorman@...e.de,
vschneid@...hat.com,
ast@...nel.org,
daniel@...earbox.net,
john.fastabend@...il.com,
andrii@...nel.org,
martin.lau@...ux.dev,
eddyz87@...il.com,
song@...nel.org,
yonghong.song@...ux.dev,
kpsingh@...nel.org,
sdf@...ichev.me,
haoluo@...gle.com,
jolsa@...nel.org,
tzimmermann@...e.de,
simona.vetter@...ll.ch,
jani.nikula@...el.com,
linux-kernel@...r.kernel.org,
bpf@...r.kernel.org
Subject: [PATCH v3 2/3] sched: make migrate_enable/migrate_disable inline
For now, migrate_enable and migrate_disable are global, which makes them
become hotspots in some case. Take BPF for example, the function calling
to migrate_enable and migrate_disable in BPF trampoline can introduce
significant overhead, and following is the 'perf top' of FENTRY's
benchmark (./tools/testing/selftests/bpf/bench trig-fentry):
54.63% bpf_prog_2dcccf652aac1793_bench_trigger_fentry [k]
bpf_prog_2dcccf652aac1793_bench_trigger_fentry
10.43% [kernel] [k] migrate_enable
10.07% bpf_trampoline_6442517037 [k] bpf_trampoline_6442517037
8.06% [kernel] [k] __bpf_prog_exit_recur
4.11% libc.so.6 [.] syscall
2.15% [kernel] [k] entry_SYSCALL_64
1.48% [kernel] [k] memchr_inv
1.32% [kernel] [k] fput
1.16% [kernel] [k] _copy_to_user
0.73% [kernel] [k] bpf_prog_test_run_raw_tp
So in this commit, we make migrate_enable/migrate_disable inline to obtain
better performance. The struct rq is defined internally in
kernel/sched/sched.h, and the field "nr_pinned" is accessed in
migrate_enable/migrate_disable, which makes it hard to make them inline.
Alexei Starovoitov suggests to generate the offset of "nr_pinned" in [1],
so we can define the migrate_enable/migrate_disable in
include/linux/sched.h and access "this_rq()->nr_pinned" with
"(void *)this_rq() + RQ_nr_pinned".
The offset of "nr_pinned" is generated in include/generated/rq-offsets.h
by kernel/sched/rq-offsets.c.
Generally speaking, we move the definition of migrate_enable and
migrate_disable to include/linux/sched.h from kernel/sched/core.c. The
calling to __set_cpus_allowed_ptr() is leaved in ___migrate_enable().
The "struct rq" is not available in include/linux/sched.h, so we can't
access the "runqueues" with this_cpu_ptr(), as the compilation will fail
in this_cpu_ptr() -> raw_cpu_ptr() -> __verify_pcpu_ptr():
typeof((ptr) + 0)
So we introduce the this_rq_raw() and access the runqueues with
arch_raw_cpu_ptr/PERCPU_PTR directly.
The variable "runqueues" is not visible in the kernel modules, and export
it is not a good idea. As Peter Zijlstra advised in [2], we define and
export migrate_enable/migrate_disable in kernel/sched/core.c too, and use
them for the modules.
Before this patch, the performance of BPF FENTRY is:
fentry : 113.030 ± 0.149M/s
fentry : 112.501 ± 0.187M/s
fentry : 112.828 ± 0.267M/s
fentry : 115.287 ± 0.241M/s
After this patch, the performance of BPF FENTRY increases to:
fentry : 143.644 ± 0.670M/s
fentry : 149.764 ± 0.362M/s
fentry : 149.642 ± 0.156M/s
fentry : 145.263 ± 0.221M/s
Link: https://lore.kernel.org/bpf/CAADnVQ+5sEDKHdsJY5ZsfGDO_1SEhhQWHrt2SMBG5SYyQ+jt7w@mail.gmail.com/ [1]
Link: https://lore.kernel.org/all/20250819123214.GH4067720@noisy.programming.kicks-ass.net/ [2]
Signed-off-by: Menglong Dong <dongml2@...natelecom.cn>
---
v3:
- don't export runqueues, define migrate_enable and migrate_disable in
kernel/sched/core.c and use them for kernel modules instead
- define the macro this_rq_pinned()
- add some comment for this_rq_raw()
v2:
- use PERCPU_PTR() for this_rq_raw() if !CONFIG_SMP
---
Kbuild | 13 ++++-
include/linux/preempt.h | 3 --
include/linux/sched.h | 106 ++++++++++++++++++++++++++++++++++++++
kernel/bpf/verifier.c | 1 +
kernel/sched/core.c | 63 +++++-----------------
kernel/sched/rq-offsets.c | 12 +++++
6 files changed, 145 insertions(+), 53 deletions(-)
create mode 100644 kernel/sched/rq-offsets.c
diff --git a/Kbuild b/Kbuild
index f327ca86990c..13324b4bbe23 100644
--- a/Kbuild
+++ b/Kbuild
@@ -34,13 +34,24 @@ arch/$(SRCARCH)/kernel/asm-offsets.s: $(timeconst-file) $(bounds-file)
$(offsets-file): arch/$(SRCARCH)/kernel/asm-offsets.s FORCE
$(call filechk,offsets,__ASM_OFFSETS_H__)
+# Generate rq-offsets.h
+
+rq-offsets-file := include/generated/rq-offsets.h
+
+targets += kernel/sched/rq-offsets.s
+
+kernel/sched/rq-offsets.s: $(offsets-file)
+
+$(rq-offsets-file): kernel/sched/rq-offsets.s FORCE
+ $(call filechk,offsets,__RQ_OFFSETS_H__)
+
# Check for missing system calls
quiet_cmd_syscalls = CALL $<
cmd_syscalls = $(CONFIG_SHELL) $< $(CC) $(c_flags) $(missing_syscalls_flags)
PHONY += missing-syscalls
-missing-syscalls: scripts/checksyscalls.sh $(offsets-file)
+missing-syscalls: scripts/checksyscalls.sh $(rq-offsets-file)
$(call cmd,syscalls)
# Check the manual modification of atomic headers
diff --git a/include/linux/preempt.h b/include/linux/preempt.h
index 1fad1c8a4c76..92237c319035 100644
--- a/include/linux/preempt.h
+++ b/include/linux/preempt.h
@@ -424,8 +424,6 @@ static inline void preempt_notifier_init(struct preempt_notifier *notifier,
* work-conserving schedulers.
*
*/
-extern void migrate_disable(void);
-extern void migrate_enable(void);
/**
* preempt_disable_nested - Disable preemption inside a normally preempt disabled section
@@ -471,7 +469,6 @@ static __always_inline void preempt_enable_nested(void)
DEFINE_LOCK_GUARD_0(preempt, preempt_disable(), preempt_enable())
DEFINE_LOCK_GUARD_0(preempt_notrace, preempt_disable_notrace(), preempt_enable_notrace())
-DEFINE_LOCK_GUARD_0(migrate, migrate_disable(), migrate_enable())
#ifdef CONFIG_PREEMPT_DYNAMIC
diff --git a/include/linux/sched.h b/include/linux/sched.h
index f8188b833350..ae73fb70cc05 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -49,6 +49,9 @@
#include <linux/tracepoint-defs.h>
#include <linux/unwind_deferred_types.h>
#include <asm/kmap_size.h>
+#ifndef COMPILE_OFFSETS
+#include <generated/rq-offsets.h>
+#endif
/* task_struct member predeclarations (sorted alphabetically): */
struct audit_context;
@@ -2312,4 +2315,107 @@ static __always_inline void alloc_tag_restore(struct alloc_tag *tag, struct allo
#define alloc_tag_restore(_tag, _old) do {} while (0)
#endif
+#ifndef MODULE
+#ifndef COMPILE_OFFSETS
+
+extern void ___migrate_enable(void);
+
+struct rq;
+DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
+
+/* The "struct rq" is not available here, so we can't access the
+ * "runqueues" with this_cpu_ptr(), as the compilation will fail in
+ * this_cpu_ptr() -> raw_cpu_ptr() -> __verify_pcpu_ptr():
+ * typeof((ptr) + 0)
+ *
+ * So use arch_raw_cpu_ptr()/PERCPU_PTR() directly here.
+ */
+#ifdef CONFIG_SMP
+#define this_rq_raw() arch_raw_cpu_ptr(&runqueues)
+#else
+#define this_rq_raw() PERCPU_PTR(&runqueues)
+#endif
+#define this_rq_pinned() (*(unsigned int *)((void *)this_rq_raw() + RQ_nr_pinned))
+
+static inline void __migrate_enable(void)
+{
+ struct task_struct *p = current;
+
+#ifdef CONFIG_DEBUG_PREEMPT
+ /*
+ * Check both overflow from migrate_disable() and superfluous
+ * migrate_enable().
+ */
+ if (WARN_ON_ONCE((s16)p->migration_disabled <= 0))
+ return;
+#endif
+
+ if (p->migration_disabled > 1) {
+ p->migration_disabled--;
+ return;
+ }
+
+ /*
+ * Ensure stop_task runs either before or after this, and that
+ * __set_cpus_allowed_ptr(SCA_MIGRATE_ENABLE) doesn't schedule().
+ */
+ guard(preempt)();
+ if (unlikely(p->cpus_ptr != &p->cpus_mask))
+ ___migrate_enable();
+ /*
+ * Mustn't clear migration_disabled() until cpus_ptr points back at the
+ * regular cpus_mask, otherwise things that race (eg.
+ * select_fallback_rq) get confused.
+ */
+ barrier();
+ p->migration_disabled = 0;
+ this_rq_pinned()--;
+}
+
+static inline void __migrate_disable(void)
+{
+ struct task_struct *p = current;
+
+ if (p->migration_disabled) {
+#ifdef CONFIG_DEBUG_PREEMPT
+ /*
+ *Warn about overflow half-way through the range.
+ */
+ WARN_ON_ONCE((s16)p->migration_disabled < 0);
+#endif
+ p->migration_disabled++;
+ return;
+ }
+
+ guard(preempt)();
+ this_rq_pinned()++;
+ p->migration_disabled = 1;
+}
+#else /* !COMPILE_OFFSETS */
+static inline void __migrate_disable(void) { }
+static inline void __migrate_enable(void) { }
+#endif /* !COMPILE_OFFSETS */
+
+#ifndef CREATE_MIGRATE_DISABLE
+static inline void migrate_disable(void)
+{
+ __migrate_disable();
+}
+
+static inline void migrate_enable(void)
+{
+ __migrate_enable();
+}
+#else /* CREATE_MIGRATE_DISABLE */
+extern void migrate_disable(void);
+extern void migrate_enable(void);
+#endif /* CREATE_MIGRATE_DISABLE */
+
+#else /* MODULE */
+extern void migrate_disable(void);
+extern void migrate_enable(void);
+#endif /* MODULE */
+
+DEFINE_LOCK_GUARD_0(migrate, migrate_disable(), migrate_enable())
+
#endif
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index c4f69a9e9af6..de9078a9df3a 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -23855,6 +23855,7 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,
BTF_SET_START(btf_id_deny)
BTF_ID_UNUSED
#ifdef CONFIG_SMP
+BTF_ID(func, ___migrate_enable)
BTF_ID(func, migrate_disable)
BTF_ID(func, migrate_enable)
#endif
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index be00629f0ba4..58164a69449d 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -7,6 +7,8 @@
* Copyright (C) 1991-2002 Linus Torvalds
* Copyright (C) 1998-2024 Ingo Molnar, Red Hat
*/
+#define CREATE_MIGRATE_DISABLE
+#include <linux/sched.h>
#include <linux/highmem.h>
#include <linux/hrtimer_api.h>
#include <linux/ktime_api.h>
@@ -2381,28 +2383,7 @@ static void migrate_disable_switch(struct rq *rq, struct task_struct *p)
__do_set_cpus_allowed(p, &ac);
}
-void migrate_disable(void)
-{
- struct task_struct *p = current;
-
- if (p->migration_disabled) {
-#ifdef CONFIG_DEBUG_PREEMPT
- /*
- *Warn about overflow half-way through the range.
- */
- WARN_ON_ONCE((s16)p->migration_disabled < 0);
-#endif
- p->migration_disabled++;
- return;
- }
-
- guard(preempt)();
- this_rq()->nr_pinned++;
- p->migration_disabled = 1;
-}
-EXPORT_SYMBOL_GPL(migrate_disable);
-
-void migrate_enable(void)
+void ___migrate_enable(void)
{
struct task_struct *p = current;
struct affinity_context ac = {
@@ -2410,35 +2391,19 @@ void migrate_enable(void)
.flags = SCA_MIGRATE_ENABLE,
};
-#ifdef CONFIG_DEBUG_PREEMPT
- /*
- * Check both overflow from migrate_disable() and superfluous
- * migrate_enable().
- */
- if (WARN_ON_ONCE((s16)p->migration_disabled <= 0))
- return;
-#endif
+ __set_cpus_allowed_ptr(p, &ac);
+}
+EXPORT_SYMBOL_GPL(___migrate_enable);
- if (p->migration_disabled > 1) {
- p->migration_disabled--;
- return;
- }
+void migrate_disable(void)
+{
+ __migrate_disable();
+}
+EXPORT_SYMBOL_GPL(migrate_disable);
- /*
- * Ensure stop_task runs either before or after this, and that
- * __set_cpus_allowed_ptr(SCA_MIGRATE_ENABLE) doesn't schedule().
- */
- guard(preempt)();
- if (p->cpus_ptr != &p->cpus_mask)
- __set_cpus_allowed_ptr(p, &ac);
- /*
- * Mustn't clear migration_disabled() until cpus_ptr points back at the
- * regular cpus_mask, otherwise things that race (eg.
- * select_fallback_rq) get confused.
- */
- barrier();
- p->migration_disabled = 0;
- this_rq()->nr_pinned--;
+void migrate_enable(void)
+{
+ __migrate_enable();
}
EXPORT_SYMBOL_GPL(migrate_enable);
diff --git a/kernel/sched/rq-offsets.c b/kernel/sched/rq-offsets.c
new file mode 100644
index 000000000000..a23747bbe25b
--- /dev/null
+++ b/kernel/sched/rq-offsets.c
@@ -0,0 +1,12 @@
+// SPDX-License-Identifier: GPL-2.0
+#define COMPILE_OFFSETS
+#include <linux/kbuild.h>
+#include <linux/types.h>
+#include "sched.h"
+
+int main(void)
+{
+ DEFINE(RQ_nr_pinned, offsetof(struct rq, nr_pinned));
+
+ return 0;
+}
--
2.50.1
Powered by blists - more mailing lists