[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20250212053644.14787-1-cpru@amazon.com>
Date: Tue, 11 Feb 2025 23:36:44 -0600
From: Cristian Prundeanu <cpru@...zon.com>
To: Peter Zijlstra <peterz@...radead.org>
CC: Cristian Prundeanu <cpru@...zon.com>, K Prateek Nayak
<kprateek.nayak@....com>, Hazem Mohamed Abuelfotoh <abuehaze@...zon.com>,
"Ali Saidi" <alisaidi@...zon.com>, Benjamin Herrenschmidt
<benh@...nel.crashing.org>, Geoff Blake <blakgeof@...zon.com>, Csaba Csoma
<csabac@...zon.com>, Bjoern Doebel <doebel@...zon.com>, Gautham Shenoy
<gautham.shenoy@....com>, Joseph Salisbury <joseph.salisbury@...cle.com>,
Dietmar Eggemann <dietmar.eggemann@....com>, Ingo Molnar <mingo@...hat.com>,
Linus Torvalds <torvalds@...ux-foundation.org>, Borislav Petkov
<bp@...en8.de>, <linux-arm-kernel@...ts.infradead.org>,
<linux-kernel@...r.kernel.org>, <linux-tip-commits@...r.kernel.org>,
<x86@...nel.org>
Subject: [PATCH v2] [tip: sched/core] sched: Move PLACE_LAG and RUN_TO_PARITY to sysctl
Replacing CFS with the EEVDF scheduler in kernel 6.6 introduced
significant performance degradation in multiple database-oriented
workloads. This degradation manifests in all kernel versions using EEVDF,
across multiple Linux distributions, hardware architectures (x86_64,
aarch64, amd64), and CPU generations.
Testing combinations of available scheduler features showed that the
largest improvement (short of disabling all EEVDF features) came from
disabling both PLACE_LAG and RUN_TO_PARITY.
Moving PLACE_LAG and RUN_TO_PARITY to sysctl will allow users to override
their default values and persist them with established mechanisms.
Link: https://lore.kernel.org/20241017052000.99200-1-cpru@amazon.com
Signed-off-by: Cristian Prundeanu <cpru@...zon.com>
---
v2: use latest sched/core; defer default value change to a follow-up patch
include/linux/sched/sysctl.h | 8 ++++++++
kernel/sched/core.c | 13 +++++++++++++
kernel/sched/fair.c | 7 ++++---
kernel/sched/features.h | 10 ----------
kernel/sysctl.c | 20 ++++++++++++++++++++
5 files changed, 45 insertions(+), 13 deletions(-)
diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
index 5a64582b086b..a899398bc1c4 100644
--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h
@@ -29,4 +29,12 @@ extern int sysctl_numa_balancing_mode;
#define sysctl_numa_balancing_mode 0
#endif
+#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
+extern unsigned int sysctl_sched_place_lag_enabled;
+extern unsigned int sysctl_sched_run_to_parity_enabled;
+#else
+#define sysctl_sched_place_lag_enabled 1
+#define sysctl_sched_run_to_parity_enabled 1
+#endif
+
#endif /* _LINUX_SCHED_SYSCTL_H */
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 9142a0394d46..a379240628ea 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -134,6 +134,19 @@ const_debug unsigned int sysctl_sched_features =
0;
#undef SCHED_FEAT
+#ifdef CONFIG_SYSCTL
+/*
+ * Using the avg_vruntime, do the right thing and preserve lag across
+ * sleep+wake cycles. EEVDF placement strategy #1, #2 if disabled.
+ */
+__read_mostly unsigned int sysctl_sched_place_lag_enabled = 1;
+/*
+ * Inhibit (wakeup) preemption until the current task has either matched the
+ * 0-lag point or until it has exhausted its slice.
+ */
+__read_mostly unsigned int sysctl_sched_run_to_parity_enabled = 1;
+#endif
+
/*
* Print a warning if need_resched is set for the given duration (if
* LATENCY_WARN is enabled).
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 1e78caa21436..c87fd1accd54 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -923,7 +923,8 @@ static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq)
* Once selected, run a task until it either becomes non-eligible or
* until it gets a new slice. See the HACK in set_next_entity().
*/
- if (sched_feat(RUN_TO_PARITY) && curr && curr->vlag == curr->deadline)
+ if (sysctl_sched_run_to_parity_enabled && curr &&
+ curr->vlag == curr->deadline)
return curr;
/* Pick the leftmost entity if it's eligible */
@@ -5199,7 +5200,7 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
*
* EEVDF: placement strategy #1 / #2
*/
- if (sched_feat(PLACE_LAG) && cfs_rq->nr_queued && se->vlag) {
+ if (sysctl_sched_place_lag_enabled && cfs_rq->nr_queued && se->vlag) {
struct sched_entity *curr = cfs_rq->curr;
unsigned long load;
@@ -9327,7 +9328,7 @@ static inline int task_is_ineligible_on_dst_cpu(struct task_struct *p, int dest_
#else
dst_cfs_rq = &cpu_rq(dest_cpu)->cfs;
#endif
- if (sched_feat(PLACE_LAG) && dst_cfs_rq->nr_queued &&
+ if (sysctl_sched_place_lag_enabled && dst_cfs_rq->nr_queued &&
!entity_eligible(task_cfs_rq(p), &p->se))
return 1;
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 3c12d9f93331..b98ec31ef2c4 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -1,10 +1,5 @@
/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Using the avg_vruntime, do the right thing and preserve lag across
- * sleep+wake cycles. EEVDF placement strategy #1, #2 if disabled.
- */
-SCHED_FEAT(PLACE_LAG, true)
/*
* Give new tasks half a slice to ease into the competition.
*/
@@ -13,11 +8,6 @@ SCHED_FEAT(PLACE_DEADLINE_INITIAL, true)
* Preserve relative virtual deadline on 'migration'.
*/
SCHED_FEAT(PLACE_REL_DEADLINE, true)
-/*
- * Inhibit (wakeup) preemption until the current task has either matched the
- * 0-lag point or until is has exhausted it's slice.
- */
-SCHED_FEAT(RUN_TO_PARITY, true)
/*
* Allow wakeup of tasks with a shorter slice to cancel RUN_TO_PARITY for
* current.
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 7ae7a4136855..11651d87f6d4 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -2019,6 +2019,26 @@ static struct ctl_table kern_table[] = {
.extra2 = SYSCTL_INT_MAX,
},
#endif
+#ifdef CONFIG_SCHED_DEBUG
+ {
+ .procname = "sched_place_lag_enabled",
+ .data = &sysctl_sched_place_lag_enabled,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ },
+ {
+ .procname = "sched_run_to_parity_enabled",
+ .data = &sysctl_sched_run_to_parity_enabled,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ },
+#endif
};
static struct ctl_table vm_table[] = {
base-commit: 05dbaf8dd8bf537d4b4eb3115ab42a5fb40ff1f5
--
2.48.1
Powered by blists - more mailing lists