lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [thread-next>] [day] [month] [year] [list]
Date:	Thu, 4 Dec 2008 23:37:19 -0800
From:	Ken Chen <kenchen@...gle.com>
To:	Ingo Molnar <mingo@...e.hu>,
	Peter Zijlstra <a.p.zijlstra@...llo.nl>
Cc:	Linux Kernel Mailing List <linux-kernel@...r.kernel.org>
Subject: [patch] make wake-affine a sysctl variable

Turn wake-affine into a fully controllable sysctl variable.

This is not the first time that SD_WAKE_AFFINE load balance has detrimental
performance impact to server workloads. Three years ago, I presented hard
benchmark results to the community and showed that multi-threaded database
workloads suffered from poor load balance decision made by SD_WAKE_AFFINE.
Today, the very same 3 years old performance bug bite us again on internet
server application workload. The performance impact is even more astonishing,
at 28% performance degradation.

The bug surfaced due to cover up agents went dormant.  Usually there are
several load balance actions in the system, most notably the idle load
balancer and the newly-idle l-b.  These two agents largely will correct the
mishaps that SD_WAKE_AFFINE would make and silently cover up bad behaving
task migrations.  However, in our production environment, scheduler doesn't
have any chance to run the cover up agents because CPU cycles are used
heavily, i.e., CPUs are rarely idle and effectively suppressed idle and
newly-idle balancer.  Without the corrective l-b action, the bug poke
right through and causing detrimental performance degradations.

Several people in the past attempted to fiddle with SD_WAKE_AFFINE, none
was successful AFAIK. Short of any other answer, I propose again that
kernel provides a knob to turn the darn thing off if one chooses to.


Signed-off-by: Ken Chen <kenchen@...gle.com>

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 55e30d1..d171cad 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1671,7 +1671,6 @@ extern unsigned int sysctl_sched_latency;
 extern unsigned int sysctl_sched_min_granularity;
 extern unsigned int sysctl_sched_wakeup_granularity;
 extern unsigned int sysctl_sched_child_runs_first;
-extern unsigned int sysctl_sched_features;
 extern unsigned int sysctl_sched_migration_cost;
 extern unsigned int sysctl_sched_nr_migrate;
 extern unsigned int sysctl_sched_shares_ratelimit;
@@ -1689,6 +1688,7 @@ int sched_rt_handler
 		loff_t *ppos);

 extern unsigned int sysctl_sched_compat_yield;
+extern unsigned int sysctl_sched_features;

 #ifdef CONFIG_RT_MUTEXES
 extern int rt_mutex_getprio(struct task_struct *p);
diff --git a/kernel/sched.c b/kernel/sched.c
index b7480fb..188629b 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -676,6 +676,7 @@ int runqueue_is_locked(void)

 #define SCHED_FEAT(name, enabled)	\
 	__SCHED_FEAT_##name ,
+#define SCHED_FEAT_SYSCTL	SCHED_FEAT

 enum {
 #include "sched_features.h"
@@ -683,6 +684,7 @@ enum {

 #undef SCHED_FEAT

+#ifdef CONFIG_SCHED_DEBUG
 #define SCHED_FEAT(name, enabled)	\
 	(1UL << __SCHED_FEAT_##name) * enabled |

@@ -692,7 +694,6 @@ const_debug unsigned int sysctl_sched_features =

 #undef SCHED_FEAT

-#ifdef CONFIG_SCHED_DEBUG
 #define SCHED_FEAT(name, enabled)	\
 	#name ,

@@ -702,6 +703,7 @@ static __read_mostly char *sched_feat_names[] = {
 };

 #undef SCHED_FEAT
+#undef SCHED_FEAT_SYSCTL

 static int sched_feat_open(struct inode *inode, struct file *filp)
 {
@@ -801,9 +803,35 @@ static __init int sched_init_debug(void)
 }
 late_initcall(sched_init_debug);

-#endif
-
 #define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x))
+#define sched_feat_sysctl	sched_feat
+
+#else
+
+#undef SCHED_FEAT_SYSCTL
+#define SCHED_FEAT(name, enabled)	\
+	(1UL << __SCHED_FEAT_##name) * enabled |
+#define SCHED_FEAT_SYSCTL(name, enabled)
+
+const_debug unsigned int const_sysctl_sched_features =
+#include "sched_features.h"
+	0;
+#define sched_feat(x) (const_sysctl_sched_features & (1 << __SCHED_FEAT_##x))
+#undef SCHED_FEAT
+#undef SCHED_FEAT_SYSCTL
+
+#define SCHED_FEAT(name, enable)
+#define SCHED_FEAT_SYSCTL(name, enabled)	\
+	(1UL << __SCHED_FEAT_##name) * enabled |
+
+unsigned int __read_mostly sysctl_sched_features =
+#include "sched_features.h"
+	0;
+#define sched_feat_sysctl(x) (sysctl_sched_features & (1 << __SCHED_FEAT_##x))
+#undef SCHED_FEAT
+#undef SCHED_FEAT_SYSCTL
+
+#endif

 /*
  * Number of tasks to iterate in a single balance run.
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 98345e4..679b17f 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1167,7 +1167,8 @@ wake_affine(struct sched_domain
 	unsigned long weight;
 	int balanced;

-	if (!(this_sd->flags & SD_WAKE_AFFINE) || !sched_feat(AFFINE_WAKEUPS))
+	if (!(this_sd->flags & SD_WAKE_AFFINE) ||
+	    !sched_feat_sysctl(AFFINE_WAKEUPS))
 		return 0;

 	if (sync && (curr->se.avg_overlap > sysctl_sched_migration_cost ||
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index da5d93b..fcc32a2 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -1,8 +1,8 @@
+SCHED_FEAT_SYSCTL(AFFINE_WAKEUPS, 1)
 SCHED_FEAT(NEW_FAIR_SLEEPERS, 1)
 SCHED_FEAT(NORMALIZED_SLEEPER, 1)
 SCHED_FEAT(WAKEUP_PREEMPT, 1)
 SCHED_FEAT(START_DEBIT, 1)
-SCHED_FEAT(AFFINE_WAKEUPS, 1)
 SCHED_FEAT(CACHE_HOT_BUDDY, 1)
 SCHED_FEAT(SYNC_WAKEUPS, 1)
 SCHED_FEAT(HRTICK, 0)
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 3d56fe7..74852c1 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -297,14 +297,6 @@ static struct ctl_table kern_table[] = {
 	},
 	{
 		.ctl_name	= CTL_UNNUMBERED,
-		.procname	= "sched_features",
-		.data		= &sysctl_sched_features,
-		.maxlen		= sizeof(unsigned int),
-		.mode		= 0644,
-		.proc_handler	= &proc_dointvec,
-	},
-	{
-		.ctl_name	= CTL_UNNUMBERED,
 		.procname	= "sched_migration_cost",
 		.data		= &sysctl_sched_migration_cost,
 		.maxlen		= sizeof(unsigned int),
@@ -344,6 +336,14 @@ static struct ctl_table kern_table[] = {
 		.mode		= 0644,
 		.proc_handler	= &proc_dointvec,
 	},
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "sched_features",
+		.data		= &sysctl_sched_features,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
 #ifdef CONFIG_PROVE_LOCKING
 	{
 		.ctl_name	= CTL_UNNUMBERED,
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Powered by blists - more mailing lists