lists.openwall.net | lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC | |
Open Source and information security mailing list archives
| ||
|
Date: Fri, 05 Dec 2008 08:49:54 +0100 From: Peter Zijlstra <a.p.zijlstra@...llo.nl> To: Ken Chen <kenchen@...gle.com> Cc: Ingo Molnar <mingo@...e.hu>, Linux Kernel Mailing List <linux-kernel@...r.kernel.org> Subject: Re: [patch] make wake-affine a sysctl variable On Thu, 2008-12-04 at 23:37 -0800, Ken Chen wrote: > Turn wake-affine into a fully controllable sysctl variable. > > This is not the first time that SD_WAKE_AFFINE load balance has detrimental > performance impact to server workloads. Three years ago, I presented hard > benchmark results to the community and showed that multi-threaded database > workloads suffered from poor load balance decision made by SD_WAKE_AFFINE. > Today, the very same 3 years old performance bug bite us again on internet > server application workload. The performance impact is even more astonishing, > at 28% performance degradation. > > The bug surfaced due to cover up agents went dormant. Usually there are > several load balance actions in the system, most notably the idle load > balancer and the newly-idle l-b. These two agents largely will correct the > mishaps that SD_WAKE_AFFINE would make and silently cover up bad behaving > task migrations. However, in our production environment, scheduler doesn't > have any chance to run the cover up agents because CPU cycles are used > heavily, i.e., CPUs are rarely idle and effectively suppressed idle and > newly-idle balancer. Without the corrective l-b action, the bug poke > right through and causing detrimental performance degradations. > > Several people in the past attempted to fiddle with SD_WAKE_AFFINE, none > was successful AFAIK. Short of any other answer, I propose again that > kernel provides a knob to turn the darn thing off if one chooses to. provide a benchmark people can fiddle with? Is it like pgbench, where one thread wakes a lot of others? I think that problem is solvable and just needs proper attention. > Signed-off-by: Ken Chen <kenchen@...gle.com> This patch looks like utter rubbish, sorry. > diff --git a/include/linux/sched.h b/include/linux/sched.h > index 55e30d1..d171cad 100644 > --- a/include/linux/sched.h > +++ b/include/linux/sched.h > @@ -1671,7 +1671,6 @@ extern unsigned int sysctl_sched_latency; > extern unsigned int sysctl_sched_min_granularity; > extern unsigned int sysctl_sched_wakeup_granularity; > extern unsigned int sysctl_sched_child_runs_first; > -extern unsigned int sysctl_sched_features; > extern unsigned int sysctl_sched_migration_cost; > extern unsigned int sysctl_sched_nr_migrate; > extern unsigned int sysctl_sched_shares_ratelimit; > @@ -1689,6 +1688,7 @@ int sched_rt_handler > loff_t *ppos); > > extern unsigned int sysctl_sched_compat_yield; > +extern unsigned int sysctl_sched_features; > > #ifdef CONFIG_RT_MUTEXES > extern int rt_mutex_getprio(struct task_struct *p); > diff --git a/kernel/sched.c b/kernel/sched.c > index b7480fb..188629b 100644 > --- a/kernel/sched.c > +++ b/kernel/sched.c > @@ -676,6 +676,7 @@ int runqueue_is_locked(void) > > #define SCHED_FEAT(name, enabled) \ > __SCHED_FEAT_##name , > +#define SCHED_FEAT_SYSCTL SCHED_FEAT > > enum { > #include "sched_features.h" > @@ -683,6 +684,7 @@ enum { > > #undef SCHED_FEAT > > +#ifdef CONFIG_SCHED_DEBUG > #define SCHED_FEAT(name, enabled) \ > (1UL << __SCHED_FEAT_##name) * enabled | > > @@ -692,7 +694,6 @@ const_debug unsigned int sysctl_sched_features = > > #undef SCHED_FEAT > > -#ifdef CONFIG_SCHED_DEBUG > #define SCHED_FEAT(name, enabled) \ > #name , > > @@ -702,6 +703,7 @@ static __read_mostly char *sched_feat_names[] = { > }; > > #undef SCHED_FEAT > +#undef SCHED_FEAT_SYSCTL > > static int sched_feat_open(struct inode *inode, struct file *filp) > { > @@ -801,9 +803,35 @@ static __init int sched_init_debug(void) > } > late_initcall(sched_init_debug); > > -#endif > - > #define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x)) > +#define sched_feat_sysctl sched_feat > + > +#else > + > +#undef SCHED_FEAT_SYSCTL > +#define SCHED_FEAT(name, enabled) \ > + (1UL << __SCHED_FEAT_##name) * enabled | > +#define SCHED_FEAT_SYSCTL(name, enabled) > + > +const_debug unsigned int const_sysctl_sched_features = > +#include "sched_features.h" > + 0; > +#define sched_feat(x) (const_sysctl_sched_features & (1 << __SCHED_FEAT_##x)) > +#undef SCHED_FEAT > +#undef SCHED_FEAT_SYSCTL > + > +#define SCHED_FEAT(name, enable) > +#define SCHED_FEAT_SYSCTL(name, enabled) \ > + (1UL << __SCHED_FEAT_##name) * enabled | > + > +unsigned int __read_mostly sysctl_sched_features = > +#include "sched_features.h" > + 0; > +#define sched_feat_sysctl(x) (sysctl_sched_features & (1 << __SCHED_FEAT_##x)) > +#undef SCHED_FEAT > +#undef SCHED_FEAT_SYSCTL > + > +#endif > > /* > * Number of tasks to iterate in a single balance run. > diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c > index 98345e4..679b17f 100644 > --- a/kernel/sched_fair.c > +++ b/kernel/sched_fair.c > @@ -1167,7 +1167,8 @@ wake_affine(struct sched_domain > unsigned long weight; > int balanced; > > - if (!(this_sd->flags & SD_WAKE_AFFINE) || !sched_feat(AFFINE_WAKEUPS)) > + if (!(this_sd->flags & SD_WAKE_AFFINE) || > + !sched_feat_sysctl(AFFINE_WAKEUPS)) > return 0; > > if (sync && (curr->se.avg_overlap > sysctl_sched_migration_cost || > diff --git a/kernel/sched_features.h b/kernel/sched_features.h > index da5d93b..fcc32a2 100644 > --- a/kernel/sched_features.h > +++ b/kernel/sched_features.h > @@ -1,8 +1,8 @@ > +SCHED_FEAT_SYSCTL(AFFINE_WAKEUPS, 1) > SCHED_FEAT(NEW_FAIR_SLEEPERS, 1) > SCHED_FEAT(NORMALIZED_SLEEPER, 1) > SCHED_FEAT(WAKEUP_PREEMPT, 1) > SCHED_FEAT(START_DEBIT, 1) > -SCHED_FEAT(AFFINE_WAKEUPS, 1) > SCHED_FEAT(CACHE_HOT_BUDDY, 1) > SCHED_FEAT(SYNC_WAKEUPS, 1) > SCHED_FEAT(HRTICK, 0) > diff --git a/kernel/sysctl.c b/kernel/sysctl.c > index 3d56fe7..74852c1 100644 > --- a/kernel/sysctl.c > +++ b/kernel/sysctl.c > @@ -297,14 +297,6 @@ static struct ctl_table kern_table[] = { > }, > { > .ctl_name = CTL_UNNUMBERED, > - .procname = "sched_features", > - .data = &sysctl_sched_features, > - .maxlen = sizeof(unsigned int), > - .mode = 0644, > - .proc_handler = &proc_dointvec, > - }, > - { > - .ctl_name = CTL_UNNUMBERED, > .procname = "sched_migration_cost", > .data = &sysctl_sched_migration_cost, > .maxlen = sizeof(unsigned int), > @@ -344,6 +336,14 @@ static struct ctl_table kern_table[] = { > .mode = 0644, > .proc_handler = &proc_dointvec, > }, > + { > + .ctl_name = CTL_UNNUMBERED, > + .procname = "sched_features", > + .data = &sysctl_sched_features, > + .maxlen = sizeof(unsigned int), > + .mode = 0644, > + .proc_handler = &proc_dointvec, > + }, > #ifdef CONFIG_PROVE_LOCKING > { > .ctl_name = CTL_UNNUMBERED, -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majordomo@...r.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Powered by blists - more mailing lists