Message-ID: <1259253950.31676.249.camel@laptop>
Date:	Thu, 26 Nov 2009 17:45:50 +0100
From:	Peter Zijlstra <peterz@...radead.org>
To:	Christian Ehrhardt <ehrhardt@...ux.vnet.ibm.com>
Cc:	Ingo Molnar <mingo@...e.hu>,
	"linux-kernel@...r.kernel.org" <linux-kernel@...r.kernel.org>,
	Holger.Wolf@...ibm.com, epasch@...ibm.com,
	Martin Schwidefsky <schwidefsky@...ibm.com>
Subject: Re: Missing recalculation of scheduler tunables in case of cpu hot
 add/remove

On Thu, 2009-11-26 at 17:31 +0100, Christian Ehrhardt wrote:
> Peter Zijlstra wrote:
> > On Thu, 2009-11-26 at 17:25 +0100, Christian Ehrhardt wrote:
> >   
> >>> Aside from that, we probably should put an upper limit in place, as I
> >>> guess large cpu count machines get silly large values
> >>>       
> >> I agree with that, but the code already has an upper limit of
> >> 200,000,000 - though we might discuss whether that is too low or too high.
> >>     
> >
> > Yeah, I think we should cap it at around 8-16 CPUs.
> >
> >   
> ok for me. Driven by that finding I think I have to measure different
> kinds of scaling anyway, but as usual that takes some time :-/
> At least too much time for the discussion & solution of this bug, I guess.
> 
> The question for now is what to do on cpu hot add/remove.
> Would hooking somewhere in kernel/cpu.c be the right approach? I'm not
> quite sure about my own suggestion yet :-).
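
Just to make the scaling we are talking about concrete, here is a
little userspace toy (not something to merge; it just reuses the 5ms
sched_latency default and the 1 + ilog2(ncpus) factor from the code
below, and compares the uncapped factor against an 8-CPU cap):

#include <stdio.h>

/* gives the same result as the kernel's ilog2() for these values */
static unsigned int ilog2(unsigned int n)
{
	unsigned int l = 0;

	while (n >>= 1)
		l++;
	return l;
}

int main(void)
{
	const unsigned int def_latency = 5000000; /* 5ms default, in ns */
	unsigned int cpus;

	for (cpus = 1; cpus <= 128; cpus *= 2) {
		unsigned int factor = 1 + ilog2(cpus);
		unsigned int capped = 1 + ilog2(cpus < 8 ? cpus : 8);

		printf("%4u cpus: factor %u -> %9u ns, capped %u -> %9u ns\n",
		       cpus, factor, factor * def_latency,
		       capped, capped * def_latency);
	}
	return 0;
}

A 64-way box ends up at 7 * 5ms = 35ms latency uncapped, versus 20ms
with the cap, which is the kind of silly growth the cap is meant to
stop.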

Something like the below might work, I suppose; it just needs a cleanup
and such.


diff --git a/kernel/sched.c b/kernel/sched.c
index 0cbf2ef..210365f 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -814,6 +814,7 @@ const_debug unsigned int sysctl_sched_nr_migrate = 32;
  * default: 0.25ms
  */
 unsigned int sysctl_sched_shares_ratelimit = 250000;
+unsigned int default_sysctl_sched_shares_ratelimit = 250000;
 
 /*
  * Inject some fuzzyness into changing the per-cpu group shares
@@ -1810,6 +1811,7 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
 #endif
 
 static void calc_load_account_active(struct rq *this_rq);
+static void update_sysctl(void);
 
 #include "sched_stats.h"
 #include "sched_idletask.c"
@@ -7019,22 +7021,24 @@ cpumask_var_t nohz_cpu_mask;
  *
  * This idea comes from the SD scheduler of Con Kolivas:
  */
-static inline void sched_init_granularity(void)
+#define SET_SYSCTL(name) \
+	(sysctl_##name = (factor) * default_sysctl_##name)
+
+static void update_sysctl(void)
 {
-	unsigned int factor = 1 + ilog2(num_online_cpus());
+	unsigned int cpus = min(num_active_cpus(), 8U);
+	unsigned int factor = 1 + ilog2(cpus);
-	const unsigned long limit = 200000000;
 
-	sysctl_sched_min_granularity *= factor;
-	if (sysctl_sched_min_granularity > limit)
-		sysctl_sched_min_granularity = limit;
-
-	sysctl_sched_latency *= factor;
-	if (sysctl_sched_latency > limit)
-		sysctl_sched_latency = limit;
-
-	sysctl_sched_wakeup_granularity *= factor;
+	SET_SYSCTL(sched_min_granularity);
+	SET_SYSCTL(sched_latency);
+	SET_SYSCTL(sched_wakeup_granularity);
+	SET_SYSCTL(sched_shares_ratelimit);
+#undef SET_SYSCTL
+}
 
-	sysctl_sched_shares_ratelimit *= factor;
+static inline void sched_init_granularity(void)
+{
+	update_sysctl();
 }
 
 #ifdef CONFIG_SMP
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 0ff21af..4d429b8 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -35,12 +35,14 @@
  *  run vmstat and monitor the context-switches (cs) field)
  */
 unsigned int sysctl_sched_latency = 5000000ULL;
+unsigned int default_sysctl_sched_latency = 5000000ULL;
 
 /*
  * Minimal preemption granularity for CPU-bound tasks:
  * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
  */
 unsigned int sysctl_sched_min_granularity = 1000000ULL;
+unsigned int default_sysctl_sched_min_granularity = 1000000ULL;
 
 /*
  * is kept at sysctl_sched_latency / sysctl_sched_min_granularity
@@ -70,6 +72,7 @@ unsigned int __read_mostly sysctl_sched_compat_yield;
  * have immediate wakeup/sleep latencies.
  */
 unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
+unsigned int default_sysctl_sched_wakeup_granularity = 1000000UL;
 
 const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
 
@@ -1905,6 +1908,17 @@ move_one_task_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
 
 	return 0;
 }
+
+static void rq_online_fair(struct rq *rq)
+{
+	update_sysctl();
+}
+
+static void rq_offline_fair(struct rq *rq)
+{
+	update_sysctl();
+}
+
 #endif /* CONFIG_SMP */
 
 /*
@@ -2052,6 +2066,8 @@ static const struct sched_class fair_sched_class = {
 
 	.load_balance		= load_balance_fair,
 	.move_one_task		= move_one_task_fair,
+	.rq_online		= rq_online_fair,
+	.rq_offline		= rq_offline_fair,
 #endif
 
 	.set_curr_task          = set_curr_task_fair,

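For completeness, one way to eyeball that the recalculation actually
happens would be something like the sketch below. Assumptions: a
CONFIG_SCHED_DEBUG kernel (so /proc/sys/kernel/sched_latency_ns is
there at all), a hotpluggable cpu1, and root. Nothing in it is
introduced by the patch, it only pokes the standard /proc and /sys
files:

#include <stdio.h>
#include <stdlib.h>

static unsigned long read_ul(const char *path)
{
	FILE *f = fopen(path, "r");
	unsigned long val;

	if (!f || fscanf(f, "%lu", &val) != 1) {
		perror(path);
		exit(1);
	}
	fclose(f);
	return val;
}

static void write_str(const char *path, const char *s)
{
	FILE *f = fopen(path, "w");

	if (!f || fputs(s, f) == EOF) {
		perror(path);
		exit(1);
	}
	fclose(f);
}

int main(void)
{
	const char *lat = "/proc/sys/kernel/sched_latency_ns";
	const char *cpu1 = "/sys/devices/system/cpu/cpu1/online";

	printf("latency, cpu1 online:  %lu ns\n", read_ul(lat));
	write_str(cpu1, "0");			/* offline cpu1 */
	printf("latency, cpu1 offline: %lu ns\n", read_ul(lat));
	write_str(cpu1, "1");			/* bring it back */
	printf("latency, cpu1 online:  %lu ns\n", read_ul(lat));
	return 0;
}

On a 2-CPU box you would expect 10000000 -> 5000000 -> 10000000 with
the patch applied, and three identical values without it.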
