Message-ID: <Pine.LNX.4.64.0709102338170.1817@scrub.home>
Date:	Tue, 11 Sep 2007 01:23:40 +0200 (CEST)
From:	Roman Zippel <zippel@...ux-m68k.org>
To:	Mike Galbraith <efault@....de>
cc:	Ingo Molnar <mingo@...e.hu>, Daniel Walker <dwalker@...sta.com>,
	linux-kernel@...r.kernel.org, peterz@...radead.org
Subject: Re: [ANNOUNCE/RFC] Really Fair Scheduler

Hi,

On Sat, 8 Sep 2007, Mike Galbraith wrote:

> > On Sun, 2 Sep 2007, Ingo Molnar wrote:
> > 
> > Below is a patch updated against the latest git tree, no major changes.
> 
> Interesting, I see major behavioral changes.
> 
> I still see an aberration with fairtest2.  On startup, the hog component
> will consume 100% cpu for a bit, then the sleeper shows up.  This
> doesn't always happen, but happens quite often.

I found the cause of this. What basically happened is that a task that 
hadn't been running for a second was enqueued first on an idle queue and 
kept that advantage over tasks that had been running more recently until 
it caught up. The new version now remembers where the last task left off 
and uses that for the first task that restarts the queue. As a side 
effect it also limits the bonus a task gets if multiple tasks are woken 
at the same time.
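
The core of the fix is small; here is a minimal standalone sketch of the 
restart logic (modeled on the dequeue_entity()/enqueue_entity() code in 
the patch below; the mini_* names and types are simplified stand-ins for 
illustration, not the kernel structures):

/*
 * Sketch of the queue-restart logic, simplified from the patch below.
 */
#include <stdint.h>

typedef int64_t kclock_t;

struct mini_rq {
	kclock_t time_avg;	/* average normalized time of the queue */
	kclock_t time_avg_min;	/* furthest point the queue has reached */
};

struct mini_se {
	kclock_t time_norm;		/* task's normalized runtime */
	kclock_t req_weight_inv;	/* one requested slice, weight-scaled */
};

static kclock_t kmax(kclock_t a, kclock_t b) { return a > b ? a : b; }

/* Last task leaves an idle queue: remember where it left off. */
static void dequeue_last_sketch(struct mini_rq *rq, struct mini_se *se)
{
	rq->time_avg_min = kmax(rq->time_avg_min, se->time_norm);
}

/* A task restarting the queue continues from that point and gets at most
 * one requested slice of bonus, which also caps the advantage when many
 * sleepers are woken at the same time. */
static void enqueue_sketch(struct mini_rq *rq, struct mini_se *se)
{
	rq->time_avg_min = kmax(rq->time_avg_min, rq->time_avg);
	se->time_norm = kmax(rq->time_avg_min - se->req_weight_inv,
			     se->time_norm);
}

In the patch this is folded into enqueue_entity() via cfs_rq->time_avg_min.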

This version also limits runtime overruns, i.e. cases where a task 
exceeds its time slice because the timer interrupt was disabled and the 
current task therefore wasn't preempted. This usually happens during 
boot, but also when using a serial console.
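
The limit itself is a simple clamp; here is a minimal sketch (modeled on 
update_curr() in the patch below, again with simplified illustrative 
types):

/*
 * Sketch of the runtime-overrun clamp, simplified from the patch below.
 */
#include <stdint.h>

typedef int64_t kclock_t;

struct mini_cfs {
	kclock_t run_end;	/* normalized time at which curr should yield */
	unsigned long overruns;	/* how often the clamp fired */
};

/* Account newly executed (normalized) time to the current task, but if
 * preemption was delayed - e.g. timer interrupts off during boot or a
 * slow serial console - don't let it run ahead of its end point by more
 * than one extra slice. */
static kclock_t account_sketch(struct mini_cfs *cfs, kclock_t time_norm,
			       kclock_t delta_norm, kclock_t req_weight_inv)
{
	time_norm += delta_norm;
	if (time_norm - cfs->run_end > req_weight_inv) {
		cfs->overruns++;
		time_norm = cfs->run_end + req_weight_inv;
	}
	return time_norm;
}

In the patch the clamp only applies while another task is queued; 
otherwise run_end is simply pushed forward by one slice.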

bye, Roman

---
 include/linux/sched.h |   26 -
 kernel/sched.c        |  173 +++++----
 kernel/sched_debug.c  |   26 -
 kernel/sched_norm.c   |  870 ++++++++++++++++++++++++++++++++++++++++++++++++++
 kernel/sysctl.c       |   30 -
 5 files changed, 977 insertions(+), 148 deletions(-)

Index: linux-2.6/include/linux/sched.h
===================================================================
--- linux-2.6.orig/include/linux/sched.h
+++ linux-2.6/include/linux/sched.h
@@ -884,40 +884,28 @@ struct load_weight {
  *
  * Current field usage histogram:
  *
- *     4 se->block_start
  *     4 se->run_node
- *     4 se->sleep_start
- *     4 se->sleep_start_fair
  *     6 se->load.weight
- *     7 se->delta_fair
- *    15 se->wait_runtime
  */
 struct sched_entity {
-	long			wait_runtime;
-	unsigned long		delta_fair_run;
-	unsigned long		delta_fair_sleep;
-	unsigned long		delta_exec;
-	s64			fair_key;
+	s64			time_key;
 	struct load_weight	load;		/* for load-balancing */
 	struct rb_node		run_node;
-	unsigned int		on_rq;
+	unsigned int		on_rq, queued;
+	unsigned int		weight_shift;
 
 	u64			exec_start;
 	u64			sum_exec_runtime;
-	u64			prev_sum_exec_runtime;
-	u64			wait_start_fair;
-	u64			sleep_start_fair;
+	s64			time_norm;
+	s64			req_weight_inv;
 
 #ifdef CONFIG_SCHEDSTATS
-	u64			wait_start;
 	u64			wait_max;
 	s64			sum_wait_runtime;
 
-	u64			sleep_start;
 	u64			sleep_max;
 	s64			sum_sleep_runtime;
 
-	u64			block_start;
 	u64			block_max;
 	u64			exec_max;
 
@@ -1400,12 +1388,10 @@ static inline void idle_task_exit(void) 
 
 extern void sched_idle_next(void);
 
-extern unsigned int sysctl_sched_latency;
-extern unsigned int sysctl_sched_min_granularity;
+extern unsigned int sysctl_sched_granularity;
 extern unsigned int sysctl_sched_wakeup_granularity;
 extern unsigned int sysctl_sched_batch_wakeup_granularity;
 extern unsigned int sysctl_sched_stat_granularity;
-extern unsigned int sysctl_sched_runtime_limit;
 extern unsigned int sysctl_sched_child_runs_first;
 extern unsigned int sysctl_sched_features;
 
Index: linux-2.6/kernel/sched.c
===================================================================
--- linux-2.6.orig/kernel/sched.c
+++ linux-2.6/kernel/sched.c
@@ -183,18 +183,24 @@ struct cfs_rq {
 
 	s64 fair_clock;
 	u64 exec_clock;
-	s64 wait_runtime;
 	u64 sleeper_bonus;
 	unsigned long wait_runtime_overruns, wait_runtime_underruns;
 
+	u64 prev_update;
+	s64 time_norm_base, time_norm_inc, time_avg_min;
+	u64 run_start, run_end;
+	u64 run_end_next, run_end_curr;
+	s64 time_sum_max, time_sum_off;
+	unsigned long inc_shift, weight_sum;
+
 	struct rb_root tasks_timeline;
-	struct rb_node *rb_leftmost;
 	struct rb_node *rb_load_balance_curr;
-#ifdef CONFIG_FAIR_GROUP_SCHED
 	/* 'curr' points to currently running entity on this cfs_rq.
 	 * It is set to NULL otherwise (i.e when none are currently running).
 	 */
-	struct sched_entity *curr;
+	struct sched_entity *curr, *next;
+	struct sched_entity *rb_leftmost;
+#ifdef CONFIG_FAIR_GROUP_SCHED
 	struct rq *rq;	/* cpu runqueue to which this cfs_rq is attached */
 
 	/* leaf cfs_rqs are those that hold tasks (lowest schedulable entity in
@@ -231,12 +237,16 @@ struct rq {
 	 */
 	unsigned long nr_running;
 	#define CPU_LOAD_IDX_MAX 5
+#ifdef CONFIG_SMP
 	unsigned long cpu_load[CPU_LOAD_IDX_MAX];
+#endif
 	unsigned char idle_at_tick;
 #ifdef CONFIG_NO_HZ
 	unsigned char in_nohz_recently;
 #endif
+#ifdef CONFIG_SMP
 	struct load_stat ls;	/* capture load from *all* tasks on this cpu */
+#endif
 	unsigned long nr_load_updates;
 	u64 nr_switches;
 
@@ -636,13 +646,6 @@ static void resched_cpu(int cpu)
 	resched_task(cpu_curr(cpu));
 	spin_unlock_irqrestore(&rq->lock, flags);
 }
-#else
-static inline void resched_task(struct task_struct *p)
-{
-	assert_spin_locked(&task_rq(p)->lock);
-	set_tsk_need_resched(p);
-}
-#endif
 
 static u64 div64_likely32(u64 divident, unsigned long divisor)
 {
@@ -657,6 +660,13 @@ static u64 div64_likely32(u64 divident, 
 #endif
 }
 
+#else
+static inline void resched_task(struct task_struct *p)
+{
+	assert_spin_locked(&task_rq(p)->lock);
+	set_tsk_need_resched(p);
+}
+#endif
 #if BITS_PER_LONG == 32
 # define WMULT_CONST	(~0UL)
 #else
@@ -734,15 +744,33 @@ static void update_load_sub(struct load_
  * If a task goes up by ~10% and another task goes down by ~10% then
  * the relative distance between them is ~25%.)
  */
+#ifdef SANE_NICE_LEVEL
+static const int prio_to_weight[40] = {
+        3567, 3102, 2703, 2351, 2048, 1783, 1551, 1351, 1177, 1024,
+        892, 776, 676, 588, 512, 446, 388, 338, 294, 256,
+        223, 194, 169, 147, 128, 111, 97, 84, 74, 64,
+        56, 49, 42, 37, 32, 28, 24, 21, 18, 16
+};
+
+static const u32 prio_to_wmult[40] = {
+	294, 338, 388, 446, 512, 588, 676, 776, 891, 1024,
+	1176, 1351, 1552, 1783, 2048, 2353, 2702, 3104, 3566, 4096,
+	4705, 5405, 6208, 7132, 8192, 9410, 10809, 12417, 14263, 16384,
+	18820, 21619, 24834, 28526, 32768, 37641, 43238, 49667, 57052, 65536
+};
+
+static const u32 prio_to_wshift[40] = {
+	8, 8, 7, 7, 7, 7, 7, 6, 6, 6,
+	6, 6, 5, 5, 5, 5, 5, 4, 4, 4,
+	4, 4, 3, 3, 3, 3, 3, 2, 2, 2,
+	2, 2, 1, 1, 1, 1, 1, 0, 0, 0
+};
+#else
 static const int prio_to_weight[40] = {
- /* -20 */     88761,     71755,     56483,     46273,     36291,
- /* -15 */     29154,     23254,     18705,     14949,     11916,
- /* -10 */      9548,      7620,      6100,      4904,      3906,
- /*  -5 */      3121,      2501,      1991,      1586,      1277,
- /*   0 */      1024,       820,       655,       526,       423,
- /*   5 */       335,       272,       215,       172,       137,
- /*  10 */       110,        87,        70,        56,        45,
- /*  15 */        36,        29,        23,        18,        15,
+	95325, 74898, 61681, 49932, 38836, 31775, 24966, 20165, 16132, 12945,
+	10382, 8257, 6637, 5296, 4228, 3393, 2709, 2166, 1736, 1387,
+	1111, 888, 710, 568, 455, 364, 291, 233, 186, 149,
+	119, 95, 76, 61, 49, 39, 31, 25, 20, 16
 };
 
 /*
@@ -753,16 +781,20 @@ static const int prio_to_weight[40] = {
  * into multiplications:
  */
 static const u32 prio_to_wmult[40] = {
- /* -20 */     48388,     59856,     76040,     92818,    118348,
- /* -15 */    147320,    184698,    229616,    287308,    360437,
- /* -10 */    449829,    563644,    704093,    875809,   1099582,
- /*  -5 */   1376151,   1717300,   2157191,   2708050,   3363326,
- /*   0 */   4194304,   5237765,   6557202,   8165337,  10153587,
- /*   5 */  12820798,  15790321,  19976592,  24970740,  31350126,
- /*  10 */  39045157,  49367440,  61356676,  76695844,  95443717,
- /*  15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
+	11, 14, 17, 21, 27, 33, 42, 52, 65, 81,
+	101, 127, 158, 198, 248, 309, 387, 484, 604, 756,
+	944, 1181, 1476, 1845, 2306, 2882, 3603, 4504, 5629, 7037,
+	8796, 10995, 13744, 17180, 21475, 26844, 33554, 41943, 52429, 65536
 };
 
+static const u32 prio_to_wshift[40] = {
+	13, 12, 12, 12, 11, 11, 11, 10, 10, 10,
+	9, 9, 9, 8, 8, 8, 7, 7, 7, 6,
+	6, 6, 5, 5, 5, 5, 4, 4, 4, 3,
+	3, 3, 2, 2, 2, 1, 1, 1, 0, 0
+};
+#endif
+
 static void activate_task(struct rq *rq, struct task_struct *p, int wakeup);
 
 /*
@@ -784,7 +816,8 @@ static int balance_tasks(struct rq *this
 
 #include "sched_stats.h"
 #include "sched_rt.c"
-#include "sched_fair.c"
+//#include "sched_fair.c"
+#include "sched_norm.c"
 #include "sched_idletask.c"
 #ifdef CONFIG_SCHED_DEBUG
 # include "sched_debug.c"
@@ -792,6 +825,7 @@ static int balance_tasks(struct rq *this
 
 #define sched_class_highest (&rt_sched_class)
 
+#ifdef CONFIG_SMP
 static void __update_curr_load(struct rq *rq, struct load_stat *ls)
 {
 	if (rq->curr != rq->idle && ls->load.weight) {
@@ -843,6 +877,14 @@ static inline void dec_load(struct rq *r
 	update_curr_load(rq);
 	update_load_sub(&rq->ls.load, p->se.load.weight);
 }
+#else
+static inline void inc_load(struct rq *rq, const struct task_struct *p)
+{
+}
+static inline void dec_load(struct rq *rq, const struct task_struct *p)
+{
+}
+#endif
 
 static void inc_nr_running(struct task_struct *p, struct rq *rq)
 {
@@ -858,9 +900,6 @@ static void dec_nr_running(struct task_s
 
 static void set_load_weight(struct task_struct *p)
 {
-	task_rq(p)->cfs.wait_runtime -= p->se.wait_runtime;
-	p->se.wait_runtime = 0;
-
 	if (task_has_rt_policy(p)) {
 		p->se.load.weight = prio_to_weight[0] * 2;
 		p->se.load.inv_weight = prio_to_wmult[0] >> 1;
@@ -878,6 +917,8 @@ static void set_load_weight(struct task_
 
 	p->se.load.weight = prio_to_weight[p->static_prio - MAX_RT_PRIO];
 	p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO];
+	p->se.weight_shift = prio_to_wshift[p->static_prio - MAX_RT_PRIO];
+	p->se.req_weight_inv = p->se.load.inv_weight * (kclock_t)sysctl_sched_granularity;
 }
 
 static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup)
@@ -986,11 +1027,13 @@ inline int task_curr(const struct task_s
 	return cpu_curr(task_cpu(p)) == p;
 }
 
+#ifdef CONFIG_SMP
 /* Used instead of source_load when we know the type == 0 */
 unsigned long weighted_cpuload(const int cpu)
 {
 	return cpu_rq(cpu)->ls.load.weight;
 }
+#endif
 
 static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
 {
@@ -1004,27 +1047,6 @@ static inline void __set_task_cpu(struct
 
 void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
 {
-	int old_cpu = task_cpu(p);
-	struct rq *old_rq = cpu_rq(old_cpu), *new_rq = cpu_rq(new_cpu);
-	u64 clock_offset, fair_clock_offset;
-
-	clock_offset = old_rq->clock - new_rq->clock;
-	fair_clock_offset = old_rq->cfs.fair_clock - new_rq->cfs.fair_clock;
-
-	if (p->se.wait_start_fair)
-		p->se.wait_start_fair -= fair_clock_offset;
-	if (p->se.sleep_start_fair)
-		p->se.sleep_start_fair -= fair_clock_offset;
-
-#ifdef CONFIG_SCHEDSTATS
-	if (p->se.wait_start)
-		p->se.wait_start -= clock_offset;
-	if (p->se.sleep_start)
-		p->se.sleep_start -= clock_offset;
-	if (p->se.block_start)
-		p->se.block_start -= clock_offset;
-#endif
-
 	__set_task_cpu(p, new_cpu);
 }
 
@@ -1584,22 +1606,12 @@ int fastcall wake_up_state(struct task_s
  */
 static void __sched_fork(struct task_struct *p)
 {
-	p->se.wait_start_fair		= 0;
 	p->se.exec_start		= 0;
 	p->se.sum_exec_runtime		= 0;
-	p->se.prev_sum_exec_runtime	= 0;
-	p->se.delta_exec		= 0;
-	p->se.delta_fair_run		= 0;
-	p->se.delta_fair_sleep		= 0;
-	p->se.wait_runtime		= 0;
-	p->se.sleep_start_fair		= 0;
 
 #ifdef CONFIG_SCHEDSTATS
-	p->se.wait_start		= 0;
 	p->se.sum_wait_runtime		= 0;
 	p->se.sum_sleep_runtime		= 0;
-	p->se.sleep_start		= 0;
-	p->se.block_start		= 0;
 	p->se.sleep_max			= 0;
 	p->se.block_max			= 0;
 	p->se.exec_max			= 0;
@@ -1971,6 +1983,7 @@ unsigned long nr_active(void)
 	return running + uninterruptible;
 }
 
+#ifdef CONFIG_SMP
 /*
  * Update rq->cpu_load[] statistics. This function is usually called every
  * scheduler tick (TICK_NSEC).
@@ -2027,8 +2040,6 @@ do_avg:
 	}
 }
 
-#ifdef CONFIG_SMP
-
 /*
  * double_rq_lock - safely lock two runqueues
  *
@@ -3350,7 +3361,9 @@ void scheduler_tick(void)
 	if (unlikely(rq->clock < next_tick))
 		rq->clock = next_tick;
 	rq->tick_timestamp = rq->clock;
+#ifdef CONFIG_SMP
 	update_cpu_load(rq);
+#endif
 	if (curr != rq->idle) /* FIXME: needed? */
 		curr->sched_class->task_tick(rq, curr);
 	spin_unlock(&rq->lock);
@@ -4914,16 +4927,12 @@ static inline void sched_init_granularit
 	unsigned int factor = 1 + ilog2(num_online_cpus());
 	const unsigned long limit = 100000000;
 
-	sysctl_sched_min_granularity *= factor;
-	if (sysctl_sched_min_granularity > limit)
-		sysctl_sched_min_granularity = limit;
-
-	sysctl_sched_latency *= factor;
-	if (sysctl_sched_latency > limit)
-		sysctl_sched_latency = limit;
+	sysctl_sched_granularity *= factor;
+	if (sysctl_sched_granularity > limit)
+		sysctl_sched_granularity = limit;
 
-	sysctl_sched_runtime_limit = sysctl_sched_latency;
-	sysctl_sched_wakeup_granularity = sysctl_sched_min_granularity / 2;
+	sysctl_sched_wakeup_granularity = sysctl_sched_granularity / 2;
+	sysctl_sched_batch_wakeup_granularity = sysctl_sched_granularity;
 }
 
 #ifdef CONFIG_SMP
@@ -6516,6 +6525,9 @@ static inline void init_cfs_rq(struct cf
 {
 	cfs_rq->tasks_timeline = RB_ROOT;
 	cfs_rq->fair_clock = 1;
+	cfs_rq->time_sum_max = 0;
+	cfs_rq->time_norm_inc = MAX_TIMESUM >> 1;
+	cfs_rq->inc_shift = 0;
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	cfs_rq->rq = rq;
 #endif
@@ -6523,7 +6535,6 @@ static inline void init_cfs_rq(struct cf
 
 void __init sched_init(void)
 {
-	u64 now = sched_clock();
 	int highest_cpu = 0;
 	int i, j;
 
@@ -6548,12 +6559,11 @@ void __init sched_init(void)
 		INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
 		list_add(&rq->cfs.leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);
 #endif
-		rq->ls.load_update_last = now;
-		rq->ls.load_update_start = now;
+#ifdef CONFIG_SMP
+		rq->ls.load_update_last = rq->ls.load_update_start = sched_clock();
 
 		for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
 			rq->cpu_load[j] = 0;
-#ifdef CONFIG_SMP
 		rq->sd = NULL;
 		rq->active_balance = 0;
 		rq->next_balance = jiffies;
@@ -6643,16 +6653,7 @@ void normalize_rt_tasks(void)
 
 	read_lock_irq(&tasklist_lock);
 	do_each_thread(g, p) {
-		p->se.fair_key			= 0;
-		p->se.wait_runtime		= 0;
 		p->se.exec_start		= 0;
-		p->se.wait_start_fair		= 0;
-		p->se.sleep_start_fair		= 0;
-#ifdef CONFIG_SCHEDSTATS
-		p->se.wait_start		= 0;
-		p->se.sleep_start		= 0;
-		p->se.block_start		= 0;
-#endif
 		task_rq(p)->cfs.fair_clock	= 0;
 		task_rq(p)->clock		= 0;
 
Index: linux-2.6/kernel/sched_debug.c
===================================================================
--- linux-2.6.orig/kernel/sched_debug.c
+++ linux-2.6/kernel/sched_debug.c
@@ -38,9 +38,9 @@ print_task(struct seq_file *m, struct rq
 
 	SEQ_printf(m, "%15s %5d %15Ld %13Ld %13Ld %9Ld %5d ",
 		p->comm, p->pid,
-		(long long)p->se.fair_key,
-		(long long)(p->se.fair_key - rq->cfs.fair_clock),
-		(long long)p->se.wait_runtime,
+		(long long)p->se.time_norm >> 16,
+		(long long)((p->se.time_key >> 16) - rq->cfs.fair_clock),
+		((long long)((rq->cfs.fair_clock << 16) - p->se.time_norm) * p->se.load.weight) >> 20,
 		(long long)(p->nvcsw + p->nivcsw),
 		p->prio);
 #ifdef CONFIG_SCHEDSTATS
@@ -73,6 +73,7 @@ static void print_rq(struct seq_file *m,
 
 	read_lock_irq(&tasklist_lock);
 
+	rq->cfs.fair_clock = get_time_avg(&rq->cfs) >> 16;
 	do_each_thread(g, p) {
 		if (!p->se.on_rq || task_cpu(p) != rq_cpu)
 			continue;
@@ -93,10 +94,10 @@ print_cfs_rq_runtime_sum(struct seq_file
 	struct rq *rq = &per_cpu(runqueues, cpu);
 
 	spin_lock_irqsave(&rq->lock, flags);
-	curr = first_fair(cfs_rq);
+	curr = cfs_rq->rb_leftmost ? &cfs_rq->rb_leftmost->run_node : NULL;
 	while (curr) {
 		p = rb_entry(curr, struct task_struct, se.run_node);
-		wait_runtime_rq_sum += p->se.wait_runtime;
+		//wait_runtime_rq_sum += p->se.wait_runtime;
 
 		curr = rb_next(curr);
 	}
@@ -110,12 +111,12 @@ void print_cfs_rq(struct seq_file *m, in
 {
 	SEQ_printf(m, "\ncfs_rq\n");
 
+	cfs_rq->fair_clock = get_time_avg(cfs_rq) >> 16;
 #define P(x) \
 	SEQ_printf(m, "  .%-30s: %Ld\n", #x, (long long)(cfs_rq->x))
 
 	P(fair_clock);
 	P(exec_clock);
-	P(wait_runtime);
 	P(wait_runtime_overruns);
 	P(wait_runtime_underruns);
 	P(sleeper_bonus);
@@ -143,10 +144,12 @@ static void print_cpu(struct seq_file *m
 	SEQ_printf(m, "  .%-30s: %Ld\n", #x, (long long)(rq->x))
 
 	P(nr_running);
+#ifdef CONFIG_SMP
 	SEQ_printf(m, "  .%-30s: %lu\n", "load",
 		   rq->ls.load.weight);
 	P(ls.delta_fair);
 	P(ls.delta_exec);
+#endif
 	P(nr_switches);
 	P(nr_load_updates);
 	P(nr_uninterruptible);
@@ -160,11 +163,13 @@ static void print_cpu(struct seq_file *m
 	P(clock_overflows);
 	P(clock_deep_idle_events);
 	P(clock_max_delta);
+#ifdef CONFIG_SMP
 	P(cpu_load[0]);
 	P(cpu_load[1]);
 	P(cpu_load[2]);
 	P(cpu_load[3]);
 	P(cpu_load[4]);
+#endif
 #undef P
 
 	print_cfs_stats(m, cpu);
@@ -241,16 +246,7 @@ void proc_sched_show_task(struct task_st
 #define P(F) \
 	SEQ_printf(m, "%-25s:%20Ld\n", #F, (long long)p->F)
 
-	P(se.wait_runtime);
-	P(se.wait_start_fair);
-	P(se.exec_start);
-	P(se.sleep_start_fair);
-	P(se.sum_exec_runtime);
-
 #ifdef CONFIG_SCHEDSTATS
-	P(se.wait_start);
-	P(se.sleep_start);
-	P(se.block_start);
 	P(se.sleep_max);
 	P(se.block_max);
 	P(se.exec_max);
Index: linux-2.6/kernel/sched_norm.c
===================================================================
--- /dev/null
+++ linux-2.6/kernel/sched_norm.c
@@ -0,0 +1,870 @@
+/*
+ * Completely Fair Scheduling (CFS) Class (SCHED_NORMAL/SCHED_BATCH)
+ *
+ *  Copyright (C) 2007 Red Hat, Inc., Ingo Molnar <mingo@...hat.com>
+ *
+ *  Interactivity improvements by Mike Galbraith
+ *  (C) 2007 Mike Galbraith <efault@....de>
+ *
+ *  Various enhancements by Dmitry Adamushko.
+ *  (C) 2007 Dmitry Adamushko <dmitry.adamushko@...il.com>
+ *
+ *  Group scheduling enhancements by Srivatsa Vaddagiri
+ *  Copyright IBM Corporation, 2007
+ *  Author: Srivatsa Vaddagiri <vatsa@...ux.vnet.ibm.com>
+ *
+ *  Scaled math optimizations by Thomas Gleixner
+ *  Copyright (C) 2007, Thomas Gleixner <tglx@...utronix.de>
+ *
+ *  Really fair scheduling
+ *  Copyright (C) 2007, Roman Zippel <zippel@...ux-m68k.org>
+ */
+
+typedef s64 kclock_t;
+
+static inline kclock_t kclock_max(kclock_t x, kclock_t y)
+{
+	return (kclock_t)(x - y) > 0 ? x : y;
+}
+static inline kclock_t kclock_min(kclock_t x, kclock_t y)
+{
+	return (kclock_t)(x - y) < 0 ? x : y;
+}
+
+#define MSHIFT		16
+#define MAX_TIMESUM	((kclock_t)1 << (30 + MSHIFT))
+
+/*
+ * Preemption granularity:
+ * (default: 10 msec, units: nanoseconds)
+ *
+ * NOTE: this granularity value is not the same as the concept of
+ * 'timeslice length' - timeslices in CFS will typically be somewhat
+ * larger than this value. (to see the precise effective timeslice
+ * length of your workload, run vmstat and monitor the context-switches
+ * field)
+ *
+ * On SMP systems the value of this is multiplied by the log2 of the
+ * number of CPUs. (i.e. factor 2x on 2-way systems, 3x on 4-way
+ * systems, 4x on 8-way systems, 5x on 16-way systems, etc.)
+ */
+unsigned int sysctl_sched_granularity __read_mostly = 10000000ULL;
+
+/*
+ * SCHED_BATCH wake-up granularity.
+ * (default: 10 msec, units: nanoseconds)
+ *
+ * This option delays the preemption effects of decoupled workloads
+ * and reduces their over-scheduling. Synchronous workloads will still
+ * have immediate wakeup/sleep latencies.
+ */
+unsigned int sysctl_sched_batch_wakeup_granularity __read_mostly =
+							10000000ULL;
+
+/*
+ * SCHED_OTHER wake-up granularity.
+ * (default: 2 msec, units: nanoseconds)
+ *
+ * This option delays the preemption effects of decoupled workloads
+ * and reduces their over-scheduling. Synchronous workloads will still
+ * have immediate wakeup/sleep latencies.
+ */
+unsigned int sysctl_sched_wakeup_granularity __read_mostly = 2000000ULL;
+
+unsigned int sysctl_sched_stat_granularity __read_mostly;
+
+/*
+ * Debugging: various feature bits
+ */
+enum {
+	SCHED_FEAT_FAIR_SLEEPERS	= 1,
+	SCHED_FEAT_SLEEPER_AVG		= 2,
+	SCHED_FEAT_SLEEPER_LOAD_AVG	= 4,
+	SCHED_FEAT_PRECISE_CPU_LOAD	= 8,
+	SCHED_FEAT_START_DEBIT		= 16,
+	SCHED_FEAT_SKIP_INITIAL		= 32,
+};
+
+unsigned int sysctl_sched_features __read_mostly =
+		SCHED_FEAT_FAIR_SLEEPERS	*1 |
+		SCHED_FEAT_SLEEPER_AVG		*1 |
+		SCHED_FEAT_SLEEPER_LOAD_AVG	*1 |
+		SCHED_FEAT_PRECISE_CPU_LOAD	*1 |
+		SCHED_FEAT_START_DEBIT		*1 |
+		SCHED_FEAT_SKIP_INITIAL		*0;
+
+extern struct sched_class fair_sched_class;
+
+static kclock_t get_time_avg(struct cfs_rq *cfs_rq)
+{
+	kclock_t avg;
+
+	avg = cfs_rq->time_norm_base;
+	if (cfs_rq->weight_sum)
+		avg += (kclock_t)((int)(cfs_rq->time_sum_off >> MSHIFT) / cfs_rq->weight_sum) << MSHIFT;
+
+	return avg;
+}
+
+static void normalize_time_sum_off(struct cfs_rq *cfs_rq)
+{
+	if (unlikely(cfs_rq->time_sum_off >= cfs_rq->time_sum_max)) {
+		cfs_rq->time_sum_off -= cfs_rq->time_sum_max;
+		cfs_rq->time_norm_base += cfs_rq->time_norm_inc;
+	} else if (unlikely(cfs_rq->time_sum_off < 0)) {
+		cfs_rq->time_sum_off += cfs_rq->time_sum_max;
+		cfs_rq->time_norm_base -= cfs_rq->time_norm_inc;
+	}
+}
+
+/* When determining the end runtime of the current task, two main cases have to
+ * be considered relative to the next task:
+ * - if the next task has a higher priority, it will shorten the runtime of the
+ *   current task by using run_end_next
+ * - if the next task has a lower priority, it may have gotten already enough
+ *   time, so we keep the current task running until it's close enough again
+ */
+static void update_run_end(struct cfs_rq *cfs_rq)
+{
+	kclock_t run_end;
+
+	run_end = cfs_rq->run_end_curr;
+	if (cfs_rq->rb_leftmost)
+		run_end = kclock_min(cfs_rq->run_end_next,
+				     kclock_max(cfs_rq->rb_leftmost->time_norm, run_end));
+	cfs_rq->run_end = run_end;
+}
+
+/**************************************************************
+ * CFS operations on generic schedulable entities:
+ */
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+
+/* cpu runqueue to which this cfs_rq is attached */
+static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
+{
+	return cfs_rq->rq;
+}
+
+/* An entity is a task if it doesn't "own" a runqueue */
+#define entity_is_task(se)	(!se->my_q)
+
+#else	/* CONFIG_FAIR_GROUP_SCHED */
+
+static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
+{
+	return container_of(cfs_rq, struct rq, cfs);
+}
+
+#define entity_is_task(se)	1
+
+#endif	/* CONFIG_FAIR_GROUP_SCHED */
+
+static inline struct task_struct *task_of(struct sched_entity *se)
+{
+	return container_of(se, struct task_struct, se);
+}
+
+
+/**************************************************************
+ * Scheduling class tree data structure manipulation methods:
+ */
+
+/*
+ * Enqueue an entity into the rb-tree:
+ */
+static void
+__enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+	struct rb_node **link = &cfs_rq->tasks_timeline.rb_node;
+	struct rb_node *parent = NULL;
+	struct sched_entity *entry;
+	kclock_t key;
+	int leftmost = 1;
+
+	se->time_key = key = se->time_norm + (se->req_weight_inv >> 1);
+	/*
+	 * Find the right place in the rbtree:
+	 */
+	while (*link) {
+		parent = *link;
+		entry = rb_entry(parent, struct sched_entity, run_node);
+		/*
+		 * We dont care about collisions. Nodes with
+		 * the same key stay together.
+		 */
+		if (key - entry->time_key < 0) {
+			link = &parent->rb_left;
+		} else {
+			link = &parent->rb_right;
+			leftmost = 0;
+		}
+	}
+
+	/*
+	 * Maintain a cache of leftmost tree entries (it is frequently
+	 * used):
+	 */
+	if (leftmost) {
+		cfs_rq->rb_leftmost = se;
+		cfs_rq->run_end_next = se->time_norm + se->req_weight_inv;
+	}
+
+	rb_link_node(&se->run_node, parent, link);
+	rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline);
+	update_load_add(&cfs_rq->load, se->load.weight);
+	se->queued = 1;
+}
+
+static void
+__dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+	if (cfs_rq->rb_leftmost == se) {
+		struct sched_entity *next = NULL;
+		struct rb_node *next_node = rb_next(&se->run_node);
+		if (next_node) {
+			next = rb_entry(next_node, struct sched_entity, run_node);
+			cfs_rq->run_end_next = next->time_norm + next->req_weight_inv;
+		}
+		cfs_rq->rb_leftmost = next;
+	}
+	rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
+	update_load_sub(&cfs_rq->load, se->load.weight);
+	se->queued = 0;
+}
+
+#ifdef CONFIG_SCHED_DEBUG
+static void verify_queue(struct cfs_rq *cfs_rq, int inc_curr, struct sched_entity *se2)
+{
+	struct rb_node *node;
+	struct sched_entity *se;
+	kclock_t avg, sum = 0;
+	static int busy = 0, cnt = 10;
+
+#define RANGE_CHECK(se, se2) ({							\
+	if (cnt > 0 && abs(avg - se->time_norm) > (se->req_weight_inv << 2)) {	\
+		cnt--;								\
+		printk("%ld,%p(%u,%lx,%Lx): %Lx,%Lx,%p(%u,%lx,%Lx),%d\n",	\
+			cfs_rq->nr_running, se, task_of(se)->pid,		\
+			se->load.inv_weight, se->req_weight_inv,		\
+			se->time_norm, avg, se2, task_of(se2)->pid,		\
+			se2->load.inv_weight, se2->req_weight_inv, inc_curr);	\
+		WARN_ON(1);							\
+	} })
+
+	if (busy)
+		return;
+	busy = 1;
+
+	avg = get_time_avg(cfs_rq);
+	se = cfs_rq->curr;
+	if (inc_curr && se) {
+		sum = (se->time_norm - cfs_rq->time_norm_base) << se->weight_shift;
+		RANGE_CHECK(se, se2);
+	}
+	node = rb_first(&cfs_rq->tasks_timeline);
+	WARN_ON(node && cfs_rq->rb_leftmost != rb_entry(node, struct sched_entity, run_node));
+	while (node) {
+		se = rb_entry(node, struct sched_entity, run_node);
+		sum += (se->time_norm - cfs_rq->time_norm_base) << se->weight_shift;
+		RANGE_CHECK(se, se2);
+		node = rb_next(node);
+	}
+	if (sum != cfs_rq->time_sum_off) {
+		kclock_t oldsum = cfs_rq->time_sum_off;
+		cfs_rq->time_sum_off = sum;
+		printk("%ld:%Lx,%Lx,%p,%p,%d\n", cfs_rq->nr_running, sum, oldsum, cfs_rq->curr, se2, inc_curr);
+		WARN_ON(1);
+	}
+	busy = 0;
+}
+#else
+#define verify_queue(q,c,s)	((void)0)
+#endif
+
+/**************************************************************
+ * Scheduling class statistics methods:
+ */
+
+/*
+ * Update the current task's runtime statistics. Skip current tasks that
+ * are not in our scheduling class.
+ */
+static void update_curr(struct cfs_rq *cfs_rq)
+{
+	struct sched_entity *curr = cfs_rq->curr;
+	kclock_t now = rq_of(cfs_rq)->clock;
+	unsigned long delta_exec;
+	kclock_t delta_norm;
+
+	if (unlikely(!curr))
+		return;
+
+	delta_exec = now - cfs_rq->prev_update;
+	if (!delta_exec)
+		return;
+	cfs_rq->prev_update = now;
+
+	curr->sum_exec_runtime += delta_exec;
+	cfs_rq->exec_clock += delta_exec;
+
+	delta_norm = (kclock_t)delta_exec * curr->load.inv_weight;
+	curr->time_norm += delta_norm;
+	if (unlikely((kclock_t)(curr->time_norm - cfs_rq->run_end) > curr->req_weight_inv)) {
+		if (cfs_rq->rb_leftmost) {
+			kclock_t end = cfs_rq->run_end + curr->req_weight_inv;
+			cfs_rq->wait_runtime_overruns++;
+			delta_norm -= curr->time_norm - end;
+			curr->time_norm = end;
+		} else
+			cfs_rq->run_end = cfs_rq->run_end_curr = curr->time_norm + curr->req_weight_inv;
+	}
+	cfs_rq->time_sum_off += delta_norm << curr->weight_shift;
+
+	verify_queue(cfs_rq, 4, curr);
+}
+
+static void
+enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+	verify_queue(cfs_rq, cfs_rq->curr != se, se);
+	cfs_rq->time_avg_min = kclock_max(cfs_rq->time_avg_min, get_time_avg(cfs_rq));
+	se->time_norm = kclock_max(cfs_rq->time_avg_min - se->req_weight_inv, se->time_norm);
+
+	cfs_rq->nr_running++;
+	cfs_rq->weight_sum += 1 << se->weight_shift;
+	if (cfs_rq->inc_shift < se->weight_shift) {
+		cfs_rq->time_norm_inc >>= se->weight_shift - cfs_rq->inc_shift;
+		cfs_rq->time_sum_max >>= se->weight_shift - cfs_rq->inc_shift;
+		cfs_rq->inc_shift = se->weight_shift;
+	}
+	cfs_rq->time_sum_max += cfs_rq->time_norm_inc << se->weight_shift;
+	while (cfs_rq->time_sum_max >= MAX_TIMESUM) {
+		cfs_rq->time_norm_inc >>= 1;
+		cfs_rq->time_sum_max >>= 1;
+		cfs_rq->inc_shift++;
+	}
+	cfs_rq->time_sum_off += (se->time_norm - cfs_rq->time_norm_base) << se->weight_shift;
+	normalize_time_sum_off(cfs_rq);
+
+	if (&rq_of(cfs_rq)->curr->se == se) {
+		cfs_rq->curr = se;
+		cfs_rq->run_end_curr = cfs_rq->run_start + se->req_weight_inv;
+	}
+	if (cfs_rq->curr != se)
+		__enqueue_entity(cfs_rq, se);
+	update_run_end(cfs_rq);
+	verify_queue(cfs_rq, 2, se);
+}
+
+static void
+dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep)
+{
+	verify_queue(cfs_rq, 3, se);
+
+	cfs_rq->weight_sum -= 1 << se->weight_shift;
+	if (cfs_rq->weight_sum) {
+		cfs_rq->time_sum_max -= cfs_rq->time_norm_inc << se->weight_shift;
+		while (cfs_rq->time_sum_max < (MAX_TIMESUM >> 1)) {
+			cfs_rq->time_norm_inc <<= 1;
+			cfs_rq->time_sum_max <<= 1;
+			cfs_rq->inc_shift--;
+		}
+
+		cfs_rq->time_sum_off -= (se->time_norm - cfs_rq->time_norm_base) << se->weight_shift;
+		normalize_time_sum_off(cfs_rq);
+	} else {
+		cfs_rq->time_avg_min = kclock_max(cfs_rq->time_avg_min, se->time_norm);
+		cfs_rq->time_sum_off -= (se->time_norm - cfs_rq->time_norm_base) << se->weight_shift;
+		BUG_ON(cfs_rq->time_sum_off);
+		cfs_rq->time_norm_inc = MAX_TIMESUM >> 1;
+		cfs_rq->time_sum_max = 0;
+		cfs_rq->inc_shift = 0;
+	}
+
+
+	cfs_rq->nr_running--;
+	if (se->queued)
+		__dequeue_entity(cfs_rq, se);
+	update_run_end(cfs_rq);
+	if (cfs_rq->curr == se)
+		cfs_rq->curr = NULL;
+	if (cfs_rq->next == se)
+		cfs_rq->next = NULL;
+	verify_queue(cfs_rq, cfs_rq->curr != se, se);
+}
+
+static inline void
+set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+	cfs_rq->prev_update = rq_of(cfs_rq)->clock;
+	cfs_rq->run_start = se->time_norm;
+	cfs_rq->run_end_curr = cfs_rq->run_start + se->req_weight_inv;
+	cfs_rq->curr = se;
+}
+
+static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
+{
+	struct sched_entity *se = cfs_rq->next ? cfs_rq->next : cfs_rq->rb_leftmost;
+
+	cfs_rq->next = NULL;
+	if (se->queued)
+		__dequeue_entity(cfs_rq, se);
+	set_next_entity(cfs_rq, se);
+	update_run_end(cfs_rq);
+
+	return se;
+}
+
+static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
+{
+	update_curr(cfs_rq);
+	if (prev->on_rq)
+		__enqueue_entity(cfs_rq, prev);
+	cfs_rq->curr = NULL;
+}
+
+static void entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
+{
+	update_curr(cfs_rq);
+	normalize_time_sum_off(cfs_rq);
+
+#ifdef CONFIG_SCHED_DEBUG
+{
+	static int cnt = 10;
+	kclock_t avg = get_time_avg(cfs_rq);
+	int inc_curr = 5;
+	RANGE_CHECK(curr, curr);
+}
+#endif
+
+	/*
+	 * Reschedule if another task tops the current one.
+	 */
+	if ((kclock_t)(curr->time_norm - cfs_rq->run_end) >= 0) {
+		if (cfs_rq->rb_leftmost) {
+			cfs_rq->next = cfs_rq->rb_leftmost;
+			resched_task(rq_of(cfs_rq)->curr);
+		} else
+			cfs_rq->run_end = cfs_rq->run_end_curr = curr->time_norm + curr->req_weight_inv;
+	}
+}
+
+/**************************************************
+ * CFS operations on tasks:
+ */
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+
+/* Walk up scheduling entities hierarchy */
+#define for_each_sched_entity(se) \
+		for (; se; se = se->parent)
+
+static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
+{
+	return p->se.cfs_rq;
+}
+
+/* runqueue on which this entity is (to be) queued */
+static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
+{
+	return se->cfs_rq;
+}
+
+/* runqueue "owned" by this group */
+static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
+{
+	return grp->my_q;
+}
+
+/* Given a group's cfs_rq on one cpu, return its corresponding cfs_rq on
+ * another cpu ('this_cpu')
+ */
+static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
+{
+	/* A later patch will take group into account */
+	return &cpu_rq(this_cpu)->cfs;
+}
+
+/* Iterate thr' all leaf cfs_rq's on a runqueue */
+#define for_each_leaf_cfs_rq(rq, cfs_rq) \
+	list_for_each_entry(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
+
+/* Do the two (enqueued) tasks belong to the same group ? */
+static inline int is_same_group(struct task_struct *curr, struct task_struct *p)
+{
+	if (curr->se.cfs_rq == p->se.cfs_rq)
+		return 1;
+
+	return 0;
+}
+
+#else	/* CONFIG_FAIR_GROUP_SCHED */
+
+#define for_each_sched_entity(se) \
+		for (; se; se = NULL)
+
+static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
+{
+	return &task_rq(p)->cfs;
+}
+
+static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
+{
+	struct task_struct *p = task_of(se);
+	struct rq *rq = task_rq(p);
+
+	return &rq->cfs;
+}
+
+/* runqueue "owned" by this group */
+static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
+{
+	return NULL;
+}
+
+static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
+{
+	return &cpu_rq(this_cpu)->cfs;
+}
+
+#define for_each_leaf_cfs_rq(rq, cfs_rq) \
+		for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
+
+static inline int is_same_group(struct task_struct *curr, struct task_struct *p)
+{
+	return 1;
+}
+
+#endif	/* CONFIG_FAIR_GROUP_SCHED */
+
+/*
+ * The enqueue_task method is called before nr_running is
+ * increased. Here we update the fair scheduling stats and
+ * then put the task into the rbtree:
+ */
+static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup)
+{
+	struct cfs_rq *cfs_rq;
+	struct sched_entity *se = &p->se;
+
+	for_each_sched_entity(se) {
+		if (se->on_rq)
+			break;
+		cfs_rq = cfs_rq_of(se);
+		enqueue_entity(cfs_rq, se);
+	}
+}
+
+/*
+ * The dequeue_task method is called before nr_running is
+ * decreased. We remove the task from the rbtree and
+ * update the fair scheduling stats:
+ */
+static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep)
+{
+	struct cfs_rq *cfs_rq;
+	struct sched_entity *se = &p->se;
+
+	for_each_sched_entity(se) {
+		cfs_rq = cfs_rq_of(se);
+		dequeue_entity(cfs_rq, se, sleep);
+		/* Don't dequeue parent if it has other entities besides us */
+		if (cfs_rq->weight_sum)
+			break;
+	}
+}
+
+/*
+ * sched_yield() support is very simple - we dequeue and enqueue
+ */
+static void yield_task_fair(struct rq *rq, struct task_struct *p)
+{
+	struct cfs_rq *cfs_rq = task_cfs_rq(p);
+	struct sched_entity *next;
+	__update_rq_clock(rq);
+
+	update_curr(cfs_rq);
+	next = cfs_rq->rb_leftmost;
+	if (next && (kclock_t)(p->se.time_norm - next->time_norm) > 0) {
+		cfs_rq->next = next;
+		return;
+	}
+	cfs_rq->next = &p->se;
+}
+
+/*
+ * Preempt the current task with a newly woken task if needed:
+ */
+static void check_preempt_curr_fair(struct rq *rq, struct task_struct *p)
+{
+	struct task_struct *curr = rq->curr;
+	struct cfs_rq *cfs_rq = task_cfs_rq(curr);
+	unsigned long gran;
+	kclock_t gran_norm;
+
+	if (!cfs_rq->curr || unlikely(rt_prio(p->prio))) {
+		resched_task(curr);
+		return;
+	}
+
+	if (!is_same_group(curr, p) || cfs_rq->rb_leftmost != &p->se)
+		return;
+
+	update_curr(cfs_rq);
+
+	gran = sysctl_sched_wakeup_granularity;
+	/*
+	 * Batch tasks prefer throughput over latency:
+	 */
+	if (unlikely(p->policy == SCHED_BATCH))
+		gran = sysctl_sched_batch_wakeup_granularity;
+	gran_norm = cfs_rq->curr->load.inv_weight * (kclock_t)gran;
+
+	if (curr->se.time_norm - cfs_rq->run_start >= gran_norm &&
+	    curr->se.time_norm - p->se.time_norm >= gran_norm) {
+		cfs_rq->next = &p->se;
+		resched_task(curr);
+	}
+}
+
+static struct task_struct *pick_next_task_fair(struct rq *rq)
+{
+	struct cfs_rq *cfs_rq = &rq->cfs;
+	struct sched_entity *se;
+
+	if (unlikely(!cfs_rq->nr_running))
+		return NULL;
+
+	if (cfs_rq->nr_running == 1 && cfs_rq->curr)
+		return task_of(cfs_rq->curr);
+
+	do {
+		se = pick_next_entity(cfs_rq);
+		cfs_rq = group_cfs_rq(se);
+	} while (cfs_rq);
+
+	return task_of(se);
+}
+
+/*
+ * Account for a descheduled task:
+ */
+static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
+{
+	struct sched_entity *se = &prev->se;
+	struct cfs_rq *cfs_rq;
+
+	for_each_sched_entity(se) {
+		cfs_rq = cfs_rq_of(se);
+		put_prev_entity(cfs_rq, se);
+	}
+}
+
+/**************************************************
+ * Fair scheduling class load-balancing methods:
+ */
+
+/*
+ * Load-balancing iterator. Note: while the runqueue stays locked
+ * during the whole iteration, the current task might be
+ * dequeued so the iterator has to be dequeue-safe. Here we
+ * achieve that by always pre-iterating before returning
+ * the current task:
+ */
+static inline struct task_struct *
+__load_balance_iterator(struct cfs_rq *cfs_rq)
+{
+	struct task_struct *p;
+	struct rb_node *curr;
+
+	curr = cfs_rq->rb_load_balance_curr;
+	if (!curr)
+		return NULL;
+
+	p = rb_entry(curr, struct task_struct, se.run_node);
+	cfs_rq->rb_load_balance_curr = rb_next(curr);
+
+	return p;
+}
+
+static struct task_struct *load_balance_start_fair(void *arg)
+{
+	struct cfs_rq *cfs_rq = arg;
+
+	cfs_rq->rb_load_balance_curr = cfs_rq->rb_leftmost ?
+		&cfs_rq->rb_leftmost->run_node : NULL;
+	if (cfs_rq->curr)
+		return rb_entry(&cfs_rq->curr->run_node, struct task_struct, se.run_node);
+
+	return __load_balance_iterator(cfs_rq);
+}
+
+static struct task_struct *load_balance_next_fair(void *arg)
+{
+	struct cfs_rq *cfs_rq = arg;
+
+	return __load_balance_iterator(cfs_rq);
+}
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+static int cfs_rq_best_prio(struct cfs_rq *cfs_rq)
+{
+	struct sched_entity *curr;
+	struct task_struct *p;
+
+	if (!cfs_rq->nr_running)
+		return MAX_PRIO;
+
+	curr = cfs_rq->rb_leftmost;
+	p = task_of(curr);
+
+	return p->prio;
+}
+#endif
+
+static unsigned long
+load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
+		  unsigned long max_nr_move, unsigned long max_load_move,
+		  struct sched_domain *sd, enum cpu_idle_type idle,
+		  int *all_pinned, int *this_best_prio)
+{
+	struct cfs_rq *busy_cfs_rq;
+	unsigned long load_moved, total_nr_moved = 0, nr_moved;
+	long rem_load_move = max_load_move;
+	struct rq_iterator cfs_rq_iterator;
+
+	cfs_rq_iterator.start = load_balance_start_fair;
+	cfs_rq_iterator.next = load_balance_next_fair;
+
+	for_each_leaf_cfs_rq(busiest, busy_cfs_rq) {
+#ifdef CONFIG_FAIR_GROUP_SCHED
+		struct cfs_rq *this_cfs_rq;
+		long imbalance;
+		unsigned long maxload;
+
+		this_cfs_rq = cpu_cfs_rq(busy_cfs_rq, this_cpu);
+
+		imbalance = busy_cfs_rq->load.weight - this_cfs_rq->load.weight;
+		/* Don't pull if this_cfs_rq has more load than busy_cfs_rq */
+		if (imbalance <= 0)
+			continue;
+
+		/* Don't pull more than imbalance/2 */
+		imbalance /= 2;
+		maxload = min(rem_load_move, imbalance);
+
+		*this_best_prio = cfs_rq_best_prio(this_cfs_rq);
+#else
+# define maxload rem_load_move
+#endif
+		/* pass busy_cfs_rq argument into
+		 * load_balance_[start|next]_fair iterators
+		 */
+		cfs_rq_iterator.arg = busy_cfs_rq;
+		nr_moved = balance_tasks(this_rq, this_cpu, busiest,
+				max_nr_move, maxload, sd, idle, all_pinned,
+				&load_moved, this_best_prio, &cfs_rq_iterator);
+
+		total_nr_moved += nr_moved;
+		max_nr_move -= nr_moved;
+		rem_load_move -= load_moved;
+
+		if (max_nr_move <= 0 || rem_load_move <= 0)
+			break;
+	}
+
+	return max_load_move - rem_load_move;
+}
+
+/*
+ * scheduler tick hitting a task of our scheduling class:
+ */
+static void task_tick_fair(struct rq *rq, struct task_struct *curr)
+{
+	struct cfs_rq *cfs_rq;
+	struct sched_entity *se = &curr->se;
+
+	for_each_sched_entity(se) {
+		cfs_rq = cfs_rq_of(se);
+		entity_tick(cfs_rq, se);
+	}
+}
+
+/*
+ * Share the fairness runtime between parent and child, thus the
+ * total amount of pressure for CPU stays equal - new tasks
+ * get a chance to run but frequent forkers are not allowed to
+ * monopolize the CPU. Note: the parent runqueue is locked,
+ * the child is not running yet.
+ */
+static void task_new_fair(struct rq *rq, struct task_struct *p)
+{
+	struct cfs_rq *cfs_rq = task_cfs_rq(p);
+	struct sched_entity *se = &p->se;
+	kclock_t time;
+
+	sched_info_queued(p);
+
+	time = (rq->curr->se.time_norm - get_time_avg(cfs_rq)) >> 1;
+	cfs_rq->time_sum_off -= (time << rq->curr->se.weight_shift);
+	normalize_time_sum_off(cfs_rq);
+	rq->curr->se.time_norm -= time;
+	se->time_norm = rq->curr->se.time_norm;
+
+	enqueue_entity(cfs_rq, se);
+	p->se.on_rq = 1;
+
+	cfs_rq->next = se;
+	resched_task(rq->curr);
+}
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+/* Account for a task changing its policy or group.
+ *
+ * This routine is mostly called to set cfs_rq->curr field when a task
+ * migrates between groups/classes.
+ */
+static void set_curr_task_fair(struct rq *rq)
+{
+	struct sched_entity *se = &rq->curr.se;
+
+	for_each_sched_entity(se)
+		set_next_entity(cfs_rq_of(se), se);
+}
+#else
+static void set_curr_task_fair(struct rq *rq)
+{
+}
+#endif
+
+/*
+ * All the scheduling class methods:
+ */
+struct sched_class fair_sched_class __read_mostly = {
+	.enqueue_task		= enqueue_task_fair,
+	.dequeue_task		= dequeue_task_fair,
+	.yield_task		= yield_task_fair,
+
+	.check_preempt_curr	= check_preempt_curr_fair,
+
+	.pick_next_task		= pick_next_task_fair,
+	.put_prev_task		= put_prev_task_fair,
+
+	.load_balance		= load_balance_fair,
+
+	.set_curr_task          = set_curr_task_fair,
+	.task_tick		= task_tick_fair,
+	.task_new		= task_new_fair,
+};
+
+#ifdef CONFIG_SCHED_DEBUG
+static void print_cfs_stats(struct seq_file *m, int cpu)
+{
+	struct cfs_rq *cfs_rq;
+
+	for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq)
+		print_cfs_rq(m, cpu, cfs_rq);
+}
+#endif
Index: linux-2.6/kernel/sysctl.c
===================================================================
--- linux-2.6.orig/kernel/sysctl.c
+++ linux-2.6/kernel/sysctl.c
@@ -211,30 +211,16 @@ static ctl_table root_table[] = {
 	{ .ctl_name = 0 }
 };
 
-#ifdef CONFIG_SCHED_DEBUG
 static unsigned long min_sched_granularity_ns = 100000;		/* 100 usecs */
 static unsigned long max_sched_granularity_ns = 1000000000;	/* 1 second */
 static unsigned long min_wakeup_granularity_ns;			/* 0 usecs */
 static unsigned long max_wakeup_granularity_ns = 1000000000;	/* 1 second */
-#endif
 
 static ctl_table kern_table[] = {
-#ifdef CONFIG_SCHED_DEBUG
-	{
-		.ctl_name	= CTL_UNNUMBERED,
-		.procname	= "sched_min_granularity_ns",
-		.data		= &sysctl_sched_min_granularity,
-		.maxlen		= sizeof(unsigned int),
-		.mode		= 0644,
-		.proc_handler	= &proc_dointvec_minmax,
-		.strategy	= &sysctl_intvec,
-		.extra1		= &min_sched_granularity_ns,
-		.extra2		= &max_sched_granularity_ns,
-	},
 	{
 		.ctl_name	= CTL_UNNUMBERED,
-		.procname	= "sched_latency_ns",
-		.data		= &sysctl_sched_latency,
+		.procname	= "sched_granularity_ns",
+		.data		= &sysctl_sched_granularity,
 		.maxlen		= sizeof(unsigned int),
 		.mode		= 0644,
 		.proc_handler	= &proc_dointvec_minmax,
@@ -264,6 +250,7 @@ static ctl_table kern_table[] = {
 		.extra1		= &min_wakeup_granularity_ns,
 		.extra2		= &max_wakeup_granularity_ns,
 	},
+#ifdef CONFIG_SCHED_DEBUG
 	{
 		.ctl_name	= CTL_UNNUMBERED,
 		.procname	= "sched_stat_granularity_ns",
@@ -277,17 +264,6 @@ static ctl_table kern_table[] = {
 	},
 	{
 		.ctl_name	= CTL_UNNUMBERED,
-		.procname	= "sched_runtime_limit_ns",
-		.data		= &sysctl_sched_runtime_limit,
-		.maxlen		= sizeof(unsigned int),
-		.mode		= 0644,
-		.proc_handler	= &proc_dointvec_minmax,
-		.strategy	= &sysctl_intvec,
-		.extra1		= &min_sched_granularity_ns,
-		.extra2		= &max_sched_granularity_ns,
-	},
-	{
-		.ctl_name	= CTL_UNNUMBERED,
 		.procname	= "sched_child_runs_first",
 		.data		= &sysctl_sched_child_runs_first,
 		.maxlen		= sizeof(unsigned int),
-
