Forward port of nicksched to 2.6.21-rc4. Can't find my old design/description document for it, but it is yet another scheduler. Focus is on simplicity and fairness. This one tends to require X to be reniced to -10 or so for best results (renice -10 `pidof X`). Timeslices get scaled by /proc/sys/kernel/base_timeslice. If you have bad interactivity you could try lowering this and see if it helps. fs/proc/array.c | 12 include/linux/init_task.h | 7 include/linux/sched.h | 34 - include/linux/sysctl.h | 1 kernel/sched.c | 1122 +++++++++++++++++----------------------------- kernel/sysctl.c | 16 mm/oom_kill.c | 7 7 files changed, 476 insertions(+), 723 deletions(-) Index: linux-2.6/fs/proc/array.c =================================================================== --- linux-2.6.orig/fs/proc/array.c 2007-03-22 20:44:00.000000000 +1100 +++ linux-2.6/fs/proc/array.c 2007-03-22 20:44:20.000000000 +1100 @@ -165,7 +165,13 @@ static inline char * task_state(struct t rcu_read_lock(); buffer += sprintf(buffer, "State:\t%s\n" - "SleepAVG:\t%lu%%\n" + "timeslice:\t%d\n" + "usedslice:\t%d\n" + "arrayseq:\t%lu\n" + "staticprio:\t%u\n" + "prio:\t%u\n" + "sleep_time:\t%lu\n" + "total_time:\t%lu\n" "Tgid:\t%d\n" "Pid:\t%d\n" "PPid:\t%d\n" @@ -173,7 +179,9 @@ static inline char * task_state(struct t "Uid:\t%d\t%d\t%d\t%d\n" "Gid:\t%d\t%d\t%d\t%d\n", get_task_state(p), - (p->sleep_avg/1024)*100/(1020000000/1024), + get_task_timeslice(p), p->used_slice, p->array_sequence, + p->static_prio, p->prio, + p->sleep_time, p->total_time, p->tgid, p->pid, pid_alive(p) ? rcu_dereference(p->real_parent)->tgid : 0, pid_alive(p) && p->ptrace ? 
rcu_dereference(p->parent)->pid : 0, Index: linux-2.6/include/linux/sched.h =================================================================== --- linux-2.6.orig/include/linux/sched.h 2007-03-22 20:44:00.000000000 +1100 +++ linux-2.6/include/linux/sched.h 2007-03-22 20:45:18.000000000 +1100 @@ -523,7 +523,7 @@ struct signal_struct { #define MAX_USER_RT_PRIO 100 #define MAX_RT_PRIO MAX_USER_RT_PRIO -#define MAX_PRIO (MAX_RT_PRIO + 40) +#define MAX_PRIO (MAX_RT_PRIO + 59) #define rt_prio(prio) unlikely((prio) < MAX_RT_PRIO) #define rt_task(p) rt_prio((p)->prio) @@ -788,24 +788,16 @@ struct mempolicy; struct pipe_inode_info; struct uts_namespace; -enum sleep_type { - SLEEP_NORMAL, - SLEEP_NONINTERACTIVE, - SLEEP_INTERACTIVE, - SLEEP_INTERRUPTED, -}; - struct prio_array; struct task_struct { volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */ struct thread_info *thread_info; atomic_t usage; + int lock_depth; /* BKL lock depth */ unsigned long flags; /* per process flags, defined below */ unsigned long ptrace; - int lock_depth; /* BKL lock depth */ - #ifdef CONFIG_SMP #ifdef __ARCH_WANT_UNLOCKED_CTXSW int oncpu; @@ -813,27 +805,29 @@ struct task_struct { #endif int load_weight; /* for niceness load balancing purposes */ int prio, static_prio, normal_prio; + int used_slice, over_slice; struct list_head run_list; struct prio_array *array; + unsigned long array_sequence; - unsigned short ioprio; -#ifdef CONFIG_BLK_DEV_IO_TRACE - unsigned int btrace_seq; -#endif - unsigned long sleep_avg; unsigned long long timestamp, last_ran; unsigned long long sched_time; /* sched_clock time spent running */ - enum sleep_type sleep_type; + unsigned long total_time, sleep_time; + + struct list_head tasks; + struct mm_struct *mm, *active_mm; unsigned long policy; cpumask_t cpus_allowed; - unsigned int time_slice, first_time_slice; #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) struct sched_info sched_info; #endif - struct list_head tasks; + unsigned short 
ioprio; +#ifdef CONFIG_BLK_DEV_IO_TRACE + unsigned int btrace_seq; +#endif /* * ptrace_list/ptrace_children forms the list of my children * that were stolen by a ptracer. @@ -841,8 +835,6 @@ struct task_struct { struct list_head ptrace_children; struct list_head ptrace_list; - struct mm_struct *mm, *active_mm; - /* task state */ struct linux_binfmt *binfmt; long exit_state; @@ -1199,6 +1191,8 @@ extern unsigned long long sched_clock(vo extern unsigned long long current_sched_time(const struct task_struct *current_task); +extern int get_task_timeslice(struct task_struct *t); + /* sched_exec is called by processes performing an exec */ #ifdef CONFIG_SMP extern void sched_exec(void); Index: linux-2.6/kernel/sched.c =================================================================== --- linux-2.6.orig/kernel/sched.c 2007-03-22 20:44:00.000000000 +1100 +++ linux-2.6/kernel/sched.c 2007-03-22 20:48:52.000000000 +1100 @@ -71,129 +71,68 @@ unsigned long long __attribute__((weak)) * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], * and back. */ -#define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20) -#define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20) #define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio) +#define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 30) +#define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 30) /* * 'User priority' is the nice value converted to something we * can work with better when scaling various scheduler parameters, - * it's a [ 0 ... 39 ] range. + * it's a [ 0 ... 58 ] range. */ #define USER_PRIO(p) ((p)-MAX_RT_PRIO) -#define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio) #define MAX_USER_PRIO (USER_PRIO(MAX_PRIO)) +#define US_TO_JIFFIES(x) ((x) * HZ / 1000000) +#define JIFFIES_TO_US(x) ((x) * 1000000 / HZ) + /* - * Some helpers for converting nanosecond timing to jiffy resolution + * MIN_TIMESLICE is the timeslice that a minimum priority process gets if there + * is a maximum priority process runnable. 
MAX_TIMESLICE is derived from the + * formula in task_timeslice. It cannot be changed here. It is the timeslice + * that the maximum priority process will get. Larger timeslices are attainable + * by low priority processes however. */ -#define NS_TO_JIFFIES(TIME) ((TIME) / (1000000000 / HZ)) -#define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ)) +int sched_base_timeslice = 300; +int sched_min_base = 10; +int sched_max_base = 10000; + +#define MIN_TIMESLICE 1 +#define RT_TIMESLICE (50 * 1000 / HZ) /* 50ms */ +#define BASE_TIMESLICE (sched_base_timeslice) + +/* Maximum amount of history that will be used to calculate priority */ +#define MAX_SLEEP_SHIFT 18 +#define MAX_SLEEP (1UL << MAX_SLEEP_SHIFT) /* ~0.25s */ /* - * These are the 'tuning knobs' of the scheduler: + * Maximum effect that 1 block of activity (run/sleep/etc) can have. This + * will moderate/discard freak events (eg. SIGSTOP) * - * Minimum timeslice is 5 msecs (or 1 jiffy, whichever is larger), - * default timeslice is 100 msecs, maximum timeslice is 800 msecs. - * Timeslices get refilled after they expire. */ -#define MIN_TIMESLICE max(5 * HZ / 1000, 1) -#define DEF_TIMESLICE (100 * HZ / 1000) -#define ON_RUNQUEUE_WEIGHT 30 -#define CHILD_PENALTY 95 -#define PARENT_PENALTY 100 -#define EXIT_WEIGHT 3 -#define PRIO_BONUS_RATIO 25 -#define MAX_BONUS (MAX_USER_PRIO * PRIO_BONUS_RATIO / 100) -#define INTERACTIVE_DELTA 2 -#define MAX_SLEEP_AVG (DEF_TIMESLICE * MAX_BONUS) -#define STARVATION_LIMIT (MAX_SLEEP_AVG) -#define NS_MAX_SLEEP_AVG (JIFFIES_TO_NS(MAX_SLEEP_AVG)) +#define MAX_SLEEP_AFFECT (MAX_SLEEP/4) /* - * If a task is 'interactive' then we reinsert it in the active - * array after it has expired its current timeslice. (it will not - * continue to run immediately, it will still roundrobin with - * other interactive tasks.) - * - * This part scales the interactivity limit depending on niceness. - * - * We scale it linearly, offset by the INTERACTIVE_DELTA delta. 
- * Here are a few examples of different nice levels: - * - * TASK_INTERACTIVE(-20): [1,1,1,1,1,1,1,1,1,0,0] - * TASK_INTERACTIVE(-10): [1,1,1,1,1,1,1,0,0,0,0] - * TASK_INTERACTIVE( 0): [1,1,1,1,0,0,0,0,0,0,0] - * TASK_INTERACTIVE( 10): [1,1,0,0,0,0,0,0,0,0,0] - * TASK_INTERACTIVE( 19): [0,0,0,0,0,0,0,0,0,0,0] - * - * (the X axis represents the possible -5 ... 0 ... +5 dynamic - * priority range a task can explore, a value of '1' means the - * task is rated interactive.) - * - * Ie. nice +19 tasks can never get 'interactive' enough to be - * reinserted into the active array. And only heavily CPU-hog nice -20 - * tasks will be expired. Default nice 0 tasks are somewhere between, - * it takes some effort for them to get interactive, but it's not - * too hard. + * The amount of history can be decreased (on fork for example). This puts a + * lower bound on it. */ +#define MIN_HISTORY (MAX_SLEEP/8) +#define FORKED_TS_MAX (US_TO_JIFFIES(10000) ?: 1) -#define CURRENT_BONUS(p) \ - (NS_TO_JIFFIES((p)->sleep_avg) * MAX_BONUS / \ - MAX_SLEEP_AVG) - -#define GRANULARITY (10 * HZ / 1000 ? : 1) - -#ifdef CONFIG_SMP -#define TIMESLICE_GRANULARITY(p) (GRANULARITY * \ - (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)) * \ - num_online_cpus()) -#else -#define TIMESLICE_GRANULARITY(p) (GRANULARITY * \ - (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? 
: 1) - 1))) -#endif - -#define SCALE(v1,v1_max,v2_max) \ - (v1) * (v2_max) / (v1_max) - -#define DELTA(p) \ - (SCALE(TASK_NICE(p) + 20, 40, MAX_BONUS) - 20 * MAX_BONUS / 40 + \ - INTERACTIVE_DELTA) - -#define TASK_INTERACTIVE(p) \ - ((p)->prio <= (p)->static_prio - DELTA(p)) - -#define INTERACTIVE_SLEEP(p) \ - (JIFFIES_TO_NS(MAX_SLEEP_AVG * \ - (MAX_BONUS / 2 + DELTA((p)) + 1) / MAX_BONUS - 1)) - -#define TASK_PREEMPTS_CURR(p, rq) \ - ((p)->prio < (rq)->curr->prio) - -#define SCALE_PRIO(x, prio) \ - max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_TIMESLICE) - -static unsigned int static_prio_timeslice(int static_prio) -{ - if (static_prio < NICE_TO_PRIO(0)) - return SCALE_PRIO(DEF_TIMESLICE * 4, static_prio); - else - return SCALE_PRIO(DEF_TIMESLICE, static_prio); -} +/* + * SLEEP_FACTOR is a fixed point factor used to scale history tracking things. + * In particular: total_time, sleep_time, sleep_avg. + */ +#define SLEEP_FACTOR 1024 /* - * task_timeslice() scales user-nice values [ -20 ... 0 ... 19 ] - * to time slice values: [800ms ... 100ms ... 5ms] - * - * The higher a thread's priority, the bigger timeslices - * it gets during one round of execution. But even the lowest - * priority thread gets MIN_TIMESLICE worth of execution time. 
+ * The scheduler classifies a process as performing one of the following + * activities */ +#define STIME_SLEEP 1 /* Sleeping */ +#define STIME_RUN 2 /* Using CPU */ -static inline unsigned int task_timeslice(struct task_struct *p) -{ - return static_prio_timeslice(p->static_prio); -} +#define TASK_PREEMPTS_CURR(p, rq) ((p)->prio < (rq)->curr->prio) /* * These are the runqueue data structures: @@ -224,6 +163,7 @@ struct rq { #ifdef CONFIG_SMP unsigned long cpu_load[3]; #endif + unsigned long array_sequence; unsigned long long nr_switches; /* @@ -234,15 +174,13 @@ struct rq { */ unsigned long nr_uninterruptible; - unsigned long expired_timestamp; - /* Cached timestamp set by update_cpu_clock() */ - unsigned long long most_recent_timestamp; struct task_struct *curr, *idle; unsigned long next_balance; struct mm_struct *prev_mm; - struct prio_array *active, *expired, arrays[2]; - int best_expired_prio; atomic_t nr_iowait; + int min_expired_prio; + struct prio_array *active, *expired, arrays[2]; +// struct list_head quotient_queue[10]; #ifdef CONFIG_SMP struct sched_domain *sd; @@ -261,9 +199,6 @@ struct rq { struct sched_info rq_sched_info; /* sys_sched_yield() stats */ - unsigned long yld_exp_empty; - unsigned long yld_act_empty; - unsigned long yld_both_empty; unsigned long yld_cnt; /* schedule() stats */ @@ -411,9 +346,8 @@ static struct rq *task_rq_lock(struct ta struct rq *rq; repeat_lock_task: - local_irq_save(*flags); rq = task_rq(p); - spin_lock(&rq->lock); + spin_lock_irqsave(&rq->lock, *flags); if (unlikely(rq != task_rq(p))) { spin_unlock_irqrestore(&rq->lock, *flags); goto repeat_lock_task; @@ -456,8 +390,8 @@ static int show_schedstat(struct seq_fil /* runqueue-specific stats */ seq_printf(seq, "cpu%d %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu", - cpu, rq->yld_both_empty, - rq->yld_act_empty, rq->yld_exp_empty, rq->yld_cnt, + cpu, + 0UL, 0UL, 0UL, rq->yld_cnt, rq->sched_switch, rq->sched_cnt, rq->sched_goidle, rq->ttwu_cnt, rq->ttwu_local, 
rq->rq_sched_info.cpu_time, @@ -561,21 +495,6 @@ rq_sched_info_depart(struct rq *rq, unsi # define schedstat_add(rq, field, amt) do { } while (0) #endif -/* - * this_rq_lock - lock this runqueue and disable interrupts. - */ -static inline struct rq *this_rq_lock(void) - __acquires(rq->lock) -{ - struct rq *rq; - - local_irq_disable(); - rq = this_rq(); - spin_lock(&rq->lock); - - return rq; -} - #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) /* * Called when a process is dequeued from the active array and given @@ -682,6 +601,12 @@ sched_info_switch(struct task_struct *pr #define sched_info_switch(t, next) do { } while (0) #endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */ +static inline void update_min_expired_prio(struct task_struct *p, struct rq *rq) +{ + if (!rt_task(p) && p->static_prio < rq->min_expired_prio) + rq->min_expired_prio = p->static_prio; +} + /* * Adding/removing a task to/from a priority array: */ @@ -695,22 +620,15 @@ static void dequeue_task(struct task_str static void enqueue_task(struct task_struct *p, struct prio_array *array) { + struct list_head *entry = array->queue + p->prio; + sched_info_queued(p); - list_add_tail(&p->run_list, array->queue + p->prio); + list_add_tail(&p->run_list, entry); __set_bit(p->prio, array->bitmap); array->nr_active++; p->array = array; } -/* - * Put task to the end of the run list without the overhead of dequeue - * followed by enqueue. - */ -static void requeue_task(struct task_struct *p, struct prio_array *array) -{ - list_move_tail(&p->run_list, array->queue + p->prio); -} - static inline void enqueue_task_head(struct task_struct *p, struct prio_array *array) { @@ -721,35 +639,6 @@ enqueue_task_head(struct task_struct *p, } /* - * __normal_prio - return the priority that is based on the static - * priority but is modified by bonuses/penalties. - * - * We scale the actual sleep average [0 .... MAX_SLEEP_AVG] - * into the -5 ... 0 ... +5 bonus/penalty range. 
- * - * We use 25% of the full 0...39 priority range so that: - * - * 1) nice +19 interactive tasks do not preempt nice 0 CPU hogs. - * 2) nice -20 CPU hogs do not get preempted by nice 0 tasks. - * - * Both properties are important to certain workloads. - */ - -static inline int __normal_prio(struct task_struct *p) -{ - int bonus, prio; - - bonus = CURRENT_BONUS(p) - MAX_BONUS / 2; - - prio = p->static_prio - bonus; - if (prio < MAX_RT_PRIO) - prio = MAX_RT_PRIO; - if (prio > MAX_PRIO-1) - prio = MAX_PRIO-1; - return prio; -} - -/* * To aid in avoiding the subversion of "niceness" due to uneven distribution * of tasks with abnormal "nice" values across CPUs the contribution that * each task makes to its run queue's load is weighted according to its @@ -758,12 +647,17 @@ static inline int __normal_prio(struct t * slice expiry etc. */ +static int static_prio_timeslice(unsigned int prio) +{ + return BASE_TIMESLICE; /* XXX: fixme for smpnice */ +} + /* - * Assume: static_prio_timeslice(NICE_TO_PRIO(0)) == DEF_TIMESLICE + * Assume: static_prio_timeslice(NICE_TO_PRIO(0)) == BASE_TIMESLICE * If static_prio_timeslice() is ever changed to break this assumption then * this code will need modification */ -#define TIME_SLICE_NICE_ZERO DEF_TIMESLICE +#define TIME_SLICE_NICE_ZERO BASE_TIMESLICE #define LOAD_WEIGHT(lp) \ (((lp) * SCHED_LOAD_SCALE) / TIME_SLICE_NICE_ZERO) #define PRIO_TO_LOAD_WEIGHT(prio) \ @@ -813,134 +707,153 @@ static inline void dec_nr_running(struct dec_raw_weighted_load(rq, p); } + +static inline long long sched_clock_us(void) +{ + return ((unsigned long long)jiffies) << (20 - SHIFT_HZ); +} + /* - * Calculate the expected normal priority: i.e. priority - * without taking RT-inheritance into account. Might be - * boosted by interactivity modifiers. Changes upon fork, - * setprio syscalls, and whenever the interactivity - * estimator recalculates. + * add_task_time updates a task p after time of doing the specified @type + * of activity. See STIME_*. 
This is used for priority calculation. */ -static inline int normal_prio(struct task_struct *p) +static void add_task_time(struct task_struct *p, + unsigned long long time, unsigned long type) { - int prio; + unsigned long ratio; + unsigned long long tmp; + unsigned long t; - if (has_rt_policy(p)) - prio = MAX_RT_PRIO-1 - p->rt_priority; - else - prio = __normal_prio(p); - return prio; + if (!time) + return; + + t = time; + if (type == STIME_SLEEP) { + unsigned long fac; + fac = USER_PRIO(p->static_prio); /* 0..39 */ + fac = fac + 12; /* 12..41 */ + if (time > MAX_SLEEP_AFFECT*8) + t = MAX_SLEEP_AFFECT*8; + t = (t * fac) / 32; + t = (t + 7) / 8; + } else { + if (time > MAX_SLEEP) + t = MAX_SLEEP; + } + + ratio = MAX_SLEEP - t; + tmp = (unsigned long long)ratio*p->total_time + MAX_SLEEP/2; + tmp >>= MAX_SLEEP_SHIFT; + p->total_time = (unsigned long)tmp; + + tmp = (unsigned long long)ratio*p->sleep_time + MAX_SLEEP/2; + tmp >>= MAX_SLEEP_SHIFT; + p->sleep_time = (unsigned long)tmp; + + p->total_time += t; + if (type == STIME_SLEEP) + p->sleep_time += t; } -/* - * Calculate the current priority, i.e. the priority - * taken into account by the scheduler. This value might - * be boosted by RT tasks, or might be boosted by - * interactivity modifiers. Will be RT if the task got - * RT-boosted. If not then it returns p->normal_prio. - */ -static int effective_prio(struct task_struct *p) -{ - p->normal_prio = normal_prio(p); - /* - * If we are RT tasks or we were boosted to RT priority, - * keep the priority unchanged. Otherwise, update priority - * to the normal priority: - */ - if (!rt_prio(p->prio)) - return p->normal_prio; - return p->prio; +static inline unsigned long task_sleep_avg(struct task_struct *p) +{ + return (SLEEP_FACTOR * p->sleep_time) / (p->total_time + 1); } /* - * __activate_task - move a task to the runqueue. + * The higher a thread's priority, the bigger timeslices + * it gets during one round of execution. 
But even the lowest + * priority thread gets MIN_TIMESLICE worth of execution time. + * + * Timeslices are scaled, so if only low priority processes are running, + * they will all get long timeslices. */ -static void __activate_task(struct task_struct *p, struct rq *rq) +static int task_timeslice(struct task_struct *p, struct rq *rq) { - struct prio_array *target = rq->active; + unsigned int prio, delta, scale, ts; - if (batch_task(p)) - target = rq->expired; - enqueue_task(p, target); - inc_nr_running(p, rq); + if (rt_task(p)) + return RT_TIMESLICE; + + /* prio is 10..49 */ + prio = USER_PRIO(p->static_prio) - 10 + 9; + + ts = prio * 1024; /* 10240..50176 */ + + /* delta is 3..42 (delta-3 <= prio-9) */ + delta = p->static_prio - min(rq->min_expired_prio, p->static_prio) + 3; + + scale = delta*delta; /* 9..1764 */ + + ts = ts / scale; /* 1137..(28..5575) */ + + /* a nice 0 task has ts (29*29/9) here. scale to BASE_TIMESLICE */ + ts = ts * BASE_TIMESLICE / (29*1024/9); + + ts = msecs_to_jiffies(ts); + if (unlikely(ts == 0)) + ts = 1; + + return ts; } -/* - * __activate_idle_task - move idle task to the _front_ of runqueue. - */ -static inline void __activate_idle_task(struct task_struct *p, struct rq *rq) +int get_task_timeslice(struct task_struct *p) { - enqueue_task_head(p, rq->active); - inc_nr_running(p, rq); + int ts; + struct rq *rq; + unsigned long flags; + rq = task_rq_lock(p, &flags); + ts = task_timeslice(p, rq); + task_rq_unlock(rq, &flags); + + return ts; } /* - * Recalculate p->normal_prio and p->prio after having slept, - * updating the sleep-average too: + * task_priority: calculates a task's priority based on previous running + * history (see add_task_time). The priority is just a simple linear function + * based on sleep_avg and static_prio. 
*/ -static int recalc_task_prio(struct task_struct *p, unsigned long long now) +static int task_priority(struct task_struct *p) { - /* Caller must always ensure 'now >= p->timestamp' */ - unsigned long sleep_time = now - p->timestamp; + unsigned long sleep_avg; + int bonus, prio; - if (batch_task(p)) - sleep_time = 0; + if (rt_prio(p->prio)) + return p->prio; - if (likely(sleep_time > 0)) { - /* - * This ceiling is set to the lowest priority that would allow - * a task to be reinserted into the active array on timeslice - * completion. - */ - unsigned long ceiling = INTERACTIVE_SLEEP(p); + sleep_avg = task_sleep_avg(p); - if (p->mm && sleep_time > ceiling && p->sleep_avg < ceiling) { - /* - * Prevents user tasks from achieving best priority - * with one single large enough sleep. - */ - p->sleep_avg = ceiling; - /* - * Using INTERACTIVE_SLEEP() as a ceiling places a - * nice(0) task 1ms sleep away from promotion, and - * gives it 700ms to round-robin with no chance of - * being demoted. This is more than generous, so - * mark this sleep as non-interactive to prevent the - * on-runqueue bonus logic from intervening should - * this task not receive cpu immediately. - */ - p->sleep_type = SLEEP_NONINTERACTIVE; - } else { - /* - * Tasks waking from uninterruptible sleep are - * limited in their sleep_avg rise as they - * are likely to be waiting on I/O - */ - if (p->sleep_type == SLEEP_NONINTERACTIVE && p->mm) { - if (p->sleep_avg >= ceiling) - sleep_time = 0; - else if (p->sleep_avg + sleep_time >= - ceiling) { - p->sleep_avg = ceiling; - sleep_time = 0; - } - } + prio = USER_PRIO(p->static_prio) + 10; + bonus = (((MAX_USER_PRIO + 1) / 3) * sleep_avg + (SLEEP_FACTOR / 2)) + / SLEEP_FACTOR; + prio = MAX_RT_PRIO + prio - bonus; - /* - * This code gives a bonus to interactive tasks. - * - * The boost works by updating the 'average sleep time' - * value here, based on ->timestamp. 
The more time a - * task spends sleeping, the higher the average gets - - * and the higher the priority boost gets as well. - */ - p->sleep_avg += sleep_time; + if (prio < MAX_RT_PRIO) + return MAX_RT_PRIO; + if (prio > MAX_PRIO-1) + return MAX_PRIO-1; - } - if (p->sleep_avg > NS_MAX_SLEEP_AVG) - p->sleep_avg = NS_MAX_SLEEP_AVG; - } + return prio; +} - return effective_prio(p); +/* + * __activate_task - move a task to the runqueue. + */ +static inline void __activate_task(struct task_struct *p, struct rq *rq, + struct prio_array *array) +{ + enqueue_task(p, array); + rq->nr_running++; +} + +/* + * __activate_idle_task - move idle task to the _front_ of runqueue. + */ +static inline void __activate_idle_task(struct task_struct *p, struct rq *rq) +{ + enqueue_task_head(p, rq->active); + inc_nr_running(p, rq); } /* @@ -951,20 +864,14 @@ static int recalc_task_prio(struct task_ */ static void activate_task(struct task_struct *p, struct rq *rq, int local) { - unsigned long long now; + unsigned long long now, sleep; + struct prio_array *array; + array = rq->active; if (rt_task(p)) goto out; - now = sched_clock(); -#ifdef CONFIG_SMP - if (!local) { - /* Compensate for drifting sched_clock */ - struct rq *this_rq = this_rq(); - now = (now - this_rq->most_recent_timestamp) - + rq->most_recent_timestamp; - } -#endif + now = sched_clock_us(); /* * Sleep time is in units of nanosecs, so shift by 20 to get a @@ -976,41 +883,34 @@ static void activate_task(struct task_st profile_hits(SLEEP_PROFILING, (void *)get_wchan(p), (now - p->timestamp) >> 20); } - - p->prio = recalc_task_prio(p, now); - /* - * This checks to make sure it's not an uninterruptible task - * that is now waking up. + * If we have slept through an active/expired array switch, restart + * our timeslice too. */ - if (p->sleep_type == SLEEP_NORMAL) { - /* - * Tasks which were woken up by interrupts (ie. hw events) - * are most likely of interactive nature. 
So we give them - * the credit of extending their sleep time to the period - * of time they spend on the runqueue, waiting for execution - * on a CPU, first time around: - */ - if (in_interrupt()) - p->sleep_type = SLEEP_INTERRUPTED; - else { - /* - * Normal first-time wakeups get a credit too for - * on-runqueue time, but it will be weighted down: - */ - p->sleep_type = SLEEP_INTERACTIVE; - } - } + sleep = now - p->timestamp; p->timestamp = now; + add_task_time(p, sleep, STIME_SLEEP); + p->prio = task_priority(p); + + if (rq->array_sequence != p->array_sequence) { + p->used_slice = 0; + p->over_slice = 0; + } else if (unlikely(p->used_slice == -1)) { + array = rq->expired; + p->used_slice = 0; + update_min_expired_prio(p, rq); + } + out: - __activate_task(p, rq); + __activate_task(p, rq, array); } /* * deactivate_task - remove a task from the runqueue. */ -static void deactivate_task(struct task_struct *p, struct rq *rq) +static inline void deactivate_task(struct task_struct *p, struct rq *rq) { + p->array_sequence = rq->array_sequence; dec_nr_running(p, rq); dequeue_task(p, p->array); p->array = NULL; @@ -1508,10 +1408,12 @@ static int try_to_wake_up(struct task_st out_set_cpu: new_cpu = wake_idle(new_cpu, p); if (new_cpu != cpu) { + int seq_delta = p->array_sequence - rq->array_sequence; set_task_cpu(p, new_cpu); task_rq_unlock(rq, &flags); /* might preempt at this point */ rq = task_rq_lock(p, &flags); + p->array_sequence = rq->array_sequence + seq_delta; old_state = p->state; if (!(old_state & state)) goto out; @@ -1524,33 +1426,9 @@ out_set_cpu: out_activate: #endif /* CONFIG_SMP */ - if (old_state == TASK_UNINTERRUPTIBLE) { + if (old_state == TASK_UNINTERRUPTIBLE) rq->nr_uninterruptible--; - /* - * Tasks on involuntary sleep don't earn - * sleep_avg beyond just interactive state. 
- */ - p->sleep_type = SLEEP_NONINTERACTIVE; - } else - - /* - * Tasks that have marked their sleep as noninteractive get - * woken up with their sleep average not weighted in an - * interactive way. - */ - if (old_state & TASK_NONINTERACTIVE) - p->sleep_type = SLEEP_NONINTERACTIVE; - - activate_task(p, rq, cpu == this_cpu); - /* - * Sync wakeups (i.e. those types of wakeups where the waker - * has indicated that it will leave the CPU in short order) - * don't trigger a preemption, if the woken up task will run on - * this cpu. (in this case the 'I will reschedule' promise of - * the waker guarantees that the freshly woken up task is going - * to be considered on this CPU.) - */ if (!sync || cpu != this_cpu) { if (TASK_PREEMPTS_CURR(p, rq)) resched_task(rq->curr); @@ -1584,6 +1462,9 @@ static void task_running_tick(struct rq */ void fastcall sched_fork(struct task_struct *p, int clone_flags) { + unsigned long sleep_avg; + struct rq *rq; + struct task_struct *cur; int cpu = get_cpu(); #ifdef CONFIG_SMP @@ -1617,31 +1498,47 @@ void fastcall sched_fork(struct task_str /* Want to start with kernel preemption disabled. */ task_thread_info(p)->preempt_count = 1; #endif - /* - * Share the timeslice between parent and child, thus the - * total amount of pending timeslices in the system doesn't change, - * resulting in more scheduling fairness. - */ + p->timestamp = sched_clock_us(); + p->used_slice = 0; + p->over_slice = 0; + if (rt_task(p)) + goto out; + + rq = this_rq(); + cur = current; + + /* Get MIN_HISTORY of history with a bit less sleep_avg than parent. 
*/ + sleep_avg = task_sleep_avg(cur); + sleep_avg -= sleep_avg >> 2; + p->total_time = MIN_HISTORY; + p->sleep_time = MIN_HISTORY * sleep_avg / SLEEP_FACTOR; + + p->prio = p->normal_prio = task_priority(p); + + /* Parent loses sleep time the child took */ + cur->sleep_time -= min(cur->sleep_time/4, p->sleep_time); + local_irq_disable(); - p->time_slice = (current->time_slice + 1) >> 1; - /* - * The remainder of the first timeslice might be recovered by - * the parent if the child exits early enough. - */ - p->first_time_slice = 1; - current->time_slice >>= 1; - p->timestamp = sched_clock(); - if (unlikely(!current->time_slice)) { - /* - * This case is rare, it happens when the parent has only - * a single jiffy left from its timeslice. Taking the - * runqueue lock is not a problem. - */ - current->time_slice = 1; - task_running_tick(cpu_rq(cpu), current); + if (unlikely(cur->used_slice == -1 || cur == rq->idle)) + p->used_slice = -1; + else { + int ts = task_timeslice(p, rq); + int child_ts = min_t(int, ts/4, FORKED_TS_MAX); + + if (child_ts == 0) { + p->used_slice = -1; + } else { + p->used_slice = ts - child_ts; + cur->used_slice += child_ts; + if (cur->used_slice >= task_timeslice(p, rq)) { + cur->used_slice = -1; + set_need_resched(); + } + } } local_irq_enable(); - put_cpu(); +out: + put_cpu(); } /* @@ -1653,108 +1550,55 @@ void fastcall sched_fork(struct task_str */ void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags) { - struct rq *rq, *this_rq; unsigned long flags; int this_cpu, cpu; + struct rq *rq; + struct prio_array *array; + struct task_struct *cur; + + cpu = task_cpu(p); rq = task_rq_lock(p, &flags); BUG_ON(p->state != TASK_RUNNING); this_cpu = smp_processor_id(); - cpu = task_cpu(p); - - /* - * We decrease the sleep average of forking parents - * and children as well, to keep max-interactive tasks - * from forking tasks that are max-interactive. The parent - * (current) is done further down, under its lock. 
- */ - p->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(p) * - CHILD_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS); - p->prio = effective_prio(p); + array = rq->active; + if (unlikely(p->used_slice == -1)) { + array = rq->expired; + p->used_slice = 0; + update_min_expired_prio(p, rq); + } + cur = current; if (likely(cpu == this_cpu)) { - if (!(clone_flags & CLONE_VM)) { + if (!rt_task(p) && !rt_task(cur) && + !(clone_flags & CLONE_VM) && (array == rq->active)) { /* * The VM isn't cloned, so we're in a good position to * do child-runs-first in anticipation of an exec. This * usually avoids a lot of COW overhead. */ - if (unlikely(!current->array)) - __activate_task(p, rq); - else { - p->prio = current->prio; - p->normal_prio = current->normal_prio; - list_add_tail(&p->run_list, ¤t->run_list); - p->array = current->array; - p->array->nr_active++; - inc_nr_running(p, rq); - } set_need_resched(); - } else - /* Run child last */ - __activate_task(p, rq); - /* - * We skip the following code due to cpu == this_cpu - * - * task_rq_unlock(rq, &flags); - * this_rq = task_rq_lock(current, &flags); - */ - this_rq = rq; - } else { - this_rq = cpu_rq(this_cpu); + p->prio = cur->prio; + list_add_tail(&p->run_list, &cur->run_list); + p->array = cur->array; + p->array->nr_active++; + rq->nr_running++; + goto child_queued; + } + } + __activate_task(p, rq, array); - /* - * Not the local CPU - must adjust timestamp. This should - * get optimised away in the !CONFIG_SMP case. 
- */ - p->timestamp = (p->timestamp - this_rq->most_recent_timestamp) - + rq->most_recent_timestamp; - __activate_task(p, rq); - if (TASK_PREEMPTS_CURR(p, rq)) - resched_task(rq->curr); + /* This catches RT tasks and remote SMP tasks */ + if (TASK_PREEMPTS_CURR(p, rq)) + resched_task(rq->curr); - /* - * Parent and child are on different CPUs, now get the - * parent runqueue to update the parent's ->sleep_avg: - */ - task_rq_unlock(rq, &flags); - this_rq = task_rq_lock(current, &flags); - } - current->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(current) * - PARENT_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS); - task_rq_unlock(this_rq, &flags); +child_queued: + task_rq_unlock(rq, &flags); } -/* - * Potentially available exiting-child timeslices are - * retrieved here - this way the parent does not get - * penalized for creating too many threads. - * - * (this cannot be used to 'generate' timeslices - * artificially, because any timeslice recovered here - * was given away by the parent in the first place.) - */ void fastcall sched_exit(struct task_struct *p) { - unsigned long flags; - struct rq *rq; - - /* - * If the child was a (relative-) CPU hog then decrease - * the sleep_avg of the parent as well. 
- */ - rq = task_rq_lock(p->parent, &flags); - if (p->first_time_slice && task_cpu(p) == task_cpu(p->parent)) { - p->parent->time_slice += p->time_slice; - if (unlikely(p->parent->time_slice > task_timeslice(p))) - p->parent->time_slice = task_timeslice(p); - } - if (p->sleep_avg < p->parent->sleep_avg) - p->parent->sleep_avg = p->parent->sleep_avg / - (EXIT_WEIGHT + 1) * EXIT_WEIGHT + p->sleep_avg / - (EXIT_WEIGHT + 1); - task_rq_unlock(rq, &flags); } /** @@ -1831,6 +1675,7 @@ static inline void finish_task_switch(st asmlinkage void schedule_tail(struct task_struct *prev) __releases(rq->lock) { + struct task_struct *cur; struct rq *rq = this_rq(); finish_task_switch(rq, prev); @@ -1838,8 +1683,9 @@ asmlinkage void schedule_tail(struct tas /* In this case, finish_task_switch does not reenable preemption */ preempt_enable(); #endif - if (current->set_child_tid) - put_user(current->pid, current->set_child_tid); + cur = current; + if (cur->set_child_tid) + put_user(cur->pid, cur->set_child_tid); } /* @@ -1979,19 +1825,19 @@ static void double_rq_lock(struct rq *rq __acquires(rq1->lock) __acquires(rq2->lock) { + spinlock_t *l1, *l2; + BUG_ON(!irqs_disabled()); - if (rq1 == rq2) { - spin_lock(&rq1->lock); - __acquire(rq2->lock); /* Fake it out ;) */ - } else { - if (rq1 < rq2) { - spin_lock(&rq1->lock); - spin_lock(&rq2->lock); - } else { - spin_lock(&rq2->lock); - spin_lock(&rq1->lock); - } + + l1 = &rq1->lock; + l2 = &rq2->lock; + if (rq1 > rq2) { + l1 = l2; + l2 = &rq1->lock; } + + spin_lock(l1); + spin_lock(l2); } /* @@ -2005,10 +1851,7 @@ static void double_rq_unlock(struct rq * __releases(rq2->lock) { spin_unlock(&rq1->lock); - if (rq1 != rq2) - spin_unlock(&rq2->lock); - else - __release(rq2->lock); + spin_unlock(&rq2->lock); } /* @@ -2045,9 +1888,12 @@ static void sched_migrate_task(struct ta struct migration_req req; unsigned long flags; struct rq *rq; + int cpu; rq = task_rq_lock(p, &flags); + cpu = task_cpu(p); if (!cpu_isset(dest_cpu, p->cpus_allowed) + || 
cpu == dest_cpu || unlikely(cpu_is_offline(dest_cpu))) goto out; @@ -2094,8 +1940,10 @@ static void pull_task(struct rq *src_rq, set_task_cpu(p, this_cpu); inc_nr_running(p, this_rq); enqueue_task(p, this_array); - p->timestamp = (p->timestamp - src_rq->most_recent_timestamp) - + this_rq->most_recent_timestamp; + + if (this_array == this_rq->expired) + update_min_expired_prio(p, this_rq); + /* * Note that idle threads have a prio of MAX_PRIO, for this test * to be always true for them. @@ -2110,7 +1958,7 @@ static void pull_task(struct rq *src_rq, static int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, struct sched_domain *sd, enum idle_type idle, - int *all_pinned) + int *all_pinned, unsigned long long now) { /* * We do not migrate tasks that are: @@ -2133,18 +1981,18 @@ int can_migrate_task(struct task_struct if (sd->nr_balance_failed > sd->cache_nice_tries) { #ifdef CONFIG_SCHEDSTATS - if (task_hot(p, rq->most_recent_timestamp, sd)) + if (task_hot(p, now, sd)) schedstat_inc(sd, lb_hot_gained[idle]); #endif return 1; } - if (task_hot(p, rq->most_recent_timestamp, sd)) + if (task_hot(p, now, sd)) return 0; return 1; } -#define rq_best_prio(rq) min((rq)->curr->prio, (rq)->best_expired_prio) +#define rq_best_prio(rq) min((rq)->curr->prio, (rq)->min_expired_prio) /* * move_tasks tries to move up to max_nr_move tasks and max_load_move weighted @@ -2164,12 +2012,14 @@ static int move_tasks(struct rq *this_rq struct list_head *head, *curr; struct task_struct *tmp; long rem_load_move; + unsigned long long now; if (max_nr_move == 0 || max_load_move == 0) goto out; rem_load_move = max_load_move; pinned = 1; + now = sched_clock_us(); this_best_prio = rq_best_prio(this_rq); best_prio = rq_best_prio(busiest); /* @@ -2228,7 +2078,7 @@ skip_queue: if (skip_for_load && idx < this_best_prio) skip_for_load = !best_prio_seen && idx == best_prio; if (skip_for_load || - !can_migrate_task(tmp, busiest, this_cpu, sd, idle, &pinned)) { + !can_migrate_task(tmp, 
busiest, this_cpu, sd, idle, &pinned, now)) { best_prio_seen |= idx == best_prio; if (curr != head) @@ -2289,6 +2139,8 @@ find_busiest_group(struct sched_domain * struct sched_group *group_min = NULL, *group_leader = NULL; #endif + prefetch(group); + max_load = this_load = total_load = total_pwr = 0; busiest_load_per_task = busiest_nr_running = 0; this_load_per_task = this_nr_running = 0; @@ -2306,6 +2158,8 @@ find_busiest_group(struct sched_domain * unsigned int balance_cpu = -1, first_idle_cpu = 0; unsigned long sum_nr_running, sum_weighted_load; + prefetch(group->next); + local_group = cpu_isset(this_cpu, group->cpumask); if (local_group) @@ -3018,7 +2872,7 @@ static inline void update_cpu_clock(struct task_struct *p, struct rq *rq, unsigned long long now) { p->sched_time += now - p->last_ran; - p->last_ran = rq->most_recent_timestamp = now; + p->last_ran = now; } /* @@ -3031,34 +2885,13 @@ unsigned long long current_sched_time(co unsigned long flags; local_irq_save(flags); - ns = p->sched_time + sched_clock() - p->last_ran; + ns = p->sched_time + sched_clock_us() - p->last_ran; local_irq_restore(flags); return ns; } /* - * We place interactive tasks back into the active array, if possible. - * - * To guarantee that this does not starve expired tasks we ignore the - * interactivity of a task if the first expired task had to wait more - * than a 'reasonable' amount of time. This deadline timeout is - * load-dependent, as the frequency of array switched decreases with - * increasing number of running tasks. We also ignore the interactivity - * if a better static_prio task has expired: - */ -static inline int expired_starving(struct rq *rq) -{ - if (rq->curr->static_prio > rq->best_expired_prio) - return 1; - if (!STARVATION_LIMIT || !rq->expired_timestamp) - return 0; - if (jiffies - rq->expired_timestamp > STARVATION_LIMIT * rq->nr_running) - return 1; - return 0; -} - -/* * Account user cpu time to a process. 
* @p: the process that the cpu time gets accounted to * @hardirq_offset: the offset to subtract from hardirq_count() @@ -3133,77 +2966,26 @@ void account_steal_time(struct task_stru static void task_running_tick(struct rq *rq, struct task_struct *p) { - if (p->array != rq->active) { - /* Task has expired but was not scheduled yet */ - set_tsk_need_resched(p); + int allowed; + int ts; + + /* Task might have expired already, but not scheduled off yet */ + if (unlikely(p->used_slice == -1)) return; - } - spin_lock(&rq->lock); - /* - * The task was running during this tick - update the - * time slice counter. Note: we do not update a thread's - * priority until it either goes to sleep or uses up its - * timeslice. This makes it possible for interactive tasks - * to use up their timeslices at their highest priority levels. - */ - if (rt_task(p)) { - /* - * RR tasks need a special form of timeslice management. - * FIFO tasks have no timeslices. - */ - if ((p->policy == SCHED_RR) && !--p->time_slice) { - p->time_slice = task_timeslice(p); - p->first_time_slice = 0; - set_tsk_need_resched(p); - /* put it at the end of the queue: */ - requeue_task(p, rq->active); - } - goto out_unlock; - } - if (!--p->time_slice) { - dequeue_task(p, rq->active); - set_tsk_need_resched(p); - p->prio = effective_prio(p); - p->time_slice = task_timeslice(p); - p->first_time_slice = 0; - - if (!rq->expired_timestamp) - rq->expired_timestamp = jiffies; - if (!TASK_INTERACTIVE(p) || expired_starving(rq)) { - enqueue_task(p, rq->expired); - if (p->static_prio < rq->best_expired_prio) - rq->best_expired_prio = p->static_prio; - } else - enqueue_task(p, rq->active); - } else { - /* - * Prevent a too long timeslice allowing a task to monopolize - * the CPU. We do this by splitting up the timeslice into - * smaller pieces. - * - * Note: this does not mean the task's timeslices expire or - * get lost in any way, they just might be preempted by - * another task of equal priority. 
(one with higher - * priority would have preempted this task already.) We - * requeue this task to the end of the list on this priority - * level, which is in essence a round-robin of tasks with - * equal priority. - * - * This only applies to tasks in the interactive - * delta range with at least TIMESLICE_GRANULARITY to requeue. - */ - if (TASK_INTERACTIVE(p) && !((task_timeslice(p) - - p->time_slice) % TIMESLICE_GRANULARITY(p)) && - (p->time_slice >= TIMESLICE_GRANULARITY(p)) && - (p->array == rq->active)) { + /* SCHED_FIFO tasks have no timeslice */ + if (unlikely(p->policy == SCHED_FIFO)) + return; - requeue_task(p, rq->active); - set_tsk_need_resched(p); - } + /* p was running during this tick. Update its time slice counter. */ + p->used_slice++; + ts = task_timeslice(p, rq); + allowed = ts - min(p->over_slice, ts >> 1); + if (unlikely(p->used_slice >= allowed)) { + p->over_slice = allowed - p->used_slice; + p->used_slice = -1; + set_tsk_need_resched(p); } -out_unlock: - spin_unlock(&rq->lock); } /* @@ -3215,7 +2997,7 @@ out_unlock: */ void scheduler_tick(void) { - unsigned long long now = sched_clock(); + unsigned long long now = sched_clock_us(); struct task_struct *p = current; int cpu = smp_processor_id(); struct rq *rq = cpu_rq(cpu); @@ -3269,12 +3051,6 @@ EXPORT_SYMBOL(sub_preempt_count); #endif -static inline int interactive_sleep(enum sleep_type sleep_type) -{ - return (sleep_type == SLEEP_INTERACTIVE || - sleep_type == SLEEP_INTERRUPTED); -} - /* * schedule() is the main scheduler function. */ @@ -3285,56 +3061,46 @@ asmlinkage void __sched schedule(void) struct list_head *queue; unsigned long long now; unsigned long run_time; - int cpu, idx, new_prio; + int cpu, idx; long *switch_count; struct rq *rq; + prev = current; + prefetch(prev); + + profile_hit(SCHED_PROFILING, __builtin_return_address(0)); + /* * Test if we are atomic. Since do_exit() needs to call into * schedule() atomically, we ignore that path for now. 
* Otherwise, whine if we are scheduling when we should not be. */ - if (unlikely(in_atomic() && !current->exit_state)) { - printk(KERN_ERR "BUG: scheduling while atomic: " - "%s/0x%08x/%d\n", - current->comm, preempt_count(), current->pid); - debug_show_held_locks(current); - if (irqs_disabled()) - print_irqtrace_events(current); - dump_stack(); + if (unlikely(in_atomic())) { + if (unlikely(!current->exit_state)) { + printk(KERN_ERR "BUG: scheduling while atomic: " + "%s/0x%08x/%d\n", + current->comm, preempt_count(), current->pid); + debug_show_held_locks(current); + if (irqs_disabled()) + print_irqtrace_events(current); + dump_stack(); + } } profile_hit(SCHED_PROFILING, __builtin_return_address(0)); -need_resched: preempt_disable(); - prev = current; - release_kernel_lock(prev); -need_resched_nonpreemptible: rq = this_rq(); + prefetchw(rq); +need_resched: + cpu = smp_processor_id(); + release_kernel_lock(prev); - /* - * The idle thread is not allowed to schedule! - * Remove this check after it has been exercised a bit. - */ - if (unlikely(prev == rq->idle) && prev->state != TASK_RUNNING) { - printk(KERN_ERR "bad: scheduling from the idle thread!\n"); - dump_stack(); - } - +need_resched_nobkl: schedstat_inc(rq, sched_cnt); - now = sched_clock(); - if (likely((long long)(now - prev->timestamp) < NS_MAX_SLEEP_AVG)) { - run_time = now - prev->timestamp; - if (unlikely((long long)(now - prev->timestamp) < 0)) - run_time = 0; - } else - run_time = NS_MAX_SLEEP_AVG; - - /* - * Tasks charged proportionately less run_time at high sleep_avg to - * delay them losing their interactive status - */ - run_time /= (CURRENT_BONUS(prev) ? 
: 1); + now = sched_clock_us(); + run_time = now - prev->timestamp; + prev->timestamp = prev->last_ran = now; + add_task_time(prev, run_time, STIME_RUN); spin_lock_irq(&rq->lock); @@ -3342,24 +3108,42 @@ need_resched_nonpreemptible: if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { switch_count = &prev->nvcsw; if (unlikely((prev->state & TASK_INTERRUPTIBLE) && - unlikely(signal_pending(prev)))) + unlikely(signal_pending(prev)))) { prev->state = TASK_RUNNING; - else { + } else { if (prev->state == TASK_UNINTERRUPTIBLE) rq->nr_uninterruptible++; deactivate_task(prev, rq); + goto no_check_expired; } } - cpu = smp_processor_id(); - if (unlikely(!rq->nr_running)) { + if (unlikely(prev->used_slice == -1)) { + dequeue_task(prev, prev->array); + /* SCHED_FIFO can come in here too, from sched_yield */ + array = rq->active; + if (!rt_task(prev)) { + array = rq->expired; + prev->prio = task_priority(prev); + update_min_expired_prio(prev, rq); + } + enqueue_task(prev, array); + prev->used_slice = 0; + goto no_check_idle; + } +no_check_expired: + + if (!rq->nr_running) { + rq->min_expired_prio = MAX_PRIO; + rq->array_sequence++; idle_balance(cpu, rq); if (!rq->nr_running) { + schedstat_inc(rq, sched_goidle); next = rq->idle; - rq->expired_timestamp = 0; goto switch_tasks; } } +no_check_idle: array = rq->active; if (unlikely(!array->nr_active)) { @@ -3370,72 +3154,55 @@ need_resched_nonpreemptible: rq->active = rq->expired; rq->expired = array; array = rq->active; - rq->expired_timestamp = 0; - rq->best_expired_prio = MAX_PRIO; + rq->array_sequence++; + rq->min_expired_prio = MAX_PRIO; } idx = sched_find_first_bit(array->bitmap); queue = array->queue + idx; next = list_entry(queue->next, struct task_struct, run_list); - if (!rt_task(next) && interactive_sleep(next->sleep_type)) { - unsigned long long delta = now - next->timestamp; - if (unlikely((long long)(now - next->timestamp) < 0)) - delta = 0; - - if (next->sleep_type == SLEEP_INTERACTIVE) - delta = delta * 
(ON_RUNQUEUE_WEIGHT * 128 / 100) / 128; - - array = next->array; - new_prio = recalc_task_prio(next, next->timestamp + delta); - - if (unlikely(next->prio != new_prio)) { - dequeue_task(next, array); - next->prio = new_prio; - enqueue_task(next, array); - } - } - next->sleep_type = SLEEP_NORMAL; switch_tasks: - if (next == rq->idle) - schedstat_inc(rq, sched_goidle); + prefetch(&next->mm); prefetch(next); prefetch_stack(next); clear_tsk_need_resched(prev); - rcu_qsctr_inc(task_cpu(prev)); + rcu_qsctr_inc(cpu); update_cpu_clock(prev, rq, now); - prev->sleep_avg -= run_time; - if ((long)prev->sleep_avg <= 0) - prev->sleep_avg = 0; - prev->timestamp = prev->last_ran = now; - sched_info_switch(prev, next); if (likely(prev != next)) { + struct task_struct *from; + + prefetch(next->mm); + prefetch(next->thread_info); + next->timestamp = next->last_ran = now; rq->nr_switches++; rq->curr = next; ++*switch_count; prepare_task_switch(rq, next); - prev = context_switch(rq, prev, next); - barrier(); + from = context_switch(rq, prev, next); /* * this_rq must be evaluated again because prev may have moved * CPUs since it called schedule(), thus the 'rq' on its stack * frame will be invalid. 
*/ - finish_task_switch(this_rq(), prev); + rq = this_rq(); + finish_task_switch(rq, from); } else spin_unlock_irq(&rq->lock); - prev = current; if (unlikely(reacquire_kernel_lock(prev) < 0)) - goto need_resched_nonpreemptible; + goto need_resched_nobkl; preempt_enable_no_resched(); - if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) + if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) { + preempt_disable(); + rq = this_rq(); goto need_resched; + } } EXPORT_SYMBOL(schedule); @@ -3916,7 +3683,7 @@ void set_user_nice(struct task_struct *p p->static_prio = NICE_TO_PRIO(nice); set_load_weight(p); old_prio = p->prio; - p->prio = effective_prio(p); + p->prio = task_priority(p); delta = p->prio - old_prio; if (array) { @@ -4047,14 +3814,9 @@ static void __setscheduler(struct task_s p->policy = policy; p->rt_priority = prio; - p->normal_prio = normal_prio(p); + p->normal_prio = task_priority(p); /* we are holding p->pi_lock already */ p->prio = rt_mutex_getprio(p); - /* - * SCHED_BATCH tasks are treated as perpetual CPU hogs: - */ - if (policy == SCHED_BATCH) - p->sleep_avg = 0; set_load_weight(p); } @@ -4149,8 +3911,11 @@ recheck: deactivate_task(p, rq); oldprio = p->prio; __setscheduler(p, policy, param->sched_priority); + if (policy == SCHED_FIFO || policy == SCHED_RR) + p->used_slice = 0; + if (array) { - __activate_task(p, rq); + __activate_task(p, rq, rq->active); /* * Reschedule if we are currently running on this runqueue and * our priority decreased, or if we are not currently running on @@ -4438,46 +4203,13 @@ asmlinkage long sys_sched_getaffinity(pi */ asmlinkage long sys_sched_yield(void) { - struct rq *rq = this_rq_lock(); - struct prio_array *array = current->array, *target = rq->expired; - - schedstat_inc(rq, yld_cnt); - /* - * We implement yielding by moving the task into the expired - * queue. - * - * (special rule: RT tasks will just roundrobin in the active - * array.) 
- */ - if (rt_task(current)) - target = rq->active; - - if (array->nr_active == 1) { - schedstat_inc(rq, yld_act_empty); - if (!rq->expired->nr_active) - schedstat_inc(rq, yld_both_empty); - } else if (!rq->expired->nr_active) - schedstat_inc(rq, yld_exp_empty); - - if (array != target) { - dequeue_task(current, array); - enqueue_task(current, target); - } else - /* - * requeue_task is cheaper so perform that if possible. - */ - requeue_task(current, array); - - /* - * Since we are going to call schedule() anyway, there's - * no need to preempt or enable interrupts: - */ - __release(rq->lock); - spin_release(&rq->lock.dep_map, 1, _THIS_IP_); - _raw_spin_unlock(&rq->lock); - preempt_enable_no_resched(); - - schedule(); + local_irq_disable(); +#ifdef CONFIG_SCHEDSTATS + schedstat_inc(this_rq(), yld_cnt); +#endif + current->used_slice = -1; + set_need_resched(); + local_irq_enable(); return 0; } @@ -4566,6 +4298,15 @@ void __sched yield(void) { set_current_state(TASK_RUNNING); sys_sched_yield(); + /* + * Kernel-space yield won't follow the schedule upon + * return from syscall path. Unless preempt is enabled, + * however in that case, preempt_schedule doesn't drop + * the bkl, which is needed in some paths. + * + * Must call schedule() here. + */ + schedule(); } EXPORT_SYMBOL(yield); @@ -4662,6 +4403,8 @@ long sys_sched_rr_get_interval(pid_t pid struct task_struct *p; int retval = -EINVAL; struct timespec t; + unsigned long flags; + struct rq *rq; if (pid < 0) goto out_nounlock; @@ -4676,8 +4419,10 @@ long sys_sched_rr_get_interval(pid_t pid if (retval) goto out_unlock; + rq = task_rq_lock(p, &flags); jiffies_to_timespec(p->policy == SCHED_FIFO ? - 0 : task_timeslice(p), &t); + 0 : task_timeslice(p, rq), &t); + task_rq_unlock(rq, &flags); read_unlock(&tasklist_lock); retval = copy_to_user(interval, &t, sizeof(t)) ? 
-EFAULT : 0; out_nounlock: @@ -4805,11 +4550,12 @@ void __cpuinit init_idle(struct task_str struct rq *rq = cpu_rq(cpu); unsigned long flags; - idle->timestamp = sched_clock(); - idle->sleep_avg = 0; + idle->timestamp = sched_clock_us(); idle->array = NULL; idle->prio = idle->normal_prio = MAX_PRIO; idle->state = TASK_RUNNING; + idle->used_slice = 0; + idle->over_slice = 0; idle->cpus_allowed = cpumask_of_cpu(cpu); set_task_cpu(idle, cpu); @@ -4928,16 +4674,8 @@ static int __migrate_task(struct task_st set_task_cpu(p, dest_cpu); if (p->array) { - /* - * Sync timestamp with rq_dest's before activating. - * The same thing could be achieved by doing this step - * afterwards, and pretending it was a local activate. - * This way is cleaner and logically correct. - */ - p->timestamp = p->timestamp - rq_src->most_recent_timestamp - + rq_dest->most_recent_timestamp; deactivate_task(p, rq_src); - __activate_task(p, rq_dest); + activate_task(p, rq_dest, 0); if (TASK_PREEMPTS_CURR(p, rq_dest)) resched_task(rq_dest->curr); } @@ -6771,7 +6509,7 @@ void __init sched_init(void) rq->nr_running = 0; rq->active = rq->arrays; rq->expired = rq->arrays + 1; - rq->best_expired_prio = MAX_PRIO; + rq->min_expired_prio = MAX_PRIO; #ifdef CONFIG_SMP rq->sd = NULL; @@ -6791,7 +6529,7 @@ void __init sched_init(void) INIT_LIST_HEAD(array->queue + k); __clear_bit(k, array->bitmap); } - // delimiter for bitsearch + /* delimiter for bitsearch */ __set_bit(MAX_PRIO, array->bitmap); } } @@ -6864,10 +6602,10 @@ void normalize_rt_tasks(void) array = p->array; if (array) - deactivate_task(p, task_rq(p)); + deactivate_task(p, rq); __setscheduler(p, SCHED_NORMAL, 0); if (array) { - __activate_task(p, task_rq(p)); + __activate_task(p, rq, array); resched_task(rq->curr); } Index: linux-2.6/mm/oom_kill.c =================================================================== --- linux-2.6.orig/mm/oom_kill.c 2007-03-22 20:44:00.000000000 +1100 +++ linux-2.6/mm/oom_kill.c 2007-03-22 20:44:17.000000000 +1100 @@ 
-287,11 +287,10 @@ static void __oom_kill_task(struct task_ printk(KERN_ERR "Killed process %d (%s)\n", p->pid, p->comm); /* - * We give our sacrificial lamb high priority and access to - * all the memory it needs. That way it should be able to - * exit() and clear out its resources quickly... + * We give our sacrificial lamb access to all the memory it needs. + * That way it should be able to exit() and clear out its resources + * quickly... */ - p->time_slice = HZ; set_tsk_thread_flag(p, TIF_MEMDIE); force_sig(SIGKILL, p); Index: linux-2.6/kernel/sysctl.c =================================================================== --- linux-2.6.orig/kernel/sysctl.c 2007-03-22 20:44:00.000000000 +1100 +++ linux-2.6/kernel/sysctl.c 2007-03-22 20:44:17.000000000 +1100 @@ -76,6 +76,9 @@ extern int pid_max_min, pid_max_max; extern int sysctl_drop_caches; extern int percpu_pagelist_fraction; extern int compat_log; +extern int sched_base_timeslice; +extern int sched_min_base; +extern int sched_max_base; /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */ static int maxolduid = 65535; @@ -603,7 +606,17 @@ static ctl_table kern_table[] = { .proc_handler = &proc_dointvec, }, #endif - + { + .ctl_name = KERN_SCHED_TIMESLICE, + .procname = "base_timeslice", + .data = &sched_base_timeslice, + .maxlen = sizeof (sched_base_timeslice), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &sched_min_base, + .extra2 = &sched_max_base, + }, { .ctl_name = 0 } }; @@ -859,6 +872,7 @@ static ctl_table vm_table[] = { .extra1 = &zero, }, #endif + { .ctl_name = 0 } }; Index: linux-2.6/include/linux/init_task.h =================================================================== --- linux-2.6.orig/include/linux/init_task.h 2007-03-22 20:44:00.000000000 +1100 +++ linux-2.6/include/linux/init_task.h 2007-03-22 20:44:17.000000000 +1100 @@ -99,16 +99,15 @@ extern struct group_info init_groups; .usage = ATOMIC_INIT(2), \ .flags = 0, 
\ .lock_depth = -1, \ - .prio = MAX_PRIO-20, \ - .static_prio = MAX_PRIO-20, \ - .normal_prio = MAX_PRIO-20, \ + .prio = MAX_PRIO-29, \ + .static_prio = MAX_PRIO-29, \ + .normal_prio = MAX_PRIO-29, \ .policy = SCHED_NORMAL, \ .cpus_allowed = CPU_MASK_ALL, \ .mm = NULL, \ .active_mm = &init_mm, \ .run_list = LIST_HEAD_INIT(tsk.run_list), \ .ioprio = 0, \ - .time_slice = HZ, \ .tasks = LIST_HEAD_INIT(tsk.tasks), \ .ptrace_children= LIST_HEAD_INIT(tsk.ptrace_children), \ .ptrace_list = LIST_HEAD_INIT(tsk.ptrace_list), \ Index: linux-2.6/include/linux/sysctl.h =================================================================== --- linux-2.6.orig/include/linux/sysctl.h 2007-03-22 20:44:00.000000000 +1100 +++ linux-2.6/include/linux/sysctl.h 2007-03-22 20:44:17.000000000 +1100 @@ -165,6 +165,7 @@ enum KERN_MAX_LOCK_DEPTH=74, KERN_NMI_WATCHDOG=75, /* int: enable/disable nmi watchdog */ KERN_PANIC_ON_NMI=76, /* int: whether we will panic on an unrecovered */ + KERN_SCHED_TIMESLICE=77, /* int: base timeslice for scheduler */ };