---
 include/linux/sched.h     |    7 +
 kernel/exit.c             |    2 
 kernel/posix-cpu-timers.c |   24 ++---
 kernel/rtmutex.c          |    2 
 kernel/sched.c            |  191 +++++++++++++++++++++++++---------------------
 kernel/sched_debug.c      |   14 +--
 kernel/sched_fair.c       |   80 +++++++++++++------
 kernel/sched_rt.c         |   21 +++++
 kernel/sysctl.c           |    8 +
 9 files changed, 218 insertions(+), 131 deletions(-)

Index: linux/include/linux/sched.h
===================================================================
--- linux.orig/include/linux/sched.h
+++ linux/include/linux/sched.h
@@ -798,12 +798,15 @@ struct sched_class {
 	void (*dequeue_task) (struct rq *rq, struct task_struct *p);
 	void (*requeue_task) (struct rq *rq, struct task_struct *p);
 
+	void (*check_preempt_curr) (struct rq *rq, struct task_struct *p);
+
 	struct task_struct * (*pick_next_task) (struct rq *rq);
 	void (*put_prev_task) (struct rq *rq, struct task_struct *p);
 
 	struct task_struct * (*load_balance_start) (struct rq *rq);
 	struct task_struct * (*load_balance_next) (struct rq *rq);
 	void (*task_tick) (struct rq *rq, struct task_struct *p);
+	void (*task_new) (struct rq *rq, struct task_struct *p);
 	void (*task_init) (struct rq *rq, struct task_struct *p);
 };
@@ -838,7 +841,8 @@ struct task_struct {
 	u64 last_ran;
 
 	s64 wait_runtime;
-	u64 exec_runtime, fair_key;
+	u64 sum_exec_runtime, fair_key;
+	s64 sum_wait_runtime;
 	long nice_offset;
 	s64 hog_limit;
@@ -1236,6 +1240,7 @@ extern char * sched_print_task_state(str
 
 extern unsigned int sysctl_sched_max_hog_history;
 extern unsigned int sysctl_sched_granularity;
+extern unsigned int sysctl_sched_child_runs_first;
 
 #ifdef CONFIG_RT_MUTEXES
 extern int rt_mutex_getprio(struct task_struct *p);
Index: linux/kernel/exit.c
===================================================================
--- linux.orig/kernel/exit.c
+++ linux/kernel/exit.c
@@ -112,7 +112,7 @@ static void __exit_signal(struct task_st
 		sig->maj_flt += tsk->maj_flt;
 		sig->nvcsw += tsk->nvcsw;
 		sig->nivcsw += tsk->nivcsw;
-		sig->sum_sched_runtime += tsk->exec_runtime;
+		sig->sum_sched_runtime += tsk->sum_exec_runtime;
 		sig = NULL; /* Marker for below. */
 	}
Index: linux/kernel/posix-cpu-timers.c
===================================================================
--- linux.orig/kernel/posix-cpu-timers.c
+++ linux/kernel/posix-cpu-timers.c
@@ -161,7 +161,7 @@ static inline cputime_t virt_ticks(struc
 }
 static inline unsigned long long sched_ns(struct task_struct *p)
 {
-	return (p == current) ? current_sched_runtime(p) : p->exec_runtime;
+	return (p == current) ? current_sched_runtime(p) : p->sum_exec_runtime;
 }
 
 int posix_cpu_clock_getres(const clockid_t which_clock, struct timespec *tp)
@@ -249,7 +249,7 @@ static int cpu_clock_sample_group_locked
 		cpu->sched = p->signal->sum_sched_runtime;
 		/* Add in each other live thread. */
 		while ((t = next_thread(t)) != p) {
-			cpu->sched += t->exec_runtime;
+			cpu->sched += t->sum_exec_runtime;
 		}
 		cpu->sched += sched_ns(p);
 		break;
@@ -422,7 +422,7 @@ int posix_cpu_timer_del(struct k_itimer
  */
 static void cleanup_timers(struct list_head *head,
 			   cputime_t utime, cputime_t stime,
-			   unsigned long long exec_runtime)
+			   unsigned long long sum_exec_runtime)
 {
 	struct cpu_timer_list *timer, *next;
 	cputime_t ptime = cputime_add(utime, stime);
@@ -451,10 +451,10 @@ static void cleanup_timers(struct list_h
 	++head;
 	list_for_each_entry_safe(timer, next, head, entry) {
 		list_del_init(&timer->entry);
-		if (timer->expires.sched < exec_runtime) {
+		if (timer->expires.sched < sum_exec_runtime) {
 			timer->expires.sched = 0;
 		} else {
-			timer->expires.sched -= exec_runtime;
+			timer->expires.sched -= sum_exec_runtime;
 		}
 	}
 }
@@ -467,7 +467,7 @@ static void cleanup_timers(struct list_h
 void posix_cpu_timers_exit(struct task_struct *tsk)
 {
 	cleanup_timers(tsk->cpu_timers,
-		       tsk->utime, tsk->stime, tsk->exec_runtime);
+		       tsk->utime, tsk->stime, tsk->sum_exec_runtime);
 
 }
 void posix_cpu_timers_exit_group(struct task_struct *tsk)
@@ -475,7 +475,7 @@ void posix_cpu_timers_exit_group(struct
 	cleanup_timers(tsk->signal->cpu_timers,
 		       cputime_add(tsk->utime, tsk->signal->utime),
 		       cputime_add(tsk->stime, tsk->signal->stime),
-		       tsk->exec_runtime + tsk->signal->sum_sched_runtime);
+		       tsk->sum_exec_runtime + tsk->signal->sum_sched_runtime);
 }
 
@@ -536,7 +536,7 @@ static void process_timer_rebalance(stru
 		nsleft = max_t(unsigned long long, nsleft, 1);
 		do {
 			if (likely(!(t->flags & PF_EXITING))) {
-				ns = t->exec_runtime + nsleft;
+				ns = t->sum_exec_runtime + nsleft;
 				if (t->it_sched_expires == 0 ||
 				    t->it_sched_expires > ns) {
 					t->it_sched_expires = ns;
@@ -1004,7 +1004,7 @@ static void check_thread_timers(struct t
 		struct cpu_timer_list *t = list_entry(timers->next,
 						      struct cpu_timer_list,
 						      entry);
-		if (!--maxfire || tsk->exec_runtime < t->expires.sched) {
+		if (!--maxfire || tsk->sum_exec_runtime < t->expires.sched) {
 			tsk->it_sched_expires = t->expires.sched;
 			break;
 		}
@@ -1049,7 +1049,7 @@ static void check_process_timers(struct
 	do {
 		utime = cputime_add(utime, t->utime);
 		stime = cputime_add(stime, t->stime);
-		sum_sched_runtime += t->exec_runtime;
+		sum_sched_runtime += t->sum_exec_runtime;
 		t = next_thread(t);
 	} while (t != tsk);
 	ptime = cputime_add(utime, stime);
@@ -1208,7 +1208,7 @@ static void check_process_timers(struct
 			t->it_virt_expires = ticks;
 		}
 
-		sched = t->exec_runtime + sched_left;
+		sched = t->sum_exec_runtime + sched_left;
 		if (sched_expires && (t->it_sched_expires == 0 ||
 				      t->it_sched_expires > sched)) {
 			t->it_sched_expires = sched;
@@ -1300,7 +1300,7 @@ void run_posix_cpu_timers(struct task_st
 
 	if (UNEXPIRED(prof) && UNEXPIRED(virt) &&
 	    (tsk->it_sched_expires == 0 ||
-	     tsk->exec_runtime < tsk->it_sched_expires))
+	     tsk->sum_exec_runtime < tsk->it_sched_expires))
 		return;
 
 #undef UNEXPIRED
Index: linux/kernel/rtmutex.c
===================================================================
--- linux.orig/kernel/rtmutex.c
+++ linux/kernel/rtmutex.c
@@ -337,7 +337,7 @@ static inline int try_to_steal_lock(stru
 	 * interrupted, so we would delay a waiter with higher
 	 * priority as current->normal_prio.
 	 *
-	 * Note: in the rare case of a SCHED_FAIR task changing
+	 * Note: in the rare case of a SCHED_OTHER task changing
 	 * its priority and thus stealing the lock, next->task
 	 * might be current:
 	 */
Index: linux/kernel/sched.c
===================================================================
--- linux.orig/kernel/sched.c
+++ linux/kernel/sched.c
@@ -101,8 +101,10 @@ unsigned long long __attribute__((weak))
 #define MIN_TIMESLICE max(5 * HZ / 1000, 1)
 #define DEF_TIMESLICE (100 * HZ / 1000)
 
-#define TASK_PREEMPTS_CURR(p, rq) \
-	((p)->prio < (rq)->curr->prio)
+static inline void check_preempt_curr(struct rq *rq, struct task_struct *p)
+{
+	p->sched_class->check_preempt_curr(rq, p);
+}
 
 #define SCALE_PRIO(x, prio) \
 	max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_TIMESLICE)
@@ -227,7 +229,7 @@ char * sched_print_task_state(struct tas
 	P(exec_start);
 	P(last_ran);
 	P(wait_runtime);
-	P(exec_runtime);
+	P(sum_exec_runtime);
 #undef P
 
 	t0 = sched_clock();
@@ -431,38 +433,46 @@ static inline struct rq *this_rq_lock(vo
 	return rq;
 }
 
-#include "sched_stats.h"
-#include "sched_rt.c"
-#include "sched_fair.c"
-#include "sched_debug.c"
+/*
+ * resched_task - mark a task 'to be rescheduled now'.
+ *
+ * On UP this means the setting of the need_resched flag, on SMP it
+ * might also involve a cross-CPU call to trigger the scheduler on
+ * the target CPU.
+ */
+#ifdef CONFIG_SMP
 
-#define sched_class_highest (&rt_sched_class)
+#ifndef tsk_is_polling
+#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG)
+#endif
 
-static void enqueue_task(struct rq *rq, struct task_struct *p)
+static void resched_task(struct task_struct *p)
 {
-	sched_info_queued(p);
-	p->sched_class->enqueue_task(rq, p);
-	p->on_rq = 1;
-}
+	int cpu;
 
-static void dequeue_task(struct rq *rq, struct task_struct *p)
-{
-	p->sched_class->dequeue_task(rq, p);
-	p->on_rq = 0;
-}
+	assert_spin_locked(&task_rq(p)->lock);
 
-static void requeue_task(struct rq *rq, struct task_struct *p)
-{
-	p->sched_class->requeue_task(rq, p);
-}
+	if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED)))
+		return;
 
-/*
- * __normal_prio - return the priority that is based on the static prio
- */
-static inline int __normal_prio(struct task_struct *p)
+	set_tsk_thread_flag(p, TIF_NEED_RESCHED);
+
+	cpu = task_cpu(p);
+	if (cpu == smp_processor_id())
+		return;
+
+	/* NEED_RESCHED must be visible before we test polling */
+	smp_mb();
+	if (!tsk_is_polling(p))
+		smp_send_reschedule(cpu);
+}
+#else
+static inline void resched_task(struct task_struct *p)
 {
-	return p->static_prio;
+	assert_spin_locked(&task_rq(p)->lock);
+	set_tsk_need_resched(p);
 }
+#endif
 
 /*
  * To aid in avoiding the subversion of "niceness" due to uneven distribution
@@ -528,6 +538,41 @@ static inline void dec_nr_running(struct
 	dec_raw_weighted_load(rq, p);
 }
 
+static void activate_task(struct rq *rq, struct task_struct *p);
+
+#include "sched_stats.h"
+#include "sched_rt.c"
+#include "sched_fair.c"
+#include "sched_debug.c"
+
+#define sched_class_highest (&rt_sched_class)
+
+static void enqueue_task(struct rq *rq, struct task_struct *p)
+{
+	sched_info_queued(p);
+	p->sched_class->enqueue_task(rq, p);
+	p->on_rq = 1;
+}
+
+static void dequeue_task(struct rq *rq, struct task_struct *p)
+{
+	p->sched_class->dequeue_task(rq, p);
+	p->on_rq = 0;
+}
+
+static void requeue_task(struct rq *rq, struct task_struct *p)
+{
+	p->sched_class->requeue_task(rq, p);
+}
+
+/*
+ * __normal_prio - return the priority that is based on the static prio
+ */
+static inline int __normal_prio(struct task_struct *p)
+{
+	return p->static_prio;
+}
+
 /*
  * Calculate the expected normal priority: i.e. priority
  * without taking RT-inheritance into account. Might be
@@ -593,47 +638,6 @@ static void deactivate_task(struct rq *r
 	dec_nr_running(p, rq);
 }
 
-/*
- * resched_task - mark a task 'to be rescheduled now'.
- *
- * On UP this means the setting of the need_resched flag, on SMP it
- * might also involve a cross-CPU call to trigger the scheduler on
- * the target CPU.
- */
-#ifdef CONFIG_SMP
-
-#ifndef tsk_is_polling
-#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG)
-#endif
-
-static void resched_task(struct task_struct *p)
-{
-	int cpu;
-
-	assert_spin_locked(&task_rq(p)->lock);
-
-	if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED)))
-		return;
-
-	set_tsk_thread_flag(p, TIF_NEED_RESCHED);
-
-	cpu = task_cpu(p);
-	if (cpu == smp_processor_id())
-		return;
-
-	/* NEED_RESCHED must be visible before we test polling */
-	smp_mb();
-	if (!tsk_is_polling(p))
-		smp_send_reschedule(cpu);
-}
-#else
-static inline void resched_task(struct task_struct *p)
-{
-	assert_spin_locked(&task_rq(p)->lock);
-	set_tsk_need_resched(p);
-}
-#endif
-
 /**
  * task_curr - is this task currently executing on a CPU?
  * @p: the task in question.
@@ -1113,10 +1117,8 @@ out_activate:
 	 * the waker guarantees that the freshly woken up task is going
 	 * to be considered on this CPU.)
 	 */
-	if (!sync || cpu != this_cpu) {
-		if (TASK_PREEMPTS_CURR(p, rq))
-			resched_task(rq->curr);
-	}
+	if (!sync || cpu != this_cpu)
+		check_preempt_curr(rq, p);
 
 	success = 1;
 out_running:
@@ -1159,7 +1161,8 @@ static void task_running_tick(struct rq
 static void __sched_fork(struct task_struct *p)
 {
 	p->wait_start_fair = p->exec_start = p->last_ran = 0;
-	p->exec_runtime = p->wait_runtime = 0;
+	p->sum_exec_runtime = p->wait_runtime = 0;
+	p->sum_wait_runtime = 0;
 	INIT_LIST_HEAD(&p->run_list);
 	p->on_rq = 0;
@@ -1208,6 +1211,12 @@ void sched_fork(struct task_struct *p, i
 }
 
 /*
+ * After fork, child runs first. (default) If set to 0 then
+ * parent will (try to) run first.
+ */
+unsigned int __read_mostly sysctl_sched_child_runs_first = 1;
+
+/*
  * wake_up_new_task - wake up a newly created task for the first time.
  *
  * This function will do some initial scheduler statistics housekeeping
@@ -1218,15 +1227,25 @@ void fastcall wake_up_new_task(struct ta
 {
 	unsigned long flags;
 	struct rq *rq;
+	int this_cpu;
 
 	rq = task_rq_lock(p, &flags);
 	BUG_ON(p->state != TASK_RUNNING);
+	this_cpu = smp_processor_id(); /* parent's CPU */
 
 	p->prio = effective_prio(p);
 
-	activate_task(rq, p);
-	if (TASK_PREEMPTS_CURR(p, rq))
-		resched_task(rq->curr);
+	if (!sysctl_sched_child_runs_first || (clone_flags & CLONE_VM) ||
+			task_cpu(p) != this_cpu || !current->on_rq) {
+		activate_task(rq, p);
+	} else {
+		/*
+		 * Let the scheduling class do new task startup
+		 * management (if any):
+		 */
+		p->sched_class->task_new(rq, p);
+	}
+	check_preempt_curr(rq, p);
 
 	task_rq_unlock(rq, &flags);
 }
@@ -1559,8 +1578,7 @@ static void pull_task(struct rq *src_rq,
 	 * Note that idle threads have a prio of MAX_PRIO, for this test
 	 * to be always true for them.
 	 */
-	if (TASK_PREEMPTS_CURR(p, this_rq))
-		resched_task(this_rq->curr);
+	check_preempt_curr(this_rq, p);
 }
 
 /*
@@ -2467,7 +2485,7 @@ DEFINE_PER_CPU(struct kernel_stat, kstat
 EXPORT_PER_CPU_SYMBOL(kstat);
 
 /*
- * Return current->exec_runtime plus any more ns on the sched_clock
+ * Return current->sum_exec_runtime plus any more ns on the sched_clock
  * that have not yet been banked.
  */
 unsigned long long current_sched_runtime(const struct task_struct *p)
@@ -2476,7 +2494,7 @@ unsigned long long current_sched_runtime
 	unsigned long flags;
 
 	local_irq_save(flags);
-	ns = p->exec_runtime + sched_clock() - p->last_ran;
+	ns = p->sum_exec_runtime + sched_clock() - p->last_ran;
 	local_irq_restore(flags);
 
 	return ns;
@@ -3176,8 +3194,9 @@ void rt_mutex_setprio(struct task_struct
 		if (task_running(rq, p)) {
 			if (p->prio > oldprio)
 				resched_task(rq->curr);
-		} else if (TASK_PREEMPTS_CURR(p, rq))
-			resched_task(rq->curr);
+		} else {
+			check_preempt_curr(rq, p);
+		}
 	}
 	task_rq_unlock(rq, &flags);
 }
@@ -3469,8 +3488,9 @@ recheck:
 		if (task_running(rq, p)) {
 			if (p->prio > oldprio)
 				resched_task(rq->curr);
-		} else if (TASK_PREEMPTS_CURR(p, rq))
-			resched_task(rq->curr);
+		} else {
+			check_preempt_curr(rq, p);
+		}
 	}
 	__task_rq_unlock(rq);
 	spin_unlock_irqrestore(&p->pi_lock, flags);
@@ -4183,8 +4203,7 @@ static int __migrate_task(struct task_st
 	if (p->on_rq) {
 		deactivate_task(rq_src, p);
 		activate_task(rq_dest, p);
-		if (TASK_PREEMPTS_CURR(p, rq_dest))
-			resched_task(rq_dest->curr);
+		check_preempt_curr(rq_dest, p);
 	}
 	ret = 1;
 out:
Index: linux/kernel/sched_debug.c
===================================================================
--- linux.orig/kernel/sched_debug.c
+++ linux/kernel/sched_debug.c
@@ -51,10 +51,10 @@ print_task(struct seq_file *m, struct rq
 		p->prio,
 		p->nice_offset,
 		p->hog_limit,
-		p->wait_start_fair,
+		p->wait_start_fair - rq->fair_clock,
 		p->exec_start,
-		p->last_ran,
-		p->exec_runtime);
+		p->sum_exec_runtime,
+		p->sum_wait_runtime);
 }
 
 static void print_rq(struct seq_file *m, struct rq *rq, u64 now)
@@ -66,10 +66,10 @@ static void print_rq(struct seq_file *m,
 	"\nrunnable tasks:\n"
 	" task PID tree-key delta waiting"
 	" switches prio nice-offset hog-limit wstart-fair exec-start"
-	" last-ran exec-runtime\n"
-	"------------------------------------------------------------------"
-	"------------------------------------------------------------------"
-	"-------------------\n");
+	" sum-exec sum-wait\n"
+	"---------------------------------------------------------"
+	"--------------------------------------------------------------------"
+	"--------------------------\n");
 
 	curr = first_fair(rq);
 	while (curr) {
Index: linux/kernel/sched_fair.c
===================================================================
--- linux.orig/kernel/sched_fair.c
+++ linux/kernel/sched_fair.c
@@ -27,15 +27,9 @@ static void __enqueue_task_fair(struct r
 {
 	struct rb_node **link = &rq->tasks_timeline.rb_node;
 	struct rb_node *parent = NULL;
+	long long key = p->fair_key;
 	struct task_struct *entry;
 	int leftmost = 1;
-	long long key;
-
-	key = rq->fair_clock - p->wait_runtime;
-	if (unlikely(p->nice_offset))
-		key += p->nice_offset / (rq->nr_running + 1);
-
-	p->fair_key = key;
 
 	/*
 	 * Find the right place in the rbtree:
@@ -48,9 +42,9 @@ static void __enqueue_task_fair(struct r
 		 * the same key stay together.
 		 */
 		if (key < entry->fair_key) {
-			link = &(*link)->rb_left;
+			link = &parent->rb_left;
 		} else {
-			link = &(*link)->rb_right;
+			link = &parent->rb_right;
 			leftmost = 0;
 		}
 	}
@@ -138,7 +132,7 @@ static inline void update_curr(struct rq
 	delta_exec = convert_delta(rq, now - curr->exec_start, curr);
 	delta_fair = delta_exec/rq->nr_running;
 
-	curr->exec_runtime += delta_exec;
+	curr->sum_exec_runtime += delta_exec;
 	curr->exec_start = now;
 
 	rq->fair_clock += delta_fair;
@@ -182,6 +176,11 @@ update_stats_enqueue(struct rq *rq, stru
 	 */
 	if (p != rq->curr)
 		update_stats_wait_start(rq, p, now);
+
+	/*
+	 * Update the key:
+	 */
+	p->fair_key = rq->fair_clock - p->wait_runtime + p->nice_offset;
 }
 
 /*
@@ -195,6 +194,7 @@ static inline void update_stats_wait_end
 	delta = scale_nice_down(rq, p, delta);
 
 	p->wait_runtime += delta;
+	p->sum_wait_runtime += delta;
 	rq->wait_runtime += delta;
 
 	p->wait_start_fair = 0;
@@ -275,6 +275,24 @@ static void requeue_task_fair(struct rq
 	p->on_rq = 1;
 }
 
+/*
+ * Preempt the current task with a newly woken task if needed:
+ */
+static void check_preempt_curr_fair(struct rq *rq, struct task_struct *p)
+{
+	struct task_struct *curr = rq->curr;
+	long long __delta = curr->fair_key - p->fair_key;
+
+	/*
+	 * Take scheduling granularity into account - do not
+	 * preempt the current task unless the best task has
+	 * a larger than sched_granularity fairness advantage:
+	 */
+	if (p->prio < curr->prio ||
+			__delta > (unsigned long long)sysctl_sched_granularity)
+		resched_task(curr);
+}
+
 static struct task_struct * pick_next_task_fair(struct rq *rq)
 {
 	struct task_struct *p = __pick_next_task_fair(rq);
@@ -362,25 +380,36 @@ static void task_tick_fair(struct rq *rq
 	 * Dequeue and enqueue the task to update its
 	 * position within the tree:
 	 */
-	dequeue_task_fair(rq, curr);
-	curr->on_rq = 0;
-	enqueue_task_fair(rq, curr);
-	curr->on_rq = 1;
+	requeue_task_fair(rq, curr);
 
 	/*
 	 * Reschedule if another task tops the current one.
-	 *
-	 * Take scheduling granularity into account - do not
-	 * preempt the current task unless the best task has
-	 * a larger than sched_granularity fairness advantage:
 	 */
 	next = __pick_next_task_fair(rq);
-	if (next != curr) {
-		unsigned long long __delta = curr->fair_key - next->fair_key;
+	if (next != curr)
+		check_preempt_curr(rq, next);
+}
 
-		if (__delta > (unsigned long long)sysctl_sched_granularity)
-			set_tsk_need_resched(curr);
-	}
+/*
+ * Share the fairness runtime between parent and child, thus the
+ * total amount of pressure for CPU stays equal - new tasks
+ * get a chance to run but frequent forkers are not allowed to
+ * monopolize the CPU. Note: the parent runqueue is locked,
+ * the child is not running yet.
+ */
+static void task_new_fair(struct rq *rq, struct task_struct *p)
+{
+	sched_info_queued(p);
+	update_stats_enqueue(rq, p);
+	/*
+	 * Child runs first: we let it run before the parent
+	 * until it reschedules once. We set up a key so that
+	 * it will preempt the parent:
+	 */
+	p->fair_key = current->fair_key - sysctl_sched_granularity - 1;
+	__enqueue_task_fair(rq, p);
+	p->on_rq = 1;
+	inc_nr_running(p, rq);
 }
 
 static inline long
@@ -418,6 +447,8 @@ hog_limit(struct rq *rq, struct task_str
 	return -(long long)limit;
 }
 
+#define NICE_OFFSET_GRANULARITY 100000
+
 /*
 * Calculate and cache the nice offset and the hog limit values:
 */
@@ -441,12 +472,15 @@ struct sched_class fair_sched_class __re
 	.dequeue_task = dequeue_task_fair,
 	.requeue_task = requeue_task_fair,
 
+	.check_preempt_curr = check_preempt_curr_fair,
+
 	.pick_next_task = pick_next_task_fair,
 	.put_prev_task = put_prev_task_fair,
 
 	.load_balance_start = load_balance_start_fair,
 	.load_balance_next = load_balance_next_fair,
 
 	.task_tick = task_tick_fair,
+	.task_new = task_new_fair,
 	.task_init = task_init_fair,
 };
Index: linux/kernel/sched_rt.c
===================================================================
--- linux.orig/kernel/sched_rt.c
+++ linux/kernel/sched_rt.c
@@ -34,6 +34,15 @@ static void requeue_task_rt(struct rq *r
 	list_move_tail(&p->run_list, array->queue + p->prio);
 }
 
+/*
+ * Preempt the current task with a newly woken task if needed:
+ */
+static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p)
+{
+	if (p->prio < rq->curr->prio)
+		resched_task(rq->curr);
+}
+
 static struct task_struct * pick_next_task_rt(struct rq *rq)
 {
 	struct prio_array *array = &rq->active;
@@ -140,6 +149,15 @@ static void task_tick_rt(struct rq *rq,
 	}
 }
 
+/*
+ * No parent/child timeslice management necessary for RT tasks,
+ * just activate them:
+ */
+static void task_new_rt(struct rq *rq, struct task_struct *p)
+{
+	activate_task(rq, p);
+}
+
 static void task_init_rt(struct rq *rq, struct task_struct *p)
 {
 }
@@ -149,6 +167,8 @@ static struct sched_class rt_sched_class
 	.dequeue_task = dequeue_task_rt,
 	.requeue_task = requeue_task_rt,
 
+	.check_preempt_curr = check_preempt_curr_rt,
+
 	.pick_next_task = pick_next_task_rt,
 	.put_prev_task = put_prev_task_rt,
 
@@ -156,5 +176,6 @@ static struct sched_class rt_sched_class
 	.load_balance_next = load_balance_next_rt,
 
 	.task_tick = task_tick_rt,
+	.task_new = task_new_rt,
 	.task_init = task_init_rt,
 };
Index: linux/kernel/sysctl.c
===================================================================
--- linux.orig/kernel/sysctl.c
+++ linux/kernel/sysctl.c
@@ -222,6 +222,14 @@ static ctl_table kern_table[] = {
 		.proc_handler	= &proc_dointvec,
 	},
 	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "sched_child_runs_first",
+		.data		= &sysctl_sched_child_runs_first,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+	{
 		.ctl_name	= KERN_PANIC,
 		.procname	= "panic",
 		.data		= &panic_timeout,
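
The standalone sketch below is not part of the patch and uses made-up, simplified stand-in types (the toy_* names) rather than the real kernel structures. It only illustrates, under those assumptions, how the pieces above fit together: the generic check_preempt_curr() helper dispatches through the task's scheduling class, the fair class preempts only on a priority win or when the fair_key delta exceeds the granularity threshold, and task_new_fair() keys a forked child just below its parent so the child sorts ahead and triggers a preemption right after fork.

/*
 * Illustrative user-space sketch only -- NOT kernel code. Types and the
 * granularity value are simplified assumptions, not the patch's real ones.
 */
#include <stdio.h>

struct toy_rq;
struct toy_task;

struct toy_sched_class {
	void (*check_preempt_curr)(struct toy_rq *rq, struct toy_task *p);
};

struct toy_task {
	const char *comm;
	int prio;				/* lower value = higher priority */
	long long fair_key;			/* position on the fair timeline */
	const struct toy_sched_class *sched_class;
};

struct toy_rq {
	struct toy_task *curr;
	int need_resched;
};

/* Stand-in for the real sysctl; the value here is arbitrary. */
static const long long sysctl_sched_granularity = 2000000;

static void resched_task(struct toy_rq *rq)
{
	rq->need_resched = 1;
}

/* Models check_preempt_curr_fair(): priority or fairness advantage wins. */
static void check_preempt_curr_fair(struct toy_rq *rq, struct toy_task *p)
{
	struct toy_task *curr = rq->curr;
	long long delta = curr->fair_key - p->fair_key;

	if (p->prio < curr->prio || delta > sysctl_sched_granularity)
		resched_task(rq);
}

static const struct toy_sched_class fair_sched_class = {
	.check_preempt_curr = check_preempt_curr_fair,
};

/* Models the new helper in sched.c: dispatch through the task's class. */
static void check_preempt_curr(struct toy_rq *rq, struct toy_task *p)
{
	p->sched_class->check_preempt_curr(rq, p);
}

int main(void)
{
	struct toy_task parent = { "parent", 120, 10000000, &fair_sched_class };
	struct toy_task child  = { "child",  120, 0,        &fair_sched_class };
	struct toy_rq rq = { .curr = &parent, .need_resched = 0 };

	/* Models task_new_fair(): place the child just ahead of the parent. */
	child.fair_key = parent.fair_key - sysctl_sched_granularity - 1;

	check_preempt_curr(&rq, &child);
	printf("child fair_key=%lld parent fair_key=%lld need_resched=%d\n",
	       child.fair_key, parent.fair_key, rq.need_resched);
	return 0;
}

Built with any C compiler, this prints need_resched=1: with equal priorities, the child's key advantage alone exceeds the granularity threshold, which is exactly the condition the fair class uses to decide preemption.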