CFS modified by rohit, v2.6.22.13, v24 --- Documentation/kernel-parameters.txt | 43 Documentation/sched-design-CFS.txt | 186 + Makefile | 2 arch/i386/Kconfig | 11 arch/i386/kernel/smpboot.c | 12 arch/i386/kernel/tsc.c | 14 arch/ia64/kernel/setup.c | 6 arch/mips/kernel/smp.c | 11 arch/sparc/kernel/smp.c | 10 arch/sparc64/kernel/smp.c | 27 block/cfq-iosched.c | 3 drivers/acpi/processor_idle.c | 32 drivers/kvm/kvm.h | 11 fs/pipe.c | 9 fs/proc/array.c | 144 - fs/proc/base.c | 73 fs/proc/proc_misc.c | 15 include/asm-generic/bitops/sched.h | 21 include/linux/cgroup.h | 12 include/linux/cpu.h | 2 include/linux/cpuset.h | 5 include/linux/hardirq.h | 13 include/linux/init_task.h | 2 include/linux/kernel.h | 7 include/linux/kernel_stat.h | 3 include/linux/nodemask.h | 94 include/linux/nsproxy.h | 3 include/linux/sched.h | 408 ++ include/linux/taskstats.h | 10 include/linux/topology.h | 10 include/linux/user_namespace.h | 61 init/Kconfig | 26 init/main.c | 8 kernel/Makefile | 2 kernel/delayacct.c | 18 kernel/exit.c | 11 kernel/fork.c | 15 kernel/ksysfs.c | 8 kernel/nsproxy.c | 73 kernel/posix-cpu-timers.c | 34 kernel/sched.c | 5142 +++++++++++++++++++----------------- kernel/sched_debug.c | 394 ++ kernel/sched_fair.c | 1164 ++++++++ kernel/sched_idletask.c | 89 kernel/sched_rt.c | 259 + kernel/sched_stats.h | 236 + kernel/softirq.c | 1 kernel/sys.c | 6 kernel/sysctl.c | 128 kernel/timer.c | 7 kernel/tsacct.c | 4 kernel/user.c | 304 +- kernel/user_namespace.c | 88 lib/Kconfig.debug | 9 mm/memory_hotplug.c | 7 mm/page_alloc.c | 50 mm/vmscan.c | 4 net/unix/af_unix.c | 4 58 files changed, 6523 insertions(+), 2828 deletions(-) Index: linux-cfs-2.6.22.13.q/Documentation/kernel-parameters.txt =================================================================== --- linux-cfs-2.6.22.13.q.orig/Documentation/kernel-parameters.txt +++ linux-cfs-2.6.22.13.q/Documentation/kernel-parameters.txt @@ -1009,49 +1009,6 @@ and is between 256 and 4096 characters. mga= [HW,DRM] - migration_cost= - [KNL,SMP] debug: override scheduler migration costs - Format: ,,... - This debugging option can be used to override the - default scheduler migration cost matrix. The numbers - are indexed by 'CPU domain distance'. - E.g. migration_cost=1000,2000,3000 on an SMT NUMA - box will set up an intra-core migration cost of - 1 msec, an inter-core migration cost of 2 msecs, - and an inter-node migration cost of 3 msecs. - - WARNING: using the wrong values here can break - scheduler performance, so it's only for scheduler - development purposes, not production environments. - - migration_debug= - [KNL,SMP] migration cost auto-detect verbosity - Format=<0|1|2> - If a system's migration matrix reported at bootup - seems erroneous then this option can be used to - increase verbosity of the detection process. - We default to 0 (no extra messages), 1 will print - some more information, and 2 will be really - verbose (probably only useful if you also have a - serial console attached to the system). - - migration_factor= - [KNL,SMP] multiply/divide migration costs by a factor - Format= - This debug option can be used to proportionally - increase or decrease the auto-detected migration - costs for all entries of the migration matrix. - E.g. migration_factor=150 will increase migration - costs by 50%. (and thus the scheduler will be less - eager migrating cache-hot tasks) - migration_factor=80 will decrease migration costs - by 20%. 
(thus the scheduler will be more eager to - migrate tasks) - - WARNING: using the wrong values here can break - scheduler performance, so it's only for scheduler - development purposes, not production environments. - mousedev.tap_time= [MOUSE] Maximum time between finger touching and leaving touchpad surface for touch to be considered Index: linux-cfs-2.6.22.13.q/Documentation/sched-design-CFS.txt =================================================================== --- /dev/null +++ linux-cfs-2.6.22.13.q/Documentation/sched-design-CFS.txt @@ -0,0 +1,186 @@ + +This is the CFS scheduler. + +80% of CFS's design can be summed up in a single sentence: CFS basically +models an "ideal, precise multi-tasking CPU" on real hardware. + +"Ideal multi-tasking CPU" is a (non-existent :-)) CPU that has 100% +physical power and which can run each task at precise equal speed, in +parallel, each at 1/nr_running speed. For example: if there are 2 tasks +running then it runs each at 50% physical power - totally in parallel. + +On real hardware, we can run only a single task at once, so while that +one task runs, the other tasks that are waiting for the CPU are at a +disadvantage - the current task gets an unfair amount of CPU time. In +CFS this fairness imbalance is expressed and tracked via the per-task +p->wait_runtime (nanosec-unit) value. "wait_runtime" is the amount of +time the task should now run on the CPU for it to become completely fair +and balanced. + +( small detail: on 'ideal' hardware, the p->wait_runtime value would + always be zero - no task would ever get 'out of balance' from the + 'ideal' share of CPU time. ) + +CFS's task picking logic is based on this p->wait_runtime value and it +is thus very simple: it always tries to run the task with the largest +p->wait_runtime value. In other words, CFS tries to run the task with +the 'gravest need' for more CPU time. So CFS always tries to split up +CPU time between runnable tasks as close to 'ideal multitasking +hardware' as possible. + +Most of the rest of CFS's design just falls out of this really simple +concept, with a few add-on embellishments like nice levels, +multiprocessing and various algorithm variants to recognize sleepers. + +In practice it works like this: the system runs a task a bit, and when +the task schedules (or a scheduler tick happens) the task's CPU usage is +'accounted for': the (small) time it just spent using the physical CPU +is deducted from p->wait_runtime. [minus the 'fair share' it would have +gotten anyway]. Once p->wait_runtime gets low enough so that another +task becomes the 'leftmost task' of the time-ordered rbtree it maintains +(plus a small amount of 'granularity' distance relative to the leftmost +task so that we do not over-schedule tasks and trash the cache) then the +new leftmost task is picked and the current task is preempted. + +The rq->fair_clock value tracks the 'CPU time a runnable task would have +fairly gotten, had it been runnable during that time'. So by using +rq->fair_clock values we can accurately timestamp and measure the +'expected CPU time' a task should have gotten. All runnable tasks are +sorted in the rbtree by the "rq->fair_clock - p->wait_runtime" key, and +CFS picks the 'leftmost' task and sticks to it. As the system progresses +forwards, newly woken tasks are put into the tree more and more to the +right - slowly but surely giving a chance for every task to become the +'leftmost task' and thus get on the CPU within a deterministic amount of +time. 
+ +Some implementation details: + + - the introduction of Scheduling Classes: an extensible hierarchy of + scheduler modules. These modules encapsulate scheduling policy + details and are handled by the scheduler core without the core + code assuming about them too much. + + - sched_fair.c implements the 'CFS desktop scheduler': it is a + replacement for the vanilla scheduler's SCHED_OTHER interactivity + code. + + I'd like to give credit to Con Kolivas for the general approach here: + he has proven via RSDL/SD that 'fair scheduling' is possible and that + it results in better desktop scheduling. Kudos Con! + + The CFS patch uses a completely different approach and implementation + from RSDL/SD. My goal was to make CFS's interactivity quality exceed + that of RSDL/SD, which is a high standard to meet :-) Testing + feedback is welcome to decide this one way or another. [ and, in any + case, all of SD's logic could be added via a kernel/sched_sd.c module + as well, if Con is interested in such an approach. ] + + CFS's design is quite radical: it does not use runqueues, it uses a + time-ordered rbtree to build a 'timeline' of future task execution, + and thus has no 'array switch' artifacts (by which both the vanilla + scheduler and RSDL/SD are affected). + + CFS uses nanosecond granularity accounting and does not rely on any + jiffies or other HZ detail. Thus the CFS scheduler has no notion of + 'timeslices' and has no heuristics whatsoever. There is only one + central tunable: + + /proc/sys/kernel/sched_granularity_ns + + which can be used to tune the scheduler from 'desktop' (low + latencies) to 'server' (good batching) workloads. It defaults to a + setting suitable for desktop workloads. SCHED_BATCH is handled by the + CFS scheduler module too. + + Due to its design, the CFS scheduler is not prone to any of the + 'attacks' that exist today against the heuristics of the stock + scheduler: fiftyp.c, thud.c, chew.c, ring-test.c, massive_intr.c all + work fine and do not impact interactivity and produce the expected + behavior. + + the CFS scheduler has a much stronger handling of nice levels and + SCHED_BATCH: both types of workloads should be isolated much more + agressively than under the vanilla scheduler. + + ( another detail: due to nanosec accounting and timeline sorting, + sched_yield() support is very simple under CFS, and in fact under + CFS sched_yield() behaves much better than under any other + scheduler i have tested so far. ) + + - sched_rt.c implements SCHED_FIFO and SCHED_RR semantics, in a simpler + way than the vanilla scheduler does. It uses 100 runqueues (for all + 100 RT priority levels, instead of 140 in the vanilla scheduler) + and it needs no expired array. + + - reworked/sanitized SMP load-balancing: the runqueue-walking + assumptions are gone from the load-balancing code now, and + iterators of the scheduling modules are used. The balancing code got + quite a bit simpler as a result. + + +Group scheduler extension to CFS +================================ + +Normally the scheduler operates on individual tasks and strives to provide +fair CPU time to each task. Sometimes, it may be desirable to group tasks +and provide fair CPU time to each such task group. For example, it may +be desirable to first provide fair CPU time to each user on the system +and then to each task belonging to a user. + +CONFIG_FAIR_GROUP_SCHED strives to achieve exactly that. It lets +SCHED_NORMAL/BATCH tasks be be grouped and divides CPU time fairly among such +groups. 
At present, there are two (mutually exclusive) mechanisms to group +tasks for CPU bandwidth control purpose: + + - Based on user id (CONFIG_FAIR_USER_SCHED) + In this option, tasks are grouped according to their user id. + - Based on "cgroup" pseudo filesystem (CONFIG_FAIR_CGROUP_SCHED) + This options lets the administrator create arbitrary groups + of tasks, using the "cgroup" pseudo filesystem. See + Documentation/cgroups.txt for more information about this + filesystem. + +Only one of these options to group tasks can be chosen and not both. + +Group scheduler tunables: + +When CONFIG_FAIR_USER_SCHED is defined, a directory is created in sysfs for +each new user and a "cpu_share" file is added in that directory. + + # cd /sys/kernel/uids + # cat 512/cpu_share # Display user 512's CPU share + 1024 + # echo 2048 > 512/cpu_share # Modify user 512's CPU share + # cat 512/cpu_share # Display user 512's CPU share + 2048 + # + +CPU bandwidth between two users are divided in the ratio of their CPU shares. +For ex: if you would like user "root" to get twice the bandwidth of user +"guest", then set the cpu_share for both the users such that "root"'s +cpu_share is twice "guest"'s cpu_share + + +When CONFIG_FAIR_CGROUP_SCHED is defined, a "cpu.shares" file is created +for each group created using the pseudo filesystem. See example steps +below to create task groups and modify their CPU share using the "cgroups" +pseudo filesystem + + # mkdir /dev/cpuctl + # mount -t cgroup -ocpu none /dev/cpuctl + # cd /dev/cpuctl + + # mkdir multimedia # create "multimedia" group of tasks + # mkdir browser # create "browser" group of tasks + + # #Configure the multimedia group to receive twice the CPU bandwidth + # #that of browser group + + # echo 2048 > multimedia/cpu.shares + # echo 1024 > browser/cpu.shares + + # firefox & # Launch firefox and move it to "browser" group + # echo > browser/tasks + + # #Launch gmplayer (or your favourite movie player) + # echo > multimedia/tasks Index: linux-cfs-2.6.22.13.q/Makefile =================================================================== --- linux-cfs-2.6.22.13.q.orig/Makefile +++ linux-cfs-2.6.22.13.q/Makefile @@ -1,7 +1,7 @@ VERSION = 2 PATCHLEVEL = 6 SUBLEVEL = 22 -EXTRAVERSION = .13 +EXTRAVERSION = .13-cfs-v24 NAME = Holy Dancing Manatees, Batman! # *DOCUMENTATION* Index: linux-cfs-2.6.22.13.q/arch/i386/Kconfig =================================================================== --- linux-cfs-2.6.22.13.q.orig/arch/i386/Kconfig +++ linux-cfs-2.6.22.13.q/arch/i386/Kconfig @@ -210,6 +210,17 @@ config X86_ES7000 endchoice +config SCHED_NO_NO_OMIT_FRAME_POINTER + bool "Single-depth WCHAN output" + default y + help + Calculate simpler /proc//wchan values. If this option + is disabled then wchan values will recurse back to the + caller function. This provides more accurate wchan values, + at the expense of slightly more scheduling overhead. + + If in doubt, say "Y". 
+ config PARAVIRT bool "Paravirtualization support (EXPERIMENTAL)" depends on EXPERIMENTAL Index: linux-cfs-2.6.22.13.q/arch/i386/kernel/smpboot.c =================================================================== --- linux-cfs-2.6.22.13.q.orig/arch/i386/kernel/smpboot.c +++ linux-cfs-2.6.22.13.q/arch/i386/kernel/smpboot.c @@ -941,17 +941,6 @@ exit: } #endif -static void smp_tune_scheduling(void) -{ - if (cpu_khz) { - /* cache size in kB */ - long cachesize = boot_cpu_data.x86_cache_size; - - if (cachesize > 0) - max_cache_size = cachesize * 1024; - } -} - /* * Cycle through the processors sending APIC IPIs to boot each. */ @@ -980,7 +969,6 @@ static void __init smp_boot_cpus(unsigne x86_cpu_to_apicid[0] = boot_cpu_physical_apicid; current_thread_info()->cpu = 0; - smp_tune_scheduling(); set_cpu_sibling_map(0); Index: linux-cfs-2.6.22.13.q/arch/i386/kernel/tsc.c =================================================================== --- linux-cfs-2.6.22.13.q.orig/arch/i386/kernel/tsc.c +++ linux-cfs-2.6.22.13.q/arch/i386/kernel/tsc.c @@ -4,6 +4,7 @@ * See comments there for proper credits. */ +#include #include #include #include @@ -26,6 +27,7 @@ static int tsc_enabled; * an extra value to store the TSC freq */ unsigned int tsc_khz; +EXPORT_SYMBOL_GPL(tsc_khz); int tsc_disable; @@ -57,10 +59,11 @@ __setup("notsc", tsc_setup); */ static int tsc_unstable; -static inline int check_tsc_unstable(void) +int check_tsc_unstable(void) { return tsc_unstable; } +EXPORT_SYMBOL_GPL(check_tsc_unstable); /* Accellerators for sched_clock() * convert from cycles(64bits) => nanoseconds (64bits) @@ -83,7 +86,7 @@ static inline int check_tsc_unstable(voi * * -johnstul@us.ibm.com "math is hard, lets go shopping!" */ -static unsigned long cyc2ns_scale __read_mostly; +unsigned long cyc2ns_scale __read_mostly; #define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */ @@ -106,8 +109,13 @@ unsigned long long sched_clock(void) /* * Fall back to jiffies if there's no TSC available: + * ( But note that we still use it if the TSC is marked + * unstable. We do this because unlike Time Of Day, + * the scheduler clock tolerates small errors and it's + * very important for it to be as fast as the platform + * can achive it. 
) */ - if (unlikely(!tsc_enabled)) + if (unlikely(!tsc_enabled && !tsc_unstable)) /* No locking but a rare wrong value is not a big deal: */ return (jiffies_64 - INITIAL_JIFFIES) * (1000000000 / HZ); Index: linux-cfs-2.6.22.13.q/arch/ia64/kernel/setup.c =================================================================== --- linux-cfs-2.6.22.13.q.orig/arch/ia64/kernel/setup.c +++ linux-cfs-2.6.22.13.q/arch/ia64/kernel/setup.c @@ -805,7 +805,6 @@ static void __cpuinit get_max_cacheline_size (void) { unsigned long line_size, max = 1; - unsigned int cache_size = 0; u64 l, levels, unique_caches; pal_cache_config_info_t cci; s64 status; @@ -835,8 +834,6 @@ get_max_cacheline_size (void) line_size = 1 << cci.pcci_line_size; if (line_size > max) max = line_size; - if (cache_size < cci.pcci_cache_size) - cache_size = cci.pcci_cache_size; if (!cci.pcci_unified) { status = ia64_pal_cache_config_info(l, /* cache_type (instruction)= */ 1, @@ -853,9 +850,6 @@ get_max_cacheline_size (void) ia64_i_cache_stride_shift = cci.pcci_stride; } out: -#ifdef CONFIG_SMP - max_cache_size = max(max_cache_size, cache_size); -#endif if (max > ia64_max_cacheline_size) ia64_max_cacheline_size = max; } Index: linux-cfs-2.6.22.13.q/arch/mips/kernel/smp.c =================================================================== --- linux-cfs-2.6.22.13.q.orig/arch/mips/kernel/smp.c +++ linux-cfs-2.6.22.13.q/arch/mips/kernel/smp.c @@ -51,16 +51,6 @@ int __cpu_logical_map[NR_CPUS]; /* Map EXPORT_SYMBOL(phys_cpu_present_map); EXPORT_SYMBOL(cpu_online_map); -/* This happens early in bootup, can't really do it better */ -static void smp_tune_scheduling (void) -{ - struct cache_desc *cd = ¤t_cpu_data.scache; - unsigned long cachesize = cd->linesz * cd->sets * cd->ways; - - if (cachesize > max_cache_size) - max_cache_size = cachesize; -} - extern void __init calibrate_delay(void); extern ATTRIB_NORET void cpu_idle(void); @@ -228,7 +218,6 @@ void __init smp_prepare_cpus(unsigned in { init_new_context(current, &init_mm); current_thread_info()->cpu = 0; - smp_tune_scheduling(); plat_prepare_cpus(max_cpus); #ifndef CONFIG_HOTPLUG_CPU cpu_present_map = cpu_possible_map; Index: linux-cfs-2.6.22.13.q/arch/sparc/kernel/smp.c =================================================================== --- linux-cfs-2.6.22.13.q.orig/arch/sparc/kernel/smp.c +++ linux-cfs-2.6.22.13.q/arch/sparc/kernel/smp.c @@ -68,16 +68,6 @@ void __cpuinit smp_store_cpu_info(int id cpu_data(id).prom_node = cpu_node; cpu_data(id).mid = cpu_get_hwmid(cpu_node); - /* this is required to tune the scheduler correctly */ - /* is it possible to have CPUs with different cache sizes? 
*/ - if (id == boot_cpu_id) { - int cache_line,cache_nlines; - cache_line = 0x20; - cache_line = prom_getintdefault(cpu_node, "ecache-line-size", cache_line); - cache_nlines = 0x8000; - cache_nlines = prom_getintdefault(cpu_node, "ecache-nlines", cache_nlines); - max_cache_size = cache_line * cache_nlines; - } if (cpu_data(id).mid < 0) panic("No MID found for CPU%d at node 0x%08d", id, cpu_node); } Index: linux-cfs-2.6.22.13.q/arch/sparc64/kernel/smp.c =================================================================== --- linux-cfs-2.6.22.13.q.orig/arch/sparc64/kernel/smp.c +++ linux-cfs-2.6.22.13.q/arch/sparc64/kernel/smp.c @@ -1163,32 +1163,6 @@ int setup_profiling_timer(unsigned int m return -EINVAL; } -static void __init smp_tune_scheduling(void) -{ - unsigned int smallest = ~0U; - int i; - - for (i = 0; i < NR_CPUS; i++) { - unsigned int val = cpu_data(i).ecache_size; - - if (val && val < smallest) - smallest = val; - } - - /* Any value less than 256K is nonsense. */ - if (smallest < (256U * 1024U)) - smallest = 256 * 1024; - - max_cache_size = smallest; - - if (smallest < 1U * 1024U * 1024U) - printk(KERN_INFO "Using max_cache_size of %uKB\n", - smallest / 1024U); - else - printk(KERN_INFO "Using max_cache_size of %uMB\n", - smallest / 1024U / 1024U); -} - /* Constrain the number of cpus to max_cpus. */ void __init smp_prepare_cpus(unsigned int max_cpus) { @@ -1206,7 +1180,6 @@ void __init smp_prepare_cpus(unsigned in } cpu_data(boot_cpu_id).udelay_val = loops_per_jiffy; - smp_tune_scheduling(); } void __devinit smp_prepare_boot_cpu(void) Index: linux-cfs-2.6.22.13.q/block/cfq-iosched.c =================================================================== --- linux-cfs-2.6.22.13.q.orig/block/cfq-iosched.c +++ linux-cfs-2.6.22.13.q/block/cfq-iosched.c @@ -1280,6 +1280,8 @@ static void cfq_init_prio_data(struct cf /* * no prio set, place us in the middle of the BE classes */ + if (tsk->policy == SCHED_IDLE) + goto set_class_idle; cfqq->ioprio = task_nice_ioprio(tsk); cfqq->ioprio_class = IOPRIO_CLASS_BE; break; @@ -1292,6 +1294,7 @@ static void cfq_init_prio_data(struct cf cfqq->ioprio_class = IOPRIO_CLASS_BE; break; case IOPRIO_CLASS_IDLE: + set_class_idle: cfqq->ioprio_class = IOPRIO_CLASS_IDLE; cfqq->ioprio = 7; cfq_clear_cfqq_idle_window(cfqq); Index: linux-cfs-2.6.22.13.q/drivers/acpi/processor_idle.c =================================================================== --- linux-cfs-2.6.22.13.q.orig/drivers/acpi/processor_idle.c +++ linux-cfs-2.6.22.13.q/drivers/acpi/processor_idle.c @@ -63,6 +63,7 @@ ACPI_MODULE_NAME("processor_idle"); #define ACPI_PROCESSOR_FILE_POWER "power" #define US_TO_PM_TIMER_TICKS(t) ((t * (PM_TIMER_FREQUENCY/1000)) / 1000) +#define PM_TIMER_TICK_NS (1000000000ULL/PM_TIMER_FREQUENCY) #define C2_OVERHEAD 4 /* 1us (3.579 ticks per us) */ #define C3_OVERHEAD 4 /* 1us (3.579 ticks per us) */ static void (*pm_idle_save) (void) __read_mostly; @@ -479,6 +480,9 @@ static void acpi_processor_idle(void) * TBD: Can't get time duration while in C1, as resumes * go to an ISR rather than here. Need to instrument * base interrupt handler. + * + * Note: the TSC better not stop in C1, sched_clock() will + * skew otherwise. 
*/ sleep_ticks = 0xFFFFFFFF; break; @@ -486,6 +490,8 @@ static void acpi_processor_idle(void) case ACPI_STATE_C2: /* Get start time (ticks) */ t1 = inl(acpi_gbl_FADT.xpm_timer_block.address); + /* Tell the scheduler that we are going deep-idle: */ + sched_clock_idle_sleep_event(); /* Invoke C2 */ acpi_state_timer_broadcast(pr, cx, 1); acpi_cstate_enter(cx); @@ -496,17 +502,22 @@ static void acpi_processor_idle(void) /* TSC halts in C2, so notify users */ mark_tsc_unstable("possible TSC halt in C2"); #endif + /* Compute time (ticks) that we were actually asleep */ + sleep_ticks = ticks_elapsed(t1, t2); + + /* Tell the scheduler how much we idled: */ + sched_clock_idle_wakeup_event(sleep_ticks*PM_TIMER_TICK_NS); + /* Re-enable interrupts */ local_irq_enable(); + /* Do not account our idle-switching overhead: */ + sleep_ticks -= cx->latency_ticks + C2_OVERHEAD; + current_thread_info()->status |= TS_POLLING; - /* Compute time (ticks) that we were actually asleep */ - sleep_ticks = - ticks_elapsed(t1, t2) - cx->latency_ticks - C2_OVERHEAD; acpi_state_timer_broadcast(pr, cx, 0); break; case ACPI_STATE_C3: - if (pr->flags.bm_check) { if (atomic_inc_return(&c3_cpu_count) == num_online_cpus()) { @@ -525,6 +536,8 @@ static void acpi_processor_idle(void) t1 = inl(acpi_gbl_FADT.xpm_timer_block.address); /* Invoke C3 */ acpi_state_timer_broadcast(pr, cx, 1); + /* Tell the scheduler that we are going deep-idle: */ + sched_clock_idle_sleep_event(); acpi_cstate_enter(cx); /* Get end time (ticks) */ t2 = inl(acpi_gbl_FADT.xpm_timer_block.address); @@ -538,12 +551,17 @@ static void acpi_processor_idle(void) /* TSC halts in C3, so notify users */ mark_tsc_unstable("TSC halts in C3"); #endif + /* Compute time (ticks) that we were actually asleep */ + sleep_ticks = ticks_elapsed(t1, t2); + /* Tell the scheduler how much we idled: */ + sched_clock_idle_wakeup_event(sleep_ticks*PM_TIMER_TICK_NS); + /* Re-enable interrupts */ local_irq_enable(); + /* Do not account our idle-switching overhead: */ + sleep_ticks -= cx->latency_ticks + C3_OVERHEAD; + current_thread_info()->status |= TS_POLLING; - /* Compute time (ticks) that we were actually asleep */ - sleep_ticks = - ticks_elapsed(t1, t2) - cx->latency_ticks - C3_OVERHEAD; acpi_state_timer_broadcast(pr, cx, 0); break; Index: linux-cfs-2.6.22.13.q/drivers/kvm/kvm.h =================================================================== --- linux-cfs-2.6.22.13.q.orig/drivers/kvm/kvm.h +++ linux-cfs-2.6.22.13.q/drivers/kvm/kvm.h @@ -11,6 +11,7 @@ #include #include #include +#include #include #include "vmx.h" @@ -531,6 +532,16 @@ void kvm_mmu_free_some_pages(struct kvm_ int kvm_hypercall(struct kvm_vcpu *vcpu, struct kvm_run *run); +static inline void kvm_guest_enter(void) +{ + current->flags |= PF_VCPU; +} + +static inline void kvm_guest_exit(void) +{ + current->flags &= ~PF_VCPU; +} + static inline int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva, u32 error_code) { Index: linux-cfs-2.6.22.13.q/fs/pipe.c =================================================================== --- linux-cfs-2.6.22.13.q.orig/fs/pipe.c +++ linux-cfs-2.6.22.13.q/fs/pipe.c @@ -45,8 +45,7 @@ void pipe_wait(struct pipe_inode_info *p * Pipes are system-local resources, so sleeping on them * is considered a noninteractive wait: */ - prepare_to_wait(&pipe->wait, &wait, - TASK_INTERRUPTIBLE | TASK_NONINTERACTIVE); + prepare_to_wait(&pipe->wait, &wait, TASK_INTERRUPTIBLE); if (pipe->inode) mutex_unlock(&pipe->inode->i_mutex); schedule(); @@ -323,7 +322,7 @@ redo: /* Signal writers asynchronously 
that there is more room. */ if (do_wakeup) { - wake_up_interruptible(&pipe->wait); + wake_up_interruptible_sync(&pipe->wait); kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); } if (ret > 0) @@ -496,7 +495,7 @@ redo2: out: mutex_unlock(&inode->i_mutex); if (do_wakeup) { - wake_up_interruptible(&pipe->wait); + wake_up_interruptible_sync(&pipe->wait); kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); } if (ret > 0) @@ -590,7 +589,7 @@ pipe_release(struct inode *inode, int de if (!pipe->readers && !pipe->writers) { free_pipe_info(inode); } else { - wake_up_interruptible(&pipe->wait); + wake_up_interruptible_sync(&pipe->wait); kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); } Index: linux-cfs-2.6.22.13.q/fs/proc/array.c =================================================================== --- linux-cfs-2.6.22.13.q.orig/fs/proc/array.c +++ linux-cfs-2.6.22.13.q/fs/proc/array.c @@ -62,6 +62,8 @@ #include #include #include +#include +#include #include #include #include @@ -76,9 +78,7 @@ #include #include -#include #include -#include #include #include "internal.h" @@ -87,10 +87,10 @@ do { memcpy(buffer, string, strlen(string)); \ buffer += strlen(string); } while (0) -static inline char * task_name(struct task_struct *p, char * buf) +static inline char *task_name(struct task_struct *p, char *buf) { int i; - char * name; + char *name; char tcomm[sizeof(p->comm)]; get_task_comm(tcomm, p); @@ -138,7 +138,7 @@ static const char *task_state_array[] = "X (dead)" /* 32 */ }; -static inline const char * get_task_state(struct task_struct *tsk) +static inline const char *get_task_state(struct task_struct *tsk) { unsigned int state = (tsk->state & (TASK_RUNNING | TASK_INTERRUPTIBLE | @@ -156,7 +156,7 @@ static inline const char * get_task_stat return *p; } -static inline char * task_state(struct task_struct *p, char *buffer) +static inline char *task_state(struct task_struct *p, char *buffer) { struct group_info *group_info; int g; @@ -165,7 +165,6 @@ static inline char * task_state(struct t rcu_read_lock(); buffer += sprintf(buffer, "State:\t%s\n" - "SleepAVG:\t%lu%%\n" "Tgid:\t%d\n" "Pid:\t%d\n" "PPid:\t%d\n" @@ -173,9 +172,8 @@ static inline char * task_state(struct t "Uid:\t%d\t%d\t%d\t%d\n" "Gid:\t%d\t%d\t%d\t%d\n", get_task_state(p), - (p->sleep_avg/1024)*100/(1020000000/1024), - p->tgid, p->pid, - pid_alive(p) ? rcu_dereference(p->real_parent)->tgid : 0, + p->tgid, p->pid, + pid_alive(p) ? rcu_dereference(p->real_parent)->tgid : 0, pid_alive(p) && p->ptrace ? 
rcu_dereference(p->parent)->pid : 0, p->uid, p->euid, p->suid, p->fsuid, p->gid, p->egid, p->sgid, p->fsgid); @@ -193,15 +191,15 @@ static inline char * task_state(struct t get_group_info(group_info); task_unlock(p); - for (g = 0; g < min(group_info->ngroups,NGROUPS_SMALL); g++) - buffer += sprintf(buffer, "%d ", GROUP_AT(group_info,g)); + for (g = 0; g < min(group_info->ngroups, NGROUPS_SMALL); g++) + buffer += sprintf(buffer, "%d ", GROUP_AT(group_info, g)); put_group_info(group_info); buffer += sprintf(buffer, "\n"); return buffer; } -static char * render_sigset_t(const char *header, sigset_t *set, char *buffer) +static char *render_sigset_t(const char *header, sigset_t *set, char *buffer) { int i, len; @@ -241,7 +239,7 @@ static void collect_sigign_sigcatch(stru } } -static inline char * task_sig(struct task_struct *p, char *buffer) +static inline char *task_sig(struct task_struct *p, char *buffer) { unsigned long flags; sigset_t pending, shpending, blocked, ignored, caught; @@ -291,14 +289,23 @@ static inline char *task_cap(struct task cap_t(p->cap_effective)); } -int proc_pid_status(struct task_struct *task, char * buffer) +static inline char *task_context_switch_counts(struct task_struct *p, + char *buffer) { - char * orig = buffer; + return buffer + sprintf(buffer, "voluntary_ctxt_switches:\t%lu\n" + "nonvoluntary_ctxt_switches:\t%lu\n", + p->nvcsw, + p->nivcsw); +} + +int proc_pid_status(struct task_struct *task, char *buffer) +{ + char *orig = buffer; struct mm_struct *mm = get_task_mm(task); buffer = task_name(task, buffer); buffer = task_state(task, buffer); - + if (mm) { buffer = task_mem(mm, buffer); mmput(mm); @@ -309,10 +316,70 @@ int proc_pid_status(struct task_struct * #if defined(CONFIG_S390) buffer = task_show_regs(task, buffer); #endif + buffer = task_context_switch_counts(task, buffer); return buffer - orig; } -static int do_task_stat(struct task_struct *task, char * buffer, int whole) +/* + * Use precise platform statistics if available: + */ +#ifdef CONFIG_VIRT_CPU_ACCOUNTING +static cputime_t task_utime(struct task_struct *p) +{ + return p->utime; +} + +static cputime_t task_stime(struct task_struct *p) +{ + return p->stime; +} +#else +static cputime_t task_utime(struct task_struct *p) +{ + clock_t utime = cputime_to_clock_t(p->utime), + total = utime + cputime_to_clock_t(p->stime); + u64 temp; + + /* + * Use CFS's precise accounting: + */ + temp = (u64)nsec_to_clock_t(p->se.sum_exec_runtime); + + if (total) { + temp *= utime; + do_div(temp, total); + } + utime = (clock_t)temp; + + p->prev_utime = max(p->prev_utime, clock_t_to_cputime(utime)); + return p->prev_utime; +} + +static cputime_t task_stime(struct task_struct *p) +{ + clock_t stime; + + /* + * Use CFS's precise accounting. 
(we subtract utime from + * the total, to make sure the total observed by userspace + * grows monotonically - apps rely on that): + */ + stime = nsec_to_clock_t(p->se.sum_exec_runtime) - + cputime_to_clock_t(task_utime(p)); + + if (stime >= 0) + p->prev_stime = max(p->prev_stime, clock_t_to_cputime(stime)); + + return p->prev_stime; +} +#endif + +static cputime_t task_gtime(struct task_struct *p) +{ + return p->gtime; +} + +static int do_task_stat(struct task_struct *task, char *buffer, int whole) { unsigned long vsize, eip, esp, wchan = ~0UL; long priority, nice; @@ -320,13 +387,14 @@ static int do_task_stat(struct task_stru sigset_t sigign, sigcatch; char state; int res; - pid_t ppid = 0, pgid = -1, sid = -1; + pid_t ppid = 0, pgid = -1, sid = -1; int num_threads = 0; struct mm_struct *mm; unsigned long long start_time; unsigned long cmin_flt = 0, cmaj_flt = 0; unsigned long min_flt = 0, maj_flt = 0; cputime_t cutime, cstime, utime, stime; + cputime_t cgtime, gtime; unsigned long rsslim = 0; char tcomm[sizeof(task->comm)]; unsigned long flags; @@ -345,6 +413,7 @@ static int do_task_stat(struct task_stru sigemptyset(&sigign); sigemptyset(&sigcatch); cutime = cstime = utime = stime = cputime_zero; + cgtime = gtime = cputime_zero; rcu_read_lock(); if (lock_task_sighand(task, &flags)) { @@ -362,6 +431,7 @@ static int do_task_stat(struct task_stru cmaj_flt = sig->cmaj_flt; cutime = sig->cutime; cstime = sig->cstime; + cgtime = sig->cgtime; rsslim = sig->rlim[RLIMIT_RSS].rlim_cur; /* add up live thread stats at the group level */ @@ -370,8 +440,9 @@ static int do_task_stat(struct task_stru do { min_flt += t->min_flt; maj_flt += t->maj_flt; - utime = cputime_add(utime, t->utime); - stime = cputime_add(stime, t->stime); + utime = cputime_add(utime, task_utime(t)); + stime = cputime_add(stime, task_stime(t)); + gtime = cputime_add(gtime, task_gtime(t)); t = next_thread(t); } while (t != task); @@ -379,6 +450,7 @@ static int do_task_stat(struct task_stru maj_flt += sig->maj_flt; utime = cputime_add(utime, sig->utime); stime = cputime_add(stime, sig->stime); + gtime = cputime_add(gtime, sig->gtime); } sid = signal_session(sig); @@ -389,13 +461,14 @@ static int do_task_stat(struct task_stru } rcu_read_unlock(); - if (!whole || num_threads<2) + if (!whole || num_threads < 2) wchan = get_wchan(task); if (!whole) { min_flt = task->min_flt; maj_flt = task->maj_flt; - utime = task->utime; - stime = task->stime; + utime = task_utime(task); + stime = task_stime(task); + gtime = task_gtime(task); } /* scale priority and nice values from timeslices to -20..20 */ @@ -405,14 +478,15 @@ static int do_task_stat(struct task_stru /* Temporary variable needed for gcc-2.96 */ /* convert timespec -> nsec*/ - start_time = (unsigned long long)task->start_time.tv_sec * NSEC_PER_SEC + start_time = + (unsigned long long)task->start_time.tv_sec * NSEC_PER_SEC + task->start_time.tv_nsec; /* convert nsec -> ticks */ start_time = nsec_to_clock_t(start_time); - res = sprintf(buffer,"%d (%s) %c %d %d %d %d %d %u %lu \ + res = sprintf(buffer, "%d (%s) %c %d %d %d %d %d %u %lu \ %lu %lu %lu %lu %lu %ld %ld %ld %ld %d 0 %llu %lu %ld %lu %lu %lu %lu %lu \ -%lu %lu %lu %lu %lu %lu %lu %lu %d %d %u %u %llu\n", +%lu %lu %lu %lu %lu %lu %lu %lu %d %d %u %u %llu %lu %ld\n", task->pid, tcomm, state, @@ -436,7 +510,7 @@ static int do_task_stat(struct task_stru start_time, vsize, mm ? get_mm_rss(mm) : 0, - rsslim, + rsslim, mm ? mm->start_code : 0, mm ? mm->end_code : 0, mm ? 
mm->start_stack : 0, @@ -457,18 +531,20 @@ static int do_task_stat(struct task_stru task_cpu(task), task->rt_priority, task->policy, - (unsigned long long)delayacct_blkio_ticks(task)); - if(mm) + (unsigned long long)delayacct_blkio_ticks(task), + cputime_to_clock_t(gtime), + cputime_to_clock_t(cgtime)); + if (mm) mmput(mm); return res; } -int proc_tid_stat(struct task_struct *task, char * buffer) +int proc_tid_stat(struct task_struct *task, char *buffer) { return do_task_stat(task, buffer, 0); } -int proc_tgid_stat(struct task_struct *task, char * buffer) +int proc_tgid_stat(struct task_struct *task, char *buffer) { return do_task_stat(task, buffer, 1); } @@ -477,12 +553,12 @@ int proc_pid_statm(struct task_struct *t { int size = 0, resident = 0, shared = 0, text = 0, lib = 0, data = 0; struct mm_struct *mm = get_task_mm(task); - + if (mm) { size = task_statm(mm, &shared, &text, &data, &resident); mmput(mm); } - return sprintf(buffer,"%d %d %d %d %d %d %d\n", + return sprintf(buffer, "%d %d %d %d %d %d %d\n", size, resident, shared, text, lib, data, 0); } Index: linux-cfs-2.6.22.13.q/fs/proc/base.c =================================================================== --- linux-cfs-2.6.22.13.q.orig/fs/proc/base.c +++ linux-cfs-2.6.22.13.q/fs/proc/base.c @@ -296,10 +296,10 @@ static int proc_pid_wchan(struct task_st */ static int proc_pid_schedstat(struct task_struct *task, char *buffer) { - return sprintf(buffer, "%lu %lu %lu\n", + return sprintf(buffer, "%llu %llu %lu\n", task->sched_info.cpu_time, task->sched_info.run_delay, - task->sched_info.pcnt); + task->sched_info.pcount); } #endif @@ -929,6 +929,69 @@ static const struct file_operations proc }; #endif +#ifdef CONFIG_SCHED_DEBUG +/* + * Print out various scheduling related per-task fields: + */ +static int sched_show(struct seq_file *m, void *v) +{ + struct inode *inode = m->private; + struct task_struct *p; + + WARN_ON(!inode); + + p = get_proc_task(inode); + if (!p) + return -ESRCH; + proc_sched_show_task(p, m); + + put_task_struct(p); + + return 0; +} + +static ssize_t +sched_write(struct file *file, const char __user *buf, + size_t count, loff_t *offset) +{ + struct inode *inode = file->f_path.dentry->d_inode; + struct task_struct *p; + + WARN_ON(!inode); + + p = get_proc_task(inode); + if (!p) + return -ESRCH; + proc_sched_set_task(p); + + put_task_struct(p); + + return count; +} + +static int sched_open(struct inode *inode, struct file *filp) +{ + int ret; + + ret = single_open(filp, sched_show, NULL); + if (!ret) { + struct seq_file *m = filp->private_data; + + m->private = inode; + } + return ret; +} + +static const struct file_operations proc_pid_sched_operations = { + .open = sched_open, + .read = seq_read, + .write = sched_write, + .llseek = seq_lseek, + .release = seq_release, +}; + +#endif + static void *proc_pid_follow_link(struct dentry *dentry, struct nameidata *nd) { struct inode *inode = dentry->d_inode; @@ -1963,6 +2026,9 @@ static const struct pid_entry tgid_base_ INF("environ", S_IRUSR, pid_environ), INF("auxv", S_IRUSR, pid_auxv), INF("status", S_IRUGO, pid_status), +#ifdef CONFIG_SCHED_DEBUG + REG("sched", S_IRUGO|S_IWUSR, pid_sched), +#endif INF("cmdline", S_IRUGO, pid_cmdline), INF("stat", S_IRUGO, tgid_stat), INF("statm", S_IRUGO, pid_statm), @@ -2247,6 +2313,9 @@ static const struct pid_entry tid_base_s INF("environ", S_IRUSR, pid_environ), INF("auxv", S_IRUSR, pid_auxv), INF("status", S_IRUGO, pid_status), +#ifdef CONFIG_SCHED_DEBUG + REG("sched", S_IRUGO|S_IWUSR, pid_sched), +#endif INF("cmdline", S_IRUGO, 
pid_cmdline), INF("stat", S_IRUGO, tid_stat), INF("statm", S_IRUGO, pid_statm), Index: linux-cfs-2.6.22.13.q/fs/proc/proc_misc.c =================================================================== --- linux-cfs-2.6.22.13.q.orig/fs/proc/proc_misc.c +++ linux-cfs-2.6.22.13.q/fs/proc/proc_misc.c @@ -442,10 +442,12 @@ static int show_stat(struct seq_file *p, int i; unsigned long jif; cputime64_t user, nice, system, idle, iowait, irq, softirq, steal; + cputime64_t guest; u64 sum = 0; user = nice = system = idle = iowait = irq = softirq = steal = cputime64_zero; + guest = cputime64_zero; jif = - wall_to_monotonic.tv_sec; if (wall_to_monotonic.tv_nsec) --jif; @@ -461,11 +463,12 @@ static int show_stat(struct seq_file *p, irq = cputime64_add(irq, kstat_cpu(i).cpustat.irq); softirq = cputime64_add(softirq, kstat_cpu(i).cpustat.softirq); steal = cputime64_add(steal, kstat_cpu(i).cpustat.steal); + guest = cputime64_add(guest, kstat_cpu(i).cpustat.guest); for (j = 0 ; j < NR_IRQS ; j++) sum += kstat_cpu(i).irqs[j]; } - seq_printf(p, "cpu %llu %llu %llu %llu %llu %llu %llu %llu\n", + seq_printf(p, "cpu %llu %llu %llu %llu %llu %llu %llu %llu %llu\n", (unsigned long long)cputime64_to_clock_t(user), (unsigned long long)cputime64_to_clock_t(nice), (unsigned long long)cputime64_to_clock_t(system), @@ -473,7 +476,8 @@ static int show_stat(struct seq_file *p, (unsigned long long)cputime64_to_clock_t(iowait), (unsigned long long)cputime64_to_clock_t(irq), (unsigned long long)cputime64_to_clock_t(softirq), - (unsigned long long)cputime64_to_clock_t(steal)); + (unsigned long long)cputime64_to_clock_t(steal), + (unsigned long long)cputime64_to_clock_t(guest)); for_each_online_cpu(i) { /* Copy values here to work around gcc-2.95.3, gcc-2.96 */ @@ -485,7 +489,9 @@ static int show_stat(struct seq_file *p, irq = kstat_cpu(i).cpustat.irq; softirq = kstat_cpu(i).cpustat.softirq; steal = kstat_cpu(i).cpustat.steal; - seq_printf(p, "cpu%d %llu %llu %llu %llu %llu %llu %llu %llu\n", + guest = kstat_cpu(i).cpustat.guest; + seq_printf(p, + "cpu%d %llu %llu %llu %llu %llu %llu %llu %llu %llu\n", i, (unsigned long long)cputime64_to_clock_t(user), (unsigned long long)cputime64_to_clock_t(nice), @@ -494,7 +500,8 @@ static int show_stat(struct seq_file *p, (unsigned long long)cputime64_to_clock_t(iowait), (unsigned long long)cputime64_to_clock_t(irq), (unsigned long long)cputime64_to_clock_t(softirq), - (unsigned long long)cputime64_to_clock_t(steal)); + (unsigned long long)cputime64_to_clock_t(steal), + (unsigned long long)cputime64_to_clock_t(guest)); } seq_printf(p, "intr %llu", (unsigned long long)sum); Index: linux-cfs-2.6.22.13.q/include/asm-generic/bitops/sched.h =================================================================== --- linux-cfs-2.6.22.13.q.orig/include/asm-generic/bitops/sched.h +++ linux-cfs-2.6.22.13.q/include/asm-generic/bitops/sched.h @@ -6,28 +6,23 @@ /* * Every architecture must define this function. It's the fastest - * way of searching a 140-bit bitmap where the first 100 bits are - * unlikely to be set. It's guaranteed that at least one of the 140 - * bits is cleared. + * way of searching a 100-bit bitmap. It's guaranteed that at least + * one of the 100 bits is cleared. 
*/ static inline int sched_find_first_bit(const unsigned long *b) { #if BITS_PER_LONG == 64 - if (unlikely(b[0])) + if (b[0]) return __ffs(b[0]); - if (likely(b[1])) - return __ffs(b[1]) + 64; - return __ffs(b[2]) + 128; + return __ffs(b[1]) + 64; #elif BITS_PER_LONG == 32 - if (unlikely(b[0])) + if (b[0]) return __ffs(b[0]); - if (unlikely(b[1])) + if (b[1]) return __ffs(b[1]) + 32; - if (unlikely(b[2])) + if (b[2]) return __ffs(b[2]) + 64; - if (b[3]) - return __ffs(b[3]) + 96; - return __ffs(b[4]) + 128; + return __ffs(b[3]) + 96; #else #error BITS_PER_LONG not defined #endif Index: linux-cfs-2.6.22.13.q/include/linux/cgroup.h =================================================================== --- /dev/null +++ linux-cfs-2.6.22.13.q/include/linux/cgroup.h @@ -0,0 +1,12 @@ +#ifndef _LINUX_CGROUP_H +#define _LINUX_CGROUP_H + +/* + * Control groups are not backported - we use a few compatibility + * defines to be able to use the upstream sched.c as-is: + */ +#define task_pid_nr(task) (task)->pid +#define task_pid_vnr(task) (task)->pid +#define find_task_by_vpid(pid) find_task_by_pid(pid) + +#endif Index: linux-cfs-2.6.22.13.q/include/linux/cpu.h =================================================================== --- linux-cfs-2.6.22.13.q.orig/include/linux/cpu.h +++ linux-cfs-2.6.22.13.q/include/linux/cpu.h @@ -41,8 +41,6 @@ extern void cpu_remove_sysdev_attr(struc extern int cpu_add_sysdev_attr_group(struct attribute_group *attrs); extern void cpu_remove_sysdev_attr_group(struct attribute_group *attrs); -extern struct sysdev_attribute attr_sched_mc_power_savings; -extern struct sysdev_attribute attr_sched_smt_power_savings; extern int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls); #ifdef CONFIG_HOTPLUG_CPU Index: linux-cfs-2.6.22.13.q/include/linux/cpuset.h =================================================================== --- linux-cfs-2.6.22.13.q.orig/include/linux/cpuset.h +++ linux-cfs-2.6.22.13.q/include/linux/cpuset.h @@ -146,6 +146,11 @@ static inline int cpuset_do_slab_mem_spr static inline void cpuset_track_online_nodes(void) {} +static inline cpumask_t cpuset_cpus_allowed_locked(struct task_struct *p) +{ + return cpu_possible_map; +} + #endif /* !CONFIG_CPUSETS */ #endif /* _LINUX_CPUSET_H */ Index: linux-cfs-2.6.22.13.q/include/linux/hardirq.h =================================================================== --- linux-cfs-2.6.22.13.q.orig/include/linux/hardirq.h +++ linux-cfs-2.6.22.13.q/include/linux/hardirq.h @@ -79,6 +79,19 @@ #endif #ifdef CONFIG_PREEMPT +# define PREEMPT_CHECK_OFFSET 1 +#else +# define PREEMPT_CHECK_OFFSET 0 +#endif + +/* + * Check whether we were atomic before we did preempt_disable(): + * (used by the scheduler) + */ +#define in_atomic_preempt_off() \ + ((preempt_count() & ~PREEMPT_ACTIVE) != PREEMPT_CHECK_OFFSET) + +#ifdef CONFIG_PREEMPT # define preemptible() (preempt_count() == 0 && !irqs_disabled()) # define IRQ_EXIT_OFFSET (HARDIRQ_OFFSET-1) #else Index: linux-cfs-2.6.22.13.q/include/linux/init_task.h =================================================================== --- linux-cfs-2.6.22.13.q.orig/include/linux/init_task.h +++ linux-cfs-2.6.22.13.q/include/linux/init_task.h @@ -8,6 +8,7 @@ #include #include #include +#include #define INIT_FDTABLE \ { \ @@ -78,6 +79,7 @@ extern struct nsproxy init_nsproxy; .uts_ns = &init_uts_ns, \ .mnt_ns = NULL, \ INIT_IPC_NS(ipc_ns) \ + .user_ns = &init_user_ns, \ } #define INIT_SIGHAND(sighand) { \ Index: linux-cfs-2.6.22.13.q/include/linux/kernel.h 
=================================================================== --- linux-cfs-2.6.22.13.q.orig/include/linux/kernel.h +++ linux-cfs-2.6.22.13.q/include/linux/kernel.h @@ -60,6 +60,13 @@ extern const char linux_proc_banner[]; #define KERN_INFO "<6>" /* informational */ #define KERN_DEBUG "<7>" /* debug-level messages */ +/* + * Annotation for a "continued" line of log printout (only done after a + * line that had no enclosing \n). Only to be used by core/arch code + * during early bootup (a continued line is not SMP-safe otherwise). + */ +#define KERN_CONT "" + extern int console_printk[]; #define console_loglevel (console_printk[0]) Index: linux-cfs-2.6.22.13.q/include/linux/kernel_stat.h =================================================================== --- linux-cfs-2.6.22.13.q.orig/include/linux/kernel_stat.h +++ linux-cfs-2.6.22.13.q/include/linux/kernel_stat.h @@ -23,6 +23,7 @@ struct cpu_usage_stat { cputime64_t idle; cputime64_t iowait; cputime64_t steal; + cputime64_t guest; }; struct kernel_stat { @@ -52,7 +53,9 @@ static inline int kstat_irqs(int irq) } extern void account_user_time(struct task_struct *, cputime_t); +extern void account_user_time_scaled(struct task_struct *, cputime_t); extern void account_system_time(struct task_struct *, int, cputime_t); +extern void account_system_time_scaled(struct task_struct *, cputime_t); extern void account_steal_time(struct task_struct *, cputime_t); #endif /* _LINUX_KERNEL_STAT_H */ Index: linux-cfs-2.6.22.13.q/include/linux/nodemask.h =================================================================== --- linux-cfs-2.6.22.13.q.orig/include/linux/nodemask.h +++ linux-cfs-2.6.22.13.q/include/linux/nodemask.h @@ -338,31 +338,88 @@ static inline void __nodes_remap(nodemas #endif /* MAX_NUMNODES */ /* + * Bitmasks that are kept for all the nodes. + */ +enum node_states { + N_POSSIBLE, /* The node could become online at some point */ + N_ONLINE, /* The node is online */ + N_NORMAL_MEMORY, /* The node has regular memory */ +#ifdef CONFIG_HIGHMEM + N_HIGH_MEMORY, /* The node has regular or high memory */ +#else + N_HIGH_MEMORY = N_NORMAL_MEMORY, +#endif + N_CPU, /* The node has one or more cpus */ + NR_NODE_STATES +}; + +/* * The following particular system nodemasks and operations * on them manage all possible and online nodes. 
*/ -extern nodemask_t node_online_map; -extern nodemask_t node_possible_map; +extern nodemask_t node_states[NR_NODE_STATES]; #if MAX_NUMNODES > 1 -#define num_online_nodes() nodes_weight(node_online_map) -#define num_possible_nodes() nodes_weight(node_possible_map) -#define node_online(node) node_isset((node), node_online_map) -#define node_possible(node) node_isset((node), node_possible_map) -#define first_online_node first_node(node_online_map) -#define next_online_node(nid) next_node((nid), node_online_map) +static inline int node_state(int node, enum node_states state) +{ + return node_isset(node, node_states[state]); +} + +static inline void node_set_state(int node, enum node_states state) +{ + __node_set(node, &node_states[state]); +} + +static inline void node_clear_state(int node, enum node_states state) +{ + __node_clear(node, &node_states[state]); +} + +static inline int num_node_state(enum node_states state) +{ + return nodes_weight(node_states[state]); +} + +#define for_each_node_state(__node, __state) \ + for_each_node_mask((__node), node_states[__state]) + +#define first_online_node first_node(node_states[N_ONLINE]) +#define next_online_node(nid) next_node((nid), node_states[N_ONLINE]) + extern int nr_node_ids; #else -#define num_online_nodes() 1 -#define num_possible_nodes() 1 -#define node_online(node) ((node) == 0) -#define node_possible(node) ((node) == 0) + +static inline int node_state(int node, enum node_states state) +{ + return node == 0; +} + +static inline void node_set_state(int node, enum node_states state) +{ +} + +static inline void node_clear_state(int node, enum node_states state) +{ +} + +static inline int num_node_state(enum node_states state) +{ + return 1; +} + +#define for_each_node_state(node, __state) \ + for ( (node) = 0; (node) == 0; (node) = 1) + #define first_online_node 0 #define next_online_node(nid) (MAX_NUMNODES) #define nr_node_ids 1 + #endif +#define node_online_map node_states[N_ONLINE] +#define node_possible_map node_states[N_POSSIBLE] + #define any_online_node(mask) \ ({ \ int node; \ @@ -372,10 +429,15 @@ extern int nr_node_ids; node; \ }) -#define node_set_online(node) set_bit((node), node_online_map.bits) -#define node_set_offline(node) clear_bit((node), node_online_map.bits) +#define num_online_nodes() num_node_state(N_ONLINE) +#define num_possible_nodes() num_node_state(N_POSSIBLE) +#define node_online(node) node_state((node), N_ONLINE) +#define node_possible(node) node_state((node), N_POSSIBLE) + +#define node_set_online(node) node_set_state((node), N_ONLINE) +#define node_set_offline(node) node_clear_state((node), N_ONLINE) -#define for_each_node(node) for_each_node_mask((node), node_possible_map) -#define for_each_online_node(node) for_each_node_mask((node), node_online_map) +#define for_each_node(node) for_each_node_state(node, N_POSSIBLE) +#define for_each_online_node(node) for_each_node_state(node, N_ONLINE) #endif /* __LINUX_NODEMASK_H */ Index: linux-cfs-2.6.22.13.q/include/linux/nsproxy.h =================================================================== --- linux-cfs-2.6.22.13.q.orig/include/linux/nsproxy.h +++ linux-cfs-2.6.22.13.q/include/linux/nsproxy.h @@ -28,10 +28,11 @@ struct nsproxy { struct ipc_namespace *ipc_ns; struct mnt_namespace *mnt_ns; struct pid_namespace *pid_ns; + struct user_namespace *user_ns; }; extern struct nsproxy init_nsproxy; -int copy_namespaces(int flags, struct task_struct *tsk); +int copy_namespaces(unsigned long flags, struct task_struct *tsk); void get_task_namespaces(struct task_struct 
*tsk); void free_nsproxy(struct nsproxy *ns); int unshare_nsproxy_namespaces(unsigned long, struct nsproxy **, Index: linux-cfs-2.6.22.13.q/include/linux/sched.h =================================================================== --- linux-cfs-2.6.22.13.q.orig/include/linux/sched.h +++ linux-cfs-2.6.22.13.q/include/linux/sched.h @@ -2,6 +2,15 @@ #define _LINUX_SCHED_H #include /* For AT_VECTOR_SIZE */ +#include /* CFS backport details */ + +#define DEFINE_PER_CPU_SHARED_ALIGNED(x,y) \ + DEFINE_PER_CPU(x,y) ____cacheline_aligned_in_smp + +#define COMPAT_REGISTER_SYSCTL + +/* backporting helper macro: */ +#define cpu_sibling_map(cpu) cpu_sibling_map[cpu] /* * cloning flags: @@ -26,6 +35,7 @@ #define CLONE_STOPPED 0x02000000 /* Start in stopped state */ #define CLONE_NEWUTS 0x04000000 /* New utsname group? */ #define CLONE_NEWIPC 0x08000000 /* New ipcs */ +#define CLONE_NEWUSER 0x10000000 /* New user namespace */ /* * Scheduling policies @@ -34,6 +44,8 @@ #define SCHED_FIFO 1 #define SCHED_RR 2 #define SCHED_BATCH 3 +/* SCHED_ISO: reserved but not implemented yet */ +#define SCHED_IDLE 5 #ifdef __KERNEL__ @@ -83,6 +95,7 @@ struct sched_param { #include #include #include +#include #include @@ -110,7 +123,7 @@ extern unsigned long avenrun[]; /* Load #define FSHIFT 11 /* nr of bits of precision */ #define FIXED_1 (1<state again */ -#define TASK_NONINTERACTIVE 64 -#define TASK_DEAD 128 +#define TASK_DEAD 64 #define __set_task_state(tsk, state_value) \ do { (tsk)->state = (state_value); } while (0) @@ -193,6 +226,7 @@ struct task_struct; extern void sched_init(void); extern void sched_init_smp(void); extern void init_idle(struct task_struct *idle, int cpu); +extern void init_idle_bootup_task(struct task_struct *idle); extern cpumask_t nohz_cpu_mask; #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ) @@ -264,6 +298,7 @@ extern signed long schedule_timeout_unin asmlinkage void schedule(void); struct nsproxy; +struct user_namespace; /* Maximum number of active map areas.. This is a random (large) number */ #define DEFAULT_MAX_MAP_COUNT 65536 @@ -469,6 +504,8 @@ struct signal_struct { * in __exit_signal, except for the group leader. */ cputime_t utime, stime, cutime, cstime; + cputime_t gtime; + cputime_t cgtime; unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw; unsigned long min_flt, maj_flt, cmin_flt, cmaj_flt; unsigned long inblock, oublock, cinblock, coublock; @@ -479,7 +516,7 @@ struct signal_struct { * from jiffies_to_ns(utime + stime) if sched_clock uses something * other than jiffies.) */ - unsigned long long sched_time; + unsigned long long sum_sched_runtime; /* * We don't bother to synchronize most readers of this at all, @@ -521,31 +558,6 @@ struct signal_struct { #define SIGNAL_STOP_CONTINUED 0x00000004 /* SIGCONT since WCONTINUED reap */ #define SIGNAL_GROUP_EXIT 0x00000008 /* group exit in progress */ - -/* - * Priority of a process goes from 0..MAX_PRIO-1, valid RT - * priority is 0..MAX_RT_PRIO-1, and SCHED_NORMAL/SCHED_BATCH - * tasks are in the range MAX_RT_PRIO..MAX_PRIO-1. Priority - * values are inverted: lower p->prio value means higher priority. - * - * The MAX_USER_RT_PRIO value allows the actual maximum - * RT priority to be separate from the value exported to - * user-space. This allows kernel threads to set their - * priority to a value higher than any user task. Note: - * MAX_RT_PRIO must not be smaller than MAX_USER_RT_PRIO. 
- */ - -#define MAX_USER_RT_PRIO 100 -#define MAX_RT_PRIO MAX_USER_RT_PRIO - -#define MAX_PRIO (MAX_RT_PRIO + 40) - -#define rt_prio(prio) unlikely((prio) < MAX_RT_PRIO) -#define rt_task(p) rt_prio((p)->prio) -#define batch_task(p) (unlikely((p)->policy == SCHED_BATCH)) -#define is_rt_policy(p) ((p) != SCHED_NORMAL && (p) != SCHED_BATCH) -#define has_rt_policy(p) unlikely(is_rt_policy((p)->policy)) - /* * Some day this will be a full-fledged user tracking system.. */ @@ -568,10 +580,25 @@ struct user_struct { #endif /* Hash table maintenance information */ - struct list_head uidhash_list; + struct hlist_node uidhash_node; uid_t uid; + +#ifdef CONFIG_FAIR_USER_SCHED + struct task_group *tg; +#ifdef CONFIG_SYSFS + struct kset kset; + struct subsys_attribute user_attr; + struct work_struct work; +#endif +#endif }; +#ifdef CONFIG_FAIR_USER_SCHED +extern int uids_kobject_init(void); +#else +static inline int uids_kobject_init(void) { return 0; } +#endif + extern struct user_struct *find_user(uid_t); extern struct user_struct root_user; @@ -583,13 +610,17 @@ struct reclaim_state; #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) struct sched_info { /* cumulative counters */ - unsigned long cpu_time, /* time spent on the cpu */ - run_delay, /* time spent waiting on a runqueue */ - pcnt; /* # of timeslices run on this cpu */ + unsigned long pcount; /* # of times run on this cpu */ + unsigned long long cpu_time, /* time spent on the cpu */ + run_delay; /* time spent waiting on a runqueue */ /* timestamps */ - unsigned long last_arrival, /* when we last ran on a cpu */ - last_queued; /* when we were last queued to run */ + unsigned long long last_arrival,/* when we last ran on a cpu */ + last_queued; /* when we were last queued to run */ +#ifdef CONFIG_SCHEDSTATS + /* BKL stats */ + unsigned int bkl_count; +#endif }; #endif /* defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) */ @@ -639,18 +670,24 @@ static inline int sched_info_on(void) #endif } -enum idle_type -{ - SCHED_IDLE, - NOT_IDLE, - NEWLY_IDLE, - MAX_IDLE_TYPES +enum cpu_idle_type { + CPU_IDLE, + CPU_NOT_IDLE, + CPU_NEWLY_IDLE, + CPU_MAX_IDLE_TYPES }; /* * sched-domains (multiprocessor balancing) declarations: */ -#define SCHED_LOAD_SCALE 128UL /* increase resolution of load */ + +/* + * Increase resolution of nice-level calculations: + */ +#define SCHED_LOAD_SHIFT 10 +#define SCHED_LOAD_SCALE (1L << SCHED_LOAD_SHIFT) + +#define SCHED_LOAD_SCALE_FUZZ SCHED_LOAD_SCALE #ifdef CONFIG_SMP #define SD_LOAD_BALANCE 1 /* Do load balancing on this domain. 
*/ @@ -703,7 +740,6 @@ struct sched_domain { unsigned long max_interval; /* Maximum balance interval ms */ unsigned int busy_factor; /* less balancing by factor if busy */ unsigned int imbalance_pct; /* No balance until over watermark */ - unsigned long long cache_hot_time; /* Task considered cache hot (ns) */ unsigned int cache_nice_tries; /* Leave cache hot tasks for # tries */ unsigned int busy_idx; unsigned int idle_idx; @@ -719,48 +755,57 @@ struct sched_domain { #ifdef CONFIG_SCHEDSTATS /* load_balance() stats */ - unsigned long lb_cnt[MAX_IDLE_TYPES]; - unsigned long lb_failed[MAX_IDLE_TYPES]; - unsigned long lb_balanced[MAX_IDLE_TYPES]; - unsigned long lb_imbalance[MAX_IDLE_TYPES]; - unsigned long lb_gained[MAX_IDLE_TYPES]; - unsigned long lb_hot_gained[MAX_IDLE_TYPES]; - unsigned long lb_nobusyg[MAX_IDLE_TYPES]; - unsigned long lb_nobusyq[MAX_IDLE_TYPES]; + unsigned int lb_count[CPU_MAX_IDLE_TYPES]; + unsigned int lb_failed[CPU_MAX_IDLE_TYPES]; + unsigned int lb_balanced[CPU_MAX_IDLE_TYPES]; + unsigned int lb_imbalance[CPU_MAX_IDLE_TYPES]; + unsigned int lb_gained[CPU_MAX_IDLE_TYPES]; + unsigned int lb_hot_gained[CPU_MAX_IDLE_TYPES]; + unsigned int lb_nobusyg[CPU_MAX_IDLE_TYPES]; + unsigned int lb_nobusyq[CPU_MAX_IDLE_TYPES]; /* Active load balancing */ - unsigned long alb_cnt; - unsigned long alb_failed; - unsigned long alb_pushed; + unsigned int alb_count; + unsigned int alb_failed; + unsigned int alb_pushed; /* SD_BALANCE_EXEC stats */ - unsigned long sbe_cnt; - unsigned long sbe_balanced; - unsigned long sbe_pushed; + unsigned int sbe_count; + unsigned int sbe_balanced; + unsigned int sbe_pushed; /* SD_BALANCE_FORK stats */ - unsigned long sbf_cnt; - unsigned long sbf_balanced; - unsigned long sbf_pushed; + unsigned int sbf_count; + unsigned int sbf_balanced; + unsigned int sbf_pushed; /* try_to_wake_up() stats */ - unsigned long ttwu_wake_remote; - unsigned long ttwu_move_affine; - unsigned long ttwu_move_balance; + unsigned int ttwu_wake_remote; + unsigned int ttwu_move_affine; + unsigned int ttwu_move_balance; #endif }; -extern int partition_sched_domains(cpumask_t *partition1, - cpumask_t *partition2); - -/* - * Maximum cache size the migration-costs auto-tuning code will - * search from: - */ -extern unsigned int max_cache_size; +extern void partition_sched_domains(int ndoms_new, cpumask_t *doms_new); #endif /* CONFIG_SMP */ +/* + * A runqueue laden with a single nice 0 task scores a weighted_cpuload of + * SCHED_LOAD_SCALE. 
This function returns 1 if any cpu is laden with a + * task of nice 0 or enough lower priority tasks to bring up the + * weighted_cpuload + */ +static inline int above_background_load(void) +{ + unsigned long cpu; + + for_each_online_cpu(cpu) { + if (weighted_cpuload(cpu) >= SCHED_LOAD_SCALE) + return 1; + } + return 0; +} struct io_context; /* See blkdev.h */ struct cpuset; @@ -809,14 +854,101 @@ struct mempolicy; struct pipe_inode_info; struct uts_namespace; -enum sleep_type { - SLEEP_NORMAL, - SLEEP_NONINTERACTIVE, - SLEEP_INTERACTIVE, - SLEEP_INTERRUPTED, +struct rq; +struct sched_domain; + +struct sched_class { + const struct sched_class *next; + + void (*enqueue_task) (struct rq *rq, struct task_struct *p, int wakeup); + void (*dequeue_task) (struct rq *rq, struct task_struct *p, int sleep); + void (*yield_task) (struct rq *rq); + + void (*check_preempt_curr) (struct rq *rq, struct task_struct *p); + + struct task_struct * (*pick_next_task) (struct rq *rq); + void (*put_prev_task) (struct rq *rq, struct task_struct *p); + +#ifdef CONFIG_SMP + unsigned long (*load_balance) (struct rq *this_rq, int this_cpu, + struct rq *busiest, unsigned long max_load_move, + struct sched_domain *sd, enum cpu_idle_type idle, + int *all_pinned, int *this_best_prio); + + int (*move_one_task) (struct rq *this_rq, int this_cpu, + struct rq *busiest, struct sched_domain *sd, + enum cpu_idle_type idle); +#endif + + void (*set_curr_task) (struct rq *rq); + void (*task_tick) (struct rq *rq, struct task_struct *p); + void (*task_new) (struct rq *rq, struct task_struct *p); }; -struct prio_array; +struct load_weight { + unsigned long weight, inv_weight; +}; + +/* + * CFS stats for a schedulable entity (task, task-group etc) + * + * Current field usage histogram: + * + * 4 se->block_start + * 4 se->run_node + * 4 se->sleep_start + * 6 se->load.weight + */ +struct sched_entity { + struct load_weight load; /* for load-balancing */ + struct avl_node run_node; + unsigned int on_rq; + + u64 exec_start; + u64 sum_exec_runtime; + u64 vruntime; + u64 prev_sum_exec_runtime; + +#ifdef CONFIG_SCHEDSTATS + u64 wait_start; + u64 wait_max; + + u64 sleep_start; + u64 sleep_max; + s64 sum_sleep_runtime; + + u64 block_start; + u64 block_max; + u64 exec_max; + u64 slice_max; + + u64 nr_migrations; + u64 nr_migrations_cold; + u64 nr_failed_migrations_affine; + u64 nr_failed_migrations_running; + u64 nr_failed_migrations_hot; + u64 nr_forced_migrations; + u64 nr_forced2_migrations; + + u64 nr_wakeups; + u64 nr_wakeups_sync; + u64 nr_wakeups_migrate; + u64 nr_wakeups_local; + u64 nr_wakeups_remote; + u64 nr_wakeups_affine; + u64 nr_wakeups_affine_attempts; + u64 nr_wakeups_passive; + u64 nr_wakeups_idle; +#endif + +#ifdef CONFIG_FAIR_GROUP_SCHED + struct sched_entity *parent; + /* rq on which this entity is (to be) queued: */ + struct cfs_rq *cfs_rq; + /* rq "owned" by this entity/group: */ + struct cfs_rq *my_q; +#endif +}; struct task_struct { volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */ @@ -832,23 +964,25 @@ struct task_struct { int oncpu; #endif #endif - int load_weight; /* for niceness load balancing purposes */ + int prio, static_prio, normal_prio; struct list_head run_list; - struct prio_array *array; + const struct sched_class *sched_class; + struct sched_entity se; + +#ifdef CONFIG_PREEMPT_NOTIFIERS + /* list of struct preempt_notifier: */ + struct hlist_head preempt_notifiers; +#endif unsigned short ioprio; #ifdef CONFIG_BLK_DEV_IO_TRACE unsigned int btrace_seq; #endif - unsigned long sleep_avg; - 
unsigned long long timestamp, last_ran; - unsigned long long sched_time; /* sched_clock time spent running */ - enum sleep_type sleep_type; unsigned int policy; cpumask_t cpus_allowed; - unsigned int time_slice, first_time_slice; + unsigned int time_slice; #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) struct sched_info sched_info; @@ -903,9 +1037,12 @@ struct task_struct { int __user *clear_child_tid; /* CLONE_CHILD_CLEARTID */ unsigned int rt_priority; - cputime_t utime, stime; + cputime_t utime, stime, utimescaled, stimescaled; + cputime_t gtime; + cputime_t prev_utime, prev_stime; unsigned long nvcsw, nivcsw; /* context switch counts */ - struct timespec start_time; + struct timespec start_time; /* monotonic time */ + struct timespec real_start_time; /* boot based time */ /* mm fault and swap info: this can arguably be seen as either mm-specific or thread-specific */ unsigned long min_flt, maj_flt; @@ -1078,6 +1215,37 @@ struct task_struct { #endif }; +/* + * Priority of a process goes from 0..MAX_PRIO-1, valid RT + * priority is 0..MAX_RT_PRIO-1, and SCHED_NORMAL/SCHED_BATCH + * tasks are in the range MAX_RT_PRIO..MAX_PRIO-1. Priority + * values are inverted: lower p->prio value means higher priority. + * + * The MAX_USER_RT_PRIO value allows the actual maximum + * RT priority to be separate from the value exported to + * user-space. This allows kernel threads to set their + * priority to a value higher than any user task. Note: + * MAX_RT_PRIO must not be smaller than MAX_USER_RT_PRIO. + */ + +#define MAX_USER_RT_PRIO 100 +#define MAX_RT_PRIO MAX_USER_RT_PRIO + +#define MAX_PRIO (MAX_RT_PRIO + 40) +#define DEFAULT_PRIO (MAX_RT_PRIO + 20) + +static inline int rt_prio(int prio) +{ + if (unlikely(prio < MAX_RT_PRIO)) + return 1; + return 0; +} + +static inline int rt_task(struct task_struct *p) +{ + return rt_prio(p->prio); +} + static inline pid_t process_group(struct task_struct *tsk) { return tsk->signal->pgrp; @@ -1163,6 +1331,7 @@ static inline void put_task_struct(struc #define PF_STARTING 0x00000002 /* being created */ #define PF_EXITING 0x00000004 /* getting shut down */ #define PF_EXITPIDONE 0x00000008 /* pi exit done on shut down */ +#define PF_VCPU 0x00000010 /* I'm a virtual CPU */ #define PF_FORKNOEXEC 0x00000040 /* forked but didn't exec */ #define PF_SUPERPRIV 0x00000100 /* used super-user privileges */ #define PF_DUMPCORE 0x00000200 /* dumped core */ @@ -1222,8 +1391,15 @@ static inline int set_cpus_allowed(struc #endif extern unsigned long long sched_clock(void); + +/* + * For kernel-internal use: high-speed (but slightly incorrect) per-cpu + * clock constructed from sched_clock(): + */ +extern unsigned long long cpu_clock(int cpu); + extern unsigned long long -current_sched_time(const struct task_struct *current_task); +task_sched_runtime(struct task_struct *task); /* sched_exec is called by processes performing an exec */ #ifdef CONFIG_SMP @@ -1232,6 +1408,9 @@ extern void sched_exec(void); #define sched_exec() {} #endif +extern void sched_clock_idle_sleep_event(void); +extern void sched_clock_idle_wakeup_event(u64 delta_ns); + #ifdef CONFIG_HOTPLUG_CPU extern void idle_task_exit(void); #else @@ -1240,6 +1419,27 @@ static inline void idle_task_exit(void) extern void sched_idle_next(void); +#ifdef CONFIG_SCHED_DEBUG +extern unsigned int sysctl_sched_latency; +extern unsigned int sysctl_sched_min_granularity; +extern unsigned int sysctl_sched_wakeup_granularity; +extern unsigned int sysctl_sched_batch_wakeup_granularity; +extern unsigned int 
sysctl_sched_child_runs_first; +extern unsigned int sysctl_sched_features; +extern unsigned int sysctl_sched_migration_cost; +extern unsigned int sysctl_sched_nr_migrate; +#ifdef CONFIG_FAIR_GROUP_SCHED +extern unsigned int sysctl_sched_min_bal_int_shares; +extern unsigned int sysctl_sched_max_bal_int_shares; +#endif + +int sched_nr_latency_handler(struct ctl_table *table, int write, + struct file *file, void __user *buffer, size_t *length, + loff_t *ppos); +#endif + +extern unsigned int sysctl_sched_compat_yield; + #ifdef CONFIG_RT_MUTEXES extern int rt_mutex_getprio(struct task_struct *p); extern void rt_mutex_setprio(struct task_struct *p, int prio); @@ -1295,7 +1495,7 @@ extern struct task_struct *find_task_by_ extern void __set_special_pids(pid_t session, pid_t pgrp); /* per-UID process charging. */ -extern struct user_struct * alloc_uid(uid_t); +extern struct user_struct * alloc_uid(struct user_namespace *, uid_t); static inline struct user_struct *get_uid(struct user_struct *u) { atomic_inc(&u->__count); @@ -1303,6 +1503,7 @@ static inline struct user_struct *get_ui } extern void free_uid(struct user_struct *); extern void switch_uid(struct user_struct *); +extern void release_uids(struct user_namespace *ns); #include @@ -1317,8 +1518,8 @@ extern void FASTCALL(wake_up_new_task(st #else static inline void kick_process(struct task_struct *tsk) { } #endif -extern void FASTCALL(sched_fork(struct task_struct * p, int clone_flags)); -extern void FASTCALL(sched_exit(struct task_struct * p)); +extern void sched_fork(struct task_struct *p, int clone_flags); +extern void sched_dead(struct task_struct *p); extern int in_group_p(gid_t); extern int in_egroup_p(gid_t); @@ -1406,7 +1607,7 @@ extern struct mm_struct * mm_alloc(void) extern void FASTCALL(__mmdrop(struct mm_struct *)); static inline void mmdrop(struct mm_struct * mm) { - if (atomic_dec_and_test(&mm->mm_count)) + if (unlikely(atomic_dec_and_test(&mm->mm_count))) __mmdrop(mm); } @@ -1638,10 +1839,7 @@ static inline unsigned int task_cpu(cons return task_thread_info(p)->cpu; } -static inline void set_task_cpu(struct task_struct *p, unsigned int cpu) -{ - task_thread_info(p)->cpu = cpu; -} +extern void set_task_cpu(struct task_struct *p, unsigned int cpu); #else @@ -1674,6 +1872,18 @@ extern int sched_mc_power_savings, sched extern void normalize_rt_tasks(void); +#ifdef CONFIG_FAIR_GROUP_SCHED + +extern struct task_group init_task_group; + +extern struct task_group *sched_create_group(void); +extern void sched_destroy_group(struct task_group *tg); +extern void sched_move_task(struct task_struct *tsk); +extern int sched_group_set_shares(struct task_group *tg, unsigned long shares); +extern unsigned long sched_group_shares(struct task_group *tg); + +#endif + #ifdef CONFIG_TASK_XACCT static inline void add_rchar(struct task_struct *tsk, ssize_t amt) { @@ -1712,6 +1922,14 @@ static inline void inc_syscw(struct task } #endif +#ifdef CONFIG_SMP +void migration_init(void); +#else +static inline void migration_init(void) +{ +} +#endif + #endif /* __KERNEL__ */ #endif Index: linux-cfs-2.6.22.13.q/include/linux/taskstats.h =================================================================== --- linux-cfs-2.6.22.13.q.orig/include/linux/taskstats.h +++ linux-cfs-2.6.22.13.q/include/linux/taskstats.h @@ -31,7 +31,7 @@ */ -#define TASKSTATS_VERSION 4 +#define TASKSTATS_VERSION 6 #define TS_COMM_LEN 32 /* should be >= TASK_COMM_LEN * in linux/sched.h */ @@ -149,6 +149,14 @@ struct taskstats { __u64 read_bytes; /* bytes of read I/O */ __u64 write_bytes; 
/* bytes of write I/O */ __u64 cancelled_write_bytes; /* bytes of cancelled write I/O */ + + __u64 nvcsw; /* voluntary_ctxt_switches */ + __u64 nivcsw; /* nonvoluntary_ctxt_switches */ + + /* time accounting for SMT machines */ + __u64 ac_utimescaled; /* utime scaled on frequency etc */ + __u64 ac_stimescaled; /* stime scaled on frequency etc */ + __u64 cpu_scaled_run_real_total; /* scaled cpu_run_real_total */ }; Index: linux-cfs-2.6.22.13.q/include/linux/topology.h =================================================================== --- linux-cfs-2.6.22.13.q.orig/include/linux/topology.h +++ linux-cfs-2.6.22.13.q/include/linux/topology.h @@ -50,10 +50,10 @@ for_each_online_node(node) \ if (nr_cpus_node(node)) -#ifndef node_distance /* Conform to ACPI 2.0 SLIT distance definitions */ #define LOCAL_DISTANCE 10 #define REMOTE_DISTANCE 20 +#ifndef node_distance #define node_distance(from,to) ((from) == (to) ? LOCAL_DISTANCE : REMOTE_DISTANCE) #endif #ifndef RECLAIM_DISTANCE @@ -98,7 +98,7 @@ .cache_nice_tries = 0, \ .busy_idx = 0, \ .idle_idx = 0, \ - .newidle_idx = 1, \ + .newidle_idx = 0, \ .wake_idx = 0, \ .forkexec_idx = 0, \ .flags = SD_LOAD_BALANCE \ @@ -128,14 +128,15 @@ .imbalance_pct = 125, \ .cache_nice_tries = 1, \ .busy_idx = 2, \ - .idle_idx = 1, \ - .newidle_idx = 2, \ + .idle_idx = 0, \ + .newidle_idx = 0, \ .wake_idx = 1, \ .forkexec_idx = 1, \ .flags = SD_LOAD_BALANCE \ | SD_BALANCE_NEWIDLE \ | SD_BALANCE_EXEC \ | SD_WAKE_AFFINE \ + | SD_WAKE_IDLE \ | SD_SHARE_PKG_RESOURCES\ | BALANCE_FOR_MC_POWER, \ .last_balance = jiffies, \ @@ -183,7 +184,6 @@ .max_interval = 64*num_online_cpus(), \ .busy_factor = 128, \ .imbalance_pct = 133, \ - .cache_hot_time = (10*1000000), \ .cache_nice_tries = 1, \ .busy_idx = 3, \ .idle_idx = 3, \ Index: linux-cfs-2.6.22.13.q/include/linux/user_namespace.h =================================================================== --- /dev/null +++ linux-cfs-2.6.22.13.q/include/linux/user_namespace.h @@ -0,0 +1,61 @@ +#ifndef _LINUX_USER_NAMESPACE_H +#define _LINUX_USER_NAMESPACE_H + +#include +#include +#include +#include + +#define UIDHASH_BITS (CONFIG_BASE_SMALL ? 
3 : 8) +#define UIDHASH_SZ (1 << UIDHASH_BITS) + +struct user_namespace { + struct kref kref; + struct hlist_head uidhash_table[UIDHASH_SZ]; + struct user_struct *root_user; +}; + +extern struct user_namespace init_user_ns; + +#ifdef CONFIG_USER_NS + +static inline struct user_namespace *get_user_ns(struct user_namespace *ns) +{ + if (ns) + kref_get(&ns->kref); + return ns; +} + +extern struct user_namespace *copy_user_ns(int flags, + struct user_namespace *old_ns); +extern void free_user_ns(struct kref *kref); + +static inline void put_user_ns(struct user_namespace *ns) +{ + if (ns) + kref_put(&ns->kref, free_user_ns); +} + +#else + +static inline struct user_namespace *get_user_ns(struct user_namespace *ns) +{ + return &init_user_ns; +} + +static inline struct user_namespace *copy_user_ns(int flags, + struct user_namespace *old_ns) +{ + if (flags & CLONE_NEWUSER) + return ERR_PTR(-EINVAL); + + return old_ns; +} + +static inline void put_user_ns(struct user_namespace *ns) +{ +} + +#endif + +#endif /* _LINUX_USER_H */ Index: linux-cfs-2.6.22.13.q/init/Kconfig =================================================================== --- linux-cfs-2.6.22.13.q.orig/init/Kconfig +++ linux-cfs-2.6.22.13.q/init/Kconfig @@ -284,6 +284,11 @@ config LOG_BUF_SHIFT config CPUSETS bool "Cpuset support" depends on SMP + # + # disabled for now - depends on control groups, which + # are hard to backport: + # + depends on 0 help This option will let you create and manage CPUSETs which allow dynamically partitioning a system into sets of CPUs and @@ -292,6 +297,27 @@ config CPUSETS Say N if unsure. +config FAIR_GROUP_SCHED + bool "Fair group CPU scheduler" + default y + depends on EXPERIMENTAL + help + This feature lets CPU scheduler recognize task groups and control CPU + bandwidth allocation to such task groups. + +choice + depends on FAIR_GROUP_SCHED + prompt "Basis for grouping tasks" + default FAIR_USER_SCHED + +config FAIR_USER_SCHED + bool "user id" + help + This option will choose userid as the basis for grouping + tasks, thus providing equal CPU bandwidth to each user. + +endchoice + config SYSFS_DEPRECATED bool "Create deprecated sysfs files" default y Index: linux-cfs-2.6.22.13.q/init/main.c =================================================================== --- linux-cfs-2.6.22.13.q.orig/init/main.c +++ linux-cfs-2.6.22.13.q/init/main.c @@ -436,15 +436,16 @@ static void noinline __init_refok rest_i /* * The boot idle thread must execute schedule() - * at least one to get things moving: + * at least once to get things moving: */ + init_idle_bootup_task(current); preempt_enable_no_resched(); schedule(); preempt_disable(); /* Call into cpu_idle with preempt disabled */ cpu_idle(); -} +} /* Check for early params. 
*/ static int __init do_early_param(char *param, char *val) @@ -727,11 +728,8 @@ static void __init do_basic_setup(void) static void __init do_pre_smp_initcalls(void) { extern int spawn_ksoftirqd(void); -#ifdef CONFIG_SMP - extern int migration_init(void); migration_init(); -#endif spawn_ksoftirqd(); spawn_softlockup_task(); } Index: linux-cfs-2.6.22.13.q/kernel/Makefile =================================================================== --- linux-cfs-2.6.22.13.q.orig/kernel/Makefile +++ linux-cfs-2.6.22.13.q/kernel/Makefile @@ -4,7 +4,7 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \ exit.o itimer.o time.o softirq.o resource.o \ - sysctl.o capability.o ptrace.o timer.o user.o \ + sysctl.o capability.o ptrace.o timer.o user.o user_namespace.o \ signal.o sys.o kmod.o workqueue.o pid.o \ rcupdate.o extable.o params.o posix-timers.o \ kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ Index: linux-cfs-2.6.22.13.q/kernel/delayacct.c =================================================================== --- linux-cfs-2.6.22.13.q.orig/kernel/delayacct.c +++ linux-cfs-2.6.22.13.q/kernel/delayacct.c @@ -99,9 +99,10 @@ void __delayacct_blkio_end(void) int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) { s64 tmp; - struct timespec ts; - unsigned long t1,t2,t3; + unsigned long t1; + unsigned long long t2,t3; unsigned long flags; + struct timespec ts; /* Though tsk->delays accessed later, early exit avoids * unnecessary returning of other data @@ -114,21 +115,26 @@ int __delayacct_add_tsk(struct taskstats tmp += timespec_to_ns(&ts); d->cpu_run_real_total = (tmp < (s64)d->cpu_run_real_total) ? 0 : tmp; + tmp = (s64)d->cpu_scaled_run_real_total; + cputime_to_timespec(tsk->utimescaled + tsk->stimescaled, &ts); + tmp += timespec_to_ns(&ts); + d->cpu_scaled_run_real_total = + (tmp < (s64)d->cpu_scaled_run_real_total) ? 0 : tmp; + /* * No locking available for sched_info (and too expensive to add one) * Mitigate by taking snapshot of values */ - t1 = tsk->sched_info.pcnt; + t1 = tsk->sched_info.pcount; t2 = tsk->sched_info.run_delay; t3 = tsk->sched_info.cpu_time; d->cpu_count += t1; - jiffies_to_timespec(t2, &ts); - tmp = (s64)d->cpu_delay_total + timespec_to_ns(&ts); + tmp = (s64)d->cpu_delay_total + t2; d->cpu_delay_total = (tmp < (s64)d->cpu_delay_total) ? 0 : tmp; - tmp = (s64)d->cpu_run_virtual_total + (s64)jiffies_to_usecs(t3) * 1000; + tmp = (s64)d->cpu_run_virtual_total + t3; d->cpu_run_virtual_total = (tmp < (s64)d->cpu_run_virtual_total) ? 0 : tmp; Index: linux-cfs-2.6.22.13.q/kernel/exit.c =================================================================== --- linux-cfs-2.6.22.13.q.orig/kernel/exit.c +++ linux-cfs-2.6.22.13.q/kernel/exit.c @@ -118,13 +118,14 @@ static void __exit_signal(struct task_st */ sig->utime = cputime_add(sig->utime, tsk->utime); sig->stime = cputime_add(sig->stime, tsk->stime); + sig->gtime = cputime_add(sig->gtime, tsk->gtime); sig->min_flt += tsk->min_flt; sig->maj_flt += tsk->maj_flt; sig->nvcsw += tsk->nvcsw; sig->nivcsw += tsk->nivcsw; - sig->sched_time += tsk->sched_time; sig->inblock += task_io_get_inblock(tsk); sig->oublock += task_io_get_oublock(tsk); + sig->sum_sched_runtime += tsk->se.sum_exec_runtime; sig = NULL; /* Marker for below. 
*/ } @@ -182,7 +183,6 @@ repeat: zap_leader = (leader->exit_signal == -1); } - sched_exit(p); write_unlock_irq(&tasklist_lock); proc_flush_task(p); release_thread(p); @@ -291,7 +291,7 @@ static void reparent_to_kthreadd(void) /* Set the exit signal to SIGCHLD so we signal init on exit */ current->exit_signal = SIGCHLD; - if (!has_rt_policy(current) && (task_nice(current) < 0)) + if (task_nice(current) < 0) set_user_nice(current, 0); /* cpus_allowed? */ /* rt_priority? */ @@ -1216,6 +1216,11 @@ static int wait_task_zombie(struct task_ cputime_add(p->stime, cputime_add(sig->stime, sig->cstime))); + psig->cgtime = + cputime_add(psig->cgtime, + cputime_add(p->gtime, + cputime_add(sig->gtime, + sig->cgtime))); psig->cmin_flt += p->min_flt + sig->min_flt + sig->cmin_flt; psig->cmaj_flt += Index: linux-cfs-2.6.22.13.q/kernel/fork.c =================================================================== --- linux-cfs-2.6.22.13.q.orig/kernel/fork.c +++ linux-cfs-2.6.22.13.q/kernel/fork.c @@ -874,10 +874,12 @@ static inline int copy_signal(unsigned l sig->tty_old_pgrp = NULL; sig->utime = sig->stime = sig->cutime = sig->cstime = cputime_zero; + sig->gtime = cputime_zero; + sig->cgtime = cputime_zero; sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0; sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0; sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0; - sig->sched_time = 0; + sig->sum_sched_runtime = 0; INIT_LIST_HEAD(&sig->cpu_timers[0]); INIT_LIST_HEAD(&sig->cpu_timers[1]); INIT_LIST_HEAD(&sig->cpu_timers[2]); @@ -999,7 +1001,7 @@ static struct task_struct *copy_process( if (atomic_read(&p->user->processes) >= p->signal->rlim[RLIMIT_NPROC].rlim_cur) { if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_RESOURCE) && - p->user != &root_user) + p->user != current->nsproxy->user_ns->root_user) goto bad_fork_free; } @@ -1040,7 +1042,12 @@ static struct task_struct *copy_process( p->utime = cputime_zero; p->stime = cputime_zero; - p->sched_time = 0; + p->prev_utime = cputime_zero; + p->prev_stime = cputime_zero; + p->gtime = cputime_zero; + p->utimescaled = cputime_zero; + p->stimescaled = cputime_zero; + #ifdef CONFIG_TASK_XACCT p->rchar = 0; /* I/O counter: bytes read */ p->wchar = 0; /* I/O counter: bytes written */ @@ -1601,7 +1608,7 @@ asmlinkage long sys_unshare(unsigned lon err = -EINVAL; if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND| CLONE_VM|CLONE_FILES|CLONE_SYSVSEM| - CLONE_NEWUTS|CLONE_NEWIPC)) + CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWUSER)) goto bad_unshare_out; if ((err = unshare_thread(unshare_flags))) Index: linux-cfs-2.6.22.13.q/kernel/ksysfs.c =================================================================== --- linux-cfs-2.6.22.13.q.orig/kernel/ksysfs.c +++ linux-cfs-2.6.22.13.q/kernel/ksysfs.c @@ -14,6 +14,7 @@ #include #include #include +#include #define KERNEL_ATTR_RO(_name) \ static struct subsys_attribute _name##_attr = __ATTR_RO(_name) @@ -88,6 +89,13 @@ static int __init ksysfs_init(void) error = sysfs_create_group(&kernel_subsys.kobj, &kernel_attr_group); + /* + * Create "/sys/kernel/uids" directory and corresponding root user's + * directory under it. 
+ */ + if (!error) + error = uids_kobject_init(); + return error; } Index: linux-cfs-2.6.22.13.q/kernel/nsproxy.c =================================================================== --- linux-cfs-2.6.22.13.q.orig/kernel/nsproxy.c +++ linux-cfs-2.6.22.13.q/kernel/nsproxy.c @@ -20,6 +20,9 @@ #include #include #include +#include + +static struct kmem_cache *nsproxy_cachep; struct nsproxy init_nsproxy = INIT_NSPROXY(init_nsproxy); @@ -43,9 +46,11 @@ static inline struct nsproxy *clone_nspr { struct nsproxy *ns; - ns = kmemdup(orig, sizeof(struct nsproxy), GFP_KERNEL); - if (ns) + ns = kmem_cache_alloc(nsproxy_cachep, GFP_KERNEL); + if (ns) { + memcpy(ns, orig, sizeof(struct nsproxy)); atomic_set(&ns->count, 1); + } return ns; } @@ -54,33 +59,51 @@ static inline struct nsproxy *clone_nspr * Return the newly created nsproxy. Do not attach this to the task, * leave it to the caller to do proper locking and attach it to task. */ -static struct nsproxy *create_new_namespaces(int flags, struct task_struct *tsk, - struct fs_struct *new_fs) +static struct nsproxy *create_new_namespaces(unsigned long flags, + struct task_struct *tsk, struct fs_struct *new_fs) { struct nsproxy *new_nsp; + int err; new_nsp = clone_nsproxy(tsk->nsproxy); if (!new_nsp) return ERR_PTR(-ENOMEM); new_nsp->mnt_ns = copy_mnt_ns(flags, tsk->nsproxy->mnt_ns, new_fs); - if (IS_ERR(new_nsp->mnt_ns)) + if (IS_ERR(new_nsp->mnt_ns)) { + err = PTR_ERR(new_nsp->mnt_ns); goto out_ns; + } new_nsp->uts_ns = copy_utsname(flags, tsk->nsproxy->uts_ns); - if (IS_ERR(new_nsp->uts_ns)) + if (IS_ERR(new_nsp->uts_ns)) { + err = PTR_ERR(new_nsp->uts_ns); goto out_uts; + } new_nsp->ipc_ns = copy_ipcs(flags, tsk->nsproxy->ipc_ns); - if (IS_ERR(new_nsp->ipc_ns)) + if (IS_ERR(new_nsp->ipc_ns)) { + err = PTR_ERR(new_nsp->ipc_ns); goto out_ipc; + } new_nsp->pid_ns = copy_pid_ns(flags, tsk->nsproxy->pid_ns); - if (IS_ERR(new_nsp->pid_ns)) + if (IS_ERR(new_nsp->pid_ns)) { + err = PTR_ERR(new_nsp->pid_ns); goto out_pid; + } + + new_nsp->user_ns = copy_user_ns(flags, tsk->nsproxy->user_ns); + if (IS_ERR(new_nsp->user_ns)) { + err = PTR_ERR(new_nsp->user_ns); + goto out_user; + } return new_nsp; +out_user: + if (new_nsp->pid_ns) + put_pid_ns(new_nsp->pid_ns); out_pid: if (new_nsp->ipc_ns) put_ipc_ns(new_nsp->ipc_ns); @@ -91,15 +114,15 @@ out_uts: if (new_nsp->mnt_ns) put_mnt_ns(new_nsp->mnt_ns); out_ns: - kfree(new_nsp); - return ERR_PTR(-ENOMEM); + kmem_cache_free(nsproxy_cachep, new_nsp); + return ERR_PTR(err); } /* * called from clone. This now handles copy for nsproxy and all * namespaces therein. 
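(Illustrative aside, not part of the patch: the create_new_namespaces() hunk above stops returning a blanket -ENOMEM and instead propagates PTR_ERR() of whichever copy_*() call failed, unwinding the already-created namespaces in reverse order. The standalone C below sketches that pattern with invented names; ERR_PTR/PTR_ERR/IS_ERR here are userspace stand-ins for the linux/err.h helpers.)

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

/* Userspace stand-ins for the kernel's linux/err.h helpers. */
static void *ERR_PTR(long err)      { return (void *)err; }
static long PTR_ERR(const void *p)  { return (long)p; }
static int IS_ERR(const void *p)    { return (unsigned long)p >= (unsigned long)-4095; }

/* Hypothetical resource standing in for one namespace copy_*() call. */
static void *copy_res(int fail) { return fail ? ERR_PTR(-ENOMEM) : malloc(1); }
static void put_res(void *r)    { free(r); }

struct bundle { void *a, *b; };

/* Build both resources; on failure unwind in reverse order and return
 * ERR_PTR() carrying the precise error code. */
static struct bundle *create_bundle(int fail_b)
{
	struct bundle *ns = malloc(sizeof(*ns));
	long err;

	if (!ns)
		return ERR_PTR(-ENOMEM);

	ns->a = copy_res(0);
	if (IS_ERR(ns->a)) {
		err = PTR_ERR(ns->a);
		goto out_free;
	}

	ns->b = copy_res(fail_b);
	if (IS_ERR(ns->b)) {
		err = PTR_ERR(ns->b);	/* keep the real error code */
		goto out_a;		/* undo what already succeeded */
	}
	return ns;

out_a:
	put_res(ns->a);
out_free:
	free(ns);
	return ERR_PTR(err);
}

int main(void)
{
	struct bundle *ok = create_bundle(0);
	struct bundle *bad = create_bundle(1);

	printf("ok:  %s\n", IS_ERR(ok) ? "error" : "created");
	printf("bad: error %ld\n", PTR_ERR(bad));	/* -ENOMEM, i.e. -12 */

	if (!IS_ERR(ok)) {
		put_res(ok->b);
		put_res(ok->a);
		free(ok);
	}
	return 0;
}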
*/ -int copy_namespaces(int flags, struct task_struct *tsk) +int copy_namespaces(unsigned long flags, struct task_struct *tsk) { struct nsproxy *old_ns = tsk->nsproxy; struct nsproxy *new_ns; @@ -110,7 +133,7 @@ int copy_namespaces(int flags, struct ta get_nsproxy(old_ns); - if (!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC))) + if (!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC | CLONE_NEWUSER))) return 0; if (!capable(CAP_SYS_ADMIN)) { @@ -140,7 +163,9 @@ void free_nsproxy(struct nsproxy *ns) put_ipc_ns(ns->ipc_ns); if (ns->pid_ns) put_pid_ns(ns->pid_ns); - kfree(ns); + if (ns->user_ns) + put_user_ns(ns->user_ns); + kmem_cache_free(nsproxy_cachep, ns); } /* @@ -152,19 +177,10 @@ int unshare_nsproxy_namespaces(unsigned { int err = 0; - if (!(unshare_flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC))) + if (!(unshare_flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC | + CLONE_NEWUSER))) return 0; -#ifndef CONFIG_IPC_NS - if (unshare_flags & CLONE_NEWIPC) - return -EINVAL; -#endif - -#ifndef CONFIG_UTS_NS - if (unshare_flags & CLONE_NEWUTS) - return -EINVAL; -#endif - if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -174,3 +190,12 @@ int unshare_nsproxy_namespaces(unsigned err = PTR_ERR(*new_nsp); return err; } + +static int __init nsproxy_cache_init(void) +{ + nsproxy_cachep = kmem_cache_create("nsproxy", sizeof(struct nsproxy), + 0, SLAB_PANIC, NULL, NULL); + return 0; +} + +module_init(nsproxy_cache_init); Index: linux-cfs-2.6.22.13.q/kernel/posix-cpu-timers.c =================================================================== --- linux-cfs-2.6.22.13.q.orig/kernel/posix-cpu-timers.c +++ linux-cfs-2.6.22.13.q/kernel/posix-cpu-timers.c @@ -161,7 +161,7 @@ static inline cputime_t virt_ticks(struc } static inline unsigned long long sched_ns(struct task_struct *p) { - return (p == current) ? current_sched_time(p) : p->sched_time; + return task_sched_runtime(p); } int posix_cpu_clock_getres(const clockid_t which_clock, struct timespec *tp) @@ -246,10 +246,10 @@ static int cpu_clock_sample_group_locked } while (t != p); break; case CPUCLOCK_SCHED: - cpu->sched = p->signal->sched_time; + cpu->sched = p->signal->sum_sched_runtime; /* Add in each other live thread. 
*/ while ((t = next_thread(t)) != p) { - cpu->sched += t->sched_time; + cpu->sched += t->se.sum_exec_runtime; } cpu->sched += sched_ns(p); break; @@ -422,7 +422,7 @@ int posix_cpu_timer_del(struct k_itimer */ static void cleanup_timers(struct list_head *head, cputime_t utime, cputime_t stime, - unsigned long long sched_time) + unsigned long long sum_exec_runtime) { struct cpu_timer_list *timer, *next; cputime_t ptime = cputime_add(utime, stime); @@ -451,10 +451,10 @@ static void cleanup_timers(struct list_h ++head; list_for_each_entry_safe(timer, next, head, entry) { list_del_init(&timer->entry); - if (timer->expires.sched < sched_time) { + if (timer->expires.sched < sum_exec_runtime) { timer->expires.sched = 0; } else { - timer->expires.sched -= sched_time; + timer->expires.sched -= sum_exec_runtime; } } } @@ -467,7 +467,7 @@ static void cleanup_timers(struct list_h void posix_cpu_timers_exit(struct task_struct *tsk) { cleanup_timers(tsk->cpu_timers, - tsk->utime, tsk->stime, tsk->sched_time); + tsk->utime, tsk->stime, tsk->se.sum_exec_runtime); } void posix_cpu_timers_exit_group(struct task_struct *tsk) @@ -475,7 +475,7 @@ void posix_cpu_timers_exit_group(struct cleanup_timers(tsk->signal->cpu_timers, cputime_add(tsk->utime, tsk->signal->utime), cputime_add(tsk->stime, tsk->signal->stime), - tsk->sched_time + tsk->signal->sched_time); + tsk->se.sum_exec_runtime + tsk->signal->sum_sched_runtime); } @@ -536,7 +536,7 @@ static void process_timer_rebalance(stru nsleft = max_t(unsigned long long, nsleft, 1); do { if (likely(!(t->flags & PF_EXITING))) { - ns = t->sched_time + nsleft; + ns = t->se.sum_exec_runtime + nsleft; if (t->it_sched_expires == 0 || t->it_sched_expires > ns) { t->it_sched_expires = ns; @@ -1004,7 +1004,7 @@ static void check_thread_timers(struct t struct cpu_timer_list *t = list_first_entry(timers, struct cpu_timer_list, entry); - if (!--maxfire || tsk->sched_time < t->expires.sched) { + if (!--maxfire || tsk->se.sum_exec_runtime < t->expires.sched) { tsk->it_sched_expires = t->expires.sched; break; } @@ -1024,7 +1024,7 @@ static void check_process_timers(struct int maxfire; struct signal_struct *const sig = tsk->signal; cputime_t utime, stime, ptime, virt_expires, prof_expires; - unsigned long long sched_time, sched_expires; + unsigned long long sum_sched_runtime, sched_expires; struct task_struct *t; struct list_head *timers = sig->cpu_timers; @@ -1044,12 +1044,12 @@ static void check_process_timers(struct */ utime = sig->utime; stime = sig->stime; - sched_time = sig->sched_time; + sum_sched_runtime = sig->sum_sched_runtime; t = tsk; do { utime = cputime_add(utime, t->utime); stime = cputime_add(stime, t->stime); - sched_time += t->sched_time; + sum_sched_runtime += t->se.sum_exec_runtime; t = next_thread(t); } while (t != tsk); ptime = cputime_add(utime, stime); @@ -1090,7 +1090,7 @@ static void check_process_timers(struct struct cpu_timer_list *t = list_first_entry(timers, struct cpu_timer_list, entry); - if (!--maxfire || sched_time < t->expires.sched) { + if (!--maxfire || sum_sched_runtime < t->expires.sched) { sched_expires = t->expires.sched; break; } @@ -1182,7 +1182,7 @@ static void check_process_timers(struct virt_left = cputime_sub(virt_expires, utime); virt_left = cputime_div_non_zero(virt_left, nthreads); if (sched_expires) { - sched_left = sched_expires - sched_time; + sched_left = sched_expires - sum_sched_runtime; do_div(sched_left, nthreads); sched_left = max_t(unsigned long long, sched_left, 1); } else { @@ -1208,7 +1208,7 @@ static void 
check_process_timers(struct t->it_virt_expires = ticks; } - sched = t->sched_time + sched_left; + sched = t->se.sum_exec_runtime + sched_left; if (sched_expires && (t->it_sched_expires == 0 || t->it_sched_expires > sched)) { t->it_sched_expires = sched; @@ -1300,7 +1300,7 @@ void run_posix_cpu_timers(struct task_st if (UNEXPIRED(prof) && UNEXPIRED(virt) && (tsk->it_sched_expires == 0 || - tsk->sched_time < tsk->it_sched_expires)) + tsk->se.sum_exec_runtime < tsk->it_sched_expires)) return; #undef UNEXPIRED Index: linux-cfs-2.6.22.13.q/kernel/sched.c =================================================================== --- linux-cfs-2.6.22.13.q.orig/kernel/sched.c +++ linux-cfs-2.6.22.13.q/kernel/sched.c @@ -16,13 +16,19 @@ * by Davide Libenzi, preemptible kernel bits by Robert Love. * 2003-09-03 Interactivity tuning by Con Kolivas. * 2004-04-02 Scheduler domains code by Nick Piggin + * 2007-04-15 Work begun on replacing all interactivity tuning with a + * fair scheduling design by Con Kolivas. + * 2007-05-05 Load balancing (smp-nice) and other improvements + * by Peter Williams + * 2007-05-06 Interactivity improvements to CFS by Mike Galbraith + * 2007-07-01 Group scheduling enhancements by Srivatsa Vaddagiri */ #include #include #include #include -#include +#include #include #include #include @@ -38,6 +44,7 @@ #include #include #include +#include #include #include #include @@ -47,15 +54,18 @@ #include #include #include +#include #include #include #include #include #include #include +#include +#include #include -#include +#include /* * Scheduler clock - returns current time in nanosec units. @@ -64,7 +74,7 @@ */ unsigned long long __attribute__((weak)) sched_clock(void) { - return (unsigned long long)jiffies * (1000000000 / HZ); + return (unsigned long long)jiffies * (NSEC_PER_SEC / HZ); } /* @@ -88,99 +98,19 @@ unsigned long long __attribute__((weak)) /* * Some helpers for converting nanosecond timing to jiffy resolution */ -#define NS_TO_JIFFIES(TIME) ((TIME) / (1000000000 / HZ)) -#define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ)) +#define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ)) +#define JIFFIES_TO_NS(TIME) ((TIME) * (NSEC_PER_SEC / HZ)) + +#define NICE_0_LOAD SCHED_LOAD_SCALE +#define NICE_0_SHIFT SCHED_LOAD_SHIFT /* * These are the 'tuning knobs' of the scheduler: * - * Minimum timeslice is 5 msecs (or 1 jiffy, whichever is larger), - * default timeslice is 100 msecs, maximum timeslice is 800 msecs. + * default timeslice is 100 msecs (used only for SCHED_RR tasks). * Timeslices get refilled after they expire. */ -#define MIN_TIMESLICE max(5 * HZ / 1000, 1) #define DEF_TIMESLICE (100 * HZ / 1000) -#define ON_RUNQUEUE_WEIGHT 30 -#define CHILD_PENALTY 95 -#define PARENT_PENALTY 100 -#define EXIT_WEIGHT 3 -#define PRIO_BONUS_RATIO 25 -#define MAX_BONUS (MAX_USER_PRIO * PRIO_BONUS_RATIO / 100) -#define INTERACTIVE_DELTA 2 -#define MAX_SLEEP_AVG (DEF_TIMESLICE * MAX_BONUS) -#define STARVATION_LIMIT (MAX_SLEEP_AVG) -#define NS_MAX_SLEEP_AVG (JIFFIES_TO_NS(MAX_SLEEP_AVG)) - -/* - * If a task is 'interactive' then we reinsert it in the active - * array after it has expired its current timeslice. (it will not - * continue to run immediately, it will still roundrobin with - * other interactive tasks.) - * - * This part scales the interactivity limit depending on niceness. - * - * We scale it linearly, offset by the INTERACTIVE_DELTA delta. 
- * Here are a few examples of different nice levels: - * - * TASK_INTERACTIVE(-20): [1,1,1,1,1,1,1,1,1,0,0] - * TASK_INTERACTIVE(-10): [1,1,1,1,1,1,1,0,0,0,0] - * TASK_INTERACTIVE( 0): [1,1,1,1,0,0,0,0,0,0,0] - * TASK_INTERACTIVE( 10): [1,1,0,0,0,0,0,0,0,0,0] - * TASK_INTERACTIVE( 19): [0,0,0,0,0,0,0,0,0,0,0] - * - * (the X axis represents the possible -5 ... 0 ... +5 dynamic - * priority range a task can explore, a value of '1' means the - * task is rated interactive.) - * - * Ie. nice +19 tasks can never get 'interactive' enough to be - * reinserted into the active array. And only heavily CPU-hog nice -20 - * tasks will be expired. Default nice 0 tasks are somewhere between, - * it takes some effort for them to get interactive, but it's not - * too hard. - */ - -#define CURRENT_BONUS(p) \ - (NS_TO_JIFFIES((p)->sleep_avg) * MAX_BONUS / \ - MAX_SLEEP_AVG) - -#define GRANULARITY (10 * HZ / 1000 ? : 1) - -#ifdef CONFIG_SMP -#define TIMESLICE_GRANULARITY(p) (GRANULARITY * \ - (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)) * \ - num_online_cpus()) -#else -#define TIMESLICE_GRANULARITY(p) (GRANULARITY * \ - (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1))) -#endif - -#define SCALE(v1,v1_max,v2_max) \ - (v1) * (v2_max) / (v1_max) - -#define DELTA(p) \ - (SCALE(TASK_NICE(p) + 20, 40, MAX_BONUS) - 20 * MAX_BONUS / 40 + \ - INTERACTIVE_DELTA) - -#define TASK_INTERACTIVE(p) \ - ((p)->prio <= (p)->static_prio - DELTA(p)) - -#define INTERACTIVE_SLEEP(p) \ - (JIFFIES_TO_NS(MAX_SLEEP_AVG * \ - (MAX_BONUS / 2 + DELTA((p)) + 1) / MAX_BONUS - 1)) - -#define TASK_PREEMPTS_CURR(p, rq) \ - ((p)->prio < (rq)->curr->prio) - -#define SCALE_PRIO(x, prio) \ - max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_TIMESLICE) - -static unsigned int static_prio_timeslice(int static_prio) -{ - if (static_prio < NICE_TO_PRIO(0)) - return SCALE_PRIO(DEF_TIMESLICE * 4, static_prio); - else - return SCALE_PRIO(DEF_TIMESLICE, static_prio); -} #ifdef CONFIG_SMP /* @@ -203,28 +133,201 @@ static inline void sg_inc_cpu_power(stru } #endif -/* - * task_timeslice() scales user-nice values [ -20 ... 0 ... 19 ] - * to time slice values: [800ms ... 100ms ... 5ms] - * - * The higher a thread's priority, the bigger timeslices - * it gets during one round of execution. But even the lowest - * priority thread gets MIN_TIMESLICE worth of execution time. - */ +static inline int rt_policy(int policy) +{ + if (unlikely(policy == SCHED_FIFO) || unlikely(policy == SCHED_RR)) + return 1; + return 0; +} -static inline unsigned int task_timeslice(struct task_struct *p) +static inline int task_has_rt_policy(struct task_struct *p) { - return static_prio_timeslice(p->static_prio); + return rt_policy(p->policy); } /* - * These are the runqueue data structures: + * This is the priority-queue data structure of the RT scheduling class: + */ +struct rt_prio_array { + DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */ + struct list_head queue[MAX_RT_PRIO]; +}; + +#ifdef CONFIG_FAIR_GROUP_SCHED + +#include + +struct cfs_rq; + +/* task group related information */ +struct task_group { +#ifdef CONFIG_FAIR_CGROUP_SCHED + struct cgroup_subsys_state css; +#endif + /* schedulable entities of this group on each cpu */ + struct sched_entity **se; + /* runqueue "owned" by this group on each cpu */ + struct cfs_rq **cfs_rq; + + /* shares assigned to a task group governs how much of cpu bandwidth + * is allocated to the group. The more shares a group has, the more is + * the cpu bandwidth allocated to it. 
+ * + * For ex, lets say that there are three task groups, A, B and C which + * have been assigned shares 1000, 2000 and 3000 respectively. Then, + * cpu bandwidth allocated by the scheduler to task groups A, B and C + * should be: + * + * Bw(A) = 1000/(1000+2000+3000) * 100 = 16.66% + * Bw(B) = 2000/(1000+2000+3000) * 100 = 33.33% + * Bw(C) = 3000/(1000+2000+3000) * 100 = 50% + * + * The weight assigned to a task group's schedulable entities on every + * cpu (task_group.se[a_cpu]->load.weight) is derived from the task + * group's shares. For ex: lets say that task group A has been + * assigned shares of 1000 and there are two CPUs in a system. Then, + * + * tg_A->se[0]->load.weight = tg_A->se[1]->load.weight = 1000; + * + * Note: It's not necessary that each of a task's group schedulable + * entity have the same weight on all CPUs. If the group + * has 2 of its tasks on CPU0 and 1 task on CPU1, then a + * better distribution of weight could be: + * + * tg_A->se[0]->load.weight = 2/3 * 2000 = 1333 + * tg_A->se[1]->load.weight = 1/2 * 2000 = 667 + * + * rebalance_shares() is responsible for distributing the shares of a + * task groups like this among the group's schedulable entities across + * cpus. + * + */ + unsigned long shares; + + /* lock to serialize modification to shares */ + struct mutex lock; + + unsigned long last_total_load; + struct rcu_head rcu; +}; + +/* Default task group's sched entity on each cpu */ +static DEFINE_PER_CPU(struct sched_entity, init_sched_entity); +/* Default task group's cfs_rq on each cpu */ +static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp; + +static struct sched_entity *init_sched_entity_p[NR_CPUS]; +static struct cfs_rq *init_cfs_rq_p[NR_CPUS]; + +static DEFINE_MUTEX(doms_cur_mutex); /* serialize access to doms_curr[] array */ + +#ifdef CONFIG_SMP +/* kernel thread that runs rebalance_shares() periodically */ +static struct task_struct *lb_monitor_task; + +static int load_balance_monitor(void *unused); +#endif + +static void set_se_shares(struct sched_entity *se, unsigned long shares); + +/* Default task group. + * Every task in system belong to this group at bootup. 
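(Illustrative aside, not part of the patch: the share arithmetic described in the comment above can be checked with a few lines of standalone C. The group names, share values and task placement below are the ones used in that comment; the printed figures match it up to rounding.)

#include <stdio.h>

int main(void)
{
	/* The three hypothetical groups from the comment above. */
	static const char *name[]    = { "A", "B", "C" };
	static const double shares[] = { 1000, 2000, 3000 };
	double total = 0;
	int i;

	for (i = 0; i < 3; i++)
		total += shares[i];

	/* A group's CPU bandwidth is its fraction of the summed shares. */
	for (i = 0; i < 3; i++)
		printf("Bw(%s) = %.0f/%.0f = %5.2f%%\n",
		       name[i], shares[i], total, 100.0 * shares[i] / total);

	/*
	 * Splitting group A's 2000 shares across CPUs in proportion to its
	 * runnable tasks there: 2 tasks on CPU0, 1 task on CPU1.
	 */
	printf("tg_A->se[0]->load.weight ~ %.0f\n", 2.0 / 3.0 * 2000);	/* 1333 */
	printf("tg_A->se[1]->load.weight ~ %.0f\n", 1.0 / 3.0 * 2000);	/*  667 */
	return 0;
}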
*/ +struct task_group init_task_group = { + .se = init_sched_entity_p, + .cfs_rq = init_cfs_rq_p, +}; + +#ifdef CONFIG_FAIR_USER_SCHED +# define INIT_TASK_GROUP_LOAD 2*NICE_0_LOAD /* root user's cpu share */ +#else +# define INIT_TASK_GROUP_LOAD NICE_0_LOAD +#endif + +#define MIN_GROUP_SHARES 100 + +static int init_task_group_load = INIT_TASK_GROUP_LOAD; + +/* return group to which a task belongs */ +static inline struct task_group *task_group(struct task_struct *p) +{ + struct task_group *tg; + +#ifdef CONFIG_FAIR_USER_SCHED + tg = p->user->tg; +#elif defined(CONFIG_FAIR_CGROUP_SCHED) + tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id), + struct task_group, css); +#else + tg = &init_task_group; +#endif + + return tg; +} + +/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */ +static inline void set_task_cfs_rq(struct task_struct *p, unsigned int cpu) +{ + p->se.cfs_rq = task_group(p)->cfs_rq[cpu]; + p->se.parent = task_group(p)->se[cpu]; +} + +static inline void lock_doms_cur(void) +{ + mutex_lock(&doms_cur_mutex); +} + +static inline void unlock_doms_cur(void) +{ + mutex_unlock(&doms_cur_mutex); +} + +#else + +static inline void set_task_cfs_rq(struct task_struct *p, unsigned int cpu) { } +static inline void lock_doms_cur(void) { } +static inline void unlock_doms_cur(void) { } + +#endif /* CONFIG_FAIR_GROUP_SCHED */ + +/* CFS-related fields in a runqueue */ +struct cfs_rq { + struct load_weight load; + unsigned long nr_running; + + u64 exec_clock; + u64 min_vruntime; + + struct avl_root tasks_timeline; + struct avl_node *avl_leftmost; + struct avl_node *avl_load_balance_curr; + /* 'curr' points to currently running entity on this cfs_rq. + * It is set to NULL otherwise (i.e when none are currently running). + */ + struct sched_entity *curr; + + unsigned long nr_spread_over; + +#ifdef CONFIG_FAIR_GROUP_SCHED + struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */ + + /* leaf cfs_rqs are those that hold tasks (lowest schedulable entity in + * a hierarchy). Non-leaf lrqs hold other higher schedulable entities + * (like users, containers etc.) + * + * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This + * list is used during load balance. + */ + struct list_head leaf_cfs_rq_list; /* Better name : task_cfs_rq_list? */ + struct task_group *tg; /* group that "owns" this runqueue */ +#endif +}; -struct prio_array { - unsigned int nr_active; - DECLARE_BITMAP(bitmap, MAX_PRIO+1); /* include 1 bit for delimiter */ - struct list_head queue[MAX_PRIO]; +/* Real-Time classes' related field in a runqueue: */ +struct rt_rq { + struct rt_prio_array active; + int rt_load_balance_idx; + struct list_head *rt_load_balance_head, *rt_load_balance_curr; }; /* @@ -235,6 +338,7 @@ struct prio_array { * acquire operations must be ordered by ascending &runqueue. */ struct rq { + /* runqueue lock: */ spinlock_t lock; /* @@ -242,15 +346,23 @@ struct rq { * remote CPUs use both these fields when doing load calculation. 
*/ unsigned long nr_running; - unsigned long raw_weighted_load; -#ifdef CONFIG_SMP - unsigned long cpu_load[3]; + #define CPU_LOAD_IDX_MAX 5 + unsigned long cpu_load[CPU_LOAD_IDX_MAX]; unsigned char idle_at_tick; #ifdef CONFIG_NO_HZ unsigned char in_nohz_recently; #endif + /* capture load from *all* tasks on this cpu: */ + struct load_weight load; + unsigned long nr_load_updates; + u64 nr_switches; + + struct cfs_rq cfs; +#ifdef CONFIG_FAIR_GROUP_SCHED + /* list of leaf cfs_rq on this cpu: */ + struct list_head leaf_cfs_rq_list; #endif - unsigned long long nr_switches; + struct rt_rq rt; /* * This is part of a global counter where only the total sum @@ -260,14 +372,18 @@ struct rq { */ unsigned long nr_uninterruptible; - unsigned long expired_timestamp; - /* Cached timestamp set by update_cpu_clock() */ - unsigned long long most_recent_timestamp; struct task_struct *curr, *idle; unsigned long next_balance; struct mm_struct *prev_mm; - struct prio_array *active, *expired, arrays[2]; - int best_expired_prio; + + u64 clock, prev_clock_raw; + s64 clock_max_delta; + + unsigned int clock_warps, clock_overflows; + u64 idle_clock; + unsigned int clock_deep_idle_events; + u64 tick_timestamp; + atomic_t nr_iowait; #ifdef CONFIG_SMP @@ -276,7 +392,8 @@ struct rq { /* For active balancing */ int active_balance; int push_cpu; - int cpu; /* cpu of this runqueue */ + /* cpu of this runqueue: */ + int cpu; struct task_struct *migration_thread; struct list_head migration_queue; @@ -287,26 +404,34 @@ struct rq { struct sched_info rq_sched_info; /* sys_sched_yield() stats */ - unsigned long yld_exp_empty; - unsigned long yld_act_empty; - unsigned long yld_both_empty; - unsigned long yld_cnt; + unsigned int yld_exp_empty; + unsigned int yld_act_empty; + unsigned int yld_both_empty; + unsigned int yld_count; /* schedule() stats */ - unsigned long sched_switch; - unsigned long sched_cnt; - unsigned long sched_goidle; + unsigned int sched_switch; + unsigned int sched_count; + unsigned int sched_goidle; /* try_to_wake_up() stats */ - unsigned long ttwu_cnt; - unsigned long ttwu_local; + unsigned int ttwu_count; + unsigned int ttwu_local; + + /* BKL stats */ + unsigned int bkl_count; #endif struct lock_class_key rq_lock_key; }; -static DEFINE_PER_CPU(struct rq, runqueues) ____cacheline_aligned_in_smp; +static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); static DEFINE_MUTEX(sched_hotcpu_mutex); +static inline void check_preempt_curr(struct rq *rq, struct task_struct *p) +{ + rq->curr->sched_class->check_preempt_curr(rq, p); +} + static inline int cpu_of(struct rq *rq) { #ifdef CONFIG_SMP @@ -317,6 +442,53 @@ static inline int cpu_of(struct rq *rq) } /* + * Update the per-runqueue clock, as finegrained as the platform can give + * us, but without assuming monotonicity, etc.: + */ +static void __update_rq_clock(struct rq *rq) +{ + u64 prev_raw = rq->prev_clock_raw; + u64 now = sched_clock(); + s64 delta = now - prev_raw; + u64 clock = rq->clock; + +#ifdef CONFIG_SCHED_DEBUG + WARN_ON_ONCE(cpu_of(rq) != smp_processor_id()); +#endif + /* + * Protect against sched_clock() occasionally going backwards: + */ + if (unlikely(delta < 0)) { + clock++; + rq->clock_warps++; + } else { + /* + * Catch too large forward jumps too: + */ + if (unlikely(clock + delta > rq->tick_timestamp + TICK_NSEC)) { + if (clock < rq->tick_timestamp + TICK_NSEC) + clock = rq->tick_timestamp + TICK_NSEC; + else + clock++; + rq->clock_overflows++; + } else { + if (unlikely(delta > rq->clock_max_delta)) + rq->clock_max_delta = delta; + clock 
+= delta; + } + } + + rq->prev_clock_raw = now; + rq->clock = clock; +} + +static void update_rq_clock(struct rq *rq) +{ + if (likely(smp_processor_id() == cpu_of(rq))) + __update_rq_clock(rq); +} + +/* * The domain tree (rq->sd) is protected by RCU's quiescent state transition. * See detach_destroy_domains: synchronize_sched for details. * @@ -331,6 +503,61 @@ static inline int cpu_of(struct rq *rq) #define task_rq(p) cpu_rq(task_cpu(p)) #define cpu_curr(cpu) (cpu_rq(cpu)->curr) +/* + * Tunables that become constants when CONFIG_SCHED_DEBUG is off: + */ +#ifdef CONFIG_SCHED_DEBUG +# define const_debug __read_mostly +#else +# define const_debug static const +#endif + +/* + * Debugging: various feature bits + */ +enum { + SCHED_FEAT_NEW_FAIR_SLEEPERS = 1, + SCHED_FEAT_WAKEUP_PREEMPT = 2, + SCHED_FEAT_START_DEBIT = 4, + SCHED_FEAT_TREE_AVG = 8, + SCHED_FEAT_APPROX_AVG = 16, +}; + +const_debug unsigned int sysctl_sched_features = + SCHED_FEAT_NEW_FAIR_SLEEPERS * 1 | + SCHED_FEAT_WAKEUP_PREEMPT * 1 | + SCHED_FEAT_START_DEBIT * 1 | + SCHED_FEAT_TREE_AVG * 0 | + SCHED_FEAT_APPROX_AVG * 0; + +#define sched_feat(x) (sysctl_sched_features & SCHED_FEAT_##x) + +/* + * Number of tasks to iterate in a single balance run. + * Limited because this is done with IRQs disabled. + */ +const_debug unsigned int sysctl_sched_nr_migrate = 32; + +/* + * For kernel-internal use: high-speed (but slightly incorrect) per-cpu + * clock constructed from sched_clock(): + */ +unsigned long long cpu_clock(int cpu) +{ + unsigned long long now; + unsigned long flags; + struct rq *rq; + + local_irq_save(flags); + rq = cpu_rq(cpu); + update_rq_clock(rq); + now = rq->clock; + local_irq_restore(flags); + + return now; +} +EXPORT_SYMBOL_GPL(cpu_clock); + #ifndef prepare_arch_switch # define prepare_arch_switch(next) do { } while (0) #endif @@ -415,16 +642,13 @@ static inline void finish_lock_switch(st static inline struct rq *__task_rq_lock(struct task_struct *p) __acquires(rq->lock) { - struct rq *rq; - -repeat_lock_task: - rq = task_rq(p); - spin_lock(&rq->lock); - if (unlikely(rq != task_rq(p))) { + for (;;) { + struct rq *rq = task_rq(p); + spin_lock(&rq->lock); + if (likely(rq == task_rq(p))) + return rq; spin_unlock(&rq->lock); - goto repeat_lock_task; } - return rq; } /* @@ -437,18 +661,17 @@ static struct rq *task_rq_lock(struct ta { struct rq *rq; -repeat_lock_task: - local_irq_save(*flags); - rq = task_rq(p); - spin_lock(&rq->lock); - if (unlikely(rq != task_rq(p))) { + for (;;) { + local_irq_save(*flags); + rq = task_rq(p); + spin_lock(&rq->lock); + if (likely(rq == task_rq(p))) + return rq; spin_unlock_irqrestore(&rq->lock, *flags); - goto repeat_lock_task; } - return rq; } -static inline void __task_rq_unlock(struct rq *rq) +static void __task_rq_unlock(struct rq *rq) __releases(rq->lock) { spin_unlock(&rq->lock); @@ -460,138 +683,10 @@ static inline void task_rq_unlock(struct spin_unlock_irqrestore(&rq->lock, *flags); } -#ifdef CONFIG_SCHEDSTATS -/* - * bump this up when changing the output format or the meaning of an existing - * format, so that tools can adapt (or abort) - */ -#define SCHEDSTAT_VERSION 14 - -static int show_schedstat(struct seq_file *seq, void *v) -{ - int cpu; - - seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION); - seq_printf(seq, "timestamp %lu\n", jiffies); - for_each_online_cpu(cpu) { - struct rq *rq = cpu_rq(cpu); -#ifdef CONFIG_SMP - struct sched_domain *sd; - int dcnt = 0; -#endif - - /* runqueue-specific stats */ - seq_printf(seq, - "cpu%d %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu 
%lu", - cpu, rq->yld_both_empty, - rq->yld_act_empty, rq->yld_exp_empty, rq->yld_cnt, - rq->sched_switch, rq->sched_cnt, rq->sched_goidle, - rq->ttwu_cnt, rq->ttwu_local, - rq->rq_sched_info.cpu_time, - rq->rq_sched_info.run_delay, rq->rq_sched_info.pcnt); - - seq_printf(seq, "\n"); - -#ifdef CONFIG_SMP - /* domain-specific stats */ - preempt_disable(); - for_each_domain(cpu, sd) { - enum idle_type itype; - char mask_str[NR_CPUS]; - - cpumask_scnprintf(mask_str, NR_CPUS, sd->span); - seq_printf(seq, "domain%d %s", dcnt++, mask_str); - for (itype = SCHED_IDLE; itype < MAX_IDLE_TYPES; - itype++) { - seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu " - "%lu", - sd->lb_cnt[itype], - sd->lb_balanced[itype], - sd->lb_failed[itype], - sd->lb_imbalance[itype], - sd->lb_gained[itype], - sd->lb_hot_gained[itype], - sd->lb_nobusyq[itype], - sd->lb_nobusyg[itype]); - } - seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu %lu" - " %lu %lu %lu\n", - sd->alb_cnt, sd->alb_failed, sd->alb_pushed, - sd->sbe_cnt, sd->sbe_balanced, sd->sbe_pushed, - sd->sbf_cnt, sd->sbf_balanced, sd->sbf_pushed, - sd->ttwu_wake_remote, sd->ttwu_move_affine, - sd->ttwu_move_balance); - } - preempt_enable(); -#endif - } - return 0; -} - -static int schedstat_open(struct inode *inode, struct file *file) -{ - unsigned int size = PAGE_SIZE * (1 + num_online_cpus() / 32); - char *buf = kmalloc(size, GFP_KERNEL); - struct seq_file *m; - int res; - - if (!buf) - return -ENOMEM; - res = single_open(file, show_schedstat, NULL); - if (!res) { - m = file->private_data; - m->buf = buf; - m->size = size; - } else - kfree(buf); - return res; -} - -const struct file_operations proc_schedstat_operations = { - .open = schedstat_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - -/* - * Expects runqueue lock to be held for atomicity of update - */ -static inline void -rq_sched_info_arrive(struct rq *rq, unsigned long delta_jiffies) -{ - if (rq) { - rq->rq_sched_info.run_delay += delta_jiffies; - rq->rq_sched_info.pcnt++; - } -} - -/* - * Expects runqueue lock to be held for atomicity of update - */ -static inline void -rq_sched_info_depart(struct rq *rq, unsigned long delta_jiffies) -{ - if (rq) - rq->rq_sched_info.cpu_time += delta_jiffies; -} -# define schedstat_inc(rq, field) do { (rq)->field++; } while (0) -# define schedstat_add(rq, field, amt) do { (rq)->field += (amt); } while (0) -#else /* !CONFIG_SCHEDSTATS */ -static inline void -rq_sched_info_arrive(struct rq *rq, unsigned long delta_jiffies) -{} -static inline void -rq_sched_info_depart(struct rq *rq, unsigned long delta_jiffies) -{} -# define schedstat_inc(rq, field) do { } while (0) -# define schedstat_add(rq, field, amt) do { } while (0) -#endif - /* * this_rq_lock - lock this runqueue and disable interrupts. */ -static inline struct rq *this_rq_lock(void) +static struct rq *this_rq_lock(void) __acquires(rq->lock) { struct rq *rq; @@ -603,241 +698,294 @@ static inline struct rq *this_rq_lock(vo return rq; } -#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) /* - * Called when a process is dequeued from the active array and given - * the cpu. We should note that with the exception of interactive - * tasks, the expired queue will become the active queue after the active - * queue is empty, without explicitly dequeuing and requeuing tasks in the - * expired queue. (Interactive tasks may be requeued directly to the - * active queue, thus delaying tasks in the expired queue from running; - * see scheduler_tick()). 
- * - * This function is only called from sched_info_arrive(), rather than - * dequeue_task(). Even though a task may be queued and dequeued multiple - * times as it is shuffled about, we're really interested in knowing how - * long it was from the *first* time it was queued to the time that it - * finally hit a cpu. + * We are going deep-idle (irqs are disabled): */ -static inline void sched_info_dequeued(struct task_struct *t) +void sched_clock_idle_sleep_event(void) { - t->sched_info.last_queued = 0; + struct rq *rq = cpu_rq(smp_processor_id()); + + spin_lock(&rq->lock); + __update_rq_clock(rq); + spin_unlock(&rq->lock); + rq->clock_deep_idle_events++; } +EXPORT_SYMBOL_GPL(sched_clock_idle_sleep_event); /* - * Called when a task finally hits the cpu. We can now calculate how - * long it was waiting to run. We also note when it began so that we - * can keep stats on how long its timeslice is. + * We just idled delta nanoseconds (called with irqs disabled): */ -static void sched_info_arrive(struct task_struct *t) +void sched_clock_idle_wakeup_event(u64 delta_ns) { - unsigned long now = jiffies, delta_jiffies = 0; - - if (t->sched_info.last_queued) - delta_jiffies = now - t->sched_info.last_queued; - sched_info_dequeued(t); - t->sched_info.run_delay += delta_jiffies; - t->sched_info.last_arrival = now; - t->sched_info.pcnt++; + struct rq *rq = cpu_rq(smp_processor_id()); + u64 now = sched_clock(); - rq_sched_info_arrive(task_rq(t), delta_jiffies); + rq->idle_clock += delta_ns; + /* + * Override the previous timestamp and ignore all + * sched_clock() deltas that occured while we idled, + * and use the PM-provided delta_ns to advance the + * rq clock: + */ + spin_lock(&rq->lock); + rq->prev_clock_raw = now; + rq->clock += delta_ns; + spin_unlock(&rq->lock); } +EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); /* - * Called when a process is queued into either the active or expired - * array. The time is noted and later used to determine how long we - * had to wait for us to reach the cpu. Since the expired queue will - * become the active queue after active queue is empty, without dequeuing - * and requeuing any tasks, we are interested in queuing to either. It - * is unusual but not impossible for tasks to be dequeued and immediately - * requeued in the same or another array: this can happen in sched_yield(), - * set_user_nice(), and even load_balance() as it moves tasks from runqueue - * to runqueue. + * resched_task - mark a task 'to be rescheduled now'. * - * This function is only called from enqueue_task(), but also only updates - * the timestamp if it is already not set. It's assumed that - * sched_info_dequeued() will clear that stamp when appropriate. + * On UP this means the setting of the need_resched flag, on SMP it + * might also involve a cross-CPU call to trigger the scheduler on + * the target CPU. 
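(Illustrative aside, not part of the patch: sched_clock_idle_wakeup_event() above compensates for raw-clock jumps across deep idle by resetting prev_clock_raw to the current raw value and advancing the runqueue clock only by the PM-supplied delta_ns. The standalone C below models just that arithmetic with made-up numbers; locking and the idle_clock statistic are omitted.)

#include <stdio.h>

typedef unsigned long long u64;

/* Simplified model of the per-runqueue clock state. */
struct rq_clock {
	u64 clock;		/* scheduler-visible clock */
	u64 prev_clock_raw;	/* last raw sched_clock() value consumed */
};

/* Normal path: advance by however much the raw clock moved. */
static void update_clock(struct rq_clock *rq, u64 raw_now)
{
	rq->clock += raw_now - rq->prev_clock_raw;
	rq->prev_clock_raw = raw_now;
}

/* Wakeup-from-idle path: discard raw deltas, trust the PM-provided delta. */
static void idle_wakeup(struct rq_clock *rq, u64 raw_now, u64 delta_ns)
{
	rq->prev_clock_raw = raw_now;	/* swallow whatever happened while idle */
	rq->clock += delta_ns;		/* advance by the time actually slept */
}

int main(void)
{
	struct rq_clock rq = { .clock = 0, .prev_clock_raw = 1000 };

	update_clock(&rq, 2000);		/* ran for 1000 ns */
	/* raw clock may have stopped or jumped while idle: */
	idle_wakeup(&rq, 9999999, 5000);	/* PM says we slept 5000 ns */
	update_clock(&rq, 10000999);		/* ran for another 1000 ns */

	printf("rq clock = %llu ns (expected 7000)\n", rq.clock);
	return 0;
}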
*/ -static inline void sched_info_queued(struct task_struct *t) +#ifdef CONFIG_SMP + +#ifndef tsk_is_polling +#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG) +#endif + +static void resched_task(struct task_struct *p) { - if (unlikely(sched_info_on())) - if (!t->sched_info.last_queued) - t->sched_info.last_queued = jiffies; + int cpu; + + assert_spin_locked(&task_rq(p)->lock); + + if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED))) + return; + + set_tsk_thread_flag(p, TIF_NEED_RESCHED); + + cpu = task_cpu(p); + if (cpu == smp_processor_id()) + return; + + /* NEED_RESCHED must be visible before we test polling */ + smp_mb(); + if (!tsk_is_polling(p)) + smp_send_reschedule(cpu); } -/* - * Called when a process ceases being the active-running process, either - * voluntarily or involuntarily. Now we can calculate how long we ran. - */ -static inline void sched_info_depart(struct task_struct *t) +static void resched_cpu(int cpu) { - unsigned long delta_jiffies = jiffies - t->sched_info.last_arrival; + struct rq *rq = cpu_rq(cpu); + unsigned long flags; - t->sched_info.cpu_time += delta_jiffies; - rq_sched_info_depart(task_rq(t), delta_jiffies); + if (!spin_trylock_irqsave(&rq->lock, flags)) + return; + resched_task(cpu_curr(cpu)); + spin_unlock_irqrestore(&rq->lock, flags); +} +#else +static inline void resched_task(struct task_struct *p) +{ + assert_spin_locked(&task_rq(p)->lock); + set_tsk_need_resched(p); } +#endif + +#if BITS_PER_LONG == 32 +# define WMULT_CONST (~0UL) +#else +# define WMULT_CONST (1UL << 32) +#endif + +#define WMULT_SHIFT 32 /* - * Called when tasks are switched involuntarily due, typically, to expiring - * their time slice. (This may also be called when switching to or from - * the idle task.) We are only called when prev != next. + * Shift right and round: */ -static inline void -__sched_info_switch(struct task_struct *prev, struct task_struct *next) +#define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y)) + +static unsigned long +calc_delta_mine(unsigned long delta_exec, unsigned long weight, + struct load_weight *lw) { - struct rq *rq = task_rq(prev); + u64 tmp; + if (unlikely(!lw->inv_weight)) + lw->inv_weight = (WMULT_CONST - lw->weight/2) / lw->weight + 1; + + tmp = (u64)delta_exec * weight; /* - * prev now departs the cpu. It's not interesting to record - * stats about how efficient we were at scheduling the idle - * process, however. 
+ * Check whether we'd overflow the 64-bit multiplication: */ - if (prev != rq->idle) - sched_info_depart(prev); + if (unlikely(tmp > WMULT_CONST)) + tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight, + WMULT_SHIFT/2); + else + tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT); - if (next != rq->idle) - sched_info_arrive(next); + return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX); } -static inline void -sched_info_switch(struct task_struct *prev, struct task_struct *next) + +static inline unsigned long +calc_delta_fair(unsigned long delta_exec, struct load_weight *lw) { - if (unlikely(sched_info_on())) - __sched_info_switch(prev, next); + return calc_delta_mine(delta_exec, NICE_0_LOAD, lw); } -#else -#define sched_info_queued(t) do { } while (0) -#define sched_info_switch(t, next) do { } while (0) -#endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */ -/* - * Adding/removing a task to/from a priority array: - */ -static void dequeue_task(struct task_struct *p, struct prio_array *array) +static inline void update_load_add(struct load_weight *lw, unsigned long inc) { - array->nr_active--; - list_del(&p->run_list); - if (list_empty(array->queue + p->prio)) - __clear_bit(p->prio, array->bitmap); + lw->weight += inc; } -static void enqueue_task(struct task_struct *p, struct prio_array *array) +static inline void update_load_sub(struct load_weight *lw, unsigned long dec) { - sched_info_queued(p); - list_add_tail(&p->run_list, array->queue + p->prio); - __set_bit(p->prio, array->bitmap); - array->nr_active++; - p->array = array; + lw->weight -= dec; } /* - * Put task to the end of the run list without the overhead of dequeue - * followed by enqueue. + * To aid in avoiding the subversion of "niceness" due to uneven distribution + * of tasks with abnormal "nice" values across CPUs the contribution that + * each task makes to its run queue's load is weighted according to its + * scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a + * scaled version of the new time slice allocation that they receive on time + * slice expiry etc. */ -static void requeue_task(struct task_struct *p, struct prio_array *array) -{ - list_move_tail(&p->run_list, array->queue + p->prio); -} -static inline void -enqueue_task_head(struct task_struct *p, struct prio_array *array) -{ - list_add(&p->run_list, array->queue + p->prio); - __set_bit(p->prio, array->bitmap); - array->nr_active++; - p->array = array; -} +#define WEIGHT_IDLEPRIO 2 +#define WMULT_IDLEPRIO (1 << 31) /* - * __normal_prio - return the priority that is based on the static - * priority but is modified by bonuses/penalties. - * - * We scale the actual sleep average [0 .... MAX_SLEEP_AVG] - * into the -5 ... 0 ... +5 bonus/penalty range. - * - * We use 25% of the full 0...39 priority range so that: - * - * 1) nice +19 interactive tasks do not preempt nice 0 CPU hogs. - * 2) nice -20 CPU hogs do not get preempted by nice 0 tasks. + * Nice levels are multiplicative, with a gentle 10% change for every + * nice level changed. I.e. when a CPU-bound task goes from nice 0 to + * nice 1, it will get ~10% less CPU time than another CPU-bound task + * that remained on nice 0. + * + * The "10% effect" is relative and cumulative: from _any_ nice level, + * if you go up 1 level, it's -10% CPU usage, if you go down 1 level + * it's +10% CPU usage. (to achieve that we use a multiplier of 1.25. + * If a task goes up by ~10% and another task goes down by ~10% then + * the relative distance between them is ~25%.) 
+ */ +static const int prio_to_weight[40] = { + /* -20 */ 88761, 71755, 56483, 46273, 36291, + /* -15 */ 29154, 23254, 18705, 14949, 11916, + /* -10 */ 9548, 7620, 6100, 4904, 3906, + /* -5 */ 3121, 2501, 1991, 1586, 1277, + /* 0 */ 1024, 820, 655, 526, 423, + /* 5 */ 335, 272, 215, 172, 137, + /* 10 */ 110, 87, 70, 56, 45, + /* 15 */ 36, 29, 23, 18, 15, +}; + +/* + * Inverse (2^32/x) values of the prio_to_weight[] array, precalculated. * - * Both properties are important to certain workloads. + * In cases where the weight does not change often, we can use the + * precalculated inverse to speed up arithmetics by turning divisions + * into multiplications: + */ +static const u32 prio_to_wmult[40] = { + /* -20 */ 48388, 59856, 76040, 92818, 118348, + /* -15 */ 147320, 184698, 229616, 287308, 360437, + /* -10 */ 449829, 563644, 704093, 875809, 1099582, + /* -5 */ 1376151, 1717300, 2157191, 2708050, 3363326, + /* 0 */ 4194304, 5237765, 6557202, 8165337, 10153587, + /* 5 */ 12820798, 15790321, 19976592, 24970740, 31350126, + /* 10 */ 39045157, 49367440, 61356676, 76695844, 95443717, + /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153, +}; + +static void activate_task(struct rq *rq, struct task_struct *p, int wakeup); + +/* + * runqueue iterator, to support SMP load-balancing between different + * scheduling classes, without having to expose their internal data + * structures to the load-balancing proper: */ +struct rq_iterator { + void *arg; + struct task_struct *(*start)(void *); + struct task_struct *(*next)(void *); +}; -static inline int __normal_prio(struct task_struct *p) -{ - int bonus, prio; +#ifdef CONFIG_SMP +static unsigned long +balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, + unsigned long max_load_move, struct sched_domain *sd, + enum cpu_idle_type idle, int *all_pinned, + int *this_best_prio, struct rq_iterator *iterator); - bonus = CURRENT_BONUS(p) - MAX_BONUS / 2; +static int +iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, + struct sched_domain *sd, enum cpu_idle_type idle, + struct rq_iterator *iterator); +#endif - prio = p->static_prio - bonus; - if (prio < MAX_RT_PRIO) - prio = MAX_RT_PRIO; - if (prio > MAX_PRIO-1) - prio = MAX_PRIO-1; - return prio; +static inline void inc_load(struct rq *rq, unsigned long load) +{ + update_load_add(&rq->load, load); } -/* - * To aid in avoiding the subversion of "niceness" due to uneven distribution - * of tasks with abnormal "nice" values across CPUs the contribution that - * each task makes to its run queue's load is weighted according to its - * scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a - * scaled version of the new time slice allocation that they receive on time - * slice expiry etc. 
- */ +static inline void dec_load(struct rq *rq, unsigned long load) +{ + update_load_sub(&rq->load, load); +} -/* - * Assume: static_prio_timeslice(NICE_TO_PRIO(0)) == DEF_TIMESLICE - * If static_prio_timeslice() is ever changed to break this assumption then - * this code will need modification - */ -#define TIME_SLICE_NICE_ZERO DEF_TIMESLICE -#define LOAD_WEIGHT(lp) \ - (((lp) * SCHED_LOAD_SCALE) / TIME_SLICE_NICE_ZERO) -#define PRIO_TO_LOAD_WEIGHT(prio) \ - LOAD_WEIGHT(static_prio_timeslice(prio)) -#define RTPRIO_TO_LOAD_WEIGHT(rp) \ - (PRIO_TO_LOAD_WEIGHT(MAX_RT_PRIO) + LOAD_WEIGHT(rp)) +#include "sched_stats.h" +#include "sched_idletask.c" +#include "sched_fair.c" +#include "sched_rt.c" +#ifdef CONFIG_SCHED_DEBUG +# include "sched_debug.c" +#endif -static void set_load_weight(struct task_struct *p) +#define sched_class_highest (&rt_sched_class) + +static void inc_nr_running(struct task_struct *p, struct rq *rq) { - if (has_rt_policy(p)) { -#ifdef CONFIG_SMP - if (p == task_rq(p)->migration_thread) - /* - * The migration thread does the actual balancing. - * Giving its load any weight will skew balancing - * adversely. - */ - p->load_weight = 0; - else -#endif - p->load_weight = RTPRIO_TO_LOAD_WEIGHT(p->rt_priority); - } else - p->load_weight = PRIO_TO_LOAD_WEIGHT(p->static_prio); + rq->nr_running++; } -static inline void -inc_raw_weighted_load(struct rq *rq, const struct task_struct *p) +static void dec_nr_running(struct task_struct *p, struct rq *rq) { - rq->raw_weighted_load += p->load_weight; + rq->nr_running--; } -static inline void -dec_raw_weighted_load(struct rq *rq, const struct task_struct *p) +static void set_load_weight(struct task_struct *p) +{ + if (task_has_rt_policy(p)) { + p->se.load.weight = prio_to_weight[0] * 2; + p->se.load.inv_weight = prio_to_wmult[0] >> 1; + return; + } + + /* + * SCHED_IDLE tasks get minimal weight: + */ + if (p->policy == SCHED_IDLE) { + p->se.load.weight = WEIGHT_IDLEPRIO; + p->se.load.inv_weight = WMULT_IDLEPRIO; + return; + } + + p->se.load.weight = prio_to_weight[p->static_prio - MAX_RT_PRIO]; + p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO]; +} + +static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup) { - rq->raw_weighted_load -= p->load_weight; + sched_info_queued(p); + p->sched_class->enqueue_task(rq, p, wakeup); + p->se.on_rq = 1; } -static inline void inc_nr_running(struct task_struct *p, struct rq *rq) +static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep) { - rq->nr_running++; - inc_raw_weighted_load(rq, p); + p->sched_class->dequeue_task(rq, p, sleep); + p->se.on_rq = 0; } -static inline void dec_nr_running(struct task_struct *p, struct rq *rq) +/* + * __normal_prio - return the priority that is based on the static prio + */ +static inline int __normal_prio(struct task_struct *p) { - rq->nr_running--; - dec_raw_weighted_load(rq, p); + return p->static_prio; } /* @@ -851,7 +999,7 @@ static inline int normal_prio(struct tas { int prio; - if (has_rt_policy(p)) + if (task_has_rt_policy(p)) prio = MAX_RT_PRIO-1 - p->rt_priority; else prio = __normal_prio(p); @@ -879,238 +1027,111 @@ static int effective_prio(struct task_st } /* - * __activate_task - move a task to the runqueue. + * activate_task - move a task to the runqueue. 
*/ -static void __activate_task(struct task_struct *p, struct rq *rq) +static void activate_task(struct rq *rq, struct task_struct *p, int wakeup) { - struct prio_array *target = rq->active; + if (p->state == TASK_UNINTERRUPTIBLE) + rq->nr_uninterruptible--; - if (batch_task(p)) - target = rq->expired; - enqueue_task(p, target); + enqueue_task(rq, p, wakeup); inc_nr_running(p, rq); } /* - * __activate_idle_task - move idle task to the _front_ of runqueue. + * deactivate_task - remove a task from the runqueue. */ -static inline void __activate_idle_task(struct task_struct *p, struct rq *rq) +static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep) { - enqueue_task_head(p, rq->active); - inc_nr_running(p, rq); + if (p->state == TASK_UNINTERRUPTIBLE) + rq->nr_uninterruptible++; + + dequeue_task(rq, p, sleep); + dec_nr_running(p, rq); } -/* - * Recalculate p->normal_prio and p->prio after having slept, - * updating the sleep-average too: +/** + * task_curr - is this task currently executing on a CPU? + * @p: the task in question. */ -static int recalc_task_prio(struct task_struct *p, unsigned long long now) +inline int task_curr(const struct task_struct *p) { - /* Caller must always ensure 'now >= p->timestamp' */ - unsigned long sleep_time = now - p->timestamp; - - if (batch_task(p)) - sleep_time = 0; - - if (likely(sleep_time > 0)) { - /* - * This ceiling is set to the lowest priority that would allow - * a task to be reinserted into the active array on timeslice - * completion. - */ - unsigned long ceiling = INTERACTIVE_SLEEP(p); - - if (p->mm && sleep_time > ceiling && p->sleep_avg < ceiling) { - /* - * Prevents user tasks from achieving best priority - * with one single large enough sleep. - */ - p->sleep_avg = ceiling; - /* - * Using INTERACTIVE_SLEEP() as a ceiling places a - * nice(0) task 1ms sleep away from promotion, and - * gives it 700ms to round-robin with no chance of - * being demoted. This is more than generous, so - * mark this sleep as non-interactive to prevent the - * on-runqueue bonus logic from intervening should - * this task not receive cpu immediately. - */ - p->sleep_type = SLEEP_NONINTERACTIVE; - } else { - /* - * Tasks waking from uninterruptible sleep are - * limited in their sleep_avg rise as they - * are likely to be waiting on I/O - */ - if (p->sleep_type == SLEEP_NONINTERACTIVE && p->mm) { - if (p->sleep_avg >= ceiling) - sleep_time = 0; - else if (p->sleep_avg + sleep_time >= - ceiling) { - p->sleep_avg = ceiling; - sleep_time = 0; - } - } - - /* - * This code gives a bonus to interactive tasks. - * - * The boost works by updating the 'average sleep time' - * value here, based on ->timestamp. The more time a - * task spends sleeping, the higher the average gets - - * and the higher the priority boost gets as well. - */ - p->sleep_avg += sleep_time; - - } - if (p->sleep_avg > NS_MAX_SLEEP_AVG) - p->sleep_avg = NS_MAX_SLEEP_AVG; - } - - return effective_prio(p); + return cpu_curr(task_cpu(p)) == p; } -/* - * activate_task - move a task to the runqueue and do priority recalculation - * - * Update all the scheduling statistics stuff. (sleep average - * calculation, priority modifiers, etc.) 
- */ -static void activate_task(struct task_struct *p, struct rq *rq, int local) +/* Used instead of source_load when we know the type == 0 */ +unsigned long weighted_cpuload(const int cpu) { - unsigned long long now; - - if (rt_task(p)) - goto out; + return cpu_rq(cpu)->load.weight; +} - now = sched_clock(); +static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) +{ + set_task_cfs_rq(p, cpu); #ifdef CONFIG_SMP - if (!local) { - /* Compensate for drifting sched_clock */ - struct rq *this_rq = this_rq(); - now = (now - this_rq->most_recent_timestamp) - + rq->most_recent_timestamp; - } -#endif - - /* - * Sleep time is in units of nanosecs, so shift by 20 to get a - * milliseconds-range estimation of the amount of time that the task - * spent sleeping: - */ - if (unlikely(prof_on == SLEEP_PROFILING)) { - if (p->state == TASK_UNINTERRUPTIBLE) - profile_hits(SLEEP_PROFILING, (void *)get_wchan(p), - (now - p->timestamp) >> 20); - } - - p->prio = recalc_task_prio(p, now); - /* - * This checks to make sure it's not an uninterruptible task - * that is now waking up. + * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be + * successfully executed on another CPU. We must ensure that updates of + * per-task data have been completed by this moment. */ - if (p->sleep_type == SLEEP_NORMAL) { - /* - * Tasks which were woken up by interrupts (ie. hw events) - * are most likely of interactive nature. So we give them - * the credit of extending their sleep time to the period - * of time they spend on the runqueue, waiting for execution - * on a CPU, first time around: - */ - if (in_interrupt()) - p->sleep_type = SLEEP_INTERRUPTED; - else { - /* - * Normal first-time wakeups get a credit too for - * on-runqueue time, but it will be weighted down: - */ - p->sleep_type = SLEEP_INTERACTIVE; - } - } - p->timestamp = now; -out: - __activate_task(p, rq); + smp_wmb(); + task_thread_info(p)->cpu = cpu; +#endif } -/* - * deactivate_task - remove a task from the runqueue. - */ -static void deactivate_task(struct task_struct *p, struct rq *rq) -{ - dec_nr_running(p, rq); - dequeue_task(p, p->array); - p->array = NULL; -} +#ifdef CONFIG_SMP /* - * resched_task - mark a task 'to be rescheduled now'. - * - * On UP this means the setting of the need_resched flag, on SMP it - * might also involve a cross-CPU call to trigger the scheduler on - * the target CPU.
+ * Is this task likely cache-hot: */ -#ifdef CONFIG_SMP - -#ifndef tsk_is_polling -#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG) -#endif - -static void resched_task(struct task_struct *p) +static inline int +task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) { - int cpu; - - assert_spin_locked(&task_rq(p)->lock); + s64 delta; - if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED))) - return; + if (p->sched_class != &fair_sched_class) + return 0; - set_tsk_thread_flag(p, TIF_NEED_RESCHED); + if (sysctl_sched_migration_cost == -1) + return 1; + if (sysctl_sched_migration_cost == 0) + return 0; - cpu = task_cpu(p); - if (cpu == smp_processor_id()) - return; + delta = now - p->se.exec_start; - /* NEED_RESCHED must be visible before we test polling */ - smp_mb(); - if (!tsk_is_polling(p)) - smp_send_reschedule(cpu); + return delta < (s64)sysctl_sched_migration_cost; } -static void resched_cpu(int cpu) -{ - struct rq *rq = cpu_rq(cpu); - unsigned long flags; - if (!spin_trylock_irqsave(&rq->lock, flags)) - return; - resched_task(cpu_curr(cpu)); - spin_unlock_irqrestore(&rq->lock, flags); -} -#else -static inline void resched_task(struct task_struct *p) +void set_task_cpu(struct task_struct *p, unsigned int new_cpu) { - assert_spin_locked(&task_rq(p)->lock); - set_tsk_need_resched(p); -} -#endif + int old_cpu = task_cpu(p); + struct rq *old_rq = cpu_rq(old_cpu), *new_rq = cpu_rq(new_cpu); + struct cfs_rq *old_cfsrq = task_cfs_rq(p), + *new_cfsrq = cpu_cfs_rq(old_cfsrq, new_cpu); + u64 clock_offset; -/** - * task_curr - is this task currently executing on a CPU? - * @p: the task in question. - */ -inline int task_curr(const struct task_struct *p) -{ - return cpu_curr(task_cpu(p)) == p; -} + clock_offset = old_rq->clock - new_rq->clock; -/* Used instead of source_load when we know the type == 0 */ -unsigned long weighted_cpuload(const int cpu) -{ - return cpu_rq(cpu)->raw_weighted_load; +#ifdef CONFIG_SCHEDSTATS + if (p->se.wait_start) + p->se.wait_start -= clock_offset; + if (p->se.sleep_start) + p->se.sleep_start -= clock_offset; + if (p->se.block_start) + p->se.block_start -= clock_offset; + if (old_cpu != new_cpu) { + schedstat_inc(p, se.nr_migrations); + if (task_hot(p, old_rq->clock, NULL)) + schedstat_inc(p, se.nr_forced2_migrations); + } +#endif + p->se.vruntime -= old_cfsrq->min_vruntime - + new_cfsrq->min_vruntime; + + __set_task_cpu(p, new_cpu); } -#ifdef CONFIG_SMP struct migration_req { struct list_head list; @@ -1133,7 +1154,7 @@ migrate_task(struct task_struct *p, int * If the task is not on a runqueue (and not running), then * it is sufficient to simply update the task's cpu field. */ - if (!p->array && !task_running(rq, p)) { + if (!p->se.on_rq && !task_running(rq, p)) { set_task_cpu(p, dest_cpu); return 0; } @@ -1158,73 +1179,74 @@ migrate_task(struct task_struct *p, int void wait_task_inactive(struct task_struct *p) { unsigned long flags; + int running, on_rq; struct rq *rq; - struct prio_array *array; - int running; -repeat: - /* - * We do the initial early heuristics without holding - * any task-queue locks at all. We'll only try to get - * the runqueue lock when things look like they will - * work out! - */ - rq = task_rq(p); + for (;;) { + /* + * We do the initial early heuristics without holding + * any task-queue locks at all. We'll only try to get + * the runqueue lock when things look like they will + * work out! 
+ */ + rq = task_rq(p); - /* - * If the task is actively running on another CPU - * still, just relax and busy-wait without holding - * any locks. - * - * NOTE! Since we don't hold any locks, it's not - * even sure that "rq" stays as the right runqueue! - * But we don't care, since "task_running()" will - * return false if the runqueue has changed and p - * is actually now running somewhere else! - */ - while (task_running(rq, p)) - cpu_relax(); + /* + * If the task is actively running on another CPU + * still, just relax and busy-wait without holding + * any locks. + * + * NOTE! Since we don't hold any locks, it's not + * even sure that "rq" stays as the right runqueue! + * But we don't care, since "task_running()" will + * return false if the runqueue has changed and p + * is actually now running somewhere else! + */ + while (task_running(rq, p)) + cpu_relax(); - /* - * Ok, time to look more closely! We need the rq - * lock now, to be *sure*. If we're wrong, we'll - * just go back and repeat. - */ - rq = task_rq_lock(p, &flags); - running = task_running(rq, p); - array = p->array; - task_rq_unlock(rq, &flags); + /* + * Ok, time to look more closely! We need the rq + * lock now, to be *sure*. If we're wrong, we'll + * just go back and repeat. + */ + rq = task_rq_lock(p, &flags); + running = task_running(rq, p); + on_rq = p->se.on_rq; + task_rq_unlock(rq, &flags); - /* - * Was it really running after all now that we - * checked with the proper locks actually held? - * - * Oops. Go back and try again.. - */ - if (unlikely(running)) { - cpu_relax(); - goto repeat; - } + /* + * Was it really running after all now that we + * checked with the proper locks actually held? + * + * Oops. Go back and try again.. + */ + if (unlikely(running)) { + cpu_relax(); + continue; + } - /* - * It's not enough that it's not actively running, - * it must be off the runqueue _entirely_, and not - * preempted! - * - * So if it wa still runnable (but just not actively - * running right now), it's preempted, and we should - * yield - it could be a while. - */ - if (unlikely(array)) { - yield(); - goto repeat; - } + /* + * It's not enough that it's not actively running, + * it must be off the runqueue _entirely_, and not + * preempted! + * + * So if it wa still runnable (but just not actively + * running right now), it's preempted, and we should + * yield - it could be a while. + */ + if (unlikely(on_rq)) { + schedule_timeout_uninterruptible(1); + continue; + } - /* - * Ahh, all good. It wasn't running, and it wasn't - * runnable, which means that it will never become - * running in the future either. We're all done! - */ + /* + * Ahh, all good. It wasn't running, and it wasn't + * runnable, which means that it will never become + * running in the future either. We're all done! + */ + break; + } } /*** @@ -1258,28 +1280,30 @@ void kick_process(struct task_struct *p) * We want to under-estimate the load of migration sources, to * balance conservatively. */ -static inline unsigned long source_load(int cpu, int type) +static unsigned long source_load(int cpu, int type) { struct rq *rq = cpu_rq(cpu); + unsigned long total = weighted_cpuload(cpu); if (type == 0) - return rq->raw_weighted_load; + return total; - return min(rq->cpu_load[type-1], rq->raw_weighted_load); + return min(rq->cpu_load[type-1], total); } /* * Return a high guess at the load of a migration-target cpu weighted * according to the scheduling class and "nice" value. 
*/ -static inline unsigned long target_load(int cpu, int type) +static unsigned long target_load(int cpu, int type) { struct rq *rq = cpu_rq(cpu); + unsigned long total = weighted_cpuload(cpu); if (type == 0) - return rq->raw_weighted_load; + return total; - return max(rq->cpu_load[type-1], rq->raw_weighted_load); + return max(rq->cpu_load[type-1], total); } /* @@ -1288,9 +1312,10 @@ static inline unsigned long target_load( static inline unsigned long cpu_avg_load_per_task(int cpu) { struct rq *rq = cpu_rq(cpu); + unsigned long total = weighted_cpuload(cpu); unsigned long n = rq->nr_running; - return n ? rq->raw_weighted_load / n : SCHED_LOAD_SCALE; + return n ? total / n : SCHED_LOAD_SCALE; } /* @@ -1312,7 +1337,7 @@ find_idlest_group(struct sched_domain *s /* Skip over this group if it has no CPUs allowed */ if (!cpus_intersects(group->cpumask, p->cpus_allowed)) - goto nextgroup; + continue; local_group = cpu_isset(this_cpu, group->cpumask); @@ -1340,9 +1365,7 @@ find_idlest_group(struct sched_domain *s min_load = avg_load; idlest = group; } -nextgroup: - group = group->next; - } while (group != sd->groups); + } while (group = group->next, group != sd->groups); if (!idlest || 100*this_load < imbalance*min_load) return NULL; @@ -1392,9 +1415,9 @@ static int sched_balance_self(int cpu, i struct sched_domain *tmp, *sd = NULL; for_each_domain(cpu, tmp) { - /* - * If power savings logic is enabled for a domain, stop there. - */ + /* + * If power savings logic is enabled for a domain, stop there. + */ if (tmp->flags & SD_POWERSAVINGS_BALANCE) break; if (tmp->flags & flag) @@ -1474,12 +1497,17 @@ static int wake_idle(int cpu, struct tas if (sd->flags & SD_WAKE_IDLE) { cpus_and(tmp, sd->span, p->cpus_allowed); for_each_cpu_mask(i, tmp) { - if (idle_cpu(i)) + if (idle_cpu(i)) { + if (i != task_cpu(p)) { + schedstat_inc(p, + se.nr_wakeups_idle); + } return i; + } } - } - else + } else { break; + } } return cpu; } @@ -1506,7 +1534,7 @@ static inline int wake_idle(int cpu, str */ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) { - int cpu, this_cpu, success = 0; + int cpu, orig_cpu, this_cpu, success = 0; unsigned long flags; long old_state; struct rq *rq; @@ -1521,10 +1549,11 @@ static int try_to_wake_up(struct task_st if (!(old_state & state)) goto out; - if (p->array) + if (p->se.on_rq) goto out_running; cpu = task_cpu(p); + orig_cpu = cpu; this_cpu = smp_processor_id(); #ifdef CONFIG_SMP @@ -1533,7 +1562,7 @@ static int try_to_wake_up(struct task_st new_cpu = cpu; - schedstat_inc(rq, ttwu_cnt); + schedstat_inc(rq, ttwu_count); if (cpu == this_cpu) { schedstat_inc(rq, ttwu_local); goto out_set_cpu; @@ -1568,7 +1597,14 @@ static int try_to_wake_up(struct task_st unsigned long tl = this_load; unsigned long tl_per_task; - tl_per_task = cpu_avg_load_per_task(this_cpu); + /* + * Attract cache-cold tasks on sync wakeups: + */ + if (sync && !task_hot(p, rq->clock, this_sd)) + goto out_set_cpu; + + schedstat_inc(p, se.nr_wakeups_affine_attempts); + tl_per_task = cpu_avg_load_per_task(this_cpu); /* * If sync wakeup then subtract the (maximum possible) @@ -1576,17 +1612,18 @@ static int try_to_wake_up(struct task_st * of the current CPU: */ if (sync) - tl -= current->load_weight; + tl -= current->se.load.weight; if ((tl <= load && tl + target_load(cpu, idx) <= tl_per_task) || - 100*(tl + p->load_weight) <= imbalance*load) { + 100*(tl + p->se.load.weight) <= imbalance*load) { /* * This domain has SD_WAKE_AFFINE and * p is cache cold in this domain, and * there is no bad 
imbalance. */ schedstat_inc(this_sd, ttwu_move_affine); + schedstat_inc(p, se.nr_wakeups_affine); goto out_set_cpu; } } @@ -1598,6 +1635,7 @@ static int try_to_wake_up(struct task_st if (this_sd->flags & SD_WAKE_BALANCE) { if (imbalance*this_load <= 100*load) { schedstat_inc(this_sd, ttwu_move_balance); + schedstat_inc(p, se.nr_wakeups_passive); goto out_set_cpu; } } @@ -1614,7 +1652,7 @@ out_set_cpu: old_state = p->state; if (!(old_state & state)) goto out; - if (p->array) + if (p->se.on_rq) goto out_running; this_cpu = smp_processor_id(); @@ -1623,37 +1661,18 @@ out_set_cpu: out_activate: #endif /* CONFIG_SMP */ - if (old_state == TASK_UNINTERRUPTIBLE) { - rq->nr_uninterruptible--; - /* - * Tasks on involuntary sleep don't earn - * sleep_avg beyond just interactive state. - */ - p->sleep_type = SLEEP_NONINTERACTIVE; - } else - - /* - * Tasks that have marked their sleep as noninteractive get - * woken up with their sleep average not weighted in an - * interactive way. - */ - if (old_state & TASK_NONINTERACTIVE) - p->sleep_type = SLEEP_NONINTERACTIVE; - - - activate_task(p, rq, cpu == this_cpu); - /* - * Sync wakeups (i.e. those types of wakeups where the waker - * has indicated that it will leave the CPU in short order) - * don't trigger a preemption, if the woken up task will run on - * this cpu. (in this case the 'I will reschedule' promise of - * the waker guarantees that the freshly woken up task is going - * to be considered on this CPU.) - */ - if (!sync || cpu != this_cpu) { - if (TASK_PREEMPTS_CURR(p, rq)) - resched_task(rq->curr); - } + schedstat_inc(p, se.nr_wakeups); + if (sync) + schedstat_inc(p, se.nr_wakeups_sync); + if (orig_cpu != cpu) + schedstat_inc(p, se.nr_wakeups_migrate); + if (cpu == this_cpu) + schedstat_inc(p, se.nr_wakeups_local); + else + schedstat_inc(p, se.nr_wakeups_remote); + update_rq_clock(rq); + activate_task(rq, p, 1); + check_preempt_curr(rq, p); success = 1; out_running: @@ -1676,19 +1695,36 @@ int fastcall wake_up_state(struct task_s return try_to_wake_up(p, state, 0); } -static void task_running_tick(struct rq *rq, struct task_struct *p); /* * Perform scheduler related setup for a newly forked process p. * p is forked by current. + * + * __sched_fork() is basic setup used by init_idle() too: */ -void fastcall sched_fork(struct task_struct *p, int clone_flags) +static void __sched_fork(struct task_struct *p) { - int cpu = get_cpu(); + p->se.exec_start = 0; + p->se.sum_exec_runtime = 0; + p->se.prev_sum_exec_runtime = 0; -#ifdef CONFIG_SMP - cpu = sched_balance_self(cpu, SD_BALANCE_FORK); +#ifdef CONFIG_SCHEDSTATS + p->se.wait_start = 0; + p->se.sum_sleep_runtime = 0; + p->se.sleep_start = 0; + p->se.block_start = 0; + p->se.sleep_max = 0; + p->se.block_max = 0; + p->se.exec_max = 0; + p->se.slice_max = 0; + p->se.wait_max = 0; +#endif + + INIT_LIST_HEAD(&p->run_list); + p->se.on_rq = 0; + +#ifdef CONFIG_PREEMPT_NOTIFIERS + INIT_HLIST_HEAD(&p->preempt_notifiers); #endif - set_task_cpu(p, cpu); /* * We mark the process as running here, but have not actually @@ -1697,16 +1733,31 @@ void fastcall sched_fork(struct task_str * event cannot wake it up and insert it on the runqueue either. 
*/ p->state = TASK_RUNNING; +} + +/* + * fork()/clone()-time setup: + */ +void sched_fork(struct task_struct *p, int clone_flags) +{ + int cpu = get_cpu(); + + __sched_fork(p); + +#ifdef CONFIG_SMP + cpu = sched_balance_self(cpu, SD_BALANCE_FORK); +#endif + set_task_cpu(p, cpu); /* * Make sure we do not leak PI boosting priority to the child: */ p->prio = current->normal_prio; + if (!rt_prio(p->prio)) + p->sched_class = &fair_sched_class; - INIT_LIST_HEAD(&p->run_list); - p->array = NULL; #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) - if (unlikely(sched_info_on())) + if (likely(sched_info_on())) memset(&p->sched_info, 0, sizeof(p->sched_info)); #endif #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) @@ -1716,30 +1767,6 @@ void fastcall sched_fork(struct task_str /* Want to start with kernel preemption disabled. */ task_thread_info(p)->preempt_count = 1; #endif - /* - * Share the timeslice between parent and child, thus the - * total amount of pending timeslices in the system doesn't change, - * resulting in more scheduling fairness. - */ - local_irq_disable(); - p->time_slice = (current->time_slice + 1) >> 1; - /* - * The remainder of the first timeslice might be recovered by - * the parent if the child exits early enough. - */ - p->first_time_slice = 1; - current->time_slice >>= 1; - p->timestamp = sched_clock(); - if (unlikely(!current->time_slice)) { - /* - * This case is rare, it happens when the parent has only - * a single jiffy left from its timeslice. Taking the - * runqueue lock is not a problem. - */ - current->time_slice = 1; - task_running_tick(cpu_rq(cpu), current); - } - local_irq_enable(); put_cpu(); } @@ -1752,113 +1779,91 @@ void fastcall sched_fork(struct task_str */ void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags) { - struct rq *rq, *this_rq; unsigned long flags; - int this_cpu, cpu; + struct rq *rq; rq = task_rq_lock(p, &flags); BUG_ON(p->state != TASK_RUNNING); - this_cpu = smp_processor_id(); - cpu = task_cpu(p); - - /* - * We decrease the sleep average of forking parents - * and children as well, to keep max-interactive tasks - * from forking tasks that are max-interactive. The parent - * (current) is done further down, under its lock. - */ - p->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(p) * - CHILD_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS); + update_rq_clock(rq); p->prio = effective_prio(p); - if (likely(cpu == this_cpu)) { - if (!(clone_flags & CLONE_VM)) { - /* - * The VM isn't cloned, so we're in a good position to - * do child-runs-first in anticipation of an exec. This - * usually avoids a lot of COW overhead. - */ - if (unlikely(!current->array)) - __activate_task(p, rq); - else { - p->prio = current->prio; - p->normal_prio = current->normal_prio; - list_add_tail(&p->run_list, ¤t->run_list); - p->array = current->array; - p->array->nr_active++; - inc_nr_running(p, rq); - } - set_need_resched(); - } else - /* Run child last */ - __activate_task(p, rq); - /* - * We skip the following code due to cpu == this_cpu - * - * task_rq_unlock(rq, &flags); - * this_rq = task_rq_lock(current, &flags); - */ - this_rq = rq; + if (!p->sched_class->task_new || !current->se.on_rq) { + activate_task(rq, p, 0); } else { - this_rq = cpu_rq(this_cpu); - - /* - * Not the local CPU - must adjust timestamp. This should - * get optimised away in the !CONFIG_SMP case. 
- */ - p->timestamp = (p->timestamp - this_rq->most_recent_timestamp) - + rq->most_recent_timestamp; - __activate_task(p, rq); - if (TASK_PREEMPTS_CURR(p, rq)) - resched_task(rq->curr); - /* - * Parent and child are on different CPUs, now get the - * parent runqueue to update the parent's ->sleep_avg: + * Let the scheduling class do new task startup + * management (if any): */ - task_rq_unlock(rq, &flags); - this_rq = task_rq_lock(current, &flags); + p->sched_class->task_new(rq, p); + inc_nr_running(p, rq); } - current->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(current) * - PARENT_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS); - task_rq_unlock(this_rq, &flags); + check_preempt_curr(rq, p); + task_rq_unlock(rq, &flags); } -/* - * Potentially available exiting-child timeslices are - * retrieved here - this way the parent does not get - * penalized for creating too many threads. +#ifdef CONFIG_PREEMPT_NOTIFIERS + +/** + * preempt_notifier_register - tell me when current is being preempted & rescheduled + * @notifier: notifier struct to register + */ +void preempt_notifier_register(struct preempt_notifier *notifier) +{ + hlist_add_head(&notifier->link, &current->preempt_notifiers); +} +EXPORT_SYMBOL_GPL(preempt_notifier_register); + +/** + * preempt_notifier_unregister - no longer interested in preemption notifications + * @notifier: notifier struct to unregister * - * (this cannot be used to 'generate' timeslices - * artificially, because any timeslice recovered here - * was given away by the parent in the first place.) + * This is safe to call from within a preemption notifier. */ -void fastcall sched_exit(struct task_struct *p) +void preempt_notifier_unregister(struct preempt_notifier *notifier) { - unsigned long flags; - struct rq *rq; + hlist_del(&notifier->link); +} +EXPORT_SYMBOL_GPL(preempt_notifier_unregister); - /* - * If the child was a (relative-) CPU hog then decrease - * the sleep_avg of the parent as well. - */ - rq = task_rq_lock(p->parent, &flags); - if (p->first_time_slice && task_cpu(p) == task_cpu(p->parent)) { - p->parent->time_slice += p->time_slice; - if (unlikely(p->parent->time_slice > task_timeslice(p))) - p->parent->time_slice = task_timeslice(p); - } - if (p->sleep_avg < p->parent->sleep_avg) - p->parent->sleep_avg = p->parent->sleep_avg / - (EXIT_WEIGHT + 1) * EXIT_WEIGHT + p->sleep_avg / - (EXIT_WEIGHT + 1); - task_rq_unlock(rq, &flags); +static void fire_sched_in_preempt_notifiers(struct task_struct *curr) +{ + struct preempt_notifier *notifier; + struct hlist_node *node; + + hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link) + notifier->ops->sched_in(notifier, raw_smp_processor_id()); +} + +static void +fire_sched_out_preempt_notifiers(struct task_struct *curr, + struct task_struct *next) +{ + struct preempt_notifier *notifier; + struct hlist_node *node; + + hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link) + notifier->ops->sched_out(notifier, next); +} + +#else + +static void fire_sched_in_preempt_notifiers(struct task_struct *curr) +{ +} + +static void +fire_sched_out_preempt_notifiers(struct task_struct *curr, + struct task_struct *next) +{ } +#endif + /** * prepare_task_switch - prepare to switch tasks * @rq: the runqueue preparing to switch + * @prev: the current task that is being switched out * @next: the task we are going to switch to. * * This is called with the rq lock held and interrupts off.
It must @@ -1868,8 +1873,11 @@ void fastcall sched_exit(struct task_str * prepare_task_switch sets up locking and calls architecture specific * hooks. */ -static inline void prepare_task_switch(struct rq *rq, struct task_struct *next) +static inline void +prepare_task_switch(struct rq *rq, struct task_struct *prev, + struct task_struct *next) { + fire_sched_out_preempt_notifiers(prev, next); prepare_lock_switch(rq, next); prepare_arch_switch(next); } @@ -1889,7 +1897,7 @@ static inline void prepare_task_switch(s * with the lock held can cause deadlocks; see schedule() for * details.) */ -static inline void finish_task_switch(struct rq *rq, struct task_struct *prev) +static void finish_task_switch(struct rq *rq, struct task_struct *prev) __releases(rq->lock) { struct mm_struct *mm = rq->prev_mm; @@ -1911,13 +1919,14 @@ static inline void finish_task_switch(st prev_state = prev->state; finish_arch_switch(prev); finish_lock_switch(rq, prev); + fire_sched_in_preempt_notifiers(current); if (mm) mmdrop(mm); if (unlikely(prev_state == TASK_DEAD)) { /* * Remove function-return probe instances associated with this * task and put them back on the free list. - */ + */ kprobe_flush_task(prev); put_task_struct(prev); } @@ -1938,20 +1947,22 @@ asmlinkage void schedule_tail(struct tas preempt_enable(); #endif if (current->set_child_tid) - put_user(current->pid, current->set_child_tid); + put_user(task_pid_vnr(current), current->set_child_tid); } /* * context_switch - switch to the new MM and the new * thread's register state. */ -static inline struct task_struct * +static inline void context_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next) { - struct mm_struct *mm = next->mm; - struct mm_struct *oldmm = prev->active_mm; + struct mm_struct *mm, *oldmm; + prepare_task_switch(rq, prev, next); + mm = next->mm; + oldmm = prev->active_mm; /* * For paravirt, this is coupled with an exit in switch_to to * combine the page table reload and the switch backend into @@ -1959,16 +1970,15 @@ context_switch(struct rq *rq, struct tas */ arch_enter_lazy_cpu_mode(); - if (!mm) { + if (unlikely(!mm)) { next->active_mm = oldmm; atomic_inc(&oldmm->mm_count); enter_lazy_tlb(oldmm, next); } else switch_mm(oldmm, mm, next); - if (!prev->mm) { + if (unlikely(!prev->mm)) { prev->active_mm = NULL; - WARN_ON(rq->prev_mm); rq->prev_mm = oldmm; } /* @@ -1984,7 +1994,13 @@ context_switch(struct rq *rq, struct tas /* Here we just switch the register state and the stack. */ switch_to(prev, next, prev); - return prev; + barrier(); + /* + * this_rq must be evaluated again because prev may have moved + * CPUs since it called schedule(), thus the 'rq' on its stack + * frame will be invalid. + */ + finish_task_switch(this_rq(), prev); } /* @@ -2057,17 +2073,38 @@ unsigned long nr_active(void) return running + uninterruptible; } -#ifdef CONFIG_SMP - /* - * Is this task likely cache-hot: + * Update rq->cpu_load[] statistics. This function is usually called every + * scheduler tick (TICK_NSEC). 
*/ -static inline int -task_hot(struct task_struct *p, unsigned long long now, struct sched_domain *sd) +static void update_cpu_load(struct rq *this_rq) { - return (long long)(now - p->last_ran) < (long long)sd->cache_hot_time; + unsigned long this_load = this_rq->load.weight; + int i, scale; + + this_rq->nr_load_updates++; + + /* Update our load: */ + for (i = 0, scale = 1; i < CPU_LOAD_IDX_MAX; i++, scale += scale) { + unsigned long old_load, new_load; + + /* scale is effectively 1 << i now, and >> i divides by scale */ + + old_load = this_rq->cpu_load[i]; + new_load = this_load; + /* + * Round up the averaging division if load is increasing. This + * prevents us from getting stuck on 9 if the load is 10, for + * example. + */ + if (new_load > old_load) + new_load += scale-1; + this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i; + } } +#ifdef CONFIG_SMP + /* * double_rq_lock - safely lock two runqueues * @@ -2091,6 +2128,8 @@ static void double_rq_lock(struct rq *rq spin_lock(&rq1->lock); } } + update_rq_clock(rq1); + update_rq_clock(rq2); } /* @@ -2184,23 +2223,17 @@ void sched_exec(void) * pull_task - move a task from a remote runqueue to the local runqueue. * Both runqueues must be locked. */ -static void pull_task(struct rq *src_rq, struct prio_array *src_array, - struct task_struct *p, struct rq *this_rq, - struct prio_array *this_array, int this_cpu) +static void pull_task(struct rq *src_rq, struct task_struct *p, + struct rq *this_rq, int this_cpu) { - dequeue_task(p, src_array); - dec_nr_running(p, src_rq); + deactivate_task(src_rq, p, 0); set_task_cpu(p, this_cpu); - inc_nr_running(p, this_rq); - enqueue_task(p, this_array); - p->timestamp = (p->timestamp - src_rq->most_recent_timestamp) - + this_rq->most_recent_timestamp; + activate_task(this_rq, p, 0); /* * Note that idle threads have a prio of MAX_PRIO, for this test * to be always true for them. */ - if (TASK_PREEMPTS_CURR(p, this_rq)) - resched_task(this_rq->curr); + check_preempt_curr(this_rq, p); } /* @@ -2208,7 +2241,7 @@ static void pull_task(struct rq *src_rq, */ static int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, - struct sched_domain *sd, enum idle_type idle, + struct sched_domain *sd, enum cpu_idle_type idle, int *all_pinned) { /* @@ -2217,12 +2250,16 @@ int can_migrate_task(struct task_struct * 2) cannot be migrated to this CPU due to cpus_allowed, or * 3) are cache-hot on their current CPU. */ - if (!cpu_isset(this_cpu, p->cpus_allowed)) + if (!cpu_isset(this_cpu, p->cpus_allowed)) { + schedstat_inc(p, se.nr_failed_migrations_affine); return 0; + } *all_pinned = 0; - if (task_running(rq, p)) + if (task_running(rq, p)) { + schedstat_inc(p, se.nr_failed_migrations_running); return 0; + } /* * Aggressive migration if: @@ -2230,131 +2267,75 @@ int can_migrate_task(struct task_struct * 2) too many balance attempts have failed. 
*/ - if (sd->nr_balance_failed > sd->cache_nice_tries) { + if (!task_hot(p, rq->clock, sd) || + sd->nr_balance_failed > sd->cache_nice_tries) { #ifdef CONFIG_SCHEDSTATS - if (task_hot(p, rq->most_recent_timestamp, sd)) + if (task_hot(p, rq->clock, sd)) { schedstat_inc(sd, lb_hot_gained[idle]); + schedstat_inc(p, se.nr_forced_migrations); + } #endif return 1; } - if (task_hot(p, rq->most_recent_timestamp, sd)) + if (task_hot(p, rq->clock, sd)) { + schedstat_inc(p, se.nr_failed_migrations_hot); return 0; + } return 1; } -#define rq_best_prio(rq) min((rq)->curr->prio, (rq)->best_expired_prio) - -/* - * move_tasks tries to move up to max_nr_move tasks and max_load_move weighted - * load from busiest to this_rq, as part of a balancing operation within - * "domain". Returns the number of tasks moved. - * - * Called with both runqueues locked. - */ -static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, - unsigned long max_nr_move, unsigned long max_load_move, - struct sched_domain *sd, enum idle_type idle, - int *all_pinned) +static unsigned long +balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, + unsigned long max_load_move, struct sched_domain *sd, + enum cpu_idle_type idle, int *all_pinned, + int *this_best_prio, struct rq_iterator *iterator) { - int idx, pulled = 0, pinned = 0, this_best_prio, best_prio, - best_prio_seen, skip_for_load; - struct prio_array *array, *dst_array; - struct list_head *head, *curr; - struct task_struct *tmp; - long rem_load_move; + int loops = 0, pulled = 0, pinned = 0, skip_for_load; + struct task_struct *p; + long rem_load_move = max_load_move; - if (max_nr_move == 0 || max_load_move == 0) + if (max_load_move == 0) goto out; - rem_load_move = max_load_move; pinned = 1; - this_best_prio = rq_best_prio(this_rq); - best_prio = rq_best_prio(busiest); - /* - * Enable handling of the case where there is more than one task - * with the best priority. If the current running task is one - * of those with prio==best_prio we know it won't be moved - * and therefore it's safe to override the skip (based on load) of - * any task we find with that prio. - */ - best_prio_seen = best_prio == busiest->curr->prio; - - /* - * We first consider expired tasks. Those will likely not be - * executed in the near future, and they are most likely to - * be cache-cold, thus switching CPUs has the least effect - * on them. - */ - if (busiest->expired->nr_active) { - array = busiest->expired; - dst_array = this_rq->expired; - } else { - array = busiest->active; - dst_array = this_rq->active; - } -new_array: - /* Start searching at priority 0: */ - idx = 0; -skip_bitmap: - if (!idx) - idx = sched_find_first_bit(array->bitmap); - else - idx = find_next_bit(array->bitmap, MAX_PRIO, idx); - if (idx >= MAX_PRIO) { - if (array == busiest->expired && busiest->active->nr_active) { - array = busiest->active; - dst_array = this_rq->active; - goto new_array; - } + /* + * Start the load-balancing iterator: + */ + p = iterator->start(iterator->arg); +next: + if (!p || loops++ > sysctl_sched_nr_migrate) goto out; - } - - head = array->queue + idx; - curr = head->prev; -skip_queue: - tmp = list_entry(curr, struct task_struct, run_list); - - curr = curr->prev; - /* - * To help distribute high priority tasks accross CPUs we don't + * To help distribute high priority tasks across CPUs we don't * skip a task if it will be the highest priority task (i.e. 
smallest * prio value) on its new queue regardless of its load weight */ - skip_for_load = tmp->load_weight > rem_load_move; - if (skip_for_load && idx < this_best_prio) - skip_for_load = !best_prio_seen && idx == best_prio; - if (skip_for_load || - !can_migrate_task(tmp, busiest, this_cpu, sd, idle, &pinned)) { - - best_prio_seen |= idx == best_prio; - if (curr != head) - goto skip_queue; - idx++; - goto skip_bitmap; + skip_for_load = (p->se.load.weight >> 1) > rem_load_move + + SCHED_LOAD_SCALE_FUZZ; + if ((skip_for_load && p->prio >= *this_best_prio) || + !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) { + p = iterator->next(iterator->arg); + goto next; } - pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu); + pull_task(busiest, p, this_rq, this_cpu); pulled++; - rem_load_move -= tmp->load_weight; + rem_load_move -= p->se.load.weight; /* - * We only want to steal up to the prescribed number of tasks - * and the prescribed amount of weighted load. + * We only want to steal up to the prescribed amount of weighted load. */ - if (pulled < max_nr_move && rem_load_move > 0) { - if (idx < this_best_prio) - this_best_prio = idx; - if (curr != head) - goto skip_queue; - idx++; - goto skip_bitmap; + if (rem_load_move > 0) { + if (p->prio < *this_best_prio) + *this_best_prio = p->prio; + p = iterator->next(iterator->arg); + goto next; } out: /* - * Right now, this is the only place pull_task() is called, + * Right now, this is one of only two places pull_task() is called, * so we can safely collect pull_task() stats here rather than * inside pull_task(). */ @@ -2362,7 +2343,80 @@ out: if (all_pinned) *all_pinned = pinned; - return pulled; + + return max_load_move - rem_load_move; +} + +/* + * move_tasks tries to move up to max_load_move weighted load from busiest to + * this_rq, as part of a balancing operation within domain "sd". + * Returns 1 if successful and 0 otherwise. + * + * Called with both runqueues locked. + */ +static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, + unsigned long max_load_move, + struct sched_domain *sd, enum cpu_idle_type idle, + int *all_pinned) +{ + const struct sched_class *class = sched_class_highest; + unsigned long total_load_moved = 0; + int this_best_prio = this_rq->curr->prio; + + do { + total_load_moved += + class->load_balance(this_rq, this_cpu, busiest, + max_load_move - total_load_moved, + sd, idle, all_pinned, &this_best_prio); + class = class->next; + } while (class && max_load_move > total_load_moved); + + return total_load_moved > 0; +} + +static int +iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, + struct sched_domain *sd, enum cpu_idle_type idle, + struct rq_iterator *iterator) +{ + struct task_struct *p = iterator->start(iterator->arg); + int pinned = 0; + + while (p) { + if (can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) { + pull_task(busiest, p, this_rq, this_cpu); + /* + * Right now, this is only the second place pull_task() + * is called, so we can safely collect pull_task() + * stats here rather than inside pull_task(). + */ + schedstat_inc(sd, lb_gained[idle]); + + return 1; + } + p = iterator->next(iterator->arg); + } + + return 0; +} + +/* + * move_one_task tries to move exactly one task from busiest to this_rq, as + * part of active balancing operations within "domain". + * Returns 1 if successful and 0 otherwise. + * + * Called with both runqueues locked. 
+ */ +static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, + struct sched_domain *sd, enum cpu_idle_type idle) +{ + const struct sched_class *class; + + for (class = sched_class_highest; class; class = class->next) + if (class->move_one_task(this_rq, this_cpu, busiest, sd, idle)) + return 1; + + return 0; } /* @@ -2372,15 +2426,15 @@ out: */ static struct sched_group * find_busiest_group(struct sched_domain *sd, int this_cpu, - unsigned long *imbalance, enum idle_type idle, int *sd_idle, - cpumask_t *cpus, int *balance) + unsigned long *imbalance, enum cpu_idle_type idle, + int *sd_idle, cpumask_t *cpus, int *balance) { struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; unsigned long max_load, avg_load, total_load, this_load, total_pwr; unsigned long max_pull; unsigned long busiest_load_per_task, busiest_nr_running; unsigned long this_load_per_task, this_nr_running; - int load_idx; + int load_idx, group_imb = 0; #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) int power_savings_balance = 1; unsigned long leader_nr_running = 0, min_load_per_task = 0; @@ -2391,17 +2445,18 @@ find_busiest_group(struct sched_domain * max_load = this_load = total_load = total_pwr = 0; busiest_load_per_task = busiest_nr_running = 0; this_load_per_task = this_nr_running = 0; - if (idle == NOT_IDLE) + if (idle == CPU_NOT_IDLE) load_idx = sd->busy_idx; - else if (idle == NEWLY_IDLE) + else if (idle == CPU_NEWLY_IDLE) load_idx = sd->newidle_idx; else load_idx = sd->idle_idx; do { - unsigned long load, group_capacity; + unsigned long load, group_capacity, max_cpu_load, min_cpu_load; int local_group; int i; + int __group_imb = 0; unsigned int balance_cpu = -1, first_idle_cpu = 0; unsigned long sum_nr_running, sum_weighted_load; @@ -2412,6 +2467,8 @@ find_busiest_group(struct sched_domain * /* Tally up the load of all CPUs in the group */ sum_weighted_load = sum_nr_running = avg_load = 0; + max_cpu_load = 0; + min_cpu_load = ~0UL; for_each_cpu_mask(i, group->cpumask) { struct rq *rq; @@ -2421,7 +2478,7 @@ find_busiest_group(struct sched_domain * rq = cpu_rq(i); - if (*sd_idle && !idle_cpu(i)) + if (*sd_idle && rq->nr_running) *sd_idle = 0; /* Bias balancing toward cpus of our domain */ @@ -2432,20 +2489,27 @@ find_busiest_group(struct sched_domain * } load = target_load(i, load_idx); - } else + } else { load = source_load(i, load_idx); + if (load > max_cpu_load) + max_cpu_load = load; + if (min_cpu_load > load) + min_cpu_load = load; + } avg_load += load; sum_nr_running += rq->nr_running; - sum_weighted_load += rq->raw_weighted_load; + sum_weighted_load += weighted_cpuload(i); } /* * First idle cpu or the first cpu(busiest) in this sched group * is eligible for doing load balancing at this and above - * domains. + * domains. In the newly idle case, we will allow all the cpus + * to do the newly idle load balance.
*/ - if (local_group && balance_cpu != this_cpu && balance) { + if (idle != CPU_NEWLY_IDLE && local_group && + balance_cpu != this_cpu && balance) { *balance = 0; goto ret; } @@ -2457,6 +2521,9 @@ find_busiest_group(struct sched_domain * avg_load = sg_div_cpu_power(group, avg_load * SCHED_LOAD_SCALE); + if ((max_cpu_load - min_cpu_load) > SCHED_LOAD_SCALE) + __group_imb = 1; + group_capacity = group->__cpu_power / SCHED_LOAD_SCALE; if (local_group) { @@ -2465,11 +2532,12 @@ find_busiest_group(struct sched_domain * this_nr_running = sum_nr_running; this_load_per_task = sum_weighted_load; } else if (avg_load > max_load && - sum_nr_running > group_capacity) { + (sum_nr_running > group_capacity || __group_imb)) { max_load = avg_load; busiest = group; busiest_nr_running = sum_nr_running; busiest_load_per_task = sum_weighted_load; + group_imb = __group_imb; } #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) @@ -2477,8 +2545,9 @@ find_busiest_group(struct sched_domain * * Busy processors will not participate in power savings * balance. */ - if (idle == NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE)) - goto group_next; + if (idle == CPU_NOT_IDLE || + !(sd->flags & SD_POWERSAVINGS_BALANCE)) + goto group_next; /* * If the local group is idle or completely loaded @@ -2488,42 +2557,42 @@ find_busiest_group(struct sched_domain * !this_nr_running)) power_savings_balance = 0; - /* + /* * If a group is already running at full capacity or idle, * don't include that group in power savings calculations - */ - if (!power_savings_balance || sum_nr_running >= group_capacity + */ + if (!power_savings_balance || sum_nr_running >= group_capacity || !sum_nr_running) - goto group_next; + goto group_next; - /* + /* * Calculate the group which has the least non-idle load. 
- * This is the group from where we need to pick up the load - * for saving power - */ - if ((sum_nr_running < min_nr_running) || - (sum_nr_running == min_nr_running && + * This is the group from where we need to pick up the load + * for saving power + */ + if ((sum_nr_running < min_nr_running) || + (sum_nr_running == min_nr_running && first_cpu(group->cpumask) < first_cpu(group_min->cpumask))) { - group_min = group; - min_nr_running = sum_nr_running; + group_min = group; + min_nr_running = sum_nr_running; min_load_per_task = sum_weighted_load / sum_nr_running; - } + } - /* + /* * Calculate the group which is almost near its - * capacity but still has some space to pick up some load - * from other group and save more power - */ - if (sum_nr_running <= group_capacity - 1) { - if (sum_nr_running > leader_nr_running || - (sum_nr_running == leader_nr_running && - first_cpu(group->cpumask) > - first_cpu(group_leader->cpumask))) { - group_leader = group; - leader_nr_running = sum_nr_running; - } + * capacity but still has some space to pick up some load + * from other group and save more power + */ + if (sum_nr_running <= group_capacity - 1) { + if (sum_nr_running > leader_nr_running || + (sum_nr_running == leader_nr_running && + first_cpu(group->cpumask) > + first_cpu(group_leader->cpumask))) { + group_leader = group; + leader_nr_running = sum_nr_running; + } } group_next: #endif @@ -2540,6 +2609,9 @@ group_next: goto out_balanced; busiest_load_per_task /= busiest_nr_running; + if (group_imb) + busiest_load_per_task = min(busiest_load_per_task, avg_load); + /* * We're trying to get all the cpus to the average_load, so we don't * want to push ourselves above the average load, nor do we wish to @@ -2592,7 +2664,8 @@ small_imbalance: } else this_load_per_task = SCHED_LOAD_SCALE; - if (max_load - this_load >= busiest_load_per_task * imbn) { + if (max_load - this_load + SCHED_LOAD_SCALE_FUZZ >= + busiest_load_per_task * imbn) { *imbalance = busiest_load_per_task; return busiest; } @@ -2629,17 +2702,15 @@ small_imbalance: pwr_move /= SCHED_LOAD_SCALE; /* Move if we gain throughput */ - if (pwr_move <= pwr_now) - goto out_balanced; - - *imbalance = busiest_load_per_task; + if (pwr_move > pwr_now) + *imbalance = busiest_load_per_task; } return busiest; out_balanced: #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) - if (idle == NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE)) + if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE)) goto ret; if (this == group_leader && group_leader != group_min) { @@ -2656,7 +2727,7 @@ ret: * find_busiest_queue - find the busiest runqueue among the cpus in group. */ static struct rq * -find_busiest_queue(struct sched_group *group, enum idle_type idle, +find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle, unsigned long imbalance, cpumask_t *cpus) { struct rq *busiest = NULL, *rq; @@ -2664,17 +2735,19 @@ find_busiest_queue(struct sched_group *g int i; for_each_cpu_mask(i, group->cpumask) { + unsigned long wl; if (!cpu_isset(i, *cpus)) continue; rq = cpu_rq(i); + wl = weighted_cpuload(i); - if (rq->nr_running == 1 && rq->raw_weighted_load > imbalance) + if (rq->nr_running == 1 && wl > imbalance) continue; - if (rq->raw_weighted_load > max_load) { - max_load = rq->raw_weighted_load; + if (wl > max_load) { + max_load = wl; busiest = rq; } } @@ -2688,20 +2761,15 @@ find_busiest_queue(struct sched_group *g */ #define MAX_PINNED_INTERVAL 512 -static inline unsigned long minus_1_or_zero(unsigned long n) -{ - return n > 0 ? 
n - 1 : 0; -} - /* * Check this_cpu to ensure it is balanced within domain. Attempt to move * tasks if there is an imbalance. */ static int load_balance(int this_cpu, struct rq *this_rq, - struct sched_domain *sd, enum idle_type idle, + struct sched_domain *sd, enum cpu_idle_type idle, int *balance) { - int nr_moved, all_pinned = 0, active_balance = 0, sd_idle = 0; + int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0; struct sched_group *group; unsigned long imbalance; struct rq *busiest; @@ -2711,14 +2779,14 @@ static int load_balance(int this_cpu, st /* * When power savings policy is enabled for the parent domain, idle * sibling can pick up load irrespective of busy siblings. In this case, - * let the state of idle sibling percolate up as IDLE, instead of - * portraying it as NOT_IDLE. + * let the state of idle sibling percolate up as CPU_IDLE, instead of + * portraying it as CPU_NOT_IDLE. */ - if (idle != NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER && + if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER && !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) sd_idle = 1; - schedstat_inc(sd, lb_cnt[idle]); + schedstat_inc(sd, lb_count[idle]); redo: group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle, @@ -2742,18 +2810,17 @@ redo: schedstat_add(sd, lb_imbalance[idle], imbalance); - nr_moved = 0; + ld_moved = 0; if (busiest->nr_running > 1) { /* * Attempt to move tasks. If find_busiest_group has found * an imbalance but busiest->nr_running <= 1, the group is - * still unbalanced. nr_moved simply stays zero, so it is + * still unbalanced. ld_moved simply stays zero, so it is * correctly treated as an imbalance. */ local_irq_save(flags); double_rq_lock(this_rq, busiest); - nr_moved = move_tasks(this_rq, this_cpu, busiest, - minus_1_or_zero(busiest->nr_running), + ld_moved = move_tasks(this_rq, this_cpu, busiest, imbalance, sd, idle, &all_pinned); double_rq_unlock(this_rq, busiest); local_irq_restore(flags); @@ -2761,7 +2828,7 @@ redo: /* * some other cpu did the load balance for us. */ - if (nr_moved && this_cpu != smp_processor_id()) + if (ld_moved && this_cpu != smp_processor_id()) resched_cpu(this_cpu); /* All tasks on this runqueue were pinned by CPU affinity */ @@ -2773,7 +2840,7 @@ redo: } } - if (!nr_moved) { + if (!ld_moved) { schedstat_inc(sd, lb_failed[idle]); sd->nr_balance_failed++; @@ -2822,10 +2889,10 @@ redo: sd->balance_interval *= 2; } - if (!nr_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER && + if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER && !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) return -1; - return nr_moved; + return ld_moved; out_balanced: schedstat_inc(sd, lb_balanced[idle]); @@ -2848,7 +2915,7 @@ out_one_pinned: * Check this_cpu to ensure it is balanced within domain. Attempt to move * tasks if there is an imbalance. * - * Called from schedule when this_rq is about to become idle (NEWLY_IDLE). + * Called from schedule when this_rq is about to become idle (CPU_NEWLY_IDLE). * this_rq is locked. */ static int @@ -2857,68 +2924,71 @@ load_balance_newidle(int this_cpu, struc struct sched_group *group; struct rq *busiest = NULL; unsigned long imbalance; - int nr_moved = 0; + int ld_moved = 0; int sd_idle = 0; + int all_pinned = 0; cpumask_t cpus = CPU_MASK_ALL; /* * When power savings policy is enabled for the parent domain, idle * sibling can pick up load irrespective of busy siblings. In this case, * let the state of idle sibling percolate up as IDLE, instead of - * portraying it as NOT_IDLE. 
+ * portraying it as CPU_NOT_IDLE. */ if (sd->flags & SD_SHARE_CPUPOWER && !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) sd_idle = 1; - schedstat_inc(sd, lb_cnt[NEWLY_IDLE]); + schedstat_inc(sd, lb_count[CPU_NEWLY_IDLE]); redo: - group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE, + group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE, &sd_idle, &cpus, NULL); if (!group) { - schedstat_inc(sd, lb_nobusyg[NEWLY_IDLE]); + schedstat_inc(sd, lb_nobusyg[CPU_NEWLY_IDLE]); goto out_balanced; } - busiest = find_busiest_queue(group, NEWLY_IDLE, imbalance, + busiest = find_busiest_queue(group, CPU_NEWLY_IDLE, imbalance, &cpus); if (!busiest) { - schedstat_inc(sd, lb_nobusyq[NEWLY_IDLE]); + schedstat_inc(sd, lb_nobusyq[CPU_NEWLY_IDLE]); goto out_balanced; } BUG_ON(busiest == this_rq); - schedstat_add(sd, lb_imbalance[NEWLY_IDLE], imbalance); + schedstat_add(sd, lb_imbalance[CPU_NEWLY_IDLE], imbalance); - nr_moved = 0; + ld_moved = 0; if (busiest->nr_running > 1) { /* Attempt to move tasks */ double_lock_balance(this_rq, busiest); - nr_moved = move_tasks(this_rq, this_cpu, busiest, - minus_1_or_zero(busiest->nr_running), - imbalance, sd, NEWLY_IDLE, NULL); + /* this_rq->clock is already updated */ + update_rq_clock(busiest); + ld_moved = move_tasks(this_rq, this_cpu, busiest, + imbalance, sd, CPU_NEWLY_IDLE, + &all_pinned); spin_unlock(&busiest->lock); - if (!nr_moved) { + if (unlikely(all_pinned)) { cpu_clear(cpu_of(busiest), cpus); if (!cpus_empty(cpus)) goto redo; } } - if (!nr_moved) { - schedstat_inc(sd, lb_failed[NEWLY_IDLE]); + if (!ld_moved) { + schedstat_inc(sd, lb_failed[CPU_NEWLY_IDLE]); if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) return -1; } else sd->nr_balance_failed = 0; - return nr_moved; + return ld_moved; out_balanced: - schedstat_inc(sd, lb_balanced[NEWLY_IDLE]); + schedstat_inc(sd, lb_balanced[CPU_NEWLY_IDLE]); if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) return -1; @@ -2934,8 +3004,8 @@ out_balanced: static void idle_balance(int this_cpu, struct rq *this_rq) { struct sched_domain *sd; - int pulled_task = 0; - unsigned long next_balance = jiffies + 60 * HZ; + int pulled_task = -1; + unsigned long next_balance = jiffies + HZ; for_each_domain(this_cpu, sd) { unsigned long interval; @@ -2954,12 +3024,13 @@ static void idle_balance(int this_cpu, s if (pulled_task) break; } - if (!pulled_task) + if (pulled_task || time_after(jiffies, this_rq->next_balance)) { /* * We are going idle. next_balance may be set based on * a busy processor. So reset next_balance. */ this_rq->next_balance = next_balance; + } } /* @@ -2991,6 +3062,8 @@ static void active_load_balance(struct r /* move a task from busiest_rq to target_rq */ double_lock_balance(busiest_rq, target_rq); + update_rq_clock(busiest_rq); + update_rq_clock(target_rq); /* Search for an sd spanning us and the target CPU. 
*/ for_each_domain(target_cpu, sd) { @@ -3000,11 +3073,10 @@ static void active_load_balance(struct r } if (likely(sd)) { - schedstat_inc(sd, alb_cnt); + schedstat_inc(sd, alb_count); - if (move_tasks(target_rq, target_cpu, busiest_rq, 1, - RTPRIO_TO_LOAD_WEIGHT(100), sd, SCHED_IDLE, - NULL)) + if (move_one_task(target_rq, target_cpu, busiest_rq, + sd, CPU_IDLE)) schedstat_inc(sd, alb_pushed); else schedstat_inc(sd, alb_failed); @@ -3012,32 +3084,6 @@ static void active_load_balance(struct r spin_unlock(&target_rq->lock); } -static void update_load(struct rq *this_rq) -{ - unsigned long this_load; - unsigned int i, scale; - - this_load = this_rq->raw_weighted_load; - - /* Update our load: */ - for (i = 0, scale = 1; i < 3; i++, scale += scale) { - unsigned long old_load, new_load; - - /* scale is effectively 1 << i now, and >> i divides by scale */ - - old_load = this_rq->cpu_load[i]; - new_load = this_load; - /* - * Round up the averaging division if load is increasing. This - * prevents us from getting stuck on 9 if the load is 10, for - * example. - */ - if (new_load > old_load) - new_load += scale-1; - this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i; - } -} - #ifdef CONFIG_NO_HZ static struct { atomic_t load_balancer; @@ -3120,7 +3166,7 @@ static DEFINE_SPINLOCK(balancing); * * Balancing parameters are set up in arch_init_sched_domains. */ -static inline void rebalance_domains(int cpu, enum idle_type idle) +static void rebalance_domains(int cpu, enum cpu_idle_type idle) { int balance = 1; struct rq *rq = cpu_rq(cpu); @@ -3128,19 +3174,23 @@ static inline void rebalance_domains(int struct sched_domain *sd; /* Earliest time when we have to do rebalance again */ unsigned long next_balance = jiffies + 60*HZ; + int update_next_balance = 0; for_each_domain(cpu, sd) { if (!(sd->flags & SD_LOAD_BALANCE)) continue; interval = sd->balance_interval; - if (idle != SCHED_IDLE) + if (idle != CPU_IDLE) interval *= sd->busy_factor; /* scale ms to jiffies */ interval = msecs_to_jiffies(interval); if (unlikely(!interval)) interval = 1; + if (interval > HZ*NR_CPUS/10) + interval = HZ*NR_CPUS/10; + if (sd->flags & SD_SERIALIZE) { if (!spin_trylock(&balancing)) @@ -3154,15 +3204,17 @@ static inline void rebalance_domains(int * longer idle, or one of our SMT siblings is * not idle. */ - idle = NOT_IDLE; + idle = CPU_NOT_IDLE; } sd->last_balance = jiffies; } if (sd->flags & SD_SERIALIZE) spin_unlock(&balancing); out: - if (time_after(next_balance, sd->last_balance + interval)) + if (time_after(next_balance, sd->last_balance + interval)) { next_balance = sd->last_balance + interval; + update_next_balance = 1; + } /* * Stop the load balance at this level. There is another @@ -3172,7 +3224,14 @@ out: if (!balance) break; } - rq->next_balance = next_balance; + + /* + * next_balance will be updated only when there is a need. + * When the cpu is attached to null domain for ex, it will not be + * updated. + */ + if (likely(update_next_balance)) + rq->next_balance = next_balance; } /* @@ -3182,11 +3241,12 @@ out: */ static void run_rebalance_domains(struct softirq_action *h) { - int local_cpu = smp_processor_id(); - struct rq *local_rq = cpu_rq(local_cpu); - enum idle_type idle = local_rq->idle_at_tick ? SCHED_IDLE : NOT_IDLE; + int this_cpu = smp_processor_id(); + struct rq *this_rq = cpu_rq(this_cpu); + enum cpu_idle_type idle = this_rq->idle_at_tick ? 
+ CPU_IDLE : CPU_NOT_IDLE; - rebalance_domains(local_cpu, idle); + rebalance_domains(this_cpu, idle); #ifdef CONFIG_NO_HZ /* @@ -3194,13 +3254,13 @@ static void run_rebalance_domains(struct * balancing on behalf of the other idle cpus whose ticks are * stopped. */ - if (local_rq->idle_at_tick && - atomic_read(&nohz.load_balancer) == local_cpu) { + if (this_rq->idle_at_tick && + atomic_read(&nohz.load_balancer) == this_cpu) { cpumask_t cpus = nohz.cpu_mask; struct rq *rq; int balance_cpu; - cpu_clear(local_cpu, cpus); + cpu_clear(this_cpu, cpus); for_each_cpu_mask(balance_cpu, cpus) { /* * If this cpu gets work to do, stop the load balancing @@ -3210,11 +3270,11 @@ static void run_rebalance_domains(struct if (need_resched()) break; - rebalance_domains(balance_cpu, SCHED_IDLE); + rebalance_domains(balance_cpu, CPU_IDLE); rq = cpu_rq(balance_cpu); - if (time_after(local_rq->next_balance, rq->next_balance)) - local_rq->next_balance = rq->next_balance; + if (time_after(this_rq->next_balance, rq->next_balance)) + this_rq->next_balance = rq->next_balance; } } #endif @@ -3227,9 +3287,8 @@ static void run_rebalance_domains(struct * idle load balancing owner or decide to stop the periodic load balancing, * if the whole system is idle. */ -static inline void trigger_load_balance(int cpu) +static inline void trigger_load_balance(struct rq *rq, int cpu) { - struct rq *rq = cpu_rq(cpu); #ifdef CONFIG_NO_HZ /* * If we were in the nohz mode recently and busy at the current @@ -3281,13 +3340,16 @@ static inline void trigger_load_balance( if (time_after_eq(jiffies, rq->next_balance)) raise_softirq(SCHED_SOFTIRQ); } -#else + +#else /* CONFIG_SMP */ + /* * on UP we do not need to balance between CPUs: */ static inline void idle_balance(int cpu, struct rq *rq) { } + #endif DEFINE_PER_CPU(struct kernel_stat, kstat); @@ -3295,57 +3357,31 @@ DEFINE_PER_CPU(struct kernel_stat, kstat EXPORT_PER_CPU_SYMBOL(kstat); /* - * This is called on clock ticks and on context switches. - * Bank in p->sched_time the ns elapsed since the last tick or switch. + * Return p->sum_exec_runtime plus any more ns on the sched_clock + * that have not yet been banked in case the task is currently running. */ -static inline void -update_cpu_clock(struct task_struct *p, struct rq *rq, unsigned long long now) +unsigned long long task_sched_runtime(struct task_struct *p) { - p->sched_time += now - p->last_ran; - p->last_ran = rq->most_recent_timestamp = now; -} - -/* - * Return current->sched_time plus any more ns on the sched_clock - * that have not yet been banked. - */ -unsigned long long current_sched_time(const struct task_struct *p) -{ - unsigned long long ns; - unsigned long flags; - - local_irq_save(flags); - ns = p->sched_time + sched_clock() - p->last_ran; - local_irq_restore(flags); - - return ns; -} - -/* - * We place interactive tasks back into the active array, if possible. - * - * To guarantee that this does not starve expired tasks we ignore the - * interactivity of a task if the first expired task had to wait more - * than a 'reasonable' amount of time. This deadline timeout is - * load-dependent, as the frequency of array switched decreases with - * increasing number of running tasks. 
We also ignore the interactivity - * if a better static_prio task has expired: - */ -static inline int expired_starving(struct rq *rq) -{ - if (rq->curr->static_prio > rq->best_expired_prio) - return 1; - if (!STARVATION_LIMIT || !rq->expired_timestamp) - return 0; - if (jiffies - rq->expired_timestamp > STARVATION_LIMIT * rq->nr_running) - return 1; - return 0; + unsigned long flags; + u64 ns, delta_exec; + struct rq *rq; + + rq = task_rq_lock(p, &flags); + ns = p->se.sum_exec_runtime; + if (rq->curr == p) { + update_rq_clock(rq); + delta_exec = rq->clock - p->se.exec_start; + if ((s64)delta_exec > 0) + ns += delta_exec; + } + task_rq_unlock(rq, &flags); + + return ns; } /* * Account user cpu time to a process. * @p: the process that the cpu time gets accounted to - * @hardirq_offset: the offset to subtract from hardirq_count() * @cputime: the cpu time spent in user space since the last update */ void account_user_time(struct task_struct *p, cputime_t cputime) @@ -3364,6 +3400,35 @@ void account_user_time(struct task_struc } /* + * Account guest cpu time to a process. + * @p: the process that the cpu time gets accounted to + * @cputime: the cpu time spent in virtual machine since the last update + */ +static void account_guest_time(struct task_struct *p, cputime_t cputime) +{ + cputime64_t tmp; + struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; + + tmp = cputime_to_cputime64(cputime); + + p->utime = cputime_add(p->utime, cputime); + p->gtime = cputime_add(p->gtime, cputime); + + cpustat->user = cputime64_add(cpustat->user, tmp); + cpustat->guest = cputime64_add(cpustat->guest, tmp); +} + +/* + * Account scaled user cpu time to a process. + * @p: the process that the cpu time gets accounted to + * @cputime: the cpu time spent in user space since the last update + */ +void account_user_time_scaled(struct task_struct *p, cputime_t cputime) +{ + p->utimescaled = cputime_add(p->utimescaled, cputime); +} + +/* * Account system cpu time to a process. * @p: the process that the cpu time gets accounted to * @hardirq_offset: the offset to subtract from hardirq_count() @@ -3376,6 +3441,9 @@ void account_system_time(struct task_str struct rq *rq = this_rq(); cputime64_t tmp; + if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) + return account_guest_time(p, cputime); + p->stime = cputime_add(p->stime, cputime); /* Add system time to cpustat. */ @@ -3395,6 +3463,17 @@ void account_system_time(struct task_str } /* + * Account scaled system cpu time to a process. + * @p: the process that the cpu time gets accounted to + * @hardirq_offset: the offset to subtract from hardirq_count() + * @cputime: the cpu time spent in kernel space since the last update + */ +void account_system_time_scaled(struct task_struct *p, cputime_t cputime) +{ + p->stimescaled = cputime_add(p->stimescaled, cputime); +} + +/* * Account for involuntary wait time. * @p: the process from which the cpu time has been stolen * @steal: the cpu time spent in involuntary wait @@ -3415,81 +3494,6 @@ void account_steal_time(struct task_stru cpustat->steal = cputime64_add(cpustat->steal, tmp); } -static void task_running_tick(struct rq *rq, struct task_struct *p) -{ - if (p->array != rq->active) { - /* Task has expired but was not scheduled yet */ - set_tsk_need_resched(p); - return; - } - spin_lock(&rq->lock); - /* - * The task was running during this tick - update the - * time slice counter. Note: we do not update a thread's - * priority until it either goes to sleep or uses up its - * timeslice. 
This makes it possible for interactive tasks - * to use up their timeslices at their highest priority levels. - */ - if (rt_task(p)) { - /* - * RR tasks need a special form of timeslice management. - * FIFO tasks have no timeslices. - */ - if ((p->policy == SCHED_RR) && !--p->time_slice) { - p->time_slice = task_timeslice(p); - p->first_time_slice = 0; - set_tsk_need_resched(p); - - /* put it at the end of the queue: */ - requeue_task(p, rq->active); - } - goto out_unlock; - } - if (!--p->time_slice) { - dequeue_task(p, rq->active); - set_tsk_need_resched(p); - p->prio = effective_prio(p); - p->time_slice = task_timeslice(p); - p->first_time_slice = 0; - - if (!rq->expired_timestamp) - rq->expired_timestamp = jiffies; - if (!TASK_INTERACTIVE(p) || expired_starving(rq)) { - enqueue_task(p, rq->expired); - if (p->static_prio < rq->best_expired_prio) - rq->best_expired_prio = p->static_prio; - } else - enqueue_task(p, rq->active); - } else { - /* - * Prevent a too long timeslice allowing a task to monopolize - * the CPU. We do this by splitting up the timeslice into - * smaller pieces. - * - * Note: this does not mean the task's timeslices expire or - * get lost in any way, they just might be preempted by - * another task of equal priority. (one with higher - * priority would have preempted this task already.) We - * requeue this task to the end of the list on this priority - * level, which is in essence a round-robin of tasks with - * equal priority. - * - * This only applies to tasks in the interactive - * delta range with at least TIMESLICE_GRANULARITY to requeue. - */ - if (TASK_INTERACTIVE(p) && !((task_timeslice(p) - - p->time_slice) % TIMESLICE_GRANULARITY(p)) && - (p->time_slice >= TIMESLICE_GRANULARITY(p)) && - (p->array == rq->active)) { - - requeue_task(p, rq->active); - set_tsk_need_resched(p); - } - } -out_unlock: - spin_unlock(&rq->lock); -} - /* * This function gets called by the timer code, with HZ frequency. * We call it with interrupts disabled. @@ -3499,20 +3503,27 @@ out_unlock: */ void scheduler_tick(void) { - unsigned long long now = sched_clock(); - struct task_struct *p = current; int cpu = smp_processor_id(); - int idle_at_tick = idle_cpu(cpu); struct rq *rq = cpu_rq(cpu); + struct task_struct *curr = rq->curr; + u64 next_tick = rq->tick_timestamp + TICK_NSEC; - update_cpu_clock(p, rq, now); + spin_lock(&rq->lock); + __update_rq_clock(rq); + /* + * Let rq->clock advance by at least TICK_NSEC: + */ + if (unlikely(rq->clock < next_tick)) + rq->clock = next_tick; + rq->tick_timestamp = rq->clock; + update_cpu_load(rq); + if (curr != rq->idle) /* FIXME: needed? 
*/ + curr->sched_class->task_tick(rq, curr); + spin_unlock(&rq->lock); - if (!idle_at_tick) - task_running_tick(rq, p); #ifdef CONFIG_SMP - update_load(rq); - rq->idle_at_tick = idle_at_tick; - trigger_load_balance(cpu); + rq->idle_at_tick = idle_cpu(cpu); + trigger_load_balance(rq, cpu); #endif } @@ -3554,170 +3565,145 @@ EXPORT_SYMBOL(sub_preempt_count); #endif -static inline int interactive_sleep(enum sleep_type sleep_type) +/* + * Print scheduling while atomic bug: + */ +static noinline void __schedule_bug(struct task_struct *prev) { - return (sleep_type == SLEEP_INTERACTIVE || - sleep_type == SLEEP_INTERRUPTED); + struct pt_regs *regs = get_irq_regs(); + + printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n", + prev->comm, prev->pid, preempt_count()); + + debug_show_held_locks(prev); + if (irqs_disabled()) + print_irqtrace_events(prev); + + if (regs) + show_regs(regs); + else + dump_stack(); } /* - * schedule() is the main scheduler function. + * Various schedule()-time debugging checks and statistics: */ -asmlinkage void __sched schedule(void) +static inline void schedule_debug(struct task_struct *prev) { - struct task_struct *prev, *next; - struct prio_array *array; - struct list_head *queue; - unsigned long long now; - unsigned long run_time; - int cpu, idx, new_prio; - long *switch_count; - struct rq *rq; - /* * Test if we are atomic. Since do_exit() needs to call into * schedule() atomically, we ignore that path for now. * Otherwise, whine if we are scheduling when we should not be. */ - if (unlikely(in_atomic() && !current->exit_state)) { - printk(KERN_ERR "BUG: scheduling while atomic: " - "%s/0x%08x/%d\n", - current->comm, preempt_count(), current->pid); - debug_show_held_locks(current); - if (irqs_disabled()) - print_irqtrace_events(current); - dump_stack(); - } + if (unlikely(in_atomic_preempt_off()) && unlikely(!prev->exit_state)) + __schedule_bug(prev); + profile_hit(SCHED_PROFILING, __builtin_return_address(0)); -need_resched: - preempt_disable(); - prev = current; - release_kernel_lock(prev); -need_resched_nonpreemptible: - rq = this_rq(); + schedstat_inc(this_rq(), sched_count); +#ifdef CONFIG_SCHEDSTATS + if (unlikely(prev->lock_depth >= 0)) { + schedstat_inc(this_rq(), bkl_count); + schedstat_inc(prev, sched_info.bkl_count); + } +#endif +} + +/* + * Pick up the highest-prio task: + */ +static inline struct task_struct * +pick_next_task(struct rq *rq, struct task_struct *prev) +{ + const struct sched_class *class; + struct task_struct *p; /* - * The idle thread is not allowed to schedule! - * Remove this check after it has been exercised a bit. + * Optimization: we know that if all tasks are in + * the fair class we can call that function directly: */ - if (unlikely(prev == rq->idle) && prev->state != TASK_RUNNING) { - printk(KERN_ERR "bad: scheduling from the idle thread!\n"); - dump_stack(); + if (likely(rq->nr_running == rq->cfs.nr_running)) { + p = fair_sched_class.pick_next_task(rq); + if (likely(p)) + return p; } - schedstat_inc(rq, sched_cnt); - now = sched_clock(); - if (likely((long long)(now - prev->timestamp) < NS_MAX_SLEEP_AVG)) { - run_time = now - prev->timestamp; - if (unlikely((long long)(now - prev->timestamp) < 0)) - run_time = 0; - } else - run_time = NS_MAX_SLEEP_AVG; + class = sched_class_highest; + for ( ; ; ) { + p = class->pick_next_task(rq); + if (p) + return p; + /* + * Will never be NULL as the idle class always + * returns a non-NULL p: + */ + class = class->next; + } +} + +/* + * schedule() is the main scheduler function. 
+ */ +asmlinkage void __sched schedule(void) +{ + struct task_struct *prev, *next; + long *switch_count; + struct rq *rq; + int cpu; + +need_resched: + preempt_disable(); + cpu = smp_processor_id(); + rq = cpu_rq(cpu); + rcu_qsctr_inc(cpu); + prev = rq->curr; + switch_count = &prev->nivcsw; + + release_kernel_lock(prev); +need_resched_nonpreemptible: + + schedule_debug(prev); /* - * Tasks charged proportionately less run_time at high sleep_avg to - * delay them losing their interactive status + * Do the rq-clock update outside the rq lock: */ - run_time /= (CURRENT_BONUS(prev) ? : 1); - - spin_lock_irq(&rq->lock); + local_irq_disable(); + __update_rq_clock(rq); + spin_lock(&rq->lock); + clear_tsk_need_resched(prev); - switch_count = &prev->nivcsw; if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { - switch_count = &prev->nvcsw; if (unlikely((prev->state & TASK_INTERRUPTIBLE) && - unlikely(signal_pending(prev)))) + unlikely(signal_pending(prev)))) { prev->state = TASK_RUNNING; - else { - if (prev->state == TASK_UNINTERRUPTIBLE) - rq->nr_uninterruptible++; - deactivate_task(prev, rq); + } else { + deactivate_task(rq, prev, 1); } + switch_count = &prev->nvcsw; } - cpu = smp_processor_id(); - if (unlikely(!rq->nr_running)) { + if (unlikely(!rq->nr_running)) idle_balance(cpu, rq); - if (!rq->nr_running) { - next = rq->idle; - rq->expired_timestamp = 0; - goto switch_tasks; - } - } - array = rq->active; - if (unlikely(!array->nr_active)) { - /* - * Switch the active and expired arrays. - */ - schedstat_inc(rq, sched_switch); - rq->active = rq->expired; - rq->expired = array; - array = rq->active; - rq->expired_timestamp = 0; - rq->best_expired_prio = MAX_PRIO; - } - - idx = sched_find_first_bit(array->bitmap); - queue = array->queue + idx; - next = list_entry(queue->next, struct task_struct, run_list); - - if (!rt_task(next) && interactive_sleep(next->sleep_type)) { - unsigned long long delta = now - next->timestamp; - if (unlikely((long long)(now - next->timestamp) < 0)) - delta = 0; - - if (next->sleep_type == SLEEP_INTERACTIVE) - delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128; - - array = next->array; - new_prio = recalc_task_prio(next, next->timestamp + delta); - - if (unlikely(next->prio != new_prio)) { - dequeue_task(next, array); - next->prio = new_prio; - enqueue_task(next, array); - } - } - next->sleep_type = SLEEP_NORMAL; -switch_tasks: - if (next == rq->idle) - schedstat_inc(rq, sched_goidle); - prefetch(next); - prefetch_stack(next); - clear_tsk_need_resched(prev); - rcu_qsctr_inc(task_cpu(prev)); - - update_cpu_clock(prev, rq, now); - - prev->sleep_avg -= run_time; - if ((long)prev->sleep_avg <= 0) - prev->sleep_avg = 0; - prev->timestamp = prev->last_ran = now; + prev->sched_class->put_prev_task(rq, prev); + next = pick_next_task(rq, prev); sched_info_switch(prev, next); + if (likely(prev != next)) { - next->timestamp = next->last_ran = now; rq->nr_switches++; rq->curr = next; ++*switch_count; - prepare_task_switch(rq, next); - prev = context_switch(rq, prev, next); - barrier(); - /* - * this_rq must be evaluated again because prev may have moved - * CPUs since it called schedule(), thus the 'rq' on its stack - * frame will be invalid. 
- */ - finish_task_switch(this_rq(), prev); + context_switch(rq, prev, next); /* unlocks the rq */ } else spin_unlock_irq(&rq->lock); - prev = current; - if (unlikely(reacquire_kernel_lock(prev) < 0)) + if (unlikely(reacquire_kernel_lock(current) < 0)) { + cpu = smp_processor_id(); + rq = cpu_rq(cpu); goto need_resched_nonpreemptible; + } preempt_enable_no_resched(); if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) goto need_resched; @@ -3744,27 +3730,30 @@ asmlinkage void __sched preempt_schedule if (likely(ti->preempt_count || irqs_disabled())) return; -need_resched: - add_preempt_count(PREEMPT_ACTIVE); - /* - * We keep the big kernel semaphore locked, but we - * clear ->lock_depth so that schedule() doesnt - * auto-release the semaphore: - */ + do { + add_preempt_count(PREEMPT_ACTIVE); + + /* + * We keep the big kernel semaphore locked, but we + * clear ->lock_depth so that schedule() doesnt + * auto-release the semaphore: + */ #ifdef CONFIG_PREEMPT_BKL - saved_lock_depth = task->lock_depth; - task->lock_depth = -1; + saved_lock_depth = task->lock_depth; + task->lock_depth = -1; #endif - schedule(); + schedule(); #ifdef CONFIG_PREEMPT_BKL - task->lock_depth = saved_lock_depth; + task->lock_depth = saved_lock_depth; #endif - sub_preempt_count(PREEMPT_ACTIVE); + sub_preempt_count(PREEMPT_ACTIVE); - /* we could miss a preemption opportunity between schedule and now */ - barrier(); - if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) - goto need_resched; + /* + * Check again in case we missed a preemption opportunity + * between schedule and now. + */ + barrier(); + } while (unlikely(test_thread_flag(TIF_NEED_RESCHED))); } EXPORT_SYMBOL(preempt_schedule); @@ -3784,29 +3773,32 @@ asmlinkage void __sched preempt_schedule /* Catch callers which need to be fixed */ BUG_ON(ti->preempt_count || !irqs_disabled()); -need_resched: - add_preempt_count(PREEMPT_ACTIVE); - /* - * We keep the big kernel semaphore locked, but we - * clear ->lock_depth so that schedule() doesnt - * auto-release the semaphore: - */ + do { + add_preempt_count(PREEMPT_ACTIVE); + + /* + * We keep the big kernel semaphore locked, but we + * clear ->lock_depth so that schedule() doesnt + * auto-release the semaphore: + */ #ifdef CONFIG_PREEMPT_BKL - saved_lock_depth = task->lock_depth; - task->lock_depth = -1; + saved_lock_depth = task->lock_depth; + task->lock_depth = -1; #endif - local_irq_enable(); - schedule(); - local_irq_disable(); + local_irq_enable(); + schedule(); + local_irq_disable(); #ifdef CONFIG_PREEMPT_BKL - task->lock_depth = saved_lock_depth; + task->lock_depth = saved_lock_depth; #endif - sub_preempt_count(PREEMPT_ACTIVE); + sub_preempt_count(PREEMPT_ACTIVE); - /* we could miss a preemption opportunity between schedule and now */ - barrier(); - if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) - goto need_resched; + /* + * Check again in case we missed a preemption opportunity + * between schedule and now. 
+ */ + barrier(); + } while (unlikely(test_thread_flag(TIF_NEED_RESCHED))); } #endif /* CONFIG_PREEMPT */ @@ -3830,10 +3822,9 @@ EXPORT_SYMBOL(default_wake_function); static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, int nr_exclusive, int sync, void *key) { - struct list_head *tmp, *next; + wait_queue_t *curr, *next; - list_for_each_safe(tmp, next, &q->task_list) { - wait_queue_t *curr = list_entry(tmp, wait_queue_t, task_list); + list_for_each_entry_safe(curr, next, &q->task_list, task_list) { unsigned flags = curr->flags; if (curr->func(curr, mode, sync, key) && @@ -3899,7 +3890,7 @@ __wake_up_sync(wait_queue_head_t *q, uns } EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */ -void fastcall complete(struct completion *x) +void complete(struct completion *x) { unsigned long flags; @@ -3911,7 +3902,7 @@ void fastcall complete(struct completion } EXPORT_SYMBOL(complete); -void fastcall complete_all(struct completion *x) +void complete_all(struct completion *x) { unsigned long flags; @@ -3923,196 +3914,120 @@ void fastcall complete_all(struct comple } EXPORT_SYMBOL(complete_all); -void fastcall __sched wait_for_completion(struct completion *x) -{ - might_sleep(); - - spin_lock_irq(&x->wait.lock); - if (!x->done) { - DECLARE_WAITQUEUE(wait, current); - - wait.flags |= WQ_FLAG_EXCLUSIVE; - __add_wait_queue_tail(&x->wait, &wait); - do { - __set_current_state(TASK_UNINTERRUPTIBLE); - spin_unlock_irq(&x->wait.lock); - schedule(); - spin_lock_irq(&x->wait.lock); - } while (!x->done); - __remove_wait_queue(&x->wait, &wait); - } - x->done--; - spin_unlock_irq(&x->wait.lock); -} -EXPORT_SYMBOL(wait_for_completion); - -unsigned long fastcall __sched -wait_for_completion_timeout(struct completion *x, unsigned long timeout) +static inline long __sched +do_wait_for_common(struct completion *x, long timeout, int state) { - might_sleep(); - - spin_lock_irq(&x->wait.lock); if (!x->done) { DECLARE_WAITQUEUE(wait, current); wait.flags |= WQ_FLAG_EXCLUSIVE; __add_wait_queue_tail(&x->wait, &wait); do { - __set_current_state(TASK_UNINTERRUPTIBLE); + if (state == TASK_INTERRUPTIBLE && + signal_pending(current)) { + __remove_wait_queue(&x->wait, &wait); + return -ERESTARTSYS; + } + __set_current_state(state); spin_unlock_irq(&x->wait.lock); timeout = schedule_timeout(timeout); spin_lock_irq(&x->wait.lock); if (!timeout) { __remove_wait_queue(&x->wait, &wait); - goto out; + return timeout; } } while (!x->done); __remove_wait_queue(&x->wait, &wait); } x->done--; -out: - spin_unlock_irq(&x->wait.lock); return timeout; } -EXPORT_SYMBOL(wait_for_completion_timeout); -int fastcall __sched wait_for_completion_interruptible(struct completion *x) +static long __sched +wait_for_common(struct completion *x, long timeout, int state) { - int ret = 0; - might_sleep(); spin_lock_irq(&x->wait.lock); - if (!x->done) { - DECLARE_WAITQUEUE(wait, current); - - wait.flags |= WQ_FLAG_EXCLUSIVE; - __add_wait_queue_tail(&x->wait, &wait); - do { - if (signal_pending(current)) { - ret = -ERESTARTSYS; - __remove_wait_queue(&x->wait, &wait); - goto out; - } - __set_current_state(TASK_INTERRUPTIBLE); - spin_unlock_irq(&x->wait.lock); - schedule(); - spin_lock_irq(&x->wait.lock); - } while (!x->done); - __remove_wait_queue(&x->wait, &wait); - } - x->done--; -out: + timeout = do_wait_for_common(x, timeout, state); spin_unlock_irq(&x->wait.lock); - - return ret; + return timeout; } -EXPORT_SYMBOL(wait_for_completion_interruptible); -unsigned long fastcall __sched -wait_for_completion_interruptible_timeout(struct 
completion *x, - unsigned long timeout) +void __sched wait_for_completion(struct completion *x) { - might_sleep(); + wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE); +} +EXPORT_SYMBOL(wait_for_completion); - spin_lock_irq(&x->wait.lock); - if (!x->done) { - DECLARE_WAITQUEUE(wait, current); +unsigned long __sched +wait_for_completion_timeout(struct completion *x, unsigned long timeout) +{ + return wait_for_common(x, timeout, TASK_UNINTERRUPTIBLE); +} +EXPORT_SYMBOL(wait_for_completion_timeout); - wait.flags |= WQ_FLAG_EXCLUSIVE; - __add_wait_queue_tail(&x->wait, &wait); - do { - if (signal_pending(current)) { - timeout = -ERESTARTSYS; - __remove_wait_queue(&x->wait, &wait); - goto out; - } - __set_current_state(TASK_INTERRUPTIBLE); - spin_unlock_irq(&x->wait.lock); - timeout = schedule_timeout(timeout); - spin_lock_irq(&x->wait.lock); - if (!timeout) { - __remove_wait_queue(&x->wait, &wait); - goto out; - } - } while (!x->done); - __remove_wait_queue(&x->wait, &wait); - } - x->done--; -out: - spin_unlock_irq(&x->wait.lock); - return timeout; +int __sched wait_for_completion_interruptible(struct completion *x) +{ + long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE); + if (t == -ERESTARTSYS) + return t; + return 0; +} +EXPORT_SYMBOL(wait_for_completion_interruptible); + +unsigned long __sched +wait_for_completion_interruptible_timeout(struct completion *x, + unsigned long timeout) +{ + return wait_for_common(x, timeout, TASK_INTERRUPTIBLE); } EXPORT_SYMBOL(wait_for_completion_interruptible_timeout); +static long __sched +sleep_on_common(wait_queue_head_t *q, int state, long timeout) +{ + unsigned long flags; + wait_queue_t wait; -#define SLEEP_ON_VAR \ - unsigned long flags; \ - wait_queue_t wait; \ init_waitqueue_entry(&wait, current); -#define SLEEP_ON_HEAD \ - spin_lock_irqsave(&q->lock,flags); \ - __add_wait_queue(q, &wait); \ - spin_unlock(&q->lock); + __set_current_state(state); -#define SLEEP_ON_TAIL \ - spin_lock_irq(&q->lock); \ - __remove_wait_queue(q, &wait); \ + spin_lock_irqsave(&q->lock, flags); + __add_wait_queue(q, &wait); + spin_unlock(&q->lock); + timeout = schedule_timeout(timeout); + spin_lock_irq(&q->lock); + __remove_wait_queue(q, &wait); spin_unlock_irqrestore(&q->lock, flags); -void fastcall __sched interruptible_sleep_on(wait_queue_head_t *q) -{ - SLEEP_ON_VAR - - current->state = TASK_INTERRUPTIBLE; + return timeout; +} - SLEEP_ON_HEAD - schedule(); - SLEEP_ON_TAIL +void __sched interruptible_sleep_on(wait_queue_head_t *q) +{ + sleep_on_common(q, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); } EXPORT_SYMBOL(interruptible_sleep_on); -long fastcall __sched +long __sched interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout) { - SLEEP_ON_VAR - - current->state = TASK_INTERRUPTIBLE; - - SLEEP_ON_HEAD - timeout = schedule_timeout(timeout); - SLEEP_ON_TAIL - - return timeout; + return sleep_on_common(q, TASK_INTERRUPTIBLE, timeout); } EXPORT_SYMBOL(interruptible_sleep_on_timeout); -void fastcall __sched sleep_on(wait_queue_head_t *q) +void __sched sleep_on(wait_queue_head_t *q) { - SLEEP_ON_VAR - - current->state = TASK_UNINTERRUPTIBLE; - - SLEEP_ON_HEAD - schedule(); - SLEEP_ON_TAIL + sleep_on_common(q, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); } EXPORT_SYMBOL(sleep_on); -long fastcall __sched sleep_on_timeout(wait_queue_head_t *q, long timeout) +long __sched sleep_on_timeout(wait_queue_head_t *q, long timeout) { - SLEEP_ON_VAR - - current->state = TASK_UNINTERRUPTIBLE; - - SLEEP_ON_HEAD - timeout = schedule_timeout(timeout); 
- SLEEP_ON_TAIL - - return timeout; + return sleep_on_common(q, TASK_UNINTERRUPTIBLE, timeout); } - EXPORT_SYMBOL(sleep_on_timeout); #ifdef CONFIG_RT_MUTEXES @@ -4129,39 +4044,46 @@ EXPORT_SYMBOL(sleep_on_timeout); */ void rt_mutex_setprio(struct task_struct *p, int prio) { - struct prio_array *array; unsigned long flags; + int oldprio, on_rq, running; struct rq *rq; - int oldprio; BUG_ON(prio < 0 || prio > MAX_PRIO); rq = task_rq_lock(p, &flags); + update_rq_clock(rq); oldprio = p->prio; - array = p->array; - if (array) - dequeue_task(p, array); + on_rq = p->se.on_rq; + running = task_running(rq, p); + if (on_rq) { + dequeue_task(rq, p, 0); + if (running) + p->sched_class->put_prev_task(rq, p); + } + + if (rt_prio(prio)) + p->sched_class = &rt_sched_class; + else + p->sched_class = &fair_sched_class; + p->prio = prio; - if (array) { - /* - * If changing to an RT priority then queue it - * in the active array! - */ - if (rt_task(p)) - array = rq->active; - enqueue_task(p, array); + if (on_rq) { + if (running) + p->sched_class->set_curr_task(rq); + enqueue_task(rq, p, 0); /* * Reschedule if we are currently running on this runqueue and * our priority decreased, or if we are not currently running on * this runqueue and our priority is higher than the current's */ - if (task_running(rq, p)) { + if (running) { if (p->prio > oldprio) resched_task(rq->curr); - } else if (TASK_PREEMPTS_CURR(p, rq)) - resched_task(rq->curr); + } else { + check_preempt_curr(rq, p); + } } task_rq_unlock(rq, &flags); } @@ -4170,8 +4092,7 @@ void rt_mutex_setprio(struct task_struct void set_user_nice(struct task_struct *p, long nice) { - struct prio_array *array; - int old_prio, delta; + int old_prio, delta, on_rq; unsigned long flags; struct rq *rq; @@ -4182,21 +4103,20 @@ void set_user_nice(struct task_struct *p * the task might be in the middle of scheduling on another CPU. */ rq = task_rq_lock(p, &flags); + update_rq_clock(rq); /* * The RT priorities are set via sched_setscheduler(), but we still * allow the 'normal' nice value to be set - but as expected * it wont have any effect on scheduling until the task is - * not SCHED_NORMAL/SCHED_BATCH: + * SCHED_FIFO/SCHED_RR: */ - if (has_rt_policy(p)) { + if (task_has_rt_policy(p)) { p->static_prio = NICE_TO_PRIO(nice); goto out_unlock; } - array = p->array; - if (array) { - dequeue_task(p, array); - dec_raw_weighted_load(rq, p); - } + on_rq = p->se.on_rq; + if (on_rq) + dequeue_task(rq, p, 0); p->static_prio = NICE_TO_PRIO(nice); set_load_weight(p); @@ -4204,9 +4124,8 @@ void set_user_nice(struct task_struct *p p->prio = effective_prio(p); delta = p->prio - old_prio; - if (array) { - enqueue_task(p, array); - inc_raw_weighted_load(rq, p); + if (on_rq) { + enqueue_task(rq, p, 0); /* * If the task increased its priority or is running and * lowered its priority, then reschedule its CPU: @@ -4320,26 +4239,34 @@ struct task_struct *idle_task(int cpu) * find_process_by_pid - find a process with a matching PID value. * @pid: the pid in question. */ -static inline struct task_struct *find_process_by_pid(pid_t pid) +static struct task_struct *find_process_by_pid(pid_t pid) { - return pid ? find_task_by_pid(pid) : current; + return pid ? find_task_by_vpid(pid) : current; } /* Actually do priority change: must hold rq lock. 
*/ -static void __setscheduler(struct task_struct *p, int policy, int prio) +static void +__setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio) { - BUG_ON(p->array); + BUG_ON(p->se.on_rq); p->policy = policy; + switch (p->policy) { + case SCHED_NORMAL: + case SCHED_BATCH: + case SCHED_IDLE: + p->sched_class = &fair_sched_class; + break; + case SCHED_FIFO: + case SCHED_RR: + p->sched_class = &rt_sched_class; + break; + } + p->rt_priority = prio; p->normal_prio = normal_prio(p); /* we are holding p->pi_lock already */ p->prio = rt_mutex_getprio(p); - /* - * SCHED_BATCH tasks are treated as perpetual CPU hogs: - */ - if (policy == SCHED_BATCH) - p->sleep_avg = 0; set_load_weight(p); } @@ -4354,8 +4281,7 @@ static void __setscheduler(struct task_s int sched_setscheduler(struct task_struct *p, int policy, struct sched_param *param) { - int retval, oldprio, oldpolicy = -1; - struct prio_array *array; + int retval, oldprio, oldpolicy = -1, on_rq, running; unsigned long flags; struct rq *rq; @@ -4366,27 +4292,27 @@ recheck: if (policy < 0) policy = oldpolicy = p->policy; else if (policy != SCHED_FIFO && policy != SCHED_RR && - policy != SCHED_NORMAL && policy != SCHED_BATCH) + policy != SCHED_NORMAL && policy != SCHED_BATCH && + policy != SCHED_IDLE) return -EINVAL; /* * Valid priorities for SCHED_FIFO and SCHED_RR are - * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL and - * SCHED_BATCH is 0. + * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL, + * SCHED_BATCH and SCHED_IDLE is 0. */ if (param->sched_priority < 0 || (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) || (!p->mm && param->sched_priority > MAX_RT_PRIO-1)) return -EINVAL; - if (is_rt_policy(policy) != (param->sched_priority != 0)) + if (rt_policy(policy) != (param->sched_priority != 0)) return -EINVAL; /* * Allow unprivileged RT tasks to decrease priority: */ if (!capable(CAP_SYS_NICE)) { - if (is_rt_policy(policy)) { + if (rt_policy(policy)) { unsigned long rlim_rtprio; - unsigned long flags; if (!lock_task_sighand(p, &flags)) return -ESRCH; @@ -4402,6 +4328,12 @@ recheck: param->sched_priority > rlim_rtprio) return -EPERM; } + /* + * Like positive nice levels, dont allow tasks to + * move out of SCHED_IDLE either: + */ + if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) + return -EPERM; /* can't change other user's priorities */ if ((current->euid != p->euid) && @@ -4429,23 +4361,33 @@ recheck: spin_unlock_irqrestore(&p->pi_lock, flags); goto recheck; } - array = p->array; - if (array) - deactivate_task(p, rq); + update_rq_clock(rq); + on_rq = p->se.on_rq; + running = task_running(rq, p); + if (on_rq) { + deactivate_task(rq, p, 0); + if (running) + p->sched_class->put_prev_task(rq, p); + } + oldprio = p->prio; - __setscheduler(p, policy, param->sched_priority); - if (array) { - __activate_task(p, rq); + __setscheduler(rq, p, policy, param->sched_priority); + + if (on_rq) { + if (running) + p->sched_class->set_curr_task(rq); + activate_task(rq, p, 0); /* * Reschedule if we are currently running on this runqueue and * our priority decreased, or if we are not currently running on * this runqueue and our priority is higher than the current's */ - if (task_running(rq, p)) { + if (running) { if (p->prio > oldprio) resched_task(rq->curr); - } else if (TASK_PREEMPTS_CURR(p, rq)) - resched_task(rq->curr); + } else { + check_preempt_curr(rq, p); + } } __task_rq_unlock(rq); spin_unlock_irqrestore(&p->pi_lock, flags); @@ -4511,10 +4453,10 @@ asmlinkage long sys_sched_setparam(pid_t asmlinkage long 
sys_sched_getscheduler(pid_t pid) { struct task_struct *p; - int retval = -EINVAL; + int retval; if (pid < 0) - goto out_nounlock; + return -EINVAL; retval = -ESRCH; read_lock(&tasklist_lock); @@ -4525,8 +4467,6 @@ asmlinkage long sys_sched_getscheduler(p retval = p->policy; } read_unlock(&tasklist_lock); - -out_nounlock: return retval; } @@ -4539,10 +4479,10 @@ asmlinkage long sys_sched_getparam(pid_t { struct sched_param lp; struct task_struct *p; - int retval = -EINVAL; + int retval; if (!param || pid < 0) - goto out_nounlock; + return -EINVAL; read_lock(&tasklist_lock); p = find_process_by_pid(pid); @@ -4562,7 +4502,6 @@ asmlinkage long sys_sched_getparam(pid_t */ retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0; -out_nounlock: return retval; out_unlock: @@ -4605,8 +4544,21 @@ long sched_setaffinity(pid_t pid, cpumas cpus_allowed = cpuset_cpus_allowed(p); cpus_and(new_mask, new_mask, cpus_allowed); + again: retval = set_cpus_allowed(p, new_mask); + if (!retval) { + cpus_allowed = cpuset_cpus_allowed(p); + if (!cpus_subset(new_mask, cpus_allowed)) { + /* + * We must have raced with a concurrent cpuset + * update. Just reset the cpus_allowed to the + * cpuset's cpus_allowed + */ + new_mask = cpus_allowed; + goto again; + } + } out_unlock: put_task_struct(p); mutex_unlock(&sched_hotcpu_mutex); @@ -4683,10 +4635,8 @@ long sched_getaffinity(pid_t pid, cpumas out_unlock: read_unlock(&tasklist_lock); mutex_unlock(&sched_hotcpu_mutex); - if (retval) - return retval; - return 0; + return retval; } /** @@ -4717,41 +4667,15 @@ asmlinkage long sys_sched_getaffinity(pi /** * sys_sched_yield - yield the current processor to other threads. * - * This function yields the current CPU by moving the calling thread - * to the expired array. If there are no other threads running on this - * CPU then this function will return. + * This function yields the current CPU to other tasks. If there are no + * other threads running on this CPU then this function will return. */ asmlinkage long sys_sched_yield(void) { struct rq *rq = this_rq_lock(); - struct prio_array *array = current->array, *target = rq->expired; - - schedstat_inc(rq, yld_cnt); - /* - * We implement yielding by moving the task into the expired - * queue. - * - * (special rule: RT tasks will just roundrobin in the active - * array.) - */ - if (rt_task(current)) - target = rq->active; - if (array->nr_active == 1) { - schedstat_inc(rq, yld_act_empty); - if (!rq->expired->nr_active) - schedstat_inc(rq, yld_both_empty); - } else if (!rq->expired->nr_active) - schedstat_inc(rq, yld_exp_empty); - - if (array != target) { - dequeue_task(current, array); - enqueue_task(current, target); - } else - /* - * requeue_task is cheaper so perform that if possible. 
- */ - requeue_task(current, array); + schedstat_inc(rq, yld_count); + current->sched_class->yield_task(rq); /* * Since we are going to call schedule() anyway, there's @@ -4902,6 +4826,7 @@ asmlinkage long sys_sched_get_priority_m break; case SCHED_NORMAL: case SCHED_BATCH: + case SCHED_IDLE: ret = 0; break; } @@ -4926,6 +4851,7 @@ asmlinkage long sys_sched_get_priority_m break; case SCHED_NORMAL: case SCHED_BATCH: + case SCHED_IDLE: ret = 0; } return ret; @@ -4943,11 +4869,12 @@ asmlinkage long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval) { struct task_struct *p; - int retval = -EINVAL; + unsigned int time_slice; + int retval; struct timespec t; if (pid < 0) - goto out_nounlock; + return -EINVAL; retval = -ESRCH; read_lock(&tasklist_lock); @@ -4959,12 +4886,24 @@ long sys_sched_rr_get_interval(pid_t pid if (retval) goto out_unlock; - jiffies_to_timespec(p->policy == SCHED_FIFO ? - 0 : task_timeslice(p), &t); + if (p->policy == SCHED_FIFO) + time_slice = 0; + else if (p->policy == SCHED_RR) + time_slice = DEF_TIMESLICE; + else { + struct sched_entity *se = &p->se; + unsigned long flags; + struct rq *rq; + + rq = task_rq_lock(p, &flags); + time_slice = DEF_TIMESLICE; + task_rq_unlock(rq, &flags); + } read_unlock(&tasklist_lock); + jiffies_to_timespec(time_slice, &t); retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; -out_nounlock: return retval; + out_unlock: read_unlock(&tasklist_lock); return retval; @@ -4978,18 +4917,18 @@ static void show_task(struct task_struct unsigned state; state = p->state ? __ffs(p->state) + 1 : 0; - printk("%-13.13s %c", p->comm, + printk(KERN_INFO "%-13.13s %c", p->comm, state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?'); -#if (BITS_PER_LONG == 32) +#if BITS_PER_LONG == 32 if (state == TASK_RUNNING) - printk(" running "); + printk(KERN_CONT " running "); else - printk(" %08lX ", thread_saved_pc(p)); + printk(KERN_CONT " %08lx ", thread_saved_pc(p)); #else if (state == TASK_RUNNING) - printk(" running task "); + printk(KERN_CONT " running task "); else - printk(" %016lx ", thread_saved_pc(p)); + printk(KERN_CONT " %016lx ", thread_saved_pc(p)); #endif #ifdef CONFIG_DEBUG_STACK_USAGE { @@ -4999,11 +4938,8 @@ static void show_task(struct task_struct free = (unsigned long)n - (unsigned long)end_of_stack(p); } #endif - printk("%5lu %5d %6d", free, p->pid, p->parent->pid); - if (!p->mm) - printk(" (L-TLB)\n"); - else - printk(" (NOTLB)\n"); + printk(KERN_CONT "%5lu %5d %6d\n", free, + task_pid_nr(p), task_pid_nr(p->parent)); if (state != TASK_RUNNING) show_stack(p, NULL); @@ -5013,14 +4949,12 @@ void show_state_filter(unsigned long sta { struct task_struct *g, *p; -#if (BITS_PER_LONG == 32) - printk("\n" - " free sibling\n"); - printk(" task PC stack pid father child younger older\n"); +#if BITS_PER_LONG == 32 + printk(KERN_INFO + " task PC stack pid father\n"); #else - printk("\n" - " free sibling\n"); - printk(" task PC stack pid father child younger older\n"); + printk(KERN_INFO + " task PC stack pid father\n"); #endif read_lock(&tasklist_lock); do_each_thread(g, p) { @@ -5035,6 +4969,9 @@ void show_state_filter(unsigned long sta touch_all_softlockup_watchdogs(); +#ifdef CONFIG_SCHED_DEBUG + sysrq_sched_debug_show(); +#endif read_unlock(&tasklist_lock); /* * Only show locks if all tasks are dumped: @@ -5043,6 +4980,11 @@ void show_state_filter(unsigned long sta debug_show_all_locks(); } +void __cpuinit init_idle_bootup_task(struct task_struct *idle) +{ + idle->sched_class = &idle_sched_class; +} + /** * init_idle - set up an 
idle thread for a given CPU * @idle: task in question @@ -5056,13 +4998,12 @@ void __cpuinit init_idle(struct task_str struct rq *rq = cpu_rq(cpu); unsigned long flags; - idle->timestamp = sched_clock(); - idle->sleep_avg = 0; - idle->array = NULL; + __sched_fork(idle); + idle->se.exec_start = sched_clock(); + idle->prio = idle->normal_prio = MAX_PRIO; - idle->state = TASK_RUNNING; idle->cpus_allowed = cpumask_of_cpu(cpu); - set_task_cpu(idle, cpu); + __set_task_cpu(idle, cpu); spin_lock_irqsave(&rq->lock, flags); rq->curr = rq->idle = idle; @@ -5077,6 +5018,10 @@ void __cpuinit init_idle(struct task_str #else task_thread_info(idle)->preempt_count = 0; #endif + /* + * The idle tasks have their own, simple scheduling class: + */ + idle->sched_class = &idle_sched_class; } /* @@ -5088,6 +5033,32 @@ void __cpuinit init_idle(struct task_str */ cpumask_t nohz_cpu_mask = CPU_MASK_NONE; +/* + * Increase the granularity value when there are more CPUs, + * because with more CPUs the 'effective latency' as visible + * to users decreases. But the relationship is not linear, + * so pick a second-best guess by going with the log2 of the + * number of CPUs. + * + * This idea comes from the SD scheduler of Con Kolivas: + */ +static inline void sched_init_granularity(void) +{ + unsigned int factor = 1 + ilog2(num_online_cpus()); + const unsigned long limit = 200000000; + + sysctl_sched_min_granularity *= factor; + if (sysctl_sched_min_granularity > limit) + sysctl_sched_min_granularity = limit; + + sysctl_sched_latency *= factor; + if (sysctl_sched_latency > limit) + sysctl_sched_latency = limit; + + sysctl_sched_wakeup_granularity *= factor; + sysctl_sched_batch_wakeup_granularity *= factor; +} + #ifdef CONFIG_SMP /* * This is how migration works: @@ -5161,7 +5132,7 @@ EXPORT_SYMBOL_GPL(set_cpus_allowed); static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) { struct rq *rq_dest, *rq_src; - int ret = 0; + int ret = 0, on_rq; if (unlikely(cpu_is_offline(dest_cpu))) return ret; @@ -5177,20 +5148,14 @@ static int __migrate_task(struct task_st if (!cpu_isset(dest_cpu, p->cpus_allowed)) goto out; + on_rq = p->se.on_rq; + if (on_rq) + deactivate_task(rq_src, p, 0); + set_task_cpu(p, dest_cpu); - if (p->array) { - /* - * Sync timestamp with rq_dest's before activating. - * The same thing could be achieved by doing this step - * afterwards, and pretending it was a local activate. - * This way is cleaner and logically correct. - */ - p->timestamp = p->timestamp - rq_src->most_recent_timestamp - + rq_dest->most_recent_timestamp; - deactivate_task(p, rq_src); - __activate_task(p, rq_dest); - if (TASK_PREEMPTS_CURR(p, rq_dest)) - resched_task(rq_dest->curr); + if (on_rq) { + activate_task(rq_dest, p, 0); + check_preempt_curr(rq_dest, p); } ret = 1; out: @@ -5262,8 +5227,19 @@ wait_to_die: } #ifdef CONFIG_HOTPLUG_CPU + +static int __migrate_task_irq(struct task_struct *p, int src_cpu, int dest_cpu) +{ + int ret; + + local_irq_disable(); + ret = __migrate_task(p, src_cpu, dest_cpu); + local_irq_enable(); + return ret; +} + /* - * Figure out where task on dead CPU should go, use force if neccessary. + * Figure out where task on dead CPU should go, use force if necessary. * NOTE: interrupts should be disabled by the caller */ static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) @@ -5273,35 +5249,42 @@ static void move_task_off_dead_cpu(int d struct rq *rq; int dest_cpu; -restart: - /* On same node? 
*/ - mask = node_to_cpumask(cpu_to_node(dead_cpu)); - cpus_and(mask, mask, p->cpus_allowed); - dest_cpu = any_online_cpu(mask); - - /* On any allowed CPU? */ - if (dest_cpu == NR_CPUS) - dest_cpu = any_online_cpu(p->cpus_allowed); - - /* No more Mr. Nice Guy. */ - if (dest_cpu == NR_CPUS) { - rq = task_rq_lock(p, &flags); - cpus_setall(p->cpus_allowed); - dest_cpu = any_online_cpu(p->cpus_allowed); - task_rq_unlock(rq, &flags); + do { + /* On same node? */ + mask = node_to_cpumask(cpu_to_node(dead_cpu)); + cpus_and(mask, mask, p->cpus_allowed); + dest_cpu = any_online_cpu(mask); + + /* On any allowed CPU? */ + if (dest_cpu == NR_CPUS) + dest_cpu = any_online_cpu(p->cpus_allowed); + + /* No more Mr. Nice Guy. */ + if (dest_cpu == NR_CPUS) { + cpumask_t cpus_allowed = cpuset_cpus_allowed_locked(p); + /* + * Try to stay on the same cpuset, where the + * current cpuset may be a subset of all cpus. + * The cpuset_cpus_allowed_locked() variant of + * cpuset_cpus_allowed() will not block. It must be + * called within calls to cpuset_lock/cpuset_unlock. + */ + rq = task_rq_lock(p, &flags); + p->cpus_allowed = cpus_allowed; + dest_cpu = any_online_cpu(p->cpus_allowed); + task_rq_unlock(rq, &flags); - /* - * Don't tell them about moving exiting tasks or - * kernel threads (both mm NULL), since they never - * leave kernel. - */ - if (p->mm && printk_ratelimit()) - printk(KERN_INFO "process %d (%s) no " - "longer affine to cpu%d\n", - p->pid, p->comm, dead_cpu); - } - if (!__migrate_task(p, dead_cpu, dest_cpu)) - goto restart; + /* + * Don't tell them about moving exiting tasks or + * kernel threads (both mm NULL), since they never + * leave kernel. + */ + if (p->mm && printk_ratelimit()) + printk(KERN_INFO "process %d (%s) no " + "longer affine to cpu%d\n", + task_pid_nr(p), p->comm, dead_cpu); + } + } while (!__migrate_task_irq(p, dead_cpu, dest_cpu)); } /* @@ -5329,7 +5312,7 @@ static void migrate_live_tasks(int src_c { struct task_struct *p, *t; - write_lock_irq(&tasklist_lock); + read_lock(&tasklist_lock); do_each_thread(t, p) { if (p == current) @@ -5339,12 +5322,13 @@ static void migrate_live_tasks(int src_c move_task_off_dead_cpu(src_cpu, p); } while_each_thread(t, p); - write_unlock_irq(&tasklist_lock); + read_unlock(&tasklist_lock); } -/* Schedules idle task to be the next runnable task on current CPU. - * It does so by boosting its priority to highest possible and adding it to - * the _front_ of the runqueue. Used by CPU offline code. +/* + * Schedules idle task to be the next runnable task on current CPU. + * It does so by boosting its priority to highest possible. + * Used by CPU offline code. */ void sched_idle_next(void) { @@ -5362,10 +5346,10 @@ void sched_idle_next(void) */ spin_lock_irqsave(&rq->lock, flags); - __setscheduler(p, SCHED_FIFO, MAX_RT_PRIO-1); + __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1); - /* Add idle task to the _front_ of its priority queue: */ - __activate_idle_task(p, rq); + update_rq_clock(rq); + activate_task(rq, p, 0); spin_unlock_irqrestore(&rq->lock, flags); } @@ -5391,7 +5375,7 @@ static void migrate_dead(unsigned int de struct rq *rq = cpu_rq(dead_cpu); /* Must be exiting, otherwise would be on tasklist. */ - BUG_ON(p->exit_state != EXIT_ZOMBIE && p->exit_state != EXIT_DEAD); + BUG_ON(!p->exit_state); /* Cannot have done final schedule yet: would have vanished. */ BUG_ON(p->state == TASK_DEAD); @@ -5402,11 +5386,10 @@ static void migrate_dead(unsigned int de * Drop lock around migration; if someone else moves it, * that's OK. 
No task can be added to this CPU, so iteration is * fine. - * NOTE: interrupts should be left disabled --dev@ */ - spin_unlock(&rq->lock); + spin_unlock_irq(&rq->lock); move_task_off_dead_cpu(dead_cpu, p); - spin_lock(&rq->lock); + spin_lock_irq(&rq->lock); put_task_struct(p); } @@ -5415,20 +5398,186 @@ static void migrate_dead(unsigned int de static void migrate_dead_tasks(unsigned int dead_cpu) { struct rq *rq = cpu_rq(dead_cpu); - unsigned int arr, i; + struct task_struct *next; + + for ( ; ; ) { + if (!rq->nr_running) + break; + update_rq_clock(rq); + next = pick_next_task(rq, rq->curr); + if (!next) + break; + migrate_dead(dead_cpu, next); - for (arr = 0; arr < 2; arr++) { - for (i = 0; i < MAX_PRIO; i++) { - struct list_head *list = &rq->arrays[arr].queue[i]; - - while (!list_empty(list)) - migrate_dead(dead_cpu, list_entry(list->next, - struct task_struct, run_list)); - } } } #endif /* CONFIG_HOTPLUG_CPU */ +#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) + +static struct ctl_table sd_ctl_dir[] = { + { + .procname = "sched_domain", + .mode = 0555, + }, + {0, }, +}; + +static struct ctl_table sd_ctl_root[] = { + { + .ctl_name = CTL_KERN, + .procname = "kernel", + .mode = 0555, + .child = sd_ctl_dir, + }, + {0, }, +}; + +static struct ctl_table *sd_alloc_ctl_entry(int n) +{ + struct ctl_table *entry = + kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL); + + return entry; +} + +static void sd_free_ctl_entry(struct ctl_table **tablep) +{ + struct ctl_table *entry; + + /* + * In the intermediate directories, both the child directory and + * procname are dynamically allocated and could fail but the mode + * will always be set. In the lowest directory the names are + * static strings and all have proc handlers. + */ + for (entry = *tablep; entry->mode; entry++) { + if (entry->child) + sd_free_ctl_entry(&entry->child); + if (entry->proc_handler == NULL) + kfree(entry->procname); + } + + kfree(*tablep); + *tablep = NULL; +} + +static void +set_table_entry(struct ctl_table *entry, + const char *procname, void *data, int maxlen, + mode_t mode, proc_handler *proc_handler) +{ + entry->procname = procname; + entry->data = data; + entry->maxlen = maxlen; + entry->mode = mode; + entry->proc_handler = proc_handler; +} + +static struct ctl_table * +sd_alloc_ctl_domain_table(struct sched_domain *sd) +{ + struct ctl_table *table = sd_alloc_ctl_entry(12); + + if (table == NULL) + return NULL; + + set_table_entry(&table[0], "min_interval", &sd->min_interval, + sizeof(long), 0644, proc_doulongvec_minmax); + set_table_entry(&table[1], "max_interval", &sd->max_interval, + sizeof(long), 0644, proc_doulongvec_minmax); + set_table_entry(&table[2], "busy_idx", &sd->busy_idx, + sizeof(int), 0644, proc_dointvec_minmax); + set_table_entry(&table[3], "idle_idx", &sd->idle_idx, + sizeof(int), 0644, proc_dointvec_minmax); + set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx, + sizeof(int), 0644, proc_dointvec_minmax); + set_table_entry(&table[5], "wake_idx", &sd->wake_idx, + sizeof(int), 0644, proc_dointvec_minmax); + set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx, + sizeof(int), 0644, proc_dointvec_minmax); + set_table_entry(&table[7], "busy_factor", &sd->busy_factor, + sizeof(int), 0644, proc_dointvec_minmax); + set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct, + sizeof(int), 0644, proc_dointvec_minmax); + set_table_entry(&table[9], "cache_nice_tries", + &sd->cache_nice_tries, + sizeof(int), 0644, proc_dointvec_minmax); + set_table_entry(&table[10], "flags", &sd->flags, 
+ sizeof(int), 0644, proc_dointvec_minmax); + /* &table[11] is terminator */ + + return table; +} + +static ctl_table * sd_alloc_ctl_cpu_table(int cpu) +{ + struct ctl_table *entry, *table; + struct sched_domain *sd; + int domain_num = 0, i; + char buf[32]; + + for_each_domain(cpu, sd) + domain_num++; + entry = table = sd_alloc_ctl_entry(domain_num + 1); + if (table == NULL) + return NULL; + + i = 0; + for_each_domain(cpu, sd) { + snprintf(buf, 32, "domain%d", i); + entry->procname = kstrdup(buf, GFP_KERNEL); + entry->mode = 0555; + entry->child = sd_alloc_ctl_domain_table(sd); + entry++; + i++; + } + return table; +} + +static struct ctl_table_header *sd_sysctl_header; +static void register_sched_domain_sysctl(void) +{ + int i, cpu_num = num_online_cpus(); + struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1); + char buf[32]; + + WARN_ON(sd_ctl_dir[0].child); + sd_ctl_dir[0].child = entry; + + if (entry == NULL) + return; + + for_each_online_cpu(i) { + snprintf(buf, 32, "cpu%d", i); + entry->procname = kstrdup(buf, GFP_KERNEL); + entry->mode = 0555; + entry->child = sd_alloc_ctl_cpu_table(i); + entry++; + } + + WARN_ON(sd_sysctl_header); + sd_sysctl_header = register_sysctl_table(sd_ctl_root); +} + +/* may be called multiple times per register */ +static void unregister_sched_domain_sysctl(void) +{ + if (sd_sysctl_header) + unregister_sysctl_table(sd_sysctl_header); + sd_sysctl_header = NULL; + if (sd_ctl_dir[0].child) + sd_free_ctl_entry(&sd_ctl_dir[0].child); +} +#else +static void register_sched_domain_sysctl(void) +{ +} +static void unregister_sched_domain_sysctl(void) +{ +} +#endif + /* * migration_call - callback that gets triggered when a CPU is added. * Here we can start up the necessary migration thread for the new CPU. @@ -5448,21 +5597,21 @@ migration_call(struct notifier_block *nf case CPU_UP_PREPARE: case CPU_UP_PREPARE_FROZEN: - p = kthread_create(migration_thread, hcpu, "migration/%d",cpu); + p = kthread_create(migration_thread, hcpu, "migration/%d", cpu); if (IS_ERR(p)) return NOTIFY_BAD; p->flags |= PF_NOFREEZE; kthread_bind(p, cpu); /* Must be high prio: stop_machine expects to yield to it. */ rq = task_rq_lock(p, &flags); - __setscheduler(p, SCHED_FIFO, MAX_RT_PRIO-1); + __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1); task_rq_unlock(rq, &flags); cpu_rq(cpu)->migration_thread = p; break; case CPU_ONLINE: case CPU_ONLINE_FROZEN: - /* Strictly unneccessary, as first user will wake it. */ + /* Strictly unnecessary, as first user will wake it. 
*/ wake_up_process(cpu_rq(cpu)->migration_thread); break; @@ -5480,17 +5629,21 @@ migration_call(struct notifier_block *nf case CPU_DEAD: case CPU_DEAD_FROZEN: + cpuset_lock(); /* around calls to cpuset_cpus_allowed_lock() */ migrate_live_tasks(cpu); rq = cpu_rq(cpu); kthread_stop(rq->migration_thread); rq->migration_thread = NULL; /* Idle task back to normal (off runqueue, low prio) */ - rq = task_rq_lock(rq->idle, &flags); - deactivate_task(rq->idle, rq); + spin_lock_irq(&rq->lock); + update_rq_clock(rq); + deactivate_task(rq, rq->idle, 0); rq->idle->static_prio = MAX_PRIO; - __setscheduler(rq->idle, SCHED_NORMAL, 0); + __setscheduler(rq, rq->idle, SCHED_NORMAL, 0); + rq->idle->sched_class = &idle_sched_class; migrate_dead_tasks(cpu); - task_rq_unlock(rq, &flags); + spin_unlock_irq(&rq->lock); + cpuset_unlock(); migrate_nr_uninterruptible(rq); BUG_ON(rq->nr_running != 0); @@ -5524,7 +5677,7 @@ static struct notifier_block __cpuinitda .priority = 10 }; -int __init migration_init(void) +void __init migration_init(void) { void *cpu = (void *)(long)smp_processor_id(); int err; @@ -5534,8 +5687,6 @@ int __init migration_init(void) BUG_ON(err == NOTIFY_BAD); migration_call(&migration_notifier, CPU_ONLINE, cpu); register_cpu_notifier(&migration_notifier); - - return 0; } #endif @@ -5545,100 +5696,102 @@ int __init migration_init(void) int nr_cpu_ids __read_mostly = NR_CPUS; EXPORT_SYMBOL(nr_cpu_ids); -#undef SCHED_DOMAIN_DEBUG -#ifdef SCHED_DOMAIN_DEBUG -static void sched_domain_debug(struct sched_domain *sd, int cpu) +#ifdef CONFIG_SCHED_DEBUG + +static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level) { - int level = 0; + struct sched_group *group = sd->groups; + cpumask_t groupmask; + char str[NR_CPUS]; - if (!sd) { - printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu); - return; + cpumask_scnprintf(str, NR_CPUS, sd->span); + cpus_clear(groupmask); + + printk(KERN_DEBUG "%*s domain %d: ", level, "", level); + + if (!(sd->flags & SD_LOAD_BALANCE)) { + printk("does not load-balance\n"); + if (sd->parent) + printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain" + " has parent"); + return -1; } - printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu); + printk(KERN_CONT "span %s\n", str); + if (!cpu_isset(cpu, sd->span)) { + printk(KERN_ERR "ERROR: domain->span does not contain " + "CPU%d\n", cpu); + } + if (!cpu_isset(cpu, group->cpumask)) { + printk(KERN_ERR "ERROR: domain->groups does not contain" + " CPU%d\n", cpu); + } + + printk(KERN_DEBUG "%*s groups:", level + 1, ""); do { - int i; - char str[NR_CPUS]; - struct sched_group *group = sd->groups; - cpumask_t groupmask; - - cpumask_scnprintf(str, NR_CPUS, sd->span); - cpus_clear(groupmask); - - printk(KERN_DEBUG); - for (i = 0; i < level + 1; i++) - printk(" "); - printk("domain %d: ", level); - - if (!(sd->flags & SD_LOAD_BALANCE)) { - printk("does not load-balance\n"); - if (sd->parent) - printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain" - " has parent"); + if (!group) { + printk("\n"); + printk(KERN_ERR "ERROR: group is NULL\n"); break; } - printk("span %s\n", str); + if (!group->__cpu_power) { + printk(KERN_CONT "\n"); + printk(KERN_ERR "ERROR: domain->cpu_power not " + "set\n"); + break; + } - if (!cpu_isset(cpu, sd->span)) - printk(KERN_ERR "ERROR: domain->span does not contain " - "CPU%d\n", cpu); - if (!cpu_isset(cpu, group->cpumask)) - printk(KERN_ERR "ERROR: domain->groups does not contain" - " CPU%d\n", cpu); - - printk(KERN_DEBUG); - for (i = 0; i < level + 2; i++) - printk(" "); - printk("groups:"); 
- do { - if (!group) { - printk("\n"); - printk(KERN_ERR "ERROR: group is NULL\n"); - break; - } + if (!cpus_weight(group->cpumask)) { + printk(KERN_CONT "\n"); + printk(KERN_ERR "ERROR: empty group\n"); + break; + } - if (!group->__cpu_power) { - printk("\n"); - printk(KERN_ERR "ERROR: domain->cpu_power not " - "set\n"); - } + if (cpus_intersects(groupmask, group->cpumask)) { + printk(KERN_CONT "\n"); + printk(KERN_ERR "ERROR: repeated CPUs\n"); + break; + } - if (!cpus_weight(group->cpumask)) { - printk("\n"); - printk(KERN_ERR "ERROR: empty group\n"); - } + cpus_or(groupmask, groupmask, group->cpumask); - if (cpus_intersects(groupmask, group->cpumask)) { - printk("\n"); - printk(KERN_ERR "ERROR: repeated CPUs\n"); - } + cpumask_scnprintf(str, NR_CPUS, group->cpumask); + printk(KERN_CONT " %s", str); + + group = group->next; + } while (group != sd->groups); + printk(KERN_CONT "\n"); + + if (!cpus_equal(sd->span, groupmask)) + printk(KERN_ERR "ERROR: groups don't span domain->span\n"); + + if (sd->parent && !cpus_subset(groupmask, sd->parent->span)) + printk(KERN_ERR "ERROR: parent span is not a superset " + "of domain->span\n"); + return 0; +} - cpus_or(groupmask, groupmask, group->cpumask); +static void sched_domain_debug(struct sched_domain *sd, int cpu) +{ + int level = 0; - cpumask_scnprintf(str, NR_CPUS, group->cpumask); - printk(" %s", str); + if (!sd) { + printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu); + return; + } - group = group->next; - } while (group != sd->groups); - printk("\n"); - - if (!cpus_equal(sd->span, groupmask)) - printk(KERN_ERR "ERROR: groups don't span " - "domain->span\n"); + printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu); + for (;;) { + if (sched_domain_debug_one(sd, cpu, level)) + break; level++; sd = sd->parent; if (!sd) - continue; - - if (!cpus_subset(groupmask, sd->span)) - printk(KERN_ERR "ERROR: parent span is not a superset " - "of domain->span\n"); - - } while (sd); + break; + } } #else # define sched_domain_debug(sd, cpu) do { } while (0) @@ -5747,7 +5900,7 @@ static int __init isolated_cpu_setup(cha return 1; } -__setup ("isolcpus=", isolated_cpu_setup); +__setup("isolcpus=", isolated_cpu_setup); /* * init_sched_build_groups takes the cpumask we wish to span, and a pointer @@ -5797,483 +5950,6 @@ init_sched_build_groups(cpumask_t span, #define SD_NODES_PER_DOMAIN 16 -/* - * Self-tuning task migration cost measurement between source and target CPUs. - * - * This is done by measuring the cost of manipulating buffers of varying - * sizes. For a given buffer-size here are the steps that are taken: - * - * 1) the source CPU reads+dirties a shared buffer - * 2) the target CPU reads+dirties the same shared buffer - * - * We measure how long they take, in the following 4 scenarios: - * - * - source: CPU1, target: CPU2 | cost1 - * - source: CPU2, target: CPU1 | cost2 - * - source: CPU1, target: CPU1 | cost3 - * - source: CPU2, target: CPU2 | cost4 - * - * We then calculate the cost3+cost4-cost1-cost2 difference - this is - * the cost of migration. - * - * We then start off from a small buffer-size and iterate up to larger - * buffer sizes, in 5% steps - measuring each buffer-size separately, and - * doing a maximum search for the cost. (The maximum cost for a migration - * normally occurs when the working set size is around the effective cache - * size.) 
- */ -#define SEARCH_SCOPE 2 -#define MIN_CACHE_SIZE (64*1024U) -#define DEFAULT_CACHE_SIZE (5*1024*1024U) -#define ITERATIONS 1 -#define SIZE_THRESH 130 -#define COST_THRESH 130 - -/* - * The migration cost is a function of 'domain distance'. Domain - * distance is the number of steps a CPU has to iterate down its - * domain tree to share a domain with the other CPU. The farther - * two CPUs are from each other, the larger the distance gets. - * - * Note that we use the distance only to cache measurement results, - * the distance value is not used numerically otherwise. When two - * CPUs have the same distance it is assumed that the migration - * cost is the same. (this is a simplification but quite practical) - */ -#define MAX_DOMAIN_DISTANCE 32 - -static unsigned long long migration_cost[MAX_DOMAIN_DISTANCE] = - { [ 0 ... MAX_DOMAIN_DISTANCE-1 ] = -/* - * Architectures may override the migration cost and thus avoid - * boot-time calibration. Unit is nanoseconds. Mostly useful for - * virtualized hardware: - */ -#ifdef CONFIG_DEFAULT_MIGRATION_COST - CONFIG_DEFAULT_MIGRATION_COST -#else - -1LL -#endif -}; - -/* - * Allow override of migration cost - in units of microseconds. - * E.g. migration_cost=1000,2000,3000 will set up a level-1 cost - * of 1 msec, level-2 cost of 2 msecs and level3 cost of 3 msecs: - */ -static int __init migration_cost_setup(char *str) -{ - int ints[MAX_DOMAIN_DISTANCE+1], i; - - str = get_options(str, ARRAY_SIZE(ints), ints); - - printk("#ints: %d\n", ints[0]); - for (i = 1; i <= ints[0]; i++) { - migration_cost[i-1] = (unsigned long long)ints[i]*1000; - printk("migration_cost[%d]: %Ld\n", i-1, migration_cost[i-1]); - } - return 1; -} - -__setup ("migration_cost=", migration_cost_setup); - -/* - * Global multiplier (divisor) for migration-cutoff values, - * in percentiles. E.g. use a value of 150 to get 1.5 times - * longer cache-hot cutoff times. - * - * (We scale it from 100 to 128 to long long handling easier.) - */ - -#define MIGRATION_FACTOR_SCALE 128 - -static unsigned int migration_factor = MIGRATION_FACTOR_SCALE; - -static int __init setup_migration_factor(char *str) -{ - get_option(&str, &migration_factor); - migration_factor = migration_factor * MIGRATION_FACTOR_SCALE / 100; - return 1; -} - -__setup("migration_factor=", setup_migration_factor); - -/* - * Estimated distance of two CPUs, measured via the number of domains - * we have to pass for the two CPUs to be in the same span: - */ -static unsigned long domain_distance(int cpu1, int cpu2) -{ - unsigned long distance = 0; - struct sched_domain *sd; - - for_each_domain(cpu1, sd) { - WARN_ON(!cpu_isset(cpu1, sd->span)); - if (cpu_isset(cpu2, sd->span)) - return distance; - distance++; - } - if (distance >= MAX_DOMAIN_DISTANCE) { - WARN_ON(1); - distance = MAX_DOMAIN_DISTANCE-1; - } - - return distance; -} - -static unsigned int migration_debug; - -static int __init setup_migration_debug(char *str) -{ - get_option(&str, &migration_debug); - return 1; -} - -__setup("migration_debug=", setup_migration_debug); - -/* - * Maximum cache-size that the scheduler should try to measure. - * Architectures with larger caches should tune this up during - * bootup. Gets used in the domain-setup code (i.e. during SMP - * bootup). - */ -unsigned int max_cache_size; - -static int __init setup_max_cache_size(char *str) -{ - get_option(&str, &max_cache_size); - return 1; -} - -__setup("max_cache_size=", setup_max_cache_size); - -/* - * Dirty a big buffer in a hard-to-predict (for the L2 cache) way. 
This - * is the operation that is timed, so we try to generate unpredictable - * cachemisses that still end up filling the L2 cache: - */ -static void touch_cache(void *__cache, unsigned long __size) -{ - unsigned long size = __size / sizeof(long); - unsigned long chunk1 = size / 3; - unsigned long chunk2 = 2 * size / 3; - unsigned long *cache = __cache; - int i; - - for (i = 0; i < size/6; i += 8) { - switch (i % 6) { - case 0: cache[i]++; - case 1: cache[size-1-i]++; - case 2: cache[chunk1-i]++; - case 3: cache[chunk1+i]++; - case 4: cache[chunk2-i]++; - case 5: cache[chunk2+i]++; - } - } -} - -/* - * Measure the cache-cost of one task migration. Returns in units of nsec. - */ -static unsigned long long -measure_one(void *cache, unsigned long size, int source, int target) -{ - cpumask_t mask, saved_mask; - unsigned long long t0, t1, t2, t3, cost; - - saved_mask = current->cpus_allowed; - - /* - * Flush source caches to RAM and invalidate them: - */ - sched_cacheflush(); - - /* - * Migrate to the source CPU: - */ - mask = cpumask_of_cpu(source); - set_cpus_allowed(current, mask); - WARN_ON(smp_processor_id() != source); - - /* - * Dirty the working set: - */ - t0 = sched_clock(); - touch_cache(cache, size); - t1 = sched_clock(); - - /* - * Migrate to the target CPU, dirty the L2 cache and access - * the shared buffer. (which represents the working set - * of a migrated task.) - */ - mask = cpumask_of_cpu(target); - set_cpus_allowed(current, mask); - WARN_ON(smp_processor_id() != target); - - t2 = sched_clock(); - touch_cache(cache, size); - t3 = sched_clock(); - - cost = t1-t0 + t3-t2; - - if (migration_debug >= 2) - printk("[%d->%d]: %8Ld %8Ld %8Ld => %10Ld.\n", - source, target, t1-t0, t1-t0, t3-t2, cost); - /* - * Flush target caches to RAM and invalidate them: - */ - sched_cacheflush(); - - set_cpus_allowed(current, saved_mask); - - return cost; -} - -/* - * Measure a series of task migrations and return the average - * result. Since this code runs early during bootup the system - * is 'undisturbed' and the average latency makes sense. - * - * The algorithm in essence auto-detects the relevant cache-size, - * so it will properly detect different cachesizes for different - * cache-hierarchies, depending on how the CPUs are connected. - * - * Architectures can prime the upper limit of the search range via - * max_cache_size, otherwise the search range defaults to 20MB...64K. - */ -static unsigned long long -measure_cost(int cpu1, int cpu2, void *cache, unsigned int size) -{ - unsigned long long cost1, cost2; - int i; - - /* - * Measure the migration cost of 'size' bytes, over an - * average of 10 runs: - * - * (We perturb the cache size by a small (0..4k) - * value to compensate size/alignment related artifacts. - * We also subtract the cost of the operation done on - * the same CPU.) 
- */ - cost1 = 0; - - /* - * dry run, to make sure we start off cache-cold on cpu1, - * and to get any vmalloc pagefaults in advance: - */ - measure_one(cache, size, cpu1, cpu2); - for (i = 0; i < ITERATIONS; i++) - cost1 += measure_one(cache, size - i * 1024, cpu1, cpu2); - - measure_one(cache, size, cpu2, cpu1); - for (i = 0; i < ITERATIONS; i++) - cost1 += measure_one(cache, size - i * 1024, cpu2, cpu1); - - /* - * (We measure the non-migrating [cached] cost on both - * cpu1 and cpu2, to handle CPUs with different speeds) - */ - cost2 = 0; - - measure_one(cache, size, cpu1, cpu1); - for (i = 0; i < ITERATIONS; i++) - cost2 += measure_one(cache, size - i * 1024, cpu1, cpu1); - - measure_one(cache, size, cpu2, cpu2); - for (i = 0; i < ITERATIONS; i++) - cost2 += measure_one(cache, size - i * 1024, cpu2, cpu2); - - /* - * Get the per-iteration migration cost: - */ - do_div(cost1, 2 * ITERATIONS); - do_div(cost2, 2 * ITERATIONS); - - return cost1 - cost2; -} - -static unsigned long long measure_migration_cost(int cpu1, int cpu2) -{ - unsigned long long max_cost = 0, fluct = 0, avg_fluct = 0; - unsigned int max_size, size, size_found = 0; - long long cost = 0, prev_cost; - void *cache; - - /* - * Search from max_cache_size*5 down to 64K - the real relevant - * cachesize has to lie somewhere inbetween. - */ - if (max_cache_size) { - max_size = max(max_cache_size * SEARCH_SCOPE, MIN_CACHE_SIZE); - size = max(max_cache_size / SEARCH_SCOPE, MIN_CACHE_SIZE); - } else { - /* - * Since we have no estimation about the relevant - * search range - */ - max_size = DEFAULT_CACHE_SIZE * SEARCH_SCOPE; - size = MIN_CACHE_SIZE; - } - - if (!cpu_online(cpu1) || !cpu_online(cpu2)) { - printk("cpu %d and %d not both online!\n", cpu1, cpu2); - return 0; - } - - /* - * Allocate the working set: - */ - cache = vmalloc(max_size); - if (!cache) { - printk("could not vmalloc %d bytes for cache!\n", 2 * max_size); - return 1000000; /* return 1 msec on very small boxen */ - } - - while (size <= max_size) { - prev_cost = cost; - cost = measure_cost(cpu1, cpu2, cache, size); - - /* - * Update the max: - */ - if (cost > 0) { - if (max_cost < cost) { - max_cost = cost; - size_found = size; - } - } - /* - * Calculate average fluctuation, we use this to prevent - * noise from triggering an early break out of the loop: - */ - fluct = abs(cost - prev_cost); - avg_fluct = (avg_fluct + fluct)/2; - - if (migration_debug) - printk("-> [%d][%d][%7d] %3ld.%ld [%3ld.%ld] (%ld): " - "(%8Ld %8Ld)\n", - cpu1, cpu2, size, - (long)cost / 1000000, - ((long)cost / 100000) % 10, - (long)max_cost / 1000000, - ((long)max_cost / 100000) % 10, - domain_distance(cpu1, cpu2), - cost, avg_fluct); - - /* - * If we iterated at least 20% past the previous maximum, - * and the cost has dropped by more than 20% already, - * (taking fluctuations into account) then we assume to - * have found the maximum and break out of the loop early: - */ - if (size_found && (size*100 > size_found*SIZE_THRESH)) - if (cost+avg_fluct <= 0 || - max_cost*100 > (cost+avg_fluct)*COST_THRESH) { - - if (migration_debug) - printk("-> found max.\n"); - break; - } - /* - * Increase the cachesize in 10% steps: - */ - size = size * 10 / 9; - } - - if (migration_debug) - printk("[%d][%d] working set size found: %d, cost: %Ld\n", - cpu1, cpu2, size_found, max_cost); - - vfree(cache); - - /* - * A task is considered 'cache cold' if at least 2 times - * the worst-case cost of migration has passed. 
- * - * (this limit is only listened to if the load-balancing - * situation is 'nice' - if there is a large imbalance we - * ignore it for the sake of CPU utilization and - * processing fairness.) - */ - return 2 * max_cost * migration_factor / MIGRATION_FACTOR_SCALE; -} - -static void calibrate_migration_costs(const cpumask_t *cpu_map) -{ - int cpu1 = -1, cpu2 = -1, cpu, orig_cpu = raw_smp_processor_id(); - unsigned long j0, j1, distance, max_distance = 0; - struct sched_domain *sd; - - j0 = jiffies; - - /* - * First pass - calculate the cacheflush times: - */ - for_each_cpu_mask(cpu1, *cpu_map) { - for_each_cpu_mask(cpu2, *cpu_map) { - if (cpu1 == cpu2) - continue; - distance = domain_distance(cpu1, cpu2); - max_distance = max(max_distance, distance); - /* - * No result cached yet? - */ - if (migration_cost[distance] == -1LL) - migration_cost[distance] = - measure_migration_cost(cpu1, cpu2); - } - } - /* - * Second pass - update the sched domain hierarchy with - * the new cache-hot-time estimations: - */ - for_each_cpu_mask(cpu, *cpu_map) { - distance = 0; - for_each_domain(cpu, sd) { - sd->cache_hot_time = migration_cost[distance]; - distance++; - } - } - /* - * Print the matrix: - */ - if (migration_debug) - printk("migration: max_cache_size: %d, cpu: %d MHz:\n", - max_cache_size, -#ifdef CONFIG_X86 - cpu_khz/1000 -#else - -1 -#endif - ); - if (system_state == SYSTEM_BOOTING && num_online_cpus() > 1) { - printk("migration_cost="); - for (distance = 0; distance <= max_distance; distance++) { - if (distance) - printk(","); - printk("%ld", (long)migration_cost[distance] / 1000); - } - printk("\n"); - } - j1 = jiffies; - if (migration_debug) - printk("migration: %ld seconds\n", (j1-j0) / HZ); - - /* - * Move back to the original CPU. NUMA-Q gets confused - * if we migrate to another quad during bootup. - */ - if (raw_smp_processor_id() != orig_cpu) { - cpumask_t mask = cpumask_of_cpu(orig_cpu), - saved_mask = current->cpus_allowed; - - set_cpus_allowed(current, mask); - set_cpus_allowed(current, saved_mask); - } -} - #ifdef CONFIG_NUMA /** @@ -6380,7 +6056,7 @@ static int cpu_to_core_group(int cpu, co struct sched_group **sg) { int group; - cpumask_t mask = cpu_sibling_map[cpu]; + cpumask_t mask = cpu_sibling_map(cpu); cpus_and(mask, mask, *cpu_map); group = first_cpu(mask); if (sg) @@ -6409,7 +6085,7 @@ static int cpu_to_phys_group(int cpu, co cpus_and(mask, mask, *cpu_map); group = first_cpu(mask); #elif defined(CONFIG_SCHED_SMT) - cpumask_t mask = cpu_sibling_map[cpu]; + cpumask_t mask = cpu_sibling_map(cpu); cpus_and(mask, mask, *cpu_map); group = first_cpu(mask); #else @@ -6453,24 +6129,23 @@ static void init_numa_sched_groups_power if (!sg) return; -next_sg: - for_each_cpu_mask(j, sg->cpumask) { - struct sched_domain *sd; + do { + for_each_cpu_mask(j, sg->cpumask) { + struct sched_domain *sd; - sd = &per_cpu(phys_domains, j); - if (j != first_cpu(sd->groups->cpumask)) { - /* - * Only add "power" once for each - * physical package. - */ - continue; - } + sd = &per_cpu(phys_domains, j); + if (j != first_cpu(sd->groups->cpumask)) { + /* + * Only add "power" once for each + * physical package. 
+ */ + continue; + } - sg_inc_cpu_power(sg, sd->groups->__cpu_power); - } - sg = sg->next; - if (sg != group_head) - goto next_sg; + sg_inc_cpu_power(sg, sd->groups->__cpu_power); + } + sg = sg->next; + } while (sg != group_head); } #endif @@ -6574,7 +6249,6 @@ static void init_sched_groups_power(int static int build_sched_domains(const cpumask_t *cpu_map) { int i; - struct sched_domain *sd; #ifdef CONFIG_NUMA struct sched_group **sched_group_nodes = NULL; int sd_allnodes = 0; @@ -6582,7 +6256,7 @@ static int build_sched_domains(const cpu /* * Allocate the per-node list of sched groups */ - sched_group_nodes = kzalloc(sizeof(struct sched_group*)*MAX_NUMNODES, + sched_group_nodes = kcalloc(MAX_NUMNODES, sizeof(struct sched_group *), GFP_KERNEL); if (!sched_group_nodes) { printk(KERN_WARNING "Can not alloc sched group node list\n"); @@ -6601,8 +6275,8 @@ static int build_sched_domains(const cpu cpus_and(nodemask, nodemask, *cpu_map); #ifdef CONFIG_NUMA - if (cpus_weight(*cpu_map) - > SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) { + if (cpus_weight(*cpu_map) > + SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) { sd = &per_cpu(allnodes_domains, i); *sd = SD_ALLNODES_INIT; sd->span = *cpu_map; @@ -6645,7 +6319,7 @@ static int build_sched_domains(const cpu p = sd; sd = &per_cpu(cpu_domains, i); *sd = SD_SIBLING_INIT; - sd->span = cpu_sibling_map[i]; + sd->span = cpu_sibling_map(i); cpus_and(sd->span, sd->span, *cpu_map); sd->parent = p; p->child = sd; @@ -6656,12 +6330,13 @@ static int build_sched_domains(const cpu #ifdef CONFIG_SCHED_SMT /* Set up CPU (sibling) groups */ for_each_cpu_mask(i, *cpu_map) { - cpumask_t this_sibling_map = cpu_sibling_map[i]; + cpumask_t this_sibling_map = cpu_sibling_map(i); cpus_and(this_sibling_map, this_sibling_map, *cpu_map); if (i != first_cpu(this_sibling_map)) continue; - init_sched_build_groups(this_sibling_map, cpu_map, &cpu_to_cpu_group); + init_sched_build_groups(this_sibling_map, cpu_map, + &cpu_to_cpu_group); } #endif @@ -6672,11 +6347,11 @@ static int build_sched_domains(const cpu cpus_and(this_core_map, this_core_map, *cpu_map); if (i != first_cpu(this_core_map)) continue; - init_sched_build_groups(this_core_map, cpu_map, &cpu_to_core_group); + init_sched_build_groups(this_core_map, cpu_map, + &cpu_to_core_group); } #endif - /* Set up physical groups */ for (i = 0; i < MAX_NUMNODES; i++) { cpumask_t nodemask = node_to_cpumask(i); @@ -6691,7 +6366,8 @@ static int build_sched_domains(const cpu #ifdef CONFIG_NUMA /* Set up node groups */ if (sd_allnodes) - init_sched_build_groups(*cpu_map, cpu_map, &cpu_to_allnodes_group); + init_sched_build_groups(*cpu_map, cpu_map, + &cpu_to_allnodes_group); for (i = 0; i < MAX_NUMNODES; i++) { /* Set up node groups */ @@ -6719,6 +6395,7 @@ static int build_sched_domains(const cpu sched_group_nodes[i] = sg; for_each_cpu_mask(j, nodemask) { struct sched_domain *sd; + sd = &per_cpu(node_domains, j); sd->groups = sg; } @@ -6763,19 +6440,22 @@ static int build_sched_domains(const cpu /* Calculate CPU power for physical packages and nodes */ #ifdef CONFIG_SCHED_SMT for_each_cpu_mask(i, *cpu_map) { - sd = &per_cpu(cpu_domains, i); + struct sched_domain *sd = &per_cpu(cpu_domains, i); + init_sched_groups_power(i, sd); } #endif #ifdef CONFIG_SCHED_MC for_each_cpu_mask(i, *cpu_map) { - sd = &per_cpu(core_domains, i); + struct sched_domain *sd = &per_cpu(core_domains, i); + init_sched_groups_power(i, sd); } #endif for_each_cpu_mask(i, *cpu_map) { - sd = &per_cpu(phys_domains, i); + struct sched_domain *sd = &per_cpu(phys_domains, i); + 
init_sched_groups_power(i, sd); } @@ -6803,10 +6483,6 @@ static int build_sched_domains(const cpu #endif cpu_attach_domain(sd, i); } - /* - * Tune cache-hot values: - */ - calibrate_migration_costs(cpu_map); return 0; @@ -6816,22 +6492,33 @@ error: return -ENOMEM; #endif } + +static cpumask_t *doms_cur; /* current sched domains */ +static int ndoms_cur; /* number of sched domains in 'doms_cur' */ + +/* + * Special case: If a kmalloc of a doms_cur partition (array of + * cpumask_t) fails, then fallback to a single sched domain, + * as determined by the single cpumask_t fallback_doms. + */ +static cpumask_t fallback_doms; + /* * Set up scheduler domains and groups. Callers must hold the hotplug lock. + * For now this just excludes isolated cpus, but could be used to + * exclude other special cases in the future. */ static int arch_init_sched_domains(const cpumask_t *cpu_map) { - cpumask_t cpu_default_map; int err; - /* - * Setup mask for cpus without special case scheduling requirements. - * For now this just excludes isolated cpus, but could be used to - * exclude other special cases in the future. - */ - cpus_andnot(cpu_default_map, *cpu_map, cpu_isolated_map); - - err = build_sched_domains(&cpu_default_map); + ndoms_cur = 1; + doms_cur = kmalloc(sizeof(cpumask_t), GFP_KERNEL); + if (!doms_cur) + doms_cur = &fallback_doms; + cpus_andnot(*doms_cur, *cpu_map, cpu_isolated_map); + err = build_sched_domains(doms_cur); + register_sched_domain_sysctl(); return err; } @@ -6849,6 +6536,8 @@ static void detach_destroy_domains(const { int i; + unregister_sched_domain_sysctl(); + for_each_cpu_mask(i, *cpu_map) cpu_attach_domain(NULL, i); synchronize_sched(); @@ -6856,34 +6545,78 @@ static void detach_destroy_domains(const } /* - * Partition sched domains as specified by the cpumasks below. - * This attaches all cpus from the cpumasks to the NULL domain, - * waits for a RCU quiescent period, recalculates sched - * domain information and then attaches them back to the - * correct sched domains + * Partition sched domains as specified by the 'ndoms_new' + * cpumasks in the array doms_new[] of cpumasks. This compares + * doms_new[] to the current sched domain partitioning, doms_cur[]. + * It destroys each deleted domain and builds each new domain. + * + * 'doms_new' is an array of cpumask_t's of length 'ndoms_new'. + * The masks don't intersect (don't overlap.) We should setup one + * sched domain for each mask. CPUs not in any of the cpumasks will + * not be load balanced. If the same cpumask appears both in the + * current 'doms_cur' domains and in the new 'doms_new', we can leave + * it as it is. + * + * The passed in 'doms_new' should be kmalloc'd. This routine takes + * ownership of it and will kfree it when done with it. If the caller + * failed the kmalloc call, then it can pass in doms_new == NULL, + * and partition_sched_domains() will fallback to the single partition + * 'fallback_doms'. 
+ * * Call with hotplug lock held */ -int partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2) +void partition_sched_domains(int ndoms_new, cpumask_t *doms_new) { - cpumask_t change_map; - int err = 0; + int i, j; - cpus_and(*partition1, *partition1, cpu_online_map); - cpus_and(*partition2, *partition2, cpu_online_map); - cpus_or(change_map, *partition1, *partition2); - - /* Detach sched domains from all of the affected cpus */ - detach_destroy_domains(&change_map); - if (!cpus_empty(*partition1)) - err = build_sched_domains(partition1); - if (!err && !cpus_empty(*partition2)) - err = build_sched_domains(partition2); + lock_doms_cur(); - return err; + /* always unregister in case we don't destroy any domains */ + unregister_sched_domain_sysctl(); + + if (doms_new == NULL) { + ndoms_new = 1; + doms_new = &fallback_doms; + cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map); + } + + /* Destroy deleted domains */ + for (i = 0; i < ndoms_cur; i++) { + for (j = 0; j < ndoms_new; j++) { + if (cpus_equal(doms_cur[i], doms_new[j])) + goto match1; + } + /* no match - a current sched domain not in new doms_new[] */ + detach_destroy_domains(doms_cur + i); +match1: + ; + } + + /* Build new domains */ + for (i = 0; i < ndoms_new; i++) { + for (j = 0; j < ndoms_cur; j++) { + if (cpus_equal(doms_new[i], doms_cur[j])) + goto match2; + } + /* no match - add a new doms_new */ + build_sched_domains(doms_new + i); +match2: + ; + } + + /* Remember the new sched domains */ + if (doms_cur != &fallback_doms) + kfree(doms_cur); + doms_cur = doms_new; + ndoms_cur = ndoms_new; + + register_sched_domain_sysctl(); + + unlock_doms_cur(); } #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) -int arch_reinit_sched_domains(void) +static int arch_reinit_sched_domains(void) { int err; @@ -6912,24 +6645,6 @@ static ssize_t sched_power_savings_store return ret ? 
ret : count; } -int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls) -{ - int err = 0; - -#ifdef CONFIG_SCHED_SMT - if (smt_capable()) - err = sysfs_create_file(&cls->kset.kobj, - &attr_sched_smt_power_savings.attr); -#endif -#ifdef CONFIG_SCHED_MC - if (!err && mc_capable()) - err = sysfs_create_file(&cls->kset.kobj, - &attr_sched_mc_power_savings.attr); -#endif - return err; -} -#endif - #ifdef CONFIG_SCHED_MC static ssize_t sched_mc_power_savings_show(struct sys_device *dev, char *page) { @@ -6940,8 +6655,8 @@ static ssize_t sched_mc_power_savings_st { return sched_power_savings_store(buf, count, 0); } -SYSDEV_ATTR(sched_mc_power_savings, 0644, sched_mc_power_savings_show, - sched_mc_power_savings_store); +static SYSDEV_ATTR(sched_mc_power_savings, 0644, sched_mc_power_savings_show, + sched_mc_power_savings_store); #endif #ifdef CONFIG_SCHED_SMT @@ -6954,8 +6669,26 @@ static ssize_t sched_smt_power_savings_s { return sched_power_savings_store(buf, count, 1); } -SYSDEV_ATTR(sched_smt_power_savings, 0644, sched_smt_power_savings_show, - sched_smt_power_savings_store); +static SYSDEV_ATTR(sched_smt_power_savings, 0644, sched_smt_power_savings_show, + sched_smt_power_savings_store); +#endif + +int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls) +{ + int err = 0; + +#ifdef CONFIG_SCHED_SMT + if (smt_capable()) + err = sysfs_create_file(&cls->kset.kobj, + &attr_sched_smt_power_savings.attr); +#endif +#ifdef CONFIG_SCHED_MC + if (!err && mc_capable()) + err = sysfs_create_file(&cls->kset.kobj, + &attr_sched_mc_power_savings.attr); +#endif + return err; +} #endif /* @@ -7013,10 +6746,24 @@ void __init sched_init_smp(void) /* Move init over to a non-isolated CPU */ if (set_cpus_allowed(current, non_isolated_cpus) < 0) BUG(); + sched_init_granularity(); + +#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP) + lb_monitor_task = kthread_create(load_balance_monitor, NULL, + "load_balance_monitor"); + if (!IS_ERR(lb_monitor_task)) { + lb_monitor_task->flags |= PF_NOFREEZE; + wake_up_process(lb_monitor_task); + } else { + printk("Could not create load balance monitor thread" + "(error = %ld) \n", PTR_ERR(lb_monitor_task)); + } +#endif } #else void __init sched_init_smp(void) { + sched_init_granularity(); } #endif /* CONFIG_SMP */ @@ -7030,28 +6777,61 @@ int in_sched_functions(unsigned long add && addr < (unsigned long)__sched_text_end); } +static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq) +{ + cfs_rq->tasks_timeline = AVL_ROOT; +#ifdef CONFIG_FAIR_GROUP_SCHED + cfs_rq->rq = rq; +#endif + cfs_rq->min_vruntime = (u64)(-(1LL << 20)); +} + void __init sched_init(void) { - int i, j, k; int highest_cpu = 0; + int i, j; for_each_possible_cpu(i) { - struct prio_array *array; + struct rt_prio_array *array; struct rq *rq; rq = cpu_rq(i); spin_lock_init(&rq->lock); lockdep_set_class(&rq->lock, &rq->rq_lock_key); rq->nr_running = 0; - rq->active = rq->arrays; - rq->expired = rq->arrays + 1; - rq->best_expired_prio = MAX_PRIO; + rq->clock = 1; + init_cfs_rq(&rq->cfs, rq); +#ifdef CONFIG_FAIR_GROUP_SCHED + INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); + { + struct cfs_rq *cfs_rq = &per_cpu(init_cfs_rq, i); + struct sched_entity *se = + &per_cpu(init_sched_entity, i); + + init_cfs_rq_p[i] = cfs_rq; + init_cfs_rq(cfs_rq, rq); + cfs_rq->tg = &init_task_group; + list_add(&cfs_rq->leaf_cfs_rq_list, + &rq->leaf_cfs_rq_list); + + init_sched_entity_p[i] = se; + se->cfs_rq = &rq->cfs; + se->my_q = cfs_rq; + se->load.weight = init_task_group_load; + se->load.inv_weight = + 
div64_64(1ULL<<32, init_task_group_load); + se->parent = NULL; + } + init_task_group.shares = init_task_group_load; + mutex_init(&init_task_group.lock); +#endif + for (j = 0; j < CPU_LOAD_IDX_MAX; j++) + rq->cpu_load[j] = 0; #ifdef CONFIG_SMP rq->sd = NULL; - for (j = 1; j < 3; j++) - rq->cpu_load[j] = 0; rq->active_balance = 0; + rq->next_balance = jiffies; rq->push_cpu = 0; rq->cpu = i; rq->migration_thread = NULL; @@ -7059,20 +6839,22 @@ void __init sched_init(void) #endif atomic_set(&rq->nr_iowait, 0); - for (j = 0; j < 2; j++) { - array = rq->arrays + j; - for (k = 0; k < MAX_PRIO; k++) { - INIT_LIST_HEAD(array->queue + k); - __clear_bit(k, array->bitmap); - } - // delimiter for bitsearch - __set_bit(MAX_PRIO, array->bitmap); + array = &rq->rt.active; + for (j = 0; j < MAX_RT_PRIO; j++) { + INIT_LIST_HEAD(array->queue + j); + __clear_bit(j, array->bitmap); } highest_cpu = i; + /* delimiter for bitsearch: */ + __set_bit(MAX_RT_PRIO, array->bitmap); } set_load_weight(&init_task); +#ifdef CONFIG_PREEMPT_NOTIFIERS + INIT_HLIST_HEAD(&init_task.preempt_notifiers); +#endif + #ifdef CONFIG_SMP nr_cpu_ids = highest_cpu + 1; open_softirq(SCHED_SOFTIRQ, run_rebalance_domains, NULL); @@ -7095,6 +6877,10 @@ void __init sched_init(void) * when this runqueue becomes "idle". */ init_idle(current, smp_processor_id()); + /* + * During early bootup we pretend to be a normal task: + */ + current->sched_class = &fair_sched_class; } #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP @@ -7123,30 +6909,56 @@ EXPORT_SYMBOL(__might_sleep); #endif #ifdef CONFIG_MAGIC_SYSRQ +static void normalize_task(struct rq *rq, struct task_struct *p) +{ + int on_rq; + update_rq_clock(rq); + on_rq = p->se.on_rq; + if (on_rq) + deactivate_task(rq, p, 0); + __setscheduler(rq, p, SCHED_NORMAL, 0); + if (on_rq) { + activate_task(rq, p, 0); + resched_task(rq->curr); + } +} + void normalize_rt_tasks(void) { - struct prio_array *array; struct task_struct *g, *p; unsigned long flags; struct rq *rq; read_lock_irq(&tasklist_lock); - do_each_thread(g, p) { - if (!rt_task(p)) + /* + * Only normalize user tasks: + */ + if (!p->mm) + continue; + + p->se.exec_start = 0; +#ifdef CONFIG_SCHEDSTATS + p->se.wait_start = 0; + p->se.sleep_start = 0; + p->se.block_start = 0; +#endif + task_rq(p)->clock = 0; + + if (!rt_task(p)) { + /* + * Renice negative nice level userspace + * tasks back to 0: + */ + if (TASK_NICE(p) < 0 && p->mm) + set_user_nice(p, 0); continue; + } spin_lock_irqsave(&p->pi_lock, flags); rq = __task_rq_lock(p); - array = p->array; - if (array) - deactivate_task(p, task_rq(p)); - __setscheduler(p, SCHED_NORMAL, 0); - if (array) { - __activate_task(p, task_rq(p)); - resched_task(rq->curr); - } + normalize_task(rq, p); __task_rq_unlock(rq); spin_unlock_irqrestore(&p->pi_lock, flags); @@ -7200,3 +7012,497 @@ void set_curr_task(int cpu, struct task_ } #endif + +#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP) + +/* distribute shares of all task groups among their schedulable entities, + * to reflect load distrbution across cpus. 
+ */ +static int rebalance_shares(struct sched_domain *sd, int this_cpu) +{ + struct cfs_rq *cfs_rq; + struct rq *rq = cpu_rq(this_cpu); + cpumask_t sdspan = sd->span; + int balanced = 1; + + /* Walk thr' all the task groups that we have */ + for_each_leaf_cfs_rq(rq, cfs_rq) { + int i; + unsigned long total_load = 0, total_shares; + struct task_group *tg = cfs_rq->tg; + + /* Gather total task load of this group across cpus */ + for_each_cpu_mask(i, sdspan) + total_load += tg->cfs_rq[i]->load.weight; + + /* Nothing to do if this group has no load or if it's load + * hasn't changed since the last time we checked. + */ + if (!total_load || total_load == tg->last_total_load) + continue; + + tg->last_total_load = total_load; + + /* tg->shares represents the number of cpu shares the task group + * is eligible to hold on a single cpu. On N cpus, it is + * eligible to hold (N * tg->shares) number of cpu shares. + */ + total_shares = tg->shares * cpus_weight(sdspan); + + /* redistribute total_shares across cpus as per the task load + * distribution. + */ + for_each_cpu_mask(i, sdspan) { + unsigned long local_load, local_shares, irqflags; + + local_load = tg->cfs_rq[i]->load.weight; + local_shares = (local_load * total_shares) / total_load; + if (!local_shares) + local_shares = MIN_GROUP_SHARES; + if (local_shares == tg->se[i]->load.weight) + continue; + + spin_lock_irqsave(&cpu_rq(i)->lock, irqflags); + set_se_shares(tg->se[i], local_shares); + spin_unlock_irqrestore(&cpu_rq(i)->lock, irqflags); + balanced = 0; + } + } + + return balanced; +} + +/* + * How frequently should we rebalance_shares() across cpus? + * + * The more frequently we rebalance shares, the more accurate is the fairness + * of cpu bandwidth distribution between task groups. However higher frequency + * also implies increased scheduling overhead. + * + * sysctl_sched_min_bal_int_shares represents the minimum interval between + * consecutive calls to rebalance_shares() in the same sched domain. + * + * sysctl_sched_max_bal_int_shares represents the maximum interval between + * consecutive calls to rebalance_shares() in the same sched domain. + * + * These settings allows for the appropriate tradeoff between accuracy of + * fairness and the associated overhead. + * + */ + +/* default: 8ms, units: milliseconds */ +const_debug unsigned int sysctl_sched_min_bal_int_shares = 8; + +/* default: 128ms, units: milliseconds */ +const_debug unsigned int sysctl_sched_max_bal_int_shares = 128; + +static int load_balance_monitor(void *unused) +{ + unsigned int timeout = sysctl_sched_min_bal_int_shares; + + while (!kthread_should_stop()) { + int i, cpu, balanced = 1; + + lock_cpu_hotplug(); /* Prevent cpus going down or coming up */ + lock_doms_cur(); /* lockout changes to doms_cur[] array */ + + rcu_read_lock(); /* to walk rq->sd chain on various cpus */ + + for (i=0; i < ndoms_cur; i++) { + cpumask_t cpumap = doms_cur[i]; + struct sched_domain *sd = NULL, *sd_prev = NULL; + + cpu = first_cpu(cpumap); + + /* Find the highest domain at which to balance shares */ + for_each_domain(cpu, sd) { + if (!(sd->flags & SD_LOAD_BALANCE)) + continue; + sd_prev = sd; + } + + sd = sd_prev; + /* sd == NULL? 
No load balance reqd in this domain */ + if (!sd) + continue; + + balanced &= rebalance_shares(sd, cpu); + } + + rcu_read_unlock(); + + unlock_doms_cur(); + unlock_cpu_hotplug(); + + if (!balanced) + timeout = sysctl_sched_min_bal_int_shares; + else if (timeout < sysctl_sched_max_bal_int_shares) + timeout *= 2; + + msleep_interruptible(timeout); + } + + return 0; +} + +#endif + +#ifdef CONFIG_FAIR_GROUP_SCHED +/* allocate runqueue etc for a new task group */ +struct task_group *sched_create_group(void) +{ + struct task_group *tg; + struct cfs_rq *cfs_rq; + struct sched_entity *se; + struct rq *rq; + int i; + + tg = kzalloc(sizeof(*tg), GFP_KERNEL); + if (!tg) + return ERR_PTR(-ENOMEM); + + tg->cfs_rq = kzalloc(sizeof(cfs_rq) * NR_CPUS, GFP_KERNEL); + if (!tg->cfs_rq) + goto err; + tg->se = kzalloc(sizeof(se) * NR_CPUS, GFP_KERNEL); + if (!tg->se) + goto err; + + for_each_possible_cpu(i) { + rq = cpu_rq(i); + + cfs_rq = kmalloc_node(sizeof(struct cfs_rq), GFP_KERNEL, + cpu_to_node(i)); + if (!cfs_rq) + goto err; + + se = kmalloc_node(sizeof(struct sched_entity), GFP_KERNEL, + cpu_to_node(i)); + if (!se) + goto err; + + memset(cfs_rq, 0, sizeof(struct cfs_rq)); + memset(se, 0, sizeof(struct sched_entity)); + + tg->cfs_rq[i] = cfs_rq; + init_cfs_rq(cfs_rq, rq); + cfs_rq->tg = tg; + + tg->se[i] = se; + se->cfs_rq = &rq->cfs; + se->my_q = cfs_rq; + se->load.weight = NICE_0_LOAD; + se->load.inv_weight = div64_64(1ULL<<32, NICE_0_LOAD); + se->parent = NULL; + } + + for_each_possible_cpu(i) { + rq = cpu_rq(i); + cfs_rq = tg->cfs_rq[i]; + list_add_rcu(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list); + } + + tg->shares = NICE_0_LOAD; + mutex_init(&tg->lock); + + return tg; + +err: + for_each_possible_cpu(i) { + if (tg->cfs_rq) + kfree(tg->cfs_rq[i]); + if (tg->se) + kfree(tg->se[i]); + } + kfree(tg->cfs_rq); + kfree(tg->se); + kfree(tg); + + return ERR_PTR(-ENOMEM); +} + +/* rcu callback to free various structures associated with a task group */ +static void free_sched_group(struct rcu_head *rhp) +{ + struct task_group *tg = container_of(rhp, struct task_group, rcu); + struct cfs_rq *cfs_rq; + struct sched_entity *se; + int i; + + /* now it should be safe to free those cfs_rqs */ + for_each_possible_cpu(i) { + cfs_rq = tg->cfs_rq[i]; + kfree(cfs_rq); + + se = tg->se[i]; + kfree(se); + } + + kfree(tg->cfs_rq); + kfree(tg->se); + kfree(tg); +} + +/* Destroy runqueue etc associated with a task group */ +void sched_destroy_group(struct task_group *tg) +{ + struct cfs_rq *cfs_rq = NULL; + int i; + + for_each_possible_cpu(i) { + cfs_rq = tg->cfs_rq[i]; + list_del_rcu(&cfs_rq->leaf_cfs_rq_list); + } + + BUG_ON(!cfs_rq); + + /* wait for possible concurrent references to cfs_rqs complete */ + call_rcu(&tg->rcu, free_sched_group); +} + +/* change task's runqueue when it moves between groups. + * The caller of this function should have put the task in its new group + * by now. This function just updates tsk->se.cfs_rq and tsk->se.parent to + * reflect its new group. 
+ */ +void sched_move_task(struct task_struct *tsk) +{ + int on_rq, running; + unsigned long flags; + struct rq *rq; + + rq = task_rq_lock(tsk, &flags); + + if (tsk->sched_class != &fair_sched_class) { + set_task_cfs_rq(tsk, task_cpu(tsk)); + goto done; + } + + update_rq_clock(rq); + + running = task_running(rq, tsk); + on_rq = tsk->se.on_rq; + + if (on_rq) { + dequeue_task(rq, tsk, 0); + if (unlikely(running)) + tsk->sched_class->put_prev_task(rq, tsk); + } + + set_task_cfs_rq(tsk, task_cpu(tsk)); + + if (on_rq) { + if (unlikely(running)) + tsk->sched_class->set_curr_task(rq); + enqueue_task(rq, tsk, 0); + } + +done: + task_rq_unlock(rq, &flags); +} + +/* rq->lock to be locked by caller */ +static void set_se_shares(struct sched_entity *se, unsigned long shares) +{ + struct cfs_rq *cfs_rq = se->cfs_rq; + struct rq *rq = cfs_rq->rq; + int on_rq; + + if (!shares) + shares = MIN_GROUP_SHARES; + + on_rq = se->on_rq; + if (on_rq) { + dequeue_entity(cfs_rq, se, 0); + dec_load(rq, se->load.weight); + } + + se->load.weight = shares; + se->load.inv_weight = div64_64((1ULL<<32), shares); + + if (on_rq) { + enqueue_entity(cfs_rq, se, 0); + inc_load(rq, se->load.weight); + } +} + +int sched_group_set_shares(struct task_group *tg, unsigned long shares) +{ + int i; + struct cfs_rq *cfs_rq; + struct rq *rq; + + mutex_lock(&tg->lock); + + if (tg->shares == shares) + goto done; + + if (shares < MIN_GROUP_SHARES) + shares = MIN_GROUP_SHARES; + + /* Prevent any load balance activity (rebalance_shares, + * load_balance_fair) from referring to this group first, + * by taking it off the rq->leaf_cfs_rq_list on each cpu. + */ + for_each_possible_cpu(i) { + cfs_rq = tg->cfs_rq[i]; + list_del_rcu(&cfs_rq->leaf_cfs_rq_list); + } + + /* wait for any ongoing reference to this group to finish */ + synchronize_sched(); + + /* Now we are free to modify the group's share on each cpu + * w/o tripping rebalance_share or load_balance_fair. + */ + tg->shares = shares; + for_each_possible_cpu(i) { + spin_lock_irq(&cpu_rq(i)->lock); + set_se_shares(tg->se[i], shares); + spin_unlock_irq(&cpu_rq(i)->lock); + } + + /* Enable load balance activity on this group, by inserting it back on + * each cpu's rq->lead_cfs_rq_list. 
+ */ + for_each_possible_cpu(i) { + rq = cpu_rq(i); + cfs_rq = tg->cfs_rq[i]; + list_add_rcu(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list); + } + +done: + mutex_unlock(&tg->lock); + return 0; +} + +unsigned long sched_group_shares(struct task_group *tg) +{ + return tg->shares; +} + +#endif /* CONFIG_FAIR_GROUP_SCHED */ + +#ifdef CONFIG_FAIR_CGROUP_SCHED + +/* return corresponding task_group object of a cgroup */ +static inline struct task_group *cgroup_tg(struct cgroup *cgrp) +{ + return container_of(cgroup_subsys_state(cgrp, cpu_cgroup_subsys_id), + struct task_group, css); +} + +static struct cgroup_subsys_state * +cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp) +{ + struct task_group *tg; + + if (!cgrp->parent) { + /* This is early initialization for the top cgroup */ + init_task_group.css.cgroup = cgrp; + return &init_task_group.css; + } + + /* we support only 1-level deep hierarchical scheduler atm */ + if (cgrp->parent->parent) + return ERR_PTR(-EINVAL); + + tg = sched_create_group(); + if (IS_ERR(tg)) + return ERR_PTR(-ENOMEM); + + /* Bind the cgroup to task_group object we just created */ + tg->css.cgroup = cgrp; + + return &tg->css; +} + +static void cpu_cgroup_destroy(struct cgroup_subsys *ss, + struct cgroup *cgrp) +{ + struct task_group *tg = cgroup_tg(cgrp); + + sched_destroy_group(tg); +} + +static int cpu_cgroup_can_attach(struct cgroup_subsys *ss, + struct cgroup *cgrp, struct task_struct *tsk) +{ + /* We don't support RT-tasks being in separate groups */ + if (tsk->sched_class != &fair_sched_class) + return -EINVAL; + + return 0; +} + +static void +cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, + struct cgroup *old_cont, struct task_struct *tsk) +{ + sched_move_task(tsk); +} + +static int cpu_shares_write_uint(struct cgroup *cgrp, struct cftype *cftype, + u64 shareval) +{ + return sched_group_set_shares(cgroup_tg(cgrp), shareval); +} + +static u64 cpu_shares_read_uint(struct cgroup *cgrp, struct cftype *cft) +{ + struct task_group *tg = cgroup_tg(cgrp); + + return (u64) tg->shares; +} + +static u64 cpu_usage_read(struct cgroup *cgrp, struct cftype *cft) +{ + struct task_group *tg = cgroup_tg(cgrp); + unsigned long flags; + u64 res = 0; + int i; + + for_each_possible_cpu(i) { + /* + * Lock to prevent races with updating 64-bit counters + * on 32-bit arches. 
+ */ + spin_lock_irqsave(&cpu_rq(i)->lock, flags); + res += tg->se[i]->sum_exec_runtime; + spin_unlock_irqrestore(&cpu_rq(i)->lock, flags); + } + /* Convert from ns to ms */ + do_div(res, NSEC_PER_MSEC); + + return res; +} + +static struct cftype cpu_files[] = { + { + .name = "shares", + .read_uint = cpu_shares_read_uint, + .write_uint = cpu_shares_write_uint, + }, + { + .name = "usage", + .read_uint = cpu_usage_read, + }, +}; + +static int cpu_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont) +{ + return cgroup_add_files(cont, ss, cpu_files, ARRAY_SIZE(cpu_files)); +} + +struct cgroup_subsys cpu_cgroup_subsys = { + .name = "cpu", + .create = cpu_cgroup_create, + .destroy = cpu_cgroup_destroy, + .can_attach = cpu_cgroup_can_attach, + .attach = cpu_cgroup_attach, + .populate = cpu_cgroup_populate, + .subsys_id = cpu_cgroup_subsys_id, + .early_init = 1, +}; + +#endif /* CONFIG_FAIR_CGROUP_SCHED */ Index: linux-cfs-2.6.22.13.q/kernel/sched_debug.c =================================================================== --- /dev/null +++ linux-cfs-2.6.22.13.q/kernel/sched_debug.c @@ -0,0 +1,394 @@ +/* + * kernel/time/sched_debug.c + * + * Print the CFS rbtree + * + * Copyright(C) 2007, Red Hat, Inc., Ingo Molnar + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include + +/* + * This allows printing both to /proc/sched_debug and + * to the console + */ +#define SEQ_printf(m, x...) \ + do { \ + if (m) \ + seq_printf(m, x); \ + else \ + printk(x); \ + } while (0) + +/* + * Ease the printing of nsec fields: + */ +static long long nsec_high(long long nsec) +{ + if (nsec < 0) { + nsec = -nsec; + do_div(nsec, 1000000); + return -nsec; + } + do_div(nsec, 1000000); + + return nsec; +} + +static unsigned long nsec_low(long long nsec) +{ + if (nsec < 0) + nsec = -nsec; + + return do_div(nsec, 1000000); +} + +#define SPLIT_NS(x) nsec_high(x), nsec_low(x) + +static void +print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) +{ + if (rq->curr == p) + SEQ_printf(m, "R"); + else + SEQ_printf(m, " "); + + SEQ_printf(m, "%15s %5d %9Ld.%06ld %9Ld %5d ", + p->comm, p->pid, + SPLIT_NS(p->se.vruntime), + (long long)(p->nvcsw + p->nivcsw), + p->prio); +#ifdef CONFIG_SCHEDSTATS + SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld\n", + SPLIT_NS(p->se.vruntime), + SPLIT_NS(p->se.sum_exec_runtime), + SPLIT_NS(p->se.sum_sleep_runtime)); +#else + SEQ_printf(m, "%15Ld %15Ld %15Ld.%06ld %15Ld.%06ld %15Ld.%06ld\n", + 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L); +#endif +} + +static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu) +{ + struct task_struct *g, *p; + unsigned long flags; + + SEQ_printf(m, + "\nrunnable tasks:\n" + " task PID tree-key switches prio" + " exec-runtime sum-exec sum-sleep\n" + "------------------------------------------------------" + "----------------------------------------------------\n"); + + read_lock_irqsave(&tasklist_lock, flags); + + do_each_thread(g, p) { + if (!p->se.on_rq || task_cpu(p) != rq_cpu) + continue; + + print_task(m, rq, p); + } while_each_thread(g, p); + + read_unlock_irqrestore(&tasklist_lock, flags); +} + +void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) +{ + s64 MIN_vruntime = -1, min_vruntime, max_vruntime = -1, + spread, rq0_min_vruntime, spread0; + struct rq *rq = &per_cpu(runqueues, cpu); + struct sched_entity *last; + unsigned long 
flags; + + SEQ_printf(m, "\ncfs_rq\n"); + + SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "exec_clock", + SPLIT_NS(cfs_rq->exec_clock)); + + spin_lock_irqsave(&rq->lock, flags); + if (cfs_rq->avl_leftmost) + MIN_vruntime = (__pick_next_entity(cfs_rq))->vruntime; + last = __pick_last_entity(cfs_rq); + if (last) + max_vruntime = last->vruntime; + min_vruntime = rq->cfs.min_vruntime; + rq0_min_vruntime = per_cpu(runqueues, 0).cfs.min_vruntime; + spin_unlock_irqrestore(&rq->lock, flags); + SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "MIN_vruntime", + SPLIT_NS(MIN_vruntime)); + SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "min_vruntime", + SPLIT_NS(min_vruntime)); + SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "max_vruntime", + SPLIT_NS(max_vruntime)); + spread = max_vruntime - MIN_vruntime; + SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread", + SPLIT_NS(spread)); + spread0 = min_vruntime - rq0_min_vruntime; + SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread0", + SPLIT_NS(spread0)); + SEQ_printf(m, " .%-30s: %ld\n", "nr_running", cfs_rq->nr_running); + SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight); +#ifdef CONFIG_SCHEDSTATS + SEQ_printf(m, " .%-30s: %d\n", "bkl_count", + rq->bkl_count); +#endif + SEQ_printf(m, " .%-30s: %ld\n", "nr_spread_over", + cfs_rq->nr_spread_over); +} + +static void print_cpu(struct seq_file *m, int cpu) +{ + struct rq *rq = &per_cpu(runqueues, cpu); + +#ifdef CONFIG_X86 + { + unsigned int freq = cpu_khz ? : 1; + + SEQ_printf(m, "\ncpu#%d, %u.%03u MHz\n", + cpu, freq / 1000, (freq % 1000)); + } +#else + SEQ_printf(m, "\ncpu#%d\n", cpu); +#endif + +#define P(x) \ + SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rq->x)) +#define PN(x) \ + SEQ_printf(m, " .%-30s: %Ld.%06ld\n", #x, SPLIT_NS(rq->x)) + + P(nr_running); + SEQ_printf(m, " .%-30s: %lu\n", "load", + rq->load.weight); + P(nr_switches); + P(nr_load_updates); + P(nr_uninterruptible); + SEQ_printf(m, " .%-30s: %lu\n", "jiffies", jiffies); + PN(next_balance); + P(curr->pid); + PN(clock); + PN(idle_clock); + PN(prev_clock_raw); + P(clock_warps); + P(clock_overflows); + P(clock_deep_idle_events); + PN(clock_max_delta); + P(cpu_load[0]); + P(cpu_load[1]); + P(cpu_load[2]); + P(cpu_load[3]); + P(cpu_load[4]); +#undef P +#undef PN + + print_cfs_stats(m, cpu); + + print_rq(m, rq, cpu); +} + +static int sched_debug_show(struct seq_file *m, void *v) +{ + u64 now = ktime_to_ns(ktime_get()); + int cpu; + + SEQ_printf(m, "Sched Debug Version: v0.07, %s %.*s\n", + init_utsname()->release, + (int)strcspn(init_utsname()->version, " "), + init_utsname()->version); + + SEQ_printf(m, "now at %Lu.%06ld msecs\n", SPLIT_NS(now)); + +#define P(x) \ + SEQ_printf(m, " .%-40s: %Ld\n", #x, (long long)(x)) +#define PN(x) \ + SEQ_printf(m, " .%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x)) + PN(sysctl_sched_latency); + PN(sysctl_sched_min_granularity); + PN(sysctl_sched_wakeup_granularity); + PN(sysctl_sched_batch_wakeup_granularity); + PN(sysctl_sched_child_runs_first); + P(sysctl_sched_features); +#undef PN +#undef P + + for_each_online_cpu(cpu) + print_cpu(m, cpu); + + SEQ_printf(m, "\n"); + + return 0; +} + +static void sysrq_sched_debug_show(void) +{ + sched_debug_show(NULL, NULL); +} + +static int sched_debug_open(struct inode *inode, struct file *filp) +{ + return single_open(filp, sched_debug_show, NULL); +} + +static const struct file_operations sched_debug_fops = { + .open = sched_debug_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int __init init_sched_debug_procfs(void) +{ + struct proc_dir_entry *pe; + + pe = 
create_proc_entry("sched_debug", 0644, NULL); + if (!pe) + return -ENOMEM; + + pe->proc_fops = &sched_debug_fops; + + return 0; +} + +__initcall(init_sched_debug_procfs); + +void proc_sched_show_task(struct task_struct *p, struct seq_file *m) +{ + unsigned long nr_switches; + unsigned long flags; + int num_threads = 1; + + rcu_read_lock(); + if (lock_task_sighand(p, &flags)) { + num_threads = atomic_read(&p->signal->count); + unlock_task_sighand(p, &flags); + } + rcu_read_unlock(); + + SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, p->pid, num_threads); + SEQ_printf(m, + "---------------------------------------------------------\n"); +#define __P(F) \ + SEQ_printf(m, "%-35s:%21Ld\n", #F, (long long)F) +#define P(F) \ + SEQ_printf(m, "%-35s:%21Ld\n", #F, (long long)p->F) +#define __PN(F) \ + SEQ_printf(m, "%-35s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)F)) +#define PN(F) \ + SEQ_printf(m, "%-35s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F)) + + PN(se.exec_start); + PN(se.vruntime); + PN(se.sum_exec_runtime); + + nr_switches = p->nvcsw + p->nivcsw; + +#ifdef CONFIG_SCHEDSTATS + PN(se.wait_start); + PN(se.sleep_start); + PN(se.block_start); + PN(se.sleep_max); + PN(se.block_max); + PN(se.exec_max); + PN(se.slice_max); + PN(se.wait_max); + P(sched_info.bkl_count); + P(se.nr_migrations); + P(se.nr_migrations_cold); + P(se.nr_failed_migrations_affine); + P(se.nr_failed_migrations_running); + P(se.nr_failed_migrations_hot); + P(se.nr_forced_migrations); + P(se.nr_forced2_migrations); + P(se.nr_wakeups); + P(se.nr_wakeups_sync); + P(se.nr_wakeups_migrate); + P(se.nr_wakeups_local); + P(se.nr_wakeups_remote); + P(se.nr_wakeups_affine); + P(se.nr_wakeups_affine_attempts); + P(se.nr_wakeups_passive); + P(se.nr_wakeups_idle); + + { + u64 avg_atom, avg_per_cpu; + + avg_atom = p->se.sum_exec_runtime; + if (nr_switches) + do_div(avg_atom, nr_switches); + else + avg_atom = -1LL; + + avg_per_cpu = p->se.sum_exec_runtime; + if (p->se.nr_migrations) + avg_per_cpu = div64_64(avg_per_cpu, p->se.nr_migrations); + else + avg_per_cpu = -1LL; + + __PN(avg_atom); + __PN(avg_per_cpu); + } +#endif + __P(nr_switches); + SEQ_printf(m, "%-35s:%21Ld\n", + "nr_voluntary_switches", (long long)p->nvcsw); + SEQ_printf(m, "%-35s:%21Ld\n", + "nr_involuntary_switches", (long long)p->nivcsw); + + P(se.load.weight); + P(policy); + P(prio); +#undef PN +#undef __PN +#undef P +#undef __P + + { + u64 t0, t1; + + t0 = sched_clock(); + t1 = sched_clock(); + SEQ_printf(m, "%-35s:%21Ld\n", + "clock-delta", (long long)(t1-t0)); + } +} + +void proc_sched_set_task(struct task_struct *p) +{ +#ifdef CONFIG_SCHEDSTATS + p->se.wait_max = 0; + p->se.sleep_max = 0; + p->se.sum_sleep_runtime = 0; + p->se.block_max = 0; + p->se.exec_max = 0; + p->se.slice_max = 0; + p->se.nr_migrations = 0; + p->se.nr_migrations_cold = 0; + p->se.nr_failed_migrations_affine = 0; + p->se.nr_failed_migrations_running = 0; + p->se.nr_failed_migrations_hot = 0; + p->se.nr_forced_migrations = 0; + p->se.nr_forced2_migrations = 0; + p->se.nr_wakeups = 0; + p->se.nr_wakeups_sync = 0; + p->se.nr_wakeups_migrate = 0; + p->se.nr_wakeups_local = 0; + p->se.nr_wakeups_remote = 0; + p->se.nr_wakeups_affine = 0; + p->se.nr_wakeups_affine_attempts = 0; + p->se.nr_wakeups_passive = 0; + p->se.nr_wakeups_idle = 0; + p->sched_info.bkl_count = 0; +#endif + p->se.sum_exec_runtime = 0; + p->se.prev_sum_exec_runtime = 0; + p->nvcsw = 0; + p->nivcsw = 0; +} Index: linux-cfs-2.6.22.13.q/kernel/sched_fair.c =================================================================== --- 
/dev/null +++ linux-cfs-2.6.22.13.q/kernel/sched_fair.c @@ -0,0 +1,1164 @@ +/* + * Completely Fair Scheduling (CFS) Class (SCHED_NORMAL/SCHED_BATCH) + * + * Copyright (C) 2007 Red Hat, Inc., Ingo Molnar + * + * Interactivity improvements by Mike Galbraith + * (C) 2007 Mike Galbraith + * + * Various enhancements by Dmitry Adamushko. + * (C) 2007 Dmitry Adamushko + * + * Group scheduling enhancements by Srivatsa Vaddagiri + * Copyright IBM Corporation, 2007 + * Author: Srivatsa Vaddagiri + * + * Scaled math optimizations by Thomas Gleixner + * Copyright (C) 2007, Thomas Gleixner + * + * Adaptive scheduling granularity, math enhancements by Peter Zijlstra + * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra + */ + +/* + * Targeted preemption latency for CPU-bound tasks: + * (default: 20ms * ilog(ncpus), units: nanoseconds) + * + * NOTE: this latency value is not the same as the concept of + * 'timeslice length' - timeslices in CFS are of variable length + * and have no persistent notion like in traditional, time-slice + * based scheduling concepts. + * + * (to see the precise effective timeslice length of your workload, + * run vmstat and monitor the context-switches (cs) field) + */ +unsigned int sysctl_sched_latency = 20000000ULL; + +/* + * Minimal preemption granularity for CPU-bound tasks: + * (default: 1 msec * ilog(ncpus), units: nanoseconds) + */ +unsigned int sysctl_sched_min_granularity = 1000000ULL; + +/* + * is kept at sysctl_sched_latency / sysctl_sched_min_granularity + */ +static unsigned int sched_nr_latency = 20; + +/* + * After fork, child runs first. (default) If set to 0 then + * parent will (try to) run first. + */ +const_debug unsigned int sysctl_sched_child_runs_first = 1; + +/* + * sys_sched_yield() compat mode + * + * This option switches the agressive yield implementation of the + * old scheduler back on. + */ +unsigned int __read_mostly sysctl_sched_compat_yield; + +/* + * SCHED_BATCH wake-up granularity. + * (default: 10 msec * ilog(ncpus), units: nanoseconds) + * + * This option delays the preemption effects of decoupled workloads + * and reduces their over-scheduling. Synchronous workloads will still + * have immediate wakeup/sleep latencies. + */ +unsigned int sysctl_sched_batch_wakeup_granularity = 10000000UL; + +/* + * SCHED_OTHER wake-up granularity. + * (default: 10 msec * ilog(ncpus), units: nanoseconds) + * + * This option delays the preemption effects of decoupled workloads + * and reduces their over-scheduling. Synchronous workloads will still + * have immediate wakeup/sleep latencies. 
+ */ +unsigned int sysctl_sched_wakeup_granularity = 10000000UL; + +const_debug unsigned int sysctl_sched_migration_cost = 500000UL; + +/************************************************************** + * CFS operations on generic schedulable entities: + */ + +#ifdef CONFIG_FAIR_GROUP_SCHED + +/* cpu runqueue to which this cfs_rq is attached */ +static inline struct rq *rq_of(struct cfs_rq *cfs_rq) +{ + return cfs_rq->rq; +} + +/* An entity is a task if it doesn't "own" a runqueue */ +#define entity_is_task(se) (!se->my_q) + +#else /* CONFIG_FAIR_GROUP_SCHED */ + +static inline struct rq *rq_of(struct cfs_rq *cfs_rq) +{ + return container_of(cfs_rq, struct rq, cfs); +} + +#define entity_is_task(se) 1 + +#endif /* CONFIG_FAIR_GROUP_SCHED */ + +static inline struct task_struct *task_of(struct sched_entity *se) +{ + return container_of(se, struct task_struct, se); +} + + +/************************************************************** + * Scheduling class tree data structure manipulation methods: + */ + +static inline u64 max_vruntime(u64 min_vruntime, u64 vruntime) +{ + s64 delta = (s64)(vruntime - min_vruntime); + if (delta > 0) + min_vruntime = vruntime; + + return min_vruntime; +} + +static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime) +{ + s64 delta = (s64)(vruntime - min_vruntime); + if (delta < 0) + min_vruntime = vruntime; + + return min_vruntime; +} + +static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + return se->vruntime - cfs_rq->min_vruntime; +} + +/* + * Enqueue an entity into the rb-tree: + */ +static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + struct avl_node **link = &cfs_rq->tasks_timeline.avl_node; + struct avl_node *parent = NULL; + struct sched_entity *entry; + s64 key = entity_key(cfs_rq, se); + int leftmost = 1; + + /* + * Find the right place in the rbtree: + */ + while (*link) { + parent = *link; + entry = avl_entry(parent, struct sched_entity, run_node); + /* + * We dont care about collisions. Nodes with + * the same key stay together. 
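+ *
+ * (Ties fall through to the right branch below, so entities that share
+ *  a key are picked in insertion order and the cached leftmost pointer
+ *  never needs to move for an equal-key insert.)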
+ */ + if (key < entity_key(cfs_rq, entry)) { + link = &parent->avl_left; + } else { + link = &parent->avl_right; + leftmost = 0; + } + } + + /* + * Maintain a cache of leftmost tree entries (it is frequently + * used): + */ + if (leftmost) + cfs_rq->avl_leftmost = &se->run_node; + + avl_link_node(&se->run_node, parent, link); + insert_avl_node(&se->run_node, &cfs_rq->tasks_timeline); +} + +static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + if (cfs_rq->avl_leftmost == &se->run_node) + cfs_rq->avl_leftmost = avl_next(&se->run_node); + + delete_avl_node(&se->run_node, &cfs_rq->tasks_timeline); +} + +static inline struct avl_node *first_fair(struct cfs_rq *cfs_rq) +{ + return cfs_rq->avl_leftmost; +} + +static struct sched_entity *__pick_next_entity(struct cfs_rq *cfs_rq) +{ + return avl_entry(first_fair(cfs_rq), struct sched_entity, run_node); +} + +static inline struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) +{ + struct avl_node **link = &cfs_rq->tasks_timeline.avl_node; + struct sched_entity *se = NULL; + struct avl_node *parent; + + while (*link) { + parent = *link; + se = avl_entry(parent, struct sched_entity, run_node); + link = &parent->avl_right; + } + + return se; +} + +/************************************************************** + * Scheduling class statistics methods: + */ + +#ifdef CONFIG_SCHED_DEBUG +int sched_nr_latency_handler(struct ctl_table *table, int write, + struct file *filp, void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + int ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos); + + if (ret || !write) + return ret; + + sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency, + sysctl_sched_min_granularity); + + return 0; +} +#endif + +/* + * The idea is to set a period in which each task runs once. + * + * When there are too many tasks (sysctl_sched_nr_latency) we have to stretch + * this period because otherwise the slices get too small. + * + * p = (nr <= nl) ? l : l*nr/nl + */ +static u64 __sched_period(unsigned long nr_running) +{ + u64 period = sysctl_sched_latency; + unsigned long nr_latency = sched_nr_latency; + + if (unlikely(nr_running > nr_latency)) { + period *= nr_running; + do_div(period, nr_latency); + } + + return period; +} + +/* + * We calculate the wall-time slice from the period by taking a part + * proportional to the weight. + * + * s = p*w/rw + */ +static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + u64 slice = __sched_period(cfs_rq->nr_running); + + slice *= se->load.weight; + do_div(slice, cfs_rq->load.weight); + + return slice; +} + +/* + * We calculate the vruntime slice. + * + * vs = s/w = p/rw + */ +static u64 __sched_vslice(unsigned long rq_weight, unsigned long nr_running) +{ + u64 vslice = __sched_period(nr_running); + + vslice *= NICE_0_LOAD; + do_div(vslice, rq_weight); + + return vslice; +} + +static u64 sched_vslice(struct cfs_rq *cfs_rq) +{ + return __sched_vslice(cfs_rq->load.weight, cfs_rq->nr_running); +} + +static u64 sched_vslice_add(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + return __sched_vslice(cfs_rq->load.weight + se->load.weight, + cfs_rq->nr_running + 1); +} + +/* + * Update the current task's runtime statistics. Skip current tasks that + * are not in our scheduling class. 
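+ *
+ * (Sketch of the weighting applied below: delta_exec_weighted is
+ *  roughly delta_exec * NICE_0_LOAD / se->load.weight, so an entity
+ *  carrying twice the nice-0 weight advances its vruntime at about
+ *  half the wall-clock rate and therefore stays leftmost longer.)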
+ */ +static inline void +__update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr, + unsigned long delta_exec) +{ + unsigned long delta_exec_weighted; + u64 vruntime; + + schedstat_set(curr->exec_max, max((u64)delta_exec, curr->exec_max)); + + curr->sum_exec_runtime += delta_exec; + schedstat_add(cfs_rq, exec_clock, delta_exec); + delta_exec_weighted = delta_exec; + if (unlikely(curr->load.weight != NICE_0_LOAD)) { + delta_exec_weighted = calc_delta_fair(delta_exec_weighted, + &curr->load); + } + curr->vruntime += delta_exec_weighted; + + /* + * maintain cfs_rq->min_vruntime to be a monotonic increasing + * value tracking the leftmost vruntime in the tree. + */ + if (first_fair(cfs_rq)) { + vruntime = min_vruntime(curr->vruntime, + __pick_next_entity(cfs_rq)->vruntime); + } else + vruntime = curr->vruntime; + + cfs_rq->min_vruntime = + max_vruntime(cfs_rq->min_vruntime, vruntime); +} + +static void update_curr(struct cfs_rq *cfs_rq) +{ + struct sched_entity *curr = cfs_rq->curr; + u64 now = rq_of(cfs_rq)->clock; + unsigned long delta_exec; + + if (unlikely(!curr)) + return; + + /* + * Get the amount of time the current task was running + * since the last time we changed load (this cannot + * overflow on 32 bits): + */ + delta_exec = (unsigned long)(now - curr->exec_start); + + __update_curr(cfs_rq, curr, delta_exec); + curr->exec_start = now; +} + +static inline void +update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + schedstat_set(se->wait_start, rq_of(cfs_rq)->clock); +} + +/* + * Task is being enqueued - update stats: + */ +static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + /* + * Are we enqueueing a waiting task? (for current tasks + * a dequeue/enqueue event is a NOP) + */ + if (se != cfs_rq->curr) + update_stats_wait_start(cfs_rq, se); +} + +static void +update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + schedstat_set(se->wait_max, max(se->wait_max, + rq_of(cfs_rq)->clock - se->wait_start)); + schedstat_set(se->wait_start, 0); +} + +static inline void +update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + /* + * Mark the end of the wait period if dequeueing a + * waiting task: + */ + if (se != cfs_rq->curr) + update_stats_wait_end(cfs_rq, se); +} + +/* + * We are picking a new current task - update its stats: + */ +static inline void +update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + /* + * We are starting a new run period: + */ + se->exec_start = rq_of(cfs_rq)->clock; +} + +/************************************************** + * Scheduling class queueing methods: + */ + +static void +account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + update_load_add(&cfs_rq->load, se->load.weight); + cfs_rq->nr_running++; + se->on_rq = 1; +} + +static void +account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + update_load_sub(&cfs_rq->load, se->load.weight); + cfs_rq->nr_running--; + se->on_rq = 0; +} + +static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ +#ifdef CONFIG_SCHEDSTATS + if (se->sleep_start) { + u64 delta = rq_of(cfs_rq)->clock - se->sleep_start; + + if ((s64)delta < 0) + delta = 0; + + if (unlikely(delta > se->sleep_max)) + se->sleep_max = delta; + + se->sleep_start = 0; + se->sum_sleep_runtime += delta; + } + if (se->block_start) { + u64 delta = rq_of(cfs_rq)->clock - se->block_start; + + if ((s64)delta < 0) + delta = 0; + + if (unlikely(delta > se->block_max)) + se->block_max 
= delta; + + se->block_start = 0; + se->sum_sleep_runtime += delta; + + /* + * Blocking time is in units of nanosecs, so shift by 20 to + * get a milliseconds-range estimation of the amount of + * time that the task spent sleeping: + */ + if (unlikely(prof_on == SLEEP_PROFILING)) { + struct task_struct *tsk = task_of(se); + + profile_hits(SLEEP_PROFILING, (void *)get_wchan(tsk), + delta >> 20); + } + } +#endif +} + +static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ +#ifdef CONFIG_SCHED_DEBUG + s64 d = se->vruntime - cfs_rq->min_vruntime; + + if (d < 0) + d = -d; + + if (d > 3*sysctl_sched_latency) + schedstat_inc(cfs_rq, nr_spread_over); +#endif +} + +static void +place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) +{ + u64 vruntime; + + vruntime = cfs_rq->min_vruntime; + + if (sched_feat(TREE_AVG)) { + struct sched_entity *last = __pick_last_entity(cfs_rq); + if (last) { + vruntime += last->vruntime; + vruntime >>= 1; + } + } else if (sched_feat(APPROX_AVG) && cfs_rq->nr_running) + vruntime += sched_vslice(cfs_rq)/2; + + /* + * The 'current' period is already promised to the current tasks, + * however the extra weight of the new task will slow them down a + * little, place the new task so that it fits in the slot that + * stays open at the end. + */ + if (initial && sched_feat(START_DEBIT)) + vruntime += sched_vslice_add(cfs_rq, se); + + if (!initial) { + /* sleeps upto a single latency don't count. */ + if (sched_feat(NEW_FAIR_SLEEPERS) && entity_is_task(se) && + task_of(se)->policy != SCHED_BATCH) + vruntime -= sysctl_sched_latency; + + /* ensure we never gain time by being placed backwards. */ + vruntime = max_vruntime(se->vruntime, vruntime); + } + + se->vruntime = vruntime; +} + +static void +enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup) +{ + /* + * Update run-time statistics of the 'current'. + */ + update_curr(cfs_rq); + + if (wakeup) { + place_entity(cfs_rq, se, 0); + enqueue_sleeper(cfs_rq, se); + } + + update_stats_enqueue(cfs_rq, se); + check_spread(cfs_rq, se); + if (se != cfs_rq->curr) + __enqueue_entity(cfs_rq, se); + account_entity_enqueue(cfs_rq, se); +} + +static void +dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep) +{ + /* + * Update run-time statistics of the 'current'. + */ + update_curr(cfs_rq); + + update_stats_dequeue(cfs_rq, se); + if (sleep) { +#ifdef CONFIG_SCHEDSTATS + if (entity_is_task(se)) { + struct task_struct *tsk = task_of(se); + + if (tsk->state & TASK_INTERRUPTIBLE) + se->sleep_start = rq_of(cfs_rq)->clock; + if (tsk->state & TASK_UNINTERRUPTIBLE) + se->block_start = rq_of(cfs_rq)->clock; + } +#endif + } + + if (se != cfs_rq->curr) + __dequeue_entity(cfs_rq, se); + account_entity_dequeue(cfs_rq, se); +} + +/* + * Preempt the current task with a newly woken task if needed: + */ +static void +check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) +{ + unsigned long ideal_runtime, delta_exec; + + ideal_runtime = sched_slice(cfs_rq, curr); + delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime; + if (delta_exec > ideal_runtime) + resched_task(rq_of(cfs_rq)->curr); +} + +static void +set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + /* 'current' is not kept within the tree. */ + if (se->on_rq) { + /* + * Any task has to be enqueued before it get to execute on + * a CPU. So account for the time it spent waiting on the + * runqueue. 
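+ *
+ * ('current' stays out of the tree while it runs and is re-inserted by
+ *  put_prev_entity(), so the tree does not have to be rebalanced every
+ *  time the running entity's vruntime advances.)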
+ */ + update_stats_wait_end(cfs_rq, se); + __dequeue_entity(cfs_rq, se); + } + + update_stats_curr_start(cfs_rq, se); + cfs_rq->curr = se; +#ifdef CONFIG_SCHEDSTATS + /* + * Track our maximum slice length, if the CPU's load is at + * least twice that of our own weight (i.e. dont track it + * when there are only lesser-weight tasks around): + */ + if (rq_of(cfs_rq)->load.weight >= 2*se->load.weight) { + se->slice_max = max(se->slice_max, + se->sum_exec_runtime - se->prev_sum_exec_runtime); + } +#endif + se->prev_sum_exec_runtime = se->sum_exec_runtime; +} + +static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq) +{ + struct sched_entity *se = NULL; + + if (first_fair(cfs_rq)) { + se = __pick_next_entity(cfs_rq); + set_next_entity(cfs_rq, se); + } + + return se; +} + +static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) +{ + /* + * If still on the runqueue then deactivate_task() + * was not called and update_curr() has to be done: + */ + if (prev->on_rq) + update_curr(cfs_rq); + + check_spread(cfs_rq, prev); + if (prev->on_rq) { + update_stats_wait_start(cfs_rq, prev); + /* Put 'current' back into the tree. */ + __enqueue_entity(cfs_rq, prev); + } + cfs_rq->curr = NULL; +} + +static void entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) +{ + /* + * Update run-time statistics of the 'current'. + */ + update_curr(cfs_rq); + + if (cfs_rq->nr_running > 1 || !sched_feat(WAKEUP_PREEMPT)) + check_preempt_tick(cfs_rq, curr); +} + +/************************************************** + * CFS operations on tasks: + */ + +#ifdef CONFIG_FAIR_GROUP_SCHED + +/* Walk up scheduling entities hierarchy */ +#define for_each_sched_entity(se) \ + for (; se; se = se->parent) + +static inline struct cfs_rq *task_cfs_rq(struct task_struct *p) +{ + return p->se.cfs_rq; +} + +/* runqueue on which this entity is (to be) queued */ +static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se) +{ + return se->cfs_rq; +} + +/* runqueue "owned" by this group */ +static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) +{ + return grp->my_q; +} + +/* Given a group's cfs_rq on one cpu, return its corresponding cfs_rq on + * another cpu ('this_cpu') + */ +static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu) +{ + return cfs_rq->tg->cfs_rq[this_cpu]; +} + +/* Iterate thr' all leaf cfs_rq's on a runqueue */ +#define for_each_leaf_cfs_rq(rq, cfs_rq) \ + list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list) + +/* Do the two (enqueued) entities belong to the same group ? 
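+ * (Used by check_preempt_wakeup() below: both entities are walked up
+ *  via parent_entity() until they share a cfs_rq, so vruntimes are
+ *  only ever compared within a single group.)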
*/ +static inline int +is_same_group(struct sched_entity *se, struct sched_entity *pse) +{ + if (se->cfs_rq == pse->cfs_rq) + return 1; + + return 0; +} + +static inline struct sched_entity *parent_entity(struct sched_entity *se) +{ + return se->parent; +} + +#define GROUP_IMBALANCE_PCT 20 + +#else /* CONFIG_FAIR_GROUP_SCHED */ + +#define for_each_sched_entity(se) \ + for (; se; se = NULL) + +static inline struct cfs_rq *task_cfs_rq(struct task_struct *p) +{ + return &task_rq(p)->cfs; +} + +static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se) +{ + struct task_struct *p = task_of(se); + struct rq *rq = task_rq(p); + + return &rq->cfs; +} + +/* runqueue "owned" by this group */ +static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) +{ + return NULL; +} + +static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu) +{ + return &cpu_rq(this_cpu)->cfs; +} + +#define for_each_leaf_cfs_rq(rq, cfs_rq) \ + for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL) + +static inline int +is_same_group(struct sched_entity *se, struct sched_entity *pse) +{ + return 1; +} + +static inline struct sched_entity *parent_entity(struct sched_entity *se) +{ + return NULL; +} + +#endif /* CONFIG_FAIR_GROUP_SCHED */ + +/* + * The enqueue_task method is called before nr_running is + * increased. Here we update the fair scheduling stats and + * then put the task into the rbtree: + */ +static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup) +{ + struct cfs_rq *cfs_rq; + struct sched_entity *se = &p->se; + unsigned long old_load = rq->cfs.load.weight, new_load, delta_load; + + for_each_sched_entity(se) { + if (se->on_rq) + break; + cfs_rq = cfs_rq_of(se); + enqueue_entity(cfs_rq, se, wakeup); + wakeup = 1; + } + + new_load = rq->cfs.load.weight; + delta_load = new_load - old_load; + inc_load(rq, delta_load); +} + +/* + * The dequeue_task method is called before nr_running is + * decreased. We remove the task from the rbtree and + * update the fair scheduling stats: + */ +static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep) +{ + struct cfs_rq *cfs_rq; + struct sched_entity *se = &p->se; + unsigned long old_load = rq->cfs.load.weight, new_load, delta_load; + + for_each_sched_entity(se) { + cfs_rq = cfs_rq_of(se); + dequeue_entity(cfs_rq, se, sleep); + /* Don't dequeue parent if it has other entities besides us */ + if (cfs_rq->load.weight) + break; + sleep = 1; + } + + new_load = rq->cfs.load.weight; + delta_load = old_load - new_load; + dec_load(rq, delta_load); +} + +/* + * sched_yield() support is very simple - we dequeue and enqueue. + * + * If compat_yield is turned on then we requeue to the end of the tree. + */ +static void yield_task_fair(struct rq *rq) +{ + struct cfs_rq *cfs_rq = task_cfs_rq(rq->curr); + struct sched_entity *rightmost, *se = &rq->curr->se; + + /* + * Are we the only task in the tree? + */ + if (unlikely(cfs_rq->nr_running == 1)) + return; + + if (likely(!sysctl_sched_compat_yield)) { + __update_rq_clock(rq); + /* + * Update run-time statistics of the 'current'. + */ + update_curr(cfs_rq); + + return; + } + /* + * Find the rightmost entry in the rbtree: + */ + rightmost = __pick_last_entity(cfs_rq); + /* + * Already in the rightmost position? + */ + if (unlikely(rightmost->vruntime < se->vruntime)) + return; + + /* + * Minimally necessary key value to be last in the tree: + * Upon rescheduling, sched_class::put_prev_task() will place + * 'current' within the tree based on its new key value. 
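+ *
+ * (This is the sysctl_sched_compat_yield=1 path: bumping our vruntime
+ *  just past the rightmost entity requeues the yielding task at the
+ *  tail of the tree, approximating the old scheduler's aggressive
+ *  sched_yield() semantics.)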
+ */ + se->vruntime = rightmost->vruntime + 1; +} + +/* + * Preempt the current task with a newly woken task if needed: + */ +static void check_preempt_wakeup(struct rq *rq, struct task_struct *p) +{ + struct task_struct *curr = rq->curr; + struct cfs_rq *cfs_rq = task_cfs_rq(curr); + struct sched_entity *se = &curr->se, *pse = &p->se; + unsigned long gran; + + if (unlikely(rt_prio(p->prio))) { + update_rq_clock(rq); + update_curr(cfs_rq); + resched_task(curr); + return; + } + /* + * Batch tasks do not preempt (their preemption is driven by + * the tick): + */ + if (unlikely(p->policy == SCHED_BATCH)) + return; + + if (!sched_feat(WAKEUP_PREEMPT)) + return; + + while (!is_same_group(se, pse)) { + se = parent_entity(se); + pse = parent_entity(pse); + } + + gran = sysctl_sched_wakeup_granularity; + if (unlikely(se->load.weight != NICE_0_LOAD)) + gran = calc_delta_fair(gran, &se->load); + + if (pse->vruntime + gran < se->vruntime) + resched_task(curr); +} + +static struct task_struct *pick_next_task_fair(struct rq *rq) +{ + struct cfs_rq *cfs_rq = &rq->cfs; + struct sched_entity *se; + + if (unlikely(!cfs_rq->nr_running)) + return NULL; + + do { + se = pick_next_entity(cfs_rq); + cfs_rq = group_cfs_rq(se); + } while (cfs_rq); + + return task_of(se); +} + +/* + * Account for a descheduled task: + */ +static void put_prev_task_fair(struct rq *rq, struct task_struct *prev) +{ + struct sched_entity *se = &prev->se; + struct cfs_rq *cfs_rq; + + for_each_sched_entity(se) { + cfs_rq = cfs_rq_of(se); + put_prev_entity(cfs_rq, se); + } +} + +#ifdef CONFIG_SMP +/************************************************** + * Fair scheduling class load-balancing methods: + */ + +/* + * Load-balancing iterator. Note: while the runqueue stays locked + * during the whole iteration, the current task might be + * dequeued so the iterator has to be dequeue-safe. Here we + * achieve that by always pre-iterating before returning + * the current task: + */ +static struct task_struct * +__load_balance_iterator(struct cfs_rq *cfs_rq, struct avl_node *curr) +{ + struct task_struct *p; + + if (!curr) + return NULL; + + p = avl_entry(curr, struct task_struct, se.run_node); + cfs_rq->avl_load_balance_curr = avl_next(curr); + + return p; +} + +static struct task_struct *load_balance_start_fair(void *arg) +{ + struct cfs_rq *cfs_rq = arg; + + return __load_balance_iterator(cfs_rq, first_fair(cfs_rq)); +} + +static struct task_struct *load_balance_next_fair(void *arg) +{ + struct cfs_rq *cfs_rq = arg; + + return __load_balance_iterator(cfs_rq, cfs_rq->avl_load_balance_curr); +} + +static unsigned long +load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, + unsigned long max_load_move, + struct sched_domain *sd, enum cpu_idle_type idle, + int *all_pinned, int *this_best_prio) +{ + struct cfs_rq *busy_cfs_rq; + long rem_load_move = max_load_move; + struct rq_iterator cfs_rq_iterator; + unsigned long load_moved; + + cfs_rq_iterator.start = load_balance_start_fair; + cfs_rq_iterator.next = load_balance_next_fair; + + for_each_leaf_cfs_rq(busiest, busy_cfs_rq) { +#ifdef CONFIG_FAIR_GROUP_SCHED + struct cfs_rq *this_cfs_rq = busy_cfs_rq->tg->cfs_rq[this_cpu]; + unsigned long maxload, task_load, group_weight; + unsigned long thisload, per_task_load; + struct sched_entity *se = busy_cfs_rq->tg->se[busiest->cpu]; + + task_load = busy_cfs_rq->load.weight; + group_weight = se->load.weight; + + /* + * 'group_weight' is contributed by tasks of total weight + * 'task_load'. 
To move 'rem_load_move' worth of weight only, + * we need to move a maximum task load of: + * + * maxload = (remload / group_weight) * task_load; + */ + maxload = (rem_load_move * task_load) / group_weight; + + if (!maxload || !task_load) + continue; + + per_task_load = task_load/busy_cfs_rq->nr_running; + + /* balance_tasks will try to forcibly move atleast one task if + * possible (because of SCHED_LOAD_SCALE_FUZZ). Avoid that if + * maxload is less than GROUP_IMBALANCE_FUZZ% the per_task_load. + */ + if (100 * maxload < GROUP_IMBALANCE_PCT * per_task_load) + continue; + + *this_best_prio = 0; + thisload = this_cfs_rq->load.weight; +#else +# define maxload rem_load_move +#endif + /* + * pass busy_cfs_rq argument into + * load_balance_[start|next]_fair iterators + */ + cfs_rq_iterator.arg = busy_cfs_rq; + load_moved = balance_tasks(this_rq, this_cpu, busiest, + maxload, sd, idle, all_pinned, + this_best_prio, + &cfs_rq_iterator); + +#ifdef CONFIG_FAIR_GROUP_SCHED + /* load_moved holds the task load that was moved. The + * effective weight moved would be: + * load_moved_eff = load_moved/task_load * group_weight; + */ + load_moved = (group_weight * load_moved) / task_load; + + /* Adjust shares on both cpus to reflect load_moved */ + group_weight -= load_moved; + set_se_shares(se, group_weight); + + se = busy_cfs_rq->tg->se[this_cpu]; + if (!thisload) + group_weight = load_moved; + else + group_weight = se->load.weight + load_moved; + set_se_shares(se, group_weight); +#endif + + rem_load_move -= load_moved; + if (rem_load_move <= 0) + break; + } + + return max_load_move - rem_load_move; +} + +static int +move_one_task_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, + struct sched_domain *sd, enum cpu_idle_type idle) +{ + struct cfs_rq *busy_cfs_rq; + struct rq_iterator cfs_rq_iterator; + + cfs_rq_iterator.start = load_balance_start_fair; + cfs_rq_iterator.next = load_balance_next_fair; + + for_each_leaf_cfs_rq(busiest, busy_cfs_rq) { + /* + * pass busy_cfs_rq argument into + * load_balance_[start|next]_fair iterators + */ + cfs_rq_iterator.arg = busy_cfs_rq; + if (iter_move_one_task(this_rq, this_cpu, busiest, sd, idle, + &cfs_rq_iterator)) + return 1; + } + + return 0; +} +#endif + +/* + * scheduler tick hitting a task of our scheduling class: + */ +static void task_tick_fair(struct rq *rq, struct task_struct *curr) +{ + struct cfs_rq *cfs_rq; + struct sched_entity *se = &curr->se; + + for_each_sched_entity(se) { + cfs_rq = cfs_rq_of(se); + entity_tick(cfs_rq, se); + } +} + +#define swap(a, b) do { typeof(a) tmp = (a); (a) = (b); (b) = tmp; } while (0) + +/* + * Share the fairness runtime between parent and child, thus the + * total amount of pressure for CPU stays equal - new tasks + * get a chance to run but frequent forkers are not allowed to + * monopolize the CPU. Note: the parent runqueue is locked, + * the child is not running yet. + */ +static void task_new_fair(struct rq *rq, struct task_struct *p) +{ + struct cfs_rq *cfs_rq = task_cfs_rq(p); + struct sched_entity *se = &p->se, *curr = cfs_rq->curr; + int this_cpu = smp_processor_id(); + + sched_info_queued(p); + + update_curr(cfs_rq); + place_entity(cfs_rq, se, 1); + + /* 'curr' will be NULL if the child belongs to a different group */ + if (sysctl_sched_child_runs_first && this_cpu == task_cpu(p) && + curr && curr->vruntime < se->vruntime) { + /* + * Upon rescheduling, sched_class::put_prev_task() will place + * 'current' within the tree based on its new key value. 
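+ *
+ * (The swap below hands the smaller vruntime to the child, so the
+ *  child becomes leftmost and runs before the parent; with
+ *  sched_child_runs_first=0, or when the child was placed on another
+ *  CPU, the swap is skipped and the parent keeps its position.)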
+ */ + swap(curr->vruntime, se->vruntime); + } + + enqueue_task_fair(rq, p, 0); + resched_task(rq->curr); +} + +/* Account for a task changing its policy or group. + * + * This routine is mostly called to set cfs_rq->curr field when a task + * migrates between groups/classes. + */ +static void set_curr_task_fair(struct rq *rq) +{ + struct sched_entity *se = &rq->curr->se; + + for_each_sched_entity(se) + set_next_entity(cfs_rq_of(se), se); +} + +/* + * All the scheduling class methods: + */ +static const struct sched_class fair_sched_class = { + .next = &idle_sched_class, + .enqueue_task = enqueue_task_fair, + .dequeue_task = dequeue_task_fair, + .yield_task = yield_task_fair, + + .check_preempt_curr = check_preempt_wakeup, + + .pick_next_task = pick_next_task_fair, + .put_prev_task = put_prev_task_fair, + +#ifdef CONFIG_SMP + .load_balance = load_balance_fair, + .move_one_task = move_one_task_fair, +#endif + + .set_curr_task = set_curr_task_fair, + .task_tick = task_tick_fair, + .task_new = task_new_fair, +}; + +#ifdef CONFIG_SCHED_DEBUG +static void print_cfs_stats(struct seq_file *m, int cpu) +{ + struct cfs_rq *cfs_rq; + +#ifdef CONFIG_FAIR_GROUP_SCHED + print_cfs_rq(m, cpu, &cpu_rq(cpu)->cfs); +#endif + + rcu_read_lock(); + for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq) + print_cfs_rq(m, cpu, cfs_rq); + rcu_read_unlock(); +} +#endif Index: linux-cfs-2.6.22.13.q/kernel/sched_idletask.c =================================================================== --- /dev/null +++ linux-cfs-2.6.22.13.q/kernel/sched_idletask.c @@ -0,0 +1,89 @@ +/* + * idle-task scheduling class. + * + * (NOTE: these are not related to SCHED_IDLE tasks which are + * handled in sched_fair.c) + */ + +/* + * Idle tasks are unconditionally rescheduled: + */ +static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p) +{ + resched_task(rq->idle); +} + +static struct task_struct *pick_next_task_idle(struct rq *rq) +{ + schedstat_inc(rq, sched_goidle); + + return rq->idle; +} + +/* + * It is not legal to sleep in the idle task - print a warning + * message if some code attempts to do it: + */ +static void +dequeue_task_idle(struct rq *rq, struct task_struct *p, int sleep) +{ + spin_unlock_irq(&rq->lock); + printk(KERN_ERR "bad: scheduling from the idle thread!\n"); + dump_stack(); + spin_lock_irq(&rq->lock); +} + +static void put_prev_task_idle(struct rq *rq, struct task_struct *prev) +{ +} + +#ifdef CONFIG_SMP +static unsigned long +load_balance_idle(struct rq *this_rq, int this_cpu, struct rq *busiest, + unsigned long max_load_move, + struct sched_domain *sd, enum cpu_idle_type idle, + int *all_pinned, int *this_best_prio) +{ + return 0; +} + +static int +move_one_task_idle(struct rq *this_rq, int this_cpu, struct rq *busiest, + struct sched_domain *sd, enum cpu_idle_type idle) +{ + return 0; +} +#endif + +static void task_tick_idle(struct rq *rq, struct task_struct *curr) +{ +} + +static void set_curr_task_idle(struct rq *rq) +{ +} + +/* + * Simple, special scheduling class for the per-CPU idle tasks: + */ +const struct sched_class idle_sched_class = { + /* .next is NULL */ + /* no enqueue/yield_task for idle tasks */ + + /* dequeue is not valid, we print a debug message there: */ + .dequeue_task = dequeue_task_idle, + + .check_preempt_curr = check_preempt_curr_idle, + + .pick_next_task = pick_next_task_idle, + .put_prev_task = put_prev_task_idle, + +#ifdef CONFIG_SMP + .load_balance = load_balance_idle, + .move_one_task = move_one_task_idle, +#endif + + .set_curr_task = set_curr_task_idle, + .task_tick = 
task_tick_idle, + /* no .task_new for idle tasks */ +}; Index: linux-cfs-2.6.22.13.q/kernel/sched_rt.c =================================================================== --- /dev/null +++ linux-cfs-2.6.22.13.q/kernel/sched_rt.c @@ -0,0 +1,259 @@ +/* + * Real-Time Scheduling Class (mapped to the SCHED_FIFO and SCHED_RR + * policies) + */ + +/* + * Update the current task's runtime statistics. Skip current tasks that + * are not in our scheduling class. + */ +static void update_curr_rt(struct rq *rq) +{ + struct task_struct *curr = rq->curr; + u64 delta_exec; + + if (!task_has_rt_policy(curr)) + return; + + delta_exec = rq->clock - curr->se.exec_start; + if (unlikely((s64)delta_exec < 0)) + delta_exec = 0; + + schedstat_set(curr->se.exec_max, max(curr->se.exec_max, delta_exec)); + + curr->se.sum_exec_runtime += delta_exec; + curr->se.exec_start = rq->clock; +} + +static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup) +{ + struct rt_prio_array *array = &rq->rt.active; + + list_add_tail(&p->run_list, array->queue + p->prio); + __set_bit(p->prio, array->bitmap); + inc_load(rq, p->se.load.weight); +} + +/* + * Adding/removing a task to/from a priority array: + */ +static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep) +{ + struct rt_prio_array *array = &rq->rt.active; + + update_curr_rt(rq); + + list_del(&p->run_list); + if (list_empty(array->queue + p->prio)) + __clear_bit(p->prio, array->bitmap); + dec_load(rq, p->se.load.weight); +} + +/* + * Put task to the end of the run list without the overhead of dequeue + * followed by enqueue. + */ +static void requeue_task_rt(struct rq *rq, struct task_struct *p) +{ + struct rt_prio_array *array = &rq->rt.active; + + list_move_tail(&p->run_list, array->queue + p->prio); +} + +static void +yield_task_rt(struct rq *rq) +{ + requeue_task_rt(rq, rq->curr); +} + +/* + * Preempt the current task with a newly woken task if needed: + */ +static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p) +{ + if (p->prio < rq->curr->prio) + resched_task(rq->curr); +} + +static struct task_struct *pick_next_task_rt(struct rq *rq) +{ + struct rt_prio_array *array = &rq->rt.active; + struct task_struct *next; + struct list_head *queue; + int idx; + + idx = sched_find_first_bit(array->bitmap); + if (idx >= MAX_RT_PRIO) + return NULL; + + queue = array->queue + idx; + next = list_entry(queue->next, struct task_struct, run_list); + + next->se.exec_start = rq->clock; + + return next; +} + +static void put_prev_task_rt(struct rq *rq, struct task_struct *p) +{ + update_curr_rt(rq); + p->se.exec_start = 0; +} + +#ifdef CONFIG_SMP +/* + * Load-balancing iterator. Note: while the runqueue stays locked + * during the whole iteration, the current task might be + * dequeued so the iterator has to be dequeue-safe. 
Here we + * achieve that by always pre-iterating before returning + * the current task: + */ +static struct task_struct *load_balance_start_rt(void *arg) +{ + struct rq *rq = arg; + struct rt_prio_array *array = &rq->rt.active; + struct list_head *head, *curr; + struct task_struct *p; + int idx; + + idx = sched_find_first_bit(array->bitmap); + if (idx >= MAX_RT_PRIO) + return NULL; + + head = array->queue + idx; + curr = head->prev; + + p = list_entry(curr, struct task_struct, run_list); + + curr = curr->prev; + + rq->rt.rt_load_balance_idx = idx; + rq->rt.rt_load_balance_head = head; + rq->rt.rt_load_balance_curr = curr; + + return p; +} + +static struct task_struct *load_balance_next_rt(void *arg) +{ + struct rq *rq = arg; + struct rt_prio_array *array = &rq->rt.active; + struct list_head *head, *curr; + struct task_struct *p; + int idx; + + idx = rq->rt.rt_load_balance_idx; + head = rq->rt.rt_load_balance_head; + curr = rq->rt.rt_load_balance_curr; + + /* + * If we arrived back to the head again then + * iterate to the next queue (if any): + */ + if (unlikely(head == curr)) { + int next_idx = find_next_bit(array->bitmap, MAX_RT_PRIO, idx+1); + + if (next_idx >= MAX_RT_PRIO) + return NULL; + + idx = next_idx; + head = array->queue + idx; + curr = head->prev; + + rq->rt.rt_load_balance_idx = idx; + rq->rt.rt_load_balance_head = head; + } + + p = list_entry(curr, struct task_struct, run_list); + + curr = curr->prev; + + rq->rt.rt_load_balance_curr = curr; + + return p; +} + +static unsigned long +load_balance_rt(struct rq *this_rq, int this_cpu, struct rq *busiest, + unsigned long max_load_move, + struct sched_domain *sd, enum cpu_idle_type idle, + int *all_pinned, int *this_best_prio) +{ + struct rq_iterator rt_rq_iterator; + + rt_rq_iterator.start = load_balance_start_rt; + rt_rq_iterator.next = load_balance_next_rt; + /* pass 'busiest' rq argument into + * load_balance_[start|next]_rt iterators + */ + rt_rq_iterator.arg = busiest; + + return balance_tasks(this_rq, this_cpu, busiest, max_load_move, sd, + idle, all_pinned, this_best_prio, &rt_rq_iterator); +} + +static int +move_one_task_rt(struct rq *this_rq, int this_cpu, struct rq *busiest, + struct sched_domain *sd, enum cpu_idle_type idle) +{ + struct rq_iterator rt_rq_iterator; + + rt_rq_iterator.start = load_balance_start_rt; + rt_rq_iterator.next = load_balance_next_rt; + rt_rq_iterator.arg = busiest; + + return iter_move_one_task(this_rq, this_cpu, busiest, sd, idle, + &rt_rq_iterator); +} +#endif + +static void task_tick_rt(struct rq *rq, struct task_struct *p) +{ + /* + * RR tasks need a special form of timeslice management. + * FIFO tasks have no timeslices. 
+ */ + if (p->policy != SCHED_RR) + return; + + if (--p->time_slice) + return; + + p->time_slice = DEF_TIMESLICE; + + /* + * Requeue to the end of queue if we are not the only element + * on the queue: + */ + if (p->run_list.prev != p->run_list.next) { + requeue_task_rt(rq, p); + set_tsk_need_resched(p); + } +} + +static void set_curr_task_rt(struct rq *rq) +{ + struct task_struct *p = rq->curr; + + p->se.exec_start = rq->clock; +} + +const struct sched_class rt_sched_class = { + .next = &fair_sched_class, + .enqueue_task = enqueue_task_rt, + .dequeue_task = dequeue_task_rt, + .yield_task = yield_task_rt, + + .check_preempt_curr = check_preempt_curr_rt, + + .pick_next_task = pick_next_task_rt, + .put_prev_task = put_prev_task_rt, + +#ifdef CONFIG_SMP + .load_balance = load_balance_rt, + .move_one_task = move_one_task_rt, +#endif + + .set_curr_task = set_curr_task_rt, + .task_tick = task_tick_rt, +}; Index: linux-cfs-2.6.22.13.q/kernel/sched_stats.h =================================================================== --- /dev/null +++ linux-cfs-2.6.22.13.q/kernel/sched_stats.h @@ -0,0 +1,236 @@ + +#ifdef CONFIG_SCHEDSTATS +/* + * bump this up when changing the output format or the meaning of an existing + * format, so that tools can adapt (or abort) + */ +#define SCHEDSTAT_VERSION 14 + +static int show_schedstat(struct seq_file *seq, void *v) +{ + int cpu; + + seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION); + seq_printf(seq, "timestamp %lu\n", jiffies); + for_each_online_cpu(cpu) { + struct rq *rq = cpu_rq(cpu); +#ifdef CONFIG_SMP + struct sched_domain *sd; + int dcount = 0; +#endif + + /* runqueue-specific stats */ + seq_printf(seq, + "cpu%d %u %u %u %u %u %u %u %u %u %llu %llu %lu", + cpu, rq->yld_both_empty, + rq->yld_act_empty, rq->yld_exp_empty, rq->yld_count, + rq->sched_switch, rq->sched_count, rq->sched_goidle, + rq->ttwu_count, rq->ttwu_local, + rq->rq_sched_info.cpu_time, + rq->rq_sched_info.run_delay, rq->rq_sched_info.pcount); + + seq_printf(seq, "\n"); + +#ifdef CONFIG_SMP + /* domain-specific stats */ + preempt_disable(); + for_each_domain(cpu, sd) { + enum cpu_idle_type itype; + char mask_str[NR_CPUS]; + + cpumask_scnprintf(mask_str, NR_CPUS, sd->span); + seq_printf(seq, "domain%d %s", dcount++, mask_str); + for (itype = CPU_IDLE; itype < CPU_MAX_IDLE_TYPES; + itype++) { + seq_printf(seq, " %u %u %u %u %u %u %u %u", + sd->lb_count[itype], + sd->lb_balanced[itype], + sd->lb_failed[itype], + sd->lb_imbalance[itype], + sd->lb_gained[itype], + sd->lb_hot_gained[itype], + sd->lb_nobusyq[itype], + sd->lb_nobusyg[itype]); + } + seq_printf(seq, " %u %u %u %u %u %u %u %u %u %u %u %u\n", + sd->alb_count, sd->alb_failed, sd->alb_pushed, + sd->sbe_count, sd->sbe_balanced, sd->sbe_pushed, + sd->sbf_count, sd->sbf_balanced, sd->sbf_pushed, + sd->ttwu_wake_remote, sd->ttwu_move_affine, + sd->ttwu_move_balance); + } + preempt_enable(); +#endif + } + return 0; +} + +static int schedstat_open(struct inode *inode, struct file *file) +{ + unsigned int size = PAGE_SIZE * (1 + num_online_cpus() / 32); + char *buf = kmalloc(size, GFP_KERNEL); + struct seq_file *m; + int res; + + if (!buf) + return -ENOMEM; + res = single_open(file, show_schedstat, NULL); + if (!res) { + m = file->private_data; + m->buf = buf; + m->size = size; + } else + kfree(buf); + return res; +} + +const struct file_operations proc_schedstat_operations = { + .open = schedstat_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +/* + * Expects runqueue lock to be held for atomicity of update + 
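+ * (The deltas passed in are nanoseconds, derived from rq->clock by the
+ *  sched_info_* helpers further down.)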
*/ +static inline void +rq_sched_info_arrive(struct rq *rq, unsigned long long delta) +{ + if (rq) { + rq->rq_sched_info.run_delay += delta; + rq->rq_sched_info.pcount++; + } +} + +/* + * Expects runqueue lock to be held for atomicity of update + */ +static inline void +rq_sched_info_depart(struct rq *rq, unsigned long long delta) +{ + if (rq) + rq->rq_sched_info.cpu_time += delta; +} +# define schedstat_inc(rq, field) do { (rq)->field++; } while (0) +# define schedstat_add(rq, field, amt) do { (rq)->field += (amt); } while (0) +# define schedstat_set(var, val) do { var = (val); } while (0) +#else /* !CONFIG_SCHEDSTATS */ +static inline void +rq_sched_info_arrive(struct rq *rq, unsigned long long delta) +{} +static inline void +rq_sched_info_depart(struct rq *rq, unsigned long long delta) +{} +# define schedstat_inc(rq, field) do { } while (0) +# define schedstat_add(rq, field, amt) do { } while (0) +# define schedstat_set(var, val) do { } while (0) +#endif + +#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) +/* + * Called when a process is dequeued from the active array and given + * the cpu. We should note that with the exception of interactive + * tasks, the expired queue will become the active queue after the active + * queue is empty, without explicitly dequeuing and requeuing tasks in the + * expired queue. (Interactive tasks may be requeued directly to the + * active queue, thus delaying tasks in the expired queue from running; + * see scheduler_tick()). + * + * This function is only called from sched_info_arrive(), rather than + * dequeue_task(). Even though a task may be queued and dequeued multiple + * times as it is shuffled about, we're really interested in knowing how + * long it was from the *first* time it was queued to the time that it + * finally hit a cpu. + */ +static inline void sched_info_dequeued(struct task_struct *t) +{ + t->sched_info.last_queued = 0; +} + +/* + * Called when a task finally hits the cpu. We can now calculate how + * long it was waiting to run. We also note when it began so that we + * can keep stats on how long its timeslice is. + */ +static void sched_info_arrive(struct task_struct *t) +{ + unsigned long long now = task_rq(t)->clock, delta = 0; + + if (t->sched_info.last_queued) + delta = now - t->sched_info.last_queued; + sched_info_dequeued(t); + t->sched_info.run_delay += delta; + t->sched_info.last_arrival = now; + t->sched_info.pcount++; + + rq_sched_info_arrive(task_rq(t), delta); +} + +/* + * Called when a process is queued into either the active or expired + * array. The time is noted and later used to determine how long we + * had to wait for us to reach the cpu. Since the expired queue will + * become the active queue after active queue is empty, without dequeuing + * and requeuing any tasks, we are interested in queuing to either. It + * is unusual but not impossible for tasks to be dequeued and immediately + * requeued in the same or another array: this can happen in sched_yield(), + * set_user_nice(), and even load_balance() as it moves tasks from runqueue + * to runqueue. + * + * This function is only called from enqueue_task(), but also only updates + * the timestamp if it is already not set. It's assumed that + * sched_info_dequeued() will clear that stamp when appropriate. 
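+ *
+ * (The last_queued/run_delay bookkeeping kept here feeds both
+ *  /proc/<pid>/schedstat under CONFIG_SCHEDSTATS and the delay
+ *  accounting (taskstats) path under CONFIG_TASK_DELAY_ACCT.)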
+ */ +static inline void sched_info_queued(struct task_struct *t) +{ + if (unlikely(sched_info_on())) + if (!t->sched_info.last_queued) + t->sched_info.last_queued = task_rq(t)->clock; +} + +/* + * Called when a process ceases being the active-running process, either + * voluntarily or involuntarily. Now we can calculate how long we ran. + */ +static inline void sched_info_depart(struct task_struct *t) +{ + unsigned long long delta = task_rq(t)->clock - + t->sched_info.last_arrival; + + t->sched_info.cpu_time += delta; + rq_sched_info_depart(task_rq(t), delta); +} + +/* + * Called when tasks are switched involuntarily due, typically, to expiring + * their time slice. (This may also be called when switching to or from + * the idle task.) We are only called when prev != next. + */ +static inline void +__sched_info_switch(struct task_struct *prev, struct task_struct *next) +{ + struct rq *rq = task_rq(prev); + + /* + * prev now departs the cpu. It's not interesting to record + * stats about how efficient we were at scheduling the idle + * process, however. + */ + if (prev != rq->idle) + sched_info_depart(prev); + + if (next != rq->idle) + sched_info_arrive(next); +} +static inline void +sched_info_switch(struct task_struct *prev, struct task_struct *next) +{ + if (unlikely(sched_info_on())) + __sched_info_switch(prev, next); +} +#else +#define sched_info_queued(t) do { } while (0) +#define sched_info_switch(t, next) do { } while (0) +#endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */ + Index: linux-cfs-2.6.22.13.q/kernel/softirq.c =================================================================== --- linux-cfs-2.6.22.13.q.orig/kernel/softirq.c +++ linux-cfs-2.6.22.13.q/kernel/softirq.c @@ -488,7 +488,6 @@ void __init softirq_init(void) static int ksoftirqd(void * __bind_cpu) { - set_user_nice(current, 19); current->flags |= PF_NOFREEZE; set_current_state(TASK_INTERRUPTIBLE); Index: linux-cfs-2.6.22.13.q/kernel/sys.c =================================================================== --- linux-cfs-2.6.22.13.q.orig/kernel/sys.c +++ linux-cfs-2.6.22.13.q/kernel/sys.c @@ -31,11 +31,13 @@ #include #include #include +#include #include #include #include #include +#include #include #include @@ -1080,13 +1082,13 @@ static int set_user(uid_t new_ruid, int { struct user_struct *new_user; - new_user = alloc_uid(new_ruid); + new_user = alloc_uid(current->nsproxy->user_ns, new_ruid); if (!new_user) return -EAGAIN; if (atomic_read(&new_user->processes) >= current->signal->rlim[RLIMIT_NPROC].rlim_cur && - new_user != &root_user) { + new_user != current->nsproxy->user_ns->root_user) { free_uid(new_user); return -EAGAIN; } Index: linux-cfs-2.6.22.13.q/kernel/sysctl.c =================================================================== --- linux-cfs-2.6.22.13.q.orig/kernel/sysctl.c +++ linux-cfs-2.6.22.13.q/kernel/sysctl.c @@ -202,11 +202,135 @@ static ctl_table root_table[] = { .mode = 0555, .child = dev_table, }, - +/* + * NOTE: do not add new entries to this table unless you have read + * Documentation/sysctl/ctl_unnumbered.txt + */ { .ctl_name = 0 } }; -static ctl_table kern_table[] = { +#ifdef CONFIG_SCHED_DEBUG +static unsigned long min_sched_granularity_ns = 100000; /* 100 usecs */ +static unsigned long max_sched_granularity_ns = NSEC_PER_SEC; /* 1 second */ +static unsigned long min_wakeup_granularity_ns; /* 0 usecs */ +static unsigned long max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */ +#endif + +static struct ctl_table kern_table[] = { +#ifdef CONFIG_SCHED_DEBUG + { + .ctl_name = 
CTL_UNNUMBERED, + .procname = "sched_min_granularity_ns", + .data = &sysctl_sched_min_granularity, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &sched_nr_latency_handler, + .strategy = &sysctl_intvec, + .extra1 = &min_sched_granularity_ns, + .extra2 = &max_sched_granularity_ns, + }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "sched_latency_ns", + .data = &sysctl_sched_latency, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &sched_nr_latency_handler, + .strategy = &sysctl_intvec, + .extra1 = &min_sched_granularity_ns, + .extra2 = &max_sched_granularity_ns, + }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "sched_wakeup_granularity_ns", + .data = &sysctl_sched_wakeup_granularity, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &min_wakeup_granularity_ns, + .extra2 = &max_wakeup_granularity_ns, + }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "sched_batch_wakeup_granularity_ns", + .data = &sysctl_sched_batch_wakeup_granularity, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &min_wakeup_granularity_ns, + .extra2 = &max_wakeup_granularity_ns, + }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "sched_child_runs_first", + .data = &sysctl_sched_child_runs_first, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "sched_features", + .data = &sysctl_sched_features, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "sched_migration_cost", + .data = &sysctl_sched_migration_cost, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "sched_nr_migrate", + .data = &sysctl_sched_nr_migrate, + .maxlen = sizeof(unsigned int), + .mode = 644, + .proc_handler = &proc_dointvec, + }, +#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP) + { + .ctl_name = CTL_UNNUMBERED, + .procname = "sched_min_bal_int_shares", + .data = &sysctl_sched_min_bal_int_shares, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "sched_max_bal_int_shares", + .data = &sysctl_sched_max_bal_int_shares, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#endif +#endif + { + .ctl_name = CTL_UNNUMBERED, + .procname = "sched_compat_yield", + .data = &sysctl_sched_compat_yield, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#ifdef CONFIG_PROVE_LOCKING + { + .ctl_name = CTL_UNNUMBERED, + .procname = "prove_locking", + .data = &prove_locking, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#endif { .ctl_name = KERN_PANIC, .procname = "panic", Index: linux-cfs-2.6.22.13.q/kernel/timer.c =================================================================== --- linux-cfs-2.6.22.13.q.orig/kernel/timer.c +++ linux-cfs-2.6.22.13.q/kernel/timer.c @@ -812,10 +812,13 @@ void update_process_times(int user_tick) int cpu = smp_processor_id(); /* Note: this timer irq context must be accounted for as well. 
*/ - if (user_tick) + if (user_tick) { account_user_time(p, jiffies_to_cputime(1)); - else + account_user_time_scaled(p, jiffies_to_cputime(1)); + } else { account_system_time(p, HARDIRQ_OFFSET, jiffies_to_cputime(1)); + account_system_time_scaled(p, jiffies_to_cputime(1)); + } run_local_timers(); if (rcu_pending(cpu)) rcu_check_callbacks(cpu, user_tick); Index: linux-cfs-2.6.22.13.q/kernel/tsacct.c =================================================================== --- linux-cfs-2.6.22.13.q.orig/kernel/tsacct.c +++ linux-cfs-2.6.22.13.q/kernel/tsacct.c @@ -62,6 +62,10 @@ void bacct_add_tsk(struct taskstats *sta rcu_read_unlock(); stats->ac_utime = cputime_to_msecs(tsk->utime) * USEC_PER_MSEC; stats->ac_stime = cputime_to_msecs(tsk->stime) * USEC_PER_MSEC; + stats->ac_utimescaled = + cputime_to_msecs(tsk->utimescaled) * USEC_PER_MSEC; + stats->ac_stimescaled = + cputime_to_msecs(tsk->stimescaled) * USEC_PER_MSEC; stats->ac_minflt = tsk->min_flt; stats->ac_majflt = tsk->maj_flt; Index: linux-cfs-2.6.22.13.q/kernel/user.c =================================================================== --- linux-cfs-2.6.22.13.q.orig/kernel/user.c +++ linux-cfs-2.6.22.13.q/kernel/user.c @@ -14,20 +14,19 @@ #include #include #include +#include +#include /* * UID task count cache, to get fast user lookup in "alloc_uid" * when changing user ID's (ie setuid() and friends). */ -#define UIDHASH_BITS (CONFIG_BASE_SMALL ? 3 : 8) -#define UIDHASH_SZ (1 << UIDHASH_BITS) #define UIDHASH_MASK (UIDHASH_SZ - 1) #define __uidhashfn(uid) (((uid >> UIDHASH_BITS) + uid) & UIDHASH_MASK) -#define uidhashentry(uid) (uidhash_table + __uidhashfn((uid))) +#define uidhashentry(ns, uid) ((ns)->uidhash_table + __uidhashfn((uid))) static struct kmem_cache *uid_cachep; -static struct list_head uidhash_table[UIDHASH_SZ]; /* * The uidhash_lock is mostly taken from process context, but it is @@ -51,31 +50,33 @@ struct user_struct root_user = { .uid_keyring = &root_user_keyring, .session_keyring = &root_session_keyring, #endif +#ifdef CONFIG_FAIR_USER_SCHED + .tg = &init_task_group, +#endif }; /* * These routines must be called with the uidhash spinlock held! 
*/ -static inline void uid_hash_insert(struct user_struct *up, struct list_head *hashent) +static inline void uid_hash_insert(struct user_struct *up, + struct hlist_head *hashent) { - list_add(&up->uidhash_list, hashent); + hlist_add_head(&up->uidhash_node, hashent); } static inline void uid_hash_remove(struct user_struct *up) { - list_del(&up->uidhash_list); + hlist_del_init(&up->uidhash_node); } -static inline struct user_struct *uid_hash_find(uid_t uid, struct list_head *hashent) +static inline struct user_struct *uid_hash_find(uid_t uid, + struct hlist_head *hashent) { - struct list_head *up; - - list_for_each(up, hashent) { - struct user_struct *user; + struct user_struct *user; + struct hlist_node *h; - user = list_entry(up, struct user_struct, uidhash_list); - - if(user->uid == uid) { + hlist_for_each_entry(user, h, hashent, uidhash_node) { + if (user->uid == uid) { atomic_inc(&user->__count); return user; } @@ -84,6 +85,203 @@ static inline struct user_struct *uid_ha return NULL; } +#ifdef CONFIG_FAIR_USER_SCHED + +static struct kobject uids_kobject; /* represents /sys/kernel/uids directory */ +static DEFINE_MUTEX(uids_mutex); + +static void sched_destroy_user(struct user_struct *up) +{ + sched_destroy_group(up->tg); +} + +static int sched_create_user(struct user_struct *up) +{ + int rc = 0; + + up->tg = sched_create_group(); + if (IS_ERR(up->tg)) + rc = -ENOMEM; + + return rc; +} + +static void sched_switch_user(struct task_struct *p) +{ + sched_move_task(p); +} + +static inline void uids_mutex_lock(void) +{ + mutex_lock(&uids_mutex); +} + +static inline void uids_mutex_unlock(void) +{ + mutex_unlock(&uids_mutex); +} + +/* return cpu shares held by the user */ +ssize_t cpu_shares_show(struct kset *kset, char *buffer) +{ + struct user_struct *up = container_of(kset, struct user_struct, kset); + + return sprintf(buffer, "%lu\n", sched_group_shares(up->tg)); +} + +/* modify cpu shares held by the user */ +ssize_t cpu_shares_store(struct kset *kset, const char *buffer, size_t size) +{ + struct user_struct *up = container_of(kset, struct user_struct, kset); + unsigned long shares; + int rc; + + sscanf(buffer, "%lu", &shares); + + rc = sched_group_set_shares(up->tg, shares); + + return (rc ? rc : size); +} + +static void user_attr_init(struct subsys_attribute *sa, char *name, int mode) +{ + sa->attr.name = name; + sa->attr.mode = mode; + sa->show = cpu_shares_show; + sa->store = cpu_shares_store; +} + +/* Create "/sys/kernel/uids/" directory and + * "/sys/kernel/uids//cpu_share" file for this user. 
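+ *
+ * (Usage sketch, uid 1000 being just an example: reading
+ *  /sys/kernel/uids/1000/cpu_share returns the user's group shares via
+ *  cpu_shares_show(), and "echo 2048 > /sys/kernel/uids/1000/cpu_share"
+ *  feeds sched_group_set_shares() through cpu_shares_store().)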
+ */ +static int user_kobject_create(struct user_struct *up) +{ + struct kset *kset = &up->kset; + struct kobject *kobj = &kset->kobj; + int error; + + memset(kset, 0, sizeof(struct kset)); + kobj->parent = &uids_kobject; /* create under /sys/kernel/uids dir */ + kobject_set_name(kobj, "%d", up->uid); + kset_init(kset); + user_attr_init(&up->user_attr, "cpu_share", 0644); + + error = kobject_add(kobj); + if (error) + goto done; + + error = sysfs_create_file(kobj, &up->user_attr.attr); + if (error) + kobject_del(kobj); + + kobject_uevent(kobj, KOBJ_ADD); + +done: + return error; +} + +/* create these in sysfs filesystem: + * "/sys/kernel/uids" directory + * "/sys/kernel/uids/0" directory (for root user) + * "/sys/kernel/uids/0/cpu_share" file (for root user) + */ +int __init uids_kobject_init(void) +{ + int error; + + /* create under /sys/kernel dir */ + uids_kobject.parent = &kernel_subsys.kobj; + uids_kobject.kset = &kernel_subsys; + kobject_set_name(&uids_kobject, "uids"); + kobject_init(&uids_kobject); + + error = kobject_add(&uids_kobject); + if (!error) + error = user_kobject_create(&root_user); + + return error; +} + +/* work function to remove sysfs directory for a user and free up + * corresponding structures. + */ +static void remove_user_sysfs_dir(struct work_struct *w) +{ + struct user_struct *up = container_of(w, struct user_struct, work); + struct kobject *kobj = &up->kset.kobj; + unsigned long flags; + int remove_user = 0; + + /* Make uid_hash_remove() + sysfs_remove_file() + kobject_del() + * atomic. + */ + uids_mutex_lock(); + + local_irq_save(flags); + + if (atomic_dec_and_lock(&up->__count, &uidhash_lock)) { + uid_hash_remove(up); + remove_user = 1; + spin_unlock_irqrestore(&uidhash_lock, flags); + } else { + local_irq_restore(flags); + } + + if (!remove_user) + goto done; + + sysfs_remove_file(kobj, &up->user_attr.attr); + kobject_uevent(kobj, KOBJ_REMOVE); + kobject_del(kobj); + + sched_destroy_user(up); + key_put(up->uid_keyring); + key_put(up->session_keyring); + kmem_cache_free(uid_cachep, up); + +done: + uids_mutex_unlock(); +} + +/* IRQs are disabled and uidhash_lock is held upon function entry. + * IRQ state (as stored in flags) is restored and uidhash_lock released + * upon function exit. + */ +static inline void free_user(struct user_struct *up, unsigned long flags) +{ + /* restore back the count */ + atomic_inc(&up->__count); + spin_unlock_irqrestore(&uidhash_lock, flags); + + INIT_WORK(&up->work, remove_user_sysfs_dir); + schedule_work(&up->work); +} + +#else /* CONFIG_FAIR_USER_SCHED */ + +static void sched_destroy_user(struct user_struct *up) { } +static int sched_create_user(struct user_struct *up) { return 0; } +static void sched_switch_user(struct task_struct *p) { } +static inline int user_kobject_create(struct user_struct *up) { return 0; } +static inline void uids_mutex_lock(void) { } +static inline void uids_mutex_unlock(void) { } + +/* IRQs are disabled and uidhash_lock is held upon function entry. + * IRQ state (as stored in flags) is restored and uidhash_lock released + * upon function exit. + */ +static inline void free_user(struct user_struct *up, unsigned long flags) +{ + uid_hash_remove(up); + spin_unlock_irqrestore(&uidhash_lock, flags); + sched_destroy_user(up); + key_put(up->uid_keyring); + key_put(up->session_keyring); + kmem_cache_free(uid_cachep, up); +} + +#endif /* CONFIG_FAIR_USER_SCHED */ + /* * Locate the user_struct for the passed UID. If found, take a ref on it. The * caller must undo that ref with free_uid(). 
@@ -94,9 +292,10 @@ struct user_struct *find_user(uid_t uid) { struct user_struct *ret; unsigned long flags; + struct user_namespace *ns = current->nsproxy->user_ns; spin_lock_irqsave(&uidhash_lock, flags); - ret = uid_hash_find(uid, uidhashentry(uid)); + ret = uid_hash_find(uid, uidhashentry(ns, uid)); spin_unlock_irqrestore(&uidhash_lock, flags); return ret; } @@ -109,22 +308,22 @@ void free_uid(struct user_struct *up) return; local_irq_save(flags); - if (atomic_dec_and_lock(&up->__count, &uidhash_lock)) { - uid_hash_remove(up); - spin_unlock_irqrestore(&uidhash_lock, flags); - key_put(up->uid_keyring); - key_put(up->session_keyring); - kmem_cache_free(uid_cachep, up); - } else { + if (atomic_dec_and_lock(&up->__count, &uidhash_lock)) + free_user(up, flags); + else local_irq_restore(flags); - } } -struct user_struct * alloc_uid(uid_t uid) +struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid) { - struct list_head *hashent = uidhashentry(uid); + struct hlist_head *hashent = uidhashentry(ns, uid); struct user_struct *up; + /* Make uid_hash_find() + user_kobject_create() + uid_hash_insert() + * atomic. + */ + uids_mutex_lock(); + spin_lock_irq(&uidhash_lock); up = uid_hash_find(uid, hashent); spin_unlock_irq(&uidhash_lock); @@ -153,6 +352,22 @@ struct user_struct * alloc_uid(uid_t uid return NULL; } + if (sched_create_user(new) < 0) { + key_put(new->uid_keyring); + key_put(new->session_keyring); + kmem_cache_free(uid_cachep, new); + return NULL; + } + + if (user_kobject_create(new)) { + sched_destroy_user(new); + key_put(new->uid_keyring); + key_put(new->session_keyring); + kmem_cache_free(uid_cachep, new); + uids_mutex_unlock(); + return NULL; + } + /* * Before adding this, check whether we raced * on adding the same user already.. @@ -160,6 +375,11 @@ struct user_struct * alloc_uid(uid_t uid spin_lock_irq(&uidhash_lock); up = uid_hash_find(uid, hashent); if (up) { + /* This case is not possible when CONFIG_FAIR_USER_SCHED + * is defined, since we serialize alloc_uid() using + * uids_mutex. Hence no need to call + * sched_destroy_user() or remove_user_sysfs_dir(). + */ key_put(new->uid_keyring); key_put(new->session_keyring); kmem_cache_free(uid_cachep, new); @@ -170,6 +390,9 @@ struct user_struct * alloc_uid(uid_t uid spin_unlock_irq(&uidhash_lock); } + + uids_mutex_unlock(); + return up; } @@ -187,6 +410,7 @@ void switch_uid(struct user_struct *new_ atomic_dec(&old_user->processes); switch_uid_keyring(new_user); current->user = new_user; + sched_switch_user(current); /* * We need to synchronize with __sigqueue_alloc() @@ -202,6 +426,30 @@ void switch_uid(struct user_struct *new_ suid_keys(current); } +void release_uids(struct user_namespace *ns) +{ + int i; + unsigned long flags; + struct hlist_head *head; + struct hlist_node *nd; + + spin_lock_irqsave(&uidhash_lock, flags); + /* + * collapse the chains so that the user_struct-s will + * be still alive, but not in hashes. subsequent free_uid() + * will free them. 
+ */
+	for (i = 0; i < UIDHASH_SZ; i++) {
+		head = ns->uidhash_table + i;
+		while (!hlist_empty(head)) {
+			nd = head->first;
+			hlist_del_init(nd);
+		}
+	}
+	spin_unlock_irqrestore(&uidhash_lock, flags);
+
+	free_uid(ns->root_user);
+}
 
 static int __init uid_cache_init(void)
 {
@@ -211,11 +459,11 @@ static int __init uid_cache_init(void)
 			0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
 
 	for(n = 0; n < UIDHASH_SZ; ++n)
-		INIT_LIST_HEAD(uidhash_table + n);
+		INIT_HLIST_HEAD(init_user_ns.uidhash_table + n);
 
 	/* Insert the root user immediately (init already runs as root) */
 	spin_lock_irq(&uidhash_lock);
-	uid_hash_insert(&root_user, uidhashentry(0));
+	uid_hash_insert(&root_user, uidhashentry(&init_user_ns, 0));
 	spin_unlock_irq(&uidhash_lock);
 
 	return 0;
Index: linux-cfs-2.6.22.13.q/kernel/user_namespace.c
===================================================================
--- /dev/null
+++ linux-cfs-2.6.22.13.q/kernel/user_namespace.c
@@ -0,0 +1,88 @@
+/*
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation, version 2 of the
+ * License.
+ */
+
+#include <linux/module.h>
+#include <linux/version.h>
+#include <linux/nsproxy.h>
+#include <linux/user_namespace.h>
+
+struct user_namespace init_user_ns = {
+	.kref = {
+		.refcount	= ATOMIC_INIT(2),
+	},
+	.root_user = &root_user,
+};
+
+EXPORT_SYMBOL_GPL(init_user_ns);
+
+#ifdef CONFIG_USER_NS
+
+/*
+ * Clone a new ns copying an original user ns, setting refcount to 1
+ * @old_ns: namespace to clone
+ * Return NULL on error (failure to kmalloc), new ns otherwise
+ */
+static struct user_namespace *clone_user_ns(struct user_namespace *old_ns)
+{
+	struct user_namespace *ns;
+	struct user_struct *new_user;
+	int n;
+
+	ns = kmalloc(sizeof(struct user_namespace), GFP_KERNEL);
+	if (!ns)
+		return ERR_PTR(-ENOMEM);
+
+	kref_init(&ns->kref);
+
+	for (n = 0; n < UIDHASH_SZ; ++n)
+		INIT_HLIST_HEAD(ns->uidhash_table + n);
+
+	/* Insert new root user. */
+	ns->root_user = alloc_uid(ns, 0);
+	if (!ns->root_user) {
+		kfree(ns);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	/* Reset current->user with a new one */
+	new_user = alloc_uid(ns, current->uid);
+	if (!new_user) {
+		free_uid(ns->root_user);
+		kfree(ns);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	switch_uid(new_user);
+	return ns;
+}
+
+struct user_namespace * copy_user_ns(int flags, struct user_namespace *old_ns)
+{
+	struct user_namespace *new_ns;
+
+	BUG_ON(!old_ns);
+	get_user_ns(old_ns);
+
+	if (!(flags & CLONE_NEWUSER))
+		return old_ns;
+
+	new_ns = clone_user_ns(old_ns);
+
+	put_user_ns(old_ns);
+	return new_ns;
+}
+
+void free_user_ns(struct kref *kref)
+{
+	struct user_namespace *ns;
+
+	ns = container_of(kref, struct user_namespace, kref);
+	release_uids(ns);
+	kfree(ns);
+}
+
+#endif /* CONFIG_USER_NS */
Index: linux-cfs-2.6.22.13.q/lib/Kconfig.debug
===================================================================
--- linux-cfs-2.6.22.13.q.orig/lib/Kconfig.debug
+++ linux-cfs-2.6.22.13.q/lib/Kconfig.debug
@@ -105,6 +105,15 @@ config DETECT_SOFTLOCKUP
 	  can be detected via the NMI-watchdog, on platforms that
 	  support it.)
 
+config SCHED_DEBUG
+	bool "Collect scheduler debugging info"
+	depends on DEBUG_KERNEL && PROC_FS
+	default y
+	help
+	  If you say Y here, the /proc/sched_debug file will be provided
+	  that can help debug the scheduler. The runtime overhead of this
+	  option is minimal.
+ config SCHEDSTATS bool "Collect scheduler statistics" depends on DEBUG_KERNEL && PROC_FS Index: linux-cfs-2.6.22.13.q/mm/memory_hotplug.c =================================================================== --- linux-cfs-2.6.22.13.q.orig/mm/memory_hotplug.c +++ linux-cfs-2.6.22.13.q/mm/memory_hotplug.c @@ -217,6 +217,10 @@ int online_pages(unsigned long pfn, unsi zone->zone_pgdat->node_present_pages += onlined_pages; setup_per_zone_pages_min(); + if (onlined_pages) { + kswapd_run(zone_to_nid(zone)); + node_set_state(zone_to_nid(zone), N_HIGH_MEMORY); + } if (need_zonelists_rebuild) build_all_zonelists(); @@ -271,9 +275,6 @@ int add_memory(int nid, u64 start, u64 s if (!pgdat) return -ENOMEM; new_pgdat = 1; - ret = kswapd_run(nid); - if (ret) - goto error; } /* call arch's memory hotadd */ Index: linux-cfs-2.6.22.13.q/mm/page_alloc.c =================================================================== --- linux-cfs-2.6.22.13.q.orig/mm/page_alloc.c +++ linux-cfs-2.6.22.13.q/mm/page_alloc.c @@ -47,13 +47,21 @@ #include "internal.h" /* - * MCD - HACK: Find somewhere to initialize this EARLY, or make this - * initializer cleaner + * Array of node states. */ -nodemask_t node_online_map __read_mostly = { { [0] = 1UL } }; -EXPORT_SYMBOL(node_online_map); -nodemask_t node_possible_map __read_mostly = NODE_MASK_ALL; -EXPORT_SYMBOL(node_possible_map); +nodemask_t node_states[NR_NODE_STATES] __read_mostly = { + [N_POSSIBLE] = NODE_MASK_ALL, + [N_ONLINE] = { { [0] = 1UL } }, +#ifndef CONFIG_NUMA + [N_NORMAL_MEMORY] = { { [0] = 1UL } }, +#ifdef CONFIG_HIGHMEM + [N_HIGH_MEMORY] = { { [0] = 1UL } }, +#endif + [N_CPU] = { { [0] = 1UL } }, +#endif /* NUMA */ +}; +EXPORT_SYMBOL(node_states); + unsigned long totalram_pages __read_mostly; unsigned long totalreserve_pages __read_mostly; long nr_swap_pages; @@ -1819,14 +1827,35 @@ static void __meminit build_zonelist_cac #endif /* CONFIG_NUMA */ +/* Any regular memory on that node ? 
*/ +static void check_for_regular_memory(pg_data_t *pgdat) +{ +#ifdef CONFIG_HIGHMEM + enum zone_type zone_type; + + for (zone_type = 0; zone_type <= ZONE_NORMAL; zone_type++) { + struct zone *zone = &pgdat->node_zones[zone_type]; + if (zone->present_pages) + node_set_state(zone_to_nid(zone), N_NORMAL_MEMORY); + } +#endif +} + /* return values int ....just for stop_machine_run() */ static int __meminit __build_all_zonelists(void *dummy) { int nid; for_each_online_node(nid) { - build_zonelists(NODE_DATA(nid)); - build_zonelist_cache(NODE_DATA(nid)); + pg_data_t *pgdat = NODE_DATA(nid); + + build_zonelists(pgdat); + build_zonelist_cache(pgdat); + + /* Any memory on that node */ + if (pgdat->node_present_pages) + node_set_state(nid, N_HIGH_MEMORY); + check_for_regular_memory(pgdat); } return 0; } @@ -2064,6 +2093,9 @@ static struct per_cpu_pageset boot_pages static int __cpuinit process_zones(int cpu) { struct zone *zone, *dzone; + int node = cpu_to_node(cpu); + + node_set_state(node, N_CPU); /* this node has a cpu */ for_each_zone(zone) { @@ -2071,7 +2103,7 @@ static int __cpuinit process_zones(int c continue; zone_pcp(zone, cpu) = kmalloc_node(sizeof(struct per_cpu_pageset), - GFP_KERNEL, cpu_to_node(cpu)); + GFP_KERNEL, node); if (!zone_pcp(zone, cpu)) goto bad; Index: linux-cfs-2.6.22.13.q/mm/vmscan.c =================================================================== --- linux-cfs-2.6.22.13.q.orig/mm/vmscan.c +++ linux-cfs-2.6.22.13.q/mm/vmscan.c @@ -1686,7 +1686,6 @@ static int __zone_reclaim(struct zone *z int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) { - cpumask_t mask; int node_id; /* @@ -1723,8 +1722,7 @@ int zone_reclaim(struct zone *zone, gfp_ * as wide as possible. */ node_id = zone_to_nid(zone); - mask = node_to_cpumask(node_id); - if (!cpus_empty(mask) && node_id != numa_node_id()) + if (node_state(node_id, N_CPU) && node_id != numa_node_id()) return 0; return __zone_reclaim(zone, gfp_mask, order); } Index: linux-cfs-2.6.22.13.q/net/unix/af_unix.c =================================================================== --- linux-cfs-2.6.22.13.q.orig/net/unix/af_unix.c +++ linux-cfs-2.6.22.13.q/net/unix/af_unix.c @@ -307,7 +307,7 @@ static void unix_write_space(struct sock read_lock(&sk->sk_callback_lock); if (unix_writable(sk)) { if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) - wake_up_interruptible(sk->sk_sleep); + wake_up_interruptible_sync(sk->sk_sleep); sk_wake_async(sk, 2, POLL_OUT); } read_unlock(&sk->sk_callback_lock); @@ -1611,7 +1611,7 @@ static int unix_dgram_recvmsg(struct kio if (!skb) goto out_unlock; - wake_up_interruptible(&u->peer_wait); + wake_up_interruptible_sync(&u->peer_wait); if (msg->msg_name) unix_copy_addr(msg, skb->sk);
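
The node_states[] bitmaps introduced in the mm/page_alloc.c hunk above
generalize the old node_online_map/node_possible_map pair: nodes are flagged
N_HIGH_MEMORY / N_NORMAL_MEMORY as memory is brought online and N_CPU when a
CPU's per-cpu pagesets are set up, so tests like the one in zone_reclaim()
become a cheap mask lookup instead of building a cpumask on every call.

A rough illustration of a consumer of this interface (hypothetical debug
helper, not part of the patch), using only node_state() and
for_each_online_node() as the hunks above do:

	#include <linux/kernel.h>
	#include <linux/nodemask.h>

	/* Hypothetical helper: report which online nodes have memory and CPUs. */
	static void dump_node_states(void)
	{
		int nid;

		for_each_online_node(nid)
			printk(KERN_DEBUG "node %d: memory=%d cpus=%d\n", nid,
			       node_state(nid, N_HIGH_MEMORY),
			       node_state(nid, N_CPU));
	}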