diff --git a/arch/arm/include/asm/unistd.h b/arch/arm/include/asm/unistd.h index 89f7ead..e851625 100644 --- a/arch/arm/include/asm/unistd.h +++ b/arch/arm/include/asm/unistd.h @@ -391,6 +391,9 @@ #define __NR_pwritev (__NR_SYSCALL_BASE+362) #define __NR_rt_tgsigqueueinfo (__NR_SYSCALL_BASE+363) #define __NR_perf_event_open (__NR_SYSCALL_BASE+364) +#define __NR_sched_setscheduler_ex (__NR_SYSCALL_BASE+365) +#define __NR_sched_setparam_ex (__NR_SYSCALL_BASE+366) +#define __NR_sched_getparam_ex (__NR_SYSCALL_BASE+367) /* * The following SWIs are ARM private. diff --git a/arch/arm/kernel/calls.S b/arch/arm/kernel/calls.S index fafce1b..42ad362 100644 --- a/arch/arm/kernel/calls.S +++ b/arch/arm/kernel/calls.S @@ -374,6 +374,9 @@ CALL(sys_pwritev) CALL(sys_rt_tgsigqueueinfo) CALL(sys_perf_event_open) +/* 365 */ CALL(sys_sched_setscheduler_ex) + CALL(sys_sched_setparam_ex) + CALL(sys_sched_getparam_ex) #ifndef syscalls_counted .equ syscalls_padding, ((NR_syscalls + 3) & ~3) - NR_syscalls #define syscalls_counted diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index 74619c4..fde92c2 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -832,4 +832,7 @@ ia32_sys_call_table: .quad compat_sys_pwritev .quad compat_sys_rt_tgsigqueueinfo /* 335 */ .quad sys_perf_event_open + .quad sys_sched_setscheduler_ex + .quad sys_sched_setparam_ex + .quad sys_sched_getparam_ex ia32_syscall_end: diff --git a/arch/x86/include/asm/unistd_32.h b/arch/x86/include/asm/unistd_32.h index 6fb3c20..a83163f 100644 --- a/arch/x86/include/asm/unistd_32.h +++ b/arch/x86/include/asm/unistd_32.h @@ -342,6 +342,9 @@ #define __NR_pwritev 334 #define __NR_rt_tgsigqueueinfo 335 #define __NR_perf_event_open 336 +#define __NR_sched_setscheduler_ex 337 +#define __NR_sched_setparam_ex 338 +#define __NR_sched_getparam_ex 339 #ifdef __KERNEL__ diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h index 8d3ad0a..a11831c 100644 --- a/arch/x86/include/asm/unistd_64.h +++ b/arch/x86/include/asm/unistd_64.h @@ -661,6 +661,12 @@ __SYSCALL(__NR_pwritev, sys_pwritev) __SYSCALL(__NR_rt_tgsigqueueinfo, sys_rt_tgsigqueueinfo) #define __NR_perf_event_open 298 __SYSCALL(__NR_perf_event_open, sys_perf_event_open) +#define __NR_sched_setscheduler_ex 299 +__SYSCALL(__NR_sched_setscheduler_ex, sys_sched_setscheduler_ex) +#define __NR_sched_setparam_ex 230 +__SYSCALL(__NR_sched_setparam_ex, sys_sched_setparam_ex) +#define __NR_sched_getparam_ex 231 +__SYSCALL(__NR_sched_getparam_ex, sys_sched_getparam_ex) #ifndef __NO_STUBS #define __ARCH_WANT_OLD_READDIR diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S index 0157cd2..38f056c 100644 --- a/arch/x86/kernel/syscall_table_32.S +++ b/arch/x86/kernel/syscall_table_32.S @@ -336,3 +336,6 @@ ENTRY(sys_call_table) .long sys_pwritev .long sys_rt_tgsigqueueinfo /* 335 */ .long sys_perf_event_open + .long sys_sched_setscheduler_ex + .long sys_sched_setparam_ex + .long sys_sched_getparam_ex diff --git a/include/linux/sched.h b/include/linux/sched.h index 8fe351c..2554e08 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -38,6 +38,7 @@ #define SCHED_BATCH 3 /* SCHED_ISO: reserved but not implemented yet */ #define SCHED_IDLE 5 +#define SCHED_EDF 6 /* Can be ORed in to make sure the process is reverted back to SCHED_NORMAL on fork */ #define SCHED_RESET_ON_FORK 0x40000000 @@ -94,6 +95,19 @@ struct sched_param { #include +/* + * For an typical EDF periodic or sporadic task we need a runtime + * and period (equal to deadline) specification in the sched_param + * structure. + * However, the original struct sched_param can't be modified, for binary + * compatibility issues, and thus this new one is created. + */ +struct sched_param_ex { + int sched_priority; + struct timespec sched_period; + struct timespec sched_runtime; +}; + struct exec_domain; struct futex_pi_state; struct robust_list_head; @@ -147,12 +161,15 @@ extern unsigned long get_parent_ip(unsigned long addr); struct seq_file; struct cfs_rq; +struct edf_rq; struct task_group; #ifdef CONFIG_SCHED_DEBUG extern void proc_sched_show_task(struct task_struct *p, struct seq_file *m); extern void proc_sched_set_task(struct task_struct *p); extern void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq); +extern void +print_edf_rq(struct seq_file *m, int cpu, struct edf_rq *edf_rq); #else static inline void proc_sched_show_task(struct task_struct *p, struct seq_file *m) @@ -165,6 +182,10 @@ static inline void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) { } +static inline void +print_edf_rq(struct seq_file *m, int cpu, struct edf_rq *edf_rq) +{ +} #endif extern unsigned long long time_sync_thresh; @@ -1161,6 +1182,28 @@ struct sched_entity { #endif }; +#define EDF_NEW 1 +#define EDF_ENDED 2 +#define EDF_RUNNING 4 + +struct sched_edf_entity { + struct rb_node rb_node; + /* actual scheduling parameters */ + u64 deadline; + s64 runtime; + unsigned int edf_flags; + + /* original parameters taken from sched_param_ex */ + u64 runtime_max; + u64 period; +#ifdef CONFIG_EDF_GROUP_SCHED + u64 bw; +#endif + + int nr_cpus_allowed; + struct hrtimer edf_timer; +}; + struct sched_rt_entity { struct list_head run_list; unsigned long timeout; @@ -1198,6 +1241,7 @@ struct task_struct { unsigned int rt_priority; const struct sched_class *sched_class; struct sched_entity se; + struct sched_edf_entity edf; struct sched_rt_entity rt; #ifdef CONFIG_PREEMPT_NOTIFIERS @@ -1543,6 +1587,18 @@ static inline int rt_task(struct task_struct *p) return rt_prio(p->prio); } +static inline int edf_policy(int policy) +{ + if (unlikely(policy == SCHED_EDF)) + return 1; + return 0; +} + +static inline int edf_task(struct task_struct *p) +{ + return edf_policy(p->policy); +} + static inline struct pid *task_pid(struct task_struct *task) { return task->pids[PIDTYPE_PID].pid; diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 8d8285a..f5b76a1 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -33,6 +33,7 @@ struct pollfd; struct rlimit; struct rusage; struct sched_param; +struct sched_param_ex; struct semaphore; struct sembuf; struct shmid_ds; @@ -390,11 +391,17 @@ asmlinkage long sys_clock_nanosleep(clockid_t which_clock, int flags, asmlinkage long sys_nice(int increment); asmlinkage long sys_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param); +asmlinkage long sys_sched_setscheduler_ex(pid_t pid, int policy, + struct sched_param_ex __user *param); asmlinkage long sys_sched_setparam(pid_t pid, struct sched_param __user *param); +asmlinkage long sys_sched_setparam_ex(pid_t pid, + struct sched_param_ex __user *param); asmlinkage long sys_sched_getscheduler(pid_t pid); asmlinkage long sys_sched_getparam(pid_t pid, struct sched_param __user *param); +asmlinkage long sys_sched_getparam_ex(pid_t pid, + struct sched_param_ex __user *param); asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len, unsigned long __user *user_mask_ptr); asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len, diff --git a/init/Kconfig b/init/Kconfig index 0aa6579..cd119ac 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -454,6 +454,20 @@ config RT_GROUP_SCHED realtime bandwidth for them. See Documentation/scheduler/sched-rt-group.txt for more information. +config EDF_GROUP_SCHED + bool "Group scheduling for SCHED_EDF" + depends on EXPERIMENTAL + depends on GROUP_SCHED + depends on CGROUPS + depends on !USER_SCHED + select CPUSETS + default n + help + This feature lets you explicitly specify, in terms of runtime + and period, the bandwidth of a task control group. This means + tasks and other control groups can be added, until such bandwidth + limit is reached, after that they will fail. + choice depends on GROUP_SCHED prompt "Basis for grouping tasks" diff --git a/kernel/fork.c b/kernel/fork.c index 2cebfb2..b4f3874 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1203,6 +1203,18 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->parent_exec_id = current->self_exec_id; } + /* + * In case an EDF task is forking, both parent and child get half + * the bandwidth of the parent (via dividing runtime by 2). + * This make it a lot easier to deal with control group bandwidth, and + * also prevent an heavily spawning task to exhaust the system (or + * a control group, if enabled) bandwidht. + */ + if (edf_task(p->real_parent)) { + p->real_parent->edf.runtime_max /= 2; + p->edf.runtime_max = p->real_parent->edf.runtime_max; + } + spin_lock(¤t->sighand->siglock); /* diff --git a/kernel/sched.c b/kernel/sched.c index 91843ba..4681a5c 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -131,6 +131,11 @@ static inline int task_has_rt_policy(struct task_struct *p) return rt_policy(p->policy); } +static inline int task_has_edf_policy(struct task_struct *p) +{ + return edf_policy(p->policy); +} + /* * This is the priority-queue data structure of the RT scheduling class: */ @@ -227,6 +232,16 @@ static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b) } #endif +struct edf_bandwidth { + spinlock_t lock; + /* runtime and period that determine the bandwidth of the group */ + u64 runtime_max; + u64 period; + u64 bw; + /* accumulator for the actual allocated bandwidth in a group */ + u64 total_bw; +}; + /* * sched_domains_mutex serializes calls to arch_init_sched_domains, * detach_destroy_domains and partition_sched_domains. @@ -259,6 +274,11 @@ struct task_group { unsigned long shares; #endif +#ifdef CONFIG_EDF_GROUP_SCHED + struct edf_bandwidth edf_bandwidth; + struct edf_rq **edf_rq; +#endif + #ifdef CONFIG_RT_GROUP_SCHED struct sched_rt_entity **rt_se; struct rt_rq **rt_rq; @@ -296,6 +316,10 @@ static DEFINE_PER_CPU(struct sched_entity, init_sched_entity); static DEFINE_PER_CPU_SHARED_ALIGNED(struct cfs_rq, init_tg_cfs_rq); #endif /* CONFIG_FAIR_GROUP_SCHED */ +#ifdef CONFIG_EDF_GROUP_SCHED +static DEFINE_PER_CPU(struct edf_rq, init_edf_rq) ____cacheline_aligned_in_smp; +#endif + #ifdef CONFIG_RT_GROUP_SCHED static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity); static DEFINE_PER_CPU_SHARED_ALIGNED(struct rt_rq, init_rt_rq); @@ -447,6 +471,18 @@ struct cfs_rq { #endif }; +struct edf_rq { + unsigned long edf_nr_running; + + /* as in CFS, the runqueue is based on an rbtree, deadline ordered */ + struct rb_root rb_root; + struct rb_node *rb_leftmost; + +#ifdef CONFIG_EDF_GROUP_SCHED + struct rq *rq; +#endif +}; + /* Real-Time classes' related field in a runqueue: */ struct rt_rq { struct rt_prio_array active; @@ -544,6 +580,7 @@ struct rq { u64 nr_migrations_in; struct cfs_rq cfs; + struct edf_rq edf; struct rt_rq rt; #ifdef CONFIG_FAIR_GROUP_SCHED @@ -1816,7 +1853,9 @@ static void calc_load_account_active(struct rq *this_rq); #include "sched_stats.h" #include "sched_idletask.c" #include "sched_fair.c" +#include "sched_edf.c" #include "sched_rt.c" + #ifdef CONFIG_SCHED_DEBUG # include "sched_debug.c" #endif @@ -1837,7 +1876,7 @@ static void dec_nr_running(struct rq *rq) static void set_load_weight(struct task_struct *p) { - if (task_has_rt_policy(p)) { + if (task_has_rt_policy(p) || task_has_edf_policy(p)) { p->se.load.weight = prio_to_weight[0] * 2; p->se.load.inv_weight = prio_to_wmult[0] >> 1; return; @@ -2523,7 +2562,8 @@ void sched_fork(struct task_struct *p, int clone_flags) * Revert to default priority/policy on fork if requested. */ if (unlikely(p->sched_reset_on_fork)) { - if (p->policy == SCHED_FIFO || p->policy == SCHED_RR) + if (p->policy == SCHED_FIFO || p->policy == SCHED_RR || + edf_policy(p->policy)) p->policy = SCHED_NORMAL; if (p->normal_prio < DEFAULT_PRIO) @@ -2541,8 +2581,12 @@ void sched_fork(struct task_struct *p, int clone_flags) p->sched_reset_on_fork = 0; } - if (!rt_prio(p->prio)) - p->sched_class = &fair_sched_class; + if (!rt_prio(p->prio)) { + if (edf_policy(p->policy)) + p->sched_class = &edf_sched_class; + else + p->sched_class = &fair_sched_class; + } #ifdef CONFIG_SMP cpu = p->sched_class->select_task_rq(p, SD_BALANCE_FORK, 0); @@ -2565,6 +2609,68 @@ void sched_fork(struct task_struct *p, int clone_flags) put_cpu(); } +#if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_EDF_GROUP_SCHED) +static unsigned long to_ratio(u64 period, u64 runtime) +{ + if (runtime == RUNTIME_INF) + return 1ULL << 20; + + return div64_u64(runtime << 20, period); +} + +#ifdef CONFIG_EDF_GROUP_SCHED +static inline +void __edf_tg_clear_task_bw(struct task_group *tg, u64 tsk_bw) +{ + tg->edf_bandwidth.total_bw -= tsk_bw; +} + +static inline +void __edf_tg_add_task_bw(struct task_group *tg, u64 tsk_bw) +{ + tg->edf_bandwidth.total_bw += tsk_bw; +} + +static inline +int __edf_check_task_tg_bw(struct task_struct *p, struct task_group *tg, + int policy, struct sched_param_ex *param_ex) +{ + u64 tg_bw = tg->edf_bandwidth.bw; + u64 bw = p->edf.bw; + u64 new_bw = 0; + + if (tg->edf_bandwidth.bw <= 0) + return 0; + + if (edf_policy(policy)) + new_bw = to_ratio(timespec_to_ns(¶m_ex->sched_period), + timespec_to_ns(¶m_ex->sched_runtime)); + + /* + * Either if a task, enters, leave, or stays SCHED_EDF but changing + * its parameters, we need to update accordingly the allocated + * bandwidth of the control group it is inside. + */ + if (task_has_edf_policy(p) && !edf_policy(policy)) { + __edf_tg_clear_task_bw(tg, bw); + return 1; + } else if (task_has_edf_policy(p) && edf_policy(policy) && + tg_bw >= tg->edf_bandwidth.total_bw - bw + new_bw) { + __edf_tg_clear_task_bw(tg, bw); + __edf_tg_add_task_bw(tg, new_bw); + return 1; + } else if (edf_policy(policy) && !task_has_edf_policy(p) && + tg_bw >= tg->edf_bandwidth.total_bw + new_bw) { + __edf_tg_add_task_bw(tg, new_bw); + return 1; + } + + return 0; +} +#endif /* CONFIG_EDF_GROUP_SCHED */ + +#endif /* CONFIG_RT_GROUP_SCHED || CONFIG_EDF_GROUP_SCHED */ + /* * wake_up_new_task - wake up a newly created task for the first time. * @@ -2582,6 +2688,18 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) update_rq_clock(rq); p->prio = effective_prio(p); + if (edf_task(p)) { + struct sched_edf_entity *edf_se = &p->edf; + + /* + * If the new task we are waking up is EDF, + * some initialization is needed + */ + RB_CLEAR_NODE(&edf_se->rb_node); + edf_se->runtime = edf_se->runtime_max; + edf_se->edf_flags = EDF_NEW; + init_edf_timer(&edf_se->edf_timer); + } if (!p->sched_class->task_new || !current->se.on_rq) { activate_task(rq, p, 0); @@ -2725,6 +2843,23 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) if (mm) mmdrop(mm); if (unlikely(prev_state == TASK_DEAD)) { +#ifdef CONFIG_EDF_GROUP_SCHED + if (edf_task(prev)) { + struct task_group *tg = task_group(prev); + unsigned long flags; + + /* + * When an EDF task dies, we need to drain its + * bandwidth from the cgroup it was accomodating hime, + * to make space for new tasks. + */ + spin_lock_irqsave(&tg->edf_bandwidth.lock, flags); + __edf_tg_clear_task_bw(tg, prev->edf.bw); + spin_unlock_irqrestore(&tg->edf_bandwidth.lock, flags); + + hrtimer_cancel(&prev->edf.edf_timer); + } +#endif /* * Remove function-return probe instances associated with this * task and put them back on the free list. @@ -5952,8 +6087,12 @@ void rt_mutex_setprio(struct task_struct *p, int prio) if (rt_prio(prio)) p->sched_class = &rt_sched_class; - else - p->sched_class = &fair_sched_class; + else { + if (!edf_task(p)) + p->sched_class = &fair_sched_class; + else + p->sched_class = &edf_sched_class; + } p->prio = prio; @@ -5987,9 +6126,9 @@ void set_user_nice(struct task_struct *p, long nice) * The RT priorities are set via sched_setscheduler(), but we still * allow the 'normal' nice value to be set - but as expected * it wont have any effect on scheduling until the task is - * SCHED_FIFO/SCHED_RR: + * SCHED_EDF, SCHED_FIFO or SCHED_RR: */ - if (task_has_rt_policy(p)) { + if (unlikely(task_has_rt_policy(p) || task_has_edf_policy(p))) { p->static_prio = NICE_TO_PRIO(nice); goto out_unlock; } @@ -6136,6 +6275,9 @@ __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio) case SCHED_IDLE: p->sched_class = &fair_sched_class; break; + case SCHED_EDF: + p->sched_class = &edf_sched_class; + break; case SCHED_FIFO: case SCHED_RR: p->sched_class = &rt_sched_class; @@ -6149,6 +6291,25 @@ __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio) set_load_weight(p); } +static void +__setscheduler_ex(struct rq *rq, struct task_struct *p, int policy, + struct sched_param_ex *param_ex) +{ + struct sched_edf_entity *edf_se = &p->edf; + + RB_CLEAR_NODE(&edf_se->rb_node); + edf_se->edf_flags = EDF_NEW; + + edf_se->runtime_max = timespec_to_ns(¶m_ex->sched_runtime); + edf_se->period = timespec_to_ns(¶m_ex->sched_period); + edf_se->bw = to_ratio(timespec_to_ns(¶m_ex->sched_period), + timespec_to_ns(¶m_ex->sched_runtime)); + + edf_se->runtime = edf_se->runtime_max; + + init_edf_timer(&edf_se->edf_timer); +} + /* * check the target process has a UID that matches the current process's */ @@ -6166,7 +6327,9 @@ static bool check_same_owner(struct task_struct *p) } static int __sched_setscheduler(struct task_struct *p, int policy, - struct sched_param *param, bool user) + struct sched_param *param, + struct sched_param_ex *param_ex, + bool user) { int retval, oldprio, oldpolicy = -1, on_rq, running; unsigned long flags; @@ -6186,6 +6349,7 @@ recheck: policy &= ~SCHED_RESET_ON_FORK; if (policy != SCHED_FIFO && policy != SCHED_RR && + policy != SCHED_EDF && policy != SCHED_NORMAL && policy != SCHED_BATCH && policy != SCHED_IDLE) return -EINVAL; @@ -6200,6 +6364,13 @@ recheck: (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) || (!p->mm && param->sched_priority > MAX_RT_PRIO-1)) return -EINVAL; + if (edf_policy(policy) && param_ex->sched_priority != 0) + return -EINVAL; + if (edf_policy(policy) && (param_ex == NULL || + timespec_to_ns(¶m_ex->sched_period) == 0 || + timespec_to_ns(¶m_ex->sched_period) < + timespec_to_ns(¶m_ex->sched_runtime))) + return -EINVAL; if (rt_policy(policy) != (param->sched_priority != 0)) return -EINVAL; @@ -6273,6 +6444,24 @@ recheck: spin_unlock_irqrestore(&p->pi_lock, flags); goto recheck; } +#ifdef CONFIG_EDF_GROUP_SCHED + /* + * If EDF tasks are involved, check tha bandwidth of its group, + * and do not allow setting the task EDF if there is not enough. + */ + if (edf_policy(policy) || edf_task(p)) { + struct task_group *tg = task_group(p); + + spin_lock(&tg->edf_bandwidth.lock); + if (!__edf_check_task_tg_bw(p, tg, policy, param_ex)) { + spin_unlock(&tg->edf_bandwidth.lock); + __task_rq_unlock(rq); + spin_unlock_irqrestore(&p->pi_lock, flags); + return -EPERM; + } + spin_unlock(&tg->edf_bandwidth.lock); + } +#endif update_rq_clock(rq); on_rq = p->se.on_rq; running = task_current(rq, p); @@ -6285,6 +6474,8 @@ recheck: oldprio = p->prio; __setscheduler(rq, p, policy, param->sched_priority); + if (edf_policy(policy)) + __setscheduler_ex(rq, p, policy, param_ex); if (running) p->sched_class->set_curr_task(rq); @@ -6312,10 +6503,17 @@ recheck: int sched_setscheduler(struct task_struct *p, int policy, struct sched_param *param) { - return __sched_setscheduler(p, policy, param, true); + return __sched_setscheduler(p, policy, param, NULL, true); } EXPORT_SYMBOL_GPL(sched_setscheduler); +int sched_setscheduler_ex(struct task_struct *p, int policy, + struct sched_param *param, + struct sched_param_ex *param_ex) +{ + return __sched_setscheduler(p, policy, param, param_ex, true); +} + /** * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace. * @p: the task in question. @@ -6330,7 +6528,7 @@ EXPORT_SYMBOL_GPL(sched_setscheduler); int sched_setscheduler_nocheck(struct task_struct *p, int policy, struct sched_param *param) { - return __sched_setscheduler(p, policy, param, false); + return __sched_setscheduler(p, policy, param, NULL, false); } static int @@ -6355,6 +6553,33 @@ do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) return retval; } +static int +do_sched_setscheduler_ex(pid_t pid, int policy, + struct sched_param_ex __user *param_ex) +{ + struct sched_param lparam; + struct sched_param_ex lparam_ex; + struct task_struct *p; + int retval; + + if (!param_ex || pid < 0) + return -EINVAL; + if (copy_from_user(&lparam_ex, param_ex, + sizeof(struct sched_param_ex))) + return -EFAULT; + + rcu_read_lock(); + retval = -ESRCH; + p = find_process_by_pid(pid); + if (p != NULL) { + lparam.sched_priority = lparam_ex.sched_priority; + retval = sched_setscheduler_ex(p, policy, &lparam, &lparam_ex); + } + rcu_read_unlock(); + + return retval; +} + /** * sys_sched_setscheduler - set/change the scheduler policy and RT priority * @pid: the pid in question. @@ -6372,6 +6597,22 @@ SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy, } /** + * sys_sched_setscheduler_ex - set/change the scheduler policy and EDF deadline + * @pid: the pid in question. + * @policy: new policy (should be SCHED_EDF). + * @param: structure containg the extended EDF parameters. + */ +SYSCALL_DEFINE3(sched_setscheduler_ex, pid_t, pid, int, policy, + struct sched_param_ex __user *, param_ex) +{ + if (policy < 0) + return -EINVAL; + + return do_sched_setscheduler_ex(pid, policy, param_ex); +} + + +/** * sys_sched_setparam - set/change the RT priority of a thread * @pid: the pid in question. * @param: structure containing the new RT priority. @@ -6381,6 +6622,12 @@ SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param) return do_sched_setscheduler(pid, -1, param); } +SYSCALL_DEFINE2(sched_setparam_ex, pid_t, pid, + struct sched_param_ex __user *, param_ex) +{ + return do_sched_setscheduler_ex(pid, -1, param_ex); +} + /** * sys_sched_getscheduler - get the policy (scheduling class) of a thread * @pid: the pid in question. @@ -6445,6 +6692,11 @@ out_unlock: return retval; } +SYSCALL_DEFINE2(sched_getparam_ex, pid_t, pid, struct sched_param_ex __user *, param) +{ + return -ENOSYS; +} + long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) { cpumask_var_t cpus_allowed, new_mask; @@ -9210,6 +9462,14 @@ static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq) cfs_rq->min_vruntime = (u64)(-(1LL << 20)); } +static void init_edf_rq(struct edf_rq *edf_rq, struct rq *rq) +{ + edf_rq->rb_root = RB_ROOT; +#ifdef CONFIG_EDF_GROUP_SCHED + edf_rq->rq = rq; +#endif +} + static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) { struct rt_prio_array *array; @@ -9275,6 +9535,22 @@ static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, } #endif +#ifdef CONFIG_EDF_GROUP_SCHED +void init_tg_edf_entry(struct task_group *tg, struct edf_rq *edf_rq, + struct sched_edf_entity *edf_se, int cpu, int add, + struct sched_edf_entity *parent) +{ + struct rq *rq = cpu_rq(cpu); + + tg->edf_rq[cpu] = &rq->edf; + + spin_lock_init(&tg->edf_bandwidth.lock); + tg->edf_bandwidth.runtime_max = 0; + tg->edf_bandwidth.period = 0; + tg->edf_bandwidth.total_bw = 0; +} +#endif + #ifdef CONFIG_RT_GROUP_SCHED static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq, struct sched_rt_entity *rt_se, int cpu, int add, @@ -9316,6 +9592,10 @@ void __init sched_init(void) #ifdef CONFIG_RT_GROUP_SCHED alloc_size += 2 * nr_cpu_ids * sizeof(void **); #endif +#ifdef CONFIG_EDF_GROUP_SCHED + alloc_size += 2 * nr_cpu_ids * sizeof(void **); +#endif + #ifdef CONFIG_USER_SCHED alloc_size *= 2; #endif @@ -9344,6 +9624,10 @@ void __init sched_init(void) ptr += nr_cpu_ids * sizeof(void **); #endif /* CONFIG_USER_SCHED */ #endif /* CONFIG_FAIR_GROUP_SCHED */ +#ifdef CONFIG_EDF_GROUP_SCHED + init_task_group.edf_rq = (struct edf_rq **)ptr; + ptr += nr_cpu_ids * sizeof(void **); +#endif /* CONFIG_EDF_GROUP_SCHED */ #ifdef CONFIG_RT_GROUP_SCHED init_task_group.rt_se = (struct sched_rt_entity **)ptr; ptr += nr_cpu_ids * sizeof(void **); @@ -9403,6 +9687,7 @@ void __init sched_init(void) rq->calc_load_active = 0; rq->calc_load_update = jiffies + LOAD_FREQ; init_cfs_rq(&rq->cfs, rq); + init_edf_rq(&rq->edf, rq); init_rt_rq(&rq->rt, rq); #ifdef CONFIG_FAIR_GROUP_SCHED init_task_group.shares = init_task_group_load; @@ -9450,6 +9735,13 @@ void __init sched_init(void) #endif #endif /* CONFIG_FAIR_GROUP_SCHED */ +#ifdef CONFIG_EDF_GROUP_SCHED +#ifdef CONFIG_CGROUP_SCHED + init_tg_edf_entry(&init_task_group, &rq->edf, + NULL, i, 1, NULL); +#endif +#endif + rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime; #ifdef CONFIG_RT_GROUP_SCHED INIT_LIST_HEAD(&rq->leaf_rt_rq_list); @@ -9760,6 +10052,68 @@ static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) } #endif /* CONFIG_FAIR_GROUP_SCHED */ +#ifdef CONFIG_EDF_GROUP_SCHED +static void free_edf_sched_group(struct task_group *tg) +{ + kfree(tg->edf_rq); +} + +int alloc_edf_sched_group(struct task_group *tg, struct task_group *parent) +{ + struct rq *rq; + int i; + + tg->edf_rq = kzalloc(sizeof(struct edf_rq*) * nr_cpu_ids, GFP_KERNEL); + if (!tg->edf_rq) + return 0; + + for_each_possible_cpu(i) { + rq = cpu_rq(i); + init_tg_edf_entry(tg, &rq->edf, NULL, i, 0, NULL); + } + + return 1; +} + +int sched_edf_can_attach(struct cgroup *cgrp, struct task_struct *tsk) +{ + struct task_group *tg = container_of(cgroup_subsys_state(cgrp, + cpu_cgroup_subsys_id), + struct task_group, css); + u64 tg_bw = tg->edf_bandwidth.bw; + u64 tsk_bw = tsk->edf.bw; + + if (!edf_task(tsk)) + return 1; + + /* + * Check if the requested group has enough free space + * for this particular task. + */ + if (tg_bw < tsk_bw + tg->edf_bandwidth.total_bw) + return 0; + + return 1; +} +#else +static inline void free_edf_sched_group(struct task_group *tg) +{ +} + +static inline +int alloc_edf_sched_group(struct task_group *tg, struct task_group *parent) +{ + return 1; +} +#endif +static inline void register_edf_sched_group(struct task_group *tg, int cpu) +{ +} + +static inline void unregister_edf_sched_group(struct task_group *tg, int cpu) +{ +} + #ifdef CONFIG_RT_GROUP_SCHED static void free_rt_sched_group(struct task_group *tg) { @@ -9852,6 +10206,7 @@ static inline void unregister_rt_sched_group(struct task_group *tg, int cpu) static void free_sched_group(struct task_group *tg) { free_fair_sched_group(tg); + free_edf_sched_group(tg); free_rt_sched_group(tg); kfree(tg); } @@ -9870,12 +10225,16 @@ struct task_group *sched_create_group(struct task_group *parent) if (!alloc_fair_sched_group(tg, parent)) goto err; + if (!alloc_edf_sched_group(tg, parent)) + goto err; + if (!alloc_rt_sched_group(tg, parent)) goto err; spin_lock_irqsave(&task_group_lock, flags); for_each_possible_cpu(i) { register_fair_sched_group(tg, i); + register_edf_sched_group(tg, i); register_rt_sched_group(tg, i); } list_add_rcu(&tg->list, &task_groups); @@ -9906,12 +10265,31 @@ void sched_destroy_group(struct task_group *tg) { unsigned long flags; int i; +#ifdef CONFIG_EDF_GROUP_SCHED + struct task_group *parent = tg->parent; + u64 tg_bw = tg->edf_bandwidth.bw; spin_lock_irqsave(&task_group_lock, flags); + + /* + * If an EDF group goes away, its parent group + * (if any), ends up with some free bandwidth that + * it might use for other groups/tasks. + */ + spin_lock(&parent->edf_bandwidth.lock); + if (tg_bw && parent) + parent->edf_bandwidth.total_bw -= tg_bw; + spin_unlock(&parent->edf_bandwidth.lock); +#else + spin_lock_irqsave(&task_group_lock, flags); +#endif + for_each_possible_cpu(i) { unregister_fair_sched_group(tg, i); + unregister_edf_sched_group(tg, i); unregister_rt_sched_group(tg, i); } + list_del_rcu(&tg->list); list_del_rcu(&tg->siblings); spin_unlock_irqrestore(&task_group_lock, flags); @@ -10057,14 +10435,6 @@ unsigned long sched_group_shares(struct task_group *tg) */ static DEFINE_MUTEX(rt_constraints_mutex); -static unsigned long to_ratio(u64 period, u64 runtime) -{ - if (runtime == RUNTIME_INF) - return 1ULL << 20; - - return div64_u64(runtime << 20, period); -} - /* Must be called with tasklist_lock held */ static inline int tg_has_rt_tasks(struct task_group *tg) { @@ -10368,9 +10738,15 @@ static int cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, struct task_struct *tsk) { +#if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_EDF_GROUP_SCHED) #ifdef CONFIG_RT_GROUP_SCHED if (!sched_rt_can_attach(cgroup_tg(cgrp), tsk)) return -EINVAL; +#endif +#ifdef CONFIG_EDF_GROUP_SCHED + if (!sched_edf_can_attach(cgrp, tsk)) + return -EINVAL; +#endif #else /* We don't support RT-tasks being in separate groups */ if (tsk->sched_class != &fair_sched_class) @@ -10384,6 +10760,30 @@ static void cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, struct cgroup *old_cont, struct task_struct *tsk) { +#ifdef CONFIG_EDF_GROUP_SCHED + struct task_group *tg = container_of(cgroup_subsys_state(cgrp, + cpu_cgroup_subsys_id), + struct task_group, css); + struct task_group *old_tg = container_of(cgroup_subsys_state(old_cont, + cpu_cgroup_subsys_id), + struct task_group, css); + u64 tsk_bw = tsk->edf.bw; + + /* + * An amount of bandwidth equal to the bandwidth of tsk + * is freed in the former group of tsk, and declared occupied + * in the new one. + */ + spin_lock_irq(&tg->edf_bandwidth.lock); + tg->edf_bandwidth.total_bw += tsk_bw; + spin_unlock_irq(&tg->edf_bandwidth.lock); + + if (old_tg) { + spin_lock_irq(&old_tg->edf_bandwidth.lock); + old_tg->edf_bandwidth.total_bw -= tsk_bw; + spin_unlock_irq(&old_tg->edf_bandwidth.lock); + } +#endif sched_move_task(tsk); } @@ -10426,6 +10826,100 @@ static u64 cpu_rt_period_read_uint(struct cgroup *cgrp, struct cftype *cft) } #endif /* CONFIG_RT_GROUP_SCHED */ +#ifdef CONFIG_EDF_GROUP_SCHED +/* + * We allow runtime_max > period, since it makes sense to + * assign more than 100% bandwidth to a group, if on an SMP + * machine. + */ +static int __edf_schedulable(struct task_group *tg, + u64 runtime_max, u64 period) +{ + struct task_group *parent = tg->parent; + u64 old_bw = tg->edf_bandwidth.bw; + u64 parent_bw = parent ? parent->edf_bandwidth.bw : 0; + u64 bw = period > 0 ? to_ratio(period, runtime_max) : 0; + int ret = 0; + + if (bw < tg->edf_bandwidth.total_bw) + return -EINVAL; + + if (parent) { + /* + * All we need to do is check if the parent group is able + * to accomodate the new bandwidth of this group or not, + * given its actual allocated bandwidth, e.g., to other + * groups or tasks. + */ + if (parent_bw < parent->edf_bandwidth.total_bw - + old_bw + bw) + return -EINVAL; + + spin_lock_irq(&parent->edf_bandwidth.lock); + parent->edf_bandwidth.total_bw -= old_bw; + parent->edf_bandwidth.total_bw += bw; + spin_unlock_irq(&parent->edf_bandwidth.lock); + } + + if (!ret) { + spin_lock_irq(&tg->edf_bandwidth.lock); + tg->edf_bandwidth.runtime_max = runtime_max; + tg->edf_bandwidth.period = period; + tg->edf_bandwidth.bw = bw; + spin_unlock_irq(&tg->edf_bandwidth.lock); + } + + return ret; +} + +static int cpu_edf_runtime_write_uint(struct cgroup *cgrp, + struct cftype *cftype, + u64 edf_runtime_us) +{ + struct task_group *tg = cgroup_tg(cgrp); + + return __edf_schedulable(tg, + edf_runtime_us * NSEC_PER_USEC, + tg->edf_bandwidth.period); +} + +static u64 cpu_edf_runtime_read_uint(struct cgroup *cgrp, struct cftype *cft) +{ + struct task_group *tg = cgroup_tg(cgrp); + u64 runtime; + + spin_lock_irq(&tg->edf_bandwidth.lock); + runtime = tg->edf_bandwidth.runtime_max; + spin_unlock_irq(&tg->edf_bandwidth.lock); + do_div(runtime, NSEC_PER_USEC); + + return runtime; +} + +static int cpu_edf_period_write_uint(struct cgroup *cgrp, struct cftype *cftype, + u64 edf_period_us) +{ + struct task_group *tg = cgroup_tg(cgrp); + + return __edf_schedulable(tg, + tg->edf_bandwidth.runtime_max, + edf_period_us * NSEC_PER_USEC); +} + +static u64 cpu_edf_period_read_uint(struct cgroup *cgrp, struct cftype *cft) +{ + struct task_group *tg = cgroup_tg(cgrp); + u64 period; + + spin_lock_irq(&tg->edf_bandwidth.lock); + period = tg->edf_bandwidth.period; + spin_unlock_irq(&tg->edf_bandwidth.lock); + do_div(period, NSEC_PER_USEC); + + return period; +} +#endif + static struct cftype cpu_files[] = { #ifdef CONFIG_FAIR_GROUP_SCHED { @@ -10446,6 +10940,19 @@ static struct cftype cpu_files[] = { .write_u64 = cpu_rt_period_write_uint, }, #endif +#ifdef CONFIG_EDF_GROUP_SCHED + { + .name = "edf_runtime_us", + .read_u64 = cpu_edf_runtime_read_uint, + .write_u64 = cpu_edf_runtime_write_uint, + }, + { + .name = "edf_period_us", + .read_u64 = cpu_edf_period_read_uint, + .write_u64 = cpu_edf_period_write_uint, + }, +#endif + }; static int cpu_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont) diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index efb8440..1252a43 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -146,7 +146,8 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu) } #if defined(CONFIG_CGROUP_SCHED) && \ - (defined(CONFIG_FAIR_GROUP_SCHED) || defined(CONFIG_RT_GROUP_SCHED)) + (defined(CONFIG_FAIR_GROUP_SCHED) || defined(CONFIG_RT_GROUP_SCHED) || \ + defined(CONFIG_EDF_GROUP_SCHED)) static void task_group_path(struct task_group *tg, char *buf, int buflen) { /* may be NULL if the underlying cgroup isn't fully-created yet */ @@ -218,6 +219,38 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) #endif } +void print_edf_rq(struct seq_file *m, int cpu, struct edf_rq *edf_rq) +{ + s64 min_deadline = -1, max_deadline = -1; + struct rq *rq = &per_cpu(runqueues, cpu); + struct sched_edf_entity *last; + unsigned long flags; + + SEQ_printf(m, "\nedf_rq[%d]:\n", cpu); + + spin_lock_irqsave(&rq->lock, flags); + if (edf_rq->rb_leftmost) + min_deadline = (rb_entry(edf_rq->rb_leftmost, + struct sched_edf_entity, + rb_node))->deadline; + last = __pick_edf_last_entity(edf_rq); + if (last) + max_deadline = last->deadline; + spin_unlock_irqrestore(&rq->lock, flags); + +#define P(x) \ + SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(x)) +#define PN(x) \ + SEQ_printf(m, " .%-30s: %Ld.%06ld\n", #x, SPLIT_NS(x)) + + P(edf_rq->edf_nr_running); + PN(min_deadline); + PN(max_deadline); + +#undef PN +#undef P +} + void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq) { #if defined(CONFIG_CGROUP_SCHED) && defined(CONFIG_RT_GROUP_SCHED) @@ -300,6 +333,7 @@ static void print_cpu(struct seq_file *m, int cpu) #undef P #endif print_cfs_stats(m, cpu); + print_edf_stats(m, cpu); print_rt_stats(m, cpu); print_rq(m, rq, cpu); diff --git a/kernel/sched_edf.c b/kernel/sched_edf.c new file mode 100644 index 0000000..cdec433 --- /dev/null +++ b/kernel/sched_edf.c @@ -0,0 +1,550 @@ +/* + * Earliest Deadline Firsf (EDF) Scheduling Class (SCHED_EDF policy) + * + * Earliest Deadline First scheduling for periodic or sporadic tasks. + * + * An EDF task specifies: + * - a maximum runtime, + * - a period or minimum interarrival time. + * The scheduler, on its turn: + * - schedule the task according to its deadline d=t+period, + * - enforces the task to not overcame its bandwidth B=runtime_max/period. + * + * At the end of each instance of a periodic job (or sporadic activation), + * the task may inform the scheduler about such event, by calling + * sched_yield(), and then suspending until next activation time. + * + * The strategy used to confine each task inside its bandwidth reservation + * is the Constant Bandwidth Server (CBS) scheduling, a slight variation on + * EDF that makes this possible. + * + * Correct behavior, i.e., no EDF task misses its deadline, is only + * guaranteed if the parameters of all the EDF entities in the system are + * assigned so to result in a schedulable solution. + * However, thanks to bandwidth isolation, overruns and deadline misses + * remains local to a single task, and does not affect any other task in + * the system. + * + * Groups, if configured, have bandwidth as well, and it is enforced that + * the sum of the bandwidths of entities (tasks and groups) belonging to + * a group stays below its own bandwidth. + * + * Copyright (C) 2009 Dario Faggioli, Michael Trimarchi + */ + +static const struct sched_class edf_sched_class; + +static inline struct task_struct *edf_task_of(struct sched_edf_entity *edf_se) +{ + return container_of(edf_se, struct task_struct, edf); +} + +static inline struct rq *rq_of_edf_rq(struct edf_rq *edf_rq) +{ + return container_of(edf_rq, struct rq, edf); +} + +static inline struct edf_rq *edf_rq_of_se(struct sched_edf_entity *edf_se) +{ + struct task_struct *p = edf_task_of(edf_se); + struct rq *rq = task_rq(p); + + return &rq->edf; +} + +static inline int edf_se_boosted(struct sched_edf_entity *edf_se) +{ + struct task_struct *p = edf_task_of(edf_se); + + return p->prio != p->normal_prio; +} + +static inline int on_edf_rq(struct sched_edf_entity *edf_se) +{ + return !RB_EMPTY_NODE(&edf_se->rb_node); +} + +#define for_each_leaf_edf_rq(edf_rq, rq) \ + for (edf_rq = &rq->edf; edf_rq; edf_rq = NULL) + +static inline int edf_time_before(u64 a, u64 b) +{ + return (s64)(a - b) < 0; +} + +static inline u64 edf_max_deadline(u64 a, u64 b) +{ + s64 delta = (s64)(b - a); + if (delta > 0) + a = b; + + return a; +} + +static inline void __edf_new_to_running(struct sched_edf_entity *edf_se) +{ + struct edf_rq *edf_rq = edf_rq_of_se(edf_se); + struct rq *rq = rq_of_edf_rq(edf_rq); + + edf_se->edf_flags &= ~EDF_NEW; + edf_se->edf_flags |= EDF_RUNNING; + + edf_se->deadline = rq->clock + edf_se->period; +} + +static inline void __edf_clear_ended(struct sched_edf_entity *edf_se) +{ + struct edf_rq *edf_rq = edf_rq_of_se(edf_se); + struct rq *rq = rq_of_edf_rq(edf_rq); + + edf_se->edf_flags &= ~EDF_ENDED; + + edf_se->runtime = edf_se->runtime > 0 ? 0 : edf_se->runtime; + if (edf_time_before(edf_se->deadline, rq->clock)) + edf_se->deadline = rq->clock; +} + +static inline void __edf_set_ended(struct sched_edf_entity *edf_se) +{ + edf_se->edf_flags |= EDF_ENDED; +} + +static void enqueue_edf_entity(struct sched_edf_entity *edf_se); +static void dequeue_edf_entity(struct sched_edf_entity *edf_se); + +static void update_edf_params(struct sched_edf_entity *edf_se) +{ + struct edf_rq *edf_rq = edf_rq_of_se(edf_se); + struct rq *rq = rq_of_edf_rq(edf_rq); + u64 left, right; + + BUG_ON(on_edf_rq(edf_se)); + + if (edf_se->edf_flags & EDF_NEW) { + __edf_new_to_running(edf_se); + return; + } + if (edf_se->edf_flags & EDF_ENDED) { + __edf_clear_ended(edf_se); + goto update; + } + + /* + * Update the deadline to the current time only if: + * - the budget has been completely exhausted; + * - using the ramaining budget, with the current deadline, would + * make the task exceed its bandwidth; + * - the deadline itself is in the past. + * + * For the second condition to hold, we check if: + * runtime / (deadline - rq->clock) >= runtime_max / period + * + * Which basically says if, in the time left before the current + * deadline, the tasks overcome its expected runtime by using the + * residual budget (left and right are the two sides of the equation, + * after a bit of shuffling to use multiplications instead of + * divisions). + */ + left = edf_se->runtime > 0 ? edf_se->period * edf_se->runtime : 0; + right = (edf_se->deadline - rq->clock) * edf_se->runtime_max; + + if (edf_se->runtime < 0 || edf_time_before(right, left) || + edf_time_before(edf_se->deadline, rq->clock)) { + edf_se->deadline = rq->clock; +update: + /* + * Keep moving the deadline and replenishing runtime by the + * proper amount at least until the runtime becomes positive. + */ + while (edf_se->runtime <= 0) { + edf_se->deadline += edf_se->period; + if (edf_se->runtime + edf_se->runtime_max < edf_se->runtime_max) + edf_se->runtime += edf_se->runtime_max; + else + edf_se->runtime = edf_se->runtime_max; + } + } +} + +static int start_edf_timer(struct sched_edf_entity *edf_se) +{ + struct edf_rq *edf_rq = edf_rq_of_se(edf_se); + struct rq *rq = rq_of_edf_rq(edf_rq); + ktime_t now, delta, act; + ktime_t soft, hard; + unsigned long range; + + now = hrtimer_cb_get_time(&edf_se->edf_timer); + delta = ktime_sub_ns(now, rq->clock); + act = ns_to_ktime(edf_se->deadline); + act = ktime_add(act, delta); + + hrtimer_set_expires(&edf_se->edf_timer, act); + + soft = hrtimer_get_softexpires(&edf_se->edf_timer); + hard = hrtimer_get_expires(&edf_se->edf_timer); + range = ktime_to_ns(ktime_sub(hard, soft)); + __hrtimer_start_range_ns(&edf_se->edf_timer, soft, + range, HRTIMER_MODE_ABS, 0); + + return hrtimer_active(&edf_se->edf_timer); +} + +static enum hrtimer_restart edf_timer(struct hrtimer *timer) +{ + struct sched_edf_entity *edf_se = container_of(timer, + struct sched_edf_entity, + edf_timer); + struct task_struct *p = edf_task_of(edf_se); + struct edf_rq *edf_rq = edf_rq_of_se(edf_se); + struct rq *rq = rq_of_edf_rq(edf_rq); + + spin_lock(&rq->lock); + if (!edf_task(p)) + goto unlock; + + update_edf_params(edf_se); + if (p->se.on_rq && !on_edf_rq(edf_se)) { + enqueue_edf_entity(edf_se); + resched_task(rq->curr); + } +unlock: + spin_unlock(&rq->lock); + + return HRTIMER_NORESTART; +} + +static void init_edf_timer(struct hrtimer *timer) +{ + if (hrtimer_active(timer)) { + hrtimer_try_to_cancel(timer); + return; + } + + hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + timer->function = edf_timer; +} + +static int edf_runtime_exceeded(struct sched_edf_entity *edf_se) +{ + if (edf_se->runtime > 0 || edf_se_boosted(edf_se)) + return 0; + + BUG_ON(hrtimer_active(&edf_se->edf_timer)); + + dequeue_edf_entity(edf_se); + if (!start_edf_timer(edf_se)) { + update_edf_params(edf_se); + enqueue_edf_entity(edf_se); + } + + return 1; +} + +static void update_curr_edf(struct rq *rq) +{ + struct task_struct *curr = rq->curr; + struct sched_edf_entity *edf_se = &curr->edf; + u64 delta_exec; + + if (!edf_task(curr) || !on_edf_rq(edf_se)) + return; + + delta_exec = rq->clock - curr->se.exec_start; + if (unlikely((s64)delta_exec < 0)) + delta_exec = 0; + + schedstat_set(curr->se.exec_max, max(curr->se.exec_max, delta_exec)); + + curr->se.sum_exec_runtime += delta_exec; + account_group_exec_runtime(curr, delta_exec); + + curr->se.exec_start = rq->clock; + cpuacct_charge(curr, delta_exec); + + edf_se->runtime -= delta_exec; + /* + * If we exhausted our runtime it has to be replenished + * either immediately or after one (or more) CBS period(s). + */ + if (edf_runtime_exceeded(edf_se)) + resched_task(curr); +} + +static void enqueue_edf_entity(struct sched_edf_entity *edf_se) +{ + struct edf_rq *edf_rq = edf_rq_of_se(edf_se); + struct rb_node **link = &edf_rq->rb_root.rb_node; + struct rb_node *parent = NULL; + struct sched_edf_entity *entry; + int leftmost = 1; + + BUG_ON(!RB_EMPTY_NODE(&edf_se->rb_node)); + + while (*link) { + parent = *link; + entry = rb_entry(parent, struct sched_edf_entity, rb_node); + if (!edf_time_before(entry->deadline, edf_se->deadline)) + link = &parent->rb_left; + else { + link = &parent->rb_right; + leftmost = 0; + } + } + + if (leftmost) + edf_rq->rb_leftmost = &edf_se->rb_node; + + rb_link_node(&edf_se->rb_node, parent, link); + rb_insert_color(&edf_se->rb_node, &edf_rq->rb_root); + + edf_rq->edf_nr_running++; +} + +static void dequeue_edf_entity(struct sched_edf_entity *edf_se) +{ + struct edf_rq *edf_rq = edf_rq_of_se(edf_se); + + if (RB_EMPTY_NODE(&edf_se->rb_node)) + return; + + if (edf_rq->rb_leftmost == &edf_se->rb_node) { + struct rb_node *next_node; + struct sched_edf_entity *next; + + next_node = rb_next(&edf_se->rb_node); + edf_rq->rb_leftmost = next_node; + + if (next_node) + next = rb_entry(next_node, struct sched_edf_entity, + rb_node); + } + + rb_erase(&edf_se->rb_node, &edf_rq->rb_root); + RB_CLEAR_NODE(&edf_se->rb_node); + + edf_rq->edf_nr_running--; +} + +static void check_preempt_curr_edf(struct rq *rq, struct task_struct *p, + int sync) +{ + if (unlikely(rt_prio(p->prio))) + goto resched; + + if (unlikely(p->sched_class != &edf_sched_class)) + return; + + if ((s64)p->edf.deadline - rq->curr->edf.deadline < 0) +resched: + resched_task(rq->curr); +} + +static void +enqueue_task_edf(struct rq *rq, struct task_struct *p, int wakeup) +{ + struct sched_edf_entity *edf_se = &p->edf; + + BUG_ON(on_edf_rq(edf_se)); + + /* + * Only enqueue entities with some remaining runtime. + */ + if (edf_se->runtime <= 0) + return; + + update_edf_params(edf_se); + enqueue_edf_entity(edf_se); + check_preempt_curr_edf(rq, p, 0); +} + +static void +dequeue_task_edf(struct rq *rq, struct task_struct *p, int sleep) +{ + struct sched_edf_entity *edf_se = &p->edf; + + if (!on_edf_rq(edf_se)) + return; + + update_curr_edf(rq); + dequeue_edf_entity(edf_se); +} + +static void yield_task_edf(struct rq *rq) +{ + struct task_struct *p = rq->curr; + struct sched_edf_entity *edf_se = &p->edf; + + __edf_set_ended(edf_se); +} + +#ifdef CONFIG_SCHED_HRTICK +static void start_hrtick_edf(struct rq *rq, struct task_struct *p) +{ + struct sched_edf_entity *edf_se = &p->edf; + s64 delta; + + delta = edf_se->runtime_max - edf_se->runtime; + + if (delta > 10000) + hrtick_start(rq, delta); +} +#else +static void start_hrtick_edf(struct rq *rq, struct task_struct *p) +{ +} +#endif + +static struct sched_edf_entity *__pick_edf_last_entity(struct edf_rq *edf_rq) +{ + struct rb_node *last = rb_last(&edf_rq->rb_root); + + if (!last) + return NULL; + + return rb_entry(last, struct sched_edf_entity, rb_node); +} + +static struct sched_edf_entity *pick_next_edf_entity(struct rq *rq, + struct edf_rq *edf_rq) +{ + struct rb_node *left = edf_rq->rb_leftmost; + + if (!left) + return NULL; + + return rb_entry(left, struct sched_edf_entity, rb_node); +} + +struct task_struct *pick_next_task_edf(struct rq *rq) +{ + struct sched_edf_entity *edf_se; + struct edf_rq *edf_rq = &rq->edf; + struct task_struct *p; + + BUG_ON(edf_rq->edf_nr_running < 0); + + if (!edf_rq->edf_nr_running) + return NULL; + + edf_se = pick_next_edf_entity(rq, edf_rq); + BUG_ON(!edf_se); + + p = edf_task_of(edf_se); + p->se.exec_start = rq->clock; + if (hrtick_enabled(rq)) + start_hrtick_edf(rq, p); + + return p; +} + +static void put_prev_task_edf(struct rq *rq, struct task_struct *p) +{ + update_curr_edf(rq); + p->se.exec_start = 0; +} + +static void task_tick_edf(struct rq *rq, struct task_struct *p, int queued) +{ + update_curr_edf(rq); +} + +static void set_curr_task_edf(struct rq *rq) +{ + struct task_struct *p = rq->curr; + + p->se.exec_start = rq->clock; +} + +static void prio_changed_edf(struct rq *rq, struct task_struct *p, + int oldprio, int running) +{ + if (running) { + if (p->prio > oldprio) + resched_task(rq->curr); + } else + check_preempt_curr_edf(rq, p, 0); +} + +static void switched_to_edf(struct rq *rq, struct task_struct *p, + int running) +{ + if (!running) + check_preempt_curr_edf(rq, p, 0); +} + +#ifdef CONFIG_SMP +static int select_task_rq_edf(struct task_struct *p, int sd_flag, int flags) +{ + return task_cpu(p); +} + +static unsigned long +load_balance_edf(struct rq *this_rq, int this_cpu, struct rq *busiest, + unsigned long max_load_move, + struct sched_domain *sd, enum cpu_idle_type idle, + int *all_pinned, int *this_best_prio) +{ + /* for now, don't touch EDF tasks */ + return 0; +} + +static int +move_one_task_edf(struct rq *this_rq, int this_cpu, struct rq *busiest, + struct sched_domain *sd, enum cpu_idle_type idle) +{ + return 0; +} + +static void set_cpus_allowed_edf(struct task_struct *p, + const struct cpumask *new_mask) +{ + int weight = cpumask_weight(new_mask); + + BUG_ON(!edf_task(p)); + + cpumask_copy(&p->cpus_allowed, new_mask); + p->edf.nr_cpus_allowed = weight; +} +#endif + +static const struct sched_class edf_sched_class = { + .next = &fair_sched_class, + .enqueue_task = enqueue_task_edf, + .dequeue_task = dequeue_task_edf, + .yield_task = yield_task_edf, + + .check_preempt_curr = check_preempt_curr_edf, + + .pick_next_task = pick_next_task_edf, + .put_prev_task = put_prev_task_edf, + +#ifdef CONFIG_SMP + .select_task_rq = select_task_rq_edf, + + .load_balance = load_balance_edf, + .move_one_task = move_one_task_edf, + .set_cpus_allowed = set_cpus_allowed_edf, +#endif + + .set_curr_task = set_curr_task_edf, + .task_tick = task_tick_edf, + + .prio_changed = prio_changed_edf, + .switched_to = switched_to_edf, +}; + +#ifdef CONFIG_SCHED_DEBUG +void print_edf_rq(struct seq_file *m, int cpu, struct edf_rq *edf_rq); + +static void print_edf_stats(struct seq_file *m, int cpu) +{ + struct edf_rq *edf_rq = &cpu_rq(cpu)->edf; + + rcu_read_lock(); + for_each_leaf_edf_rq(edf_rq, cpu_rq(cpu)) + print_edf_rq(m, cpu, edf_rq); + rcu_read_unlock(); +} +#endif /* CONFIG_SCHED_DEBUG */ + diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index ecc637a..cae7fb8 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1571,10 +1571,8 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ update_curr(cfs_rq); - if (unlikely(rt_prio(p->prio))) { + if (rt_prio(p->prio) || task_has_edf_policy(p)) resched_task(curr); - return; - } if (unlikely(p->sched_class != &fair_sched_class)) return; diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index a4d790c..4511fc9 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -1746,7 +1746,7 @@ unsigned int get_rr_interval_rt(struct task_struct *task) } static const struct sched_class rt_sched_class = { - .next = &fair_sched_class, + .next = &edf_sched_class, .enqueue_task = enqueue_task_rt, .dequeue_task = dequeue_task_rt, .yield_task = yield_task_rt, diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 26185d7..d32aaf0 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -24,6 +24,7 @@ static int __read_mostly tracer_enabled; static struct task_struct *wakeup_task; static int wakeup_cpu; +static int wakeup_edf; static int wakeup_current_cpu; static unsigned wakeup_prio = -1; static int wakeup_rt; @@ -214,6 +215,9 @@ probe_wakeup(struct rq *rq, struct task_struct *p, int success) tracing_record_cmdline(p); tracing_record_cmdline(current); + if (wakeup_edf && !edf_task(p)) + return; + if ((wakeup_rt && !rt_task(p)) || p->prio >= wakeup_prio || p->prio >= current->prio) @@ -340,12 +344,21 @@ static int __wakeup_tracer_init(struct trace_array *tr) static int wakeup_tracer_init(struct trace_array *tr) { + wakeup_edf = 0; + wakeup_rt = 0; + return __wakeup_tracer_init(tr); +} + +static int wakeup_edf_tracer_init(struct trace_array *tr) +{ + wakeup_edf = 1; wakeup_rt = 0; return __wakeup_tracer_init(tr); } static int wakeup_rt_tracer_init(struct trace_array *tr) { + wakeup_edf = 0; wakeup_rt = 1; return __wakeup_tracer_init(tr); } @@ -384,6 +397,20 @@ static struct tracer wakeup_tracer __read_mostly = #endif }; +static struct tracer wakeup_edf_tracer __read_mostly = +{ + .name = "wakeup_edf", + .init = wakeup_edf_tracer_init, + .reset = wakeup_tracer_reset, + .start = wakeup_tracer_start, + .stop = wakeup_tracer_stop, + .wait_pipe = poll_wait_pipe, + .print_max = 1, +#ifdef CONFIG_FTRACE_SELFTEST + .selftest = trace_selftest_startup_wakeup, +#endif +}; + static struct tracer wakeup_rt_tracer __read_mostly = { .name = "wakeup_rt", @@ -406,6 +433,10 @@ __init static int init_wakeup_tracer(void) if (ret) return ret; + ret = register_tracer(&wakeup_edf_tracer); + if (ret) + return ret; + ret = register_tracer(&wakeup_rt_tracer); if (ret) return ret;