[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <20190906191237.27006-11-riel@surriel.com>
Date: Fri, 6 Sep 2019 15:12:32 -0400
From: Rik van Riel <riel@...riel.com>
To: linux-kernel@...r.kernel.org
Cc: kernel-team@...com, pjt@...gle.com, dietmar.eggemann@....com,
peterz@...radead.org, mingo@...hat.com, morten.rasmussen@....com,
tglx@...utronix.de, mgorman@...hsingularity.net,
vincent.guittot@...aro.org, Rik van Riel <riel@...riel.com>
Subject: [PATCH 10/15] sched,fair: flatten hierarchical runqueues
Flatten the hierarchical runqueues into just the per-CPU rq.cfs runqueue.
Iteration of the sched_entity hierarchy is rate-limited to once per jiffy
per sched_entity, which is a smaller change than it seems, because load
average adjustments were already rate-limited to once per jiffy before this
patch series.
This patch breaks CONFIG_CFS_BANDWIDTH. The plan for that is to park tasks
from throttled cgroups onto their cgroup runqueues, and slowly (using the
GENTLE_FAIR_SLEEPERS feature) wake them back up, in vruntime order, once the
cgroup gets unthrottled, to prevent thundering-herd issues.
Signed-off-by: Rik van Riel <riel@...riel.com>
---
include/linux/sched.h | 2 +
kernel/sched/fair.c | 502 ++++++++++++++----------------------------
kernel/sched/pelt.c | 6 +-
kernel/sched/pelt.h | 2 +-
kernel/sched/sched.h | 2 +-
5 files changed, 170 insertions(+), 344 deletions(-)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 84a6cc6f5c47..901c710363e7 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -453,6 +453,8 @@ struct sched_entity {
#ifdef CONFIG_FAIR_GROUP_SCHED
int depth;
unsigned long enqueued_h_load;
+ unsigned long enqueued_h_weight;
+ struct load_weight h_load;
struct sched_entity *parent;
/* rq on which this entity is (to be) queued: */
struct cfs_rq *cfs_rq;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 08f7f627bdde..d5a3103e3d5a 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -470,6 +470,19 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse)
}
}
+/* Add the cgroup cfs_rqs to the list, for update_blocked_averages */
+static void enqueue_entity_cfs_rqs(struct sched_entity *se)
+{
+ SCHED_WARN_ON(!entity_is_task(se));
+
+ for_each_sched_entity(se) {
+ struct cfs_rq *cfs_rq = group_cfs_rq_of_parent(se);
+
+ if (list_add_leaf_cfs_rq(cfs_rq))
+ break;
+ }
+}
+
#else /* !CONFIG_FAIR_GROUP_SCHED */
static inline struct task_struct *task_of(struct sched_entity *se)
@@ -697,8 +710,14 @@ int sched_proc_update_handler(struct ctl_table *table, int write,
*/
static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se)
{
- if (unlikely(se->load.weight != NICE_0_LOAD))
+ if (task_se_in_cgroup(se)) {
+ unsigned long h_weight = task_se_h_weight(se);
+ if (h_weight != se->h_load.weight)
+ update_load_set(&se->h_load, h_weight);
+ delta = __calc_delta(delta, NICE_0_LOAD, &se->h_load);
+ } else if (unlikely(se->load.weight != NICE_0_LOAD)) {
delta = __calc_delta(delta, NICE_0_LOAD, &se->load);
+ }
return delta;
}
@@ -728,22 +747,16 @@ static u64 __sched_period(unsigned long nr_running)
static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
u64 slice = __sched_period(cfs_rq->nr_running + !se->on_rq);
+ struct load_weight *load = &cfs_rq->load;
+ struct load_weight lw;
- for_each_sched_entity(se) {
- struct load_weight *load;
- struct load_weight lw;
+ if (unlikely(!se->on_rq)) {
+ lw = cfs_rq->load;
- cfs_rq = cfs_rq_of(se);
- load = &cfs_rq->load;
-
- if (unlikely(!se->on_rq)) {
- lw = cfs_rq->load;
-
- update_load_add(&lw, se->load.weight);
- load = &lw;
- }
- slice = __calc_delta(slice, se->load.weight, load);
+ update_load_add(&lw, task_se_h_weight(se));
+ load = &lw;
}
+ slice = __calc_delta(slice, task_se_h_weight(se), load);
/*
* To avoid cache thrashing, run at least sysctl_sched_min_granularity.
@@ -791,6 +804,7 @@ void init_entity_runnable_average(struct sched_entity *se)
static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
static void attach_entity_cfs_rq(struct sched_entity *se);
+static void propagate_entity_cfs_rq(struct sched_entity *se, int flags);
/*
* With new tasks being created, their initial util_avgs are extrapolated
@@ -2742,18 +2756,36 @@ static inline void update_scan_period(struct task_struct *p, int new_cpu)
#endif /* CONFIG_NUMA_BALANCING */
static void
-account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
+account_entity_enqueue_h(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- update_load_add(&cfs_rq->load, se->load.weight);
- if (!parent_entity(se))
+ if (task_se_in_cgroup(se)) {
+ /* Add the hierarchical weight to the CPU rq */
+ unsigned long h_weight = task_se_h_weight(se);
+ se->enqueued_h_weight = h_weight;
+ update_load_add(&rq_of(cfs_rq)->load, h_weight);
+ } else {
update_load_add(&rq_of(cfs_rq)->load, se->load.weight);
-#ifdef CONFIG_SMP
- if (entity_is_task(se)) {
- struct rq *rq = rq_of(cfs_rq);
+ }
+}
- account_numa_enqueue(rq, task_of(se));
- list_add(&se->group_node, &rq->cfs_tasks);
+static void
+account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ struct rq *rq;
+
+ if (task_se_in_cgroup(se)) {
+ struct cfs_rq *cgroup_rq = group_cfs_rq_of_parent(se);
+
+ update_load_add(&cgroup_rq->load, se->load.weight);
+ cgroup_rq->nr_running++;
+ } else {
+ update_load_add(&cfs_rq->load, se->load.weight);
}
+#ifdef CONFIG_SMP
+ rq = rq_of(cfs_rq);
+
+ account_numa_enqueue(rq, task_of(se));
+ list_add(&se->group_node, &rq->cfs_tasks);
#endif
cfs_rq->nr_running++;
}
@@ -2761,14 +2793,20 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
static void
account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- update_load_sub(&cfs_rq->load, se->load.weight);
- if (!parent_entity(se))
+ if (task_se_in_cgroup(se)) {
+ struct cfs_rq *cgroup_rq = group_cfs_rq_of_parent(se);
+
+ update_load_sub(&cgroup_rq->load, se->load.weight);
+ cgroup_rq->nr_running--;
+
+ update_load_sub(&rq_of(cfs_rq)->load, se->enqueued_h_weight);
+ } else {
+ update_load_sub(&cfs_rq->load, se->load.weight);
update_load_sub(&rq_of(cfs_rq)->load, se->load.weight);
-#ifdef CONFIG_SMP
- if (entity_is_task(se)) {
- account_numa_dequeue(rq_of(cfs_rq), task_of(se));
- list_del_init(&se->group_node);
}
+#ifdef CONFIG_SMP
+ account_numa_dequeue(rq_of(cfs_rq), task_of(se));
+ list_del_init(&se->group_node);
#endif
cfs_rq->nr_running--;
}
@@ -2863,6 +2901,9 @@ update_runnable_load_avg(struct sched_entity *se)
static inline void
enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
+ if (task_se_in_cgroup(se))
+ cfs_rq = group_cfs_rq_of_parent(se);
+
cfs_rq->avg.load_avg += se->avg.load_avg;
cfs_rq->avg.load_sum += se_weight(se) * se->avg.load_sum;
}
@@ -2870,6 +2911,9 @@ enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
static inline void
dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
+ if (task_se_in_cgroup(se))
+ cfs_rq = group_cfs_rq_of_parent(se);
+
sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg);
sub_positive(&cfs_rq->avg.load_sum, se_weight(se) * se->avg.load_sum);
}
@@ -3047,7 +3091,6 @@ static void update_cfs_group(struct sched_entity *se)
long shares;
if (!gcfs_rq) {
- update_runnable_load_avg(se);
return;
}
@@ -3498,7 +3541,9 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
cfs_rq->avg.util_avg += se->avg.util_avg;
cfs_rq->avg.util_sum += se->avg.util_sum;
- add_tg_cfs_propagate(cfs_rq, se->avg.load_sum);
+ if (task_se_in_cgroup(se))
+ add_tg_cfs_propagate(group_cfs_rq_of_parent(se),
+ se->avg.load_sum);
cfs_rq_util_change(cfs_rq, flags);
}
@@ -3517,7 +3562,9 @@ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg);
sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum);
- add_tg_cfs_propagate(cfs_rq, -se->avg.load_sum);
+ if (task_se_in_cgroup(se))
+ add_tg_cfs_propagate(group_cfs_rq_of_parent(se),
+ -se->avg.load_sum);
cfs_rq_util_change(cfs_rq, 0);
}
@@ -3528,11 +3575,13 @@ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
#define UPDATE_TG 0x1
#define SKIP_AGE_LOAD 0x2
#define DO_ATTACH 0x4
+#define SE_IS_CURRENT 0x8
/* Update task and its cfs_rq load average */
static inline bool update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
u64 now = cfs_rq_clock_pelt(cfs_rq);
+ bool curr = flags & SE_IS_CURRENT;
int decayed;
/*
@@ -3540,7 +3589,7 @@ static inline bool update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
* track group sched_entity load average for task_se_h_load calc in migration
*/
if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD))
- __update_load_avg_se(now, cfs_rq, se);
+ __update_load_avg_se(now, cfs_rq, se, curr, curr);
decayed = update_cfs_rq_load_avg(now, cfs_rq);
decayed |= propagate_entity_load_avg(se);
@@ -3606,6 +3655,9 @@ static void remove_entity_load_avg(struct sched_entity *se)
struct cfs_rq *cfs_rq = cfs_rq_of(se);
unsigned long flags;
+ if (task_se_in_cgroup(se))
+ cfs_rq = group_cfs_rq_of_parent(se);
+
/*
* tasks cannot exit without having gone through wake_up_new_task() ->
* post_init_entity_util_avg() which will have added things to the
@@ -3776,6 +3828,7 @@ static inline void update_misfit_status(struct task_struct *p, struct rq *rq)
#define UPDATE_TG 0x0
#define SKIP_AGE_LOAD 0x0
#define DO_ATTACH 0x0
+#define SE_IS_CURRENT 0x0
static inline bool update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int not_used1)
{
@@ -3903,24 +3956,6 @@ static inline void check_schedstat_required(void)
* CPU and an up-to-date min_vruntime on the destination CPU.
*/
-static bool
-enqueue_entity_groups(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
-{
- /*
- * When enqueuing a sched_entity, we must:
- * - Update loads to have both entity and cfs_rq synced with now.
- * - Add its load to cfs_rq->runnable_avg
- * - For group_entity, update its weight to reflect the new share of
- * its group cfs_rq
- * - Add its new weight to cfs_rq->load.weight
- */
- if (!update_load_avg(cfs_rq, se, UPDATE_TG | DO_ATTACH))
- return false;
-
- update_cfs_group(se);
- return true;
-}
-
static void
enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
@@ -3945,9 +3980,6 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
if (renorm && !curr)
se->vruntime += cfs_rq->min_vruntime;
- enqueue_runnable_load_avg(cfs_rq, se);
- account_entity_enqueue(cfs_rq, se);
-
if (flags & ENQUEUE_WAKEUP)
place_entity(cfs_rq, se, 0);
@@ -3958,77 +3990,24 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
__enqueue_entity(cfs_rq, se);
se->on_rq = 1;
- if (cfs_rq->nr_running == 1) {
- list_add_leaf_cfs_rq(cfs_rq);
- check_enqueue_throttle(cfs_rq);
- }
-}
-
-static void __clear_buddies_last(struct sched_entity *se)
-{
- for_each_sched_entity(se) {
- struct cfs_rq *cfs_rq = cfs_rq_of(se);
- if (cfs_rq->last != se)
- break;
-
- cfs_rq->last = NULL;
- }
-}
-
-static void __clear_buddies_next(struct sched_entity *se)
-{
- for_each_sched_entity(se) {
- struct cfs_rq *cfs_rq = cfs_rq_of(se);
- if (cfs_rq->next != se)
- break;
-
- cfs_rq->next = NULL;
- }
-}
-
-static void __clear_buddies_skip(struct sched_entity *se)
-{
- for_each_sched_entity(se) {
- struct cfs_rq *cfs_rq = cfs_rq_of(se);
- if (cfs_rq->skip != se)
- break;
-
- cfs_rq->skip = NULL;
- }
+ if (task_se_in_cgroup(se))
+ enqueue_entity_cfs_rqs(se);
}
static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
if (cfs_rq->last == se)
- __clear_buddies_last(se);
+ cfs_rq->last = NULL;
if (cfs_rq->next == se)
- __clear_buddies_next(se);
+ cfs_rq->next = NULL;
if (cfs_rq->skip == se)
- __clear_buddies_skip(se);
+ cfs_rq->skip = NULL;
}
static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
-static bool
-dequeue_entity_groups(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
-{
- /*
- * When dequeuing a sched_entity, we must:
- * - Update loads to have both entity and cfs_rq synced with now.
- * - Subtract its load from the cfs_rq->runnable_avg.
- * - Subtract its previous weight from cfs_rq->load.weight.
- * - For group entity, update its weight to reflect the new share
- * of its group cfs_rq.
- */
- if (!update_load_avg(cfs_rq, se, UPDATE_TG))
- return false;
- update_cfs_group(se);
-
- return true;
-}
-
static void
dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
@@ -4122,7 +4101,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
*/
update_stats_wait_end(cfs_rq, se);
__dequeue_entity(cfs_rq, se);
- update_load_avg(cfs_rq, se, UPDATE_TG);
+ propagate_entity_cfs_rq(se, UPDATE_TG);
}
update_stats_curr_start(cfs_rq, se);
@@ -4220,11 +4199,12 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
check_spread(cfs_rq, prev);
if (prev->on_rq) {
+ struct sched_entity *se = prev;
update_stats_wait_start(cfs_rq, prev);
/* Put 'current' back into the tree. */
__enqueue_entity(cfs_rq, prev);
/* in !on_rq case, update occurred at dequeue */
- update_load_avg(cfs_rq, prev, 0);
+ propagate_entity_cfs_rq(se, SE_IS_CURRENT);
}
cfs_rq->curr = NULL;
}
@@ -4240,7 +4220,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
/*
* Ensure that runnable average is periodically updated.
*/
- update_load_avg(cfs_rq, curr, UPDATE_TG);
+ update_load_avg(cfs_rq, curr, UPDATE_TG|SE_IS_CURRENT);
update_cfs_group(curr);
#ifdef CONFIG_SCHED_HRTICK
@@ -4259,9 +4239,6 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
hrtimer_active(&rq_of(cfs_rq)->hrtick_timer))
return;
#endif
-
- if (cfs_rq->nr_running > 1)
- check_preempt_tick(cfs_rq, curr);
}
@@ -5136,7 +5113,7 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
SCHED_WARN_ON(task_rq(p) != rq);
- if (rq->cfs.h_nr_running > 1) {
+ if (rq->cfs.nr_running > 1) {
u64 slice = sched_slice(cfs_rq, se);
u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
s64 delta = slice - ran;
@@ -5201,7 +5178,7 @@ static inline void update_overutilized_status(struct rq *rq) { }
static void
enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
{
- struct cfs_rq *cfs_rq;
+ struct cfs_rq *cfs_rq = &rq->cfs;
struct sched_entity *se = &p->se;
/*
@@ -5210,7 +5187,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
* Let's add the task's estimated utilization to the cfs_rq's
* estimated utilization, before we update schedutil.
*/
- util_est_enqueue(&rq->cfs, p);
+ util_est_enqueue(cfs_rq, p);
/*
* If in_iowait is set, the code below may not trigger any cpufreq
@@ -5220,38 +5197,17 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
if (p->in_iowait)
cpufreq_update_util(rq, SCHED_CPUFREQ_IOWAIT);
- for_each_sched_entity(se) {
- if (se->on_rq)
- break;
- cfs_rq = cfs_rq_of(se);
- enqueue_entity_groups(cfs_rq, se, flags);
- enqueue_entity(cfs_rq, se, flags);
-
- /*
- * end evaluation on encountering a throttled cfs_rq
- *
- * note: in the case of encountering a throttled cfs_rq we will
- * post the final h_nr_running increment below.
- */
- if (cfs_rq_throttled(cfs_rq))
- break;
- cfs_rq->h_nr_running++;
-
- flags = ENQUEUE_WAKEUP;
- }
+ /* First, add the static weight to the (group) cfs_rq. */
+ account_entity_enqueue(cfs_rq, se);
- for_each_sched_entity(se) {
- cfs_rq = cfs_rq_of(se);
- cfs_rq->h_nr_running++;
+ /* Then, propagate that up the hierarchy. */
+ propagate_entity_cfs_rq(se, UPDATE_TG|DO_ATTACH);
- if (cfs_rq_throttled(cfs_rq))
- break;
-
- update_load_avg(cfs_rq, se, UPDATE_TG);
- update_cfs_group(se);
- }
+ /* Apply the calculated hierarchical weight & load. */
+ enqueue_entity(cfs_rq, se, flags);
+ enqueue_runnable_load_avg(cfs_rq, se);
+ account_entity_enqueue_h(cfs_rq, se);
- if (!se) {
add_nr_running(rq, 1);
/*
* Since new tasks are assigned an initial util_avg equal to
@@ -5270,23 +5226,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
if (flags & ENQUEUE_WAKEUP)
update_overutilized_status(rq);
- }
-
- if (cfs_bandwidth_used()) {
- /*
- * When bandwidth control is enabled; the cfs_rq_throttled()
- * breaks in the above iteration can result in incomplete
- * leaf list maintenance, resulting in triggering the assertion
- * below.
- */
- for_each_sched_entity(se) {
- cfs_rq = cfs_rq_of(se);
-
- if (list_add_leaf_cfs_rq(cfs_rq))
- break;
- }
- }
-
assert_list_leaf_cfs_rq(rq);
hrtick_update(rq);
@@ -5301,55 +5240,17 @@ static void set_next_buddy(struct sched_entity *se);
*/
static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
{
- struct cfs_rq *cfs_rq;
+ struct cfs_rq *cfs_rq = &rq->cfs;
struct sched_entity *se = &p->se;
int task_sleep = flags & DEQUEUE_SLEEP;
- for_each_sched_entity(se) {
- cfs_rq = cfs_rq_of(se);
- dequeue_entity_groups(cfs_rq, se, flags);
- dequeue_entity(cfs_rq, se, flags);
-
- /*
- * end evaluation on encountering a throttled cfs_rq
- *
- * note: in the case of encountering a throttled cfs_rq we will
- * post the final h_nr_running decrement below.
- */
- if (cfs_rq_throttled(cfs_rq))
- break;
- cfs_rq->h_nr_running--;
-
- /* Don't dequeue parent if it has other entities besides us */
- if (cfs_rq->load.weight) {
- /* Avoid re-evaluating load for this entity: */
- se = parent_entity(se);
- /*
- * Bias pick_next to pick a task from this cfs_rq, as
- * p is sleeping when it is within its sched_slice.
- */
- if (task_sleep && se && !throttled_hierarchy(cfs_rq))
- set_next_buddy(se);
- break;
- }
- flags |= DEQUEUE_SLEEP;
- }
-
- for_each_sched_entity(se) {
- cfs_rq = cfs_rq_of(se);
- cfs_rq->h_nr_running--;
-
- if (cfs_rq_throttled(cfs_rq))
- break;
+ propagate_entity_cfs_rq(se, UPDATE_TG|SE_IS_CURRENT);
- update_load_avg(cfs_rq, se, UPDATE_TG);
- update_cfs_group(se);
- }
+ dequeue_entity(cfs_rq, &p->se, flags);
- if (!se)
- sub_nr_running(rq, 1);
+ sub_nr_running(rq, 1);
- util_est_dequeue(&rq->cfs, p, task_sleep);
+ util_est_dequeue(cfs_rq, p, task_sleep);
hrtick_update(rq);
}
@@ -5672,7 +5573,7 @@ static unsigned long capacity_of(int cpu)
static unsigned long cpu_avg_load_per_task(int cpu)
{
struct rq *rq = cpu_rq(cpu);
- unsigned long nr_running = READ_ONCE(rq->cfs.h_nr_running);
+ unsigned long nr_running = READ_ONCE(rq->cfs.nr_running);
unsigned long load_avg = weighted_cpuload(rq);
if (nr_running)
@@ -6891,11 +6792,9 @@ static void set_last_buddy(struct sched_entity *se)
if (entity_is_task(se) && unlikely(task_has_idle_policy(task_of(se))))
return;
- for_each_sched_entity(se) {
- if (SCHED_WARN_ON(!se->on_rq))
- return;
- cfs_rq_of(se)->last = se;
- }
+ if (SCHED_WARN_ON(!se->on_rq))
+ return;
+ cfs_rq_of(se)->last = se;
}
static void set_next_buddy(struct sched_entity *se)
@@ -6903,17 +6802,14 @@ static void set_next_buddy(struct sched_entity *se)
if (entity_is_task(se) && unlikely(task_has_idle_policy(task_of(se))))
return;
- for_each_sched_entity(se) {
- if (SCHED_WARN_ON(!se->on_rq))
- return;
- cfs_rq_of(se)->next = se;
- }
+ if (SCHED_WARN_ON(!se->on_rq))
+ return;
+ cfs_rq_of(se)->next = se;
}
static void set_skip_buddy(struct sched_entity *se)
{
- for_each_sched_entity(se)
- cfs_rq_of(se)->skip = se;
+ cfs_rq_of(se)->skip = se;
}
/*
@@ -6969,7 +6865,6 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
if (unlikely(p->policy != SCHED_NORMAL) || !sched_feat(WAKEUP_PREEMPTION))
return;
- find_matching_se(&se, &pse);
update_curr(cfs_rq_of(se));
BUG_ON(!pse);
if (wakeup_preempt_entity(se, pse) == 1) {
@@ -7010,100 +6905,18 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf
struct task_struct *p;
int new_tasks;
+ put_prev_task(rq, prev);
again:
if (!cfs_rq->nr_running)
goto idle;
-#ifdef CONFIG_FAIR_GROUP_SCHED
- if (prev->sched_class != &fair_sched_class)
- goto simple;
-
- /*
- * Because of the set_next_buddy() in dequeue_task_fair() it is rather
- * likely that a next task is from the same cgroup as the current.
- *
- * Therefore attempt to avoid putting and setting the entire cgroup
- * hierarchy, only change the part that actually changes.
- */
-
- do {
- struct sched_entity *curr = cfs_rq->curr;
-
- /*
- * Since we got here without doing put_prev_entity() we also
- * have to consider cfs_rq->curr. If it is still a runnable
- * entity, update_curr() will update its vruntime, otherwise
- * forget we've ever seen it.
- */
- if (curr) {
- if (curr->on_rq)
- update_curr(cfs_rq);
- else
- curr = NULL;
-
- /*
- * This call to check_cfs_rq_runtime() will do the
- * throttle and dequeue its entity in the parent(s).
- * Therefore the nr_running test will indeed
- * be correct.
- */
- if (unlikely(check_cfs_rq_runtime(cfs_rq))) {
- cfs_rq = &rq->cfs;
-
- if (!cfs_rq->nr_running)
- goto idle;
-
- goto simple;
- }
- }
-
- se = pick_next_entity(cfs_rq, curr);
- cfs_rq = group_cfs_rq(se);
- } while (cfs_rq);
-
- p = task_of(se);
-
- /*
- * Since we haven't yet done put_prev_entity and if the selected task
- * is a different task than we started out with, try and touch the
- * least amount of cfs_rqs.
- */
- if (prev != p) {
- struct sched_entity *pse = &prev->se;
-
- while (!(cfs_rq = is_same_group(se, pse))) {
- int se_depth = se->depth;
- int pse_depth = pse->depth;
-
- if (se_depth <= pse_depth) {
- put_prev_entity(cfs_rq_of(pse), pse);
- pse = parent_entity(pse);
- }
- if (se_depth >= pse_depth) {
- set_next_entity(cfs_rq_of(se), se);
- se = parent_entity(se);
- }
- }
-
- put_prev_entity(cfs_rq, pse);
- set_next_entity(cfs_rq, se);
- }
-
- goto done;
-simple:
-#endif
-
- put_prev_task(rq, prev);
-
- do {
- se = pick_next_entity(cfs_rq, NULL);
- set_next_entity(cfs_rq, se);
- cfs_rq = group_cfs_rq(se);
- } while (cfs_rq);
+ se = pick_next_entity(cfs_rq, NULL);
+ if (!se)
+ goto idle;
+ set_next_entity(cfs_rq, se);
p = task_of(se);
-done: __maybe_unused;
#ifdef CONFIG_SMP
/*
* Move the next running task to the front of
@@ -7152,10 +6965,8 @@ static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
struct sched_entity *se = &prev->se;
struct cfs_rq *cfs_rq;
- for_each_sched_entity(se) {
- cfs_rq = cfs_rq_of(se);
- put_prev_entity(cfs_rq, se);
- }
+ cfs_rq = cfs_rq_of(se);
+ put_prev_entity(cfs_rq, se);
}
/*
@@ -7916,6 +7727,11 @@ static unsigned long task_se_h_load(struct sched_entity *se)
return se->avg.load_avg;
}
+static unsigned long task_se_h_weight(struct sched_entity *se)
+{
+ return se->load.weight;
+}
+
static unsigned long task_se_h_weight(struct sched_entity *se)
{
return se->load.weight;
@@ -8321,7 +8137,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
sgs->group_load += load;
sgs->group_util += cpu_util(i);
- sgs->sum_nr_running += rq->cfs.h_nr_running;
+ sgs->sum_nr_running += rq->cfs.nr_running;
nr_running = rq->nr_running;
if (nr_running > 1)
@@ -9012,7 +8828,7 @@ voluntary_active_balance(struct lb_env *env)
* available on dst_cpu.
*/
if ((env->idle != CPU_NOT_IDLE) &&
- (env->src_rq->cfs.h_nr_running == 1)) {
+ (env->src_rq->cfs.nr_running == 1)) {
if ((check_cpu_capacity(env->src_rq, sd)) &&
(capacity_of(env->src_cpu)*sd->imbalance_pct < capacity_of(env->dst_cpu)*100))
return 1;
@@ -9693,7 +9509,7 @@ static void nohz_balancer_kick(struct rq *rq)
* capacity; kick the ILB to see if there's a better CPU to run
* on.
*/
- if (rq->cfs.h_nr_running >= 1 && check_cpu_capacity(rq, sd)) {
+ if (rq->cfs.nr_running >= 1 && check_cpu_capacity(rq, sd)) {
flags = NOHZ_KICK_MASK;
goto unlock;
}
@@ -10142,7 +9958,7 @@ static int idle_balance(struct rq *this_rq, struct rq_flags *rf)
* have been enqueued in the meantime. Since we're not going idle,
* pretend we pulled a task.
*/
- if (this_rq->cfs.h_nr_running && !pulled_task)
+ if (this_rq->cfs.nr_running && !pulled_task)
pulled_task = 1;
/* Move the next balance forward */
@@ -10150,7 +9966,7 @@ static int idle_balance(struct rq *this_rq, struct rq_flags *rf)
this_rq->next_balance = next_balance;
/* Is there a task of a high priority class? */
- if (this_rq->nr_running != this_rq->cfs.h_nr_running)
+ if (this_rq->nr_running != this_rq->cfs.nr_running)
pulled_task = -1;
if (pulled_task)
@@ -10237,6 +10053,12 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
entity_tick(cfs_rq, se, queued);
}
+ update_runnable_load_avg(&curr->se);
+
+ cfs_rq = &rq->cfs;
+ if (cfs_rq->nr_running > 1)
+ check_preempt_tick(cfs_rq, &curr->se);
+
if (static_branch_unlikely(&sched_numa_balancing))
task_tick_numa(rq, curr);
@@ -10335,40 +10157,45 @@ static inline bool vruntime_normalized(struct task_struct *p)
* Propagate the changes of the sched_entity across the tg tree to make it
* visible to the root
*/
-static void propagate_entity_cfs_rq(struct sched_entity *se)
+static void propagate_entity_cfs_rq(struct sched_entity *se, int flags)
{
struct cfs_rq *cfs_rq;
- /* Start to propagate at parent */
- se = se->parent;
-
for_each_sched_entity(se) {
- cfs_rq = cfs_rq_of(se);
+ cfs_rq = group_cfs_rq_of_parent(se);
if (cfs_rq_throttled(cfs_rq))
break;
- update_load_avg(cfs_rq, se, UPDATE_TG);
+ /*
+ * Walk the hierarchy while PELT says there is work to do,
+ * or while attaching a new task, to get sane task weights.
+ */
+ if (!update_load_avg(cfs_rq, se, flags))
+ if (!(flags & DO_ATTACH))
+ break;
+
+ update_cfs_group(se);
}
}
#else
-static void propagate_entity_cfs_rq(struct sched_entity *se) { }
+static void propagate_entity_cfs_rq(struct sched_entity *se, int flags) { }
#endif
static void detach_entity_cfs_rq(struct sched_entity *se)
{
- struct cfs_rq *cfs_rq = cfs_rq_of(se);
+ struct cfs_rq *cfs_rq = group_cfs_rq_of_parent(se);
- /* Catch up with the cfs_rq and remove our load when we leave */
update_load_avg(cfs_rq, se, 0);
detach_entity_load_avg(cfs_rq, se);
update_tg_load_avg(cfs_rq, false);
- propagate_entity_cfs_rq(se);
+ propagate_entity_cfs_rq(se->parent, UPDATE_TG);
}
static void attach_entity_cfs_rq(struct sched_entity *se)
{
- struct cfs_rq *cfs_rq = cfs_rq_of(se);
+ int flags = sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD;
+ struct cfs_rq *cfs_rq = group_cfs_rq_of_parent(se);
#ifdef CONFIG_FAIR_GROUP_SCHED
/*
@@ -10379,10 +10206,10 @@ static void attach_entity_cfs_rq(struct sched_entity *se)
#endif
/* Synchronize entity with its cfs_rq */
- update_load_avg(cfs_rq, se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD);
+ update_load_avg(cfs_rq, se, flags);
attach_entity_load_avg(cfs_rq, se, 0);
update_tg_load_avg(cfs_rq, false);
- propagate_entity_cfs_rq(se);
+ propagate_entity_cfs_rq(se->parent, flags | UPDATE_TG);
}
static void detach_task_cfs_rq(struct task_struct *p)
@@ -10443,14 +10270,11 @@ static void switched_to_fair(struct rq *rq, struct task_struct *p)
static void set_curr_task_fair(struct rq *rq)
{
struct sched_entity *se = &rq->curr->se;
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
- for_each_sched_entity(se) {
- struct cfs_rq *cfs_rq = cfs_rq_of(se);
-
- set_next_entity(cfs_rq, se);
- /* ensure bandwidth has been allocated on our new cfs_rq */
- account_cfs_rq_runtime(cfs_rq, 0);
- }
+ set_next_entity(cfs_rq, se);
+ /* ensure bandwidth has been allocated on our new cfs_rq */
+ account_cfs_rq_runtime(cfs_rq, 0);
}
void init_cfs_rq(struct cfs_rq *cfs_rq)
diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c
index b3f3e8b29394..8492c9792ad0 100644
--- a/kernel/sched/pelt.c
+++ b/kernel/sched/pelt.c
@@ -251,10 +251,10 @@ int __update_load_avg_blocked_se(u64 now, struct sched_entity *se)
return 0;
}
-int __update_load_avg_se(u64 now, struct cfs_rq *cfs_rq, struct sched_entity *se)
+int __update_load_avg_se(u64 now, struct cfs_rq *cfs_rq, struct sched_entity *se, bool load, bool running)
{
- if (___update_load_sum(now, &se->avg, !!se->on_rq,
- cfs_rq->curr == se)) {
+ if (___update_load_sum(now, &se->avg, (!!se->on_rq || load),
+ (cfs_rq->curr == se) || running)) {
___update_load_avg(&se->avg, se_weight(se));
cfs_se_util_change(&se->avg);
diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h
index 7489d5f56960..1152c4ebf314 100644
--- a/kernel/sched/pelt.h
+++ b/kernel/sched/pelt.h
@@ -2,7 +2,7 @@
#include "sched-pelt.h"
int __update_load_avg_blocked_se(u64 now, struct sched_entity *se);
-int __update_load_avg_se(u64 now, struct cfs_rq *cfs_rq, struct sched_entity *se);
+int __update_load_avg_se(u64 now, struct cfs_rq *cfs_rq, struct sched_entity *se, bool load, bool running);
int __update_load_avg_cfs_rq(u64 now, struct cfs_rq *cfs_rq);
int update_rt_rq_load_avg(u64 now, struct rq *rq, int running);
int update_dl_rq_load_avg(u64 now, struct rq *rq, int running);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 32978a8de8ce..b05fd87cf8b5 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1443,7 +1443,7 @@ static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
#ifdef CONFIG_FAIR_GROUP_SCHED
set_task_rq_fair(&p->se, p->se.cfs_rq, tg->cfs_rq[cpu]);
- p->se.cfs_rq = tg->cfs_rq[cpu];
+ p->se.cfs_rq = &cpu_rq(cpu)->cfs;
p->se.parent = tg->se[cpu];
#endif
--
2.20.1
Powered by blists - more mailing lists