linux-kernel - Re: [PATCH v2 for-4.12-fixes 2/2] sched/fair: Fix O(# total cgroups) in load balance path

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20170510155524.GD32165@htj.duckdns.org>
Date:   Wed, 10 May 2017 11:55:24 -0400
From:   Tejun Heo <tj@...nel.org>
To:     ying.huang@...ux.intel.com,
        Vincent Guittot <vincent.guittot@...aro.org>
Cc:     Ingo Molnar <mingo@...hat.com>,
        Peter Zijlstra <peterz@...radead.org>,
        linux-kernel <linux-kernel@...r.kernel.org>,
        Linus Torvalds <torvalds@...ux-foundation.org>,
        Mike Galbraith <efault@....de>, Paul Turner <pjt@...gle.com>,
        Chris Mason <clm@...com>, kernel-team@...com
Subject: Re: [PATCH v2 for-4.12-fixes 2/2] sched/fair: Fix O(# total cgroups)
 in load balance path

On Wed, May 10, 2017 at 10:44:14AM -0400, Tejun Heo wrote:
> Hello,
> 
> On Wed, May 10, 2017 at 08:50:14AM +0200, Vincent Guittot wrote:
> > On 9 May 2017 at 18:18, Tejun Heo <tj@...nel.org> wrote:
> > > Currently, rq->leaf_cfs_rq_list is a traversal ordered list of all
> > > live cfs_rqs which have ever been active on the CPU; unfortunately,
> > > this makes update_blocked_averages() O(# total cgroups) which isn't
> > > scalable at all.
> > 
> > Dietmar raised similar optimization in the past. The only question was
> > : what is the impact of  re-adding the cfs_rq in leaf_cfs_rq_list on
> > the wake up path ? Have you done some measurements ?
> 
> Didn't do a perf test yet but it's several more branches and a local
> list operation on enqueue, which is already pretty expensive vs. load
> balance being O(total number of cgroups on the system).
> 
> Anyways, I'll do some hackbench tests with several levels of layering.

Oh, BTW, do we still need bc4278987e38 ("sched/fair: Fix FTQ noise
bench regression") with this patch?  That patch looks like it's just
cutting down on the body of the O(#cgroups) loop.

Ying, can you verify whether the following patch on top of the current
linus#master 56868a460b83 also fixes the regression that you saw?

Index: work/kernel/sched/fair.c
===================================================================
--- work.orig/kernel/sched/fair.c
+++ work/kernel/sched/fair.c
@@ -372,6 +372,10 @@ static inline void list_del_leaf_cfs_rq(
 #define for_each_leaf_cfs_rq(rq, cfs_rq) \
 	list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
 
+#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) \
+	list_for_each_entry_safe(cfs_rq, pos, &rq->leaf_cfs_rq_list, \
+				 leaf_cfs_rq_list)
+
 /* Do the two (enqueued) entities belong to the same group ? */
 static inline struct cfs_rq *
 is_same_group(struct sched_entity *se, struct sched_entity *pse)
@@ -466,6 +470,9 @@ static inline void list_del_leaf_cfs_rq(
 #define for_each_leaf_cfs_rq(rq, cfs_rq) \
 		for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
 
+#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) \
+		for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
+
 static inline struct sched_entity *parent_entity(struct sched_entity *se)
 {
 	return NULL;
@@ -3170,36 +3177,6 @@ static inline int propagate_entity_load_
 	return 1;
 }
 
-/*
- * Check if we need to update the load and the utilization of a blocked
- * group_entity:
- */
-static inline bool skip_blocked_update(struct sched_entity *se)
-{
-	struct cfs_rq *gcfs_rq = group_cfs_rq(se);
-
-	/*
-	 * If sched_entity still have not zero load or utilization, we have to
-	 * decay it:
-	 */
-	if (se->avg.load_avg || se->avg.util_avg)
-		return false;
-
-	/*
-	 * If there is a pending propagation, we have to update the load and
-	 * the utilization of the sched_entity:
-	 */
-	if (gcfs_rq->propagate_avg)
-		return false;
-
-	/*
-	 * Otherwise, the load and the utilization of the sched_entity is
-	 * already zero and there is no pending propagation, so it will be a
-	 * waste of time to try to decay it:
-	 */
-	return true;
-}
-
 #else /* CONFIG_FAIR_GROUP_SCHED */
 
 static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {}
@@ -4644,22 +4621,32 @@ static void destroy_cfs_bandwidth(struct
 
 static void __maybe_unused update_runtime_enabled(struct rq *rq)
 {
-	struct cfs_rq *cfs_rq;
+	struct task_group *tg;
+
+	lockdep_assert_held(&rq->lock);
 
-	for_each_leaf_cfs_rq(rq, cfs_rq) {
-		struct cfs_bandwidth *cfs_b = &cfs_rq->tg->cfs_bandwidth;
+	rcu_read_lock();
+	list_for_each_entry_rcu(tg, &task_groups, list) {
+		struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
+		struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
 
 		raw_spin_lock(&cfs_b->lock);
 		cfs_rq->runtime_enabled = cfs_b->quota != RUNTIME_INF;
 		raw_spin_unlock(&cfs_b->lock);
 	}
+	rcu_read_unlock();
 }
 
 static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
 {
-	struct cfs_rq *cfs_rq;
+	struct task_group *tg;
+
+	lockdep_assert_held(&rq->lock);
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(tg, &task_groups, list) {
+		struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
 
-	for_each_leaf_cfs_rq(rq, cfs_rq) {
 		if (!cfs_rq->runtime_enabled)
 			continue;
 
@@ -4677,6 +4664,7 @@ static void __maybe_unused unthrottle_of
 		if (cfs_rq_throttled(cfs_rq))
 			unthrottle_cfs_rq(cfs_rq);
 	}
+	rcu_read_unlock();
 }
 
 #else /* CONFIG_CFS_BANDWIDTH */
@@ -6973,7 +6961,7 @@ static void attach_tasks(struct lb_env *
 static void update_blocked_averages(int cpu)
 {
 	struct rq *rq = cpu_rq(cpu);
-	struct cfs_rq *cfs_rq;
+	struct cfs_rq *cfs_rq, *pos;
 	struct rq_flags rf;
 
 	rq_lock_irqsave(rq, &rf);
@@ -6983,9 +6971,7 @@ static void update_blocked_averages(int
 	 * Iterates the task_group tree in a bottom up fashion, see
 	 * list_add_leaf_cfs_rq() for details.
 	 */
-	for_each_leaf_cfs_rq(rq, cfs_rq) {
-		struct sched_entity *se;
-
+	for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) {
 		/* throttled entities do not contribute to load */
 		if (throttled_hierarchy(cfs_rq))
 			continue;
@@ -6993,10 +6979,17 @@ static void update_blocked_averages(int
 		if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, true))
 			update_tg_load_avg(cfs_rq, 0);
 
-		/* Propagate pending load changes to the parent, if any: */
-		se = cfs_rq->tg->se[cpu];
-		if (se && !skip_blocked_update(se))
-			update_load_avg(se, 0);
+		/* Propagate pending load changes to the parent */
+		if (cfs_rq->tg->se[cpu])
+			update_load_avg(cfs_rq->tg->se[cpu], 0);
+
+		/*
+		 * There can be a lot of idle CPU cgroups.  Don't let fully
+		 * decayed cfs_rqs linger on the list.
+		 */
+		if (!cfs_rq->load.weight && !cfs_rq->avg.load_sum &&
+		    !cfs_rq->avg.util_sum && !cfs_rq->runnable_load_sum)
+			list_del_leaf_cfs_rq(cfs_rq);
 	}
 	rq_unlock_irqrestore(rq, &rf);
 }