Message-ID: <1333792489.12677.58.camel@marge.simpson.net>
Date:	Sat, 07 Apr 2012 11:54:49 +0200
From:	Mike Galbraith <efault@....de>
To:	LKML <linux-kernel@...r.kernel.org>
Cc:	Peter Zijlstra <a.p.zijlstra@...llo.nl>,
	Ingo Molnar <mingo@...hat.com>
Subject: RFC [patch] sched,cgroup_sched: convince RT_GROUP_SCHED throttle to
 work

Greetings,

I'm having trouble with the RT_GROUP_SCHED throttle kicking in and
staying engaged (timer troubles).  Either groups execute one after the
other (frob the timer above you), or bandwidth is wrong, or the thing
that got me squabbling with this in the first place happens: one group
or the other gets stuck, even with only two groups, with the root task
group throttled, and the victim stays marooned until I kill the cgroup
setup.  If (say) grp1 starts first, grp2 is screwed, or the other way
around.

With this patch, the thing appears to work perfectly, but it doesn't
look correct, since I'm futzing with ->rt_time where I should not.

Not-so-pretty ascii art:

/----------/system cpu 0-2, rt 300000-----/foo cpu 2, rt 100000
    \
     \
      \----/rtcpus cpu 3, rt 300000---\---/bar cpu 3, rt 100000
                                       \
                                        \-/baz cpu 3, rt 100000

It only takes two groups to reproduce, grp1 containing most of the
system, the other rt only.  With the patch, the above setup (the last
one I prodded the box with) works, and bandwidth looked fine whether I
twiddled the budgets or not.
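
For reference, a minimal sketch of how a hierarchy like the above can
be created (assuming cgroup v1 with the cpuset and cpu controllers
co-mounted at /cgroup, and a single memory node; paths, mems value and
budget numbers here are illustrative only, not necessarily the exact
setup I ran):

/*
 * Sketch: build the hierarchy from the picture above.  Parents are
 * created before children so each child's cpuset.cpus stays a subset
 * of its parent's, and child rt_runtime_us budgets fit inside the
 * parent's budget.
 */
#include <stdio.h>
#include <stdlib.h>
#include <sys/stat.h>

static void put(const char *path, const char *val)
{
	FILE *f = fopen(path, "w");

	if (!f) {
		perror(path);
		exit(1);
	}
	fprintf(f, "%s\n", val);
	fclose(f);
}

static void grp(const char *dir, const char *cpus, const char *rt_us)
{
	char path[256];

	mkdir(dir, 0755);
	snprintf(path, sizeof(path), "%s/cpuset.cpus", dir);
	put(path, cpus);
	snprintf(path, sizeof(path), "%s/cpuset.mems", dir);
	put(path, "0");	/* assumes a single memory node */
	snprintf(path, sizeof(path), "%s/cpu.rt_runtime_us", dir);
	put(path, rt_us);
}

int main(void)
{
	grp("/cgroup/system", "0-2", "300000");
	grp("/cgroup/system/foo", "2", "100000");
	grp("/cgroup/rtcpus", "3", "300000");
	grp("/cgroup/rtcpus/bar", "3", "100000");
	grp("/cgroup/rtcpus/baz", "3", "100000");
	return 0;
}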

I just happened to notice the throttle wasn't doing its thing right
after discovering that isolcpus is busted with RT_GROUP_SCHED.  Thought
I should probably beat on it a little.  The darn thing beat me back :)

---
 kernel/sched/rt.c |   76 +++++++++++++++++++++++++++++++-----------------------
 1 file changed, 45 insertions(+), 31 deletions(-)

--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -776,55 +776,69 @@ static inline int balance_runtime(struct
 }
 #endif /* CONFIG_SMP */
 
+#ifdef CONFIG_RT_GROUP_SCHED
+#define for_each_rt_rq_up_from(rt_rq, iter, rq)			\
+	for (iter = rt_rq->tg; iter; iter = iter->parent,	\
+		rt_rq = iter ? iter->rt_rq[cpu_of(rq)] : NULL)
+#else
+#define for_each_rt_rq_up_from(rt_rq, iter, rq)			\
+	for (iter = rt_rq; iter; iter = NULL)
+#endif
+
 static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
 {
 	int i, idle = 1, throttled = 0;
 	const struct cpumask *span;
+	rt_rq_iter_t iter;
 
 	span = sched_rt_period_mask();
 	for_each_cpu(i, span) {
-		int enqueue = 0;
+		int enqueue = 0, depth = 0;
 		struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i);
-		struct rq *rq = rq_of_rt_rq(rt_rq);
+		struct rq *rq = cpu_rq(i);
 
 		raw_spin_lock(&rq->lock);
-		if (rt_rq->rt_time) {
-			u64 runtime;
-
-			raw_spin_lock(&rt_rq->rt_runtime_lock);
-			if (rt_rq->rt_throttled)
-				balance_runtime(rt_rq);
-			runtime = rt_rq->rt_runtime;
-			rt_rq->rt_time -= min(rt_rq->rt_time, overrun*runtime);
-			if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) {
-				rt_rq->rt_throttled = 0;
+		for_each_rt_rq_up_from(rt_rq, iter, rq) {
+			if (rt_rq->rt_time) {
+				u64 runtime;
+
+				raw_spin_lock(&rt_rq->rt_runtime_lock);
+				if (rt_rq->rt_throttled)
+					balance_runtime(rt_rq);
+				runtime = rt_rq->rt_runtime;
+				rt_rq->rt_time -= min(rt_rq->rt_time, overrun*runtime);
+				if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) {
+					rt_rq->rt_throttled = 0;
+					enqueue = 1;
+
+					/*
+					 * Force a clock update if the CPU was idle,
+					 * lest wakeup -> unthrottle time accumulate.
+					 */
+					if (rt_rq->rt_nr_running && rq->curr == rq->idle)
+						rq->skip_clock_update = -1;
+				}
+				raw_spin_unlock(&rt_rq->rt_runtime_lock);
+			} else if (!rt_rq_throttled(rt_rq))
 				enqueue = 1;
 
-				/*
-				 * Force a clock update if the CPU was idle,
-				 * lest wakeup -> unthrottle time accumulate.
-				 */
-				if (rt_rq->rt_nr_running && rq->curr == rq->idle)
-					rq->skip_clock_update = -1;
+			if (enqueue)
+				sched_rt_rq_enqueue(rt_rq);
+
+			if (!depth++) {
+				if (rt_rq->rt_throttled) {
+					throttled = 1;
+					idle = 0;
+				} else if (rt_rq->rt_time || rt_rq->rt_nr_running)
+					idle = 0;
 			}
-			if (rt_rq->rt_time || rt_rq->rt_nr_running)
-				idle = 0;
-			raw_spin_unlock(&rt_rq->rt_runtime_lock);
-		} else if (rt_rq->rt_nr_running) {
-			idle = 0;
-			if (!rt_rq_throttled(rt_rq))
-				enqueue = 1;
-		}
-		if (rt_rq->rt_throttled)
-			throttled = 1;
 
-		if (enqueue)
-			sched_rt_rq_enqueue(rt_rq);
+		}
 		raw_spin_unlock(&rq->lock);
 	}
 
 	if (!throttled && (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF))
-		return 1;
+		idle = 1;
 
 	return idle;
 }


