linux-kernel - [PATCH v2 7/7] sched/core: Add debug code to catch missing update_rq

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <20160921133813.31976-8-matt@codeblueprint.co.uk>
Date:   Wed, 21 Sep 2016 14:38:13 +0100
From:   Matt Fleming <matt@...eblueprint.co.uk>
To:     Peter Zijlstra <peterz@...radead.org>,
        Ingo Molnar <mingo@...nel.org>
Cc:     Byungchul Park <byungchul.park@....com>,
        Frederic Weisbecker <fweisbec@...il.com>,
        Luca Abeni <luca.abeni@...tn.it>,
        Rik van Riel <riel@...hat.com>,
        Thomas Gleixner <tglx@...utronix.de>,
        Wanpeng Li <wanpeng.li@...mail.com>,
        Yuyang Du <yuyang.du@...el.com>,
        Petr Mladek <pmladek@...e.com>, Jan Kara <jack@...e.cz>,
        Sergey Senozhatsky <sergey.senozhatsky.work@...il.com>,
        linux-kernel@...r.kernel.org,
        Mel Gorman <mgorman@...hsingularity.net>,
        Mike Galbraith <umgwanakikbuti@...il.com>,
        Matt Fleming <matt@...eblueprint.co.uk>
Subject: [PATCH v2 7/7] sched/core: Add debug code to catch missing update_rq_clock()

There's no diagnostic checks for figuring out when we've accidentally
missed update_rq_clock() calls. Let's add some by piggybacking on the
rq_*pin_lock() wrappers.

The idea behind the diagnostic checks is that upon pining rq lock the
rq clock should be updated, via update_rq_clock(), before anybody
reads the clock with rq_clock() or rq_clock_task().

The exception to this rule is when updates have explicitly been
disabled with the rq_clock_skip_update() optimisation.

There are some functions that only unpin the rq lock in order to grab
some other lock and avoid deadlock. In that case we don't need to
update the clock again and the previous diagnostic state can be
carried over in rq_repin_lock() by saving the state in the rq_flags
context.

Since this patch adds a new clock update flag and some already exist
in rq::clock_skip_update, that field has now been renamed. An attempt
has been made to keep the flag manipulation code small and fast since
it's used in the heart of the __schedule() fast path.

For the !CONFIG_SCHED_DEBUG case the only object code change (other
than addresses) is the following change to reset RQCF_ACT_SKIP inside
of __schedule(),

  -       c7 83 38 09 00 00 00    movl   $0x0,0x938(%rbx)
  -       00 00 00
  +       83 a3 38 09 00 00 fc    andl   $0xfffffffc,0x938(%rbx)

Suggested-by: Peter Zijlstra <peterz@...radead.org>
Cc: Ingo Molnar <mingo@...nel.org>
Cc: Mike Galbraith <umgwanakikbuti@...il.com>
Cc: Mel Gorman <mgorman@...hsingularity.net>
Signed-off-by: Matt Fleming <matt@...eblueprint.co.uk>
---
 kernel/sched/core.c  | 11 +++++---
 kernel/sched/sched.h | 76 +++++++++++++++++++++++++++++++++++++++++++++++-----
 2 files changed, 77 insertions(+), 10 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 1254629c9f2f..95f804b3a413 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -102,9 +102,12 @@ void update_rq_clock(struct rq *rq)
 
 	lockdep_assert_held(&rq->lock);
 
-	if (rq->clock_skip_update & RQCF_ACT_SKIP)
+	if (rq->clock_update_flags & RQCF_ACT_SKIP)
 		return;
 
+#ifdef CONFIG_SCHED_DEBUG
+	rq->clock_update_flags |= RQCF_UPDATED;
+#endif
 	delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
 	if (delta < 0)
 		return;
@@ -2872,7 +2875,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
 		rq->prev_mm = oldmm;
 	}
 
-	rq->clock_skip_update = 0;
+	rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
 
 	/*
 	 * Since the runqueue lock will be released by the next
@@ -3358,7 +3361,7 @@ static void __sched notrace __schedule(bool preempt)
 	raw_spin_lock(&rq->lock);
 	rq_pin_lock(rq, &rf);
 
-	rq->clock_skip_update <<= 1; /* promote REQ to ACT */
+	rq->clock_update_flags <<= 1; /* promote REQ to ACT */
 
 	switch_count = &prev->nivcsw;
 	if (!preempt && prev->state) {
@@ -3399,7 +3402,7 @@ static void __sched notrace __schedule(bool preempt)
 		trace_sched_switch(preempt, prev, next);
 		rq = context_switch(rq, prev, next, &rf); /* unlocks the rq */
 	} else {
-		rq->clock_skip_update = 0;
+		rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
 		rq_unpin_lock(rq, &rf);
 		raw_spin_unlock_irq(&rq->lock);
 	}
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index bf48e7975c23..91f4b3d58d56 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -630,7 +630,7 @@ struct rq {
 	unsigned long next_balance;
 	struct mm_struct *prev_mm;
 
-	unsigned int clock_skip_update;
+	unsigned int clock_update_flags;
 	u64 clock;
 	u64 clock_task;
 
@@ -737,48 +737,112 @@ static inline u64 __rq_clock_broken(struct rq *rq)
 	return READ_ONCE(rq->clock);
 }
 
+/*
+ * rq::clock_update_flags bits
+ *
+ * %RQCF_REQ_SKIP - will request skipping of clock update on the next
+ *  call to __schedule(). This is an optimisation to avoid
+ *  neighbouring rq clock updates.
+ *
+ * %RQCF_ACT_SKIP - is set from inside of __schedule() when skipping is
+ *  in effect and calls to update_rq_clock() are being ignored.
+ *
+ * %RQCF_UPDATED - is a debug flag that indicates whether a call has been
+ *  made to update_rq_clock() since the last time rq::lock was pinned.
+ *
+ * If inside of __schedule(), clock_update_flags will have been
+ * shifted left (a left shift is a cheap operation for the fast path
+ * to promote %RQCF_REQ_SKIP to %RQCF_ACT_SKIP), so you must use,
+ *
+ *	if (rq-clock_update_flags >= RQCF_UPDATED)
+ *
+ * to check if %RQCF_UPADTED is set. It'll never be shifted more than
+ * one position though, because the next rq_unpin_lock() will shift it
+ * back.
+ */
+#define RQCF_REQ_SKIP	0x01
+#define RQCF_ACT_SKIP	0x02
+#define RQCF_UPDATED	0x04
+
+static inline void assert_clock_updated(struct rq *rq)
+{
+#ifdef CONFIG_SCHED_DEBUG
+	/*
+	 * The only reason for not seeing a clock update since the
+	 * last rq_pin_lock() is if we're currently skipping updates.
+	 */
+	WARN_ON_ONCE(rq->clock_update_flags < RQCF_ACT_SKIP);
+#endif
+}
+
 static inline u64 rq_clock(struct rq *rq)
 {
 	lockdep_assert_held(&rq->lock);
+	assert_clock_updated(rq);
+
 	return rq->clock;
 }
 
 static inline u64 rq_clock_task(struct rq *rq)
 {
 	lockdep_assert_held(&rq->lock);
+	assert_clock_updated(rq);
+
 	return rq->clock_task;
 }
 
-#define RQCF_REQ_SKIP	0x01
-#define RQCF_ACT_SKIP	0x02
-
 static inline void rq_clock_skip_update(struct rq *rq, bool skip)
 {
 	lockdep_assert_held(&rq->lock);
 	if (skip)
-		rq->clock_skip_update |= RQCF_REQ_SKIP;
+		rq->clock_update_flags |= RQCF_REQ_SKIP;
 	else
-		rq->clock_skip_update &= ~RQCF_REQ_SKIP;
+		rq->clock_update_flags &= ~RQCF_REQ_SKIP;
 }
 
 struct rq_flags {
 	unsigned long flags;
 	struct pin_cookie cookie;
+#ifdef CONFIG_SCHED_DEBUG
+	/*
+	 * A copy of (rq::clock_update_flags & RQCF_UPDATED) for the
+	 * current pin context is stashed here in case it needs to be
+	 * restored in rq_repin_lock().
+	 */
+	unsigned int clock_update_flags;
+#endif
 };
 
 static inline void rq_pin_lock(struct rq *rq, struct rq_flags *rf)
 {
 	rf->cookie = lockdep_pin_lock(&rq->lock);
+
+#ifdef CONFIG_SCHED_DEBUG
+	rq->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP);
+	rf->clock_update_flags = 0;
+#endif
 }
 
 static inline void rq_unpin_lock(struct rq *rq, struct rq_flags *rf)
 {
+#ifdef CONFIG_SCHED_DEBUG
+	if (rq->clock_update_flags > RQCF_ACT_SKIP)
+		rf->clock_update_flags = RQCF_UPDATED;
+#endif
+
 	lockdep_unpin_lock(&rq->lock, rf->cookie);
 }
 
 static inline void rq_repin_lock(struct rq *rq, struct rq_flags *rf)
 {
 	lockdep_repin_lock(&rq->lock, rf->cookie);
+
+#ifdef CONFIG_SCHED_DEBUG
+	/*
+	 * Restore the value we stashed in @rf for this pin context.
+	 */
+	rq->clock_update_flags |= rf->clock_update_flags;
+#endif
 }
 
 #ifdef CONFIG_NUMA
-- 
2.9.3