linux-kernel - [RFC PATCH 13/16] sched: Fix pick_next_task() race condition in core scheduling

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <907055bc2fa4b21e2b6675268903ca2c89285e34.1593530334.git.vpillai@digitalocean.com>
Date:   Tue, 30 Jun 2020 21:32:34 +0000
From:   Vineeth Remanan Pillai <vpillai@...italocean.com>
To:     Nishanth Aravamudan <naravamudan@...italocean.com>,
        Julien Desfossez <jdesfossez@...italocean.com>,
        Peter Zijlstra <peterz@...radead.org>,
        Tim Chen <tim.c.chen@...ux.intel.com>, mingo@...nel.org,
        tglx@...utronix.de, pjt@...gle.com, torvalds@...ux-foundation.org
Cc:     Chen Yu <yu.c.chen@...el.com>, linux-kernel@...r.kernel.org,
        subhra.mazumdar@...cle.com, fweisbec@...il.com,
        keescook@...omium.org, kerrnel@...gle.com,
        Phil Auld <pauld@...hat.com>, Aaron Lu <aaron.lwe@...il.com>,
        Aubrey Li <aubrey.intel@...il.com>,
        Valentin Schneider <valentin.schneider@....com>,
        Mel Gorman <mgorman@...hsingularity.net>,
        Pawan Gupta <pawan.kumar.gupta@...ux.intel.com>,
        Paolo Bonzini <pbonzini@...hat.com>,
        Joel Fernandes <joelaf@...gle.com>, joel@...lfernandes.org,
        vineethrp@...il.com,
        Christian Brauner <christian.brauner@...ntu.com>,
        Vineeth Remanan Pillai <vpillai@...italocean.com>
Subject: [RFC PATCH 13/16] sched: Fix pick_next_task() race condition in core scheduling

From: Chen Yu <yu.c.chen@...el.com>

As Peter mentioned that Commit 6e2df0581f56 ("sched: Fix pick_next_task()
vs 'change' pattern race") has fixed a race condition due to rq->lock
improperly released after put_prev_task(), backport this fix to core
scheduling's pick_next_task() as well.

Without this fix, Aubrey, Long and I found an NULL exception point
triggered within one hour when running RDT MBA(Intel Resource Directory
Technolodge Memory Bandwidth Allocation) benchmarks on a 36 Core(72 HTs)
platform, which tries to dereference a NULL sched_entity:

[ 3618.429053] BUG: kernel NULL pointer dereference, address: 0000000000000160
[ 3618.429039] RIP: 0010:pick_task_fair+0x2e/0xa0
[ 3618.429042] RSP: 0018:ffffc90000317da8 EFLAGS: 00010046
[ 3618.429044] RAX: 0000000000000000 RBX: ffff88afdf4ad100 RCX: 0000000000000001
[ 3618.429045] RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffff88afdf4ad100
[ 3618.429045] RBP: ffffc90000317dc0 R08: 0000000000000048 R09: 0100000000100000
[ 3618.429046] R10: 0000000000000001 R11: 0000000000000000 R12: 0000000000000000
[ 3618.429047] R13: 000000000002d080 R14: ffff88afdf4ad080 R15: 0000000000000014
[ 3618.429048]  ? pick_task_fair+0x48/0xa0
[ 3618.429048]  pick_next_task+0x34c/0x7e0
[ 3618.429049]  ? tick_program_event+0x44/0x70
[ 3618.429049]  __schedule+0xee/0x5d0
[ 3618.429050]  schedule_idle+0x2c/0x40
[ 3618.429051]  do_idle+0x175/0x280
[ 3618.429051]  cpu_startup_entry+0x1d/0x30
[ 3618.429052]  start_secondary+0x169/0x1c0
[ 3618.429052]  secondary_startup_64+0xa4/0xb0

While with this patch applied, no NULL pointer exception was found within
14 hours for now. Although there's no direct evidence this fix would solve
the issue, it does fix a potential race condition.

Signed-off-by: Chen Yu <yu.c.chen@...el.com>
Signed-off-by: Vineeth Remanan Pillai <vpillai@...italocean.com>
---
 kernel/sched/core.c  | 44 +++++++++++++++++++++++++-------------------
 kernel/sched/fair.c  |  9 ++++++---
 kernel/sched/sched.h |  7 -------
 3 files changed, 31 insertions(+), 29 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index c84f209b8591..ede86fb37b4e 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4169,6 +4169,29 @@ static inline void schedule_debug(struct task_struct *prev, bool preempt)
 	schedstat_inc(this_rq()->sched_count);
 }
 
+static inline void
+finish_prev_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
+{
+#ifdef CONFIG_SMP
+	const struct sched_class *class;
+
+	/*
+	 * We must do the balancing pass before put_next_task(), such
+	 * that when we release the rq->lock the task is in the same
+	 * state as before we took rq->lock.
+	 *
+	 * We can terminate the balance pass as soon as we know there is
+	 * a runnable task of @class priority or higher.
+	 */
+	for_class_range(class, prev->sched_class, &idle_sched_class) {
+		if (class->balance(rq, prev, rf))
+			break;
+	}
+#endif
+
+	put_prev_task(rq, prev);
+}
+
 /*
  * Pick up the highest-prio task:
  */
@@ -4202,22 +4225,7 @@ __pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
 	}
 
 restart:
-#ifdef CONFIG_SMP
-	/*
-	 * We must do the balancing pass before put_next_task(), such
-	 * that when we release the rq->lock the task is in the same
-	 * state as before we took rq->lock.
-	 *
-	 * We can terminate the balance pass as soon as we know there is
-	 * a runnable task of @class priority or higher.
-	 */
-	for_class_range(class, prev->sched_class, &idle_sched_class) {
-		if (class->balance(rq, prev, rf))
-			break;
-	}
-#endif
-
-	put_prev_task(rq, prev);
+	finish_prev_task(rq, prev, rf);
 
 	for_each_class(class) {
 		p = class->pick_next_task(rq);
@@ -4323,9 +4331,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
 		return next;
 	}
 
-	prev->sched_class->put_prev_task(rq, prev);
-	if (!rq->nr_running)
-		newidle_balance(rq, rf);
+	finish_prev_task(rq, prev, rf);
 
 	cpu = cpu_of(rq);
 	smt_mask = cpu_smt_mask(cpu);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 33dc4bf01817..435b460d3c3f 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -115,6 +115,7 @@ int __weak arch_asym_cpu_priority(int cpu)
  */
 #define fits_capacity(cap, max)	((cap) * 1280 < (max) * 1024)
 
+static int newidle_balance(struct rq *this_rq, struct rq_flags *rf);
 #endif
 
 #ifdef CONFIG_CFS_BANDWIDTH
@@ -7116,9 +7117,11 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf
 	struct cfs_rq *cfs_rq = &rq->cfs;
 	struct sched_entity *se;
 	struct task_struct *p;
+#ifdef CONFIG_SMP
 	int new_tasks;
 
 again:
+#endif
 	if (!sched_fair_runnable(rq))
 		goto idle;
 
@@ -7232,6 +7235,7 @@ done: __maybe_unused;
 	if (!rf)
 		return NULL;
 
+#ifdef CONFIG_SMP
 	new_tasks = newidle_balance(rq, rf);
 
 	/*
@@ -7244,6 +7248,7 @@ done: __maybe_unused;
 
 	if (new_tasks > 0)
 		goto again;
+#endif
 
 	/*
 	 * rq is about to be idle, check if we need to update the
@@ -8750,10 +8755,8 @@ static int idle_cpu_without(int cpu, struct task_struct *p)
 	 * be computed and tested before calling idle_cpu_without().
 	 */
 
-#ifdef CONFIG_SMP
 	if (!llist_empty(&rq->wake_list))
 		return 0;
-#endif
 
 	return 1;
 }
@@ -10636,7 +10639,7 @@ static inline void nohz_newidle_balance(struct rq *this_rq) { }
  *     0 - failed, no new tasks
  *   > 0 - success, new (fair) tasks present
  */
-int newidle_balance(struct rq *this_rq, struct rq_flags *rf)
+static int newidle_balance(struct rq *this_rq, struct rq_flags *rf)
 {
 	unsigned long next_balance = jiffies + HZ;
 	int this_cpu = this_rq->cpu;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index c4b4640fcdc8..a5450886c4e4 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1640,15 +1640,8 @@ static inline void unregister_sched_domain_sysctl(void)
 {
 }
 #endif
-
-extern int newidle_balance(struct rq *this_rq, struct rq_flags *rf);
-
 #else
-
 static inline void sched_ttwu_pending(void) { }
-
-static inline int newidle_balance(struct rq *this_rq, struct rq_flags *rf) { return 0; }
-
 #endif /* CONFIG_SMP */
 
 #include "stats.h"
-- 
2.17.1