Message-Id: <20230427111937.2745231-3-bigeasy@linutronix.de>
Date:   Thu, 27 Apr 2023 13:19:35 +0200
From:   Sebastian Andrzej Siewior <bigeasy@...utronix.de>
To:     linux-kernel@...r.kernel.org
Cc:     Ben Segall <bsegall@...gle.com>, Boqun Feng <boqun.feng@...il.com>,
        Crystal Wood <swood@...hat.com>,
        Daniel Bristot de Oliveira <bristot@...hat.com>,
        Dietmar Eggemann <dietmar.eggemann@....com>,
        Ingo Molnar <mingo@...hat.com>,
        John Stultz <jstultz@...gle.com>,
        Juri Lelli <juri.lelli@...hat.com>,
        Mel Gorman <mgorman@...e.de>,
        Peter Zijlstra <peterz@...radead.org>,
        Steven Rostedt <rostedt@...dmis.org>,
        Thomas Gleixner <tglx@...utronix.de>,
        Valentin Schneider <vschneid@...hat.com>,
        Vincent Guittot <vincent.guittot@...aro.org>,
        Waiman Long <longman@...hat.com>,
        Will Deacon <will@...nel.org>,
        Sebastian Andrzej Siewior <bigeasy@...utronix.de>
Subject: [PATCH v2 2/4] locking/rtmutex: Submit/resume work explicitly before/after blocking

schedule() invokes sched_submit_work() before scheduling and
sched_resume_work() afterwards to ensure that queued block requests are
flushed and that the (IO) worker machinery can instantiate new workers if
required. This avoids deadlocks and starvation.
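
Conceptually, the ordering schedule() provides looks like this (simplified
sketch only; the need_resched() loop and preemption handling are left out,
and __schedule() stands in for the actual block/context switch):

  void schedule(void)
  {
          sched_submit_work();    /* flush block requests, let (IO) workers be created */
          __schedule(SM_NONE);    /* actually block / pick the next task */
          sched_resume_work();    /* worker bookkeeping once the task runs again */
  }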

With rt_mutexes this can lead to a subtle problem:

  When a task blocks on a rtmutex, current::pi_blocked_on points to the
  rtmutex it blocks on. If one of the functions invoked from
  sched_submit/resume_work() then contends on a rtmutex based lock, that
  would corrupt current::pi_blocked_on.
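
A hypothetical call chain illustrating the corruption (simplified; lock B
and the path into it stand for whatever sched_submit_work() might contend
on):

  rt_mutex_slowlock(A)
    /* waiter enqueued, current->pi_blocked_on = &waiter_on_A */
    schedule()
      sched_submit_work()
        /* e.g. flushing plugged block requests contends on rtmutex based lock B */
        rt_mutex_slowlock(B)
          /* current->pi_blocked_on = &waiter_on_B, waiter_on_A is lost */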

Let rtmutex and the RT lock variants based on it invoke
sched_submit/resume_work() explicitly before and after the slowpath, so it
is guaranteed that current::pi_blocked_on cannot be corrupted by blocking
on two locks at once.
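
The resulting ordering on the lock side then looks roughly like this
(sketch only; the hunks below contain the real code):

  sched_submit_work();    /* may block on other locks, pi_blocked_on is NULL */
  /* take wait_lock, enqueue waiter, set current->pi_blocked_on ... */
  schedule_rtmutex();     /* block without running the work functions again */
  /* ... dequeue waiter, clear current->pi_blocked_on */
  sched_resume_work();    /* safe again to block on other locks */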

This does not apply to the PREEMPT_RT variants of spinlock_t and rwlock_t
as their scheduling slowpath is separate and cannot invoke the work-related
functions anyway due to potential deadlocks.

[ tglx: Make it explicit and symmetric. Massage changelog ]

Fixes: e17ba59b7e8e1 ("locking/rtmutex: Guard regular sleeping locks specific functions")
Reported-by: Crystal Wood <swood@...hat.com>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@...utronix.de>
Signed-off-by: Thomas Gleixner <tglx@...utronix.de>
Link: https://lore.kernel.org/4b4ab374d3e24e6ea8df5cadc4297619a6d945af.camel@redhat.com
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@...utronix.de>
---
 kernel/locking/rtmutex.c     | 11 +++++++++--
 kernel/locking/rwbase_rt.c   | 18 ++++++++++++++++--
 kernel/locking/rwsem.c       |  6 ++++++
 kernel/locking/spinlock_rt.c |  3 +++
 4 files changed, 34 insertions(+), 4 deletions(-)

diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index 728f434de2bbf..aa66a3c5950a7 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -1555,7 +1555,7 @@ static int __sched rt_mutex_slowlock_block(struct rt_mutex_base *lock,
 		raw_spin_unlock_irq(&lock->wait_lock);
 
 		if (!owner || !rtmutex_spin_on_owner(lock, waiter, owner))
-			schedule();
+			schedule_rtmutex();
 
 		raw_spin_lock_irq(&lock->wait_lock);
 		set_current_state(state);
@@ -1584,7 +1584,7 @@ static void __sched rt_mutex_handle_deadlock(int res, int detect_deadlock,
 	WARN(1, "rtmutex deadlock detected\n");
 	while (1) {
 		set_current_state(TASK_INTERRUPTIBLE);
-		schedule();
+		schedule_rtmutex();
 	}
 }
 
@@ -1679,6 +1679,12 @@ static int __sched rt_mutex_slowlock(struct rt_mutex_base *lock,
 	unsigned long flags;
 	int ret;
 
+	/*
+	 * The task is about to sleep. Invoke sched_submit_work() before
+	 * blocking as that might take locks and corrupt tsk::pi_blocked_on.
+	 */
+	sched_submit_work();
+
 	/*
 	 * Technically we could use raw_spin_[un]lock_irq() here, but this can
 	 * be called in early boot if the cmpxchg() fast path is disabled
@@ -1691,6 +1697,7 @@ static int __sched rt_mutex_slowlock(struct rt_mutex_base *lock,
 	ret = __rt_mutex_slowlock_locked(lock, ww_ctx, state);
 	raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
 
+	sched_resume_work();
 	return ret;
 }
 
diff --git a/kernel/locking/rwbase_rt.c b/kernel/locking/rwbase_rt.c
index 25ec0239477c2..945d474f5d27f 100644
--- a/kernel/locking/rwbase_rt.c
+++ b/kernel/locking/rwbase_rt.c
@@ -131,10 +131,21 @@ static int __sched __rwbase_read_lock(struct rwbase_rt *rwb,
 static __always_inline int rwbase_read_lock(struct rwbase_rt *rwb,
 					    unsigned int state)
 {
+	int ret;
+
 	if (rwbase_read_trylock(rwb))
 		return 0;
 
-	return __rwbase_read_lock(rwb, state);
+	/*
+	 * The task is about to sleep. For rwsems this submits work as that
+	 * might take locks and corrupt tsk::pi_blocked_on. Must be
+	 * explicit here because __rwbase_read_lock() cannot invoke
+	 * rt_mutex_slowlock(). NOP for rwlocks.
+	 */
+	rwbase_sched_submit_work();
+	ret = __rwbase_read_lock(rwb, state);
+	rwbase_sched_resume_work();
+	return ret;
 }
 
 static void __sched __rwbase_read_unlock(struct rwbase_rt *rwb,
@@ -230,7 +241,10 @@ static int __sched rwbase_write_lock(struct rwbase_rt *rwb,
 	struct rt_mutex_base *rtm = &rwb->rtmutex;
 	unsigned long flags;
 
-	/* Take the rtmutex as a first step */
+	/*
+	 * Take the rtmutex as a first step. For rwsem this will also
+	 * invoke sched_submit_work() to flush IO and workers.
+	 */
 	if (rwbase_rtmutex_lock_state(rtm, state))
 		return -EINTR;
 
diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index acb5a50309a18..aca266006ad47 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -1415,6 +1415,12 @@ static inline void __downgrade_write(struct rw_semaphore *sem)
 #define rwbase_rtmutex_lock_state(rtm, state)		\
 	__rt_mutex_lock(rtm, state)
 
+#define rwbase_sched_submit_work()			\
+	sched_submit_work()
+
+#define rwbase_sched_resume_work()			\
+	sched_resume_work()
+
 #define rwbase_rtmutex_slowlock_locked(rtm, state)	\
 	__rt_mutex_slowlock_locked(rtm, NULL, state)
 
diff --git a/kernel/locking/spinlock_rt.c b/kernel/locking/spinlock_rt.c
index 48a19ed8486d8..62c4a6866087a 100644
--- a/kernel/locking/spinlock_rt.c
+++ b/kernel/locking/spinlock_rt.c
@@ -159,6 +159,9 @@ rwbase_rtmutex_lock_state(struct rt_mutex_base *rtm, unsigned int state)
 	return 0;
 }
 
+static __always_inline void rwbase_sched_submit_work(void) { }
+static __always_inline void rwbase_sched_resume_work(void) { }
+
 static __always_inline int
 rwbase_rtmutex_slowlock_locked(struct rt_mutex_base *rtm, unsigned int state)
 {
-- 
2.40.1
