linux-kernel - [PATCH v2] locking/semaphore: Use wake_q to wake up processes outside lock critical section

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [day] [month] [year] [list]
Message-Id: <20220210181019.1259677-1-longman@redhat.com>
Date:   Thu, 10 Feb 2022 13:10:19 -0500
From:   Waiman Long <longman@...hat.com>
To:     Peter Zijlstra <peterz@...radead.org>,
        Ingo Molnar <mingo@...hat.com>,
        Will Deacon <will.deacon@....com>,
        Boqun Feng <boqun.feng@...il.com>
Cc:     linux-kernel@...r.kernel.org, Waiman Long <longman@...hat.com>
Subject: [PATCH v2] locking/semaphore: Use wake_q to wake up processes outside lock critical section

The following lockdep splat was observed:

[ 9776.459819] ======================================================
[ 9776.459820] WARNING: possible circular locking dependency detected
[ 9776.459821] 5.14.0-0.rc4.35.el9.x86_64+debug #1 Not tainted
[ 9776.459823] ------------------------------------------------------
[ 9776.459824] stress-ng/117708 is trying to acquire lock:
[ 9776.459825] ffffffff892d41d8 ((console_sem).lock){-...}-{2:2}, at: down_trylock+0x13/0x70

[ 9776.459831] but task is already holding lock:
[ 9776.459832] ffff888e005f6d18 (&rq->__lock){-.-.}-{2:2}, at: raw_spin_rq_lock_nested+0x27/0x130

[ 9776.459837] which lock already depends on the new lock.
      :
[ 9776.459857] -> #1 (&p->pi_lock){-.-.}-{2:2}:
[ 9776.459860]        __lock_acquire+0xb72/0x1870
[ 9776.459861]        lock_acquire+0x1ca/0x570
[ 9776.459862]        _raw_spin_lock_irqsave+0x40/0x90
[ 9776.459863]        try_to_wake_up+0x9d/0x1210
[ 9776.459864]        up+0x7a/0xb0
[ 9776.459864]        __up_console_sem+0x33/0x70
[ 9776.459865]        console_unlock+0x3a1/0x5f0
[ 9776.459866]        vprintk_emit+0x23b/0x2b0
[ 9776.459867]        devkmsg_emit.constprop.0+0xab/0xdc
[ 9776.459868]        devkmsg_write.cold+0x4e/0x78
[ 9776.459869]        do_iter_readv_writev+0x343/0x690
[ 9776.459870]        do_iter_write+0x123/0x340
[ 9776.459871]        vfs_writev+0x19d/0x520
[ 9776.459871]        do_writev+0x110/0x290
[ 9776.459872]        do_syscall_64+0x3b/0x90
[ 9776.459873]        entry_SYSCALL_64_after_hwframe+0x44/0xae
      :
[ 9776.459905] Chain exists of:
[ 9776.459906]   (console_sem).lock --> &p->pi_lock --> &rq->__lock

[ 9776.459911]  Possible unsafe locking scenario:

[ 9776.459913]        CPU0                    CPU1
[ 9776.459914]        ----                    ----
[ 9776.459914]   lock(&rq->__lock);
[ 9776.459917]                                lock(&p->pi_lock);
[ 9776.459919]                                lock(&rq->__lock);
[ 9776.459921]   lock((console_sem).lock);

[ 9776.459923]  *** DEADLOCK ***
[ 9776.459925] 2 locks held by stress-ng/117708:
[ 9776.459925]  #0: ffffffff89403960 (&cpuset_rwsem){++++}-{0:0}, at: __sched_setscheduler+0xe2f/0x2c80
[ 9776.459930]  #1: ffff888e005f6d18 (&rq->__lock){-.-.}-{2:2}, at: raw_spin_rq_lock_nested+0x27/0x130

[ 9776.459935] stack backtrace:
[ 9776.459936] CPU: 95 PID: 117708 Comm: stress-ng Kdump: loaded Not tainted 5.14.0-0.rc4.35.el9.x86_64+debug #1
[ 9776.459938] Hardware name: FUJITSU PRIMEQUEST 2800E3/D3752, BIOS PRIMEQUEST 2000 Series BIOS Version 01.51 06/29/2020
[ 9776.459939] Call Trace:
[ 9776.459940]  <IRQ>
[ 9776.459940]  dump_stack_lvl+0x57/0x7d
[ 9776.459941]  check_noncircular+0x26a/0x310
[ 9776.459945]  check_prev_add+0x15e/0x20f0
[ 9776.459946]  validate_chain+0xaba/0xde0
[ 9776.459948]  __lock_acquire+0xb72/0x1870
[ 9776.459949]  lock_acquire+0x1ca/0x570
[ 9776.459952]  _raw_spin_lock_irqsave+0x40/0x90
[ 9776.459954]  down_trylock+0x13/0x70
[ 9776.459955]  __down_trylock_console_sem+0x2a/0xb0
[ 9776.459956]  console_trylock_spinning+0x13/0x1f0
[ 9776.459957]  vprintk_emit+0x1e6/0x2b0
[ 9776.459958]  printk+0xb2/0xe3
[ 9776.459960]  __warn_printk+0x9b/0xf3
[ 9776.459964]  update_rq_clock+0x3c2/0x780
[ 9776.459966]  do_sched_rt_period_timer+0x19e/0x9a0
[ 9776.459968]  sched_rt_period_timer+0x6b/0x150
[ 9776.459969]  __run_hrtimer+0x27a/0xb20
[ 9776.459970]  __hrtimer_run_queues+0x159/0x260
[ 9776.459974]  hrtimer_interrupt+0x2cb/0x8f0
[ 9776.459976]  __sysvec_apic_timer_interrupt+0x13e/0x540
[ 9776.459977]  sysvec_apic_timer_interrupt+0x6a/0x90
[ 9776.459977]  </IRQ>

The problematic locking sequence ((console_sem).lock --> &p->pi_lock)
is caused by the fact that the semaphore up() function calls
wake_up_process() while holding the semaphore raw spinlock.

The (&rq->__lock --> (console_sem).lock) locking sequence seems to be
caused by a SCHED_WARN_ON() call in update_rq_clock(). To work around
this problematic locking sequence, we may have to ban all WARN*() calls
when the rq lock is held, which may be too restrictive, or we may have
to add a WARN_DEFERRED() call which can be quite a lot of work.

On the other hand, moving the wake_up_process() call out of the
raw spinlock critical section using wake_q breaks the first
problematic locking sequence and also reduces the raw spinlock hold
time. This is easier and cleaner.

Signed-off-by: Waiman Long <longman@...hat.com>
---
 kernel/locking/semaphore.c | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/kernel/locking/semaphore.c b/kernel/locking/semaphore.c
index 9ee381e4d2a4..a26c915430ba 100644
--- a/kernel/locking/semaphore.c
+++ b/kernel/locking/semaphore.c
@@ -29,6 +29,7 @@
 #include <linux/export.h>
 #include <linux/sched.h>
 #include <linux/sched/debug.h>
+#include <linux/sched/wake_q.h>
 #include <linux/semaphore.h>
 #include <linux/spinlock.h>
 #include <linux/ftrace.h>
@@ -37,7 +38,7 @@ static noinline void __down(struct semaphore *sem);
 static noinline int __down_interruptible(struct semaphore *sem);
 static noinline int __down_killable(struct semaphore *sem);
 static noinline int __down_timeout(struct semaphore *sem, long timeout);
-static noinline void __up(struct semaphore *sem);
+static noinline void __up(struct semaphore *sem, struct wake_q_head *wake_q);
 
 /**
  * down - acquire the semaphore
@@ -182,13 +183,16 @@ EXPORT_SYMBOL(down_timeout);
 void up(struct semaphore *sem)
 {
 	unsigned long flags;
+	DEFINE_WAKE_Q(wake_q);
 
 	raw_spin_lock_irqsave(&sem->lock, flags);
 	if (likely(list_empty(&sem->wait_list)))
 		sem->count++;
 	else
-		__up(sem);
+		__up(sem, &wake_q);
 	raw_spin_unlock_irqrestore(&sem->lock, flags);
+	if (!wake_q_empty(&wake_q))
+		wake_up_q(&wake_q);
 }
 EXPORT_SYMBOL(up);
 
@@ -256,11 +260,12 @@ static noinline int __sched __down_timeout(struct semaphore *sem, long timeout)
 	return __down_common(sem, TASK_UNINTERRUPTIBLE, timeout);
 }
 
-static noinline void __sched __up(struct semaphore *sem)
+static noinline void __sched __up(struct semaphore *sem,
+				  struct wake_q_head *wake_q)
 {
 	struct semaphore_waiter *waiter = list_first_entry(&sem->wait_list,
 						struct semaphore_waiter, list);
 	list_del(&waiter->list);
 	waiter->up = true;
-	wake_up_process(waiter->task);
+	wake_q_add(wake_q, waiter->task);
 }
-- 
2.27.0