Message-ID: <ZZ1+ZicgN8dZ3zj3@casper.infradead.org>
Date: Tue, 9 Jan 2024 17:12:06 +0000
From: Matthew Wilcox <willy@...radead.org>
To: Peter Zijlstra <peterz@...radead.org>, Ingo Molnar <mingo@...hat.com>,
Will Deacon <will@...nel.org>, Waiman Long <longman@...hat.com>,
linux-kernel@...r.kernel.org,
Suren Baghdasaryan <surenb@...gle.com>,
"Liam R. Howlett" <liam.howlett@...cle.com>,
"Paul E. McKenney" <paulmck@...nel.org>
Subject: [RFC] Sleep waiting for an rwsem to be unlocked

The problem we're trying to solve is a lock-free walk of
/proc/$pid/maps. If the process is modifying the VMAs at the same time
the reader is walking them, it can see garbage. For page faults, we
handle this by taking the mmap_lock for read and retrying the page fault
(excluding any further modifications).

We don't want to take that approach for the maps file. The monitoring
task may have a significantly lower process priority, and so taking
the mmap_lock for read can block it for a significant period of time.
The obvious answer is to do some kind of backoff+sleep. But we already
have a wait queue, so why not use it?

I haven't done the rwbase version; this is just a demonstration of what
we could do. It's also untested other than by compilation. It might
well be missing something.
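
To make the intended usage concrete, a caller on the maps side might look
roughly like the sketch below. This is illustrative only: mmap_read_seq()
and walk_vmas_locklessly() are made-up placeholders for whatever
consistency check the maps code ends up using; only rwsem_wait_killable()
comes from this patch.

	/* Hypothetical caller -- not part of this patch. */
	static int walk_maps_lockless(struct mm_struct *mm)
	{
		int err;

		for (;;) {
			unsigned int seq = mmap_read_seq(mm);	/* placeholder */

			err = walk_vmas_locklessly(mm);		/* placeholder */
			if (!err && seq == mmap_read_seq(mm))
				return 0;	/* walk saw a consistent tree */

			/*
			 * A writer raced with us; instead of spinning or
			 * taking mmap_lock for read, sleep until the current
			 * writer releases it, then retry the walk.
			 */
			err = rwsem_wait_killable(&mm->mmap_lock);
			if (err)
				return err;	/* -EINTR */
		}
	}
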
Signed-off-by: Matthew Wilcox (Oracle) <willy@...radead.org>
---
include/linux/rwsem.h | 6 +++
kernel/locking/rwsem.c | 104 ++++++++++++++++++++++++++++++++++++++++-
2 files changed, 108 insertions(+), 2 deletions(-)
diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h
index 4f1c18992f76..e7bf9dfc471a 100644
--- a/include/linux/rwsem.h
+++ b/include/linux/rwsem.h
@@ -250,6 +250,12 @@ DEFINE_GUARD_COND(rwsem_write, _try, down_write_trylock(_T))
*/
extern void downgrade_write(struct rw_semaphore *sem);
+/*
+ * wait for current writer to be finished
+ */
+void rwsem_wait(struct rw_semaphore *sem);
+int __must_check rwsem_wait_killable(struct rw_semaphore *sem);
+
#ifdef CONFIG_DEBUG_LOCK_ALLOC
/*
* nested locking. NOTE: rwsems are not allowed to recurse
diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index 2340b6d90ec6..7c8096c5586f 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -332,7 +332,8 @@ EXPORT_SYMBOL(__init_rwsem);
enum rwsem_waiter_type {
RWSEM_WAITING_FOR_WRITE,
- RWSEM_WAITING_FOR_READ
+ RWSEM_WAITING_FOR_READ,
+ RWSEM_WAITING_FOR_RELEASE,
};
struct rwsem_waiter {
@@ -511,7 +512,8 @@ static void rwsem_mark_wake(struct rw_semaphore *sem,
if (waiter->type == RWSEM_WAITING_FOR_WRITE)
continue;
- woken++;
+ if (waiter->type == RWSEM_WAITING_FOR_READ)
+ woken++;
list_move_tail(&waiter->list, &wlist);
/*
@@ -1401,6 +1403,67 @@ static inline void __downgrade_write(struct rw_semaphore *sem)
preempt_enable();
}
+static inline int __wait_read_common(struct rw_semaphore *sem, int state)
+{
+ int ret = 0;
+ long adjustment = 0;
+ struct rwsem_waiter waiter;
+ DEFINE_WAKE_Q(wake_q);
+
+ waiter.task = current;
+ waiter.type = RWSEM_WAITING_FOR_RELEASE;
+ waiter.timeout = jiffies + RWSEM_WAIT_TIMEOUT;
+ waiter.handoff_set = false;
+
+ preempt_disable();
+ raw_spin_lock_irq(&sem->wait_lock);
+ if (list_empty(&sem->wait_list)) {
+ if (!(atomic_long_read(&sem->count) & RWSEM_WRITER_MASK)) {
+ /* Provide lock ACQUIRE */
+ smp_acquire__after_ctrl_dep();
+ raw_spin_unlock_irq(&sem->wait_lock);
+ goto done;
+ }
+ adjustment = RWSEM_FLAG_WAITERS;
+ }
+ rwsem_add_waiter(sem, &waiter);
+ if (adjustment) {
+ long count = atomic_long_add_return(adjustment, &sem->count);
+ rwsem_cond_wake_waiter(sem, count, &wake_q);
+ }
+ raw_spin_unlock_irq(&sem->wait_lock);
+
+ if (!wake_q_empty(&wake_q))
+ wake_up_q(&wake_q);
+
+ for (;;) {
+ set_current_state(state);
+ if (!smp_load_acquire(&waiter.task)) {
+ /* Matches rwsem_mark_wake()'s smp_store_release(). */
+ break;
+ }
+ if (signal_pending_state(state, current)) {
+ raw_spin_lock_irq(&sem->wait_lock);
+ if (waiter.task)
+ goto out_nolock;
+ raw_spin_unlock_irq(&sem->wait_lock);
+ /* Ordered by sem->wait_lock against rwsem_mark_wake(). */
+ break;
+ }
+ schedule_preempt_disabled();
+ }
+
+ __set_current_state(TASK_RUNNING);
+done:
+ preempt_enable();
+ return ret;
+out_nolock:
+ rwsem_del_wake_waiter(sem, &waiter, &wake_q);
+ __set_current_state(TASK_RUNNING);
+ ret = -EINTR;
+ goto done;
+}
+
#else /* !CONFIG_PREEMPT_RT */
#define RT_MUTEX_BUILD_MUTEX
@@ -1500,6 +1563,11 @@ static inline void __downgrade_write(struct rw_semaphore *sem)
rwbase_write_downgrade(&sem->rwbase);
}
+static inline int __wait_read_killable(struct rw_semaphore *sem)
+{
+ return rwbase_wait_lock(&sem->rwbase, TASK_KILLABLE);
+}
+
/* Debug stubs for the common API */
#define DEBUG_RWSEMS_WARN_ON(c, sem)
@@ -1643,6 +1711,38 @@ void downgrade_write(struct rw_semaphore *sem)
}
EXPORT_SYMBOL(downgrade_write);
+/**
+ * rwsem_wait_killable - Wait for current write lock holder to release lock
+ * @sem: The semaphore to wait on.
+ *
+ * This is equivalent to calling down_read(); up_read() but avoids the
+ * possibility that the thread will be preempted while holding the lock,
+ * causing threads that want to take the lock for writes to block. The
+ * intended use case is for lockless readers who notice an inconsistent
+ * state and want to wait for the current writer to finish.
+ */
+int rwsem_wait_killable(struct rw_semaphore *sem)
+{
+ might_sleep();
+
+ rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_);
+ rwsem_release(&sem->dep_map, _RET_IP_);
+
+ return __wait_read_common(sem, TASK_KILLABLE);
+}
+EXPORT_SYMBOL(rwsem_wait_killable);
+
+void rwsem_wait(struct rw_semaphore *sem)
+{
+ might_sleep();
+
+ rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_);
+ rwsem_release(&sem->dep_map, _RET_IP_);
+
+ __wait_read_common(sem, TASK_UNINTERRUPTIBLE);
+}
+EXPORT_SYMBOL(rwsem_wait);
+
#ifdef CONFIG_DEBUG_LOCK_ALLOC
void down_read_nested(struct rw_semaphore *sem, int subclass)
--
2.43.0