lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <20200308055221.1088089-1-ebiggers@kernel.org>
Date:   Sat,  7 Mar 2020 21:52:21 -0800
From:   Eric Biggers <ebiggers@...nel.org>
To:     linux-fsdevel@...r.kernel.org
Cc:     linux-xfs@...r.kernel.org, linux-ext4@...r.kernel.org,
        linux-kernel@...r.kernel.org, syzkaller-bugs@...glegroups.com
Subject: [PATCH] fs/direct-io.c: avoid workqueue allocation race

From: Eric Biggers <ebiggers@...gle.com>

When a thread loses the workqueue allocation race in
sb_init_dio_done_wq(), lockdep reports that the call to
destroy_workqueue() can deadlock waiting for work to complete.  This is
a false positive since the workqueue is empty.  But we shouldn't simply
skip the lockdep check for empty workqueues for everyone.

Just avoid this issue by using a mutex to serialize the workqueue
allocation.  We still keep the preliminary check for ->s_dio_done_wq, so
this doesn't affect direct I/O performance.

Also fix the preliminary check for ->s_dio_done_wq to use READ_ONCE(),
since it's a data race.  (That part wasn't actually found by syzbot yet,
but it could be detected by KCSAN in the future.)

Note: the lockdep false positive could alternatively be fixed by
introducing a new function like "destroy_unused_workqueue()" to the
workqueue API as previously suggested.  But I think it makes sense to
avoid the double allocation anyway.

Reported-by: syzbot+a50c7541a4a55cd49b02@...kaller.appspotmail.com
Reported-by: syzbot+5cd33f0e6abe2bb3e397@...kaller.appspotmail.com
Signed-off-by: Eric Biggers <ebiggers@...gle.com>
---
 fs/direct-io.c       | 39 ++++++++++++++++++++-------------------
 fs/internal.h        |  9 ++++++++-
 fs/iomap/direct-io.c |  3 +--
 3 files changed, 29 insertions(+), 22 deletions(-)

diff --git a/fs/direct-io.c b/fs/direct-io.c
index 00b4d15bb811..8b73a2501c03 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -590,22 +590,25 @@ static inline int dio_bio_reap(struct dio *dio, struct dio_submit *sdio)
  * filesystems that don't need it and also allows us to create the workqueue
  * late enough so the we can include s_id in the name of the workqueue.
  */
-int sb_init_dio_done_wq(struct super_block *sb)
+int __sb_init_dio_done_wq(struct super_block *sb)
 {
-	struct workqueue_struct *old;
-	struct workqueue_struct *wq = alloc_workqueue("dio/%s",
-						      WQ_MEM_RECLAIM, 0,
-						      sb->s_id);
-	if (!wq)
-		return -ENOMEM;
-	/*
-	 * This has to be atomic as more DIOs can race to create the workqueue
-	 */
-	old = cmpxchg(&sb->s_dio_done_wq, NULL, wq);
-	/* Someone created workqueue before us? Free ours... */
-	if (old)
-		destroy_workqueue(wq);
-	return 0;
+	static DEFINE_MUTEX(sb_init_dio_done_wq_mutex);
+	struct workqueue_struct *wq;
+	int err = 0;
+
+	mutex_lock(&sb_init_dio_done_wq_mutex);
+	if (sb->s_dio_done_wq)
+		goto out;
+	wq = alloc_workqueue("dio/%s", WQ_MEM_RECLAIM, 0, sb->s_id);
+	if (!wq) {
+		err = -ENOMEM;
+		goto out;
+	}
+	/* pairs with READ_ONCE() in sb_init_dio_done_wq() */
+	smp_store_release(&sb->s_dio_done_wq, wq);
+out:
+	mutex_unlock(&sb_init_dio_done_wq_mutex);
+	return err;
 }
 
 static int dio_set_defer_completion(struct dio *dio)
@@ -615,9 +618,7 @@ static int dio_set_defer_completion(struct dio *dio)
 	if (dio->defer_completion)
 		return 0;
 	dio->defer_completion = true;
-	if (!sb->s_dio_done_wq)
-		return sb_init_dio_done_wq(sb);
-	return 0;
+	return sb_init_dio_done_wq(sb);
 }
 
 /*
@@ -1250,7 +1251,7 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
 		retval = 0;
 		if (iocb->ki_flags & IOCB_DSYNC)
 			retval = dio_set_defer_completion(dio);
-		else if (!dio->inode->i_sb->s_dio_done_wq) {
+		else {
 			/*
 			 * In case of AIO write racing with buffered read we
 			 * need to defer completion. We can't decide this now,
diff --git a/fs/internal.h b/fs/internal.h
index f3f280b952a3..7813dae1dbcd 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -183,7 +183,14 @@ extern void mnt_pin_kill(struct mount *m);
 extern const struct dentry_operations ns_dentry_operations;
 
 /* direct-io.c: */
-int sb_init_dio_done_wq(struct super_block *sb);
+int __sb_init_dio_done_wq(struct super_block *sb);
+static inline int sb_init_dio_done_wq(struct super_block *sb)
+{
+	/* pairs with smp_store_release() in __sb_init_dio_done_wq() */
+	if (likely(READ_ONCE(sb->s_dio_done_wq)))
+		return 0;
+	return __sb_init_dio_done_wq(sb);
+}
 
 /*
  * fs/stat.c:
diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
index 23837926c0c5..5d81faada8a0 100644
--- a/fs/iomap/direct-io.c
+++ b/fs/iomap/direct-io.c
@@ -484,8 +484,7 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
 		dio_warn_stale_pagecache(iocb->ki_filp);
 	ret = 0;
 
-	if (iov_iter_rw(iter) == WRITE && !wait_for_completion &&
-	    !inode->i_sb->s_dio_done_wq) {
+	if (iov_iter_rw(iter) == WRITE && !wait_for_completion) {
 		ret = sb_init_dio_done_wq(inode->i_sb);
 		if (ret < 0)
 			goto out_free_dio;
-- 
2.25.1

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ