linux-kernel - [RFC 9/9] gfs2: Fix mmap + page fault deadlocks (part 2)

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <20210531170123.243771-10-agruenba@redhat.com>
Date:   Mon, 31 May 2021 19:01:23 +0200
From:   Andreas Gruenbacher <agruenba@...hat.com>
To:     Linus Torvalds <torvalds@...ux-foundation.org>
Cc:     Andreas Gruenbacher <agruenba@...hat.com>,
        cluster-devel@...hat.com, linux-kernel@...r.kernel.org,
        Alexander Viro <viro@...iv.linux.org.uk>,
        Jan Kara <jack@...e.cz>, Matthew Wilcox <willy@...radead.org>
Subject: [RFC 9/9] gfs2: Fix mmap + page fault deadlocks (part 2)

Now that we handle self-recursion on the inode glock in gfs2_fault and
gfs2_page_mkwrite, we need to take care of more complex deadlock
scenarios like the following (example by Jan Kara):

Two independent processes P1, P2. Two files F1, F2, and two mappings M1,
M2 where M1 is a mapping of F1, M2 is a mapping of F2. Now P1 does DIO
to F1 with M2 as a buffer, P2 does DIO to F2 with M1 as a buffer. They
can race like:

P1                                      P2
read()                                  read()
  gfs2_file_read_iter()                   gfs2_file_read_iter()
    gfs2_file_direct_read()                 gfs2_file_direct_read()
      locks glock of F1                       locks glock of F2
      iomap_dio_rw()                          iomap_dio_rw()
        bio_iov_iter_get_pages()                bio_iov_iter_get_pages()
          <fault in M2>                           <fault in M1>
            gfs2_fault()                            gfs2_fault()
              tries to grab glock of F2               tries to grab glock of F1

Those of scenarios are much harder to reproduce than self-recursion.

We deal with such situations by using the LM_FLAG_OUTER flag to mark
"outer" glock taking.  Then, when taking an "inner" glock, we use the
LM_FLAG_TRY flag so that locking attempts that don't succeed immediately
will be aborted.  In case of a failed locking attempt, we "unwind" to
where the "outer" glock was taken, drop the "outer" glock, and fault in
the first offending user page.  This will re-trigger the "inner" locking
attempt but without the "outer" glock being held and without the
LM_FLAG_TRY flag.  Once that's done, we re-acquire the "outer" glock and
retry the original operation.

Reported-by: Jan Kara <jack@...e.cz>
Signed-off-by: Andreas Gruenbacher <agruenba@...hat.com>
---
 fs/gfs2/bmap.c |  3 ++-
 fs/gfs2/file.c | 58 ++++++++++++++++++++++++++++++++++++++++++++------
 2 files changed, 54 insertions(+), 7 deletions(-)

diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 2ff501c413f4..82e4506984e3 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -967,7 +967,8 @@ static int gfs2_write_lock(struct inode *inode)
 	struct gfs2_sbd *sdp = GFS2_SB(inode);
 	int error;
 
-	gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &ip->i_gh);
+	gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, LM_FLAG_OUTER,
+			 &ip->i_gh);
 	error = gfs2_glock_nq(&ip->i_gh);
 	if (error)
 		goto out_uninit;
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 7d88abb4629b..3107d49a379b 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -431,21 +431,30 @@ static vm_fault_t gfs2_page_mkwrite(struct vm_fault *vmf)
 	vm_fault_t ret = VM_FAULT_LOCKED;
 	struct gfs2_holder gh;
 	unsigned int length;
+	u16 flags = 0;
 	loff_t size;
 	int err;
 
 	sb_start_pagefault(inode->i_sb);
 
-	gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
+	if (current_holds_glock())
+		flags |= LM_FLAG_TRY;
+
+	gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, flags, &gh);
 	if (likely(!outer_gh)) {
 		err = gfs2_glock_nq(&gh);
 		if (err) {
 			ret = block_page_mkwrite_return(err);
+			if (err == GLR_TRYFAILED) {
+				set_current_needs_retry(true);
+				ret = VM_FAULT_SIGBUS;
+			}
 			goto out_uninit;
 		}
 	} else {
 		if (!gfs2_holder_is_compatible(outer_gh, LM_ST_EXCLUSIVE)) {
 			/* We could try to upgrade outer_gh here. */
+			set_current_needs_retry(true);
 			ret = VM_FAULT_SIGBUS;
 			goto out_uninit;
 		}
@@ -568,20 +577,28 @@ static vm_fault_t gfs2_fault(struct vm_fault *vmf)
 	struct gfs2_holder *outer_gh = gfs2_glock_is_locked_by_me(ip->i_gl);
 	struct gfs2_holder gh;
 	vm_fault_t ret;
-	u16 state;
+	u16 state, flags = 0;
 	int err;
 
+	if (current_holds_glock())
+		flags |= LM_FLAG_TRY;
+
 	state = (vmf->flags & FAULT_FLAG_WRITE) ? LM_ST_EXCLUSIVE : LM_ST_SHARED;
-	gfs2_holder_init(ip->i_gl, state, 0, &gh);
+	gfs2_holder_init(ip->i_gl, state, flags, &gh);
 	if (likely(!outer_gh)) {
 		err = gfs2_glock_nq(&gh);
 		if (err) {
 			ret = block_page_mkwrite_return(err);
+			if (err == GLR_TRYFAILED) {
+				set_current_needs_retry(true);
+				ret = VM_FAULT_SIGBUS;
+			}
 			goto out_uninit;
 		}
 	} else {
 		if (!gfs2_holder_is_compatible(outer_gh, state)) {
 			/* We could try to upgrade outer_gh here. */
+			set_current_needs_retry(true);
 			ret = VM_FAULT_SIGBUS;
 			goto out_uninit;
 		}
@@ -807,13 +824,20 @@ static ssize_t gfs2_file_direct_read(struct kiocb *iocb, struct iov_iter *to,
 	if (!count)
 		return 0; /* skip atime */
 
-	gfs2_holder_init(ip->i_gl, LM_ST_DEFERRED, 0, gh);
+	gfs2_holder_init(ip->i_gl, LM_ST_DEFERRED, LM_FLAG_OUTER, gh);
+retry:
 	ret = gfs2_glock_nq(gh);
 	if (ret)
 		goto out_uninit;
 
 	ret = iomap_dio_rw(iocb, to, &gfs2_iomap_ops, NULL, 0);
 	gfs2_glock_dq(gh);
+	if (unlikely(current_needs_retry())) {
+		set_current_needs_retry(false);
+		if (ret == -EFAULT &&
+		    !iov_iter_fault_in_writeable(to, PAGE_SIZE))
+			goto retry;
+	}
 out_uninit:
 	gfs2_holder_uninit(gh);
 	return ret;
@@ -837,7 +861,8 @@ static ssize_t gfs2_file_direct_write(struct kiocb *iocb, struct iov_iter *from,
 	 * unfortunately, have the option of only flushing a range like the
 	 * VFS does.
 	 */
-	gfs2_holder_init(ip->i_gl, LM_ST_DEFERRED, 0, gh);
+	gfs2_holder_init(ip->i_gl, LM_ST_DEFERRED, LM_FLAG_OUTER, gh);
+retry:
 	ret = gfs2_glock_nq(gh);
 	if (ret)
 		goto out_uninit;
@@ -851,6 +876,12 @@ static ssize_t gfs2_file_direct_write(struct kiocb *iocb, struct iov_iter *from,
 		ret = 0;
 out:
 	gfs2_glock_dq(gh);
+	if (unlikely(current_needs_retry())) {
+		set_current_needs_retry(false);
+		if (ret == -EFAULT &&
+		    !iov_iter_fault_in_readable(from, PAGE_SIZE))
+			goto retry;
+	}
 out_uninit:
 	gfs2_holder_uninit(gh);
 	return ret;
@@ -883,7 +914,8 @@ static ssize_t gfs2_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
 			return ret;
 	}
 	ip = GFS2_I(iocb->ki_filp->f_mapping->host);
-	gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &gh);
+	gfs2_holder_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_OUTER, &gh);
+retry:
 	ret = gfs2_glock_nq(&gh);
 	if (ret)
 		goto out_uninit;
@@ -891,6 +923,12 @@ static ssize_t gfs2_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
 	if (ret > 0)
 		written += ret;
 	gfs2_glock_dq(&gh);
+	if (unlikely(current_needs_retry())) {
+		set_current_needs_retry(false);
+		if (ret == -EFAULT &&
+		    !iov_iter_fault_in_writeable(to, PAGE_SIZE))
+			goto retry;
+	}
 out_uninit:
 	gfs2_holder_uninit(&gh);
 	return written ? written : ret;
@@ -902,9 +940,17 @@ static ssize_t gfs2_file_buffered_write(struct kiocb *iocb, struct iov_iter *fro
 	struct inode *inode = file_inode(file);
 	ssize_t ret;
 
+retry:
 	current->backing_dev_info = inode_to_bdi(inode);
 	ret = iomap_file_buffered_write(iocb, from, &gfs2_iomap_ops);
 	current->backing_dev_info = NULL;
+	if (unlikely(current_needs_retry())) {
+		set_current_needs_retry(false);
+		if (ret == -EFAULT &&
+		    !iov_iter_fault_in_readable(from, PAGE_SIZE))
+			goto retry;
+	}
+
 	return ret;
 }
 
-- 
2.26.3