linux-kernel - [PATCH] [RFC] Trigger retry from fault vm operation

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [thread-next>] [day] [month] [year] [list]
Message-Id: <20210511140113.1225981-1-agruenba@redhat.com>
Date:   Tue, 11 May 2021 16:01:13 +0200
From:   Andreas Gruenbacher <agruenba@...hat.com>
To:     linux-fsdevel@...r.kernel.org
Cc:     cluster-devel@...hat.com, linux-mm@...ck.org,
        linux-kernel@...r.kernel.org, Jan Kara <jack@...e.cz>,
        Andreas Gruenbacher <agruenba@...hat.com>
Subject: [PATCH] [RFC] Trigger retry from fault vm operation

Hello,

we have a locking problem in gfs2 that I don't have a proper solution for, so
I'm looking for suggestions.

What's happening is that a page fault triggers during a read or write
operation, while we're holding a glock (the cluster-wide gfs2 inode
lock), and the page fault requires another glock.  We can recognize and
handle the case when both glocks are the same, but when the page fault requires
another glock, there is a chance that taking that other glock would deadlock.

When we realize that we may not be able to take the other glock in gfs2_fault,
we need to communicate that to the read or write operation, which will then
drop and re-acquire the "outer" glock and retry.  However, there doesn't seem
to be a good way to do that; we can only indicate that a page fault should fail
by returning VM_FAULT_SIGBUS or similar; that will then be mapped to -EFAULT.
We'd need something like VM_FAULT_RESTART that can be mapped to -EBUSY so that
we can tell the retry case apart from genuine -EFAULT errors.

To show what I mean, below is a proof of concept that adds a restart_hack flag
to struct task_struct as a side channel.  An even uglier alternative would be
to abuse task->journal_info.

The obvious response to this email would be "fix your locking order".  Well, we
can do that by pinning the user pages in gfs2_file_{read,write}_iter before
taking the "outer" glock, but we'd ideally only do that when it's actually
necessary (i.e., when gfs2_fault has indicated to retry).

Thanks for any thoughts.

Andreas
---
 fs/gfs2/file.c        | 37 +++++++++++++++++++++++++++++++++++--
 include/linux/sched.h |  1 +
 2 files changed, 36 insertions(+), 2 deletions(-)

diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 658fed79d65a..253e720f2df0 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -543,10 +543,15 @@ static vm_fault_t gfs2_fault(struct vm_fault *vmf)
 	vm_fault_t ret;
 	int err;
 
-	gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &gh);
+	gfs2_holder_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_TRY, &gh);
 	if (likely(!recursive)) {
 		err = gfs2_glock_nq(&gh);
 		if (err) {
+			if (err == GLR_TRYFAILED) {
+				current->restart_hack = 1;
+				ret = VM_FAULT_SIGBUS;
+				goto out_uninit;
+			}
 			ret = block_page_mkwrite_return(err);
 			goto out_uninit;
 		}
@@ -773,12 +778,16 @@ static ssize_t gfs2_file_direct_read(struct kiocb *iocb, struct iov_iter *to,
 		return 0; /* skip atime */
 
 	gfs2_holder_init(ip->i_gl, LM_ST_DEFERRED, 0, gh);
+restart:
 	ret = gfs2_glock_nq(gh);
 	if (ret)
 		goto out_uninit;
 
+	current->restart_hack = 0;
 	ret = iomap_dio_rw(iocb, to, &gfs2_iomap_ops, NULL, 0);
 	gfs2_glock_dq(gh);
+	if (unlikely(ret == -EFAULT) && current->restart_hack)
+		goto restart;
 out_uninit:
 	gfs2_holder_uninit(gh);
 	return ret;
@@ -803,6 +812,8 @@ static ssize_t gfs2_file_direct_write(struct kiocb *iocb, struct iov_iter *from,
 	 * VFS does.
 	 */
 	gfs2_holder_init(ip->i_gl, LM_ST_DEFERRED, 0, gh);
+
+restart:
 	ret = gfs2_glock_nq(gh);
 	if (ret)
 		goto out_uninit;
@@ -811,11 +822,14 @@ static ssize_t gfs2_file_direct_write(struct kiocb *iocb, struct iov_iter *from,
 	if (offset + len > i_size_read(&ip->i_inode))
 		goto out;
 
+	current->restart_hack = 0;
 	ret = iomap_dio_rw(iocb, from, &gfs2_iomap_ops, NULL, 0);
 	if (ret == -ENOTBLK)
 		ret = 0;
 out:
 	gfs2_glock_dq(gh);
+	if (unlikely(ret == -EFAULT) && current->restart_hack)
+		goto restart;
 out_uninit:
 	gfs2_holder_uninit(gh);
 	return ret;
@@ -834,7 +848,9 @@ static ssize_t gfs2_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
 			return ret;
 		iocb->ki_flags &= ~IOCB_DIRECT;
 	}
+restart1:
 	iocb->ki_flags |= IOCB_NOIO;
+	current->restart_hack = 0;
 	ret = generic_file_read_iter(iocb, to);
 	iocb->ki_flags &= ~IOCB_NOIO;
 	if (ret >= 0) {
@@ -842,6 +858,8 @@ static ssize_t gfs2_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
 			return ret;
 		written = ret;
 	} else {
+		if (unlikely(ret == -EFAULT) && current->restart_hack)
+			goto restart1;
 		if (ret != -EAGAIN)
 			return ret;
 		if (iocb->ki_flags & IOCB_NOWAIT)
@@ -849,13 +867,18 @@ static ssize_t gfs2_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
 	}
 	ip = GFS2_I(iocb->ki_filp->f_mapping->host);
 	gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &gh);
+
+restart2:
 	ret = gfs2_glock_nq(&gh);
 	if (ret)
 		goto out_uninit;
+	current->restart_hack = 0;
 	ret = generic_file_read_iter(iocb, to);
 	if (ret > 0)
 		written += ret;
 	gfs2_glock_dq(&gh);
+	if (unlikely(ret == -EFAULT) && current->restart_hack)
+		goto restart2;
 out_uninit:
 	gfs2_holder_uninit(&gh);
 	return written ? written : ret;
@@ -912,13 +935,18 @@ static ssize_t gfs2_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
 			goto out_unlock;
 
 		iocb->ki_flags |= IOCB_DSYNC;
+restart1:
+		current->restart_hack = 0;
 		current->backing_dev_info = inode_to_bdi(inode);
 		buffered = iomap_file_buffered_write(iocb, from, &gfs2_iomap_ops);
 		current->backing_dev_info = NULL;
 		if (unlikely(buffered <= 0)) {
+			if (unlikely(buffered == -EFAULT) && current->restart_hack)
+				goto restart1;
 			if (!ret)
 				ret = buffered;
 			goto out_unlock;
+		}
 
 		/*
 		 * We need to ensure that the page cache pages are written to
@@ -935,10 +963,15 @@ static ssize_t gfs2_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
 		if (!ret || ret2 > 0)
 			ret += ret2;
 	} else {
+restart2:
+		current->restart_hack = 0;
 		current->backing_dev_info = inode_to_bdi(inode);
 		ret = iomap_file_buffered_write(iocb, from, &gfs2_iomap_ops);
 		current->backing_dev_info = NULL;
-		if (likely(ret > 0)) {
+		if (unlikely(ret <= 0)) {
+			if (unlikely(ret == -EFAULT) && current->restart_hack)
+				goto restart2;
+		} else {
 			iocb->ki_pos += ret;
 			ret = generic_write_sync(iocb, ret);
 		}
diff --git a/include/linux/sched.h b/include/linux/sched.h
index d2c881384517..de053ac8d8d6 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1067,6 +1067,7 @@ struct task_struct {
 
 	/* Journalling filesystem info: */
 	void				*journal_info;
+	unsigned			restart_hack:1;
 
 	/* Stacked block device info: */
 	struct bio_list			*bio_list;
-- 
2.26.3