[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <166147984375.25420.13018600986239729815.stgit@noble.brown>
Date: Fri, 26 Aug 2022 12:10:43 +1000
From: NeilBrown <neilb@...e.de>
To: Al Viro <viro@...iv.linux.org.uk>,
Linus Torvalds <torvalds@...ux-foundation.org>,
Daire Byrne <daire@...g.com>,
Trond Myklebust <trond.myklebust@...merspace.com>,
Chuck Lever <chuck.lever@...cle.com>
Cc: Linux NFS Mailing List <linux-nfs@...r.kernel.org>,
linux-fsdevel@...r.kernel.org, LKML <linux-kernel@...r.kernel.org>
Subject: [PATCH 06/10] VFS: support concurrent renames.
An object can now be renamed from or to a directory in which a create
or unlink is concurrently happening.
Two or more renames within the one directory can also be concurrent.
s_vfs_rename_mutex still serialises lookups for cross-directory renames,
but the renames themselves can proceed concurrently.
A core part of this change is introducing lock_rename_lookup()
which both locks the directories and performs the lookups.
If the filesystem supports shared-lock updates and a wq is provided,
shared locks are used on directories, otherwise exclusive locks.
DCACHE_PAR_UPDATE is always set on the found dentries.
unlock_rename_lookup() performs appropriate unlocking. It needs to be
told if a wq was provided to lock_rename_lookup().
As we may use alloc_dentry_parallel() which can block, we need to be
careful of the case where both names are the same, in the same
directory. If the first ->lookup() chooses not to complete the lookup -
as may happen with LOOKUP_RENAME_TARGET - then the second will block.
LOOKUP_RENAME_TARGET is only expected on the first name listed, so we
make sure to look up the second name given first.
Signed-off-by: NeilBrown <neilb@...e.de>
---
fs/namei.c | 221 ++++++++++++++++++++++++++++++++++++++++++++-----
include/linux/namei.h | 10 ++
2 files changed, 208 insertions(+), 23 deletions(-)
diff --git a/fs/namei.c b/fs/namei.c
index 13f8ac9721be..a7c458cc787c 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -3156,6 +3156,187 @@ void unlock_rename(struct dentry *p1, struct dentry *p2)
}
EXPORT_SYMBOL(unlock_rename);
+/**
+ * lock_rename_lookup - lock two directories and look up one name in each
+ * @p1: first parent directory
+ * @p2: second parent directory (may be the same dentry as @p1)
+ * @d1p: on success, receives the dentry found for @last1 in @p1
+ * @d2p: on success, receives the dentry found for @last2 in @p2
+ * @last1: name to look up in @p1
+ * @last2: name to look up in @p2
+ * @flags1: lookup flags passed to __lookup_hash() for @last1
+ * @flags2: lookup flags passed to __lookup_hash() for @last2
+ * @wq: wait queue passed to __lookup_hash(); may be NULL
+ *
+ * The directory inodes are locked shared when @wq is non-NULL and
+ * IS_PAR_UPDATE() is true for @p1's inode, otherwise they are locked
+ * exclusively.  When @p1 and @p2 differ, s_vfs_rename_mutex is taken
+ * before the directory locks.
+ *
+ * After both lookups succeed the two dentries are locked with
+ * d_lock_update_nested(), lowest address first.  If either of those
+ * fails, both dentries are dropped and the lookups are repeated.
+ *
+ * Return: an ERR_PTR() if either lookup fails; in that case every lock
+ * taken here has been released and *@d1p / *@d2p are not written.
+ * Otherwise NULL when @p1 == @p2, or the value returned by d_ancestor()
+ * for the two directories (NULL if neither d_ancestor() call matched).
+ */
+static struct dentry *lock_rename_lookup(struct dentry *p1, struct dentry *p2,
+					 struct dentry **d1p, struct dentry **d2p,
+					 struct qstr *last1, struct qstr *last2,
+					 unsigned int flags1, unsigned int flags2,
+					 wait_queue_head_t *wq)
+{
+	struct dentry *p;
+	struct dentry *d1, *d2;
+	bool ok1, ok2;
+	bool shared = wq && IS_PAR_UPDATE(p1->d_inode);
+
+	if (p1 == p2) {
+		/* Same directory: one inode lock, no s_vfs_rename_mutex. */
+		if (shared)
+			inode_lock_shared_nested(p1->d_inode, I_MUTEX_PARENT);
+		else
+			inode_lock_nested(p1->d_inode, I_MUTEX_PARENT);
+	retry:
+		/* last1 is expected to be the rename target, and so might
+		 * be looked up lazily.  Look up last2 first so that the
+		 * second lookup never has to wait for the first.
+		 */
+		d2 = __lookup_hash(last2, p2, flags2, wq);
+		if (IS_ERR(d2))
+			goto out_unlock_2;
+		d1 = __lookup_hash(last1, p1, flags1, wq);
+		if (IS_ERR(d1))
+			goto out_unlock_1;
+		*d1p = d1; *d2p = d2;
+
+		/* Take the per-dentry update locks in address order. */
+		if (d1 < d2) {
+			ok1 = d_lock_update_nested(d1, p1, last1,
+						   I_MUTEX_PARENT);
+			ok2 = d_lock_update_nested(d2, p2, last2,
+						   I_MUTEX_PARENT2);
+		} else if (d1 > d2) {
+			ok2 = d_lock_update_nested(d2, p2, last2,
+						   I_MUTEX_PARENT);
+			ok1 = d_lock_update_nested(d1, p1, last1,
+						   I_MUTEX_PARENT2);
+		} else {
+			/* d1 == d2 !! - both names gave the same dentry,
+			 * so it must only be locked once.
+			 */
+			ok1 = d_lock_update_nested(d1, p1, last1,
+						   I_MUTEX_PARENT);
+			ok2 = ok1;
+		}
+		if (!ok1 || !ok2) {
+			/* Undo whichever lock was obtained, drop both
+			 * references and repeat the lookups.
+			 */
+			if (ok1)
+				d_unlock_update(d1);
+			if (ok2)
+				d_unlock_update(d2);
+			dput(d1);
+			dput(d2);
+			goto retry;
+		}
+		return NULL;
+	out_unlock_1:
+		/* Lookup of last1 failed: release d2 and carry d1's error. */
+		d_lookup_done(d2);
+		dput(d2);
+		d2 = d1;
+	out_unlock_2:
+		if (shared)
+			inode_unlock_shared(p1->d_inode);
+		else
+			inode_unlock(p1->d_inode);
+		/* d2 holds the error on both failure paths; d1 has not been
+		 * assigned at all when the lookup of last2 failed, so it
+		 * must not be returned here.
+		 */
+		return d2;
+	}
+
+	mutex_lock(&p1->d_sb->s_vfs_rename_mutex);
+
+	/* Pick lock order and lockdep subclasses from the ancestry of the
+	 * two directories, as reported by d_ancestor().
+	 */
+	if ((p = d_ancestor(p2, p1)) != NULL) {
+		if (shared) {
+			inode_lock_shared_nested(p2->d_inode, I_MUTEX_PARENT);
+			inode_lock_shared_nested(p1->d_inode, I_MUTEX_CHILD);
+		} else {
+			inode_lock_nested(p2->d_inode, I_MUTEX_PARENT);
+			inode_lock_nested(p1->d_inode, I_MUTEX_CHILD);
+		}
+	} else if ((p = d_ancestor(p1, p2)) != NULL) {
+		if (shared) {
+			inode_lock_shared_nested(p1->d_inode, I_MUTEX_PARENT);
+			inode_lock_shared_nested(p2->d_inode, I_MUTEX_CHILD);
+		} else {
+			inode_lock_nested(p1->d_inode, I_MUTEX_PARENT);
+			inode_lock_nested(p2->d_inode, I_MUTEX_CHILD);
+		}
+	} else {
+		if (shared) {
+			inode_lock_shared_nested(p1->d_inode, I_MUTEX_PARENT);
+			inode_lock_shared_nested(p2->d_inode, I_MUTEX_PARENT2);
+		} else {
+			inode_lock_nested(p1->d_inode, I_MUTEX_PARENT);
+			inode_lock_nested(p2->d_inode, I_MUTEX_PARENT2);
+		}
+	}
+retry2:
+	d1 = __lookup_hash(last1, p1, flags1, wq);
+	if (IS_ERR(d1))
+		goto unlock_out_3;
+	d2 = __lookup_hash(last2, p2, flags2, wq);
+	if (IS_ERR(d2))
+		goto unlock_out_4;
+
+	/* Take the per-dentry update locks in address order. */
+	if (d1 < d2) {
+		ok1 = d_lock_update_nested(d1, p1, last1, I_MUTEX_PARENT);
+		ok2 = d_lock_update_nested(d2, p2, last2, I_MUTEX_PARENT2);
+	} else {
+		ok2 = d_lock_update_nested(d2, p2, last2, I_MUTEX_PARENT);
+		ok1 = d_lock_update_nested(d1, p1, last1, I_MUTEX_PARENT2);
+	}
+	if (!ok1 || !ok2) {
+		if (ok1)
+			d_unlock_update(d1);
+		if (ok2)
+			d_unlock_update(d2);
+		dput(d1);
+		dput(d2);
+		goto retry2;
+	}
+	*d1p = d1;
+	*d2p = d2;
+	return p;
+unlock_out_4:
+	/* Lookup of last2 failed: release d1 and carry d2's error in d1. */
+	d_lookup_done(d1);
+	dput(d1);
+	d1 = d2;
+unlock_out_3:
+	if (shared) {
+		inode_unlock_shared(p1->d_inode);
+		inode_unlock_shared(p2->d_inode);
+	} else {
+		inode_unlock(p1->d_inode);
+		inode_unlock(p2->d_inode);
+	}
+	mutex_unlock(&p1->d_sb->s_vfs_rename_mutex);
+	return d1;
+}
+
+/**
+ * lock_rename_lookup_one - lock_rename_lookup() taking plain name strings
+ * @p1: first parent directory
+ * @p2: second parent directory
+ * @d1p: passed through to lock_rename_lookup()
+ * @d2p: passed through to lock_rename_lookup()
+ * @name1: name to look up in @p1
+ * @nlen1: length of @name1
+ * @name2: name to look up in @p2
+ * @nlen2: length of @name2
+ * @flags1: lookup flags for @name1
+ * @flags2: lookup flags for @name2
+ * @wq: wait queue passed through to lock_rename_lookup(); may be NULL
+ *
+ * Each name is first turned into a qstr by lookup_one_common(), called
+ * with init_user_ns.  If that fails for either name the error is
+ * returned as an ERR_PTR() before any lock has been taken.
+ *
+ * Return: an ERR_PTR() from name preparation, otherwise whatever
+ * lock_rename_lookup() returns.
+ */
+struct dentry *lock_rename_lookup_one(struct dentry *p1, struct dentry *p2,
+				      struct dentry **d1p, struct dentry **d2p,
+				      const char *name1, int nlen1,
+				      const char *name2, int nlen2,
+				      unsigned int flags1, unsigned int flags2,
+				      wait_queue_head_t *wq)
+{
+	struct qstr q1, q2;
+	int ret;
+
+	/* Prepare both names; the second is only tried if the first is OK. */
+	ret = lookup_one_common(&init_user_ns, name1, p1, nlen1, &q1);
+	if (!ret)
+		ret = lookup_one_common(&init_user_ns, name2, p2, nlen2, &q2);
+	if (ret)
+		return ERR_PTR(ret);
+
+	return lock_rename_lookup(p1, p2, d1p, d2p, &q1, &q2,
+				  flags1, flags2, wq);
+}
+EXPORT_SYMBOL(lock_rename_lookup_one);
+
+/**
+ * unlock_rename_lookup - release what lock_rename_lookup() acquired
+ * @p1: first parent directory
+ * @p2: second parent directory
+ * @d1: dentry that was returned for the name in @p1
+ * @d2: dentry that was returned for the name in @p2
+ * @with_wq: true if a wait queue was given to lock_rename_lookup()
+ *
+ * Ends any lookup still in progress on the dentries, drops their update
+ * locks (only once when @d1 == @d2), unlocks the directories and finally
+ * drops one reference on each of @d1 and @d2.
+ *
+ * The directories are unlocked as shared only when @with_wq is true and
+ * IS_PAR_UPDATE() holds for @p1's inode; otherwise unlock_rename() is
+ * used.
+ */
+void unlock_rename_lookup(struct dentry *p1, struct dentry *p2,
+			  struct dentry *d1, struct dentry *d2,
+			  bool with_wq)
+{
+	bool par_update = with_wq && IS_PAR_UPDATE(p1->d_inode);
+
+	d_lookup_done(d1);
+	d_lookup_done(d2);
+
+	d_unlock_update(d1);
+	if (d1 != d2)
+		d_unlock_update(d2);
+
+	if (!par_update) {
+		unlock_rename(p1, p2);
+	} else {
+		inode_unlock_shared(p1->d_inode);
+		if (p2 != p1) {
+			/* Cross-directory: second inode and the rename mutex. */
+			inode_unlock_shared(p2->d_inode);
+			mutex_unlock(&p1->d_sb->s_vfs_rename_mutex);
+		}
+	}
+
+	dput(d1);
+	dput(d2);
+}
+EXPORT_SYMBOL(unlock_rename_lookup);
+
/**
* mode_strip_umask - handle vfs umask stripping
* @dir: parent directory of the new inode
@@ -4945,6 +5126,7 @@ int do_renameat2(int olddfd, struct filename *from, int newdfd,
unsigned int lookup_flags = 0, target_flags = LOOKUP_RENAME_TARGET;
bool should_retry = false;
int error = -EINVAL;
+ DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
goto put_names;
@@ -4985,58 +5167,53 @@ int do_renameat2(int olddfd, struct filename *from, int newdfd,
goto exit2;
retry_deleg:
- trap = lock_rename(new_path.dentry, old_path.dentry);
-
- old_dentry = __lookup_hash(&old_last, old_path.dentry,
- lookup_flags, NULL);
- error = PTR_ERR(old_dentry);
- if (IS_ERR(old_dentry))
+ trap = lock_rename_lookup(new_path.dentry, old_path.dentry,
+ &new_dentry, &old_dentry,
+ &new_last, &old_last,
+ lookup_flags | target_flags, lookup_flags,
+ &wq);
+ if (IS_ERR(trap))
goto exit3;
/* source must exist */
error = -ENOENT;
if (d_is_negative(old_dentry))
goto exit4;
- new_dentry = __lookup_hash(&new_last, new_path.dentry,
- lookup_flags | target_flags, NULL);
- error = PTR_ERR(new_dentry);
- if (IS_ERR(new_dentry))
- goto exit4;
error = -EEXIST;
if ((flags & RENAME_NOREPLACE) && d_is_positive(new_dentry))
- goto exit5;
+ goto exit4;
if (flags & RENAME_EXCHANGE) {
error = -ENOENT;
if (d_is_negative(new_dentry))
- goto exit5;
+ goto exit4;
if (!d_is_dir(new_dentry)) {
error = -ENOTDIR;
if (new_last.name[new_last.len])
- goto exit5;
+ goto exit4;
}
}
/* unless the source is a directory trailing slashes give -ENOTDIR */
if (!d_is_dir(old_dentry)) {
error = -ENOTDIR;
if (old_last.name[old_last.len])
- goto exit5;
+ goto exit4;
if (!(flags & RENAME_EXCHANGE) && new_last.name[new_last.len])
- goto exit5;
+ goto exit4;
}
/* source should not be ancestor of target */
error = -EINVAL;
if (old_dentry == trap)
- goto exit5;
+ goto exit4;
/* target should not be an ancestor of source */
if (!(flags & RENAME_EXCHANGE))
error = -ENOTEMPTY;
if (new_dentry == trap)
- goto exit5;
+ goto exit4;
error = security_path_rename(&old_path, old_dentry,
&new_path, new_dentry, flags);
if (error)
- goto exit5;
+ goto exit4;
rd.old_dir = old_path.dentry->d_inode;
rd.old_dentry = old_dentry;
@@ -5047,12 +5224,10 @@ int do_renameat2(int olddfd, struct filename *from, int newdfd,
rd.delegated_inode = &delegated_inode;
rd.flags = flags;
error = vfs_rename(&rd);
-exit5:
- dput(new_dentry);
exit4:
- dput(old_dentry);
+ unlock_rename_lookup(new_path.dentry, old_path.dentry, new_dentry, old_dentry,
+ true);
exit3:
- unlock_rename(new_path.dentry, old_path.dentry);
if (delegated_inode) {
error = break_deleg_wait(&delegated_inode);
if (!error)
diff --git a/include/linux/namei.h b/include/linux/namei.h
index b1a210a51210..29756921f69b 100644
--- a/include/linux/namei.h
+++ b/include/linux/namei.h
@@ -108,6 +108,16 @@ extern int follow_up(struct path *);
extern struct dentry *lock_rename(struct dentry *, struct dentry *);
extern void unlock_rename(struct dentry *, struct dentry *);
+/*
+ * Lock two directories and look up one name in each; undone with
+ * unlock_rename_lookup().  @with_wq must say whether a wait queue was
+ * passed to the locking call.
+ */
+extern struct dentry *lock_rename_lookup_one(
+	struct dentry *p1, struct dentry *p2,
+	struct dentry **d1p, struct dentry **d2p,
+	const char *name1, int nlen1,
+	const char *name2, int nlen2,
+	unsigned int flags1, unsigned int flags2,
+	wait_queue_head_t *wq);
+extern void unlock_rename_lookup(struct dentry *p1, struct dentry *p2,
+				 struct dentry *d1, struct dentry *d2,
+				 bool with_wq);
extern int __must_check nd_jump_link(struct path *path);
Powered by blists - more mailing lists