Message-ID: <575461a6-2e6c-a4ab-6fd0-966ce01bafd0@redhat.com>
Date: Tue, 17 May 2022 08:48:14 +0800
From: Xiubo Li <xiubli@...hat.com>
To: Jeff Layton <jlayton@...nel.org>, viro@...iv.linux.org.uk
Cc: idryomov@...il.com, vshankar@...hat.com,
ceph-devel@...r.kernel.org, mcgrof@...nel.org,
akpm@...ux-foundation.org, arnd@...db.de,
linux-fsdevel@...r.kernel.org, linux-kernel@...r.kernel.org
Subject: Re: [PATCH 2/2] ceph: wait the first reply of inflight unlink/rmdir
On 5/17/22 12:58 AM, Jeff Layton wrote:
> On Mon, 2022-05-16 at 22:56 +0800, Xiubo Li wrote:
>> On 5/16/22 9:23 PM, Jeff Layton wrote:
>>> On Mon, 2022-05-16 at 20:20 +0800, Xiubo Li wrote:
>>>> In the async unlink case the kclient won't wait for the first
>>>> reply from the MDS: it just drops all the links, unhashes the
>>>> dentry and then succeeds immediately.
>>>>
>>>> For any subsequent create/link/rename, etc. requests using the
>>>> same file names we must wait for the first reply of the inflight
>>>> unlink request, or the MDS may fail these requests with -EEXIST
>>>> if the inflight async unlink request was delayed for some reason.
>>>>
>>>> The worst case is that a non-async openc request will
>>>> successfully open the file if the CDentry hasn't been unlinked
>>>> yet, but the delayed async unlink request will later remove the
>>>> CDentry. That means the just-created file could be deleted later
>>>> by accident.
>>>>
>>>> We need to wait for any inflight async unlink request to finish
>>>> when creating new files/directories with the same names.
>>>>
>>>> URL: https://tracker.ceph.com/issues/55332
>>>> Signed-off-by: Xiubo Li <xiubli@...hat.com>
>>>> ---
>>>>  fs/ceph/dir.c        | 55 +++++++++++++++++++++++++++++++----
>>>>  fs/ceph/file.c       |  5 ++++
>>>>  fs/ceph/mds_client.c | 69 ++++++++++++++++++++++++++++++++++++++++++++
>>>>  fs/ceph/mds_client.h |  1 +
>>>>  fs/ceph/super.c      |  2 ++
>>>>  fs/ceph/super.h      | 18 ++++++++----
>>>>  6 files changed, 140 insertions(+), 10 deletions(-)
>>>>
>>>> diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
>>>> index eae417d71136..20c648406528 100644
>>>> --- a/fs/ceph/dir.c
>>>> +++ b/fs/ceph/dir.c
>>>> @@ -856,6 +856,10 @@ static int ceph_mknod(struct user_namespace *mnt_userns, struct inode *dir,
>>>>  	if (ceph_snap(dir) != CEPH_NOSNAP)
>>>>  		return -EROFS;
>>>>
>>>> +	err = ceph_wait_on_conflict_unlink(dentry);
>>>> +	if (err)
>>>> +		return err;
>>>> +
>>>>  	if (ceph_quota_is_max_files_exceeded(dir)) {
>>>>  		err = -EDQUOT;
>>>>  		goto out;
>>>> @@ -918,6 +922,10 @@ static int ceph_symlink(struct user_namespace *mnt_userns, struct inode *dir,
>>>>  	if (ceph_snap(dir) != CEPH_NOSNAP)
>>>>  		return -EROFS;
>>>>
>>>> +	err = ceph_wait_on_conflict_unlink(dentry);
>>>> +	if (err)
>>>> +		return err;
>>>> +
>>>>  	if (ceph_quota_is_max_files_exceeded(dir)) {
>>>>  		err = -EDQUOT;
>>>>  		goto out;
>>>> @@ -968,9 +976,13 @@ static int ceph_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
>>>>  	struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dir->i_sb);
>>>>  	struct ceph_mds_request *req;
>>>>  	struct ceph_acl_sec_ctx as_ctx = {};
>>>> -	int err = -EROFS;
>>>> +	int err;
>>>>  	int op;
>>>>
>>>> +	err = ceph_wait_on_conflict_unlink(dentry);
>>>> +	if (err)
>>>> +		return err;
>>>> +
>>>>  	if (ceph_snap(dir) == CEPH_SNAPDIR) {
>>>>  		/* mkdir .snap/foo is a MKSNAP */
>>>>  		op = CEPH_MDS_OP_MKSNAP;
>>>> @@ -980,6 +992,7 @@ static int ceph_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
>>>> dout("mkdir dir %p dn %p mode 0%ho\n", dir, dentry, mode);
>>>> op = CEPH_MDS_OP_MKDIR;
>>>> } else {
>>>> + err = -EROFS;
>>>> goto out;
>>>> }
>>>>
>>>> @@ -1037,6 +1050,10 @@ static int ceph_link(struct dentry *old_dentry, struct inode *dir,
>>>>  	struct ceph_mds_request *req;
>>>>  	int err;
>>>>
>>>> +	err = ceph_wait_on_conflict_unlink(dentry);
>>>> +	if (err)
>>>> +		return err;
>>>> +
>>>>  	if (ceph_snap(dir) != CEPH_NOSNAP)
>>>>  		return -EROFS;
>>>>
>>>> @@ -1071,9 +1088,24 @@ static int ceph_link(struct dentry *old_dentry, struct inode *dir,
>>>>  static void ceph_async_unlink_cb(struct ceph_mds_client *mdsc,
>>>>  				 struct ceph_mds_request *req)
>>>>  {
>>>> +	struct dentry *dentry = req->r_dentry;
>>>> +	struct ceph_dentry_info *di = ceph_dentry(dentry);
>>>>  	int result = req->r_err ? req->r_err :
>>>>  		le32_to_cpu(req->r_reply_info.head->result);
>>>>
>>>> +	if (test_bit(CEPH_DENTRY_ASYNC_UNLINK_BIT, &di->flags)) {
>>>> +		BUG_ON(req->r_op != CEPH_MDS_OP_UNLINK);
>>>> +
>>>> +		hash_del_rcu(&di->hnode);
>>>> +
>>>> +		spin_lock(&dentry->d_lock);
>>>> +		di->flags &= ~CEPH_DENTRY_ASYNC_UNLINK;
>>>> +		wake_up_bit(&di->flags, CEPH_DENTRY_ASYNC_UNLINK_BIT);
>>>> +		spin_unlock(&dentry->d_lock);
>>>> +
>>>> +		synchronize_rcu();
>>>> +	}
>>>> +
>>>>  	if (result == -EJUKEBOX)
>>>>  		goto out;
>>>>
>>>> @@ -1081,7 +1113,7 @@ static void ceph_async_unlink_cb(struct ceph_mds_client *mdsc,
>>>>  	if (result) {
>>>>  		int pathlen = 0;
>>>>  		u64 base = 0;
>>>> -		char *path = ceph_mdsc_build_path(req->r_dentry, &pathlen,
>>>> +		char *path = ceph_mdsc_build_path(dentry, &pathlen,
>>>>  						  &base, 0);
>>>>
>>>>  		/* mark error on parent + clear complete */
>>>> @@ -1089,13 +1121,13 @@ static void ceph_async_unlink_cb(struct ceph_mds_client *mdsc,
>>>>  		ceph_dir_clear_complete(req->r_parent);
>>>>
>>>>  		/* drop the dentry -- we don't know its status */
>>>> -		if (!d_unhashed(req->r_dentry))
>>>> -			d_drop(req->r_dentry);
>>>> +		if (!d_unhashed(dentry))
>>>> +			d_drop(dentry);
>>>>
>>>>  		/* mark inode itself for an error (since metadata is bogus) */
>>>>  		mapping_set_error(req->r_old_inode->i_mapping, result);
>>>>
>>>> -		pr_warn("ceph: async unlink failure path=(%llx)%s result=%d!\n",
>>>> +		pr_warn("async unlink failure path=(%llx)%s result=%d!\n",
>>>>  			base, IS_ERR(path) ? "<<bad>>" : path, result);
>>>>  		ceph_mdsc_free_path(path, pathlen);
>>>>  	}
>>>> @@ -1189,12 +1221,21 @@ static int ceph_unlink(struct inode *dir, struct dentry *dentry)
>>>>  	ihold(req->r_old_inode);
>>>>  	err = ceph_mdsc_submit_request(mdsc, dir, req);
>>>>  	if (!err) {
>>>> +		struct ceph_dentry_info *di;
>>>> +
>>>>  		/*
>>>>  		 * We have enough caps, so we assume that the unlink
>>>>  		 * will succeed. Fix up the target inode and dcache.
>>>>  		 */
>>>>  		drop_nlink(inode);
>>>>  		d_delete(dentry);
>>>> +
>>>> +		spin_lock(&dentry->d_lock);
>>>> +		di = ceph_dentry(dentry);
>>>> +		di->flags |= CEPH_DENTRY_ASYNC_UNLINK;
>>>> +		hash_add_rcu(fsc->async_unlink_conflict, &di->hnode,
>>>> +			     dentry->d_name.hash);
>>>> +		spin_unlock(&dentry->d_lock);
>>> This looks racy. It's possible that the reply comes in before we get to
>>> the point of setting this flag. You probably want to do this before
>>> calling ceph_mdsc_submit_request, and just unwind it if the submission
>>> fails.
>> Ah, right. Will fix it.
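>>
>> Maybe something like this (untested sketch; the
>> "async_unlink_conflict_lock" name is a placeholder for the dedicated
>> per-sb spinlock discussed below):
>>
>> 	/* set the flag and hash the dentry _before_ submitting */
>> 	spin_lock(&dentry->d_lock);
>> 	di->flags |= CEPH_DENTRY_ASYNC_UNLINK;
>> 	spin_unlock(&dentry->d_lock);
>>
>> 	spin_lock(&fsc->async_unlink_conflict_lock);
>> 	hash_add_rcu(fsc->async_unlink_conflict, &di->hnode,
>> 		     dentry->d_name.hash);
>> 	spin_unlock(&fsc->async_unlink_conflict_lock);
>>
>> 	err = ceph_mdsc_submit_request(mdsc, dir, req);
>> 	if (!err) {
>> 		drop_nlink(inode);
>> 		d_delete(dentry);
>> 	} else {
>> 		/* unwind the hash entry and flag on submission failure */
>> 		spin_lock(&fsc->async_unlink_conflict_lock);
>> 		hash_del_rcu(&di->hnode);
>> 		spin_unlock(&fsc->async_unlink_conflict_lock);
>>
>> 		spin_lock(&dentry->d_lock);
>> 		di->flags &= ~CEPH_DENTRY_ASYNC_UNLINK;
>> 		spin_unlock(&dentry->d_lock);
>> 	}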
>>
>>
>>> Also, you do still need some sort of lock to protect the
>>> hash_add/del/_rcu calls.
>> Sure, will fix it too.
>>
>>> Those don't do any locking on their own. The
>>> d_lock is insufficient here since it can't protect the whole list. You
>>> may be able to use the i_ceph_lock of the parent though?
>> The hashtable is shared across the whole superblock, so we can't use
>> the i_ceph_lock here. I will add a dedicated spinlock for each sb.
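>>
>> Roughly like this (untested sketch; the lock name is a placeholder):
>>
>> 	/* fs/ceph/super.h */
>> 	struct ceph_fs_client {
>> 		...
>> 		spinlock_t async_unlink_conflict_lock;
>> 		DECLARE_HASHTABLE(async_unlink_conflict,
>> 				  CEPH_ASYNC_CREATE_CONFLICT_BITS);
>> 		...
>> 	};
>>
>> 	/* fs/ceph/super.c:create_fs_client() */
>> 	spin_lock_init(&fsc->async_unlink_conflict_lock);
>> 	hash_init(fsc->async_unlink_conflict);
>>
>> 	/* and taken around every hash_add_rcu()/hash_del_rcu() call */
>> 	spin_lock(&fsc->async_unlink_conflict_lock);
>> 	hash_add_rcu(fsc->async_unlink_conflict, &di->hnode,
>> 		     dentry->d_name.hash);
>> 	spin_unlock(&fsc->async_unlink_conflict_lock);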
>>
>>>>  	} else if (err == -EJUKEBOX) {
>>>>  		try_async = false;
>>>>  		ceph_mdsc_put_request(req);
>>>> @@ -1237,6 +1278,10 @@ static int ceph_rename(struct user_namespace *mnt_userns, struct inode *old_dir,
>>>>  	    (!ceph_quota_is_same_realm(old_dir, new_dir)))
>>>>  		return -EXDEV;
>>>>
>>>> +	err = ceph_wait_on_conflict_unlink(new_dentry);
>>>> +	if (err)
>>>> +		return err;
>>>> +
>>>>  	dout("rename dir %p dentry %p to dir %p dentry %p\n",
>>>>  	     old_dir, old_dentry, new_dir, new_dentry);
>>>>  	req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
>>>> diff --git a/fs/ceph/file.c b/fs/ceph/file.c
>>>> index 8c8226c0feac..47d068e6436a 100644
>>>> --- a/fs/ceph/file.c
>>>> +++ b/fs/ceph/file.c
>>>> @@ -740,6 +740,10 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
>>>>  	if (dentry->d_name.len > NAME_MAX)
>>>>  		return -ENAMETOOLONG;
>>>>
>>>> +	err = ceph_wait_on_conflict_unlink(dentry);
>>>> +	if (err)
>>>> +		return err;
>>>> +
>>> What might be nice eventually is to not block an async create here,
>>> but instead queue the request so that it gets transmitted after the
>>> async unlink reply comes in.
>>>
>>> That'll be hard to get right though, so this is fine for now.
>> Sure.
>>
>>>>  	if (flags & O_CREAT) {
>>>>  		if (ceph_quota_is_max_files_exceeded(dir))
>>>>  			return -EDQUOT;
>>>> @@ -757,6 +761,7 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
>>>>  		/* If it's not being looked up, it's negative */
>>>>  		return -ENOENT;
>>>>  	}
>>>> +
>>>>  retry:
>>>>  	/* do the open */
>>>>  	req = prepare_open_request(dir->i_sb, flags, mode);
>>>> diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
>>>> index e8c87dea0551..0ae0e0110eb4 100644
>>>> --- a/fs/ceph/mds_client.c
>>>> +++ b/fs/ceph/mds_client.c
>>>> @@ -468,6 +468,75 @@ static int ceph_parse_deleg_inos(void **p, void *end,
>>>>  	return -EIO;
>>>>  }
>>>>
>>>> +/*
>>>> + * In the async unlink case the kclient won't wait for the first
>>>> + * reply from the MDS: it just drops all the links, unhashes the
>>>> + * dentry and then succeeds immediately.
>>>> + *
>>>> + * For any subsequent create/link/rename, etc. requests using the
>>>> + * same file names we must wait for the first reply of the inflight
>>>> + * unlink request, or the MDS may fail these requests with -EEXIST
>>>> + * if the inflight async unlink request was delayed for some reason.
>>>> + *
>>>> + * The worst case is that a non-async openc request will
>>>> + * successfully open the file if the CDentry hasn't been unlinked
>>>> + * yet, but the delayed async unlink request will later remove the
>>>> + * CDentry. That means the just-created file could be deleted later
>>>> + * by accident.
>>>> + *
>>>> + * We need to wait for any inflight async unlink request to finish
>>>> + * when creating new files/directories with the same names.
>>>> + */
>>>> +int ceph_wait_on_conflict_unlink(struct dentry *dentry)
>>>> +{
>>>> +	struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb);
>>>> +	struct dentry *pdentry = dentry->d_parent;
>>>> +	struct dentry *udentry, *found = NULL;
>>>> +	struct ceph_dentry_info *di;
>>>> +	struct qstr dname;
>>>> +	u32 hash = dentry->d_name.hash;
>>>> +	int err;
>>>> +
>>>> +	dname.name = dentry->d_name.name;
>>>> +	dname.len = dentry->d_name.len;
>>>> +
>>>> +	rcu_read_lock();
>>>> +	hash_for_each_possible_rcu(fsc->async_unlink_conflict, di,
>>>> +				   hnode, hash) {
>>>> +		udentry = di->dentry;
>>>> +
>>>> +		spin_lock(&udentry->d_lock);
>>>> +		if (udentry->d_name.hash != hash)
>>>> +			goto next;
>>>> +		if (unlikely(udentry->d_parent != pdentry))
>>>> +			goto next;
>>>> +		if (!hash_hashed(&di->hnode))
>>>> +			goto next;
>>>> +
>>>> +		if (!test_bit(CEPH_DENTRY_ASYNC_UNLINK_BIT, &di->flags))
>>>> +			goto next;
>>>> +
>>> Maybe this should be a warning? Will we ever have entries in this
>>> hashtable that don't have this bit set?
>> Just before we take "spin_lock(&udentry->d_lock)", couldn't the
>> udentry already have been removed from the hashtable with the bit
>> cleared?
>>
>>
>
> The point is that you're removing the dentry from the hash before you
> clear the flag, so there should never be a dentry in the hash that has
> the flag cleared.
>
Yeah, makes sense. Will make it a warning here.
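
Maybe something like this (untested):

	if (!test_bit(CEPH_DENTRY_ASYNC_UNLINK_BIT, &di->flags)) {
		pr_warn("%s dentry %p:%pd async unlink bit is not set\n",
			__func__, udentry, udentry);
		goto next;
	}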
-- Xiubo
>>>> +		if (d_compare(pdentry, udentry, &dname))
>>>> +			goto next;
>>>> +
>>>> +		spin_unlock(&udentry->d_lock);
>>>> +		found = dget(udentry);
>>>> +		break;
>>>> +next:
>>>> +		spin_unlock(&udentry->d_lock);
>>>> +	}
>>>> +	rcu_read_unlock();
>>>> +
>>>> +	if (likely(!found))
>>>> +		return 0;
>>>> +
>>>> +	err = wait_on_bit(&di->flags, CEPH_DENTRY_ASYNC_UNLINK_BIT,
>>>> +			  TASK_INTERRUPTIBLE);
>>>> +	dput(found);
>>>> +	return err;
>>>> +}
>>>> +
>>>>  u64 ceph_get_deleg_ino(struct ceph_mds_session *s)
>>>>  {
>>>>  	unsigned long ino;
>>>> diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
>>>> index 33497846e47e..d1ae679c52c3 100644
>>>> --- a/fs/ceph/mds_client.h
>>>> +++ b/fs/ceph/mds_client.h
>>>> @@ -582,6 +582,7 @@ static inline int ceph_wait_on_async_create(struct inode *inode)
>>>>  			   TASK_INTERRUPTIBLE);
>>>>  }
>>>>
>>>> +extern int ceph_wait_on_conflict_unlink(struct dentry *dentry);
>>>>  extern u64 ceph_get_deleg_ino(struct ceph_mds_session *session);
>>>>  extern int ceph_restore_deleg_ino(struct ceph_mds_session *session, u64 ino);
>>>>  #endif
>>>> diff --git a/fs/ceph/super.c b/fs/ceph/super.c
>>>> index b73b4f75462c..7ae65001f04c 100644
>>>> --- a/fs/ceph/super.c
>>>> +++ b/fs/ceph/super.c
>>>> @@ -816,6 +816,8 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
>>>>  	if (!fsc->cap_wq)
>>>>  		goto fail_inode_wq;
>>>>
>>>> +	hash_init(fsc->async_unlink_conflict);
>>>> +
>>>>  	spin_lock(&ceph_fsc_lock);
>>>>  	list_add_tail(&fsc->metric_wakeup, &ceph_fsc_list);
>>>>  	spin_unlock(&ceph_fsc_lock);
>>>> diff --git a/fs/ceph/super.h b/fs/ceph/super.h
>>>> index 506d52633627..58bbb5df42da 100644
>>>> --- a/fs/ceph/super.h
>>>> +++ b/fs/ceph/super.h
>>>> @@ -19,6 +19,7 @@
>>>>  #include <linux/security.h>
>>>>  #include <linux/netfs.h>
>>>>  #include <linux/fscache.h>
>>>> +#include <linux/hashtable.h>
>>>>
>>>>  #include <linux/ceph/libceph.h>
>>>>
>>>> @@ -99,6 +100,8 @@ struct ceph_mount_options {
>>>>  	char *mon_addr;
>>>>  };
>>>>
>>>> +#define CEPH_ASYNC_CREATE_CONFLICT_BITS 12
>>>> +
>>>>  struct ceph_fs_client {
>>>>  	struct super_block *sb;
>>>>
>>>> @@ -124,6 +127,8 @@ struct ceph_fs_client {
>>>>  	struct workqueue_struct *inode_wq;
>>>>  	struct workqueue_struct *cap_wq;
>>>>
>>>> +	DECLARE_HASHTABLE(async_unlink_conflict, CEPH_ASYNC_CREATE_CONFLICT_BITS);
>>>> +
>>>>  #ifdef CONFIG_DEBUG_FS
>>>>  	struct dentry *debugfs_dentry_lru, *debugfs_caps;
>>>>  	struct dentry *debugfs_congestion_kb;
>>>> @@ -281,7 +286,8 @@ struct ceph_dentry_info {
>>>>  	struct dentry *dentry;
>>>>  	struct ceph_mds_session *lease_session;
>>>>  	struct list_head lease_list;
>>>> -	unsigned flags;
>>>> +	struct hlist_node hnode;
>>>> +	unsigned long flags;
>>>>  	int lease_shared_gen;
>>>>  	u32 lease_gen;
>>>>  	u32 lease_seq;
>>>> @@ -290,10 +296,12 @@ struct ceph_dentry_info {
>>>>  	u64 offset;
>>>>  };
>>>>
>>>> -#define CEPH_DENTRY_REFERENCED		1
>>>> -#define CEPH_DENTRY_LEASE_LIST		2
>>>> -#define CEPH_DENTRY_SHRINK_LIST		4
>>>> -#define CEPH_DENTRY_PRIMARY_LINK	8
>>>> +#define CEPH_DENTRY_REFERENCED		(1 << 0)
>>>> +#define CEPH_DENTRY_LEASE_LIST		(1 << 1)
>>>> +#define CEPH_DENTRY_SHRINK_LIST		(1 << 2)
>>>> +#define CEPH_DENTRY_PRIMARY_LINK	(1 << 3)
>>>> +#define CEPH_DENTRY_ASYNC_UNLINK_BIT	(4)
>>>> +#define CEPH_DENTRY_ASYNC_UNLINK	(1 << CEPH_DENTRY_ASYNC_UNLINK_BIT)
>>>>
>>>>  struct ceph_inode_xattrs_info {
>>>>  	/*