Message-ID: <708e3416-8a5a-6dfc-5d06-f82dd48c9913@redhat.com>
Date: Thu, 19 May 2022 08:22:50 +0800
From: Xiubo Li <xiubli@...hat.com>
To: Jeff Layton <jlayton@...nel.org>, idryomov@...il.com,
viro@...iv.linux.org.uk
Cc: willy@...radead.org, vshankar@...hat.com,
ceph-devel@...r.kernel.org, arnd@...db.de, mcgrof@...nel.org,
akpm@...ux-foundation.org, linux-fsdevel@...r.kernel.org,
linux-kernel@...r.kernel.org, kernel test robot <lkp@...el.com>
Subject: Re: [PATCH v4 2/2] ceph: wait the first reply of inflight async
unlink
On 5/19/22 4:28 AM, Jeff Layton wrote:
> On Wed, 2022-05-18 at 22:45 +0800, Xiubo Li wrote:
>> In the async unlink case the kclient won't wait for the first reply
>> from the MDS; it just drops all the links, unhashes the dentry, and
>> returns success immediately.
>>
>> For any new create/link/rename, etc. request that follows using the
>> same file name, we must wait for the first reply of the inflight
>> unlink request, or the MDS may fail those subsequent requests with
>> -EEXIST if the inflight async unlink request was delayed for some
>> reason.
>>
>> And the worst case is the non-async openc request: it will
>> successfully open the file if the CDentry hasn't been unlinked yet,
>> but the previously delayed async unlink request will remove the
>> CDentry later. That means the just-created file could be deleted
>> later by accident.
>>
>> We need to wait for the inflight async unlink requests to finish
>> when creating new files/directories with the same file names.
>>
>> URL: https://tracker.ceph.com/issues/55332
>> Reported-by: kernel test robot <lkp@...el.com>
>> Signed-off-by: Xiubo Li <xiubli@...hat.com>
>> ---
>> fs/ceph/dir.c | 70 +++++++++++++++++++++++++++++++++++++++---
>> fs/ceph/file.c | 4 +++
>> fs/ceph/mds_client.c | 73 ++++++++++++++++++++++++++++++++++++++++++++
>> fs/ceph/mds_client.h | 1 +
>> fs/ceph/super.c | 3 ++
>> fs/ceph/super.h | 19 +++++++++---
>> 6 files changed, 160 insertions(+), 10 deletions(-)
>>
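For context before the hunks: the new ceph_wait_on_conflict_unlink() helper
that dir.c starts calling below lives in mds_client.c (per the diffstat) and,
judging from the fields this patch adds, presumably scans
fsc->async_unlink_conflict for a dentry with the same parent and name and
sleeps until its async-unlink bit clears. A minimal sketch of that flow,
assuming a di->dentry back-pointer, an unsigned long di->flags usable with
wait_on_bit(), and visibility of the VFS d_same_name() helper (none of which
are quoted from this patch):

/*
 * Sketch only: before a new create/link/rename on this name proceeds,
 * look for a dentry with a pending async unlink in the per-fs hash
 * table and wait for its async-unlink bit to clear.  Locking of the
 * candidate dentry is simplified here.
 */
int ceph_wait_on_conflict_unlink(struct dentry *dentry)
{
	struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb);
	struct ceph_dentry_info *di;
	struct dentry *found = NULL;
	u32 hash = dentry->d_name.hash;
	int err;

	rcu_read_lock();
	hash_for_each_possible_rcu(fsc->async_unlink_conflict, di, hnode, hash) {
		struct dentry *udentry = di->dentry;	/* assumed back-pointer */

		/* only a dentry with the same parent and name conflicts */
		if (udentry->d_parent != dentry->d_parent ||
		    !d_same_name(udentry, dentry->d_parent, &dentry->d_name))
			continue;

		found = dget(udentry);
		break;
	}
	rcu_read_unlock();

	if (!found)
		return 0;

	/* ceph_async_unlink_cb() clears the bit and calls wake_up_bit() */
	err = wait_on_bit(&ceph_dentry(found)->flags,
			  CEPH_DENTRY_ASYNC_UNLINK_BIT, TASK_KILLABLE);
	dput(found);
	return err;
}

The sketch returns 0 when there is no conflicting inflight unlink, so the
callers added below can simply bail out on any non-zero value.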
>> diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
>> index eae417d71136..01e7facef9b2 100644
>> --- a/fs/ceph/dir.c
>> +++ b/fs/ceph/dir.c
>> @@ -856,6 +856,10 @@ static int ceph_mknod(struct user_namespace *mnt_userns, struct inode *dir,
>> if (ceph_snap(dir) != CEPH_NOSNAP)
>> return -EROFS;
>>
>> + err = ceph_wait_on_conflict_unlink(dentry);
>> + if (err)
>> + return err;
>> +
>> if (ceph_quota_is_max_files_exceeded(dir)) {
>> err = -EDQUOT;
>> goto out;
>> @@ -918,6 +922,10 @@ static int ceph_symlink(struct user_namespace *mnt_userns, struct inode *dir,
>> if (ceph_snap(dir) != CEPH_NOSNAP)
>> return -EROFS;
>>
>> + err = ceph_wait_on_conflict_unlink(dentry);
>> + if (err)
>> + return err;
>> +
>> if (ceph_quota_is_max_files_exceeded(dir)) {
>> err = -EDQUOT;
>> goto out;
>> @@ -968,9 +976,13 @@ static int ceph_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
>> struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dir->i_sb);
>> struct ceph_mds_request *req;
>> struct ceph_acl_sec_ctx as_ctx = {};
>> - int err = -EROFS;
>> + int err;
>> int op;
>>
>> + err = ceph_wait_on_conflict_unlink(dentry);
>> + if (err)
>> + return err;
>> +
>> if (ceph_snap(dir) == CEPH_SNAPDIR) {
>> /* mkdir .snap/foo is a MKSNAP */
>> op = CEPH_MDS_OP_MKSNAP;
>> @@ -980,6 +992,7 @@ static int ceph_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
>> dout("mkdir dir %p dn %p mode 0%ho\n", dir, dentry, mode);
>> op = CEPH_MDS_OP_MKDIR;
>> } else {
>> + err = -EROFS;
>> goto out;
>> }
>>
>> @@ -1037,6 +1050,10 @@ static int ceph_link(struct dentry *old_dentry, struct inode *dir,
>> struct ceph_mds_request *req;
>> int err;
>>
>> + err = ceph_wait_on_conflict_unlink(dentry);
>> + if (err)
>> + return err;
>> +
>> if (ceph_snap(dir) != CEPH_NOSNAP)
>> return -EROFS;
>>
>> @@ -1071,9 +1088,27 @@ static int ceph_link(struct dentry *old_dentry, struct inode *dir,
>> static void ceph_async_unlink_cb(struct ceph_mds_client *mdsc,
>> struct ceph_mds_request *req)
>> {
>> + struct dentry *dentry = req->r_dentry;
>> + struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb);
>> + struct ceph_dentry_info *di = ceph_dentry(dentry);
>> int result = req->r_err ? req->r_err :
>> le32_to_cpu(req->r_reply_info.head->result);
>>
>> + if (!test_bit(CEPH_DENTRY_ASYNC_UNLINK_BIT, &di->flags))
>> + pr_warn("%s dentry %p:%pd async unlink bit is not set\n",
>> + __func__, dentry, dentry);
>> +
> This is a pr_warn so it won't have a "ceph: " prefix or anything
> prepending it like dout messages do. You should probably prepend this
> with something like that.
pr_fmt already helps add the module name:
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
For example, an existing log line from ceph looks like:
2022-05-02T17:28:51.006593+00:00 smithi040 kernel: ceph: ceph: async
create failure path=(1)client.0/tmp/ceph/src/os/ObjectStore.h result=-17!
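To make the prefix point concrete: pr_fmt already stamps every pr_*() call in
the module with "ceph: ", so a literal "ceph: " left in the format string
shows up twice, as in the log line above. A small illustration (the call site
is just an example, not a hunk from this patch):

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt   /* KBUILD_MODNAME is "ceph" */

pr_warn("ceph: async unlink failure result=%d!\n", result);
	/* -> "ceph: ceph: async unlink failure result=-17!" (doubled) */

pr_warn("async unlink failure result=%d!\n", result);
	/* -> "ceph: async unlink failure result=-17!" */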
>
>> + spin_lock(&fsc->async_unlink_conflict_lock);
>> + hash_del_rcu(&di->hnode);
>> + spin_unlock(&fsc->async_unlink_conflict_lock);
>> +
>> + spin_lock(&dentry->d_lock);
>> + di->flags &= ~CEPH_DENTRY_ASYNC_UNLINK;
>> + wake_up_bit(&di->flags, CEPH_DENTRY_ASYNC_UNLINK_BIT);
>> + spin_unlock(&dentry->d_lock);
>> +
>> + synchronize_rcu();
>> +
>> if (result == -EJUKEBOX)
>> goto out;
>>
>> @@ -1081,7 +1116,7 @@ static void ceph_async_unlink_cb(struct ceph_mds_client *mdsc,
>> if (result) {
>> int pathlen = 0;
>> u64 base = 0;
>> - char *path = ceph_mdsc_build_path(req->r_dentry, &pathlen,
>> + char *path = ceph_mdsc_build_path(dentry, &pathlen,
>> &base, 0);
>>
>> /* mark error on parent + clear complete */
>> @@ -1089,13 +1124,13 @@ static void ceph_async_unlink_cb(struct ceph_mds_client *mdsc,
>> ceph_dir_clear_complete(req->r_parent);
>>
>> /* drop the dentry -- we don't know its status */
>> - if (!d_unhashed(req->r_dentry))
>> - d_drop(req->r_dentry);
>> + if (!d_unhashed(dentry))
>> + d_drop(dentry);
>>
>> /* mark inode itself for an error (since metadata is bogus) */
>> mapping_set_error(req->r_old_inode->i_mapping, result);
>>
>> - pr_warn("ceph: async unlink failure path=(%llx)%s result=%d!\n",
>> + pr_warn("async unlink failure path=(%llx)%s result=%d!\n",
> Ditto here: why remove the prefix?
With the literal prefix kept, the log will be:
2022-05-02T17:28:51.006593+00:00 smithi040 kernel: ceph: ceph: async
create failure path=(1)client.0/tmp/ceph/src/os/ObjectStore.h result=-17!
So is the extra "ceph:" prefix really needed?
>> base, IS_ERR(path) ? "<<bad>>" : path, result);
>> ceph_mdsc_free_path(path, pathlen);
>> }
>> @@ -1180,6 +1215,8 @@ static int ceph_unlink(struct inode *dir, struct dentry *dentry)
>>
>> if (try_async && op == CEPH_MDS_OP_UNLINK &&
>> (req->r_dir_caps = get_caps_for_async_unlink(dir, dentry))) {
>> + struct ceph_dentry_info *di = ceph_dentry(dentry);
>> +
>> dout("async unlink on %llu/%.*s caps=%s", ceph_ino(dir),
>> dentry->d_name.len, dentry->d_name.name,
>> ceph_cap_string(req->r_dir_caps));
>> @@ -1187,6 +1224,16 @@ static int ceph_unlink(struct inode *dir, struct dentry *dentry)
>> req->r_callback = ceph_async_unlink_cb;
>> req->r_old_inode = d_inode(dentry);
>> ihold(req->r_old_inode);
>> +
>> + spin_lock(&dentry->d_lock);
>> + di->flags |= CEPH_DENTRY_ASYNC_UNLINK;
>> + spin_unlock(&dentry->d_lock);
>> +
>> + spin_lock(&fsc->async_unlink_conflict_lock);
>> + hash_add_rcu(fsc->async_unlink_conflict, &di->hnode,
>> + dentry->d_name.hash);
>> + spin_unlock(&fsc->async_unlink_conflict_lock);
>> +
>> err = ceph_mdsc_submit_request(mdsc, dir, req);
>> if (!err) {
>> /*
>> @@ -1198,6 +1245,15 @@ static int ceph_unlink(struct inode *dir, struct dentry *dentry)
>> } else if (err == -EJUKEBOX) {
>> try_async = false;
>> ceph_mdsc_put_request(req);
>> +
>> + spin_lock(&dentry->d_lock);
>> + di->flags &= ~CEPH_DENTRY_ASYNC_UNLINK;
>> + spin_unlock(&dentry->d_lock);
>> +
>> + spin_lock(&fsc->async_unlink_conflict_lock);
>> + hash_del_rcu(&di->hnode);
>> + spin_unlock(&fsc->async_unlink_conflict_lock);
>> +
> You're clearing the flag here before removing it from the hash. That
> might end up causing a waiter to trip the pr_warn in
> ceph_wait_on_conflict_unlink. You probably want to reverse the order
> here.
Sure. Will fix it.
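The reordered -EJUKEBOX fallback would presumably look like the following
(the same statements as in the hunk above, just swapped, so a waiter can
never find a hashed entry whose async-unlink bit is already clear):

	/* drop the hash entry first ... */
	spin_lock(&fsc->async_unlink_conflict_lock);
	hash_del_rcu(&di->hnode);
	spin_unlock(&fsc->async_unlink_conflict_lock);

	/* ... then clear the async-unlink flag */
	spin_lock(&dentry->d_lock);
	di->flags &= ~CEPH_DENTRY_ASYNC_UNLINK;
	spin_unlock(&dentry->d_lock);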
-- Xiubo