lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <f9fd2da5d9e45d42fb02df598013d812f850d3f4.camel@ibm.com>
Date: Mon, 4 Aug 2025 21:44:11 +0000
From: Viacheslav Dubeyko <Slava.Dubeyko@....com>
To: Alex Markuze <amarkuze@...hat.com>,
        "ceph-devel@...r.kernel.org"
	<ceph-devel@...r.kernel.org>
CC: "idryomov@...il.com" <idryomov@...il.com>,
        "linux-kernel@...r.kernel.org"
	<linux-kernel@...r.kernel.org>
Subject: Re:  [PATCH v2 1/2] ceph: fix client race condition validating
 r_parent before applying state

On Mon, 2025-08-04 at 09:59 +0000, Alex Markuze wrote:
> Add validation to ensure the cached parent directory inode matches the
> directory info in MDS replies. This prevents client-side race conditions
> where concurrent operations (e.g. rename) cause r_parent to become stale
> between request initiation and reply processing, which could lead to
> applying state changes to incorrect directory inodes.
> ---
>  fs/ceph/debugfs.c    |   6 +-
>  fs/ceph/mds_client.c | 152 +++++++++++++++++++++++++++----------------
>  fs/ceph/mds_client.h |  12 +++-
>  3 files changed, 110 insertions(+), 60 deletions(-)
> 
> diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
> index 2ffb29108176..35e621f41039 100644
> --- a/fs/ceph/debugfs.c
> +++ b/fs/ceph/debugfs.c
> @@ -88,8 +88,8 @@ static int mdsc_show(struct seq_file *s, void *p)
>  		if (req->r_inode) {
>  			seq_printf(s, " #%llx", ceph_ino(req->r_inode));
>  		} else if (req->r_dentry) {
> -			path = ceph_mdsc_build_path(mdsc, req->r_dentry, &pathlen,
> -						    &pathbase, 0);
> +			struct ceph_path_info path_info;
> +			path = ceph_mdsc_build_path(mdsc, req->r_dentry, &path_info, 0);
>  			if (IS_ERR(path))
>  				path = NULL;
>  			spin_lock(&req->r_dentry->d_lock);
> @@ -98,7 +98,7 @@ static int mdsc_show(struct seq_file *s, void *p)
>  				   req->r_dentry,
>  				   path ? path : "");
>  			spin_unlock(&req->r_dentry->d_lock);
> -			ceph_mdsc_free_path(path, pathlen);
> +			ceph_mdsc_free_path(path, path_info.pathlen);

Just guessing.. Maybe, we need to switch ceph_mdsc_free_path() on taking struct
ceph_path_info? Especially, because we have the freepath field. Am I right that
this field defines should we free or not the path?

>  		} else if (req->r_path1) {
>  			seq_printf(s, " #%llx/%s", req->r_ino1.ino,
>  				   req->r_path1);
> diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
> index 8d9fc5e18b17..d2ae862c3dda 100644
> --- a/fs/ceph/mds_client.c
> +++ b/fs/ceph/mds_client.c
> @@ -2732,7 +2732,7 @@ static u8 *get_fscrypt_altname(const struct ceph_mds_request *req, u32 *plen)
>   *   foo/.snap/bar -> foo//bar
>   */
>  char *ceph_mdsc_build_path(struct ceph_mds_client *mdsc, struct dentry *dentry,
> -			   int *plen, u64 *pbase, int for_wire)
> +			   struct ceph_path_info *path_info, int for_wire)

I assume that comments for ceph_mdsc_build_path() have not been changed?

/**
 * ceph_mdsc_build_path - build a path string to a given dentry
 * @mdsc: mds client
 * @dentry: dentry to which path should be built
 * @plen: returned length of string
 * @pbase: returned base inode number
 * @for_wire: is this path going to be sent to the MDS?

But plen and pbase have been encapsulated into struct ceph_path_info and comment
should be corrected too.

By the way, should be for_wire encapsulated too? It sounds like relevant for
this structure. However, I could be not fully right here.

>  {
>  	struct ceph_client *cl = mdsc->fsc->client;
>  	struct dentry *cur;
> @@ -2843,17 +2843,31 @@ char *ceph_mdsc_build_path(struct ceph_mds_client *mdsc, struct dentry *dentry,
>  		return ERR_PTR(-ENAMETOOLONG);
>  	}
>  
> -	*pbase = base;
> -	*plen = PATH_MAX - 1 - pos;
> -	CEPH_SAN_STRNCPY(result_str, sizeof(result_str), path + pos, *plen);
> +	/* Initialize the output structure */
> +	memset(path_info, 0, sizeof(*path_info));
> +
> +	path_info->vino.ino = base;
> +	path_info->pathlen = PATH_MAX - 1 - pos;

Could be pos bigger than PATH_MAX? Technically speaking, do we have protection
from pos to be bigger than PATH_MAX?

> +	path_info->path = path + pos;
> +	path_info->freepath = true;
> +
> +	/* Set snap from dentry if available */
> +	if (d_inode(dentry))
> +		path_info->vino.snap = ceph_snap(d_inode(dentry));
> +	else
> +		path_info->vino.snap = CEPH_NOSNAP;
> +
> +	CEPH_SAN_STRNCPY(result_str, sizeof(result_str), path_info->path, path_info->pathlen);

I don't see such primitive in upstream. I am not sure that this patch can be
applied on upstream kernel and it can be compiled.

>  	boutc(cl, "on %p %d built %llx '%s'\n", dentry, d_count(dentry),
> -	      base, result_str);
> +	      path_info->vino.ino, result_str);
>  	return path + pos;
>  }
>  
> +
> +
>  static int build_dentry_path(struct ceph_mds_client *mdsc, struct dentry *dentry,
> -			     struct inode *dir, const char **ppath, int *ppathlen,
> -			     u64 *pino, bool *pfreepath, bool parent_locked)
> +			     struct inode *dir, struct ceph_path_info *path_info,
> +			     bool parent_locked)
>  {
>  	char *path;
>  
> @@ -2862,41 +2876,47 @@ static int build_dentry_path(struct ceph_mds_client *mdsc, struct dentry *dentry
>  		dir = d_inode_rcu(dentry->d_parent);
>  	if (dir && parent_locked && ceph_snap(dir) == CEPH_NOSNAP &&
>  	    !IS_ENCRYPTED(dir)) {
> -		*pino = ceph_ino(dir);
> +		path_info->vino.ino = ceph_ino(dir);
> +		path_info->vino.snap = ceph_snap(dir);
>  		rcu_read_unlock();
> -		*ppath = dentry->d_name.name;
> -		*ppathlen = dentry->d_name.len;
> +		path_info->path = dentry->d_name.name;
> +		path_info->pathlen = dentry->d_name.len;
> +		path_info->freepath = false;

This is my worry that we can try to free memory by ceph_mdsc_free_path() even if
we should not do so.

>  		return 0;
>  	}
>  	rcu_read_unlock();
> -	path = ceph_mdsc_build_path(mdsc, dentry, ppathlen, pino, 1);
> +	path = ceph_mdsc_build_path(mdsc, dentry, path_info, 1);
>  	if (IS_ERR(path))
>  		return PTR_ERR(path);
> -	*ppath = path;
> -	*pfreepath = true;
> +	/*
> +	 * ceph_mdsc_build_path already fills path_info, including snap handling.
> +	 */
>  	return 0;
>  }
>  
> -static int build_inode_path(struct inode *inode,
> -			    const char **ppath, int *ppathlen, u64 *pino,
> -			    bool *pfreepath)
> +static int build_inode_path(struct inode *inode, struct ceph_path_info *path_info)
>  {
>  	struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
>  	struct dentry *dentry;
>  	char *path;
>  
>  	if (ceph_snap(inode) == CEPH_NOSNAP) {
> -		*pino = ceph_ino(inode);
> -		*ppathlen = 0;
> +		path_info->vino.ino = ceph_ino(inode);
> +		path_info->vino.snap = ceph_snap(inode);
> +		path_info->pathlen = 0;
> +		path_info->freepath = false;
>  		return 0;
>  	}
>  	dentry = d_find_alias(inode);
> -	path = ceph_mdsc_build_path(mdsc, dentry, ppathlen, pino, 1);
> +	path = ceph_mdsc_build_path(mdsc, dentry, path_info, 1);
>  	dput(dentry);
>  	if (IS_ERR(path))
>  		return PTR_ERR(path);
> -	*ppath = path;
> -	*pfreepath = true;
> +	/*
> +	 * ceph_mdsc_build_path already fills path_info, including snap from dentry.
> +	 * Override with inode's snap since that's what this function is for.
> +	 */
> +	path_info->vino.snap = ceph_snap(inode);
>  	return 0;
>  }
>  
> @@ -2906,28 +2926,31 @@ static int build_inode_path(struct inode *inode,
>   */
>  static int set_request_path_attr(struct ceph_mds_client *mdsc, struct inode *rinode,
>  				 struct dentry *rdentry, struct inode *rdiri,
> -				 const char *rpath, u64 rino, const char **ppath,
> -				 int *pathlen, u64 *ino, bool *freepath,
> +				 const char *rpath, u64 rino, struct ceph_path_info *path_info,
>  				 bool parent_locked)
>  {
>  	struct ceph_client *cl = mdsc->fsc->client;
>  	char result_str[128];
>  	int r = 0;
>  
> +	/* Initialize the output structure */
> +	memset(path_info, 0, sizeof(*path_info));
> +
>  	if (rinode) {
> -		r = build_inode_path(rinode, ppath, pathlen, ino, freepath);
> +		r = build_inode_path(rinode, path_info);
>  		boutc(cl, " inode %p %llx.%llx\n", rinode, ceph_ino(rinode),
> -		      ceph_snap(rinode));
> +		ceph_snap(rinode));
>  	} else if (rdentry) {
> -		r = build_dentry_path(mdsc, rdentry, rdiri, ppath, pathlen, ino,
> -					freepath, parent_locked);
> -		CEPH_SAN_STRNCPY(result_str, sizeof(result_str), *ppath, *pathlen);
> -		boutc(cl, " dentry %p %llx/%s\n", rdentry, *ino, result_str);
> +		r = build_dentry_path(mdsc, rdentry, rdiri, path_info, parent_locked);
> +		CEPH_SAN_STRNCPY(result_str, sizeof(result_str), path_info->path, path_info->pathlen);
> +		boutc(cl, " dentry %p %llx/%s\n", rdentry, path_info->vino.ino, result_str);

Ditto. We don't have CEPH_SAN_STRNCPY() and boutc() in upstream kernel.

Thanks,
Slava.

>  	} else if (rpath || rino) {
> -		*ino = rino;
> -		*ppath = rpath;
> -		*pathlen = rpath ? strlen(rpath) : 0;
> -		CEPH_SAN_STRNCPY(result_str, sizeof(result_str), rpath, *pathlen);
> +		path_info->vino.ino = rino;
> +		path_info->vino.snap = CEPH_NOSNAP;
> +		path_info->path = rpath;
> +		path_info->pathlen = rpath ? strlen(rpath) : 0;
> +		path_info->freepath = false;
> +		CEPH_SAN_STRNCPY(result_str, sizeof(result_str), rpath, path_info->pathlen);
>  		boutc(cl," path %s\n", result_str);
>  	}
>  
> @@ -3005,11 +3028,8 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
>  	struct ceph_client *cl = mdsc->fsc->client;
>  	struct ceph_msg *msg;
>  	struct ceph_mds_request_head_legacy *lhead;
> -	const char *path1 = NULL;
> -	const char *path2 = NULL;
> -	u64 ino1 = 0, ino2 = 0;
> -	int pathlen1 = 0, pathlen2 = 0;
> -	bool freepath1 = false, freepath2 = false;
> +	struct ceph_path_info path_info1 = {0};
> +	struct ceph_path_info path_info2 = {0};
>  	struct dentry *old_dentry = NULL;
>  	int len;
>  	u16 releases;
> @@ -3019,25 +3039,42 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
>  	u16 request_head_version = mds_supported_head_version(session);
>  	kuid_t caller_fsuid = req->r_cred->fsuid;
>  	kgid_t caller_fsgid = req->r_cred->fsgid;
> +	bool parent_locked = test_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags);
>  
> -	ret = set_request_path_attr(mdsc, req->r_inode, req->r_dentry,
> -			      req->r_parent, req->r_path1, req->r_ino1.ino,
> -			      &path1, &pathlen1, &ino1, &freepath1,
> -			      test_bit(CEPH_MDS_R_PARENT_LOCKED,
> -					&req->r_req_flags));
> +		ret = set_request_path_attr(mdsc, req->r_inode, req->r_dentry,
> +		      req->r_parent, req->r_path1, req->r_ino1.ino,
> +		      &path_info1, parent_locked);
>  	if (ret < 0) {
>  		msg = ERR_PTR(ret);
>  		goto out;
>  	}
>  
> +	/*
> +	 * When the parent directory's i_rwsem is *not* locked, req->r_parent may
> +	 * have become stale (e.g. after a concurrent rename) between the time the
> +	 * dentry was looked up and now.  If we detect that the stored r_parent
> +	 * does not match the inode number we just encoded for the request, switch
> +	 * to the correct inode so that the MDS receives a valid parent reference.
> +	 */
> +	if (!parent_locked &&
> +	    	req->r_parent && path_info1.vino.ino && ceph_ino(req->r_parent) != path_info1.vino.ino) {
> +		struct inode *correct_dir = ceph_get_inode(mdsc->fsc->sb, path_info1.vino, NULL);
> +		if (!IS_ERR(correct_dir)) {
> +			WARN(1, "ceph: r_parent mismatch (had %llx wanted %llx) - updating\n",
> +			     ceph_ino(req->r_parent), path_info1.vino.ino);
> +			iput(req->r_parent);
> +			req->r_parent = correct_dir;
> +		}
> +	}
> +
>  	/* If r_old_dentry is set, then assume that its parent is locked */
>  	if (req->r_old_dentry &&
>  	    !(req->r_old_dentry->d_flags & DCACHE_DISCONNECTED))
>  		old_dentry = req->r_old_dentry;
>  	ret = set_request_path_attr(mdsc, NULL, old_dentry,
> -			      req->r_old_dentry_dir,
> -			      req->r_path2, req->r_ino2.ino,
> -			      &path2, &pathlen2, &ino2, &freepath2, true);
> +		      req->r_old_dentry_dir,
> +		      req->r_path2, req->r_ino2.ino,
> +		      &path_info2, true);
>  	if (ret < 0) {
>  		msg = ERR_PTR(ret);
>  		goto out_free1;
> @@ -3068,7 +3105,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
>  
>  	/* filepaths */
>  	len += 2 * (1 + sizeof(u32) + sizeof(u64));
> -	len += pathlen1 + pathlen2;
> +	len += path_info1.pathlen + path_info2.pathlen;
>  
>  	/* cap releases */
>  	len += sizeof(struct ceph_mds_request_release) *
> @@ -3076,9 +3113,9 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
>  		 !!req->r_old_inode_drop + !!req->r_old_dentry_drop);
>  
>  	if (req->r_dentry_drop)
> -		len += pathlen1;
> +		len += path_info1.pathlen;
>  	if (req->r_old_dentry_drop)
> -		len += pathlen2;
> +		len += path_info2.pathlen;
>  
>  	/* MClientRequest tail */
>  
> @@ -3191,8 +3228,8 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
>  	lhead->ino = cpu_to_le64(req->r_deleg_ino);
>  	lhead->args = req->r_args;
>  
> -	ceph_encode_filepath(&p, end, ino1, path1);
> -	ceph_encode_filepath(&p, end, ino2, path2);
> +	ceph_encode_filepath(&p, end, path_info1.vino.ino, path_info1.path);
> +	ceph_encode_filepath(&p, end, path_info2.vino.ino, path_info2.path);
>  
>  	/* make note of release offset, in case we need to replay */
>  	req->r_request_release_offset = p - msg->front.iov_base;
> @@ -3255,11 +3292,11 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
>  	msg->hdr.data_off = cpu_to_le16(0);
>  
>  out_free2:
> -	if (freepath2)
> -		ceph_mdsc_free_path((char *)path2, pathlen2);
> +	if (path_info2.freepath)
> +		ceph_mdsc_free_path((char *)path_info2.path, path_info2.pathlen);
>  out_free1:
> -	if (freepath1)
> -		ceph_mdsc_free_path((char *)path1, pathlen1);
> +	if (path_info1.freepath)
> +		ceph_mdsc_free_path((char *)path_info1.path, path_info1.pathlen);
>  out:
>  	return msg;
>  out_err:
> @@ -4623,14 +4660,17 @@ static int reconnect_caps_cb(struct inode *inode, int mds, void *arg)
>  
>  	dentry = d_find_primary(inode);
>  	if (dentry) {
> +		struct ceph_path_info path_info;
>  		/* set pathbase to parent dir when msg_version >= 2 */
> -		path = ceph_mdsc_build_path(mdsc, dentry, &pathlen, &pathbase,
> +		path = ceph_mdsc_build_path(mdsc, dentry, &path_info,
>  					    recon_state->msg_version >= 2);
>  		dput(dentry);
>  		if (IS_ERR(path)) {
>  			err = PTR_ERR(path);
>  			goto out_err;
>  		}
> +		pathlen = path_info.pathlen;
> +		pathbase = path_info.vino.ino;
>  	} else {
>  		path = NULL;
>  		pathbase = 0;
> diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
> index 3e2a6fa7c19a..c4c1ea8d5f5e 100644
> --- a/fs/ceph/mds_client.h
> +++ b/fs/ceph/mds_client.h
> @@ -623,8 +623,18 @@ static inline void ceph_mdsc_free_path(char *path, int len)
>  		__putname(path - (PATH_MAX - 1 - len));
>  }
>  
> +/*
> + * Structure to group path-related output parameters for build_*_path functions
> + */
> +struct ceph_path_info {
> +	const char *path;
> +	int pathlen;
> +	struct ceph_vino vino;
> +	bool freepath;
> +};
> +
>  extern char *ceph_mdsc_build_path(struct ceph_mds_client *mdsc,
> -				  struct dentry *dentry, int *plen, u64 *base,
> +				  struct dentry *dentry, struct ceph_path_info *path_info,
>  				  int for_wire);
>  
>  extern void __ceph_mdsc_drop_dentry_lease(struct dentry *dentry);

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ