[<prev] [next>] [<thread-prev] [day] [month] [year] [list]
Message-ID: <a62918950646701cb9bb2ab0a32c87b53e2f102e.camel@dubeyko.com>
Date: Fri, 14 Mar 2025 15:27:55 -0700
From: slava@...eyko.com
To: David Howells <dhowells@...hat.com>, Alex Markuze <amarkuze@...hat.com>
Cc: Ilya Dryomov <idryomov@...il.com>, Jeff Layton <jlayton@...nel.org>,
Dongsheng Yang <dongsheng.yang@...ystack.cn>, ceph-devel@...r.kernel.org,
linux-fsdevel@...r.kernel.org, linux-block@...r.kernel.org,
linux-kernel@...r.kernel.org, Slava.Dubeyko@....com
Subject: Re: [RFC PATCH 04/35] ceph: Convert ceph_mds_request::r_pagelist
to a databuf
On Thu, 2025-03-13 at 23:32 +0000, David Howells wrote:
> Convert ceph_mds_request::r_pagelist to a databuf, along with the
> stuff
> that uses it such as setxattr ops.
>
> Signed-off-by: David Howells <dhowells@...hat.com>
> cc: Viacheslav Dubeyko <slava@...eyko.com>
> cc: Alex Markuze <amarkuze@...hat.com>
> cc: Ilya Dryomov <idryomov@...il.com>
> cc: ceph-devel@...r.kernel.org
> cc: linux-fsdevel@...r.kernel.org
> ---
> fs/ceph/acl.c | 39 ++++++++++----------
> fs/ceph/file.c | 12 ++++---
> fs/ceph/inode.c | 85 +++++++++++++++++++-----------------------
> --
> fs/ceph/mds_client.c | 11 +++---
> fs/ceph/mds_client.h | 2 +-
> fs/ceph/super.h | 2 +-
> fs/ceph/xattr.c | 68 +++++++++++++++--------------------
> 7 files changed, 96 insertions(+), 123 deletions(-)
>
> diff --git a/fs/ceph/acl.c b/fs/ceph/acl.c
> index 1564eacc253d..d6da650db83e 100644
> --- a/fs/ceph/acl.c
> +++ b/fs/ceph/acl.c
> @@ -171,7 +171,7 @@ int ceph_pre_init_acls(struct inode *dir, umode_t
> *mode,
> {
> struct posix_acl *acl, *default_acl;
> size_t val_size1 = 0, val_size2 = 0;
> - struct ceph_pagelist *pagelist = NULL;
> + struct ceph_databuf *dbuf = NULL;
> void *tmp_buf = NULL;
> int err;
>
> @@ -201,58 +201,55 @@ int ceph_pre_init_acls(struct inode *dir,
> umode_t *mode,
> tmp_buf = kmalloc(max(val_size1, val_size2), GFP_KERNEL);
> if (!tmp_buf)
> goto out_err;
> - pagelist = ceph_pagelist_alloc(GFP_KERNEL);
> - if (!pagelist)
> + dbuf = ceph_databuf_req_alloc(1, PAGE_SIZE, GFP_KERNEL);
> + if (!dbuf)
> goto out_err;
>
> - err = ceph_pagelist_reserve(pagelist, PAGE_SIZE);
> - if (err)
> - goto out_err;
> -
> - ceph_pagelist_encode_32(pagelist, acl && default_acl ? 2 :
> 1);
> + ceph_databuf_encode_32(dbuf, acl && default_acl ? 2 : 1);
>
> if (acl) {
> size_t len = strlen(XATTR_NAME_POSIX_ACL_ACCESS);
> - err = ceph_pagelist_reserve(pagelist, len +
> val_size1 + 8);
> + err = ceph_databuf_reserve(dbuf, len + val_size1 +
> 8,
> + GFP_KERNEL);
I know that it's simple change. But this len + val_size1 + 8 looks
confusing, anyway. What this hardcoded 8 means? :)
> if (err)
> goto out_err;
> - ceph_pagelist_encode_string(pagelist,
> XATTR_NAME_POSIX_ACL_ACCESS,
> - len);
> + ceph_databuf_encode_string(dbuf,
> XATTR_NAME_POSIX_ACL_ACCESS,
> + len);
> err = posix_acl_to_xattr(&init_user_ns, acl,
> tmp_buf, val_size1);
> if (err < 0)
> goto out_err;
> - ceph_pagelist_encode_32(pagelist, val_size1);
> - ceph_pagelist_append(pagelist, tmp_buf, val_size1);
> + ceph_databuf_encode_32(dbuf, val_size1);
> + ceph_databuf_append(dbuf, tmp_buf, val_size1);
> }
> if (default_acl) {
> size_t len = strlen(XATTR_NAME_POSIX_ACL_DEFAULT);
> - err = ceph_pagelist_reserve(pagelist, len +
> val_size2 + 8);
> + err = ceph_databuf_reserve(dbuf, len + val_size2 +
> 8,
> + GFP_KERNEL);
Same question here. :) What this hardcoded 8 means? :)
> if (err)
> goto out_err;
> - ceph_pagelist_encode_string(pagelist,
> -
> XATTR_NAME_POSIX_ACL_DEFAULT, len);
> + ceph_databuf_encode_string(dbuf,
> +
> XATTR_NAME_POSIX_ACL_DEFAULT, len);
> err = posix_acl_to_xattr(&init_user_ns, default_acl,
> tmp_buf, val_size2);
> if (err < 0)
> goto out_err;
> - ceph_pagelist_encode_32(pagelist, val_size2);
> - ceph_pagelist_append(pagelist, tmp_buf, val_size2);
> + ceph_databuf_encode_32(dbuf, val_size2);
> + ceph_databuf_append(dbuf, tmp_buf, val_size2);
> }
>
> kfree(tmp_buf);
>
> as_ctx->acl = acl;
> as_ctx->default_acl = default_acl;
> - as_ctx->pagelist = pagelist;
> + as_ctx->dbuf = dbuf;
> return 0;
>
> out_err:
> posix_acl_release(acl);
> posix_acl_release(default_acl);
> kfree(tmp_buf);
> - if (pagelist)
> - ceph_pagelist_release(pagelist);
> + ceph_databuf_release(dbuf);
> return err;
> }
>
> diff --git a/fs/ceph/file.c b/fs/ceph/file.c
> index 851d70200c6b..9de2960748b9 100644
> --- a/fs/ceph/file.c
> +++ b/fs/ceph/file.c
> @@ -679,9 +679,9 @@ static int ceph_finish_async_create(struct inode
> *dir, struct inode *inode,
> iinfo.change_attr = 1;
> ceph_encode_timespec64(&iinfo.btime, &now);
>
> - if (req->r_pagelist) {
> - iinfo.xattr_len = req->r_pagelist->length;
> - iinfo.xattr_data = req->r_pagelist->mapped_tail;
> + if (req->r_dbuf) {
> + iinfo.xattr_len = ceph_databuf_len(req->r_dbuf);
> + iinfo.xattr_data = kmap_ceph_databuf_page(req-
> >r_dbuf, 0);
Possibly, it's in another patch. Have we removed req->r_pagelist from
the structure?
Do we always have memory pages in ceph_databuf? How
kmap_ceph_databuf_page() will behave if it's not memory page.
> } else {
> /* fake it */
> iinfo.xattr_len = ARRAY_SIZE(xattr_buf);
> @@ -731,6 +731,8 @@ static int ceph_finish_async_create(struct inode
> *dir, struct inode *inode,
> ret = ceph_fill_inode(inode, NULL, &iinfo, NULL, req-
> >r_session,
> req->r_fmode, NULL);
> up_read(&mdsc->snap_rwsem);
> + if (req->r_dbuf)
> + kunmap_local(iinfo.xattr_data);
Maybe, we need to hide kunmap_local() into something like
kunmap_ceph_databuf_page()?
> if (ret) {
> doutc(cl, "failed to fill inode: %d\n", ret);
> ceph_dir_clear_complete(dir);
> @@ -849,8 +851,8 @@ int ceph_atomic_open(struct inode *dir, struct
> dentry *dentry,
> goto out_ctx;
> }
> /* Async create can't handle more than a page of
> xattrs */
> - if (as_ctx.pagelist &&
> - !list_is_singular(&as_ctx.pagelist->head))
> + if (as_ctx.dbuf &&
> + as_ctx.dbuf->nr_bvec > 1)
Maybe, it makes sense to call something like ceph_databuf_length()
instead of low level access to dbuf->nr_bvec?
> try_async = false;
> } else if (!d_in_lookup(dentry)) {
> /* If it's not being looked up, it's negative */
> diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
> index b060f765ad20..ec9b80fec7be 100644
> --- a/fs/ceph/inode.c
> +++ b/fs/ceph/inode.c
> @@ -112,9 +112,9 @@ struct inode *ceph_new_inode(struct inode *dir,
> struct dentry *dentry,
> void ceph_as_ctx_to_req(struct ceph_mds_request *req,
> struct ceph_acl_sec_ctx *as_ctx)
> {
> - if (as_ctx->pagelist) {
> - req->r_pagelist = as_ctx->pagelist;
> - as_ctx->pagelist = NULL;
> + if (as_ctx->dbuf) {
> + req->r_dbuf = as_ctx->dbuf;
> + as_ctx->dbuf = NULL;
Maybe, we need something like swap() method? :)
> }
> ceph_fscrypt_as_ctx_to_req(req, as_ctx);
> }
> @@ -2341,11 +2341,10 @@ static int fill_fscrypt_truncate(struct inode
> *inode,
> loff_t pos, orig_pos = round_down(attr->ia_size,
> CEPH_FSCRYPT_BLOCK_SIZE);
> u64 block = orig_pos >> CEPH_FSCRYPT_BLOCK_SHIFT;
> - struct ceph_pagelist *pagelist = NULL;
> - struct kvec iov = {0};
> + struct ceph_databuf *dbuf = NULL;
> struct iov_iter iter;
> - struct page *page = NULL;
> - struct ceph_fscrypt_truncate_size_header header;
> + struct ceph_fscrypt_truncate_size_header *header;
> + void *p;
> int retry_op = 0;
> int len = CEPH_FSCRYPT_BLOCK_SIZE;
> loff_t i_size = i_size_read(inode);
> @@ -2372,37 +2371,35 @@ static int fill_fscrypt_truncate(struct inode
> *inode,
> goto out;
> }
>
> - page = __page_cache_alloc(GFP_KERNEL);
> - if (page == NULL) {
> - ret = -ENOMEM;
> + ret = -ENOMEM;
> + dbuf = ceph_databuf_req_alloc(2, 0, GFP_KERNEL);
So, do we allocate 2 items of zero length here?
> + if (!dbuf)
> goto out;
> - }
>
> - pagelist = ceph_pagelist_alloc(GFP_KERNEL);
> - if (!pagelist) {
> - ret = -ENOMEM;
> + if (ceph_databuf_insert_frag(dbuf, 0, sizeof(*header),
> GFP_KERNEL) < 0)
> + goto out;
> + if (ceph_databuf_insert_frag(dbuf, 1, PAGE_SIZE, GFP_KERNEL)
> < 0)
> goto out;
> - }
>
> - iov.iov_base = kmap_local_page(page);
> - iov.iov_len = len;
> - iov_iter_kvec(&iter, READ, &iov, 1, len);
> + iov_iter_bvec(&iter, ITER_DEST, &dbuf->bvec[1], 1, len);
Is it correct &dbuf->bvec[1]? Why do we work with item #1? I think it
looks confusing.
>
> pos = orig_pos;
> ret = __ceph_sync_read(inode, &pos, &iter, &retry_op,
> &objver);
> if (ret < 0)
> goto out;
>
> + header = kmap_ceph_databuf_page(dbuf, 0);
> +
> /* Insert the header first */
> - header.ver = 1;
> - header.compat = 1;
> - header.change_attr =
> cpu_to_le64(inode_peek_iversion_raw(inode));
> + header->ver = 1;
> + header->compat = 1;
> + header->change_attr =
> cpu_to_le64(inode_peek_iversion_raw(inode));
>
> /*
> * Always set the block_size to CEPH_FSCRYPT_BLOCK_SIZE,
> * because in MDS it may need this to do the truncate.
> */
> - header.block_size = cpu_to_le32(CEPH_FSCRYPT_BLOCK_SIZE);
> + header->block_size = cpu_to_le32(CEPH_FSCRYPT_BLOCK_SIZE);
>
> /*
> * If we hit a hole here, we should just skip filling
> @@ -2417,51 +2414,41 @@ static int fill_fscrypt_truncate(struct inode
> *inode,
> if (!objver) {
> doutc(cl, "hit hole, ppos %lld < size %lld\n", pos,
> i_size);
>
> - header.data_len = cpu_to_le32(8 + 8 + 4);
> - header.file_offset = 0;
> + header->data_len = cpu_to_le32(8 + 8 + 4);
The same problem of understanding here for me. What this hardcoded 8 +
8 + 4 value means? :)
> + header->file_offset = 0;
> ret = 0;
> } else {
> - header.data_len = cpu_to_le32(8 + 8 + 4 +
> CEPH_FSCRYPT_BLOCK_SIZE);
> - header.file_offset = cpu_to_le64(orig_pos);
> + header->data_len = cpu_to_le32(8 + 8 + 4 +
> CEPH_FSCRYPT_BLOCK_SIZE);
Ditto.
> + header->file_offset = cpu_to_le64(orig_pos);
>
> doutc(cl, "encrypt block boff/bsize %d/%lu\n", boff,
> CEPH_FSCRYPT_BLOCK_SIZE);
>
> /* truncate and zero out the extra contents for the
> last block */
> - memset(iov.iov_base + boff, 0, PAGE_SIZE - boff);
> + p = kmap_ceph_databuf_page(dbuf, 1);
Maybe, we need to introduce some constants to address #0 and #1 pages?
Because, #0 it's header and I assume #1 is some content.
> + memset(p + boff, 0, PAGE_SIZE - boff);
> + kunmap_local(p);
>
> /* encrypt the last block */
> - ret = ceph_fscrypt_encrypt_block_inplace(inode,
> page,
> -
> CEPH_FSCRYPT_BLOCK_SIZE,
> - 0, block,
> - GFP_KERNEL);
> + ret = ceph_fscrypt_encrypt_block_inplace(
> + inode, ceph_databuf_page(dbuf, 1),
> + CEPH_FSCRYPT_BLOCK_SIZE, 0, block,
> GFP_KERNEL);
> if (ret)
> goto out;
> }
>
> - /* Insert the header */
> - ret = ceph_pagelist_append(pagelist, &header,
> sizeof(header));
> - if (ret)
> - goto out;
> + ceph_databuf_added_data(dbuf, sizeof(*header));
> + if (header->block_size)
> + ceph_databuf_added_data(dbuf,
> CEPH_FSCRYPT_BLOCK_SIZE);
>
> - if (header.block_size) {
> - /* Append the last block contents to pagelist */
> - ret = ceph_pagelist_append(pagelist, iov.iov_base,
> - CEPH_FSCRYPT_BLOCK_SIZE);
> - if (ret)
> - goto out;
> - }
> - req->r_pagelist = pagelist;
> + req->r_dbuf = dbuf;
> out:
> doutc(cl, "%p %llx.%llx size dropping cap refs on %s\n",
> inode,
> ceph_vinop(inode), ceph_cap_string(got));
> ceph_put_cap_refs(ci, got);
> - if (iov.iov_base)
> - kunmap_local(iov.iov_base);
> - if (page)
> - __free_pages(page, 0);
> - if (ret && pagelist)
> - ceph_pagelist_release(pagelist);
> + kunmap_local(header);
> + if (ret)
> + ceph_databuf_release(dbuf);
> return ret;
> }
>
> diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
> index 230e0c3f341f..09661a34f287 100644
> --- a/fs/ceph/mds_client.c
> +++ b/fs/ceph/mds_client.c
> @@ -1125,8 +1125,7 @@ void ceph_mdsc_release_request(struct kref
> *kref)
> put_cred(req->r_cred);
> if (req->r_mnt_idmap)
> mnt_idmap_put(req->r_mnt_idmap);
> - if (req->r_pagelist)
> - ceph_pagelist_release(req->r_pagelist);
> + ceph_databuf_release(req->r_dbuf);
> kfree(req->r_fscrypt_auth);
> kfree(req->r_altname);
> put_request_session(req);
> @@ -3207,10 +3206,10 @@ static struct ceph_msg
> *create_request_message(struct ceph_mds_session *session,
> msg->front.iov_len = p - msg->front.iov_base;
> msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
>
> - if (req->r_pagelist) {
> - struct ceph_pagelist *pagelist = req->r_pagelist;
> - ceph_msg_data_add_pagelist(msg, pagelist);
> - msg->hdr.data_len = cpu_to_le32(pagelist->length);
> + if (req->r_dbuf) {
> + struct ceph_databuf *dbuf = req->r_dbuf;
> + ceph_msg_data_add_databuf(msg, dbuf);
> + msg->hdr.data_len =
> cpu_to_le32(ceph_databuf_len(dbuf));
> } else {
> msg->hdr.data_len = 0;
> }
> diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
> index 3e2a6fa7c19a..a7ee8da07ce7 100644
> --- a/fs/ceph/mds_client.h
> +++ b/fs/ceph/mds_client.h
> @@ -333,7 +333,7 @@ struct ceph_mds_request {
> u32 r_direct_hash; /* choose dir frag based on this
> dentry hash */
>
> /* data payload is used for xattr ops */
> - struct ceph_pagelist *r_pagelist;
> + struct ceph_databuf *r_dbuf;
>
> /* what caps shall we drop? */
> int r_inode_drop, r_inode_unless;
> diff --git a/fs/ceph/super.h b/fs/ceph/super.h
> index bb0db0cc8003..984a6d2a5378 100644
> --- a/fs/ceph/super.h
> +++ b/fs/ceph/super.h
> @@ -1137,7 +1137,7 @@ struct ceph_acl_sec_ctx {
> #ifdef CONFIG_FS_ENCRYPTION
> struct ceph_fscrypt_auth *fscrypt_auth;
> #endif
> - struct ceph_pagelist *pagelist;
> + struct ceph_databuf *dbuf;
> };
>
> #ifdef CONFIG_SECURITY
> diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
> index 537165db4519..b083cd3b3974 100644
> --- a/fs/ceph/xattr.c
> +++ b/fs/ceph/xattr.c
> @@ -1114,17 +1114,17 @@ static int ceph_sync_setxattr(struct inode
> *inode, const char *name,
> struct ceph_mds_request *req;
> struct ceph_mds_client *mdsc = fsc->mdsc;
> struct ceph_osd_client *osdc = &fsc->client->osdc;
> - struct ceph_pagelist *pagelist = NULL;
> + struct ceph_databuf *dbuf = NULL;
> int op = CEPH_MDS_OP_SETXATTR;
> int err;
>
> if (size > 0) {
> - /* copy value into pagelist */
> - pagelist = ceph_pagelist_alloc(GFP_NOFS);
> - if (!pagelist)
> + /* copy value into dbuf */
> + dbuf = ceph_databuf_req_alloc(1, size, GFP_NOFS);
> + if (!dbuf)
> return -ENOMEM;
>
> - err = ceph_pagelist_append(pagelist, value, size);
> + err = ceph_databuf_append(dbuf, value, size);
> if (err)
> goto out;
> } else if (!value) {
> @@ -1154,8 +1154,8 @@ static int ceph_sync_setxattr(struct inode
> *inode, const char *name,
> req->r_args.setxattr.flags = cpu_to_le32(flags);
> req->r_args.setxattr.osdmap_epoch =
> cpu_to_le32(osdc->osdmap->epoch);
> - req->r_pagelist = pagelist;
> - pagelist = NULL;
> + req->r_dbuf = dbuf;
> + dbuf = NULL;
> }
>
> req->r_inode = inode;
> @@ -1169,8 +1169,7 @@ static int ceph_sync_setxattr(struct inode
> *inode, const char *name,
> doutc(cl, "xattr.ver (after): %lld\n", ci-
> >i_xattrs.version);
>
> out:
> - if (pagelist)
> - ceph_pagelist_release(pagelist);
> + ceph_databuf_release(dbuf);
> return err;
> }
>
> @@ -1377,7 +1376,7 @@ bool ceph_security_xattr_deadlock(struct inode
> *in)
> int ceph_security_init_secctx(struct dentry *dentry, umode_t mode,
> struct ceph_acl_sec_ctx *as_ctx)
> {
> - struct ceph_pagelist *pagelist = as_ctx->pagelist;
> + struct ceph_databuf *dbuf = as_ctx->dbuf;
> const char *name;
> size_t name_len;
> int err;
> @@ -1391,14 +1390,11 @@ int ceph_security_init_secctx(struct dentry
> *dentry, umode_t mode,
> }
>
> err = -ENOMEM;
> - if (!pagelist) {
> - pagelist = ceph_pagelist_alloc(GFP_KERNEL);
> - if (!pagelist)
> + if (!dbuf) {
> + dbuf = ceph_databuf_req_alloc(0, PAGE_SIZE,
> GFP_KERNEL);
> + if (!dbuf)
> goto out;
> - err = ceph_pagelist_reserve(pagelist, PAGE_SIZE);
> - if (err)
> - goto out;
> - ceph_pagelist_encode_32(pagelist, 1);
> + ceph_databuf_encode_32(dbuf, 1);
> }
>
> /*
> @@ -1407,38 +1403,31 @@ int ceph_security_init_secctx(struct dentry
> *dentry, umode_t mode,
> * dentry_init_security hook.
> */
> name_len = strlen(name);
> - err = ceph_pagelist_reserve(pagelist,
> - 4 * 2 + name_len + as_ctx-
> >lsmctx.len);
> + err = ceph_databuf_reserve(dbuf, 4 * 2 + name_len + as_ctx-
> >lsmctx.len,
> + GFP_KERNEL);
The 4 * 2 + name_len + as_ctx->lsmctx.len looks unclear to me. It wil
be good to have some well defined constants here.
> if (err)
> goto out;
>
> - if (as_ctx->pagelist) {
> + if (as_ctx->dbuf) {
> /* update count of KV pairs */
> - BUG_ON(pagelist->length <= sizeof(__le32));
> - if (list_is_singular(&pagelist->head)) {
> - le32_add_cpu((__le32*)pagelist->mapped_tail,
> 1);
> - } else {
> - struct page *page =
> list_first_entry(&pagelist->head,
> - struct
> page, lru);
> - void *addr = kmap_atomic(page);
> - le32_add_cpu((__le32*)addr, 1);
> - kunmap_atomic(addr);
> - }
> + BUG_ON(ceph_databuf_len(dbuf) <= sizeof(__le32));
> + __le32 *addr = kmap_ceph_databuf_page(dbuf, 0);
> + le32_add_cpu(addr, 1);
> + kunmap_local(addr);
> } else {
> - as_ctx->pagelist = pagelist;
> + as_ctx->dbuf = dbuf;
> }
>
> - ceph_pagelist_encode_32(pagelist, name_len);
> - ceph_pagelist_append(pagelist, name, name_len);
> + ceph_databuf_encode_32(dbuf, name_len);
> + ceph_databuf_append(dbuf, name, name_len);
>
> - ceph_pagelist_encode_32(pagelist, as_ctx->lsmctx.len);
> - ceph_pagelist_append(pagelist, as_ctx->lsmctx.context,
> - as_ctx->lsmctx.len);
> + ceph_databuf_encode_32(dbuf, as_ctx->lsmctx.len);
> + ceph_databuf_append(dbuf, as_ctx->lsmctx.context, as_ctx-
> >lsmctx.len);
>
> err = 0;
> out:
> - if (pagelist && !as_ctx->pagelist)
> - ceph_pagelist_release(pagelist);
> + if (dbuf && !as_ctx->dbuf)
> + ceph_databuf_release(dbuf);
> return err;
> }
> #endif /* CONFIG_CEPH_FS_SECURITY_LABEL */
> @@ -1456,8 +1445,7 @@ void ceph_release_acl_sec_ctx(struct
> ceph_acl_sec_ctx *as_ctx)
> #ifdef CONFIG_FS_ENCRYPTION
> kfree(as_ctx->fscrypt_auth);
> #endif
> - if (as_ctx->pagelist)
> - ceph_pagelist_release(as_ctx->pagelist);
> + ceph_databuf_release(as_ctx->dbuf);
> }
>
> /*
>
Thanks,
Slava.
Powered by blists - more mailing lists