[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20221123101313.GB26377@test-zns>
Date: Wed, 23 Nov 2022 15:43:13 +0530
From: Nitesh Shetty <nj.shetty@...sung.com>
To: Amir Goldstein <amir73il@...il.com>
Cc: axboe@...nel.dk, agk@...hat.com, snitzer@...nel.org,
dm-devel@...hat.com, kbusch@...nel.org, hch@....de,
sagi@...mberg.me, james.smart@...adcom.com, kch@...dia.com,
damien.lemoal@...nsource.wdc.com, naohiro.aota@....com,
jth@...nel.org, viro@...iv.linux.org.uk,
linux-block@...r.kernel.org, linux-kernel@...r.kernel.org,
linux-nvme@...ts.infradead.org, linux-fsdevel@...r.kernel.org,
anuj20.g@...sung.com, joshi.k@...sung.com, p.raghav@...sung.com,
nitheshshetty@...il.com, gost.dev@...sung.com
Subject: Re: [PATCH v5 10/10] fs: add support for copy file range in zonefs
On Wed, Nov 23, 2022 at 08:53:14AM +0200, Amir Goldstein wrote:
> On Wed, Nov 23, 2022 at 8:26 AM Nitesh Shetty <nj.shetty@...sung.com> wrote:
> >
> > copy_file_range is implemented using copy offload;
> > copy offloading to the device is always enabled.
> > To disable copy offloading, mount with the "no_copy_offload" mount option.
> > At present copy offload is only used, if the source and destination files
> > are on same block device, otherwise copy file range is completed by
> > generic copy file range.
> >
> > copy file range implemented as following:
> > - write pending writes on the src and dest files
> > - drop page cache for dest file if its conv zone
> > - copy the range using offload
> > - update dest file info
> >
> > For all failure cases we fallback to generic file copy range
> > At present this implementation does not support conv aggregation
> >
> > Signed-off-by: Nitesh Shetty <nj.shetty@...sung.com>
> > Signed-off-by: Anuj Gupta <anuj20.g@...sung.com>
> > ---
> > fs/zonefs/super.c | 179 ++++++++++++++++++++++++++++++++++++++++++++++
> > 1 file changed, 179 insertions(+)
> >
> > diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c
> > index abc9a85106f2..15613433d4ae 100644
> > --- a/fs/zonefs/super.c
> > +++ b/fs/zonefs/super.c
> > @@ -1223,6 +1223,183 @@ static int zonefs_file_release(struct inode *inode, struct file *file)
> > return 0;
> > }
> >
> > +static int zonefs_is_file_copy_offset_ok(struct inode *src_inode,
> > + struct inode *dst_inode, loff_t src_off, loff_t dst_off,
> > + size_t *len)
> > +{
> > + loff_t size, endoff;
> > + struct zonefs_inode_info *dst_zi = ZONEFS_I(dst_inode);
> > +
> > + inode_lock(src_inode);
> > + size = i_size_read(src_inode);
> > + inode_unlock(src_inode);
> > + /* Don't copy beyond source file EOF. */
> > + if (src_off < size) {
> > + if (src_off + *len > size)
> > + *len = (size - (src_off + *len));
> > + } else
> > + *len = 0;
> > +
> > + mutex_lock(&dst_zi->i_truncate_mutex);
> > + if (dst_zi->i_ztype == ZONEFS_ZTYPE_SEQ) {
> > + if (*len > dst_zi->i_max_size - dst_zi->i_wpoffset)
> > + *len -= dst_zi->i_max_size - dst_zi->i_wpoffset;
> > +
> > + if (dst_off != dst_zi->i_wpoffset)
> > + goto err;
> > + }
> > + mutex_unlock(&dst_zi->i_truncate_mutex);
> > +
> > + endoff = dst_off + *len;
> > + inode_lock(dst_inode);
> > + if (endoff > dst_zi->i_max_size ||
> > + inode_newsize_ok(dst_inode, endoff)) {
> > + inode_unlock(dst_inode);
> > + goto err;
> > + }
> > + inode_unlock(dst_inode);
> > +
> > + return 0;
> > +err:
> > + mutex_unlock(&dst_zi->i_truncate_mutex);
> > + return -EINVAL;
> > +}
> > +
> > +static ssize_t zonefs_issue_copy(struct zonefs_inode_info *src_zi,
> > + loff_t src_off, struct zonefs_inode_info *dst_zi,
> > + loff_t dst_off, size_t len)
> > +{
> > + struct block_device *src_bdev = src_zi->i_vnode.i_sb->s_bdev;
> > + struct block_device *dst_bdev = dst_zi->i_vnode.i_sb->s_bdev;
> > + struct range_entry *rlist = NULL;
> > + int ret = len;
> > +
> > + rlist = kmalloc(sizeof(*rlist), GFP_KERNEL);
> > + if (!rlist)
> > + return -ENOMEM;
> > +
> > + rlist[0].dst = (dst_zi->i_zsector << SECTOR_SHIFT) + dst_off;
> > + rlist[0].src = (src_zi->i_zsector << SECTOR_SHIFT) + src_off;
> > + rlist[0].len = len;
> > + rlist[0].comp_len = 0;
> > + ret = blkdev_issue_copy(src_bdev, dst_bdev, rlist, 1, NULL, NULL,
> > + GFP_KERNEL);
> > + if (rlist[0].comp_len > 0)
> > + ret = rlist[0].comp_len;
> > + kfree(rlist);
> > +
> > + return ret;
> > +}
> > +
> > +/* Returns length of possible copy, else returns error */
> > +static ssize_t zonefs_copy_file_checks(struct file *src_file, loff_t src_off,
> > + struct file *dst_file, loff_t dst_off,
> > + size_t *len, unsigned int flags)
> > +{
> > + struct inode *src_inode = file_inode(src_file);
> > + struct inode *dst_inode = file_inode(dst_file);
> > + struct zonefs_inode_info *src_zi = ZONEFS_I(src_inode);
> > + struct zonefs_inode_info *dst_zi = ZONEFS_I(dst_inode);
> > + ssize_t ret;
> > +
> > + if (src_inode->i_sb != dst_inode->i_sb)
> > + return -EXDEV;
> > +
> > + /* Start by sync'ing the source and destination files for conv zones */
> > + if (src_zi->i_ztype == ZONEFS_ZTYPE_CNV) {
> > + ret = file_write_and_wait_range(src_file, src_off,
> > + (src_off + *len));
> > + if (ret < 0)
> > + goto io_error;
> > + }
> > + inode_dio_wait(src_inode);
> > +
> > + /* Start by sync'ing the source and destination files ifor conv zones */
> > + if (dst_zi->i_ztype == ZONEFS_ZTYPE_CNV) {
> > + ret = file_write_and_wait_range(dst_file, dst_off,
> > + (dst_off + *len));
> > + if (ret < 0)
> > + goto io_error;
> > + }
> > + inode_dio_wait(dst_inode);
> > +
> > + /* Drop dst file cached pages for a conv zone*/
> > + if (dst_zi->i_ztype == ZONEFS_ZTYPE_CNV) {
> > + ret = invalidate_inode_pages2_range(dst_inode->i_mapping,
> > + dst_off >> PAGE_SHIFT,
> > + (dst_off + *len) >> PAGE_SHIFT);
> > + if (ret < 0)
> > + goto io_error;
> > + }
> > +
> > + ret = zonefs_is_file_copy_offset_ok(src_inode, dst_inode, src_off,
> > + dst_off, len);
> > + if (ret < 0)
> > + return ret;
> > +
> > + return *len;
> > +
> > +io_error:
> > + zonefs_io_error(dst_inode, true);
> > + return ret;
> > +}
> > +
> > +static ssize_t zonefs_copy_file(struct file *src_file, loff_t src_off,
> > + struct file *dst_file, loff_t dst_off,
> > + size_t len, unsigned int flags)
> > +{
> > + struct inode *src_inode = file_inode(src_file);
> > + struct inode *dst_inode = file_inode(dst_file);
> > + struct zonefs_inode_info *src_zi = ZONEFS_I(src_inode);
> > + struct zonefs_inode_info *dst_zi = ZONEFS_I(dst_inode);
> > + ssize_t ret = 0, bytes;
> > +
> > + inode_lock(src_inode);
> > + inode_lock(dst_inode);
> > + bytes = zonefs_issue_copy(src_zi, src_off, dst_zi, dst_off, len);
> > + if (bytes < 0)
> > + goto unlock_exit;
> > +
> > + ret += bytes;
> > +
> > + file_update_time(dst_file);
> > + mutex_lock(&dst_zi->i_truncate_mutex);
> > + zonefs_update_stats(dst_inode, dst_off + bytes);
> > + zonefs_i_size_write(dst_inode, dst_off + bytes);
> > + dst_zi->i_wpoffset += bytes;
> > + mutex_unlock(&dst_zi->i_truncate_mutex);
> > + /* if we still have some bytes left, do splice copy */
> > + if (bytes && (bytes < len)) {
> > + bytes = do_splice_direct(src_file, &src_off, dst_file,
> > + &dst_off, len, flags);
> > + if (bytes > 0)
> > + ret += bytes;
> > + }
> > +unlock_exit:
> > + if (ret < 0)
> > + zonefs_io_error(dst_inode, true);
> > + inode_unlock(src_inode);
> > + inode_unlock(dst_inode);
> > + return ret;
> > +}
> > +
> > +static ssize_t zonefs_copy_file_range(struct file *src_file, loff_t src_off,
> > + struct file *dst_file, loff_t dst_off,
> > + size_t len, unsigned int flags)
> > +{
> > + ssize_t ret = -EIO;
> > +
> > + ret = zonefs_copy_file_checks(src_file, src_off, dst_file, dst_off,
> > + &len, flags);
> > + if (ret > 0)
> > + ret = zonefs_copy_file(src_file, src_off, dst_file, dst_off,
> > + len, flags);
> > + else if (ret < 0 && ret == -EXDEV)
>
> First of all, ret < 0 is redundant.
>
acked
> > + ret = generic_copy_file_range(src_file, src_off, dst_file,
> > + dst_off, len, flags);
>
> But more importantly, why do you want to fall back to
> do_splice_direct() in zonefs copy_file_range?
> How does it serve your patch set or the prospect consumers
> of zonefs copy_file_range?
>
> The reason I am asking is because commit 5dae222a5ff0
> ("vfs: allow copy_file_range to copy across devices")
> turned out to be an API mistake that was later reverted by
> 868f9f2f8e00 ("vfs: fix copy_file_range() regression in cross-fs copies")
>
> It is always better to return EXDEV to userspace which can
> always fallback to splice itself, but maybe it has something
> smarter to do.
>
> The places where it made sense for kernel to fallback to
> direct splice was for network servers server-side-copy, but that
> is independent of any specific filesystem copy_file_range()
> implementation.
>
> Thanks,
> Amir.
>
At present we don't handle a few cases, such as IO getting split in case of
copy offload, so we wanted to fall back to the existing mechanism. So we went
with the default operation, do_splice_direct.
Regards,
Nitesh Shetty
Powered by blists - more mailing lists