Message-ID: <20241211234748.GB6678@frogsfrogsfrogs>
Date: Wed, 11 Dec 2024 15:47:48 -0800
From: "Darrick J. Wong" <djwong@...nel.org>
To: John Garry <john.g.garry@...cle.com>
Cc: brauner@...nel.org, cem@...nel.org, dchinner@...hat.com, hch@....de,
ritesh.list@...il.com, linux-xfs@...r.kernel.org,
linux-fsdevel@...r.kernel.org, linux-kernel@...r.kernel.org,
martin.petersen@...cle.com
Subject: Re: [PATCH v2 2/7] iomap: Add zero unwritten mappings dio support
On Tue, Dec 10, 2024 at 12:57:32PM +0000, John Garry wrote:
> For atomic write support, it is required that only a single bio ever be
> submitted for an atomic write.
>
> Furthermore, currently the atomic write unit min and max limit is fixed at
> the FS block size.
>
> When the atomic write unit max limit is lifted, an atomic write may span
> mixed unwritten and mapped extents. In that case, due to the iterative
> nature of iomap, multiple bios would be produced, which is intolerable.
>
> Add a function to zero unwritten extents in a given range, which may be
> used to ensure that unwritten extents are zeroed before an atomic write
> is issued.
I still dislike this. IMO block untorn writes _is_ a niche feature for
programs that perform IO in large blocks. Any program that wants a
general "apply all these updates or none of them" interface should use
XFS_IOC_EXCHANGE_RANGE since it has no awu_max restrictions, can handle
discontiguous update ranges, doesn't require block alignment, etc.
Instead here we are adding a bunch of complexity, and not even doing it
all that well:
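
Purely for illustration (not part of the patch under review), a rough
userspace sketch of the exchange-range approach.  It assumes the
xfs_exchange_range UAPI header as shipped by xfsprogs -- the include
path may vary by distro -- and trims all error handling.  The ioctl is
issued on the destination fd and atomically swaps in contents staged in
a temp file on the same filesystem:

#include <sys/types.h>
#include <sys/ioctl.h>
#include <xfs/xfs_fs.h>	/* struct xfs_exchange_range, XFS_IOC_EXCHANGE_RANGE */

/* Stage the new contents in a temp file, then commit them atomically. */
static int commit_update(int dest_fd, int staging_fd, off_t offset,
			 off_t len)
{
	struct xfs_exchange_range xchg = {
		.file1_fd	= staging_fd,	/* file holding new contents */
		.file1_offset	= offset,
		.file2_offset	= offset,
		.length		= len,
	};

	/* no awu_max limit, no block alignment requirement */
	return ioctl(dest_fd, XFS_IOC_EXCHANGE_RANGE, &xchg);
}
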
> Signed-off-by: John Garry <john.g.garry@...cle.com>
> ---
> fs/iomap/direct-io.c | 76 +++++++++++++++++++++++++++++++++++++++++++
> include/linux/iomap.h | 3 ++
> 2 files changed, 79 insertions(+)
>
> diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
> index 23fdad16e6a8..18c888f0c11f 100644
> --- a/fs/iomap/direct-io.c
> +++ b/fs/iomap/direct-io.c
> @@ -805,6 +805,82 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
> }
> EXPORT_SYMBOL_GPL(iomap_dio_rw);
>
> +static loff_t
> +iomap_dio_zero_unwritten_iter(struct iomap_iter *iter, struct iomap_dio *dio)
> +{
> + const struct iomap *iomap = &iter->iomap;
> + loff_t length = iomap_length(iter);
> + loff_t pos = iter->pos;
> +
> + if (iomap->type == IOMAP_UNWRITTEN) {
> + int ret;
> +
> + dio->flags |= IOMAP_DIO_UNWRITTEN;
> + ret = iomap_dio_zero(iter, dio, pos, length);
Shouldn't this be detecting the particular case where the mapping for the
kiocb is in mixed state, and only zeroing in that case? This just
targets every unwritten extent, even if the unwritten extent covers the
entire range that is being written. It doesn't handle COW, it doesn't
handle holes, etc.
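
As a hypothetical sketch of the kind of check being suggested, with
invented "example_" names: pre-zero only when the unwritten extent does
not cover the whole untorn write, i.e. the range really is mixed:

#include <linux/iomap.h>

static bool example_atomic_write_needs_zeroing(const struct iomap_iter *iter,
		loff_t write_pos, u64 write_len)
{
	if (iter->iomap.type != IOMAP_UNWRITTEN)
		return false;
	/*
	 * A single unwritten mapping spanning the entire write converts
	 * in one bio anyway, so pre-zeroing it would be wasted work.
	 */
	if (iter->pos <= write_pos && iomap_length(iter) >= write_len)
		return false;
	return true;
}
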
Also, can you make a version of blkdev_issue_zeroout that returns the
bios so the caller can issue them asynchronously instead of opencoding
the bio_alloc loop in iomap_dio_zero?
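
For what it's worth, __blkdev_issue_zeroout() already chains bios onto a
caller-supplied pointer without submitting the tail bio; a rough,
untested sketch of driving it asynchronously (not the actual patch)
might look like:

#include <linux/bio.h>
#include <linux/blkdev.h>

/*
 * Build the zeroing bio chain with the existing __blkdev_issue_zeroout()
 * helper, give the tail bio our own completion, and submit it.  The tail
 * bio only completes once every chained bio has completed.
 */
static int example_zeroout_async(struct block_device *bdev, sector_t sector,
		sector_t nr_sects, bio_end_io_t *end_io, void *private)
{
	struct bio *bio = NULL;
	int ret;

	ret = __blkdev_issue_zeroout(bdev, sector, nr_sects, GFP_KERNEL,
			&bio, 0);
	if (ret)
		return ret;
	if (!bio)
		return 0;		/* nothing to zero */

	bio->bi_end_io = end_io;	/* called when the whole chain is done */
	bio->bi_private = private;
	submit_bio(bio);
	return 0;
}
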
> + if (ret)
> + return ret;
> + }
> +
> + dio->size += length;
> +
> + return length;
> +}
> +
> +ssize_t
> +iomap_dio_zero_unwritten(struct kiocb *iocb, struct iov_iter *iter,
> + const struct iomap_ops *ops, const struct iomap_dio_ops *dops)
> +{
> + struct inode *inode = file_inode(iocb->ki_filp);
> + struct iomap_dio *dio;
> + ssize_t ret;
> + struct iomap_iter iomi = {
> + .inode = inode,
> + .pos = iocb->ki_pos,
> + .len = iov_iter_count(iter),
> + .flags = IOMAP_WRITE,
IOMAP_WRITE | IOMAP_DIRECT, no?
--D
> + };
> +
> + dio = kzalloc(sizeof(*dio), GFP_KERNEL);
> + if (!dio)
> + return -ENOMEM;
> +
> + dio->iocb = iocb;
> + atomic_set(&dio->ref, 1);
> + dio->i_size = i_size_read(inode);
> + dio->dops = dops;
> + dio->submit.waiter = current;
> + dio->wait_for_completion = true;
> +
> + inode_dio_begin(inode);
> +
> + while ((ret = iomap_iter(&iomi, ops)) > 0)
> + iomi.processed = iomap_dio_zero_unwritten_iter(&iomi, dio);
> +
> + if (ret < 0)
> + iomap_dio_set_error(dio, ret);
> +
> + if (!atomic_dec_and_test(&dio->ref)) {
> + for (;;) {
> + set_current_state(TASK_UNINTERRUPTIBLE);
> + if (!READ_ONCE(dio->submit.waiter))
> + break;
> +
> + blk_io_schedule();
> + }
> + __set_current_state(TASK_RUNNING);
> + }
> +
> + if (dops && dops->end_io)
> + ret = dops->end_io(iocb, dio->size, ret, dio->flags);
> +
> + kfree(dio);
> +
> + inode_dio_end(file_inode(iocb->ki_filp));
> +
> + return ret;
> +}
> +EXPORT_SYMBOL_GPL(iomap_dio_zero_unwritten);
> +
> static int __init iomap_dio_init(void)
> {
> zero_page = alloc_pages(GFP_KERNEL | __GFP_ZERO,
> diff --git a/include/linux/iomap.h b/include/linux/iomap.h
> index 5675af6b740c..c2d44b9e446d 100644
> --- a/include/linux/iomap.h
> +++ b/include/linux/iomap.h
> @@ -440,6 +440,9 @@ ssize_t iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
> struct iomap_dio *__iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
> const struct iomap_ops *ops, const struct iomap_dio_ops *dops,
> unsigned int dio_flags, void *private, size_t done_before);
> +ssize_t iomap_dio_zero_unwritten(struct kiocb *iocb, struct iov_iter *iter,
> + const struct iomap_ops *ops, const struct iomap_dio_ops *dops);
> +
> ssize_t iomap_dio_complete(struct iomap_dio *dio);
> void iomap_dio_bio_end_io(struct bio *bio);
>
> --
> 2.31.1
>
>