Message-ID: <20240821165803.GI865349@frogsfrogsfrogs>
Date: Wed, 21 Aug 2024 09:58:03 -0700
From: "Darrick J. Wong" <djwong@...nel.org>
To: John Garry <john.g.garry@...cle.com>
Cc: axboe@...nel.dk, brauner@...nel.org, viro@...iv.linux.org.uk,
jack@...e.cz, chandan.babu@...cle.com, dchinner@...hat.com,
hch@....de, linux-block@...r.kernel.org,
linux-kernel@...r.kernel.org, linux-xfs@...r.kernel.org,
linux-fsdevel@...r.kernel.org, hare@...e.de,
martin.petersen@...cle.com, catherine.hoang@...cle.com,
kbusch@...nel.org
Subject: Re: [PATCH v5 3/7] fs: iomap: Atomic write support
On Sat, Aug 17, 2024 at 09:47:56AM +0000, John Garry wrote:
> Support direct I/O atomic writes by producing a single bio with REQ_ATOMIC
> flag set.
>
> We rely on the FS to guarantee extent alignment, such that an atomic write
> should never straddle two or more extents. The FS should also check for
> validity of an atomic write length/alignment.
>
> For each iter, data is appended to the single bio. That bio is allocated
> on the first iter.
>
> If that total bio data does not match the expected total, then error and
> do not submit the bio as we cannot tolerate a partial write.
>
> Signed-off-by: John Garry <john.g.garry@...cle.com>
> ---
> fs/iomap/direct-io.c | 122 +++++++++++++++++++++++++++++++++++-------
> fs/iomap/trace.h | 3 +-
> include/linux/iomap.h | 1 +
> 3 files changed, 106 insertions(+), 20 deletions(-)
>
> diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
> index f3b43d223a46..72f28d53ab03 100644
> --- a/fs/iomap/direct-io.c
> +++ b/fs/iomap/direct-io.c
> @@ -37,6 +37,7 @@ struct iomap_dio {
> int error;
> size_t done_before;
> bool wait_for_completion;
> + struct bio *atomic_bio;
>
> union {
> /* used during submission and for synchronous completion: */
> @@ -61,6 +62,24 @@ static struct bio *iomap_dio_alloc_bio(const struct iomap_iter *iter,
> return bio_alloc(iter->iomap.bdev, nr_vecs, opf, GFP_KERNEL);
> }
>
> +static struct bio *iomap_dio_alloc_bio_data(const struct iomap_iter *iter,
> + struct iomap_dio *dio, unsigned short nr_vecs, blk_opf_t opf,
> + loff_t pos)
> +{
> + struct bio *bio = iomap_dio_alloc_bio(iter, dio, nr_vecs, opf);
> + struct inode *inode = iter->inode;
> +
> + fscrypt_set_bio_crypt_ctx(bio, inode, pos >> inode->i_blkbits,
> + GFP_KERNEL);
> + bio->bi_iter.bi_sector = iomap_sector(&iter->iomap, pos);
> + bio->bi_write_hint = inode->i_write_hint;
> + bio->bi_ioprio = dio->iocb->ki_ioprio;
> + bio->bi_private = dio;
> + bio->bi_end_io = iomap_dio_bio_end_io;
> +
> + return bio;
> +}
> +
> static void iomap_dio_submit_bio(const struct iomap_iter *iter,
> struct iomap_dio *dio, struct bio *bio, loff_t pos)
> {
> @@ -256,7 +275,7 @@ static void iomap_dio_zero(const struct iomap_iter *iter, struct iomap_dio *dio,
> * clearing the WRITE_THROUGH flag in the dio request.
> */
> static inline blk_opf_t iomap_dio_bio_opflags(struct iomap_dio *dio,
> - const struct iomap *iomap, bool use_fua)
> + const struct iomap *iomap, bool use_fua, bool atomic)
> {
> blk_opf_t opflags = REQ_SYNC | REQ_IDLE;
>
> @@ -268,6 +287,8 @@ static inline blk_opf_t iomap_dio_bio_opflags(struct iomap_dio *dio,
> opflags |= REQ_FUA;
> else
> dio->flags &= ~IOMAP_DIO_WRITE_THROUGH;
> + if (atomic)
> + opflags |= REQ_ATOMIC;
>
> return opflags;
> }
> @@ -275,21 +296,23 @@ static inline blk_opf_t iomap_dio_bio_opflags(struct iomap_dio *dio,
> static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
> struct iomap_dio *dio)
> {
> + bool atomic = dio->iocb->ki_flags & IOCB_ATOMIC;
> const struct iomap *iomap = &iter->iomap;
> struct inode *inode = iter->inode;
> unsigned int fs_block_size = i_blocksize(inode), pad;
> + struct iov_iter *i = dio->submit.iter;
If you're going to pull this out into a convenience variable, please do
that as a separate patch so that the actual untorn write additions don't
get mixed in.
> loff_t length = iomap_length(iter);
> loff_t pos = iter->pos;
> blk_opf_t bio_opf;
> struct bio *bio;
> bool need_zeroout = false;
> bool use_fua = false;
> - int nr_pages, ret = 0;
> + int nr_pages, orig_nr_pages, ret = 0;
> size_t copied = 0;
> size_t orig_count;
>
> if ((pos | length) & (bdev_logical_block_size(iomap->bdev) - 1) ||
> - !bdev_iter_is_aligned(iomap->bdev, dio->submit.iter))
> + !bdev_iter_is_aligned(iomap->bdev, i))
> return -EINVAL;
>
> if (iomap->type == IOMAP_UNWRITTEN) {
> @@ -322,15 +345,35 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
> dio->flags &= ~IOMAP_DIO_CALLER_COMP;
> }
>
> + if (dio->atomic_bio) {
> + /*
> + * These should not fail, but check just in case.
> + * Caller takes care of freeing the bio.
> + */
> + if (iter->iomap.bdev != dio->atomic_bio->bi_bdev) {
> + ret = -EINVAL;
> + goto out;
> + }
> +
> + if (dio->atomic_bio->bi_iter.bi_sector +
> + (dio->atomic_bio->bi_iter.bi_size >> SECTOR_SHIFT) !=
Hmm, so I guess you stash an untorn write bio in the iomap_dio so that
multiple iomap_dio_bio_iter calls can try to combine a mixed mapping
into a single contiguous untorn write that can be completed all at
once?

I suppose that works as long as the iomap->type is the same across all
the _iter calls, but I think that needs explicit checking here.
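Something like this, maybe (totally untested; assumes a new field in
struct iomap_dio that stashes the mapping type when the bio is first
allocated):

	/* in struct iomap_dio, next to atomic_bio: */
	u16			atomic_bio_type;

	/* and here, before the bdev/sector contiguity checks: */
	if (iter->iomap.type != dio->atomic_bio_type) {
		ret = -EINVAL;
		goto out;
	}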
> + iomap_sector(iomap, pos)) {
> + ret = -EINVAL;
> + goto out;
> + }
> + } else if (atomic) {
> + orig_nr_pages = bio_iov_vecs_to_alloc(i, BIO_MAX_VECS);
> + }
> +
> /*
> * Save the original count and trim the iter to just the extent we
> * are operating on right now. The iter will be re-expanded once
> * we are done.
> */
> - orig_count = iov_iter_count(dio->submit.iter);
> - iov_iter_truncate(dio->submit.iter, length);
> + orig_count = iov_iter_count(i);
> + iov_iter_truncate(i, length);
>
> - if (!iov_iter_count(dio->submit.iter))
> + if (!iov_iter_count(i))
> goto out;
>
> /*
> @@ -365,27 +408,46 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
> * can set up the page vector appropriately for a ZONE_APPEND
> * operation.
> */
> - bio_opf = iomap_dio_bio_opflags(dio, iomap, use_fua);
> + bio_opf = iomap_dio_bio_opflags(dio, iomap, use_fua, atomic);
> +
> + if (atomic) {
> + size_t orig_atomic_size;
> +
> + if (!dio->atomic_bio) {
> + dio->atomic_bio = iomap_dio_alloc_bio_data(iter,
> + dio, orig_nr_pages, bio_opf, pos);
> + }
> + orig_atomic_size = dio->atomic_bio->bi_iter.bi_size;
> +
> + /*
> + * In case of error, caller takes care of freeing the bio. The
> + * smallest size of atomic write is i_node size, so no need for
What is "i_node size"? Are you referring to i_blocksize?
> + * tail zeroing out.
> + */
> + ret = bio_iov_iter_get_pages(dio->atomic_bio, i);
> + if (!ret) {
> + copied = dio->atomic_bio->bi_iter.bi_size -
> + orig_atomic_size;
> + }
>
> - nr_pages = bio_iov_vecs_to_alloc(dio->submit.iter, BIO_MAX_VECS);
> + dio->size += copied;
> + goto out;
> + }
> +
> + nr_pages = bio_iov_vecs_to_alloc(i, BIO_MAX_VECS);
> do {
> size_t n;
> if (dio->error) {
> - iov_iter_revert(dio->submit.iter, copied);
> + iov_iter_revert(i, copied);
> copied = ret = 0;
> goto out;
> }
>
> - bio = iomap_dio_alloc_bio(iter, dio, nr_pages, bio_opf);
> + bio = iomap_dio_alloc_bio_data(iter, dio, nr_pages, bio_opf, pos);
> fscrypt_set_bio_crypt_ctx(bio, inode, pos >> inode->i_blkbits,
> GFP_KERNEL);
> - bio->bi_iter.bi_sector = iomap_sector(iomap, pos);
> - bio->bi_write_hint = inode->i_write_hint;
> - bio->bi_ioprio = dio->iocb->ki_ioprio;
> - bio->bi_private = dio;
> - bio->bi_end_io = iomap_dio_bio_end_io;
I see two places (here and iomap_dio_zero) that allocate a bio and
perform some initialization of it.  Can you move the common pieces to
iomap_dio_alloc_bio instead of adding an iomap_dio_alloc_bio_data
variant, and move all that to a separate cleanup patch?
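Something like this untested sketch, with the data write path keeping
its own bi_write_hint/bi_ioprio setup:

static struct bio *iomap_dio_alloc_bio(const struct iomap_iter *iter,
		struct iomap_dio *dio, unsigned short nr_vecs, blk_opf_t opf,
		loff_t pos)
{
	struct inode *inode = iter->inode;
	struct bio *bio;

	if (dio->dops && dio->dops->bio_set)
		bio = bio_alloc_bioset(iter->iomap.bdev, nr_vecs, opf,
				GFP_KERNEL, dio->dops->bio_set);
	else
		bio = bio_alloc(iter->iomap.bdev, nr_vecs, opf, GFP_KERNEL);

	/* initialization shared by iomap_dio_zero and the write paths */
	fscrypt_set_bio_crypt_ctx(bio, inode, pos >> inode->i_blkbits,
			GFP_KERNEL);
	bio->bi_iter.bi_sector = iomap_sector(&iter->iomap, pos);
	bio->bi_private = dio;
	bio->bi_end_io = iomap_dio_bio_end_io;

	return bio;
}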
> - ret = bio_iov_iter_get_pages(bio, dio->submit.iter);
> + ret = bio_iov_iter_get_pages(bio, i);
> if (unlikely(ret)) {
> /*
> * We have to stop part way through an IO. We must fall
> @@ -408,8 +470,7 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
> dio->size += n;
> copied += n;
>
> - nr_pages = bio_iov_vecs_to_alloc(dio->submit.iter,
> - BIO_MAX_VECS);
> + nr_pages = bio_iov_vecs_to_alloc(i, BIO_MAX_VECS);
> /*
> * We can only poll for single bio I/Os.
> */
> @@ -435,7 +496,7 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
> }
> out:
> /* Undo iter limitation to current extent */
> - iov_iter_reexpand(dio->submit.iter, orig_count - copied);
> + iov_iter_reexpand(i, orig_count - copied);
> if (copied)
> return copied;
> return ret;
> @@ -555,6 +616,7 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
> struct blk_plug plug;
> struct iomap_dio *dio;
> loff_t ret = 0;
> + size_t orig_count = iov_iter_count(iter);
>
> trace_iomap_dio_rw_begin(iocb, iter, dio_flags, done_before);
>
> @@ -580,6 +642,13 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
> if (iocb->ki_flags & IOCB_NOWAIT)
> iomi.flags |= IOMAP_NOWAIT;
>
> + if (iocb->ki_flags & IOCB_ATOMIC) {
> + if (bio_iov_vecs_to_alloc(iter, INT_MAX) > BIO_MAX_VECS)
> + return ERR_PTR(-EINVAL);
> + iomi.flags |= IOMAP_ATOMIC;
> + }
> + dio->atomic_bio = NULL;
> +
> if (iov_iter_rw(iter) == READ) {
> /* reads can always complete inline */
> dio->flags |= IOMAP_DIO_INLINE_COMP;
> @@ -665,6 +734,21 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
> iocb->ki_flags &= ~IOCB_HIPRI;
> }
>
> + if (iocb->ki_flags & IOCB_ATOMIC) {
> + if (ret >= 0) {
> + if (dio->size == orig_count) {
> + iomap_dio_submit_bio(&iomi, dio,
> + dio->atomic_bio, iocb->ki_pos);
Does this need to do task_io_account_write like regular direct writes
do?
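The regular write path accounts each copied chunk via
task_io_account_write(n) inside the bio loop; an untested guess at the
equivalent here, just before the submit:

	task_io_account_write(dio->atomic_bio->bi_iter.bi_size);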
> + } else {
> + if (dio->atomic_bio)
> + bio_put(dio->atomic_bio);
> + ret = -EINVAL;
> + }
> + } else if (dio->atomic_bio) {
> + bio_put(dio->atomic_bio);
This ought to null out dio->atomic_bio to prevent accidental UAF.
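i.e.:

	bio_put(dio->atomic_bio);
	dio->atomic_bio = NULL;

(and the same in the error leg above.)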
--D
> + }
> + }
> +
> blk_finish_plug(&plug);
>
> /*
> diff --git a/fs/iomap/trace.h b/fs/iomap/trace.h
> index 0a991c4ce87d..4118a42cdab0 100644
> --- a/fs/iomap/trace.h
> +++ b/fs/iomap/trace.h
> @@ -98,7 +98,8 @@ DEFINE_RANGE_EVENT(iomap_dio_rw_queued);
> { IOMAP_REPORT, "REPORT" }, \
> { IOMAP_FAULT, "FAULT" }, \
> { IOMAP_DIRECT, "DIRECT" }, \
> - { IOMAP_NOWAIT, "NOWAIT" }
> + { IOMAP_NOWAIT, "NOWAIT" }, \
> + { IOMAP_ATOMIC, "ATOMIC" }
>
> #define IOMAP_F_FLAGS_STRINGS \
> { IOMAP_F_NEW, "NEW" }, \
> diff --git a/include/linux/iomap.h b/include/linux/iomap.h
> index 6fc1c858013d..8fd949442262 100644
> --- a/include/linux/iomap.h
> +++ b/include/linux/iomap.h
> @@ -178,6 +178,7 @@ struct iomap_folio_ops {
> #else
> #define IOMAP_DAX 0
> #endif /* CONFIG_FS_DAX */
> +#define IOMAP_ATOMIC (1 << 9)
>
> struct iomap_ops {
> /*
> --
> 2.31.1
>
>