[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <ew75xhk7i26smogev3mhd6vg24dsiguyh4fvhfghcobyne6w2d@shlc7nufv5b7>
Date: Thu, 19 Feb 2026 13:03:00 +0100
From: Jan Kara <jack@...e.cz>
To: Tal Zussman <tz2294@...umbia.edu>
Cc: Jens Axboe <axboe@...nel.dk>,
"Tigran A. Aivazian" <aivazian.tigran@...il.com>, Alexander Viro <viro@...iv.linux.org.uk>,
Christian Brauner <brauner@...nel.org>, Jan Kara <jack@...e.cz>, Namjae Jeon <linkinjeon@...nel.org>,
Sungjong Seo <sj1557.seo@...sung.com>, Yuezhang Mo <yuezhang.mo@...y.com>,
Dave Kleikamp <shaggy@...nel.org>, Ryusuke Konishi <konishi.ryusuke@...il.com>,
Viacheslav Dubeyko <slava@...eyko.com>, Konstantin Komarov <almaz.alexandrovich@...agon-software.com>,
Bob Copeland <me@...copeland.com>, linux-block@...r.kernel.org, linux-kernel@...r.kernel.org,
linux-fsdevel@...r.kernel.org, linux-ext4@...r.kernel.org, jfs-discussion@...ts.sourceforge.net,
linux-nilfs@...r.kernel.org, ntfs3@...ts.linux.dev, linux-karma-devel@...ts.sourceforge.net
Subject: Re: [PATCH RFC] block: enable RWF_DONTCACHE for block devices
On Wed 18-02-26 16:13:17, Tal Zussman wrote:
> Block device buffered reads and writes already pass through
> filemap_read() and iomap_file_buffered_write() respectively, both of
> which handle IOCB_DONTCACHE. Enable RWF_DONTCACHE for block device files
> by setting FOP_DONTCACHE in def_blk_fops.
>
> For CONFIG_BUFFER_HEAD paths, thread the kiocb through
> block_write_begin() so that buffer_head-based I/O can use DONTCACHE
> behavior as well. Callers without a kiocb context (e.g. nilfs2 recovery)
> pass NULL, which preserves the existing behavior.
>
> This support is useful for userspace applications that operate on raw
> block devices, such as databases.
>
> Signed-off-by: Tal Zussman <tz2294@...umbia.edu>
Looks good to me. Feel free to add:
Reviewed-by: Jan Kara <jack@...e.cz>
Honza
> ---
> This is based on v6.19. Please let me know if there's a different tree I
> should base this on.
>
> I wasn't sure whether the block_write_begin() changes are needed for
> block device support when CONFIG_BUFFER_HEAD is set (hence the RFC tag).
> I can drop the block_write_begin() changes if they're not necessary.
> ---
> block/fops.c | 4 ++--
> fs/bfs/file.c | 2 +-
> fs/buffer.c | 12 ++++++++----
> fs/exfat/inode.c | 2 +-
> fs/ext2/inode.c | 2 +-
> fs/jfs/inode.c | 2 +-
> fs/minix/inode.c | 2 +-
> fs/nilfs2/inode.c | 2 +-
> fs/nilfs2/recovery.c | 2 +-
> fs/ntfs3/inode.c | 2 +-
> fs/omfs/file.c | 2 +-
> fs/udf/inode.c | 2 +-
> fs/ufs/inode.c | 2 +-
> include/linux/buffer_head.h | 5 +++--
> 14 files changed, 24 insertions(+), 19 deletions(-)
>
> diff --git a/block/fops.c b/block/fops.c
> index 4d32785b31d9..6bc727f8b252 100644
> --- a/block/fops.c
> +++ b/block/fops.c
> @@ -505,7 +505,7 @@ static int blkdev_write_begin(const struct kiocb *iocb,
> unsigned len, struct folio **foliop,
> void **fsdata)
> {
> - return block_write_begin(mapping, pos, len, foliop, blkdev_get_block);
> + return block_write_begin(iocb, mapping, pos, len, foliop, blkdev_get_block);
> }
>
> static int blkdev_write_end(const struct kiocb *iocb,
> @@ -967,7 +967,7 @@ const struct file_operations def_blk_fops = {
> .splice_write = iter_file_splice_write,
> .fallocate = blkdev_fallocate,
> .uring_cmd = blkdev_uring_cmd,
> - .fop_flags = FOP_BUFFER_RASYNC,
> + .fop_flags = FOP_BUFFER_RASYNC | FOP_DONTCACHE,
> };
>
> static __init int blkdev_init(void)
> diff --git a/fs/bfs/file.c b/fs/bfs/file.c
> index d33d6bde992b..f2804e38b8a7 100644
> --- a/fs/bfs/file.c
> +++ b/fs/bfs/file.c
> @@ -177,7 +177,7 @@ static int bfs_write_begin(const struct kiocb *iocb,
> {
> int ret;
>
> - ret = block_write_begin(mapping, pos, len, foliop, bfs_get_block);
> + ret = block_write_begin(iocb, mapping, pos, len, foliop, bfs_get_block);
> if (unlikely(ret))
> bfs_write_failed(mapping, pos + len);
>
> diff --git a/fs/buffer.c b/fs/buffer.c
> index 838c0c571022..33c3580b85d8 100644
> --- a/fs/buffer.c
> +++ b/fs/buffer.c
> @@ -2241,14 +2241,18 @@ EXPORT_SYMBOL(block_commit_write);
> *
> * The filesystem needs to handle block truncation upon failure.
> */
> -int block_write_begin(struct address_space *mapping, loff_t pos, unsigned len,
> - struct folio **foliop, get_block_t *get_block)
> +int block_write_begin(const struct kiocb *iocb, struct address_space *mapping,
> + loff_t pos, unsigned len, struct folio **foliop, get_block_t *get_block)
> {
> pgoff_t index = pos >> PAGE_SHIFT;
> + fgf_t fgp_flags = FGP_WRITEBEGIN;
> struct folio *folio;
> int status;
>
> - folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN,
> + if (iocb && iocb->ki_flags & IOCB_DONTCACHE)
> + fgp_flags |= FGP_DONTCACHE;
> +
> + folio = __filemap_get_folio(mapping, index, fgp_flags,
> mapping_gfp_mask(mapping));
> if (IS_ERR(folio))
> return PTR_ERR(folio);
> @@ -2591,7 +2595,7 @@ int cont_write_begin(const struct kiocb *iocb, struct address_space *mapping,
> (*bytes)++;
> }
>
> - return block_write_begin(mapping, pos, len, foliop, get_block);
> + return block_write_begin(iocb, mapping, pos, len, foliop, get_block);
> }
> EXPORT_SYMBOL(cont_write_begin);
>
> diff --git a/fs/exfat/inode.c b/fs/exfat/inode.c
> index f9501c3a3666..39d36e8fdfd6 100644
> --- a/fs/exfat/inode.c
> +++ b/fs/exfat/inode.c
> @@ -456,7 +456,7 @@ static int exfat_write_begin(const struct kiocb *iocb,
> if (unlikely(exfat_forced_shutdown(mapping->host->i_sb)))
> return -EIO;
>
> - ret = block_write_begin(mapping, pos, len, foliop, exfat_get_block);
> + ret = block_write_begin(iocb, mapping, pos, len, foliop, exfat_get_block);
>
> if (ret < 0)
> exfat_write_failed(mapping, pos+len);
> diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
> index dbfe9098a124..11aab03de752 100644
> --- a/fs/ext2/inode.c
> +++ b/fs/ext2/inode.c
> @@ -930,7 +930,7 @@ ext2_write_begin(const struct kiocb *iocb, struct address_space *mapping,
> {
> int ret;
>
> - ret = block_write_begin(mapping, pos, len, foliop, ext2_get_block);
> + ret = block_write_begin(iocb, mapping, pos, len, foliop, ext2_get_block);
> if (ret < 0)
> ext2_write_failed(mapping, pos + len);
> return ret;
> diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c
> index 4709762713ef..ae52db437771 100644
> --- a/fs/jfs/inode.c
> +++ b/fs/jfs/inode.c
> @@ -303,7 +303,7 @@ static int jfs_write_begin(const struct kiocb *iocb,
> {
> int ret;
>
> - ret = block_write_begin(mapping, pos, len, foliop, jfs_get_block);
> + ret = block_write_begin(iocb, mapping, pos, len, foliop, jfs_get_block);
> if (unlikely(ret))
> jfs_write_failed(mapping, pos + len);
>
> diff --git a/fs/minix/inode.c b/fs/minix/inode.c
> index 51ea9bdc813f..9075c0ba2f20 100644
> --- a/fs/minix/inode.c
> +++ b/fs/minix/inode.c
> @@ -465,7 +465,7 @@ static int minix_write_begin(const struct kiocb *iocb,
> {
> int ret;
>
> - ret = block_write_begin(mapping, pos, len, foliop, minix_get_block);
> + ret = block_write_begin(iocb, mapping, pos, len, foliop, minix_get_block);
> if (unlikely(ret))
> minix_write_failed(mapping, pos + len);
>
> diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
> index 51bde45d5865..d9d57eeecc5d 100644
> --- a/fs/nilfs2/inode.c
> +++ b/fs/nilfs2/inode.c
> @@ -230,7 +230,7 @@ static int nilfs_write_begin(const struct kiocb *iocb,
> if (unlikely(err))
> return err;
>
> - err = block_write_begin(mapping, pos, len, foliop, nilfs_get_block);
> + err = block_write_begin(iocb, mapping, pos, len, foliop, nilfs_get_block);
> if (unlikely(err)) {
> nilfs_write_failed(mapping, pos + len);
> nilfs_transaction_abort(inode->i_sb);
> diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c
> index a9c61d0492cb..2f5fe44bf736 100644
> --- a/fs/nilfs2/recovery.c
> +++ b/fs/nilfs2/recovery.c
> @@ -541,7 +541,7 @@ static int nilfs_recover_dsync_blocks(struct the_nilfs *nilfs,
> }
>
> pos = rb->blkoff << inode->i_blkbits;
> - err = block_write_begin(inode->i_mapping, pos, blocksize,
> + err = block_write_begin(NULL, inode->i_mapping, pos, blocksize,
> &folio, nilfs_get_block);
> if (unlikely(err)) {
> loff_t isize = inode->i_size;
> diff --git a/fs/ntfs3/inode.c b/fs/ntfs3/inode.c
> index 0a9ac5efeb67..8c788feb319e 100644
> --- a/fs/ntfs3/inode.c
> +++ b/fs/ntfs3/inode.c
> @@ -966,7 +966,7 @@ int ntfs_write_begin(const struct kiocb *iocb, struct address_space *mapping,
> goto out;
> }
>
> - err = block_write_begin(mapping, pos, len, foliop,
> + err = block_write_begin(iocb, mapping, pos, len, foliop,
> ntfs_get_block_write_begin);
>
> out:
> diff --git a/fs/omfs/file.c b/fs/omfs/file.c
> index 49a1de5a827f..3bade632e36e 100644
> --- a/fs/omfs/file.c
> +++ b/fs/omfs/file.c
> @@ -317,7 +317,7 @@ static int omfs_write_begin(const struct kiocb *iocb,
> {
> int ret;
>
> - ret = block_write_begin(mapping, pos, len, foliop, omfs_get_block);
> + ret = block_write_begin(iocb, mapping, pos, len, foliop, omfs_get_block);
> if (unlikely(ret))
> omfs_write_failed(mapping, pos + len);
>
> diff --git a/fs/udf/inode.c b/fs/udf/inode.c
> index 7fae8002344a..aec9cdc938be 100644
> --- a/fs/udf/inode.c
> +++ b/fs/udf/inode.c
> @@ -259,7 +259,7 @@ static int udf_write_begin(const struct kiocb *iocb,
> int ret;
>
> if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) {
> - ret = block_write_begin(mapping, pos, len, foliop,
> + ret = block_write_begin(iocb, mapping, pos, len, foliop,
> udf_get_block);
> if (unlikely(ret))
> udf_write_failed(mapping, pos + len);
> diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c
> index e2b0a35de2a7..dfba985265a8 100644
> --- a/fs/ufs/inode.c
> +++ b/fs/ufs/inode.c
> @@ -481,7 +481,7 @@ static int ufs_write_begin(const struct kiocb *iocb,
> {
> int ret;
>
> - ret = block_write_begin(mapping, pos, len, foliop, ufs_getfrag_block);
> + ret = block_write_begin(iocb, mapping, pos, len, foliop, ufs_getfrag_block);
> if (unlikely(ret))
> ufs_write_failed(mapping, pos + len);
>
> diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
> index b16b88bfbc3e..4b07dec5f8eb 100644
> --- a/include/linux/buffer_head.h
> +++ b/include/linux/buffer_head.h
> @@ -258,8 +258,9 @@ int __block_write_full_folio(struct inode *inode, struct folio *folio,
> get_block_t *get_block, struct writeback_control *wbc);
> int block_read_full_folio(struct folio *, get_block_t *);
> bool block_is_partially_uptodate(struct folio *, size_t from, size_t count);
> -int block_write_begin(struct address_space *mapping, loff_t pos, unsigned len,
> - struct folio **foliop, get_block_t *get_block);
> +int block_write_begin(const struct kiocb *iocb, struct address_space *mapping,
> + loff_t pos, unsigned len, struct folio **foliop,
> + get_block_t *get_block);
> int __block_write_begin(struct folio *folio, loff_t pos, unsigned len,
> get_block_t *get_block);
> int block_write_end(loff_t pos, unsigned len, unsigned copied, struct folio *);
>
> ---
> base-commit: 05f7e89ab9731565d8a62e3b5d1ec206485eeb0b
> change-id: 20260218-blk-dontcache-338133dd045e
>
> Best regards,
> --
> Tal Zussman <tz2294@...umbia.edu>
>
--
Jan Kara <jack@...e.com>
SUSE Labs, CR
Powered by blists - more mailing lists