linux-ext4 - Re: [RFC PATCH v1] ext2: remove buffer heads from quota handling

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20241107155018.GL21832@frogsfrogsfrogs>
Date: Thu, 7 Nov 2024 07:50:18 -0800
From: "Darrick J. Wong" <djwong@...nel.org>
To: Catherine Hoang <catherine.hoang@...cle.com>
Cc: linux-ext4@...r.kernel.org
Subject: Re: [RFC PATCH v1] ext2: remove buffer heads from quota handling

On Tue, Nov 05, 2024 at 10:29:50AM -0800, Darrick J. Wong wrote:
> On Tue, Oct 29, 2024 at 01:45:01PM -0700, Catherine Hoang wrote:
> > This patch removes the use of buffer heads from the quota read and
> > write paths. To do so, we implement a new buffer cache using an
> > rhashtable. Each buffer stores data from an associated block, and
> > can be read or written to as needed.
> > 
> > Ultimately, we want to completely remove buffer heads from ext2.
> > This patch serves as an example than can be applied to other parts
> > of the filesystem.
> > 
> > Signed-off-by: Catherine Hoang <catherine.hoang@...cle.com>
> 
> Cool, thanks for sending this out publicly. :)
> 
> > ---
> >  fs/ext2/Makefile |   2 +-
> >  fs/ext2/cache.c  | 195 +++++++++++++++++++++++++++++++++++++++++++++++
> >  fs/ext2/ext2.h   |  30 ++++++++
> >  fs/ext2/inode.c  |  20 +++++
> >  fs/ext2/super.c  |  62 ++++++++-------
> >  5 files changed, 281 insertions(+), 28 deletions(-)
> >  create mode 100644 fs/ext2/cache.c
> > 
> > diff --git a/fs/ext2/Makefile b/fs/ext2/Makefile
> > index 8860948ef9ca..e8b38243058f 100644
> > --- a/fs/ext2/Makefile
> > +++ b/fs/ext2/Makefile
> > @@ -5,7 +5,7 @@
> >  
> >  obj-$(CONFIG_EXT2_FS) += ext2.o
> >  
> > -ext2-y := balloc.o dir.o file.o ialloc.o inode.o \
> > +ext2-y := balloc.o cache.o dir.o file.o ialloc.o inode.o \
> >  	  ioctl.o namei.o super.o symlink.o trace.o
> >  
> >  # For tracepoints to include our trace.h from tracepoint infrastructure
> > diff --git a/fs/ext2/cache.c b/fs/ext2/cache.c
> > new file mode 100644
> > index 000000000000..c58416392c52
> > --- /dev/null
> > +++ b/fs/ext2/cache.c
> > @@ -0,0 +1,195 @@
> > +// SPDX-License-Identifier: GPL-2.0-or-later
> > +/*
> > + * Copyright (c) 2024 Oracle. All rights reserved.
> > + */
> > +
> > +#include "ext2.h"
> > +#include <linux/bio.h>
> > +#include <linux/blkdev.h>
> > +#include <linux/rhashtable.h>
> > +#include <linux/mm.h>
> > +#include <linux/types.h>
> > +
> > +static const struct rhashtable_params buffer_cache_params = {
> > +	.key_len     = sizeof(sector_t),
> > +	.key_offset  = offsetof(struct ext2_buffer, b_block),
> > +	.head_offset = offsetof(struct ext2_buffer, b_rhash),
> > +	.automatic_shrinking = true,
> > +};
> > +
> > +static struct ext2_buffer *insert_buffer_cache(struct super_block *sb, struct ext2_buffer *new_buf)
> > +{
> > +	struct ext2_sb_info *sbi = EXT2_SB(sb);
> > +	struct rhashtable *buffer_cache = &sbi->buffer_cache;
> > +	struct ext2_buffer *old_buf;
> > +
> > +	spin_lock(&sbi->buffer_cache_lock);
> > +	old_buf = rhashtable_lookup_get_insert_fast(buffer_cache,
> > +				&new_buf->b_rhash, buffer_cache_params);
> > +	spin_unlock(&sbi->buffer_cache_lock);
> > +
> > +	if (old_buf)
> > +		return old_buf;
> > +
> > +	return new_buf;
> 
> Doesn't rhashtable_lookup_get_insert_fast return whichever buffer is in
> the rhashtable?  So you can just do "return old_buf" here?
> 
> > +}
> > +
> > +static void buf_write_end_io(struct bio *bio)
> > +{
> > +	struct ext2_buffer *buf = bio->bi_private;
> > +
> > +	clear_bit(EXT2_BUF_DIRTY_BIT, &buf->b_flags);
> 
> If the bio fails, should we leave the dirty bit set?  The IO error could
> be temporary, and a future syncfs ought to retry the write.
> 
> Also I wonder if we ought to have a way to report to sync_buffers that
> one of the writes failed so that it can return EIO for syncfs?
> 
> Perhaps an atomic_t write failure counter in the buffer cache struct
> (see below) that we could bump from buf_write_end_io every time there's
> a failure?  Then sync_buffers could compare the write failure counter
> before issuing IOs and after they've all completed?
> 
> > +	bio_put(bio);
> > +}
> > +
> > +static int submit_buffer_read(struct super_block *sb, struct ext2_buffer *buf)
> > +{
> > +	struct bio_vec bio_vec;
> > +	struct bio bio;
> > +	sector_t sector = buf->b_block * (sb->s_blocksize >> 9);
> > +
> > +	bio_init(&bio, sb->s_bdev, &bio_vec, 1, REQ_OP_READ);
> > +	bio.bi_iter.bi_sector = sector;
> > +
> > +	__bio_add_page(&bio, buf->b_page, buf->b_size, 0);
> > +
> > +	return submit_bio_wait(&bio);
> > +}
> > +
> > +static void submit_buffer_write(struct super_block *sb, struct ext2_buffer *buf)
> > +{
> > +	struct bio *bio;
> > +	sector_t sector = buf->b_block * (sb->s_blocksize >> 9);
> > +
> > +	bio = bio_alloc(sb->s_bdev, 1, REQ_OP_WRITE, GFP_KERNEL);
> > +
> > +	bio->bi_iter.bi_sector = sector;
> > +	bio->bi_end_io = buf_write_end_io;
> > +	bio->bi_private = buf;
> > +
> > +	__bio_add_page(bio, buf->b_page, buf->b_size, 0);
> > +
> > +	submit_bio(bio);
> > +}
> > +
> > +int sync_buffers(struct super_block *sb)
> > +{
> > +	struct ext2_sb_info *sbi = EXT2_SB(sb);
> > +	struct rhashtable *buffer_cache = &sbi->buffer_cache;
> > +	struct rhashtable_iter iter;
> > +	struct ext2_buffer *buf;
> > +	struct blk_plug plug;
> > +
> > +	blk_start_plug(&plug);
> > +	rhashtable_walk_enter(buffer_cache, &iter);
> > +	rhashtable_walk_start(&iter);
> > +	while ((buf = rhashtable_walk_next(&iter)) != NULL) {
> > +		if (IS_ERR(buf))
> > +			continue;
> > +		if (test_bit(EXT2_BUF_DIRTY_BIT, &buf->b_flags)) {
> > +			submit_buffer_write(sb, buf);
> > +		}
> > +	}
> > +	rhashtable_walk_stop(&iter);
> > +	rhashtable_walk_exit(&iter);
> > +	blk_finish_plug(&plug);
> 
> Hum.  I think this needs to wait for the writes to complete so that it
> can report any IO errors.  What do you think of adding a completion to
> ext2_buffer so that the end_io function can complete() on it, and this
> function can wait for each buffer?
> 
> (I think this is getting closer and closer to the delwri list code for
> xfs_buf...)
> 
> > +
> > +	return 0;
> > +}
> > +
> > +static struct ext2_buffer *lookup_buffer_cache(struct super_block *sb, sector_t block)
> > +{
> > +	struct ext2_sb_info *sbi = EXT2_SB(sb);
> > +	struct rhashtable *buffer_cache = &sbi->buffer_cache;
> > +	struct ext2_buffer *found = NULL;
> > +
> > +	found = rhashtable_lookup_fast(buffer_cache, &block, buffer_cache_params);
> > +
> > +	return found;
> 
> Doesn't this need to take the rcu read lock before the lookup so that
> the rhashtable structures (and the ext2_buffer) cannot be freed while we
> try to grab a refcount on the buffer?  I think it also needs to
> refcount_inc_not_zero before dropping that read lock:
> 
> 	rcu_read_lock();
> 	found = rhashtable_lookup_fast(..);
> 	if (!found || !refcount_inc_not_zero(&found->b_refcount)) {
> 		rcu_read_unlock();
> 		return -ENOENT;
> 	}
> 	rcu_read_unlock();
> 
> 	return found;
> 
> > +}
> > +
> > +static int init_buffer(struct super_block *sb, sector_t block, struct ext2_buffer **buf_ptr)
> > +{
> > +	struct ext2_buffer *buf;
> > +
> > +	buf = kmalloc(sizeof(struct ext2_buffer), GFP_KERNEL);
> > +	if (!buf)
> > +		return -ENOMEM;
> > +
> > +	buf->b_block = block;
> > +	buf->b_size = sb->s_blocksize;
> > +	buf->b_flags = 0;
> > +
> > +	mutex_init(&buf->b_lock);
> > +	refcount_set(&buf->b_refcount, 1);
> > +
> > +	buf->b_page = alloc_page(GFP_KERNEL);
> 
> For s_blocksize < PAGE_SIZE filesystems, you could make this more memory
> efficient by doing slab allocations instead of a full page.
> 
> > +	if (!buf->b_page)
> > +		return -ENOMEM;
> 
> Needs to free buf.
> 
> > +
> > +	buf->b_data = page_address(buf->b_page);
> > +
> > +	*buf_ptr = buf;
> > +
> > +	return 0;
> > +}
> > +
> > +void put_buffer(struct ext2_buffer *buf)
> > +{
> > +	refcount_dec(&buf->b_refcount);
> > +	mutex_unlock(&buf->b_lock);
> 
> Buffers can get freed after the refcount drops, so you need to drop the
> mutex before dropping the ref.
> 
> I almost wonder if this function should take a (struct ext2_buffer **)
> so that you can null out the caller's pointer to avoid accidental UAF,
> but that might be too training wheels for the kernel.
> 
> > +}
> > +
> > +struct ext2_buffer *get_buffer(struct super_block *sb, sector_t block, bool need_uptodate)
> > +{
> > +	int err;
> > +	struct ext2_buffer *buf;
> > +	struct ext2_buffer *new_buf;
> > +
> > +	buf = lookup_buffer_cache(sb, block);
> > +
> > +	if (!buf) {
> > +		err = init_buffer(sb, block, &new_buf);
> 
> If you made init_buffer return either a (struct ext2_buffer *) or NULL,
> then you could remove the third parameter and the callsite becomes:
> 
> 		new_buf = init_buffer(sb, block);
> 		if (!new_buf)
> 			return ERR_PTR(-ENOMEM);
> 
> > +		if (err)
> > +			return ERR_PTR(err);
> > +
> > +		if (need_uptodate) {
> > +			err = submit_buffer_read(sb, new_buf);
> > +			if (err)
> > +				return ERR_PTR(err);
> 
> If you passed need_uptodate to init_buffer, then you could pass GFP_ZERO
> to alloc_page() in the !need_uptodate case, and the buffer would be
> clean (i.e. not contain any stale memory contents) if we're initializing
> a new block and don't care what's on disk.
> 
> Also I think you need a destroy_buffer call if the read fails.
> 
> > +		}
> > +
> > +		buf = insert_buffer_cache(sb, new_buf);
> > +		if (IS_ERR(buf))
> > +			kfree(new_buf);
> 
> I'm confused here -- insert_buffer_cache returns either a buffer that
> has been added to the cache since the lookup_buffer_cache call, or it
> adds new_buf and returns it.  Neither of those pointers are an IS_ERR,
> so if buf != new_buf (aka we lost the race to add the buffer), we'll end
> up leaking new_buf.
> 
> Also, new_buf has resources allocated to it, don't you need to call
> destroy_buffer here?
> 
> > +	}
> > +
> > +	mutex_lock(&buf->b_lock);
> > +	refcount_inc(&buf->b_refcount);
> > +
> > +	return buf;
> > +}
> 
> Some people dislike trivial helpers, but I think the callsites would
> read better if you did:
> 
> struct ext2_buffer *
> __get_buffer(struct ext2_buffer_cache *bufs, sector_t block,
> 		bool need_uptodate);
> 
> static inline struct ext2_buffer *
> get_buffer(struct ext2_buffer_cache *bufs, sector_t bno)
> {
> 	return __get_buffer(bufs, bno, false);
> }
> 
> static inline struct ext2_buffer *
> read_buffer(struct ext2_buffer_cache *bufs, sector_t bno)
> {
> 	return __get_buffer(bufs, bno, true);
> }
> 
> It's much easier for me to distinguish between reading a buffer so that
> we can update its contents vs. getting a buffer so that we can
> initialize a new block if the words are right there in the function name
> instead of a 0/1 argument:
> 
> 	if (offset || tocopy != EXT2_BLOCK_SIZE(sb))
> 		buf = get_buffer(sb, bno, 1);
> 	else
> 		buf = get_buffer(sb, bno, 0);
> 
> vs:
> 
> 	/*
> 	 * rmw a partial buffer or skip the read if we're doing the
> 	 * entire block.
> 	 */
> 	if (offset || tocopy != EXT2_BLOCK_SIZE(sb))
> 		buf = read_buffer(sb, bno);
> 	else
> 		buf = get_buffer(sb, bno);
> 
> > +void buffer_set_dirty(struct ext2_buffer *buf)
> > +{
> > +    set_bit(EXT2_BUF_DIRTY_BIT, &buf->b_flags);
> > +}
> 
> There ought to be a way to clear the dirty bit on a buffer if you're
> freeing it... though AFAICT we never actually free blocks from a quota
> file so I guess you don't need it yet.
> 
> > +
> > +int init_buffer_cache(struct rhashtable *buffer_cache)
> > +{
> > +	return rhashtable_init(buffer_cache, &buffer_cache_params);
> 
> Shouldn't this be in charge of initializing the spinlock?
> 
> > +}
> > +
> > +static void destroy_buffer(void *ptr, void *arg)
> > +{
> > +	struct ext2_buffer *buf = ptr;
> > +
> > +	WARN_ON(test_bit(EXT2_BUF_DIRTY_BIT, &buf->b_flags));
> > +	__free_page(buf->b_page);
> > +	kfree(buf);
> > +}
> > +
> > +void destroy_buffer_cache(struct rhashtable *buffer_cache)
> > +{
> > +	rhashtable_free_and_destroy(buffer_cache, destroy_buffer, NULL);
> > +}
> 
> Just as a note to everyone else: there needs to be a shrinker to clear
> out !dirty buffers during memory reclaim, and (most probably) a means to
> initiate a background writeout of dirty buffers when memory starts
> getting low.  Each of those is complex enough to warrant separate
> patches.
> 
> > diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h
> > index f38bdd46e4f7..ce0bb03527e0 100644
> > --- a/fs/ext2/ext2.h
> > +++ b/fs/ext2/ext2.h
> > @@ -18,6 +18,7 @@
> >  #include <linux/rbtree.h>
> >  #include <linux/mm.h>
> >  #include <linux/highmem.h>
> > +#include <linux/rhashtable.h>
> >  
> >  /* XXX Here for now... not interested in restructing headers JUST now */
> >  
> > @@ -116,6 +117,8 @@ struct ext2_sb_info {
> >  	struct mb_cache *s_ea_block_cache;
> >  	struct dax_device *s_daxdev;
> >  	u64 s_dax_part_off;
> > +	struct rhashtable buffer_cache;
> > +	spinlock_t buffer_cache_lock;
> 
> These ought to be in a struct ext2_buffer_cache, so that you can pass a
> (struct ext2_buffer_cache *) to the {init,destroy}_buffer_cache
> functions.  That improves type safety, because the compiler can check
> for us that someone doesn't accidentally pass some other rhashtable into
> destroy_buffer_cache.
> 
> >  };
> >  
> >  static inline spinlock_t *
> > @@ -683,6 +686,24 @@ struct ext2_inode_info {
> >   */
> >  #define EXT2_STATE_NEW			0x00000001 /* inode is newly created */
> >  
> > +/*
> > + * ext2 buffer
> > + */
> > +struct ext2_buffer {
> > +	sector_t b_block;
> > +	struct rhash_head b_rhash;
> > +	struct page *b_page;
> > +	size_t b_size;
> > +	char *b_data;
> > +	unsigned long b_flags;
> > +	refcount_t b_refcount;
> > +	struct mutex b_lock;
> > +};
> > +
> > +/*
> > + * Buffer flags
> > + */
> > + #define EXT2_BUF_DIRTY_BIT 0
> >  
> >  /*
> >   * Function prototypes
> > @@ -716,6 +737,14 @@ extern int ext2_should_retry_alloc(struct super_block *sb, int *retries);
> >  extern void ext2_init_block_alloc_info(struct inode *);
> >  extern void ext2_rsv_window_add(struct super_block *sb, struct ext2_reserve_window_node *rsv);
> >  
> > +/* cache.c */
> > +extern int init_buffer_cache(struct rhashtable *);
> > +extern void destroy_buffer_cache(struct rhashtable *buffer_cache);
> > +extern int sync_buffers(struct super_block *);
> > +extern struct ext2_buffer *get_buffer(struct super_block *, sector_t, bool);
> > +extern void put_buffer(struct ext2_buffer *);
> > +extern void buffer_set_dirty(struct ext2_buffer *);
> > +
> >  /* dir.c */
> >  int ext2_add_link(struct dentry *, struct inode *);
> >  int ext2_inode_by_name(struct inode *dir,
> > @@ -741,6 +770,7 @@ extern int ext2_write_inode (struct inode *, struct writeback_control *);
> >  extern void ext2_evict_inode(struct inode *);
> >  void ext2_write_failed(struct address_space *mapping, loff_t to);
> >  extern int ext2_get_block(struct inode *, sector_t, struct buffer_head *, int);
> > +extern int ext2_get_block_bno(struct inode *, sector_t, int, u32 *, bool *);
> >  extern int ext2_setattr (struct mnt_idmap *, struct dentry *, struct iattr *);
> >  extern int ext2_getattr (struct mnt_idmap *, const struct path *,
> >  			 struct kstat *, u32, unsigned int);
> > diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
> > index 0caa1650cee8..7e7e6a5916c4 100644
> > --- a/fs/ext2/inode.c
> > +++ b/fs/ext2/inode.c
> > @@ -803,6 +803,26 @@ int ext2_get_block(struct inode *inode, sector_t iblock,
> >  
> >  }
> >  
> > +int ext2_get_block_bno(struct inode *inode, sector_t iblock,
> > +		int create, u32 *bno, bool *mapped)
> > +{
> > +	struct super_block *sb = inode->i_sb;
> > +	struct buffer_head tmp_bh;
> 
> This probably ought to be switched to iomap if the goal is to get rid of
> buffer heads.  But as I mutter somewhere else, maybe the quota code
> should just copy to and from the quota file pagecache and this effort
> target the other metadata of ext2. ;)
> 
> > +	int err;
> > +
> > +	tmp_bh.b_state = 0;
> > +	tmp_bh.b_size = sb->s_blocksize;
> > +
> > +	err = ext2_get_block(inode, iblock, &tmp_bh, 0);
> > +	if (err)
> > +		return err;
> > +
> > +	*mapped = buffer_mapped(&tmp_bh);
> > +	*bno = tmp_bh.b_blocknr;
> > +
> > +	return 0;
> > +}
> > 
> >  static int ext2_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
> >  		unsigned flags, struct iomap *iomap, struct iomap *srcmap)
> >  {
> > diff --git a/fs/ext2/super.c b/fs/ext2/super.c
> > index 37f7ce56adce..11d88882ad24 100644
> > --- a/fs/ext2/super.c
> > +++ b/fs/ext2/super.c
> > @@ -152,6 +152,8 @@ static void ext2_put_super (struct super_block * sb)
> >  	ext2_xattr_destroy_cache(sbi->s_ea_block_cache);
> >  	sbi->s_ea_block_cache = NULL;
> >  
> > +	destroy_buffer_cache(&sbi->buffer_cache);
> > +
> >  	if (!sb_rdonly(sb)) {
> >  		struct ext2_super_block *es = sbi->s_es;
> >  
> > @@ -835,6 +837,13 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
> >  	sbi->s_daxdev = fs_dax_get_by_bdev(sb->s_bdev, &sbi->s_dax_part_off,
> >  					   NULL, NULL);
> >  
> > +	spin_lock_init(&sbi->buffer_cache_lock);
> > +	ret = init_buffer_cache(&sbi->buffer_cache);
> > +	if (ret) {
> > +		ext2_msg(sb, KERN_ERR, "error: unable to create buffer cache");
> > +		goto failed_sbi;
> > +	}
> 
> If the subsequent initialization fails, shouldn't ext2_fill_super call
> destroy_buffer_cache to free the buffer cache object?
> 
> > +
> >  	spin_lock_init(&sbi->s_lock);
> >  	ret = -EINVAL;
> >  
> > @@ -1278,6 +1287,8 @@ static int ext2_sync_fs(struct super_block *sb, int wait)
> >  	 */
> >  	dquot_writeback_dquots(sb, -1);
> >  
> > +	sync_buffers(sb);
> 
> Assuming that sync_buffers (or really, sync_buffer_cache() called on the
> ext2_buffer_cache) starts returning io errors, any error code should be
> returned here:
> 
> 	int ret = 0, ret2;
> 
> 	ret2 = dquot_writeback_dquots(sb, -1);
> 	if (!ret && ret2)
> 		ret = ret2;
> 
> 	...
> 
> 	ret2 = sync_buffer_cache(&sb->buffer_cache);
> 	if (!ret && ret2)
> 		ret = ret2;
> 
> 	...
> 
> 	return ret;
> 
> 
> > +
> >  	spin_lock(&sbi->s_lock);
> >  	if (es->s_state & cpu_to_le16(EXT2_VALID_FS)) {
> >  		ext2_debug("setting valid to 0\n");
> > @@ -1491,9 +1502,10 @@ static ssize_t ext2_quota_read(struct super_block *sb, int type, char *data,
> >  	int offset = off & (sb->s_blocksize - 1);
> >  	int tocopy;
> >  	size_t toread;
> > -	struct buffer_head tmp_bh;
> > -	struct buffer_head *bh;
> >  	loff_t i_size = i_size_read(inode);
> > +	struct ext2_buffer *buf;
> > +	u32 bno;
> > +	bool mapped;
> >  
> >  	if (off > i_size)
> >  		return 0;
> > @@ -1503,20 +1515,19 @@ static ssize_t ext2_quota_read(struct super_block *sb, int type, char *data,
> >  	while (toread > 0) {
> >  		tocopy = min_t(size_t, sb->s_blocksize - offset, toread);
> >  
> > -		tmp_bh.b_state = 0;
> > -		tmp_bh.b_size = sb->s_blocksize;
> > -		err = ext2_get_block(inode, blk, &tmp_bh, 0);
> > +		err = ext2_get_block_bno(inode, blk, 0, &bno, &mapped);
> >  		if (err < 0)
> >  			return err;
> > -		if (!buffer_mapped(&tmp_bh))	/* A hole? */
> > +		if (!mapped)	/* A hole? */
> >  			memset(data, 0, tocopy);
> >  		else {
> > -			bh = sb_bread(sb, tmp_bh.b_blocknr);
> > -			if (!bh)
> > -				return -EIO;
> > -			memcpy(data, bh->b_data+offset, tocopy);
> > -			brelse(bh);
> > +			buf = get_buffer(sb, bno, 1);
> > +			if (IS_ERR(buf))
> > +				return PTR_ERR(buf);
> > +			memcpy(data, buf->b_data+offset, tocopy);
> > +			put_buffer(buf);
> 
> I recognize that this is a RFC proof of concept, but I wonder why we
> don't just copy the dquot information to and from the pagecache?  And
> use the ext2_buffer for other non-file metadata things (e.g. bitmaps,
> indirect blocks, inodes, group descriptors)?

willy points out that there are good reasons for avoiding the pagecache
for quota file contents:
https://lore.kernel.org/linux-fsdevel/Yp%2F+fSoHgPIhiHQR@casper.infradead.org/

...so I withdraw the question.

--D

> >  		}
> > +
> >  		offset = 0;
> >  		toread -= tocopy;
> >  		data += tocopy;
> > @@ -1535,32 +1546,29 @@ static ssize_t ext2_quota_write(struct super_block *sb, int type,
> >  	int offset = off & (sb->s_blocksize - 1);
> >  	int tocopy;
> >  	size_t towrite = len;
> > -	struct buffer_head tmp_bh;
> > -	struct buffer_head *bh;
> > +	struct ext2_buffer *buf;
> > +	u32 bno;
> > +	bool mapped;
> >  
> >  	while (towrite > 0) {
> >  		tocopy = min_t(size_t, sb->s_blocksize - offset, towrite);
> >  
> > -		tmp_bh.b_state = 0;
> > -		tmp_bh.b_size = sb->s_blocksize;
> > -		err = ext2_get_block(inode, blk, &tmp_bh, 1);
> > +		err = ext2_get_block_bno(inode, blk, 1, &bno, &mapped);
> >  		if (err < 0)
> >  			goto out;
> > +
> >  		if (offset || tocopy != EXT2_BLOCK_SIZE(sb))
> > -			bh = sb_bread(sb, tmp_bh.b_blocknr);
> > +			buf = get_buffer(sb, bno, 1);
> >  		else
> > -			bh = sb_getblk(sb, tmp_bh.b_blocknr);
> > -		if (unlikely(!bh)) {
> > -			err = -EIO;
> > +			buf = get_buffer(sb, bno, 0);
> > +		if (IS_ERR(buf)) {
> > +			err = PTR_ERR(buf);
> >  			goto out;
> >  		}
> > -		lock_buffer(bh);
> > -		memcpy(bh->b_data+offset, data, tocopy);
> > -		flush_dcache_page(bh->b_page);
> > -		set_buffer_uptodate(bh);
> > -		mark_buffer_dirty(bh);
> > -		unlock_buffer(bh);
> > -		brelse(bh);
> > +		memcpy(buf->b_data+offset, data, tocopy);
> > +		buffer_set_dirty(buf);
> 
> I also wonder if this could be lifted a buffer_write() helper:
> 
> void buffer_write(struct ext2_buffer *buf, void *data,
> 		  unsigned int count, unsigned int offset)
> {
> 	WARN_ON(offset + count > buf->b_size);
> 
> 	memcpy(buf->b_data + offset, data, count);
> 	buffer_set_dirty(buf);
> }
> 
> 		buffer_write(buf, data, offset, tocopy);
> 
> > +		put_buffer(buf);
> > +
> >  		offset = 0;
> >  		towrite -= tocopy;
> >  		data += tocopy;
> > -- 
> > 2.43.0
> > 
> > 
>