linux-ext4 - Re: [RFC PATCH v1] ext2: remove buffer heads from quota handling

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <wqnn2xgfenyeyt65sgskdraslmw2gkzwxkyqvty2kmusbftvt6@ncfa7lwjap25>
Date: Mon, 6 Jan 2025 19:03:03 +0100
From: Jan Kara <jack@...e.cz>
To: Catherine Hoang <catherine.hoang@...cle.com>
Cc: linux-ext4@...r.kernel.org, djwong@...nel.org
Subject: Re: [RFC PATCH v1] ext2: remove buffer heads from quota handling

On Tue 29-10-24 13:45:01, Catherine Hoang wrote:
> This patch removes the use of buffer heads from the quota read and
> write paths. To do so, we implement a new buffer cache using an
> rhashtable. Each buffer stores data from an associated block, and
> can be read or written to as needed.
> 
> Ultimately, we want to completely remove buffer heads from ext2.
> This patch serves as an example than can be applied to other parts
> of the filesystem.
> 
> Signed-off-by: Catherine Hoang <catherine.hoang@...cle.com>

Thanks for the patch and sorry for the delay. Overall I'd say this is a bit
too rudimentary :). See below.

> diff --git a/fs/ext2/cache.c b/fs/ext2/cache.c
> new file mode 100644
> index 000000000000..c58416392c52
> --- /dev/null
> +++ b/fs/ext2/cache.c
> @@ -0,0 +1,195 @@
> +// SPDX-License-Identifier: GPL-2.0-or-later
> +/*
> + * Copyright (c) 2024 Oracle. All rights reserved.
> + */
> +
> +#include "ext2.h"
> +#include <linux/bio.h>
> +#include <linux/blkdev.h>
> +#include <linux/rhashtable.h>
> +#include <linux/mm.h>
> +#include <linux/types.h>
> +
> +static const struct rhashtable_params buffer_cache_params = {
> +	.key_len     = sizeof(sector_t),
> +	.key_offset  = offsetof(struct ext2_buffer, b_block),
> +	.head_offset = offsetof(struct ext2_buffer, b_rhash),
> +	.automatic_shrinking = true,
> +};
> +
> +static struct ext2_buffer *insert_buffer_cache(struct super_block *sb, struct ext2_buffer *new_buf)
> +{
> +	struct ext2_sb_info *sbi = EXT2_SB(sb);
> +	struct rhashtable *buffer_cache = &sbi->buffer_cache;
> +	struct ext2_buffer *old_buf;
> +
> +	spin_lock(&sbi->buffer_cache_lock);
> +	old_buf = rhashtable_lookup_get_insert_fast(buffer_cache,
> +				&new_buf->b_rhash, buffer_cache_params);
> +	spin_unlock(&sbi->buffer_cache_lock);
> +
> +	if (old_buf)
> +		return old_buf;
> +
> +	return new_buf;
> +}
> +
> +static void buf_write_end_io(struct bio *bio)
> +{
> +	struct ext2_buffer *buf = bio->bi_private;
> +
> +	clear_bit(EXT2_BUF_DIRTY_BIT, &buf->b_flags);

Clearing the bit after IO has finished is too late. There can be
modifications done to the buffer while the IO is running and this way you
could clear the bit while not all changes in the buffer are written out.

> +	bio_put(bio);
> +}
> +
> +static int submit_buffer_read(struct super_block *sb, struct ext2_buffer *buf)
> +{
> +	struct bio_vec bio_vec;
> +	struct bio bio;
> +	sector_t sector = buf->b_block * (sb->s_blocksize >> 9);
> +
> +	bio_init(&bio, sb->s_bdev, &bio_vec, 1, REQ_OP_READ);
> +	bio.bi_iter.bi_sector = sector;
> +
> +	__bio_add_page(&bio, buf->b_page, buf->b_size, 0);
> +
> +	return submit_bio_wait(&bio);
> +}
> +
> +static void submit_buffer_write(struct super_block *sb, struct ext2_buffer *buf)
> +{
> +	struct bio *bio;
> +	sector_t sector = buf->b_block * (sb->s_blocksize >> 9);
> +
> +	bio = bio_alloc(sb->s_bdev, 1, REQ_OP_WRITE, GFP_KERNEL);
> +
> +	bio->bi_iter.bi_sector = sector;
> +	bio->bi_end_io = buf_write_end_io;
> +	bio->bi_private = buf;
> +
> +	__bio_add_page(bio, buf->b_page, buf->b_size, 0);
> +
> +	submit_bio(bio);
> +}
> +
> +int sync_buffers(struct super_block *sb)
> +{
> +	struct ext2_sb_info *sbi = EXT2_SB(sb);
> +	struct rhashtable *buffer_cache = &sbi->buffer_cache;
> +	struct rhashtable_iter iter;
> +	struct ext2_buffer *buf;
> +	struct blk_plug plug;
> +
> +	blk_start_plug(&plug);
> +	rhashtable_walk_enter(buffer_cache, &iter);
> +	rhashtable_walk_start(&iter);
> +	while ((buf = rhashtable_walk_next(&iter)) != NULL) {
> +		if (IS_ERR(buf))
> +			continue;
> +		if (test_bit(EXT2_BUF_DIRTY_BIT, &buf->b_flags)) {
> +			submit_buffer_write(sb, buf);
> +		}
> +	}
> +	rhashtable_walk_stop(&iter);
> +	rhashtable_walk_exit(&iter);
> +	blk_finish_plug(&plug);
> +
> +	return 0;
> +}

I think we need some kind of periodic writeback as well. Also fsync(2) will
need to flush appropriate metadata buffers as well and we need to track
them together with the inode to which metadata belongs.

> +static struct ext2_buffer *lookup_buffer_cache(struct super_block *sb, sector_t block)
> +{
> +	struct ext2_sb_info *sbi = EXT2_SB(sb);
> +	struct rhashtable *buffer_cache = &sbi->buffer_cache;
> +	struct ext2_buffer *found = NULL;
> +
> +	found = rhashtable_lookup_fast(buffer_cache, &block, buffer_cache_params);
> +
> +	return found;
> +}
> +
> +static int init_buffer(struct super_block *sb, sector_t block, struct ext2_buffer **buf_ptr)
> +{
> +	struct ext2_buffer *buf;
> +
> +	buf = kmalloc(sizeof(struct ext2_buffer), GFP_KERNEL);
> +	if (!buf)
> +		return -ENOMEM;
> +
> +	buf->b_block = block;
> +	buf->b_size = sb->s_blocksize;
> +	buf->b_flags = 0;
> +
> +	mutex_init(&buf->b_lock);
> +	refcount_set(&buf->b_refcount, 1);
> +
> +	buf->b_page = alloc_page(GFP_KERNEL);
> +	if (!buf->b_page)
> +		return -ENOMEM;
> +
> +	buf->b_data = page_address(buf->b_page);
> +
> +	*buf_ptr = buf;
> +
> +	return 0;
> +}
> +
> +void put_buffer(struct ext2_buffer *buf)
> +{
> +	refcount_dec(&buf->b_refcount);
> +	mutex_unlock(&buf->b_lock);
> +}
> +
> +struct ext2_buffer *get_buffer(struct super_block *sb, sector_t block, bool need_uptodate)
> +{
> +	int err;
> +	struct ext2_buffer *buf;
> +	struct ext2_buffer *new_buf;
> +
> +	buf = lookup_buffer_cache(sb, block);
> +
> +	if (!buf) {
> +		err = init_buffer(sb, block, &new_buf);
> +		if (err)
> +			return ERR_PTR(err);
> +
> +		if (need_uptodate) {
> +			err = submit_buffer_read(sb, new_buf);
> +			if (err)
> +				return ERR_PTR(err);
> +		}
> +
> +		buf = insert_buffer_cache(sb, new_buf);

So this can return the old buffer as well in which case you need to free
the new one.

> +		if (IS_ERR(buf))
> +			kfree(new_buf);
> +	}
> +
> +	mutex_lock(&buf->b_lock);
> +	refcount_inc(&buf->b_refcount);

So currently I don't see any use of the refcount. It's always incremented
when locking b_lock and decremented before unlocking it.

Also locking b_lock whenever acquiring the buffer more or less works for
quota code but for more complex locking scenarios (xattrs come to mind) it
will not be really usable so we probably shouldn't mix get/put buffer and
locking of b_lock?

> +void buffer_set_dirty(struct ext2_buffer *buf)
> +{
> +    set_bit(EXT2_BUF_DIRTY_BIT, &buf->b_flags);
> +}
> +
> +int init_buffer_cache(struct rhashtable *buffer_cache)
> +{
> +	return rhashtable_init(buffer_cache, &buffer_cache_params);
> +}
> +

...

> +int ext2_get_block_bno(struct inode *inode, sector_t iblock,
> +		int create, u32 *bno, bool *mapped)
> +{
> +	struct super_block *sb = inode->i_sb;
> +	struct buffer_head tmp_bh;
> +	int err;
> +
> +	tmp_bh.b_state = 0;
> +	tmp_bh.b_size = sb->s_blocksize;
> +
> +	err = ext2_get_block(inode, iblock, &tmp_bh, 0);

I suppose you need to pass the 'create' argument here?

> +	if (err)
> +		return err;
> +
> +	*mapped = buffer_mapped(&tmp_bh);
> +	*bno = tmp_bh.b_blocknr;
> +
> +	return 0;
> +}
> +

So overall I'd say there's more functionality needed to be able to replace
the buffer cache even for ext2.

								Honza
-- 
Jan Kara <jack@...e.com>
SUSE Labs, CR