[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <175573714721.23206.1974821467460678080.stgit@frogsfrogsfrogs>
Date: Wed, 20 Aug 2025 18:23:47 -0700
From: "Darrick J. Wong" <djwong@...nel.org>
To: tytso@....edu
Cc: John@...ves.net, bernd@...ernd.com, linux-fsdevel@...r.kernel.org,
linux-ext4@...r.kernel.org, miklos@...redi.hu, joannelkoong@...il.com,
neal@...pa.dev
Subject: [PATCH 2/6] iocache: add the actual buffer cache
From: Darrick J. Wong <djwong@...nel.org>
Wire up buffer caching into our new caching IO manager.
Signed-off-by: "Darrick J. Wong" <djwong@...nel.org>
---
lib/support/iocache.c | 469 +++++++++++++++++++++++++++++++++++++++++++++++--
1 file changed, 447 insertions(+), 22 deletions(-)
diff --git a/lib/support/iocache.c b/lib/support/iocache.c
index 9870780d65ef61..ab879e85d18f2a 100644
--- a/lib/support/iocache.c
+++ b/lib/support/iocache.c
@@ -9,46 +9,288 @@
* %End-Header%
*/
#include "config.h"
+#include <assert.h>
+#include <stdbool.h>
+#include <pthread.h>
+#include <unistd.h>
#include "ext2fs/ext2_fs.h"
#include "ext2fs/ext2fs.h"
#include "ext2fs/ext2fsP.h"
#include "support/iocache.h"
+#include "support/list.h"
+#include "support/cache.h"
#define IOCACHE_IO_CHANNEL_MAGIC 0x424F5254 /* BORT */
static io_manager iocache_backing_manager;
+/* Convert a byte count into a block count, truncating any partial block. */
+static inline uint64_t B_TO_FSBT(io_channel channel, uint64_t number)
+{
+	const uint64_t blocksize = channel->block_size;
+
+	return number / blocksize;
+}
+
+/* Convert a byte count into a block count, rounding a partial block up. */
+static inline uint64_t B_TO_FSB(io_channel channel, uint64_t number)
+{
+	const uint64_t blocksize = channel->block_size;
+
+	return (number + blocksize - 1) / blocksize;
+}
+
struct iocache_private_data {
int magic;
- io_channel real;
+ io_channel real; /* lower level io channel */
+ io_channel channel; /* cache channel */
+ struct cache cache;
+ pthread_mutex_t stats_lock;
+ struct struct_io_stats io_stats;
+ unsigned long long write_errors;
};
+#define IOCACHEDATA(cache) \
+ (container_of(cache, struct iocache_private_data, cache))
+
static struct iocache_private_data *IOCACHE(io_channel channel)
{
return (struct iocache_private_data *)channel->private_data;
}
-static errcode_t iocache_read_error(io_channel channel, unsigned long block,
- int count, void *data, size_t size,
- int actual_bytes_read, errcode_t error)
+struct iocache_buf {
+ struct cache_node node; /* embedded cache node; IOBUF() maps it back to this struct */
+ struct list_head list; /* links the buffer onto an invalidation list */
+ blk64_t block; /* block number cached here; compared against the lookup key */
+ void *buf; /* block contents, allocated in iocache_alloc_node */
+ errcode_t write_error; /* error from the last failed writeback; 0 after a successful one */
+ unsigned int uptodate:1; /* buf holds valid contents for this block */
+ unsigned int dirty:1; /* buf has changes not yet written to the lower channel */
+};
+
+static inline void iocache_buf_lock(struct iocache_buf *ubuf)
{
- io_channel iocache_channel = channel->app_data;
+ pthread_mutex_lock(&ubuf->node.cn_mutex);
+}
- return iocache_channel->read_error(iocache_channel, block, count, data,
- size, actual_bytes_read, error);
+static inline void iocache_buf_unlock(struct iocache_buf *ubuf)
+{
+ pthread_mutex_unlock(&ubuf->node.cn_mutex);
}
-static errcode_t iocache_write_error(io_channel channel, unsigned long block,
- int count, const void *data, size_t size,
- int actual_bytes_written,
- errcode_t error)
+struct iocache_key {
+ blk64_t block;
+};
+
+#define IOKEY(key) ((struct iocache_key *)(key))
+#define IOBUF(node) (container_of((node), struct iocache_buf, node))
+
+static unsigned int
+iocache_hash(cache_key_t key, unsigned int hashsize, unsigned int hashshift)
{
- io_channel iocache_channel = channel->app_data;
+ uint64_t hashval = IOKEY(key)->block;
+ uint64_t tmp;
- return iocache_channel->write_error(iocache_channel, block, count, data,
- size, actual_bytes_written, error);
+ tmp = hashval ^ (GOLDEN_RATIO_PRIME + hashval) / CACHE_LINE_SIZE;
+ tmp = tmp ^ ((tmp ^ GOLDEN_RATIO_PRIME) >> hashshift);
+ return tmp % hashsize;
}
+/*
+ * Cache comparison callback: report CACHE_HIT when the buffer behind @node
+ * caches the block number named by @key, and CACHE_MISS otherwise.
+ */
+static int iocache_compare(struct cache_node *node, cache_key_t key)
+{
+	const blk64_t wanted = IOKEY(key)->block;
+
+	return IOBUF(node)->block == wanted ? CACHE_HIT : CACHE_MISS;
+}
+
+/*
+ * Cache allocation callback: create a zeroed buffer node for the block named
+ * by @key and give it a data buffer from io_channel_alloc_buf, of which
+ * block_size bytes are cleared.  Returns the embedded cache node, or NULL if
+ * either allocation fails.
+ */
+static struct cache_node *iocache_alloc_node(struct cache *cache,
+					     cache_key_t key)
+{
+	struct iocache_private_data *data = IOCACHEDATA(cache);
+	struct iocache_key *ukey = IOKEY(key);
+	struct iocache_buf *ubuf;
+	errcode_t retval;
+
+	retval = ext2fs_get_mem(sizeof(struct iocache_buf), &ubuf);
+	if (retval)
+		return NULL;
+	memset(ubuf, 0, sizeof(*ubuf));
+
+	retval = io_channel_alloc_buf(data->channel, 0, &ubuf->buf);
+	if (retval) {
+		/* release with the helper that pairs with ext2fs_get_mem */
+		ext2fs_free_mem(&ubuf);
+		return NULL;
+	}
+	memset(ubuf->buf, 0, data->channel->block_size);
+
+	INIT_LIST_HEAD(&ubuf->list);
+	ubuf->block = ukey->block;
+	return &ubuf->node;
+}
+
+/*
+ * Cache flush callback: write a dirty buffer back to the lower channel.
+ * Returns true if the buffer is still dirty afterwards (the write failed),
+ * false if it is clean.  A failed write is recorded in the buffer and
+ * counted in the channel's write_errors.
+ */
+static bool iocache_flush_node(struct cache *cache, struct cache_node *node)
+{
+	struct iocache_private_data *priv = IOCACHEDATA(cache);
+	struct iocache_buf *ubuf = IOBUF(node);
+	errcode_t err;
+
+	/* nothing to write back */
+	if (!ubuf->dirty)
+		return false;
+
+	err = io_channel_write_blk64(priv->real, ubuf->block, 1, ubuf->buf);
+	if (err) {
+		ubuf->write_error = err;
+		priv->write_errors++;
+		return true;
+	}
+
+	ubuf->dirty = 0;
+	ubuf->write_error = 0;
+	return false;
+}
+
+/*
+ * Cache release callback: free the data buffer and then the buffer node
+ * itself.  The buffer must already be clean.
+ */
+static void iocache_relse(struct cache *cache, struct cache_node *node)
+{
+	struct iocache_buf *ubuf = container_of(node, struct iocache_buf, node);
+
+	/* dirty contents would be lost if we freed them here */
+	assert(!ubuf->dirty);
+
+	ext2fs_free_mem(&ubuf->buf);
+	ext2fs_free_mem(&ubuf);
+}
+
+/*
+ * Cache bulk-release callback: release every node chained on @list through
+ * cn_mru and return how many nodes were released.
+ */
+static unsigned int iocache_bulkrelse(struct cache *cache,
+				      struct list_head *list)
+{
+	struct cache_node *pos, *next;
+	unsigned int freed = 0;
+
+	/* an empty list falls straight through the loop and reports zero */
+	list_for_each_entry_safe(pos, next, list, cn_mru) {
+		iocache_relse(cache, pos);
+		freed++;
+	}
+
+	return freed;
+}
+
+/*
+ * Flush all dirty buffers in the cache to disk.  Returns EIO if any buffer
+ * is still dirty afterwards, 0 otherwise.
+ *
+ * NOTE(review): this relies on cache_flush() returning true when nodes are
+ * still dirty, mirroring the flush callback (iocache_flush_node) and the
+ * still_dirty handling in iocache_flush_range -- confirm against
+ * support/cache.h.
+ */
+static errcode_t iocache_flush_cache(struct iocache_private_data *data)
+{
+	bool still_dirty = cache_flush(&data->cache);
+
+	return still_dirty ? EIO : 0;
+}
+
+/*
+ * Flush all dirty buffers in this range of the cache to disk.  Returns EIO if
+ * any buffer in the range is still dirty afterwards, 0 otherwise.
+ */
+static errcode_t iocache_flush_range(struct iocache_private_data *data,
+				     blk64_t block, uint64_t count)
+{
+	uint64_t i;
+	bool still_dirty = false;
+
+	for (i = 0; i < count; i++) {
+		struct iocache_key ukey = {
+			.block = block + i,
+		};
+		struct cache_node *node = NULL;
+
+		/*
+		 * CACHE_GET_INCORE presumably returns only nodes that are
+		 * already cached; blocks without a node have nothing to flush.
+		 */
+		cache_node_get(&data->cache, &ukey, CACHE_GET_INCORE,
+			       &node);
+		if (!node)
+			continue;
+
+		/*
+		 * cache_flush holds cn_mutex across the node flush, so take
+		 * it here as well before writing the buffer back.
+		 */
+		pthread_mutex_lock(&node->cn_mutex);
+		still_dirty |= iocache_flush_node(&data->cache, node);
+		pthread_mutex_unlock(&node->cn_mutex);
+
+		cache_node_put(&data->cache, node);
+	}
+
+	return still_dirty ? EIO : 0;
+}
+
+/*
+ * Grab @node and append its buffer to the list_head passed in @data, all
+ * under the buffer lock.  Passed to cache_walk as its per-node callback and
+ * also called directly when collecting a range of buffers.
+ */
+static void iocache_add_list(struct cache *cache, struct cache_node *node,
+			     void *data)
+{
+	struct list_head *target = data;
+	struct iocache_buf *ubuf = IOBUF(node);
+
+	/* callers promise nobody else is using the buffer */
+	assert(node->cn_count == 0 || node->cn_count == 1);
+
+	iocache_buf_lock(ubuf);
+	cache_node_grab(cache, node);
+	list_add_tail(&ubuf->list, target);
+	iocache_buf_unlock(ubuf);
+}
+
+/*
+ * Drop every buffer collected on @list from the cache.  Each buffer is marked
+ * clean (discarding its contents), unlinked from @list, released and then
+ * purged from the cache.
+ */
+static void iocache_invalidate_bufs(struct iocache_private_data *data,
+				    struct list_head *list)
+{
+	struct iocache_buf *ubuf, *next;
+
+	list_for_each_entry_safe(ubuf, next, list, list) {
+		struct iocache_key ukey = {
+			.block = ubuf->block,
+		};
+
+		/* the only count left should be the one the list took */
+		assert(ubuf->node.cn_count == 1);
+
+		iocache_buf_lock(ubuf);
+		ubuf->dirty = 0;
+		list_del_init(&ubuf->list);
+		iocache_buf_unlock(ubuf);
+
+		cache_node_put(&data->cache, &ubuf->node);
+		cache_node_purge(&data->cache, &ukey, &ubuf->node);
+	}
+}
+
+/*
+ * Empty the whole cache, throwing away any dirty contents.  No buffer may
+ * have a nonzero refcount when this is called.
+ */
+static void iocache_invalidate_cache(struct iocache_private_data *data)
+{
+	LIST_HEAD(doomed);
+
+	/* gather every cached buffer, then drop them all in one pass */
+	cache_walk(&data->cache, iocache_add_list, &doomed);
+	iocache_invalidate_bufs(data, &doomed);
+}
+
+/*
+ * Drop @count blocks starting at @block from the cache, throwing away any
+ * dirty contents.  No buffer in the range may have a nonzero refcount.
+ */
+static void iocache_invalidate_range(struct iocache_private_data *data,
+				     blk64_t block, uint64_t count)
+{
+	LIST_HEAD(doomed);
+	uint64_t off;
+
+	for (off = 0; off < count; off++) {
+		struct iocache_key ukey = {
+			.block = block + off,
+		};
+		struct cache_node *node;
+
+		cache_node_get(&data->cache, &ukey, CACHE_GET_INCORE,
+			       &node);
+		if (!node)
+			continue;
+
+		/* the list grabs the node itself, so release the lookup */
+		iocache_add_list(&data->cache, node, &doomed);
+		cache_node_put(&data->cache, node);
+	}
+	iocache_invalidate_bufs(data, &doomed);
+}
+
+/* Callbacks handed to cache_init for the block buffer cache. */
+static const struct cache_operations iocache_ops = {
+	/* key lookup */
+	.hash		= iocache_hash,
+	.compare	= iocache_compare,
+
+	/* node lifetime */
+	.alloc		= iocache_alloc_node,
+	.flush		= iocache_flush_node,
+	.relse		= iocache_relse,
+	.bulkrelse	= iocache_bulkrelse,
+
+	/* sizing policy supplied by the cache code */
+	.resize		= cache_gradual_resize,
+};
+
static errcode_t iocache_open(const char *name, int flags, io_channel *channel)
{
io_channel io = NULL;
@@ -65,6 +307,9 @@ static errcode_t iocache_open(const char *name, int flags, io_channel *channel)
if (retval)
return retval;
+ /* disable any static cache in the lower io manager */
+ real->manager->set_option(real, "cache", "off");
+
retval = ext2fs_get_mem(sizeof(struct struct_io_channel), &io);
if (retval)
goto out_backing;
@@ -76,12 +321,19 @@ static errcode_t iocache_open(const char *name, int flags, io_channel *channel)
goto out_channel;
memset(data, 0, sizeof(struct iocache_private_data));
data->magic = IOCACHE_IO_CHANNEL_MAGIC;
+ data->io_stats.num_fields = 4;
+ data->channel = io;
io->manager = iocache_io_manager;
retval = ext2fs_get_mem(strlen(name) + 1, &io->name);
if (retval)
goto out_data;
+ retval = cache_init(CACHE_CAN_SHRINK, 1U << 10, &iocache_ops,
+ &data->cache);
+ if (retval)
+ goto out_name;
+
strcpy(io->name, name);
io->private_data = data;
io->block_size = real->block_size;
@@ -91,12 +343,14 @@ static errcode_t iocache_open(const char *name, int flags, io_channel *channel)
io->flags = real->flags;
data->real = real;
real->app_data = io;
- real->read_error = iocache_read_error;
- real->write_error = iocache_write_error;
+
+ pthread_mutex_init(&data->stats_lock, NULL);
*channel = io;
return 0;
+out_name:
+ ext2fs_free_mem(&io->name);
out_data:
ext2fs_free_mem(&data);
out_channel:
@@ -116,6 +370,10 @@ static errcode_t iocache_close(io_channel channel)
if (--channel->refcount > 0)
return 0;
+ pthread_mutex_destroy(&data->stats_lock);
+ cache_flush(&data->cache);
+ cache_purge(&data->cache);
+ cache_destroy(&data->cache);
if (data->real)
retval = io_channel_close(data->real);
ext2fs_free_mem(&channel->private_data);
@@ -134,6 +392,11 @@ static errcode_t iocache_set_blksize(io_channel channel, int blksize)
EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
EXT2_CHECK_MAGIC(data, IOCACHE_IO_CHANNEL_MAGIC);
+ retval = iocache_flush_cache(data);
+ if (retval)
+ return retval;
+ iocache_invalidate_cache(data);
+
retval = io_channel_set_blksize(data->real, blksize);
if (retval)
return retval;
@@ -145,21 +408,34 @@ static errcode_t iocache_set_blksize(io_channel channel, int blksize)
static errcode_t iocache_flush(io_channel channel)
{
struct iocache_private_data *data = IOCACHE(channel);
+ errcode_t retval = 0; /* result of writing back our cached buffers */
+ errcode_t retval2; /* result of flushing the lower channel */
EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
EXT2_CHECK_MAGIC(data, IOCACHE_IO_CHANNEL_MAGIC);
- return io_channel_flush(data->real);
+ retval = iocache_flush_cache(data);
+ retval2 = io_channel_flush(data->real); /* runs even if the cache flush failed */
+ if (retval)
+ return retval; /* the cache flush error takes precedence */
+ return retval2;
}
static errcode_t iocache_write_byte(io_channel channel, unsigned long offset,
int count, const void *buf)
{
struct iocache_private_data *data = IOCACHE(channel);
+ blk64_t bno = B_TO_FSBT(channel, offset); /* offset divided by block size, rounded down */
+ blk64_t next_bno = B_TO_FSB(channel, offset + count); /* end of the write, rounded up to a block */
+ errcode_t retval;
EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
EXT2_CHECK_MAGIC(data, IOCACHE_IO_CHANNEL_MAGIC);
+ retval = iocache_flush_range(data, bno, next_bno - bno); /* write back dirty buffers in [bno, next_bno) */
+ if (retval)
+ return retval;
+ iocache_invalidate_range(data, bno, next_bno - bno); /* then drop them from the cache */
return io_channel_write_byte(data->real, offset, count, buf); /* the byte write itself bypasses the cache */
}
@@ -170,6 +446,16 @@ static errcode_t iocache_set_option(io_channel channel, const char *option,
EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
EXT2_CHECK_MAGIC(data, IOCACHE_IO_CHANNEL_MAGIC);
+ errcode_t retval;
+
+ /* don't let unix io cache options leak through */
+ if (!strcmp(option, "cache_blocks") || !strcmp(option, "cache"))
+ return 0;
+
+ retval = iocache_flush_cache(data);
+ if (retval)
+ return retval;
+ iocache_invalidate_cache(data);
return data->real->manager->set_option(data->real, option, arg);
}
@@ -181,31 +467,157 @@ static errcode_t iocache_get_stats(io_channel channel, io_stats *io_stats)
EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
EXT2_CHECK_MAGIC(data, IOCACHE_IO_CHANNEL_MAGIC);
- return data->real->manager->get_stats(data->real, io_stats);
+ /*
+ * Yes, io_stats is a double-pointer, and we let the caller scribble on
+ * our stats struct WITHOUT LOCKING!
+ */
+ if (io_stats)
+ *io_stats = &data->io_stats;
+ return 0;
+}
+
+/*
+ * Add @bytes_read and @bytes_written to the channel's I/O statistics and
+ * count one cache hit (when @cache_op is CACHE_HIT) or one cache miss (any
+ * other value), all under stats_lock.
+ */
+static void iocache_update_stats(struct iocache_private_data *data,
+				 unsigned long long bytes_read,
+				 unsigned long long bytes_written,
+				 int cache_op)
+{
+	struct struct_io_stats *stats = &data->io_stats;
+	const bool hit = (cache_op == CACHE_HIT);
+
+	pthread_mutex_lock(&data->stats_lock);
+	stats->bytes_read += bytes_read;
+	stats->bytes_written += bytes_written;
+	if (hit)
+		stats->cache_hits++;
+	else
+		stats->cache_misses++;
+	pthread_mutex_unlock(&data->stats_lock);
+}
static errcode_t iocache_read_blk64(io_channel channel,
unsigned long long block, int count,
void *buf)
{
+ struct iocache_key ukey = {
+ .block = block,
+ };
struct iocache_private_data *data = IOCACHE(channel);
+ unsigned long long i;
+ errcode_t retval;
EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
EXT2_CHECK_MAGIC(data, IOCACHE_IO_CHANNEL_MAGIC);
- return io_channel_read_blk64(data->real, block, count, buf);
+ /*
+ * If we're doing an odd-sized read, flush out the cache and then do a
+ * direct read.
+ */
+ if (count < 0) {
+ uint64_t fsbcount = B_TO_FSB(channel, -count);
+
+ retval = iocache_flush_range(data, block, fsbcount);
+ if (retval)
+ return retval;
+ iocache_invalidate_range(data, block, fsbcount);
+ iocache_update_stats(data, 0, 0, CACHE_MISS);
+ return io_channel_read_blk64(data->real, block, count, buf);
+ }
+
+ for (i = 0; i < count; i++, ukey.block++, buf += channel->block_size) {
+ struct cache_node *node;
+ struct iocache_buf *ubuf;
+
+ cache_node_get(&data->cache, &ukey, 0, &node);
+ if (!node) {
+ /* cannot instantiate cache, just do a direct read */
+ retval = io_channel_read_blk64(data->real, ukey.block,
+ 1, buf);
+ if (retval)
+ return retval;
+ iocache_update_stats(data, channel->block_size, 0,
+ CACHE_MISS);
+ continue;
+ }
+
+ ubuf = IOBUF(node);
+ iocache_buf_lock(ubuf);
+ if (!ubuf->uptodate) {
+ retval = io_channel_read_blk64(data->real, ukey.block,
+ 1, ubuf->buf);
+ if (!retval) {
+ ubuf->uptodate = 1;
+ iocache_update_stats(data, channel->block_size,
+ 0, CACHE_MISS);
+ }
+ } else {
+ iocache_update_stats(data, channel->block_size, 0,
+ CACHE_HIT);
+ }
+ if (ubuf->uptodate)
+ memcpy(buf, ubuf->buf, channel->block_size);
+ iocache_buf_unlock(ubuf);
+ cache_node_put(&data->cache, node);
+ if (retval)
+ return retval;
+ }
+
+ return 0;
}
static errcode_t iocache_write_blk64(io_channel channel,
unsigned long long block, int count,
const void *buf)
{
+ struct iocache_key ukey = {
+ .block = block,
+ };
struct iocache_private_data *data = IOCACHE(channel);
+ unsigned long long i;
+ errcode_t retval;
EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
EXT2_CHECK_MAGIC(data, IOCACHE_IO_CHANNEL_MAGIC);
- return io_channel_write_blk64(data->real, block, count, buf);
+ /*
+ * A negative count is an odd-sized write of -count bytes rather than a
+ * number of blocks. Write back and then drop any cached blocks covering
+ * that range, and pass the write straight to the lower channel.
+ */
+ if (count < 0) {
+ uint64_t fsbcount = B_TO_FSB(channel, -count); /* bytes rounded up to whole blocks */
+
+ retval = iocache_flush_range(data, block, fsbcount);
+ if (retval)
+ return retval;
+ iocache_invalidate_range(data, block, fsbcount);
+ iocache_update_stats(data, 0, 0, CACHE_MISS);
+ return io_channel_write_blk64(data->real, block, count, buf);
+ }
+
+ for (i = 0; i < count; i++, ukey.block++, buf += channel->block_size) {
+ struct cache_node *node;
+ struct iocache_buf *ubuf;
+
+ cache_node_get(&data->cache, &ukey, 0, &node);
+ if (!node) {
+ /*
+ * No cache node could be obtained for this block, so write
+ * it directly to the lower channel instead.
+ */
+ retval = io_channel_write_blk64(data->real, ukey.block,
+ 1, buf);
+ if (retval)
+ return retval;
+ iocache_update_stats(data, 0, channel->block_size,
+ CACHE_MISS);
+ continue;
+ }
+
+ ubuf = IOBUF(node);
+ iocache_buf_lock(ubuf);
+ memcpy(ubuf->buf, buf, channel->block_size); /* the write lands only in the cached copy */
+ iocache_update_stats(data, 0, channel->block_size,
+ ubuf->uptodate ? CACHE_HIT : CACHE_MISS);
+ ubuf->dirty = 1; /* iocache_flush_node writes it to the lower channel later */
+ ubuf->uptodate = 1; /* a whole block was copied in, so the buffer is now valid */
+ iocache_buf_unlock(ubuf);
+ cache_node_put(&data->cache, node);
+ }
+
+ return 0;
}
static errcode_t iocache_read_blk(io_channel channel, unsigned long block,
@@ -224,11 +636,17 @@ static errcode_t iocache_discard(io_channel channel, unsigned long long block,
unsigned long long count)
{
struct iocache_private_data *data = IOCACHE(channel);
+ errcode_t retval;
EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
EXT2_CHECK_MAGIC(data, IOCACHE_IO_CHANNEL_MAGIC);
- return io_channel_discard(data->real, block, count);
+ retval = io_channel_discard(data->real, block, count);
+ if (retval)
+ return retval;
+
+ iocache_invalidate_range(data, block, count);
+ return 0;
}
static errcode_t iocache_cache_readahead(io_channel channel,
@@ -247,11 +665,17 @@ static errcode_t iocache_zeroout(io_channel channel, unsigned long long block,
unsigned long long count)
{
struct iocache_private_data *data = IOCACHE(channel);
+ errcode_t retval;
EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
EXT2_CHECK_MAGIC(data, IOCACHE_IO_CHANNEL_MAGIC);
- return io_channel_zeroout(data->real, block, count);
+ retval = io_channel_zeroout(data->real, block, count);
+ if (retval)
+ return retval;
+
+ iocache_invalidate_range(data, block, count);
+ return 0;
}
static errcode_t iocache_get_fd(io_channel channel, int *fd)
@@ -273,6 +697,7 @@ static errcode_t iocache_invalidate_blocks(io_channel channel,
EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
EXT2_CHECK_MAGIC(data, IOCACHE_IO_CHANNEL_MAGIC);
+ iocache_invalidate_range(data, block, count);
return io_channel_invalidate_blocks(data->real, block, count);
}
Powered by blists - more mailing lists