Message-ID: <175573714721.23206.1974821467460678080.stgit@frogsfrogsfrogs>
Date: Wed, 20 Aug 2025 18:23:47 -0700
From: "Darrick J. Wong" <djwong@...nel.org>
To: tytso@....edu
Cc: John@...ves.net, bernd@...ernd.com, linux-fsdevel@...r.kernel.org,
 linux-ext4@...r.kernel.org, miklos@...redi.hu, joannelkoong@...il.com,
 neal@...pa.dev
Subject: [PATCH 2/6] iocache: add the actual buffer cache

From: Darrick J. Wong <djwong@...nel.org>

Wire up buffer caching into our new caching IO manager.
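
To illustrate (not part of the patch): a minimal sketch of driving a
channel that was opened through the caching manager, using only the
standard io_channel helpers that libext2fs already provides and that
this patch itself calls.  The exercise_iocache() helper and the block
number are made up for illustration; how the backing manager gets
selected is set up earlier in the series and is not shown here.

	#include "ext2fs/ext2fs.h"

	/* hypothetical helper, illustration only -- not part of this patch */
	static errcode_t exercise_iocache(io_channel chan)
	{
		char		*buf;
		errcode_t	retval;

		/* allocate one block-sized, aligned buffer for this channel */
		retval = io_channel_alloc_buf(chan, 0, &buf);
		if (retval)
			return retval;

		/* first read fills the cache; the repeat should be a cache hit */
		retval = io_channel_read_blk64(chan, 100, 1, buf);
		if (!retval)
			retval = io_channel_read_blk64(chan, 100, 1, buf);

		/* dirty the cached copy, then push it out to the real channel */
		if (!retval)
			retval = io_channel_write_blk64(chan, 100, 1, buf);
		if (!retval)
			retval = io_channel_flush(chan);

		ext2fs_free_mem(&buf);
		return retval;
	}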

Signed-off-by: "Darrick J. Wong" <djwong@...nel.org>
---
 lib/support/iocache.c |  469 +++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 447 insertions(+), 22 deletions(-)


diff --git a/lib/support/iocache.c b/lib/support/iocache.c
index 9870780d65ef61..ab879e85d18f2a 100644
--- a/lib/support/iocache.c
+++ b/lib/support/iocache.c
@@ -9,46 +9,288 @@
  * %End-Header%
  */
 #include "config.h"
+#include <assert.h>
+#include <stdbool.h>
+#include <pthread.h>
+#include <unistd.h>
 #include "ext2fs/ext2_fs.h"
 #include "ext2fs/ext2fs.h"
 #include "ext2fs/ext2fsP.h"
 #include "support/iocache.h"
+#include "support/list.h"
+#include "support/cache.h"
 
 #define IOCACHE_IO_CHANNEL_MAGIC	0x424F5254	/* BORT */
 
 static io_manager iocache_backing_manager;
 
+static inline uint64_t B_TO_FSBT(io_channel channel, uint64_t number) {
+	return number / channel->block_size;
+}
+
+static inline uint64_t B_TO_FSB(io_channel channel, uint64_t number) {
+	return (number + channel->block_size - 1) / channel->block_size;
+}
+
 struct iocache_private_data {
 	int			magic;
-	io_channel		real;
+	io_channel		real;		/* lower level io channel */
+	io_channel		channel;	/* cache channel */
+	struct cache		cache;
+	pthread_mutex_t		stats_lock;
+	struct struct_io_stats	io_stats;
+	unsigned long long	write_errors;
 };
 
+#define IOCACHEDATA(cache) \
+	(container_of(cache, struct iocache_private_data, cache))
+
 static struct iocache_private_data *IOCACHE(io_channel channel)
 {
 	return (struct iocache_private_data *)channel->private_data;
 }
 
-static errcode_t iocache_read_error(io_channel channel, unsigned long block,
-				    int count, void *data, size_t size,
-				    int actual_bytes_read, errcode_t error)
+struct iocache_buf {
+	struct cache_node	node;
+	struct list_head	list;
+	blk64_t			block;
+	void			*buf;
+	errcode_t		write_error;
+	unsigned int		uptodate:1;
+	unsigned int		dirty:1;
+};
+
+static inline void iocache_buf_lock(struct iocache_buf *ubuf)
 {
-	io_channel iocache_channel = channel->app_data;
+	pthread_mutex_lock(&ubuf->node.cn_mutex);
+}
 
-	return iocache_channel->read_error(iocache_channel, block, count, data,
-					   size, actual_bytes_read, error);
+static inline void iocache_buf_unlock(struct iocache_buf *ubuf)
+{
+	pthread_mutex_unlock(&ubuf->node.cn_mutex);
 }
 
-static errcode_t iocache_write_error(io_channel channel, unsigned long block,
-				     int count, const void *data, size_t size,
-				     int actual_bytes_written,
-				     errcode_t error)
+struct iocache_key {
+	blk64_t			block;
+};
+
+#define IOKEY(key)	((struct iocache_key *)(key))
+#define IOBUF(node)	(container_of((node), struct iocache_buf, node))
+
+static unsigned int
+iocache_hash(cache_key_t key, unsigned int hashsize, unsigned int hashshift)
 {
-	io_channel iocache_channel = channel->app_data;
+	uint64_t	hashval = IOKEY(key)->block;
+	uint64_t	tmp;
 
-	return iocache_channel->write_error(iocache_channel, block, count, data,
-					    size, actual_bytes_written, error);
+	tmp = hashval ^ (GOLDEN_RATIO_PRIME + hashval) / CACHE_LINE_SIZE;
+	tmp = tmp ^ ((tmp ^ GOLDEN_RATIO_PRIME) >> hashshift);
+	return tmp % hashsize;
 }
 
+static int iocache_compare(struct cache_node *node, cache_key_t key)
+{
+	struct iocache_buf *ubuf = IOBUF(node);
+	struct iocache_key *ukey = IOKEY(key);
+
+	if (ubuf->block == ukey->block)
+		return CACHE_HIT;
+
+	return CACHE_MISS;
+}
+
+static struct cache_node *iocache_alloc_node(struct cache *cache,
+					     cache_key_t key)
+{
+	struct iocache_private_data *data = IOCACHEDATA(cache);
+	struct iocache_key *ukey = IOKEY(key);
+	struct iocache_buf *ubuf;
+	errcode_t retval;
+
+	retval = ext2fs_get_mem(sizeof(struct iocache_buf), &ubuf);
+	if (retval)
+		return NULL;
+	memset(ubuf, 0, sizeof(*ubuf));
+
+	retval = io_channel_alloc_buf(data->channel, 0, &ubuf->buf);
+	if (retval) {
+		free(ubuf);
+		return NULL;
+	}
+	memset(ubuf->buf, 0, data->channel->block_size);
+
+	INIT_LIST_HEAD(&ubuf->list);
+	ubuf->block = ukey->block;
+	return &ubuf->node;
+}
+
+static bool iocache_flush_node(struct cache *cache, struct cache_node *node)
+{
+	struct iocache_private_data *data = IOCACHEDATA(cache);
+	struct iocache_buf *ubuf = IOBUF(node);
+	errcode_t retval;
+
+	if (ubuf->dirty) {
+		retval = io_channel_write_blk64(data->real, ubuf->block, 1,
+						ubuf->buf);
+		if (retval) {
+			ubuf->write_error = retval;
+			data->write_errors++;
+		} else {
+			ubuf->dirty = 0;
+			ubuf->write_error = 0;
+		}
+	}
+
+	return ubuf->dirty;
+}
+
+static void iocache_relse(struct cache *cache, struct cache_node *node)
+{
+	struct iocache_buf *ubuf = IOBUF(node);
+
+	assert(!ubuf->dirty);
+
+	ext2fs_free_mem(&ubuf->buf);
+	ext2fs_free_mem(&ubuf);
+}
+
+static unsigned int iocache_bulkrelse(struct cache *cache,
+				      struct list_head *list)
+{
+	struct cache_node *cn, *n;
+	int count = 0;
+
+	if (list_empty(list))
+		return 0;
+
+	list_for_each_entry_safe(cn, n, list, cn_mru) {
+		iocache_relse(cache, cn);
+		count++;
+	}
+
+	return count;
+}
+
+/* Flush all dirty buffers in the cache to disk. */
+static errcode_t iocache_flush_cache(struct iocache_private_data *data)
+{
+	return cache_flush(&data->cache) ? 0 : EIO;
+}
+
+/* Flush all dirty buffers in this range of the cache to disk. */
+static errcode_t iocache_flush_range(struct iocache_private_data *data,
+				     blk64_t block, uint64_t count)
+{
+	uint64_t i;
+	bool still_dirty = false;
+
+	for (i = 0; i < count; i++) {
+		struct iocache_key ukey = {
+			.block = block + i,
+		};
+		struct cache_node *node;
+
+		cache_node_get(&data->cache, &ukey, CACHE_GET_INCORE,
+			       &node);
+		if (!node)
+			continue;
+
+		/* cache_flush holds cn_mutex across the node flush */
+		pthread_mutex_lock(&node->cn_mutex);
+		still_dirty |= iocache_flush_node(&data->cache, node);
+		pthread_mutex_unlock(&node->cn_mutex);
+
+		cache_node_put(&data->cache, node);
+	}
+
+	return still_dirty ? EIO : 0;
+}
+
+static void iocache_add_list(struct cache *cache, struct cache_node *node,
+			     void *data)
+{
+	struct iocache_buf *ubuf = IOBUF(node);
+	struct list_head *list = data;
+
+	assert(node->cn_count == 0 || node->cn_count == 1);
+
+	iocache_buf_lock(ubuf);
+	cache_node_grab(cache, node);
+	list_add_tail(&ubuf->list, list);
+	iocache_buf_unlock(ubuf);
+}
+
+static void iocache_invalidate_bufs(struct iocache_private_data *data,
+				    struct list_head *list)
+{
+	struct iocache_buf *ubuf, *n;
+
+	list_for_each_entry_safe(ubuf, n, list, list) {
+		struct iocache_key ukey = {
+			.block = ubuf->block,
+		};
+
+		assert(ubuf->node.cn_count == 1);
+
+		iocache_buf_lock(ubuf);
+		ubuf->dirty = 0;
+		list_del_init(&ubuf->list);
+		iocache_buf_unlock(ubuf);
+
+		cache_node_put(&data->cache, &ubuf->node);
+		cache_node_purge(&data->cache, &ukey, &ubuf->node);
+	}
+}
+
+/*
+ * Remove all blocks from the cache.  Dirty contents are discarded.  Buffer
+ * refcounts must be zero!
+ */
+static void iocache_invalidate_cache(struct iocache_private_data *data)
+{
+	LIST_HEAD(list);
+
+	cache_walk(&data->cache, iocache_add_list, &list);
+	iocache_invalidate_bufs(data, &list);
+}
+
+/*
+ * Remove a range of blocks from the cache.  Dirty contents are discarded.
+ * Buffer refcounts must be zero!
+ */
+static void iocache_invalidate_range(struct iocache_private_data *data,
+				     blk64_t block, uint64_t count)
+{
+	LIST_HEAD(list);
+	uint64_t i;
+
+	for (i = 0; i < count; i++) {
+		struct iocache_key ukey = {
+			.block = block + i,
+		};
+		struct cache_node *node;
+
+		cache_node_get(&data->cache, &ukey, CACHE_GET_INCORE,
+			       &node);
+		if (node) {
+			iocache_add_list(&data->cache, node, &list);
+			cache_node_put(&data->cache, node);
+		}
+	}
+	iocache_invalidate_bufs(data, &list);
+}
+
+static const struct cache_operations iocache_ops = {
+	.hash		= iocache_hash,
+	.alloc		= iocache_alloc_node,
+	.flush		= iocache_flush_node,
+	.relse		= iocache_relse,
+	.compare	= iocache_compare,
+	.bulkrelse	= iocache_bulkrelse,
+	.resize		= cache_gradual_resize,
+};
+
 static errcode_t iocache_open(const char *name, int flags, io_channel *channel)
 {
 	io_channel	io = NULL;
@@ -65,6 +307,9 @@ static errcode_t iocache_open(const char *name, int flags, io_channel *channel)
 	if (retval)
 		return retval;
 
+	/* disable any static cache in the lower io manager */
+	real->manager->set_option(real, "cache", "off");
+
 	retval = ext2fs_get_mem(sizeof(struct struct_io_channel), &io);
 	if (retval)
 		goto out_backing;
@@ -76,12 +321,19 @@ static errcode_t iocache_open(const char *name, int flags, io_channel *channel)
 		goto out_channel;
 	memset(data, 0, sizeof(struct iocache_private_data));
 	data->magic = IOCACHE_IO_CHANNEL_MAGIC;
+	data->io_stats.num_fields = 4;
+	data->channel = io;
 
 	io->manager = iocache_io_manager;
 	retval = ext2fs_get_mem(strlen(name) + 1, &io->name);
 	if (retval)
 		goto out_data;
 
+	retval = cache_init(CACHE_CAN_SHRINK, 1U << 10, &iocache_ops,
+			    &data->cache);
+	if (retval)
+		goto out_name;
+
 	strcpy(io->name, name);
 	io->private_data = data;
 	io->block_size = real->block_size;
@@ -91,12 +343,14 @@ static errcode_t iocache_open(const char *name, int flags, io_channel *channel)
 	io->flags = real->flags;
 	data->real = real;
 	real->app_data = io;
-	real->read_error = iocache_read_error;
-	real->write_error = iocache_write_error;
+
+	pthread_mutex_init(&data->stats_lock, NULL);
 
 	*channel = io;
 	return 0;
 
+out_name:
+	ext2fs_free_mem(&io->name);
 out_data:
 	ext2fs_free_mem(&data);
 out_channel:
@@ -116,6 +370,10 @@ static errcode_t iocache_close(io_channel channel)
 
 	if (--channel->refcount > 0)
 		return 0;
+	pthread_mutex_destroy(&data->stats_lock);
+	cache_flush(&data->cache);
+	cache_purge(&data->cache);
+	cache_destroy(&data->cache);
 	if (data->real)
 		retval = io_channel_close(data->real);
 	ext2fs_free_mem(&channel->private_data);
@@ -134,6 +392,11 @@ static errcode_t iocache_set_blksize(io_channel channel, int blksize)
 	EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
 	EXT2_CHECK_MAGIC(data, IOCACHE_IO_CHANNEL_MAGIC);
 
+	retval = iocache_flush_cache(data);
+	if (retval)
+		return retval;
+	iocache_invalidate_cache(data);
+
 	retval = io_channel_set_blksize(data->real, blksize);
 	if (retval)
 		return retval;
@@ -145,21 +408,34 @@ static errcode_t iocache_set_blksize(io_channel channel, int blksize)
 static errcode_t iocache_flush(io_channel channel)
 {
 	struct iocache_private_data *data = IOCACHE(channel);
+	errcode_t retval = 0;
+	errcode_t retval2;
 
 	EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
 	EXT2_CHECK_MAGIC(data, IOCACHE_IO_CHANNEL_MAGIC);
 
-	return io_channel_flush(data->real);
+	retval = iocache_flush_cache(data);
+	retval2 = io_channel_flush(data->real);
+	if (retval)
+		return retval;
+	return retval2;
 }
 
 static errcode_t iocache_write_byte(io_channel channel, unsigned long offset,
 				    int count, const void *buf)
 {
 	struct iocache_private_data *data = IOCACHE(channel);
+	blk64_t bno = B_TO_FSBT(channel, offset);
+	blk64_t next_bno = B_TO_FSB(channel, offset + count);
+	errcode_t retval;
 
 	EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
 	EXT2_CHECK_MAGIC(data, IOCACHE_IO_CHANNEL_MAGIC);
 
+	retval = iocache_flush_range(data, bno, next_bno - bno);
+	if (retval)
+		return retval;
+	iocache_invalidate_range(data, bno, next_bno - bno);
 	return io_channel_write_byte(data->real, offset, count, buf);
 }
 
@@ -170,6 +446,16 @@ static errcode_t iocache_set_option(io_channel channel, const char *option,
 
 	EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
 	EXT2_CHECK_MAGIC(data, IOCACHE_IO_CHANNEL_MAGIC);
+	errcode_t retval;
+
+	/* don't let unix io cache options leak through */
+	if (!strcmp(option, "cache_blocks") || !strcmp(option, "cache"))
+		return 0;
+
+	retval = iocache_flush_cache(data);
+	if (retval)
+		return retval;
+	iocache_invalidate_cache(data);
 
 	return data->real->manager->set_option(data->real, option, arg);
 }
@@ -181,31 +467,157 @@ static errcode_t iocache_get_stats(io_channel channel, io_stats *io_stats)
 	EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
 	EXT2_CHECK_MAGIC(data, IOCACHE_IO_CHANNEL_MAGIC);
 
-	return data->real->manager->get_stats(data->real, io_stats);
+	/*
+	 * Yes, io_stats is a double-pointer, and we let the caller scribble on
+	 * our stats struct WITHOUT LOCKING!
+	 */
+	if (io_stats)
+		*io_stats = &data->io_stats;
+	return 0;
+}
+
+static void iocache_update_stats(struct iocache_private_data *data,
+				 unsigned long long bytes_read,
+				 unsigned long long bytes_written,
+				 int cache_op)
+{
+	pthread_mutex_lock(&data->stats_lock);
+	data->io_stats.bytes_read += bytes_read;
+	data->io_stats.bytes_written += bytes_written;
+	if (cache_op == CACHE_HIT)
+		data->io_stats.cache_hits++;
+	else
+		data->io_stats.cache_misses++;
+	pthread_mutex_unlock(&data->stats_lock);
 }
 
 static errcode_t iocache_read_blk64(io_channel channel,
 				    unsigned long long block, int count,
 				    void *buf)
 {
+	struct iocache_key ukey = {
+		.block = block,
+	};
 	struct iocache_private_data *data = IOCACHE(channel);
+	unsigned long long i;
+	errcode_t retval = 0;
 
 	EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
 	EXT2_CHECK_MAGIC(data, IOCACHE_IO_CHANNEL_MAGIC);
 
-	return io_channel_read_blk64(data->real, block, count, buf);
+	/*
+	 * If we're doing an odd-sized read, flush out the cache and then do a
+	 * direct read.
+	 */
+	if (count < 0) {
+		uint64_t fsbcount = B_TO_FSB(channel, -count);
+
+		retval = iocache_flush_range(data, block, fsbcount);
+		if (retval)
+			return retval;
+		iocache_invalidate_range(data, block, fsbcount);
+		iocache_update_stats(data, 0, 0, CACHE_MISS);
+		return io_channel_read_blk64(data->real, block, count, buf);
+	}
+
+	for (i = 0; i < count; i++, ukey.block++, buf += channel->block_size) {
+		struct cache_node *node;
+		struct iocache_buf *ubuf;
+
+		cache_node_get(&data->cache, &ukey, 0, &node);
+		if (!node) {
+			/* cannot instantiate cache, just do a direct read */
+			retval = io_channel_read_blk64(data->real, ukey.block,
+						       1, buf);
+			if (retval)
+				return retval;
+			iocache_update_stats(data, channel->block_size, 0,
+					     CACHE_MISS);
+			continue;
+		}
+
+		ubuf = IOBUF(node);
+		iocache_buf_lock(ubuf);
+		if (!ubuf->uptodate) {
+			retval = io_channel_read_blk64(data->real, ukey.block,
+						       1, ubuf->buf);
+			if (!retval) {
+				ubuf->uptodate = 1;
+				iocache_update_stats(data, channel->block_size,
+						     0, CACHE_MISS);
+			}
+		} else {
+			iocache_update_stats(data, channel->block_size, 0,
+					     CACHE_HIT);
+		}
+		if (ubuf->uptodate)
+			memcpy(buf, ubuf->buf, channel->block_size);
+		iocache_buf_unlock(ubuf);
+		cache_node_put(&data->cache, node);
+		if (retval)
+			return retval;
+	}
+
+	return 0;
 }
 
 static errcode_t iocache_write_blk64(io_channel channel,
 				     unsigned long long block, int count,
 				     const void *buf)
 {
+	struct iocache_key ukey = {
+		.block = block,
+	};
 	struct iocache_private_data *data = IOCACHE(channel);
+	unsigned long long i;
+	errcode_t retval;
 
 	EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
 	EXT2_CHECK_MAGIC(data, IOCACHE_IO_CHANNEL_MAGIC);
 
-	return io_channel_write_blk64(data->real, block, count, buf);
+	/*
+	 * If we're doing an odd-sized write, flush out the cache and then do a
+	 * direct write.
+	 */
+	if (count < 0) {
+		uint64_t fsbcount = B_TO_FSB(channel, -count);
+
+		retval = iocache_flush_range(data, block, fsbcount);
+		if (retval)
+			return retval;
+		iocache_invalidate_range(data, block, fsbcount);
+		iocache_update_stats(data, 0, 0, CACHE_MISS);
+		return io_channel_write_blk64(data->real, block, count, buf);
+	}
+
+	for (i = 0; i < count; i++, ukey.block++, buf += channel->block_size) {
+		struct cache_node *node;
+		struct iocache_buf *ubuf;
+
+		cache_node_get(&data->cache, &ukey, 0, &node);
+		if (!node) {
+			/* cannot instantiate cache, do a direct write */
+			retval = io_channel_write_blk64(data->real, ukey.block,
+							1, buf);
+			if (retval)
+				return retval;
+			iocache_update_stats(data, 0, channel->block_size,
+					     CACHE_MISS);
+			continue;
+		}
+
+		ubuf = IOBUF(node);
+		iocache_buf_lock(ubuf);
+		memcpy(ubuf->buf, buf, channel->block_size);
+		iocache_update_stats(data, 0, channel->block_size,
+				     ubuf->uptodate ? CACHE_HIT : CACHE_MISS);
+		ubuf->dirty = 1;
+		ubuf->uptodate = 1;
+		iocache_buf_unlock(ubuf);
+		cache_node_put(&data->cache, node);
+	}
+
+	return 0;
 }
 
 static errcode_t iocache_read_blk(io_channel channel, unsigned long block,
@@ -224,11 +636,17 @@ static errcode_t iocache_discard(io_channel channel, unsigned long long block,
 				 unsigned long long count)
 {
 	struct iocache_private_data *data = IOCACHE(channel);
+	errcode_t retval;
 
 	EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
 	EXT2_CHECK_MAGIC(data, IOCACHE_IO_CHANNEL_MAGIC);
 
-	return io_channel_discard(data->real, block, count);
+	retval = io_channel_discard(data->real, block, count);
+	if (retval)
+		return retval;
+
+	iocache_invalidate_range(data, block, count);
+	return 0;
 }
 
 static errcode_t iocache_cache_readahead(io_channel channel,
@@ -247,11 +665,17 @@ static errcode_t iocache_zeroout(io_channel channel, unsigned long long block,
 				 unsigned long long count)
 {
 	struct iocache_private_data *data = IOCACHE(channel);
+	errcode_t retval;
 
 	EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
 	EXT2_CHECK_MAGIC(data, IOCACHE_IO_CHANNEL_MAGIC);
 
-	return io_channel_zeroout(data->real, block, count);
+	retval = io_channel_zeroout(data->real, block, count);
+	if (retval)
+		return retval;
+
+	iocache_invalidate_range(data, block, count);
+	return 0;
 }
 
 static errcode_t iocache_get_fd(io_channel channel, int *fd)
@@ -273,6 +697,7 @@ static errcode_t iocache_invalidate_blocks(io_channel channel,
 	EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
 	EXT2_CHECK_MAGIC(data, IOCACHE_IO_CHANNEL_MAGIC);
 
+	iocache_invalidate_range(data, block, count);
 	return io_channel_invalidate_blocks(data->real, block, count);
 }
 

