[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20250105151208.3797385-3-hongzhen@linux.alibaba.com>
Date: Sun, 5 Jan 2025 23:12:06 +0800
From: Hongzhen Luo <hongzhen@...ux.alibaba.com>
To: linux-erofs@...ts.ozlabs.org
Cc: linux-kernel@...r.kernel.org,
Hongzhen Luo <hongzhen@...ux.alibaba.com>
Subject: [RFC PATCH v5 2/4] erofs: introduce the page cache share feature
Currently, reading files with different paths (or names) but the same
content will consume multiple copies of the page cache, even if the
content of these page caches is the same. For example, reading identical
files (e.g., *.so files) from two different minor versions of container
images will cost multiple copies of the same page cache, since different
containers have different mount points. Therefore, sharing the page cache
for files with the same content can save memory.
This introduces the page cache share feature in erofs. During the mkfs
phase, the file content is hashed and the hash value is stored in the
`trusted.erofs.fingerprint` extended attribute. Inodes of files with the
same `trusted.erofs.fingerprint` are mapped to the same anonymous inode
(indicated by the `ano_inode` field). When a read request occurs, the
anonymous inode serves as a "container" whose page cache is shared. The
actual operations involving the iomap are carried out by the original
inode which is mapped to the anonymous inode.
Signed-off-by: Hongzhen Luo <hongzhen@...ux.alibaba.com>
---
fs/erofs/Kconfig | 10 ++
fs/erofs/Makefile | 1 +
fs/erofs/internal.h | 4 +
fs/erofs/pagecache_share.c | 228 +++++++++++++++++++++++++++++++++++++
fs/erofs/pagecache_share.h | 26 +++++
fs/erofs/super.c | 24 +++-
6 files changed, 292 insertions(+), 1 deletion(-)
create mode 100644 fs/erofs/pagecache_share.c
create mode 100644 fs/erofs/pagecache_share.h
diff --git a/fs/erofs/Kconfig b/fs/erofs/Kconfig
index 6ea60661fa55..3aa5f946b5f1 100644
--- a/fs/erofs/Kconfig
+++ b/fs/erofs/Kconfig
@@ -178,3 +178,13 @@ config EROFS_FS_PCPU_KTHREAD_HIPRI
at higher priority.
If unsure, say N.
+
+config EROFS_FS_PAGE_CACHE_SHARE
+ bool "EROFS page cache share support"
+ depends on EROFS_FS
+ default n
+ help
+ This permits EROFS to share page cache for files with same
+ fingerprints.
+
+ If unsure, say N.
diff --git a/fs/erofs/Makefile b/fs/erofs/Makefile
index 4331d53c7109..d035c9063ef8 100644
--- a/fs/erofs/Makefile
+++ b/fs/erofs/Makefile
@@ -9,3 +9,4 @@ erofs-$(CONFIG_EROFS_FS_ZIP_DEFLATE) += decompressor_deflate.o
erofs-$(CONFIG_EROFS_FS_ZIP_ZSTD) += decompressor_zstd.o
erofs-$(CONFIG_EROFS_FS_BACKED_BY_FILE) += fileio.o
erofs-$(CONFIG_EROFS_FS_ONDEMAND) += fscache.o
+erofs-$(CONFIG_EROFS_FS_PAGE_CACHE_SHARE) += pagecache_share.o
\ No newline at end of file
diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
index 47004eb89838..6c87621d86ba 100644
--- a/fs/erofs/internal.h
+++ b/fs/erofs/internal.h
@@ -280,6 +280,9 @@ struct erofs_inode {
};
#endif /* CONFIG_EROFS_FS_ZIP */
};
+#ifdef CONFIG_EROFS_FS_PAGE_CACHE_SHARE
+ struct inode *ano_inode;
+#endif
/* the corresponding vfs inode */
struct inode vfs_inode;
};
@@ -376,6 +379,7 @@ extern const struct inode_operations erofs_dir_iops;
extern const struct file_operations erofs_file_fops;
extern const struct file_operations erofs_dir_fops;
+extern const struct file_operations erofs_pcshr_fops;
extern const struct iomap_ops z_erofs_iomap_report_ops;
diff --git a/fs/erofs/pagecache_share.c b/fs/erofs/pagecache_share.c
new file mode 100644
index 000000000000..703fd17c002c
--- /dev/null
+++ b/fs/erofs/pagecache_share.c
@@ -0,0 +1,228 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2024, Alibaba Cloud
+ */
+#include <linux/xxhash.h>
+#include <linux/refcount.h>
+#include <linux/mount.h>
+#include <linux/mutex.h>
+#include "pagecache_share.h"
+#include "internal.h"
+#include "xattr.h"
+
+#define PCSHR_FPRT_IDX 4
+#define PCSHR_FPRT_NAME "erofs.fingerprint"
+#define PCSHR_FPRT_MAXLEN (sizeof(size_t) + 1024)
+
+struct erofs_pcshr_counter {
+ struct mutex mutex;
+ struct kref ref;
+ struct vfsmount *mnt;
+};
+
+struct erofs_pcshr_private {
+ char fprt[PCSHR_FPRT_MAXLEN];
+};
+
+static struct erofs_pcshr_counter mnt_counter = {
+ .mutex = __MUTEX_INITIALIZER(mnt_counter.mutex),
+ .mnt = NULL,
+};
+
+static void erofs_pcshr_counter_release(struct kref *ref)
+{
+ struct erofs_pcshr_counter *counter = container_of(ref,
+ struct erofs_pcshr_counter, ref);
+
+ DBG_BUGON(!counter->mnt);
+ kern_unmount(counter->mnt);
+ counter->mnt = NULL;
+}
+
+int erofs_pcshr_init_mnt(void)
+{
+ int ret;
+ struct vfsmount *tmp;
+
+ mutex_lock(&mnt_counter.mutex);
+ if (!mnt_counter.mnt) {
+ tmp = kern_mount(&erofs_anon_fs_type);
+ if (IS_ERR(tmp)) {
+ ret = PTR_ERR(tmp);
+ goto out;
+ }
+ mnt_counter.mnt = tmp;
+ kref_init(&mnt_counter.ref);
+ } else
+ kref_get(&mnt_counter.ref);
+ ret = 0;
+out:
+ mutex_unlock(&mnt_counter.mutex);
+ return ret;
+}
+
+void erofs_pcshr_free_mnt(void)
+{
+ mutex_lock(&mnt_counter.mutex);
+ kref_put(&mnt_counter.ref, erofs_pcshr_counter_release);
+ mutex_unlock(&mnt_counter.mutex);
+}
+
+static int erofs_fprt_eq(struct inode *inode, void *data)
+{
+ struct erofs_pcshr_private *ano_private = inode->i_private;
+
+ return ano_private && memcmp(ano_private->fprt, data,
+ sizeof(size_t) + *(size_t *)data) == 0 ? 1 : 0;
+}
+
+static int erofs_fprt_set(struct inode *inode, void *data)
+{
+ struct erofs_pcshr_private *ano_private;
+
+ ano_private = kmalloc(sizeof(struct erofs_pcshr_private), GFP_KERNEL);
+ if (!ano_private)
+ return -ENOMEM;
+ memcpy(ano_private, data, sizeof(size_t) + *(size_t *)data);
+ inode->i_private = ano_private;
+ return 0;
+}
+
+int erofs_pcshr_fill_inode(struct inode *inode)
+{
+ struct erofs_inode *vi = EROFS_I(inode);
+ /* | fingerprint length | fingerprint content | */
+ char fprt[PCSHR_FPRT_MAXLEN];
+ struct inode *ano_inode;
+ unsigned long fprt_hash;
+ size_t fprt_len;
+ int ret = -1;
+
+ vi->ano_inode = NULL;
+ memset(fprt, 0, sizeof(fprt));
+ fprt_len = erofs_getxattr(inode, PCSHR_FPRT_IDX, PCSHR_FPRT_NAME,
+ fprt + sizeof(size_t), PCSHR_FPRT_MAXLEN);
+ if (fprt_len > 0 && fprt_len <= PCSHR_FPRT_MAXLEN) {
+ *(size_t *)fprt = fprt_len;
+ fprt_hash = xxh32(fprt + sizeof(size_t), fprt_len, 0);
+ ano_inode = iget5_locked(mnt_counter.mnt->mnt_sb, fprt_hash,
+ erofs_fprt_eq, erofs_fprt_set, fprt);
+ DBG_BUGON(!ano_inode);
+ vi->ano_inode = ano_inode;
+ if (ano_inode->i_state & I_NEW) {
+ if (erofs_inode_is_data_compressed(vi->datalayout))
+ ano_inode->i_mapping->a_ops = &z_erofs_aops;
+ else
+ ano_inode->i_mapping->a_ops = &erofs_aops;
+ ano_inode->i_size = inode->i_size;
+ unlock_new_inode(ano_inode);
+ }
+ ret = 0;
+ }
+ return ret;
+}
+
+void erofs_pcshr_free_inode(struct inode *inode)
+{
+ struct erofs_inode *vi = EROFS_I(inode);
+
+ if (S_ISREG(inode->i_mode) && vi->ano_inode) {
+ iput(vi->ano_inode);
+ vi->ano_inode = NULL;
+ }
+}
+
+static struct file *erofs_pcshr_alloc_file(struct file *file,
+ struct inode *ano_inode)
+{
+ struct file *ano_file;
+
+ ano_file = alloc_file_pseudo(ano_inode, mnt_counter.mnt,
+ "[erofs_pcssh_f]", O_RDONLY, &erofs_file_fops);
+ if (IS_ERR(ano_file))
+ return ano_file;
+
+ file_ra_state_init(&ano_file->f_ra, file->f_mapping);
+ ano_file->private_data = EROFS_I(file_inode(file));
+ return ano_file;
+}
+
+static int erofs_pcshr_file_open(struct inode *inode, struct file *file)
+{
+ struct file *ano_file;
+ struct inode *ano_inode;
+ struct erofs_inode *vi = EROFS_I(inode);
+
+ ano_inode = vi->ano_inode;
+ if (!ano_inode)
+ return -EINVAL;
+
+ ano_file = erofs_pcshr_alloc_file(file, ano_inode);
+ if (IS_ERR(ano_file))
+ return PTR_ERR(ano_file);
+
+ ihold(ano_inode);
+ file->private_data = (void *)ano_file;
+ return 0;
+}
+
+static int erofs_pcshr_file_release(struct inode *inode, struct file *file)
+{
+ if (!file->private_data)
+ return -EINVAL;
+
+ fput((struct file *)file->private_data);
+ file->private_data = NULL;
+ return 0;
+}
+
+static ssize_t erofs_pcshr_file_read_iter(struct kiocb *iocb,
+ struct iov_iter *to)
+{
+ struct inode __maybe_unused *inode = file_inode(iocb->ki_filp);
+ struct file *file, *ano_file;
+ struct kiocb ano_iocb;
+ ssize_t res;
+
+ if (!iov_iter_count(to))
+ return 0;
+#ifdef CONFIG_FS_DAX
+ if (IS_DAX(inode))
+ return erofs_file_fops.read_iter(iocb, to);
+#endif
+ if (iocb->ki_flags & IOCB_DIRECT)
+ return erofs_file_fops.read_iter(iocb, to);
+
+ memcpy(&ano_iocb, iocb, sizeof(struct kiocb));
+ file = iocb->ki_filp;
+ ano_file = file->private_data;
+ if (!ano_file)
+ return -EINVAL;
+ ano_iocb.ki_filp = ano_file;
+ res = filemap_read(&ano_iocb, to, 0);
+ memcpy(iocb, &ano_iocb, sizeof(struct kiocb));
+ iocb->ki_filp = file;
+ file_accessed(file);
+ return res;
+}
+
+extern const struct vm_operations_struct generic_file_vm_ops;
+
+static int erofs_pcshr_mmap(struct file *file, struct vm_area_struct *vma)
+{
+ struct file *ano_file = file->private_data;
+
+ vma_set_file(vma, ano_file);
+ vma->vm_ops = &generic_file_vm_ops;
+ return 0;
+}
+
+const struct file_operations erofs_pcshr_fops = {
+ .open = erofs_pcshr_file_open,
+ .llseek = generic_file_llseek,
+ .read_iter = erofs_pcshr_file_read_iter,
+ .mmap = erofs_pcshr_mmap,
+ .release = erofs_pcshr_file_release,
+ .get_unmapped_area = thp_get_unmapped_area,
+ .splice_read = filemap_splice_read,
+};
diff --git a/fs/erofs/pagecache_share.h b/fs/erofs/pagecache_share.h
new file mode 100644
index 000000000000..f3889d6889e5
--- /dev/null
+++ b/fs/erofs/pagecache_share.h
@@ -0,0 +1,26 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (C) 2024, Alibaba Cloud
+ */
+#ifndef __EROFS_PAGECACHE_SHARE_H
+#define __EROFS_PAGECACHE_SHARE_H
+
+#include <linux/fs.h>
+
+#ifdef CONFIG_EROFS_FS_PAGE_CACHE_SHARE
+
+int erofs_pcshr_init_mnt(void);
+void erofs_pcshr_free_mnt(void);
+int erofs_pcshr_fill_inode(struct inode *inode);
+void erofs_pcshr_free_inode(struct inode *inode);
+
+#else
+
+static inline int erofs_pcshr_init_mnt(void) { return 0; }
+static inline void erofs_pcshr_free_mnt(void) {}
+static inline int erofs_pcshr_fill_inode(struct inode *inode) { return -1; }
+static inline void erofs_pcshr_free_inode(struct inode *inode) {}
+
+#endif // CONFIG_EROFS_FS_PAGE_CACHE_SHARE
+
+#endif
diff --git a/fs/erofs/super.c b/fs/erofs/super.c
index 25d2c2b44d0a..b4ce07dc931c 100644
--- a/fs/erofs/super.c
+++ b/fs/erofs/super.c
@@ -853,9 +853,31 @@ static struct file_system_type erofs_fs_type = {
};
MODULE_ALIAS_FS("erofs");
+#ifdef CONFIG_EROFS_FS_PAGE_CACHE_SHARE
+static void erofs_free_anon_inode(struct inode *inode)
+{
+ kfree(inode->i_private);
+ inode->i_private = NULL;
+}
+#else
+#define erofs_free_anon_inode NULL
+#endif
+
+static const struct super_operations erofs_anon_sops = {
+ .statfs = simple_statfs,
+ .free_inode = erofs_free_anon_inode,
+};
+
+
static int erofs_anon_init_fs_context(struct fs_context *fc)
{
- return init_pseudo(fc, EROFS_SUPER_MAGIC) ? 0 : -ENOMEM;
+ struct pseudo_fs_context *ctx;
+
+ ctx = init_pseudo(fc, EROFS_SUPER_MAGIC);
+ if (ctx)
+ ctx->ops = &erofs_anon_sops;
+
+ return ctx ? 0 : -ENOMEM;
}
struct file_system_type erofs_anon_fs_type = {
--
2.43.5
Powered by blists - more mailing lists