[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <b825b7dc-d4b0-46bb-a427-68eeeb3e0204@linux.alibaba.com>
Date: Mon, 6 Jan 2025 10:27:22 +0800
From: Gao Xiang <hsiangkao@...ux.alibaba.com>
To: Hongzhen Luo <hongzhen@...ux.alibaba.com>, linux-erofs@...ts.ozlabs.org
Cc: linux-kernel@...r.kernel.org
Subject: Re: [RFC PATCH v5 2/4] erofs: introduce the page cache share feature
On 2025/1/5 23:12, Hongzhen Luo wrote:
> Currently, reading files with different paths (or names) but the same
> content will consume multiple copies of the page cache, even if the
> content of these page caches is the same. For example, reading identical
> files (e.g., *.so files) from two different minor versions of container
> images will cost multiple copies of the same page cache, since different
> containers have different mount points. Therefore, sharing the page cache
> for files with the same content can save memory.
>
> This introduces the page cache share feature in erofs. During the mkfs
> phase, the file content is hashed and the hash value is stored in the
> `trusted.erofs.fingerprint` extended attribute. Inodes of files with the
> same `trusted.erofs.fingerprint` are mapped to the same anonymous inode
> (indicated by the `ano_inode` field). When a read request occurs, the
> anonymous inode serves as a "container" whose page cache is shared. The
> actual operations involving the iomap are carried out by the original
> inode which is mapped to the anonymous inode.
>
> Signed-off-by: Hongzhen Luo <hongzhen@...ux.alibaba.com>
> ---
> fs/erofs/Kconfig | 10 ++
> fs/erofs/Makefile | 1 +
> fs/erofs/internal.h | 4 +
> fs/erofs/pagecache_share.c | 228 +++++++++++++++++++++++++++++++++++++
> fs/erofs/pagecache_share.h | 26 +++++
> fs/erofs/super.c | 24 +++-
> 6 files changed, 292 insertions(+), 1 deletion(-)
> create mode 100644 fs/erofs/pagecache_share.c
> create mode 100644 fs/erofs/pagecache_share.h
>
> diff --git a/fs/erofs/Kconfig b/fs/erofs/Kconfig
> index 6ea60661fa55..3aa5f946b5f1 100644
> --- a/fs/erofs/Kconfig
> +++ b/fs/erofs/Kconfig
> @@ -178,3 +178,13 @@ config EROFS_FS_PCPU_KTHREAD_HIPRI
> at higher priority.
>
> If unsure, say N.
> +
> +config EROFS_FS_PAGE_CACHE_SHARE
> + bool "EROFS page cache share support"
> + depends on EROFS_FS
> + default n
> + help
> + This permits EROFS to share page cache for files with same
> + fingerprints.
> +
> + If unsure, say N.
> diff --git a/fs/erofs/Makefile b/fs/erofs/Makefile
> index 4331d53c7109..d035c9063ef8 100644
> --- a/fs/erofs/Makefile
> +++ b/fs/erofs/Makefile
> @@ -9,3 +9,4 @@ erofs-$(CONFIG_EROFS_FS_ZIP_DEFLATE) += decompressor_deflate.o
> erofs-$(CONFIG_EROFS_FS_ZIP_ZSTD) += decompressor_zstd.o
> erofs-$(CONFIG_EROFS_FS_BACKED_BY_FILE) += fileio.o
> erofs-$(CONFIG_EROFS_FS_ONDEMAND) += fscache.o
> +erofs-$(CONFIG_EROFS_FS_PAGE_CACHE_SHARE) += pagecache_share.o
> \ No newline at end of file
> diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
> index 47004eb89838..6c87621d86ba 100644
> --- a/fs/erofs/internal.h
> +++ b/fs/erofs/internal.h
> @@ -280,6 +280,9 @@ struct erofs_inode {
> };
> #endif /* CONFIG_EROFS_FS_ZIP */
> };
> +#ifdef CONFIG_EROFS_FS_PAGE_CACHE_SHARE
> + struct inode *ano_inode;
ano_inode has no extra meaning, we'd better to think out a meaningful name.
> +#endif
> /* the corresponding vfs inode */
> struct inode vfs_inode;
> };
> @@ -376,6 +379,7 @@ extern const struct inode_operations erofs_dir_iops;
>
> extern const struct file_operations erofs_file_fops;
> extern const struct file_operations erofs_dir_fops;
> +extern const struct file_operations erofs_pcshr_fops;
>
> extern const struct iomap_ops z_erofs_iomap_report_ops;
>
> diff --git a/fs/erofs/pagecache_share.c b/fs/erofs/pagecache_share.c
> new file mode 100644
> index 000000000000..703fd17c002c
> --- /dev/null
> +++ b/fs/erofs/pagecache_share.c
> @@ -0,0 +1,228 @@
> +// SPDX-License-Identifier: GPL-2.0-or-later
> +/*
> + * Copyright (C) 2024, Alibaba Cloud
> + */
> +#include <linux/xxhash.h>
> +#include <linux/refcount.h>
> +#include <linux/mount.h>
> +#include <linux/mutex.h>
> +#include "pagecache_share.h"
> +#include "internal.h"
> +#include "xattr.h"
> +
> +#define PCSHR_FPRT_IDX 4
> +#define PCSHR_FPRT_NAME "erofs.fingerprint"
> +#define PCSHR_FPRT_MAXLEN (sizeof(size_t) + 1024)
> +
> +struct erofs_pcshr_counter {
> + struct mutex mutex;
> + struct kref ref;
> + struct vfsmount *mnt;
> +};
> +
> +struct erofs_pcshr_private {
> + char fprt[PCSHR_FPRT_MAXLEN];
> +};
> +
> +static struct erofs_pcshr_counter mnt_counter = {
> + .mutex = __MUTEX_INITIALIZER(mnt_counter.mutex),
> + .mnt = NULL,
> +};
> +
> +static void erofs_pcshr_counter_release(struct kref *ref)
> +{
> + struct erofs_pcshr_counter *counter = container_of(ref,
> + struct erofs_pcshr_counter, ref);
> +
> + DBG_BUGON(!counter->mnt);
> + kern_unmount(counter->mnt);
> + counter->mnt = NULL;
> +}
> +
> +int erofs_pcshr_init_mnt(void)
> +{
> + int ret;
> + struct vfsmount *tmp;
> +
> + mutex_lock(&mnt_counter.mutex);
> + if (!mnt_counter.mnt) {
> + tmp = kern_mount(&erofs_anon_fs_type);
> + if (IS_ERR(tmp)) {
> + ret = PTR_ERR(tmp);
> + goto out;
> + }
> + mnt_counter.mnt = tmp;
> + kref_init(&mnt_counter.ref);
> + } else
> + kref_get(&mnt_counter.ref);
> + ret = 0;
> +out:
> + mutex_unlock(&mnt_counter.mutex);
> + return ret;
> +}
> +
> +void erofs_pcshr_free_mnt(void)
> +{
> + mutex_lock(&mnt_counter.mutex);
> + kref_put(&mnt_counter.ref, erofs_pcshr_counter_release);
> + mutex_unlock(&mnt_counter.mutex);
> +}
> +
> +static int erofs_fprt_eq(struct inode *inode, void *data)
> +{
> + struct erofs_pcshr_private *ano_private = inode->i_private;
> +
> + return ano_private && memcmp(ano_private->fprt, data,
> + sizeof(size_t) + *(size_t *)data) == 0 ? 1 : 0;
> +}
> +
> +static int erofs_fprt_set(struct inode *inode, void *data)
> +{
> + struct erofs_pcshr_private *ano_private;
> +
> + ano_private = kmalloc(sizeof(struct erofs_pcshr_private), GFP_KERNEL);
> + if (!ano_private)
> + return -ENOMEM;
> + memcpy(ano_private, data, sizeof(size_t) + *(size_t *)data);
> + inode->i_private = ano_private;
> + return 0;
> +}
> +
> +int erofs_pcshr_fill_inode(struct inode *inode)
> +{
> + struct erofs_inode *vi = EROFS_I(inode);
> + /* | fingerprint length | fingerprint content | */
> + char fprt[PCSHR_FPRT_MAXLEN];
we shouldn't allocate too large space on stack.
> + struct inode *ano_inode;
> + unsigned long fprt_hash;
> + size_t fprt_len;
> + int ret = -1;
> +
> + vi->ano_inode = NULL;
> + memset(fprt, 0, sizeof(fprt));
> + fprt_len = erofs_getxattr(inode, PCSHR_FPRT_IDX, PCSHR_FPRT_NAME,
> + fprt + sizeof(size_t), PCSHR_FPRT_MAXLEN);
Now, I think it'd be better that users could have a way to configure
the xattr key name. Since in that way, we could reuse fsverity-like
root hash digest likewise.
> + if (fprt_len > 0 && fprt_len <= PCSHR_FPRT_MAXLEN) {
> + *(size_t *)fprt = fprt_len;
> + fprt_hash = xxh32(fprt + sizeof(size_t), fprt_len, 0);
> + ano_inode = iget5_locked(mnt_counter.mnt->mnt_sb, fprt_hash,
> + erofs_fprt_eq, erofs_fprt_set, fprt);
> + DBG_BUGON(!ano_inode);
Why iget5_locked() won't return NULL?
> + vi->ano_inode = ano_inode;
> + if (ano_inode->i_state & I_NEW) {
> + if (erofs_inode_is_data_compressed(vi->datalayout))
> + ano_inode->i_mapping->a_ops = &z_erofs_aops;
> + else
> + ano_inode->i_mapping->a_ops = &erofs_aops;
> + ano_inode->i_size = inode->i_size;
> + unlock_new_inode(ano_inode);
> + }
> + ret = 0;
> + }
> + return ret;
> +}
> +
> +void erofs_pcshr_free_inode(struct inode *inode)
> +{redundant
> + struct erofs_inode *vi = EROFS_I(inode);
> +
> + if (S_ISREG(inode->i_mode) && vi->ano_inode) {
redundant space.
> + iput(vi->ano_inode);
> + vi->ano_inode = NULL;
> + }
> +}
> +
> +static struct file *erofs_pcshr_alloc_file(struct file *file,
> + struct inode *ano_inode)
> +{
> + struct file *ano_file;
> +
> + ano_file = alloc_file_pseudo(ano_inode, mnt_counter.mnt,
> + "[erofs_pcssh_f]", O_RDONLY, &erofs_file_fops);
> + if (IS_ERR(ano_file))
> + return ano_file;
> +
> + file_ra_state_init(&ano_file->f_ra, file->f_mapping);
> + ano_file->private_data = EROFS_I(file_inode(file));
> + return ano_file;
> +}
> +
> +static int erofs_pcshr_file_open(struct inode *inode, struct file *file)
> +{
> + struct file *ano_file;
> + struct inode *ano_inode;
> + struct erofs_inode *vi = EROFS_I(inode);
> +
> + ano_inode = vi->ano_inode;
> + if (!ano_inode)
> + return -EINVAL;
> +
> + ano_file = erofs_pcshr_alloc_file(file, ano_inode);
> + if (IS_ERR(ano_file))
> + return PTR_ERR(ano_file);
> +
> + ihold(ano_inode);
> + file->private_data = (void *)ano_file;
> + return 0;
> +}
> +
> +static int erofs_pcshr_file_release(struct inode *inode, struct file *file)
> +{
> + if (!file->private_data)
> + return -EINVAL;
> +
> + fput((struct file *)file->private_data);
> + file->private_data = NULL;
> + return 0;
> +}
> +
> +static ssize_t erofs_pcshr_file_read_iter(struct kiocb *iocb,
> + struct iov_iter *to)
> +{
> + struct inode __maybe_unused *inode = file_inode(iocb->ki_filp);
> + struct file *file, *ano_file;
> + struct kiocb ano_iocb;
> + ssize_t res;
> +
> + if (!iov_iter_count(to))
> + return 0;
> +#ifdef CONFIG_FS_DAX
> + if (IS_DAX(inode))
> + return erofs_file_fops.read_iter(iocb, to);
> +#endif
> + if (iocb->ki_flags & IOCB_DIRECT)
> + return erofs_file_fops.read_iter(iocb, to);
> +
> + memcpy(&ano_iocb, iocb, sizeof(struct kiocb));
> + file = iocb->ki_filp;
> + ano_file = file->private_data;
> + if (!ano_file)
> + return -EINVAL;
> + ano_iocb.ki_filp = ano_file;
> + res = filemap_read(&ano_iocb, to, 0);
why we need to use this? what is "erofs_pcshr_file_read_iter" used for?
Thanks,
Gao Xiang
Powered by blists - more mailing lists