linux-kernel - Re: [RFC PATCH v5 2/4] erofs: introduce the page cache share feature

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite for Android: free password hash cracker in your pocket
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <7801bdfb-3052-44f0-bbd0-c4352ccb1de9@linux.alibaba.com>
Date: Mon, 6 Jan 2025 11:03:45 +0800
From: Hongzhen Luo <hongzhen@...ux.alibaba.com>
To: Gao Xiang <hsiangkao@...ux.alibaba.com>, linux-erofs@...ts.ozlabs.org
Cc: linux-kernel@...r.kernel.org
Subject: Re: [RFC PATCH v5 2/4] erofs: introduce the page cache share feature


On 2025/1/6 10:27, Gao Xiang wrote:
>
>
> On 2025/1/5 23:12, Hongzhen Luo wrote:
>> Currently, reading files with different paths (or names) but the same
>> content will consume multiple copies of the page cache, even if the
>> content of these page caches is the same. For example, reading identical
>> files (e.g., *.so files) from two different minor versions of container
>> images will cost multiple copies of the same page cache, since different
>> containers have different mount points. Therefore, sharing the page 
>> cache
>> for files with the same content can save memory.
>>
>> This introduces the page cache share feature in erofs. During the mkfs
>> phase, the file content is hashed and the hash value is stored in the
>> `trusted.erofs.fingerprint` extended attribute. Inodes of files with the
>> same `trusted.erofs.fingerprint` are mapped to the same anonymous inode
>> (indicated by the `ano_inode` field). When a read request occurs, the
>> anonymous inode serves as a "container" whose page cache is shared. The
>> actual operations involving the iomap are carried out by the original
>> inode which is mapped to the anonymous inode.
>>
>> Signed-off-by: Hongzhen Luo <hongzhen@...ux.alibaba.com>
>> ---
>>   fs/erofs/Kconfig           |  10 ++
>>   fs/erofs/Makefile          |   1 +
>>   fs/erofs/internal.h        |   4 +
>>   fs/erofs/pagecache_share.c | 228 +++++++++++++++++++++++++++++++++++++
>>   fs/erofs/pagecache_share.h |  26 +++++
>>   fs/erofs/super.c           |  24 +++-
>>   6 files changed, 292 insertions(+), 1 deletion(-)
>>   create mode 100644 fs/erofs/pagecache_share.c
>>   create mode 100644 fs/erofs/pagecache_share.h
>>
>> diff --git a/fs/erofs/Kconfig b/fs/erofs/Kconfig
>> index 6ea60661fa55..3aa5f946b5f1 100644
>> --- a/fs/erofs/Kconfig
>> +++ b/fs/erofs/Kconfig
>> @@ -178,3 +178,13 @@ config EROFS_FS_PCPU_KTHREAD_HIPRI
>>         at higher priority.
>>           If unsure, say N.
>> +
>> +config EROFS_FS_PAGE_CACHE_SHARE
>> +       bool "EROFS page cache share support"
>> +       depends on EROFS_FS
>> +       default n
>> +    help
>> +      This permits EROFS to share page cache for files with same
>> +      fingerprints.
>> +
>> +      If unsure, say N.
>> diff --git a/fs/erofs/Makefile b/fs/erofs/Makefile
>> index 4331d53c7109..d035c9063ef8 100644
>> --- a/fs/erofs/Makefile
>> +++ b/fs/erofs/Makefile
>> @@ -9,3 +9,4 @@ erofs-$(CONFIG_EROFS_FS_ZIP_DEFLATE) += 
>> decompressor_deflate.o
>>   erofs-$(CONFIG_EROFS_FS_ZIP_ZSTD) += decompressor_zstd.o
>>   erofs-$(CONFIG_EROFS_FS_BACKED_BY_FILE) += fileio.o
>>   erofs-$(CONFIG_EROFS_FS_ONDEMAND) += fscache.o
>> +erofs-$(CONFIG_EROFS_FS_PAGE_CACHE_SHARE) += pagecache_share.o
>> \ No newline at end of file
>> diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
>> index 47004eb89838..6c87621d86ba 100644
>> --- a/fs/erofs/internal.h
>> +++ b/fs/erofs/internal.h
>> @@ -280,6 +280,9 @@ struct erofs_inode {
>>           };
>>   #endif    /* CONFIG_EROFS_FS_ZIP */
>>       };
>> +#ifdef CONFIG_EROFS_FS_PAGE_CACHE_SHARE
>> +    struct inode *ano_inode;
>
> ano_inode has no extra meaning, we'd better to think out a meaningful 
> name.
>
>> +#endif
>>       /* the corresponding vfs inode */
>>       struct inode vfs_inode;
>>   };
>> @@ -376,6 +379,7 @@ extern const struct inode_operations erofs_dir_iops;
>>     extern const struct file_operations erofs_file_fops;
>>   extern const struct file_operations erofs_dir_fops;
>> +extern const struct file_operations erofs_pcshr_fops;
>>     extern const struct iomap_ops z_erofs_iomap_report_ops;
>>   diff --git a/fs/erofs/pagecache_share.c b/fs/erofs/pagecache_share.c
>> new file mode 100644
>> index 000000000000..703fd17c002c
>> --- /dev/null
>> +++ b/fs/erofs/pagecache_share.c
>> @@ -0,0 +1,228 @@
>> +// SPDX-License-Identifier: GPL-2.0-or-later
>> +/*
>> + * Copyright (C) 2024, Alibaba Cloud
>> + */
>> +#include <linux/xxhash.h>
>> +#include <linux/refcount.h>
>> +#include <linux/mount.h>
>> +#include <linux/mutex.h>
>> +#include "pagecache_share.h"
>> +#include "internal.h"
>> +#include "xattr.h"
>> +
>> +#define PCSHR_FPRT_IDX    4
>> +#define PCSHR_FPRT_NAME    "erofs.fingerprint"
>> +#define PCSHR_FPRT_MAXLEN (sizeof(size_t) + 1024)
>> +
>> +struct erofs_pcshr_counter {
>> +    struct mutex mutex;
>> +    struct kref ref;
>> +    struct vfsmount *mnt;
>> +};
>> +
>> +struct erofs_pcshr_private {
>> +    char fprt[PCSHR_FPRT_MAXLEN];
>> +};
>> +
>> +static struct erofs_pcshr_counter mnt_counter = {
>> +    .mutex = __MUTEX_INITIALIZER(mnt_counter.mutex),
>> +    .mnt = NULL,
>> +};
>> +
>> +static void erofs_pcshr_counter_release(struct kref *ref)
>> +{
>> +    struct erofs_pcshr_counter *counter = container_of(ref,
>> +            struct erofs_pcshr_counter, ref);
>> +
>> +    DBG_BUGON(!counter->mnt);
>> +    kern_unmount(counter->mnt);
>> +    counter->mnt = NULL;
>> +}
>> +
>> +int erofs_pcshr_init_mnt(void)
>> +{
>> +    int ret;
>> +    struct vfsmount *tmp;
>> +
>> +    mutex_lock(&mnt_counter.mutex);
>> +    if (!mnt_counter.mnt) {
>> +        tmp = kern_mount(&erofs_anon_fs_type);
>> +        if (IS_ERR(tmp)) {
>> +            ret = PTR_ERR(tmp);
>> +            goto out;
>> +        }
>> +        mnt_counter.mnt = tmp;
>> +        kref_init(&mnt_counter.ref);
>> +    } else
>> +        kref_get(&mnt_counter.ref);
>> +    ret = 0;
>> +out:
>> +    mutex_unlock(&mnt_counter.mutex);
>> +    return ret;
>> +}
>> +
>> +void erofs_pcshr_free_mnt(void)
>> +{
>> +    mutex_lock(&mnt_counter.mutex);
>> +    kref_put(&mnt_counter.ref, erofs_pcshr_counter_release);
>> +    mutex_unlock(&mnt_counter.mutex);
>> +}
>> +
>> +static int erofs_fprt_eq(struct inode *inode, void *data)
>> +{
>> +    struct erofs_pcshr_private *ano_private = inode->i_private;
>> +
>> +    return ano_private && memcmp(ano_private->fprt, data,
>> +            sizeof(size_t) + *(size_t *)data) == 0 ? 1 : 0;
>> +}
>> +
>> +static int erofs_fprt_set(struct inode *inode, void *data)
>> +{
>> +    struct erofs_pcshr_private *ano_private;
>> +
>> +    ano_private = kmalloc(sizeof(struct erofs_pcshr_private), 
>> GFP_KERNEL);
>> +    if (!ano_private)
>> +        return -ENOMEM;
>> +    memcpy(ano_private, data, sizeof(size_t) + *(size_t *)data);
>> +    inode->i_private = ano_private;
>> +    return 0;
>> +}
>> +
>> +int erofs_pcshr_fill_inode(struct inode *inode)
>> +{
>> +    struct erofs_inode *vi = EROFS_I(inode);
>> +    /* | fingerprint length | fingerprint content | */
>> +    char fprt[PCSHR_FPRT_MAXLEN];
>
> we shouldn't allocate too large space on stack.
>
>> +    struct inode *ano_inode;
>> +    unsigned long fprt_hash;
>> +    size_t fprt_len;
>> +    int ret = -1;
>> +
>> +    vi->ano_inode = NULL;
>> +    memset(fprt, 0, sizeof(fprt));
>> +    fprt_len = erofs_getxattr(inode, PCSHR_FPRT_IDX, PCSHR_FPRT_NAME,
>> +            fprt + sizeof(size_t), PCSHR_FPRT_MAXLEN);
>
> Now, I think it'd be better that users could have a way to configure
> the xattr key name.  Since in that way, we could reuse fsverity-like
> root hash digest likewise.
>
>> +    if (fprt_len > 0 && fprt_len <= PCSHR_FPRT_MAXLEN) {
>> +        *(size_t *)fprt = fprt_len;
>> +        fprt_hash = xxh32(fprt + sizeof(size_t), fprt_len, 0);
>> +        ano_inode = iget5_locked(mnt_counter.mnt->mnt_sb, fprt_hash,
>> +                     erofs_fprt_eq, erofs_fprt_set, fprt);
>> +        DBG_BUGON(!ano_inode);
>
> Why iget5_locked() won't return NULL?
>
>> +        vi->ano_inode = ano_inode;
>> +        if (ano_inode->i_state & I_NEW) {
>> +            if (erofs_inode_is_data_compressed(vi->datalayout))
>> +                ano_inode->i_mapping->a_ops = &z_erofs_aops;
>> +            else
>> +                ano_inode->i_mapping->a_ops = &erofs_aops;
>> +            ano_inode->i_size = inode->i_size;
>> +            unlock_new_inode(ano_inode);
>> +        }
>> +        ret = 0;
>> +    }
>> +    return ret;
>> +}
>> +
>> +void erofs_pcshr_free_inode(struct inode *inode)
>> +{redundant
>> +    struct erofs_inode *vi = EROFS_I(inode);
>> +
>> +    if (S_ISREG(inode->i_mode) &&  vi->ano_inode) {
>
> redundant space.
>
>> +        iput(vi->ano_inode);
>> +        vi->ano_inode = NULL;
>> +    }
>> +}
>> +
>> +static struct file *erofs_pcshr_alloc_file(struct file *file,
>> +                       struct inode *ano_inode)
>> +{
>> +    struct file *ano_file;
>> +
>> +    ano_file = alloc_file_pseudo(ano_inode, mnt_counter.mnt,
>> +            "[erofs_pcssh_f]", O_RDONLY, &erofs_file_fops);
>> +    if (IS_ERR(ano_file))
>> +        return ano_file;
>> +
>> +    file_ra_state_init(&ano_file->f_ra, file->f_mapping);
>> +    ano_file->private_data = EROFS_I(file_inode(file));
>> +    return ano_file;
>> +}
>> +
>> +static int erofs_pcshr_file_open(struct inode *inode, struct file 
>> *file)
>> +{
>> +    struct file *ano_file;
>> +    struct inode *ano_inode;
>> +    struct erofs_inode *vi = EROFS_I(inode);
>> +
>> +    ano_inode = vi->ano_inode;
>> +    if (!ano_inode)
>> +        return -EINVAL;
>> +
>> +    ano_file = erofs_pcshr_alloc_file(file, ano_inode);
>> +    if (IS_ERR(ano_file))
>> +        return PTR_ERR(ano_file);
>> +
>> +    ihold(ano_inode);
>> +    file->private_data = (void *)ano_file;
>> +    return 0;
>> +}
>> +
>> +static int erofs_pcshr_file_release(struct inode *inode, struct file 
>> *file)
>> +{
>> +    if (!file->private_data)
>> +        return -EINVAL;
>> +
>> +    fput((struct file *)file->private_data);
>> +    file->private_data = NULL;
>> +    return 0;
>> +}
>> +
>> +static ssize_t erofs_pcshr_file_read_iter(struct kiocb *iocb,
>> +                    struct iov_iter *to)
>> +{
>> +    struct inode __maybe_unused *inode = file_inode(iocb->ki_filp);
>> +    struct file *file, *ano_file;
>> +    struct kiocb ano_iocb;
>> +    ssize_t res;
>> +
>> +    if (!iov_iter_count(to))
>> +        return 0;
>> +#ifdef CONFIG_FS_DAX
>> +    if (IS_DAX(inode))
>> +        return erofs_file_fops.read_iter(iocb, to);
>> +#endif
>> +    if (iocb->ki_flags & IOCB_DIRECT)
>> +        return erofs_file_fops.read_iter(iocb, to);
>> +
>> +    memcpy(&ano_iocb, iocb, sizeof(struct kiocb));
>> +    file = iocb->ki_filp;
>> +    ano_file = file->private_data;
>> +    if (!ano_file)
>> +        return -EINVAL;
>> +    ano_iocb.ki_filp = ano_file;
>> +    res = filemap_read(&ano_iocb, to, 0);
>
> why we need to use this? what is "erofs_pcshr_file_read_iter" used for?
>

We need this because at this point, we need to switch to anonymous files 
so that `filemap_read` uses

the page cache of the anonymous inode. Files with the same fingerprint 
can share the same page cache

of the anonymous inode in this way.


Thanks,

Hongzhen Luo

> Thanks,
> Gao Xiang