[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <diqztt1sbd2v.fsf@google.com>
Date: Wed, 27 Aug 2025 15:43:20 -0700
From: Ackerley Tng <ackerleytng@...gle.com>
To: Shivank Garg <shivankg@....com>, willy@...radead.org, akpm@...ux-foundation.org,
david@...hat.com, pbonzini@...hat.com, shuah@...nel.org, seanjc@...gle.com,
vbabka@...e.cz
Cc: brauner@...nel.org, viro@...iv.linux.org.uk, dsterba@...e.com,
xiang@...nel.org, chao@...nel.org, jaegeuk@...nel.org, clm@...com,
josef@...icpanda.com, kent.overstreet@...ux.dev, zbestahu@...il.com,
jefflexu@...ux.alibaba.com, dhavale@...gle.com, lihongbo22@...wei.com,
lorenzo.stoakes@...cle.com, Liam.Howlett@...cle.com, rppt@...nel.org,
surenb@...gle.com, mhocko@...e.com, ziy@...dia.com, matthew.brost@...el.com,
joshua.hahnjy@...il.com, rakie.kim@...com, byungchul@...com,
gourry@...rry.net, ying.huang@...ux.alibaba.com, apopple@...dia.com,
tabba@...gle.com, shivankg@....com, paul@...l-moore.com, jmorris@...ei.org,
serge@...lyn.com, pvorel@...e.cz, bfoster@...hat.com, vannapurve@...gle.com,
chao.gao@...el.com, bharata@....com, nikunj@....com, michael.day@....com,
shdhiman@....com, yan.y.zhao@...el.com, Neeraj.Upadhyay@....com,
thomas.lendacky@....com, michael.roth@....com, aik@....com, jgg@...dia.com,
kalyazin@...zon.com, peterx@...hat.com, jack@...e.cz, hch@...radead.org,
cgzones@...glemail.com, ira.weiny@...el.com, rientjes@...gle.com,
roypat@...zon.co.uk, chao.p.peng@...el.com, amit@...radead.org,
ddutile@...hat.com, dan.j.williams@...el.com, ashish.kalra@....com,
gshan@...hat.com, jgowans@...zon.com, pankaj.gupta@....com, papaluri@....com,
yuzhao@...gle.com, suzuki.poulose@....com, quic_eberman@...cinc.com,
linux-bcachefs@...r.kernel.org, linux-btrfs@...r.kernel.org,
linux-erofs@...ts.ozlabs.org, linux-f2fs-devel@...ts.sourceforge.net,
linux-fsdevel@...r.kernel.org, linux-mm@...ck.org,
linux-kernel@...r.kernel.org, linux-security-module@...r.kernel.org,
kvm@...r.kernel.org, linux-kselftest@...r.kernel.org,
linux-coco@...ts.linux.dev
Subject: Re: [PATCH kvm-next V11 4/7] KVM: guest_memfd: Use guest mem inodes
instead of anonymous inodes
Shivank Garg <shivankg@....com> writes:
>
> [...snip...]
>
I meant to send this to you before this version went out but you were
too quick!
Here's a new version, Fuad and I reviewed this again internally. The
changes are:
+ Sort linux/pseudo_fs.h after linux/pagemap.h (alphabetical)
+ Don't set MNT_NOEXEC on the mount, since SB_I_NOEXEC was already set
on the superblock
+ Rename kvm_gmem_inode_make_secure_inode() to kvm_gmem_inode_create()
+ Emphasizes that there is a creation in this function
+ Remove "secure" from the function name to remove confusion that
there may be a "non-secure" version
+ In kvm_gmem_inode_create_getfile()'s error path, return ERR_PTR(err)
directly instead of having a goto
>From ada9814b216eac129ed44dffd3acf76fce2cc08a Mon Sep 17 00:00:00 2001
From: Ackerley Tng <ackerleytng@...gle.com>
Date: Sun, 13 Jul 2025 17:43:35 +0000
Subject: [PATCH] KVM: guest_memfd: Use guest mem inodes instead of anonymous
inodes
guest_memfd's inode represents memory the guest_memfd is
providing. guest_memfd's file represents a struct kvm's view of that
memory.
Using a custom inode allows customization of the inode teardown
process via callbacks. For example, ->evict_inode() allows
customization of the truncation process on file close, and
->destroy_inode() and ->free_inode() allow customization of the inode
freeing process.
Customizing the truncation process allows flexibility in management of
guest_memfd memory and customization of the inode freeing process
allows proper cleanup of memory metadata stored on the inode.
Memory metadata is more appropriately stored on the inode (as opposed
to the file), since the metadata is for the memory and is not unique
to a specific binding and struct kvm.
Co-developed-by: Fuad Tabba <tabba@...gle.com>
Signed-off-by: Fuad Tabba <tabba@...gle.com>
Signed-off-by: Shivank Garg <shivankg@....com>
Signed-off-by: Ackerley Tng <ackerleytng@...gle.com>
---
include/uapi/linux/magic.h | 1 +
virt/kvm/guest_memfd.c | 126 ++++++++++++++++++++++++++++++-------
virt/kvm/kvm_main.c | 7 ++-
virt/kvm/kvm_mm.h | 9 +--
4 files changed, 116 insertions(+), 27 deletions(-)
diff --git a/include/uapi/linux/magic.h b/include/uapi/linux/magic.h
index bb575f3ab45e5..638ca21b7a909 100644
--- a/include/uapi/linux/magic.h
+++ b/include/uapi/linux/magic.h
@@ -103,5 +103,6 @@
#define DEVMEM_MAGIC 0x454d444d /* "DMEM" */
#define SECRETMEM_MAGIC 0x5345434d /* "SECM" */
#define PID_FS_MAGIC 0x50494446 /* "PIDF" */
+#define GUEST_MEMFD_MAGIC 0x474d454d /* "GMEM" */
#endif /* __LINUX_MAGIC_H__ */
diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
index 08a6bc7d25b60..234e51fd69ff6 100644
--- a/virt/kvm/guest_memfd.c
+++ b/virt/kvm/guest_memfd.c
@@ -1,12 +1,16 @@
// SPDX-License-Identifier: GPL-2.0
+#include <linux/anon_inodes.h>
#include <linux/backing-dev.h>
#include <linux/falloc.h>
+#include <linux/fs.h>
#include <linux/kvm_host.h>
#include <linux/pagemap.h>
-#include <linux/anon_inodes.h>
+#include <linux/pseudo_fs.h>
#include "kvm_mm.h"
+static struct vfsmount *kvm_gmem_mnt;
+
struct kvm_gmem {
struct kvm *kvm;
struct xarray bindings;
@@ -385,9 +389,44 @@ static struct file_operations kvm_gmem_fops = {
.fallocate = kvm_gmem_fallocate,
};
-void kvm_gmem_init(struct module *module)
+static int kvm_gmem_init_fs_context(struct fs_context *fc)
+{
+ if (!init_pseudo(fc, GUEST_MEMFD_MAGIC))
+ return -ENOMEM;
+
+ fc->s_iflags |= SB_I_NOEXEC;
+ fc->s_iflags |= SB_I_NODEV;
+
+ return 0;
+}
+
+static struct file_system_type kvm_gmem_fs = {
+ .name = "guest_memfd",
+ .init_fs_context = kvm_gmem_init_fs_context,
+ .kill_sb = kill_anon_super,
+};
+
+static int kvm_gmem_init_mount(void)
+{
+ kvm_gmem_mnt = kern_mount(&kvm_gmem_fs);
+
+ if (IS_ERR(kvm_gmem_mnt))
+ return PTR_ERR(kvm_gmem_mnt);
+
+ return 0;
+}
+
+int kvm_gmem_init(struct module *module)
{
kvm_gmem_fops.owner = module;
+
+ return kvm_gmem_init_mount();
+}
+
+void kvm_gmem_exit(void)
+{
+ kern_unmount(kvm_gmem_mnt);
+ kvm_gmem_mnt = NULL;
}
static int kvm_gmem_migrate_folio(struct address_space *mapping,
@@ -463,11 +502,70 @@ bool __weak kvm_arch_supports_gmem_mmap(struct kvm *kvm)
return true;
}
+static struct inode *kvm_gmem_inode_create(const char *name, loff_t size,
+ u64 flags)
+{
+ struct inode *inode;
+
+ inode = anon_inode_make_secure_inode(kvm_gmem_mnt->mnt_sb, name, NULL);
+ if (IS_ERR(inode))
+ return inode;
+
+ inode->i_private = (void *)(unsigned long)flags;
+ inode->i_op = &kvm_gmem_iops;
+ inode->i_mapping->a_ops = &kvm_gmem_aops;
+ inode->i_mode |= S_IFREG;
+ inode->i_size = size;
+ mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER);
+ mapping_set_inaccessible(inode->i_mapping);
+ /* Unmovable mappings are supposed to be marked unevictable as well. */
+ WARN_ON_ONCE(!mapping_unevictable(inode->i_mapping));
+
+ return inode;
+}
+
+static struct file *kvm_gmem_inode_create_getfile(void *priv, loff_t size,
+ u64 flags)
+{
+ static const char *name = "[kvm-gmem]";
+ struct inode *inode;
+ struct file *file;
+ int err;
+
+ err = -ENOENT;
+ /* __fput() will take care of fops_put(). */
+ if (!fops_get(&kvm_gmem_fops))
+ goto err;
+
+ inode = kvm_gmem_inode_create(name, size, flags);
+ if (IS_ERR(inode)) {
+ err = PTR_ERR(inode);
+ goto err_fops_put;
+ }
+
+ file = alloc_file_pseudo(inode, kvm_gmem_mnt, name, O_RDWR,
+ &kvm_gmem_fops);
+ if (IS_ERR(file)) {
+ err = PTR_ERR(file);
+ goto err_put_inode;
+ }
+
+ file->f_flags |= O_LARGEFILE;
+ file->private_data = priv;
+
+ return file;
+
+err_put_inode:
+ iput(inode);
+err_fops_put:
+ fops_put(&kvm_gmem_fops);
+err:
+ return ERR_PTR(err);
+}
+
static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags)
{
- const char *anon_name = "[kvm-gmem]";
struct kvm_gmem *gmem;
- struct inode *inode;
struct file *file;
int fd, err;
@@ -481,32 +579,16 @@ static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags)
goto err_fd;
}
- file = anon_inode_create_getfile(anon_name, &kvm_gmem_fops, gmem,
- O_RDWR, NULL);
+ file = kvm_gmem_inode_create_getfile(gmem, size, flags);
if (IS_ERR(file)) {
err = PTR_ERR(file);
goto err_gmem;
}
- file->f_flags |= O_LARGEFILE;
-
- inode = file->f_inode;
- WARN_ON(file->f_mapping != inode->i_mapping);
-
- inode->i_private = (void *)(unsigned long)flags;
- inode->i_op = &kvm_gmem_iops;
- inode->i_mapping->a_ops = &kvm_gmem_aops;
- inode->i_mode |= S_IFREG;
- inode->i_size = size;
- mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER);
- mapping_set_inaccessible(inode->i_mapping);
- /* Unmovable mappings are supposed to be marked unevictable as well. */
- WARN_ON_ONCE(!mapping_unevictable(inode->i_mapping));
-
kvm_get_kvm(kvm);
gmem->kvm = kvm;
xa_init(&gmem->bindings);
- list_add(&gmem->entry, &inode->i_mapping->i_private_list);
+ list_add(&gmem->entry, &file_inode(file)->i_mapping->i_private_list);
fd_install(fd, file);
return fd;
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 18f29ef935437..301d48d6e00d0 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -6489,7 +6489,9 @@ int kvm_init(unsigned vcpu_size, unsigned vcpu_align, struct module *module)
if (WARN_ON_ONCE(r))
goto err_vfio;
- kvm_gmem_init(module);
+ r = kvm_gmem_init(module);
+ if (r)
+ goto err_gmem;
r = kvm_init_virtualization();
if (r)
@@ -6510,6 +6512,8 @@ int kvm_init(unsigned vcpu_size, unsigned vcpu_align, struct module *module)
err_register:
kvm_uninit_virtualization();
err_virt:
+ kvm_gmem_exit();
+err_gmem:
kvm_vfio_ops_exit();
err_vfio:
kvm_async_pf_deinit();
@@ -6541,6 +6545,7 @@ void kvm_exit(void)
for_each_possible_cpu(cpu)
free_cpumask_var(per_cpu(cpu_kick_mask, cpu));
kmem_cache_destroy(kvm_vcpu_cache);
+ kvm_gmem_exit();
kvm_vfio_ops_exit();
kvm_async_pf_deinit();
kvm_irqfd_exit();
diff --git a/virt/kvm/kvm_mm.h b/virt/kvm/kvm_mm.h
index 31defb08ccbab..9fcc5d5b7f8d0 100644
--- a/virt/kvm/kvm_mm.h
+++ b/virt/kvm/kvm_mm.h
@@ -68,17 +68,18 @@ static inline void gfn_to_pfn_cache_invalidate_start(struct kvm *kvm,
#endif /* HAVE_KVM_PFNCACHE */
#ifdef CONFIG_KVM_GUEST_MEMFD
-void kvm_gmem_init(struct module *module);
+int kvm_gmem_init(struct module *module);
+void kvm_gmem_exit(void);
int kvm_gmem_create(struct kvm *kvm, struct kvm_create_guest_memfd *args);
int kvm_gmem_bind(struct kvm *kvm, struct kvm_memory_slot *slot,
unsigned int fd, loff_t offset);
void kvm_gmem_unbind(struct kvm_memory_slot *slot);
#else
-static inline void kvm_gmem_init(struct module *module)
+static inline int kvm_gmem_init(struct module *module)
{
-
+ return 0;
}
-
+static inline void kvm_gmem_exit(void) {};
static inline int kvm_gmem_bind(struct kvm *kvm,
struct kvm_memory_slot *slot,
unsigned int fd, loff_t offset)
--
2.51.0.268.g9569e192d0-goog
Powered by blists - more mailing lists