Message-ID: <20250915161815.40729-2-kalyazin@amazon.com>
Date: Mon, 15 Sep 2025 16:18:27 +0000
From: "Kalyazin, Nikita" <kalyazin@...zon.co.uk>
To: "akpm@...ux-foundation.org" <akpm@...ux-foundation.org>,
"david@...hat.com" <david@...hat.com>, "pbonzini@...hat.com"
<pbonzini@...hat.com>, "seanjc@...gle.com" <seanjc@...gle.com>,
"viro@...iv.linux.org.uk" <viro@...iv.linux.org.uk>, "brauner@...nel.org"
<brauner@...nel.org>
CC: "peterx@...hat.com" <peterx@...hat.com>, "lorenzo.stoakes@...cle.com"
<lorenzo.stoakes@...cle.com>, "Liam.Howlett@...cle.com"
<Liam.Howlett@...cle.com>, "willy@...radead.org" <willy@...radead.org>,
"vbabka@...e.cz" <vbabka@...e.cz>, "rppt@...nel.org" <rppt@...nel.org>,
"surenb@...gle.com" <surenb@...gle.com>, "mhocko@...e.com" <mhocko@...e.com>,
"jack@...e.cz" <jack@...e.cz>, "linux-mm@...ck.org" <linux-mm@...ck.org>,
"kvm@...r.kernel.org" <kvm@...r.kernel.org>, "linux-kernel@...r.kernel.org"
<linux-kernel@...r.kernel.org>, "linux-fsdevel@...r.kernel.org"
<linux-fsdevel@...r.kernel.org>, "jthoughton@...gle.com"
<jthoughton@...gle.com>, "tabba@...gle.com" <tabba@...gle.com>,
"vannapurve@...gle.com" <vannapurve@...gle.com>, "Roy, Patrick"
<roypat@...zon.co.uk>, "Thomson, Jack" <jackabt@...zon.co.uk>, "Manwaring,
Derek" <derekmn@...zon.com>, "Cali, Marco" <xmarcalx@...zon.co.uk>,
"Kalyazin, Nikita" <kalyazin@...zon.co.uk>
Subject: [RFC PATCH v6 1/2] mm: guestmem: introduce guestmem library
From: Nikita Kalyazin <kalyazin@...zon.com>
Move the MM-generic parts of guest_memfd from KVM to MM. This allows
other hypervisors to use the guestmem code and enables a UserfaultFD
implementation for guest_memfd [1]. Previously this was not possible
because KVM (and with it the guest_memfd code) may be built as a module.
Based on a patch by Elliot Berman <quic_eberman@...cinc.com> [2].
[1] https://lore.kernel.org/kvm/20250404154352.23078-1-kalyazin@amazon.com
[2] https://lore.kernel.org/kvm/20241122-guestmem-library-v5-2-450e92951a15@quicinc.com
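For illustration only (not part of the patch): a minimal sketch of how a
hypervisor backend other than KVM could consume the library, using only the
interfaces declared in include/linux/guestmem.h below. The foo_* names and
the per-instance structure are hypothetical, and inode/file setup and error
paths are omitted.

#include <linux/fs.h>
#include <linux/guestmem.h>
#include <linux/list.h>

struct foo_gmem {
	struct list_head entry;	/* linked into mapping->i_private_list */
	/* hypervisor-private state goes here */
};

static void foo_invalidate_begin(struct list_head *entry, pgoff_t start,
				 pgoff_t end)
{
	struct foo_gmem *gmem = container_of(entry, struct foo_gmem, entry);

	/* Zap the [start, end) range from the guest's stage-2 mappings. */
}

static const struct guestmem_ops foo_guestmem_ops = {
	.invalidate_begin	= foo_invalidate_begin,
	/* .invalidate_end, .release_folio and .supports_mmap are optional */
};

static int foo_gmem_init(struct inode *inode, struct foo_gmem *gmem)
{
	/* Installs guestmem_aops and registers us for invalidations. */
	return guestmem_attach_mapping(inode->i_mapping, &foo_guestmem_ops,
				       &gmem->entry);
}

static void foo_gmem_release(struct inode *inode, struct foo_gmem *gmem)
{
	/* The last detach truncates the page cache and frees library state. */
	guestmem_detach_mapping(inode->i_mapping, &gmem->entry);
}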
Signed-off-by: Nikita Kalyazin <kalyazin@...zon.com>
---
MAINTAINERS | 2 +
include/linux/guestmem.h | 46 +++++
mm/Kconfig | 3 +
mm/Makefile | 1 +
mm/guestmem.c | 380 +++++++++++++++++++++++++++++++++++++++
virt/kvm/Kconfig | 1 +
virt/kvm/guest_memfd.c | 303 ++++---------------------------
7 files changed, 465 insertions(+), 271 deletions(-)
create mode 100644 include/linux/guestmem.h
create mode 100644 mm/guestmem.c
diff --git a/MAINTAINERS b/MAINTAINERS
index fed6cd812d79..c468c4847ffd 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -15956,6 +15956,7 @@ W: http://www.linux-mm.org
T: git git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
T: quilt git://git.kernel.org/pub/scm/linux/kernel/git/akpm/25-new
F: mm/
+F: mm/guestmem.c
F: tools/mm/
MEMORY MANAGEMENT - CORE
@@ -15973,6 +15974,7 @@ W: http://www.linux-mm.org
T: git git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
F: include/linux/gfp.h
F: include/linux/gfp_types.h
+F: include/linux/guestmem.h
F: include/linux/highmem.h
F: include/linux/memory.h
F: include/linux/mm.h
diff --git a/include/linux/guestmem.h b/include/linux/guestmem.h
new file mode 100644
index 000000000000..2a173261d32b
--- /dev/null
+++ b/include/linux/guestmem.h
@@ -0,0 +1,46 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_GUESTMEM_H
+#define _LINUX_GUESTMEM_H
+
+#include <linux/types.h>
+
+struct address_space;
+struct list_head;
+struct inode;
+
+/**
+ * struct guestmem_ops - Hypervisor-specific maintenance operations
+ * @release_folio: Try to bring the folio back to being fully owned by Linux,
+ *                 for instance when the folio is about to be freed [optional]
+ * @invalidate_begin: Start invalidating mappings between the start and end offsets
+ * @invalidate_end: Paired with ->invalidate_begin() [optional]
+ * @supports_mmap: Return true if the inode supports mmap [optional]
+ */
+struct guestmem_ops {
+ bool (*release_folio)(struct address_space *mapping,
+ struct folio *folio);
+ void (*invalidate_begin)(struct list_head *entry, pgoff_t start,
+ pgoff_t end);
+ void (*invalidate_end)(struct list_head *entry, pgoff_t start,
+ pgoff_t end);
+ bool (*supports_mmap)(struct inode *inode);
+};
+
+int guestmem_attach_mapping(struct address_space *mapping,
+ const struct guestmem_ops *const ops,
+ struct list_head *data);
+void guestmem_detach_mapping(struct address_space *mapping,
+ struct list_head *data);
+
+struct folio *guestmem_grab_folio(struct address_space *mapping, pgoff_t index);
+
+int guestmem_punch_hole(struct address_space *mapping, loff_t offset,
+ loff_t len);
+int guestmem_allocate(struct address_space *mapping, loff_t offset, loff_t len);
+
+bool guestmem_test_no_direct_map(struct inode *inode);
+void guestmem_mark_prepared(struct folio *folio);
+int guestmem_mmap(struct file *file, struct vm_area_struct *vma);
+bool guestmem_vma_is_guestmem(struct vm_area_struct *vma);
+
+#endif
diff --git a/mm/Kconfig b/mm/Kconfig
index e443fe8cd6cf..a3705099601f 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -1254,6 +1254,9 @@ config SECRETMEM
memory areas visible only in the context of the owning process and
not mapped to other processes and other kernel page tables.
+config GUESTMEM
+ bool
+
config ANON_VMA_NAME
bool "Anonymous VMA name support"
depends on PROC_FS && ADVISE_SYSCALLS && MMU
diff --git a/mm/Makefile b/mm/Makefile
index ef54aa615d9d..c92892acd819 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -138,6 +138,7 @@ obj-$(CONFIG_PERCPU_STATS) += percpu-stats.o
obj-$(CONFIG_ZONE_DEVICE) += memremap.o
obj-$(CONFIG_HMM_MIRROR) += hmm.o
obj-$(CONFIG_MEMFD_CREATE) += memfd.o
+obj-$(CONFIG_GUESTMEM) += guestmem.o
obj-$(CONFIG_MAPPING_DIRTY_HELPERS) += mapping_dirty_helpers.o
obj-$(CONFIG_PTDUMP) += ptdump.o
obj-$(CONFIG_PAGE_REPORTING) += page_reporting.o
diff --git a/mm/guestmem.c b/mm/guestmem.c
new file mode 100644
index 000000000000..110087aff7e8
--- /dev/null
+++ b/mm/guestmem.c
@@ -0,0 +1,380 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/fs.h>
+#include <linux/guestmem.h>
+#include <linux/mm.h>
+#include <linux/pagemap.h>
+#include <linux/set_memory.h>
+#include <linux/userfaultfd_k.h>
+
+struct guestmem {
+ const struct guestmem_ops *ops;
+};
+
+static inline bool __guestmem_release_folio(struct address_space *mapping,
+ struct folio *folio)
+{
+ struct guestmem *gmem = mapping->i_private_data;
+
+ if (gmem->ops->release_folio) {
+ if (!gmem->ops->release_folio(mapping, folio))
+ return false;
+ }
+
+ return true;
+}
+
+static inline void
+__guestmem_invalidate_begin(struct address_space *const mapping, pgoff_t start,
+ pgoff_t end)
+{
+ struct guestmem *gmem = mapping->i_private_data;
+ struct list_head *entry;
+
+ list_for_each(entry, &mapping->i_private_list)
+ gmem->ops->invalidate_begin(entry, start, end);
+}
+
+static inline void
+__guestmem_invalidate_end(struct address_space *const mapping, pgoff_t start,
+ pgoff_t end)
+{
+ struct guestmem *gmem = mapping->i_private_data;
+ struct list_head *entry;
+
+ if (gmem->ops->invalidate_end) {
+ list_for_each(entry, &mapping->i_private_list)
+ gmem->ops->invalidate_end(entry, start, end);
+ }
+}
+
+static int guestmem_write_begin(const struct kiocb *kiocb,
+ struct address_space *mapping,
+ loff_t pos, unsigned int len,
+ struct folio **foliop,
+ void **fsdata)
+{
+ struct file *file = kiocb->ki_filp;
+ pgoff_t index = pos >> PAGE_SHIFT;
+ struct folio *folio;
+
+ if (!PAGE_ALIGNED(pos) || len != PAGE_SIZE)
+ return -EINVAL;
+
+ if (pos + len > i_size_read(file_inode(file)))
+ return -EINVAL;
+
+ folio = guestmem_grab_folio(file_inode(file)->i_mapping, index);
+ if (IS_ERR(folio))
+ return -EFAULT;
+
+ if (WARN_ON_ONCE(folio_test_large(folio))) {
+ folio_unlock(folio);
+ folio_put(folio);
+ return -EFAULT;
+ }
+
+ if (folio_test_uptodate(folio)) {
+ folio_unlock(folio);
+ folio_put(folio);
+ return -ENOSPC;
+ }
+
+ *foliop = folio;
+ return 0;
+}
+
+static int guestmem_write_end(const struct kiocb *kiocb,
+ struct address_space *mapping,
+ loff_t pos, unsigned int len, unsigned int copied,
+ struct folio *folio, void *fsdata)
+{
+ if (copied) {
+ if (copied < len) {
+ unsigned int from = pos & (PAGE_SIZE - 1);
+
+ folio_zero_range(folio, from + copied, len - copied);
+ }
+ guestmem_mark_prepared(folio);
+ }
+
+ folio_unlock(folio);
+ folio_put(folio);
+
+ return copied;
+}
+
+static void guestmem_free_folio(struct address_space *mapping,
+ struct folio *folio)
+{
+ WARN_ON_ONCE(!__guestmem_release_folio(mapping, folio));
+}
+
+static int guestmem_error_folio(struct address_space *mapping,
+ struct folio *folio)
+{
+ pgoff_t start, end;
+
+ filemap_invalidate_lock_shared(mapping);
+
+ start = folio->index;
+ end = start + folio_nr_pages(folio);
+
+ __guestmem_invalidate_begin(mapping, start, end);
+
+ /*
+ * Do not truncate the range, what action is taken in response to the
+ * error is userspace's decision (assuming the architecture supports
+ * gracefully handling memory errors). If/when the guest attempts to
+ * access a poisoned page, kvm_gmem_get_pfn() will return -EHWPOISON,
+ * at which point KVM can either terminate the VM or propagate the
+ * error to userspace.
+ */
+
+ __guestmem_invalidate_end(mapping, start, end);
+
+ filemap_invalidate_unlock_shared(mapping);
+ return MF_FAILED;
+}
+
+static int guestmem_migrate_folio(struct address_space *mapping,
+ struct folio *dst, struct folio *src,
+ enum migrate_mode mode)
+{
+ WARN_ON_ONCE(1);
+ return -EINVAL;
+}
+
+static const struct address_space_operations guestmem_aops = {
+ .dirty_folio = noop_dirty_folio,
+ .write_begin = guestmem_write_begin,
+ .write_end = guestmem_write_end,
+ .free_folio = guestmem_free_folio,
+ .error_remove_folio = guestmem_error_folio,
+ .migrate_folio = guestmem_migrate_folio,
+};
+
+int guestmem_attach_mapping(struct address_space *mapping,
+ const struct guestmem_ops *const ops,
+ struct list_head *data)
+{
+ struct guestmem *gmem;
+
+ if (mapping->a_ops == &guestmem_aops) {
+ gmem = mapping->i_private_data;
+ if (gmem->ops != ops)
+ return -EINVAL;
+
+ goto add;
+ }
+
+ gmem = kzalloc(sizeof(*gmem), GFP_KERNEL);
+ if (!gmem)
+ return -ENOMEM;
+
+ gmem->ops = ops;
+
+ mapping->a_ops = &guestmem_aops;
+ mapping->i_private_data = gmem;
+
+ mapping_set_gfp_mask(mapping, GFP_HIGHUSER);
+ mapping_set_inaccessible(mapping);
+ /* Unmovable mappings are supposed to be marked unevictable as well. */
+ WARN_ON_ONCE(!mapping_unevictable(mapping));
+
+add:
+ list_add(data, &mapping->i_private_list);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(guestmem_attach_mapping);
+
+void guestmem_detach_mapping(struct address_space *mapping,
+ struct list_head *data)
+{
+ list_del(data);
+
+ if (list_empty(&mapping->i_private_list)) {
+		/*
+ * Ensures we call ->free_folio() for any allocated folios.
+ * Any folios allocated after this point are assumed not to be
+ * accessed by the guest, so we don't need to worry about
+ * guestmem ops not being called on them.
+ */
+ truncate_inode_pages(mapping, 0);
+
+ kfree(mapping->i_private_data);
+ mapping->i_private_data = NULL;
+ mapping->a_ops = &empty_aops;
+ }
+}
+EXPORT_SYMBOL_GPL(guestmem_detach_mapping);
+
+struct folio *guestmem_grab_folio(struct address_space *mapping, pgoff_t index)
+{
+ /* TODO: Support huge pages. */
+ return filemap_grab_folio(mapping, index);
+}
+EXPORT_SYMBOL_GPL(guestmem_grab_folio);
+
+int guestmem_punch_hole(struct address_space *mapping, loff_t offset,
+ loff_t len)
+{
+ pgoff_t start = offset >> PAGE_SHIFT;
+ pgoff_t end = (offset + len) >> PAGE_SHIFT;
+
+ filemap_invalidate_lock(mapping);
+ __guestmem_invalidate_begin(mapping, start, end);
+
+ truncate_inode_pages_range(mapping, offset, offset + len - 1);
+
+ __guestmem_invalidate_end(mapping, start, end);
+ filemap_invalidate_unlock(mapping);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(guestmem_punch_hole);
+
+int guestmem_allocate(struct address_space *mapping, loff_t offset, loff_t len)
+{
+ pgoff_t start, index, end;
+ int r;
+
+ /* Dedicated guest is immutable by default. */
+ if (offset + len > i_size_read(mapping->host))
+ return -EINVAL;
+
+ filemap_invalidate_lock_shared(mapping);
+
+ start = offset >> PAGE_SHIFT;
+ end = (offset + len) >> PAGE_SHIFT;
+
+ r = 0;
+ for (index = start; index < end; ) {
+ struct folio *folio;
+
+ if (signal_pending(current)) {
+ r = -EINTR;
+ break;
+ }
+
+ folio = guestmem_grab_folio(mapping, index);
+ if (IS_ERR(folio)) {
+ r = PTR_ERR(folio);
+ break;
+ }
+
+ index = folio_next_index(folio);
+
+ folio_unlock(folio);
+ folio_put(folio);
+
+ /* 64-bit only, wrapping the index should be impossible. */
+ if (WARN_ON_ONCE(!index))
+ break;
+
+ cond_resched();
+ }
+
+ filemap_invalidate_unlock_shared(mapping);
+
+ return r;
+}
+EXPORT_SYMBOL_GPL(guestmem_allocate);
+
+bool guestmem_test_no_direct_map(struct inode *inode)
+{
+ return mapping_no_direct_map(inode->i_mapping);
+}
+EXPORT_SYMBOL_GPL(guestmem_test_no_direct_map);
+
+void guestmem_mark_prepared(struct folio *folio)
+{
+ struct inode *inode = folio_inode(folio);
+
+ if (guestmem_test_no_direct_map(inode))
+ set_direct_map_valid_noflush(folio_page(folio, 0), folio_nr_pages(folio), false);
+
+ folio_mark_uptodate(folio);
+}
+EXPORT_SYMBOL_GPL(guestmem_mark_prepared);
+
+static vm_fault_t guestmem_fault_user_mapping(struct vm_fault *vmf)
+{
+ struct inode *inode = file_inode(vmf->vma->vm_file);
+ struct folio *folio;
+ vm_fault_t ret = VM_FAULT_LOCKED;
+
+ if (((loff_t)vmf->pgoff << PAGE_SHIFT) >= i_size_read(inode))
+ return VM_FAULT_SIGBUS;
+
+ folio = guestmem_grab_folio(inode->i_mapping, vmf->pgoff);
+ if (IS_ERR(folio)) {
+ int err = PTR_ERR(folio);
+
+ if (err == -EAGAIN)
+ return VM_FAULT_RETRY;
+
+ return vmf_error(err);
+ }
+
+ if (WARN_ON_ONCE(folio_test_large(folio))) {
+ ret = VM_FAULT_SIGBUS;
+ goto out_folio;
+ }
+
+ if (!folio_test_uptodate(folio)) {
+ clear_highpage(folio_page(folio, 0));
+ guestmem_mark_prepared(folio);
+ }
+
+ if (userfaultfd_minor(vmf->vma)) {
+ folio_unlock(folio);
+ return handle_userfault(vmf, VM_UFFD_MINOR);
+ }
+
+ vmf->page = folio_file_page(folio, vmf->pgoff);
+
+out_folio:
+ if (ret != VM_FAULT_LOCKED) {
+ folio_unlock(folio);
+ folio_put(folio);
+ }
+
+ return ret;
+}
+
+static const struct vm_operations_struct guestmem_vm_ops = {
+ .fault = guestmem_fault_user_mapping,
+};
+
+int guestmem_mmap(struct file *file, struct vm_area_struct *vma)
+{
+ struct address_space *mapping = file_inode(file)->i_mapping;
+ struct guestmem *gmem = mapping->i_private_data;
+
+ if (!gmem->ops->supports_mmap || !gmem->ops->supports_mmap(file_inode(file)))
+ return -ENODEV;
+
+ if ((vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) !=
+ (VM_SHARED | VM_MAYSHARE)) {
+ return -EINVAL;
+ }
+
+ vma->vm_ops = &guestmem_vm_ops;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(guestmem_mmap);
+
+bool guestmem_vma_is_guestmem(struct vm_area_struct *vma)
+{
+ struct inode *inode;
+
+ if (!vma->vm_file)
+ return false;
+
+ inode = file_inode(vma->vm_file);
+ if (!inode || !inode->i_mapping || !inode->i_mapping->i_private_data)
+ return false;
+
+ return inode->i_mapping->a_ops == &guestmem_aops;
+}
diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig
index 1b7d5be0b6c4..41e26ad33c1b 100644
--- a/virt/kvm/Kconfig
+++ b/virt/kvm/Kconfig
@@ -114,6 +114,7 @@ config KVM_GENERIC_MEMORY_ATTRIBUTES
config KVM_GUEST_MEMFD
select XARRAY_MULTI
+ select GUESTMEM
bool
config HAVE_KVM_ARCH_GMEM_PREPARE
diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
index 6989362c056c..15ab13bf6d40 100644
--- a/virt/kvm/guest_memfd.c
+++ b/virt/kvm/guest_memfd.c
@@ -1,6 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/backing-dev.h>
#include <linux/falloc.h>
+#include <linux/guestmem.h>
#include <linux/kvm_host.h>
#include <linux/pagemap.h>
#include <linux/anon_inodes.h>
@@ -43,26 +44,6 @@ static int __kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slo
return 0;
}
-static bool kvm_gmem_test_no_direct_map(struct inode *inode)
-{
- return ((unsigned long) inode->i_private) & GUEST_MEMFD_FLAG_NO_DIRECT_MAP;
-}
-
-static inline int kvm_gmem_mark_prepared(struct folio *folio)
-{
- struct inode *inode = folio_inode(folio);
- int r = 0;
-
- if (kvm_gmem_test_no_direct_map(inode))
- r = set_direct_map_valid_noflush(folio_page(folio, 0), folio_nr_pages(folio),
- false);
-
- if (!r)
- folio_mark_uptodate(folio);
-
- return r;
-}
-
/*
* Process @folio, which contains @gfn, so that the guest can use it.
* The folio must be locked and the gfn must be contained in @slot.
@@ -98,7 +79,7 @@ static int kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot,
index = ALIGN_DOWN(index, 1 << folio_order(folio));
r = __kvm_gmem_prepare_folio(kvm, slot, index, folio);
if (!r)
- r = kvm_gmem_mark_prepared(folio);
+ guestmem_mark_prepared(folio);
return r;
}
@@ -114,8 +95,7 @@ static int kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot,
*/
static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index)
{
- /* TODO: Support huge pages. */
- return filemap_grab_folio(inode->i_mapping, index);
+ return guestmem_grab_folio(inode->i_mapping, index);
}
static void kvm_gmem_invalidate_begin(struct kvm_gmem *gmem, pgoff_t start,
@@ -167,79 +147,6 @@ static void kvm_gmem_invalidate_end(struct kvm_gmem *gmem, pgoff_t start,
}
}
-static long kvm_gmem_punch_hole(struct inode *inode, loff_t offset, loff_t len)
-{
- struct list_head *gmem_list = &inode->i_mapping->i_private_list;
- pgoff_t start = offset >> PAGE_SHIFT;
- pgoff_t end = (offset + len) >> PAGE_SHIFT;
- struct kvm_gmem *gmem;
-
- /*
- * Bindings must be stable across invalidation to ensure the start+end
- * are balanced.
- */
- filemap_invalidate_lock(inode->i_mapping);
-
- list_for_each_entry(gmem, gmem_list, entry)
- kvm_gmem_invalidate_begin(gmem, start, end);
-
- truncate_inode_pages_range(inode->i_mapping, offset, offset + len - 1);
-
- list_for_each_entry(gmem, gmem_list, entry)
- kvm_gmem_invalidate_end(gmem, start, end);
-
- filemap_invalidate_unlock(inode->i_mapping);
-
- return 0;
-}
-
-static long kvm_gmem_allocate(struct inode *inode, loff_t offset, loff_t len)
-{
- struct address_space *mapping = inode->i_mapping;
- pgoff_t start, index, end;
- int r;
-
- /* Dedicated guest is immutable by default. */
- if (offset + len > i_size_read(inode))
- return -EINVAL;
-
- filemap_invalidate_lock_shared(mapping);
-
- start = offset >> PAGE_SHIFT;
- end = (offset + len) >> PAGE_SHIFT;
-
- r = 0;
- for (index = start; index < end; ) {
- struct folio *folio;
-
- if (signal_pending(current)) {
- r = -EINTR;
- break;
- }
-
- folio = kvm_gmem_get_folio(inode, index);
- if (IS_ERR(folio)) {
- r = PTR_ERR(folio);
- break;
- }
-
- index = folio_next_index(folio);
-
- folio_unlock(folio);
- folio_put(folio);
-
- /* 64-bit only, wrapping the index should be impossible. */
- if (WARN_ON_ONCE(!index))
- break;
-
- cond_resched();
- }
-
- filemap_invalidate_unlock_shared(mapping);
-
- return r;
-}
-
static long kvm_gmem_fallocate(struct file *file, int mode, loff_t offset,
loff_t len)
{
@@ -255,9 +162,9 @@ static long kvm_gmem_fallocate(struct file *file, int mode, loff_t offset,
return -EINVAL;
if (mode & FALLOC_FL_PUNCH_HOLE)
- ret = kvm_gmem_punch_hole(file_inode(file), offset, len);
+ ret = guestmem_punch_hole(file_inode(file)->i_mapping, offset, len);
else
- ret = kvm_gmem_allocate(file_inode(file), offset, len);
+ ret = guestmem_allocate(file_inode(file)->i_mapping, offset, len);
if (!ret)
file_modified(file);
@@ -299,7 +206,7 @@ static int kvm_gmem_release(struct inode *inode, struct file *file)
kvm_gmem_invalidate_begin(gmem, 0, -1ul);
kvm_gmem_invalidate_end(gmem, 0, -1ul);
- list_del(&gmem->entry);
+ guestmem_detach_mapping(inode->i_mapping, &gmem->entry);
filemap_invalidate_unlock(inode->i_mapping);
@@ -335,74 +242,8 @@ static bool kvm_gmem_supports_mmap(struct inode *inode)
return flags & GUEST_MEMFD_FLAG_MMAP;
}
-static vm_fault_t kvm_gmem_fault_user_mapping(struct vm_fault *vmf)
-{
- struct inode *inode = file_inode(vmf->vma->vm_file);
- struct folio *folio;
- vm_fault_t ret = VM_FAULT_LOCKED;
-
- if (((loff_t)vmf->pgoff << PAGE_SHIFT) >= i_size_read(inode))
- return VM_FAULT_SIGBUS;
-
- folio = kvm_gmem_get_folio(inode, vmf->pgoff);
- if (IS_ERR(folio)) {
- int err = PTR_ERR(folio);
-
- if (err == -EAGAIN)
- return VM_FAULT_RETRY;
-
- return vmf_error(err);
- }
-
- if (WARN_ON_ONCE(folio_test_large(folio))) {
- ret = VM_FAULT_SIGBUS;
- goto out_folio;
- }
-
- if (!folio_test_uptodate(folio)) {
- int err = 0;
-
- clear_highpage(folio_page(folio, 0));
- err = kvm_gmem_mark_prepared(folio);
-
- if (err) {
- ret = vmf_error(err);
- goto out_folio;
- }
- }
-
- vmf->page = folio_file_page(folio, vmf->pgoff);
-
-out_folio:
- if (ret != VM_FAULT_LOCKED) {
- folio_unlock(folio);
- folio_put(folio);
- }
-
- return ret;
-}
-
-static const struct vm_operations_struct kvm_gmem_vm_ops = {
- .fault = kvm_gmem_fault_user_mapping,
-};
-
-static int kvm_gmem_mmap(struct file *file, struct vm_area_struct *vma)
-{
- if (!kvm_gmem_supports_mmap(file_inode(file)))
- return -ENODEV;
-
- if ((vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) !=
- (VM_SHARED | VM_MAYSHARE)) {
- return -EINVAL;
- }
-
- vma->vm_ops = &kvm_gmem_vm_ops;
-
- return 0;
-}
-
static struct file_operations kvm_gmem_fops = {
- .mmap = kvm_gmem_mmap,
+ .mmap = guestmem_mmap,
.llseek = default_llseek,
.write_iter = generic_perform_write,
.open = generic_file_open,
@@ -415,104 +256,24 @@ void kvm_gmem_init(struct module *module)
kvm_gmem_fops.owner = module;
}
-static int kvm_kmem_gmem_write_begin(const struct kiocb *kiocb,
- struct address_space *mapping,
- loff_t pos, unsigned int len,
- struct folio **foliop,
- void **fsdata)
-{
- struct file *file = kiocb->ki_filp;
- pgoff_t index = pos >> PAGE_SHIFT;
- struct folio *folio;
-
- if (!PAGE_ALIGNED(pos) || len != PAGE_SIZE)
- return -EINVAL;
-
- if (pos + len > i_size_read(file_inode(file)))
- return -EINVAL;
-
- folio = kvm_gmem_get_folio(file_inode(file), index);
- if (IS_ERR(folio))
- return -EFAULT;
-
- if (WARN_ON_ONCE(folio_test_large(folio))) {
- folio_unlock(folio);
- folio_put(folio);
- return -EFAULT;
- }
-
- if (folio_test_uptodate(folio)) {
- folio_unlock(folio);
- folio_put(folio);
- return -ENOSPC;
- }
-
- *foliop = folio;
- return 0;
-}
-
-static int kvm_kmem_gmem_write_end(const struct kiocb *kiocb,
- struct address_space *mapping,
- loff_t pos, unsigned int len,
- unsigned int copied,
- struct folio *folio, void *fsdata)
+static void kvm_guestmem_invalidate_begin(struct list_head *entry, pgoff_t start,
+ pgoff_t end)
{
- if (copied) {
- if (copied < len) {
- unsigned int from = pos & (PAGE_SIZE - 1);
-
- folio_zero_range(folio, from + copied, len - copied);
- }
- kvm_gmem_mark_prepared(folio);
- }
-
- folio_unlock(folio);
- folio_put(folio);
-
- return copied;
-}
+ struct kvm_gmem *gmem = container_of(entry, struct kvm_gmem, entry);
-static int kvm_gmem_migrate_folio(struct address_space *mapping,
- struct folio *dst, struct folio *src,
- enum migrate_mode mode)
-{
- WARN_ON_ONCE(1);
- return -EINVAL;
+ kvm_gmem_invalidate_begin(gmem, start, end);
}
-static int kvm_gmem_error_folio(struct address_space *mapping, struct folio *folio)
+static void kvm_guestmem_invalidate_end(struct list_head *entry, pgoff_t start,
+ pgoff_t end)
{
- struct list_head *gmem_list = &mapping->i_private_list;
- struct kvm_gmem *gmem;
- pgoff_t start, end;
-
- filemap_invalidate_lock_shared(mapping);
-
- start = folio->index;
- end = start + folio_nr_pages(folio);
-
- list_for_each_entry(gmem, gmem_list, entry)
- kvm_gmem_invalidate_begin(gmem, start, end);
+ struct kvm_gmem *gmem = container_of(entry, struct kvm_gmem, entry);
- /*
- * Do not truncate the range, what action is taken in response to the
- * error is userspace's decision (assuming the architecture supports
- * gracefully handling memory errors). If/when the guest attempts to
- * access a poisoned page, kvm_gmem_get_pfn() will return -EHWPOISON,
- * at which point KVM can either terminate the VM or propagate the
- * error to userspace.
- */
-
- list_for_each_entry(gmem, gmem_list, entry)
- kvm_gmem_invalidate_end(gmem, start, end);
-
- filemap_invalidate_unlock_shared(mapping);
-
- return MF_DELAYED;
+ kvm_gmem_invalidate_end(gmem, start, end);
}
-static void kvm_gmem_free_folio(struct address_space *mapping,
- struct folio *folio)
+static bool kvm_gmem_release_folio(struct address_space *mapping,
+ struct folio *folio)
{
struct page *page = folio_page(folio, 0);
kvm_pfn_t pfn = page_to_pfn(page);
@@ -525,19 +286,19 @@ static void kvm_gmem_free_folio(struct address_space *mapping,
* happened in set_direct_map_invalid_noflush() in kvm_gmem_mark_prepared().
* Thus set_direct_map_valid_noflush() here only updates prot bits.
*/
- if (kvm_gmem_test_no_direct_map(mapping->host))
+ if (guestmem_test_no_direct_map(mapping->host))
set_direct_map_valid_noflush(page, folio_nr_pages(folio), true);
kvm_arch_gmem_invalidate(pfn, pfn + (1ul << order));
+
+ return true;
}
-static const struct address_space_operations kvm_gmem_aops = {
- .dirty_folio = noop_dirty_folio,
- .write_begin = kvm_kmem_gmem_write_begin,
- .write_end = kvm_kmem_gmem_write_end,
- .migrate_folio = kvm_gmem_migrate_folio,
- .error_remove_folio = kvm_gmem_error_folio,
- .free_folio = kvm_gmem_free_folio,
+static const struct guestmem_ops kvm_guestmem_ops = {
+ .invalidate_begin = kvm_guestmem_invalidate_begin,
+ .invalidate_end = kvm_guestmem_invalidate_end,
+ .release_folio = kvm_gmem_release_folio,
+ .supports_mmap = kvm_gmem_supports_mmap,
};
static int kvm_gmem_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
@@ -587,13 +348,12 @@ static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags)
inode->i_private = (void *)(unsigned long)flags;
inode->i_op = &kvm_gmem_iops;
- inode->i_mapping->a_ops = &kvm_gmem_aops;
inode->i_mode |= S_IFREG;
inode->i_size = size;
- mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER);
- mapping_set_inaccessible(inode->i_mapping);
- /* Unmovable mappings are supposed to be marked unevictable as well. */
- WARN_ON_ONCE(!mapping_unevictable(inode->i_mapping));
+ err = guestmem_attach_mapping(inode->i_mapping, &kvm_guestmem_ops,
+ &gmem->entry);
+ if (err)
+ goto err_putfile;
if (flags & GUEST_MEMFD_FLAG_NO_DIRECT_MAP)
mapping_set_no_direct_map(inode->i_mapping);
@@ -601,11 +361,12 @@ static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags)
kvm_get_kvm(kvm);
gmem->kvm = kvm;
xa_init(&gmem->bindings);
- list_add(&gmem->entry, &inode->i_mapping->i_private_list);
fd_install(fd, file);
return fd;
+err_putfile:
+ fput(file);
err_gmem:
kfree(gmem);
err_fd:
@@ -869,7 +630,7 @@ long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long
p = src ? src + i * PAGE_SIZE : NULL;
ret = post_populate(kvm, gfn, pfn, p, max_order, opaque);
if (!ret)
- ret = kvm_gmem_mark_prepared(folio);
+ guestmem_mark_prepared(folio);
put_folio_and_exit:
folio_put(folio);
--
2.50.1