Message-ID: <20250915161815.40729-2-kalyazin@amazon.com>
Date: Mon, 15 Sep 2025 16:18:27 +0000
From: "Kalyazin, Nikita" <kalyazin@...zon.co.uk>
To: "akpm@...ux-foundation.org" <akpm@...ux-foundation.org>,
	"david@...hat.com" <david@...hat.com>, "pbonzini@...hat.com"
	<pbonzini@...hat.com>, "seanjc@...gle.com" <seanjc@...gle.com>,
	"viro@...iv.linux.org.uk" <viro@...iv.linux.org.uk>, "brauner@...nel.org"
	<brauner@...nel.org>
CC: "peterx@...hat.com" <peterx@...hat.com>, "lorenzo.stoakes@...cle.com"
	<lorenzo.stoakes@...cle.com>, "Liam.Howlett@...cle.com"
	<Liam.Howlett@...cle.com>, "willy@...radead.org" <willy@...radead.org>,
	"vbabka@...e.cz" <vbabka@...e.cz>, "rppt@...nel.org" <rppt@...nel.org>,
	"surenb@...gle.com" <surenb@...gle.com>, "mhocko@...e.com" <mhocko@...e.com>,
	"jack@...e.cz" <jack@...e.cz>, "linux-mm@...ck.org" <linux-mm@...ck.org>,
	"kvm@...r.kernel.org" <kvm@...r.kernel.org>, "linux-kernel@...r.kernel.org"
	<linux-kernel@...r.kernel.org>, "linux-fsdevel@...r.kernel.org"
	<linux-fsdevel@...r.kernel.org>, "jthoughton@...gle.com"
	<jthoughton@...gle.com>, "tabba@...gle.com" <tabba@...gle.com>,
	"vannapurve@...gle.com" <vannapurve@...gle.com>, "Roy, Patrick"
	<roypat@...zon.co.uk>, "Thomson, Jack" <jackabt@...zon.co.uk>, "Manwaring,
 Derek" <derekmn@...zon.com>, "Cali, Marco" <xmarcalx@...zon.co.uk>,
	"Kalyazin, Nikita" <kalyazin@...zon.co.uk>
Subject: [RFC PATCH v6 1/2] mm: guestmem: introduce guestmem library

From: Nikita Kalyazin <kalyazin@...zon.com>

Move the MM-generic parts of guest_memfd from KVM to MM.  This allows
other hypervisors to reuse the guestmem code and enables a userfaultfd
implementation for guest_memfd [1].  Previously this was not possible
because KVM (and with it the guest_memfd code) may be built as a module.

Based on a patch by Elliot Berman <quic_eberman@...cinc.com> [2].

[1] https://lore.kernel.org/kvm/20250404154352.23078-1-kalyazin@amazon.com
[2] https://lore.kernel.org/kvm/20241122-guestmem-library-v5-2-450e92951a15@quicinc.com
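
For illustration, a hypervisor hooks into the library roughly as in the
sketch below (not part of the patch; the foo_* names are placeholders,
while the guestmem_* identifiers are the ones introduced here):

  #include <linux/guestmem.h>

  /* Hypothetical per-hypervisor state; 'entry' lands on i_private_list. */
  struct foo_gmem {
  	struct list_head entry;
  };

  static void foo_invalidate_begin(struct list_head *entry, pgoff_t start,
  				   pgoff_t end)
  {
  	/*
  	 * container_of(entry, struct foo_gmem, entry) recovers the
  	 * per-instance state; unmap/invalidate [start, end) here.
  	 */
  }

  static const struct guestmem_ops foo_guestmem_ops = {
  	.invalidate_begin = foo_invalidate_begin,
  	/* .invalidate_end, .release_folio and .supports_mmap are optional */
  };

  /* Called when the hypervisor creates its guest-memory inode. */
  int foo_create(struct inode *inode, struct foo_gmem *foo)
  {
  	return guestmem_attach_mapping(inode->i_mapping, &foo_guestmem_ops,
  				       &foo->entry);
  }

  /* Called on the final release of that inode's file. */
  void foo_release(struct inode *inode, struct foo_gmem *foo)
  {
  	guestmem_detach_mapping(inode->i_mapping, &foo->entry);
  }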

Signed-off-by: Nikita Kalyazin <kalyazin@...zon.com>
---
 MAINTAINERS              |   2 +
 include/linux/guestmem.h |  46 +++++
 mm/Kconfig               |   3 +
 mm/Makefile              |   1 +
 mm/guestmem.c            | 380 +++++++++++++++++++++++++++++++++++++++
 virt/kvm/Kconfig         |   1 +
 virt/kvm/guest_memfd.c   | 303 ++++---------------------------
 7 files changed, 465 insertions(+), 271 deletions(-)
 create mode 100644 include/linux/guestmem.h
 create mode 100644 mm/guestmem.c

diff --git a/MAINTAINERS b/MAINTAINERS
index fed6cd812d79..c468c4847ffd 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -15956,6 +15956,7 @@ W:	http://www.linux-mm.org
 T:	git git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
 T:	quilt git://git.kernel.org/pub/scm/linux/kernel/git/akpm/25-new
 F:	mm/
+F:	mm/guestmem.c
 F:	tools/mm/
 
 MEMORY MANAGEMENT - CORE
@@ -15973,6 +15974,7 @@ W:	http://www.linux-mm.org
 T:	git git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
 F:	include/linux/gfp.h
 F:	include/linux/gfp_types.h
+F:	include/linux/guestmem.h
 F:	include/linux/highmem.h
 F:	include/linux/memory.h
 F:	include/linux/mm.h
diff --git a/include/linux/guestmem.h b/include/linux/guestmem.h
new file mode 100644
index 000000000000..2a173261d32b
--- /dev/null
+++ b/include/linux/guestmem.h
@@ -0,0 +1,46 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_GUESTMEM_H
+#define _LINUX_GUESTMEM_H
+
+#include <linux/types.h>
+
+struct address_space;
+struct list_head;
+struct inode;
+
+/**
+ * struct guestmem_ops - Hypervisor-specific maintenance operations
+ * @release_folio: try to return the folio to being fully owned by Linux,
+ *		    e.g. because the folio is about to be freed [optional]
+ * @invalidate_begin: start invalidating mappings between start and end offsets
+ * @invalidate_end: paired with ->invalidate_begin() [optional]
+ * @supports_mmap: return true if the inode supports mmap [optional]
+ */
+struct guestmem_ops {
+	bool (*release_folio)(struct address_space *mapping,
+			      struct folio *folio);
+	void (*invalidate_begin)(struct list_head *entry, pgoff_t start,
+				 pgoff_t end);
+	void (*invalidate_end)(struct list_head *entry, pgoff_t start,
+			       pgoff_t end);
+	bool (*supports_mmap)(struct inode *inode);
+};
+
+int guestmem_attach_mapping(struct address_space *mapping,
+			    const struct guestmem_ops *const ops,
+			    struct list_head *data);
+void guestmem_detach_mapping(struct address_space *mapping,
+			     struct list_head *data);
+
+struct folio *guestmem_grab_folio(struct address_space *mapping, pgoff_t index);
+
+int guestmem_punch_hole(struct address_space *mapping, loff_t offset,
+			loff_t len);
+int guestmem_allocate(struct address_space *mapping, loff_t offset, loff_t len);
+
+bool guestmem_test_no_direct_map(struct inode *inode);
+void guestmem_mark_prepared(struct folio *folio);
+int guestmem_mmap(struct file *file, struct vm_area_struct *vma);
+bool guestmem_vma_is_guestmem(struct vm_area_struct *vma);
+
+#endif
diff --git a/mm/Kconfig b/mm/Kconfig
index e443fe8cd6cf..a3705099601f 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -1254,6 +1254,9 @@ config SECRETMEM
 	  memory areas visible only in the context of the owning process and
 	  not mapped to other processes and other kernel page tables.
 
+config GUESTMEM
+	bool
+
 config ANON_VMA_NAME
 	bool "Anonymous VMA name support"
 	depends on PROC_FS && ADVISE_SYSCALLS && MMU
diff --git a/mm/Makefile b/mm/Makefile
index ef54aa615d9d..c92892acd819 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -138,6 +138,7 @@ obj-$(CONFIG_PERCPU_STATS) += percpu-stats.o
 obj-$(CONFIG_ZONE_DEVICE) += memremap.o
 obj-$(CONFIG_HMM_MIRROR) += hmm.o
 obj-$(CONFIG_MEMFD_CREATE) += memfd.o
+obj-$(CONFIG_GUESTMEM) += guestmem.o
 obj-$(CONFIG_MAPPING_DIRTY_HELPERS) += mapping_dirty_helpers.o
 obj-$(CONFIG_PTDUMP) += ptdump.o
 obj-$(CONFIG_PAGE_REPORTING) += page_reporting.o
diff --git a/mm/guestmem.c b/mm/guestmem.c
new file mode 100644
index 000000000000..110087aff7e8
--- /dev/null
+++ b/mm/guestmem.c
@@ -0,0 +1,380 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/fs.h>
+#include <linux/guestmem.h>
+#include <linux/mm.h>
+#include <linux/pagemap.h>
+#include <linux/set_memory.h>
+#include <linux/userfaultfd_k.h>
+
+struct guestmem {
+	const struct guestmem_ops *ops;
+};
+
+static inline bool __guestmem_release_folio(struct address_space *mapping,
+					    struct folio *folio)
+{
+	struct guestmem *gmem = mapping->i_private_data;
+
+	if (gmem->ops->release_folio) {
+		if (!gmem->ops->release_folio(mapping, folio))
+			return false;
+	}
+
+	return true;
+}
+
+static inline void
+__guestmem_invalidate_begin(struct address_space *const mapping, pgoff_t start,
+			    pgoff_t end)
+{
+	struct guestmem *gmem = mapping->i_private_data;
+	struct list_head *entry;
+
+	list_for_each(entry, &mapping->i_private_list)
+		gmem->ops->invalidate_begin(entry, start, end);
+}
+
+static inline void
+__guestmem_invalidate_end(struct address_space *const mapping, pgoff_t start,
+			  pgoff_t end)
+{
+	struct guestmem *gmem = mapping->i_private_data;
+	struct list_head *entry;
+
+	if (gmem->ops->invalidate_end) {
+		list_for_each(entry, &mapping->i_private_list)
+			gmem->ops->invalidate_end(entry, start, end);
+	}
+}
+
+static int guestmem_write_begin(const struct kiocb *kiocb,
+				struct address_space *mapping,
+				loff_t pos, unsigned int len,
+				struct folio **foliop,
+				void **fsdata)
+{
+	struct file *file = kiocb->ki_filp;
+	pgoff_t index = pos >> PAGE_SHIFT;
+	struct folio *folio;
+
+	if (!PAGE_ALIGNED(pos) || len != PAGE_SIZE)
+		return -EINVAL;
+
+	if (pos + len > i_size_read(file_inode(file)))
+		return -EINVAL;
+
+	folio = guestmem_grab_folio(file_inode(file)->i_mapping, index);
+	if (IS_ERR(folio))
+		return -EFAULT;
+
+	if (WARN_ON_ONCE(folio_test_large(folio))) {
+		folio_unlock(folio);
+		folio_put(folio);
+		return -EFAULT;
+	}
+
+	if (folio_test_uptodate(folio)) {
+		folio_unlock(folio);
+		folio_put(folio);
+		return -ENOSPC;
+	}
+
+	*foliop = folio;
+	return 0;
+}
+
+static int guestmem_write_end(const struct kiocb *kiocb,
+			      struct address_space *mapping,
+			      loff_t pos, unsigned int len, unsigned int copied,
+			      struct folio *folio, void *fsdata)
+{
+	if (copied) {
+		if (copied < len) {
+			unsigned int from = pos & (PAGE_SIZE - 1);
+
+			folio_zero_range(folio, from + copied, len - copied);
+		}
+		guestmem_mark_prepared(folio);
+	}
+
+	folio_unlock(folio);
+	folio_put(folio);
+
+	return copied;
+}
+
+static void guestmem_free_folio(struct address_space *mapping,
+				struct folio *folio)
+{
+	WARN_ON_ONCE(!__guestmem_release_folio(mapping, folio));
+}
+
+static int guestmem_error_folio(struct address_space *mapping,
+				struct folio *folio)
+{
+	pgoff_t start, end;
+
+	filemap_invalidate_lock_shared(mapping);
+
+	start = folio->index;
+	end = start + folio_nr_pages(folio);
+
+	__guestmem_invalidate_begin(mapping, start, end);
+
+	/*
+	 * Do not truncate the range, what action is taken in response to the
+	 * error is userspace's decision (assuming the architecture supports
+	 * gracefully handling memory errors).  If/when the guest attempts to
+	 * access a poisoned page, kvm_gmem_get_pfn() will return -EHWPOISON,
+	 * at which point KVM can either terminate the VM or propagate the
+	 * error to userspace.
+	 */
+
+	__guestmem_invalidate_end(mapping, start, end);
+
+	filemap_invalidate_unlock_shared(mapping);
+	return MF_FAILED;
+}
+
+static int guestmem_migrate_folio(struct address_space *mapping,
+				  struct folio *dst, struct folio *src,
+				  enum migrate_mode mode)
+{
+	WARN_ON_ONCE(1);
+	return -EINVAL;
+}
+
+static const struct address_space_operations guestmem_aops = {
+	.dirty_folio = noop_dirty_folio,
+	.write_begin = guestmem_write_begin,
+	.write_end = guestmem_write_end,
+	.free_folio = guestmem_free_folio,
+	.error_remove_folio = guestmem_error_folio,
+	.migrate_folio = guestmem_migrate_folio,
+};
+
+int guestmem_attach_mapping(struct address_space *mapping,
+			    const struct guestmem_ops *const ops,
+			    struct list_head *data)
+{
+	struct guestmem *gmem;
+
+	if (mapping->a_ops == &guestmem_aops) {
+		gmem = mapping->i_private_data;
+		if (gmem->ops != ops)
+			return -EINVAL;
+
+		goto add;
+	}
+
+	gmem = kzalloc(sizeof(*gmem), GFP_KERNEL);
+	if (!gmem)
+		return -ENOMEM;
+
+	gmem->ops = ops;
+
+	mapping->a_ops = &guestmem_aops;
+	mapping->i_private_data = gmem;
+
+	mapping_set_gfp_mask(mapping, GFP_HIGHUSER);
+	mapping_set_inaccessible(mapping);
+	/* Unmovable mappings are supposed to be marked unevictable as well. */
+	WARN_ON_ONCE(!mapping_unevictable(mapping));
+
+add:
+	list_add(data, &mapping->i_private_list);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(guestmem_attach_mapping);
+
+void guestmem_detach_mapping(struct address_space *mapping,
+			     struct list_head *data)
+{
+	list_del(data);
+
+	if (list_empty(&mapping->i_private_list)) {
+		/*
+		 * Ensures we call ->free_folio() for any allocated folios.
+		 * Any folios allocated after this point are assumed not to be
+		 * accessed by the guest, so we don't need to worry about
+		 * guestmem ops not being called on them.
+		 */
+		truncate_inode_pages(mapping, 0);
+
+		kfree(mapping->i_private_data);
+		mapping->i_private_data = NULL;
+		mapping->a_ops = &empty_aops;
+	}
+}
+EXPORT_SYMBOL_GPL(guestmem_detach_mapping);
+
+struct folio *guestmem_grab_folio(struct address_space *mapping, pgoff_t index)
+{
+	/* TODO: Support huge pages. */
+	return filemap_grab_folio(mapping, index);
+}
+EXPORT_SYMBOL_GPL(guestmem_grab_folio);
+
+int guestmem_punch_hole(struct address_space *mapping, loff_t offset,
+			loff_t len)
+{
+	pgoff_t start = offset >> PAGE_SHIFT;
+	pgoff_t end = (offset + len) >> PAGE_SHIFT;
+
+	filemap_invalidate_lock(mapping);
+	__guestmem_invalidate_begin(mapping, start, end);
+
+	truncate_inode_pages_range(mapping, offset, offset + len - 1);
+
+	__guestmem_invalidate_end(mapping, start, end);
+	filemap_invalidate_unlock(mapping);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(guestmem_punch_hole);
+
+int guestmem_allocate(struct address_space *mapping, loff_t offset, loff_t len)
+{
+	pgoff_t start, index, end;
+	int r;
+
+	/* Dedicated guest is immutable by default. */
+	if (offset + len > i_size_read(mapping->host))
+		return -EINVAL;
+
+	filemap_invalidate_lock_shared(mapping);
+
+	start = offset >> PAGE_SHIFT;
+	end = (offset + len) >> PAGE_SHIFT;
+
+	r = 0;
+	for (index = start; index < end; ) {
+		struct folio *folio;
+
+		if (signal_pending(current)) {
+			r = -EINTR;
+			break;
+		}
+
+		folio = guestmem_grab_folio(mapping, index);
+		if (IS_ERR(folio)) {
+			r = PTR_ERR(folio);
+			break;
+		}
+
+		index = folio_next_index(folio);
+
+		folio_unlock(folio);
+		folio_put(folio);
+
+		/* 64-bit only, wrapping the index should be impossible. */
+		if (WARN_ON_ONCE(!index))
+			break;
+
+		cond_resched();
+	}
+
+	filemap_invalidate_unlock_shared(mapping);
+
+	return r;
+}
+EXPORT_SYMBOL_GPL(guestmem_allocate);
+
+bool guestmem_test_no_direct_map(struct inode *inode)
+{
+	return mapping_no_direct_map(inode->i_mapping);
+}
+EXPORT_SYMBOL_GPL(guestmem_test_no_direct_map);
+
+void guestmem_mark_prepared(struct folio *folio)
+{
+	struct inode *inode = folio_inode(folio);
+
+	if (guestmem_test_no_direct_map(inode))
+		set_direct_map_valid_noflush(folio_page(folio, 0), folio_nr_pages(folio), false);
+
+	folio_mark_uptodate(folio);
+}
+EXPORT_SYMBOL_GPL(guestmem_mark_prepared);
+
+static vm_fault_t guestmem_fault_user_mapping(struct vm_fault *vmf)
+{
+	struct inode *inode = file_inode(vmf->vma->vm_file);
+	struct folio *folio;
+	vm_fault_t ret = VM_FAULT_LOCKED;
+
+	if (((loff_t)vmf->pgoff << PAGE_SHIFT) >= i_size_read(inode))
+		return VM_FAULT_SIGBUS;
+
+	folio = guestmem_grab_folio(inode->i_mapping, vmf->pgoff);
+	if (IS_ERR(folio)) {
+		int err = PTR_ERR(folio);
+
+		if (err == -EAGAIN)
+			return VM_FAULT_RETRY;
+
+		return vmf_error(err);
+	}
+
+	if (WARN_ON_ONCE(folio_test_large(folio))) {
+		ret = VM_FAULT_SIGBUS;
+		goto out_folio;
+	}
+
+	if (!folio_test_uptodate(folio)) {
+		clear_highpage(folio_page(folio, 0));
+		guestmem_mark_prepared(folio);
+	}
+
+	if (userfaultfd_minor(vmf->vma)) {
+		folio_unlock(folio);
+		return handle_userfault(vmf, VM_UFFD_MINOR);
+	}
+
+	vmf->page = folio_file_page(folio, vmf->pgoff);
+
+out_folio:
+	if (ret != VM_FAULT_LOCKED) {
+		folio_unlock(folio);
+		folio_put(folio);
+	}
+
+	return ret;
+}
+
+static const struct vm_operations_struct guestmem_vm_ops = {
+	.fault = guestmem_fault_user_mapping,
+};
+
+int guestmem_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	struct address_space *mapping = file_inode(file)->i_mapping;
+	struct guestmem *gmem = mapping->i_private_data;
+
+	if (!gmem->ops->supports_mmap || !gmem->ops->supports_mmap(file_inode(file)))
+		return -ENODEV;
+
+	if ((vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) !=
+	    (VM_SHARED | VM_MAYSHARE)) {
+		return -EINVAL;
+	}
+
+	vma->vm_ops = &guestmem_vm_ops;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(guestmem_mmap);
+
+bool guestmem_vma_is_guestmem(struct vm_area_struct *vma)
+{
+	struct inode *inode;
+
+	if (!vma->vm_file)
+		return false;
+
+	inode = file_inode(vma->vm_file);
+	if (!inode || !inode->i_mapping || !inode->i_mapping->i_private_data)
+		return false;
+
+	return inode->i_mapping->a_ops == &guestmem_aops;
+}
diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig
index 1b7d5be0b6c4..41e26ad33c1b 100644
--- a/virt/kvm/Kconfig
+++ b/virt/kvm/Kconfig
@@ -114,6 +114,7 @@ config KVM_GENERIC_MEMORY_ATTRIBUTES
 
 config KVM_GUEST_MEMFD
        select XARRAY_MULTI
+       select GUESTMEM
        bool
 
 config HAVE_KVM_ARCH_GMEM_PREPARE
diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
index 6989362c056c..15ab13bf6d40 100644
--- a/virt/kvm/guest_memfd.c
+++ b/virt/kvm/guest_memfd.c
@@ -1,6 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0
 #include <linux/backing-dev.h>
 #include <linux/falloc.h>
+#include <linux/guestmem.h>
 #include <linux/kvm_host.h>
 #include <linux/pagemap.h>
 #include <linux/anon_inodes.h>
@@ -43,26 +44,6 @@ static int __kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slo
 	return 0;
 }
 
-static bool kvm_gmem_test_no_direct_map(struct inode *inode)
-{
-	return ((unsigned long) inode->i_private) & GUEST_MEMFD_FLAG_NO_DIRECT_MAP;
-}
-
-static inline int kvm_gmem_mark_prepared(struct folio *folio)
-{
-	struct inode *inode = folio_inode(folio);
-	int r = 0;
-
-	if (kvm_gmem_test_no_direct_map(inode))
-		r = set_direct_map_valid_noflush(folio_page(folio, 0), folio_nr_pages(folio),
-						 false);
-
-	if (!r)
-		folio_mark_uptodate(folio);
-
-	return r;
-}
-
 /*
  * Process @folio, which contains @gfn, so that the guest can use it.
  * The folio must be locked and the gfn must be contained in @slot.
@@ -98,7 +79,7 @@ static int kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot,
 	index = ALIGN_DOWN(index, 1 << folio_order(folio));
 	r = __kvm_gmem_prepare_folio(kvm, slot, index, folio);
 	if (!r)
-		r = kvm_gmem_mark_prepared(folio);
+		guestmem_mark_prepared(folio);
 
 	return r;
 }
@@ -114,8 +95,7 @@ static int kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot,
  */
 static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index)
 {
-	/* TODO: Support huge pages. */
-	return filemap_grab_folio(inode->i_mapping, index);
+	return guestmem_grab_folio(inode->i_mapping, index);
 }
 
 static void kvm_gmem_invalidate_begin(struct kvm_gmem *gmem, pgoff_t start,
@@ -167,79 +147,6 @@ static void kvm_gmem_invalidate_end(struct kvm_gmem *gmem, pgoff_t start,
 	}
 }
 
-static long kvm_gmem_punch_hole(struct inode *inode, loff_t offset, loff_t len)
-{
-	struct list_head *gmem_list = &inode->i_mapping->i_private_list;
-	pgoff_t start = offset >> PAGE_SHIFT;
-	pgoff_t end = (offset + len) >> PAGE_SHIFT;
-	struct kvm_gmem *gmem;
-
-	/*
-	 * Bindings must be stable across invalidation to ensure the start+end
-	 * are balanced.
-	 */
-	filemap_invalidate_lock(inode->i_mapping);
-
-	list_for_each_entry(gmem, gmem_list, entry)
-		kvm_gmem_invalidate_begin(gmem, start, end);
-
-	truncate_inode_pages_range(inode->i_mapping, offset, offset + len - 1);
-
-	list_for_each_entry(gmem, gmem_list, entry)
-		kvm_gmem_invalidate_end(gmem, start, end);
-
-	filemap_invalidate_unlock(inode->i_mapping);
-
-	return 0;
-}
-
-static long kvm_gmem_allocate(struct inode *inode, loff_t offset, loff_t len)
-{
-	struct address_space *mapping = inode->i_mapping;
-	pgoff_t start, index, end;
-	int r;
-
-	/* Dedicated guest is immutable by default. */
-	if (offset + len > i_size_read(inode))
-		return -EINVAL;
-
-	filemap_invalidate_lock_shared(mapping);
-
-	start = offset >> PAGE_SHIFT;
-	end = (offset + len) >> PAGE_SHIFT;
-
-	r = 0;
-	for (index = start; index < end; ) {
-		struct folio *folio;
-
-		if (signal_pending(current)) {
-			r = -EINTR;
-			break;
-		}
-
-		folio = kvm_gmem_get_folio(inode, index);
-		if (IS_ERR(folio)) {
-			r = PTR_ERR(folio);
-			break;
-		}
-
-		index = folio_next_index(folio);
-
-		folio_unlock(folio);
-		folio_put(folio);
-
-		/* 64-bit only, wrapping the index should be impossible. */
-		if (WARN_ON_ONCE(!index))
-			break;
-
-		cond_resched();
-	}
-
-	filemap_invalidate_unlock_shared(mapping);
-
-	return r;
-}
-
 static long kvm_gmem_fallocate(struct file *file, int mode, loff_t offset,
 			       loff_t len)
 {
@@ -255,9 +162,9 @@ static long kvm_gmem_fallocate(struct file *file, int mode, loff_t offset,
 		return -EINVAL;
 
 	if (mode & FALLOC_FL_PUNCH_HOLE)
-		ret = kvm_gmem_punch_hole(file_inode(file), offset, len);
+		ret = guestmem_punch_hole(file_inode(file)->i_mapping, offset, len);
 	else
-		ret = kvm_gmem_allocate(file_inode(file), offset, len);
+		ret = guestmem_allocate(file_inode(file)->i_mapping, offset, len);
 
 	if (!ret)
 		file_modified(file);
@@ -299,7 +206,7 @@ static int kvm_gmem_release(struct inode *inode, struct file *file)
 	kvm_gmem_invalidate_begin(gmem, 0, -1ul);
 	kvm_gmem_invalidate_end(gmem, 0, -1ul);
 
-	list_del(&gmem->entry);
+	guestmem_detach_mapping(inode->i_mapping, &gmem->entry);
 
 	filemap_invalidate_unlock(inode->i_mapping);
 
@@ -335,74 +242,8 @@ static bool kvm_gmem_supports_mmap(struct inode *inode)
 	return flags & GUEST_MEMFD_FLAG_MMAP;
 }
 
-static vm_fault_t kvm_gmem_fault_user_mapping(struct vm_fault *vmf)
-{
-	struct inode *inode = file_inode(vmf->vma->vm_file);
-	struct folio *folio;
-	vm_fault_t ret = VM_FAULT_LOCKED;
-
-	if (((loff_t)vmf->pgoff << PAGE_SHIFT) >= i_size_read(inode))
-		return VM_FAULT_SIGBUS;
-
-	folio = kvm_gmem_get_folio(inode, vmf->pgoff);
-	if (IS_ERR(folio)) {
-		int err = PTR_ERR(folio);
-
-		if (err == -EAGAIN)
-			return VM_FAULT_RETRY;
-
-		return vmf_error(err);
-	}
-
-	if (WARN_ON_ONCE(folio_test_large(folio))) {
-		ret = VM_FAULT_SIGBUS;
-		goto out_folio;
-	}
-
-	if (!folio_test_uptodate(folio)) {
-		int err = 0;
-
-		clear_highpage(folio_page(folio, 0));
-		err = kvm_gmem_mark_prepared(folio);
-
-		if (err) {
-			ret = vmf_error(err);
-			goto out_folio;
-		}
-	}
-
-	vmf->page = folio_file_page(folio, vmf->pgoff);
-
-out_folio:
-	if (ret != VM_FAULT_LOCKED) {
-		folio_unlock(folio);
-		folio_put(folio);
-	}
-
-	return ret;
-}
-
-static const struct vm_operations_struct kvm_gmem_vm_ops = {
-	.fault = kvm_gmem_fault_user_mapping,
-};
-
-static int kvm_gmem_mmap(struct file *file, struct vm_area_struct *vma)
-{
-	if (!kvm_gmem_supports_mmap(file_inode(file)))
-		return -ENODEV;
-
-	if ((vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) !=
-	    (VM_SHARED | VM_MAYSHARE)) {
-		return -EINVAL;
-	}
-
-	vma->vm_ops = &kvm_gmem_vm_ops;
-
-	return 0;
-}
-
 static struct file_operations kvm_gmem_fops = {
-	.mmap           = kvm_gmem_mmap,
+	.mmap           = guestmem_mmap,
 	.llseek         = default_llseek,
 	.write_iter     = generic_perform_write,
 	.open		= generic_file_open,
@@ -415,104 +256,24 @@ void kvm_gmem_init(struct module *module)
 	kvm_gmem_fops.owner = module;
 }
 
-static int kvm_kmem_gmem_write_begin(const struct kiocb *kiocb,
-				     struct address_space *mapping,
-				     loff_t pos, unsigned int len,
-				     struct folio **foliop,
-				     void **fsdata)
-{
-	struct file *file = kiocb->ki_filp;
-	pgoff_t index = pos >> PAGE_SHIFT;
-	struct folio *folio;
-
-	if (!PAGE_ALIGNED(pos) || len != PAGE_SIZE)
-		return -EINVAL;
-
-	if (pos + len > i_size_read(file_inode(file)))
-		return -EINVAL;
-
-	folio = kvm_gmem_get_folio(file_inode(file), index);
-	if (IS_ERR(folio))
-		return -EFAULT;
-
-	if (WARN_ON_ONCE(folio_test_large(folio))) {
-		folio_unlock(folio);
-		folio_put(folio);
-		return -EFAULT;
-	}
-
-	if (folio_test_uptodate(folio)) {
-		folio_unlock(folio);
-		folio_put(folio);
-		return -ENOSPC;
-	}
-
-	*foliop = folio;
-	return 0;
-}
-
-static int kvm_kmem_gmem_write_end(const struct kiocb *kiocb,
-				   struct address_space *mapping,
-				   loff_t pos, unsigned int len,
-				   unsigned int copied,
-				   struct folio *folio, void *fsdata)
+static void kvm_guestmem_invalidate_begin(struct list_head *entry, pgoff_t start,
+					 pgoff_t end)
 {
-	if (copied) {
-		if (copied < len) {
-			unsigned int from = pos & (PAGE_SIZE - 1);
-
-			folio_zero_range(folio, from + copied, len - copied);
-		}
-		kvm_gmem_mark_prepared(folio);
-	}
-
-	folio_unlock(folio);
-	folio_put(folio);
-
-	return copied;
-}
+	struct kvm_gmem *gmem = container_of(entry, struct kvm_gmem, entry);
 
-static int kvm_gmem_migrate_folio(struct address_space *mapping,
-				  struct folio *dst, struct folio *src,
-				  enum migrate_mode mode)
-{
-	WARN_ON_ONCE(1);
-	return -EINVAL;
+	kvm_gmem_invalidate_begin(gmem, start, end);
 }
 
-static int kvm_gmem_error_folio(struct address_space *mapping, struct folio *folio)
+static void kvm_guestmem_invalidate_end(struct list_head *entry, pgoff_t start,
+					pgoff_t end)
 {
-	struct list_head *gmem_list = &mapping->i_private_list;
-	struct kvm_gmem *gmem;
-	pgoff_t start, end;
-
-	filemap_invalidate_lock_shared(mapping);
-
-	start = folio->index;
-	end = start + folio_nr_pages(folio);
-
-	list_for_each_entry(gmem, gmem_list, entry)
-		kvm_gmem_invalidate_begin(gmem, start, end);
+	struct kvm_gmem *gmem = container_of(entry, struct kvm_gmem, entry);
 
-	/*
-	 * Do not truncate the range, what action is taken in response to the
-	 * error is userspace's decision (assuming the architecture supports
-	 * gracefully handling memory errors).  If/when the guest attempts to
-	 * access a poisoned page, kvm_gmem_get_pfn() will return -EHWPOISON,
-	 * at which point KVM can either terminate the VM or propagate the
-	 * error to userspace.
-	 */
-
-	list_for_each_entry(gmem, gmem_list, entry)
-		kvm_gmem_invalidate_end(gmem, start, end);
-
-	filemap_invalidate_unlock_shared(mapping);
-
-	return MF_DELAYED;
+	kvm_gmem_invalidate_end(gmem, start, end);
 }
 
-static void kvm_gmem_free_folio(struct address_space *mapping,
-				struct folio *folio)
+static bool kvm_gmem_release_folio(struct address_space *mapping,
+				   struct folio *folio)
 {
 	struct page *page = folio_page(folio, 0);
 	kvm_pfn_t pfn = page_to_pfn(page);
@@ -525,19 +286,19 @@ static void kvm_gmem_free_folio(struct address_space *mapping,
 	 * happened in set_direct_map_invalid_noflush() in kvm_gmem_mark_prepared().
 	 * Thus set_direct_map_valid_noflush() here only updates prot bits.
 	 */
-	if (kvm_gmem_test_no_direct_map(mapping->host))
+	if (guestmem_test_no_direct_map(mapping->host))
 		set_direct_map_valid_noflush(page, folio_nr_pages(folio), true);
 
 	kvm_arch_gmem_invalidate(pfn, pfn + (1ul << order));
+
+	return true;
 }
 
-static const struct address_space_operations kvm_gmem_aops = {
-	.dirty_folio = noop_dirty_folio,
-	.write_begin = kvm_kmem_gmem_write_begin,
-	.write_end = kvm_kmem_gmem_write_end,
-	.migrate_folio	= kvm_gmem_migrate_folio,
-	.error_remove_folio = kvm_gmem_error_folio,
-	.free_folio = kvm_gmem_free_folio,
+static const struct guestmem_ops kvm_guestmem_ops = {
+	.invalidate_begin = kvm_guestmem_invalidate_begin,
+	.invalidate_end = kvm_guestmem_invalidate_end,
+	.release_folio = kvm_gmem_release_folio,
+	.supports_mmap = kvm_gmem_supports_mmap,
 };
 
 static int kvm_gmem_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
@@ -587,13 +348,12 @@ static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags)
 
 	inode->i_private = (void *)(unsigned long)flags;
 	inode->i_op = &kvm_gmem_iops;
-	inode->i_mapping->a_ops = &kvm_gmem_aops;
 	inode->i_mode |= S_IFREG;
 	inode->i_size = size;
-	mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER);
-	mapping_set_inaccessible(inode->i_mapping);
-	/* Unmovable mappings are supposed to be marked unevictable as well. */
-	WARN_ON_ONCE(!mapping_unevictable(inode->i_mapping));
+	err = guestmem_attach_mapping(inode->i_mapping, &kvm_guestmem_ops,
+				      &gmem->entry);
+	if (err)
+		goto err_putfile;
 
 	if (flags & GUEST_MEMFD_FLAG_NO_DIRECT_MAP)
 		mapping_set_no_direct_map(inode->i_mapping);
@@ -601,11 +361,12 @@ static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags)
 	kvm_get_kvm(kvm);
 	gmem->kvm = kvm;
 	xa_init(&gmem->bindings);
-	list_add(&gmem->entry, &inode->i_mapping->i_private_list);
 
 	fd_install(fd, file);
 	return fd;
 
+err_putfile:
+	fput(file);
 err_gmem:
 	kfree(gmem);
 err_fd:
@@ -869,7 +630,7 @@ long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long
 		p = src ? src + i * PAGE_SIZE : NULL;
 		ret = post_populate(kvm, gfn, pfn, p, max_order, opaque);
 		if (!ret)
-			ret = kvm_gmem_mark_prepared(folio);
+			guestmem_mark_prepared(folio);
 
 put_folio_and_exit:
 		folio_put(folio);
-- 
2.50.1

