lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20251114151828.98165-2-kalyazin@amazon.com>
Date: Fri, 14 Nov 2025 15:18:41 +0000
From: "Kalyazin, Nikita" <kalyazin@...zon.co.uk>
To: "pbonzini@...hat.com" <pbonzini@...hat.com>, "shuah@...nel.org"
	<shuah@...nel.org>
CC: "kvm@...r.kernel.org" <kvm@...r.kernel.org>,
	"linux-kselftest@...r.kernel.org" <linux-kselftest@...r.kernel.org>,
	"linux-kernel@...r.kernel.org" <linux-kernel@...r.kernel.org>,
	"seanjc@...gle.com" <seanjc@...gle.com>, "david@...nel.org"
	<david@...nel.org>, "jthoughton@...gle.com" <jthoughton@...gle.com>,
	"ackerleytng@...gle.com" <ackerleytng@...gle.com>, "vannapurve@...gle.com"
	<vannapurve@...gle.com>, "jackmanb@...gle.com" <jackmanb@...gle.com>,
	"patrick.roy@...ux.dev" <patrick.roy@...ux.dev>, "Thomson, Jack"
	<jackabt@...zon.co.uk>, "Itazuri, Takahiro" <itazur@...zon.co.uk>,
	"Manwaring, Derek" <derekmn@...zon.com>, "Cali, Marco"
	<xmarcalx@...zon.co.uk>, "Kalyazin, Nikita" <kalyazin@...zon.co.uk>
Subject: [PATCH v7 1/2] KVM: guest_memfd: add generic population via write

From: Nikita Kalyazin <kalyazin@...zon.com>

On systems that support shared guest memory, write() is useful, for
example, for population of the initial image.  Even though the same can
also be achieved via userspace mapping and memcpying from userspace,
write() provides a more performant option because it does not need to
set user page tables and it does not cause a page fault for every page
like memcpy would.  Note that memcpy cannot be accelerated via
MADV_POPULATE_WRITE as it is not supported by guest_memfd and relies on
GUP.

Populating 512MiB of guest_memfd on a x86 machine:
 - via memcpy: 436 ms
 - via write:  202 ms (-54%)

Only PAGE_ALIGNED offset and len are allowed.  Even though non-aligned
writes are technically possible, when in-place conversion support is
implemented [1], the restriction makes handling of mixed shared/private
huge pages simpler.  write() will only be allowed to populate shared
pages.

When direct map removal is implemented [2]
 - write() will not be allowed to access pages that have already been
   removed from direct map
 - on completion, write() will remove the populated pages from direct
   map

While it is technically possible to implement read() syscall on systems
with shared guest memory, it is not supported as there is currently no
use case for it.

[1] https://lore.kernel.org/kvm/cover.1760731772.git.ackerleytng@google.com
[2] https://lore.kernel.org/kvm/20250924151101.2225820-1-patrick.roy@campus.lmu.de

Signed-off-by: Nikita Kalyazin <kalyazin@...zon.com>
---
 Documentation/virt/kvm/api.rst |  2 ++
 include/linux/kvm_host.h       |  2 +-
 include/uapi/linux/kvm.h       |  1 +
 virt/kvm/guest_memfd.c         | 52 ++++++++++++++++++++++++++++++++++
 4 files changed, 56 insertions(+), 1 deletion(-)

diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
index 57061fa29e6a..9541e95fc2ed 100644
--- a/Documentation/virt/kvm/api.rst
+++ b/Documentation/virt/kvm/api.rst
@@ -6448,6 +6448,8 @@ specified via KVM_CREATE_GUEST_MEMFD.  Currently defined flags:
                                without INIT_SHARED will be marked private).
                                Shared memory can be faulted into host userspace
                                page tables. Private memory cannot.
+  GUEST_MEMFD_FLAG_WRITE       Enable using write() on the guest_memfd file
+                               descriptor.
   ============================ ================================================
 
 When the KVM MMU performs a PFN lookup to service a guest fault and the backing
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 5bd76cf394fa..5fbf65f49586 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -736,7 +736,7 @@ static inline u64 kvm_gmem_get_supported_flags(struct kvm *kvm)
 	u64 flags = GUEST_MEMFD_FLAG_MMAP;
 
 	if (!kvm || kvm_arch_supports_gmem_init_shared(kvm))
-		flags |= GUEST_MEMFD_FLAG_INIT_SHARED;
+		flags |= GUEST_MEMFD_FLAG_INIT_SHARED | GUEST_MEMFD_FLAG_WRITE;
 
 	return flags;
 }
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 52f6000ab020..5b73d6528f1c 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -1601,6 +1601,7 @@ struct kvm_memory_attributes {
 #define KVM_CREATE_GUEST_MEMFD	_IOWR(KVMIO,  0xd4, struct kvm_create_guest_memfd)
 #define GUEST_MEMFD_FLAG_MMAP		(1ULL << 0)
 #define GUEST_MEMFD_FLAG_INIT_SHARED	(1ULL << 1)
+#define GUEST_MEMFD_FLAG_WRITE		(1ULL << 2)
 
 struct kvm_create_guest_memfd {
 	__u64 size;
diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
index ffadc5ee8e04..2c71c21b9189 100644
--- a/virt/kvm/guest_memfd.c
+++ b/virt/kvm/guest_memfd.c
@@ -411,6 +411,8 @@ static int kvm_gmem_mmap(struct file *file, struct vm_area_struct *vma)
 
 static struct file_operations kvm_gmem_fops = {
 	.mmap		= kvm_gmem_mmap,
+	.llseek		= default_llseek,
+	.write_iter     = generic_perform_write,
 	.open		= generic_file_open,
 	.release	= kvm_gmem_release,
 	.fallocate	= kvm_gmem_fallocate,
@@ -421,6 +423,53 @@ void kvm_gmem_init(struct module *module)
 	kvm_gmem_fops.owner = module;
 }
 
+static bool kvm_gmem_supports_write(struct inode *inode)
+{
+	const u64 flags = (u64)inode->i_private;
+
+	return flags & GUEST_MEMFD_FLAG_WRITE;
+}
+
+static int kvm_gmem_write_begin(const struct kiocb *kiocb,
+				struct address_space *mapping,
+				loff_t pos, unsigned int len,
+				struct folio **folio, void **fsdata)
+{
+	struct inode *inode = file_inode(kiocb->ki_filp);
+
+	if (!kvm_gmem_supports_write(inode))
+		return -ENODEV;
+
+	if (pos + len > i_size_read(inode))
+		return -EINVAL;
+
+	if (!IS_ALIGNED(pos, PAGE_SIZE) || !IS_ALIGNED(len, PAGE_SIZE))
+		return -EINVAL;
+
+	*folio = kvm_gmem_get_folio(inode, pos >> PAGE_SHIFT);
+	if (IS_ERR(*folio))
+		return PTR_ERR(*folio);
+
+	return 0;
+}
+
+static int kvm_gmem_write_end(const struct kiocb *kiocb,
+			      struct address_space *mapping,
+			      loff_t pos, unsigned int len,
+			      unsigned int copied,
+			      struct folio *folio, void *fsdata)
+{
+	if (!folio_test_uptodate(folio)) {
+		folio_zero_range(folio, copied, len - copied);
+		folio_mark_uptodate(folio);
+	}
+
+	folio_unlock(folio);
+	folio_put(folio);
+
+	return copied;
+}
+
 static int kvm_gmem_migrate_folio(struct address_space *mapping,
 				  struct folio *dst, struct folio *src,
 				  enum migrate_mode mode)
@@ -469,6 +518,8 @@ static void kvm_gmem_free_folio(struct folio *folio)
 
 static const struct address_space_operations kvm_gmem_aops = {
 	.dirty_folio = noop_dirty_folio,
+	.write_begin = kvm_gmem_write_begin,
+	.write_end = kvm_gmem_write_end,
 	.migrate_folio	= kvm_gmem_migrate_folio,
 	.error_remove_folio = kvm_gmem_error_folio,
 #ifdef CONFIG_HAVE_KVM_ARCH_GMEM_INVALIDATE
@@ -516,6 +567,7 @@ static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags)
 	}
 
 	file->f_flags |= O_LARGEFILE;
+	file->f_mode |= FMODE_LSEEK | FMODE_PWRITE;
 
 	inode = file->f_inode;
 	WARN_ON(file->f_mapping != inode->i_mapping);
-- 
2.50.1


Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ