lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <aDU3eL7qQYrXkE3T@yzhao56-desk.sh.intel.com>
Date: Tue, 27 May 2025 11:54:32 +0800
From: Yan Zhao <yan.y.zhao@...el.com>
To: Ackerley Tng <ackerleytng@...gle.com>
CC: <kvm@...r.kernel.org>, <linux-mm@...ck.org>,
	<linux-kernel@...r.kernel.org>, <x86@...nel.org>,
	<linux-fsdevel@...r.kernel.org>, <aik@....com>, <ajones@...tanamicro.com>,
	<akpm@...ux-foundation.org>, <amoorthy@...gle.com>,
	<anthony.yznaga@...cle.com>, <anup@...infault.org>, <aou@...s.berkeley.edu>,
	<bfoster@...hat.com>, <binbin.wu@...ux.intel.com>, <brauner@...nel.org>,
	<catalin.marinas@....com>, <chao.p.peng@...el.com>, <chenhuacai@...nel.org>,
	<dave.hansen@...el.com>, <david@...hat.com>, <dmatlack@...gle.com>,
	<dwmw@...zon.co.uk>, <erdemaktas@...gle.com>, <fan.du@...el.com>,
	<fvdl@...gle.com>, <graf@...zon.com>, <haibo1.xu@...el.com>,
	<hch@...radead.org>, <hughd@...gle.com>, <ira.weiny@...el.com>,
	<isaku.yamahata@...el.com>, <jack@...e.cz>, <james.morse@....com>,
	<jarkko@...nel.org>, <jgg@...pe.ca>, <jgowans@...zon.com>,
	<jhubbard@...dia.com>, <jroedel@...e.de>, <jthoughton@...gle.com>,
	<jun.miao@...el.com>, <kai.huang@...el.com>, <keirf@...gle.com>,
	<kent.overstreet@...ux.dev>, <kirill.shutemov@...el.com>,
	<liam.merwick@...cle.com>, <maciej.wieczor-retman@...el.com>,
	<mail@...iej.szmigiero.name>, <maz@...nel.org>, <mic@...ikod.net>,
	<michael.roth@....com>, <mpe@...erman.id.au>, <muchun.song@...ux.dev>,
	<nikunj@....com>, <nsaenz@...zon.es>, <oliver.upton@...ux.dev>,
	<palmer@...belt.com>, <pankaj.gupta@....com>, <paul.walmsley@...ive.com>,
	<pbonzini@...hat.com>, <pdurrant@...zon.co.uk>, <peterx@...hat.com>,
	<pgonda@...gle.com>, <pvorel@...e.cz>, <qperret@...gle.com>,
	<quic_cvanscha@...cinc.com>, <quic_eberman@...cinc.com>,
	<quic_mnalajal@...cinc.com>, <quic_pderrin@...cinc.com>,
	<quic_pheragu@...cinc.com>, <quic_svaddagi@...cinc.com>,
	<quic_tsoni@...cinc.com>, <richard.weiyang@...il.com>,
	<rick.p.edgecombe@...el.com>, <rientjes@...gle.com>, <roypat@...zon.co.uk>,
	<rppt@...nel.org>, <seanjc@...gle.com>, <shuah@...nel.org>,
	<steven.price@....com>, <steven.sistare@...cle.com>,
	<suzuki.poulose@....com>, <tabba@...gle.com>, <thomas.lendacky@....com>,
	<usama.arif@...edance.com>, <vannapurve@...gle.com>, <vbabka@...e.cz>,
	<viro@...iv.linux.org.uk>, <vkuznets@...hat.com>, <wei.w.wang@...el.com>,
	<will@...nel.org>, <willy@...radead.org>, <xiaoyao.li@...el.com>,
	<yilun.xu@...el.com>, <yuzenghui@...wei.com>, <zhiquan1.li@...el.com>
Subject: Re: [RFC PATCH v2 02/51] KVM: guest_memfd: Introduce and use
 shareability to guard faulting

On Wed, May 14, 2025 at 04:41:41PM -0700, Ackerley Tng wrote:
> Track guest_memfd memory's shareability status within the inode as
> opposed to the file, since it is property of the guest_memfd's memory
> contents.
> 
> Shareability is a property of the memory and is indexed using the
> page's index in the inode. Because shareability is the memory's
> property, it is stored within guest_memfd instead of within KVM, like
> in kvm->mem_attr_array.
> 
> KVM_MEMORY_ATTRIBUTE_PRIVATE in kvm->mem_attr_array must still be
> retained to allow VMs to only use guest_memfd for private memory and
> some other memory for shared memory.
> 
> Not all use cases require guest_memfd() to be shared with the host
> when first created. Add a new flag, GUEST_MEMFD_FLAG_INIT_PRIVATE,
> which when set on KVM_CREATE_GUEST_MEMFD, initializes the memory as
> private to the guest, and therefore not mappable by the
> host. Otherwise, memory is shared until explicitly converted to
> private.
> 
> Signed-off-by: Ackerley Tng <ackerleytng@...gle.com>
> Co-developed-by: Vishal Annapurve <vannapurve@...gle.com>
> Signed-off-by: Vishal Annapurve <vannapurve@...gle.com>
> Co-developed-by: Fuad Tabba <tabba@...gle.com>
> Signed-off-by: Fuad Tabba <tabba@...gle.com>
> Change-Id: If03609cbab3ad1564685c85bdba6dcbb6b240c0f
> ---
>  Documentation/virt/kvm/api.rst |   5 ++
>  include/uapi/linux/kvm.h       |   2 +
>  virt/kvm/guest_memfd.c         | 124 ++++++++++++++++++++++++++++++++-
>  3 files changed, 129 insertions(+), 2 deletions(-)
> 
> diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
> index 86f74ce7f12a..f609337ae1c2 100644
> --- a/Documentation/virt/kvm/api.rst
> +++ b/Documentation/virt/kvm/api.rst
> @@ -6408,6 +6408,11 @@ belonging to the slot via its userspace_addr.
>  The use of GUEST_MEMFD_FLAG_SUPPORT_SHARED will not be allowed for CoCo VMs.
>  This is validated when the guest_memfd instance is bound to the VM.
>  
> +If the capability KVM_CAP_GMEM_CONVERSIONS is supported, then the 'flags' field
> +supports GUEST_MEMFD_FLAG_INIT_PRIVATE.  Setting GUEST_MEMFD_FLAG_INIT_PRIVATE
> +will initialize the memory for the guest_memfd as guest-only and not faultable
> +by the host.
> +
>  See KVM_SET_USER_MEMORY_REGION2 for additional details.
>  
>  4.143 KVM_PRE_FAULT_MEMORY
> diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
> index 4cc824a3a7c9..d7df312479aa 100644
> --- a/include/uapi/linux/kvm.h
> +++ b/include/uapi/linux/kvm.h
> @@ -1567,7 +1567,9 @@ struct kvm_memory_attributes {
>  #define KVM_MEMORY_ATTRIBUTE_PRIVATE           (1ULL << 3)
>  
>  #define KVM_CREATE_GUEST_MEMFD	_IOWR(KVMIO,  0xd4, struct kvm_create_guest_memfd)
> +
>  #define GUEST_MEMFD_FLAG_SUPPORT_SHARED	(1UL << 0)
> +#define GUEST_MEMFD_FLAG_INIT_PRIVATE	(1UL << 1)
>  
>  struct kvm_create_guest_memfd {
>  	__u64 size;
> diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
> index 239d0f13dcc1..590932499eba 100644
> --- a/virt/kvm/guest_memfd.c
> +++ b/virt/kvm/guest_memfd.c
> @@ -4,6 +4,7 @@
>  #include <linux/falloc.h>
>  #include <linux/fs.h>
>  #include <linux/kvm_host.h>
> +#include <linux/maple_tree.h>
>  #include <linux/pseudo_fs.h>
>  #include <linux/pagemap.h>
>  
> @@ -17,6 +18,24 @@ struct kvm_gmem {
>  	struct list_head entry;
>  };
>  
> +struct kvm_gmem_inode_private {
> +#ifdef CONFIG_KVM_GMEM_SHARED_MEM
> +	struct maple_tree shareability;
> +#endif
> +};
> +
> +enum shareability {
> +	SHAREABILITY_GUEST = 1,	/* Only the guest can map (fault) folios in this range. */
> +	SHAREABILITY_ALL = 2,	/* Both guest and host can fault folios in this range. */
> +};
> +
> +static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index);
> +
> +static struct kvm_gmem_inode_private *kvm_gmem_private(struct inode *inode)
> +{
> +	return inode->i_mapping->i_private_data;
> +}
> +
>  /**
>   * folio_file_pfn - like folio_file_page, but return a pfn.
>   * @folio: The folio which contains this index.
> @@ -29,6 +48,58 @@ static inline kvm_pfn_t folio_file_pfn(struct folio *folio, pgoff_t index)
>  	return folio_pfn(folio) + (index & (folio_nr_pages(folio) - 1));
>  }
>  
> +#ifdef CONFIG_KVM_GMEM_SHARED_MEM
> +
> +static int kvm_gmem_shareability_setup(struct kvm_gmem_inode_private *private,
> +				      loff_t size, u64 flags)
> +{
> +	enum shareability m;
> +	pgoff_t last;
> +
> +	last = (size >> PAGE_SHIFT) - 1;
> +	m = flags & GUEST_MEMFD_FLAG_INIT_PRIVATE ? SHAREABILITY_GUEST :
> +						    SHAREABILITY_ALL;
> +	return mtree_store_range(&private->shareability, 0, last, xa_mk_value(m),
> +				 GFP_KERNEL);
> +}
> +
> +static enum shareability kvm_gmem_shareability_get(struct inode *inode,
> +						 pgoff_t index)
> +{
> +	struct maple_tree *mt;
> +	void *entry;
> +
> +	mt = &kvm_gmem_private(inode)->shareability;
> +	entry = mtree_load(mt, index);
> +	WARN(!entry,
> +	     "Shareability should always be defined for all indices in inode.");
I noticed that in [1], the kvm_gmem_mmap() does not check the range.
So, the WARN() here can be hit when userspace mmap() an area larger than the
inode size and accesses the out of band HVA.

Maybe limit the mmap() range?

@@ -1609,6 +1620,10 @@ static int kvm_gmem_mmap(struct file *file, struct vm_area_struct *vma)
        if (!kvm_gmem_supports_shared(file_inode(file)))
                return -ENODEV;

+       if (vma->vm_end - vma->vm_start + (vma->vm_pgoff << PAGE_SHIFT) > i_size_read(file_inode(file)))
+               return -EINVAL;
+
        if ((vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) !=
            (VM_SHARED | VM_MAYSHARE)) {
                return -EINVAL;

[1] https://lore.kernel.org/all/20250513163438.3942405-8-tabba@google.com/

> +	return xa_to_value(entry);
> +}
> +
> +static struct folio *kvm_gmem_get_shared_folio(struct inode *inode, pgoff_t index)
> +{
> +	if (kvm_gmem_shareability_get(inode, index) != SHAREABILITY_ALL)
> +		return ERR_PTR(-EACCES);
> +
> +	return kvm_gmem_get_folio(inode, index);
> +}
> +
> +#else
> +
> +static int kvm_gmem_shareability_setup(struct maple_tree *mt, loff_t size, u64 flags)
> +{
> +	return 0;
> +}
> +
> +static inline struct folio *kvm_gmem_get_shared_folio(struct inode *inode, pgoff_t index)
> +{
> +	WARN_ONCE("Unexpected call to get shared folio.")
> +	return NULL;
> +}
> +
> +#endif /* CONFIG_KVM_GMEM_SHARED_MEM */
> +
>  static int __kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot,
>  				    pgoff_t index, struct folio *folio)
>  {
> @@ -333,7 +404,7 @@ static vm_fault_t kvm_gmem_fault_shared(struct vm_fault *vmf)
>  
>  	filemap_invalidate_lock_shared(inode->i_mapping);
>  
> -	folio = kvm_gmem_get_folio(inode, vmf->pgoff);
> +	folio = kvm_gmem_get_shared_folio(inode, vmf->pgoff);
>  	if (IS_ERR(folio)) {
>  		int err = PTR_ERR(folio);
>  
> @@ -420,8 +491,33 @@ static struct file_operations kvm_gmem_fops = {
>  	.fallocate	= kvm_gmem_fallocate,
>  };
>  
> +static void kvm_gmem_free_inode(struct inode *inode)
> +{
> +	struct kvm_gmem_inode_private *private = kvm_gmem_private(inode);
> +
> +	kfree(private);
> +
> +	free_inode_nonrcu(inode);
> +}
> +
> +static void kvm_gmem_destroy_inode(struct inode *inode)
> +{
> +	struct kvm_gmem_inode_private *private = kvm_gmem_private(inode);
> +
> +#ifdef CONFIG_KVM_GMEM_SHARED_MEM
> +	/*
> +	 * mtree_destroy() can't be used within rcu callback, hence can't be
> +	 * done in ->free_inode().
> +	 */
> +	if (private)
> +		mtree_destroy(&private->shareability);
> +#endif
> +}
> +
>  static const struct super_operations kvm_gmem_super_operations = {
>  	.statfs		= simple_statfs,
> +	.destroy_inode	= kvm_gmem_destroy_inode,
> +	.free_inode	= kvm_gmem_free_inode,
>  };
>  
>  static int kvm_gmem_init_fs_context(struct fs_context *fc)
> @@ -549,12 +645,26 @@ static const struct inode_operations kvm_gmem_iops = {
>  static struct inode *kvm_gmem_inode_make_secure_inode(const char *name,
>  						      loff_t size, u64 flags)
>  {
> +	struct kvm_gmem_inode_private *private;
>  	struct inode *inode;
> +	int err;
>  
>  	inode = alloc_anon_secure_inode(kvm_gmem_mnt->mnt_sb, name);
>  	if (IS_ERR(inode))
>  		return inode;
>  
> +	err = -ENOMEM;
> +	private = kzalloc(sizeof(*private), GFP_KERNEL);
> +	if (!private)
> +		goto out;
> +
> +	mt_init(&private->shareability);
Wrap the mt_init() inside "#ifdef CONFIG_KVM_GMEM_SHARED_MEM" ?

> +	inode->i_mapping->i_private_data = private;
> +
> +	err = kvm_gmem_shareability_setup(private, size, flags);
> +	if (err)
> +		goto out;
> +
>  	inode->i_private = (void *)(unsigned long)flags;
>  	inode->i_op = &kvm_gmem_iops;
>  	inode->i_mapping->a_ops = &kvm_gmem_aops;
> @@ -566,6 +676,11 @@ static struct inode *kvm_gmem_inode_make_secure_inode(const char *name,
>  	WARN_ON_ONCE(!mapping_unevictable(inode->i_mapping));
>  
>  	return inode;
> +
> +out:
> +	iput(inode);
> +
> +	return ERR_PTR(err);
>  }
>  
>  static struct file *kvm_gmem_inode_create_getfile(void *priv, loff_t size,
> @@ -654,6 +769,9 @@ int kvm_gmem_create(struct kvm *kvm, struct kvm_create_guest_memfd *args)
>  	if (kvm_arch_vm_supports_gmem_shared_mem(kvm))
>  		valid_flags |= GUEST_MEMFD_FLAG_SUPPORT_SHARED;
>  
> +	if (flags & GUEST_MEMFD_FLAG_SUPPORT_SHARED)
> +		valid_flags |= GUEST_MEMFD_FLAG_INIT_PRIVATE;
> +
>  	if (flags & ~valid_flags)
>  		return -EINVAL;
>  
> @@ -842,6 +960,8 @@ int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
>  	if (!file)
>  		return -EFAULT;
>  
> +	filemap_invalidate_lock_shared(file_inode(file)->i_mapping);
> +
>  	folio = __kvm_gmem_get_pfn(file, slot, index, pfn, &is_prepared, max_order);
>  	if (IS_ERR(folio)) {
>  		r = PTR_ERR(folio);
> @@ -857,8 +977,8 @@ int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
>  		*page = folio_file_page(folio, index);
>  	else
>  		folio_put(folio);
> -
>  out:
> +	filemap_invalidate_unlock_shared(file_inode(file)->i_mapping);
>  	fput(file);
>  	return r;
>  }
> -- 
> 2.49.0.1045.g170613ef41-goog
> 
> 

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ