Message-ID: <aNxuUMbNhcURAxR_@google.com>
Date: Tue, 30 Sep 2025 16:57:04 -0700
From: Sean Christopherson <seanjc@...gle.com>
To: Ackerley Tng <ackerleytng@...gle.com>
Cc: kvm@...r.kernel.org, linux-kernel@...r.kernel.org, 
	Yan Zhao <yan.y.zhao@...el.com>, Fuad Tabba <tabba@...gle.com>, 
	Binbin Wu <binbin.wu@...ux.intel.com>, Michael Roth <michael.roth@....com>, 
	Ira Weiny <ira.weiny@...el.com>, Rick P Edgecombe <rick.p.edgecombe@...el.com>, 
	Vishal Annapurve <vannapurve@...gle.com>, David Hildenbrand <david@...hat.com>, 
	Paolo Bonzini <pbonzini@...hat.com>
Subject: Re: [RFC PATCH v2 05/51] KVM: guest_memfd: Skip LRU for guest_memfd folios

Trimmed Cc again.

On Wed, May 14, 2025, Ackerley Tng wrote:
> filemap_add_folio(), called from filemap_grab_folio(), adds the folio
> onto some LRU list, which is not necessary for guest_memfd since
> guest_memfd folios don't participate in any swapping.
> 
> This patch reimplements part of filemap_add_folio() to avoid adding
> allocated guest_memfd folios to the filemap.
> 
> With shared to private conversions dependent on refcounts, avoiding
> usage of LRU ensures that LRU lists no longer take any refcounts on
> guest_memfd folios and significantly reduces the chance of elevated
> refcounts during conversion.
> 
> Signed-off-by: Ackerley Tng <ackerleytng@...gle.com>
> Change-Id: Ia2540d9fc132d46219e6e714fd42bc82a62a27fa

Please make sure to purge Change-Id before posting upstream.

> ---
>  mm/filemap.c           |  1 +
>  mm/memcontrol.c        |  2 +
>  virt/kvm/guest_memfd.c | 91 ++++++++++++++++++++++++++++++++++++++----
>  3 files changed, 86 insertions(+), 8 deletions(-)
> 
> diff --git a/mm/filemap.c b/mm/filemap.c
> index 7b90cbeb4a1a..bed7160db214 100644
> --- a/mm/filemap.c
> +++ b/mm/filemap.c
> @@ -954,6 +954,7 @@ noinline int __filemap_add_folio(struct address_space *mapping,
>  	return xas_error(&xas);
>  }
>  ALLOW_ERROR_INJECTION(__filemap_add_folio, ERRNO);
> +EXPORT_SYMBOL_GPL(__filemap_add_folio);
>  
>  int filemap_add_folio(struct address_space *mapping, struct folio *folio,
>  				pgoff_t index, gfp_t gfp)
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index c96c1f2b9cf5..1def80570738 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -4611,6 +4611,7 @@ int __mem_cgroup_charge(struct folio *folio, struct mm_struct *mm, gfp_t gfp)
>  
>  	return ret;
>  }
> +EXPORT_SYMBOL_GPL(__mem_cgroup_charge);
>  
>  /**
>   * mem_cgroup_charge_hugetlb - charge the memcg for a hugetlb folio
> @@ -4785,6 +4786,7 @@ void __mem_cgroup_uncharge(struct folio *folio)
>  	uncharge_folio(folio, &ug);
>  	uncharge_batch(&ug);
>  }
> +EXPORT_SYMBOL_GPL(__mem_cgroup_uncharge);

These exports need to be added in a separate patch.  As a general rule, isolate
changes that need approval from non-KVM (or whatever subsystem you're targeting)
Maintainers.  That provides maximum flexibility, e.g. the patch can be Acked and
carried in the targeted tree, or it can be applied separately and exposed via a
topic branch, etc.

>  void __mem_cgroup_uncharge_folios(struct folio_batch *folios)
>  {
> diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
> index f802116290ce..6f6c4d298f8f 100644
> --- a/virt/kvm/guest_memfd.c
> +++ b/virt/kvm/guest_memfd.c
> @@ -466,6 +466,38 @@ static int kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot,
>  	return r;
>  }
>  
> +static int __kvm_gmem_filemap_add_folio(struct address_space *mapping,
> +					struct folio *folio, pgoff_t index)
> +{
> +	void *shadow = NULL;
> +	gfp_t gfp;
> +	int ret;
> +
> +	gfp = mapping_gfp_mask(mapping);
> +
> +	__folio_set_locked(folio);
> +	ret = __filemap_add_folio(mapping, folio, index, gfp, &shadow);

The @shadow param can be NULL, at least in current upstream code.
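
E.g. a minimal sketch of the same helper with the shadow plumbing dropped
(mirroring the helper quoted above, and relying on @shadowp being optional):

static int __kvm_gmem_filemap_add_folio(struct address_space *mapping,
					struct folio *folio, pgoff_t index)
{
	gfp_t gfp = mapping_gfp_mask(mapping);
	int ret;

	__folio_set_locked(folio);
	/* guest_memfd folios are never reclaimed, so no shadow entry is needed. */
	ret = __filemap_add_folio(mapping, folio, index, gfp, NULL);
	__folio_clear_locked(folio);

	return ret;
}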

> +	__folio_clear_locked(folio);
> +
> +	return ret;
> +}
> +
> +/*
> + * Adds a folio to the filemap for guest_memfd. Skips adding the folio to any
> + * LRU list.

Eh, the function name covers the first sentence, and stating something without
explaining _why_ isn't all that helpful for non-library code.  E.g. there's no
kvm_gmem_file_add_folio() that DOES add to LRUs, so on its own the distinction
is uninteresting.
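
If a comment is kept, explain the _why_, e.g. something along these lines
(sketch only, wording lifted from the changelog's rationale):

/*
 * Do not add guest_memfd folios to any LRU list: they are unevictable and
 * never written back, and keeping them off the LRU avoids LRU code taking
 * refcounts that would interfere with shared->private conversion.
 */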

> + */
> +static int kvm_gmem_filemap_add_folio(struct address_space *mapping,
> +					     struct folio *folio, pgoff_t index)
> +{
> +	int ret;
> +
> +	ret = __kvm_gmem_filemap_add_folio(mapping, folio, index);
> +	if (!ret)
> +		folio_set_unevictable(folio);
> +
> +	return ret;
> +}
> +
>  /*
>   * Returns a locked folio on success.  The caller is responsible for
>   * setting the up-to-date flag before the memory is mapped into the guest.
> @@ -477,8 +509,46 @@ static int kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot,
>   */
>  static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index)
>  {
> +	struct folio *folio;
> +	gfp_t gfp;
> +	int ret;
> +
> +repeat:
> +	folio = filemap_lock_folio(inode->i_mapping, index);
> +	if (!IS_ERR(folio))
> +		return folio;
> +
> +	gfp = mapping_gfp_mask(inode->i_mapping);
> +
>  	/* TODO: Support huge pages. */
> -	return filemap_grab_folio(inode->i_mapping, index);
> +	folio = filemap_alloc_folio(gfp, 0);
> +	if (!folio)
> +		return ERR_PTR(-ENOMEM);
> +
> +	ret = mem_cgroup_charge(folio, NULL, gfp);
> +	if (ret) {
> +		folio_put(folio);
> +		return ERR_PTR(ret);
> +	}
> +
> +	ret = kvm_gmem_filemap_add_folio(inode->i_mapping, folio, index);
> +	if (ret) {
> +		folio_put(folio);
> +
> +		/*
> +		 * There was a race, two threads tried to get a folio indexing
> +		 * to the same location in the filemap. The losing thread should
> +		 * free the allocated folio, then lock the folio added to the
> +		 * filemap by the winning thread.
> +		 */
> +		if (ret == -EEXIST)
> +			goto repeat;
> +
> +		return ERR_PTR(ret);
> +	}

I don't see any value in copy+pasting the complex "goto repeat" pattern from the
filemap code.  Copying names and exact patterns seems to be a theme when pulling
code from the kernel into guest_memfd code.  Please don't do that.  Write code
that makes the most sense for guest_memfd/KVM.  There's value in consistency
throughout the kernel, but copying code verbatim from a subsystem that's going
to evolve independently and thus diverge at some point isn't a net positive.

This code in particular can be reworked to something like this.  The do-while
loop makes it _much_ more obvious when KVM retries, and also makes it much easier
to see that the allocated folio is freed on any error, i.e. even on EEXIST.

static struct folio *__kvm_gmem_get_folio(struct address_space *mapping,
					  pgoff_t index,
					  struct mempolicy *policy)
{
	const gfp_t gfp = mapping_gfp_mask(mapping);
	struct folio *folio;
	int err;

	folio = filemap_lock_folio(mapping, index);
	if (!IS_ERR(folio))
		return folio;

	folio = filemap_alloc_folio(gfp, 0, policy);
	if (!folio)
		return ERR_PTR(-ENOMEM);

	err = mem_cgroup_charge(folio, NULL, gfp);
	if (err)
		goto err_put;

	__folio_set_locked(folio);

	err = __filemap_add_folio(mapping, folio, index, gfp, NULL);
	if (err) {
		__folio_clear_locked(folio);
		goto err_put;
	}

	return folio;

err_put:
	folio_put(folio);
	return ERR_PTR(err);
}

/*
 * Returns a locked folio on success.  The caller is responsible for
 * setting the up-to-date flag before the memory is mapped into the guest.
 * There is no backing storage for the memory, so the folio will remain
 * up-to-date until it's removed.
 *
 * Ignore accessed, referenced, and dirty flags.  The memory is
 * unevictable and there is no storage to write back to.
 */
static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index)
{
	/* TODO: Support huge pages. */
	struct address_space *mapping = inode->i_mapping;
	struct mempolicy *policy;
	struct folio *folio;

	/*
	 * Fast-path: See if folio is already present in mapping to avoid
	 * policy_lookup.
	 */
	folio = filemap_lock_folio(mapping, index);
	if (!IS_ERR(folio))
		return folio;

	policy = kvm_gmem_get_folio_policy(GMEM_I(inode), index);

	do {
		folio = __kvm_gmem_get_folio(mapping, index, policy);
	} while (IS_ERR(folio) && PTR_ERR(folio) == -EEXIST);

	mpol_cond_put(policy);
	return folio;
}

> +
> +	__folio_set_locked(folio);
> +	return folio;
>  }
>  
>  static void kvm_gmem_invalidate_begin(struct kvm_gmem *gmem, pgoff_t start,
> @@ -956,23 +1026,28 @@ static int kvm_gmem_error_folio(struct address_space *mapping, struct folio *fol
>  }
>  
>  #ifdef CONFIG_HAVE_KVM_ARCH_GMEM_INVALIDATE
> +static void kvm_gmem_invalidate(struct folio *folio)
> +{
> +	kvm_pfn_t pfn = folio_pfn(folio);
> +
> +	kvm_arch_gmem_invalidate(pfn, pfn + folio_nr_pages(folio));
> +}
> +#else
> +static inline void kvm_gmem_invalidate(struct folio *folio) {}
> +#endif

Similar to my comment about not adding one-off helpers without good reason, don't
add one-off #ifdef-guarded helpers.  Just bury the #ifdef in the function (or if
possible, avoid #ifdefs entirely).  This way a reader doesn't have to bounce
through ifdeffery just to see that kvm_gmem_invalidate() is a nop when
CONFIG_HAVE_KVM_ARCH_GMEM_INVALIDATE=n.

static void kvm_gmem_free_folio(struct folio *folio)
{
	folio_clear_unevictable(folio);

#ifdef CONFIG_HAVE_KVM_ARCH_GMEM_INVALIDATE
	kvm_arch_gmem_invalidate(folio_pfn(folio),
				 folio_pfn(folio) + folio_nr_pages(folio));
#endif
}

> +
>  static void kvm_gmem_free_folio(struct folio *folio)
>  {
> -	struct page *page = folio_page(folio, 0);
> -	kvm_pfn_t pfn = page_to_pfn(page);
> -	int order = folio_order(folio);
> +	folio_clear_unevictable(folio);
>  
> -	kvm_arch_gmem_invalidate(pfn, pfn + (1ul << order));
> +	kvm_gmem_invalidate(folio);
>  }
> -#endif
>  
>  static const struct address_space_operations kvm_gmem_aops = {
>  	.dirty_folio = noop_dirty_folio,
>  	.migrate_folio	= kvm_gmem_migrate_folio,
>  	.error_remove_folio = kvm_gmem_error_folio,
> -#ifdef CONFIG_HAVE_KVM_ARCH_GMEM_INVALIDATE
>  	.free_folio = kvm_gmem_free_folio,
> -#endif
>  };
>  
>  static int kvm_gmem_getattr(struct mnt_idmap *idmap, const struct path *path,
> -- 
> 2.49.0.1045.g170613ef41-goog
> 
