Message-ID: <20250916225528.iycrfgf4nz6bcdce@amd.com>
Date: Tue, 16 Sep 2025 17:55:28 -0500
From: Michael Roth <michael.roth@....com>
To: Ackerley Tng <ackerleytng@...gle.com>
CC: <kvm@...r.kernel.org>, <linux-mm@...ck.org>,
	<linux-kernel@...r.kernel.org>, <x86@...nel.org>,
	<linux-fsdevel@...r.kernel.org>, <aik@....com>, <ajones@...tanamicro.com>,
	<akpm@...ux-foundation.org>, <amoorthy@...gle.com>,
	<anthony.yznaga@...cle.com>, <anup@...infault.org>, <aou@...s.berkeley.edu>,
	<bfoster@...hat.com>, <binbin.wu@...ux.intel.com>, <brauner@...nel.org>,
	<catalin.marinas@....com>, <chao.p.peng@...el.com>, <chenhuacai@...nel.org>,
	<dave.hansen@...el.com>, <david@...hat.com>, <dmatlack@...gle.com>,
	<dwmw@...zon.co.uk>, <erdemaktas@...gle.com>, <fan.du@...el.com>,
	<fvdl@...gle.com>, <graf@...zon.com>, <haibo1.xu@...el.com>,
	<hch@...radead.org>, <hughd@...gle.com>, <ira.weiny@...el.com>,
	<isaku.yamahata@...el.com>, <jack@...e.cz>, <james.morse@....com>,
	<jarkko@...nel.org>, <jgg@...pe.ca>, <jgowans@...zon.com>,
	<jhubbard@...dia.com>, <jroedel@...e.de>, <jthoughton@...gle.com>,
	<jun.miao@...el.com>, <kai.huang@...el.com>, <keirf@...gle.com>,
	<kent.overstreet@...ux.dev>, <kirill.shutemov@...el.com>,
	<liam.merwick@...cle.com>, <maciej.wieczor-retman@...el.com>,
	<mail@...iej.szmigiero.name>, <maz@...nel.org>, <mic@...ikod.net>,
	<mpe@...erman.id.au>, <muchun.song@...ux.dev>, <nikunj@....com>,
	<nsaenz@...zon.es>, <oliver.upton@...ux.dev>, <palmer@...belt.com>,
	<pankaj.gupta@....com>, <paul.walmsley@...ive.com>, <pbonzini@...hat.com>,
	<pdurrant@...zon.co.uk>, <peterx@...hat.com>, <pgonda@...gle.com>,
	<pvorel@...e.cz>, <qperret@...gle.com>, <quic_cvanscha@...cinc.com>,
	<quic_eberman@...cinc.com>, <quic_mnalajal@...cinc.com>,
	<quic_pderrin@...cinc.com>, <quic_pheragu@...cinc.com>,
	<quic_svaddagi@...cinc.com>, <quic_tsoni@...cinc.com>,
	<richard.weiyang@...il.com>, <rick.p.edgecombe@...el.com>,
	<rientjes@...gle.com>, <roypat@...zon.co.uk>, <rppt@...nel.org>,
	<seanjc@...gle.com>, <shuah@...nel.org>, <steven.price@....com>,
	<steven.sistare@...cle.com>, <suzuki.poulose@....com>, <tabba@...gle.com>,
	<thomas.lendacky@....com>, <usama.arif@...edance.com>,
	<vannapurve@...gle.com>, <vbabka@...e.cz>, <viro@...iv.linux.org.uk>,
	<vkuznets@...hat.com>, <wei.w.wang@...el.com>, <will@...nel.org>,
	<willy@...radead.org>, <xiaoyao.li@...el.com>, <yan.y.zhao@...el.com>,
	<yilun.xu@...el.com>, <yuzenghui@...wei.com>, <zhiquan1.li@...el.com>
Subject: Re: [RFC PATCH v2 29/51] mm: guestmem_hugetlb: Wrap HugeTLB as an
 allocator for guest_memfd

On Wed, May 14, 2025 at 04:42:08PM -0700, Ackerley Tng wrote:
> guestmem_hugetlb is an allocator for guest_memfd. It wraps HugeTLB to
> provide huge folios for guest_memfd.
> 
> This patch also introduces guestmem_allocator_operations as a set of
> operations that allocators for guest_memfd can provide. In a later
> patch, guest_memfd will use these operations to manage pages from an
> allocator.
> 
> The allocator operations are memory-management specific and are placed
> in mm/ so key mm-specific functions do not have to be exposed
> unnecessarily.
> 
> Signed-off-by: Ackerley Tng <ackerleytng@...gle.com>
> 
> Change-Id: I3cafe111ea7b3c84755d7112ff8f8c541c11136d
> ---
>  include/linux/guestmem.h      |  20 +++++
>  include/uapi/linux/guestmem.h |  29 +++++++
>  mm/Kconfig                    |   5 +-
>  mm/guestmem_hugetlb.c         | 159 ++++++++++++++++++++++++++++++++++
>  4 files changed, 212 insertions(+), 1 deletion(-)
>  create mode 100644 include/linux/guestmem.h
>  create mode 100644 include/uapi/linux/guestmem.h
> 
> diff --git a/include/linux/guestmem.h b/include/linux/guestmem.h
> new file mode 100644
> index 000000000000..4b2d820274d9
> --- /dev/null
> +++ b/include/linux/guestmem.h
> @@ -0,0 +1,20 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +#ifndef _LINUX_GUESTMEM_H
> +#define _LINUX_GUESTMEM_H
> +
> +#include <linux/fs.h>
> +
> +struct guestmem_allocator_operations {
> +	void *(*inode_setup)(size_t size, u64 flags);
> +	void (*inode_teardown)(void *private, size_t inode_size);
> +	struct folio *(*alloc_folio)(void *private);
> +	/*
> +	 * Returns the number of PAGE_SIZE pages in a folio that this guestmem
> +	 * allocator provides.
> +	 */
> +	size_t (*nr_pages_in_folio)(void *priv);
> +};
> +
> +extern const struct guestmem_allocator_operations guestmem_hugetlb_ops;
> +
> +#endif
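
(Aside for anyone following along: the guest_memfd side that actually
drives this vtable comes in a later patch, but the intended call flow is
roughly the sketch below. Names and error handling are illustrative
only, not from this series.)

  const struct guestmem_allocator_operations *ops = &guestmem_hugetlb_ops;
  struct folio *folio;
  void *priv;

  priv = ops->inode_setup(size, flags);   /* reserves hugepages up front */
  if (IS_ERR(priv))
          return PTR_ERR(priv);

  folio = ops->alloc_folio(priv);         /* one huge folio per call */
  if (IS_ERR(folio)) {
          ops->inode_teardown(priv, size); /* releases the reservations */
          return PTR_ERR(folio);
  }
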
> diff --git a/include/uapi/linux/guestmem.h b/include/uapi/linux/guestmem.h
> new file mode 100644
> index 000000000000..2e518682edd5
> --- /dev/null
> +++ b/include/uapi/linux/guestmem.h
> @@ -0,0 +1,29 @@
> +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
> +#ifndef _UAPI_LINUX_GUESTMEM_H
> +#define _UAPI_LINUX_GUESTMEM_H
> +
> +/*
> + * Huge page size must be explicitly defined when using the guestmem_hugetlb
> + * allocator for guest_memfd.  It is the responsibility of the application to
> + * know which sizes are supported on the running system.  See mmap(2) man page
> + * for details.
> + */
> +
> +#define GUESTMEM_HUGETLB_FLAG_SHIFT	58
> +#define GUESTMEM_HUGETLB_FLAG_MASK	0x3fUL
> +
> +#define GUESTMEM_HUGETLB_FLAG_16KB	(14UL << GUESTMEM_HUGETLB_FLAG_SHIFT)
> +#define GUESTMEM_HUGETLB_FLAG_64KB	(16UL << GUESTMEM_HUGETLB_FLAG_SHIFT)
> +#define GUESTMEM_HUGETLB_FLAG_512KB	(19UL << GUESTMEM_HUGETLB_FLAG_SHIFT)
> +#define GUESTMEM_HUGETLB_FLAG_1MB	(20UL << GUESTMEM_HUGETLB_FLAG_SHIFT)
> +#define GUESTMEM_HUGETLB_FLAG_2MB	(21UL << GUESTMEM_HUGETLB_FLAG_SHIFT)
> +#define GUESTMEM_HUGETLB_FLAG_8MB	(23UL << GUESTMEM_HUGETLB_FLAG_SHIFT)
> +#define GUESTMEM_HUGETLB_FLAG_16MB	(24UL << GUESTMEM_HUGETLB_FLAG_SHIFT)
> +#define GUESTMEM_HUGETLB_FLAG_32MB	(25UL << GUESTMEM_HUGETLB_FLAG_SHIFT)
> +#define GUESTMEM_HUGETLB_FLAG_256MB	(28UL << GUESTMEM_HUGETLB_FLAG_SHIFT)
> +#define GUESTMEM_HUGETLB_FLAG_512MB	(29UL << GUESTMEM_HUGETLB_FLAG_SHIFT)
> +#define GUESTMEM_HUGETLB_FLAG_1GB	(30UL << GUESTMEM_HUGETLB_FLAG_SHIFT)
> +#define GUESTMEM_HUGETLB_FLAG_2GB	(31UL << GUESTMEM_HUGETLB_FLAG_SHIFT)
> +#define GUESTMEM_HUGETLB_FLAG_16GB	(34UL << GUESTMEM_HUGETLB_FLAG_SHIFT)
> +
> +#endif /* _UAPI_LINUX_GUESTMEM_H */
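
Side note for readers: the encoding above mirrors mmap(2)'s MAP_HUGE_*
scheme at a different shift: bits 63:58 carry log2 of the hugepage size,
so e.g. 2MB = 2^21 yields 21UL << 58. Purely as an illustration,
userspace could derive the flag for any power-of-two size the running
kernel has an hstate for:

  /* Illustrative helper, not part of this series. */
  static inline unsigned long guestmem_hugetlb_size_flag(unsigned int log2sz)
  {
          return (unsigned long)log2sz << GUESTMEM_HUGETLB_FLAG_SHIFT;
  }

  /* guestmem_hugetlb_size_flag(21) == GUESTMEM_HUGETLB_FLAG_2MB */
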
> diff --git a/mm/Kconfig b/mm/Kconfig
> index 131adc49f58d..bb6e39e37245 100644
> --- a/mm/Kconfig
> +++ b/mm/Kconfig
> @@ -1218,7 +1218,10 @@ config SECRETMEM
>  
>  config GUESTMEM_HUGETLB
>  	bool "Enable guestmem_hugetlb allocator for guest_memfd"
> -	depends on HUGETLBFS
> +	select GUESTMEM
> +	select HUGETLBFS
> +	select HUGETLB_PAGE
> +	select HUGETLB_PAGE_OPTIMIZE_VMEMMAP
>  	help
>  	  Enable this to make HugeTLB folios available to guest_memfd
>  	  (KVM virtualization) as backing memory.
> diff --git a/mm/guestmem_hugetlb.c b/mm/guestmem_hugetlb.c
> index 51a724ebcc50..5459ef7eb329 100644
> --- a/mm/guestmem_hugetlb.c
> +++ b/mm/guestmem_hugetlb.c
> @@ -5,6 +5,14 @@
>   */
>  
>  #include <linux/mm_types.h>
> +#include <linux/guestmem.h>
> +#include <linux/hugetlb.h>
> +#include <linux/hugetlb_cgroup.h>
> +#include <linux/mempolicy.h>
> +#include <linux/mm.h>
> +#include <linux/pagemap.h>
> +
> +#include <uapi/linux/guestmem.h>
>  
>  #include "guestmem_hugetlb.h"
>  
> @@ -12,3 +20,154 @@ void guestmem_hugetlb_handle_folio_put(struct folio *folio)
>  {
>  	WARN_ONCE(1, "A placeholder that shouldn't trigger. Work in progress.");
>  }
> +
> +struct guestmem_hugetlb_private {
> +	struct hstate *h;
> +	struct hugepage_subpool *spool;
> +	struct hugetlb_cgroup *h_cg_rsvd;
> +};
> +
> +static size_t guestmem_hugetlb_nr_pages_in_folio(void *priv)
> +{
> +	struct guestmem_hugetlb_private *private = priv;
> +
> +	return pages_per_huge_page(private->h);
> +}
> +
> +static void *guestmem_hugetlb_setup(size_t size, u64 flags)
> +{
> +	struct guestmem_hugetlb_private *private;
> +	struct hugetlb_cgroup *h_cg_rsvd = NULL;
> +	struct hugepage_subpool *spool;
> +	unsigned long nr_pages;
> +	int page_size_log;
> +	struct hstate *h;
> +	long hpages;
> +	int idx;
> +	int ret;
> +
> +	page_size_log = (flags >> GUESTMEM_HUGETLB_FLAG_SHIFT) &
> +			GUESTMEM_HUGETLB_FLAG_MASK;
> +	h = hstate_sizelog(page_size_log);
> +	if (!h)
> +		return ERR_PTR(-EINVAL);
> +
> +	/*
> +	 * Check against h because page_size_log could be 0 to request default
> +	 * HugeTLB page size.
> +	 */
> +	if (!IS_ALIGNED(size, huge_page_size(h)))
> +		return ERR_PTR(-EINVAL);

For SNP testing we ended up needing to relax this to play along a little
more easily with QEMU etc., and instead just round the size up via:

  size = round_up(size, huge_page_size(h));

The thinking is that since the rounded-up size would presumably span
beyond what actually gets bound to any memslots, KVM will simply map the
extra range as 4K in the nested page tables. Userspace already causes
the 4K split, and the inode size doesn't change as part of this
adjustment, so the extra pages would remain inaccessible.

The accounting might get a little weird, but it's probably fair to
document that non-hugepage-aligned gmemfd sizes can result in wasted
memory, so userspace can fine-tune around that if it cares.
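
Concretely, against the hunk above that would be something like
(untested):

  -	if (!IS_ALIGNED(size, huge_page_size(h)))
  -		return ERR_PTR(-EINVAL);
  +	size = round_up(size, huge_page_size(h));

round_up() is a no-op for already-aligned sizes, so the alignment check
can go away entirely.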

-Mike

> +
> +	private = kzalloc(sizeof(*private), GFP_KERNEL);
> +	if (!private)
> +		return ERR_PTR(-ENOMEM);
> +
> +	/* Creating a subpool makes reservations, hence charge for them now. */
> +	idx = hstate_index(h);
> +	nr_pages = size >> PAGE_SHIFT;
> +	ret = hugetlb_cgroup_charge_cgroup_rsvd(idx, nr_pages, &h_cg_rsvd);
> +	if (ret)
> +		goto err_free;
> +
> +	hpages = size >> huge_page_shift(h);
> +	spool = hugepage_new_subpool(h, hpages, hpages, false);
> +	if (!spool)
> +		goto err_uncharge;
> +
> +	private->h = h;
> +	private->spool = spool;
> +	private->h_cg_rsvd = h_cg_rsvd;
> +
> +	return private;
> +
> +err_uncharge:
> +	ret = -ENOMEM;
> +	hugetlb_cgroup_uncharge_cgroup_rsvd(idx, nr_pages, h_cg_rsvd);
> +err_free:
> +	kfree(private);
> +	return ERR_PTR(ret);
> +}
> +
> +static void guestmem_hugetlb_teardown(void *priv, size_t inode_size)
> +{
> +	struct guestmem_hugetlb_private *private = priv;
> +	unsigned long nr_pages;
> +	int idx;
> +
> +	hugepage_put_subpool(private->spool);
> +
> +	idx = hstate_index(private->h);
> +	nr_pages = inode_size >> PAGE_SHIFT;
> +	hugetlb_cgroup_uncharge_cgroup_rsvd(idx, nr_pages, private->h_cg_rsvd);
> +
> +	kfree(private);
> +}
> +
> +static struct folio *guestmem_hugetlb_alloc_folio(void *priv)
> +{
> +	struct guestmem_hugetlb_private *private = priv;
> +	struct mempolicy *mpol;
> +	struct folio *folio;
> +	pgoff_t ilx;
> +	int ret;
> +
> +	ret = hugepage_subpool_get_pages(private->spool, 1);
> +	if (ret == -ENOMEM) {
> +		return ERR_PTR(-ENOMEM);
> +	} else if (ret > 0) {
> +		/* guest_memfd will not use surplus pages. */
> +		goto err_put_pages;
> +	}
> +
> +	/*
> +	 * TODO: mempolicy would probably have to be stored on the inode, use
> +	 * task policy for now.
> +	 */
> +	mpol = get_task_policy(current);
> +
> +	/* TODO: ignore interleaving for now. */
> +	ilx = NO_INTERLEAVE_INDEX;
> +
> +	/*
> +	 * charge_cgroup_rsvd is false because we already charged reservations
> +	 * when creating the subpool for this
> +	 * guest_memfd. use_existing_reservation is true - we're using a
> +	 * reservation from the guest_memfd's subpool.
> +	 */
> +	folio = hugetlb_alloc_folio(private->h, mpol, ilx, false, true);
> +	mpol_cond_put(mpol);
> +
> +	if (IS_ERR_OR_NULL(folio))
> +		goto err_put_pages;
> +
> +	/*
> +	 * Clear restore_reserve here so that when this folio is freed,
> +	 * free_huge_folio() will always attempt to return the reservation to
> +	 * the subpool.  guest_memfd, unlike regular hugetlb, has no resv_map,
> +	 * and hence when freeing, the folio needs to be returned to the
> +	 * subpool.  guest_memfd does not use surplus hugetlb pages, so in
> +	 * free_huge_folio(), returning to subpool will always succeed and the
> +	 * hstate reservation will then get restored.
> +	 *
> +	 * hugetlbfs does this in hugetlb_add_to_page_cache().
> +	 */
> +	folio_clear_hugetlb_restore_reserve(folio);
> +
> +	hugetlb_set_folio_subpool(folio, private->spool);
> +
> +	return folio;
> +
> +err_put_pages:
> +	hugepage_subpool_put_pages(private->spool, 1);
> +	return ERR_PTR(-ENOMEM);
> +}
> +
> +const struct guestmem_allocator_operations guestmem_hugetlb_ops = {
> +	.inode_setup = guestmem_hugetlb_setup,
> +	.inode_teardown = guestmem_hugetlb_teardown,
> +	.alloc_folio = guestmem_hugetlb_alloc_folio,
> +	.nr_pages_in_folio = guestmem_hugetlb_nr_pages_in_folio,
> +};
> +EXPORT_SYMBOL_GPL(guestmem_hugetlb_ops);
> -- 
> 2.49.0.1045.g170613ef41-goog
> 
> 
