Message-ID: <bdd00f8a1919794da94ba366529756bd6b925ade.1747264138.git.ackerleytng@google.com>
Date: Wed, 14 May 2025 16:42:02 -0700
From: Ackerley Tng <ackerleytng@...gle.com>
To: kvm@...r.kernel.org, linux-mm@...ck.org, linux-kernel@...r.kernel.org, 
	x86@...nel.org, linux-fsdevel@...r.kernel.org
Cc: ackerleytng@...gle.com, aik@....com, ajones@...tanamicro.com, 
	akpm@...ux-foundation.org, amoorthy@...gle.com, anthony.yznaga@...cle.com, 
	anup@...infault.org, aou@...s.berkeley.edu, bfoster@...hat.com, 
	binbin.wu@...ux.intel.com, brauner@...nel.org, catalin.marinas@....com, 
	chao.p.peng@...el.com, chenhuacai@...nel.org, dave.hansen@...el.com, 
	david@...hat.com, dmatlack@...gle.com, dwmw@...zon.co.uk, 
	erdemaktas@...gle.com, fan.du@...el.com, fvdl@...gle.com, graf@...zon.com, 
	haibo1.xu@...el.com, hch@...radead.org, hughd@...gle.com, ira.weiny@...el.com, 
	isaku.yamahata@...el.com, jack@...e.cz, james.morse@....com, 
	jarkko@...nel.org, jgg@...pe.ca, jgowans@...zon.com, jhubbard@...dia.com, 
	jroedel@...e.de, jthoughton@...gle.com, jun.miao@...el.com, 
	kai.huang@...el.com, keirf@...gle.com, kent.overstreet@...ux.dev, 
	kirill.shutemov@...el.com, liam.merwick@...cle.com, 
	maciej.wieczor-retman@...el.com, mail@...iej.szmigiero.name, maz@...nel.org, 
	mic@...ikod.net, michael.roth@....com, mpe@...erman.id.au, 
	muchun.song@...ux.dev, nikunj@....com, nsaenz@...zon.es, 
	oliver.upton@...ux.dev, palmer@...belt.com, pankaj.gupta@....com, 
	paul.walmsley@...ive.com, pbonzini@...hat.com, pdurrant@...zon.co.uk, 
	peterx@...hat.com, pgonda@...gle.com, pvorel@...e.cz, qperret@...gle.com, 
	quic_cvanscha@...cinc.com, quic_eberman@...cinc.com, 
	quic_mnalajal@...cinc.com, quic_pderrin@...cinc.com, quic_pheragu@...cinc.com, 
	quic_svaddagi@...cinc.com, quic_tsoni@...cinc.com, richard.weiyang@...il.com, 
	rick.p.edgecombe@...el.com, rientjes@...gle.com, roypat@...zon.co.uk, 
	rppt@...nel.org, seanjc@...gle.com, shuah@...nel.org, steven.price@....com, 
	steven.sistare@...cle.com, suzuki.poulose@....com, tabba@...gle.com, 
	thomas.lendacky@....com, usama.arif@...edance.com, vannapurve@...gle.com, 
	vbabka@...e.cz, viro@...iv.linux.org.uk, vkuznets@...hat.com, 
	wei.w.wang@...el.com, will@...nel.org, willy@...radead.org, 
	xiaoyao.li@...el.com, yan.y.zhao@...el.com, yilun.xu@...el.com, 
	yuzenghui@...wei.com, zhiquan1.li@...el.com
Subject: [RFC PATCH v2 23/51] mm: hugetlb: Refactor out hugetlb_alloc_folio()

Refactor hugetlb_alloc_folio() out of alloc_hugetlb_folio(). The new
helper handles allocation of a folio and cgroup charging.

In addition to flags that control charging during allocation,
hugetlb_alloc_folio() also takes parameters for memory policy.

This refactoring as a whole decouples hugetlb page allocation from
hugetlbfs, where (1) the subpool is stored at the fs mount, (2)
reservations are made during mmap and stored in the vma, (3) the mpol
must be stored at vma->vm_policy, and (4) a vma must be used for
allocation even if the pages are not meant to be used by the host
process.

This decoupling will allow hugetlb_alloc_folio() to be used by
guest_memfd in later patches. In guest_memfd, (1) a subpool is created
per-fd and stored on the inode, (2) no vma-related reservations are
used, and (3) the mpol may not be associated with a vma, since (4)
private pages will not be mappable to userspace and hence have no
associated vmas.

This could hopefully also open hugetlb up as a more generic source of
hugetlb pages that are not bound to hugetlbfs, with the complexities of
userspace/mmap/vma-related reservations confined to hugetlbfs.
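
For illustration only (not part of this patch), a caller that is not
backed by hugetlbfs could use the new helper roughly as follows. This
is a sketch under assumptions: the helper name
gmem_hugetlb_alloc_folio(), the use of the task mempolicy via
get_task_policy(), and the interleave index of 0 are illustrative
choices, not something this series prescribes.

#include <linux/err.h>
#include <linux/hugetlb.h>
#include <linux/mempolicy.h>
#include <linux/sched.h>

/* Sketch only: a non-hugetlbfs caller; name and policy lookup are assumed. */
static struct folio *gmem_hugetlb_alloc_folio(struct hstate *h,
					      struct hugepage_subpool *spool)
{
	struct mempolicy *mpol;
	struct folio *folio;

	/* No vma exists, so take the policy from the task, not vma->vm_policy. */
	mpol = get_task_policy(current);

	/*
	 * No vma-level reservation was made either, so charge the cgroup
	 * reservation here and do not consume an existing hstate reservation.
	 */
	folio = hugetlb_alloc_folio(h, mpol, 0, true, false);
	if (IS_ERR_OR_NULL(folio))
		return folio;

	/* The per-fd subpool lives on the inode; associate it with the folio. */
	hugetlb_set_folio_subpool(folio, spool);

	return folio;
}

With the vma/mmap-specific pieces kept in alloc_hugetlb_folio(), a
caller like the above only has to manage its own subpool and memory
policy.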

Signed-off-by: Ackerley Tng <ackerleytng@...gle.com>
Change-Id: I60528f246341268acbf0ed5de7752ae2cacbef93
---
 include/linux/hugetlb.h |  12 +++
 mm/hugetlb.c            | 192 ++++++++++++++++++++++------------------
 2 files changed, 118 insertions(+), 86 deletions(-)

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 8f3ac832ee7f..8ba941d88956 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -698,6 +698,9 @@ bool hugetlb_bootmem_page_zones_valid(int nid, struct huge_bootmem_page *m);
 int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list);
 int replace_free_hugepage_folios(unsigned long start_pfn, unsigned long end_pfn);
 void wait_for_freed_hugetlb_folios(void);
+struct folio *hugetlb_alloc_folio(struct hstate *h, struct mempolicy *mpol,
+				  pgoff_t ilx, bool charge_cgroup_rsvd,
+				  bool use_existing_reservation);
 struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma,
 				unsigned long addr, bool cow_from_owner);
 struct folio *alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid,
@@ -1099,6 +1102,15 @@ static inline void wait_for_freed_hugetlb_folios(void)
 {
 }
 
+static inline struct folio *hugetlb_alloc_folio(struct hstate *h,
+						struct mempolicy *mpol,
+						pgoff_t ilx,
+						bool charge_cgroup_rsvd,
+						bool use_existing_reservation)
+{
+	return NULL;
+}
+
 static inline struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma,
 					   unsigned long addr,
 					   bool cow_from_owner)
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 29d1a3fb10df..5b088fe002a2 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -2954,6 +2954,101 @@ void wait_for_freed_hugetlb_folios(void)
 	flush_work(&free_hpage_work);
 }
 
+/**
+ * hugetlb_alloc_folio() - Allocates a hugetlb folio.
+ *
+ * @h: struct hstate to allocate from.
+ * @mpol: struct mempolicy to apply for this folio allocation.
+ * @ilx: Interleave index for interpretation of @mpol.
+ * @charge_cgroup_rsvd: Set to true to charge cgroup reservation.
+ * @use_existing_reservation: Set to true if this allocation should use an
+ *                            existing hstate reservation.
+ *
+ * This function handles cgroup and global hstate reservations. VMA-related
+ * reservations and subpool debiting must be handled by the caller if necessary.
+ *
+ * Return: folio on success or an ERR_PTR()-encoded error otherwise.
+ */
+struct folio *hugetlb_alloc_folio(struct hstate *h, struct mempolicy *mpol,
+				  pgoff_t ilx, bool charge_cgroup_rsvd,
+				  bool use_existing_reservation)
+{
+	unsigned int nr_pages = pages_per_huge_page(h);
+	struct hugetlb_cgroup *h_cg = NULL;
+	struct folio *folio = NULL;
+	nodemask_t *nodemask;
+	gfp_t gfp_mask;
+	int nid;
+	int idx;
+	int ret;
+
+	idx = hstate_index(h);
+
+	if (charge_cgroup_rsvd) {
+		if (hugetlb_cgroup_charge_cgroup_rsvd(idx, nr_pages, &h_cg))
+			goto out;
+	}
+
+	if (hugetlb_cgroup_charge_cgroup(idx, nr_pages, &h_cg))
+		goto out_uncharge_cgroup_reservation;
+
+	gfp_mask = htlb_alloc_mask(h);
+	nid = policy_node_nodemask(mpol, gfp_mask, ilx, &nodemask);
+
+	spin_lock_irq(&hugetlb_lock);
+
+	if (use_existing_reservation || available_huge_pages(h))
+		folio = dequeue_hugetlb_folio(h, gfp_mask, mpol, nid, nodemask);
+
+	if (!folio) {
+		spin_unlock_irq(&hugetlb_lock);
+		folio = alloc_surplus_hugetlb_folio(h, gfp_mask, mpol, nid, nodemask);
+		if (!folio)
+			goto out_uncharge_cgroup;
+		spin_lock_irq(&hugetlb_lock);
+		list_add(&folio->lru, &h->hugepage_activelist);
+		folio_ref_unfreeze(folio, 1);
+		/* Fall through */
+	}
+
+	if (use_existing_reservation) {
+		folio_set_hugetlb_restore_reserve(folio);
+		h->resv_huge_pages--;
+	}
+
+	hugetlb_cgroup_commit_charge(idx, nr_pages, h_cg, folio);
+
+	if (charge_cgroup_rsvd)
+		hugetlb_cgroup_commit_charge_rsvd(idx, nr_pages, h_cg, folio);
+
+	spin_unlock_irq(&hugetlb_lock);
+
+	gfp_mask = htlb_alloc_mask(h) | __GFP_RETRY_MAYFAIL;
+	ret = mem_cgroup_charge_hugetlb(folio, gfp_mask);
+	/*
+	 * Unconditionally increment NR_HUGETLB here. If it turns out that
+	 * mem_cgroup_charge_hugetlb failed, then immediately free the page and
+	 * decrement NR_HUGETLB.
+	 */
+	lruvec_stat_mod_folio(folio, NR_HUGETLB, pages_per_huge_page(h));
+
+	if (ret == -ENOMEM) {
+		free_huge_folio(folio);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	return folio;
+
+out_uncharge_cgroup:
+	hugetlb_cgroup_uncharge_cgroup(idx, nr_pages, h_cg);
+out_uncharge_cgroup_reservation:
+	if (charge_cgroup_rsvd)
+		hugetlb_cgroup_uncharge_cgroup_rsvd(idx, nr_pages, h_cg);
+out:
+	folio = ERR_PTR(-ENOSPC);
+	return folio;
+}
+
 /*
  * NOTE! "cow_from_owner" represents a very hacky usage only used in CoW
  * faults of hugetlb private mappings on top of a non-page-cache folio (in
@@ -2971,16 +3066,8 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma,
 	bool reservation_exists;
 	bool charge_cgroup_rsvd;
 	struct folio *folio;
-	int ret, idx;
-	struct hugetlb_cgroup *h_cg = NULL;
-	gfp_t gfp = htlb_alloc_mask(h) | __GFP_RETRY_MAYFAIL;
 	struct mempolicy *mpol;
-	nodemask_t *nodemask;
-	gfp_t gfp_mask;
 	pgoff_t ilx;
-	int nid;
-
-	idx = hstate_index(h);
 
 	if (cow_from_owner) {
 		/*
@@ -3020,69 +3107,22 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma,
 	}
 	reservation_exists = vma_reservation_exists || subpool_reservation_exists;
 
-	/*
-	 * If a vma_reservation_exists, we can skip charging hugetlb
-	 * reservations since that was charged in hugetlb_reserve_pages() when
-	 * the reservation was recorded on the resv_map.
-	 */
-	charge_cgroup_rsvd = !vma_reservation_exists;
-	if (charge_cgroup_rsvd) {
-		ret = hugetlb_cgroup_charge_cgroup_rsvd(
-			idx, pages_per_huge_page(h), &h_cg);
-		if (ret)
-			goto out_subpool_put;
-	}
-
 	mpol = get_vma_policy(vma, addr, h->order, &ilx);
 
-	ret = hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h), &h_cg);
-	if (ret) {
-		mpol_cond_put(mpol);
-		goto out_uncharge_cgroup_reservation;
-	}
-
-	gfp_mask = htlb_alloc_mask(h);
-	nid = policy_node_nodemask(mpol, gfp_mask, ilx, &nodemask);
-
-	spin_lock_irq(&hugetlb_lock);
-
-	folio = NULL;
-	if (reservation_exists || available_huge_pages(h))
-		folio = dequeue_hugetlb_folio(h, gfp_mask, mpol, nid, nodemask);
-
-	if (!folio) {
-		spin_unlock_irq(&hugetlb_lock);
-		folio = alloc_surplus_hugetlb_folio(h, gfp_mask, mpol, nid, nodemask);
-		if (!folio) {
-			mpol_cond_put(mpol);
-			goto out_uncharge_cgroup;
-		}
-		spin_lock_irq(&hugetlb_lock);
-		list_add(&folio->lru, &h->hugepage_activelist);
-		folio_ref_unfreeze(folio, 1);
-		/* Fall through */
-	}
-
 	/*
-	 * Either dequeued or buddy-allocated folio needs to add special
-	 * mark to the folio when it consumes a global reservation.
+	 * If a vma_reservation_exists, we can skip charging cgroup reservations
+	 * since that was charged during vma reservation. Use a reservation as
+	 * long as it exists.
 	 */
-	if (reservation_exists) {
-		folio_set_hugetlb_restore_reserve(folio);
-		h->resv_huge_pages--;
-	}
-
-	hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h), h_cg, folio);
-
-	if (charge_cgroup_rsvd) {
-		hugetlb_cgroup_commit_charge_rsvd(idx, pages_per_huge_page(h),
-						  h_cg, folio);
-	}
-
-	spin_unlock_irq(&hugetlb_lock);
+	charge_cgroup_rsvd = !vma_reservation_exists;
+	folio = hugetlb_alloc_folio(h, mpol, ilx, charge_cgroup_rsvd,
+				    reservation_exists);
 
 	mpol_cond_put(mpol);
 
+	if (IS_ERR_OR_NULL(folio))
+		goto out_subpool_put;
+
 	hugetlb_set_folio_subpool(folio, spool);
 
 	/* If vma accounting wasn't bypassed earlier, follow up with commit. */
@@ -3091,9 +3131,8 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma,
 		/*
 		 * If there is a discrepancy in reservation status between the
 		 * time of vma_needs_reservation() and vma_commit_reservation(),
-		 * then there the page must have been added to the reservation
-		 * map between vma_needs_reservation() and
-		 * vma_commit_reservation().
+		 * then the page must have been added to the reservation map
+		 * between vma_needs_reservation() and vma_commit_reservation().
 		 *
 		 * Adjust for the subpool count incremented above AND
 		 * in hugetlb_reserve_pages for the same page.	Also,
@@ -3115,27 +3154,8 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma,
 		}
 	}
 
-	ret = mem_cgroup_charge_hugetlb(folio, gfp);
-	/*
-	 * Unconditionally increment NR_HUGETLB here. If it turns out that
-	 * mem_cgroup_charge_hugetlb failed, then immediately free the page and
-	 * decrement NR_HUGETLB.
-	 */
-	lruvec_stat_mod_folio(folio, NR_HUGETLB, pages_per_huge_page(h));
-
-	if (ret == -ENOMEM) {
-		free_huge_folio(folio);
-		return ERR_PTR(-ENOMEM);
-	}
-
 	return folio;
 
-out_uncharge_cgroup:
-	hugetlb_cgroup_uncharge_cgroup(idx, pages_per_huge_page(h), h_cg);
-out_uncharge_cgroup_reservation:
-	if (charge_cgroup_rsvd)
-		hugetlb_cgroup_uncharge_cgroup_rsvd(idx, pages_per_huge_page(h),
-						    h_cg);
 out_subpool_put:
 	if (!vma_reservation_exists)
 		hugepage_subpool_put_pages(spool, 1);
-- 
2.49.0.1045.g170613ef41-goog

