Message-ID: <2ae41e0d80339da2b57011622ac2288fed65cd01.1747264138.git.ackerleytng@google.com>
Date: Wed, 14 May 2025 16:42:14 -0700
From: Ackerley Tng <ackerleytng@...gle.com>
To: kvm@...r.kernel.org, linux-mm@...ck.org, linux-kernel@...r.kernel.org,
x86@...nel.org, linux-fsdevel@...r.kernel.org
Cc: ackerleytng@...gle.com, aik@....com, ajones@...tanamicro.com,
akpm@...ux-foundation.org, amoorthy@...gle.com, anthony.yznaga@...cle.com,
anup@...infault.org, aou@...s.berkeley.edu, bfoster@...hat.com,
binbin.wu@...ux.intel.com, brauner@...nel.org, catalin.marinas@....com,
chao.p.peng@...el.com, chenhuacai@...nel.org, dave.hansen@...el.com,
david@...hat.com, dmatlack@...gle.com, dwmw@...zon.co.uk,
erdemaktas@...gle.com, fan.du@...el.com, fvdl@...gle.com, graf@...zon.com,
haibo1.xu@...el.com, hch@...radead.org, hughd@...gle.com, ira.weiny@...el.com,
isaku.yamahata@...el.com, jack@...e.cz, james.morse@....com,
jarkko@...nel.org, jgg@...pe.ca, jgowans@...zon.com, jhubbard@...dia.com,
jroedel@...e.de, jthoughton@...gle.com, jun.miao@...el.com,
kai.huang@...el.com, keirf@...gle.com, kent.overstreet@...ux.dev,
kirill.shutemov@...el.com, liam.merwick@...cle.com,
maciej.wieczor-retman@...el.com, mail@...iej.szmigiero.name, maz@...nel.org,
mic@...ikod.net, michael.roth@....com, mpe@...erman.id.au,
muchun.song@...ux.dev, nikunj@....com, nsaenz@...zon.es,
oliver.upton@...ux.dev, palmer@...belt.com, pankaj.gupta@....com,
paul.walmsley@...ive.com, pbonzini@...hat.com, pdurrant@...zon.co.uk,
peterx@...hat.com, pgonda@...gle.com, pvorel@...e.cz, qperret@...gle.com,
quic_cvanscha@...cinc.com, quic_eberman@...cinc.com,
quic_mnalajal@...cinc.com, quic_pderrin@...cinc.com, quic_pheragu@...cinc.com,
quic_svaddagi@...cinc.com, quic_tsoni@...cinc.com, richard.weiyang@...il.com,
rick.p.edgecombe@...el.com, rientjes@...gle.com, roypat@...zon.co.uk,
rppt@...nel.org, seanjc@...gle.com, shuah@...nel.org, steven.price@....com,
steven.sistare@...cle.com, suzuki.poulose@....com, tabba@...gle.com,
thomas.lendacky@....com, usama.arif@...edance.com, vannapurve@...gle.com,
vbabka@...e.cz, viro@...iv.linux.org.uk, vkuznets@...hat.com,
wei.w.wang@...el.com, will@...nel.org, willy@...radead.org,
xiaoyao.li@...el.com, yan.y.zhao@...el.com, yilun.xu@...el.com,
yuzenghui@...wei.com, zhiquan1.li@...el.com
Subject: [RFC PATCH v2 35/51] mm: guestmem_hugetlb: Add support for splitting
and merging pages
These functions allow guest_memfd to split and merge HugeTLB pages,
and to clean them up when the pages are freed.
For merging and splitting pages on conversion, guestmem_hugetlb
expects the refcount on the pages to already be 0. The caller must
ensure that.
For conversions, guest_memfd ensures that the refcounts are already 0
by checking that there are no unexpected refcounts, and then freezing
the expected refcounts away. On unexpected refcounts, guest_memfd will
return an error to userspace.
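As a rough caller-side sketch of the sequence above (the helper name
and the expected refcount of 1 are assumptions for illustration, not
part of this patch), the conversion path would freeze the expected
references down to 0 and only then ask the allocator to split:

	/*
	 * Illustrative only: freeze the expected (filemap) reference so the
	 * refcount is 0, as guestmem_hugetlb_split_folio() requires.
	 */
	static int example_prepare_and_split(struct folio *folio)
	{
		const int expected_refs = 1;	/* assumption: filemap ref only */

		/* Unexpected references present: report an error to userspace. */
		if (!folio_ref_freeze(folio, expected_refs))
			return -EAGAIN;

		/* Refcount is now 0; the allocator may split the folio. */
		return guestmem_hugetlb_ops.split_folio(folio);
	}

The real caller also has to restore the expected references afterwards
(e.g. via folio_ref_unfreeze()) on the resulting folios; that part is
omitted from this sketch.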
For truncation, on unexpected refcounts, guest_memfd will return an
error to userspace.
For truncation on closing, guest_memfd will just remove its own
refcounts (the filemap refcounts) and mark split pages with
PGTY_guestmem_hugetlb.
The presence of PGTY_guestmem_hugetlb will trigger the folio_put()
callback to handle further cleanup. This cleanup process will merge
pages (with refcount 0, since cleanup is triggered from folio_put())
before returning the pages to HugeTLB.
Since folio_put() could be called from atomic context and the merging
process is long, the merge work is deferred to a worker thread.
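A simplified sketch of the close-time flow described above (the
function name is made up for illustration; the real call sites are in
the guest_memfd patches of this series):

	/*
	 * Illustrative only: on truncation at close, tag the split folio and
	 * drop the filemap reference; the last folio_put() then runs the
	 * PGTY_guestmem_hugetlb callback, which defers merging to the
	 * guestmem_hugetlb workqueue.
	 */
	static void example_drop_split_folio_on_close(struct folio *folio)
	{
		/* Registers the folio_put() callback if metadata is stashed. */
		guestmem_hugetlb_ops.free_folio(folio);

		/* Possibly the last reference: triggers the deferred merge. */
		folio_put(folio);
	}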
Change-Id: Ib04a3236f1e7250fd9af827630c334d40fb09d40
Signed-off-by: Ackerley Tng <ackerleytng@...gle.com>
Co-developed-by: Vishal Annapurve <vannapurve@...gle.com>
Signed-off-by: Vishal Annapurve <vannapurve@...gle.com>
---
include/linux/guestmem.h | 3 +
mm/guestmem_hugetlb.c | 349 ++++++++++++++++++++++++++++++++++++++-
2 files changed, 347 insertions(+), 5 deletions(-)
diff --git a/include/linux/guestmem.h b/include/linux/guestmem.h
index 4b2d820274d9..3ee816d1dd34 100644
--- a/include/linux/guestmem.h
+++ b/include/linux/guestmem.h
@@ -8,6 +8,9 @@ struct guestmem_allocator_operations {
void *(*inode_setup)(size_t size, u64 flags);
void (*inode_teardown)(void *private, size_t inode_size);
struct folio *(*alloc_folio)(void *private);
+ int (*split_folio)(struct folio *folio);
+ void (*merge_folio)(struct folio *folio);
+ void (*free_folio)(struct folio *folio);
/*
* Returns the number of PAGE_SIZE pages in a page that this guestmem
* allocator provides.
diff --git a/mm/guestmem_hugetlb.c b/mm/guestmem_hugetlb.c
index ec5a188ca2a7..8727598cf18e 100644
--- a/mm/guestmem_hugetlb.c
+++ b/mm/guestmem_hugetlb.c
@@ -11,15 +11,12 @@
#include <linux/mm.h>
#include <linux/mm_types.h>
#include <linux/pagemap.h>
+#include <linux/xarray.h>
#include <uapi/linux/guestmem.h>
#include "guestmem_hugetlb.h"
-
-void guestmem_hugetlb_handle_folio_put(struct folio *folio)
-{
- WARN_ONCE(1, "A placeholder that shouldn't trigger. Work in progress.");
-}
+#include "hugetlb_vmemmap.h"
struct guestmem_hugetlb_private {
struct hstate *h;
@@ -34,6 +31,339 @@ static size_t guestmem_hugetlb_nr_pages_in_folio(void *priv)
return pages_per_huge_page(private->h);
}
+static DEFINE_XARRAY(guestmem_hugetlb_stash);
+
+struct guestmem_hugetlb_metadata {
+ void *_hugetlb_subpool;
+ void *_hugetlb_cgroup;
+ void *_hugetlb_hwpoison;
+ void *private;
+};
+
+struct guestmem_hugetlb_stash_item {
+ struct guestmem_hugetlb_metadata hugetlb_metadata;
+ /* hstate tracks the original size of this folio. */
+ struct hstate *h;
+ /* Count of split pages, individually freed, waiting to be merged. */
+ atomic_t nr_pages_waiting_to_be_merged;
+};
+
+struct workqueue_struct *guestmem_hugetlb_wq __ro_after_init;
+static struct work_struct guestmem_hugetlb_cleanup_work;
+static LLIST_HEAD(guestmem_hugetlb_cleanup_list);
+
+static inline void guestmem_hugetlb_register_folio_put_callback(struct folio *folio)
+{
+ __folio_set_guestmem_hugetlb(folio);
+}
+
+static inline void guestmem_hugetlb_unregister_folio_put_callback(struct folio *folio)
+{
+ __folio_clear_guestmem_hugetlb(folio);
+}
+
+static inline void guestmem_hugetlb_defer_cleanup(struct folio *folio)
+{
+ struct llist_node *node;
+
+ /*
+ * Reuse the folio->mapping pointer as a struct llist_node, since
+ * folio->mapping is NULL at this point.
+ */
+ BUILD_BUG_ON(sizeof(folio->mapping) != sizeof(struct llist_node));
+ node = (struct llist_node *)&folio->mapping;
+
+ /*
+ * Only schedule work if list is previously empty. Otherwise,
+ * schedule_work() had been called but the workfn hasn't retrieved the
+ * list yet.
+ */
+ if (llist_add(node, &guestmem_hugetlb_cleanup_list))
+ queue_work(guestmem_hugetlb_wq, &guestmem_hugetlb_cleanup_work);
+}
+
+void guestmem_hugetlb_handle_folio_put(struct folio *folio)
+{
+ guestmem_hugetlb_unregister_folio_put_callback(folio);
+
+ /*
+ * folio_put() can be called in interrupt context, hence do the work
+ * outside of interrupt context
+ */
+ guestmem_hugetlb_defer_cleanup(folio);
+}
+
+/*
+ * Stash existing hugetlb metadata. Use this function just before splitting a
+ * hugetlb page.
+ */
+static inline void
+__guestmem_hugetlb_stash_metadata(struct guestmem_hugetlb_metadata *metadata,
+ struct folio *folio)
+{
+ /*
+ * (folio->page + 1) doesn't have to be stashed since those fields are
+ * known on split/reconstruct and will be reinitialized anyway.
+ */
+
+ /*
+ * subpool is created for every guest_memfd inode, but the folios will
+ * outlive the inode, hence we store the subpool here.
+ */
+ metadata->_hugetlb_subpool = folio->_hugetlb_subpool;
+ /*
+ * _hugetlb_cgroup has to be stored for freeing
+ * later. _hugetlb_cgroup_rsvd does not, since it is NULL for
+ * guest_memfd folios anyway. guest_memfd reservations are handled in
+ * the inode.
+ */
+ metadata->_hugetlb_cgroup = folio->_hugetlb_cgroup;
+ metadata->_hugetlb_hwpoison = folio->_hugetlb_hwpoison;
+
+ /*
+ * HugeTLB flags are stored in folio->private. stash so that ->private
+ * can be used by core-mm.
+ */
+ metadata->private = folio->private;
+}
+
+static int guestmem_hugetlb_stash_metadata(struct folio *folio)
+{
+ XA_STATE(xas, &guestmem_hugetlb_stash, 0);
+ struct guestmem_hugetlb_stash_item *stash;
+ void *entry;
+
+ stash = kzalloc(sizeof(*stash), GFP_KERNEL);
+ if (!stash)
+ return -ENOMEM;
+
+ stash->h = folio_hstate(folio);
+ __guestmem_hugetlb_stash_metadata(&stash->hugetlb_metadata, folio);
+
+ xas_set_order(&xas, folio_pfn(folio), folio_order(folio));
+
+ xas_lock(&xas);
+ entry = xas_store(&xas, stash);
+ xas_unlock(&xas);
+
+ if (xa_is_err(entry)) {
+ kfree(stash);
+ return xa_err(entry);
+ }
+
+ return 0;
+}
+
+static inline void
+__guestmem_hugetlb_unstash_metadata(struct guestmem_hugetlb_metadata *metadata,
+ struct folio *folio)
+{
+ folio->_hugetlb_subpool = metadata->_hugetlb_subpool;
+ folio->_hugetlb_cgroup = metadata->_hugetlb_cgroup;
+ folio->_hugetlb_cgroup_rsvd = NULL;
+ folio->_hugetlb_hwpoison = metadata->_hugetlb_hwpoison;
+
+ folio_change_private(folio, metadata->private);
+}
+
+static int guestmem_hugetlb_unstash_free_metadata(struct folio *folio)
+{
+ struct guestmem_hugetlb_stash_item *stash;
+ unsigned long pfn;
+
+ pfn = folio_pfn(folio);
+
+ stash = xa_erase(&guestmem_hugetlb_stash, pfn);
+ __guestmem_hugetlb_unstash_metadata(&stash->hugetlb_metadata, folio);
+
+ kfree(stash);
+
+ return 0;
+}
+
+/**
+ * guestmem_hugetlb_split_folio() - Split a HugeTLB @folio to PAGE_SIZE pages.
+ *
+ * @folio: The folio to be split.
+ *
+ * Context: Before splitting, the folio must have a refcount of 0. After
+ * splitting, each split folio has a refcount of 0.
+ * Return: 0 on success and negative error otherwise.
+ */
+static int guestmem_hugetlb_split_folio(struct folio *folio)
+{
+ long orig_nr_pages;
+ int ret;
+ int i;
+
+ if (folio_size(folio) == PAGE_SIZE)
+ return 0;
+
+ orig_nr_pages = folio_nr_pages(folio);
+ ret = guestmem_hugetlb_stash_metadata(folio);
+ if (ret)
+ return ret;
+
+ /*
+ * hugetlb_vmemmap_restore_folio() has to be called ahead of the rest
+ * because it checks the page type. This doesn't actually split the
+ * folio, so the first few struct pages are still intact.
+ */
+ ret = hugetlb_vmemmap_restore_folio(folio_hstate(folio), folio);
+ if (ret)
+ goto err;
+
+ /*
+ * Can clear without lock because this will not race with the folio
+ * being mapped. folio's page type is overlaid with mapcount and so in
+ * other cases it's necessary to take hugetlb_lock to prevent races with
+ * mapcount increasing.
+ */
+ __folio_clear_hugetlb(folio);
+
+ /*
+ * Remove the first folio from h->hugepage_activelist since it is no
+ * longer a HugeTLB page. The other split pages should not be on any
+ * lists.
+ */
+ hugetlb_folio_list_del(folio);
+
+ /* Actually split page by undoing prep_compound_page() */
+ __folio_clear_head(folio);
+
+#ifdef NR_PAGES_IN_LARGE_FOLIO
+ /*
+ * Zero out _nr_pages, otherwise this overlaps with memcg_data,
+ * resulting in lookups on false memcg_data. _nr_pages doesn't have to
+ * be set to 1 because folio_nr_pages() relies on the presence of the
+ * head flag to return 1 for nr_pages.
+ */
+ folio->_nr_pages = 0;
+#endif
+
+ for (i = 1; i < orig_nr_pages; ++i) {
+ struct page *p = folio_page(folio, i);
+
+ /* Copy flags from the first page to split pages. */
+ p->flags = folio->flags;
+
+ p->mapping = NULL;
+ clear_compound_head(p);
+ }
+
+ return 0;
+
+err:
+ guestmem_hugetlb_unstash_free_metadata(folio);
+
+ return ret;
+}
+
+/**
+ * guestmem_hugetlb_merge_folio() - Merge a HugeTLB folio from the folio
+ * beginning @first_folio.
+ *
+ * @first_folio: the first folio in a contiguous block of folios to be merged.
+ *
+ * The size of the contiguous block is tracked in guestmem_hugetlb_stash.
+ *
+ * Context: The first folio is checked to have a refcount of 0 before
+ * reconstruction. After reconstruction, the reconstructed folio has a
+ * refcount of 0.
+ */
+static void guestmem_hugetlb_merge_folio(struct folio *first_folio)
+{
+ struct guestmem_hugetlb_stash_item *stash;
+ struct hstate *h;
+
+ stash = xa_load(&guestmem_hugetlb_stash, folio_pfn(first_folio));
+ h = stash->h;
+
+ /*
+ * This is the step that does the merge. prep_compound_page() will write
+ * to pages 1 and 2 as well, so guestmem_hugetlb_unstash_free_metadata()
+ * has to come after this.
+ */
+ prep_compound_page(&first_folio->page, huge_page_order(h));
+
+ WARN_ON(guestmem_hugetlb_unstash_free_metadata(first_folio));
+
+ /*
+ * prep_compound_page() will set up mapping on tail pages. For
+ * completeness, clear mapping on head page.
+ */
+ first_folio->mapping = NULL;
+
+ __folio_set_hugetlb(first_folio);
+
+ hugetlb_folio_list_add(first_folio, &h->hugepage_activelist);
+
+ hugetlb_vmemmap_optimize_folio(h, first_folio);
+}
+
+static struct folio *guestmem_hugetlb_maybe_merge_folio(struct folio *folio)
+{
+ struct guestmem_hugetlb_stash_item *stash;
+ unsigned long first_folio_pfn;
+ struct folio *first_folio;
+ unsigned long pfn;
+ size_t nr_pages;
+
+ pfn = folio_pfn(folio);
+
+ stash = xa_load(&guestmem_hugetlb_stash, pfn);
+ nr_pages = pages_per_huge_page(stash->h);
+ if (atomic_inc_return(&stash->nr_pages_waiting_to_be_merged) < nr_pages)
+ return NULL;
+
+ first_folio_pfn = round_down(pfn, nr_pages);
+ first_folio = pfn_folio(first_folio_pfn);
+
+ guestmem_hugetlb_merge_folio(first_folio);
+
+ return first_folio;
+}
+
+static void guestmem_hugetlb_cleanup_folio(struct folio *folio)
+{
+ struct folio *merged_folio;
+
+ merged_folio = guestmem_hugetlb_maybe_merge_folio(folio);
+ if (merged_folio)
+ __folio_put(merged_folio);
+}
+
+static void guestmem_hugetlb_cleanup_workfn(struct work_struct *work)
+{
+ struct llist_node *node;
+
+ node = llist_del_all(&guestmem_hugetlb_cleanup_list);
+ while (node) {
+ struct folio *folio;
+
+ folio = container_of((struct address_space **)node,
+ struct folio, mapping);
+
+ node = node->next;
+ folio->mapping = NULL;
+
+ guestmem_hugetlb_cleanup_folio(folio);
+ }
+}
+
+static int __init guestmem_hugetlb_init(void)
+{
+ INIT_WORK(&guestmem_hugetlb_cleanup_work, guestmem_hugetlb_cleanup_workfn);
+
+ guestmem_hugetlb_wq = alloc_workqueue("guestmem_hugetlb",
+ WQ_MEM_RECLAIM | WQ_UNBOUND, 0);
+ if (!guestmem_hugetlb_wq)
+ return -ENOMEM;
+
+ return 0;
+}
+subsys_initcall(guestmem_hugetlb_init);
+
static void *guestmem_hugetlb_setup(size_t size, u64 flags)
{
@@ -164,10 +494,19 @@ static struct folio *guestmem_hugetlb_alloc_folio(void *priv)
return ERR_PTR(-ENOMEM);
}
+static void guestmem_hugetlb_free_folio(struct folio *folio)
+{
+ if (xa_load(&guestmem_hugetlb_stash, folio_pfn(folio)))
+ guestmem_hugetlb_register_folio_put_callback(folio);
+}
+
const struct guestmem_allocator_operations guestmem_hugetlb_ops = {
.inode_setup = guestmem_hugetlb_setup,
.inode_teardown = guestmem_hugetlb_teardown,
.alloc_folio = guestmem_hugetlb_alloc_folio,
+ .split_folio = guestmem_hugetlb_split_folio,
+ .merge_folio = guestmem_hugetlb_merge_folio,
+ .free_folio = guestmem_hugetlb_free_folio,
.nr_pages_in_folio = guestmem_hugetlb_nr_pages_in_folio,
};
EXPORT_SYMBOL_GPL(guestmem_hugetlb_ops);
--
2.49.0.1045.g170613ef41-goog