Message-ID: <9e29972b-6f72-4228-89bb-0ede44ed5e83@redhat.com>
Date: Wed, 5 Nov 2025 12:42:09 +0100
From: David Hildenbrand <dhildenb@...hat.com>
To: Hui Zhu <hui.zhu@...ux.dev>, Andrew Morton <akpm@...ux-foundation.org>,
Muchun Song <muchun.song@...ux.dev>, Oscar Salvador <osalvador@...e.de>,
linux-kernel@...r.kernel.org, linux-mm@...ck.org
Cc: Hui Zhu <zhuhui@...inos.cn>, Geliang Tang <geliang@...nel.org>
Subject: Re: [PATCH v3 1/2] mm/hugetlb: extract sysfs into hugetlb_sysfs.c
On 04.11.25 09:37, Hui Zhu wrote:
> From: Hui Zhu <zhuhui@...inos.cn>
>
> Currently, hugetlb.c contains both core management logic and sysfs
> interface implementations, making it difficult to maintain. This patch
> extracts the sysfs-related code into a dedicated file to improve code
> organization.
>
> The following components are moved to mm/hugetlb_sysfs.c:
> - hugetlb page demote functions (demote_free_hugetlb_folios,
> demote_pool_huge_page)
> - sysfs attribute definitions and handlers
> - sysfs kobject management functions
> - NUMA per-node hstate attribute registration
>
> Several inline helper functions and macros are moved to
> mm/hugetlb_internal.h:
> - hstate_is_gigantic_no_runtime()
> - next_node_allowed()
> - get_valid_node_allowed()
> - hstate_next_node_to_alloc()
> - hstate_next_node_to_free()
> - for_each_node_mask_to_alloc/to_free macros
>
> To support code sharing, these functions are changed from static to
> exported symbols:
> - remove_hugetlb_folio()
> - add_hugetlb_folio()
> - init_new_hugetlb_folio()
> - prep_and_add_allocated_folios()
> - __nr_hugepages_store_common()
>
> The Makefile is updated to compile hugetlb_sysfs.o when
> CONFIG_HUGETLBFS is enabled. This maintains all existing functionality
> while improving maintainability by separating concerns.
>
> MAINTAINERS is updated to add the new file hugetlb_sysfs.c.
>
> Signed-off-by: Geliang Tang <geliang@...nel.org>
> Signed-off-by: Hui Zhu <zhuhui@...inos.cn>
> ---
[...]
> --- /dev/null
> +++ b/mm/hugetlb_internal.h
> @@ -0,0 +1,107 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +/*
> + * Internal HugeTLB definitions.
> + */
> +
> +#ifndef _LINUX_HUGETLB_INTERNAL_H
> +#define _LINUX_HUGETLB_INTERNAL_H
> +
> +#include <linux/hugetlb.h>
> +#include <linux/hugetlb_cgroup.h>
> +
> +/*
> + * Check if the hstate represents gigantic pages but gigantic page
> + * runtime support is not available. This is a common condition used to
> + * skip operations that cannot be performed on gigantic pages when runtime
> + * support is disabled.
> + */
> +static inline bool hstate_is_gigantic_no_runtime(struct hstate *h)
> +{
> + return hstate_is_gigantic(h) && !gigantic_page_runtime_supported();
> +}
> +
> +/*
> + * common helper functions for hstate_next_node_to_{alloc|free}.
> + * We may have allocated or freed a huge page based on a different
> + * nodes_allowed previously, so h->next_node_to_{alloc|free} might
> + * be outside of *nodes_allowed. Ensure that we use an allowed
> + * node for alloc or free.
> + */
> +static inline int next_node_allowed(int nid, nodemask_t *nodes_allowed)
> +{
> + nid = next_node_in(nid, *nodes_allowed);
> + VM_BUG_ON(nid >= MAX_NUMNODES);
> +
> + return nid;
> +}
> +
> +static inline int get_valid_node_allowed(int nid, nodemask_t *nodes_allowed)
> +{
> + if (!node_isset(nid, *nodes_allowed))
> + nid = next_node_allowed(nid, nodes_allowed);
> + return nid;
> +}
> +
> +/*
> + * returns the previously saved node ["this node"] from which to
> + * allocate a persistent huge page for the pool and advance the
> + * next node from which to allocate, handling wrap at end of node
> + * mask.
> + */
> +static inline int hstate_next_node_to_alloc(int *next_node,
> + nodemask_t *nodes_allowed)
> +{
> + int nid;
> +
> + VM_BUG_ON(!nodes_allowed);
> +
> + nid = get_valid_node_allowed(*next_node, nodes_allowed);
> + *next_node = next_node_allowed(nid, nodes_allowed);
> +
> + return nid;
> +}
> +
> +/*
> + * helper for remove_pool_hugetlb_folio() - return the previously saved
> + * node ["this node"] from which to free a huge page. Advance the
> + * next node id whether or not we find a free huge page to free so
> + * that the next attempt to free addresses the next node.
> + */
> +static inline int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed)
> +{
> + int nid;
> +
> + VM_BUG_ON(!nodes_allowed);
> +
> + nid = get_valid_node_allowed(h->next_nid_to_free, nodes_allowed);
> + h->next_nid_to_free = next_node_allowed(nid, nodes_allowed);
> +
> + return nid;
> +}
> +
> +#define for_each_node_mask_to_alloc(next_node, nr_nodes, node, mask) \
> + for (nr_nodes = nodes_weight(*mask); \
> + nr_nodes > 0 && \
> + ((node = hstate_next_node_to_alloc(next_node, mask)) || 1); \
> + nr_nodes--)
> +
> +#define for_each_node_mask_to_free(hs, nr_nodes, node, mask) \
> + for (nr_nodes = nodes_weight(*mask); \
> + nr_nodes > 0 && \
> + ((node = hstate_next_node_to_free(hs, mask)) || 1); \
> + nr_nodes--)
> +
> +extern void remove_hugetlb_folio(struct hstate *h, struct folio *folio,
> + bool adjust_surplus);
> +extern void add_hugetlb_folio(struct hstate *h, struct folio *folio,
> + bool adjust_surplus);
> +extern void init_new_hugetlb_folio(struct folio *folio);
> +extern void prep_and_add_allocated_folios(struct hstate *h,
> + struct list_head *folio_list);
> +extern ssize_t __nr_hugepages_store_common(bool obey_mempolicy,
> + struct hstate *h, int nid,
> + unsigned long count, size_t len);
> +
We no longer use "extern" for function declarations in headers.
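I.e., simply declare them as:

	void remove_hugetlb_folio(struct hstate *h, struct folio *folio,
				  bool adjust_surplus);
	void add_hugetlb_folio(struct hstate *h, struct folio *folio,
			       bool adjust_surplus);

and the same for the others below.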
> +extern void hugetlb_sysfs_init(void) __init;
> +
> +#endif /* _LINUX_HUGETLB_INTERNAL_H */
> diff --git a/mm/hugetlb_sysfs.c b/mm/hugetlb_sysfs.c
> new file mode 100644
> index 000000000000..a2567947d32c
> --- /dev/null
> +++ b/mm/hugetlb_sysfs.c
> @@ -0,0 +1,629 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/*
> + * HugeTLB sysfs interfaces.
> + */
> +
As I said, we usually keep the copyright from the original file.
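I.e., something like the following, with the copyright notice carried over
from mm/hugetlb.c (placeholder here, just to illustrate):

	// SPDX-License-Identifier: GPL-2.0-only
	/*
	 * HugeTLB sysfs interfaces.
	 *
	 * <copyright notice from mm/hugetlb.c>
	 */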
> +#include <linux/swap.h>
> +#include <linux/page_owner.h>
> +#include <linux/page-isolation.h>
> +
> +#include "hugetlb_vmemmap.h"
> +#include "hugetlb_internal.h"
> +
> +static long demote_free_hugetlb_folios(struct hstate *src, struct hstate *dst,
> + struct list_head *src_list)
> +{
> + long rc;
> + struct folio *folio, *next;
> + LIST_HEAD(dst_list);
> + LIST_HEAD(ret_list);
> +
> + rc = hugetlb_vmemmap_restore_folios(src, src_list, &ret_list);
> + list_splice_init(&ret_list, src_list);
> +
> + /*
> + * Taking target hstate mutex synchronizes with set_max_huge_pages.
> + * Without the mutex, pages added to target hstate could be marked
> + * as surplus.
> + *
> + * Note that we already hold src->resize_lock. To prevent deadlock,
> + * use the convention of always taking larger size hstate mutex first.
> + */
> + mutex_lock(&dst->resize_lock);
> +
> + list_for_each_entry_safe(folio, next, src_list, lru) {
> + int i;
> + bool cma;
> +
> + if (folio_test_hugetlb_vmemmap_optimized(folio))
> + continue;
> +
> + cma = folio_test_hugetlb_cma(folio);
> +
> + list_del(&folio->lru);
> +
> + split_page_owner(&folio->page, huge_page_order(src), huge_page_order(dst));
> + pgalloc_tag_split(folio, huge_page_order(src), huge_page_order(dst));
> +
> + for (i = 0; i < pages_per_huge_page(src); i += pages_per_huge_page(dst)) {
> + struct page *page = folio_page(folio, i);
> + /* Careful: see __split_huge_page_tail() */
> + struct folio *new_folio = (struct folio *)page;
> +
> + clear_compound_head(page);
> + prep_compound_page(page, dst->order);
> +
> + new_folio->mapping = NULL;
> + init_new_hugetlb_folio(new_folio);
> + /* Copy the CMA flag so that it is freed correctly */
> + if (cma)
> + folio_set_hugetlb_cma(new_folio);
> + list_add(&new_folio->lru, &dst_list);
> + }
> + }
> +
> + prep_and_add_allocated_folios(dst, &dst_list);
> +
> + mutex_unlock(&dst->resize_lock);
> +
> + return rc;
> +}
> +
> +static long demote_pool_huge_page(struct hstate *src, nodemask_t *nodes_allowed,
> + unsigned long nr_to_demote)
> + __must_hold(&hugetlb_lock)
> +{
> + int nr_nodes, node;
> + struct hstate *dst;
> + long rc = 0;
> + long nr_demoted = 0;
> +
> + lockdep_assert_held(&hugetlb_lock);
> +
> + /* We should never get here if no demote order */
> + if (!src->demote_order) {
> + pr_warn("HugeTLB: NULL demote order passed to demote_pool_huge_page.\n");
> + return -EINVAL; /* internal error */
> + }
> + dst = size_to_hstate(PAGE_SIZE << src->demote_order);
> +
> + for_each_node_mask_to_free(src, nr_nodes, node, nodes_allowed) {
> + LIST_HEAD(list);
> + struct folio *folio, *next;
> +
> + list_for_each_entry_safe(folio, next, &src->hugepage_freelists[node], lru) {
> + if (folio_test_hwpoison(folio))
> + continue;
> +
> + remove_hugetlb_folio(src, folio, false);
> + list_add(&folio->lru, &list);
> +
> + if (++nr_demoted == nr_to_demote)
> + break;
> + }
> +
> + spin_unlock_irq(&hugetlb_lock);
> +
> + rc = demote_free_hugetlb_folios(src, dst, &list);
> +
> + spin_lock_irq(&hugetlb_lock);
> +
> + list_for_each_entry_safe(folio, next, &list, lru) {
> + list_del(&folio->lru);
> + add_hugetlb_folio(src, folio, false);
> +
> + nr_demoted--;
> + }
> +
> + if (rc < 0 || nr_demoted == nr_to_demote)
> + break;
> + }
> +
> + /*
> + * Not absolutely necessary, but for consistency update max_huge_pages
> + * based on pool changes for the demoted page.
> + */
> + src->max_huge_pages -= nr_demoted;
> + dst->max_huge_pages += nr_demoted << (huge_page_order(src) - huge_page_order(dst));
> +
> + if (rc < 0)
> + return rc;
> +
> + if (nr_demoted)
> + return nr_demoted;
> + /*
> + * Only way to get here is if all pages on free lists are poisoned.
> + * Return -EBUSY so that caller will not retry.
> + */
> + return -EBUSY;
> +}
[...]
The core demotion logic should stay in hugetlb.c. In general, I think
hugetlb_sysfs.c should not contain any actual hugetlb logic, but
primarily only the sysfs interface.
This will also avoid having to export low-level functions like
init_new_hugetlb_folio() in the internal header.
Likely, demote_store() should simply call a new function
demote_pool_huge_pages() that ...
> +
> +static ssize_t demote_store(struct kobject *kobj,
> + struct kobj_attribute *attr, const char *buf, size_t len)
> +{
> + unsigned long nr_demote;
> + unsigned long nr_available;
> + nodemask_t nodes_allowed, *n_mask;
> + struct hstate *h;
> + int err;
> + int nid;
> +
> + err = kstrtoul(buf, 10, &nr_demote);
> + if (err)
> + return err;
> + h = kobj_to_hstate(kobj, &nid);
> +
> + if (nid != NUMA_NO_NODE) {
> + init_nodemask_of_node(&nodes_allowed, nid);
> + n_mask = &nodes_allowed;
> + } else {
> + n_mask = &node_states[N_MEMORY];
> + }
> +
... encapsulated the locking + loop below.
> + /* Synchronize with other sysfs operations modifying huge pages */
> + mutex_lock(&h->resize_lock);
> + spin_lock_irq(&hugetlb_lock);
> +	while (nr_demote) {
> + long rc;
> +
> + /*
> +		 * Check for available pages to demote each time through the
> + * loop as demote_pool_huge_page will drop hugetlb_lock.
> + */
> + if (nid != NUMA_NO_NODE)
> + nr_available = h->free_huge_pages_node[nid];
> + else
> + nr_available = h->free_huge_pages;
> + nr_available -= h->resv_huge_pages;
> + if (!nr_available)
> + break;
> +
> + rc = demote_pool_huge_page(h, n_mask, nr_demote);
> + if (rc < 0) {
> + err = rc;
> + break;
> + }
> +
> + nr_demote -= rc;
> + }
> +
> + spin_unlock_irq(&hugetlb_lock);
> + mutex_unlock(&h->resize_lock);
> +
> + if (err)
> + return err;
> + return len;
> +}
> +HSTATE_ATTR_WO(demote);
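
Roughly something like this (completely untested, just to illustrate what I
have in mind; the name demote_pool_huge_pages() and the exact signature are
only a suggestion):

	long demote_pool_huge_pages(struct hstate *h, nodemask_t *nodes_allowed,
				    int nid, unsigned long nr_to_demote)
	{
		unsigned long nr_demoted = 0;
		long rc = 0;

		/* Synchronize with other sysfs operations modifying huge pages. */
		mutex_lock(&h->resize_lock);
		spin_lock_irq(&hugetlb_lock);

		while (nr_to_demote) {
			unsigned long nr_available;

			/*
			 * Check for available pages to demote each time through
			 * the loop as demote_pool_huge_page will drop hugetlb_lock.
			 */
			if (nid != NUMA_NO_NODE)
				nr_available = h->free_huge_pages_node[nid];
			else
				nr_available = h->free_huge_pages;
			nr_available -= h->resv_huge_pages;
			if (!nr_available)
				break;

			rc = demote_pool_huge_page(h, nodes_allowed, nr_to_demote);
			if (rc < 0)
				break;

			nr_to_demote -= rc;
			nr_demoted += rc;
		}

		spin_unlock_irq(&hugetlb_lock);
		mutex_unlock(&h->resize_lock);

		return rc < 0 ? rc : nr_demoted;
	}

That way, demote_store() only parses the input and calls that function, and
demote_free_hugetlb_folios()/demote_pool_huge_page() and the low-level
helpers can stay static in hugetlb.c instead of being exported through the
internal header.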
--
Cheers
David