Message-ID: <a5349a28-bf32-4b26-a55f-af53f4c225b8@huawei.com>
Date: Wed, 15 Oct 2025 17:08:06 +0800
From: mawupeng <mawupeng1@...wei.com>
To: <akpm@...ux-foundation.org>, <david@...hat.com>, <jackmanb@...gle.com>,
<hannes@...xchg.org>, <zhengqi.arch@...edance.com>, <shakeel.butt@...ux.dev>
CC: <mawupeng1@...wei.com>, <linux-mm@...ck.org>,
<linux-kernel@...r.kernel.org>, <lorenzo.stoakes@...cle.com>,
<Liam.Howlett@...cle.com>, <vbabka@...e.cz>, <rppt@...nel.org>,
<surenb@...gle.com>, <mhocko@...e.com>, <ziy@...dia.com>,
<axelrasmussen@...gle.com>, <yuanchu@...gle.com>, <weixugc@...gle.com>
Subject: Re: [RFC PATCH] mm: vmscan: wakeup kswapd during node_reclaim
Hi reviewers,

Kindly ping; CC'ing a few more reviewers for visibility.
On 2025/10/11 14:20, Wupeng Ma wrote:
> During testing, we observed that memory allocation with node_reclaim_mode
> enabled becomes extremely slow when a large allocation is attempted on a
> node whose memory is mostly occupied by clean page cache.
>
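Some added context for reviewers: node_reclaim_mode is controlled by the
vm.zone_reclaim_mode sysctl. Per Documentation/admin-guide/sysctl/vm.rst, its
bits select how aggressively a node reclaims locally before falling back to
other nodes; roughly (a sketch of the documented bit values, matching the
RECLAIM_WRITE/RECLAIM_UNMAP symbols used in the scan_control setup below):

    /* Bits of vm.zone_reclaim_mode, per sysctl/vm.rst (sketch) */
    #define RECLAIM_ZONE   (1 << 0)  /* reclaim from the local node first */
    #define RECLAIM_WRITE  (1 << 1)  /* allow writing out dirty pages */
    #define RECLAIM_UNMAP  (1 << 2)  /* allow unmapping mapped pages */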
> The slowness arises because node reclaim only triggers direct-reclaim-like
> behavior, reclaiming just 32 pages at a time, without ever waking kswapd,
> even when the watermark levels and alloc_flags already satisfy the
> conditions for waking it.
>
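For context, the 32-page batch mentioned above is SWAP_CLUSTER_MAX. An
abridged sketch of how the current node_reclaim() sizes its scan_control
(condensed from mm/vmscan.c, not a complete listing):

    /*
     * Abridged from node_reclaim(): for an order-0 allocation nr_pages
     * is 1, so each pass asks for only SWAP_CLUSTER_MAX (32) pages and
     * never wakes kswapd, no matter how far below the watermark we are.
     */
    const unsigned long nr_pages = 1 << order;
    struct scan_control sc = {
        .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
        .gfp_mask = current_gfp_context(gfp_mask),
        .order = order,
    };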
> This patch wakes kswapd during node reclaim, allowing background reclaim
> to bring free memory up to the high watermark and avoiding repeated
> node-reclaim overhead.
>
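As a reminder of why a single wakeup helps so much: kswapd keeps reclaiming
until the high watermark is restored, so the cost is paid once in the
background instead of 32 pages at a time in the allocation path. A rough
mental model (simplified, not actual kernel code):

    /*
     * Per-zone watermark ordering:  min < low < high
     *   free <  low   -> kswapd is woken
     *   free <= min   -> allocations must direct-reclaim (may stall)
     *   free >= high  -> kswapd goes back to sleep
     */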
> Signed-off-by: Wupeng Ma <mawupeng1@...wei.com>
> ---
> mm/internal.h | 14 ++++++++------
> mm/page_alloc.c | 6 +++++-
> mm/vmscan.c | 19 +++++++++++++++++--
> 3 files changed, 30 insertions(+), 9 deletions(-)
>
> diff --git a/mm/internal.h b/mm/internal.h
> index 1561fc2ff5b8..5303123dd0a8 100644
> --- a/mm/internal.h
> +++ b/mm/internal.h
> @@ -1196,21 +1196,23 @@ static inline void mminit_verify_zonelist(void)
> }
> #endif /* CONFIG_DEBUG_MEMORY_INIT */
>
> -#define NODE_RECLAIM_NOSCAN -2
> -#define NODE_RECLAIM_FULL -1
> -#define NODE_RECLAIM_SOME 0
> -#define NODE_RECLAIM_SUCCESS 1
> +#define NODE_RECLAIM_NOSCAN -2
> +#define NODE_RECLAIM_FULL -1
> +#define NODE_RECLAIM_SOME 0
> +#define NODE_RECLAIM_SUCCESS 1
> +#define NODE_RECLAIM_KSWAPD_SUCCESS 2
>
> #ifdef CONFIG_NUMA
> extern int node_reclaim_mode;
>
> -extern int node_reclaim(struct pglist_data *, gfp_t, unsigned int);
> +int node_reclaim(struct pglist_data *pgdat, gfp_t mask, unsigned int order,
> + int alloc_flags, struct zone *zone);
> extern int find_next_best_node(int node, nodemask_t *used_node_mask);
> #else
> #define node_reclaim_mode 0
>
> static inline int node_reclaim(struct pglist_data *pgdat, gfp_t mask,
> - unsigned int order)
> + unsigned int order, int alloc_flags, struct zone *zone)
> {
> return NODE_RECLAIM_NOSCAN;
> }
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index 600d9e981c23..2472000cab78 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -3859,7 +3859,8 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
> !zone_allows_reclaim(zonelist_zone(ac->preferred_zoneref), zone))
> continue;
>
> - ret = node_reclaim(zone->zone_pgdat, gfp_mask, order);
> + ret = node_reclaim(zone->zone_pgdat, gfp_mask, order,
> + alloc_flags, zone);
> switch (ret) {
> case NODE_RECLAIM_NOSCAN:
> /* did not scan */
> @@ -3867,6 +3868,9 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
> case NODE_RECLAIM_FULL:
> /* scanned but unreclaimable */
> continue;
> + case NODE_RECLAIM_KSWAPD_SUCCESS:
> + /* kswapd reclaimed enough */
> + goto try_this_zone;
> default:
> /* did we reclaim enough */
> if (zone_watermark_ok(zone, order, mark,
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index b2fc8b626d3d..ebee8b6330a8 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -7680,9 +7680,11 @@ static unsigned long __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask,
> return sc->nr_reclaimed;
> }
>
> -int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order)
> +int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order,
> + int alloc_flags, struct zone *zone)
> {
> int ret;
> + enum zone_type highest_zoneidx = gfp_zone(gfp_mask);
> /* Minimum pages needed in order to stay on node */
> const unsigned long nr_pages = 1 << order;
> struct scan_control sc = {
> @@ -7693,7 +7695,7 @@ int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order)
> .may_writepage = !!(node_reclaim_mode & RECLAIM_WRITE),
> .may_unmap = !!(node_reclaim_mode & RECLAIM_UNMAP),
> .may_swap = 1,
> - .reclaim_idx = gfp_zone(gfp_mask),
> + .reclaim_idx = highest_zoneidx,
> };
>
> /*
> @@ -7729,6 +7731,19 @@ int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order)
> if (test_and_set_bit_lock(PGDAT_RECLAIM_LOCKED, &pgdat->flags))
> return NODE_RECLAIM_NOSCAN;
>
> + if (alloc_flags & ALLOC_KSWAPD) {
> + unsigned long mark;
> +
> + wakeup_kswapd(zone, gfp_mask, order, highest_zoneidx);
> +
> + mark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK);
> + if (zone_watermark_ok(zone, order, mark, highest_zoneidx,
> + alloc_flags)) {
> + clear_bit_unlock(PGDAT_RECLAIM_LOCKED, &pgdat->flags);
> + return NODE_RECLAIM_KSWAPD_SUCCESS;
> + }
> + }
> +
> ret = __node_reclaim(pgdat, gfp_mask, nr_pages, &sc) >= nr_pages;
> clear_bit_unlock(PGDAT_RECLAIM_LOCKED, &pgdat->flags);
>