Message-ID: <a5349a28-bf32-4b26-a55f-af53f4c225b8@huawei.com>
Date: Wed, 15 Oct 2025 17:08:06 +0800
From: mawupeng <mawupeng1@...wei.com>
To: <akpm@...ux-foundation.org>, <david@...hat.com>, <jackmanb@...gle.com>,
<hannes@...xchg.org>, <zhengqi.arch@...edance.com>, <shakeel.butt@...ux.dev>
CC: <mawupeng1@...wei.com>, <linux-mm@...ck.org>,
<linux-kernel@...r.kernel.org>, <lorenzo.stoakes@...cle.com>,
<Liam.Howlett@...cle.com>, <vbabka@...e.cz>, <rppt@...nel.org>,
<surenb@...gle.com>, <mhocko@...e.com>, <ziy@...dia.com>,
<axelrasmussen@...gle.com>, <yuanchu@...gle.com>, <weixugc@...gle.com>
Subject: Re: [RFC PATCH] mm: vmscan: wakeup kswapd during node_reclaim
Hi reviewers,

Kindly ping; CC'ing a few more reviewers for visibility.
On 2025/10/11 14:20, Wupeng Ma wrote:
> During testing, we observed that memory allocation with node_reclaim_mode
> enabled becomes extremely slow when a large allocation is attempted on a
> node whose memory is mostly occupied by clean page cache.
>
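Some added context for reviewers: node_reclaim_mode is controlled by the
vm.zone_reclaim_mode sysctl. Per Documentation/admin-guide/sysctl/vm.rst, its
bits select how aggressively a node reclaims locally before falling back to
other nodes; roughly (a sketch of the documented bit values, matching the
RECLAIM_WRITE/RECLAIM_UNMAP symbols used in the scan_control setup below):

    /* Bits of vm.zone_reclaim_mode, per sysctl/vm.rst (sketch) */
    #define RECLAIM_ZONE   (1 << 0)  /* reclaim from the local node first */
    #define RECLAIM_WRITE  (1 << 1)  /* allow writing out dirty pages */
    #define RECLAIM_UNMAP  (1 << 2)  /* allow unmapping mapped pages */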
> The slowness arises because node reclaim only triggers direct-reclaim-like
> behavior, reclaiming just 32 pages at a time, without ever waking kswapd,
> even when the watermark levels and alloc_flags already satisfy the
> conditions for waking it.
>
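For context, the 32-page batch mentioned above is SWAP_CLUSTER_MAX. An
abridged sketch of how the current node_reclaim() sizes its scan_control
(condensed from mm/vmscan.c, not a complete listing):

    /*
     * Abridged from node_reclaim(): for an order-0 allocation nr_pages
     * is 1, so each pass asks for only SWAP_CLUSTER_MAX (32) pages and
     * never wakes kswapd, no matter how far below the watermark we are.
     */
    const unsigned long nr_pages = 1 << order;
    struct scan_control sc = {
        .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
        .gfp_mask = current_gfp_context(gfp_mask),
        .order = order,
    };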
> This patch wakes kswapd during node reclaim, allowing background reclaim
> to bring free memory up to the high watermark and avoiding repeated
> node-reclaim overhead.
>
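As a reminder of why a single wakeup helps so much: kswapd keeps reclaiming
until the high watermark is restored, so the cost is paid once in the
background instead of 32 pages at a time in the allocation path. A rough
mental model (simplified, not actual kernel code):

    /*
     * Per-zone watermark ordering:  min < low < high
     *   free <  low   -> kswapd is woken
     *   free <= min   -> allocations must direct-reclaim (may stall)
     *   free >= high  -> kswapd goes back to sleep
     */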
> Signed-off-by: Wupeng Ma <mawupeng1@...wei.com>
> ---
> mm/internal.h | 14 ++++++++------
> mm/page_alloc.c | 6 +++++-
> mm/vmscan.c | 19 +++++++++++++++++--
> 3 files changed, 30 insertions(+), 9 deletions(-)
>
> diff --git a/mm/internal.h b/mm/internal.h
> index 1561fc2ff5b8..5303123dd0a8 100644
> --- a/mm/internal.h
> +++ b/mm/internal.h
> @@ -1196,21 +1196,23 @@ static inline void mminit_verify_zonelist(void)
> }
> #endif /* CONFIG_DEBUG_MEMORY_INIT */
>
> -#define NODE_RECLAIM_NOSCAN -2
> -#define NODE_RECLAIM_FULL -1
> -#define NODE_RECLAIM_SOME 0
> -#define NODE_RECLAIM_SUCCESS 1
> +#define NODE_RECLAIM_NOSCAN -2
> +#define NODE_RECLAIM_FULL -1
> +#define NODE_RECLAIM_SOME 0
> +#define NODE_RECLAIM_SUCCESS 1
> +#define NODE_RECLAIM_KSWAPD_SUCCESS 2
>
> #ifdef CONFIG_NUMA
> extern int node_reclaim_mode;
>
> -extern int node_reclaim(struct pglist_data *, gfp_t, unsigned int);
> +int node_reclaim(struct pglist_data *pgdat, gfp_t mask, unsigned int order,
> + int alloc_flags, struct zone *zone);
> extern int find_next_best_node(int node, nodemask_t *used_node_mask);
> #else
> #define node_reclaim_mode 0
>
> static inline int node_reclaim(struct pglist_data *pgdat, gfp_t mask,
> - unsigned int order)
> + unsigned int order, int alloc_flags, struct zone *zone)
> {
> return NODE_RECLAIM_NOSCAN;
> }
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index 600d9e981c23..2472000cab78 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -3859,7 +3859,8 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
> !zone_allows_reclaim(zonelist_zone(ac->preferred_zoneref), zone))
> continue;
>
> - ret = node_reclaim(zone->zone_pgdat, gfp_mask, order);
> + ret = node_reclaim(zone->zone_pgdat, gfp_mask, order,
> + alloc_flags, zone);
> switch (ret) {
> case NODE_RECLAIM_NOSCAN:
> /* did not scan */
> @@ -3867,6 +3868,9 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
> case NODE_RECLAIM_FULL:
> /* scanned but unreclaimable */
> continue;
> + case NODE_RECLAIM_KSWAPD_SUCCESS:
> + /* kswapd reclaimed enough */
> + goto try_this_zone;
> default:
> /* did we reclaim enough */
> if (zone_watermark_ok(zone, order, mark,
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index b2fc8b626d3d..ebee8b6330a8 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -7680,9 +7680,11 @@ static unsigned long __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask,
> return sc->nr_reclaimed;
> }
>
> -int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order)
> +int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order,
> + int alloc_flags, struct zone *zone)
> {
> int ret;
> + enum zone_type highest_zoneidx = gfp_zone(gfp_mask);
> /* Minimum pages needed in order to stay on node */
> const unsigned long nr_pages = 1 << order;
> struct scan_control sc = {
> @@ -7693,7 +7695,7 @@ int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order)
> .may_writepage = !!(node_reclaim_mode & RECLAIM_WRITE),
> .may_unmap = !!(node_reclaim_mode & RECLAIM_UNMAP),
> .may_swap = 1,
> - .reclaim_idx = gfp_zone(gfp_mask),
> + .reclaim_idx = highest_zoneidx,
> };
>
> /*
> @@ -7729,6 +7731,19 @@ int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order)
> if (test_and_set_bit_lock(PGDAT_RECLAIM_LOCKED, &pgdat->flags))
> return NODE_RECLAIM_NOSCAN;
>
> + if (alloc_flags & ALLOC_KSWAPD) {
> + unsigned long mark;
> +
> + wakeup_kswapd(zone, gfp_mask, order, highest_zoneidx);
> +
> + mark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK);
> + if (zone_watermark_ok(zone, order, mark, highest_zoneidx,
> + alloc_flags)) {
> + clear_bit_unlock(PGDAT_RECLAIM_LOCKED, &pgdat->flags);
> + return NODE_RECLAIM_KSWAPD_SUCCESS;
> + }
> + }
> +
> ret = __node_reclaim(pgdat, gfp_mask, nr_pages, &sc) >= nr_pages;
> clear_bit_unlock(PGDAT_RECLAIM_LOCKED, &pgdat->flags);
>