Message-ID: <9004c0071bcbe8df3dd2ccc1451bd8bb2eb6d79c.camel@kernel.org>
Date: Mon, 17 Feb 2025 09:02:48 -0500
From: Jeff Layton <jlayton@...nel.org>
To: Yunsheng Lin <linyunsheng@...wei.com>, Yishai Hadas
<yishaih@...dia.com>, Jason Gunthorpe <jgg@...pe.ca>, Shameer Kolothum
<shameerali.kolothum.thodi@...wei.com>, Kevin Tian <kevin.tian@...el.com>,
Alex Williamson <alex.williamson@...hat.com>, Chris Mason <clm@...com>,
Josef Bacik <josef@...icpanda.com>, David Sterba <dsterba@...e.com>, Gao
Xiang <xiang@...nel.org>, Chao Yu <chao@...nel.org>, Yue Hu
<zbestahu@...il.com>, Jeffle Xu <jefflexu@...ux.alibaba.com>, Sandeep
Dhavale <dhavale@...gle.com>, Carlos Maiolino <cem@...nel.org>, "Darrick J.
Wong" <djwong@...nel.org>, Andrew Morton <akpm@...ux-foundation.org>,
Jesper Dangaard Brouer <hawk@...nel.org>, Ilias Apalodimas
<ilias.apalodimas@...aro.org>, "David S. Miller" <davem@...emloft.net>,
Eric Dumazet <edumazet@...gle.com>, Jakub Kicinski <kuba@...nel.org>,
Paolo Abeni <pabeni@...hat.com>, Simon Horman <horms@...nel.org>, Trond
Myklebust <trondmy@...nel.org>, Anna Schumaker <anna@...nel.org>, Chuck
Lever <chuck.lever@...cle.com>, Neil Brown <neilb@...e.de>, Olga
Kornievskaia <okorniev@...hat.com>, Dai Ngo <Dai.Ngo@...cle.com>, Tom
Talpey <tom@...pey.com>
Cc: Luiz Capitulino <luizcap@...hat.com>, Mel Gorman
<mgorman@...hsingularity.net>, kvm@...r.kernel.org,
virtualization@...ts.linux.dev, linux-kernel@...r.kernel.org,
linux-btrfs@...r.kernel.org, linux-erofs@...ts.ozlabs.org,
linux-xfs@...r.kernel.org, linux-mm@...ck.org, netdev@...r.kernel.org,
linux-nfs@...r.kernel.org
Subject: Re: [RFC] mm: alloc_pages_bulk: remove assumption of populating
only NULL elements

On Mon, 2025-02-17 at 20:31 +0800, Yunsheng Lin wrote:
> As mentioned in [1], it seems odd to check for NULL elements in
> the middle of bulk page allocation, and it seems the caller can
> do a better job of bulk allocating pages into a whole array
> sequentially, without having to check for NULL elements before
> doing the bulk allocation.
>
> Removing the above check also enables the caller to skip zeroing
> the array before calling the bulk page allocation API, which
> gives about a 1~2 ns performance improvement for the
> time_bench_page_pool03_slow() test case for page_pool on an x86
> VM system. This reduces some of the performance impact of fixing
> the DMA API misuse problem in [2]: performance improves from
> 87.886 ns to 86.429 ns.
>
> 1. https://lore.kernel.org/all/bd8c2f5c-464d-44ab-b607-390a87ea4cd5@huawei.com/
> 2. https://lore.kernel.org/all/20250212092552.1779679-1-linyunsheng@huawei.com/
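
To make sure I'm reading the new contract correctly: with this change,
alloc_pages_bulk() always fills page_array[0..ret) of whatever array it
is handed, so a caller that wants to cope with partial allocation passes
the remaining count and an offset into the array itself. Something like
the untested sketch below (the helper name is made up, but the loop
mirrors the btrfs and sunrpc conversions in this patch):

	/*
	 * Untested sketch, not actual kernel code: retry until the
	 * whole array is populated, bailing out and cleaning up if the
	 * allocator makes no progress at all.
	 */
	static int fill_page_array(struct page **page_array,
				   unsigned int nr_pages)
	{
		unsigned int filled = 0;

		while (filled < nr_pages) {
			unsigned int ret;

			ret = alloc_pages_bulk(GFP_KERNEL,
					       nr_pages - filled,
					       page_array + filled);
			if (!ret) {
				/* No progress: free what we got. */
				while (filled)
					__free_page(page_array[--filled]);
				return -ENOMEM;
			}
			filled += ret;
		}

		return 0;
	}

If that's the intent, it also explains why the memset()/NULL-zeroing of
the array before each call can go away, which is where the win comes
from.
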
> CC: Jesper Dangaard Brouer <hawk@...nel.org>
> CC: Luiz Capitulino <luizcap@...hat.com>
> CC: Mel Gorman <mgorman@...hsingularity.net>
> Signed-off-by: Yunsheng Lin <linyunsheng@...wei.com>
> ---
> drivers/vfio/pci/virtio/migrate.c | 2 --
> fs/btrfs/extent_io.c | 8 +++++---
> fs/erofs/zutil.c | 12 ++++++------
> fs/xfs/xfs_buf.c | 9 +++++----
> mm/page_alloc.c | 32 +++++--------------------------
> net/core/page_pool.c | 3 ---
> net/sunrpc/svc_xprt.c | 9 +++++----
> 7 files changed, 26 insertions(+), 49 deletions(-)
>
> diff --git a/drivers/vfio/pci/virtio/migrate.c b/drivers/vfio/pci/virtio/migrate.c
> index ba92bb4e9af9..9f003a237dec 100644
> --- a/drivers/vfio/pci/virtio/migrate.c
> +++ b/drivers/vfio/pci/virtio/migrate.c
> @@ -91,8 +91,6 @@ static int virtiovf_add_migration_pages(struct virtiovf_data_buffer *buf,
> if (ret)
> goto err_append;
> buf->allocated_length += filled * PAGE_SIZE;
> - /* clean input for another bulk allocation */
> - memset(page_list, 0, filled * sizeof(*page_list));
> to_fill = min_t(unsigned int, to_alloc,
> PAGE_SIZE / sizeof(*page_list));
> } while (to_alloc > 0);
> diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
> index b2fae67f8fa3..d0466d795cca 100644
> --- a/fs/btrfs/extent_io.c
> +++ b/fs/btrfs/extent_io.c
> @@ -626,10 +626,12 @@ int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array,
> unsigned int allocated;
>
> for (allocated = 0; allocated < nr_pages;) {
> - unsigned int last = allocated;
> + unsigned int new_allocated;
>
> - allocated = alloc_pages_bulk(gfp, nr_pages, page_array);
> - if (unlikely(allocated == last)) {
> + new_allocated = alloc_pages_bulk(gfp, nr_pages - allocated,
> + page_array + allocated);
> + allocated += new_allocated;
> + if (unlikely(!new_allocated)) {
> /* No progress, fail and do cleanup. */
> for (int i = 0; i < allocated; i++) {
> __free_page(page_array[i]);
> diff --git a/fs/erofs/zutil.c b/fs/erofs/zutil.c
> index 55ff2ab5128e..1c50b5e27371 100644
> --- a/fs/erofs/zutil.c
> +++ b/fs/erofs/zutil.c
> @@ -85,13 +85,13 @@ int z_erofs_gbuf_growsize(unsigned int nrpages)
>
> for (j = 0; j < gbuf->nrpages; ++j)
> tmp_pages[j] = gbuf->pages[j];
> - do {
> - last = j;
> - j = alloc_pages_bulk(GFP_KERNEL, nrpages,
> - tmp_pages);
> - if (last == j)
> +
> + for (last = j; last < nrpages; last += j) {
> + j = alloc_pages_bulk(GFP_KERNEL, nrpages - last,
> + tmp_pages + last);
> + if (!j)
> goto out;
> - } while (j != nrpages);
> + }
>
> ptr = vmap(tmp_pages, nrpages, VM_MAP, PAGE_KERNEL);
> if (!ptr)
> diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
> index 15bb790359f8..9e1ce0ab9c35 100644
> --- a/fs/xfs/xfs_buf.c
> +++ b/fs/xfs/xfs_buf.c
> @@ -377,16 +377,17 @@ xfs_buf_alloc_pages(
> * least one extra page.
> */
> for (;;) {
> - long last = filled;
> + long alloc;
>
> - filled = alloc_pages_bulk(gfp_mask, bp->b_page_count,
> - bp->b_pages);
> + alloc = alloc_pages_bulk(gfp_mask, bp->b_page_count - refill,
> + bp->b_pages + refill);
> + refill += alloc;
> if (filled == bp->b_page_count) {
> XFS_STATS_INC(bp->b_mount, xb_page_found);
> break;
> }
>
> - if (filled != last)
> + if (alloc)
> continue;
>
> if (flags & XBF_READ_AHEAD) {
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index 579789600a3c..e0309532b6c4 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -4541,9 +4541,6 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order,
> * This is a batched version of the page allocator that attempts to
> * allocate nr_pages quickly. Pages are added to the page_array.
> *
> - * Note that only NULL elements are populated with pages and nr_pages
> - * is the maximum number of pages that will be stored in the array.
> - *
> * Returns the number of pages in the array.
> */
> unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int preferred_nid,
> @@ -4559,29 +4556,18 @@ unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int preferred_nid,
> struct alloc_context ac;
> gfp_t alloc_gfp;
> unsigned int alloc_flags = ALLOC_WMARK_LOW;
> - int nr_populated = 0, nr_account = 0;
> -
> - /*
> - * Skip populated array elements to determine if any pages need
> - * to be allocated before disabling IRQs.
> - */
> - while (nr_populated < nr_pages && page_array[nr_populated])
> - nr_populated++;
> + int nr_populated = 0;
>
> /* No pages requested? */
> if (unlikely(nr_pages <= 0))
> goto out;
>
> - /* Already populated array? */
> - if (unlikely(nr_pages - nr_populated == 0))
> - goto out;
> -
> /* Bulk allocator does not support memcg accounting. */
> if (memcg_kmem_online() && (gfp & __GFP_ACCOUNT))
> goto failed;
>
> /* Use the single page allocator for one page. */
> - if (nr_pages - nr_populated == 1)
> + if (nr_pages == 1)
> goto failed;
>
> #ifdef CONFIG_PAGE_OWNER
> @@ -4653,24 +4639,16 @@ unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int preferred_nid,
> /* Attempt the batch allocation */
> pcp_list = &pcp->lists[order_to_pindex(ac.migratetype, 0)];
> while (nr_populated < nr_pages) {
> -
> - /* Skip existing pages */
> - if (page_array[nr_populated]) {
> - nr_populated++;
> - continue;
> - }
> -
> page = __rmqueue_pcplist(zone, 0, ac.migratetype, alloc_flags,
> pcp, pcp_list);
> if (unlikely(!page)) {
> /* Try and allocate at least one page */
> - if (!nr_account) {
> + if (!nr_populated) {
> pcp_spin_unlock(pcp);
> goto failed_irq;
> }
> break;
> }
> - nr_account++;
>
> prep_new_page(page, 0, gfp, 0);
> set_page_refcounted(page);
> @@ -4680,8 +4658,8 @@ unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int preferred_nid,
> pcp_spin_unlock(pcp);
> pcp_trylock_finish(UP_flags);
>
> - __count_zid_vm_events(PGALLOC, zone_idx(zone), nr_account);
> - zone_statistics(zonelist_zone(ac.preferred_zoneref), zone, nr_account);
> + __count_zid_vm_events(PGALLOC, zone_idx(zone), nr_populated);
> + zone_statistics(zonelist_zone(ac.preferred_zoneref), zone, nr_populated);
>
> out:
> return nr_populated;
> diff --git a/net/core/page_pool.c b/net/core/page_pool.c
> index f5e908c9e7ad..ae9e8c78e4bb 100644
> --- a/net/core/page_pool.c
> +++ b/net/core/page_pool.c
> @@ -536,9 +536,6 @@ static noinline netmem_ref __page_pool_alloc_pages_slow(struct page_pool *pool,
> if (unlikely(pool->alloc.count > 0))
> return pool->alloc.cache[--pool->alloc.count];
>
> - /* Mark empty alloc.cache slots "empty" for alloc_pages_bulk */
> - memset(&pool->alloc.cache, 0, sizeof(void *) * bulk);
> -
> nr_pages = alloc_pages_bulk_node(gfp, pool->p.nid, bulk,
> (struct page **)pool->alloc.cache);
> if (unlikely(!nr_pages))
> diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c
> index ae25405d8bd2..6321a4d2f2be 100644
> --- a/net/sunrpc/svc_xprt.c
> +++ b/net/sunrpc/svc_xprt.c
> @@ -663,9 +663,10 @@ static bool svc_alloc_arg(struct svc_rqst *rqstp)
> pages = RPCSVC_MAXPAGES;
> }
>
> - for (filled = 0; filled < pages; filled = ret) {
> - ret = alloc_pages_bulk(GFP_KERNEL, pages, rqstp->rq_pages);
> - if (ret > filled)
> + for (filled = 0; filled < pages; filled += ret) {
> + ret = alloc_pages_bulk(GFP_KERNEL, pages - filled,
> + rqstp->rq_pages + filled);
> + if (ret)
> /* Made progress, don't sleep yet */
> continue;
>
> @@ -674,7 +675,7 @@ static bool svc_alloc_arg(struct svc_rqst *rqstp)
> set_current_state(TASK_RUNNING);
> return false;
> }
> - trace_svc_alloc_arg_err(pages, ret);
> + trace_svc_alloc_arg_err(pages, filled);
> memalloc_retry_wait(GFP_KERNEL);
> }
> rqstp->rq_page_end = &rqstp->rq_pages[pages];

I agree that the API is cumbersome and weird as it is today. For the
sunrpc parts:

Acked-by: Jeff Layton <jlayton@...nel.org>