Message-ID: <640946c8-237d-40de-b64e-0f8fd8f1a600@kernel.org>
Date: Fri, 21 Feb 2025 11:12:55 +0100
From: Jesper Dangaard Brouer <hawk@...nel.org>
To: Yunsheng Lin <linyunsheng@...wei.com>, davem@...emloft.net,
kuba@...nel.org, pabeni@...hat.com
Cc: zhangkun09@...wei.com, liuyonglong@...wei.com, fanghaiqing@...wei.com,
Robin Murphy <robin.murphy@....com>,
Alexander Duyck <alexander.duyck@...il.com>, IOMMU <iommu@...ts.linux.dev>,
Ilias Apalodimas <ilias.apalodimas@...aro.org>,
Eric Dumazet <edumazet@...gle.com>, Simon Horman <horms@...nel.org>,
netdev@...r.kernel.org, linux-kernel@...r.kernel.org
Subject: Re: [PATCH net-next v9 3/4] page_pool: support unlimited number of
inflight pages
See below; I request changes to add memory accounting for these item_blocks.

On 12/02/2025 10.25, Yunsheng Lin wrote:
> Currently a fixed size of pre-allocated memory is used to
> keep track of the inflight pages, in order to use the DMA
> API correctly.
>
> As mentioned [1], the number of inflight pages can be up to
> 73203 depending on the use cases. Allocate memory dynamically
> to keep track of the inflight pages when pre-allocated memory
> runs out.
>
> The overhead of using dynamic memory allocation is about 10ns~
> 20ns, which causes 5%~10% performance degradation for the test
> case of time_bench_page_pool03_slow() in [2].
>
> 1. https://lore.kernel.org/all/b8b7818a-e44b-45f5-91c2-d5eceaa5dd5b@kernel.org/
> 2. https://github.com/netoptimizer/prototype-kernel
> CC: Robin Murphy <robin.murphy@....com>
> CC: Alexander Duyck <alexander.duyck@...il.com>
> CC: IOMMU <iommu@...ts.linux.dev>
> Fixes: f71fec47c2df ("page_pool: make sure struct device is stable")
> Signed-off-by: Yunsheng Lin <linyunsheng@...wei.com>
> ---
> include/net/page_pool/types.h | 10 +++++
> net/core/page_pool.c | 80 ++++++++++++++++++++++++++++++++++-
> 2 files changed, 89 insertions(+), 1 deletion(-)
>
> diff --git a/include/net/page_pool/types.h b/include/net/page_pool/types.h
> index c131e2725e9a..c8c47ca67f4b 100644
> --- a/include/net/page_pool/types.h
> +++ b/include/net/page_pool/types.h
> @@ -103,6 +103,7 @@ struct page_pool_params {
> * @waive: pages obtained from the ptr ring that cannot be added to
> * the cache due to a NUMA mismatch
> * @item_fast_empty: pre-allocated item cache is empty
> + * @item_slow_failed: failed to allocate memory for item_block
> */
> struct page_pool_alloc_stats {
> u64 fast;
> @@ -112,6 +113,7 @@ struct page_pool_alloc_stats {
> u64 refill;
> u64 waive;
> u64 item_fast_empty;
> + u64 item_slow_failed;
> };
>
> /**
> @@ -159,9 +161,16 @@ struct page_pool_item {
> struct page_pool_item_block {
> struct page_pool *pp;
> struct list_head list;
> + unsigned int flags;
> + refcount_t ref;
> struct page_pool_item items[];
> };
>
> +struct page_pool_slow_item {
> + struct page_pool_item_block *block;
> + unsigned int next_to_use;
> +};
> +
> /* Ensure the offset of 'pp' field for both 'page_pool_item_block' and
> * 'netmem_item_block' are the same.
> */
> @@ -191,6 +200,7 @@ struct page_pool {
> int cpuid;
> u32 pages_state_hold_cnt;
> struct llist_head hold_items;
> + struct page_pool_slow_item slow_items;
>
> bool has_init_callback:1; /* slow::init_callback is set */
> bool dma_map:1; /* Perform DMA mapping */
> diff --git a/net/core/page_pool.c b/net/core/page_pool.c
> index 508dd51b1a9a..3109bf015225 100644
> --- a/net/core/page_pool.c
> +++ b/net/core/page_pool.c
> @@ -64,6 +64,7 @@ static const char pp_stats[][ETH_GSTRING_LEN] = {
> "rx_pp_alloc_refill",
> "rx_pp_alloc_waive",
> "rx_pp_alloc_item_fast_empty",
> + "rx_pp_alloc_item_slow_failed",
> "rx_pp_recycle_cached",
> "rx_pp_recycle_cache_full",
> "rx_pp_recycle_ring",
> @@ -98,6 +99,7 @@ bool page_pool_get_stats(const struct page_pool *pool,
> stats->alloc_stats.refill += pool->alloc_stats.refill;
> stats->alloc_stats.waive += pool->alloc_stats.waive;
> stats->alloc_stats.item_fast_empty += pool->alloc_stats.item_fast_empty;
> + stats->alloc_stats.item_slow_failed += pool->alloc_stats.item_slow_failed;
>
> for_each_possible_cpu(cpu) {
> const struct page_pool_recycle_stats *pcpu =
> @@ -144,6 +146,7 @@ u64 *page_pool_ethtool_stats_get(u64 *data, const void *stats)
> *data++ = pool_stats->alloc_stats.refill;
> *data++ = pool_stats->alloc_stats.waive;
> *data++ = pool_stats->alloc_stats.item_fast_empty;
> + *data++ = pool_stats->alloc_stats.item_slow_failed;
> *data++ = pool_stats->recycle_stats.cached;
> *data++ = pool_stats->recycle_stats.cache_full;
> *data++ = pool_stats->recycle_stats.ring;
> @@ -430,6 +433,7 @@ static void page_pool_item_uninit(struct page_pool *pool)
> struct page_pool_item_block,
> list);
> list_del(&block->list);
> + WARN_ON(refcount_read(&block->ref));
> put_page(virt_to_page(block));
> }
> }
> @@ -513,10 +517,43 @@ static struct page_pool_item *page_pool_fast_item_alloc(struct page_pool *pool)
> return llist_entry(first, struct page_pool_item, lentry);
> }
>
> +#define PAGE_POOL_SLOW_ITEM_BLOCK_BIT BIT(0)
> +static struct page_pool_item *page_pool_slow_item_alloc(struct page_pool *pool)
> +{
> + if (unlikely(!pool->slow_items.block ||
> + pool->slow_items.next_to_use >= ITEMS_PER_PAGE)) {
> + struct page_pool_item_block *block;
> + struct page *page;
> +
> + page = alloc_pages_node(pool->p.nid, GFP_ATOMIC | __GFP_NOWARN |
> + __GFP_ZERO, 0);
> + if (!page) {
> + alloc_stat_inc(pool, item_slow_failed);
> + return NULL;
> + }
We also need stats on how many pages we allocate for these item_blocks
(and later free). This new scheme of keeping track of all pages
allocated via page_pool is obviously going to consume more memory, and
I want to be able to find out how much memory a page_pool is consuming.
(E.g. Kuba added a nice interface for querying inflight pages, even
though that is kept as two different counters.)
What I worry about is fragmentation happening inside these
item_blocks. (I hope you understand what I mean by fragmentation; if
not, let me know.)
Could you explain how the code handles or avoids fragmentation?
> +
> + block = page_address(page);
> + block->pp = pool;
> + block->flags |= PAGE_POOL_SLOW_ITEM_BLOCK_BIT;
> + refcount_set(&block->ref, ITEMS_PER_PAGE);
> + pool->slow_items.block = block;
> + pool->slow_items.next_to_use = 0;
> +
> + spin_lock_bh(&pool->item_lock);
> + list_add(&block->list, &pool->item_blocks);
> + spin_unlock_bh(&pool->item_lock);
> + }
> +
> + return &pool->slow_items.block->items[pool->slow_items.next_to_use++];
> +}
> +
> static bool page_pool_set_item_info(struct page_pool *pool, netmem_ref netmem)
> {
> struct page_pool_item *item = page_pool_fast_item_alloc(pool);
>
> + if (unlikely(!item))
> + item = page_pool_slow_item_alloc(pool);
> +
> if (likely(item)) {
> item->pp_netmem = netmem;
> page_pool_item_set_used(item);
> @@ -526,6 +563,37 @@ static bool page_pool_set_item_info(struct page_pool *pool, netmem_ref netmem)
> return !!item;
> }
>
> +static void __page_pool_slow_item_free(struct page_pool *pool,
> + struct page_pool_item_block *block)
> +{
> + spin_lock_bh(&pool->item_lock);
> + list_del(&block->list);
> + spin_unlock_bh(&pool->item_lock);
> +
> + put_page(virt_to_page(block));
Also add a counter here for the block page being released.
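E.g. the free side of the (untested) sketch above; whether it fits
better in the alloc stats or the per-CPU recycle stats is your call:

	alloc_stat_inc(pool, item_block_free);
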
--Jesper
> +}
> +
> +static void page_pool_slow_item_drain(struct page_pool *pool)
> +{
> + struct page_pool_item_block *block = pool->slow_items.block;
> +
> + if (!block || pool->slow_items.next_to_use >= ITEMS_PER_PAGE)
> + return;
> +
> + if (refcount_sub_and_test(ITEMS_PER_PAGE - pool->slow_items.next_to_use,
> + &block->ref))
> + __page_pool_slow_item_free(pool, block);
> +}
> +
> +static void page_pool_slow_item_free(struct page_pool *pool,
> + struct page_pool_item_block *block)
> +{
> + if (likely(!refcount_dec_and_test(&block->ref)))
> + return;
> +
> + __page_pool_slow_item_free(pool, block);
> +}
> +
> static void page_pool_fast_item_free(struct page_pool *pool,
> struct page_pool_item *item)
> {
> @@ -535,13 +603,22 @@ static void page_pool_fast_item_free(struct page_pool *pool,
> static void page_pool_clear_item_info(struct page_pool *pool, netmem_ref netmem)
> {
> struct page_pool_item *item = netmem_get_pp_item(netmem);
> + struct page_pool_item_block *block;
>
> DEBUG_NET_WARN_ON_ONCE(item->pp_netmem != netmem);
> DEBUG_NET_WARN_ON_ONCE(page_pool_item_is_mapped(item));
> DEBUG_NET_WARN_ON_ONCE(!page_pool_item_is_used(item));
> page_pool_item_clear_used(item);
> netmem_set_pp_item(netmem, NULL);
> - page_pool_fast_item_free(pool, item);
> +
> + block = page_pool_item_to_block(item);
> + if (likely(!(block->flags & PAGE_POOL_SLOW_ITEM_BLOCK_BIT))) {
> + DEBUG_NET_WARN_ON_ONCE(refcount_read(&block->ref));
> + page_pool_fast_item_free(pool, item);
> + return;
> + }
> +
> + page_pool_slow_item_free(pool, block);
> }
>
> /**
> @@ -1383,6 +1460,7 @@ void page_pool_destroy(struct page_pool *pool)
>
> page_pool_disable_direct_recycling(pool);
> page_pool_free_frag(pool);
> + page_pool_slow_item_drain(pool);
>
> if (!page_pool_release(pool))
> return;