Message-ID: <d3c7d421-566f-4007-b272-650294edd019@kernel.org>
Date: Tue, 4 Mar 2025 10:25:46 +0100
From: Jesper Dangaard Brouer <hawk@...nel.org>
To: Yunsheng Lin <linyunsheng@...wei.com>, davem@...emloft.net,
kuba@...nel.org, pabeni@...hat.com
Cc: zhangkun09@...wei.com, liuyonglong@...wei.com, fanghaiqing@...wei.com,
Robin Murphy <robin.murphy@....com>,
Alexander Duyck <alexander.duyck@...il.com>, IOMMU <iommu@...ts.linux.dev>,
Eric Dumazet <edumazet@...gle.com>, Simon Horman <horms@...nel.org>,
Donald Hunter <donald.hunter@...il.com>,
Ilias Apalodimas <ilias.apalodimas@...aro.org>,
Andrew Lunn <andrew+netdev@...n.ch>, netdev@...r.kernel.org,
linux-kernel@...r.kernel.org, kernel-team <kernel-team@...udflare.com>,
Yan Zhai <yan@...udflare.com>
Subject: Re: [PATCH net-next v10 3/4] page_pool: support unlimited number of
inflight pages
(Comments requesting changes are inlined below.)
On 26/02/2025 12.03, Yunsheng Lin wrote:
> Currently a fixed size of pre-allocated memory is used to
> keep track of the inflight pages, in order to use the DMA
> API correctly.
>
> As mentioned [1], the number of inflight pages can be up to
> 73203 depending on the use cases. Allocate memory dynamically
> to keep track of the inflight pages when pre-allocated memory
> runs out.
>
> The overhead of using dynamic memory allocation is about 10ns~
> 20ns, which causes 5%~10% performance degradation for the test
> case of time_bench_page_pool03_slow() in [2].
>
> 1. https://lore.kernel.org/all/b8b7818a-e44b-45f5-91c2-d5eceaa5dd5b@kernel.org/
> 2. https://github.com/netoptimizer/prototype-kernel
> CC: Robin Murphy <robin.murphy@....com>
> CC: Alexander Duyck <alexander.duyck@...il.com>
> CC: IOMMU <iommu@...ts.linux.dev>
> Fixes: f71fec47c2df ("page_pool: make sure struct device is stable")
> Signed-off-by: Yunsheng Lin <linyunsheng@...wei.com>
> ---
> Documentation/netlink/specs/netdev.yaml | 16 +++++
> include/net/page_pool/types.h | 10 ++++
> include/uapi/linux/netdev.h | 2 +
> net/core/page_pool.c | 79 ++++++++++++++++++++++++-
> net/core/page_pool_priv.h | 2 +
> net/core/page_pool_user.c | 39 ++++++++++--
> tools/net/ynl/samples/page-pool.c | 11 ++++
> 7 files changed, 154 insertions(+), 5 deletions(-)
>
> diff --git a/Documentation/netlink/specs/netdev.yaml b/Documentation/netlink/specs/netdev.yaml
> index 36f1152bfac3..7c121d0a5d15 100644
> --- a/Documentation/netlink/specs/netdev.yaml
> +++ b/Documentation/netlink/specs/netdev.yaml
> @@ -162,6 +162,20 @@ attribute-sets:
> type: uint
> doc: |
> Amount of memory held by inflight pages.
> + -
> + name: item_mem_resident
> + type: uint
> + doc: |
> + Amount of actual memory allocated to track inflight pages.
> + The memory fragmentation ratio for item memory can be calculated
> + using item_mem_resident / item_mem_used.
> + -
> + name: item_mem_used
> + type: uint
> + doc: |
> + Amount of actual memory used to track inflight pages.
> + The memory fragmentation ratio for item memory can be calculated
> + using item_mem_resident / item_mem_used.
> -
> name: detach-time
> type: uint
> @@ -602,6 +616,8 @@ operations:
> - detach-time
> - dmabuf
> - io-uring
> + - item_mem_resident
> + - item_mem_used
> dump:
> reply: *pp-reply
> config-cond: page-pool
> diff --git a/include/net/page_pool/types.h b/include/net/page_pool/types.h
> index c131e2725e9a..c8c47ca67f4b 100644
> --- a/include/net/page_pool/types.h
> +++ b/include/net/page_pool/types.h
> @@ -103,6 +103,7 @@ struct page_pool_params {
> * @waive: pages obtained from the ptr ring that cannot be added to
> * the cache due to a NUMA mismatch
> * @item_fast_empty: pre-allocated item cache is empty
> + * @item_slow_failed: failed to allocate memory for item_block
> */
> struct page_pool_alloc_stats {
> u64 fast;
> @@ -112,6 +113,7 @@ struct page_pool_alloc_stats {
> u64 refill;
> u64 waive;
> u64 item_fast_empty;
> + u64 item_slow_failed;
> };
>
> /**
> @@ -159,9 +161,16 @@ struct page_pool_item {
> struct page_pool_item_block {
> struct page_pool *pp;
> struct list_head list;
> + unsigned int flags;
> + refcount_t ref;
> struct page_pool_item items[];
> };
>
> +struct page_pool_slow_item {
> + struct page_pool_item_block *block;
> + unsigned int next_to_use;
> +};
> +
> /* Ensure the offset of 'pp' field for both 'page_pool_item_block' and
> * 'netmem_item_block' are the same.
> */
> @@ -191,6 +200,7 @@ struct page_pool {
> int cpuid;
> u32 pages_state_hold_cnt;
> struct llist_head hold_items;
> + struct page_pool_slow_item slow_items;
>
> bool has_init_callback:1; /* slow::init_callback is set */
> bool dma_map:1; /* Perform DMA mapping */
> diff --git a/include/uapi/linux/netdev.h b/include/uapi/linux/netdev.h
> index 7600bf62dbdf..9309cbfeb8d2 100644
> --- a/include/uapi/linux/netdev.h
> +++ b/include/uapi/linux/netdev.h
> @@ -103,6 +103,8 @@ enum {
> NETDEV_A_PAGE_POOL_DETACH_TIME,
> NETDEV_A_PAGE_POOL_DMABUF,
> NETDEV_A_PAGE_POOL_IO_URING,
> + NETDEV_A_PAGE_POOL_ITEM_MEM_RESIDENT,
> + NETDEV_A_PAGE_POOL_ITEM_MEM_USED,
>
> __NETDEV_A_PAGE_POOL_MAX,
> NETDEV_A_PAGE_POOL_MAX = (__NETDEV_A_PAGE_POOL_MAX - 1)
> diff --git a/net/core/page_pool.c b/net/core/page_pool.c
> index d927c468bc1b..dc9574bd129b 100644
> --- a/net/core/page_pool.c
> +++ b/net/core/page_pool.c
> @@ -64,6 +64,7 @@ static const char pp_stats[][ETH_GSTRING_LEN] = {
> "rx_pp_alloc_refill",
> "rx_pp_alloc_waive",
> "rx_pp_alloc_item_fast_empty",
> + "rx_pp_alloc_item_slow_failed",
> "rx_pp_recycle_cached",
> "rx_pp_recycle_cache_full",
> "rx_pp_recycle_ring",
> @@ -98,6 +99,7 @@ bool page_pool_get_stats(const struct page_pool *pool,
> stats->alloc_stats.refill += pool->alloc_stats.refill;
> stats->alloc_stats.waive += pool->alloc_stats.waive;
> stats->alloc_stats.item_fast_empty += pool->alloc_stats.item_fast_empty;
> + stats->alloc_stats.item_slow_failed += pool->alloc_stats.item_slow_failed;
>
> for_each_possible_cpu(cpu) {
> const struct page_pool_recycle_stats *pcpu =
> @@ -144,6 +146,7 @@ u64 *page_pool_ethtool_stats_get(u64 *data, const void *stats)
> *data++ = pool_stats->alloc_stats.refill;
> *data++ = pool_stats->alloc_stats.waive;
> *data++ = pool_stats->alloc_stats.item_fast_empty;
> + *data++ = pool_stats->alloc_stats.item_slow_failed;
> *data++ = pool_stats->recycle_stats.cached;
> *data++ = pool_stats->recycle_stats.cache_full;
> *data++ = pool_stats->recycle_stats.ring;
> @@ -431,6 +434,7 @@ static void page_pool_item_uninit(struct page_pool *pool)
> struct page_pool_item_block,
> list);
> list_del(&block->list);
> + WARN_ON(refcount_read(&block->ref));
> put_page(virt_to_page(block));
> }
> }
> @@ -514,10 +518,42 @@ static struct page_pool_item *page_pool_fast_item_alloc(struct page_pool *pool)
> return llist_entry(first, struct page_pool_item, lentry);
> }
>
> +static struct page_pool_item *page_pool_slow_item_alloc(struct page_pool *pool)
> +{
> + if (unlikely(!pool->slow_items.block ||
> + pool->slow_items.next_to_use >= ITEMS_PER_PAGE)) {
> + struct page_pool_item_block *block;
> + struct page *page;
> +
> + page = alloc_pages_node(pool->p.nid, GFP_ATOMIC | __GFP_NOWARN |
> + __GFP_ZERO, 0);
> + if (!page) {
> + alloc_stat_inc(pool, item_slow_failed);
> + return NULL;
> + }
I'm missing a counter that I can use to monitor the rate of page
allocations for these "item" blocks.

In production I want a metric that shows me a sudden influx of events
that cause the code to hit this "item_slow_alloc" case (perhaps named
inflight_slow_alloc).

BTW, should this be called "inflight_block" instead of "item_block"?
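Something along these lines would work for me (an untested sketch; the
"item_slow_alloc_block" counter name is just a suggestion, not an
existing field):

	/* include/net/page_pool/types.h */
	struct page_pool_alloc_stats {
		[...]
		u64 item_slow_failed;
		u64 item_slow_alloc_block; /* pages alloc'ed for item blocks */
	};

	/* net/core/page_pool.c: in page_pool_slow_item_alloc(), after a
	 * successful alloc_pages_node():
	 */
	alloc_stat_inc(pool, item_slow_alloc_block);

(Plus the matching "rx_pp_alloc_item_slow_alloc_block" string in
pp_stats[] and an entry in page_pool_ethtool_stats_get().)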
> +
> + block = page_address(page);
> + block->pp = pool;
> + block->flags |= PAGE_POOL_SLOW_ITEM_BLOCK_BIT;
> + refcount_set(&block->ref, ITEMS_PER_PAGE);
> + pool->slow_items.block = block;
> + pool->slow_items.next_to_use = 0;
> +
> + spin_lock_bh(&pool->item_lock);
> + list_add(&block->list, &pool->item_blocks);
> + spin_unlock_bh(&pool->item_lock);
> + }
> +
> + return &pool->slow_items.block->items[pool->slow_items.next_to_use++];
> +}
> +
> static bool page_pool_set_item_info(struct page_pool *pool, netmem_ref netmem)
> {
> struct page_pool_item *item = page_pool_fast_item_alloc(pool);
>
> + if (unlikely(!item))
> + item = page_pool_slow_item_alloc(pool);
> +
> if (likely(item)) {
> item->pp_netmem = netmem;
> page_pool_item_set_used(item);
> @@ -527,6 +563,37 @@ static bool page_pool_set_item_info(struct page_pool *pool, netmem_ref netmem)
> return !!item;
> }
>
> +static void __page_pool_slow_item_free(struct page_pool *pool,
> + struct page_pool_item_block *block)
> +{
> + spin_lock_bh(&pool->item_lock);
> + list_del(&block->list);
> + spin_unlock_bh(&pool->item_lock);
> +
> + put_page(virt_to_page(block));
Here again I'm missing a counter that I can use to monitor the rate of
page free events.

In production I want a metric (e.g. inflight_slow_free_block) that,
together with "item_slow_alloc" (perhaps named
inflight_slow_alloc_block), shows me whether this code path is creating
churn, which I can then correlate with (and explain by) some other
influx event on the system.

BTW, subtracting these counters (alloc - free) gives us the memory
currently held by these blocks.
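E.g. something like this (again an untested sketch with a hypothetical
counter; the free path already holds item_lock, so a plain counter in
struct page_pool would be enough here):

	static void __page_pool_slow_item_free(struct page_pool *pool,
					       struct page_pool_item_block *block)
	{
		spin_lock_bh(&pool->item_lock);
		list_del(&block->list);
		pool->item_slow_free_block++; /* hypothetical free counter */
		spin_unlock_bh(&pool->item_lock);

		put_page(virt_to_page(block));
	}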
> +}
> +
> +static void page_pool_slow_item_drain(struct page_pool *pool)
> +{
> + struct page_pool_item_block *block = pool->slow_items.block;
> +
> + if (!block || pool->slow_items.next_to_use >= ITEMS_PER_PAGE)
> + return;
> +
> + if (refcount_sub_and_test(ITEMS_PER_PAGE - pool->slow_items.next_to_use,
> + &block->ref))
> + __page_pool_slow_item_free(pool, block);
> +}
> +
> +static void page_pool_slow_item_free(struct page_pool *pool,
> + struct page_pool_item_block *block)
> +{
> + if (likely(!refcount_dec_and_test(&block->ref)))
> + return;
> +
> + __page_pool_slow_item_free(pool, block);
> +}
> +
> static void page_pool_fast_item_free(struct page_pool *pool,
> struct page_pool_item *item)
> {
> @@ -536,13 +603,22 @@ static void page_pool_fast_item_free(struct page_pool *pool,
> static void page_pool_clear_item_info(struct page_pool *pool, netmem_ref netmem)
> {
> struct page_pool_item *item = netmem_get_pp_item(netmem);
> + struct page_pool_item_block *block;
>
> DEBUG_NET_WARN_ON_ONCE(item->pp_netmem != netmem);
> DEBUG_NET_WARN_ON_ONCE(page_pool_item_is_mapped(item));
> DEBUG_NET_WARN_ON_ONCE(!page_pool_item_is_used(item));
> page_pool_item_clear_used(item);
> netmem_set_pp_item(netmem, NULL);
> - page_pool_fast_item_free(pool, item);
> +
> + block = page_pool_item_to_block(item);
> + if (likely(!(block->flags & PAGE_POOL_SLOW_ITEM_BLOCK_BIT))) {
> + DEBUG_NET_WARN_ON_ONCE(refcount_read(&block->ref));
> + page_pool_fast_item_free(pool, item);
> + return;
> + }
> +
> + page_pool_slow_item_free(pool, block);
> }
>
> /**
> @@ -1390,6 +1466,7 @@ void page_pool_destroy(struct page_pool *pool)
>
> page_pool_disable_direct_recycling(pool);
> page_pool_free_frag(pool);
> + page_pool_slow_item_drain(pool);
>
> if (!page_pool_release(pool))
> return;
> diff --git a/net/core/page_pool_priv.h b/net/core/page_pool_priv.h
> index a5df5ab14ead..37adfc766c12 100644
> --- a/net/core/page_pool_priv.h
> +++ b/net/core/page_pool_priv.h
> @@ -7,6 +7,8 @@
>
> #include "netmem_priv.h"
>
> +#define PAGE_POOL_SLOW_ITEM_BLOCK_BIT BIT(0)
> +
> extern struct mutex page_pools_lock;
>
> s32 page_pool_inflight(const struct page_pool *pool, bool strict);
> diff --git a/net/core/page_pool_user.c b/net/core/page_pool_user.c
> index c82a95beceff..0dc0090257ae 100644
> --- a/net/core/page_pool_user.c
> +++ b/net/core/page_pool_user.c
> @@ -33,7 +33,7 @@ DEFINE_MUTEX(page_pools_lock);
> * - user.list: unhashed, netdev: unknown
> */
>
> -typedef int (*pp_nl_fill_cb)(struct sk_buff *rsp, const struct page_pool *pool,
> +typedef int (*pp_nl_fill_cb)(struct sk_buff *rsp, struct page_pool *pool,
> const struct genl_info *info);
>
> static int
> @@ -111,7 +111,7 @@ netdev_nl_page_pool_get_dump(struct sk_buff *skb, struct netlink_callback *cb,
> }
>
> static int
> -page_pool_nl_stats_fill(struct sk_buff *rsp, const struct page_pool *pool,
> +page_pool_nl_stats_fill(struct sk_buff *rsp, struct page_pool *pool,
> const struct genl_info *info)
> {
> #ifdef CONFIG_PAGE_POOL_STATS
> @@ -212,8 +212,36 @@ int netdev_nl_page_pool_stats_get_dumpit(struct sk_buff *skb,
> return netdev_nl_page_pool_get_dump(skb, cb, page_pool_nl_stats_fill);
> }
>
> +static int page_pool_nl_fill_item_mem_info(struct page_pool *pool,
> + struct sk_buff *rsp)
> +{
> + struct page_pool_item_block *block;
> + size_t resident = 0, used = 0;
> + int err;
> +
> + spin_lock_bh(&pool->item_lock);
> +
> + list_for_each_entry(block, &pool->item_blocks, list) {
> + resident += PAGE_SIZE;
> +
> + if (block->flags & PAGE_POOL_SLOW_ITEM_BLOCK_BIT)
> + used += (PAGE_SIZE - sizeof(struct page_pool_item) *
> + refcount_read(&block->ref));
> + else
> + used += PAGE_SIZE;
> + }
> +
> + spin_unlock_bh(&pool->item_lock);
Holding a BH spinlock here can easily create production issues.
I worry about how long it will take to traverse these lists.

We (Cc Yan) are currently hunting down a number of real production
issues caused by different cases of control-path code querying the
kernel while taking a _bh lock to read data, which hurts the data-path
processing.

If we had the stats counters suggested above, then this would be less
work, right?
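With the two block counters suggested above, this fill function could
compute the "resident" number without walking the list under the _bh
lock at all. A rough sketch (assuming the hypothetical counters exist):

	/* Slow blocks only; the fixed number of pre-allocated fast
	 * blocks is known at pool creation time and can be added in.
	 */
	resident = (pool->alloc_stats.item_slow_alloc_block -
		    pool->item_slow_free_block) * PAGE_SIZE;

The "used" part would still need the per-block refcounts, but at least
the common queries would no longer contend with the data path.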
> +
> + err = nla_put_uint(rsp, NETDEV_A_PAGE_POOL_ITEM_MEM_RESIDENT, resident);
> + if (err)
> + return err;
> +
> + return nla_put_uint(rsp, NETDEV_A_PAGE_POOL_ITEM_MEM_USED, used);
> +}
> +
> static int
> -page_pool_nl_fill(struct sk_buff *rsp, const struct page_pool *pool,
> +page_pool_nl_fill(struct sk_buff *rsp, struct page_pool *pool,
> const struct genl_info *info)
> {
> size_t inflight, refsz;
> @@ -251,6 +279,9 @@ page_pool_nl_fill(struct sk_buff *rsp, const struct page_pool *pool,
> if (pool->mp_ops && pool->mp_ops->nl_fill(pool->mp_priv, rsp, NULL))
> goto err_cancel;
>
> + if (page_pool_nl_fill_item_mem_info(pool, rsp))
> + goto err_cancel;
> +
> genlmsg_end(rsp, hdr);
>
> return 0;
> @@ -259,7 +290,7 @@ page_pool_nl_fill(struct sk_buff *rsp, const struct page_pool *pool,
> return -EMSGSIZE;
> }
>
> -static void netdev_nl_page_pool_event(const struct page_pool *pool, u32 cmd)
> +static void netdev_nl_page_pool_event(struct page_pool *pool, u32 cmd)
> {
> struct genl_info info;
> struct sk_buff *ntf;
> diff --git a/tools/net/ynl/samples/page-pool.c b/tools/net/ynl/samples/page-pool.c
> index e5d521320fbf..16c48b6d3c2c 100644
> --- a/tools/net/ynl/samples/page-pool.c
> +++ b/tools/net/ynl/samples/page-pool.c
> @@ -16,6 +16,7 @@ struct stat {
> struct {
> unsigned int cnt;
> size_t refs, bytes;
> + size_t item_mem_resident, item_mem_used;
> } live[2];
>
> size_t alloc_slow, alloc_fast, recycle_ring, recycle_cache;
> @@ -52,6 +53,12 @@ static void count(struct stat *s, unsigned int l,
> s->live[l].refs += pp->inflight;
> if (pp->_present.inflight_mem)
> s->live[l].bytes += pp->inflight_mem;
> +
> + if (pp->_present.item_mem_resident)
> + s->live[l].item_mem_resident += pp->item_mem_resident;
> +
> + if (pp->_present.item_mem_used)
> + s->live[l].item_mem_used += pp->item_mem_used;
> }
>
> int main(int argc, char **argv)
> @@ -127,6 +134,10 @@ int main(int argc, char **argv)
> s->live[1].refs, s->live[1].bytes,
> s->live[0].refs, s->live[0].bytes);
>
> + printf("\t\titem_mem_resident: %zu item_mem_used: %zu (item_mem_resident: %zu item_mem_used: %zu)\n",
> + s->live[1].item_mem_resident, s->live[1].item_mem_used,
> + s->live[0].item_mem_resident, s->live[0].item_mem_used);
> +
> /* We don't know how many pages are sitting in cache and ring
> * so we will under-count the recycling rate a bit.
> */