lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [day] [month] [year] [list]
Date:   Mon, 12 Jul 2021 17:28:32 +0800
From:   Yunsheng Lin <linyunsheng@...wei.com>
To:     <davem@...emloft.net>, <kuba@...nel.org>
CC:     <alexander.duyck@...il.com>, <linux@...linux.org.uk>,
        <mw@...ihalf.com>, <linuxarm@...neuler.org>,
        <yisen.zhuang@...wei.com>, <salil.mehta@...wei.com>,
        <thomas.petazzoni@...tlin.com>, <hawk@...nel.org>,
        <ilias.apalodimas@...aro.org>, <ast@...nel.org>,
        <daniel@...earbox.net>, <john.fastabend@...il.com>,
        <akpm@...ux-foundation.org>, <peterz@...radead.org>,
        <will@...nel.org>, <willy@...radead.org>, <vbabka@...e.cz>,
        <fenghua.yu@...el.com>, <guro@...com>, <peterx@...hat.com>,
        <feng.tang@...el.com>, <jgg@...pe.ca>, <mcroce@...rosoft.com>,
        <hughd@...gle.com>, <jonathan.lemon@...il.com>, <alobakin@...me>,
        <willemb@...gle.com>, <wenxu@...oud.cn>, <cong.wang@...edance.com>,
        <haokexin@...il.com>, <nogikh@...gle.com>, <elver@...gle.com>,
        <yhs@...com>, <kpsingh@...nel.org>, <andrii@...nel.org>,
        <kafai@...com>, <songliubraving@...com>, <netdev@...r.kernel.org>,
        <linux-kernel@...r.kernel.org>, <bpf@...r.kernel.org>
Subject: Re: [Linuxarm] [PATCH rfc v3 3/4] page_pool: add page recycling
 support based on elevated refcnt

Please ignore this one; the title has been changed to:
"page_pool: add frag page recycling support in page pool".

On 2021/7/12 17:19, Yunsheng Lin wrote:
> Currently page pool only supports page recycling when
> there is only one user of the page, and the split-page
> reuse implemented in most drivers cannot use the page
> pool, as the ping-pong way of reusing requires elevated
> refcnt support.
> 
> This reuse or recycling has the following limitations:
> 1. A page from the page pool can only be used by one user
>    in order for the page recycling to happen.
> 2. The ping-pong way of reusing in most drivers does not
>    support multiple descriptors using different parts of the
>    same page in order to save memory.
> 
> So add elevated refcnt support in page pool in order to
> overcome the above limitations.
> 
> This is a preparation to support allocating page frag in page
> pool.
> 
> Signed-off-by: Yunsheng Lin <linyunsheng@...wei.com>
> ---
>  include/net/page_pool.h |  22 ++++++++-
>  net/core/page_pool.c    | 121 ++++++++++++++++++++++++++++++++++++++++++------
>  2 files changed, 129 insertions(+), 14 deletions(-)
> 
> diff --git a/include/net/page_pool.h b/include/net/page_pool.h
> index 84cd972..d9a736f 100644
> --- a/include/net/page_pool.h
> +++ b/include/net/page_pool.h
> @@ -45,7 +45,10 @@
>  					* Please note DMA-sync-for-CPU is still
>  					* device driver responsibility
>  					*/
> -#define PP_FLAG_ALL		(PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV)
> +#define PP_FLAG_PAGE_FRAG	BIT(2)	/* for page frag feature */
> +#define PP_FLAG_ALL		(PP_FLAG_DMA_MAP |\
> +				 PP_FLAG_DMA_SYNC_DEV |\
> +				 PP_FLAG_PAGE_FRAG)
>  
>  /*
>   * Fast allocation side cache array/stack
> @@ -88,6 +91,9 @@ struct page_pool {
>  	unsigned long defer_warn;
>  
>  	u32 pages_state_hold_cnt;
> +	unsigned int frag_offset;
> +	int frag_bias;
> +	struct page *frag_page;
>  
>  	/*
>  	 * Data structure for allocation side
> @@ -137,6 +143,20 @@ static inline struct page *page_pool_dev_alloc_pages(struct page_pool *pool)
>  	return page_pool_alloc_pages(pool, gfp);
>  }
>  
> +struct page *page_pool_alloc_frag(struct page_pool *pool,
> +				  unsigned int *offset,
> +				  unsigned int size,
> +				  gfp_t gfp);
> +
> +static inline struct page *page_pool_dev_alloc_frag(struct page_pool *pool,
> +						    unsigned int *offset,
> +						    unsigned int size)
> +{
> +	gfp_t gfp = (GFP_ATOMIC | __GFP_NOWARN);
> +
> +	return page_pool_alloc_frag(pool, offset, size, gfp);
> +}
> +
>  /* get the stored dma direction. A driver might decide to treat this locally and
>   * avoid the extra cache line from page_pool to determine the direction
>   */
> diff --git a/net/core/page_pool.c b/net/core/page_pool.c
> index 1abefc6..9f518dc 100644
> --- a/net/core/page_pool.c
> +++ b/net/core/page_pool.c
> @@ -24,6 +24,8 @@
>  #define DEFER_TIME (msecs_to_jiffies(1000))
>  #define DEFER_WARN_INTERVAL (60 * HZ)
>  
> +#define BIAS_MAX	(PAGE_SIZE - 1)
> +
>  static int page_pool_init(struct page_pool *pool,
>  			  const struct page_pool_params *params)
>  {
> @@ -304,6 +306,33 @@ static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool,
>  	return page;
>  }
>  
> +/* nr could be negative */
> +static int page_pool_atomic_add_bias(struct page *page, int nr)
> +{
> +	unsigned long *bias_ptr = page_pool_pagecnt_bias_ptr(page);
> +	unsigned long old_bias = READ_ONCE(*bias_ptr);
> +	unsigned long new_bias;
> +
> +	do {
> +		int bias = (int)(old_bias & ~PAGE_MASK);
> +
> +		/* Warn when page_pool_dev_alloc_pages() is called
> +		 * with PP_FLAG_PAGE_FRAG flag in driver.
> +		 */
> +		WARN_ON(!bias);
> +
> +		/* already the last user */
> +		if (!(bias + nr))
> +			return 0;
> +
> +		new_bias = old_bias + nr;
> +	} while (!try_cmpxchg(bias_ptr, &old_bias, new_bias));
> +
> +	WARN_ON((new_bias & PAGE_MASK) != (old_bias & PAGE_MASK));
> +
> +	return new_bias & ~PAGE_MASK;
> +}
> +
>  /* For using page_pool replace: alloc_pages() API calls, but provide
>   * synchronization guarantee for allocation side.
>   */
> @@ -425,6 +454,11 @@ static __always_inline struct page *
>  __page_pool_put_page(struct page_pool *pool, struct page *page,
>  		     unsigned int dma_sync_size, bool allow_direct)
>  {
> +	/* It is not the last user for the page frag case */
> +	if (pool->p.flags & PP_FLAG_PAGE_FRAG &&
> +	    page_pool_atomic_add_bias(page, -1))
> +		return NULL;
> +
>  	/* This allocator is optimized for the XDP mode that uses
>  	 * one-frame-per-page, but have fallbacks that act like the
>  	 * regular page allocator APIs.
> @@ -448,19 +482,7 @@ __page_pool_put_page(struct page_pool *pool, struct page *page,
>  		/* Page found as candidate for recycling */
>  		return page;
>  	}
> -	/* Fallback/non-XDP mode: API user have elevated refcnt.
> -	 *
> -	 * Many drivers split up the page into fragments, and some
> -	 * want to keep doing this to save memory and do refcnt based
> -	 * recycling. Support this use case too, to ease drivers
> -	 * switching between XDP/non-XDP.
> -	 *
> -	 * In-case page_pool maintains the DMA mapping, API user must
> -	 * call page_pool_put_page once.  In this elevated refcnt
> -	 * case, the DMA is unmapped/released, as driver is likely
> -	 * doing refcnt based recycle tricks, meaning another process
> -	 * will be invoking put_page.
> -	 */
> +
>  	/* Do not replace this with page_pool_return_page() */
>  	page_pool_release_page(pool, page);
>  	put_page(page);
> @@ -517,6 +539,77 @@ void page_pool_put_page_bulk(struct page_pool *pool, void **data,
>  }
>  EXPORT_SYMBOL(page_pool_put_page_bulk);
>  
> +/* Use BIAS_RESERVE to avoid the frag page being recycled back to
> + * the page pool while it is still in pool->frag_page waiting for
> + * more users. As the minimum alignment size for DMA seems to be
> + * 32, we support a max size of 2047 * 32 for a 4K page size.
> + */
> +#define BIAS_RESERVE		((int)(BIAS_MAX / 2 + 1))
> +#define BIAS_NEGATIVE_RESERVE	(0 - BIAS_RESERVE)
> +
> +static struct page *page_pool_drain_frag(struct page_pool *pool,
> +					 struct page *page)
> +{
> +	/* page pool is not the last user */
> +	if (page_pool_atomic_add_bias(page, pool->frag_bias +
> +				      BIAS_NEGATIVE_RESERVE))
> +		return NULL;
> +	else
> +		return page;
> +}
> +
> +static void page_pool_free_frag(struct page_pool *pool)
> +{
> +	struct page *page = pool->frag_page;
> +
> +	if (!page ||
> +	    page_pool_atomic_add_bias(page, pool->frag_bias +
> +				      BIAS_NEGATIVE_RESERVE))
> +		return;
> +
> +	page_pool_return_page(pool, page);
> +	pool->frag_page = NULL;
> +}
> +
> +struct page *page_pool_alloc_frag(struct page_pool *pool,
> +				  unsigned int *offset,
> +				  unsigned int size,
> +				  gfp_t gfp)
> +{
> +	unsigned int max_size = PAGE_SIZE << pool->p.order;
> +	unsigned int frag_offset = pool->frag_offset;
> +	struct page *frag_page = pool->frag_page;
> +
> +	if (WARN_ON(!(pool->p.flags & PP_FLAG_PAGE_FRAG) ||
> +		    size > max_size))
> +		return NULL;
> +
> +	size = ALIGN(size, dma_get_cache_alignment());
> +
> +	if (frag_page && frag_offset + size > max_size)
> +		frag_page = page_pool_drain_frag(pool, frag_page);
> +
> +	if (!frag_page) {
> +		frag_page = page_pool_alloc_pages(pool, gfp);
> +		if (unlikely(!frag_page)) {
> +			pool->frag_page = NULL;
> +			return NULL;
> +		}
> +
> +		pool->frag_page = frag_page;
> +		pool->frag_bias = 0;
> +		frag_offset = 0;
> +		page_pool_set_pagecnt_bias(frag_page, BIAS_RESERVE);
> +	}
> +
> +	pool->frag_bias++;
> +	*offset = frag_offset;
> +	pool->frag_offset = frag_offset + size;
> +
> +	return frag_page;
> +}
> +EXPORT_SYMBOL(page_pool_alloc_frag);
> +
>  static void page_pool_empty_ring(struct page_pool *pool)
>  {
>  	struct page *page;
> @@ -622,6 +715,8 @@ void page_pool_destroy(struct page_pool *pool)
>  	if (!page_pool_put(pool))
>  		return;
>  
> +	page_pool_free_frag(pool);
> +
>  	if (!page_pool_release(pool))
>  		return;
>  
> 

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ