Message-ID: <7342ad1a-f272-f599-2ce4-e8019acbcbcb@huawei.com>
Date: Mon, 12 Jul 2021 17:28:32 +0800
From: Yunsheng Lin <linyunsheng@...wei.com>
To: <davem@...emloft.net>, <kuba@...nel.org>
CC: <alexander.duyck@...il.com>, <linux@...linux.org.uk>,
<mw@...ihalf.com>, <linuxarm@...neuler.org>,
<yisen.zhuang@...wei.com>, <salil.mehta@...wei.com>,
<thomas.petazzoni@...tlin.com>, <hawk@...nel.org>,
<ilias.apalodimas@...aro.org>, <ast@...nel.org>,
<daniel@...earbox.net>, <john.fastabend@...il.com>,
<akpm@...ux-foundation.org>, <peterz@...radead.org>,
<will@...nel.org>, <willy@...radead.org>, <vbabka@...e.cz>,
<fenghua.yu@...el.com>, <guro@...com>, <peterx@...hat.com>,
<feng.tang@...el.com>, <jgg@...pe.ca>, <mcroce@...rosoft.com>,
<hughd@...gle.com>, <jonathan.lemon@...il.com>, <alobakin@...me>,
<willemb@...gle.com>, <wenxu@...oud.cn>, <cong.wang@...edance.com>,
<haokexin@...il.com>, <nogikh@...gle.com>, <elver@...gle.com>,
<yhs@...com>, <kpsingh@...nel.org>, <andrii@...nel.org>,
<kafai@...com>, <songliubraving@...com>, <netdev@...r.kernel.org>,
<linux-kernel@...r.kernel.org>, <bpf@...r.kernel.org>
Subject: Re: [Linuxarm] [PATCH rfc v3 3/4] page_pool: add page recycling
support based on elevated refcnt

Please ignore this one; the subject has been changed to:
"page_pool: add frag page recycling support in page pool".
(A quick sketch of how a driver might use the new frag API follows
the quoted patch below.)

On 2021/7/12 17:19, Yunsheng Lin wrote:
> Currently page pool supports page recycling only when there is
> just one user of the page, and the split-page reuse implemented
> in most drivers cannot use page pool, as the ping-pong way of
> reusing pages requires elevated refcnt support.
>
> These reuse/recycling schemes have the following limitations:
> 1. A page from page pool can only be used by one user in order
>    for page recycling to happen.
> 2. The ping-pong way of reusing in most drivers does not allow
>    multiple descriptors to use different parts of the same page
>    in order to save memory.
>
> So add elevated refcnt support in page pool in order to
> overcome the above limitations.
>
> This is a preparation to support allocating page frag in page
> pool.
>
> Signed-off-by: Yunsheng Lin <linyunsheng@...wei.com>
> ---
> include/net/page_pool.h | 22 ++++++++-
> net/core/page_pool.c | 121 ++++++++++++++++++++++++++++++++++++++++++------
> 2 files changed, 129 insertions(+), 14 deletions(-)
>
> diff --git a/include/net/page_pool.h b/include/net/page_pool.h
> index 84cd972..d9a736f 100644
> --- a/include/net/page_pool.h
> +++ b/include/net/page_pool.h
> @@ -45,7 +45,10 @@
> * Please note DMA-sync-for-CPU is still
> * device driver responsibility
> */
> -#define PP_FLAG_ALL (PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV)
> +#define PP_FLAG_PAGE_FRAG BIT(2) /* for page frag feature */
> +#define PP_FLAG_ALL (PP_FLAG_DMA_MAP |\
> + PP_FLAG_DMA_SYNC_DEV |\
> + PP_FLAG_PAGE_FRAG)
>
> /*
> * Fast allocation side cache array/stack
> @@ -88,6 +91,9 @@ struct page_pool {
> unsigned long defer_warn;
>
> u32 pages_state_hold_cnt;
> + unsigned int frag_offset;
> + int frag_bias;
> + struct page *frag_page;
>
> /*
> * Data structure for allocation side
> @@ -137,6 +143,20 @@ static inline struct page *page_pool_dev_alloc_pages(struct page_pool *pool)
> return page_pool_alloc_pages(pool, gfp);
> }
>
> +struct page *page_pool_alloc_frag(struct page_pool *pool,
> + unsigned int *offset,
> + unsigned int size,
> + gfp_t gfp);
> +
> +static inline struct page *page_pool_dev_alloc_frag(struct page_pool *pool,
> + unsigned int *offset,
> + unsigned int size)
> +{
> + gfp_t gfp = (GFP_ATOMIC | __GFP_NOWARN);
> +
> + return page_pool_alloc_frag(pool, offset, size, gfp);
> +}
> +
> /* get the stored dma direction. A driver might decide to treat this locally and
> * avoid the extra cache line from page_pool to determine the direction
> */
> diff --git a/net/core/page_pool.c b/net/core/page_pool.c
> index 1abefc6..9f518dc 100644
> --- a/net/core/page_pool.c
> +++ b/net/core/page_pool.c
> @@ -24,6 +24,8 @@
> #define DEFER_TIME (msecs_to_jiffies(1000))
> #define DEFER_WARN_INTERVAL (60 * HZ)
>
> +#define BIAS_MAX (PAGE_SIZE - 1)
> +
> static int page_pool_init(struct page_pool *pool,
> const struct page_pool_params *params)
> {
> @@ -304,6 +306,33 @@ static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool,
> return page;
> }
>
> +/* nr could be negative */
> +static int page_pool_atomic_add_bias(struct page *page, int nr)
> +{
> + unsigned long *bias_ptr = page_pool_pagecnt_bias_ptr(page);
> + unsigned long old_bias = READ_ONCE(*bias_ptr);
> + unsigned long new_bias;
> +
> + do {
> + int bias = (int)(old_bias & ~PAGE_MASK);
> +
> + /* Warn if the driver called page_pool_dev_alloc_pages() on a
> + * pool that has the PP_FLAG_PAGE_FRAG flag set.
> + */
> + WARN_ON(!bias);
> +
> + /* already the last user */
> + if (!(bias + nr))
> + return 0;
> +
> + new_bias = old_bias + nr;
> + } while (!try_cmpxchg(bias_ptr, &old_bias, new_bias));
> +
> + WARN_ON((new_bias & PAGE_MASK) != (old_bias & PAGE_MASK));
> +
> + return new_bias & ~PAGE_MASK;
> +}
> +
> /* For using page_pool replace: alloc_pages() API calls, but provide
> * synchronization guarantee for allocation side.
> */
> @@ -425,6 +454,11 @@ static __always_inline struct page *
> __page_pool_put_page(struct page_pool *pool, struct page *page,
> unsigned int dma_sync_size, bool allow_direct)
> {
> + /* It is not the last user for the page frag case */
> + if (pool->p.flags & PP_FLAG_PAGE_FRAG &&
> + page_pool_atomic_add_bias(page, -1))
> + return NULL;
> +
> /* This allocator is optimized for the XDP mode that uses
> * one-frame-per-page, but have fallbacks that act like the
> * regular page allocator APIs.
> @@ -448,19 +482,7 @@ __page_pool_put_page(struct page_pool *pool, struct page *page,
> /* Page found as candidate for recycling */
> return page;
> }
> - /* Fallback/non-XDP mode: API user have elevated refcnt.
> - *
> - * Many drivers split up the page into fragments, and some
> - * want to keep doing this to save memory and do refcnt based
> - * recycling. Support this use case too, to ease drivers
> - * switching between XDP/non-XDP.
> - *
> - * In-case page_pool maintains the DMA mapping, API user must
> - * call page_pool_put_page once. In this elevated refcnt
> - * case, the DMA is unmapped/released, as driver is likely
> - * doing refcnt based recycle tricks, meaning another process
> - * will be invoking put_page.
> - */
> +
> /* Do not replace this with page_pool_return_page() */
> page_pool_release_page(pool, page);
> put_page(page);
> @@ -517,6 +539,77 @@ void page_pool_put_page_bulk(struct page_pool *pool, void **data,
> }
> EXPORT_SYMBOL(page_pool_put_page_bulk);
>
> +/* Use BIAS_RESERVE to avoid the frag page being recycled back to
> + * the page pool while it is still cached in pool->frag_page
> + * waiting for more users. As the minimum DMA alignment seems to
> + * be 32, we support a max frag size of 2047 * 32 for a 4K page.
> + */
> +#define BIAS_RESERVE ((int)(BIAS_MAX / 2 + 1))
> +#define BIAS_NEGATIVE_RESERVE (0 - BIAS_RESERVE)
> +
> +static struct page *page_pool_drain_frag(struct page_pool *pool,
> + struct page *page)
> +{
> + /* page pool is not the last user */
> + if (page_pool_atomic_add_bias(page, pool->frag_bias +
> + BIAS_NEGATIVE_RESERVE))
> + return NULL;
> + else
> + return page;
> +}
> +
> +static void page_pool_free_frag(struct page_pool *pool)
> +{
> + struct page *page = pool->frag_page;
> +
> + if (!page ||
> + page_pool_atomic_add_bias(page, pool->frag_bias +
> + BIAS_NEGATIVE_RESERVE))
> + return;
> +
> + page_pool_return_page(pool, page);
> + pool->frag_page = NULL;
> +}
> +
> +struct page *page_pool_alloc_frag(struct page_pool *pool,
> + unsigned int *offset,
> + unsigned int size,
> + gfp_t gfp)
> +{
> + unsigned int max_size = PAGE_SIZE << pool->p.order;
> + unsigned int frag_offset = pool->frag_offset;
> + struct page *frag_page = pool->frag_page;
> +
> + if (WARN_ON(!(pool->p.flags & PP_FLAG_PAGE_FRAG) ||
> + size > max_size))
> + return NULL;
> +
> + size = ALIGN(size, dma_get_cache_alignment());
> +
> + if (frag_page && frag_offset + size > max_size)
> + frag_page = page_pool_drain_frag(pool, frag_page);
> +
> + if (!frag_page) {
> + frag_page = page_pool_alloc_pages(pool, gfp);
> + if (unlikely(!frag_page)) {
> + pool->frag_page = NULL;
> + return NULL;
> + }
> +
> + pool->frag_page = frag_page;
> + pool->frag_bias = 0;
> + frag_offset = 0;
> + page_pool_set_pagecnt_bias(frag_page, BIAS_RESERVE);
> + }
> +
> + pool->frag_bias++;
> + *offset = frag_offset;
> + pool->frag_offset = frag_offset + size;
> +
> + return frag_page;
> +}
> +EXPORT_SYMBOL(page_pool_alloc_frag);
> +
> static void page_pool_empty_ring(struct page_pool *pool)
> {
> struct page *page;
> @@ -622,6 +715,8 @@ void page_pool_destroy(struct page_pool *pool)
> if (!page_pool_put(pool))
> return;
>
> + page_pool_free_frag(pool);
> +
> if (!page_pool_release(pool))
> return;
>
>
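
For completeness, here is a rough sketch of how a driver might wire up
the new frag API from the quoted patch. Everything prefixed my_* (the
rx buffer struct, the pool parameters, the refill/complete helpers) is
made up purely for illustration; only page_pool_create(),
PP_FLAG_PAGE_FRAG, page_pool_dev_alloc_frag() and
page_pool_put_full_page() come from the page_pool API as extended here,
so treat it as a sketch rather than a reference conversion.

#include <linux/device.h>
#include <linux/dma-direction.h>
#include <linux/errno.h>
#include <net/page_pool.h>

/* Hypothetical per-ring rx buffer bookkeeping, for illustration only. */
struct my_rx_buf {
	struct page	*page;
	unsigned int	offset;
};

/* Create a pool that opts in to the frag API; the parameter values are
 * arbitrary examples, not recommendations.
 */
static struct page_pool *my_create_rx_pool(struct device *dev)
{
	struct page_pool_params pp_params = {
		.flags		= PP_FLAG_DMA_MAP | PP_FLAG_PAGE_FRAG,
		.order		= 0,
		.pool_size	= 1024,
		.nid		= NUMA_NO_NODE,
		.dev		= dev,
		.dma_dir	= DMA_FROM_DEVICE,
	};

	return page_pool_create(&pp_params);
}

/* Refill one rx buffer with a 2K fragment; several buffers may share
 * the same underlying page, which is exactly what the elevated refcnt
 * (pagecnt bias) tracking in the patch accounts for.
 */
static int my_rx_refill(struct page_pool *pool, struct my_rx_buf *buf)
{
	buf->page = page_pool_dev_alloc_frag(pool, &buf->offset, 2048);
	if (!buf->page)
		return -ENOMEM;

	return 0;
}

/* Each fragment is returned individually; the pool only recycles the
 * page once the last user has dropped its share of the bias.
 */
static void my_rx_complete(struct page_pool *pool, struct my_rx_buf *buf)
{
	/* true: assume we are called from NAPI/softirq context */
	page_pool_put_full_page(pool, buf->page, true);
	buf->page = NULL;
}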
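
To make the bias accounting concrete (as I read the patch, with a 4K
page): the frag page starts out with a pagecnt bias of BIAS_RESERVE =
2048. Say the pool hands out three frags before the page fills up, so
pool->frag_bias is 3 and page_pool_drain_frag() adjusts the bias by
frag_bias - BIAS_RESERVE = 3 - 2048. Each page_pool_put_page() then
subtracts 1, so the running bias goes 2048 -> 3 -> 2 -> 1 -> 0; it hits
zero exactly on the third and last put, and only at that point is the
page considered for recycling. The relative order of the puts and the
drain does not matter, since everything is just additions on the same
counter.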