[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20191112130832.6b3d69d5@carbon>
Date: Tue, 12 Nov 2019 13:08:32 +0100
From: Jesper Dangaard Brouer <brouer@...hat.com>
To: Jonathan Lemon <jonathan.lemon@...il.com>
Cc: <netdev@...r.kernel.org>, <davem@...emloft.net>,
<kernel-team@...com>, <ilias.apalodimas@...aro.org>,
brouer@...hat.com
Subject: Re: [net-next PATCH] page_pool: do not release pool until inflight
== 0.
On Mon, 11 Nov 2019 21:32:10 -0800
Jonathan Lemon <jonathan.lemon@...il.com> wrote:
> The page pool keeps track of the number of pages in flight, and
> it isn't safe to remove the pool until all pages are returned.
>
> Disallow removing the pool until all pages are back, so the pool
> is always available for page producers.
>
> Make the page pool responsible for its own delayed destruction
> instead of relying on XDP, so the page pool can be used without
> xdp.
Can you please change this to:
[... can be used without] xdp memory model.
> When all pages are returned, free the pool and notify xdp if the
> pool is registered with the xdp memory system. Have the callback
> perform a table walk since some drivers (cpsw) may share the pool
> among multiple xdp_rxq_info.
>
> Fixes: d956a048cd3f ("xdp: force mem allocator removal and periodic warning")
> Signed-off-by: Jonathan Lemon <jonathan.lemon@...il.com>
> ---
[...]
> diff --git a/net/core/page_pool.c b/net/core/page_pool.c
> index 5bc65587f1c4..bfe96326335d 100644
> --- a/net/core/page_pool.c
> +++ b/net/core/page_pool.c
[...]
Found an issue, see below.
> @@ -338,31 +333,10 @@ static void __page_pool_empty_ring(struct page_pool *pool)
> }
> }
>
> -static void __warn_in_flight(struct page_pool *pool)
> +static void page_pool_free(struct page_pool *pool)
> {
> - u32 release_cnt = atomic_read(&pool->pages_state_release_cnt);
> - u32 hold_cnt = READ_ONCE(pool->pages_state_hold_cnt);
> - s32 distance;
> -
> - distance = _distance(hold_cnt, release_cnt);
> -
> - /* Drivers should fix this, but only problematic when DMA is used */
> - WARN(1, "Still in-flight pages:%d hold:%u released:%u",
> - distance, hold_cnt, release_cnt);
> -}
> -
> -void __page_pool_free(struct page_pool *pool)
> -{
> - /* Only last user actually free/release resources */
> - if (!page_pool_put(pool))
> - return;
> -
> - WARN(pool->alloc.count, "API usage violation");
> - WARN(!ptr_ring_empty(&pool->ring), "ptr_ring is not empty");
> -
> - /* Can happen due to forced shutdown */
> - if (!__page_pool_safe_to_destroy(pool))
> - __warn_in_flight(pool);
> + if (pool->disconnect)
> + pool->disconnect(pool);
> ptr_ring_cleanup(&pool->ring, NULL);
>
> @@ -371,12 +345,8 @@ void __page_pool_free(struct page_pool *pool)
>
> kfree(pool);
> }
> -EXPORT_SYMBOL(__page_pool_free);
I don't think this is correct according to RCU.
Let me reproduce the resulting version of page_pool_free():
static void page_pool_free(struct page_pool *pool)
{
if (pool->disconnect)
pool->disconnect(pool);
ptr_ring_cleanup(&pool->ring, NULL);
if (pool->p.flags & PP_FLAG_DMA_MAP)
put_device(pool->p.dev);
kfree(pool);
}
The issue is that pool->disconnect() will call into
mem_allocator_disconnect() -> mem_xa_remove(), and mem_xa_remove() does
a RCU delayed free. And this function immediately does a kfree(pool).
I do know that we can ONLY reach this page_pool_free() function when
inflight == 0, but that can happen as soon as __page_pool_clean_page()
does the decrement, and after this trace_page_pool_state_release()
still has access to the page_pool object (thus, a hard-to-catch use-after-free).
> -/* Request to shutdown: release pages cached by page_pool, and check
> - * for in-flight pages
> - */
> -bool __page_pool_request_shutdown(struct page_pool *pool)
> +static void page_pool_scrub(struct page_pool *pool)
> {
> struct page *page;
>
> @@ -393,7 +363,64 @@ bool __page_pool_request_shutdown(struct page_pool *pool)
> * be in-flight.
> */
> __page_pool_empty_ring(pool);
> -
> - return __page_pool_safe_to_destroy(pool);
> }
> -EXPORT_SYMBOL(__page_pool_request_shutdown);
> +
> +static int page_pool_release(struct page_pool *pool)
> +{
> + int inflight;
> +
> + page_pool_scrub(pool);
> + inflight = page_pool_inflight(pool);
> + if (!inflight)
> + page_pool_free(pool);
> +
> + return inflight;
> +}
> +
> +static void page_pool_release_retry(struct work_struct *wq)
> +{
> + struct delayed_work *dwq = to_delayed_work(wq);
> + struct page_pool *pool = container_of(dwq, typeof(*pool), release_dw);
> + int inflight;
> +
> + inflight = page_pool_release(pool);
> + if (!inflight)
> + return;
> +
> + /* Periodic warning */
> + if (time_after_eq(jiffies, pool->defer_warn)) {
> + int sec = (s32)((u32)jiffies - (u32)pool->defer_start) / HZ;
> +
> + pr_warn("%s() stalled pool shutdown %d inflight %d sec\n",
> + __func__, inflight, sec);
> + pool->defer_warn = jiffies + DEFER_WARN_INTERVAL;
> + }
> +
> + /* Still not ready to be disconnected, retry later */
> + schedule_delayed_work(&pool->release_dw, DEFER_TIME);
> +}
> +
> +void page_pool_use_xdp_mem(struct page_pool *pool, void (*disconnect)(void *))
> +{
> + refcount_inc(&pool->user_cnt);
> + pool->disconnect = disconnect;
> +}
> +
> +void page_pool_destroy(struct page_pool *pool)
> +{
> + if (!pool)
> + return;
> +
> + if (!page_pool_put(pool))
> + return;
> +
> + if (!page_pool_release(pool))
> + return;
> +
> + pool->defer_start = jiffies;
> + pool->defer_warn = jiffies + DEFER_WARN_INTERVAL;
> +
> + INIT_DELAYED_WORK(&pool->release_dw, page_pool_release_retry);
> + schedule_delayed_work(&pool->release_dw, DEFER_TIME);
> +}
> +EXPORT_SYMBOL(page_pool_destroy);
> diff --git a/net/core/xdp.c b/net/core/xdp.c
> index 20781ad5f9c3..8e405abaf05a 100644
> --- a/net/core/xdp.c
> +++ b/net/core/xdp.c
> @@ -70,10 +70,6 @@ static void __xdp_mem_allocator_rcu_free(struct rcu_head *rcu)
>
> xa = container_of(rcu, struct xdp_mem_allocator, rcu);
>
> - /* Allocator have indicated safe to remove before this is called */
> - if (xa->mem.type == MEM_TYPE_PAGE_POOL)
> - page_pool_free(xa->page_pool);
> -
> /* Allow this ID to be reused */
> ida_simple_remove(&mem_id_pool, xa->mem.id);
>
> @@ -85,10 +81,41 @@ static void __xdp_mem_allocator_rcu_free(struct rcu_head *rcu)
> kfree(xa);
> }
>
> -static bool __mem_id_disconnect(int id, bool force)
> +static void mem_xa_remove(struct xdp_mem_allocator *xa)
> +{
> + trace_mem_disconnect(xa);
> +
> + mutex_lock(&mem_id_lock);
> +
> + if (!rhashtable_remove_fast(mem_id_ht, &xa->node, mem_id_rht_params))
> + call_rcu(&xa->rcu, __xdp_mem_allocator_rcu_free);
RCU delayed free.
> +
> + mutex_unlock(&mem_id_lock);
> +}
> +
> +static void mem_allocator_disconnect(void *allocator)
> +{
> + struct xdp_mem_allocator *xa;
> + struct rhashtable_iter iter;
> +
> + rhashtable_walk_enter(mem_id_ht, &iter);
> + do {
> + rhashtable_walk_start(&iter);
> +
> + while ((xa = rhashtable_walk_next(&iter)) && !IS_ERR(xa)) {
> + if (xa->allocator == allocator)
> + mem_xa_remove(xa);
> + }
> +
> + rhashtable_walk_stop(&iter);
> +
> + } while (xa == ERR_PTR(-EAGAIN));
> + rhashtable_walk_exit(&iter);
> +}
>
> +static void mem_id_disconnect(int id)
> {
> struct xdp_mem_allocator *xa;
> - bool safe_to_remove = true;
>
> mutex_lock(&mem_id_lock);
>
> @@ -96,51 +123,15 @@ static bool __mem_id_disconnect(int id, bool force)
> if (!xa) {
> mutex_unlock(&mem_id_lock);
> WARN(1, "Request remove non-existing id(%d), driver bug?", id);
> - return true;
> + return;
> }
> - xa->disconnect_cnt++;
>
> - /* Detects in-flight packet-pages for page_pool */
> - if (xa->mem.type == MEM_TYPE_PAGE_POOL)
> - safe_to_remove = page_pool_request_shutdown(xa->page_pool);
> + trace_mem_disconnect(xa);
>
> - trace_mem_disconnect(xa, safe_to_remove, force);
> -
> - if ((safe_to_remove || force) &&
> - !rhashtable_remove_fast(mem_id_ht, &xa->node, mem_id_rht_params))
> + if (!rhashtable_remove_fast(mem_id_ht, &xa->node, mem_id_rht_params))
> call_rcu(&xa->rcu, __xdp_mem_allocator_rcu_free);
>
> mutex_unlock(&mem_id_lock);
> - return (safe_to_remove|force);
> -}
[...]
--
Best regards,
Jesper Dangaard Brouer
MSc.CS, Principal Kernel Engineer at Red Hat
LinkedIn: http://www.linkedin.com/in/brouer
Powered by blists - more mailing lists