[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20191112130832.6b3d69d5@carbon>
Date: Tue, 12 Nov 2019 13:08:32 +0100
From: Jesper Dangaard Brouer <brouer@...hat.com>
To: Jonathan Lemon <jonathan.lemon@...il.com>
Cc: <netdev@...r.kernel.org>, <davem@...emloft.net>,
<kernel-team@...com>, <ilias.apalodimas@...aro.org>,
brouer@...hat.com
Subject: Re: [net-next PATCH] page_pool: do not release pool until inflight
== 0.
On Mon, 11 Nov 2019 21:32:10 -0800
Jonathan Lemon <jonathan.lemon@...il.com> wrote:
> The page pool keeps track of the number of pages in flight, and
> it isn't safe to remove the pool until all pages are returned.
>
> Disallow removing the pool until all pages are back, so the pool
> is always available for page producers.
>
> Make the page pool responsible for its own delayed destruction
> instead of relying on XDP, so the page pool can be used without
> xdp.
Can you please change this to:
[... can be used without] xdp memory model.
> When all pages are returned, free the pool and notify xdp if the
> pool is registered with the xdp memory system. Have the callback
> perform a table walk since some drivers (cpsw) may share the pool
> among multiple xdp_rxq_info.
>
> Fixes: d956a048cd3f ("xdp: force mem allocator removal and periodic warning")
> Signed-off-by: Jonathan Lemon <jonathan.lemon@...il.com>
> ---
[...]
> diff --git a/net/core/page_pool.c b/net/core/page_pool.c
> index 5bc65587f1c4..bfe96326335d 100644
> --- a/net/core/page_pool.c
> +++ b/net/core/page_pool.c
[...]
Found an issue, see below.
> @@ -338,31 +333,10 @@ static void __page_pool_empty_ring(struct page_pool *pool)
> }
> }
>
> -static void __warn_in_flight(struct page_pool *pool)
> +static void page_pool_free(struct page_pool *pool)
> {
> - u32 release_cnt = atomic_read(&pool->pages_state_release_cnt);
> - u32 hold_cnt = READ_ONCE(pool->pages_state_hold_cnt);
> - s32 distance;
> -
> - distance = _distance(hold_cnt, release_cnt);
> -
> - /* Drivers should fix this, but only problematic when DMA is used */
> - WARN(1, "Still in-flight pages:%d hold:%u released:%u",
> - distance, hold_cnt, release_cnt);
> -}
> -
> -void __page_pool_free(struct page_pool *pool)
> -{
> - /* Only last user actually free/release resources */
> - if (!page_pool_put(pool))
> - return;
> -
> - WARN(pool->alloc.count, "API usage violation");
> - WARN(!ptr_ring_empty(&pool->ring), "ptr_ring is not empty");
> -
> - /* Can happen due to forced shutdown */
> - if (!__page_pool_safe_to_destroy(pool))
> - __warn_in_flight(pool);
> + if (pool->disconnect)
> + pool->disconnect(pool);
> ptr_ring_cleanup(&pool->ring, NULL);
>
> @@ -371,12 +345,8 @@ void __page_pool_free(struct page_pool *pool)
>
> kfree(pool);
> }
> -EXPORT_SYMBOL(__page_pool_free);
I don't think this is correct according to RCU.
Let me reproduce the resulting version of page_pool_free():
static void page_pool_free(struct page_pool *pool)
{
if (pool->disconnect)
pool->disconnect(pool);
ptr_ring_cleanup(&pool->ring, NULL);
if (pool->p.flags & PP_FLAG_DMA_MAP)
put_device(pool->p.dev);
kfree(pool);
}
The issue is that pool->disconnect() will call into
mem_allocator_disconnect() -> mem_xa_remove(), and mem_xa_remove() does
a RCU delayed free. And this function immediately does a kfree(pool).
I do know that we can ONLY reach this page_pool_free() function when
inflight == 0, but that can happen as soon as __page_pool_clean_page()
does the decrement, and after this trace_page_pool_state_release()
still has access to the page_pool object (thus, a hard-to-catch use-after-free).
> -/* Request to shutdown: release pages cached by page_pool, and check
> - * for in-flight pages
> - */
> -bool __page_pool_request_shutdown(struct page_pool *pool)
> +static void page_pool_scrub(struct page_pool *pool)
> {
> struct page *page;
>
> @@ -393,7 +363,64 @@ bool __page_pool_request_shutdown(struct page_pool *pool)
> * be in-flight.
> */
> __page_pool_empty_ring(pool);
> -
> - return __page_pool_safe_to_destroy(pool);
> }
> -EXPORT_SYMBOL(__page_pool_request_shutdown);
> +
> +static int page_pool_release(struct page_pool *pool)
> +{
> + int inflight;
> +
> + page_pool_scrub(pool);
> + inflight = page_pool_inflight(pool);
> + if (!inflight)
> + page_pool_free(pool);
> +
> + return inflight;
> +}
> +
> +static void page_pool_release_retry(struct work_struct *wq)
> +{
> + struct delayed_work *dwq = to_delayed_work(wq);
> + struct page_pool *pool = container_of(dwq, typeof(*pool), release_dw);
> + int inflight;
> +
> + inflight = page_pool_release(pool);
> + if (!inflight)
> + return;
> +
> + /* Periodic warning */
> + if (time_after_eq(jiffies, pool->defer_warn)) {
> + int sec = (s32)((u32)jiffies - (u32)pool->defer_start) / HZ;
> +
> + pr_warn("%s() stalled pool shutdown %d inflight %d sec\n",
> + __func__, inflight, sec);
> + pool->defer_warn = jiffies + DEFER_WARN_INTERVAL;
> + }
> +
> + /* Still not ready to be disconnected, retry later */
> + schedule_delayed_work(&pool->release_dw, DEFER_TIME);
> +}
> +
> +void page_pool_use_xdp_mem(struct page_pool *pool, void (*disconnect)(void *))
> +{
> + refcount_inc(&pool->user_cnt);
> + pool->disconnect = disconnect;
> +}
> +
> +void page_pool_destroy(struct page_pool *pool)
> +{
> + if (!pool)
> + return;
> +
> + if (!page_pool_put(pool))
> + return;
> +
> + if (!page_pool_release(pool))
> + return;
> +
> + pool->defer_start = jiffies;
> + pool->defer_warn = jiffies + DEFER_WARN_INTERVAL;
> +
> + INIT_DELAYED_WORK(&pool->release_dw, page_pool_release_retry);
> + schedule_delayed_work(&pool->release_dw, DEFER_TIME);
> +}
> +EXPORT_SYMBOL(page_pool_destroy);
> diff --git a/net/core/xdp.c b/net/core/xdp.c
> index 20781ad5f9c3..8e405abaf05a 100644
> --- a/net/core/xdp.c
> +++ b/net/core/xdp.c
> @@ -70,10 +70,6 @@ static void __xdp_mem_allocator_rcu_free(struct rcu_head *rcu)
>
> xa = container_of(rcu, struct xdp_mem_allocator, rcu);
>
> - /* Allocator have indicated safe to remove before this is called */
> - if (xa->mem.type == MEM_TYPE_PAGE_POOL)
> - page_pool_free(xa->page_pool);
> -
> /* Allow this ID to be reused */
> ida_simple_remove(&mem_id_pool, xa->mem.id);
>
> @@ -85,10 +81,41 @@ static void __xdp_mem_allocator_rcu_free(struct rcu_head *rcu)
> kfree(xa);
> }
>
> -static bool __mem_id_disconnect(int id, bool force)
> +static void mem_xa_remove(struct xdp_mem_allocator *xa)
> +{
> + trace_mem_disconnect(xa);
> +
> + mutex_lock(&mem_id_lock);
> +
> + if (!rhashtable_remove_fast(mem_id_ht, &xa->node, mem_id_rht_params))
> + call_rcu(&xa->rcu, __xdp_mem_allocator_rcu_free);
RCU delayed free.
> +
> + mutex_unlock(&mem_id_lock);
> +}
> +
> +static void mem_allocator_disconnect(void *allocator)
> +{
> + struct xdp_mem_allocator *xa;
> + struct rhashtable_iter iter;
> +
> + rhashtable_walk_enter(mem_id_ht, &iter);
> + do {
> + rhashtable_walk_start(&iter);
> +
> + while ((xa = rhashtable_walk_next(&iter)) && !IS_ERR(xa)) {
> + if (xa->allocator == allocator)
> + mem_xa_remove(xa);
> + }
> +
> + rhashtable_walk_stop(&iter);
> +
> + } while (xa == ERR_PTR(-EAGAIN));
> + rhashtable_walk_exit(&iter);
> +}
>
> +static void mem_id_disconnect(int id)
> {
> struct xdp_mem_allocator *xa;
> - bool safe_to_remove = true;
>
> mutex_lock(&mem_id_lock);
>
> @@ -96,51 +123,15 @@ static bool __mem_id_disconnect(int id, bool force)
> if (!xa) {
> mutex_unlock(&mem_id_lock);
> WARN(1, "Request remove non-existing id(%d), driver bug?", id);
> - return true;
> + return;
> }
> - xa->disconnect_cnt++;
>
> - /* Detects in-flight packet-pages for page_pool */
> - if (xa->mem.type == MEM_TYPE_PAGE_POOL)
> - safe_to_remove = page_pool_request_shutdown(xa->page_pool);
> + trace_mem_disconnect(xa);
>
> - trace_mem_disconnect(xa, safe_to_remove, force);
> -
> - if ((safe_to_remove || force) &&
> - !rhashtable_remove_fast(mem_id_ht, &xa->node, mem_id_rht_params))
> + if (!rhashtable_remove_fast(mem_id_ht, &xa->node, mem_id_rht_params))
> call_rcu(&xa->rcu, __xdp_mem_allocator_rcu_free);
>
> mutex_unlock(&mem_id_lock);
> - return (safe_to_remove|force);
> -}
[...]
--
Best regards,
Jesper Dangaard Brouer
MSc.CS, Principal Kernel Engineer at Red Hat
LinkedIn: http://www.linkedin.com/in/brouer
Powered by blists - more mailing lists