Message-ID: <157323722276.10408.11333995838112864686.stgit@firesoul>
Date:   Fri, 08 Nov 2019 19:20:22 +0100
From:   Jesper Dangaard Brouer <brouer@...hat.com>
To:     unlisted-recipients:; (no To-header on input)
Cc:     Toke Høiland-Jørgensen <toke@...hat.com>,
        netdev@...r.kernel.org,
        Ilias Apalodimas <ilias.apalodimas@...aro.org>,
        Jesper Dangaard Brouer <brouer@...hat.com>,
        Saeed Mahameed <saeedm@...lanox.com>,
        Matteo Croce <mcroce@...hat.com>,
        Jonathan Lemon <jonathan.lemon@...il.com>,
        Lorenzo Bianconi <lorenzo@...nel.org>,
        Tariq Toukan <tariqt@...lanox.com>
Subject: [net-next v1 PATCH 1/2] xdp: revert forced mem allocator removal
 for page_pool

Forced removal of the XDP mem allocator, specifically when related to page_pool,
turned out to be the wrong approach.  Special thanks to Jonathan Lemon for
convincing me. This patch is a partial revert of commit d956a048cd3f
("xdp: force mem allocator removal and periodic warning").

It is much better to provide a guarantee that the page_pool object stays valid
until the number of 'inflight' pages reaches zero, at which point it is safe to
remove.
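
The guarantee boils down to simple counter accounting: every page leaving the
pool bumps a hold counter, every page coming back bumps a release counter, and
teardown may only complete once the two match. A minimal stand-alone sketch of
that idea (hypothetical names, not the real page_pool fields; u32 wrap-around
is handled the same way _distance() does in the hunk below):

#include <stdint.h>
#include <stdbool.h>

struct pool_counters {
	uint32_t hold_cnt;	/* pages handed out to the driver/stack */
	uint32_t release_cnt;	/* pages returned to the pool */
};

/* Signed distance copes with u32 wrap-around */
static int32_t inflight(const struct pool_counters *c)
{
	return (int32_t)(c->hold_cnt - c->release_cnt);
}

static bool safe_to_destroy(const struct pool_counters *c)
{
	return inflight(c) == 0; /* only then may the object be freed */
}

int main(void)
{
	struct pool_counters c = { .hold_cnt = 5, .release_cnt = 3 };

	/* Two pages still in-flight: exits non-zero, not safe to destroy */
	return safe_to_destroy(&c) ? 0 : 1;
}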

We keep the periodic warning via a work-queue, but increase the interval to
10 minutes (600 seconds, matching DEFER_WARN_INTERVAL below). The purpose is to
catch bugs where inflight pages/packets never reach zero, indicating some kind
of leak. Such bugs have been observed while converting drivers over to the
page_pool API.
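
For reference, the retry scheduling in mem_id_disconnect_defer_retry() below
backs off linearly with the number of disconnect attempts and is capped at the
warn interval. A purely illustrative sketch of the resulting schedule (plain C
with seconds instead of jiffies; the real code uses msecs_to_jiffies() and
schedule_delayed_work()):

#include <stdio.h>

#define DEFER_TIME_SEC	1UL	/* base retry delay */
#define DEFER_WARN_SEC	600UL	/* cap == periodic warn interval */

/* Linear back-off: 1s, 2s, 3s, ... capped at 600s */
static unsigned long next_retry_sec(unsigned int disconnect_cnt)
{
	unsigned long t = DEFER_TIME_SEC * disconnect_cnt;

	return t < DEFER_WARN_SEC ? t : DEFER_WARN_SEC;
}

int main(void)
{
	for (unsigned int i = 1; i <= 5; i++)
		printf("attempt %u -> retry in %lu s\n", i, next_retry_sec(i));
	printf("attempt 1000 -> retry in %lu s\n", next_retry_sec(1000));
	return 0;
}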

Details on when to crash the kernel: if the page_pool API is misused and
somehow __page_pool_free() is invoked while there are still inflight frames,
then (as before) a WARN() is triggered rather than a BUG(). This can
potentially lead to a use-after-free, which we try to catch by poisoning the
page_pool object memory with NULL pointers. This way the WARN() pinpoints the
driver that (most likely) freed the page_pool prematurely, while the crash-dump
for an inflight page/packet shows who to blame for the late return.
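
The poisoning trick itself is generic: before the object memory is released,
key internal pointers are overwritten with NULL so that a late caller takes an
immediate, attributable NULL dereference instead of silently corrupting freed
memory. A minimal stand-alone sketch of the idea (hypothetical struct, not the
page_pool layout; the actual fields poisoned are in the __page_pool_free() hunk
below):

#include <stddef.h>

/* Hypothetical object with an internal queue, standing in for the
 * ptr_ring / alloc cache inside struct page_pool.
 */
struct obj {
	void **queue;
	unsigned int count;
};

static void obj_poison_before_free(struct obj *o)
{
	/* Any use-after-free now faults at the point of misuse */
	o->queue = NULL;
	o->count = 0;
}

int main(void)
{
	void *slot = NULL;
	struct obj o = { .queue = &slot, .count = 1 };

	obj_poison_before_free(&o);
	/* A later o.queue[0] access would now crash immediately */
	return 0;
}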

Fixes: d956a048cd3f ("xdp: force mem allocator removal and periodic warning")
Signed-off-by: Jesper Dangaard Brouer <brouer@...hat.com>
---
 include/trace/events/xdp.h |   35 +++--------------------------------
 net/core/page_pool.c       |    8 ++++++--
 net/core/xdp.c             |   36 +++++++++++++-----------------------
 3 files changed, 22 insertions(+), 57 deletions(-)

diff --git a/include/trace/events/xdp.h b/include/trace/events/xdp.h
index c7e3c9c5bad3..a3ead2b1f00e 100644
--- a/include/trace/events/xdp.h
+++ b/include/trace/events/xdp.h
@@ -318,9 +318,9 @@ __MEM_TYPE_MAP(__MEM_TYPE_TP_FN)
 TRACE_EVENT(mem_disconnect,
 
 	TP_PROTO(const struct xdp_mem_allocator *xa,
-		 bool safe_to_remove, bool force),
+		 bool safe_to_remove),
 
-	TP_ARGS(xa, safe_to_remove, force),
+	TP_ARGS(xa, safe_to_remove),
 
 	TP_STRUCT__entry(
 		__field(const struct xdp_mem_allocator *,	xa)
@@ -328,7 +328,6 @@ TRACE_EVENT(mem_disconnect,
 		__field(u32,		mem_type)
 		__field(const void *,	allocator)
 		__field(bool,		safe_to_remove)
-		__field(bool,		force)
 		__field(int,		disconnect_cnt)
 	),
 
@@ -338,17 +337,15 @@ TRACE_EVENT(mem_disconnect,
 		__entry->mem_type	= xa->mem.type;
 		__entry->allocator	= xa->allocator;
 		__entry->safe_to_remove	= safe_to_remove;
-		__entry->force		= force;
 		__entry->disconnect_cnt	= xa->disconnect_cnt;
 	),
 
 	TP_printk("mem_id=%d mem_type=%s allocator=%p"
-		  " safe_to_remove=%s force=%s disconnect_cnt=%d",
+		  " safe_to_remove=%s disconnect_cnt=%d",
 		  __entry->mem_id,
 		  __print_symbolic(__entry->mem_type, __MEM_TYPE_SYM_TAB),
 		  __entry->allocator,
 		  __entry->safe_to_remove ? "true" : "false",
-		  __entry->force ? "true" : "false",
 		  __entry->disconnect_cnt
 	)
 );
@@ -387,32 +384,6 @@ TRACE_EVENT(mem_connect,
 	)
 );
 
-TRACE_EVENT(mem_return_failed,
-
-	TP_PROTO(const struct xdp_mem_info *mem,
-		 const struct page *page),
-
-	TP_ARGS(mem, page),
-
-	TP_STRUCT__entry(
-		__field(const struct page *,	page)
-		__field(u32,		mem_id)
-		__field(u32,		mem_type)
-	),
-
-	TP_fast_assign(
-		__entry->page		= page;
-		__entry->mem_id		= mem->id;
-		__entry->mem_type	= mem->type;
-	),
-
-	TP_printk("mem_id=%d mem_type=%s page=%p",
-		  __entry->mem_id,
-		  __print_symbolic(__entry->mem_type, __MEM_TYPE_SYM_TAB),
-		  __entry->page
-	)
-);
-
 #endif /* _TRACE_XDP_H */
 
 #include <trace/define_trace.h>
diff --git a/net/core/page_pool.c b/net/core/page_pool.c
index 5bc65587f1c4..226f2eb30418 100644
--- a/net/core/page_pool.c
+++ b/net/core/page_pool.c
@@ -346,7 +346,7 @@ static void __warn_in_flight(struct page_pool *pool)
 
 	distance = _distance(hold_cnt, release_cnt);
 
-	/* Drivers should fix this, but only problematic when DMA is used */
+	/* BUG but warn as kernel should crash later */
 	WARN(1, "Still in-flight pages:%d hold:%u released:%u",
 	     distance, hold_cnt, release_cnt);
 }
@@ -360,12 +360,16 @@ void __page_pool_free(struct page_pool *pool)
 	WARN(pool->alloc.count, "API usage violation");
 	WARN(!ptr_ring_empty(&pool->ring), "ptr_ring is not empty");
 
-	/* Can happen due to forced shutdown */
 	if (!__page_pool_safe_to_destroy(pool))
 		__warn_in_flight(pool);
 
 	ptr_ring_cleanup(&pool->ring, NULL);
 
+	/* Make sure kernel will crash on use-after-free */
+	pool->ring.queue = NULL;
+	pool->alloc.cache[PP_ALLOC_CACHE_SIZE - 1] = NULL;
+	pool->alloc.count = PP_ALLOC_CACHE_SIZE;
+
 	if (pool->p.flags & PP_FLAG_DMA_MAP)
 		put_device(pool->p.dev);
 
diff --git a/net/core/xdp.c b/net/core/xdp.c
index 20781ad5f9c3..8673f199d9f4 100644
--- a/net/core/xdp.c
+++ b/net/core/xdp.c
@@ -85,7 +85,7 @@ static void __xdp_mem_allocator_rcu_free(struct rcu_head *rcu)
 	kfree(xa);
 }
 
-static bool __mem_id_disconnect(int id, bool force)
+static bool __mem_id_disconnect(int id)
 {
 	struct xdp_mem_allocator *xa;
 	bool safe_to_remove = true;
@@ -104,30 +104,26 @@ static bool __mem_id_disconnect(int id, bool force)
 	if (xa->mem.type == MEM_TYPE_PAGE_POOL)
 		safe_to_remove = page_pool_request_shutdown(xa->page_pool);
 
-	trace_mem_disconnect(xa, safe_to_remove, force);
+	trace_mem_disconnect(xa, safe_to_remove);
 
-	if ((safe_to_remove || force) &&
+	if ((safe_to_remove) &&
 	    !rhashtable_remove_fast(mem_id_ht, &xa->node, mem_id_rht_params))
 		call_rcu(&xa->rcu, __xdp_mem_allocator_rcu_free);
 
 	mutex_unlock(&mem_id_lock);
-	return (safe_to_remove|force);
+	return safe_to_remove;
 }
 
-#define DEFER_TIME (msecs_to_jiffies(1000))
-#define DEFER_WARN_INTERVAL (30 * HZ)
-#define DEFER_MAX_RETRIES 120
+#define DEFER_TIME (msecs_to_jiffies(1000UL))
+#define DEFER_WARN_INTERVAL (600UL * HZ)
 
 static void mem_id_disconnect_defer_retry(struct work_struct *wq)
 {
 	struct delayed_work *dwq = to_delayed_work(wq);
 	struct xdp_mem_allocator *xa = container_of(dwq, typeof(*xa), defer_wq);
-	bool force = false;
+	unsigned long defer_time;
 
-	if (xa->disconnect_cnt > DEFER_MAX_RETRIES)
-		force = true;
-
-	if (__mem_id_disconnect(xa->mem.id, force))
+	if (__mem_id_disconnect(xa->mem.id))
 		return;
 
 	/* Periodic warning */
@@ -140,7 +136,8 @@ static void mem_id_disconnect_defer_retry(struct work_struct *wq)
 	}
 
 	/* Still not ready to be disconnected, retry later */
-	schedule_delayed_work(&xa->defer_wq, DEFER_TIME);
+	defer_time = min(DEFER_WARN_INTERVAL, DEFER_TIME * xa->disconnect_cnt);
+	schedule_delayed_work(&xa->defer_wq, defer_time);
 }
 
 void xdp_rxq_info_unreg_mem_model(struct xdp_rxq_info *xdp_rxq)
@@ -161,7 +158,7 @@ void xdp_rxq_info_unreg_mem_model(struct xdp_rxq_info *xdp_rxq)
 	if (id == 0)
 		return;
 
-	if (__mem_id_disconnect(id, false))
+	if (__mem_id_disconnect(id))
 		return;
 
 	/* Could not disconnect, defer new disconnect attempt to later */
@@ -402,15 +399,8 @@ static void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct,
 		/* mem->id is valid, checked in xdp_rxq_info_reg_mem_model() */
 		xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params);
 		page = virt_to_head_page(data);
-		if (likely(xa)) {
-			napi_direct &= !xdp_return_frame_no_direct();
-			page_pool_put_page(xa->page_pool, page, napi_direct);
-		} else {
-			/* Hopefully stack show who to blame for late return */
-			WARN_ONCE(1, "page_pool gone mem.id=%d", mem->id);
-			trace_mem_return_failed(mem, page);
-			put_page(page);
-		}
+		napi_direct &= !xdp_return_frame_no_direct();
+		page_pool_put_page(xa->page_pool, page, napi_direct);
 		rcu_read_unlock();
 		break;
 	case MEM_TYPE_PAGE_SHARED:
