netdev - [RFC PATCH v4 11/16] io_uring: implement pp memory provider for zc rx

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20240312214430.2923019-12-dw@davidwei.uk>
Date: Tue, 12 Mar 2024 14:44:25 -0700
From: David Wei <dw@...idwei.uk>
To: io-uring@...r.kernel.org,
	netdev@...r.kernel.org
Cc: Jens Axboe <axboe@...nel.dk>,
	Pavel Begunkov <asml.silence@...il.com>,
	Jakub Kicinski <kuba@...nel.org>,
	Paolo Abeni <pabeni@...hat.com>,
	"David S. Miller" <davem@...emloft.net>,
	Eric Dumazet <edumazet@...gle.com>,
	Jesper Dangaard Brouer <hawk@...nel.org>,
	David Ahern <dsahern@...nel.org>,
	Mina Almasry <almasrymina@...gle.com>
Subject: [RFC PATCH v4 11/16] io_uring: implement pp memory provider for zc rx

From: Pavel Begunkov <asml.silence@...il.com>

Implement a new pp memory provider for io_uring zerocopy receive.

All buffers are backed by struct io_zc_rx_buf, which is a thin extension
of struct net_iov. Initially, all of them are unallocated and placed in
a spinlock-protected ->freelist. Then, they will be allocated via
the ->alloc_pages callback, which sets refcount to 1.

Later, buffers are either dropped by the net stack and recycled
back into the page pool / released by ->release_page, or, more likely,
transferred to userspace by posting a corresponding CQE and
elevating the refcount by IO_ZC_RX_UREF. When the user is done with a
buffer, it should be put into the refill ring.

Next time io_pp_zc_alloc_pages() runs it'll check the ring, put user
refs and ultimately grab buffers from there. That's done in the attached
napi context and so doesn't need any additional synchronisation. That is
the second hottest path after getting a buffer from the pp lockless cache.

Signed-off-by: Pavel Begunkov <asml.silence@...il.com>
Signed-off-by: David Wei <dw@...idwei.uk>
---
 include/linux/io_uring/net.h  |   5 +
 include/net/page_pool/types.h |   1 +
 io_uring/zc_rx.c              | 202 ++++++++++++++++++++++++++++++++++
 io_uring/zc_rx.h              |   5 +
 net/core/page_pool.c          |   2 +-
 5 files changed, 214 insertions(+), 1 deletion(-)

diff --git a/include/linux/io_uring/net.h b/include/linux/io_uring/net.h
index 05d5a6a97264..a225d7090b6b 100644
--- a/include/linux/io_uring/net.h
+++ b/include/linux/io_uring/net.h
@@ -12,6 +12,11 @@ struct io_zc_rx_buf {
 };
 
 #if defined(CONFIG_IO_URING)
+
+#if defined(CONFIG_PAGE_POOL)
+extern const struct memory_provider_ops io_uring_pp_zc_ops;
+#endif
+
 int io_uring_cmd_sock(struct io_uring_cmd *cmd, unsigned int issue_flags);
 
 #else
diff --git a/include/net/page_pool/types.h b/include/net/page_pool/types.h
index 347837b83d36..9e91f2cdbe61 100644
--- a/include/net/page_pool/types.h
+++ b/include/net/page_pool/types.h
@@ -227,6 +227,7 @@ netmem_ref page_pool_alloc_frag_netmem(struct page_pool *pool,
 struct page_pool *page_pool_create(const struct page_pool_params *params);
 struct page_pool *page_pool_create_percpu(const struct page_pool_params *params,
 					  int cpuid);
+void page_pool_set_pp_info(struct page_pool *pool, netmem_ref netmem);
 
 struct xdp_mem_info;
 
diff --git a/io_uring/zc_rx.c b/io_uring/zc_rx.c
index 326ae3fcc643..b2507df121fb 100644
--- a/io_uring/zc_rx.c
+++ b/io_uring/zc_rx.c
@@ -8,6 +8,7 @@
 #include <linux/nospec.h>
 #include <net/tcp.h>
 #include <net/af_unix.h>
+#include <trace/events/page_pool.h>
 
 #include <uapi/linux/io_uring.h>
 
@@ -357,4 +358,205 @@ int io_register_zc_rx_sock(struct io_ring_ctx *ctx,
 	return 0;
 }
 
+/* Map a net_iov back to the io_zc_rx_buf that embeds it as ->niov. */
+static inline struct io_zc_rx_buf *io_niov_to_buf(struct net_iov *niov)
+{
+	return container_of(niov, struct io_zc_rx_buf, niov);
+}
+
+/* Return the index of @buf within the @pool->bufs array. */
+static inline unsigned io_buf_pgid(struct io_zc_rx_pool *pool,
+				   struct io_zc_rx_buf *buf)
+{
+	return buf - pool->bufs;
+}
+
+/*
+ * Take one user reference on @buf by adding IO_ZC_RX_UREF to its
+ * pp_ref_count. No caller is visible in this patch, hence __maybe_unused.
+ */
+static __maybe_unused void io_zc_rx_get_buf_uref(struct io_zc_rx_buf *buf)
+{
+	atomic_long_add(IO_ZC_RX_UREF, &buf->niov.pp_ref_count);
+}
+
+/*
+ * Subtract @nr from @buf's pp_ref_count.
+ * Returns true if the count dropped to zero as a result.
+ */
+static bool io_zc_rx_buf_put(struct io_zc_rx_buf *buf, int nr)
+{
+	return atomic_long_sub_and_test(nr, &buf->niov.pp_ref_count);
+}
+
+/*
+ * Drop one user reference (IO_ZC_RX_UREF) from @buf, if it holds one.
+ *
+ * Returns true only when a user reference was dropped and that brought the
+ * refcount to zero. Returns false when the count is below IO_ZC_RX_UREF
+ * (no user ref to drop) or when other references remain after the drop.
+ *
+ * NOTE(review): the read and the subtraction are two separate atomic
+ * operations; this presumably relies on callers being serialised against
+ * each other -- confirm.
+ */
+static bool io_zc_rx_put_buf_uref(struct io_zc_rx_buf *buf)
+{
+	if (atomic_long_read(&buf->niov.pp_ref_count) < IO_ZC_RX_UREF)
+		return false;
+
+	return io_zc_rx_buf_put(buf, IO_ZC_RX_UREF);
+}
+
+/* Convert @buf to the netmem_ref that refers to its embedded net_iov. */
+static inline netmem_ref io_zc_buf_to_netmem(struct io_zc_rx_buf *buf)
+{
+	return net_iov_to_netmem(&buf->niov);
+}
+
+/*
+ * Initialise the page pool info of @buf for @pp and push it onto the
+ * pp->alloc.cache array.
+ *
+ * No capacity check is done here; the callers in this file stop filling
+ * once pp->alloc.count reaches PP_ALLOC_CACHE_REFILL.
+ */
+static inline void io_zc_add_pp_cache(struct page_pool *pp,
+				      struct io_zc_rx_buf *buf)
+{
+	netmem_ref netmem = io_zc_buf_to_netmem(buf);
+
+	page_pool_set_pp_info(pp, netmem);
+	pp->alloc.cache[pp->alloc.count++] = netmem;
+}
+
+/*
+ * Return how many refill ring entries are ready to be consumed: the ring
+ * tail (read with acquire semantics) minus our cached head, capped at the
+ * ring size ->rq_entries.
+ */
+static inline u32 io_zc_rx_rqring_entries(struct io_zc_rx_ifq *ifq)
+{
+	u32 entries;
+
+	entries = smp_load_acquire(&ifq->rq_ring->tail) - ifq->cached_rq_head;
+	/* presumably guards against a bogus tail written by the user -- confirm */
+	return min(entries, ifq->rq_entries);
+}
+
+/*
+ * Consume entries posted to the refill ring and move the buffers whose last
+ * reference was the user reference into the page pool alloc cache.
+ *
+ * For every entry the user reference (IO_ZC_RX_UREF) is dropped; the buffer
+ * is pushed to pp->alloc.cache only if that brought its refcount to zero.
+ * Entries naming a buffer outside the pool are consumed and ignored.
+ * Processing stops early once the cache holds PP_ALLOC_CACHE_REFILL
+ * entries. The new head is published with a store-release.
+ */
+static void io_zc_rx_ring_refill(struct page_pool *pp,
+				 struct io_zc_rx_ifq *ifq)
+{
+	unsigned int entries = io_zc_rx_rqring_entries(ifq);
+	unsigned int mask = ifq->rq_entries - 1;
+	struct io_zc_rx_pool *pool = ifq->pool;
+
+	if (unlikely(!entries))
+		return;
+
+	while (entries--) {
+		unsigned int rq_idx = ifq->cached_rq_head++ & mask;
+		struct io_uring_rbuf_rqe *rqe = &ifq->rqes[rq_idx];
+		/* The ring is filled by the user: read the offset exactly once. */
+		u32 pgid = READ_ONCE(rqe->off) / PAGE_SIZE;
+		struct io_zc_rx_buf *buf;
+
+		/* Never index pool->bufs[] with an unvalidated user value. */
+		if (unlikely(pgid >= pool->nr_bufs))
+			continue;
+		pgid = array_index_nospec(pgid, pool->nr_bufs);
+		buf = &pool->bufs[pgid];
+
+		if (!io_zc_rx_put_buf_uref(buf))
+			continue;
+		io_zc_add_pp_cache(pp, buf);
+		if (pp->alloc.count >= PP_ALLOC_CACHE_REFILL)
+			break;
+	}
+	smp_store_release(&ifq->rq_ring->head, ifq->cached_rq_head);
+}
+
+/*
+ * Slow-path refill: under ->freelist_lock, pop buffer ids off the pool
+ * freelist and push the buffers into the page pool alloc cache until either
+ * the freelist is empty or the cache holds PP_ALLOC_CACHE_REFILL entries.
+ * Each buffer handed out bumps pp->pages_state_hold_cnt and is traced.
+ */
+static void io_zc_rx_refill_slow(struct page_pool *pp, struct io_zc_rx_ifq *ifq)
+{
+	struct io_zc_rx_pool *pool = ifq->pool;
+
+	spin_lock_bh(&pool->freelist_lock);
+	for (;;) {
+		struct io_zc_rx_buf *buf;
+
+		if (!pool->free_count)
+			break;
+		if (pp->alloc.count >= PP_ALLOC_CACHE_REFILL)
+			break;
+
+		pool->free_count--;
+		buf = &pool->bufs[pool->freelist[pool->free_count]];
+
+		io_zc_add_pp_cache(pp, buf);
+		pp->pages_state_hold_cnt++;
+		trace_page_pool_state_hold(pp, io_zc_buf_to_netmem(buf),
+					   pp->pages_state_hold_cnt);
+	}
+	spin_unlock_bh(&pool->freelist_lock);
+}
+
+/*
+ * Return @buf to @pool's freelist by pushing its index, under
+ * ->freelist_lock.
+ */
+static void io_zc_rx_recycle_buf(struct io_zc_rx_pool *pool,
+				 struct io_zc_rx_buf *buf)
+{
+	spin_lock_bh(&pool->freelist_lock);
+	pool->freelist[pool->free_count++] = io_buf_pgid(pool, buf);
+	spin_unlock_bh(&pool->freelist_lock);
+}
+
+/*
+ * ->alloc_pages callback: hand out one buffer from the page pool alloc
+ * cache, refilling the cache first if it is empty.
+ *
+ * The refill ring is tried first; the spinlock-protected freelist is only
+ * used when the ring yielded nothing. Returns 0 when no buffer is
+ * available. @gfp is not used.
+ */
+static netmem_ref io_pp_zc_alloc_pages(struct page_pool *pp, gfp_t gfp)
+{
+	struct io_zc_rx_ifq *ifq = pp->mp_priv;
+
+	/* pp should already be ensuring that the cache is empty here */
+	if (likely(!pp->alloc.count)) {
+		io_zc_rx_ring_refill(pp, ifq);
+
+		if (unlikely(!pp->alloc.count)) {
+			io_zc_rx_refill_slow(pp, ifq);
+			if (!pp->alloc.count)
+				return 0;
+		}
+	}
+
+	return pp->alloc.cache[--pp->alloc.count];
+}
+
+/*
+ * ->release_page callback: drop one reference on the buffer behind @netmem
+ * and, if that was the last one, put the buffer back on the pool freelist.
+ *
+ * Warns once and bails out if @netmem is not a net_iov. Always returns
+ * false.
+ * NOTE(review): false presumably tells the page pool not to put the
+ * underlying page itself -- confirm against the memory provider contract.
+ */
+static bool io_pp_zc_release_page(struct page_pool *pp, netmem_ref netmem)
+{
+	struct io_zc_rx_ifq *ifq = pp->mp_priv;
+	struct io_zc_rx_buf *buf;
+
+	if (WARN_ON_ONCE(!netmem_is_net_iov(netmem)))
+		return false;
+
+	buf = io_niov_to_buf(netmem_to_net_iov(netmem));
+	if (!io_zc_rx_buf_put(buf, 1))
+		return false;
+
+	io_zc_rx_recycle_buf(ifq->pool, buf);
+	return false;
+}
+
+/*
+ * ->scrub callback: walk every buffer in the pool and drop an outstanding
+ * user reference if there is one. Buffers whose refcount reaches zero this
+ * way are returned to the freelist and accounted as released in
+ * pp->pages_state_release_cnt.
+ */
+static void io_pp_zc_scrub(struct page_pool *pp)
+{
+	struct io_zc_rx_ifq *ifq = pp->mp_priv;
+	struct io_zc_rx_pool *pool = ifq->pool;
+	int i;
+
+	for (i = 0; i < pool->nr_bufs; i++) {
+		struct io_zc_rx_buf *buf = &pool->bufs[i];
+		int count;
+
+		/* skip buffers without a user ref or with other refs left */
+		if (!io_zc_rx_put_buf_uref(buf))
+			continue;
+		io_zc_rx_recycle_buf(pool, buf);
+
+		count = atomic_inc_return_relaxed(&pp->pages_state_release_cnt);
+		trace_page_pool_state_release(pp, io_zc_buf_to_netmem(buf), count);
+	}
+}
+
+/*
+ * ->init callback: validate the page pool configuration and bind the pool
+ * to the ifq stored in pp->mp_priv.
+ *
+ * Returns -EINVAL if there is no ifq, the pool order is not 0, or no napi
+ * is attached; -EOPNOTSUPP if PP_FLAG_DMA_MAP or PP_FLAG_DMA_SYNC_DEV is
+ * requested. On success takes a reference on the io_uring ctx, records @pp
+ * in ifq->pp and returns 0.
+ */
+static int io_pp_zc_init(struct page_pool *pp)
+{
+	struct io_zc_rx_ifq *ifq = pp->mp_priv;
+
+	if (!ifq || pp->p.order != 0 || !pp->p.napi)
+		return -EINVAL;
+	if (pp->p.flags & (PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV))
+		return -EOPNOTSUPP;
+
+	percpu_ref_get(&ifq->ctx->refs);
+	ifq->pp = pp;
+	return 0;
+}
+
+/*
+ * ->destroy callback: unbind the page pool from the ifq and drop the ctx
+ * reference taken in io_pp_zc_init().
+ *
+ * Warns once if not every buffer is back on the freelist.
+ * NOTE(review): on that warning path the ctx reference is not dropped;
+ * presumably deliberate, to keep the ctx alive while buffers are still
+ * outstanding -- confirm.
+ */
+static void io_pp_zc_destroy(struct page_pool *pp)
+{
+	struct io_zc_rx_ifq *ifq = pp->mp_priv;
+	struct io_zc_rx_pool *pool = ifq->pool;
+
+	ifq->pp = NULL;
+
+	if (WARN_ON_ONCE(pool->free_count != pool->nr_bufs))
+		return;
+	percpu_ref_put(&ifq->ctx->refs);
+}
+
+/* Page pool memory provider callbacks for io_uring zerocopy receive. */
+const struct memory_provider_ops io_uring_pp_zc_ops = {
+	.alloc_pages		= io_pp_zc_alloc_pages,
+	.release_page		= io_pp_zc_release_page,
+	.init			= io_pp_zc_init,
+	.destroy		= io_pp_zc_destroy,
+	.scrub			= io_pp_zc_scrub,
+};
+EXPORT_SYMBOL(io_uring_pp_zc_ops);
+
+
 #endif
diff --git a/io_uring/zc_rx.h b/io_uring/zc_rx.h
index 466b2b8f9813..c02bf8cabc6c 100644
--- a/io_uring/zc_rx.h
+++ b/io_uring/zc_rx.h
@@ -10,6 +10,9 @@
 #define IO_ZC_IFQ_IDX_OFFSET		16
 #define IO_ZC_IFQ_IDX_MASK		((1U << IO_ZC_IFQ_IDX_OFFSET) - 1)
 
+#define IO_ZC_RX_UREF			0x10000
+#define IO_ZC_RX_KREF_MASK		(IO_ZC_RX_UREF - 1)
+
 struct io_zc_rx_pool {
 	struct io_zc_rx_ifq	*ifq;
 	struct io_zc_rx_buf	*bufs;
@@ -26,10 +29,12 @@ struct io_zc_rx_ifq {
 	struct io_ring_ctx		*ctx;
 	struct net_device		*dev;
 	struct io_zc_rx_pool		*pool;
+	struct page_pool		*pp;
 
 	struct io_uring			*rq_ring;
 	struct io_uring_rbuf_rqe 	*rqes;
 	u32				rq_entries;
+	u32				cached_rq_head;
 
 	/* hw rx descriptor ring id */
 	u32				if_rxq_id;
diff --git a/net/core/page_pool.c b/net/core/page_pool.c
index fc92e551ed13..f83ddbb4ebd8 100644
--- a/net/core/page_pool.c
+++ b/net/core/page_pool.c
@@ -460,7 +460,7 @@ static bool page_pool_dma_map(struct page_pool *pool, netmem_ref netmem)
 	return false;
 }
 
-static void page_pool_set_pp_info(struct page_pool *pool, netmem_ref netmem)
+void page_pool_set_pp_info(struct page_pool *pool, netmem_ref netmem)
 {
 	netmem_set_pp(netmem, pool);
 	netmem_or_pp_magic(netmem, PP_SIGNATURE);
-- 
2.43.0