netdev - [net-next 14/15] net/mlx5e: RX, Break the wqe bulk refill in smaller chunks

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <20230328205623.142075-15-saeed@kernel.org>
Date:   Tue, 28 Mar 2023 13:56:22 -0700
From:   Saeed Mahameed <saeed@...nel.org>
To:     "David S. Miller" <davem@...emloft.net>,
        Jakub Kicinski <kuba@...nel.org>,
        Paolo Abeni <pabeni@...hat.com>,
        Eric Dumazet <edumazet@...gle.com>
Cc:     Saeed Mahameed <saeedm@...dia.com>, netdev@...r.kernel.org,
        Tariq Toukan <tariqt@...dia.com>,
        Jesper Dangaard Brouer <brouer@...hat.com>,
        Matthew Wilcox <willy@...radead.org>,
        Toke Høiland-Jørgensen <toke@...hat.com>,
        Ilias Apalodimas <ilias.apalodimas@...aro.org>,
        Dragos Tatulea <dtatulea@...dia.com>
Subject: [net-next 14/15] net/mlx5e: RX, Break the wqe bulk refill in smaller chunks

From: Dragos Tatulea <dtatulea@...dia.com>

To avoid overflowing the page pool's cache, don't release the
whole bulk at once, as it is usually larger than the cache refill
size. Instead, group release+alloc into cache refill units, so that
each unit's pages are released to the cache and then allocated
back from the cache.

A refill_unit variable is added as an iteration unit over the
wqe_bulk when doing release+alloc.

For a single-ring, single-core, default-MTU (1500) TCP stream
test, the number of pages allocated from the cache directly
(rx_pp_recycle_cached) increases from 0% to 52%:

+---------------------------------------------+
| Page Pool stats (/sec)  |  Before |   After |
+-------------------------+---------+---------+
|rx_pp_alloc_fast         | 2145422 | 2193802 |
|rx_pp_alloc_slow         |       2 |       0 |
|rx_pp_alloc_empty        |       2 |       0 |
|rx_pp_alloc_refill       |   34059 |   16634 |
|rx_pp_alloc_waive        |       0 |       0 |
|rx_pp_recycle_cached     |       0 | 1145818 |
|rx_pp_recycle_cache_full |       0 |       0 |
|rx_pp_recycle_ring       | 2179361 | 1064616 |
|rx_pp_recycle_ring_full  |     121 |       0 |
+---------------------------------------------+

With this patch, the performance for legacy rq for the above test is
back to baseline.

Signed-off-by: Dragos Tatulea <dtatulea@...dia.com>
Signed-off-by: Saeed Mahameed <saeedm@...dia.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en.h  |  1 +
 .../ethernet/mellanox/mlx5/core/en/params.c   |  8 +++++-
 .../net/ethernet/mellanox/mlx5/core/en_rx.c   | 28 +++++++++++++++++--
 3 files changed, 34 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index a087c433366b..ba615b74bb8e 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -671,6 +671,7 @@ struct mlx5e_rq_frags_info {
 	u8 num_frags;
 	u8 log_num_frags;
 	u16 wqe_bulk;
+	u16 refill_unit;
 	u8 wqe_index_mask;
 };
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/params.c b/drivers/net/ethernet/mellanox/mlx5/core/en/params.c
index 40218d77ef34..31f3c6e51d9e 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/params.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/params.c
@@ -674,6 +674,7 @@ static void mlx5e_rx_compute_wqe_bulk_params(struct mlx5e_params *params,
 	u32 bulk_bound_rq_size_in_bytes;
 	u32 sum_frag_strides = 0;
 	u32 wqe_bulk_in_bytes;
+	u16 split_factor;
 	u32 wqe_bulk;
 	int i;
 
@@ -702,6 +703,10 @@ static void mlx5e_rx_compute_wqe_bulk_params(struct mlx5e_params *params,
 	 * by older WQEs.
 	 */
 	info->wqe_bulk = max_t(u16, info->wqe_index_mask + 1, wqe_bulk);
+
+	split_factor = DIV_ROUND_UP(MAX_WQE_BULK_BYTES(params->xdp_prog),
+				    PP_ALLOC_CACHE_REFILL * PAGE_SIZE);
+	info->refill_unit = DIV_ROUND_UP(info->wqe_bulk, split_factor);
 }
 
 #define DEFAULT_FRAG_SIZE (2048)
@@ -817,7 +822,8 @@ static int mlx5e_build_rq_frags_info(struct mlx5_core_dev *mdev,
 	 */
 	mlx5e_rx_compute_wqe_bulk_params(params, info);
 
-	mlx5_core_dbg(mdev, "%s: wqe_bulk = %u\n", __func__, info->wqe_bulk);
+	mlx5_core_dbg(mdev, "%s: wqe_bulk = %u, wqe_bulk_refill_unit = %u\n",
+		      __func__, info->wqe_bulk, info->refill_unit);
 
 	info->log_num_frags = order_base_2(info->num_frags);
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
index 9c5270eb9dc6..df5dbef9e5ec 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
@@ -449,6 +449,31 @@ static int mlx5e_alloc_rx_wqes(struct mlx5e_rq *rq, u16 ix, int wqe_bulk)
 	return i;
 }
 
+static int mlx5e_refill_rx_wqes(struct mlx5e_rq *rq, u16 ix, int wqe_bulk)
+{
+	int remaining = wqe_bulk;
+	int i = 0;
+
+	/* The WQE bulk is split into smaller bulks that are sized
+	 * according to the page pool cache refill size to avoid overflowing
+	 * the page pool cache due to too many page releases at once.
+	 */
+	do {
+		int refill = min_t(u16, rq->wqe.info.refill_unit, remaining);
+		int alloc_count;
+
+		mlx5e_free_rx_wqes(rq, ix + i, refill);
+		alloc_count = mlx5e_alloc_rx_wqes(rq, ix + i, refill);
+		i += alloc_count;
+		if (unlikely(alloc_count != refill))
+			break;
+
+		remaining -= refill;
+	} while (remaining);
+
+	return i;
+}
+
 static inline void
 mlx5e_add_skb_frag(struct mlx5e_rq *rq, struct sk_buff *skb,
 		   struct page *page, u32 frag_offset, u32 len,
@@ -837,8 +862,7 @@ INDIRECT_CALLABLE_SCOPE bool mlx5e_post_rx_wqes(struct mlx5e_rq *rq)
 	wqe_bulk -= (head + wqe_bulk) & rq->wqe.info.wqe_index_mask;
 
 	if (!rq->xsk_pool) {
-		mlx5e_free_rx_wqes(rq, head, wqe_bulk);
-		count = mlx5e_alloc_rx_wqes(rq, head, wqe_bulk);
+		count = mlx5e_refill_rx_wqes(rq, head, wqe_bulk);
 	} else if (likely(!rq->xsk_pool->dma_need_sync)) {
 		mlx5e_xsk_free_rx_wqes(rq, head, wqe_bulk);
 		count = mlx5e_xsk_alloc_rx_wqes_batched(rq, head, wqe_bulk);
-- 
2.39.2