lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <1746992290-568936-11-git-send-email-tariqt@nvidia.com>
Date: Sun, 11 May 2025 22:38:10 +0300
From: Tariq Toukan <tariqt@...dia.com>
To: "David S. Miller" <davem@...emloft.net>, Jakub Kicinski <kuba@...nel.org>,
	Paolo Abeni <pabeni@...hat.com>, Eric Dumazet <edumazet@...gle.com>, "Andrew
 Lunn" <andrew+netdev@...n.ch>
CC: Saeed Mahameed <saeedm@...dia.com>, Leon Romanovsky <leon@...nel.org>,
	Tariq Toukan <tariqt@...dia.com>, <netdev@...r.kernel.org>,
	<linux-rdma@...r.kernel.org>, <linux-kernel@...r.kernel.org>, Moshe Shemesh
	<moshe@...dia.com>, Mark Bloch <mbloch@...dia.com>, Vlad Dogaru
	<vdogaru@...dia.com>, Yevgeny Kliteynik <kliteyn@...dia.com>, Gal Pressman
	<gal@...dia.com>
Subject: [PATCH net-next 10/10] net/mlx5: HWS, dump bad completion details

From: Yevgeny Kliteynik <kliteyn@...dia.com>

Failing to insert/delete a rule should not happen. If it does happen,
it would be good to know at which stage it happened and what the
failure was. This patch adds printing of bad CQE details.

Signed-off-by: Yevgeny Kliteynik <kliteyn@...dia.com>
Reviewed-by: Vlad Dogaru <vdogaru@...dia.com>
Reviewed-by: Mark Bloch <mbloch@...dia.com>
Signed-off-by: Tariq Toukan <tariqt@...dia.com>
---
 .../mellanox/mlx5/core/steering/hws/send.c    | 122 +++++++++++++++++-
 .../mellanox/mlx5/core/steering/hws/send.h    |   1 +
 2 files changed, 120 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/send.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/send.c
index cb6abc4ab7df..c4b22be19a9b 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/send.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/send.c
@@ -344,18 +344,133 @@ hws_send_engine_update_rule_resize(struct mlx5hws_send_engine *queue,
 	}
 }
 
+static void hws_send_engine_dump_error_cqe(struct mlx5hws_send_engine *queue,
+					   struct mlx5hws_send_ring_priv *priv,
+					   struct mlx5_cqe64 *cqe)
+{
+	u8 wqe_opcode = cqe ? be32_to_cpu(cqe->sop_drop_qpn) >> 24 : 0;
+	struct mlx5hws_context *ctx = priv->rule->matcher->tbl->ctx;
+	u32 opcode = cqe ? get_cqe_opcode(cqe) : 0;
+	struct mlx5hws_rule *rule = priv->rule;
+
+	/* If something bad happens and lots of rules are failing, we don't
+	 * want to pollute dmesg. Print only the first bad cqe per engine,
+	 * the one that started the avalanche.
+	 */
+	if (queue->error_cqe_printed)
+		return;
+
+	queue->error_cqe_printed = true;
+
+	if (mlx5hws_rule_move_in_progress(rule))
+		mlx5hws_err(ctx,
+			    "--- rule 0x%08llx: error completion moving rule: phase %s, wqes left %d\n",
+			    HWS_PTR_TO_ID(rule),
+			    rule->resize_info->state ==
+			    MLX5HWS_RULE_RESIZE_STATE_WRITING ? "WRITING" :
+			    rule->resize_info->state ==
+			    MLX5HWS_RULE_RESIZE_STATE_DELETING ? "DELETING" :
+			    "UNKNOWN",
+			    rule->pending_wqes);
+	else
+		mlx5hws_err(ctx,
+			    "--- rule 0x%08llx: error completion %s (%d), wqes left %d\n",
+			    HWS_PTR_TO_ID(rule),
+			    rule->status ==
+			    MLX5HWS_RULE_STATUS_CREATING ? "CREATING" :
+			    rule->status ==
+			    MLX5HWS_RULE_STATUS_DELETING ? "DELETING" :
+			    rule->status ==
+			    MLX5HWS_RULE_STATUS_FAILING ? "FAILING" :
+			    rule->status ==
+			    MLX5HWS_RULE_STATUS_UPDATING ? "UPDATING" : "NA",
+			    rule->status,
+			    rule->pending_wqes);
+
+	mlx5hws_err(ctx, "    rule 0x%08llx: matcher 0x%llx %s\n",
+		    HWS_PTR_TO_ID(rule),
+		    HWS_PTR_TO_ID(rule->matcher),
+		    (rule->matcher->flags & MLX5HWS_MATCHER_FLAGS_ISOLATED) ?
+		    "(isolated)" : "");
+
+	if (!cqe) {
+		mlx5hws_err(ctx, "    rule 0x%08llx: no CQE\n",
+			    HWS_PTR_TO_ID(rule));
+		return;
+	}
+
+	mlx5hws_err(ctx, "    rule 0x%08llx: cqe->opcode      = %d %s\n",
+		    HWS_PTR_TO_ID(rule), opcode,
+		    opcode == MLX5_CQE_REQ ? "(MLX5_CQE_REQ)" :
+		    opcode == MLX5_CQE_REQ_ERR ? "(MLX5_CQE_REQ_ERR)" : " ");
+
+	if (opcode == MLX5_CQE_REQ_ERR) {
+		struct mlx5_err_cqe *err_cqe = (struct mlx5_err_cqe *)cqe;
+
+		mlx5hws_err(ctx,
+			    "    rule 0x%08llx:  |--- hw_error_syndrome = 0x%x\n",
+			    HWS_PTR_TO_ID(rule),
+			    err_cqe->rsvd1[16]);
+		mlx5hws_err(ctx,
+			    "    rule 0x%08llx:  |--- hw_syndrome_type = 0x%x\n",
+			    HWS_PTR_TO_ID(rule),
+			    err_cqe->rsvd1[17] >> 4);
+		mlx5hws_err(ctx,
+			    "    rule 0x%08llx:  |--- vendor_err_synd = 0x%x\n",
+			    HWS_PTR_TO_ID(rule),
+			    err_cqe->vendor_err_synd);
+		mlx5hws_err(ctx,
+			    "    rule 0x%08llx:  |--- syndrome = 0x%x\n",
+			    HWS_PTR_TO_ID(rule),
+			    err_cqe->syndrome);
+	}
+
+	mlx5hws_err(ctx,
+		    "    rule 0x%08llx: cqe->byte_cnt      = 0x%08x\n",
+		    HWS_PTR_TO_ID(rule), be32_to_cpu(cqe->byte_cnt));
+	mlx5hws_err(ctx,
+		    "    rule 0x%08llx:  |-- UPDATE STATUS = %s\n",
+		    HWS_PTR_TO_ID(rule),
+		    (be32_to_cpu(cqe->byte_cnt) & 0x80000000) ?
+		    "FAILURE" : "SUCCESS");
+	mlx5hws_err(ctx,
+		    "    rule 0x%08llx:  |------- SYNDROME = %s\n",
+		    HWS_PTR_TO_ID(rule),
+		    ((be32_to_cpu(cqe->byte_cnt) & 0x00000003) == 1) ?
+		    "SET_FLOW_FAIL" :
+		    ((be32_to_cpu(cqe->byte_cnt) & 0x00000003) == 2) ?
+		    "DISABLE_FLOW_FAIL" : "UNKNOWN");
+	mlx5hws_err(ctx,
+		    "    rule 0x%08llx: cqe->sop_drop_qpn  = 0x%08x\n",
+		    HWS_PTR_TO_ID(rule), be32_to_cpu(cqe->sop_drop_qpn));
+	mlx5hws_err(ctx,
+		    "    rule 0x%08llx:  |-send wqe opcode = 0x%02x %s\n",
+		    HWS_PTR_TO_ID(rule), wqe_opcode,
+		    wqe_opcode == MLX5HWS_WQE_OPCODE_TBL_ACCESS ?
+		    "(MLX5HWS_WQE_OPCODE_TBL_ACCESS)" : "(UNKNOWN)");
+	mlx5hws_err(ctx,
+		    "    rule 0x%08llx:  |------------ qpn = 0x%06x\n",
+		    HWS_PTR_TO_ID(rule),
+		    be32_to_cpu(cqe->sop_drop_qpn) & 0xffffff);
+}
+
 static void hws_send_engine_update_rule(struct mlx5hws_send_engine *queue,
 					struct mlx5hws_send_ring_priv *priv,
 					u16 wqe_cnt,
-					enum mlx5hws_flow_op_status *status)
+					enum mlx5hws_flow_op_status *status,
+					struct mlx5_cqe64 *cqe)
 {
 	priv->rule->pending_wqes--;
 
-	if (*status == MLX5HWS_FLOW_OP_ERROR) {
+	if (unlikely(*status == MLX5HWS_FLOW_OP_ERROR)) {
 		if (priv->retry_id) {
+			/* If there is a retry_id, then it's not an error yet,
+			 * retry to insert this rule in the collision RTC.
+			 */
 			hws_send_engine_retry_post_send(queue, priv, wqe_cnt);
 			return;
 		}
+		hws_send_engine_dump_error_cqe(queue, priv, cqe);
 		/* Some part of the rule failed */
 		priv->rule->status = MLX5HWS_RULE_STATUS_FAILING;
 		*priv->used_id = 0;
@@ -420,7 +535,8 @@ static void hws_send_engine_update(struct mlx5hws_send_engine *queue,
 
 	if (priv->user_data) {
 		if (priv->rule) {
-			hws_send_engine_update_rule(queue, priv, wqe_cnt, &status);
+			hws_send_engine_update_rule(queue, priv, wqe_cnt,
+						    &status, cqe);
 			/* Completion is provided on the last rule WQE */
 			if (priv->rule->pending_wqes)
 				return;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/send.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/send.h
index f833092235c1..3fb8e99309b2 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/send.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/send.h
@@ -140,6 +140,7 @@ struct mlx5hws_send_engine {
 	u16 used_entries;
 	u16 num_entries;
 	bool err;
+	bool error_cqe_printed;
 	struct mutex lock; /* Protects the send engine */
 };
 
-- 
2.31.1


Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ