[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <1763415729-1238421-3-git-send-email-tariqt@nvidia.com>
Date: Mon, 17 Nov 2025 23:42:06 +0200
From: Tariq Toukan <tariqt@...dia.com>
To: Eric Dumazet <edumazet@...gle.com>, Jakub Kicinski <kuba@...nel.org>,
Paolo Abeni <pabeni@...hat.com>, Andrew Lunn <andrew+netdev@...n.ch>, "David
S. Miller" <davem@...emloft.net>
CC: Saeed Mahameed <saeedm@...dia.com>, Leon Romanovsky <leon@...nel.org>,
Tariq Toukan <tariqt@...dia.com>, Mark Bloch <mbloch@...dia.com>, "Richard
Cochran" <richardcochran@...il.com>, <netdev@...r.kernel.org>,
<linux-rdma@...r.kernel.org>, <linux-kernel@...r.kernel.org>, Gal Pressman
<gal@...dia.com>, Moshe Shemesh <moshe@...dia.com>, Dragos Tatulea
<dtatulea@...dia.com>, Carolina Jubran <cjubran@...dia.com>
Subject: [PATCH net-next 2/5] net/mlx5e: Recover SQ on excessive PTP TX timestamp delta
From: Carolina Jubran <cjubran@...dia.com>
Extend the TX timestamp handler to recover the SQ when the difference
between the port and CQE TX timestamps is abnormally large.
The current logic aborts timestamp delivery if the delta exceeds
1/128 seconds, which matches the maximum expected packet interval in
ptp4l. A larger delta makes the timestamps unreliable.
This change adds recovery if the delta exceeds 0.5 seconds. Such a
large gap should not occur in normal operation and indicates that
firmware is stuck or metadata tracking is out of sync, leading to stale
or mismatched timestamps. Recovering the SQ ensures forward progress
and avoids silently dropping invalid timestamps.
The timestamp handler now takes mlx5e_ptpsq directly to access both CQ
stats and the recovery state.
Signed-off-by: Carolina Jubran <cjubran@...dia.com>
Reviewed-by: Shahar Shitrit <shshitrit@...dia.com>
Signed-off-by: Tariq Toukan <tariqt@...dia.com>
---
.../net/ethernet/mellanox/mlx5/core/en/ptp.c | 21 +++++++++++++------
.../net/ethernet/mellanox/mlx5/core/en/ptp.h | 2 +-
.../net/ethernet/mellanox/mlx5/core/en_tx.c | 2 +-
3 files changed, 17 insertions(+), 8 deletions(-)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c b/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c
index 12e10feb30f0..424f8a2728a3 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c
@@ -82,7 +82,7 @@ static struct mlx5e_skb_cb_hwtstamp *mlx5e_skb_cb_get_hwts(struct sk_buff *skb)
}
static void mlx5e_skb_cb_hwtstamp_tx(struct sk_buff *skb,
- struct mlx5e_ptp_cq_stats *cq_stats)
+ struct mlx5e_ptpsq *ptpsq)
{
struct skb_shared_hwtstamps hwts = {};
ktime_t diff;
@@ -92,8 +92,17 @@ static void mlx5e_skb_cb_hwtstamp_tx(struct sk_buff *skb,
/* Maximal allowed diff is 1 / 128 second */
if (diff > (NSEC_PER_SEC >> 7)) {
- cq_stats->abort++;
- cq_stats->abort_abs_diff_ns += diff;
+ struct mlx5e_txqsq *sq = &ptpsq->txqsq;
+
+ ptpsq->cq_stats->abort++;
+ ptpsq->cq_stats->abort_abs_diff_ns += diff;
+ if (diff > (NSEC_PER_SEC >> 1) &&
+ !test_and_set_bit(MLX5E_SQ_STATE_RECOVERING, &sq->state)) {
+ netdev_warn(sq->channel->netdev,
+ "PTP TX timestamp difference between CQE and port exceeds threshold: %lld ns, recovering SQ %u\n",
+ (s64)diff, sq->sqn);
+ queue_work(sq->priv->wq, &ptpsq->report_unhealthy_work);
+ }
return;
}
@@ -103,7 +112,7 @@ static void mlx5e_skb_cb_hwtstamp_tx(struct sk_buff *skb,
void mlx5e_skb_cb_hwtstamp_handler(struct sk_buff *skb, int hwtstamp_type,
ktime_t hwtstamp,
- struct mlx5e_ptp_cq_stats *cq_stats)
+ struct mlx5e_ptpsq *ptpsq)
{
switch (hwtstamp_type) {
case (MLX5E_SKB_CB_CQE_HWTSTAMP):
@@ -121,7 +130,7 @@ void mlx5e_skb_cb_hwtstamp_handler(struct sk_buff *skb, int hwtstamp_type,
!mlx5e_skb_cb_get_hwts(skb)->port_hwtstamp)
return;
- mlx5e_skb_cb_hwtstamp_tx(skb, cq_stats);
+ mlx5e_skb_cb_hwtstamp_tx(skb, ptpsq);
memset(skb->cb, 0, sizeof(struct mlx5e_skb_cb_hwtstamp));
}
@@ -209,7 +218,7 @@ static void mlx5e_ptp_handle_ts_cqe(struct mlx5e_ptpsq *ptpsq,
hwtstamp = mlx5e_cqe_ts_to_ns(sq->ptp_cyc2time, sq->clock, get_cqe_ts(cqe));
mlx5e_skb_cb_hwtstamp_handler(skb, MLX5E_SKB_CB_PORT_HWTSTAMP,
- hwtstamp, ptpsq->cq_stats);
+ hwtstamp, ptpsq);
ptpsq->cq_stats->cqe++;
mlx5e_ptpsq_mark_ts_cqes_undelivered(ptpsq, hwtstamp);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.h b/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.h
index 1c0e0a86a9ac..2a457a2ed707 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.h
@@ -147,7 +147,7 @@ enum {
void mlx5e_skb_cb_hwtstamp_handler(struct sk_buff *skb, int hwtstamp_type,
ktime_t hwtstamp,
- struct mlx5e_ptp_cq_stats *cq_stats);
+ struct mlx5e_ptpsq *ptpsq);
void mlx5e_skb_cb_hwtstamp_init(struct sk_buff *skb);
#endif /* __MLX5_EN_PTP_H__ */
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c
index 2702b3885f06..14884b9ea7f3 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c
@@ -755,7 +755,7 @@ static void mlx5e_consume_skb(struct mlx5e_txqsq *sq, struct sk_buff *skb,
hwts.hwtstamp = mlx5e_cqe_ts_to_ns(sq->ptp_cyc2time, sq->clock, ts);
if (sq->ptpsq) {
mlx5e_skb_cb_hwtstamp_handler(skb, MLX5E_SKB_CB_CQE_HWTSTAMP,
- hwts.hwtstamp, sq->ptpsq->cq_stats);
+ hwts.hwtstamp, sq->ptpsq);
} else {
skb_tstamp_tx(skb, &hwts);
sq->stats->timestamps++;
--
2.31.1
Powered by blists - more mailing lists