lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <1762939749-1165658-3-git-send-email-tariqt@nvidia.com>
Date: Wed, 12 Nov 2025 11:29:05 +0200
From: Tariq Toukan <tariqt@...dia.com>
To: Eric Dumazet <edumazet@...gle.com>, Jakub Kicinski <kuba@...nel.org>,
	Paolo Abeni <pabeni@...hat.com>, Andrew Lunn <andrew+netdev@...n.ch>, "David
 S. Miller" <davem@...emloft.net>
CC: Saeed Mahameed <saeedm@...dia.com>, Leon Romanovsky <leon@...nel.org>,
	Tariq Toukan <tariqt@...dia.com>, Mark Bloch <mbloch@...dia.com>, "Alexei
 Starovoitov" <ast@...nel.org>, Daniel Borkmann <daniel@...earbox.net>,
	"Jesper Dangaard Brouer" <hawk@...nel.org>, John Fastabend
	<john.fastabend@...il.com>, <netdev@...r.kernel.org>,
	<linux-rdma@...r.kernel.org>, <linux-kernel@...r.kernel.org>,
	<bpf@...r.kernel.org>, Gal Pressman <gal@...dia.com>, Leon Romanovsky
	<leonro@...dia.com>, Moshe Shemesh <moshe@...dia.com>, William Tu
	<witu@...dia.com>, Dragos Tatulea <dtatulea@...dia.com>, Nimrod Oren
	<noren@...dia.com>, Alex Lazar <alazar@...dia.com>
Subject: [PATCH net-next 2/6] net/mlx5e: Use regular ICOSQ for triggering NAPI

From: William Tu <witu@...dia.com>

Before the cited commit, ICOSQ is used to post NOP WQE to trigger
hardware interrupt and start NAPI, but this mechanism suffers from
a race condition: mlx5e_alloc_rx_mpwqe may post UMR WQEs to ICOSQ
_before_ NOP WQE is posted. The cited commit fixes the issue by
replacing ICOSQ with async ICOSQ, as a new way to post the NOP WQE
to trigger the hardware interrupt and NAPI.

The patch changes it back by replacing async ICOSQ with regular
ICOSQ, for the purpose of saving memory in later patches, and solves
the issue by adding a new SQ state, MLX5E_SQ_STATE_LOCK_NEEDED
for syncing the start of NAPI.

What it does:
- Switch trigger path from async ICOSQ to regular ICOSQ to reduce
  need for async SQ.
- Introduce MLX5E_SQ_STATE_LOCK_NEEDED and mlx5e_icosq_sync_lock(),
  unlock() to prevent the race where UMR WQEs could be posted before
  the NOP WQE used to trigger NAPI.
- Use synchronize_net() once per trigger cycle to quiesce in-flight
  softirqs before serializing the NOP WQE and any UMR postings via
  the ICOSQ lock.
- Wrap ICOSQ UMR posting in en_rx.c and xsk/rx.c with the new
  conditional lock.

Signed-off-by: William Tu <witu@...dia.com>
Signed-off-by: Tariq Toukan <tariqt@...dia.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en.h  | 40 ++++++++++++++++++-
 .../mellanox/mlx5/core/en/reporter_tx.c       |  1 +
 .../ethernet/mellanox/mlx5/core/en/xsk/rx.c   |  3 ++
 .../net/ethernet/mellanox/mlx5/core/en_main.c | 14 +++++--
 .../net/ethernet/mellanox/mlx5/core/en_rx.c   |  3 ++
 5 files changed, 56 insertions(+), 5 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index 70bc878bd2c2..9ee07fa19896 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -388,6 +388,7 @@ enum {
 	MLX5E_SQ_STATE_DIM,
 	MLX5E_SQ_STATE_PENDING_XSK_TX,
 	MLX5E_SQ_STATE_PENDING_TLS_RX_RESYNC,
+	MLX5E_SQ_STATE_LOCK_NEEDED,
 	MLX5E_NUM_SQ_STATES, /* Must be kept last */
 };
 
@@ -751,7 +752,7 @@ struct mlx5e_rq {
 
 enum mlx5e_channel_state {
 	MLX5E_CHANNEL_STATE_XSK,
-	MLX5E_CHANNEL_NUM_STATES
+	MLX5E_CHANNEL_NUM_STATES, /* Must be kept last */
 };
 
 struct mlx5e_channel {
@@ -801,6 +802,43 @@ struct mlx5e_channel {
 	struct dim_cq_moder        tx_cq_moder;
 };
 
+enum mlx5e_lock_type {
+	MLX5E_LOCK_TYPE_NONE,
+	MLX5E_LOCK_TYPE_SOFTIRQ,
+	MLX5E_LOCK_TYPE_BH,
+};
+
+static inline enum mlx5e_lock_type
+mlx5e_icosq_sync_lock(struct mlx5e_icosq *sq)
+{
+	if (!test_bit(MLX5E_SQ_STATE_LOCK_NEEDED, &sq->state))
+		return MLX5E_LOCK_TYPE_NONE;
+
+	if (in_softirq()) {
+		spin_lock(&sq->lock);
+		return MLX5E_LOCK_TYPE_SOFTIRQ;
+	}
+
+	spin_lock_bh(&sq->lock);
+	return MLX5E_LOCK_TYPE_BH;
+}
+
+static inline void mlx5e_icosq_sync_unlock(struct mlx5e_icosq *sq,
+					   enum mlx5e_lock_type lock_type)
+{
+	switch (lock_type) {
+	case MLX5E_LOCK_TYPE_SOFTIRQ:
+		spin_unlock(&sq->lock);
+		break;
+	case MLX5E_LOCK_TYPE_BH:
+		spin_unlock_bh(&sq->lock);
+		break;
+	case MLX5E_LOCK_TYPE_NONE:
+	default:
+		break;
+	}
+}
+
 struct mlx5e_ptp;
 
 struct mlx5e_channels {
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c
index 9e2cf191ed30..4adc1adf9897 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c
@@ -15,6 +15,7 @@ static const char * const sq_sw_state_type_name[] = {
 	[MLX5E_SQ_STATE_DIM] = "dim",
 	[MLX5E_SQ_STATE_PENDING_XSK_TX] = "pending_xsk_tx",
 	[MLX5E_SQ_STATE_PENDING_TLS_RX_RESYNC] = "pending_tls_rx_resync",
+	[MLX5E_SQ_STATE_LOCK_NEEDED] = "lock_needed",
 };
 
 static int mlx5e_wait_for_sq_flush(struct mlx5e_txqsq *sq)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.c
index 2b05536d564a..a96fd7f65485 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.c
@@ -21,6 +21,7 @@ int mlx5e_xsk_alloc_rx_mpwqe(struct mlx5e_rq *rq, u16 ix)
 	struct mlx5e_mpw_info *wi = mlx5e_get_mpw_info(rq, ix);
 	struct mlx5e_icosq *icosq = rq->icosq;
 	struct mlx5_wq_cyc *wq = &icosq->wq;
+	enum mlx5e_lock_type sync_locked;
 	struct mlx5e_umr_wqe *umr_wqe;
 	struct xdp_buff **xsk_buffs;
 	int batch, i;
@@ -47,6 +48,7 @@ int mlx5e_xsk_alloc_rx_mpwqe(struct mlx5e_rq *rq, u16 ix)
 			goto err_reuse_batch;
 	}
 
+	sync_locked = mlx5e_icosq_sync_lock(icosq);
 	pi = mlx5e_icosq_get_next_pi(icosq, rq->mpwqe.umr_wqebbs);
 	umr_wqe = mlx5_wq_cyc_get_wqe(wq, pi);
 	memcpy(umr_wqe, &rq->mpwqe.umr_wqe, sizeof(struct mlx5e_umr_wqe));
@@ -143,6 +145,7 @@ int mlx5e_xsk_alloc_rx_mpwqe(struct mlx5e_rq *rq, u16 ix)
 	};
 
 	icosq->pc += rq->mpwqe.umr_wqebbs;
+	mlx5e_icosq_sync_unlock(icosq, sync_locked);
 
 	icosq->doorbell_cseg = &umr_wqe->hdr.ctrl;
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index 590707dc6f0e..80fb09d902f5 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -2751,11 +2751,17 @@ static int mlx5e_channel_stats_alloc(struct mlx5e_priv *priv, int ix, int cpu)
 
 void mlx5e_trigger_napi_icosq(struct mlx5e_channel *c)
 {
-	struct mlx5e_icosq *async_icosq = &c->async_icosq;
+	enum mlx5e_lock_type locked_type;
 
-	spin_lock_bh(&async_icosq->lock);
-	mlx5e_trigger_irq(async_icosq);
-	spin_unlock_bh(&async_icosq->lock);
+	if (!test_and_set_bit(MLX5E_SQ_STATE_LOCK_NEEDED, &c->icosq.state))
+		synchronize_net();
+
+	locked_type = mlx5e_icosq_sync_lock(&c->icosq);
+	mlx5e_trigger_irq(&c->icosq);
+	if (locked_type != MLX5E_LOCK_TYPE_NONE)
+		mlx5e_icosq_sync_unlock(&c->icosq, locked_type);
+
+	clear_bit(MLX5E_SQ_STATE_LOCK_NEEDED, &c->icosq.state);
 }
 
 void mlx5e_trigger_napi_sched(struct napi_struct *napi)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
index 1f6930c77437..b54844d80922 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
@@ -776,6 +776,7 @@ static int mlx5e_alloc_rx_mpwqe(struct mlx5e_rq *rq, u16 ix)
 	struct mlx5e_icosq *sq = rq->icosq;
 	struct mlx5e_frag_page *frag_page;
 	struct mlx5_wq_cyc *wq = &sq->wq;
+	enum mlx5e_lock_type sync_locked;
 	struct mlx5e_umr_wqe *umr_wqe;
 	u32 offset; /* 17-bit value with MTT. */
 	u16 pi;
@@ -788,6 +789,7 @@ static int mlx5e_alloc_rx_mpwqe(struct mlx5e_rq *rq, u16 ix)
 			goto err;
 	}
 
+	sync_locked = mlx5e_icosq_sync_lock(sq);
 	pi = mlx5e_icosq_get_next_pi(sq, rq->mpwqe.umr_wqebbs);
 	umr_wqe = mlx5_wq_cyc_get_wqe(wq, pi);
 	memcpy(umr_wqe, &rq->mpwqe.umr_wqe, sizeof(struct mlx5e_umr_wqe));
@@ -835,6 +837,7 @@ static int mlx5e_alloc_rx_mpwqe(struct mlx5e_rq *rq, u16 ix)
 	};
 
 	sq->pc += rq->mpwqe.umr_wqebbs;
+	mlx5e_icosq_sync_unlock(sq, sync_locked);
 
 	sq->doorbell_cseg = &umr_wqe->hdr.ctrl;
 
-- 
2.31.1


Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ