Message-ID: <20251218-vf-bw-lag-mode-v1-2-7d8ed4368bea@nvidia.com>
Date: Thu, 18 Dec 2025 17:58:05 +0200
From: Edward Srouji <edwards@...dia.com>
To: <edwards@...dia.com>, Leon Romanovsky <leon@...nel.org>, Saeed Mahameed
	<saeedm@...dia.com>, Tariq Toukan <tariqt@...dia.com>, Mark Bloch
	<mbloch@...dia.com>, Andrew Lunn <andrew+netdev@...n.ch>, "David S. Miller"
	<davem@...emloft.net>, Eric Dumazet <edumazet@...gle.com>, Jakub Kicinski
	<kuba@...nel.org>, Paolo Abeni <pabeni@...hat.com>, Jason Gunthorpe
	<jgg@...pe.ca>
CC: <netdev@...r.kernel.org>, <linux-rdma@...r.kernel.org>,
	<linux-kernel@...r.kernel.org>, Or Har-Toov <ohartoov@...dia.com>, "Maher
 Sanalla" <msanalla@...dia.com>
Subject: [PATCH mlx5-next 02/10] net/mlx5: Propagate LAG effective max_tx_speed to vports

From: Or Har-Toov <ohartoov@...dia.com>

Currently, vports report only their parent's uplink speed, which in LAG
setups does not reflect the true aggregated bandwidth. This deprives
upper-layer software of the accurate bandwidth information it needs to
make good load balancing decisions.

Fix this by calculating the effective maximum speed of a LAG as the sum
of the speeds of all active uplinks that are part of the LAG. Propagate
this effective max speed to the vports associated with the LAG whenever
a relevant event occurs, such as a physical port link state change or
LAG creation/modification.

With this change, upper-layer components receive accurate bandwidth
information corresponding to the active members of the LAG and can
make better load balancing decisions.
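
For illustration only (not part of this patch), below is a minimal,
stand-alone sketch of the aggregation and unit conversion described
above, assuming a hypothetical LAG with two active 100 Gb/s members.
Per the diff, the Mbps value is divided by MLX5_MAX_TX_SPEED_UNIT
(100) before being programmed into max_tx_speed:

	/* Hypothetical user-space sketch; the real logic lives in lag.c. */
	#include <stdio.h>

	#define MLX5_MAX_TX_SPEED_UNIT 100	/* max_tx_speed unit: 100 Mbps */

	int main(void)
	{
		/* Assumed example: two active 100 Gb/s LAG members, in Mbps */
		const unsigned int member_speed_mbps[] = { 100000, 100000 };
		unsigned int agg_mbps = 0;
		unsigned int i;

		for (i = 0; i < 2; i++)
			agg_mbps += member_speed_mbps[i];

		/* 200000 Mbps -> max_tx_speed of 2000 (100 Mbps units) */
		printf("max_tx_speed = %u\n", agg_mbps / MLX5_MAX_TX_SPEED_UNIT);
		return 0;
	}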

Signed-off-by: Or Har-Toov <ohartoov@...dia.com>
Reviewed-by: Maher Sanalla <msanalla@...dia.com>
Reviewed-by: Mark Bloch <mbloch@...dia.com>
Signed-off-by: Edward Srouji <edwards@...dia.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c  | 158 +++++++++++++++++++++
 drivers/net/ethernet/mellanox/mlx5/core/lag/lag.h  |   9 ++
 .../net/ethernet/mellanox/mlx5/core/mlx5_core.h    |   1 +
 drivers/net/ethernet/mellanox/mlx5/core/port.c     |  24 ++++
 drivers/net/ethernet/mellanox/mlx5/core/vport.c    |  45 ++++++
 include/linux/mlx5/vport.h                         |   4 +
 6 files changed, 241 insertions(+)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c
index 1ac933cd8f02..a042612dcde6 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c
@@ -996,6 +996,126 @@ static bool mlx5_lag_should_disable_lag(struct mlx5_lag *ldev, bool do_bond)
 	       ldev->mode != MLX5_LAG_MODE_MPESW;
 }
 
+#ifdef CONFIG_MLX5_ESWITCH
+static int
+mlx5_lag_sum_devices_speed(struct mlx5_lag *ldev, u32 *sum_speed,
+			   int (*get_speed)(struct mlx5_core_dev *, u32 *))
+{
+	struct mlx5_core_dev *pf_mdev;
+	int pf_idx;
+	u32 speed;
+	int ret;
+
+	*sum_speed = 0;
+	mlx5_ldev_for_each(pf_idx, 0, ldev) {
+		pf_mdev = ldev->pf[pf_idx].dev;
+		if (!pf_mdev)
+			continue;
+
+		ret = get_speed(pf_mdev, &speed);
+		if (ret) {
+			mlx5_core_dbg(pf_mdev,
+				      "Failed to get device speed using %ps. Device %s speed is not available (err=%d)\n",
+				      get_speed, dev_name(pf_mdev->device),
+				      ret);
+			return ret;
+		}
+
+		*sum_speed += speed;
+	}
+
+	return 0;
+}
+
+static int mlx5_lag_sum_devices_max_speed(struct mlx5_lag *ldev, u32 *max_speed)
+{
+	return mlx5_lag_sum_devices_speed(ldev, max_speed,
+					  mlx5_port_max_linkspeed);
+}
+
+static void mlx5_lag_modify_device_vports_speed(struct mlx5_core_dev *mdev,
+						u32 speed)
+{
+	u16 op_mod = MLX5_VPORT_STATE_OP_MOD_ESW_VPORT;
+	struct mlx5_eswitch *esw = mdev->priv.eswitch;
+	struct mlx5_vport *vport;
+	unsigned long i;
+	int ret;
+
+	if (!esw)
+		return;
+
+	if (!MLX5_CAP_ESW(mdev, esw_vport_state_max_tx_speed))
+		return;
+
+	mlx5_esw_for_each_vport(esw, i, vport) {
+		if (!vport)
+			continue;
+
+		if (vport->vport == MLX5_VPORT_UPLINK)
+			continue;
+
+		ret = mlx5_modify_vport_max_tx_speed(mdev, op_mod,
+						     vport->vport, true, speed);
+		if (ret)
+			mlx5_core_dbg(mdev,
+				      "Failed to set vport %d speed %d, err=%d\n",
+				      vport->vport, speed, ret);
+	}
+}
+
+void mlx5_lag_set_vports_agg_speed(struct mlx5_lag *ldev)
+{
+	struct mlx5_core_dev *mdev;
+	u32 speed;
+	int pf_idx;
+
+	speed = ldev->tracker.bond_speed_mbps;
+
+	if (speed == SPEED_UNKNOWN)
+		return;
+
+	/* If speed is not set, use the sum of max speeds of all PFs */
+	if (!speed && mlx5_lag_sum_devices_max_speed(ldev, &speed))
+		return;
+
+	speed = speed / MLX5_MAX_TX_SPEED_UNIT;
+
+	mlx5_ldev_for_each(pf_idx, 0, ldev) {
+		mdev = ldev->pf[pf_idx].dev;
+		if (!mdev)
+			continue;
+
+		mlx5_lag_modify_device_vports_speed(mdev, speed);
+	}
+}
+
+void mlx5_lag_reset_vports_speed(struct mlx5_lag *ldev)
+{
+	struct mlx5_core_dev *mdev;
+	u32 speed;
+	int pf_idx;
+	int ret;
+
+	mlx5_ldev_for_each(pf_idx, 0, ldev) {
+		mdev = ldev->pf[pf_idx].dev;
+		if (!mdev)
+			continue;
+
+		ret = mlx5_port_oper_linkspeed(mdev, &speed);
+		if (ret) {
+			mlx5_core_dbg(mdev,
+				      "Failed to reset vports speed for device %s. Oper speed is not available (err=%d)\n",
+				      dev_name(mdev->device), ret);
+			continue;
+		}
+
+		speed = speed / MLX5_MAX_TX_SPEED_UNIT;
+		mlx5_lag_modify_device_vports_speed(mdev, speed);
+	}
+}
+#endif
+
 static void mlx5_do_bond(struct mlx5_lag *ldev)
 {
 	int idx = mlx5_lag_get_dev_index_by_seq(ldev, MLX5_LAG_P1);
@@ -1083,9 +1203,12 @@ static void mlx5_do_bond(struct mlx5_lag *ldev)
 						     ndev);
 			dev_put(ndev);
 		}
+		mlx5_lag_set_vports_agg_speed(ldev);
 	} else if (mlx5_lag_should_modify_lag(ldev, do_bond)) {
 		mlx5_modify_lag(ldev, &tracker);
+		mlx5_lag_set_vports_agg_speed(ldev);
 	} else if (mlx5_lag_should_disable_lag(ldev, do_bond)) {
+		mlx5_lag_reset_vports_speed(ldev);
 		mlx5_disable_lag(ldev);
 	}
 }
@@ -1286,6 +1409,38 @@ static int mlx5_handle_changeinfodata_event(struct mlx5_lag *ldev,
 	return 1;
 }
 
+static void mlx5_lag_update_tracker_speed(struct lag_tracker *tracker,
+					  struct net_device *ndev)
+{
+	struct ethtool_link_ksettings lksettings;
+	struct net_device *bond_dev;
+	int err;
+
+	if (netif_is_lag_master(ndev))
+		bond_dev = ndev;
+	else
+		bond_dev = netdev_master_upper_dev_get(ndev);
+
+	if (!bond_dev) {
+		tracker->bond_speed_mbps = SPEED_UNKNOWN;
+		return;
+	}
+
+	err = __ethtool_get_link_ksettings(bond_dev, &lksettings);
+	if (err) {
+		netdev_dbg(bond_dev,
+			   "Failed to get speed for bond dev %s, err=%d\n",
+			   bond_dev->name, err);
+		tracker->bond_speed_mbps = SPEED_UNKNOWN;
+		return;
+	}
+
+	if (lksettings.base.speed == SPEED_UNKNOWN)
+		tracker->bond_speed_mbps = 0;
+	else
+		tracker->bond_speed_mbps = lksettings.base.speed;
+}
+
 /* this handler is always registered to netdev events */
 static int mlx5_lag_netdev_event(struct notifier_block *this,
 				 unsigned long event, void *ptr)
@@ -1317,6 +1472,9 @@ static int mlx5_lag_netdev_event(struct notifier_block *this,
 		break;
 	}
 
+	if (changed)
+		mlx5_lag_update_tracker_speed(&tracker, ndev);
+
 	ldev->tracker = tracker;
 
 	if (changed)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.h b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.h
index 4918eee2b3da..8de5640a0161 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.h
@@ -48,6 +48,7 @@ struct lag_tracker {
 	unsigned int is_bonded:1;
 	unsigned int has_inactive:1;
 	enum netdev_lag_hash hash_type;
+	u32 bond_speed_mbps;
 };
 
 /* LAG data of a ConnectX card.
@@ -116,6 +117,14 @@ int mlx5_deactivate_lag(struct mlx5_lag *ldev);
 void mlx5_lag_add_devices(struct mlx5_lag *ldev);
 struct mlx5_devcom_comp_dev *mlx5_lag_get_devcom_comp(struct mlx5_lag *ldev);
 
+#ifdef CONFIG_MLX5_ESWITCH
+void mlx5_lag_set_vports_agg_speed(struct mlx5_lag *ldev);
+void mlx5_lag_reset_vports_speed(struct mlx5_lag *ldev);
+#else
+static inline void mlx5_lag_set_vports_agg_speed(struct mlx5_lag *ldev) {}
+static inline void mlx5_lag_reset_vports_speed(struct mlx5_lag *ldev) {}
+#endif
+
 static inline bool mlx5_lag_is_supported(struct mlx5_core_dev *dev)
 {
 	if (!MLX5_CAP_GEN(dev, vport_group_manager) ||
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
index cfebc110c02f..9fdb9a543cf1 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
@@ -381,6 +381,7 @@ const struct mlx5_link_info *mlx5_port_ptys2info(struct mlx5_core_dev *mdev,
 u32 mlx5_port_info2linkmodes(struct mlx5_core_dev *mdev,
 			     struct mlx5_link_info *info,
 			     bool force_legacy);
+int mlx5_port_oper_linkspeed(struct mlx5_core_dev *mdev, u32 *speed);
 int mlx5_port_max_linkspeed(struct mlx5_core_dev *mdev, u32 *speed);
 
 #define MLX5_PPS_CAP(mdev) (MLX5_CAP_GEN((mdev), pps) &&		\
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/port.c b/drivers/net/ethernet/mellanox/mlx5/core/port.c
index 85a9e534f442..83044c9b6b41 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/port.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/port.c
@@ -1200,6 +1200,30 @@ u32 mlx5_port_info2linkmodes(struct mlx5_core_dev *mdev,
 	return link_modes;
 }
 
+int mlx5_port_oper_linkspeed(struct mlx5_core_dev *mdev, u32 *speed)
+{
+	const struct mlx5_link_info *table;
+	struct mlx5_port_eth_proto eproto;
+	u32 oper_speed = 0;
+	u32 max_size;
+	bool ext;
+	int err;
+	int i;
+
+	ext = mlx5_ptys_ext_supported(mdev);
+	err = mlx5_port_query_eth_proto(mdev, 1, ext, &eproto);
+	if (err)
+		return err;
+
+	mlx5e_port_get_link_mode_info_arr(mdev, &table, &max_size, false);
+	for (i = 0; i < max_size; ++i)
+		if (eproto.oper & MLX5E_PROT_MASK(i))
+			oper_speed = max(oper_speed, table[i].speed);
+
+	*speed = oper_speed;
+	return 0;
+}
+
 int mlx5_port_max_linkspeed(struct mlx5_core_dev *mdev, u32 *speed)
 {
 	const struct mlx5_link_info *table;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/vport.c b/drivers/net/ethernet/mellanox/mlx5/core/vport.c
index 306affbcfd3b..78b1b291cfa4 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/vport.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/vport.c
@@ -62,6 +62,28 @@ u8 mlx5_query_vport_state(struct mlx5_core_dev *mdev, u8 opmod, u16 vport)
 	return MLX5_GET(query_vport_state_out, out, state);
 }
 
+static int mlx5_query_vport_admin_state(struct mlx5_core_dev *mdev, u8 opmod,
+					u16 vport, u8 other_vport,
+					u8 *admin_state)
+{
+	u32 out[MLX5_ST_SZ_DW(query_vport_state_out)] = {};
+	u32 in[MLX5_ST_SZ_DW(query_vport_state_in)] = {};
+	int err;
+
+	MLX5_SET(query_vport_state_in, in, opcode,
+		 MLX5_CMD_OP_QUERY_VPORT_STATE);
+	MLX5_SET(query_vport_state_in, in, op_mod, opmod);
+	MLX5_SET(query_vport_state_in, in, vport_number, vport);
+	MLX5_SET(query_vport_state_in, in, other_vport, other_vport);
+
+	err = mlx5_cmd_exec_inout(mdev, query_vport_state, in, out);
+	if (err)
+		return err;
+
+	*admin_state = MLX5_GET(query_vport_state_out, out, admin_state);
+	return 0;
+}
+
 int mlx5_modify_vport_admin_state(struct mlx5_core_dev *mdev, u8 opmod,
 				  u16 vport, u8 other_vport, u8 state)
 {
@@ -77,6 +99,29 @@ int mlx5_modify_vport_admin_state(struct mlx5_core_dev *mdev, u8 opmod,
 	return mlx5_cmd_exec_in(mdev, modify_vport_state, in);
 }
 
+int mlx5_modify_vport_max_tx_speed(struct mlx5_core_dev *mdev, u8 opmod,
+				   u16 vport, u8 other_vport, u16 max_tx_speed)
+{
+	u32 in[MLX5_ST_SZ_DW(modify_vport_state_in)] = {};
+	u8 admin_state;
+	int err;
+
+	err = mlx5_query_vport_admin_state(mdev, opmod, vport, other_vport,
+					   &admin_state);
+	if (err)
+		return err;
+
+	MLX5_SET(modify_vport_state_in, in, opcode,
+		 MLX5_CMD_OP_MODIFY_VPORT_STATE);
+	MLX5_SET(modify_vport_state_in, in, op_mod, opmod);
+	MLX5_SET(modify_vport_state_in, in, vport_number, vport);
+	MLX5_SET(modify_vport_state_in, in, other_vport, other_vport);
+	MLX5_SET(modify_vport_state_in, in, admin_state, admin_state);
+	MLX5_SET(modify_vport_state_in, in, max_tx_speed, max_tx_speed);
+
+	return mlx5_cmd_exec_in(mdev, modify_vport_state, in);
+}
+
 static int mlx5_query_nic_vport_context(struct mlx5_core_dev *mdev, u16 vport,
 					bool other_vport, u32 *out)
 {
diff --git a/include/linux/mlx5/vport.h b/include/linux/mlx5/vport.h
index f876bfc0669c..2acf10e9f60a 100644
--- a/include/linux/mlx5/vport.h
+++ b/include/linux/mlx5/vport.h
@@ -41,6 +41,8 @@
 	 (MLX5_CAP_GEN(mdev, port_type) == MLX5_CAP_PORT_TYPE_ETH) &&	\
 	 mlx5_core_is_pf(mdev))
 
+#define MLX5_MAX_TX_SPEED_UNIT 100
+
 enum {
 	MLX5_CAP_INLINE_MODE_L2,
 	MLX5_CAP_INLINE_MODE_VPORT_CONTEXT,
@@ -58,6 +60,8 @@ enum {
 u8 mlx5_query_vport_state(struct mlx5_core_dev *mdev, u8 opmod, u16 vport);
 int mlx5_modify_vport_admin_state(struct mlx5_core_dev *mdev, u8 opmod,
 				  u16 vport, u8 other_vport, u8 state);
+int mlx5_modify_vport_max_tx_speed(struct mlx5_core_dev *mdev, u8 opmod,
+				   u16 vport, u8 other_vport, u16 max_tx_speed);
 int mlx5_query_nic_vport_mac_address(struct mlx5_core_dev *mdev,
 				     u16 vport, bool other, u8 *addr);
 int mlx5_query_mac_address(struct mlx5_core_dev *mdev, u8 *addr);

-- 
2.47.1

