Message-ID: <20251218-vf-bw-lag-mode-v1-2-7d8ed4368bea@nvidia.com>
Date: Thu, 18 Dec 2025 17:58:05 +0200
From: Edward Srouji <edwards@...dia.com>
To: <edwards@...dia.com>, Leon Romanovsky <leon@...nel.org>, Saeed Mahameed
<saeedm@...dia.com>, Tariq Toukan <tariqt@...dia.com>, Mark Bloch
<mbloch@...dia.com>, Andrew Lunn <andrew+netdev@...n.ch>, "David S. Miller"
<davem@...emloft.net>, Eric Dumazet <edumazet@...gle.com>, Jakub Kicinski
<kuba@...nel.org>, Paolo Abeni <pabeni@...hat.com>, Jason Gunthorpe
<jgg@...pe.ca>
CC: <netdev@...r.kernel.org>, <linux-rdma@...r.kernel.org>,
<linux-kernel@...r.kernel.org>, Or Har-Toov <ohartoov@...dia.com>, "Maher
Sanalla" <msanalla@...dia.com>
Subject: [PATCH mlx5-next 02/10] net/mlx5: Propagate LAG effective max_tx_speed to vports
From: Or Har-Toov <ohartoov@...dia.com>
Currently, vports report only their parent's uplink speed, which in LAG
setups does not reflect the true aggregated bandwidth. This prevents
upper-layer software from making load-balancing decisions based on
accurate bandwidth information.

Fix this by calculating the maximum possible speed of a LAG as the sum
of the speeds of all active uplinks in the LAG. Propagate this
effective max speed to the vports associated with the LAG whenever a
relevant event occurs, such as a physical port link state change or
LAG creation/modification.

With this change, upper-layer components receive accurate bandwidth
information corresponding to the active members of the LAG and can
make better load-balancing decisions.

Signed-off-by: Or Har-Toov <ohartoov@...dia.com>
Reviewed-by: Maher Sanalla <msanalla@...dia.com>
Reviewed-by: Mark Bloch <mbloch@...dia.com>
Signed-off-by: Edward Srouji <edwards@...dia.com>
---
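Reviewer note (not part of the patch): below is a minimal, self-contained
C sketch of the speed arithmetic described above. The uplink struct and
the lag_effective_max_tx_speed() helper are hypothetical illustrations,
not driver code; the sketch only shows that the speeds of active uplinks
(in Mbps) are summed and then converted to the 100 Mbps units expected by
the vport max_tx_speed field, mirroring MLX5_MAX_TX_SPEED_UNIT.

	#include <stdio.h>
	#include <stdbool.h>

	#define SPEED_UNIT_MBPS 100	/* mirrors MLX5_MAX_TX_SPEED_UNIT */

	struct uplink {
		unsigned int speed_mbps;	/* link speed in Mbps */
		bool active;			/* counted only if active */
	};

	/* Sum active uplink speeds and convert to 100 Mbps units. */
	static unsigned int lag_effective_max_tx_speed(const struct uplink *ports,
						       int nports)
	{
		unsigned int sum_mbps = 0;
		int i;

		for (i = 0; i < nports; i++)
			if (ports[i].active)
				sum_mbps += ports[i].speed_mbps;

		return sum_mbps / SPEED_UNIT_MBPS;
	}

	int main(void)
	{
		/* Two active 100G uplinks -> 200000 Mbps -> 2000 units */
		struct uplink lag[] = {
			{ .speed_mbps = 100000, .active = true },
			{ .speed_mbps = 100000, .active = true },
		};

		printf("max_tx_speed = %u (x%d Mbps)\n",
		       lag_effective_max_tx_speed(lag, 2), SPEED_UNIT_MBPS);
		return 0;
	}

In the patch itself the same conversion is what the division by
MLX5_MAX_TX_SPEED_UNIT performs before the value is programmed via
MODIFY_VPORT_STATE.
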
drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c | 158 +++++++++++++++++++++
drivers/net/ethernet/mellanox/mlx5/core/lag/lag.h | 9 ++
.../net/ethernet/mellanox/mlx5/core/mlx5_core.h | 1 +
drivers/net/ethernet/mellanox/mlx5/core/port.c | 24 ++++
drivers/net/ethernet/mellanox/mlx5/core/vport.c | 45 ++++++
include/linux/mlx5/vport.h | 4 +
6 files changed, 241 insertions(+)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c
index 1ac933cd8f02..a042612dcde6 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c
@@ -996,6 +996,126 @@ static bool mlx5_lag_should_disable_lag(struct mlx5_lag *ldev, bool do_bond)
ldev->mode != MLX5_LAG_MODE_MPESW;
}
+#ifdef CONFIG_MLX5_ESWITCH
+static int
+mlx5_lag_sum_devices_speed(struct mlx5_lag *ldev, u32 *sum_speed,
+ int (*get_speed)(struct mlx5_core_dev *, u32 *))
+{
+ struct mlx5_core_dev *pf_mdev;
+ int pf_idx;
+ u32 speed;
+ int ret;
+
+ *sum_speed = 0;
+ mlx5_ldev_for_each(pf_idx, 0, ldev) {
+ pf_mdev = ldev->pf[pf_idx].dev;
+ if (!pf_mdev)
+ continue;
+
+ ret = get_speed(pf_mdev, &speed);
+ if (ret) {
+ mlx5_core_dbg(pf_mdev,
+ "Failed to get device speed using %ps. Device %s speed is not available (err=%d)\n",
+ get_speed, dev_name(pf_mdev->device),
+ ret);
+ return ret;
+ }
+
+ *sum_speed += speed;
+ }
+
+ return 0;
+}
+
+static int mlx5_lag_sum_devices_max_speed(struct mlx5_lag *ldev, u32 *max_speed)
+{
+ return mlx5_lag_sum_devices_speed(ldev, max_speed,
+ mlx5_port_max_linkspeed);
+}
+
+static void mlx5_lag_modify_device_vports_speed(struct mlx5_core_dev *mdev,
+ u32 speed)
+{
+ u16 op_mod = MLX5_VPORT_STATE_OP_MOD_ESW_VPORT;
+ struct mlx5_eswitch *esw = mdev->priv.eswitch;
+ struct mlx5_vport *vport;
+ unsigned long i;
+ int ret;
+
+ if (!esw)
+ return;
+
+ if (!MLX5_CAP_ESW(mdev, esw_vport_state_max_tx_speed))
+ return;
+
+ mlx5_esw_for_each_vport(esw, i, vport) {
+ if (!vport)
+ continue;
+
+ if (vport->vport == MLX5_VPORT_UPLINK)
+ continue;
+
+ ret = mlx5_modify_vport_max_tx_speed(mdev, op_mod,
+ vport->vport, true, speed);
+ if (ret)
+ mlx5_core_dbg(mdev,
+ "Failed to set vport %d speed %d, err=%d\n",
+ vport->vport, speed, ret);
+ }
+}
+
+void mlx5_lag_set_vports_agg_speed(struct mlx5_lag *ldev)
+{
+ struct mlx5_core_dev *mdev;
+ u32 speed;
+ int pf_idx;
+
+ speed = ldev->tracker.bond_speed_mbps;
+
+ if (speed == SPEED_UNKNOWN)
+ return;
+
+ /* If speed is not set, use the sum of max speeds of all PFs */
+ if (!speed && mlx5_lag_sum_devices_max_speed(ldev, &speed))
+ return;
+
+ speed = speed / MLX5_MAX_TX_SPEED_UNIT;
+
+ mlx5_ldev_for_each(pf_idx, 0, ldev) {
+ mdev = ldev->pf[pf_idx].dev;
+ if (!mdev)
+ continue;
+
+ mlx5_lag_modify_device_vports_speed(mdev, speed);
+ }
+}
+
+void mlx5_lag_reset_vports_speed(struct mlx5_lag *ldev)
+{
+ struct mlx5_core_dev *mdev;
+ u32 speed;
+ int pf_idx;
+ int ret;
+
+ mlx5_ldev_for_each(pf_idx, 0, ldev) {
+ mdev = ldev->pf[pf_idx].dev;
+ if (!mdev)
+ continue;
+
+ ret = mlx5_port_oper_linkspeed(mdev, &speed);
+ if (ret) {
+ mlx5_core_dbg(mdev,
+ "Failed to reset vports speed for device %s. Oper speed is not available (err=%d)\n",
+ dev_name(mdev->device), ret);
+ continue;
+ }
+
+ speed = speed / MLX5_MAX_TX_SPEED_UNIT;
+ mlx5_lag_modify_device_vports_speed(mdev, speed);
+ }
+}
+#endif
+
static void mlx5_do_bond(struct mlx5_lag *ldev)
{
int idx = mlx5_lag_get_dev_index_by_seq(ldev, MLX5_LAG_P1);
@@ -1083,9 +1203,12 @@ static void mlx5_do_bond(struct mlx5_lag *ldev)
ndev);
dev_put(ndev);
}
+ mlx5_lag_set_vports_agg_speed(ldev);
} else if (mlx5_lag_should_modify_lag(ldev, do_bond)) {
mlx5_modify_lag(ldev, &tracker);
+ mlx5_lag_set_vports_agg_speed(ldev);
} else if (mlx5_lag_should_disable_lag(ldev, do_bond)) {
+ mlx5_lag_reset_vports_speed(ldev);
mlx5_disable_lag(ldev);
}
}
@@ -1286,6 +1409,38 @@ static int mlx5_handle_changeinfodata_event(struct mlx5_lag *ldev,
return 1;
}
+static void mlx5_lag_update_tracker_speed(struct lag_tracker *tracker,
+ struct net_device *ndev)
+{
+ struct ethtool_link_ksettings lksettings;
+ struct net_device *bond_dev;
+ int err;
+
+ if (netif_is_lag_master(ndev))
+ bond_dev = ndev;
+ else
+ bond_dev = netdev_master_upper_dev_get(ndev);
+
+ if (!bond_dev) {
+ tracker->bond_speed_mbps = SPEED_UNKNOWN;
+ return;
+ }
+
+ err = __ethtool_get_link_ksettings(bond_dev, &lksettings);
+ if (err) {
+ netdev_dbg(bond_dev,
+ "Failed to get speed for bond dev %s, err=%d\n",
+ bond_dev->name, err);
+ tracker->bond_speed_mbps = SPEED_UNKNOWN;
+ return;
+ }
+
+ if (lksettings.base.speed == SPEED_UNKNOWN)
+ tracker->bond_speed_mbps = 0;
+ else
+ tracker->bond_speed_mbps = lksettings.base.speed;
+}
+
/* this handler is always registered to netdev events */
static int mlx5_lag_netdev_event(struct notifier_block *this,
unsigned long event, void *ptr)
@@ -1317,6 +1472,9 @@ static int mlx5_lag_netdev_event(struct notifier_block *this,
break;
}
+ if (changed)
+ mlx5_lag_update_tracker_speed(&tracker, ndev);
+
ldev->tracker = tracker;
if (changed)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.h b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.h
index 4918eee2b3da..8de5640a0161 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.h
@@ -48,6 +48,7 @@ struct lag_tracker {
unsigned int is_bonded:1;
unsigned int has_inactive:1;
enum netdev_lag_hash hash_type;
+ u32 bond_speed_mbps;
};
/* LAG data of a ConnectX card.
@@ -116,6 +117,14 @@ int mlx5_deactivate_lag(struct mlx5_lag *ldev);
void mlx5_lag_add_devices(struct mlx5_lag *ldev);
struct mlx5_devcom_comp_dev *mlx5_lag_get_devcom_comp(struct mlx5_lag *ldev);
+#ifdef CONFIG_MLX5_ESWITCH
+void mlx5_lag_set_vports_agg_speed(struct mlx5_lag *ldev);
+void mlx5_lag_reset_vports_speed(struct mlx5_lag *ldev);
+#else
+static inline void mlx5_lag_set_vports_agg_speed(struct mlx5_lag *ldev) {}
+static inline void mlx5_lag_reset_vports_speed(struct mlx5_lag *ldev) {}
+#endif
+
static inline bool mlx5_lag_is_supported(struct mlx5_core_dev *dev)
{
if (!MLX5_CAP_GEN(dev, vport_group_manager) ||
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
index cfebc110c02f..9fdb9a543cf1 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
@@ -381,6 +381,7 @@ const struct mlx5_link_info *mlx5_port_ptys2info(struct mlx5_core_dev *mdev,
u32 mlx5_port_info2linkmodes(struct mlx5_core_dev *mdev,
struct mlx5_link_info *info,
bool force_legacy);
+int mlx5_port_oper_linkspeed(struct mlx5_core_dev *mdev, u32 *speed);
int mlx5_port_max_linkspeed(struct mlx5_core_dev *mdev, u32 *speed);
#define MLX5_PPS_CAP(mdev) (MLX5_CAP_GEN((mdev), pps) && \
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/port.c b/drivers/net/ethernet/mellanox/mlx5/core/port.c
index 85a9e534f442..83044c9b6b41 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/port.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/port.c
@@ -1200,6 +1200,30 @@ u32 mlx5_port_info2linkmodes(struct mlx5_core_dev *mdev,
return link_modes;
}
+int mlx5_port_oper_linkspeed(struct mlx5_core_dev *mdev, u32 *speed)
+{
+ const struct mlx5_link_info *table;
+ struct mlx5_port_eth_proto eproto;
+ u32 oper_speed = 0;
+ u32 max_size;
+ bool ext;
+ int err;
+ int i;
+
+ ext = mlx5_ptys_ext_supported(mdev);
+ err = mlx5_port_query_eth_proto(mdev, 1, ext, &eproto);
+ if (err)
+ return err;
+
+ mlx5e_port_get_link_mode_info_arr(mdev, &table, &max_size, false);
+ for (i = 0; i < max_size; ++i)
+ if (eproto.oper & MLX5E_PROT_MASK(i))
+ oper_speed = max(oper_speed, table[i].speed);
+
+ *speed = oper_speed;
+ return 0;
+}
+
int mlx5_port_max_linkspeed(struct mlx5_core_dev *mdev, u32 *speed)
{
const struct mlx5_link_info *table;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/vport.c b/drivers/net/ethernet/mellanox/mlx5/core/vport.c
index 306affbcfd3b..78b1b291cfa4 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/vport.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/vport.c
@@ -62,6 +62,28 @@ u8 mlx5_query_vport_state(struct mlx5_core_dev *mdev, u8 opmod, u16 vport)
return MLX5_GET(query_vport_state_out, out, state);
}
+static int mlx5_query_vport_admin_state(struct mlx5_core_dev *mdev, u8 opmod,
+ u16 vport, u8 other_vport,
+ u8 *admin_state)
+{
+ u32 out[MLX5_ST_SZ_DW(query_vport_state_out)] = {};
+ u32 in[MLX5_ST_SZ_DW(query_vport_state_in)] = {};
+ int err;
+
+ MLX5_SET(query_vport_state_in, in, opcode,
+ MLX5_CMD_OP_QUERY_VPORT_STATE);
+ MLX5_SET(query_vport_state_in, in, op_mod, opmod);
+ MLX5_SET(query_vport_state_in, in, vport_number, vport);
+ MLX5_SET(query_vport_state_in, in, other_vport, other_vport);
+
+ err = mlx5_cmd_exec_inout(mdev, query_vport_state, in, out);
+ if (err)
+ return err;
+
+ *admin_state = MLX5_GET(query_vport_state_out, out, admin_state);
+ return 0;
+}
+
int mlx5_modify_vport_admin_state(struct mlx5_core_dev *mdev, u8 opmod,
u16 vport, u8 other_vport, u8 state)
{
@@ -77,6 +99,29 @@ int mlx5_modify_vport_admin_state(struct mlx5_core_dev *mdev, u8 opmod,
return mlx5_cmd_exec_in(mdev, modify_vport_state, in);
}
+int mlx5_modify_vport_max_tx_speed(struct mlx5_core_dev *mdev, u8 opmod,
+ u16 vport, u8 other_vport, u16 max_tx_speed)
+{
+ u32 in[MLX5_ST_SZ_DW(modify_vport_state_in)] = {};
+ u8 admin_state;
+ int err;
+
+ err = mlx5_query_vport_admin_state(mdev, opmod, vport, other_vport,
+ &admin_state);
+ if (err)
+ return err;
+
+ MLX5_SET(modify_vport_state_in, in, opcode,
+ MLX5_CMD_OP_MODIFY_VPORT_STATE);
+ MLX5_SET(modify_vport_state_in, in, op_mod, opmod);
+ MLX5_SET(modify_vport_state_in, in, vport_number, vport);
+ MLX5_SET(modify_vport_state_in, in, other_vport, other_vport);
+ MLX5_SET(modify_vport_state_in, in, admin_state, admin_state);
+ MLX5_SET(modify_vport_state_in, in, max_tx_speed, max_tx_speed);
+
+ return mlx5_cmd_exec_in(mdev, modify_vport_state, in);
+}
+
static int mlx5_query_nic_vport_context(struct mlx5_core_dev *mdev, u16 vport,
bool other_vport, u32 *out)
{
diff --git a/include/linux/mlx5/vport.h b/include/linux/mlx5/vport.h
index f876bfc0669c..2acf10e9f60a 100644
--- a/include/linux/mlx5/vport.h
+++ b/include/linux/mlx5/vport.h
@@ -41,6 +41,8 @@
(MLX5_CAP_GEN(mdev, port_type) == MLX5_CAP_PORT_TYPE_ETH) && \
mlx5_core_is_pf(mdev))
+#define MLX5_MAX_TX_SPEED_UNIT 100
+
enum {
MLX5_CAP_INLINE_MODE_L2,
MLX5_CAP_INLINE_MODE_VPORT_CONTEXT,
@@ -58,6 +60,8 @@ enum {
u8 mlx5_query_vport_state(struct mlx5_core_dev *mdev, u8 opmod, u16 vport);
int mlx5_modify_vport_admin_state(struct mlx5_core_dev *mdev, u8 opmod,
u16 vport, u8 other_vport, u8 state);
+int mlx5_modify_vport_max_tx_speed(struct mlx5_core_dev *mdev, u8 opmod,
+ u16 vport, u8 other_vport, u16 max_tx_speed);
int mlx5_query_nic_vport_mac_address(struct mlx5_core_dev *mdev,
u16 vport, bool other, u8 *addr);
int mlx5_query_mac_address(struct mlx5_core_dev *mdev, u8 *addr);
--
2.47.1