[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <20220510055743.118828-3-saeedm@nvidia.com>
Date: Mon, 9 May 2022 22:57:30 -0700
From: Saeed Mahameed <saeedm@...dia.com>
To: "David S. Miller" <davem@...emloft.net>,
Jakub Kicinski <kuba@...nel.org>,
Paolo Abeni <pabeni@...hat.com>
Cc: netdev@...r.kernel.org, Gavin Li <gavinl@...dia.com>,
Moshe Shemesh <moshe@...dia.com>,
Shay Drory <shayd@...dia.com>,
Saeed Mahameed <saeedm@...dia.com>
Subject: [net-next 02/15] net/mlx5: Increase FW pre-init timeout for health recovery
From: Gavin Li <gavinl@...dia.com>
Currently, health recovery will reload driver to recover it from fatal
errors. During the driver's load process, it would wait for FW to set the
pre-init bit for up to 120 seconds, beyond this threshold it would abort
the load process. In some cases, such as a FW upgrade on the DPU, this
timeout period is insufficient, and the user has no way to recover the
host device.
To solve this issue, introduce a new FW pre-init timeout for health
recovery, which is set to 2 hours.
The timeout for devlink reload and probe will use the original one because
they are user triggered flows, and therefore should not have a
significantly long timeout, during which the user command would hang.
Signed-off-by: Gavin Li <gavinl@...dia.com>
Reviewed-by: Moshe Shemesh <moshe@...dia.com>
Reviewed-by: Shay Drory <shayd@...dia.com>
Signed-off-by: Saeed Mahameed <saeedm@...dia.com>
---
.../net/ethernet/mellanox/mlx5/core/devlink.c | 4 ++--
.../ethernet/mellanox/mlx5/core/fw_reset.c | 2 +-
.../ethernet/mellanox/mlx5/core/lib/tout.c | 1 +
.../ethernet/mellanox/mlx5/core/lib/tout.h | 1 +
.../net/ethernet/mellanox/mlx5/core/main.c | 23 +++++++++++--------
.../ethernet/mellanox/mlx5/core/mlx5_core.h | 2 +-
6 files changed, 20 insertions(+), 13 deletions(-)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/devlink.c b/drivers/net/ethernet/mellanox/mlx5/core/devlink.c
index e8789e6d7e7b..f85166e587f2 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/devlink.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/devlink.c
@@ -178,13 +178,13 @@ static int mlx5_devlink_reload_up(struct devlink *devlink, enum devlink_reload_a
*actions_performed = BIT(action);
switch (action) {
case DEVLINK_RELOAD_ACTION_DRIVER_REINIT:
- return mlx5_load_one(dev);
+ return mlx5_load_one(dev, false);
case DEVLINK_RELOAD_ACTION_FW_ACTIVATE:
if (limit == DEVLINK_RELOAD_LIMIT_NO_RESET)
break;
/* On fw_activate action, also driver is reloaded and reinit performed */
*actions_performed |= BIT(DEVLINK_RELOAD_ACTION_DRIVER_REINIT);
- return mlx5_load_one(dev);
+ return mlx5_load_one(dev, false);
default:
/* Unsupported action should not get to this function */
WARN_ON(1);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fw_reset.c b/drivers/net/ethernet/mellanox/mlx5/core/fw_reset.c
index ca1aba845dd6..84df0d56a2b6 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fw_reset.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fw_reset.c
@@ -148,7 +148,7 @@ static void mlx5_fw_reset_complete_reload(struct mlx5_core_dev *dev)
if (test_bit(MLX5_FW_RESET_FLAGS_PENDING_COMP, &fw_reset->reset_flags)) {
complete(&fw_reset->done);
} else {
- mlx5_load_one(dev);
+ mlx5_load_one(dev, false);
devlink_remote_reload_actions_performed(priv_to_devlink(dev), 0,
BIT(DEVLINK_RELOAD_ACTION_DRIVER_REINIT) |
BIT(DEVLINK_RELOAD_ACTION_FW_ACTIVATE));
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/tout.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/tout.c
index c1df0d3595d8..d758848d34d0 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/lib/tout.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/tout.c
@@ -10,6 +10,7 @@ struct mlx5_timeouts {
static const u32 tout_def_sw_val[MAX_TIMEOUT_TYPES] = {
[MLX5_TO_FW_PRE_INIT_TIMEOUT_MS] = 120000,
+ [MLX5_TO_FW_PRE_INIT_ON_RECOVERY_TIMEOUT_MS] = 7200000,
[MLX5_TO_FW_PRE_INIT_WARN_MESSAGE_INTERVAL_MS] = 20000,
[MLX5_TO_FW_PRE_INIT_WAIT_MS] = 2,
[MLX5_TO_FW_INIT_MS] = 2000,
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/tout.h b/drivers/net/ethernet/mellanox/mlx5/core/lib/tout.h
index 1c42ead782fa..257c03eeab36 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/lib/tout.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/tout.h
@@ -7,6 +7,7 @@
enum mlx5_timeouts_types {
/* pre init timeouts (not read from FW) */
MLX5_TO_FW_PRE_INIT_TIMEOUT_MS,
+ MLX5_TO_FW_PRE_INIT_ON_RECOVERY_TIMEOUT_MS,
MLX5_TO_FW_PRE_INIT_WARN_MESSAGE_INTERVAL_MS,
MLX5_TO_FW_PRE_INIT_WAIT_MS,
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c
index f28a3526aafa..84f75aa25214 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c
@@ -1003,7 +1003,7 @@ static void mlx5_cleanup_once(struct mlx5_core_dev *dev)
mlx5_devcom_unregister_device(dev->priv.devcom);
}
-static int mlx5_function_setup(struct mlx5_core_dev *dev, bool boot)
+static int mlx5_function_setup(struct mlx5_core_dev *dev, u64 timeout)
{
int err;
@@ -1018,11 +1018,11 @@ static int mlx5_function_setup(struct mlx5_core_dev *dev, bool boot)
/* wait for firmware to accept initialization segments configurations
*/
- err = wait_fw_init(dev, mlx5_tout_ms(dev, FW_PRE_INIT_TIMEOUT),
+ err = wait_fw_init(dev, timeout,
mlx5_tout_ms(dev, FW_PRE_INIT_WARN_MESSAGE_INTERVAL));
if (err) {
mlx5_core_err(dev, "Firmware over %llu MS in pre-initializing state, aborting\n",
- mlx5_tout_ms(dev, FW_PRE_INIT_TIMEOUT));
+ timeout);
return err;
}
@@ -1272,7 +1272,7 @@ int mlx5_init_one(struct mlx5_core_dev *dev)
mutex_lock(&dev->intf_state_mutex);
dev->state = MLX5_DEVICE_STATE_UP;
- err = mlx5_function_setup(dev, true);
+ err = mlx5_function_setup(dev, mlx5_tout_ms(dev, FW_PRE_INIT_TIMEOUT));
if (err)
goto err_function;
@@ -1336,9 +1336,10 @@ void mlx5_uninit_one(struct mlx5_core_dev *dev)
mutex_unlock(&dev->intf_state_mutex);
}
-int mlx5_load_one(struct mlx5_core_dev *dev)
+int mlx5_load_one(struct mlx5_core_dev *dev, bool recovery)
{
int err = 0;
+ u64 timeout;
mutex_lock(&dev->intf_state_mutex);
if (test_bit(MLX5_INTERFACE_STATE_UP, &dev->intf_state)) {
@@ -1348,7 +1349,11 @@ int mlx5_load_one(struct mlx5_core_dev *dev)
/* remove any previous indication of internal error */
dev->state = MLX5_DEVICE_STATE_UP;
- err = mlx5_function_setup(dev, false);
+ if (recovery)
+ timeout = mlx5_tout_ms(dev, FW_PRE_INIT_ON_RECOVERY_TIMEOUT);
+ else
+ timeout = mlx5_tout_ms(dev, FW_PRE_INIT_TIMEOUT);
+ err = mlx5_function_setup(dev, timeout);
if (err)
goto err_function;
@@ -1719,7 +1724,7 @@ static void mlx5_pci_resume(struct pci_dev *pdev)
mlx5_pci_trace(dev, "Enter, loading driver..\n");
- err = mlx5_load_one(dev);
+ err = mlx5_load_one(dev, false);
mlx5_pci_trace(dev, "Done, err = %d, device %s\n", err,
!err ? "recovered" : "Failed");
@@ -1807,7 +1812,7 @@ static int mlx5_resume(struct pci_dev *pdev)
{
struct mlx5_core_dev *dev = pci_get_drvdata(pdev);
- return mlx5_load_one(dev);
+ return mlx5_load_one(dev, false);
}
static const struct pci_device_id mlx5_core_pci_table[] = {
@@ -1852,7 +1857,7 @@ int mlx5_recover_device(struct mlx5_core_dev *dev)
return -EIO;
}
- return mlx5_load_one(dev);
+ return mlx5_load_one(dev, true);
}
static struct pci_driver mlx5_core_driver = {
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
index a9b2d6ead542..9026be1d6223 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
@@ -290,7 +290,7 @@ void mlx5_mdev_uninit(struct mlx5_core_dev *dev);
int mlx5_init_one(struct mlx5_core_dev *dev);
void mlx5_uninit_one(struct mlx5_core_dev *dev);
void mlx5_unload_one(struct mlx5_core_dev *dev);
-int mlx5_load_one(struct mlx5_core_dev *dev);
+int mlx5_load_one(struct mlx5_core_dev *dev, bool recovery);
int mlx5_vport_get_other_func_cap(struct mlx5_core_dev *dev, u16 function_id, void *out);
--
2.35.1
Powered by blists - more mailing lists