[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <20220530133133.1931716-75-sashal@kernel.org>
Date:   Mon, 30 May 2022 09:30:33 -0400
From:   Sasha Levin <sashal@...nel.org>
To:     linux-kernel@...r.kernel.org, stable@...r.kernel.org
Cc:     Gavin Li <gavinl@...dia.com>, Moshe Shemesh <moshe@...dia.com>,
        Shay Drory <shayd@...dia.com>,
        Saeed Mahameed <saeedm@...dia.com>,
        Sasha Levin <sashal@...nel.org>, davem@...emloft.net,
        edumazet@...gle.com, kuba@...nel.org, pabeni@...hat.com,
        amirtz@...dia.com, netdev@...r.kernel.org,
        linux-rdma@...r.kernel.org
Subject: [PATCH AUTOSEL 5.17 075/135] net/mlx5: Increase FW pre-init timeout for health recovery
From: Gavin Li <gavinl@...dia.com>
[ Upstream commit 37ca95e62ee23fa6d2c2c64e3dc40b4a0c0146dc ]
Currently, health recovery will reload driver to recover it from fatal
errors. During the driver's load process, it would wait for FW to set the
pre-init bit for up to 120 seconds, beyond this threshold it would abort
the load process. In some cases, such as a FW upgrade on the DPU, this
timeout period is insufficient, and the user has no way to recover the
host device.
To solve this issue, introduce a new FW pre-init timeout for health
recovery, which is set to 2 hours.
The timeout for devlink reload and probe will use the original one because
they are user triggered flows, and therefore should not have a
significantly long timeout, during which the user command would hang.
Signed-off-by: Gavin Li <gavinl@...dia.com>
Reviewed-by: Moshe Shemesh <moshe@...dia.com>
Reviewed-by: Shay Drory <shayd@...dia.com>
Signed-off-by: Saeed Mahameed <saeedm@...dia.com>
Signed-off-by: Sasha Levin <sashal@...nel.org>
---
 .../net/ethernet/mellanox/mlx5/core/devlink.c |  4 ++--
 .../ethernet/mellanox/mlx5/core/fw_reset.c    |  2 +-
 .../ethernet/mellanox/mlx5/core/lib/tout.c    |  1 +
 .../ethernet/mellanox/mlx5/core/lib/tout.h    |  1 +
 .../net/ethernet/mellanox/mlx5/core/main.c    | 23 +++++++++++--------
 .../ethernet/mellanox/mlx5/core/mlx5_core.h   |  2 +-
 6 files changed, 20 insertions(+), 13 deletions(-)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/devlink.c b/drivers/net/ethernet/mellanox/mlx5/core/devlink.c
index d1093bb2d436..5e432e4fc381 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/devlink.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/devlink.c
@@ -182,13 +182,13 @@ static int mlx5_devlink_reload_up(struct devlink *devlink, enum devlink_reload_a
 	*actions_performed = BIT(action);
 	switch (action) {
 	case DEVLINK_RELOAD_ACTION_DRIVER_REINIT:
-		return mlx5_load_one(dev);
+		return mlx5_load_one(dev, false);
 	case DEVLINK_RELOAD_ACTION_FW_ACTIVATE:
 		if (limit == DEVLINK_RELOAD_LIMIT_NO_RESET)
 			break;
 		/* On fw_activate action, also driver is reloaded and reinit performed */
 		*actions_performed |= BIT(DEVLINK_RELOAD_ACTION_DRIVER_REINIT);
-		return mlx5_load_one(dev);
+		return mlx5_load_one(dev, false);
 	default:
 		/* Unsupported action should not get to this function */
 		WARN_ON(1);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fw_reset.c b/drivers/net/ethernet/mellanox/mlx5/core/fw_reset.c
index 1c771287bee5..d5f540132a0e 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fw_reset.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fw_reset.c
@@ -106,7 +106,7 @@ static void mlx5_fw_reset_complete_reload(struct mlx5_core_dev *dev)
 	if (test_bit(MLX5_FW_RESET_FLAGS_PENDING_COMP, &fw_reset->reset_flags)) {
 		complete(&fw_reset->done);
 	} else {
-		mlx5_load_one(dev);
+		mlx5_load_one(dev, false);
 		devlink_remote_reload_actions_performed(priv_to_devlink(dev), 0,
 							BIT(DEVLINK_RELOAD_ACTION_DRIVER_REINIT) |
 							BIT(DEVLINK_RELOAD_ACTION_FW_ACTIVATE));
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/tout.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/tout.c
index c1df0d3595d8..d758848d34d0 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/lib/tout.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/tout.c
@@ -10,6 +10,7 @@ struct mlx5_timeouts {
 
 static const u32 tout_def_sw_val[MAX_TIMEOUT_TYPES] = {
 	[MLX5_TO_FW_PRE_INIT_TIMEOUT_MS] = 120000,
+	[MLX5_TO_FW_PRE_INIT_ON_RECOVERY_TIMEOUT_MS] = 7200000,
 	[MLX5_TO_FW_PRE_INIT_WARN_MESSAGE_INTERVAL_MS] = 20000,
 	[MLX5_TO_FW_PRE_INIT_WAIT_MS] = 2,
 	[MLX5_TO_FW_INIT_MS] = 2000,
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/tout.h b/drivers/net/ethernet/mellanox/mlx5/core/lib/tout.h
index 1c42ead782fa..257c03eeab36 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/lib/tout.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/tout.h
@@ -7,6 +7,7 @@
 enum mlx5_timeouts_types {
 	/* pre init timeouts (not read from FW) */
 	MLX5_TO_FW_PRE_INIT_TIMEOUT_MS,
+	MLX5_TO_FW_PRE_INIT_ON_RECOVERY_TIMEOUT_MS,
 	MLX5_TO_FW_PRE_INIT_WARN_MESSAGE_INTERVAL_MS,
 	MLX5_TO_FW_PRE_INIT_WAIT_MS,
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c
index 4e49dca94bc3..56a8079dc16a 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c
@@ -1015,7 +1015,7 @@ static void mlx5_cleanup_once(struct mlx5_core_dev *dev)
 	mlx5_devcom_unregister_device(dev->priv.devcom);
 }
 
-static int mlx5_function_setup(struct mlx5_core_dev *dev, bool boot)
+static int mlx5_function_setup(struct mlx5_core_dev *dev, u64 timeout)
 {
 	int err;
 
@@ -1030,11 +1030,11 @@ static int mlx5_function_setup(struct mlx5_core_dev *dev, bool boot)
 
 	/* wait for firmware to accept initialization segments configurations
 	 */
-	err = wait_fw_init(dev, mlx5_tout_ms(dev, FW_PRE_INIT_TIMEOUT),
+	err = wait_fw_init(dev, timeout,
 			   mlx5_tout_ms(dev, FW_PRE_INIT_WARN_MESSAGE_INTERVAL));
 	if (err) {
 		mlx5_core_err(dev, "Firmware over %llu MS in pre-initializing state, aborting\n",
-			      mlx5_tout_ms(dev, FW_PRE_INIT_TIMEOUT));
+			      timeout);
 		return err;
 	}
 
@@ -1297,7 +1297,7 @@ int mlx5_init_one(struct mlx5_core_dev *dev)
 	mutex_lock(&dev->intf_state_mutex);
 	dev->state = MLX5_DEVICE_STATE_UP;
 
-	err = mlx5_function_setup(dev, true);
+	err = mlx5_function_setup(dev, mlx5_tout_ms(dev, FW_PRE_INIT_TIMEOUT));
 	if (err)
 		goto err_function;
 
@@ -1361,9 +1361,10 @@ void mlx5_uninit_one(struct mlx5_core_dev *dev)
 	mutex_unlock(&dev->intf_state_mutex);
 }
 
-int mlx5_load_one(struct mlx5_core_dev *dev)
+int mlx5_load_one(struct mlx5_core_dev *dev, bool recovery)
 {
 	int err = 0;
+	u64 timeout;
 
 	mutex_lock(&dev->intf_state_mutex);
 	if (test_bit(MLX5_INTERFACE_STATE_UP, &dev->intf_state)) {
@@ -1373,7 +1374,11 @@ int mlx5_load_one(struct mlx5_core_dev *dev)
 	/* remove any previous indication of internal error */
 	dev->state = MLX5_DEVICE_STATE_UP;
 
-	err = mlx5_function_setup(dev, false);
+	if (recovery)
+		timeout = mlx5_tout_ms(dev, FW_PRE_INIT_ON_RECOVERY_TIMEOUT);
+	else
+		timeout = mlx5_tout_ms(dev, FW_PRE_INIT_TIMEOUT);
+	err = mlx5_function_setup(dev, timeout);
 	if (err)
 		goto err_function;
 
@@ -1746,7 +1751,7 @@ static void mlx5_pci_resume(struct pci_dev *pdev)
 
 	mlx5_pci_trace(dev, "Enter, loading driver..\n");
 
-	err = mlx5_load_one(dev);
+	err = mlx5_load_one(dev, false);
 
 	mlx5_pci_trace(dev, "Done, err = %d, device %s\n", err,
 		       !err ? "recovered" : "Failed");
@@ -1833,7 +1838,7 @@ static int mlx5_resume(struct pci_dev *pdev)
 {
 	struct mlx5_core_dev *dev = pci_get_drvdata(pdev);
 
-	return mlx5_load_one(dev);
+	return mlx5_load_one(dev, false);
 }
 
 static const struct pci_device_id mlx5_core_pci_table[] = {
@@ -1878,7 +1883,7 @@ int mlx5_recover_device(struct mlx5_core_dev *dev)
 			return -EIO;
 	}
 
-	return mlx5_load_one(dev);
+	return mlx5_load_one(dev, true);
 }
 
 static struct pci_driver mlx5_core_driver = {
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
index 6f8baa0f2a73..2d2150fc7a0f 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
@@ -290,7 +290,7 @@ void mlx5_mdev_uninit(struct mlx5_core_dev *dev);
 int mlx5_init_one(struct mlx5_core_dev *dev);
 void mlx5_uninit_one(struct mlx5_core_dev *dev);
 void mlx5_unload_one(struct mlx5_core_dev *dev);
-int mlx5_load_one(struct mlx5_core_dev *dev);
+int mlx5_load_one(struct mlx5_core_dev *dev, bool recovery);
 
 int mlx5_vport_get_other_func_cap(struct mlx5_core_dev *dev, u16 function_id, void *out);
 
-- 
2.35.1
Powered by blists - more mailing lists
 
