lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <1547737521-29888-25-git-send-email-eranbe@mellanox.com>
Date:   Thu, 17 Jan 2019 17:05:18 +0200
From:   Eran Ben Elisha <eranbe@...lanox.com>
To:     netdev@...r.kernel.org, Jiri Pirko <jiri@...lanox.com>,
        "David S. Miller" <davem@...emloft.net>,
        Ariel Almog <ariela@...lanox.com>,
        Aya Levin <ayal@...lanox.com>,
        Eran Ben Elisha <eranbe@...lanox.com>,
        Moshe Shemesh <moshe@...lanox.com>
Subject: [PATCH net-next 24/27] net/mlx5: Add FW fatal devlink_health_reporter

From: Moshe Shemesh <moshe@...lanox.com>

Create an mlx5 devlink health reporter for FW fatal errors.
The FW fatal reporter is added in addition to the fw reporter and
implements only the recover callback.
The point of having two reporters for FW issues is that we
don't want to run FW recovery on every issue, but only on fatal ones.

Signed-off-by: Moshe Shemesh <moshe@...lanox.com>
Reviewed-by: Saeed Mahameed <saeedm@...lanox.com>
---
 .../net/ethernet/mellanox/mlx5/core/devlink.c | 49 +++++++++++++++----
 .../net/ethernet/mellanox/mlx5/core/devlink.h |  4 +-
 .../net/ethernet/mellanox/mlx5/core/health.c  |  6 +--
 .../net/ethernet/mellanox/mlx5/core/main.c    |  6 +--
 .../ethernet/mellanox/mlx5/core/mlx5_core.h   |  1 +
 include/linux/mlx5/driver.h                   |  1 +
 6 files changed, 50 insertions(+), 17 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/devlink.c b/drivers/net/ethernet/mellanox/mlx5/core/devlink.c
index 5713f89d9235..ae08af00b101 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/devlink.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/devlink.c
@@ -235,21 +235,52 @@ static const struct devlink_health_reporter_ops mlx5_fw_reporter_ops = {
 		.diagnose = mlx5_fw_reporter_diagnose,
 };
 
-int mlx5_fw_reporter_create(struct mlx5_core_dev *dev)
+static int
+mlx5_fw_fatal_reporter_recover(struct devlink_health_reporter *reporter,
+			       void *priv_ctx)
 {
-	struct devlink *devlink = priv_to_devlink(dev);
+	struct mlx5_core_dev *dev = devlink_health_reporter_priv(reporter);
 
-	dev->fw_reporter = devlink_health_reporter_create(devlink, &mlx5_fw_reporter_ops,
-							  0, false, dev);
-	return PTR_ERR_OR_ZERO(dev->fw_reporter);
+	if (mlx5_sensor_pci_not_working(dev)) {
+		dev_err(&dev->pdev->dev, "health recovery flow aborted, PCI reads still not working\n");
+		return -ECANCELED;
+	}
+	dev_err(&dev->pdev->dev, "starting health recovery flow\n");
+
+	mlx5_recover_device(dev);
+
+	return 0;
 }
 
-void mlx5_fw_reporter_destroy(struct mlx5_core_dev *dev)
+static const struct devlink_health_reporter_ops mlx5_fw_fatal_reporter_ops = {
+		.name = "FW_fatal",
+		.recover = mlx5_fw_fatal_reporter_recover,
+};
+
+#define MLX5_REPORTER_FW_GRACEFUL_PERIOD 1200000
+int mlx5_fw_reporters_create(struct mlx5_core_dev *dev)
 {
-	if (!dev->fw_reporter)
-		return;
+	struct devlink *devlink = priv_to_devlink(dev);
 
-	devlink_health_reporter_destroy(dev->fw_reporter);
+	dev->fw_reporter =
+		devlink_health_reporter_create(devlink, &mlx5_fw_reporter_ops,
+					       0, false, dev);
+	if (IS_ERR(dev->fw_reporter))
+		return PTR_ERR(dev->fw_reporter);
+
+	dev->fw_fatal_reporter =
+		devlink_health_reporter_create(devlink, &mlx5_fw_fatal_reporter_ops,
+					       MLX5_REPORTER_FW_GRACEFUL_PERIOD,
+					       true, dev);
+	return PTR_ERR_OR_ZERO(dev->fw_fatal_reporter);
+}
+
+void mlx5_fw_reporters_destroy(struct mlx5_core_dev *dev)
+{
+	if (dev->fw_reporter)
+		devlink_health_reporter_destroy(dev->fw_reporter);
+	if (dev->fw_fatal_reporter)
+		devlink_health_reporter_destroy(dev->fw_fatal_reporter);
 }
 
 static int mlx5_devlink_get_crdump_snapshot(struct devlink *devlink, u32 id,
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/devlink.h b/drivers/net/ethernet/mellanox/mlx5/core/devlink.h
index 082a648a3af3..9b544f677aa7 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/devlink.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/devlink.h
@@ -14,8 +14,8 @@ struct mlx5_fw_reporter_ctx {
 
 int mlx5_devlink_register(struct devlink *devlink, struct device *dev);
 void mlx5_devlink_unregister(struct devlink *devlink);
-int mlx5_fw_reporter_create(struct mlx5_core_dev *dev);
-void mlx5_fw_reporter_destroy(struct mlx5_core_dev *dev);
+int mlx5_fw_reporters_create(struct mlx5_core_dev *dev);
+void mlx5_fw_reporters_destroy(struct mlx5_core_dev *dev);
 void mlx5_fw_reporter_err_work(struct work_struct *work);
 
 #endif /* __MLX5_DEVLINK_H__ */
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/health.c b/drivers/net/ethernet/mellanox/mlx5/core/health.c
index 61ff82380093..d3ea624bd03c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/health.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/health.c
@@ -91,7 +91,7 @@ void mlx5_set_nic_state(struct mlx5_core_dev *dev, u8 state)
 		    &dev->iseg->cmdq_addr_l_sz);
 }
 
-static bool sensor_pci_not_working(struct mlx5_core_dev *dev)
+bool mlx5_sensor_pci_not_working(struct mlx5_core_dev *dev)
 {
 	struct mlx5_core_health *health = &dev->priv.health;
 	struct health_buffer __iomem *h = health->health;
@@ -114,7 +114,7 @@ static bool sensor_fw_synd_rfr(struct mlx5_core_dev *dev)
 
 static u32 check_fatal_sensors(struct mlx5_core_dev *dev)
 {
-	if (sensor_pci_not_working(dev))
+	if (mlx5_sensor_pci_not_working(dev))
 		return MLX5_SENSOR_PCI_COMM_ERR;
 	if (pci_channel_offline(dev->pdev))
 		return MLX5_SENSOR_PCI_ERR;
@@ -315,7 +315,7 @@ static void health_recover(struct work_struct *work)
 	priv = container_of(health, struct mlx5_priv, health);
 	dev = container_of(priv, struct mlx5_core_dev, priv);
 
-	if (sensor_pci_not_working(dev)) {
+	if (mlx5_sensor_pci_not_working(dev)) {
 		dev_err(&dev->pdev->dev, "health recovery flow aborted, PCI reads still not working\n");
 		return;
 	}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c
index 010a6a28e08d..2e7c54975f47 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c
@@ -980,9 +980,9 @@ static int mlx5_load_one(struct mlx5_core_dev *dev, struct mlx5_priv *priv,
 		goto err_fw_tracer;
 	}
 
-	err = mlx5_fw_reporter_create(dev);
+	err = mlx5_fw_reporters_create(dev);
 	if (err)
-		dev_warn(&pdev->dev, "Failed to create FW reporter\n");
+		dev_warn(&pdev->dev, "Failed to create FW reporters\n");
 
 	err = mlx5_fpga_device_start(dev);
 	if (err) {
@@ -1116,7 +1116,7 @@ static int mlx5_unload_one(struct mlx5_core_dev *dev, struct mlx5_priv *priv,
 	mlx5_accel_ipsec_cleanup(dev);
 	mlx5_accel_tls_cleanup(dev);
 	mlx5_fpga_device_stop(dev);
-	mlx5_fw_reporter_destroy(dev);
+	mlx5_fw_reporters_destroy(dev);
 	mlx5_fw_tracer_cleanup(dev->tracer);
 	mlx5_eq_table_destroy(dev);
 	mlx5_pagealloc_stop(dev);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
index af5cf19cc019..dca9a736c2cd 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
@@ -204,6 +204,7 @@ enum {
 
 u8 mlx5_get_nic_state(struct mlx5_core_dev *dev);
 void mlx5_set_nic_state(struct mlx5_core_dev *dev, u8 state);
+bool mlx5_sensor_pci_not_working(struct mlx5_core_dev *dev);
 
 #define HEALTH_INFO_MAX_LINE 64
 #define HEALTH_INFO_LINES (MLX5_FLD_SZ_DW(health_buffer, assert_var) + 8)
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index b5393684a6df..f2c6850b0ec4 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -686,6 +686,7 @@ struct mlx5_core_dev {
 	struct page             *clock_info_page;
 	struct mlx5_fw_tracer   *tracer;
 	struct devlink_health_reporter *fw_reporter;
+	struct devlink_health_reporter *fw_fatal_reporter;
 	u32                      vsc_addr;
 };
 
-- 
2.17.1

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ