lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <20190110102906.3751-10-saeedm@mellanox.com>
Date:   Thu, 10 Jan 2019 12:29:05 +0200
From:   Saeed Mahameed <saeedm@...lanox.com>
To:     "David S. Miller" <davem@...emloft.net>
Cc:     netdev@...r.kernel.org, Feras Daoud <ferasda@...lanox.com>,
        Saeed Mahameed <saeedm@...lanox.com>
Subject: [net-next 9/9] net/mlx5: Protect against infinite recovery requests

From: Feras Daoud <ferasda@...lanox.com>

Buggy HW may cause an infinite loop of recovery requests that may terminate
the driver. The following change protects against that by adding a
timestamp variable that remembers the time of the last recovery request
and allows recovery only if the period between two consecutive requests is
longer than 20 minutes.

Signed-off-by: Feras Daoud <ferasda@...lanox.com>
Signed-off-by: Saeed Mahameed <saeedm@...lanox.com>
---
 .../net/ethernet/mellanox/mlx5/core/health.c  | 24 +++++++++++++++++++
 include/linux/mlx5/driver.h                   |  1 +
 2 files changed, 25 insertions(+)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/health.c b/drivers/net/ethernet/mellanox/mlx5/core/health.c
index 74de30246eee..b43070e4f519 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/health.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/health.c
@@ -191,9 +191,26 @@ static bool reset_fw_if_needed(struct mlx5_core_dev *dev)
 
 #define MLX5_CRDUMP_WAIT_MS	60000
 #define MLX5_FW_RESET_WAIT_MS	1000
+#define MLX5_RECOVERY_TIMEOUT_MS 1200000
+
+static bool mlx5_health_allow_recover(struct mlx5_core_health *health)
+{
+	bool ret = false;
+
+	ret = health->last_recover_tstamp ?
+	      time_after(jiffies, health->last_recover_tstamp +
+			 msecs_to_jiffies(MLX5_RECOVERY_TIMEOUT_MS)) :
+	      true;
+
+	health->last_recover_tstamp = jiffies;
+
+	return ret;
+}
+
 void mlx5_enter_error_state(struct mlx5_core_dev *dev, bool force)
 {
 	unsigned long end, delay_ms = MLX5_FW_RESET_WAIT_MS;
+	struct mlx5_core_health *health = &dev->priv.health;
 	u32 fatal_error, err;
 	int lock = -EBUSY;
 
@@ -212,6 +229,12 @@ void mlx5_enter_error_state(struct mlx5_core_dev *dev, bool force)
 
 	fatal_error = check_fatal_sensors(dev);
 
+	if (fatal_error == MLX5_SENSOR_FW_SYND_RFR &&
+	    !mlx5_health_allow_recover(health)) {
+		mlx5_core_warn_once(dev, "Device recovery ignored\n");
+		goto err_state_done;
+	}
+
 	if (fatal_error || force) {
 		dev->state = MLX5_DEVICE_STATE_INTERNAL_ERROR;
 		mlx5_cmd_trigger_completions(dev);
@@ -560,6 +583,7 @@ int mlx5_health_init(struct mlx5_core_dev *dev)
 	INIT_WORK(&health->work, health_care);
 	INIT_DELAYED_WORK(&health->recover_work, health_recover);
 	health->crdump = NULL;
+	health->last_recover_tstamp = 0;
 
 	return 0;
 }
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 2ea6732c1d4d..06ab2647f790 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -442,6 +442,7 @@ struct mlx5_core_health {
 	struct work_struct		work;
 	struct delayed_work		recover_work;
 	struct mlx5_fw_crdump	       *crdump;
+	unsigned long			last_recover_tstamp;
 };
 
 struct mlx5_qp_table {
-- 
2.20.1

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ