Message-Id: <1477409795-28807-11-git-send-email-saeedm@mellanox.com>
Date:   Tue, 25 Oct 2016 18:36:33 +0300
From:   Saeed Mahameed <saeedm@...lanox.com>
To:     "David S. Miller" <davem@...emloft.net>
Cc:     netdev@...r.kernel.org, Or Gerlitz <ogerlitz@...lanox.com>,
        Leon Romanovsky <leonro@...lanox.com>,
        Mohamad Haj Yahia <mohamad@...lanox.com>,
        Saeed Mahameed <saeedm@...lanox.com>
Subject: [PATCH net 10/12] net/mlx5: Fix race between PCI error handlers and health work

From: Mohamad Haj Yahia <mohamad@...lanox.com>

Currently there is a race between the health care work and the kernel
PCI error handlers: both of them detect the error, and whichever is
called first performs the error handling. As a result, there is a
chance that the health care work will disable the PCI device after the
PCI slot has already been resumed by the error handlers.
Fix this by letting the PCI error handlers drain the health workqueue:
once new health work is no longer permitted, any work already queued is
cancelled before the device is disabled.
Also create a separate workqueue for health work, since there are now
two types of it: one for error detection and one for recovery.

Fixes: 89d44f0a6c73 ('net/mlx5_core: Add pci error handlers to mlx5_core driver')
Signed-off-by: Mohamad Haj Yahia <mohamad@...lanox.com>
Signed-off-by: Saeed Mahameed <saeedm@...lanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/health.c | 30 +++++++++++++++++++++---
 drivers/net/ethernet/mellanox/mlx5/core/main.c   |  9 +++++--
 include/linux/mlx5/driver.h                      |  4 ++++
 3 files changed, 38 insertions(+), 5 deletions(-)
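
Note to reviewers: the queue/drain interaction added by this patch can
be condensed into the following sketch (illustration only, not an extra
hunk; it simply mirrors the poll_health() and mlx5_drain_health_wq()
changes below):

	/* Producer side (health poll timer): only queue recovery work
	 * while new health works are still permitted.
	 */
	spin_lock(&health->wq_lock);
	if (!test_bit(MLX5_DROP_NEW_HEALTH_WORK, &health->flags))
		queue_work(health->wq, &health->work);
	spin_unlock(&health->wq_lock);

	/* Drain side (PCI error handler path): forbid new health work,
	 * then wait for any work already queued to finish before the
	 * device is disabled.
	 */
	spin_lock(&health->wq_lock);
	set_bit(MLX5_DROP_NEW_HEALTH_WORK, &health->flags);
	spin_unlock(&health->wq_lock);
	cancel_work_sync(&health->work);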

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/health.c b/drivers/net/ethernet/mellanox/mlx5/core/health.c
index 2cb4094..2d00022 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/health.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/health.c
@@ -64,6 +64,10 @@ enum {
 	MLX5_NIC_IFC_NO_DRAM_NIC	= 2
 };
 
+enum {
+	MLX5_DROP_NEW_HEALTH_WORK,
+};
+
 static u8 get_nic_interface(struct mlx5_core_dev *dev)
 {
 	return (ioread32be(&dev->iseg->cmdq_addr_l_sz) >> 8) & 3;
@@ -272,7 +276,13 @@ static void poll_health(unsigned long data)
 	if (in_fatal(dev) && !health->sick) {
 		health->sick = true;
 		print_health_info(dev);
-		schedule_work(&health->work);
+		spin_lock(&health->wq_lock);
+		if (!test_bit(MLX5_DROP_NEW_HEALTH_WORK, &health->flags))
+			queue_work(health->wq, &health->work);
+		else
+			dev_err(&dev->pdev->dev,
+				"new health works are not permitted at this stage\n");
+		spin_unlock(&health->wq_lock);
 	}
 }
 
@@ -282,6 +292,7 @@ void mlx5_start_health_poll(struct mlx5_core_dev *dev)
 
 	init_timer(&health->timer);
 	health->sick = 0;
+	clear_bit(MLX5_DROP_NEW_HEALTH_WORK, &health->flags);
 	health->health = &dev->iseg->health;
 	health->health_counter = &dev->iseg->health_counter;
 
@@ -298,11 +309,21 @@ void mlx5_stop_health_poll(struct mlx5_core_dev *dev)
 	del_timer_sync(&health->timer);
 }
 
+void mlx5_drain_health_wq(struct mlx5_core_dev *dev)
+{
+	struct mlx5_core_health *health = &dev->priv.health;
+
+	spin_lock(&health->wq_lock);
+	set_bit(MLX5_DROP_NEW_HEALTH_WORK, &health->flags);
+	spin_unlock(&health->wq_lock);
+	cancel_work_sync(&health->work);
+}
+
 void mlx5_health_cleanup(struct mlx5_core_dev *dev)
 {
 	struct mlx5_core_health *health = &dev->priv.health;
 
-	flush_work(&health->work);
+	destroy_workqueue(health->wq);
 }
 
 int mlx5_health_init(struct mlx5_core_dev *dev)
@@ -317,8 +338,11 @@ int mlx5_health_init(struct mlx5_core_dev *dev)
 
 	strcpy(name, "mlx5_health");
 	strcat(name, dev_name(&dev->pdev->dev));
+	health->wq = create_singlethread_workqueue(name);
 	kfree(name);
-
+	if (!health->wq)
+		return -ENOMEM;
+	spin_lock_init(&health->wq_lock);
 	INIT_WORK(&health->work, health_care);
 
 	return 0;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c
index 8a63910..9f90226 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c
@@ -1315,8 +1315,13 @@ static pci_ers_result_t mlx5_pci_err_detected(struct pci_dev *pdev,
 	dev_info(&pdev->dev, "%s was called\n", __func__);
 	mlx5_enter_error_state(dev);
 	mlx5_unload_one(dev, priv, false);
-	pci_save_state(pdev);
-	mlx5_pci_disable_device(dev);
+	/* In case of kernel call save the pci state and drain health wq */
+	if (state) {
+		pci_save_state(pdev);
+		mlx5_drain_health_wq(dev);
+		mlx5_pci_disable_device(dev);
+	}
+
 	return state == pci_channel_io_perm_failure ?
 		PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_NEED_RESET;
 }
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 5dbda60..7d9a5d0 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -418,7 +418,10 @@ struct mlx5_core_health {
 	u32				prev;
 	int				miss_counter;
 	bool				sick;
+	/* wq spinlock to synchronize draining */
+	spinlock_t			wq_lock;
 	struct workqueue_struct	       *wq;
+	unsigned long			flags;
 	struct work_struct		work;
 };
 
@@ -778,6 +781,7 @@ void mlx5_health_cleanup(struct mlx5_core_dev *dev);
 int mlx5_health_init(struct mlx5_core_dev *dev);
 void mlx5_start_health_poll(struct mlx5_core_dev *dev);
 void mlx5_stop_health_poll(struct mlx5_core_dev *dev);
+void mlx5_drain_health_wq(struct mlx5_core_dev *dev);
 int mlx5_buf_alloc_node(struct mlx5_core_dev *dev, int size,
 			struct mlx5_buf *buf, int node);
 int mlx5_buf_alloc(struct mlx5_core_dev *dev, int size, struct mlx5_buf *buf);
-- 
2.7.4
