lists.openwall.net | lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC | |
Open Source and information security mailing list archives
| ||
|
Date: Thu, 2 Dec 2021 16:56:14 -0800 From: Saeed Mahameed <saeed@...nel.org> To: "David S. Miller" <davem@...emloft.net>, Jakub Kicinski <kuba@...nel.org> Cc: netdev@...r.kernel.org, Saeed Mahameed <saeedm@...dia.com>, Moshe Shemesh <moshe@...dia.com> Subject: [net-next v0 06/14] net/mlx5: Print more info on pci error handlers From: Saeed Mahameed <saeedm@...dia.com> In case mlx5_pci_err_detected was called with state equals to pci_channel_io_perm_failure, the driver will never come back up. It is nice to know why the driver went to zombie land, so print some useful information on pci err handlers. Signed-off-by: Saeed Mahameed <saeedm@...dia.com> Reviewed-by: Moshe Shemesh <moshe@...dia.com> --- .../net/ethernet/mellanox/mlx5/core/main.c | 51 ++++++++++++++----- 1 file changed, 37 insertions(+), 14 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index 7df9c7f8d9c8..d97c9e86d7b3 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -1604,12 +1604,28 @@ static void remove_one(struct pci_dev *pdev) mlx5_devlink_free(devlink); } +#define mlx5_pci_trace(dev, fmt, ...) ({ \ + struct mlx5_core_dev *__dev = (dev); \ + mlx5_core_info(__dev, "%s Device state = %d health sensors: %d pci_status: %d. " fmt, \ + __func__, __dev->state, mlx5_health_check_fatal_sensors(__dev), \ + __dev->pci_status, ##__VA_ARGS__); \ +}) + +static const char *result2str(enum pci_ers_result result) +{ + return result == PCI_ERS_RESULT_NEED_RESET ? "need reset" : + result == PCI_ERS_RESULT_DISCONNECT ? "disconnect" : + result == PCI_ERS_RESULT_RECOVERED ? "recovered" : + "unknown"; +} + static pci_ers_result_t mlx5_pci_err_detected(struct pci_dev *pdev, pci_channel_state_t state) { struct mlx5_core_dev *dev = pci_get_drvdata(pdev); + enum pci_ers_result res; - mlx5_core_info(dev, "%s was called\n", __func__); + mlx5_pci_trace(dev, "Enter, pci channel state = %d\n", state); mlx5_enter_error_state(dev, false); mlx5_error_sw_reset(dev); @@ -1617,8 +1633,11 @@ static pci_ers_result_t mlx5_pci_err_detected(struct pci_dev *pdev, mlx5_drain_health_wq(dev); mlx5_pci_disable_device(dev); - return state == pci_channel_io_perm_failure ? + res = state == pci_channel_io_perm_failure ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_NEED_RESET; + + mlx5_pci_trace(dev, "Exit, result = %d, %s\n", res, result2str(res)); + return res; } /* wait for the device to show vital signs by waiting @@ -1652,28 +1671,34 @@ static int wait_vital(struct pci_dev *pdev) static pci_ers_result_t mlx5_pci_slot_reset(struct pci_dev *pdev) { + enum pci_ers_result res = PCI_ERS_RESULT_DISCONNECT; struct mlx5_core_dev *dev = pci_get_drvdata(pdev); int err; - mlx5_core_info(dev, "%s was called\n", __func__); + mlx5_pci_trace(dev, "Enter\n"); err = mlx5_pci_enable_device(dev); if (err) { mlx5_core_err(dev, "%s: mlx5_pci_enable_device failed with error code: %d\n", __func__, err); - return PCI_ERS_RESULT_DISCONNECT; + goto out; } pci_set_master(pdev); pci_restore_state(pdev); pci_save_state(pdev); - if (wait_vital(pdev)) { - mlx5_core_err(dev, "%s: wait_vital timed out\n", __func__); - return PCI_ERS_RESULT_DISCONNECT; + err = wait_vital(pdev); + if (err) { + mlx5_core_err(dev, "%s: wait vital failed with error code: %d\n", + __func__, err); + goto out; } - return PCI_ERS_RESULT_RECOVERED; + res = PCI_ERS_RESULT_RECOVERED; +out: + mlx5_pci_trace(dev, "Exit, err = %d, result = %d, %s\n", err, res, result2str(res)); + return res; } static void mlx5_pci_resume(struct pci_dev *pdev) @@ -1681,14 +1706,12 @@ static void mlx5_pci_resume(struct pci_dev *pdev) struct mlx5_core_dev *dev = pci_get_drvdata(pdev); int err; - mlx5_core_info(dev, "%s was called\n", __func__); + mlx5_pci_trace(dev, "Enter, loading driver..\n"); err = mlx5_load_one(dev); - if (err) - mlx5_core_err(dev, "%s: mlx5_load_one failed with error code: %d\n", - __func__, err); - else - mlx5_core_info(dev, "%s: device recovered\n", __func__); + + mlx5_pci_trace(dev, "Done, err = %d, device %s\n", err, + !err ? "recovered" : "Failed"); } static const struct pci_error_handlers mlx5_err_handler = { -- 2.31.1
Powered by blists - more mailing lists