[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <1547737521-29888-24-git-send-email-eranbe@mellanox.com>
Date: Thu, 17 Jan 2019 17:05:17 +0200
From: Eran Ben Elisha <eranbe@...lanox.com>
To: netdev@...r.kernel.org, Jiri Pirko <jiri@...lanox.com>,
"David S. Miller" <davem@...emloft.net>,
Ariel Almog <ariela@...lanox.com>,
Aya Levin <ayal@...lanox.com>,
Eran Ben Elisha <eranbe@...lanox.com>,
Moshe Shemesh <moshe@...lanox.com>
Subject: [PATCH net-next 23/27] net/mlx5: Report devlink health on FW issues
From: Moshe Shemesh <moshe@...lanox.com>
Use devlink_health_report() to report any symptom of a FW issue, such as
a FW counter miss or a new health syndrome.
Signed-off-by: Moshe Shemesh <moshe@...lanox.com>
Reviewed-by: Saeed Mahameed <saeedm@...lanox.com>
---
.../net/ethernet/mellanox/mlx5/core/devlink.c | 21 +++++++++++++++++++
.../net/ethernet/mellanox/mlx5/core/devlink.h | 1 +
.../net/ethernet/mellanox/mlx5/core/health.c | 12 +++++++++++
include/linux/mlx5/driver.h | 2 ++
4 files changed, 36 insertions(+)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/devlink.c b/drivers/net/ethernet/mellanox/mlx5/core/devlink.c
index 3124ced06d51..5713f89d9235 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/devlink.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/devlink.c
@@ -206,6 +206,27 @@ mlx5_fw_reporter_diagnose(struct devlink_health_reporter *reporter,
return 0;
}
+void mlx5_fw_reporter_err_work(struct work_struct *work)
+{
+ struct mlx5_fw_reporter_ctx fw_reporter_ctx;
+ struct mlx5_core_health *health;
+ struct mlx5_core_dev *dev;
+ struct mlx5_priv *priv;
+
+ health = container_of(work, struct mlx5_core_health, report_work);
+ priv = container_of(health, struct mlx5_priv, health);
+ dev = container_of(priv, struct mlx5_core_dev, priv);
+
+ fw_reporter_ctx.err_synd = health->synd;
+ fw_reporter_ctx.miss_counter = health->miss_counter;
+ if (fw_reporter_ctx.err_synd)
+ devlink_health_report(dev->fw_reporter, "FW syndrom reported",
+ &fw_reporter_ctx);
+ else if (fw_reporter_ctx.miss_counter)
+ devlink_health_report(dev->fw_reporter, "FW miss counter reported",
+ &fw_reporter_ctx);
+}
+
static const struct devlink_health_reporter_ops mlx5_fw_reporter_ops = {
.name = "FW",
.dump_size = SAVED_TRACES_BUFFER_SIZE_BYTE,
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/devlink.h b/drivers/net/ethernet/mellanox/mlx5/core/devlink.h
index 34f6bfed1cfb..082a648a3af3 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/devlink.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/devlink.h
@@ -16,5 +16,6 @@ int mlx5_devlink_register(struct devlink *devlink, struct device *dev);
void mlx5_devlink_unregister(struct devlink *devlink);
int mlx5_fw_reporter_create(struct mlx5_core_dev *dev);
void mlx5_fw_reporter_destroy(struct mlx5_core_dev *dev);
+void mlx5_fw_reporter_err_work(struct work_struct *work);
#endif /* __MLX5_DEVLINK_H__ */
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/health.c b/drivers/net/ethernet/mellanox/mlx5/core/health.c
index 20576a525421..61ff82380093 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/health.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/health.c
@@ -38,6 +38,7 @@
#include <linux/mlx5/driver.h>
#include <linux/mlx5/cmd.h>
#include "mlx5_core.h"
+#include "devlink.h"
#include "lib/eq.h"
#include "lib/mlx5.h"
#include "lib/pci_vsc.h"
@@ -464,8 +465,10 @@ static void poll_health(struct timer_list *t)
{
struct mlx5_core_dev *dev = from_timer(dev, t, priv.health.timer);
struct mlx5_core_health *health = &dev->priv.health;
+ struct health_buffer __iomem *h = health->health;
u32 fatal_error;
u32 count;
+ u8 synd;
if (dev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR)
goto out;
@@ -480,6 +483,13 @@ static void poll_health(struct timer_list *t)
if (health->miss_counter == MAX_MISSES) {
dev_err(&dev->pdev->dev, "device's health compromised - reached miss count\n");
mlx5_print_health_info(dev);
+ queue_work(health->wq, &health->report_work);
+ }
+
+ synd = ioread8(&h->synd);
+ if (synd && synd != health->synd) {
+ health->synd = synd;
+ queue_work(health->wq, &health->report_work);
}
fatal_error = check_fatal_sensors(dev);
@@ -535,6 +545,7 @@ void mlx5_drain_health_wq(struct mlx5_core_dev *dev)
set_bit(MLX5_DROP_NEW_RECOVERY_WORK, &health->flags);
spin_unlock_irqrestore(&health->wq_lock, flags);
cancel_delayed_work_sync(&health->recover_work);
+ cancel_work_sync(&health->report_work);
cancel_work_sync(&health->work);
}
@@ -574,6 +585,7 @@ int mlx5_health_init(struct mlx5_core_dev *dev)
return -ENOMEM;
spin_lock_init(&health->wq_lock);
INIT_WORK(&health->work, health_care);
+ INIT_WORK(&health->report_work, mlx5_fw_reporter_err_work);
INIT_DELAYED_WORK(&health->recover_work, health_recover);
health->crdump = NULL;
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index cd23926aaf4a..b5393684a6df 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -435,12 +435,14 @@ struct mlx5_core_health {
struct timer_list timer;
u32 prev;
int miss_counter;
+ u8 synd;
u32 fatal_error;
/* wq spinlock to synchronize draining */
spinlock_t wq_lock;
struct workqueue_struct *wq;
unsigned long flags;
struct work_struct work;
+ struct work_struct report_work;
struct delayed_work recover_work;
struct mlx5_fw_crdump *crdump;
};
--
2.17.1
Powered by blists - more mailing lists