[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <1471026035-15323-17-git-send-email-rvatsavayi@caviumnetworks.com>
Date: Fri, 12 Aug 2016 11:20:33 -0700
From: Raghu Vatsavayi <rvatsavayi@...iumnetworks.com>
To: <davem@...emloft.net>
CC: <netdev@...r.kernel.org>,
Raghu Vatsavayi <rvatsavayi@...iumnetworks.com>,
Derek Chickles <derek.chickles@...iumnetworks.com>,
Satanand Burla <satananda.burla@...iumnetworks.com>,
Felix Manlunas <felix.manlunas@...iumnetworks.com>,
Raghu Vatsavayi <raghu.vatsavayi@...iumnetworks.com>
Subject: [PATCH net-next V2 16/18] liquidio: CN23XX health monitoring
Add support for watchdog-based health monitoring
of the Octeon cores on CN23XX devices.
Signed-off-by: Derek Chickles <derek.chickles@...iumnetworks.com>
Signed-off-by: Satanand Burla <satananda.burla@...iumnetworks.com>
Signed-off-by: Felix Manlunas <felix.manlunas@...iumnetworks.com>
Signed-off-by: Raghu Vatsavayi <raghu.vatsavayi@...iumnetworks.com>
---
drivers/net/ethernet/cavium/liquidio/lio_main.c | 124 ++++++++++++++++++++-
.../net/ethernet/cavium/liquidio/octeon_device.h | 2 +
2 files changed, 124 insertions(+), 2 deletions(-)
diff --git a/drivers/net/ethernet/cavium/liquidio/lio_main.c b/drivers/net/ethernet/cavium/liquidio/lio_main.c
index c73db84..e05fad4 100644
--- a/drivers/net/ethernet/cavium/liquidio/lio_main.c
+++ b/drivers/net/ethernet/cavium/liquidio/lio_main.c
@@ -24,6 +24,7 @@
#include <linux/firmware.h>
#include <linux/ptp_clock_kernel.h>
#include <net/vxlan.h>
+#include <linux/kthread.h>
#include "liquidio_common.h"
#include "octeon_droq.h"
#include "octeon_iq.h"
@@ -946,8 +947,6 @@ static void update_txq_status(struct octeon_device *oct, int iq_num)
struct lio *lio;
struct octeon_instr_queue *iq = oct->instr_queue[iq_num];
- /*octeon_update_iq_read_idx(oct, iq);*/
-
netdev = oct->props[iq->ifidx].netdev;
/* This is needed because the first IQ does not have
@@ -1183,6 +1182,100 @@ static int octeon_setup_interrupt(struct octeon_device *oct)
return 0;
}
+/* Number of Octeon cores polled by the watchdog thread. */
+#define LIO_WDOG_NUM_CORES	12
+/* Address of the CIU3 watchdog register of core c (one 8-byte slot per
+ * core). The argument is parenthesised and widened so that any integer
+ * expression can be passed safely.
+ */
+#define CIU3_WDOG(c)		(0x1010000020000ULL + ((u64)(c) << 3))
+/* Bits of a CIU3 watchdog register that hold the watchdog state field. */
+#define CIU3_WDOG_STATE_MASK	12ULL
+
+/**
+ * \brief Kernel thread body that monitors the health of the Octeon cores
+ * @param param pointer to the struct octeon_device to monitor
+ *
+ * Loops until kthread_should_stop() returns true. Each pass it reads
+ * CN23XX_SLI_SCRATCH2 (low 16 bits are treated as a bitmask of crashed
+ * cores; presumably written by the firmware - TODO confirm) and the CIU3
+ * watchdog register of every core, logs each newly stuck or crashed core
+ * exactly once, and then sleeps for two seconds. When CONFIG_MODULE_UNLOAD
+ * is set and any core is stuck or crashed, the module reference count is
+ * forced to zero so that rmmod can still remove the driver.
+ *
+ * @returns 0 always
+ */
+static int liquidio_watchdog(void *param)
+{
+	u64 wdog;
+	u16 mask_of_stuck_cores = 0;
+	u16 mask_of_crashed_cores = 0;
+	int core_num;
+	/* Per-core state: 0 = healthy, 1 = failure seen but not yet logged,
+	 * 2 = failure already logged (so the message is not repeated).
+	 */
+	u8 core_is_stuck[LIO_WDOG_NUM_CORES];
+	u8 core_crashed[LIO_WDOG_NUM_CORES];
+	struct octeon_device *oct = param;
+
+	memset(core_is_stuck, 0, sizeof(core_is_stuck));
+	memset(core_crashed, 0, sizeof(core_crashed));
+
+	while (!kthread_should_stop()) {
+		/* The crashed mask is re-read every pass; the stuck mask is
+		 * sticky and only ever accumulates bits.
+		 */
+		mask_of_crashed_cores =
+			(u16)octeon_read_csr64(oct, CN23XX_SLI_SCRATCH2);
+
+		for (core_num = 0; core_num < LIO_WDOG_NUM_CORES; core_num++) {
+			if (!core_is_stuck[core_num]) {
+				wdog = lio_pci_readq(oct, CIU3_WDOG(core_num));
+
+				/* look at watchdog state field */
+				wdog &= CIU3_WDOG_STATE_MASK;
+				if (wdog) {
+					/* this watchdog timer has expired */
+					core_is_stuck[core_num] = 1;
+					mask_of_stuck_cores |= (1 << core_num);
+				}
+			}
+
+			if (!core_crashed[core_num])
+				core_crashed[core_num] =
+					(mask_of_crashed_cores >> core_num) & 1;
+		}
+
+		if (mask_of_stuck_cores) {
+			for (core_num = 0; core_num < LIO_WDOG_NUM_CORES;
+			     core_num++) {
+				if (core_is_stuck[core_num] == 1) {
+					dev_err(&oct->pci_dev->dev,
+						"ERROR: Octeon core %d is stuck!\n",
+						core_num);
+					/* 2 means we have printk'd an error;
+					 * so no need to repeat the same printk
+					 */
+					core_is_stuck[core_num] = 2;
+				}
+			}
+		}
+
+		if (mask_of_crashed_cores) {
+			for (core_num = 0; core_num < LIO_WDOG_NUM_CORES;
+			     core_num++) {
+				if (core_crashed[core_num] == 1) {
+					dev_err(&oct->pci_dev->dev,
+						"ERROR: Octeon core %d crashed!  See oct-fwdump for details.\n",
+						core_num);
+					/* 2 means we have printk'd an error;
+					 * so no need to repeat the same printk
+					 */
+					core_crashed[core_num] = 2;
+				}
+			}
+		}
+#ifdef CONFIG_MODULE_UNLOAD
+		if (mask_of_stuck_cores || mask_of_crashed_cores) {
+			/* make module refcount=0 so that rmmod will work */
+			long refcount;
+
+			refcount = module_refcount(THIS_MODULE);
+
+			while (refcount > 0) {
+				module_put(THIS_MODULE);
+				refcount = module_refcount(THIS_MODULE);
+			}
+
+			/* compensate for and withstand an unlikely (but still
+			 * possible) race condition
+			 */
+			while (refcount < 0) {
+				try_module_get(THIS_MODULE);
+				refcount = module_refcount(THIS_MODULE);
+			}
+		}
+#endif
+		/* sleep for two seconds */
+		set_current_state(TASK_INTERRUPTIBLE);
+		schedule_timeout(2 * HZ);
+	}
+
+	return 0;
+}
+
+
/**
* \brief PCI probe handler
* @param pdev PCI device structure
@@ -1228,6 +1321,30 @@ liquidio_probe(struct pci_dev *pdev,
return -ENOMEM;
}
+ if (OCTEON_CN23XX_PF(oct_dev)) {
+ u64 scratch1;
+ u8 bus, device, function;
+
+ scratch1 = octeon_read_csr64(oct_dev, CN23XX_SLI_SCRATCH1);
+ if (!(scratch1 & 4ULL)) {
+ /* Bit 2 of SLI_SCRATCH_1 is a flag that indicates that
+ * the lio watchdog kernel thread is running for this
+ * NIC. Each NIC gets one watchdog kernel thread.
+ */
+ scratch1 |= 4ULL;
+ octeon_write_csr64(oct_dev, CN23XX_SLI_SCRATCH1,
+ scratch1);
+
+ bus = pdev->bus->number;
+ device = PCI_SLOT(pdev->devfn);
+ function = PCI_FUNC(pdev->devfn);
+ oct_dev->watchdog_task = kthread_create(
+ liquidio_watchdog, oct_dev,
+ "liowd/%02hhx:%02hhx.%hhx", bus, device, function);
+ wake_up_process(oct_dev->watchdog_task);
+ }
+ }
+
oct_dev->rx_pause = 1;
oct_dev->tx_pause = 1;
@@ -1560,6 +1677,9 @@ static void liquidio_remove(struct pci_dev *pdev)
dev_dbg(&oct_dev->pci_dev->dev, "Stopping device\n");
+ if (oct_dev->watchdog_task)
+ kthread_stop(oct_dev->watchdog_task);
+
if (oct_dev->app_mode && (oct_dev->app_mode == CVM_DRV_NIC_APP))
liquidio_stop_nic_module(oct_dev);
diff --git a/drivers/net/ethernet/cavium/liquidio/octeon_device.h b/drivers/net/ethernet/cavium/liquidio/octeon_device.h
index ec3cb22..773eb09 100644
--- a/drivers/net/ethernet/cavium/liquidio/octeon_device.h
+++ b/drivers/net/ethernet/cavium/liquidio/octeon_device.h
@@ -486,6 +486,8 @@ struct octeon_device {
/* private flags to control driver-specific features through ethtool */
u32 priv_flags;
+
+ void *watchdog_task;
};
#define OCT_DRV_ONLINE 1
--
1.8.3.1
Powered by blists - more mailing lists