lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [thread-next>] [day] [month] [year] [list]
Message-ID: <20220403052449.1986-1-farbere@amazon.com>
Date:   Sun, 3 Apr 2022 05:24:49 +0000
From:   Eliav Farber <farbere@...zon.com>
To:     <wim@...ux-watchdog.org>, <linux@...ck-us.net>,
        <linux-watchdog@...r.kernel.org>, <linux-kernel@...r.kernel.org>
CC:     <ronenk@...zon.com>, <talel@...zon.com>, <hhhawa@...zon.com>,
        <jonnyc@...zon.com>, <hanochu@...zon.com>, <farbere@...zon.com>,
        <dwmw@...zon.co.uk>
Subject: [PATCH] watchdog: sp805: add support for irq

This change adds registration to IRQ for the sp805 watchdog.
Before this change there was no notification for watchdog
barking and the only thing that the HW watchdog did was to
directly restart the CPU.

With this change, upon IRQ the driver will call panic() which
in turn might initiate kdump (in case configured). In case the
dying process (like kdump collection) will hang, the hardware
watchdog will still directly restart the CPU on a second timeout.

The driver used to cut the specified timeout in half when setting
the watchdog timeout, since it was ignoring the IRQ that occurred
the first time the timeout expired. We now use the timeout as is
since the watchdog IRQ will go off after the configured interval.

Signed-off-by: Talel Shenhar <talel@...zon.com>
Signed-off-by: Eliav Farber <farbere@...zon.com>
---
 drivers/watchdog/sp805_wdt.c | 102 +++++++++++++++++++++++++++++++----
 1 file changed, 93 insertions(+), 9 deletions(-)

diff --git a/drivers/watchdog/sp805_wdt.c b/drivers/watchdog/sp805_wdt.c
index d8876fba686d..09223aed87e3 100644
--- a/drivers/watchdog/sp805_wdt.c
+++ b/drivers/watchdog/sp805_wdt.c
@@ -17,6 +17,7 @@
 #include <linux/amba/bus.h>
 #include <linux/bitops.h>
 #include <linux/clk.h>
+#include <linux/interrupt.h>
 #include <linux/io.h>
 #include <linux/ioport.h>
 #include <linux/kernel.h>
@@ -78,6 +79,11 @@ module_param(nowayout, bool, 0);
 MODULE_PARM_DESC(nowayout,
 		"Set to 1 to keep watchdog running after device release");
 
+static bool skip_panic_skip_reset_on_timeout;
+module_param(skip_panic_skip_reset_on_timeout, bool, 0600);
+MODULE_PARM_DESC(skip_panic_skip_reset_on_timeout,
+		 "For skipping panic and skipping reset on timeout, set this to Y/1");
+
 /* returns true if wdt is running; otherwise returns false */
 static bool wdt_is_running(struct watchdog_device *wdd)
 {
@@ -87,7 +93,14 @@ static bool wdt_is_running(struct watchdog_device *wdd)
 	return (wdtcontrol & ENABLE_MASK) == ENABLE_MASK;
 }
 
-/* This routine finds load value that will reset system in required timout */
+/*
+ * This routine finds load value that will reset or panic the system in the
+ * required timeout.
+ * It also calculates the actual timeout, which can differ from the input
+ * timeout if load value is not in the range of LOAD_MIN and LOAD_MAX.
+ * When panic is enabled it calculates the timeout till the panic, and when it
+ * is disabled it calculates the timeout till the reset.
+ */
 static int wdt_setload(struct watchdog_device *wdd, unsigned int timeout)
 {
 	struct sp805_wdt *wdt = watchdog_get_drvdata(wdd);
@@ -98,10 +111,21 @@ static int wdt_setload(struct watchdog_device *wdd, unsigned int timeout)
 	/*
 	 * sp805 runs counter with given value twice, after the end of first
 	 * counter it gives an interrupt and then starts counter again. If
-	 * interrupt already occurred then it resets the system. This is why
-	 * load is half of what should be required.
+	 * interrupt already occurred then it resets the system.
 	 */
-	load = div_u64(rate, 2) * timeout - 1;
+	if (wdt->adev->irq[0]) {
+		/*
+		 * Set load value based on a single timeout until we handle
+		 * the interrupt.
+		 */
+		load = rate * timeout - 1;
+	} else {
+		/*
+		 * Set load value to half the timeout, since the interrupt is
+		 * ignored and counter must expire twice before CPU is reset.
+		 */
+		load = div_u64(rate, 2) * timeout - 1;
+	}
 
 	load = (load > LOAD_MAX) ? LOAD_MAX : load;
 	load = (load < LOAD_MIN) ? LOAD_MIN : load;
@@ -109,13 +133,19 @@ static int wdt_setload(struct watchdog_device *wdd, unsigned int timeout)
 	spin_lock(&wdt->lock);
 	wdt->load_val = load;
 	/* roundup timeout to closest positive integer value */
-	wdd->timeout = div_u64((load + 1) * 2 + (rate / 2), rate);
+	if (wdt->adev->irq[0])
+		wdd->timeout = div_u64((load + 1) + (rate / 2), rate);
+	else
+		wdd->timeout = div_u64((load + 1) * 2 + (rate / 2), rate);
 	spin_unlock(&wdt->lock);
 
 	return 0;
 }
 
-/* returns number of seconds left for reset to occur */
+/*
+ * returns number of seconds left for reset to occur, or returns number of
+ * seconds left for panic to occur when enabled.
+ */
 static unsigned int wdt_timeleft(struct watchdog_device *wdd)
 {
 	struct sp805_wdt *wdt = watchdog_get_drvdata(wdd);
@@ -124,9 +154,18 @@ static unsigned int wdt_timeleft(struct watchdog_device *wdd)
 	spin_lock(&wdt->lock);
 	load = readl_relaxed(wdt->base + WDTVALUE);
 
-	/*If the interrupt is inactive then time left is WDTValue + WDTLoad. */
-	if (!(readl_relaxed(wdt->base + WDTRIS) & INT_MASK))
-		load += wdt->load_val + 1;
+	/*
+	 * When panic is enabled, it occurs after the first time that sp805
+	 * runs the counter with the given load value.
+	 * Otherwise, reset happens after sp805 runs the counter with given
+	 * value twice (once before the interrupt and second after the
+	 * interrupt), so if the interrupt is inactive (first count) then time
+	 * left is WDTValue + WDTLoad.
+	 */
+	if (!wdt->adev->irq[0])
+		if (!(readl_relaxed(wdt->base + WDTRIS) & INT_MASK))
+			load += wdt->load_val + 1;
+
 	spin_unlock(&wdt->lock);
 
 	return div_u64(load, wdt->rate);
@@ -227,6 +266,29 @@ static const struct watchdog_ops wdt_ops = {
 	.restart	= wdt_restart,
 };
 
+static irqreturn_t wdt_irq_handler(int irq, void *dev_id)
+{
+	struct device *dev = dev_id;
+	struct sp805_wdt *wdt = dev_get_drvdata(dev);
+
+	/*
+	 * Reset the watchdog timeout to maximize time available to the panic
+	 * handler before the watchdog induces a CPU reset.  Without resetting
+	 * the watchdog here, it would have a single timeout remaining between
+	 * interrupt and hardware reset. By resetting the timeout, the panic
+	 * handler can run with interrupts disabled for two watchdog timeouts,
+	 * ignoring the interrupt that would occur after the first timeout.
+	 */
+	wdt_config(&wdt->wdd, true);
+
+	if (skip_panic_skip_reset_on_timeout)
+		dev_warn(dev, "watchdog timeout expired\n");
+	else
+		panic("watchdog timeout expired");
+
+	return IRQ_HANDLED;
+}
+
 static int
 sp805_wdt_probe(struct amba_device *adev, const struct amba_id *id)
 {
@@ -299,9 +361,28 @@ sp805_wdt_probe(struct amba_device *adev, const struct amba_id *id)
 	}
 	amba_set_drvdata(adev, wdt);
 
+	if (adev->irq[0]) {
+		ret = devm_request_irq(&adev->dev,
+				       adev->irq[0],
+				       wdt_irq_handler,
+				       0,
+				       MODULE_NAME,
+				       adev);
+		if (ret) {
+			dev_err(&adev->dev,
+				"watchdog irq request failed: %d\n",
+				ret);
+			goto err_request_irq;
+		}
+	} else {
+		dev_warn(&adev->dev, "no irq exists for watchdog\n");
+	}
+
 	dev_info(&adev->dev, "registration successful\n");
 	return 0;
 
+err_request_irq:
+	watchdog_unregister_device(&wdt->wdd);
 err:
 	dev_err(&adev->dev, "Probe Failed!!!\n");
 	return ret;
@@ -311,6 +392,9 @@ static int sp805_wdt_remove(struct amba_device *adev)
 {
 	struct sp805_wdt *wdt = amba_get_drvdata(adev);
 
+	if (adev->irq[0])
+		devm_free_irq(&adev->dev, adev->irq[0], adev);
+
 	watchdog_unregister_device(&wdt->wdd);
 	watchdog_set_drvdata(&wdt->wdd, NULL);
 
-- 
2.32.0

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ