[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20250220152640.49010-6-john.madieu.xa@bp.renesas.com>
Date: Thu, 20 Feb 2025 16:26:10 +0100
From: John Madieu <john.madieu.xa@...renesas.com>
To: mturquette@...libre.com,
magnus.damm@...il.com,
krzk+dt@...nel.org,
rui.zhang@...el.com,
daniel.lezcano@...aro.org,
sboyd@...nel.org,
geert+renesas@...der.be,
lukasz.luba@....com,
rafael@...nel.org,
robh@...nel.org,
p.zabel@...gutronix.de
Cc: biju.das.jz@...renesas.com,
claudiu.beznea.uj@...renesas.com,
conor+dt@...nel.org,
devicetree@...r.kernel.org,
john.madieu@...il.com,
linux-kernel@...r.kernel.org,
linux-renesas-soc@...r.kernel.org,
linux-clk@...r.kernel.org,
John Madieu <john.madieu.xa@...renesas.com>
Subject: [PATCH 5/7] thermal: renesas: rzg3e: Add safety check when reading temperature
Because reading the temperature may fail, add a mechanism to panic in case
reading the temperature fails after a given number of trials. This is due
to the thermal core disabling the thermal zone device after a couple of
consecutive attempt failures.
Signed-off-by: John Madieu <john.madieu.xa@...renesas.com>
---
This is proposed in a separate patch on purpose, as it may be subject to debate
and would ease the review.
drivers/thermal/renesas/rzg3e_thermal.c | 38 +++++++++++++++++++++++--
1 file changed, 36 insertions(+), 2 deletions(-)
diff --git a/drivers/thermal/renesas/rzg3e_thermal.c b/drivers/thermal/renesas/rzg3e_thermal.c
index 4b7b16b1fb09..b70bff45c88f 100644
--- a/drivers/thermal/renesas/rzg3e_thermal.c
+++ b/drivers/thermal/renesas/rzg3e_thermal.c
@@ -83,6 +83,19 @@
#define TSU_TIMEOUT_US 10000
#define TSU_MIN_CLOCK_RATE 24000000
+/*
+ * Number of consecutive errors before shutdown
+ *
+ * While simulating thermal sensor failure, we have noticed that the thermal
+ * core tries to fetch the temperature a couple of times and then disables the
+ * thermal zone device. In case of extreme heat, this might lead to SoC
+ * destruction.
+ *
+ * Let's prevent this by limiting the number of consecutive failures and
+ * panicking once that limit is reached.
+ */
+#define MAX_TEMP_READ_ERRORS 10
+
/**
* struct rzg3e_thermal_priv - RZ/G3E thermal private data structure
* @base: TSU base address
@@ -93,6 +106,7 @@
* @conv_complete: ADC conversion completion
* @reg_lock: protect shared register access
* @cached_temp: last computed temperature (milliCelsius)
+ * @error_count: Track consecutive errors
* @trmval: trim (calibration) values
*/
struct rzg3e_thermal_priv {
@@ -104,6 +118,7 @@ struct rzg3e_thermal_priv {
struct completion conv_complete;
spinlock_t reg_lock;
int cached_temp;
+ atomic_t error_count;
u32 trmval[2];
};
@@ -200,6 +215,7 @@ static irqreturn_t rzg3e_thermal_adc_irq(int irq, void *dev_id)
static int rzg3e_thermal_get_temp(struct thermal_zone_device *zone, int *temp)
{
struct rzg3e_thermal_priv *priv = thermal_zone_device_priv(zone);
+ int error_count;
u32 val;
int ret;
@@ -217,7 +233,7 @@ static int rzg3e_thermal_get_temp(struct thermal_zone_device *zone, int *temp)
TSU_POLL_DELAY_US, TSU_TIMEOUT_US);
if (ret) {
dev_err(priv->dev, "ADC conversion timed out\n");
- return ret;
+ goto handle_error;
}
/* Start conversion */
@@ -225,15 +241,33 @@ static int rzg3e_thermal_get_temp(struct thermal_zone_device *zone, int *temp)
if (!wait_for_completion_timeout(&priv->conv_complete,
msecs_to_jiffies(100))) {
+ ret = -ETIMEDOUT;
dev_err(priv->dev, "ADC conversion completion timeout\n");
- return -ETIMEDOUT;
+ goto handle_error;
}
scoped_guard(spinlock_irqsave, &priv->reg_lock) {
*temp = priv->cached_temp;
}
+ /* Reset error count on successful read */
+ atomic_set(&priv->error_count, 0);
return 0;
+
+handle_error:
+ error_count = atomic_inc_return(&priv->error_count);
+ if (error_count >= MAX_TEMP_READ_ERRORS) {
+ dev_emerg(priv->dev,
+ "Failed to read temperature %d times, initiating emergency shutdown\n",
+ error_count);
+ mdelay(100);
+ panic("Temperature sensor failure - emergency shutdown");
+ }
+
+ dev_err(priv->dev, "Failed to read temperature (error %d), attempt %d/%d\n",
+ ret, error_count, MAX_TEMP_READ_ERRORS);
+
+ return ret;
}
/* Convert temperature in milliCelsius to raw sensor code */
--
2.25.1
Powered by blists - more mailing lists