lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite for Android: free password hash cracker in your pocket
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20260115074909.245852-3-crajank@nvidia.com>
Date: Thu, 15 Jan 2026 09:49:09 +0200
From: Ciju Rajan K <crajank@...dia.com>
To: <hdegoede@...hat.com>, <ilpo.jarvinen@...ux.intel.com>,
	<tglx@...utronix.de>
CC: <christophe.jaillet@...adoo.fr>, <andriy.shevchenko@...ux.intel.com>,
	<vadimp@...dia.com>, <platform-driver-x86@...r.kernel.org>,
	<linux-kernel@...r.kernel.org>, Ciju Rajan K <crajank@...dia.com>
Subject: [PATCH platform-next v4 2/2] platform/mellanox: mlxreg-hotplug: Enabling interrupt storm detection

Enable interrupt storm detection and add a per-device counter for
tracking faulty devices. Once a device is identified as storming,
keep its interrupt masked so that it cannot generate any further
interrupts.

Add fields for interrupt storm handling.
Extend structure mlxreg_core_data with the following field:
 'wmark_cntr'   - interrupt storm counter.

Extend structure mlxreg_core_item with the following field:
 'storming_bits' - interrupt storming bits mask.

Reviewed-by: Vadim Pasternak <vadimp@...dia.com>
Signed-off-by: Ciju Rajan K <crajank@...dia.com>
---
 drivers/platform/mellanox/mlxreg-hotplug.c | 74 +++++++++++++++++++++-
 include/linux/platform_data/mlxreg.h       |  4 ++
 2 files changed, 76 insertions(+), 2 deletions(-)

diff --git a/drivers/platform/mellanox/mlxreg-hotplug.c b/drivers/platform/mellanox/mlxreg-hotplug.c
index d246772aafd6..4752477207d4 100644
--- a/drivers/platform/mellanox/mlxreg-hotplug.c
+++ b/drivers/platform/mellanox/mlxreg-hotplug.c
@@ -30,6 +30,9 @@
 #define MLXREG_HOTPLUG_ATTRS_MAX	128
 #define MLXREG_HOTPLUG_NOT_ASSERT	3
 
+/* Interrupt storm frequency */
+#define MLXREG_HOTPLUG_INTR_FREQ_HZ	100
+
 /**
  * struct mlxreg_hotplug_priv_data - platform private data:
  * @irq: platform device interrupt number;
@@ -339,6 +342,57 @@ static int mlxreg_hotplug_attr_init(struct mlxreg_hotplug_priv_data *priv)
 	return 0;
 }
 
+/**
+ * mlxreg_hotplug_storm_handler - generic interrupt storm detection callback
+ * @irq: interrupt number experiencing storm
+ * @freq: detected frequency (interrupts per second)
+ * @dev_id: device data (mlxreg_hotplug_priv_data)
+ *
+ * Walks the low 8 bits set in the cached state of the first platform data
+ * item and marks as storming every bit whose per-device counter has reached
+ * the threshold. Counters of the inspected bits are reset afterwards. Storming
+ * bits are only recorded in item->storming_bits; no register is written here.
+ */
+static void mlxreg_hotplug_storm_handler(unsigned int irq, unsigned int freq, void *dev_id)
+{
+	struct mlxreg_hotplug_priv_data *priv = dev_id;
+	struct mlxreg_core_hotplug_platform_data *pdata;
+	struct mlxreg_core_item *item;
+	struct mlxreg_core_data *data;
+	unsigned long asserted;
+	u32 bit;
+
+	dev_warn(priv->dev,
+		 "Interrupt storm detected on IRQ %u (%u interrupts/sec)\n",
+		 irq, freq);
+
+	pdata = dev_get_platdata(&priv->pdev->dev);
+	item = pdata->items;
+	asserted = item->cache;
+
+	for_each_set_bit(bit, &asserted, 8) {
+		int pos;
+
+		pos = mlxreg_hotplug_item_label_index_get(item->mask, bit);
+		if (pos < 0)
+			goto out;
+
+		data = item->data + pos;
+		/* Check per device interrupt counter */
+		if (data->wmark_cntr >= MLXREG_HOTPLUG_INTR_FREQ_HZ - 1) {
+			dev_err(priv->dev,
+				"Storming bit %u (label: %s) - interrupt masked permanently. Replace broken HW.\n",
+				bit, data->label);
+			/* Mark bit as storming. */
+			item->storming_bits |= BIT(bit);
+		}
+		data->wmark_cntr = 0;
+	}
+	return;
+ out:
+	dev_err(priv->dev, "Failed to complete interrupt storm handler\n");
+}
+
 static void
 mlxreg_hotplug_work_helper(struct mlxreg_hotplug_priv_data *priv,
 			   struct mlxreg_core_item *item)
@@ -371,6 +425,10 @@ mlxreg_hotplug_work_helper(struct mlxreg_hotplug_priv_data *priv,
 			goto out;
 
 		data = item->data + pos;
+
+		/* Counter to keep track of interrupt storm */
+		data->wmark_cntr++;
+
 		if (regval & BIT(bit)) {
 			if (item->inversed)
 				mlxreg_hotplug_device_destroy(priv, data, item->kind);
@@ -390,9 +448,9 @@ mlxreg_hotplug_work_helper(struct mlxreg_hotplug_priv_data *priv,
 	if (ret)
 		goto out;
 
-	/* Unmask event. */
+	/* Unmask event, exclude storming bits. */
 	ret = regmap_write(priv->regmap, item->reg + MLXREG_HOTPLUG_MASK_OFF,
-			   item->mask);
+			   item->mask & ~item->storming_bits);
 
  out:
 	if (ret)
@@ -767,6 +825,15 @@ static int mlxreg_hotplug_probe(struct platform_device *pdev)
 
 	/* Perform initial interrupts setup. */
 	mlxreg_hotplug_set_irq(priv);
+
+	/* Register with generic interrupt storm detection */
+	if (!irq_register_storm_detection(priv->irq, MLXREG_HOTPLUG_INTR_FREQ_HZ,
+					  mlxreg_hotplug_storm_handler, priv)) {
+		dev_warn(&pdev->dev, "Failed to register generic interrupt storm detection\n");
+	} else {
+		dev_info(&pdev->dev, "Registered generic storm detection for IRQ %d\n", priv->irq);
+	}
+
 	priv->after_probe = true;
 
 	return 0;
@@ -776,6 +843,9 @@ static void mlxreg_hotplug_remove(struct platform_device *pdev)
 {
 	struct mlxreg_hotplug_priv_data *priv = dev_get_drvdata(&pdev->dev);
 
+	/* Unregister generic interrupt storm detection */
+	irq_unregister_storm_detection(priv->irq);
+
 	/* Clean interrupts setup. */
 	mlxreg_hotplug_unset_irq(priv);
 	devm_free_irq(&pdev->dev, priv->irq, priv);
diff --git a/include/linux/platform_data/mlxreg.h b/include/linux/platform_data/mlxreg.h
index f6cca7a035c7..592256570175 100644
--- a/include/linux/platform_data/mlxreg.h
+++ b/include/linux/platform_data/mlxreg.h
@@ -131,6 +131,7 @@ struct mlxreg_hotplug_device {
  * @regnum: number of registers occupied by multi-register attribute;
  * @slot: slot number, at which device is located;
  * @secured: if set indicates that entry access is secured;
+ * @wmark_cntr: interrupt storm counter;
  */
 struct mlxreg_core_data {
 	char label[MLXREG_CORE_LABEL_MAX_SIZE];
@@ -151,6 +152,7 @@ struct mlxreg_core_data {
 	u8 regnum;
 	u8 slot;
 	u8 secured;
+	unsigned int wmark_cntr;
 };
 
 /**
@@ -167,6 +169,7 @@ struct mlxreg_core_data {
  * @ind: element's index inside the group;
  * @inversed: if 0: 0 for signal status is OK, if 1 - 1 is OK;
  * @health: true if device has health indication, false in other case;
+ * @storming_bits: interrupt storming bits mask;
  */
 struct mlxreg_core_item {
 	struct mlxreg_core_data *data;
@@ -180,6 +183,7 @@ struct mlxreg_core_item {
 	u8 ind;
 	u8 inversed;
 	u8 health;
+	u32 storming_bits;
 };
 
 /**
-- 
2.47.3


Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ