lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [day] [month] [year] [list]
Message-ID: <20260123090741.1566469-2-o.rempel@pengutronix.de>
Date: Fri, 23 Jan 2026 10:07:37 +0100
From: Oleksij Rempel <o.rempel@...gutronix.de>
To: "David S. Miller" <davem@...emloft.net>,
	Eric Dumazet <edumazet@...gle.com>,
	Jakub Kicinski <kuba@...nel.org>,
	Paolo Abeni <pabeni@...hat.com>,
	Andrew Lunn <andrew+netdev@...n.ch>,
	Thangaraj Samynathan <Thangaraj.S@...rochip.com>,
	Rengarajan Sundararajan <Rengarajan.S@...rochip.com>
Cc: Oleksij Rempel <o.rempel@...gutronix.de>,
	kernel@...gutronix.de,
	linux-kernel@...r.kernel.org,
	netdev@...r.kernel.org,
	UNGLinuxDriver@...rochip.com
Subject: [RFC PATCH 1/4] net: lan78xx: Add devlink health support for diagnostics

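Add devlink health support for diagnostics.

Register a devlink instance for the device and create two health
reporters:

 - "fifo": reported when the statistics update detects a TX pause storm
   without RX progress or an excessive RX hardware drop ratio, and on
   TX timeouts.
 - "internal_err": reported when the interrupt status signals TDFO,
   TDFU, RDFO or MAC_ERR.

Each dump contains the context captured at detection time followed by a
live dump of the related registers. The chip ID and revision are exposed
through devlink info.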

Signed-off-by: Oleksij Rempel <o.rempel@...gutronix.de>
---
 drivers/net/usb/lan78xx.c | 388 +++++++++++++++++++++++++++++++++++++-
 1 file changed, 387 insertions(+), 1 deletion(-)

diff --git a/drivers/net/usb/lan78xx.c b/drivers/net/usb/lan78xx.c
index ad620b56443b..221be42e06f4 100644
--- a/drivers/net/usb/lan78xx.c
+++ b/drivers/net/usb/lan78xx.c
@@ -27,6 +27,7 @@
 #include <linux/slab.h>
 #include <linux/uaccess.h>
 #include <linux/usb.h>
+#include <net/devlink.h>
 #include <net/ip6_checksum.h>
 #include <net/selftests.h>
 #include <net/vxlan.h>
@@ -41,6 +42,11 @@
 #define THROTTLE_JIFFIES		(HZ / 8)
 #define UNLINK_TIMEOUT_MS		3
 
+#define LAN78XX_STALL_PAUSE_THRESH	100 /* TX pause-frame delta per stats update */
+#define LAN78XX_LIVELOCK_DROP_THRESH	10000 /* RX HW drop delta per stats update */
+#define LAN78XX_LIVELOCK_DROP_RATIO	10 /* HW drop delta vs. SW RX packet delta */
+#define LAN78XX_TX_TIMEOUT_DROP_THRESH	1000 /* last RX HW drop delta, at TX timeout */
+
 #define RX_MAX_QUEUE_MEMORY		(60 * 1518)
 
 #define SS_USB_PKT_SIZE			(1024)
@@ -373,6 +379,10 @@ struct lan78xx_priv {
 	u32 wol;
 };
 
+struct lan78xx_devlink_priv {
+	struct lan78xx_net *dev; /* owning device; read back via devlink_priv() */
+};
+
 enum skb_state {
 	illegal = 0,
 	tx_start,
@@ -411,6 +421,19 @@ struct statstage {
 	struct lan78xx_statstage64	curr_stat;
 };
 
+struct lan78xx_stat_snapshot {
+	ktime_t time; /* ktime_get_real() at capture */
+
+	u64 tx_pause_total; /* curr_stat.tx_pause_frames */
+	u64 tx_unicast_total; /* curr_stat.tx_unicast_frames */
+	u64 rx_total_frames; /* unicast + broadcast + multicast RX frames */
+	u64 rx_hw_drop_total; /* curr_stat.rx_dropped_frames */
+	u64 rx_sw_packets_total; /* net_device stats.rx_packets */
+
+	u32 last_delta_pause; /* pause delta of the last update, cast to u32 */
+	u32 last_delta_drops; /* HW drop delta of the last update, cast to u32 */
+};
+
 struct irq_domain_data {
 	struct irq_domain	*irqdomain;
 	unsigned int		phyirq;
@@ -477,6 +500,35 @@ struct lan78xx_net {
 
 	struct phylink		*phylink;
 	struct phylink_config	phylink_config;
+
+	struct devlink		*devlink;
+	struct devlink_health_reporter	*fifo_reporter;
+	struct devlink_health_reporter	*internal_err_reporter;
+	struct lan78xx_stat_snapshot	snapshot;
+};
+
+struct lan78xx_dump_ctx {
+	const char *msg; /* Trigger reason, also passed to devlink_health_report() */
+	ktime_t ts; /* Timestamp of detection (ktime_get_real()) */
+
+	union {
+		struct {
+			u64 delta_pause;
+			u64 delta_rx;
+			u64 delta_hw_drop;
+			u64 delta_sw_rx;
+		} fifo; /* counter deltas dumped by the "fifo" reporter */
+		struct {
+			u32 int_sts; /* Interrupt status seen by lan78xx_status() */
+			u32 int_enp; /* INT_EP_CTL view; NOTE(review): never set in this patch */
+		} err; /* context dumped by the "internal_err" reporter */
+	};
+};
+
+/* One entry of a register dump table: the register to read and its label */
+struct lan78xx_reg_map {
+	u32 reg; /* passed to lan78xx_read_reg() */
+	const char *name; /* key under which the value is put into the dump */
 };
 
 /* use ethtool to change the level for any given device */
@@ -484,6 +536,87 @@ static int msg_level = -1;
 module_param(msg_level, int, 0);
 MODULE_PARM_DESC(msg_level, "Override default message level");
 
+/* Build a lan78xx_reg_map entry; the name is the stringified macro argument */
+#define LAN78XX_DUMP_REG(reg) { reg, #reg }
+
+static const struct lan78xx_reg_map lan78xx_fifo_regs[] = {
+	/* --- FIFO Control & Status ---
+	 * FIFO-specific enable/reset bits.
+	 * used_bytes tells us if the bottleneck is USB (TX high) or MAC
+	 * (RX high).
+	 */
+	LAN78XX_DUMP_REG(FCT_TX_CTL),
+	LAN78XX_DUMP_REG(FCT_RX_CTL),
+
+	/* --- Data Path Usage ---
+	 * Capture total buffer usage including USB endpoint overhead.
+	 * If DP_STOR is high but FCT_USED is low, data is stuck in the USB
+	 * layer.
+	 */
+	LAN78XX_DUMP_REG(TX_DP_STOR),
+	LAN78XX_DUMP_REG(RX_DP_STOR),
+
+	/* --- FIFO Boundaries ---
+	 * Verify whether the FIFO partitioning is corrupted or misconfigured.
+	 */
+	LAN78XX_DUMP_REG(FCT_TX_FIFO_END),
+	LAN78XX_DUMP_REG(FCT_RX_FIFO_END),
+
+	/* --- Flow Control ---
+	 * Critical for "Pause Storm" debugging.
+	 * Check if thresholds are set correctly and if Pause frames are enabled.
+	 */
+	LAN78XX_DUMP_REG(FCT_FLOW),
+	LAN78XX_DUMP_REG(FLOW),
+
+	/* --- Configuration & Speed ---
+	 * Mismatches between MAC speed (1G) and USB speed (HighSpeed)
+	 * are the #1 cause of buffer overflows.
+	 */
+	LAN78XX_DUMP_REG(MAC_CR),       /* MAC Speed/Duplex */
+	LAN78XX_DUMP_REG(USB_CFG0),     /* USB Speed/Burst Cap Enable */
+	LAN78XX_DUMP_REG(BURST_CAP),    /* USB Burst Size Limit */
+	LAN78XX_DUMP_REG(BULK_IN_DLY),  /* Inter-packet delay settings */
+
+	/* --- Debug Pointers ---
+	 * Described as internal FIFO RAM pointers that may show wrapped or
+	 * frozen pointer logic. NOTE(review): confirm DP_SEL/DP_CMD semantics.
+	 */
+	LAN78XX_DUMP_REG(DP_SEL),
+	LAN78XX_DUMP_REG(DP_CMD),
+};
+
+static const struct lan78xx_reg_map lan78xx_err_regs[] = {
+	/* --- Interrupt Status ---
+	 * The "Smoking Gun" (live value). Reveals if the error was caused by:
+	 * - MAC_ERR_INT: Internal logic overflow/underflow.
+	 * - PHY_INT: Link loss or signal degradation.
+	 * - TDFO/RDFO: FIFO Overflows (redundant but explicit).
+	 */
+	LAN78XX_DUMP_REG(INT_STS),
+	LAN78XX_DUMP_REG(INT_EP_CTL),
+
+	/* --- System Health ---
+	 * Check for invalid power states (D3 while active) or stuck resets.
+	 * HW_CFG also contains the "Soft Reset" status bit.
+	 */
+	LAN78XX_DUMP_REG(HW_CFG),
+	LAN78XX_DUMP_REG(PMT_CTL),
+
+	/* --- Bus Integrity ---
+	 * USB_CFG1 contains Low Power Mode (LPM) and Suspend guards.
+	 */
+	LAN78XX_DUMP_REG(USB_CFG0),
+	LAN78XX_DUMP_REG(USB_CFG1),
+
+	/* --- MAC Status ---
+	 * Verify whether the receiver is actually enabled (RXEN) and whether
+	 * filtering (Promiscuous/Multicast) is set as expected.
+	 */
+	LAN78XX_DUMP_REG(MAC_CR),
+	LAN78XX_DUMP_REG(MAC_RX),
+};
+
 static struct sk_buff *lan78xx_get_buf(struct sk_buff_head *buf_pool)
 {
 	if (skb_queue_empty(buf_pool))
@@ -831,12 +964,67 @@ static void lan78xx_check_stat_rollover(struct lan78xx_net *dev,
 	memcpy(&dev->stats.saved, stats, sizeof(struct lan78xx_statstage));
 }
 
+static void lan78xx_check_stat_anomalies(struct lan78xx_net *dev)
+{
+	u64 delta_pause, delta_rx, delta_hw_drop, delta_sw_rx;
+	struct lan78xx_dump_ctx ctx = {0};
+	struct lan78xx_stat_snapshot now;
+	const char *anomaly_msg = NULL;
+
+	/* Capture "now": HW counters are copied under stats.access_lock */
+	now.time = ktime_get_real();
+
+	mutex_lock(&dev->stats.access_lock);
+	now.tx_pause_total = dev->stats.curr_stat.tx_pause_frames;
+	now.rx_total_frames = dev->stats.curr_stat.rx_unicast_frames +
+			      dev->stats.curr_stat.rx_broadcast_frames +
+			      dev->stats.curr_stat.rx_multicast_frames;
+	now.rx_hw_drop_total = dev->stats.curr_stat.rx_dropped_frames;
+	now.tx_unicast_total = dev->stats.curr_stat.tx_unicast_frames;
+	mutex_unlock(&dev->stats.access_lock);
+
+	now.rx_sw_packets_total = dev->net->stats.rx_packets; /* read outside the lock */
+
+	delta_pause = now.tx_pause_total - dev->snapshot.tx_pause_total;
+	delta_rx = now.rx_total_frames - dev->snapshot.rx_total_frames;
+	delta_hw_drop = now.rx_hw_drop_total - dev->snapshot.rx_hw_drop_total;
+	delta_sw_rx = now.rx_sw_packets_total - dev->snapshot.rx_sw_packets_total;
+
+	now.last_delta_pause = (u32)delta_pause;
+	now.last_delta_drops = (u32)delta_hw_drop;
+
+	dev->snapshot = now; /* also read by lan78xx_tx_timeout() */
+
+	if (delta_pause > LAN78XX_STALL_PAUSE_THRESH && delta_rx == 0) {
+		anomaly_msg = "Stall: Pause Storm & No RX";
+	} else if (delta_hw_drop > LAN78XX_LIVELOCK_DROP_THRESH &&
+		   delta_hw_drop > (delta_sw_rx * LAN78XX_LIVELOCK_DROP_RATIO)) {
+		anomaly_msg = "Stall: RX Livelock Detected (Excessive Drop Ratio)";
+	}
+
+	if (!anomaly_msg)
+		return;
+
+	/* Report the anomaly together with the deltas that triggered it */
+	ctx.msg = anomaly_msg;
+	ctx.ts = now.time;
+	ctx.fifo.delta_pause   = delta_pause;
+	ctx.fifo.delta_rx      = delta_rx;
+	ctx.fifo.delta_hw_drop = delta_hw_drop;
+	ctx.fifo.delta_sw_rx   = delta_sw_rx;
+
+	netdev_warn(dev->net, "%s (HW Drops: +%llu, SW RX: +%llu)\n",
+		    ctx.msg, delta_hw_drop, delta_sw_rx);
+
+	devlink_health_report(dev->fifo_reporter, ctx.msg, &ctx);
+}
+
 static void lan78xx_update_stats(struct lan78xx_net *dev)
 {
+	struct lan78xx_statstage lan78xx_stats;
 	u32 *p, *count, *max;
 	u64 *data;
 	int i;
-	struct lan78xx_statstage lan78xx_stats;
 
 	if (usb_autopm_get_interface(dev->intf) < 0)
 		return;
@@ -856,6 +1044,8 @@ static void lan78xx_update_stats(struct lan78xx_net *dev)
 
 	mutex_unlock(&dev->stats.access_lock);
 
+	lan78xx_check_stat_anomalies(dev);
+
 	usb_autopm_put_interface(dev->intf);
 }
 
@@ -1625,6 +1815,18 @@ static void lan78xx_status(struct lan78xx_net *dev, struct urb *urb)
 
 		if (dev->domain_data.phyirq > 0)
 			generic_handle_irq_safe(dev->domain_data.phyirq);
+	} else if (intdata & (INT_ENP_TDFO_INT | INT_ENP_TDFU_INT |
+			      INT_ENP_RDFO_INT | INT_ENP_MAC_ERR_INT)) {
+		struct lan78xx_dump_ctx ctx = {0};
+
+		ctx.msg = "HW Interrupt Error";
+		ctx.ts = ktime_get_real();
+		ctx.err.int_sts = intdata;
+
+		netdev_warn(dev->net, "HW Error detected: 0x%08x, triggering health report\n",
+			    intdata);
+
+		devlink_health_report(dev->internal_err_reporter, ctx.msg, &ctx);
 	} else {
 		netdev_warn(dev->net,
 			    "unexpected interrupt: 0x%08x\n", intdata);
@@ -4705,6 +4907,148 @@ static void intr_complete(struct urb *urb)
 	}
 }
 
+static int lan78xx_dump_regs(struct lan78xx_net *dev, struct devlink_fmsg *fmsg,
+			     const struct lan78xx_reg_map *map, size_t count)
+{
+	int ret, i;
+	u32 val;
+
+	for (i = 0; i < count; i++) { /* one name/value pair per table entry */
+		ret = lan78xx_read_reg(dev, map[i].reg, &val);
+		if (ret)
+			return ret; /* stop at the first failed register read */
+		devlink_fmsg_u32_pair_put(fmsg, map[i].name, val);
+	}
+	return 0;
+}
+
+static int lan78xx_fifo_dump(struct devlink_health_reporter *reporter,
+			     struct devlink_fmsg *fmsg, void *priv_ctx,
+			     struct netlink_ext_ack *extack)
+{
+	struct lan78xx_net *dev = devlink_health_reporter_priv(reporter);
+	struct lan78xx_dump_ctx *ctx = priv_ctx;
+
+	/* 1. Context Snapshot (skipped when priv_ctx is NULL):
+	 * Dump the specific counters that triggered the threshold.
+	 * Registers may have changed since the decision was made.
+	 */
+	if (ctx) {
+		devlink_fmsg_string_pair_put(fmsg, "trigger_reason", ctx->msg);
+		devlink_fmsg_u64_pair_put(fmsg, "timestamp_ns",
+					  ktime_to_ns(ctx->ts));
+
+		devlink_fmsg_obj_nest_start(fmsg); /* NOTE(review): unnamed nested object */
+		devlink_fmsg_u64_pair_put(fmsg, "trigger_delta_pause",
+					  ctx->fifo.delta_pause);
+		devlink_fmsg_u64_pair_put(fmsg, "trigger_delta_rx",
+					  ctx->fifo.delta_rx);
+		devlink_fmsg_u64_pair_put(fmsg, "trigger_delta_hw_drop",
+					  ctx->fifo.delta_hw_drop);
+		devlink_fmsg_u64_pair_put(fmsg, "trigger_delta_sw_rx",
+					  ctx->fifo.delta_sw_rx);
+		devlink_fmsg_obj_nest_end(fmsg);
+	}
+
+	/* USB Speed is critical for interpreting throughput/stall issues */
+	devlink_fmsg_u8_pair_put(fmsg, "usb_speed_enum", dev->udev->speed);
+
+	/* 2. Live Register Dump; a failed register read is returned as error */
+	return lan78xx_dump_regs(dev, fmsg, lan78xx_fifo_regs,
+				 ARRAY_SIZE(lan78xx_fifo_regs));
+}
+
+static int lan78xx_internal_err_dump(struct devlink_health_reporter *reporter,
+				     struct devlink_fmsg *fmsg, void *priv_ctx,
+				     struct netlink_ext_ack *extack)
+{
+	struct lan78xx_net *dev = devlink_health_reporter_priv(reporter);
+	struct lan78xx_dump_ctx *ctx = priv_ctx;
+
+	/* Interrupt status is "write-1-to-clear" or cleared on read, so dump
+	 * the value captured by the ISR in addition to the live registers.
+	 * NOTE(review): ctx->err.int_enp is not set anywhere in this patch.
+	 */
+	if (ctx) {
+		devlink_fmsg_string_pair_put(fmsg, "trigger_reason", ctx->msg);
+		devlink_fmsg_u64_pair_put(fmsg, "timestamp_ns",
+					  ktime_to_ns(ctx->ts));
+
+		devlink_fmsg_u32_pair_put(fmsg, "trigger_int_sts",
+					  ctx->err.int_sts);
+		devlink_fmsg_u32_pair_put(fmsg, "trigger_int_enp",
+					  ctx->err.int_enp);
+	}
+
+	return lan78xx_dump_regs(dev, fmsg, lan78xx_err_regs,
+				 ARRAY_SIZE(lan78xx_err_regs));
+}
+
+static const struct devlink_health_reporter_ops lan78xx_fifo_ops = {
+	.name = "fifo",
+	.dump = lan78xx_fifo_dump, /* only a dump callback is provided */
+};
+
+static const struct devlink_health_reporter_ops lan78xx_internal_err_ops = {
+	.name = "internal_err",
+	.dump = lan78xx_internal_err_dump, /* only a dump callback is provided */
+};
+
+static int lan78xx_health_init(struct lan78xx_net *dev)
+{
+	dev->fifo_reporter = devlink_health_reporter_create(dev->devlink,
+							    &lan78xx_fifo_ops,
+							    dev);
+	if (IS_ERR(dev->fifo_reporter)) {
+		netdev_warn(dev->net, "Failed to create fifo reporter\n");
+
+		return PTR_ERR(dev->fifo_reporter); /* NOTE(review): field keeps ERR_PTR */
+	}
+
+	dev->internal_err_reporter =
+		devlink_health_reporter_create(dev->devlink,
+					       &lan78xx_internal_err_ops, dev);
+	if (IS_ERR(dev->internal_err_reporter)) {
+		netdev_warn(dev->net, "Failed to create internal_err reporter\n");
+		devlink_health_reporter_destroy(dev->fifo_reporter); /* NOTE(review): stale ptr */
+
+		return PTR_ERR(dev->internal_err_reporter);
+	}
+
+	return 0; /* both reporters were created */
+}
+
+static void lan78xx_health_cleanup(struct lan78xx_net *dev)
+{
+	devlink_health_reporter_destroy(dev->fifo_reporter); /* NOTE(review): not validated */
+	devlink_health_reporter_destroy(dev->internal_err_reporter); /* same here */
+}
+
+static int lan78xx_devlink_info_get(struct devlink *devlink,
+				    struct devlink_info_req *req,
+				    struct netlink_ext_ack *extack)
+{
+	struct lan78xx_devlink_priv *dl_priv = devlink_priv(devlink);
+	struct lan78xx_net *dev = dl_priv->dev;
+	char buf[16]; /* scratch buffer for the formatted hex strings */
+
+	snprintf(buf, sizeof(buf), "0x%04X", dev->chipid);
+	devlink_info_version_fixed_put(req,
+				       DEVLINK_INFO_VERSION_GENERIC_ASIC_ID,
+				       buf);
+
+	snprintf(buf, sizeof(buf), "0x%04X", dev->chiprev);
+	devlink_info_version_fixed_put(req,
+				       DEVLINK_INFO_VERSION_GENERIC_ASIC_REV,
+				       buf);
+
+	return 0; /* NOTE(review): results of the _put() calls are not checked */
+}
+
+static const struct devlink_ops lan78xx_devlink_ops = {
+	.info_get = lan78xx_devlink_info_get, /* the only devlink op provided */
+};
+
 static void lan78xx_disconnect(struct usb_interface *intf)
 {
 	struct lan78xx_net *dev;
@@ -4719,6 +5063,13 @@ static void lan78xx_disconnect(struct usb_interface *intf)
 	udev = interface_to_usbdev(intf);
 	net = dev->net;
 
+	lan78xx_health_cleanup(dev);
+	if (dev->devlink) {
+		devlink_unregister(dev->devlink);
+		devlink_free(dev->devlink);
+		dev->devlink = NULL;
+	}
+
 	rtnl_lock();
 	phylink_stop(dev->phylink);
 	phylink_disconnect_phy(dev->phylink);
@@ -4749,6 +5100,30 @@ static void lan78xx_disconnect(struct usb_interface *intf)
 static void lan78xx_tx_timeout(struct net_device *net, unsigned int txqueue)
 {
 	struct lan78xx_net *dev = netdev_priv(net);
+	struct lan78xx_dump_ctx ctx = {0};
+	s64 diff_ms;
+
+	/* Calculate time since last health check */
+	ctx.ts = ktime_get_real();
+	diff_ms = ktime_ms_delta(ctx.ts, dev->snapshot.time);
+
+	/* We rely on the trend data captured during the last valid stat update
+	 * to infer the system state before the crash.
+	 */
+	if (dev->snapshot.last_delta_pause > LAN78XX_STALL_PAUSE_THRESH)
+		ctx.msg = "TX Timeout (Flow Control Storm?)";
+	else if (dev->snapshot.last_delta_drops > LAN78XX_TX_TIMEOUT_DROP_THRESH)
+		ctx.msg = "TX Timeout (FIFO Drop Storm?)";
+	else
+		ctx.msg = "TX Timeout";
+
+	ctx.fifo.delta_pause = dev->snapshot.last_delta_pause;
+	ctx.fifo.delta_hw_drop = dev->snapshot.last_delta_drops;
+
+	netdev_warn(dev->net, "%s (Last stat update: %lld ms ago)\n",
+		    ctx.msg, diff_ms);
+
+	devlink_health_report(dev->fifo_reporter, ctx.msg, &ctx);
 
 	unlink_urbs(dev, &dev->txq);
 	napi_schedule(&dev->napi);
@@ -5157,6 +5532,17 @@ static int lan78xx_probe(struct usb_interface *intf,
 	pm_runtime_set_autosuspend_delay(&udev->dev,
 					 DEFAULT_AUTOSUSPEND_DELAY);
 
+	dev->devlink = devlink_alloc(&lan78xx_devlink_ops,
+				     sizeof(struct lan78xx_devlink_priv),
+				     &udev->dev);
+	if (dev->devlink) {
+		struct lan78xx_devlink_priv *dl_priv = devlink_priv(dev->devlink);
+
+		dl_priv->dev = dev;
+		devlink_register(dev->devlink);
+		lan78xx_health_init(dev);
+	}
+
 	return 0;
 
 phy_uninit:
-- 
2.47.3


Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ