linux-kernel - Re: [PATCH v3 2/2] PCI: brcmstb: Add panic/die handler to driver

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <2b0f9620-a105-6e49-f9cb-4bac14e14ce2@linux.intel.com>
Date: Tue, 21 Oct 2025 14:02:10 +0300 (EEST)
From: Ilpo Järvinen <ilpo.jarvinen@...ux.intel.com>
To: Bjorn Helgaas <helgaas@...nel.org>, 
    Jim Quinlan <james.quinlan@...adcom.com>
cc: linux-pci@...r.kernel.org, Nicolas Saenz Julienne <nsaenz@...nel.org>, 
    Bjorn Helgaas <bhelgaas@...gle.com>, 
    Lorenzo Pieralisi <lorenzo.pieralisi@....com>, 
    Cyril Brulebois <kibi@...ian.org>, bcm-kernel-feedback-list@...adcom.com, 
    jim2101024@...il.com, Florian Fainelli <florian.fainelli@...adcom.com>, 
    Lorenzo Pieralisi <lpieralisi@...nel.org>, 
    Krzysztof Wilczyński <kwilczynski@...nel.org>, 
    Manivannan Sadhasivam <mani@...nel.org>, Rob Herring <robh@...nel.org>, 
    "moderated list:BROADCOM BCM2711/BCM2835 ARM ARCHITECTURE" <linux-rpi-kernel@...ts.infradead.org>, 
    "moderated list:BROADCOM BCM2711/BCM2835 ARM ARCHITECTURE" <linux-arm-kernel@...ts.infradead.org>, 
    open list <linux-kernel@...r.kernel.org>
Subject: Re: [PATCH v3 2/2] PCI: brcmstb: Add panic/die handler to driver

On Mon, 20 Oct 2025, Bjorn Helgaas wrote:

> On Fri, Oct 03, 2025 at 03:56:07PM -0400, Jim Quinlan wrote:
> > Whereas most PCIe HW returns 0xffffffff on illegal accesses and the like,
> > by default Broadcom's STB PCIe controller effects an abort.  Some SoCs --
> > 7216 and its descendants -- have new HW that identifies error details.
> > 
> > This simple handler determines if the PCIe controller was the cause of the
> > abort and if so, prints out diagnostic info.  Unfortunately, an abort still
> > occurs.
> > 
> > Care is taken to read the error registers only when the PCIe bridge is
> > active and the PCIe registers are acceptable.  Otherwise, a "die" event
> > caused by something other than the PCIe could cause an abort if the PCIe
> > "die" handler tried to access registers when the bridge is off.
> > 
> > Example error output:
> >   brcm-pcie 8b20000.pcie: Error: Mem Acc: 32bit, Read, @0x38000000
> >   brcm-pcie 8b20000.pcie:  Type: TO=0 Abt=0 UnspReq=1 AccDsble=0 BadAddr=0
> 
> > +/* Error report registers */
> > +#define PCIE_OUTB_ERR_TREAT				0x6000
> > +#define  PCIE_OUTB_ERR_TREAT_CONFIG_MASK		0x1
> > +#define  PCIE_OUTB_ERR_TREAT_MEM_MASK			0x2
> > +#define PCIE_OUTB_ERR_VALID				0x6004
> > +#define PCIE_OUTB_ERR_CLEAR				0x6008
> > +#define PCIE_OUTB_ERR_ACC_INFO				0x600c
> > +#define  PCIE_OUTB_ERR_ACC_INFO_CFG_ERR_MASK		0x01
> > +#define  PCIE_OUTB_ERR_ACC_INFO_MEM_ERR_MASK		0x02
> > +#define  PCIE_OUTB_ERR_ACC_INFO_TYPE_64_MASK		0x04
> > +#define  PCIE_OUTB_ERR_ACC_INFO_DIR_WRITE_MASK		0x10
> > +#define  PCIE_OUTB_ERR_ACC_INFO_BYTE_LANES_MASK		0xff00
> > +#define PCIE_OUTB_ERR_ACC_ADDR				0x6010
> > +#define PCIE_OUTB_ERR_ACC_ADDR_BUS_MASK			0xff00000
> > +#define PCIE_OUTB_ERR_ACC_ADDR_DEV_MASK			0xf8000
> > +#define PCIE_OUTB_ERR_ACC_ADDR_FUNC_MASK		0x7000
> > +#define PCIE_OUTB_ERR_ACC_ADDR_REG_MASK			0xfff
> > +#define PCIE_OUTB_ERR_CFG_CAUSE				0x6014
> > +#define  PCIE_OUTB_ERR_CFG_CAUSE_TIMEOUT_MASK		0x40
> > +#define  PCIE_OUTB_ERR_CFG_CAUSE_ABORT_MASK		0x20
> > +#define  PCIE_OUTB_ERR_CFG_CAUSE_UNSUPP_REQ_MASK	0x10
> > +#define  PCIE_OUTB_ERR_CFG_CAUSE_ACC_TIMEOUT_MASK	0x4
> > +#define  PCIE_OUTB_ERR_CFG_CAUSE_ACC_DISABLED_MASK	0x2
> > +#define  PCIE_OUTB_ERR_CFG_CAUSE_ACC_64BIT__MASK	0x1

Double __

> > +#define PCIE_OUTB_ERR_MEM_ADDR_LO			0x6018
> > +#define PCIE_OUTB_ERR_MEM_ADDR_HI			0x601c
> > +#define PCIE_OUTB_ERR_MEM_CAUSE				0x6020
> > +#define  PCIE_OUTB_ERR_MEM_CAUSE_TIMEOUT_MASK		0x40
> > +#define  PCIE_OUTB_ERR_MEM_CAUSE_ABORT_MASK		0x20
> > +#define  PCIE_OUTB_ERR_MEM_CAUSE_UNSUPP_REQ_MASK	0x10
> > +#define  PCIE_OUTB_ERR_MEM_CAUSE_ACC_DISABLED_MASK	0x2
> > +#define  PCIE_OUTB_ERR_MEM_CAUSE_BAD_ADDR_MASK		0x1

Maybe use BIT() instead for single bits?

> IMO "_MASK" is not adding anything useful to these names.  But I see
> there's a lot of precedent in this driver.
>
> >  #define  PCIE_RGR1_SW_INIT_1_PERST_MASK			0x1
> >  #define  PCIE_RGR1_SW_INIT_1_PERST_SHIFT		0x0

Please don't add unnecessary _SHIFT defines as FIELD_GET/PREP() for the 
field define should have most cases covered that require shifting.

This define is also entirely unused in this patch.

> > @@ -306,6 +342,8 @@ struct brcm_pcie {
> >  	bool			ep_wakeup_capable;
> >  	const struct pcie_cfg_data	*cfg;
> >  	bool			bridge_in_reset;
> > +	struct notifier_block	die_notifier;
> > +	struct notifier_block	panic_notifier;
> >  	spinlock_t		bridge_lock;
> >  };
> >  
> > @@ -1731,6 +1769,115 @@ static int brcm_pcie_resume_noirq(struct device *dev)
> >  	return ret;
> >  }
> >  
> > +/* Dump out PCIe errors on die or panic */
> > +static int _brcm_pcie_dump_err(struct brcm_pcie *pcie,
> 
> What is the leading underscore telling me?  There's no
> brcm_pcie_dump_err() that we need to distinguish from.
> 
> > +			       const char *type)
> > +{
> > +	void __iomem *base = pcie->base;
> > +	int i, is_cfg_err, is_mem_err, lanes;
> > +	char *width_str, *direction_str, lanes_str[9];
> > +	u32 info, cfg_addr, cfg_cause, mem_cause, lo, hi;
> > +	unsigned long flags;
> > +
> > +	spin_lock_irqsave(&pcie->bridge_lock, flags);
> > +	/* Don't access registers when the bridge is off */
> > +	if (pcie->bridge_in_reset || readl(base + PCIE_OUTB_ERR_VALID) == 0) {
> > +		spin_unlock_irqrestore(&pcie->bridge_lock, flags);
> > +		return NOTIFY_DONE;
> > +	}
> > +
> > +	/* Read all necessary registers so we can release the spinlock ASAP */
> > +	info = readl(base + PCIE_OUTB_ERR_ACC_INFO);
> > +	is_cfg_err = !!(info & PCIE_OUTB_ERR_ACC_INFO_CFG_ERR_MASK);
> > +	is_mem_err = !!(info & PCIE_OUTB_ERR_ACC_INFO_MEM_ERR_MASK);
> > +	if (is_cfg_err) {
> > +		cfg_addr = readl(base + PCIE_OUTB_ERR_ACC_ADDR);
> > +		cfg_cause = readl(base + PCIE_OUTB_ERR_CFG_CAUSE);
> > +	}
> > +	if (is_mem_err) {
> > +		mem_cause = readl(base + PCIE_OUTB_ERR_MEM_CAUSE);
> > +		lo = readl(base + PCIE_OUTB_ERR_MEM_ADDR_LO);
> > +		hi = readl(base + PCIE_OUTB_ERR_MEM_ADDR_HI);
> > +	}
> > +	/* We've got all of the info, clear the error */
> > +	writel(1, base + PCIE_OUTB_ERR_CLEAR);
> > +	spin_unlock_irqrestore(&pcie->bridge_lock, flags);
> > +
> > +	dev_err(pcie->dev, "reporting data on PCIe %s error\n", type);
> 
> Looks like this isn't included in the example error output.  Not a big
> deal in itself, but logging this:
> 
>   brcm-pcie 8b20000.pcie: reporting data on PCIe Panic error
> 
> suggests that we know this panic was directly *caused* by PCIe, and
> I'm not sure the fact that somebody called panic() and
> PCIE_OUTB_ERR_VALID was non-zero is convincing evidence of that.
> 
> I think this relies on the assumptions that (a) the controller
> triggers an abort and (b) the abort handler calls panic().  So I think
> this logs useful information that *might* be related to the panic.
> 
> I'd rather phrase this with a little less certainty, to convey the
> idea that "here's some PCIe error information that might be related to
> the panic/die".
> 
> > +	width_str = (info & PCIE_OUTB_ERR_ACC_INFO_TYPE_64_MASK) ? "64bit" : "32bit";
> > +	direction_str = (info & PCIE_OUTB_ERR_ACC_INFO_DIR_WRITE_MASK) ? "Write" : "Read";

Please use str_read_write() + don't forget it's include.

It might be also worth to add str_64bit_32bit() in the form with the
dash ("64-bit") as there a couple of other drivers print the same choice.


> > +	lanes = FIELD_GET(PCIE_OUTB_ERR_ACC_INFO_BYTE_LANES_MASK, info);
> > +	for (i = 0, lanes_str[8] = 0; i < 8; i++)
> > +		lanes_str[i] = (lanes & (1 << i)) ? '1' : '0';
> > +
> > +	if (is_cfg_err) {
> > +		int bus = FIELD_GET(PCIE_OUTB_ERR_ACC_ADDR_BUS_MASK, cfg_addr);
> > +		int dev = FIELD_GET(PCIE_OUTB_ERR_ACC_ADDR_DEV_MASK, cfg_addr);
> > +		int func = FIELD_GET(PCIE_OUTB_ERR_ACC_ADDR_FUNC_MASK, cfg_addr);
> > +		int reg = FIELD_GET(PCIE_OUTB_ERR_ACC_ADDR_REG_MASK, cfg_addr);
> > +
> > +		dev_err(pcie->dev, "Error: CFG Acc, %s, %s, Bus=%d, Dev=%d, Fun=%d, Reg=0x%x, lanes=%s\n",
> 
> Why are we printing bus and dev with %d?  Can we use the usual format
> ("%04x:%02x:%02x.%d") so it matches other logging?
> 
> > +			width_str, direction_str, bus, dev, func, reg, lanes_str);
> > +		dev_err(pcie->dev, " Type: TO=%d Abt=%d UnsupReq=%d AccTO=%d AccDsbld=%d Acc64bit=%d\n",
> > +			!!(cfg_cause & PCIE_OUTB_ERR_CFG_CAUSE_TIMEOUT_MASK),
> > +			!!(cfg_cause & PCIE_OUTB_ERR_CFG_CAUSE_ABORT_MASK),
> > +			!!(cfg_cause & PCIE_OUTB_ERR_CFG_CAUSE_UNSUPP_REQ_MASK),
> > +			!!(cfg_cause & PCIE_OUTB_ERR_CFG_CAUSE_ACC_TIMEOUT_MASK),
> > +			!!(cfg_cause & PCIE_OUTB_ERR_CFG_CAUSE_ACC_DISABLED_MASK),
> > +			!!(cfg_cause & PCIE_OUTB_ERR_CFG_CAUSE_ACC_64BIT__MASK));
> > +	}
> > +
> > +	if (is_mem_err) {
> > +		u64 addr = ((u64)hi << 32) | (u64)lo;
> > +
> > +		dev_err(pcie->dev, "Error: Mem Acc, %s, %s, @0x%llx, lanes=%s\n",
> > +			width_str, direction_str, addr, lanes_str);
> > +		dev_err(pcie->dev, " Type: TO=%d Abt=%d UnsupReq=%d AccDsble=%d BadAddr=%d\n",
> > +			!!(mem_cause & PCIE_OUTB_ERR_MEM_CAUSE_TIMEOUT_MASK),
> > +			!!(mem_cause & PCIE_OUTB_ERR_MEM_CAUSE_ABORT_MASK),
> > +			!!(mem_cause & PCIE_OUTB_ERR_MEM_CAUSE_UNSUPP_REQ_MASK),
> > +			!!(mem_cause & PCIE_OUTB_ERR_MEM_CAUSE_ACC_DISABLED_MASK),
> > +			!!(mem_cause & PCIE_OUTB_ERR_MEM_CAUSE_BAD_ADDR_MASK));
> > +	}
> > +
> > +	return NOTIFY_OK;
> 
> What is the difference between NOTIFY_DONE and NOTIFY_OK?  Can the
> caller do anything useful based on the difference?
> 
> This seems like opportunistic error information that isn't definitely
> definitely connected to anything, so I'm not sure returning different
> values is really reliable.
>