lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <21ca2154-a8ff-4f75-96c1-ded798901cc0@broadcom.com>
Date: Tue, 28 Oct 2025 17:17:45 -0400
From: James Quinlan <james.quinlan@...adcom.com>
To: Bjorn Helgaas <helgaas@...nel.org>
Cc: linux-pci@...r.kernel.org, Nicolas Saenz Julienne <nsaenz@...nel.org>,
 Bjorn Helgaas <bhelgaas@...gle.com>,
 Lorenzo Pieralisi <lorenzo.pieralisi@....com>,
 Cyril Brulebois <kibi@...ian.org>, bcm-kernel-feedback-list@...adcom.com,
 jim2101024@...il.com, Florian Fainelli <florian.fainelli@...adcom.com>,
 Lorenzo Pieralisi <lpieralisi@...nel.org>,
 Krzysztof WilczyƄski <kwilczynski@...nel.org>,
 Manivannan Sadhasivam <mani@...nel.org>, Rob Herring <robh@...nel.org>,
 "moderated list:BROADCOM BCM2711/BCM2835 ARM ARCHITECTURE"
 <linux-rpi-kernel@...ts.infradead.org>,
 "moderated list:BROADCOM BCM2711/BCM2835 ARM ARCHITECTURE"
 <linux-arm-kernel@...ts.infradead.org>,
 open list <linux-kernel@...r.kernel.org>
Subject: Re: [PATCH v3 2/2] PCI: brcmstb: Add panic/die handler to driver

On 10/20/25 14:48, Bjorn Helgaas wrote:
> On Fri, Oct 03, 2025 at 03:56:07PM -0400, Jim Quinlan wrote:
>> Whereas most PCIe HW returns 0xffffffff on illegal accesses and the like,
>> by default Broadcom's STB PCIe controller effects an abort.  Some SoCs --
>> 7216 and its descendants -- have new HW that identifies error details.
>>
>> This simple handler determines if the PCIe controller was the cause of the
>> abort and if so, prints out diagnostic info.  Unfortunately, an abort still
>> occurs.
>>
>> Care is taken to read the error registers only when the PCIe bridge is
>> active and the PCIe registers are acceptable.  Otherwise, a "die" event
>> caused by something other than the PCIe could cause an abort if the PCIe
>> "die" handler tried to access registers when the bridge is off.
>>
>> Example error output:
>>    brcm-pcie 8b20000.pcie: Error: Mem Acc: 32bit, Read, @0x38000000
>>    brcm-pcie 8b20000.pcie:  Type: TO=0 Abt=0 UnspReq=1 AccDsble=0 BadAddr=0
>> +/* Error report registers */
>> +#define PCIE_OUTB_ERR_TREAT				0x6000
>> +#define  PCIE_OUTB_ERR_TREAT_CONFIG_MASK		0x1
>> +#define  PCIE_OUTB_ERR_TREAT_MEM_MASK			0x2
>> +#define PCIE_OUTB_ERR_VALID				0x6004
>> +#define PCIE_OUTB_ERR_CLEAR				0x6008
>> +#define PCIE_OUTB_ERR_ACC_INFO				0x600c
>> +#define  PCIE_OUTB_ERR_ACC_INFO_CFG_ERR_MASK		0x01
>> +#define  PCIE_OUTB_ERR_ACC_INFO_MEM_ERR_MASK		0x02
>> +#define  PCIE_OUTB_ERR_ACC_INFO_TYPE_64_MASK		0x04
>> +#define  PCIE_OUTB_ERR_ACC_INFO_DIR_WRITE_MASK		0x10
>> +#define  PCIE_OUTB_ERR_ACC_INFO_BYTE_LANES_MASK		0xff00
>> +#define PCIE_OUTB_ERR_ACC_ADDR				0x6010
>> +#define PCIE_OUTB_ERR_ACC_ADDR_BUS_MASK			0xff00000
>> +#define PCIE_OUTB_ERR_ACC_ADDR_DEV_MASK			0xf8000
>> +#define PCIE_OUTB_ERR_ACC_ADDR_FUNC_MASK		0x7000
>> +#define PCIE_OUTB_ERR_ACC_ADDR_REG_MASK			0xfff
>> +#define PCIE_OUTB_ERR_CFG_CAUSE				0x6014
>> +#define  PCIE_OUTB_ERR_CFG_CAUSE_TIMEOUT_MASK		0x40
>> +#define  PCIE_OUTB_ERR_CFG_CAUSE_ABORT_MASK		0x20
>> +#define  PCIE_OUTB_ERR_CFG_CAUSE_UNSUPP_REQ_MASK	0x10
>> +#define  PCIE_OUTB_ERR_CFG_CAUSE_ACC_TIMEOUT_MASK	0x4
>> +#define  PCIE_OUTB_ERR_CFG_CAUSE_ACC_DISABLED_MASK	0x2
>> +#define  PCIE_OUTB_ERR_CFG_CAUSE_ACC_64BIT__MASK	0x1
>> +#define PCIE_OUTB_ERR_MEM_ADDR_LO			0x6018
>> +#define PCIE_OUTB_ERR_MEM_ADDR_HI			0x601c
>> +#define PCIE_OUTB_ERR_MEM_CAUSE				0x6020
>> +#define  PCIE_OUTB_ERR_MEM_CAUSE_TIMEOUT_MASK		0x40
>> +#define  PCIE_OUTB_ERR_MEM_CAUSE_ABORT_MASK		0x20
>> +#define  PCIE_OUTB_ERR_MEM_CAUSE_UNSUPP_REQ_MASK	0x10
>> +#define  PCIE_OUTB_ERR_MEM_CAUSE_ACC_DISABLED_MASK	0x2
>> +#define  PCIE_OUTB_ERR_MEM_CAUSE_BAD_ADDR_MASK		0x1
> IMO "_MASK" is not adding anything useful to these names.  But I see
> there's a lot of precedent in this driver.
Removed.
>
>>   #define  PCIE_RGR1_SW_INIT_1_PERST_MASK			0x1
>>   #define  PCIE_RGR1_SW_INIT_1_PERST_SHIFT		0x0
>>   
>> @@ -306,6 +342,8 @@ struct brcm_pcie {
>>   	bool			ep_wakeup_capable;
>>   	const struct pcie_cfg_data	*cfg;
>>   	bool			bridge_in_reset;
>> +	struct notifier_block	die_notifier;
>> +	struct notifier_block	panic_notifier;
>>   	spinlock_t		bridge_lock;
>>   };
>>   
>> @@ -1731,6 +1769,115 @@ static int brcm_pcie_resume_noirq(struct device *dev)
>>   	return ret;
>>   }
>>   
>> +/* Dump out PCIe errors on die or panic */
>> +static int _brcm_pcie_dump_err(struct brcm_pcie *pcie,
> What is the leading underscore telling me?  There's no
> brcm_pcie_dump_err() that we need to distinguish from.
Will be removed.
>
>> +			       const char *type)
>> +{
>> +	void __iomem *base = pcie->base;
>> +	int i, is_cfg_err, is_mem_err, lanes;
>> +	char *width_str, *direction_str, lanes_str[9];
>> +	u32 info, cfg_addr, cfg_cause, mem_cause, lo, hi;
>> +	unsigned long flags;
>> +
>> +	spin_lock_irqsave(&pcie->bridge_lock, flags);
>> +	/* Don't access registers when the bridge is off */
>> +	if (pcie->bridge_in_reset || readl(base + PCIE_OUTB_ERR_VALID) == 0) {
>> +		spin_unlock_irqrestore(&pcie->bridge_lock, flags);
>> +		return NOTIFY_DONE;
>> +	}
>> +
>> +	/* Read all necessary registers so we can release the spinlock ASAP */
>> +	info = readl(base + PCIE_OUTB_ERR_ACC_INFO);
>> +	is_cfg_err = !!(info & PCIE_OUTB_ERR_ACC_INFO_CFG_ERR_MASK);
>> +	is_mem_err = !!(info & PCIE_OUTB_ERR_ACC_INFO_MEM_ERR_MASK);
>> +	if (is_cfg_err) {
>> +		cfg_addr = readl(base + PCIE_OUTB_ERR_ACC_ADDR);
>> +		cfg_cause = readl(base + PCIE_OUTB_ERR_CFG_CAUSE);
>> +	}
>> +	if (is_mem_err) {
>> +		mem_cause = readl(base + PCIE_OUTB_ERR_MEM_CAUSE);
>> +		lo = readl(base + PCIE_OUTB_ERR_MEM_ADDR_LO);
>> +		hi = readl(base + PCIE_OUTB_ERR_MEM_ADDR_HI);
>> +	}
>> +	/* We've got all of the info, clear the error */
>> +	writel(1, base + PCIE_OUTB_ERR_CLEAR);
>> +	spin_unlock_irqrestore(&pcie->bridge_lock, flags);
>> +
>> +	dev_err(pcie->dev, "reporting data on PCIe %s error\n", type);
> Looks like this isn't included in the example error output.  Not a big
> deal in itself, but logging this:
>
>    brcm-pcie 8b20000.pcie: reporting data on PCIe Panic error
>
> suggests that we know this panic was directly *caused* by PCIe, and
> I'm not sure the fact that somebody called panic() and
> PCIE_OUTB_ERR_VALID was non-zero is convincing evidence of that.
>
> I think this relies on the assumptions that (a) the controller
> triggers an abort and (b) the abort handler calls panic().  So I think
> this logs useful information that *might* be related to the panic.
>
> I'd rather phrase this with a little less certainty, to convey the
> idea that "here's some PCIe error information that might be related to
> the panic/die".
Message changed.
>
>> +	width_str = (info & PCIE_OUTB_ERR_ACC_INFO_TYPE_64_MASK) ? "64bit" : "32bit";
>> +	direction_str = (info & PCIE_OUTB_ERR_ACC_INFO_DIR_WRITE_MASK) ? "Write" : "Read";
>> +	lanes = FIELD_GET(PCIE_OUTB_ERR_ACC_INFO_BYTE_LANES_MASK, info);
>> +	for (i = 0, lanes_str[8] = 0; i < 8; i++)
>> +		lanes_str[i] = (lanes & (1 << i)) ? '1' : '0';
>> +
>> +	if (is_cfg_err) {
>> +		int bus = FIELD_GET(PCIE_OUTB_ERR_ACC_ADDR_BUS_MASK, cfg_addr);
>> +		int dev = FIELD_GET(PCIE_OUTB_ERR_ACC_ADDR_DEV_MASK, cfg_addr);
>> +		int func = FIELD_GET(PCIE_OUTB_ERR_ACC_ADDR_FUNC_MASK, cfg_addr);
>> +		int reg = FIELD_GET(PCIE_OUTB_ERR_ACC_ADDR_REG_MASK, cfg_addr);
>> +
>> +		dev_err(pcie->dev, "Error: CFG Acc, %s, %s, Bus=%d, Dev=%d, Fun=%d, Reg=0x%x, lanes=%s\n",
> Why are we printing bus and dev with %d?  Can we use the usual format
> ("%04x:%02x:%02x.%d") so it matches other logging?
ack
>
>> +			width_str, direction_str, bus, dev, func, reg, lanes_str);
>> +		dev_err(pcie->dev, " Type: TO=%d Abt=%d UnsupReq=%d AccTO=%d AccDsbld=%d Acc64bit=%d\n",
>> +			!!(cfg_cause & PCIE_OUTB_ERR_CFG_CAUSE_TIMEOUT_MASK),
>> +			!!(cfg_cause & PCIE_OUTB_ERR_CFG_CAUSE_ABORT_MASK),
>> +			!!(cfg_cause & PCIE_OUTB_ERR_CFG_CAUSE_UNSUPP_REQ_MASK),
>> +			!!(cfg_cause & PCIE_OUTB_ERR_CFG_CAUSE_ACC_TIMEOUT_MASK),
>> +			!!(cfg_cause & PCIE_OUTB_ERR_CFG_CAUSE_ACC_DISABLED_MASK),
>> +			!!(cfg_cause & PCIE_OUTB_ERR_CFG_CAUSE_ACC_64BIT__MASK));
>> +	}
>> +
>> +	if (is_mem_err) {
>> +		u64 addr = ((u64)hi << 32) | (u64)lo;
>> +
>> +		dev_err(pcie->dev, "Error: Mem Acc, %s, %s, @0x%llx, lanes=%s\n",
>> +			width_str, direction_str, addr, lanes_str);
>> +		dev_err(pcie->dev, " Type: TO=%d Abt=%d UnsupReq=%d AccDsble=%d BadAddr=%d\n",
>> +			!!(mem_cause & PCIE_OUTB_ERR_MEM_CAUSE_TIMEOUT_MASK),
>> +			!!(mem_cause & PCIE_OUTB_ERR_MEM_CAUSE_ABORT_MASK),
>> +			!!(mem_cause & PCIE_OUTB_ERR_MEM_CAUSE_UNSUPP_REQ_MASK),
>> +			!!(mem_cause & PCIE_OUTB_ERR_MEM_CAUSE_ACC_DISABLED_MASK),
>> +			!!(mem_cause & PCIE_OUTB_ERR_MEM_CAUSE_BAD_ADDR_MASK));
>> +	}
>> +
>> +	return NOTIFY_OK;
> What is the difference between NOTIFY_DONE and NOTIFY_OK?  Can the
> caller do anything useful based on the difference?
>
> This seems like opportunistic error information that isn't definitely
> definitely connected to anything, so I'm not sure returning different
> values is really reliable.

Will change to NOTIFY_DONE.

Regards,

Jim Quinlan

Broadcom STB/CM



Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ