[<prev] [next>] [<thread-prev] [day] [month] [year] [list]
Message-ID: <f50465c8-d4c5-4e1c-b6d9-ebf7a14ad19d@broadcom.com>
Date: Tue, 28 Oct 2025 18:37:38 -0400
From: James Quinlan <james.quinlan@...adcom.com>
To: Ilpo Järvinen <ilpo.jarvinen@...ux.intel.com>,
Bjorn Helgaas <helgaas@...nel.org>
Cc: linux-pci@...r.kernel.org, Nicolas Saenz Julienne <nsaenz@...nel.org>,
Bjorn Helgaas <bhelgaas@...gle.com>,
Lorenzo Pieralisi <lorenzo.pieralisi@....com>,
Cyril Brulebois <kibi@...ian.org>, bcm-kernel-feedback-list@...adcom.com,
jim2101024@...il.com, Florian Fainelli <florian.fainelli@...adcom.com>,
Lorenzo Pieralisi <lpieralisi@...nel.org>,
Krzysztof Wilczyński <kwilczynski@...nel.org>,
Manivannan Sadhasivam <mani@...nel.org>, Rob Herring <robh@...nel.org>,
"moderated list:BROADCOM BCM2711/BCM2835 ARM ARCHITECTURE"
<linux-rpi-kernel@...ts.infradead.org>,
"moderated list:BROADCOM BCM2711/BCM2835 ARM ARCHITECTURE"
<linux-arm-kernel@...ts.infradead.org>,
open list <linux-kernel@...r.kernel.org>
Subject: Re: [PATCH v3 2/2] PCI: brcmstb: Add panic/die handler to driver
On 10/21/25 07:02, Ilpo Järvinen wrote:
> On Mon, 20 Oct 2025, Bjorn Helgaas wrote:
>
>> On Fri, Oct 03, 2025 at 03:56:07PM -0400, Jim Quinlan wrote:
>>> Whereas most PCIe HW returns 0xffffffff on illegal accesses and the like,
>>> by default Broadcom's STB PCIe controller effects an abort. Some SoCs --
>>> 7216 and its descendants -- have new HW that identifies error details.
>>>
>>> This simple handler determines if the PCIe controller was the cause of the
>>> abort and if so, prints out diagnostic info. Unfortunately, an abort still
>>> occurs.
>>>
>>> Care is taken to read the error registers only when the PCIe bridge is
>>> active and the PCIe registers are acceptable. Otherwise, a "die" event
>>> caused by something other than the PCIe could cause an abort if the PCIe
>>> "die" handler tried to access registers when the bridge is off.
>>>
>>> Example error output:
>>> brcm-pcie 8b20000.pcie: Error: Mem Acc: 32bit, Read, @0x38000000
>>> brcm-pcie 8b20000.pcie: Type: TO=0 Abt=0 UnspReq=1 AccDsble=0 BadAddr=0
>>> +/* Error report registers */
>>> +#define PCIE_OUTB_ERR_TREAT 0x6000
>>> +#define PCIE_OUTB_ERR_TREAT_CONFIG_MASK 0x1
>>> +#define PCIE_OUTB_ERR_TREAT_MEM_MASK 0x2
>>> +#define PCIE_OUTB_ERR_VALID 0x6004
>>> +#define PCIE_OUTB_ERR_CLEAR 0x6008
>>> +#define PCIE_OUTB_ERR_ACC_INFO 0x600c
>>> +#define PCIE_OUTB_ERR_ACC_INFO_CFG_ERR_MASK 0x01
>>> +#define PCIE_OUTB_ERR_ACC_INFO_MEM_ERR_MASK 0x02
>>> +#define PCIE_OUTB_ERR_ACC_INFO_TYPE_64_MASK 0x04
>>> +#define PCIE_OUTB_ERR_ACC_INFO_DIR_WRITE_MASK 0x10
>>> +#define PCIE_OUTB_ERR_ACC_INFO_BYTE_LANES_MASK 0xff00
>>> +#define PCIE_OUTB_ERR_ACC_ADDR 0x6010
>>> +#define PCIE_OUTB_ERR_ACC_ADDR_BUS_MASK 0xff00000
>>> +#define PCIE_OUTB_ERR_ACC_ADDR_DEV_MASK 0xf8000
>>> +#define PCIE_OUTB_ERR_ACC_ADDR_FUNC_MASK 0x7000
>>> +#define PCIE_OUTB_ERR_ACC_ADDR_REG_MASK 0xfff
>>> +#define PCIE_OUTB_ERR_CFG_CAUSE 0x6014
>>> +#define PCIE_OUTB_ERR_CFG_CAUSE_TIMEOUT_MASK 0x40
>>> +#define PCIE_OUTB_ERR_CFG_CAUSE_ABORT_MASK 0x20
>>> +#define PCIE_OUTB_ERR_CFG_CAUSE_UNSUPP_REQ_MASK 0x10
>>> +#define PCIE_OUTB_ERR_CFG_CAUSE_ACC_TIMEOUT_MASK 0x4
>>> +#define PCIE_OUTB_ERR_CFG_CAUSE_ACC_DISABLED_MASK 0x2
>>> +#define PCIE_OUTB_ERR_CFG_CAUSE_ACC_64BIT__MASK 0x1
> Double __
ack
>
>>> +#define PCIE_OUTB_ERR_MEM_ADDR_LO 0x6018
>>> +#define PCIE_OUTB_ERR_MEM_ADDR_HI 0x601c
>>> +#define PCIE_OUTB_ERR_MEM_CAUSE 0x6020
>>> +#define PCIE_OUTB_ERR_MEM_CAUSE_TIMEOUT_MASK 0x40
>>> +#define PCIE_OUTB_ERR_MEM_CAUSE_ABORT_MASK 0x20
>>> +#define PCIE_OUTB_ERR_MEM_CAUSE_UNSUPP_REQ_MASK 0x10
>>> +#define PCIE_OUTB_ERR_MEM_CAUSE_ACC_DISABLED_MASK 0x2
>>> +#define PCIE_OUTB_ERR_MEM_CAUSE_BAD_ADDR_MASK 0x1
> Maybe use BIT() instead for single bits?
ack
>
>> IMO "_MASK" is not adding anything useful to these names. But I see
>> there's a lot of precedent in this driver.
>>
>>> #define PCIE_RGR1_SW_INIT_1_PERST_MASK 0x1
>>> #define PCIE_RGR1_SW_INIT_1_PERST_SHIFT 0x0
> Please don't add unnecessary _SHIFT defines as FIELD_GET/PREP() for the
> field define should have most cases covered that require shifting.
>
> This define is also entirely unused in this patch.
I've removed PCIE_RGR1_SW_INIT_1_PERST_SHIFT as it is not used.
There is a reason we have _MASK and _SHIFT suffixes: a script scans the
driver code for such constants and compares/contrasts their values in
our RDB include files across multiple SoCs (22+) we support with this
driver. This helps me keep things working as the HW group has a bad
habit of moving registers and fields when a new SoC comes out.
I forget to mention this to Bjorn but I am not worried about our
PCIE_OUTB_xxx constants.
>>> @@ -306,6 +342,8 @@ struct brcm_pcie {
>>> bool ep_wakeup_capable;
>>> const struct pcie_cfg_data *cfg;
>>> bool bridge_in_reset;
>>> + struct notifier_block die_notifier;
>>> + struct notifier_block panic_notifier;
>>> spinlock_t bridge_lock;
>>> };
>>>
>>> @@ -1731,6 +1769,115 @@ static int brcm_pcie_resume_noirq(struct device *dev)
>>> return ret;
>>> }
>>>
>>> +/* Dump out PCIe errors on die or panic */
>>> +static int _brcm_pcie_dump_err(struct brcm_pcie *pcie,
>> What is the leading underscore telling me? There's no
>> brcm_pcie_dump_err() that we need to distinguish from.
>>
>>> + const char *type)
>>> +{
>>> + void __iomem *base = pcie->base;
>>> + int i, is_cfg_err, is_mem_err, lanes;
>>> + char *width_str, *direction_str, lanes_str[9];
>>> + u32 info, cfg_addr, cfg_cause, mem_cause, lo, hi;
>>> + unsigned long flags;
>>> +
>>> + spin_lock_irqsave(&pcie->bridge_lock, flags);
>>> + /* Don't access registers when the bridge is off */
>>> + if (pcie->bridge_in_reset || readl(base + PCIE_OUTB_ERR_VALID) == 0) {
>>> + spin_unlock_irqrestore(&pcie->bridge_lock, flags);
>>> + return NOTIFY_DONE;
>>> + }
>>> +
>>> + /* Read all necessary registers so we can release the spinlock ASAP */
>>> + info = readl(base + PCIE_OUTB_ERR_ACC_INFO);
>>> + is_cfg_err = !!(info & PCIE_OUTB_ERR_ACC_INFO_CFG_ERR_MASK);
>>> + is_mem_err = !!(info & PCIE_OUTB_ERR_ACC_INFO_MEM_ERR_MASK);
>>> + if (is_cfg_err) {
>>> + cfg_addr = readl(base + PCIE_OUTB_ERR_ACC_ADDR);
>>> + cfg_cause = readl(base + PCIE_OUTB_ERR_CFG_CAUSE);
>>> + }
>>> + if (is_mem_err) {
>>> + mem_cause = readl(base + PCIE_OUTB_ERR_MEM_CAUSE);
>>> + lo = readl(base + PCIE_OUTB_ERR_MEM_ADDR_LO);
>>> + hi = readl(base + PCIE_OUTB_ERR_MEM_ADDR_HI);
>>> + }
>>> + /* We've got all of the info, clear the error */
>>> + writel(1, base + PCIE_OUTB_ERR_CLEAR);
>>> + spin_unlock_irqrestore(&pcie->bridge_lock, flags);
>>> +
>>> + dev_err(pcie->dev, "reporting data on PCIe %s error\n", type);
>> Looks like this isn't included in the example error output. Not a big
>> deal in itself, but logging this:
>>
>> brcm-pcie 8b20000.pcie: reporting data on PCIe Panic error
>>
>> suggests that we know this panic was directly *caused* by PCIe, and
>> I'm not sure the fact that somebody called panic() and
>> PCIE_OUTB_ERR_VALID was non-zero is convincing evidence of that.
>>
>> I think this relies on the assumptions that (a) the controller
>> triggers an abort and (b) the abort handler calls panic(). So I think
>> this logs useful information that *might* be related to the panic.
>>
>> I'd rather phrase this with a little less certainty, to convey the
>> idea that "here's some PCIe error information that might be related to
>> the panic/die".
>>
>>> + width_str = (info & PCIE_OUTB_ERR_ACC_INFO_TYPE_64_MASK) ? "64bit" : "32bit";
>>> + direction_str = (info & PCIE_OUTB_ERR_ACC_INFO_DIR_WRITE_MASK) ? "Write" : "Read";
> Please use str_read_write() + don't forget it's include.
Done.
>
> It might be also worth to add str_64bit_32bit() in the form with the
> dash ("64-bit") as there a couple of other drivers print the same choice.
Unfortunately, I'm a little pressed for time right now to add this in now...
Regards,
Jim Quinlan
Broadcom STB/CM
>
>
>>> + lanes = FIELD_GET(PCIE_OUTB_ERR_ACC_INFO_BYTE_LANES_MASK, info);
>>> + for (i = 0, lanes_str[8] = 0; i < 8; i++)
>>> + lanes_str[i] = (lanes & (1 << i)) ? '1' : '0';
>>> +
>>> + if (is_cfg_err) {
>>> + int bus = FIELD_GET(PCIE_OUTB_ERR_ACC_ADDR_BUS_MASK, cfg_addr);
>>> + int dev = FIELD_GET(PCIE_OUTB_ERR_ACC_ADDR_DEV_MASK, cfg_addr);
>>> + int func = FIELD_GET(PCIE_OUTB_ERR_ACC_ADDR_FUNC_MASK, cfg_addr);
>>> + int reg = FIELD_GET(PCIE_OUTB_ERR_ACC_ADDR_REG_MASK, cfg_addr);
>>> +
>>> + dev_err(pcie->dev, "Error: CFG Acc, %s, %s, Bus=%d, Dev=%d, Fun=%d, Reg=0x%x, lanes=%s\n",
>> Why are we printing bus and dev with %d? Can we use the usual format
>> ("%04x:%02x:%02x.%d") so it matches other logging?
>>
>>> + width_str, direction_str, bus, dev, func, reg, lanes_str);
>>> + dev_err(pcie->dev, " Type: TO=%d Abt=%d UnsupReq=%d AccTO=%d AccDsbld=%d Acc64bit=%d\n",
>>> + !!(cfg_cause & PCIE_OUTB_ERR_CFG_CAUSE_TIMEOUT_MASK),
>>> + !!(cfg_cause & PCIE_OUTB_ERR_CFG_CAUSE_ABORT_MASK),
>>> + !!(cfg_cause & PCIE_OUTB_ERR_CFG_CAUSE_UNSUPP_REQ_MASK),
>>> + !!(cfg_cause & PCIE_OUTB_ERR_CFG_CAUSE_ACC_TIMEOUT_MASK),
>>> + !!(cfg_cause & PCIE_OUTB_ERR_CFG_CAUSE_ACC_DISABLED_MASK),
>>> + !!(cfg_cause & PCIE_OUTB_ERR_CFG_CAUSE_ACC_64BIT__MASK));
>>> + }
>>> +
>>> + if (is_mem_err) {
>>> + u64 addr = ((u64)hi << 32) | (u64)lo;
>>> +
>>> + dev_err(pcie->dev, "Error: Mem Acc, %s, %s, @0x%llx, lanes=%s\n",
>>> + width_str, direction_str, addr, lanes_str);
>>> + dev_err(pcie->dev, " Type: TO=%d Abt=%d UnsupReq=%d AccDsble=%d BadAddr=%d\n",
>>> + !!(mem_cause & PCIE_OUTB_ERR_MEM_CAUSE_TIMEOUT_MASK),
>>> + !!(mem_cause & PCIE_OUTB_ERR_MEM_CAUSE_ABORT_MASK),
>>> + !!(mem_cause & PCIE_OUTB_ERR_MEM_CAUSE_UNSUPP_REQ_MASK),
>>> + !!(mem_cause & PCIE_OUTB_ERR_MEM_CAUSE_ACC_DISABLED_MASK),
>>> + !!(mem_cause & PCIE_OUTB_ERR_MEM_CAUSE_BAD_ADDR_MASK));
>>> + }
>>> +
>>> + return NOTIFY_OK;
>> What is the difference between NOTIFY_DONE and NOTIFY_OK? Can the
>> caller do anything useful based on the difference?
>>
>> This seems like opportunistic error information that isn't definitely
>> definitely connected to anything, so I'm not sure returning different
>> values is really reliable.
>>
Powered by blists - more mailing lists