linux-kernel - Re: [PATCH v9 2/2] PCI: hip: Add handling of HiSilicon HIP PCIe controller errors

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives

Hash Suite: Windows password security audit tool. GUI, reports in PDF.

[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]

Message-ID: <20200615120053.GZ2428291@smile.fi.intel.com>
Date:   Mon, 15 Jun 2020 15:00:53 +0300
From:   Andy Shevchenko <andriy.shevchenko@...ux.intel.com>
To:     Shiju Jose <shiju.jose@...wei.com>
Cc:     linux-acpi@...r.kernel.org, linux-pci@...r.kernel.org,
        linux-kernel@...r.kernel.org, rjw@...ysocki.net, bp@...en8.de,
        james.morse@....com, lenb@...nel.org, tony.luck@...el.com,
        dan.carpenter@...cle.com, zhangliguang@...ux.alibaba.com,
        wangkefeng.wang@...wei.com, jroedel@...e.de,
        yangyicong@...ilicon.com, jonathan.cameron@...wei.com,
        tanxiaofei@...wei.com
Subject: Re: [PATCH v9 2/2] PCI: hip: Add handling of HiSilicon HIP PCIe
 controller errors

On Mon, Jun 15, 2020 at 11:15:52AM +0100, Shiju Jose wrote:
> From: Yicong Yang <yangyicong@...ilicon.com>
> 
> The HiSilicon HIP PCIe controller is capable of handling errors
> on root port and perform port reset separately at each root port.
> 
> Add error handling driver for HIP PCIe controller to log
> and report recoverable errors. Perform root port reset and restore
> link status after the recovery.
> 
> Following are some of the PCIe controller's recoverable errors
> 1. completion transmission timeout error.
> 2. CRS retry counter over the threshold error.
> 3. ECC 2 bit errors
> 4. AXI bresponse/rresponse errors etc.
> 
> The driver placed in the drivers/pci/controller/ because the
> HIP PCIe controller does not use DWC ip.

...

> +#include <linux/acpi.h>
> +#include <acpi/ghes.h>

bits.h ?

> +#include <linux/delay.h>
> +#include <linux/pci.h>
> +#include <linux/platform_device.h>
> +#include <linux/kfifo.h>
> +#include <linux/spinlock.h>

...

> +static guid_t hisi_pcie_sec_type = GUID_INIT(0xB2889FC9, 0xE7D7, 0x4F9D,
> +			0xA8, 0x67, 0xAF, 0x42, 0xE9, 0x8B, 0xE7, 0x72);

Can we have it in more common pattern, i.e.

static guid_t hisi_pcie_sec_type =
	GUID_INIT(0xB2889FC9, 0xE7D7, 0x4F9D,
		  0xA8, 0x67, 0xAF, 0x42, 0xE9, 0x8B, 0xE7, 0x72);
?

...

> +#define HISI_PCIE_CORE_PORT_ID(v)        (((v) % 8) << 1)

% -> & ?

...

> +struct hisi_pcie_error_private {
> +	struct notifier_block	nb;
> +	struct platform_device	*pdev;

Do you really need platform device? Isn't struct device * enough?

> +};

...

> +static char *hisi_pcie_sub_module_name(u8 id)
> +{
> +	switch (id) {
> +	case HISI_PCIE_SUB_MODULE_ID_AP: return "AP Layer";
> +	case HISI_PCIE_SUB_MODULE_ID_TL: return "TL Layer";
> +	case HISI_PCIE_SUB_MODULE_ID_MAC: return "MAC Layer";
> +	case HISI_PCIE_SUB_MODULE_ID_DL: return "DL Layer";
> +	case HISI_PCIE_SUB_MODULE_ID_SDI: return "SDI Layer";
> +	}

match_string() ?

> +	return "unknown";

> +}
> +
> +static char *hisi_pcie_error_severity(u8 err_sev)
> +{
> +	switch (err_sev) {
> +	case HISI_ERR_SEV_RECOVERABLE: return "recoverable";
> +	case HISI_ERR_SEV_FATAL: return "fatal";
> +	case HISI_ERR_SEV_CORRECTED: return "corrected";
> +	case HISI_ERR_SEV_NONE: return "none";
> +	}

Ditto?

> +	return "unknown";
> +}

...

> +	pdev = pci_get_domain_bus_and_slot(domain, busnr, devfn);
> +	if (!pdev) {

> +		dev_info(device, "Fail to get root port %04x:%02x:%02x.%d device\n",
> +			 domain, busnr, PCI_SLOT(devfn), PCI_FUNC(devfn));

pci_info() ?

> +		return -ENODEV;
> +	}

...

> +	/*
> +	 * The initialization time of subordinate devices after
> +	 * hot reset is no more than 1s, which is required by
> +	 * the PCI spec v5.0 sec 6.6.1. The time will shorten
> +	 * if Readiness Notifications mechanisms are used. But
> +	 * wait 1s here to adapt any conditions.
> +	 */
> +	ssleep(1UL);

It's a huge time out... Can we reduce it somehow?

...

> +	for (i = 0; i < HISI_PCIE_ERR_MISC_REGS; i++) {
> +		if (edata->val_bits &
> +				BIT_ULL(HISI_PCIE_LOCAL_VALID_ERR_MISC + i))

for_each_set_bit() ?

> +			dev_info(dev,
> +				 "ERR_MISC_%d = 0x%x\n", i, edata->err_misc[i]);
> +	}

> +
> +	/* Recovery for the PCIe controller errors */
> +	if (edata->err_severity == HISI_ERR_SEV_RECOVERABLE) {

Perhaps negative conditional?

> +		/* try reset PCI port for the error recovery */
> +		rc = hisi_pcie_port_do_recovery(pdev, edata->socket_id,
> +			HISI_PCIE_PORT_ID(edata->core_id, edata->port_id));
> +		if (rc) {
> +			dev_info(dev, "fail to do hisi pcie port reset\n");

> +			return;

redundant.

> +		}
> +	}

...

> +	const struct hisi_pcie_error_data *error_data =
> +				acpi_hest_get_payload(gdata);

One line is better to read.

> +	struct platform_device *pdev = priv->pdev;

> +	hisi_pcie_handle_error(pdev, error_data);

And how exactly _platform_ device pointer is being used?

...


> +		dev_err(&pdev->dev, "%s : ghes_register_event_notifier fail\n",
> +			__func__);

Make error message more descriptive that __func__ will not be needed.

...

> +	kfree(priv);

Double free?

...

> +static const struct acpi_device_id hisi_pcie_acpi_match[] = {
> +	{ "HISI0361", 0 },

', 0' part is not necessary to have.

> +	{ }
> +};

-- 
With Best Regards,
Andy Shevchenko