lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <43b8ab4a-f8ee-dc96-40ec-e6fdfedd8309@molgen.mpg.de>
Date:   Thu, 3 May 2018 10:49:24 +0200
From:   Paul Menzel <pmenzel+linux-pci@...gen.mpg.de>
To:     Bjorn Helgaas <helgaas@...nel.org>
Cc:     Bjorn Helgaas <bhelgaas@...gle.com>, linux-pci@...r.kernel.org,
        linux-kernel@...r.kernel.org, Lukas Wunner <lukas@...ner.de>,
        Sinan Kaya <okaya@...eaurora.org>
Subject: Re: pciehp 0000:00:1c.0:pcie004: Timeout on hotplug command 0x1038
 (issued 65284 msec ago)

Dear Bjorn,


On 04/27/18 21:22, Bjorn Helgaas wrote:
> [+cc Lukas, Sinan]

> On Thu, Apr 26, 2018 at 12:17:53PM +0200, Paul Menzel wrote:

>> On the Lenovo X60t, during resume from ACPI suspend and during shutdown, the
>> message below is shown in the logs.
>>
>>      pciehp 0000:00:1c.0:pcie004: Timeout on hotplug command 0x1038 (issued
>> 65284 msec ago)
> 
> This is an Intel root port:
> 
>    00:1c.0 PCI bridge: Intel Corporation NM10/ICH7 Family PCI Express Port 1 (rev 02) (prog-if 00 [Normal decode])
> 
> and probably has the CF118 erratum (see
> http://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=3461a068661c
> for details).  I bet if you changed "msecs" in pcie_wait_cmd() to 30000
> you'd see a 30 second delay during shutdown because we write a command to
> tell the port not to generate any more hotplug interrupts, and we wait for
> that command to complete, but the port never tells us it has completed.
> 
> Lukas reported a similar issue in
> https://lkml.kernel.org/r/20180112104929.GA10599@wunner.de, which we sort
> of worked around by assuming that Thunderbolt controllers never support
> that "command complete" interrupt (see
> http://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=493fb50e958c)
> 
> Sinan mooted the idea of using a "no-wait" path of sending the "don't
> generate hotplug interrupts" command.  I think we should work on this
> idea a little more.  If we're shutting down the whole system, I can't
> believe there's much value in *anything* we do in the pciehp_remove()
> path.
> 
> Maybe we should just get rid of pciehp_remove() (and probably
> pcie_port_remove_service() and the other service driver remove methods)
> completely.  That dates from when the service drivers could be modules that
> could be potentially unloaded, but unloading them hasn't been possible for
> years.
> 
> As far as the resume path, my guess is that in pciehp_resume(), we
> write a command to enable interrupts, then it looks like we get a
> PCI_EXP_SLTSTA_DLLSC "Link Up" interrupt, and apparently we issue
> another command.  Not sure exactly what's going on here.
> 
> Could you try the following patch?  The idea is to (1) do nothing on
> shutdown, so you should see no message and no delay, and (2) collect
> more information about the resume path.
> 
> 
> diff --git a/drivers/pci/hotplug/pciehp_core.c b/drivers/pci/hotplug/pciehp_core.c
> index 332b723ff9e6..99751cc52968 100644
> --- a/drivers/pci/hotplug/pciehp_core.c
> +++ b/drivers/pci/hotplug/pciehp_core.c
> @@ -260,14 +260,6 @@ static int pciehp_probe(struct pcie_device *dev)
>   	return -ENODEV;
>   }
>   
> -static void pciehp_remove(struct pcie_device *dev)
> -{
> -	struct controller *ctrl = get_service_data(dev);
> -
> -	cleanup_slot(ctrl);
> -	pciehp_release_ctrl(ctrl);
> -}
> -
>   #ifdef CONFIG_PM
>   static int pciehp_suspend(struct pcie_device *dev)
>   {
> @@ -305,7 +297,6 @@ static struct pcie_port_service_driver hpdriver_portdrv = {
>   	.service	= PCIE_PORT_SERVICE_HP,
>   
>   	.probe		= pciehp_probe,
> -	.remove		= pciehp_remove,
>   
>   #ifdef	CONFIG_PM
>   	.suspend	= pciehp_suspend,
> diff --git a/drivers/pci/hotplug/pciehp_hpc.c b/drivers/pci/hotplug/pciehp_hpc.c
> index 18a42f8f5dc5..c3a9c47ed061 100644
> --- a/drivers/pci/hotplug/pciehp_hpc.c
> +++ b/drivers/pci/hotplug/pciehp_hpc.c
> @@ -113,7 +113,7 @@ static int pcie_poll_cmd(struct controller *ctrl, int timeout)
>   	return 0;	/* timeout */
>   }
>   
> -static void pcie_wait_cmd(struct controller *ctrl)
> +static void pcie_wait_cmd(struct controller *ctrl, u16 cmd, u16 mask)
>   {
>   	unsigned int msecs = pciehp_poll_mode ? 2500 : 1000;
>   	unsigned long duration = msecs_to_jiffies(msecs);
> @@ -155,10 +155,13 @@ static void pcie_wait_cmd(struct controller *ctrl)
>   	 * don't change those bits, e.g., commands that merely enable
>   	 * interrupts.
>   	 */
> -	if (!rc)
> -		ctrl_info(ctrl, "Timeout on hotplug command %#06x (issued %u msec ago)\n",
> +	if (!rc) {
> +		ctrl_info(ctrl, "Timeout on hotplug command %#06x (issued %u msec ago), new command %#06x/mask %#06x\n",
>   			  ctrl->slot_ctrl,
> -			  jiffies_to_msecs(jiffies - ctrl->cmd_started));
> +			  jiffies_to_msecs(jiffies - ctrl->cmd_started),
> +			  cmd, mask);
> +		dump_stack();
> +	}
>   }
>   
>   static void pcie_do_write_cmd(struct controller *ctrl, u16 cmd,
> @@ -172,7 +175,7 @@ static void pcie_do_write_cmd(struct controller *ctrl, u16 cmd,
>   	/*
>   	 * Always wait for any previous command that might still be in progress
>   	 */
> -	pcie_wait_cmd(ctrl);
> +	pcie_wait_cmd(ctrl, cmd, mask);
>   
>   	pcie_capability_read_word(pdev, PCI_EXP_SLTCTL, &slot_ctrl);
>   	if (slot_ctrl == (u16) ~0) {
> @@ -193,7 +196,7 @@ static void pcie_do_write_cmd(struct controller *ctrl, u16 cmd,
>   	 * indicating completion of the above issued command.
>   	 */
>   	if (wait)
> -		pcie_wait_cmd(ctrl);
> +		pcie_wait_cmd(ctrl, cmd, mask);
>   
>   out:
>   	mutex_unlock(&ctrl->ctrl_lock);
> 

Thank you for the quick reply and sorry for only being able to test it 
now. Please find the relevant bits from the ACPI S3 suspend “action” 
below. The full log is attached.

```
[  190.600060] pciehp 0000:00:1c.0:pcie004: Timeout on hotplug command 
0x1038 (issued 190084 msec ago), new command 0x1038/mask 0x103b
[  190.600065] CPU: 0 PID: 1290 Comm: kworker/u4:37 Not tainted 
4.17.0-rc3+ #20
[  190.600067] Hardware name: LENOVO 636338U/636338U, BIOS CBET4000 
TIMELESS 01/01/1970
[  190.600077] Workqueue: events_unbound async_run_entry_fn
[  190.600079] Call Trace:
[  190.600087]  dump_stack+0x66/0xa6
[  190.600092]  pcie_wait_cmd+0x153/0x2b0
[  190.600097]  ? prepare_to_wait+0x190/0x190
[  190.600100]  pcie_do_write_cmd+0x54/0x130
[  190.600104]  ? radix_tree_lookup+0x14/0x20
[  190.600108]  ? suspend_iter+0x80/0x80
[  190.600111]  pcie_enable_notification+0x64/0x150
[  190.600115]  ? irq_set_irq_wake+0x6b/0x170
[  190.600117]  ? suspend_iter+0x80/0x80
[  190.600121]  pciehp_resume+0x28/0xa0
[  190.600124]  ? klist_next+0x2d/0x170
[  190.600127]  resume_iter+0x4b/0x80
[  190.600131]  device_for_each_child+0x61/0xb0
[  190.600134]  pcie_port_device_resume+0x14/0x20
[  190.600139]  pci_pm_resume+0x75/0x100
[  190.600143]  dpm_run_callback+0x47/0x1b0
[  190.600146]  ? pci_pm_thaw+0xd0/0xd0
[  190.600149]  device_resume+0x97/0x190
[  190.600152]  ? device_resume+0x190/0x190
[  190.600155]  async_resume+0x1e/0x50
[  190.600158]  async_run_entry_fn+0x61/0x3a0
[  190.600162]  ? try_to_wake_up+0x4d/0x790
[  190.600166]  ? __switch_to_asm+0x33/0x4c
[  190.600171]  process_one_work+0x235/0x690
[  190.600175]  worker_thread+0x19d/0x6a0
[  190.600179]  kthread+0x14a/0x1f0
[  190.600182]  ? process_one_work+0x690/0x690
[  190.600185]  ? kthread_create_worker_on_cpu+0x30/0x30
[  190.600187]  ret_from_fork+0x2e/0x38
[  190.664162] call 0000:05:00.1+ returned 0 after 67611 usecs
[  190.700252] call usb1+ returned 0 after 102522 usecs
[  190.700259] call usb2+ returned 0 after 102481 usecs
[  190.704238] call usb3+ returned 0 after 106349 usecs
[  190.704433] call ACPI0003:00+ returned 0 after 109837 usecs
```


Kind regards,

Paul

View attachment "linux_4.17-rc3+–lenovo_x60t–dmesg.txt" of type "text/plain" (165823 bytes)

Download attachment "smime.p7s" of type "application/pkcs7-signature" (5174 bytes)

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ