[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <6a2162a7-5f48-c107-0b1c-5258089522c5@molgen.mpg.de>
Date: Tue, 24 Apr 2018 23:04:30 +0200
From: Paul Menzel <pmenzel+linux-block@...gen.mpg.de>
To: Bart Van Assche <Bart.VanAssche@....com>
Cc: axboe@...nel.dk, jejb@...ux.vnet.ibm.com,
regressions@...mhuis.info, linux-kernel@...r.kernel.org,
linux-block@...r.kernel.org, martin.petersen@...cle.com,
linux-scsi@...r.kernel.org
Subject: Re: Regression 4.17-rc1: SSD doesn properly resume causing system
hang (NULL pointer dereference)
Dear Bart,
Thank you for looking into the issue.
Am 24.04.2018 um 20:27 schrieb Bart Van Assche:
> On Tue, 2018-04-24 at 19:37 +0200, Paul Menzel wrote:
>> On 04/24/18 19:31, Bart Van Assche wrote:
>> Here it is, pasted as citation, as otherwise Thunderbird would wrap the
>> line.
>>
>>> (gdb) disas blk_set_runtime_active
>>> Dump of assembler code for function blk_set_runtime_active:
>>> 0xc1518610 <+0>: call 0xc106ac9c <__fentry__>
>>> 0xc1518615 <+5>: push %ebp
>>> 0xc1518616 <+6>: mov %esp,%ebp
>>> 0xc1518618 <+8>: sub $0x14,%esp
>>> 0xc151861b <+11>: mov %ebx,-0xc(%ebp)
>>> 0xc151861e <+14>: mov %eax,%ebx
>>> 0xc1518620 <+16>: mov %gs:0x14,%eax
>>> 0xc1518626 <+22>: mov %eax,-0x10(%ebp)
>>> 0xc1518629 <+25>: xor %eax,%eax
>>> 0xc151862b <+27>: test %ebx,%ebx
>>> 0xc151862d <+29>: mov %esi,-0x8(%ebp)
>>> 0xc1518630 <+32>: mov %edi,-0x4(%ebp)
>>> 0xc1518633 <+35>: je 0xc15186b3 <blk_set_runtime_active+163>
>>> 0xc1518635 <+37>: mov 0xfc(%ebx),%eax
>>> 0xc151863b <+43>: call 0xc1a4b920 <_raw_spin_lock_irq>
>>> 0xc1518640 <+48>: mov 0x150(%ebx),%esi
>>> 0xc1518646 <+54>: xor %eax,%eax
>>> 0xc1518648 <+56>: mov 0xc1ca7d20,%edi
>>> 0xc151864e <+62>: mov %eax,0x154(%ebx)
>>> 0xc1518654 <+68>: cmp $0xffffff0c,%esi
>>> 0xc151865a <+74>: mov %edi,-0x14(%ebp)
>>> 0xc151865d <+77>: je 0xc15186a5 <blk_set_runtime_active+149>
>>> 0xc151865f <+79>: mov %edi,0xf4(%esi)
>
> The e-mail at the start of this e-mail thread shows that %esi == NULL at
> the time of the crash and also that the crash occurred at offset 79 (0x4f)
> in this function. I think that means that the crash occurred in the following
> code: pm_request_autosuspend(q->dev) and also that this means that q->dev ==
> NULL. Can you test the (untested) patch below?
>
> diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
> index 57cae47ab1c2..b029a94a1e66 100644
> --- a/drivers/scsi/sd.c
> +++ b/drivers/scsi/sd.c
> @@ -3272,7 +3272,6 @@ static void sd_probe_async(struct work_struct *work)
> gd->events |= DISK_EVENT_MEDIA_CHANGE;
> }
>
> - blk_pm_runtime_init(sdp->request_queue, dev);
> device_add_disk(dev, gd);
> if (sdkp->capacity)
> sd_dif_config_host(sdkp);
> @@ -3390,6 +3389,8 @@ static int sd_probe(struct device *dev)
> get_device(dev);
> dev_set_drvdata(dev, sdkp);
>
> + blk_pm_runtime_init(sdp->request_queue, dev);
> +
> get_device(&sdkp->dev); /* prevent release before sd_probe_async() */
> WARN_ON_ONCE(!queue_work(system_unbound_wq, &sdkp->probe_work));
I applied your change, and rebuilt the Linux kernel. Unfortunately, it
looks like, it didn’t make a difference.
> 18.673: [ 184.143302] BUG: unable to handle kernel NULL pointer dereference at 000000f4
> 18.673: [ 184.143306] *pde = 00000000
> 18.673: [ 184.143312] Oops: 0002 [#1] SMP
> 18.674: [ 184.148479] calling usb2+ @ 720, parent: 0000:00:1d.7
> 18.674: [ 184.148471] Modules linked in: i915 drm_kms_helper syscopyarea sysfillrect sysimgblt fb_sys_fops drm drm_panel_orientation_quirks i2c_algo_bit serport binfmt_misc
> 18.674: [ 184.158340] calling PNP0C0A:01+ @ 690, parent: PNP0C09:00
> 18.674: [ 184.148471] mousedev iTCO_wdt iTCO_vendor_support coretemp kvm_intel kvm irqbypass psmouse snd_pcsp arc4 pcmcia sdhci_pci cqhci iwl3945 lpc_ich mfd_core iwlegacy
> 18.674: [ 184.169352] call 0000:05:00.2+ returned 0 after 251381 usecs
> 18.674: [ 184.169336] i2c_i801 sdhci mmc_core mac80211 snd_hda_codec_analog snd_hda_codec_generic firewire_ohci yenta_socket pcmcia_rsrc firewire_core crc_itu_t pcmcia_core rng_core cfg80211 snd_hda_intel ehci_pci e1000e thinkpad_acpi snd_hda_codec uhci_hcd snd_hda_core snd_hwdep snd_pcm snd_timer
> 18.674: [ 184.180378] calling phy0+ @ 727, parent: 0000:02:00.0
> 18.674: [ 184.180362] ehci_hcd nvram rfkill battery snd soundcore usbcore video button shpchp ac acpi_cpufreq ip_tables x_tables autofs4 ext4 crc16 mbcache jbd2 fscrypto cbc dm_crypt dm_mod sd_mod ahci
> 18.674: [ 184.192683] call PNP0C0A:01+ returned 0 after 23063 usecs
> 18.674: [ 184.180362] libahci evdev libata serio_raw scsi_mod fan thermal
> 18.674: [ 184.180362] CPU: 0 PID: 708 Comm: kworker/u4:7 Not tainted 4.17.0-rc2+ #14
> 18.674: [ 184.198715] calling PNP0C0E:00+ @ 690, parent: PNP0C09:00
> 18.674: [ 184.198705] Hardware name: LENOVO 636338U/636338U, BIOS CBET4000 TIMELESS 01/01/1970
> 18.674: [ 184.198705] Workqueue: events_unbound async_run_entry_fn
> 18.674: [ 184.198705] EIP: blk_set_runtime_active+0x4f/0xc0
> 18.674: [ 184.198705] EFLAGS: 00010013 CPU: 0
> 18.674: [ 184.198705] EAX: 00000000 EBX: f665f740 ECX: f665f838 EDX: 00000000
> 18.674: [ 184.198705] ESI: 00000000 EDI: ffff7bbe EBP: f1afde38 ESP: f1afde24
> 18.674: [ 184.218335] call PNP0C0E:00+ returned 0 after 2 usecs
> 18.674: [ 184.198705] DS: 007b ES: 007b FS: 00d8 GS: 00e0 SS: 0068
> 18.674: [ 184.198705] CR0: 80050033 CR2: 000000f4 CR3: 0e40a000 CR4: 000006d0
> 18.674: [ 184.198705] Call Trace:
> 18.674: [ 184.198705] ? async_sdev_thaw+0x20/0x20 [scsi_mod]
> 18.674: [ 184.198705] scsi_bus_resume_common+0x79/0x180 [scsi_mod]
> 18.674: [ 184.238364] call phy0+ returned 0 after 44664 usecs
> 18.674: [ 184.198705] scsi_bus_resume+0x12/0x20 [scsi_mod]
> 18.674: [ 184.198705] dpm_run_callback+0x47/0x1b0
> 18.674: [ 184.198705] ? scsi_bus_thaw+0x20/0x20 [scsi_mod]
> 18.674: [ 184.198705] device_resume+0x97/0x190
> 18.674: [ 184.269566] calling usb3+ @ 727, parent: 0000:00:1d.1
> 18.674: [ 184.198705] ? device_resume+0x190/0x190
> 18.674: [ 184.198705] async_resume+0x1e/0x50
> 18.675: [ 184.198705] async_run_entry_fn+0x61/0x3a0
> 18.675: [ 184.198705] ? __schedule+0x2f5/0xcf0
> 18.675: [ 184.198705] ? try_to_wake_up+0x4d/0x790
> 18.675: [ 184.198705] ? __switch_to_asm+0x33/0x4c
> 18.675: [ 184.291885] ata1: SATA link up 1.5 Gbps (SStatus 113 SControl 300)
> 18.675: [ 184.198705] ? pwq_dec_nr_in_flight+0x7c/0x150
> 18.675: [ 184.198705] process_one_work+0x235/0x690
> 18.675: [ 184.198705] worker_thread+0x19d/0x6a0
> 18.675: [ 184.198705] kthread+0x14a/0x1f0
> 18.675: [ 184.198705] ? process_one_work+0x690/0x690
> 18.675: [ 184.198705] ? kthread_create_worker_on_cpu+0x30/0x30
> 18.675: [ 184.198705] ret_from_fork+0x2e/0x38
> 18.675: [ 184.198705] Code:
> 18.675: [ 184.303294] call usb2+ returned 0 after 141649 usecs
> 18.675: [ 184.198705] 7e 8b 83 fc 00 00 00 e8 e0 32 53 00 8b b3 50 01
> 18.675: [ 184.315689] calling usb4+ @ 118, parent: 0000:00:1d.2
> 18.675: [ 184.198705] 00 00 31 c0 8b 3d 20 7d ea cd 89 83 54 01 00 00 81 fe 0c
> 18.675: [ 184.328767] call usb1+ returned 0 after 190866 usecs
> 18.675: [ 184.198705] ff ff ff 89 7d ec 74 46 <89> be f4 00 00 00 ba 09 00 00 00 8b 83 50 01 00 00 e8 0b 25 24
> 18.675: [ 184.198705] EIP: blk_set_runtime_active+0x4f/0xc0 SS:ESP: 0068:f1afde24
> 18.675: [ 184.198705] CR2: 00000000000000f4
> 18.675: [ 184.336955] calling 2-6+ @ 6, parent: usb2
> 18.675: [ 184.198705] ---[ end trace 94bb43426dd6105a ]---
> (gdb) disas blk_set_runtime_active
> Dump of assembler code for function blk_set_runtime_active:
> 0xc1518610 <+0>: call 0xc106ac9c <__fentry__>
> 0xc1518615 <+5>: push %ebp
> 0xc1518616 <+6>: mov %esp,%ebp
> 0xc1518618 <+8>: sub $0x14,%esp
> 0xc151861b <+11>: mov %ebx,-0xc(%ebp)
> 0xc151861e <+14>: mov %eax,%ebx
> 0xc1518620 <+16>: mov %gs:0x14,%eax
> 0xc1518626 <+22>: mov %eax,-0x10(%ebp)
> 0xc1518629 <+25>: xor %eax,%eax
> 0xc151862b <+27>: test %ebx,%ebx
> 0xc151862d <+29>: mov %esi,-0x8(%ebp)
> 0xc1518630 <+32>: mov %edi,-0x4(%ebp)
> 0xc1518633 <+35>: je 0xc15186b3 <blk_set_runtime_active+163>
> 0xc1518635 <+37>: mov 0xfc(%ebx),%eax
> 0xc151863b <+43>: call 0xc1a4b920 <_raw_spin_lock_irq>
> 0xc1518640 <+48>: mov 0x150(%ebx),%esi
> 0xc1518646 <+54>: xor %eax,%eax
> 0xc1518648 <+56>: mov 0xc1ca7d20,%edi
> 0xc151864e <+62>: mov %eax,0x154(%ebx)
> 0xc1518654 <+68>: cmp $0xffffff0c,%esi
> 0xc151865a <+74>: mov %edi,-0x14(%ebp)
> 0xc151865d <+77>: je 0xc15186a5 <blk_set_runtime_active+149>
> 0xc151865f <+79>: mov %edi,0xf4(%esi)
> 0xc1518665 <+85>: mov $0x9,%edx
> 0xc151866a <+90>: mov 0x150(%ebx),%eax
> 0xc1518670 <+96>: call 0xc175ab80 <__pm_runtime_suspend>
> 0xc1518675 <+101>: mov 0xfc(%ebx),%eax
> 0xc151867b <+107>: call *0xc1ce2918
> 0xc1518681 <+113>: call *0xc1ce2888
> 0xc1518687 <+119>: mov -0x10(%ebp),%eax
> 0xc151868a <+122>: xor %gs:0x14,%eax
> 0xc1518691 <+129>: jne 0xc15186a0 <blk_set_runtime_active+144>
> 0xc1518693 <+131>: mov -0xc(%ebp),%ebx
> 0xc1518696 <+134>: mov -0x8(%ebp),%esi
> 0xc1518699 <+137>: mov -0x4(%ebp),%edi
> 0xc151869c <+140>: mov %ebp,%esp
> 0xc151869e <+142>: pop %ebp
> 0xc151869f <+143>: ret
> 0xc15186a0 <+144>: call 0xc108c6c0 <__stack_chk_fail>
> 0xc15186a5 <+149>: xor %edx,%edx
> 0xc15186a7 <+151>: mov $0xc1ee14b4,%eax
> 0xc15186ac <+156>: call 0xc15bb7f0 <__ubsan_handle_type_mismatch>
> 0xc15186b1 <+161>: jmp 0xc151865f <blk_set_runtime_active+79>
> 0xc15186b3 <+163>: xor %edx,%edx
> 0xc15186b5 <+165>: mov $0xc1ee14cc,%eax
> 0xc15186ba <+170>: call 0xc15bb7f0 <__ubsan_handle_type_mismatch>
> 0xc15186bf <+175>: jmp 0xc1518635 <blk_set_runtime_active+37>
> End of assembler dump.
Kind regards,
Paul
View attachment "seriallog-20180424_225038.log" of type "text/x-log" (174784 bytes)
Powered by blists - more mailing lists