lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <6a2162a7-5f48-c107-0b1c-5258089522c5@molgen.mpg.de>
Date:   Tue, 24 Apr 2018 23:04:30 +0200
From:   Paul Menzel <pmenzel+linux-block@...gen.mpg.de>
To:     Bart Van Assche <Bart.VanAssche@....com>
Cc:     axboe@...nel.dk, jejb@...ux.vnet.ibm.com,
        regressions@...mhuis.info, linux-kernel@...r.kernel.org,
        linux-block@...r.kernel.org, martin.petersen@...cle.com,
        linux-scsi@...r.kernel.org
Subject: Re: Regression 4.17-rc1: SSD doesn properly resume causing system
 hang (NULL pointer dereference)

Dear Bart,


Thank you for looking into the issue.

Am 24.04.2018 um 20:27 schrieb Bart Van Assche:
> On Tue, 2018-04-24 at 19:37 +0200, Paul Menzel wrote:
>> On 04/24/18 19:31, Bart Van Assche wrote:
>> Here it is, pasted as citation, as otherwise Thunderbird would wrap the
>> line.
>>
>>> (gdb) disas blk_set_runtime_active
>>> Dump of assembler code for function blk_set_runtime_active:
>>>     0xc1518610 <+0>:	call   0xc106ac9c <__fentry__>
>>>     0xc1518615 <+5>:	push   %ebp
>>>     0xc1518616 <+6>:	mov    %esp,%ebp
>>>     0xc1518618 <+8>:	sub    $0x14,%esp
>>>     0xc151861b <+11>:	mov    %ebx,-0xc(%ebp)
>>>     0xc151861e <+14>:	mov    %eax,%ebx
>>>     0xc1518620 <+16>:	mov    %gs:0x14,%eax
>>>     0xc1518626 <+22>:	mov    %eax,-0x10(%ebp)
>>>     0xc1518629 <+25>:	xor    %eax,%eax
>>>     0xc151862b <+27>:	test   %ebx,%ebx
>>>     0xc151862d <+29>:	mov    %esi,-0x8(%ebp)
>>>     0xc1518630 <+32>:	mov    %edi,-0x4(%ebp)
>>>     0xc1518633 <+35>:	je     0xc15186b3 <blk_set_runtime_active+163>
>>>     0xc1518635 <+37>:	mov    0xfc(%ebx),%eax
>>>     0xc151863b <+43>:	call   0xc1a4b920 <_raw_spin_lock_irq>
>>>     0xc1518640 <+48>:	mov    0x150(%ebx),%esi
>>>     0xc1518646 <+54>:	xor    %eax,%eax
>>>     0xc1518648 <+56>:	mov    0xc1ca7d20,%edi
>>>     0xc151864e <+62>:	mov    %eax,0x154(%ebx)
>>>     0xc1518654 <+68>:	cmp    $0xffffff0c,%esi
>>>     0xc151865a <+74>:	mov    %edi,-0x14(%ebp)
>>>     0xc151865d <+77>:	je     0xc15186a5 <blk_set_runtime_active+149>
>>>     0xc151865f <+79>:	mov    %edi,0xf4(%esi)
> 
> The e-mail at the start of this e-mail thread shows that %esi == NULL at
> the time of the crash and also that the crash occurred at offset 79 (0x4f)
> in this function. I think that means that the crash occurred in the following
> code: pm_request_autosuspend(q->dev) and also that this means that q->dev ==
> NULL. Can you test the (untested) patch below?
> 
> diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
> index 57cae47ab1c2..b029a94a1e66 100644
> --- a/drivers/scsi/sd.c
> +++ b/drivers/scsi/sd.c
> @@ -3272,7 +3272,6 @@ static void sd_probe_async(struct work_struct *work)
>   		gd->events |= DISK_EVENT_MEDIA_CHANGE;
>   	}
>   
> -	blk_pm_runtime_init(sdp->request_queue, dev);
>   	device_add_disk(dev, gd);
>   	if (sdkp->capacity)
>   		sd_dif_config_host(sdkp);
> @@ -3390,6 +3389,8 @@ static int sd_probe(struct device *dev)
>   	get_device(dev);
>   	dev_set_drvdata(dev, sdkp);
>   
> +	blk_pm_runtime_init(sdp->request_queue, dev);
> +
>   	get_device(&sdkp->dev);	/* prevent release before sd_probe_async() */
>   	WARN_ON_ONCE(!queue_work(system_unbound_wq, &sdkp->probe_work));

I applied your change, and rebuilt the Linux kernel. Unfortunately, it 
looks like, it didn’t make a difference.

> 18.673: [  184.143302] BUG: unable to handle kernel NULL pointer dereference at 000000f4
> 18.673: [  184.143306] *pde = 00000000 
> 18.673: [  184.143312] Oops: 0002 [#1] SMP
> 18.674: [  184.148479] calling  usb2+ @ 720, parent: 0000:00:1d.7
> 18.674: [  184.148471] Modules linked in: i915 drm_kms_helper syscopyarea sysfillrect sysimgblt fb_sys_fops drm drm_panel_orientation_quirks i2c_algo_bit serport binfmt_misc
> 18.674: [  184.158340] calling  PNP0C0A:01+ @ 690, parent: PNP0C09:00
> 18.674: [  184.148471]  mousedev iTCO_wdt iTCO_vendor_support coretemp kvm_intel kvm irqbypass psmouse snd_pcsp arc4 pcmcia sdhci_pci cqhci iwl3945 lpc_ich mfd_core iwlegacy
> 18.674: [  184.169352] call 0000:05:00.2+ returned 0 after 251381 usecs
> 18.674: [  184.169336]  i2c_i801 sdhci mmc_core mac80211 snd_hda_codec_analog snd_hda_codec_generic firewire_ohci yenta_socket pcmcia_rsrc firewire_core crc_itu_t pcmcia_core rng_core cfg80211 snd_hda_intel ehci_pci e1000e thinkpad_acpi snd_hda_codec uhci_hcd snd_hda_core snd_hwdep snd_pcm snd_timer
> 18.674: [  184.180378] calling  phy0+ @ 727, parent: 0000:02:00.0
> 18.674: [  184.180362]  ehci_hcd nvram rfkill battery snd soundcore usbcore video button shpchp ac acpi_cpufreq ip_tables x_tables autofs4 ext4 crc16 mbcache jbd2 fscrypto cbc dm_crypt dm_mod sd_mod ahci
> 18.674: [  184.192683] call PNP0C0A:01+ returned 0 after 23063 usecs
> 18.674: [  184.180362]  libahci evdev libata serio_raw scsi_mod fan thermal
> 18.674: [  184.180362] CPU: 0 PID: 708 Comm: kworker/u4:7 Not tainted 4.17.0-rc2+ #14
> 18.674: [  184.198715] calling  PNP0C0E:00+ @ 690, parent: PNP0C09:00
> 18.674: [  184.198705] Hardware name: LENOVO 636338U/636338U, BIOS CBET4000 TIMELESS 01/01/1970
> 18.674: [  184.198705] Workqueue: events_unbound async_run_entry_fn
> 18.674: [  184.198705] EIP: blk_set_runtime_active+0x4f/0xc0
> 18.674: [  184.198705] EFLAGS: 00010013 CPU: 0
> 18.674: [  184.198705] EAX: 00000000 EBX: f665f740 ECX: f665f838 EDX: 00000000
> 18.674: [  184.198705] ESI: 00000000 EDI: ffff7bbe EBP: f1afde38 ESP: f1afde24
> 18.674: [  184.218335] call PNP0C0E:00+ returned 0 after 2 usecs
> 18.674: [  184.198705]  DS: 007b ES: 007b FS: 00d8 GS: 00e0 SS: 0068
> 18.674: [  184.198705] CR0: 80050033 CR2: 000000f4 CR3: 0e40a000 CR4: 000006d0
> 18.674: [  184.198705] Call Trace:
> 18.674: [  184.198705]  ? async_sdev_thaw+0x20/0x20 [scsi_mod]
> 18.674: [  184.198705]  scsi_bus_resume_common+0x79/0x180 [scsi_mod]
> 18.674: [  184.238364] call phy0+ returned 0 after 44664 usecs
> 18.674: [  184.198705]  scsi_bus_resume+0x12/0x20 [scsi_mod]
> 18.674: [  184.198705]  dpm_run_callback+0x47/0x1b0
> 18.674: [  184.198705]  ? scsi_bus_thaw+0x20/0x20 [scsi_mod]
> 18.674: [  184.198705]  device_resume+0x97/0x190
> 18.674: [  184.269566] calling  usb3+ @ 727, parent: 0000:00:1d.1
> 18.674: [  184.198705]  ? device_resume+0x190/0x190
> 18.674: [  184.198705]  async_resume+0x1e/0x50
> 18.675: [  184.198705]  async_run_entry_fn+0x61/0x3a0
> 18.675: [  184.198705]  ? __schedule+0x2f5/0xcf0
> 18.675: [  184.198705]  ? try_to_wake_up+0x4d/0x790
> 18.675: [  184.198705]  ? __switch_to_asm+0x33/0x4c
> 18.675: [  184.291885] ata1: SATA link up 1.5 Gbps (SStatus 113 SControl 300)
> 18.675: [  184.198705]  ? pwq_dec_nr_in_flight+0x7c/0x150
> 18.675: [  184.198705]  process_one_work+0x235/0x690
> 18.675: [  184.198705]  worker_thread+0x19d/0x6a0
> 18.675: [  184.198705]  kthread+0x14a/0x1f0
> 18.675: [  184.198705]  ? process_one_work+0x690/0x690
> 18.675: [  184.198705]  ? kthread_create_worker_on_cpu+0x30/0x30
> 18.675: [  184.198705]  ret_from_fork+0x2e/0x38
> 18.675: [  184.198705] Code: 
> 18.675: [  184.303294] call usb2+ returned 0 after 141649 usecs
> 18.675: [  184.198705] 7e 8b 83 fc 00 00 00 e8 e0 32 53 00 8b b3 50 01 
> 18.675: [  184.315689] calling  usb4+ @ 118, parent: 0000:00:1d.2
> 18.675: [  184.198705] 00 00 31 c0 8b 3d 20 7d ea cd 89 83 54 01 00 00 81 fe 0c 
> 18.675: [  184.328767] call usb1+ returned 0 after 190866 usecs
> 18.675: [  184.198705] ff ff ff 89 7d ec 74 46 <89> be f4 00 00 00 ba 09 00 00 00 8b 83 50 01 00 00 e8 0b 25 24 
> 18.675: [  184.198705] EIP: blk_set_runtime_active+0x4f/0xc0 SS:ESP: 0068:f1afde24
> 18.675: [  184.198705] CR2: 00000000000000f4
> 18.675: [  184.336955] calling  2-6+ @ 6, parent: usb2
> 18.675: [  184.198705] ---[ end trace 94bb43426dd6105a ]---

> (gdb) disas blk_set_runtime_active
> Dump of assembler code for function blk_set_runtime_active:
>    0xc1518610 <+0>:	call   0xc106ac9c <__fentry__>
>    0xc1518615 <+5>:	push   %ebp
>    0xc1518616 <+6>:	mov    %esp,%ebp
>    0xc1518618 <+8>:	sub    $0x14,%esp
>    0xc151861b <+11>:	mov    %ebx,-0xc(%ebp)
>    0xc151861e <+14>:	mov    %eax,%ebx
>    0xc1518620 <+16>:	mov    %gs:0x14,%eax
>    0xc1518626 <+22>:	mov    %eax,-0x10(%ebp)
>    0xc1518629 <+25>:	xor    %eax,%eax
>    0xc151862b <+27>:	test   %ebx,%ebx
>    0xc151862d <+29>:	mov    %esi,-0x8(%ebp)
>    0xc1518630 <+32>:	mov    %edi,-0x4(%ebp)
>    0xc1518633 <+35>:	je     0xc15186b3 <blk_set_runtime_active+163>
>    0xc1518635 <+37>:	mov    0xfc(%ebx),%eax
>    0xc151863b <+43>:	call   0xc1a4b920 <_raw_spin_lock_irq>
>    0xc1518640 <+48>:	mov    0x150(%ebx),%esi
>    0xc1518646 <+54>:	xor    %eax,%eax
>    0xc1518648 <+56>:	mov    0xc1ca7d20,%edi
>    0xc151864e <+62>:	mov    %eax,0x154(%ebx)
>    0xc1518654 <+68>:	cmp    $0xffffff0c,%esi
>    0xc151865a <+74>:	mov    %edi,-0x14(%ebp)
>    0xc151865d <+77>:	je     0xc15186a5 <blk_set_runtime_active+149>
>    0xc151865f <+79>:	mov    %edi,0xf4(%esi)
>    0xc1518665 <+85>:	mov    $0x9,%edx
>    0xc151866a <+90>:	mov    0x150(%ebx),%eax
>    0xc1518670 <+96>:	call   0xc175ab80 <__pm_runtime_suspend>
>    0xc1518675 <+101>:	mov    0xfc(%ebx),%eax
>    0xc151867b <+107>:	call   *0xc1ce2918
>    0xc1518681 <+113>:	call   *0xc1ce2888
>    0xc1518687 <+119>:	mov    -0x10(%ebp),%eax
>    0xc151868a <+122>:	xor    %gs:0x14,%eax
>    0xc1518691 <+129>:	jne    0xc15186a0 <blk_set_runtime_active+144>
>    0xc1518693 <+131>:	mov    -0xc(%ebp),%ebx
>    0xc1518696 <+134>:	mov    -0x8(%ebp),%esi
>    0xc1518699 <+137>:	mov    -0x4(%ebp),%edi
>    0xc151869c <+140>:	mov    %ebp,%esp
>    0xc151869e <+142>:	pop    %ebp
>    0xc151869f <+143>:	ret    
>    0xc15186a0 <+144>:	call   0xc108c6c0 <__stack_chk_fail>
>    0xc15186a5 <+149>:	xor    %edx,%edx
>    0xc15186a7 <+151>:	mov    $0xc1ee14b4,%eax
>    0xc15186ac <+156>:	call   0xc15bb7f0 <__ubsan_handle_type_mismatch>
>    0xc15186b1 <+161>:	jmp    0xc151865f <blk_set_runtime_active+79>
>    0xc15186b3 <+163>:	xor    %edx,%edx
>    0xc15186b5 <+165>:	mov    $0xc1ee14cc,%eax
>    0xc15186ba <+170>:	call   0xc15bb7f0 <__ubsan_handle_type_mismatch>
>    0xc15186bf <+175>:	jmp    0xc1518635 <blk_set_runtime_active+37>
> End of assembler dump.


Kind regards,

Paul

View attachment "seriallog-20180424_225038.log" of type "text/x-log" (174784 bytes)

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ