linux-kernel - docker crashes rcuos in __blkg_release_rcu

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [thread-next>] [day] [month] [year] [list]
Date:	Sun, 8 Jun 2014 18:22:00 -0400
From:	Joe Lawrence <joe.lawrence@...atus.com>
To:	<linux-kernel@...r.kernel.org>
CC:	Tejun Heo <tj@...nel.org>, Vivek Goyal <vgoyal@...hat.com>
Subject: docker crashes rcuos in __blkg_release_rcu

Hi Tejun, Vivek,

I came across this crash when attempting to run the 'hello world'
example from the Getting Started section on the docker.io homepage [1].

Repro kernels:

(upstream linus) 3.15.0
(RHEL7 RC-2)     3.10.0-121.el7.x86_64

To reproduce, boot with slub_debug=FZPU and run the example.

  % # RHEL7 needs docker-io from EPEL
  % yum install http://dl.fedoraproject.org/pub/epel/beta/7/x86_64/epel-release-7-0.1.noarch.rpm
  % rpm -ivh epel-release-7-0.1.noarch.rpm
  % yum install docker-io
  
  % systemctl start docker
  % docker run ubuntu /bin/echo hello world

The host crashes every time with the following stack trace:

general protection fault: 0000 [#1] SMP 
Modules linked in: veth xt_addrtype xt_conntrack iptable_filter ipt_MASQUERADE iptable_nat nf_conntrack_ipv4 nf_defrag_ipv4 nf_nat_ipv4 nf_nat nf_conntrack ip_tables bridge stp llc dm_thin_pool dm_persistent_data dm_bio_prison dm_bufio libcrc32c loop bonding sg x86_pkg_temp_thermal coretemp crct10dif_pclmul crc32_pclmul crc32c_intel igb ixgbe ghash_clmulni_intel aesni_intel nfsd lrw gf128mul glue_helper ablk_helper dm_service_time cryptd pcspkr ptp auth_rpcgss ntb pps_core nfs_acl ses lockd mdio i2c_algo_bit enclosure ipmi_devintf dca ipmi_msghandler i2c_core dm_multipath sunrpc dm_mod ext4 mbcache jbd2 raid1 sd_mod crc_t10dif crct10dif_common sr_mod cdrom qla2xxx mpt3sas mpt2sas scsi_transport_fc usb_storage scsi_tgt raid_class scsi_transport_sas
CPU: 21 PID: 30 Comm: rcuos/21 Not tainted 3.15.0 #1
Hardware name: Stratus ftServer 6400/G7LAZ, BIOS BIOS Version 6.3:57 12/25/2013
task: ffff880854021de0 ti: ffff88085403c000 task.ti: ffff88085403c000
RIP: 0010:[<ffffffff8162e9e5>]  [<ffffffff8162e9e5>] _raw_spin_lock_irq+0x15/0x60
RSP: 0018:ffff88085403fdf0  EFLAGS: 00010086
RAX: 0000000000020000 RBX: 0000000000000010 RCX: 0000000000000000
RDX: 000060ef80008248 RSI: 0000000000000286 RDI: 6b6b6b6b6b6b6b6b
RBP: ffff88085403fdf0 R08: 0000000000000286 R09: 0000000000009f39
R10: 0000000000020001 R11: 0000000000020001 R12: ffff88103c17a130
R13: ffff88103c17a080 R14: 0000000000000000 R15: 0000000000000000
FS:  0000000000000000(0000) GS:ffff88107fca0000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00000000006e5ab8 CR3: 000000000193d000 CR4: 00000000000407e0
Stack:
 ffff88085403fe18 ffffffff812cbfc2 ffff88103c17a130 0000000000000000
 ffff88103c17a130 ffff88085403fec0 ffffffff810d1d28 ffff880854021de0
 ffff880854021de0 ffff88107fcaec58 ffff88085403fe80 ffff88107fcaec30
Call Trace:
 [<ffffffff812cbfc2>] __blkg_release_rcu+0x72/0x150
 [<ffffffff810d1d28>] rcu_nocb_kthread+0x1e8/0x300
 [<ffffffff810b6a00>] ? abort_exclusive_wait+0xb0/0xb0
 [<ffffffff810d1b40>] ? rcu_start_gp+0x40/0x40
 [<ffffffff81091d81>] kthread+0xe1/0x100
 [<ffffffff81091ca0>] ? kthread_create_on_node+0x1a0/0x1a0
 [<ffffffff8163813c>] ret_from_fork+0x7c/0xb0
 [<ffffffff81091ca0>] ? kthread_create_on_node+0x1a0/0x1a0
Code: ff 47 04 48 8b 7d 08 be 00 02 00 00 e8 55 48 a4 ff 5d c3 0f 1f 00 66 66 66 66 90 55 48 89 e5 fa 66 66 90 66 66 90 b8 00 00 02 00 <f0> 0f c1 07 89 c2 c1 ea 10 66 39 c2 75 02 5d c3 83 e2 fe 0f b7 
RIP  [<ffffffff8162e9e5>] _raw_spin_lock_irq+0x15/0x60
 RSP <ffff88085403fdf0>

crash> dis -l _raw_spin_lock_irq

kernel/locking/spinlock.c: 166
  <_raw_spin_lock_irq>:        data32 data32 data32 xchg %ax,%ax
  <_raw_spin_lock_irq+0x5>:    push   %rbp
  <_raw_spin_lock_irq+0x6>:    mov    %rsp,%rbp
arch/x86/include/asm/paravirt.h: 814
  <_raw_spin_lock_irq+0x9>:    cli    
  <_raw_spin_lock_irq+0xa>:    data32 xchg %ax,%ax
  <_raw_spin_lock_irq+0xd>:    data32 xchg %ax,%ax
arch/x86/include/asm/spinlock.h: 86
  <_raw_spin_lock_irq+0x10>:   mov    $0x20000,%eax
  <_raw_spin_lock_irq+0x15>:   lock xadd %eax,(%rdi)       <<

arch/x86/include/asm/spinlock.h:

 82 static __always_inline void arch_spin_lock(arch_spinlock_t *lock)
 83 {
 84         register struct __raw_tickets inc = { .tail = TICKET_LOCK_INC };
 85 
 86         inc = xadd(&lock->tickets, inc);               <<

.tickets is offset 0 from arch_spinlock_t, so RDI should be the
arch_spinlock_t lock:
RDI: 6b6b6b6b6b6b6b6b

Back up a frame and get bearings...

crash> dis -l __blkg_release_rcu

block/blk-cgroup.c: 402
  <__blkg_release_rcu+0x56>:   cmpq   $0x0,-0x80(%r12)
  <__blkg_release_rcu+0x5c>:   je     0xffffffff812cc001 <__blkg_release_rcu+0xb1>
block/blk-cgroup.c: 403
  <__blkg_release_rcu+0x5e>:   mov    -0xb0(%r12),%rax
include/linux/spinlock.h: 328
  <__blkg_release_rcu+0x66>:   mov    0x460(%rax),%rdi
  <__blkg_release_rcu+0x6d>:   callq  0xffffffff8162e9d0 <_raw_spin_lock_irq>

block/blk-cgroup.c:

 387 void __blkg_release_rcu(struct rcu_head *rcu_head)
 388 {
 ...
 400         /* release the blkcg and parent blkg refs this blkg has been holding */
 401         css_put(&blkg->blkcg->css);
 402         if (blkg->parent) {
 403                 spin_lock_irq(blkg->q->queue_lock);
 404                 blkg_put(blkg->parent);
 405                 spin_unlock_irq(blkg->q->queue_lock);
 406         }

RAX is the struct request_queue*, but has been re-used by
_raw_spin_lock_irq.  How about R12?

crash> struct -o blkcg_gq | grep b0                                                                       
  [0xb0] struct callback_head callback_head;

... and ...

block/blk-cgroup.c: 389
  <__blkg_release_rcu+0xb>:    lea    -0xb0(%rdi),%r13
block/blk-cgroup.c: 388
  <__blkg_release_rcu+0x12>:   push   %r12
  <__blkg_release_rcu+0x14>:   mov    %rdi,%r12

Chances are R12 is struct rcu_head *rcu_head and R13 is struct blkcg_gq*

R13: ffff88103c17a080

crash> p/x 0xffff88103c17a130-0xb0                                                                        
$2 = 0xffff88103c17a080

Yup.

crash> struct blkcg_gq 0xffff88103c17a080 | grep q
struct blkcg_gq {
  q = 0xffff88103fc7df90,

crash> rd 0xffff88103fc7df90 0xee
... all 0x6b's ...

Summary thus far:

R12: ffff88103c17a130 = struct rcu_head *rcu_head 
R13: ffff88103c17a080 = struct blkcg_gq *blkg
     ffff88103fc7df90 = struct request_queue *blkg->q (contains 0x6b
                                                       poison-pattern)

commit 2a4fd070 "blkcg: move bulk of blkcg_gq release operations to the
RCU callback" shuffled around some code in this space, introducing the
calls to spin_[un]lock_irq(blkg->q->queue_lock).

Tejun -- I still have the vmcore here if you would like further analysis
or test patches you would like me to try.

Vivek -- might slub_debug be a reliable repro for RHBZ-1019584 [2]
(closed, needinfo)?

Regards,

-- Joe

[1] https://www.docker.io/gettingstarted/
[2] https://bugzilla.redhat.com/show_bug.cgi?id=1019584
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/