lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <a4423d670903251007r72058ca8m28275c1433ab2706@mail.gmail.com>
Date:	Wed, 25 Mar 2009 20:07:46 +0300
From:	Alexander Beregalov <a.beregalov@...il.com>
To:	Jan Kara <jack@...e.cz>
Cc:	Theodore Tso <tytso@....edu>,
	"linux-next@...r.kernel.org" <linux-next@...r.kernel.org>,
	linux-ext4@...r.kernel.org, LKML <linux-kernel@...r.kernel.org>
Subject: Re: next-20090310: ext4 hangs

2009/3/25 Jan Kara <jack@...e.cz>:
> On Wed 25-03-09 18:29:10, Alexander Beregalov wrote:
>> 2009/3/25 Jan Kara <jack@...e.cz>:
>> > On Wed 25-03-09 18:18:43, Alexander Beregalov wrote:
>> >> 2009/3/25 Jan Kara <jack@...e.cz>:
>> >> >> > So, I think I need to try it on 2.6.29-rc7 again.
>> >> >>   I've looked into this. Obviously, what's happening is that we delete
>> >> >> an inode and jbd2_journal_release_jbd_inode() finds the inode is just under
>> >> >> writeout in transaction commit and thus it waits. But it never gets woken
>> >> >> up and because it has a handle from the transaction, everyone eventually
>> >> >> blocks on waiting for a transaction to finish.
>> >> >>   But I don't really see how that can happen. The code is really
>> >> >> straightforward and everything happens under j_list_lock... Strange.
>> >> >  BTW: Is the system SMP?
>> >> No, it is a UP system.
>> >  Even stranger. And do you have CONFIG_PREEMPT set?
>> >
>> >> The bug exists even in 2.6.29, I posted it with a new topic.
>> >  OK, I've sort-of expected this.
>>
>> CONFIG_PREEMPT_RCU=y
>> CONFIG_PREEMPT_RCU_TRACE=y
>> # CONFIG_PREEMPT_NONE is not set
>> # CONFIG_PREEMPT_VOLUNTARY is not set
>> CONFIG_PREEMPT=y
>> CONFIG_DEBUG_PREEMPT=y
>> # CONFIG_PREEMPT_TRACER is not set
>>
>> config is attached.
>  Thanks for the data. I still don't see how the wakeup can get lost. The
> process cannot even be preempted when we are in the section protected by
> j_list_lock... Can you send me a disassembly of functions
> jbd2_journal_release_jbd_inode() and journal_submit_data_buffers() so that
> I can see whether the compiler has not reordered something unexpectedly?

void jbd2_journal_release_jbd_inode(journal_t *journal,
                                    struct jbd2_inode *jinode)
{
     6d8:       9d e3 bf 00     save  %sp, -256, %sp
     6dc:       11 00 00 00     sethi  %hi(0), %o0
     6e0:       40 00 00 00     call  6e0 <jbd2_journal_release_jbd_inode+0x8>
     6e4:       90 12 20 00     mov  %o0, %o0   ! 0 <jbd2_history_skip_empty>
        int writeout = 0;

        if (!journal)
     6e8:       02 c6 00 30     brz,pn   %i0, 7a8
<jbd2_journal_release_jbd_inode+0xd0>
     6ec:       03 00 00 00     sethi  %hi(0), %g1
                return;
restart:
        spin_lock(&journal->j_list_lock);
     6f0:       b0 06 25 70     add  %i0, 0x570, %i0
        /* Is commit writing out inode - we have to wait */
        if (jinode->i_flags & JI_COMMIT_RUNNING) {
                wait_queue_head_t *wq;
                DEFINE_WAIT_BIT(wait, &jinode->i_flags, __JI_COMMIT_RUNNING);
     6f4:       aa 10 60 00     mov  %g1, %l5
     6f8:       a2 06 60 28     add  %i1, 0x28, %l1
     6fc:       a8 07 a7 b7     add  %fp, 0x7b7, %l4
     700:       a6 07 a7 df     add  %fp, 0x7df, %l3
                wq = bit_waitqueue(&jinode->i_flags, __JI_COMMIT_RUNNING);
                prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
     704:       a4 07 a7 c7     add  %fp, 0x7c7, %l2
        int writeout = 0;

        if (!journal)
                return;
restart:
        spin_lock(&journal->j_list_lock);
     708:       40 00 00 00     call  708 <jbd2_journal_release_jbd_inode+0x30>
     70c:       90 10 00 18     mov  %i0, %o0
        /* Is commit writing out inode - we have to wait */
        if (jinode->i_flags & JI_COMMIT_RUNNING) {
     710:       c2 06 60 28     ld  [ %i1 + 0x28 ], %g1
                wait_queue_head_t *wq;
                DEFINE_WAIT_BIT(wait, &jinode->i_flags, __JI_COMMIT_RUNNING);
     714:       94 10 20 38     mov  0x38, %o2
     718:       90 10 00 14     mov  %l4, %o0
     71c:       92 10 20 00     clr  %o1
        if (!journal)
                return;
restart:
        spin_lock(&journal->j_list_lock);
        /* Is commit writing out inode - we have to wait */
        if (jinode->i_flags & JI_COMMIT_RUNNING) {
     720:       80 88 60 01     btst  1, %g1
     724:       02 60 00 19     be,pn   %xcc, 788
<jbd2_journal_release_jbd_inode+0xb0>
     728:       a0 10 00 04     mov  %g4, %l0
                wait_queue_head_t *wq;
                DEFINE_WAIT_BIT(wait, &jinode->i_flags, __JI_COMMIT_RUNNING);
     72c:       40 00 00 00     call  72c <jbd2_journal_release_jbd_inode+0x54>
     730:       01 00 00 00     nop
                wq = bit_waitqueue(&jinode->i_flags, __JI_COMMIT_RUNNING);
     734:       90 10 00 11     mov  %l1, %o0
     738:       92 10 20 00     clr  %o1
restart:
        spin_lock(&journal->j_list_lock);
        /* Is commit writing out inode - we have to wait */
        if (jinode->i_flags & JI_COMMIT_RUNNING) {
                wait_queue_head_t *wq;
                DEFINE_WAIT_BIT(wait, &jinode->i_flags, __JI_COMMIT_RUNNING);
     73c:       e0 77 a7 cf     stx  %l0, [ %fp + 0x7cf ]
     740:       e2 77 a7 b7     stx  %l1, [ %fp + 0x7b7 ]
     744:       ea 77 a7 d7     stx  %l5, [ %fp + 0x7d7 ]
     748:       e6 77 a7 df     stx  %l3, [ %fp + 0x7df ]
                wq = bit_waitqueue(&jinode->i_flags, __JI_COMMIT_RUNNING);
     74c:       40 00 00 00     call  74c <jbd2_journal_release_jbd_inode+0x74>
     750:       e6 77 a7 e7     stx  %l3, [ %fp + 0x7e7 ]
                prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
     754:       92 10 00 12     mov  %l2, %o1
     758:       94 10 20 02     mov  2, %o2
     75c:       40 00 00 00     call  75c <jbd2_journal_release_jbd_inode+0x84>
     760:       a0 10 00 08     mov  %o0, %l0
                spin_unlock(&journal->j_list_lock);
     764:       40 00 00 00     call  764 <jbd2_journal_release_jbd_inode+0x8c>
     768:       90 10 00 18     mov  %i0, %o0
                schedule();
     76c:       40 00 00 00     call  76c <jbd2_journal_release_jbd_inode+0x94>
     770:       01 00 00 00     nop
                finish_wait(wq, &wait.wait);
     774:       90 10 00 10     mov  %l0, %o0
     778:       40 00 00 00     call  778 <jbd2_journal_release_jbd_inode+0xa0>
     77c:       92 10 00 12     mov  %l2, %o1
     780:       10 6f ff e2     b  %xcc, 708
<jbd2_journal_release_jbd_inode+0x30>
     784:       01 00 00 00     nop
        }

        /* Do we need to wait for data writeback? */
        if (journal->j_committing_transaction == jinode->i_transaction)
                writeout = 1;
        if (jinode->i_transaction) {
     788:       c2 5e 40 00     ldx  [ %i1 ], %g1
     78c:       02 c0 40 05     brz,pn   %g1, 7a0
<jbd2_journal_release_jbd_inode+0xc8>
     790:       01 00 00 00     nop
                list_del(&jinode->i_list);
     794:       40 00 00 00     call  794 <jbd2_journal_release_jbd_inode+0xbc>
     798:       90 06 60 10     add  %i1, 0x10, %o0
                jinode->i_transaction = NULL;
     79c:       c0 76 40 00     clrx  [ %i1 ]
        }
        spin_unlock(&journal->j_list_lock);
     7a0:       40 00 00 00     call  7a0 <jbd2_journal_release_jbd_inode+0xc8>
     7a4:       90 10 00 18     mov  %i0, %o0
     7a8:       81 cf e0 08     rett  %i7 + 8
     7ac:       01 00 00 00     nop

====
By default gcc inlines journal_submit_data_buffers().
Here is the -fno-inline version. The default version is in the attachment.
====

static int journal_submit_data_buffers(journal_t *journal,
                transaction_t *commit_transaction)
{
      9c:       9d e3 bf 40     save  %sp, -192, %sp
      a0:       11 00 00 00     sethi  %hi(0), %o0
        struct jbd2_inode *jinode;
        int err, ret = 0;
        struct address_space *mapping;

        spin_lock(&journal->j_list_lock);
      a4:       a4 06 25 70     add  %i0, 0x570, %l2
 * our inode list. We use JI_COMMIT_RUNNING flag to protect inode we currently
 * operate on from being released while we write out pages.
 */
static int journal_submit_data_buffers(journal_t *journal,
                transaction_t *commit_transaction)
{
      a8:       90 12 20 00     mov  %o0, %o0
      ac:       40 00 00 00     call  ac <journal_submit_data_buffers+0x10>
      b0:       b0 10 20 00     clr  %i0
        struct jbd2_inode *jinode;
        int err, ret = 0;
        struct address_space *mapping;

        spin_lock(&journal->j_list_lock);
        list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
      b4:       a6 06 60 60     add  %i1, 0x60, %l3
{
        struct jbd2_inode *jinode;
        int err, ret = 0;
        struct address_space *mapping;

        spin_lock(&journal->j_list_lock);
      b8:       40 00 00 00     call  b8 <journal_submit_data_buffers+0x1c>
      bc:       90 10 00 12     mov  %l2, %o0
        list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
      c0:       10 68 00 1d     b  %xcc, 134 <journal_submit_data_buffers+0x98>
      c4:       c2 5e 60 60     ldx  [ %i1 + 0x60 ], %g1
                mapping = jinode->i_vfs_inode->i_mapping;
                jinode->i_flags |= JI_COMMIT_RUNNING;
                spin_unlock(&journal->j_list_lock);
      c8:       90 10 00 12     mov  %l2, %o0
        struct address_space *mapping;

        spin_lock(&journal->j_list_lock);
        list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
                mapping = jinode->i_vfs_inode->i_mapping;
                jinode->i_flags |= JI_COMMIT_RUNNING;
      cc:       c2 04 60 28     ld  [ %l1 + 0x28 ], %g1
        int err, ret = 0;
        struct address_space *mapping;

        spin_lock(&journal->j_list_lock);
        list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
                mapping = jinode->i_vfs_inode->i_mapping;
      d0:       e0 58 a1 e0     ldx  [ %g2 + 0x1e0 ], %l0
                jinode->i_flags |= JI_COMMIT_RUNNING;
      d4:       82 10 60 01     or  %g1, 1, %g1
                spin_unlock(&journal->j_list_lock);
      d8:       40 00 00 00     call  d8 <journal_submit_data_buffers+0x3c>
      dc:       c2 24 60 28     st  %g1, [ %l1 + 0x28 ]
                 * submit the inode data buffers. We use writepage
                 * instead of writepages. Because writepages can do
                 * block allocation  with delalloc. We need to write
                 * only allocated blocks here.
                 */
                err = journal_submit_inode_data_buffers(mapping);
      e0:       7f ff ff d3     call  2c <journal_submit_inode_data_buffers>
      e4:       90 10 00 10     mov  %l0, %o0
                if (!ret)
      e8:       80 a6 20 00     cmp  %i0, 0
      ec:       b1 64 40 08     move  %icc, %o0, %i0
                        ret = err;
                spin_lock(&journal->j_list_lock);
      f0:       40 00 00 00     call  f0 <journal_submit_data_buffers+0x54>
      f4:       90 10 00 12     mov  %l2, %o0
                J_ASSERT(jinode->i_transaction == commit_transaction);
      f8:       c2 5c 40 00     ldx  [ %l1 ], %g1
      fc:       80 a0 40 19     cmp  %g1, %i1
     100:       22 68 00 07     be,a   %xcc, 11c
<journal_submit_data_buffers+0x80>
     104:       c2 04 60 28     ld  [ %l1 + 0x28 ], %g1
     108:       11 00 00 00     sethi  %hi(0), %o0
     10c:       92 10 21 04     mov  0x104, %o1
     110:       40 00 00 00     call  110 <journal_submit_data_buffers+0x74>
     114:       90 12 20 00     mov  %o0, %o0
     118:       91 d0 20 05     ta  5
                jinode->i_flags &= ~JI_COMMIT_RUNNING;
                wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
     11c:       90 04 60 28     add  %l1, 0x28, %o0
     120:       92 10 20 00     clr  %o1
                err = journal_submit_inode_data_buffers(mapping);
                if (!ret)
                        ret = err;
                spin_lock(&journal->j_list_lock);
                J_ASSERT(jinode->i_transaction == commit_transaction);
                jinode->i_flags &= ~JI_COMMIT_RUNNING;
     124:       82 08 7f fe     and  %g1, -2, %g1
                wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
     128:       40 00 00 00     call  128 <journal_submit_data_buffers+0x8c>
     12c:       c2 24 60 28     st  %g1, [ %l1 + 0x28 ]
        struct jbd2_inode *jinode;
        int err, ret = 0;
        struct address_space *mapping;

        spin_lock(&journal->j_list_lock);
        list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
     130:       c2 5c 60 10     ldx  [ %l1 + 0x10 ], %g1
     134:       a2 00 7f f0     add  %g1, -16, %l1
         * prefetches into the prefetch-cache which only is accessible
         * by floating point operations in UltraSPARC-III and later.
         * By contrast, "#one_write" prefetches into the L2 cache
         * in shared state.
         */
        __asm__ __volatile__("prefetch [%0], #one_write"
     138:       c2 5c 60 10     ldx  [ %l1 + 0x10 ], %g1
     13c:       c7 68 40 00     prefetch  [ %g1 ], #one_write
     140:       82 04 60 10     add  %l1, 0x10, %g1
     144:       80 a4 c0 01     cmp  %l3, %g1
     148:       32 6f ff e0     bne,a   %xcc, c8
<journal_submit_data_buffers+0x2c>
     14c:       c4 5c 60 20     ldx  [ %l1 + 0x20 ], %g2
                spin_lock(&journal->j_list_lock);
                J_ASSERT(jinode->i_transaction == commit_transaction);
                wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
        }
        spin_unlock(&journal->j_list_lock);
     150:       90 10 00 12     mov  %l2, %o0
     154:       40 00 00 00     call  154 <journal_submit_data_buffers+0xb8>
     158:       b1 3e 20 00     sra  %i0, 0, %i0
        return ret;
}
     15c:       81 cf e0 08     rett  %i7 + 8
     160:       01 00 00 00     nop

Download attachment "jbd2-commit-noinline.out" of type "application/octet-stream" (151910 bytes)

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ