Message-ID: <20150925154925.GH4449@mtj.duckdns.org>
Date: Fri, 25 Sep 2015 11:49:25 -0400
From: Tejun Heo <tj@...nel.org>
To: Artem Bityutskiy <dedekind1@...il.com>
Cc: Theodore Ts'o <tytso@....edu>, axboe@...nel.dk,
	linux-kernel@...r.kernel.org, linux-fsdevel@...r.kernel.org,
	lizefan@...wei.com, cgroups@...r.kernel.org, hannes@...xchg.org,
	kernel-team@...com, adilger.kernel@...ger.ca,
	linux-ext4@...r.kernel.org, Dexuan Cui <decui@...rosoft.com>
Subject: Re: [PATCH cgroup/for-4.3-fixes] cgroup, writeback: don't enable
 cgroup writeback on traditional hierarchies

Hello, Artem.

On Fri, Sep 25, 2015 at 01:50:22PM +0300, Artem Bityutskiy wrote:
> > Does not compile with multiple errors like
> >
> > linux/fs/fs-writeback.c:799:10: error: ‘struct bdi_writeback’ has no
> > member named ‘last_comp_gen’
> >    bdi->wb.last_comp_gen = bdi->wb.comp_gen;
>
> I tried to extend your patch with these fields, but I am not sure I got
> it right, so please, send a new patch, I'll run the reboot corruption
> test with your patch.

Oops, sorry, I'm an idiot.

> Please, note, because this test is about reboots, I'll probably output
> everything to the serial console.  Therefore, please, do not print too
> much data.  Otherwise I'd have to modify my scripts to collect dmesg
> before restarting, which is more work.

Hmmm... I'm gonna add ratelimit on inode printouts; other than that,
it shouldn't print too much.

Thanks.

---
 fs/fs-writeback.c                |  154 +++++++++++++++++++++++++++++++++++++--
 fs/inode.c                       |    1 
 include/linux/backing-dev-defs.h |    2 
 include/linux/backing-dev.h      |   20 ++++-
 include/linux/fs.h               |    2 
 mm/backing-dev.c                 |    2 
 6 files changed, 173 insertions(+), 8 deletions(-)

--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -101,7 +101,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(wbc_writepa
 
 static bool wb_io_lists_populated(struct bdi_writeback *wb)
 {
-	if (wb_has_dirty_io(wb)) {
+	if (test_bit(WB_has_dirty_io, &wb->state)) {
 		return false;
 	} else {
 		set_bit(WB_has_dirty_io, &wb->state);
@@ -763,6 +763,15 @@ static long wb_split_bdi_pages(struct bd
 	return DIV_ROUND_UP_ULL((u64)nr_pages * this_bw, tot_bw);
 }
 
+extern spinlock_t cgwb_lock;
+
+struct split_work_dbg {
+	DECLARE_BITMAP(all_wbs, 8192);
+	DECLARE_BITMAP(iterated_wbs, 8192);
+	DECLARE_BITMAP(written_wbs, 8192);
+	DECLARE_BITMAP(sync_wbs, 8192);
+};
+
 /**
  * bdi_split_work_to_wbs - split a wb_writeback_work to all wb's of a bdi
  * @bdi: target backing_dev_info
@@ -776,11 +785,25 @@ static long wb_split_bdi_pages(struct bd
  */
 static void bdi_split_work_to_wbs(struct backing_dev_info *bdi,
 				  struct wb_writeback_work *base_work,
-				  bool skip_if_busy)
+				  bool skip_if_busy, struct split_work_dbg *dbg)
 {
 	int next_memcg_id = 0;
 	struct bdi_writeback *wb;
 	struct wb_iter iter;
+	struct radix_tree_iter riter;
+	void **slot;
+
+	if (dbg) {
+		spin_lock_irq(&cgwb_lock);
+		set_bit(bdi->wb.memcg_css->id, dbg->all_wbs);
+		bdi->wb.last_comp_gen = bdi->wb.comp_gen;
+		radix_tree_for_each_slot(slot, &bdi->cgwb_tree, &riter, 0) {
+			wb = *slot;
+			set_bit(wb->memcg_css->id, dbg->all_wbs);
+			wb->last_comp_gen = wb->comp_gen;
+		}
+		spin_unlock_irq(&cgwb_lock);
+	}
 
 	might_sleep();
 restart:
@@ -791,6 +814,9 @@ restart:
 		struct wb_writeback_work *work;
 		long nr_pages;
 
+		if (dbg)
+			set_bit(wb->memcg_css->id, dbg->iterated_wbs);
+
 		/* SYNC_ALL writes out I_DIRTY_TIME too */
 		if (!wb_has_dirty_io(wb) &&
 		    (base_work->sync_mode == WB_SYNC_NONE ||
@@ -799,6 +825,9 @@ restart:
 		if (skip_if_busy && writeback_in_progress(wb))
 			continue;
 
+		if (dbg)
+			set_bit(wb->memcg_css->id, dbg->written_wbs);
+
 		nr_pages = wb_split_bdi_pages(wb, base_work->nr_pages);
 
 		work = kmalloc(sizeof(*work), GFP_ATOMIC);
@@ -817,6 +846,8 @@ restart:
 		work->auto_free = 0;
 		work->done = &fallback_work_done;
 
+		if (dbg)
+			set_bit(wb->memcg_css->id, dbg->sync_wbs);
 		wb_queue_work(wb, work);
 
 		next_memcg_id = wb->memcg_css->id + 1;
@@ -1425,6 +1456,9 @@ static long writeback_sb_inodes(struct s
 			break;
 		}
 
+		inode->i_dbg_marker = 0;
+		inode->i_dbg_marker2 = 0;
+
 		/*
 		 * Don't bother with new inodes or inodes being freed, first
 		 * kind does not need periodic writeout yet, and for the latter
@@ -1515,6 +1549,7 @@ static long writeback_sb_inodes(struct s
 			break;
 		}
 	}
+
 	return wrote;
 }
 
@@ -1574,6 +1609,28 @@ static long writeback_inodes_wb(struct b
 	return nr_pages - work.nr_pages;
 }
 
+static int inode_which_wb_io_list(struct inode *inode, struct backing_dev_info *bdi)
+{
+	struct bdi_writeback *wb = inode->i_wb ?: &bdi->wb;
+	struct inode *pos;
+
+	if (list_empty(&inode->i_io_list))
+		return 0;
+	list_for_each_entry(pos, &wb->b_dirty, i_io_list)
+		if (pos == inode)
+			return 1;
+	list_for_each_entry(pos, &wb->b_io, i_io_list)
+		if (pos == inode)
+			return 2;
+	list_for_each_entry(pos, &wb->b_more_io, i_io_list)
+		if (pos == inode)
+			return 3;
+	list_for_each_entry(pos, &wb->b_dirty_time, i_io_list)
+		if (pos == inode)
+			return 4;
+	return 5;
+}
+
 /*
  * Explicit flushing or periodic writeback of "old" data.
  *
@@ -1604,6 +1661,16 @@ static long wb_writeback(struct bdi_writ
 	blk_start_plug(&plug);
 	spin_lock(&wb->list_lock);
+
+	list_for_each_entry(inode, &wb->b_dirty, i_io_list)
+		inode->i_dbg_marker2 = 1;
+	list_for_each_entry(inode, &wb->b_io, i_io_list)
+		inode->i_dbg_marker2 = 2;
+	list_for_each_entry(inode, &wb->b_more_io, i_io_list)
+		inode->i_dbg_marker2 = 3;
+	list_for_each_entry(inode, &wb->b_dirty_time, i_io_list)
+		inode->i_dbg_marker2 = 4;
+
 	for (;;) {
 		/*
 		 * Stop writeback when nr_pages has been consumed
@@ -1681,6 +1748,24 @@ static long wb_writeback(struct bdi_writ
 			spin_lock(&wb->list_lock);
 		}
 	}
+
+	if (work->sync_mode == WB_SYNC_ALL) {
+		list_for_each_entry(inode, &wb->b_dirty, i_io_list)
+			if (inode->i_dbg_marker2)
+				printk("XXX wb_writeback: inode %lu marker2=%d on b_dirty\n",
+				       inode->i_ino, inode->i_dbg_marker2);
+		list_for_each_entry(inode, &wb->b_io, i_io_list)
+			printk("XXX wb_writeback: inode %lu marker2=%d on b_io\n",
+			       inode->i_ino, inode->i_dbg_marker2);
+		list_for_each_entry(inode, &wb->b_more_io, i_io_list)
+			printk("XXX wb_writeback: inode %lu marker2=%d on b_more_io\n",
+			       inode->i_ino, inode->i_dbg_marker2);
+		list_for_each_entry(inode, &wb->b_dirty_time, i_io_list)
+			if (inode->i_dbg_marker2)
+				printk("XXX wb_writeback: inode %lu marker2=%d on b_dirty_time\n",
+				       inode->i_ino, inode->i_dbg_marker2);
+	}
+
 	spin_unlock(&wb->list_lock);
 	blk_finish_plug(&plug);
 
@@ -1785,8 +1870,11 @@ static long wb_do_writeback(struct bdi_w
 		if (work->auto_free)
 			kfree(work);
 
-		if (done && atomic_dec_and_test(&done->cnt))
-			wake_up_all(&wb->bdi->wb_waitq);
+		if (done) {
+			wb->comp_gen++;
+			if (atomic_dec_and_test(&done->cnt))
+				wake_up_all(&wb->bdi->wb_waitq);
+		}
 	}
 
 /*
@@ -1976,6 +2064,9 @@ void __mark_inode_dirty(struct inode *in
 
 	trace_writeback_mark_inode_dirty(inode, flags);
 
+	WARN_ON_ONCE(!(sb->s_flags & MS_LAZYTIME) &&
+		     !list_empty(&inode_to_bdi(inode)->wb.b_dirty_time));
+
 	/*
 	 * Don't do this for I_DIRTY_PAGES - that doesn't actually
 	 * dirty the inode itself
@@ -2165,7 +2256,7 @@ static void __writeback_inodes_sb_nr(str
 		return;
 
 	WARN_ON(!rwsem_is_locked(&sb->s_umount));
-	bdi_split_work_to_wbs(sb->s_bdi, &work, skip_if_busy);
+	bdi_split_work_to_wbs(sb->s_bdi, &work, skip_if_busy, NULL);
 	wb_wait_for_completion(bdi, &done);
 }
 
@@ -2257,6 +2348,10 @@ void sync_inodes_sb(struct super_block *
 		.for_sync	= 1,
 	};
 	struct backing_dev_info *bdi = sb->s_bdi;
+	static DEFINE_MUTEX(dbg_mutex);
+	static struct split_work_dbg dbg;
+	static DECLARE_BITMAP(tmp_bitmap, 8192);
+	struct inode *inode;
 
 	/*
 	 * Can't skip on !bdi_has_dirty() because we should wait for !dirty
@@ -2267,9 +2362,56 @@ void sync_inodes_sb(struct super_block *
 		return;
 	WARN_ON(!rwsem_is_locked(&sb->s_umount));
 
-	bdi_split_work_to_wbs(bdi, &work, false);
+	mutex_lock(&dbg_mutex);
+
+	printk("XXX SYNCING %d:%d\n", MAJOR(sb->s_dev), MINOR(sb->s_dev));
+
+	bitmap_zero(dbg.all_wbs, 8192);
+	bitmap_zero(dbg.iterated_wbs, 8192);
+	bitmap_zero(dbg.written_wbs, 8192);
+	bitmap_zero(dbg.sync_wbs, 8192);
+
+	spin_lock(&sb->s_inode_list_lock);
+	list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
+		spin_lock(&inode->i_lock);
+		inode->i_dbg_marker = inode_which_wb_io_list(inode, bdi);
+		spin_unlock(&inode->i_lock);
+	}
+	spin_unlock(&sb->s_inode_list_lock);
+
+	bdi_split_work_to_wbs(bdi, &work, false, &dbg);
 	wb_wait_for_completion(bdi, &done);
 
+	spin_lock(&sb->s_inode_list_lock);
+	list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
+		struct bdi_writeback *wb = inode->i_wb ?: &bdi->wb;
+
+		if (!inode->i_dbg_marker)
+			continue;
+
+		spin_lock_irq(&wb->list_lock);
+		if (inode->i_state & I_DIRTY_ALL)
+			printk_ratelimited("XXX sync_inodes_sb(%d:%d): dirty inode %lu skipped, wb=%d comp_gen=%d->%d which=%d->%d i_state=0x%lx\n",
+					   MAJOR(sb->s_dev), MINOR(sb->s_dev), inode->i_ino,
+					   wb->memcg_css->id, wb->last_comp_gen, wb->comp_gen,
+					   inode->i_dbg_marker, inode_which_wb_io_list(inode, bdi),
+					   inode->i_state);
+		spin_unlock_irq(&wb->list_lock);
+	}
+	spin_unlock(&sb->s_inode_list_lock);
+
+	bitmap_andnot(tmp_bitmap, dbg.all_wbs, dbg.iterated_wbs, 8192);
+	if (!bitmap_empty(tmp_bitmap, 8192))
+		printk("XXX sync_inodes_sb(%d:%d): iteration skipped %8192pbl\n",
+		       MAJOR(sb->s_dev), MINOR(sb->s_dev), tmp_bitmap);
+
+	printk("XXX all_wbs      = %8192pbl\n", dbg.all_wbs);
+	printk("XXX iterated_wbs = %8192pbl\n", dbg.iterated_wbs);
+	printk("XXX written_wbs  = %8192pbl\n", dbg.written_wbs);
+	printk("XXX sync_wbs     = %8192pbl\n", dbg.sync_wbs);
+
+	mutex_unlock(&dbg_mutex);
+
 	wait_sb_inodes(sb);
 }
 EXPORT_SYMBOL(sync_inodes_sb);
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -183,6 +183,7 @@ int inode_init_always(struct super_block
 #endif
 	inode->i_flctx = NULL;
 	this_cpu_inc(nr_inodes);
+	inode->i_dbg_marker = 0;
 
 	return 0;
 out:
--- a/include/linux/backing-dev-defs.h
+++ b/include/linux/backing-dev-defs.h
@@ -129,6 +129,8 @@ struct bdi_writeback {
 		struct rcu_head rcu;
 	};
 #endif
+	int comp_gen;
+	int last_comp_gen;
 };
 
 struct backing_dev_info {
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -38,7 +38,25 @@ extern struct workqueue_struct *bdi_wq;
 
 static inline bool wb_has_dirty_io(struct bdi_writeback *wb)
 {
-	return test_bit(WB_has_dirty_io, &wb->state);
+	bool ret = test_bit(WB_has_dirty_io, &wb->state);
+	long tot_write_bw = atomic_long_read(&wb->bdi->tot_write_bandwidth);
+
+	if (!ret && (!list_empty(&wb->b_dirty) || !list_empty(&wb->b_io) ||
+		     !list_empty(&wb->b_more_io))) {
+		const char *name = wb->bdi->dev ? dev_name(wb->bdi->dev) : "UNK";
+
+		pr_err("wb_has_dirty_io: ERR %s has_dirty=%d b_dirty=%d b_io=%d b_more_io=%d\n",
+		       name, ret, !list_empty(&wb->b_dirty), !list_empty(&wb->b_io), !list_empty(&wb->b_more_io));
+		WARN_ON(1);
+	}
+	if (ret && !tot_write_bw) {
+		const char *name = wb->bdi->dev ? dev_name(wb->bdi->dev) : "UNK";
+
+		pr_err("wb_has_dirty_io: ERR %s has_dirty=%d but tot_write_bw=%ld\n",
+		       name, ret, tot_write_bw);
+		WARN_ON(1);
+	}
+	return ret;
 }
 
 static inline bool bdi_has_dirty_io(struct backing_dev_info *bdi)
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -677,6 +677,8 @@ struct inode {
 #endif
 
 	void			*i_private; /* fs or device private pointer */
+	unsigned		i_dbg_marker;
+	unsigned		i_dbg_marker2;
 };
 
 static inline int inode_unhashed(struct inode *inode)
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -382,7 +382,7 @@ static void wb_exit(struct bdi_writeback
  * protected.  cgwb_release_wait is used to wait for the completion of cgwb
  * releases from bdi destruction path.
  */
-static DEFINE_SPINLOCK(cgwb_lock);
+DEFINE_SPINLOCK(cgwb_lock);
 static DECLARE_WAIT_QUEUE_HEAD(cgwb_release_wait);
 
 /**
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html