lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20150925154925.GH4449@mtj.duckdns.org>
Date:	Fri, 25 Sep 2015 11:49:25 -0400
From:	Tejun Heo <tj@...nel.org>
To:	Artem Bityutskiy <dedekind1@...il.com>
Cc:	Theodore Ts'o <tytso@....edu>, axboe@...nel.dk,
	linux-kernel@...r.kernel.org, linux-fsdevel@...r.kernel.org,
	lizefan@...wei.com, cgroups@...r.kernel.org, hannes@...xchg.org,
	kernel-team@...com, adilger.kernel@...ger.ca,
	linux-ext4@...r.kernel.org, Dexuan Cui <decui@...rosoft.com>
Subject: Re: [PATCH cgroup/for-4.3-fixes] cgroup, writeback: don't enable
 cgroup writeback on traditional hierarchies

Hello, Artem.

On Fri, Sep 25, 2015 at 01:50:22PM +0300, Artem Bityutskiy wrote:
> > Does not compile with multiple errors like
> > 
> > linux/fs/fs-writeback.c:799:10: error: ‘struct bdi_writeback’ has no
> > member named ‘last_comp_gen’
> >    bdi->wb.last_comp_gen = bdi->wb.comp_gen;
> 
> I tried to extend your patch with these fields, but I am not sure I got
> it right, so please, send a new patch, I'll run the reboot corruption
> test with your patch.

Oops, sorry, I'm an idiot.

> Please, note, because this test is about reboots, I'll probably output
> everything to the serial console. Therefore, please, do not print too
> much data. Otherwise I'd have to modify my scripts to collect dmesg
> before restarting, which is more work.

Hmmm... I'm gonna add ratelimit on inode printouts; other than that,
it shouldn't print too much.

Thanks.

---
 fs/fs-writeback.c                |  154 +++++++++++++++++++++++++++++++++++++--
 fs/inode.c                       |    1 
 include/linux/backing-dev-defs.h |    2 
 include/linux/backing-dev.h      |   20 ++++-
 include/linux/fs.h               |    2 
 mm/backing-dev.c                 |    2 
 6 files changed, 173 insertions(+), 8 deletions(-)

--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -101,7 +101,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(wbc_writepa
 
 static bool wb_io_lists_populated(struct bdi_writeback *wb)
 {
-	if (wb_has_dirty_io(wb)) {
+	if (test_bit(WB_has_dirty_io, &wb->state)) {
 		return false;
 	} else {
 		set_bit(WB_has_dirty_io, &wb->state);
@@ -763,6 +763,15 @@ static long wb_split_bdi_pages(struct bd
 		return DIV_ROUND_UP_ULL((u64)nr_pages * this_bw, tot_bw);
 }
 
+extern spinlock_t cgwb_lock;
+
+struct split_work_dbg {
+	DECLARE_BITMAP(all_wbs, 8192);
+	DECLARE_BITMAP(iterated_wbs, 8192);
+	DECLARE_BITMAP(written_wbs, 8192);
+	DECLARE_BITMAP(sync_wbs, 8192);
+};
+
 /**
  * bdi_split_work_to_wbs - split a wb_writeback_work to all wb's of a bdi
  * @bdi: target backing_dev_info
@@ -776,11 +785,25 @@ static long wb_split_bdi_pages(struct bd
  */
 static void bdi_split_work_to_wbs(struct backing_dev_info *bdi,
 				  struct wb_writeback_work *base_work,
-				  bool skip_if_busy)
+				  bool skip_if_busy, struct split_work_dbg *dbg)
 {
 	int next_memcg_id = 0;
 	struct bdi_writeback *wb;
 	struct wb_iter iter;
+	struct radix_tree_iter riter;
+	void **slot;
+
+	if (dbg) {
+		spin_lock_irq(&cgwb_lock);
+		set_bit(bdi->wb.memcg_css->id, dbg->all_wbs);
+		bdi->wb.last_comp_gen = bdi->wb.comp_gen;
+		radix_tree_for_each_slot(slot, &bdi->cgwb_tree, &riter, 0) {
+			wb = *slot;
+			set_bit(wb->memcg_css->id, dbg->all_wbs);
+			wb->last_comp_gen = wb->comp_gen;
+		}
+		spin_unlock_irq(&cgwb_lock);
+	}
 
 	might_sleep();
 restart:
@@ -791,6 +814,9 @@ restart:
 		struct wb_writeback_work *work;
 		long nr_pages;
 
+		if (dbg)
+			set_bit(wb->memcg_css->id, dbg->iterated_wbs);
+
 		/* SYNC_ALL writes out I_DIRTY_TIME too */
 		if (!wb_has_dirty_io(wb) &&
 		    (base_work->sync_mode == WB_SYNC_NONE ||
@@ -799,6 +825,9 @@ restart:
 		if (skip_if_busy && writeback_in_progress(wb))
 			continue;
 
+		if (dbg)
+			set_bit(wb->memcg_css->id, dbg->written_wbs);
+
 		nr_pages = wb_split_bdi_pages(wb, base_work->nr_pages);
 
 		work = kmalloc(sizeof(*work), GFP_ATOMIC);
@@ -817,6 +846,8 @@ restart:
 		work->auto_free = 0;
 		work->done = &fallback_work_done;
 
+		if (dbg)
+			set_bit(wb->memcg_css->id, dbg->sync_wbs);
 		wb_queue_work(wb, work);
 
 		next_memcg_id = wb->memcg_css->id + 1;
@@ -1425,6 +1456,9 @@ static long writeback_sb_inodes(struct s
 			break;
 		}
 
+		inode->i_dbg_marker = 0;
+		inode->i_dbg_marker2 = 0;
+
 		/*
 		 * Don't bother with new inodes or inodes being freed, first
 		 * kind does not need periodic writeout yet, and for the latter
@@ -1515,6 +1549,7 @@ static long writeback_sb_inodes(struct s
 				break;
 		}
 	}
+
 	return wrote;
 }
 
@@ -1574,6 +1609,28 @@ static long writeback_inodes_wb(struct b
 	return nr_pages - work.nr_pages;
 }
 
+static int inode_which_wb_io_list(struct inode *inode, struct backing_dev_info *bdi)
+{
+	struct bdi_writeback *wb = inode->i_wb ?: &bdi->wb;
+	struct inode *pos;
+
+	if (list_empty(&inode->i_io_list))
+		return 0;
+	list_for_each_entry(pos, &wb->b_dirty, i_io_list)
+		if (pos == inode)
+			return 1;
+	list_for_each_entry(pos, &wb->b_io, i_io_list)
+		if (pos == inode)
+			return 2;
+	list_for_each_entry(pos, &wb->b_more_io, i_io_list)
+		if (pos == inode)
+			return 3;
+	list_for_each_entry(pos, &wb->b_dirty_time, i_io_list)
+		if (pos == inode)
+			return 4;
+	return 5;
+}
+
 /*
  * Explicit flushing or periodic writeback of "old" data.
  *
@@ -1604,6 +1661,16 @@ static long wb_writeback(struct bdi_writ
 
 	blk_start_plug(&plug);
 	spin_lock(&wb->list_lock);
+
+	list_for_each_entry(inode, &wb->b_dirty, i_io_list)
+		inode->i_dbg_marker2 = 1;
+	list_for_each_entry(inode, &wb->b_io, i_io_list)
+		inode->i_dbg_marker2 = 2;
+	list_for_each_entry(inode, &wb->b_more_io, i_io_list)
+		inode->i_dbg_marker2 = 3;
+	list_for_each_entry(inode, &wb->b_dirty_time, i_io_list)
+		inode->i_dbg_marker2 = 4;
+
 	for (;;) {
 		/*
 		 * Stop writeback when nr_pages has been consumed
@@ -1681,6 +1748,24 @@ static long wb_writeback(struct bdi_writ
 			spin_lock(&wb->list_lock);
 		}
 	}
+
+	if (work->sync_mode == WB_SYNC_ALL) {
+		list_for_each_entry(inode, &wb->b_dirty, i_io_list)
+			if (inode->i_dbg_marker2)
+				printk("XXX wb_writeback: inode %lu marker2=%d on b_dirty\n",
+				       inode->i_ino, inode->i_dbg_marker2);
+		list_for_each_entry(inode, &wb->b_io, i_io_list)
+			printk("XXX wb_writeback: inode %lu marker2=%d on b_io\n",
+			       inode->i_ino, inode->i_dbg_marker2);
+		list_for_each_entry(inode, &wb->b_more_io, i_io_list)
+			printk("XXX wb_writeback: inode %lu marker2=%d on b_more_io\n",
+			       inode->i_ino, inode->i_dbg_marker2);
+		list_for_each_entry(inode, &wb->b_dirty_time, i_io_list)
+			if (inode->i_dbg_marker2)
+				printk("XXX wb_writeback: inode %lu marker2=%d on b_dirty_time\n",
+				       inode->i_ino, inode->i_dbg_marker2);
+	}
+
 	spin_unlock(&wb->list_lock);
 	blk_finish_plug(&plug);
 
@@ -1785,8 +1870,11 @@ static long wb_do_writeback(struct bdi_w
 
 		if (work->auto_free)
 			kfree(work);
-		if (done && atomic_dec_and_test(&done->cnt))
-			wake_up_all(&wb->bdi->wb_waitq);
+		if (done) {
+			wb->comp_gen++;
+			if (atomic_dec_and_test(&done->cnt))
+				wake_up_all(&wb->bdi->wb_waitq);
+		}
 	}
 
 	/*
@@ -1976,6 +2064,9 @@ void __mark_inode_dirty(struct inode *in
 
 	trace_writeback_mark_inode_dirty(inode, flags);
 
+	WARN_ON_ONCE(!(sb->s_flags & MS_LAZYTIME) &&
+		     !list_empty(&inode_to_bdi(inode)->wb.b_dirty_time));
+
 	/*
 	 * Don't do this for I_DIRTY_PAGES - that doesn't actually
 	 * dirty the inode itself
@@ -2165,7 +2256,7 @@ static void __writeback_inodes_sb_nr(str
 		return;
 	WARN_ON(!rwsem_is_locked(&sb->s_umount));
 
-	bdi_split_work_to_wbs(sb->s_bdi, &work, skip_if_busy);
+	bdi_split_work_to_wbs(sb->s_bdi, &work, skip_if_busy, NULL);
 	wb_wait_for_completion(bdi, &done);
 }
 
@@ -2257,6 +2348,10 @@ void sync_inodes_sb(struct super_block *
 		.for_sync	= 1,
 	};
 	struct backing_dev_info *bdi = sb->s_bdi;
+	static DEFINE_MUTEX(dbg_mutex);
+	static struct split_work_dbg dbg;
+	static DECLARE_BITMAP(tmp_bitmap, 8192);
+	struct inode *inode;
 
 	/*
 	 * Can't skip on !bdi_has_dirty() because we should wait for !dirty
@@ -2267,9 +2362,56 @@ void sync_inodes_sb(struct super_block *
 		return;
 	WARN_ON(!rwsem_is_locked(&sb->s_umount));
 
-	bdi_split_work_to_wbs(bdi, &work, false);
+	mutex_lock(&dbg_mutex);
+
+	printk("XXX SYNCING %d:%d\n", MAJOR(sb->s_dev), MINOR(sb->s_dev));
+
+	bitmap_zero(dbg.all_wbs, 8192);
+	bitmap_zero(dbg.iterated_wbs, 8192);
+	bitmap_zero(dbg.written_wbs, 8192);
+	bitmap_zero(dbg.sync_wbs, 8192);
+
+	spin_lock(&sb->s_inode_list_lock);
+	list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
+		spin_lock(&inode->i_lock);
+		inode->i_dbg_marker = inode_which_wb_io_list(inode, bdi);
+		spin_unlock(&inode->i_lock);
+	}
+	spin_unlock(&sb->s_inode_list_lock);
+
+	bdi_split_work_to_wbs(bdi, &work, false, &dbg);
 	wb_wait_for_completion(bdi, &done);
 
+	spin_lock(&sb->s_inode_list_lock);
+	list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
+		struct bdi_writeback *wb = inode->i_wb ?: &bdi->wb;
+
+		if (!inode->i_dbg_marker)
+			continue;
+
+		spin_lock_irq(&wb->list_lock);
+		if (inode->i_state & I_DIRTY_ALL)
+			printk_ratelimited("XXX sync_inodes_sb(%d:%d): dirty inode %lu skipped, wb=%d comp_gen=%d->%d which=%d->%d i_state=0x%lx\n",
+			       MAJOR(sb->s_dev), MINOR(sb->s_dev), inode->i_ino,
+			       wb->memcg_css->id, wb->last_comp_gen, wb->comp_gen,
+			       inode->i_dbg_marker, inode_which_wb_io_list(inode, bdi),
+			       inode->i_state);
+		spin_unlock_irq(&wb->list_lock);
+	}
+	spin_unlock(&sb->s_inode_list_lock);
+
+	bitmap_andnot(tmp_bitmap, dbg.all_wbs, dbg.iterated_wbs, 8192);
+	if (!bitmap_empty(tmp_bitmap, 8192))
+		printk("XXX sync_inodes_sb(%d:%d): iteration skipped %8192pbl\n",
+		       MAJOR(sb->s_dev), MINOR(sb->s_dev), tmp_bitmap);
+
+	printk("XXX all_wbs      = %8192pbl\n", dbg.all_wbs);
+	printk("XXX iterated_wbs = %8192pbl\n", dbg.iterated_wbs);
+	printk("XXX written_wbs  = %8192pbl\n", dbg.written_wbs);
+	printk("XXX sync_wbs     = %8192pbl\n", dbg.sync_wbs);
+
+	mutex_unlock(&dbg_mutex);
+
 	wait_sb_inodes(sb);
 }
 EXPORT_SYMBOL(sync_inodes_sb);
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -183,6 +183,7 @@ int inode_init_always(struct super_block
 #endif
 	inode->i_flctx = NULL;
 	this_cpu_inc(nr_inodes);
+	inode->i_dbg_marker = 0;
 
 	return 0;
 out:
--- a/include/linux/backing-dev-defs.h
+++ b/include/linux/backing-dev-defs.h
@@ -129,6 +129,8 @@ struct bdi_writeback {
 		struct rcu_head rcu;
 	};
 #endif
+	int comp_gen;
+	int last_comp_gen;
 };
 
 struct backing_dev_info {
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -38,7 +38,25 @@ extern struct workqueue_struct *bdi_wq;
 
 static inline bool wb_has_dirty_io(struct bdi_writeback *wb)
 {
-	return test_bit(WB_has_dirty_io, &wb->state);
+	bool ret = test_bit(WB_has_dirty_io, &wb->state);
+	long tot_write_bw = atomic_long_read(&wb->bdi->tot_write_bandwidth);
+
+	if (!ret && (!list_empty(&wb->b_dirty) || !list_empty(&wb->b_io) ||
+		     !list_empty(&wb->b_more_io))) {
+		const char *name = wb->bdi->dev ? dev_name(wb->bdi->dev) : "UNK";
+
+		pr_err("wb_has_dirty_io: ERR %s has_dirty=%d b_dirty=%d b_io=%d b_more_io=%d\n",
+		       name, ret, !list_empty(&wb->b_dirty), !list_empty(&wb->b_io), !list_empty(&wb->b_more_io));
+		WARN_ON(1);
+	}
+	if (ret && !tot_write_bw) {
+		const char *name = wb->bdi->dev ? dev_name(wb->bdi->dev) : "UNK";
+
+		pr_err("wb_has_dirty_io: ERR %s has_dirty=%d but tot_write_bw=%ld\n",
+		       name, ret, tot_write_bw);
+		WARN_ON(1);
+	}
+	return ret;
 }
 
 static inline bool bdi_has_dirty_io(struct backing_dev_info *bdi)
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -677,6 +677,8 @@ struct inode {
 #endif
 
 	void			*i_private; /* fs or device private pointer */
+	unsigned		i_dbg_marker;
+	unsigned		i_dbg_marker2;
 };
 
 static inline int inode_unhashed(struct inode *inode)
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -382,7 +382,7 @@ static void wb_exit(struct bdi_writeback
  * protected.  cgwb_release_wait is used to wait for the completion of cgwb
  * releases from bdi destruction path.
  */
-static DEFINE_SPINLOCK(cgwb_lock);
+DEFINE_SPINLOCK(cgwb_lock);
 static DECLARE_WAIT_QUEUE_HEAD(cgwb_release_wait);
 
 /**
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Powered by blists - more mailing lists