lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Date:	Tue,  6 Jan 2015 16:26:12 -0500
From:	Tejun Heo <tj@...nel.org>
To:	axboe@...nel.dk
Cc:	linux-kernel@...r.kernel.org, jack@...e.cz, hch@...radead.org,
	hannes@...xchg.org, linux-fsdevel@...r.kernel.org,
	vgoyal@...hat.com, lizefan@...wei.com, cgroups@...r.kernel.org,
	linux-mm@...ck.org, mhocko@...e.cz, clm@...com,
	fengguang.wu@...el.com, david@...morbit.com,
	Tejun Heo <tj@...nel.org>,
	Alexander Viro <viro@...iv.linux.org.uk>
Subject: [PATCH 35/45] vfs, writeback: implement inode->i_nr_syncs

Currently, I_SYNC is used to keep track of whether writeback is in
progress on the inode or not.  With cgroup writeback support, multiple
writebacks could be in progress simultaneously and a single bit in
inode->i_state isn't sufficient.

If CONFIG_CGROUP_WRITEBACK, this patch makes each iwbl (inode_wb_link)
track whether writeback is in progress using the new IWBL_SYNC flag on
iwbl->data and inode->i_nr_syncs aggregate total number of writebacks
in progress on the inode.  New helpers, iwbl_{test|set|clear}_sync(),
iwbl_sync_wakeup() and __iwbl_wait_for_writeback() are added to
manipulate these states and inode_sleep_on_writeback() is converted to
iwbl_sleep_on_writeback().  I_SYNC retains the same meaning - it's set
if any writeback is in progress and cleared if none.

If !CONFIG_CGROUP_WRITEBACK, the helpers simply operate on I_SYNC
directly and there's no behavioral changes compared to before.  When
CONFIG_CGROUP_WRITEBACK, this adds an atomic_t to struct inode.  This
competes for the same left over 4 byte slot w/ i_readcount from
CONFIG_IMA on 64 bit, and, as long as CONFIG_IMA isn't enabled, it
doesn't increase the size of struct inode on 64 bit.

This allows keeping track of writeback in-progress state per cgroup
bdi_writeback.

Signed-off-by: Tejun Heo <tj@...nel.org>
Cc: Alexander Viro <viro@...iv.linux.org.uk>
Cc: Christoph Hellwig <hch@...radead.org>
Cc: Jens Axboe <axboe@...nel.dk>
Cc: Jan Kara <jack@...e.cz>
---
 fs/fs-writeback.c                | 172 +++++++++++++++++++++++++++++++++------
 include/linux/backing-dev-defs.h |   7 ++
 include/linux/fs.h               |   3 +
 mm/backing-dev.c                 |   1 +
 4 files changed, 160 insertions(+), 23 deletions(-)

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index d10c231..df99b5b 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -430,26 +430,113 @@ restart:
 	rcu_read_unlock();
 }
 
+/**
+ * iwbl_test_sync - test whether writeback is in progress on an inode_wb_link
+ * @iwbl: target inode_wb_link
+ *
+ * Test whether writeback is in progress for the inode on the bdi_writeback
+ * specified by @iwbl.  The caller is responsible for synchornization.
+ */
+static bool iwbl_test_sync(struct inode_wb_link *iwbl)
+{
+	return test_bit(IWBL_SYNC, &iwbl->data);
+}
+
+/**
+ * iwbl_set_sync - mark an inode_wb_link that writeback is in progress
+ * @iwbl: target inode_wb_link
+ * @inode: inode @iwbl is associated with
+ *
+ * Mark that writeback is in progress for @inode on the bdi_writeback
+ * specified by @iwbl.  iwbl_test_sync() will return %true on @iwbl and
+ * %I_SYNC is set on @inode while there's any writeback in progress on it.
+ */
+static void iwbl_set_sync(struct inode_wb_link *iwbl, struct inode *inode)
+{
+	lockdep_assert_held(&inode->i_lock);
+	WARN_ON_ONCE(iwbl_test_sync(iwbl));
+
+	set_bit(IWBL_SYNC, &iwbl->data);
+	inode->i_nr_syncs++;
+	inode->i_state |= I_SYNC;
+}
+
+/**
+ * iwbl_clear_sync - undo iwbl_set_sync()
+ * @iwbl: target inode_wb_link
+ * @inode: inode @iwbl is associated with
+ *
+ * Returns %true if this was the last writeback in progress on @inode;
+ * %false, otherwise.
+ */
+static bool iwbl_clear_sync(struct inode_wb_link *iwbl, struct inode *inode)
+{
+	bool sync_complete;
+
+	lockdep_assert_held(&inode->i_lock);
+	WARN_ON_ONCE(!iwbl_test_sync(iwbl));
+
+	clear_bit(IWBL_SYNC, &iwbl->data);
+	sync_complete = !--inode->i_nr_syncs;
+	if (sync_complete)
+		inode->i_state &= ~I_SYNC;
+	return sync_complete;
+}
+
+/**
+ * iwbl_wait_for_writeback - wait for writeback in progree on an inode_wb_link
+ * @iwbl: target inode_wb_link
+ *
+ * Wait for the writeback in progress for the inode on the bdi_writeback
+ * specified by @iwbl.
+ */
+static void iwbl_wait_for_writeback(struct inode_wb_link *iwbl)
+	__releases(inode->i_lock)
+	__acquires(inode->i_lock)
+{
+	struct inode *inode = iwbl_to_inode(iwbl);
+	DEFINE_WAIT_BIT(wq, &iwbl->data, IWBL_SYNC);
+	wait_queue_head_t *wqh;
+
+	lockdep_assert_held(&inode->i_lock);
+
+	wqh = bit_waitqueue(&iwbl->data, IWBL_SYNC);
+	while (test_bit(IWBL_SYNC, &iwbl->data)) {
+		spin_unlock(&inode->i_lock);
+		__wait_on_bit(wqh, &wq, bit_wait, TASK_UNINTERRUPTIBLE);
+		spin_lock(&inode->i_lock);
+	}
+}
+
 /*
  * Sleep until I_SYNC is cleared. This function must be called with i_lock
  * held and drops it. It is aimed for callers not holding any inode reference
  * so once i_lock is dropped, inode can go away.
  */
-static void inode_sleep_on_writeback(struct inode *inode)
-	__releases(inode->i_lock)
+static void iwbl_sleep_on_writeback(struct inode_wb_link *iwbl)
 {
 	DEFINE_WAIT(wait);
-	wait_queue_head_t *wqh = bit_waitqueue(&inode->i_state, __I_SYNC);
+	struct inode *inode = iwbl_to_inode(iwbl);
+	wait_queue_head_t *wqh = bit_waitqueue(&iwbl->data, IWBL_SYNC);
 	int sleep;
 
 	prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
-	sleep = inode->i_state & I_SYNC;
+	sleep = test_bit(IWBL_SYNC, &iwbl->data);
 	spin_unlock(&inode->i_lock);
 	if (sleep)
 		schedule();
 	finish_wait(wqh, &wait);
 }
 
+/**
+ * iwbl_sync_wakeup - wakeup iwbl_{wait_for|sleep_on}_writeback() waiter
+ * @iwbl: target inode_wb_link
+ */
+static void iwbl_sync_wakeup(struct inode_wb_link *iwbl)
+{
+	wake_up_bit(&iwbl->data, IWBL_SYNC);
+}
+
 static inline struct inode_cgwb_link *icgwbl_first(struct inode *inode)
 {
 	struct hlist_node *node =
@@ -504,6 +591,7 @@ static void inode_icgwbls_del(struct inode *inode)
 	 * bdi->icgwbls_lock.
 	 */
 	inode_for_each_icgwbl(icgwbl, next, inode) {
+		WARN_ON_ONCE(test_bit(IWBL_SYNC, &icgwbl->iwbl.data));
 		hlist_del_rcu(&icgwbl->inode_node);
 		list_move(&icgwbl->wb_node, &to_free);
 	}
@@ -544,15 +632,39 @@ static void bdi_split_work_to_wbs(struct backing_dev_info *bdi,
 	}
 }
 
+static bool iwbl_test_sync(struct inode_wb_link *iwbl)
+{
+	struct inode *inode = iwbl_to_inode(iwbl);
+
+	return inode->i_state & I_SYNC;
+}
+
+static void iwbl_set_sync(struct inode_wb_link *iwbl, struct inode *inode)
+{
+	inode->i_state |= I_SYNC;
+}
+
+static bool iwbl_clear_sync(struct inode_wb_link *iwbl, struct inode *inode)
+{
+	inode->i_state &= ~I_SYNC;
+	return true;
+}
+
+static void iwbl_wait_for_writeback(struct inode_wb_link *iwbl)
+{
+	__inode_wait_for_writeback(iwbl_to_inode(iwbl));
+}
+
 /*
  * Sleep until I_SYNC is cleared. This function must be called with i_lock
  * held and drops it. It is aimed for callers not holding any inode reference
  * so once i_lock is dropped, inode can go away.
  */
-static void inode_sleep_on_writeback(struct inode *inode)
+static void iwbl_sleep_on_writeback(struct inode_wb_link *iwbl)
 	__releases(inode->i_lock)
 {
 	DEFINE_WAIT(wait);
+	struct inode *inode = iwbl_to_inode(iwbl);
 	wait_queue_head_t *wqh = bit_waitqueue(&inode->i_state, __I_SYNC);
 	int sleep;
 
@@ -564,6 +676,11 @@ static void inode_sleep_on_writeback(struct inode *inode)
 	finish_wait(wqh, &wait);
 }
 
+static void iwbl_sync_wakeup(struct inode_wb_link *iwbl)
+{
+	/* noop, __I_SYNC wakeup is enough */
+}
+
 static void inode_icgwbls_del(struct inode *inode)
 {
 }
@@ -700,14 +817,22 @@ static void requeue_io(struct inode_wb_link *iwbl, struct bdi_writeback *wb)
 	iwbl_move_locked(iwbl, wb, &wb->b_more_io);
 }
 
-static void inode_sync_complete(struct inode *inode)
+static void iwbl_sync_complete(struct inode_wb_link *iwbl)
 {
-	inode->i_state &= ~I_SYNC;
+	struct inode *inode = iwbl_to_inode(iwbl);
+	bool sync_complete;
+
+	sync_complete = iwbl_clear_sync(iwbl, inode);
 	/* If inode is clean an unused, put it into LRU now... */
-	inode_add_lru(inode);
+	if (sync_complete)
+		inode_add_lru(inode);
+
 	/* Waiters must see I_SYNC cleared before being woken up */
 	smp_mb();
-	wake_up_bit(&inode->i_state, __I_SYNC);
+
+	iwbl_sync_wakeup(iwbl);
+	if (sync_complete)
+		wake_up_bit(&inode->i_state, __I_SYNC);
 }
 
 static bool iwbl_dirtied_after(struct inode_wb_link *iwbl, unsigned long t)
@@ -888,17 +1013,18 @@ static void requeue_inode(struct inode_wb_link *iwbl, struct bdi_writeback *wb,
 /*
  * Write out an inode and its dirty pages. Do not update the writeback list
  * linkage. That is left to the caller. The caller is also responsible for
- * setting I_SYNC flag and calling inode_sync_complete() to clear it.
+ * setting I_SYNC flag and calling iwbl_sync_complete() to clear it.
  */
 static int
 __writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
 {
 	struct address_space *mapping = inode->i_mapping;
+	struct inode_wb_link *iwbl = &inode->i_wb_link;
 	long nr_to_write = wbc->nr_to_write;
 	unsigned dirty;
 	int ret;
 
-	WARN_ON(!(inode->i_state & I_SYNC));
+	WARN_ON(!iwbl_test_sync(iwbl));
 
 	trace_writeback_single_inode_start(inode, wbc, nr_to_write);
 
@@ -976,7 +1102,7 @@ writeback_single_inode(struct inode *inode, struct bdi_writeback *wb,
 	else
 		WARN_ON(inode->i_state & I_WILL_FREE);
 
-	if (inode->i_state & I_SYNC) {
+	if (iwbl_test_sync(iwbl)) {
 		if (wbc->sync_mode != WB_SYNC_ALL)
 			goto out;
 		/*
@@ -984,9 +1110,9 @@ writeback_single_inode(struct inode *inode, struct bdi_writeback *wb,
 		 * inode reference or inode has I_WILL_FREE set, it cannot go
 		 * away under us.
 		 */
-		__inode_wait_for_writeback(inode);
+		iwbl_wait_for_writeback(iwbl);
 	}
-	WARN_ON(inode->i_state & I_SYNC);
+	WARN_ON(iwbl_test_sync(iwbl));
 	/*
 	 * Skip inode if it is clean and we have no outstanding writeback in
 	 * WB_SYNC_ALL mode. We don't want to mess with writeback lists in this
@@ -999,7 +1125,7 @@ writeback_single_inode(struct inode *inode, struct bdi_writeback *wb,
 	    (wbc->sync_mode != WB_SYNC_ALL ||
 	     !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_WRITEBACK)))
 		goto out;
-	inode->i_state |= I_SYNC;
+	iwbl_set_sync(iwbl, inode);
 	spin_unlock(&inode->i_lock);
 
 	ret = __writeback_single_inode(inode, wbc);
@@ -1013,7 +1139,7 @@ writeback_single_inode(struct inode *inode, struct bdi_writeback *wb,
 	if (!(inode->i_state & I_DIRTY))
 		iwbl_del_locked(iwbl, wb);
 	spin_unlock(&wb->list_lock);
-	inode_sync_complete(inode);
+	iwbl_sync_complete(iwbl);
 out:
 	spin_unlock(&inode->i_lock);
 	return ret;
@@ -1107,7 +1233,7 @@ static long writeback_sb_inodes(struct super_block *sb,
 			redirty_tail(iwbl, wb);
 			continue;
 		}
-		if ((inode->i_state & I_SYNC) && wbc.sync_mode != WB_SYNC_ALL) {
+		if (iwbl_test_sync(iwbl) && wbc.sync_mode != WB_SYNC_ALL) {
 			/*
 			 * If this inode is locked for writeback and we are not
 			 * doing writeback-for-data-integrity, move it to
@@ -1129,14 +1255,14 @@ static long writeback_sb_inodes(struct super_block *sb,
 		 * are doing WB_SYNC_NONE writeback. So this catches only the
 		 * WB_SYNC_ALL case.
 		 */
-		if (inode->i_state & I_SYNC) {
+		if (iwbl_test_sync(iwbl)) {
 			/* Wait for I_SYNC. This function drops i_lock... */
-			inode_sleep_on_writeback(inode);
+			iwbl_sleep_on_writeback(iwbl);
 			/* Inode may be gone, start again */
 			spin_lock(&wb->list_lock);
 			continue;
 		}
-		inode->i_state |= I_SYNC;
+		iwbl_set_sync(iwbl, inode);
 		spin_unlock(&inode->i_lock);
 
 		write_chunk = writeback_chunk_size(wb, work);
@@ -1156,7 +1282,7 @@ static long writeback_sb_inodes(struct super_block *sb,
 		if (!(inode->i_state & I_DIRTY))
 			wrote++;
 		requeue_inode(iwbl, wb, &wbc);
-		inode_sync_complete(inode);
+		iwbl_sync_complete(iwbl);
 		spin_unlock(&inode->i_lock);
 		cond_resched_lock(&wb->list_lock);
 		/*
@@ -1356,7 +1482,7 @@ static long wb_writeback(struct bdi_writeback *wb,
 			spin_lock(&inode->i_lock);
 			spin_unlock(&wb->list_lock);
 			/* This function drops i_lock... */
-			inode_sleep_on_writeback(inode);
+			iwbl_sleep_on_writeback(iwbl);
 			spin_lock(&wb->list_lock);
 		}
 	}
@@ -1648,7 +1774,7 @@ void mark_inode_dirty_dctx(struct dirty_context *dctx, int flags)
 		 * The unlocker will place the inode on the appropriate
 		 * superblock list, based upon its state.
 		 */
-		if (inode->i_state & I_SYNC)
+		if (iwbl_test_sync(iwbl))
 			goto out_unlock_inode;
 
 		/*
diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h
index e448edc..e3b18f3 100644
--- a/include/linux/backing-dev-defs.h
+++ b/include/linux/backing-dev-defs.h
@@ -47,8 +47,15 @@ enum wb_stat_item {
  * IWBL_* flags which occupy the lower bits of inode_wb_link->data.  The
  * upper bits point to bdi_writeback, so the number of these flags
  * determines the minimum alignment of bdi_writeback.
+ *
+ * IWBL_SYNC
+ *
+ *  Tracks whether writeback is in progress for an iwbl.  If this bit is
+ *  set for any iwbl on an inode, the inode's I_SYNC is set too.
  */
 enum {
+	IWBL_SYNC		= 0,
+
 	IWBL_FLAGS_BITS,
 	IWBL_FLAGS_MASK		= (1UL << IWBL_FLAGS_BITS) - 1,
 };
diff --git a/include/linux/fs.h b/include/linux/fs.h
index b394821..4c22824 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -625,6 +625,9 @@ struct inode {
 #ifdef CONFIG_IMA
 	atomic_t		i_readcount; /* struct files open RO */
 #endif
+#ifdef CONFIG_CGROUP_WRITEBACK
+	unsigned int		i_nr_syncs;
+#endif
 	const struct file_operations	*i_fop;	/* former ->i_op->default_file_ops */
 	struct file_lock	*i_flock;
 	struct address_space	i_data;
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index e4db465..1399ad6 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -539,6 +539,7 @@ static void cgwb_exit(struct bdi_writeback *wb)
 	spin_lock_irqsave(&wb->bdi->icgwbls_lock, flags);
 	list_for_each_entry_safe(icgwbl, next, &wb->icgwbls, wb_node) {
 		WARN_ON_ONCE(!list_empty(&icgwbl->iwbl.dirty_list));
+		WARN_ON_ONCE(test_bit(IWBL_SYNC, &icgwbl->iwbl.data));
 		hlist_del_rcu(&icgwbl->inode_node);
 		list_del(&icgwbl->wb_node);
 		kfree_rcu(icgwbl, rcu);
-- 
2.1.0

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ