linux-kernel - [PATCH 3/5] backing-dev: replace private thread pool with workqueue

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <1283777182-11426-4-git-send-email-tj@kernel.org>
Date:	Mon,  6 Sep 2010 14:46:20 +0200
From:	Tejun Heo <tj@...nel.org>
To:	jaxboe@...ionio.com, linux-kernel@...r.kernel.org, hch@....de
Cc:	Tejun Heo <tj@...nel.org>
Subject: [PATCH 3/5] backing-dev: replace private thread pool with workqueue

bdi writeback has been using a private thread pool.  Now that
workqueue can provide flexible concurrency, drop the dedicated thread
pool and use workqueue instead.

bdi_writeback->task and ->wakeup_timer are replaced with ->work and
->timer.  A separate timer is used instead of delayed_work because
mixing immediate and delayed queueing doesn't work too well with
delayed_work.

bdi_wq is added to execute bdi_writeback->work.  It's an unbound
freezeable workqueue with a rescuer.  As the thread pool is now managed
by the workqueue code, the special forker thread is no longer
necessary and is removed along with bdi_forker_thread().

The work function for bdi_writeback->work is bdi_work_fn(), which is
mostly equivalent to a single iteration of bdi_writeback_thread() -
IOW, it calls wb_do_writeback() until bdi->work_list is exhausted and
reschedules itself if there is dirty data to write out.

One special provision is that the writeback work doesn't run
bdi->work_list if it's being executed from the rescuer.  Instead, it
just calls bdi_flush_io() and reschedules itself if necessary.  This
is basically the same behavior bdi_forker_thread() had when it failed
to create a thread for a pending bdi.  Writebacks are issued but not
waited upon so that other bdi's have a chance to clean and free
memory.

Trace points which make sense without change are preserved.  The next
patch will fix up the rest.

bdi_wakeup_thread_delayed() is renamed to bdi_delayed_writeback().

Signed-off-by: Tejun Heo <tj@...nel.org>
---
 fs/fs-writeback.c           |   91 ++--------------
 include/linux/backing-dev.h |   17 +--
 mm/backing-dev.c            |  249 ++++++++++---------------------------------
 3 files changed, 72 insertions(+), 285 deletions(-)

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 7d9d06b..1e9b807 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -78,16 +78,7 @@ static void bdi_queue_work(struct backing_dev_info *bdi,
 
 	spin_lock_bh(&bdi->wb_lock);
 	list_add_tail(&work->list, &bdi->work_list);
-	if (bdi->wb.task) {
-		wake_up_process(bdi->wb.task);
-	} else {
-		/*
-		 * The bdi thread isn't there, wake up the forker thread which
-		 * will create and run it.
-		 */
-		trace_writeback_nothread(bdi, work);
-		wake_up_process(default_backing_dev_info.wb.task);
-	}
+	queue_work(bdi_wq, &bdi->wb.work);
 	spin_unlock_bh(&bdi->wb_lock);
 }
 
@@ -99,14 +90,12 @@ __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
 
 	/*
 	 * This is WB_SYNC_NONE writeback, so if allocation fails just
-	 * wakeup the thread for old dirty data writeback
+	 * queue the work for old dirty data writeback.
 	 */
 	work = kzalloc(sizeof(*work), GFP_ATOMIC);
 	if (!work) {
-		if (bdi->wb.task) {
-			trace_writeback_nowork(bdi);
-			wake_up_process(bdi->wb.task);
-		}
+		trace_writeback_nowork(bdi);
+		queue_work(bdi_wq, &bdi->wb.work);
 		return;
 	}
 
@@ -773,70 +762,6 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
 }
 
 /*
- * Handle writeback of dirty data for the device backed by this bdi. Also
- * wakes up periodically and does kupdated style flushing.
- */
-int bdi_writeback_thread(void *data)
-{
-	struct bdi_writeback *wb = data;
-	struct backing_dev_info *bdi = wb->bdi;
-	long pages_written;
-
-	current->flags |= PF_FLUSHER | PF_SWAPWRITE;
-	set_freezable();
-	wb->last_active = jiffies;
-
-	/*
-	 * Our parent may run at a different priority, just set us to normal
-	 */
-	set_user_nice(current, 0);
-
-	trace_writeback_thread_start(bdi);
-
-	while (!kthread_should_stop()) {
-		/*
-		 * Remove own delayed wake-up timer, since we are already awake
-		 * and we'll take care of the preriodic write-back.
-		 */
-		del_timer(&wb->wakeup_timer);
-
-		pages_written = wb_do_writeback(wb, 0);
-
-		trace_writeback_pages_written(pages_written);
-
-		if (pages_written)
-			wb->last_active = jiffies;
-
-		set_current_state(TASK_INTERRUPTIBLE);
-		if (!list_empty(&bdi->work_list)) {
-			__set_current_state(TASK_RUNNING);
-			continue;
-		}
-
-		if (wb_has_dirty_io(wb) && dirty_writeback_interval)
-			schedule_timeout(msecs_to_jiffies(dirty_writeback_interval * 10));
-		else {
-			/*
-			 * We have nothing to do, so can go sleep without any
-			 * timeout and save power. When a work is queued or
-			 * something is made dirty - we will be woken up.
-			 */
-			schedule();
-		}
-
-		try_to_freeze();
-	}
-
-	/* Flush any work that raced with us exiting */
-	if (!list_empty(&bdi->work_list))
-		wb_do_writeback(wb, 1);
-
-	trace_writeback_thread_stop(bdi);
-	return 0;
-}
-
-
-/*
  * Start writeback of `nr_pages' pages.  If `nr_pages' is zero, write back
  * the whole world.
  */
@@ -911,7 +836,7 @@ void __mark_inode_dirty(struct inode *inode, int flags)
 {
 	struct super_block *sb = inode->i_sb;
 	struct backing_dev_info *bdi = NULL;
-	bool wakeup_bdi = false;
+	bool delayed_wb = false;
 
 	/*
 	 * Don't do this for I_DIRTY_PAGES - that doesn't actually
@@ -978,7 +903,7 @@ void __mark_inode_dirty(struct inode *inode, int flags)
 				 * write-back happens later.
 				 */
 				if (!wb_has_dirty_io(&bdi->wb))
-					wakeup_bdi = true;
+					delayed_wb = true;
 			}
 
 			inode->dirtied_when = jiffies;
@@ -988,8 +913,8 @@ void __mark_inode_dirty(struct inode *inode, int flags)
 out:
 	spin_unlock(&inode_lock);
 
-	if (wakeup_bdi)
-		bdi_wakeup_thread_delayed(bdi);
+	if (delayed_wb)
+		bdi_delayed_writeback(bdi);
 }
 EXPORT_SYMBOL(__mark_inode_dirty);
 
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index 97842ab..ebeed5b 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -13,8 +13,8 @@
 #include <linux/proportions.h>
 #include <linux/kernel.h>
 #include <linux/fs.h>
-#include <linux/sched.h>
 #include <linux/timer.h>
+#include <linux/workqueue.h>
 #include <linux/writeback.h>
 #include <asm/atomic.h>
 
@@ -26,7 +26,6 @@ struct dentry;
  * Bits in backing_dev_info.state
  */
 enum bdi_state {
-	BDI_pending,		/* On its way to being activated */
 	BDI_wb_alloc,		/* Default embedded wb allocated */
 	BDI_async_congested,	/* The async (write) queue is getting full */
 	BDI_sync_congested,	/* The sync queue is getting full */
@@ -49,10 +48,9 @@ struct bdi_writeback {
 	struct backing_dev_info *bdi;	/* our parent bdi */
 
 	unsigned long last_old_flush;	/* last old data flush */
-	unsigned long last_active;	/* last time bdi thread was active */
 
-	struct task_struct *task;	/* writeback thread */
-	struct timer_list wakeup_timer; /* used for delayed bdi thread wakeup */
+	struct work_struct work;	/* writeback work */
+	struct timer_list timer;	/* used for delayed bdi execution */
 	struct list_head b_dirty;	/* dirty inodes */
 	struct list_head b_io;		/* parked for writeback */
 	struct list_head b_more_io;	/* parked for more writeback */
@@ -103,13 +101,13 @@ void bdi_unregister(struct backing_dev_info *bdi);
 int bdi_setup_and_register(struct backing_dev_info *, char *, unsigned int);
 void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages);
 void bdi_start_background_writeback(struct backing_dev_info *bdi);
-int bdi_writeback_thread(void *data);
 int bdi_has_dirty_io(struct backing_dev_info *bdi);
 void bdi_arm_supers_timer(void);
-void bdi_wakeup_thread_delayed(struct backing_dev_info *bdi);
+void bdi_delayed_writeback(struct backing_dev_info *bdi);
 
 extern spinlock_t bdi_lock;
 extern struct list_head bdi_list;
+extern struct workqueue_struct *bdi_wq;
 
 static inline int wb_has_dirty_io(struct bdi_writeback *wb)
 {
@@ -308,11 +306,6 @@ static inline bool bdi_cap_swap_backed(struct backing_dev_info *bdi)
 	return bdi->capabilities & BDI_CAP_SWAP_BACKED;
 }
 
-static inline bool bdi_cap_flush_forker(struct backing_dev_info *bdi)
-{
-	return bdi == &default_backing_dev_info;
-}
-
 static inline bool mapping_cap_writeback_dirty(struct address_space *mapping)
 {
 	return bdi_cap_writeback_dirty(mapping->backing_dev_info);
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index eaa4a5b..e874916 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -43,6 +43,7 @@ static struct class *bdi_class;
 DEFINE_SPINLOCK(bdi_lock);
 LIST_HEAD(bdi_list);
 LIST_HEAD(bdi_pending_list);
+struct workqueue_struct *bdi_wq;
 
 static struct task_struct *sync_supers_tsk;
 static struct timer_list sync_supers_timer;
@@ -234,6 +235,10 @@ static int __init default_bdi_init(void)
 {
 	int err;
 
+	bdi_wq = alloc_workqueue("bdi",
+				 WQ_UNBOUND | WQ_FREEZEABLE | WQ_RESCUER, 0);
+	BUG_ON(!bdi_wq);
+
 	sync_supers_tsk = kthread_run(bdi_sync_supers, NULL, "sync_supers");
 	BUG_ON(IS_ERR(sync_supers_tsk));
 
@@ -305,24 +310,11 @@ static void sync_supers_timer_fn(unsigned long unused)
 	bdi_arm_supers_timer();
 }
 
-static void wakeup_timer_fn(unsigned long data)
+static void bdi_timer_fn(unsigned long data)
 {
 	struct backing_dev_info *bdi = (struct backing_dev_info *)data;
 
-	spin_lock_bh(&bdi->wb_lock);
-	if (bdi->wb.task) {
-		trace_writeback_wake_thread(bdi);
-		wake_up_process(bdi->wb.task);
-	} else {
-		/*
-		 * When bdi tasks are inactive for long time, they are killed.
-		 * In this case we have to wake-up the forker thread which
-		 * should create and run the bdi thread.
-		 */
-		trace_writeback_wake_forker_thread(bdi);
-		wake_up_process(default_backing_dev_info.wb.task);
-	}
-	spin_unlock_bh(&bdi->wb_lock);
+	queue_work(bdi_wq, &bdi->wb.work);
 }
 
 /*
@@ -336,165 +328,56 @@ static void wakeup_timer_fn(unsigned long data)
  * fast-path (used by '__mark_inode_dirty()'), so we save few context switches
  * by delaying the wake-up.
  */
-void bdi_wakeup_thread_delayed(struct backing_dev_info *bdi)
+void bdi_delayed_writeback(struct backing_dev_info *bdi)
 {
 	unsigned long timeout;
 
 	timeout = msecs_to_jiffies(dirty_writeback_interval * 10);
-	mod_timer(&bdi->wb.wakeup_timer, jiffies + timeout);
+	mod_timer(&bdi->wb.timer, jiffies + timeout);
 }
 
 /*
- * Calculate the longest interval (jiffies) bdi threads are allowed to be
- * inactive.
+ * Handles writeback of dirty data for the device backed by this bdi.
+ * Also schedules itself periodically for kupdated style flushing.
  */
-static unsigned long bdi_longest_inactive(void)
+static void bdi_work_fn(struct work_struct *work)
 {
-	unsigned long interval;
+	struct bdi_writeback *wb =
+		container_of(work, struct bdi_writeback, work);
+	struct backing_dev_info *bdi = wb->bdi;
+	long pages_written;
 
-	interval = msecs_to_jiffies(dirty_writeback_interval * 10);
-	return max(5UL * 60 * HZ, interval);
-}
+	WARN(!test_bit(BDI_registered, &bdi->state),
+	     "bdi %p/%s is not registered!\n", bdi, bdi->name);
 
-static int bdi_forker_thread(void *ptr)
-{
-	struct bdi_writeback *me = ptr;
+	/*
+	 * Remove own delayed timer, since we are already running and
+	 * we'll take care of the periodic write-back.
+	 */
+	del_timer(&wb->timer);
 
 	current->flags |= PF_FLUSHER | PF_SWAPWRITE;
-	set_freezable();
 
 	/*
-	 * Our parent may run at a different priority, just set us to normal
+	 * Writeback works might block and we don't want to hog the
+	 * rescuer.  If we are running off the rescuer, skip works,
+	 * fire off writebacks and yield to other bdi's.
 	 */
-	set_user_nice(current, 0);
-
-	for (;;) {
-		struct task_struct *task = NULL;
-		struct backing_dev_info *bdi;
-		enum {
-			NO_ACTION,   /* Nothing to do */
-			FORK_THREAD, /* Fork bdi thread */
-			KILL_THREAD, /* Kill inactive bdi thread */
-		} action = NO_ACTION;
-
-		/*
-		 * Temporary measure, we want to make sure we don't see
-		 * dirty data on the default backing_dev_info
-		 */
-		if (wb_has_dirty_io(me) || !list_empty(&me->bdi->work_list)) {
-			del_timer(&me->wakeup_timer);
-			wb_do_writeback(me, 0);
-		}
-
-		spin_lock_bh(&bdi_lock);
-		set_current_state(TASK_INTERRUPTIBLE);
-
-		list_for_each_entry(bdi, &bdi_list, bdi_list) {
-			bool have_dirty_io;
-
-			if (!bdi_cap_writeback_dirty(bdi) ||
-			     bdi_cap_flush_forker(bdi))
-				continue;
-
-			WARN(!test_bit(BDI_registered, &bdi->state),
-			     "bdi %p/%s is not registered!\n", bdi, bdi->name);
-
-			have_dirty_io = !list_empty(&bdi->work_list) ||
-					wb_has_dirty_io(&bdi->wb);
-
-			/*
-			 * If the bdi has work to do, but the thread does not
-			 * exist - create it.
-			 */
-			if (!bdi->wb.task && have_dirty_io) {
-				/*
-				 * Set the pending bit - if someone will try to
-				 * unregister this bdi - it'll wait on this bit.
-				 */
-				set_bit(BDI_pending, &bdi->state);
-				action = FORK_THREAD;
-				break;
-			}
-
-			spin_lock(&bdi->wb_lock);
-
-			/*
-			 * If there is no work to do and the bdi thread was
-			 * inactive long enough - kill it. The wb_lock is taken
-			 * to make sure no-one adds more work to this bdi and
-			 * wakes the bdi thread up.
-			 */
-			if (bdi->wb.task && !have_dirty_io &&
-			    time_after(jiffies, bdi->wb.last_active +
-						bdi_longest_inactive())) {
-				task = bdi->wb.task;
-				bdi->wb.task = NULL;
-				spin_unlock(&bdi->wb_lock);
-				set_bit(BDI_pending, &bdi->state);
-				action = KILL_THREAD;
-				break;
-			}
-			spin_unlock(&bdi->wb_lock);
-		}
-		spin_unlock_bh(&bdi_lock);
-
-		/* Keep working if default bdi still has things to do */
-		if (!list_empty(&me->bdi->work_list))
-			__set_current_state(TASK_RUNNING);
-
-		switch (action) {
-		case FORK_THREAD:
-			__set_current_state(TASK_RUNNING);
-			task = kthread_run(bdi_writeback_thread, &bdi->wb, "flush-%s",
-					   dev_name(bdi->dev));
-			if (IS_ERR(task)) {
-				/*
-				 * If thread creation fails, force writeout of
-				 * the bdi from the thread.
-				 */
-				bdi_flush_io(bdi);
-			} else {
-				/*
-				 * The spinlock makes sure we do not lose
-				 * wake-ups when racing with 'bdi_queue_work()'.
-				 */
-				spin_lock_bh(&bdi->wb_lock);
-				bdi->wb.task = task;
-				spin_unlock_bh(&bdi->wb_lock);
-			}
-			break;
-
-		case KILL_THREAD:
-			__set_current_state(TASK_RUNNING);
-			kthread_stop(task);
-			break;
-
-		case NO_ACTION:
-			if (!wb_has_dirty_io(me) || !dirty_writeback_interval)
-				/*
-				 * There are no dirty data. The only thing we
-				 * should now care about is checking for
-				 * inactive bdi threads and killing them. Thus,
-				 * let's sleep for longer time, save energy and
-				 * be friendly for battery-driven devices.
-				 */
-				schedule_timeout(bdi_longest_inactive());
-			else
-				schedule_timeout(msecs_to_jiffies(dirty_writeback_interval * 10));
-			try_to_freeze();
-			/* Back to the main loop */
-			continue;
-		}
-
-		/*
-		 * Clear pending bit and wakeup anybody waiting to tear us down.
-		 */
-		clear_bit(BDI_pending, &bdi->state);
-		smp_mb__after_clear_bit();
-		wake_up_bit(&bdi->state, BDI_pending);
+	if (likely(!workqueue_on_rescuer(bdi_wq))) {
+		do {
+			pages_written = wb_do_writeback(wb, 0);
+			trace_writeback_pages_written(pages_written);
+		} while (!list_empty(&bdi->work_list));
+	} else {
+		bdi_flush_io(bdi);
+		if (!list_empty(&bdi->work_list))
+			queue_work(bdi_wq, work);
 	}
 
-	return 0;
+	if (wb_has_dirty_io(wb) && dirty_writeback_interval)
+		bdi_delayed_writeback(bdi);
+
+	current->flags &= ~(PF_FLUSHER | PF_SWAPWRITE);
 }
 
 /*
@@ -526,20 +409,6 @@ int bdi_register(struct backing_dev_info *bdi, struct device *parent,
 
 	bdi->dev = dev;
 
-	/*
-	 * Just start the forker thread for our default backing_dev_info,
-	 * and add other bdi's to the list. They will get a thread created
-	 * on-demand when they need it.
-	 */
-	if (bdi_cap_flush_forker(bdi)) {
-		struct bdi_writeback *wb = &bdi->wb;
-
-		wb->task = kthread_run(bdi_forker_thread, wb, "bdi-%s",
-						dev_name(dev));
-		if (IS_ERR(wb->task))
-			return PTR_ERR(wb->task);
-	}
-
 	bdi_debug_register(bdi, dev_name(dev));
 	set_bit(BDI_registered, &bdi->state);
 
@@ -563,30 +432,25 @@ EXPORT_SYMBOL(bdi_register_dev);
  */
 static void bdi_wb_shutdown(struct backing_dev_info *bdi)
 {
+	unsigned int saved_flags = current->flags & (PF_FLUSHER | PF_SWAPWRITE);
+
 	if (!bdi_cap_writeback_dirty(bdi))
 		return;
 
 	/*
-	 * Make sure nobody finds us on the bdi_list anymore
+	 * Make sure nobody finds us on the bdi_list anymore and
+	 * writeback work isn't running.
 	 */
 	bdi_remove_from_list(bdi);
+	cancel_work_sync(&bdi->wb.work);
 
-	/*
-	 * If setup is pending, wait for that to complete first
-	 */
-	wait_on_bit(&bdi->state, BDI_pending, bdi_sched_wait,
-			TASK_UNINTERRUPTIBLE);
+	/* Flush any work that raced with shutdown */
+	current->flags |= PF_FLUSHER | PF_SWAPWRITE;
 
-	/*
-	 * Finally, kill the kernel thread. We don't need to be RCU
-	 * safe anymore, since the bdi is gone from visibility. Force
-	 * unfreeze of the thread before calling kthread_stop(), otherwise
-	 * it would never exet if it is currently stuck in the refrigerator.
-	 */
-	if (bdi->wb.task) {
-		thaw_process(bdi->wb.task);
-		kthread_stop(bdi->wb.task);
-	}
+	wb_do_writeback(&bdi->wb, 1);
+
+	current->flags &= ~(PF_FLUSHER | PF_SWAPWRITE);
+	current->flags |= saved_flags;
 }
 
 /*
@@ -609,10 +473,9 @@ void bdi_unregister(struct backing_dev_info *bdi)
 	if (bdi->dev) {
 		trace_writeback_bdi_unregister(bdi);
 		bdi_prune_sb(bdi);
-		del_timer_sync(&bdi->wb.wakeup_timer);
+		del_timer_sync(&bdi->wb.timer);
 
-		if (!bdi_cap_flush_forker(bdi))
-			bdi_wb_shutdown(bdi);
+		bdi_wb_shutdown(bdi);
 		bdi_debug_unregister(bdi);
 		device_unregister(bdi->dev);
 		bdi->dev = NULL;
@@ -629,7 +492,13 @@ static void bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi)
 	INIT_LIST_HEAD(&wb->b_dirty);
 	INIT_LIST_HEAD(&wb->b_io);
 	INIT_LIST_HEAD(&wb->b_more_io);
-	setup_timer(&wb->wakeup_timer, wakeup_timer_fn, (unsigned long)bdi);
+	/*
+	 * work and separate timer are used instead of delayed_work
+	 * because mixing immediate and delayed queueing doesn't work
+	 * too well with delayed_work.
+	 */
+	INIT_WORK(&wb->work, bdi_work_fn);
+	setup_timer(&wb->timer, bdi_timer_fn, (unsigned long)bdi);
 }
 
 int bdi_init(struct backing_dev_info *bdi)
-- 
1.7.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/