lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20090407081605.GP5178@kernel.dk>
Date:	Tue, 7 Apr 2009 10:16:05 +0200
From:	Jens Axboe <jens.axboe@...cle.com>
To:	Andrew Morton <akpm@...ux-foundation.org>
Cc:	Theodore Ts'o <tytso@....edu>,
	Linux Kernel Developers List <linux-kernel@...r.kernel.org>,
	Ext4 Developers List <linux-ext4@...r.kernel.org>,
	jack@...e.cz
Subject: Re: [PATCH 1/3] block_write_full_page: Use synchronous writes for
	WBC_SYNC_ALL writebacks

On Tue, Apr 07 2009, Jens Axboe wrote:
> BTW, with the increased number of sync IO and unplugging, it makes sense
> to soon look into some finer granularity of plugging. If we didn't have
> so many single page submission paths it would not be as big a problem,
> but we do. And since they still persist so many years after we added
> functionality to pass bigger IOs, it likely won't be much better in the
> future either.
> 
> So we can either look into doing per io context plugging, or doing
> something similar to:
> 
> plugctx = blk_get_plug_context();
>         ...
>         submit_bio_plug(rw, bio, plugctx);
>         ...
>         submit_bio_plug(rw, bio, plugctx);
>         ...
> blk_submit_plug_context(plugctx);
> 
> and pass that down through wbc, perhaps. Dunno, just a thought.
> Basically a work-around for not having a dedicated writepages() that
> does the right thing (ext3 anyone?).

Here's a quick mockup. It compiles, but that's about all the usage it
has seen so far :-)

diff --git a/block/blk-core.c b/block/blk-core.c
index 43fdedc..5cf416c 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1567,6 +1567,17 @@ void submit_bio(int rw, struct bio *bio)
 }
 EXPORT_SYMBOL(submit_bio);
 
+void submit_bio_plug(int rw, struct bio *bio, struct blk_plug_ctx *ctx)
+{
+	if (ctx) {	/* plugged: queue the bio on @ctx instead of issuing it */
+		bio->bi_rw |= rw;	/* remember @rw; it is replayed at flush time */
+		bio->bi_next = ctx->bio_list;	/* push on the head: list is LIFO */
+		ctx->bio_list = bio;	/* NOTE(review): flush pops the head, so order is reversed */
+	} else		/* no plug context: same as a plain submit_bio() */
+		submit_bio(rw, bio);
+}
+EXPORT_SYMBOL(submit_bio_plug);
+
 /**
  * blk_rq_check_limits - Helper function to check a request for the queue limit
  * @q:  the queue
diff --git a/block/blk-ioc.c b/block/blk-ioc.c
index 012f065..e4313e3 100644
--- a/block/blk-ioc.c
+++ b/block/blk-ioc.c
@@ -101,6 +101,8 @@ struct io_context *alloc_io_context(gfp_t gfp_flags, int node)
 		INIT_RADIX_TREE(&ret->radix_root, GFP_ATOMIC | __GFP_HIGH);
 		INIT_HLIST_HEAD(&ret->cic_list);
 		ret->ioc_data = NULL;
+		ret->plug_ctx.bio_list = NULL;
+		ret->plug_ctx.state = 0;
 	}
 
 	return ret;
@@ -171,6 +173,55 @@ void copy_io_context(struct io_context **pdst, struct io_context **psrc)
 }
 EXPORT_SYMBOL(copy_io_context);
 
+struct blk_plug_ctx *blk_get_plug_context(void)	/* NULL if unavailable */
+{
+	struct io_context *ioc;
+
+	ioc = current_io_context(GFP_ATOMIC, -1);
+	if (!ioc)
+		return NULL;
+
+	if (!test_and_set_bit_lock(0, &ioc->plug_ctx.state))	/* bit 0 = claimed */
+		return &ioc->plug_ctx;
+
+	return NULL;	/* already claimed; a NULL ctx makes submit_bio_plug() submit directly */
+}
+
+static void __blk_submit_plug_context(struct blk_plug_ctx *ctx)
+{
+	struct block_device *bdev = NULL;	/* last device that asked for an unplug */
+	struct bio *bio;
+
+	while ((bio = ctx->bio_list) != NULL) {	/* pop bios off the head until empty */
+		ctx->bio_list = bio->bi_next;
+		bio->bi_next = NULL;
+		if (bdev && bdev != bio->bi_bdev)	/* moving to another device */
+			blk_unplug(bdev_get_queue(bdev));
+		if (bio_unplug(bio))
+			bdev = bio->bi_bdev;
+		/* BIO_RW_UNPLUG is a bi_rw bit (not bi_flags); unplug per batch instead */
+		bio->bi_rw &= ~(1 << BIO_RW_UNPLUG);
+		submit_bio(bio->bi_rw, bio);
+	}
+	/* the loop only unplugs on a device change, so kick the last one here */
+	if (bdev)
+		blk_unplug(bdev_get_queue(bdev));
+}
+
+void blk_submit_plug_context(struct blk_plug_ctx *ctx)
+{
+	if (ctx) {	/* NULL is accepted and ignored */
+		__blk_submit_plug_context(ctx);	/* issue every queued bio */
+		clear_bit_unlock(0, &ctx->state);	/* release bit 0 taken by blk_get_plug_context() */
+	}
+}
+
+void blk_flush_plug_context(struct blk_plug_ctx *ctx)
+{
+	if (ctx)	/* NULL is accepted and ignored */
+		__blk_submit_plug_context(ctx);	/* drain the list; ->state is not touched */
+}
+
 static int __init blk_ioc_init(void)
 {
 	iocontext_cachep = kmem_cache_create("blkdev_ioc",
diff --git a/fs/buffer.c b/fs/buffer.c
index 6e35762..2ed21b8 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -1698,7 +1698,7 @@ static int __block_write_full_page(struct inode *inode, struct page *page,
 	do {
 		struct buffer_head *next = bh->b_this_page;
 		if (buffer_async_write(bh)) {
-			submit_bh(write_op, bh);
+			submit_bh_plug(write_op, bh, wbc->plug);
 			nr_underway++;
 		}
 		bh = next;
@@ -2884,8 +2884,10 @@ static void end_bio_bh_io_sync(struct bio *bio, int err)
 	bio_put(bio);
 }
 
-int submit_bh(int rw, struct buffer_head * bh)
+static int __submit_bh(int rw, struct buffer_head * bh,
+			struct blk_plug_ctx *ctx)
 {
+	gfp_t gfp = ctx ? GFP_ATOMIC : GFP_NOIO;
 	struct bio *bio;
 	int ret = 0;
 
@@ -2910,7 +2912,12 @@ int submit_bh(int rw, struct buffer_head * bh)
 	 * from here on down, it's all bio -- do the initial mapping,
 	 * submit_bio -> generic_make_request may further map this bio around
 	 */
-	bio = bio_alloc(GFP_NOIO, 1);
+	bio = bio_alloc(gfp, 1);	/* GFP_ATOMIC when plugging, so this may fail */
+	if (!bio) {
+		blk_flush_plug_context(ctx);	/* issue queued bios before retrying */
+		bio = bio_alloc(GFP_NOIO, 1);	/* keep the result: bio is used below */
+		ctx = NULL;	/* submit this bio directly rather than queueing it */
+	}
 
 	bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9);
 	bio->bi_bdev = bh->b_bdev;
@@ -2926,7 +2933,8 @@ int submit_bh(int rw, struct buffer_head * bh)
 	bio->bi_private = bh;
 
 	bio_get(bio);
-	submit_bio(rw, bio);
+
+	submit_bio_plug(rw, bio, ctx);
 
 	if (bio_flagged(bio, BIO_EOPNOTSUPP))
 		ret = -EOPNOTSUPP;
@@ -2935,6 +2943,16 @@ int submit_bh(int rw, struct buffer_head * bh)
 	return ret;
 }
 
+int submit_bh(int rw, struct buffer_head *bh)
+{
+	return __submit_bh(rw, bh, NULL);	/* NULL ctx: no plug context is used */
+}
+
+int submit_bh_plug(int rw, struct buffer_head *bh, struct blk_plug_ctx *ctx)
+{
+	return __submit_bh(rw, bh, ctx);	/* @ctx may be NULL, as with submit_bh() */
+}
+
 /**
  * ll_rw_block: low-level access to block devices (DEPRECATED)
  * @rw: whether to %READ or %WRITE or %SWRITE or maybe %READA (readahead)
diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index 7b73bb8..a8eec18 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -183,6 +183,7 @@ void __lock_buffer(struct buffer_head *bh);
 void ll_rw_block(int, int, struct buffer_head * bh[]);
 int sync_dirty_buffer(struct buffer_head *bh);
 int submit_bh(int, struct buffer_head *);
+int submit_bh_plug(int, struct buffer_head *, struct blk_plug_ctx *);
 void write_boundary_block(struct block_device *bdev,
 			sector_t bblock, unsigned blocksize);
 int bh_uptodate_or_lock(struct buffer_head *bh);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index bce40a2..8a0c4b5 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2117,7 +2117,9 @@ extern void file_move(struct file *f, struct list_head *list);
 extern void file_kill(struct file *f);
 #ifdef CONFIG_BLOCK
 struct bio;
+struct blk_plug_ctx;
 extern void submit_bio(int, struct bio *);
+extern void submit_bio_plug(int, struct bio *, struct blk_plug_ctx *);
 extern int bdev_read_only(struct block_device *);
 #endif
 extern int set_blocksize(struct block_device *, int);
diff --git a/include/linux/iocontext.h b/include/linux/iocontext.h
index 08b987b..38c8a2c 100644
--- a/include/linux/iocontext.h
+++ b/include/linux/iocontext.h
@@ -3,6 +3,7 @@
 
 #include <linux/radix-tree.h>
 #include <linux/rcupdate.h>
+#include <linux/list.h>
 
 /*
  * This is the per-process anticipatory I/O scheduler state.
@@ -59,6 +60,11 @@ struct cfq_io_context {
 	struct rcu_head rcu_head;
 };
 
+struct blk_plug_ctx {
+	struct bio *bio_list;	/* head of the bi_next-linked list of queued bios */
+	unsigned long state;	/* bit 0 is set while the context is claimed */
+};
+
 /*
  * I/O subsystem state of the associated processes.  It is refcounted
  * and kmalloc'ed. These could be shared between processes.
@@ -83,6 +89,8 @@ struct io_context {
 	struct radix_tree_root radix_root;
 	struct hlist_head cic_list;
 	void *ioc_data;
+
+	struct blk_plug_ctx plug_ctx;
 };
 
 static inline struct io_context *ioc_task_link(struct io_context *ioc)
@@ -105,7 +113,17 @@ void exit_io_context(void);
 struct io_context *get_io_context(gfp_t gfp_flags, int node);
 struct io_context *alloc_io_context(gfp_t gfp_flags, int node);
 void copy_io_context(struct io_context **pdst, struct io_context **psrc);
+struct blk_plug_ctx *blk_get_plug_context(void);
+void blk_submit_plug_context(struct blk_plug_ctx *);
+void blk_flush_plug_context(struct blk_plug_ctx *);
 #else
+static inline void blk_submit_plug_context(struct blk_plug_ctx *ctx)
+{	/* no-op stub for the #else configuration */
+}
+static inline struct blk_plug_ctx *blk_get_plug_context(void)
+{	/* NOTE(review): blk_flush_plug_context() has no stub in this branch */
+	return NULL;	/* no plug context in this configuration */
+}
 static inline void exit_io_context(void)
 {
 }
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index 9344547..8b5c14a 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -6,6 +6,7 @@
 
 #include <linux/sched.h>
 #include <linux/fs.h>
+#include <linux/iocontext.h>
 
 struct backing_dev_info;
 
@@ -40,6 +41,7 @@ enum writeback_sync_modes {
 struct writeback_control {
 	struct backing_dev_info *bdi;	/* If !NULL, only write back this
 					   queue */
+	struct blk_plug_ctx *plug;
 	enum writeback_sync_modes sync_mode;
 	unsigned long *older_than_this;	/* If !NULL, only write back inodes
 					   older than this */
diff --git a/mm/filemap.c b/mm/filemap.c
index 2e2d38e..d521830 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -218,7 +218,9 @@ int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
 	if (!mapping_cap_writeback_dirty(mapping))
 		return 0;
 
+	wbc.plug = blk_get_plug_context();
 	ret = do_writepages(mapping, &wbc);
+	blk_submit_plug_context(wbc.plug);
 	return ret;
 }
 

-- 
Jens Axboe

--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ