lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20090331170319.GT5178@kernel.dk>
Date:	Tue, 31 Mar 2009 19:03:19 +0200
From:	Jens Axboe <jens.axboe@...cle.com>
To:	Linus Torvalds <torvalds@...ux-foundation.org>
Cc:	Ric Wheeler <rwheeler@...hat.com>,
	Fernando Luis Vázquez Cao 
	<fernando@....ntt.co.jp>, Jeff Garzik <jeff@...zik.org>,
	Christoph Hellwig <hch@...radead.org>,
	Theodore Tso <tytso@....edu>, Ingo Molnar <mingo@...e.hu>,
	Alan Cox <alan@...rguk.ukuu.org.uk>,
	Arjan van de Ven <arjan@...radead.org>,
	Andrew Morton <akpm@...ux-foundation.org>,
	Peter Zijlstra <a.p.zijlstra@...llo.nl>,
	Nick Piggin <npiggin@...e.de>, David Rees <drees76@...il.com>,
	Jesper Krogh <jesper@...gh.cc>,
	Linux Kernel Mailing List <linux-kernel@...r.kernel.org>,
	chris.mason@...cle.com, david@...morbit.com, tj@...nel.org
Subject: Re: [PATCH 1/7] block: Add block_flush_device()

On Tue, Mar 31 2009, Jens Axboe wrote:
> On Tue, Mar 31 2009, Linus Torvalds wrote:
> > 
> > 
> > On Tue, 31 Mar 2009, Ric Wheeler wrote:
> > > 
> > > The question is really what we do when you have a storage device in your box
> > > with a volatile write cache that does support flush or fua or similar.
> > 
> > Ok. Then you are talking about a different case - not EOPNOTSUPP.
> 
> So here's a test patch that attempts to just ignore such a failure to
> flush the caches. It will still flag the bio as BIO_EOPNOTSUPP, but
> that's merely maintaining the information in case the caller does want
> to see if that barrier failed or not. It may not actually be useful, in
> which case we can just kill that flag.

Updated version, the previous missed most of the buffer_eopnotsupp()
checking. So this one also gets rid of the file system retry logic.
Thanks to gfs2 Steve for pointing out that I missed gfs2, made me
realize that I missed a lot more as well.


 block/blk-barrier.c         |    8 ++------
 block/blk-settings.c        |   13 +++++++++++++
 block/ioctl.c               |    4 +---
 fs/bio.c                    |   12 +++++++++++-
 fs/btrfs/disk-io.c          |    5 -----
 fs/btrfs/extent_io.c        |    9 ++-------
 fs/buffer.c                 |   23 ++---------------------
 fs/fat/misc.c               |    5 +----
 fs/gfs2/log.c               |   18 ++++++------------
 fs/jbd2/commit.c            |   22 ----------------------
 fs/reiserfs/journal.c       |   15 ---------------
 fs/xfs/linux-2.6/xfs_aops.c |    1 -
 include/linux/blkdev.h      |    2 ++
 include/linux/buffer_head.h |    2 --
 14 files changed, 40 insertions(+), 99 deletions(-)commit 74e725b7f2e5f3f073abe84c5823026a6f1e33ce

---

Author: Jens Axboe <jens.axboe@...cle.com>
Date:   Tue Mar 31 19:00:53 2009 +0200

    barrier: Don't return -EOPNOTSUPP to the caller if the device does not support barriers
    
    The caller cannot really do much about the situation anyway. Instead log
    a warning if this is the first such failed barrier we see, so that the
    admin can look into whether this poses a data integrity problem or not.
    
    Signed-off-by: Jens Axboe <jens.axboe@...cle.com>

diff --git a/block/blk-barrier.c b/block/blk-barrier.c
index f7dae57..8660146 100644
--- a/block/blk-barrier.c
+++ b/block/blk-barrier.c
@@ -338,9 +338,7 @@ int blkdev_issue_flush(struct block_device *bdev, sector_t *error_sector)
 		*error_sector = bio->bi_sector;
 
 	ret = 0;
-	if (bio_flagged(bio, BIO_EOPNOTSUPP))
-		ret = -EOPNOTSUPP;
-	else if (!bio_flagged(bio, BIO_UPTODATE))
+	if (!bio_flagged(bio, BIO_UPTODATE))
 		ret = -EIO;
 
 	bio_put(bio);
@@ -408,9 +406,7 @@ int blkdev_issue_discard(struct block_device *bdev,
 		submit_bio(DISCARD_BARRIER, bio);
 
 		/* Check if it failed immediately */
-		if (bio_flagged(bio, BIO_EOPNOTSUPP))
-			ret = -EOPNOTSUPP;
-		else if (!bio_flagged(bio, BIO_UPTODATE))
+		if (!bio_flagged(bio, BIO_UPTODATE))
 			ret = -EIO;
 		bio_put(bio);
 	}
diff --git a/block/blk-settings.c b/block/blk-settings.c
index 59fd05d..0a81466 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -463,6 +463,19 @@ void blk_queue_update_dma_alignment(struct request_queue *q, int mask)
 }
 EXPORT_SYMBOL(blk_queue_update_dma_alignment);
 
+void blk_queue_set_noflush(struct block_device *bdev)
+{
+	struct request_queue *q = bdev_get_queue(bdev);
+	char b[BDEVNAME_SIZE];
+
+	if (test_and_set_bit(QUEUE_FLAG_NOFLUSH, &q->queue_flags))
+		return;
+
+	printk(KERN_ERR "Device %s does not appear to honor cache flushes. "	
+			"This may mean that file system ordering guarantees "
+			"are not met.", bdevname(bdev, b));
+}
+
 static int __init blk_settings_init(void)
 {
 	blk_max_low_pfn = max_low_pfn - 1;
diff --git a/block/ioctl.c b/block/ioctl.c
index 0f22e62..769f7be 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -166,9 +166,7 @@ static int blk_ioctl_discard(struct block_device *bdev, uint64_t start,
 
 		wait_for_completion(&wait);
 
-		if (bio_flagged(bio, BIO_EOPNOTSUPP))
-			ret = -EOPNOTSUPP;
-		else if (!bio_flagged(bio, BIO_UPTODATE))
+		if (!bio_flagged(bio, BIO_UPTODATE))
 			ret = -EIO;
 		bio_put(bio);
 	}
diff --git a/fs/bio.c b/fs/bio.c
index a040cde..79e3cec 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -1380,7 +1380,17 @@ void bio_check_pages_dirty(struct bio *bio)
  **/
 void bio_endio(struct bio *bio, int error)
 {
-	if (error)
+	/*
+	 * Special case here - hide the -EOPNOTSUPP from the driver or
+	 * block layer, dump a warning the first time this happens so that
+	 * the admin knows that we may not provide the ordering guarantees
+	 * that are needed. Don't clear the uptodate bit.
+	 */
+	if (error == -EOPNOTSUPP && bio_barrier(bio)) {
+		set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
+		blk_queue_set_noflush(bio->bi_bdev);
+		error = 0;
+	} else if (error)
 		clear_bit(BIO_UPTODATE, &bio->bi_flags);
 	else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
 		error = -EIO;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 6ec80c0..fd3ea97 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1951,11 +1951,6 @@ static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate)
 	if (uptodate) {
 		set_buffer_uptodate(bh);
 	} else {
-		if (!buffer_eopnotsupp(bh) && printk_ratelimit()) {
-			printk(KERN_WARNING "lost page write due to "
-					"I/O error on %s\n",
-				       bdevname(bh->b_bdev, b));
-		}
 		/* note, we dont' set_buffer_write_io_error because we have
 		 * our own ways of dealing with the IO errors
 		 */
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index ebe6b29..d696d26 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -1834,7 +1834,6 @@ extent_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
 static int submit_one_bio(int rw, struct bio *bio, int mirror_num,
 			  unsigned long bio_flags)
 {
-	int ret = 0;
 	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
 	struct page *page = bvec->bv_page;
 	struct extent_io_tree *tree = bio->bi_private;
@@ -1846,17 +1845,13 @@ static int submit_one_bio(int rw, struct bio *bio, int mirror_num,
 
 	bio->bi_private = NULL;
 
-	bio_get(bio);
-
 	if (tree->ops && tree->ops->submit_bio_hook)
 		tree->ops->submit_bio_hook(page->mapping->host, rw, bio,
 					   mirror_num, bio_flags);
 	else
 		submit_bio(rw, bio);
-	if (bio_flagged(bio, BIO_EOPNOTSUPP))
-		ret = -EOPNOTSUPP;
-	bio_put(bio);
-	return ret;
+
+	return 0;
 }
 
 static int submit_extent_page(int rw, struct extent_io_tree *tree,
diff --git a/fs/buffer.c b/fs/buffer.c
index a2fd743..6f50e08 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -147,17 +147,9 @@ void end_buffer_read_sync(struct buffer_head *bh, int uptodate)
 
 void end_buffer_write_sync(struct buffer_head *bh, int uptodate)
 {
-	char b[BDEVNAME_SIZE];
-
 	if (uptodate) {
 		set_buffer_uptodate(bh);
 	} else {
-		if (!buffer_eopnotsupp(bh) && !quiet_error(bh)) {
-			buffer_io_error(bh);
-			printk(KERN_WARNING "lost page write due to "
-					"I/O error on %s\n",
-				       bdevname(bh->b_bdev, b));
-		}
 		set_buffer_write_io_error(bh);
 		clear_buffer_uptodate(bh);
 	}
@@ -2828,7 +2820,7 @@ static void end_bio_bh_io_sync(struct bio *bio, int err)
 
 	if (err == -EOPNOTSUPP) {
 		set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
-		set_bit(BH_Eopnotsupp, &bh->b_state);
+		err = 0;
 	}
 
 	if (unlikely (test_bit(BIO_QUIET,&bio->bi_flags)))
@@ -2841,7 +2833,6 @@ static void end_bio_bh_io_sync(struct bio *bio, int err)
 int submit_bh(int rw, struct buffer_head * bh)
 {
 	struct bio *bio;
-	int ret = 0;
 
 	BUG_ON(!buffer_locked(bh));
 	BUG_ON(!buffer_mapped(bh));
@@ -2879,14 +2870,8 @@ int submit_bh(int rw, struct buffer_head * bh)
 	bio->bi_end_io = end_bio_bh_io_sync;
 	bio->bi_private = bh;
 
-	bio_get(bio);
 	submit_bio(rw, bio);
-
-	if (bio_flagged(bio, BIO_EOPNOTSUPP))
-		ret = -EOPNOTSUPP;
-
-	bio_put(bio);
-	return ret;
+	return 0;
 }
 
 /**
@@ -2965,10 +2950,6 @@ int sync_dirty_buffer(struct buffer_head *bh)
 		bh->b_end_io = end_buffer_write_sync;
 		ret = submit_bh(WRITE, bh);
 		wait_on_buffer(bh);
-		if (buffer_eopnotsupp(bh)) {
-			clear_buffer_eopnotsupp(bh);
-			ret = -EOPNOTSUPP;
-		}
 		if (!ret && !buffer_uptodate(bh))
 			ret = -EIO;
 	} else {
diff --git a/fs/fat/misc.c b/fs/fat/misc.c
index ac39ebc..406da54 100644
--- a/fs/fat/misc.c
+++ b/fs/fat/misc.c
@@ -269,10 +269,7 @@ int fat_sync_bhs(struct buffer_head **bhs, int nr_bhs)
 	ll_rw_block(SWRITE, nr_bhs, bhs);
 	for (i = 0; i < nr_bhs; i++) {
 		wait_on_buffer(bhs[i]);
-		if (buffer_eopnotsupp(bhs[i])) {
-			clear_buffer_eopnotsupp(bhs[i]);
-			err = -EOPNOTSUPP;
-		} else if (!err && !buffer_uptodate(bhs[i]))
+		if (!err && !buffer_uptodate(bhs[i]))
 			err = -EIO;
 	}
 	return err;
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index 98918a7..78c3d59 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -578,6 +578,7 @@ static void log_write_header(struct gfs2_sbd *sdp, u32 flags, int pull)
 	struct gfs2_log_header *lh;
 	unsigned int tail;
 	u32 hash;
+	int rw;
 
 	bh = sb_getblk(sdp->sd_vfs, blkno);
 	lock_buffer(bh);
@@ -602,20 +603,13 @@ static void log_write_header(struct gfs2_sbd *sdp, u32 flags, int pull)
 
 	bh->b_end_io = end_buffer_write_sync;
 	if (test_bit(SDF_NOBARRIERS, &sdp->sd_flags))
-		goto skip_barrier;
+		rw = WRITE;
+	else
+		rw = WRITE_BARRIER;
+
 	get_bh(bh);
-	submit_bh(WRITE_BARRIER | (1 << BIO_RW_META), bh);
+	submit_bh(rw | (1 << BIO_RW_META), bh);
 	wait_on_buffer(bh);
-	if (buffer_eopnotsupp(bh)) {
-		clear_buffer_eopnotsupp(bh);
-		set_buffer_uptodate(bh);
-		set_bit(SDF_NOBARRIERS, &sdp->sd_flags);
-		lock_buffer(bh);
-skip_barrier:
-		get_bh(bh);
-		submit_bh(WRITE_SYNC | (1 << BIO_RW_META), bh);
-		wait_on_buffer(bh);
-	}
 	if (!buffer_uptodate(bh))
 		gfs2_io_error_bh(sdp, bh);
 	brelse(bh);
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 62804e5..c1de70c 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -174,30 +174,8 @@ static int journal_wait_on_commit_record(journal_t *journal,
 {
 	int ret = 0;
 
-retry:
 	clear_buffer_dirty(bh);
 	wait_on_buffer(bh);
-	if (buffer_eopnotsupp(bh) && (journal->j_flags & JBD2_BARRIER)) {
-		printk(KERN_WARNING
-		       "JBD2: wait_on_commit_record: sync failed on %s - "
-		       "disabling barriers\n", journal->j_devname);
-		spin_lock(&journal->j_state_lock);
-		journal->j_flags &= ~JBD2_BARRIER;
-		spin_unlock(&journal->j_state_lock);
-
-		lock_buffer(bh);
-		clear_buffer_dirty(bh);
-		set_buffer_uptodate(bh);
-		bh->b_end_io = journal_end_buffer_io_sync;
-
-		ret = submit_bh(WRITE_SYNC, bh);
-		if (ret) {
-			unlock_buffer(bh);
-			return ret;
-		}
-		goto retry;
-	}
-
 	if (unlikely(!buffer_uptodate(bh)))
 		ret = -EIO;
 	put_bh(bh);            /* One for getblk() */
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index 77f5bb7..5eefa6c 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -700,18 +700,6 @@ static int submit_barrier_buffer(struct buffer_head *bh)
 	return submit_bh(WRITE_BARRIER, bh);
 }
 
-static void check_barrier_completion(struct super_block *s,
-				     struct buffer_head *bh)
-{
-	if (buffer_eopnotsupp(bh)) {
-		clear_buffer_eopnotsupp(bh);
-		disable_barrier(s);
-		set_buffer_uptodate(bh);
-		set_buffer_dirty(bh);
-		sync_dirty_buffer(bh);
-	}
-}
-
 #define CHUNK_SIZE 32
 struct buffer_chunk {
 	struct buffer_head *bh[CHUNK_SIZE];
@@ -1148,8 +1136,6 @@ static int flush_commit_list(struct super_block *s,
 	} else
 		wait_on_buffer(jl->j_commit_bh);
 
-	check_barrier_completion(s, jl->j_commit_bh);
-
 	/* If there was a write error in the journal - we can't commit this
 	 * transaction - it will be invalid and, if successful, will just end
 	 * up propagating the write error out to the filesystem. */
@@ -1313,7 +1299,6 @@ static int _update_journal_header_block(struct super_block *sb,
 				goto sync;
 			}
 			wait_on_buffer(journal->j_header_bh);
-			check_barrier_completion(sb, journal->j_header_bh);
 		} else {
 		      sync:
 			set_buffer_dirty(journal->j_header_bh);
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index de3a198..c0cacb2 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -406,7 +406,6 @@ xfs_submit_ioend_bio(
 	bio->bi_end_io = xfs_end_bio;
 
 	submit_bio(WRITE, bio);
-	ASSERT(!bio_flagged(bio, BIO_EOPNOTSUPP));
 	bio_put(bio);
 }
 
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 465d6ba..ea2e15a 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -452,6 +452,7 @@ struct request_queue
 #define QUEUE_FLAG_NONROT      14	/* non-rotational device (SSD) */
 #define QUEUE_FLAG_VIRT        QUEUE_FLAG_NONROT /* paravirt device */
 #define QUEUE_FLAG_IO_STAT     15	/* do IO stats */
+#define QUEUE_FLAG_NOFLUSH     16	/* device doesn't do flushes */
 
 #define QUEUE_FLAG_DEFAULT	((1 << QUEUE_FLAG_IO_STAT) |		\
 				 (1 << QUEUE_FLAG_CLUSTER) |		\
@@ -789,6 +790,7 @@ extern int blk_execute_rq(struct request_queue *, struct gendisk *,
 extern void blk_execute_rq_nowait(struct request_queue *, struct gendisk *,
 				  struct request *, int, rq_end_io_fn *);
 extern void blk_unplug(struct request_queue *q);
+extern void blk_queue_set_noflush(struct block_device *bdev);
 
 static inline struct request_queue *bdev_get_queue(struct block_device *bdev)
 {
diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index f19fd90..8adcaa4 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -33,7 +33,6 @@ enum bh_state_bits {
 	BH_Boundary,	/* Block is followed by a discontiguity */
 	BH_Write_EIO,	/* I/O error on write */
 	BH_Ordered,	/* ordered write */
-	BH_Eopnotsupp,	/* operation not supported (barrier) */
 	BH_Unwritten,	/* Buffer is allocated on disk but not written */
 	BH_Quiet,	/* Buffer Error Prinks to be quiet */
 
@@ -126,7 +125,6 @@ BUFFER_FNS(Delay, delay)
 BUFFER_FNS(Boundary, boundary)
 BUFFER_FNS(Write_EIO, write_io_error)
 BUFFER_FNS(Ordered, ordered)
-BUFFER_FNS(Eopnotsupp, eopnotsupp)
 BUFFER_FNS(Unwritten, unwritten)
 
 #define bh_offset(bh)		((unsigned long)(bh)->b_data & ~PAGE_MASK)


-- 
Jens Axboe

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ