linux-kernel - Re: [PATCH 135/241] md/raid10: close race that lose writes lost when replacement completes.

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite for Android: free password hash cracker in your pocket
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <1355596839.18807.12.camel@deadeye.wl.decadent.org.uk>
Date:	Sat, 15 Dec 2012 18:40:39 +0000
From:	Ben Hutchings <ben@...adent.org.uk>
To:	Herton Ronaldo Krzesinski <herton.krzesinski@...onical.com>,
	Greg Kroah-Hartman <gregkh@...uxfoundation.org>,
	NeilBrown <neilb@...e.de>
Cc:	linux-kernel@...r.kernel.org, stable@...r.kernel.org,
	kernel-team@...ts.ubuntu.com
Subject: Re: [PATCH 135/241] md/raid10: close race that lose writes lost
 when replacement completes.

On Thu, 2012-12-13 at 11:58 -0200, Herton Ronaldo Krzesinski wrote:
> 3.5.7.2 -stable review patch.  If anyone has any objections, please let me know.
> 
> ------------------
> 
> From: NeilBrown <neilb@...e.de>
> 
> commit e7c0c3fa29280d62aa5e11101a674bb3064bd791 upstream.
> 
> When a replacement operation completes there is a small window
> when the original device is marked 'faulty' and the replacement
> still looks like a replacement.  The faulty should be removed and
> the replacement moved in place very quickly, bit it isn't instant.
> 
> So the code write out to the array must handle the possibility that
> the only working device for some slot in the replacement - but it
> doesn't.  If the primary device is faulty it just gives up.  This
> can lead to corruption.
> 
> So make the code more robust: if either  the primary or the
> replacement is present and working, write to them.  Only when
> neither are present do we give up.
> 
> This bug has been present since replacement was introduced in
> 3.3, so it is suitable for any -stable kernel since then.

This is missing from 3.4, so Greg will presumably want to apply this (if
the backport is correct).

Ben.

> Reported-by: "George Spelvin" <linux@...izon.com>
> Signed-off-by: NeilBrown <neilb@...e.de>
> [ herton: hairy code adjustment on 3rd hunk (conf->copies for loop) ]
> Signed-off-by: Herton Ronaldo Krzesinski <herton.krzesinski@...onical.com>
> ---
>  drivers/md/raid10.c |  113 +++++++++++++++++++++++++++------------------------
>  1 file changed, 59 insertions(+), 54 deletions(-)
> 
> diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
> index 17fae37..0920adf 100644
> --- a/drivers/md/raid10.c
> +++ b/drivers/md/raid10.c
> @@ -1267,18 +1267,21 @@ retry_write:
>  			blocked_rdev = rrdev;
>  			break;
>  		}
> +		if (rdev && (test_bit(Faulty, &rdev->flags)
> +			     || test_bit(Unmerged, &rdev->flags)))
> +			rdev = NULL;
>  		if (rrdev && (test_bit(Faulty, &rrdev->flags)
>  			      || test_bit(Unmerged, &rrdev->flags)))
>  			rrdev = NULL;
>  
>  		r10_bio->devs[i].bio = NULL;
>  		r10_bio->devs[i].repl_bio = NULL;
> -		if (!rdev || test_bit(Faulty, &rdev->flags) ||
> -		    test_bit(Unmerged, &rdev->flags)) {
> +
> +		if (!rdev && !rrdev) {
>  			set_bit(R10BIO_Degraded, &r10_bio->state);
>  			continue;
>  		}
> -		if (test_bit(WriteErrorSeen, &rdev->flags)) {
> +		if (rdev && test_bit(WriteErrorSeen, &rdev->flags)) {
>  			sector_t first_bad;
>  			sector_t dev_sector = r10_bio->devs[i].addr;
>  			int bad_sectors;
> @@ -1320,8 +1323,10 @@ retry_write:
>  					max_sectors = good_sectors;
>  			}
>  		}
> -		r10_bio->devs[i].bio = bio;
> -		atomic_inc(&rdev->nr_pending);
> +		if (rdev) {
> +			r10_bio->devs[i].bio = bio;
> +			atomic_inc(&rdev->nr_pending);
> +		}
>  		if (rrdev) {
>  			r10_bio->devs[i].repl_bio = bio;
>  			atomic_inc(&rrdev->nr_pending);
> @@ -1377,58 +1382,58 @@ retry_write:
>  	for (i = 0; i < conf->copies; i++) {
>  		struct bio *mbio;
>  		int d = r10_bio->devs[i].devnum;
> -		if (!r10_bio->devs[i].bio)
> -			continue;
> -
> -		mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
> -		md_trim_bio(mbio, r10_bio->sector - bio->bi_sector,
> -			    max_sectors);
> -		r10_bio->devs[i].bio = mbio;
> -
> -		mbio->bi_sector	= (r10_bio->devs[i].addr+
> -				   choose_data_offset(r10_bio,
> -						      conf->mirrors[d].rdev));
> -		mbio->bi_bdev = conf->mirrors[d].rdev->bdev;
> -		mbio->bi_end_io	= raid10_end_write_request;
> -		mbio->bi_rw = WRITE | do_sync | do_fua;
> -		mbio->bi_private = r10_bio;
> -
> -		atomic_inc(&r10_bio->remaining);
> -		spin_lock_irqsave(&conf->device_lock, flags);
> -		bio_list_add(&conf->pending_bio_list, mbio);
> -		conf->pending_count++;
> -		spin_unlock_irqrestore(&conf->device_lock, flags);
> -		if (!mddev_check_plugged(mddev))
> -			md_wakeup_thread(mddev->thread);
> -
> -		if (!r10_bio->devs[i].repl_bio)
> -			continue;
> +		if (r10_bio->devs[i].bio) {
> +			struct md_rdev *rdev = conf->mirrors[d].rdev;
> +			mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
> +			md_trim_bio(mbio, r10_bio->sector - bio->bi_sector,
> +				    max_sectors);
> +			r10_bio->devs[i].bio = mbio;
> +
> +			mbio->bi_sector	= (r10_bio->devs[i].addr+
> +					   choose_data_offset(r10_bio,
> +							      rdev));
> +			mbio->bi_bdev = rdev->bdev;
> +			mbio->bi_end_io = raid10_end_write_request;
> +			mbio->bi_rw = WRITE | do_sync | do_fua;
> +			mbio->bi_private = r10_bio;
>  
> -		mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
> -		md_trim_bio(mbio, r10_bio->sector - bio->bi_sector,
> -			    max_sectors);
> -		r10_bio->devs[i].repl_bio = mbio;
> +			atomic_inc(&r10_bio->remaining);
> +			spin_lock_irqsave(&conf->device_lock, flags);
> +			bio_list_add(&conf->pending_bio_list, mbio);
> +			conf->pending_count++;
> +			spin_unlock_irqrestore(&conf->device_lock, flags);
> +			if (!mddev_check_plugged(mddev))
> +				md_wakeup_thread(mddev->thread);
> +		}
>  
> -		/* We are actively writing to the original device
> -		 * so it cannot disappear, so the replacement cannot
> -		 * become NULL here
> -		 */
> -		mbio->bi_sector	= (r10_bio->devs[i].addr +
> -				   choose_data_offset(
> -					   r10_bio,
> -					   conf->mirrors[d].replacement));
> -		mbio->bi_bdev = conf->mirrors[d].replacement->bdev;
> -		mbio->bi_end_io	= raid10_end_write_request;
> -		mbio->bi_rw = WRITE | do_sync | do_fua;
> -		mbio->bi_private = r10_bio;
> +		if (r10_bio->devs[i].repl_bio) {
> +			struct md_rdev *rdev = conf->mirrors[d].replacement;
> +			if (rdev == NULL) {
> +				/* Replacement just got moved to main 'rdev' */
> +				smp_mb();
> +				rdev = conf->mirrors[d].rdev;
> +			}
> +			mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
> +			md_trim_bio(mbio, r10_bio->sector - bio->bi_sector,
> +				    max_sectors);
> +			r10_bio->devs[i].repl_bio = mbio;
> +
> +			mbio->bi_sector = (r10_bio->devs[i].addr +
> +					   choose_data_offset(
> +						   r10_bio, rdev));
> +			mbio->bi_bdev = rdev->bdev;
> +			mbio->bi_end_io = raid10_end_write_request;
> +			mbio->bi_rw = WRITE | do_sync | do_fua;
> +			mbio->bi_private = r10_bio;
>  
> -		atomic_inc(&r10_bio->remaining);
> -		spin_lock_irqsave(&conf->device_lock, flags);
> -		bio_list_add(&conf->pending_bio_list, mbio);
> -		conf->pending_count++;
> -		spin_unlock_irqrestore(&conf->device_lock, flags);
> -		if (!mddev_check_plugged(mddev))
> -			md_wakeup_thread(mddev->thread);
> +			atomic_inc(&r10_bio->remaining);
> +			spin_lock_irqsave(&conf->device_lock, flags);
> +			bio_list_add(&conf->pending_bio_list, mbio);
> +			conf->pending_count++;
> +			spin_unlock_irqrestore(&conf->device_lock, flags);
> +			if (!mddev_check_plugged(mddev))
> +				md_wakeup_thread(mddev->thread);
> +		}
>  	}
>  
>  	/* Don't remove the bias on 'remaining' (one_write_done) until

-- 
Ben Hutchings
Theory and practice are closer in theory than in practice.
                                - John Levine, moderator of comp.compilers

Download attachment "signature.asc" of type "application/pgp-signature" (829 bytes)