[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <1389000716-3274-3-git-send-email-amadvance@gmail.com>
Date: Mon, 6 Jan 2014 10:31:56 +0100
From: Andrea Mazzoleni <amadvance@...il.com>
To: neilb@...e.de
Cc: clm@...com, jbacik@...com, linux-kernel@...r.kernel.org,
linux-raid@...r.kernel.org, linux-btrfs@...r.kernel.org,
amadvance@...il.com
Subject: [RFC v2 2/2] fs: btrfs: Extends btrfs/raid56 to support up to six parities
This patch changes btrfs/raid56.c to use the new raid interface and
extends its support to an arbitrary number of parities.
More in details, the two faila/failb failure indexes are now replaced
with a fail[] vector that keeps track of up to six failures, and now
the new raid_par() and raid_rec() functions are used to handle with
parity instead of the old xor/raid6 ones.
Signed-off-by: Andrea Mazzoleni <amadvance@...il.com>
---
fs/btrfs/Kconfig | 1 +
fs/btrfs/raid56.c | 278 ++++++++++++++++++-----------------------------------
fs/btrfs/raid56.h | 12 ++-
fs/btrfs/volumes.c | 4 +-
4 files changed, 102 insertions(+), 193 deletions(-)
diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig
index aa976ec..173fabe 100644
--- a/fs/btrfs/Kconfig
+++ b/fs/btrfs/Kconfig
@@ -5,6 +5,7 @@ config BTRFS_FS
select ZLIB_DEFLATE
select LZO_COMPRESS
select LZO_DECOMPRESS
+ select RAID_CAUCHY
select RAID6_PQ
select XOR_BLOCKS
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index 24ac218..2ceff3a 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -27,10 +27,9 @@
#include <linux/capability.h>
#include <linux/ratelimit.h>
#include <linux/kthread.h>
-#include <linux/raid/pq.h>
+#include <linux/raid/raid.h>
#include <linux/hash.h>
#include <linux/list_sort.h>
-#include <linux/raid/xor.h>
#include <linux/vmalloc.h>
#include <asm/div64.h>
#include "ctree.h"
@@ -125,11 +124,11 @@ struct btrfs_raid_bio {
*/
int read_rebuild;
- /* first bad stripe */
- int faila;
+ /* bad stripes */
+ int fail[RAID_PARITY_MAX];
- /* second bad stripe (for raid6 use) */
- int failb;
+ /* number of bad stripes in fail[] */
+ int nr_fail;
/*
* number of pages needed to represent the full
@@ -496,26 +495,6 @@ static void cache_rbio(struct btrfs_raid_bio *rbio)
}
/*
- * helper function to run the xor_blocks api. It is only
- * able to do MAX_XOR_BLOCKS at a time, so we need to
- * loop through.
- */
-static void run_xor(void **pages, int src_cnt, ssize_t len)
-{
- int src_off = 0;
- int xor_src_cnt = 0;
- void *dest = pages[src_cnt];
-
- while(src_cnt > 0) {
- xor_src_cnt = min(src_cnt, MAX_XOR_BLOCKS);
- xor_blocks(xor_src_cnt, len, dest, pages + src_off);
-
- src_cnt -= xor_src_cnt;
- src_off += xor_src_cnt;
- }
-}
-
-/*
* returns true if the bio list inside this rbio
* covers an entire stripe (no rmw required).
* Must be called with the bio list lock held, or
@@ -587,25 +566,18 @@ static int rbio_can_merge(struct btrfs_raid_bio *last,
}
/*
- * helper to index into the pstripe
+ * helper to index into the parity stripe
+ * returns null if there is no stripe
*/
-static struct page *rbio_pstripe_page(struct btrfs_raid_bio *rbio, int index)
+static struct page *rbio_pstripe_page(struct btrfs_raid_bio *rbio,
+ int index, int parity)
{
- index += (rbio->nr_data * rbio->stripe_len) >> PAGE_CACHE_SHIFT;
- return rbio->stripe_pages[index];
-}
-
-/*
- * helper to index into the qstripe, returns null
- * if there is no qstripe
- */
-static struct page *rbio_qstripe_page(struct btrfs_raid_bio *rbio, int index)
-{
- if (rbio->nr_data + 1 == rbio->bbio->num_stripes)
+ if (rbio->nr_data + parity >= rbio->bbio->num_stripes)
return NULL;
- index += ((rbio->nr_data + 1) * rbio->stripe_len) >>
- PAGE_CACHE_SHIFT;
+ index += ((rbio->nr_data + parity) * rbio->stripe_len)
+ >> PAGE_CACHE_SHIFT;
+
return rbio->stripe_pages[index];
}
@@ -946,8 +918,7 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_root *root,
rbio->fs_info = root->fs_info;
rbio->stripe_len = stripe_len;
rbio->nr_pages = num_pages;
- rbio->faila = -1;
- rbio->failb = -1;
+ rbio->nr_fail = 0;
atomic_set(&rbio->refs, 1);
/*
@@ -958,10 +929,10 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_root *root,
rbio->stripe_pages = p;
rbio->bio_pages = p + sizeof(struct page *) * num_pages;
- if (raid_map[bbio->num_stripes - 1] == RAID6_Q_STRIPE)
- nr_data = bbio->num_stripes - 2;
- else
- nr_data = bbio->num_stripes - 1;
+ /* get the number of data stripes removing all the parities */
+ nr_data = bbio->num_stripes;
+ while (nr_data > 0 && is_parity_stripe(raid_map[nr_data - 1]))
+ --nr_data;
rbio->nr_data = nr_data;
return rbio;
@@ -1072,8 +1043,7 @@ static int rbio_add_io_page(struct btrfs_raid_bio *rbio,
*/
static void validate_rbio_for_rmw(struct btrfs_raid_bio *rbio)
{
- if (rbio->faila >= 0 || rbio->failb >= 0) {
- BUG_ON(rbio->faila == rbio->bbio->num_stripes - 1);
+ if (rbio->nr_fail > 0) {
__raid56_parity_recover(rbio);
} else {
finish_rmw(rbio);
@@ -1137,10 +1107,10 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
void *pointers[bbio->num_stripes];
int stripe_len = rbio->stripe_len;
int nr_data = rbio->nr_data;
+ int nr_parity;
+ int parity;
int stripe;
int pagenr;
- int p_stripe = -1;
- int q_stripe = -1;
struct bio_list bio_list;
struct bio *bio;
int pages_per_stripe = stripe_len >> PAGE_CACHE_SHIFT;
@@ -1148,14 +1118,7 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
bio_list_init(&bio_list);
- if (bbio->num_stripes - rbio->nr_data == 1) {
- p_stripe = bbio->num_stripes - 1;
- } else if (bbio->num_stripes - rbio->nr_data == 2) {
- p_stripe = bbio->num_stripes - 2;
- q_stripe = bbio->num_stripes - 1;
- } else {
- BUG();
- }
+ nr_parity = bbio->num_stripes - rbio->nr_data;
/* at this point we either have a full stripe,
* or we've read the full stripe from the drive.
@@ -1194,29 +1157,15 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
pointers[stripe] = kmap(p);
}
- /* then add the parity stripe */
- p = rbio_pstripe_page(rbio, pagenr);
- SetPageUptodate(p);
- pointers[stripe++] = kmap(p);
-
- if (q_stripe != -1) {
-
- /*
- * raid6, add the qstripe and call the
- * library function to fill in our p/q
- */
- p = rbio_qstripe_page(rbio, pagenr);
+ /* then add the parity stripes */
+ for (parity = 0; parity < nr_parity; ++parity) {
+ p = rbio_pstripe_page(rbio, pagenr, parity);
SetPageUptodate(p);
pointers[stripe++] = kmap(p);
-
- raid6_call.gen_syndrome(bbio->num_stripes, PAGE_SIZE,
- pointers);
- } else {
- /* raid5 */
- memcpy(pointers[nr_data], pointers[0], PAGE_SIZE);
- run_xor(pointers + 1, nr_data - 1, PAGE_CACHE_SIZE);
}
+ /* compute the parity */
+ raid_par(rbio->nr_data, nr_parity, PAGE_SIZE, pointers);
for (stripe = 0; stripe < bbio->num_stripes; stripe++)
kunmap(page_in_rbio(rbio, stripe, pagenr, 0));
@@ -1321,24 +1270,25 @@ static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed)
{
unsigned long flags;
int ret = 0;
+ int i;
spin_lock_irqsave(&rbio->bio_list_lock, flags);
/* we already know this stripe is bad, move on */
- if (rbio->faila == failed || rbio->failb == failed)
- goto out;
+ for (i = 0; i < rbio->nr_fail; ++i)
+ if (rbio->fail[i] == failed)
+ goto out;
- if (rbio->faila == -1) {
- /* first failure on this rbio */
- rbio->faila = failed;
- atomic_inc(&rbio->bbio->error);
- } else if (rbio->failb == -1) {
- /* second failure on this rbio */
- rbio->failb = failed;
- atomic_inc(&rbio->bbio->error);
- } else {
+ if (rbio->nr_fail == RAID_PARITY_MAX) {
ret = -EIO;
+ goto out;
}
+
+ /* new failure on this rbio */
+ rbio->fail[rbio->nr_fail] = failed;
+ ++rbio->nr_fail;
+ atomic_inc(&rbio->bbio->error);
+
out:
spin_unlock_irqrestore(&rbio->bio_list_lock, flags);
@@ -1724,8 +1674,10 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
{
int pagenr, stripe;
void **pointers;
- int faila = -1, failb = -1;
+ int ifail;
int nr_pages = (rbio->stripe_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+ int nr_parity;
+ int nr_fail;
struct page *page;
int err;
int i;
@@ -1737,8 +1689,11 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
goto cleanup_io;
}
- faila = rbio->faila;
- failb = rbio->failb;
+ nr_parity = rbio->bbio->num_stripes - rbio->nr_data;
+ nr_fail = rbio->nr_fail;
+
+ /* ensure that the fail indexes are in order */
+ raid_sort(nr_fail, rbio->fail);
if (rbio->read_rebuild) {
spin_lock_irq(&rbio->bio_list_lock);
@@ -1752,98 +1707,30 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
/* setup our array of pointers with pages
* from each stripe
*/
+ ifail = 0;
for (stripe = 0; stripe < rbio->bbio->num_stripes; stripe++) {
/*
* if we're rebuilding a read, we have to use
* pages from the bio list
*/
if (rbio->read_rebuild &&
- (stripe == faila || stripe == failb)) {
+ rbio->fail[ifail] == stripe) {
page = page_in_rbio(rbio, stripe, pagenr, 0);
+ ++ifail;
} else {
page = rbio_stripe_page(rbio, stripe, pagenr);
}
pointers[stripe] = kmap(page);
}
- /* all raid6 handling here */
- if (rbio->raid_map[rbio->bbio->num_stripes - 1] ==
- RAID6_Q_STRIPE) {
-
- /*
- * single failure, rebuild from parity raid5
- * style
- */
- if (failb < 0) {
- if (faila == rbio->nr_data) {
- /*
- * Just the P stripe has failed, without
- * a bad data or Q stripe.
- * TODO, we should redo the xor here.
- */
- err = -EIO;
- goto cleanup;
- }
- /*
- * a single failure in raid6 is rebuilt
- * in the pstripe code below
- */
- goto pstripe;
- }
-
- /* make sure our ps and qs are in order */
- if (faila > failb) {
- int tmp = failb;
- failb = faila;
- faila = tmp;
- }
-
- /* if the q stripe is failed, do a pstripe reconstruction
- * from the xors.
- * If both the q stripe and the P stripe are failed, we're
- * here due to a crc mismatch and we can't give them the
- * data they want
- */
- if (rbio->raid_map[failb] == RAID6_Q_STRIPE) {
- if (rbio->raid_map[faila] == RAID5_P_STRIPE) {
- err = -EIO;
- goto cleanup;
- }
- /*
- * otherwise we have one bad data stripe and
- * a good P stripe. raid5!
- */
- goto pstripe;
- }
-
- if (rbio->raid_map[failb] == RAID5_P_STRIPE) {
- raid6_datap_recov(rbio->bbio->num_stripes,
- PAGE_SIZE, faila, pointers);
- } else {
- raid6_2data_recov(rbio->bbio->num_stripes,
- PAGE_SIZE, faila, failb,
- pointers);
- }
- } else {
- void *p;
-
- /* rebuild from P stripe here (raid5 or raid6) */
- BUG_ON(failb != -1);
-pstripe:
- /* Copy parity block into failed block to start with */
- memcpy(pointers[faila],
- pointers[rbio->nr_data],
- PAGE_CACHE_SIZE);
-
- /* rearrange the pointer array */
- p = pointers[faila];
- for (stripe = faila; stripe < rbio->nr_data - 1; stripe++)
- pointers[stripe] = pointers[stripe + 1];
- pointers[rbio->nr_data - 1] = p;
-
- /* xor in the rest */
- run_xor(pointers, rbio->nr_data - 1, PAGE_CACHE_SIZE);
+ /* if we have too many failure */
+ if (nr_fail > nr_parity) {
+ err = -EIO;
+ goto cleanup;
}
+ raid_rec(nr_fail, rbio->fail, rbio->nr_data, nr_parity,
+ PAGE_SIZE, pointers);
+
/* if we're doing this rebuild as part of an rmw, go through
* and set all of our private rbio pages in the
* failed stripes as uptodate. This way finish_rmw will
@@ -1852,24 +1739,23 @@ pstripe:
*/
if (!rbio->read_rebuild) {
for (i = 0; i < nr_pages; i++) {
- if (faila != -1) {
- page = rbio_stripe_page(rbio, faila, i);
- SetPageUptodate(page);
- }
- if (failb != -1) {
- page = rbio_stripe_page(rbio, failb, i);
+ for (ifail = 0; ifail < nr_fail; ++ifail) {
+ int sfail = rbio->fail[ifail];
+ page = rbio_stripe_page(rbio, sfail, i);
SetPageUptodate(page);
}
}
}
+ ifail = 0;
for (stripe = 0; stripe < rbio->bbio->num_stripes; stripe++) {
/*
* if we're rebuilding a read, we have to use
* pages from the bio list
*/
if (rbio->read_rebuild &&
- (stripe == faila || stripe == failb)) {
+ rbio->fail[ifail] == stripe) {
page = page_in_rbio(rbio, stripe, pagenr, 0);
+ ++ifail;
} else {
page = rbio_stripe_page(rbio, stripe, pagenr);
}
@@ -1891,8 +1777,7 @@ cleanup_io:
rbio_orig_end_io(rbio, err, err == 0);
} else if (err == 0) {
- rbio->faila = -1;
- rbio->failb = -1;
+ rbio->nr_fail = 0;
finish_rmw(rbio);
} else {
rbio_orig_end_io(rbio, err, 0);
@@ -1939,6 +1824,7 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
int bios_to_read = 0;
struct btrfs_bio *bbio = rbio->bbio;
struct bio_list bio_list;
+ int ifail;
int ret;
int nr_pages = (rbio->stripe_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
int pagenr;
@@ -1953,15 +1839,20 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
atomic_set(&rbio->bbio->error, 0);
+ /* ensure that the fail indexes are in order */
+ raid_sort(rbio->nr_fail, rbio->fail);
+
/*
* read everything that hasn't failed. Thanks to the
* stripe cache, it is possible that some or all of these
* pages are going to be uptodate.
*/
+ ifail = 0;
for (stripe = 0; stripe < bbio->num_stripes; stripe++) {
- if (rbio->faila == stripe ||
- rbio->failb == stripe)
+ if (rbio->fail[ifail] == stripe) {
+ ++ifail;
continue;
+ }
for (pagenr = 0; pagenr < nr_pages; pagenr++) {
struct page *p;
@@ -2037,6 +1928,7 @@ int raid56_parity_recover(struct btrfs_root *root, struct bio *bio,
{
struct btrfs_raid_bio *rbio;
int ret;
+ int i;
rbio = alloc_rbio(root, bbio, raid_map, stripe_len);
if (IS_ERR(rbio))
@@ -2046,21 +1938,33 @@ int raid56_parity_recover(struct btrfs_root *root, struct bio *bio,
bio_list_add(&rbio->bio_list, bio);
rbio->bio_list_bytes = bio->bi_size;
- rbio->faila = find_logical_bio_stripe(rbio, bio);
- if (rbio->faila == -1) {
+ rbio->fail[0] = find_logical_bio_stripe(rbio, bio);
+ if (rbio->fail[0] == -1) {
BUG();
kfree(raid_map);
kfree(bbio);
kfree(rbio);
return -EIO;
}
+ rbio->nr_fail = 1;
/*
- * reconstruct from the q stripe if they are
- * asking for mirror 3
+ * Reconstruct from other parity stripes if they are
+ * asking for different mirrors.
+ * For each mirror we disable one extra parity to trigger
+ * a different recovery.
+ * With mirror_num == 2 we disable nothing and we reconstruct
+ * with the first parity, with mirror_num == 3 we disable the
+ * first parity and then we reconstruct with the second,
+ * and so on, up to mirror_num == 7 where we disable the first 5
+ * parity levels and we recover with the 6 one.
*/
- if (mirror_num == 3)
- rbio->failb = bbio->num_stripes - 2;
+ if (mirror_num > 2 && mirror_num - 2 < RAID_PARITY_MAX) {
+ for (i = 0; i < mirror_num - 2; ++i) {
+ rbio->fail[rbio->nr_fail] = rbio->nr_data + i;
+ ++rbio->nr_fail;
+ }
+ }
ret = lock_stripe_add(rbio);
diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h
index ea5d73b..8adc48d 100644
--- a/fs/btrfs/raid56.h
+++ b/fs/btrfs/raid56.h
@@ -33,11 +33,15 @@ static inline int nr_data_stripes(struct map_lookup *map)
{
return map->num_stripes - nr_parity_stripes(map);
}
-#define RAID5_P_STRIPE ((u64)-2)
-#define RAID6_Q_STRIPE ((u64)-1)
-#define is_parity_stripe(x) (((x) == RAID5_P_STRIPE) || \
- ((x) == RAID6_Q_STRIPE))
+#define RAID_PAR1_STRIPE ((u64)-6)
+#define RAID_PAR2_STRIPE ((u64)-5)
+#define RAID_PAR3_STRIPE ((u64)-4)
+#define RAID_PAR4_STRIPE ((u64)-3)
+#define RAID_PAR5_STRIPE ((u64)-2)
+#define RAID_PAR6_STRIPE ((u64)-1)
+
+#define is_parity_stripe(x) (((u64)(x) >= RAID_PAR1_STRIPE))
int raid56_parity_recover(struct btrfs_root *root, struct bio *bio,
struct btrfs_bio *bbio, u64 *raid_map,
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 92303f4..bf593f7 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -4918,10 +4918,10 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
raid_map[(i+rot) % num_stripes] =
em->start + (tmp + i) * map->stripe_len;
- raid_map[(i+rot) % map->num_stripes] = RAID5_P_STRIPE;
+ raid_map[(i+rot) % map->num_stripes] = RAID_PAR1_STRIPE;
if (map->type & BTRFS_BLOCK_GROUP_RAID6)
raid_map[(i+rot+1) % num_stripes] =
- RAID6_Q_STRIPE;
+ RAID_PAR2_STRIPE;
*length = map->stripe_len;
stripe_index = 0;
--
1.7.12.1
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
Powered by blists - more mailing lists