From 1ed7242e591af7e233234d483f12d33818b189d9 Mon Sep 17 00:00:00 2001
From: Jonathan Brassow <jbrassow@redhat.com>
Date: Tue, 7 Jun 2011 17:50:35 -0500
Subject: MD: raid1 changes to allow use by device mapper

MD RAID1: Changes to allow RAID1 to be used by device-mapper (dm-raid.c)

Added the necessary congestion function and conditionalize calls requiring an
array 'queue' or 'gendisk'.

Signed-off-by: Jonathan Brassow <jbrassow@redhat.com>
Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/raid1.c | 24 +++++++++++++++++-------
 1 file changed, 17 insertions(+), 7 deletions(-)

(limited to 'drivers/md/raid1.c')

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 5d096096f958..f7431b6d8447 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -497,21 +497,19 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio)
 	return best_disk;
 }
 
-static int raid1_congested(void *data, int bits)
+int md_raid1_congested(mddev_t *mddev, int bits)
 {
-	mddev_t *mddev = data;
 	conf_t *conf = mddev->private;
 	int i, ret = 0;
 
-	if (mddev_congested(mddev, bits))
-		return 1;
-
 	rcu_read_lock();
 	for (i = 0; i < mddev->raid_disks; i++) {
 		mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev);
 		if (rdev && !test_bit(Faulty, &rdev->flags)) {
 			struct request_queue *q = bdev_get_queue(rdev->bdev);
 
+			BUG_ON(!q);
+
 			/* Note the '|| 1' - when read_balance prefers
 			 * non-congested targets, it can be removed
 			 */
@@ -524,7 +522,15 @@ static int raid1_congested(void *data, int bits)
 	rcu_read_unlock();
 	return ret;
 }
+EXPORT_SYMBOL_GPL(md_raid1_congested);
 
+static int raid1_congested(void *data, int bits)
+{
+	mddev_t *mddev = data;
+
+	return mddev_congested(mddev, bits) ||
+		md_raid1_congested(mddev, bits);
+}
 
 static void flush_pending_writes(conf_t *conf)
 {
@@ -1972,6 +1978,8 @@ static int run(mddev_t *mddev)
 		return PTR_ERR(conf);
 
 	list_for_each_entry(rdev, &mddev->disks, same_set) {
+		if (!mddev->gendisk)
+			continue;
 		disk_stack_limits(mddev->gendisk, rdev->bdev,
 				  rdev->data_offset << 9);
 		/* as we don't honour merge_bvec_fn, we must never risk
@@ -2013,8 +2021,10 @@ static int run(mddev_t *mddev)
 
 	md_set_array_sectors(mddev, raid1_size(mddev, 0, 0));
 
-	mddev->queue->backing_dev_info.congested_fn = raid1_congested;
-	mddev->queue->backing_dev_info.congested_data = mddev;
+	if (mddev->queue) {
+		mddev->queue->backing_dev_info.congested_fn = raid1_congested;
+		mddev->queue->backing_dev_info.congested_data = mddev;
+	}
 	return md_integrity_register(mddev);
 }
 
-- 
cgit v1.2.3


From 8bda470e8ebde35f9349e98ecbce4dfb508a60fa Mon Sep 17 00:00:00 2001
From: Christian Dietrich <christian.dietrich@informatik.uni-erlangen.de>
Date: Wed, 27 Jul 2011 11:00:36 +1000
Subject: md/raid: use printk_ratelimited instead of printk_ratelimit

As per printk_ratelimit comment, it should not be used.

Signed-off-by: Christian Dietrich <christian.dietrich@informatik.uni-erlangen.de>
Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/raid1.c  | 25 +++++++++++++++----------
 drivers/md/raid10.c | 23 +++++++++++++----------
 drivers/md/raid5.c  | 47 +++++++++++++++++++++++++----------------------
 3 files changed, 53 insertions(+), 42 deletions(-)

(limited to 'drivers/md/raid1.c')

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index f7431b6d8447..d3a8f4bb4fc3 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -35,6 +35,7 @@
 #include <linux/delay.h>
 #include <linux/blkdev.h>
 #include <linux/seq_file.h>
+#include <linux/ratelimit.h>
 #include "md.h"
 #include "raid1.h"
 #include "bitmap.h"
@@ -287,10 +288,13 @@ static void raid1_end_read_request(struct bio *bio, int error)
 		 * oops, read error:
 		 */
 		char b[BDEVNAME_SIZE];
-		if (printk_ratelimit())
-			printk(KERN_ERR "md/raid1:%s: %s: rescheduling sector %llu\n",
-			       mdname(conf->mddev),
-			       bdevname(conf->mirrors[mirror].rdev->bdev,b), (unsigned long long)r1_bio->sector);
+		printk_ratelimited(
+			KERN_ERR "md/raid1:%s: %s: "
+			"rescheduling sector %llu\n",
+			mdname(conf->mddev),
+			bdevname(conf->mirrors[mirror].rdev->bdev,
+				 b),
+			(unsigned long long)r1_bio->sector);
 		reschedule_retry(r1_bio);
 	}
 
@@ -1580,12 +1584,13 @@ static void raid1d(mddev_t *mddev)
 						      GFP_NOIO, mddev);
 				r1_bio->bios[r1_bio->read_disk] = bio;
 				rdev = conf->mirrors[disk].rdev;
-				if (printk_ratelimit())
-					printk(KERN_ERR "md/raid1:%s: redirecting sector %llu to"
-					       " other mirror: %s\n",
-					       mdname(mddev),
-					       (unsigned long long)r1_bio->sector,
-					       bdevname(rdev->bdev,b));
+				printk_ratelimited(
+					KERN_ERR
+					"md/raid1:%s: redirecting sector %llu"
+					" to other mirror: %s\n",
+					mdname(mddev),
+					(unsigned long long)r1_bio->sector,
+					bdevname(rdev->bdev, b));
 				bio->bi_sector = r1_bio->sector + rdev->data_offset;
 				bio->bi_bdev = rdev->bdev;
 				bio->bi_end_io = raid1_end_read_request;
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 3715e220e5e0..1725ec1e1e82 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -22,6 +22,7 @@
 #include <linux/delay.h>
 #include <linux/blkdev.h>
 #include <linux/seq_file.h>
+#include <linux/ratelimit.h>
 #include "md.h"
 #include "raid10.h"
 #include "raid0.h"
@@ -301,10 +302,11 @@ static void raid10_end_read_request(struct bio *bio, int error)
 		 * oops, read error - keep the refcount on the rdev
 		 */
 		char b[BDEVNAME_SIZE];
-		if (printk_ratelimit())
-			printk(KERN_ERR "md/raid10:%s: %s: rescheduling sector %llu\n",
-			       mdname(conf->mddev),
-			       bdevname(conf->mirrors[dev].rdev->bdev,b), (unsigned long long)r10_bio->sector);
+		printk_ratelimited(KERN_ERR
+				   "md/raid10:%s: %s: rescheduling sector %llu\n",
+				   mdname(conf->mddev),
+				   bdevname(conf->mirrors[dev].rdev->bdev, b),
+				   (unsigned long long)r10_bio->sector);
 		reschedule_retry(r10_bio);
 	}
 }
@@ -1669,12 +1671,13 @@ static void raid10d(mddev_t *mddev)
 				bio_put(bio);
 				slot = r10_bio->read_slot;
 				rdev = conf->mirrors[mirror].rdev;
-				if (printk_ratelimit())
-					printk(KERN_ERR "md/raid10:%s: %s: redirecting sector %llu to"
-					       " another mirror\n",
-					       mdname(mddev),
-					       bdevname(rdev->bdev,b),
-					       (unsigned long long)r10_bio->sector);
+				printk_ratelimited(
+					KERN_ERR
+					"md/raid10:%s: %s: redirecting"
+					"sector %llu to another mirror\n",
+					mdname(mddev),
+					bdevname(rdev->bdev, b),
+					(unsigned long long)r10_bio->sector);
 				bio = bio_clone_mddev(r10_bio->master_bio,
 						      GFP_NOIO, mddev);
 				r10_bio->devs[slot].bio = bio;
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index b321d6c36594..467e8e1cd3d2 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -51,6 +51,7 @@
 #include <linux/seq_file.h>
 #include <linux/cpu.h>
 #include <linux/slab.h>
+#include <linux/ratelimit.h>
 #include "md.h"
 #include "raid5.h"
 #include "raid0.h"
@@ -96,8 +97,6 @@
 #define __inline__
 #endif
 
-#define printk_rl(args...) ((void) (printk_ratelimit() && printk(args)))
-
 /*
  * We maintain a biased count of active stripes in the bottom 16 bits of
  * bi_phys_segments, and a count of processed stripes in the upper 16 bits
@@ -1583,12 +1582,14 @@ static void raid5_end_read_request(struct bio * bi, int error)
 		set_bit(R5_UPTODATE, &sh->dev[i].flags);
 		if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
 			rdev = conf->disks[i].rdev;
-			printk_rl(KERN_INFO "md/raid:%s: read error corrected"
-				  " (%lu sectors at %llu on %s)\n",
-				  mdname(conf->mddev), STRIPE_SECTORS,
-				  (unsigned long long)(sh->sector
-						       + rdev->data_offset),
-				  bdevname(rdev->bdev, b));
+			printk_ratelimited(
+				KERN_INFO
+				"md/raid:%s: read error corrected"
+				" (%lu sectors at %llu on %s)\n",
+				mdname(conf->mddev), STRIPE_SECTORS,
+				(unsigned long long)(sh->sector
+						     + rdev->data_offset),
+				bdevname(rdev->bdev, b));
 			clear_bit(R5_ReadError, &sh->dev[i].flags);
 			clear_bit(R5_ReWrite, &sh->dev[i].flags);
 		}
@@ -1602,22 +1603,24 @@ static void raid5_end_read_request(struct bio * bi, int error)
 		clear_bit(R5_UPTODATE, &sh->dev[i].flags);
 		atomic_inc(&rdev->read_errors);
 		if (conf->mddev->degraded >= conf->max_degraded)
-			printk_rl(KERN_WARNING
-				  "md/raid:%s: read error not correctable "
-				  "(sector %llu on %s).\n",
-				  mdname(conf->mddev),
-				  (unsigned long long)(sh->sector
-						       + rdev->data_offset),
-				  bdn);
+			printk_ratelimited(
+				KERN_WARNING
+				"md/raid:%s: read error not correctable "
+				"(sector %llu on %s).\n",
+				mdname(conf->mddev),
+				(unsigned long long)(sh->sector
+						     + rdev->data_offset),
+				bdn);
 		else if (test_bit(R5_ReWrite, &sh->dev[i].flags))
 			/* Oh, no!!! */
-			printk_rl(KERN_WARNING
-				  "md/raid:%s: read error NOT corrected!! "
-				  "(sector %llu on %s).\n",
-				  mdname(conf->mddev),
-				  (unsigned long long)(sh->sector
-						       + rdev->data_offset),
-				  bdn);
+			printk_ratelimited(
+				KERN_WARNING
+				"md/raid:%s: read error NOT corrected!! "
+				"(sector %llu on %s).\n",
+				mdname(conf->mddev),
+				(unsigned long long)(sh->sector
+						     + rdev->data_offset),
+				bdn);
 		else if (atomic_read(&rdev->read_errors)
 			 > conf->max_nr_stripes)
 			printk(KERN_WARNING
-- 
cgit v1.2.3


From 36fad858a7404a9656122a9e560a224ae2a00979 Mon Sep 17 00:00:00 2001
From: Namhyung Kim <namhyung@gmail.com>
Date: Wed, 27 Jul 2011 11:00:36 +1000
Subject: md: introduce link/unlink_rdev() helpers

There are places where sysfs links to rdev are handled
in a same way. Add the helper functions to consolidate
them.

Signed-off-by: Namhyung Kim <namhyung@gmail.com>
Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/md.c    | 47 ++++++++++++++---------------------------------
 drivers/md/md.h    | 14 ++++++++++++++
 drivers/md/raid1.c | 15 +++++----------
 drivers/md/raid5.c | 10 +++-------
 4 files changed, 36 insertions(+), 50 deletions(-)

(limited to 'drivers/md/raid1.c')

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 91e31e260b4a..0398dc42a956 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -2459,7 +2459,6 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len)
 {
 	char *e;
 	int err;
-	char nm[20];
 	int slot = simple_strtoul(buf, &e, 10);
 	if (strncmp(buf, "none", 4)==0)
 		slot = -1;
@@ -2482,8 +2481,7 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len)
 			hot_remove_disk(rdev->mddev, rdev->raid_disk);
 		if (err)
 			return err;
-		sprintf(nm, "rd%d", rdev->raid_disk);
-		sysfs_remove_link(&rdev->mddev->kobj, nm);
+		sysfs_unlink_rdev(rdev->mddev, rdev);
 		rdev->raid_disk = -1;
 		set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
 		md_wakeup_thread(rdev->mddev->thread);
@@ -2522,8 +2520,7 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len)
 			return err;
 		} else
 			sysfs_notify_dirent_safe(rdev->sysfs_state);
-		sprintf(nm, "rd%d", rdev->raid_disk);
-		if (sysfs_create_link(&rdev->mddev->kobj, &rdev->kobj, nm))
+		if (sysfs_link_rdev(rdev->mddev, rdev))
 			/* failure here is OK */;
 		/* don't wakeup anyone, leave that to userspace. */
 	} else {
@@ -3149,15 +3146,13 @@ level_store(mddev_t *mddev, const char *buf, size_t len)
 	}
 
 	list_for_each_entry(rdev, &mddev->disks, same_set) {
-		char nm[20];
 		if (rdev->raid_disk < 0)
 			continue;
 		if (rdev->new_raid_disk >= mddev->raid_disks)
 			rdev->new_raid_disk = -1;
 		if (rdev->new_raid_disk == rdev->raid_disk)
 			continue;
-		sprintf(nm, "rd%d", rdev->raid_disk);
-		sysfs_remove_link(&mddev->kobj, nm);
+		sysfs_unlink_rdev(mddev, rdev);
 	}
 	list_for_each_entry(rdev, &mddev->disks, same_set) {
 		if (rdev->raid_disk < 0)
@@ -3168,11 +3163,10 @@ level_store(mddev_t *mddev, const char *buf, size_t len)
 		if (rdev->raid_disk < 0)
 			clear_bit(In_sync, &rdev->flags);
 		else {
-			char nm[20];
-			sprintf(nm, "rd%d", rdev->raid_disk);
-			if(sysfs_create_link(&mddev->kobj, &rdev->kobj, nm))
-				printk("md: cannot register %s for %s after level change\n",
-				       nm, mdname(mddev));
+			if (sysfs_link_rdev(mddev, rdev))
+				printk(KERN_WARNING "md: cannot register rd%d"
+				       " for %s after level change\n",
+				       rdev->raid_disk, mdname(mddev));
 		}
 	}
 
@@ -4621,12 +4615,9 @@ int md_run(mddev_t *mddev)
 	smp_wmb();
 	mddev->ready = 1;
 	list_for_each_entry(rdev, &mddev->disks, same_set)
-		if (rdev->raid_disk >= 0) {
-			char nm[20];
-			sprintf(nm, "rd%d", rdev->raid_disk);
-			if (sysfs_create_link(&mddev->kobj, &rdev->kobj, nm))
+		if (rdev->raid_disk >= 0)
+			if (sysfs_link_rdev(mddev, rdev))
 				/* failure here is OK */;
-		}
 	
 	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
 	
@@ -4854,11 +4845,8 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open)
 		sysfs_notify_dirent_safe(mddev->sysfs_state);
 
 		list_for_each_entry(rdev, &mddev->disks, same_set)
-			if (rdev->raid_disk >= 0) {
-				char nm[20];
-				sprintf(nm, "rd%d", rdev->raid_disk);
-				sysfs_remove_link(&mddev->kobj, nm);
-			}
+			if (rdev->raid_disk >= 0)
+				sysfs_unlink_rdev(mddev, rdev);
 
 		set_capacity(disk, 0);
 		mutex_unlock(&mddev->open_mutex);
@@ -7077,9 +7065,7 @@ static int remove_and_add_spares(mddev_t *mddev)
 		    atomic_read(&rdev->nr_pending)==0) {
 			if (mddev->pers->hot_remove_disk(
 				    mddev, rdev->raid_disk)==0) {
-				char nm[20];
-				sprintf(nm,"rd%d", rdev->raid_disk);
-				sysfs_remove_link(&mddev->kobj, nm);
+				sysfs_unlink_rdev(mddev, rdev);
 				rdev->raid_disk = -1;
 			}
 		}
@@ -7096,10 +7082,7 @@ static int remove_and_add_spares(mddev_t *mddev)
 				rdev->recovery_offset = 0;
 				if (mddev->pers->
 				    hot_add_disk(mddev, rdev) == 0) {
-					char nm[20];
-					sprintf(nm, "rd%d", rdev->raid_disk);
-					if (sysfs_create_link(&mddev->kobj,
-							      &rdev->kobj, nm))
+					if (sysfs_link_rdev(mddev, rdev))
 						/* failure here is OK */;
 					spares++;
 					md_new_event(mddev);
@@ -7219,9 +7202,7 @@ void md_check_recovery(mddev_t *mddev)
 				    atomic_read(&rdev->nr_pending)==0) {
 					if (mddev->pers->hot_remove_disk(
 						    mddev, rdev->raid_disk)==0) {
-						char nm[20];
-						sprintf(nm,"rd%d", rdev->raid_disk);
-						sysfs_remove_link(&mddev->kobj, nm);
+						sysfs_unlink_rdev(mddev, rdev);
 						rdev->raid_disk = -1;
 					}
 				}
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 1c26c7a08ae6..6863f722cd2a 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -413,6 +413,20 @@ static inline char * mdname (mddev_t * mddev)
 	return mddev->gendisk ? mddev->gendisk->disk_name : "mdX";
 }
 
+static inline int sysfs_link_rdev(mddev_t *mddev, mdk_rdev_t *rdev)
+{
+	char nm[20];
+	sprintf(nm, "rd%d", rdev->raid_disk);
+	return sysfs_create_link(&mddev->kobj, &rdev->kobj, nm);
+}
+
+static inline void sysfs_unlink_rdev(mddev_t *mddev, mdk_rdev_t *rdev)
+{
+	char nm[20];
+	sprintf(nm, "rd%d", rdev->raid_disk);
+	sysfs_remove_link(&mddev->kobj, nm);
+}
+
 /*
  * iterates through some rdev ringlist. It's safe to remove the
  * current 'rdev'. Dont touch 'tmp' though.
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index d3a8f4bb4fc3..1d79a041db09 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -2159,18 +2159,13 @@ static int raid1_reshape(mddev_t *mddev)
 	for (d = d2 = 0; d < conf->raid_disks; d++) {
 		mdk_rdev_t *rdev = conf->mirrors[d].rdev;
 		if (rdev && rdev->raid_disk != d2) {
-			char nm[20];
-			sprintf(nm, "rd%d", rdev->raid_disk);
-			sysfs_remove_link(&mddev->kobj, nm);
+			sysfs_unlink_rdev(mddev, rdev);
 			rdev->raid_disk = d2;
-			sprintf(nm, "rd%d", rdev->raid_disk);
-			sysfs_remove_link(&mddev->kobj, nm);
-			if (sysfs_create_link(&mddev->kobj,
-					      &rdev->kobj, nm))
+			sysfs_unlink_rdev(mddev, rdev);
+			if (sysfs_link_rdev(mddev, rdev))
 				printk(KERN_WARNING
-				       "md/raid1:%s: cannot register "
-				       "%s\n",
-				       mdname(mddev), nm);
+				       "md/raid1:%s: cannot register rd%d\n",
+				       mdname(mddev), rdev->raid_disk);
 		}
 		if (rdev)
 			newmirrors[d2++].rdev = rdev;
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 467e8e1cd3d2..0cd591472e1f 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -5152,16 +5152,14 @@ static int raid5_start_reshape(mddev_t *mddev)
 			if (rdev->raid_disk < 0 &&
 			    !test_bit(Faulty, &rdev->flags)) {
 				if (raid5_add_disk(mddev, rdev) == 0) {
-					char nm[20];
 					if (rdev->raid_disk
 					    >= conf->previous_raid_disks) {
 						set_bit(In_sync, &rdev->flags);
 						added_devices++;
 					} else
 						rdev->recovery_offset = 0;
-					sprintf(nm, "rd%d", rdev->raid_disk);
-					if (sysfs_create_link(&mddev->kobj,
-							      &rdev->kobj, nm))
+
+					if (sysfs_link_rdev(mddev, rdev))
 						/* Failure here is OK */;
 				}
 			} else if (rdev->raid_disk >= conf->previous_raid_disks
@@ -5257,9 +5255,7 @@ static void raid5_finish_reshape(mddev_t *mddev)
 			     d++) {
 				mdk_rdev_t *rdev = conf->disks[d].rdev;
 				if (rdev && raid5_remove_disk(mddev, d) == 0) {
-					char nm[20];
-					sprintf(nm, "rd%d", rdev->raid_disk);
-					sysfs_remove_link(&mddev->kobj, nm);
+					sysfs_unlink_rdev(mddev, rdev);
 					rdev->raid_disk = -1;
 				}
 			}
-- 
cgit v1.2.3


From 5389042ffa36976caa45a79af16081d759001fa7 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Wed, 27 Jul 2011 11:00:36 +1000
Subject: md: change managed of recovery_disabled.

If we hit a read error while recovering a mirror, we want to abort the
recovery without necessarily failing the disk - as having a disk this
a read error is better than not having an array at all.

Currently this is managed with a per-array flag "recovery_disabled"
and is only implemented for RAID1.  For RAID10 we will need finer
grained control as we might want to disable recovery for individual
devices separately.

So push more of the decision making into the personality.
'recovery_disabled' is now a 'cookie' which is copied when the
personality want to disable recovery and is changed when a device is
added to the array as this is used as a trigger to 'try recovery
again'.

This will allow RAID10 to get the control that it needs.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/md.c    | 4 ++--
 drivers/md/md.h    | 9 ++++++---
 drivers/md/raid1.c | 7 +++++--
 drivers/md/raid1.h | 6 ++++++
 4 files changed, 19 insertions(+), 7 deletions(-)

(limited to 'drivers/md/raid1.c')

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 77bd8d8708e4..c7d9c6af4634 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -1922,7 +1922,7 @@ static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev)
 	bd_link_disk_holder(rdev->bdev, mddev->gendisk);
 
 	/* May as well allow recovery to be retried once */
-	mddev->recovery_disabled = 0;
+	mddev->recovery_disabled++;
 
 	return 0;
 
@@ -7070,7 +7070,7 @@ static int remove_and_add_spares(mddev_t *mddev)
 			}
 		}
 
-	if (mddev->degraded && !mddev->recovery_disabled) {
+	if (mddev->degraded) {
 		list_for_each_entry(rdev, &mddev->disks, same_set) {
 			if (rdev->raid_disk >= 0 &&
 			    !test_bit(In_sync, &rdev->flags) &&
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 6863f722cd2a..de5455d30d41 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -239,9 +239,12 @@ struct mddev_s
 #define	MD_RECOVERY_FROZEN	9
 
 	unsigned long			recovery;
-	int				recovery_disabled; /* if we detect that recovery
-							    * will always fail, set this
-							    * so we don't loop trying */
+	/* If a RAID personality determines that recovery (of a particular
+	 * device) will fail due to a read error on the source device, it
+	 * takes a copy of this number and does not attempt recovery again
+	 * until this number changes.
+	 */
+	int				recovery_disabled;
 
 	int				in_sync;	/* know to not need resync */
 	/* 'open_mutex' avoids races between 'md_open' and 'do_md_stop', so
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 1d79a041db09..44069b38d6dd 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -956,7 +956,7 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev)
 		 * However don't try a recovery from this drive as
 		 * it is very likely to fail.
 		 */
-		mddev->recovery_disabled = 1;
+		conf->recovery_disabled = mddev->recovery_disabled;
 		return;
 	}
 	if (test_and_clear_bit(In_sync, &rdev->flags)) {
@@ -1052,6 +1052,9 @@ static int raid1_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
 	int first = 0;
 	int last = mddev->raid_disks - 1;
 
+	if (mddev->recovery_disabled == conf->recovery_disabled)
+		return -EBUSY;
+
 	if (rdev->raid_disk >= 0)
 		first = last = rdev->raid_disk;
 
@@ -1107,7 +1110,7 @@ static int raid1_remove_disk(mddev_t *mddev, int number)
 		 * is not possible.
 		 */
 		if (!test_bit(Faulty, &rdev->flags) &&
-		    !mddev->recovery_disabled &&
+		    mddev->recovery_disabled != conf->recovery_disabled &&
 		    mddev->degraded < conf->raid_disks) {
 			err = -EBUSY;
 			goto abort;
diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h
index e743a64fac4f..3cd18cfda2ad 100644
--- a/drivers/md/raid1.h
+++ b/drivers/md/raid1.h
@@ -48,6 +48,12 @@ struct r1_private_data_s {
 					    * (fresh device added).
 					    * Cleared when a sync completes.
 					    */
+	int			recovery_disabled; /* when the same as
+						    * mddev->recovery_disabled
+						    * we don't allow recovery
+						    * to be attempted as we
+						    * expect a read error
+						    */
 
 	wait_queue_head_t	wait_barrier;
 
-- 
cgit v1.2.3


From 9d3d80113df824a266c5db3fac357a036ebc0b62 Mon Sep 17 00:00:00 2001
From: Namhyung Kim <namhyung@gmail.com>
Date: Wed, 27 Jul 2011 11:00:36 +1000
Subject: md/raid1: move rdev->corrected_errors counting

Read errors are considered to corrected if write-back and re-read
cycle is finished without further problems. Thus moving the rdev->
corrected_errors counting after the re-reading looks more reasonable
IMHO. Also included a couple of whitespace fixes on sync_page_io().

Signed-off-by: Namhyung Kim <namhyung@gmail.com>
Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/raid1.c | 17 ++++++-----------
 1 file changed, 6 insertions(+), 11 deletions(-)

(limited to 'drivers/md/raid1.c')

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 44069b38d6dd..a7e69081187f 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1224,9 +1224,7 @@ static int fix_sync_read_error(r1bio_t *r1_bio)
 				 * active, and resync is currently active
 				 */
 				rdev = conf->mirrors[d].rdev;
-				if (sync_page_io(rdev,
-						 sect,
-						 s<<9,
+				if (sync_page_io(rdev, sect, s<<9,
 						 bio->bi_io_vec[idx].bv_page,
 						 READ, false)) {
 					success = 1;
@@ -1261,16 +1259,13 @@ static int fix_sync_read_error(r1bio_t *r1_bio)
 			if (r1_bio->bios[d]->bi_end_io != end_sync_read)
 				continue;
 			rdev = conf->mirrors[d].rdev;
-			if (sync_page_io(rdev,
-					 sect,
-					 s<<9,
+			if (sync_page_io(rdev, sect, s<<9,
 					 bio->bi_io_vec[idx].bv_page,
 					 WRITE, false) == 0) {
 				r1_bio->bios[d]->bi_end_io = NULL;
 				rdev_dec_pending(rdev, mddev);
 				md_error(mddev, rdev);
-			} else
-				atomic_add(s, &rdev->corrected_errors);
+			}
 		}
 		d = start;
 		while (d != r1_bio->read_disk) {
@@ -1280,12 +1275,12 @@ static int fix_sync_read_error(r1bio_t *r1_bio)
 			if (r1_bio->bios[d]->bi_end_io != end_sync_read)
 				continue;
 			rdev = conf->mirrors[d].rdev;
-			if (sync_page_io(rdev,
-					 sect,
-					 s<<9,
+			if (sync_page_io(rdev, sect, s<<9,
 					 bio->bi_io_vec[idx].bv_page,
 					 READ, false) == 0)
 				md_error(mddev, rdev);
+			else
+				atomic_add(s, &rdev->corrected_errors);
 		}
 		sectors -= s;
 		sect += s;
-- 
cgit v1.2.3


From 654e8b5abc0a793f0c029128db6e4804691a383e Mon Sep 17 00:00:00 2001
From: Jonathan Brassow <jbrassow@redhat.com>
Date: Wed, 27 Jul 2011 11:00:36 +1000
Subject: MD: raid1 s/sysfs_notify_dirent/sysfs_notify_dirent_safe

If device-mapper creates a RAID1 array that includes devices to
be rebuilt, it will deref a NULL pointer when finished because
sysfs is not used by device-mapper instantiated RAID devices.

Signed-off-by: Jonathan Brassow <jbrassow@redhat.com>
Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/raid1.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'drivers/md/raid1.c')

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index a7e69081187f..3cbf0ac2aaad 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1031,7 +1031,7 @@ static int raid1_spare_active(mddev_t *mddev)
 		    && !test_bit(Faulty, &rdev->flags)
 		    && !test_and_set_bit(In_sync, &rdev->flags)) {
 			count++;
-			sysfs_notify_dirent(rdev->sysfs_state);
+			sysfs_notify_dirent_safe(rdev->sysfs_state);
 		}
 	}
 	spin_lock_irqsave(&conf->device_lock, flags);
-- 
cgit v1.2.3


From 34b343cff4354ab9864be83be88405fd53d928a0 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Thu, 28 Jul 2011 11:31:47 +1000
Subject: md: don't allow arrays to contain devices with bad blocks.

As no personality understand bad block lists yet, we must
reject any device that is known to contain bad blocks.
As the personalities get taught, these tests can be removed.

This only applies to raid1/raid5/raid10.
For linear/raid0/multipath/faulty the whole concept of bad blocks
doesn't mean anything so there is no point adding the checks.

Signed-off-by: NeilBrown <neilb@suse.de>
Reviewed-by: Namhyung Kim <namhyung@gmail.com>
---
 drivers/md/raid1.c  | 7 +++++++
 drivers/md/raid10.c | 8 ++++++++
 drivers/md/raid5.c  | 7 +++++++
 3 files changed, 22 insertions(+)

(limited to 'drivers/md/raid1.c')

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 3cbf0ac2aaad..8db311d7cddc 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1055,6 +1055,9 @@ static int raid1_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
 	if (mddev->recovery_disabled == conf->recovery_disabled)
 		return -EBUSY;
 
+	if (rdev->badblocks.count)
+		return -EINVAL;
+
 	if (rdev->raid_disk >= 0)
 		first = last = rdev->raid_disk;
 
@@ -1994,6 +1997,10 @@ static int run(mddev_t *mddev)
 			blk_queue_segment_boundary(mddev->queue,
 						   PAGE_CACHE_SIZE - 1);
 		}
+		if (rdev->badblocks.count) {
+			printk(KERN_ERR "md/raid1: Cannot handle bad blocks yet\n");
+			return -EINVAL;
+		}
 	}
 
 	mddev->degraded = 0;
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 5def27c28be7..8aadd2f52dc8 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1101,6 +1101,9 @@ static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
 	int first = 0;
 	int last = conf->raid_disks - 1;
 
+	if (rdev->badblocks.count)
+		return -EINVAL;
+
 	if (mddev->recovery_cp < MaxSector)
 		/* only hot-add to in-sync arrays, as recovery is
 		 * very different from resync
@@ -2263,6 +2266,11 @@ static int run(mddev_t *mddev)
 				 (conf->raid_disks / conf->near_copies));
 
 	list_for_each_entry(rdev, &mddev->disks, same_set) {
+
+		if (rdev->badblocks.count) {
+			printk(KERN_ERR "md/raid10: cannot handle bad blocks yet\n");
+			goto out_free_conf;
+		}
 		disk_idx = rdev->raid_disk;
 		if (disk_idx >= conf->raid_disks
 		    || disk_idx < 0)
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index b874f42694e2..719445004dd9 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -4667,6 +4667,10 @@ static int run(mddev_t *mddev)
 	 * 0 for a fully functional array, 1 or 2 for a degraded array.
 	 */
 	list_for_each_entry(rdev, &mddev->disks, same_set) {
+		if (rdev->badblocks.count) {
+			printk(KERN_ERR "md/raid5: cannot handle bad blocks yet\n");
+			goto abort;
+		}
 		if (rdev->raid_disk < 0)
 			continue;
 		if (test_bit(In_sync, &rdev->flags)) {
@@ -4975,6 +4979,9 @@ static int raid5_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
 	int first = 0;
 	int last = conf->raid_disks - 1;
 
+	if (rdev->badblocks.count)
+		return -EINVAL;
+
 	if (has_failed(conf))
 		/* no point adding a device */
 		return -EINVAL;
-- 
cgit v1.2.3


From d2eb35acfdccbe2a3622ed6cc441a5482148423b Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Thu, 28 Jul 2011 11:31:48 +1000
Subject: md/raid1: avoid reading from known bad blocks.

Now that we have a bad block list, we should not read from those
blocks.
There are several main parts to this:
  1/ read_balance needs to check for bad blocks, and return not only
     the chosen device, but also how many good blocks are available
     there.
  2/ fix_read_error needs to avoid trying to read from bad blocks.
  3/ read submission must be ready to issue multiple reads to
     different devices as different bad blocks on different devices
     could mean that a single large read cannot be served by any one
     device, but can still be served by the array.
     This requires keeping count of the number of outstanding requests
     per bio.  This count is stored in 'bi_phys_segments'
  4/ retrying a read needs to also be ready to submit a smaller read
     and queue another request for the rest.

This does not yet handle bad blocks when reading to perform resync,
recovery, or check.

'md_trim_bio' will also be used for RAID10, so put it in md.c and
export it.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/md.c    |  49 +++++++++++++
 drivers/md/md.h    |   1 +
 drivers/md/raid1.c | 208 +++++++++++++++++++++++++++++++++++++++++++++--------
 drivers/md/raid1.h |   4 ++
 4 files changed, 233 insertions(+), 29 deletions(-)

(limited to 'drivers/md/raid1.c')

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 7ae3c5a18001..48217e8aa0eb 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -215,6 +215,55 @@ struct bio *bio_clone_mddev(struct bio *bio, gfp_t gfp_mask,
 }
 EXPORT_SYMBOL_GPL(bio_clone_mddev);
 
+void md_trim_bio(struct bio *bio, int offset, int size)
+{
+	/* 'bio' is a cloned bio which we need to trim to match
+	 * the given offset and size.
+	 * This requires adjusting bi_sector, bi_size, and bi_io_vec
+	 */
+	int i;
+	struct bio_vec *bvec;
+	int sofar = 0;
+
+	size <<= 9;
+	if (offset == 0 && size == bio->bi_size)
+		return;
+
+	bio->bi_sector += offset;
+	bio->bi_size = size;
+	offset <<= 9;
+	clear_bit(BIO_SEG_VALID, &bio->bi_flags);
+
+	while (bio->bi_idx < bio->bi_vcnt &&
+	       bio->bi_io_vec[bio->bi_idx].bv_len <= offset) {
+		/* remove this whole bio_vec */
+		offset -= bio->bi_io_vec[bio->bi_idx].bv_len;
+		bio->bi_idx++;
+	}
+	if (bio->bi_idx < bio->bi_vcnt) {
+		bio->bi_io_vec[bio->bi_idx].bv_offset += offset;
+		bio->bi_io_vec[bio->bi_idx].bv_len -= offset;
+	}
+	/* avoid any complications with bi_idx being non-zero*/
+	if (bio->bi_idx) {
+		memmove(bio->bi_io_vec, bio->bi_io_vec+bio->bi_idx,
+			(bio->bi_vcnt - bio->bi_idx) * sizeof(struct bio_vec));
+		bio->bi_vcnt -= bio->bi_idx;
+		bio->bi_idx = 0;
+	}
+	/* Make sure vcnt and last bv are not too big */
+	bio_for_each_segment(bvec, bio, i) {
+		if (sofar + bvec->bv_len > size)
+			bvec->bv_len = size - sofar;
+		if (bvec->bv_len == 0) {
+			bio->bi_vcnt = i;
+			break;
+		}
+		sofar += bvec->bv_len;
+	}
+}
+EXPORT_SYMBOL_GPL(md_trim_bio);
+
 /*
  * We have a system wide 'event count' that is incremented
  * on any 'interesting' event, and readers of /proc/mdstat
diff --git a/drivers/md/md.h b/drivers/md/md.h
index aea9e9ff8a33..7c3192c0a29a 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -575,4 +575,5 @@ extern struct bio *bio_clone_mddev(struct bio *bio, gfp_t gfp_mask,
 extern struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs,
 				   mddev_t *mddev);
 extern int mddev_check_plugged(mddev_t *mddev);
+extern void md_trim_bio(struct bio *bio, int offset, int size);
 #endif /* _MD_MD_H */
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 8db311d7cddc..cc3939dc9e3d 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -41,11 +41,7 @@
 #include "bitmap.h"
 
 #define DEBUG 0
-#if DEBUG
-#define PRINTK(x...) printk(x)
-#else
-#define PRINTK(x...)
-#endif
+#define PRINTK(x...) do { if (DEBUG) printk(x); } while (0)
 
 /*
  * Number of guaranteed r1bios in case of extreme VM load:
@@ -177,12 +173,6 @@ static void free_r1bio(r1bio_t *r1_bio)
 {
 	conf_t *conf = r1_bio->mddev->private;
 
-	/*
-	 * Wake up any possible resync thread that waits for the device
-	 * to go idle.
-	 */
-	allow_barrier(conf);
-
 	put_all_bios(conf, r1_bio);
 	mempool_free(r1_bio, conf->r1bio_pool);
 }
@@ -223,6 +213,33 @@ static void reschedule_retry(r1bio_t *r1_bio)
  * operation and are ready to return a success/failure code to the buffer
  * cache layer.
  */
+static void call_bio_endio(r1bio_t *r1_bio)
+{
+	struct bio *bio = r1_bio->master_bio;
+	int done;
+	conf_t *conf = r1_bio->mddev->private;
+
+	if (bio->bi_phys_segments) {
+		unsigned long flags;
+		spin_lock_irqsave(&conf->device_lock, flags);
+		bio->bi_phys_segments--;
+		done = (bio->bi_phys_segments == 0);
+		spin_unlock_irqrestore(&conf->device_lock, flags);
+	} else
+		done = 1;
+
+	if (!test_bit(R1BIO_Uptodate, &r1_bio->state))
+		clear_bit(BIO_UPTODATE, &bio->bi_flags);
+	if (done) {
+		bio_endio(bio, 0);
+		/*
+		 * Wake up any possible resync thread that waits for the device
+		 * to go idle.
+		 */
+		allow_barrier(conf);
+	}
+}
+
 static void raid_end_bio_io(r1bio_t *r1_bio)
 {
 	struct bio *bio = r1_bio->master_bio;
@@ -235,8 +252,7 @@ static void raid_end_bio_io(r1bio_t *r1_bio)
 			(unsigned long long) bio->bi_sector +
 				(bio->bi_size >> 9) - 1);
 
-		bio_endio(bio,
-			test_bit(R1BIO_Uptodate, &r1_bio->state) ? 0 : -EIO);
+		call_bio_endio(r1_bio);
 	}
 	free_r1bio(r1_bio);
 }
@@ -295,6 +311,7 @@ static void raid1_end_read_request(struct bio *bio, int error)
 			bdevname(conf->mirrors[mirror].rdev->bdev,
 				 b),
 			(unsigned long long)r1_bio->sector);
+		set_bit(R1BIO_ReadError, &r1_bio->state);
 		reschedule_retry(r1_bio);
 	}
 
@@ -381,7 +398,7 @@ static void raid1_end_write_request(struct bio *bio, int error)
 				       (unsigned long long) mbio->bi_sector,
 				       (unsigned long long) mbio->bi_sector +
 				       (mbio->bi_size >> 9) - 1);
-				bio_endio(mbio, 0);
+				call_bio_endio(r1_bio);
 			}
 		}
 	}
@@ -412,10 +429,11 @@ static void raid1_end_write_request(struct bio *bio, int error)
  *
  * The rdev for the device selected will have nr_pending incremented.
  */
-static int read_balance(conf_t *conf, r1bio_t *r1_bio)
+static int read_balance(conf_t *conf, r1bio_t *r1_bio, int *max_sectors)
 {
 	const sector_t this_sector = r1_bio->sector;
-	const int sectors = r1_bio->sectors;
+	int sectors;
+	int best_good_sectors;
 	int start_disk;
 	int best_disk;
 	int i;
@@ -430,8 +448,11 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio)
 	 * We take the first readable disk when above the resync window.
 	 */
  retry:
+	sectors = r1_bio->sectors;
 	best_disk = -1;
 	best_dist = MaxSector;
+	best_good_sectors = 0;
+
 	if (conf->mddev->recovery_cp < MaxSector &&
 	    (this_sector + sectors >= conf->next_resync)) {
 		choose_first = 1;
@@ -443,6 +464,9 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio)
 
 	for (i = 0 ; i < conf->raid_disks ; i++) {
 		sector_t dist;
+		sector_t first_bad;
+		int bad_sectors;
+
 		int disk = start_disk + i;
 		if (disk >= conf->raid_disks)
 			disk -= conf->raid_disks;
@@ -465,6 +489,35 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio)
 		/* This is a reasonable device to use.  It might
 		 * even be best.
 		 */
+		if (is_badblock(rdev, this_sector, sectors,
+				&first_bad, &bad_sectors)) {
+			if (best_dist < MaxSector)
+				/* already have a better device */
+				continue;
+			if (first_bad <= this_sector) {
+				/* cannot read here. If this is the 'primary'
+				 * device, then we must not read beyond
+				 * bad_sectors from another device..
+				 */
+				bad_sectors -= (this_sector - first_bad);
+				if (choose_first && sectors > bad_sectors)
+					sectors = bad_sectors;
+				if (best_good_sectors > sectors)
+					best_good_sectors = sectors;
+
+			} else {
+				sector_t good_sectors = first_bad - this_sector;
+				if (good_sectors > best_good_sectors) {
+					best_good_sectors = good_sectors;
+					best_disk = disk;
+				}
+				if (choose_first)
+					break;
+			}
+			continue;
+		} else
+			best_good_sectors = sectors;
+
 		dist = abs(this_sector - conf->mirrors[disk].head_position);
 		if (choose_first
 		    /* Don't change to another disk for sequential reads */
@@ -493,10 +546,12 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio)
 			rdev_dec_pending(rdev, conf->mddev);
 			goto retry;
 		}
+		sectors = best_good_sectors;
 		conf->next_seq_sect = this_sector + sectors;
 		conf->last_used = best_disk;
 	}
 	rcu_read_unlock();
+	*max_sectors = sectors;
 
 	return best_disk;
 }
@@ -763,11 +818,25 @@ static int make_request(mddev_t *mddev, struct bio * bio)
 	r1_bio->mddev = mddev;
 	r1_bio->sector = bio->bi_sector;
 
+	/* We might need to issue multiple reads to different
+	 * devices if there are bad blocks around, so we keep
+	 * track of the number of reads in bio->bi_phys_segments.
+	 * If this is 0, there is only one r1_bio and no locking
+	 * will be needed when requests complete.  If it is
+	 * non-zero, then it is the number of not-completed requests.
+	 */
+	bio->bi_phys_segments = 0;
+	clear_bit(BIO_SEG_VALID, &bio->bi_flags);
+
 	if (rw == READ) {
 		/*
 		 * read balancing logic:
 		 */
-		int rdisk = read_balance(conf, r1_bio);
+		int max_sectors;
+		int rdisk;
+
+read_again:
+		rdisk = read_balance(conf, r1_bio, &max_sectors);
 
 		if (rdisk < 0) {
 			/* couldn't find anywhere to read from */
@@ -788,6 +857,8 @@ static int make_request(mddev_t *mddev, struct bio * bio)
 		r1_bio->read_disk = rdisk;
 
 		read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev);
+		md_trim_bio(read_bio, r1_bio->sector - bio->bi_sector,
+			    max_sectors);
 
 		r1_bio->bios[rdisk] = read_bio;
 
@@ -797,7 +868,38 @@ static int make_request(mddev_t *mddev, struct bio * bio)
 		read_bio->bi_rw = READ | do_sync;
 		read_bio->bi_private = r1_bio;
 
-		generic_make_request(read_bio);
+		if (max_sectors < r1_bio->sectors) {
+			/* could not read all from this device, so we will
+			 * need another r1_bio.
+			 */
+			int sectors_handled;
+
+			sectors_handled = (r1_bio->sector + max_sectors
+					   - bio->bi_sector);
+			r1_bio->sectors = max_sectors;
+			spin_lock_irq(&conf->device_lock);
+			if (bio->bi_phys_segments == 0)
+				bio->bi_phys_segments = 2;
+			else
+				bio->bi_phys_segments++;
+			spin_unlock_irq(&conf->device_lock);
+			/* Cannot call generic_make_request directly
+			 * as that will be queued in __make_request
+			 * and subsequent mempool_alloc might block waiting
+			 * for it.  So hand bio over to raid1d.
+			 */
+			reschedule_retry(r1_bio);
+
+			r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO);
+
+			r1_bio->master_bio = bio;
+			r1_bio->sectors = (bio->bi_size >> 9) - sectors_handled;
+			r1_bio->state = 0;
+			r1_bio->mddev = mddev;
+			r1_bio->sector = bio->bi_sector + sectors_handled;
+			goto read_again;
+		} else
+			generic_make_request(read_bio);
 		return 0;
 	}
 
@@ -849,8 +951,6 @@ static int make_request(mddev_t *mddev, struct bio * bio)
 		goto retry_write;
 	}
 
-	BUG_ON(targets == 0); /* we never fail the last device */
-
 	if (targets < conf->raid_disks) {
 		/* array is degraded, we will not clear the bitmap
 		 * on I/O completion (see raid1_end_write_request) */
@@ -1425,7 +1525,7 @@ static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio)
  *
  *	1.	Retries failed read operations on working mirrors.
  *	2.	Updates the raid superblock when problems encounter.
- *	3.	Performs writes following reads for array syncronising.
+ *	3.	Performs writes following reads for array synchronising.
  */
 
 static void fix_read_error(conf_t *conf, int read_disk,
@@ -1448,9 +1548,14 @@ static void fix_read_error(conf_t *conf, int read_disk,
 			 * which is the thread that might remove
 			 * a device.  If raid1d ever becomes multi-threaded....
 			 */
+			sector_t first_bad;
+			int bad_sectors;
+
 			rdev = conf->mirrors[d].rdev;
 			if (rdev &&
 			    test_bit(In_sync, &rdev->flags) &&
+			    is_badblock(rdev, sect, s,
+					&first_bad, &bad_sectors) == 0 &&
 			    sync_page_io(rdev, sect, s<<9,
 					 conf->tmppage, READ, false))
 				success = 1;
@@ -1546,9 +1651,11 @@ static void raid1d(mddev_t *mddev)
 		conf = mddev->private;
 		if (test_bit(R1BIO_IsSync, &r1_bio->state))
 			sync_request_write(mddev, r1_bio);
-		else {
+		else if (test_bit(R1BIO_ReadError, &r1_bio->state)) {
 			int disk;
+			int max_sectors;
 
+			clear_bit(R1BIO_ReadError, &r1_bio->state);
 			/* we got a read error. Maybe the drive is bad.  Maybe just
 			 * the block and we can fix it.
 			 * We freeze all other IO, and try reading the block from
@@ -1568,21 +1675,28 @@ static void raid1d(mddev_t *mddev)
 					 conf->mirrors[r1_bio->read_disk].rdev);
 
 			bio = r1_bio->bios[r1_bio->read_disk];
-			if ((disk=read_balance(conf, r1_bio)) == -1) {
+			bdevname(bio->bi_bdev, b);
+read_more:
+			disk = read_balance(conf, r1_bio, &max_sectors);
+			if (disk == -1) {
 				printk(KERN_ALERT "md/raid1:%s: %s: unrecoverable I/O"
 				       " read error for block %llu\n",
-				       mdname(mddev),
-				       bdevname(bio->bi_bdev,b),
+				       mdname(mddev), b,
 				       (unsigned long long)r1_bio->sector);
 				raid_end_bio_io(r1_bio);
 			} else {
 				const unsigned long do_sync = r1_bio->master_bio->bi_rw & REQ_SYNC;
-				r1_bio->bios[r1_bio->read_disk] =
-					mddev->ro ? IO_BLOCKED : NULL;
+				if (bio) {
+					r1_bio->bios[r1_bio->read_disk] =
+						mddev->ro ? IO_BLOCKED : NULL;
+					bio_put(bio);
+				}
 				r1_bio->read_disk = disk;
-				bio_put(bio);
 				bio = bio_clone_mddev(r1_bio->master_bio,
 						      GFP_NOIO, mddev);
+				md_trim_bio(bio,
+					    r1_bio->sector - bio->bi_sector,
+					    max_sectors);
 				r1_bio->bios[r1_bio->read_disk] = bio;
 				rdev = conf->mirrors[disk].rdev;
 				printk_ratelimited(
@@ -1597,8 +1711,44 @@ static void raid1d(mddev_t *mddev)
 				bio->bi_end_io = raid1_end_read_request;
 				bio->bi_rw = READ | do_sync;
 				bio->bi_private = r1_bio;
-				generic_make_request(bio);
+				if (max_sectors < r1_bio->sectors) {
+					/* Drat - have to split this up more */
+					struct bio *mbio = r1_bio->master_bio;
+					int sectors_handled =
+						r1_bio->sector + max_sectors
+						- mbio->bi_sector;
+					r1_bio->sectors = max_sectors;
+					spin_lock_irq(&conf->device_lock);
+					if (mbio->bi_phys_segments == 0)
+						mbio->bi_phys_segments = 2;
+					else
+						mbio->bi_phys_segments++;
+					spin_unlock_irq(&conf->device_lock);
+					generic_make_request(bio);
+					bio = NULL;
+
+					r1_bio = mempool_alloc(conf->r1bio_pool,
+							       GFP_NOIO);
+
+					r1_bio->master_bio = mbio;
+					r1_bio->sectors = (mbio->bi_size >> 9)
+						- sectors_handled;
+					r1_bio->state = 0;
+					set_bit(R1BIO_ReadError,
+						&r1_bio->state);
+					r1_bio->mddev = mddev;
+					r1_bio->sector = mbio->bi_sector
+						+ sectors_handled;
+
+					goto read_more;
+				} else
+					generic_make_request(bio);
 			}
+		} else {
+			/* just a partial read to be scheduled from separate
+			 * context
+			 */
+			generic_make_request(r1_bio->bios[r1_bio->read_disk]);
 		}
 		cond_resched();
 	}
diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h
index 3cd18cfda2ad..aa6af37ca01b 100644
--- a/drivers/md/raid1.h
+++ b/drivers/md/raid1.h
@@ -123,6 +123,10 @@ struct r1bio_s {
 #define	R1BIO_IsSync	1
 #define	R1BIO_Degraded	2
 #define	R1BIO_BehindIO	3
+/* Set ReadError on bios that experience a readerror so that
+ * raid1d knows what to do with them.
+ */
+#define R1BIO_ReadError 4
 /* For write-behind requests, we call bi_end_io when
  * the last non-write-behind device completes, providing
  * any write was successful.  Otherwise we call when
-- 
cgit v1.2.3


From 06f603851fa90bcd236328438278d4dc8b655495 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Thu, 28 Jul 2011 11:31:48 +1000
Subject: md/raid1: avoid reading known bad blocks during resync

When performing resync/etc, keep the size of the request
small enough that it doesn't overlap any known bad blocks.
Devices with badblocks at the start of the request are completely
excluded.
If there is nowhere to read from due to bad blocks, record
a bad block on each target device.

Now that we never read from known-bad-blocks we can allow devices with
known-bad-blocks into a RAID1.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/raid1.c | 97 +++++++++++++++++++++++++++++++++++++++++-------------
 1 file changed, 75 insertions(+), 22 deletions(-)

(limited to 'drivers/md/raid1.c')

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index cc3939dc9e3d..8c31c39b6f8c 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1155,9 +1155,6 @@ static int raid1_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
 	if (mddev->recovery_disabled == conf->recovery_disabled)
 		return -EBUSY;
 
-	if (rdev->badblocks.count)
-		return -EINVAL;
-
 	if (rdev->raid_disk >= 0)
 		first = last = rdev->raid_disk;
 
@@ -1303,6 +1300,9 @@ static int fix_sync_read_error(r1bio_t *r1_bio)
 	 * We don't need to freeze the array, because being in an
 	 * active sync request, there is no normal IO, and
 	 * no overlapping syncs.
+	 * We don't need to check is_badblock() again as we
+	 * made sure that anything with a bad block in range
+	 * will have bi_end_io clear.
 	 */
 	mddev_t *mddev = r1_bio->mddev;
 	conf_t *conf = mddev->private;
@@ -1792,6 +1792,8 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
 	int write_targets = 0, read_targets = 0;
 	sector_t sync_blocks;
 	int still_degraded = 0;
+	int good_sectors = RESYNC_SECTORS;
+	int min_bad = 0; /* number of sectors that are bad in all devices */
 
 	if (!conf->r1buf_pool)
 		if (init_resync(conf))
@@ -1879,36 +1881,89 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
 
 		rdev = rcu_dereference(conf->mirrors[i].rdev);
 		if (rdev == NULL ||
-			   test_bit(Faulty, &rdev->flags)) {
+		    test_bit(Faulty, &rdev->flags)) {
 			still_degraded = 1;
-			continue;
 		} else if (!test_bit(In_sync, &rdev->flags)) {
 			bio->bi_rw = WRITE;
 			bio->bi_end_io = end_sync_write;
 			write_targets ++;
 		} else {
 			/* may need to read from here */
-			bio->bi_rw = READ;
-			bio->bi_end_io = end_sync_read;
-			if (test_bit(WriteMostly, &rdev->flags)) {
-				if (wonly < 0)
-					wonly = i;
-			} else {
-				if (disk < 0)
-					disk = i;
+			sector_t first_bad = MaxSector;
+			int bad_sectors;
+
+			if (is_badblock(rdev, sector_nr, good_sectors,
+					&first_bad, &bad_sectors)) {
+				if (first_bad > sector_nr)
+					good_sectors = first_bad - sector_nr;
+				else {
+					bad_sectors -= (sector_nr - first_bad);
+					if (min_bad == 0 ||
+					    min_bad > bad_sectors)
+						min_bad = bad_sectors;
+				}
+			}
+			if (sector_nr < first_bad) {
+				if (test_bit(WriteMostly, &rdev->flags)) {
+					if (wonly < 0)
+						wonly = i;
+				} else {
+					if (disk < 0)
+						disk = i;
+				}
+				bio->bi_rw = READ;
+				bio->bi_end_io = end_sync_read;
+				read_targets++;
 			}
-			read_targets++;
 		}
-		atomic_inc(&rdev->nr_pending);
-		bio->bi_sector = sector_nr + rdev->data_offset;
-		bio->bi_bdev = rdev->bdev;
-		bio->bi_private = r1_bio;
+		if (bio->bi_end_io) {
+			atomic_inc(&rdev->nr_pending);
+			bio->bi_sector = sector_nr + rdev->data_offset;
+			bio->bi_bdev = rdev->bdev;
+			bio->bi_private = r1_bio;
+		}
 	}
 	rcu_read_unlock();
 	if (disk < 0)
 		disk = wonly;
 	r1_bio->read_disk = disk;
 
+	if (read_targets == 0 && min_bad > 0) {
+		/* These sectors are bad on all InSync devices, so we
+		 * need to mark them bad on all write targets
+		 */
+		int ok = 1;
+		for (i = 0 ; i < conf->raid_disks ; i++)
+			if (r1_bio->bios[i]->bi_end_io == end_sync_write) {
+				mdk_rdev_t *rdev =
+					rcu_dereference(conf->mirrors[i].rdev);
+				ok = rdev_set_badblocks(rdev, sector_nr,
+							min_bad, 0
+					) && ok;
+			}
+		set_bit(MD_CHANGE_DEVS, &mddev->flags);
+		*skipped = 1;
+		put_buf(r1_bio);
+
+		if (!ok) {
+			/* Cannot record the badblocks, so need to
+			 * abort the resync.
+			 * If there are multiple read targets, could just
+			 * fail the really bad ones ???
+			 */
+			conf->recovery_disabled = mddev->recovery_disabled;
+			set_bit(MD_RECOVERY_INTR, &mddev->recovery);
+			return 0;
+		} else
+			return min_bad;
+
+	}
+	if (min_bad > 0 && min_bad < good_sectors) {
+		/* only resync enough to reach the next bad->good
+		 * transition */
+		good_sectors = min_bad;
+	}
+
 	if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) && read_targets > 0)
 		/* extra read targets are also write targets */
 		write_targets += read_targets-1;
@@ -1925,6 +1980,8 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
 
 	if (max_sector > mddev->resync_max)
 		max_sector = mddev->resync_max; /* Don't do IO beyond here */
+	if (max_sector > sector_nr + good_sectors)
+		max_sector = sector_nr + good_sectors;
 	nr_sectors = 0;
 	sync_blocks = 0;
 	do {
@@ -2147,10 +2204,6 @@ static int run(mddev_t *mddev)
 			blk_queue_segment_boundary(mddev->queue,
 						   PAGE_CACHE_SIZE - 1);
 		}
-		if (rdev->badblocks.count) {
-			printk(KERN_ERR "md/raid1: Cannot handle bad blocks yet\n");
-			return -EINVAL;
-		}
 	}
 
 	mddev->degraded = 0;
-- 
cgit v1.2.3


From de393cdea66cbd63c90725663f400c76faf1b255 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Thu, 28 Jul 2011 11:31:48 +1000
Subject: md: make it easier to wait for bad blocks to be acknowledged.

It is only safe to choose not to write to a bad block if that bad
block is safely recorded in metadata - i.e. if it has been
'acknowledged'.

If it hasn't we need to wait for the acknowledgement.

We support that using rdev->blocked wait and
md_wait_for_blocked_rdev by introducing a new device flag
'BlockedBadBlock'.

This flag is only advisory.
It is cleared whenever we acknowledge a bad block, so that a waiter
can re-check the particular bad blocks that it is interested it.

It should be set by a caller when they find they need to wait.
This (set after test) is inherently racy, but as
md_wait_for_blocked_rdev already has a timeout, losing the race will
have minimal impact.

When we clear "Blocked" was also clear "BlockedBadBlocks" incase it
was set incorrectly (see above race).

We also modify the way we manage 'Blocked' to fit better with the new
handling of 'BlockedBadBlocks' and to make it consistent between
externally managed and internally managed metadata.   This requires
that each raidXd loop checks if the metadata needs to be written and
triggers a write (md_check_recovery) if needed.  Otherwise a queued
write request might cause raidXd to wait for the metadata to write,
and only that thread can write it.

Before writing metadata, we set FaultRecorded for all devices that
are Faulty, then after writing the metadata we clear Blocked for any
device for which the Fault was certainly Recorded.

The 'faulty' device flag now appears in sysfs if the device is faulty
*or* it has unacknowledged bad blocks.  So user-space which does not
understand bad blocks can continue to function correctly.
User space which does, should not assume a device is faulty until it
sees the 'faulty' flag, and then sees the list of unacknowledged bad
blocks is empty.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/md.c     | 77 ++++++++++++++++++++++++++++++++++++-----------------
 drivers/md/md.h     | 25 +++++++++++++++--
 drivers/md/raid1.c  |  3 +++
 drivers/md/raid10.c |  3 +++
 drivers/md/raid5.c  |  4 +++
 5 files changed, 85 insertions(+), 27 deletions(-)

(limited to 'drivers/md/raid1.c')

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 1520d18c5af5..a6b6471da2bc 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -2341,8 +2341,18 @@ repeat:
 	if (!mddev->persistent) {
 		clear_bit(MD_CHANGE_CLEAN, &mddev->flags);
 		clear_bit(MD_CHANGE_DEVS, &mddev->flags);
-		if (!mddev->external)
+		if (!mddev->external) {
 			clear_bit(MD_CHANGE_PENDING, &mddev->flags);
+			list_for_each_entry(rdev, &mddev->disks, same_set) {
+				if (rdev->badblocks.changed) {
+					md_ack_all_badblocks(&rdev->badblocks);
+					md_error(mddev, rdev);
+				}
+				clear_bit(Blocked, &rdev->flags);
+				clear_bit(BlockedBadBlocks, &rdev->flags);
+				wake_up(&rdev->blocked_wait);
+			}
+		}
 		wake_up(&mddev->sb_wait);
 		return;
 	}
@@ -2399,9 +2409,12 @@ repeat:
 		mddev->events --;
 	}
 
-	list_for_each_entry(rdev, &mddev->disks, same_set)
+	list_for_each_entry(rdev, &mddev->disks, same_set) {
 		if (rdev->badblocks.changed)
 			any_badblocks_changed++;
+		if (test_bit(Faulty, &rdev->flags))
+			set_bit(FaultRecorded, &rdev->flags);
+	}
 
 	sync_sbs(mddev, nospares);
 	spin_unlock_irq(&mddev->write_lock);
@@ -2458,9 +2471,15 @@ repeat:
 	if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
 		sysfs_notify(&mddev->kobj, NULL, "sync_completed");
 
-	if (any_badblocks_changed)
-		list_for_each_entry(rdev, &mddev->disks, same_set)
+	list_for_each_entry(rdev, &mddev->disks, same_set) {
+		if (test_and_clear_bit(FaultRecorded, &rdev->flags))
+			clear_bit(Blocked, &rdev->flags);
+
+		if (any_badblocks_changed)
 			md_ack_all_badblocks(&rdev->badblocks);
+		clear_bit(BlockedBadBlocks, &rdev->flags);
+		wake_up(&rdev->blocked_wait);
+	}
 }
 
 /* words written to sysfs files may, or may not, be \n terminated.
@@ -2495,7 +2514,8 @@ state_show(mdk_rdev_t *rdev, char *page)
 	char *sep = "";
 	size_t len = 0;
 
-	if (test_bit(Faulty, &rdev->flags)) {
+	if (test_bit(Faulty, &rdev->flags) ||
+	    rdev->badblocks.unacked_exist) {
 		len+= sprintf(page+len, "%sfaulty",sep);
 		sep = ",";
 	}
@@ -2507,7 +2527,8 @@ state_show(mdk_rdev_t *rdev, char *page)
 		len += sprintf(page+len, "%swrite_mostly",sep);
 		sep = ",";
 	}
-	if (test_bit(Blocked, &rdev->flags)) {
+	if (test_bit(Blocked, &rdev->flags) ||
+	    rdev->badblocks.unacked_exist) {
 		len += sprintf(page+len, "%sblocked", sep);
 		sep = ",";
 	}
@@ -2527,12 +2548,12 @@ static ssize_t
 state_store(mdk_rdev_t *rdev, const char *buf, size_t len)
 {
 	/* can write
-	 *  faulty  - simulates and error
+	 *  faulty  - simulates an error
 	 *  remove  - disconnects the device
 	 *  writemostly - sets write_mostly
 	 *  -writemostly - clears write_mostly
-	 *  blocked - sets the Blocked flag
-	 *  -blocked - clears the Blocked flag
+	 *  blocked - sets the Blocked flags
+	 *  -blocked - clears the Blocked and possibly simulates an error
 	 *  insync - sets Insync providing device isn't active
 	 *  write_error - sets WriteErrorSeen
 	 *  -write_error - clears WriteErrorSeen
@@ -2562,7 +2583,15 @@ state_store(mdk_rdev_t *rdev, const char *buf, size_t len)
 		set_bit(Blocked, &rdev->flags);
 		err = 0;
 	} else if (cmd_match(buf, "-blocked")) {
+		if (!test_bit(Faulty, &rdev->flags) &&
+		    test_bit(BlockedBadBlocks, &rdev->flags)) {
+			/* metadata handler doesn't understand badblocks,
+			 * so we need to fail the device
+			 */
+			md_error(rdev->mddev, rdev);
+		}
 		clear_bit(Blocked, &rdev->flags);
+		clear_bit(BlockedBadBlocks, &rdev->flags);
 		wake_up(&rdev->blocked_wait);
 		set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
 		md_wakeup_thread(rdev->mddev->thread);
@@ -2881,7 +2910,11 @@ static ssize_t bb_show(mdk_rdev_t *rdev, char *page)
 }
 static ssize_t bb_store(mdk_rdev_t *rdev, const char *page, size_t len)
 {
-	return badblocks_store(&rdev->badblocks, page, len, 0);
+	int rv = badblocks_store(&rdev->badblocks, page, len, 0);
+	/* Maybe that ack was all we needed */
+	if (test_and_clear_bit(BlockedBadBlocks, &rdev->flags))
+		wake_up(&rdev->blocked_wait);
+	return rv;
 }
 static struct rdev_sysfs_entry rdev_bad_blocks =
 __ATTR(bad_blocks, S_IRUGO|S_IWUSR, bb_show, bb_store);
@@ -6398,18 +6431,7 @@ void md_error(mddev_t *mddev, mdk_rdev_t *rdev)
 	if (!rdev || test_bit(Faulty, &rdev->flags))
 		return;
 
-	if (mddev->external)
-		set_bit(Blocked, &rdev->flags);
-/*
-	dprintk("md_error dev:%s, rdev:(%d:%d), (caller: %p,%p,%p,%p).\n",
-		mdname(mddev),
-		MAJOR(rdev->bdev->bd_dev), MINOR(rdev->bdev->bd_dev),
-		__builtin_return_address(0),__builtin_return_address(1),
-		__builtin_return_address(2),__builtin_return_address(3));
-*/
-	if (!mddev->pers)
-		return;
-	if (!mddev->pers->error_handler)
+	if (!mddev->pers || !mddev->pers->error_handler)
 		return;
 	mddev->pers->error_handler(mddev,rdev);
 	if (mddev->degraded)
@@ -7286,8 +7308,7 @@ static int remove_and_add_spares(mddev_t *mddev)
 		list_for_each_entry(rdev, &mddev->disks, same_set) {
 			if (rdev->raid_disk >= 0 &&
 			    !test_bit(In_sync, &rdev->flags) &&
-			    !test_bit(Faulty, &rdev->flags) &&
-			    !test_bit(Blocked, &rdev->flags))
+			    !test_bit(Faulty, &rdev->flags))
 				spares++;
 			if (rdev->raid_disk < 0
 			    && !test_bit(Faulty, &rdev->flags)) {
@@ -7533,7 +7554,8 @@ void md_wait_for_blocked_rdev(mdk_rdev_t *rdev, mddev_t *mddev)
 {
 	sysfs_notify_dirent_safe(rdev->sysfs_state);
 	wait_event_timeout(rdev->blocked_wait,
-			   !test_bit(Blocked, &rdev->flags),
+			   !test_bit(Blocked, &rdev->flags) &&
+			   !test_bit(BlockedBadBlocks, &rdev->flags),
 			   msecs_to_jiffies(5000));
 	rdev_dec_pending(rdev, mddev);
 }
@@ -7779,6 +7801,8 @@ static int md_set_badblocks(struct badblocks *bb, sector_t s, int sectors,
 	}
 
 	bb->changed = 1;
+	if (!acknowledged)
+		bb->unacked_exist = 1;
 	write_sequnlock_irq(&bb->lock);
 
 	return rv;
@@ -7923,6 +7947,7 @@ void md_ack_all_badblocks(struct badblocks *bb)
 				p[i] = BB_MAKE(start, len, 1);
 			}
 		}
+		bb->unacked_exist = 0;
 	}
 	write_sequnlock_irq(&bb->lock);
 }
@@ -7970,6 +7995,8 @@ retry:
 				(unsigned long long)s << bb->shift,
 				length << bb->shift);
 	}
+	if (unack && len == 0)
+		bb->unacked_exist = 0;
 
 	if (read_seqretry(&bb->lock, seq))
 		goto retry;
diff --git a/drivers/md/md.h b/drivers/md/md.h
index fa4b607854ac..1e586bb4452e 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -81,12 +81,29 @@ struct mdk_rdev_s
 #define	In_sync		2		/* device is in_sync with rest of array */
 #define	WriteMostly	4		/* Avoid reading if at all possible */
 #define	AutoDetected	7		/* added by auto-detect */
-#define Blocked		8		/* An error occurred on an externally
-					 * managed array, don't allow writes
+#define Blocked		8		/* An error occurred but has not yet
+					 * been acknowledged by the metadata
+					 * handler, so don't allow writes
 					 * until it is cleared */
 #define WriteErrorSeen	9		/* A write error has been seen on this
 					 * device
 					 */
+#define FaultRecorded	10		/* Intermediate state for clearing
+					 * Blocked.  The Fault is/will-be
+					 * recorded in the metadata, but that
+					 * metadata hasn't been stored safely
+					 * on disk yet.
+					 */
+#define BlockedBadBlocks 11		/* A writer is blocked because they
+					 * found an unacknowledged bad-block.
+					 * This can safely be cleared at any
+					 * time, and the writer will re-check.
+					 * It may be set at any time, and at
+					 * worst the writer will timeout and
+					 * re-check.  So setting it as
+					 * accurately as possible is good, but
+					 * not absolutely critical.
+					 */
 	wait_queue_head_t blocked_wait;
 
 	int desc_nr;			/* descriptor index in the superblock */
@@ -124,6 +141,10 @@ struct mdk_rdev_s
 
 	struct badblocks {
 		int	count;		/* count of bad blocks */
+		int	unacked_exist;	/* there probably are unacknowledged
+					 * bad blocks.  This is only cleared
+					 * when a read discovers none
+					 */
 		int	shift;		/* shift from sectors to block size
 					 * a -ve shift means badblocks are
 					 * disabled.*/
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 8c31c39b6f8c..4d40d9d54a20 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1059,6 +1059,7 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev)
 		conf->recovery_disabled = mddev->recovery_disabled;
 		return;
 	}
+	set_bit(Blocked, &rdev->flags);
 	if (test_and_clear_bit(In_sync, &rdev->flags)) {
 		unsigned long flags;
 		spin_lock_irqsave(&conf->device_lock, flags);
@@ -1751,6 +1752,8 @@ read_more:
 			generic_make_request(r1_bio->bios[r1_bio->read_disk]);
 		}
 		cond_resched();
+		if (mddev->flags & ~(1<<MD_CHANGE_PENDING))
+			md_check_recovery(mddev);
 	}
 	blk_finish_plug(&plug);
 }
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 8aadd2f52dc8..fe6692e62215 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1021,6 +1021,7 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev)
 		 */
 		set_bit(MD_RECOVERY_INTR, &mddev->recovery);
 	}
+	set_bit(Blocked, &rdev->flags);
 	set_bit(Faulty, &rdev->flags);
 	set_bit(MD_CHANGE_DEVS, &mddev->flags);
 	printk(KERN_ALERT
@@ -1703,6 +1704,8 @@ static void raid10d(mddev_t *mddev)
 			}
 		}
 		cond_resched();
+		if (mddev->flags & ~(1<<MD_CHANGE_PENDING))
+			md_check_recovery(mddev);
 	}
 	blk_finish_plug(&plug);
 }
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 719445004dd9..304389ba5e27 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -1706,6 +1706,7 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev)
 		 */
 		set_bit(MD_RECOVERY_INTR, &mddev->recovery);
 	}
+	set_bit(Blocked, &rdev->flags);
 	set_bit(Faulty, &rdev->flags);
 	set_bit(MD_CHANGE_DEVS, &mddev->flags);
 	printk(KERN_ALERT
@@ -4143,6 +4144,9 @@ static void raid5d(mddev_t *mddev)
 		release_stripe(sh);
 		cond_resched();
 
+		if (mddev->flags & ~(1<<MD_CHANGE_PENDING))
+			md_check_recovery(mddev);
+
 		spin_lock_irq(&conf->device_lock);
 	}
 	pr_debug("%d stripes handled\n", handled);
-- 
cgit v1.2.3


From 1f68f0c4b677ccd6935ff61e4e6888787505f8dc Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Thu, 28 Jul 2011 11:31:48 +1000
Subject: md/raid1: avoid writing to known-bad blocks on known-bad drives.

If we have seen any write error on a drive, then don't write to
any known-bad blocks on that drive.
If necessary, we divide the write request up into pieces just
like we do for reads, so each piece is either all written or
all not written to any given drive.

Signed-off-by: NeilBrown <neilb@suse.de>
Reviewed-by: Namhyung Kim <namhyung@gmail.com>
---
 drivers/md/raid1.c | 153 ++++++++++++++++++++++++++++++++++++++++-------------
 1 file changed, 115 insertions(+), 38 deletions(-)

(limited to 'drivers/md/raid1.c')

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 4d40d9d54a20..3214606204d2 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -764,7 +764,7 @@ static int make_request(mddev_t *mddev, struct bio * bio)
 	mirror_info_t *mirror;
 	r1bio_t *r1_bio;
 	struct bio *read_bio;
-	int i, targets = 0, disks;
+	int i, disks;
 	struct bitmap *bitmap;
 	unsigned long flags;
 	const int rw = bio_data_dir(bio);
@@ -772,6 +772,9 @@ static int make_request(mddev_t *mddev, struct bio * bio)
 	const unsigned long do_flush_fua = (bio->bi_rw & (REQ_FLUSH | REQ_FUA));
 	mdk_rdev_t *blocked_rdev;
 	int plugged;
+	int first_clone;
+	int sectors_handled;
+	int max_sectors;
 
 	/*
 	 * Register the new request and wait if the reconstruction
@@ -832,7 +835,6 @@ static int make_request(mddev_t *mddev, struct bio * bio)
 		/*
 		 * read balancing logic:
 		 */
-		int max_sectors;
 		int rdisk;
 
 read_again:
@@ -872,7 +874,6 @@ read_again:
 			/* could not read all from this device, so we will
 			 * need another r1_bio.
 			 */
-			int sectors_handled;
 
 			sectors_handled = (r1_bio->sector + max_sectors
 					   - bio->bi_sector);
@@ -906,9 +907,15 @@ read_again:
 	/*
 	 * WRITE:
 	 */
-	/* first select target devices under spinlock and
+	/* first select target devices under rcu_lock and
 	 * inc refcount on their rdev.  Record them by setting
 	 * bios[x] to bio
+	 * If there are known/acknowledged bad blocks on any device on
+	 * which we have seen a write error, we want to avoid writing those
+	 * blocks.
+	 * This potentially requires several writes to write around
+	 * the bad blocks.  Each set of writes gets it's own r1bio
+	 * with a set of bios attached.
 	 */
 	plugged = mddev_check_plugged(mddev);
 
@@ -916,6 +923,7 @@ read_again:
  retry_write:
 	blocked_rdev = NULL;
 	rcu_read_lock();
+	max_sectors = r1_bio->sectors;
 	for (i = 0;  i < disks; i++) {
 		mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev);
 		if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) {
@@ -923,17 +931,56 @@ read_again:
 			blocked_rdev = rdev;
 			break;
 		}
-		if (rdev && !test_bit(Faulty, &rdev->flags)) {
-			atomic_inc(&rdev->nr_pending);
-			if (test_bit(Faulty, &rdev->flags)) {
+		r1_bio->bios[i] = NULL;
+		if (!rdev || test_bit(Faulty, &rdev->flags)) {
+			set_bit(R1BIO_Degraded, &r1_bio->state);
+			continue;
+		}
+
+		atomic_inc(&rdev->nr_pending);
+		if (test_bit(WriteErrorSeen, &rdev->flags)) {
+			sector_t first_bad;
+			int bad_sectors;
+			int is_bad;
+
+			is_bad = is_badblock(rdev, r1_bio->sector,
+					     max_sectors,
+					     &first_bad, &bad_sectors);
+			if (is_bad < 0) {
+				/* mustn't write here until the bad block is
+				 * acknowledged*/
+				set_bit(BlockedBadBlocks, &rdev->flags);
+				blocked_rdev = rdev;
+				break;
+			}
+			if (is_bad && first_bad <= r1_bio->sector) {
+				/* Cannot write here at all */
+				bad_sectors -= (r1_bio->sector - first_bad);
+				if (bad_sectors < max_sectors)
+					/* mustn't write more than bad_sectors
+					 * to other devices yet
+					 */
+					max_sectors = bad_sectors;
 				rdev_dec_pending(rdev, mddev);
-				r1_bio->bios[i] = NULL;
-			} else {
-				r1_bio->bios[i] = bio;
-				targets++;
+				/* We don't set R1BIO_Degraded as that
+				 * only applies if the disk is
+				 * missing, so it might be re-added,
+				 * and we want to know to recover this
+				 * chunk.
+				 * In this case the device is here,
+				 * and the fact that this chunk is not
+				 * in-sync is recorded in the bad
+				 * block log
+				 */
+				continue;
 			}
-		} else
-			r1_bio->bios[i] = NULL;
+			if (is_bad) {
+				int good_sectors = first_bad - r1_bio->sector;
+				if (good_sectors < max_sectors)
+					max_sectors = good_sectors;
+			}
+		}
+		r1_bio->bios[i] = bio;
 	}
 	rcu_read_unlock();
 
@@ -944,48 +991,56 @@ read_again:
 		for (j = 0; j < i; j++)
 			if (r1_bio->bios[j])
 				rdev_dec_pending(conf->mirrors[j].rdev, mddev);
-
+		r1_bio->state = 0;
 		allow_barrier(conf);
 		md_wait_for_blocked_rdev(blocked_rdev, mddev);
 		wait_barrier(conf);
 		goto retry_write;
 	}
 
-	if (targets < conf->raid_disks) {
-		/* array is degraded, we will not clear the bitmap
-		 * on I/O completion (see raid1_end_write_request) */
-		set_bit(R1BIO_Degraded, &r1_bio->state);
+	if (max_sectors < r1_bio->sectors) {
+		/* We are splitting this write into multiple parts, so
+		 * we need to prepare for allocating another r1_bio.
+		 */
+		r1_bio->sectors = max_sectors;
+		spin_lock_irq(&conf->device_lock);
+		if (bio->bi_phys_segments == 0)
+			bio->bi_phys_segments = 2;
+		else
+			bio->bi_phys_segments++;
+		spin_unlock_irq(&conf->device_lock);
 	}
-
-	/* do behind I/O ?
-	 * Not if there are too many, or cannot allocate memory,
-	 * or a reader on WriteMostly is waiting for behind writes 
-	 * to flush */
-	if (bitmap &&
-	    (atomic_read(&bitmap->behind_writes)
-	     < mddev->bitmap_info.max_write_behind) &&
-	    !waitqueue_active(&bitmap->behind_wait))
-		alloc_behind_pages(bio, r1_bio);
+	sectors_handled = r1_bio->sector + max_sectors - bio->bi_sector;
 
 	atomic_set(&r1_bio->remaining, 1);
 	atomic_set(&r1_bio->behind_remaining, 0);
 
-	bitmap_startwrite(bitmap, bio->bi_sector, r1_bio->sectors,
-				test_bit(R1BIO_BehindIO, &r1_bio->state));
+	first_clone = 1;
 	for (i = 0; i < disks; i++) {
 		struct bio *mbio;
 		if (!r1_bio->bios[i])
 			continue;
 
 		mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
-		r1_bio->bios[i] = mbio;
-
-		mbio->bi_sector	= r1_bio->sector + conf->mirrors[i].rdev->data_offset;
-		mbio->bi_bdev = conf->mirrors[i].rdev->bdev;
-		mbio->bi_end_io	= raid1_end_write_request;
-		mbio->bi_rw = WRITE | do_flush_fua | do_sync;
-		mbio->bi_private = r1_bio;
-
+		md_trim_bio(mbio, r1_bio->sector - bio->bi_sector, max_sectors);
+
+		if (first_clone) {
+			/* do behind I/O ?
+			 * Not if there are too many, or cannot
+			 * allocate memory, or a reader on WriteMostly
+			 * is waiting for behind writes to flush */
+			if (bitmap &&
+			    (atomic_read(&bitmap->behind_writes)
+			     < mddev->bitmap_info.max_write_behind) &&
+			    !waitqueue_active(&bitmap->behind_wait))
+				alloc_behind_pages(mbio, r1_bio);
+
+			bitmap_startwrite(bitmap, r1_bio->sector,
+					  r1_bio->sectors,
+					  test_bit(R1BIO_BehindIO,
+						   &r1_bio->state));
+			first_clone = 0;
+		}
 		if (r1_bio->behind_pages) {
 			struct bio_vec *bvec;
 			int j;
@@ -1003,6 +1058,15 @@ read_again:
 				atomic_inc(&r1_bio->behind_remaining);
 		}
 
+		r1_bio->bios[i] = mbio;
+
+		mbio->bi_sector	= (r1_bio->sector +
+				   conf->mirrors[i].rdev->data_offset);
+		mbio->bi_bdev = conf->mirrors[i].rdev->bdev;
+		mbio->bi_end_io	= raid1_end_write_request;
+		mbio->bi_rw = WRITE | do_flush_fua | do_sync;
+		mbio->bi_private = r1_bio;
+
 		atomic_inc(&r1_bio->remaining);
 		spin_lock_irqsave(&conf->device_lock, flags);
 		bio_list_add(&conf->pending_bio_list, mbio);
@@ -1013,6 +1077,19 @@ read_again:
 	/* In case raid1d snuck in to freeze_array */
 	wake_up(&conf->wait_barrier);
 
+	if (sectors_handled < (bio->bi_size >> 9)) {
+		/* We need another r1_bio.  It has already been counted
+		 * in bio->bi_phys_segments
+		 */
+		r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO);
+		r1_bio->master_bio = bio;
+		r1_bio->sectors = (bio->bi_size >> 9) - sectors_handled;
+		r1_bio->state = 0;
+		r1_bio->mddev = mddev;
+		r1_bio->sector = bio->bi_sector + sectors_handled;
+		goto retry_write;
+	}
+
 	if (do_sync || !bitmap || !plugged)
 		md_wakeup_thread(mddev->thread);
 
-- 
cgit v1.2.3


From 4367af556133723d0f443e14ca8170d9447317cb Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Thu, 28 Jul 2011 11:31:49 +1000
Subject: md/raid1: clear bad-block record when write succeeds.

If we succeed in writing to a block that was recorded as
being bad, we clear the bad-block record.

This requires some delayed handling as the bad-block-list update has
to happen in process-context.

Signed-off-by: NeilBrown <neilb@suse.de>
Reviewed-by: Namhyung Kim <namhyung@gmail.com>
---
 drivers/md/raid1.c | 79 ++++++++++++++++++++++++++++++++++++++++++++++--------
 drivers/md/raid1.h | 13 ++++++++-
 2 files changed, 80 insertions(+), 12 deletions(-)

(limited to 'drivers/md/raid1.c')

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 3214606204d2..141de75a6c7c 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -163,7 +163,7 @@ static void put_all_bios(conf_t *conf, r1bio_t *r1_bio)
 
 	for (i = 0; i < conf->raid_disks; i++) {
 		struct bio **bio = r1_bio->bios + i;
-		if (*bio && *bio != IO_BLOCKED)
+		if (!BIO_SPECIAL(*bio))
 			bio_put(*bio);
 		*bio = NULL;
 	}
@@ -337,7 +337,10 @@ static void r1_bio_write_done(r1bio_t *r1_bio)
 				!test_bit(R1BIO_Degraded, &r1_bio->state),
 				test_bit(R1BIO_BehindIO, &r1_bio->state));
 		md_write_end(r1_bio->mddev);
-		raid_end_bio_io(r1_bio);
+		if (test_bit(R1BIO_MadeGood, &r1_bio->state))
+			reschedule_retry(r1_bio);
+		else
+			raid_end_bio_io(r1_bio);
 	}
 }
 
@@ -363,7 +366,7 @@ static void raid1_end_write_request(struct bio *bio, int error)
 		md_error(r1_bio->mddev, conf->mirrors[mirror].rdev);
 		/* an I/O failed, we can't clear the bitmap */
 		set_bit(R1BIO_Degraded, &r1_bio->state);
-	} else
+	} else {
 		/*
 		 * Set R1BIO_Uptodate in our master bio, so that we
 		 * will return a good error code for to the higher
@@ -374,8 +377,20 @@ static void raid1_end_write_request(struct bio *bio, int error)
 		 * to user-side. So if something waits for IO, then it
 		 * will wait for the 'master' bio.
 		 */
+		sector_t first_bad;
+		int bad_sectors;
+
 		set_bit(R1BIO_Uptodate, &r1_bio->state);
 
+		/* Maybe we can clear some bad blocks. */
+		if (is_badblock(conf->mirrors[mirror].rdev,
+				r1_bio->sector, r1_bio->sectors,
+				&first_bad, &bad_sectors)) {
+			r1_bio->bios[mirror] = IO_MADE_GOOD;
+			set_bit(R1BIO_MadeGood, &r1_bio->state);
+		}
+	}
+
 	update_head_pos(mirror, r1_bio);
 
 	if (behind) {
@@ -402,7 +417,9 @@ static void raid1_end_write_request(struct bio *bio, int error)
 			}
 		}
 	}
-	rdev_dec_pending(conf->mirrors[mirror].rdev, conf->mddev);
+	if (r1_bio->bios[mirror] == NULL)
+		rdev_dec_pending(conf->mirrors[mirror].rdev,
+				 conf->mddev);
 
 	/*
 	 * Let's see if all mirrored write operations have finished
@@ -1340,6 +1357,8 @@ static void end_sync_write(struct bio *bio, int error)
 	conf_t *conf = mddev->private;
 	int i;
 	int mirror=0;
+	sector_t first_bad;
+	int bad_sectors;
 
 	for (i = 0; i < conf->raid_disks; i++)
 		if (r1_bio->bios[i] == bio) {
@@ -1358,14 +1377,22 @@ static void end_sync_write(struct bio *bio, int error)
 			sectors_to_go -= sync_blocks;
 		} while (sectors_to_go > 0);
 		md_error(mddev, conf->mirrors[mirror].rdev);
-	}
+	} else if (is_badblock(conf->mirrors[mirror].rdev,
+			       r1_bio->sector,
+			       r1_bio->sectors,
+			       &first_bad, &bad_sectors))
+		set_bit(R1BIO_MadeGood, &r1_bio->state);
 
 	update_head_pos(mirror, r1_bio);
 
 	if (atomic_dec_and_test(&r1_bio->remaining)) {
-		sector_t s = r1_bio->sectors;
-		put_buf(r1_bio);
-		md_done_sync(mddev, s, uptodate);
+		int s = r1_bio->sectors;
+		if (test_bit(R1BIO_MadeGood, &r1_bio->state))
+			reschedule_retry(r1_bio);
+		else {
+			put_buf(r1_bio);
+			md_done_sync(mddev, s, uptodate);
+		}
 	}
 }
 
@@ -1727,9 +1754,39 @@ static void raid1d(mddev_t *mddev)
 
 		mddev = r1_bio->mddev;
 		conf = mddev->private;
-		if (test_bit(R1BIO_IsSync, &r1_bio->state))
-			sync_request_write(mddev, r1_bio);
-		else if (test_bit(R1BIO_ReadError, &r1_bio->state)) {
+		if (test_bit(R1BIO_IsSync, &r1_bio->state)) {
+			if (test_bit(R1BIO_MadeGood, &r1_bio->state)) {
+				int m;
+				int s = r1_bio->sectors;
+				for (m = 0; m < conf->raid_disks ; m++) {
+					struct bio *bio = r1_bio->bios[m];
+					if (bio->bi_end_io != NULL &&
+					    test_bit(BIO_UPTODATE,
+						     &bio->bi_flags)) {
+						rdev = conf->mirrors[m].rdev;
+						rdev_clear_badblocks(
+							rdev,
+							r1_bio->sector,
+							r1_bio->sectors);
+					}
+				}
+				put_buf(r1_bio);
+				md_done_sync(mddev, s, 1);
+			} else
+				sync_request_write(mddev, r1_bio);
+		} else if (test_bit(R1BIO_MadeGood, &r1_bio->state)) {
+			int m;
+			for (m = 0; m < conf->raid_disks ; m++)
+				if (r1_bio->bios[m] == IO_MADE_GOOD) {
+					rdev = conf->mirrors[m].rdev;
+					rdev_clear_badblocks(
+						rdev,
+						r1_bio->sector,
+						r1_bio->sectors);
+					rdev_dec_pending(rdev, mddev);
+				}
+			raid_end_bio_io(r1_bio);
+		} else if (test_bit(R1BIO_ReadError, &r1_bio->state)) {
 			int disk;
 			int max_sectors;
 
diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h
index aa6af37ca01b..f81360d49af4 100644
--- a/drivers/md/raid1.h
+++ b/drivers/md/raid1.h
@@ -116,7 +116,14 @@ struct r1bio_s {
  * correct the read error.  To keep track of bad blocks on a per-bio
  * level, we store IO_BLOCKED in the appropriate 'bios' pointer
  */
-#define IO_BLOCKED ((struct bio*)1)
+#define IO_BLOCKED ((struct bio *)1)
+/* When we successfully write to a known bad-block, we need to remove the
+ * bad-block marking which must be done from process context.  So we record
+ * the success by setting bios[n] to IO_MADE_GOOD
+ */
+#define IO_MADE_GOOD ((struct bio *)2)
+
+#define BIO_SPECIAL(bio) ((unsigned long)bio <= 2)
 
 /* bits for r1bio.state */
 #define	R1BIO_Uptodate	0
@@ -135,6 +142,10 @@ struct r1bio_s {
  * Record that bi_end_io was called with this flag...
  */
 #define	R1BIO_Returned 6
+/* If a write for this request means we can clear some
+ * known-bad-block records, we set this flag
+ */
+#define R1BIO_MadeGood 7
 
 extern int md_raid1_congested(mddev_t *mddev, int bits);
 
-- 
cgit v1.2.3


From 2ca68f5ed7383733102ee53cd8fa4021ecc3b275 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Thu, 28 Jul 2011 11:32:10 +1000
Subject: md/raid1: store behind-write pages in bi_vecs.

When performing write-behind we allocate pages to store the data
during write.
Previously we just keep a list of pages.  Now we keep a list of
bi_vec which includes offset and size.
This means that the r1bio has complete information to create a new
bio which will be needed for retrying after write errors.

Signed-off-by: NeilBrown <neilb@suse.de>
Reviewed-by: Namhyung Kim <namhyung@gmail.com>
---
 drivers/md/raid1.c | 33 +++++++++++++++++----------------
 drivers/md/raid1.h |  2 +-
 2 files changed, 18 insertions(+), 17 deletions(-)

(limited to 'drivers/md/raid1.c')

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 141de75a6c7c..b16d2ee5e9dd 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -327,9 +327,9 @@ static void r1_bio_write_done(r1bio_t *r1_bio)
 			/* free extra copy of the data pages */
 			int i = r1_bio->behind_page_count;
 			while (i--)
-				safe_put_page(r1_bio->behind_pages[i]);
-			kfree(r1_bio->behind_pages);
-			r1_bio->behind_pages = NULL;
+				safe_put_page(r1_bio->behind_bvecs[i].bv_page);
+			kfree(r1_bio->behind_bvecs);
+			r1_bio->behind_bvecs = NULL;
 		}
 		/* clear the bitmap if all writes complete successfully */
 		bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector,
@@ -748,30 +748,31 @@ static void alloc_behind_pages(struct bio *bio, r1bio_t *r1_bio)
 {
 	int i;
 	struct bio_vec *bvec;
-	struct page **pages = kzalloc(bio->bi_vcnt * sizeof(struct page*),
+	struct bio_vec *bvecs = kzalloc(bio->bi_vcnt * sizeof(struct bio_vec),
 					GFP_NOIO);
-	if (unlikely(!pages))
+	if (unlikely(!bvecs))
 		return;
 
 	bio_for_each_segment(bvec, bio, i) {
-		pages[i] = alloc_page(GFP_NOIO);
-		if (unlikely(!pages[i]))
+		bvecs[i] = *bvec;
+		bvecs[i].bv_page = alloc_page(GFP_NOIO);
+		if (unlikely(!bvecs[i].bv_page))
 			goto do_sync_io;
-		memcpy(kmap(pages[i]) + bvec->bv_offset,
-			kmap(bvec->bv_page) + bvec->bv_offset, bvec->bv_len);
-		kunmap(pages[i]);
+		memcpy(kmap(bvecs[i].bv_page) + bvec->bv_offset,
+		       kmap(bvec->bv_page) + bvec->bv_offset, bvec->bv_len);
+		kunmap(bvecs[i].bv_page);
 		kunmap(bvec->bv_page);
 	}
-	r1_bio->behind_pages = pages;
+	r1_bio->behind_bvecs = bvecs;
 	r1_bio->behind_page_count = bio->bi_vcnt;
 	set_bit(R1BIO_BehindIO, &r1_bio->state);
 	return;
 
 do_sync_io:
 	for (i = 0; i < bio->bi_vcnt; i++)
-		if (pages[i])
-			put_page(pages[i]);
-	kfree(pages);
+		if (bvecs[i].bv_page)
+			put_page(bvecs[i].bv_page);
+	kfree(bvecs);
 	PRINTK("%dB behind alloc failed, doing sync I/O\n", bio->bi_size);
 }
 
@@ -1058,7 +1059,7 @@ read_again:
 						   &r1_bio->state));
 			first_clone = 0;
 		}
-		if (r1_bio->behind_pages) {
+		if (r1_bio->behind_bvecs) {
 			struct bio_vec *bvec;
 			int j;
 
@@ -1070,7 +1071,7 @@ read_again:
 			 * them all
 			 */
 			__bio_for_each_segment(bvec, mbio, j, 0)
-				bvec->bv_page = r1_bio->behind_pages[j];
+				bvec->bv_page = r1_bio->behind_bvecs[j].bv_page;
 			if (test_bit(WriteMostly, &conf->mirrors[i].rdev->flags))
 				atomic_inc(&r1_bio->behind_remaining);
 		}
diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h
index f81360d49af4..b7885474e96c 100644
--- a/drivers/md/raid1.h
+++ b/drivers/md/raid1.h
@@ -101,7 +101,7 @@ struct r1bio_s {
 
 	struct list_head	retry_list;
 	/* Next two are only valid when R1BIO_BehindIO is set */
-	struct page		**behind_pages;
+	struct bio_vec		*behind_bvecs;
 	int			behind_page_count;
 	/*
 	 * if the IO is in WRITE direction, then multiple bios are used.
-- 
cgit v1.2.3


From cd5ff9a16f0831f68c1024d5d776075b5123b034 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Thu, 28 Jul 2011 11:32:41 +1000
Subject: md/raid1:  Handle write errors by updating badblock log.

When we get a write error (in the data area, not in metadata),
update the badblock log rather than failing the whole device.

As the write may well be many blocks, we trying writing each
block individually and only log the ones which fail.

Signed-off-by: NeilBrown <neilb@suse.de>
Reviewed-by: Namhyung Kim <namhyung@gmail.com>
---
 drivers/md/raid1.c | 168 +++++++++++++++++++++++++++++++++++++++++++++--------
 drivers/md/raid1.h |   3 +-
 2 files changed, 147 insertions(+), 24 deletions(-)

(limited to 'drivers/md/raid1.c')

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index b16d2ee5e9dd..b45d641f4f33 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -318,25 +318,34 @@ static void raid1_end_read_request(struct bio *bio, int error)
 	rdev_dec_pending(conf->mirrors[mirror].rdev, conf->mddev);
 }
 
+static void close_write(r1bio_t *r1_bio)
+{
+	/* it really is the end of this request */
+	if (test_bit(R1BIO_BehindIO, &r1_bio->state)) {
+		/* free extra copy of the data pages */
+		int i = r1_bio->behind_page_count;
+		while (i--)
+			safe_put_page(r1_bio->behind_bvecs[i].bv_page);
+		kfree(r1_bio->behind_bvecs);
+		r1_bio->behind_bvecs = NULL;
+	}
+	/* clear the bitmap if all writes complete successfully */
+	bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector,
+			r1_bio->sectors,
+			!test_bit(R1BIO_Degraded, &r1_bio->state),
+			test_bit(R1BIO_BehindIO, &r1_bio->state));
+	md_write_end(r1_bio->mddev);
+}
+
 static void r1_bio_write_done(r1bio_t *r1_bio)
 {
-	if (atomic_dec_and_test(&r1_bio->remaining))
-	{
-		/* it really is the end of this request */
-		if (test_bit(R1BIO_BehindIO, &r1_bio->state)) {
-			/* free extra copy of the data pages */
-			int i = r1_bio->behind_page_count;
-			while (i--)
-				safe_put_page(r1_bio->behind_bvecs[i].bv_page);
-			kfree(r1_bio->behind_bvecs);
-			r1_bio->behind_bvecs = NULL;
-		}
-		/* clear the bitmap if all writes complete successfully */
-		bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector,
-				r1_bio->sectors,
-				!test_bit(R1BIO_Degraded, &r1_bio->state),
-				test_bit(R1BIO_BehindIO, &r1_bio->state));
-		md_write_end(r1_bio->mddev);
+	if (!atomic_dec_and_test(&r1_bio->remaining))
+		return;
+
+	if (test_bit(R1BIO_WriteError, &r1_bio->state))
+		reschedule_retry(r1_bio);
+	else {
+		close_write(r1_bio);
 		if (test_bit(R1BIO_MadeGood, &r1_bio->state))
 			reschedule_retry(r1_bio);
 		else
@@ -360,12 +369,10 @@ static void raid1_end_write_request(struct bio *bio, int error)
 	/*
 	 * 'one mirror IO has finished' event handler:
 	 */
-	r1_bio->bios[mirror] = NULL;
-	to_put = bio;
 	if (!uptodate) {
-		md_error(r1_bio->mddev, conf->mirrors[mirror].rdev);
-		/* an I/O failed, we can't clear the bitmap */
-		set_bit(R1BIO_Degraded, &r1_bio->state);
+		set_bit(WriteErrorSeen,
+			&conf->mirrors[mirror].rdev->flags);
+		set_bit(R1BIO_WriteError, &r1_bio->state);
 	} else {
 		/*
 		 * Set R1BIO_Uptodate in our master bio, so that we
@@ -380,6 +387,8 @@ static void raid1_end_write_request(struct bio *bio, int error)
 		sector_t first_bad;
 		int bad_sectors;
 
+		r1_bio->bios[mirror] = NULL;
+		to_put = bio;
 		set_bit(R1BIO_Uptodate, &r1_bio->state);
 
 		/* Maybe we can clear some bad blocks. */
@@ -1724,6 +1733,101 @@ static void fix_read_error(conf_t *conf, int read_disk,
 	}
 }
 
+static void bi_complete(struct bio *bio, int error)
+{
+	complete((struct completion *)bio->bi_private);
+}
+
+static int submit_bio_wait(int rw, struct bio *bio)
+{
+	struct completion event;
+	rw |= REQ_SYNC;
+
+	init_completion(&event);
+	bio->bi_private = &event;
+	bio->bi_end_io = bi_complete;
+	submit_bio(rw, bio);
+	wait_for_completion(&event);
+
+	return test_bit(BIO_UPTODATE, &bio->bi_flags);
+}
+
+static int narrow_write_error(r1bio_t *r1_bio, int i)
+{
+	mddev_t *mddev = r1_bio->mddev;
+	conf_t *conf = mddev->private;
+	mdk_rdev_t *rdev = conf->mirrors[i].rdev;
+	int vcnt, idx;
+	struct bio_vec *vec;
+
+	/* bio has the data to be written to device 'i' where
+	 * we just recently had a write error.
+	 * We repeatedly clone the bio and trim down to one block,
+	 * then try the write.  Where the write fails we record
+	 * a bad block.
+	 * It is conceivable that the bio doesn't exactly align with
+	 * blocks.  We must handle this somehow.
+	 *
+	 * We currently own a reference on the rdev.
+	 */
+
+	int block_sectors;
+	sector_t sector;
+	int sectors;
+	int sect_to_write = r1_bio->sectors;
+	int ok = 1;
+
+	if (rdev->badblocks.shift < 0)
+		return 0;
+
+	block_sectors = 1 << rdev->badblocks.shift;
+	sector = r1_bio->sector;
+	sectors = ((sector + block_sectors)
+		   & ~(sector_t)(block_sectors - 1))
+		- sector;
+
+	if (test_bit(R1BIO_BehindIO, &r1_bio->state)) {
+		vcnt = r1_bio->behind_page_count;
+		vec = r1_bio->behind_bvecs;
+		idx = 0;
+		while (vec[idx].bv_page == NULL)
+			idx++;
+	} else {
+		vcnt = r1_bio->master_bio->bi_vcnt;
+		vec = r1_bio->master_bio->bi_io_vec;
+		idx = r1_bio->master_bio->bi_idx;
+	}
+	while (sect_to_write) {
+		struct bio *wbio;
+		if (sectors > sect_to_write)
+			sectors = sect_to_write;
+		/* Write at 'sector' for 'sectors'*/
+
+		wbio = bio_alloc_mddev(GFP_NOIO, vcnt, mddev);
+		memcpy(wbio->bi_io_vec, vec, vcnt * sizeof(struct bio_vec));
+		wbio->bi_sector = r1_bio->sector;
+		wbio->bi_rw = WRITE;
+		wbio->bi_vcnt = vcnt;
+		wbio->bi_size = r1_bio->sectors << 9;
+		wbio->bi_idx = idx;
+
+		md_trim_bio(wbio, sector - r1_bio->sector, sectors);
+		wbio->bi_sector += rdev->data_offset;
+		wbio->bi_bdev = rdev->bdev;
+		if (submit_bio_wait(WRITE, wbio) == 0)
+			/* failure! */
+			ok = rdev_set_badblocks(rdev, sector,
+						sectors, 0)
+				&& ok;
+
+		bio_put(wbio);
+		sect_to_write -= sectors;
+		sector += sectors;
+		sectors = block_sectors;
+	}
+	return ok;
+}
+
 static void raid1d(mddev_t *mddev)
 {
 	r1bio_t *r1_bio;
@@ -1775,7 +1879,8 @@ static void raid1d(mddev_t *mddev)
 				md_done_sync(mddev, s, 1);
 			} else
 				sync_request_write(mddev, r1_bio);
-		} else if (test_bit(R1BIO_MadeGood, &r1_bio->state)) {
+		} else if (test_bit(R1BIO_MadeGood, &r1_bio->state) ||
+			   test_bit(R1BIO_WriteError, &r1_bio->state)) {
 			int m;
 			for (m = 0; m < conf->raid_disks ; m++)
 				if (r1_bio->bios[m] == IO_MADE_GOOD) {
@@ -1785,7 +1890,24 @@ static void raid1d(mddev_t *mddev)
 						r1_bio->sector,
 						r1_bio->sectors);
 					rdev_dec_pending(rdev, mddev);
+				} else if (r1_bio->bios[m] != NULL) {
+					/* This drive got a write error.  We
+					 * need to narrow down and record
+					 * precise write errors.
+					 */
+					if (!narrow_write_error(r1_bio, m)) {
+						md_error(mddev,
+							 conf->mirrors[m].rdev);
+						/* an I/O failed, we can't clear
+						 * the bitmap */
+						set_bit(R1BIO_Degraded,
+							&r1_bio->state);
+					}
+					rdev_dec_pending(conf->mirrors[m].rdev,
+							 mddev);
 				}
+			if (test_bit(R1BIO_WriteError, &r1_bio->state))
+				close_write(r1_bio);
 			raid_end_bio_io(r1_bio);
 		} else if (test_bit(R1BIO_ReadError, &r1_bio->state)) {
 			int disk;
diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h
index b7885474e96c..e0d676b48974 100644
--- a/drivers/md/raid1.h
+++ b/drivers/md/raid1.h
@@ -145,7 +145,8 @@ struct r1bio_s {
 /* If a write for this request means we can clear some
  * known-bad-block records, we set this flag
  */
-#define R1BIO_MadeGood 7
+#define	R1BIO_MadeGood 7
+#define	R1BIO_WriteError 8
 
 extern int md_raid1_congested(mddev_t *mddev, int bits);
 
-- 
cgit v1.2.3


From d8f05d2995d467a91db1af01637e6ffd94660ca8 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Thu, 28 Jul 2011 11:33:00 +1000
Subject: md/raid1: record badblocks found during resync etc.

If we find a bad block while writing as part of resync/recovery we
need to report that back to raid1d which must record the bad block,
or fail the device.

Similarly when fixing a read error, a further error should just
record a bad block if possible rather than failing the device.

Signed-off-by: NeilBrown <neilb@suse.de>
Reviewed-by: Namhyung Kim <namhyung@gmail.com>
---
 drivers/md/raid1.c | 81 ++++++++++++++++++++++++++++++++++--------------------
 1 file changed, 51 insertions(+), 30 deletions(-)

(limited to 'drivers/md/raid1.c')

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index b45d641f4f33..e6957151233b 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1386,7 +1386,9 @@ static void end_sync_write(struct bio *bio, int error)
 			s += sync_blocks;
 			sectors_to_go -= sync_blocks;
 		} while (sectors_to_go > 0);
-		md_error(mddev, conf->mirrors[mirror].rdev);
+		set_bit(WriteErrorSeen,
+			&conf->mirrors[mirror].rdev->flags);
+		set_bit(R1BIO_WriteError, &r1_bio->state);
 	} else if (is_badblock(conf->mirrors[mirror].rdev,
 			       r1_bio->sector,
 			       r1_bio->sectors,
@@ -1397,7 +1399,8 @@ static void end_sync_write(struct bio *bio, int error)
 
 	if (atomic_dec_and_test(&r1_bio->remaining)) {
 		int s = r1_bio->sectors;
-		if (test_bit(R1BIO_MadeGood, &r1_bio->state))
+		if (test_bit(R1BIO_MadeGood, &r1_bio->state) ||
+		    test_bit(R1BIO_WriteError, &r1_bio->state))
 			reschedule_retry(r1_bio);
 		else {
 			put_buf(r1_bio);
@@ -1406,6 +1409,20 @@ static void end_sync_write(struct bio *bio, int error)
 	}
 }
 
+static int r1_sync_page_io(mdk_rdev_t *rdev, sector_t sector,
+			    int sectors, struct page *page, int rw)
+{
+	if (sync_page_io(rdev, sector, sectors << 9, page, rw, false))
+		/* success */
+		return 1;
+	if (rw == WRITE)
+		set_bit(WriteErrorSeen, &rdev->flags);
+	/* need to record an error - either for the block or the device */
+	if (!rdev_set_badblocks(rdev, sector, sectors, 0))
+		md_error(rdev->mddev, rdev);
+	return 0;
+}
+
 static int fix_sync_read_error(r1bio_t *r1_bio)
 {
 	/* Try some synchronous reads of other devices to get
@@ -1477,12 +1494,11 @@ static int fix_sync_read_error(r1bio_t *r1_bio)
 			if (r1_bio->bios[d]->bi_end_io != end_sync_read)
 				continue;
 			rdev = conf->mirrors[d].rdev;
-			if (sync_page_io(rdev, sect, s<<9,
-					 bio->bi_io_vec[idx].bv_page,
-					 WRITE, false) == 0) {
+			if (r1_sync_page_io(rdev, sect, s,
+					    bio->bi_io_vec[idx].bv_page,
+					    WRITE) == 0) {
 				r1_bio->bios[d]->bi_end_io = NULL;
 				rdev_dec_pending(rdev, mddev);
-				md_error(mddev, rdev);
 			}
 		}
 		d = start;
@@ -1493,11 +1509,9 @@ static int fix_sync_read_error(r1bio_t *r1_bio)
 			if (r1_bio->bios[d]->bi_end_io != end_sync_read)
 				continue;
 			rdev = conf->mirrors[d].rdev;
-			if (sync_page_io(rdev, sect, s<<9,
-					 bio->bi_io_vec[idx].bv_page,
-					 READ, false) == 0)
-				md_error(mddev, rdev);
-			else
+			if (r1_sync_page_io(rdev, sect, s,
+					    bio->bi_io_vec[idx].bv_page,
+					    READ) != 0)
 				atomic_add(s, &rdev->corrected_errors);
 		}
 		sectors -= s;
@@ -1682,8 +1696,10 @@ static void fix_read_error(conf_t *conf, int read_disk,
 		} while (!success && d != read_disk);
 
 		if (!success) {
-			/* Cannot read from anywhere -- bye bye array */
-			md_error(mddev, conf->mirrors[read_disk].rdev);
+			/* Cannot read from anywhere - mark it bad */
+			mdk_rdev_t *rdev = conf->mirrors[read_disk].rdev;
+			if (!rdev_set_badblocks(rdev, sect, s, 0))
+				md_error(mddev, rdev);
 			break;
 		}
 		/* write it back and re-read */
@@ -1694,13 +1710,9 @@ static void fix_read_error(conf_t *conf, int read_disk,
 			d--;
 			rdev = conf->mirrors[d].rdev;
 			if (rdev &&
-			    test_bit(In_sync, &rdev->flags)) {
-				if (sync_page_io(rdev, sect, s<<9,
-						 conf->tmppage, WRITE, false)
-				    == 0)
-					/* Well, this device is dead */
-					md_error(mddev, rdev);
-			}
+			    test_bit(In_sync, &rdev->flags))
+				r1_sync_page_io(rdev, sect, s,
+						conf->tmppage, WRITE);
 		}
 		d = start;
 		while (d != read_disk) {
@@ -1711,12 +1723,8 @@ static void fix_read_error(conf_t *conf, int read_disk,
 			rdev = conf->mirrors[d].rdev;
 			if (rdev &&
 			    test_bit(In_sync, &rdev->flags)) {
-				if (sync_page_io(rdev, sect, s<<9,
-						 conf->tmppage, READ, false)
-				    == 0)
-					/* Well, this device is dead */
-					md_error(mddev, rdev);
-				else {
+				if (r1_sync_page_io(rdev, sect, s,
+						    conf->tmppage, READ)) {
 					atomic_add(s, &rdev->corrected_errors);
 					printk(KERN_INFO
 					       "md/raid1:%s: read error corrected "
@@ -1860,20 +1868,33 @@ static void raid1d(mddev_t *mddev)
 		mddev = r1_bio->mddev;
 		conf = mddev->private;
 		if (test_bit(R1BIO_IsSync, &r1_bio->state)) {
-			if (test_bit(R1BIO_MadeGood, &r1_bio->state)) {
+			if (test_bit(R1BIO_MadeGood, &r1_bio->state) ||
+			    test_bit(R1BIO_WriteError, &r1_bio->state)) {
 				int m;
 				int s = r1_bio->sectors;
 				for (m = 0; m < conf->raid_disks ; m++) {
+					mdk_rdev_t *rdev
+						= conf->mirrors[m].rdev;
 					struct bio *bio = r1_bio->bios[m];
-					if (bio->bi_end_io != NULL &&
-					    test_bit(BIO_UPTODATE,
+					if (bio->bi_end_io == NULL)
+						continue;
+					if (test_bit(BIO_UPTODATE,
 						     &bio->bi_flags)) {
-						rdev = conf->mirrors[m].rdev;
 						rdev_clear_badblocks(
 							rdev,
 							r1_bio->sector,
 							r1_bio->sectors);
 					}
+					if (!test_bit(BIO_UPTODATE,
+						      &bio->bi_flags) &&
+					    test_bit(R1BIO_WriteError,
+						     &r1_bio->state)) {
+						if (!rdev_set_badblocks(
+							    rdev,
+							    r1_bio->sector,
+							    r1_bio->sectors, 0))
+							md_error(mddev, rdev);
+					}
 				}
 				put_buf(r1_bio);
 				md_done_sync(mddev, s, 1);
-- 
cgit v1.2.3


From 3a9f28a5117e00a868dd8b4395f9a707ae56764b Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Thu, 28 Jul 2011 11:33:42 +1000
Subject: md/raid1: improve handling of read failure during recovery.

If we cannot read a block from anywhere during recovery, there is
now a better approach than just giving up.
We can record a bad block on each device and keep going - being
careful not to clear the bad block when a write succeeds as it might -
it will be a write of incorrect data.

We have now reached the state where - for raid1 - we only call
md_error if md_set_badblocks has failed.

Signed-off-by: NeilBrown <neilb@suse.de>
Reviewed-by: Namhyung Kim <namhyung@gmail.com>
---
 drivers/md/raid1.c | 41 ++++++++++++++++++++++++++++++++++-------
 1 file changed, 34 insertions(+), 7 deletions(-)

(limited to 'drivers/md/raid1.c')

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index e6957151233b..039e3af72929 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1392,7 +1392,12 @@ static void end_sync_write(struct bio *bio, int error)
 	} else if (is_badblock(conf->mirrors[mirror].rdev,
 			       r1_bio->sector,
 			       r1_bio->sectors,
-			       &first_bad, &bad_sectors))
+			       &first_bad, &bad_sectors) &&
+		   !is_badblock(conf->mirrors[r1_bio->read_disk].rdev,
+				r1_bio->sector,
+				r1_bio->sectors,
+				&first_bad, &bad_sectors)
+		)
 		set_bit(R1BIO_MadeGood, &r1_bio->state);
 
 	update_head_pos(mirror, r1_bio);
@@ -1473,16 +1478,36 @@ static int fix_sync_read_error(r1bio_t *r1_bio)
 
 		if (!success) {
 			char b[BDEVNAME_SIZE];
-			/* Cannot read from anywhere, array is toast */
-			md_error(mddev, conf->mirrors[r1_bio->read_disk].rdev);
+			int abort = 0;
+			/* Cannot read from anywhere, this block is lost.
+			 * Record a bad block on each device.  If that doesn't
+			 * work just disable and interrupt the recovery.
+			 * Don't fail devices as that won't really help.
+			 */
 			printk(KERN_ALERT "md/raid1:%s: %s: unrecoverable I/O read error"
 			       " for block %llu\n",
 			       mdname(mddev),
 			       bdevname(bio->bi_bdev, b),
 			       (unsigned long long)r1_bio->sector);
-			md_done_sync(mddev, r1_bio->sectors, 0);
-			put_buf(r1_bio);
-			return 0;
+			for (d = 0; d < conf->raid_disks; d++) {
+				rdev = conf->mirrors[d].rdev;
+				if (!rdev || test_bit(Faulty, &rdev->flags))
+					continue;
+				if (!rdev_set_badblocks(rdev, sect, s, 0))
+					abort = 1;
+			}
+			if (abort) {
+				mddev->recovery_disabled = 1;
+				set_bit(MD_RECOVERY_INTR, &mddev->recovery);
+				md_done_sync(mddev, r1_bio->sectors, 0);
+				put_buf(r1_bio);
+				return 0;
+			}
+			/* Try next page */
+			sectors -= s;
+			sect += s;
+			idx++;
+			continue;
 		}
 
 		start = d;
@@ -1879,7 +1904,9 @@ static void raid1d(mddev_t *mddev)
 					if (bio->bi_end_io == NULL)
 						continue;
 					if (test_bit(BIO_UPTODATE,
-						     &bio->bi_flags)) {
+						     &bio->bi_flags) &&
+					    test_bit(R1BIO_MadeGood,
+						     &r1_bio->state)) {
 						rdev_clear_badblocks(
 							rdev,
 							r1_bio->sector,
-- 
cgit v1.2.3


From 62096bce231b3760882ed91205fc84682d6b0529 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Thu, 28 Jul 2011 11:38:13 +1000
Subject: md/raid1: factor several functions out or raid1d()

raid1d is too big with several deep branches.
So separate them out into their own functions.

Signed-off-by: NeilBrown <neilb@suse.de>
Reviewed-by: Namhyung Kim <namhyung@gmail.com>
---
 drivers/md/raid1.c | 310 ++++++++++++++++++++++++++---------------------------
 1 file changed, 151 insertions(+), 159 deletions(-)

(limited to 'drivers/md/raid1.c')

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 039e3af72929..32323f0afd89 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1861,21 +1861,160 @@ static int narrow_write_error(r1bio_t *r1_bio, int i)
 	return ok;
 }
 
+static void handle_sync_write_finished(conf_t *conf, r1bio_t *r1_bio)
+{
+	int m;
+	int s = r1_bio->sectors;
+	for (m = 0; m < conf->raid_disks ; m++) {
+		mdk_rdev_t *rdev = conf->mirrors[m].rdev;
+		struct bio *bio = r1_bio->bios[m];
+		if (bio->bi_end_io == NULL)
+			continue;
+		if (test_bit(BIO_UPTODATE, &bio->bi_flags) &&
+		    test_bit(R1BIO_MadeGood, &r1_bio->state)) {
+			rdev_clear_badblocks(rdev, r1_bio->sector, s);
+		}
+		if (!test_bit(BIO_UPTODATE, &bio->bi_flags) &&
+		    test_bit(R1BIO_WriteError, &r1_bio->state)) {
+			if (!rdev_set_badblocks(rdev, r1_bio->sector, s, 0))
+				md_error(conf->mddev, rdev);
+		}
+	}
+	put_buf(r1_bio);
+	md_done_sync(conf->mddev, s, 1);
+}
+
+static void handle_write_finished(conf_t *conf, r1bio_t *r1_bio)
+{
+	int m;
+	for (m = 0; m < conf->raid_disks ; m++)
+		if (r1_bio->bios[m] == IO_MADE_GOOD) {
+			mdk_rdev_t *rdev = conf->mirrors[m].rdev;
+			rdev_clear_badblocks(rdev,
+					     r1_bio->sector,
+					     r1_bio->sectors);
+			rdev_dec_pending(rdev, conf->mddev);
+		} else if (r1_bio->bios[m] != NULL) {
+			/* This drive got a write error.  We need to
+			 * narrow down and record precise write
+			 * errors.
+			 */
+			if (!narrow_write_error(r1_bio, m)) {
+				md_error(conf->mddev,
+					 conf->mirrors[m].rdev);
+				/* an I/O failed, we can't clear the bitmap */
+				set_bit(R1BIO_Degraded, &r1_bio->state);
+			}
+			rdev_dec_pending(conf->mirrors[m].rdev,
+					 conf->mddev);
+		}
+	if (test_bit(R1BIO_WriteError, &r1_bio->state))
+		close_write(r1_bio);
+	raid_end_bio_io(r1_bio);
+}
+
+static void handle_read_error(conf_t *conf, r1bio_t *r1_bio)
+{
+	int disk;
+	int max_sectors;
+	mddev_t *mddev = conf->mddev;
+	struct bio *bio;
+	char b[BDEVNAME_SIZE];
+	mdk_rdev_t *rdev;
+
+	clear_bit(R1BIO_ReadError, &r1_bio->state);
+	/* we got a read error. Maybe the drive is bad.  Maybe just
+	 * the block and we can fix it.
+	 * We freeze all other IO, and try reading the block from
+	 * other devices.  When we find one, we re-write
+	 * and check it that fixes the read error.
+	 * This is all done synchronously while the array is
+	 * frozen
+	 */
+	if (mddev->ro == 0) {
+		freeze_array(conf);
+		fix_read_error(conf, r1_bio->read_disk,
+			       r1_bio->sector, r1_bio->sectors);
+		unfreeze_array(conf);
+	} else
+		md_error(mddev, conf->mirrors[r1_bio->read_disk].rdev);
+
+	bio = r1_bio->bios[r1_bio->read_disk];
+	bdevname(bio->bi_bdev, b);
+read_more:
+	disk = read_balance(conf, r1_bio, &max_sectors);
+	if (disk == -1) {
+		printk(KERN_ALERT "md/raid1:%s: %s: unrecoverable I/O"
+		       " read error for block %llu\n",
+		       mdname(mddev), b, (unsigned long long)r1_bio->sector);
+		raid_end_bio_io(r1_bio);
+	} else {
+		const unsigned long do_sync
+			= r1_bio->master_bio->bi_rw & REQ_SYNC;
+		if (bio) {
+			r1_bio->bios[r1_bio->read_disk] =
+				mddev->ro ? IO_BLOCKED : NULL;
+			bio_put(bio);
+		}
+		r1_bio->read_disk = disk;
+		bio = bio_clone_mddev(r1_bio->master_bio, GFP_NOIO, mddev);
+		md_trim_bio(bio, r1_bio->sector - bio->bi_sector, max_sectors);
+		r1_bio->bios[r1_bio->read_disk] = bio;
+		rdev = conf->mirrors[disk].rdev;
+		printk_ratelimited(KERN_ERR
+				   "md/raid1:%s: redirecting sector %llu"
+				   " to other mirror: %s\n",
+				   mdname(mddev),
+				   (unsigned long long)r1_bio->sector,
+				   bdevname(rdev->bdev, b));
+		bio->bi_sector = r1_bio->sector + rdev->data_offset;
+		bio->bi_bdev = rdev->bdev;
+		bio->bi_end_io = raid1_end_read_request;
+		bio->bi_rw = READ | do_sync;
+		bio->bi_private = r1_bio;
+		if (max_sectors < r1_bio->sectors) {
+			/* Drat - have to split this up more */
+			struct bio *mbio = r1_bio->master_bio;
+			int sectors_handled = (r1_bio->sector + max_sectors
+					       - mbio->bi_sector);
+			r1_bio->sectors = max_sectors;
+			spin_lock_irq(&conf->device_lock);
+			if (mbio->bi_phys_segments == 0)
+				mbio->bi_phys_segments = 2;
+			else
+				mbio->bi_phys_segments++;
+			spin_unlock_irq(&conf->device_lock);
+			generic_make_request(bio);
+			bio = NULL;
+
+			r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO);
+
+			r1_bio->master_bio = mbio;
+			r1_bio->sectors = (mbio->bi_size >> 9)
+					  - sectors_handled;
+			r1_bio->state = 0;
+			set_bit(R1BIO_ReadError, &r1_bio->state);
+			r1_bio->mddev = mddev;
+			r1_bio->sector = mbio->bi_sector + sectors_handled;
+
+			goto read_more;
+		} else
+			generic_make_request(bio);
+	}
+}
+
 static void raid1d(mddev_t *mddev)
 {
 	r1bio_t *r1_bio;
-	struct bio *bio;
 	unsigned long flags;
 	conf_t *conf = mddev->private;
 	struct list_head *head = &conf->retry_list;
-	mdk_rdev_t *rdev;
 	struct blk_plug plug;
 
 	md_check_recovery(mddev);
 
 	blk_start_plug(&plug);
 	for (;;) {
-		char b[BDEVNAME_SIZE];
 
 		if (atomic_read(&mddev->plug_cnt) == 0)
 			flush_pending_writes(conf);
@@ -1894,168 +2033,21 @@ static void raid1d(mddev_t *mddev)
 		conf = mddev->private;
 		if (test_bit(R1BIO_IsSync, &r1_bio->state)) {
 			if (test_bit(R1BIO_MadeGood, &r1_bio->state) ||
-			    test_bit(R1BIO_WriteError, &r1_bio->state)) {
-				int m;
-				int s = r1_bio->sectors;
-				for (m = 0; m < conf->raid_disks ; m++) {
-					mdk_rdev_t *rdev
-						= conf->mirrors[m].rdev;
-					struct bio *bio = r1_bio->bios[m];
-					if (bio->bi_end_io == NULL)
-						continue;
-					if (test_bit(BIO_UPTODATE,
-						     &bio->bi_flags) &&
-					    test_bit(R1BIO_MadeGood,
-						     &r1_bio->state)) {
-						rdev_clear_badblocks(
-							rdev,
-							r1_bio->sector,
-							r1_bio->sectors);
-					}
-					if (!test_bit(BIO_UPTODATE,
-						      &bio->bi_flags) &&
-					    test_bit(R1BIO_WriteError,
-						     &r1_bio->state)) {
-						if (!rdev_set_badblocks(
-							    rdev,
-							    r1_bio->sector,
-							    r1_bio->sectors, 0))
-							md_error(mddev, rdev);
-					}
-				}
-				put_buf(r1_bio);
-				md_done_sync(mddev, s, 1);
-			} else
+			    test_bit(R1BIO_WriteError, &r1_bio->state))
+				handle_sync_write_finished(conf, r1_bio);
+			else
 				sync_request_write(mddev, r1_bio);
 		} else if (test_bit(R1BIO_MadeGood, &r1_bio->state) ||
-			   test_bit(R1BIO_WriteError, &r1_bio->state)) {
-			int m;
-			for (m = 0; m < conf->raid_disks ; m++)
-				if (r1_bio->bios[m] == IO_MADE_GOOD) {
-					rdev = conf->mirrors[m].rdev;
-					rdev_clear_badblocks(
-						rdev,
-						r1_bio->sector,
-						r1_bio->sectors);
-					rdev_dec_pending(rdev, mddev);
-				} else if (r1_bio->bios[m] != NULL) {
-					/* This drive got a write error.  We
-					 * need to narrow down and record
-					 * precise write errors.
-					 */
-					if (!narrow_write_error(r1_bio, m)) {
-						md_error(mddev,
-							 conf->mirrors[m].rdev);
-						/* an I/O failed, we can't clear
-						 * the bitmap */
-						set_bit(R1BIO_Degraded,
-							&r1_bio->state);
-					}
-					rdev_dec_pending(conf->mirrors[m].rdev,
-							 mddev);
-				}
-			if (test_bit(R1BIO_WriteError, &r1_bio->state))
-				close_write(r1_bio);
-			raid_end_bio_io(r1_bio);
-		} else if (test_bit(R1BIO_ReadError, &r1_bio->state)) {
-			int disk;
-			int max_sectors;
-
-			clear_bit(R1BIO_ReadError, &r1_bio->state);
-			/* we got a read error. Maybe the drive is bad.  Maybe just
-			 * the block and we can fix it.
-			 * We freeze all other IO, and try reading the block from
-			 * other devices.  When we find one, we re-write
-			 * and check it that fixes the read error.
-			 * This is all done synchronously while the array is
-			 * frozen
-			 */
-			if (mddev->ro == 0) {
-				freeze_array(conf);
-				fix_read_error(conf, r1_bio->read_disk,
-					       r1_bio->sector,
-					       r1_bio->sectors);
-				unfreeze_array(conf);
-			} else
-				md_error(mddev,
-					 conf->mirrors[r1_bio->read_disk].rdev);
-
-			bio = r1_bio->bios[r1_bio->read_disk];
-			bdevname(bio->bi_bdev, b);
-read_more:
-			disk = read_balance(conf, r1_bio, &max_sectors);
-			if (disk == -1) {
-				printk(KERN_ALERT "md/raid1:%s: %s: unrecoverable I/O"
-				       " read error for block %llu\n",
-				       mdname(mddev), b,
-				       (unsigned long long)r1_bio->sector);
-				raid_end_bio_io(r1_bio);
-			} else {
-				const unsigned long do_sync = r1_bio->master_bio->bi_rw & REQ_SYNC;
-				if (bio) {
-					r1_bio->bios[r1_bio->read_disk] =
-						mddev->ro ? IO_BLOCKED : NULL;
-					bio_put(bio);
-				}
-				r1_bio->read_disk = disk;
-				bio = bio_clone_mddev(r1_bio->master_bio,
-						      GFP_NOIO, mddev);
-				md_trim_bio(bio,
-					    r1_bio->sector - bio->bi_sector,
-					    max_sectors);
-				r1_bio->bios[r1_bio->read_disk] = bio;
-				rdev = conf->mirrors[disk].rdev;
-				printk_ratelimited(
-					KERN_ERR
-					"md/raid1:%s: redirecting sector %llu"
-					" to other mirror: %s\n",
-					mdname(mddev),
-					(unsigned long long)r1_bio->sector,
-					bdevname(rdev->bdev, b));
-				bio->bi_sector = r1_bio->sector + rdev->data_offset;
-				bio->bi_bdev = rdev->bdev;
-				bio->bi_end_io = raid1_end_read_request;
-				bio->bi_rw = READ | do_sync;
-				bio->bi_private = r1_bio;
-				if (max_sectors < r1_bio->sectors) {
-					/* Drat - have to split this up more */
-					struct bio *mbio = r1_bio->master_bio;
-					int sectors_handled =
-						r1_bio->sector + max_sectors
-						- mbio->bi_sector;
-					r1_bio->sectors = max_sectors;
-					spin_lock_irq(&conf->device_lock);
-					if (mbio->bi_phys_segments == 0)
-						mbio->bi_phys_segments = 2;
-					else
-						mbio->bi_phys_segments++;
-					spin_unlock_irq(&conf->device_lock);
-					generic_make_request(bio);
-					bio = NULL;
-
-					r1_bio = mempool_alloc(conf->r1bio_pool,
-							       GFP_NOIO);
-
-					r1_bio->master_bio = mbio;
-					r1_bio->sectors = (mbio->bi_size >> 9)
-						- sectors_handled;
-					r1_bio->state = 0;
-					set_bit(R1BIO_ReadError,
-						&r1_bio->state);
-					r1_bio->mddev = mddev;
-					r1_bio->sector = mbio->bi_sector
-						+ sectors_handled;
-
-					goto read_more;
-				} else
-					generic_make_request(bio);
-			}
-		} else {
+			   test_bit(R1BIO_WriteError, &r1_bio->state))
+			handle_write_finished(conf, r1_bio);
+		else if (test_bit(R1BIO_ReadError, &r1_bio->state))
+			handle_read_error(conf, r1_bio);
+		else
 			/* just a partial read to be scheduled from separate
 			 * context
 			 */
 			generic_make_request(r1_bio->bios[r1_bio->read_disk]);
-		}
+
 		cond_resched();
 		if (mddev->flags & ~(1<<MD_CHANGE_PENDING))
 			md_check_recovery(mddev);
-- 
cgit v1.2.3