From: Jonathan Brassow

This patch gives mirror the ability to handle device failures during
normal write operations.

The 'write_callback' function is called when a write completes.  If all
the writes failed or succeeded, we report failure or success
respectively.  If some of the writes failed, we call fail_mirror, which
increments the error count for the device and selects a new primary (if
necessary).

Note that the primary device can never change while the mirror is not
in-sync (IOW, while recovery is happening).  This means that the
scenario where a failed write changes the primary and gives
recovery_complete a chance to misread the primary never happens.

The fact that the primary can change has necessitated the change to the
default_mirror field: we need to protect against reading garbage while
the primary changes.

We then add the bio to a new list in the mirror set, 'failures'.
(Since we must raise an event, and events can block, we must handle the
failures in the main worker thread.)  For every bio in the 'failures'
list, we call a new function, '__bio_mark_nosync', where we mark the
region 'not-in-sync' in the log and set the region state to RH_NOSYNC.

Backwards compatibility is maintained by ignoring errors if the
DM_FEATURES_HANDLE_ERRORS flag is not present.

Signed-off-by: Jonathan Brassow
Signed-off-by: Alasdair G Kergon
---

 drivers/md/dm-raid1.c |  228 ++++++++++++++++++++++++++++++++++++++++++++------
 1 files changed, 202 insertions(+), 26 deletions(-)

Index: linux-2.6.24-rc8/drivers/md/dm-raid1.c
===================================================================
--- linux-2.6.24-rc8.orig/drivers/md/dm-raid1.c	2008-01-24 13:59:20.000000000 +0000
+++ linux-2.6.24-rc8/drivers/md/dm-raid1.c	2008-01-24 14:04:05.000000000 +0000
@@ -127,9 +127,10 @@ struct mirror_set {
 	struct kcopyd_client *kcopyd_client;
 	uint64_t features;
 
-	spinlock_t lock;	/* protects the next two lists */
+	spinlock_t lock;	/* protects the lists */
 	struct bio_list reads;
 	struct bio_list writes;
+	struct bio_list failures;
 
 	struct dm_io_client *io_client;
 
@@ -138,7 +139,8 @@ struct mirror_set {
 	int in_sync;
 	int log_failure;
 
-	struct mirror *default_mirror;	/* Default mirror */
+	rwlock_t default_mirror_lock;
+	atomic_t default_mirror;	/* Default mirror */
 
 	struct workqueue_struct *kmirrord_wq;
 	struct work_struct kmirrord_work;
@@ -646,6 +648,71 @@ static void bio_set_ms(struct bio *bio,
 	bio->bi_next = (struct bio *) ms;
 }
 
+static struct mirror *get_default_mirror(struct mirror_set *ms)
+{
+	return &ms->mirror[atomic_read(&ms->default_mirror)];
+}
+
+static void set_default_mirror(struct mirror *m)
+{
+	struct mirror_set *ms = m->ms;
+	struct mirror *m0 = &(ms->mirror[0]);
+
+	atomic_set(&ms->default_mirror, m - m0);
+}
+
+/* fail_mirror
+ * @m: mirror device to fail
+ *
+ * If the device is valid, mark it invalid.  Also,
+ * if this is the default mirror device (i.e. the primary
+ * device) and the mirror set is in-sync, choose an
+ * alternative primary device.
+ *
+ * This function must not block.
+ */
+static void fail_mirror(struct mirror *m)
+{
+	struct mirror_set *ms = m->ms;
+	struct mirror *new;
+
+	if (!errors_handled(ms))
+		return;
+
+	/*
+	 * We use the error count as a boolean to determine
+	 * whether the device has failed or not.  While
+	 * we don't use the actual count right now, we
+	 * may wish to have this information available in
+	 * the future.
+	 */
+	if (atomic_inc_return(&m->error_count) > 1)
+		return;
+
+	if (m != get_default_mirror(ms))
+		return;
+
+	/* Change the default mirror, provided it is fully in-sync. */
+	if (!ms->in_sync) {
+		/*
+		 * Better to issue requests to the same failing device
+		 * than to risk returning corrupt data.
+		 */
+		DMERR("Primary mirror (%s) failed while out-of-sync: "
+		      "Reads may fail.", m->dev->name);
+		return;
+	}
+
+	for (new = ms->mirror; new < ms->mirror + ms->nr_mirrors; new++)
+		if (!atomic_read(&new->error_count)) {
+			set_default_mirror(new);
+			break;
+		}
+
+	if (unlikely(new == ms->mirror + ms->nr_mirrors))
+		DMWARN("All sides of mirror have failed.");
+}
+
 /*-----------------------------------------------------------------
  * Recovery.
  *
@@ -678,7 +745,7 @@ static int recover(struct mirror_set *ms
 	unsigned long flags = 0;
 
 	/* fill in the source */
-	m = ms->default_mirror;
+	m = get_default_mirror(ms);
 	from.bdev = m->dev->bdev;
 	from.sector = m->offset + region_to_sector(reg->rh, reg->key);
 	if (reg->key == (ms->nr_regions - 1)) {
@@ -694,7 +761,7 @@ static int recover(struct mirror_set *ms
 
 	/* fill in the destinations */
 	for (i = 0, dest = to; i < ms->nr_mirrors; i++) {
-		if (&ms->mirror[i] == ms->default_mirror)
+		if (&ms->mirror[i] == get_default_mirror(ms))
 			continue;
 
 		m = ms->mirror + i;
@@ -749,7 +816,7 @@ static void do_recovery(struct mirror_se
 static struct mirror *choose_mirror(struct mirror_set *ms, sector_t sector)
 {
 	/* FIXME: add read balancing */
-	return ms->default_mirror;
+	return get_default_mirror(ms);
 }
 
 /*
@@ -776,7 +843,7 @@ static void do_reads(struct mirror_set *
 		if (rh_in_sync(&ms->rh, region, 1))
 			m = choose_mirror(ms, bio->bi_sector);
 		else
-			m = ms->default_mirror;
+			m = get_default_mirror(ms);
 
 		map_bio(ms, m, bio);
 		generic_make_request(bio);
@@ -793,12 +860,67 @@ static void do_reads(struct mirror_set *
  * RECOVERING:	delay the io until recovery completes
  * NOSYNC:	increment pending, just write to the default mirror
  *---------------------------------------------------------------*/
+
+/* __bio_mark_nosync
+ * @ms
+ * @bio
+ * @done
+ * @error
+ *
+ * The bio was written on some mirror(s) but failed on other mirror(s).
+ * We can successfully endio the bio but should avoid the region being
+ * marked clean by setting the state RH_NOSYNC.
+ *
+ * This function is _not_ safe in interrupt context!
+ */
+static void __bio_mark_nosync(struct mirror_set *ms,
+			      struct bio *bio, unsigned int done, int error)
+{
+	unsigned long flags;
+	struct region_hash *rh = &ms->rh;
+	struct dirty_log *log = ms->rh.log;
+	struct region *reg;
+	region_t region = bio_to_region(rh, bio);
+	int recovering = 0;
+
+	/* We must inform the log that the sync count has changed. */
+	log->type->set_region_sync(log, region, 0);
+	ms->in_sync = 0;
+
+	read_lock(&rh->hash_lock);
+	reg = __rh_find(rh, region);
+	read_unlock(&rh->hash_lock);
+
+	/* The region hash entry should exist because the write was in-flight. */
+	BUG_ON(!reg);
+	BUG_ON(!list_empty(&reg->list));
+
+	spin_lock_irqsave(&rh->region_lock, flags);
+	/*
+	 * Possible cases:
+	 *   1) RH_DIRTY
+	 *   2) RH_NOSYNC: was dirty, other preceding writes failed
+	 *   3) RH_RECOVERING: flushing pending writes
+	 * In any case, the region should not have been connected to the list.
+	 */
+	recovering = (reg->state == RH_RECOVERING);
+	reg->state = RH_NOSYNC;
+	BUG_ON(!list_empty(&reg->list));
+	spin_unlock_irqrestore(&rh->region_lock, flags);
+
+	bio_endio(bio, error);
+	if (recovering)
+		complete_resync_work(reg, 0);
+}
+
 static void write_callback(unsigned long error, void *context)
 {
-	unsigned int i;
-	int uptodate = 1;
+	unsigned i, ret = 0;
 	struct bio *bio = (struct bio *) context;
 	struct mirror_set *ms;
+	int uptodate = 0;
+	int should_wake = 0;
+	unsigned long flags;
 
 	ms = bio_get_ms(bio);
 	bio_set_ms(bio, NULL);
@@ -809,20 +931,36 @@ static void write_callback(unsigned long
 	 * This way we handle both writes to SYNC and NOSYNC
 	 * regions with the same code.
 	 */
+	if (likely(!error))
+		goto out;
 
-	if (error) {
+	for (i = 0; i < ms->nr_mirrors; i++)
+		if (test_bit(i, &error))
+			fail_mirror(ms->mirror + i);
+		else
+			uptodate = 1;
+
+	if (unlikely(!uptodate)) {
+		DMERR("All replicated volumes dead, failing I/O");
+		/* None of the writes succeeded, so fail the I/O. */
+		ret = -EIO;
+	} else if (errors_handled(ms)) {
 		/*
-		 * only error the io if all mirrors failed.
-		 * FIXME: bogus
+		 * We need to raise an event.  Since raising
+		 * events can block, we need to do it in
+		 * the main thread.
 		 */
-		uptodate = 0;
-		for (i = 0; i < ms->nr_mirrors; i++)
-			if (!test_bit(i, &error)) {
-				uptodate = 1;
-				break;
-			}
+		spin_lock_irqsave(&ms->lock, flags);
+		if (!ms->failures.head)
+			should_wake = 1;
+		bio_list_add(&ms->failures, bio);
+		spin_unlock_irqrestore(&ms->lock, flags);
+		if (should_wake)
+			wake(ms);
+		return;
 	}
-	bio_endio(bio, 0);
+out:
+	bio_endio(bio, ret);
 }
 
 static void do_write(struct mirror_set *ms, struct bio *bio)
@@ -910,33 +1048,69 @@ static void do_writes(struct mirror_set
 		rh_delay(&ms->rh, bio);
 
 	while ((bio = bio_list_pop(&nosync))) {
-		map_bio(ms, ms->default_mirror, bio);
+		map_bio(ms, get_default_mirror(ms), bio);
 		generic_make_request(bio);
 	}
 }
 
+static void do_failures(struct mirror_set *ms, struct bio_list *failures)
+{
+	struct bio *bio;
+
+	if (!failures->head)
+		return;
+
+	dm_table_event(ms->ti->table);
+
+	while ((bio = bio_list_pop(failures)))
+		__bio_mark_nosync(ms, bio, bio->bi_size, 0);
+}
+
 /*-----------------------------------------------------------------
  * kmirrord
  *---------------------------------------------------------------*/
-static void do_mirror(struct work_struct *work)
+static int _do_mirror(struct work_struct *work)
 {
 	struct mirror_set *ms = container_of(work, struct mirror_set,
 					     kmirrord_work);
-	struct bio_list reads, writes;
+	struct bio_list reads, writes, failures;
+	unsigned long flags;
 
-	spin_lock(&ms->lock);
+	spin_lock_irqsave(&ms->lock, flags);
 	reads = ms->reads;
 	writes = ms->writes;
+	failures = ms->failures;
 	bio_list_init(&ms->reads);
 	bio_list_init(&ms->writes);
-	spin_unlock(&ms->lock);
+	bio_list_init(&ms->failures);
+	spin_unlock_irqrestore(&ms->lock, flags);
 
 	rh_update_states(&ms->rh);
 	do_recovery(ms);
 	do_reads(ms, &reads);
 	do_writes(ms, &writes);
+	do_failures(ms, &failures);
+
+	return (ms->failures.head) ? 1 : 0;
 }
 
+static void do_mirror(struct work_struct *work)
+{
+	/*
+	 * If _do_mirror returns 1, we give it
+	 * another shot.  This helps in cases like
+	 * 'suspend', where we call flush_workqueue
+	 * and expect all work to be finished.  If
+	 * a failure happens during a suspend, we
+	 * couldn't issue a 'wake' because it would
+	 * not be honored.  Therefore, we return '1'
+	 * from _do_mirror and retry here.
+	 */
+	while (_do_mirror(work))
+		schedule();
+}
+
+
 /*-----------------------------------------------------------------
  * Target functions
  *---------------------------------------------------------------*/
@@ -965,7 +1139,7 @@ static struct mirror_set *alloc_context(
 	ms->nr_mirrors = nr_mirrors;
 	ms->nr_regions = dm_sector_div_up(ti->len, region_size);
 	ms->in_sync = 0;
-	ms->default_mirror = &ms->mirror[DEFAULT_MIRROR];
+	atomic_set(&ms->default_mirror, DEFAULT_MIRROR);
 
 	ms->io_client = dm_io_client_create(DM_IO_PAGES);
 	if (IS_ERR(ms->io_client)) {
@@ -1019,6 +1193,7 @@ static int get_mirror(struct mirror_set
 	}
 
 	ms->mirror[mirror].ms = ms;
+	atomic_set(&(ms->mirror[mirror].error_count), 0);
 	ms->mirror[mirror].offset = offset;
 
 	return 0;
@@ -1220,14 +1395,15 @@ static void mirror_dtr(struct dm_target
 
 static void queue_bio(struct mirror_set *ms, struct bio *bio, int rw)
 {
+	unsigned long flags;
 	int should_wake = 0;
 	struct bio_list *bl;
 
 	bl = (rw == WRITE) ? &ms->writes : &ms->reads;
-	spin_lock(&ms->lock);
+	spin_lock_irqsave(&ms->lock, flags);
 	should_wake = !(bl->head);
 	bio_list_add(bl, bio);
-	spin_unlock(&ms->lock);
+	spin_unlock_irqrestore(&ms->lock, flags);
 
 	if (should_wake)
 		wake(ms);
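
For anyone tracing the new write_callback() logic, the following is a
minimal userspace sketch of its decision tree, not kernel code and not
part of this patch.  The names classify_write, error_bits, mirror_count
and the outcome enum are invented here purely for illustration; only
the branch structure mirrors the patch above.

#include <stdio.h>

enum outcome { IO_SUCCESS, IO_FAIL, IO_DEFER_TO_WORKER };

/*
 * error_bits has bit i set if the write to mirror i failed, matching
 * the 'unsigned long error' bitset that dm-io hands to the callback.
 */
static enum outcome classify_write(unsigned long error_bits,
				   unsigned int mirror_count,
				   int errors_handled)
{
	unsigned int i;
	int uptodate = 0;

	if (!error_bits)
		return IO_SUCCESS;	/* every leg succeeded */

	for (i = 0; i < mirror_count; i++) {
		if (error_bits & (1UL << i))
			printf("mirror %u failed -> fail_mirror()\n", i);
		else
			uptodate = 1;	/* this leg holds good data */
	}

	if (!uptodate)
		return IO_FAIL;		/* all legs failed: -EIO */

	/*
	 * Partial failure: raising a dm event may block, and the write
	 * callback must not, so the bio is queued on ms->failures for
	 * the worker thread (do_failures -> __bio_mark_nosync).  Without
	 * handle_errors, the error is ignored for backwards
	 * compatibility and the write is reported as a success.
	 */
	return errors_handled ? IO_DEFER_TO_WORKER : IO_SUCCESS;
}

int main(void)
{
	/* Two of three legs fail; one survives, so the bio is deferred. */
	enum outcome o = classify_write(0x3UL, 3, 1);

	printf("outcome: %s\n", o == IO_SUCCESS ? "success" :
	       o == IO_FAIL ? "fail" : "defer to worker");
	return 0;
}

The kernel code reaches the same three exits: 'goto out' with ret == 0,
ret = -EIO when no mirror holds the data, or queueing the bio on
ms->failures and waking kmirrord.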