[dm-devel] [PATCHES 7+8] patches for dm-raid1-bio-holding

Mon Nov 30 23:14:29 UTC 2009

Hi

There is an error in do_failures, in the loop
while ((bio = bio_list_pop(failures))) --- there exists a path where the 
bio is not processed at all and the pointer to the bio is lost --- you 
forgot the last "else hold_bio". It deadlocked in my testing.

I'm resending these two patches; please correct it according to them.

Mikulas
-------------- next part --------------
Hold all write bios when errors are handled.

The patch changes these things:
- previously, the failures list was used only when handling errors with
  dmeventd. Now, it is used for all bios. Even when not using dmeventd,
  the regions where some writes failed must be marked as nosync. This can only
  be done in process context (i.e. in raid1 workqueue), not in
  the write_callback function.
- previously, the write would succeed if writing to at least one leg succeeded.
  This is wrong because data from the failed leg may be replicated to the
  correct leg.
  Now, if using dmeventd, a write with some failures will be held until
  dmeventd does its job and reconfigures the array.
  If not using dmeventd, the write still succeeds if at least one leg succeeds;
  this is wrong, but it is consistent with current behavior.

Signed-off-by: Mikulas Patocka <mpatocka at redhat.com>

---
 drivers/md/dm-raid1.c |   62 ++++++++++++++++++++++++++------------------------
 1 file changed, 33 insertions(+), 29 deletions(-)

Index: linux-2.6.32-rc8-devel/drivers/md/dm-raid1.c
===================================================================

--- linux-2.6.32-rc8-devel.orig/drivers/md/dm-raid1.c	2009-11-26 00:55:21.000000000 +0100
+++ linux-2.6.32-rc8-devel/drivers/md/dm-raid1.c	2009-11-26 00:59:58.000000000 +0100
@@ -543,7 +543,6 @@ static void write_callback(unsigned long
 	unsigned i, ret = 0;
 	struct bio *bio = (struct bio *) context;
 	struct mirror_set *ms;
-	int uptodate = 0;
 	unsigned long flags;
 
 	ms = bio_get_m(bio)->ms;
@@ -555,33 +554,23 @@ static void write_callback(unsigned long
 	 * This way we handle both writes to SYNC and NOSYNC
 	 * regions with the same code.
 	 */
-	if (likely(!error))
-		goto out;
+	if (likely(!error)) {
+		bio_endio(bio, ret);
+		return;
+	}
 
 	for (i = 0; i < ms->nr_mirrors; i++)
 		if (test_bit(i, &error))
 			fail_mirror(ms->mirror + i, DM_RAID1_WRITE_ERROR);
-		else
-			uptodate = 1;
 
-	if (unlikely(!uptodate)) {
-		DMERR("All replicated volumes dead, failing I/O");
-		/* None of the writes succeeded, fail the I/O. */
-		ret = -EIO;
-	} else if (errors_handled(ms)) {
-		/*
-		 * Need to raise event.  Since raising
-		 * events can block, we need to do it in
-		 * the main thread.
-		 */
-		spin_lock_irqsave(&ms->lock, flags);
-		bio_list_add(&ms->failures, bio);
-		spin_unlock_irqrestore(&ms->lock, flags);
-		wakeup_mirrord(ms);
-		return;
-	}
-out:
-	bio_endio(bio, ret);
+	/*
+	 * In either case we must mark the region as NOSYNC.
+	 * That would block, so do it in the thread.
+	 */
+	spin_lock_irqsave(&ms->lock, flags);
+	bio_list_add(&ms->failures, bio);
+	spin_unlock_irqrestore(&ms->lock, flags);
+	wakeup_mirrord(ms);
 }
 
 static void do_write(struct mirror_set *ms, struct bio *bio)
@@ -736,13 +725,28 @@ static void do_failures(struct mirror_se
 	 */
 
 	while ((bio = bio_list_pop(failures))) {
-		if (ms->log_failure)
-			hold_bio(ms, bio);
-		else {
-			ms->in_sync = 0;
-			dm_rh_mark_nosync(ms->rh, bio);
-			bio_endio(bio, 0);
+  		if (!ms->log_failure) {
+  			ms->in_sync = 0;
+  			dm_rh_mark_nosync(ms->rh, bio);
 		}
+		/*
+		 * If all the legs are dead, fail the I/O.
+		 *
+		 * If we are not using dmeventd, we pretend that the I/O
+		 * succeeded. This is wrong (the failed leg might come online
+		 * again after reboot and it would be replicated back to
+		 * the good leg) but it is consistent with current behavior.
+		 * For proper behavior, dm-raid1 shouldn't be used without
+		 * dmeventd at all.
+		 *
+		 * If we use dmeventd, hold the bio until dmeventd does its job.
+		 */
+		if (!get_valid_mirror(ms))
+			bio_endio(bio, -EIO);
+		else if (!errors_handled(ms))
+  			bio_endio(bio, 0);
+		else
+  			hold_bio(ms, bio);
 	}
 }
 
-------------- next part --------------
Hold all write bios when leg fails and errors are handled

When using dmeventd to handle errors, once a leg fails all write bios must be
held until dmeventd does its job. This patch prevents the following race:
* primary leg fails
* write "1" fails, the write is held, the secondary leg is set as default
* write "2" goes straight to the secondary leg
*** crash *** (before dmeventd does its job)
* after a reboot, primary leg goes back online, it is resynchronized to
  the secondary leg, write "2" is reverted although it already completed

Signed-off-by: Mikulas Patocka <mpatocka at redhat.com>

---
 drivers/md/dm-raid1.c |   12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

Index: linux-2.6.32-rc8-devel/drivers/md/dm-raid1.c
===================================================================
--- linux-2.6.32-rc8-devel.orig/drivers/md/dm-raid1.c	2009-11-26 17:26:26.000000000 +0100
+++ linux-2.6.32-rc8-devel/drivers/md/dm-raid1.c	2009-11-26 18:30:48.000000000 +0100
@@ -68,6 +68,7 @@ struct mirror_set {
 	region_t nr_regions;
 	int in_sync;
 	int log_failure;
+	int leg_failure;
 	atomic_t suspend;
 
 	atomic_t default_mirror;	/* Default mirror */
@@ -210,6 +211,8 @@ static void fail_mirror(struct mirror *m
 	struct mirror_set *ms = m->ms;
 	struct mirror *new;
 
+	ms->leg_failure = 1;
+
 	/*
 	 * error_count is used for nothing more than a
 	 * simple way to tell if a device has encountered
@@ -694,8 +697,12 @@ static void do_writes(struct mirror_set 
 		dm_rh_delay(ms->rh, bio);
 
 	while ((bio = bio_list_pop(&nosync))) {
-		map_bio(get_default_mirror(ms), bio);
-		generic_make_request(bio);
+		if (unlikely(ms->leg_failure) && errors_handled(ms))
+			hold_bio(ms, bio);
+		else {
+			map_bio(get_default_mirror(ms), bio);
+			generic_make_request(bio);
+		}
 	}
 }
 
@@ -816,6 +823,7 @@ static struct mirror_set *alloc_context(
 	ms->nr_regions = dm_sector_div_up(ti->len, region_size);
 	ms->in_sync = 0;
 	ms->log_failure = 0;
+	ms->leg_failure = 0;
 	atomic_set(&ms->suspend, 0);
 	atomic_set(&ms->default_mirror, DEFAULT_MIRROR);