[dm-devel] [2.6.22-rc1-mm1 PATCH 9/10] dm-raid1-handle-read-failures.patch

Jonathan Brassow jbrassow at redhat.com
Mon May 21 22:08:59 UTC 2007


 brassow

This patch gives the ability to respond-to/record device failures
that happen during read operations.  It also adds the ability to
read from mirror devices that are not the primary if they are
in-sync.

There are essentially two read paths in mirroring; the direct path
and the queued path.  When a read request is mapped, if the region
is 'in-sync' the direct path is taken; otherwise the queued path
is taken.

If the direct path is taken, we recored bio information so that if
the read fails we can retry it.  We discover the status of a direct
read through mirror_end_io.  If the read has failed, we mark the
device from which the read was attempted as failed (so we don't try
to read from it again), restore the bio and try again.

If the queued path is taken, we discover the results of the read
from 'read_callback'.  If the device failed, we mark the device
as failed and we attempt the read again if there is another device
where this region is known to be 'in-sync'.

Index: linux-2.6.22-rc1-mm1/drivers/md/dm-raid1.c
===================================================================
--- linux-2.6.22-rc1-mm1.orig/drivers/md/dm-raid1.c
+++ linux-2.6.22-rc1-mm1/drivers/md/dm-raid1.c
@@ -6,6 +6,7 @@
 
 #include "dm.h"
 #include "dm-bio-list.h"
+#include "dm-bio-record.h"
 #include "dm-io.h"
 #include "dm-log.h"
 #include "kcopyd.h"
@@ -639,24 +640,39 @@ static void rh_start_recovery(struct reg
 	wake(rh->ms);
 }
 
+struct bio_map_info {
+	struct mirror *bmi_m;
+	struct dm_bio_details bmi_bd;
+};
+
+static mempool_t *bio_map_info_pool = NULL;
+
+static void *bio_map_info_alloc(unsigned int gfp_mask, void *pool_data){
+	return kmalloc(sizeof(struct bio_map_info), gfp_mask);
+}
+
+static void bio_map_info_free(void *element, void *pool_data){
+	kfree(element);
+}
+
 /*
  * Every mirror should look like this one.
  */
 #define DEFAULT_MIRROR 0
 
 /*
- * This is yucky.  We squirrel the mirror_set struct away inside
- * bi_next for write buffers.  This is safe since the bh
+ * This is yucky.  We squirrel the mirror struct away inside
+ * bi_next for read/write buffers.  This is safe since the bh
  * doesn't get submitted to the lower levels of block layer.
  */
-static struct mirror_set *bio_get_ms(struct bio *bio)
+static struct mirror *bio_get_m(struct bio *bio)
 {
-	return (struct mirror_set *) bio->bi_next;
+	return (struct mirror *) bio->bi_next;
 }
 
-static void bio_set_ms(struct bio *bio, struct mirror_set *ms)
+static void bio_set_m(struct bio *bio, struct mirror *m)
 {
-	bio->bi_next = (struct bio *) ms;
+	bio->bi_next = (struct bio *) m;
 }
 
 /*-----------------------------------------------------------------
@@ -776,13 +792,19 @@ static void do_recovery(struct mirror_se
 	}
 }
 
-/*-----------------------------------------------------------------
- * Reads
- *---------------------------------------------------------------*/
 static struct mirror *choose_mirror(struct mirror_set *ms, sector_t sector)
 {
-	/* FIXME: add read balancing */
-	return ms->default_mirror;
+	struct mirror *m = ms->default_mirror;
+
+	do {
+		if (likely(!atomic_read(&m->error_count)))
+		    return m;
+
+		if (m-- == ms->mirror)
+			m += ms->nr_mirrors;
+	} while (m != ms->default_mirror);
+
+	return NULL;
 }
 
 /* fail_mirror
@@ -834,13 +856,84 @@ static void fail_mirror(struct mirror *m
 		DMWARN("All sides of mirror have failed.");
 }
 
+static int default_ok(struct mirror *m)
+{
+	return !atomic_read(&m->ms->default_mirror->error_count);
+}
+
+static int mirror_available(struct mirror_set *ms, struct bio *bio)
+{
+	region_t region = bio_to_region(&ms->rh, bio);
+
+	if (ms->rh.log->type->in_sync(ms->rh.log, region, 0) > 0)
+		return choose_mirror(ms,  bio->bi_sector) ? 1 : 0;
+
+	return 0;
+}
+
 /*
  * remap a buffer to a particular mirror.
  */
-static void map_bio(struct mirror_set *ms, struct mirror *m, struct bio *bio)
+static sector_t map_sector(struct mirror *m, struct bio *bio)
+{
+	return m->offset + (bio->bi_sector - m->ms->ti->begin);
+}
+
+static void map_bio(struct mirror *m, struct bio *bio)
 {
 	bio->bi_bdev = m->dev->bdev;
-	bio->bi_sector = m->offset + (bio->bi_sector - ms->ti->begin);
+	bio->bi_sector = map_sector(m, bio);
+}
+
+static void map_region(struct io_region *io, struct mirror *m,
+		       struct bio *bio)
+{
+	io->bdev = m->dev->bdev;
+	io->sector = map_sector(m, bio);
+	io->count = bio->bi_size >> 9;
+}
+
+/*-----------------------------------------------------------------
+ * Reads
+ *---------------------------------------------------------------*/
+static void read_callback(unsigned long error, void *context)
+{
+	struct bio *bio = (struct bio *)context;
+	struct mirror *m;
+
+	m = bio_get_m(bio);
+	bio_set_m(bio, NULL);
+
+	if (unlikely(error)) {
+		DMWARN("A read failure occurred on a mirror device.");
+		fail_mirror(m);
+		if (likely(default_ok(m)) || mirror_available(m->ms, bio)) {
+			DMWARN("Trying different device.");
+			queue_bio(m->ms, bio, bio_rw(bio));
+		} else {
+			DMERR("No other device available, failing I/O.");
+			bio_endio(bio, bio->bi_size, -EIO);
+		}
+	} else
+		bio_endio(bio, bio->bi_size, 0);
+}
+
+/* Asynchronous read. */
+static void read_async_bio(struct mirror *m, struct bio *bio)
+{
+	struct io_region io;
+	struct dm_io_request io_req = {
+		.bi_rw = READ,
+		.mem.type = DM_IO_BVEC,
+		.mem.ptr.bvec = bio->bi_io_vec + bio->bi_idx,
+		.notify.fn = read_callback,
+		.notify.context = bio,
+		.client = m->ms->io_client,
+	};
+
+	map_region(&io, m, bio);
+	bio_set_m(bio, m);
+	(void) dm_io(&io_req, m->ms->nr_mirrors, &io, NULL);
 }
 
 static void do_reads(struct mirror_set *ms, struct bio_list *reads)
@@ -855,13 +948,20 @@ static void do_reads(struct mirror_set *
 		/*
 		 * We can only read balance if the region is in sync.
 		 */
-		if (rh_in_sync(&ms->rh, region, 1))
+		if (likely(rh_in_sync(&ms->rh, region, 1)))
 			m = choose_mirror(ms, bio->bi_sector);
-		else
+		else {
 			m = ms->default_mirror;
 
-		map_bio(ms, m, bio);
-		generic_make_request(bio);
+			/* If default has failed, we give up. */
+			if (unlikely(m && atomic_read(&m->error_count)))
+				m = NULL;
+		}
+
+		if (likely(m))
+			read_async_bio(m, bio);
+		else
+			bio_endio(bio, bio->bi_size, -EIO);
 	}
 }
 
@@ -936,8 +1036,8 @@ static void write_callback(unsigned long
 	int uptodate = 0;
 	int should_wake = 0;
 
-	ms = bio_get_ms(bio);
-	bio_set_ms(bio, NULL);
+	ms = (bio_get_m(bio))->ms;
+	bio_set_m(bio, NULL);
 
 	/*
 	 * NOTE: We don't decrement the pending count here,
@@ -981,7 +1081,7 @@ static void write_callback(unsigned long
 static void do_write(struct mirror_set *ms, struct bio *bio)
 {
 	unsigned int i;
-	struct io_region io[KCOPYD_MAX_REGIONS+1];
+	struct io_region io[ms->nr_mirrors], *dest = io;
 	struct mirror *m;
 	struct dm_io_request io_req = {
 		.bi_rw = WRITE,
@@ -992,15 +1092,15 @@ static void do_write(struct mirror_set *
 		.client = ms->io_client,
 	};
 
-	for (i = 0; i < ms->nr_mirrors; i++) {
-		m = ms->mirror + i;
-
-		io[i].bdev = m->dev->bdev;
-		io[i].sector = m->offset + (bio->bi_sector - ms->ti->begin);
-		io[i].count = bio->bi_size >> 9;
-	}
+	for (i = 0, m = ms->mirror; i < ms->nr_mirrors; i++, m++)
+		map_region(dest++, m, bio);
 
-	bio_set_ms(bio, ms);
+	/*
+	 * We can use the default mirror here, because we
+	 * only need it in order to retrieve the reference
+	 * to the mirror set in write_callback().
+	 */
+	bio_set_m(bio, ms->default_mirror);
 
 	(void) dm_io(&io_req, ms->nr_mirrors, io, NULL);
 }
@@ -1065,7 +1165,7 @@ static void do_writes(struct mirror_set 
 		rh_delay(&ms->rh, bio);
 
 	while ((bio = bio_list_pop(&nosync))) {
-		map_bio(ms, ms->default_mirror, bio);
+		map_bio(ms->default_mirror, bio);
 		generic_make_request(bio);
 	}
 }
@@ -1449,42 +1549,66 @@ static int mirror_map(struct dm_target *
 	int r, rw = bio_rw(bio);
 	struct mirror *m;
 	struct mirror_set *ms = ti->private;
-
-	map_context->ll = bio_to_region(&ms->rh, bio);
+	struct bio_map_info *bmi = NULL;
+	struct dm_bio_details *bd = NULL;
 
 	if (rw == WRITE) {
+		/* Save region for mirror_end_io() handler */
+		map_context->ll = bio_to_region(&ms->rh, bio);
 		queue_bio(ms, bio, rw);
 		return DM_MAPIO_SUBMITTED;
 	}
 
+	/* All about the reads now */
+
 	r = ms->rh.log->type->in_sync(ms->rh.log,
 				      bio_to_region(&ms->rh, bio), 0);
 	if (r < 0 && r != -EWOULDBLOCK)
 		return r;
 
-	if (r == -EWOULDBLOCK)	/* FIXME: ugly */
+	if (r == -EWOULDBLOCK)
 		r = DM_MAPIO_SUBMITTED;
 
-	/*
-	 * We don't want to fast track a recovery just for a read
-	 * ahead.  So we just let it silently fail.
-	 * FIXME: get rid of this.
-	 */
-	if (!r && rw == READA)
-		return -EIO;
+	if (likely(r)) {
+		/*
+		 * Optimize reads by avoiding to hand them to daemon.
+		 *
+		 * In case they fail, queue them for another shot
+		 * in the mirror_end_io() function.
+		 */
+		m = choose_mirror(ms, bio->bi_sector);
+		if (likely(m)) {
+			bmi = mempool_alloc(bio_map_info_pool, GFP_NOIO);
+
+			if (likely(bmi)) {
+				/* without this, a read is not retryable */
+				bd = &bmi->bmi_bd;
+				dm_bio_record(bd, bio);
+				map_context->ptr = bmi;
+				bmi->bmi_m = m;
+			} else {
+				/*
+				 * we could fail now, but we can at least
+				 * give it a shot.  The bd is only used to
+				 * retry in the event of a failure anyway.
+				 * If we fail, we can fail the I/O then.
+				 */
+				map_context->ptr = NULL;
+			}
+
+			map_bio(m, bio);
+			return DM_MAPIO_REMAPPED; /* Mapped -> queue request. */
+		} else
+			return -EIO;
+	} else {
+		/* Either not clean, or -EWOULDBLOCK */
+		if (rw == READA)
+			return -EWOULDBLOCK;
 
-	if (!r) {
-		/* Pass this io over to the daemon */
 		queue_bio(ms, bio, rw);
-		return DM_MAPIO_SUBMITTED;
 	}
 
-	m = choose_mirror(ms, bio->bi_sector);
-	if (!m)
-		return -EIO;
-
-	map_bio(ms, m, bio);
-	return DM_MAPIO_REMAPPED;
+	return DM_MAPIO_SUBMITTED;
 }
 
 static int mirror_end_io(struct dm_target *ti, struct bio *bio,
@@ -1492,15 +1616,61 @@ static int mirror_end_io(struct dm_targe
 {
 	int rw = bio_rw(bio);
 	struct mirror_set *ms = (struct mirror_set *) ti->private;
-	region_t region = map_context->ll;
+	struct mirror *m = NULL;
+	struct dm_bio_details *bd = NULL;
 
 	/*
 	 * We need to dec pending if this was a write.
 	 */
-	if (rw == WRITE)
-		rh_dec(&ms->rh, region);
+	if (rw == WRITE) {
+		rh_dec(&ms->rh, map_context->ll);
+		return error;
+	}
 
-	return 0;
+	if (error == -EOPNOTSUPP)
+		goto out;
+
+	if ((error == -EWOULDBLOCK) && bio_rw_ahead(bio))
+		goto out;
+
+	if (unlikely(error)) {
+		DMERR("A read failure occurred on a mirror device.");
+		if (!map_context->ptr) {
+			/*
+			 * There wasn't enough memory to record necessary
+			 * information for a retry or there was no other
+			 * mirror in-sync.
+			 */
+			DMERR("Unable to retry read.");
+			return -EIO;
+		}
+		m = ((struct bio_map_info *)map_context->ptr)->bmi_m;
+		fail_mirror(m); /* Flag error on mirror. */
+
+		/*
+		 * A failed read needs to get queued
+		 * to the daemon for another shot to
+		 * one (if any) intact mirrors.
+		 */
+		if (default_ok(m) || mirror_available(ms, bio)) {
+			bd = &(((struct bio_map_info *)map_context->ptr)->bmi_bd
+				);
+
+			DMWARN("Trying different device.");
+			dm_bio_restore(bd, bio);
+			mempool_free(map_context->ptr, bio_map_info_pool);
+			map_context->ptr = NULL;
+			queue_bio(ms, bio, rw);
+			return 1; /* We want another shot on the bio. */
+		}
+		DMERR("All replicated volumes dead, failing I/O");
+	}
+
+out:
+	if (map_context->ptr)
+		mempool_free(map_context->ptr, bio_map_info_pool);
+
+	return error;
 }
 
 static void mirror_presuspend(struct dm_target *ti)
@@ -1613,6 +1783,11 @@ static int __init dm_mirror_init(void)
 {
 	int r;
 
+	bio_map_info_pool = mempool_create(100, bio_map_info_alloc,
+					   bio_map_info_free, NULL);
+	if (!bio_map_info_pool)
+		return -ENOMEM;
+
 	r = dm_dirty_log_init();
 	if (r)
 		return r;





More information about the dm-devel mailing list