[Cluster-devel] cluster/cmirror-kernel/src dm-cmirror-client.c ...

jbrassow at sourceware.org
Wed Sep 26 03:15:42 UTC 2007


CVSROOT:	/cvs/cluster
Module name:	cluster
Branch: 	RHEL4
Changes by:	jbrassow at sourceware.org	2007-09-26 03:15:41

Modified files:
	cmirror-kernel/src: dm-cmirror-client.c dm-cmirror-common.h 
	                    dm-cmirror-server.c 

Log message:
	Bug 291521: Cluster mirror can become out-of-sync if nominal I/O overl...
	
	Another touch-up for this bug.
	
	Bad news:
	Because a node can cache the state of a region indefinitely (especially for
	blocks that are used a lot - e.g. a journaling area of a file system), we must
	deny writes to any region of the mirror that has not yet been recovered.  This
	is only the case with cluster mirroring.  It means poor - probably very poor -
	performance of nominal I/O during recovery, but it is absolutely necessary for
	mirror reliability.
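	
	Roughly, the new check amounts to the following minimal sketch (the real
	change is in server_is_remote_recovering() in dm-cmirror-server.c, shown in
	the diff below; the helper name here is illustrative only):
	
		/*
		 * Sketch: deny (delay) a nominal write to any region that is
		 * not yet in sync, or that is currently being recovered.
		 */
		static int must_block_write(struct log_c *lc, uint64_t region)
		{
			if (!log_test_bit(lc->sync_bits, region) ||
			    (lc->recovering_region == region))
				return 1;	/* block until the region is recovered */
			return 0;		/* region is in sync; write may proceed */
		}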
	
	Good news:
	The time I spent coding different fixes for this bug wasn't a complete waste.
	I've been able to reuse some of that code to optimize the recovery process.
	Now, rather than walking the mirror from front to back, recovery skips ahead
	to regions that have pending writes.  Bottom line: performance will be bad
	during recovery, but it will be better than it was in RHEL4.5.
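	
	The skip-ahead works roughly like this (a simplified sketch built on the
	recovering_next field added in the diffs below; the function name is
	illustrative, not the actual kernel API):
	
		/*
		 * Sketch: choose the next region to recover.  A region with a
		 * pending write (remembered in lc->recovering_next) is preferred
		 * over the normal front-to-back scan of the sync bitmap.
		 */
		static uint64_t pick_recovery_region(struct log_c *lc)
		{
			if ((lc->recovering_next != (uint64_t)-1) &&
			    !log_test_bit(lc->sync_bits, lc->recovering_next))
				return lc->recovering_next;	/* preempt normal order */
	
			/* otherwise continue the sequential scan */
			return ext2_find_next_zero_bit((unsigned long *) lc->sync_bits,
						       lc->region_count,
						       lc->sync_search);
		}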

Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-client.c.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.52&r2=1.1.2.53
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-common.h.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.14&r2=1.1.2.15
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-server.c.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.38&r2=1.1.2.39

--- cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c	2007/09/21 20:07:37	1.1.2.52
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c	2007/09/26 03:15:40	1.1.2.53
@@ -142,6 +142,7 @@
 	lc->sync_count = (sync == NOSYNC) ? region_count : 0;
 
 	lc->recovering_region = (uint64_t)-1;
+	lc->recovering_next = (uint64_t)-1;
 	lc->sync_search = 0;
 	log->context = lc;
 	return 0;
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-common.h	2007/04/10 07:12:24	1.1.2.14
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-common.h	2007/09/26 03:15:40	1.1.2.15
@@ -98,6 +98,7 @@
 	uint32_t *clean_bits;
 	uint32_t *sync_bits;
 	uint64_t recovering_region;
+	uint64_t recovering_next;
 
 	int sync_pass;          /* number of passes attempting to resync */
 	int sync_search;
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c	2007/09/21 20:07:37	1.1.2.38
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c	2007/09/26 03:15:40	1.1.2.39
@@ -446,18 +446,25 @@
 
 static int server_is_remote_recovering(struct log_c *lc, struct log_request *lr)
 {
-	uint64_t high, low;
+	lr->u.lr_int_rtn = 0;
 
-	high = lc->sync_search + 10;
-	low = (lc->recovering_region != (uint64_t)-1) ?
-		lc->recovering_region :
-		lc->sync_search;
-	if ((lr->u.lr_region >= low) && (lr->u.lr_region <= high)) {
-		DMDEBUG("Remote recovery conflict: %Lu/%s",
-			lr->u.lr_region, lc->uuid + (strlen(lc->uuid) - 8));
+	if ((lc->sync_search > lc->region_count) && !lc->sync_pass)
+		return 0;
+
+	/*
+	 * If the region hasn't been recovered yet,
+	 * we need to block the write
+	 */
+	if (!log_test_bit(lc->sync_bits, lr->u.lr_region) ||
+	    (lc->recovering_region == lr->u.lr_region)) {
 		lr->u.lr_int_rtn = 1;
-	} else
-		lr->u.lr_int_rtn = 0;
+
+		/* Try to make this region a priority */
+		if ((lr->u.lr_region != lc->recovering_region) &&
+		    (lc->recovering_next == (uint64_t)-1))
+			lc->recovering_next = lr->u.lr_region;
+		return 0;
+	}
 
 	return 0;
 }
@@ -542,7 +549,9 @@
 	}
 
 	if (!find_ru_by_region(lc, lr->u.lr_region)) {
-		log_set_bit(lc, lc->clean_bits, lr->u.lr_region);
+		/* Only clear the region if it is also in sync */
+		if (log_test_bit(lc->sync_bits, lr->u.lr_region))
+			log_set_bit(lc, lc->clean_bits, lr->u.lr_region);
 	} else if (check_bug) {
 		DMERR("Multiple marks exist on a region being recovered: %Lu/%s",
 		      lr->u.lr_region, lc->uuid + (strlen(lc->uuid) - 8));
@@ -608,26 +617,45 @@
 		}
 	}
 
-	*region = ext2_find_next_zero_bit((unsigned long *) lc->sync_bits,
-					  lc->region_count,
-					  lc->sync_search);
-	if (find_ru_by_region(lc, *region)) {
-		DMDEBUG("Recovery blocked by outstanding write on region %Lu/%s",
-			*region, lc->uuid + (strlen(lc->uuid) - 8));
-		return 0;
-	}
+	DMDEBUG("Priority recovery region: %Lu/%s",
+		lc->recovering_next, lc->uuid + (strlen(lc->uuid) - 8));
 
-	if (*region >= lc->region_count)
-		return 0;
+	if ((lc->recovering_next != (uint64_t)-1) &&
+	    (!log_test_bit(lc->sync_bits, lc->recovering_next))) {
+		new = mempool_alloc(region_user_pool, GFP_NOFS);
+		if (!new)
+			return -ENOMEM;
+		*region = lc->recovering_region = lc->recovering_next;
+		DMDEBUG("Preempting normal recovery work for preferred region...");
+	} else {
+		*region = ext2_find_next_zero_bit((unsigned long *) lc->sync_bits,
+						  lc->region_count,
+						  lc->sync_search);
+		if (find_ru_by_region(lc, *region)) {
+			/*
+			 * We disallow writes to regions that have not yet been
+			 * recovered via is_remote_recovering(), so this should
+			 * not happen.
+			 */
+			DMERR("Recovery blocked by outstanding write on region %Lu/%s",
+			      *region, lc->uuid + (strlen(lc->uuid) - 8));
+			BUG();
+			return 0;
+		}
 
-	new = mempool_alloc(region_user_pool, GFP_NOFS);
-	if (!new)
-		return -ENOMEM;
+		if (*region >= lc->region_count)
+			return 0;
 
-	lc->sync_search = *region + 1;
+		new = mempool_alloc(region_user_pool, GFP_NOFS);
+		if (!new)
+			return -ENOMEM;
 
-	lc->recovering_region = *region;
+		lc->sync_search = *region + 1;
+
+		lc->recovering_region = *region;
+	}
 
+	lc->recovering_next = (uint64_t)-1;
 	lr->u.lr_int_rtn = 1; /* Assigning work */
 	new->ru_nodeid = who;
 	new->ru_region = *region;

