[Cluster-devel] cluster/cmirror-kernel/src dm-cmirror-client.c ...

jbrassow at sourceware.org
Sat Jul 22 22:19:05 UTC 2006


CVSROOT:	/cvs/cluster
Module name:	cluster
Branch: 	STABLE
Changes by:	jbrassow at sourceware.org	2006-07-22 22:19:04

Modified files:
	cmirror-kernel/src: dm-cmirror-client.c dm-cmirror-common.h 
	                    dm-cmirror-server.c 

Log message:
	Fix for bug:
	199862 - Suspending cluster mirrors can cause indefinite hang
	And is part of a fix for:
	199185 - 'lvconvert' fails to remove device-mapper devices ...
	198555 - mirror log not getting cleared causes new mirror ...
	And is likely to fix:
	199334 - cmirror removal attempt hangs and caused locking ...
	And will certainly help for:
	199498
	198821
	194137
	194125
	199635
	
	All of the above bugs will need to be reexamined when the packages
	are rebuilt.
	
	This fix allows the log server to migrate to other nodes during
	suspension.  This prevents the situation where the log server may
	have its devices suspended when it receives a request.  Trying to
	fulfill a log request while devices are suspended will lead to an
	indefinite hang, because I/O will not complete until the devices
	are unsuspended.
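	
	Reduced to a standalone sketch, the handover works like this
	(illustrative C only, not the driver code; the node array and the
	pick_coordinator() helper are made up for the example, while the
	0xDEAD sentinel and the "suspended nodes never volunteer" rule
	come from the patch below):

/*
 * Minimal sketch of the handover (illustrative only).  A node that is
 * suspending gives up the log-server role and seeds the election with
 * the 0xDEAD ("no server") sentinel instead of its own id, and nodes
 * whose devices are suspended never volunteer, so the role lands on a
 * node that can still complete log I/O.
 */
#include <stdint.h>
#include <stdio.h>

#define NO_SERVER 0xDEAD	/* sentinel: no node currently owns the log */

struct node {
	uint32_t id;
	int suspended;		/* devices suspended; cannot serve requests */
};

/*
 * Toy stand-in for the LRT_ELECTION pass: every eligible node with a
 * lower id than the current candidate volunteers itself; suspended
 * nodes stay out of the running.
 */
static uint32_t pick_coordinator(struct node *nodes, int n, uint32_t initial)
{
	uint32_t coordinator = initial;
	int i;

	for (i = 0; i < n; i++) {
		if (nodes[i].suspended)
			continue;
		if (nodes[i].id < coordinator)
			coordinator = nodes[i].id;
	}
	return coordinator;
}

int main(void)
{
	struct node cluster[] = {
		{ .id = 1, .suspended = 1 },	/* current server, now suspending */
		{ .id = 2, .suspended = 0 },
		{ .id = 3, .suspended = 0 },
	};

	/*
	 * The suspending server seeds the election with NO_SERVER rather
	 * than its own id (run_election(lc, 0xDEAD) in the patch below),
	 * so it cannot simply re-elect itself.
	 */
	uint32_t new_server = pick_coordinator(cluster, 3, NO_SERVER);

	printf("log server migrated to node %u\n", (unsigned) new_server);
	return 0;
}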

Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-client.c.diff?cvsroot=cluster&only_with_tag=STABLE&r1=1.1.4.4&r2=1.1.4.5
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-common.h.diff?cvsroot=cluster&only_with_tag=STABLE&r1=1.1.4.3&r2=1.1.4.4
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-server.c.diff?cvsroot=cluster&only_with_tag=STABLE&r1=1.1.4.4&r2=1.1.4.5

--- cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c	2006/07/07 17:09:54	1.1.4.4
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c	2006/07/22 22:19:04	1.1.4.5
@@ -243,7 +243,7 @@
 
 
 
-static int run_election(struct log_c *lc){
+static int run_election(struct log_c *lc, uint32_t initial_server){
 	int error=0, len;
 	struct sockaddr_in saddr_in;
 	struct msghdr msg;
@@ -255,7 +255,7 @@
 
 	lr.lr_type = LRT_ELECTION;
 	lr.u.lr_starter = my_id;
-	lr.u.lr_coordinator = my_id;
+	lr.u.lr_coordinator = initial_server;
 	memcpy(lr.lr_uuid, lc->uuid, MAX_NAME_LEN);
 
 	memset(&saddr_in, 0, sizeof(struct sockaddr_cl));
@@ -420,7 +420,6 @@
 
 	if(len <= 0){
 		/* ATTENTION -- what do we do with this ? */
-//		DMWARN("Failed to recvmsg from clustered log server");
 		error = len;
 		*retry = 1;
 		goto fail;
@@ -435,16 +434,11 @@
 	}
 
 	if(lr->u.lr_int_rtn == -ENXIO){
-		DMWARN("server tells us it no longer controls the log");
 		lc->server_id = 0xDEAD;
 		*retry = 1;
 		goto fail;
 	}
 
-	if(lr->u.lr_int_rtn < 0){
-		DMWARN("an error occured on the server while processing our request");
-	}
-
 	if(result)
 		*result = lr->u.lr_region_rtn;
 
@@ -463,8 +457,9 @@
 	}
 
 	if(lr) kfree(lr);
-#ifdef DEBUG
-	DMWARN("Request (%s) to server failed :: %d",
+#if 0
+	DMINFO("My (%u) request (%s) to server (%u) failed :: %d",
+	       my_id,
 	       (type == LRT_IS_CLEAN)? "LRT_IS_CLEAN":
 	       (type == LRT_IN_SYNC)? "LRT_IN_SYNC":
 	       (type == LRT_MARK_REGION)? "LRT_MARK_REGION":
@@ -475,7 +470,7 @@
 	       (type == LRT_MASTER_LEAVING)? "LRT_MASTER_LEAVING":
 	       (type == LRT_ELECTION)? "LRT_ELECTION":
 	       (type == LRT_SELECTION)? "LRT_SELECTION": "UNKNOWN",
-	       error);
+	       lc->server_id, error);
 #endif
 	return error;
 }
@@ -495,9 +490,13 @@
 	do{
 		retry = 0;
 		suspend_on(&suspend_client_queue, atomic_read(&suspend_client));
+		if ((type == LRT_MASTER_LEAVING) && (lc->server_id == 0xDEAD)) {
+			/* Nothing to do */
+			goto out;
+		}
 	election:
 		while(lc->server_id == 0xDEAD){
-			run_election(lc);
+			run_election(lc, my_id);
 			new_server = 1;
 		}
 
@@ -539,7 +538,7 @@
 				spin_unlock(&region_state_lock);
 				goto out;
 			} else {
-				DMWARN("Continuing request:: %s", 
+				DMINFO("Continuing request:: %s",
 				      (type == LRT_IS_CLEAN)? "LRT_IS_CLEAN":
 				      (type == LRT_IN_SYNC)? "LRT_IN_SYNC":
 				      (type == LRT_MARK_REGION)? "LRT_MARK_REGION":
@@ -737,14 +736,14 @@
 	struct log_c *lc = (struct log_c *) log->context;
 
 	if (!list_empty(&clear_region_list))
-		DMERR("LEAVING WHILE REGION REQUESTS REMAIN.");
+		DMINFO("Leaving while clear region requests remain.");
 
 	list_del_init(&lc->log_list);
 	if(lc->server_id == my_id)
 		consult_server(lc, 0, LRT_MASTER_LEAVING, NULL);
 	sock_release(lc->client_sock);
 
-	if (lc->log_dev) 
+	if (lc->log_dev)
 		disk_dtr(log);
 	else
 		core_dtr(log);
@@ -755,15 +754,13 @@
 
 static int cluster_presuspend(struct dirty_log *log)
 {
-	struct log_c *lc = (struct log_c *) log->context;
+	return 0;
+}
 
-	/*
-	atomic_set(&lc->suspended, 1);
+static int cluster_postsuspend(struct dirty_log *log)
+{
+	struct log_c *lc = (struct log_c *) log->context;
 
-	if (lc->log_dev && lc->log_dev_failed)
-		complete(&lc->failure_completion);
-	else
-	*/
 	while (1) {
 		spin_lock(&region_state_lock);
 		if (list_empty(&clear_region_list)) {
@@ -775,20 +772,31 @@
 		/* Just an unnecessary call to clear out regions */
 		consult_server(lc, 0, LRT_IN_SYNC, NULL);
 	}
+	atomic_set(&lc->suspended, 1);
+	if(lc->server_id == my_id) {
+		while (1) {
+			consult_server(lc, 0, LRT_MASTER_LEAVING, NULL);
+			down(&consult_server_lock);
+			run_election(lc, 0xDEAD);
+			up(&consult_server_lock);
+			if (lc->server_id == my_id) {
+				set_current_state(TASK_INTERRUPTIBLE);
+				schedule_timeout(HZ/4);
+			} else {
+				break;
+			}
+		}
+	}
 
 	return 0;
 }
 
-static int cluster_postsuspend(struct dirty_log *log){
-	return 0;
-}
-
 static int cluster_resume(struct dirty_log *log){
 	struct log_c *lc = (struct log_c *) log->context;
 
 	lc->sync_search = 0;
 	resume_server_requests();
-	/* atomic_set(&lc->suspended, 0); */
+	atomic_set(&lc->suspended, 0);
 
 	return 0;
 }
@@ -1330,7 +1338,7 @@
 	.get_default_mirror = cluster_get_default_mirror,
 };
 
-#define CMIRROR_RELEASE_NAME "0.1.0"
+#define CMIRROR_RELEASE_NAME "0.2.0"
 static int __init cluster_dirty_log_init(void)
 {
 	int r = 0;
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-common.h	2006/06/29 19:49:32	1.1.4.3
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-common.h	2006/07/22 22:19:04	1.1.4.4
@@ -13,10 +13,12 @@
 	sector_t sector;
 	sector_t count;
 };
+int dm_io_get(unsigned int num_pages);
+void dm_io_put(unsigned int num_pages);
 int dm_io_sync_vm(unsigned int num_regions, struct io_region *where, int rw,
                   void *data, unsigned long *error_bits);
 /* from dm.h */
-#define DM_NAME "device-mapper"
+#define DM_NAME "dm-cmirror"
 #define DMWARN(f, x...) printk(KERN_WARNING DM_NAME ": " f "\n" , ## x)
 #define DMERR(f, x...) printk(KERN_ERR DM_NAME ": " f "\n" , ## x)
 #define DMINFO(f, x...) printk(KERN_INFO DM_NAME ": " f "\n" , ## x)
@@ -111,8 +113,8 @@
 	 * Disk log fields
 	 */
 	int log_dev_failed;
-	/*
 	atomic_t suspended;
+	/*
 	struct completion failure_completion;
 	*/
 	struct dm_dev *log_dev;
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c	2006/07/19 14:40:15	1.1.4.4
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c	2006/07/22 22:19:04	1.1.4.5
@@ -199,7 +199,7 @@
 		return 0;
 
 	return dm_io_sync_vm(1, &log->bits_location, WRITE,
-			      log->clean_bits, &ebits);
+			     log->clean_bits, &ebits);
 }
 
 static int count_bits32(uint32_t *addr, unsigned size)
@@ -676,23 +676,41 @@
 		return -1;
 	}
 
-	
-	if((lr->lr_type == LRT_MASTER_LEAVING) && 
-	   (lr->u.lr_starter == my_id) &&
-	   lr->u.lr_node_count){
-		lr->u.lr_coordinator = 0xDEAD;
-		if(!(saddr->sin_addr.s_addr = nodeid_to_ipaddr(lr->u.lr_starter))){
-			return -1;
+	if(lr->lr_type == LRT_MASTER_LEAVING) {
+		/*
+		 * if we started this and (lr->u.lr_node_count != 0),
+		 * then we have told everyone that we are leaving
+		 */
+		if ((lr->u.lr_starter == my_id) && lr->u.lr_node_count){
+			lr->u.lr_coordinator = 0xDEAD;
+			if(!(saddr->sin_addr.s_addr = nodeid_to_ipaddr(lr->u.lr_starter))){
+				return -1;
+			}
+			saddr->sin_port = lr->u.lr_starter_port;
+			return 0;
 		}
-		saddr->sin_port = lr->u.lr_starter_port;
-		return 0;
 	}
-	
+
+	/*
+	 * Check if we have access to the log.  We may not
+	 * have loaded this device.
+	 */
 	if(!lc){
 		lr->u.lr_node_count++;
 		return 0;
 	}
-	
+
+	if(lr->lr_type == LRT_MASTER_LEAVING){
+		lc->server_id = 0xDEAD;
+		lr->u.lr_node_count++;
+		return 0;
+	}
+
+	/*
+	 * A new node has joined and needs to know that I am
+	 * the server.  We shortcut the election here and
+	 * respond directly to the inquirer.
+	 */
 	if(lc->server_id == my_id){
 		lr->u.lr_coordinator = my_id;
 		if(!(saddr->sin_addr.s_addr = nodeid_to_ipaddr(lr->u.lr_starter))){
@@ -701,16 +719,16 @@
 		saddr->sin_port = lr->u.lr_starter_port;
 		return 0;
 	}
-	
-	
-	if(lr->lr_type == LRT_MASTER_LEAVING){
-		lc->server_id = 0xDEAD;
-		lr->u.lr_node_count++;
-		return 0;
-	}
-	
+
 	if(lr->lr_type == LRT_ELECTION){
 		if((lr->u.lr_starter == my_id) && (lr->u.lr_node_count)){
+			/*
+			 * We started this election, and we've been
+			 * around the loop.  If the node count hasn't
+			 * changed since we started, we can proceed to
+			 * selection.  Otherwise, go around again,
+			 * setting ourselves as the leader to start.
+			 */
 			if(node_count == lr->u.lr_node_count){
 				lr->lr_type = LRT_SELECTION;
 			} else {
@@ -720,9 +738,23 @@
 			return 0;
 		}
 
+		/*
+		 * We are in the election phase, so
+		 * if we have the lowest ID so far,
+		 * we elect ourselves for server.
+		 *
+		 * However, if the mirror is being suspended
+		 * (lc->suspended), then we leave the current
+		 * coordinator in place.
+		 *
+		 * The client must not set lc->suspended until
+		 * it has completed sending all requests.  That
+		 * way, everyone is done sending requests when
+		 * the last server is stuck holding the ball.
+		 */
 		lr->u.lr_node_count++;
 		
-		if(my_id < lr->u.lr_coordinator){
+		if((my_id < lr->u.lr_coordinator) && !atomic_read(&lc->suspended)){
 			lr->u.lr_coordinator = my_id;
 		}
 		return 0;
@@ -732,6 +764,13 @@
 			return 0;
 		}
 		
+		/*
+		 * Need to restart election if someone
+		 * has joined since we started.
+		 *
+		 * Here, we are the starter, so set
+		 * node_count = 1
+		 */
 		if(lr->u.lr_node_count == node_count){
 			lr->lr_type = LRT_MASTER_ASSIGN;
 		} else {
@@ -741,12 +780,24 @@
 		}
 		lr->u.lr_node_count = 1;
 	} else if(lr->lr_type == LRT_MASTER_ASSIGN){
+		/*
+		 * If we are the server, assign it
+		 */
 		if(lr->u.lr_coordinator == my_id){
 			lc->server_id = my_id;
 		}
+
+		/*
+		 * Continue around the loop
+		 */
 		if(lr->u.lr_starter != my_id){
 			return 0;
 		}
+
+		/*
+		 * If I was the one who asked for the election,
+		 * then send the results back to the client
+		 */
 		if(!(saddr->sin_addr.s_addr = nodeid_to_ipaddr(lr->u.lr_starter))){
 			return -1;
 		}
@@ -820,18 +871,15 @@
 		}
 
 		if(!lc){
-			DMWARN("Log context can not be found for request");
 			lr.u.lr_int_rtn = -ENXIO;
 			goto reply;
 		}
 
-/*
-  if(lc->server_id != my_id){
-  DMWARN("I am not the server for this request");
-  lr.u.lr_int_rtn = -ENXIO;
-  goto reply;
-  }
-*/
+		if (lc->server_id != my_id) {
+			lr.u.lr_int_rtn = -ENXIO;
+			goto reply;
+		}
+
 		switch(lr.lr_type){
 		case LRT_IS_CLEAN:
 			error = server_is_clean(lc, &lr);
@@ -886,12 +934,23 @@
 		}
 
 		/* ATTENTION -- if error? */
+/*
 		if(error){
-			DMWARN("Error (%d) while processing request (type = %d)",
-			       error, lr.lr_type);
+			DMWARN("Error (%d) while processing request (%s)",
+			       error,
+			       (lr.lr_type == LRT_IS_CLEAN)? "LRT_IS_CLEAN":
+			       (lr.lr_type == LRT_IN_SYNC)? "LRT_IN_SYNC":
+			       (lr.lr_type == LRT_MARK_REGION)? "LRT_MARK_REGION":
+			       (lr.lr_type == LRT_GET_RESYNC_WORK)? "LRT_GET_RESYNC_WORK":
+			       (lr.lr_type == LRT_GET_SYNC_COUNT)? "LRT_GET_SYNC_COUNT":
+			       (lr.lr_type == LRT_CLEAR_REGION)? "LRT_CLEAR_REGION":
+			       (lr.lr_type == LRT_COMPLETE_RESYNC_WORK)? "LRT_COMPLETE_RESYNC_WORK":
+			       (lr.lr_type == LRT_MASTER_LEAVING)? "LRT_MASTER_LEAVING":
+			       (lr.lr_type == LRT_ELECTION)? "LRT_ELECTION":
+			       (lr.lr_type == LRT_SELECTION)? "LRT_SELECTION": "UNKNOWN");
 			lr.u.lr_int_rtn = error;
 		}
-
+*/
 	reply:
     
 		/* Why do we need to reset this? */
@@ -966,9 +1025,8 @@
 			** leaving node, it won't hurt anything - and**
 			** if there is, they will be recovered.      */
 		case SERVICE_NODE_FAILED:
-			DMINFO("A cluster mirror log member has %s",
-			       (restart_event_type == SERVICE_NODE_FAILED) ?
-			       "failed." : "left.");
+			if (restart_event_type == SERVICE_NODE_FAILED)
+				DMINFO("A cluster mirror log member has failed.");
 			
 			list_for_each_entry(lc, &log_list_head, log_list){
 				if(lc->server_id == my_id){
@@ -994,8 +1052,6 @@
 		schedule();
 	}
 
-	DMINFO("Cluster mirror log server is shutting down.");
-
 	sock_release(sock);
 	complete(&server_completion);
 	return 0;
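
For reference, the client-side half of the handover, the retry in
consult_server() when the old server answers -ENXIO, reduces to the
sketch below (illustrative C, not the driver code; send_to_server() and
elect_new_server() are stand-ins for the socket round trip and the
election, and only the forget-server / re-elect / retry flow mirrors
the patch):

/*
 * Client-side retry shape of consult_server(), reduced to a sketch.
 * send_to_server() and elect_new_server() are made-up stand-ins.
 */
#include <errno.h>
#include <stdint.h>
#include <stdio.h>

#define NO_SERVER 0xDEAD

static uint32_t server_id = NO_SERVER;	/* our notion of who owns the log */
static uint32_t my_id = 2;

/* Stand-in for the request round trip: the old server (node 1) answers
 * -ENXIO because it gave up the log while its devices were suspended. */
static int send_to_server(uint32_t server, int request)
{
	(void) request;
	return (server == 1) ? -ENXIO : 0;
}

/* Stand-in for run_election(): pick a node that can still serve. */
static uint32_t elect_new_server(void)
{
	return my_id;
}

static int consult_server(int request)
{
	int error;

	do {
		while (server_id == NO_SERVER)
			server_id = elect_new_server();

		error = send_to_server(server_id, request);
		if (error == -ENXIO) {
			/* Server no longer controls this log: forget it
			 * and go around again with a fresh election. */
			server_id = NO_SERVER;
		}
	} while (error == -ENXIO);

	return error;
}

int main(void)
{
	server_id = 1;	/* stale: node 1 was the server before suspension */
	int error = consult_server(0);

	printf("request returned %d, server is now node %u\n",
	       error, (unsigned) server_id);
	return 0;
}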



