[Cluster-devel] cluster/cmirror-kernel/src dm-cmirror-client.c ...

jbrassow at sourceware.org jbrassow at sourceware.org
Tue Sep 5 17:50:12 UTC 2006


CVSROOT:	/cvs/cluster
Module name:	cluster
Branch: 	RHEL4
Changes by:	jbrassow at sourceware.org	2006-09-05 17:50:11

Modified files:
	cmirror-kernel/src: dm-cmirror-client.c dm-cmirror-server.c 

Log message:
	- fix the bugs I've seen so far that cause hangs during combinations
	of create/delete/convert of mirrors; most of them are related to the
	recently added ability to migrate the log server on suspension

Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-client.c.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.23&r2=1.1.2.24
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-server.c.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.16&r2=1.1.2.17

--- cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c	2006/07/27 23:11:55	1.1.2.23
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c	2006/09/05 17:50:11	1.1.2.24
@@ -396,8 +396,8 @@
 	set_fs(get_ds());
 
 	if(type == LRT_MASTER_LEAVING){
-		len = sock_recvmsg(lc->client_sock, &msg, sizeof(struct log_request),
-				   /* WAIT for it */0);
+		len = my_recvmsg(lc->client_sock, &msg, sizeof(struct log_request),
+				 0, 10);
 	} else {
 		len = my_recvmsg(lc->client_sock, &msg, sizeof(struct log_request),
 				 0, 5);
@@ -419,7 +419,7 @@
 		goto fail;
 	}
 
-	if(lr->u.lr_int_rtn == -ENXIO){
+	if (lr->u.lr_int_rtn == -ENXIO) {
 		lc->server_id = 0xDEAD;
 		*retry = 1;
 		goto fail;
@@ -591,7 +591,7 @@
 		       unsigned int argc, char **argv, int disk)
 {
 	int error = 0;
-	struct log_c *lc;
+	struct log_c *lc, *tmp_lc;
 	struct sockaddr_in saddr_in;
 
 	if (!disk) {
@@ -621,6 +621,15 @@
 
 	atomic_set(&lc->in_sync, -1);
 
+	list_for_each_entry(tmp_lc, &log_list_head, log_list){
+		if(!strncmp(tmp_lc->uuid, lc->uuid, MAX_NAME_LEN)){
+			DMERR("Log already exists with uuid, %s",
+			      lc->uuid + (strlen(lc->uuid) - 8));
+			error = -EINVAL;
+			goto fail;
+		}
+	}
+
 	list_add(&lc->log_list, &log_list_head);
 	INIT_LIST_HEAD(&lc->region_users);
 
@@ -730,6 +739,7 @@
 	list_del_init(&lc->log_list);
 	if ((lc->server_id == my_id) && !atomic_read(&lc->suspended))
 		consult_server(lc, 0, LRT_MASTER_LEAVING, NULL);
+
 	sock_release(lc->client_sock);
 
 	if (lc->log_dev)
@@ -748,6 +758,7 @@
 
 static int cluster_postsuspend(struct dirty_log *log)
 {
+	int r;
 	struct log_c *lc = (struct log_c *) log->context;
 
 	while (1) {
@@ -765,12 +776,16 @@
 	if(lc->server_id == my_id) {
 		while (1) {
 			consult_server(lc, 0, LRT_MASTER_LEAVING, NULL);
+
 			down(&consult_server_lock);
 			run_election(lc, 0xDEAD);
 			up(&consult_server_lock);
-			if (lc->server_id == my_id) {
+
+			if ((my_id && (lc->server_id == my_id))) {
+				atomic_set(&lc->suspended, 0);
 				set_current_state(TASK_INTERRUPTIBLE);
-				schedule_timeout(HZ/4);
+				schedule_timeout(HZ*2);
+				atomic_set(&lc->suspended, 1);
 			} else {
 				break;
 			}
@@ -1005,7 +1020,7 @@
 	if (!success) {
 		DMERR("Attempting to revert sync status of region #%llu", region);
 		set_current_state(TASK_INTERRUPTIBLE);
-		schedule_timeout(HZ/50);
+		schedule_timeout(HZ/5);
 	}
 
 	return;
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c	2006/07/27 23:11:55	1.1.2.16
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c	2006/09/05 17:50:11	1.1.2.17
@@ -107,7 +107,9 @@
 	if (!log->log_dev)
 		return 0;
 
-	BUG_ON(atomic_read(&log->suspended));
+	if (atomic_read(&log->suspended))
+		return -EDEADLK;
+
 	r = dm_io_sync_vm(1, &log->header_location, READ,
 			  log->disk_header, &ebits);
 	if (unlikely(r))
@@ -138,7 +140,9 @@
 	if (!log->log_dev)
 		return 0;
 
-	BUG_ON(atomic_read(&log->suspended));
+	if (atomic_read(&log->suspended))
+		return -EDEADLK;
+
 	header_to_disk(&log->header, log->disk_header);
 	return dm_io_sync_vm(1, &log->header_location, WRITE,
 			     log->disk_header, &ebits);
@@ -182,7 +186,9 @@
 	if (!log->log_dev)
 		return 0;
 
-	BUG_ON(atomic_read(&log->suspended));
+	if (atomic_read(&log->suspended))
+		return -EDEADLK;
+
 	r = dm_io_sync_vm(1, &log->bits_location, READ,
 			  log->clean_bits, &ebits);
 
@@ -199,7 +205,9 @@
 	if (!log->log_dev)
 		return 0;
 
-	BUG_ON(atomic_read(&log->suspended));
+	if (atomic_read(&log->suspended))
+		return -EDEADLK;
+
 	return dm_io_sync_vm(1, &log->bits_location, WRITE,
 			     log->clean_bits, &ebits);
 }
@@ -252,9 +260,9 @@
 			continue;
 		} else if(str[i] == 0xFF){
 			if(range_count==1){
-				DMINFO("  %d", region - 1);
+				DMDEBUG("  %d", region - 1);
 			} else if(range_count){
-				DMINFO("  %d - %d", region-range_count, region-1);
+				DMDEBUG("  %d - %d", region-range_count, region-1);
 			}
 			range_count = 0;
 			region+=(bit_count < 8)? bit_count: 8;      
@@ -272,9 +280,9 @@
 				count++;
 			} else {
 				if(range_count==1){
-					DMINFO("  %d", region - 1);
+					DMDEBUG("  %d", region - 1);
 				} else if(range_count){
-					DMINFO("  %d - %d", region-range_count, region-1);
+					DMDEBUG("  %d - %d", region-range_count, region-1);
 				}
 				range_count = 0;
 				region++;
@@ -283,9 +291,9 @@
 	}
 
 	if(range_count==1){
-		DMINFO("  %d", region - 1);
+		DMDEBUG("  %d", region - 1);
 	} else if(range_count){
-		DMINFO("  %d - %d", region-range_count, region);
+		DMDEBUG("  %d - %d", region-range_count, region);
 	}
 	return count;
 }
@@ -312,7 +320,7 @@
 	i = 1;
 	if (!lc->log_dev_failed &&
 	    ((r = read_header(lc)) || (i = 0) || (r = read_bits(lc)))) {
-		if (r == -EINVAL)
+		if (r == -EINVAL || r == -EDEADLK)
 			return r;
 
 		DMWARN("Read %s failed on mirror log device, %s",
@@ -416,9 +424,11 @@
 
 	i = 1;
 	if ((r = write_bits(lc)) || (i = 0) || (r = write_header(lc))) {
-		DMWARN("Write %s failed on mirror log device, %s.",
-		       i ? "bits" : "header", lc->log_dev->name);
-		lc->log_dev_failed = 1;
+		if (r != -EDEADLK) {
+			DMWARN("Write %s failed on mirror log device, %s.",
+			       i ? "bits" : "header", lc->log_dev->name);
+			lc->log_dev_failed = 1;
+		}
 	} else 
 		lc->log_dev_failed = 0;
 
@@ -469,6 +479,11 @@
 
 static int server_in_sync(struct log_c *lc, struct log_request *lr)
 {
+	if (lr->u.lr_region > lc->region_count) {
+		lr->u.lr_int_rtn = 0;
+		return -EINVAL;
+	}
+
 	if(likely(log_test_bit(lc->sync_bits, lr->u.lr_region)))
 		/* in-sync */
 		lr->u.lr_int_rtn = 1;
@@ -581,6 +596,11 @@
 
 static int server_complete_resync_work(struct log_c *lc, struct log_request *lr, int success){
 	uint32_t info;
+
+	if (lr->u.lr_region > lc->region_count) {
+		return -EINVAL;
+	}
+
 	log_clear_bit(lc, lc->recovering_bits, lr->u.lr_region);
 
 	if (success) {
@@ -678,15 +698,16 @@
 
 	/*
 	 * Check if we have access to the log.  We may not
-	 * get have loaded this device.
+	 * yet have loaded this device.
 	 */
-	if(!lc){
+	if (!lc) {
 		lr->u.lr_node_count++;
 		return 0;
 	}
 
 	if(lr->lr_type == LRT_MASTER_LEAVING){
-		lc->server_id = 0xDEAD;
+		if (lr->u.lr_starter == lc->server_id)
+			lc->server_id = 0xDEAD;
 		lr->u.lr_node_count++;
 		return 0;
 	}
@@ -696,7 +717,7 @@
 	 * We shortcut the election here and respond directly
 	 * to the inquirer
 	 */
-	if(lc->server_id == my_id){
+	if((lc->server_id == my_id) && !atomic_read(&lc->suspended)){
 		lr->u.lr_coordinator = my_id;
 		if(!(saddr->sin_addr.s_addr = nodeid_to_ipaddr(lr->u.lr_starter))){
 			return -1;
@@ -850,10 +871,12 @@
 			if(lc && (old != lc->server_id) && (my_id == lc->server_id)){
 				DMDEBUG("I'm the cluster mirror log server for %s",
 				       lc->uuid + (strlen(lc->uuid) - 8));
-				if (!atomic_read(&lc->suspended))
-					disk_resume(lc);
-				else
+				if (atomic_read(&lc->suspended)) {
 					DMDEBUG("Not reading disk log because I'm suspended.");
+					
+				} else if (disk_resume(lc) == -EDEADLK) {
+					DMDEBUG("Unable to read disk log - deadlock potential.");
+				}
 			}
 			goto reply;
 		}
@@ -944,7 +967,7 @@
 /*
 			DMWARN("Error (%d) while processing request (%s)",
 			       error,
-			       (lr.lr_type == LRT_IS_CLEAN)? "LRT_IS_C	LEAN":
+			       (lr.lr_type == LRT_IS_CLEAN)? "LRT_IS_CLEAN":
 			       (lr.lr_type == LRT_IN_SYNC)? "LRT_IN_SYNC":
 			       (lr.lr_type == LRT_MARK_REGION)? "LRT_MARK_REGION":
 			       (lr.lr_type == LRT_GET_RESYNC_WORK)? "LRT_GET_RESYNC_WORK":
@@ -972,7 +995,18 @@
 			
 		set_fs(fs);
 		if(error < 0){
-			DMWARN("unable to sendmsg to client (error = %d)", error);
+			DMWARN("unable to sendmsg to client (type = %s, error = %d)",
+			       (lr.lr_type == LRT_IS_CLEAN)? "LRT_IS_CLEAN":
+			       (lr.lr_type == LRT_IN_SYNC)? "LRT_IN_SYNC":
+			       (lr.lr_type == LRT_MARK_REGION)? "LRT_MARK_REGION":
+			       (lr.lr_type == LRT_GET_RESYNC_WORK)? "LRT_GET_RESYNC_WORK":
+			       (lr.lr_type == LRT_GET_SYNC_COUNT)? "LRT_GET_SYNC_COUNT":
+			       (lr.lr_type == LRT_CLEAR_REGION)? "LRT_CLEAR_REGION":
+			       (lr.lr_type == LRT_COMPLETE_RESYNC_WORK)? "LRT_COMPLETE_RESYNC_WORK":
+			       (lr.lr_type == LRT_MASTER_LEAVING)? "LRT_MASTER_LEAVING":
+			       (lr.lr_type == LRT_ELECTION)? "LRT_ELECTION":
+			       (lr.lr_type == LRT_SELECTION)? "LRT_SELECTION": "UNKNOWN",
+			       error);
 			return error;
 		}
 	} else if(error == -EAGAIN || error == -ETIMEDOUT){
@@ -1036,10 +1070,11 @@
 			
 			list_for_each_entry(lc, &log_list_head, log_list){
 				if(lc->server_id == my_id){
-					if (!atomic_read(&lc->suspended))
-						disk_resume(lc);
-					else
+					if (atomic_read(&lc->suspended)) {
 						DMDEBUG("Not reading disk log because I'm suspended.");
+					} else if (disk_resume(lc) == -EDEADLK) {
+						DMDEBUG("Unable to read disk log - deadlock potential.");
+					}
 				}
 			}
 			break;




More information about the Cluster-devel mailing list