[Cluster-devel] Cluster Project branch, RHEL5, updated. cmirror_1_1_15-150-gb291647

jbrassow at sourceware.org jbrassow at sourceware.org
Fri Jul 18 15:19:13 UTC 2008


This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "Cluster Project".

http://sources.redhat.com/git/gitweb.cgi?p=cluster.git;a=commitdiff;h=b2916471b1b7c79dba7f9624a1a148240375891f

The branch, RHEL5 has been updated
       via  b2916471b1b7c79dba7f9624a1a148240375891f (commit)
      from  705c5ceb17da0daf018c688ac478b4fae2371e3d (commit)

Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.

- Log -----------------------------------------------------------------
commit b2916471b1b7c79dba7f9624a1a148240375891f
Author: Jonathan Brassow <jbrassow at redhat.com>
Date:   Fri Jul 18 10:16:45 2008 -0500

    dm-log-clustered: Fix bug that would cause communication problems
    
    When an error is received from user space, the caller of the
    communication function is given the return code along with a
    parameter indicating that no data is available.  However,
    if the error was EAGAIN, the data_size variable was still set
    to 0.  So, when the retry was attempted and data returned from
    userspace, the kernel would think that there was not enough
    room to store the request.
    
    Also did quite a bit of code clean-up in userspace (no functional
    changes), such as adding a number->string translation for openAIS
    error codes.

-----------------------------------------------------------------------

Summary of changes:
 cmirror-kernel/src/dm-clog-tfr.c |    8 +-
 cmirror/src/cluster.c            |  237 ++++++++++++++++++--------------------
 2 files changed, 117 insertions(+), 128 deletions(-)

diff --git a/cmirror-kernel/src/dm-clog-tfr.c b/cmirror-kernel/src/dm-clog-tfr.c
index 95d4bd2..7932f77 100644
--- a/cmirror-kernel/src/dm-clog-tfr.c
+++ b/cmirror-kernel/src/dm-clog-tfr.c
@@ -96,7 +96,13 @@ static int fill_pkg(struct cn_msg *msg, struct clog_tfr *tfr)
 
 		if (msg) {
 			pkg->error = -msg->ack;
-			*(pkg->data_size) = 0;
+			/*
+			 * If we are trying again, we will need to know our
+			 * storage capacity.  Otherwise, along with the
+			 * error code, we make explicit that we have no data.
+			 */
+			if (pkg->error != -EAGAIN)
+				*(pkg->data_size) = 0;
 		} else if (tfr->data_size > *(pkg->data_size)) {
 			DMERR("Insufficient space to receive package [%s]::",
 			      RQ_TYPE(tfr->request_type));
diff --git a/cmirror/src/cluster.c b/cmirror/src/cluster.c
index 3a18687..68ffefb 100644
--- a/cmirror/src/cluster.c
+++ b/cmirror/src/cluster.c
@@ -19,35 +19,36 @@
 #include "logging.h"
 #include "link_mon.h"
 
-/* Open AIS error codes
-        SA_AIS_OK = 1,
-        SA_AIS_ERR_LIBRARY = 2,
-        SA_AIS_ERR_VERSION = 3,
-        SA_AIS_ERR_INIT = 4,
-        SA_AIS_ERR_TIMEOUT = 5,
-        SA_AIS_ERR_TRY_AGAIN = 6,
-        SA_AIS_ERR_INVALID_PARAM = 7,
-        SA_AIS_ERR_NO_MEMORY = 8,
-        SA_AIS_ERR_BAD_HANDLE = 9,
-        SA_AIS_ERR_BUSY = 10,
-        SA_AIS_ERR_ACCESS = 11,
-        SA_AIS_ERR_NOT_EXIST = 12,
-        SA_AIS_ERR_NAME_TOO_LONG = 13,
-        SA_AIS_ERR_EXIST = 14,
-        SA_AIS_ERR_NO_SPACE = 15,
-        SA_AIS_ERR_INTERRUPT = 16,
-        SA_AIS_ERR_NAME_NOT_FOUND = 17,
-        SA_AIS_ERR_NO_RESOURCES = 18,
-        SA_AIS_ERR_NOT_SUPPORTED = 19,
-        SA_AIS_ERR_BAD_OPERATION = 20,
-        SA_AIS_ERR_FAILED_OPERATION = 21,
-        SA_AIS_ERR_MESSAGE_ERROR = 22,
-        SA_AIS_ERR_QUEUE_FULL = 23,
-        SA_AIS_ERR_QUEUE_NOT_AVAILABLE = 24,
-        SA_AIS_ERR_BAD_FLAGS = 25,
-        SA_AIS_ERR_TOO_BIG = 26,
-        SA_AIS_ERR_NO_SECTIONS = 27
-*/
+/* Open AIS error codes */
+#define str_ais_error(x) \
+	((x) == SA_AIS_OK) ? "SA_AIS_OK" : \
+	((x) == SA_AIS_ERR_LIBRARY) ? "SA_AIS_ERR_LIBRARY" : \
+	((x) == SA_AIS_ERR_VERSION) ? "SA_AIS_ERR_VERSION" : \
+	((x) == SA_AIS_ERR_INIT) ? "SA_AIS_ERR_INIT" : \
+	((x) == SA_AIS_ERR_TIMEOUT) ? "SA_AIS_ERR_TIMEOUT" : \
+	((x) == SA_AIS_ERR_TRY_AGAIN) ? "SA_AIS_ERR_TRY_AGAIN" : \
+	((x) == SA_AIS_ERR_INVALID_PARAM) ? "SA_AIS_ERR_INVALID_PARAM" : \
+	((x) == SA_AIS_ERR_NO_MEMORY) ? "SA_AIS_ERR_NO_MEMORY" : \
+	((x) == SA_AIS_ERR_BAD_HANDLE) ? "SA_AIS_ERR_BAD_HANDLE" : \
+	((x) == SA_AIS_ERR_BUSY) ? "SA_AIS_ERR_BUSY" : \
+	((x) == SA_AIS_ERR_ACCESS) ? "SA_AIS_ERR_ACCESS" : \
+	((x) == SA_AIS_ERR_NOT_EXIST) ? "SA_AIS_ERR_NOT_EXIST" : \
+	((x) == SA_AIS_ERR_NAME_TOO_LONG) ? "SA_AIS_ERR_NAME_TOO_LONG" : \
+	((x) == SA_AIS_ERR_EXIST) ? "SA_AIS_ERR_EXIST" : \
+	((x) == SA_AIS_ERR_NO_SPACE) ? "SA_AIS_ERR_NO_SPACE" : \
+	((x) == SA_AIS_ERR_INTERRUPT) ? "SA_AIS_ERR_INTERRUPT" : \
+	((x) == SA_AIS_ERR_NAME_NOT_FOUND) ? "SA_AIS_ERR_NAME_NOT_FOUND" : \
+	((x) == SA_AIS_ERR_NO_RESOURCES) ? "SA_AIS_ERR_NO_RESOURCES" : \
+	((x) == SA_AIS_ERR_NOT_SUPPORTED) ? "SA_AIS_ERR_NOT_SUPPORTED" : \
+	((x) == SA_AIS_ERR_BAD_OPERATION) ? "SA_AIS_ERR_BAD_OPERATION" : \
+	((x) == SA_AIS_ERR_FAILED_OPERATION) ? "SA_AIS_ERR_FAILED_OPERATION" : \
+	((x) == SA_AIS_ERR_MESSAGE_ERROR) ? "SA_AIS_ERR_MESSAGE_ERROR" : \
+	((x) == SA_AIS_ERR_QUEUE_FULL) ? "SA_AIS_ERR_QUEUE_FULL" : \
+	((x) == SA_AIS_ERR_QUEUE_NOT_AVAILABLE) ? "SA_AIS_ERR_QUEUE_NOT_AVAILABLE" : \
+	((x) == SA_AIS_ERR_BAD_FLAGS) ? "SA_AIS_ERR_BAD_FLAGS" : \
+	((x) == SA_AIS_ERR_TOO_BIG) ? "SA_AIS_ERR_TOO_BIG" : \
+	((x) == SA_AIS_ERR_NO_SECTIONS) ? "SA_AIS_ERR_NO_SECTIONS" : \
+	"ais_error_unknown"
 
 #define DM_CLOG_RESPONSE 0x1000 /* in last byte of 32-bit value */
 #define DM_CLOG_CHECKPOINT_READY 21
@@ -145,10 +146,7 @@ int cluster_send(struct clog_tfr *tfr)
 		return 0;
 
 	/* error codes found in openais/cpg.h */
-	LOG_ERROR("cpg_mcast_joined error: %d%s", r,
-		  (r == SA_AIS_ERR_TRY_AGAIN) ? "/SA_AIS_ERR_TRY_AGAIN" :
-		  (r == CPG_ERR_BAD_HANDLE) ? "/CPG_ERR_BAD_HANDLE" :
-		  (r == CPG_ERR_ACCESS) ? "/CPG_ERR_ACCESS" : "");
+	LOG_ERROR("cpg_mcast_joined error: %s", str_ais_error(r));
 
 	tfr->error = -EBADE;
 	return -EBADE;
@@ -358,16 +356,19 @@ static int export_checkpoint(struct checkpoint_data *cp)
 
 	LOG_DBG("Sending checkpointed data to %u", cp->requester);
 
-	len = snprintf((char *)(name.value), SA_MAX_NAME_LENGTH, "bitmaps_%s_%u",
-		       SHORT_UUID(cp->uuid), cp->requester);
+	len = snprintf((char *)(name.value), SA_MAX_NAME_LENGTH,
+		       "bitmaps_%s_%u", SHORT_UUID(cp->uuid), cp->requester);
 	name.length = len;
 
+	len = strlen(cp->recovering_region) + 1;
+
 	attr.creationFlags = SA_CKPT_WR_ALL_REPLICAS;
-	attr.checkpointSize = cp->bitmap_size * 2 + strlen(cp->recovering_region) + 1;
+	attr.checkpointSize = cp->bitmap_size * 2 + len;
+
 	attr.retentionDuration = SA_TIME_MAX;
 	attr.maxSections = 4;      /* don't know why we need +1 */
-	attr.maxSectionSize = (cp->bitmap_size > (strlen(cp->recovering_region) + 1)) ?
-		cp->bitmap_size : (strlen(cp->recovering_region) + 1);
+
+	attr.maxSectionSize = (cp->bitmap_size > len) ?	cp->bitmap_size : len;
 	attr.maxSectionIdSize = 22;
 
 	flags = SA_CKPT_CHECKPOINT_READ |
@@ -388,8 +389,9 @@ open_retry:
 	}
 
 	if (rv != SA_AIS_OK) {
-		LOG_ERROR("[%s] Failed to open checkpoint for %u:  Reason = %d",
-			  SHORT_UUID(cp->uuid), cp->requester, rv);
+		LOG_ERROR("[%s] Failed to open checkpoint for %u: %s",
+			  SHORT_UUID(cp->uuid), cp->requester,
+			  str_ais_error(rv));
 		return -EIO; /* FIXME: better error */
 	}
 
@@ -402,21 +404,23 @@ open_retry:
 	section_attr.expirationTime = SA_TIME_END;
 
 sync_create_retry:
-	rv = saCkptSectionCreate(h, &section_attr, cp->sync_bits, cp->bitmap_size);
+	rv = saCkptSectionCreate(h, &section_attr,
+				 cp->sync_bits, cp->bitmap_size);
 	if (rv == SA_AIS_ERR_TRY_AGAIN) {
-		LOG_ERROR("export_checkpoint: sync create retry");
+		LOG_ERROR("Sync checkpoint section create retry");
 		sleep(1);
 		goto sync_create_retry;
 	}
 
 	if (rv == SA_AIS_ERR_EXIST) {
-		LOG_DBG("export_checkpoint: sync checkpoint section already exists");
+		LOG_DBG("Sync checkpoint section already exists");
 		saCkptCheckpointClose(h);
 		return -EEXIST;
 	}
 
 	if (rv != SA_AIS_OK) {
-		LOG_ERROR("export_checkpoint: sync checkpoint section creation failed");
+		LOG_ERROR("Sync checkpoint section creation failed: %s",
+			  str_ais_error(rv));
 		saCkptCheckpointClose(h);
 		return -EIO; /* FIXME: better error */
 	}
@@ -432,19 +436,20 @@ sync_create_retry:
 clean_create_retry:
 	rv = saCkptSectionCreate(h, &section_attr, cp->clean_bits, cp->bitmap_size);
 	if (rv == SA_AIS_ERR_TRY_AGAIN) {
-		LOG_ERROR("export_checkpoint: clean create retry");
+		LOG_ERROR("Clean checkpoint section create retry");
 		sleep(1);
 		goto clean_create_retry;
 	}
 
 	if (rv == SA_AIS_ERR_EXIST) {
-		LOG_DBG("export_checkpoint: clean checkpoint section already exists");
+		LOG_DBG("Clean checkpoint section already exists");
 		saCkptCheckpointClose(h);
 		return -EEXIST;
 	}
 
 	if (rv != SA_AIS_OK) {
-		LOG_ERROR("export_checkpoint: clean checkpoint section creation failed");
+		LOG_ERROR("Clean checkpoint section creation failed: %s",
+			  str_ais_error(rv));
 		saCkptCheckpointClose(h);
 		return -EIO; /* FIXME: better error */
 	}
@@ -461,19 +466,20 @@ rr_create_retry:
 	rv = saCkptSectionCreate(h, &section_attr, cp->recovering_region,
 				 strlen(cp->recovering_region) + 1);
 	if (rv == SA_AIS_ERR_TRY_AGAIN) {
-		LOG_ERROR("export_checkpoint: RR create retry");
+		LOG_ERROR("RR checkpoint section create retry");
 		sleep(1);
 		goto rr_create_retry;
 	}
 
 	if (rv == SA_AIS_ERR_EXIST) {
-		LOG_DBG("export_checkpoint: RR checkpoint section already exists");
+		LOG_DBG("RR checkpoint section already exists");
 		saCkptCheckpointClose(h);
 		return -EEXIST;
 	}
 
 	if (rv != SA_AIS_OK) {
-		LOG_ERROR("export_checkpoint: RR checkpoint section creation failed");
+		LOG_ERROR("RR checkpoint section creation failed: %s",
+			  str_ais_error(rv));
 		saCkptCheckpointClose(h);
 		return -EIO; /* FIXME: better error */
 	}
@@ -509,21 +515,6 @@ rr_create_retry:
 	return 0;
 }
 
-void ckpt_print (char *str, SaCkptCheckpointHandleT handle)
-{
-	SaCkptCheckpointDescriptorT descriptor;
-	SaAisErrorT rv;
-
-retry_statusget:
-	rv = saCkptCheckpointStatusGet (handle, &descriptor);
-	if (rv == SA_AIS_ERR_TRY_AGAIN)
-		goto retry_statusget;
-	
-	LOG_DBG("printing [%s] sections [%d] result [%d]",
-		str, descriptor.numberOfSections, rv);
-}
-
-
 static int import_checkpoint(struct clog_cpg *entry, int no_read)
 {
 	int rtn = 0;
@@ -554,12 +545,11 @@ open_retry:
 	}
 
 	if (rv != SA_AIS_OK) {
-		LOG_ERROR("Failed to open checkpoint");
+		LOG_ERROR("[%s] Failed to open checkpoint: %s",
+			  SHORT_UUID(entry->name.value), str_ais_error(rv));
 		return -EIO; /* FIXME: better error */
 	}
 
-	ckpt_print ("Before unlink", h);
-
 unlink_retry:
 	rv = saCkptCheckpointUnlink(ckpt_handle, &name);
 	if (rv == SA_AIS_ERR_TRY_AGAIN) {
@@ -573,10 +563,9 @@ unlink_retry:
 		goto no_read;
 	}
 
-	ckpt_print ("After unlink", h);
-
 init_retry:
-	rv = saCkptSectionIterationInitialize(h, SA_CKPT_SECTIONS_ANY, SA_TIME_END, &itr);
+	rv = saCkptSectionIterationInitialize(h, SA_CKPT_SECTIONS_ANY,
+					      SA_TIME_END, &itr);
 	if (rv == SA_AIS_ERR_TRY_AGAIN) {
 		LOG_ERROR("import_checkpoint: sync create retry");
 		sleep(1);
@@ -584,7 +573,8 @@ init_retry:
 	}
 
 	if (rv != SA_AIS_OK) {
-		LOG_ERROR("import_checkpoint: sync checkpoint section creation failed");
+		LOG_ERROR("[%s] Sync checkpoint section creation failed: %s",
+			  SHORT_UUID(entry->name.value), str_ais_error(rv));
 		return -EIO; /* FIXME: better error */
 	}
 
@@ -602,11 +592,13 @@ init_retry:
 	}
 	saCkptSectionIterationFinalize(itr);
 	if (len != 3) {
-		LOG_ERROR("import_checkpoint: %d checkpoint sections found", len);
+		LOG_ERROR("import_checkpoint: %d checkpoint sections found",
+			  len);
 		sleep(1);
 		goto init_retry;
 	}
-	saCkptSectionIterationInitialize(h, SA_CKPT_SECTIONS_ANY, SA_TIME_END, &itr);
+	saCkptSectionIterationInitialize(h, SA_CKPT_SECTIONS_ANY,
+					 SA_TIME_END, &itr);
 
 	while (1) {
 		rv = saCkptSectionIterationNext(itr, &desc);
@@ -620,7 +612,8 @@ init_retry:
 		}
 
 		if (rv != SA_AIS_OK) {
-			LOG_ERROR("import_checkpoint: clean checkpoint section creation failed");
+			LOG_ERROR("import_checkpoint: clean checkpoint section "
+				  "creation failed: %s", str_ais_error(rv));
 			rtn = -EIO; /* FIXME: better error */
 			goto fail;
 		}
@@ -645,20 +638,15 @@ init_retry:
 		}
 
 		if (rv != SA_AIS_OK) {
-			LOG_ERROR("import_checkpoint: ckpt read error");
+			LOG_ERROR("import_checkpoint: ckpt read error: %s",
+				  str_ais_error(rv));
 			rtn = -EIO; /* FIXME: better error */
 			goto fail;
 		}
 
-		/* FIXME: Is this catching something special?
-		if (!iov.readSize) {
-			LOG_ERROR("%s section empty", (char *)desc.sectionId.id);
-			continue;
-		}
-		*/
-
 		if (iov.readSize) {
-			if (pull_state(entry->name.value, (char *)desc.sectionId.id, bitmap,
+			if (pull_state(entry->name.value,
+				       (char *)desc.sectionId.id, bitmap,
 				       iov.readSize)) {
 				LOG_ERROR("Error loading state");
 				rtn = -EIO;
@@ -676,67 +664,62 @@ fail:
 no_read:
 	saCkptCheckpointClose(h);
 
-	/*
-	LOG_PRINT("Testing if chkpoint exists after unlink/close");
-	rv = saCkptCheckpointOpen(ckpt_handle, &name, NULL,
-				  SA_CKPT_CHECKPOINT_READ, 0, &h);
-	if (rv != SA_AIS_OK) {
-		LOG_PRINT("Checkpoint was not removed!!!");
-		saCkptCheckpointClose(h);
-	} else
-		LOG_PRINT("   rv == %d", rv);
-	*/
-
 	free(bitmap);
 	return rtn;
 }
 
+static void do_checkpoints(struct clog_cpg *entry)
+{
+	struct checkpoint_data *cp;
+
+	for (cp = entry->checkpoint_list; cp;) {
+		LOG_DBG("[%s] Checkpoint data available for node %u",
+			SHORT_UUID(entry->name.value), cp->requester);
+
+		/*
+		 * FIXME: Check return code.  Could send failure
+		 * notice in tfr in export_checkpoint function
+		 * by setting tfr->error
+		 */
+		switch (export_checkpoint(cp)) {
+		case -EEXIST:
+			LOG_DBG("[%s] Checkpoint for %u already handled",
+				SHORT_UUID(entry->name.value), cp->requester);
+		case 0:
+			entry->checkpoint_list = cp->next;
+			free_checkpoint(cp);
+			cp = entry->checkpoint_list;
+			break;
+		default:
+			/* FIXME: Skipping will cause list corruption */
+			LOG_ERROR("[%s] Failed to export checkpoint for %u",
+				  SHORT_UUID(entry->name.value), cp->requester);
+		}
+	}
+}
+
 static int do_cluster_work(void *data)
 {
 	int r = SA_AIS_OK;
 	struct clog_cpg *entry, *tmp;
-	struct checkpoint_data *cp;
 
 	list_for_each_entry_safe(entry, tmp, &clog_cpg_list, list) {
 		r = cpg_dispatch(entry->handle, CPG_DISPATCH_ALL);
 		if (r != SA_AIS_OK)
-			LOG_ERROR("cpg_dispatch failed: %d", r);
+			LOG_ERROR("cpg_dispatch failed: %s", str_ais_error(r));
 
 		if (entry->free_me) {
 			free(entry);
 			continue;
 		}
-
-		for (cp = entry->checkpoint_list; cp;) {
-			LOG_DBG("[%s] Checkpoint data available for node %u",
-				SHORT_UUID(entry->name.value), cp->requester);
-
-			/*
-			 * FIXME: Check return code.  Could send failure
-			 * notice in tfr in export_checkpoint function
-			 * by setting tfr->error
-			 */
-			switch (export_checkpoint(cp)) {
-			case -EEXIST:
-				LOG_DBG("[%s] Checkpoint for %u already handled by someone else",
-					SHORT_UUID(entry->name.value), cp->requester);
-			case 0:
-				entry->checkpoint_list = cp->next;
-				free_checkpoint(cp);
-				cp = entry->checkpoint_list;
-				break;
-			default:
-				/* FIXME: Skipping will cause list corruption */
-				LOG_ERROR("[%s] Failed to export checkpoint for %u",
-					  SHORT_UUID(entry->name.value), cp->requester);
-			}
-		}
+		do_checkpoints(entry);
 	}
 	return (r == SA_AIS_OK) ? 0 : -1;  /* FIXME: good error number? */
 }
 
 static void cpg_message_callback(cpg_handle_t handle, struct cpg_name *gname,
-				 uint32_t nodeid, uint32_t pid, void *msg, int msg_len)
+				 uint32_t nodeid, uint32_t pid,
+				 void *msg, int msg_len)
 {
 	int i;
 	int r = 0;
@@ -785,7 +768,7 @@ static void cpg_message_callback(cpg_handle_t handle, struct cpg_name *gname,
 	 * get our config callback.  However, since we can't respond after
 	 * leaving, we simply return.
 	 */
-	if (match->cpg_state != VALID)
+	if (match->state == LEAVING)
 		return;
 
 	i_am_server = (my_cluster_id == match->lowest_id) ? 1 : 0;
@@ -1098,10 +1081,10 @@ static void cpg_leave_callback(struct clog_cpg *match,
 			 */
 			if (!strcmp(match->name.value, tfr->uuid) &&
 			    (tfr->request_type != DM_CLOG_POSTSUSPEND)){
-				LOG_PRINT("[%s] Resending %s due to new server(%u)",
+				LOG_PRINT("[%s] Resending %s due to new server(%u -> %u)",
 					  SHORT_UUID(match->name.value),
 					  RQ_TYPE(tfr->request_type),
-					  match->lowest_id);
+					  lowest, match->lowest_id);
 				if (cluster_send(tfr))
 					LOG_ERROR("Failed resend");
 			}
@@ -1225,11 +1208,11 @@ int destroy_cluster_cpg(char *str)
 	list_for_each_entry_safe(del, tmp, &clog_cpg_list, list)
 		if (!strncmp(del->name.value, str, CPG_MAX_NAME_LENGTH)) {
 			del->cpg_state = INVALID;
+			del->state = LEAVING;
 			r = cpg_leave(del->handle, &del->name);
 			if (r != CPG_OK)
 				LOG_ERROR("Error leaving CPG!");
 			break;
-			del->state = LEAVING;
 		}
 
 	return 0;


hooks/post-receive
--
Cluster Project




More information about the Cluster-devel mailing list