[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]

[Cluster-devel] cluster/cman cman_tool/main.c daemon/cnxman-pr ...



CVSROOT:	/cvs/cluster
Module name:	cluster
Changes by:	pcaulfield sourceware org	2006-11-03 15:07:53

Modified files:
	cman/cman_tool : main.c 
	cman/daemon    : cnxman-private.h commands.c 

Log message:
	fix bz#213747
	Basically, we don't let a node join a cluster that already has "Disallowed" nodes
	in it, as we don't consistently know the state of the cluster in that case (it
	could be two inquorate halves, for example).
	
	Sorry, Steven, this is yet another instance where cman has to exit() the aisexec
	process for the greater good of the cluster.
	
	I've also enhanced "cman_tool nodes" to show the disallowed nodes and a warning
	message that the cluster is in a bit of a mess.

Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cman/cman_tool/main.c.diff?cvsroot=cluster&r1=1.50&r2=1.51
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cman/daemon/cnxman-private.h.diff?cvsroot=cluster&r1=1.25&r2=1.26
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cman/daemon/commands.c.diff?cvsroot=cluster&r1=1.53&r2=1.54

--- cluster/cman/cman_tool/main.c	2006/10/09 15:54:31	1.50
+++ cluster/cman/cman_tool/main.c	2006/11/03 15:07:51	1.51
@@ -302,6 +302,8 @@
 	int count;
 	int i;
 	int numnodes;
+	int dis_count;
+	cman_node_t *dis_nodes;
 	cman_node_t *nodes;
 	struct tm *jtime;
 	struct tm *ftime;
@@ -322,11 +324,46 @@
 	if (cman_get_nodes(h, count, &numnodes, nodes) < 0)
 		die("cman_get_nodes failed: %s", cman_error(errno));
 
+
+	/* Get Disallowed nodes, so we can show them as such */
+	dis_nodes = malloc(sizeof(cman_node_t) * count);
+
+	if (cman_get_disallowed_nodes(h, count, &dis_count, dis_nodes) == 0) {
+		int i,j;
+		for (i=0; i<numnodes; i++) {
+			for (j=0; j<dis_count; j++) {
+				if (dis_nodes[j].cn_nodeid == nodes[i].cn_nodeid)
+					nodes[i].cn_member = 2;
+			}
+		}
+	}
+
 	/* Sort by nodeid to be friendly */
 	qsort(nodes, numnodes, sizeof(cman_node_t), node_compare);
 
+	if (dis_count) {
+		printf("NOTE: There are %d disallowed nodes,\n", dis_count);
+		printf("      members list may seem inconsistent across the cluster\n");
+	}
+
 	printf("Node  Sts   Inc   Joined               Name\n");
 	for (i=0; i<numnodes; i++) {
+		char member_type;
+
+		switch (nodes[i].cn_member) {
+		case 0:
+			member_type = 'X';
+			break;
+		case 1:
+			member_type = 'M';
+			break;
+		case 2:
+			member_type = 'd';
+			break;
+		default:
+			member_type = '?';
+			break;
+		}
 
 		jtime = localtime(&nodes[i].cn_jointime.tv_sec);
 		if (nodes[i].cn_jointime.tv_sec && nodes[i].cn_member)
@@ -335,7 +372,7 @@
 			strcpy(jstring, "                   ");
 
 		printf("%4d   %c  %5d   %s  %s\n",
-		       nodes[i].cn_nodeid, nodes[i].cn_member?'M':'X',
+		       nodes[i].cn_nodeid, member_type,
 		       nodes[i].cn_incarnation, jstring, nodes[i].cn_name);
 
 		if (comline->fence_opt) {
--- cluster/cman/daemon/cnxman-private.h	2006/10/05 07:48:33	1.25
+++ cluster/cman/daemon/cnxman-private.h	2006/11/03 15:07:52	1.26
@@ -143,12 +143,15 @@
 #define RECONFIG_PARAM_CONFIG_VERSION 3
 #define RECONFIG_PARAM_CCS            4
 
-/* NODE_FLAGS_BEENDOWN   - this node has been down.
-   NODE_FLAGS_FENCED     - This node has been fenced since it last went down.
+/* NODE_FLAGS_BEENDOWN       - This node has been down.
+   NODE_FLAGS_FENCED         - This node has been fenced since it last went down.
+   NODE_FLAGS_FENCEDWHILEUP  - This node was fenced manually (probably).
+   NODE_FLAGS_SEESDISALLOWED - Only set in a transition message
 */
 #define NODE_FLAGS_BEENDOWN           1
 #define NODE_FLAGS_FENCED             2
 #define NODE_FLAGS_FENCEDWHILEUP      4
+#define NODE_FLAGS_SEESDISALLOWED     8
 
 /* There's one of these for each node in the cluster */
 struct cluster_node {
--- cluster/cman/daemon/commands.c	2006/10/16 14:10:21	1.53
+++ cluster/cman/daemon/commands.c	2006/11/03 15:07:52	1.54
@@ -131,6 +131,18 @@
 	return ((node->port_bits[byte] & (1<<bit)) != 0);
 }
 
+static int have_disallowed(void)
+{
+	struct cluster_node *node;
+
+	list_iterate_items(node, &cluster_members_list) {
+		if (node->state == NODESTATE_AISONLY)
+			return 1;
+	}
+
+	return 0;
+}
+
 /* If "cluster_is_quorate" is 0 then all activity apart from protected ports is
  * blocked. */
 static void set_quorate(int total_votes)
@@ -1532,6 +1544,9 @@
 		len += 1;
 	}
 
+	if (have_disallowed())
+		msg->flags |= NODE_FLAGS_SEESDISALLOWED;
+
 	comms_send_message(msg, len,
 			   0,0,
 			   0,  /* multicast */
@@ -1676,6 +1691,16 @@
 		P_MEMB("Transition message from %d does not match current config - should quit ?\n", nodeid);
 		return; // PJC ???
 	}
+
+	/* If the remote node can see AISONLY nodes then we can't join as we don't
+	   know the full state */
+	if (msg->flags & NODE_FLAGS_SEESDISALLOWED && !have_disallowed()) {
+		/* Must use syslog directly here or the message will never arrive */
+		syslog(LOG_CRIT, "CMAN: Joined a cluster with disallowed nodes. must die");
+		exit(2);
+	}
+	msg->flags &= ~NODE_FLAGS_SEESDISALLOWED;
+
 	node = find_node_by_nodeid(nodeid);
 	assert(node);
 
@@ -1703,6 +1728,12 @@
 		add_ais_node(nodeid, incarnation, num_ais_nodes);
 	}
 
+	/* If the cluster already has some AISONLY nodes then we can't make
+	   sense of the membership. So the new node has to also be AISONLY
+	   until we are consistent again */
+	if (have_disallowed() && !node->us)
+		node->state = NODESTATE_AISONLY;
+
 	node->flags = msg->flags; /* This will clear the BEENDOWN flag of course */
 	if (node->fence_agent && msg->fence_agent[0] && strcmp(node->fence_agent, msg->fence_agent))
 	{


[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]