[Cluster-devel] cluster/cman cman_tool/join.c cman_tool/main.c ...

pcaulfield at sourceware.org pcaulfield at sourceware.org
Fri Aug 11 12:34:19 UTC 2006


CVSROOT:	/cvs/cluster
Module name:	cluster
Changes by:	pcaulfield at sourceware.org	2006-08-11 12:34:18

Modified files:
	cman/cman_tool : join.c main.c 
	cman/daemon    : ais.c ais.h cmanccs.c daemon.c 

Log message:
	Create a pipe between cman_tool and the cman daemon so that the daemon
	can report back to cman_tool any failures that occur during initialisation.
	
	This should help debug any problems people have with cman appearing
	to die straight after startup.

Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cman/cman_tool/join.c.diff?cvsroot=cluster&r1=1.46&r2=1.47
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cman/cman_tool/main.c.diff?cvsroot=cluster&r1=1.47&r2=1.48
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cman/daemon/ais.c.diff?cvsroot=cluster&r1=1.37&r2=1.38
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cman/daemon/ais.h.diff?cvsroot=cluster&r1=1.9&r2=1.10
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cman/daemon/cmanccs.c.diff?cvsroot=cluster&r1=1.18&r2=1.19
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cman/daemon/daemon.c.diff?cvsroot=cluster&r1=1.29&r2=1.30

--- cluster/cman/cman_tool/join.c	2006/07/03 07:58:39	1.46
+++ cluster/cman/cman_tool/join.c	2006/08/11 12:34:18	1.47
@@ -59,6 +59,7 @@
 	cman_handle_t h;
 	pid_t aisexec_pid;
 	int ctree;
+	int p[2];
 
 	ctree = ccs_force_connect(NULL, 1);
 	if (ctree < 0)
@@ -116,6 +117,12 @@
 	/* Use cman to configure services */
 	envp[envptr++] = strdup("OPENAIS_DEFAULT_CONFIG_IFACE=cmanconfig");
 
+	/* Create a pipe to monitor cman startup progress */
+	pipe(p);
+	fcntl(p[1], F_SETFD, 0); /* Don't close on exec */
+	snprintf(scratch, sizeof(scratch), "CMAN_PIPE=%d", p[1]);
+	envp[envptr++] = strdup(scratch);
+
 	envp[envptr++] = NULL;
 
 	argv[0] = "aisexec";
@@ -127,6 +134,7 @@
 		die("fork of aisexec daemon failed: %s", strerror(errno));
 
 	case 0: // child
+		close(p[0]);
 		be_daemon(!comline->verbose);
 		chdir(SBINDIR);
 		execve("./aisexec", argv, envp);
@@ -138,21 +146,33 @@
 
 	}
 
-#ifdef DEBUG
-	if (getenv("DEBUG_WAIT"))
-	{
-		printf("Waiting to attach gdb to aisexec (pid %d), press ENTER to continue\n", aisexec_pid);
-		getchar();
-	}
-#endif
-	/* Give the daemon a chance to start up */
+	/* Give the daemon a chance to start up, and monitor the pipe FD for messages */
 	i = 0;
 	do {
-		sleep(2);
-		h = cman_admin_init(NULL);
-		if (!h && comline->verbose)
-		{
-			fprintf(stderr, "waiting for aisexec to start\n");
+		fd_set fds;
+		struct timeval tv={1, 0};
+		int status;
+		char message[1024];
+
+		FD_ZERO(&fds);
+		FD_SET(p[0], &fds);
+
+		status = select(p[0]+1, &fds, NULL, NULL, &tv);
+		if (status == 0) {
+			h = cman_admin_init(NULL);
+			if (!h && comline->verbose)
+			{
+				fprintf(stderr, "waiting for aisexec to start\n");
+			}
+		}
+
+		/* Did we get an error? */
+		if (status == 1) {
+			if (read(p[0], message, sizeof(message)) != 0) {
+				fprintf(stderr, "cman not started: %s\n", message);
+				break;
+			}
+
 		}
 	} while (!h && ++i < 20);
 
--- cluster/cman/cman_tool/main.c	2006/05/11 10:38:11	1.47
+++ cluster/cman/cman_tool/main.c	2006/08/11 12:34:18	1.48
@@ -735,22 +735,10 @@
 
 static void check_arguments(commandline_t *comline)
 {
-	int error;
-
 	if (comline->two_node && comline->expected_votes != 1)
 		die("expected_votes value (%d) invalid in two node mode",
 		    comline->expected_votes);
 
-	if (!comline->nodenames[0]) {
-		struct utsname utsname;
-		error = uname(&utsname);
-		if (error)
-			die("cannot get node name, uname failed");
-
-		comline->nodenames[0] = strdup(utsname.nodename);
-		comline->num_nodenames++;
-	}
-
 	if (comline->port_opt &&
 	    (comline->port <= 0 || comline->port > 65535))
 		die("Port must be a number between 1 and 65535");
--- cluster/cman/daemon/ais.c	2006/08/11 07:09:13	1.37
+++ cluster/cman/daemon/ais.c	2006/08/11 12:34:18	1.38
@@ -58,6 +58,7 @@
 
 static int config_run;
 static char errorstring[512];
+static int startup_pipe;
 static unsigned int debug_mask;
 static struct objdb_iface_ver0 *global_objdb;
 static totempg_groups_handle group_handle;
@@ -201,6 +202,9 @@
 		debug_mask = atoi(getenv("CMAN_DEBUGLOG"));
 	}
 
+	if (getenv("CMAN_PIPE"))
+		startup_pipe = atoi(getenv("CMAN_PIPE"));
+
 	init_debug(debug_mask);
 
 	/* We need to set this up to internal defaults too early */
@@ -215,6 +219,7 @@
 	error = read_ccs_config();
 	if (error)
 	{
+		write_cman_pipe("Error reading config from CCS");
 		sprintf(errorstring, "Error reading config from CCS");
 		return -1;
 	}
@@ -254,6 +259,10 @@
 	/* Open local sockets and initialise I/O queues */
 	cman_init();
 
+	/* Let cman_tool know we are running */
+	close(startup_pipe);
+	startup_pipe = 0;
+
 	/* Start totem */
 	totempg_groups_initialize(&group_handle, cman_deliver_fn, cman_confchg_fn);
 	totempg_groups_join(group_handle, cman_group, 1);
@@ -275,7 +284,7 @@
 	unsigned int totem_object_handle;
 	unsigned int interface_object_handle;
 	char tmp[132];
-	int ret = -1;
+	int ret = 0;
 
 	P_AIS("Adding local address %s\n", ifaddr);
 
@@ -514,3 +523,12 @@
 
 	return 0;
 }
+
+
+/* Send a NUL-terminated error message down the CMAN startup pipe so that
+   cman_tool can display it; does nothing while startup_pipe is 0 */
+void write_cman_pipe(char *message)
+{
+	if (startup_pipe)
+		write(startup_pipe, message, strlen(message)+1);
+}
--- cluster/cman/daemon/ais.h	2006/06/30 13:00:26	1.9
+++ cluster/cman/daemon/ais.h	2006/08/11 12:34:18	1.10
@@ -19,6 +19,7 @@
 			      unsigned char toport, unsigned char fromport,
 			      int nodeid,
 			      unsigned int flags);
+extern void write_cman_pipe(char *message);
 
 extern uint64_t incarnation;
 extern struct totem_ip_address mcast_addr[MAX_INTERFACES];
--- cluster/cman/daemon/cmanccs.c	2006/08/02 11:54:36	1.18
+++ cluster/cman/daemon/cmanccs.c	2006/08/11 12:34:18	1.19
@@ -100,6 +100,7 @@
     ctree = ccs_force_connect(NULL, 1);
     if (ctree < 0) {
 	    log_msg(LOG_ERR, "Error connecting to CCS");
+	    write_cman_pipe("Cannot connect to CCS");
 	    return -1;
     }
 
@@ -175,6 +176,7 @@
         ret = getaddrinfo(nodenames[0], NULL, &ahints, &ainfo);
 	if (ret) {
 		log_msg(LOG_ERR, "Can't determine address family of nodename %s\n", nodenames[0]);
+		write_cman_pipe("Can't determine address family of nodename");
 		return NULL;
 	}
 
@@ -205,16 +207,20 @@
 	 */
 	error = cman_join_cluster(cluster_name, cluster_id,
 				  two_node, expected_votes);
-	if (error)
+	if (error) {
+		write_cman_pipe("Cannot start, ais may already be running");
 		return error;
+	}
 
 	/*
 	 * Setup the interface/multicast addresses
 	 */
 	for (i = 0; i<num_nodenames; i++) {
 		error = ais_add_ifaddr(mcast[i], nodenames[i], portnums[i]);
-		if (error)
+		if (error) {
+			write_cman_pipe("Multicast and node address families differ.");
 			return error;
+		}
 	}
 
 	return 0;
@@ -358,6 +364,7 @@
 	cd = ccs_force_connect(cname, 1);
 	if (cd < 0) {
 		log_msg(LOG_ERR, "Error connecting to CCS");
+		write_cman_pipe("Can't connect to CCSD");
 		return -ENOTCONN;
 	}
 
@@ -365,12 +372,14 @@
 	error = ccs_get(cd, CLUSTER_NAME_PATH, &str);
 	if (error) {
 		log_msg(LOG_ERR, "cannot find cluster name in config file");
+		write_cman_pipe("Can't find cluster name in CCS");
 		return -ENOENT;
 	}
 
 	if (cname) {
 		if (strcmp(cname, str)) {
 			log_msg(LOG_ERR, "cluster names not equal %s %s", cname, str);
+			write_cman_pipe("Cluster name in CCS does not match that passed to cman_tool");
 			return -ENOENT;
 		}
 	} else {
@@ -385,12 +394,25 @@
 	if (getenv("CMAN_NODENAME")) {
 		strcpy(nodename, getenv("CMAN_NODENAME"));
 		log_msg(LOG_INFO, "Using override node name %s\n", nodename);
+
+		sprintf(path, NODE_NAME_PATH_BYNAME, nodename);
+
+		error = ccs_get(cd, path, &str);
+		if (!error) {
+			free(str);
+		}
+		else {
+			log_msg(LOG_ERR, "Overridden node name %s is not in CCS", nodename);
+			write_cman_pipe("Overridden node name is not in CCS");
+			return -ENOENT;
+		}
 	}
 	else {
 		struct utsname utsname;
 		error = uname(&utsname);
 		if (error) {
 			log_msg(LOG_ERR, "cannot get node name, uname failed");
+			write_cman_pipe("Can't determine local node name");
 			return -ENOENT;
 		}
 		strcpy(nodename, utsname.nodename);
@@ -402,6 +424,7 @@
 	if (error) {
 		log_msg(LOG_ERR, "local node name \"%s\" not found in cluster.conf",
 			nodename);
+		write_cman_pipe("Can't find local node name in cluster.conf");
 		return -ENOENT;
 	}
 	nodenames[0] = strdup(nodename);
@@ -441,6 +464,7 @@
 			else {
 				if (atoi(str) < 0) {
 					log_msg(LOG_ERR, "negative votes not allowed");
+					write_cman_pipe("Found negative votes for this node in CCS");
 					return -EINVAL;
 				}
 				vote_sum += atoi(str);
@@ -501,6 +525,7 @@
 			int votestmp = atoi(str);
 			if (votestmp < 0 || votestmp > 255) {
 				log_msg(LOG_ERR, "invalid votes value %d", votes);
+				write_cman_pipe("Found invalid votes for node in CCS");
 				return -EINVAL;
 			}
 			votes = votestmp;
@@ -531,6 +556,7 @@
 
 	if (!nodeid) {
 		log_msg(LOG_ERR, "No nodeid specified in cluster.conf");
+		write_cman_pipe("CCS does not have a nodeid for this node, run 'ccs_tool addnodeids' to fix");
 		return -EINVAL;
 	}
 
@@ -604,6 +630,7 @@
 					"nodes with one vote each and expected "
 					"votes of 1 (node_count=%d vote_sum=%d)",
 					node_count, vote_sum);
+				write_cman_pipe("two_node set but there are more than 2 nodes");
 				return -EINVAL;
 			}
 
@@ -611,6 +638,7 @@
 				log_msg(LOG_ERR, "the two-node option requires exactly two "
 					"nodes with one vote each and expected "
 					"votes of 1 (votes=%d)", votes);
+				write_cman_pipe("two_node set but votes not set to 1");
 				return -EINVAL;
 			}
 		}
--- cluster/cman/daemon/daemon.c	2006/06/30 13:00:27	1.29
+++ cluster/cman/daemon/daemon.c	2006/08/11 12:34:18	1.30
@@ -312,6 +312,7 @@
 	local_socket = socket(PF_UNIX, SOCK_STREAM, 0);
 	if (local_socket < 0) {
 		log_msg(LOG_ERR, "Can't create local socket %s: %s\n", name, strerror(errno));
+		write_cman_pipe("Can't create local cman socket");
 		return -1;
 	}
 	/* Set Close-on-exec */
@@ -323,11 +324,13 @@
 	sockaddr.sun_family = AF_UNIX;
 	if (bind(local_socket, (struct sockaddr *) &sockaddr, sizeof(sockaddr))) {
 		log_msg(LOG_ERR, "can't bind local socket to %s: %s\n", name, strerror(errno));
+		write_cman_pipe("Can't bind to local cman socket");
 		close(local_socket);
 		return -1;
 	}
 	if (listen(local_socket, 1) != 0) {
 		log_msg(LOG_ERR, "listen on %s failed: %s\n", name, strerror(errno));
+		write_cman_pipe("listen failed on local cman socket");
 		close(local_socket);
 		return -1;
 	}
@@ -338,6 +341,7 @@
 	con = malloc(sizeof(struct connection));
 	if (!con) {
 		log_msg(LOG_ERR, "Can't allocate space for local connection: %s\n", strerror(errno));
+		write_cman_pipe("malloc failed for connection info");
 		close(local_socket);
 		return -1;
 	}




More information about the Cluster-devel mailing list