[Cluster-devel] cluster/fence/fenced fd.h main.c recover.c

lhh at sourceware.org lhh at sourceware.org
Mon Jan 29 20:04:42 UTC 2007


CVSROOT:	/cvs/cluster
Module name:	cluster
Branch: 	STABLE
Changes by:	lhh at sourceware.org	2007-01-29 20:04:41

Modified files:
	fence/fenced   : fd.h main.c recover.c 

Log message:
	Add manual override for fenced to STABLE branch; patch is a merge from RHEL4 branch; fixes 223060

Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/fence/fenced/fd.h.diff?cvsroot=cluster&only_with_tag=STABLE&r1=1.7.2.4&r2=1.7.2.4.6.1
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/fence/fenced/main.c.diff?cvsroot=cluster&only_with_tag=STABLE&r1=1.16.2.9.6.2&r2=1.16.2.9.6.3
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/fence/fenced/recover.c.diff?cvsroot=cluster&only_with_tag=STABLE&r1=1.10.2.6.6.1&r2=1.10.2.6.6.2

--- cluster/fence/fenced/fd.h	2005/02/24 07:06:09	1.7.2.4
+++ cluster/fence/fenced/fd.h	2007/01/29 20:04:41	1.7.2.4.6.1
@@ -52,6 +52,7 @@
 
 
 #define FENCED_SOCK_PATH "fenced_socket"
+#define DEFAULT_OVERRIDE_PATH	"/var/run/cluster/fenced_override"
 
 #define DEFAULT_POST_JOIN_DELAY   6
 #define DEFAULT_POST_FAIL_DELAY   0
@@ -129,10 +130,12 @@
 	int debug;
 	int post_join_delay;
 	int post_fail_delay;
+	char *override_path;
 	int8_t clean_start;
 	int8_t post_join_delay_opt;
 	int8_t post_fail_delay_opt;
 	int8_t clean_start_opt;
+	int8_t override_path_opt;
 };
 
 #define FDFL_RUN        (0)
--- cluster/fence/fenced/main.c	2006/01/23 19:24:10	1.16.2.9.6.2
+++ cluster/fence/fenced/main.c	2007/01/29 20:04:41	1.16.2.9.6.3
@@ -23,7 +23,7 @@
 char our_name[MAX_CLUSTER_MEMBER_NAME_LEN+1];
 
 
-#define OPTION_STRING			("cj:f:t:Dn:hVSwQ")
+#define OPTION_STRING			("cj:f:t:Dn:O:hVSwQ")
 #define LOCKFILE_NAME			"/var/run/fenced.pid"
 
 
@@ -40,6 +40,8 @@
 			           DEFAULT_POST_JOIN_DELAY);
 	printf("  -f <secs>        Post-fail fencing delay (default %d)\n",
 				   DEFAULT_POST_FAIL_DELAY);
+ 	printf("  -O <path>        Override path (default %s)\n",
+ 	       			   DEFAULT_OVERRIDE_PATH);
 	printf("  -D               Enable debugging code and don't fork\n");
 	printf("  -h               Print this help, then exit\n");
 	printf("  -n <name>        Name of the fence domain, \"default\" if none\n");
@@ -434,6 +436,23 @@
 			free(str);
 	}
 
+	if (fd->comline->override_path_opt == FALSE) {
+		str = NULL;
+		memset(path, 0, 256);
+		sprintf(path, "/cluster/fence_daemon/@override_path");
+
+		error = ccs_get(cd, path, &str);
+		if (!error)
+			/* XXX These are not explicitly freed on exit; if
+			   we decide to make fenced handle SIGHUP at a later
+			   time, we will need to free this. */
+			fd->comline->override_path = strdup(str);
+		else
+			fd->comline->override_path = strdup(DEFAULT_OVERRIDE_PATH);
+		if (str)
+			free(str);
+	}
+
 	log_debug("delay post_join %ds post_fail %ds",
 		  fd->comline->post_join_delay, fd->comline->post_fail_delay);
 
@@ -527,6 +546,8 @@
 	int cont = TRUE;
 	int optchar;
 
+	comline->override_path_opt = FALSE;
+	comline->override_path = NULL;
 	comline->post_join_delay_opt = FALSE;
 	comline->post_fail_delay_opt = FALSE;
 	comline->clean_start_opt = FALSE;
@@ -551,6 +572,11 @@
 			comline->post_fail_delay_opt = TRUE;
 			break;
 
+		case 'O':
+			comline->override_path = strdup(optarg);
+			comline->override_path_opt = TRUE;
+			break;
+
 		case 'D':
 			comline->debug = TRUE;
 			fenced_debug = TRUE;
--- cluster/fence/fenced/recover.c	2005/06/21 18:07:31	1.10.2.6.6.1
+++ cluster/fence/fenced/recover.c	2007/01/29 20:04:41	1.10.2.6.6.2
@@ -13,6 +13,9 @@
 
 #include "fd.h"
 #include "ccs.h"
+#include <sys/time.h>
+#include <sys/types.h>
+#include <sys/select.h>
 
 /* Fencing recovery algorithm
 
@@ -359,6 +362,79 @@
 	return num_victims;
 }
 
+/* Unlink path, then close *fd (if open) and mark it closed with -1. */
+static inline void close_override(int *fd, char *path)
+{
+	unlink(path);
+	if (fd && *fd >= 0) close(*fd);
+	if (fd) *fd = -1;	/* guard: this store used to run even for fd == NULL */
+}
+
+/* Create a FIFO at path (owner read/write only) and open it for
+   non-blocking reads.  Returns the descriptor, or -1 on failure. */
+static int open_override(char *path)
+{
+	mode_t saved_mask = umask(077);
+	int rc = mkfifo(path, (S_IRUSR | S_IWUSR));
+
+	/* put the previous umask back before looking at the result */
+	umask(saved_mask);
+	if (rc < 0)
+		return -1;
+	return open(path, O_RDONLY | O_NONBLOCK);
+}
+
+/* Wait up to timeout seconds for a name to arrive on override descriptor ofd.
+   Returns 1 if it matches nodename (ignoring case), 0 on timeout/mismatch,
+   -1 if select() or read() fails.  With no usable fd or name, just sleeps. */
+static int check_override(int ofd, char *nodename, int timeout)
+{
+	char buf[128];
+	fd_set rfds;
+	struct timeval tv = {0, 0};
+	int ret, x;
+
+	if (ofd < 0 || !nodename || !strlen(nodename)) {
+		sleep(timeout);
+		return 0;
+	}
+
+	FD_ZERO(&rfds);
+	FD_SET(ofd, &rfds);
+	tv.tv_sec = timeout;
+
+	ret = select(ofd + 1, &rfds, NULL, NULL, &tv);
+	if (ret < 0) {
+		syslog(LOG_ERR, "select: %s\n", strerror(errno));
+		return -1;
+	}
+	if (ret == 0)
+		return 0;
+
+	/* Zero-fill first so whatever read() delivers is NUL-terminated. */
+	memset(buf, 0, sizeof(buf));
+	ret = read(ofd, buf, sizeof(buf) - 1);
+	if (ret < 0) {
+		syslog(LOG_ERR, "read: %s\n", strerror(errno));
+		return -1;
+	}
+
+	/* Cut the string at the first control character, such as a trailing
+	   newline.  Compare as unsigned char: plain char may be signed, which
+	   made every byte >= 0x80 look like a control character. */
+	for (x = 0; x < ret; x++) {
+		if ((unsigned char)buf[x] < 0x20) {
+			buf[x] = 0;
+			break;
+		}
+	}
+
+	/* Case insensitive, but not as nice as, say, name_equal
+	   in the other file... */
+	return !strcasecmp(nodename, buf) ? 1 : 0;
+}
+
+
 /* If there are victims after a node has joined, it's a good indication that
    they may be joining the cluster shortly.  If we delay a bit they might
    become members and we can avoid fencing them.  This is only really an issue
@@ -429,7 +505,7 @@
 	fd_node_t *node;
 	char *master_name;
 	uint32_t master;
-	int error, cd;
+	int error, cd, override = -1;
 
 	master = find_master_nodeid(fd, &master_name);
 
@@ -466,7 +542,22 @@
 			list_del(&node->list);
 			free(node);
 		}
-		sleep(5);
+
+		if (!fd->comline->override_path) {
+			sleep(5);
+			continue;
+		}
+
+		/* Check for manual intervention */
+		override = open_override(fd->comline->override_path);
+		if (check_override(override, node->name, 5) > 0) {
+			syslog(LOG_WARNING, "fence \"%s\" overridden by "
+			       "administrator intervention", node->name);
+
+			list_del(&node->list);
+			free(node);
+		}
+		close_override(&override, fd->comline->override_path);
 	}
 
 	ccs_disconnect(cd);




More information about the Cluster-devel mailing list