[Cluster-devel] cluster/fence/fenced fd.h main.c recover.c
lhh at sourceware.org
lhh at sourceware.org
Mon Jan 29 20:04:42 UTC 2007
CVSROOT: /cvs/cluster
Module name: cluster
Branch: STABLE
Changes by: lhh at sourceware.org 2007-01-29 20:04:41
Modified files:
fence/fenced : fd.h main.c recover.c
Log message:
Add manual override for fenced to STABLE branch; patch is a merge from RHEL4 branch; fixes 223060
Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/fence/fenced/fd.h.diff?cvsroot=cluster&only_with_tag=STABLE&r1=1.7.2.4&r2=1.7.2.4.6.1
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/fence/fenced/main.c.diff?cvsroot=cluster&only_with_tag=STABLE&r1=1.16.2.9.6.2&r2=1.16.2.9.6.3
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/fence/fenced/recover.c.diff?cvsroot=cluster&only_with_tag=STABLE&r1=1.10.2.6.6.1&r2=1.10.2.6.6.2
--- cluster/fence/fenced/fd.h 2005/02/24 07:06:09 1.7.2.4
+++ cluster/fence/fenced/fd.h 2007/01/29 20:04:41 1.7.2.4.6.1
@@ -52,6 +52,7 @@
#define FENCED_SOCK_PATH "fenced_socket"
+#define DEFAULT_OVERRIDE_PATH "/var/run/cluster/fenced_override"
#define DEFAULT_POST_JOIN_DELAY 6
#define DEFAULT_POST_FAIL_DELAY 0
@@ -129,10 +130,12 @@
int debug;
int post_join_delay;
int post_fail_delay;
+ char *override_path;
int8_t clean_start;
int8_t post_join_delay_opt;
int8_t post_fail_delay_opt;
int8_t clean_start_opt;
+ int8_t override_path_opt;
};
#define FDFL_RUN (0)
--- cluster/fence/fenced/main.c 2006/01/23 19:24:10 1.16.2.9.6.2
+++ cluster/fence/fenced/main.c 2007/01/29 20:04:41 1.16.2.9.6.3
@@ -23,7 +23,7 @@
char our_name[MAX_CLUSTER_MEMBER_NAME_LEN+1];
-#define OPTION_STRING ("cj:f:t:Dn:hVSwQ")
+#define OPTION_STRING ("cj:f:t:Dn:O:hVSwQ")
#define LOCKFILE_NAME "/var/run/fenced.pid"
@@ -40,6 +40,8 @@
DEFAULT_POST_JOIN_DELAY);
printf(" -f <secs> Post-fail fencing delay (default %d)\n",
DEFAULT_POST_FAIL_DELAY);
+ printf(" -O <path> Override path (default %s)\n",
+ DEFAULT_OVERRIDE_PATH);
printf(" -D Enable debugging code and don't fork\n");
printf(" -h Print this help, then exit\n");
printf(" -n <name> Name of the fence domain, \"default\" if none\n");
@@ -434,6 +436,23 @@
free(str);
}
+ if (fd->comline->override_path_opt == FALSE) {
+ str = NULL;
+ memset(path, 0, 256);
+ sprintf(path, "/cluster/fence_daemon/@override_path");
+
+ error = ccs_get(cd, path, &str);
+ if (!error)
+ /* XXX These are not explicitly freed on exit; if
+ we decide to make fenced handle SIGHUP at a later
+ time, we will need to free this. */
+ fd->comline->override_path = strdup(str);
+ else
+ fd->comline->override_path = strdup(DEFAULT_OVERRIDE_PATH);
+ if (str)
+ free(str);
+ }
+
log_debug("delay post_join %ds post_fail %ds",
fd->comline->post_join_delay, fd->comline->post_fail_delay);
@@ -527,6 +546,8 @@
int cont = TRUE;
int optchar;
+ comline->override_path_opt = FALSE;
+ comline->override_path = NULL;
comline->post_join_delay_opt = FALSE;
comline->post_fail_delay_opt = FALSE;
comline->clean_start_opt = FALSE;
@@ -551,6 +572,11 @@
comline->post_fail_delay_opt = TRUE;
break;
+ case 'O':
+ comline->override_path = strdup(optarg);
+ comline->override_path_opt = TRUE;
+ break;
+
case 'D':
comline->debug = TRUE;
fenced_debug = TRUE;
--- cluster/fence/fenced/recover.c 2005/06/21 18:07:31 1.10.2.6.6.1
+++ cluster/fence/fenced/recover.c 2007/01/29 20:04:41 1.10.2.6.6.2
@@ -13,6 +13,9 @@
#include "fd.h"
#include "ccs.h"
+#include <sys/time.h>
+#include <sys/types.h>
+#include <sys/select.h>
/* Fencing recovery algorithm
@@ -359,6 +362,79 @@
return num_victims;
}
+/*
+ * Tear down the override FIFO.
+ *
+ * Removes @path from the filesystem (best effort; the result of unlink()
+ * is ignored) and, if @fd points at an open descriptor, closes it and
+ * stores -1 there so the caller cannot reuse it by accident.
+ *
+ * @fd:   address of the descriptor to close; may be NULL, in which case
+ *        only the unlink is performed.
+ * @path: FIFO path to remove; may be NULL, in which case nothing is
+ *        unlinked.
+ */
+static inline void close_override(int *fd, char *path)
+{
+	if (path)
+		unlink(path);
+
+	/* Nothing to close or reset without a descriptor slot. */
+	if (!fd)
+		return;
+
+	if (*fd >= 0)
+		close(*fd);
+	*fd = -1;
+}
+
+/*
+ * Create the override FIFO at @path with owner-only read/write
+ * permission and open it for non-blocking reads.
+ *
+ * Returns the open descriptor on success, or -1 if mkfifo() fails
+ * (which, per POSIX, includes @path already existing) or if open()
+ * fails.
+ */
+static int open_override(char *path)
+{
+	mode_t saved_mask;
+	int rc;
+
+	/* Strip group/other bits while the FIFO is created, then put the
+	   process umask back whatever the outcome. */
+	saved_mask = umask(077);
+	rc = mkfifo(path, S_IRUSR | S_IWUSR);
+	umask(saved_mask);
+
+	if (rc >= 0)
+		return open(path, O_RDONLY | O_NONBLOCK);
+
+	return -1;
+}
+
+/*
+ * Wait up to @timeout seconds for a node name to arrive on the override
+ * descriptor and compare it with @nodename.
+ *
+ * @ofd:      descriptor to poll (as returned by open_override()).
+ * @nodename: name of the node currently being fenced.
+ * @timeout:  seconds to wait for input.
+ *
+ * Returns 1 if the text read matches @nodename (case-insensitively),
+ * 0 if nothing arrived in time or the text did not match, and -1 if
+ * select() or read() failed (the error is reported through syslog).
+ *
+ * If @ofd cannot be polled (negative, or too large for an fd_set) or
+ * @nodename is NULL/empty, the function simply sleeps for @timeout
+ * seconds and returns 0, so the caller's retry loop keeps its pacing.
+ */
+static int check_override(int ofd, char *nodename, int timeout)
+{
+	char buf[128];
+	fd_set rfds;
+	struct timeval tv;
+	int ret, x;
+
+	/* FD_SET() on a descriptor >= FD_SETSIZE is undefined, so treat it
+	   like an unusable descriptor. */
+	if (ofd < 0 || ofd >= FD_SETSIZE || !nodename || !nodename[0]) {
+		sleep(timeout);
+		return 0;
+	}
+
+	FD_ZERO(&rfds);
+	FD_SET(ofd, &rfds);
+	tv.tv_sec = timeout;
+	tv.tv_usec = 0;
+
+	ret = select(ofd + 1, &rfds, NULL, NULL, &tv);
+	if (ret < 0) {
+		syslog(LOG_ERR, "select: %s\n", strerror(errno));
+		return -1;
+	}
+
+	/* Timed out with nothing to read. */
+	if (ret == 0)
+		return 0;
+
+	/* Leave room for the terminator; buf is pre-zeroed so a short read
+	   is always NUL-terminated. */
+	memset(buf, 0, sizeof(buf));
+	ret = read(ofd, buf, sizeof(buf) - 1);
+	if (ret < 0) {
+		syslog(LOG_ERR, "read: %s\n", strerror(errno));
+		return -1;
+	}
+
+	/* Chop at the first control character (typically the trailing
+	   newline).  Compare as unsigned char: with a signed plain char,
+	   bytes >= 0x80 would look negative and cut the name short. */
+	for (x = 0; x < ret; x++) {
+		if ((unsigned char)buf[x] < 0x20) {
+			buf[x] = 0;
+			break;
+		}
+	}
+
+	if (!strcasecmp(nodename, buf)) {
+		/* Case insensitive, but not as nice as, say, name_equal
+		   in the other file... */
+		return 1;
+	}
+
+	return 0;
+}
+
+
/* If there are victims after a node has joined, it's a good indication that
they may be joining the cluster shortly. If we delay a bit they might
become members and we can avoid fencing them. This is only really an issue
@@ -429,7 +505,7 @@
fd_node_t *node;
char *master_name;
uint32_t master;
- int error, cd;
+ int error, cd, override = -1;
master = find_master_nodeid(fd, &master_name);
@@ -466,7 +542,22 @@
list_del(&node->list);
free(node);
}
- sleep(5);
+
+ if (!fd->comline->override_path) {
+ sleep(5);
+ continue;
+ }
+
+ /* Check for manual intervention */
+ override = open_override(fd->comline->override_path);
+ if (check_override(override, node->name, 5) > 0) {
+ syslog(LOG_WARNING, "fence \"%s\" overridden by "
+ "administrator intervention", node->name);
+
+ list_del(&node->list);
+ free(node);
+ }
+ close_override(&override, fd->comline->override_path);
}
ccs_disconnect(cd);
More information about the Cluster-devel
mailing list