[Cluster-devel] cluster/cman qdisk/score.h qdisk/score.c qdisk ...
lhh at sourceware.org
lhh at sourceware.org
Mon Jan 22 22:50:16 UTC 2007
CVSROOT: /cvs/cluster
Module name: cluster
Branch: RHEL5
Changes by: lhh at sourceware.org 2007-01-22 22:50:12
Modified files:
cman/qdisk : score.h score.c Makefile main.c disk.h clulog.c
cman/man : qdisk.5 mkqdisk.8
Added files:
cman/qdisk : daemon_init.c
Log message:
Resolves bugzillas: #213533, #216092, #220211, #223002, #223234/#223240
Detailed comments:
* Lock in memory to prevent being swapped out
* Turn on RR scheduling for main + score threads
* Let qdiskd wait for CMAN to start
* Add option to qdiskd to stop CMAN if qdisk device is not available
* Make qdisk interval timings more accurate
* Add option to reboot node if qdiskd detects internal hang > failure time (e.g. interval*tko, in seconds)
* Add per-heuristic tko counts for unreliable heuristics (e.g. ping packets)
* Remove nodes from quorate mask immediately on eviction
* Update man pages with better examples
* Don't let >1 instance of qdiskd be started
* Clarify logging output.
* Improve data in status_file.
* Allow qdiskd to run with no defined heuristics (master-always-wins mode).
* Make fencing of nodes optional (default = on).
* Make sure CMAN is running before we try to talk to it at each point.
Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cman/qdisk/daemon_init.c.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=NONE&r2=1.1.2.1
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cman/qdisk/score.h.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.2&r2=1.2.4.1
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cman/qdisk/score.c.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.2&r2=1.2.4.1
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cman/qdisk/Makefile.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.6&r2=1.6.2.1
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cman/qdisk/main.c.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.4.2.2&r2=1.4.2.3
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cman/qdisk/disk.h.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.4&r2=1.4.2.1
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cman/qdisk/clulog.c.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.2&r2=1.2.4.1
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cman/man/qdisk.5.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.3&r2=1.3.2.1
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cman/man/mkqdisk.8.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.2&r2=1.2.6.1
--- cluster/cman/qdisk/score.h 2006/05/19 14:41:35 1.2
+++ cluster/cman/qdisk/score.h 2007/01/22 22:50:11 1.2.4.1
@@ -32,7 +32,9 @@
char * program;
int score;
int available;
+ int tko;
int interval;
+ int misses;
pid_t childpid;
time_t nextrun;
};
@@ -50,7 +52,7 @@
/*
Start the thread which runs the scoring applets
*/
-int start_score_thread(struct h_data *h, int count);
+int start_score_thread(qd_ctx *ctx, struct h_data *h, int count);
/*
Get our score + maxscore
--- cluster/cman/qdisk/score.c 2006/05/19 14:41:35 1.2
+++ cluster/cman/qdisk/score.c 2007/01/22 22:50:11 1.2.4.1
@@ -32,14 +32,20 @@
#include <string.h>
#include <ccs.h>
#include <clulog.h>
+#include <sched.h>
+#include <sys/mman.h>
+#include "disk.h"
#include "score.h"
static pthread_mutex_t sc_lock = PTHREAD_MUTEX_INITIALIZER;
static int _score = 0, _maxscore = 0, _score_thread_running = 0;
static pthread_t score_thread = (pthread_t)0;
+void set_priority(int, int);
struct h_arg {
struct h_data *h;
+ int sched_queue;
+ int sched_prio;
int count;
};
@@ -97,6 +103,20 @@
h->childpid = pid;
return 0;
}
+
+ /*
+ * always use SCHED_OTHER for the child processes
+ * nice -1 is fine; but we don't know what the child process
+ * might do, so leaving it (potentially) in SCHED_RR or SCHED_FIFO
+ * is out of the question
+ *
+ * XXX if you set SCHED_OTHER in the conf file and nice 20, the below
+ * will make the heuristics a higher prio than qdiskd. This should be
+ * fine in practice, because running qdiskd at nice 20 will cause all
+ * sorts of problems on a busy system.
+ */
+ set_priority(SCHED_OTHER, -1);
+ munlockall();
argv[0] = "/bin/sh";
argv[1] = "-c";
@@ -122,6 +142,12 @@
*score = 0;
*maxscore = 0;
+
+ /* Allow operation w/o any heuristics */
+ if (!max) {
+ *score = *maxscore = 1;
+ return;
+ }
for (x = 0; x < max; x++) {
*maxscore += h[x].score;
@@ -141,22 +167,51 @@
int status;
if (h->childpid == 0)
+ /* No child to check */
return 0;
ret = waitpid(h->childpid, &status, block?0:WNOHANG);
if (!block && ret == 0)
+ /* No children exited */
return 0;
h->childpid = 0;
- h->available = 0;
if (ret < 0 && errno == ECHILD)
- return -1;
- if (!WIFEXITED(status))
- return 0;
- if (WEXITSTATUS(status) != 0)
- return 0;
- h->available = 1;
+ /* wrong child? */
+ goto miss;
+ if (!WIFEXITED(status)) {
+ ret = 0;
+ goto miss;
+ }
+ if (WEXITSTATUS(status) != 0) {
+ ret = 0;
+ goto miss;
+ }
+
+ /* Returned 0 and was not killed */
+ if (!h->available) {
+ h->available = 1;
+ clulog(LOG_INFO, "Heuristic: '%s' UP\n", h->program);
+ }
+ h->misses = 0;
return 0;
+
+miss:
+ if (h->available) {
+ h->misses++;
+ if (h->misses >= h->tko) {
+ clulog(LOG_INFO,
+ "Heuristic: '%s' DOWN (%d/%d)\n",
+ h->program, h->misses, h->tko);
+ h->available = 0;
+ } else {
+ clulog(LOG_DEBUG,
+ "Heuristic: '%s' missed (%d/%d)\n",
+ h->program, h->misses, h->tko);
+ }
+ }
+
+ return ret;
}
@@ -204,7 +259,9 @@
do {
h[x].program = NULL;
h[x].available = 0;
+ h[x].misses = 0;
h[x].interval = 2;
+ h[x].tko = 1;
h[x].score = 1;
h[x].childpid = 0;
h[x].nextrun = 0;
@@ -236,9 +293,20 @@
if (h[x].interval <= 0)
h[x].interval = 2;
}
+
+ /* Get tko for this heuristic */
+ snprintf(query, sizeof(query),
+ "/cluster/quorumd/heuristic[%d]/@tko", x+1);
+ if (ccs_get(ccsfd, query, &val) == 0) {
+ h[x].tko= atoi(val);
+ free(val);
+ if (h[x].tko <= 0)
+ h[x].tko = 1;
+ }
- clulog(LOG_DEBUG, "Heuristic: '%s' score=%d interval=%d\n",
- h[x].program, h[x].score, h[x].interval);
+ clulog(LOG_DEBUG,
+ "Heuristic: '%s' score=%d interval=%d tko=%d\n",
+ h[x].program, h[x].score, h[x].interval, h[x].tko);
} while (++x < max);
@@ -271,6 +339,8 @@
{
struct h_arg *args = (struct h_arg *)arg;
int score, maxscore;
+
+ set_priority(args->sched_queue, args->sched_prio);
while (_score_thread_running) {
fork_heuristics(args->h, args->count);
@@ -317,7 +387,7 @@
to pass in h if it was allocated on the stack.
*/
int
-start_score_thread(struct h_data *h, int count)
+start_score_thread(qd_ctx *ctx, struct h_data *h, int count)
{
pthread_attr_t attrs;
struct h_arg *args;
@@ -337,8 +407,11 @@
memcpy(args->h, h, (sizeof(struct h_data) * count));
args->count = count;
+ args->sched_queue = ctx->qc_sched;
+ args->sched_prio = ctx->qc_sched_prio;
_score_thread_running = 1;
+
pthread_attr_init(&attrs);
pthread_attr_setinheritsched(&attrs, PTHREAD_INHERIT_SCHED);
pthread_create(&score_thread, &attrs, score_thread_main, args);
--- cluster/cman/qdisk/Makefile 2006/08/11 15:18:05 1.6
+++ cluster/cman/qdisk/Makefile 2007/01/22 22:50:11 1.6.2.1
@@ -28,7 +28,7 @@
install ${TARGET} ${sbindir}
qdiskd: disk.o crc32.o disk_util.o main.o score.o bitmap.o clulog.o \
- gettid.o proc.o ../lib/libcman.a
+ gettid.o proc.o daemon_init.o ../lib/libcman.a
gcc -o $@ $^ -lpthread -L../lib -L${ccslibdir} -lccs
mkqdisk: disk.o crc32.o disk_util.o \
--- cluster/cman/qdisk/main.c 2007/01/16 15:16:56 1.4.2.2
+++ cluster/cman/qdisk/main.c 2007/01/22 22:50:11 1.4.2.3
@@ -35,11 +35,21 @@
#include <unistd.h>
#include <time.h>
#include <sys/reboot.h>
+#include <sys/time.h>
#include <linux/reboot.h>
+#include <sched.h>
#include <signal.h>
#include <ccs.h>
#include "score.h"
#include "clulog.h"
+#if (!defined(LIBCMAN_VERSION) || \
+ (defined(LIBCMAN_VERSION) && LIBCMAN_VERSION < 2))
+#include <cluster/cnxman-socket.h>
+#endif
+
+int daemon_init(char *);
+int check_process_running(char *, pid_t *);
+
/*
TODO:
1) Take into account timings to gracefully extend node timeouts during
@@ -155,6 +165,11 @@
if (sb->ps_timestamp == ni[x].ni_last_seen) {
/* XXX check for average + allow grace */
ni[x].ni_misses++;
+ if (ni[x].ni_misses > 1) {
+ clulog(LOG_DEBUG,
+ "Node %d missed an update (%d/%d)\n",
+ x+1, ni[x].ni_misses, ctx->qc_tko);
+ }
continue;
}
@@ -208,6 +223,11 @@
ni[x].ni_misses = 0;
ni[x].ni_state = S_NONE;
+ /* Clear our master mask for the node after eviction
+ * or shutdown */
+ if (mask)
+ clear_bit(mask, (ni[x].ni_status.ps_nodeid-1),
+ sizeof(memb_mask_t));
continue;
}
@@ -227,15 +247,17 @@
Write eviction notice if we're the master.
*/
if (ctx->qc_status == S_MASTER) {
- clulog(LOG_DEBUG,
+ clulog(LOG_NOTICE,
"Writing eviction notice for node %d\n",
ni[x].ni_status.ps_nodeid);
qd_write_status(ctx, ni[x].ni_status.ps_nodeid,
S_EVICT, NULL, NULL, NULL);
- clulog(LOG_DEBUG,
- "Telling CMAN to kill the node\n");
- cman_kill_node(ctx->qc_ch,
- ni[x].ni_status.ps_nodeid);
+ if (ctx->qc_flags & RF_ALLOW_KILL) {
+ clulog(LOG_DEBUG, "Telling CMAN to "
+ "kill the node\n");
+ cman_kill_node(ctx->qc_ch,
+ ni[x].ni_status.ps_nodeid);
+ }
}
/*
@@ -255,6 +277,10 @@
ni[x].ni_evil_incarnation =
ni[x].ni_status.ps_incarnation;
+ /* Clear our master mask for the node after eviction */
+ if (mask)
+ clear_bit(mask, (ni[x].ni_status.ps_nodeid-1),
+ sizeof(memb_mask_t));
continue;
}
@@ -279,9 +305,12 @@
ni[x].ni_status.ps_state = S_EVICT;
/* XXX Need to fence it again */
- clulog(LOG_DEBUG, "Telling CMAN to kill the node\n");
- cman_kill_node(ctx->qc_ch,
- ni[x].ni_status.ps_nodeid);
+ if (ctx->qc_flags & RF_ALLOW_KILL) {
+ clulog(LOG_DEBUG, "Telling CMAN to "
+ "kill the node\n");
+ cman_kill_node(ctx->qc_ch,
+ ni[x].ni_status.ps_nodeid);
+ }
continue;
}
@@ -416,6 +445,10 @@
int x = 0, score, maxscore;
clulog(LOG_INFO, "Quorum Daemon Initializing\n");
+
+ if (mlockall(MCL_CURRENT|MCL_FUTURE) != 0) {
+ clulog(LOG_ERR, "Unable to mlockall()\n");
+ }
if (qdisk_validate(ctx->qc_device) < 0)
return -1;
@@ -427,7 +460,7 @@
return -1;
}
- start_score_thread(h, maxh);
+ start_score_thread(ctx, h, maxh);
node_info_init(ni, max);
if (qd_write_status(ctx, ctx->qc_my_id,
@@ -447,7 +480,6 @@
}
sleep(ctx->qc_interval);
-
}
get_my_score(&score,&maxscore);
@@ -500,12 +532,16 @@
return;
memset(master_mask, 0, sizeof(master_mask));
-
for (x = 0; x < retnodes; x++) {
if (is_bit_set(mask, nodes[x].cn_nodeid-1, sizeof(mask)) &&
- nodes[x].cn_member)
+ nodes[x].cn_member) {
set_bit(master_mask, nodes[x].cn_nodeid-1,
sizeof(master_mask));
+ } else {
+ /* Not in CMAN output = not allowed */
+ clear_bit(master_mask, (nodes[x].cn_nodeid-1),
+ sizeof(memb_mask_t));
+ }
}
}
@@ -604,12 +640,25 @@
}
fprintf(fp, "Node ID: %d\n", ctx->qc_my_id);
- fprintf(fp, "Score (current / min req. / max allowed): %d / %d / %d\n",
- score, score_req, score_max);
+
+ if (ctx->qc_master)
+ fprintf(fp, "Master Node ID: %d\n", ctx->qc_master);
+ else
+ fprintf(fp, "Master Node ID: (none)\n");
+
+ fprintf(fp, "Score: %d/%d (Minimum required = %d)\n",
+ score, score_max, score_req);
fprintf(fp, "Current state: %s\n", state_str(ctx->qc_status));
fprintf(fp, "Current disk state: %s\n",
state_str(ctx->qc_disk_status));
+ fprintf(fp, "Initializing Set: {");
+ for (x=0; x<max; x++) {
+ if (ni[x].ni_state == S_INIT)
+ fprintf(fp," %d", ni[x].ni_status.ps_nodeid);
+ }
+ fprintf(fp, " }\n");
+
fprintf(fp, "Visible Set: {");
for (x=0; x<max; x++) {
if (ni[x].ni_state >= S_RUN || ni[x].ni_status.ps_nodeid ==
@@ -617,13 +666,10 @@
fprintf(fp," %d", ni[x].ni_status.ps_nodeid);
}
fprintf(fp, " }\n");
-
- if (!ctx->qc_master) {
- fprintf(fp, "No master node\n");
+
+ if (!ctx->qc_master)
goto out;
- }
- fprintf(fp, "Master Node ID: %d\n", ctx->qc_master);
fprintf(fp, "Quorate Set: {");
for (x=0; x<max; x++) {
if (is_bit_set(ni[ctx->qc_master-1].ni_status.ps_master_mask,
@@ -642,18 +688,141 @@
}
+/* Timeval functions from clumanager */
+/**
+ * Scale a (struct timeval).
+ *
+ * @param tv The timeval to scale.
+ * @param scale Positive multiplier.
+ * @return tv
+ */
+struct timeval *
+_scale_tv(struct timeval *tv, int scale)
+{
+ tv->tv_sec *= scale;
+ tv->tv_usec *= scale;
+
+ if (tv->tv_usec > 1000000) {
+ tv->tv_sec += (tv->tv_usec / 1000000);
+ tv->tv_usec = (tv->tv_usec % 1000000);
+ }
+
+ return tv;
+}
+
+
+static inline void
+_diff_tv(struct timeval *dest, struct timeval *start, struct timeval *end)
+{
+ dest->tv_sec = end->tv_sec - start->tv_sec;
+ dest->tv_usec = end->tv_usec - start->tv_usec;
+
+ if (dest->tv_usec < 0) {
+ dest->tv_usec += 1000000;
+ dest->tv_sec--;
+ }
+}
+
+
+#define _print_tv(val) \
+ printf("%s: %d.%06d\n", #val, (int)((val)->tv_sec), \
+ (int)((val)->tv_usec))
+
+
+static inline int
+_cmp_tv(struct timeval *left, struct timeval *right)
+{
+ if (left->tv_sec > right->tv_sec)
+ return -1;
+
+ if (left->tv_sec < right->tv_sec)
+ return 1;
+
+ if (left->tv_usec > right->tv_usec)
+ return -1;
+
+ if (left->tv_usec < right->tv_usec)
+ return 1;
+
+ return 0;
+}
+
+
+void
+set_priority(int queue, int prio)
+{
+ struct sched_param s;
+ int ret;
+ char *func = "nice";
+
+ if (queue == SCHED_OTHER) {
+ s.sched_priority = 0;
+ ret = sched_setscheduler(0, queue, &s);
+ errno = 0;
+ ret = nice(prio);
+ } else {
+ memset(&s,0,sizeof(s));
+ s.sched_priority = prio;
+ ret = sched_setscheduler(0, queue, &s);
+ func = "sched_setscheduler";
+ }
+
+ if (ret < 0 && errno) {
+ clulog(LOG_WARNING, "set_priority [%s] failed: %s\n", func,
+ strerror(errno));
+ }
+}
+
+
+int
+cman_alive(cman_handle_t ch)
+{
+ fd_set rfds;
+ int fd = cman_get_fd(ch);
+ struct timeval tv = {0, 0};
+
+ FD_ZERO(&rfds);
+ FD_SET(fd, &rfds);
+ if (select(fd + 1, &rfds, NULL, NULL, &tv) == 1) {
+ if (cman_dispatch(ch, CMAN_DISPATCH_ALL) < 0) {
+ if (errno == EAGAIN)
+ return 0;
+ return -1;
+ }
+ }
+ return 0;
+}
+
int
quorum_loop(qd_ctx *ctx, node_info_t *ni, int max)
{
disk_msg_t msg = {0, 0, 0};
- int low_id, bid_pending = 0, score, score_max, score_req;
+ int low_id, bid_pending = 0, score, score_max, score_req,
+ upgrade = 0;
memb_mask_t mask, master_mask;
+ struct timeval maxtime, oldtime, newtime, diff, sleeptime, interval;
- ctx->qc_status = S_RUN;
+ ctx->qc_status = S_NONE;
+
+ maxtime.tv_usec = 0;
+ maxtime.tv_sec = ctx->qc_interval * ctx->qc_tko;
+
+ interval.tv_usec = 0;
+ interval.tv_sec = ctx->qc_interval;
+
+ get_my_score(&score, &score_max);
+ if (score_max < ctx->qc_scoremin) {
+ clulog(LOG_WARNING, "Minimum score (%d) is impossible to "
+ "achieve (heuristic total = %d)\n",
+ ctx->qc_scoremin, score_max);
+ }
_running = 1;
while (_running) {
+ /* XXX this was getuptime() in clumanager */
+ gettimeofday(&oldtime, NULL);
+
/* Read everyone else's status */
read_node_blocks(ctx, ni, max);
@@ -662,6 +831,7 @@
/* Check heuristics and remove ourself if necessary */
get_my_score(&score, &score_max);
+ upgrade = 0;
score_req = ctx->qc_scoremin;
if (score_req <= 0)
@@ -672,14 +842,19 @@
if (ctx->qc_status > S_NONE) {
clulog(LOG_NOTICE,
"Score insufficient for master "
- "operation (%d/%d; max=%d); "
+ "operation (%d/%d; required=%d); "
"downgrading\n",
- score, score_req, score_max);
+ score, score_max, score_req);
ctx->qc_status = S_NONE;
msg.m_msg = M_NONE;
++msg.m_seq;
bid_pending = 0;
- cman_poll_quorum_device(ctx->qc_ch, 0);
+ if (cman_alive(ctx->qc_ch) < 0) {
+ clulog(LOG_ERR, "cman: %s\n",
+ strerror(errno));
+ } else {
+ cman_poll_quorum_device(ctx->qc_ch, 0);
+ }
if (ctx->qc_flags & RF_REBOOT)
reboot(RB_AUTOBOOT);
}
@@ -688,10 +863,13 @@
if (ctx->qc_status == S_NONE) {
clulog(LOG_NOTICE,
"Score sufficient for master "
- "operation (%d/%d; max=%d); "
+ "operation (%d/%d; required=%d); "
"upgrading\n",
- score, score_req, score_max);
+ score, score_max, score_req);
ctx->qc_status = S_RUN;
+ upgrade = (ctx->qc_tko / 3);
+ if (upgrade == 0)
+ upgrade = 1;
}
}
@@ -702,11 +880,13 @@
if (!ctx->qc_master &&
low_id == ctx->qc_my_id &&
ctx->qc_status == S_RUN &&
- !bid_pending ) {
+ !bid_pending &&
+ !upgrade) {
/*
If there's no master, and we are the lowest node
ID, make a bid to become master if we're not
- already bidding.
+ already bidding. We can't do this if we've just
+ upgraded.
*/
clulog(LOG_DEBUG,"Making bid for master\n");
@@ -724,10 +904,18 @@
/* We're currently bidding for master.
See if anyone's voted, or if we should
rescind our bid */
+ ++bid_pending;
/* Yes, those are all deliberate fallthroughs */
switch (check_votes(ctx, ni, max, &msg)) {
case 3:
+ /*
+ * Give ample time to become aware of other
+ * nodes
+ */
+ if (bid_pending < (ctx->qc_tko / 3))
+ break;
+
clulog(LOG_INFO,
"Assuming master role\n");
ctx->qc_status = S_MASTER;
@@ -755,6 +943,13 @@
/* We are the master. Poll the quorum device.
We can't be the master unless we score high
enough on our heuristics. */
+ if (cman_alive(ctx->qc_ch) < 0) {
+ clulog(LOG_ERR, "cman_dispatch: %s\n",
+ strerror(errno));
+ clulog(LOG_ERR,
+ "Halting qdisk operations\n");
+ return -1;
+ }
check_cman(ctx, mask, master_mask);
cman_poll_quorum_device(ctx->qc_ch, 1);
@@ -768,6 +963,13 @@
ni[ctx->qc_master-1].ni_status.ps_master_mask,
ctx->qc_my_id-1,
sizeof(memb_mask_t))) {
+ if (cman_alive(ctx->qc_ch) < 0) {
+ clulog(LOG_ERR, "cman_dispatch: %s\n",
+ strerror(errno));
+ clulog(LOG_ERR,
+ "Halting qdisk operations\n");
+ return -1;
+ }
cman_poll_quorum_device(ctx->qc_ch, 1);
}
}
@@ -783,8 +985,43 @@
/* Cycle. We could time the loop and sleep
usleep(interval-looptime), but this is fine for now.*/
+ gettimeofday(&newtime, NULL);
+ _diff_tv(&diff, &oldtime, &newtime);
+
+ /*
+ * Reboot if we didn't send a heartbeat in interval*TKO_COUNT
+ */
+ if (_cmp_tv(&maxtime, &diff) == 1 &&
+ ctx->qc_flags & RF_PARANOID) {
+ clulog(LOG_EMERG, "Failed to complete a cycle within "
+ "%d second%s (%d.%06d) - REBOOTING\n",
+ (int)maxtime.tv_sec,
+ maxtime.tv_sec==1?"":"s",
+ (int)diff.tv_sec,
+ (int)diff.tv_usec);
+ if (!(ctx->qc_flags & RF_DEBUG))
+ reboot(RB_AUTOBOOT);
+ }
+
+ /*
+ * If the amount we took to complete a loop is greater or less
+ * than our interval, we adjust by the difference each round.
+ *
+ * It's not really "realtime", but it helps!
+ */
+ if (_cmp_tv(&diff, &interval) == 1) {
+ _diff_tv(&sleeptime, &diff, &interval);
+ } else {
+ clulog(LOG_WARNING, "qdisk cycle took more "
+ "than %d second%s to complete (%d.%06d)\n",
+ ctx->qc_interval, ctx->qc_interval==1?"":"s",
+ (int)diff.tv_sec, (int)diff.tv_usec);
+ memcpy(&sleeptime, &interval, sizeof(sleeptime));
+ }
+
+ /* Could hit a watchdog timer here if we wanted to */
if (_running)
- sleep(ctx->qc_interval);
+ select(0, NULL, NULL, NULL, &sleeptime);
}
return 0;
@@ -829,12 +1066,15 @@
ctx->qc_interval = 1;
ctx->qc_tko = 10;
ctx->qc_scoremin = 0;
- ctx->qc_flags = RF_REBOOT;
+ ctx->qc_flags = RF_REBOOT | RF_ALLOW_KILL; /* | RF_STOP_CMAN;*/
+ ctx->qc_sched = SCHED_RR;
+ ctx->qc_sched_prio = 1;
/* Get log log_facility */
snprintf(query, sizeof(query), "/cluster/quorumd/@log_facility");
if (ccs_get(ccsfd, query, &val) == 0) {
clu_set_facility(val);
+ clulog(LOG_DEBUG, "Log facility: %s\n", val);
free(val);
}
@@ -903,6 +1143,37 @@
if (ctx->qc_scoremin < 0)
ctx->qc_scoremin = 0;
}
+
+ /* Get scheduling queue */
+ snprintf(query, sizeof(query), "/cluster/quorumd/@scheduler");
+ if (ccs_get(ccsfd, query, &val) == 0) {
+ switch(val[0]) {
+ case 'r':
+ case 'R':
+ ctx->qc_sched = SCHED_RR;
+ break;
+ case 'f':
+ case 'F':
+ ctx->qc_sched = SCHED_FIFO;
+ break;
+ case 'o':
+ case 'O':
+ ctx->qc_sched = SCHED_OTHER;
+ break;
+ default:
+ clulog(LOG_WARNING, "Invalid scheduling queue '%s'\n",
+ val);
+ break;
+ }
+ free(val);
+ }
+
+ /* Get priority */
+ snprintf(query, sizeof(query), "/cluster/quorumd/@priority");
+ if (ccs_get(ccsfd, query, &val) == 0) {
+ ctx->qc_sched_prio = atoi(val);
+ free(val);
+ }
/* Get reboot flag for when we transition -> offline */
/* default = on, so, 0 to turn off */
@@ -912,6 +1183,50 @@
ctx->qc_flags &= ~RF_REBOOT;
free(val);
}
+
+ /*
+ * Get flag to see if we're supposed to kill cman if qdisk is not
+ * available.
+ */
+ /* default = off, so, 1 to turn on */
+ snprintf(query, sizeof(query), "/cluster/quorumd/@stop_cman");
+ if (ccs_get(ccsfd, query, &val) == 0) {
+ if (!atoi(val))
+ ctx->qc_flags &= ~RF_STOP_CMAN;
+ else
+ ctx->qc_flags |= RF_STOP_CMAN;
+ free(val);
+ }
+
+
+ /*
+ * Get flag to see if we're supposed to reboot if we can't complete
+ * a pass in failure time
+ */
+ /* default = off, so, 1 to turn on */
+ snprintf(query, sizeof(query), "/cluster/quorumd/@paranoid");
+ if (ccs_get(ccsfd, query, &val) == 0) {
+ if (!atoi(val))
+ ctx->qc_flags &= ~RF_PARANOID;
+ else
+ ctx->qc_flags |= RF_PARANOID;
+ free(val);
+ }
+
+
+ /*
+ * Get flag to see if we're allowed to tell CMAN to kill (evict)
+ * nodes which have stopped updating the quorum disk
+ */
+ /* default = off, so, 1 to turn on */
+ snprintf(query, sizeof(query), "/cluster/quorumd/@allow_kill");
+ if (ccs_get(ccsfd, query, &val) == 0) {
+ if (!atoi(val))
+ ctx->qc_flags &= ~RF_ALLOW_KILL;
+ else
+ ctx->qc_flags |= RF_ALLOW_KILL;
+ free(val);
+ }
*cfh = configure_heuristics(ccsfd, h, maxh);
@@ -925,18 +1240,47 @@
}
+void
+check_stop_cman(qd_ctx *ctx)
+{
+ if (!(ctx->qc_flags & RF_STOP_CMAN))
+ return;
+
+ clulog(LOG_WARNING, "Telling CMAN to leave the cluster; qdisk is not"
+ " available\n");
+#if (defined(LIBCMAN_VERSION) && LIBCMAN_VERSION >= 2)
+ if (cman_shutdown(ctx->qc_ch, 0) < 0) {
+#else
+ int x = 0;
+ if (ioctl(cman_get_fd(ctx->qc_ch), SIOCCLUSTER_LEAVE_CLUSTER, &x) < 0) {
+#endif
+ clulog(LOG_CRIT, "Could not leave the cluster - rebooting\n");
+ sleep(5);
+ if (ctx->qc_flags & RF_DEBUG)
+ return;
+ reboot(RB_AUTOBOOT);
+ }
+}
+
+
int
main(int argc, char **argv)
{
cman_node_t me;
- int cfh, rv;
+ int cfh, rv, forked = 0;
qd_ctx ctx;
cman_handle_t ch;
node_info_t ni[MAX_NODES_DISK];
struct h_data h[10];
char debug = 0, foreground = 0;
char device[128];
-
+ pid_t pid;
+
+ if (check_process_running(argv[0], &pid) && pid !=getpid()) {
+ printf("QDisk services already running\n");
+ return 0;
+ }
+
while ((rv = getopt(argc, argv, "fd")) != EOF) {
switch (rv) {
case 'd':
@@ -944,40 +1288,64 @@
break;
case 'f':
foreground = 1;
+ clu_log_console(1);
default:
break;
}
}
+
#if (defined(LIBCMAN_VERSION) && LIBCMAN_VERSION >= 2)
ch = cman_admin_init(NULL);
#else
ch = cman_init(NULL);
#endif
if (!ch) {
- printf("Could not connect to cluster (CMAN not running?)\n");
- return -1;
+ if (!foreground && !forked) {
+ if (daemon_init(argv[0]) < 0)
+ return -1;
+ else
+ forked = 1;
+ }
+
+ clulog(LOG_INFO, "Waiting for CMAN to start\n");
+
+ do {
+ sleep(5);
+#if (defined(LIBCMAN_VERSION) && LIBCMAN_VERSION >= 2)
+ ch = cman_admin_init(NULL);
+#else
+ ch = cman_init(NULL);
+#endif
+ } while (!ch);
}
memset(&me, 0, sizeof(me));
- if (cman_get_node(ch, CMAN_NODEID_US, &me) < 0) {
- printf("Could not determine local node ID; cannot start\n");
- return -1;
+ while (cman_get_node(ch, CMAN_NODEID_US, &me) < 0) {
+ if (!foreground && !forked) {
+ if (daemon_init(argv[0]) < 0)
+ return -1;
+ else
+ forked = 1;
+ }
+ sleep(5);
}
qd_init(&ctx, ch, me.cn_nodeid);
signal(SIGINT, int_handler);
+ signal(SIGTERM, int_handler);
- if (debug)
+ if (debug) {
clu_set_loglevel(LOG_DEBUG);
- if (foreground)
- clu_log_console(1);
+ ctx.qc_flags |= RF_DEBUG;
+ }
if (get_config_data(NULL, &ctx, h, 10, &cfh, debug) < 0) {
clulog_and_print(LOG_CRIT, "Configuration failed\n");
+ check_stop_cman(&ctx);
return -1;
}
-
+
if (ctx.qc_label) {
if (find_partitions("/proc/partitions",
ctx.qc_label, device,
@@ -985,6 +1353,7 @@
clulog_and_print(LOG_CRIT, "Unable to match label"
" '%s' to any device\n",
ctx.qc_label);
+ check_stop_cman(&ctx);
return -1;
}
@@ -1000,15 +1369,21 @@
clulog(LOG_CRIT,
"Specified partition %s does not have a "
"qdisk label\n", ctx.qc_device);
+ check_stop_cman(&ctx);
return -1;
}
}
- if (!foreground)
- daemon(0,0);
+ if (!foreground && !forked) {
+ if (daemon_init(argv[0]) < 0)
+ return -1;
+ }
+
+ set_priority(ctx.qc_sched, ctx.qc_sched_prio);
if (quorum_init(&ctx, ni, MAX_NODES_DISK, h, cfh) < 0) {
clulog_and_print(LOG_CRIT, "Initialization failed\n");
+ check_stop_cman(&ctx);
return -1;
}
@@ -1026,14 +1401,12 @@
}
*/
- quorum_loop(&ctx, ni, MAX_NODES_DISK);
- cman_unregister_quorum_device(ctx.qc_ch);
+ if (quorum_loop(&ctx, ni, MAX_NODES_DISK) == 0)
+ cman_unregister_quorum_device(ctx.qc_ch);
quorum_logout(&ctx);
-
qd_destroy(&ctx);
return 0;
-
}
--- cluster/cman/qdisk/disk.h 2006/10/03 18:06:40 1.4
+++ cluster/cman/qdisk/disk.h 2007/01/22 22:50:11 1.4.2.1
@@ -67,7 +67,11 @@
typedef enum {
- RF_REBOOT = 0x1 /* Reboot if we go from master->none */
+ RF_REBOOT = 0x1, /* Reboot if we go from master->none */
+ RF_STOP_CMAN = 0x2,
+ RF_DEBUG = 0x4,
+ RF_PARANOID = 0x8,
+ RF_ALLOW_KILL = 0x10
} run_flag_t;
@@ -237,6 +241,8 @@
int qc_tko;
int qc_votes;
int qc_scoremin;
+ int qc_sched;
+ int qc_sched_prio;
disk_node_state_t qc_disk_status;
disk_node_state_t qc_status;
int qc_master; /* Master?! */
--- cluster/cman/qdisk/clulog.c 2006/05/19 14:41:35 1.2
+++ cluster/cman/qdisk/clulog.c 2007/01/22 22:50:11 1.2.4.1
@@ -20,8 +20,6 @@
/** @file
* Library routines for communicating with the logging daemon.
*
- * $Id: clulog.c,v 1.2 2006/05/19 14:41:35 lhh Exp $
- *
* Author: Jeff Moyer <moyer at missioncriticallinux.com>
*/
#include <stdio.h>
@@ -50,8 +48,6 @@
#include <string.h>
-static const char *version __attribute__ ((unused)) = "$Revision: 1.2 $";
-
#ifdef DEBUG
#include <assert.h>
#define Dprintf(fmt,args...) printf(fmt,##args)
@@ -135,7 +131,7 @@
}
pthread_mutex_unlock(&log_mutex);
- return "local4";
+ return "daemon";
}
@@ -156,7 +152,6 @@
for (; facilitynames[x].c_name; x++) {
if (strcmp(facilityname, facilitynames[x].c_name))
continue;
-
syslog_facility = facilitynames[x].c_val;
break;
}
--- cluster/cman/man/qdisk.5 2006/10/03 18:07:58 1.3
+++ cluster/cman/man/qdisk.5 2007/01/22 22:50:12 1.3.2.1
@@ -1,6 +1,6 @@
-.TH "QDisk" "8" "July 2006" "" "Cluster Quorum Disk"
+.TH "QDisk" "8" "Jan 2007" "" "Cluster Quorum Disk"
.SH "NAME"
-QDisk 1.0 \- a disk-based quorum daemon for CMAN / Linux-Cluster
+QDisk 1.2 \- a disk-based quorum daemon for CMAN / Linux-Cluster
.SH "1. Overview"
.SH "1.1 Problem"
In some situations, it may be necessary or desirable to sustain
@@ -75,16 +75,24 @@
* Cluster node votes should be more or less equal.
-* CMAN must be running before the qdisk program can start.
+* CMAN must be running before the qdisk program can operate in full
+capacity. If CMAN is not running, qdisk will wait for it.
* CMAN's eviction timeout should be at least 2x the quorum daemon's
to give the quorum daemon adequate time to converge on a master during a
failure + load spike situation.
-* The total number of votes assigned to the quorum device should be
-equal to or greater than the total number of node-votes in the cluster.
-While it is possible to assign only one (or a few) votes to the quorum
-device, the effects of doing so have not been explored.
+* For 'all-but-one' failure operation, the total number of votes assigned
+to the quorum device should be equal to or greater than the total number
+of node-votes in the cluster. While it is possible to assign only one
+(or a few) votes to the quorum device, the effects of doing so have not
+been explored.
+
+* For 'tiebreaker' operation in a two-node cluster, unset CMAN's two_node
+flag (or set it to 0), set CMAN's expected votes to '3', set each node's
+vote to '1', and set qdisk's vote count to '1' as well. This will allow
+the cluster to operate if either both nodes are online, or a single node &
+the heuristics.
* Currently, the quorum disk daemon is difficult to use with CLVM if
the quorum disk resides on a CLVM logical volume. CLVM requires a
@@ -217,23 +225,27 @@
0 = emergencies; 7 = debug.
.in 9
-\fIlog_facility\fP\fB="\fPlocal4\fB"\fP
+\fIlog_facility\fP\fB="\fPdaemon\fB"\fP
.in 12
This controls the syslog facility used by the quorum daemon when logging.
For a complete list of available facilities, see \fBsyslog.conf(5)\fP.
+The default value for this is 'daemon'.
.in 9
\fIstatus_file\fP\fB="\fP/foo\fB"\fP
.in 12
Write internal states out to this file periodically ("-" = use stdout).
-This is primarily used for debugging.
+This is primarily used for debugging. The default value for this
+attribute is undefined.
.in 9
\fImin_score\fP\fB="\fP3\fB"\fP
.in 12
Absolute minimum score to be consider one's self "alive". If omitted,
or set to 0, the default function "floor((n+1)/2)" is used, where \fIn\fP
-is the sum-total of all of defined heuristics' \fIscore\fP attribute.
+is the total of all of defined heuristics' \fIscore\fP attribute. This
+must never exceed the sum of the heuristic scores, or else the quorum
+disk will never be available.
.in 9
\fIreboot\fP\fB="\fP1\fB"\fP
@@ -243,6 +255,45 @@
this value is 1 (on).
.in 9
+\fIallow_kill\fP\fB="\fP1\fB"\fP
+.in 12
+If set to 0 (off), qdiskd will *not* instruct CMAN to kill nodes it thinks
+are dead (as a result of not writing to the quorum disk). The default
+for this value is 1 (on).
+
+.in 9
+\fIparanoid\fP\fB="\fP0\fB"\fP
+.in 12
+If set to 1 (on), qdiskd will watch internal timers and reboot the node
+if it takes more than (interval * tko) seconds to complete a quorum disk
+pass. The default for this value is 0 (off).
+
+.in 9
+\fIscheduler\fP\fB="\fPrr\fB"\fP
+.in 12
+Valid values are 'rr', 'fifo', and 'other'. Selects the scheduling queue
+in the Linux kernel for operation of the main & score threads (does not
+affect the heuristics; they are always run in the 'other' queue). Default
+is 'rr'. See sched_setscheduler(2) for more details.
+
+.in 9
+\fIpriority\fP\fB="\fP1\fB"\fP
+.in 12
+Valid values for 'rr' and 'fifo' are 1..100 inclusive. Valid values
+for 'other' are -20..20 inclusive. Sets the priority of the main & score
+threads. The default value is 1 (in the RR and FIFO queues, higher numbers
+denote higher priority; in OTHER, lower values denote higher priority).
+
+.in 9
+\fIstop_cman\fP\fB="\fP0\fB"\fP
+.in 12
+Ordinarily, cluster membership is left up to CMAN, not qdisk.
+If this parameter is set to 1 (on), qdiskd will tell CMAN to leave the
+cluster if it is unable to initialize the quorum disk during startup. This
+can be used to prevent cluster participation by a node which has been
+disconnected from the SAN. The default for this value is 0 (off).
+
+.in 9
\fIdevice\fP\fB="\fP/dev/sda1\fB"\fP
.in 12
This is the device the quorum daemon will use. This device must be the
@@ -256,6 +307,8 @@
on every block device found, comparing the label against the specified
label. This is useful in configurations where the block device name
differs on a per-node basis.
+.in 8
+\fB...>\fP
.in 0
.SH "3.2. The <heuristic> tag"
@@ -268,34 +321,80 @@
.in 12
This is the program used to determine if this heuristic is alive. This
can be anything which may be executed by \fI/bin/sh -c\fP. A return
-value of zero indicates success; anything else indicates failure.
+value of zero indicates success; anything else indicates failure. This
+is required.
.in 9
\fIscore\fP\fB="\fP1\fB"\fP
.in 12
This is the weight of this heuristic. Be careful when determining scores
-for heuristics.
+for heuristics. The default score for each heuristic is 1.
.in 9
\fIinterval\fP\fB="\fP2\fB"/>\fP
.in 12
-This is the frequency at which we poll the heuristic.
+This is the frequency (in seconds) at which we poll the heuristic. The
+default interval for every heuristic is 2 seconds.
+.in 0
+
+.in 9
+\fItko\fP\fB="\fP1\fB"/>\fP
+.in 12
+After this many failed attempts to run the heuristic, it is considered DOWN,
+and its score is removed. The default tko for each heuristic is 1, which
+may be inadequate for things such as 'ping'.
+.in 8
+\fB/>\fP
.in 0
-.SH "3.3. Example"
+
+.SH "3.3. Examples"
+.SH "3.3.1. 3 cluster nodes & 3 routers"
+.in 8
+<cman expected_votes="6" .../>
+.br
+<clusternodes>
+.in 12
+<clusternode name="node1" votes="1" ... />
+.br
+<clusternode name="node2" votes="1" ... />
+.br
+<clusternode name="node3" votes="1" ... />
.in 8
+</clusternodes>
+.br
<quorumd interval="1" tko="10" votes="3" label="testing">
.in 12
-<heuristic program="ping A -c1 -t1" score="1" interval="2"/>
+<heuristic program="ping A -c1 -t1" score="1" interval="2" tko="3"/>
+.br
+<heuristic program="ping B -c1 -t1" score="1" interval="2" tko="3"/>
+.br
+<heuristic program="ping C -c1 -t1" score="1" interval="2" tko="3"/>
+.br
+.in 8
+</quorumd>
+
+.SH "3.3.2. 2 cluster nodes & 1 IP tiebreaker"
+.in 8
+<cman two_node="0" expected_votes="3" .../>
+.br
+<clusternodes>
+.in 12
+<clusternode name="node1" votes="1" ... />
.br
-<heuristic program="ping B -c1 -t1" score="1" interval="2"/>
+<clusternode name="node2" votes="1" ... />
+.in 8
+</clusternodes>
.br
-<heuristic program="ping C -c1 -t1" score="1" interval="2"/>
+<quorumd interval="1" tko="10" votes="1" label="testing">
+.in 12
+<heuristic program="ping A -c1 -t1" score="1" interval="2" tko="3"/>
.br
.in 8
</quorumd>
.in 0
+
.SH "3.4. Heuristic score considerations"
* Heuristic timeouts should be set high enough to allow the previous run
of a given heuristic to complete.
--- cluster/cman/man/mkqdisk.8 2006/07/21 17:55:04 1.2
+++ cluster/cman/man/mkqdisk.8 2007/01/22 22:50:12 1.2.6.1
@@ -13,11 +13,16 @@
.IP "\-c device \-l label"
Initialize a new cluster quorum disk. This will destroy all data on the given
device. If a cluster is currently using that device as a quorum disk, the
-entire cluster will malfunction. Do not ru
+entire cluster will malfunction. Do not run this on an active cluster when
+qdiskd is running. Only one device on the SAN should ever have the given
+label; using multiple different devices is currently not supported (it is
+expected a RAID array is used for quorum disk redundancy). The label can be
+any textual string up to 127 characters - and is therefore enough space to hold
+a UUID created with uuidgen(1).
.IP "\-f label"
-Find the cluster quorum disk with the given label and display information about it..
+Find the cluster quorum disk with the given label and display information about it.
.IP "\-L"
Display information on all accessible cluster quorum disks.
.SH "SEE ALSO"
-qdisk(5) qdiskd(8)
+qdisk(5), qdiskd(8), uuidgen(1)
More information about the Cluster-devel
mailing list