[Cluster-devel] cluster/cman qdisk/score.c qdisk/disk.h qdisk/ ...
lhh at sourceware.org
Wed Feb 21 20:22:56 UTC 2007
CVSROOT: /cvs/cluster
Module name: cluster
Branch: RHEL5
Changes by: lhh at sourceware.org 2007-02-21 20:22:54
Modified files:
cman/qdisk : score.c disk.h score.h main.c
cman/man : qdisk.5 qdiskd.8
cman/init.d : qdiskd
Log message:
Resolves: 229338
* Makes zero-heuristic mode work (#229338)
General (small) fixes:
* Add time stamp to status file
* Hush stdout/stderr from init script
* Give lots of information in status file if debug mode is enabled
Fixes for clusters with long failover times (e.g. 2+ minutes):
* Enable status file generation during initialization loop
* Allow termination (e.g. service qdiskd stop) during initialization loop
* Add tunables for clusters with long failure detection times (e.g. 2+ minutes)
Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cman/qdisk/score.c.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.2.4.1&r2=1.2.4.2
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cman/qdisk/disk.h.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.4.2.2&r2=1.4.2.3
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cman/qdisk/score.h.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.2.4.1&r2=1.2.4.2
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cman/qdisk/main.c.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.4.2.4&r2=1.4.2.5
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cman/man/qdisk.5.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.3.2.2&r2=1.3.2.3
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cman/man/qdiskd.8.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.2&r2=1.2.6.1
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cman/init.d/qdiskd.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.2&r2=1.2.4.1
--- cluster/cman/qdisk/score.c 2007/01/22 22:50:11 1.2.4.1
+++ cluster/cman/qdisk/score.c 2007/02/21 20:22:53 1.2.4.2
@@ -143,6 +143,7 @@
*score = 0;
*maxscore = 0;
+ printf("max = %d\n", max);
/* Allow operation w/o any heuristics */
if (!max) {
*score = *maxscore = 1;
@@ -332,6 +333,20 @@
/**
+ Call this if no heuristics are set to run in master-wins mode
+ */
+int
+fudge_scoring(void)
+{
+ pthread_mutex_lock(&sc_lock);
+ _score = _maxscore = 1;
+ pthread_mutex_unlock(&sc_lock);
+
+ return 0;
+}
+
+
+/**
Loop for the scoring thread.
*/
void *
--- cluster/cman/qdisk/disk.h 2007/01/23 17:56:14 1.4.2.2
+++ cluster/cman/qdisk/disk.h 2007/02/21 20:22:53 1.4.2.3
@@ -240,6 +240,9 @@
int qc_writes;
int qc_interval;
int qc_tko;
+ int qc_tko_up;
+ int qc_upgrade_wait;
+ int qc_master_wait;
int qc_votes;
int qc_scoremin;
int qc_sched;
@@ -247,6 +250,7 @@
disk_node_state_t qc_disk_status;
disk_node_state_t qc_status;
int qc_master; /* Master?! */
+ int _pad_;
run_flag_t qc_flags;
cman_handle_t qc_ch;
char *qc_device;
--- cluster/cman/qdisk/score.h 2007/01/22 22:50:11 1.2.4.1
+++ cluster/cman/qdisk/score.h 2007/02/21 20:22:53 1.2.4.2
@@ -59,4 +59,11 @@
*/
int get_my_score(int *score, int *maxscore);
+/*
+ Set score + maxscore to 1. Call if no heuristics are present
+ to enable master-wins mode
+ */
+int fudge_scoring(void);
+
+
#endif
--- cluster/cman/qdisk/main.c 2007/01/23 17:56:14 1.4.2.4
+++ cluster/cman/qdisk/main.c 2007/02/21 20:22:53 1.4.2.5
@@ -66,7 +66,9 @@
inline void _diff_tv(struct timeval *dest, struct timeval *start,
struct timeval *end);
-static int _running = 0;
+static int _running = 1;
+void update_local_status(qd_ctx *ctx, node_info_t *ni, int max, int score,
+ int score_req, int score_max);
static void
@@ -158,6 +160,8 @@
continue;
}
/* message. */
+ memcpy(&(ni[x].ni_last_msg), &(ni[x].ni_msg),
+ sizeof(ni[x].ni_last_msg));
ni[x].ni_msg.m_arg = sb->ps_arg;
ni[x].ni_msg.m_msg = sb->ps_msg;
ni[x].ni_msg.m_seq = sb->ps_seq;
@@ -325,7 +329,7 @@
Transition from Offline -> Online
*/
- if (ni[x].ni_seen > (ctx->qc_tko / 2) &&
+ if (ni[x].ni_seen > ctx->qc_tko_up &&
!state_run(ni[x].ni_state)) {
/*
Node-join - everyone just kind of "agrees"
@@ -446,7 +450,7 @@
int
quorum_init(qd_ctx *ctx, node_info_t *ni, int max, struct h_data *h, int maxh)
{
- int x = 0, score, maxscore;
+ int x = 0, score, maxscore, score_req;
clulog(LOG_INFO, "Quorum Daemon Initializing\n");
@@ -464,16 +468,22 @@
return -1;
}
- start_score_thread(ctx, h, maxh);
+ if (h && maxh) {
+ start_score_thread(ctx, h, maxh);
+ } else {
+ clulog(LOG_DEBUG, "Permanently setting score to 1/1\n");
+ fudge_scoring();
+ }
node_info_init(ni, max);
+ ctx->qc_status = S_INIT;
if (qd_write_status(ctx, ctx->qc_my_id,
S_INIT, NULL, NULL, NULL) != 0) {
clulog(LOG_CRIT, "Could not initialize status block!\n");
return -1;
}
- while (++x <= ctx->qc_tko) {
+ while (++x <= ctx->qc_tko && _running) {
read_node_blocks(ctx, ni, max);
check_transitions(ctx, ni, max, NULL);
@@ -483,10 +493,16 @@
return -1;
}
+ get_my_score(&score, &maxscore);
+ score_req = ctx->qc_scoremin;
+ if (score_req <= 0)
+ score_req = (maxscore/2 + 1);
+ update_local_status(ctx, ni, max, score, score_req, maxscore);
+
sleep(ctx->qc_interval);
}
- get_my_score(&score,&maxscore);
+ get_my_score(&score, &maxscore);
clulog(LOG_INFO, "Initial score %d/%d\n", score, maxscore);
clulog(LOG_INFO, "Initialization complete\n");
@@ -625,11 +641,41 @@
void
+print_node_info(FILE *fp, node_info_t *ni)
+{
+ fprintf(fp, "node_info_t [node %d] {\n", ni->ni_status.ps_nodeid);
+ fprintf(fp, " ni_incarnation = 0x%08x%08x\n",
+ ((int)(ni->ni_incarnation>>32))&0xffffffff,
+ ((int)(ni->ni_incarnation)&0xffffffff));
+ fprintf(fp, " ni_evil_incarnation = 0x%08x%08x\n",
+ ((int)(ni->ni_evil_incarnation>>32))&0xffffffff,
+ ((int)(ni->ni_evil_incarnation)&0xffffffff));
+ fprintf(fp, " ni_last_seen = %s", ctime(&ni->ni_last_seen));
+ fprintf(fp, " ni_misses = %d\n", ni->ni_misses);
+ fprintf(fp, " ni_seen = %d\n", ni->ni_seen);
+ fprintf(fp, " ni_msg = {\n");
+ fprintf(fp, " m_msg = 0x%08x\n", ni->ni_msg.m_msg);
+ fprintf(fp, " m_arg = %d\n", ni->ni_msg.m_arg);
+ fprintf(fp, " m_seq = %d\n", ni->ni_msg.m_seq);
+ fprintf(fp, " }\n");
+ fprintf(fp, " ni_last_msg = {\n");
+ fprintf(fp, " m_msg = 0x%08x\n", ni->ni_last_msg.m_msg);
+ fprintf(fp, " m_arg = %d\n", ni->ni_last_msg.m_arg);
+ fprintf(fp, " m_seq = %d\n", ni->ni_last_msg.m_seq);
+ fprintf(fp, " }\n");
+ fprintf(fp, " ni_state = 0x%08x (%s)\n", ni->ni_state,
+ state_str(ni->ni_state));
+ fprintf(fp, "}\n\n");
+}
+
+
+void
update_local_status(qd_ctx *ctx, node_info_t *ni, int max, int score,
int score_req, int score_max)
{
FILE *fp;
int x, need_close = 0;
+ time_t now;
if (!ctx->qc_status_file)
return;
@@ -643,26 +689,25 @@
need_close = 1;
}
+ now = time(NULL);
+ fprintf(fp, "Time Stamp: %s", ctime(&now));
fprintf(fp, "Node ID: %d\n", ctx->qc_my_id);
- if (ctx->qc_master)
- fprintf(fp, "Master Node ID: %d\n", ctx->qc_master);
- else
- fprintf(fp, "Master Node ID: (none)\n");
-
fprintf(fp, "Score: %d/%d (Minimum required = %d)\n",
score, score_max, score_req);
fprintf(fp, "Current state: %s\n", state_str(ctx->qc_status));
+
+ /*
fprintf(fp, "Current disk state: %s\n",
state_str(ctx->qc_disk_status));
-
+ */
fprintf(fp, "Initializing Set: {");
for (x=0; x<max; x++) {
- if (ni[x].ni_state == S_INIT)
+ if (ni[x].ni_status.ps_state == S_INIT && ni[x].ni_seen)
fprintf(fp," %d", ni[x].ni_status.ps_nodeid);
}
fprintf(fp, " }\n");
-
+
fprintf(fp, "Visible Set: {");
for (x=0; x<max; x++) {
if (ni[x].ni_state >= S_RUN || ni[x].ni_status.ps_nodeid ==
@@ -671,6 +716,14 @@
}
fprintf(fp, " }\n");
+ if (ctx->qc_status == S_INIT)
+ goto out;
+
+ if (ctx->qc_master)
+ fprintf(fp, "Master Node ID: %d\n", ctx->qc_master);
+ else
+ fprintf(fp, "Master Node ID: (none)\n");
+
if (!ctx->qc_master)
goto out;
@@ -686,6 +739,11 @@
fprintf(fp, " }\n");
out:
+ if (ctx->qc_flags & RF_DEBUG) {
+ for (x = 0; x < max; x++)
+ print_node_info(fp, &ni[x]);
+ }
+
fprintf(fp, "\n");
if (need_close)
fclose(fp);
@@ -823,7 +881,10 @@
/* Check heuristics and remove ourself if necessary */
get_my_score(&score, &score_max);
- upgrade = 0;
+
+ /* If we recently upgraded, decrement our wait time */
+ if (upgrade > 0)
+ --upgrade;
score_req = ctx->qc_scoremin;
if (score_req <= 0)
@@ -859,9 +920,7 @@
"upgrading\n",
score, score_max, score_req);
ctx->qc_status = S_RUN;
- upgrade = (ctx->qc_tko / 3);
- if (upgrade == 0)
- upgrade = 1;
+ upgrade = ctx->qc_upgrade_wait;
}
}
@@ -905,7 +964,7 @@
* Give ample time to become aware of other
* nodes
*/
- if (bid_pending < (ctx->qc_tko / 3))
+ if (bid_pending < (ctx->qc_master_wait))
break;
clulog(LOG_INFO,
@@ -1060,6 +1119,8 @@
ctx->qc_scoremin = 0;
ctx->qc_flags = RF_REBOOT | RF_ALLOW_KILL | RF_UPTIME;
/* | RF_STOP_CMAN;*/
+ if (debug)
+ ctx->qc_flags |= RF_DEBUG;
ctx->qc_sched = SCHED_RR;
ctx->qc_sched_prio = 1;
@@ -1100,6 +1161,38 @@
if (ctx->qc_tko < 3)
ctx->qc_tko = 3;
}
+
+ /* Get up-tko (transition off->online) */
+ ctx->qc_tko_up = (ctx->qc_tko / 2);
+ snprintf(query, sizeof(query), "/cluster/quorumd/@tko_up");
+ if (ccs_get(ccsfd, query, &val) == 0) {
+ ctx->qc_tko_up = atoi(val);
+ free(val);
+ }
+ if (ctx->qc_tko_up < 2)
+ ctx->qc_tko_up = 2;
+
+ /* After coming online, wait this many intervals before
+ being allowed to bid for master. */
+ ctx->qc_upgrade_wait = 2; /* (ctx->qc_tko / 3); */
+ snprintf(query, sizeof(query), "/cluster/quorumd/@upgrade_wait");
+ if (ccs_get(ccsfd, query, &val) == 0) {
+ ctx->qc_upgrade_wait = atoi(val);
+ free(val);
+ }
+ if (ctx->qc_upgrade_wait < 1)
+ ctx->qc_upgrade_wait = 1;
+
+ /* wait this many intervals after bidding for master before
+ becoming Caesar */
+ ctx->qc_master_wait = (ctx->qc_tko / 3);
+ snprintf(query, sizeof(query), "/cluster/quorumd/@master_wait");
+ if (ccs_get(ccsfd, query, &val) == 0) {
+ ctx->qc_master_wait = atoi(val);
+ free(val);
+ }
+ if (ctx->qc_master_wait < 2)
+ ctx->qc_master_wait = 2;
/* Get votes */
snprintf(query, sizeof(query), "/cluster/quorumd/@votes");
@@ -1275,7 +1368,7 @@
main(int argc, char **argv)
{
cman_node_t me;
- int cfh, rv, forked = 0;
+ int cfh, rv, forked = 0, nfd = -1;
qd_ctx ctx;
cman_handle_t ch;
node_info_t ni[MAX_NODES_DISK];
@@ -1283,13 +1376,13 @@
char debug = 0, foreground = 0;
char device[128];
pid_t pid;
-
+
if (check_process_running(argv[0], &pid) && pid !=getpid()) {
printf("QDisk services already running\n");
return 0;
}
- while ((rv = getopt(argc, argv, "fd")) != EOF) {
+ while ((rv = getopt(argc, argv, "fdQ")) != EOF) {
switch (rv) {
case 'd':
debug = 1;
@@ -1297,6 +1390,18 @@
case 'f':
foreground = 1;
clu_log_console(1);
+ break;
+ case 'Q':
+ /* Make qdisk very quiet */
+ nfd = open("/dev/null", O_RDWR);
+ close(0);
+ close(1);
+ close(2);
+ dup2(nfd, 0);
+ dup2(nfd, 1);
+ dup2(nfd, 2);
+ close(nfd);
+ break;
default:
break;
}
@@ -1394,6 +1499,9 @@
check_stop_cman(&ctx);
return -1;
}
+
+ if (!_running)
+ return 0;
cman_register_quorum_device(ctx.qc_ch, ctx.qc_device, ctx.qc_votes);
/*
--- cluster/cman/man/qdisk.5 2007/01/26 21:12:39 1.3.2.2
+++ cluster/cman/man/qdisk.5 2007/02/21 20:22:54 1.3.2.3
@@ -1,4 +1,4 @@
-.TH "QDisk" "21" "Jan 2007" "" "Cluster Quorum Disk"
+.TH "QDisk" "5" "20 Feb 2007" "" "Cluster Quorum Disk"
.SH "NAME"
QDisk 1.2 \- a disk-based quorum daemon for CMAN / Linux-Cluster
.SH "1. Overview"
@@ -205,7 +205,7 @@
.in 9
\fIinterval\fP\fB="\fP1\fB"\fP
.in 12
-This is the frequency of read/write cycles
+This is the frequency of read/write cycles, in seconds.
.in 9
\fItko\fP\fB="\fP10\fB"\fP
@@ -213,6 +213,26 @@
This is the number of cycles a node must miss in order to be declared dead.
.in 9
+\fItko_up\fP\fB="\fPX\fB"\fP
+.in 12
+This is the number of cycles a node must be seen in order to be declared
+online. Default is \fBfloor(tko/2)\fP.
+
+.in 9
+\fIupgrade_wait\fP\fB="\fP2\fB"\fP
+.in 12
+This is the number of cycles a node must wait before initiating a bid
+for master status after heuristic scoring becomes sufficient. The
+default is 2. This can not be set to 0, and should not exceed \fBtko\fP.
+
+.in 9
+\fImaster_wait\fP\fB="\fPX\fB"\fP
+.in 12
+This is the number of cycles a node must wait for votes before declaring
+itself master after making a bid. Default is \fBfloor(tko/3)\fP.
+This can not be less than 2 and should not exceed \fBtko\fP.
+
+.in 9
\fIvotes\fP\fB="\fP3\fB"\fP
.in 12
This is the number of votes the quorum daemon advertises to CMAN when it
--- cluster/cman/man/qdiskd.8 2006/07/21 17:55:04 1.2
+++ cluster/cman/man/qdiskd.8 2007/02/21 20:22:54 1.2.6.1
@@ -15,6 +15,11 @@
Run in the foreground (do not fork / daemonize).
.IP "\-d"
Enable debug output.
+.IP "\-Q"
+Close stdin/out/err immediately before doing validations. This
+is primarily for use when being called from an init script. Using
+this option will stop all output, and can not be used with the -d
+option.
.SH "SEE ALSO"
mkqdisk(8), qdisk(5), cman(5)
--- cluster/cman/init.d/qdiskd 2006/05/19 14:41:35 1.2
+++ cluster/cman/init.d/qdiskd 2007/02/21 20:22:54 1.2.4.1
@@ -19,7 +19,7 @@
# See how we were called.
case "$1" in
start)
- action "Starting the Quorum Disk Daemon:" qdiskd
+ action "Starting the Quorum Disk Daemon:" qdiskd -Q
rtrn=$?
[ $rtrn = 0 ] && touch $LOCK_FILE
;;
More information about the Cluster-devel mailing list