[Cluster-devel] cluster/cman qdisk/score.c qdisk/disk.h qdisk/ ...

lhh at sourceware.org lhh at sourceware.org
Wed Feb 21 20:22:56 UTC 2007


CVSROOT:	/cvs/cluster
Module name:	cluster
Branch: 	RHEL5
Changes by:	lhh at sourceware.org	2007-02-21 20:22:54

Modified files:
	cman/qdisk     : score.c disk.h score.h main.c 
	cman/man       : qdisk.5 qdiskd.8 
	cman/init.d    : qdiskd 

Log message:
	Resolves: 229338
	* Makes zero-heuristic mode work (#229338)
	
	General (small) fixes:
	* Add time stamp to status file
	* Hush stdout/stderr from init script
	* Give lots of information in status file if debug mode is enabled
	
	Fixes for clusters with long failover times (e.g. 2+ minutes):
	* Enable status file generation during initialization loop
	* Allow termination (e.g. service qdiskd stop) during initialization loop
	* Add tunables for clusters with long failure detection times (e.g. 2+ minutes)

Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cman/qdisk/score.c.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.2.4.1&r2=1.2.4.2
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cman/qdisk/disk.h.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.4.2.2&r2=1.4.2.3
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cman/qdisk/score.h.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.2.4.1&r2=1.2.4.2
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cman/qdisk/main.c.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.4.2.4&r2=1.4.2.5
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cman/man/qdisk.5.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.3.2.2&r2=1.3.2.3
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cman/man/qdiskd.8.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.2&r2=1.2.6.1
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cman/init.d/qdiskd.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.2&r2=1.2.4.1

--- cluster/cman/qdisk/score.c	2007/01/22 22:50:11	1.2.4.1
+++ cluster/cman/qdisk/score.c	2007/02/21 20:22:53	1.2.4.2
@@ -143,6 +143,7 @@
 	*score = 0;
 	*maxscore = 0;
 	
+	printf("max = %d\n", max);
 	/* Allow operation w/o any heuristics */
 	if (!max) {
 		*score = *maxscore = 1;
@@ -332,6 +333,20 @@
 
 
 /**
+  Call this when no heuristics are configured, to run in master-wins mode.
+ */
+int
+fudge_scoring(void)
+{
+	pthread_mutex_lock(&sc_lock);
+	_score = _maxscore = 1;
+	pthread_mutex_unlock(&sc_lock);
+
+	return 0;
+}
+
+
+/**
   Loop for the scoring thread.
  */
 void *
--- cluster/cman/qdisk/disk.h	2007/01/23 17:56:14	1.4.2.2
+++ cluster/cman/qdisk/disk.h	2007/02/21 20:22:53	1.4.2.3
@@ -240,6 +240,9 @@
 	int qc_writes;
 	int qc_interval;
 	int qc_tko;
+	int qc_tko_up;
+	int qc_upgrade_wait;
+	int qc_master_wait;
 	int qc_votes;
 	int qc_scoremin;
 	int qc_sched;
@@ -247,6 +250,7 @@
 	disk_node_state_t qc_disk_status;
 	disk_node_state_t qc_status;
 	int qc_master;		/* Master?! */
+	int _pad_;
 	run_flag_t qc_flags;
 	cman_handle_t qc_ch;
 	char *qc_device;
--- cluster/cman/qdisk/score.h	2007/01/22 22:50:11	1.2.4.1
+++ cluster/cman/qdisk/score.h	2007/02/21 20:22:53	1.2.4.2
@@ -59,4 +59,11 @@
  */
 int get_my_score(int *score, int *maxscore);
 
+/*
+   Set both score and maxscore to 1.  Call when no heuristics are present,
+   to enable master-wins mode.
+ */
+int fudge_scoring(void);
+
+
 #endif
--- cluster/cman/qdisk/main.c	2007/01/23 17:56:14	1.4.2.4
+++ cluster/cman/qdisk/main.c	2007/02/21 20:22:53	1.4.2.5
@@ -66,7 +66,9 @@
 inline void _diff_tv(struct timeval *dest, struct timeval *start,
 		     struct timeval *end);
 
-static int _running = 0;
+static int _running = 1;
+void update_local_status(qd_ctx *ctx, node_info_t *ni, int max, int score,
+		    	 int score_req, int score_max);
 
 
 static void
@@ -158,6 +160,8 @@
 			continue;
 		} 
 		/* message. */
+		memcpy(&(ni[x].ni_last_msg), &(ni[x].ni_msg),
+		       sizeof(ni[x].ni_last_msg));
 		ni[x].ni_msg.m_arg = sb->ps_arg;
 		ni[x].ni_msg.m_msg = sb->ps_msg;
 		ni[x].ni_msg.m_seq = sb->ps_seq;
@@ -325,7 +329,7 @@
 
 		   Transition from Offline -> Online
 		 */
-		if (ni[x].ni_seen > (ctx->qc_tko / 2) &&
+		if (ni[x].ni_seen > ctx->qc_tko_up &&
 		    !state_run(ni[x].ni_state)) {
 			/*
 			   Node-join - everyone just kind of "agrees"
@@ -446,7 +450,7 @@
 int
 quorum_init(qd_ctx *ctx, node_info_t *ni, int max, struct h_data *h, int maxh)
 {
-	int x = 0, score, maxscore;
+	int x = 0, score, maxscore, score_req;
 
 	clulog(LOG_INFO, "Quorum Daemon Initializing\n");
 	
@@ -464,16 +468,22 @@
 		return -1;
 	}
 	
-	start_score_thread(ctx, h, maxh);
+	if (h && maxh) {
+		start_score_thread(ctx, h, maxh);
+	} else {
+		clulog(LOG_DEBUG, "Permanently setting score to 1/1\n");
+		fudge_scoring();
+	}
 
 	node_info_init(ni, max);
+	ctx->qc_status = S_INIT;
 	if (qd_write_status(ctx, ctx->qc_my_id,
 			    S_INIT, NULL, NULL, NULL) != 0) {
 		clulog(LOG_CRIT, "Could not initialize status block!\n");
 		return -1;
 	}
 
-	while (++x <= ctx->qc_tko) {
+	while (++x <= ctx->qc_tko && _running) {
 		read_node_blocks(ctx, ni, max);
 		check_transitions(ctx, ni, max, NULL);
 
@@ -483,10 +493,16 @@
 			return -1;
 		}
 
+		get_my_score(&score, &maxscore);
+		score_req = ctx->qc_scoremin;
+		if (score_req <= 0)
+			score_req = (maxscore/2 + 1);
+		update_local_status(ctx, ni, max, score, score_req, maxscore);
+
 		sleep(ctx->qc_interval);
 	}
 
-	get_my_score(&score,&maxscore);
+	get_my_score(&score, &maxscore);
 	clulog(LOG_INFO, "Initial score %d/%d\n", score, maxscore);
 	clulog(LOG_INFO, "Initialization complete\n");
 
@@ -625,11 +641,41 @@
 
 
 void
+print_node_info(FILE *fp, node_info_t *ni)
+{
+	fprintf(fp, "node_info_t [node %d] {\n", ni->ni_status.ps_nodeid);
+	fprintf(fp, "    ni_incarnation = 0x%08x%08x\n",
+		((int)(ni->ni_incarnation>>32))&0xffffffff,
+		((int)(ni->ni_incarnation)&0xffffffff));
+	fprintf(fp, "    ni_evil_incarnation = 0x%08x%08x\n",
+		((int)(ni->ni_evil_incarnation>>32))&0xffffffff,
+		((int)(ni->ni_evil_incarnation)&0xffffffff));
+	fprintf(fp, "    ni_last_seen = %s", ctime(&ni->ni_last_seen));
+	fprintf(fp, "    ni_misses = %d\n", ni->ni_misses);
+	fprintf(fp, "    ni_seen = %d\n", ni->ni_seen);
+	fprintf(fp, "    ni_msg = {\n");
+	fprintf(fp, "        m_msg = 0x%08x\n", ni->ni_msg.m_msg);
+	fprintf(fp, "        m_arg = %d\n", ni->ni_msg.m_arg);
+	fprintf(fp, "        m_seq = %d\n", ni->ni_msg.m_seq);
+	fprintf(fp, "    }\n");
+	fprintf(fp, "    ni_last_msg = {\n");
+	fprintf(fp, "        m_msg = 0x%08x\n", ni->ni_last_msg.m_msg);
+	fprintf(fp, "        m_arg = %d\n", ni->ni_last_msg.m_arg);
+	fprintf(fp, "        m_seq = %d\n", ni->ni_last_msg.m_seq);
+	fprintf(fp, "    }\n");
+	fprintf(fp, "    ni_state = 0x%08x (%s)\n", ni->ni_state,
+		state_str(ni->ni_state));
+	fprintf(fp, "}\n\n");
+}
+
+
+void
 update_local_status(qd_ctx *ctx, node_info_t *ni, int max, int score,
 		    int score_req, int score_max)
 {
 	FILE *fp;
 	int x, need_close = 0;
+	time_t now;
 
 	if (!ctx->qc_status_file)
 		return;
@@ -643,26 +689,25 @@
 		need_close = 1;
 	}
 
+	now = time(NULL);
+	fprintf(fp, "Time Stamp: %s", ctime(&now));
 	fprintf(fp, "Node ID: %d\n", ctx->qc_my_id);
 	
-	if (ctx->qc_master)
-		fprintf(fp, "Master Node ID: %d\n", ctx->qc_master);
-	else 
-		fprintf(fp, "Master Node ID: (none)\n");
-	
 	fprintf(fp, "Score: %d/%d (Minimum required = %d)\n",
 		score, score_max, score_req);
 	fprintf(fp, "Current state: %s\n", state_str(ctx->qc_status));
+
+	/*
 	fprintf(fp, "Current disk state: %s\n",
 		state_str(ctx->qc_disk_status));
-
+	 */
 	fprintf(fp, "Initializing Set: {");
 	for (x=0; x<max; x++) {
-		if (ni[x].ni_state == S_INIT)
+		if (ni[x].ni_status.ps_state == S_INIT && ni[x].ni_seen)
 			fprintf(fp," %d", ni[x].ni_status.ps_nodeid);
 	}
 	fprintf(fp, " }\n");
-	
+
 	fprintf(fp, "Visible Set: {");
 	for (x=0; x<max; x++) {
 		if (ni[x].ni_state >= S_RUN || ni[x].ni_status.ps_nodeid == 
@@ -671,6 +716,14 @@
 	}
 	fprintf(fp, " }\n");
 	
+	if (ctx->qc_status == S_INIT)
+		goto out;
+	
+	if (ctx->qc_master)
+		fprintf(fp, "Master Node ID: %d\n", ctx->qc_master);
+	else 
+		fprintf(fp, "Master Node ID: (none)\n");
+
 	if (!ctx->qc_master)
 		goto out;
 
@@ -686,6 +739,11 @@
 	fprintf(fp, " }\n");
 
 out:
+	if (ctx->qc_flags & RF_DEBUG) {
+		for (x = 0; x < max; x++)
+			print_node_info(fp, &ni[x]);
+	}
+
 	fprintf(fp, "\n");
 	if (need_close)
 		fclose(fp);
@@ -823,7 +881,10 @@
 
 		/* Check heuristics and remove ourself if necessary */
 		get_my_score(&score, &score_max);
-		upgrade = 0;
+
+		/* If we recently upgraded, decrement our wait time */
+		if (upgrade > 0)
+			--upgrade;
 
 		score_req = ctx->qc_scoremin;
 		if (score_req <= 0)
@@ -859,9 +920,7 @@
 				       "upgrading\n",
 				       score, score_max, score_req);
 				ctx->qc_status = S_RUN;
-				upgrade = (ctx->qc_tko / 3);
-				if (upgrade == 0)
-					upgrade = 1;
+				upgrade = ctx->qc_upgrade_wait;
 			}
 		}
 
@@ -905,7 +964,7 @@
 				 * Give ample time to become aware of other
 				 * nodes
 				 */
-				if (bid_pending < (ctx->qc_tko / 3))
+				if (bid_pending < (ctx->qc_master_wait))
 					break;
 				
 				clulog(LOG_INFO,
@@ -1060,6 +1119,8 @@
 	ctx->qc_scoremin = 0;
 	ctx->qc_flags = RF_REBOOT | RF_ALLOW_KILL | RF_UPTIME;
 			/* | RF_STOP_CMAN;*/
+	if (debug)
+		ctx->qc_flags |= RF_DEBUG;
 	ctx->qc_sched = SCHED_RR;
 	ctx->qc_sched_prio = 1;
 
@@ -1100,6 +1161,38 @@
 		if (ctx->qc_tko < 3)
 			ctx->qc_tko = 3;
 	}
+
+	/* Get up-tko (transition off->online) */
+	ctx->qc_tko_up = (ctx->qc_tko / 2);
+	snprintf(query, sizeof(query), "/cluster/quorumd/@tko_up");
+	if (ccs_get(ccsfd, query, &val) == 0) {
+		ctx->qc_tko_up = atoi(val);
+		free(val);
+	}
+	if (ctx->qc_tko_up < 2)
+		ctx->qc_tko_up = 2;
+
+	/* After coming online, wait this many intervals before
+	   being allowed to bid for master. */
+	ctx->qc_upgrade_wait = 2; /* (ctx->qc_tko / 3); */
+	snprintf(query, sizeof(query), "/cluster/quorumd/@upgrade_wait");
+	if (ccs_get(ccsfd, query, &val) == 0) {
+		ctx->qc_upgrade_wait = atoi(val);
+		free(val);
+	}
+	if (ctx->qc_upgrade_wait < 1)
+		ctx->qc_upgrade_wait = 1;
+
+	/* wait this many intervals after bidding for master before
+	   becoming Caesar  */
+	ctx->qc_master_wait = (ctx->qc_tko / 3);
+	snprintf(query, sizeof(query), "/cluster/quorumd/@master_wait");
+	if (ccs_get(ccsfd, query, &val) == 0) {
+		ctx->qc_master_wait = atoi(val);
+		free(val);
+	}
+	if (ctx->qc_master_wait < 2)
+		ctx->qc_master_wait = 2;
 		
 	/* Get votes */
 	snprintf(query, sizeof(query), "/cluster/quorumd/@votes");
@@ -1275,7 +1368,7 @@
 main(int argc, char **argv)
 {
 	cman_node_t me;
-	int cfh, rv, forked = 0;
+	int cfh, rv, forked = 0, nfd = -1;
 	qd_ctx ctx;
 	cman_handle_t ch;
 	node_info_t ni[MAX_NODES_DISK];
@@ -1283,13 +1376,13 @@
 	char debug = 0, foreground = 0;
 	char device[128];
 	pid_t pid;
-	
+
 	if (check_process_running(argv[0], &pid) && pid !=getpid()) {
 		printf("QDisk services already running\n");
 		return 0;
 	}
 	
-	while ((rv = getopt(argc, argv, "fd")) != EOF) {
+	while ((rv = getopt(argc, argv, "fdQ")) != EOF) {
 		switch (rv) {
 		case 'd':
 			debug = 1;
@@ -1297,6 +1390,18 @@
 		case 'f':
 			foreground = 1;
 			clu_log_console(1);
+			break;
+		case 'Q':
+			/* Make qdisk very quiet */
+			nfd = open("/dev/null", O_RDWR);
+			close(0);
+			close(1);
+			close(2);
+			dup2(nfd, 0);
+			dup2(nfd, 1);
+			dup2(nfd, 2);
+			close(nfd);
+			break;
 		default:
 			break;
 		}
@@ -1394,6 +1499,9 @@
 		check_stop_cman(&ctx);
 		return -1;
 	}
+
+	if (!_running)
+		return 0;
 	
 	cman_register_quorum_device(ctx.qc_ch, ctx.qc_device, ctx.qc_votes);
 	/*
--- cluster/cman/man/qdisk.5	2007/01/26 21:12:39	1.3.2.2
+++ cluster/cman/man/qdisk.5	2007/02/21 20:22:54	1.3.2.3
@@ -1,4 +1,4 @@
-.TH "QDisk" "21" "Jan 2007" "" "Cluster Quorum Disk"
+.TH "QDisk" "5" "20 Feb 2007" "" "Cluster Quorum Disk"
 .SH "NAME"
 QDisk 1.2 \- a disk-based quorum daemon for CMAN / Linux-Cluster
 .SH "1. Overview"
@@ -205,7 +205,7 @@
 .in 9
 \fIinterval\fP\fB="\fP1\fB"\fP
 .in 12 
-This is the frequency of read/write cycles
+This is the frequency of read/write cycles, in seconds.
 
 .in 9
 \fItko\fP\fB="\fP10\fB"\fP
@@ -213,6 +213,26 @@
 This is the number of cycles a node must miss in order to be declared dead.
 
 .in 9
+\fItko_up\fP\fB="\fPX\fB"\fP
+.in 12
+This is the number of cycles a node must be seen in order to be declared
+online.  Default is \fBfloor(tko/2)\fP.  This cannot be less than 2.
+
+.in 9
+\fIupgrade_wait\fP\fB="\fP2\fB"\fP
+.in 12
+This is the number of cycles a node must wait before initiating a bid
+for master status after heuristic scoring becomes sufficient.  The
+default is 2.  This cannot be set to 0, and should not exceed \fBtko\fP.
+
+.in 9
+\fImaster_wait\fP\fB="\fPX\fB"\fP
+.in 12
+This is the number of cycles a node must wait for votes before declaring
+itself master after making a bid.  Default is \fBfloor(tko/3)\fP.
+This cannot be less than 2 and should not exceed \fBtko\fP.
+
+.in 9
 \fIvotes\fP\fB="\fP3\fB"\fP
 .in 12
 This is the number of votes the quorum daemon advertises to CMAN when it
--- cluster/cman/man/qdiskd.8	2006/07/21 17:55:04	1.2
+++ cluster/cman/man/qdiskd.8	2007/02/21 20:22:54	1.2.6.1
@@ -15,6 +15,11 @@
 Run in the foreground (do not fork / daemonize).
 .IP "\-d"
 Enable debug output.
+.IP "\-Q"
+Close stdin, stdout, and stderr immediately, before doing validations.
+This is primarily for use when being called from an init script.
+This option stops all output, and cannot be used with the
+\-d option.
 
 .SH "SEE ALSO"
 mkqdisk(8), qdisk(5), cman(5)
--- cluster/cman/init.d/qdiskd	2006/05/19 14:41:35	1.2
+++ cluster/cman/init.d/qdiskd	2007/02/21 20:22:54	1.2.4.1
@@ -19,7 +19,7 @@
 # See how we were called.
 case "$1" in
   start)
-	action "Starting the Quorum Disk Daemon:" qdiskd
+	action "Starting the Quorum Disk Daemon:" qdiskd -Q
 	rtrn=$?
 	[ $rtrn = 0 ] && touch $LOCK_FILE
 	;;




More information about the Cluster-devel mailing list