[Cluster-devel] cluster/rgmanager ChangeLog include/resgroup.h ...

Fri Sep 1 19:02:24 UTC 2006

CVSROOT:	/cvs/cluster
Module name:	cluster
Changes by:	lhh at sourceware.org	2006-09-01 19:02:22

Modified files:
	rgmanager      : ChangeLog 
	rgmanager/include: resgroup.h vf.h 
	rgmanager/src/clulib: rg_strings.c vft.c 
	rgmanager/src/daemons: groups.c main.c 
	rgmanager/src/utils: clustat.c clusvcadm.c 

Log message:
	2006-09-01 Lon Hohberger <lhh at redhat.com>
	* include/resgroup.h: Add proto for rg_strerror
	* include/vf.h: Add proto for vf_invalidate (flushes vf cache)
	* src/clulib/rg_strings.c: Add rg_strerror function, define
	human-readable strings for rgmanager error values
	* src/clulib/vft.c: Add vf_invalidate (separate from vf_shutdown)
	* src/daemons/groups.c: Fix obvious logic error
	* src/daemons/main.c: Fix rg_doall() message during loss of quorum.
	Invalidate local VF cache and kill resource configurations on
	loss of quorum (#202497).  Send RG_EQUORUM back to clustat/clusvcadm
	so that they report why they can't get information.  Don't queue
	status checks if we've lost quorum.  Add command line parameter to
	disable internal crash watchdog
	* src/utils/clustat.c, clusvcadm.c: Handle SIGPIPE, and produce
	useful errors if possible.

Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/ChangeLog.diff?cvsroot=cluster&r1=1.22&r2=1.23
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/include/resgroup.h.diff?cvsroot=cluster&r1=1.13&r2=1.14
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/include/vf.h.diff?cvsroot=cluster&r1=1.5&r2=1.6
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/clulib/rg_strings.c.diff?cvsroot=cluster&r1=1.4&r2=1.5
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/clulib/vft.c.diff?cvsroot=cluster&r1=1.15&r2=1.16
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/groups.c.diff?cvsroot=cluster&r1=1.21&r2=1.22
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/main.c.diff?cvsroot=cluster&r1=1.30&r2=1.31
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/utils/clustat.c.diff?cvsroot=cluster&r1=1.19&r2=1.20
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/utils/clusvcadm.c.diff?cvsroot=cluster&r1=1.11&r2=1.12

--- cluster/rgmanager/ChangeLog	2006/08/21 15:14:08	1.22
+++ cluster/rgmanager/ChangeLog	2006/09/01 19:02:20	1.23
@@ -1,3 +1,25 @@
+2006-09-01 Lon Hohberger <lhh at redhat.com>
+	* include/resgroup.h: Add proto for rg_strerror
+	* include/vf.h: Add proto for vf_invalidate (flushes vf cache)
+	* src/clulib/rg_strings.c: Add rg_strerror function, define
+	human-readable strings for rgmanager error values
+	* src/clulib/vft.c: Add vf_invalidate (separate from vf_shutdown)
+	* src/daemons/groups.c: Fix obvious logic error
+	* src/daemons/main.c: Fix rg_doall() message during loss of quorum.
+	Invalidate local VF cache and kill resource configurations on
+	loss of quorum (#202497).  Send RG_EQUORUM back to clustat/clusvcadm
+	so that they report why they can't get information.  Don't queue
+	status checks if we've lost quorum.  Add command line parameter to
+	disable internal crash watchdog
+	* src/utils/clustat.c, clusvcadm.c: Handle SIGPIPE, and produce
+	useful errors if possible.
+
+2006-08-31 Marek GrÃ¡c <mgrac at redhat.com>
+	* src/daemons/restree.c: Fix #203720. Do not run backup copies (ends
+	with ~) of resource agents.
+	* src/resources/apache.*, mysql.*: Add Apache & MySQL resource agents
+	* src/resources/utils/*: Add utility scripts for resource agents
+
 2006-08-21 Lon Hohberger <lhh at redhat.com>
 	* src/daemons/main.c: Fix #202500 - simultaneous starts confuse
 	rgmanager.  This happened due to the fact that rgmanager was not
--- cluster/rgmanager/include/resgroup.h	2006/08/18 15:26:22	1.13
+++ cluster/rgmanager/include/resgroup.h	2006/09/01 19:02:21	1.14
@@ -174,6 +174,9 @@
 #define RG_YES		1
 #define RG_NO		2
 
+char *rg_strerror(int val);
+
+
 /*
  * Fail-over domain states
  */
--- cluster/rgmanager/include/vf.h	2006/07/12 14:04:06	1.5
+++ cluster/rgmanager/include/vf.h	2006/09/01 19:02:21	1.6
@@ -170,6 +170,7 @@
  * VF Stuff.  VF only talks to peers.
  */
 int vf_init(int, uint16_t, vf_vote_cb_t, vf_commit_cb_t);
+int vf_invalidate(void);
 int vf_shutdown(void);
 
 /*
--- cluster/rgmanager/src/clulib/rg_strings.c	2006/07/11 23:52:41	1.4
+++ cluster/rgmanager/src/clulib/rg_strings.c	2006/09/01 19:02:22	1.5
@@ -16,6 +16,39 @@
   Free Software Foundation, Inc.,  675 Mass Ave, Cambridge, 
   MA 02139, USA.
 */
+#include <resgroup.h>
+
+struct { int val; char *str; } rg_error_strings[] = {
+	{ RG_EQUORUM,	"Operation requires quorum" },
+	{ RG_EINVAL,	"Invalid operation for resource" },
+	{ RG_EDEPEND,	"Operation violates dependency rule" },
+	{ RG_EAGAIN,	"Temporary failure; try again" },
+	{ RG_EDEADLCK,	"Operation would cause a deadlock" },
+	{ RG_ENOSERVICE,"Service does not exist" },
+	{ RG_EFORWARD,	"Service not mastered locally" },
+	{ RG_EABORT,	"Aborted; service failed" },
+	{ RG_EFAIL,	"Failure" },
+	{ RG_ESUCCESS,	"Success" },
+	{ RG_YES,	"Yes" },
+	{ RG_NO, 	"No" },
+	{ 0,		NULL }
+};
+
+
+char *rg_strerror(int err)
+{
+	int x;
+
+	for (x = 0; rg_error_strings[x].str != NULL; x++) {
+		if (rg_error_strings[x].val == err) {
+			return rg_error_strings[x].str;
+		}
+	}
+
+	return "Unknown";
+}
+	
+
 const char *rg_state_strings[] = {
 	"stopped",
 	"starting",
@@ -51,3 +84,4 @@
 	"user stop",
 	""
 };
+
--- cluster/rgmanager/src/clulib/vft.c	2006/08/07 22:05:01	1.15
+++ cluster/rgmanager/src/clulib/vft.c	2006/09/01 19:02:22	1.16
@@ -935,22 +935,13 @@
 }
 
 
-/**
-  Shut down VF
-  */
 int
-vf_shutdown(void)
+vf_invalidate(void)
 {
 	key_node_t *c_key;
 	view_node_t *c_jv;
 	commit_node_t *c_cn;
 
-	pthread_mutex_lock(&vf_mutex);
-	vf_thread_ready = 0;
-	pthread_cancel(vf_thread);
-	pthread_join(vf_thread, NULL);
-	_port = 0;
-	_node_id = (int)-1;
 	pthread_mutex_lock(&key_list_mutex);
 
 	while ((c_key = key_list) != NULL) {
@@ -974,6 +965,29 @@
 	}
 
 	pthread_mutex_unlock(&key_list_mutex);
+	return 0;
+}
+
+
+/**
+  Shut down VF
+  */
+int
+vf_shutdown(void)
+{
+	key_node_t *c_key;
+	view_node_t *c_jv;
+	commit_node_t *c_cn;
+
+	pthread_mutex_lock(&vf_mutex);
+	vf_thread_ready = 0;
+	pthread_cancel(vf_thread);
+	pthread_join(vf_thread, NULL);
+	_port = 0;
+	_node_id = (int)-1;
+
+	vf_invalidate();
+
 	pthread_mutex_unlock(&vf_mutex);
 
 	return 0;
--- cluster/rgmanager/src/daemons/groups.c	2006/08/18 15:26:22	1.21
+++ cluster/rgmanager/src/daemons/groups.c	2006/09/01 19:02:22	1.22
@@ -273,7 +273,7 @@
 	 * local start.
 	 */
 	if (svcStatus->rs_state == RG_STATE_STARTED &&
-	    svcStatus->rs_owner == mp->cn_nodeid)
+	    svcStatus->rs_owner != mp->cn_nodeid)
 		return;
 
 	if (svcStatus->rs_state == RG_STATE_DISABLED)
--- cluster/rgmanager/src/daemons/main.c	2006/08/21 15:14:09	1.30
+++ cluster/rgmanager/src/daemons/main.c	2006/09/01 19:02:22	1.31
@@ -123,7 +123,13 @@
 		rg_set_inquorate();
 		member_list_update(NULL);/* Clear member list */
 		rg_lockall(L_SYS);
-		rg_doall(RG_INIT, 1, "Emergency stop of %s");
+		rg_doall(RG_INIT, 1, "Emergency stop of %s\n");
+#ifndef USE_OPENAIS
+		clulog(LOG_DEBUG, "Invalidating local VF cache\n");
+		vf_invalidate();
+#endif
+		clulog(LOG_DEBUG, "Flushing resource group cache\n");
+		kill_resource_groups();
 		rg_set_uninitialized();
 		return -1;
 	} else if (!rg_quorate()) {
@@ -131,7 +137,7 @@
 		rg_set_quorate();
 		rg_unlockall(L_SYS);
 		rg_unlockall(L_USER);
-		clulog(LOG_NOTICE, "Quorum Formed\n");
+		clulog(LOG_NOTICE, "Quorum Regained\n");
 	}
 
 	old_membership = member_list();
@@ -562,7 +568,7 @@
 	case M_STATECHANGE:
 		msg_receive(ctx, NULL, 0, 0);
 		clulog(LOG_DEBUG, "Membership Change Event\n");
-		if (rg_quorate() && running) {
+		if (running) {
 			rg_unlockall(L_SYS);
 			membership_update();
 		}
@@ -644,6 +650,7 @@
 		}
 			
 		if (!rg_initialized()) {
+			msg_send_simple(newctx, RG_FAIL, RG_EQUORUM, 0);
 			msg_close(newctx);
 			msg_free_ctx(newctx);
 			continue;
@@ -651,6 +658,7 @@
 
 		if (!rg_quorate()) {
 			printf("Dropping connect: NO QUORUM\n");
+			msg_send_simple(newctx, RG_FAIL, RG_EQUORUM, 0);
 			msg_close(newctx);
 			msg_free_ctx(newctx);
 		}
@@ -668,7 +676,7 @@
 		return 0;
 
 	/* No new messages.  Drop in the status check requests.  */
-	if (n == 0) {
+	if (n == 0 && rg_quorate()) {
 		do_status_checks();
 		return 0;
 	}
@@ -805,15 +813,18 @@
 main(int argc, char **argv)
 {
 	int rv;
-	char foreground = 0;
+	char foreground = 0, wd = 1;
 	cman_node_t me;
 	msgctx_t *cluster_ctx;
 	msgctx_t *local_ctx;
 	pthread_t th;
 	cman_handle_t clu = NULL;
 
-	while ((rv = getopt(argc, argv, "fd")) != EOF) {
+	while ((rv = getopt(argc, argv, "wfd")) != EOF) {
 		switch (rv) {
+		case 'w':
+			wd = 0;
+			break;
 		case 'd':
 			debug = 1;
 			break;
@@ -834,7 +845,7 @@
 
 	if (!foreground && (geteuid() == 0)) {
 		daemon_init(argv[0]);
-		if (!debug && !watchdog_init())
+		if (wd && !debug && !watchdog_init())
 			clulog(LOG_NOTICE, "Failed to start watchdog\n");
 	}
 
--- cluster/rgmanager/src/utils/clustat.c	2006/08/07 22:05:01	1.19
+++ cluster/rgmanager/src/utils/clustat.c	2006/09/01 19:02:22	1.20
@@ -10,6 +10,7 @@
 #include <termios.h>
 #include <ccs.h>
 #include <libcman.h>
+#include <signal.h>
 
 #ifdef HAVE_CONFIG_H
 #include <config.h>
@@ -46,7 +47,7 @@
 rg_state_list(int local_node_id, int fast)
 {
 	msgctx_t ctx;
-	int max, n, x;
+	int max = 0, n, x;
 	rg_state_list_t *rsl = NULL;
 	generic_msg_hdr *msgp = NULL;
 	rg_state_msg_t *rsmp = NULL;
@@ -91,6 +92,7 @@
 		}
 
 		n = msg_receive_simple(&ctx, &msgp, tv.tv_sec);
+
 		if (n < 0) {
 			if (errno == EAGAIN)
 				continue;
@@ -109,6 +111,13 @@
 
 		swab_generic_msg_hdr(msgp);
 
+		if (msgp->gh_command == RG_FAIL) {
+			printf("Service states unavailable: %s\n", 
+			       rg_strerror(msgp->gh_arg1));
+			msg_close(&ctx);
+			return NULL;
+		}	
+
 		if (msgp->gh_command == RG_SUCCESS) {
 			free(msgp);
 			break;
@@ -736,6 +745,8 @@
 		return 1;
 	}
 
+	signal(SIGPIPE, SIG_IGN);
+
 	/* Connect & grab all our info */
 	ch = cman_init(NULL);
 
--- cluster/rgmanager/src/utils/clusvcadm.c	2006/08/09 21:48:34	1.11
+++ cluster/rgmanager/src/utils/clusvcadm.c	2006/09/01 19:02:22	1.12
@@ -31,6 +31,7 @@
 #include <libcman.h>
 #include <resgroup.h>
 #include <msgsimple.h>
+#include <signal.h>
 
 #ifdef HAVE_CONFIG_H
 #include <config.h>
@@ -187,6 +188,7 @@
 	msgctx_t ctx;
 	cman_handle_t ch;
 	SmMessageSt msg;
+	generic_msg_hdr *h = (generic_msg_hdr *)&msg;
 	int action = RG_STATUS;
 	int node_specified = 0;
        	int me, svctarget = 0;
@@ -274,6 +276,8 @@
 		svcname = realsvcname;
 	}
 
+	signal(SIGPIPE, SIG_IGN);
+
 	/* No login */
 	ch = cman_init(NULL);
 	if (!ch) {
@@ -320,48 +324,23 @@
 		return 1;
 	}
 
-	opt = msg_send(&ctx, &msg, sizeof(msg));
-
-	if (opt < sizeof(msg)) {
-		perror("msg_send");
-		fprintf(stderr, "Could not send entire message!\n");
-		return 1;
-	}
+	msg_send(&ctx, &msg, sizeof(msg));
 
-	if (msg_receive(&ctx, &msg, sizeof(msg), 0) != sizeof(msg)) {
+	/* Reusing opt here */
+	if ((opt = msg_receive(&ctx, &msg, sizeof(msg), 0)) < sizeof(*h)) {
 		perror("msg_receive");
 		fprintf(stderr, "Error receiving reply!\n");
 		return 1;
 	}
 
 	/* Decode */
-	swab_SmMessageSt(&msg);
-	switch (msg.sm_data.d_ret) {
-	case RG_ESUCCESS:
-		printf("success\n");
-		break;
-	case RG_EFAIL:
-		printf("failed\n");
-		break;
-	case RG_EABORT:
-		printf("cancelled by resource manager\n");
-		break;
-	case RG_ENOSERVICE:
-		printf("failed: Service does not exist\n");
-		break;
-	case RG_EDEADLCK:
-		printf("failed: Operation would deadlock\n");
-		break;
-	case RG_EAGAIN:
-		printf("failed: Try again (resource groups locked)\n");
-		break;
-	case RG_EDEPEND:
-		printf("failed: Operation would break dependency\n");
-		break;
-	default:
-		printf("failed: unknown reason %d\n", msg.sm_data.d_ret);
-		break;
+	if (opt < sizeof(msg)) {
+		swab_generic_msg_hdr(h);
+		printf("%s\n", rg_strerror(h->gh_arg1));
+		return h->gh_arg1;
 	}
 
+	swab_SmMessageSt(&msg);
+	printf("%s\n", rg_strerror(msg.sm_data.d_ret));
 	return msg.sm_data.d_ret;
 }