[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]

[Cluster-devel] cluster/rgmanager ChangeLog include/reslist.h ...



CVSROOT:	/cvs/cluster
Module name:	cluster
Branch: 	RHEL5
Changes by:	lhh sourceware org	2007-11-26 21:46:27

Modified files:
	rgmanager      : ChangeLog 
	rgmanager/include: reslist.h 
	rgmanager/src/daemons: Makefile fo_domain.c groups.c main.c 
	                       reslist.c resrules.c restree.c rg_state.c 
	                       test.c 
	rgmanager/src/resources: service.sh vm.sh 
Added files:
	rgmanager/include: restart_counter.h 
	rgmanager/src/daemons: restart_counter.c 

Log message:
	Implement restart counters per #247139

Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/ChangeLog.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.31.2.28&r2=1.31.2.29
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/include/restart_counter.h.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=NONE&r2=1.1.2.1
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/include/reslist.h.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.15.2.6&r2=1.15.2.7
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/restart_counter.c.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=NONE&r2=1.1.2.1
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/Makefile.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.14.2.3&r2=1.14.2.4
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/fo_domain.c.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.11&r2=1.11.2.1
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/groups.c.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.25.2.12&r2=1.25.2.13
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/main.c.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.34.2.9&r2=1.34.2.10
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/reslist.c.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.14.2.4&r2=1.14.2.5
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/resrules.c.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.16.2.7&r2=1.16.2.8
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/restree.c.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.23.2.12&r2=1.23.2.13
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/rg_state.c.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.24.2.13&r2=1.24.2.14
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/test.c.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.6.2.5&r2=1.6.2.6
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/resources/service.sh.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.7.2.6&r2=1.7.2.7
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/resources/vm.sh.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.1.2.8&r2=1.1.2.9

--- cluster/rgmanager/ChangeLog	2007/11/26 21:37:17	1.31.2.28
+++ cluster/rgmanager/ChangeLog	2007/11/26 21:46:26	1.31.2.29
@@ -1,3 +1,21 @@
+2007-11-26 Lon Hohberger <lhh at redhat.com>
+	* include/reslist.h: Add restart counters to resource node structure
+	(intended for top-level resources, i.e. services, vms...)
+	* include/restart_counter.h: Add header file for restart counter
+	* src/daemons/Makefile: Fix build to include restart counters
+	* src/daemons/restart_counter.c: Implement restart counters #247139
+	* src/daemons/fo_domain.c, groups.c, restart_counter.c, reslist.c,
+	restree.c, test.c: Glue for restart counters.
+	* src/daemons/resrules.c: Glue for restart counters.  Make expand_time
+	parser more robust to allow things like '1h30m' as a time value.
+	* src/daemons/main.c: Mark quorum disk offline in the correct
+	place to avoid extraneous log messages
+	* src/daemons/rg_state.c: Allow marking service as stopped if
+	stuck in recover state.  Make service which failed to start
+	go to stopped state.  Glue for restart counters.
+	* src/resources/service.sh, vm.sh: Add parameters for restart
+	counters #247139
+
 2007-11-14 Lon Hohberger <lhh at redhat.com>
 	* src/utils/clulog.c: Make clulog honor rgmanager log levels
 	(#289501)
--- cluster/rgmanager/include/reslist.h	2007/08/02 14:46:51	1.15.2.6
+++ cluster/rgmanager/include/reslist.h	2007/11/26 21:46:26	1.15.2.7
@@ -126,6 +126,7 @@
 	struct _rg_node	*rn_child, *rn_parent;
 	resource_t	*rn_resource;
 	resource_act_t	*rn_actions;
+	restart_counter_t rn_restart_counter;
 	int	rn_state; /* State of this instance of rn_resource */
 	int	rn_flags;
 	int	rn_last_status;
--- cluster/rgmanager/src/daemons/Makefile	2007/07/24 13:53:08	1.14.2.3
+++ cluster/rgmanager/src/daemons/Makefile	2007/11/26 21:46:27	1.14.2.4
@@ -38,7 +38,8 @@
 clurgmgrd: rg_thread.o rg_locks.o main.o groups.o  \
 		rg_queue.o rg_forward.o reslist.o \
 		resrules.o restree.o fo_domain.o nodeevent.o \
-		rg_event.o watchdog.o rg_state.o ../clulib/libclulib.a
+		rg_event.o watchdog.o rg_state.o \
+		restart_counter.o ../clulib/libclulib.a
 	$(CC) -o $@ $^ $(INCLUDE) $(CFLAGS) $(LDFLAGS) -lccs -lcman -lpthread -ldlm
 
 #
@@ -56,7 +57,8 @@
 # packages should run 'make check' as part of the build process.
 #
 rg_test: rg_locks-noccs.o test-noccs.o reslist-noccs.o \
-		resrules-noccs.o restree-noccs.o fo_domain-noccs.o
+		resrules-noccs.o restree-noccs.o fo_domain-noccs.o \
+		restart_counter.o 
 	$(CC) -o $@ $^ $(INCLUDE) $(CFLAGS) -llalloc $(LDFLAGS) -lccs -lcman
 
 clurmtabd: clurmtabd.o clurmtabd_lib.o
--- cluster/rgmanager/src/daemons/fo_domain.c	2006/09/27 16:28:41	1.11
+++ cluster/rgmanager/src/daemons/fo_domain.c	2007/11/26 21:46:27	1.11.2.1
@@ -27,6 +27,7 @@
 #include <list.h>
 #include <clulog.h>
 #include <resgroup.h>
+#include <restart_counter.h>
 #include <reslist.h>
 #include <ccs.h>
 #include <pthread.h>
--- cluster/rgmanager/src/daemons/groups.c	2007/08/02 14:46:51	1.25.2.12
+++ cluster/rgmanager/src/daemons/groups.c	2007/11/26 21:46:27	1.25.2.13
@@ -20,6 +20,7 @@
 //#define DEBUG
 #include <platform.h>
 #include <resgroup.h>
+#include <restart_counter.h>
 #include <reslist.h>
 #include <vf.h>
 #include <message.h>
@@ -178,6 +179,29 @@
 }
 
 
+/* Return the node in 'tree' whose res_build_name() string equals 'name'
+   (compared case-insensitively), or NULL when no node matches. */
+resource_node_t *
+node_by_ref(resource_node_t **tree, char *name)
+{
+	resource_t *res;
+	resource_node_t *node, *ret = NULL;
+	char rgname[64];
+	int x;
+
+	/* Walk the caller's list, not the file-scope _tree */
+	list_for(tree, node, x) {
+		res = node->rn_resource;
+		res_build_name(rgname, sizeof(rgname), res);
+		if (!strcasecmp(name, rgname)) {
+			ret = node;
+			break;
+		}
+	}
+	return ret;
+}
+
+
 int
 count_resource_groups_local(cman_node_t *mp)
 {
@@ -1583,6 +1607,28 @@
 }
 
 
+int
+check_restart(char *rg_name)	/* returns restart_add()'s result, or 1 if rg_name is not in _tree */
+{
+	resource_node_t *node;
+	int ret = 1;	/* kept when no node matches rg_name */
+
+	pthread_rwlock_rdlock(&resource_lock);	/* NOTE(review): read lock only; restart_add()/restart_clear() look like writes -- confirm the counter is protected elsewhere */
+	node = node_by_ref(&_tree, rg_name);
+	if (node) {
+		ret = restart_add(node->rn_restart_counter);	/* presumably nonzero means the restart limit was hit -- TODO confirm */
+		if (ret) {
+			/* Clear it out - caller is about 
+			   to relocate the service anyway */
+			restart_clear(node->rn_restart_counter);
+		}
+	}
+	pthread_rwlock_unlock(&resource_lock);
+
+	return ret;
+}
+
+
 void
 kill_resource_groups(void)
 {
--- cluster/rgmanager/src/daemons/main.c	2007/08/21 16:39:02	1.34.2.9
+++ cluster/rgmanager/src/daemons/main.c	2007/11/26 21:46:27	1.34.2.10
@@ -165,6 +165,7 @@
 
 	old_membership = member_list();
 	new_ml = get_member_list(h);
+	memb_mark_down(new_ml, 0);
 
 	for (x = 0; x < new_ml->cml_count; x++) {
 
@@ -181,19 +182,25 @@
 			quorate = cman_is_listening(h,
 					new_ml->cml_members[x].cn_nodeid,
 					port);
+
 			if (quorate == 0) {
 				clulog(LOG_DEBUG, "Node %d is not listening\n",
 					new_ml->cml_members[x].cn_nodeid);
 				new_ml->cml_members[x].cn_member = 0;
 			} else if (quorate < 0) {
+				if (errno == ENOTCONN) {
+					new_ml->cml_members[x].cn_member = 0;
+					break;
+				}
 				perror("cman_is_listening");
 				usleep(50000);
 				continue;
 			}
-
 #ifdef DEBUG
-			printf("Node %d IS listening\n",
-			       new_ml->cml_members[x].cn_nodeid);
+		       	else {
+				printf("Node %d IS listening\n",
+				       new_ml->cml_members[x].cn_nodeid);
+			}
 #endif
 			break;
 		} while(1);
@@ -201,7 +208,6 @@
 
 	cman_finish(h);
 	member_list_update(new_ml);
-	member_set_state(0, 0);		/* Mark qdisk as dead */
 
 	/*
 	 * Handle nodes lost.  Do our local node event first.
--- cluster/rgmanager/src/daemons/reslist.c	2007/07/31 17:54:54	1.14.2.4
+++ cluster/rgmanager/src/daemons/reslist.c	2007/11/26 21:46:27	1.14.2.5
@@ -26,6 +26,7 @@
 #include <sys/types.h>
 #include <sys/stat.h>
 #include <list.h>
+#include <restart_counter.h>
 #include <reslist.h>
 #include <pthread.h>
 #ifndef NO_CCS
--- cluster/rgmanager/src/daemons/resrules.c	2007/07/31 17:54:54	1.16.2.7
+++ cluster/rgmanager/src/daemons/resrules.c	2007/11/26 21:46:27	1.16.2.8
@@ -27,6 +27,7 @@
 #include <sys/types.h>
 #include <sys/stat.h>
 #include <list.h>
+#include <restart_counter.h>
 #include <reslist.h>
 #include <pthread.h>
 #include <dirent.h>
@@ -218,43 +219,70 @@
 
 
 int
-expand_time(char *val)
+expand_time (char *val)	/* sums "<digits><unit>" groups, e.g. "1h30m", into seconds */
 {
-	int l = strlen(val);
-	char c = val[l - 1];
-	int ret = atoi(val);
+	int curval, len;
+	int ret = 0;
+	char *start = val, ival[16];	/* ival: copy of the current digit run for atoi() */
 
-	if (ret <= 0)
-		return 0;
+	if (!val)
+		return (time_t)0;	/* cast has no effect: the function returns int */
+
+	while (start[0]) {
+
+		len = 0;
+		curval = 0;
+		memset(ival, 0, sizeof(ival));
+
+		while (isdigit(start[len])) {	/* NOTE(review): len is not bounded; a run of 16+ digits overflows ival */
+			ival[len] = start[len];
+			len++;
+		}
+
+		if (len) {
+			curval = atoi(ival);
+		} else {
+			len = 1;	/* no digits here: step over this one character */
+		}
 
-	if ((c >= '0') && (c <= '9'))
-		return ret;
+		switch(start[len]) {	/* unit follows the digits; it is stepped over on the next pass */
+		case 0:	/* end of string: value is taken as seconds */
+		case 'S':
+		case 's':
+			break;
+		case 'M':
+        	case 'm':
+			curval *= 60;
+			break;
+		case 'h':
+		case 'H':
+			curval *= 3600;
+			break;
+		case 'd':
+		case 'D':
+			curval *= 86400;
+			break;
+		case 'w':
+		case 'W':
+			curval *= 604800;
+			break;
+		case 'y':
+		case 'Y':
+			curval *= 31536000;
+			break;
+		default:
+			curval = 0;	/* unrecognized unit: this group adds nothing */
+		}
 
-	switch(c) {
-	case 'S':
-	case 's':
-		return (ret);
-	case 'M':
-	case 'm':
-		return (ret * 60);
-	case 'h':
-	case 'H':
-		return (ret * 3600);
-	case 'd':
-	case 'D':
-		return (ret * 86400);
-	case 'w':
-	case 'W':
-		return (ret * 604800);
-	case 'y':
-	case 'Y':
-		return (ret * 31536000);
+		ret += (time_t)curval;	/* NOTE(review): curval and ret are int; large values can overflow */
+		start += len;
 	}
 
 	return ret;
 }
 
 
+
 /**
  * Store a resource action
  * @param actsp		Action array; may be modified and returned!
--- cluster/rgmanager/src/daemons/restree.c	2007/09/25 21:09:23	1.23.2.12
+++ cluster/rgmanager/src/daemons/restree.c	2007/11/26 21:46:27	1.23.2.13
@@ -30,6 +30,7 @@
 #include <sys/types.h>
 #include <sys/stat.h>
 #include <list.h>
+#include <restart_counter.h>
 #include <reslist.h>
 #include <pthread.h>
 #include <clulog.h>
@@ -432,6 +433,39 @@
 }
 
 
+/* Give a top-level node a restart counter when its resource sets a positive
+   "max_restarts"; otherwise the node's counter is left NULL. */
+static inline void
+assign_restart_policy(resource_t *curres, resource_node_t *parent,
+		      resource_node_t *node)
+{
+	char *val;
+	int max_restarts = 0;
+	time_t restart_expire_time = 0;
+
+	if (!node)	/* must be tested before node is written below */
+		return;
+	node->rn_restart_counter = NULL;
+	if (!curres || parent) /* Only parentless nodes get one for now */
+		return;
+	val = res_attr_value(curres, "max_restarts");
+	if (!val)
+		return;
+	max_restarts = atoi(val);
+	if (max_restarts <= 0)
+		return;
+	val = res_attr_value(curres, "restart_expire_time");
+	if (val) {
+		restart_expire_time = (time_t)expand_time(val);
+		if (!restart_expire_time)	/* set, but parsed as 0 */
+			return;
+	}
+	/* Expiry stays 0 when "restart_expire_time" is not set at all */
+	node->rn_restart_counter = restart_init(restart_expire_time,
+						max_restarts);
+}
+
+
 static inline int
 do_load_resource(int ccsfd, char *base,
 	         resource_rule_t *rule,
@@ -514,6 +548,7 @@
 	node->rn_state = RES_STOPPED;
 	node->rn_flags = 0;
 	node->rn_actions = (resource_act_t *)act_dup(curres->r_actions);
+	assign_restart_policy(curres, parent, node);
 
 	snprintf(tok, sizeof(tok), "%s/@__independent_subtree", base);
 #ifndef NO_CCS
@@ -768,6 +803,11 @@
 			destroy_resource_tree(&(*tree)->rn_child);
 
 		list_remove(tree, node);
+
+		if (node->rn_restart_counter) {
+			restart_cleanup(node->rn_restart_counter);
+		}
+
 		if(node->rn_actions){
 			free(node->rn_actions);
 		}
--- cluster/rgmanager/src/daemons/rg_state.c	2007/08/30 16:03:03	1.24.2.13
+++ cluster/rgmanager/src/daemons/rg_state.c	2007/11/26 21:46:27	1.24.2.14
@@ -1315,7 +1315,8 @@
 	}
 
 	if ((svcStatus.rs_state != RG_STATE_STOPPING) &&
-	     (svcStatus.rs_state != RG_STATE_ERROR)) {
+	    (svcStatus.rs_state != RG_STATE_ERROR) &&
+	    (svcStatus.rs_state != RG_STATE_RECOVER)) {
 		rg_unlock(&lockp);
 		return 0;
 	}
@@ -1721,8 +1722,10 @@
 	 * We got sent here from handle_start_req.
 	 * We're DONE.
 	 */
-	if (request == RG_START_RECOVER)
+	if (request == RG_START_RECOVER) {
+		_svc_stop_finish(svcName, 0, RG_STATE_STOPPED);
 		return RG_EFAIL;
+	}
 
 	/*
 	 * All potential places for the service to start have been exhausted.
@@ -1731,7 +1734,7 @@
 exhausted:
 	if (!rg_locked()) {
 		clulog(LOG_WARNING,
-		       "#70: Attempting to restart service %s locally.\n",
+		       "#70: Failed to relocate %s; restarting locally\n",
 		       svcName);
 		if (svc_start(svcName, RG_START_RECOVER) == 0) {
 			*new_owner = me;
@@ -1969,6 +1972,14 @@
 					   new_owner);
 	}
 
+	/* Check restart counter/timer for this resource */
+	if (check_restart(svcName) > 0) {
+		clulog(LOG_NOTICE, "Restart threshold for %s exceeded; "
+		       "attempting to relocate\n", svcName);
+		return handle_relocate_req(svcName, RG_START_RECOVER, -1,
+					   new_owner);
+	}
+
 	return handle_start_req(svcName, RG_START_RECOVER, new_owner);
 }
 
--- cluster/rgmanager/src/daemons/test.c	2007/07/31 17:54:54	1.6.2.5
+++ cluster/rgmanager/src/daemons/test.c	2007/11/26 21:46:27	1.6.2.6
@@ -25,6 +25,7 @@
 #include <sys/types.h>
 #include <sys/stat.h>
 #include <list.h>
+#include <restart_counter.h>
 #include <reslist.h>
 #include <pthread.h>
 
--- cluster/rgmanager/src/resources/service.sh	2007/11/13 17:38:43	1.7.2.6
+++ cluster/rgmanager/src/resources/service.sh	2007/11/26 21:46:27	1.7.2.7
@@ -154,6 +154,32 @@
             </shortdesc>
             <content type="string"/>
         </parameter>
+
+        <parameter name="max_restarts">
+            <longdesc lang="en">
+	    	Maximum restarts for this service.
+            </longdesc>
+            <shortdesc lang="en">
+	    	Maximum restarts for this service.
+            </shortdesc>
+            <content type="string"/>
+        </parameter>
+
+        <parameter name="restart_expire_time">
+            <longdesc lang="en">
+	    	Restart expiration time.  A restart is forgotten
+		after this time.  When combined with the max_restarts
+		option, this lets administrators specify a threshold
+		for when to fail over services.  If max_restarts
+		is exceeded in this given expiration time, the service
+		is relocated instead of restarted again.
+            </longdesc>
+            <shortdesc lang="en">
+	    	Restart expiration time
+            </shortdesc>
+            <content type="string"/>
+        </parameter>
+
     </parameters>
 
     <actions>
--- cluster/rgmanager/src/resources/vm.sh	2007/11/14 18:58:26	1.1.2.8
+++ cluster/rgmanager/src/resources/vm.sh	2007/11/26 21:46:27	1.1.2.9
@@ -184,6 +184,31 @@
             <content type="string" default="live"/>
         </parameter>
 
+        <parameter name="max_restarts">
+            <longdesc lang="en">
+	    	Maximum restarts for this service.
+            </longdesc>
+            <shortdesc lang="en">
+	    	Maximum restarts for this service.
+            </shortdesc>
+            <content type="string"/>
+        </parameter>
+
+        <parameter name="restart_expire_time">
+            <longdesc lang="en">
+	    	Restart expiration time.  A restart is forgotten
+		after this time.  When combined with the max_restarts
+		option, this lets administrators specify a threshold
+		for when to fail over services.  If max_restarts
+		is exceeded in this given expiration time, the service
+		is relocated instead of restarted again.
+            </longdesc>
+            <shortdesc lang="en">
+	    	Restart expiration time
+            </shortdesc>
+            <content type="string"/>
+        </parameter>
+
     </parameters>
 
     <actions>


[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]