[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]

[Cluster-devel] cluster/rgmanager ChangeLog include/resgroup.h ...



CVSROOT:	/cvs/cluster
Module name:	cluster
Changes by:	lhh sourceware org	2007-11-30 20:36:18

Modified files:
	rgmanager      : ChangeLog 
	rgmanager/include: resgroup.h reslist.h 
	rgmanager/src/daemons: Makefile fo_domain.c groups.c main.c 
	                       reslist.c resrules.c restree.c rg_state.c 
	                       test.c 
Added files:
	rgmanager/include: restart_counter.h 
	rgmanager/src/daemons: restart_counter.c 

Log message:
	Merges from RHEL5 branch - round 2.

Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/ChangeLog.diff?cvsroot=cluster&r1=1.60&r2=1.61
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/include/restart_counter.h.diff?cvsroot=cluster&r1=1.1&r2=1.2
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/include/resgroup.h.diff?cvsroot=cluster&r1=1.23&r2=1.24
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/include/reslist.h.diff?cvsroot=cluster&r1=1.23&r2=1.24
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/restart_counter.c.diff?cvsroot=cluster&r1=1.1&r2=1.2
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/Makefile.diff?cvsroot=cluster&r1=1.23&r2=1.24
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/fo_domain.c.diff?cvsroot=cluster&r1=1.13&r2=1.14
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/groups.c.diff?cvsroot=cluster&r1=1.39&r2=1.40
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/main.c.diff?cvsroot=cluster&r1=1.44&r2=1.45
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/reslist.c.diff?cvsroot=cluster&r1=1.19&r2=1.20
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/resrules.c.diff?cvsroot=cluster&r1=1.23&r2=1.24
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/restree.c.diff?cvsroot=cluster&r1=1.37&r2=1.38
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/rg_state.c.diff?cvsroot=cluster&r1=1.40&r2=1.41
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/test.c.diff?cvsroot=cluster&r1=1.12&r2=1.13

--- cluster/rgmanager/ChangeLog	2007/11/30 20:06:55	1.60
+++ cluster/rgmanager/ChangeLog	2007/11/30 20:36:17	1.61
@@ -1,6 +1,8 @@
 2007-11-30 Lon Hohberger <lhh at redhat.com>
-	* src/resources/*: Merge from RHEL5 branch.
-	* src/utils/*: Merge from RHEL5 branch.
+	* src/resources/*: Merge misc. updates from RHEL5 branch.
+	* src/utils/*: Merge misc. updates from RHEL5 branch.
+	* include/*.h, src/daemons/*: Merge status-counter patch
+	from RHEL5 branch.
 
 2007-08-30 Lon Hohberger <lhh at redhat.com>
 	* src/daemons/restree.c, rg_state.c: Fix tree-restart bug
--- cluster/rgmanager/include/restart_counter.h	2007/11/26 21:46:26	1.1
+++ cluster/rgmanager/include/restart_counter.h	2007/11/30 20:36:17	1.2
@@ -0,0 +1,12 @@
+#ifndef _RESTART_COUNTER_H
+#define _RESTART_COUNTER_H
+
+typedef void *restart_counter_t;
+
+int restart_add(restart_counter_t arg);
+int restart_clear(restart_counter_t arg);
+int restart_count(restart_counter_t arg);
+restart_counter_t restart_init(time_t expire_timeout, int max_restarts);
+int restart_cleanup(restart_counter_t arg);
+
+#endif
--- cluster/rgmanager/include/resgroup.h	2007/06/27 14:03:51	1.23
+++ cluster/rgmanager/include/resgroup.h	2007/11/30 20:36:17	1.24
@@ -150,6 +150,8 @@
 int svc_freeze(char *svcName);
 int svc_unfreeze(char *svcName);
 int svc_migrate(char *svcName, int target);
+int check_restart(char *svcName);
+
 int rt_enqueue_request(const char *resgroupname, int request,
 		       msgctx_t *resp_ctx,
        		       int max, uint32_t target, int arg0, int arg1);
--- cluster/rgmanager/include/reslist.h	2007/08/02 14:53:37	1.23
+++ cluster/rgmanager/include/reslist.h	2007/11/30 20:36:17	1.24
@@ -126,6 +126,7 @@
 	struct _rg_node	*rn_child, *rn_parent;
 	resource_t	*rn_resource;
 	resource_act_t	*rn_actions;
+	restart_counter_t rn_restart_counter;
 	int	rn_state; /* State of this instance of rn_resource */
 	int	rn_flags;
 	int	rn_last_status;
--- cluster/rgmanager/src/daemons/restart_counter.c	2007/11/26 21:46:27	1.1
+++ cluster/rgmanager/src/daemons/restart_counter.c	2007/11/30 20:36:17	1.2
@@ -0,0 +1,185 @@
+/*
+  Copyright Red Hat, Inc. 2007
+
+  This program is free software; you can redistribute it and/or modify it
+  under the terms of the GNU General Public License version 2 as published
+  by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; see the file COPYING.  If not, write to the
+  Free Software Foundation, Inc.,  675 Mass Ave, Cambridge, 
+  MA 02139, USA.
+*/
+/* Time-based restart counters for rgmanager */
+
+#include <stdio.h>
+#include <list.h>
+#include <errno.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <sys/types.h>
+#include <time.h>
+#include <restart_counter.h>
+
+
+
+#define RESTART_INFO_MAGIC 0x184820ab
+
+typedef struct {
+	list_head();
+	time_t restart_time;
+} restart_item_t;
+
+typedef struct {
+	int magic;
+	time_t expire_timeout;
+	int max_restarts;
+	int restart_count;
+	restart_item_t *restart_nodes;
+} restart_info_t;
+
+
+#define VALIDATE(arg, ret) \
+do { \
+	if (((restart_info_t *)arg)->magic != RESTART_INFO_MAGIC) {\
+		errno = EINVAL; \
+		return ret; \
+	} \
+} while(0)
+
+
+/* Remove expired restarts */
+static int
+restart_timer_purge(restart_counter_t arg, time_t now)
+{
+	restart_info_t *restarts = (restart_info_t *)arg;
+	restart_item_t *i;
+	int x, done = 0;
+
+	VALIDATE(arg, -1);
+
+	/* No timeout */
+	if (restarts->expire_timeout == 0)
+		return 0;
+
+	do {
+		done = 1;
+		list_for(&restarts->restart_nodes, i, x) {
+			if ((now - i->restart_time) >=
+			    restarts->expire_timeout) {
+				restarts->restart_count--;
+				list_remove(&restarts->restart_nodes, i);
+				done = 0;
+				break;
+			}
+		}
+	} while(!done);
+
+	return 0;
+}
+
+
+int
+restart_count(restart_counter_t arg)
+{
+	restart_info_t *restarts = (restart_info_t *)arg;
+	time_t now;
+
+	VALIDATE(arg, -1);
+	now = time(NULL);
+	restart_timer_purge(arg, now);
+	return restarts->restart_count;
+}
+
+
+/* Add a restart entry to the list.  Returns 1 if restart
+   count is exceeded */
+int
+restart_add(restart_counter_t arg)
+{
+	restart_info_t *restarts = (restart_info_t *)arg;
+	restart_item_t *i;
+	time_t t;
+
+	if (!arg)
+		/* No max restarts / threshold = always
+		   ok to restart! */
+		return 0;
+
+	VALIDATE(arg, -1);
+
+	i = malloc(sizeof(*i));
+	if (!i) {
+		return -1;
+	}
+
+	t = time(NULL);
+	i->restart_time = t;
+
+	list_insert(&restarts->restart_nodes, i);
+	restarts->restart_count++;
+
+	/* Check and remove old entries */
+	restart_timer_purge(restarts, t);
+
+	if (restarts->restart_count > restarts->max_restarts)
+		return 1;
+
+	return 0;
+}
+
+
+int
+restart_clear(restart_counter_t arg)
+{
+	restart_info_t *restarts = (restart_info_t *)arg;
+	restart_item_t *i;
+
+	VALIDATE(arg, -1);
+	while ((i = restarts->restart_nodes)) {
+		list_remove(&restarts->restart_nodes, i);
+		free(i);
+	}
+
+	restarts->restart_count = 0;
+
+	return 0;
+}
+
+
+restart_counter_t
+restart_init(time_t expire_timeout, int max_restarts)
+{
+	restart_info_t *info;
+
+	if (max_restarts < 0) {
+		errno = EINVAL;
+		return NULL;
+	}
+
+	info = malloc(sizeof(*info));
+	if (info == NULL)
+		return NULL;
+
+	info->magic = RESTART_INFO_MAGIC;
+	info->expire_timeout = expire_timeout;
+	info->max_restarts = max_restarts;
+	info->restart_count = 0;
+
+	return (void *)info;
+}
+
+
+int
+restart_cleanup(restart_counter_t arg)
+{
+	VALIDATE(arg, -1);
+	restart_clear(arg);
+	free(arg);
+	return 0;
+}
--- cluster/rgmanager/src/daemons/Makefile	2007/08/28 04:35:47	1.23
+++ cluster/rgmanager/src/daemons/Makefile	2007/11/30 20:36:17	1.24
@@ -31,12 +31,14 @@
 	rg_queue.o \
 	rg_state.o \
 	rg_thread.o \
+	restart_counter.o \
 	watchdog.o
 
 OBJS2=	clurmtabd.o \
 	clurmtabd_lib.o
 
-OBJS3=	test-noccs.o
+OBJS3=	test-noccs.o \
+	restart_counter.o
 
 OBJS4=	dtest-noccs.o
 
--- cluster/rgmanager/src/daemons/fo_domain.c	2007/03/20 17:09:57	1.13
+++ cluster/rgmanager/src/daemons/fo_domain.c	2007/11/30 20:36:17	1.14
@@ -27,6 +27,7 @@
 #include <list.h>
 #include <clulog.h>
 #include <resgroup.h>
+#include <restart_counter.h>
 #include <reslist.h>
 #include <ccs.h>
 #include <pthread.h>
--- cluster/rgmanager/src/daemons/groups.c	2007/08/02 14:53:38	1.39
+++ cluster/rgmanager/src/daemons/groups.c	2007/11/30 20:36:17	1.40
@@ -20,6 +20,7 @@
 //#define DEBUG
 #include <platform.h>
 #include <resgroup.h>
+#include <restart_counter.h>
 #include <reslist.h>
 #include <vf.h>
 #include <message.h>
@@ -179,6 +180,29 @@
 }
 
 
+resource_node_t *
+node_by_ref(resource_node_t **tree, char *name)
+{
+	resource_t *res;
+	resource_node_t *node, *ret = NULL;
+	char rgname[64];
+	int x;
+
+	list_for(&_tree, node, x) {
+
+		res = node->rn_resource;
+		res_build_name(rgname, sizeof(rgname), res);
+
+		if (!strcasecmp(name, rgname)) {
+			ret = node;
+			break;
+		}
+	}
+
+	return ret;
+}
+
+
 int
 count_resource_groups_local(cman_node_t *mp)
 {
@@ -1587,6 +1611,28 @@
 }
 
 
+int
+check_restart(char *rg_name)
+{
+	resource_node_t *node;
+	int ret = 1;
+
+	pthread_rwlock_rdlock(&resource_lock);
+	node = node_by_ref(&_tree, rg_name);
+	if (node) {
+		ret = restart_add(node->rn_restart_counter);
+		if (ret) {
+			/* Clear it out - caller is about 
+			   to relocate the service anyway */
+			restart_clear(node->rn_restart_counter);
+		}
+	}
+	pthread_rwlock_unlock(&resource_lock);
+
+	return ret;
+}
+
+
 void
 kill_resource_groups(void)
 {
--- cluster/rgmanager/src/daemons/main.c	2007/09/19 09:54:19	1.44
+++ cluster/rgmanager/src/daemons/main.c	2007/11/30 20:36:17	1.45
@@ -166,6 +166,7 @@
 
 	old_membership = member_list();
 	new_ml = get_member_list(h);
+	memb_mark_down(new_ml, 0);
 
 	for (x = 0; x < new_ml->cml_count; x++) {
 
@@ -182,19 +183,25 @@
 			quorate = cman_is_listening(h,
 					new_ml->cml_members[x].cn_nodeid,
 					port);
+
 			if (quorate == 0) {
 				clulog(LOG_DEBUG, "Node %d is not listening\n",
 					new_ml->cml_members[x].cn_nodeid);
 				new_ml->cml_members[x].cn_member = 0;
 			} else if (quorate < 0) {
+				if (errno == ENOTCONN) {
+					new_ml->cml_members[x].cn_member = 0;
+					break;
+				}
 				perror("cman_is_listening");
 				usleep(50000);
 				continue;
 			}
-
 #ifdef DEBUG
-			printf("Node %d IS listening\n",
-			       new_ml->cml_members[x].cn_nodeid);
+		       	else {
+				printf("Node %d IS listening\n",
+				       new_ml->cml_members[x].cn_nodeid);
+			}
 #endif
 			break;
 		} while(1);
@@ -202,7 +209,6 @@
 
 	cman_finish(h);
 	member_list_update(new_ml);
-	member_set_state(0, 0);		/* Mark qdisk as dead */
 
 	/*
 	 * Handle nodes lost.  Do our local node event first.
--- cluster/rgmanager/src/daemons/reslist.c	2007/07/31 18:00:25	1.19
+++ cluster/rgmanager/src/daemons/reslist.c	2007/11/30 20:36:17	1.20
@@ -26,6 +26,7 @@
 #include <sys/types.h>
 #include <sys/stat.h>
 #include <list.h>
+#include <restart_counter.h>
 #include <reslist.h>
 #include <pthread.h>
 #ifndef NO_CCS
--- cluster/rgmanager/src/daemons/resrules.c	2007/07/31 18:00:25	1.23
+++ cluster/rgmanager/src/daemons/resrules.c	2007/11/30 20:36:17	1.24
@@ -27,6 +27,8 @@
 #include <sys/types.h>
 #include <sys/stat.h>
 #include <list.h>
+#include <ctype.h>
+#include <restart_counter.h>
 #include <reslist.h>
 #include <pthread.h>
 #include <dirent.h>
@@ -230,43 +232,70 @@
 
 
 int
-expand_time(char *val)
+expand_time (char *val)
 {
-	int l = strlen(val);
-	char c = val[l - 1];
-	int ret = atoi(val);
+	int curval, len;
+	int ret = 0;
+	char *start = val, ival[16];
 
-	if (ret <= 0)
-		return 0;
+	if (!val)
+		return (time_t)0;
+
+	while (start[0]) {
+
+		len = 0;
+		curval = 0;
+		memset(ival, 0, sizeof(ival));
+
+		while (isdigit(start[len])) {
+			ival[len] = start[len];
+			len++;
+		}
+
+		if (len) {
+			curval = atoi(ival);
+		} else {
+			len = 1;
+		}
 
-	if ((c >= '0') && (c <= '9'))
-		return ret;
+		switch(start[len]) {
+		case 0:
+		case 'S':
+		case 's':
+			break;
+		case 'M':
+        	case 'm':
+			curval *= 60;
+			break;
+		case 'h':
+		case 'H':
+			curval *= 3600;
+			break;
+		case 'd':
+		case 'D':
+			curval *= 86400;
+			break;
+		case 'w':
+		case 'W':
+			curval *= 604800;
+			break;
+		case 'y':
+		case 'Y':
+			curval *= 31536000;
+			break;
+		default:
+			curval = 0;
+		}
 
-	switch(c) {
-	case 'S':
-	case 's':
-		return (ret);
-	case 'M':
-	case 'm':
-		return (ret * 60);
-	case 'h':
-	case 'H':
-		return (ret * 3600);
-	case 'd':
-	case 'D':
-		return (ret * 86400);
-	case 'w':
-	case 'W':
-		return (ret * 604800);
-	case 'y':
-	case 'Y':
-		return (ret * 31536000);
+		ret += (time_t)curval;
+		start += len;
 	}
 
 	return ret;
 }
 
 
+
 /**
  * Store a resource action
  * @param actsp		Action array; may be modified and returned!
--- cluster/rgmanager/src/daemons/restree.c	2007/08/30 16:09:39	1.37
+++ cluster/rgmanager/src/daemons/restree.c	2007/11/30 20:36:17	1.38
@@ -30,6 +30,7 @@
 #include <sys/types.h>
 #include <sys/stat.h>
 #include <list.h>
+#include <restart_counter.h>
 #include <reslist.h>
 #include <pthread.h>
 #include <clulog.h>
@@ -432,6 +433,39 @@
 }
 
 
+static inline void
+assign_restart_policy(resource_t *curres, resource_node_t *parent,
+		      resource_node_t *node)
+{
+	char *val;
+	int max_restarts = 0;
+	time_t restart_expire_time = 0;
+
+	node->rn_restart_counter = NULL;
+
+	if (!curres || !node)
+		return;
+	if (parent) /* Non-parents don't get one for now */
+		return;
+
+	val = res_attr_value(curres, "max_restarts");
+	if (!val)
+		return;
+	max_restarts = atoi(val);
+	if (max_restarts <= 0)
+		return;
+	val = res_attr_value(curres, "restart_expire_time");
+	if (val) {
+		restart_expire_time = (time_t)expand_time(val);
+		if (!restart_expire_time)
+			return;
+	}
+
+	node->rn_restart_counter = restart_init(restart_expire_time,
+						max_restarts);
+}
+
+
 static inline int
 do_load_resource(int ccsfd, char *base,
 	         resource_rule_t *rule,
@@ -514,6 +548,7 @@
 	node->rn_state = RES_STOPPED;
 	node->rn_flags = 0;
 	node->rn_actions = (resource_act_t *)act_dup(curres->r_actions);
+	assign_restart_policy(curres, parent, node);
 
 	snprintf(tok, sizeof(tok), "%s/@__independent_subtree", base);
 #ifndef NO_CCS
@@ -769,6 +804,11 @@
 			destroy_resource_tree(&(*tree)->rn_child);
 
 		list_remove(tree, node);
+
+		if (node->rn_restart_counter) {
+			restart_cleanup(node->rn_restart_counter);
+		}
+
 		if(node->rn_actions){
 			free(node->rn_actions);
 		}
--- cluster/rgmanager/src/daemons/rg_state.c	2007/08/30 16:09:39	1.40
+++ cluster/rgmanager/src/daemons/rg_state.c	2007/11/30 20:36:18	1.41
@@ -1350,7 +1350,8 @@
 	}
 
 	if ((svcStatus.rs_state != RG_STATE_STOPPING) &&
-	     (svcStatus.rs_state != RG_STATE_ERROR)) {
+	    (svcStatus.rs_state != RG_STATE_ERROR) &&
+	    (svcStatus.rs_state != RG_STATE_RECOVER)) {
 		rg_unlock(&lockp);
 		return 0;
 	}
@@ -1829,8 +1830,10 @@
 	 * We got sent here from handle_start_req.
 	 * We're DONE.
 	 */
-	if (request == RG_START_RECOVER)
+	if (request == RG_START_RECOVER) {
+		_svc_stop_finish(svcName, 0, RG_STATE_STOPPED);
 		return RG_EFAIL;
+	}
 
 	/*
 	 * All potential places for the service to start have been exhausted.
@@ -1839,7 +1842,7 @@
 exhausted:
 	if (!rg_locked()) {
 		clulog(LOG_WARNING,
-		       "#70: Attempting to restart service %s locally.\n",
+		       "#70: Failed to relocate %s; restarting locally\n",
 		       svcName);
 		if (svc_start(svcName, RG_START_RECOVER) == 0) {
 			*new_owner = me;
@@ -2078,6 +2081,14 @@
 					   new_owner);
 	}
 
+	/* Check restart counter/timer for this resource */
+	if (check_restart(svcName) > 0) {
+		clulog(LOG_NOTICE, "Restart threshold for %s exceeded; "
+		       "attempting to relocate\n", svcName);
+		return handle_relocate_req(svcName, RG_START_RECOVER, -1,
+					   new_owner);
+	}
+
 	return handle_start_req(svcName, RG_START_RECOVER, new_owner);
 }
 
--- cluster/rgmanager/src/daemons/test.c	2007/07/31 18:02:49	1.12
+++ cluster/rgmanager/src/daemons/test.c	2007/11/30 20:36:18	1.13
@@ -25,6 +25,7 @@
 #include <sys/types.h>
 #include <sys/stat.h>
 #include <list.h>
+#include <restart_counter.h>
 #include <reslist.h>
 #include <pthread.h>
 #include <depends.h>


[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]