[Cluster-devel] cluster/rgmanager/src/resources Makefile netfs ...

lhh at sourceware.org lhh at sourceware.org
Tue Dec 4 21:59:54 UTC 2007


CVSROOT:	/cvs/cluster
Module name:	cluster
Branch: 	RHEL5
Changes by:	lhh at sourceware.org	2007-12-04 21:59:54

Modified files:
	rgmanager/src/resources: Makefile netfs.sh 
Added files:
	rgmanager/src/resources: default_event_script.sl 

Log message:
	Port force-unmount from RHEL4 branch

Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/resources/default_event_script.sl.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=NONE&r2=1.1.2.1
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/resources/Makefile.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.13.2.6&r2=1.13.2.7
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/resources/netfs.sh.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.7.2.2&r2=1.7.2.3

/cvs/cluster/cluster/rgmanager/src/resources/default_event_script.sl,v  -->  standard output
revision 1.1.2.1
--- cluster/rgmanager/src/resources/default_event_script.sl
+++ -	2007-12-04 21:59:54.602852000 +0000
@@ -0,0 +1,291 @@
+% Return 1 if node is an element of node_list, 0 otherwise.
+define node_in_set(node_list, node)
+{
+	variable idx = 0;
+	variable count = length(node_list);
+	while (idx < count) {
+		if (node_list[idx] == node)
+			return 1;
+		idx++;
+	}
+	return 0;
+}
+
+% Decide whether service must be started or moved, then act on it.
+%   service   - name of the service to evaluate
+%   node_list - candidate nodes; node_list[0] is treated as the best one
+% Returns ERR_DEPEND, ERR_DOMAIN, -1, ERR_RUNNING or ERR_ABORT when the
+% service is not (re)started, otherwise the result of service_start().
+define move_or_start(service, node_list)
+{
+	variable owner, state, dep;
+
+	% A dependency without an owner (owner < 0) blocks this service.
+	dep = service_property(service, "depend");
+	if (dep != "") {
+		(owner, state) = service_status(dep);
+		if (owner < 0) {
+			debug(service, " is not runnable; dependency not met");
+			return ERR_DEPEND;
+		}
+	}
+
+	(owner, state) = service_status(service);
+	debug("Evaluating ", service, " state=", state, " owner=", owner);
+
+	if (length(node_list) == 0) {
+		debug(service, " is not runnable");
+		return ERR_DOMAIN;
+	}
+
+	% Failed services are never touched here; disabled ones only for
+	% user events.  Commenting this check out will -not- let event
+	% scripts recover failed services - it only produces a false log
+	% message about starting the service.
+	if ((state == "failed") or
+	    ((state == "disabled") and (event_type != EVENT_USER))) {
+		debug(service, " is not runnable");
+		return -1;
+	}
+
+	if (owner == node_list[0]) {
+		debug(service, " is already running on best node");
+		return ERR_RUNNING;
+	}
+
+	if ((owner < 0) or (node_in_set(node_list, owner) != 1)) {
+		notice("Starting ", service, " on ", node_list);
+	} else {
+		% Running on an allowed but non-preferred node: stop first.
+		notice("Moving ", service, " from ", owner,
+		       " to ", node_list);
+		if (service_stop(service) < 0) {
+			return ERR_ABORT;
+		}
+	}
+
+	return service_start(service, node_list);
+}
+
+
+%
+% Returns the set of online nodes in preferred/shuffled order which
+% are allowed to run this service.  Gives highest preference to the
+% current owner if nofailback is specified or the failover domain is
+% not ordered, provided the owner is itself in the allowed set.
+% The failover domain settings come from service_domain_info().
+%   service - name of the service the node list is computed for
+%
+define allowed_nodes(service)
+{
+	variable candidates;
+	variable online;
+	variable nodes_domain;
+	variable ordered, restricted, nofailback;
+	variable state, owner;
+
+	(nofailback, restricted, ordered, nodes_domain) =
+			service_domain_info(service);
+
+	(owner, state) = service_status(service);
+
+	% Shuffle the array so we don't start all services on the same
+	% node.  TODO - add RR, Least-services, placement policies...
+	online = shuffle(nodes_online());
+
+	% Failover domain members that are currently online.
+	candidates = intersection(nodes_domain, online);
+	if (restricted != 1) {
+		% Ordered failover domains (nodes_domain) unioned with the
+		% online nodes basically just reorders the online node list
+		% according to failover domain priority rules.
+		candidates = union(candidates, online);
+	}
+
+	% Ordered domain with failback allowed: keep the priority order.
+	if ((nofailback != 1) and (ordered != 0)) {
+		return candidates;
+	}
+
+	% No current owner, or the owner is not an allowed node.
+	if ((owner < 0) or (node_in_set(candidates, owner) == 0)) {
+		return candidates;
+	}
+
+	% Because union takes left as priority, we can
+	% return the union of the current owner with the
+	% allowed node list.  This means the service will
+	% remain on the same node it's currently on.
+	return union(owner, candidates);
+}
+
+
+% Node event: run move_or_start() for each entry of service_list().
+define default_node_event_handler()
+{
+	variable services = service_list();
+	variable nodes, idx = 0;
+
+	while (idx < length(services)) {
+		nodes = allowed_nodes(services[idx]);
+		() = move_or_start(services[idx], nodes);
+		idx++;
+	}
+}
+
+
+% Service event handler.  A service in the "recovering" state is
+% disabled, restarted or relocated according to its "recovery"
+% property.  For any other state, every service whose "depend"
+% property names service_name is started when service_name has
+% started, or stopped when service_name no longer has an owner.
+define default_service_event_handler()
+{
+	variable services = service_list();
+	variable x, depends, policy, nodes, tmp;
+	variable owner, state;
+
+	% debug("Executing default service event handler");
+
+	if (service_state == "recovering") {
+
+		policy = service_property(service_name, "recovery");
+		debug("Recovering",
+		      " Service: ", service_name,
+		      " Last owner: ", service_last_owner,
+		      " Policy: ", policy);
+
+		if (policy == "disable") {
+			() = service_stop(service_name, 1);
+			return;
+		}
+
+		nodes = allowed_nodes(service_name);
+		if (policy == "restart") {
+			% Try the last owner first.  Assign to nodes: that
+			% is the list move_or_start() receives below.
+			nodes = union(service_last_owner, nodes);
+		} else {
+			% relocate: all other nodes first, the last owner
+			% kept only as the final fallback.
+			tmp = subtract(nodes, service_last_owner);
+			nodes = union(tmp, service_last_owner);
+		}
+
+		()=move_or_start(service_name, nodes);
+
+		return;
+	}
+
+	for (x = 0; x < length(services); x++) {
+		if (service_name == services[x]) {
+			% don't do anything to ourself!
+			continue;
+		}
+
+		% Simplistic dependency handling
+		depends = service_property(services[x], "depend");
+
+		% Not a dependent of service_name; do nothing
+		if (depends != service_name) {
+			continue;
+		}
+
+		(owner, state) = service_status(services[x]);
+		if ((service_state == "started") and (owner < 0)) {
+			info("Dependency met; starting ", services[x]);
+			nodes = allowed_nodes(services[x]);
+			()=move_or_start(services[x], nodes);
+		}
+
+		% service died - stop service(s) that depend on the dead
+		if ((service_owner < 0) and (owner >= 0)) {
+			info("Dependency lost; stopping ", services[x]);
+			()=service_stop(services[x]);
+		}
+	}
+}
+
+define default_config_event_handler()
+{
+	% Does nothing yet.  debug("Executing default config event handler");
+}
+
+%
+% User event handler: carries out user_request on service_name, using
+% user_target as the preferred node for relocate/enable.
+% Restart, relocate and enable end in move_or_start(); disable and
+% stop end in service_stop().  Returns ERR_ABORT when a stop that has
+% to precede the start fails, otherwise the result of that last call.
+% ret is never assigned for any other request (migrate is still todo).
+%
+define default_user_event_handler()
+{
+	variable ret;
+	variable nodes;
+	variable owner, state;
+
+	nodes = allowed_nodes(service_name);
+	(owner, state) = service_status(service_name);
+
+	if (user_request == USER_RESTART) {
+
+		% Put the current owner, if any, at the head of the list.
+		if (owner >= 0) {
+			nodes = union(owner, nodes);
+		}
+
+		notice("Stopping ", service_name, " for relocate to ", nodes);
+
+		% A failed stop aborts the request.
+		if (service_stop(service_name) < 0) {
+			return ERR_ABORT;
+		}
+
+		ret = move_or_start(service_name, nodes);
+
+	} else if ((user_request == USER_RELOCATE) or
+		   (user_request == USER_ENABLE)) {
+
+		if (user_target > 0) {
+			if (node_in_set(nodes, user_target) == 1) {
+				% Requested node is allowed: try it first.
+				nodes = union(user_target, nodes);
+			} else {
+				warning("User specified node ", user_target,
+					" is offline");
+			}
+		}
+
+		% Relocating a service that has an owner: stop it first.
+		if ((owner >= 0) and (user_request == USER_RELOCATE)) {
+			if (service_stop(service_name) < 0) {
+				return ERR_ABORT;
+			}
+		}
+
+		ret = move_or_start(service_name, nodes);
+
+	} else if (user_request == USER_DISABLE) {
+
+		ret = service_stop(service_name, 1);
+
+	} else if (user_request == USER_STOP) {
+
+		ret = service_stop(service_name);
+
+	}
+	% todo - migrate
+
+	return ret;
+}
+% Dispatch on event_type; only the user handler's result is kept (user_return).
+if (event_type == EVENT_NODE)
+	default_node_event_handler();
+if (event_type == EVENT_SERVICE)
+	default_service_event_handler();
+if (event_type == EVENT_CONFIG)
+	default_config_event_handler();
+if (event_type == EVENT_USER)
+	user_return=default_user_event_handler();
+
--- cluster/rgmanager/src/resources/Makefile	2007/07/12 11:23:16	1.13.2.6
+++ cluster/rgmanager/src/resources/Makefile	2007/12/04 21:59:54	1.13.2.7
@@ -34,6 +34,9 @@
 	utils/httpd-parse-config.pl utils/tomcat-parse-config.pl \
 	utils/member_util.sh
 
+EVENT_TARGETS= \
+	default_event_script.sl
+
 all:
 
 install: all
@@ -44,6 +47,7 @@
 	install $(TARGETS) ${sharedir}
 	install $(UTIL_TARGETS) ${sharedir}/utils
 	install -m 644 $(METADATA) ${sharedir}
+	install -m 644 $(EVENT_TARGETS) ${sharedir}
 
 uninstall:
 	${UNINSTALL} ${UTIL_TARGETS} ${sharedir}/utils
--- cluster/rgmanager/src/resources/netfs.sh	2007/10/03 16:44:15	1.7.2.2
+++ cluster/rgmanager/src/resources/netfs.sh	2007/12/04 21:59:54	1.7.2.3
@@ -348,6 +348,112 @@
 	return $NO
 }
 
+#
+# killMountProcesses mount_point
+#
+# Using lsof or fuser try to unmount the mount by killing of the processes
+# that might be keeping it busy.
+#
+killMountProcesses()
+{
+        typeset -i ret=$SUCCESS
+        typeset have_lsof=""
+        typeset have_fuser=""
+        typeset try
+
+        if [ $# -ne 1 ]; then
+                ocf_log err \
+                        "Usage: killMountProcesses mount_point"
+                return $FAIL
+        fi
+
+        typeset mp=$1
+
+        ocf_log notice "Forcefully unmounting $mp"
+
+        #
+        # Not all distributions have lsof.  If not use fuser.  If it
+        # does, try both.
+        #
+        file=$(which lsof 2>/dev/null)
+        if [ -f "$file" ]; then
+                have_lsof=$YES
+        fi
+
+        file=$(which fuser 2>/dev/null)
+        if [ -f "$file" ]; then
+                have_fuser=$YES
+        fi
+
+        if [ -z "$have_lsof" -a -z "$have_fuser" ]; then
+                ocf_log warn \
+        "Cannot forcefully unmount $mp; cannot find lsof or fuser commands"
+                return $FAIL
+        fi
+
+        for try in 1 2 3; do
+                if [ -n "$have_lsof" ]; then
+                        #
+                        # Use lsof to free up mount point
+                        #
+                        while read command pid user
+                        do
+                                if [ -z "$pid" ]; then
+                                        continue
+                                fi
+
+                                if [ $try -eq 1 ]; then
+                                        ocf_log warn \
+                                  "killing process $pid ($user $command $mp)"
+                                elif [ $try -eq 3 ]; then
+                                        ocf_log crit \
+                                  "Could not clean up mountpoint $mp"
+                                ret=$FAIL
+                                fi
+
+                                if [ $try -gt 1 ]; then
+                                        kill -9 $pid
+                                else
+                                        kill -TERM $pid
+                                fi
+                        done < <(lsof -w -bn 2>/dev/null | \
+                            grep -w -E "$mp(/.*|)\$" | \
+                            awk '{print $1,$2,$3}' | \
+                            sort -u -k 1,3)
+                elif [ -n "$have_fuser" ]; then
+                        #
+                        # Use fuser to free up mount point
+                        #
+                        while read command pid user
+                        do
+                                if [ -z "$pid" ]; then
+                                        continue
+                                fi
+
+                                if [ $try -eq 1 ]; then
+                                        ocf_log warn \
+                                  "killing process $pid ($user $command $mp)"
+                                elif [ $try -eq 3 ]; then
+                                        ocf_log crit \
+                                    "Could not clean up mount point $mp"
+                                        ret=$FAIL
+                                fi
+
+                                if [ $try -gt 1 ]; then
+                                        kill -9 $pid
+                                else
+                                        kill -TERM $pid
+                                fi
+                        done < <(fuser -vm $mp | \
+                            grep -v PID | \
+                            sed 's;^'$mp';;' | \
+                            awk '{print $4,$2,$1}' | \
+                            sort -u -k 1,3)
+                fi
+        done
+
+        return $ret
+}
 
 #
 # startNFSFilesystem
@@ -498,8 +604,8 @@
 	#
 	if [ -n "$mp" ]; then
 		case ${OCF_RESKEY_force_unmount} in
-	        $YES_STR)	force_umount="-f" ;;
-		0)		force_umount="-f" ;;
+	        $YES_STR)	force_umount="$YES" ;;
+		1)		force_umount="$YES" ;;
 	        *)		force_umount="" ;;
 		esac
 	fi
@@ -507,6 +613,7 @@
 	#
 	# Unmount
 	#
+        while [ ! "$done" ]; do
 	isMounted $fullpath $mp
 	case $? in
 	$NO)
@@ -519,26 +626,46 @@
 		;;
 	$YES)
 		sync; sync; sync
-		ocf_log info "unmounting $fullpath ($mp)"
+                        ocf_log info "unmounting $mp"
 
-		umount $force_umount $mp
+                        umount $mp
 		if  [ $? -eq 0 ]; then
-			return $SUCCESS
+                                umount_failed=
+                                done=$YES
+                                continue
 		fi
 
 		umount_failed=yes
 
+                        if [ "$force_umount" ]; then
+                                killMountProcesses $mp
+                        fi
+
+                        if [ $try -ge $max_tries ]; then
+                                done=$YES
+                        else
+                                sleep $sleep_time
+                                let try=try+1
+                        fi
 		;;
 	*)
 		return $FAIL
 		;;
 	esac
 
+                if [ $try -ge $max_tries ]; then
+                        done=$YES
+                else
+                        sleep $sleep_time
+                        let try=try+1
+                fi
+        done # while
 	if [ -n "$umount_failed" ]; then
 		ocf_log err "'umount $fullpath' failed ($mp), error=$ret_val"
 
 		return $FAIL
 	fi
+
 	return $SUCCESS
 }
 




More information about the Cluster-devel mailing list