[Cluster-devel] cluster/rgmanager/src/resources Makefile netfs ...
lhh at sourceware.org
lhh at sourceware.org
Tue Dec 4 21:59:54 UTC 2007
CVSROOT: /cvs/cluster
Module name: cluster
Branch: RHEL5
Changes by: lhh at sourceware.org 2007-12-04 21:59:54
Modified files:
rgmanager/src/resources: Makefile netfs.sh
Added files:
rgmanager/src/resources: default_event_script.sl
Log message:
Port force-unmount from RHEL4 branch
Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/resources/default_event_script.sl.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=NONE&r2=1.1.2.1
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/resources/Makefile.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.13.2.6&r2=1.13.2.7
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/resources/netfs.sh.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.7.2.2&r2=1.7.2.3
/cvs/cluster/cluster/rgmanager/src/resources/default_event_script.sl,v --> standard output
revision 1.1.2.1
--- cluster/rgmanager/src/resources/default_event_script.sl
+++ - 2007-12-04 21:59:54.602852000 +0000
@@ -0,0 +1,291 @@
+define node_in_set(node_list, node)
+{
+ variable x, len;
+
+ len = length(node_list);
+ for (x = 0; x < len; x++) {
+ if (node_list[x] == node)
+ return 1;
+ }
+
+ return 0;
+}
+
+define move_or_start(service, node_list)
+{
+ variable len;
+ variable state, owner;
+ variable depends;
+
+ depends = service_property(service, "depend");
+ if (depends != "") {
+ (owner, state) = service_status(depends);
+ if (owner < 0) {
+ debug(service, " is not runnable; dependency not met");
+ return ERR_DEPEND;
+ }
+ }
+
+ (owner, state) = service_status(service);
+ debug("Evaluating ", service, " state=", state, " owner=", owner);
+
+ len = length(node_list);
+ if (len == 0) {
+ debug(service, " is not runnable");
+ return ERR_DOMAIN;
+ }
+
+ if (((event_type != EVENT_USER) and (state == "disabled")) or (state == "failed")) {
+ %
+ % Commenting out this block will -not- allow you to
+ % recover failed services from event scripts. Sorry.
+ % All it will get you is a false log message about
+ % starting this service.
+ %
+ % You may enable disabled services, but I recommend
+ % against it.
+ %
+ debug(service, " is not runnable");
+ return -1;
+ }
+
+ if (node_list[0] == owner) {
+ debug(service, " is already running on best node");
+ return ERR_RUNNING;
+ }
+
+ if ((owner >= 0) and (node_in_set(node_list, owner) == 1)) {
+ notice("Moving ", service, " from ", owner,
+ " to ", node_list);
+ if (service_stop(service) < 0) {
+ return ERR_ABORT;
+ }
+ } else {
+ notice("Starting ", service, " on ", node_list);
+ }
+
+ return service_start(service, node_list);
+}
+
+
+%
+% Returns the set of online nodes in preferred/shuffled order which
+% are allowed to run this service. Gives highest preference to current
+% owner if nofailback is specified.
+%
+define allowed_nodes(service)
+{
+ variable anodes;
+ variable online;
+ variable nodes_domain;
+ variable ordered, restricted, nofailback;
+ variable state, owner;
+ variable depends;
+
+ (nofailback, restricted, ordered, nodes_domain) =
+ service_domain_info(service);
+
+ (owner, state) = service_status(service);
+
+ anodes = nodes_online();
+
+ % Shuffle the array so we don't start all services on the same
+ % node. TODO - add RR, Least-services, placement policies...
+ online = shuffle(anodes);
+
+ if (restricted == 1) {
+ anodes = intersection(nodes_domain, online);
+ } else {
+ % Ordered failover domains (nodes_domain) unioned with the
+ % online nodes basically just reorders the online node list
+ % according to failover domain priority rules.
+ anodes = union(intersection(nodes_domain, online),
+ online);
+ }
+
+ if ((nofailback == 1) or (ordered == 0)) {
+
+ if ((owner < 0) or (node_in_set(anodes, owner) == 0)) {
+ return anodes;
+ }
+
+ % Because union takes left as priority, we can
+ % return the union of the current owner with the
+ % allowed node list. This means the service will
+ % remain on the same node it's currently on.
+ return union(owner, anodes);
+ }
+
+ return anodes;
+}
+
+
+define default_node_event_handler()
+{
+ variable services = service_list();
+ variable x;
+ variable nodes;
+
+ % debug("Executing default node event handler");
+ for (x = 0; x < length(services); x++) {
+ nodes = allowed_nodes(services[x]);
+ ()=move_or_start(services[x], nodes);
+ }
+}
+
+
+define default_service_event_handler()
+{
+ variable services = service_list();
+ variable x;
+ variable depends;
+ variable policy;
+ variable nodes;
+ variable tmp;
+ variable owner;
+ variable state;
+
+ % debug("Executing default service event handler");
+
+ if (service_state == "recovering") {
+
+ policy = service_property(service_name, "recovery");
+ debug("Recovering",
+ " Service: ", service_name,
+ " Last owner: ", service_last_owner,
+ " Policy: ", policy);
+
+ if (policy == "disable") {
+ () = service_stop(service_name, 1);
+ return;
+ }
+
+ nodes = allowed_nodes(service_name);
+ if (policy == "restart") {
+ tmp = union(service_last_owner, nodes);
+ } else {
+ % relocate
+ tmp = subtract(nodes, service_last_owner);
+ nodes = tmp;
+ tmp = union(nodes, service_last_owner);
+ }
+
+ ()=move_or_start(service_name, nodes);
+
+ return;
+ }
+
+ for (x = 0; x < length(services); x++) {
+ if (service_name == services[x]) {
+ % don't do anything to ourself!
+ continue;
+ }
+
+ %
+ % Simplistic dependency handling
+ %
+ depends = service_property(services[x], "depend");
+
+ % Not dependent on the changed service; do nothing
+ if (depends != service_name) {
+ continue;
+ }
+
+ (owner, state) = service_status(services[x]);
+ if ((service_state == "started") and (owner < 0)) {
+ info("Dependency met; starting ", services[x]);
+ nodes = allowed_nodes(services[x]);
+ ()=move_or_start(services[x], nodes);
+ }
+
+ % service died - stop service(s) that depend on the dead service
+ if ((service_owner < 0) and (owner >= 0)) {
+ info("Dependency lost; stopping ", services[x]);
+ ()=service_stop(services[x]);
+ }
+ }
+}
+
+define default_config_event_handler()
+{
+ % debug("Executing default config event handler");
+}
+
+define default_user_event_handler()
+{
+ variable ret;
+ variable nodes;
+ variable reordered;
+ variable x;
+ variable target = user_target;
+ variable found = 0;
+ variable owner, state;
+
+ nodes = allowed_nodes(service_name);
+ (owner, state) = service_status(service_name);
+
+ if (user_request == USER_RESTART) {
+
+ if (owner >= 0) {
+ reordered = union(owner, nodes);
+ nodes = reordered;
+ }
+
+ notice("Stopping ", service_name, " for relocate to ", nodes);
+
+ found = service_stop(service_name);
+ if (found < 0) {
+ return ERR_ABORT;
+ }
+
+ ret = move_or_start(service_name, nodes);
+
+ } else if ((user_request == USER_RELOCATE) or
+ (user_request == USER_ENABLE)) {
+
+ if (user_target > 0) {
+ for (x = 0; x < length(nodes); x++) {
+ if (nodes[x] == user_target) {
+ reordered = union(user_target, nodes);
+ nodes = reordered;
+ found = 1;
+ }
+ }
+
+ if (found == 0) {
+ warning("User specified node ", user_target,
+ " is offline");
+ }
+ }
+
+ if ((owner >= 0) and (user_request == USER_RELOCATE)) {
+ if (service_stop(service_name) < 0) {
+ return ERR_ABORT;
+ }
+ }
+
+ ret = move_or_start(service_name, nodes);
+
+ } else if (user_request == USER_DISABLE) {
+
+ ret = service_stop(service_name, 1);
+
+ } else if (user_request == USER_STOP) {
+
+ ret = service_stop(service_name);
+
+ }
+ % todo - migrate
+
+ return ret;
+}
+
+if (event_type == EVENT_NODE)
+ default_node_event_handler();
+if (event_type == EVENT_SERVICE)
+ default_service_event_handler();
+if (event_type == EVENT_CONFIG)
+ default_config_event_handler();
+if (event_type == EVENT_USER)
+ user_return=default_user_event_handler();
+
--- cluster/rgmanager/src/resources/Makefile 2007/07/12 11:23:16 1.13.2.6
+++ cluster/rgmanager/src/resources/Makefile 2007/12/04 21:59:54 1.13.2.7
@@ -34,6 +34,9 @@
utils/httpd-parse-config.pl utils/tomcat-parse-config.pl \
utils/member_util.sh
+EVENT_TARGETS= \
+ default_event_script.sl
+
all:
install: all
@@ -44,6 +47,7 @@
install $(TARGETS) ${sharedir}
install $(UTIL_TARGETS) ${sharedir}/utils
install -m 644 $(METADATA) ${sharedir}
+ install -m 644 $(EVENT_TARGETS) ${sharedir}
uninstall:
${UNINSTALL} ${UTIL_TARGETS} ${sharedir}/utils
--- cluster/rgmanager/src/resources/netfs.sh 2007/10/03 16:44:15 1.7.2.2
+++ cluster/rgmanager/src/resources/netfs.sh 2007/12/04 21:59:54 1.7.2.3
@@ -348,6 +348,112 @@
return $NO
}
+#
+# killMountProcesses mount_point
+#
+# Using lsof or fuser, try to unmount the mount point by killing off the
+# processes that might be keeping it busy.
+#
+killMountProcesses()
+{
+ typeset -i ret=$SUCCESS
+ typeset have_lsof=""
+ typeset have_fuser=""
+ typeset try
+
+ if [ $# -ne 1 ]; then
+ ocf_log err \
+ "Usage: killMountProcesses mount_point"
+ return $FAIL
+ fi
+
+ typeset mp=$1
+
+ ocf_log notice "Forcefully unmounting $mp"
+
+ #
+ # Not all distributions have lsof; prefer it when available,
+ # otherwise fall back to fuser.
+ #
+ file=$(which lsof 2>/dev/null)
+ if [ -f "$file" ]; then
+ have_lsof=$YES
+ fi
+
+ file=$(which fuser 2>/dev/null)
+ if [ -f "$file" ]; then
+ have_fuser=$YES
+ fi
+
+ if [ -z "$have_lsof" -a -z "$have_fuser" ]; then
+ ocf_log warn \
+ "Cannot forcefully unmount $mp; cannot find lsof or fuser commands"
+ return $FAIL
+ fi
+
+ for try in 1 2 3; do
+ if [ -n "$have_lsof" ]; then
+ #
+ # Use lsof to free up mount point
+ #
+ while read command pid user
+ do
+ if [ -z "$pid" ]; then
+ continue
+ fi
+
+ if [ $try -eq 1 ]; then
+ ocf_log warn \
+ "killing process $pid ($user $command $mp)"
+ elif [ $try -eq 3 ]; then
+ ocf_log crit \
+ "Could not clean up mountpoint $mp"
+ ret=$FAIL
+ fi
+
+ if [ $try -gt 1 ]; then
+ kill -9 $pid
+ else
+ kill -TERM $pid
+ fi
+ done < <(lsof -w -bn 2>/dev/null | \
+ grep -w -E "$mp(/.*|)\$" | \
+ awk '{print $1,$2,$3}' | \
+ sort -u -k 1,3)
+ elif [ -n "$have_fuser" ]; then
+ #
+ # Use fuser to free up mount point
+ #
+ while read command pid user
+ do
+ if [ -z "$pid" ]; then
+ continue
+ fi
+
+ if [ $try -eq 1 ]; then
+ ocf_log warn \
+ "killing process $pid ($user $command $mp)"
+ elif [ $try -eq 3 ]; then
+ ocf_log crit \
+ "Could not clean up mount point $mp"
+ ret=$FAIL
+ fi
+
+ if [ $try -gt 1 ]; then
+ kill -9 $pid
+ else
+ kill -TERM $pid
+ fi
+ done < <(fuser -vm $mp | \
+ grep -v PID | \
+ sed 's;^'$mp';;' | \
+ awk '{print $4,$2,$1}' | \
+ sort -u -k 1,3)
+ fi
+ done
+
+ return $ret
+}
#
# startNFSFilesystem
@@ -498,8 +604,8 @@
#
if [ -n "$mp" ]; then
case ${OCF_RESKEY_force_unmount} in
- $YES_STR) force_umount="-f" ;;
- 0) force_umount="-f" ;;
+ $YES_STR) force_umount="$YES" ;;
+ 1) force_umount="$YES" ;;
*) force_umount="" ;;
esac
fi
@@ -507,6 +613,7 @@
#
# Unmount
#
+ while [ ! "$done" ]; do
isMounted $fullpath $mp
case $? in
$NO)
@@ -519,26 +626,46 @@
;;
$YES)
sync; sync; sync
- ocf_log info "unmounting $fullpath ($mp)"
+ ocf_log info "unmounting $mp"
- umount $force_umount $mp
+ umount $mp
if [ $? -eq 0 ]; then
- return $SUCCESS
+ umount_failed=
+ done=$YES
+ continue
fi
umount_failed=yes
+ if [ "$force_umount" ]; then
+ killMountProcesses $mp
+ fi
+
+ if [ $try -ge $max_tries ]; then
+ done=$YES
+ else
+ sleep $sleep_time
+ let try=try+1
+ fi
;;
*)
return $FAIL
;;
esac
+ if [ $try -ge $max_tries ]; then
+ done=$YES
+ else
+ sleep $sleep_time
+ let try=try+1
+ fi
+ done # while
if [ -n "$umount_failed" ]; then
ocf_log err "'umount $fullpath' failed ($mp), error=$ret_val"
return $FAIL
fi
+
return $SUCCESS
}
More information about the Cluster-devel
mailing list