[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]

[Cluster-devel] [PATCH] rhel5 rgmanager: Pause during exit if we stopped services



Unlike rgmanager 1.9.x, later versions of rgmanager rely on
openais/corosync for messaging.  This messaging is quite
reliable and has proved useful.

However, one drawback is that if you stop rgmanager and
corosync/cman in quick succession, the other nodes in the cluster
cannot restart services because message traffic is interrupted
for the duration of the token timeout.

There is no simple solution to this problem.  Rgmanager could
(in theory) find new placements for services prior to stopping,
but this is a large amount of design work; it was never designed
to run policies in the exit path.

A far simpler approach is to pause during exit, giving the other
nodes time to restart the services we stopped.

NOTE: This solution does not and cannot work with central
      processing mode.

Resolves: rhbz#619468

Signed-off-by: Lon Hohberger <lhh redhat com>
---
 rgmanager/include/event.h        |    1 +
 rgmanager/include/resgroup.h     |    2 +-
 rgmanager/src/daemons/groups.c   |   15 +++++++++++++--
 rgmanager/src/daemons/main.c     |   14 ++++++++++++--
 rgmanager/src/daemons/rg_event.c |    7 +++++++
 5 files changed, 34 insertions(+), 5 deletions(-)

diff --git a/rgmanager/include/event.h b/rgmanager/include/event.h
index 7e628d8..e63dffd 100644
--- a/rgmanager/include/event.h
+++ b/rgmanager/include/event.h
@@ -136,6 +136,7 @@ int slang_process_event(event_table_t *event_table, event_t *ev);
 
 /* For distributed events. */
 void set_transition_throttling(int nsecs);
+int get_transition_throttling(void);
 
 /* Simplified service start. */
 int service_op_start(char *svcName, int *target_list, int target_list_len,
diff --git a/rgmanager/include/resgroup.h b/rgmanager/include/resgroup.h
index 793ad3b..4be4dbc 100644
--- a/rgmanager/include/resgroup.h
+++ b/rgmanager/include/resgroup.h
@@ -160,7 +160,7 @@ void send_ret(msgctx_t *ctx, char *name, int ret, int req, int newowner);
 
 /* do this op on all resource groups.  The handler for the request 
    will sort out whether or not it's a valid request given the state */
-void rg_doall(int request, int block, char *debugfmt);
+int rg_doall(int request, int block, const char *debugfmt);
 void do_status_checks(void); /* Queue status checks for locally running
 				services */
 
diff --git a/rgmanager/src/daemons/groups.c b/rgmanager/src/daemons/groups.c
index ecb7b85..b546421 100644
--- a/rgmanager/src/daemons/groups.c
+++ b/rgmanager/src/daemons/groups.c
@@ -1292,12 +1292,21 @@ svc_exists(char *svcname)
 }
 
 
-void
-rg_doall(int request, int block, char *debugfmt)
+/*
+ * Perform an operation on all resource groups.
+ *
+ * Returns the number of requests queued.  This value is
+ * only used during shutdown, where we queue RG_STOP_EXITING
+ * only for services we have running locally as an optimization.
+ */
+int
+rg_doall(int request, int block,
+	 const char *debugfmt)
 {
 	resource_node_t *curr;
 	rg_state_t svcblk;
 	char rg[64];
+	int queued = 0;
 
 	pthread_rwlock_rdlock(&resource_lock);
 	list_do(&_tree, curr) {
@@ -1322,6 +1331,7 @@ rg_doall(int request, int block, char *debugfmt)
 
 		rt_enqueue_request(rg, request, NULL, 0,
 				   0, 0, 0);
+		++queued;
 	} while (!list_done(&_tree, curr));
 
 	pthread_rwlock_unlock(&resource_lock);
@@ -1331,6 +1341,7 @@ rg_doall(int request, int block, char *debugfmt)
 	   other rgmanagers to complete. */
 	if (block) 
 		rg_wait_threads();
+	return queued;
 }
 
 
diff --git a/rgmanager/src/daemons/main.c b/rgmanager/src/daemons/main.c
index aa78cef..1c7f746 100644
--- a/rgmanager/src/daemons/main.c
+++ b/rgmanager/src/daemons/main.c
@@ -72,6 +72,7 @@ static int signalled = 0;
 static int port = RG_PORT;
 static char *rgmanager_lsname = "rgmanager"; /* XXX default */
 static int status_poll_interval = DEFAULT_CHECK_INTERVAL;
+static int stops_queued = 0;
 
 int next_node_id(cluster_member_list_t *membership, int me);
 
@@ -1041,7 +1042,7 @@ void *
 shutdown_thread(void __attribute__ ((unused)) *arg)
 {
 	rg_lockall(L_SYS|L_SHUTDOWN);
-	rg_doall(RG_STOP_EXITING, 1, NULL);
+	stops_queued = rg_doall(RG_STOP_EXITING, 1, NULL);
 	running = 0;
 
 	pthread_exit(NULL);
@@ -1219,8 +1220,17 @@ out_cleanup:
 		clu_lock_finished(rgmanager_lsname);
 
 out:
-	clulog(LOG_NOTICE, "Shutdown complete, exiting\n");
+	clulog(LOG_DEBUG, "Stopped %d services\n", stops_queued);
+	clulog(LOG_NOTICE, "Disconnecting from CMAN\n");
 	cman_finish(clu);
+
+	if (stops_queued && !central_events_enabled()) {
+		clulog(LOG_DEBUG, "Pausing to allow services to "
+		       "start on other node(s)\n");
+		sleep(get_transition_throttling() * 3);
+	}
+
+	clulog(LOG_NOTICE, "Exiting\n");
 	
 	/*malloc_dump_table(); */ /* Only works if alloc.c us used */
 	/*malloc_stats();*/
diff --git a/rgmanager/src/daemons/rg_event.c b/rgmanager/src/daemons/rg_event.c
index 82c20c0..606d41b 100644
--- a/rgmanager/src/daemons/rg_event.c
+++ b/rgmanager/src/daemons/rg_event.c
@@ -69,6 +69,13 @@ set_transition_throttling(int nsecs)
 }
 
 
+int
+get_transition_throttling(void)
+{
+	return transition_throttling;
+}
+
+
 void
 set_central_events(int flag)
 {
-- 
1.7.3.4


[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]