[Cluster-devel] cluster/rgmanager src/daemons/rg_forward.c src ...
lhh at sourceware.org
lhh at sourceware.org
Wed Jan 3 21:08:18 UTC 2007
CVSROOT: /cvs/cluster
Module name: cluster
Branch: RHEL4
Changes by: lhh at sourceware.org 2007-01-03 21:08:17
Modified files:
rgmanager/src/daemons: rg_forward.c
rgmanager/src/utils: clusvcadm.c
rgmanager/include: resgroup.h
Log message:
Resolves: 201396
Part 1: Make rgmanager check the states of nodes during forward operations to remote nodes
Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/rg_forward.c.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.2.2.2&r2=1.2.2.3
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/utils/clusvcadm.c.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.2.2.7&r2=1.2.2.8
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/include/resgroup.h.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.3.2.8&r2=1.3.2.9
--- cluster/rgmanager/src/daemons/rg_forward.c 2006/12/13 18:19:56 1.2.2.2
+++ cluster/rgmanager/src/daemons/rg_forward.c 2007/01/03 21:08:16 1.2.2.3
@@ -48,8 +48,9 @@
rg_state_t rgs;
request_t *req = (request_t *)arg;
void *lockp;
- int fd;
+ int fd, ret;
SmMessageSt msg;
+ cluster_member_list_t *m = NULL;
if (rg_lock(req->rr_group, &lockp) != 0) {
msg_close(req->rr_resp_fd);
@@ -88,19 +89,43 @@
pthread_exit(NULL);
}
- if (msg_receive(fd, &msg, sizeof(msg)) != sizeof(msg)) {
- msg_close(fd);
- msg_close(req->rr_resp_fd);
- rq_free(req);
- pthread_exit(NULL);
- }
+ /*
+ * Ok, we're forwarding a message to another node. Keep tabs on
+ * the node to make sure it doesn't die. Basically, wake up every
+ * now and again to make sure it's still online. If it isn't, send
+ * a response back to the caller.
+ */
+ do {
+ ret = msg_receive_timeout(fd, &msg, sizeof(msg), 10);
+ if (ret < (int)sizeof(msg)) {
+ if (ret < 0 && errno == ETIMEDOUT) {
+ m = member_list();
+ if (!memb_online(m, rgs.rs_owner)) {
+ msg.sm_data.d_ret = RG_ENODEDEATH;
+ /* we decode down below,
+ * so encode here */
+ swab_SmMessageSt(&msg);
+ break;
+ }
+ cml_free(m);
+ m = NULL;
+ continue;
+ }
+ msg_close(fd);
+ msg_close(req->rr_resp_fd);
+ goto out;
+ }
+ break;
+ } while(1);
+
+ if (m)
+ cml_free(m);
msg_close(fd);
swab_SmMessageSt(&msg);
send_response(msg.sm_data.d_ret, req->rr_target, req);
-
+out:
rq_free(req);
-
pthread_exit(NULL);
}
--- cluster/rgmanager/src/utils/clusvcadm.c 2006/12/13 18:19:56 1.2.2.7
+++ cluster/rgmanager/src/utils/clusvcadm.c 2007/01/03 21:08:17 1.2.2.8
@@ -147,6 +147,43 @@
}
+int
+do_msg_receive(uint64_t msgtarget, int fd, void *buf, size_t len)
+{
+ int ret;
+ cluster_member_list_t *m = NULL;
+
+ if ((int64_t)msgtarget < (int64_t)0)
+ return msg_receive(fd, buf, len);
+
+ /* Make sure a node hasn't died while processing our request. */
+ do {
+ ret = msg_receive_timeout(fd, buf, len, 20);
+ if (ret < (int)len) {
+ if (ret < 0 && errno == ETIMEDOUT) {
+ m = clu_member_list(RG_SERVICE_GROUP);
+ if (!memb_online(m, msgtarget)) {
+ ret = RG_ENODEDEATH;
+ break;
+ }
+ cml_free(m);
+ m = NULL;
+ continue;
+ }
+
+ /* Make sure we don't overwrite ENODEDEATH */
+ if (ret < 0)
+ ret = -1;
+ }
+ break;
+ } while(1);
+
+ if (m)
+ cml_free(m);
+ return ret;
+}
+
+
void
usage(char *name)
{
@@ -259,7 +296,6 @@
usage(basename(argv[0]));
return 1;
}
-
/* No login */
fd = clu_connect(RG_SERVICE_GROUP, 0);
@@ -294,10 +330,15 @@
fflush(stdout);
msgfd = msg_open(msgtarget, RG_PORT, 0, 5);
} else {
- printf("Trying to relocate %s to %s", svcname, nodename);
+ if (node_specified)
+ printf("Trying to relocate %s to %s", svcname, nodename);
+ else
+ printf("Trying to relocate %s", svcname);
printf("...");
fflush(stdout);
msgfd = msg_open(me, RG_PORT, 0, 5);
+ /* just do a normal receive from the local node */
+ msgtarget = (uint64_t)-1;
}
if (msgfd < 0) {
@@ -312,10 +353,25 @@
return 1;
}
- if (msg_receive(msgfd, &msg, sizeof(msg)) != sizeof(msg)) {
- perror("msg_receive");
- fprintf(stderr, "Error receiving reply!\n");
- return 1;
+ /* reusing opt */
+ opt = do_msg_receive(msgtarget, msgfd, &msg,
+ sizeof(msg));
+ if (opt < (int)sizeof(msg)) {
+ if (opt != RG_ENODEDEATH) {
+ perror("msg_receive");
+ fprintf(stderr, "Error receiving reply!\n");
+ return 1;
+ }
+
+ /*
+	 * XXX hack to enable node death processing alongside
+ * all the rest of the possible responses. If an end-node
+ * died while processing, this will have been set by the
+ * rgmanager and a response with RG_ENODEDEATH as the d_ret
+ * would have been received.
+ */
+ msg.sm_data.d_ret = RG_ENODEDEATH;
+ swab_SmMessageSt(&msg);
}
/* Decode */
@@ -346,6 +402,10 @@
case RG_EFAIL:
printf("failed\n");
break;
+ case RG_ENODEDEATH:
+ printf("node processing request died\n");
+ printf("(Status unknown)\n");
+ break;
case RG_EABORT:
printf("cancelled by resource manager\n");
break;
--- cluster/rgmanager/include/resgroup.h 2006/12/13 18:19:57 1.3.2.8
+++ cluster/rgmanager/include/resgroup.h 2007/01/03 21:08:17 1.3.2.9
@@ -156,6 +156,7 @@
cluster_member_list_t *member_list(void);
uint64_t my_id(void);
+#define RG_ENODEDEATH -8 /* Processing node died */
#define RG_ERUN -7 /* Service is running already */
#define RG_EAGAIN -6 /* Try again */
#define RG_EDEADLCK -5 /* Operation would cause deadlock */
More information about the Cluster-devel
mailing list