[Cluster-devel] cluster/group/gfs_controld cpg.c plock.c recover.c
teigland at sourceware.org
Fri Oct 6 15:34:53 UTC 2006
CVSROOT: /cvs/cluster
Module name: cluster
Changes by: teigland at sourceware.org 2006-10-06 15:34:52
Modified files:
group/gfs_controld: cpg.c plock.c recover.c
Log message:
- check cpg flow control status from openais when processing plocks
  (see the sketch below)
- handle the case where we're mounting and the only other mounted node
  fails -- we need to become the first mounter if we've not begun
  mount(2) yet
- journal recovery requests need to be fed serially to gfs; we weren't
  doing that in the case where a gfs journal recovery was in progress
  when another node failed (see the sketch after the recover.c patch)
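
For reference, the flow control check amounts to the pattern below. This is a
minimal sketch, not the gfs_controld code itself: the function names and the
send path are illustrative, and the include path assumes the openais CPG
headers of this era.

#include <sys/uio.h>
#include <openais/cpg.h>

static cpg_handle_t handle;     /* joined to the daemon's cpg group elsewhere */
static int flow_control_on;     /* plays the role of message_flow_control_on */

/* poll openais after dispatching; remember whether its cpg queue is backed up */
static int update_flow_control(void)
{
        cpg_flow_control_state_t state;
        cpg_error_t error;

        error = cpg_flow_control_state_get(handle, &state);
        if (error != CPG_OK)
                return -1;

        flow_control_on = (state == CPG_FLOW_CONTROL_ENABLED);
        return 0;
}

/* callers that generate plock traffic back off while flow control is on */
static int send_plock_message(struct iovec *iov, int iov_len)
{
        cpg_error_t error;

        if (flow_control_on)
                return 0;       /* skip for now; retry on the next poll pass */

        error = cpg_mcast_joined(handle, CPG_TYPE_AGREED, iov, iov_len);
        return (error == CPG_OK) ? 0 : -1;
}
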
Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/gfs_controld/cpg.c.diff?cvsroot=cluster&r1=1.7&r2=1.8
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/gfs_controld/plock.c.diff?cvsroot=cluster&r1=1.20&r2=1.21
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/gfs_controld/recover.c.diff?cvsroot=cluster&r1=1.17&r2=1.18
--- cluster/group/gfs_controld/cpg.c 2006/09/08 22:44:33 1.7
+++ cluster/group/gfs_controld/cpg.c 2006/10/06 15:34:52 1.8
@@ -19,6 +19,7 @@
static int saved_nodeid;
static int saved_len;
static char saved_data[MAX_MSGLEN];
+int message_flow_control_on;
void receive_journals(struct mountgroup *mg, char *buf, int len, int from);
void receive_options(struct mountgroup *mg, char *buf, int len, int from);
@@ -127,6 +128,7 @@
int process_cpg(void)
{
+ cpg_flow_control_state_t flow_control_state;
cpg_error_t error;
got_msg = 0;
@@ -142,6 +144,22 @@
if (got_msg)
do_deliver(saved_nodeid, saved_data, saved_len);
+
+ error = cpg_flow_control_state_get(daemon_handle, &flow_control_state);
+ if (error != CPG_OK) {
+ log_error("cpg_flow_control_state_get %d", error);
+ return -1;
+ }
+
+ if (flow_control_state == CPG_FLOW_CONTROL_ENABLED) {
+ message_flow_control_on = 1;
+ log_debug("flow control on");
+ } else {
+ if (message_flow_control_on)
+ log_debug("flow control off");
+ message_flow_control_on = 0;
+ }
+
return 0;
}
--- cluster/group/gfs_controld/plock.c 2006/08/31 19:13:02 1.20
+++ cluster/group/gfs_controld/plock.c 2006/10/06 15:34:52 1.21
@@ -46,6 +46,7 @@
static int control_fd = -1;
extern int our_nodeid;
static int plocks_online = 0;
+extern int message_flow_control_on;
static SaCkptHandleT ckpt_handle;
static SaCkptCallbacksT callbacks = { 0, 0 };
@@ -297,6 +298,10 @@
char *buf;
int len, rv;
+ /* Don't send more messages while the cpg message queue is backed up */
+ if (message_flow_control_on)
+ return 0;
+
memset(&info, 0, sizeof(info));
rv = read(control_fd, &info, sizeof(info));
--- cluster/group/gfs_controld/recover.c 2006/08/31 18:56:25 1.17
+++ cluster/group/gfs_controld/recover.c 2006/10/06 15:34:52 1.18
@@ -27,6 +27,7 @@
void start_participant_init_2(struct mountgroup *mg);
void start_spectator_init_2(struct mountgroup *mg);
void start_spectator_2(struct mountgroup *mg);
+void notify_mount_client(struct mountgroup *mg);
int set_sysfs(struct mountgroup *mg, char *field, int val)
{
@@ -1062,6 +1063,41 @@
unlink_checkpoint(mg);
resend_journals(mg);
}
+
+ /* Tricky situation when we're mounting and the failed node was
+ the only other node that had the fs mounted. If the failed node
+ didn't send us a journals message, we need to: unlink ckpt, pick a
+ journal for ourselves, act like the first mounter of the fs (do
+ first-mounter-recovery, the dead node may have been mounting itself
+ and not finished first-mounter-recovery). */
+
+ else if (neg && mg->memb_count == 1) {
+ if (!mg->got_our_journals) {
+ log_group(mg, "we are left alone, act first mounter");
+
+ unlink_checkpoint(mg);
+ memb = find_memb_nodeid(mg, our_nodeid);
+ memb->jid = 0;
+ mg->our_jid = 0;
+ mg->first_mounter = 1;
+ mg->first_mounter_done = 0;
+ mg->got_our_options = 1;
+ mg->got_our_journals = 1;
+ mg->mount_client_delay = 0;
+ notify_mount_client(mg);
+ } else if (mg->mount_client_notified && !mg->got_kernel_mount) {
+
+ /* FIXME */
+
+ log_group(mg, "FIXME: case not handled");
+
+ /* we got journals message from other node before it
+ died which means it finished first mounter recovery,
+ but we now need to tell gfs to recover the journal
+ after our own mount(2) completes */
+
+ }
+ }
}
struct mountgroup *create_mg(char *name)
@@ -1121,7 +1157,7 @@
static int we_are_in_fence_domain(void)
{
group_data_t data;
- int i, rv;
+ int rv;
memset(&data, 0, sizeof(data));
@@ -1130,11 +1166,8 @@
if (rv || strcmp(data.client_name, "fence"))
return 0;
- for (i = 0; i < data.member_count; i++) {
- if (data.members[i] == our_nodeid)
- return 1;
- }
-
+ if (data.member == 1)
+ return 1;
return 0;
}
@@ -1304,8 +1337,16 @@
the problem we're trying to avoid here is telling gfs-kernel to do
recovery when it can't for some reason and then waiting forever for
- a recovery_done signal that will never arrive. */
+ a recovery_done signal that will never arrive.
+ FIXME: we want to do more here to avoid telling gfs-kernel to do recovery
+ until our mount is really complete. I want to keep the join/mount
+ connection between mount.gfs and gfs_controld open throughout the mount
+ and have mount.gfs use it to return the result from mount(2). Then we'll
+ know when the mount(2) is done and we should also be able to remove the
+ special mount_error_fd since errors can be sent back through the original
+ connection as well. */
+
void recover_journals(struct mountgroup *mg)
{
struct mg_member *memb;
@@ -1318,24 +1359,25 @@
mg->kernel_mount_error ||
!mg->mount_client_notified ||
!mg->got_kernel_mount) {
+ log_group(mg, "recover_journals: unable %d,%d,%d,%d,%d,%d,%d",
+ mg->spectator,
+ mg->readonly,
+ mg->withdraw,
+ mg->our_jid,
+ mg->kernel_mount_error,
+ mg->mount_client_notified,
+ mg->got_kernel_mount);
list_for_each_entry(memb, &mg->members_gone, list) {
- if (!memb->tell_gfs_to_recover)
- continue;
-
- log_group(mg, "recover journal %d nodeid %d skip: "
- "%d %d %d %d %d %d %d",
- memb->jid, memb->nodeid,
- mg->spectator,
- mg->readonly,
- mg->withdraw,
- mg->our_jid,
- mg->kernel_mount_error,
- mg->mount_client_notified,
- mg->got_kernel_mount);
-
- memb->tell_gfs_to_recover = 0;
- memb->local_recovery_status = RS_READONLY;
+ log_group(mg, "member gone %d jid %d "
+ "tell_gfs_to_recover %d",
+ memb->nodeid, memb->jid,
+ memb->tell_gfs_to_recover);
+
+ if (memb->tell_gfs_to_recover) {
+ memb->tell_gfs_to_recover = 0;
+ memb->local_recovery_status = RS_READONLY;
+ }
}
start_done(mg);
return;
@@ -1346,6 +1388,15 @@
through the single recovery_done sysfs file */
list_for_each_entry(memb, &mg->members_gone, list) {
+ if (memb->wait_gfs_recover_done) {
+ log_group(mg, "delay new gfs recovery, "
+ "wait_gfs_recover_done for nodeid %d jid %d",
+ memb->nodeid, memb->jid);
+ return;
+ }
+ }
+
+ list_for_each_entry(memb, &mg->members_gone, list) {
if (!memb->tell_gfs_to_recover)
continue;
@@ -1416,6 +1467,17 @@
return 0;
}
+int need_kernel_recovery_done(struct mountgroup *mg)
+{
+ struct mg_member *memb;
+
+ list_for_each_entry(memb, &mg->members_gone, list) {
+ if (memb->wait_gfs_recover_done)
+ return 1;
+ }
+ return 0;
+}
+
/* Note: when a readonly node fails we do consider its journal (and the
fs) to need recovery... not sure this is really necessary, but
the readonly node did "own" a journal so it seems proper to recover
@@ -1500,19 +1562,13 @@
log_group(mg, "recovery_done jid %d nodeid %d %s",
memb->jid, memb->nodeid, ss);
- out:
- recover_journals(mg);
- return 0;
-}
-int need_kernel_recovery_done(struct mountgroup *mg)
-{
- struct mg_member *memb;
+ /* sanity check */
+ if (need_kernel_recovery_done(mg))
+ log_error("recovery_done: should be no pending gfs recoveries");
- list_for_each_entry(memb, &mg->members_gone, list) {
- if (memb->wait_gfs_recover_done)
- return 1;
- }
+ out:
+ recover_journals(mg);
return 0;
}
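
As a footnote on the third log item: the serialization added to
recover_journals() reduces to a gate that never asks gfs to recover a new
journal while a previously requested recovery has not yet reported back
through recovery_done. A minimal sketch of that gate follows; it is not the
gfs_controld code (the real function walks mg->members_gone with
list_for_each_entry), and the array and helper name here are illustrative.

struct gone_member {
        int jid;
        int tell_gfs_to_recover;        /* recovery still to be requested */
        int wait_gfs_recover_done;      /* requested, waiting for recovery_done */
};

/* return the next jid to hand to gfs, or -1 if we must wait (or are done) */
static int next_journal_to_recover(struct gone_member *gone, int count)
{
        int i;

        /* a recovery is already in flight; recovery_done must clear it first */
        for (i = 0; i < count; i++)
                if (gone[i].wait_gfs_recover_done)
                        return -1;

        /* otherwise feed gfs exactly one new journal per call */
        for (i = 0; i < count; i++) {
                if (!gone[i].tell_gfs_to_recover)
                        continue;
                gone[i].tell_gfs_to_recover = 0;
                gone[i].wait_gfs_recover_done = 1;
                return gone[i].jid;
        }
        return -1;
}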