[Cluster-devel] cluster/group/gfs_controld recover.c
teigland at sourceware.org
Tue Dec 19 22:19:02 UTC 2006
CVSROOT: /cvs/cluster
Module name: cluster
Changes by: teigland at sourceware.org 2006-12-19 22:19:02
Modified files:
group/gfs_controld: recover.c
Log message:
Fixes related to the needs_recovery state and first-mounter recovery.
Probably not perfect yet, but working in the tests I'm able to contrive.
bz 218551
Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/gfs_controld/recover.c.diff?cvsroot=cluster&r1=1.26&r2=1.27
--- cluster/group/gfs_controld/recover.c 2006/12/19 17:05:59 1.26
+++ cluster/group/gfs_controld/recover.c 2006/12/19 22:19:01 1.27
@@ -29,6 +29,7 @@
void start_spectator_2(struct mountgroup *mg);
void notify_mount_client(struct mountgroup *mg);
+
int set_sysfs(struct mountgroup *mg, char *field, int val)
{
char fname[512];
@@ -325,6 +326,9 @@
continue;
if (memb->jid == -9)
continue;
+ if (memb->spectator || memb->readonly || memb->withdrawing ||
+ memb->ms_kernel_mount_done)
+ continue;
if (low == -1 || memb->nodeid < low) {
next = memb;
low = memb->nodeid;
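
The hunk above narrows which members can be picked as the next first-mounter
candidate. As a rough standalone illustration (the struct shape, list layout,
and function name below are assumptions; only the skip conditions come from
the patch), the selection amounts to taking the lowest eligible nodeid:

#include <stddef.h>

/* Sketch only: a simplified member record and the lowest-nodeid scan. */
struct member {
    struct member *next;
    int nodeid;
    int jid;
    int spectator;
    int readonly;
    int withdrawing;
    int ms_kernel_mount_done;
};

static struct member *pick_next_mounter(struct member *head)
{
    struct member *memb, *next = NULL;
    int low = -1;

    for (memb = head; memb; memb = memb->next) {
        if (memb->jid == -9)
            continue;
        /* the patch's new skips: members that presumably can't act
           as the next first mounter */
        if (memb->spectator || memb->readonly ||
            memb->withdrawing || memb->ms_kernel_mount_done)
            continue;
        if (low == -1 || memb->nodeid < low) {
            next = memb;
            low = memb->nodeid;
        }
    }
    return next;
}

Spectator, readonly, and withdrawing members can't do the recovery work, and a
member whose kernel mount already completed is presumably past the point of
taking on the first-mounter role, so all of them are skipped.
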
@@ -641,12 +645,11 @@
goto out;
}
- if (mg->needs_recovery) {
- log_group(mg, "receive_remount from %d needs_recovery", from);
- msg = "error: needs recovery";
- error = -1;
- goto out;
- }
+ /* FIXME: check if we've even fully completed our normal mount yet
+ (received our own mount-status?); if not, then disallow the remount */
+
+ /* FIXME: going ro->rw may mean we can now do journal or first-mounter
+ recovery that we couldn't do before. */
memb->readonly = ro;
memb->rw = !ro;
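
The first FIXME could conceivably be handled with a guard like the following;
the got_our_mount_status field is invented for this sketch, not something the
patch adds:

/* Hypothetical only: one way the first FIXME might look. */
struct remount_state {
    int got_our_mount_status;  /* assumed: set once we see our own
                                  mount-status message come back */
};

static int remount_allowed(struct remount_state *st)
{
    if (!st->got_our_mount_status)
        return 0;  /* normal mount not fully done; disallow remount */
    return 1;
}
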
@@ -746,30 +749,19 @@
else if (memb->readonly)
ro_count++;
- if (memb->opts & MEMB_OPT_RECOVER)
+ if (memb->opts & MEMB_OPT_RECOVER) {
memb_recover = memb;
+ log_group(mg, "assign_journal: memb %d has OPT_RECOVER",
+ memb->nodeid);
+ }
if (memb->ms_kernel_mount_done && !memb->ms_kernel_mount_error)
memb_mounted = memb;
}
- log_group(mg, "assign_journal: total %d iv %d rw %d ro %d spect %d",
- total, invalid_count, rw_count, ro_count, spect_count);
-
- /* do we let the new member mount? jid=-2 means no.
- - we only allow an rw mount when the fs needs recovery
- - we only allow a single rw mount when the fs needs recovery */
-
- if (mg->needs_recovery) {
- if (!new->rw || rw_count)
- new->jid = -2;
- }
-
- if (new->jid == -2) {
- log_group(mg, "assign_journal: fail - needs_recovery %d",
- mg->needs_recovery);
- goto out;
- }
+ log_group(mg, "assign_journal: total %d iv %d rw %d ro %d spect %d "
+ "needs_recovery %d", total, invalid_count, rw_count,
+ ro_count, spect_count, mg->needs_recovery);
if (new->spectator) {
log_group(mg, "assign_journal: new spectator allowed");
@@ -785,17 +777,33 @@
}
}
- /* Currently the fs needs recovery, i.e. none of the current
- mounters (ro/spectators) can recover journals. So, this new rw
- mounter is told to do first-mounter recovery of all the journals. */
-
+ /* Repeat first-mounter recovery: the fs has been mounted and in use,
+ but nodes have failed and none of the current mounters has been able
+ to do recovery (all remaining nodes may be ro/spect, for example).
+ This puts us into the special "needs_recovery" state where new
+ mounters are asked to do first-mounter recovery of the fs while
+ the current mounters sit in a blocked state. */
+
if (mg->needs_recovery) {
- log_group(mg, "assign_journal: memb %d gets OPT_RECOVER, "
- "needs_recovery", new->nodeid);
- new->opts |= MEMB_OPT_RECOVER;
+ if (!memb_recover) {
+ log_group(mg, "assign_journal: needs_recovery: "
+ "new memb %d gets OPT_RECOVER",
+ new->nodeid);
+ new->opts |= MEMB_OPT_RECOVER;
+ } else {
+ log_group(mg, "assign_journal: needs_recovery: "
+ "new memb %d memb %d has OPT_RECOVER",
+ new->nodeid, memb_recover->nodeid);
+ }
goto out;
}
+ /* Initial first-mounter recovery: the fs is coming online, the first
+ mg member assumes the first-mounter role and other nodes join the mg
+ while the first-mounter is working. These non-first mounters wait
+ for the first-mounter to finish before notifying mount.gfs. If the
+ first-mounter fails, one of them will become the first-mounter. */
+
/* it shouldn't be possible to have someone doing first mounter
recovery and also have someone with the fs fully mounted */
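
Taken together, the needs_recovery branch above boils down to: hand
MEMB_OPT_RECOVER to the new mounter only if nobody else already holds it. A
simplified sketch (the types, the flag value, and the helper name are
illustrative stand-ins, not the daemon's real definitions):

#define MEMB_OPT_RECOVER 0x1   /* stand-in value for this sketch */

struct memb_sketch {
    int nodeid;
    unsigned int opts;
};

/* If the fs needs recovery, ask the new mounter to do first-mounter
   recovery, but only when no existing member already holds OPT_RECOVER. */
static void maybe_assign_recover(struct memb_sketch *new_memb,
                                 struct memb_sketch *memb_recover,
                                 int needs_recovery)
{
    if (!needs_recovery)
        return;
    if (!memb_recover)
        new_memb->opts |= MEMB_OPT_RECOVER;
    /* else: another member is already acting as the (pseudo) first
       mounter; the new mounter waits for its result */
}

Keeping at most one OPT_RECOVER holder at a time matches the comment above:
the other current mounters sit blocked until that one first-mounter reports a
result.
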
@@ -839,7 +847,8 @@
mg->kernel_mount_done, mg->kernel_mount_error,
mg->first_mounter, mg->first_mounter_done);
- log_group(mg, "assign_journal: memb %d gets OPT_RECOVER", new->nodeid);
+ log_group(mg, "assign_journal: new memb %d gets OPT_RECOVER for: "
+ "fs not mounted", new->nodeid);
new->opts |= MEMB_OPT_RECOVER;
out:
@@ -1006,7 +1015,7 @@
/* delay notifying mount client until we get a successful
mount status from the first mounter */
log_group(mg, "other node doing first mounter recovery, "
- "delay notify_mount_client");
+ "set mount_client_delay");
mg->mount_client_delay = 1;
mg->save_plocks = 0;
return;
@@ -1402,7 +1411,6 @@
if (memb_gone_recover) {
log_group(mg, "failed node %d had MEMB_OPT_RECOVER",
memb_gone_recover->nodeid);
- ASSERT(!mg->mount_client_notified);
memb_gone_recover->tell_gfs_to_recover = 0;
}
@@ -2168,14 +2176,39 @@
return 0;
}
-/* FIXME: what happens if a node is unmounting, others have it in members_gone,
- and it crashes? It shouldn't need journal recovery since the kernel umount
- happens before leaving the group. */
+/* After a start that initiated a recovery, everyone will go and see whether
+ they can do recovery, and try if they can. If a node can't, it does
+ start_done; if it tries and fails, it does start_done; if it tries and
+ succeeds, it sends a message and then does start_done once it receives it
+ back. So,
+ when we get a finish we know that we have all the results from the recovery
+ cycle and can judge if everything is recovered properly or not. If so, we
+ can unblock locks (in the finish), if not, we leave them blocked (in the
+ finish).
+
+ If we leave locks blocked in the finish, then they can only be unblocked
+ after someone is able to do the recovery that's needed. So, leaving locks
+ blocked in a finish because recovery hasn't worked puts us into a special
+ state: the fs needs recovery, none of the current mounters has been able to
+ recover it, all current mounters have locks blocked in gfs, new mounters
+ are allowed, nodes can unmount, new mounters are asked to do first-mounter
+ recovery; if one of them succeeds, then we can all clear this special state
+ and unblock locks (the unblock would happen upon receiving the success
+ message from the new pseudo-first mounter, not as part of a finish); future
+ finishes would then go back to being able to unblock locks.
+
+ While in this special state, a new node has been added and asked to do
+ first-mounter recovery; other nodes can also be added while the new
+ first-mounter is active. These other nodes don't notify mount.gfs.
+ They'll receive the result of the first mounter, and if it succeeded they'll
+ notify mount.gfs; otherwise one of them will become the next first-mounter
+ and notify mount.gfs. */
int do_finish(struct mountgroup *mg)
{
struct mg_member *memb, *safe;
- int leave_blocked = 0;
+
+ log_group(mg, "finish %d needs_recovery %d", mg->last_finish,
+ mg->needs_recovery);
/* members_gone list are the members that were removed from the
members list when processing a start. members are removed
@@ -2192,11 +2225,10 @@
list_del(&memb->list);
free(memb);
} else {
+ log_error("%s finish: needs recovery jid %d nodeid %d "
+ "status %d", mg->name, memb->jid,
+ memb->nodeid, memb->recovery_status);
mg->needs_recovery = 1;
- log_group(mg, "finish: needs recovery "
- "jid %d nodeid %d status %d",
- memb->jid, memb->nodeid,
- memb->recovery_status);
}
}
@@ -2210,12 +2242,7 @@
return 0;
}
- if (mg->needs_recovery) {
- log_group(mg, "finish: leave locks blocked for needs_recovery");
- leave_blocked = 1;
- }
-
- if (!leave_blocked) {
+ if (!mg->needs_recovery) {
set_sysfs(mg, "block", 0);
/* we may have been holding back our local mount due to
@@ -2224,7 +2251,8 @@
mg->mount_client_delay = 0;
notify_mount_client(mg);
}
- }
+ } else
+ log_group(mg, "finish: leave locks blocked for needs_recovery");
return 0;
}
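
For reference, the reworked tail of do_finish() reduces to the following
shape. This is a compilable sketch with stubbed-out helpers, not the daemon's
code; the mount_client_delay guard is assumed from the surrounding context,
since the hunk cuts off the original condition:

#include <stdio.h>

/* Stand-ins for the daemon's types and helpers; bodies are stubs. */
struct mountgroup {
    const char *name;
    int needs_recovery;
    int mount_client_delay;
};

static int set_sysfs(struct mountgroup *mg, const char *field, int val)
{
    /* the real function writes val into the fs's sysfs file */
    printf("%s: set %s to %d\n", mg->name, field, val);
    return 0;
}

static void notify_mount_client(struct mountgroup *mg)
{
    printf("%s: notify mount.gfs\n", mg->name);
}

/* Unblock locks only when no journal still needs recovery; otherwise
   they stay blocked until a later first-mounter succeeds. */
static int finish_unblock(struct mountgroup *mg)
{
    if (!mg->needs_recovery) {
        set_sysfs(mg, "block", 0);
        if (mg->mount_client_delay) {  /* assumed guard */
            mg->mount_client_delay = 0;
            notify_mount_client(mg);
        }
    } else
        printf("%s: finish: leave locks blocked for needs_recovery\n",
               mg->name);
    return 0;
}
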