[Cluster-devel] cluster/group/gfs_controld lock_dlm.h plock.c ...
teigland at sourceware.org
teigland at sourceware.org
Mon Aug 21 17:46:20 UTC 2006
CVSROOT: /cvs/cluster
Module name: cluster
Changes by: teigland at sourceware.org 2006-08-21 17:46:20
Modified files:
group/gfs_controld: lock_dlm.h plock.c recover.c
Log message:
- the check that makes us unlink the ckpt when we become the new low
nodeid after the previous one failed wasn't adequately verifying that
the old low node had actually failed
- rename low_finished_nodeid to master_nodeid and clarify some of the
code that uses it, since the old name and code were confusing and misleading
Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/gfs_controld/lock_dlm.h.diff?cvsroot=cluster&r1=1.15&r2=1.16
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/gfs_controld/plock.c.diff?cvsroot=cluster&r1=1.18&r2=1.19
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/gfs_controld/recover.c.diff?cvsroot=cluster&r1=1.14&r2=1.15
--- cluster/group/gfs_controld/lock_dlm.h 2006/08/18 16:33:08 1.15
+++ cluster/group/gfs_controld/lock_dlm.h 2006/08/21 17:46:19 1.16
@@ -148,8 +148,8 @@
int first_mounter_done;
int emulate_first_mounter;
int wait_first_done;
- int low_finished_nodeid;
int low_nodeid;
+ int master_nodeid;
int save_plocks;
uint64_t cp_handle;
--- cluster/group/gfs_controld/plock.c 2006/08/18 16:33:08 1.18
+++ cluster/group/gfs_controld/plock.c 2006/08/21 17:46:19 1.19
@@ -1374,7 +1374,8 @@
saCkptSectionIterationFinalize(itr);
out:
if (mg->low_nodeid == our_nodeid) {
- log_group(mg, "retrieve_plocks: unlink ckpt from old low node");
+ /* we're the new low nodeid, will be master */
+ log_group(mg, "retrieve_plocks: unlink ckpt from old master");
_unlink_checkpoint(mg, &name);
} else
saCkptCheckpointClose(h);
--- cluster/group/gfs_controld/recover.c 2006/08/18 16:33:08 1.14
+++ cluster/group/gfs_controld/recover.c 2006/08/21 17:46:19 1.15
@@ -514,9 +514,8 @@
free(buf);
}
-/* We set the new member's jid to the lowest unused jid.
- If we're the lowest existing member (by nodeid), then
- send jid info to the new node. */
+/* We set the new member's jid to the lowest unused jid. If we're the lowest
+ existing member (by nodeid), then send jid info to the new node. */
/* Look at rw/ro/spectator status of all existing mounters and whether
we need to do recovery. Based on that, decide if the current mount
@@ -590,14 +589,14 @@
log_group(mg, "assign_journal: new member %d got jid %d",
new->nodeid, new->jid);
- if (mg->low_finished_nodeid == our_nodeid)
+ if (mg->master_nodeid == our_nodeid) {
store_plocks(mg, new->nodeid);
- /* if we're the first mounter and haven't gotten others_may_mount
- yet, then don't send journals until kernel_recovery_done_first
- so the second node won't mount the fs until omm. */
+ /* if we're the first mounter and haven't gotten
+ others_may_mount yet, then don't send journals until
+ kernel_recovery_done_first so the second node won't mount
+ the fs until omm. */
- if (mg->low_finished_nodeid == our_nodeid) {
if (mg->first_mounter && !mg->first_mounter_done) {
log_group(mg, "delay sending journals to %d",
new->nodeid);
@@ -911,13 +910,63 @@
clear_memb_list(&mg->members_gone);
}
+/* New mounters may be waiting for a journals message that a failed node (as
+ master) would have sent. If the master failed and we're the new master,
+ then send a journals message to any nodes for whom we've not seen a journals
+ message. We also need to checkpoint the plock state for the new nodes to
+ read after they get their journals message. */
+
+void resend_journals(struct mountgroup *mg)
+{
+ struct mg_member *memb;
+ int stored_plocks = 0;
+
+ list_for_each_entry(memb, &mg->members, list) {
+ if (!memb->needs_journals)
+ continue;
+
+ if (!stored_plocks) {
+ store_plocks(mg, memb->nodeid);
+ stored_plocks = 1;
+ }
+
+ log_group(mg, "resend_journals to %d", memb->nodeid);
+ send_journals(mg, memb->nodeid);
+ }
+}
+
+/* The master node is the member of the group with the lowest nodeid who
+ was also a member of the last "finished" group, i.e. a member of the
+ group the last time it got a finish callback. The job of the master
+ is to send state info to new nodes joining the group, and doing that
+ requires that the master has all the state to send -- a new joining
+ node that has the lowest nodeid doesn't have any state, which is why
+ we add the "finished" requirement. */
+
+void update_master_nodeid(struct mountgroup *mg)
+{
+ struct mg_member *memb;
+ int new = -1, low = -1;
+
+ list_for_each_entry(memb, &mg->members, list) {
+ if (low == -1 || memb->nodeid < low)
+ low = memb->nodeid;
+ if (!memb->finished)
+ continue;
+ if (new == -1 || memb->nodeid < new)
+ new = memb->nodeid;
+ }
+ mg->master_nodeid = new;
+ mg->low_nodeid = low;
+}
+
/* This can happen before we receive a journals message for our mount. */
void recover_members(struct mountgroup *mg, int num_nodes,
int *nodeids, int *pos_out, int *neg_out)
{
struct mg_member *memb, *safe;
- int i, found, id, pos = 0, neg = 0, low = -1, old_low_finished_nodeid;
+ int i, found, id, pos = 0, neg = 0, prev_master_nodeid;
/* move departed nodes from members list to members_gone */
@@ -982,30 +1031,31 @@
log_group(mg, "add member %d", id);
}
- list_for_each_entry(memb, &mg->members, list) {
- if (mg->low_nodeid == -1 || memb->nodeid < mg->low_nodeid)
- mg->low_nodeid = memb->nodeid;
- if (!memb->finished)
- continue;
- if (low == -1 || memb->nodeid < low)
- low = memb->nodeid;
- }
- old_low_finished_nodeid = mg->low_finished_nodeid;
- mg->low_finished_nodeid = low;
+ prev_master_nodeid = mg->master_nodeid;
+ update_master_nodeid(mg);
*pos_out = pos;
*neg_out = neg;
- log_group(mg, "total members %d low_finished_nodeid %d",
- mg->memb_count, low);
+ log_group(mg, "total members %d master_nodeid %d prev %d",
+ mg->memb_count, mg->master_nodeid, prev_master_nodeid);
- /* the low nodeid failed and we're the new low nodeid, we need
- to unlink the ckpt that the failed node had open so new ckpts
- can be created down the road */
- if ((old_low_finished_nodeid != low) && (our_nodeid == low)) {
- log_group(mg, "unlink ckpt for failed low node %d",
- old_low_finished_nodeid);
+ /* the master failed and we're the new master, we need to:
+ - unlink the ckpt that the failed master had open so new ckpts
+ can be created down the road
+ - resend journals msg to any nodes that needed one from the
+ failed master
+ - store plocks in ckpt for the new mounters to read when they
+ get the journals msg from us */
+
+ if (neg &&
+ (prev_master_nodeid != -1) &&
+ (prev_master_nodeid != mg->master_nodeid) &&
+ (our_nodeid == mg->master_nodeid)) {
+ log_group(mg, "unlink ckpt for failed master %d",
+ prev_master_nodeid);
unlink_checkpoint(mg);
+ resend_journals(mg);
}
}
@@ -1021,6 +1071,7 @@
INIT_LIST_HEAD(&mg->resources);
INIT_LIST_HEAD(&mg->saved_messages);
mg->init = 1;
+ mg->master_nodeid = -1;
mg->low_nodeid = -1;
strncpy(mg->name, name, MAXNAME);
@@ -1925,31 +1976,6 @@
}
}
-/* New mounters may be waiting for a journals message that a failed node (as
- low nodeid) would have sent. If the low nodeid failed and we're the new low
- nodeid, then send a journals message to any nodes for whom we've not seen a
- journals message. We also need to checkpoint the plock state for the new
- nodes to read after they get their journals message. */
-
-void resend_journals(struct mountgroup *mg)
-{
- struct mg_member *memb;
- int stored_plocks = 0;
-
- list_for_each_entry(memb, &mg->members, list) {
- if (!memb->needs_journals)
- continue;
-
- if (!stored_plocks) {
- store_plocks(mg, memb->nodeid);
- stored_plocks = 1;
- }
-
- log_group(mg, "resend_journals to %d", memb->nodeid);
- send_journals(mg, memb->nodeid);
- }
-}
-
/*
old method:
A is rw mount, B mounts rw
@@ -1987,7 +2013,7 @@
void do_start(struct mountgroup *mg, int type, int member_count, int *nodeids)
{
- int pos = 0, neg = 0, low;
+ int pos = 0, neg = 0;
mg->start_event_nr = mg->last_start;
mg->start_type = type;
@@ -1995,18 +2021,9 @@
log_group(mg, "start %d init %d type %d member_count %d",
mg->last_start, mg->init, type, member_count);
- low = mg->low_finished_nodeid;
-
recover_members(mg, member_count, nodeids, &pos, &neg);
-
reset_unfinished_recoveries(mg);
- if (neg && low != mg->low_finished_nodeid && low == our_nodeid) {
- log_group(mg, "low nodeid failed old %d new %d",
- low, mg->low_finished_nodeid);
- resend_journals(mg);
- }
-
if (mg->init) {
if (member_count == 1)
start_first_mounter(mg);
More information about the Cluster-devel
mailing list