[Cluster-devel] cluster/group/daemon cman.c cpg.c gd_internal. ...
teigland at sourceware.org
teigland at sourceware.org
Tue Jun 20 18:09:14 UTC 2006
CVSROOT: /cvs/cluster
Module name: cluster
Changes by: teigland at sourceware.org 2006-06-20 18:09:12
Modified files:
group/daemon : cman.c cpg.c gd_internal.h main.c
Log message:
Moving the cluster infrastructure to userland introduced a new problem
that we didn't need to worry about before. All cluster state now exists
in userland processes which can go away and then come back like new, i.e.
unaware of the previous state.
Typically, when the userland cluster infrastructure on a node
"disappears", another node recognizes this as a node failure and recovers.
There's no problem there. The problem is when the cluster infrastructure
disappears on all the cluster nodes and then comes back. The
infrastructure that disappeared may have abandoned control of gfs/dlm
instances in the kernel. When the infrastructure comes back, it's like a
brand new cluster, it knows nothing about the residual, uncontrolled
instances of gfs/dlm. New nodes would use gfs/dlm in this new cluster
independently of the unknown gfs/dlm users from before and there'd be
immediate corruption [1].
Eventually, the infrastructure may be able to reconstruct the global state
of abandoned instances of gfs/dlm when it comes back and reassert control
of them, but that's not realistic any time soon. For now, the
infrastructure needs to recognize nodes with residual gfs/dlm state as
failed nodes that need recovery (starting with fencing). That recognition
and recovery now happens as part of the startup initialization, before new
instances of gfs/dlm are created [2].
[1] This is trivial to demonstrate:
- start up a cluster on nodes A,B,C
- mount gfs on nodes A,B
- run 'cman_tool leave force' on A,B,C
- start up the cluster again on A,B,C
- mount gfs on node C
- nodes A,B are now using gfs independently of node C
[2] The previous example now works like this:
- start up a cluster on nodes A,B,C
- mount gfs on nodes A,B
- run 'cman_tool leave force' on A,B,C
- start up the cluster again on A,B,C
i) when groupd starts on A,B, it recognizes the uncontrolled instance
of gfs, kills cman locally and fences the local node [3]
ii) when C runs fence_tool join, a new fence domain is started which
fences nodes with an unknown state, which are A and B
- mount gfs on node C
[3] This self-fencing does nothing for node C which still needs to fence
both A and B itself. If A fences itself before C fences it, A will be
fenced twice. This self-fencing step is optional, but it can be
convenient when 1) all the nodes restarting the infrastructure find
residual gfs/dlm instances and 2) reboot fencing is used. The anticipated
situation is one where everyone has residual state so no one can start up
to fence anyone else; all are stuck. But, they all fence themselves,
reboot and resolve everything.
There's a different approach we could take that would be more convenient
when not all cluster nodes are likely to be mounting gfs or SAN fencing is
used. In this case, a node that finds residual gfs/dlm instances would
remain a cman member and not fence itself. This would contribute quorum
to help another node without residual state start up and fence it.
The solution to this confusing situation is simple:
- groupd now checks for residual gfs/dlm kernel state when it starts up
and if it finds any it kills cman and exec's fence_node <myname>.
- fenced can't bypass fencing of a node unless the node is both a cman
member and has fully started groupd (a node may need fencing if it's
joined the cluster but groupd isn't starting).
- the same consideration in fence_manual as fenced
Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/daemon/cman.c.diff?cvsroot=cluster&r1=1.22&r2=1.23
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/daemon/cpg.c.diff?cvsroot=cluster&r1=1.22&r2=1.23
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/daemon/gd_internal.h.diff?cvsroot=cluster&r1=1.30&r2=1.31
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/daemon/main.c.diff?cvsroot=cluster&r1=1.33&r2=1.34
--- cluster/group/daemon/cman.c 2006/05/02 16:25:11 1.22
+++ cluster/group/daemon/cman.c 2006/06/20 18:09:12 1.23
@@ -12,8 +12,24 @@
static int cman_node_count;
static int cman_cb;
static int cman_reason;
+static char name_buf[CMAN_MAX_NODENAME_LEN+1];
+/* Ask cman to force the given node out of the cluster.  Uses a private
+   admin connection so the daemon's normal cman handle is untouched.
+   Returns 0 on success, the cman_kill_node() error, or -ENOTCONN if
+   the admin connection could not be made. */
+int kill_cman(int nodeid)
+{
+	cman_handle_t ach;
+	int rv;
+
+	ach = cman_admin_init(NULL);
+	if (!ach) {
+		/* previously logged the unrelated handle "ch"; ach is NULL
+		   here, so errno is the only useful information */
+		log_print("cman_admin_init error %d", errno);
+		return -ENOTCONN;
+	}
+
+	rv = cman_kill_node(ach, nodeid);
+	cman_finish(ach);
+	return rv;
+}
+
static int is_member(cman_node_t *node_list, int count, int nodeid)
{
int i;
@@ -181,8 +197,12 @@
cman_quorate = cman_is_quorate(ch);
+ memset(name_buf, 0, sizeof(name_buf));
+ strncpy(name_buf, node.cn_name, CMAN_MAX_NODENAME_LEN);
+ our_name = name_buf;
our_nodeid = node.cn_nodeid;
- log_debug("cman: our nodeid %d quorum %d", our_nodeid, cman_quorate);
+ log_debug("cman: our nodeid %d name %s quorum %d",
+ our_nodeid, our_name, cman_quorate);
fd = cman_get_fd(ch);
client_add(fd, process_cman, close_cman);
--- cluster/group/daemon/cpg.c 2006/05/25 13:41:27 1.22
+++ cluster/group/daemon/cpg.c 2006/06/20 18:09:12 1.23
@@ -10,6 +10,8 @@
static int groupd_ci;
static int got_confchg;
+static struct cpg_address groupd_cpg_member[MAX_GROUP_MEMBERS];
+static int groupd_cpg_member_count;
static struct cpg_address saved_member[MAX_GROUP_MEMBERS];
static struct cpg_address saved_joined[MAX_GROUP_MEMBERS];
static struct cpg_address saved_left[MAX_GROUP_MEMBERS];
@@ -149,6 +151,9 @@
log_print("process_groupd_confchg members %d -%d +%d",
saved_member_count, saved_left_count, saved_joined_count);
+ memcpy(&groupd_cpg_member, &saved_member, sizeof(saved_member));
+ groupd_cpg_member_count = saved_member_count;
+
for (i = 0; i < saved_member_count; i++) {
if (saved_member[i].nodeId == our_nodeid &&
saved_member[i].pid == (uint32_t) getpid()) {
@@ -162,12 +167,25 @@
log_print("we are not in groupd confchg: %u %u",
our_nodeid, (uint32_t) getpid());
+ /* FIXME: we probably want to do a cman_kill_node() on a node
+ where groupd exits but cman is still running. */
+
for (i = 0; i < saved_left_count; i++) {
if (saved_left[i].reason != CPG_REASON_LEAVE)
add_recovery_set(saved_left[i].nodeId);
}
}
+/* Fill in a group_data_t describing the groupd cpg itself: the special
+   level -1, the current member count, and each member's nodeid taken
+   from the saved cpg membership. */
+void copy_groupd_data(group_data_t *data)
+{
+	int count = groupd_cpg_member_count;
+	int idx = 0;
+
+	data->level = -1;
+	data->member_count = count;
+
+	while (idx < count) {
+		data->members[idx] = groupd_cpg_member[idx].nodeId;
+		idx++;
+	}
+}
+
/* FIXME: also match name */
group_t *find_group_by_handle(cpg_handle_t h)
--- cluster/group/daemon/gd_internal.h 2006/05/02 16:25:11 1.30
+++ cluster/group/daemon/gd_internal.h 2006/06/20 18:09:12 1.31
@@ -23,6 +23,7 @@
#include <string.h>
#include <strings.h>
#include <ctype.h>
+#include <dirent.h>
#include <sys/socket.h>
#include <sys/un.h>
#include <sys/types.h>
@@ -93,15 +94,12 @@
#define FALSE (0)
#endif
-#define test_bit(nr, addr) ((*addr) & (1 << nr))
-#define set_bit(nr, addr) ((*addr) |= (1 << nr))
-#define clear_bit(nr, addr) ((*addr) &= ~(1 << nr))
-
extern struct list_head gd_groups;
extern struct list_head gd_levels[MAX_LEVELS];
extern uint32_t gd_event_nr;
extern int cman_quorate;
extern int our_nodeid;
+extern char *our_name;
struct group;
struct app;
@@ -256,6 +254,7 @@
/* cman.c */
int setup_cman(void);
+int kill_cman(int nodeid);
/* cpg.c */
int setup_cpg(void);
@@ -263,6 +262,7 @@
int do_cpg_leave(group_t *g);
int send_message(group_t *g, void *buf, int len);
int send_message_groupd(group_t *g, void *buf, int len);
+void copy_groupd_data(group_data_t *data);
/* joinleave.c */
void remove_group(group_t *g);
--- cluster/group/daemon/main.c 2006/06/14 21:38:14 1.33
+++ cluster/group/daemon/main.c 2006/06/20 18:09:12 1.34
@@ -17,9 +17,10 @@
extern struct list_head recovery_sets;
-struct list_head gd_groups;
-struct list_head gd_levels[MAX_LEVELS];
+struct list_head gd_groups;
+struct list_head gd_levels[MAX_LEVELS];
uint32_t gd_event_nr;
+char *our_name;
int our_nodeid;
int cman_quorate;
@@ -37,6 +38,92 @@
void *deadfn;
};
+/* Look for any instances of gfs or dlm in the kernel, if we find any, it
+ means they're uncontrolled by us (via gfs_controld/dlm_controld/groupd).
+ We need to be rebooted to clear out this uncontrolled kernel state. Most
+ importantly, other nodes must not be allowed to form groups that might
+ correspond to these same instances of gfs/dlm. If they did, then we'd
+ be accessing gfs/dlm independently from them and corrupt stuff. */
+
+/* If we detect any local gfs/dlm state, fence ourself via fence_node.
+ This may not be strictly necessary since other nodes should fence us
+ when they form a new fence domain. If they're not forming a new domain,
+ that means there is a domain member that has a record of previous cluster
+ state when we were a member; it will have recognized that we left the
+ cluster and need fencing. The case where we need groupd to fence ourself
+ is when all cluster nodes are starting up and have residual gfs/dlm kernel
+ state. None would be able to start groupd/fenced and fence anyone. */
+
+/* - we've rejoined the cman cluster with residual gfs/dlm state
+ - there is a previous cman/domain member that saw us fail
+ - when we failed it lost quorum
+ - our current rejoin has given the cluster quorum
+ - the old member that saw we needed fencing can now begin fencing
+ - the old member sees we're now a cman member, might bypass fencing us...
+ - only bypasses fencing us if we're also in groupd cpg
+ - we won't be in groupd cpg until after we've verified there's no
+ local residual gfs/dlm state */
+
+/* Count the entries under a sysfs directory; each entry is an instance
+   of gfs or dlm in the kernel that groupd does not control.  Returns 0
+   when the directory doesn't exist (module not loaded, or nothing to
+   control), otherwise the number of non-hidden entries, each of which
+   is logged as an uncontrolled kernel object. */
+static int kernel_instance_count(char *sysfs_dir)
+{
+	DIR *d;
+	struct dirent *de;
+	int rv = 0;
+
+	/* no need to copy sysfs_dir through a PATH_MAX buffer;
+	   opendir() takes the string directly */
+	d = opendir(sysfs_dir);
+	if (!d)
+		return 0;
+
+	while ((de = readdir(d))) {
+		/* skip ".", ".." and hidden entries */
+		if (de->d_name[0] == '.')
+			continue;
+
+		log_print("found uncontrolled kernel object %s in %s",
+			  de->d_name, sysfs_dir);
+		rv++;
+	}
+	closedir(d);
+	return rv;
+}
+
+/* Look for residual gfs/dlm kernel state left behind by a previous
+   cluster incarnation.  If any is found, this node must not be allowed
+   to join new groups: kill our cman membership and fence ourself so a
+   reset clears the state.  Returns 0 if clean, -1 if uncontrolled
+   instances were found. */
+int check_uncontrolled_groups(void)
+{
+	pid_t pid;
+	char *argv[4];
+	int status, rv = 0;
+
+	rv += kernel_instance_count("/sys/kernel/dlm");
+	rv += kernel_instance_count("/sys/fs/gfs");
+	rv += kernel_instance_count("/sys/fs/gfs2");
+
+	if (!rv)
+		return 0;
+
+	/* FIXME: make sure this is going into syslog */
+	log_print("local node must be reset to clear %d uncontrolled "
+		  "instances of gfs and/or dlm", rv);
+
+	kill_cman(our_nodeid);
+
+	argv[0] = "fence_node";
+	argv[1] = "-O";
+	argv[2] = our_name;
+	argv[3] = NULL;
+
+	pid = fork();
+	if (pid < 0) {
+		/* fork failure must not fall into waitpid(-1, ...), which
+		   would reap an arbitrary child */
+		log_print("fork of fence_node failed");
+	} else if (pid) {
+		waitpid(pid, &status, 0);
+	} else {
+		execvp(argv[0], argv);
+		log_print("failed to exec fence_node");
+		/* exec failed: the child must not fall through and keep
+		   running as a second copy of groupd */
+		_exit(1);
+	}
+
+	return -1;
+}
static void app_action(app_t *a, char *buf)
{
@@ -380,6 +467,12 @@
memset(&data, 0, sizeof(data));
+ /* special case to get members of groupd cpg */
+ if (atoi(argv[1]) == -1 && !strncmp(argv[2], "groupd", 6)) {
+ copy_groupd_data(&data);
+ goto out;
+ }
+
g = find_group_level(argv[2], atoi(argv[1]));
if (!g)
goto out;
@@ -580,6 +673,10 @@
if (rv < 0)
return rv;
+ rv = check_uncontrolled_groups();
+ if (rv < 0)
+ return rv;
+
rv = setup_cpg();
if (rv < 0)
return rv;
More information about the Cluster-devel
mailing list