[Cluster-devel] Clean up & fixes for master branch of gfs_controld

Mon Oct 12 09:31:40 UTC 2009

Hi,

The following patch cleans up some old code in gfs_controld and also fixes
one or two issues along the way. After this patch has been applied,
gfs_controld no longer reads from sysfs, but instead gathers all its
information from the uevents. The uevents in question have been in all
recent versions of GFS & GFS2 and so there should be no backward
compatibility issues (even so, this is for the master branch only, at
least at the moment).

The diffstat shows a net reduction of about 100 lines of code.

 group/gfs_controld/cpg-new.c    |  161 ++++++++++++++--------------------------
 group/gfs_controld/gfs_daemon.h |    6 -
 group/gfs_controld/main.c       |   99 ++++++++++--------------
 group/gfs_controld/util.c       |   43 ----------
 4 files changed, 106 insertions(+), 203 deletions(-)

There are some bugs fixes with this patch too:
 - process_uevent() was checking for the string "/fs/gfs" anywhere in the 
   uevent DEVPATH, whereas the correct check was for a DEVPATH starting
   with that string.
 - There can now be no confusion between the CHANGE uevent from a first
   mount event and a CHANGE uevent from a recovery done event. In the
   old code, this was worked out according to the expected state of the
   cluster. The new code uses the uevent messages to do that, and hence
   should be more robust.
 - In the old code, the recovery uevent process was still reading sysfs
   files, even when it had all the information required from the uevent
   which is why my recent kernel patch had to be backed out. This patch
   fixes that as well.

Some other notes:
 - ping_kernel_mount() used to have a side effect with the old CPG code,
   but with the new CPG it does nothing useful and thus can be removed.
 - This is the first step in a process to eventually remove mount.gfs2
   (and let gfs_controld handle the complete mount process itself).
 - While testing I spotted this in the logs:
   1255337409 myfs set /sys/fs/gfs2/plurality:myfs/lock_module/block to 0
   1255337409 myfs set open /sys/fs/gfs2/plurality:myfs/lock_module/block error -1 2
   This happens even without the below patch and I believe it is caused by
   trying to write to the sysfs file before the filesystem has been
   mounted. It seems to be harmless and probably we can just remove the
   code which tries to do this. I need to do some more investigation
   to confirm that though.

Steve.

-----------------------------------------------------------------------------

diff --git a/group/gfs_controld/cpg-new.c b/group/gfs_controld/cpg-new.c
index 6def20c..23b7a86 100644
--- a/group/gfs_controld/cpg-new.c
+++ b/group/gfs_controld/cpg-new.c
@@ -2201,114 +2201,81 @@ static void apply_changes(struct mountgroup *mg)
 	}
 }
 
-/* We send messages with the info from kernel uevents or mount.gfs ipc,
-   and then process the uevent/ipc upon receiving the message for it, so
-   that it can be processed in the same order by all nodes. */
-
-void process_recovery_uevent(char *name, int jid, int recover_status,
-			     int first_done)
+void process_first_mount(struct mountgroup *mg)
 {
-	struct mountgroup *mg;
-	struct journal *j;
-	int rv;
+	/*
+	 * Assumption here is that only the first mounter will get
+	 * uevents when first_recovery_needed is set.
+	 */
 
-	mg = find_mg(name);
-	if (!mg) {
-		log_error("recovery_uevent mg not found %s", name);
-		return;
-	}
+	/* make a local record of jid and recover_status; we may want
+	   to check below that we've seen uevents for all jids
+	   during first recovery before sending first_recovery_done. */
 
-	if (jid < 0) {
-		/* for back compat, sysfs file deprecated */
-		rv = read_sysfs_int(mg, "recover_done", &jid);
-		if (rv < 0) {
-			log_error("recovery_uevent recover_done read %d", rv);
-			return;
-		}
-	}
+	log_group(mg, "recovery_uevent mg %s first recovery done", mg->name);
 
-	if (recover_status < 0) {
-		/* for back compat, sysfs file deprecated */
-		rv = read_sysfs_int(mg, "recover_status", &recover_status);
-		if (rv < 0) {
-			log_error("recovery_uevent recover_status read %d", rv);
-			return;
-		}
-	}
+	/* ignore extraneous uevent from others_may_mount */
+	if (mg->first_done_uevent)
+		return;
 
-	if (!mg->first_recovery_needed) {
-		if (!mg->local_recovery_busy) {
-			/* This will happen in two known situations:
-			   - we get a recovery_done uevent for our own journal
-			     when we mount  (jid == mg->our_jid)
-			   - the first mounter can read first_done and clear
-			     first_recovery_needed before seeing the change
-			     uevent from others_may_mount */
-			log_group(mg, "recovery_uevent jid %d ignore", jid);
-			return;
-		}
+	log_group(mg, "recovery_uevent first_done");
+	mg->first_done_uevent = 1;
+	send_first_recovery_done(mg);
 
-		mg->local_recovery_busy = 0;
+	apply_changes_recovery(mg);
+}
 
-		if (mg->local_recovery_jid != jid) {
-			log_error("recovery_uevent jid %d expected %d", jid,
-				  mg->local_recovery_jid);
-			return;
-		}
+/* We send messages with the info from kernel uevents or mount.gfs ipc,
+   and then process the uevent/ipc upon receiving the message for it, so
+   that it can be processed in the same order by all nodes. */
 
-		j = find_journal(mg, jid);
-		if (!j) {
-			log_error("recovery_uevent no journal %d", jid);
-			return;
-		}
+void process_recovery_uevent(struct mountgroup *mg, int jid, int recover_status)
+{
+	struct journal *j;
 
-		log_group(mg, "recovery_uevent jid %d status %d "
-			  "local_recovery_done %d needs_recovery %d",
-			  jid, recover_status, j->local_recovery_done,
-			  j->needs_recovery);
+	if (mg->first_recovery_needed)
+		return;
 
-		j->local_recovery_done = 1;
-		j->local_recovery_result = recover_status;
+	if (!mg->local_recovery_busy) {
+		/* This will happen in two known situations:
+		   - we get a recovery_done uevent for our own journal
+		     when we mount  (jid == mg->our_jid)
+		   - the first mounter can read first_done and clear
+		     first_recovery_needed before seeing the change
+		     uevent from others_may_mount */
+		log_group(mg, "recovery_uevent jid %d ignore", jid);
+		return;
+	}
 
-		/* j->needs_recovery will be cleared when we receive this
-		   recovery_result message.  if it's already set, then
-		   someone else has completed the recovery and there's
-		   no need to send our result */
+	mg->local_recovery_busy = 0;
 
-		if (j->needs_recovery)
-			send_recovery_result(mg, jid, recover_status);
-	} else {
-		/*
-		 * Assumption here is that only the first mounter will get
-		 * uevents when first_recovery_needed is set.
-		 */
+	if (mg->local_recovery_jid != jid) {
+		log_error("recovery_uevent jid %d expected %d", jid,
+			  mg->local_recovery_jid);
+		return;
+	}
 
-		/* make a local record of jid and recover_status; we may want
-		   to check below that we've seen uevents for all jids
-		   during first recovery before sending first_recovery_done. */
+	j = find_journal(mg, jid);
+	if (!j) {
+		log_error("recovery_uevent no journal %d", jid);
+		return;
+	}
 
-		log_group(mg, "recovery_uevent jid %d first recovery done %d",
-			  jid, mg->first_done_uevent);
+	log_group(mg, "recovery_uevent jid %d status %d "
+		  "local_recovery_done %d needs_recovery %d",
+		  jid, recover_status, j->local_recovery_done,
+		  j->needs_recovery);
 
-		/* ignore extraneous uevent from others_may_mount */
-		if (mg->first_done_uevent)
-			return;
+	j->local_recovery_done = 1;
+	j->local_recovery_result = recover_status;
 
-		if (first_done < 0) {
-			/* for back compat, sysfs file deprecated */
-			rv = read_sysfs_int(mg, "first_done", &first_done);
-			if (rv < 0) {
-				log_error("recovery_uevent first_done read %d", rv);
-				return;
-			}
-		}
+	/* j->needs_recovery will be cleared when we receive this
+	   recovery_result message.  if it's already set, then
+	   someone else has completed the recovery and there's
+	   no need to send our result */
 
-		if (first_done) {
-			log_group(mg, "recovery_uevent first_done");
-			mg->first_done_uevent = 1;
-			send_first_recovery_done(mg);
-		}
-	}
+	if (j->needs_recovery)
+		send_recovery_result(mg, jid, recover_status);
 
 	apply_changes_recovery(mg);
 }
@@ -2830,17 +2797,9 @@ static void leave_mountgroup(struct mountgroup *mg, int mnterr)
 		log_error("cpg_leave error %d", error);
 }
 
-void do_leave(char *name, int mnterr)
+void do_leave(struct mountgroup *mg, int mnterr)
 {
-	struct mountgroup *mg;
-
-	log_debug("do_leave %s mnterr %d", name, mnterr);
-
-	mg = find_mg(name);
-	if (!mg) {
-		log_error("do_leave: %s not found", name);
-		return;
-	}
+	log_debug("do_leave %s mnterr %d", mg->name, mnterr);
 
 	if (mg->withdraw_uevent) {
 		log_group(mg, "do_leave: ignored during withdraw");
diff --git a/group/gfs_controld/gfs_daemon.h b/group/gfs_controld/gfs_daemon.h
index fe9bd73..1d89497 100644
--- a/group/gfs_controld/gfs_daemon.h
+++ b/group/gfs_controld/gfs_daemon.h
@@ -175,10 +175,11 @@ void process_cpg_daemon(int ci);
 int setup_dlmcontrol(void);
 void process_dlmcontrol(int ci);
 int set_protocol(void);
-void process_recovery_uevent(char *name, int jid, int status, int first);
+void process_recovery_uevent(struct mountgroup *mg, int jid, int status);
+void process_first_mount(struct mountgroup *mg);
 void process_mountgroups(void);
 int gfs_join_mountgroup(struct mountgroup *mg);
-void do_leave(char *name, int mnterr);
+void do_leave(struct mountgroup *mg, int mnterr);
 void gfs_mount_done(struct mountgroup *mg);
 void send_remount(struct mountgroup *mg, struct gfsc_mount_args *ma);
 void send_withdraw(struct mountgroup *mg);
@@ -219,7 +220,6 @@ void kick_node_from_cluster(int nodeid);
 /* util.c */
 int we_are_in_fence_domain(void);
 int set_sysfs(struct mountgroup *mg, const char *field, int val);
-int read_sysfs_int(struct mountgroup *mg, const char *field, int *val_out);
 int run_dmsetup_suspend(struct mountgroup *mg, char *dev);
 void update_dmsetup_wait(void);
 void update_flow_control_status(void);
diff --git a/group/gfs_controld/main.c b/group/gfs_controld/main.c
index 64a388c..2b5a87e 100644
--- a/group/gfs_controld/main.c
+++ b/group/gfs_controld/main.c
@@ -23,7 +23,7 @@ struct client {
 	struct mountgroup *mg;
 };
 
-static void do_withdraw(char *name);
+static void do_withdraw(struct mountgroup *mg);
 
 int do_read(int fd, void *buf, size_t count)
 {
@@ -189,20 +189,6 @@ struct mountgroup *find_mg_id(uint32_t id)
 	return NULL;
 }
 
-static void ping_kernel_mount(char *name)
-{
-	struct mountgroup *mg;
-	int rv, val;
-
-	mg = find_mg(name);
-	if (!mg)
-		return;
-
-	rv = read_sysfs_int(mg, "block", &val);
-
-	log_group(mg, "ping_kernel_mount %d", rv);
-}
-
 enum {
 	Env_ACTION = 0,
 	Env_SUBSYSTEM,
@@ -260,29 +246,21 @@ static void decode_uevent(const char *buf, unsigned len, const char *vars[],
 	}
 }
 
-static char *uevent_fsname(const char *vars[])
+static char *uevent_fsname(const char *vals[])
 {
 	char *name = NULL;
 
-	if (vars[Env_LOCKTABLE])
-		name = strchr(vars[Env_LOCKTABLE], ':');
-
-	/* When all kernels are converted, we can dispose with the following
-	 * grotty bit. This is for backward compatibility only.
-	 */
-	if (!name && vars[Env_DEVPATH]) {
-		name = strchr(vars[Env_DEVPATH], ':');
-		if (name) {
-			char *end = strstr(name, "/lock_module");
-			if (end)
-				*end = 0;
-		}
+	if (vals[Env_LOCKTABLE]) {
+		name = strchr(vals[Env_LOCKTABLE], ':');
+		if (name && *name)
+			name++;
 	}
-	return (name && name[0]) ? name + 1 : NULL;
+	return name;
 }
 
 static void process_uevent(int ci)
 {
+	struct mountgroup *mg;
 	char buf[UEVENT_BUF_SIZE];
 	const char *uevent_vals[Env_Last];
 	char *fsname;
@@ -306,7 +284,7 @@ static void process_uevent(int ci)
 	    !uevent_vals[Env_SUBSYSTEM])
 		return;
 
-	if (!strstr(uevent_vals[Env_DEVPATH], "/fs/gfs"))
+	if (strncmp(uevent_vals[Env_DEVPATH], "/fs/gfs", 7) != 0)
 		return;
 
 	log_debug("uevent %s %s %s",
@@ -323,6 +301,12 @@ static void process_uevent(int ci)
 		return;
 	}
 
+	mg = find_mg(fsname);
+	if (!mg) {
+		log_error("mount group %s not found", fsname);
+		return;
+	}
+
 	if (!strcmp(uevent_vals[Env_ACTION], "remove")) {
 		/* We want to trigger the leave at the very end of the kernel's
 		   unmount process, i.e. at the end of put_super(), so we do the
@@ -330,33 +314,37 @@ static void process_uevent(int ci)
 
 		if (strcmp(uevent_vals[Env_SUBSYSTEM], "lock_dlm") == 0)
 			return;
-		do_leave(fsname, 0);
+		do_leave(mg, 0);
+		return;
+	}
 
-	} else if (!strcmp(uevent_vals[Env_ACTION], "change")) {
-		int jid, status = -1, first = -1;
+	if (!strcmp(uevent_vals[Env_ACTION], "change")) {
+		int jid, status = -1;
 
-		if (!uevent_vals[Env_JID] ||
-		    (sscanf(uevent_vals[Env_JID], "%d", &jid) != 1))
-			jid = -1;
 
 		if (uevent_vals[Env_RECOVERY]) {
+			if (!uevent_vals[Env_JID] ||
+			    (sscanf(uevent_vals[Env_JID], "%d", &jid) != 1))
+				return;
 			if (strcmp(uevent_vals[Env_RECOVERY], "Done") == 0)
 				status = LM_RD_SUCCESS;
 			if (strcmp(uevent_vals[Env_RECOVERY], "Failed") == 0)
 				status = LM_RD_GAVEUP;
+			if (status < 0)
+				return;
+			process_recovery_uevent(mg, jid, status);
+			return;
 		}
 
 		if (uevent_vals[Env_FIRSTMOUNT] &&
-		    (strcmp(uevent_vals[Env_FIRSTMOUNT], "Done") == 0))
-			first = 1;
-
-		process_recovery_uevent(fsname, jid, status, first);
-
-	} else if (!strcmp(uevent_vals[Env_ACTION], "offline")) {
-		do_withdraw(fsname);
-	} else {
-		ping_kernel_mount(fsname);
+		    (strcmp(uevent_vals[Env_FIRSTMOUNT], "Done") == 0)) {
+			process_first_mount(mg);
+		}
+		return;
 	}
+
+	if (!strcmp(uevent_vals[Env_ACTION], "offline"))
+		do_withdraw(mg);
 }
 
 static int setup_uevent(void)
@@ -724,24 +712,17 @@ static void do_join(int ci, struct gfsc_mount_args *ma)
    and when it's been removed from the group, it tells the locally withdrawing
    gfs to clear out locks. */
 
-static void do_withdraw(char *name)
+static void do_withdraw(struct mountgroup *mg)
 {
-	struct mountgroup *mg;
 	int rv;
 
-	log_debug("withdraw: %s", name);
+	log_debug("withdraw: %s", mg->name);
 
 	if (!cfgd_enable_withdraw) {
 		log_error("withdraw feature not enabled");
 		return;
 	}
 
-	mg = find_mg(name);
-	if (!mg) {
-		log_error("do_withdraw no mountgroup %s", name);
-		return;
-	}
-
 	mg->withdraw_uevent = 1;
 
 	rv = run_dmsetup_suspend(mg, mg->mount_args.dev);
@@ -819,6 +800,7 @@ void process_connection(int ci)
 	struct gfsc_header h;
 	struct gfsc_mount_args empty;
 	struct gfsc_mount_args *ma;
+	struct mountgroup *mg;
 	char *extra = NULL;
 	int rv, extra_len;
 
@@ -875,7 +857,12 @@ void process_connection(int ci)
 		break;
 
 	case GFSC_CMD_FS_LEAVE:
-		do_leave(ma->table, h.data);
+		mg = find_mg(ma->table);
+		if (!mg) {
+			log_error("do_leave: %s not found", ma->table);
+			break;
+		}
+		do_leave(mg, h.data);
 		break;
 
 	case GFSC_CMD_FS_MOUNT_DONE:
diff --git a/group/gfs_controld/util.c b/group/gfs_controld/util.c
index 44d8827..d6d46ee 100644
--- a/group/gfs_controld/util.c
+++ b/group/gfs_controld/util.c
@@ -76,49 +76,6 @@ int set_sysfs(struct mountgroup *mg, const char *field, int val)
 	return rv;
 }
 
-static int get_sysfs(struct mountgroup *mg, const char *field, char *buf, int len)
-{
-	char fname[PATH_MAX], *p;
-	int fd, rv;
-
-	snprintf(fname, PATH_MAX, "%s/%s/%s/lock_module/%s",
-		 SYSFS_DIR, mg->mount_args.type, mg->mount_args.table, field);
-
-	fd = open(fname, O_RDONLY);
-	if (fd < 0) {
-		log_group(mg, "get open %s error %d %d", fname, fd, errno);
-		return -1;
-	}
-
-	rv = read(fd, buf, len);
-	if (rv < 0)
-		log_error("read %s error %d %d", fname, rv, errno);
-	else {
-		rv = 0;
-		p = strchr(buf, '\n');
-		if (p)
-			*p = '\0';
-	}
-
-	close(fd);
-	return rv;
-}
-
-int read_sysfs_int(struct mountgroup *mg, const char *field, int *val_out)
-{
-	char buf[SYSFS_BUFLEN];
-	int rv;
-
-	memset(buf, 0, sizeof(buf));
-
-	rv = get_sysfs(mg, field, buf, sizeof(buf));
-	if (rv < 0)
-		return rv;
-
-	*val_out = atoi(buf);
-	return 0;
-}
-
 int run_dmsetup_suspend(struct mountgroup *mg, char *dev)
 {
 	struct sched_param sched_param;