[Cluster-devel] gfs uevent and sysfs changes

Tue Dec 2 14:02:30 UTC 2008

Hi,

On Mon, 2008-12-01 at 11:31 -0600, David Teigland wrote:
> Here are the compatibility aspects to the recent ideas about changes to
> the user/kernel interface between gfs (1 & 2) and gfs_controld.
> 
> . gfs_controld can remove id from hostdata string in mount options
> 
>   - no compat issues AFAICT
> 
> . getting rid of "id" sysfs file from lock_dlm
> 
>   - new gfs_controld old gfs-kernel
>     old kernel provides both "block" and "id" sysfs files
>     new daemon looks for "block" instead of "id" in sysfs
> 
>   - old gfs_controld new gfs-kernel
>     old daemon looks for "id" sysfs file
>     new kernel needs to provide "id" as well as "block" sysfs files
> 
>   Once everyone is using the new daemon, we can remove the "id" sysfs
>   file from the kernel.
> 
> . uevent strings to replace recover_done/recover_status sysfs files
> 
>   - new gfs_controld old gfs-kernel
>     old kernel has recover sysfs files, and no new uevent strings
>     new daemon needs to look for either sysfs files or uevent strings
> 
>   - old gfs_controld new gfs-kernel
>     old daemon looks for recover sysfs files, not new uevent strings
>     new kernel needs to provide both sysfs files and uevent strings
> 
>   Once everyone is using new kernel and new daemon, we can remove
>   the recover sysfs files from kernel, and daemon can stop looking for
>   recover sysfs files.
> 
> 
So notwithstanding the fact that I've still not sorted out a proper
build environment for groupd, I have created a new patch based on the
above, which I think should address all the issues.

I've tested the decode_uevent() function separately and it appears to
work well. With the sysfs files which we intend to eventually replace
with the new uevent variables which apply to the "change" message only,
I've introduced a system where "-1" means that we don't know the value
since the uevent didn't tell us. We then fall back to reading sysfs in
that case. It seems to work nicely since all the existing valid values
of those variable are positive integers.

I think I've addressed all the points above, and this version seems a
bit cleaner than the previous one, although there is still scope to do a
bit more cleaning up at a later date,

Steve.

diff --git a/group/gfs_controld/cpg-new.c b/group/gfs_controld/cpg-new.c
index 74806f6..4014081 100644
--- a/group/gfs_controld/cpg-new.c
+++ b/group/gfs_controld/cpg-new.c
@@ -2078,30 +2078,33 @@ static void apply_changes(struct mountgroup *mg)
    and then process the uevent/ipc upon receiving the message for it, so
    that it can be processed in the same order by all nodes. */
 
-void process_recovery_uevent(char *table)
+void process_recovery_uevent(char *name, int jid, int recover_status,
+			     int first_done)
 {
 	struct mountgroup *mg;
 	struct journal *j;
-	char *name = strstr(table, ":") + 1;
-	int jid, recover_status, first_done;
 	int rv;
 
 	mg = find_mg(name);
 	if (!mg) {
-		log_error("recovery_uevent mg not found %s", table);
+		log_error("recovery_uevent mg not found %s", name);
 		return;
 	}
 
-	rv = read_sysfs_int(mg, "recover_done", &jid);
-	if (rv < 0) {
-		log_error("recovery_uevent recover_done read %d", rv);
-		return;
+	if (jid < 0) {
+		rv = read_sysfs_int(mg, "recover_done", &jid);
+		if (rv < 0) {
+			log_error("recovery_uevent recover_done read %d", rv);
+			return;
+		}
 	}
 
-	rv = read_sysfs_int(mg, "recover_status", &recover_status);
-	if (rv < 0) {
-		log_error("recovery_uevent recover_status read %d", rv);
-		return;
+	if (recover_status < 0) {
+		rv = read_sysfs_int(mg, "recover_status", &recover_status);
+		if (rv < 0) {
+			log_error("recovery_uevent recover_status read %d", rv);
+			return;
+		}
 	}
 
 	if (!mg->first_recovery_needed) {
@@ -2162,10 +2165,12 @@ void process_recovery_uevent(char *table)
 		if (mg->first_done_uevent)
 			return;
 
-		rv = read_sysfs_int(mg, "first_done", &first_done);
-		if (rv < 0) {
-			log_error("recovery_uevent first_done read %d", rv);
-			return;
+		if (first_done < 0) {
+			rv = read_sysfs_int(mg, "first_done", &first_done);
+			if (rv < 0) {
+				log_error("recovery_uevent first_done read %d", rv);
+				return;
+			}
 		}
 
 		if (first_done) {
@@ -2678,12 +2683,11 @@ static void leave_mountgroup(struct mountgroup *mg, int mnterr)
 		log_error("cpg_leave error %d", error);
 }
 
-void do_leave(char *table, int mnterr)
+void do_leave(char *name, int mnterr)
 {
 	struct mountgroup *mg;
-	char *name = strstr(table, ":") + 1;
 
-	log_debug("do_leave %s mnterr %d", table, mnterr);
+	log_debug("do_leave %s mnterr %d", name, mnterr);
 
 	mg = find_mg(name);
 	if (!mg) {
diff --git a/group/gfs_controld/cpg-old.c b/group/gfs_controld/cpg-old.c
index 192a403..067fb85 100644
--- a/group/gfs_controld/cpg-old.c
+++ b/group/gfs_controld/cpg-old.c
@@ -1563,13 +1563,15 @@ static void recover_journals(struct mountgroup *mg)
    these and wait for gfs to be finished with all at which point it calls
    others_may_mount() and first_done is set. */
 
-static int kernel_recovery_done_first(struct mountgroup *mg)
+static int kernel_recovery_done_first(struct mountgroup *mg, int first_done)
 {
-	int rv, first_done;
+	int rv;
 
-	rv = read_sysfs_int(mg, "first_done", &first_done);
-	if (rv < 0)
-		return rv;
+	if (first_done < 0) {
+		rv = read_sysfs_int(mg, "first_done", &first_done);
+		if (rv < 0)
+			return rv;
+	}
 
 	log_group(mg, "kernel_recovery_done_first first_done %d", first_done);
 
@@ -1604,26 +1606,27 @@ static int need_kernel_recovery_done(struct mountgroup *mg)
    remain blocked until an rw node mounts, and the next mounter must
    be rw. */
 
-int process_recovery_uevent_old(char *table)
+int process_recovery_uevent_old(char *name, int jid_done, int status, int first)
 {
 	struct mountgroup *mg;
 	struct mg_member *memb;
-	char *name = strstr(table, ":") + 1;
 	char *ss;
-	int rv, jid_done, status, found = 0;
+	int rv, found = 0;
 
 	mg = find_mg(name);
 	if (!mg) {
-		log_error("recovery_done: unknown mount group %s", table);
+		log_error("recovery_done: unknown mount group %s", name);
 		return -1;
 	}
 
 	if (mg->first_mounter && !mg->first_mounter_done)
-		return kernel_recovery_done_first(mg);
+		return kernel_recovery_done_first(mg, first);
 
-	rv = read_sysfs_int(mg, "recover_done", &jid_done);
-	if (rv < 0)
-		return rv;
+	if (jid_done < 0) {
+		rv = read_sysfs_int(mg, "recover_done", &jid_done);
+		if (rv < 0)
+			return rv;
+	}
 
 	list_for_each_entry(memb, &mg->members_gone, list) {
 		if (memb->jid == jid_done) {
@@ -1646,12 +1649,14 @@ int process_recovery_uevent_old(char *table)
 		return 0;
 	}
 
-	rv = read_sysfs_int(mg, "recover_status", &status);
-	if (rv < 0) {
-		log_group(mg, "recovery_done jid %d nodeid %d sysfs error %d",
-			  memb->jid, memb->nodeid, rv);
-		memb->local_recovery_status = RS_NOFS;
-		goto out;
+	if (status < 0) {
+		rv = read_sysfs_int(mg, "recover_status", &status);
+		if (rv < 0) {
+			log_group(mg, "recovery_done jid %d nodeid %d sysfs error %d",
+				  memb->jid, memb->nodeid, rv);
+			memb->local_recovery_status = RS_NOFS;
+			goto out;
+		}
 	}
 
 	switch (status) {
@@ -1724,12 +1729,11 @@ static void leave_mountgroup(struct mountgroup *mg, int mnterr)
 	group_leave(gh, mg->name);
 }
 
-void do_leave_old(char *table, int mnterr)
+void do_leave_old(char *name, int mnterr)
 {
 	struct mountgroup *mg;
-	char *name = strstr(table, ":") + 1;
 
-	log_debug("do_leave_old %s mnterr %d", table, mnterr);
+	log_debug("do_leave_old %s mnterr %d", name, mnterr);
 
 	list_for_each_entry(mg, &withdrawn_mounts, list) {
 		if (strcmp(mg->name, name))
diff --git a/group/gfs_controld/gfs_daemon.h b/group/gfs_controld/gfs_daemon.h
index 3beeb0e..c93d9cc 100644
--- a/group/gfs_controld/gfs_daemon.h
+++ b/group/gfs_controld/gfs_daemon.h
@@ -225,10 +225,10 @@ void process_cpg(int ci);
 int setup_dlmcontrol(void);
 void process_dlmcontrol(int ci);
 int set_protocol(void);
-void process_recovery_uevent(char *table);
+void process_recovery_uevent(char *name, int jid, int status, int first);
 void process_mountgroups(void);
 int gfs_join_mountgroup(struct mountgroup *mg);
-void do_leave(char *table, int mnterr);
+void do_leave(char *name, int mnterr);
 void gfs_mount_done(struct mountgroup *mg);
 void send_remount(struct mountgroup *mg, struct gfsc_mount_args *ma);
 void send_withdraw(struct mountgroup *mg);
@@ -245,13 +245,12 @@ void close_cpg_old(void);
 void process_cpg_old(int ci);
 
 int gfs_join_mountgroup_old(struct mountgroup *mg, struct gfsc_mount_args *ma);
-void do_leave_old(char *table, int mnterr);
+void do_leave_old(char *name, int mnterr);
 int send_group_message_old(struct mountgroup *mg, int len, char *buf);
 void save_message_old(struct mountgroup *mg, char *buf, int len, int from,
 	int type);
 void send_withdraw_old(struct mountgroup *mg);
-int process_recovery_uevent_old(char *table);
-void ping_kernel_mount_old(char *table);
+int process_recovery_uevent_old(char *name, int jid, int status, int first);
 void send_remount_old(struct mountgroup *mg, struct gfsc_mount_args *ma);
 void send_mount_status_old(struct mountgroup *mg);
 int do_stop(struct mountgroup *mg);
diff --git a/group/gfs_controld/main.c b/group/gfs_controld/main.c
index a2d8ed9..3ad2232 100644
--- a/group/gfs_controld/main.c
+++ b/group/gfs_controld/main.c
@@ -7,6 +7,7 @@
 
 #define LOCKFILE_NAME	"/var/run/gfs_controld.pid"
 #define CLIENT_NALLOC   32
+#define UEVENT_BUF_SIZE 4096
 
 static int client_maxi;
 static int client_size;
@@ -22,7 +23,7 @@ struct client {
 	struct mountgroup *mg;
 };
 
-static void do_withdraw(char *table);
+static void do_withdraw(char *name);
 
 int do_read(int fd, void *buf, size_t count)
 {
@@ -198,62 +199,100 @@ struct mountgroup *find_mg_id(uint32_t id)
 	return NULL;
 }
 
-#define MAXARGS 8
-
-static char *get_args(char *buf, int *argc, char **argv, char sep, int want)
-{
-	char *p = buf, *rp = NULL;
-	int i;
-
-	argv[0] = p;
-
-	for (i = 1; i < MAXARGS; i++) {
-		p = strchr(buf, sep);
-		if (!p)
-			break;
-		*p = '\0';
-
-		if (want == i) {
-			rp = p + 1;
-			break;
-		}
-
-		argv[i] = p + 1;
-		buf = p + 1;
-	}
-	*argc = i;
-
-	/* we ended by hitting \0, return the point following that */
-	if (!rp)
-		rp = strchr(buf, '\0') + 1;
-
-	return rp;
-}
-
-static void ping_kernel_mount(char *table)
+static void ping_kernel_mount(char *name)
 {
 	struct mountgroup *mg;
-	char *name = strstr(table, ":") + 1;
 	int rv, val;
 
 	mg = find_mg(name);
 	if (!mg)
 		return;
 
-	rv = read_sysfs_int(mg, "id", &val);
+	rv = read_sysfs_int(mg, "block", &val);
 
 	log_group(mg, "ping_kernel_mount %d", rv);
 }
 
-static void process_uevent(int ci)
+enum {
+	Env_ACTION = 0,
+	Env_SUBSYSTEM,
+	Env_LOCKPROTO,
+	Env_LOCKTABLE,
+	Env_DEVPATH,
+	Env_RECOVERY,
+	Env_FIRSTMOUNT,
+	Env_JID,
+	Env_Last, /* Flag for end of vars */
+};
+
+static const char *uevent_vars[] = {
+	[Env_ACTION]		= "ACTION=",
+	[Env_SUBSYSTEM]		= "SUBSYSTEM=",
+	[Env_LOCKPROTO]		= "LOCKPROTO=",
+	[Env_LOCKTABLE]		= "LOCKTABLE=",
+	[Env_DEVPATH]		= "DEVPATH=/fs/gfs",
+	[Env_RECOVERY]		= "RECOVERY=",
+	[Env_FIRSTMOUNT]	= "FIRSTMOUNT=",
+	[Env_JID]		= "JID=",
+};
+
+/*
+ * Parses a uevent message for the interesting bits. It requires a list
+ * of variables to look for, and an equally long list of pointers into
+ * which to write the results.
+ */
+static void decode_uevent(const char *buf, unsigned len, const char *vars[],
+			  unsigned nvars, const char *vals[])
+{
+	const char *ptr;
+	unsigned i;
+	int slen, vlen;
+
+	memset(vals, 0, sizeof(const char *) * nvars);
+
+	while (len > 0) {
+		ptr = buf;
+		slen = strlen(ptr);
+		buf += slen;
+		len -= slen;
+		buf++; len--;
+		for (i = 0; i < nvars; i++) {
+			vlen = strlen(vars[i]);
+			if (vlen > slen)
+				continue;
+			if (memcmp(vars[i], ptr, vlen) != 0)
+				continue;
+			vals[i] = ptr + vlen;
+			break;
+		}
+	}
+}
+
+static char *uevent_fsname(const char *vars[])
 {
-	char buf[MAXLINE];
-	char *argv[MAXARGS], *act, *sys;
-	int rv, argc = 0;
-	int lock_module = 0;
+	char *name = NULL;
+	if (vars[Env_LOCKTABLE])
+		name = strchr(vars[Env_LOCKTABLE], ':');
+	/* When all kernels are converted, we can dispose with the following
+	 * grotty bit. This is for backward compatibility only.
+	 */
+	if (!name && vars[Env_DEVPATH]) {
+		name = strchr(vars[Env_DEVPATH], ':');
+		if (name) {
+			char *end = strstr(name, "/lock_dlm");
+			if (*end)
+				*end = 0;
+		}
+	}
+	return (name && name[0]) ? name + 1 : NULL;
+}
 
-	memset(buf, 0, sizeof(buf));
-	memset(argv, 0, sizeof(char *) * MAXARGS);
+static void process_uevent(int ci)
+{
+	char buf[UEVENT_BUF_SIZE];
+	const char *uevent_vals[Env_Last];
+	char *fsname;
+	int rv;
 
  retry_recv:
 	rv = recv(client[ci].fd, &buf, sizeof(buf), 0);
@@ -264,68 +303,52 @@ static void process_uevent(int ci)
 			log_error("uevent recv error %d errno %d", rv, errno);
 		return;
 	}
-
-	/* first we get the uevent for removing lock module kobject:
-	     "remove@/fs/gfs/bull:x/lock_module"
-	   second is the uevent for removing gfs kobject:
-	     "remove@/fs/gfs/bull:x"
-	*/
-
-	if (!strstr(buf, "gfs"))
+	buf[rv] = 0;
+	decode_uevent(buf, rv, uevent_vars, Env_Last, uevent_vals);
+	if (!uevent_vals[Env_DEVPATH] || !uevent_vals[Env_ACTION] ||
+	    !uevent_vals[Env_SUBSYSTEM])
 		return;
-
-	/* if an fs is named "gfs", it results in dlm uevents
-	   like "remove@/kernel/dlm/gfs" */
-
-	if (strstr(buf, "kernel/dlm"))
-		return;
-
+	fsname = uevent_fsname(uevent_vals);
 	log_debug("uevent: %s", buf);
+	log_debug("kernel: %s %s", uevent_vals[Env_ACTION], fsname);
 
-	if (strstr(buf, "lock_module"))
-		lock_module = 1;
-
-	get_args(buf, &argc, argv, '/', 4);
-	if (argc != 4)
-		log_error("uevent message has %d args", argc);
-	act = argv[0];
-	sys = argv[2];
-
-	log_debug("kernel: %s %s", act, argv[3]);
+	if (!fsname)
+		return;
 
-	if (!strcmp(act, "remove@")) {
+	if (!strcmp(uevent_vals[Env_ACTION], "remove")) {
 		/* We want to trigger the leave at the very end of the kernel's
 		   unmount process, i.e. at the end of put_super(), so we do the
 		   leave when the second uevent (from the gfs kobj) arrives. */
 
-		if (lock_module)
+		if (strcmp(uevent_vals[Env_SUBSYSTEM], "lock_dlm") == 0)
 			return;
-
 		if (group_mode == GROUP_LIBGROUP)
-			do_leave_old(argv[3], 0);
+			do_leave_old(fsname, 0);
 		else
-			do_leave(argv[3], 0);
-
-	} else if (!strcmp(act, "change@")) {
-		if (!lock_module)
-			return;
-
+			do_leave(fsname, 0);
+
+	} else if (!strcmp(uevent_vals[Env_ACTION], "change")) {
+		int jid, status = -1, first = -1;
+		if (!uevent_vals[Env_JID] ||
+		    (sscanf(uevent_vals[Env_JID], "%d", &jid) != 1))
+			jid = -1;
+		if (uevent_vals[Env_RECOVERY]) {
+			if (strcmp(uevent_vals[Env_RECOVERY], "Done") == 0)
+				status = LM_RD_SUCCESS;
+			if (strcmp(uevent_vals[Env_RECOVERY], "Failed") == 0)
+				status = LM_RD_GAVEUP;
+		}
+		if (uevent_vals[Env_FIRSTMOUNT] &&
+		    (strcmp(uevent_vals[Env_FIRSTMOUNT], "Done") == 0))
+			first = 1;
 		if (group_mode == GROUP_LIBGROUP)
-			process_recovery_uevent_old(argv[3]);
+			process_recovery_uevent_old(fsname, jid, status, first);
 		else
-			process_recovery_uevent(argv[3]);
-
-	} else if (!strcmp(act, "offline@")) {
-		if (!lock_module)
-			return;
-
-		do_withdraw(argv[3]);
-
+			process_recovery_uevent(fsname, jid, status, first);
+	} else if (!strcmp(uevent_vals[Env_ACTION], "offline")) {
+		do_withdraw(fsname);
 	} else {
-		if (!lock_module)
-			return;
-
-		ping_kernel_mount(argv[3]);
+		ping_kernel_mount(fsname);
 	}
 }
 
@@ -736,10 +759,9 @@ static void do_join(int ci, struct gfsc_mount_args *ma)
    and when it's been removed from the group, it tells the locally withdrawing
    gfs to clear out locks. */
 
-static void do_withdraw(char *table)
+static void do_withdraw(char *name)
 {
 	struct mountgroup *mg;
-	char *name = strstr(table, ":") + 1;
 	int rv;
 
 	log_debug("withdraw: %s", name);