[Cluster-devel] gfs_controld: Allow mounts entirely via sysfs/uevent interface

Wed Oct 6 14:22:42 UTC 2010

This patch adds the ability for gfs_controld to optionally allow
mounts via sysfs/uevents rather than requiring the mount.gfs2
helper. Some kernel support is required to do that, and the support
is in recent kernels (2.6.36 and up) so this has to be backwards
compatible.

The other change is that when a withdraw occurs, the device is now
specified by device number rather than by path. The number is obtained
from sysfs, which avoids the problem that the device path is not
available when the new mount sequence is used.

A flag, no_mount_helper, is added to the mount group to record whether
the mount helper is in use. The flag is set depending on whether the
first notification of the filesystem is from the ADD uevent or from
the mount helper.

The plan is that the mount helper will be gradually phased out
over time.

Signed-off-by: Steven Whitehouse <swhiteho at redhat.com>

diff --git a/group/gfs_controld/cpg-new.c b/group/gfs_controld/cpg-new.c
index 23b7a86..a72d0f1 100644
--- a/group/gfs_controld/cpg-new.c
+++ b/group/gfs_controld/cpg-new.c
@@ -261,7 +261,6 @@ static int daemon_member_count;
 
 static void apply_changes_recovery(struct mountgroup *mg);
 static void send_withdraw_acks(struct mountgroup *mg);
-static void leave_mountgroup(struct mountgroup *mg, int mnterr);
 
 static void log_config(const struct cpg_name *group_name,
 		       const struct cpg_address *member_list,
@@ -1429,14 +1428,14 @@ static void send_recovery_result(struct mountgroup *mg, int jid, int result)
 	free(buf);
 }
 
-void send_remount(struct mountgroup *mg, struct gfsc_mount_args *ma)
+void send_remount(struct mountgroup *mg, int ro)
 {
 	struct gfs_header h;
 
 	memset(&h, 0, sizeof(h));
 
 	h.type = GFS_MSG_REMOUNT;
-	h.msgdata = strstr(ma->options, "ro") ? 1 : 0;
+	h.msgdata = ro;
 
 	gfs_send_message(mg, (char *)&h, sizeof(h));
 }
@@ -1654,7 +1653,7 @@ static void receive_withdraw(struct mountgroup *mg, struct gfs_header *hd,
 	node->withdraw = 1;
 
 	if (hd->nodeid == our_nodeid)
-		leave_mountgroup(mg, 0);
+		gfs_leave_mountgroup(mg, 0);
 }
 
 /* start message from all nodes shows zero started_count */
@@ -2769,7 +2768,7 @@ int gfs_join_mountgroup(struct mountgroup *mg)
    order.  We can just ignore the second.  The second would either not find
    the mg here, or would see mg->leaving of 1 from the first. */
 
-static void leave_mountgroup(struct mountgroup *mg, int mnterr)
+void gfs_leave_mountgroup(struct mountgroup *mg, int mnterr)
 {
 	cpg_error_t error;
 	struct cpg_name name;
@@ -2806,7 +2805,7 @@ void do_leave(struct mountgroup *mg, int mnterr)
 		return;
 	}
 
-	leave_mountgroup(mg, mnterr);
+	gfs_leave_mountgroup(mg, mnterr);
 }
 
 static void receive_withdraw_ack(struct gfs_header *hd, int len)
diff --git a/group/gfs_controld/gfs_daemon.h b/group/gfs_controld/gfs_daemon.h
index fdfc603..d5ea341 100644
--- a/group/gfs_controld/gfs_daemon.h
+++ b/group/gfs_controld/gfs_daemon.h
@@ -126,12 +126,12 @@ struct mountgroup {
 	int			our_jid;
 	int			spectator;
 	int			ro;
-	int			rw;
 	int                     joining;
 	int                     leaving;
 	int			kernel_mount_error;
 	int			kernel_mount_done;
 	int			first_mounter;
+	int			no_mount_helper;
 
 	/* cpg-new stuff */
 
@@ -181,7 +181,7 @@ void process_mountgroups(void);
 int gfs_join_mountgroup(struct mountgroup *mg);
 void do_leave(struct mountgroup *mg, int mnterr);
 void gfs_mount_done(struct mountgroup *mg);
-void send_remount(struct mountgroup *mg, struct gfsc_mount_args *ma);
+void send_remount(struct mountgroup *mg, int ro);
 void send_withdraw(struct mountgroup *mg);
 int set_mountgroup_info(struct mountgroup *mg, struct gfsc_mountgroup *out);
 int set_node_info(struct mountgroup *mg, int nodeid, struct gfsc_node *node);
@@ -191,6 +191,7 @@ int set_mountgroup_nodes(struct mountgroup *mg, int option, int *node_count,
 void free_mg(struct mountgroup *mg);
 void node_history_cluster_add(int nodeid);
 void node_history_cluster_remove(int nodeid);
+void gfs_leave_mountgroup(struct mountgroup *mg, int mnterr);
 
 /* main.c */
 int do_read(int fd, void *buf, size_t count);
@@ -200,11 +201,10 @@ int client_add(int fd, void (*workfn)(int ci), void (*deadfn)(int ci));
 int client_fd(int ci);
 void client_ignore(int ci, int fd);
 void client_back(int ci, int fd);
-struct mountgroup *create_mg(char *name);
+struct mountgroup *create_mg(const char *name);
 struct mountgroup *find_mg(char *name);
 struct mountgroup *find_mg_id(uint32_t id);
 void client_reply_remount(struct mountgroup *mg, int ci, int result);
-void client_reply_join(int ci, struct gfsc_mount_args *ma, int result);
 void client_reply_join_full(struct mountgroup *mg, int result);
 void query_lock(void);
 void query_unlock(void);
diff --git a/group/gfs_controld/main.c b/group/gfs_controld/main.c
index e52f04b..4d753d1 100644
--- a/group/gfs_controld/main.c
+++ b/group/gfs_controld/main.c
@@ -148,10 +148,15 @@ static void sigterm_handler(int sig)
 	daemon_quit = 1;
 }
 
-struct mountgroup *create_mg(char *name)
+struct mountgroup *create_mg(const char *name)
 {
 	struct mountgroup *mg;
 
+	if (strlen(name) > GFS_MOUNTGROUP_LEN) {
+		log_error("create_mg: name %s too long", name);
+		return NULL;
+	}
+
 	mg = malloc(sizeof(struct mountgroup));
 	if (!mg)
 		return NULL;
@@ -170,6 +175,11 @@ struct mountgroup *find_mg(char *name)
 {
 	struct mountgroup *mg;
 
+	if (strlen(name) > GFS_MOUNTGROUP_LEN) {
+		log_error("find_mg: name %s too long", name);
+		return NULL;
+	}
+
 	list_for_each_entry(mg, &mountgroups, list) {
 		if ((strlen(mg->name) == strlen(name)) &&
 		    !strncmp(mg->name, name, strlen(name)))
@@ -198,6 +208,8 @@ enum {
 	Env_RECOVERY,
 	Env_FIRSTMOUNT,
 	Env_JID,
+	Env_SPECTATOR,
+	Env_RDONLY,
 	Env_Last, /* Flag for end of vars */
 };
 
@@ -210,6 +222,8 @@ static const char *uevent_vars[] = {
 	[Env_RECOVERY]		= "RECOVERY=",
 	[Env_FIRSTMOUNT]	= "FIRSTMOUNT=",
 	[Env_JID]		= "JID=",
+	[Env_SPECTATOR]		= "SPECTATOR=",
+	[Env_RDONLY]		= "RDONLY=",
 };
 
 /*
@@ -258,6 +272,101 @@ static char *uevent_fsname(const char *vals[])
 	return name;
 }
 
+/*
+ * Handle an "add" uevent for a lock_dlm filesystem. If mount.gfs2 has
+ * already set up the mount group, nothing is done. Otherwise we know that
+ * the mount helper doesn't exist, so a new group is created with the
+ * no_mount_helper flag set, to indicate that this mount will be
+ * administrated entirely via the uevent/sysfs interface, and joined.
+ */
+static void do_new_mount(const char *name, struct mountgroup *mg,
+			 const char *uevent_vals[])
+{
+	int rv;
+
+	/* Check the values parsed from this uevent (uevent_vals), not the
+	   uevent_vars table of variable names, before using them below */
+	if (!uevent_vals[Env_LOCKPROTO] ||
+	    !uevent_vals[Env_LOCKTABLE])
+		return;
+
+	/* We only care about lock_dlm mounts */
+	if (strcmp(uevent_vals[Env_LOCKPROTO], "lock_dlm") != 0)
+		return;
+
+	if (mg) {
+		/* Might have already been set up by mount.gfs2 */
+		if (mg->no_mount_helper == 0)
+			return;
+		log_error("do_new_mount: duplicate mount %s",
+			  uevent_vals[Env_LOCKTABLE]);
+		return;
+	}
+
+	mg = create_mg(name);
+	if (mg == NULL)
+		return;
+
+	mg->no_mount_helper = 1;
+
+	strncpy(mg->mount_args.type, uevent_vals[Env_SUBSYSTEM], PATH_MAX);
+	strncpy(mg->mount_args.proto, uevent_vals[Env_LOCKPROTO], PATH_MAX);
+	strncpy(mg->mount_args.table, uevent_vals[Env_LOCKTABLE], PATH_MAX);
+
+	if (uevent_vals[Env_SPECTATOR] &&
+	    strcmp(uevent_vals[Env_SPECTATOR], "1") == 0)
+		mg->spectator = 1;
+
+	if (uevent_vals[Env_RDONLY] &&
+	    strcmp(uevent_vals[Env_RDONLY], "1") == 0)
+		mg->ro = 1;
+
+	list_add(&mg->list, &mountgroups);
+	rv = gfs_join_mountgroup(mg);
+	if (rv) {
+		log_error("join: group join error %d", rv);
+		goto fail;
+	}
+	log_group(mg, "do_new_mount ci %d result %d first=%d:jid=%d",
+                  mg->mount_client, rv, mg->first_mounter, mg->our_jid);
+	return;
+
+fail:
+	list_del(&mg->list);
+	free(mg);
+}
+
+/*
+ * Handles the "online" uevent, which is seen after a successful mount
+ * and again after each successful remount. Mount groups which do not
+ * have the no_mount_helper flag set ignore it completely.
+ */
+static void do_online(struct mountgroup *mg, const char *uevent_vals[])
+{
+	const char *rdonly = uevent_vals[Env_RDONLY];
+
+	/* Nothing to do here when the mount helper is in use */
+	if (!mg->no_mount_helper)
+		return;
+
+	/* The first online event for a group means that the original
+	   kernel mount has succeeded: record that, call gfs_mount_done() */
+	if (mg->kernel_mount_done == 0) {
+		mg->mount_client = 0;
+		mg->kernel_mount_done = 1;
+		mg->kernel_mount_error = 0;
+		gfs_mount_done(mg);
+		return;
+	}
+
+	/* Any later online event is the result of a remount */
+
+	if (rdonly != NULL && strcmp(rdonly, "1") == 0)
+		send_remount(mg, 1);
+	else
+		send_remount(mg, 0);
+}
+
 static void process_uevent(int ci)
 {
 	struct mountgroup *mg;
@@ -302,6 +411,12 @@ static void process_uevent(int ci)
 	}
 
 	mg = find_mg(fsname);
+
+	if (!strcmp(uevent_vals[Env_ACTION], "add")) {
+		do_new_mount(fsname, mg, uevent_vals);
+		return;
+	}
+
 	if (!mg) {
 		log_error("mount group %s not found", fsname);
 		return;
@@ -314,6 +429,16 @@ static void process_uevent(int ci)
 
 		if (strcmp(uevent_vals[Env_SUBSYSTEM], "lock_dlm") == 0)
 			return;
+
+		/* Catch original mount failure */
+		if (mg->no_mount_helper && !mg->kernel_mount_done) {
+			mg->mount_client = 0;
+			mg->kernel_mount_done = 1;
+			mg->kernel_mount_error = -1;
+			gfs_mount_done(mg);
+			return;
+		}
+
 		do_leave(mg, 0);
 		return;
 	}
@@ -343,6 +468,9 @@ static void process_uevent(int ci)
 		return;
 	}
 
+	if (!strcmp(uevent_vals[Env_ACTION], "online"))
+		do_online(mg, uevent_vals);
+
 	if (!strcmp(uevent_vals[Env_ACTION], "offline"))
 		do_withdraw(mg);
 }
@@ -546,7 +674,7 @@ static void query_mountgroup_nodes(int fd, char *name, int option, int max)
 		free(nodes);
 }
 
-void client_reply_join(int ci, struct gfsc_mount_args *ma, int result)
+static void client_reply_join(int ci, struct gfsc_mount_args *ma, int result)
 {
 	char *name = strstr(ma->table, ":") + 1;
 
@@ -556,6 +684,36 @@ void client_reply_join(int ci, struct gfsc_mount_args *ma, int result)
 		 name, result, ma, sizeof(struct gfsc_mount_args));
 }
 
+/*
+ * Report a join result via sysfs: an error result, or 0 for a spectator,
+ * is written to "jid"; otherwise "first" and then "jid" are written.
+ */
+static void client_sysfs_join(struct mountgroup *mg, int result)
+{
+	int err;
+
+	if (result != 0) {
+		err = set_sysfs(mg, "jid", result);
+		if (err)
+			log_error("join: error %d returning result %d", err, result);
+		return;
+	}
+
+	if (mg->spectator != 0) {
+		err = set_sysfs(mg, "jid", 0);
+		if (err)
+			log_error("join: error setting jid %d", err);
+		return;
+	}
+
+	err = set_sysfs(mg, "first", mg->first_mounter);
+	if (err)
+		log_error("join: error setting first %d", err);
+	err = set_sysfs(mg, "jid", mg->our_jid);
+	if (err)
+		log_error("join: error setting jid %d", err);
+}
+
 void client_reply_join_full(struct mountgroup *mg, int result)
 {
 	char nodir_str[32];
@@ -582,7 +740,10 @@ void client_reply_join_full(struct mountgroup *mg, int result)
 	log_group(mg, "client_reply_join_full ci %d result %d %s",
 		  mg->mount_client, result, mg->mount_args.hostdata);
 
-	client_reply_join(mg->mount_client, &mg->mount_args, result);
+	if (mg->no_mount_helper)
+		client_sysfs_join(mg, result);
+	else
+		client_reply_join(mg->mount_client, &mg->mount_args, result);
 }
 
 static void do_join(int ci, struct gfsc_mount_args *ma)
@@ -624,11 +785,6 @@ static void do_join(int ci, struct gfsc_mount_args *ma)
 	name++;
 	cluster = table2;
 
-	if (strlen(name) > GFS_MOUNTGROUP_LEN) {
-		rv = -ENAMETOOLONG;
-		goto fail;
-	}
-
 	mg = find_mg(name);
 	if (mg) {
 		if (strcmp(mg->mount_args.dev, ma->dev)) {
@@ -678,9 +834,7 @@ static void do_join(int ci, struct gfsc_mount_args *ma)
 		}
 	}
 
-	if (!mg->spectator && strstr(ma->options, "rw"))
-		mg->rw = 1;
-	else if (strstr(ma->options, "ro")) {
+	if (strstr(ma->options, "ro")) {
 		if (mg->spectator) {
 			log_error("join: readonly invalid with spectator");
 			rv = -EROFS;
@@ -778,6 +932,7 @@ static void do_remount(int ci, struct gfsc_mount_args *ma)
 		goto out;
 	}
 
+	/* FIXME: Should allow remounts */
 	if (mg->spectator) {
 		log_error("remount of spectator not allowed");
 		result = -1;
@@ -787,11 +942,8 @@ static void do_remount(int ci, struct gfsc_mount_args *ma)
 	if (!strcmp(ma->options, "ro"))
 		ro = 1;
 
-	if ((mg->ro && ro) || (!mg->ro && !ro))
-		goto out;
-
-	send_remount(mg, ma);
- out:
+	send_remount(mg, ro);
+out:
 	client_reply_remount(mg, ci, result);
 }
 
diff --git a/group/gfs_controld/util.c b/group/gfs_controld/util.c
index c2cb741..5b2e45d 100644
--- a/group/gfs_controld/util.c
+++ b/group/gfs_controld/util.c
@@ -68,27 +68,44 @@ int set_sysfs(struct mountgroup *mg, const char *field, int val)
 	sprintf(out, "%d", val);
 
 	rv = write(fd, out, strlen(out));
-
+	if (rv < 0)
+		log_group(mg, "set write %s error %d", fname, errno);
 	close(fd);
 
-	if (rv)
-		rv = 0;
-	return rv;
+	return 0;
 }
 
 int run_dmsetup_suspend(struct mountgroup *mg, char *dev)
 {
 	struct sched_param sched_param;
-	char buf[PATH_MAX];
+	char fname[PATH_MAX];
+	char smajor[16];
+	char sminor[16];
 	pid_t pid;
 	int i, rv;
+	int major, minor;
+	FILE *fp;
 
-	memset(buf, 0, sizeof(buf));
-	rv = readlink(dev, buf, PATH_MAX);
-	if (rv < 0)
-		strncpy(buf, dev, sizeof(buf));
+	snprintf(fname, PATH_MAX, "%s/%s/%s/device/dev",
+		 SYSFS_DIR, mg->mount_args.type, mg->mount_args.table);
+
+	fp = fopen(fname, "r");
+	if (fp == NULL) {
+		log_group(mg, "set open %s error %d", fname, errno);
+		return -1;
+	}
+
+	if (fscanf(fp, "%d:%d", &major, &minor) != 2) {
+		log_group(mg, "cannot read device numbers %d", errno);
+		return -1;
+	}
+
+	fclose(fp);
+
+	log_group(mg, "run_dmsetup_suspend %d:%d", major, minor);
 
-	log_group(mg, "run_dmsetup_suspend %s (orig %s)", buf, dev);
+	snprintf(smajor, 16, "%d", major);
+	snprintf(sminor, 16, "%d", minor);
 
 	pid = fork();
 	if (pid < 0)
@@ -106,7 +123,7 @@ int run_dmsetup_suspend(struct mountgroup *mg, char *dev)
 			close(i);
 
 		execlp("dmsetup", "dmsetup", "suspend",  "--nolockfs",
-		       "--noflush", buf, NULL);
+		       "--noflush", "-j", smajor, "-m", sminor, NULL);
 		exit(EXIT_FAILURE);
 	}
 	return -1;