[Cluster-devel] cluster/group/gfs_controld lock_dlm.h main.c r ...

teigland at sourceware.org teigland at sourceware.org
Tue Dec 5 22:24:38 UTC 2006


CVSROOT:	/cvs/cluster
Module name:	cluster
Branch: 	RHEL50
Changes by:	teigland at sourceware.org	2006-12-05 22:24:37

Modified files:
	group/gfs_controld: lock_dlm.h main.c recover.c 

Log message:
	Before doing the mount-group portion of withdraw, fork off a dmsetup to
	suspend the fs device.  This means gfs doesn't need to call dm_suspend()
	in the kernel before calling out to us.  The suspend waits for all
	outstanding i/o to return on the device, which is necessary prior to
	telling other nodes to do recovery.  (Later we should probably swap
	in an error table and resume the device.)
	bz 215962

Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/gfs_controld/lock_dlm.h.diff?cvsroot=cluster&only_with_tag=RHEL50&r1=1.21.4.2&r2=1.21.4.3
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/gfs_controld/main.c.diff?cvsroot=cluster&only_with_tag=RHEL50&r1=1.18.4.7&r2=1.18.4.8
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/gfs_controld/recover.c.diff?cvsroot=cluster&only_with_tag=RHEL50&r1=1.23&r2=1.23.4.1

--- cluster/group/gfs_controld/lock_dlm.h	2006/11/21 17:28:52	1.21.4.2
+++ cluster/group/gfs_controld/lock_dlm.h	2006/12/05 22:24:37	1.21.4.3
@@ -30,6 +30,7 @@
 #include <sys/un.h>
 #include <sys/types.h>
 #include <sys/stat.h>
+#include <sys/wait.h>
 #include <sys/errno.h>
 #include <linux/netlink.h>
 
@@ -125,6 +126,7 @@
 	char			type[5];
 	char			dir[PATH_MAX+1];
 	char			options[MAX_OPTIONS_LEN+1];
+	char			dev[PATH_MAX+1];
 
 	int			last_stop;
 	int			last_start;
@@ -167,6 +169,8 @@
 	int			readonly;
 	int			rw;
 	int			withdraw;
+	int			dmsetup_wait;
+	pid_t			dmsetup_pid;
 
 	struct list_head	saved_messages;
 	void			*start2_fn;
@@ -269,7 +273,7 @@
 void exit_cman(void);
 
 int do_mount(int ci, char *dir, char *type, char *proto, char *table,
-	     char *options, struct mountgroup **mg_ret);
+	     char *options, char *dev, struct mountgroup **mg_ret);
 int do_unmount(int ci, char *dir, int mnterr);
 int do_remount(int ci, char *dir, char *mode);
 int do_withdraw(char *name);
@@ -289,5 +293,6 @@
 void process_saved_plocks(struct mountgroup *mg);
 void purge_plocks(struct mountgroup *mg, int nodeid, int unmount);
 int unlink_checkpoint(struct mountgroup *mg);
+void update_dmsetup_wait(void);
 
 #endif
--- cluster/group/gfs_controld/main.c	2006/12/05 17:26:52	1.18.4.7
+++ cluster/group/gfs_controld/main.c	2006/12/05 22:24:37	1.18.4.8
@@ -41,6 +41,7 @@
 int no_withdraw;
 int no_plock;
 uint32_t plock_rate_limit = DEFAULT_PLOCK_RATE_LIMIT;
+int dmsetup_wait;
 
 int do_read(int fd, void *buf, size_t count)
 {
@@ -285,14 +286,16 @@
 
 	log_debug("client %d: %s", ci, buf);
 
-	get_args(buf, &argc, argv, ' ', 6);
+	get_args(buf, &argc, argv, ' ', 7);
 	cmd = argv[0];
 	rv = 0;
 
 	if (!strcmp(cmd, "join")) {
-		/* ci, dir, type, proto, table, extra */
+		/* ci, dir (mountpoint), type (gfs/gfs2), proto (lock_dlm),
+		   table (fsname:clustername), extra (rw), dev (/dev/sda1) */
+
 		rv = do_mount(ci, argv[1], argv[2], argv[3], argv[4], argv[5],
-			      &mg);
+			      argv[6], &mg);
 		fd = client[ci].fd;
 		fcntl(fd, F_SETFL, fcntl(fd, F_GETFL, 0) | O_NONBLOCK);
 		if (!rv) {
@@ -533,6 +536,17 @@
 					poll_timeout = -1;
 				}
 			}
+
+			if (dmsetup_wait) {
+				update_dmsetup_wait();
+				if (dmsetup_wait) {
+					if (poll_timeout == -1)
+						poll_timeout = 1000;
+				} else {
+					if (poll_timeout == 1000)
+						poll_timeout = -1;
+				}
+			}
 		}
 	}
 	rv = 0;
--- cluster/group/gfs_controld/recover.c	2006/10/23 15:44:33	1.23
+++ cluster/group/gfs_controld/recover.c	2006/12/05 22:24:37	1.23.4.1
@@ -19,6 +19,7 @@
 extern int our_nodeid;
 extern group_handle_t gh;
 extern int no_withdraw;
+extern int dmsetup_wait;
 
 struct list_head mounts;
 struct list_head withdrawn_mounts;
@@ -1493,15 +1494,15 @@
 }
 
 int do_mount(int ci, char *dir, char *type, char *proto, char *table,
-	     char *options, struct mountgroup **mg_ret)
+	     char *options, char *dev, struct mountgroup **mg_ret)
 {
 	struct mountgroup *mg;
 	char table2[MAXLINE];
 	char *cluster = NULL, *name = NULL;
 	int rv;
 
-	log_debug("mount: %s %s %s %s %s",
-		  dir, type, proto, table, options);
+	log_debug("mount: %s %s %s %s %s %s",
+		  dir, type, proto, table, options, dev);
 
 	if (strcmp(proto, "lock_dlm")) {
 		log_error("mount: lockproto %s not supported", proto);
@@ -1554,6 +1555,7 @@
 	strncpy(mg->type, type, sizeof(mg->type));
 	strncpy(mg->table, table, sizeof(mg->table));
 	strncpy(mg->options, options, sizeof(mg->options));
+	strncpy(mg->dev, dev, sizeof(mg->dev));
 
 	if (strlen(cluster) != strlen(clustername) ||
 	    strlen(cluster) == 0 || strcmp(cluster, clustername)) {
@@ -2478,6 +2480,41 @@
 	return 0;
 }
 
+/* Fork a "dmsetup suspend" of the fs device so all outstanding i/o is
+   flushed before withdraw recovery proceeds.  The parent records the
+   child pid on the mountgroup and returns immediately; the exit status
+   is collected later by update_dmsetup_wait().
+   Returns 0 on successful fork, -1 on fork failure. */
+static int run_dmsetup_suspend(struct mountgroup *mg, char *dev)
+{
+	struct sched_param sched_param;
+	char buf[PATH_MAX];
+	pid_t pid;
+	int i, rv;
+
+	/* Resolve a possible symlink (e.g. an lvm name) to the underlying
+	   device node; fall back to the name as given.  Reserve one byte
+	   so buf is always NUL-terminated: readlink() does not terminate,
+	   and strncpy() does not terminate when src fills the buffer.
+	   NOTE(review): readlink may yield a path relative to the
+	   symlink's directory; assumed usable as-is here — confirm
+	   callers pass absolute /dev names. */
+	memset(buf, 0, sizeof(buf));
+	rv = readlink(dev, buf, sizeof(buf) - 1);
+	if (rv < 0) {
+		strncpy(buf, dev, sizeof(buf) - 1);
+		buf[sizeof(buf) - 1] = '\0';
+	}
+
+	log_group(mg, "run_dmsetup_suspend %s (orig %s)", buf, dev);
+
+	pid = fork();
+	if (pid < 0)
+		return -1;
+
+	if (pid) {
+		/* parent: remember the child so update_dmsetup_wait()
+		   can reap it from the main poll loop */
+		mg->dmsetup_wait = 1;
+		mg->dmsetup_pid = pid;
+		return 0;
+	} else {
+		/* child: drop any elevated scheduling inherited from the
+		   daemon before exec'ing dmsetup */
+		sched_param.sched_priority = 0;
+		sched_setscheduler(0, SCHED_OTHER, &sched_param);
+
+		/* close inherited descriptors */
+		for (i = 0; i < 50; i++)
+			close(i);
+
+		execlp("dmsetup", "dmsetup", "suspend", buf, NULL);
+		exit(EXIT_FAILURE);
+	}
+	return -1;
+}
+
 /* The basic rule of withdraw is that we don't want to tell the kernel to drop
    all locks until we know gfs has been stopped/blocked on all nodes.  They'll
    be stopped for our leave, we just need to know when they've all arrived
@@ -2492,6 +2529,7 @@
 {
 	struct mountgroup *mg;
 	char *name = strstr(table, ":") + 1;
+	int rv;
 
 	if (no_withdraw) {
 		log_error("withdraw feature not enabled");
@@ -2504,8 +2542,66 @@
 		return -1;
 	}
 
-	mg->withdraw = 1;
-	send_withdraw(mg);
+	rv = run_dmsetup_suspend(mg, mg->dev);
+	if (rv) {
+		log_error("do_withdraw %s: dmsetup %s error %d", mg->name,
+			  mg->dev, rv);
+		return -1;
+	}
+
+	dmsetup_wait = 1;
 	return 0;
 }
 
+/* Record completion of the forked dmsetup child for this mountgroup.
+   On a clean result (rv == 0) the withdraw sequence proceeds by
+   notifying the other nodes; on failure the withdraw is not sent. */
+void dmsetup_suspend_done(struct mountgroup *mg, int rv)
+{
+	log_group(mg, "dmsetup_suspend_done result %d", rv);
+
+	mg->dmsetup_pid = 0;
+	mg->dmsetup_wait = 0;
+
+	if (rv)
+		return;
+
+	mg->withdraw = 1;
+	send_withdraw(mg);
+}
+
+/* Reap any finished "dmsetup suspend" children.  Called from the main
+   poll loop while the global dmsetup_wait flag is set; clears that
+   flag once no mountgroup is still waiting on a dmsetup child. */
+void update_dmsetup_wait(void)
+{
+	struct mountgroup *mg;
+	pid_t pid;
+	int status, rv;
+	int still_waiting = 0;
+
+	list_for_each_entry(mg, &mounts, list) {
+		if (!mg->dmsetup_wait)
+			continue;
+
+		pid = waitpid(mg->dmsetup_pid, &status, WNOHANG);
+		if (pid == 0) {
+			/* child hasn't exited yet; check again on the
+			   next poll timeout */
+			still_waiting++;
+			continue;
+		}
+
+		if (pid < 0) {
+			log_error("update_dmsetup_wait %s: waitpid %d "
+				  "error %d", mg->name,
+				  mg->dmsetup_pid, errno);
+			dmsetup_suspend_done(mg, -2);
+			continue;
+		}
+
+		/* child exited: success only on a clean zero exit */
+		rv = (WIFEXITED(status) && !WEXITSTATUS(status)) ? 0 : -1;
+		dmsetup_suspend_done(mg, rv);
+	}
+
+	if (!still_waiting) {
+		dmsetup_wait = 0;
+		log_debug("dmsetup_wait off");
+	}
+}
+




More information about the Cluster-devel mailing list