[dm-devel] Re: [PATCH] locking update for VFS locking patch

Thu Feb 26 18:22:01 UTC 2004

On Thursday 26 February 2004 3:41 pm, Chris Mason wrote:
> On Thu, 2004-02-26 at 15:43, Kevin Corry wrote:
> > I've been trying to test out the VFS-lock patch, but haven't been having
> > any luck with it. I applied it to a clean 2.6.3 kernel (along with the
> > kgdb patch from -mm), and it consistently BUGs when it tries to mount the
> > root filesystem. I've captured the console output through gdb and
> > attached it below, along with the kernel config. My root filesystem is
> > ext3 if that helps.
> >
> > I'll keep hunting for the problem, but I figured I'd send this
> > information to you to start with.
>
> This one should work better for you.

I've completed some simple tests (just light loads so far) with the new
VFS-lock patch, and things seem to be working correctly. I've included
the patch below, along with the proposed changes to dm.c to actually call
the APIs.

-- 
Kevin Corry
kevcorry at us.ibm.com
http://evms.sourceforge.net/


VFS-Lock patch.

--- diff/drivers/md/dm.c	2004-02-25 16:19:20.000000000 -0600
+++ source/drivers/md/dm.c	2004-02-26 16:46:04.000000000 -0600
@@ -12,6 +12,7 @@
 #include <linux/moduleparam.h>
 #include <linux/blkpg.h>
 #include <linux/bio.h>
+#include <linux/buffer_head.h>
 #include <linux/mempool.h>
 #include <linux/slab.h>
 
@@ -46,6 +47,7 @@
  */
 #define DMF_BLOCK_IO 0
 #define DMF_SUSPENDED 1
+#define DMF_FS_LOCKED 2
 
 struct mapped_device {
 	struct rw_semaphore lock;
@@ -826,6 +828,24 @@
 	return 0;
 }
 
+static void __lock_disk(struct gendisk *disk)
+{
+	struct block_device *bdev = bdget_disk(disk, 0);
+	if (bdev) {
+		fsync_bdev_lockfs(bdev);
+		bdput(bdev);
+	}
+}
+
+static void __unlock_disk(struct gendisk *disk)
+{
+	struct block_device *bdev = bdget_disk(disk, 0);
+	if (bdev) {
+		unlockfs(bdev);
+		bdput(bdev);
+	}
+}
+
 /*
  * We need to be able to change a mapping table under a mounted
  * filesystem.  For example we might want to move some data in
@@ -837,12 +857,23 @@
 {
 	DECLARE_WAITQUEUE(wait, current);
 
-	down_write(&md->lock);
+	/* Flush I/O to the device. */
+	down_read(&md->lock);
+	if (test_bit(DMF_BLOCK_IO, &md->flags)) {
+		up_read(&md->lock);
+		return -EINVAL;
+	}
+
+	if (!test_and_set_bit(DMF_FS_LOCKED, &md->flags)) {
+		__lock_disk(md->disk);
+	}
+	up_read(&md->lock);
 
 	/*
 	 * First we set the BLOCK_IO flag so no more ios will be
 	 * mapped.
 	 */
+	down_write(&md->lock);
 	if (test_bit(DMF_BLOCK_IO, &md->flags)) {
 		up_write(&md->lock);
 		return -EINVAL;
@@ -892,11 +923,13 @@
 	dm_table_resume_targets(md->map);
 	clear_bit(DMF_SUSPENDED, &md->flags);
 	clear_bit(DMF_BLOCK_IO, &md->flags);
+	clear_bit(DMF_FS_LOCKED, &md->flags);
 
 	def = bio_list_get(&md->deferred);
 	__flush_deferred_io(md, def);
 	up_write(&md->lock);
 
+	__unlock_disk(md->disk);
 	blk_run_queues();
 
 	return 0;
--- diff/fs/block_dev.c	2004-02-26 15:50:53.000000000 -0600
+++ source/fs/block_dev.c	2004-02-26 15:50:41.000000000 -0600
@@ -242,6 +242,7 @@
 	{
 		memset(bdev, 0, sizeof(*bdev));
 		sema_init(&bdev->bd_sem, 1);
+		sema_init(&bdev->bd_mount_sem, 1);
 		INIT_LIST_HEAD(&bdev->bd_inodes);
 		INIT_LIST_HEAD(&bdev->bd_list);
 		inode_init_once(&ei->vfs_inode);
--- diff/fs/buffer.c	2004-02-18 10:39:07.000000000 -0600
+++ source/fs/buffer.c	2004-02-26 10:59:52.000000000 -0600
@@ -259,6 +259,17 @@
 	return sync_blockdev(bdev);
 }
 
+int fsync_bdev_lockfs(struct block_device *bdev)
+{
+	int res;
+	res = fsync_bdev(bdev);
+	if (res)
+		return res;
+	sync_super_lockfs(bdev);
+	return sync_blockdev(bdev);
+}
+EXPORT_SYMBOL(fsync_bdev_lockfs);
+
 /*
  * sync everything.  Start out by waking pdflush, because that writes back
  * all queues in parallel.
--- diff/fs/reiserfs/super.c	2004-02-17 21:57:47.000000000 -0600
+++ source/fs/reiserfs/super.c	2004-02-26 10:59:51.000000000 -0600
@@ -82,7 +82,7 @@
     reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1);
     journal_mark_dirty(&th, s, SB_BUFFER_WITH_SB (s));
     reiserfs_block_writes(&th) ;
-    journal_end(&th, s, 1) ;
+    journal_end_sync(&th, s, 1) ;
   }
   s->s_dirt = dirty;
   reiserfs_write_unlock(s);
--- diff/fs/super.c	2004-02-18 10:39:09.000000000 -0600
+++ source/fs/super.c	2004-02-26 10:59:52.000000000 -0600
@@ -293,6 +293,62 @@
 }
 
 /*
+ * triggered by the device mapper code to lock a filesystem and force
+ * it into a consistent state.
+ *
+ * This takes the block device bd_mount_sem to make sure no new mounts
+ * happen on bdev until unlockfs is called.  If a super is found on this
+ * block device, we hould a read lock on the s->s_umount sem to make sure
+ * nobody unmounts until the snapshot creation is done
+ */
+void sync_super_lockfs(struct block_device *bdev) 
+{
+	struct super_block *sb;
+	down(&bdev->bd_mount_sem);
+	sb = get_super(bdev);
+	if (sb) {
+		lock_super(sb);
+		if (sb->s_dirt && sb->s_op->write_super)
+			sb->s_op->write_super(sb);
+		if (sb->s_op->write_super_lockfs)
+			sb->s_op->write_super_lockfs(sb);
+		unlock_super(sb);
+	}
+	/* unlockfs releases s->s_umount and bd_mount_sem */
+}
+
+void unlockfs(struct block_device *bdev)
+{
+	struct list_head *p;
+	/*
+	 * copied from get_super, but we need to
+	 * do special things since lockfs left the
+	 * s_umount sem held
+	 */
+	spin_lock(&sb_lock);
+	list_for_each(p, &super_blocks) {
+		struct super_block *s = sb_entry(p);
+		/*
+		 * if there is a super for this block device
+		 * in the list, get_super must have found it
+		 * during sync_super_lockfs, so our drop_super
+		 * will drop the reference created there.
+		 */
+		if (s->s_bdev == bdev && s->s_root) {
+			spin_unlock(&sb_lock);
+			if (s->s_op->unlockfs)
+				s->s_op->unlockfs(s);
+			drop_super(s);
+			goto unlock;
+		}
+	}
+	spin_unlock(&sb_lock);
+unlock:
+	up(&bdev->bd_mount_sem);
+}
+EXPORT_SYMBOL(unlockfs);
+
+/*
  * Note: check the dirty flag before waiting, so we don't
  * hold up the sync while mounting a device. (The newly
  * mounted device won't need syncing.)
@@ -613,7 +669,14 @@
 	if (IS_ERR(bdev))
 		return (struct super_block *)bdev;
 
+	/*
+	 * once the super is inserted into the list by sget, s_umount
+	 * will protect the lockfs code from trying to start a snapshot
+	 * while we are mounting
+	 */
+	down(&bdev->bd_mount_sem);
 	s = sget(fs_type, test_bdev_super, set_bdev_super, bdev);
+	up(&bdev->bd_mount_sem);
 	if (IS_ERR(s))
 		goto out;
 
--- diff/include/linux/buffer_head.h	2004-02-17 21:57:12.000000000 -0600
+++ source/include/linux/buffer_head.h	2004-02-26 10:59:52.000000000 -0600
@@ -164,6 +164,8 @@
 wait_queue_head_t *bh_waitq_head(struct buffer_head *bh);
 void wake_up_buffer(struct buffer_head *bh);
 int fsync_bdev(struct block_device *);
+int fsync_bdev_lockfs(struct block_device *);
+void unlockfs(struct block_device *);
 int fsync_super(struct super_block *);
 int fsync_no_super(struct block_device *);
 struct buffer_head *__find_get_block(struct block_device *, sector_t, int);
--- diff/include/linux/fs.h	2004-02-18 10:39:10.000000000 -0600
+++ source/include/linux/fs.h	2004-02-26 10:59:52.000000000 -0600
@@ -346,6 +346,7 @@
 	struct inode *		bd_inode;	/* will die */
 	int			bd_openers;
 	struct semaphore	bd_sem;	/* open/close mutex */
+	struct semaphore	bd_mount_sem;	/* mount mutex */
 	struct list_head	bd_inodes;
 	void *			bd_holder;
 	int			bd_holders;
@@ -1221,6 +1222,7 @@
 extern int filemap_fdatawait(struct address_space *);
 extern int filemap_write_and_wait(struct address_space *mapping);
 extern void sync_supers(void);
+extern void sync_super_lockfs(struct block_device *);
 extern void sync_filesystems(int wait);
 extern void emergency_sync(void);
 extern void emergency_remount(void);