[dm-devel] [PATCH 12 of 15] md separate meta and data devs

Jonathan Brassow jbrassow at redhat.com
Fri Dec 3 19:55:42 UTC 2010


Patch name: md-separate-meta-and-data-devs.patch

Allow the metadata to be on a separate device from the
data.

This doesn't mean the data and metadata will be on separate
physical devices - it simply gives device-mapper and userspace
tools more flexibility.

RFC-by: Jonathan Brassow <jbrassow at redhat.com>

Index: linux-2.6/drivers/md/bitmap.c
===================================================================
--- linux-2.6.orig/drivers/md/bitmap.c
+++ linux-2.6/drivers/md/bitmap.c
@@ -263,14 +263,18 @@ static mdk_rdev_t *next_active_rdev(mdk_
 static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait)
 {
 	mdk_rdev_t *rdev = NULL;
+	struct block_device *bdev;
 	mddev_t *mddev = bitmap->mddev;
 
 	while ((rdev = next_active_rdev(rdev, mddev)) != NULL) {
 		int size = PAGE_SIZE;
 		loff_t offset = mddev->bitmap_info.offset;
+
+		bdev = (rdev->meta_bdev) ? rdev->meta_bdev : rdev->bdev;
+
 		if (page->index == bitmap->file_pages-1)
 			size = roundup(bitmap->last_page_size,
-				       bdev_logical_block_size(rdev->bdev));
+				       bdev_logical_block_size(bdev));
 		/* Just make sure we aren't corrupting data or
 		 * metadata
 		 */
Index: linux-2.6/drivers/md/md.c
===================================================================
--- linux-2.6.orig/drivers/md/md.c
+++ linux-2.6/drivers/md/md.c
@@ -707,6 +707,20 @@ static struct mdk_personality *find_pers
 static inline sector_t calc_dev_sboffset(mdk_rdev_t *rdev)
 {
 	sector_t num_sectors = i_size_read(rdev->bdev->bd_inode) / 512;
+
+	if (rdev->meta_bdev)
+		return 0;
+
+	return MD_NEW_SIZE_SECTORS(num_sectors);
+}
+
+static inline sector_t calc_dev_sectors(mdk_rdev_t *rdev)
+{
+	sector_t num_sectors = i_size_read(rdev->bdev->bd_inode) / 512;
+
+	if (rdev->meta_bdev)
+		return num_sectors;
+
 	return MD_NEW_SIZE_SECTORS(num_sectors);
 }
 
@@ -764,7 +778,7 @@ void md_super_write(mddev_t *mddev, mdk_
 	 */
 	struct bio *bio = bio_alloc_mddev(GFP_NOIO, 1, mddev);
 
-	bio->bi_bdev = rdev->bdev;
+	bio->bi_bdev = (rdev->meta_bdev) ? rdev->meta_bdev : rdev->bdev;
 	bio->bi_sector = sector;
 	bio_add_page(bio, page, size, 0);
 	bio->bi_private = rdev;
@@ -802,7 +816,8 @@ int sync_page_io(mdk_rdev_t *rdev, secto
 
 	rw |= REQ_SYNC | REQ_UNPLUG;
 
-	bio->bi_bdev = rdev->bdev;
+	bio->bi_bdev = (metadata_op && rdev->meta_bdev) ?
+		rdev->meta_bdev : rdev->bdev;
 	bio->bi_sector = sector;
 	bio_add_page(bio, page, size, 0);
 	init_completion(&event);
@@ -820,6 +835,7 @@ EXPORT_SYMBOL_GPL(sync_page_io);
 static int read_disk_sb(mdk_rdev_t * rdev, int size)
 {
 	char b[BDEVNAME_SIZE];
+
 	if (!rdev->sb_page) {
 		MD_BUG();
 		return -EINVAL;
@@ -1678,7 +1694,7 @@ super_1_rdev_size_change(mdk_rdev_t *rde
 	sector_t max_sectors;
 	if (num_sectors && num_sectors < rdev->mddev->dev_sectors)
 		return 0; /* component must fit device */
-	if (rdev->sb_start < rdev->data_offset) {
+	if (rdev->meta_bdev || rdev->sb_start < rdev->data_offset) {
 		/* minor versions 1 and 2; superblock before data */
 		max_sectors = i_size_read(rdev->bdev->bd_inode) >> 9;
 		max_sectors -= rdev->data_offset;
@@ -1769,6 +1785,7 @@ int md_integrity_register(mddev_t *mddev
 		 * If at least one rdev is not integrity capable, we can not
 		 * enable data integrity for the md device.
 		 */
+		/* FIXME (brassow): check both [meta_]bdev ? */
 		if (!bdev_get_integrity(rdev->bdev))
 			return -EINVAL;
 		if (!reference) {
@@ -1935,6 +1952,8 @@ static int lock_rdev(mdk_rdev_t *rdev, d
 	struct block_device *bdev;
 	char b[BDEVNAME_SIZE];
 
+	/* FIXME (brassow): [un]lock both bdev and meta_bdev? */
+
 	bdev = open_by_devnum(dev, FMODE_READ|FMODE_WRITE);
 	if (IS_ERR(bdev)) {
 		printk(KERN_ERR "md: could not open %s.\n",
@@ -1957,6 +1976,8 @@ static int lock_rdev(mdk_rdev_t *rdev, d
 static void unlock_rdev(mdk_rdev_t *rdev)
 {
 	struct block_device *bdev = rdev->bdev;
+
+	/* FIXME brassow: end here on 'bdev' search */
 	rdev->bdev = NULL;
 	if (!bdev)
 		MD_BUG();
@@ -4434,7 +4455,18 @@ int md_run(mddev_t *mddev)
 		 * We don't want the data to overlap the metadata,
 		 * Internal Bitmap issues have been handled elsewhere.
 		 */
-		if (rdev->data_offset < rdev->sb_start) {
+		if (rdev->meta_bdev) {
+			/* Metadata is on a separate device */
+			if (rdev->data_offset) {
+				printk(KERN_ERR "md: data_offset should be 0\n");
+				return -EINVAL;
+			}
+
+			if (rdev->sb_start) {
+				printk(KERN_ERR "md: sb_start should be 0\n");
+				return -EINVAL;
+			}
+		} else if (rdev->data_offset < rdev->sb_start) {
 			if (mddev->dev_sectors &&
 			    rdev->data_offset + mddev->dev_sectors
 			    > rdev->sb_start) {
@@ -5240,7 +5272,7 @@ static int add_new_disk(mddev_t * mddev,
 			rdev->sb_start = i_size_read(rdev->bdev->bd_inode) / 512;
 		} else
 			rdev->sb_start = calc_dev_sboffset(rdev);
-		rdev->sectors = rdev->sb_start;
+		rdev->sectors = calc_dev_sectors(rdev);
 
 		err = bind_rdev_to_array(rdev, mddev);
 		if (err) {
@@ -5310,7 +5342,7 @@ static int hot_add_disk(mddev_t * mddev,
 	else
 		rdev->sb_start = i_size_read(rdev->bdev->bd_inode) / 512;
 
-	rdev->sectors = rdev->sb_start;
+	rdev->sectors = calc_dev_sectors(rdev);
 
 	if (test_bit(Faulty, &rdev->flags)) {
 		printk(KERN_WARNING 
@@ -5519,7 +5551,6 @@ static int update_size(mddev_t *mddev, s
 	 * sb_start or, if that is <data_offset, it must fit before the size
 	 * of each device.  If num_sectors is zero, we find the largest size
 	 * that fits.
-
 	 */
 	if (mddev->sync_thread)
 		return -EBUSY;
Index: linux-2.6/drivers/md/md.h
===================================================================
--- linux-2.6.orig/drivers/md/md.h
+++ linux-2.6/drivers/md/md.h
@@ -60,6 +60,12 @@ struct mdk_rdev_s
 	mddev_t *mddev;			/* RAID array if running */
 	int last_events;		/* IO event timestamp */
 
+	/*
+	 * If meta_bdev is non-NULL, it means that a separate device is
+	 * being used to store the metadata (superblock/bitmap) which
+	 * would otherwise be contained on the same device as the data (bdev).
+	 */
+	struct block_device *meta_bdev;
 	struct block_device *bdev;	/* block device handle */
 
 	struct page	*sb_page;




More information about the dm-devel mailing list