[dm-devel] [PATCH] reworked dm-switch target

Jim Ramsay jim_ramsay at dell.com
Fri Sep 7 19:58:53 UTC 2012


On Wed, Aug 29, 2012 at 08:51:06PM -0400, Mikulas Patocka wrote:
> Basically, the worst performance brake is the spinlock in your code, 
> otherwise there is not much difference. vmalloc doesn't slow it down 
> (actually it was slightly faster with vmalloc than with kmalloc).

Thanks very much for your performance testing.  I've completed my
testing too, and my results agree with yours.  When running against our
actual iSCSI storage solution, the IO performance for this vmalloc-based
code is equivalent to the 2-step kmalloc code.

I also did a side-by-side comparison of uploading 1048576 page table
entries.  The netlink code averaged out to 0.0026s, and the DM message
code averaged out to 0.042s.  As expected, the message code is slower,
but well within reasonable performance expectations.

I am attaching below an updated version of your 'dm-switch.c' file,
based on your latest post in
http://www.redhat.com/archives/dm-devel/2012-August/msg00224.html that
makes the following changes:

 1. Support for FLUSH and DISCARD operations by implementing
    target_type.iterate_devices and handling (bio->bi_rw & REQ_FLUSH) in
    switch_map.  Sends DISCARD to one path, FLUSH to each path.

 2. Send IOCTLs to the device who owns sector 0, instead of
    pctx->dev_list[0]

 3. Copyright notice update in header, plus adding myself to
    MODULE_AUTHOR

(Question: Would you prefer these as a patch series against your dm-switch.c
instead?)

------ dm-switch.c ------
/*
 * Copyright (c) 2010-2012 by Dell Inc.  All rights reserved.
 *
 * This file is released under the GPL.
 *
 * Description:
 *
 *     file:    dm-switch.c
 *     authors: Kevin_OKelley at dell.com
 *              Jim_Ramsay at dell.com
 *              Narendran_Ganapathy at dell.com
 *		mpatocka at redhat.com
 *
 * This file implements a "switch" target which efficiently implements a
 * mapping of IOs to underlying block devices in scenarios where there are:
 *   (1) a large number of address regions
 *   (2) a fixed size equal across all address regions
 *   (3) no pattern than allows for a compact description with something like
 *       the dm-stripe target.
 */

#include <linux/module.h>
#include <linux/init.h>
#include <linux/device-mapper.h>
#include <linux/vmalloc.h>

#define DM_MSG_PREFIX "switch"

/*
 * Switch device context block: A new one is created for each dm device.
 * Contains an array of devices from which we have taken references.
 */
struct switch_dev {
	struct dm_dev *dmdev;
	sector_t start;
};

typedef unsigned long pt_entry;

/* Switch context header */
struct switch_ctx {
	unsigned dev_count;		/* Number of devices */
	unsigned page_size;		/* Page size in 512B sectors */
	unsigned long n_pages;		/* Number of pages */
	signed char page_size_bits;	/* log2 of page_size or -1 */

	unsigned char pte_size;		/* Page table entry size in bits */
	unsigned char pte_fields;	/* Number of entries per pt_entry */
	signed char pte_fields_bits;	/* log2 of pte_fields or -1 */
	pt_entry *page_table;		/* Page table */

	/* Array of dm devices to switch between */
	struct switch_dev dev_list[0];
};

static inline void switch_get_position(struct switch_ctx *pctx,
				       unsigned long page,
				       unsigned long *index,
				       unsigned *bit)

{
	if (pctx->pte_fields_bits >= 0) {
		*index = page >> pctx->pte_fields_bits;
		*bit = page & (pctx->pte_fields - 1);
	} else {
		*index = page / pctx->pte_fields;
		*bit = page % pctx->pte_fields;
	}
	*bit *= pctx->pte_size;

}

static inline unsigned switch_get_deviceidx(struct switch_ctx *pctx,
					    sector_t sector)
{
	unsigned long index;
	unsigned bit, idev;
	sector_t p;

	p = sector;
	if (pctx->page_size_bits >= 0)
		p >>= pctx->page_size_bits;
	else
		sector_div(p, pctx->page_size);

	switch_get_position(pctx, p, &index, &bit);
	idev = (ACCESS_ONCE(pctx->page_table[index]) >> bit) &
	       ((1 << pctx->pte_size) - 1);

	/* This can only happen if the processor uses non-atomic stores. */
	if (unlikely(idev >= pctx->dev_count))
		idev = 0;

	return idev;
}

static void switch_page_table_write(struct switch_ctx *pctx, unsigned long page,
				    unsigned value)
{
	unsigned long index;
	unsigned bit;
	pt_entry pte;

	switch_get_position(pctx, page, &index, &bit);

	pte = pctx->page_table[index];
	pte &= ~((((pt_entry)1 << pctx->pte_size) - 1) << bit);
	pte |= (pt_entry)value << bit;
	pctx->page_table[index] = pte;
}

/*
 * Constructor: Called each time a dmsetup command creates a dm device.  The
 * target parameter will already have the table, type, begin and len fields
 * filled in.  Arguments are in pairs: <dev_path> <offset>.  Therefore, we get
 * multiple constructor calls, but we will need to build a list of switch_ctx
 * blocks so that the page table information gets matched to the correct
 * device.
 */
static int switch_ctr(struct dm_target *ti, unsigned argc, char **argv)
{
	unsigned a;
	int n;
	int r;
	unsigned dev_count;
	struct switch_ctx *pctx;
	sector_t dev_size;
	unsigned long e;

	if (argc < 4) {
		ti->error = "Insufficient arguments";
		r = -EINVAL;
		goto error;
	}
	if (kstrtouint(argv[0], 10, &dev_count) ||
	    !dev_count ||
	    dev_count > (KMALLOC_MAX_SIZE - sizeof(struct switch_ctx)) / sizeof(struct switch_dev)) {
		ti->error = "Invalid device count";
		r = -EINVAL;
		goto error;
	}
	if (dev_count != (argc - 2) / 2) {
		ti->error = "Invalid argument count";
		r = -EINVAL;
		goto error;
	}
	pctx = kmalloc(sizeof(struct switch_ctx) + (dev_count * sizeof(struct switch_dev)),
		       GFP_KERNEL);
	if (!pctx) {
		ti->error = "Cannot allocate redirect context";
		r = -ENOMEM;
		goto error;
	}
	pctx->dev_count = dev_count;
	if (kstrtouint(argv[1], 10, &pctx->page_size) ||
	    !pctx->page_size) {
		ti->error = "Invalid page size";
		r = -EINVAL;
		goto error_kfree;
	}

	if (!(pctx->page_size & (pctx->page_size - 1)))
		pctx->page_size_bits = __ffs(pctx->page_size);
	else
		pctx->page_size_bits = -1;

	pctx->pte_size = 1;
	while (pctx->pte_size < sizeof(pt_entry) * 8 &&
	       (pt_entry)1 << pctx->pte_size < pctx->dev_count)
		pctx->pte_size++;

	pctx->pte_fields = (sizeof(pt_entry) * 8) / pctx->pte_size;
	if (!(pctx->pte_fields & (pctx->pte_fields - 1)))
		pctx->pte_fields_bits = __ffs(pctx->pte_fields);
	else
		pctx->pte_fields_bits = -1;

	dev_size = ti->len;
	if (sector_div(dev_size, pctx->page_size))
		dev_size++;

	pctx->n_pages = dev_size;
	if (pctx->n_pages != dev_size || pctx->n_pages >= ULONG_MAX) {
		ti->error = "Too long page table";
		r = -EINVAL;
		goto error_kfree;
	}

	if (sector_div(dev_size, pctx->pte_fields))
		dev_size++;

	if (dev_size > ULONG_MAX / sizeof(pt_entry)) {
		ti->error = "Too long page table";
		r = -EINVAL;
		goto error_kfree;
	}

	r = dm_set_target_max_io_len(ti, pctx->page_size);
	if (r)
		goto error_kfree;

	pctx->page_table = vmalloc(dev_size * sizeof(pt_entry));
	if (!pctx->page_table) {
		ti->error = "Cannot allocate page table";
		r = -ENOMEM;
		goto error_kfree;
	}

	a = 0;
	for (e = 0; e < pctx->n_pages; e++) {
		switch_page_table_write(pctx, e, a);
		a++;
		if (a >= pctx->dev_count)
			a = 0;
	}

	/*
	 * Check each device beneath the target to ensure that the limits are
	 * consistent.
	 */
	for (n = 0, a = 2; n < pctx->dev_count; n++, a += 2) {
		struct dm_dev *dm;
		sector_t dev_size;
		unsigned long long start;

		if (kstrtoull(argv[a + 1], 10, &start) ||
		    start != (sector_t)start) {
			ti->error = "Invalid device starting offset";
			r = -EINVAL;
			n--;
			goto error_release_n;
		}
		r = dm_get_device
		    (ti, argv[a], dm_table_get_mode(ti->table), &dm);
		if (r) {
			ti->error = "Device lookup failed";
			n--;
			goto error_release_n;
		}
		pctx->dev_list[n].dmdev = dm;
		pctx->dev_list[n].start = start;

		dev_size = i_size_read(dm->bdev->bd_inode) >> SECTOR_SHIFT;

		if (ti->len > start + dev_size) {
			ti->error = "Device is too small";
			r = -EINVAL;
			goto error_release_n;
		}
	}

	/* For UNMAP, sending the request down any path is sufficient */
	ti->num_discard_requests = 1;
	/* For FLUSH, we should flush each path */
	ti->num_flush_requests = pctx->dev_count;

	ti->private = pctx;

	return 0;

error_release_n:		/* De-reference all devices  */
	for (; n >= 0; n--)
		dm_put_device(ti, pctx->dev_list[n].dmdev);

	vfree(pctx->page_table);
error_kfree:
	kfree(pctx);

error:
	return r;
}

/*
 * Destructor: Don't free the dm_target, just the ti->private data (if any).
 */
static void switch_dtr(struct dm_target *ti)
{
	int n;
	struct switch_ctx *pctx = ti->private;

	for (n = 0; n < pctx->dev_count; n++)
		dm_put_device(ti, pctx->dev_list[n].dmdev);

	vfree(pctx->page_table);
	kfree(pctx);
}

static int switch_map(struct dm_target *ti, struct bio *bio,
		      union map_info *map_context)
{
	struct switch_ctx *pctx = ti->private;

	sector_t offset = bio->bi_sector - ti->begin;
	unsigned idev;

	if (bio->bi_rw & REQ_FLUSH) {
		int request_nr = map_context->target_request_nr;
		BUG_ON(request_nr >= pctx->dev_count);
		bio->bi_bdev = pctx->dev_list[request_nr].dmdev->bdev;
		return DM_MAPIO_REMAPPED;
	}

	idev = switch_get_deviceidx(pctx, offset);

	bio->bi_bdev = pctx->dev_list[idev].dmdev->bdev;
	bio->bi_sector = pctx->dev_list[idev].start + offset;

	return DM_MAPIO_REMAPPED;
}

/*
 * We need to parse hex numbers as fast as possible.
 * Message is used to load the whole table.
 *
 * This table-based hex parser improves performance.
 * It improves a time to load 1000000 entries compared to the condition-based
 * parser.
 *		table-based parser	condition-based parser
 * PA-RISC	0.29s			0.31s
 * Opteron	0.0495s			0.0498s
 */

static const unsigned char hex_table[256] = {
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
0,1,2,3,4,5,6,7,8,9,255,255,255,255,255,255,
255,10,11,12,13,14,15,255,255,255,255,255,255,255,255,255,
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
255,10,11,12,13,14,15,255,255,255,255,255,255,255,255,255,
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255
};

static inline void parse_hex(const char *string, sector_t *result, const char **end)
{
	unsigned char d;
	sector_t r = 0;
#if 1
	while ((d = hex_table[(unsigned char)*string]) < 16) {
		r = (r << 4) | d;
		string++;
	}
#else
	while (1) {
		d = *string;
		if (d >= '0' && d <= '9')
			d -= '0';
		else if (d >= 'A' && d <= 'F')
			d -= 'A' - 10;
		else if (d >= 'a' && d <= 'f')
			d -= 'a' - 10;
		else
			break;
		r = (r << 4) | d;
		string++;
	}
#endif
	*end = string;
	*result = r;
}

static int switch_message(struct dm_target *ti, unsigned argc, char **argv)
{
	static DEFINE_MUTEX(message_mutex);

	struct switch_ctx *pctx = ti->private;
	int r;

	mutex_lock(&message_mutex);

	if (!argc) {
		goto invalid_message;
	} else if (!strcasecmp(argv[0], "set-table")) {
		unsigned i;
		sector_t table_index = 0;
		for (i = 1; i < argc; i++) {
			sector_t device;
			const char *string = argv[i];
			if (*string == ':')
				table_index++;
			else {
				parse_hex(string, &table_index, &string);
				if (unlikely(*string != ':')) {
invalid_table:
					DMWARN("invalid set-table argument");
					r = -EINVAL;
					goto ret;
				}
			}
			string++;
			if (unlikely(!*string))
				goto invalid_table;
			parse_hex(string, &device, &string);
			if (unlikely(*string))
				goto invalid_table;
			if (unlikely(table_index >= pctx->n_pages)) {
				DMWARN("invalid set-table page");
				r = -EINVAL;
				goto ret;
			}
			if (unlikely(device >= pctx->dev_count)) {
				DMWARN("invalid set-table device");
				r = -EINVAL;
				goto ret;
			}
			switch_page_table_write(pctx, table_index, device);
		}
		r = 0;
	} else {
invalid_message:
		DMWARN("unrecognised message received.");
		r = -EINVAL;
	}
ret:
	mutex_unlock(&message_mutex);
	return r;
}

static int switch_status(struct dm_target *ti, status_type_t type,
			 unsigned status_flags, char *result, unsigned maxlen)
{
	struct switch_ctx *pctx = ti->private;
	unsigned sz = 0;
	int n;

	result[0] = '\0';
	switch (type) {
	case STATUSTYPE_INFO:
		result[0] = 0;
		break;

	case STATUSTYPE_TABLE:
		DMEMIT("%u %u", pctx->dev_count, pctx->page_size);
		for (n = 0; n < pctx->dev_count; n++) {
			DMEMIT(" %s %llu", pctx->dev_list[n].dmdev->name,
			       (unsigned long long)pctx->dev_list[n].start);
		}
		break;

	default:
		return 0;
	}
	return 0;
}

/*
 * Switch ioctl:
 *
 * Passthrough all ioctls to the path for sector 0
 */
static int switch_ioctl(struct dm_target *ti, unsigned cmd,
			unsigned long arg)
{
	struct switch_ctx *pctx = ti->private;
	struct block_device *bdev;
	fmode_t mode;
	unsigned idev;

	idev = switch_get_deviceidx(pctx, 0);

	bdev = pctx->dev_list[idev].dmdev->bdev;
	mode = pctx->dev_list[idev].dmdev->mode;

	return __blkdev_driver_ioctl(bdev, mode, cmd, arg);
}

static int switch_iterate_devices(struct dm_target *ti,
				  iterate_devices_callout_fn fn, void *data)
{
	struct switch_ctx *pctx = (struct switch_ctx *)ti->private;
	int n, ret = 0;

	for (n = 0; n < pctx->dev_count; n++) {
		ret = fn(ti, pctx->dev_list[n].dmdev, ti->begin, ti->len, data);
		if (ret)
			goto out;
	}

out:
	return ret;
}

static struct target_type switch_target = {
	.name = "switch",
	.version = {1, 0, 0},
	.module = THIS_MODULE,
	.ctr = switch_ctr,
	.dtr = switch_dtr,
	.map = switch_map,
	.message = switch_message,
	.status = switch_status,
	.ioctl = switch_ioctl,
	.iterate_devices = switch_iterate_devices,
};

int __init dm_switch_init(void)
{
	int r;

	r = dm_register_target(&switch_target);
	if (r) {
		DMERR("dm_register_target() failed %d", r);
		return r;
	}

	return 0;
}

void dm_switch_exit(void)
{
	dm_unregister_target(&switch_target);
}

module_init(dm_switch_init);
module_exit(dm_switch_exit);

MODULE_DESCRIPTION(DM_NAME " fixed-size address-region-mapping throughput-oriented path selector");
MODULE_AUTHOR("Kevin D. O'Kelley <Kevin_OKelley at dell.com>");
MODULE_AUTHOR("Jim Ramsay <Jim_Ramsay at dell.com>");
MODULE_AUTHOR("Mikulas Patocka <mpatocka at redhat.com>");
MODULE_LICENSE("GPL");
-------------------------

-- 
Jim Ramsay




More information about the dm-devel mailing list