[dm-devel] [PATCH] reworked dm-switch target
Jim Ramsay
jim_ramsay at dell.com
Tue Sep 11 13:14:46 UTC 2012
On Sat, Sep 08, 2012 at 12:35:22PM -0400, Mikulas Patocka wrote:
> Another thing - please resend your code with "Signed-off-by". Read the
> meaning of "Signed-off-by" in Documentation/SubmittingPatches, agree to
> the terms and append "Signed-off-by: Jim Ramsay <jim_ramsay at dell.com>" to
> the code. It is a legal requirement, so that you certify that the code is
> under the open source license and that you have the right to distribute
> the code.
I am attaching below an updated version of your 'dm-switch.c' file,
based on your latest post in
http://www.redhat.com/archives/dm-devel/2012-August/msg00224.html that
makes the following changes:
1. Support for FLUSH and DISCARD operations by implementing
target_type.iterate_devices and handling (bio->bi_rw & REQ_FLUSH) in
switch_map. Sends DISCARD to one path, FLUSH to each path.
2. Send IOCTLs to the device that owns sector 0, instead of always using
pctx->dev_list[0]
3. Copyright notice update in header, plus adding myself to
MODULE_AUTHOR
Signed-off-by: Jim Ramsay <jim_ramsay at dell.com>
------ dm-switch.c ------
/*
* Copyright (c) 2010-2012 by Dell Inc. All rights reserved.
*
* This file is released under the GPL.
*
* Description:
*
* file: dm-switch.c
* authors: Kevin_OKelley at dell.com
* Jim_Ramsay at dell.com
* Narendran_Ganapathy at dell.com
* mpatocka at redhat.com
*
* This file implements a "switch" target which efficiently implements a
* mapping of IOs to underlying block devices in scenarios where there are:
* (1) a large number of address regions
* (2) a fixed size equal across all address regions
* (3) no pattern that allows for a compact description with something like
* the dm-stripe target.
*/
#include <linux/module.h>
#include <linux/init.h>
#include <linux/device-mapper.h>
#include <linux/vmalloc.h>
#define DM_MSG_PREFIX "switch"
/*
 * One underlying path of a switch device.
 *
 * A reference on dmdev is taken in the constructor (dm_get_device) and
 * dropped in the destructor / constructor error path (dm_put_device).
 */
struct switch_dev {
	struct dm_dev *dmdev;	/* underlying device, reference held */
	sector_t start;		/* start offset of the mapped region on dmdev */
};
/*
 * One word of the packed page table.  Several fixed-width device
 * indexes (pte_size bits each) are packed into every pt_entry.
 */
typedef unsigned long pt_entry;

/*
 * Per-target context block: one is created for each dm device in the
 * constructor.  Holds the packed page table and a trailing array of
 * the underlying devices we hold references on.
 */
struct switch_ctx {
	unsigned dev_count;		/* Number of devices */
	unsigned page_size;		/* Page size in 512B sectors */
	unsigned long n_pages;		/* Number of pages */
	signed char page_size_bits;	/* log2 of page_size or -1 */

	unsigned char pte_size;		/* Page table entry size in bits */
	unsigned char pte_fields;	/* Number of entries per pt_entry */
	signed char pte_fields_bits;	/* log2 of pte_fields or -1 */
	pt_entry *page_table;		/* Page table */

	/*
	 * Array of dm devices to switch between.  C99 flexible array
	 * member (the original [0] form is a GNU zero-length-array
	 * extension); sized at allocation time in switch_ctr.
	 */
	struct switch_dev dev_list[];
};
/*
 * Locate the page-table slot for @page: *index receives the word index
 * into page_table, *bit the bit offset of the entry inside that word.
 */
static inline void switch_get_position(struct switch_ctx *pctx,
				       unsigned long page,
				       unsigned long *index,
				       unsigned *bit)
{
	unsigned long word;
	unsigned slot;

	if (pctx->pte_fields_bits < 0) {
		/* Entries per word is not a power of two: real division. */
		word = page / pctx->pte_fields;
		slot = page % pctx->pte_fields;
	} else {
		/* Fast path: shift and mask. */
		word = page >> pctx->pte_fields_bits;
		slot = page & (pctx->pte_fields - 1);
	}

	*index = word;
	*bit = slot * pctx->pte_size;
}
/*
 * Map a sector of the switch device to the index of the underlying
 * device that owns it, by looking the sector's page up in the table.
 */
static inline unsigned switch_get_deviceidx(struct switch_ctx *pctx,
					    sector_t sector)
{
	sector_t page = sector;
	unsigned long index;
	unsigned bit;
	unsigned value;

	/* Convert the sector number into a page number. */
	if (pctx->page_size_bits < 0)
		sector_div(page, pctx->page_size);
	else
		page >>= pctx->page_size_bits;

	switch_get_position(pctx, page, &index, &bit);

	/*
	 * ACCESS_ONCE: the table may be concurrently rewritten by a
	 * "set-table" message; read the word exactly once.
	 */
	value = (ACCESS_ONCE(pctx->page_table[index]) >> bit) &
		((1 << pctx->pte_size) - 1);

	/* This can only happen if the processor uses non-atomic stores. */
	if (unlikely(value >= pctx->dev_count))
		value = 0;

	return value;
}
static void switch_page_table_write(struct switch_ctx *pctx, unsigned long page,
unsigned value)
{
unsigned long index;
unsigned bit;
pt_entry pte;
switch_get_position(pctx, page, &index, &bit);
pte = pctx->page_table[index];
pte &= ~((((pt_entry)1 << pctx->pte_size) - 1) << bit);
pte |= (pt_entry)value << bit;
pctx->page_table[index] = pte;
}
/*
 * Constructor: called each time a dmsetup command creates a dm device.
 * The target parameter will already have the table, type, begin and len
 * fields filled in.
 *
 * Argument layout:
 *   <dev_count> <page_size> [<dev_path> <offset>]{dev_count}
 *
 * Returns 0 on success or a negative errno; ti->error is set to a
 * human-readable reason on failure.
 */
static int switch_ctr(struct dm_target *ti, unsigned argc, char **argv)
{
	unsigned a;
	int n;
	int r;
	unsigned dev_count;
	struct switch_ctx *pctx;
	sector_t dev_size;
	unsigned long e;

	/* Minimum: dev_count, page_size and one <dev_path, offset> pair. */
	if (argc < 4) {
		ti->error = "Insufficient arguments";
		r = -EINVAL;
		goto error;
	}

	/* Bound dev_count so the context block fits in a single kmalloc. */
	if (kstrtouint(argv[0], 10, &dev_count) ||
	    !dev_count ||
	    dev_count > (KMALLOC_MAX_SIZE - sizeof(struct switch_ctx)) / sizeof(struct switch_dev)) {
		ti->error = "Invalid device count";
		r = -EINVAL;
		goto error;
	}

	if (dev_count != (argc - 2) / 2) {
		ti->error = "Invalid argument count";
		r = -EINVAL;
		goto error;
	}

	/* Context plus the trailing dev_list[] array in one allocation. */
	pctx = kmalloc(sizeof(struct switch_ctx) + (dev_count * sizeof(struct switch_dev)),
		       GFP_KERNEL);
	if (!pctx) {
		ti->error = "Cannot allocate redirect context";
		r = -ENOMEM;
		goto error;
	}
	pctx->dev_count = dev_count;

	if (kstrtouint(argv[1], 10, &pctx->page_size) ||
	    !pctx->page_size) {
		ti->error = "Invalid page size";
		r = -EINVAL;
		goto error_kfree;
	}

	/* Precompute log2(page_size) when it is a power of two. */
	if (!(pctx->page_size & (pctx->page_size - 1)))
		pctx->page_size_bits = __ffs(pctx->page_size);
	else
		pctx->page_size_bits = -1;

	/* Smallest entry width (bits) that can represent any device index. */
	pctx->pte_size = 1;
	while (pctx->pte_size < sizeof(pt_entry) * 8 &&
	       (pt_entry)1 << pctx->pte_size < pctx->dev_count)
		pctx->pte_size++;

	/* How many entries pack into one pt_entry word. */
	pctx->pte_fields = (sizeof(pt_entry) * 8) / pctx->pte_size;
	if (!(pctx->pte_fields & (pctx->pte_fields - 1)))
		pctx->pte_fields_bits = __ffs(pctx->pte_fields);
	else
		pctx->pte_fields_bits = -1;

	/* Number of pages = ceil(ti->len / page_size). */
	dev_size = ti->len;
	if (sector_div(dev_size, pctx->page_size))
		dev_size++;

	pctx->n_pages = dev_size;
	/* Detect truncation when sector_t is wider than unsigned long. */
	if (pctx->n_pages != dev_size || pctx->n_pages >= ULONG_MAX) {
		ti->error = "Too long page table";
		r = -EINVAL;
		goto error_kfree;
	}

	/* Number of pt_entry words = ceil(n_pages / pte_fields). */
	if (sector_div(dev_size, pctx->pte_fields))
		dev_size++;

	/* Guard the vmalloc size computation below against overflow. */
	if (dev_size > ULONG_MAX / sizeof(pt_entry)) {
		ti->error = "Too long page table";
		r = -EINVAL;
		goto error_kfree;
	}

	r = dm_set_target_max_io_len(ti, pctx->page_size);
	if (r)
		goto error_kfree;

	pctx->page_table = vmalloc(dev_size * sizeof(pt_entry));
	if (!pctx->page_table) {
		ti->error = "Cannot allocate page table";
		r = -ENOMEM;
		goto error_kfree;
	}

	/* Initial mapping: pages assigned round-robin across the devices. */
	a = 0;
	for (e = 0; e < pctx->n_pages; e++) {
		switch_page_table_write(pctx, e, a);
		a++;
		if (a >= pctx->dev_count)
			a = 0;
	}

	/*
	 * Check each device beneath the target to ensure that the limits are
	 * consistent.
	 */
	for (n = 0, a = 2; n < pctx->dev_count; n++, a += 2) {
		struct dm_dev *dm;
		sector_t dev_size;
		unsigned long long start;

		if (kstrtoull(argv[a + 1], 10, &start) ||
		    start != (sector_t)start) {
			ti->error = "Invalid device starting offset";
			r = -EINVAL;
			n--;	/* device n was never acquired */
			goto error_release_n;
		}

		r = dm_get_device
			(ti, argv[a], dm_table_get_mode(ti->table), &dm);
		if (r) {
			ti->error = "Device lookup failed";
			n--;	/* device n was never acquired */
			goto error_release_n;
		}

		pctx->dev_list[n].dmdev = dm;
		pctx->dev_list[n].start = start;

		/* Device size in 512B sectors. */
		dev_size = i_size_read(dm->bdev->bd_inode) >> SECTOR_SHIFT;

		/* Device n IS acquired here, so no n-- before the goto. */
		if (ti->len > start + dev_size) {
			ti->error = "Device is too small";
			r = -EINVAL;
			goto error_release_n;
		}
	}

	/* For UNMAP, sending the request down any path is sufficient */
	ti->num_discard_requests = 1;
	/* For FLUSH, we should flush each path */
	ti->num_flush_requests = pctx->dev_count;

	ti->private = pctx;
	return 0;

error_release_n:	/* De-reference all devices */
	for (; n >= 0; n--)
		dm_put_device(ti, pctx->dev_list[n].dmdev);
	vfree(pctx->page_table);

error_kfree:
	kfree(pctx);

error:
	return r;
}
/*
 * Destructor: release the device references and free the private
 * context.  The dm_target itself is freed by the core.
 */
static void switch_dtr(struct dm_target *ti)
{
	struct switch_ctx *pctx = ti->private;
	unsigned i;

	for (i = 0; i < pctx->dev_count; i++)
		dm_put_device(ti, pctx->dev_list[i].dmdev);

	vfree(pctx->page_table);
	kfree(pctx);
}
/*
 * Map an incoming bio to one of the underlying devices.
 *
 * FLUSH bios are cloned once per device (num_flush_requests set in the
 * constructor); clone N is routed to device N.  Everything else,
 * including DISCARD, is routed by page-table lookup.
 */
static int switch_map(struct dm_target *ti, struct bio *bio,
		      union map_info *map_context)
{
	struct switch_ctx *pctx = ti->private;
	sector_t offset = bio->bi_sector - ti->begin;
	unsigned path;

	if (bio->bi_rw & REQ_FLUSH) {
		int clone_nr = map_context->target_request_nr;

		BUG_ON(clone_nr >= pctx->dev_count);
		bio->bi_bdev = pctx->dev_list[clone_nr].dmdev->bdev;
		return DM_MAPIO_REMAPPED;
	}

	path = switch_get_deviceidx(pctx, offset);
	bio->bi_bdev = pctx->dev_list[path].dmdev->bdev;
	bio->bi_sector = pctx->dev_list[path].start + offset;

	return DM_MAPIO_REMAPPED;
}
/*
 * We need to parse hex numbers as fast as possible.
 * Message is used to load the whole table.
 *
 * This table-based hex parser improves performance.
 * It improves a time to load 1000000 entries compared to the condition-based
 * parser.
 * table-based parser condition-based parser
 * PA-RISC 0.29s 0.31s
 * Opteron 0.0495s 0.0498s
 */
/* Maps ASCII '0'-'9', 'A'-'F', 'a'-'f' to 0-15; every other byte to 255. */
static const unsigned char hex_table[256] = {
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
0,1,2,3,4,5,6,7,8,9,255,255,255,255,255,255,
255,10,11,12,13,14,15,255,255,255,255,255,255,255,255,255,
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
255,10,11,12,13,14,15,255,255,255,255,255,255,255,255,255,
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255
};
/*
 * Parse an unsigned hexadecimal number from @string into *result.
 *
 * Parsing stops at the first non-hex character; *end is set to point
 * at it (so the caller can check for the expected delimiter).
 * Overflow is not detected here -- callers bound the parsed value
 * against n_pages / dev_count afterwards.
 *
 * The previously #if'd-out condition-based fallback parser has been
 * removed as dead code; the benchmark comparison justifying the
 * table-based variant is recorded in the comment above hex_table.
 */
static inline void parse_hex(const char *string, sector_t *result, const char **end)
{
	unsigned char d;
	sector_t r = 0;

	while ((d = hex_table[(unsigned char)*string]) < 16) {
		r = (r << 4) | d;
		string++;
	}

	*end = string;
	*result = r;
}
/*
 * Message handler.  Supported message:
 *
 *   set-table [<index>]:<device> ...
 *
 * Each argument updates one page-table entry.  Both numbers are hex;
 * a bare ":<device>" argument means "previous index + 1", allowing a
 * whole table to be loaded with compact arguments.  Updates are
 * serialized by message_mutex; readers see them via the ACCESS_ONCE
 * lookup in switch_get_deviceidx.
 *
 * Returns 0 on success, -EINVAL on any malformed or out-of-range
 * argument (entries already written before the bad argument remain
 * written).
 */
static int switch_message(struct dm_target *ti, unsigned argc, char **argv)
{
	static DEFINE_MUTEX(message_mutex);

	struct switch_ctx *pctx = ti->private;
	int r;

	mutex_lock(&message_mutex);

	if (!argc) {
		goto invalid_message;
	} else if (!strcasecmp(argv[0], "set-table")) {
		unsigned i;
		sector_t table_index = 0;

		for (i = 1; i < argc; i++) {
			sector_t device;
			const char *string = argv[i];

			/* Leading ':' means "next consecutive index". */
			if (*string == ':')
				table_index++;
			else {
				parse_hex(string, &table_index, &string);
				if (unlikely(*string != ':')) {
invalid_table:
					DMWARN("invalid set-table argument");
					r = -EINVAL;
					goto ret;
				}
			}

			/* Skip the ':'; a device number must follow. */
			string++;
			if (unlikely(!*string))
				goto invalid_table;

			parse_hex(string, &device, &string);
			/* Trailing garbage after the device number? */
			if (unlikely(*string))
				goto invalid_table;

			if (unlikely(table_index >= pctx->n_pages)) {
				DMWARN("invalid set-table page");
				r = -EINVAL;
				goto ret;
			}
			if (unlikely(device >= pctx->dev_count)) {
				DMWARN("invalid set-table device");
				r = -EINVAL;
				goto ret;
			}

			switch_page_table_write(pctx, table_index, device);
		}
		r = 0;
	} else {
invalid_message:
		DMWARN("unrecognised message received.");
		r = -EINVAL;
	}
ret:
	mutex_unlock(&message_mutex);
	return r;
}
/*
 * Status reporting.
 *
 * STATUSTYPE_INFO: nothing to report (empty string).
 * STATUSTYPE_TABLE: echo the constructor arguments so the table can be
 * reloaded: "<dev_count> <page_size> [<dev_name> <offset>]...".
 *
 * Fixes: result[0] was cleared twice on the INFO path (once at the top
 * of the function and again inside the case); the loop index was a
 * signed int compared against the unsigned dev_count.
 */
static int switch_status(struct dm_target *ti, status_type_t type,
			 unsigned status_flags, char *result, unsigned maxlen)
{
	struct switch_ctx *pctx = ti->private;
	unsigned sz = 0;	/* used implicitly by the DMEMIT macro */
	unsigned n;

	result[0] = '\0';

	switch (type) {
	case STATUSTYPE_INFO:
		break;

	case STATUSTYPE_TABLE:
		DMEMIT("%u %u", pctx->dev_count, pctx->page_size);
		for (n = 0; n < pctx->dev_count; n++)
			DMEMIT(" %s %llu", pctx->dev_list[n].dmdev->name,
			       (unsigned long long)pctx->dev_list[n].start);
		break;

	default:
		break;
	}

	return 0;
}
/*
* Switch ioctl:
*
* Passthrough all ioctls to the path for sector 0
*/
static int switch_ioctl(struct dm_target *ti, unsigned cmd,
unsigned long arg)
{
struct switch_ctx *pctx = ti->private;
struct block_device *bdev;
fmode_t mode;
unsigned idev;
idev = switch_get_deviceidx(pctx, 0);
bdev = pctx->dev_list[idev].dmdev->bdev;
mode = pctx->dev_list[idev].dmdev->mode;
return __blkdev_driver_ioctl(bdev, mode, cmd, arg);
}
/*
 * Call @fn once for each underlying device (used by the core to merge
 * queue limits and to answer FLUSH/DISCARD support queries).
 *
 * Fixes: the start argument passed to the callout was ti->begin, which
 * is an offset into the *mapped* device and is meaningless on the path
 * device; the callout expects the region's start on the underlying
 * device, i.e. dev_list[n].start (compare dm-linear's iterate_devices).
 * Also drops the pointless "goto out" directly before the out label.
 */
static int switch_iterate_devices(struct dm_target *ti,
				  iterate_devices_callout_fn fn, void *data)
{
	struct switch_ctx *pctx = ti->private;
	unsigned n;
	int r;

	for (n = 0; n < pctx->dev_count; n++) {
		r = fn(ti, pctx->dev_list[n].dmdev,
		       pctx->dev_list[n].start, ti->len, data);
		if (r)
			return r;
	}

	return 0;
}
/* Target registration record for the "switch" target, version 1.0.0. */
static struct target_type switch_target = {
	.name = "switch",
	.version = {1, 0, 0},
	.module = THIS_MODULE,
	.ctr = switch_ctr,
	.dtr = switch_dtr,
	.map = switch_map,
	.message = switch_message,
	.status = switch_status,
	.ioctl = switch_ioctl,
	.iterate_devices = switch_iterate_devices,
};
/*
 * Module load: register the target with the device-mapper core.
 * Returns 0 on success or the dm_register_target() error code.
 */
int __init dm_switch_init(void)
{
	int r = dm_register_target(&switch_target);

	if (r)
		DMERR("dm_register_target() failed %d", r);

	return r;
}
/* Module unload: unregister the "switch" target. */
void dm_switch_exit(void)
{
	dm_unregister_target(&switch_target);
}
module_init(dm_switch_init);
module_exit(dm_switch_exit);

/* Module metadata. */
MODULE_DESCRIPTION(DM_NAME " fixed-size address-region-mapping throughput-oriented path selector");
MODULE_AUTHOR("Kevin D. O'Kelley <Kevin_OKelley at dell.com>");
MODULE_AUTHOR("Jim Ramsay <Jim_Ramsay at dell.com>");
MODULE_AUTHOR("Mikulas Patocka <mpatocka at redhat.com>");
MODULE_LICENSE("GPL");
-------------------------
--
Jim Ramsay
--
dm-devel mailing list
dm-devel at redhat.com
https://www.redhat.com/mailman/listinfo/dm-devel
--
Jim Ramsay
More information about the dm-devel
mailing list