[dm-devel] shared snapshots release 17
Mike Snitzer
snitzer at redhat.com
Mon Mar 22 20:54:34 UTC 2010
On Mon, Mar 22 2010 at 3:18pm -0400,
Mikulas Patocka <mpatocka at redhat.com> wrote:
> Hi
>
> New shared snapshots are here. It incorporates Mike's changes and it has
> reworked memory limit:
> - a minimum of 8 buffers is guaranteed to prevent thrashing with too big
> chunk sizes
> - the cache for multisnapshots is limited to 2% of memory or 25% of
> vmalloc memory (whichever is lower) [ I'm thinking about making this
> configurable in /proc ]
> - big chunk sizes (8MB or more) always allocate memory from vmalloc; there
> is no point in allocating from the general allocator
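A minimal, untested sketch of the limit arithmetic described above; the
helper names are made up for illustration, the real code is in the dm-bufio
hunks further down:

#define SKETCH_MIN_BUFFERS	8
#define SKETCH_MEMORY_RATIO	2 / 100		/* 2% of low memory */
#define SKETCH_VMALLOC_RATIO	1 / 4		/* 25% of vmalloc space */

/* Total cache limit: whichever of the two ratios gives less memory. */
static unsigned long sketch_total_limit(unsigned long lowmem_bytes,
					unsigned long vmalloc_bytes)
{
	unsigned long mem = lowmem_bytes * SKETCH_MEMORY_RATIO;
	unsigned long vm = vmalloc_bytes * SKETCH_VMALLOC_RATIO;

	return mem < vm ? mem : vm;
}

/* Per-client buffer count, with the 8-buffer floor against thrashing. */
static unsigned long sketch_client_buffers(unsigned long total_limit,
					   int clients,
					   unsigned block_size_bits)
{
	unsigned long buffers = (total_limit / (clients ? clients : 1))
				>> block_size_bits;

	return buffers < SKETCH_MIN_BUFFERS ? SKETCH_MIN_BUFFERS : buffers;
}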
For the benefit of others, here are your r17 changes relative to r16. I
have some early questions/comments:
- What is going on with EXPORT_SYMBOL at the top of drivers/md/dm-bufio.c?
Do you intend to have a standalone dm-bufio module? I think it makes
sense to go one way or the other. As is you're in limbo; the names of
the _init and _exit functions don't _really_ make sense given that it
isn't a standalone module (e.g. dm_bufio_module_init). It makes the
calling code in dm-multisnap-mikulas.c somewhat awkward (calling
another _{module_init,exit}), especially when you consider that
dm_bufio_module_exit() doesn't do anything; yet you've reworked
dm_multisnapshot_mikulas_module_init() to call it. (A rough sketch of
the standalone variant follows this list.)
- Please don't introduce long lines as you make new changes. Below, the
following functions have unnecessarily long lines:
get_memory_limit, dm_bufio_alloc_buffer_data, dm_bufio_module_init
- The blank line between comment blocks and functions should be
removed; see: get_memory_limit, adjust_client_count, dm_bufio_module_exit
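A rough, hypothetical sketch of the standalone-module route (none of this
is in the patch; the names only show the shape such a module could take):

#include <linux/module.h>

static int __init dm_bufio_init(void)
{
	/* compute the global memory limit once, at module load */
	return 0;
}

static void __exit dm_bufio_exit(void)
{
}

module_init(dm_bufio_init);
module_exit(dm_bufio_exit);

MODULE_DESCRIPTION("device-mapper buffered I/O cache");
MODULE_LICENSE("GPL");

With something like that, dm-multisnap-mikulas.c would pull in the exported
dm_bufio_* symbols through normal module dependencies instead of calling
another module's _{init,exit} by hand.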
diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c
index 44dbb0e..1958b97 100644
--- a/drivers/md/dm-bufio.c
+++ b/drivers/md/dm-bufio.c
@@ -13,6 +13,10 @@
#include "dm-bufio.h"
+/* This will be enabled if this becomes a module or a part of another module */
+#undef EXPORT_SYMBOL
+#define EXPORT_SYMBOL(x)
+
/*
* dm_bufio_client_create --- create a buffered IO cache on a given device
* dm_bufio_client_destroy --- release a buffered IO cache
@@ -51,16 +55,17 @@
/*
* Memory management policy:
- * When we're above threshold, start asynchronous writing of dirty buffers
- * and memory reclaiming --- but still allow new allocations.
- * When we're above limit, don't allocate any more space and synchronously
- * wait until existing buffers are freed.
- *
- * These default parameters can be overriden with parameters to
- * dm_bufio_client_create.
+ * Limit the number of buffers to DM_BUFIO_MEMORY_RATIO of main memory or
+ * DM_BUFIO_VMALLOC_RATIO of vmalloc memory (whichever is lower).
+ * Always allocate at least DM_BUFIO_MIN_BUFFERS buffers.
+ * When there are DM_BUFIO_WRITEBACK_RATIO dirty buffers, start background
+ * writeback.
*/
-#define DM_BUFIO_THRESHOLD_MEMORY (8 * 1048576)
-#define DM_BUFIO_LIMIT_MEMORY (9 * 1048576)
+
+#define DM_BUFIO_MIN_BUFFERS 8
+#define DM_BUFIO_MEMORY_RATIO 2 / 100
+#define DM_BUFIO_VMALLOC_RATIO 1 / 4
+#define DM_BUFIO_WRITEBACK_RATIO 3 / 4
/*
* The number of bvec entries that are embedded directly in the buffer.
@@ -78,7 +83,8 @@
* Don't try to kmalloc blocks larger than this.
* For explanation, see dm_bufio_alloc_buffer_data below.
*/
-#define DM_BUFIO_BLOCK_SIZE_KMALLOC_LIMIT PAGE_SIZE
+#define DM_BUFIO_BLOCK_SIZE_KMALLOC_LIMIT (PAGE_SIZE >> 1)
+#define DM_BUFIO_BLOCK_SIZE_GFP_LIMIT (PAGE_SIZE << (MAX_ORDER - 1))
/*
* Buffer state bits.
@@ -110,8 +116,6 @@ struct dm_bufio_client {
unsigned char pages_per_block_bits;
unsigned long n_buffers;
- unsigned threshold_buffers;
- unsigned limit_buffers;
struct dm_io_client *dm_io;
@@ -146,6 +150,40 @@ struct dm_buffer {
struct bio_vec bio_vec[DM_BUFIO_INLINE_VECS];
};
+static unsigned long dm_bufio_avail_mem;
+static unsigned long dm_bufio_avail_mem_per_client;
+static int dm_bufio_client_count;
+static DEFINE_MUTEX(dm_bufio_clients_lock);
+
+/*
+ * Change the number of clients and recalculate per-client limit.
+ */
+
+static void adjust_client_count(int diff)
+{
+ mutex_lock(&dm_bufio_clients_lock);
+ dm_bufio_client_count += diff;
+ BUG_ON(dm_bufio_client_count < 0);
+ dm_bufio_avail_mem_per_client = dm_bufio_avail_mem /
+ (dm_bufio_client_count ? dm_bufio_client_count : 1);
+ mutex_unlock(&dm_bufio_clients_lock);
+}
+
+/*
+ * Get writeback threshold and buffer limit for a given client.
+ */
+
+static void get_memory_limit(struct dm_bufio_client *c,
+ unsigned long *threshold_buffers,
+ unsigned long *limit_buffers)
+{
+ unsigned long buffers = dm_bufio_avail_mem_per_client >> (c->sectors_per_block_bits + SECTOR_SHIFT);
+ if (unlikely(buffers < DM_BUFIO_MIN_BUFFERS))
+ buffers = DM_BUFIO_MIN_BUFFERS;
+ *limit_buffers = buffers;
+ *threshold_buffers = buffers * DM_BUFIO_WRITEBACK_RATIO;
+}
+
/*
* Allocating buffer data.
*
@@ -159,7 +197,8 @@ struct dm_buffer {
* as low as 128M) --- so using it for caching is not appropriate.
* If the allocation may fail we use __get_free_pages. Memory fragmentation
* won't have fatal effect here, it just causes flushes of some other
- * buffers and more I/O will be performed.
+ * buffers and more I/O will be performed. Don't use __get_free_pages if
+ * it always fails (i.e. order >= MAX_ORDER).
* If the allocation shouldn't fail we use __vmalloc. This is only for
* the initial reserve allocation, so there's no risk of wasting
* all vmalloc space.
@@ -170,7 +209,7 @@ static void *dm_bufio_alloc_buffer_data(struct dm_bufio_client *c,
if (c->block_size <= DM_BUFIO_BLOCK_SIZE_KMALLOC_LIMIT) {
*data_mode = DATA_MODE_KMALLOC;
return kmalloc(c->block_size, gfp_mask);
- } else if (gfp_mask & __GFP_NORETRY) {
+ } else if (c->block_size <= DM_BUFIO_BLOCK_SIZE_GFP_LIMIT && gfp_mask & __GFP_NORETRY) {
*data_mode = DATA_MODE_GET_FREE_PAGES;
return (void *)__get_free_pages(gfp_mask,
c->pages_per_block_bits);
@@ -424,9 +463,11 @@ static void free_buffer_wake(struct dm_buffer *b)
*/
static void check_watermark(struct dm_bufio_client *c)
{
- while (c->n_buffers > c->threshold_buffers) {
+ unsigned long threshold_buffers, limit_buffers;
+ get_memory_limit(c, &threshold_buffers, &limit_buffers);
+ while (c->n_buffers > threshold_buffers) {
struct dm_buffer *b;
- b = get_unclaimed_buffer(c, c->n_buffers > c->limit_buffers);
+ b = get_unclaimed_buffer(c, c->n_buffers > limit_buffers);
if (!b)
return;
free_buffer_wake(b);
@@ -558,7 +599,7 @@ static void *dm_bufio_new_read(struct dm_bufio_client *c, sector_t block,
retry_search:
b = dm_bufio_find(c, block);
if (b) {
- if (new_b)
+ if (unlikely(new_b != NULL))
free_buffer_wake(new_b);
b->hold_count++;
relink_lru(b, test_bit(B_DIRTY, &b->state) ||
@@ -568,7 +609,7 @@ unlock_wait_ret:
wait_ret:
wait_on_bit(&b->state, B_READING,
do_io_schedule, TASK_UNINTERRUPTIBLE);
- if (b->read_error) {
+ if (unlikely(b->read_error != 0)) {
int error = b->read_error;
dm_bufio_release(b);
return ERR_PTR(error);
@@ -898,8 +939,7 @@ EXPORT_SYMBOL(dm_bufio_drop_buffers);
/* Create the buffering interface */
struct dm_bufio_client *
-dm_bufio_client_create(struct block_device *bdev, unsigned blocksize,
- unsigned flags, __u64 cache_threshold, __u64 cache_limit)
+dm_bufio_client_create(struct block_device *bdev, unsigned blocksize)
{
int r;
struct dm_bufio_client *c;
@@ -925,22 +965,6 @@ dm_bufio_client_create(struct block_device *bdev, unsigned blocksize,
mutex_init(&c->lock);
c->n_buffers = 0;
- if (!cache_limit)
- cache_limit = DM_BUFIO_LIMIT_MEMORY;
- c->limit_buffers = cache_limit >>
- (c->sectors_per_block_bits + SECTOR_SHIFT);
- if (!c->limit_buffers)
- c->limit_buffers = 1;
-
- if (!cache_threshold)
- cache_threshold = DM_BUFIO_THRESHOLD_MEMORY;
- if (cache_threshold > cache_limit)
- cache_threshold = cache_limit;
- c->threshold_buffers = cache_threshold >>
- (c->sectors_per_block_bits + SECTOR_SHIFT);
- if (!c->threshold_buffers)
- c->threshold_buffers = 1;
-
init_waitqueue_head(&c->free_buffer_wait);
c->async_write_error = 0;
@@ -957,6 +981,8 @@ dm_bufio_client_create(struct block_device *bdev, unsigned blocksize,
goto bad_buffer;
}
+ adjust_client_count(1);
+
return c;
bad_buffer:
@@ -983,5 +1009,38 @@ void dm_bufio_client_destroy(struct dm_bufio_client *c)
BUG_ON(c->n_buffers != 0);
dm_io_client_destroy(c->dm_io);
kfree(c);
+ adjust_client_count(-1);
}
EXPORT_SYMBOL(dm_bufio_client_destroy);
+
+/*
+ * This is called only once for the whole dm_bufio module.
+ * It initializes memory limit.
+ */
+void __init dm_bufio_module_init(void)
+{
+ __u64 mem = (__u64)((totalram_pages - totalhigh_pages) * DM_BUFIO_MEMORY_RATIO) << PAGE_SHIFT;
+ if (mem > ULONG_MAX)
+ mem = ULONG_MAX;
+#ifdef CONFIG_MMU
+ /*
+ * Get the size of vmalloc space,
+ * the same way as VMALLOC_TOTAL in fs/proc/internal.h
+ */
+ if (mem > (VMALLOC_END - VMALLOC_START) * DM_BUFIO_VMALLOC_RATIO)
+ mem = (VMALLOC_END - VMALLOC_START) * DM_BUFIO_VMALLOC_RATIO;
+#endif
+ dm_bufio_avail_mem = mem;
+ dm_bufio_avail_mem_per_client = mem;
+ dm_bufio_client_count = 0;
+}
+EXPORT_SYMBOL(dm_bufio_module_init);
+
+/*
+ * This is called once when unloading the dm_bufio module.
+ */
+
+void dm_bufio_module_exit(void)
+{
+}
+EXPORT_SYMBOL(dm_bufio_module_exit);
diff --git a/drivers/md/dm-bufio.h b/drivers/md/dm-bufio.h
index 3261ea2..f258d4f 100644
--- a/drivers/md/dm-bufio.h
+++ b/drivers/md/dm-bufio.h
@@ -26,10 +26,11 @@ int dm_bufio_issue_flush(struct dm_bufio_client *c);
void dm_bufio_release_move(struct dm_buffer *b, sector_t new_block);
struct dm_bufio_client *
-dm_bufio_client_create(struct block_device *bdev, unsigned blocksize,
- unsigned flags, __u64 cache_threshold,
- __u64 cache_limit);
+dm_bufio_client_create(struct block_device *bdev, unsigned blocksize);
void dm_bufio_client_destroy(struct dm_bufio_client *c);
void dm_bufio_drop_buffers(struct dm_bufio_client *c);
+void dm_bufio_module_init(void);
+void dm_bufio_module_exit(void);
+
#endif
diff --git a/drivers/md/dm-multisnap-mikulas.c b/drivers/md/dm-multisnap-mikulas.c
index 27cc050..e16c0d6 100644
--- a/drivers/md/dm-multisnap-mikulas.c
+++ b/drivers/md/dm-multisnap-mikulas.c
@@ -608,8 +608,7 @@ static int dm_multisnap_mikulas_init(struct dm_multisnap *dm,
}
s->bufio = dm_bufio_client_create(dm_multisnap_snapshot_bdev(s->dm),
- s->chunk_size, 0, s->cache_threshold,
- s->cache_limit);
+ s->chunk_size);
if (IS_ERR(s->bufio)) {
*error = "Can't create bufio client";
r = PTR_ERR(s->bufio);
@@ -751,13 +750,23 @@ struct dm_multisnap_exception_store dm_multisnap_mikulas_store = {
static int __init dm_multisnapshot_mikulas_module_init(void)
{
+ int r;
BUG_ON(sizeof(struct multisnap_commit_block) != 512);
- return dm_multisnap_register_exception_store(&dm_multisnap_mikulas_store);
+ dm_bufio_module_init();
+ r = dm_multisnap_register_exception_store(&dm_multisnap_mikulas_store);
+ if (r)
+ goto cant_register;
+ return 0;
+
+cant_register:
+ dm_bufio_module_exit();
+ return r;
}
static void __exit dm_multisnapshot_mikulas_module_exit(void)
{
dm_multisnap_unregister_exception_store(&dm_multisnap_mikulas_store);
+ dm_bufio_module_exit();
}
module_init(dm_multisnapshot_mikulas_module_init);
diff --git a/drivers/md/dm-multisnap.c b/drivers/md/dm-multisnap.c
index 1a1a500..5ba1af8 100644
--- a/drivers/md/dm-multisnap.c
+++ b/drivers/md/dm-multisnap.c
@@ -645,8 +645,15 @@ dispatch_write:
return;
}
+ /*
+ * Jump to the middle of the cycle.
+ * We already asked for the first remap, so we skip it in the first
+ * iteration. Changing the cycle to start with add_next_remap would
+ * make the code less readable because it wouldn't follow the natural
+ * flow of operations, so we use this goto instead.
+ */
i = 0;
- goto midcycle;
+ goto skip_query_next_remap;
for (; i < DM_MULTISNAP_MAX_CHUNKS_TO_REMAP; i++) {
r = s->store->query_next_remap(s->p, chunk);
if (unlikely(r < 0))
@@ -654,7 +661,7 @@ dispatch_write:
if (likely(!r))
break;
-midcycle:
+skip_query_next_remap:
s->store->add_next_remap(s->p, &pe->desc[i], &new_chunk);
if (unlikely(dm_multisnap_has_error(s)))
goto free_err_endio;
@@ -1461,6 +1468,44 @@ poll_for_ios:
mutex_unlock(&all_multisnapshots_lock);
}
+static int multisnap_iterate_devices(struct dm_target *ti, struct dm_multisnap *s,
+ iterate_devices_callout_fn fn, void *data)
+{
+ int r;
+
+ r = fn(ti, s->origin, 0, s->origin_sectors, data);
+
+ if (!r)
+ r = fn(ti, s->snapshot, 0, s->origin_sectors, data);
+
+ return r;
+}
+
+static int multisnap_origin_iterate_devices(struct dm_target *ti,
+ iterate_devices_callout_fn fn, void *data)
+{
+ struct dm_multisnap *s = ti->private;
+ return multisnap_iterate_devices(ti, s, fn, data);
+}
+
+static int multisnap_snap_iterate_devices(struct dm_target *ti,
+ iterate_devices_callout_fn fn, void *data)
+{
+ int r;
+ struct dm_multisnap_snap *sn = ti->private;
+ struct dm_multisnap *s;
+
+ mutex_lock(&all_multisnapshots_lock);
+ s = sn->s;
+ if (s)
+ r = multisnap_iterate_devices(ti, s, fn, data);
+ else
+ r = 0;
+ mutex_unlock(&all_multisnapshots_lock);
+
+ return r;
+}
+
static int multisnap_origin_map(struct dm_target *ti, struct bio *bio,
union map_info *map_context)
{
@@ -1934,6 +1979,7 @@ static struct target_type multisnap_origin_target = {
.message = multisnap_origin_message,
.status = multisnap_origin_status,
.postsuspend = multisnap_origin_postsuspend,
+ .iterate_devices = multisnap_origin_iterate_devices,
};
static struct target_type multisnap_snap_target = {
@@ -1945,6 +1991,7 @@ static struct target_type multisnap_snap_target = {
.map = multisnap_snap_map,
.end_io = multisnap_snap_end_io,
.status = multisnap_snap_status,
+ .iterate_devices = multisnap_snap_iterate_devices,
};
static int __init dm_multisnapshot_init(void)