[dm-devel] [PATCH v2 2/4] dm thin: eliminate race when io and discards are issued simultaneously to the same block

Thu Dec 6 14:43:45 UTC 2012

From: Joe Thornber <ejt at redhat.com>

There is a race when discard bios and non-discard bios are issued
simultaneously to the same block.

Discard support is expensive for all thin devices precisely because you
have to be careful to quiesce the area you're discarding.  DM thin must
handle this conflicting IO pattern (simultaneous non-discard vs discard)
even though a sane application shouldn't be issuing such IO.

The race manifests as follows:
1. non-discard bio is mapped in thin_bio_map without first locking the
   bio in a dm_bio_prison_cell (locking convention DM thin uses)
2. discard bio is issued to the same block as the non-discard bio
3. discard bio is locked in a dm_bio_prison_cell in process_discard
4. non-discard bio's mapping continues and its all_io_entry is
   incremented so the bio is accounted for in the thin pool's all_io_ds
   (dm_deferred_set used to track time locality of non-discard IO)
5. non-discard bio is finally locked in a dm_bio_prison_cell in
   process_bio

The race can result in deadlock, leaving the block layer hanging waiting
for completion of a discard bio that never completes, e.g.:

INFO: task ruby:15354 blocked for more than 120 seconds.
"echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
ruby            D ffffffff8160f0e0     0 15354  15314 0x00000000
 ffff8802fb08bc58 0000000000000082 ffff8802fb08bfd8 0000000000012900
 ffff8802fb08a010 0000000000012900 0000000000012900 0000000000012900
 ffff8802fb08bfd8 0000000000012900 ffff8803324b9480 ffff88032c6f14c0
Call Trace:
 [<ffffffff814e5a19>] schedule+0x29/0x70
 [<ffffffff814e3d85>] schedule_timeout+0x195/0x220
 [<ffffffffa06b9bc1>] ? _dm_request+0x111/0x160 [dm_mod]
 [<ffffffff814e589e>] wait_for_common+0x11e/0x190
 [<ffffffff8107a170>] ? try_to_wake_up+0x2b0/0x2b0
 [<ffffffff814e59ed>] wait_for_completion+0x1d/0x20
 [<ffffffff81233289>] blkdev_issue_discard+0x219/0x260
 [<ffffffff81233e79>] blkdev_ioctl+0x6e9/0x7b0
 [<ffffffff8119a65c>] block_ioctl+0x3c/0x40
 [<ffffffff8117539c>] do_vfs_ioctl+0x8c/0x340
 [<ffffffff8119a547>] ? block_llseek+0x67/0xb0
 [<ffffffff811756f1>] sys_ioctl+0xa1/0xb0
 [<ffffffff810561f6>] ? sys_rt_sigprocmask+0x86/0xd0
 [<ffffffff814ef099>] system_call_fastpath+0x16/0x1b

(The thinp-test-suite's test_discard_random_sectors reliably hits this
deadlock on fast SSD storage)

The fix for this race is that the all_io_entry for a bio must be
incremented whilst the dm_bio_prison_cell is held for the bio's
associated virtual and physical blocks.  That cell locking wasn't
occurring early enough (in thin_bio_map).  This patch fixes this.

Also, now that thin_bio_map may lock bios in a cell, process_bio is no
longer the only thread that will do so.  Because of this we must be sure
to use cell_defer_no_holder to release all non-holder entries, that were
added by the other thread, because they must be deferred.  This patch
depends on "dm thin: replace calls to dm_cell_release_singleton with
cell_defer_no_holder".

Lastly, now that issue() handles incrementing a bio's all_io_entry the
unlocking of the bio's associated cells (via cell_defer_no_holder)
must be done after issue().

Signed-off-by: Joe Thornber <ejt at redhat.com>
Signed-off-by: Mike Snitzer <snitzer at redhat.com>
Cc: stable at vger.kernel.org
---
 drivers/md/dm-thin.c |   68 +++++++++++++++++++++++++++++++++----------------
 1 files changed, 46 insertions(+), 22 deletions(-)

[v2: split cell_defer_no_holder to prereq patch, rebased on DMERR_LIMIT series, revised patch header]

diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index 2fadaef..6b86d9e 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -368,11 +368,24 @@ static int bio_triggers_commit(struct thin_c *tc, struct bio *bio)
 		dm_thin_changed_this_transaction(tc->td);
 }
 
+static void inc_all_io_entry(struct pool *pool, struct bio *bio)
+{
+	struct dm_thin_endio_hook *h;
+
+	if (bio->bi_rw & REQ_DISCARD)
+		return;
+
+	h = dm_get_mapinfo(bio)->ptr;
+	h->all_io_entry = dm_deferred_entry_inc(pool->all_io_ds);
+}
+
 static void issue(struct thin_c *tc, struct bio *bio)
 {
 	struct pool *pool = tc->pool;
 	unsigned long flags;
 
+	inc_all_io_entry(pool, bio);
+
 	if (!bio_triggers_commit(tc, bio)) {
 		generic_make_request(bio);
 		return;
@@ -967,12 +980,13 @@ static void process_discard(struct thin_c *tc, struct bio *bio)
 			 * a block boundary.  So we submit the discard of a
 			 * partial block appropriately.
 			 */
-			cell_defer_no_holder(tc, cell);
-			cell_defer_no_holder(tc, cell2);
 			if ((!lookup_result.shared) && pool->pf.discard_passdown)
 				remap_and_issue(tc, bio, lookup_result.block);
 			else
 				bio_endio(bio, 0);
+
+			cell_defer_no_holder(tc, cell);
+			cell_defer_no_holder(tc, cell2);
 		}
 		break;
 
@@ -1043,8 +1057,8 @@ static void process_shared_bio(struct thin_c *tc, struct bio *bio,
 
 		h->shared_read_entry = dm_deferred_entry_inc(pool->shared_read_ds);
 
-		cell_defer_no_holder(tc, cell);
 		remap_and_issue(tc, bio, lookup_result->block);
+		cell_defer_no_holder(tc, cell);
 	}
 }
 
@@ -1058,8 +1072,8 @@ static void provision_block(struct thin_c *tc, struct bio *bio, dm_block_t block
 	 * Remap empty bios (flushes) immediately, without provisioning.
 	 */
 	if (!bio->bi_size) {
-		cell_defer_no_holder(tc, cell);
 		remap_and_issue(tc, bio, 0);
+		cell_defer_no_holder(tc, cell);
 		return;
 	}
 
@@ -1114,27 +1128,22 @@ static void process_bio(struct thin_c *tc, struct bio *bio)
 	r = dm_thin_find_block(tc->td, block, 1, &lookup_result);
 	switch (r) {
 	case 0:
-		/*
-		 * We can release this cell now.  This thread is the only
-		 * one that puts bios into a cell, and we know there were
-		 * no preceding bios.
-		 */
-		/*
-		 * TODO: this will probably have to change when discard goes
-		 * back in.
-		 */
-		cell_defer_no_holder(tc, cell);
-
 		if (lookup_result.shared)
 			process_shared_bio(tc, bio, block, &lookup_result);
 		else
 			remap_and_issue(tc, bio, lookup_result.block);
+
+		/*
+		 * We can release this cell now.  But there may be other
+		 * bios in the cell from the thin_bio_map function.
+		 */
+		cell_defer_no_holder(tc, cell);
 		break;
 
 	case -ENODATA:
 		if (bio_data_dir(bio) == READ && tc->origin_dev) {
-			cell_defer_no_holder(tc, cell);
 			remap_to_origin_and_issue(tc, bio);
+			cell_defer_no_holder(tc, cell);
 		} else
 			provision_block(tc, bio, block, cell);
 		break;
@@ -1352,7 +1361,7 @@ static struct dm_thin_endio_hook *thin_hook_bio(struct thin_c *tc, struct bio *b
 
 	h->tc = tc;
 	h->shared_read_entry = NULL;
-	h->all_io_entry = bio->bi_rw & REQ_DISCARD ? NULL : dm_deferred_entry_inc(pool->all_io_ds);
+	h->all_io_entry = NULL;
 	h->overwrite_mapping = NULL;
 
 	return h;
@@ -1369,6 +1378,8 @@ static int thin_bio_map(struct dm_target *ti, struct bio *bio,
 	dm_block_t block = get_bio_block(tc, bio);
 	struct dm_thin_device *td = tc->td;
 	struct dm_thin_lookup_result result;
+	struct dm_bio_prison_cell *cell1, *cell2;
+	struct dm_cell_key key;
 
 	map_context->ptr = thin_hook_bio(tc, bio);
 
@@ -1405,12 +1416,25 @@ static int thin_bio_map(struct dm_target *ti, struct bio *bio,
 			 * shared flag will be set in their case.
 			 */
 			thin_defer_bio(tc, bio);
-			r = DM_MAPIO_SUBMITTED;
-		} else {
-			remap(tc, bio, result.block);
-			r = DM_MAPIO_REMAPPED;
+			return DM_MAPIO_SUBMITTED;
 		}
-		break;
+
+		build_virtual_key(tc->td, block, &key);
+		if (dm_bio_detain(tc->pool->prison, &key, bio, &cell1))
+			return DM_MAPIO_SUBMITTED;
+
+		build_data_key(tc->td, result.block, &key);
+		if (dm_bio_detain(tc->pool->prison, &key, bio, &cell2)) {
+			cell_defer_no_holder(tc, cell1);
+			return DM_MAPIO_SUBMITTED;
+		}
+
+		inc_all_io_entry(tc->pool, bio);
+		cell_defer_no_holder(tc, cell2);
+		cell_defer_no_holder(tc, cell1);
+
+		remap(tc, bio, result.block);
+		return DM_MAPIO_REMAPPED;
 
 	case -ENODATA:
 		if (get_pool_mode(tc->pool) == PM_READ_ONLY) {
-- 
1.7.1