[dm-devel] [PATCH REPOST RFC] relaxed barriers

Christoph Hellwig hch at lst.de
Sun Aug 8 14:31:26 UTC 2010


On Sat, Aug 07, 2010 at 12:13:06PM +0200, Tejun Heo wrote:
> The patch was on top of v2.6.35 but was generated against a dirty tree
> and wouldn't apply cleanly.  Here's the proper one.

Here's an updated version:

 (a) ported to Jens' current block tree
 (b) optimize barriers to be no-ops on devices that don't require flushes
 (c) redo the blk_queue_ordered interface to just set the QUEUE_HAS_FLUSH
     and QUEUE_HAS_FUA flags (a usage sketch follows below).

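To illustrate the new driver-facing interface, here is a minimal sketch of
a conversion.  The helper and its write_cache/fua arguments are made up for
illustration; only blk_queue_cache_features(), QUEUE_HAS_FLUSH and
QUEUE_HAS_FUA come from the patch (cf. the sd.c and virtio_blk hunks below):

static void example_setup_cache_features(struct request_queue *q,
					 bool write_cache, bool fua)
{
	unsigned cache_features = 0;

	if (write_cache) {
		/* volatile write cache: barriers need a cache flush */
		cache_features |= QUEUE_HAS_FLUSH;
		if (fua)
			/* FUA writes honoured: no post-flush needed */
			cache_features |= QUEUE_HAS_FUA;
	}

	/* write-through device: pass 0 and barriers become no-ops */
	blk_queue_cache_features(q, cache_features);
}

Drivers for devices without a volatile write cache simply don't call it (or
pass 0), which is what the virtio-blk and mmc changes below rely on.
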
Index: linux-2.6/block/blk-barrier.c
===================================================================
--- linux-2.6.orig/block/blk-barrier.c	2010-08-07 12:53:23.727479189 -0400
+++ linux-2.6/block/blk-barrier.c	2010-08-07 14:52:21.402479191 -0400
@@ -9,37 +9,36 @@
 
 #include "blk.h"
 
+/*
+ * Ordered operation sequence.
+ */
+enum {
+	QUEUE_ORDSEQ_STARTED	= (1 << 0), /* flushing in progress */
+	QUEUE_ORDSEQ_PREFLUSH	= (1 << 1), /* pre-flushing in progress */
+	QUEUE_ORDSEQ_BAR	= (1 << 2), /* barrier write in progress */
+	QUEUE_ORDSEQ_POSTFLUSH	= (1 << 3), /* post-flushing in progress */
+	QUEUE_ORDSEQ_DONE	= (1 << 4),
+};
+
+static struct request *queue_next_ordseq(struct request_queue *q);
+
 /**
- * blk_queue_ordered - does this queue support ordered writes
- * @q:        the request queue
- * @ordered:  one of QUEUE_ORDERED_*
- *
- * Description:
- *   For journalled file systems, doing ordered writes on a commit
- *   block instead of explicitly doing wait_on_buffer (which is bad
- *   for performance) can be a big win. Block drivers supporting this
- *   feature should call this function and indicate so.
- *
+ * blk_queue_cache_features - set the supported cache control features
+ * @q:			the request queue
+ * @cache_features:	the supported features
  **/
-int blk_queue_ordered(struct request_queue *q, unsigned ordered)
+int blk_queue_cache_features(struct request_queue *q, unsigned cache_features)
 {
-	if (ordered != QUEUE_ORDERED_NONE &&
-	    ordered != QUEUE_ORDERED_DRAIN &&
-	    ordered != QUEUE_ORDERED_DRAIN_FLUSH &&
-	    ordered != QUEUE_ORDERED_DRAIN_FUA &&
-	    ordered != QUEUE_ORDERED_TAG &&
-	    ordered != QUEUE_ORDERED_TAG_FLUSH &&
-	    ordered != QUEUE_ORDERED_TAG_FUA) {
-		printk(KERN_ERR "blk_queue_ordered: bad value %d\n", ordered);
+	if (cache_features & ~(QUEUE_HAS_FLUSH|QUEUE_HAS_FUA)) {
+		printk(KERN_ERR "blk_queue_cache_features: bad value %d\n",
+			cache_features);
 		return -EINVAL;
 	}
 
-	q->ordered = ordered;
-	q->next_ordered = ordered;
-
+	q->cache_features = cache_features;
 	return 0;
 }
-EXPORT_SYMBOL(blk_queue_ordered);
+EXPORT_SYMBOL(blk_queue_cache_features);
 
 /*
  * Cache flushing for ordered writes handling
@@ -51,38 +50,10 @@ unsigned blk_ordered_cur_seq(struct requ
 	return 1 << ffz(q->ordseq);
 }
 
-unsigned blk_ordered_req_seq(struct request *rq)
-{
-	struct request_queue *q = rq->q;
-
-	BUG_ON(q->ordseq == 0);
-
-	if (rq == &q->pre_flush_rq)
-		return QUEUE_ORDSEQ_PREFLUSH;
-	if (rq == &q->bar_rq)
-		return QUEUE_ORDSEQ_BAR;
-	if (rq == &q->post_flush_rq)
-		return QUEUE_ORDSEQ_POSTFLUSH;
-
-	/*
-	 * !fs requests don't need to follow barrier ordering.  Always
-	 * put them at the front.  This fixes the following deadlock.
-	 *
-	 * http://thread.gmane.org/gmane.linux.kernel/537473
-	 */
-	if (rq->cmd_type != REQ_TYPE_FS)
-		return QUEUE_ORDSEQ_DRAIN;
-
-	if ((rq->cmd_flags & REQ_ORDERED_COLOR) ==
-	    (q->orig_bar_rq->cmd_flags & REQ_ORDERED_COLOR))
-		return QUEUE_ORDSEQ_DRAIN;
-	else
-		return QUEUE_ORDSEQ_DONE;
-}
-
-bool blk_ordered_complete_seq(struct request_queue *q, unsigned seq, int error)
+static struct request *blk_ordered_complete_seq(struct request_queue *q,
+						unsigned seq, int error)
 {
-	struct request *rq;
+	struct request *rq = NULL;
 
 	if (error && !q->orderr)
 		q->orderr = error;
@@ -90,16 +61,22 @@ bool blk_ordered_complete_seq(struct req
 	BUG_ON(q->ordseq & seq);
 	q->ordseq |= seq;
 
-	if (blk_ordered_cur_seq(q) != QUEUE_ORDSEQ_DONE)
-		return false;
-
-	/*
-	 * Okay, sequence complete.
-	 */
-	q->ordseq = 0;
-	rq = q->orig_bar_rq;
-	__blk_end_request_all(rq, q->orderr);
-	return true;
+	if (blk_ordered_cur_seq(q) != QUEUE_ORDSEQ_DONE) {
+		/* not complete yet, queue the next ordered sequence */
+		rq = queue_next_ordseq(q);
+	} else {
+		/* complete this barrier request */
+		__blk_end_request_all(q->orig_bar_rq, q->orderr);
+		q->orig_bar_rq = NULL;
+		q->ordseq = 0;
+
+		/* dispatch the next barrier if there's one */
+		if (!list_empty(&q->pending_barriers)) {
+			rq = list_entry_rq(q->pending_barriers.next);
+			list_move(&rq->queuelist, &q->queue_head);
+		}
+	}
+	return rq;
 }
 
 static void pre_flush_end_io(struct request *rq, int error)
@@ -120,155 +97,100 @@ static void post_flush_end_io(struct req
 	blk_ordered_complete_seq(rq->q, QUEUE_ORDSEQ_POSTFLUSH, error);
 }
 
-static void queue_flush(struct request_queue *q, unsigned which)
+static void init_flush_request(struct request_queue *q, struct request *rq)
 {
-	struct request *rq;
-	rq_end_io_fn *end_io;
+	rq->cmd_type = REQ_TYPE_FS;
+	rq->cmd_flags = REQ_FLUSH;
+	rq->rq_disk = q->orig_bar_rq->rq_disk;
+}
 
-	if (which == QUEUE_ORDERED_DO_PREFLUSH) {
-		rq = &q->pre_flush_rq;
-		end_io = pre_flush_end_io;
-	} else {
-		rq = &q->post_flush_rq;
-		end_io = post_flush_end_io;
-	}
+/*
+ * Initialize proxy request and queue it.
+ */
+static struct request *queue_next_ordseq(struct request_queue *q)
+{
+	struct request *rq = &q->bar_rq;
 
 	blk_rq_init(q, rq);
-	rq->cmd_type = REQ_TYPE_FS;
-	rq->cmd_flags = REQ_HARDBARRIER | REQ_FLUSH;
-	rq->rq_disk = q->orig_bar_rq->rq_disk;
-	rq->end_io = end_io;
+
+	switch (blk_ordered_cur_seq(q)) {
+	case QUEUE_ORDSEQ_PREFLUSH:
+		init_flush_request(q, rq);
+		rq->end_io = pre_flush_end_io;
+		break;
+	case QUEUE_ORDSEQ_BAR:
+		init_request_from_bio(rq, q->orig_bar_rq->bio);
+		rq->cmd_flags &= ~REQ_HARDBARRIER;
+		if (q->cache_features & QUEUE_HAS_FUA)
+			rq->cmd_flags |= REQ_FUA;
+		rq->end_io = bar_end_io;
+		break;
+	case QUEUE_ORDSEQ_POSTFLUSH:
+		init_flush_request(q, rq);
+		rq->end_io = post_flush_end_io;
+		break;
+	default:
+		BUG();
+	}
 
 	elv_insert(q, rq, ELEVATOR_INSERT_FRONT);
+	return rq;
 }
 
-static inline bool start_ordered(struct request_queue *q, struct request **rqp)
+struct request *blk_do_ordered(struct request_queue *q, struct request *rq)
 {
-	struct request *rq = *rqp;
 	unsigned skip = 0;
 
-	q->orderr = 0;
-	q->ordered = q->next_ordered;
-	q->ordseq |= QUEUE_ORDSEQ_STARTED;
+	if (rq->cmd_type != REQ_TYPE_FS)
+		return rq;
+	if (!(rq->cmd_flags & REQ_HARDBARRIER))
+		return rq;
 
-	/*
-	 * For an empty barrier, there's no actual BAR request, which
-	 * in turn makes POSTFLUSH unnecessary.  Mask them off.
-	 */
-	if (!blk_rq_sectors(rq)) {
-		q->ordered &= ~(QUEUE_ORDERED_DO_BAR |
-				QUEUE_ORDERED_DO_POSTFLUSH);
+	if (!(q->cache_features & QUEUE_HAS_FLUSH)) {
 		/*
-		 * Empty barrier on a write-through device w/ ordered
-		 * tag has no command to issue and without any command
-		 * to issue, ordering by tag can't be used.  Drain
-		 * instead.
+		 * No flush required.  We can just pass write requests
+		 * on and complete cache flush requests ASAP.
 		 */
-		if ((q->ordered & QUEUE_ORDERED_BY_TAG) &&
-		    !(q->ordered & QUEUE_ORDERED_DO_PREFLUSH)) {
-			q->ordered &= ~QUEUE_ORDERED_BY_TAG;
-			q->ordered |= QUEUE_ORDERED_BY_DRAIN;
+		if (blk_rq_sectors(rq)) {
+			rq->cmd_flags &= ~REQ_HARDBARRIER;
+			return rq;
 		}
+		blk_dequeue_request(rq);
+		__blk_end_request_all(rq, 0);
+		return NULL;
 	}
 
-	/* stash away the original request */
-	blk_dequeue_request(rq);
-	q->orig_bar_rq = rq;
-	rq = NULL;
-
-	/*
-	 * Queue ordered sequence.  As we stack them at the head, we
-	 * need to queue in reverse order.  Note that we rely on that
-	 * no fs request uses ELEVATOR_INSERT_FRONT and thus no fs
-	 * request gets inbetween ordered sequence.
-	 */
-	if (q->ordered & QUEUE_ORDERED_DO_POSTFLUSH) {
-		queue_flush(q, QUEUE_ORDERED_DO_POSTFLUSH);
-		rq = &q->post_flush_rq;
-	} else
-		skip |= QUEUE_ORDSEQ_POSTFLUSH;
-
-	if (q->ordered & QUEUE_ORDERED_DO_BAR) {
-		rq = &q->bar_rq;
-
-		/* initialize proxy request and queue it */
-		blk_rq_init(q, rq);
-		if (bio_data_dir(q->orig_bar_rq->bio) == WRITE)
-			rq->cmd_flags |= REQ_WRITE;
-		if (q->ordered & QUEUE_ORDERED_DO_FUA)
-			rq->cmd_flags |= REQ_FUA;
-		init_request_from_bio(rq, q->orig_bar_rq->bio);
-		rq->end_io = bar_end_io;
-
-		elv_insert(q, rq, ELEVATOR_INSERT_FRONT);
-	} else
-		skip |= QUEUE_ORDSEQ_BAR;
-
-	if (q->ordered & QUEUE_ORDERED_DO_PREFLUSH) {
-		queue_flush(q, QUEUE_ORDERED_DO_PREFLUSH);
-		rq = &q->pre_flush_rq;
-	} else
-		skip |= QUEUE_ORDSEQ_PREFLUSH;
-
-	if ((q->ordered & QUEUE_ORDERED_BY_DRAIN) && queue_in_flight(q))
-		rq = NULL;
-	else
-		skip |= QUEUE_ORDSEQ_DRAIN;
+	if (q->ordseq) {
+		/*
+		 * A barrier is already in progress, and barriers can't be
+		 * processed in parallel.  Queue this one for later processing.
+		 */
+		list_move_tail(&rq->queuelist, &q->pending_barriers);
+		return NULL;
+	}
 
-	*rqp = rq;
 
 	/*
-	 * Complete skipped sequences.  If whole sequence is complete,
-	 * return false to tell elevator that this request is gone.
+	 * Start a new ordered sequence
 	 */
-	return !blk_ordered_complete_seq(q, skip, 0);
-}
-
-bool blk_do_ordered(struct request_queue *q, struct request **rqp)
-{
-	struct request *rq = *rqp;
-	const int is_barrier = rq->cmd_type == REQ_TYPE_FS &&
-				(rq->cmd_flags & REQ_HARDBARRIER);
-
-	if (!q->ordseq) {
-		if (!is_barrier)
-			return true;
-
-		if (q->next_ordered != QUEUE_ORDERED_NONE)
-			return start_ordered(q, rqp);
-		else {
-			/*
-			 * Queue ordering not supported.  Terminate
-			 * with prejudice.
-			 */
-			blk_dequeue_request(rq);
-			__blk_end_request_all(rq, -EOPNOTSUPP);
-			*rqp = NULL;
-			return false;
-		}
-	}
+	q->orderr = 0;
+	q->ordseq |= QUEUE_ORDSEQ_STARTED;
 
 	/*
-	 * Ordered sequence in progress
+	 * For an empty barrier, there's no actual BAR request, which
+	 * in turn makes POSTFLUSH unnecessary.  Mask them off.
 	 */
+	if (!blk_rq_sectors(rq))
+		skip |= (QUEUE_ORDSEQ_BAR|QUEUE_ORDSEQ_POSTFLUSH);
+	else if (q->cache_features & QUEUE_HAS_FUA)
+		skip |= QUEUE_ORDSEQ_POSTFLUSH;
 
-	/* Special requests are not subject to ordering rules. */
-	if (rq->cmd_type != REQ_TYPE_FS &&
-	    rq != &q->pre_flush_rq && rq != &q->post_flush_rq)
-		return true;
-
-	if (q->ordered & QUEUE_ORDERED_BY_TAG) {
-		/* Ordered by tag.  Blocking the next barrier is enough. */
-		if (is_barrier && rq != &q->bar_rq)
-			*rqp = NULL;
-	} else {
-		/* Ordered by draining.  Wait for turn. */
-		WARN_ON(blk_ordered_req_seq(rq) < blk_ordered_cur_seq(q));
-		if (blk_ordered_req_seq(rq) > blk_ordered_cur_seq(q))
-			*rqp = NULL;
-	}
+	/* stash away the original request */
+	blk_dequeue_request(rq);
+	q->orig_bar_rq = rq;
 
-	return true;
+	/* complete skipped sequences and return the first sequence */
+	return blk_ordered_complete_seq(q, skip, 0);
 }
 
 static void bio_end_empty_barrier(struct bio *bio, int err)
Index: linux-2.6/include/linux/blkdev.h
===================================================================
--- linux-2.6.orig/include/linux/blkdev.h	2010-08-07 12:53:23.774479189 -0400
+++ linux-2.6/include/linux/blkdev.h	2010-08-07 14:51:42.751479190 -0400
@@ -354,13 +354,20 @@ struct request_queue
 #ifdef CONFIG_BLK_DEV_IO_TRACE
 	struct blk_trace	*blk_trace;
 #endif
+
+	/*
+	 * Features this queue understands.
+	 */
+	unsigned int		cache_features;
+
 	/*
 	 * reserved for flush operations
 	 */
-	unsigned int		ordered, next_ordered, ordseq;
-	int			orderr, ordcolor;
-	struct request		pre_flush_rq, bar_rq, post_flush_rq;
-	struct request		*orig_bar_rq;
+	unsigned int		ordseq;
+	int			orderr;
+	struct request		bar_rq;
+	struct request		*orig_bar_rq;
+	struct list_head	pending_barriers;
 
 	struct mutex		sysfs_lock;
 
@@ -461,54 +468,12 @@ static inline void queue_flag_clear(unsi
 	__clear_bit(flag, &q->queue_flags);
 }
 
+/*
+ * Possible features to control a volatile write cache.
+ */
 enum {
-	/*
-	 * Hardbarrier is supported with one of the following methods.
-	 *
-	 * NONE		: hardbarrier unsupported
-	 * DRAIN	: ordering by draining is enough
-	 * DRAIN_FLUSH	: ordering by draining w/ pre and post flushes
-	 * DRAIN_FUA	: ordering by draining w/ pre flush and FUA write
-	 * TAG		: ordering by tag is enough
-	 * TAG_FLUSH	: ordering by tag w/ pre and post flushes
-	 * TAG_FUA	: ordering by tag w/ pre flush and FUA write
-	 */
-	QUEUE_ORDERED_BY_DRAIN		= 0x01,
-	QUEUE_ORDERED_BY_TAG		= 0x02,
-	QUEUE_ORDERED_DO_PREFLUSH	= 0x10,
-	QUEUE_ORDERED_DO_BAR		= 0x20,
-	QUEUE_ORDERED_DO_POSTFLUSH	= 0x40,
-	QUEUE_ORDERED_DO_FUA		= 0x80,
-
-	QUEUE_ORDERED_NONE		= 0x00,
-
-	QUEUE_ORDERED_DRAIN		= QUEUE_ORDERED_BY_DRAIN |
-					  QUEUE_ORDERED_DO_BAR,
-	QUEUE_ORDERED_DRAIN_FLUSH	= QUEUE_ORDERED_DRAIN |
-					  QUEUE_ORDERED_DO_PREFLUSH |
-					  QUEUE_ORDERED_DO_POSTFLUSH,
-	QUEUE_ORDERED_DRAIN_FUA		= QUEUE_ORDERED_DRAIN |
-					  QUEUE_ORDERED_DO_PREFLUSH |
-					  QUEUE_ORDERED_DO_FUA,
-
-	QUEUE_ORDERED_TAG		= QUEUE_ORDERED_BY_TAG |
-					  QUEUE_ORDERED_DO_BAR,
-	QUEUE_ORDERED_TAG_FLUSH		= QUEUE_ORDERED_TAG |
-					  QUEUE_ORDERED_DO_PREFLUSH |
-					  QUEUE_ORDERED_DO_POSTFLUSH,
-	QUEUE_ORDERED_TAG_FUA		= QUEUE_ORDERED_TAG |
-					  QUEUE_ORDERED_DO_PREFLUSH |
-					  QUEUE_ORDERED_DO_FUA,
-
-	/*
-	 * Ordered operation sequence
-	 */
-	QUEUE_ORDSEQ_STARTED	= 0x01,	/* flushing in progress */
-	QUEUE_ORDSEQ_DRAIN	= 0x02,	/* waiting for the queue to be drained */
-	QUEUE_ORDSEQ_PREFLUSH	= 0x04,	/* pre-flushing in progress */
-	QUEUE_ORDSEQ_BAR	= 0x08,	/* original barrier req in progress */
-	QUEUE_ORDSEQ_POSTFLUSH	= 0x10,	/* post-flushing in progress */
-	QUEUE_ORDSEQ_DONE	= 0x20,
+	QUEUE_HAS_FLUSH		= 1 << 0,	/* supports REQ_FLUSH */
+	QUEUE_HAS_FUA		= 1 << 1,	/* supports REQ_FUA */
 };
 
 #define blk_queue_plugged(q)	test_bit(QUEUE_FLAG_PLUGGED, &(q)->queue_flags)
@@ -879,11 +844,9 @@ extern void blk_queue_softirq_done(struc
 extern void blk_queue_rq_timed_out(struct request_queue *, rq_timed_out_fn *);
 extern void blk_queue_rq_timeout(struct request_queue *, unsigned int);
 extern struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev);
-extern int blk_queue_ordered(struct request_queue *, unsigned);
-extern bool blk_do_ordered(struct request_queue *, struct request **);
+extern int blk_queue_cache_features(struct request_queue *, unsigned);
 extern unsigned blk_ordered_cur_seq(struct request_queue *);
 extern unsigned blk_ordered_req_seq(struct request *);
-extern bool blk_ordered_complete_seq(struct request_queue *, unsigned, int);
 
 extern int blk_rq_map_sg(struct request_queue *, struct request *, struct scatterlist *);
 extern void blk_dump_rq_flags(struct request *, char *);
Index: linux-2.6/drivers/block/virtio_blk.c
===================================================================
--- linux-2.6.orig/drivers/block/virtio_blk.c	2010-08-07 12:53:23.800479189 -0400
+++ linux-2.6/drivers/block/virtio_blk.c	2010-08-07 14:51:34.198479189 -0400
@@ -388,31 +388,8 @@ static int __devinit virtblk_probe(struc
 	vblk->disk->driverfs_dev = &vdev->dev;
 	index++;
 
-	if (virtio_has_feature(vdev, VIRTIO_BLK_F_FLUSH)) {
-		/*
-		 * If the FLUSH feature is supported we do have support for
-		 * flushing a volatile write cache on the host.  Use that
-		 * to implement write barrier support.
-		 */
-		blk_queue_ordered(q, QUEUE_ORDERED_DRAIN_FLUSH);
-	} else if (virtio_has_feature(vdev, VIRTIO_BLK_F_BARRIER)) {
-		/*
-		 * If the BARRIER feature is supported the host expects us
-		 * to order request by tags.  This implies there is not
-		 * volatile write cache on the host, and that the host
-		 * never re-orders outstanding I/O.  This feature is not
-		 * useful for real life scenarious and deprecated.
-		 */
-		blk_queue_ordered(q, QUEUE_ORDERED_TAG);
-	} else {
-		/*
-		 * If the FLUSH feature is not supported we must assume that
-		 * the host does not perform any kind of volatile write
-		 * caching. We still need to drain the queue to provider
-		 * proper barrier semantics.
-		 */
-		blk_queue_ordered(q, QUEUE_ORDERED_DRAIN);
-	}
+	if (virtio_has_feature(vdev, VIRTIO_BLK_F_FLUSH))
+		blk_queue_cache_features(q, QUEUE_HAS_FLUSH);
 
 	/* If disk is read-only in the host, the guest should obey */
 	if (virtio_has_feature(vdev, VIRTIO_BLK_F_RO))
Index: linux-2.6/drivers/scsi/sd.c
===================================================================
--- linux-2.6.orig/drivers/scsi/sd.c	2010-08-07 12:53:23.872479189 -0400
+++ linux-2.6/drivers/scsi/sd.c	2010-08-07 14:54:47.812479189 -0400
@@ -2109,7 +2109,7 @@ static int sd_revalidate_disk(struct gen
 	struct scsi_disk *sdkp = scsi_disk(disk);
 	struct scsi_device *sdp = sdkp->device;
 	unsigned char *buffer;
-	unsigned ordered;
+	unsigned ordered = 0;
 
 	SCSI_LOG_HLQUEUE(3, sd_printk(KERN_INFO, sdkp,
 				      "sd_revalidate_disk\n"));
@@ -2151,17 +2151,14 @@ static int sd_revalidate_disk(struct gen
 
 	/*
 	 * We now have all cache related info, determine how we deal
-	 * with ordered requests.  Note that as the current SCSI
-	 * dispatch function can alter request order, we cannot use
-	 * QUEUE_ORDERED_TAG_* even when ordered tag is supported.
+	 * with barriers.
 	 */
-	if (sdkp->WCE)
-		ordered = sdkp->DPOFUA
-			? QUEUE_ORDERED_DRAIN_FUA : QUEUE_ORDERED_DRAIN_FLUSH;
-	else
-		ordered = QUEUE_ORDERED_DRAIN;
-
-	blk_queue_ordered(sdkp->disk->queue, ordered);
+	if (sdkp->WCE) {
+		ordered |= QUEUE_HAS_FLUSH;
+		if (sdkp->DPOFUA)
+			ordered |= QUEUE_HAS_FUA;
+	}
+	blk_queue_cache_features(sdkp->disk->queue, ordered);
 
 	set_capacity(disk, sdkp->capacity);
 	kfree(buffer);
Index: linux-2.6/block/blk-core.c
===================================================================
--- linux-2.6.orig/block/blk-core.c	2010-08-07 12:53:23.744479189 -0400
+++ linux-2.6/block/blk-core.c	2010-08-07 14:56:35.087479189 -0400
@@ -520,6 +520,7 @@ struct request_queue *blk_alloc_queue_no
 	init_timer(&q->unplug_timer);
 	setup_timer(&q->timeout, blk_rq_timed_out_timer, (unsigned long) q);
 	INIT_LIST_HEAD(&q->timeout_list);
+	INIT_LIST_HEAD(&q->pending_barriers);
 	INIT_WORK(&q->unplug_work, blk_unplug_work);
 
 	kobject_init(&q->kobj, &blk_queue_ktype);
@@ -1037,22 +1038,6 @@ void blk_insert_request(struct request_q
 }
 EXPORT_SYMBOL(blk_insert_request);
 
-/*
- * add-request adds a request to the linked list.
- * queue lock is held and interrupts disabled, as we muck with the
- * request queue list.
- */
-static inline void add_request(struct request_queue *q, struct request *req)
-{
-	drive_stat_acct(req, 1);
-
-	/*
-	 * elevator indicated where it wants this request to be
-	 * inserted at elevator_merge time
-	 */
-	__elv_add_request(q, req, ELEVATOR_INSERT_SORT, 0);
-}
-
 static void part_round_stats_single(int cpu, struct hd_struct *part,
 				    unsigned long now)
 {
@@ -1201,13 +1186,9 @@ static int __make_request(struct request
 	const bool sync = (bio->bi_rw & REQ_SYNC);
 	const bool unplug = (bio->bi_rw & REQ_UNPLUG);
 	const unsigned int ff = bio->bi_rw & REQ_FAILFAST_MASK;
+	int where = ELEVATOR_INSERT_SORT;
 	int rw_flags;
 
-	if ((bio->bi_rw & REQ_HARDBARRIER) &&
-	    (q->next_ordered == QUEUE_ORDERED_NONE)) {
-		bio_endio(bio, -EOPNOTSUPP);
-		return 0;
-	}
 	/*
 	 * low level driver can indicate that it wants pages above a
 	 * certain limit bounced to low memory (ie for highmem, or even
@@ -1217,7 +1198,12 @@ static int __make_request(struct request
 
 	spin_lock_irq(q->queue_lock);
 
-	if (unlikely((bio->bi_rw & REQ_HARDBARRIER)) || elv_queue_empty(q))
+	if (bio->bi_rw & REQ_HARDBARRIER) {
+		where = ELEVATOR_INSERT_ORDERED;
+		goto get_rq;
+	}
+
+	if (elv_queue_empty(q))
 		goto get_rq;
 
 	el_ret = elv_merge(q, &req, bio);
@@ -1314,7 +1300,10 @@ get_rq:
 		req->cpu = blk_cpu_to_group(smp_processor_id());
 	if (queue_should_plug(q) && elv_queue_empty(q))
 		blk_plug_device(q);
-	add_request(q, req);
+
+	/* insert the request into the elevator */
+	drive_stat_acct(req, 1);
+	__elv_add_request(q, req, where, 0);
 out:
 	if (unplug || !queue_should_plug(q))
 		__generic_unplug_device(q);
Index: linux-2.6/block/elevator.c
===================================================================
--- linux-2.6.orig/block/elevator.c	2010-08-07 12:53:23.752479189 -0400
+++ linux-2.6/block/elevator.c	2010-08-07 12:53:53.162479190 -0400
@@ -564,7 +564,7 @@ void elv_requeue_request(struct request_
 
 	rq->cmd_flags &= ~REQ_STARTED;
 
-	elv_insert(q, rq, ELEVATOR_INSERT_REQUEUE);
+	elv_insert(q, rq, ELEVATOR_INSERT_FRONT);
 }
 
 void elv_drain_elevator(struct request_queue *q)
@@ -611,8 +611,6 @@ void elv_quiesce_end(struct request_queu
 
 void elv_insert(struct request_queue *q, struct request *rq, int where)
 {
-	struct list_head *pos;
-	unsigned ordseq;
 	int unplug_it = 1;
 
 	trace_block_rq_insert(q, rq);
@@ -622,10 +620,14 @@ void elv_insert(struct request_queue *q,
 	switch (where) {
 	case ELEVATOR_INSERT_FRONT:
 		rq->cmd_flags |= REQ_SOFTBARRIER;
-
 		list_add(&rq->queuelist, &q->queue_head);
 		break;
 
+	case ELEVATOR_INSERT_ORDERED:
+		rq->cmd_flags |= REQ_SOFTBARRIER;
+		list_add_tail(&rq->queuelist, &q->queue_head);
+		break;
+
 	case ELEVATOR_INSERT_BACK:
 		rq->cmd_flags |= REQ_SOFTBARRIER;
 		elv_drain_elevator(q);
@@ -662,36 +664,6 @@ void elv_insert(struct request_queue *q,
 		q->elevator->ops->elevator_add_req_fn(q, rq);
 		break;
 
-	case ELEVATOR_INSERT_REQUEUE:
-		/*
-		 * If ordered flush isn't in progress, we do front
-		 * insertion; otherwise, requests should be requeued
-		 * in ordseq order.
-		 */
-		rq->cmd_flags |= REQ_SOFTBARRIER;
-
-		/*
-		 * Most requeues happen because of a busy condition,
-		 * don't force unplug of the queue for that case.
-		 */
-		unplug_it = 0;
-
-		if (q->ordseq == 0) {
-			list_add(&rq->queuelist, &q->queue_head);
-			break;
-		}
-
-		ordseq = blk_ordered_req_seq(rq);
-
-		list_for_each(pos, &q->queue_head) {
-			struct request *pos_rq = list_entry_rq(pos);
-			if (ordseq <= blk_ordered_req_seq(pos_rq))
-				break;
-		}
-
-		list_add_tail(&rq->queuelist, pos);
-		break;
-
 	default:
 		printk(KERN_ERR "%s: bad insertion point %d\n",
 		       __func__, where);
@@ -710,33 +682,15 @@ void elv_insert(struct request_queue *q,
 void __elv_add_request(struct request_queue *q, struct request *rq, int where,
 		       int plug)
 {
-	if (q->ordcolor)
-		rq->cmd_flags |= REQ_ORDERED_COLOR;
-
 	if (rq->cmd_flags & (REQ_SOFTBARRIER | REQ_HARDBARRIER)) {
-		/*
-		 * toggle ordered color
-		 */
-		if (rq->cmd_flags & REQ_HARDBARRIER)
-			q->ordcolor ^= 1;
-
-		/*
-		 * barriers implicitly indicate back insertion
-		 */
-		if (where == ELEVATOR_INSERT_SORT)
-			where = ELEVATOR_INSERT_BACK;
-
-		/*
-		 * this request is scheduling boundary, update
-		 * end_sector
-		 */
+		/* barriers are scheduling boundary, update end_sector */
 		if (rq->cmd_type == REQ_TYPE_FS ||
 		    (rq->cmd_flags & REQ_DISCARD)) {
 			q->end_sector = rq_end_sector(rq);
 			q->boundary_rq = rq;
 		}
 	} else if (!(rq->cmd_flags & REQ_ELVPRIV) &&
-		    where == ELEVATOR_INSERT_SORT)
+		   where == ELEVATOR_INSERT_SORT)
 		where = ELEVATOR_INSERT_BACK;
 
 	if (plug)
@@ -849,24 +803,6 @@ void elv_completed_request(struct reques
 		    e->ops->elevator_completed_req_fn)
 			e->ops->elevator_completed_req_fn(q, rq);
 	}
-
-	/*
-	 * Check if the queue is waiting for fs requests to be
-	 * drained for flush sequence.
-	 */
-	if (unlikely(q->ordseq)) {
-		struct request *next = NULL;
-
-		if (!list_empty(&q->queue_head))
-			next = list_entry_rq(q->queue_head.next);
-
-		if (!queue_in_flight(q) &&
-		    blk_ordered_cur_seq(q) == QUEUE_ORDSEQ_DRAIN &&
-		    (!next || blk_ordered_req_seq(next) > QUEUE_ORDSEQ_DRAIN)) {
-			blk_ordered_complete_seq(q, QUEUE_ORDSEQ_DRAIN, 0);
-			__blk_run_queue(q);
-		}
-	}
 }
 
 #define to_elv(atr) container_of((atr), struct elv_fs_entry, attr)
Index: linux-2.6/block/blk.h
===================================================================
--- linux-2.6.orig/block/blk.h	2010-08-07 12:53:23.762479189 -0400
+++ linux-2.6/block/blk.h	2010-08-07 12:53:53.171479190 -0400
@@ -51,6 +51,8 @@ static inline void blk_clear_rq_complete
  */
 #define ELV_ON_HASH(rq)		(!hlist_unhashed(&(rq)->hash))
 
+struct request *blk_do_ordered(struct request_queue *q, struct request *rq);
+
 static inline struct request *__elv_next_request(struct request_queue *q)
 {
 	struct request *rq;
@@ -58,7 +60,8 @@ static inline struct request *__elv_next
 	while (1) {
 		while (!list_empty(&q->queue_head)) {
 			rq = list_entry_rq(q->queue_head.next);
-			if (blk_do_ordered(q, &rq))
+			rq = blk_do_ordered(q, rq);
+			if (rq)
 				return rq;
 		}
 
Index: linux-2.6/drivers/block/xen-blkfront.c
===================================================================
--- linux-2.6.orig/drivers/block/xen-blkfront.c	2010-08-07 12:53:23.807479189 -0400
+++ linux-2.6/drivers/block/xen-blkfront.c	2010-08-07 14:44:39.564479189 -0400
@@ -417,30 +417,6 @@ static int xlvbd_init_blk_queue(struct g
 	return 0;
 }
 
-
-static int xlvbd_barrier(struct blkfront_info *info)
-{
-	int err;
-	const char *barrier;
-
-	switch (info->feature_barrier) {
-	case QUEUE_ORDERED_DRAIN:	barrier = "enabled (drain)"; break;
-	case QUEUE_ORDERED_TAG:		barrier = "enabled (tag)"; break;
-	case QUEUE_ORDERED_NONE:	barrier = "disabled"; break;
-	default:			return -EINVAL;
-	}
-
-	err = blk_queue_ordered(info->rq, info->feature_barrier);
-
-	if (err)
-		return err;
-
-	printk(KERN_INFO "blkfront: %s: barriers %s\n",
-	       info->gd->disk_name, barrier);
-	return 0;
-}
-
-
 static int xlvbd_alloc_gendisk(blkif_sector_t capacity,
 			       struct blkfront_info *info,
 			       u16 vdisk_info, u16 sector_size)
@@ -516,8 +492,6 @@ static int xlvbd_alloc_gendisk(blkif_sec
 	info->rq = gd->queue;
 	info->gd = gd;
 
-	xlvbd_barrier(info);
-
 	if (vdisk_info & VDISK_READONLY)
 		set_disk_ro(gd, 1);
 
@@ -662,8 +636,6 @@ static irqreturn_t blkif_interrupt(int i
 				printk(KERN_WARNING "blkfront: %s: write barrier op failed\n",
 				       info->gd->disk_name);
 				error = -EOPNOTSUPP;
-				info->feature_barrier = QUEUE_ORDERED_NONE;
-				xlvbd_barrier(info);
 			}
 			/* fall through */
 		case BLKIF_OP_READ:
@@ -1073,24 +1045,6 @@ static void blkfront_connect(struct blkf
 			    "feature-barrier", "%lu", &barrier,
 			    NULL);
 
-	/*
-	 * If there's no "feature-barrier" defined, then it means
-	 * we're dealing with a very old backend which writes
-	 * synchronously; draining will do what needs to get done.
-	 *
-	 * If there are barriers, then we can do full queued writes
-	 * with tagged barriers.
-	 *
-	 * If barriers are not supported, then there's no much we can
-	 * do, so just set ordering to NONE.
-	 */
-	if (err)
-		info->feature_barrier = QUEUE_ORDERED_DRAIN;
-	else if (barrier)
-		info->feature_barrier = QUEUE_ORDERED_TAG;
-	else
-		info->feature_barrier = QUEUE_ORDERED_NONE;
-
 	err = xlvbd_alloc_gendisk(sectors, info, binfo, sector_size);
 	if (err) {
 		xenbus_dev_fatal(info->xbdev, err, "xlvbd_add at %s",
Index: linux-2.6/drivers/ide/ide-disk.c
===================================================================
--- linux-2.6.orig/drivers/ide/ide-disk.c	2010-08-07 12:53:23.889479189 -0400
+++ linux-2.6/drivers/ide/ide-disk.c	2010-08-07 15:00:30.215479189 -0400
@@ -518,12 +518,13 @@ static int ide_do_setfeature(ide_drive_t
 
 static void update_ordered(ide_drive_t *drive)
 {
-	u16 *id = drive->id;
-	unsigned ordered = QUEUE_ORDERED_NONE;
+	unsigned ordered = 0;
 
 	if (drive->dev_flags & IDE_DFLAG_WCACHE) {
+		u16 *id = drive->id;
 		unsigned long long capacity;
 		int barrier;
+
 		/*
 		 * We must avoid issuing commands a drive does not
 		 * understand or we may crash it. We check flush cache
@@ -543,13 +544,18 @@ static void update_ordered(ide_drive_t *
 		       drive->name, barrier ? "" : "not ");
 
 		if (barrier) {
-			ordered = QUEUE_ORDERED_DRAIN_FLUSH;
+			printk(KERN_INFO "%s: cache flushes supported\n",
+				drive->name);
 			blk_queue_prep_rq(drive->queue, idedisk_prep_fn);
+			ordered |= QUEUE_HAS_FLUSH;
+		} else {
+			printk(KERN_INFO
+				"%s: WARNING: cache flushes not supported\n",
+				drive->name);
 		}
-	} else
-		ordered = QUEUE_ORDERED_DRAIN;
+	}
 
-	blk_queue_ordered(drive->queue, ordered);
+	blk_queue_cache_features(drive->queue, ordered);
 }
 
 ide_devset_get_flag(wcache, IDE_DFLAG_WCACHE);
Index: linux-2.6/drivers/md/dm.c
===================================================================
--- linux-2.6.orig/drivers/md/dm.c	2010-08-07 12:53:23.905479189 -0400
+++ linux-2.6/drivers/md/dm.c	2010-08-07 14:51:38.240479189 -0400
@@ -1908,7 +1908,7 @@ static struct mapped_device *alloc_dev(i
 	blk_queue_softirq_done(md->queue, dm_softirq_done);
 	blk_queue_prep_rq(md->queue, dm_prep_fn);
 	blk_queue_lld_busy(md->queue, dm_lld_busy);
-	blk_queue_ordered(md->queue, QUEUE_ORDERED_DRAIN_FLUSH);
+	blk_queue_cache_features(md->queue, QUEUE_HAS_FLUSH);
 
 	md->disk = alloc_disk(1);
 	if (!md->disk)
Index: linux-2.6/drivers/mmc/card/queue.c
===================================================================
--- linux-2.6.orig/drivers/mmc/card/queue.c	2010-08-07 12:53:23.927479189 -0400
+++ linux-2.6/drivers/mmc/card/queue.c	2010-08-07 14:30:09.666479189 -0400
@@ -128,7 +128,6 @@ int mmc_init_queue(struct mmc_queue *mq,
 	mq->req = NULL;
 
 	blk_queue_prep_rq(mq->queue, mmc_prep_request);
-	blk_queue_ordered(mq->queue, QUEUE_ORDERED_DRAIN);
 	queue_flag_set_unlocked(QUEUE_FLAG_NONROT, mq->queue);
 
 #ifdef CONFIG_MMC_BLOCK_BOUNCE
Index: linux-2.6/drivers/s390/block/dasd.c
===================================================================
--- linux-2.6.orig/drivers/s390/block/dasd.c	2010-08-07 12:53:23.939479189 -0400
+++ linux-2.6/drivers/s390/block/dasd.c	2010-08-07 14:30:13.307479189 -0400
@@ -2197,7 +2197,6 @@ static void dasd_setup_queue(struct dasd
 	 */
 	blk_queue_max_segment_size(block->request_queue, PAGE_SIZE);
 	blk_queue_segment_boundary(block->request_queue, PAGE_SIZE - 1);
-	blk_queue_ordered(block->request_queue, QUEUE_ORDERED_DRAIN);
 }
 
 /*
Index: linux-2.6/include/linux/elevator.h
===================================================================
--- linux-2.6.orig/include/linux/elevator.h	2010-08-07 12:53:23.781479189 -0400
+++ linux-2.6/include/linux/elevator.h	2010-08-07 12:53:53.208479190 -0400
@@ -162,9 +162,9 @@ extern struct request *elv_rb_find(struc
  * Insertion selection
  */
 #define ELEVATOR_INSERT_FRONT	1
-#define ELEVATOR_INSERT_BACK	2
-#define ELEVATOR_INSERT_SORT	3
-#define ELEVATOR_INSERT_REQUEUE	4
+#define ELEVATOR_INSERT_ORDERED	2
+#define ELEVATOR_INSERT_BACK	3
+#define ELEVATOR_INSERT_SORT	4
 
 /*
  * return values from elevator_may_queue_fn
Index: linux-2.6/drivers/block/pktcdvd.c
===================================================================
--- linux-2.6.orig/drivers/block/pktcdvd.c	2010-08-07 12:53:23.815479189 -0400
+++ linux-2.6/drivers/block/pktcdvd.c	2010-08-07 12:53:53.211479190 -0400
@@ -753,7 +753,6 @@ static int pkt_generic_packet(struct pkt
 
 	rq->timeout = 60*HZ;
 	rq->cmd_type = REQ_TYPE_BLOCK_PC;
-	rq->cmd_flags |= REQ_HARDBARRIER;
 	if (cgc->quiet)
 		rq->cmd_flags |= REQ_QUIET;
 
Index: linux-2.6/drivers/block/brd.c
===================================================================
--- linux-2.6.orig/drivers/block/brd.c	2010-08-07 12:53:23.825479189 -0400
+++ linux-2.6/drivers/block/brd.c	2010-08-07 14:26:12.293479191 -0400
@@ -482,7 +482,6 @@ static struct brd_device *brd_alloc(int
 	if (!brd->brd_queue)
 		goto out_free_dev;
 	blk_queue_make_request(brd->brd_queue, brd_make_request);
-	blk_queue_ordered(brd->brd_queue, QUEUE_ORDERED_TAG);
 	blk_queue_max_hw_sectors(brd->brd_queue, 1024);
 	blk_queue_bounce_limit(brd->brd_queue, BLK_BOUNCE_ANY);
 
Index: linux-2.6/drivers/block/loop.c
===================================================================
--- linux-2.6.orig/drivers/block/loop.c	2010-08-07 12:53:23.836479189 -0400
+++ linux-2.6/drivers/block/loop.c	2010-08-07 14:51:27.937479189 -0400
@@ -831,8 +831,8 @@ static int loop_set_fd(struct loop_devic
 	lo->lo_queue->queuedata = lo;
 	lo->lo_queue->unplug_fn = loop_unplug;
 
-	if (!(lo_flags & LO_FLAGS_READ_ONLY) && file->f_op->fsync)
-		blk_queue_ordered(lo->lo_queue, QUEUE_ORDERED_DRAIN);
+	/* XXX(hch): loop can't properly deal with flush requests currently */
+//	blk_queue_cache_features(lo->lo_queue, QUEUE_HAS_FLUSH);
 
 	set_capacity(lo->lo_disk, size);
 	bd_set_size(bdev, size << 9);
Index: linux-2.6/drivers/block/osdblk.c
===================================================================
--- linux-2.6.orig/drivers/block/osdblk.c	2010-08-07 12:53:23.843479189 -0400
+++ linux-2.6/drivers/block/osdblk.c	2010-08-07 14:51:30.091479189 -0400
@@ -439,7 +439,7 @@ static int osdblk_init_disk(struct osdbl
 	blk_queue_stack_limits(q, osd_request_queue(osdev->osd));
 
 	blk_queue_prep_rq(q, blk_queue_start_tag);
-	blk_queue_ordered(q, QUEUE_ORDERED_DRAIN_FLUSH);
+	blk_queue_cache_features(q, QUEUE_HAS_FLUSH);
 
 	disk->queue = q;
 
Index: linux-2.6/drivers/block/ps3disk.c
===================================================================
--- linux-2.6.orig/drivers/block/ps3disk.c	2010-08-07 12:53:23.859479189 -0400
+++ linux-2.6/drivers/block/ps3disk.c	2010-08-07 14:51:32.204479189 -0400
@@ -468,7 +468,7 @@ static int __devinit ps3disk_probe(struc
 	blk_queue_dma_alignment(queue, dev->blk_size-1);
 	blk_queue_logical_block_size(queue, dev->blk_size);
 
-	blk_queue_ordered(queue, QUEUE_ORDERED_DRAIN_FLUSH);
+	blk_queue_cache_features(queue, QUEUE_HAS_FLUSH);
 
 	blk_queue_max_segments(queue, -1);
 	blk_queue_max_segment_size(queue, dev->bounce_size);
Index: linux-2.6/include/linux/blk_types.h
===================================================================
--- linux-2.6.orig/include/linux/blk_types.h	2010-08-07 12:53:23.793479189 -0400
+++ linux-2.6/include/linux/blk_types.h	2010-08-07 12:53:53.243479190 -0400
@@ -141,7 +141,6 @@ enum rq_flag_bits {
 	__REQ_FAILED,		/* set if the request failed */
 	__REQ_QUIET,		/* don't worry about errors */
 	__REQ_PREEMPT,		/* set for "ide_preempt" requests */
-	__REQ_ORDERED_COLOR,	/* is before or after barrier */
 	__REQ_ALLOCED,		/* request came from our alloc pool */
 	__REQ_COPY_USER,	/* contains copies of user pages */
 	__REQ_INTEGRITY,	/* integrity metadata has been remapped */
@@ -181,7 +180,6 @@ enum rq_flag_bits {
 #define REQ_FAILED		(1 << __REQ_FAILED)
 #define REQ_QUIET		(1 << __REQ_QUIET)
 #define REQ_PREEMPT		(1 << __REQ_PREEMPT)
-#define REQ_ORDERED_COLOR	(1 << __REQ_ORDERED_COLOR)
 #define REQ_ALLOCED		(1 << __REQ_ALLOCED)
 #define REQ_COPY_USER		(1 << __REQ_COPY_USER)
 #define REQ_INTEGRITY		(1 << __REQ_INTEGRITY)
