[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]

[dm-devel] Re: [kvm-devel] I/O bandwidth control on KVM



Ryo Tsuruta wrote:
Hi,

If you are using virtio drivers in the guest (which I presume you are given the reference to /dev/vda), try using the following -drive syntax:

-drive file=/dev/mapper/ioband1,if=virtio,boot=on,cache=off

This will force the use of O_DIRECT. By default, QEMU does not open with O_DIRECT so you'll see page cache effects.

I tried the test with "cache=off" option, here is the result.

Can you give the attached patch a try? The virtio backend issues synchronous IO requests, blocking the guest from making progress until the IO completes. It's possible that what you're seeing is the scheduler competing with your IO bandwidth limiting in order to ensure fairness: because the IO is blocking, IO completion is intimately tied to CPU consumption.

The attached patch implements AIO support for the virtio backend, so if this is the case, you should see the proper proportions.

Regards,

Anthony Liguori
diff --git a/qemu/hw/virtio-blk.c b/qemu/hw/virtio-blk.c
index 301b5a1..3c56bed 100644
--- a/qemu/hw/virtio-blk.c
+++ b/qemu/hw/virtio-blk.c
@@ -71,59 +71,121 @@ typedef struct VirtIOBlock
     BlockDriverState *bs;
 } VirtIOBlock;
 
+typedef struct VBDMARequestState VBDMARequestState;
+
+typedef struct VBDMAState
+{
+    VirtQueueElement elem;
+    int count;
+    int is_write;
+    unsigned int wlen;
+    VirtQueue *vq;
+    VirtIODevice *vdev;
+    VBDMARequestState *requests;
+} VBDMAState;
+
+struct VBDMARequestState
+{
+    VBDMAState *dma;
+    BlockDriverAIOCB *aiocb;
+    VBDMARequestState *next;
+};
+
 static VirtIOBlock *to_virtio_blk(VirtIODevice *vdev)
 {
     return (VirtIOBlock *)vdev;
 }
 
+static void virtio_io_completion(void *opaque, int ret)
+{
+    VBDMARequestState *req = opaque, **ppreq;
+    VBDMAState *dma = req->dma;
+    struct virtio_blk_inhdr *in;
+
+    for (ppreq = &dma->requests; *ppreq; ppreq = &(*ppreq)->next) {
+	if (*ppreq == req) { 
+	    *ppreq = req->next;
+	    break;
+	}
+    }
+
+    qemu_free(req);
+
+    if (dma->requests)
+	return;
+
+    in = (void *)dma->elem.in_sg[dma->elem.in_num - 1].iov_base;
+    dma->wlen += sizeof(*in);
+    if (ret == -EOPNOTSUPP)
+	in->status = VIRTIO_BLK_S_UNSUPP;
+    else
+	in->status = VIRTIO_BLK_S_OK;
+    virtqueue_push(dma->vq, &dma->elem, dma->wlen);
+    virtio_notify(dma->vdev, dma->vq);
+    qemu_free(dma);
+}
+
 static void virtio_blk_handle_output(VirtIODevice *vdev, VirtQueue *vq)
 {
     VirtIOBlock *s = to_virtio_blk(vdev);
-    VirtQueueElement elem;
+    VBDMAState *dma = qemu_mallocz(sizeof(VBDMAState));
     unsigned int count;
 
-    while ((count = virtqueue_pop(vq, &elem)) != 0) {
-	struct virtio_blk_inhdr *in;
+    while ((count = virtqueue_pop(vq, &dma->elem)) != 0) {
 	struct virtio_blk_outhdr *out;
-	unsigned int wlen;
+	VBDMARequestState *req;
 	off_t off;
 	int i;
 
-	out = (void *)elem.out_sg[0].iov_base;
-	in = (void *)elem.in_sg[elem.in_num - 1].iov_base;
+	out = (void *)dma->elem.out_sg[0].iov_base;
 	off = out->sector;
 
+	dma->vq = vq;
+	dma->vdev = vdev;
+
 	if (out->type & VIRTIO_BLK_T_SCSI_CMD) {
-	    wlen = sizeof(*in);
-	    in->status = VIRTIO_BLK_S_UNSUPP;
+	    req = qemu_mallocz(sizeof(VBDMARequestState));
+	    req->dma = dma;
+	    req->next = dma->requests;
+	    dma->requests = req;
+	    virtio_io_completion(req, -EOPNOTSUPP);
 	} else if (out->type & VIRTIO_BLK_T_OUT) {
-	    wlen = sizeof(*in);
-
-	    for (i = 1; i < elem.out_num; i++) {
-		bdrv_write(s->bs, off,
-			   elem.out_sg[i].iov_base,
-			   elem.out_sg[i].iov_len / 512);
-		off += elem.out_sg[i].iov_len / 512;
+	    dma->count = dma->elem.out_num - 1;
+	    dma->is_write = 1;
+	    for (i = 1; i < dma->elem.out_num; i++) {
+		req = qemu_mallocz(sizeof(VBDMARequestState));
+		req->dma = dma;
+		req->next = dma->requests;
+		dma->requests = req;
+
+		req->aiocb = bdrv_aio_write(s->bs, off,
+					    dma->elem.out_sg[i].iov_base,
+					    dma->elem.out_sg[i].iov_len / 512,
+					    virtio_io_completion, req);
+		off += dma->elem.out_sg[i].iov_len / 512;
 	    }
-
-	    in->status = VIRTIO_BLK_S_OK;
 	} else {
-	    wlen = sizeof(*in);
-
-	    for (i = 0; i < elem.in_num - 1; i++) {
-		bdrv_read(s->bs, off,
-			  elem.in_sg[i].iov_base,
-			  elem.in_sg[i].iov_len / 512);
-		off += elem.in_sg[i].iov_len / 512;
-		wlen += elem.in_sg[i].iov_len;
+	    dma->count = dma->elem.in_num - 1;
+	    dma->is_write = 0;
+	    for (i = 0; i < dma->elem.in_num - 1; i++) {
+		req = qemu_mallocz(sizeof(VBDMARequestState));
+		req->dma = dma;
+		req->next = dma->requests;
+		dma->requests = req;
+
+		req->aiocb = bdrv_aio_read(s->bs, off,
+					   dma->elem.in_sg[i].iov_base,
+					   dma->elem.in_sg[i].iov_len / 512,
+					   virtio_io_completion, req);
+		off += dma->elem.in_sg[i].iov_len / 512;
+		dma->wlen += dma->elem.in_sg[i].iov_len;
 	    }
-
-	    in->status = VIRTIO_BLK_S_OK;
 	}
 
-	virtqueue_push(vq, &elem, wlen);
-	virtio_notify(vdev, vq);
+	dma = qemu_mallocz(sizeof(VBDMAState));
     }
+
+    qemu_free(dma);
 }
 
 static void virtio_blk_update_config(VirtIODevice *vdev, uint8_t *config)

[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]