[libvirt] [PATCH] qemu: allow blkstat/blkinfo calls during migration

Eric Blake eblake at redhat.com
Tue May 10 20:40:15 UTC 2011


On 05/10/2011 09:57 AM, Federico Simoncelli wrote:
> References:
>  - https://www.redhat.com/archives/libvir-list/2011-May/msg00210.html
>  - https://www.redhat.com/archives/libvir-list/2011-May/msg00287.html

Can you also provide a summary of these messages, as well as a summary
of your code changes, in your commit message, so that future 'git log'
readers won't have to fire up a browser?  Something like:

Add some additional job signal flags for doing blkstat/blkinfo during a
migration.  Add a condition variable that can be used to efficiently
wait for the migration code to clear the signal flag.

> 
> ---
>  src/qemu/qemu_domain.c    |    6 +++
>  src/qemu/qemu_domain.h    |    7 ++++
>  src/qemu/qemu_driver.c    |   86 +++++++++++++++++++++++++++++++--------------
>  src/qemu/qemu_migration.c |   31 ++++++++++++++++
>  4 files changed, 103 insertions(+), 27 deletions(-)

You've previously contributed under a gmail account; any preference
which of the two accounts should be listed in AUTHORS?

> 
> diff --git a/src/qemu/qemu_domain.c b/src/qemu/qemu_domain.c
> index c61f9bf..d4e53c4 100644
> --- a/src/qemu/qemu_domain.c
> +++ b/src/qemu/qemu_domain.c
> @@ -526,6 +526,12 @@ int qemuDomainObjBeginJob(virDomainObjPtr obj)
>      priv->jobStart = timeval_to_ms(now);
>      memset(&priv->jobInfo, 0, sizeof(priv->jobInfo));
>  
> +    if (virCondInit(&priv->signalCond) < 0) {
> +        virReportSystemError(errno,
> +                             "%s", _("cannot initialize signal condition"));
> +        return -1;
> +    }
> +

This attempts to initialize the condition variable on every call to
qemuDomainObjBeginJob, which is undefined behavior after the first
initialization.  Also, I don't see any counterpart virCondDestroy in
your patch, which is a resource leak.  I think you meant to add the init
in qemuDomainObjPrivateAlloc, and a counterpart destroy in
qemuDomainObjPrivateFree.

EEK! When did we ever initialize priv->jobCond?  I'm impressed that the
qemu driver even works in the first place, since it is calling
virCondWaitUntil on an uninitialized condition!  [That is, we're
implicitly relying on PTHREAD_COND_INITIALIZER just _happening_ to be an
all-zeros initialization in Linux, where qemuDomainObjPrivateAlloc
zero-initialized the condition variable, but relying on this implicit
behavior violates POSIX]

>      return 0;
>  }
>  
> diff --git a/src/qemu/qemu_domain.h b/src/qemu/qemu_domain.h
> index 6d24f53..b82986c 100644
> --- a/src/qemu/qemu_domain.h
> +++ b/src/qemu/qemu_domain.h
> @@ -47,11 +47,17 @@ enum qemuDomainJobSignals {
>      QEMU_JOB_SIGNAL_SUSPEND = 1 << 1, /* Request VM suspend to finish live migration offline */
>      QEMU_JOB_SIGNAL_MIGRATE_DOWNTIME = 1 << 2, /* Request migration downtime change */
>      QEMU_JOB_SIGNAL_MIGRATE_SPEED = 1 << 3, /* Request migration speed change */
> +    QEMU_JOB_SIGNAL_BLKSTAT = 1 << 4, /* Request blkstat during migration */
> +    QEMU_JOB_SIGNAL_BLKINFO = 1 << 5, /* Request blkinfo during migration */
>  };
>  
>  struct qemuDomainJobSignalsData {
>      unsigned long long migrateDowntime; /* Data for QEMU_JOB_SIGNAL_MIGRATE_DOWNTIME */
>      unsigned long migrateBandwidth; /* Data for QEMU_JOB_SIGNAL_MIGRATE_SPEED */
> +    char *devname; /* Device name used by blkstat/blkinfo calls */
> +    int returnCode; /* Return code for the blkstat/blkinfo calls */
> +    virDomainBlockStatsPtr blockStat; /* Block statistics for QEMU_JOB_SIGNAL_BLKSTAT */
> +    virDomainBlockInfoPtr blockInfo; /* Block information for QEMU_JOB_SIGNAL_BLKINFO */

What happens if both APIs are called in the window before either can be
acted on?  Shouldn't you have two returnCode fields, one for each
QEMU_JOB_SIGNAL_BLK* bit, so they aren't stomping on each other?

>  };
>  
>  typedef struct _qemuDomainPCIAddressSet qemuDomainPCIAddressSet;
> @@ -61,6 +67,7 @@ typedef struct _qemuDomainObjPrivate qemuDomainObjPrivate;
>  typedef qemuDomainObjPrivate *qemuDomainObjPrivatePtr;
>  struct _qemuDomainObjPrivate {
>      virCond jobCond; /* Use in conjunction with main virDomainObjPtr lock */
> +    virCond signalCond; /* Use in conjunction with main virDomainObjPtr lock */

Comment is a bit misleading; it is used to coordinate between migration
and safe query APIs.

>      enum qemuDomainJob jobActive;   /* Currently running job */
>      unsigned int jobSignals;        /* Signals for running job */
>      struct qemuDomainJobSignalsData jobSignalsData; /* Signal specific data */
> diff --git a/src/qemu/qemu_driver.c b/src/qemu/qemu_driver.c
> index 0fd0f10..f9f5e83 100644
> --- a/src/qemu/qemu_driver.c
> +++ b/src/qemu/qemu_driver.c
> @@ -5031,13 +5031,10 @@ qemudDomainBlockStats (virDomainPtr dom,
>          goto cleanup;
>      }
>  
> -    if (qemuDomainObjBeginJob(vm) < 0)
> -        goto cleanup;
> -
>      if (!virDomainObjIsActive(vm)) {
>          qemuReportError(VIR_ERR_OPERATION_INVALID,
>                          "%s", _("domain is not running"));
> -        goto endjob;
> +        goto cleanup;
>      }
>  
>      for (i = 0 ; i < vm->def->ndisks ; i++) {
> @@ -5050,29 +5047,48 @@ qemudDomainBlockStats (virDomainPtr dom,
>      if (!disk) {
>          qemuReportError(VIR_ERR_INVALID_ARG,
>                          _("invalid path: %s"), path);
> -        goto endjob;
> +        goto cleanup;
>      }
>  
>      if (!disk->info.alias) {
>          qemuReportError(VIR_ERR_INTERNAL_ERROR,
>                          _("missing disk device alias name for %s"), disk->dst);
> -        goto endjob;
> +        goto cleanup;
>      }
>  
>      priv = vm->privateData;
> -    qemuDomainObjEnterMonitor(vm);
> -    ret = qemuMonitorGetBlockStatsInfo(priv->mon,
> -                                       disk->info.alias,
> -                                       &stats->rd_req,
> -                                       &stats->rd_bytes,
> -                                       &stats->wr_req,
> -                                       &stats->wr_bytes,
> -                                       &stats->errs);
> -    qemuDomainObjExitMonitor(vm);
> +    if ((priv->jobActive == QEMU_JOB_MIGRATION_OUT)
> +            || (priv->jobActive == QEMU_JOB_SAVE)) {

Odd indentation.

>  
> -endjob:
> -    if (qemuDomainObjEndJob(vm) == 0)
> -        vm = NULL;
> +        while (priv->jobSignals)
> +            ignore_value(virCondWait(&priv->signalCond, &vm->lock));

Per src/qemu/THREADS.txt, we need to increase the reference count prior
to virCondWait, and decrement it after we read
jobSignalsData.returnCode.  Otherwise, because the condition value is
dropping the vm lock, it is possible that the vm could go away in the
meantime, and without holding an extra reference to the vm, we risk
trying to regain vm->lock after it has been garbage collected by another
thread.  The qemuDomainObjBeginJob for the non-migration case already
took care of that for us.

> +
> +        priv->jobSignals |= QEMU_JOB_SIGNAL_BLKSTAT;
> +        priv->jobSignalsData.devname = disk->info.alias;
> +        priv->jobSignalsData.blockStat = stats;
> +        priv->jobSignalsData.returnCode = -1;
> +
> +        while (priv->jobSignals & QEMU_JOB_SIGNAL_BLKSTAT)
> +            ignore_value(virCondWait(&priv->signalCond, &vm->lock));
> +
> +        ret = priv->jobSignalsData.returnCode;
> +    } else {
> +        if (qemuDomainObjBeginJob(vm) < 0)
> +            goto cleanup;
> +
> +        qemuDomainObjEnterMonitor(vm);

Missing a check that the vm is still active (it could have died in the
time it took to get the job condition).

> +        ret = qemuMonitorGetBlockStatsInfo(priv->mon,
> +                                           disk->info.alias,
> +                                           &stats->rd_req,
> +                                           &stats->rd_bytes,
> +                                           &stats->wr_req,
> +                                           &stats->wr_bytes,
> +                                           &stats->errs);
> +        qemuDomainObjExitMonitor(vm);
> +
> +        if (qemuDomainObjEndJob(vm) == 0)
> +            vm = NULL;
> +    }
>  
>  cleanup:
>      if (vm)
> @@ -5473,23 +5489,39 @@ static int qemuDomainGetBlockInfo(virDomainPtr dom,
>         disk format and on a block device, then query
>         highest allocated extent from QEMU */
>      if (disk->type == VIR_DOMAIN_DISK_TYPE_BLOCK &&
> -        format != VIR_STORAGE_FILE_RAW &&
> -        S_ISBLK(sb.st_mode)) {
> +            format != VIR_STORAGE_FILE_RAW && S_ISBLK(sb.st_mode)) {

Spurious whitespace change.

>          qemuDomainObjPrivatePtr priv = vm->privateData;
> -        if (qemuDomainObjBeginJob(vm) < 0)
> -            goto cleanup;
> -        if (!virDomainObjIsActive(vm))
> +
> +        if (!virDomainObjIsActive(vm)) {
>              ret = 0;
> -        else {
> +        } else if ((priv->jobActive == QEMU_JOB_MIGRATION_OUT)
> +                    || (priv->jobActive == QEMU_JOB_SAVE)) {

Inconsistent indentation.

> +
> +            while (priv->jobSignals)
> +                ignore_value(virCondWait(&priv->signalCond, &vm->lock));

Same problem with needing to temporarily increase vm's reference count.

> +
> +            priv->jobSignals |= QEMU_JOB_SIGNAL_BLKINFO;
> +            priv->jobSignalsData.devname = disk->info.alias;
> +            priv->jobSignalsData.blockInfo = info;
> +            priv->jobSignalsData.returnCode = -1;
> +
> +            while (priv->jobSignals & QEMU_JOB_SIGNAL_BLKINFO)
> +                ignore_value(virCondWait(&priv->signalCond, &vm->lock));
> +
> +            ret = priv->jobSignalsData.returnCode;

Yep, I've definitely convinced myself that you need two returnCode slots.

Thread 1 starts a migration.
Thread 2 queries BlockStats, waits for the condition variable and
priv->jobSignals to be 0.
Thread 2 sets priv->jobSignals to non-zero, then waits for condition
variable and priv->jobSignals to clear BLKSTATS.
Thread 3 queries BlockInfo.
Thread 1 processes the BlockStats query, and clears priv->jobSignals
Thread 3 then gets condition lock instead of thread 2, and requests blkinfo
Thread 1 then gets condition lock instead of thread 2, and nukes the
returnCode
Thread 2 finally gets condition lock, but sees the answer intended for
thread 3.

> +        } else {
> +            if (qemuDomainObjBeginJob(vm) < 0)
> +                goto cleanup;
> +

Missing a check if vm is still active.

>              qemuDomainObjEnterMonitor(vm);
>              ret = qemuMonitorGetBlockExtent(priv->mon,
>                                              disk->info.alias,
>                                              &info->allocation);
>              qemuDomainObjExitMonitor(vm);
> -        }
>  
> -        if (qemuDomainObjEndJob(vm) == 0)
> -            vm = NULL;
> +            if (qemuDomainObjEndJob(vm) == 0)
> +                vm = NULL;
> +        }
>      } else {
>          ret = 0;
>      }
> diff --git a/src/qemu/qemu_migration.c b/src/qemu/qemu_migration.c
> index 6738a53..54e41f9 100644
> --- a/src/qemu/qemu_migration.c
> +++ b/src/qemu/qemu_migration.c
> @@ -156,6 +156,34 @@ qemuMigrationWaitForCompletion(struct qemud_driver *driver, virDomainObjPtr vm)
>              qemuDomainObjExitMonitorWithDriver(driver, vm);
>              if (rc < 0)
>                  VIR_WARN0("Unable to set migration speed");
> +        } else if (priv->jobSignals & QEMU_JOB_SIGNAL_BLKSTAT) {
> +            qemuDomainObjEnterMonitorWithDriver(driver, vm);
> +            rc = qemuMonitorGetBlockStatsInfo(priv->mon,
> +                                  priv->jobSignalsData.devname,
> +                                  &priv->jobSignalsData.blockStat->rd_req,
> +                                  &priv->jobSignalsData.blockStat->rd_bytes,
> +                                  &priv->jobSignalsData.blockStat->wr_req,
> +                                  &priv->jobSignalsData.blockStat->wr_bytes,
> +                                  &priv->jobSignalsData.blockStat->errs);
> +            qemuDomainObjExitMonitorWithDriver(driver, vm);
> +
> +            priv->jobSignalsData.returnCode = rc;
> +            priv->jobSignals ^= QEMU_JOB_SIGNAL_BLKSTAT;
> +
> +            if (rc < 0)
> +                VIR_WARN0("Unable to get block statistics");
> +        } else if (priv->jobSignals & QEMU_JOB_SIGNAL_BLKINFO) {

Pre-existing, but I think we should process all signals on a single
pass, rather than just the first signal we encounter (which then delays
the remaining signals for the next sleep and time around the loop).

> +            qemuDomainObjEnterMonitorWithDriver(driver, vm);
> +            rc = qemuMonitorGetBlockExtent(priv->mon,
> +                               priv->jobSignalsData.devname,
> +                               &priv->jobSignalsData.blockInfo->allocation);
> +            qemuDomainObjExitMonitorWithDriver(driver, vm);
> +
> +            priv->jobSignalsData.returnCode = rc;
> +            priv->jobSignals ^= QEMU_JOB_SIGNAL_BLKINFO;
> +
> +            if (rc < 0)
> +                VIR_WARN0("Unable to get block information");
>          }
>  
>          /* Repeat check because the job signals might have caused
> @@ -223,6 +251,8 @@ qemuMigrationWaitForCompletion(struct qemud_driver *driver, virDomainObjPtr vm)
>              break;
>          }
>  
> +        virCondSignal(&priv->signalCond);
> +
>          virDomainObjUnlock(vm);
>          qemuDriverUnlock(driver);
>  
> @@ -233,6 +263,7 @@ qemuMigrationWaitForCompletion(struct qemud_driver *driver, virDomainObjPtr vm)
>      }
>  
>  cleanup:
> +    virCondBroadcast(&priv->signalCond);
>      return ret;
>  }
>  

Overall, I like the idea, but we need a v2.  Do you need me to help?

-- 
Eric Blake   eblake at redhat.com    +1-801-349-2682
Libvirt virtualization library http://libvirt.org

-------------- next part --------------
A non-text attachment was scrubbed...
Name: signature.asc
Type: application/pgp-signature
Size: 619 bytes
Desc: OpenPGP digital signature
URL: <http://listman.redhat.com/archives/libvir-list/attachments/20110510/757fbd20/attachment-0001.sig>


More information about the libvir-list mailing list