[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]

[libvirt] [PATCH v3 12/14] qemu: Handle post-copy migration failures



When migration fails in post-copy mode, it's impossible to just kill
the destination domain and resume the source, since the source no longer
contains the current guest state. Let's mark domains on both sides as
VIR_DOMAIN_PAUSED_POSTCOPY_FAILED to let the upper layer decide what to
do with them.

Signed-off-by: Jiri Denemark <jdenemar redhat com>
---

Notes:
    Version 3:
    - send VIR_DOMAIN_EVENT_SUSPENDED_POSTCOPY_FAILED event only on the
      destination host
    
    Version 2:
    - no change

 src/qemu/qemu_migration.c | 91 ++++++++++++++++++++++++++++++++++++++---------
 src/qemu/qemu_migration.h |  3 ++
 src/qemu/qemu_process.c   | 59 ++++++++++++++++++++++--------
 3 files changed, 122 insertions(+), 31 deletions(-)

diff --git a/src/qemu/qemu_migration.c b/src/qemu/qemu_migration.c
index 202e955..d67eca8 100644
--- a/src/qemu/qemu_migration.c
+++ b/src/qemu/qemu_migration.c
@@ -1475,14 +1475,21 @@ qemuMigrationRestoreDomainState(virConnectPtr conn, virDomainObjPtr vm)
 {
     virQEMUDriverPtr driver = conn->privateData;
     qemuDomainObjPrivatePtr priv = vm->privateData;
-    int state = virDomainObjGetState(vm, NULL);
+    int reason;
+    virDomainState state = virDomainObjGetState(vm, &reason);
     bool ret = false;
 
-    VIR_DEBUG("driver=%p, vm=%p, pre-mig-state=%d, state=%d",
-              driver, vm, priv->preMigrationState, state);
+    VIR_DEBUG("driver=%p, vm=%p, pre-mig-state=%s, state=%s, reason=%s",
+              driver, vm,
+              virDomainStateTypeToString(priv->preMigrationState),
+              virDomainStateTypeToString(state),
+              virDomainStateReasonToString(state, reason));
 
-    if (state == VIR_DOMAIN_PAUSED &&
-        priv->preMigrationState == VIR_DOMAIN_RUNNING) {
+    if (state != VIR_DOMAIN_PAUSED ||
+        reason == VIR_DOMAIN_PAUSED_POSTCOPY_FAILED)
+        goto cleanup;
+
+    if (priv->preMigrationState == VIR_DOMAIN_RUNNING) {
         /* This is basically the only restore possibility that's safe
          * and we should attempt to do */
 
@@ -2364,6 +2371,48 @@ qemuMigrationSetOffline(virQEMUDriverPtr driver,
     return ret;
 }
 
+
+void
+qemuMigrationPostcopyFailed(virQEMUDriverPtr driver,
+                            virDomainObjPtr vm)
+{
+    virDomainState state;
+    int reason;
+
+    state = virDomainObjGetState(vm, &reason);
+
+    if (state != VIR_DOMAIN_PAUSED &&
+        state != VIR_DOMAIN_RUNNING)
+        return;
+
+    if (state == VIR_DOMAIN_PAUSED &&
+        reason == VIR_DOMAIN_PAUSED_POSTCOPY_FAILED)
+        return;
+
+    VIR_WARN("Migration of domain %s failed during post-copy; "
+             "leaving the domain paused", vm->def->name);
+
+    if (state == VIR_DOMAIN_RUNNING) {
+        virObjectEventPtr event;
+
+        if (qemuProcessStopCPUs(driver, vm,
+                                VIR_DOMAIN_PAUSED_POSTCOPY_FAILED,
+                                QEMU_ASYNC_JOB_MIGRATION_IN) < 0) {
+            VIR_WARN("Unable to pause guest CPUs for %s", vm->def->name);
+            return;
+        }
+
+        event = virDomainEventLifecycleNewFromObj(vm,
+                                VIR_DOMAIN_EVENT_SUSPENDED,
+                                VIR_DOMAIN_EVENT_SUSPENDED_POSTCOPY_FAILED);
+        qemuDomainEventQueue(driver, event);
+    } else {
+        virDomainObjSetState(vm, VIR_DOMAIN_PAUSED,
+                             VIR_DOMAIN_PAUSED_POSTCOPY_FAILED);
+    }
+}
+
+
 static int
 qemuMigrationSetOption(virQEMUDriverPtr driver,
                        virDomainObjPtr vm,
@@ -4015,8 +4064,8 @@ qemuMigrationConfirmPhase(virQEMUDriverPtr driver,
     if (flags & VIR_MIGRATE_OFFLINE)
         goto done;
 
-    /* Did the migration go as planned?  If yes, kill off the
-     * domain object, but if no, resume CPUs
+    /* Did the migration go as planned?  If yes, kill off the domain object.
+     * If something failed, resume CPUs, but only if we didn't use post-copy.
      */
     if (retcode == 0) {
         /* If guest uses SPICE and supports seamless migration we have to hold
@@ -4035,6 +4084,7 @@ qemuMigrationConfirmPhase(virQEMUDriverPtr driver,
         qemuDomainEventEmitJobCompleted(driver, vm);
     } else {
         virErrorPtr orig_err = virSaveLastError();
+        int reason;
 
         /* cancel any outstanding NBD jobs */
         qemuMigrationCancelDriveMirror(driver, vm, false,
@@ -4043,7 +4093,10 @@ qemuMigrationConfirmPhase(virQEMUDriverPtr driver,
         virSetError(orig_err);
         virFreeError(orig_err);
 
-        if (qemuMigrationRestoreDomainState(conn, vm)) {
+        if (virDomainObjGetState(vm, &reason) == VIR_DOMAIN_PAUSED &&
+            reason == VIR_DOMAIN_PAUSED_POSTCOPY) {
+            qemuMigrationPostcopyFailed(driver, vm);
+        } else if (qemuMigrationRestoreDomainState(conn, vm)) {
             event = virDomainEventLifecycleNewFromObj(vm,
                                                       VIR_DOMAIN_EVENT_RESUMED,
                                                       VIR_DOMAIN_EVENT_RESUMED_MIGRATED);
@@ -5860,6 +5913,7 @@ qemuMigrationFinish(virQEMUDriverPtr driver,
     int rc;
     qemuDomainJobInfoPtr jobInfo = NULL;
     bool inPostCopy = false;
+    bool kill = true;
 
     VIR_DEBUG("driver=%p, dconn=%p, vm=%p, cookiein=%s, cookieinlen=%d, "
               "cookieout=%p, cookieoutlen=%p, flags=%lx, retcode=%d",
@@ -6007,6 +6061,7 @@ qemuMigrationFinish(virQEMUDriverPtr driver,
         }
 
         if (inPostCopy) {
+            kill = false;
             event = virDomainEventLifecycleNewFromObj(vm,
                                         VIR_DOMAIN_EVENT_RESUMED,
                                         VIR_DOMAIN_EVENT_RESUMED_POSTCOPY);
@@ -6066,14 +6121,18 @@ qemuMigrationFinish(virQEMUDriverPtr driver,
     if (!dom &&
         !(flags & VIR_MIGRATE_OFFLINE) &&
         virDomainObjIsActive(vm)) {
-        qemuProcessStop(driver, vm, VIR_DOMAIN_SHUTOFF_FAILED,
-                        QEMU_ASYNC_JOB_MIGRATION_IN,
-                        VIR_QEMU_PROCESS_STOP_MIGRATED);
-        virDomainAuditStop(vm, "failed");
-        event = virDomainEventLifecycleNewFromObj(vm,
-                                                  VIR_DOMAIN_EVENT_STOPPED,
-                                                  VIR_DOMAIN_EVENT_STOPPED_FAILED);
-        qemuDomainEventQueue(driver, event);
+        if (kill) {
+            qemuProcessStop(driver, vm, VIR_DOMAIN_SHUTOFF_FAILED,
+                            QEMU_ASYNC_JOB_MIGRATION_IN,
+                            VIR_QEMU_PROCESS_STOP_MIGRATED);
+            virDomainAuditStop(vm, "failed");
+            event = virDomainEventLifecycleNewFromObj(vm,
+                                VIR_DOMAIN_EVENT_STOPPED,
+                                VIR_DOMAIN_EVENT_STOPPED_FAILED);
+            qemuDomainEventQueue(driver, event);
+        } else {
+            qemuMigrationPostcopyFailed(driver, vm);
+        }
     }
 
     if (dom) {
diff --git a/src/qemu/qemu_migration.h b/src/qemu/qemu_migration.h
index 953f96b..6ea4424 100644
--- a/src/qemu/qemu_migration.h
+++ b/src/qemu/qemu_migration.h
@@ -210,4 +210,7 @@ int qemuMigrationRunIncoming(virQEMUDriverPtr driver,
                              const char *uri,
                              qemuDomainAsyncJob asyncJob);
 
+void qemuMigrationPostcopyFailed(virQEMUDriverPtr driver,
+                                 virDomainObjPtr vm);
+
 #endif /* __QEMU_MIGRATION_H__ */
diff --git a/src/qemu/qemu_process.c b/src/qemu/qemu_process.c
index f19a20c..1854132 100644
--- a/src/qemu/qemu_process.c
+++ b/src/qemu/qemu_process.c
@@ -3139,8 +3139,13 @@ qemuProcessRecoverMigrationIn(virQEMUDriverPtr driver,
                               virConnectPtr conn,
                               qemuMigrationJobPhase phase,
                               virDomainState state,
-                              int reason ATTRIBUTE_UNUSED)
+                              int reason)
 {
+    bool postcopy = (state == VIR_DOMAIN_PAUSED &&
+                     reason == VIR_DOMAIN_PAUSED_POSTCOPY_FAILED) ||
+                    (state == VIR_DOMAIN_RUNNING &&
+                     reason == VIR_DOMAIN_RUNNING_POSTCOPY);
+
     switch (phase) {
     case QEMU_MIGRATION_PHASE_NONE:
     case QEMU_MIGRATION_PHASE_PERFORM2:
@@ -3173,8 +3178,10 @@ qemuProcessRecoverMigrationIn(virQEMUDriverPtr driver,
     case QEMU_MIGRATION_PHASE_FINISH3:
         /* migration finished, we started resuming the domain but didn't
          * confirm success or failure yet; killing it seems safest unless
-         * we already started guest CPUs */
-        if (state != VIR_DOMAIN_RUNNING) {
+         * we already started guest CPUs or we were in post-copy mode */
+        if (postcopy) {
+            qemuMigrationPostcopyFailed(driver, vm);
+        } else if (state != VIR_DOMAIN_RUNNING) {
             VIR_DEBUG("Killing migrated domain %s", vm->def->name);
             return -1;
         }
@@ -3192,6 +3199,10 @@ qemuProcessRecoverMigrationOut(virQEMUDriverPtr driver,
                                virDomainState state,
                                int reason)
 {
+    bool postcopy = state == VIR_DOMAIN_PAUSED &&
+                    (reason == VIR_DOMAIN_PAUSED_POSTCOPY ||
+                     reason == VIR_DOMAIN_PAUSED_POSTCOPY_FAILED);
+
     switch (phase) {
     case QEMU_MIGRATION_PHASE_NONE:
     case QEMU_MIGRATION_PHASE_PREPARE:
@@ -3209,26 +3220,44 @@ qemuProcessRecoverMigrationOut(virQEMUDriverPtr driver,
     case QEMU_MIGRATION_PHASE_PERFORM2:
     case QEMU_MIGRATION_PHASE_PERFORM3:
         /* migration is still in progress, let's cancel it and resume the
-         * domain */
-        VIR_DEBUG("Cancelling unfinished migration of domain %s",
-                  vm->def->name);
-        if (qemuMigrationCancel(driver, vm) < 0) {
-            VIR_WARN("Could not cancel ongoing migration of domain %s",
-                     vm->def->name);
+         * domain; however we can only do that before migration enters
+         * post-copy mode
+         */
+        if (postcopy) {
+            qemuMigrationPostcopyFailed(driver, vm);
+        } else {
+            VIR_DEBUG("Cancelling unfinished migration of domain %s",
+                      vm->def->name);
+            if (qemuMigrationCancel(driver, vm) < 0) {
+                VIR_WARN("Could not cancel ongoing migration of domain %s",
+                         vm->def->name);
+            }
+            goto resume;
         }
-        goto resume;
+        break;
 
     case QEMU_MIGRATION_PHASE_PERFORM3_DONE:
         /* migration finished but we didn't have a chance to get the result
-         * of Finish3 step; third party needs to check what to do next
+         * of Finish3 step; third party needs to check what to do next; in
+         * post-copy mode we can use PAUSED_POSTCOPY_FAILED state for this
          */
+        if (postcopy)
+            qemuMigrationPostcopyFailed(driver, vm);
         break;
 
     case QEMU_MIGRATION_PHASE_CONFIRM3_CANCELLED:
-        /* Finish3 failed, we need to resume the domain */
-        VIR_DEBUG("Resuming domain %s after failed migration",
-                  vm->def->name);
-        goto resume;
+        /* Finish3 failed, we need to resume the domain, but once we enter
+         * post-copy mode there's no way back, so let's just mark the domain
+         * as broken in that case
+         */
+        if (postcopy) {
+            qemuMigrationPostcopyFailed(driver, vm);
+        } else {
+            VIR_DEBUG("Resuming domain %s after failed migration",
+                      vm->def->name);
+            goto resume;
+        }
+        break;
 
     case QEMU_MIGRATION_PHASE_CONFIRM3:
         /* migration completed, we need to kill the domain here */
-- 
2.7.2


[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]