[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]

[libvirt] [PATCH] qemu: Fix race between device rebind and kvm cleanup



Certain hypervisors (like qemu/kvm) map the PCI bar(s) on
the host when doing device passthrough.  This can lead to a race
condition where the hypervisor is still cleaning up the device while
libvirt is trying to re-attach it to the host device driver.  To avoid
this situation, we look through /proc/iomem, and if the hypervisor is
still holding onto the bar (denoted by the string in the matcher variable),
then we can wait around a bit for that to clear up.

Signed-off-by: Chris Lalancette <clalance redhat com>
---
 src/libvirt_private.syms |    1 +
 src/qemu/qemu_driver.c   |   19 ++++++---
 src/util/pci.c           |   97 ++++++++++++++++++++++++++++++++++++++++++++++
 src/util/pci.h           |    1 +
 4 files changed, 112 insertions(+), 6 deletions(-)

diff --git a/src/libvirt_private.syms b/src/libvirt_private.syms
index c912d81..e42c090 100644
--- a/src/libvirt_private.syms
+++ b/src/libvirt_private.syms
@@ -439,6 +439,7 @@ pciGetDevice;
 pciFreeDevice;
 pciDettachDevice;
 pciReAttachDevice;
+pciWaitForDeviceCleanup;
 pciResetDevice;
 pciDeviceSetManaged;
 pciDeviceGetManaged;
diff --git a/src/qemu/qemu_driver.c b/src/qemu/qemu_driver.c
index e14ed8f..cde0034 100644
--- a/src/qemu/qemu_driver.c
+++ b/src/qemu/qemu_driver.c
@@ -2244,6 +2244,7 @@ qemuDomainReAttachHostDevices(virConnectPtr conn,
 {
     pciDeviceList *pcidevs;
     int i;
+    int retries = 100;
 
     if (!def->nhostdevs)
         return;
@@ -2277,12 +2278,18 @@ qemuDomainReAttachHostDevices(virConnectPtr conn,
 
     for (i = 0; i < pciDeviceListCount(pcidevs); i++) {
         pciDevice *dev = pciDeviceListGet(pcidevs, i);
-        if (pciDeviceGetManaged(dev) &&
-            pciReAttachDevice(NULL, dev) < 0) {
-            virErrorPtr err = virGetLastError();
-            VIR_ERROR(_("Failed to re-attach PCI device: %s"),
-                      err ? err->message : "");
-            virResetError(err);
+        if (pciDeviceGetManaged(dev)) {
+            while (pciWaitForDeviceCleanup(dev, "kvm_assigned_device")
+                   && retries) {
+                usleep(100*1000);
+                retries--;
+            }
+            if (pciReAttachDevice(NULL, dev) < 0) {
+                virErrorPtr err = virGetLastError();
+                VIR_ERROR(_("Failed to re-attach PCI device: %s"),
+                          err ? err->message : "");
+                virResetError(err);
+            }
         }
     }
 
diff --git a/src/util/pci.c b/src/util/pci.c
index 0defbfe..09535bd 100644
--- a/src/util/pci.c
+++ b/src/util/pci.c
@@ -883,6 +883,103 @@ pciReAttachDevice(virConnectPtr conn, pciDevice *dev)
     return pciUnBindDeviceFromStub(conn, dev, driver);
 }
 
+/* Certain hypervisors (like qemu/kvm) map the PCI bar(s) on
+ * the host when doing device passthrough.  This can lead to a race
+ * condition where the hypervisor is still cleaning up the device while
+ * libvirt is trying to re-attach it to the host device driver.  To avoid
+ * this situation, we look through /proc/iomem, and if the hypervisor is
+ * still holding onto the bar (denoted by the string in the matcher variable),
+ * then we can wait around a bit for that to clear up.
+ *
+ * A typical /proc/iomem looks like this (snipped for brevity):
+ * 00010000-0008efff : System RAM
+ * 0008f000-0008ffff : reserved
+ * ...
+ * 00100000-cc9fcfff : System RAM
+ *   00200000-00483d3b : Kernel code
+ *   00483d3c-005c88df : Kernel data
+ * cc9fd000-ccc71fff : ACPI Non-volatile Storage
+ * ...
+ * d0200000-d02fffff : PCI Bus #05
+ *   d0200000-d021ffff : 0000:05:00.0
+ *     d0200000-d021ffff : e1000e
+ *   d0220000-d023ffff : 0000:05:00.0
+ *     d0220000-d023ffff : e1000e
+ * ...
+ * f0000000-f0003fff : 0000:00:1b.0
+ *   f0000000-f0003fff : kvm_assigned_device
+ *
+ * Returns 0 if we are clear to continue, and 1 if the hypervisor is still
+ * holding onto the resource.
+ */
+int
+pciWaitForDeviceCleanup(pciDevice *dev, const char *matcher)
+{
+    FILE *fp;
+    char line[160];
+    unsigned long long start, end;
+    int consumed;
+    char *rest;
+    unsigned long long domain;
+    int bus, slot, function;
+    int in_matching_device;
+    int ret;
+    size_t match_depth;
+
+    fp = fopen("/proc/iomem", "r");
+    if (!fp) {
+        /* If we failed to open iomem, we just basically ignore the error.  The
+         * unbind might succeed anyway, and besides, it's very likely we have
+         * no way to report the error
+         */
+        VIR_DEBUG0("Failed to open /proc/iomem, trying to continue anyway");
+        return 0;
+    }
+
+    ret = 0;
+    in_matching_device = 0;
+    match_depth = 0;
+    while (fgets(line, sizeof(line), fp) != 0) {
+        /* the logic here is a bit confusing.  For each line, we look to
+         * see if it matches the domain:bus:slot.function we were given.
+         * If this line matches the DBSF, then any subsequent lines indented
+         * by 2 spaces are the PCI regions for this device.  It's also
+         * possible that none of the PCI regions are currently mapped, in
+         * which case we have no indented regions.  This code handles all
+         * of these situations
+         */
+        if (in_matching_device && (strspn(line, " ") == (match_depth + 2))) {
+            if (sscanf(line, "%Lx-%Lx : %n", &start, &end, &consumed) != 2)
+                continue;
+
+            rest = line + consumed;
+            if (STRPREFIX(rest, matcher)) {
+                ret = 1;
+                break;
+            }
+        }
+        else {
+            in_matching_device = 0;
+            if (sscanf(line, "%Lx-%Lx : %n", &start, &end, &consumed) != 2)
+                continue;
+
+            rest = line + consumed;
+            if (sscanf(rest, "%Lx:%x:%x.%x", &domain, &bus, &slot, &function) != 4)
+                continue;
+
+            if (domain != dev->domain || bus != dev->bus || slot != dev->slot ||
+                function != dev->function)
+                continue;
+            in_matching_device = 1;
+            match_depth = strspn(line, " ");
+        }
+    }
+
+    fclose(fp);
+
+    return ret;
+}
+
 static char *
 pciReadDeviceID(pciDevice *dev, const char *id_name)
 {
diff --git a/src/util/pci.h b/src/util/pci.h
index a1a19e7..e6ab137 100644
--- a/src/util/pci.h
+++ b/src/util/pci.h
@@ -81,5 +81,6 @@ int pciDeviceFileIterate(virConnectPtr conn,
 int pciDeviceIsAssignable(virConnectPtr conn,
                           pciDevice *dev,
                           int strict_acs_check);
+int pciWaitForDeviceCleanup(pciDevice *dev, const char *matcher);
 
 #endif /* __VIR_PCI_H__ */
-- 
1.6.6


[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]