[vfio-users] Occasional failure to start VM with GPU attached

Thomas Lindroth thomas.lindroth at gmail.com
Fri Jul 15 17:44:57 UTC 2016


I have a new setup and occasionally the VM fails to start after a fresh
boot. Qemu's thread "CPU 0/KVM" stalls and uses 100% cpu on one core
until I shut down the VM.

If I remove the forwarded GPU the VM starts like normal using spice
output but it stalls again if I add the GPU. So the problem is probably
related to vfio. I have to reboot to make it work again with the GPU.

I use a win7 guest with UEFI and I forward a GTX 970 card. I've tried to
debug the problem as best I could. Disabling hugepages, cpu pinning, apic
and some other features makes no difference. I'm not even getting the
tianocore boot splash before the stall so the guest operating system and
drivers seem irrelevant.

I tried to figure out what the stalled thread was doing when it ran.
I added some tracing to qemu by adding this to my libvirt xml.
<domain type='kvm' xmlns:qemu='http://libvirt.org/schemas/domain/qemu/1.0'>
...
   <!-- Extra arguments appended to the QEMU command line by libvirt.
        Equivalent to: -D /tmp/qemu_trace -d trace:*
        The qemu: prefix relies on the xmlns:qemu declaration on the
        enclosing domain element. -->
   <qemu:commandline>
     <!-- -D <file>: write the log/trace output to /tmp/qemu_trace -->
     <qemu:arg value='-D'/>
     <qemu:arg value='/tmp/qemu_trace'/>
     <!-- -d trace:*: the wildcard selects every trace event -->
     <qemu:arg value='-d'/>
     <qemu:arg value='trace:*'/>
   </qemu:commandline>

The trace is about 30 sec long and contains a whole lot of mostly
incomprehensible stuff. After 7.6 seconds the trace starts to repeat
itself. Here is a 1 sec cut out of what that looks like.
http://pastebin.com/4ZGDQBnh
It just executes kvm_vm_ioctl type 0x4010ae42, arg 0x7ffe9535ee50 over
and over. The last calls leading up to the repeat are these
http://pastebin.com/qezPNt9X
Looks like it's trying to do a pci config space write using an nvidia
quirk and after that it only repeats. I don't know what to make of that.
I guess OVMF is trying to post the card. The vbios is writing
something to pci config space and then sits around busy-waiting for
something to happen but it never happens.

At first I thought it was a problem with my setup but now I'm guessing
it's a bug in qemu, kernel or OVMF. I'm using kernel-4.6.4, qemu-2.6.0 and some
daily build of OVMF from about a week ago. My qemu is patched with commit
94ef4f337fb614f18b765a8e0e878a4c23cdedcd because the win7 install doesn't
work without it.

Is there anything else I can do to debug this problem?

My libvirt xml:
<!-- libvirt domain definition for the KVM guest "win7": a UEFI (OVMF)
     guest with a host PCI device (two functions) and three host USB
     devices passed through. -->
<domain type='kvm'>
  <name>win7</name>
  <uuid>f11b648e-c652-4c42-b2ba-02732533a15d</uuid>
  <!-- 8388608 KiB = 8 GiB of guest RAM, backed by host hugepages. -->
  <memory unit='KiB'>8388608</memory>
  <currentMemory unit='KiB'>8388608</currentMemory>
  <memoryBacking>
    <hugepages/>
  </memoryBacking>
  <!-- Three vCPUs, each pinned to its own host CPU (1, 2, 3); the
       emulator threads are pinned to host CPUs 5-7. -->
  <vcpu placement='static'>3</vcpu>
  <cputune>
    <vcpupin vcpu='0' cpuset='1'/>
    <vcpupin vcpu='1' cpuset='2'/>
    <vcpupin vcpu='2' cpuset='3'/>
    <emulatorpin cpuset='5-7'/>
  </cputune>
  <os>
    <type arch='x86_64' machine='pc-i440fx-2.6'>hvm</type>
    <!-- OVMF firmware image mapped read-only as pflash, with a per-guest
         variable store in the nvram file. -->
    <loader readonly='yes' type='pflash'>/usr/share/OVMF/OVMF_CODE.fd</loader>
    <nvram>/var/lib/libvirt/qemu/nvram/win7_VARS.fd</nvram>
    <bootmenu enable='no'/>
  </os>
  <features>
    <acpi/>
    <apic/>
    <!-- NOTE(review): the custom Hyper-V vendor id and the hidden KVM
         signature are presumably here so the guest GPU driver does not
         detect that it runs under a hypervisor; confirm. -->
    <hyperv>
      <vendor_id state='on' value='SomeString'/>
    </hyperv>
    <kvm>
      <hidden state='on'/>
    </kvm>
  </features>
  <!-- Host CPU model is passed through; the guest sees 1 socket x 3 cores
       x 1 thread, matching the 3 vCPUs above. -->
  <cpu mode='host-passthrough'>
    <topology sockets='1' cores='3' threads='1'/>
  </cpu>
  <clock offset='localtime'>
    <timer name='rtc' tickpolicy='catchup'/>
    <timer name='pit' tickpolicy='delay'/>
    <!-- HPET is not exposed to the guest. -->
    <timer name='hpet' present='no'/>
  </clock>
  <on_poweroff>destroy</on_poweroff>
  <on_reboot>restart</on_reboot>
  <on_crash>restart</on_crash>
  <pm>
    <suspend-to-mem enabled='yes'/>
    <suspend-to-disk enabled='yes'/>
  </pm>
  <devices>
    <emulator>/usr/bin/qemu-system-x86_64</emulator>
    <!-- System disk: raw LVM block device on the SATA controller, host
         page cache bypassed, guest discards passed down. Second in the
         boot order. -->
    <disk type='block' device='disk'>
      <driver name='qemu' type='raw' cache='none' discard='unmap'/>
      <source dev='/dev/VM_default_VG/main_VM_volume'/>
      <target dev='sda' bus='sata'/>
      <boot order='2'/>
      <address type='drive' controller='0' bus='0' target='0' unit='0'/>
    </disk>
    <!-- IDE CD-ROM drive with no source element (no media inserted);
         first in the boot order. -->
    <disk type='file' device='cdrom'>
      <driver name='qemu' type='raw'/>
      <target dev='hdb' bus='ide'/>
      <readonly/>
      <boot order='1'/>
      <address type='drive' controller='0' bus='0' target='0' unit='1'/>
    </disk>
    <!-- Second IDE CD-ROM drive holding the Windows 7 ISO; it has no boot
         order entry. -->
    <disk type='file' device='cdrom'>
      <driver name='qemu' type='raw'/>
      <source file='/mnt/virtual/share/windows7_SP2.iso'/>
      <target dev='hdc' bus='ide'/>
      <readonly/>
      <address type='drive' controller='0' bus='1' target='0' unit='0'/>
    </disk>
    <!-- USB controller 0: one ICH9 EHCI function plus three UHCI
         companion functions, all on guest PCI slot 0x04. -->
    <controller type='usb' index='0' model='ich9-ehci1'>
      <address type='pci' domain='0x0000' bus='0x00' slot='0x04' function='0x7'/>
    </controller>
    <controller type='usb' index='0' model='ich9-uhci1'>
      <master startport='0'/>
      <address type='pci' domain='0x0000' bus='0x00' slot='0x04' function='0x0' multifunction='on'/>
    </controller>
    <controller type='usb' index='0' model='ich9-uhci2'>
      <master startport='2'/>
      <address type='pci' domain='0x0000' bus='0x00' slot='0x04' function='0x1'/>
    </controller>
    <controller type='usb' index='0' model='ich9-uhci3'>
      <master startport='4'/>
      <address type='pci' domain='0x0000' bus='0x00' slot='0x04' function='0x2'/>
    </controller>
    <controller type='pci' index='0' model='pci-root'/>
    <controller type='ide' index='0'>
      <address type='pci' domain='0x0000' bus='0x00' slot='0x01' function='0x1'/>
    </controller>
    <controller type='sata' index='0'>
      <address type='pci' domain='0x0000' bus='0x00' slot='0x06' function='0x0'/>
    </controller>
    <!-- Two virtio NICs: one on user-mode networking, one attached to the
         host bridge br0. -->
    <interface type='user'>
      <mac address='00:11:22:33:44:55'/>
      <model type='virtio'/>
      <address type='pci' domain='0x0000' bus='0x00' slot='0x03' function='0x0'/>
    </interface>
    <interface type='bridge'>
      <mac address='00:11:22:33:44:66'/>
      <source bridge='br0'/>
      <model type='virtio'/>
      <address type='pci' domain='0x0000' bus='0x00' slot='0x09' function='0x0'/>
    </interface>
    <input type='mouse' bus='ps2'/>
    <input type='keyboard' bus='ps2'/>
    <!-- Emulated display: SPICE with a QXL adapter marked as the primary
         video device, so the passed-through PCI device below is not the
         primary adapter in this configuration. -->
    <graphics type='spice' autoport='yes'>
      <listen type='address'/>
    </graphics>
    <video>
      <model type='qxl' ram='65536' vram='65536' vgamem='16384' heads='1' primary='yes'/>
      <address type='pci' domain='0x0000' bus='0x00' slot='0x02' function='0x0'/>
    </video>
    <!-- Three host USB devices passed through, selected by vendor and
         product id. -->
    <hostdev mode='subsystem' type='usb' managed='yes'>
      <source>
        <vendor id='0x045e'/>
        <product id='0x0737'/>
      </source>
    </hostdev>
    <hostdev mode='subsystem' type='usb' managed='yes'>
      <source>
        <vendor id='0x05af'/>
        <product id='0x1125'/>
      </source>
    </hostdev>
    <hostdev mode='subsystem' type='usb' managed='yes'>
      <source>
        <vendor id='0x054c'/>
        <product id='0x05c4'/>
      </source>
    </hostdev>
    <!-- Host PCI device 0000:04:00, functions 0 and 1, passed through to
         guest slots 0x07 and 0x08 as two separate single-function devices.
         NOTE(review): the accompanying post says a GTX 970 is forwarded, so
         function 0 is presumably the GPU and function 1 its audio
         function; confirm with lspci on the host. -->
    <hostdev mode='subsystem' type='pci' managed='yes'>
      <source>
        <address domain='0x0000' bus='0x04' slot='0x00' function='0x0'/>
      </source>
      <address type='pci' domain='0x0000' bus='0x00' slot='0x07' function='0x0'/>
    </hostdev>
    <hostdev mode='subsystem' type='pci' managed='yes'>
      <source>
        <address domain='0x0000' bus='0x04' slot='0x00' function='0x1'/>
      </source>
      <address type='pci' domain='0x0000' bus='0x00' slot='0x08' function='0x0'/>
    </hostdev>
    <memballoon model='virtio'>
      <address type='pci' domain='0x0000' bus='0x00' slot='0x05' function='0x0'/>
    </memballoon>
  </devices>
</domain>

Wrapper script to start the VM:
#!/bin/bash
#
# Wrapper that prepares the host, starts the libvirt guest, blocks until the
# guest is no longer running and then resets the host settings.
#
# Sequence: activate the guest's LVM volume, reserve hugepages, shield host
# CPUs 1-3,5-7 with cset, select the "performance" governor, disable THP,
# optionally take an LVM snapshot plus an XML dump, switch off output DP1,
# start synergys and finally the VM. After the guest stops, synergys is
# terminated and the display, THP, governor, cpuset shield and hugepage
# settings are reset.
#
# NOTE(review): writes to /proc and /sys and calls lvchange/su, so it
# presumably has to be run as root - confirm.

# todo parse this from the xml
VM_NAME=win7
HUGEPAGES=4096
# Timestamp for snapshot/backup names, e.g. 2016-07-15-174457 (no colons).
DATETIME=$(date +%F-%H%M%S)

# Refuse to run twice for the same guest.
if [ "$(virsh domstate "$VM_NAME")" = "running" ]
then
  echo "VM $VM_NAME already running"
  exit 1
fi

lvchange -ay -K VM_default_VG/main_VM_volume

# drop caches and compact memory to free up contiguous memory for huge pages
echo 3 > /proc/sys/vm/drop_caches
echo 1 > /proc/sys/vm/compact_memory

# Request the pages, then read the value back: the kernel may grant fewer
# than requested when memory is too fragmented.
echo "$HUGEPAGES" > /proc/sys/vm/nr_hugepages
ALLOC_PAGES=$(cat /proc/sys/vm/nr_hugepages)

if [ "$ALLOC_PAGES" -ne "$HUGEPAGES" ]
then
  echo Not able to allocate hugepages
  # Give back whatever was partially allocated before bailing out.
  echo 0 > /proc/sys/vm/nr_hugepages
  exit 1
fi

# libvirt likes to create a cpuset called "machine" and leave it behind after closing
# this confuses cset. remove it just in case
# Only attempt the removal when the directory exists, so a clean system
# does not produce a spurious rmdir error.
if [ -d /sys/fs/cgroup/cpuset/machine ]
then
  rmdir /sys/fs/cgroup/cpuset/machine
fi

# reserve cpu 0,4 (core 0) for the host and give 1-3,5-7 (core 1-3) to the guest/vm
cset shield --kthread on --cpu 1-3,5-7

# cset reserves the cpusets for exclusive use but libvirt also wants to create
# new cpusets containing the same cpus. Remove the exclusive flag.
echo 0 > /sys/fs/cgroup/cpuset/system/cpuset.cpu_exclusive
echo 0 > /sys/fs/cgroup/cpuset/user/cpuset.cpu_exclusive

# Maximum Powah!
echo performance | tee /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor

# THP can allegedly result in OS jitter. Better keep it off.
echo never > /sys/kernel/mm/transparent_hugepage/enabled

# Optional pre-start backup: LVM snapshot of the guest volume plus a dump of
# the domain XML, both tagged with the timestamp.
echo -n "Snapshot [y/n]? "
read -r SNAPSHOT
# Quoted on purpose: an empty answer (plain Enter) or an answer containing
# spaces would otherwise make the test itself fail with a syntax error.
if [ "$SNAPSHOT" = "y" ]
then
  lvcreate -s --name "VM_backup_$DATETIME" VM_default_VG/main_VM_volume
  virsh dumpxml "$VM_NAME" > "/home/user/VM_backup/${VM_NAME}_${DATETIME}"
fi

# Switch off output DP1 on the host before the guest starts.
xrandr --output DP1 --off

# Run the synergy server as the unprivileged user, in the background.
su --login user --command '/usr/bin/synergys --name host --no-tray --no-daemon --address 192.168.0.1:9050' &

virsh start "$VM_NAME"

# Give the domain time to reach the "running" state, then poll once per
# second until it leaves that state.
sleep 10
while [ "$(virsh domstate "$VM_NAME")" = "running" ]
do
  sleep 1
done

# Guest has stopped: undo the host-side changes made above.
pkill --uid user --signal SIGINT synergys

xrandr --output DP1 --auto --output HDMI2 --auto --right-of DP1

echo always > /sys/kernel/mm/transparent_hugepage/enabled

echo powersave | tee /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor

cset shield --reset

echo 0 > /proc/sys/vm/nr_hugepages




More information about the vfio-users mailing list