[libvirt] [PATCH 6/6] qemu: pass numa node binding preferences to qemu

Daniel P. Berrange berrange at redhat.com
Wed Jun 4 15:28:57 UTC 2014


On Wed, Jun 04, 2014 at 04:56:32PM +0200, Martin Kletzander wrote:
> diff --git a/src/qemu/qemu_cgroup.c b/src/qemu/qemu_cgroup.c
> index b1bfb5a..6415a6d 100644
> --- a/src/qemu/qemu_cgroup.c
> +++ b/src/qemu/qemu_cgroup.c
> @@ -930,8 +932,15 @@ qemuSetupCgroupForVcpu(virDomainObjPtr vm)
>                  goto cleanup;
>          }
> 
> +        if (virDomainGetMemsForGuestCpu(def, i, &mem_mask) < 0)
> +            goto cleanup;
> +
>          /* Set vcpupin in cgroup if vcpupin xml is provided */
>          if (virCgroupHasController(priv->cgroup, VIR_CGROUP_CONTROLLER_CPUSET)) {
> +
> +            if (mem_mask && virCgroupSetCpusetMems(cgroup_vcpu, mem_mask) < 0)
> +                goto cleanup;
> +

I'm still pretty unclear on whether we want to do this or not.

IMHO the <memnode> element is primarily about making the memory of
guest NUMA nodes be allocated from specific host NUMA nodes.

I'm not sure whether arbitrary allocations done by the QEMU vCPU
threads should be bound in the same way. It would be nice to have
explicit confirmation from someone who knows KVM that we should
force vCPU threads to allocate from the same host NUMA nodes as
the guest NUMA node the vCPU belongs to.

>              /* find the right CPU to pin, otherwise
>               * qemuSetupCgroupVcpuPin will fail. */
>              for (j = 0; j < def->cputune.nvcpupin; j++) {
> @@ -946,14 +955,21 @@ qemuSetupCgroupForVcpu(virDomainObjPtr vm)
> 
>                  break;
>              }
> +        } else if (mem_mask) {
> +            virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
> +                           _("cgroup cpuset is required for "
> +                             "per-node memory binding"));
> +            goto cleanup;
>          }
> 
> +        VIR_FREE(mem_mask);
>          virCgroupFree(&cgroup_vcpu);
>      }
> 
>      return 0;
> 
>   cleanup:
> +    VIR_FREE(mem_mask);
>      if (cgroup_vcpu) {
>          virCgroupRemove(cgroup_vcpu);
>          virCgroupFree(&cgroup_vcpu);
> diff --git a/src/qemu/qemu_command.c b/src/qemu/qemu_command.c
> index cc6023d..8699eea 100644
> --- a/src/qemu/qemu_command.c
> +++ b/src/qemu/qemu_command.c
> @@ -6531,21 +6531,99 @@ qemuBuildSmpArgStr(const virDomainDef *def,
>      return virBufferContentAndReset(&buf);
>  }
> 
> +static const char *
> +qemuNumaPolicyToString(virDomainNumatuneMemMode mode)
> +{
> +    const char *policy = NULL;
> +
> +    switch (mode) {
> +    case VIR_DOMAIN_NUMATUNE_MEM_STRICT:
> +        policy = "bind";
> +        break;
> +
> +    case VIR_DOMAIN_NUMATUNE_MEM_PREFERRED:
> +        policy = "preferred";
> +        break;
> +
> +    case VIR_DOMAIN_NUMATUNE_MEM_INTERLEAVE:
> +        policy = "interleave";
> +        break;
> +
> +    case VIR_DOMAIN_NUMATUNE_MEM_LAST:
> +        virReportError(VIR_ERR_INTERNAL_ERROR, "%s",
> +                       _("Cannot translate memory policy"));
> +        break;
> +    }
> +
> +    return policy;
> +}

Looks like a candidate for a QEMU-local VIR_ENUM.
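
Something along these lines perhaps (an untested sketch; the
qemuNumaPolicy name is just a suggestion):

  /* String order must match the virDomainNumatuneMemMode enum
   * order: STRICT, PREFERRED, INTERLEAVE. */
  VIR_ENUM_DECL(qemuNumaPolicy)
  VIR_ENUM_IMPL(qemuNumaPolicy, VIR_DOMAIN_NUMATUNE_MEM_LAST,
                "bind",
                "preferred",
                "interleave")

Callers would then use qemuNumaPolicyTypeToString(mode), which already
returns NULL for out-of-range values, and the switch goes away.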

> +
>  static int
> -qemuBuildNumaArgStr(const virDomainDef *def, virCommandPtr cmd)
> +qemuBuildNumaArgStr(const virDomainDef *def,
> +                    virCommandPtr cmd,
> +                    virQEMUDriverPtr driver,
> +                    virQEMUCapsPtr qemuCaps,
> +                    virBitmapPtr autoNodemask)
>  {
> +    int ret = -1;
>      size_t i;
> -    virBuffer buf = VIR_BUFFER_INITIALIZER;
>      char *cpumask = NULL;
> -    int ret = -1;
> +    virCapsPtr caps = virQEMUDriverGetCapabilities(driver, false);
> +    virBuffer buf = VIR_BUFFER_INITIALIZER;
> +    char *nodemask = NULL;
> +    char *globalNodemask = NULL;
> +    const char *globalPolicy = qemuNumaPolicyToString(def->numatune.memory.mode);
> +
> +    if (!globalPolicy)
> +        goto cleanup;
> +

> 
>      for (i = 0; i < def->cpu->ncells; i++) {
> -        VIR_FREE(cpumask);
> +        const char *policy = NULL;
>          int cellmem = VIR_DIV_UP(def->cpu->cells[i].mem, 1024);
>          def->cpu->cells[i].mem = cellmem * 1024;
> 
> -        if (!(cpumask = virBitmapFormat(def->cpu->cells[i].cpumask)))
> +        VIR_FREE(nodemask);
> +        if (def->numatune.mem_nodes &&
> +            def->numatune.mem_nodes[i].specified) {
> +            policy = qemuNumaPolicyToString(def->numatune.mem_nodes[i].mode);
> +            if (!policy)
> +                goto cleanup;
> +
> +            nodemask = virBitmapFormat(def->numatune.mem_nodes[i].nodemask);
> +            if (!nodemask) {
> +                virReportError(VIR_ERR_INTERNAL_ERROR, "%s",
> +                               _("Unable to format nodemask"));
> +                goto cleanup;
> +            }
> +        }
> +
> +        VIR_FREE(cpumask);
> +
> +        if (!(cpumask = virBitmapFormat(def->cpu->cells[i].cpumask))) {
> +            virReportError(VIR_ERR_INTERNAL_ERROR, "%s",
> +                           _("Unable to format cpumask"));
>              goto cleanup;
> +        }
> 
>          if (strchr(cpumask, ',')) {
>              virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
> @@ -6554,6 +6632,38 @@ qemuBuildNumaArgStr(const virDomainDef *def, virCommandPtr cmd)
>              goto cleanup;
>          }
> 
> +        if (virQEMUCapsGet(qemuCaps, QEMU_CAPS_OBJECT_MEMORY_RAM)) {
> +            virBufferAsprintf(&buf, "memory-backend-ram,size=%dM,id=ram-node%d",
> +                              cellmem, def->cpu->cells[i].cellid);
> +
> +            if (nodemask) {
> +                if (strchr(nodemask, ',')) {
> +                    virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
> +                                   _("disjoint NUMA node ranges are not supported "
> +                                     "with this QEMU"));
> +                    goto cleanup;
> +                }
> +
> +                virBufferEscape(&buf, ',', ",", ",host-nodes=%s", nodemask);
> +                virBufferAsprintf(&buf, ",policy=%s", policy);
> +
> +            } else if (globalNodemask) {
> +
> +                if (strchr(globalNodemask, ',')) {
> +                    virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
> +                                   _("disjoint NUMA node ranges are not supported "
> +                                     "with this QEMU"));
> +                    goto cleanup;
> +                }
> +
> +                virBufferEscape(&buf, ',', ",", ",host-nodes=%s", globalNodemask);
> +                virBufferAsprintf(&buf, ",policy=%s", globalPolicy);
> +            }
> +
> +            virCommandAddArg(cmd, "-object");
> +            virCommandAddArgBuffer(cmd, &buf);
> +        }

We should be reporting VIR_ERR_CONFIG_UNSUPPORTED if <memnode> was
requested in the XML and we don't have the QEMU_CAPS_OBJECT_MEMORY_RAM
capability.
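
Roughly something like this at the top of qemuBuildNumaArgStr()
perhaps (an untested sketch using the fields from this patch):

  if (!virQEMUCapsGet(qemuCaps, QEMU_CAPS_OBJECT_MEMORY_RAM)) {
      for (i = 0; i < def->cpu->ncells; i++) {
          /* Per-node binding was requested but this QEMU binary
           * can't honour it, so fail up front. */
          if (def->numatune.mem_nodes &&
              def->numatune.mem_nodes[i].specified) {
              virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
                             _("Per-node memory binding is not "
                               "supported with this QEMU"));
              goto cleanup;
          }
      }
  }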


> diff --git a/tests/qemuxml2argvdata/qemuxml2argv-numatune-memnodes.xml b/tests/qemuxml2argvdata/qemuxml2argv-numatune-memnodes.xml
> new file mode 100644
> index 0000000..18b00d8
> --- /dev/null
> +++ b/tests/qemuxml2argvdata/qemuxml2argv-numatune-memnodes.xml
> @@ -0,0 +1,31 @@
> +<domain type='qemu'>
> +  <name>QEMUGuest</name>
> +  <uuid>9f4b6512-e73a-4a25-93e8-5307802821ce</uuid>
> +  <memory unit='KiB'>65536</memory>
> +  <currentMemory unit='KiB'>65536</currentMemory>
> +  <vcpu placement='static'>2</vcpu>
> +  <numatune>
> +    <memory mode='strict' nodeset='0-3'/>
> +    <memnode cellid='0' mode='preferred' nodeset='3'/>
> +  </numatune>
> +  <os>
> +    <type arch='x86_64' machine='pc'>hvm</type>
> +    <boot dev='hd'/>
> +  </os>
> +  <cpu>
> +    <numa>
> +      <cell id='0' cpus='0' memory='32768'/>
> +      <cell id='1' cpus='1' memory='32768'/>

How about expanding this test case to demonstrate a huge asymmetric
NUMA topology? eg 32 vCPUs across 3 NUMA nodes, with 16 vCPUs in one
node and 8 in each of the other two nodes.
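
Something like this fragment perhaps (memory sizes picked arbitrarily
for illustration):

      <vcpu placement='static'>32</vcpu>

      <cell id='0' cpus='0-15' memory='1048576'/>
      <cell id='1' cpus='16-23' memory='524288'/>
      <cell id='2' cpus='24-31' memory='524288'/>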


Also, perhaps we should have an example topology that has CPU-only
nodes and memory-only nodes, eg

      <cell id='0' cpus='0-1'/>
      <cell id='1' memory='65536'/>

Regards,
Daniel
-- 
|: http://berrange.com      -o-    http://www.flickr.com/photos/dberrange/ :|
|: http://libvirt.org              -o-             http://virt-manager.org :|
|: http://autobuild.org       -o-         http://search.cpan.org/~danberr/ :|
|: http://entangle-photo.org       -o-       http://live.gnome.org/gtk-vnc :|



