[libvirt] [PATCH RFC]: Support numad
Dave Allan
dallan at redhat.com
Tue Feb 28 16:33:03 UTC 2012
On Tue, Feb 28, 2012 at 10:10:50PM +0800, Osier Yang wrote:
> numad is an user-level daemon that monitors NUMA topology and
> processes resource consumption to facilitate good NUMA resource
> alignment of applications/virtual machines to improve performance
> and minimize cost of remote memory latencies. It provides a
> pre-placement advisory interface, so significant processes can
> be pre-bound to nodes with sufficient available resources.
>
> More details: http://fedoraproject.org/wiki/Features/numad
>
> "numad -w ncpus:memory_amount" is the advisory interface numad
> provides currently.
>
> This patch add the support by introducing new XML like:
> <numatune>
> <cpu required_cpus="4" required_memory="524288"/>
> </numatune>
Isn't the usual case going to be the vcpus and memory in the guest?
IMO we should default to passing those numbers to numad if
required_cpus and required_memory are not provided explicitly.
Dave
> And the corresponding numad command line will be:
> numad -w 4:500
>
> The advisory nodeset returned from numad will be used to set
> domain process CPU affinity then. (e.g. qemuProcessInitCpuAffinity).
>
> If the user specifies both CPU affinity policy (e.g.
> (<vcpu cpuset="1-10,^7,^8">4</vcpu>) and XML indicating to use
> numad for the advisory nodeset, the specified CPU affinity will be
> overridden by the nodeset returned from numad.
>
> If no XML to specify the CPU affinity policy, and XML indicating
> to use numad is specified, the returned nodeset will be printed
> in <cpu cpuset="$nodeset_from_numad"/>4</vcpu>.
>
> Only QEMU/KVM and LXC drivers support it now.
> ---
> configure.ac | 8 +++
> docs/formatdomain.html.in | 18 ++++++-
> docs/schemas/domaincommon.rng | 12 ++++
> src/conf/domain_conf.c | 125 +++++++++++++++++++++++++++++++----------
> src/conf/domain_conf.h | 5 ++
> src/lxc/lxc_controller.c | 98 ++++++++++++++++++++++++++++----
> src/qemu/qemu_process.c | 99 +++++++++++++++++++++++++++++----
> 7 files changed, 311 insertions(+), 54 deletions(-)
>
> diff --git a/configure.ac b/configure.ac
> index c9cdd7b..31f0835 100644
> --- a/configure.ac
> +++ b/configure.ac
> @@ -1445,6 +1445,14 @@ AM_CONDITIONAL([HAVE_NUMACTL], [test "$with_numactl" != "no"])
> AC_SUBST([NUMACTL_CFLAGS])
> AC_SUBST([NUMACTL_LIBS])
>
> +dnl Do we have numad?
> +if test "$with_qemu" = "yes"; then
> + AC_PATH_PROG([NUMAD], [numad], [], [/bin:/usr/bin:/usr/local/bin:$PATH])
> +
> + if test -n "$NUMAD"; then
> + AC_DEFINE_UNQUOTED([NUMAD],["$NUMAD"], [Location or name of the numad program])
> + fi
> +fi
>
> dnl pcap lib
> LIBPCAP_CONFIG="pcap-config"
> diff --git a/docs/formatdomain.html.in b/docs/formatdomain.html.in
> index 6fcca94..d8e70a6 100644
> --- a/docs/formatdomain.html.in
> +++ b/docs/formatdomain.html.in
> @@ -505,6 +505,7 @@
> ...
> <numatune>
> <memory mode="strict" nodeset="1-4,^3"/>
> + <cpu required_cpus="3" required_memory="524288"/>
> </numatune>
> ...
> </domain>
> @@ -519,7 +520,7 @@
> <span class='since'>Since 0.9.3</span>
> <dt><code>memory</code></dt>
> <dd>
> - The optional <code>memory</code> element specify how to allocate memory
> + The optional <code>memory</code> element specifies how to allocate memory
> for the domain process on a NUMA host. It contains two attributes,
> attribute <code>mode</code> is either 'interleave', 'strict',
> or 'preferred',
> @@ -527,6 +528,21 @@
> syntax with attribute <code>cpuset</code> of element <code>vcpu</code>.
> <span class='since'>Since 0.9.3</span>
> </dd>
> + <dd>
> + The optional <code>cpu</code> element indicates pinning the virtual CPUs
> + to the nodeset returned by querying "numad" (a system daemon that monitors
> + NUMA topology and usage). It has two attributes, attribute
> + <code>required_cpus</code> specifies the number of physical CPUs the guest
> + process want to use. And the optional attribute <code>required_memory</code>
> + specifies the amount of free memory the guest process want to see on a node,
> + "numad" will pick the physical CPUs on the node which has enough free
> + memory of amount specified by <code>required_memory</code>.
> +
> + NB, with using this element, the physical CPUs specified by attribute
> + <code>cpuset</code> (of element <code>vcpu</code>) will be overridden by the
> + nodeset returned from "numad".
> + <span class='since'>Since 0.9.11 (QEMU/KVM and LXC only)</span>
> + </dd>
> </dl>
>
>
> diff --git a/docs/schemas/domaincommon.rng b/docs/schemas/domaincommon.rng
> index 3908733..d0f443d 100644
> --- a/docs/schemas/domaincommon.rng
> +++ b/docs/schemas/domaincommon.rng
> @@ -549,6 +549,18 @@
> </attribute>
> </element>
> </optional>
> + <optional>
> + <element name="cpu">
> + <attribute name="required_cpu">
> + <ref name="countCPU"/>
> + </attribute>
> + <optional>
> + <attribute name="required_memory">
> + <ref name="memoryKB"/>
> + </attribute>
> + </optional>
> + </element>
> + </optional>
> </element>
> </optional>
> </interleave>
> diff --git a/src/conf/domain_conf.c b/src/conf/domain_conf.c
> index f9654f1..aa03c05 100644
> --- a/src/conf/domain_conf.c
> +++ b/src/conf/domain_conf.c
> @@ -7125,7 +7125,6 @@ error:
> goto cleanup;
> }
>
> -
> static int virDomainDefMaybeAddController(virDomainDefPtr def,
> int type,
> int idx)
> @@ -7185,6 +7184,7 @@ static virDomainDefPtr virDomainDefParseXML(virCapsPtr caps,
> bool uuid_generated = false;
> virBitmapPtr bootMap = NULL;
> unsigned long bootMapSize = 0;
> + xmlNodePtr cur;
>
> if (VIR_ALLOC(def) < 0) {
> virReportOOMError();
> @@ -7454,47 +7454,100 @@ static virDomainDefPtr virDomainDefParseXML(virCapsPtr caps,
> VIR_FREE(nodes);
>
> /* Extract numatune if exists. */
> - if ((n = virXPathNodeSet("./numatune", ctxt, NULL)) < 0) {
> + if ((n = virXPathNodeSet("./numatune", ctxt, &nodes)) < 0) {
> virDomainReportError(VIR_ERR_INTERNAL_ERROR,
> "%s", _("cannot extract numatune nodes"));
> goto error;
> }
>
> + if (n > 1) {
> + virDomainReportError(VIR_ERR_XML_ERROR, "%s",
> + _("only one numatune is supported"));
> + VIR_FREE(nodes);
> + goto error;
> + }
> +
> if (n) {
> - tmp = virXPathString("string(./numatune/memory/@nodeset)", ctxt);
> - if (tmp) {
> - char *set = tmp;
> - int nodemasklen = VIR_DOMAIN_CPUMASK_LEN;
> + cur = nodes[0]->children;
> + while (cur != NULL) {
> + if (cur->type == XML_ELEMENT_NODE) {
> + if ((xmlStrEqual(cur->name, BAD_CAST "memory"))) {
> + tmp = virXMLPropString(cur, "nodeset");
>
> - if (VIR_ALLOC_N(def->numatune.memory.nodemask, nodemasklen) < 0) {
> - goto no_memory;
> - }
> + if (tmp) {
> + char *set = tmp;
> + int nodemasklen = VIR_DOMAIN_CPUMASK_LEN;
>
> - /* "nodeset" leads same syntax with "cpuset". */
> - if (virDomainCpuSetParse(set, 0, def->numatune.memory.nodemask,
> - nodemasklen) < 0)
> - goto error;
> - VIR_FREE(tmp);
> - } else {
> - virDomainReportError(VIR_ERR_INTERNAL_ERROR,
> - "%s", _("nodeset for NUMA memory tuning must be set"));
> - goto error;
> - }
> + if (VIR_ALLOC_N(def->numatune.memory.nodemask,
> + nodemasklen) < 0) {
> + virReportOOMError();
> + goto error;
> + }
>
> - tmp = virXPathString("string(./numatune/memory/@mode)", ctxt);
> - if (tmp) {
> - if ((def->numatune.memory.mode =
> - virDomainNumatuneMemModeTypeFromString(tmp)) < 0) {
> - virDomainReportError(VIR_ERR_INTERNAL_ERROR,
> - _("Unsupported NUMA memory tuning mode '%s'"),
> - tmp);
> - goto error;
> + /* "nodeset" leads same syntax with "cpuset". */
> + if (virDomainCpuSetParse(set, 0,
> + def->numatune.memory.nodemask,
> + nodemasklen) < 0)
> + goto error;
> + VIR_FREE(tmp);
> + } else {
> + virDomainReportError(VIR_ERR_XML_ERROR, "%s",
> + _("nodeset for NUMA memory "
> + "tuning must be set"));
> + goto error;
> + }
> +
> + tmp = virXMLPropString(cur, "mode");
> + if (tmp) {
> + if ((def->numatune.memory.mode =
> + virDomainNumatuneMemModeTypeFromString(tmp)) < 0) {
> + virDomainReportError(VIR_ERR_XML_ERROR,
> + _("Unsupported NUMA memory "
> + "tuning mode '%s'"),
> + tmp);
> + goto error;
> + }
> + VIR_FREE(tmp);
> + } else {
> + def->numatune.memory.mode = VIR_DOMAIN_NUMATUNE_MEM_STRICT;
> + }
> + } else if (xmlStrEqual(cur->name, BAD_CAST "cpu")) {
> + char *req_cpus = NULL;
> + char *req_memory = NULL;
> + req_cpus = virXMLPropString(cur, "required_cpus");
> + req_memory = virXMLPropString(cur, "required_memory");
> +
> + if (req_cpus &&
> + virStrToLong_ui(req_cpus, NULL, 10,
> + &def->numatune.cpu.required_cpus) < 0) {
> + virDomainReportError(VIR_ERR_INTERNAL_ERROR, "%s",
> + _("Cannot parse <cpu> 'required_cpus'"
> + " attribute"));
> + goto error;
> + }
> +
> + if (req_memory &&
> + virStrToLong_ul(req_memory, NULL, 10,
> + &def->numatune.cpu.required_memory) < 0) {
> + virDomainReportError(VIR_ERR_INTERNAL_ERROR, "%s",
> + _("Cannot parse <cpu> 'required_memory'"
> + " attribute"));
> + goto error;
> + }
> +
> + VIR_FREE(req_cpus);
> + VIR_FREE(req_memory);
> + } else {
> + virDomainReportError(VIR_ERR_XML_ERROR,
> + _("unsupported XML element %s"),
> + (const char *)cur->name);
> + goto error;
> + }
> }
> - VIR_FREE(tmp);
> - } else {
> - def->numatune.memory.mode = VIR_DOMAIN_NUMATUNE_MEM_STRICT;
> + cur = cur->next;
> }
> }
> + VIR_FREE(nodes);
>
> n = virXPathNodeSet("./features/*", ctxt, &nodes);
> if (n < 0)
> @@ -11761,7 +11814,8 @@ virDomainDefFormatInternal(virDomainDefPtr def,
> def->cputune.period || def->cputune.quota)
> virBufferAddLit(buf, " </cputune>\n");
>
> - if (def->numatune.memory.nodemask) {
> + if (def->numatune.memory.nodemask ||
> + def->numatune.cpu.required_cpus) {
> const char *mode;
> char *nodemask = NULL;
>
> @@ -11778,6 +11832,15 @@ virDomainDefFormatInternal(virDomainDefPtr def,
> virBufferAsprintf(buf, " <memory mode='%s' nodeset='%s'/>\n",
> mode, nodemask);
> VIR_FREE(nodemask);
> +
> + if (def->numatune.cpu.required_cpus)
> + virBufferAsprintf(buf, " <cpu required_cpus='%d' ",
> + def->numatune.cpu.required_cpus);
> +
> + if (def->numatune.cpu.required_memory)
> + virBufferAsprintf(buf, "required_memory='%lu'/>\n",
> + def->numatune.cpu.required_memory);
> +
> virBufferAddLit(buf, " </numatune>\n");
> }
>
> diff --git a/src/conf/domain_conf.h b/src/conf/domain_conf.h
> index 596be4d..1284599 100644
> --- a/src/conf/domain_conf.h
> +++ b/src/conf/domain_conf.h
> @@ -1416,6 +1416,11 @@ struct _virDomainNumatuneDef {
> int mode;
> } memory;
>
> + struct {
> + unsigned int required_cpus;
> + unsigned long required_memory;
> + } cpu;
> +
> /* Future NUMA tuning related stuff should go here. */
> };
>
> diff --git a/src/lxc/lxc_controller.c b/src/lxc/lxc_controller.c
> index 8f336f5..ec6434d 100644
> --- a/src/lxc/lxc_controller.c
> +++ b/src/lxc/lxc_controller.c
> @@ -327,6 +327,47 @@ static int lxcSetContainerNUMAPolicy(virDomainDefPtr def)
> }
> #endif
>
> +#if defined(NUMAD)
> +static char *
> +lxcGetNumadAdvice(unsigned int req_cpus,
> + unsigned long req_memory) {
> + virCommandPtr cmd = NULL;
> + char *reqs = NULL;
> + char *ret = NULL;
> +
> + /* numad uses "MB" for memory. */
> + if (req_memory) {
> + req_memory = req_memory / 1024;
> + if (virAsprintf(&reqs, "%d:%lu", req_cpus, req_memory) < 0) {
> + virReportOOMError();
> + goto out;
> + }
> + cmd = virCommandNewArgList(NUMAD, "-w", reqs, NULL);
> + } else {
> + cmd = virCommandNewArgList(NUMAD, "-w", "%d", req_cpus, NULL);
> + }
> +
> + virCommandSetOutputBuffer(cmd, &ret);
> +
> + if (virCommandRun(cmd, NULL) < 0) {
> + lxcError(VIR_ERR_INTERNAL_ERROR, "%s",
> + _("Failed to query numad for the advisory nodeset"));
> + }
> +
> +out:
> + VIR_FREE(reqs);
> + virCommandFree(cmd);
> + return ret;
> +}
> +#else
> +static char *
> +lxcGetNumadAdvice(unsigned int req_cpus ATTRIBUTE_UNUSED,
> + unsigned long req_memory ATTRIBUTE_UNUSED) {
> + lxcError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
> + _("numad is not available on this host"));
> + return NULL;
> +}
> +#endif
>
> /*
> * To be run while still single threaded
> @@ -355,19 +396,54 @@ static int lxcSetContainerCpuAffinity(virDomainDefPtr def)
> return -1;
> }
>
> - if (def->cpumask) {
> - /* XXX why don't we keep 'cpumask' in the libvirt cpumap
> - * format to start with ?!?! */
> - for (i = 0 ; i < maxcpu && i < def->cpumasklen ; i++)
> - if (def->cpumask[i])
> + /* def->cpumask will be overridden by the nodeset
> + * suggested by numad if it's specified.
> + */
> + if (def->numatune.cpu.required_cpus) {
> + char *tmp_cpumask = NULL;
> + char *nodeset = NULL;
> +
> + nodeset = lxcGetNumadAdvice(def->numatune.cpu.required_cpus,
> + def->numatune.cpu.required_memory);
> + if (!nodeset)
> + return -1;
> +
> + if (VIR_ALLOC_N(tmp_cpumask, VIR_DOMAIN_CPUMASK_LEN) < 0) {
> + virReportOOMError();
> + return -1;
> + }
> +
> + if (virDomainCpuSetParse(nodeset, 0, tmp_cpumask,
> + VIR_DOMAIN_CPUMASK_LEN) < 0) {
> + VIR_FREE(tmp_cpumask);
> + VIR_FREE(nodeset);
> + return -1;
> + }
> +
> + for (i = 0; i < maxcpu && i < VIR_DOMAIN_CPUMASK_LEN; i++) {
> + if (tmp_cpumask[i])
> VIR_USE_CPU(cpumap, i);
> + }
> +
> + /* Update def->cpumask */
> + VIR_FREE(def->cpumask);
> + def->cpumask = tmp_cpumask;
> + VIR_FREE(nodeset);
> } else {
> - /* You may think this is redundant, but we can't assume libvirtd
> - * itself is running on all pCPUs, so we need to explicitly set
> - * the spawned LXC instance to all pCPUs if no map is given in
> - * its config file */
> - for (i = 0 ; i < maxcpu ; i++)
> - VIR_USE_CPU(cpumap, i);
> + if (def->cpumask) {
> + /* XXX why don't we keep 'cpumask' in the libvirt cpumap
> + * format to start with ?!?! */
> + for (i = 0 ; i < maxcpu && i < def->cpumasklen ; i++)
> + if (def->cpumask[i])
> + VIR_USE_CPU(cpumap, i);
> + } else {
> + /* You may think this is redundant, but we can't assume libvirtd
> + * itself is running on all pCPUs, so we need to explicitly set
> + * the spawned LXC instance to all pCPUs if no map is given in
> + * its config file */
> + for (i = 0 ; i < maxcpu ; i++)
> + VIR_USE_CPU(cpumap, i);
> + }
> }
>
> /* We are pressuming we are running between fork/exec of LXC
> diff --git a/src/qemu/qemu_process.c b/src/qemu/qemu_process.c
> index 41218de..eb9f8f1 100644
> --- a/src/qemu/qemu_process.c
> +++ b/src/qemu/qemu_process.c
> @@ -1633,6 +1633,48 @@ qemuProcessInitNumaMemoryPolicy(virDomainObjPtr vm)
> }
> #endif
>
> +#if defined(NUMAD)
> +static char *
> +qemuGetNumadAdvice(unsigned int req_cpus,
> + unsigned long req_memory) {
> + virCommandPtr cmd = NULL;
> + char *reqs = NULL;
> + char *output = NULL;
> +
> + /* numad uses "MB" for memory. */
> + if (req_memory) {
> + req_memory = req_memory / 1024;
> + if (virAsprintf(&reqs, "%d:%lu", req_cpus, req_memory) < 0) {
> + virReportOOMError();
> + goto out;
> + }
> +
> + cmd = virCommandNewArgList(NUMAD, "-w", reqs, NULL);
> + } else {
> + cmd = virCommandNewArgList(NUMAD, "-w", "%u", req_cpus, NULL);
> + }
> +
> + virCommandSetOutputBuffer(cmd, &output);
> +
> + if (virCommandRun(cmd, NULL) < 0)
> + qemuReportError(VIR_ERR_INTERNAL_ERROR, "%s",
> + _("Failed to query numad for the advisory nodeset"));
> +
> +out:
> + VIR_FREE(reqs);
> + virCommandFree(cmd);
> + return output;
> +}
> +#else
> +static char *
> +qemuGetNumadAdvice(unsigned int req_cpus ATTRIBUTE_UNUSED,
> + unsigned long req_memory ATTRIBUTE_UNUSED) {
> + qemuReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
> + _("numad is not available on this host"));
> + return NULL;
> +}
> +#endif
> +
> /*
> * To be run between fork/exec of QEMU only
> */
> @@ -1661,19 +1703,54 @@ qemuProcessInitCpuAffinity(virDomainObjPtr vm)
> return -1;
> }
>
> - if (vm->def->cpumask) {
> - /* XXX why don't we keep 'cpumask' in the libvirt cpumap
> - * format to start with ?!?! */
> - for (i = 0 ; i < maxcpu && i < vm->def->cpumasklen ; i++)
> - if (vm->def->cpumask[i])
> + /* vm->def->cpumask will be overridden by the nodeset
> + * suggested by numad if it's specified.
> + */
> + if (vm->def->numatune.cpu.required_cpus) {
> + char *tmp_cpumask = NULL;
> + char *nodeset = NULL;
> +
> + nodeset = qemuGetNumadAdvice(vm->def->numatune.cpu.required_cpus,
> + vm->def->numatune.cpu.required_memory);
> + if (!nodeset)
> + return -1;
> +
> + if (VIR_ALLOC_N(tmp_cpumask, VIR_DOMAIN_CPUMASK_LEN) < 0) {
> + virReportOOMError();
> + return -1;
> + }
> +
> + if (virDomainCpuSetParse(nodeset, 0, tmp_cpumask,
> + VIR_DOMAIN_CPUMASK_LEN) < 0) {
> + VIR_FREE(tmp_cpumask);
> + VIR_FREE(nodeset);
> + return -1;
> + }
> +
> + for (i = 0; i < maxcpu && i < VIR_DOMAIN_CPUMASK_LEN; i++) {
> + if (tmp_cpumask[i])
> VIR_USE_CPU(cpumap, i);
> + }
> +
> + /* Update vm->def->cpumask */
> + VIR_FREE(vm->def->cpumask);
> + vm->def->cpumask = tmp_cpumask;
> + VIR_FREE(nodeset);
> } else {
> - /* You may think this is redundant, but we can't assume libvirtd
> - * itself is running on all pCPUs, so we need to explicitly set
> - * the spawned QEMU instance to all pCPUs if no map is given in
> - * its config file */
> - for (i = 0 ; i < maxcpu ; i++)
> - VIR_USE_CPU(cpumap, i);
> + if (vm->def->cpumask) {
> + /* XXX why don't we keep 'cpumask' in the libvirt cpumap
> + * format to start with ?!?! */
> + for (i = 0 ; i < maxcpu && i < vm->def->cpumasklen ; i++)
> + if (vm->def->cpumask[i])
> + VIR_USE_CPU(cpumap, i);
> + } else {
> + /* You may think this is redundant, but we can't assume libvirtd
> + * itself is running on all pCPUs, so we need to explicitly set
> + * the spawned QEMU instance to all pCPUs if no map is given in
> + * its config file */
> + for (i = 0 ; i < maxcpu ; i++)
> + VIR_USE_CPU(cpumap, i);
> + }
> }
>
> /* We are pressuming we are running between fork/exec of QEMU
> --
> 1.7.7.3
>
> --
> libvir-list mailing list
> libvir-list at redhat.com
> https://www.redhat.com/mailman/listinfo/libvir-list
More information about the libvir-list
mailing list