From bec7f1726f0edc90b9a432121e6230f92123e437 Mon Sep 17 00:00:00 2001 From: James Shubin Date: Tue, 10 Jan 2017 02:12:51 -0500 Subject: [PATCH] resources: virt: Allow hotplugging This allows hot (un)plugging of CPU's! It also includes some general cleanups which were necessary to support this as well as some other features to the virt resource. Hotunplug requires Fedora 25. It also comes with a mini shell script to help demo this capability. Many thanks to pkrempa for his help with the libvirt API! --- docs/resource-guide.md | 11 +- examples/virt4.yaml | 6 + misc/delta-cpu.sh | 81 ++++++ resources/virt.go | 642 ++++++++++++++++++++++++++++++++--------- test/test-bashfmt.sh | 2 +- 5 files changed, 603 insertions(+), 139 deletions(-) create mode 100755 misc/delta-cpu.sh diff --git a/docs/resource-guide.md b/docs/resource-guide.md index d961da62..fdc21060 100644 --- a/docs/resource-guide.md +++ b/docs/resource-guide.md @@ -108,10 +108,15 @@ opened in the `Init` method and were using throughout the resource. ```golang // Close runs some cleanup code for this resource. func (obj *FooRes) Close() error { + err := obj.conn.Close() // close some internal connection - obj.Conn.Close() // ignore error in this case - - return obj.BaseRes.Close() // call base close, b/c we're overriding + // call base close, b/c we're overriding + if e := obj.BaseRes.Close(); err == nil { + err = e + } else if e != nil { + err = multierr.Append(err, e) // list of errors + } + return err } ``` diff --git a/examples/virt4.yaml b/examples/virt4.yaml index efa5fb03..ca3b0752 100644 --- a/examples/virt4.yaml +++ b/examples/virt4.yaml @@ -3,7 +3,13 @@ graph: mygraph resources: virt: - name: mgmt4 + meta: + limit: .inf + burst: 0 uri: 'qemu:///session' + cpus: 1 + maxcpus: 4 + memory: 524288 boot: - hd disk: diff --git a/misc/delta-cpu.sh b/misc/delta-cpu.sh new file mode 100755 index 00000000..30fd2435 --- /dev/null +++ b/misc/delta-cpu.sh @@ -0,0 +1,81 @@ +#!/bin/bash +# shitty cpu count control, useful for live demos + +minimum=1 # don't decrease below this number of cpus +maximum=8 # don't increase above this number of cpus +count=1 # initial count +factor=3 +function output() { +count=$1 # arg! +cat << EOF > ~/code/mgmt/examples/virt4.yaml +--- +graph: mygraph +resources: + virt: + - name: mgmt4 + meta: + limit: .inf + burst: 0 + uri: 'qemu:///session' + cpus: $count + maxcpus: $maximum + memory: 524288 + boot: + - hd + disk: + - type: qcow2 + source: "~/.local/share/libvirt/images/fedora-23-scratch.qcow2" + state: running + transient: false +edges: [] +comment: "qemu-img create -b fedora-23.qcow2 -f qcow2 fedora-23-scratch.qcow2" +EOF +} +#tput cuu 1 && tput el # remove last line +str='' +tnuoc=$((maximum-count)) # backwards count +count2=$((count * factor)) +tnuoc2=$((tnuoc * factor)) +left=`yes '>' | head -$count2 | paste -s -d '' -` +right=`yes ' ' | head -$tnuoc2 | paste -s -d '' -` +str="${left}${right}" +_min=$((minimum-1)) +_max=$((maximum+1)) +reset # clean up once... +output $count # call function +while true; do + + read -n1 -r -s -p "CPUs count is: $count; ${str} Press +/- key to adjust." key + if [ "$key" = "q" ] || [ "$key" = "Q" ]; then + echo # newline + exit + fi + if [ ! "$key" = "+" ] && [ ! "$key" = "-" ] && [ ! "$key" = "=" ] && [ ! "$key" = "_" ]; then # wrong key + reset # woops, reset it all... 
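The `Close()` hunk in docs/resource-guide.md above aggregates the internal connection's close error with the base resource's close error instead of discarding either one. Below is a minimal, self-contained sketch of that pattern; `fooConn` and `baseClose` are hypothetical stand-ins for the resource's connection and for `BaseRes.Close()`, while the go-multierror import matches the one added to virt.go.

```golang
package main

import (
	"fmt"

	multierr "github.com/hashicorp/go-multierror"
)

// fooConn is a hypothetical stand-in for a connection opened in Init.
type fooConn struct{}

func (c *fooConn) Close() error { return fmt.Errorf("connection close failed") }

// baseClose stands in for the BaseRes.Close() call that the resource overrides.
func baseClose() error { return fmt.Errorf("base close failed") }

// closeAll keeps the first error and appends the base error to it,
// so neither failure is silently dropped.
func closeAll(c *fooConn) error {
	err := c.Close() // close some internal connection

	if e := baseClose(); err == nil {
		err = e
	} else if e != nil {
		err = multierr.Append(err, e) // list of errors
	}
	return err
}

func main() {
	fmt.Println(closeAll(&fooConn{}))
}
```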
+ continue + fi + if [ "$key" == "+" ] || [ "$key" == "=" ]; then + count=$((count+1)) + fi + if [ "$key" == "-" ] || [ "$key" == "_" ]; then + count=$((count-1)) + fi + if [ $count -eq $_min ]; then # min + count=$minimum + fi + if [ $count -eq $_max ]; then # max + count=$maximum + fi + + tnuoc=$((maximum-count)) # backwards count + #echo "count is: $count" + #echo "tnuoc is: $tnuoc" + count2=$((count * factor)) + tnuoc2=$((tnuoc * factor)) + left=`yes '>' | head -$count2 | paste -s -d '' -` + right=`yes ' ' | head -$tnuoc2 | paste -s -d '' -` + str="${left}${right}" + #echo "str is: $str" + echo -ne '\r' # backup + output $count # call function +done diff --git a/resources/virt.go b/resources/virt.go index b6418a7e..cd79c35e 100644 --- a/resources/virt.go +++ b/resources/virt.go @@ -31,7 +31,9 @@ import ( "github.com/purpleidea/mgmt/event" + multierr "github.com/hashicorp/go-multierror" "github.com/libvirt/libvirt-go" + libvirtxml "github.com/libvirt/libvirt-go-xml" errwrap "github.com/pkg/errors" ) @@ -39,6 +41,17 @@ func init() { gob.Register(&VirtRes{}) } +const ( + // DefaultMaxCPUs is the default number of possible cpu "slots" used. + DefaultMaxCPUs = 32 + + // MaxShutdownDelayTimeout is the max time we wait for a vm to shutdown. + MaxShutdownDelayTimeout = 60 * 5 // seconds + + // ShortPollInterval is how often we poll when expecting an event. + ShortPollInterval = 5 // seconds +) + var ( libvirtInitialized = false ) @@ -65,6 +78,7 @@ type VirtRes struct { State string `yaml:"state"` // running, paused, shutoff Transient bool `yaml:"transient"` // defined (false) or undefined (true) CPUs uint `yaml:"cpus"` + MaxCPUs uint `yaml:"maxcpus"` Memory uint64 `yaml:"memory"` // in KBytes OSInit string `yaml:"osinit"` // init used by lxc Boot []string `yaml:"boot"` // boot order. values: fd, hd, cdrom, network @@ -74,9 +88,19 @@ type VirtRes struct { Filesystem []filesystemDevice `yaml:"filesystem"` Auth *VirtAuth `yaml:"auth"` - conn *libvirt.Connect - absent bool // cached state - uriScheme virtURISchemeType + HotCPUs bool `yaml:"hotcpus"` // allow hotplug of cpus? + // FIXME: values here should be enum's! + RestartOnDiverge string `yaml:"restartondiverge"` // restart policy: "ignore", "ifneeded", "error" + RestartOnRefresh bool `yaml:"restartonrefresh"` // restart on refresh? + + conn *libvirt.Connect + version uint32 // major * 1000000 + minor * 1000 + release + absent bool // cached state + uriScheme virtURISchemeType + processExitWatch bool // do we want to wait on an explicit process exit? + processExitChan chan struct{} + restartScheduled bool // do we need to schedule a hard restart? + guestAgentConnected bool // our tracking of if guest agent is running } // Default returns some sensible defaults for this resource. @@ -85,9 +109,22 @@ func (obj *VirtRes) Default() Res { BaseRes: BaseRes{ MetaParams: DefaultMetaParams, // force a default }, + + MaxCPUs: DefaultMaxCPUs, + HotCPUs: true, // we're a dynamic engine, be dynamic by default! + + RestartOnDiverge: "error", // safest default :( } } +// Validate if the params passed in are valid data. +func (obj *VirtRes) Validate() error { + if obj.CPUs > obj.MaxCPUs { + return fmt.Errorf("the number of CPUs (%d) must not be greater than MaxCPUs (%d)", obj.CPUs, obj.MaxCPUs) + } + return obj.BaseRes.Validate() +} + // Init runs some startup code for this resource. 
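The new `Validate()` method rejects configurations where the live CPU count exceeds the number of vcpu slots baked into the domain definition (`MaxCPUs`, defaulting to `DefaultMaxCPUs = 32`). A reduced sketch of that rule, using a hypothetical `virtConfig` struct rather than the real `VirtRes`:

```golang
package main

import "fmt"

// virtConfig is a reduced, hypothetical stand-in for the new VirtRes fields.
type virtConfig struct {
	CPUs    uint // current (online) vcpus
	MaxCPUs uint // hotpluggable "slots", defaults to 32 in Default()
}

// validate mirrors the new Validate() rule: the current CPU count can never
// exceed the number of slots baked into the domain definition.
func validate(c virtConfig) error {
	if c.CPUs > c.MaxCPUs {
		return fmt.Errorf("the number of CPUs (%d) must not be greater than MaxCPUs (%d)", c.CPUs, c.MaxCPUs)
	}
	return nil
}

func main() {
	fmt.Println(validate(virtConfig{CPUs: 1, MaxCPUs: 4})) // <nil>
	fmt.Println(validate(virtConfig{CPUs: 8, MaxCPUs: 4})) // error
}
```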
func (obj *VirtRes) Init() error { if !libvirtInitialized { @@ -108,15 +145,71 @@ func (obj *VirtRes) Init() error { obj.absent = (obj.Transient && obj.State == "shutoff") // machine shouldn't exist + obj.conn, err = obj.connect() // gets closed in Close method of Res API + if err != nil { + return errwrap.Wrapf(err, "%s[%s]: Connection to libvirt failed in init", obj.Kind(), obj.GetName()) + } + + // check for hard to change properties + dom, err := obj.conn.LookupDomainByName(obj.GetName()) + if err == nil { + defer dom.Free() + } else if !isNotFound(err) { + return errwrap.Wrapf(err, "%s[%s]: Could not lookup on init", obj.Kind(), obj.GetName()) + } + + if err == nil { + // maxCPUs, err := dom.GetMaxVcpus() + i, err := dom.GetVcpusFlags(libvirt.DOMAIN_VCPU_MAXIMUM) + if err != nil { + return errwrap.Wrapf(err, "%s[%s]: Could not lookup MaxCPUs on init", obj.Kind(), obj.GetName()) + } + maxCPUs := uint(i) + if obj.MaxCPUs != maxCPUs { // max cpu slots is hard to change + // we'll need to reboot to fix this one... + obj.restartScheduled = true + } + + // parse running domain xml to read properties + // FIXME: should we do this in Watch, after we register the + // event handlers so that we don't miss any events via race? + xmlDesc, err := dom.GetXMLDesc(0) // 0 means no flags + if err != nil { + return errwrap.Wrapf(err, "%s[%s]: Could not GetXMLDesc on init", obj.Kind(), obj.GetName()) + } + domXML := &libvirtxml.Domain{} + if err := domXML.Unmarshal(xmlDesc); err != nil { + return errwrap.Wrapf(err, "%s[%s]: Could not unmarshal XML on init", obj.Kind(), obj.GetName()) + } + + // guest agent: domain->devices->channel->target->state == connected? + for _, x := range domXML.Devices.Channels { + if x.Target.Type == "virtio" && strings.HasPrefix(x.Target.Name, "org.qemu.guest_agent.") { + // last connection found wins (usually 1 anyways) + obj.guestAgentConnected = (x.Target.State == "connected") + } + } + } + obj.BaseRes.kind = "Virt" return obj.BaseRes.Init() // call base init, b/c we're overriding } -// Validate if the params passed in are valid data. -func (obj *VirtRes) Validate() error { - return obj.BaseRes.Validate() +// Close runs some cleanup code for this resource. +func (obj *VirtRes) Close() error { + // TODO: what is the first int Close return value useful for (if at all)? + _, err := obj.conn.Close() // close libvirt conn that was opened in Init + + // call base close, b/c we're overriding + if e := obj.BaseRes.Close(); err == nil { + err = e + } else if e != nil { + err = multierr.Append(err, e) // list of errors + } + return err } +// connect is the connect helper for the libvirt connection. It can handle auth. func (obj *VirtRes) connect() (conn *libvirt.Connect, err error) { if obj.Auth != nil { callback := func(creds []*libvirt.ConnectCredential) { @@ -139,21 +232,27 @@ func (obj *VirtRes) connect() (conn *libvirt.Connect, err error) { Callback: callback, } conn, err = libvirt.NewConnectWithAuth(obj.URI, auth, 0) + if err == nil { + if version, err := conn.GetLibVersion(); err == nil { + obj.version = version + } + } } if obj.Auth == nil || err != nil { conn, err = libvirt.NewConnect(obj.URI) + if err == nil { + if version, err := conn.GetLibVersion(); err == nil { + obj.version = version + } + } } return } // Watch is the primary listener for this resource and it outputs events. 
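`Init()` above parses the running domain's XML with libvirt-go-xml and records whether an `org.qemu.guest_agent.*` virtio channel is already connected. The sketch below shows the same detection logic against a sample XML snippet, but uses `encoding/xml` with hand-rolled structs as a stand-in so it runs without libvirt installed:

```golang
package main

import (
	"encoding/xml"
	"fmt"
	"strings"
)

// Minimal hand-rolled structs standing in for libvirt-go-xml's Domain type;
// only the fields needed for guest agent detection are modelled here.
type domain struct {
	Devices struct {
		Channels []struct {
			Target struct {
				Type  string `xml:"type,attr"`
				Name  string `xml:"name,attr"`
				State string `xml:"state,attr"`
			} `xml:"target"`
		} `xml:"channel"`
	} `xml:"devices"`
}

// agentConnected reports whether a connected org.qemu.guest_agent.* virtio
// channel is present, mirroring the check done in Init().
func agentConnected(xmlDesc string) (bool, error) {
	d := &domain{}
	if err := xml.Unmarshal([]byte(xmlDesc), d); err != nil {
		return false, err
	}
	connected := false
	for _, x := range d.Devices.Channels {
		if x.Target.Type == "virtio" && strings.HasPrefix(x.Target.Name, "org.qemu.guest_agent.") {
			connected = (x.Target.State == "connected") // last connection found wins
		}
	}
	return connected, nil
}

func main() {
	desc := `<domain><devices><channel type='unix'>
	<target type='virtio' name='org.qemu.guest_agent.0' state='connected'/>
	</channel></devices></domain>`
	fmt.Println(agentConnected(desc))
}
```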
func (obj *VirtRes) Watch(processChan chan *event.Event) error { - conn, err := obj.connect() - if err != nil { - return fmt.Errorf("Connection to libvirt failed with: %s", err) - } - - eventChan := make(chan libvirt.DomainEventType) // TODO: do we need to buffer this? + domChan := make(chan libvirt.DomainEventType) // TODO: do we need to buffer this? + gaChan := make(chan *libvirt.DomainEventAgentLifecycle) errorChan := make(chan error) exitChan := make(chan struct{}) defer close(exitChan) @@ -181,17 +280,32 @@ func (obj *VirtRes) Watch(processChan chan *event.Event) error { } }() - callback := func(c *libvirt.Connect, d *libvirt.Domain, ev *libvirt.DomainEventLifecycle) { + // domain events callback + domCallback := func(c *libvirt.Connect, d *libvirt.Domain, ev *libvirt.DomainEventLifecycle) { domName, _ := d.GetName() if domName == obj.GetName() { - eventChan <- ev.Event + domChan <- ev.Event } } - callbackID, err := conn.DomainEventLifecycleRegister(nil, callback) + // if dom is nil, we get events for *all* domains! + domCallbackID, err := obj.conn.DomainEventLifecycleRegister(nil, domCallback) if err != nil { return err } - defer conn.DomainEventDeregister(callbackID) + defer obj.conn.DomainEventDeregister(domCallbackID) + + // guest agent events callback + gaCallback := func(c *libvirt.Connect, d *libvirt.Domain, eva *libvirt.DomainEventAgentLifecycle) { + domName, _ := d.GetName() + if domName == obj.GetName() { + gaChan <- eva + } + } + gaCallbackID, err := obj.conn.DomainEventAgentLifecycleRegister(nil, gaCallback) + if err != nil { + return err + } + defer obj.conn.DomainEventDeregister(gaCallbackID) // notify engine that we're running if err := obj.Running(processChan); err != nil { @@ -202,8 +316,9 @@ func (obj *VirtRes) Watch(processChan chan *event.Event) error { var exit *error // if ptr exists, that is the exit error to return for { + processExited := false // did the process exit fully (shutdown)? select { - case event := <-eventChan: + case event := <-domChan: // TODO: shouldn't we do these checks in CheckApply ? switch event { case libvirt.DOMAIN_EVENT_DEFINED: @@ -235,11 +350,43 @@ func (obj *VirtRes) Watch(processChan chan *event.Event) error { obj.StateOK(false) // dirty send = true } + processExited = true + case libvirt.DOMAIN_EVENT_PMSUSPENDED: + // FIXME: IIRC, in s3 we can't cold change + // hardware like cpus but in s4 it's okay? + // verify, detect and patch appropriately! fallthrough case libvirt.DOMAIN_EVENT_CRASHED: obj.StateOK(false) // dirty send = true + processExited = true // FIXME: is this okay for PMSUSPENDED ? 
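`Watch()` now registers two libvirt callbacks, one for domain lifecycle events and one for guest agent lifecycle events, and multiplexes them in a single select loop. A stripped-down skeleton of that fan-in structure, with plain strings standing in for the libvirt event types:

```golang
package main

import (
	"fmt"
	"time"
)

// watch is a reduced skeleton of the Watch() select loop: two producers
// (domain lifecycle events and guest agent lifecycle events) feed separate
// channels, and one loop decides how to react to each.
func watch(domChan <-chan string, gaChan <-chan string, exitChan <-chan struct{}) {
	for {
		select {
		case ev := <-domChan:
			fmt.Println("domain event:", ev) // e.g. DOMAIN_EVENT_STOPPED
		case ev := <-gaChan:
			fmt.Println("guest agent event:", ev) // connected / disconnected
		case <-exitChan:
			return
		}
	}
}

func main() {
	domChan := make(chan string)
	gaChan := make(chan string)
	exitChan := make(chan struct{})

	go watch(domChan, gaChan, exitChan)
	domChan <- "DOMAIN_EVENT_STARTED"
	gaChan <- "AGENT_CONNECTED"
	time.Sleep(10 * time.Millisecond) // let the prints flush
	close(exitChan)
}
```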
+ } + + if obj.processExitWatch && processExited { + close(obj.processExitChan) // send signal + obj.processExitWatch = false + } + + case agentEvent := <-gaChan: + state, reason := agentEvent.State, agentEvent.Reason + + if state == libvirt.CONNECT_DOMAIN_EVENT_AGENT_LIFECYCLE_STATE_CONNECTED { + obj.guestAgentConnected = true + obj.StateOK(false) // dirty + send = true + log.Printf("%s[%s]: Guest agent connected", obj.Kind(), obj.GetName()) + + } else if state == libvirt.CONNECT_DOMAIN_EVENT_AGENT_LIFECYCLE_STATE_DISCONNECTED { + obj.guestAgentConnected = false + // ignore CONNECT_DOMAIN_EVENT_AGENT_LIFECYCLE_REASON_DOMAIN_STARTED + // events because they just tell you that guest agent channel was added + if reason == libvirt.CONNECT_DOMAIN_EVENT_AGENT_LIFECYCLE_REASON_CHANNEL { + log.Printf("%s[%s]: Guest agent disconnected", obj.Kind(), obj.GetName()) + } + + } else { + return fmt.Errorf("Unknown %s[%s] guest agent state: %v", obj.Kind(), obj.GetName(), state) } case err := <-errorChan: @@ -258,51 +405,9 @@ func (obj *VirtRes) Watch(processChan chan *event.Event) error { } } -// attrCheckApply performs the CheckApply functions for CPU, Memory and others. -// This shouldn't be called when the machine is absent; it won't be found! -func (obj *VirtRes) attrCheckApply(apply bool) (bool, error) { - var checkOK = true - - dom, err := obj.conn.LookupDomainByName(obj.GetName()) - if err != nil { - return false, errwrap.Wrapf(err, "conn.LookupDomainByName failed") - } - - domInfo, err := dom.GetInfo() - if err != nil { - // we don't know if the state is ok - return false, errwrap.Wrapf(err, "domain.GetInfo failed") - } - - // check memory - if domInfo.Memory != obj.Memory { - checkOK = false - if !apply { - return false, nil - } - if err := dom.SetMemory(obj.Memory); err != nil { - return false, errwrap.Wrapf(err, "domain.SetMemory failed") - } - log.Printf("%s[%s]: Memory changed", obj.Kind(), obj.GetName()) - } - - // check cpus - if domInfo.NrVirtCpu != obj.CPUs { - checkOK = false - if !apply { - return false, nil - } - if err := dom.SetVcpus(obj.CPUs); err != nil { - return false, errwrap.Wrapf(err, "domain.SetVcpus failed") - } - log.Printf("%s[%s]: CPUs changed", obj.Kind(), obj.GetName()) - } - - return checkOK, nil -} - -// domainCreate creates a transient or persistent domain in the correct state. It -// doesn't check the state before hand, as it is a simple helper function. +// domainCreate creates a transient or persistent domain in the correct state. +// It doesn't check the state before hand, as it is a simple helper function. +// The caller must run dom.Free() after use, when error was returned as nil. func (obj *VirtRes) domainCreate() (*libvirt.Domain, bool, error) { if obj.Transient { @@ -350,83 +455,20 @@ func (obj *VirtRes) domainCreate() (*libvirt.Domain, bool, error) { return dom, false, nil } -// CheckApply checks the resource state and applies the resource if the bool -// input is true. It returns error info and if the state check passed or not. -func (obj *VirtRes) CheckApply(apply bool) (bool, error) { - var err error - obj.conn, err = obj.connect() - if err != nil { - return false, fmt.Errorf("Connection to libvirt failed with: %s", err) - } - +// stateCheckApply starts, stops, or pauses/unpauses the domain as needed. 
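The guest agent branch above only flips the cached `guestAgentConnected` flag, and it deliberately stays quiet for disconnects whose reason is "domain started", since those merely report that the channel was added at boot. A small stand-alone sketch of that bookkeeping, with placeholder constants instead of the libvirt `CONNECT_DOMAIN_EVENT_AGENT_LIFECYCLE_*` values:

```golang
package main

import "fmt"

// Placeholder types standing in for the libvirt agent lifecycle constants.
type agentState int
type agentReason int

const (
	agentConnected agentState = iota
	agentDisconnected
)

const (
	reasonChannel       agentReason = iota // channel state actually changed
	reasonDomainStarted                    // channel was merely added at boot
)

// applyAgentEvent mirrors the Watch() branch: connects flip the cached flag
// to true, disconnects flip it to false, and "domain started" reasons are
// not treated as a disconnect worth logging.
func applyAgentEvent(connected *bool, state agentState, reason agentReason) {
	switch state {
	case agentConnected:
		*connected = true
		fmt.Println("guest agent connected")
	case agentDisconnected:
		*connected = false
		if reason == reasonChannel {
			fmt.Println("guest agent disconnected")
		}
	}
}

func main() {
	var connected bool
	applyAgentEvent(&connected, agentConnected, reasonChannel)
	applyAgentEvent(&connected, agentDisconnected, reasonDomainStarted) // silent
	fmt.Println("cached flag:", connected)
}
```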
+func (obj *VirtRes) stateCheckApply(apply bool, dom *libvirt.Domain) (bool, error) { var checkOK = true - - dom, err := obj.conn.LookupDomainByName(obj.GetName()) - if err == nil { - // pass - } else if virErr, ok := err.(libvirt.Error); ok && virErr.Code == libvirt.ERR_NO_DOMAIN { - // domain not found - if obj.absent { - return true, nil - } - - if !apply { - return false, nil - } - - var c = true - dom, c, err = obj.domainCreate() // create the domain - if err != nil { - return false, errwrap.Wrapf(err, "domainCreate failed") - } else if !c { - checkOK = false - } - - } else { - return false, errwrap.Wrapf(err, "LookupDomainByName failed") - } - defer dom.Free() - // domain exists - domInfo, err := dom.GetInfo() if err != nil { // we don't know if the state is ok return false, errwrap.Wrapf(err, "domain.GetInfo failed") } - isPersistent, err := dom.IsPersistent() - if err != nil { - // we don't know if the state is ok - return false, errwrap.Wrapf(err, "domain.IsPersistent failed") - } isActive, err := dom.IsActive() if err != nil { // we don't know if the state is ok return false, errwrap.Wrapf(err, "domain.IsActive failed") } - // check for persistence - if isPersistent == obj.Transient { // if they're different! - if !apply { - return false, nil - } - if isPersistent { - if err := dom.Undefine(); err != nil { - return false, errwrap.Wrapf(err, "domain.Undefine failed") - } - log.Printf("%s[%s]: Domain undefined", obj.Kind(), obj.GetName()) - } else { - domXML, err := dom.GetXMLDesc(libvirt.DOMAIN_XML_INACTIVE) - if err != nil { - return false, errwrap.Wrapf(err, "domain.GetXMLDesc failed") - } - if _, err = obj.conn.DomainDefineXML(domXML); err != nil { - return false, errwrap.Wrapf(err, "conn.DomainDefineXML failed") - } - log.Printf("%s[%s]: Domain defined", obj.Kind(), obj.GetName()) - } - checkOK = false - } - // check for valid state switch obj.State { case "running": @@ -490,6 +532,283 @@ func (obj *VirtRes) CheckApply(apply bool) (bool, error) { log.Printf("%s[%s]: Domain destroyed", obj.Kind(), obj.GetName()) } + return checkOK, nil +} + +// attrCheckApply performs the CheckApply functions for CPU, Memory and others. +// This shouldn't be called when the machine is absent; it won't be found! +func (obj *VirtRes) attrCheckApply(apply bool, dom *libvirt.Domain) (bool, error) { + var checkOK = true + domInfo, err := dom.GetInfo() + if err != nil { + // we don't know if the state is ok + return false, errwrap.Wrapf(err, "domain.GetInfo failed") + } + + // check (balloon) memory + // FIXME: check that we don't increase past max memory... + if domInfo.Memory != obj.Memory { + if !apply { + return false, nil + } + checkOK = false + if err := dom.SetMemory(obj.Memory); err != nil { + return false, errwrap.Wrapf(err, "domain.SetMemory failed") + } + log.Printf("%s[%s]: Memory changed to %d", obj.Kind(), obj.GetName(), obj.Memory) + } + + // check cpus + if domInfo.NrVirtCpu != obj.CPUs { + if !apply { + return false, nil + } + + // unused: DOMAIN_VCPU_CURRENT + switch domInfo.State { + case libvirt.DOMAIN_PAUSED: + // we can queue up the SetVcpus operation, + // which will be seen once vm is unpaused! 
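Both `stateCheckApply` and `attrCheckApply` follow the usual mgmt CheckApply contract: detect divergence, bail out early with `(false, nil)` when `apply` is false, otherwise make the change and report that the check did not pass. A compact illustration of that contract for the memory case; `setMemory` is a hypothetical setter standing in for `domain.SetMemory`:

```golang
package main

import "fmt"

// checkApplyMemory is a stand-alone illustration of the CheckApply contract:
// report (checkOK, err), only mutate when apply is true, and return
// checkOK=false whenever work was (or would be) done.
func checkApplyMemory(apply bool, actual *uint64, desired uint64, setMemory func(uint64) error) (bool, error) {
	if *actual == desired {
		return true, nil // already converged
	}
	if !apply {
		return false, nil // divergence detected, but we're only checking
	}
	if err := setMemory(desired); err != nil {
		return false, err
	}
	*actual = desired
	return false, nil // we changed something, so the check did not pass
}

func main() {
	mem := uint64(262144)
	set := func(v uint64) error { fmt.Println("SetMemory:", v); return nil }

	fmt.Println(checkApplyMemory(false, &mem, 524288, set)) // false <nil>
	fmt.Println(checkApplyMemory(true, &mem, 524288, set))  // false <nil>, applied
	fmt.Println(checkApplyMemory(true, &mem, 524288, set))  // true <nil>
}
```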
+ fallthrough + case libvirt.DOMAIN_RUNNING: + // cpu hot*un*plug introduced in 2.2.0 + // 2 * 1000000 + 2 * 1000 + 0 = 2002000 + if obj.HotCPUs && obj.version < 2002000 && domInfo.NrVirtCpu > obj.CPUs { + return false, fmt.Errorf("libvirt 2.2.0 or greater is required to hotunplug cpus") + } + // pkrempa says HOTPLUGGABLE is implied when doing LIVE + // on running machine, but we add it anyways in case we + // race and the machine is in shutoff state. We need to + // specify HOTPLUGGABLE if we add while machine is off! + // We particularly need to add HOTPLUGGABLE with CONFIG + flags := libvirt.DOMAIN_VCPU_LIVE + if !obj.Transient { + flags |= libvirt.DOMAIN_VCPU_CONFIG + // hotpluggable flag introduced in 2.4.0 + // 2 * 1000000 + 4 * 1000 + 0 = 2004000 + if obj.version >= 2004000 { + flags |= libvirt.DOMAIN_VCPU_HOTPLUGGABLE + } + } + if err := dom.SetVcpusFlags(obj.CPUs, flags); err != nil { + return false, errwrap.Wrapf(err, "domain.SetVcpus failed") + } + checkOK = false + log.Printf("%s[%s]: CPUs (hot) changed to %d", obj.Kind(), obj.GetName(), obj.CPUs) + + case libvirt.DOMAIN_SHUTOFF, libvirt.DOMAIN_SHUTDOWN: + if !obj.Transient { + flags := libvirt.DOMAIN_VCPU_CONFIG + if obj.version >= 2004000 { + flags |= libvirt.DOMAIN_VCPU_HOTPLUGGABLE + } + if err := dom.SetVcpusFlags(obj.CPUs, flags); err != nil { + return false, errwrap.Wrapf(err, "domain.SetVcpus failed") + } + checkOK = false + log.Printf("%s[%s]: CPUs (cold) changed to %d", obj.Kind(), obj.GetName(), obj.CPUs) + } + + default: + // FIXME: is this accurate? + return false, fmt.Errorf("can't modify cpus when in %v", domInfo.State) + } + } + + // modify the online aspect of the cpus with qemu-guest-agent + if obj.HotCPUs && obj.guestAgentConnected && domInfo.State != libvirt.DOMAIN_PAUSED { + + // if hotplugging a cpu without the guest agent, you might need: + // manually to: echo 1 > /sys/devices/system/cpu/cpu1/online OR + // udev (untested) in: /etc/udev/rules.d/99-hotplugCPU.rules + // SUBSYSTEM=="cpu",ACTION=="add",RUN+="/bin/sh -c '[ ! -e /sys$devpath/online ] || echo 1 > /sys$devpath/online'" + + // how many online cpus are there? + i, err := dom.GetVcpusFlags(libvirt.DOMAIN_VCPU_GUEST) + if err != nil { + return false, errwrap.Wrapf(err, "domain.GetVcpus failed from qemu-guest-agent") + } + onlineCPUs := uint(i) + if onlineCPUs != obj.CPUs { + if !apply { + return false, nil + } + if err := dom.SetVcpusFlags(obj.CPUs, libvirt.DOMAIN_VCPU_GUEST); err != nil { + return false, errwrap.Wrapf(err, "domain.SetVcpus failed") + } + checkOK = false + log.Printf("%s[%s]: CPUs (guest) changed to %d", obj.Kind(), obj.GetName(), obj.CPUs) + } + } + + return checkOK, nil +} + +// domainShutdownSync powers off a domain in a manner which will allow hardware +// to be changed while off. This requires the process to exit so that when it's +// called again, qemu can start up fresh as if we cold swapped in new hardware! +// This method is particularly special because it waits for shutdown to finish. +func (obj *VirtRes) domainShutdownSync(apply bool, dom *libvirt.Domain) (bool, error) { + // we need to wait for shutdown to be finished before we can restart it + once := true + timeout := time.After(time.Duration(MaxShutdownDelayTimeout) * time.Second) + + // wait until shutdown completion... 
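The hotplug paths above gate behaviour on `obj.version`, which libvirt's `GetLibVersion()` reports as `major*1000000 + minor*1000 + release`, hence the 2002000 (2.2.0, CPU hotunplug) and 2004000 (2.4.0, the HOTPLUGGABLE flag) thresholds. A tiny sketch of that encoding and the resulting gates:

```golang
package main

import "fmt"

// encode packs a libvirt version the same way GetLibVersion() reports it.
func encode(major, minor, release uint32) uint32 {
	return major*1000000 + minor*1000 + release
}

// decode splits a packed version back into its components.
func decode(v uint32) (major, minor, release uint32) {
	return v / 1000000, (v / 1000) % 1000, v % 1000
}

func main() {
	hotUnplug := encode(2, 2, 0)    // 2002000: cpu hotunplug support
	hotpluggable := encode(2, 4, 0) // 2004000: the HOTPLUGGABLE vcpu flag

	version := encode(2, 2, 1) // a hypothetical installed libvirt version
	fmt.Println(version >= hotUnplug)    // true: hotunplug is allowed
	fmt.Println(version >= hotpluggable) // false: skip the HOTPLUGGABLE flag
	fmt.Println(decode(version))         // 2 2 1
}
```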
+ for { + domInfo, err := dom.GetInfo() + if err != nil { + // we don't know if the state is ok + return false, errwrap.Wrapf(err, "domain.GetInfo failed") + } + if domInfo.State == libvirt.DOMAIN_SHUTOFF || domInfo.State == libvirt.DOMAIN_SHUTDOWN { + log.Printf("%s[%s]: Shutdown", obj.Kind(), obj.GetName()) + break + } + + if once { + if !apply { + return false, nil + } + obj.processExitWatch = true + obj.processExitChan = make(chan struct{}) + // if machine shuts down before we call this, we error; + // this isn't ideal, but it happened due to user error! + log.Printf("%s[%s]: Running shutdown", obj.Kind(), obj.GetName()) + if err := dom.Shutdown(); err != nil { + // FIXME: if machine is already shutdown completely, return early + return false, errwrap.Wrapf(err, "domain.Shutdown failed") + } + once = false // we did some work! + } + + select { + case <-obj.processExitChan: // should get a close signal... + // pass + case <-time.After(time.Duration(ShortPollInterval) * time.Second): + // poll until timeout in case no event ever arrives + // this happens when using Meta().Poll for example! + + // FIXME: some domains can reboot when asked to shutdown + // via the `on_poweroff` xml setting. in this case, we + // might only exit from here via timeout... avoid this! + // https://libvirt.org/formatdomain.html#elementsEvents + continue + case <-timeout: + return false, fmt.Errorf("%s[%s]: didn't shutdown after %d seconds", obj.Kind(), obj.GetName(), MaxShutdownDelayTimeout) + } + } + + return once, nil +} + +// CheckApply checks the resource state and applies the resource if the bool +// input is true. It returns error info and if the state check passed or not. +func (obj *VirtRes) CheckApply(apply bool) (bool, error) { + // if we do the restart, we must flip the flag back to false as evidence + var restart bool // do we need to do a restart? + if obj.RestartOnRefresh && obj.Refresh() { // a refresh is a restart ask + restart = true + } + + // we need to restart in all situations except ignore. the "error" case + // means that if a restart is actually needed, we should return an error + if obj.restartScheduled && obj.RestartOnDiverge != "ignore" { // "ignore", "ifneeded", "error" + restart = true + } + if !apply { + restart = false + } + + var checkOK = true + + dom, err := obj.conn.LookupDomainByName(obj.GetName()) + if err == nil { + // pass + } else if isNotFound(err) { + // domain not found + if obj.absent { + // we can ignore the restart var since we're not running + return true, nil + } + + if !apply { + return false, nil + } + + var c = true + dom, c, err = obj.domainCreate() // create the domain + if err != nil { + return false, errwrap.Wrapf(err, "domainCreate failed") + } else if !c { + checkOK = false + } + + } else { + return false, errwrap.Wrapf(err, "LookupDomainByName failed") + } + defer dom.Free() // the Free() for two possible domain objects above + // domain now exists + + isPersistent, err := dom.IsPersistent() + if err != nil { + // we don't know if the state is ok + return false, errwrap.Wrapf(err, "domain.IsPersistent failed") + } + // check for persistence + if isPersistent == obj.Transient { // if they're different! 
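`domainShutdownSync`'s loop above waits on three things at once: the process-exit channel closed by `Watch()`, a short poll interval for the case where no event ever arrives, and an overall timeout. The sketch below reproduces that wait strategy with scaled-down durations and a simulated guest; it is a pattern illustration, not the libvirt-backed original:

```golang
package main

import (
	"fmt"
	"sync/atomic"
	"time"
)

// waitForExit mirrors the wait strategy: prefer an event (the closed exit
// channel), also poll on a short interval in case no event ever arrives,
// and give up after a hard timeout.
func waitForExit(exitChan <-chan struct{}, isOff func() bool) error {
	timeout := time.After(500 * time.Millisecond) // stands in for MaxShutdownDelayTimeout
	for {
		if isOff() {
			return nil // shutdown observed
		}
		select {
		case <-exitChan: // closed once the process-exit event fires
			// pass; recheck the domain state at the top of the loop
		case <-time.After(10 * time.Millisecond): // stands in for ShortPollInterval
			// poll in case no event ever arrives (e.g. when using Meta().Poll)
		case <-timeout:
			return fmt.Errorf("didn't shut down in time")
		}
	}
}

func main() {
	var off atomic.Bool
	exitChan := make(chan struct{})
	go func() { // a simulated guest that powers off after a moment
		time.Sleep(30 * time.Millisecond)
		off.Store(true)
		close(exitChan)
	}()
	fmt.Println(waitForExit(exitChan, off.Load))
}
```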
+ if !apply { + return false, nil + } + if isPersistent { + if err := dom.Undefine(); err != nil { + return false, errwrap.Wrapf(err, "domain.Undefine failed") + } + log.Printf("%s[%s]: Domain undefined", obj.Kind(), obj.GetName()) + } else { + domXML, err := dom.GetXMLDesc(libvirt.DOMAIN_XML_INACTIVE) + if err != nil { + return false, errwrap.Wrapf(err, "domain.GetXMLDesc failed") + } + if _, err = obj.conn.DomainDefineXML(domXML); err != nil { + return false, errwrap.Wrapf(err, "conn.DomainDefineXML failed") + } + log.Printf("%s[%s]: Domain defined", obj.Kind(), obj.GetName()) + } + checkOK = false + } + + // shutdown here and let the stateCheckApply fix things up... + // TODO: i think this is the most straight forward process... + if !obj.absent && restart { + if c, err := obj.domainShutdownSync(apply, dom); err != nil { + return false, errwrap.Wrapf(err, "domainShutdownSync failed") + } else if !c { + checkOK = false + restart = false // clear the restart requirement... + } + } + + // FIXME: is doing this early check (therefore twice total) a good idea? + // run additional pre-emptive attr change checks here for hotplug stuff! + if !obj.absent { + if c, err := obj.attrCheckApply(apply, dom); err != nil { + return false, errwrap.Wrapf(err, "early attrCheckApply failed") + } else if !c { + checkOK = false + } + } + // TODO: do we need to run again below after we've booted up the domain? + + // apply correct machine state, eg: startup/shutoff/pause as needed + if c, err := obj.stateCheckApply(apply, dom); err != nil { + return false, errwrap.Wrapf(err, "stateCheckApply failed") + } else if !c { + checkOK = false + } + + // FIXME: should we wait to ensure machine is booted before continuing? + // it may be useful to wait for guest agent to hotplug some ram or cpu! + if !apply { return false, nil } @@ -497,17 +816,22 @@ func (obj *VirtRes) CheckApply(apply bool) (bool, error) { // mem & cpu checks... if !obj.absent { - if c, err := obj.attrCheckApply(apply); err != nil { + if c, err := obj.attrCheckApply(apply, dom); err != nil { return false, errwrap.Wrapf(err, "attrCheckApply failed") } else if !c { checkOK = false } } + // we had to do a restart, we didn't, and we should error if it was needed + if obj.restartScheduled && restart == true && obj.RestartOnDiverge == "error" { + return false, fmt.Errorf("%s[%s]: needed restart but didn't! (RestartOnDiverge: %v)", obj.Kind(), obj.GetName(), obj.RestartOnDiverge) + } + return checkOK, nil // w00t } -// Return the correct domain type based on the uri +// getDomainType returns the correct domain type based on the uri. func (obj VirtRes) getDomainType() string { switch obj.uriScheme { case lxcURI: @@ -517,7 +841,7 @@ func (obj VirtRes) getDomainType() string { } } -// Return the correct os type based on the uri +// getOSType returns the correct os type based on the uri. func (obj VirtRes) getOSType() string { switch obj.uriScheme { case lxcURI: @@ -536,13 +860,30 @@ func (obj VirtRes) getOSInit() string { } } +// getDomainXML returns the representative XML for a domain struct. +// FIXME: replace this with the libvirt-go-xml package instead! 
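`CheckApply` decides up front whether a restart is wanted (a refresh with `RestartOnRefresh` set, or a scheduled restart from a diverged hard property such as `MaxCPUs`, filtered through the `RestartOnDiverge` policy), and only errors at the end if a needed restart never happened under the "error" policy. A condensed sketch of just that decision:

```golang
package main

import "fmt"

// wantRestart condenses the restart bookkeeping from CheckApply: a refresh
// can request a restart, and a diverged hard property schedules one subject
// to the policy, which is one of "ignore", "ifneeded" or "error".
func wantRestart(apply, refresh, restartOnRefresh, restartScheduled bool, restartOnDiverge string) bool {
	restart := restartOnRefresh && refresh
	if restartScheduled && restartOnDiverge != "ignore" {
		restart = true
	}
	if !apply {
		restart = false // check-only runs never restart anything
	}
	return restart
}

func main() {
	// a diverged MaxCPUs with the default "error" policy asks for a restart
	fmt.Println(wantRestart(true, false, false, true, "error")) // true
	// with "ignore" we leave the running domain alone
	fmt.Println(wantRestart(true, false, false, true, "ignore")) // false
	// check-only runs never restart anything
	fmt.Println(wantRestart(false, true, true, true, "error")) // false
}
```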
 func (obj *VirtRes) getDomainXML() string {
 	var b string
 	b += obj.getDomainType() // start domain
 
 	b += fmt.Sprintf("<name>%s</name>", obj.GetName())
 	b += fmt.Sprintf("<memory unit='KiB'>%d</memory>", obj.Memory)
-	b += fmt.Sprintf("<vcpu>%d</vcpu>", obj.CPUs)
+
+	if obj.HotCPUs {
+		b += fmt.Sprintf("<vcpu current='%d'>%d</vcpu>", obj.CPUs, obj.MaxCPUs)
+		b += fmt.Sprintf("<vcpus>")
+		b += fmt.Sprintf("<vcpu id='0' enabled='yes' hotpluggable='no'/>") // zeroth cpu can't change
+		for i := uint(1); i < obj.MaxCPUs; i++ { // skip first entry
+			enabled := "no"
+			if i < obj.CPUs {
+				enabled = "yes"
+			}
+			b += fmt.Sprintf("<vcpu id='%d' enabled='%s' hotpluggable='yes'/>", i, enabled)
+		}
+		b += fmt.Sprintf("</vcpus>")
+	} else {
+		b += fmt.Sprintf("<vcpu>%d</vcpu>", obj.CPUs)
+	}
 
 	b += "<os>"
 	b += obj.getOSType()
@@ -554,6 +895,13 @@ func (obj *VirtRes) getDomainXML() string {
 	}
 	b += fmt.Sprintf("</os>")
 
+	if obj.HotCPUs {
+		// acpi is needed for cpu hotplug support
+		b += fmt.Sprintf("<features>")
+		b += fmt.Sprintf("<acpi/>")
+		b += fmt.Sprintf("</features>")
+	}
+
 	b += fmt.Sprintf("<devices>") // start devices
 
 	if obj.Disk != nil {
@@ -580,6 +928,16 @@ func (obj *VirtRes) getDomainXML() string {
 		}
 	}
 
+	// qemu guest agent is not *required* for hotplugging cpus but
+	// it helps because it can ask the host to make them online...
+	if obj.HotCPUs {
+		// enable qemu guest agent (on the host side)
+		b += fmt.Sprintf("<channel type='unix'>")
+		b += fmt.Sprintf("<source mode='bind'/>")
+		b += fmt.Sprintf("<target type='virtio' name='org.qemu.guest_agent.0'/>")
+		b += fmt.Sprintf("</channel>")
+	}
+
 	b += "<serial type='pty'><target port='0'/></serial>"
 	b += "<console type='pty'><target type='serial' port='0'/></console>"
 	b += "</devices>" // end devices
@@ -718,6 +1076,12 @@ func (obj *VirtRes) Compare(res Res) bool {
 	if obj.CPUs != res.CPUs {
 		return false
 	}
+	// we can't change this property while machine is running!
+	// we do need to return false, so that a new struct gets built,
+	// which will cause at least one Init() & CheckApply() to run.
+	if obj.MaxCPUs != res.MaxCPUs {
+		return false
+	}
 	// TODO: can we skip the compare of certain properties such as
 	// Memory because this object (but with different memory) can be
 	// *converted* into the new version that has more/less memory?
@@ -748,10 +1112,6 @@ func (obj *VirtRes) Compare(res Res) bool {
 	return true
 }
 
-// CollectPattern applies the pattern for collection resources.
-func (obj *VirtRes) CollectPattern(string) {
-}
-
 // UnmarshalYAML is the custom unmarshal handler for this struct.
 // It is primarily useful for setting the defaults.
 func (obj *VirtRes) UnmarshalYAML(unmarshal func(interface{}) error) error {
@@ -781,6 +1141,18 @@ func randMAC() string {
 		fmt.Sprintf(":%x", rand.Intn(255))
 }
 
+// isNotFound tells us if this is a domain not found error.
+func isNotFound(err error) bool {
+	if err == nil {
+		return false
+	}
+	if virErr, ok := err.(libvirt.Error); ok && virErr.Code == libvirt.ERR_NO_DOMAIN {
+		// domain not found
+		return true
+	}
+	return false // some other error
+}
+
 // expandHome does a simple expansion of the tilde into your $HOME value.
 func expandHome(p string) (string, error) {
 	// TODO: this doesn't match strings of the form: ~james/...
diff --git a/test/test-bashfmt.sh b/test/test-bashfmt.sh
index 6f378e4c..39944f7b 100755
--- a/test/test-bashfmt.sh
+++ b/test/test-bashfmt.sh
@@ -14,7 +14,7 @@ ROOT=$(dirname "${BASH_SOURCE}")/..
 cd "${ROOT}"
 
 find_files() {
-	git ls-files | grep -e '\.sh$' -e '\.bash$'
+	git ls-files | grep -e '\.sh$' -e '\.bash$' | grep -v 'misc/delta-cpu.sh'
 }
 
 bad_files=$(
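For reference, the hotplug-aware vcpu section that `getDomainXML` builds can be exercised on its own: one `<vcpu current=...>` element plus an explicit `<vcpus>` list where cpu 0 is pinned and the remaining slots are hotpluggable. The sketch below approximates that output for the examples/virt4.yaml values; treat the exact attributes as illustrative rather than the resource's literal output:

```golang
package main

import "fmt"

// buildVcpuXML sketches the hotplug-aware vcpu section of the domain XML:
// a <vcpu current=...> element plus an explicit <vcpus> list in which cpu 0
// is fixed and every other slot is hotpluggable, enabled up to the current
// count.
func buildVcpuXML(cpus, maxCPUs uint) string {
	var b string
	b += fmt.Sprintf("<vcpu current='%d'>%d</vcpu>", cpus, maxCPUs)
	b += "<vcpus>"
	b += "<vcpu id='0' enabled='yes' hotpluggable='no'/>" // zeroth cpu can't change
	for i := uint(1); i < maxCPUs; i++ {                  // skip first entry
		enabled := "no"
		if i < cpus {
			enabled = "yes"
		}
		b += fmt.Sprintf("<vcpu id='%d' enabled='%s' hotpluggable='yes'/>", i, enabled)
	}
	b += "</vcpus>"
	return b
}

func main() {
	fmt.Println(buildVcpuXML(1, 4)) // the examples/virt4.yaml values: cpus: 1, maxcpus: 4
}
```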