// Mgmt
// Copyright (C) 2013-2024+ James Shubin and the project contributors
// Written by James Shubin <james@shubin.ca> and the project contributors
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program. If not, see <http://www.gnu.org/licenses/>.

// TODO: remove race around leader operations
// TODO: fix unstarted member
// TODO: add VIP for servers (incorporate with net resource)
// TODO: auto assign ports/ip's for peers (if possible)
// TODO: check the shutdown ordering, so everything unrolls to a shutdown
// TODO: add the converger Register/Unregister stuff and timers if needed

// Package etcd implements the distributed key value store and fs integration.
// This also takes care of managing and clustering of the embedded etcd server.
// The automatic clustering is considered experimental. If you require a more
// robust, battle-tested etcd cluster, then manage your own, and point each
// mgmt agent at it with --seeds and --no-server.
//
// # Algorithm
//
// The elastic etcd algorithm works in the following way:
//
// * When you start up mgmt, you can pass it a list of seeds.
//
// * If no seeds are given, then assume you are the first server and startup.
//
// * If a seed is given, connect as a client, and volunteer to be a server.
//
// * All volunteering clients should listen for a message for nomination.
//
// * If a client has been nominated, it should startup a server.
//
// * A server should shutdown if its nomination is removed.
//
// * The elected leader should decide who to nominate/unnominate as needed.
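//
// For orientation, this is roughly the keyspace that the above steps read and
// write (the hostname h1 and the /_mgmt namespace prefix are examples taken
// from the smoke test below, not requirements):
//
//	/_mgmt/volunteer/h1 -> urls where h1 is willing to run a server (peer)
//	/_mgmt/nominated/h1 -> urls for the hosts the leader chose as servers
//	/_mgmt/endpoints/h1 -> client urls that a running server advertises
//	/_mgmt/chooser/...  -> scratch space for the chooser implementation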
//
// # Notes
//
// If you attempt to add a new member to the cluster with a duplicate hostname,
// then the behaviour is undefined, and you could bork your cluster. This is not
// recommended or supported. Please ensure that your hostnames are unique.
//
// A single ^C requests an orderly shutdown, however a third ^C will ask etcd to
// shutdown forcefully. It is not recommended that you use this option; it
// exists as a way to make exit easier if something deadlocked the cluster. If
// this was due to user error (eg: duplicate hostnames) then it was your fault,
// but if the member did not shutdown from a single ^C under normal
// circumstances, then please file a bug.
//
// There are currently some races in this implementation. In practice, this
// should not cause any adverse effects unless you simultaneously add or remove
// members at a high rate. Fixing these races will probably require some
// internal changes to etcd. Help is welcome if you're interested in working on
// this.
//
// # Smoke testing
//
// Here is a simple way to test etcd clustering basics...
//
//	./mgmt run --tmp-prefix --no-pgp --hostname h1 empty
//	./mgmt run --tmp-prefix --no-pgp --hostname h2 --seeds http://127.0.0.1:2379 --client-urls http://127.0.0.1:2381 --server-urls http://127.0.0.1:2382 empty
//	./mgmt run --tmp-prefix --no-pgp --hostname h3 --seeds http://127.0.0.1:2379 --client-urls http://127.0.0.1:2383 --server-urls http://127.0.0.1:2384 empty
//	ETCDCTL_API=3 etcdctl --endpoints 127.0.0.1:2379 put /_mgmt/chooser/dynamicsize/idealclustersize 3
//	./mgmt run --tmp-prefix --no-pgp --hostname h4 --seeds http://127.0.0.1:2379 --client-urls http://127.0.0.1:2385 --server-urls http://127.0.0.1:2386 empty
//	./mgmt run --tmp-prefix --no-pgp --hostname h5 --seeds http://127.0.0.1:2379 --client-urls http://127.0.0.1:2387 --server-urls http://127.0.0.1:2388 empty
//	ETCDCTL_API=3 etcdctl --endpoints 127.0.0.1:2379 member list
//	ETCDCTL_API=3 etcdctl --endpoints 127.0.0.1:2381 put /_mgmt/chooser/dynamicsize/idealclustersize 5
//	ETCDCTL_API=3 etcdctl --endpoints 127.0.0.1:2381 member list
//
// # Bugs
//
// A member might occasionally think that an endpoint still exists after it has
// already shutdown. This isn't a major issue, since if that endpoint doesn't
// respond, then it will automatically choose the next available one. To see
// this issue, turn on debugging and start: H1, H2, H3, then stop H2, and you
// might see that H3 still knows about H2.
//
// Shutting down a cluster by setting the idealclustersize to zero is currently
// buggy and not supported. Try this at your own risk.
//
// If a member is nominated, and it doesn't respond to the nominate event and
// startup, and we lost quorum to add it, then we could be in a blocked state.
// This can be improved upon if we can call memberRemove after a timeout.
//
// Adding new cluster members very quickly might trigger a:
// `runtime error: error validating peerURLs ... member count is unequal` error.
// See: https://github.com/etcd-io/etcd/issues/10626 for more information.
//
// If you use the dynamic size feature to start and stop the server process,
// once it has already started and then stopped, it can't be re-started because
// of a bug in etcd that doesn't free the port. Instead you'll get a:
// `bind: address already in use` error. See:
// https://github.com/etcd-io/etcd/issues/6042 for more information.
package etcd

import (
	"context"
	"fmt"
	"net/url"
	"os"
	"sort"
	"strings"
	"sync"
	"time"

	"github.com/purpleidea/mgmt/converger"
	"github.com/purpleidea/mgmt/etcd/chooser"
	"github.com/purpleidea/mgmt/etcd/client"
	"github.com/purpleidea/mgmt/etcd/interfaces"
	etcdUtil "github.com/purpleidea/mgmt/etcd/util"
	"github.com/purpleidea/mgmt/util"
	"github.com/purpleidea/mgmt/util/errwrap"

	etcdtypes "go.etcd.io/etcd/client/pkg/v3/types"
	etcd "go.etcd.io/etcd/client/v3"
	"go.etcd.io/etcd/client/v3/concurrency"
	"go.etcd.io/etcd/client/v3/namespace"
	"go.etcd.io/etcd/server/v3/embed"
)

const (
	// TODO: figure out a trailing slash convention...

	// NominatedPath is the unprefixed path under which nominated hosts are
	// stored. This is public so that other consumers can know to avoid this
	// key prefix.
	NominatedPath    = "/nominated/"
	nominatedPathFmt = NominatedPath + "%s" // takes a hostname on the end

	// VolunteerPath is the unprefixed path under which volunteering hosts
	// are stored. This is public so that other consumers can know to avoid
	// this key prefix.
	VolunteerPath    = "/volunteer/"
	volunteerPathFmt = VolunteerPath + "%s" // takes a hostname on the end

	// EndpointsPath is the unprefixed path under which the advertised host
	// endpoints are stored. This is public so that other consumers can know
	// to avoid this key prefix.
	EndpointsPath    = "/endpoints/"
	endpointsPathFmt = EndpointsPath + "%s" // takes a hostname on the end

	// ChooserPath is the unprefixed path under which the chooser algorithm
	// may store data. This is public so that other consumers can know to
	// avoid this key prefix.
	ChooserPath = "/chooser" // all hosts share the same namespace

	// ConvergedPath is the unprefixed path under which the converger
	// may store data. This is public so that other consumers can know to
	// avoid this key prefix.
	ConvergedPath    = "/converged/"
	convergedPathFmt = ConvergedPath + "%s" // takes a hostname on the end

	// SchedulerPath is the unprefixed path under which the scheduler
	// may store data. This is public so that other consumers can know to
	// avoid this key prefix.
	SchedulerPath    = "/scheduler/"
	schedulerPathFmt = SchedulerPath + "%s" // takes a namespace on the end

	// DefaultClientURL is the default value that is used for client URLs.
	// It is pulled from the upstream etcd package.
	DefaultClientURL = embed.DefaultListenClientURLs // 127.0.0.1:2379

	// DefaultServerURL is the default value that is used for server URLs.
	// It is pulled from the upstream etcd package.
	DefaultServerURL = embed.DefaultListenPeerURLs // 127.0.0.1:2380

	// DefaultMaxTxnOps is the maximum number of operations to run in a
	// single etcd transaction. If you exceed this limit, it is possible
	// that you have either an extremely large code base, or that you have
	// some code which is possibly not as efficient as it could be. Let us
	// know so that we can analyze the situation, and increase this if
	// necessary.
	DefaultMaxTxnOps = 512

	// RunStartupTimeout is the amount of time we will wait for regular run
	// startup before cancelling it all.
	RunStartupTimeout = 30 * time.Second

	// ClientDialTimeout is the DialTimeout option in the client config.
	ClientDialTimeout = 5 * time.Second

	// ClientDialKeepAliveTime is the DialKeepAliveTime config value for the
	// etcd client. It is recommended that you use this so that dead
	// endpoints don't block any cluster operations.
	ClientDialKeepAliveTime = 2 * time.Second // from etcdctl
	// ClientDialKeepAliveTimeout is the DialKeepAliveTimeout config value
	// for the etcd client. It is recommended that you use this so that dead
	// endpoints don't block any cluster operations.
	ClientDialKeepAliveTimeout = 6 * time.Second // from etcdctl

	// MemberChangeInterval is the polling interval to use when watching for
	// member changes during add or remove.
	MemberChangeInterval = 500 * time.Millisecond

	// SelfRemoveTimeout gives unnominated members a chance to self exit.
	SelfRemoveTimeout = 10 * time.Second

	// ForceExitTimeout is the amount of time we will wait for a force exit
	// to occur before cancelling it all.
	ForceExitTimeout = 15 * time.Second

	// SessionTTL is the number of seconds to wait before a dead or
	// unresponsive host has their volunteer keys removed from the cluster.
	// This should be an integer multiple of seconds, since one second is
	// the TTL precision used in etcd.
	SessionTTL = 10 * time.Second // seconds

	// RequireLeaderCtx specifies whether the volunteer loop should use the
	// WithRequireLeader ctx wrapper. It is unknown at this time if this
	// would cause occasional events to be lost; more extensive testing is
	// needed.
	RequireLeaderCtx = false

	// ConvergerHostnameNamespace is a unique key used in the converger.
	ConvergerHostnameNamespace = "etcd-hostname"
)
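
// As a concrete example of how these path fragments combine: with NS set to
// "/_mgmt" (as in the smoke test above) and a hostname of "h1", the nominated
// key for that host would be "/_mgmt/nominated/h1", since the NS string gets
// prefixed onto every key operation.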

// EmbdEtcd provides the embedded server and client etcd functionality.
type EmbdEtcd struct { // EMBeddeD etcd
	Hostname string

	// Seeds is the list of servers that this client could connect to.
	Seeds etcdtypes.URLs

	// ClientURLs are the locations to listen for clients if I am a server.
	ClientURLs etcdtypes.URLs
	// ServerURLs are the locations to listen for servers (peers) if I am a
	// server (peer).
	ServerURLs etcdtypes.URLs
	// AClientURLs are the client urls to advertise.
	AClientURLs etcdtypes.URLs
	// AServerURLs are the server (peer) urls to advertise.
	AServerURLs etcdtypes.URLs

	// NoServer disables all server peering for this host.
	// TODO: allow changing this at runtime with some function call?
	NoServer bool
	// NoNetwork causes this to use unix:// sockets instead of TCP for
	// connections.
	NoNetwork bool

	// Chooser is the implementation of the algorithm that decides which
	// hosts to add or remove to grow and shrink the cluster.
	Chooser chooser.Chooser
	chooser chooser.Chooser // the one we use if it's active

	// Converger is a converged coordinator object that can be used to
	// track the converged state.
	Converger *converger.Coordinator

	// NS is a string namespace that we prefix to every key operation.
	NS string

	// Prefix is the directory where any etcd related state is stored. It
	// must be an absolute directory path.
	Prefix string

	Debug bool
	Logf  func(format string, v ...interface{})

	wg       *sync.WaitGroup
	exit     *util.EasyExit // exit signal
	closing  bool           // are we closing ?
	hardexit *util.EasyExit // hard exit signal (to unblock borked things)

	errChan chan error // global error chan, closes when Run is done

	// errExit1 ... errExitN all must get closed for errChan to close.
	errExit1 chan struct{} // connect
	errExit2 chan struct{} // chooser
	errExit3 chan struct{} // nominate
	errExit4 chan struct{} // volunteer
	errExit5 chan struct{} // endpoints
	errExitN chan struct{} // special signal for server closing (starts/stops)

	// coordinate an organized exit so we wait for everyone without blocking
	activeExit1   bool
	activeExit2   bool
	activeExit3   bool
	activeExit4   bool
	activeExit5   bool
	activateExit1 *util.EasyAckOnce
	activateExit2 *util.EasyAckOnce
	activateExit3 *util.EasyAckOnce
	activateExit4 *util.EasyAckOnce
	activateExit5 *util.EasyAckOnce

	readySignal chan struct{} // closes when we're up and running
	exitsSignal chan struct{} // closes when run exits

	// locally tracked state

	// nominated is a local cache of who's been nominated. This contains
	// values for where a *server* would connect to. It gets updated
	// primarily in the nominateCb watcher loop.
	// TODO: maybe this should just be a list?
	// TODO: is there a difference here between ServerURLs and AServerURLs ?
	nominated etcdtypes.URLsMap // map[hostname]URLs

	// volunteers is a local cache of who's volunteered. This contains
	// values for where a *server* would connect to. It gets updated
	// primarily in the volunteerCb watcher loop.
	// TODO: maybe this should just be a list?
	// TODO: is there a difference here between ServerURLs and AServerURLs ?
	volunteers etcdtypes.URLsMap // map[hostname]URLs

	// membermap is a local cache of server endpoints. This contains values
	// for where a *server* (peer) would connect to. It gets updated in the
	// membership state functions.
	membermap etcdtypes.URLsMap // map[hostname]URLs

	// endpoints is a local cache of server endpoints. It differs from the
	// config value which is a flattened representation of the same. That
	// value can be seen via client.Endpoints() and client.SetEndpoints().
	// This contains values for where a *client* would connect to. It gets
	// updated in the membership state functions.
	endpoints etcdtypes.URLsMap // map[hostname]URLs

	// memberIDs is a local cache of which cluster servers (peers) are
	// associated with each memberID. It gets updated in the membership
	// state functions. Note that unstarted members have an ID, but no name
	// yet, so they aren't included here, since that key would be the empty
	// string.
	memberIDs map[string]uint64 // map[hostname]memberID

	// behaviour mutexes
	stateMutex     *sync.RWMutex // lock around all locally tracked state
	orderingMutex  *sync.Mutex   // lock around non-concurrent changes
	nominatedMutex *sync.Mutex   // lock around nominatedCb
	volunteerMutex *sync.Mutex   // lock around volunteerCb

	// client related
	etcd          *etcd.Client
	connectSignal chan struct{}        // TODO: use a SubscribedSignal instead?
	client        *client.Simple       // provides useful helper methods
	clients       []*client.Simple     // list of registered clients
	session       *concurrency.Session // session that expires on disconnect
	leaseID       etcd.LeaseID         // the leaseID used by this session

	// server related
	server            *embed.Etcd            // contains the server struct
	serverID          uint64                 // uint64 because memberRemove uses that
	serverwg          *sync.WaitGroup        // wait for server to shutdown
	servermu          *sync.Mutex            // lock around destroy server
	serverExit        *util.EasyExit         // exit signal
	serverReadySignal *util.SubscribedSignal // signals when server is up and running
	serverExitsSignal *util.SubscribedSignal // signals when runServer exits

	// task queue state
	taskQueue        []*task
	taskQueueWg      *sync.WaitGroup
	taskQueueLock    *sync.Mutex
	taskQueueRunning bool
	taskQueueID      int
}
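
// A rough lifecycle sketch for this struct follows. It is illustrative, not a
// definitive contract, and the &chooser.DynamicSize{} literal is an assumption
// based on the dynamicsize key shown in the smoke test above:
//
//	embdEtcd := &EmbdEtcd{
//		Hostname: "h1",
//		Seeds:    nil, // no seeds means we bootstrap as the first server
//		Chooser:  &chooser.DynamicSize{IdealClusterSize: 1},
//		NS:       "/_mgmt",
//		Prefix:   "/tmp/mgmt-etcd/",
//		Logf:     log.Printf, // or any compatible logger
//	}
//	if err := embdEtcd.Init(); err != nil { // Init calls Validate for us
//		return err
//	}
//	defer embdEtcd.Close()
//	go embdEtcd.Run()        // blocks until shutdown
//	defer embdEtcd.Destroy() // triggers an exit in Run
//	<-embdEtcd.Ready()       // wait until the client has connected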

// sessionTTLSec transforms the time representation into the nearest number of
// seconds, which is needed by the etcd API.
func sessionTTLSec(d time.Duration) int {
	return int(d.Seconds())
}
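
// For example, the SessionTTL const of 10 * time.Second above becomes the
// integer 10, which is the form that concurrency.WithTTL expects in Run below.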

// Validate the initial struct. This is called from Init, but it can be used if
// you would like to check that your configuration is correct.
func (obj *EmbdEtcd) Validate() error {
	s := sessionTTLSec(SessionTTL)
	if s <= 0 {
		return fmt.Errorf("the SessionTTL const of %s (%d sec) must be greater than zero", SessionTTL.String(), s)
	}
	if s > etcd.MaxLeaseTTL {
		return fmt.Errorf("the SessionTTL const of %s (%d sec) must be less than %d sec", SessionTTL.String(), s, etcd.MaxLeaseTTL)
	}

	if obj.Hostname == "" {
		return fmt.Errorf("the Hostname was not specified")
	}

	if obj.NoServer && len(obj.Seeds) == 0 {
		return fmt.Errorf("need at least one seed if NoServer is true")
	}

	if !obj.NoServer { // you don't need a Chooser if there's no server...
		if obj.Chooser == nil {
			return fmt.Errorf("need to specify a Chooser implementation")
		}
		if err := obj.Chooser.Validate(); err != nil {
			return errwrap.Wrapf(err, "the Chooser did not validate")
		}
	}

	if obj.NoNetwork {
		if len(obj.Seeds) != 0 || len(obj.ClientURLs) != 0 || len(obj.ServerURLs) != 0 {
			return fmt.Errorf("option NoNetwork is mutually exclusive with Seeds, ClientURLs and ServerURLs")
		}
	}

	if _, err := etcdUtil.CopyURLs(obj.Seeds); err != nil { // this will validate
		return errwrap.Wrapf(err, "the Seeds are not valid")
	}

	if obj.NS == "/" {
		return fmt.Errorf("the namespace should be empty instead of /")
	}
	if strings.HasSuffix(obj.NS, "/") {
		return fmt.Errorf("the namespace should not end in /")
	}

	if obj.Prefix == "" || obj.Prefix == "/" {
		return fmt.Errorf("the prefix of `%s` is invalid", obj.Prefix)
	}

	if obj.Logf == nil {
		return fmt.Errorf("no Logf function was specified")
	}

	return nil
}
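
// For example, a client-only configuration (NoServer: true) with an empty
// Seeds list fails validation, since a pure client has nothing to connect to:
//
//	e := &EmbdEtcd{Hostname: "h1", NoServer: true} // no Seeds given
//	err := e.Validate() // "need at least one seed if NoServer is true"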

// Init initializes the struct after it has been populated as desired. You must
// not use the struct if this returns an error.
func (obj *EmbdEtcd) Init() error {
	if err := obj.Validate(); err != nil {
		return errwrap.Wrapf(err, "validate error")
	}

	if obj.ClientURLs == nil {
		obj.ClientURLs = []url.URL{} // initialize
	}
	if obj.ServerURLs == nil {
		obj.ServerURLs = []url.URL{}
	}
	if obj.AClientURLs == nil {
		obj.AClientURLs = []url.URL{}
	}
	if obj.AServerURLs == nil {
		obj.AServerURLs = []url.URL{}
	}

	curls, err := obj.curls()
	if err != nil {
		return err
	}
	surls, err := obj.surls()
	if err != nil {
		return err
	}
	if !obj.NoServer {
		// add a default
		if len(curls) == 0 {
			u, err := url.Parse(DefaultClientURL)
			if err != nil {
				return err
			}
			obj.ClientURLs = []url.URL{*u}
		}
		// add a default for local use and testing, harmless and useful!
		if len(surls) == 0 {
			u, err := url.Parse(DefaultServerURL) // default
			if err != nil {
				return err
			}
			obj.ServerURLs = []url.URL{*u}
		}

		// TODO: if we don't have any localhost URLs, should we warn so
		// that our local client can be able to connect more easily?
		if len(etcdUtil.LocalhostURLs(obj.ClientURLs)) == 0 {
			u, err := url.Parse(DefaultClientURL)
			if err != nil {
				return err
			}
			obj.ClientURLs = append([]url.URL{*u}, obj.ClientURLs...) // prepend
		}
	}

	if obj.NoNetwork {
		var err error
		// FIXME: convince etcd to store these files in our obj.Prefix!
		obj.ClientURLs, err = etcdtypes.NewURLs([]string{"unix://clients.sock:0"})
		if err != nil {
			return err
		}
		obj.ServerURLs, err = etcdtypes.NewURLs([]string{"unix://servers.sock:0"})
		if err != nil {
			return err
		}
	}

	if !obj.NoServer { // you don't need a Chooser if there's no server...
		data := &chooser.Data{
			Hostname: obj.Hostname,
			Debug:    obj.Debug,
			Logf: func(format string, v ...interface{}) {
				obj.Logf("chooser: "+format, v...)
			},
		}
		obj.chooser = obj.Chooser // copy
		if err := obj.chooser.Init(data); err != nil {
			return errwrap.Wrapf(err, "error initializing chooser")
		}
	}

	if err := os.MkdirAll(obj.Prefix, 0770); err != nil {
		return errwrap.Wrapf(err, "couldn't mkdir: %s", obj.Prefix)
	}

	obj.wg = &sync.WaitGroup{}
	obj.exit = util.NewEasyExit()
	obj.hardexit = util.NewEasyExit()

	obj.errChan = make(chan error)

	obj.errExit1 = make(chan struct{})
	obj.errExit2 = make(chan struct{})
	obj.errExit3 = make(chan struct{})
	obj.errExit4 = make(chan struct{})
	obj.errExit5 = make(chan struct{})
	obj.errExitN = make(chan struct{}) // done before call to runServer!
	close(obj.errExitN)                // starts closed

	//obj.activeExit1 = false
	//obj.activeExit2 = false
	//obj.activeExit3 = false
	//obj.activeExit4 = false
	//obj.activeExit5 = false
	obj.activateExit1 = util.NewEasyAckOnce()
	obj.activateExit2 = util.NewEasyAckOnce()
	obj.activateExit3 = util.NewEasyAckOnce()
	obj.activateExit4 = util.NewEasyAckOnce()
	obj.activateExit5 = util.NewEasyAckOnce()

	obj.readySignal = make(chan struct{})
	obj.exitsSignal = make(chan struct{})

	// locally tracked state
	obj.nominated = make(etcdtypes.URLsMap)
	obj.volunteers = make(etcdtypes.URLsMap)
	obj.membermap = make(etcdtypes.URLsMap)
	obj.endpoints = make(etcdtypes.URLsMap)
	obj.memberIDs = make(map[string]uint64)

	// behaviour mutexes
	obj.stateMutex = &sync.RWMutex{}
	// TODO: I'm not sure if orderingMutex is actually required or not...
	obj.orderingMutex = &sync.Mutex{}
	obj.nominatedMutex = &sync.Mutex{}
	obj.volunteerMutex = &sync.Mutex{}

	// client related
	obj.connectSignal = make(chan struct{})
	obj.clients = []*client.Simple{}

	// server related
	obj.serverwg = &sync.WaitGroup{}
	obj.servermu = &sync.Mutex{}
	obj.serverExit = util.NewEasyExit() // is reset after destroyServer exit
	obj.serverReadySignal = &util.SubscribedSignal{}
	obj.serverExitsSignal = &util.SubscribedSignal{}

	// task queue state
	obj.taskQueue = []*task{}
	obj.taskQueueWg = &sync.WaitGroup{}
	obj.taskQueueLock = &sync.Mutex{}

	return nil
}

// Close cleans up after you are done using the struct.
func (obj *EmbdEtcd) Close() error {
	var reterr error

	if obj.chooser != nil {
		reterr = errwrap.Append(reterr, obj.chooser.Close())
	}

	return reterr
}

// curls returns the client urls that we should use everywhere except for
// locally, where we prefer to use the non-advertised perspective.
func (obj *EmbdEtcd) curls() (etcdtypes.URLs, error) {
	// TODO: do we need the copy?
	if len(obj.AClientURLs) > 0 {
		return etcdUtil.CopyURLs(obj.AClientURLs)
	}
	return etcdUtil.CopyURLs(obj.ClientURLs)
}

// surls returns the server (peer) urls that we should use everywhere except
// for locally, where we prefer to use the non-advertised perspective.
func (obj *EmbdEtcd) surls() (etcdtypes.URLs, error) {
	// TODO: do we need the copy?
	if len(obj.AServerURLs) > 0 {
		return etcdUtil.CopyURLs(obj.AServerURLs)
	}
	return etcdUtil.CopyURLs(obj.ServerURLs)
}

// err is an error helper that sends to the errChan.
func (obj *EmbdEtcd) err(err error) {
	select {
	case obj.errChan <- err:
	}
}

// Run is the main entry point to kick off the embedded etcd client and server.
// It blocks until we've exited for shutdown. The shutdown can be triggered by
// calling Destroy.
func (obj *EmbdEtcd) Run() error {
	curls, err := obj.curls()
	if err != nil {
		return err
	}
	surls, err := obj.surls()
	if err != nil {
		return err
	}

	exitCtx := obj.exit.Context() // local exit signal
	obj.Logf("running...")
	defer obj.Logf("exited!")
	wg := &sync.WaitGroup{}
	defer wg.Wait()
	defer close(obj.exitsSignal)
	defer obj.wg.Wait()
	defer obj.exit.Done(nil) // unblock anything waiting for exit...
	startupCtx, cancel := context.WithTimeout(exitCtx, RunStartupTimeout)
	defer cancel()
	defer obj.Logf("waiting for exit cleanup...") // TODO: is this useful?

	// After we trigger a hardexit, wait for the ForceExitTimeout and then
	// cancel any remaining stuck context's. This helps prevent angry users.
	unblockCtx, runTimeout := context.WithCancel(context.Background())
	defer runTimeout()
	wg.Add(1)
	go func() {
		defer wg.Done()
		defer runTimeout()
		select {
		case <-obj.hardexit.Signal(): // bork unblocker
		case <-obj.exitsSignal:
		}

		select {
		case <-time.After(ForceExitTimeout):
		case <-obj.exitsSignal:
		}
	}()

	// main loop exit signal
	obj.wg.Add(1)
	go func() {
		defer obj.wg.Done()
		// when all the senders on errChan have exited, we can exit too
		defer close(obj.errChan)
		// these signals guard around the errChan close operation
		wg := &sync.WaitGroup{}
		wg.Add(1)
		go func() {
			defer wg.Done()
			// We wait here until we're notified to know whether or
			// not this particular exit signal will be relevant...
			// This is because during some runs, we might not use
			// all of the signals, therefore we don't want to wait
			// for them!
			select {
			case <-obj.activateExit1.Wait():
			case <-exitCtx.Done():
			}
			if !obj.activeExit1 {
				return
			}
			select {
			case <-obj.errExit1:
				if obj.Debug {
					obj.Logf("exited connect loop (1)")
				}
			}
		}()
		wg.Add(1)
		go func() {
			defer wg.Done()
			select {
			case <-obj.activateExit2.Wait():
			case <-exitCtx.Done():
			}
			if !obj.activeExit2 {
				return
			}
			select {
			case <-obj.errExit2:
				if obj.Debug {
					obj.Logf("exited chooser loop (2)")
				}
			}
		}()
		wg.Add(1)
		go func() {
			defer wg.Done()
			select {
			case <-obj.activateExit3.Wait():
			case <-exitCtx.Done():
			}
			if !obj.activeExit3 {
				return
			}
			select {
			case <-obj.errExit3:
				if obj.Debug {
					obj.Logf("exited nominate loop (3)")
				}
			}
		}()
		wg.Add(1)
		go func() {
			defer wg.Done()
			select {
			case <-obj.activateExit4.Wait():
			case <-exitCtx.Done():
			}
			if !obj.activeExit4 {
				return
			}
			select {
			case <-obj.errExit4:
				if obj.Debug {
					obj.Logf("exited volunteer loop (4)")
				}
			}
		}()
		wg.Add(1)
		go func() {
			defer wg.Done()
			select {
			case <-obj.activateExit5.Wait():
			case <-exitCtx.Done():
			}
			if !obj.activeExit5 {
				return
			}
			select {
			case <-obj.errExit5:
				if obj.Debug {
					obj.Logf("exited endpoints loop (5)")
				}
			}
		}()
		wg.Wait() // wait for all the other exit signals before this one
		select {
		case <-obj.errExitN: // last one is for server (it can start/stop)
			if obj.Debug {
				obj.Logf("exited server loop (0)")
			}
		}
	}()

	// main loop
	var reterr error
	obj.wg.Add(1)
	go func() {
		defer obj.wg.Done()
	Loop:
		for {
			select {
			case err, ok := <-obj.errChan:
				if !ok { // when this closes, we can shutdown
					break Loop
				}
				if err == nil {
					err = fmt.Errorf("unexpected nil error")
				}
				obj.Logf("runtime error: %+v", err)
				if reterr == nil { // happens only once
					obj.exit.Done(err) // trigger an exit in Run!
				}
				reterr = errwrap.Append(reterr, err)
			}
		}
	}()

	bootstrapping := len(obj.Seeds) == 0 // we're the first, start a server!
	canServer := !obj.NoServer

	// Opportunistic "connect events" system, so that we can connect
	// promiscuously when it's needed, instead of needing to linearize code.
	obj.activeExit1 = true // activate errExit1
	obj.activateExit1.Ack()
	obj.wg.Add(1)
	go func() {
		defer obj.wg.Done()
		defer close(obj.errExit1) // multi-signal for errChan close op
		if bootstrapping {
			serverReady, ackReady := obj.ServerReady()    // must call ack!
			serverExited, ackExited := obj.ServerExited() // must call ack!
			select {
			case <-serverReady:
				ackReady()  // must be called
				ackExited() // must be called

			case <-serverExited:
				ackExited() // must be called
				ackReady()  // must be called
				// send an error in case server doesn't
				// TODO: do we want this error to be sent?
				obj.err(fmt.Errorf("server exited early"))
				return

			case <-obj.exit.Signal(): // exit early on exit signal
				ackReady()  // must be called
				ackExited() // must be called
				return
			}
		}

		// Connect here. If we're bootstrapping, the server came up
		// right above us. No need to add to our endpoints manually,
		// that is done for us in the server start method.
		if err := obj.connect(); err != nil {
			obj.err(errwrap.Wrapf(err, "error during client connect"))
			return
		}
		obj.client = client.NewClientFromClient(obj.etcd)
		obj.client.Debug = obj.Debug
		obj.client.Logf = func(format string, v ...interface{}) {
			obj.Logf("client: "+format, v...)
		}
		if err := obj.client.Init(); err != nil {
			obj.err(errwrap.Wrapf(err, "error during client init"))
			return
		}

		// Build a session for making leases that expire on disconnect!
		options := []concurrency.SessionOption{
			concurrency.WithTTL(sessionTTLSec(SessionTTL)),
		}
		if obj.leaseID > 0 { // in the odd chance we ever do reconnects
			options = append(options, concurrency.WithLease(obj.leaseID))
		}
		obj.session, err = concurrency.NewSession(obj.etcd, options...)
		if err != nil {
			obj.err(errwrap.Wrapf(err, "could not create session"))
			return
		}
		obj.leaseID = obj.session.Lease()

		obj.Logf("connected!")
		if !bootstrapping { // new clients need an initial state sync...
			if err := obj.memberStateFromList(startupCtx); err != nil {
				obj.err(errwrap.Wrapf(err, "error during initial state sync"))
				return
			}
		}
		close(obj.connectSignal)
	}()
	defer func() {
		if obj.session != nil {
			obj.session.Close() // this revokes the lease...
		}

		// run cleanup functions in reverse order
		for i := len(obj.clients) - 1; i >= 0; i-- {
			obj.clients[i].Close() // ignore errs
		}
		if obj.client != nil { // in case we bailed out early
			obj.client.Close() // ignore err, but contains wg.Wait()
		}
		if obj.etcd == nil { // in case we bailed out early
			return
		}
		obj.disconnect()
		obj.Logf("disconnected!")
		//close(obj.disconnectSignal)
	}()

	obj.Logf("watching chooser...")
	chooserChan := make(chan error)
	obj.activeExit2 = true // activate errExit2
	obj.activateExit2.Ack()
	obj.wg.Add(1)
	go func() {
		defer obj.wg.Done()
		defer close(obj.errExit2) // multi-signal for errChan close op
		if obj.chooser == nil {
			return
		}

		// wait till we're connected
		select {
		case <-obj.connectSignal:
		case <-exitCtx.Done():
			return // run exited early
		}

		p := obj.NS + ChooserPath
		c, err := obj.MakeClientFromNamespace(p)
		if err != nil {
			obj.err(errwrap.Wrapf(err, "error during chooser init"))
			return
		}
		if err := obj.chooser.Connect(exitCtx, c); err != nil {
			obj.err(errwrap.Wrapf(err, "error during chooser connect"))
			return
		}

		ch, err := obj.chooser.Watch()
		if err != nil {
			obj.err(errwrap.Wrapf(err, "error running chooser watch"))
			return
		}
		chooserChan = ch // watch it
	}()
	defer func() {
		if obj.chooser == nil {
			return
		}
		obj.chooser.Disconnect() // ignore error if any
	}()

	// call this once to start the server so we'll be able to connect
	if bootstrapping {
		obj.Logf("bootstrapping...")
		obj.volunteers[obj.Hostname] = surls // bootstrap this!
		obj.nominated[obj.Hostname] = surls
		// alternatively we can bootstrap like this if we add more stuff...
		//data := bootstrapWatcherData(obj.Hostname, surls) // server urls
		//if err := obj.nominateApply(data); err != nil { // fake apply
		//	return err
		//}
		// server starts inside here if bootstrapping!
		if err := obj.nominateCb(startupCtx); err != nil {
			// If while bootstrapping a new server, an existing one
			// is running on the same port, then we error this here.
			return err
		}

		// wait till we're connected
		select {
		case <-obj.connectSignal:
		case <-exitCtx.Done():
			// TODO: should this return an error?
			return nil // run exited early
		}

		// advertise our new endpoint (comes paired after nominateCb)
		if err := obj.advertise(startupCtx, obj.Hostname, curls); err != nil { // endpoints
			return errwrap.Wrapf(err, "error with endpoints advertise")
		}

		// run to add entry into our public nominated datastructure
		// FIXME: this might be redundant, but probably not harmful in
		// our bootstrapping process... it will get done in volunteerCb
		if err := obj.nominate(startupCtx, obj.Hostname, surls); err != nil {
			return errwrap.Wrapf(err, "error nominating self")
		}
	}

	// If obj.NoServer, then we don't need to start up the nominate watcher,
	// unless we're the first server... But we check that both are not true!
	if bootstrapping || canServer {
		if !bootstrapping && canServer { // wait for client!
			select {
			case <-obj.connectSignal:
			case <-exitCtx.Done():
				return nil // just exit
			}
		}

		ctx, cancel := context.WithCancel(unblockCtx)
		defer cancel()
		info, err := obj.client.ComplexWatcher(ctx, obj.NS+NominatedPath, etcd.WithPrefix())
		if err != nil {
			obj.activateExit3.Ack()
			return errwrap.Wrapf(err, "error adding nominated watcher")
		}
		obj.Logf("watching nominees...")
		obj.activeExit3 = true // activate errExit3
		obj.activateExit3.Ack()
		obj.wg.Add(1)
		go func() {
			defer obj.wg.Done()
			defer close(obj.errExit3) // multi-signal for errChan close op
			defer cancel()
			for {
				var event *interfaces.WatcherData
				var ok bool
				select {
				case event, ok = <-info.Events:
					if !ok {
						return
					}
				}

				if err := event.Err; err != nil {
					obj.err(errwrap.Wrapf(err, "nominated watcher errored"))
					continue
				}

				// on the initial created event, we populate...
				if !bootstrapping && event.Created && len(event.Events) == 0 {
					obj.Logf("populating nominated list...")
					nominated, err := obj.getNominated(ctx)
					if err != nil {
						obj.err(errwrap.Wrapf(err, "get nominated errored"))
						continue
					}
					obj.nominated = nominated

				} else if err := obj.nominateApply(event); err != nil {
					obj.err(errwrap.Wrapf(err, "nominate apply errored"))
					continue
				}

				// decide the desired state before we change it
				doStop := obj.serverAction(serverActionStop)
				doStart := obj.serverAction(serverActionStart)

				// server is running, but it should not be
				if doStop { // stop?
					// first, unadvertise client urls
					// TODO: should this cause destroy server instead? does it already?
					if err := obj.advertise(ctx, obj.Hostname, nil); err != nil { // remove me
						obj.err(errwrap.Wrapf(err, "error with endpoints unadvertise"))
						continue
					}
				}

				// runServer gets started in a goroutine here...
				err := obj.nominateCb(ctx)
				if obj.Debug {
					obj.Logf("nominateCb: %+v", err)
				}

				if doStart { // start?
					if err := obj.advertise(ctx, obj.Hostname, curls); err != nil { // add one
						obj.err(errwrap.Wrapf(err, "error with endpoints advertise"))
						continue
					}
				}

				if err == interfaces.ErrShutdown {
					if obj.Debug {
						obj.Logf("nominated watcher shutdown")
					}
					return
				}
				if err == nil {
					continue
				}
				obj.err(errwrap.Wrapf(err, "nominated watcher callback errored"))
				continue
			}
		}()
		defer func() {
			// wait for unnominate of self to be seen...
			select {
			case <-obj.errExit3:
			case <-obj.hardexit.Signal(): // bork unblocker
				obj.Logf("unblocked unnominate signal")
				// now unblock the server in case it's running!
				if err := obj.destroyServer(); err != nil { // sync until exited
					obj.err(errwrap.Wrapf(err, "destroyServer errored"))
					return
				}
			}
		}()
		defer func() {
			// wait for volunteer loop to exit
			select {
			case <-obj.errExit4:
			}
		}()
	}
	obj.activateExit3.Ack()

	// volunteering code (volunteer callback and initial volunteering)
	if !obj.NoServer && len(obj.ServerURLs) > 0 {
		ctx, cancel := context.WithCancel(unblockCtx)
		defer cancel() // cleanup on close...
		info, err := obj.client.ComplexWatcher(ctx, obj.NS+VolunteerPath, etcd.WithPrefix())
		if err != nil {
			obj.activateExit4.Ack()
			return errwrap.Wrapf(err, "error adding volunteer watcher")
		}
		unvolunteered := make(chan struct{})
		obj.Logf("watching volunteers...")
		obj.wg.Add(1)
		obj.activeExit4 = true // activate errExit4
		obj.activateExit4.Ack()
		go func() {
			defer obj.wg.Done()
			defer close(obj.errExit4) // multi-signal for errChan close op
			for {
				var event *interfaces.WatcherData
				var ok bool
				select {
				case event, ok = <-info.Events:
					if !ok {
						return
					}
					if err := event.Err; err != nil {
						obj.err(errwrap.Wrapf(err, "volunteer watcher errored"))
						continue
					}

				case chooserEvent, ok := <-chooserChan:
					if !ok {
						obj.Logf("got chooser shutdown...")
						chooserChan = nil // done here!
						continue
					}
					if chooserEvent != nil {
						obj.err(errwrap.Wrapf(chooserEvent, "chooser watcher errored"))
						continue
					}
					obj.Logf("got chooser event...")
					event = nil // pass through the apply...
					// chooser events should poke volunteerCb
				}

				_, exists1 := obj.volunteers[obj.Hostname] // before

				// on the initial created event, we populate...
				if !bootstrapping && event != nil && event.Created && len(event.Events) == 0 {
					obj.Logf("populating volunteers list...")
					volunteers, err := obj.getVolunteers(ctx)
					if err != nil {
						obj.err(errwrap.Wrapf(err, "get volunteers errored"))
						continue
					}
					// TODO: do we need to add ourself?
					//_, exists := volunteers[obj.Hostname]
					//if !exists {
					//	volunteers[obj.Hostname] = surls
					//}
					obj.volunteers = volunteers

				} else if err := obj.volunteerApply(event); event != nil && err != nil {
					obj.err(errwrap.Wrapf(err, "volunteer apply errored"))
					continue
				}
				_, exists2 := obj.volunteers[obj.Hostname] // after

				err := obj.volunteerCb(ctx)
				if err == nil {
					// it was there, and it got removed
					if exists1 && !exists2 {
						close(unvolunteered)
					}
					continue
				}
				obj.err(errwrap.Wrapf(err, "volunteer watcher callback errored"))
				continue
			}
		}()
		defer func() {
			// wait for unvolunteer of self to be seen...
			select {
			case <-unvolunteered:
			case <-obj.hardexit.Signal(): // bork unblocker
				obj.Logf("unblocked unvolunteer signal")
			}
		}()

		// self volunteer
		obj.Logf("volunteering...")
		surls, err := obj.surls()
		if err != nil {
			return err
		}
		if err := obj.volunteer(ctx, surls); err != nil {
			return err
		}
		defer obj.volunteer(ctx, nil) // unvolunteer
		defer obj.Logf("unvolunteering...")
		defer func() {
			// Move the leader if I'm it, so that the member remove
			// chooser operation happens on a different member than
			// myself. A leaving member should not decide its fate.
			member, err := obj.moveLeaderSomewhere(ctx)
			if err != nil {
				// TODO: use obj.err ?
				obj.Logf("move leader failed with: %+v", err)
				return
			}
			if member != "" {
				obj.Logf("moved leader to: %s", member)
			}
		}()
	}
	obj.activateExit4.Ack()

	// startup endpoints watcher (to learn about other servers)
	ctx, cancel := context.WithCancel(unblockCtx)
	defer cancel() // cleanup on close...
	if err := obj.runEndpoints(ctx); err != nil {
		obj.activateExit5.Ack()
		return err
	}
	obj.activateExit5.Ack()
	// We don't set state, we only watch others, so nothing to defer close!

	if obj.Converger != nil {
		obj.Converger.AddStateFn(ConvergerHostnameNamespace, func(converged bool) error {
			// send our individual state into etcd for others to see
			// TODO: what should happen on error?
			return obj.setHostnameConverged(exitCtx, obj.Hostname, converged)
		})
		defer obj.Converger.RemoveStateFn(ConvergerHostnameNamespace)
	}

	// NOTE: Add anything else we want to start up here...

	// If we get all the way down here, *and* we're connected, we're ready!
	obj.wg.Add(1)
	go func() {
		defer obj.wg.Done()
		select {
		case <-obj.connectSignal:
			close(obj.readySignal) // we're ready to be used now...
		case <-exitCtx.Done():
		}
	}()

	select {
	case <-exitCtx.Done():
	}
	obj.closing = true // flag to let nominateCb know we're shutting down...
	// kick off all the defer()'s....
	return reterr
}

// runEndpoints is a helper function to move all of this code into a new block.
func (obj *EmbdEtcd) runEndpoints(ctx context.Context) error {
	bootstrapping := len(obj.Seeds) == 0
	select {
	case <-obj.connectSignal:
	case <-ctx.Done():
		return nil // TODO: just exit ?
	}
	info, err := obj.client.ComplexWatcher(ctx, obj.NS+EndpointsPath, etcd.WithPrefix())
	if err != nil {
		obj.activateExit5.Ack()
		return errwrap.Wrapf(err, "error adding endpoints watcher")
	}
	obj.Logf("watching endpoints...")
	obj.wg.Add(1)
	obj.activeExit5 = true // activate errExit5
	obj.activateExit5.Ack()
	go func() {
		defer obj.wg.Done()
		defer close(obj.errExit5) // multi-signal for errChan close op
		for {
			var event *interfaces.WatcherData
			var ok bool
			select {
			case event, ok = <-info.Events:
				if !ok {
					return
				}
				if err := event.Err; err != nil {
					obj.err(errwrap.Wrapf(err, "endpoints watcher errored"))
					continue
				}
			}

			// on the initial created event, we populate...
			if !bootstrapping && event.Created && len(event.Events) == 0 {
				obj.Logf("populating endpoints list...")
				endpoints, err := obj.getEndpoints(ctx)
				if err != nil {
					obj.err(errwrap.Wrapf(err, "get endpoints errored"))
					continue
				}

				// If we're using an external etcd server, then
				// we would not initially see any endpoints, and
				// we'd erase our knowledge of our single
				// endpoint, which would deadlock our etcd
				// client. Instead, skip updates that take our
				// list down to zero endpoints. We maintain this
				// watch in case someone (an mgmt resource that
				// is managing the etcd cluster for example?)
				// wants to set these values for all of the mgmt
				// clients connected to the etcd server pool.
				if obj.NoServer && len(endpoints) == 0 && len(obj.endpoints) > 0 {
					if obj.Debug {
						obj.Logf("had %d endpoints: %+v", len(obj.endpoints), obj.endpoints)
						obj.Logf("got %d endpoints: %+v", len(endpoints), endpoints)
					}
					obj.Logf("skipping endpoints update")
					continue
				}

				obj.endpoints = endpoints
				obj.setEndpoints()

			} else if err := obj.endpointApply(event); err != nil {
				obj.err(errwrap.Wrapf(err, "endpoint apply errored"))
				continue
			}

			// there is no endpoint callback necessary

			// TODO: do we need this member state sync?
			if err := obj.memberStateFromList(ctx); err != nil {
				obj.err(errwrap.Wrapf(err, "error during endpoint state sync"))
				continue
			}
		}
	}()

	return nil
}

// Destroy cleans up the entire embedded etcd system. Use DestroyServer if you
// only want to shutdown the embedded server portion.
func (obj *EmbdEtcd) Destroy() error {
	obj.Logf("destroy...")
	obj.exit.Done(nil) // trigger an exit in Run!

	reterr := obj.exit.Error() // wait for exit signal (block until arrival)

	obj.wg.Wait()
	return reterr
}

// Interrupt causes this member to force shutdown. It does not safely wait for
// an ordered shutdown. It is not recommended to use this unless you're borked.
func (obj *EmbdEtcd) Interrupt() error {
	obj.Logf("interrupt...")
	wg := &sync.WaitGroup{}
	var err error
	wg.Add(1)
	go func() {
		defer wg.Done()
		err = obj.Destroy() // set return error
	}()
	wg.Add(1)
	go func() {
		defer wg.Done()
		select {
		case <-obj.exit.Signal(): // wait for Destroy to run first
		}
		obj.hardexit.Done(nil) // trigger a hard exit
	}()

	wg.Wait()
	return err
}

// Ready returns a channel that closes when we're up and running. This process
// happens when calling Run. If Run is never called, this will never happen. Our
// main startup must be running, and our client must be connected to get here.
func (obj *EmbdEtcd) Ready() <-chan struct{} { return obj.readySignal }

// Exited returns a channel that closes when we've been destroyed. This process
// happens after Run exits. If Run is never called, this will never happen.
func (obj *EmbdEtcd) Exited() <-chan struct{} { return obj.exitsSignal }
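
// A typical consumer might wait for readiness like this (a sketch, not a
// definitive contract):
//
//	select {
//	case <-embdEtcd.Ready():
//		// safe to call MakeClient and friends now...
//	case <-embdEtcd.Exited():
//		return fmt.Errorf("etcd exited before it was ready")
//	}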

// config returns the config struct to be used during the etcd client connect.
func (obj *EmbdEtcd) config() etcd.Config {
	// FIXME: filter out any urls which wouldn't resolve ?
	endpoints := etcdUtil.FromURLsMapToStringList(obj.endpoints) // flatten map
	// We don't need to do any sort of priority sort here, since for initial
	// connect we'd be the only one, so it doesn't matter, and subsequent
	// changes are made with SetEndpoints, not here, so we never need to
	// prioritize our local endpoint.
	sort.Strings(endpoints) // sort for determinism

	if len(endpoints) == 0 { // initially, we need to use the defaults...
		for _, u := range obj.Seeds {
			endpoints = append(endpoints, u.String())
		}
	}
	// XXX: connect to our local obj.ClientURLs instead of obj.AClientURLs ?
	cfg := etcd.Config{
		Endpoints: endpoints, // eg: []string{"http://254.0.0.1:12345"}
		// RetryDialer chooses the next endpoint to use, and comes with
		// a default dialer if unspecified.
		DialTimeout: ClientDialTimeout,

		// I think the keepalive stuff is needed for endpoint health.
		DialKeepAliveTime:    ClientDialKeepAliveTime,
		DialKeepAliveTimeout: ClientDialKeepAliveTimeout,

		// 0 disables auto-sync. By default auto-sync is disabled.
		AutoSyncInterval: 0, // we do something equivalent ourselves
	}
	return cfg
}

// connect connects the client to a server. If we are the first peer, then that
// server is itself.
func (obj *EmbdEtcd) connect() error {
	obj.Logf("connect...")
	if obj.etcd != nil {
		return fmt.Errorf("already connected")
	}
	cfg := obj.config() // get config
	var err error
	obj.etcd, err = etcd.New(cfg) // connect!
	return err
}

// disconnect closes the etcd connection.
func (obj *EmbdEtcd) disconnect() error {
	obj.Logf("disconnect...")
	if obj.etcd == nil {
		return fmt.Errorf("already disconnected")
	}

	return obj.etcd.Close()
}

// MakeClient returns an etcd Client interface that is suitable for basic
// tasks. Don't run this until the Ready method has acknowledged.
func (obj *EmbdEtcd) MakeClient() (interfaces.Client, error) {
	c := client.NewClientFromClient(obj.etcd)
	if err := c.Init(); err != nil {
		return nil, err
	}
	obj.clients = append(obj.clients, c) // make sure to clean up after...
	return c, nil
}

// MakeClientFromNamespace returns an etcd Client interface that is suitable
// for basic tasks and that has a key namespace prefix. Don't run this until
// the Ready method has acknowledged.
func (obj *EmbdEtcd) MakeClientFromNamespace(ns string) (interfaces.Client, error) {
	kv := namespace.NewKV(obj.etcd.KV, ns)
	w := namespace.NewWatcher(obj.etcd.Watcher, ns)
	c := client.NewClientFromNamespace(obj.etcd, kv, w)
	if err := c.Init(); err != nil {
		return nil, err
	}
	obj.clients = append(obj.clients, c) // make sure to clean up after...
	return c, nil
}
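
// For example, the chooser code path in Run builds its namespaced client
// roughly like this (error handling condensed):
//
//	p := obj.NS + ChooserPath // eg: "/_mgmt" + "/chooser"
//	c, err := obj.MakeClientFromNamespace(p)
//	if err != nil {
//		return errwrap.Wrapf(err, "error during chooser init")
//	}
//	// keys read and written through c are transparently prefixed with p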