remote: Add distributed converged timeout

This patch extends the --converged-timeout argument so that when used
with --remote it waits for the entire set of remote mgmt agents to
converge simultaneously before exiting.

purpleidea says: This particular part of the patch probably took as much
work as all of the work required for the initial remote patches alone!
This commit is contained in:
James Shubin
2016-08-30 05:18:55 -04:00
parent 6794aff77c
commit ff01e4a5e7
4 changed files with 245 additions and 26 deletions

75
main.go
View File

@@ -69,9 +69,19 @@ func run(c *cli.Context) error {
log.Printf("This is: %v, version: %v", program, version)
log.Printf("Main: Start: %v", start)
hostname := c.String("hostname")
if hostname == "" {
hostname, _ = os.Hostname()
hostname, _ := os.Hostname()
// allow passing in the hostname, instead of using --hostname
if c.IsSet("file") {
if config := ParseConfigFromFile(c.String("file")); config != nil {
if h := config.Hostname; h != "" {
hostname = h
}
}
}
if c.IsSet("hostname") { // override by cli
if h := c.String("hostname"); h != "" {
hostname = h
}
}
noop := c.Bool("noop")
@@ -123,6 +133,17 @@ func run(c *cli.Context) error {
return cli.NewExitError("", 1)
}
if c.IsSet("converged-timeout") && cConns > 0 && len(c.StringSlice("remote")) > c.Int("cconns") {
log.Printf("Main: Error: combining --converged-timeout with more remotes than available connections will never converge!")
return cli.NewExitError("", 1)
}
depth := uint16(c.Int("depth"))
if depth < 0 { // user should not be using this argument manually
log.Printf("Main: Error: negative values for --depth are not permitted!")
return cli.NewExitError("", 1)
}
if c.IsSet("prefix") && c.Bool("tmp-prefix") {
log.Println("Main: Error: combining --prefix and the request for a tmp prefix is illogical!")
return cli.NewExitError("", 1)
@@ -162,13 +183,7 @@ func run(c *cli.Context) error {
// setup converger
converger := NewConverger(
c.Int("converged-timeout"),
func(b bool) error { // lambda to run when converged
if b {
log.Printf("Converged for %d seconds, exiting!", c.Int("converged-timeout"))
exit <- true // trigger an exit!
}
return nil
},
nil, // stateFn gets added in by EmbdEtcd
)
go converger.Loop(true) // main loop for converger, true to start paused
@@ -196,6 +211,24 @@ func run(c *cli.Context) error {
log.Printf("Main: Etcd: Startup failed: %v", err)
exit <- true
}
convergerStateFn := func(b bool) error {
// exit if we are using the converged-timeout and we are the
// root node. otherwise, if we are a child node in a remote
// execution hierarchy, we should only notify our converged
// state and wait for the parent to trigger the exit.
if depth == 0 && c.Int("converged-timeout") >= 0 {
if b {
log.Printf("Converged for %d seconds, exiting!", c.Int("converged-timeout"))
exit <- true // trigger an exit!
}
return nil
}
// send our individual state into etcd for others to see
return EtcdSetHostnameConverged(EmbdEtcd, hostname, b) // TODO: what should happen on error?
}
if EmbdEtcd != nil {
converger.SetStateFn(convergerStateFn)
}
exitchan := make(chan Event) // exit event
go func() {
@@ -250,6 +283,12 @@ func run(c *cli.Context) error {
continue
}
if config.Hostname != "" && config.Hostname != hostname {
log.Printf("Config: Hostname changed, ignoring config!")
continue
}
config.Hostname = hostname // set it in case it was ""
// run graph vertex LOCK...
if !first { // TODO: we can flatten this check out I think
converger.Pause() // FIXME: add sync wait?
@@ -258,7 +297,7 @@ func run(c *cli.Context) error {
// build graph from yaml file on events (eg: from etcd)
// we need the vertices to be paused to work on them
if newFullgraph, err := fullGraph.NewGraphFromConfig(config, EmbdEtcd, hostname, noop); err == nil { // keep references to all original elements
if newFullgraph, err := fullGraph.NewGraphFromConfig(config, EmbdEtcd, noop); err == nil { // keep references to all original elements
fullGraph = newFullgraph
} else {
log.Printf("Config: Error making new graph from config: %v", err)
@@ -303,6 +342,11 @@ func run(c *cli.Context) error {
events = nil // signal that no-watch is true
}
// initialize the add watcher, which calls the f callback on map changes
convergerCb := func(f func(map[string]bool) error) (func(), error) {
return EtcdAddHostnameConvergedWatcher(EmbdEtcd, f)
}
// build remotes struct for remote ssh
remotes := NewRemotes(
EmbdEtcd.LocalhostClientURLs().StringSlice(),
@@ -314,7 +358,10 @@ func run(c *cli.Context) error {
c.Bool("allow-interactive"),
c.String("ssh-priv-id-rsa"),
!c.Bool("no-caching"),
depth,
prefix,
converger,
convergerCb,
)
// TODO: is there any benefit to running the remotes above in the loop?
@@ -503,6 +550,12 @@ func main() {
Name: "no-caching",
Usage: "don't allow remote caching of remote execution binary",
},
cli.IntFlag{
Name: "depth",
Hidden: true, // internal use only
Value: 0,
Usage: "specify depth in remote hierarchy",
},
cli.StringFlag{
Name: "prefix",
Usage: "specify a path to the working prefix directory",