Rework the converged detection and provide a clean interface

The old converged detection was hacked directly into the code, instead
of being something with a nice interface. This commit cleans it up,
splits it into a separate file, and removes a race condition that
existed in the old code.

We also take the time to get rid of the ugly Set* methods and replace
them all with a single AssociateData method. This might be unnecessary
if we can pass in the Converger method at Resource construction.

Lastly, and most interestingly, we suspend the individual timeout callers
when they've already converged, thus reducing unnecessary traffic, and
avoiding fast (eg: < 5 second) timers triggering more than once if they
stay converged!

A quick note on theory for any future readers... What happens if we have
--converged-timeout=0 ? Well, for this and any other non-negative value,
it's important to realize that deciding if something is converged is
actually a race between whether the converged timer will fire and
whether some random new event will get triggered. This is because there
is nothing that can actually predict if or when a new event will happen
(eg: the user modifying a file). As a result, a race is always inherent,
and this is not actually a flaw or an "incorrect" algorithm.

A future improvement could be to add a global lock to each resource, and
to lock all resources when computing if we are converged or not. In
practice, this hasn't been necessary. The worst case scenario (in
theory, because this hasn't been tested) would be that an event happens
*during* the converged calculation and starts running, the exit command
then runs, and the event finishes, but it doesn't get a chance to notify
some service to restart. A lock could probably fix this theoretical
case.
This commit is contained in:
James Shubin
2016-03-29 07:06:56 -04:00
parent a6dc81a38e
commit 6f3ac4bf2a
10 changed files with 342 additions and 151 deletions

50
main.go
View File

@@ -59,8 +59,7 @@ func waitForSignal(exit chan bool) {
func run(c *cli.Context) {
var start = time.Now().UnixNano()
var wg sync.WaitGroup
exit := make(chan bool) // exit signal
converged := make(chan bool) // converged signal
exit := make(chan bool) // exit signal
log.Printf("This is: %v, version: %v", program, version)
log.Printf("Main: Start: %v", start)
var G, fullGraph *Graph
@@ -73,6 +72,16 @@ func run(c *cli.Context) {
}()
}
// setup converger
converger := NewConverger(
c.Int("converged-timeout"),
func() { // lambda to run when converged
log.Printf("Converged for %d seconds, exiting!", c.Int("converged-timeout"))
exit <- true // trigger an exit!
},
)
go converger.Loop(true) // main loop for converger, true to start paused
// initial etcd peer endpoint
seed := c.String("seed")
if seed == "" {
@@ -86,8 +95,7 @@ func run(c *cli.Context) {
// etcd
etcdO := &EtcdWObject{
seed: seed,
ctimeout: c.Int("converged-timeout"),
converged: converged,
converger: converger,
}
hostname := c.String("hostname")
@@ -136,7 +144,8 @@ func run(c *cli.Context) {
// run graph vertex LOCK...
if !first { // TODO: we can flatten this check out I think
G.Pause() // sync
converger.Pause() // FIXME: add sync wait?
G.Pause() // sync
}
// build graph from yaml file on events (eg: from etcd)
@@ -148,6 +157,7 @@ func run(c *cli.Context) {
// unpause!
if !first {
G.Start(&wg, first) // sync
converger.Start() // after G.Start()
}
continue
}
@@ -165,44 +175,18 @@ func run(c *cli.Context) {
} else {
log.Printf("Graphviz: Successfully generated graph!")
}
G.SetVertex()
G.SetConvergedCallback(c.Int("converged-timeout"), converged)
G.AssociateData(converger)
// G.Start(...) needs to be synchronous or wait,
// because if half of the nodes are started and
// some are not ready yet and the EtcdWatch
// loops, we'll cause G.Pause(...) before we
// even got going, thus causing nil pointer errors
G.Start(&wg, first) // sync
converger.Start() // after G.Start()
first = false
}
}()
if i := c.Int("converged-timeout"); i >= 0 {
go func() {
ConvergedLoop:
for {
<-converged // when anyone says they have converged
if etcdO.GetConvergedState() != etcdConvergedTimeout {
continue
}
for v := range G.GetVerticesChan() {
if v.Res.GetConvergedState() != resConvergedTimeout {
continue ConvergedLoop
}
}
// if all have converged, exit
log.Printf("Converged for %d seconds, exiting!", i)
exit <- true
for {
<-converged
} // unblock/drain
//return
}
}()
}
log.Println("Main: Running...")
waitForSignal(exit) // pass in exit channel to watch