engine: graph: Cleanup pause/resume code
There's always the fear that there is either a panic or a deadlock in the highly concurrent engine resource code. I have not seen one recently and I've been running some pretty concurrent tests. In the meantime, and with my hopefully improved knowledge of concurrency, I decided to rewrite some of the "uglier" parts of the engine. I think it is a lot clearer now, and much less likely that there is a concurrency issue. This has been tested by running the examples/lang/fastcount.mcl example.
This commit is contained in:
@@ -364,8 +364,14 @@ func (obj *Engine) Worker(vertex pgraph.Vertex) error {
|
||||
var reserv *rate.Reservation
|
||||
var reterr error
|
||||
var failed bool // has Process permanently failed?
|
||||
var closed bool // has the resumeSignal channel closed?
|
||||
Loop:
|
||||
for { // process loop
|
||||
// This is the main select where things happen and where we exit
|
||||
// from. It's similar to the two "satellite" select's which we
|
||||
// might spend some time in if we're retrying or rate limiting.
|
||||
// This select is also the main event receiver and is also the
|
||||
// only place where we read from the poke channel.
|
||||
select {
|
||||
case err, ok := <-obj.state[vertex].eventsChan: // read from watch channel
|
||||
if !ok {
|
||||
@@ -394,9 +400,30 @@ Loop:
|
||||
obj.Logf("poke received")
|
||||
}
|
||||
reserv = nil // we didn't receive a real event here...
|
||||
}
|
||||
if failed { // don't Process anymore if we've already failed...
|
||||
continue Loop
|
||||
|
||||
case _, ok := <-obj.state[vertex].pauseSignal: // one message
|
||||
if !ok {
|
||||
obj.state[vertex].pauseSignal = nil
|
||||
continue // this is not a new pause message
|
||||
}
|
||||
// NOTE: If we allowed a doneCtx below to let us out
|
||||
// of the resumeSignal wait, then we could loop around
|
||||
// and run this again, causing a panic. Instead of this
|
||||
// being made safe with a sync.Once, we instead run a
|
||||
// close() call inside of the vertexRemoveFn function,
|
||||
// which should unblock resumeSignal so we can shutdown.
|
||||
obj.state[vertex].pausedAck.Ack() // send ack
|
||||
// we are paused now, and waiting for resume or exit...
|
||||
select {
|
||||
case _, closed = <-obj.state[vertex].resumeSignal: // channel closes
|
||||
// resumed!
|
||||
// pass through to allow a Process to try to run
|
||||
// TODO: consider adding this fast pause here...
|
||||
//if obj.fastPause {
|
||||
// obj.Logf("fast pausing on resume")
|
||||
// continue
|
||||
//}
|
||||
}
|
||||
}
|
||||
|
||||
// drop redundant pokes
|
||||
@@ -408,31 +435,8 @@ Loop:
|
||||
}
|
||||
}
|
||||
|
||||
// pause if one was requested...
|
||||
select {
|
||||
case <-obj.state[vertex].pauseSignal: // channel closes
|
||||
// NOTE: If we allowed a doneCtx below to let us out
|
||||
// of the resumeSignal wait, then we could loop around
|
||||
// and run this again, causing a panic. Instead of this
|
||||
// being made safe with a sync.Once, we instead run a
|
||||
// Resume() call inside of the vertexRemoveFn function,
|
||||
// which should unblock it when we're going to need to.
|
||||
obj.state[vertex].pausedAck.Ack() // send ack
|
||||
// we are paused now, and waiting for resume or exit...
|
||||
select {
|
||||
case <-obj.state[vertex].resumeSignal: // channel closes
|
||||
// resumed!
|
||||
// pass through to allow a Process to try to run
|
||||
// TODO: consider adding this fast pause here...
|
||||
//if obj.fastPause {
|
||||
// obj.Logf("fast pausing on resume")
|
||||
// continue
|
||||
//}
|
||||
}
|
||||
default:
|
||||
// no pause requested, keep going...
|
||||
}
|
||||
if failed { // don't Process anymore if we've already failed...
|
||||
// don't Process anymore if we've already failed or shutdown...
|
||||
if failed || closed {
|
||||
continue Loop
|
||||
}
|
||||
|
||||
@@ -446,6 +450,9 @@ Loop:
|
||||
timer := time.NewTimer(time.Duration(d) * time.Millisecond)
|
||||
LimitWait:
|
||||
for {
|
||||
// This "satellite" select doesn't need a poke
|
||||
// channel because we're already in "event
|
||||
// received" mode, and poke doesn't block.
|
||||
select {
|
||||
case <-timer.C: // the wait is over
|
||||
break LimitWait
|
||||
@@ -471,7 +478,8 @@ Loop:
|
||||
timer.Stop() // it's nice to cleanup
|
||||
obj.state[vertex].init.Logf("rate limiting expired!")
|
||||
}
|
||||
if failed { // don't Process anymore if we've already failed...
|
||||
// don't Process anymore if we've already failed or shutdown...
|
||||
if failed || closed {
|
||||
continue Loop
|
||||
}
|
||||
// end of limit delay
|
||||
@@ -486,6 +494,10 @@ Loop:
|
||||
timer := time.NewTimer(time.Duration(delay) * time.Millisecond)
|
||||
RetryWait:
|
||||
for {
|
||||
// This "satellite" select doesn't need
|
||||
// a poke channel because we're already
|
||||
// in "event received" mode, and poke
|
||||
// doesn't block.
|
||||
select {
|
||||
case <-timer.C: // the wait is over
|
||||
break RetryWait
|
||||
@@ -512,7 +524,8 @@ Loop:
|
||||
delay = 0 // reset
|
||||
obj.state[vertex].init.Logf("the CheckApply delay expired!")
|
||||
}
|
||||
if failed { // don't Process anymore if we've already failed...
|
||||
// don't Process anymore if we've already failed or shutdown...
|
||||
if failed || closed {
|
||||
continue Loop
|
||||
}
|
||||
|
||||
|
||||
@@ -255,9 +255,9 @@ func (obj *Engine) Commit() error {
|
||||
free := []func() error{} // functions to run after graphsync to reset...
|
||||
vertexRemoveFn := func(vertex pgraph.Vertex) error {
|
||||
// wait for exit before starting new graph!
|
||||
close(obj.state[vertex].removeDone) // causes doneCtx to cancel
|
||||
obj.state[vertex].Resume() // unblock from resume
|
||||
obj.waits[vertex].Wait() // sync
|
||||
close(obj.state[vertex].removeDone) // causes doneCtx to cancel
|
||||
close(obj.state[vertex].resumeSignal) // unblock (it only closes here)
|
||||
obj.waits[vertex].Wait() // sync
|
||||
|
||||
// close the state and resource
|
||||
// FIXME: will this mess up the sync and block the engine?
|
||||
@@ -372,8 +372,22 @@ func (obj *Engine) Resume() error {
|
||||
reversed := pgraph.Reverse(topoSort)
|
||||
|
||||
for _, vertex := range reversed {
|
||||
// The very first resume is skipped as those resources are
|
||||
// already running! We could do that by checking here, but it is
|
||||
// more convenient to just have a state struct field (paused) to
|
||||
// track things for this instead. As a bonus, it helps us know
|
||||
// if a resource is paused or not if we print for debugging.
|
||||
//if !obj.state[vertex].initialStartupDone {
|
||||
// obj.state[vertex].initialStartupDone = true
|
||||
// continue
|
||||
//}
|
||||
|
||||
//obj.state[vertex].starter = (indegree[vertex] == 0)
|
||||
obj.state[vertex].Resume() // doesn't error
|
||||
// This always works because if a resource errored while it was
|
||||
// paused, then we're in the paused state and we can still exit
|
||||
// from there. If a resource errors when we're trying to Pause
|
||||
// then it will only succeed without error if the resource ACKs.
|
||||
}
|
||||
// we wait for everyone to start before exiting!
|
||||
obj.paused = false
|
||||
|
||||
@@ -99,13 +99,19 @@ type State struct {
|
||||
// to send on since it is buffered.
|
||||
pokeChan chan struct{} // outgoing from resource
|
||||
|
||||
// paused represents if this particular res is paused or not.
|
||||
// paused represents if this particular res is paused or not. This is
|
||||
// primarily used to avoid running an unnecessary Resume on the first
|
||||
// run of this resource.
|
||||
paused bool
|
||||
// pauseSignal closes to request a pause of this resource.
|
||||
// pauseSignal receives a message to request a pause of this resource.
|
||||
pauseSignal chan struct{}
|
||||
// resumeSignal closes to request a resume of this resource.
|
||||
// resumeSignal receives a message to resume this resource. The channel
|
||||
// closes when the resource is removed from the graph.
|
||||
resumeSignal chan struct{}
|
||||
// pausedAck is used to send an ack message saying that we've paused.
|
||||
// This helps us know if the pause was actually received and when it was
|
||||
// received. Otherwise we might not know if it errored or when it
|
||||
// actually stopped being busy and go to the paused stage.
|
||||
pausedAck *util.EasyAck
|
||||
|
||||
wg *sync.WaitGroup // used for all vertex specific processes
|
||||
@@ -149,7 +155,7 @@ func (obj *State) Init() error {
|
||||
|
||||
//obj.paused = false // starts off as started
|
||||
obj.pauseSignal = make(chan struct{})
|
||||
//obj.resumeSignal = make(chan struct{}) // happens on pause
|
||||
obj.resumeSignal = make(chan struct{})
|
||||
//obj.pausedAck = util.NewEasyAck() // happens on pause
|
||||
|
||||
obj.wg = &sync.WaitGroup{}
|
||||
@@ -311,11 +317,6 @@ func (obj *State) Cleanup() error {
|
||||
// callers are expected to make sure that they don't leave any of these running
|
||||
// by the time the Worker() shuts down.
|
||||
func (obj *State) Poke() {
|
||||
// redundant
|
||||
//if len(obj.pokeChan) > 0 {
|
||||
// return
|
||||
//}
|
||||
|
||||
select {
|
||||
case obj.pokeChan <- struct{}{}:
|
||||
default: // if chan is now full because more than one poke happened...
|
||||
@@ -329,19 +330,20 @@ func (obj *State) Poke() {
|
||||
// so only call these one at a time and alternate between the two.
|
||||
func (obj *State) Pause() error {
|
||||
if obj.paused {
|
||||
return fmt.Errorf("already paused")
|
||||
panic("already paused")
|
||||
}
|
||||
|
||||
obj.pausedAck = util.NewEasyAck()
|
||||
obj.resumeSignal = make(chan struct{}) // build the resume signal
|
||||
close(obj.pauseSignal)
|
||||
obj.Poke() // unblock and notice the pause if necessary
|
||||
select {
|
||||
case obj.pauseSignal <- struct{}{}:
|
||||
}
|
||||
|
||||
// wait for ack (or exit signal)
|
||||
select {
|
||||
case <-obj.pausedAck.Wait(): // we got it!
|
||||
// we're paused
|
||||
case <-obj.doneCtx.Done():
|
||||
|
||||
case <-obj.doneCtx.Done(): // gc cleans up the obj.pausedAck
|
||||
return engine.ErrClosed
|
||||
}
|
||||
obj.paused = true
|
||||
@@ -354,19 +356,17 @@ func (obj *State) Pause() error {
|
||||
// called concurrently with either the Pause() method or itself, so only call
|
||||
// these one at a time and alternate between the two.
|
||||
func (obj *State) Resume() {
|
||||
// TODO: do we need a mutex around Resume?
|
||||
// This paused check prevents unnecessary "resume" calls to the resource
|
||||
// on its first run, since resources start in the running state!
|
||||
if !obj.paused { // no need to unpause brand-new resources
|
||||
return
|
||||
}
|
||||
|
||||
obj.pauseSignal = make(chan struct{}) // rebuild for next pause
|
||||
close(obj.resumeSignal)
|
||||
//obj.Poke() // not needed, we're already waiting for resume
|
||||
select {
|
||||
case obj.resumeSignal <- struct{}{}:
|
||||
}
|
||||
|
||||
obj.paused = false
|
||||
|
||||
// no need to wait for it to resume
|
||||
//return // implied
|
||||
}
|
||||
|
||||
// event is a helper function to send an event to the CheckApply process loop.
|
||||
@@ -378,7 +378,7 @@ func (obj *State) event() {
|
||||
obj.setDirty() // assume we're initially dirty
|
||||
|
||||
select {
|
||||
case obj.eventsChan <- nil:
|
||||
case obj.eventsChan <- nil: // blocks! (this is unbuffered)
|
||||
// send!
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user