engine: graph: Cleanup pause/resume code

There's always the fear that there is either a panic or a deadlock in
the highly concurrent engine resource code. I have not seen one recently
and I've been running some pretty concurrent tests. In the meantime, and
with my hopefully improved knowledge of concurrency, I decided to
rewrite some of the "uglier" parts of the engine. I think it is a lot
clearer now, and much less likely that there is a concurrency issue.

This has been tested by running the examples/lang/fastcount.mcl example.
This commit is contained in:
James Shubin
2023-08-30 21:17:05 -04:00
parent 2edae22a65
commit 2773a621a2
3 changed files with 82 additions and 55 deletions

View File

@@ -364,8 +364,14 @@ func (obj *Engine) Worker(vertex pgraph.Vertex) error {
var reserv *rate.Reservation var reserv *rate.Reservation
var reterr error var reterr error
var failed bool // has Process permanently failed? var failed bool // has Process permanently failed?
var closed bool // has the resumeSignal channel closed?
Loop: Loop:
for { // process loop for { // process loop
// This is the main select where things happen and where we exit
// from. It's similar to the two "satellite" select's which we
// might spend some time in if we're retrying or rate limiting.
// This select is also the main event receiver and is also the
// only place where we read from the poke channel.
select { select {
case err, ok := <-obj.state[vertex].eventsChan: // read from watch channel case err, ok := <-obj.state[vertex].eventsChan: // read from watch channel
if !ok { if !ok {
@@ -394,9 +400,30 @@ Loop:
obj.Logf("poke received") obj.Logf("poke received")
} }
reserv = nil // we didn't receive a real event here... reserv = nil // we didn't receive a real event here...
case _, ok := <-obj.state[vertex].pauseSignal: // one message
if !ok {
obj.state[vertex].pauseSignal = nil
continue // this is not a new pause message
}
// NOTE: If we allowed a doneCtx below to let us out
// of the resumeSignal wait, then we could loop around
// and run this again, causing a panic. Instead of this
// being made safe with a sync.Once, we instead run a
// close() call inside of the vertexRemoveFn function,
// which should unblock resumeSignal so we can shutdown.
obj.state[vertex].pausedAck.Ack() // send ack
// we are paused now, and waiting for resume or exit...
select {
case _, closed = <-obj.state[vertex].resumeSignal: // channel closes
// resumed!
// pass through to allow a Process to try to run
// TODO: consider adding this fast pause here...
//if obj.fastPause {
// obj.Logf("fast pausing on resume")
// continue
//}
} }
if failed { // don't Process anymore if we've already failed...
continue Loop
} }
// drop redundant pokes // drop redundant pokes
@@ -408,31 +435,8 @@ Loop:
} }
} }
// pause if one was requested... // don't Process anymore if we've already failed or shutdown...
select { if failed || closed {
case <-obj.state[vertex].pauseSignal: // channel closes
// NOTE: If we allowed a doneCtx below to let us out
// of the resumeSignal wait, then we could loop around
// and run this again, causing a panic. Instead of this
// being made safe with a sync.Once, we instead run a
// Resume() call inside of the vertexRemoveFn function,
// which should unblock it when we're going to need to.
obj.state[vertex].pausedAck.Ack() // send ack
// we are paused now, and waiting for resume or exit...
select {
case <-obj.state[vertex].resumeSignal: // channel closes
// resumed!
// pass through to allow a Process to try to run
// TODO: consider adding this fast pause here...
//if obj.fastPause {
// obj.Logf("fast pausing on resume")
// continue
//}
}
default:
// no pause requested, keep going...
}
if failed { // don't Process anymore if we've already failed...
continue Loop continue Loop
} }
@@ -446,6 +450,9 @@ Loop:
timer := time.NewTimer(time.Duration(d) * time.Millisecond) timer := time.NewTimer(time.Duration(d) * time.Millisecond)
LimitWait: LimitWait:
for { for {
// This "satellite" select doesn't need a poke
// channel because we're already in "event
// received" mode, and poke doesn't block.
select { select {
case <-timer.C: // the wait is over case <-timer.C: // the wait is over
break LimitWait break LimitWait
@@ -471,7 +478,8 @@ Loop:
timer.Stop() // it's nice to cleanup timer.Stop() // it's nice to cleanup
obj.state[vertex].init.Logf("rate limiting expired!") obj.state[vertex].init.Logf("rate limiting expired!")
} }
if failed { // don't Process anymore if we've already failed... // don't Process anymore if we've already failed or shutdown...
if failed || closed {
continue Loop continue Loop
} }
// end of limit delay // end of limit delay
@@ -486,6 +494,10 @@ Loop:
timer := time.NewTimer(time.Duration(delay) * time.Millisecond) timer := time.NewTimer(time.Duration(delay) * time.Millisecond)
RetryWait: RetryWait:
for { for {
// This "satellite" select doesn't need
// a poke channel because we're already
// in "event received" mode, and poke
// doesn't block.
select { select {
case <-timer.C: // the wait is over case <-timer.C: // the wait is over
break RetryWait break RetryWait
@@ -512,7 +524,8 @@ Loop:
delay = 0 // reset delay = 0 // reset
obj.state[vertex].init.Logf("the CheckApply delay expired!") obj.state[vertex].init.Logf("the CheckApply delay expired!")
} }
if failed { // don't Process anymore if we've already failed... // don't Process anymore if we've already failed or shutdown...
if failed || closed {
continue Loop continue Loop
} }

View File

@@ -256,7 +256,7 @@ func (obj *Engine) Commit() error {
vertexRemoveFn := func(vertex pgraph.Vertex) error { vertexRemoveFn := func(vertex pgraph.Vertex) error {
// wait for exit before starting new graph! // wait for exit before starting new graph!
close(obj.state[vertex].removeDone) // causes doneCtx to cancel close(obj.state[vertex].removeDone) // causes doneCtx to cancel
obj.state[vertex].Resume() // unblock from resume close(obj.state[vertex].resumeSignal) // unblock (it only closes here)
obj.waits[vertex].Wait() // sync obj.waits[vertex].Wait() // sync
// close the state and resource // close the state and resource
@@ -372,8 +372,22 @@ func (obj *Engine) Resume() error {
reversed := pgraph.Reverse(topoSort) reversed := pgraph.Reverse(topoSort)
for _, vertex := range reversed { for _, vertex := range reversed {
// The very first resume is skipped as those resources are
// already running! We could do that by checking here, but it is
// more convenient to just have a state struct field (paused) to
// track things for this instead. As a bonus, it helps us know
// if a resource is paused or not if we print for debugging.
//if !obj.state[vertex].initialStartupDone {
// obj.state[vertex].initialStartupDone = true
// continue
//}
//obj.state[vertex].starter = (indegree[vertex] == 0) //obj.state[vertex].starter = (indegree[vertex] == 0)
obj.state[vertex].Resume() // doesn't error obj.state[vertex].Resume() // doesn't error
// This always works because if a resource errored while it was
// paused, then we're in the paused state and we can still exit
// from there. If a resource errors when we're trying to Pause
// then it will only succeed without error if the resource ACKs.
} }
// we wait for everyone to start before exiting! // we wait for everyone to start before exiting!
obj.paused = false obj.paused = false

View File

@@ -99,13 +99,19 @@ type State struct {
// to send on since it is buffered. // to send on since it is buffered.
pokeChan chan struct{} // outgoing from resource pokeChan chan struct{} // outgoing from resource
// paused represents if this particular res is paused or not. // paused represents if this particular res is paused or not. This is
// primarily used to avoid running an unnecessary Resume on the first
// run of this resource.
paused bool paused bool
// pauseSignal closes to request a pause of this resource. // pauseSignal receives a message to request a pause of this resource.
pauseSignal chan struct{} pauseSignal chan struct{}
// resumeSignal closes to request a resume of this resource. // resumeSignal receives a message to resume this resource. The channel
// closes when the resource is removed from the graph.
resumeSignal chan struct{} resumeSignal chan struct{}
// pausedAck is used to send an ack message saying that we've paused. // pausedAck is used to send an ack message saying that we've paused.
// This helps us know if the pause was actually received and when it was
// received. Otherwise we might not know if it errored or when it
// actually stopped being busy and go to the paused stage.
pausedAck *util.EasyAck pausedAck *util.EasyAck
wg *sync.WaitGroup // used for all vertex specific processes wg *sync.WaitGroup // used for all vertex specific processes
@@ -149,7 +155,7 @@ func (obj *State) Init() error {
//obj.paused = false // starts off as started //obj.paused = false // starts off as started
obj.pauseSignal = make(chan struct{}) obj.pauseSignal = make(chan struct{})
//obj.resumeSignal = make(chan struct{}) // happens on pause obj.resumeSignal = make(chan struct{})
//obj.pausedAck = util.NewEasyAck() // happens on pause //obj.pausedAck = util.NewEasyAck() // happens on pause
obj.wg = &sync.WaitGroup{} obj.wg = &sync.WaitGroup{}
@@ -311,11 +317,6 @@ func (obj *State) Cleanup() error {
// callers are expected to make sure that they don't leave any of these running // callers are expected to make sure that they don't leave any of these running
// by the time the Worker() shuts down. // by the time the Worker() shuts down.
func (obj *State) Poke() { func (obj *State) Poke() {
// redundant
//if len(obj.pokeChan) > 0 {
// return
//}
select { select {
case obj.pokeChan <- struct{}{}: case obj.pokeChan <- struct{}{}:
default: // if chan is now full because more than one poke happened... default: // if chan is now full because more than one poke happened...
@@ -329,19 +330,20 @@ func (obj *State) Poke() {
// so only call these one at a time and alternate between the two. // so only call these one at a time and alternate between the two.
func (obj *State) Pause() error { func (obj *State) Pause() error {
if obj.paused { if obj.paused {
return fmt.Errorf("already paused") panic("already paused")
} }
obj.pausedAck = util.NewEasyAck() obj.pausedAck = util.NewEasyAck()
obj.resumeSignal = make(chan struct{}) // build the resume signal select {
close(obj.pauseSignal) case obj.pauseSignal <- struct{}{}:
obj.Poke() // unblock and notice the pause if necessary }
// wait for ack (or exit signal) // wait for ack (or exit signal)
select { select {
case <-obj.pausedAck.Wait(): // we got it! case <-obj.pausedAck.Wait(): // we got it!
// we're paused // we're paused
case <-obj.doneCtx.Done():
case <-obj.doneCtx.Done(): // gc cleans up the obj.pausedAck
return engine.ErrClosed return engine.ErrClosed
} }
obj.paused = true obj.paused = true
@@ -354,19 +356,17 @@ func (obj *State) Pause() error {
// called concurrently with either the Pause() method or itself, so only call // called concurrently with either the Pause() method or itself, so only call
// these one at a time and alternate between the two. // these one at a time and alternate between the two.
func (obj *State) Resume() { func (obj *State) Resume() {
// TODO: do we need a mutex around Resume? // This paused check prevents unnecessary "resume" calls to the resource
// on its first run, since resources start in the running state!
if !obj.paused { // no need to unpause brand-new resources if !obj.paused { // no need to unpause brand-new resources
return return
} }
obj.pauseSignal = make(chan struct{}) // rebuild for next pause select {
close(obj.resumeSignal) case obj.resumeSignal <- struct{}{}:
//obj.Poke() // not needed, we're already waiting for resume }
obj.paused = false obj.paused = false
// no need to wait for it to resume
//return // implied
} }
// event is a helper function to send an event to the CheckApply process loop. // event is a helper function to send an event to the CheckApply process loop.
@@ -378,7 +378,7 @@ func (obj *State) event() {
obj.setDirty() // assume we're initially dirty obj.setDirty() // assume we're initially dirty
select { select {
case obj.eventsChan <- nil: case obj.eventsChan <- nil: // blocks! (this is unbuffered)
// send! // send!
} }