mgmt/lang/funcs/engine.go
James Shubin 06ee05026b lang: funcs: Don't race when building an initial graph
I noticed a very intermittent test failure where interpret would end up
running, but *fail* because a value wasn't present. This should never
happen, because the function engine is designed to only call interpret
when there has been at least one value produced for every node in the
AST. So what is the bug that would produce:

interpret error: could not interpret: func value does not yet exist

About 20 minutes ago while I was getting to bed, it occurred to me where
to look! Out of bed and to the laptop, and after briefly reminding
myself of the code, I think I've found the issue.

What I think was happening was that an AST node would produce a value,
and send a message on the aggregate channel. This channel is monitored,
and every time it receives a message, it checks to ensure that all the
values now exist before producing a message for interpret to run.
However, this AST node was not the final one to produce a value: before
its message was read off the aggregate channel, the last remaining AST
node ran and set its "loaded" state to `true`, but *before* its value
was made available for the aggregate channel to read. That reader then
occasionally won the race and tried to access a value before it existed,
thus causing our intermittent bug.

At least I think that's what was going on. Hopefully this patch fixes
it; if not, then there's another bug hiding too! And of course, this
entire function engine could do with some proper analysis from someone
familiar with glitches, back pressure, and FRP parallelism.

One particular note: I used my brain, not some fancy debugging tool, to
find this. Maybe skilled debuggers can forklift their tools onto this
type of problem, but I don't have those skills!

¯\_(ツ)_/¯
2018-03-15 23:22:21 -04:00
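
To make the fix concrete, here is a minimal Go sketch of the ordering this patch enforces. The names are hypothetical and this is not the engine's actual code: the point is that a producer publishes its value *before* flipping its loaded flag, both under the same lock, so the aggregate reader can never observe a loaded node whose value is still missing.

// ordering_sketch.go: a minimal sketch of the race fix described above.
package main

import (
	"fmt"
	"sync"
)

type sketch struct {
	mu     sync.RWMutex
	table  map[string]int  // stands in for the engine's shared value table
	loaded map[string]bool // stands in for each node's loaded flag
}

// publish stores the value first, then marks the node as loaded. The buggy
// ordering set the flag before the value was visible to readers.
func (s *sketch) publish(name string, v int) {
	s.mu.Lock()
	defer s.mu.Unlock()
	s.table[name] = v     // value goes in first...
	s.loaded[name] = true // ...then the flag is set
}

// ready plays the role of the aggregate reader: it only reports true when
// every node is loaded, and with the publish ordering above a loaded node
// always has a value present.
func (s *sketch) ready(names []string) bool {
	s.mu.RLock()
	defer s.mu.RUnlock()
	for _, name := range names {
		if !s.loaded[name] {
			return false
		}
		if _, exists := s.table[name]; !exists {
			return false // unreachable with the publish ordering above
		}
	}
	return true
}

func main() {
	s := &sketch{table: map[string]int{}, loaded: map[string]bool{}}
	s.publish("a", 42)
	fmt.Println(s.ready([]string{"a"})) // true
}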


// Mgmt
// Copyright (C) 2013-2018+ James Shubin and the project contributors
// Written by James Shubin <james@shubin.ca> and the project contributors
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program. If not, see <http://www.gnu.org/licenses/>.

package funcs

import (
	"fmt"
	"strings"
	"sync"

	"github.com/purpleidea/mgmt/lang/interfaces"
	"github.com/purpleidea/mgmt/lang/types"
	"github.com/purpleidea/mgmt/pgraph"
	"github.com/purpleidea/mgmt/resources"

	multierr "github.com/hashicorp/go-multierror"
	errwrap "github.com/pkg/errors"
)

// State represents the state of a function vertex. This corresponds to an AST
// expr, which is the memory address (pointer) in the graph.
type State struct {
	Expr interfaces.Expr // pointer to the expr vertex
	handle interfaces.Func // the function (if not nil, we've found it on init)
	init bool // have we run Init on our func?
	ready bool // has it received all the args it needs at least once?
	loaded bool // has the func run at least once ?
	closed bool // did we close ourself down?
	notify chan struct{} // ping here when new input values exist
	input chan types.Value // the top level type must be a struct
	output chan types.Value
}

// Init creates the function state if it can be found in the registered list.
func (obj *State) Init() error {
	handle, err := obj.Expr.Func() // build one and store it, don't re-gen
	if err != nil {
		return err
	}
	if err := handle.Validate(); err != nil {
		return errwrap.Wrapf(err, "could not validate func")
	}
	obj.handle = handle
	sig := obj.handle.Info().Sig
	if sig.Kind != types.KindFunc {
		return fmt.Errorf("must be kind func")
	}
	if len(sig.Ord) > 0 {
		// since we accept input, better get our notification chan built
		obj.notify = make(chan struct{})
	}
	obj.input = make(chan types.Value) // we close this when we're done
	obj.output = make(chan types.Value) // we create it, func closes it
	return nil
}

// String satisfies fmt.Stringer so that these print nicely.
func (obj *State) String() string {
	return obj.Expr.String()
}

// Edge links an output vertex (value) to an input vertex with a named argument.
type Edge struct {
	Args []string // list of named args that this edge sends to
}

// String displays the list of arguments this edge satisfies. It is a required
// property to be a valid pgraph.Edge.
func (obj *Edge) String() string {
	return strings.Join(obj.Args, ", ")
}

// Engine represents the running time varying directed acyclic function graph.
type Engine struct {
	Graph *pgraph.Graph
	Hostname string
	World resources.World
	Debug bool
	Logf func(format string, v ...interface{})
	// Glitch: https://en.wikipedia.org/wiki/Reactive_programming#Glitches
	Glitch bool // allow glitching? (more responsive, but less accurate)
	ag chan error // used to aggregate fact events without reflect
	agLock *sync.Mutex
	agCount int // last one turns out the light (closes the ag channel)
	topologicalSort []pgraph.Vertex // cached sorting of the graph for perf
	state map[pgraph.Vertex]*State // state associated with the vertex
	mutex *sync.RWMutex // concurrency guard for the table map
	table map[pgraph.Vertex]types.Value // live table of output values
	loaded bool // are all of the funcs loaded?
	loadedChan chan struct{} // funcs loaded signal
	streamChan chan error // signals a new graph can be created or problem
	closeChan chan struct{} // close signal
	wg *sync.WaitGroup
}

// Init initializes the struct. This is the first call you must make. Do not
// proceed with calls to other methods unless this succeeds first. This also
// loads all the functions by calling Init on each one in the graph.
// TODO: should Init take the graph as an input arg to keep it as a private field?
func (obj *Engine) Init() error {
	obj.ag = make(chan error)
	obj.agLock = &sync.Mutex{}
	obj.state = make(map[pgraph.Vertex]*State)
	obj.mutex = &sync.RWMutex{}
	obj.table = make(map[pgraph.Vertex]types.Value)
	obj.loadedChan = make(chan struct{})
	obj.streamChan = make(chan error)
	obj.closeChan = make(chan struct{})
	obj.wg = &sync.WaitGroup{}
	topologicalSort, err := obj.Graph.TopologicalSort()
	if err != nil {
		return errwrap.Wrapf(err, "topo sort failed")
	}
	obj.topologicalSort = topologicalSort // cache the result
	for _, vertex := range obj.Graph.Vertices() {
		// is this an interface we can use?
		if _, exists := obj.state[vertex]; exists {
			return fmt.Errorf("vertex (%+v) is not unique in the graph", vertex)
		}
		expr, ok := vertex.(interfaces.Expr)
		if !ok {
			return fmt.Errorf("vertex (%+v) was not an expr", vertex)
		}
		obj.state[vertex] = &State{Expr: expr} // store some state!
		if e := obj.state[vertex].Init(); e != nil {
			err = multierr.Append(err, e) // list of errors
		}
		if obj.Debug {
			obj.Logf("Loading func `%s`", vertex)
		}
	}
	if err != nil { // usually due to `not found` errors
		return errwrap.Wrapf(err, "could not load requested funcs")
	}
	return nil
}

// Validate the graph type checks properly and other tests. Must run Init first.
// This should check that: (1) all vertices have the correct number of inputs,
// (2) that the *Info signatures all match correctly, (3) that the argument
// names match correctly, and that the whole graph is statically correct.
func (obj *Engine) Validate() error {
	inList := func(needle interfaces.Func, haystack []interfaces.Func) bool {
		if needle == nil {
			panic("nil value passed to inList") // catch bugs!
		}
		for _, x := range haystack {
			if needle == x {
				return true
			}
		}
		return false
	}
	var err error
	ptrs := []interfaces.Func{} // Func is a ptr
	for _, vertex := range obj.Graph.Vertices() {
		node := obj.state[vertex]
		// TODO: this doesn't work for facts because they're in the Func
		// duplicate pointers would get closed twice, causing a panic...
		if inList(node.handle, ptrs) { // check for duplicate ptrs!
			e := fmt.Errorf("vertex `%s` has duplicate ptr", vertex)
			err = multierr.Append(err, e)
		}
		ptrs = append(ptrs, node.handle)
	}
	for _, edge := range obj.Graph.Edges() {
		if _, ok := edge.(*Edge); !ok {
			e := fmt.Errorf("edge `%s` was not the correct type", edge)
			err = multierr.Append(err, e)
		}
	}
	if err != nil {
		return err // stage the errors so the user can fix many at once!
	}
	// check if vertices expecting inputs have them
	for vertex, count := range obj.Graph.InDegree() {
		node := obj.state[vertex]
		if exp := len(node.handle.Info().Sig.Ord); exp != count {
			e := fmt.Errorf("expected %d inputs to `%s`, got %d", exp, node, count)
			err = multierr.Append(err, e)
		}
	}
	// expected vertex -> argName
	expected := make(map[*State]map[string]int) // expected input fields
	for vertex1 := range obj.Graph.Adjacency() {
		// check for outputs that don't go anywhere?
		//node1 := obj.state[vertex1]
		//if len(obj.Graph.Adjacency()[vertex1]) == 0 { // no vertex1 -> vertex2
		//	if node1.handle.Info().Sig.Output != nil {
		//		// an output value goes nowhere...
		//	}
		//}
		for vertex2 := range obj.Graph.Adjacency()[vertex1] { // populate
			node2 := obj.state[vertex2]
			expected[node2] = make(map[string]int)
			for _, key := range node2.handle.Info().Sig.Ord {
				expected[node2][key] = 1
			}
		}
	}
	for vertex1 := range obj.Graph.Adjacency() {
		node1 := obj.state[vertex1]
		for vertex2, edge := range obj.Graph.Adjacency()[vertex1] {
			node2 := obj.state[vertex2]
			edge := edge.(*Edge)
			// check vertex1 -> vertex2 (with e) is valid
			for _, arg := range edge.Args { // loop over each arg
				sig := node2.handle.Info().Sig
				if len(sig.Ord) == 0 {
					e := fmt.Errorf("no input expected from `%s` to `%s` with arg `%s`", node1, node2, arg)
					err = multierr.Append(err, e)
					continue
				}
				if count, exists := expected[node2][arg]; !exists {
					e := fmt.Errorf("wrong input name from `%s` to `%s` with arg `%s`", node1, node2, arg)
					err = multierr.Append(err, e)
				} else if count == 0 {
					e := fmt.Errorf("duplicate input from `%s` to `%s` with arg `%s`", node1, node2, arg)
					err = multierr.Append(err, e)
				}
				expected[node2][arg]-- // subtract one use
				out := node1.handle.Info().Sig.Out
				if out == nil {
					e := fmt.Errorf("no output possible from `%s` to `%s` with arg `%s`", node1, node2, arg)
					err = multierr.Append(err, e)
					continue
				}
				typ, exists := sig.Map[arg] // key in struct
				if !exists {
					// second check of this!
					e := fmt.Errorf("wrong input name from `%s` to `%s` with arg `%s`", node1, node2, arg)
					err = multierr.Append(err, errwrap.Wrapf(e, "programming error"))
					continue
				}
				if typ.Kind == types.KindVariant { // FIXME: hack for now
					// pass (input arg variants)
				} else if out.Kind == types.KindVariant { // FIXME: hack for now
					// pass (output arg variants)
				} else if typ.Cmp(out) != nil {
					e := fmt.Errorf("type mismatch from `%s` (%s) to `%s` (%s) with arg `%s`", node1, out, node2, typ, arg)
					err = multierr.Append(err, e)
				}
			}
		}
	}
	// check for leftover function inputs which weren't filled up by outputs
	// (we're trying to call a function with fewer input args than required)
	for node, m := range expected { // map[*State]map[string]int
		for arg, count := range m {
			if count != 0 { // count should be zero if all were used
				e := fmt.Errorf("missing input to `%s` on arg `%s`", node, arg)
				err = multierr.Append(err, e)
			}
		}
	}
	if err != nil {
		return err // stage the errors so the user can fix many at once!
	}
	return nil
}

// Run starts up this function engine and gets it all running. It errors if the
// startup failed for some reason. On success, use the Stream and Table methods
// for future interaction with the engine, and the Close method to shut it off.
func (obj *Engine) Run() error {
	if len(obj.topologicalSort) == 0 { // no funcs to load!
		close(obj.loadedChan)
		close(obj.streamChan)
		return nil
	}
	// TODO: build a timer that runs while we wait for all funcs to startup.
	// after some delay print a message to tell us which funcs we're waiting
	// for to startup and that they are slow and blocking everyone, and then
	// fail permanently after the timeout so that bad code can't block this!
	// loop through all funcs that we might need
	obj.agAdd(len(obj.topologicalSort))
	for _, vertex := range obj.topologicalSort {
		node := obj.state[vertex]
		if obj.Debug {
			obj.Logf("Startup func `%s`", node)
		}
		incoming := obj.Graph.IncomingGraphVertices(vertex) // []Vertex
		init := &interfaces.Init{
			Hostname: obj.Hostname,
			Input: node.input,
			Output: node.output,
			World: obj.World,
			Debug: obj.Debug,
			Logf: func(format string, v ...interface{}) {
				obj.Logf("func: "+format, v...)
			},
		}
		if err := node.handle.Init(init); err != nil {
			return errwrap.Wrapf(err, "could not init func `%s`", node)
		}
		node.init = true // we've successfully initialized
		// no incoming edges, so no incoming data
		if len(incoming) == 0 { // TODO: do this here or earlier?
			close(node.input)
		} else {
			// process function input data
			obj.wg.Add(1)
			go func(vertex pgraph.Vertex) {
				node := obj.state[vertex]
				defer obj.wg.Done()
				defer close(node.input)
				var ready bool
				// the final closing output to this, closes this
				for range node.notify { // new input values
					// now build the struct if we can...
					ready = true // assume for now...
					si := &types.Type{
						// input to functions are structs
						Kind: types.KindStruct,
						Map: node.handle.Info().Sig.Map,
						Ord: node.handle.Info().Sig.Ord,
					}
					st := types.NewStruct(si)
					for _, v := range incoming {
						args := obj.Graph.Adjacency()[v][vertex].(*Edge).Args
						from := obj.state[v]
						obj.mutex.RLock()
						value, exists := obj.table[v]
						obj.mutex.RUnlock()
						if !exists {
							ready = false // nope!
							break
						}
						// set each arg, since one value
						// could get used for multiple
						// function inputs (shared edge)
						for _, arg := range args {
							err := st.Set(arg, value) // populate struct
							if err != nil {
								panic(fmt.Sprintf("struct set failure on `%s` from `%s`: %v", node, from, err))
							}
						}
					}
					if !ready {
						continue
					}
					select {
					case node.input <- st: // send to function
					case <-obj.closeChan:
						return
					}
				}
			}(vertex)
		}
		obj.wg.Add(1)
		go func(vertex pgraph.Vertex) { // run function
			node := obj.state[vertex]
			defer obj.wg.Done()
			if obj.Debug {
				obj.Logf("Running func `%s`", node)
			}
			err := node.handle.Stream()
			if obj.Debug {
				obj.Logf("Exiting func `%s`", node)
			}
			if err != nil {
				// we closed with an error...
				err := errwrap.Wrapf(err, "problem streaming func `%s`", node)
				select {
				case obj.ag <- err: // send to aggregate channel
				case <-obj.closeChan:
					return
				}
			}
		}(vertex)
		obj.wg.Add(1)
		go func(vertex pgraph.Vertex) { // process function output data
			node := obj.state[vertex]
			defer obj.wg.Done()
			defer obj.agDone(vertex)
			outgoing := obj.Graph.OutgoingGraphVertices(vertex) // []Vertex
			for value := range node.output { // read from channel
				obj.mutex.RLock()
				cached, exists := obj.table[vertex]
				obj.mutex.RUnlock()
				if !exists { // first value received
					// RACE: do this AFTER value is present!
					//node.loaded = true // not yet please
					obj.Logf("func `%s` started", node)
				} else if value.Cmp(cached) == nil {
					// skip if new value is same as previous
					// if this happens often, it *might* be
					// a bug in the function implementation
					// FIXME: do we need to disable engine
					// caching when using hysteresis?
					obj.Logf("func `%s` skipped", node)
					continue
				}
				obj.mutex.Lock()
				// XXX: maybe we can get rid of the table...
				obj.table[vertex] = value // save the latest
				if err := node.Expr.SetValue(value); err != nil {
					panic(fmt.Sprintf("could not set value for `%s`: %+v", node, err))
				}
				node.loaded = true // set *after* value is in :)
				obj.mutex.Unlock()
				obj.Logf("func `%s` changed", node)
				// FIXME: will this actually prevent glitching?
				// if we only notify the aggregate channel when
				// we're at the bottom of the topo sort (eg: no
				// outgoing vertices to notify) then we'll have
				// a glitch free subtree in the programs ast...
				if obj.Glitch || len(outgoing) == 0 {
					select {
					case obj.ag <- nil: // send to aggregate channel
					case <-obj.closeChan:
						return
					}
				}
				// notify the receiving vertices
				for _, v := range outgoing {
					node := obj.state[v]
					select {
					case node.notify <- struct{}{}:
					case <-obj.closeChan:
						return
					}
				}
			}
			// no more output values are coming...
			obj.Logf("func `%s` stopped", node)
		}(vertex)
	}
	// send event on streamChan when any of the (aggregated) facts change
	obj.wg.Add(1)
	go func() {
		defer obj.wg.Done()
		defer close(obj.streamChan)
	Loop:
		for {
			var err error
			var ok bool
			select {
			case err, ok = <-obj.ag: // aggregated channel
				if !ok {
					break Loop // channel shutdown
				}
				if !obj.loaded {
					// now check if we're ready
					var loaded = true // initially assume true
					for _, vertex := range obj.topologicalSort {
						node := obj.state[vertex]
						if !node.loaded {
							loaded = false // we were wrong
							break
						}
					}
					obj.loaded = loaded
					if obj.loaded {
						// this causes an initial start
						// signal to be sent out below,
						// since the stream sender runs
						if obj.Debug {
							obj.Logf("loaded")
						}
						close(obj.loadedChan) // signal
					} else {
						if err == nil {
							continue // not ready to send signal
						} // pass errors through...
					}
				}
			case <-obj.closeChan:
				return
			}
			// send stream signal
			select {
			// send events or errors on streamChan, eg: func failure
			case obj.streamChan <- err: // send
				if err != nil {
					return
				}
			case <-obj.closeChan:
				return
			}
		}
	}()
	return nil
}

// agAdd registers a user on the ag channel.
func (obj *Engine) agAdd(i int) {
	defer obj.agLock.Unlock()
	obj.agLock.Lock()
	obj.agCount += i
}

// agDone closes the channel if we're the last one using it.
func (obj *Engine) agDone(vertex pgraph.Vertex) {
	defer obj.agLock.Unlock()
	obj.agLock.Lock()
	node := obj.state[vertex]
	node.closed = true
	// FIXME: (perf) cache this into a table which we narrow down with each
	// successive call. look at the outgoing vertices that I would affect...
	for _, v := range obj.Graph.OutgoingGraphVertices(vertex) { // close for each one
		// now determine who provides inputs to that vertex...
		var closed = true
		for _, vv := range obj.Graph.IncomingGraphVertices(v) {
			// are they all closed?
			if !obj.state[vv].closed {
				closed = false
				break
			}
		}
		if closed { // if they're all closed, we can close the input
			close(obj.state[v].notify)
		}
	}
	if obj.agCount == 0 {
		close(obj.ag)
	}
}

// Stream returns a channel of engine events. Wait for nil events to know when
// the Table map has changed. An error event means this will shutdown shortly.
// Do not run the Table function before we've received one non-error event.
func (obj *Engine) Stream() chan error {
	return obj.streamChan
}

// Close shuts down the function engine. It waits till everything has finished.
func (obj *Engine) Close() error {
	var err error
	for _, vertex := range obj.topologicalSort { // FIXME: should we do this in reverse?
		node := obj.state[vertex]
		if node.init { // did we Init this func?
			if e := node.handle.Close(); e != nil {
				e := errwrap.Wrapf(e, "problem closing func `%s`", node)
				err = multierr.Append(err, e) // list of errors
			}
		}
	}
	close(obj.closeChan)
	obj.wg.Wait() // wait so that each func doesn't need to do this in close
	return err
}
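
For context, here is a hedged sketch of how a caller might drive this engine, based only on the methods shown in this file (Init, Validate, Run, Stream, Close) plus the Table accessor that Stream's doc comment refers to. The Table signature and the way the expression graph gets built beforehand are assumptions, not something this file shows.

// usage_sketch.go: driving the function engine; a sketch under the
// assumptions stated above, not code from the mgmt project itself.
package main

import (
	"log"

	"github.com/purpleidea/mgmt/lang/funcs"
	"github.com/purpleidea/mgmt/pgraph"
	"github.com/purpleidea/mgmt/resources"
)

// runGraph drives an Engine over a pre-built expression graph until the
// stream closes or an error event arrives.
func runGraph(g *pgraph.Graph, hostname string, world resources.World) error {
	engine := &funcs.Engine{
		Graph:    g,
		Hostname: hostname,
		World:    world,
		Debug:    false,
		Logf: func(format string, v ...interface{}) {
			log.Printf("funcs: "+format, v...)
		},
	}
	if err := engine.Init(); err != nil { // load and Init every func
		return err
	}
	if err := engine.Validate(); err != nil { // static graph and type checks
		return err
	}
	if err := engine.Run(); err != nil { // start the per-func goroutines
		return err
	}
	defer engine.Close() // in a real program a signal handler would call this

	// Per Stream's doc comment, the first non-error event means every func
	// has produced at least one value, so only then is the table read.
	for err := range engine.Stream() {
		if err != nil {
			return err // the engine will shut down shortly
		}
		for vertex, value := range engine.Table() { // assumed accessor, see above
			log.Printf("%v = %v", vertex, value)
		}
	}
	return nil
}

func main() {
	// Building the *pgraph.Graph of interfaces.Expr vertices is out of
	// scope for this sketch; see the lang package for how that happens.
}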