From e8855f7621cd67375a65400c6e0d96bffc8214be Mon Sep 17 00:00:00 2001 From: Julien Pivotto Date: Sun, 12 Feb 2017 21:24:58 +0100 Subject: [PATCH] prometheus: Implement mgmt_checkapply_total metric Signed-off-by: Julien Pivotto --- lib/main.go | 7 +++--- pgraph/actions.go | 6 +++++ prometheus/prometheus.go | 28 +++++++++++++++++++++ resources/resources.go | 48 ++++++++++++++++++++++-------------- test/shell/prometheus-3.sh | 20 +++++++++++++++ test/shell/prometheus-3.yaml | 26 +++++++++++++++++++ 6 files changed, 113 insertions(+), 22 deletions(-) create mode 100755 test/shell/prometheus-3.sh create mode 100644 test/shell/prometheus-3.yaml diff --git a/lib/main.go b/lib/main.go index 12ba57c9..204b37c9 100644 --- a/lib/main.go +++ b/lib/main.go @@ -438,9 +438,10 @@ func (obj *Main) Run() error { newGraph.Flags = pgraph.Flags{Debug: obj.Flags.Debug} // pass in the information we need newGraph.AssociateData(&resources.Data{ - Converger: converger, - Prefix: pgraphPrefix, - Debug: obj.Flags.Debug, + Converger: converger, + Prometheus: prom, + Prefix: pgraphPrefix, + Debug: obj.Flags.Debug, }) // apply the global noop parameter if requested diff --git a/pgraph/actions.go b/pgraph/actions.go index ac529e93..fd82d566 100644 --- a/pgraph/actions.go +++ b/pgraph/actions.go @@ -224,6 +224,12 @@ func (g *Graph) Process(v *Vertex) error { // if this fails, don't UpdateTimestamp() checkOK, err = obj.CheckApply(!noop) + if obj.Prometheus() != nil { + if promErr := obj.Prometheus().UpdateCheckApplyTotal(obj.Kind(), !noop, !checkOK, err != nil); promErr != nil { + // TODO: how to error correctly + log.Printf("%s[%s]: Prometheus.UpdateCheckApplyTotal() errored: %v", v.Kind(), v.GetName(), promErr) + } + } // TODO: Can the `Poll` converged timeout tracking be a // more general method for all converged timeouts? 
this // would simplify the resources by removing boilerplate diff --git a/prometheus/prometheus.go b/prometheus/prometheus.go index 03f25a79..c2edb412 100644 --- a/prometheus/prometheus.go +++ b/prometheus/prometheus.go @@ -21,7 +21,9 @@ package prometheus import ( "net/http" + "strconv" + "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus/promhttp" ) @@ -33,6 +35,9 @@ const DefaultPrometheusListen = "127.0.0.1:9233" // prometheus instance. Run Init() on it. type Prometheus struct { Listen string // the listen specification for the net/http server + + checkApplyTotal *prometheus.CounterVec // total of CheckApplies that have been triggered + } // Init some parameters - currently the Listen address. @@ -40,6 +45,20 @@ func (obj *Prometheus) Init() error { if len(obj.Listen) == 0 { obj.Listen = DefaultPrometheusListen } + obj.checkApplyTotal = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Name: "mgmt_checkapply_total", + Help: "Number of CheckApply that have run.", + }, + // Labels for this metric. + // kind: resource type: Svc, File, ... + // apply: if the CheckApply happened in "apply" mode + // eventful: did the CheckApply generate an event + // errorful: did the CheckApply generate an error + []string{"kind", "apply", "eventful", "errorful"}, + ) + prometheus.MustRegister(obj.checkApplyTotal) + return nil } @@ -57,3 +76,12 @@ func (obj *Prometheus) Stop() error { // https://stackoverflow.com/questions/39320025/go-how-to-stop-http-listenandserve/41433555#41433555 return nil } + +// UpdateCheckApplyTotal increments the mgmt_checkapply_total counter for the +// given kind, apply, eventful and errorful label values. 
+func (obj *Prometheus) UpdateCheckApplyTotal(kind string, apply, eventful, errorful bool) error { + labels := prometheus.Labels{"kind": kind, "apply": strconv.FormatBool(apply), "eventful": strconv.FormatBool(eventful), "errorful": strconv.FormatBool(errorful)} + metric := obj.checkApplyTotal.With(labels) + metric.Inc() + return nil +} diff --git a/resources/resources.go b/resources/resources.go index 93f15a4f..346ae25b 100644 --- a/resources/resources.go +++ b/resources/resources.go @@ -33,6 +33,7 @@ import ( // TODO: should each resource be a sub-package? "github.com/purpleidea/mgmt/converger" "github.com/purpleidea/mgmt/event" + "github.com/purpleidea/mgmt/prometheus" errwrap "github.com/pkg/errors" "golang.org/x/time/rate" @@ -57,9 +58,10 @@ const refreshPathToken = "refresh" type Data struct { //Hostname string // uuid for the host //Noop bool - Converger converger.Converger - Prefix string // the prefix to be used for the pgraph namespace - Debug bool + Converger converger.Converger + Prometheus *prometheus.Prometheus + Prefix string // the prefix to be used for the pgraph namespace + Debug bool // NOTE: we can add more fields here if needed for the resources. } @@ -164,6 +166,7 @@ type Base interface { Started() <-chan struct{} // returns when the resource has started Starter(bool) Poll(chan *event.Event) error // poll alternative to watching :( + Prometheus() *prometheus.Prometheus } // Res is the minimum interface you need to implement to define a new resource. @@ -188,22 +191,23 @@ type BaseRes struct { MetaParams MetaParams `yaml:"meta"` // struct of all the metaparams Recv map[string]*Send // mapping of key to receive on from value - kind string - mutex *sync.Mutex // locks around sending and closing of events channel - events chan *event.Event - converger converger.Converger // converged tracking - cuid converger.ConvergerUID - prefix string // base prefix for this resource - debug bool - state ResState - working bool // is the Worker() loop running ? 
- started chan struct{} // closed when worker is started/running - isStarted bool // did the started chan already close? - starter bool // does this have indegree == 0 ? XXX: usually? - isStateOK bool // whether the state is okay based on events or not - isGrouped bool // am i contained within a group? - grouped []Res // list of any grouped resources - refresh bool // does this resource have a refresh to run? + kind string + mutex *sync.Mutex // locks around sending and closing of events channel + events chan *event.Event + converger converger.Converger // converged tracking + cuid converger.ConvergerUID + prometheus *prometheus.Prometheus + prefix string // base prefix for this resource + debug bool + state ResState + working bool // is the Worker() loop running ? + started chan struct{} // closed when worker is started/running + isStarted bool // did the started chan already close? + starter bool // does this have indegree == 0 ? XXX: usually? + isStateOK bool // whether the state is okay based on events or not + isGrouped bool // am i contained within a group? + grouped []Res // list of any grouped resources + refresh bool // does this resource have a refresh to run? //refreshState StatefulBool // TODO: future stateful bool } @@ -348,6 +352,7 @@ func (obj *BaseRes) Events() chan *event.Event { // AssociateData associates some data with the object in question. func (obj *BaseRes) AssociateData(data *Data) { obj.converger = data.Converger + obj.prometheus = data.Prometheus obj.prefix = data.Prefix obj.debug = data.Debug } @@ -561,6 +566,11 @@ func (obj *BaseRes) Poll(processChan chan *event.Event) error { } } +// Prometheus returns the prometheus instance. 
+func (obj *BaseRes) Prometheus() *prometheus.Prometheus { + return obj.prometheus +} + // ResToB64 encodes a resource to a base64 encoded string (after serialization) func ResToB64(res Res) (string, error) { b := bytes.Buffer{} diff --git a/test/shell/prometheus-3.sh b/test/shell/prometheus-3.sh new file mode 100755 index 00000000..eedb8bfe --- /dev/null +++ b/test/shell/prometheus-3.sh @@ -0,0 +1,20 @@ +#!/bin/bash -e + +# run a graph of four file resources (prometheus-3.yaml), with prometheus support +timeout --kill-after=20s 15s ./mgmt run --tmp-prefix --prometheus --yaml prometheus-3.yaml & +pid=$! +sleep 10s # let it converge + +# For test debugging purposes +curl 127.0.0.1:9233/metrics + +# Three CheckApplies for a File in apply mode, with events +curl 127.0.0.1:9233/metrics | grep '^mgmt_checkapply_total{apply="true",errorful="false",eventful="true",kind="File"} 3$' + +# One CheckApply for a File in noop mode. +curl 127.0.0.1:9233/metrics | grep '^mgmt_checkapply_total{apply="false",errorful="false",eventful="true",kind="File"} 1$' + + +killall -SIGINT mgmt # send ^C to exit mgmt +wait $pid # get exit status +exit $? diff --git a/test/shell/prometheus-3.yaml b/test/shell/prometheus-3.yaml new file mode 100644 index 00000000..1ef2cd04 --- /dev/null +++ b/test/shell/prometheus-3.yaml @@ -0,0 +1,26 @@ +--- +graph: mygraph +resources: + file: + - name: file1 + path: "/tmp/mgmt/f1" + content: | + i am f1 + state: exists + - name: file2 + path: "/tmp/mgmt/f2" + content: | + i am f2 + state: exists + - name: file3 + path: "/tmp/mgmt/f3" + content: | + i am f3 + state: exists + - name: file4 + path: "/tmp/mgmt/f4" + content: | + i am f4 + state: exists + meta: + noop: true