From 33d20ac6d84de43e13889f064ba3b89987307feb Mon Sep 17 00:00:00 2001
From: Julien Pivotto
Date: Tue, 28 Feb 2017 22:11:46 +0100
Subject: [PATCH] prometheus: Add detailed metrics

Signed-off-by: Julien Pivotto
---
 pgraph/actions.go            |  10 +++
 prometheus/prometheus.go     | 154 +++++++++++++++++++++++++++++++++++
 resources/resources.go       |  12 ++-
 test/shell/prometheus-3.sh   |   2 -
 test/shell/prometheus-4.sh   |  35 ++++++++
 test/shell/prometheus-4.yaml |  29 +++++++
 6 files changed, 237 insertions(+), 5 deletions(-)
 create mode 100755 test/shell/prometheus-4.sh
 create mode 100644 test/shell/prometheus-4.yaml

diff --git a/pgraph/actions.go b/pgraph/actions.go
index 7a1a2d13..8db259c6 100644
--- a/pgraph/actions.go
+++ b/pgraph/actions.go
@@ -26,6 +26,7 @@ import (
 	"time"

 	"github.com/purpleidea/mgmt/event"
+	"github.com/purpleidea/mgmt/prometheus"
 	"github.com/purpleidea/mgmt/resources"

 	multierr "github.com/hashicorp/go-multierror"
@@ -430,6 +431,11 @@ Loop:
 				playback = true
 				log.Printf("%s[%s]: CheckApply errored: %v", v.Kind(), v.GetName(), e)
 				if retry == 0 {
+					if err := obj.Prometheus().UpdateState(fmt.Sprintf("%v[%v]", v.Kind(), v.GetName()), v.Kind(), prometheus.ResStateHardFail); err != nil {
+						// TODO: how to error this?
+						log.Printf("%s[%s]: Prometheus.UpdateState() errored: %v", v.Kind(), v.GetName(), err)
+					}
+
 					// wrap the error in the sentinel
 					v.Res.QuiesceGroup().Done() // before the Wait that happens in SendEvent!
 					v.SendEvent(event.EventExit, &SentinelErr{e})
@@ -438,6 +444,10 @@ Loop:
 				if retry > 0 { // don't decrement the -1
 					retry--
 				}
+				if err := obj.Prometheus().UpdateState(fmt.Sprintf("%v[%v]", v.Kind(), v.GetName()), v.Kind(), prometheus.ResStateSoftFail); err != nil {
+					// TODO: how to error this?
+					log.Printf("%s[%s]: Prometheus.UpdateState() errored: %v", v.Kind(), v.GetName(), err)
+				}
 				log.Printf("%s[%s]: CheckApply: Retrying after %.4f seconds (%d left)", v.Kind(), v.GetName(), delay.Seconds(), retry)
 				// start the timer...
 				timer.Reset(delay)
diff --git a/prometheus/prometheus.go b/prometheus/prometheus.go
index 34e5cf0e..9297e182 100644
--- a/prometheus/prometheus.go
+++ b/prometheus/prometheus.go
@@ -20,17 +20,33 @@ package prometheus

 import (
+	"errors"
 	"net/http"
 	"strconv"
+	"sync"

 	"github.com/prometheus/client_golang/prometheus"
 	"github.com/prometheus/client_golang/prometheus/promhttp"
+
+	errwrap "github.com/pkg/errors"
 )

 // DefaultPrometheusListen is registered in
 // https://github.com/prometheus/prometheus/wiki/Default-port-allocations
 const DefaultPrometheusListen = "127.0.0.1:9233"

+// ResState represents the status of a resource.
+type ResState int
+
+const (
+	// ResStateOK represents a working resource
+	ResStateOK ResState = iota
+	// ResStateSoftFail represents a resource in soft fail (will be retried)
+	ResStateSoftFail
+	// ResStateHardFail represents a resource in hard fail (will NOT be retried)
+	ResStateHardFail
+)
+
 // Prometheus is the struct that contains information about the
 // prometheus instance. Run Init() on it.
 type Prometheus struct {
@@ -38,7 +54,18 @@ type Prometheus struct {
 	checkApplyTotal        *prometheus.CounterVec // total of CheckApplies that have been triggered
 	pgraphStartTimeSeconds prometheus.Gauge       // process start time in seconds since unix epoch
+	managedResources       *prometheus.GaugeVec   // Resources we manage now
+	failedResourcesTotal   *prometheus.CounterVec // Total of failures since mgmt has started
+	failedResources        *prometheus.GaugeVec   // Number of currently failing resources
+	resourcesState         map[string]resStateWithKind // Maps resources to their current kind and state
+	mutex                  *sync.Mutex                 // Mutex used to update resourcesState
+}
+
+// resStateWithKind is used to count the failures by kind
+type resStateWithKind struct {
+	state ResState
+	kind  string
 }

 // Init some parameters - currently the Listen address.
@@ -46,6 +73,10 @@ func (obj *Prometheus) Init() error {
 	if len(obj.Listen) == 0 {
 		obj.Listen = DefaultPrometheusListen
 	}
+
+	obj.mutex = &sync.Mutex{}
+	obj.resourcesState = make(map[string]resStateWithKind)
+
 	obj.checkApplyTotal = prometheus.NewCounterVec(
 		prometheus.CounterOpts{
 			Name: "mgmt_checkapply_total",
@@ -68,6 +99,38 @@ func (obj *Prometheus) Init() error {
 	)
 	prometheus.MustRegister(obj.pgraphStartTimeSeconds)

+	obj.managedResources = prometheus.NewGaugeVec(
+		prometheus.GaugeOpts{
+			Name: "mgmt_resources",
+			Help: "Number of managed resources.",
+		},
+		// kind: resource type: Svc, File, ...
+		[]string{"kind"},
+	)
+	prometheus.MustRegister(obj.managedResources)
+
+	obj.failedResourcesTotal = prometheus.NewCounterVec(
+		prometheus.CounterOpts{
+			Name: "mgmt_failures_total",
+			Help: "Total of failed resources.",
+		},
+		// kind: resource type: Svc, File, ...
+		// failure: soft or hard
+		[]string{"kind", "failure"},
+	)
+	prometheus.MustRegister(obj.failedResourcesTotal)
+
+	obj.failedResources = prometheus.NewGaugeVec(
+		prometheus.GaugeOpts{
+			Name: "mgmt_failures",
+			Help: "Number of failing resources.",
+		},
+		// kind: resource type: Svc, File, ...
+		// failure: soft or hard
+		[]string{"kind", "failure"},
+	)
+	prometheus.MustRegister(obj.failedResources)
+
 	return nil
 }
@@ -107,3 +170,94 @@ func (obj *Prometheus) UpdatePgraphStartTime() error {
 	obj.pgraphStartTimeSeconds.SetToCurrentTime()
 	return nil
 }
+
+// AddManagedResource increments the Managed Resource counter and updates the resource status.
+func (obj *Prometheus) AddManagedResource(resUUID string, rtype string) error {
+	if obj == nil {
+		return nil // happens when mgmt is launched without --prometheus
+	}
+	obj.managedResources.With(prometheus.Labels{"kind": rtype}).Inc()
+	if err := obj.UpdateState(resUUID, rtype, ResStateOK); err != nil {
+		return errwrap.Wrapf(err, "can't update the resource status in the map")
+	}
+	return nil
+}
+
+// RemoveManagedResource decrements the Managed Resource counter and updates the resource status.
+func (obj *Prometheus) RemoveManagedResource(resUUID string, rtype string) error {
+	if obj == nil {
+		return nil // happens when mgmt is launched without --prometheus
+	}
+	obj.managedResources.With(prometheus.Labels{"kind": rtype}).Dec()
+	if err := obj.deleteState(resUUID); err != nil {
+		return errwrap.Wrapf(err, "can't remove the resource status from the map")
+	}
+	return nil
+}
+
+// deleteState removes the resource from the state map and re-populates the failing gauge.
+func (obj *Prometheus) deleteState(resUUID string) error {
+	if obj == nil {
+		return nil // happens when mgmt is launched without --prometheus
+	}
+	obj.mutex.Lock()
+	delete(obj.resourcesState, resUUID)
+	obj.mutex.Unlock()
+	if err := obj.updateFailingGauge(); err != nil {
+		return errwrap.Wrapf(err, "can't update the failing gauge")
+	}
+	return nil
+}
+
+// UpdateState updates the state of the resource in our internal state map,
+// then triggers a refresh of the failing gauge.
+func (obj *Prometheus) UpdateState(resUUID string, rtype string, newState ResState) error {
+	defer obj.updateFailingGauge()
+	if obj == nil {
+		return nil // happens when mgmt is launched without --prometheus
+	}
+	obj.mutex.Lock()
+	obj.resourcesState[resUUID] = resStateWithKind{state: newState, kind: rtype}
+	obj.mutex.Unlock()
+	if newState != ResStateOK {
+		var strState string
+		if newState == ResStateSoftFail {
+			strState = "soft"
+		} else if newState == ResStateHardFail {
+			strState = "hard"
+		} else {
+			return errors.New("state should be soft or hard failure")
+		}
+		obj.failedResourcesTotal.With(prometheus.Labels{"kind": rtype, "failure": strState}).Inc()
+	}
+	return nil
+}
+
+// updateFailingGauge refreshes the failing gauge by parsing the internal
+// state map.
+func (obj *Prometheus) updateFailingGauge() error {
+	if obj == nil {
+		return nil // happens when mgmt is launched without --prometheus
+	}
+	var softFails, hardFails map[string]float64
+	softFails = make(map[string]float64)
+	hardFails = make(map[string]float64)
+	for _, v := range obj.resourcesState {
+		if v.state == ResStateSoftFail {
+			softFails[v.kind]++
+		} else if v.state == ResStateHardFail {
+			hardFails[v.kind]++
+		}
+	}
+	// TODO: we might want to Zero the metrics we are not using,
+	// because in the prometheus design, metrics keep living for some time
+	// even after they are removed.
+	obj.failedResources.Reset()
+	for k, v := range softFails {
+		obj.failedResources.With(prometheus.Labels{"kind": k, "failure": "soft"}).Set(v)
+	}
+	for k, v := range hardFails {
+		obj.failedResources.With(prometheus.Labels{"kind": k, "failure": "hard"}).Set(v)
+	}
+	return nil
+}
diff --git a/resources/resources.go b/resources/resources.go
index 97b5e452..b518e1b9 100644
--- a/resources/resources.go
+++ b/resources/resources.go
@@ -249,8 +249,6 @@ type BaseRes struct {
 	refresh bool // does this resource have a refresh to run?
 	//refreshState StatefulBool // TODO: future stateful bool
-
-	prometheus *prometheus.Prometheus
 }

 // UnmarshalYAML is the custom unmarshal handler for the BaseRes struct. It is
@@ -366,6 +364,10 @@ func (obj *BaseRes) Init() error {
 	// TODO: this StatefulBool implementation could be eventually swappable
 	//obj.refreshState = &DiskBool{Path: path.Join(dir, refreshPathToken)}

+	if err := obj.Prometheus().AddManagedResource(fmt.Sprintf("%v[%v]", obj.Kind(), obj.GetName()), obj.Kind()); err != nil {
+		return errwrap.Wrapf(err, "could not increase prometheus counter!")
+	}
+
 	return nil
 }
@@ -383,6 +385,10 @@ func (obj *BaseRes) Close() error {
 	close(obj.stopped)
 	obj.waitGroup.Done()

+	if err := obj.Prometheus().RemoveManagedResource(fmt.Sprintf("%v[%v]", obj.Kind(), obj.GetName()), obj.kind); err != nil {
+		return errwrap.Wrapf(err, "could not decrease prometheus counter!")
+	}
+
 	return nil
 }
@@ -684,5 +690,5 @@ func (obj *BaseRes) Poll() error {

 // Prometheus returns the prometheus instance.
 func (obj *BaseRes) Prometheus() *prometheus.Prometheus {
-	return obj.prometheus
+	return obj.Data().Prometheus
 }
diff --git a/test/shell/prometheus-3.sh b/test/shell/prometheus-3.sh
index 8c029f3d..8be528d7 100755
--- a/test/shell/prometheus-3.sh
+++ b/test/shell/prometheus-3.sh
@@ -1,7 +1,5 @@
 #!/bin/bash -e

-exit 0 # FIXME: disabled until intermittent failures can be resolved
-
 # run a graph, with prometheus support
 timeout --kill-after=30s 25s ./mgmt run --tmp-prefix --no-pgp --prometheus --yaml prometheus-3.yaml &
 pid=$!
diff --git a/test/shell/prometheus-4.sh b/test/shell/prometheus-4.sh
new file mode 100755
index 00000000..43f6df43
--- /dev/null
+++ b/test/shell/prometheus-4.sh
@@ -0,0 +1,35 @@
+#!/bin/bash -xe
+
+# run a graph, with prometheus support
+timeout --kill-after=30s 25s ./mgmt run --tmp-prefix --no-pgp --prometheus --yaml prometheus-4.yaml &
+pid=$!
+sleep 10s # let it converge
+
+# For test debugging purposes
+curl 127.0.0.1:9233/metrics
+
+# Check for mgmt_resources
+curl 127.0.0.1:9233/metrics | grep '^mgmt_resources{kind="file"} 4$'
+
+# One CheckApply for a File; in noop mode.
+curl 127.0.0.1:9233/metrics | grep 'mgmt_checkapply_total{apply="false",errorful="false",eventful="true",kind="file"} 1$'
+
+# Two CheckApplies for a File; without errors, with events
+curl 127.0.0.1:9233/metrics | grep 'mgmt_checkapply_total{apply="true",errorful="false",eventful="true",kind="file"} 2$'
+
+# Multiple CheckApplies with errors
+curl 127.0.0.1:9233/metrics | grep 'mgmt_checkapply_total{apply="true",errorful="true",eventful="true",kind="file"} [0-9]\+'
+
+# One soft failure at the moment
+curl 127.0.0.1:9233/metrics | grep 'mgmt_failures{failure="soft",kind="file"} 1$'
+
+# Multiple soft failures since startup
+if curl 127.0.0.1:9233/metrics | grep 'mgmt_failures_total{failure="soft",kind="file"} 1$'
+then
+	false
+fi
+curl 127.0.0.1:9233/metrics | grep 'mgmt_failures_total{failure="soft",kind="file"} [0-9]\+'
+
+killall -SIGINT mgmt # send ^C to exit mgmt
+wait $pid # get exit status
+exit $?
diff --git a/test/shell/prometheus-4.yaml b/test/shell/prometheus-4.yaml
new file mode 100644
index 00000000..4219aa4e
--- /dev/null
+++ b/test/shell/prometheus-4.yaml
@@ -0,0 +1,29 @@
+---
+graph: mygraph
+resources:
+  file:
+  - name: file1
+    path: "/tmp/mgmt/NONEXIST/f1"
+    content: |
+      i am f1
+    state: exists
+    meta:
+      retry: -1
+      delay: 1000
+  - name: file2
+    path: "/tmp/mgmt/f2"
+    content: |
+      i am f2
+    state: exists
+  - name: file3
+    path: "/tmp/mgmt/f3"
+    content: |
+      i am f3
+    state: exists
+  - name: file4
+    path: "/tmp/mgmt/f4"
+    content: |
+      i am f4
+    state: exists
+    meta:
+      noop: true
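
For reviewers, here is a minimal, standalone sketch (not part of the patch) of how the new state-tracking API added above might be exercised. The "package main" wrapper, the log-based error handling, and the explicit listen address are illustrative assumptions; the "file[file1]" identifier mirrors the fmt.Sprintf("%v[%v]", Kind, Name) format used in the diff, and inside mgmt these calls are made from BaseRes.Init()/Close() and pgraph/actions.go rather than by hand.

package main

import (
	"log"

	"github.com/purpleidea/mgmt/prometheus"
)

func main() {
	// Assumed wiring: mgmt normally builds this from the --prometheus flag.
	prom := &prometheus.Prometheus{Listen: "127.0.0.1:9233"}
	if err := prom.Init(); err != nil { // registers the new metric vectors
		log.Fatalf("init failed: %v", err)
	}

	// A file resource comes under management:
	// mgmt_resources{kind="file"} is incremented and its state is set to OK.
	if err := prom.AddManagedResource(`file[file1]`, "file"); err != nil {
		log.Printf("add failed: %v", err)
	}

	// A retryable CheckApply error: mgmt_failures_total{kind="file",failure="soft"}
	// is incremented and mgmt_failures{kind="file",failure="soft"} is refreshed to 1.
	if err := prom.UpdateState(`file[file1]`, "file", prometheus.ResStateSoftFail); err != nil {
		log.Printf("update failed: %v", err)
	}

	// The resource is closed: mgmt_resources is decremented and its
	// entry is removed from the internal state map.
	if err := prom.RemoveManagedResource(`file[file1]`, "file"); err != nil {
		log.Printf("remove failed: %v", err)
	}
}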