prometheus: Add detailed metrics

Signed-off-by: Julien Pivotto <roidelapluie@inuits.eu>
Julien Pivotto
2017-02-28 22:11:46 +01:00
parent 660554cc45
commit 33d20ac6d8
6 changed files with 237 additions and 5 deletions


@@ -26,6 +26,7 @@ import (
"time"
"github.com/purpleidea/mgmt/event"
"github.com/purpleidea/mgmt/prometheus"
"github.com/purpleidea/mgmt/resources"
multierr "github.com/hashicorp/go-multierror"
@@ -430,6 +431,11 @@ Loop:
playback = true
log.Printf("%s[%s]: CheckApply errored: %v", v.Kind(), v.GetName(), e)
if retry == 0 {
if err := obj.Prometheus().UpdateState(fmt.Sprintf("%v[%v]", v.Kind(), v.GetName()), v.Kind(), prometheus.ResStateHardFail); err != nil {
// TODO: how to error this?
log.Printf("%s[%s]: Prometheus.UpdateState() errored: %v", v.Kind(), v.GetName(), err)
}
// wrap the error in the sentinel
v.Res.QuiesceGroup().Done() // before the Wait that happens in SendEvent!
v.SendEvent(event.EventExit, &SentinelErr{e})
@@ -438,6 +444,10 @@ Loop:
if retry > 0 { // don't decrement the -1
retry--
}
if err := obj.Prometheus().UpdateState(fmt.Sprintf("%v[%v]", v.Kind(), v.GetName()), v.Kind(), prometheus.ResStateSoftFail); err != nil {
// TODO: how to error this?
log.Printf("%s[%s]: Prometheus.UpdateState() errored: %v", v.Kind(), v.GetName(), err)
}
log.Printf("%s[%s]: CheckApply: Retrying after %.4f seconds (%d left)", v.Kind(), v.GetName(), delay.Seconds(), retry)
// start the timer...
timer.Reset(delay)
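For readability, the decision these two hunks add to the worker loop can be summed up in the sketch below. recordCheckApplyFailure is a hypothetical helper, not part of this commit; it only assumes the same github.com/purpleidea/mgmt/prometheus import added above. A hard failure is reported only once the retry budget hits zero; every other failed attempt (including retry == -1, i.e. retry forever) is reported as a soft failure before the backoff timer is reset.

// recordCheckApplyFailure is a hypothetical helper (illustration only): it
// mirrors the branch added by the hunks above, reporting a hard failure only
// when the retry budget is exhausted, and a soft failure for every other
// failed attempt, including retry == -1 which means "retry forever".
func recordCheckApplyFailure(prom *prometheus.Prometheus, resName, kind string, retry int) error {
    state := prometheus.ResStateSoftFail
    if retry == 0 {
        state = prometheus.ResStateHardFail // no retries left: terminal failure
    }
    return prom.UpdateState(resName, kind, state)
}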


@@ -20,17 +20,33 @@
package prometheus
import (
"errors"
"net/http"
"strconv"
"sync"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promhttp"
errwrap "github.com/pkg/errors"
)
// DefaultPrometheusListen is registered in
// https://github.com/prometheus/prometheus/wiki/Default-port-allocations
const DefaultPrometheusListen = "127.0.0.1:9233"
// ResState represents the status of a resource.
type ResState int
const (
// ResStateOK represents a working resource
ResStateOK ResState = iota
// ResStateSoftFail represents a resource in soft fail (will be retried)
ResStateSoftFail
// ResStateHardFail represents a resource in hard fail (will NOT be retried)
ResStateHardFail
)
// Prometheus is the struct that contains information about the
// prometheus instance. Run Init() on it.
type Prometheus struct {
@@ -38,7 +54,18 @@ type Prometheus struct {
checkApplyTotal *prometheus.CounterVec // total of CheckApplies that have been triggered
pgraphStartTimeSeconds prometheus.Gauge // process start time in seconds since unix epoch
managedResources *prometheus.GaugeVec // Resources we manage now
failedResourcesTotal *prometheus.CounterVec // Total of failures since mgmt has started
failedResources *prometheus.GaugeVec // Number of currently failing resources
resourcesState map[string]resStateWithKind // Maps the resources with their current kind/state
mutex *sync.Mutex // Mutex used to update resourcesState
}
// resStateWithKind is used to count the failures by kind
type resStateWithKind struct {
state ResState
kind string
}
// Init some parameters - currently the Listen address.
@@ -46,6 +73,10 @@ func (obj *Prometheus) Init() error {
if len(obj.Listen) == 0 {
obj.Listen = DefaultPrometheusListen
}
obj.mutex = &sync.Mutex{}
obj.resourcesState = make(map[string]resStateWithKind)
obj.checkApplyTotal = prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: "mgmt_checkapply_total",
@@ -68,6 +99,38 @@ func (obj *Prometheus) Init() error {
)
prometheus.MustRegister(obj.pgraphStartTimeSeconds)
obj.managedResources = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Name: "mgmt_resources",
Help: "Number of managed resources.",
},
// kind: resource type: Svc, File, ...
[]string{"kind"},
)
prometheus.MustRegister(obj.managedResources)
obj.failedResourcesTotal = prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: "mgmt_failures_total",
Help: "Total of failed resources.",
},
// kind: resource type: Svc, File, ...
// failure: soft or hard
[]string{"kind", "failure"},
)
prometheus.MustRegister(obj.failedResourcesTotal)
obj.failedResources = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Name: "mgmt_failures",
Help: "Number of failing resources.",
},
// kind: resource type: Svc, File, ...
// failure: soft or hard
[]string{"kind", "failure"},
)
prometheus.MustRegister(obj.failedResources)
return nil
}
@@ -107,3 +170,94 @@ func (obj *Prometheus) UpdatePgraphStartTime() error {
obj.pgraphStartTimeSeconds.SetToCurrentTime()
return nil
}
// AddManagedResource increments the Managed Resource counter and updates the resource status.
func (obj *Prometheus) AddManagedResource(resUUID string, rtype string) error {
if obj == nil {
return nil // happens when mgmt is launched without --prometheus
}
obj.managedResources.With(prometheus.Labels{"kind": rtype}).Inc()
if err := obj.UpdateState(resUUID, rtype, ResStateOK); err != nil {
return errwrap.Wrapf(err, "can't update the resource status in the map")
}
return nil
}
// RemoveManagedResource decrements the Managed Resource counter and updates the resource status.
func (obj *Prometheus) RemoveManagedResource(resUUID string, rtype string) error {
if obj == nil {
return nil // happens when mgmt is launched without --prometheus
}
obj.managedResources.With(prometheus.Labels{"kind": rtype}).Dec()
if err := obj.deleteState(resUUID); err != nil {
return errwrap.Wrapf(err, "can't remove the resource status from the map")
}
return nil
}
// deleteState removes the resource from the state map and re-populates the failing gauge.
func (obj *Prometheus) deleteState(resUUID string) error {
if obj == nil {
return nil // happens when mgmt is launched without --prometheus
}
obj.mutex.Lock()
delete(obj.resourcesState, resUUID)
obj.mutex.Unlock()
if err := obj.updateFailingGauge(); err != nil {
return errwrap.Wrapf(err, "can't update the failing gauge")
}
return nil
}
// UpdateState updates the state of the resources in our internal state map
// then triggers a refresh of the failing gauge.
func (obj *Prometheus) UpdateState(resUUID string, rtype string, newState ResState) error {
defer obj.updateFailingGauge()
if obj == nil {
return nil // happens when mgmt is launched without --prometheus
}
obj.mutex.Lock()
obj.resourcesState[resUUID] = resStateWithKind{state: newState, kind: rtype}
obj.mutex.Unlock()
if newState != ResStateOK {
var strState string
if newState == ResStateSoftFail {
strState = "soft"
} else if newState == ResStateHardFail {
strState = "hard"
} else {
return errors.New("state should be soft or hard failure")
}
obj.failedResourcesTotal.With(prometheus.Labels{"kind": rtype, "failure": strState}).Inc()
}
return nil
}
// updateFailingGauge refreshes the failing gauge by walking the internal
// state map.
func (obj *Prometheus) updateFailingGauge() error {
if obj == nil {
return nil // happens when mgmt is launched without --prometheus
}
softFails := make(map[string]float64)
hardFails := make(map[string]float64)
obj.mutex.Lock() // guard the read: resourcesState is written under the same mutex
for _, v := range obj.resourcesState {
if v.state == ResStateSoftFail {
softFails[v.kind]++
} else if v.state == ResStateHardFail {
hardFails[v.kind]++
}
}
obj.mutex.Unlock()
// TODO: we might want to Zero the metrics we are not using
// because in prometheus design the metrics keep living for some time
// even after they are removed.
obj.failedResources.Reset()
for k, v := range softFails {
obj.failedResources.With(prometheus.Labels{"kind": k, "failure": "soft"}).Set(v)
}
for k, v := range hardFails {
obj.failedResources.With(prometheus.Labels{"kind": k, "failure": "hard"}).Set(v)
}
return nil
}
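Taken together, the new API in this package is meant to be driven roughly as follows. This is an illustrative sketch only, not code from the commit: the main function, the resource name file[file1] and the call sequence are hypothetical, and starting the HTTP listener is outside the scope of this diff.

package main

import (
    "log"

    "github.com/purpleidea/mgmt/prometheus"
)

func main() {
    // Illustration only: exercise the state-tracking API added above.
    prom := &prometheus.Prometheus{Listen: prometheus.DefaultPrometheusListen}
    if err := prom.Init(); err != nil {
        log.Fatal(err)
    }
    // a resource comes under management: mgmt_resources{kind="file"} is incremented
    _ = prom.AddManagedResource("file[file1]", "file")
    // a transient failure: mgmt_failures{failure="soft",kind="file"} is set to 1
    // and mgmt_failures_total{failure="soft",kind="file"} is incremented
    _ = prom.UpdateState("file[file1]", "file", prometheus.ResStateSoftFail)
    // the resource recovers: its entry is cleared from the failing gauge
    _ = prom.UpdateState("file[file1]", "file", prometheus.ResStateOK)
    // the resource is unmanaged again: the gauge is decremented and the state entry deleted
    _ = prom.RemoveManagedResource("file[file1]", "file")
}

After this sequence the failure counter keeps its value of 1, while the mgmt_failures gauge series for that kind disappears, because updateFailingGauge calls Reset() and then re-populates only the kinds that are currently failing.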


@@ -249,8 +249,6 @@ type BaseRes struct {
refresh bool // does this resource have a refresh to run?
//refreshState StatefulBool // TODO: future stateful bool
prometheus *prometheus.Prometheus
}
// UnmarshalYAML is the custom unmarshal handler for the BaseRes struct. It is
@@ -366,6 +364,10 @@ func (obj *BaseRes) Init() error {
// TODO: this StatefulBool implementation could be eventually swappable
//obj.refreshState = &DiskBool{Path: path.Join(dir, refreshPathToken)}
if err := obj.Prometheus().AddManagedResource(fmt.Sprintf("%v[%v]", obj.Kind(), obj.GetName()), obj.Kind()); err != nil {
return errwrap.Wrapf(err, "could not increase prometheus counter!")
}
return nil
}
@@ -383,6 +385,10 @@ func (obj *BaseRes) Close() error {
close(obj.stopped)
obj.waitGroup.Done()
if err := obj.Prometheus().RemoveManagedResource(fmt.Sprintf("%v[%v]", obj.Kind(), obj.GetName()), obj.kind); err != nil {
return errwrap.Wrapf(err, "could not decrease prometheus counter!")
}
return nil
}
@@ -684,5 +690,5 @@ func (obj *BaseRes) Poll() error {
// Prometheus returns the prometheus instance.
func (obj *BaseRes) Prometheus() *prometheus.Prometheus {
return obj.prometheus
return obj.Data().Prometheus
}


@@ -1,7 +1,5 @@
#!/bin/bash -e
exit 0 # FIXME: disabled until intermittent failures can be resolved
# run a graph, with prometheus support
timeout --kill-after=30s 25s ./mgmt run --tmp-prefix --no-pgp --prometheus --yaml prometheus-3.yaml &
pid=$!

test/shell/prometheus-4.sh (new executable file, 35 lines)

@@ -0,0 +1,35 @@
#!/bin/bash -xe
# run a graph, with prometheus support
timeout --kill-after=30s 25s ./mgmt run --tmp-prefix --no-pgp --prometheus --yaml prometheus-4.yaml &
pid=$!
sleep 10s # let it converge
# For test debugging purposes
curl 127.0.0.1:9233/metrics
# Check for mgmt_resources
curl 127.0.0.1:9233/metrics | grep '^mgmt_resources{kind="file"} 4$'
# One CheckApply for a File; in noop mode.
curl 127.0.0.1:9233/metrics | grep 'mgmt_checkapply_total{apply="false",errorful="false",eventful="true",kind="file"} 1$'
# Two CheckApplies for a File; without errors, with events
curl 127.0.0.1:9233/metrics | grep 'mgmt_checkapply_total{apply="true",errorful="false",eventful="true",kind="file"} 2$'
# Multiple CheckApplies with errors
curl 127.0.0.1:9233/metrics | grep 'mgmt_checkapply_total{apply="true",errorful="true",eventful="true",kind="file"} [0-9]\+'
# One soft failure at the moment
curl 127.0.0.1:9233/metrics | grep 'mgmt_failures{failure="soft",kind="file"} 1$'
# Multiple soft failures since startup: the total must not be exactly 1
if curl 127.0.0.1:9233/metrics | grep 'mgmt_failures_total{failure="soft",kind="file"} 1$'
then
false
fi
curl 127.0.0.1:9233/metrics | grep 'mgmt_failures_total{failure="soft",kind="file"} [0-9]\+'
killall -SIGINT mgmt # send ^C to exit mgmt
wait $pid # get exit status
exit $?


@@ -0,0 +1,29 @@
---
graph: mygraph
resources:
file:
- name: file1
path: "/tmp/mgmt/NONEXIST/f1"
content: |
i am f1
state: exists
meta:
retry: -1
delay: 1000
- name: file2
path: "/tmp/mgmt/f2"
content: |
i am f2
state: exists
- name: file3
path: "/tmp/mgmt/f3"
content: |
i am f3
state: exists
- name: file4
path: "/tmp/mgmt/f4"
content: |
i am f4
state: exists
meta:
noop: true