prometheus: Add detailed metrics

Signed-off-by: Julien Pivotto <roidelapluie@inuits.eu>
Julien Pivotto
2017-02-28 22:11:46 +01:00
parent 660554cc45
commit 33d20ac6d8
6 changed files with 237 additions and 5 deletions


@@ -26,6 +26,7 @@ import (
"time"
"github.com/purpleidea/mgmt/event"
"github.com/purpleidea/mgmt/prometheus"
"github.com/purpleidea/mgmt/resources"
multierr "github.com/hashicorp/go-multierror"
@@ -430,6 +431,11 @@ Loop:
playback = true
log.Printf("%s[%s]: CheckApply errored: %v", v.Kind(), v.GetName(), e)
if retry == 0 {
if err := obj.Prometheus().UpdateState(fmt.Sprintf("%v[%v]", v.Kind(), v.GetName()), v.Kind(), prometheus.ResStateHardFail); err != nil {
// TODO: how to error this?
log.Printf("%s[%s]: Prometheus.UpdateState() errored: %v", v.Kind(), v.GetName(), err)
}
// wrap the error in the sentinel
v.Res.QuiesceGroup().Done() // before the Wait that happens in SendEvent!
v.SendEvent(event.EventExit, &SentinelErr{e})
@@ -438,6 +444,10 @@ Loop:
if retry > 0 { // don't decrement the -1
retry--
}
if err := obj.Prometheus().UpdateState(fmt.Sprintf("%v[%v]", v.Kind(), v.GetName()), v.Kind(), prometheus.ResStateSoftFail); err != nil {
// TODO: how to error this?
log.Printf("%s[%s]: Prometheus.UpdateState() errored: %v", v.Kind(), v.GetName(), err)
}
log.Printf("%s[%s]: CheckApply: Retrying after %.4f seconds (%d left)", v.Kind(), v.GetName(), delay.Seconds(), retry)
// start the timer...
timer.Reset(delay)
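For readability, the decision these two hunks add to the worker loop can be summed up in the sketch below. recordCheckApplyFailure is a hypothetical helper, not part of this commit; it only assumes the same github.com/purpleidea/mgmt/prometheus import added above. A hard failure is reported only once the retry budget hits zero; every other failed attempt (including retry == -1, i.e. retry forever) is reported as a soft failure before the backoff timer is reset.

// recordCheckApplyFailure is a hypothetical helper (illustration only): it
// mirrors the branch added by the hunks above, reporting a hard failure only
// when the retry budget is exhausted, and a soft failure for every other
// failed attempt, including retry == -1 which means "retry forever".
func recordCheckApplyFailure(prom *prometheus.Prometheus, resName, kind string, retry int) error {
    state := prometheus.ResStateSoftFail
    if retry == 0 {
        state = prometheus.ResStateHardFail // no retries left: terminal failure
    }
    return prom.UpdateState(resName, kind, state)
}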


@@ -20,17 +20,33 @@
package prometheus
import (
"errors"
"net/http"
"strconv"
"sync"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promhttp"
errwrap "github.com/pkg/errors"
)
// DefaultPrometheusListen is registered in
// https://github.com/prometheus/prometheus/wiki/Default-port-allocations
const DefaultPrometheusListen = "127.0.0.1:9233"
// ResState represents the status of a resource.
type ResState int
const (
// ResStateOK represents a working resource
ResStateOK ResState = iota
// ResStateSoftFail represents a resource in soft fail (will be retried)
ResStateSoftFail
// ResStateHardFail represents a resource in hard fail (will NOT be retried)
ResStateHardFail
)
// Prometheus is the struct that contains information about the
// prometheus instance. Run Init() on it.
type Prometheus struct {
@@ -38,7 +54,18 @@ type Prometheus struct {
checkApplyTotal *prometheus.CounterVec // total of CheckApplies that have been triggered
pgraphStartTimeSeconds prometheus.Gauge // process start time in seconds since unix epoch
managedResources *prometheus.GaugeVec // Resources we manage now
failedResourcesTotal *prometheus.CounterVec // Total of failures since mgmt has started
failedResources *prometheus.GaugeVec // Number of currently failing resources
resourcesState map[string]resStateWithKind // Maps the resources with their current kind/state
mutex *sync.Mutex // Mutex used to update resourcesState
}
// resStateWithKind is used to count the failures by kind
type resStateWithKind struct {
state ResState
kind string
}
// Init some parameters - currently the Listen address.
@@ -46,6 +73,10 @@ func (obj *Prometheus) Init() error {
if len(obj.Listen) == 0 {
obj.Listen = DefaultPrometheusListen
}
obj.mutex = &sync.Mutex{}
obj.resourcesState = make(map[string]resStateWithKind)
obj.checkApplyTotal = prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: "mgmt_checkapply_total",
@@ -68,6 +99,38 @@ func (obj *Prometheus) Init() error {
)
prometheus.MustRegister(obj.pgraphStartTimeSeconds)
obj.managedResources = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Name: "mgmt_resources",
Help: "Number of managed resources.",
},
// kind: resource type: Svc, File, ...
[]string{"kind"},
)
prometheus.MustRegister(obj.managedResources)
obj.failedResourcesTotal = prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: "mgmt_failures_total",
Help: "Total of failed resources.",
},
// kind: resource type: Svc, File, ...
// failure: soft or hard
[]string{"kind", "failure"},
)
prometheus.MustRegister(obj.failedResourcesTotal)
obj.failedResources = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Name: "mgmt_failures",
Help: "Number of failing resources.",
},
// kind: resource type: Svc, File, ...
// failure: soft or hard
[]string{"kind", "failure"},
)
prometheus.MustRegister(obj.failedResources)
return nil
}
@@ -107,3 +170,94 @@ func (obj *Prometheus) UpdatePgraphStartTime() error {
obj.pgraphStartTimeSeconds.SetToCurrentTime()
return nil
}
// AddManagedResource increments the Managed Resource counter and updates the resource status.
func (obj *Prometheus) AddManagedResource(resUUID string, rtype string) error {
if obj == nil {
return nil // happens when mgmt is launched without --prometheus
}
obj.managedResources.With(prometheus.Labels{"kind": rtype}).Inc()
if err := obj.UpdateState(resUUID, rtype, ResStateOK); err != nil {
return errwrap.Wrapf(err, "can't update the resource status in the map")
}
return nil
}
// RemoveManagedResource decrements the Managed Resource counter and updates the resource status.
func (obj *Prometheus) RemoveManagedResource(resUUID string, rtype string) error {
if obj == nil {
return nil // happens when mgmt is launched without --prometheus
}
obj.managedResources.With(prometheus.Labels{"kind": rtype}).Dec()
if err := obj.deleteState(resUUID); err != nil {
return errwrap.Wrapf(err, "can't remove the resource status from the map")
}
return nil
}
// deleteState removes the resource from the state map and re-populates the failing gauge.
func (obj *Prometheus) deleteState(resUUID string) error {
if obj == nil {
return nil // happens when mgmt is launched without --prometheus
}
obj.mutex.Lock()
delete(obj.resourcesState, resUUID)
obj.mutex.Unlock()
if err := obj.updateFailingGauge(); err != nil {
return errwrap.Wrapf(err, "can't update the failing gauge")
}
return nil
}
// UpdateState updates the state of the resources in our internal state map
// then triggers a refresh of the failing gauge.
func (obj *Prometheus) UpdateState(resUUID string, rtype string, newState ResState) error {
defer obj.updateFailingGauge()
if obj == nil {
return nil // happens when mgmt is launched without --prometheus
}
obj.mutex.Lock()
obj.resourcesState[resUUID] = resStateWithKind{state: newState, kind: rtype}
obj.mutex.Unlock()
if newState != ResStateOK {
var strState string
if newState == ResStateSoftFail {
strState = "soft"
} else if newState == ResStateHardFail {
strState = "hard"
} else {
return errors.New("state should be soft or hard failure")
}
obj.failedResourcesTotal.With(prometheus.Labels{"kind": rtype, "failure": strState}).Inc()
}
return nil
}
// updateFailingGauge refreshes the failing gauge by walking the internal
// state map.
func (obj *Prometheus) updateFailingGauge() error {
if obj == nil {
return nil // happens when mgmt is launched without --prometheus
}
softFails := make(map[string]float64)
hardFails := make(map[string]float64)
obj.mutex.Lock() // guard the read: resourcesState is written under the same mutex
for _, v := range obj.resourcesState {
if v.state == ResStateSoftFail {
softFails[v.kind]++
} else if v.state == ResStateHardFail {
hardFails[v.kind]++
}
}
obj.mutex.Unlock()
// TODO: we might want to Zero the metrics we are not using
// because in prometheus design the metrics keep living for some time
// even after they are removed.
obj.failedResources.Reset()
for k, v := range softFails {
obj.failedResources.With(prometheus.Labels{"kind": k, "failure": "soft"}).Set(v)
}
for k, v := range hardFails {
obj.failedResources.With(prometheus.Labels{"kind": k, "failure": "hard"}).Set(v)
}
return nil
}
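Taken together, the new API in this package is meant to be driven roughly as follows. This is an illustrative sketch only, not code from the commit: the main function, the resource name file[file1] and the call sequence are hypothetical, and starting the HTTP listener is outside the scope of this diff.

package main

import (
    "log"

    "github.com/purpleidea/mgmt/prometheus"
)

func main() {
    // Illustration only: exercise the state-tracking API added above.
    prom := &prometheus.Prometheus{Listen: prometheus.DefaultPrometheusListen}
    if err := prom.Init(); err != nil {
        log.Fatal(err)
    }
    // a resource comes under management: mgmt_resources{kind="file"} is incremented
    _ = prom.AddManagedResource("file[file1]", "file")
    // a transient failure: mgmt_failures{failure="soft",kind="file"} is set to 1
    // and mgmt_failures_total{failure="soft",kind="file"} is incremented
    _ = prom.UpdateState("file[file1]", "file", prometheus.ResStateSoftFail)
    // the resource recovers: its entry is cleared from the failing gauge
    _ = prom.UpdateState("file[file1]", "file", prometheus.ResStateOK)
    // the resource is unmanaged again: the gauge is decremented and the state entry deleted
    _ = prom.RemoveManagedResource("file[file1]", "file")
}

After this sequence the failure counter keeps its value of 1, while the mgmt_failures gauge series for that kind disappears, because updateFailingGauge calls Reset() and then re-populates only the kinds that are currently failing.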


@@ -249,8 +249,6 @@ type BaseRes struct {
refresh bool // does this resource have a refresh to run?
//refreshState StatefulBool // TODO: future stateful bool
prometheus *prometheus.Prometheus
}
// UnmarshalYAML is the custom unmarshal handler for the BaseRes struct. It is
@@ -366,6 +364,10 @@ func (obj *BaseRes) Init() error {
// TODO: this StatefulBool implementation could be eventually swappable
//obj.refreshState = &DiskBool{Path: path.Join(dir, refreshPathToken)}
if err := obj.Prometheus().AddManagedResource(fmt.Sprintf("%v[%v]", obj.Kind(), obj.GetName()), obj.Kind()); err != nil {
return errwrap.Wrapf(err, "could not increase prometheus counter!")
}
return nil
}
@@ -383,6 +385,10 @@ func (obj *BaseRes) Close() error {
close(obj.stopped)
obj.waitGroup.Done()
if err := obj.Prometheus().RemoveManagedResource(fmt.Sprintf("%v[%v]", obj.Kind(), obj.GetName()), obj.kind); err != nil {
return errwrap.Wrapf(err, "could not decrease prometheus counter!")
}
return nil
}
@@ -684,5 +690,5 @@ func (obj *BaseRes) Poll() error {
// Prometheus returns the prometheus instance.
func (obj *BaseRes) Prometheus() *prometheus.Prometheus {
return obj.prometheus
return obj.Data().Prometheus
}


@@ -1,7 +1,5 @@
#!/bin/bash -e
exit 0 # FIXME: disabled until intermittent failures can be resolved
# run a graph, with prometheus support
timeout --kill-after=30s 25s ./mgmt run --tmp-prefix --no-pgp --prometheus --yaml prometheus-3.yaml &
pid=$!

test/shell/prometheus-4.sh (new executable file, 35 lines)

@@ -0,0 +1,35 @@
#!/bin/bash -xe
# run a graph, with prometheus support
timeout --kill-after=30s 25s ./mgmt run --tmp-prefix --no-pgp --prometheus --yaml prometheus-4.yaml &
pid=$!
sleep 10s # let it converge
# For test debugging purposes
curl 127.0.0.1:9233/metrics
# Check for mgmt_resources
curl 127.0.0.1:9233/metrics | grep '^mgmt_resources{kind="file"} 4$'
# One CheckApply for a File; in noop mode.
curl 127.0.0.1:9233/metrics | grep 'mgmt_checkapply_total{apply="false",errorful="false",eventful="true",kind="file"} 1$'
# Two CheckApplies for a File; without errors, with events
curl 127.0.0.1:9233/metrics | grep 'mgmt_checkapply_total{apply="true",errorful="false",eventful="true",kind="file"} 2$'
# Multiple CheckApplies with errors
curl 127.0.0.1:9233/metrics | grep 'mgmt_checkapply_total{apply="true",errorful="true",eventful="true",kind="file"} [0-9]\+'
# One soft failure at the moment
curl 127.0.0.1:9233/metrics | grep 'mgmt_failures{failure="soft",kind="file"} 1$'
# Multiple soft failures since startup: the total must not be exactly 1
if curl 127.0.0.1:9233/metrics | grep 'mgmt_failures_total{failure="soft",kind="file"} 1$'
then
false
fi
curl 127.0.0.1:9233/metrics | grep 'mgmt_failures_total{failure="soft",kind="file"} [0-9]\+'
killall -SIGINT mgmt # send ^C to exit mgmt
wait $pid # get exit status
exit $?


@@ -0,0 +1,29 @@
---
graph: mygraph
resources:
file:
- name: file1
path: "/tmp/mgmt/NONEXIST/f1"
content: |
i am f1
state: exists
meta:
retry: -1
delay: 1000
- name: file2
path: "/tmp/mgmt/f2"
content: |
i am f2
state: exists
- name: file3
path: "/tmp/mgmt/f3"
content: |
i am f3
state: exists
- name: file4
path: "/tmp/mgmt/f4"
content: |
i am f4
state: exists
meta:
noop: true