prometheus: Add detailed metrics
Signed-off-by: Julien Pivotto <roidelapluie@inuits.eu>
This commit is contained in:
@@ -26,6 +26,7 @@ import (
|
|||||||
"time"
|
"time"
|
||||||
|
|
||||||
"github.com/purpleidea/mgmt/event"
|
"github.com/purpleidea/mgmt/event"
|
||||||
|
"github.com/purpleidea/mgmt/prometheus"
|
||||||
"github.com/purpleidea/mgmt/resources"
|
"github.com/purpleidea/mgmt/resources"
|
||||||
|
|
||||||
multierr "github.com/hashicorp/go-multierror"
|
multierr "github.com/hashicorp/go-multierror"
|
||||||
@@ -430,6 +431,11 @@ Loop:
|
|||||||
playback = true
|
playback = true
|
||||||
log.Printf("%s[%s]: CheckApply errored: %v", v.Kind(), v.GetName(), e)
|
log.Printf("%s[%s]: CheckApply errored: %v", v.Kind(), v.GetName(), e)
|
||||||
if retry == 0 {
|
if retry == 0 {
|
||||||
|
if err := obj.Prometheus().UpdateState(fmt.Sprintf("%v[%v]", v.Kind(), v.GetName()), v.Kind(), prometheus.ResStateHardFail); err != nil {
|
||||||
|
// TODO: how to error this?
|
||||||
|
log.Printf("%s[%s]: Prometheus.UpdateState() errored: %v", v.Kind(), v.GetName(), err)
|
||||||
|
}
|
||||||
|
|
||||||
// wrap the error in the sentinel
|
// wrap the error in the sentinel
|
||||||
v.Res.QuiesceGroup().Done() // before the Wait that happens in SendEvent!
|
v.Res.QuiesceGroup().Done() // before the Wait that happens in SendEvent!
|
||||||
v.SendEvent(event.EventExit, &SentinelErr{e})
|
v.SendEvent(event.EventExit, &SentinelErr{e})
|
||||||
@@ -438,6 +444,10 @@ Loop:
|
|||||||
if retry > 0 { // don't decrement the -1
|
if retry > 0 { // don't decrement the -1
|
||||||
retry--
|
retry--
|
||||||
}
|
}
|
||||||
|
if err := obj.Prometheus().UpdateState(fmt.Sprintf("%v[%v]", v.Kind(), v.GetName()), v.Kind(), prometheus.ResStateSoftFail); err != nil {
|
||||||
|
// TODO: how to error this?
|
||||||
|
log.Printf("%s[%s]: Prometheus.UpdateState() errored: %v", v.Kind(), v.GetName(), err)
|
||||||
|
}
|
||||||
log.Printf("%s[%s]: CheckApply: Retrying after %.4f seconds (%d left)", v.Kind(), v.GetName(), delay.Seconds(), retry)
|
log.Printf("%s[%s]: CheckApply: Retrying after %.4f seconds (%d left)", v.Kind(), v.GetName(), delay.Seconds(), retry)
|
||||||
// start the timer...
|
// start the timer...
|
||||||
timer.Reset(delay)
|
timer.Reset(delay)
|
||||||
|
|||||||
@@ -20,17 +20,33 @@
|
|||||||
package prometheus
|
package prometheus
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"errors"
|
||||||
"net/http"
|
"net/http"
|
||||||
"strconv"
|
"strconv"
|
||||||
|
"sync"
|
||||||
|
|
||||||
"github.com/prometheus/client_golang/prometheus"
|
"github.com/prometheus/client_golang/prometheus"
|
||||||
"github.com/prometheus/client_golang/prometheus/promhttp"
|
"github.com/prometheus/client_golang/prometheus/promhttp"
|
||||||
|
|
||||||
|
errwrap "github.com/pkg/errors"
|
||||||
)
|
)
|
||||||
|
|
||||||
// DefaultPrometheusListen is registered in
|
// DefaultPrometheusListen is registered in
|
||||||
// https://github.com/prometheus/prometheus/wiki/Default-port-allocations
|
// https://github.com/prometheus/prometheus/wiki/Default-port-allocations
|
||||||
const DefaultPrometheusListen = "127.0.0.1:9233"
|
const DefaultPrometheusListen = "127.0.0.1:9233"
|
||||||
|
|
||||||
|
// ResState represents the status of a resource, as tracked for the failure
// metrics.
type ResState int

const (
	// ResStateOK represents a working resource. It is the zero value of
	// ResState.
	ResStateOK ResState = iota
	// ResStateSoftFail represents a resource in soft fail (will be retried).
	ResStateSoftFail
	// ResStateHardFail represents a resource in hard fail (will NOT be
	// retried).
	ResStateHardFail
)
|
||||||
|
|
||||||
// Prometheus is the struct that contains information about the
|
// Prometheus is the struct that contains information about the
|
||||||
// prometheus instance. Run Init() on it.
|
// prometheus instance. Run Init() on it.
|
||||||
type Prometheus struct {
|
type Prometheus struct {
|
||||||
@@ -38,7 +54,18 @@ type Prometheus struct {
|
|||||||
|
|
||||||
checkApplyTotal *prometheus.CounterVec // total of CheckApplies that have been triggered
|
checkApplyTotal *prometheus.CounterVec // total of CheckApplies that have been triggered
|
||||||
pgraphStartTimeSeconds prometheus.Gauge // process start time in seconds since unix epoch
|
pgraphStartTimeSeconds prometheus.Gauge // process start time in seconds since unix epoch
|
||||||
|
managedResources *prometheus.GaugeVec // Resources we manage now
|
||||||
|
failedResourcesTotal *prometheus.CounterVec // Total of failures since mgmt has started
|
||||||
|
failedResources *prometheus.GaugeVec // Number of currently failing resources
|
||||||
|
|
||||||
|
resourcesState map[string]resStateWithKind // Maps the resources with their current kind/state
|
||||||
|
mutex *sync.Mutex // Mutex used to update resourcesState
|
||||||
|
}
|
||||||
|
|
||||||
|
// resStateWithKind is used to count the failures by kind. It is the value
// stored for each resource in the resourcesState map.
type resStateWithKind struct {
	state ResState // last state reported for the resource
	kind  string   // resource kind; used as the "kind" label of the failure gauge
}
|
||||||
|
|
||||||
// Init some parameters - currently the Listen address.
|
// Init some parameters - currently the Listen address.
|
||||||
@@ -46,6 +73,10 @@ func (obj *Prometheus) Init() error {
|
|||||||
if len(obj.Listen) == 0 {
|
if len(obj.Listen) == 0 {
|
||||||
obj.Listen = DefaultPrometheusListen
|
obj.Listen = DefaultPrometheusListen
|
||||||
}
|
}
|
||||||
|
|
||||||
|
obj.mutex = &sync.Mutex{}
|
||||||
|
obj.resourcesState = make(map[string]resStateWithKind)
|
||||||
|
|
||||||
obj.checkApplyTotal = prometheus.NewCounterVec(
|
obj.checkApplyTotal = prometheus.NewCounterVec(
|
||||||
prometheus.CounterOpts{
|
prometheus.CounterOpts{
|
||||||
Name: "mgmt_checkapply_total",
|
Name: "mgmt_checkapply_total",
|
||||||
@@ -68,6 +99,38 @@ func (obj *Prometheus) Init() error {
|
|||||||
)
|
)
|
||||||
prometheus.MustRegister(obj.pgraphStartTimeSeconds)
|
prometheus.MustRegister(obj.pgraphStartTimeSeconds)
|
||||||
|
|
||||||
|
obj.managedResources = prometheus.NewGaugeVec(
|
||||||
|
prometheus.GaugeOpts{
|
||||||
|
Name: "mgmt_resources",
|
||||||
|
Help: "Number of managed resources.",
|
||||||
|
},
|
||||||
|
// kind: resource type: Svc, File, ...
|
||||||
|
[]string{"kind"},
|
||||||
|
)
|
||||||
|
prometheus.MustRegister(obj.managedResources)
|
||||||
|
|
||||||
|
obj.failedResourcesTotal = prometheus.NewCounterVec(
|
||||||
|
prometheus.CounterOpts{
|
||||||
|
Name: "mgmt_failures_total",
|
||||||
|
Help: "Total of failed resources.",
|
||||||
|
},
|
||||||
|
// kind: resource type: Svc, File, ...
|
||||||
|
// failure: soft or hard
|
||||||
|
[]string{"kind", "failure"},
|
||||||
|
)
|
||||||
|
prometheus.MustRegister(obj.failedResourcesTotal)
|
||||||
|
|
||||||
|
obj.failedResources = prometheus.NewGaugeVec(
|
||||||
|
prometheus.GaugeOpts{
|
||||||
|
Name: "mgmt_failures",
|
||||||
|
Help: "Number of failing resources.",
|
||||||
|
},
|
||||||
|
// kind: resource type: Svc, File, ...
|
||||||
|
// failure: soft or hard
|
||||||
|
[]string{"kind", "failure"},
|
||||||
|
)
|
||||||
|
prometheus.MustRegister(obj.failedResources)
|
||||||
|
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -107,3 +170,94 @@ func (obj *Prometheus) UpdatePgraphStartTime() error {
|
|||||||
obj.pgraphStartTimeSeconds.SetToCurrentTime()
|
obj.pgraphStartTimeSeconds.SetToCurrentTime()
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// AddManagedResource increments the Managed Resource counter and updates the resource status.
|
||||||
|
func (obj *Prometheus) AddManagedResource(resUUID string, rtype string) error {
|
||||||
|
if obj == nil {
|
||||||
|
return nil // happens when mgmt is launched without --prometheus
|
||||||
|
}
|
||||||
|
obj.managedResources.With(prometheus.Labels{"kind": rtype}).Inc()
|
||||||
|
if err := obj.UpdateState(resUUID, rtype, ResStateOK); err != nil {
|
||||||
|
return errwrap.Wrapf(err, "can't update the resource status in the map")
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// RemoveManagedResource decrements the Managed Resource counter and updates the resource status.
|
||||||
|
func (obj *Prometheus) RemoveManagedResource(resUUID string, rtype string) error {
|
||||||
|
if obj == nil {
|
||||||
|
return nil // happens when mgmt is launched without --prometheus
|
||||||
|
}
|
||||||
|
obj.managedResources.With(prometheus.Labels{"kind": rtype}).Dec()
|
||||||
|
if err := obj.deleteState(resUUID); err != nil {
|
||||||
|
return errwrap.Wrapf(err, "can't remove the resource status from the map")
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// deleteState removes the resources for the state map and re-populates the failing gauge.
|
||||||
|
func (obj *Prometheus) deleteState(resUUID string) error {
|
||||||
|
if obj == nil {
|
||||||
|
return nil // happens when mgmt is launched without --prometheus
|
||||||
|
}
|
||||||
|
obj.mutex.Lock()
|
||||||
|
delete(obj.resourcesState, resUUID)
|
||||||
|
obj.mutex.Unlock()
|
||||||
|
if err := obj.updateFailingGauge(); err != nil {
|
||||||
|
return errwrap.Wrapf(err, "can't update the failing gauge")
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// UpdateState updates the state of the resources in our internal state map
|
||||||
|
// then triggers a refresh of the failing gauge.
|
||||||
|
func (obj *Prometheus) UpdateState(resUUID string, rtype string, newState ResState) error {
|
||||||
|
defer obj.updateFailingGauge()
|
||||||
|
if obj == nil {
|
||||||
|
return nil // happens when mgmt is launched without --prometheus
|
||||||
|
}
|
||||||
|
obj.mutex.Lock()
|
||||||
|
obj.resourcesState[resUUID] = resStateWithKind{state: newState, kind: rtype}
|
||||||
|
obj.mutex.Unlock()
|
||||||
|
if newState != ResStateOK {
|
||||||
|
var strState string
|
||||||
|
if newState == ResStateSoftFail {
|
||||||
|
strState = "soft"
|
||||||
|
} else if newState == ResStateHardFail {
|
||||||
|
strState = "hard"
|
||||||
|
} else {
|
||||||
|
return errors.New("state should be soft or hard failure")
|
||||||
|
}
|
||||||
|
obj.failedResourcesTotal.With(prometheus.Labels{"kind": rtype, "failure": strState}).Inc()
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// updateFailingGauge refreshes the failing gauge by parsking the internal
|
||||||
|
// state map.
|
||||||
|
func (obj *Prometheus) updateFailingGauge() error {
|
||||||
|
if obj == nil {
|
||||||
|
return nil // happens when mgmt is launched without --prometheus
|
||||||
|
}
|
||||||
|
var softFails, hardFails map[string]float64
|
||||||
|
softFails = make(map[string]float64)
|
||||||
|
hardFails = make(map[string]float64)
|
||||||
|
for _, v := range obj.resourcesState {
|
||||||
|
if v.state == ResStateSoftFail {
|
||||||
|
softFails[v.kind]++
|
||||||
|
} else if v.state == ResStateHardFail {
|
||||||
|
hardFails[v.kind]++
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// TODO: we might want to Zero the metrics we are not using
|
||||||
|
// because in prometheus design the metrics keep living for some time
|
||||||
|
// even after they are removed.
|
||||||
|
obj.failedResources.Reset()
|
||||||
|
for k, v := range softFails {
|
||||||
|
obj.failedResources.With(prometheus.Labels{"kind": k, "failure": "soft"}).Set(v)
|
||||||
|
}
|
||||||
|
for k, v := range hardFails {
|
||||||
|
obj.failedResources.With(prometheus.Labels{"kind": k, "failure": "hard"}).Set(v)
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|||||||
@@ -249,8 +249,6 @@ type BaseRes struct {
|
|||||||
|
|
||||||
refresh bool // does this resource have a refresh to run?
|
refresh bool // does this resource have a refresh to run?
|
||||||
//refreshState StatefulBool // TODO: future stateful bool
|
//refreshState StatefulBool // TODO: future stateful bool
|
||||||
|
|
||||||
prometheus *prometheus.Prometheus
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// UnmarshalYAML is the custom unmarshal handler for the BaseRes struct. It is
|
// UnmarshalYAML is the custom unmarshal handler for the BaseRes struct. It is
|
||||||
@@ -366,6 +364,10 @@ func (obj *BaseRes) Init() error {
|
|||||||
// TODO: this StatefulBool implementation could be eventually swappable
|
// TODO: this StatefulBool implementation could be eventually swappable
|
||||||
//obj.refreshState = &DiskBool{Path: path.Join(dir, refreshPathToken)}
|
//obj.refreshState = &DiskBool{Path: path.Join(dir, refreshPathToken)}
|
||||||
|
|
||||||
|
if err := obj.Prometheus().AddManagedResource(fmt.Sprintf("%v[%v]", obj.Kind(), obj.GetName()), obj.Kind()); err != nil {
|
||||||
|
return errwrap.Wrapf(err, "could not increase prometheus counter!")
|
||||||
|
}
|
||||||
|
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -383,6 +385,10 @@ func (obj *BaseRes) Close() error {
|
|||||||
close(obj.stopped)
|
close(obj.stopped)
|
||||||
obj.waitGroup.Done()
|
obj.waitGroup.Done()
|
||||||
|
|
||||||
|
if err := obj.Prometheus().RemoveManagedResource(fmt.Sprintf("%v[%v]", obj.Kind(), obj.GetName()), obj.kind); err != nil {
|
||||||
|
return errwrap.Wrapf(err, "could not decrease prometheus counter!")
|
||||||
|
}
|
||||||
|
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -684,5 +690,5 @@ func (obj *BaseRes) Poll() error {
|
|||||||
|
|
||||||
// Prometheus returns the prometheus instance.
|
// Prometheus returns the prometheus instance.
|
||||||
func (obj *BaseRes) Prometheus() *prometheus.Prometheus {
|
func (obj *BaseRes) Prometheus() *prometheus.Prometheus {
|
||||||
return obj.prometheus
|
return obj.Data().Prometheus
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,7 +1,5 @@
|
|||||||
#!/bin/bash -e
|
#!/bin/bash -e
|
||||||
|
|
||||||
exit 0 # FIXME: disabled until intermittent failures can be resolved
|
|
||||||
|
|
||||||
# run a graph, with prometheus support
|
# run a graph, with prometheus support
|
||||||
timeout --kill-after=30s 25s ./mgmt run --tmp-prefix --no-pgp --prometheus --yaml prometheus-3.yaml &
|
timeout --kill-after=30s 25s ./mgmt run --tmp-prefix --no-pgp --prometheus --yaml prometheus-3.yaml &
|
||||||
pid=$!
|
pid=$!
|
||||||
|
|||||||
35
test/shell/prometheus-4.sh
Executable file
35
test/shell/prometheus-4.sh
Executable file
@@ -0,0 +1,35 @@
|
|||||||
|
#!/bin/bash -xe

# Checks the resource and failure metrics exported by mgmt for the graph in
# prometheus-4.yaml.

# scratch file holding ONE snapshot of the metrics, so that every check below
# looks at the same scrape instead of at counters that moved in between
metrics="$(mktemp)"

# run a graph, with prometheus support
timeout --kill-after=30s 25s ./mgmt run --tmp-prefix --no-pgp --prometheus --yaml prometheus-4.yaml &
pid=$!

# a failed check aborts the script (-e): don't leave mgmt running in the
# background with the metrics port bound, and remove the scratch file
trap 'killall -SIGINT mgmt 2>/dev/null || true; rm -f "$metrics"' EXIT

sleep 10s # let it converge

curl 127.0.0.1:9233/metrics > "$metrics"

# For test debugging purposes
cat "$metrics"

# Check for mgmt_resources
grep '^mgmt_resources{kind="file"} 4$' "$metrics"

# One CheckApply for a File ; in noop mode.
grep 'mgmt_checkapply_total{apply="false",errorful="false",eventful="true",kind="file"} 1$' "$metrics"

# Two CheckApply for a File ; without errors, with events
grep 'mgmt_checkapply_total{apply="true",errorful="false",eventful="true",kind="file"} 2$' "$metrics"

# Multiple CheckApplies with errors
grep 'mgmt_checkapply_total{apply="true",errorful="true",eventful="true",kind="file"} [0-9]\+' "$metrics"

# One soft failure ATM
grep 'mgmt_failures{failure="soft",kind="file"} 1$' "$metrics"

# Multiple soft failures since startup: the counter must exist, and must not
# be exactly 1
if grep 'mgmt_failures_total{failure="soft",kind="file"} 1$' "$metrics"
then
	false
fi
grep 'mgmt_failures_total{failure="soft",kind="file"} [0-9]\+' "$metrics"

killall -SIGINT mgmt # send ^C to exit mgmt
wait $pid # get exit status
exit $?
|
||||||
29
test/shell/prometheus-4.yaml
Normal file
29
test/shell/prometheus-4.yaml
Normal file
@@ -0,0 +1,29 @@
|
|||||||
|
---
|
||||||
|
graph: mygraph
|
||||||
|
resources:
|
||||||
|
file:
|
||||||
|
- name: file1
|
||||||
|
path: "/tmp/mgmt/NONEXIST/f1"
|
||||||
|
content: |
|
||||||
|
i am f1
|
||||||
|
state: exists
|
||||||
|
meta:
|
||||||
|
retry: -1
|
||||||
|
delay: 1000
|
||||||
|
- name: file2
|
||||||
|
path: "/tmp/mgmt/f2"
|
||||||
|
content: |
|
||||||
|
i am f2
|
||||||
|
state: exists
|
||||||
|
- name: file3
|
||||||
|
path: "/tmp/mgmt/f3"
|
||||||
|
content: |
|
||||||
|
i am f3
|
||||||
|
state: exists
|
||||||
|
- name: file4
|
||||||
|
path: "/tmp/mgmt/f4"
|
||||||
|
content: |
|
||||||
|
i am f4
|
||||||
|
state: exists
|
||||||
|
meta:
|
||||||
|
noop: true
|
||||||
Reference in New Issue
Block a user