prometheus: Add detailed metrics
Signed-off-by: Julien Pivotto <roidelapluie@inuits.eu>
This commit is contained in:
@@ -20,17 +20,33 @@
|
||||
package prometheus
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"net/http"
|
||||
"strconv"
|
||||
"sync"
|
||||
|
||||
"github.com/prometheus/client_golang/prometheus"
|
||||
"github.com/prometheus/client_golang/prometheus/promhttp"
|
||||
|
||||
errwrap "github.com/pkg/errors"
|
||||
)
|
||||
|
||||
// DefaultPrometheusListen is registered in
|
||||
// https://github.com/prometheus/prometheus/wiki/Default-port-allocations
|
||||
const DefaultPrometheusListen = "127.0.0.1:9233"
|
||||
|
||||
// ResState represents the status of a resource.
|
||||
type ResState int
|
||||
|
||||
const (
|
||||
// ResStateOK represents a working resource
|
||||
ResStateOK ResState = iota
|
||||
// ResStateSoftFail represents a resource in soft fail (will be retried)
|
||||
ResStateSoftFail
|
||||
// ResStateHardFail represents a resource in hard fail (will NOT be retried)
|
||||
ResStateHardFail
|
||||
)
|
||||
|
||||
// Prometheus is the struct that contains information about the
|
||||
// prometheus instance. Run Init() on it.
|
||||
type Prometheus struct {
|
||||
@@ -38,7 +54,18 @@ type Prometheus struct {
|
||||
|
||||
checkApplyTotal *prometheus.CounterVec // total of CheckApplies that have been triggered
|
||||
pgraphStartTimeSeconds prometheus.Gauge // process start time in seconds since unix epoch
|
||||
managedResources *prometheus.GaugeVec // Resources we manage now
|
||||
failedResourcesTotal *prometheus.CounterVec // Total of failures since mgmt has started
|
||||
failedResources *prometheus.GaugeVec // Number of current resources
|
||||
|
||||
resourcesState map[string]resStateWithKind // Maps the resources with their current kind/state
|
||||
mutex *sync.Mutex // Mutex used to update resourcesState
|
||||
}
|
||||
|
||||
// resStateWithKind is used to count the failures by kind
|
||||
type resStateWithKind struct {
|
||||
state ResState
|
||||
kind string
|
||||
}
|
||||
|
||||
// Init some parameters - currently the Listen address.
|
||||
@@ -46,6 +73,10 @@ func (obj *Prometheus) Init() error {
|
||||
if len(obj.Listen) == 0 {
|
||||
obj.Listen = DefaultPrometheusListen
|
||||
}
|
||||
|
||||
obj.mutex = &sync.Mutex{}
|
||||
obj.resourcesState = make(map[string]resStateWithKind)
|
||||
|
||||
obj.checkApplyTotal = prometheus.NewCounterVec(
|
||||
prometheus.CounterOpts{
|
||||
Name: "mgmt_checkapply_total",
|
||||
@@ -68,6 +99,38 @@ func (obj *Prometheus) Init() error {
|
||||
)
|
||||
prometheus.MustRegister(obj.pgraphStartTimeSeconds)
|
||||
|
||||
obj.managedResources = prometheus.NewGaugeVec(
|
||||
prometheus.GaugeOpts{
|
||||
Name: "mgmt_resources",
|
||||
Help: "Number of managed resources.",
|
||||
},
|
||||
// kind: resource type: Svc, File, ...
|
||||
[]string{"kind"},
|
||||
)
|
||||
prometheus.MustRegister(obj.managedResources)
|
||||
|
||||
obj.failedResourcesTotal = prometheus.NewCounterVec(
|
||||
prometheus.CounterOpts{
|
||||
Name: "mgmt_failures_total",
|
||||
Help: "Total of failed resources.",
|
||||
},
|
||||
// kind: resource type: Svc, File, ...
|
||||
// failure: soft or hard
|
||||
[]string{"kind", "failure"},
|
||||
)
|
||||
prometheus.MustRegister(obj.failedResourcesTotal)
|
||||
|
||||
obj.failedResources = prometheus.NewGaugeVec(
|
||||
prometheus.GaugeOpts{
|
||||
Name: "mgmt_failures",
|
||||
Help: "Number of failing resources.",
|
||||
},
|
||||
// kind: resource type: Svc, File, ...
|
||||
// failure: soft or hard
|
||||
[]string{"kind", "failure"},
|
||||
)
|
||||
prometheus.MustRegister(obj.failedResources)
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
@@ -107,3 +170,94 @@ func (obj *Prometheus) UpdatePgraphStartTime() error {
|
||||
obj.pgraphStartTimeSeconds.SetToCurrentTime()
|
||||
return nil
|
||||
}
|
||||
|
||||
// AddManagedResource increments the Managed Resource counter and updates the resource status.
|
||||
func (obj *Prometheus) AddManagedResource(resUUID string, rtype string) error {
|
||||
if obj == nil {
|
||||
return nil // happens when mgmt is launched without --prometheus
|
||||
}
|
||||
obj.managedResources.With(prometheus.Labels{"kind": rtype}).Inc()
|
||||
if err := obj.UpdateState(resUUID, rtype, ResStateOK); err != nil {
|
||||
return errwrap.Wrapf(err, "can't update the resource status in the map")
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// RemoveManagedResource decrements the Managed Resource counter and updates the resource status.
|
||||
func (obj *Prometheus) RemoveManagedResource(resUUID string, rtype string) error {
|
||||
if obj == nil {
|
||||
return nil // happens when mgmt is launched without --prometheus
|
||||
}
|
||||
obj.managedResources.With(prometheus.Labels{"kind": rtype}).Dec()
|
||||
if err := obj.deleteState(resUUID); err != nil {
|
||||
return errwrap.Wrapf(err, "can't remove the resource status from the map")
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// deleteState removes the resources for the state map and re-populates the failing gauge.
|
||||
func (obj *Prometheus) deleteState(resUUID string) error {
|
||||
if obj == nil {
|
||||
return nil // happens when mgmt is launched without --prometheus
|
||||
}
|
||||
obj.mutex.Lock()
|
||||
delete(obj.resourcesState, resUUID)
|
||||
obj.mutex.Unlock()
|
||||
if err := obj.updateFailingGauge(); err != nil {
|
||||
return errwrap.Wrapf(err, "can't update the failing gauge")
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// UpdateState updates the state of the resources in our internal state map
|
||||
// then triggers a refresh of the failing gauge.
|
||||
func (obj *Prometheus) UpdateState(resUUID string, rtype string, newState ResState) error {
|
||||
defer obj.updateFailingGauge()
|
||||
if obj == nil {
|
||||
return nil // happens when mgmt is launched without --prometheus
|
||||
}
|
||||
obj.mutex.Lock()
|
||||
obj.resourcesState[resUUID] = resStateWithKind{state: newState, kind: rtype}
|
||||
obj.mutex.Unlock()
|
||||
if newState != ResStateOK {
|
||||
var strState string
|
||||
if newState == ResStateSoftFail {
|
||||
strState = "soft"
|
||||
} else if newState == ResStateHardFail {
|
||||
strState = "hard"
|
||||
} else {
|
||||
return errors.New("state should be soft or hard failure")
|
||||
}
|
||||
obj.failedResourcesTotal.With(prometheus.Labels{"kind": rtype, "failure": strState}).Inc()
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// updateFailingGauge refreshes the failing gauge by parsking the internal
|
||||
// state map.
|
||||
func (obj *Prometheus) updateFailingGauge() error {
|
||||
if obj == nil {
|
||||
return nil // happens when mgmt is launched without --prometheus
|
||||
}
|
||||
var softFails, hardFails map[string]float64
|
||||
softFails = make(map[string]float64)
|
||||
hardFails = make(map[string]float64)
|
||||
for _, v := range obj.resourcesState {
|
||||
if v.state == ResStateSoftFail {
|
||||
softFails[v.kind]++
|
||||
} else if v.state == ResStateHardFail {
|
||||
hardFails[v.kind]++
|
||||
}
|
||||
}
|
||||
// TODO: we might want to Zero the metrics we are not using
|
||||
// because in prometheus design the metrics keep living for some time
|
||||
// even after they are removed.
|
||||
obj.failedResources.Reset()
|
||||
for k, v := range softFails {
|
||||
obj.failedResources.With(prometheus.Labels{"kind": k, "failure": "soft"}).Set(v)
|
||||
}
|
||||
for k, v := range hardFails {
|
||||
obj.failedResources.With(prometheus.Labels{"kind": k, "failure": "hard"}).Set(v)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user