This ensures that docstring comments are wrapped to 80 chars. ffrank seemed to be making this mistake far too often, and it's a silly thing to look for manually. As it turns out, I've made it too, as have many others. Now we have a test that checks for most cases. There are still a few stray cases that aren't checked automatically, but this can be improved upon if someone is motivated to do so. Before anyone complains about the 80 character limit: this only checks docstring comments, not source code length or inline source code comments. There's no excuse for having docstrings that are badly reflowed or over 80 chars, particularly if you have an automated test.
300 lines
9.8 KiB
Go
300 lines
9.8 KiB
Go
// Mgmt
// Copyright (C) 2013-2020+ James Shubin and the project contributors
// Written by James Shubin <james@shubin.ca> and the project contributors
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
|
|
// Package prometheus provides functions that are useful to control and manage
// the built-in prometheus instance.
|
|
package prometheus
|
|
|
|
import (
|
|
"errors"
|
|
"net/http"
|
|
"strconv"
|
|
"sync"
|
|
|
|
"github.com/prometheus/client_golang/prometheus"
|
|
"github.com/prometheus/client_golang/prometheus/promhttp"
|
|
"github.com/purpleidea/mgmt/util/errwrap"
|
|
)
|
|
|
|
// DefaultPrometheusListen is the listen address that Init falls back to when
// no Listen value is set on the Prometheus struct. The port is registered in
// https://github.com/prometheus/prometheus/wiki/Default-port-allocations
const DefaultPrometheusListen = "127.0.0.1:9233"
|
|
|
|
// ResState represents the status of a resource.
type ResState int

const (
	// ResStateOK represents a working resource.
	ResStateOK ResState = iota
	// ResStateSoftFail represents a resource in soft fail (will be
	// retried).
	ResStateSoftFail
	// ResStateHardFail represents a resource in hard fail (will NOT be
	// retried).
	ResStateHardFail
)
|
|
|
|
// Prometheus is the struct that contains information about the prometheus
|
|
// instance. Run Init() on it.
|
|
type Prometheus struct {
|
|
Listen string // the listen specification for the net/http server
|
|
|
|
checkApplyTotal *prometheus.CounterVec // total of CheckApplies that have been triggered
|
|
pgraphStartTimeSeconds prometheus.Gauge // process start time in seconds since unix epoch
|
|
managedResources *prometheus.GaugeVec // Resources we manage now
|
|
failedResourcesTotal *prometheus.CounterVec // Total of failures since mgmt has started
|
|
failedResources *prometheus.GaugeVec // Number of current resources
|
|
|
|
resourcesState map[string]resStateWithKind // Maps the resources with their current kind/state
|
|
mutex *sync.Mutex // Mutex used to update resourcesState
|
|
}
|
|
|
|
// resStateWithKind is used to count the failures by kind
|
|
type resStateWithKind struct {
|
|
state ResState
|
|
kind string
|
|
}
|
|
|
|
// Init some parameters - currently the Listen address.
|
|
func (obj *Prometheus) Init() error {
|
|
if len(obj.Listen) == 0 {
|
|
obj.Listen = DefaultPrometheusListen
|
|
}
|
|
|
|
obj.mutex = &sync.Mutex{}
|
|
obj.resourcesState = make(map[string]resStateWithKind)
|
|
|
|
obj.checkApplyTotal = prometheus.NewCounterVec(
|
|
prometheus.CounterOpts{
|
|
Name: "mgmt_checkapply_total",
|
|
Help: "Number of CheckApply that have run.",
|
|
},
|
|
// Labels for this metric.
|
|
// kind: resource type: Svc, File, ...
|
|
// apply: if the CheckApply happened in "apply" mode
|
|
// eventful: did the CheckApply generate an event
|
|
// errorful: did the CheckApply generate an error
|
|
[]string{"kind", "apply", "eventful", "errorful"},
|
|
)
|
|
prometheus.MustRegister(obj.checkApplyTotal)
|
|
|
|
obj.pgraphStartTimeSeconds = prometheus.NewGauge(
|
|
prometheus.GaugeOpts{
|
|
Name: "mgmt_graph_start_time_seconds",
|
|
Help: "Start time of the current graph since unix epoch in seconds.",
|
|
},
|
|
)
|
|
prometheus.MustRegister(obj.pgraphStartTimeSeconds)
|
|
|
|
obj.managedResources = prometheus.NewGaugeVec(
|
|
prometheus.GaugeOpts{
|
|
Name: "mgmt_resources",
|
|
Help: "Number of managed resources.",
|
|
},
|
|
// kind: resource type: Svc, File, ...
|
|
[]string{"kind"},
|
|
)
|
|
prometheus.MustRegister(obj.managedResources)
|
|
|
|
obj.failedResourcesTotal = prometheus.NewCounterVec(
|
|
prometheus.CounterOpts{
|
|
Name: "mgmt_failures_total",
|
|
Help: "Total of failed resources.",
|
|
},
|
|
// kind: resource type: Svc, File, ...
|
|
// failure: soft or hard
|
|
[]string{"kind", "failure"},
|
|
)
|
|
prometheus.MustRegister(obj.failedResourcesTotal)
|
|
|
|
obj.failedResources = prometheus.NewGaugeVec(
|
|
prometheus.GaugeOpts{
|
|
Name: "mgmt_failures",
|
|
Help: "Number of failing resources.",
|
|
},
|
|
// kind: resource type: Svc, File, ...
|
|
// failure: soft or hard
|
|
[]string{"kind", "failure"},
|
|
)
|
|
prometheus.MustRegister(obj.failedResources)
|
|
|
|
return nil
|
|
}
|
|
|
|
// Start runs a http server in a go routine, that responds to /metrics as
|
|
// prometheus would expect.
|
|
func (obj *Prometheus) Start() error {
|
|
http.Handle("/metrics", promhttp.Handler())
|
|
go http.ListenAndServe(obj.Listen, nil)
|
|
return nil
|
|
}
|
|
|
|
// Stop the http server.
|
|
func (obj *Prometheus) Stop() error {
|
|
// FIXME: There is no way in go < 1.8 to stop a http server. Add this!
|
|
// https://stackoverflow.com/questions/39320025/go-how-to-stop-http-listenandserve/41433555#41433555
|
|
return nil
|
|
}
|
|
|
|
// InitKindMetrics initialized prometheus counters. For each kind of resource,
|
|
// checkApply counters are initialized with all the possible value.
|
|
func (obj *Prometheus) InitKindMetrics(kinds []string) error {
|
|
if obj == nil {
|
|
return nil // happens when mgmt is launched without --prometheus
|
|
}
|
|
bools := []bool{true, false}
|
|
for _, kind := range kinds {
|
|
for _, apply := range bools {
|
|
for _, eventful := range bools {
|
|
for _, errorful := range bools {
|
|
labels := prometheus.Labels{
|
|
"kind": kind,
|
|
"apply": strconv.FormatBool(apply),
|
|
"eventful": strconv.FormatBool(eventful),
|
|
"errorful": strconv.FormatBool(errorful),
|
|
}
|
|
obj.checkApplyTotal.With(labels)
|
|
}
|
|
}
|
|
}
|
|
|
|
obj.managedResources.With(prometheus.Labels{"kind": kind})
|
|
|
|
failures := []string{"soft", "hard"}
|
|
for _, f := range failures {
|
|
failLabels := prometheus.Labels{"kind": kind, "failure": f}
|
|
obj.failedResourcesTotal.With(failLabels)
|
|
obj.failedResources.With(failLabels)
|
|
}
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// UpdateCheckApplyTotal refreshes the failing gauge by parsing the internal
|
|
// state map.
|
|
func (obj *Prometheus) UpdateCheckApplyTotal(kind string, apply, eventful, errorful bool) error {
|
|
if obj == nil {
|
|
return nil // happens when mgmt is launched without --prometheus
|
|
}
|
|
labels := prometheus.Labels{"kind": kind, "apply": strconv.FormatBool(apply), "eventful": strconv.FormatBool(eventful), "errorful": strconv.FormatBool(errorful)}
|
|
metric := obj.checkApplyTotal.With(labels)
|
|
metric.Inc()
|
|
return nil
|
|
}
|
|
|
|
// UpdatePgraphStartTime updates the mgmt_graph_start_time_seconds metric to the
|
|
// current timestamp.
|
|
func (obj *Prometheus) UpdatePgraphStartTime() error {
|
|
if obj == nil {
|
|
return nil // happens when mgmt is launched without --prometheus
|
|
}
|
|
obj.pgraphStartTimeSeconds.SetToCurrentTime()
|
|
return nil
|
|
}
|
|
|
|
// AddManagedResource increments the Managed Resource counter and updates the
|
|
// resource status.
|
|
func (obj *Prometheus) AddManagedResource(resUUID string, rtype string) error {
|
|
if obj == nil {
|
|
return nil // happens when mgmt is launched without --prometheus
|
|
}
|
|
obj.managedResources.With(prometheus.Labels{"kind": rtype}).Inc()
|
|
if err := obj.UpdateState(resUUID, rtype, ResStateOK); err != nil {
|
|
return errwrap.Wrapf(err, "can't update the resource status in the map")
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// RemoveManagedResource decrements the Managed Resource counter and updates the
|
|
// resource status.
|
|
func (obj *Prometheus) RemoveManagedResource(resUUID string, rtype string) error {
|
|
if obj == nil {
|
|
return nil // happens when mgmt is launched without --prometheus
|
|
}
|
|
obj.managedResources.With(prometheus.Labels{"kind": rtype}).Dec()
|
|
if err := obj.deleteState(resUUID); err != nil {
|
|
return errwrap.Wrapf(err, "can't remove the resource status from the map")
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// deleteState removes the resources for the state map and re-populates the
|
|
// failing gauge.
|
|
func (obj *Prometheus) deleteState(resUUID string) error {
|
|
if obj == nil {
|
|
return nil // happens when mgmt is launched without --prometheus
|
|
}
|
|
obj.mutex.Lock()
|
|
delete(obj.resourcesState, resUUID)
|
|
obj.mutex.Unlock()
|
|
if err := obj.updateFailingGauge(); err != nil {
|
|
return errwrap.Wrapf(err, "can't update the failing gauge")
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// UpdateState updates the state of the resources in our internal state map then
|
|
// triggers a refresh of the failing gauge.
|
|
func (obj *Prometheus) UpdateState(resUUID string, rtype string, newState ResState) error {
|
|
defer obj.updateFailingGauge()
|
|
if obj == nil {
|
|
return nil // happens when mgmt is launched without --prometheus
|
|
}
|
|
obj.mutex.Lock()
|
|
obj.resourcesState[resUUID] = resStateWithKind{state: newState, kind: rtype}
|
|
obj.mutex.Unlock()
|
|
if newState != ResStateOK {
|
|
var strState string
|
|
if newState == ResStateSoftFail {
|
|
strState = "soft"
|
|
} else if newState == ResStateHardFail {
|
|
strState = "hard"
|
|
} else {
|
|
return errors.New("state should be soft or hard failure")
|
|
}
|
|
obj.failedResourcesTotal.With(prometheus.Labels{"kind": rtype, "failure": strState}).Inc()
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// updateFailingGauge refreshes the failing gauge by parsking the internal state
|
|
// map.
|
|
func (obj *Prometheus) updateFailingGauge() error {
|
|
if obj == nil {
|
|
return nil // happens when mgmt is launched without --prometheus
|
|
}
|
|
var softFails, hardFails map[string]float64
|
|
softFails = make(map[string]float64)
|
|
hardFails = make(map[string]float64)
|
|
for _, v := range obj.resourcesState {
|
|
if v.state == ResStateSoftFail {
|
|
softFails[v.kind]++
|
|
} else if v.state == ResStateHardFail {
|
|
hardFails[v.kind]++
|
|
}
|
|
}
|
|
// TODO: we might want to Zero the metrics we are not using
|
|
// because in prometheus design the metrics keep living for some time
|
|
// even after they are removed.
|
|
obj.failedResources.Reset()
|
|
for k, v := range softFails {
|
|
obj.failedResources.With(prometheus.Labels{"kind": k, "failure": "soft"}).Set(v)
|
|
}
|
|
for k, v := range hardFails {
|
|
obj.failedResources.With(prometheus.Labels{"kind": k, "failure": "hard"}).Set(v)
|
|
}
|
|
return nil
|
|
}
|