Add storage metrics (#554)

* Add cache storage metrics

- autocache client metrics
- autocache server metrics
- boltdb metrics
- redis client metrics
- refactor metrics registry to be general purpose (see the sketch after the file summary below)
Travis Groth 2020-03-23 22:07:48 -04:00 committed by GitHub
parent acfc880421
commit cc504362e4
8 changed files with 400 additions and 108 deletions
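
The last bullet above — making the metrics registry general purpose — is the piece the storage backends build on: instead of recording stats views on every event, a backend hands the registry a closure and the value is sampled when metrics are collected. A minimal sketch of that pattern, using only the OpenCensus packages this commit already uses (the metric name, label value, and `writes` counter below are invented for illustration; the real entry points are the unexported `addInt64Derived*Metric` helpers added in the new registry file at the end of this diff):

```go
package main

import (
	"go.opencensus.io/metric"
	"go.opencensus.io/metric/metricdata"
	"go.opencensus.io/metric/metricproducer"
)

func main() {
	// A registry of "derived" metrics: values come from callbacks that are
	// invoked at collection time rather than being recorded eagerly.
	r := metric.NewRegistry()

	// Registering the registry with the global producer manager is what lets
	// a pull exporter (e.g. the Prometheus endpoint) find these metrics.
	metricproducer.GlobalManager().AddProducer(r)

	// Stand-in for a storage backend's counter (hypothetical).
	var writes int64 = 42

	// A derived cumulative metric: the closure is re-evaluated on every read.
	m, err := r.AddInt64DerivedCumulative("example_writes_total",
		metric.WithDescription("Total number of writes performed"),
		metric.WithLabelKeys("service"),
	)
	if err != nil {
		panic(err)
	}
	if err := m.UpsertEntry(func() int64 { return writes }, metricdata.NewLabelValue("boltdb")); err != nil {
		panic(err)
	}
	// An exporter later walks metricproducer.GlobalManager() and calls Read()
	// on each producer; see the sketch at the end of this diff.
}
```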


@@ -325,7 +325,35 @@ Expose a prometheus format HTTP endpoint on the specified port. Disabled by default.
**Metrics tracked**

| Name | Type | Description |
| --- | --- | --- |
| boltdb_free_alloc_size_bytes | Gauge | Bytes allocated in free pages |
| boltdb_free_page_n | Gauge | Number of free pages on the freelist |
| boltdb_freelist_inuse_size_bytes | Gauge | Bytes used by the freelist |
| boltdb_open_txn | Gauge | Number of currently open read transactions |
| boltdb_pending_page_n | Gauge | Number of pending pages on the freelist |
| boltdb_txn | Gauge | Total number of started read transactions |
| boltdb_txn_cursor_total | Counter | Total number of cursors created |
| boltdb_txn_node_deref_total | Counter | Total number of node dereferences |
| boltdb_txn_node_total | Counter | Total number of node allocations |
| boltdb_txn_page_alloc_size_bytes_total | Counter | Total bytes allocated |
| boltdb_txn_page_total | Counter | Total number of page allocations |
| boltdb_txn_rebalance_duration_ms_total | Counter | Total time spent rebalancing |
| boltdb_txn_rebalance_total | Counter | Total number of node rebalances |
| boltdb_txn_spill_duration_ms_total | Counter | Total time spent spilling |
| boltdb_txn_spill_total | Counter | Total number of nodes spilled |
| boltdb_txn_split_total | Counter | Total number of nodes split |
| boltdb_txn_write_duration_ms_total | Counter | Total time spent writing to disk |
| boltdb_txn_write_total | Counter | Total number of writes performed |
| groupcache_cache_hits_total | Counter | Total cache hits in local or cluster cache |
| groupcache_gets_total | Counter | Total get requests, including from peers |
| groupcache_loads_deduped_total | Counter | Total gets without cache hits, after duplicate suppression |
| groupcache_loads_total | Counter | Total gets without cache hits |
| groupcache_local_load_errs_total | Counter | Total local load errors |
| groupcache_local_loads_total | Counter | Total good local loads |
| groupcache_peer_errors_total | Counter | Total errors from peers |
| groupcache_peer_loads_total | Counter | Total remote loads or cache hits without error |
| groupcache_server_requests_total | Counter | Total gets from peers |
| grpc_client_request_duration_ms | Histogram | GRPC client request duration by service |
| grpc_client_request_size_bytes | Histogram | GRPC client request size by service |
| grpc_client_requests_total | Counter | Total GRPC client requests made by service |
@@ -342,10 +370,17 @@ Expose a prometheus format HTTP endpoint on the specified port. Disabled by default.
| http_server_request_size_bytes | Histogram | HTTP server request size by service |
| http_server_requests_total | Counter | Total HTTP server requests handled by service |
| http_server_response_size_bytes | Histogram | HTTP server response size by service |
| pomerium_build_info | Gauge | Pomerium build metadata by git revision, service, version and goversion |
| pomerium_config_checksum_int64 | Gauge | Currently loaded configuration checksum by service |
| pomerium_config_last_reload_success | Gauge | Whether the last configuration reload succeeded by service |
| pomerium_config_last_reload_success_timestamp | Gauge | The timestamp of the last successful configuration reload by service |
| redis_conns | Gauge | Number of total connections in the pool |
| redis_hits_total | Counter | Total number of times a free connection was found in the pool |
| redis_idle_conns | Gauge | Number of idle connections in the pool |
| redis_misses_total | Counter | Total number of times a free connection was not found in the pool |
| redis_stale_conns_total | Counter | Total number of stale connections removed from the pool |
| redis_timeouts_total | Counter | Total number of times a wait timeout occurred |

### Tracing
@@ -357,8 +392,8 @@ Each unit of work is called a Span in a trace. Spans include metadata about the work
| Config Key | Description | Required |
| :--- | :--- | --- |
| tracing_provider | The name of the tracing provider. (e.g. jaeger) | ✅ |
| tracing_debug | Will disable [sampling](https://opencensus.io/tracing/sampling/). | ❌ |

#### Jaeger
@@ -372,8 +407,8 @@ Each unit of work is called a Span in a trace. Spans include metadata about the work
| Config Key | Description | Required |
| :--- | :--- | --- |
| tracing_jaeger_collector_endpoint | URL to the Jaeger HTTP Thrift collector. | ✅ |
| tracing_jaeger_agent_endpoint | Send spans to jaeger-agent at this address. | ✅ |

#### Example


@@ -12,9 +12,11 @@ import (
"sync" "sync"
"github.com/golang/groupcache" "github.com/golang/groupcache"
"github.com/pomerium/autocache" "github.com/pomerium/autocache"
"github.com/pomerium/pomerium/internal/httputil" "github.com/pomerium/pomerium/internal/httputil"
"github.com/pomerium/pomerium/internal/kv" "github.com/pomerium/pomerium/internal/kv"
"github.com/pomerium/pomerium/internal/telemetry/metrics"
"github.com/pomerium/pomerium/internal/urlutil" "github.com/pomerium/pomerium/internal/urlutil"
) )
@@ -116,13 +118,14 @@ func New(o *Options) (*Store, error) {
}
serverOpts := &httputil.ServerOptions{Addr: o.Addr}
var wg sync.WaitGroup
s.srv, err = httputil.NewServer(serverOpts, metrics.HTTPMetricsHandler("groupcache")(QueryParamToCtx(s.cluster)), &wg)
if err != nil {
return nil, err
}
if _, err := s.cluster.Join([]string{o.ClusterDomain}); err != nil {
return nil, err
}
metrics.AddGroupCacheMetrics(s.db)
return &s, nil
}
@@ -191,7 +194,9 @@ func (s signedSession) RoundTrip(req *http.Request) (*http.Response, error) {
q.Set(defaultQueryParamKey, session)
newReqURL.RawQuery = q.Encode()
newReq.URL = urlutil.NewSignedURL(s.sharedKey, &newReqURL).Sign()
tripper := metrics.HTTPMetricsRoundTripper("cache", "groupcache")(http.DefaultTransport)
return tripper.RoundTrip(newReq)
}
// QueryParamToCtx takes a value from a query param and adds it to the
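
`metrics.HTTPMetricsHandler` and `metrics.HTTPMetricsRoundTripper` appear to be existing Pomerium middleware; only their call sites appear in this diff. As a rough illustration of the decorator shape they follow — configure labels, get back a wrapper, apply it to a handler or transport — here is a simplified, hypothetical stand-in that merely counts outgoing requests (the real round tripper records size and duration views):

```go
package main

import (
	"net/http"
	"sync/atomic"
)

// countingTransport forwards requests to the wrapped RoundTripper and bumps a
// counter. It is a toy stand-in for an instrumented transport.
type countingTransport struct {
	next     http.RoundTripper
	requests *int64
}

func (t countingTransport) RoundTrip(req *http.Request) (*http.Response, error) {
	atomic.AddInt64(t.requests, 1)
	return t.next.RoundTrip(req)
}

// wrapTransport mirrors the call style in the hunk above:
// wrapTransport(&n)(http.DefaultTransport).
func wrapTransport(counter *int64) func(http.RoundTripper) http.RoundTripper {
	return func(next http.RoundTripper) http.RoundTripper {
		return countingTransport{next: next, requests: counter}
	}
}

func main() {
	var n int64
	client := &http.Client{Transport: wrapTransport(&n)(http.DefaultTransport)}
	_ = client // every client.Do / client.Get now increments n
}
```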


@@ -5,9 +5,10 @@ package bolt
import (
"context"
bolt "go.etcd.io/bbolt"
"github.com/pomerium/pomerium/internal/kv"
"github.com/pomerium/pomerium/internal/telemetry/metrics"
)
var _ kv.Store = &Store{}
@@ -64,6 +65,7 @@ func New(o *Options) (*Store, error) {
return nil, err
}
metrics.AddBoltDBMetrics(db.Stats)
return &Store{db: db, bucket: o.Bucket}, nil
}


@@ -9,7 +9,9 @@ import (
"fmt" "fmt"
"github.com/go-redis/redis/v7" "github.com/go-redis/redis/v7"
"github.com/pomerium/pomerium/internal/kv" "github.com/pomerium/pomerium/internal/kv"
"github.com/pomerium/pomerium/internal/telemetry/metrics"
) )
var _ kv.Store = &Store{} var _ kv.Store = &Store{}
@@ -56,6 +58,7 @@ func New(o *Options) (*Store, error) {
return nil, fmt.Errorf("kv/redis: error connecting to redis: %w", err)
}
metrics.AddRedisMetrics(db.PoolStats)
return &Store{db: db}, nil
}
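
Note that both this store and the bolt store above pass a method value — `db.PoolStats` and `db.Stats`, not their results — so pool and database statistics are read fresh on every scrape rather than frozen at startup. A short sketch of the redis side, assuming it lives inside the pomerium module (the `internal/telemetry/metrics` package is not importable from elsewhere) and using a hypothetical wrapper name:

```go
package example

import (
	"github.com/go-redis/redis/v7"

	"github.com/pomerium/pomerium/internal/telemetry/metrics"
)

// registerRedisPoolMetrics is a hypothetical helper showing the idiom.
func registerRedisPoolMetrics(addr string) *redis.Client {
	// NewClient does not dial eagerly, so this is safe at startup.
	client := redis.NewClient(&redis.Options{Addr: addr})

	// Pass the method value, not client.PoolStats(): the closure is invoked
	// by the metrics registry each time metrics are collected.
	metrics.AddRedisMetrics(client.PoolStats)
	return client
}
```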


@@ -2,19 +2,14 @@ package metrics
import (
"context"
"time"
"go.opencensus.io/metric/metricproducer"
"go.opencensus.io/stats"
"go.opencensus.io/stats/view"
"go.opencensus.io/tag"
"github.com/pomerium/pomerium/internal/log"
)
var (
@@ -30,7 +25,6 @@ var (
"config_last_reload_success", "config_last_reload_success",
"Returns 1 if last reload was successful", "Returns 1 if last reload was successful",
"1") "1")
registry = newMetricRegistry()
// ConfigLastReloadView contains the timestamp the configuration was last // ConfigLastReloadView contains the timestamp the configuration was last
// reloaded, labeled by service. // reloaded, labeled by service.
@@ -79,75 +73,6 @@ func SetConfigInfo(service string, success bool) {
}
}
// SetBuildInfo records the pomerium build info. You must call RegisterInfoMetrics to
// have this exported
func SetBuildInfo(service string) {
@@ -159,33 +84,12 @@ func RegisterInfoMetrics() {
metricproducer.GlobalManager().AddProducer(registry.registry)
}
// SetConfigChecksum creates the configuration checksum metric. You must call RegisterInfoMetrics to
// have this exported
func SetConfigChecksum(service string, checksum uint64) {
registry.setConfigChecksum(service, checksum)
}
// AddPolicyCountCallback sets the function to call when exporting the
// policy count metric. You must call RegisterInfoMetrics to have this
// exported


@@ -0,0 +1,107 @@
package metrics
import (
"github.com/go-redis/redis/v7"
"github.com/golang/groupcache"
"go.etcd.io/bbolt"
)
// AddGroupCacheMetrics registers a metrics handler against a *groupcache.Group
func AddGroupCacheMetrics(gc *groupcache.Group) {
cumulativeMetrics := []struct {
name string
desc string
f func() int64
}{
{"groupcache_gets_total", "Total get request, including from peers", gc.Stats.Gets.Get},
{"groupcache_cache_hits_total", "Total cache hits in local or cluster cache", gc.Stats.CacheHits.Get},
{"groupcache_cache_hits_total", "Total cache hits in local or cluster cache", gc.Stats.CacheHits.Get},
{"groupcache_peer_loads_total", "Total remote loads or cache hits without error", gc.Stats.PeerLoads.Get},
{"groupcache_peer_errors_total", "Total errors from peers", gc.Stats.PeerErrors.Get},
{"groupcache_loads_total", "Total gets without cache hits", gc.Stats.Loads.Get},
{"groupcache_loads_deduped_total", "gets without cache hits after duplicate suppression", gc.Stats.LoadsDeduped.Get},
{"groupcache_local_loads_total", "Total good local loads", gc.Stats.LocalLoads.Get},
{"groupcache_local_load_errs_total", "Total local load errors", gc.Stats.LocalLoadErrs.Get},
{"groupcache_server_requests_total", "Total gets from peers", gc.Stats.ServerRequests.Get},
}
for _, m := range cumulativeMetrics {
registry.addInt64DerivedCumulativeMetric(m.name, m.desc, "autocache", m.f)
}
}
// AddBoltDBMetrics registers a metrics handler against a *bbolt.DB
func AddBoltDBMetrics(stats func() bbolt.Stats) {
gaugeMetrics := []struct {
name string
desc string
f func() int64
}{
{"boltdb_free_page_n", "Number of free pages on the freelist", func() int64 { return int64(stats().FreePageN) }},
{"boltdb_pending_page_n", "Number of pending pages on the freelist", func() int64 { return int64(stats().PendingPageN) }},
{"boltdb_free_alloc_size_bytes", "Bytes allocated in free pages", func() int64 { return int64(stats().FreeAlloc) }},
{"boltdb_freelist_inuse_size_bytes", "Bytes used by the freelist", func() int64 { return int64(stats().FreelistInuse) }},
{"boltdb_txn", "total number of started read transactions", func() int64 { return int64(stats().TxN) }},
{"boltdb_open_txn", "number of currently open read transactions", func() int64 { return int64(stats().OpenTxN) }},
}
for _, m := range gaugeMetrics {
registry.addInt64DerivedGaugeMetric(m.name, m.desc, "boltdb", m.f)
}
cumulativeMetrics := []struct {
name string
desc string
f func() int64
}{
{"boltdb_txn_page_total", "Total number of page allocations", func() int64 { return int64(stats().TxStats.PageCount) }},
{"boltdb_txn_page_alloc_size_bytes_total", "Total bytes allocated", func() int64 { return int64(stats().TxStats.PageAlloc) }},
{"boltdb_txn_cursor_total", "Total number of cursors created", func() int64 { return int64(stats().TxStats.CursorCount) }},
{"boltdb_txn_node_total", "Total number of node allocations", func() int64 { return int64(stats().TxStats.NodeCount) }},
{"boltdb_txn_node_deref_total", "Total number of node dereferences", func() int64 { return int64(stats().TxStats.NodeDeref) }},
{"boltdb_txn_rebalance_total", "Total number of node rebalances", func() int64 { return int64(stats().TxStats.Rebalance) }},
{"boltdb_txn_rebalance_duration_ms_total", "Total time spent rebalancing", func() int64 { return stats().TxStats.RebalanceTime.Milliseconds() }},
{"boltdb_txn_split_total", "Total number of nodes split", func() int64 { return int64(stats().TxStats.Split) }},
{"boltdb_txn_spill_total", "Total number of nodes spilled", func() int64 { return int64(stats().TxStats.Spill) }},
{"boltdb_txn_spill_duration_ms_total", "Total time spent spilling", func() int64 { return stats().TxStats.SpillTime.Milliseconds() }},
{"boltdb_txn_write_total", "Total number of writes performed", func() int64 { return int64(stats().TxStats.Write) }},
{"boltdb_txn_write_duration_ms_total", "Total time spent writing to disk", func() int64 { return stats().TxStats.WriteTime.Milliseconds() }},
}
for _, m := range cumulativeMetrics {
registry.addInt64DerivedCumulativeMetric(m.name, m.desc, "boltdb", m.f)
}
}
// AddRedisMetrics registers a metrics handler against a redis Client's PoolStats() method
func AddRedisMetrics(stats func() *redis.PoolStats) {
gaugeMetrics := []struct {
name string
desc string
f func() int64
}{
{"redis_conns", "Number of total connections in the pool", func() int64 { return int64(stats().TotalConns) }},
{"redis_idle_conns", "Number of idle connections in the pool", func() int64 { return int64(stats().IdleConns) }},
}
for _, m := range gaugeMetrics {
registry.addInt64DerivedGaugeMetric(m.name, m.desc, "redis", m.f)
}
cumulativeMetrics := []struct {
name string
desc string
f func() int64
}{
{"redis_hits_total", "Total number of times free connection was found in the pool", func() int64 { return int64(stats().Hits) }},
{"redis_misses_total", "Total number of times free connection was NOT found in the pool", func() int64 { return int64(stats().Misses) }},
{"redis_timeouts_total", "Total number of times a wait timeout occurred", func() int64 { return int64(stats().Timeouts) }},
{"redis_stale_conns_total", "Total number of stale connections removed from the pool", func() int64 { return int64(stats().StaleConns) }},
}
for _, m := range cumulativeMetrics {
registry.addInt64DerivedCumulativeMetric(m.name, m.desc, "redis", m.f)
}
}


@@ -0,0 +1,94 @@
package metrics
import (
"testing"
"time"
"github.com/go-redis/redis/v7"
"github.com/golang/groupcache"
"go.etcd.io/bbolt"
"go.opencensus.io/metric/metricdata"
)
func Test_AddGroupCacheMetrics(t *testing.T) {
t.Parallel()
gc := &groupcache.Group{}
AddGroupCacheMetrics(gc)
tests := []struct {
name string
stat *groupcache.AtomicInt
want int64
}{
{"groupcache_gets_total", &gc.Stats.Gets, 4},
{"groupcache_loads_total", &gc.Stats.Loads, 42},
{"groupcache_server_requests_total", &gc.Stats.ServerRequests, 8},
}
labelValues := []metricdata.LabelValue{
metricdata.NewLabelValue("autocache"),
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
tt.stat.Add(tt.want)
testMetricRetrieval(registry.registry.Read(), t, labelValues, tt.want, tt.name)
})
}
}
func Test_AddBoltDBMetrics(t *testing.T) {
t.Parallel()
tests := []struct {
name string
stat bbolt.Stats
want int64
}{
{"boltdb_free_page_n", bbolt.Stats{FreePageN: 14}, 14},
{"boltdb_txn", bbolt.Stats{TxN: 88}, 88},
{"boltdb_txn_rebalance_duration_ms_total", bbolt.Stats{TxStats: bbolt.TxStats{RebalanceTime: 42 * time.Millisecond}}, 42},
{"boltdb_txn_write_total", bbolt.Stats{TxStats: bbolt.TxStats{Write: 42}}, 42},
}
labelValues := []metricdata.LabelValue{
metricdata.NewLabelValue("boltdb"),
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
AddBoltDBMetrics(func() bbolt.Stats { return tt.stat })
testMetricRetrieval(registry.registry.Read(), t, labelValues, tt.want, tt.name)
})
}
}
func Test_AddRedisMetrics(t *testing.T) {
t.Parallel()
tests := []struct {
name string
stat *redis.PoolStats
want int64
}{
{"redis_conns", &redis.PoolStats{TotalConns: 7}, 7},
{"redis_hits_total", &redis.PoolStats{Hits: 78}, 78},
{"redis_timeouts_total", &redis.PoolStats{Timeouts: 2}, 2},
}
labelValues := []metricdata.LabelValue{
metricdata.NewLabelValue("redis"),
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
AddRedisMetrics(func() *redis.PoolStats { return tt.stat })
testMetricRetrieval(registry.registry.Read(), t, labelValues, tt.want, tt.name)
})
}
}
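
`testMetricRetrieval` already exists elsewhere in the metrics package and is not shown in this diff. To make the assertions above concrete, a hypothetical minimal version might walk the `Read()` snapshot like this (signature and behavior are assumptions, not the repository's actual helper):

```go
package metrics

import (
	"reflect"
	"testing"

	"go.opencensus.io/metric/metricdata"
)

// testMetricRetrieval (sketch only): find the named metric in the exported
// snapshot, match its label values, and compare the latest point's value.
func testMetricRetrieval(data []*metricdata.Metric, t *testing.T, labels []metricdata.LabelValue, want int64, name string) {
	t.Helper()
	for _, m := range data {
		if m.Descriptor.Name != name {
			continue
		}
		for _, ts := range m.TimeSeries {
			if !reflect.DeepEqual(ts.LabelValues, labels) {
				continue
			}
			if got := ts.Points[0].Value.(int64); got != want {
				t.Errorf("%s = %d, want %d", name, got, want)
			}
			return
		}
	}
	t.Errorf("metric %s with labels %v not found", name, labels)
}
```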


@@ -0,0 +1,142 @@
package metrics
import (
"runtime"
"sync"
"go.opencensus.io/metric"
"go.opencensus.io/metric/metricdata"
"github.com/pomerium/pomerium/internal/log"
"github.com/pomerium/pomerium/internal/version"
)
var (
registry = newMetricRegistry()
)
// metricRegistry holds the non-view metrics and handles safe
// initialization and updates. Behavior without using newMetricRegistry()
// is undefined.
type metricRegistry struct {
registry *metric.Registry
buildInfo *metric.Int64Gauge
policyCount *metric.Int64DerivedGauge
configChecksum *metric.Float64Gauge
sync.Once
}
func newMetricRegistry() *metricRegistry {
r := new(metricRegistry)
r.init()
return r
}
func (r *metricRegistry) init() {
r.Do(
func() {
r.registry = metric.NewRegistry()
var err error
r.buildInfo, err = r.registry.AddInt64Gauge("build_info",
metric.WithDescription("Build Metadata"),
metric.WithLabelKeys("service", "version", "revision", "goversion"),
)
if err != nil {
log.Error().Err(err).Msg("telemetry/metrics: failed to register build info metric")
}
r.configChecksum, err = r.registry.AddFloat64Gauge("config_checksum_decimal",
metric.WithDescription("Config checksum represented in decimal notation"),
metric.WithLabelKeys("service"),
)
if err != nil {
log.Error().Err(err).Msg("telemetry/metrics: failed to register config checksum metric")
}
r.policyCount, err = r.registry.AddInt64DerivedGauge("policy_count_total",
metric.WithDescription("Total number of policies loaded"),
metric.WithLabelKeys("service"),
)
if err != nil {
log.Error().Err(err).Msg("telemetry/metrics: failed to register policy count metric")
}
})
}
// SetBuildInfo records the pomerium build info. You must call RegisterInfoMetrics to
// have this exported
func (r *metricRegistry) setBuildInfo(service string) {
if registry.buildInfo == nil {
return
}
m, err := registry.buildInfo.GetEntry(
metricdata.NewLabelValue(service),
metricdata.NewLabelValue(version.FullVersion()),
metricdata.NewLabelValue(version.GitCommit),
metricdata.NewLabelValue((runtime.Version())),
)
if err != nil {
log.Error().Err(err).Msg("telemetry/metrics: failed to get build info metric")
}
// This sets our build_info metric to a constant 1 per
// https://www.robustperception.io/exposing-the-software-version-to-prometheus
m.Set(1)
}
func (r *metricRegistry) addPolicyCountCallback(service string, f func() int64) {
if r.policyCount == nil {
return
}
err := r.policyCount.UpsertEntry(f, metricdata.NewLabelValue(service))
if err != nil {
log.Error().Err(err).Msg("telemetry/metrics: failed to get policy count metric")
}
}
func (r *metricRegistry) setConfigChecksum(service string, checksum uint64) {
if r.configChecksum == nil {
return
}
m, err := r.configChecksum.GetEntry(metricdata.NewLabelValue(service))
if err != nil {
log.Error().Err(err).Msg("telemetry/metrics: failed to get config checksum metric")
}
m.Set(float64(checksum))
}
func (r *metricRegistry) addInt64DerivedGaugeMetric(name string, desc string, service string, f func() int64) {
m, err := r.registry.AddInt64DerivedGauge(name, metric.WithDescription(desc), metric.WithLabelKeys("service"))
if err != nil {
log.Error().Err(err).Str("service", service).Msg("telemetry/metrics: failed to register metric")
return
}
err = m.UpsertEntry(
f,
metricdata.NewLabelValue(service),
)
if err != nil {
log.Error().Err(err).Str("service", service).Msg("telemetry/metrics: failed to update metric")
return
}
}
func (r *metricRegistry) addInt64DerivedCumulativeMetric(name string, desc string, service string, f func() int64) {
m, err := r.registry.AddInt64DerivedCumulative(name, metric.WithDescription(desc), metric.WithLabelKeys("service"))
if err != nil {
log.Error().Err(err).Str("service", service).Msg("telemetry/metrics: failed to register metric")
return
}
err = m.UpsertEntry(
f,
metricdata.NewLabelValue(service),
)
if err != nil {
log.Error().Err(err).Str("service", service).Msg("telemetry/metrics: failed to update metric")
return
}
}
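
These helpers only make the metrics available for collection; `RegisterInfoMetrics` (kept in the hunk further up) wires the registry into `metricproducer.GlobalManager()`, which is where a pull exporter such as the Prometheus endpoint picks them up. A small sketch of that consuming side, with a hypothetical function name, assuming only the OpenCensus producer API:

```go
package example

import (
	"fmt"

	"go.opencensus.io/metric/metricproducer"
)

// dumpProducerMetrics walks every registered producer the same way a pull
// exporter does on each scrape, printing one line per point.
func dumpProducerMetrics() {
	for _, p := range metricproducer.GlobalManager().GetAll() {
		for _, m := range p.Read() {
			for _, ts := range m.TimeSeries {
				for _, pt := range ts.Points {
					fmt.Println(m.Descriptor.Name, ts.LabelValues, pt.Value)
				}
			}
		}
	}
}
```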