Mirror of https://github.com/pomerium/pomerium.git
Add storage metrics (#554)
* Add cache storage metrics:
  - autocache client metrics
  - autocache server metrics
  - boltdb metrics
  - redis client metrics
  - refactor the metrics registry to be general purpose
This commit is contained in: parent acfc880421, commit cc504362e4
8 changed files with 400 additions and 108 deletions
@@ -325,7 +325,35 @@ Expose a prometheus format HTTP endpoint on the specified port. Disabled by default.

**Metrics tracked**

| Name                                          | Type      | Description                                                              |
| :-------------------------------------------- | :-------- | :----------------------------------------------------------------------- |
| boltdb_free_alloc_size_bytes                  | Gauge     | Bytes allocated in free pages                                            |
| boltdb_free_page_n                            | Gauge     | Number of free pages on the freelist                                     |
| boltdb_freelist_inuse_size_bytes              | Gauge     | Bytes used by the freelist                                               |
| boltdb_open_txn                               | Gauge     | Number of currently open read transactions                               |
| boltdb_pending_page_n                         | Gauge     | Number of pending pages on the freelist                                  |
| boltdb_txn                                    | Gauge     | Total number of started read transactions                                |
| boltdb_txn_cursor_total                       | Counter   | Total number of cursors created                                          |
| boltdb_txn_node_deref_total                   | Counter   | Total number of node dereferences                                        |
| boltdb_txn_node_total                         | Counter   | Total number of node allocations                                         |
| boltdb_txn_page_alloc_size_bytes_total        | Counter   | Total bytes allocated                                                    |
| boltdb_txn_page_total                         | Counter   | Total number of page allocations                                         |
| boltdb_txn_rebalance_duration_ms_total        | Counter   | Total time spent rebalancing                                             |
| boltdb_txn_rebalance_total                    | Counter   | Total number of node rebalances                                          |
| boltdb_txn_spill_duration_ms_total            | Counter   | Total time spent spilling                                                |
| boltdb_txn_spill_total                        | Counter   | Total number of nodes spilled                                            |
| boltdb_txn_split_total                        | Counter   | Total number of nodes split                                              |
| boltdb_txn_write_duration_ms_total            | Counter   | Total time spent writing to disk                                         |
| boltdb_txn_write_total                        | Counter   | Total number of writes performed                                         |
| groupcache_cache_hits_total                   | Counter   | Total cache hits in local or cluster cache                               |
| groupcache_gets_total                         | Counter   | Total get requests, including from peers                                 |
| groupcache_loads_deduped_total                | Counter   | Gets without cache hits after duplicate suppression                      |
| groupcache_loads_total                        | Counter   | Total gets without cache hits                                            |
| groupcache_local_load_errs_total              | Counter   | Total local load errors                                                  |
| groupcache_local_loads_total                  | Counter   | Total good local loads                                                   |
| groupcache_peer_errors_total                  | Counter   | Total errors from peers                                                  |
| groupcache_peer_loads_total                   | Counter   | Total remote loads or cache hits without error                           |
| groupcache_server_requests_total              | Counter   | Total gets from peers                                                    |
| grpc_client_request_duration_ms               | Histogram | GRPC client request duration by service                                  |
| grpc_client_request_size_bytes                | Histogram | GRPC client request size by service                                      |
| grpc_client_requests_total                    | Counter   | Total GRPC client requests made by service                               |

@@ -342,10 +370,17 @@ Expose a prometheus format HTTP endpoint on the specified port. Disabled by default.

| http_server_request_size_bytes                | Histogram | HTTP server request size by service                                      |
| http_server_requests_total                    | Counter   | Total HTTP server requests handled by service                            |
| http_server_response_size_bytes               | Histogram | HTTP server response size by service                                     |
| pomerium_build_info                           | Gauge     | Pomerium build metadata by git revision, service, version and goversion  |
| pomerium_config_checksum_int64                | Gauge     | Currently loaded configuration checksum by service                       |
| pomerium_config_last_reload_success           | Gauge     | Whether the last configuration reload succeeded by service               |
| pomerium_config_last_reload_success_timestamp | Gauge     | The timestamp of the last successful configuration reload by service     |
| redis_conns                                   | Gauge     | Number of total connections in the pool                                  |
| redis_hits_total                              | Counter   | Total number of times a free connection was found in the pool            |
| redis_idle_conns                              | Gauge     | Number of idle connections in the pool                                   |
| redis_misses_total                            | Counter   | Total number of times a free connection was NOT found in the pool        |
| redis_stale_conns_total                       | Counter   | Total number of stale connections removed from the pool                  |
| redis_timeouts_total                          | Counter   | Total number of times a wait timeout occurred                            |
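Once a metrics address is configured (the endpoint is disabled by default), the new storage metrics appear in Prometheus text format alongside the existing ones. As a quick smoke test, a small Go program can scrape the endpoint and filter for them; the listen address and `/metrics` path here are illustrative assumptions, not values taken from this commit:

```go
package main

import (
	"bufio"
	"fmt"
	"log"
	"net/http"
	"strings"
)

func main() {
	// Assumes a pomerium instance with the metrics listener enabled and
	// reachable at this (illustrative) address.
	resp, err := http.Get("http://localhost:9090/metrics")
	if err != nil {
		log.Fatal(err)
	}
	defer resp.Body.Close()

	// Print only the storage metrics added by this commit.
	sc := bufio.NewScanner(resp.Body)
	for sc.Scan() {
		line := sc.Text()
		for _, prefix := range []string{"boltdb_", "groupcache_", "redis_"} {
			if strings.HasPrefix(line, prefix) {
				fmt.Println(line)
			}
		}
	}
}
```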
### Tracing
@@ -12,9 +12,11 @@ import (
 	"sync"

 	"github.com/golang/groupcache"

 	"github.com/pomerium/autocache"
 	"github.com/pomerium/pomerium/internal/httputil"
 	"github.com/pomerium/pomerium/internal/kv"
+	"github.com/pomerium/pomerium/internal/telemetry/metrics"
 	"github.com/pomerium/pomerium/internal/urlutil"
 )
@@ -116,13 +118,14 @@ func New(o *Options) (*Store, error) {
 	}
 	serverOpts := &httputil.ServerOptions{Addr: o.Addr}
 	var wg sync.WaitGroup
-	s.srv, err = httputil.NewServer(serverOpts, QueryParamToCtx(s.cluster), &wg)
+	s.srv, err = httputil.NewServer(serverOpts, metrics.HTTPMetricsHandler("groupcache")(QueryParamToCtx(s.cluster)), &wg)
 	if err != nil {
 		return nil, err
 	}
 	if _, err := s.cluster.Join([]string{o.ClusterDomain}); err != nil {
 		return nil, err
 	}
+	metrics.AddGroupCacheMetrics(s.db)
 	return &s, nil
 }
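The call site above uses metrics.HTTPMetricsHandler("groupcache") as a wrapping middleware: it takes the inner handler and returns an instrumented one. A minimal sketch of that shape, with a logging stand-in where the real helper presumably records OpenCensus measurements (everything beyond what the diff shows is an assumption):

```go
package main

import (
	"log"
	"net/http"
	"time"
)

// metricsMiddleware sketches the func(http.Handler) http.Handler shape that
// HTTPMetricsHandler appears to have. The real pomerium helper records
// request metrics; this stand-in only logs timings.
func metricsMiddleware(service string) func(http.Handler) http.Handler {
	return func(next http.Handler) http.Handler {
		return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
			start := time.Now()
			next.ServeHTTP(w, r)
			log.Printf("%s: %s %s took %s", service, r.Method, r.URL.Path, time.Since(start))
		})
	}
}

func main() {
	h := metricsMiddleware("groupcache")(http.HandlerFunc(
		func(w http.ResponseWriter, r *http.Request) { w.Write([]byte("ok")) },
	))
	log.Fatal(http.ListenAndServe("localhost:8080", h))
}
```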
@@ -191,7 +194,9 @@ func (s signedSession) RoundTrip(req *http.Request) (*http.Response, error) {
 	q.Set(defaultQueryParamKey, session)
 	newReqURL.RawQuery = q.Encode()
 	newReq.URL = urlutil.NewSignedURL(s.sharedKey, &newReqURL).Sign()
-	return http.DefaultTransport.RoundTrip(newReq)
+
+	tripper := metrics.HTTPMetricsRoundTripper("cache", "groupcache")(http.DefaultTransport)
+	return tripper.RoundTrip(newReq)
 }

 // QueryParamToCtx takes a value from a query param and adds it to the
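The client side mirrors the server middleware: HTTPMetricsRoundTripper decorates an http.RoundTripper so every outbound groupcache request is observed. A minimal sketch of the decorator pattern, assuming only what the call site shows (timing around the inner RoundTrip; the real implementation records to OpenCensus rather than logging):

```go
package main

import (
	"log"
	"net/http"
	"time"
)

// timedTransport wraps an inner http.RoundTripper and observes each
// request's duration. The observe callback is a stand-in for a metrics sink.
type timedTransport struct {
	next    http.RoundTripper
	observe func(d time.Duration, err error)
}

func (t timedTransport) RoundTrip(req *http.Request) (*http.Response, error) {
	start := time.Now()
	resp, err := t.next.RoundTrip(req)
	t.observe(time.Since(start), err)
	return resp, err
}

func main() {
	client := &http.Client{Transport: timedTransport{
		next: http.DefaultTransport,
		observe: func(d time.Duration, err error) {
			log.Printf("request took %s (err=%v)", d, err)
		},
	}}
	if _, err := client.Get("https://example.com"); err != nil {
		log.Print(err)
	}
}
```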
@@ -5,9 +5,10 @@ package bolt
 import (
 	"context"

-	"github.com/pomerium/pomerium/internal/kv"
-
 	bolt "go.etcd.io/bbolt"
+
+	"github.com/pomerium/pomerium/internal/kv"
+	"github.com/pomerium/pomerium/internal/telemetry/metrics"
 )

 var _ kv.Store = &Store{}
@@ -64,6 +65,7 @@ func New(o *Options) (*Store, error) {
 		return nil, err
 	}

+	metrics.AddBoltDBMetrics(db.Stats)
 	return &Store{db: db, bucket: o.Bucket}, nil
 }
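Note that AddBoltDBMetrics receives db.Stats as a method value, not a snapshot: the registry invokes it on every export, so readings stay current. A standalone sketch of the same lazy read (the database path and mode are illustrative; the pomerium metrics package itself is internal and cannot be imported from outside the module):

```go
package main

import (
	"fmt"
	"log"

	bolt "go.etcd.io/bbolt"
)

func main() {
	// Open (or create) a bbolt database file.
	db, err := bolt.Open("cache.db", 0600, nil)
	if err != nil {
		log.Fatal(err)
	}
	defer db.Close()

	// db.Stats is a method value of type func() bbolt.Stats, the same value
	// the commit passes to metrics.AddBoltDBMetrics(db.Stats). Calling it
	// re-reads the live counters, which is what each metrics export does.
	stats := db.Stats
	fmt.Println("open read transactions:", stats().OpenTxN)
}
```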
@@ -9,7 +9,9 @@ import (
 	"fmt"

 	"github.com/go-redis/redis/v7"

 	"github.com/pomerium/pomerium/internal/kv"
+	"github.com/pomerium/pomerium/internal/telemetry/metrics"
 )

 var _ kv.Store = &Store{}
@@ -56,6 +58,7 @@ func New(o *Options) (*Store, error) {
 		return nil, fmt.Errorf("kv/redis: error connecting to redis: %w", err)
 	}

+	metrics.AddRedisMetrics(db.PoolStats)
 	return &Store{db: db}, nil
 }
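The same pattern applies to redis: client.PoolStats is a func() *redis.PoolStats, so pool counters are sampled at export time rather than frozen at registration. A minimal go-redis v7 sketch with an illustrative server address:

```go
package main

import (
	"fmt"
	"log"

	"github.com/go-redis/redis/v7"
)

func main() {
	// Address is illustrative; v7 matches the import used by this commit.
	client := redis.NewClient(&redis.Options{Addr: "localhost:6379"})
	if err := client.Ping().Err(); err != nil {
		log.Fatal(err)
	}
	defer client.Close()

	// client.PoolStats is the func() *redis.PoolStats handed to
	// metrics.AddRedisMetrics, so counters are read on each export.
	stats := client.PoolStats
	fmt.Println("total pool connections:", stats().TotalConns)
}
```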
@@ -2,19 +2,14 @@ package metrics

 import (
 	"context"
-	"runtime"
-	"sync"
 	"time"

-	"github.com/pomerium/pomerium/internal/log"
-	"github.com/pomerium/pomerium/internal/version"
-
 	"go.opencensus.io/metric"
 	"go.opencensus.io/metric/metricdata"
 	"go.opencensus.io/metric/metricproducer"
 	"go.opencensus.io/stats"
 	"go.opencensus.io/stats/view"
 	"go.opencensus.io/tag"
+
+	"github.com/pomerium/pomerium/internal/log"
 )
@@ -30,7 +25,6 @@ var (
 		"config_last_reload_success",
 		"Returns 1 if last reload was successful",
 		"1")
-	registry = newMetricRegistry()

 	// ConfigLastReloadView contains the timestamp the configuration was last
 	// reloaded, labeled by service.
@@ -79,75 +73,6 @@ func SetConfigInfo(service string, success bool) {
 	}
 }

-// metricRegistry holds the non-view metrics and handles safe
-// initialization and updates. Behavior without using newMetricRegistry()
-// is undefined.
-type metricRegistry struct {
-	registry       *metric.Registry
-	buildInfo      *metric.Int64Gauge
-	policyCount    *metric.Int64DerivedGauge
-	configChecksum *metric.Float64Gauge
-	sync.Once
-}
-
-func newMetricRegistry() *metricRegistry {
-	r := new(metricRegistry)
-	r.init()
-	return r
-}
-
-func (r *metricRegistry) init() {
-	r.Do(
-		func() {
-			r.registry = metric.NewRegistry()
-			var err error
-			r.buildInfo, err = r.registry.AddInt64Gauge("build_info",
-				metric.WithDescription("Build Metadata"),
-				metric.WithLabelKeys("service", "version", "revision", "goversion"),
-			)
-			if err != nil {
-				log.Error().Err(err).Msg("telemetry/metrics: failed to register build info metric")
-			}
-
-			r.configChecksum, err = r.registry.AddFloat64Gauge("config_checksum_decimal",
-				metric.WithDescription("Config checksum represented in decimal notation"),
-				metric.WithLabelKeys("service"),
-			)
-			if err != nil {
-				log.Error().Err(err).Msg("telemetry/metrics: failed to register config checksum metric")
-			}
-
-			r.policyCount, err = r.registry.AddInt64DerivedGauge("policy_count_total",
-				metric.WithDescription("Total number of policies loaded"),
-				metric.WithLabelKeys("service"),
-			)
-			if err != nil {
-				log.Error().Err(err).Msg("telemetry/metrics: failed to register policy count metric")
-			}
-		})
-}
-
-// SetBuildInfo records the pomerium build info. You must call RegisterInfoMetrics to
-// have this exported
-func (r *metricRegistry) setBuildInfo(service string) {
-	if registry.buildInfo == nil {
-		return
-	}
-	m, err := registry.buildInfo.GetEntry(
-		metricdata.NewLabelValue(service),
-		metricdata.NewLabelValue(version.FullVersion()),
-		metricdata.NewLabelValue(version.GitCommit),
-		metricdata.NewLabelValue(runtime.Version()),
-	)
-	if err != nil {
-		log.Error().Err(err).Msg("telemetry/metrics: failed to get build info metric")
-	}
-
-	// This sets our build_info metric to a constant 1 per
-	// https://www.robustperception.io/exposing-the-software-version-to-prometheus
-	m.Set(1)
-}
-
 // SetBuildInfo records the pomerium build info. You must call RegisterInfoMetrics to
 // have this exported
 func SetBuildInfo(service string) {
@@ -159,33 +84,12 @@ func RegisterInfoMetrics() {
 	metricproducer.GlobalManager().AddProducer(registry.registry)
 }

-func (r *metricRegistry) setConfigChecksum(service string, checksum uint64) {
-	if r.configChecksum == nil {
-		return
-	}
-	m, err := r.configChecksum.GetEntry(metricdata.NewLabelValue(service))
-	if err != nil {
-		log.Error().Err(err).Msg("telemetry/metrics: failed to get config checksum metric")
-	}
-	m.Set(float64(checksum))
-}
-
 // SetConfigChecksum creates the configuration checksum metric. You must call RegisterInfoMetrics to
 // have this exported
 func SetConfigChecksum(service string, checksum uint64) {
 	registry.setConfigChecksum(service, checksum)
 }

-func (r *metricRegistry) addPolicyCountCallback(service string, f func() int64) {
-	if r.policyCount == nil {
-		return
-	}
-	err := r.policyCount.UpsertEntry(f, metricdata.NewLabelValue(service))
-	if err != nil {
-		log.Error().Err(err).Msg("telemetry/metrics: failed to get policy count metric")
-	}
-}
-
 // AddPolicyCountCallback sets the function to call when exporting the
 // policy count metric. You must call RegisterInfoMetrics to have this
 // exported
internal/telemetry/metrics/kv.go (new file, 107 lines)

@@ -0,0 +1,107 @@
package metrics

import (
	"github.com/go-redis/redis/v7"
	"github.com/golang/groupcache"
	"go.etcd.io/bbolt"
)

// AddGroupCacheMetrics registers a metrics handler against a *groupcache.Group
func AddGroupCacheMetrics(gc *groupcache.Group) {
	cumulativeMetrics := []struct {
		name string
		desc string
		f    func() int64
	}{
		{"groupcache_gets_total", "Total get requests, including from peers", gc.Stats.Gets.Get},
		{"groupcache_cache_hits_total", "Total cache hits in local or cluster cache", gc.Stats.CacheHits.Get},
		{"groupcache_peer_loads_total", "Total remote loads or cache hits without error", gc.Stats.PeerLoads.Get},
		{"groupcache_peer_errors_total", "Total errors from peers", gc.Stats.PeerErrors.Get},
		{"groupcache_loads_total", "Total gets without cache hits", gc.Stats.Loads.Get},
		{"groupcache_loads_deduped_total", "Gets without cache hits after duplicate suppression", gc.Stats.LoadsDeduped.Get},
		{"groupcache_local_loads_total", "Total good local loads", gc.Stats.LocalLoads.Get},
		{"groupcache_local_load_errs_total", "Total local load errors", gc.Stats.LocalLoadErrs.Get},
		{"groupcache_server_requests_total", "Total gets from peers", gc.Stats.ServerRequests.Get},
	}

	for _, m := range cumulativeMetrics {
		registry.addInt64DerivedCumulativeMetric(m.name, m.desc, "autocache", m.f)
	}
}

// AddBoltDBMetrics registers a metrics handler against a *bbolt.DB's Stats method
func AddBoltDBMetrics(stats func() bbolt.Stats) {
	gaugeMetrics := []struct {
		name string
		desc string
		f    func() int64
	}{
		{"boltdb_free_page_n", "Number of free pages on the freelist", func() int64 { return int64(stats().FreePageN) }},
		{"boltdb_pending_page_n", "Number of pending pages on the freelist", func() int64 { return int64(stats().PendingPageN) }},
		{"boltdb_free_alloc_size_bytes", "Bytes allocated in free pages", func() int64 { return int64(stats().FreeAlloc) }},
		{"boltdb_freelist_inuse_size_bytes", "Bytes used by the freelist", func() int64 { return int64(stats().FreelistInuse) }},
		{"boltdb_txn", "Total number of started read transactions", func() int64 { return int64(stats().TxN) }},
		{"boltdb_open_txn", "Number of currently open read transactions", func() int64 { return int64(stats().OpenTxN) }},
	}

	for _, m := range gaugeMetrics {
		registry.addInt64DerivedGaugeMetric(m.name, m.desc, "boltdb", m.f)
	}

	cumulativeMetrics := []struct {
		name string
		desc string
		f    func() int64
	}{
		{"boltdb_txn_page_total", "Total number of page allocations", func() int64 { return int64(stats().TxStats.PageCount) }},
		{"boltdb_txn_page_alloc_size_bytes_total", "Total bytes allocated", func() int64 { return int64(stats().TxStats.PageAlloc) }},
		{"boltdb_txn_cursor_total", "Total number of cursors created", func() int64 { return int64(stats().TxStats.CursorCount) }},
		{"boltdb_txn_node_total", "Total number of node allocations", func() int64 { return int64(stats().TxStats.NodeCount) }},
		{"boltdb_txn_node_deref_total", "Total number of node dereferences", func() int64 { return int64(stats().TxStats.NodeDeref) }},
		{"boltdb_txn_rebalance_total", "Total number of node rebalances", func() int64 { return int64(stats().TxStats.Rebalance) }},
		{"boltdb_txn_rebalance_duration_ms_total", "Total time spent rebalancing", func() int64 { return stats().TxStats.RebalanceTime.Milliseconds() }},
		{"boltdb_txn_split_total", "Total number of nodes split", func() int64 { return int64(stats().TxStats.Split) }},
		{"boltdb_txn_spill_total", "Total number of nodes spilled", func() int64 { return int64(stats().TxStats.Spill) }},
		{"boltdb_txn_spill_duration_ms_total", "Total time spent spilling", func() int64 { return stats().TxStats.SpillTime.Milliseconds() }},
		{"boltdb_txn_write_total", "Total number of writes performed", func() int64 { return int64(stats().TxStats.Write) }},
		{"boltdb_txn_write_duration_ms_total", "Total time spent writing to disk", func() int64 { return stats().TxStats.WriteTime.Milliseconds() }},
	}

	for _, m := range cumulativeMetrics {
		registry.addInt64DerivedCumulativeMetric(m.name, m.desc, "boltdb", m.f)
	}
}

// AddRedisMetrics registers a metrics handler against a redis Client's PoolStats() method
func AddRedisMetrics(stats func() *redis.PoolStats) {
	gaugeMetrics := []struct {
		name string
		desc string
		f    func() int64
	}{
		{"redis_conns", "Number of total connections in the pool", func() int64 { return int64(stats().TotalConns) }},
		{"redis_idle_conns", "Number of idle connections in the pool", func() int64 { return int64(stats().IdleConns) }},
	}

	for _, m := range gaugeMetrics {
		registry.addInt64DerivedGaugeMetric(m.name, m.desc, "redis", m.f)
	}

	cumulativeMetrics := []struct {
		name string
		desc string
		f    func() int64
	}{
		{"redis_hits_total", "Total number of times a free connection was found in the pool", func() int64 { return int64(stats().Hits) }},
		{"redis_misses_total", "Total number of times a free connection was NOT found in the pool", func() int64 { return int64(stats().Misses) }},
		{"redis_timeouts_total", "Total number of times a wait timeout occurred", func() int64 { return int64(stats().Timeouts) }},
		{"redis_stale_conns_total", "Total number of stale connections removed from the pool", func() int64 { return int64(stats().StaleConns) }},
	}

	for _, m := range cumulativeMetrics {
		registry.addInt64DerivedCumulativeMetric(m.name, m.desc, "redis", m.f)
	}
}
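For context on what feeds these callbacks: each field of groupcache's Stats is an AtomicInt whose Get method is exactly the func() int64 handed to the registry above. A small self-contained example of a group whose stats could be registered this way (group name, cache size, and getter are illustrative; recent golang/groupcache versions take a context.Context in the getter):

```go
package main

import (
	"context"
	"fmt"
	"log"

	"github.com/golang/groupcache"
)

func main() {
	// A toy group whose getter fabricates a value for any key.
	group := groupcache.NewGroup("demo", 64<<20, groupcache.GetterFunc(
		func(ctx context.Context, key string, dest groupcache.Sink) error {
			return dest.SetString("value-for-" + key)
		}))

	var out string
	if err := group.Get(context.Background(), "k1", groupcache.StringSink(&out)); err != nil {
		log.Fatal(err)
	}
	fmt.Println(out)

	// Stats fields are groupcache.AtomicInt values; their Get methods are
	// the int64 callbacks registered as derived cumulative metrics.
	fmt.Println("gets so far:", group.Stats.Gets.Get())
}
```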
internal/telemetry/metrics/kv_test.go (new file, 94 lines)

@@ -0,0 +1,94 @@
package metrics

import (
	"testing"
	"time"

	"github.com/go-redis/redis/v7"
	"github.com/golang/groupcache"
	"go.etcd.io/bbolt"
	"go.opencensus.io/metric/metricdata"
)

func Test_AddGroupCacheMetrics(t *testing.T) {
	t.Parallel()

	gc := &groupcache.Group{}
	AddGroupCacheMetrics(gc)

	tests := []struct {
		name string
		stat *groupcache.AtomicInt
		want int64
	}{
		{"groupcache_gets_total", &gc.Stats.Gets, 4},
		{"groupcache_loads_total", &gc.Stats.Loads, 42},
		{"groupcache_server_requests_total", &gc.Stats.ServerRequests, 8},
	}

	labelValues := []metricdata.LabelValue{
		metricdata.NewLabelValue("autocache"),
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			tt.stat.Add(tt.want)
			testMetricRetrieval(registry.registry.Read(), t, labelValues, tt.want, tt.name)
		})
	}
}

func Test_AddBoltDBMetrics(t *testing.T) {
	t.Parallel()

	tests := []struct {
		name string
		stat bbolt.Stats
		want int64
	}{
		{"boltdb_free_page_n", bbolt.Stats{FreePageN: 14}, 14},
		{"boltdb_txn", bbolt.Stats{TxN: 88}, 88},
		{"boltdb_txn_rebalance_duration_ms_total", bbolt.Stats{TxStats: bbolt.TxStats{RebalanceTime: 42 * time.Millisecond}}, 42},
		{"boltdb_txn_write_total", bbolt.Stats{TxStats: bbolt.TxStats{Write: 42}}, 42},
	}

	labelValues := []metricdata.LabelValue{
		metricdata.NewLabelValue("boltdb"),
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			AddBoltDBMetrics(func() bbolt.Stats { return tt.stat })
			testMetricRetrieval(registry.registry.Read(), t, labelValues, tt.want, tt.name)
		})
	}
}

func Test_AddRedisMetrics(t *testing.T) {
	t.Parallel()

	tests := []struct {
		name string
		stat *redis.PoolStats
		want int64
	}{
		{"redis_conns", &redis.PoolStats{TotalConns: 7}, 7},
		{"redis_hits_total", &redis.PoolStats{Hits: 78}, 78},
		{"redis_timeouts_total", &redis.PoolStats{Timeouts: 2}, 2},
	}

	labelValues := []metricdata.LabelValue{
		metricdata.NewLabelValue("redis"),
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			AddRedisMetrics(func() *redis.PoolStats { return tt.stat })
			testMetricRetrieval(registry.registry.Read(), t, labelValues, tt.want, tt.name)
		})
	}
}
internal/telemetry/metrics/registry.go (new file, 142 lines)

@@ -0,0 +1,142 @@
package metrics

import (
	"runtime"
	"sync"

	"go.opencensus.io/metric"
	"go.opencensus.io/metric/metricdata"

	"github.com/pomerium/pomerium/internal/log"
	"github.com/pomerium/pomerium/internal/version"
)

var registry = newMetricRegistry()

// metricRegistry holds the non-view metrics and handles safe
// initialization and updates. Behavior without using newMetricRegistry()
// is undefined.
type metricRegistry struct {
	registry       *metric.Registry
	buildInfo      *metric.Int64Gauge
	policyCount    *metric.Int64DerivedGauge
	configChecksum *metric.Float64Gauge
	sync.Once
}

func newMetricRegistry() *metricRegistry {
	r := new(metricRegistry)
	r.init()
	return r
}

func (r *metricRegistry) init() {
	r.Do(
		func() {
			r.registry = metric.NewRegistry()
			var err error
			r.buildInfo, err = r.registry.AddInt64Gauge("build_info",
				metric.WithDescription("Build Metadata"),
				metric.WithLabelKeys("service", "version", "revision", "goversion"),
			)
			if err != nil {
				log.Error().Err(err).Msg("telemetry/metrics: failed to register build info metric")
			}

			r.configChecksum, err = r.registry.AddFloat64Gauge("config_checksum_decimal",
				metric.WithDescription("Config checksum represented in decimal notation"),
				metric.WithLabelKeys("service"),
			)
			if err != nil {
				log.Error().Err(err).Msg("telemetry/metrics: failed to register config checksum metric")
			}

			r.policyCount, err = r.registry.AddInt64DerivedGauge("policy_count_total",
				metric.WithDescription("Total number of policies loaded"),
				metric.WithLabelKeys("service"),
			)
			if err != nil {
				log.Error().Err(err).Msg("telemetry/metrics: failed to register policy count metric")
			}
		})
}

// setBuildInfo records the pomerium build info. You must call RegisterInfoMetrics
// to have this exported.
func (r *metricRegistry) setBuildInfo(service string) {
	if r.buildInfo == nil {
		return
	}
	m, err := r.buildInfo.GetEntry(
		metricdata.NewLabelValue(service),
		metricdata.NewLabelValue(version.FullVersion()),
		metricdata.NewLabelValue(version.GitCommit),
		metricdata.NewLabelValue(runtime.Version()),
	)
	if err != nil {
		log.Error().Err(err).Msg("telemetry/metrics: failed to get build info metric")
	}

	// This sets our build_info metric to a constant 1 per
	// https://www.robustperception.io/exposing-the-software-version-to-prometheus
	m.Set(1)
}

func (r *metricRegistry) addPolicyCountCallback(service string, f func() int64) {
	if r.policyCount == nil {
		return
	}
	err := r.policyCount.UpsertEntry(f, metricdata.NewLabelValue(service))
	if err != nil {
		log.Error().Err(err).Msg("telemetry/metrics: failed to get policy count metric")
	}
}

func (r *metricRegistry) setConfigChecksum(service string, checksum uint64) {
	if r.configChecksum == nil {
		return
	}
	m, err := r.configChecksum.GetEntry(metricdata.NewLabelValue(service))
	if err != nil {
		log.Error().Err(err).Msg("telemetry/metrics: failed to get config checksum metric")
	}
	m.Set(float64(checksum))
}

func (r *metricRegistry) addInt64DerivedGaugeMetric(name, desc, service string, f func() int64) {
	m, err := r.registry.AddInt64DerivedGauge(name, metric.WithDescription(desc), metric.WithLabelKeys("service"))
	if err != nil {
		log.Error().Err(err).Str("service", service).Msg("telemetry/metrics: failed to register metric")
		return
	}

	err = m.UpsertEntry(f, metricdata.NewLabelValue(service))
	if err != nil {
		log.Error().Err(err).Str("service", service).Msg("telemetry/metrics: failed to update metric")
	}
}

func (r *metricRegistry) addInt64DerivedCumulativeMetric(name, desc, service string, f func() int64) {
	m, err := r.registry.AddInt64DerivedCumulative(name, metric.WithDescription(desc), metric.WithLabelKeys("service"))
	if err != nil {
		log.Error().Err(err).Str("service", service).Msg("telemetry/metrics: failed to register metric")
		return
	}

	err = m.UpsertEntry(f, metricdata.NewLabelValue(service))
	if err != nil {
		log.Error().Err(err).Str("service", service).Msg("telemetry/metrics: failed to update metric")
	}
}
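Underneath these helpers is the OpenCensus derived-metric API: a derived gauge or cumulative stores a callback and samples it whenever the registry is read (in pomerium, by the Prometheus exporter via metricproducer.GlobalManager()). A stripped-down illustration of that flow, using only calls that appear in this commit (the metric name and label are illustrative):

```go
package main

import (
	"fmt"

	"go.opencensus.io/metric"
	"go.opencensus.io/metric/metricdata"
)

func main() {
	r := metric.NewRegistry()

	// Same call the registry wrapper makes for each storage counter.
	c, err := r.AddInt64DerivedCumulative("demo_total",
		metric.WithDescription("demo counter"),
		metric.WithLabelKeys("service"),
	)
	if err != nil {
		panic(err)
	}

	// The callback is stored, not evaluated: its value is sampled each
	// time the registry is read.
	var n int64
	if err := c.UpsertEntry(func() int64 { return n }, metricdata.NewLabelValue("demo")); err != nil {
		panic(err)
	}

	n = 42
	for _, m := range r.Read() {
		fmt.Println(m.Descriptor.Name, m.TimeSeries[0].Points[0].Value)
	}
}
```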