mirror of
https://github.com/pomerium/pomerium.git
synced 2025-08-06 02:09:15 +02:00
Add storage metrics (#554)
* Add cache storage metrics - autocache client metrics - autocache server metrics - boltdb metrics - redis client metrics - refactor metrics registry to be general purpose
This commit is contained in:
parent
acfc880421
commit
cc504362e4
8 changed files with 400 additions and 108 deletions
|
@ -325,7 +325,35 @@ Expose a prometheus format HTTP endpoint on the specified port. Disabled by defa
|
||||||
**Metrics tracked**
|
**Metrics tracked**
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| :-------------------------------------------- | :-------- | :---------------------------------------------------------------------- |
|
| --------------------------------------------- | --------- | ----------------------------------------------------------------------- |
|
||||||
|
| boltdb_free_alloc_size_bytes | Gauge | Bytes allocated in free pages |
|
||||||
|
| boltdb_free_page_n | Gauge | Number of free pages on the freelist |
|
||||||
|
| boltdb_freelist_inuse_size_bytes | Gauge | Bytes used by the freelist |
|
||||||
|
| boltdb_open_txn | Gauge | number of currently open read transactions |
|
||||||
|
| boltdb_pending_page_n | Gauge | Number of pending pages on the freelist |
|
||||||
|
| boltdb_txn | Gauge | total number of started read transactions |
|
||||||
|
| boltdb_txn_cursor_total | Counter | Total number of cursors created |
|
||||||
|
| boltdb_txn_node_deref_total | Counter | Total number of node dereferences |
|
||||||
|
| boltdb_txn_node_total | Counter | Total number of node allocations |
|
||||||
|
| boltdb_txn_page_alloc_size_bytes_total | Counter | Total bytes allocated |
|
||||||
|
| boltdb_txn_page_total | Counter | Total number of page allocations |
|
||||||
|
| boltdb_txn_rebalance_duration_ms_total | Counter | Total time spent rebalancing |
|
||||||
|
| boltdb_txn_rebalance_total | Counter | Total number of node rebalances |
|
||||||
|
| boltdb_txn_spill_duration_ms_total | Counter | Total time spent spilling |
|
||||||
|
| boltdb_txn_spill_total | Counter | Total number of nodes spilled |
|
||||||
|
| boltdb_txn_split_total | Counter | Total number of nodes split |
|
||||||
|
| boltdb_txn_write_duration_ms_total | Counter | Total time spent writing to disk |
|
||||||
|
| boltdb_txn_write_total | Counter | Total number of writes performed |
|
||||||
|
| groupcache_cache_hits_total | Counter | Total cache hits in local or cluster cache |
|
||||||
|
| groupcache_cache_hits_total | Counter | Total cache hits in local or cluster cache |
|
||||||
|
| groupcache_gets_total                         | Counter   | Total get requests, including from peers                                 |
|
||||||
|
| groupcache_loads_deduped_total | Counter | gets without cache hits after duplicate suppression |
|
||||||
|
| groupcache_loads_total | Counter | Total gets without cache hits |
|
||||||
|
| groupcache_local_load_errs_total | Counter | Total local load errors |
|
||||||
|
| groupcache_local_loads_total | Counter | Total good local loads |
|
||||||
|
| groupcache_peer_errors_total | Counter | Total errors from peers |
|
||||||
|
| groupcache_peer_loads_total | Counter | Total remote loads or cache hits without error |
|
||||||
|
| groupcache_server_requests_total | Counter | Total gets from peers |
|
||||||
| grpc_client_request_duration_ms | Histogram | GRPC client request duration by service |
|
| grpc_client_request_duration_ms | Histogram | GRPC client request duration by service |
|
||||||
| grpc_client_request_size_bytes | Histogram | GRPC client request size by service |
|
| grpc_client_request_size_bytes | Histogram | GRPC client request size by service |
|
||||||
| grpc_client_requests_total | Counter | Total GRPC client requests made by service |
|
| grpc_client_requests_total | Counter | Total GRPC client requests made by service |
|
||||||
|
@ -342,10 +370,17 @@ Expose a prometheus format HTTP endpoint on the specified port. Disabled by defa
|
||||||
| http_server_request_size_bytes | Histogram | HTTP server request size by service |
|
| http_server_request_size_bytes | Histogram | HTTP server request size by service |
|
||||||
| http_server_requests_total | Counter | Total HTTP server requests handled by service |
|
| http_server_requests_total | Counter | Total HTTP server requests handled by service |
|
||||||
| http_server_response_size_bytes | Histogram | HTTP server response size by service |
|
| http_server_response_size_bytes | Histogram | HTTP server response size by service |
|
||||||
|
| pomerium_build_info | Gauge | Pomerium build metadata by git revision, service, version and goversion |
|
||||||
| pomerium_config_checksum_int64 | Gauge | Currently loaded configuration checksum by service |
|
| pomerium_config_checksum_int64 | Gauge | Currently loaded configuration checksum by service |
|
||||||
| pomerium_config_last_reload_success | Gauge | Whether the last configuration reload succeeded by service |
|
| pomerium_config_last_reload_success | Gauge | Whether the last configuration reload succeeded by service |
|
||||||
| pomerium_config_last_reload_success_timestamp | Gauge | The timestamp of the last successful configuration reload by service |
|
| pomerium_config_last_reload_success_timestamp | Gauge | The timestamp of the last successful configuration reload by service |
|
||||||
| pomerium_build_info | Gauge | Pomerium build metadata by git revision, service, version and goversion |
|
| redis_conns | Gauge | Number of total connections in the pool |
|
||||||
|
| redis_hits_total | Counter | Total number of times free connection was found in the pool |
|
||||||
|
| redis_idle_conns | Gauge | Number of idle connections in the pool |
|
||||||
|
| redis_misses_total | Counter | Total number of times free connection was NOT found in the pool |
|
||||||
|
| redis_stale_conns_total | Counter | Total number of stale connections removed from the pool |
|
||||||
|
| redis_timeouts_total | Counter | Total number of times a wait timeout occurred |
|
||||||
|
|
||||||
|
|
||||||
### Tracing
|
### Tracing
|
||||||
|
|
||||||
|
@ -357,8 +392,8 @@ Each unit work is called a Span in a trace. Spans include metadata about the wor
|
||||||
|
|
||||||
| Config Key | Description | Required |
|
| Config Key | Description | Required |
|
||||||
| :--------------- | :---------------------------------------------------------------- | -------- |
|
| :--------------- | :---------------------------------------------------------------- | -------- |
|
||||||
| tracing_provider | The name of the tracing provider. (e.g. jaeger) | ✅ |
|
| tracing_provider | The name of the tracing provider. (e.g. jaeger) | ✅ |
|
||||||
| tracing_debug | Will disable [sampling](https://opencensus.io/tracing/sampling/). | ❌ |
|
| tracing_debug | Will disable [sampling](https://opencensus.io/tracing/sampling/). | ❌ |
|
||||||
|
|
||||||
#### Jaeger
|
#### Jaeger
|
||||||
|
|
||||||
|
@ -372,8 +407,8 @@ Each unit work is called a Span in a trace. Spans include metadata about the wor
|
||||||
|
|
||||||
| Config Key | Description | Required |
|
| Config Key | Description | Required |
|
||||||
| :-------------------------------- | :------------------------------------------ | -------- |
|
| :-------------------------------- | :------------------------------------------ | -------- |
|
||||||
| tracing_jaeger_collector_endpoint | Url to the Jaeger HTTP Thrift collector. | ✅ |
|
| tracing_jaeger_collector_endpoint | Url to the Jaeger HTTP Thrift collector. | ✅ |
|
||||||
| tracing_jaeger_agent_endpoint | Send spans to jaeger-agent at this address. | ✅ |
|
| tracing_jaeger_agent_endpoint | Send spans to jaeger-agent at this address. | ✅ |
|
||||||
|
|
||||||
#### Example
|
#### Example
|
||||||
|
|
||||||
|
|
|
@ -12,9 +12,11 @@ import (
|
||||||
"sync"
|
"sync"
|
||||||
|
|
||||||
"github.com/golang/groupcache"
|
"github.com/golang/groupcache"
|
||||||
|
|
||||||
"github.com/pomerium/autocache"
|
"github.com/pomerium/autocache"
|
||||||
"github.com/pomerium/pomerium/internal/httputil"
|
"github.com/pomerium/pomerium/internal/httputil"
|
||||||
"github.com/pomerium/pomerium/internal/kv"
|
"github.com/pomerium/pomerium/internal/kv"
|
||||||
|
"github.com/pomerium/pomerium/internal/telemetry/metrics"
|
||||||
"github.com/pomerium/pomerium/internal/urlutil"
|
"github.com/pomerium/pomerium/internal/urlutil"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@ -116,13 +118,14 @@ func New(o *Options) (*Store, error) {
|
||||||
}
|
}
|
||||||
serverOpts := &httputil.ServerOptions{Addr: o.Addr}
|
serverOpts := &httputil.ServerOptions{Addr: o.Addr}
|
||||||
var wg sync.WaitGroup
|
var wg sync.WaitGroup
|
||||||
s.srv, err = httputil.NewServer(serverOpts, QueryParamToCtx(s.cluster), &wg)
|
s.srv, err = httputil.NewServer(serverOpts, metrics.HTTPMetricsHandler("groupcache")(QueryParamToCtx(s.cluster)), &wg)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
if _, err := s.cluster.Join([]string{o.ClusterDomain}); err != nil {
|
if _, err := s.cluster.Join([]string{o.ClusterDomain}); err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
|
metrics.AddGroupCacheMetrics(s.db)
|
||||||
return &s, nil
|
return &s, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -191,7 +194,9 @@ func (s signedSession) RoundTrip(req *http.Request) (*http.Response, error) {
|
||||||
q.Set(defaultQueryParamKey, session)
|
q.Set(defaultQueryParamKey, session)
|
||||||
newReqURL.RawQuery = q.Encode()
|
newReqURL.RawQuery = q.Encode()
|
||||||
newReq.URL = urlutil.NewSignedURL(s.sharedKey, &newReqURL).Sign()
|
newReq.URL = urlutil.NewSignedURL(s.sharedKey, &newReqURL).Sign()
|
||||||
return http.DefaultTransport.RoundTrip(newReq)
|
|
||||||
|
tripper := metrics.HTTPMetricsRoundTripper("cache", "groupcache")(http.DefaultTransport)
|
||||||
|
return tripper.RoundTrip(newReq)
|
||||||
}
|
}
|
||||||
|
|
||||||
// QueryParamToCtx takes a value from a query param and adds it to the
|
// QueryParamToCtx takes a value from a query param and adds it to the
|
||||||
|
|
|
@ -5,9 +5,10 @@ package bolt
|
||||||
import (
|
import (
|
||||||
"context"
|
"context"
|
||||||
|
|
||||||
"github.com/pomerium/pomerium/internal/kv"
|
|
||||||
|
|
||||||
bolt "go.etcd.io/bbolt"
|
bolt "go.etcd.io/bbolt"
|
||||||
|
|
||||||
|
"github.com/pomerium/pomerium/internal/kv"
|
||||||
|
"github.com/pomerium/pomerium/internal/telemetry/metrics"
|
||||||
)
|
)
|
||||||
|
|
||||||
var _ kv.Store = &Store{}
|
var _ kv.Store = &Store{}
|
||||||
|
@ -64,6 +65,7 @@ func New(o *Options) (*Store, error) {
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
|
|
||||||
|
metrics.AddBoltDBMetrics(db.Stats)
|
||||||
return &Store{db: db, bucket: o.Bucket}, nil
|
return &Store{db: db, bucket: o.Bucket}, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -9,7 +9,9 @@ import (
|
||||||
"fmt"
|
"fmt"
|
||||||
|
|
||||||
"github.com/go-redis/redis/v7"
|
"github.com/go-redis/redis/v7"
|
||||||
|
|
||||||
"github.com/pomerium/pomerium/internal/kv"
|
"github.com/pomerium/pomerium/internal/kv"
|
||||||
|
"github.com/pomerium/pomerium/internal/telemetry/metrics"
|
||||||
)
|
)
|
||||||
|
|
||||||
var _ kv.Store = &Store{}
|
var _ kv.Store = &Store{}
|
||||||
|
@ -56,6 +58,7 @@ func New(o *Options) (*Store, error) {
|
||||||
return nil, fmt.Errorf("kv/redis: error connecting to redis: %w", err)
|
return nil, fmt.Errorf("kv/redis: error connecting to redis: %w", err)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
metrics.AddRedisMetrics(db.PoolStats)
|
||||||
return &Store{db: db}, nil
|
return &Store{db: db}, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -2,19 +2,14 @@ package metrics
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"context"
|
"context"
|
||||||
"runtime"
|
|
||||||
"sync"
|
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
"github.com/pomerium/pomerium/internal/log"
|
|
||||||
"github.com/pomerium/pomerium/internal/version"
|
|
||||||
|
|
||||||
"go.opencensus.io/metric"
|
|
||||||
"go.opencensus.io/metric/metricdata"
|
|
||||||
"go.opencensus.io/metric/metricproducer"
|
"go.opencensus.io/metric/metricproducer"
|
||||||
"go.opencensus.io/stats"
|
"go.opencensus.io/stats"
|
||||||
"go.opencensus.io/stats/view"
|
"go.opencensus.io/stats/view"
|
||||||
"go.opencensus.io/tag"
|
"go.opencensus.io/tag"
|
||||||
|
|
||||||
|
"github.com/pomerium/pomerium/internal/log"
|
||||||
)
|
)
|
||||||
|
|
||||||
var (
|
var (
|
||||||
|
@ -30,7 +25,6 @@ var (
|
||||||
"config_last_reload_success",
|
"config_last_reload_success",
|
||||||
"Returns 1 if last reload was successful",
|
"Returns 1 if last reload was successful",
|
||||||
"1")
|
"1")
|
||||||
registry = newMetricRegistry()
|
|
||||||
|
|
||||||
// ConfigLastReloadView contains the timestamp the configuration was last
|
// ConfigLastReloadView contains the timestamp the configuration was last
|
||||||
// reloaded, labeled by service.
|
// reloaded, labeled by service.
|
||||||
|
@ -79,75 +73,6 @@ func SetConfigInfo(service string, success bool) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// metricRegistry holds the non-view metrics and handles safe
|
|
||||||
// initialization and updates. Behavior without using newMetricRegistry()
|
|
||||||
// is undefined.
|
|
||||||
type metricRegistry struct {
|
|
||||||
registry *metric.Registry
|
|
||||||
buildInfo *metric.Int64Gauge
|
|
||||||
policyCount *metric.Int64DerivedGauge
|
|
||||||
configChecksum *metric.Float64Gauge
|
|
||||||
sync.Once
|
|
||||||
}
|
|
||||||
|
|
||||||
func newMetricRegistry() *metricRegistry {
|
|
||||||
r := new(metricRegistry)
|
|
||||||
r.init()
|
|
||||||
return r
|
|
||||||
}
|
|
||||||
|
|
||||||
func (r *metricRegistry) init() {
|
|
||||||
r.Do(
|
|
||||||
func() {
|
|
||||||
r.registry = metric.NewRegistry()
|
|
||||||
var err error
|
|
||||||
r.buildInfo, err = r.registry.AddInt64Gauge("build_info",
|
|
||||||
metric.WithDescription("Build Metadata"),
|
|
||||||
metric.WithLabelKeys("service", "version", "revision", "goversion"),
|
|
||||||
)
|
|
||||||
if err != nil {
|
|
||||||
log.Error().Err(err).Msg("telemetry/metrics: failed to register build info metric")
|
|
||||||
}
|
|
||||||
|
|
||||||
r.configChecksum, err = r.registry.AddFloat64Gauge("config_checksum_decimal",
|
|
||||||
metric.WithDescription("Config checksum represented in decimal notation"),
|
|
||||||
metric.WithLabelKeys("service"),
|
|
||||||
)
|
|
||||||
if err != nil {
|
|
||||||
log.Error().Err(err).Msg("telemetry/metrics: failed to register config checksum metric")
|
|
||||||
}
|
|
||||||
|
|
||||||
r.policyCount, err = r.registry.AddInt64DerivedGauge("policy_count_total",
|
|
||||||
metric.WithDescription("Total number of policies loaded"),
|
|
||||||
metric.WithLabelKeys("service"),
|
|
||||||
)
|
|
||||||
if err != nil {
|
|
||||||
log.Error().Err(err).Msg("telemetry/metrics: failed to register policy count metric")
|
|
||||||
}
|
|
||||||
})
|
|
||||||
}
|
|
||||||
|
|
||||||
// SetBuildInfo records the pomerium build info. You must call RegisterInfoMetrics to
|
|
||||||
// have this exported
|
|
||||||
func (r *metricRegistry) setBuildInfo(service string) {
|
|
||||||
if registry.buildInfo == nil {
|
|
||||||
return
|
|
||||||
}
|
|
||||||
m, err := registry.buildInfo.GetEntry(
|
|
||||||
metricdata.NewLabelValue(service),
|
|
||||||
metricdata.NewLabelValue(version.FullVersion()),
|
|
||||||
metricdata.NewLabelValue(version.GitCommit),
|
|
||||||
metricdata.NewLabelValue((runtime.Version())),
|
|
||||||
)
|
|
||||||
if err != nil {
|
|
||||||
log.Error().Err(err).Msg("telemetry/metrics: failed to get build info metric")
|
|
||||||
}
|
|
||||||
|
|
||||||
// This sets our build_info metric to a constant 1 per
|
|
||||||
// https://www.robustperception.io/exposing-the-software-version-to-prometheus
|
|
||||||
m.Set(1)
|
|
||||||
}
|
|
||||||
|
|
||||||
// SetBuildInfo records the pomerium build info. You must call RegisterInfoMetrics to
|
// SetBuildInfo records the pomerium build info. You must call RegisterInfoMetrics to
|
||||||
// have this exported
|
// have this exported
|
||||||
func SetBuildInfo(service string) {
|
func SetBuildInfo(service string) {
|
||||||
|
@ -159,33 +84,12 @@ func RegisterInfoMetrics() {
|
||||||
metricproducer.GlobalManager().AddProducer(registry.registry)
|
metricproducer.GlobalManager().AddProducer(registry.registry)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (r *metricRegistry) setConfigChecksum(service string, checksum uint64) {
|
|
||||||
if r.configChecksum == nil {
|
|
||||||
return
|
|
||||||
}
|
|
||||||
m, err := r.configChecksum.GetEntry(metricdata.NewLabelValue(service))
|
|
||||||
if err != nil {
|
|
||||||
log.Error().Err(err).Msg("telemetry/metrics: failed to get config checksum metric")
|
|
||||||
}
|
|
||||||
m.Set(float64(checksum))
|
|
||||||
}
|
|
||||||
|
|
||||||
// SetConfigChecksum creates the configuration checksum metric. You must call RegisterInfoMetrics to
|
// SetConfigChecksum creates the configuration checksum metric. You must call RegisterInfoMetrics to
|
||||||
// have this exported
|
// have this exported
|
||||||
func SetConfigChecksum(service string, checksum uint64) {
|
func SetConfigChecksum(service string, checksum uint64) {
|
||||||
registry.setConfigChecksum(service, checksum)
|
registry.setConfigChecksum(service, checksum)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (r *metricRegistry) addPolicyCountCallback(service string, f func() int64) {
|
|
||||||
if r.policyCount == nil {
|
|
||||||
return
|
|
||||||
}
|
|
||||||
err := r.policyCount.UpsertEntry(f, metricdata.NewLabelValue(service))
|
|
||||||
if err != nil {
|
|
||||||
log.Error().Err(err).Msg("telemetry/metrics: failed to get policy count metric")
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// AddPolicyCountCallback sets the function to call when exporting the
|
// AddPolicyCountCallback sets the function to call when exporting the
|
||||||
// policy count metric. You must call RegisterInfoMetrics to have this
|
// policy count metric. You must call RegisterInfoMetrics to have this
|
||||||
// exported
|
// exported
|
||||||
|
|
107
internal/telemetry/metrics/kv.go
Normal file
107
internal/telemetry/metrics/kv.go
Normal file
|
@ -0,0 +1,107 @@
|
||||||
|
package metrics
|
||||||
|
|
||||||
|
import (
|
||||||
|
"github.com/go-redis/redis/v7"
|
||||||
|
"github.com/golang/groupcache"
|
||||||
|
"go.etcd.io/bbolt"
|
||||||
|
)
|
||||||
|
|
||||||
|
// AddGroupCacheMetrics registers a metrics handler against a *groupcache.Group
|
||||||
|
func AddGroupCacheMetrics(gc *groupcache.Group) {
|
||||||
|
|
||||||
|
cumulativeMetrics := []struct {
|
||||||
|
name string
|
||||||
|
desc string
|
||||||
|
f func() int64
|
||||||
|
}{
|
||||||
|
{"groupcache_gets_total", "Total get request, including from peers", gc.Stats.Gets.Get},
|
||||||
|
{"groupcache_cache_hits_total", "Total cache hits in local or cluster cache", gc.Stats.CacheHits.Get},
|
||||||
|
{"groupcache_cache_hits_total", "Total cache hits in local or cluster cache", gc.Stats.CacheHits.Get},
|
||||||
|
{"groupcache_peer_loads_total", "Total remote loads or cache hits without error", gc.Stats.PeerLoads.Get},
|
||||||
|
{"groupcache_peer_errors_total", "Total errors from peers", gc.Stats.PeerErrors.Get},
|
||||||
|
{"groupcache_loads_total", "Total gets without cache hits", gc.Stats.Loads.Get},
|
||||||
|
{"groupcache_loads_deduped_total", "gets without cache hits after duplicate suppression", gc.Stats.LoadsDeduped.Get},
|
||||||
|
{"groupcache_local_loads_total", "Total good local loads", gc.Stats.LocalLoads.Get},
|
||||||
|
{"groupcache_local_load_errs_total", "Total local load errors", gc.Stats.LocalLoadErrs.Get},
|
||||||
|
{"groupcache_server_requests_total", "Total gets from peers", gc.Stats.ServerRequests.Get},
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, m := range cumulativeMetrics {
|
||||||
|
registry.addInt64DerivedCumulativeMetric(m.name, m.desc, "autocache", m.f)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// AddBoltDBMetrics registers a metrics handler against a *bbolt.DB
|
||||||
|
func AddBoltDBMetrics(stats func() bbolt.Stats) {
|
||||||
|
gaugeMetrics := []struct {
|
||||||
|
name string
|
||||||
|
desc string
|
||||||
|
f func() int64
|
||||||
|
}{
|
||||||
|
{"boltdb_free_page_n", "Number of free pages on the freelist", func() int64 { return int64(stats().FreePageN) }},
|
||||||
|
{"boltdb_pending_page_n", "Number of pending pages on the freelist", func() int64 { return int64(stats().PendingPageN) }},
|
||||||
|
{"boltdb_free_alloc_size_bytes", "Bytes allocated in free pages", func() int64 { return int64(stats().FreeAlloc) }},
|
||||||
|
{"boltdb_freelist_inuse_size_bytes", "Bytes used by the freelist", func() int64 { return int64(stats().FreelistInuse) }},
|
||||||
|
{"boltdb_txn", "total number of started read transactions", func() int64 { return int64(stats().TxN) }},
|
||||||
|
{"boltdb_open_txn", "number of currently open read transactions", func() int64 { return int64(stats().OpenTxN) }},
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, m := range gaugeMetrics {
|
||||||
|
registry.addInt64DerivedGaugeMetric(m.name, m.desc, "boltdb", m.f)
|
||||||
|
}
|
||||||
|
|
||||||
|
cumulativeMetrics := []struct {
|
||||||
|
name string
|
||||||
|
desc string
|
||||||
|
f func() int64
|
||||||
|
}{
|
||||||
|
{"boltdb_txn_page_total", "Total number of page allocations", func() int64 { return int64(stats().TxStats.PageCount) }},
|
||||||
|
{"boltdb_txn_page_alloc_size_bytes_total", "Total bytes allocated", func() int64 { return int64(stats().TxStats.PageAlloc) }},
|
||||||
|
{"boltdb_txn_cursor_total", "Total number of cursors created", func() int64 { return int64(stats().TxStats.CursorCount) }},
|
||||||
|
{"boltdb_txn_node_total", "Total number of node allocations", func() int64 { return int64(stats().TxStats.NodeCount) }},
|
||||||
|
{"boltdb_txn_node_deref_total", "Total number of node dereferences", func() int64 { return int64(stats().TxStats.NodeDeref) }},
|
||||||
|
{"boltdb_txn_rebalance_total", "Total number of node rebalances", func() int64 { return int64(stats().TxStats.Rebalance) }},
|
||||||
|
{"boltdb_txn_rebalance_duration_ms_total", "Total time spent rebalancing", func() int64 { return stats().TxStats.RebalanceTime.Milliseconds() }},
|
||||||
|
{"boltdb_txn_split_total", "Total number of nodes split", func() int64 { return int64(stats().TxStats.Split) }},
|
||||||
|
{"boltdb_txn_spill_total", "Total number of nodes spilled", func() int64 { return int64(stats().TxStats.Spill) }},
|
||||||
|
{"boltdb_txn_spill_duration_ms_total", "Total time spent spilling", func() int64 { return stats().TxStats.SpillTime.Milliseconds() }},
|
||||||
|
{"boltdb_txn_write_total", "Total number of writes performed", func() int64 { return int64(stats().TxStats.Write) }},
|
||||||
|
{"boltdb_txn_write_duration_ms_total", "Total time spent writing to disk", func() int64 { return stats().TxStats.WriteTime.Milliseconds() }},
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, m := range cumulativeMetrics {
|
||||||
|
registry.addInt64DerivedCumulativeMetric(m.name, m.desc, "boltdb", m.f)
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
// AddRedisMetrics registers a metrics handler against a redis Client's PoolStats() method
|
||||||
|
func AddRedisMetrics(stats func() *redis.PoolStats) {
|
||||||
|
gaugeMetrics := []struct {
|
||||||
|
name string
|
||||||
|
desc string
|
||||||
|
f func() int64
|
||||||
|
}{
|
||||||
|
{"redis_conns", "Number of total connections in the pool", func() int64 { return int64(stats().TotalConns) }},
|
||||||
|
{"redis_idle_conns", "Number of idle connections in the pool", func() int64 { return int64(stats().IdleConns) }},
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, m := range gaugeMetrics {
|
||||||
|
registry.addInt64DerivedGaugeMetric(m.name, m.desc, "redis", m.f)
|
||||||
|
}
|
||||||
|
|
||||||
|
cumulativeMetrics := []struct {
|
||||||
|
name string
|
||||||
|
desc string
|
||||||
|
f func() int64
|
||||||
|
}{
|
||||||
|
{"redis_hits_total", "Total number of times free connection was found in the pool", func() int64 { return int64(stats().Hits) }},
|
||||||
|
{"redis_misses_total", "Total number of times free connection was NOT found in the pool", func() int64 { return int64(stats().Misses) }},
|
||||||
|
{"redis_timeouts_total", "Total number of times a wait timeout occurred", func() int64 { return int64(stats().Timeouts) }},
|
||||||
|
{"redis_stale_conns_total", "Total number of stale connections removed from the pool", func() int64 { return int64(stats().StaleConns) }},
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, m := range cumulativeMetrics {
|
||||||
|
registry.addInt64DerivedCumulativeMetric(m.name, m.desc, "redis", m.f)
|
||||||
|
}
|
||||||
|
}
|
94
internal/telemetry/metrics/kv_test.go
Normal file
94
internal/telemetry/metrics/kv_test.go
Normal file
|
@ -0,0 +1,94 @@
|
||||||
|
package metrics
|
||||||
|
|
||||||
|
import (
|
||||||
|
"testing"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"github.com/go-redis/redis/v7"
|
||||||
|
"github.com/golang/groupcache"
|
||||||
|
"go.etcd.io/bbolt"
|
||||||
|
"go.opencensus.io/metric/metricdata"
|
||||||
|
)
|
||||||
|
|
||||||
|
// Test_AddGroupCacheMetrics verifies that groupcache stats registered via
// AddGroupCacheMetrics surface through the package-level metric registry with
// the expected names, the "autocache" service label, and the expected values.
func Test_AddGroupCacheMetrics(t *testing.T) {
	t.Parallel()

	gc := &groupcache.Group{}
	AddGroupCacheMetrics(gc)

	tests := []struct {
		name string
		stat *groupcache.AtomicInt
		want int64
	}{
		{"groupcache_gets_total", &gc.Stats.Gets, 4},
		{"groupcache_loads_total", &gc.Stats.Loads, 42},
		{"groupcache_server_requests_total", &gc.Stats.ServerRequests, 8},
	}

	labelValues := []metricdata.LabelValue{
		metricdata.NewLabelValue("autocache"),
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			// Bump the underlying stat, then confirm the derived metric
			// reports the new value when the registry is read.
			// NOTE(review): registry is package-global and shared with the
			// other parallel tests in this file; names don't collide, but
			// confirm this stays clean under `go test -race`.
			tt.stat.Add(tt.want)
			testMetricRetrieval(registry.registry.Read(), t, labelValues, tt.want, tt.name)
		})
	}

}
|
||||||
|
|
||||||
|
// Test_AddBoltDBMetrics verifies that boltdb stats registered via
// AddBoltDBMetrics are exported with the expected names, the "boltdb" service
// label, and values derived from the supplied bbolt.Stats snapshot.
func Test_AddBoltDBMetrics(t *testing.T) {
	t.Parallel()

	tests := []struct {
		name string
		stat bbolt.Stats
		want int64
	}{
		{"boltdb_free_page_n", bbolt.Stats{FreePageN: 14}, 14},
		{"boltdb_txn", bbolt.Stats{TxN: 88}, 88},

		// Duration-based metric: 42ms is expected to be reported as 42.
		{"boltdb_txn_rebalance_duration_ms_total", bbolt.Stats{TxStats: bbolt.TxStats{RebalanceTime: 42 * time.Millisecond}}, 42},
		{"boltdb_txn_write_total", bbolt.Stats{TxStats: bbolt.TxStats{Write: 42}}, 42},
	}

	labelValues := []metricdata.LabelValue{
		metricdata.NewLabelValue("boltdb"),
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			// Re-registering per subcase swaps the stats callback in the
			// shared global registry; each callback returns a fixed snapshot.
			AddBoltDBMetrics(func() bbolt.Stats { return tt.stat })
			testMetricRetrieval(registry.registry.Read(), t, labelValues, tt.want, tt.name)
		})
	}

}
|
||||||
|
|
||||||
|
// Test_AddRedisMetrics verifies that redis pool stats registered via
// AddRedisMetrics are exported with the expected names, the "redis" service
// label, and values derived from the supplied *redis.PoolStats snapshot.
func Test_AddRedisMetrics(t *testing.T) {
	t.Parallel()

	tests := []struct {
		name string
		stat *redis.PoolStats
		want int64
	}{
		{"redis_conns", &redis.PoolStats{TotalConns: 7}, 7},
		{"redis_hits_total", &redis.PoolStats{Hits: 78}, 78},
		{"redis_timeouts_total", &redis.PoolStats{Timeouts: 2}, 2},
	}

	labelValues := []metricdata.LabelValue{
		metricdata.NewLabelValue("redis"),
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			// Re-registering per subcase swaps the stats callback in the
			// shared global registry; each callback returns a fixed snapshot.
			AddRedisMetrics(func() *redis.PoolStats { return tt.stat })
			testMetricRetrieval(registry.registry.Read(), t, labelValues, tt.want, tt.name)
		})
	}

}
|
142
internal/telemetry/metrics/registry.go
Normal file
142
internal/telemetry/metrics/registry.go
Normal file
|
@ -0,0 +1,142 @@
|
||||||
|
package metrics
|
||||||
|
|
||||||
|
import (
|
||||||
|
"runtime"
|
||||||
|
"sync"
|
||||||
|
|
||||||
|
"go.opencensus.io/metric"
|
||||||
|
"go.opencensus.io/metric/metricdata"
|
||||||
|
|
||||||
|
"github.com/pomerium/pomerium/internal/log"
|
||||||
|
"github.com/pomerium/pomerium/internal/version"
|
||||||
|
)
|
||||||
|
|
||||||
|
var (
	// registry is the package-wide metric registry; all non-view metrics in
	// this package are registered against it and exported via
	// RegisterInfoMetrics.
	registry = newMetricRegistry()
)
|
||||||
|
|
||||||
|
// metricRegistry holds the non-view metrics and handles safe
// initialization and updates. Behavior without using newMetricRegistry()
// is undefined.
type metricRegistry struct {
	// registry is the underlying opencensus registry all metrics attach to.
	registry *metric.Registry
	// buildInfo is a constant-1 gauge labeled by service, version, revision
	// and goversion.
	buildInfo *metric.Int64Gauge
	// policyCount reports the loaded policy count via a caller-supplied
	// callback (see addPolicyCountCallback).
	policyCount *metric.Int64DerivedGauge
	// configChecksum holds the loaded config checksum in decimal notation.
	configChecksum *metric.Float64Gauge
	// Once guards the one-time setup performed by init().
	sync.Once
}
|
||||||
|
|
||||||
|
func newMetricRegistry() *metricRegistry {
|
||||||
|
r := new(metricRegistry)
|
||||||
|
r.init()
|
||||||
|
return r
|
||||||
|
}
|
||||||
|
|
||||||
|
func (r *metricRegistry) init() {
|
||||||
|
r.Do(
|
||||||
|
func() {
|
||||||
|
r.registry = metric.NewRegistry()
|
||||||
|
var err error
|
||||||
|
r.buildInfo, err = r.registry.AddInt64Gauge("build_info",
|
||||||
|
metric.WithDescription("Build Metadata"),
|
||||||
|
metric.WithLabelKeys("service", "version", "revision", "goversion"),
|
||||||
|
)
|
||||||
|
if err != nil {
|
||||||
|
log.Error().Err(err).Msg("telemetry/metrics: failed to register build info metric")
|
||||||
|
}
|
||||||
|
|
||||||
|
r.configChecksum, err = r.registry.AddFloat64Gauge("config_checksum_decimal",
|
||||||
|
metric.WithDescription("Config checksum represented in decimal notation"),
|
||||||
|
metric.WithLabelKeys("service"),
|
||||||
|
)
|
||||||
|
if err != nil {
|
||||||
|
log.Error().Err(err).Msg("telemetry/metrics: failed to register config checksum metric")
|
||||||
|
}
|
||||||
|
|
||||||
|
r.policyCount, err = r.registry.AddInt64DerivedGauge("policy_count_total",
|
||||||
|
metric.WithDescription("Total number of policies loaded"),
|
||||||
|
metric.WithLabelKeys("service"),
|
||||||
|
)
|
||||||
|
if err != nil {
|
||||||
|
log.Error().Err(err).Msg("telemetry/metrics: failed to register policy count metric")
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
// SetBuildInfo records the pomerium build info. You must call RegisterInfoMetrics to
|
||||||
|
// have this exported
|
||||||
|
func (r *metricRegistry) setBuildInfo(service string) {
|
||||||
|
if registry.buildInfo == nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
m, err := registry.buildInfo.GetEntry(
|
||||||
|
metricdata.NewLabelValue(service),
|
||||||
|
metricdata.NewLabelValue(version.FullVersion()),
|
||||||
|
metricdata.NewLabelValue(version.GitCommit),
|
||||||
|
metricdata.NewLabelValue((runtime.Version())),
|
||||||
|
)
|
||||||
|
if err != nil {
|
||||||
|
log.Error().Err(err).Msg("telemetry/metrics: failed to get build info metric")
|
||||||
|
}
|
||||||
|
|
||||||
|
// This sets our build_info metric to a constant 1 per
|
||||||
|
// https://www.robustperception.io/exposing-the-software-version-to-prometheus
|
||||||
|
m.Set(1)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (r *metricRegistry) addPolicyCountCallback(service string, f func() int64) {
|
||||||
|
if r.policyCount == nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
err := r.policyCount.UpsertEntry(f, metricdata.NewLabelValue(service))
|
||||||
|
if err != nil {
|
||||||
|
log.Error().Err(err).Msg("telemetry/metrics: failed to get policy count metric")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (r *metricRegistry) setConfigChecksum(service string, checksum uint64) {
|
||||||
|
if r.configChecksum == nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
m, err := r.configChecksum.GetEntry(metricdata.NewLabelValue(service))
|
||||||
|
if err != nil {
|
||||||
|
log.Error().Err(err).Msg("telemetry/metrics: failed to get config checksum metric")
|
||||||
|
}
|
||||||
|
m.Set(float64(checksum))
|
||||||
|
}
|
||||||
|
|
||||||
|
func (r *metricRegistry) addInt64DerivedGaugeMetric(name string, desc string, service string, f func() int64) {
|
||||||
|
|
||||||
|
m, err := r.registry.AddInt64DerivedGauge(name, metric.WithDescription(desc), metric.WithLabelKeys("service"))
|
||||||
|
if err != nil {
|
||||||
|
log.Error().Err(err).Str("service", service).Msg("telemetry/metrics: failed to register metric")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
err = m.UpsertEntry(
|
||||||
|
f,
|
||||||
|
metricdata.NewLabelValue(service),
|
||||||
|
)
|
||||||
|
if err != nil {
|
||||||
|
log.Error().Err(err).Str("service", service).Msg("telemetry/metrics: failed to update metric")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (r *metricRegistry) addInt64DerivedCumulativeMetric(name string, desc string, service string, f func() int64) {
|
||||||
|
|
||||||
|
m, err := r.registry.AddInt64DerivedCumulative(name, metric.WithDescription(desc), metric.WithLabelKeys("service"))
|
||||||
|
if err != nil {
|
||||||
|
log.Error().Err(err).Str("service", service).Msg("telemetry/metrics: failed to register metric")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
err = m.UpsertEntry(
|
||||||
|
f,
|
||||||
|
metricdata.NewLabelValue(service),
|
||||||
|
)
|
||||||
|
if err != nil {
|
||||||
|
log.Error().Err(err).Str("service", service).Msg("telemetry/metrics: failed to update metric")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
}
|
Loading…
Add table
Add a link
Reference in a new issue