Add storage metrics (#554)

* Add cache storage metrics

- autocache client metrics
- autocache server metrics
- boltdb metrics
- redis client metrics
- refactor metrics registry to be general purpose
Travis Groth 2020-03-23 22:07:48 -04:00 committed by GitHub
parent acfc880421
commit cc504362e4
8 changed files with 400 additions and 108 deletions


@@ -325,7 +325,35 @@ Expose a prometheus format HTTP endpoint on the specified port. Disabled by default.
**Metrics tracked**
| Name | Type | Description |
| :-------------------------------------------- | :-------- | :---------------------------------------------------------------------- |
| boltdb_free_alloc_size_bytes | Gauge | Bytes allocated in free pages |
| boltdb_free_page_n | Gauge | Number of free pages on the freelist |
| boltdb_freelist_inuse_size_bytes | Gauge | Bytes used by the freelist |
| boltdb_open_txn | Gauge | Number of currently open read transactions |
| boltdb_pending_page_n | Gauge | Number of pending pages on the freelist |
| boltdb_txn | Gauge | Total number of started read transactions |
| boltdb_txn_cursor_total | Counter | Total number of cursors created |
| boltdb_txn_node_deref_total | Counter | Total number of node dereferences |
| boltdb_txn_node_total | Counter | Total number of node allocations |
| boltdb_txn_page_alloc_size_bytes_total | Counter | Total bytes allocated |
| boltdb_txn_page_total | Counter | Total number of page allocations |
| boltdb_txn_rebalance_duration_ms_total | Counter | Total time spent rebalancing |
| boltdb_txn_rebalance_total | Counter | Total number of node rebalances |
| boltdb_txn_spill_duration_ms_total | Counter | Total time spent spilling |
| boltdb_txn_spill_total | Counter | Total number of nodes spilled |
| boltdb_txn_split_total | Counter | Total number of nodes split |
| boltdb_txn_write_duration_ms_total | Counter | Total time spent writing to disk |
| boltdb_txn_write_total | Counter | Total number of writes performed |
| groupcache_cache_hits_total | Counter | Total cache hits in local or cluster cache |
| groupcache_gets_total | Counter | Total get requests, including from peers |
| groupcache_loads_deduped_total | Counter | Total gets without cache hits after duplicate suppression |
| groupcache_loads_total | Counter | Total gets without cache hits |
| groupcache_local_load_errs_total | Counter | Total local load errors |
| groupcache_local_loads_total | Counter | Total good local loads |
| groupcache_peer_errors_total | Counter | Total errors from peers |
| groupcache_peer_loads_total | Counter | Total remote loads or cache hits without error |
| groupcache_server_requests_total | Counter | Total gets from peers |
| grpc_client_request_duration_ms | Histogram | GRPC client request duration by service |
| grpc_client_request_size_bytes | Histogram | GRPC client request size by service |
| grpc_client_requests_total | Counter | Total GRPC client requests made by service |
@@ -342,10 +370,17 @@ Expose a prometheus format HTTP endpoint on the specified port. Disabled by default.
| http_server_request_size_bytes | Histogram | HTTP server request size by service |
| http_server_requests_total | Counter | Total HTTP server requests handled by service |
| http_server_response_size_bytes | Histogram | HTTP server response size by service |
| pomerium_build_info | Gauge | Pomerium build metadata by git revision, service, version and goversion |
| pomerium_config_checksum_decimal | Gauge | Currently loaded configuration checksum by service |
| pomerium_config_last_reload_success | Gauge | Whether the last configuration reload succeeded by service |
| pomerium_config_last_reload_success_timestamp | Gauge | The timestamp of the last successful configuration reload by service |
| redis_conns | Gauge | Number of total connections in the pool |
| redis_hits_total | Counter | Total number of times free connection was found in the pool |
| redis_idle_conns | Gauge | Number of idle connections in the pool |
| redis_misses_total | Counter | Total number of times free connection was NOT found in the pool |
| redis_stale_conns_total | Counter | Total number of stale connections removed from the pool |
| redis_timeouts_total | Counter | Total number of times a wait timeout occurred |
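
Once scraping is enabled, each of these is exported as an ordinary Prometheus family carrying a `service` label identifying the registering subsystem. Illustrative exposition output (names and descriptions from the table above; the values are made up):

```
# HELP boltdb_free_page_n Number of free pages on the freelist
# TYPE boltdb_free_page_n gauge
boltdb_free_page_n{service="boltdb"} 14

# HELP redis_hits_total Total number of times free connection was found in the pool
# TYPE redis_hits_total counter
redis_hits_total{service="redis"} 78
```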
### Tracing


@@ -12,9 +12,11 @@ import (
"sync"
"github.com/golang/groupcache"
"github.com/pomerium/autocache"
"github.com/pomerium/pomerium/internal/httputil"
"github.com/pomerium/pomerium/internal/kv"
"github.com/pomerium/pomerium/internal/telemetry/metrics"
"github.com/pomerium/pomerium/internal/urlutil"
)
@@ -116,13 +118,14 @@ func New(o *Options) (*Store, error) {
}
serverOpts := &httputil.ServerOptions{Addr: o.Addr}
var wg sync.WaitGroup
s.srv, err = httputil.NewServer(serverOpts, QueryParamToCtx(s.cluster), &wg)
s.srv, err = httputil.NewServer(serverOpts, metrics.HTTPMetricsHandler("groupcache")(QueryParamToCtx(s.cluster)), &wg)
if err != nil {
return nil, err
}
if _, err := s.cluster.Join([]string{o.ClusterDomain}); err != nil {
return nil, err
}
metrics.AddGroupCacheMetrics(s.db)
return &s, nil
}
@@ -191,7 +194,9 @@ func (s signedSession) RoundTrip(req *http.Request) (*http.Response, error) {
q.Set(defaultQueryParamKey, session)
newReqURL.RawQuery = q.Encode()
newReq.URL = urlutil.NewSignedURL(s.sharedKey, &newReqURL).Sign()
return http.DefaultTransport.RoundTrip(newReq)
tripper := metrics.HTTPMetricsRoundTripper("cache", "groupcache")(http.DefaultTransport)
return tripper.RoundTrip(newReq)
}
// QueryParamToCtx takes a value from a query param and adds it to the
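
The RoundTrip change above layers metrics onto http.DefaultTransport with a decorator rather than a custom transport. A minimal sketch of that wrap-and-delegate shape, assuming nothing about Pomerium's actual HTTPMetricsRoundTripper beyond its signature (the names below are illustrative, and it logs instead of recording OpenCensus measures):

```go
package main

import (
	"log"
	"net/http"
	"time"
)

// roundTripperFunc adapts an ordinary function to http.RoundTripper,
// the same trick http.HandlerFunc uses for handlers.
type roundTripperFunc func(*http.Request) (*http.Response, error)

func (f roundTripperFunc) RoundTrip(r *http.Request) (*http.Response, error) { return f(r) }

// timedTransport returns a decorator: given any base RoundTripper it
// yields one that times each request under a service label.
func timedTransport(service string) func(http.RoundTripper) http.RoundTripper {
	return func(next http.RoundTripper) http.RoundTripper {
		return roundTripperFunc(func(req *http.Request) (*http.Response, error) {
			start := time.Now()
			resp, err := next.RoundTrip(req)
			log.Printf("service=%s host=%s elapsed=%s", service, req.URL.Host, time.Since(start))
			return resp, err
		})
	}
}

func main() {
	client := &http.Client{Transport: timedTransport("cache")(http.DefaultTransport)}
	resp, err := client.Get("https://example.com")
	if err != nil {
		log.Fatal(err)
	}
	resp.Body.Close()
}
```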


@@ -5,9 +5,10 @@ package bolt
import (
"context"
"github.com/pomerium/pomerium/internal/kv"
bolt "go.etcd.io/bbolt"
"github.com/pomerium/pomerium/internal/kv"
"github.com/pomerium/pomerium/internal/telemetry/metrics"
)
var _ kv.Store = &Store{}
@@ -64,6 +65,7 @@ func New(o *Options) (*Store, error) {
return nil, err
}
metrics.AddBoltDBMetrics(db.Stats)
return &Store{db: db, bucket: o.Bucket}, nil
}


@@ -9,7 +9,9 @@ import (
"fmt"
"github.com/go-redis/redis/v7"
"github.com/pomerium/pomerium/internal/kv"
"github.com/pomerium/pomerium/internal/telemetry/metrics"
)
var _ kv.Store = &Store{}
@@ -56,6 +58,7 @@ func New(o *Options) (*Store, error) {
return nil, fmt.Errorf("kv/redis: error connecting to redis: %w", err)
}
metrics.AddRedisMetrics(db.PoolStats)
return &Store{db: db}, nil
}
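
Both stores pass a method value rather than a closure: bbolt's (*DB).Stats already has type func() bbolt.Stats, and go-redis's (*Client).PoolStats has type func() *redis.PoolStats, which is exactly what AddBoltDBMetrics and AddRedisMetrics accept. A compilable sketch of the wiring, with placeholder path and address:

```go
package main

import (
	"log"

	"github.com/go-redis/redis/v7"
	bolt "go.etcd.io/bbolt"
)

func main() {
	db, err := bolt.Open("example.db", 0600, nil) // placeholder path
	if err != nil {
		log.Fatal(err)
	}
	defer db.Close()

	rdb := redis.NewClient(&redis.Options{Addr: "localhost:6379"}) // placeholder address

	// Method values: each already has the exact callback signature the
	// registration helpers expect, and is re-invoked on every metrics
	// read, so reported values stay current without polling loops.
	boltStats := db.Stats       // type: func() bolt.Stats
	redisStats := rdb.PoolStats // type: func() *redis.PoolStats

	log.Println(boltStats().FreePageN, redisStats().TotalConns)
}
```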


@@ -2,19 +2,14 @@ package metrics
import (
"context"
"runtime"
"sync"
"time"
"github.com/pomerium/pomerium/internal/log"
"github.com/pomerium/pomerium/internal/version"
"go.opencensus.io/metric"
"go.opencensus.io/metric/metricdata"
"go.opencensus.io/metric/metricproducer"
"go.opencensus.io/stats"
"go.opencensus.io/stats/view"
"go.opencensus.io/tag"
"github.com/pomerium/pomerium/internal/log"
)
var (
@@ -30,7 +25,6 @@ var (
"config_last_reload_success",
"Returns 1 if last reload was successful",
"1")
registry = newMetricRegistry()
// ConfigLastReloadView contains the timestamp the configuration was last
// reloaded, labeled by service.
@@ -79,75 +73,6 @@ func SetConfigInfo(service string, success bool) {
}
}
// metricRegistry holds the non-view metrics and handles safe
// initialization and updates. Behavior without using newMetricRegistry()
// is undefined.
type metricRegistry struct {
registry *metric.Registry
buildInfo *metric.Int64Gauge
policyCount *metric.Int64DerivedGauge
configChecksum *metric.Float64Gauge
sync.Once
}
func newMetricRegistry() *metricRegistry {
r := new(metricRegistry)
r.init()
return r
}
func (r *metricRegistry) init() {
r.Do(
func() {
r.registry = metric.NewRegistry()
var err error
r.buildInfo, err = r.registry.AddInt64Gauge("build_info",
metric.WithDescription("Build Metadata"),
metric.WithLabelKeys("service", "version", "revision", "goversion"),
)
if err != nil {
log.Error().Err(err).Msg("telemetry/metrics: failed to register build info metric")
}
r.configChecksum, err = r.registry.AddFloat64Gauge("config_checksum_decimal",
metric.WithDescription("Config checksum represented in decimal notation"),
metric.WithLabelKeys("service"),
)
if err != nil {
log.Error().Err(err).Msg("telemetry/metrics: failed to register config checksum metric")
}
r.policyCount, err = r.registry.AddInt64DerivedGauge("policy_count_total",
metric.WithDescription("Total number of policies loaded"),
metric.WithLabelKeys("service"),
)
if err != nil {
log.Error().Err(err).Msg("telemetry/metrics: failed to register policy count metric")
}
})
}
// SetBuildInfo records the pomerium build info. You must call RegisterInfoMetrics to
// have this exported
func (r *metricRegistry) setBuildInfo(service string) {
if registry.buildInfo == nil {
return
}
m, err := registry.buildInfo.GetEntry(
metricdata.NewLabelValue(service),
metricdata.NewLabelValue(version.FullVersion()),
metricdata.NewLabelValue(version.GitCommit),
metricdata.NewLabelValue((runtime.Version())),
)
if err != nil {
log.Error().Err(err).Msg("telemetry/metrics: failed to get build info metric")
}
// This sets our build_info metric to a constant 1 per
// https://www.robustperception.io/exposing-the-software-version-to-prometheus
m.Set(1)
}
// SetBuildInfo records the pomerium build info. You must call RegisterInfoMetrics to
// have this exported
func SetBuildInfo(service string) {
@@ -159,33 +84,12 @@ func RegisterInfoMetrics() {
metricproducer.GlobalManager().AddProducer(registry.registry)
}
func (r *metricRegistry) setConfigChecksum(service string, checksum uint64) {
if r.configChecksum == nil {
return
}
m, err := r.configChecksum.GetEntry(metricdata.NewLabelValue(service))
if err != nil {
log.Error().Err(err).Msg("telemetry/metrics: failed to get config checksum metric")
}
m.Set(float64(checksum))
}
// SetConfigChecksum creates the configuration checksum metric. You must call RegisterInfoMetrics to
// have this exported
func SetConfigChecksum(service string, checksum uint64) {
registry.setConfigChecksum(service, checksum)
}
func (r *metricRegistry) addPolicyCountCallback(service string, f func() int64) {
if r.policyCount == nil {
return
}
err := r.policyCount.UpsertEntry(f, metricdata.NewLabelValue(service))
if err != nil {
log.Error().Err(err).Msg("telemetry/metrics: failed to get policy count metric")
}
}
// AddPolicyCountCallback sets the function to call when exporting the
// policy count metric. You must call RegisterInfoMetrics to have this
// exported


@@ -0,0 +1,107 @@
package metrics
import (
"github.com/go-redis/redis/v7"
"github.com/golang/groupcache"
"go.etcd.io/bbolt"
)
// AddGroupCacheMetrics registers derived metrics on the stats counters of a *groupcache.Group
func AddGroupCacheMetrics(gc *groupcache.Group) {
cumulativeMetrics := []struct {
name string
desc string
f func() int64
}{
{"groupcache_gets_total", "Total get request, including from peers", gc.Stats.Gets.Get},
{"groupcache_cache_hits_total", "Total cache hits in local or cluster cache", gc.Stats.CacheHits.Get},
{"groupcache_cache_hits_total", "Total cache hits in local or cluster cache", gc.Stats.CacheHits.Get},
{"groupcache_peer_loads_total", "Total remote loads or cache hits without error", gc.Stats.PeerLoads.Get},
{"groupcache_peer_errors_total", "Total errors from peers", gc.Stats.PeerErrors.Get},
{"groupcache_loads_total", "Total gets without cache hits", gc.Stats.Loads.Get},
{"groupcache_loads_deduped_total", "gets without cache hits after duplicate suppression", gc.Stats.LoadsDeduped.Get},
{"groupcache_local_loads_total", "Total good local loads", gc.Stats.LocalLoads.Get},
{"groupcache_local_load_errs_total", "Total local load errors", gc.Stats.LocalLoadErrs.Get},
{"groupcache_server_requests_total", "Total gets from peers", gc.Stats.ServerRequests.Get},
}
for _, m := range cumulativeMetrics {
registry.addInt64DerivedCumulativeMetric(m.name, m.desc, "autocache", m.f)
}
}
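// Illustrative usage, not part of this commit: groupcache maintains the
// Stats fields on each *groupcache.Group itself, so passing their Get
// methods as callbacks is all the registration above needs. Assuming the
// current golang/groupcache API:
//
//	group := groupcache.NewGroup("sessions", 64<<20, groupcache.GetterFunc(
//		func(ctx context.Context, key string, dest groupcache.Sink) error {
//			return dest.SetBytes([]byte("value")) // placeholder loader
//		}))
//	AddGroupCacheMetrics(group)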
// AddBoltDBMetrics registers derived metrics against a bbolt stats callback such as (*bbolt.DB).Stats
func AddBoltDBMetrics(stats func() bbolt.Stats) {
gaugeMetrics := []struct {
name string
desc string
f func() int64
}{
{"boltdb_free_page_n", "Number of free pages on the freelist", func() int64 { return int64(stats().FreePageN) }},
{"boltdb_pending_page_n", "Number of pending pages on the freelist", func() int64 { return int64(stats().PendingPageN) }},
{"boltdb_free_alloc_size_bytes", "Bytes allocated in free pages", func() int64 { return int64(stats().FreeAlloc) }},
{"boltdb_freelist_inuse_size_bytes", "Bytes used by the freelist", func() int64 { return int64(stats().FreelistInuse) }},
{"boltdb_txn", "total number of started read transactions", func() int64 { return int64(stats().TxN) }},
{"boltdb_open_txn", "number of currently open read transactions", func() int64 { return int64(stats().OpenTxN) }},
}
for _, m := range gaugeMetrics {
registry.addInt64DerivedGaugeMetric(m.name, m.desc, "boltdb", m.f)
}
cumulativeMetrics := []struct {
name string
desc string
f func() int64
}{
{"boltdb_txn_page_total", "Total number of page allocations", func() int64 { return int64(stats().TxStats.PageCount) }},
{"boltdb_txn_page_alloc_size_bytes_total", "Total bytes allocated", func() int64 { return int64(stats().TxStats.PageAlloc) }},
{"boltdb_txn_cursor_total", "Total number of cursors created", func() int64 { return int64(stats().TxStats.CursorCount) }},
{"boltdb_txn_node_total", "Total number of node allocations", func() int64 { return int64(stats().TxStats.NodeCount) }},
{"boltdb_txn_node_deref_total", "Total number of node dereferences", func() int64 { return int64(stats().TxStats.NodeDeref) }},
{"boltdb_txn_rebalance_total", "Total number of node rebalances", func() int64 { return int64(stats().TxStats.Rebalance) }},
{"boltdb_txn_rebalance_duration_ms_total", "Total time spent rebalancing", func() int64 { return stats().TxStats.RebalanceTime.Milliseconds() }},
{"boltdb_txn_split_total", "Total number of nodes split", func() int64 { return int64(stats().TxStats.Split) }},
{"boltdb_txn_spill_total", "Total number of nodes spilled", func() int64 { return int64(stats().TxStats.Spill) }},
{"boltdb_txn_spill_duration_ms_total", "Total time spent spilling", func() int64 { return stats().TxStats.SpillTime.Milliseconds() }},
{"boltdb_txn_write_total", "Total number of writes performed", func() int64 { return int64(stats().TxStats.Write) }},
{"boltdb_txn_write_duration_ms_total", "Total time spent writing to disk", func() int64 { return stats().TxStats.WriteTime.Milliseconds() }},
}
for _, m := range cumulativeMetrics {
registry.addInt64DerivedCumulativeMetric(m.name, m.desc, "boltdb", m.f)
}
}
// AddRedisMetrics registers a metrics handler against a redis Client's PoolStats() method
func AddRedisMetrics(stats func() *redis.PoolStats) {
gaugeMetrics := []struct {
name string
desc string
f func() int64
}{
{"redis_conns", "Number of total connections in the pool", func() int64 { return int64(stats().TotalConns) }},
{"redis_idle_conns", "Number of idle connections in the pool", func() int64 { return int64(stats().IdleConns) }},
}
for _, m := range gaugeMetrics {
registry.addInt64DerivedGaugeMetric(m.name, m.desc, "redis", m.f)
}
cumulativeMetrics := []struct {
name string
desc string
f func() int64
}{
{"redis_hits_total", "Total number of times free connection was found in the pool", func() int64 { return int64(stats().Hits) }},
{"redis_misses_total", "Total number of times free connection was NOT found in the pool", func() int64 { return int64(stats().Misses) }},
{"redis_timeouts_total", "Total number of times a wait timeout occurred", func() int64 { return int64(stats().Timeouts) }},
{"redis_stale_conns_total", "Total number of stale connections removed from the pool", func() int64 { return int64(stats().StaleConns) }},
}
for _, m := range cumulativeMetrics {
registry.addInt64DerivedCumulativeMetric(m.name, m.desc, "redis", m.f)
}
}


@@ -0,0 +1,94 @@
package metrics
import (
"testing"
"time"
"github.com/go-redis/redis/v7"
"github.com/golang/groupcache"
"go.etcd.io/bbolt"
"go.opencensus.io/metric/metricdata"
)
func Test_AddGroupCacheMetrics(t *testing.T) {
t.Parallel()
gc := &groupcache.Group{}
AddGroupCacheMetrics(gc)
tests := []struct {
name string
stat *groupcache.AtomicInt
want int64
}{
{"groupcache_gets_total", &gc.Stats.Gets, 4},
{"groupcache_loads_total", &gc.Stats.Loads, 42},
{"groupcache_server_requests_total", &gc.Stats.ServerRequests, 8},
}
labelValues := []metricdata.LabelValue{
metricdata.NewLabelValue("autocache"),
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
tt.stat.Add(tt.want)
testMetricRetrieval(registry.registry.Read(), t, labelValues, tt.want, tt.name)
})
}
}
func Test_AddBoltDBMetrics(t *testing.T) {
t.Parallel()
tests := []struct {
name string
stat bbolt.Stats
want int64
}{
{"boltdb_free_page_n", bbolt.Stats{FreePageN: 14}, 14},
{"boltdb_txn", bbolt.Stats{TxN: 88}, 88},
{"boltdb_txn_rebalance_duration_ms_total", bbolt.Stats{TxStats: bbolt.TxStats{RebalanceTime: 42 * time.Millisecond}}, 42},
{"boltdb_txn_write_total", bbolt.Stats{TxStats: bbolt.TxStats{Write: 42}}, 42},
}
labelValues := []metricdata.LabelValue{
metricdata.NewLabelValue("boltdb"),
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
AddBoltDBMetrics(func() bbolt.Stats { return tt.stat })
testMetricRetrieval(registry.registry.Read(), t, labelValues, tt.want, tt.name)
})
}
}
func Test_AddRedisMetrics(t *testing.T) {
t.Parallel()
tests := []struct {
name string
stat *redis.PoolStats
want int64
}{
{"redis_conns", &redis.PoolStats{TotalConns: 7}, 7},
{"redis_hits_total", &redis.PoolStats{Hits: 78}, 78},
{"redis_timeouts_total", &redis.PoolStats{Timeouts: 2}, 2},
}
labelValues := []metricdata.LabelValue{
metricdata.NewLabelValue("redis"),
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
AddRedisMetrics(func() *redis.PoolStats { return tt.stat })
testMetricRetrieval(registry.registry.Read(), t, labelValues, tt.want, tt.name)
})
}
}
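
testMetricRetrieval is a shared helper defined elsewhere in this package; it is not part of this diff. A minimal sketch of what such a helper might look like against the opencensus metricdata API (an assumption, not the actual implementation; add reflect and metricdata to the imports):

```go
// exampleMetricRetrieval is a hypothetical stand-in for the package's
// testMetricRetrieval helper: scan the registry's exported metrics for a
// name and label set, then compare the latest int64 point.
func exampleMetricRetrieval(data []*metricdata.Metric, t *testing.T,
	labels []metricdata.LabelValue, want int64, name string) {
	t.Helper()
	for _, m := range data {
		if m.Descriptor.Name != name {
			continue
		}
		for _, ts := range m.TimeSeries {
			if !reflect.DeepEqual(ts.LabelValues, labels) {
				continue
			}
			// Derived metrics expose a single current point per series.
			if got := ts.Points[len(ts.Points)-1].Value.(int64); got != want {
				t.Errorf("%s = %d, want %d", name, got, want)
			}
			return
		}
	}
	t.Errorf("metric %q with labels %v not found", name, labels)
}
```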


@@ -0,0 +1,142 @@
package metrics
import (
"runtime"
"sync"
"go.opencensus.io/metric"
"go.opencensus.io/metric/metricdata"
"github.com/pomerium/pomerium/internal/log"
"github.com/pomerium/pomerium/internal/version"
)
var (
registry = newMetricRegistry()
)
// metricRegistry holds the non-view metrics and handles safe
// initialization and updates. Behavior without using newMetricRegistry()
// is undefined.
type metricRegistry struct {
registry *metric.Registry
buildInfo *metric.Int64Gauge
policyCount *metric.Int64DerivedGauge
configChecksum *metric.Float64Gauge
sync.Once
}
func newMetricRegistry() *metricRegistry {
r := new(metricRegistry)
r.init()
return r
}
func (r *metricRegistry) init() {
r.Do(
func() {
r.registry = metric.NewRegistry()
var err error
r.buildInfo, err = r.registry.AddInt64Gauge("build_info",
metric.WithDescription("Build Metadata"),
metric.WithLabelKeys("service", "version", "revision", "goversion"),
)
if err != nil {
log.Error().Err(err).Msg("telemetry/metrics: failed to register build info metric")
}
r.configChecksum, err = r.registry.AddFloat64Gauge("config_checksum_decimal",
metric.WithDescription("Config checksum represented in decimal notation"),
metric.WithLabelKeys("service"),
)
if err != nil {
log.Error().Err(err).Msg("telemetry/metrics: failed to register config checksum metric")
}
r.policyCount, err = r.registry.AddInt64DerivedGauge("policy_count_total",
metric.WithDescription("Total number of policies loaded"),
metric.WithLabelKeys("service"),
)
if err != nil {
log.Error().Err(err).Msg("telemetry/metrics: failed to register policy count metric")
}
})
}
// setBuildInfo records the pomerium build info. You must call RegisterInfoMetrics to
// have this exported
func (r *metricRegistry) setBuildInfo(service string) {
if r.buildInfo == nil {
return
}
m, err := r.buildInfo.GetEntry(
metricdata.NewLabelValue(service),
metricdata.NewLabelValue(version.FullVersion()),
metricdata.NewLabelValue(version.GitCommit),
metricdata.NewLabelValue(runtime.Version()),
)
if err != nil {
log.Error().Err(err).Msg("telemetry/metrics: failed to get build info metric")
return
}
// This sets our build_info metric to a constant 1 per
// https://www.robustperception.io/exposing-the-software-version-to-prometheus
m.Set(1)
}
func (r *metricRegistry) addPolicyCountCallback(service string, f func() int64) {
if r.policyCount == nil {
return
}
err := r.policyCount.UpsertEntry(f, metricdata.NewLabelValue(service))
if err != nil {
log.Error().Err(err).Msg("telemetry/metrics: failed to get policy count metric")
}
}
func (r *metricRegistry) setConfigChecksum(service string, checksum uint64) {
if r.configChecksum == nil {
return
}
m, err := r.configChecksum.GetEntry(metricdata.NewLabelValue(service))
if err != nil {
log.Error().Err(err).Msg("telemetry/metrics: failed to get config checksum metric")
return
}
m.Set(float64(checksum))
}
func (r *metricRegistry) addInt64DerivedGaugeMetric(name string, desc string, service string, f func() int64) {
m, err := r.registry.AddInt64DerivedGauge(name, metric.WithDescription(desc), metric.WithLabelKeys("service"))
if err != nil {
log.Error().Err(err).Str("service", service).Msg("telemetry/metrics: failed to register metric")
return
}
err = m.UpsertEntry(
f,
metricdata.NewLabelValue(service),
)
if err != nil {
log.Error().Err(err).Str("service", service).Msg("telemetry/metrics: failed to update metric")
return
}
}
func (r *metricRegistry) addInt64DerivedCumulativeMetric(name string, desc string, service string, f func() int64) {
m, err := r.registry.AddInt64DerivedCumulative(name, metric.WithDescription(desc), metric.WithLabelKeys("service"))
if err != nil {
log.Error().Err(err).Str("service", service).Msg("telemetry/metrics: failed to register metric")
return
}
err = m.UpsertEntry(
f,
metricdata.NewLabelValue(service),
)
if err != nil {
log.Error().Err(err).Str("service", service).Msg("telemetry/metrics: failed to update metric")
return
}
}
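
For a sense of how the derived helpers behave, here is an illustrative in-package example (not part of the commit): the callback is evaluated on every registry read, so exported values follow the live counter without explicit updates.

```go
package metrics

import "sync/atomic"

// Example_derivedCounter is an illustrative sketch only. The callback
// passed to addInt64DerivedCumulativeMetric is re-invoked on each
// registry.registry.Read(), so the exported value tracks hits lazily.
func Example_derivedCounter() {
	var hits int64
	registry.addInt64DerivedCumulativeMetric("example_hits_total",
		"Illustrative counter", "example",
		func() int64 { return atomic.LoadInt64(&hits) })

	atomic.AddInt64(&hits, 3)
	// A subsequent registry.registry.Read() now reports 3 for
	// example_hits_total{service="example"}.
}
```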