Add info metrics

This commit is contained in:
Travis Groth 2019-07-15 23:18:54 -04:00
parent d21d3c45b5
commit db63956b0e
10 changed files with 348 additions and 19 deletions

View file

@ -23,6 +23,7 @@
- Add support for large cookie sessions by chunking. [GH-211] - Add support for large cookie sessions by chunking. [GH-211]
- Prefer [curve](https://wiki.mozilla.org/Security/Server_Side_TLS) X25519 to P256 for TLS connections. [GH-233] - Prefer [curve](https://wiki.mozilla.org/Security/Server_Side_TLS) X25519 to P256 for TLS connections. [GH-233]
- Add informational metrics. [GH-227]
## v0.1.0 ## v0.1.0

View file

@ -6,6 +6,7 @@ import (
"github.com/pomerium/pomerium/internal/config" "github.com/pomerium/pomerium/internal/config"
"github.com/pomerium/pomerium/internal/log" "github.com/pomerium/pomerium/internal/log"
"github.com/pomerium/pomerium/internal/metrics"
) )
// ValidateOptions checks to see if configuration values are valid for the // ValidateOptions checks to see if configuration values are valid for the
@ -37,7 +38,6 @@ func New(opts config.Options) (*Authorize, error) {
} }
// errors handled by validate // errors handled by validate
sharedKey, _ := base64.StdEncoding.DecodeString(opts.SharedKey) sharedKey, _ := base64.StdEncoding.DecodeString(opts.SharedKey)
return &Authorize{ return &Authorize{
SharedKey: string(sharedKey), SharedKey: string(sharedKey),
identityAccess: NewIdentityWhitelist(opts.Policies, opts.Administrators), identityAccess: NewIdentityWhitelist(opts.Policies, opts.Administrators),
@ -47,6 +47,9 @@ func New(opts config.Options) (*Authorize, error) {
// NewIdentityWhitelist returns an identity validator. // NewIdentityWhitelist returns an identity validator.
// todo(bdd) : a radix-tree implementation is probably more efficient // todo(bdd) : a radix-tree implementation is probably more efficient
func NewIdentityWhitelist(policies []config.Policy, admins []string) IdentityValidator { func NewIdentityWhitelist(policies []config.Policy, admins []string) IdentityValidator {
metrics.AddPolicyCountCallback("authorize", func() int64 {
return int64(len(policies))
})
return newIdentityWhitelistMap(policies, admins) return newIdentityWhitelistMap(policies, admins)
} }

View file

@ -6,6 +6,7 @@ import (
"fmt" "fmt"
"net/http" "net/http"
"os" "os"
"strconv"
"time" "time"
"github.com/fsnotify/fsnotify" "github.com/fsnotify/fsnotify"
@ -87,6 +88,7 @@ func main() {
if opt.MetricsAddr != "" { if opt.MetricsAddr != "" {
go newPromListener(opt.MetricsAddr) go newPromListener(opt.MetricsAddr)
metrics.SetBuildInfo(opt.Services)
} }
if srv, err := startRedirectServer(opt.HTTPRedirectAddr); err != nil { if srv, err := startRedirectServer(opt.HTTPRedirectAddr); err != nil {
@ -162,6 +164,8 @@ func newPromListener(addr string) {
metrics.RegisterView(metrics.HTTPServerViews) metrics.RegisterView(metrics.HTTPServerViews)
metrics.RegisterView(metrics.GRPCClientViews) metrics.RegisterView(metrics.GRPCClientViews)
metrics.RegisterView(metrics.GRPCServerViews) metrics.RegisterView(metrics.GRPCServerViews)
metrics.RegisterInfoMetrics()
metrics.RegisterView(metrics.InfoViews)
log.Info().Str("MetricsAddr", addr).Msg("cmd/pomerium: starting prometheus endpoint") log.Info().Str("MetricsAddr", addr).Msg("cmd/pomerium: starting prometheus endpoint")
log.Error().Err(metrics.NewPromHTTPListener(addr)).Str("MetricsAddr", addr).Msg("cmd/pomerium: could not start metrics exporter") log.Error().Err(metrics.NewPromHTTPListener(addr)).Str("MetricsAddr", addr).Msg("cmd/pomerium: could not start metrics exporter")
@ -206,6 +210,14 @@ func parseOptions(configFile string) (*config.Options, error) {
if o.LogLevel != "" { if o.LogLevel != "" {
log.SetLevel(o.LogLevel) log.SetLevel(o.LogLevel)
} }
metrics.AddPolicyCountCallback(o.Services, func() int64 {
return int64(len(o.Policies))
})
checksumInt, err := strconv.ParseInt(fmt.Sprintf("0x%s", o.Checksum()), 0, 64)
if err != nil {
log.Warn().Err(err).Msg("Could not parse config checksum into integer")
}
metrics.SetConfigChecksum(o.Services, checksumInt)
return o, nil return o, nil
} }
@ -213,6 +225,7 @@ func handleConfigUpdate(opt *config.Options, services []config.OptionsUpdater) *
newOpt, err := parseOptions(*configFile) newOpt, err := parseOptions(*configFile)
if err != nil { if err != nil {
log.Error().Err(err).Msg("cmd/pomerium: could not reload configuration") log.Error().Err(err).Msg("cmd/pomerium: could not reload configuration")
metrics.SetConfigInfo(opt.Services, false, "")
return opt return opt
} }
optChecksum := opt.Checksum() optChecksum := opt.Checksum()
@ -232,8 +245,12 @@ func handleConfigUpdate(opt *config.Options, services []config.OptionsUpdater) *
for _, service := range services { for _, service := range services {
if err := service.UpdateOptions(*newOpt); err != nil { if err := service.UpdateOptions(*newOpt); err != nil {
log.Error().Err(err).Msg("cmd/pomerium: could not update options") log.Error().Err(err).Msg("cmd/pomerium: could not update options")
metrics.SetConfigInfo(opt.Services, false, "")
} }
} }
metrics.AddPolicyCountCallback(newOpt.Services, func() int64 {
return int64(len(newOpt.Policies))
})
metrics.SetConfigInfo(newOpt.Services, true, newOptChecksum)
return newOpt return newOpt
} }

View file

@ -166,22 +166,26 @@ Expose a prometheus format HTTP endpoint on the specified port. Disabled by defa
Name | Type | Description Name | Type | Description
:------------------------------ | :-------- | :-------------------------------------------- :------------------------------ | :-------- | :--------------------------------------------
http_server_requests_total | Counter | Total HTTP server requests handled by service grpc_client_request_duration_ms | Histogram | GRPC client request duration by service
http_server_response_size_bytes | Histogram | HTTP server response size by service grpc_client_request_size_bytes | Histogram | GRPC client request size by service
http_server_request_size_bytes | Histogram | HTTP server request size by service
http_server_request_duration_ms | Histogram | HTTP server request duration by service
http_client_requests_total | Counter | Total HTTP client requests made by service
http_client_response_size_bytes | Histogram | HTTP client response size by service
http_client_request_size_bytes | Histogram | HTTP client request size by service
http_client_request_duration_ms | Histogram | HTTP client request duration by service
grpc_client_requests_total | Counter | Total GRPC client requests made by service grpc_client_requests_total | Counter | Total GRPC client requests made by service
grpc_client_response_size_bytes | Histogram | GRPC client response size by service grpc_client_response_size_bytes | Histogram | GRPC client response size by service
grpc_client_request_size_bytes | Histogram | GRPC client request size by service grpc_server_request_duration_ms | Histogram | GRPC server request duration by service
grpc_client_request_duration_ms | Histogram | GRPC client request duration by service grpc_server_request_size_bytes | Histogram | GRPC server request size by service
grpc_server_requests_total | Counter | Total GRPC server requests made by service grpc_server_requests_total | Counter | Total GRPC server requests made by service
grpc_server_response_size_bytes | Histogram | GRPC server response size by service grpc_server_response_size_bytes | Histogram | GRPC server response size by service
grpc_server_request_size_bytes | Histogram | GRPC server request size by service http_client_request_duration_ms | Histogram | HTTP client request duration by service
grpc_server_request_duration_ms | Histogram | GRPC server request duration by service http_client_request_size_bytes | Histogram | HTTP client request size by service
http_client_requests_total | Counter | Total HTTP client requests made by service
http_client_response_size_bytes | Histogram | HTTP client response size by service
http_server_request_duration_ms | Histogram | HTTP server request duration by service
http_server_request_size_bytes | Histogram | HTTP server request size by service
http_server_requests_total | Counter | Total HTTP server requests handled by service
http_server_response_size_bytes | Histogram | HTTP server response size by service
pomerium_config_checksum_int64 | Gauge | Currently loaded configuration checksum by service
pomerium_config_last_reload_success | Gauge | Whether the last configuration reload succeeded by service
pomerium_config_last_reload_success_timestamp | Gauge | The timestamp of the last successful configuration reload by service
pomerium_build_info | Gauge | Pomerium build metadata by git revision, service, version and goversion
### Policy ### Policy

View file

@ -4,6 +4,8 @@ import (
"strings" "strings"
"testing" "testing"
"github.com/google/go-cmp/cmp"
"go.opencensus.io/metric/metricdata"
"go.opencensus.io/stats/view" "go.opencensus.io/stats/view"
) )
@ -17,11 +19,41 @@ func testDataRetrieval(v *view.View, t *testing.T, want string) {
if err != nil { if err != nil {
t.Fatalf("%s: failed to retrieve data line %s", name, err) t.Fatalf("%s: failed to retrieve data line %s", name, err)
} }
if len(data) != 1 {
if want != "" && len(data) != 1 {
t.Fatalf("%s: received incorrect number of data rows: %d", name, len(data)) t.Fatalf("%s: received incorrect number of data rows: %d", name, len(data))
} }
if want == "" && len(data) > 0 {
t.Fatalf("%s: received incorrect number of data rows: %d", name, len(data))
} else if want == "" {
return
}
if !strings.HasPrefix(data[0].String(), want) { dataString := data[0].String()
t.Errorf("%s: Found unexpected data row: \nwant: %s\ngot: %s\n", name, want, data[0].String())
if want != "" && !strings.HasPrefix(dataString, want) {
t.Errorf("%s: Found unexpected data row: \nwant: %s\ngot: %s\n", name, want, dataString)
}
}
func testMetricRetrieval(metrics []*metricdata.Metric, t *testing.T, labels []metricdata.LabelValue, value int64, name string) {
found := false
for _, metric := range metrics {
if metric.Descriptor.Name != name {
found = true
continue
}
gotLabels := metric.TimeSeries[0].LabelValues
gotValue := metric.TimeSeries[0].Points[0].Value
if diff := cmp.Diff(gotLabels, labels); diff != "" {
t.Errorf("Failed to find metric labels:\n%s", diff)
}
if diff := cmp.Diff(gotValue, value); diff != "" {
t.Errorf("Failed to find metric value:\n%s", diff)
}
}
if !found {
t.Errorf("Could not find metric %s", name)
} }
} }

184
internal/metrics/info.go Normal file
View file

@ -0,0 +1,184 @@
package metrics // import "github.com/pomerium/pomerium/internal/metrics"
import (
"context"
"runtime"
"sync"
"time"
"github.com/pomerium/pomerium/internal/log"
"github.com/pomerium/pomerium/internal/version"
"go.opencensus.io/metric"
"go.opencensus.io/metric/metricdata"
"go.opencensus.io/metric/metricproducer"
"go.opencensus.io/stats"
"go.opencensus.io/stats/view"
"go.opencensus.io/tag"
)
// Measures backing the view-based configuration metrics.
var (
	// configLastReload holds the Unix timestamp recorded on a successful
	// configuration reload.
	configLastReload = stats.Int64(
		"config_last_reload_success_timestamp",
		"Timestamp of last successful config reload",
		"seconds",
	)
	// configLastReloadSuccess holds 1 or 0 depending on the outcome of the
	// last configuration reload.
	configLastReloadSuccess = stats.Int64(
		"config_last_reload_success",
		"Returns 1 if last reload was successful",
		"1",
	)
)

// registry is the package-wide holder of the non-view (gauge) metrics.
var registry = newMetricRegistry()

// ConfigLastReloadView exposes the last value of configLastReload, keyed by
// the service tag.
var ConfigLastReloadView = &view.View{
	Name:        configLastReload.Name(),
	Description: configLastReload.Description(),
	Measure:     configLastReload,
	TagKeys:     []tag.Key{keyService},
	Aggregation: view.LastValue(),
}

// ConfigLastReloadSuccessView exposes the last value of
// configLastReloadSuccess, keyed by the service tag.
var ConfigLastReloadSuccessView = &view.View{
	Name:        configLastReloadSuccess.Name(),
	Description: configLastReloadSuccess.Description(),
	Measure:     configLastReloadSuccess,
	TagKeys:     []tag.Key{keyService},
	Aggregation: view.LastValue(),
}
// SetConfigInfo records the status, checksum and timestamp of a configuration reload. You must register InfoViews or the related
// config views before calling
func SetConfigInfo(service string, success bool, checksum string) {
if success {
serviceTag := tag.Insert(keyService, service)
if err := stats.RecordWithTags(
context.Background(),
[]tag.Mutator{serviceTag},
configLastReload.M(time.Now().Unix()),
); err != nil {
log.Error().Err(err).Msg("internal/metrics: failed to record config checksum timestamp")
}
if err := stats.RecordWithTags(
context.Background(),
[]tag.Mutator{serviceTag},
configLastReloadSuccess.M(1),
); err != nil {
log.Error().Err(err).Msg("internal/metrics: failed to record config reload")
}
} else {
stats.Record(context.Background(), configLastReloadSuccess.M(0))
}
}
// metricRegistry bundles the opencensus metric registry with the non-view
// gauges registered on it. The embedded sync.Once makes init run its setup a
// single time. Always construct it with newMetricRegistry(); behavior of a
// value created any other way is undefined.
type metricRegistry struct {
	sync.Once

	registry *metric.Registry

	buildInfo      *metric.Int64Gauge
	configChecksum *metric.Int64Gauge
	policyCount    *metric.Int64DerivedGauge
}
// newMetricRegistry allocates a metricRegistry and runs its one-time
// initialization before returning it.
func newMetricRegistry() *metricRegistry {
	reg := &metricRegistry{}
	reg.init()
	return reg
}
// init creates the underlying opencensus registry and registers the
// build_info, config_checksum_int64 and policy_count_total gauges on it.
// The work runs at most once per metricRegistry. A registration failure is
// logged and the corresponding field is left holding whatever the
// registration call returned.
func (r *metricRegistry) init() {
	r.Do(func() {
		reg := metric.NewRegistry()
		r.registry = reg

		buildInfo, err := reg.AddInt64Gauge(
			"build_info",
			metric.WithDescription("Build Metadata"),
			metric.WithLabelKeys("service", "version", "revision", "goversion"),
		)
		if err != nil {
			log.Error().Err(err).Msg("internal/metrics: failed to register build info metric")
		}
		r.buildInfo = buildInfo

		configChecksum, err := reg.AddInt64Gauge(
			"config_checksum_int64",
			metric.WithDescription("Config checksum represented in int64 notation"),
			metric.WithLabelKeys("service"),
		)
		if err != nil {
			log.Error().Err(err).Msg("internal/metrics: failed to register config checksum metric")
		}
		r.configChecksum = configChecksum

		policyCount, err := reg.AddInt64DerivedGauge(
			"policy_count_total",
			metric.WithDescription("Total number of policies loaded"),
			metric.WithLabelKeys("service"),
		)
		if err != nil {
			log.Error().Err(err).Msg("internal/metrics: failed to register policy count metric")
		}
		r.policyCount = policyCount
	})
}
// setBuildInfo sets the build_info gauge entry for service to 1. The entry is
// labeled with the service name, version.FullVersion(), version.GitCommit and
// the Go runtime version, in the order of the label keys declared in init.
// It is a no-op when the gauge was never registered, and it logs and returns
// when the entry cannot be obtained.
func (r *metricRegistry) setBuildInfo(service string) {
	// Operate on the receiver rather than the package-level registry so the
	// method behaves the same on any metricRegistry instance.
	if r.buildInfo == nil {
		return
	}

	m, err := r.buildInfo.GetEntry(
		metricdata.NewLabelValue(service),
		metricdata.NewLabelValue(version.FullVersion()),
		metricdata.NewLabelValue(version.GitCommit),
		metricdata.NewLabelValue(runtime.Version()),
	)
	if err != nil {
		log.Error().Err(err).Msg("internal/metrics: failed to get build info metric")
		// Without an entry there is nothing to set; bail out instead of
		// dereferencing a nil entry below.
		return
	}

	// This sets our build_info metric to a constant 1 per
	// https://www.robustperception.io/exposing-the-software-version-to-prometheus
	m.Set(1)
}
// SetBuildInfo records the pomerium build info for service by delegating to
// the package-level registry. You must call RegisterInfoMetrics to have this
// exported.
func SetBuildInfo(service string) {
	registry.setBuildInfo(service)
}
// RegisterInfoMetrics adds the package-level registry of non-view based
// metrics to the global metric producer manager so it is available for export.
func RegisterInfoMetrics() {
	metricproducer.GlobalManager().AddProducer(registry.registry)
}
// setConfigChecksum sets the config_checksum_int64 gauge entry labeled with
// service to checksum. It is a no-op when the gauge was never registered, and
// it logs and returns when the entry cannot be obtained.
func (r *metricRegistry) setConfigChecksum(service string, checksum int64) {
	if r.configChecksum == nil {
		return
	}

	m, err := r.configChecksum.GetEntry(metricdata.NewLabelValue(service))
	if err != nil {
		log.Error().Err(err).Msg("internal/metrics: failed to get config checksum metric")
		// Without an entry there is nothing to set; bail out instead of
		// dereferencing a nil entry below.
		return
	}

	m.Set(checksum)
}
// SetConfigChecksum sets the configuration checksum metric for service to
// checksum by delegating to the package-level registry. You must call
// RegisterInfoMetrics to have this exported.
func SetConfigChecksum(service string, checksum int64) {
	registry.setConfigChecksum(service, checksum)
}
// addPolicyCountCallback upserts f as the value source of the
// policy_count_total derived gauge entry labeled with service. It is a no-op
// when the gauge was never registered; an upsert failure is only logged.
func (r *metricRegistry) addPolicyCountCallback(service string, f func() int64) {
	if r.policyCount == nil {
		return
	}

	serviceLabel := metricdata.NewLabelValue(service)
	if err := r.policyCount.UpsertEntry(f, serviceLabel); err != nil {
		log.Error().Err(err).Msg("internal/metrics: failed to get policy count metric")
	}
}
// AddPolicyCountCallback sets the function to call when exporting the
// policy count metric for service, by delegating to the package-level
// registry. You must call RegisterInfoMetrics to have this exported.
func AddPolicyCountCallback(service string, f func() int64) {
	registry.addPolicyCountCallback(service, f)
}

View file

@ -0,0 +1,85 @@
package metrics // import "github.com/pomerium/pomerium/internal/metrics"
import (
"runtime"
"testing"
"github.com/pomerium/pomerium/internal/version"
"go.opencensus.io/metric/metricdata"
"go.opencensus.io/metric/metricproducer"
)
// Test_SetConfigInfo checks the rows produced in the two config reload views
// after a successful and after a failed SetConfigInfo call. An empty expected
// string means the view is expected to hold no rows.
func Test_SetConfigInfo(t *testing.T) {
	type testCase struct {
		name                  string
		success               bool
		checksum              string
		wantLastReload        string
		wantLastReloadSuccess string
	}

	cases := []testCase{
		{
			name:                  "success",
			success:               true,
			checksum:              "abcde",
			wantLastReload:        "{ { {service test_service} }&{1.",
			wantLastReloadSuccess: "{ { {service test_service} }&{1} }",
		},
		{
			name:                  "failed",
			success:               false,
			checksum:              "abcde",
			wantLastReload:        "",
			wantLastReloadSuccess: "{ { }&{0} }",
		},
	}

	for i := range cases {
		tc := cases[i]
		t.Run(tc.name, func(t *testing.T) {
			// Re-register the views so every case starts from empty data.
			UnRegisterView(InfoViews)
			RegisterView(InfoViews)

			SetConfigInfo("test_service", tc.success, tc.checksum)

			testDataRetrieval(ConfigLastReloadView, t, tc.wantLastReload)
			testDataRetrieval(ConfigLastReloadSuccessView, t, tc.wantLastReloadSuccess)
		})
	}
}
// Test_SetBuildInfo checks that SetBuildInfo publishes a build_info metric
// with value 1 and the service, version, revision and Go version labels.
func Test_SetBuildInfo(t *testing.T) {
	// Start from a fresh registry so earlier tests cannot leak entries.
	registry = newMetricRegistry()

	version.Version = "v0.0.1"
	version.GitCommit = "deadbeef"

	expectedLabels := []metricdata.LabelValue{
		metricdata.NewLabelValue("test_service"),
		metricdata.NewLabelValue(version.FullVersion()),
		metricdata.NewLabelValue(version.GitCommit),
		metricdata.NewLabelValue(runtime.Version()),
	}

	SetBuildInfo("test_service")

	got := registry.registry.Read()
	testMetricRetrieval(got, t, expectedLabels, 1, "build_info")
}
// Test_AddPolicyCountCallback checks that the value returned by a registered
// callback is what the policy_count_total metric reports for the service.
func Test_AddPolicyCountCallback(t *testing.T) {
	// Start from a fresh registry so earlier tests cannot leak entries.
	registry = newMetricRegistry()

	const expected int64 = 42
	expectedLabels := []metricdata.LabelValue{metricdata.NewLabelValue("test_service")}

	AddPolicyCountCallback("test_service", func() int64 { return expected })

	got := registry.registry.Read()
	testMetricRetrieval(got, t, expectedLabels, expected, "policy_count_total")
}
// Test_SetConfigChecksum checks that SetConfigChecksum publishes the given
// value in the config_checksum_int64 metric for the service.
func Test_SetConfigChecksum(t *testing.T) {
	// Start from a fresh registry so earlier tests cannot leak entries.
	registry = newMetricRegistry()

	const expected int64 = 42
	expectedLabels := []metricdata.LabelValue{metricdata.NewLabelValue("test_service")}

	SetConfigChecksum("test_service", expected)

	got := registry.registry.Read()
	testMetricRetrieval(got, t, expectedLabels, expected, "config_checksum_int64")
}
// Test_RegisterInfoMetrics checks that calling RegisterInfoMetrics twice
// leaves exactly two producers in the global manager.
func Test_RegisterInfoMetrics(t *testing.T) {
	manager := metricproducer.GlobalManager()
	manager.DeleteProducer(registry.registry)

	// Make sure registration de-dupes on multiple calls
	for i := 0; i < 2; i++ {
		RegisterInfoMetrics()
	}

	// NOTE(review): the expected count of 2 presumably includes a producer
	// registered elsewhere (not visible here) — confirm.
	if producers := metricproducer.GlobalManager().GetAll(); len(producers) != 2 {
		t.Error("Did not find enough registries")
	}
}

View file

@ -5,7 +5,6 @@ import (
) )
var ( var (
// keyStatus tag.Key = tag.MustNewKey("status")
keyHTTPMethod tag.Key = tag.MustNewKey("http_method") keyHTTPMethod tag.Key = tag.MustNewKey("http_method")
keyService tag.Key = tag.MustNewKey("service") keyService tag.Key = tag.MustNewKey("service")
keyGRPCService tag.Key = tag.MustNewKey("grpc_service") keyGRPCService tag.Key = tag.MustNewKey("grpc_service")

View file

@ -14,6 +14,8 @@ var (
GRPCClientViews = []*view.View{GRPCClientRequestCountView, GRPCClientRequestDurationView, GRPCClientResponseSizeView, GRPCClientRequestSizeView} GRPCClientViews = []*view.View{GRPCClientRequestCountView, GRPCClientRequestDurationView, GRPCClientResponseSizeView, GRPCClientRequestSizeView}
// GRPCServerViews contains opencensus views for GRPC Server metrics // GRPCServerViews contains opencensus views for GRPC Server metrics
GRPCServerViews = []*view.View{GRPCServerRequestCountView, GRPCServerRequestDurationView, GRPCServerResponseSizeView, GRPCServerRequestSizeView} GRPCServerViews = []*view.View{GRPCServerRequestCountView, GRPCServerRequestDurationView, GRPCServerResponseSizeView, GRPCServerRequestSizeView}
// InfoViews contains opencensus views for Info metrics
InfoViews = []*view.View{ConfigLastReloadView, ConfigLastReloadSuccessView}
) )
// RegisterView registers one of the defined metrics views. It must be called for metrics to see metrics // RegisterView registers one of the defined metrics views. It must be called for metrics to see metrics

View file

@ -158,7 +158,9 @@ func New(opts config.Options) (*Proxy, error) {
if err := p.UpdatePolicies(&opts); err != nil { if err := p.UpdatePolicies(&opts); err != nil {
return nil, err return nil, err
} }
metrics.AddPolicyCountCallback("proxy", func() int64 {
return int64(len(p.routeConfigs))
})
p.AuthenticateClient, err = clients.NewAuthenticateClient("grpc", p.AuthenticateClient, err = clients.NewAuthenticateClient("grpc",
&clients.Options{ &clients.Options{
Addr: opts.AuthenticateURL, Addr: opts.AuthenticateURL,