zero/telemetry: internal envoy stats scraper and metrics producer (#5136)
parent c3534df885
commit c1dec06afa

19 changed files with 667 additions and 301 deletions
@@ -132,5 +132,19 @@ func (b *Builder) buildOutboundRoutes() []*envoy_config_route_v3.Route {
 			})
 		}
 	}
+	routes = append(routes, &envoy_config_route_v3.Route{
+		Name: "envoy-metrics",
+		Match: &envoy_config_route_v3.RouteMatch{
+			PathSpecifier: &envoy_config_route_v3.RouteMatch_Prefix{Prefix: "/envoy/stats/prometheus"},
+		},
+		Action: &envoy_config_route_v3.Route_Route{
+			Route: &envoy_config_route_v3.RouteAction{
+				ClusterSpecifier: &envoy_config_route_v3.RouteAction_Cluster{
+					Cluster: envoyAdminClusterName,
+				},
+				PrefixRewrite: "/stats/prometheus",
+			},
+		},
+	})
 	return routes
 }
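The route above exposes Envoy's Prometheus stats on the outbound listener: a request to /envoy/stats/prometheus is rewritten to /stats/prometheus and forwarded to the pomerium-envoy-admin cluster. A minimal sketch (not part of this commit) of scraping that endpoint; the port is hypothetical and in practice comes from the bootstrap configuration's outbound port (see getEnvoyScrapeURL further down in this diff):

package main

import (
	"fmt"
	"io"
	"net/http"
)

func main() {
	// Hypothetical outbound listener address; Pomerium derives the real one
	// from its bootstrap configuration.
	resp, err := http.Get("http://localhost:43927/envoy/stats/prometheus")
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()
	body, _ := io.ReadAll(resp.Body)
	fmt.Println(string(body)) // Prometheus text exposition format
}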
@@ -61,6 +61,16 @@ func Test_buildOutboundRoutes(t *testing.T) {
 				"idleTimeout": "0s",
 				"timeout": "0s"
 			}
+		},
+		{
+			"match": {
+				"prefix": "/envoy/stats/prometheus"
+			},
+			"name": "envoy-metrics",
+			"route": {
+				"cluster": "pomerium-envoy-admin",
+				"prefixRewrite": "/stats/prometheus"
+			}
 		}
 	]`, routes)
 }
go.mod (1 addition)

@@ -192,6 +192,7 @@ require (
 	github.com/spf13/afero v1.11.0 // indirect
 	github.com/spf13/cast v1.6.0 // indirect
 	github.com/spf13/pflag v1.0.5 // indirect
+	github.com/stretchr/objx v0.5.2 // indirect
 	github.com/subosito/gotenv v1.6.0 // indirect
 	github.com/tchap/go-patricia/v2 v2.3.1 // indirect
 	github.com/tinylib/msgp v1.1.2 // indirect
@@ -2,16 +2,27 @@ package retry
 
 import (
 	"context"
+	"strings"
+	"time"
 
 	"github.com/cenkalti/backoff/v4"
+
+	"github.com/pomerium/pomerium/internal/log"
 )
 
+type serviceName struct{}
+
 // WithBackoff retries the given function with an exponential backoff,
 // stopping when the context is done or the function returns a terminal error.
-func WithBackoff(ctx context.Context, fn func(context.Context) error) error {
+func WithBackoff(ctx context.Context, name string, fn func(context.Context) error) error {
+	name, ctx = getServiceNameContext(ctx, name)
+
+	log.Debug(ctx).Str("service-name", name).Msg("starting")
+	defer log.Debug(ctx).Str("service-name", name).Msg("stopped")
+
 	b := backoff.NewExponentialBackOff()
 	b.MaxElapsedTime = 0
-	return backoff.Retry(
+	return backoff.RetryNotify(
 		func() error {
 			err := fn(ctx)
 			if IsTerminalError(err) {
@@ -20,5 +31,18 @@ func WithBackoff(ctx context.Context, fn func(context.Context) error) error {
 			return err
 		},
 		backoff.WithContext(b, ctx),
+		func(err error, next time.Duration) {
+			log.Warn(ctx).Err(err).Str("service-name", name).Dur("next", next).Msg("retrying")
+		},
 	)
 }
+
+func getServiceNameContext(ctx context.Context, name string) (string, context.Context) {
+	names, ok := ctx.Value(serviceName{}).([]string)
+	if ok {
+		names = append(names, name)
+	} else {
+		names = []string{name}
+	}
+	return strings.Join(names, "."), context.WithValue(ctx, serviceName{}, names)
+}
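For reference, a minimal usage sketch of the updated WithBackoff signature (it would need to live inside the pomerium module, since internal/retry is an internal package); doSync is a hypothetical stand-in:

package example

import (
	"context"

	"github.com/pomerium/pomerium/internal/retry"
)

// doSync is a hypothetical stand-in for work that may fail transiently.
func doSync(ctx context.Context) error { return nil }

func runWithRetry(ctx context.Context) error {
	// The service name is logged on start/stop and on every retry; nested
	// WithBackoff calls join their names with ".", e.g. "zero-reconciler.sync".
	return retry.WithBackoff(ctx, "zero-reconciler", func(ctx context.Context) error {
		return retry.WithBackoff(ctx, "sync", doSync)
	})
}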
internal/telemetry/prometheus/producer.go (new file, 147 lines)

@@ -0,0 +1,147 @@
package prometheus

import (
	"context"
	"fmt"
	"net/http"
	"sync/atomic"
	"time"

	"go.opentelemetry.io/otel/sdk/instrumentation"
	"go.opentelemetry.io/otel/sdk/metric/metricdata"
)

type producerConfig struct {
	client    *http.Client
	scrapeURL string
	scope     instrumentation.Scope
	startTime time.Time
	metrics   map[string]struct{}
	labels    map[string]struct{}
}

type ProducerOption func(*producerConfig)

func WithClient(client *http.Client) ProducerOption {
	return func(cfg *producerConfig) {
		cfg.client = client
	}
}

func WithScope(scope instrumentation.Scope) ProducerOption {
	return func(cfg *producerConfig) {
		cfg.scope = scope
	}
}

func WithStartTime(startTime time.Time) ProducerOption {
	return func(cfg *producerConfig) {
		cfg.startTime = startTime
	}
}

func WithIncludeMetrics(metrics ...string) ProducerOption {
	return func(cfg *producerConfig) {
		if cfg.metrics == nil {
			cfg.metrics = make(map[string]struct{}, len(metrics))
		}
		for _, metric := range metrics {
			cfg.metrics[metric] = struct{}{}
		}
	}
}

func WithIncludeLabels(labels ...string) ProducerOption {
	return func(cfg *producerConfig) {
		if cfg.labels == nil {
			cfg.labels = make(map[string]struct{}, len(labels))
		}
		for _, label := range labels {
			cfg.labels[label] = struct{}{}
		}
	}
}

func WithScrapeURL(scrapeURL string) ProducerOption {
	return func(cfg *producerConfig) {
		cfg.scrapeURL = scrapeURL
	}
}

func newProducerConfig(opts ...ProducerOption) *producerConfig {
	cfg := &producerConfig{
		client: http.DefaultClient,
	}
	for _, opt := range opts {
		opt(cfg)
	}
	return cfg
}

type Producer struct {
	producerConfig atomic.Value
}

func NewProducer(opts ...ProducerOption) *Producer {
	cfg := newProducerConfig(opts...)

	p := new(Producer)
	p.setConfig(cfg)
	return p
}

func (p *Producer) UpdateConfig(opts ...ProducerOption) {
	cfg := *p.loadConfig()
	for _, opt := range opts {
		opt(&cfg)
	}
	p.setConfig(&cfg)
}

func (p *Producer) Produce(ctx context.Context) ([]metricdata.ScopeMetrics, error) {
	cfg := p.loadConfig()

	if len(cfg.metrics) == 0 {
		return nil, nil
	}

	req, err := http.NewRequestWithContext(ctx, http.MethodGet, cfg.scrapeURL, nil)
	if err != nil {
		return nil, fmt.Errorf("failed to create request: %w", err)
	}
	resp, err := cfg.client.Do(req)
	if err != nil {
		return nil, fmt.Errorf("failed to do request: %w", err)
	}
	defer resp.Body.Close()
	metrics, err := ToOTLP(resp.Body, filter(cfg.metrics), filter(cfg.labels), cfg.startTime, time.Now())
	if err != nil {
		return nil, fmt.Errorf("failed to convert metrics to OTLP: %w", err)
	}
	return []metricdata.ScopeMetrics{
		{
			Scope:   cfg.scope,
			Metrics: metrics,
		},
	}, nil
}

func (p *Producer) setConfig(cfg *producerConfig) {
	p.producerConfig.Store(cfg)
}

func (p *Producer) loadConfig() *producerConfig {
	return p.producerConfig.Load().(*producerConfig)
}

func filter(src map[string]struct{}) func(k string) (string, bool) {
	return func(k string) (string, bool) {
		if len(src) == 0 {
			return k, true
		}
		if _, ok := src[k]; ok {
			return k, true
		}
		return "", false
	}
}
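A short usage sketch of the new producer (it would need to live inside the pomerium module, since the package is internal): register it with an OpenTelemetry manual reader, the same way the zero telemetry reporter does. The helper name and the metric/label names below are illustrative, not part of the commit.

package example

import (
	"time"

	"go.opentelemetry.io/otel/sdk/instrumentation"
	sdkmetric "go.opentelemetry.io/otel/sdk/metric"

	"github.com/pomerium/pomerium/internal/telemetry/prometheus"
)

// newEnvoyReader is a hypothetical helper wiring the scraper into a manual reader.
func newEnvoyReader(scrapeURL string) *sdkmetric.ManualReader {
	p := prometheus.NewProducer(
		prometheus.WithScope(instrumentation.Scope{Name: "envoy"}),
		prometheus.WithScrapeURL(scrapeURL),
		prometheus.WithStartTime(time.Now()),
	)
	// Until an include list is configured, Produce returns no data, so the
	// producer is cheap to register up front and configure later.
	p.UpdateConfig(
		prometheus.WithIncludeMetrics("envoy_cluster_upstream_rq_total"), // illustrative names
		prometheus.WithIncludeLabels("envoy_cluster_name"),
	)
	return sdkmetric.NewManualReader(sdkmetric.WithProducer(p))
}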
@@ -3,6 +3,9 @@ package mux
 import (
 	"context"
 	"fmt"
+	"strings"
+
+	"google.golang.org/protobuf/encoding/protojson"
 
 	"github.com/pomerium/pomerium/internal/zero/apierror"
 	"github.com/pomerium/pomerium/pkg/zero/connect"
@@ -64,6 +67,19 @@ type message struct {
 	*connect.Message
 }
 
+func (msg message) String() string {
+	var b strings.Builder
+	if msg.stateChange != nil {
+		b.WriteString("stateChange: ")
+		b.WriteString(string(*msg.stateChange))
+	}
+	if msg.Message != nil {
+		b.WriteString("message: ")
+		b.WriteString(protojson.Format(msg.Message))
+	}
+	return b.String()
+}
+
 type stateChange string
 
 const (
@@ -5,20 +5,24 @@ import (
 	"context"
 	"errors"
 	"fmt"
+	"net"
+	"net/url"
+	"time"
 
 	"github.com/rs/zerolog"
 	"golang.org/x/sync/errgroup"
 
 	"github.com/pomerium/pomerium/internal/log"
+	"github.com/pomerium/pomerium/internal/retry"
 	sdk "github.com/pomerium/pomerium/internal/zero/api"
 	"github.com/pomerium/pomerium/internal/zero/bootstrap"
 	"github.com/pomerium/pomerium/internal/zero/bootstrap/writers"
-	connect_mux "github.com/pomerium/pomerium/internal/zero/connect-mux"
+	"github.com/pomerium/pomerium/internal/zero/healthcheck"
 	"github.com/pomerium/pomerium/internal/zero/reconciler"
-	"github.com/pomerium/pomerium/internal/zero/telemetry/reporter"
+	"github.com/pomerium/pomerium/internal/zero/telemetry"
+	"github.com/pomerium/pomerium/internal/zero/telemetry/sessions"
 	"github.com/pomerium/pomerium/pkg/cmd/pomerium"
 	"github.com/pomerium/pomerium/pkg/grpc/databroker"
-	"github.com/pomerium/pomerium/pkg/zero/connect"
 )
 
 // Run runs Pomerium is managed mode using the provided token.
@@ -68,7 +72,6 @@ type controller struct {
 	api *sdk.API
 
 	bootstrapConfig *bootstrap.Source
-	telemetryReporter *reporter.Reporter
 }
 
 func (c *controller) initAPI(ctx context.Context) error {
@@ -128,47 +131,70 @@ func (c *controller) runZeroControlLoop(ctx context.Context) error {
 		return fmt.Errorf("waiting for config source to be ready: %w", err)
 	}
 
-	r := c.NewDatabrokerRestartRunner(ctx)
+	r := NewDatabrokerRestartRunner(ctx, c.bootstrapConfig)
 	defer r.Close()
 
-	err = c.initTelemetry(ctx, func() (databroker.DataBrokerServiceClient, error) {
-		client, _, err := r.getDatabrokerClient()
-		return client, err
-	})
+	var leaseStatus LeaseStatus
+	tm, err := telemetry.New(ctx, c.api,
+		r.GetDatabrokerClient,
+		leaseStatus.HasLease,
+		c.getEnvoyScrapeURL(),
+	)
 	if err != nil {
 		return fmt.Errorf("init telemetry: %w", err)
 	}
-	defer c.shutdownTelemetry(ctx)
-
-	err = c.api.Watch(ctx, connect_mux.WithOnTelemetryRequested(func(ctx context.Context, _ *connect.TelemetryRequest) {
-		c.telemetryReporter.CollectAndExportMetrics(ctx)
-	}))
-	if err != nil {
-		return fmt.Errorf("watch telemetry: %w", err)
-	}
+	defer c.shutdownTelemetry(ctx, tm)
 
 	eg, ctx := errgroup.WithContext(ctx)
+	eg.Go(func() error { return tm.Run(ctx) })
 	eg.Go(func() error {
 		return r.Run(ctx,
 			WithLease(
 				c.runReconcilerLeased,
 				c.runSessionAnalyticsLeased,
-				c.enableSessionAnalyticsReporting,
-				c.runHealthChecksLeased,
+				c.runPeriodicHealthChecksLeased,
+				leaseStatus.MonitorLease,
 			),
 		)
 	})
-	eg.Go(func() error { return c.runTelemetryReporter(ctx) })
 	return eg.Wait()
 }
 
-func (c *controller) runReconcilerLeased(ctx context.Context, client databroker.DataBrokerServiceClient) error {
-	ctx = log.WithContext(ctx, func(c zerolog.Context) zerolog.Context {
-		return c.Str("service", "zero-reconciler")
-	})
+func (c *controller) shutdownTelemetry(ctx context.Context, tm *telemetry.Telemetry) {
+	ctx, cancel := context.WithTimeout(ctx, c.cfg.shutdownTimeout)
+	defer cancel()
 
+	err := tm.Shutdown(ctx)
+	if err != nil {
+		log.Ctx(ctx).Error().Err(err).Msg("error shutting down telemetry")
+	}
+}
+
+func (c *controller) runReconcilerLeased(ctx context.Context, client databroker.DataBrokerServiceClient) error {
+	return retry.WithBackoff(ctx, "zero-reconciler", func(ctx context.Context) error {
 		return reconciler.Run(ctx,
 			reconciler.WithAPI(c.api),
 			reconciler.WithDataBrokerClient(client),
 		)
+	})
+}
+
+func (c *controller) runSessionAnalyticsLeased(ctx context.Context, client databroker.DataBrokerServiceClient) error {
+	return retry.WithBackoff(ctx, "zero-analytics", func(ctx context.Context) error {
+		return sessions.Collect(ctx, client, time.Hour)
+	})
+}
+
+func (c *controller) runPeriodicHealthChecksLeased(ctx context.Context, client databroker.DataBrokerServiceClient) error {
+	return retry.WithBackoff(ctx, "zero-healthcheck", func(ctx context.Context) error {
+		return healthcheck.RunChecks(ctx, c.bootstrapConfig, client)
+	})
+}
+
+func (c *controller) getEnvoyScrapeURL() string {
+	return (&url.URL{
+		Scheme: "http",
+		Host:   net.JoinHostPort("localhost", c.bootstrapConfig.GetConfig().OutboundPort),
+		Path:   "/envoy/stats/prometheus",
+	}).String()
 }
@@ -9,13 +9,10 @@ import (
 	"net/url"
 	"sync"
 
-	"golang.org/x/sync/errgroup"
 	"google.golang.org/grpc"
 
 	"github.com/pomerium/pomerium/config"
-	"github.com/pomerium/pomerium/internal/log"
 	"github.com/pomerium/pomerium/internal/retry"
-	"github.com/pomerium/pomerium/internal/zero/bootstrap"
 	"github.com/pomerium/pomerium/pkg/grpc/databroker"
 	"github.com/pomerium/pomerium/pkg/grpcutil"
 )
@@ -23,9 +20,7 @@ import (
 // ErrBootstrapConfigurationChanged is returned when the bootstrap configuration has changed and the function needs to be restarted.
 var ErrBootstrapConfigurationChanged = errors.New("bootstrap configuration changed")
 
-type runner struct {
-	source *bootstrap.Source
-
+type DatabrokerRestartRunner struct {
 	lock   sync.RWMutex
 	cancel chan struct{}
 	conn   *grpc.ClientConn
@@ -33,31 +28,39 @@ type runner struct {
 	initError error
 }
 
-func (c *controller) NewDatabrokerRestartRunner(ctx context.Context) *runner {
-	p := &runner{
-		source: c.bootstrapConfig,
-	}
-	p.initLocked(ctx, c.bootstrapConfig.GetConfig())
-	c.bootstrapConfig.OnConfigChange(context.Background(), p.onConfigChange)
+// NewDatabrokerRestartRunner is a helper to run a function that needs to be restarted when the underlying databroker configuration changes.
+func NewDatabrokerRestartRunner(
+	ctx context.Context,
+	src config.Source,
+) *DatabrokerRestartRunner {
+	p := new(DatabrokerRestartRunner)
+	p.initLocked(ctx, src.GetConfig())
+	src.OnConfigChange(ctx, p.onConfigChange)
 	return p
 }
 
-type DbcFunc func(context.Context, databroker.DataBrokerServiceClient) error
-
-func (p *runner) Run(ctx context.Context, funcs ...DbcFunc) error {
-	return retry.WithBackoff(ctx, func(ctx context.Context) error { return p.runUntilDatabrokerChanges(ctx, funcs...) })
+func (p *DatabrokerRestartRunner) Run(
+	ctx context.Context,
+	fn func(context.Context, databroker.DataBrokerServiceClient) error,
+) error {
+	return retry.WithBackoff(ctx, "databroker-restart", func(ctx context.Context) error { return p.runUntilDatabrokerChanges(ctx, fn) })
 }
 
 // Close releases the resources used by the databroker provider.
-func (p *runner) Close() {
+func (p *DatabrokerRestartRunner) Close() {
 	p.lock.Lock()
 	defer p.lock.Unlock()
 
 	p.closeLocked()
 }
 
+func (p *DatabrokerRestartRunner) GetDatabrokerClient() (databroker.DataBrokerServiceClient, error) {
+	client, _, err := p.getDatabrokerClient()
+	return client, err
+}
+
 // GetDatabrokerClient returns the databroker client and a channel that will be closed when the client is no longer valid.
-func (p *runner) getDatabrokerClient() (databroker.DataBrokerServiceClient, <-chan struct{}, error) {
+func (p *DatabrokerRestartRunner) getDatabrokerClient() (databroker.DataBrokerServiceClient, <-chan struct{}, error) {
 	p.lock.RLock()
 	defer p.lock.RUnlock()
 
@@ -68,7 +71,7 @@ func (p *runner) getDatabrokerClient() (databroker.DataBrokerServiceClient, <-chan struct{}, error) {
 	return p.client, p.cancel, nil
 }
 
-func (p *runner) onConfigChange(ctx context.Context, cfg *config.Config) {
+func (p *DatabrokerRestartRunner) onConfigChange(ctx context.Context, cfg *config.Config) {
 	p.lock.Lock()
 	defer p.lock.Unlock()
 
@@ -76,7 +79,7 @@ func (p *runner) onConfigChange(ctx context.Context, cfg *config.Config) {
 	p.initLocked(ctx, cfg)
 }
 
-func (p *runner) initLocked(ctx context.Context, cfg *config.Config) {
+func (p *DatabrokerRestartRunner) initLocked(ctx context.Context, cfg *config.Config) {
 	conn, err := newDataBrokerConnection(ctx, cfg)
 	if err != nil {
 		p.initError = fmt.Errorf("databroker connection: %w", err)
@@ -89,7 +92,7 @@ func (p *runner) initLocked(ctx context.Context, cfg *config.Config) {
 	p.initError = nil
 }
 
-func (p *runner) closeLocked() {
+func (p *DatabrokerRestartRunner) closeLocked() {
 	if p.conn != nil {
 		p.conn.Close()
 		p.conn = nil
@@ -101,13 +104,10 @@ func (p *runner) closeLocked() {
 	p.initError = errors.New("databroker connection closed")
 }
 
-func (p *runner) runUntilDatabrokerChanges(
+func (p *DatabrokerRestartRunner) runUntilDatabrokerChanges(
 	ctx context.Context,
-	funcs ...DbcFunc,
+	fn func(context.Context, databroker.DataBrokerServiceClient) error,
 ) error {
-	log.Debug(ctx).Msg("starting")
-	defer log.Debug(ctx).Msg("stop")
-
 	client, cancelCh, err := p.getDatabrokerClient()
 	if err != nil {
 		return fmt.Errorf("get databroker client: %w", err)
@@ -120,18 +120,11 @@ func (p *runner) runUntilDatabrokerChanges(
 		select {
 		case <-ctx.Done():
 		case <-cancelCh:
-			log.Debug(ctx).Msg("bootstrap configuration changed, restarting...")
 			cancel(ErrBootstrapConfigurationChanged)
 		}
 	}()
 
-	eg, ctx := errgroup.WithContext(ctx)
-	for _, fn := range funcs {
-		eg.Go(func() error {
-			return retry.WithBackoff(ctx, func(ctx context.Context) error { return fn(ctx, client) })
-		})
-	}
-	return eg.Wait()
+	return fn(ctx, client)
 }
 
 func newDataBrokerConnection(ctx context.Context, cfg *config.Config) (*grpc.ClientConn, error) {
internal/zero/controller/databroker_restart_test.go (new file, 113 lines)

@@ -0,0 +1,113 @@
package controller_test

import (
	"context"
	"encoding/base64"
	"errors"
	"testing"

	"github.com/stretchr/testify/mock"
	"github.com/stretchr/testify/require"

	"github.com/pomerium/pomerium/config"
	"github.com/pomerium/pomerium/internal/zero/controller"
	"github.com/pomerium/pomerium/pkg/cryptutil"
	"github.com/pomerium/pomerium/pkg/grpc/databroker"
)

type mockConfigSource struct {
	mock.Mock
	config.Source
}

func (s *mockConfigSource) GetConfig() *config.Config {
	args := s.Called()
	return args.Get(0).(*config.Config)
}

func (s *mockConfigSource) OnConfigChange(ctx context.Context, cl config.ChangeListener) {
	s.Called(ctx, cl)
}

func TestDatabrokerRestart(t *testing.T) {
	t.Parallel()

	newConfig := func() *config.Config {
		return &config.Config{
			Options: &config.Options{
				SharedKey: base64.StdEncoding.EncodeToString(cryptutil.NewKey()),
			},
			GRPCPort: ":12345",
		}
	}

	t.Run("no error", func(t *testing.T) {
		t.Parallel()

		src := new(mockConfigSource)
		src.On("OnConfigChange", mock.Anything, mock.Anything).Once()
		src.On("GetConfig").Once().Return(newConfig())

		ctx := context.Background()
		r := controller.NewDatabrokerRestartRunner(ctx, src)
		defer r.Close()

		err := r.Run(ctx, func(_ context.Context, _ databroker.DataBrokerServiceClient) error {
			return nil
		})
		require.NoError(t, err)
	})
	t.Run("error, retry", func(t *testing.T) {
		t.Parallel()

		src := new(mockConfigSource)
		src.On("OnConfigChange", mock.Anything, mock.Anything).Once()
		src.On("GetConfig").Once().Return(newConfig())

		ctx := context.Background()
		r := controller.NewDatabrokerRestartRunner(ctx, src)
		defer r.Close()

		count := 0
		err := r.Run(ctx, func(_ context.Context, _ databroker.DataBrokerServiceClient) error {
			count++
			if count == 1 {
				return errors.New("simulated error")
			}
			return nil
		})
		require.NoError(t, err)
		require.Equal(t, 2, count)
	})
	t.Run("config changed, execution restarted", func(t *testing.T) {
		t.Parallel()

		src := new(mockConfigSource)
		var cl config.ChangeListener
		src.On("OnConfigChange", mock.Anything, mock.Anything).Once().Run(func(args mock.Arguments) {
			cl = args.Get(1).(config.ChangeListener)
		})
		src.On("GetConfig").Once().Return(newConfig())

		ctx := context.Background()
		r := controller.NewDatabrokerRestartRunner(ctx, src)
		defer r.Close()

		count := 0
		var clients [2]databroker.DataBrokerServiceClient
		err := r.Run(ctx, func(ctx context.Context, client databroker.DataBrokerServiceClient) error {
			clients[count] = client
			count++
			if count == 1 {
				cl(context.Background(), newConfig())
				<-ctx.Done()
				require.ErrorIs(t, context.Cause(ctx), controller.ErrBootstrapConfigurationChanged)
				return ctx.Err()
			}
			return nil
		})
		require.NoError(t, err)
		require.Equal(t, 2, count)
		require.NotEqual(t, clients[0], clients[1])
	})
}
|
@ -2,18 +2,17 @@ package controller
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"context"
|
"context"
|
||||||
|
"sync/atomic"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
"golang.org/x/sync/errgroup"
|
"golang.org/x/sync/errgroup"
|
||||||
|
|
||||||
"github.com/pomerium/pomerium/internal/log"
|
|
||||||
"github.com/pomerium/pomerium/internal/retry"
|
|
||||||
"github.com/pomerium/pomerium/pkg/grpc/databroker"
|
"github.com/pomerium/pomerium/pkg/grpc/databroker"
|
||||||
)
|
)
|
||||||
|
|
||||||
type leaser struct {
|
type leaser struct {
|
||||||
client databroker.DataBrokerServiceClient
|
client databroker.DataBrokerServiceClient
|
||||||
funcs []DbcFunc
|
funcs []func(context.Context, databroker.DataBrokerServiceClient) error
|
||||||
}
|
}
|
||||||
|
|
||||||
// GetDataBrokerServiceClient implements the databroker.LeaseHandler interface.
|
// GetDataBrokerServiceClient implements the databroker.LeaseHandler interface.
|
||||||
|
@ -23,20 +22,18 @@ func (c *leaser) GetDataBrokerServiceClient() databroker.DataBrokerServiceClient
|
||||||
|
|
||||||
// RunLeased implements the databroker.LeaseHandler interface.
|
// RunLeased implements the databroker.LeaseHandler interface.
|
||||||
func (c *leaser) RunLeased(ctx context.Context) error {
|
func (c *leaser) RunLeased(ctx context.Context) error {
|
||||||
log.Debug(ctx).Msg("leaser: running leased functions")
|
|
||||||
|
|
||||||
eg, ctx := errgroup.WithContext(ctx)
|
eg, ctx := errgroup.WithContext(ctx)
|
||||||
for _, fn := range c.funcs {
|
for _, fn := range c.funcs {
|
||||||
eg.Go(func() error {
|
fn := fn
|
||||||
return retry.WithBackoff(ctx, func(ctx context.Context) error { return fn(ctx, c.client) })
|
eg.Go(func() error { return fn(ctx, c.client) })
|
||||||
})
|
|
||||||
}
|
}
|
||||||
err := eg.Wait()
|
err := eg.Wait()
|
||||||
log.Debug(ctx).Err(err).Msg("leaser: done running leased functions")
|
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
func WithLease(funcs ...DbcFunc) DbcFunc {
|
func WithLease(
|
||||||
|
funcs ...func(context.Context, databroker.DataBrokerServiceClient) error,
|
||||||
|
) func(context.Context, databroker.DataBrokerServiceClient) error {
|
||||||
return func(ctx context.Context, client databroker.DataBrokerServiceClient) error {
|
return func(ctx context.Context, client databroker.DataBrokerServiceClient) error {
|
||||||
srv := &leaser{
|
srv := &leaser{
|
||||||
client: client,
|
client: client,
|
||||||
|
@ -46,3 +43,18 @@ func WithLease(funcs ...DbcFunc) DbcFunc {
|
||||||
return leaser.Run(ctx)
|
return leaser.Run(ctx)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
type LeaseStatus struct {
|
||||||
|
v atomic.Bool
|
||||||
|
}
|
||||||
|
|
||||||
|
func (w *LeaseStatus) HasLease() bool {
|
||||||
|
return w.v.Load()
|
||||||
|
}
|
||||||
|
|
||||||
|
func (w *LeaseStatus) MonitorLease(ctx context.Context, _ databroker.DataBrokerServiceClient) error {
|
||||||
|
w.v.Store(true)
|
||||||
|
<-ctx.Done()
|
||||||
|
w.v.Store(false)
|
||||||
|
return ctx.Err()
|
||||||
|
}
|
||||||
|
|
|
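A minimal sketch (not part of the diff) of how LeaseStatus and WithLease compose with the DatabrokerRestartRunner above; the function name and the worker literal are hypothetical stand-ins:

package controller

import (
	"context"

	"github.com/pomerium/pomerium/pkg/grpc/databroker"
)

// exampleLeasedRun is a hypothetical helper showing the composition.
func exampleLeasedRun(ctx context.Context, r *DatabrokerRestartRunner) error {
	var leaseStatus LeaseStatus

	// All functions passed to WithLease run only while this instance holds the
	// databroker lease; MonitorLease flips HasLease() to true for that window.
	leased := WithLease(
		leaseStatus.MonitorLease,
		func(ctx context.Context, client databroker.DataBrokerServiceClient) error {
			<-ctx.Done() // hypothetical leased worker
			return ctx.Err()
		},
	)

	// The runner restarts the leased group whenever the bootstrap
	// databroker configuration changes.
	return r.Run(ctx, leased)
}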
(deleted file, 79 lines)

@@ -1,79 +0,0 @@
package controller

import (
	"context"
	"fmt"
	"time"

	"github.com/rs/zerolog"
	"go.opentelemetry.io/otel/sdk/instrumentation"

	"github.com/pomerium/pomerium/internal/log"
	"github.com/pomerium/pomerium/internal/zero/healthcheck"
	"github.com/pomerium/pomerium/internal/zero/telemetry/reporter"
	"github.com/pomerium/pomerium/internal/zero/telemetry/sessions"
	"github.com/pomerium/pomerium/pkg/grpc/databroker"
	"github.com/pomerium/pomerium/pkg/health"
)

const (
	producerSessionAnalytics = "session-analytics"
)

func (c *controller) initTelemetry(ctx context.Context, clientProvider func() (databroker.DataBrokerServiceClient, error)) error {
	sessionMetricProducer := sessions.NewProducer(instrumentation.Scope{}, clientProvider)
	r, err := reporter.New(ctx, c.api.GetTelemetryConn(),
		reporter.WithProducer(producerSessionAnalytics, sessionMetricProducer),
	)
	if err != nil {
		return fmt.Errorf("error creating telemetry metrics reporter: %w", err)
	}
	c.telemetryReporter = r
	return nil
}

func (c *controller) shutdownTelemetry(ctx context.Context) {
	ctx, cancel := context.WithTimeout(context.WithoutCancel(ctx), c.cfg.shutdownTimeout)
	defer cancel()

	err := c.telemetryReporter.Shutdown(ctx)
	if err != nil {
		log.Warn(ctx).Err(err).Msg("telemetry reporter shutdown error")
	}
}

func (c *controller) runSessionAnalyticsLeased(ctx context.Context, client databroker.DataBrokerServiceClient) error {
	ctx = log.WithContext(ctx, func(c zerolog.Context) zerolog.Context {
		return c.Str("service", "zero-analytics")
	})

	return sessions.Collect(ctx, client, time.Hour)
}

// those metrics are cluster-wide, so we only enable their reporting when we have the lease
func (c *controller) enableSessionAnalyticsReporting(ctx context.Context, _ databroker.DataBrokerServiceClient) error {
	_ = c.telemetryReporter.SetMetricProducerEnabled(producerSessionAnalytics, true)
	defer func() { _ = c.telemetryReporter.SetMetricProducerEnabled(producerSessionAnalytics, false) }()

	<-ctx.Done()
	return nil
}

func (c *controller) runHealthChecksLeased(ctx context.Context, client databroker.DataBrokerServiceClient) error {
	ctx = log.WithContext(ctx, func(c zerolog.Context) zerolog.Context {
		return c.Str("service", "zero-health-checks")
	})

	return healthcheck.RunChecks(ctx, c.bootstrapConfig, client)
}

func (c *controller) runTelemetryReporter(ctx context.Context) error {
	health.SetProvider(c.telemetryReporter)
	defer health.SetProvider(nil)

	ctx = log.WithContext(ctx, func(c zerolog.Context) zerolog.Context {
		return c.Str("service", "zero-bootstrap")
	})

	return c.telemetryReporter.Run(ctx)
}
internal/zero/telemetry/metrics_producer.go (new file, 54 lines)

@@ -0,0 +1,54 @@
package telemetry

import (
	"context"
	"sync/atomic"

	"go.opentelemetry.io/otel/sdk/metric"
	"go.opentelemetry.io/otel/sdk/metric/metricdata"

	"github.com/pomerium/pomerium/internal/log"
)

// metricsProducer is a wrapper around a metric producer that can be enabled or disabled
type metricsProducer[P metric.Producer] struct {
	enabled  atomic.Bool
	name     string
	producer P
}

func newMetricsProducer[P metric.Producer](name string, p P) *metricsProducer[P] {
	return &metricsProducer[P]{
		name:     name,
		producer: p,
	}
}

// Produce wraps the underlying producer's Produce method and logs any errors,
// to prevent the error from blocking the export of other metrics.
// also checks if the producer is enabled before producing metrics
func (p *metricsProducer[P]) Produce(ctx context.Context) ([]metricdata.ScopeMetrics, error) {
	if enabled := p.enabled.Load(); !enabled {
		return nil, nil
	}

	data, err := p.producer.Produce(ctx)
	if err != nil {
		// we do not return the error here, as we do not want to block the export of other metrics
		log.Error(ctx).Err(err).Str("producer", p.name).Msg("failed to produce metrics")
		return nil, nil
	}
	return data, nil
}

func (p *metricsProducer[P]) SetEnabled(v bool) {
	p.enabled.Store(v)
}

func (p *metricsProducer[P]) Name() string {
	return p.name
}

func (p *metricsProducer[P]) Producer() P {
	return p.producer
}
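A short package-internal sketch (newMetricsProducer is unexported) of wrapping a producer so it can be toggled at runtime; the helper name, scrape URL, and metric name are illustrative:

package telemetry

import (
	"github.com/pomerium/pomerium/internal/telemetry/prometheus"
)

// exampleWrapEnvoyProducer is a hypothetical helper showing the wrapper in use.
func exampleWrapEnvoyProducer(scrapeURL string) *metricsProducer[*prometheus.Producer] {
	// A disabled producer contributes nothing to a collection cycle, so it can
	// be registered with the reporter before the cloud enables it.
	p := newMetricsProducer("envoy", prometheus.NewProducer(
		prometheus.WithScrapeURL(scrapeURL),
	))
	p.Producer().UpdateConfig( // typed access to the wrapped producer
		prometheus.WithIncludeMetrics("envoy_server_uptime"), // illustrative metric name
	)
	p.SetEnabled(true) // e.g. once an EnvoyMetricsRequest arrives
	return p
}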
|
@ -5,25 +5,20 @@ import (
|
||||||
)
|
)
|
||||||
|
|
||||||
type config struct {
|
type config struct {
|
||||||
producers map[string]*metricsProducer
|
producers []metric.Producer
|
||||||
}
|
}
|
||||||
|
|
||||||
type Option func(*config)
|
type Option func(*config)
|
||||||
|
|
||||||
// WithProducer adds a metric producer to the reporter
|
// WithProducer adds a metric producer to the reporter
|
||||||
func WithProducer(name string, p metric.Producer) Option {
|
func WithProducer(p metric.Producer) Option {
|
||||||
return func(c *config) {
|
return func(c *config) {
|
||||||
if _, ok := c.producers[name]; ok {
|
c.producers = append(c.producers, p)
|
||||||
panic("duplicate producer name " + name)
|
|
||||||
}
|
|
||||||
c.producers[name] = newProducer(name, p)
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func getConfig(opts ...Option) config {
|
func getConfig(opts ...Option) config {
|
||||||
c := config{
|
var c config
|
||||||
producers: make(map[string]*metricsProducer),
|
|
||||||
}
|
|
||||||
for _, opt := range opts {
|
for _, opt := range opts {
|
||||||
opt(&c)
|
opt(&c)
|
||||||
}
|
}
|
||||||
|
|
|
(deleted file, 47 lines)

@@ -1,47 +0,0 @@
package reporter

import (
	"context"
	"sync/atomic"

	"go.opentelemetry.io/otel/sdk/metric"
	"go.opentelemetry.io/otel/sdk/metric/metricdata"

	"github.com/pomerium/pomerium/internal/log"
)

type metricsProducer struct {
	enabled atomic.Bool
	name    string
	metric.Producer
}

func newProducer(name string, p metric.Producer) *metricsProducer {
	return &metricsProducer{
		name:     name,
		Producer: p,
	}
}

var _ metric.Producer = (*metricsProducer)(nil)

// Produce wraps the underlying producer's Produce method and logs any errors,
// to prevent the error from blocking the export of other metrics.
// also checks if the producer is enabled before producing metrics
func (p *metricsProducer) Produce(ctx context.Context) ([]metricdata.ScopeMetrics, error) {
	if enabled := p.enabled.Load(); !enabled {
		return nil, nil
	}

	data, err := p.Producer.Produce(ctx)
	if err != nil {
		log.Error(ctx).Err(err).Str("producer", p.name).Msg("failed to produce metrics")
		return nil, err
	}
	return data, nil
}

// SetEnabled sets the enabled state of the producer
func (p *metricsProducer) SetEnabled(v bool) {
	p.enabled.Store(v)
}
|
@ -6,38 +6,38 @@ import (
|
||||||
"fmt"
|
"fmt"
|
||||||
|
|
||||||
export_grpc "go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc"
|
export_grpc "go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc"
|
||||||
metric_sdk "go.opentelemetry.io/otel/sdk/metric"
|
"go.opentelemetry.io/otel/sdk/metric"
|
||||||
"go.opentelemetry.io/otel/sdk/metric/metricdata"
|
"go.opentelemetry.io/otel/sdk/metric/metricdata"
|
||||||
"go.opentelemetry.io/otel/sdk/resource"
|
"go.opentelemetry.io/otel/sdk/resource"
|
||||||
"google.golang.org/grpc"
|
"google.golang.org/grpc"
|
||||||
|
|
||||||
"github.com/pomerium/pomerium/internal/log"
|
|
||||||
"github.com/pomerium/pomerium/pkg/health"
|
|
||||||
)
|
)
|
||||||
|
|
||||||
type metricsReporter struct {
|
type metricsReporter struct {
|
||||||
exporter *export_grpc.Exporter
|
exporter *export_grpc.Exporter
|
||||||
resource *resource.Resource
|
resource *resource.Resource
|
||||||
reader *metric_sdk.ManualReader
|
reader *metric.ManualReader
|
||||||
producers map[string]*metricsProducer
|
producers []metric.Producer
|
||||||
singleTask
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func newMetricsReporter(
|
func newMetricsReporter(
|
||||||
ctx context.Context,
|
ctx context.Context,
|
||||||
conn *grpc.ClientConn,
|
conn *grpc.ClientConn,
|
||||||
resource *resource.Resource,
|
resource *resource.Resource,
|
||||||
producers map[string]*metricsProducer,
|
producers []metric.Producer,
|
||||||
) (*metricsReporter, error) {
|
) (*metricsReporter, error) {
|
||||||
exporter, err := export_grpc.New(ctx, export_grpc.WithGRPCConn(conn))
|
exporter, err := export_grpc.New(ctx, export_grpc.WithGRPCConn(conn))
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, fmt.Errorf("create exporter: %w", err)
|
return nil, fmt.Errorf("create exporter: %w", err)
|
||||||
}
|
}
|
||||||
readerOpts := make([]metric_sdk.ManualReaderOption, 0, len(producers))
|
readerOpts := make([]metric.ManualReaderOption, 0, len(producers))
|
||||||
for _, p := range producers {
|
for _, p := range producers {
|
||||||
readerOpts = append(readerOpts, metric_sdk.WithProducer(p))
|
readerOpts = append(readerOpts, metric.WithProducer(p))
|
||||||
}
|
}
|
||||||
reader := metric_sdk.NewManualReader(readerOpts...)
|
reader := metric.NewManualReader(readerOpts...)
|
||||||
|
_ = metric.NewMeterProvider(
|
||||||
|
metric.WithResource(resource),
|
||||||
|
metric.WithReader(reader),
|
||||||
|
)
|
||||||
return &metricsReporter{
|
return &metricsReporter{
|
||||||
exporter: exporter,
|
exporter: exporter,
|
||||||
resource: resource,
|
resource: resource,
|
||||||
|
@ -58,40 +58,16 @@ func (r *metricsReporter) Shutdown(ctx context.Context) error {
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (r *metricsReporter) SetMetricProducerEnabled(name string, enabled bool) error {
|
func (r *metricsReporter) CollectAndExportMetrics(ctx context.Context) error {
|
||||||
p, ok := r.producers[name]
|
|
||||||
if !ok {
|
|
||||||
return fmt.Errorf("producer %q not found", name)
|
|
||||||
}
|
|
||||||
p.SetEnabled(enabled)
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
|
|
||||||
func (r *metricsReporter) CollectAndExportMetrics(ctx context.Context) {
|
|
||||||
r.singleTask.Run(ctx, func(ctx context.Context) {
|
|
||||||
err := r.collectAndExport(ctx)
|
|
||||||
if errors.Is(err, ErrAnotherExecutionRequested) {
|
|
||||||
log.Warn(ctx).Msg("telemetry metrics were not sent, due to another execution requested")
|
|
||||||
return
|
|
||||||
}
|
|
||||||
if err != nil {
|
|
||||||
health.ReportError(health.CollectAndSendTelemetry, err)
|
|
||||||
} else {
|
|
||||||
health.ReportOK(health.CollectAndSendTelemetry)
|
|
||||||
}
|
|
||||||
})
|
|
||||||
}
|
|
||||||
|
|
||||||
func (r *metricsReporter) collectAndExport(ctx context.Context) error {
|
|
||||||
rm := &metricdata.ResourceMetrics{
|
rm := &metricdata.ResourceMetrics{
|
||||||
Resource: r.resource,
|
Resource: r.resource,
|
||||||
}
|
}
|
||||||
err := withBackoff(ctx, "collect metrics", func(ctx context.Context) error { return r.reader.Collect(ctx, rm) })
|
err := r.reader.Collect(ctx, rm)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return fmt.Errorf("collect metrics: %w", err)
|
return fmt.Errorf("collect metrics: %w", err)
|
||||||
}
|
}
|
||||||
|
|
||||||
err = withBackoff(ctx, "export metrics", func(ctx context.Context) error { return r.exporter.Export(ctx, rm) })
|
err = r.exporter.Export(ctx, rm)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return fmt.Errorf("export metrics: %w", err)
|
return fmt.Errorf("export metrics: %w", err)
|
||||||
}
|
}
|
||||||
|
|
|
@@ -5,16 +5,14 @@ import (
 	"context"
 	"fmt"
 	"os"
-	"time"
 
-	"github.com/cenkalti/backoff/v4"
 	"go.opentelemetry.io/otel/attribute"
 	"go.opentelemetry.io/otel/sdk/resource"
 	semconv "go.opentelemetry.io/otel/semconv/v1.4.0"
 	"golang.org/x/sync/errgroup"
 	"google.golang.org/grpc"
 
-	"github.com/pomerium/pomerium/internal/log"
+	"github.com/pomerium/pomerium/internal/retry"
 	"github.com/pomerium/pomerium/internal/version"
 )
 
@@ -52,8 +50,8 @@ func New(
 func (r *Reporter) Run(ctx context.Context) error {
 	eg, ctx := errgroup.WithContext(ctx)
 
-	eg.Go(func() error { return withBackoff(ctx, "metrics reporter", r.metricsReporter.Run) })
-	eg.Go(func() error { return withBackoff(ctx, "health check reporter", r.healthCheckReporter.Run) })
+	eg.Go(func() error { return retry.WithBackoff(ctx, "metrics reporter", r.metricsReporter.Run) })
+	eg.Go(func() error { return retry.WithBackoff(ctx, "health check reporter", r.healthCheckReporter.Run) })
 
 	return eg.Wait()
 }
@@ -81,19 +79,3 @@ func getResource() *resource.Resource {
 
 	return resource.NewSchemaless(attr...)
 }
-
-func withBackoff(ctx context.Context, name string, f func(context.Context) error) error {
-	bo := backoff.NewExponentialBackOff()
-	bo.MaxElapsedTime = 0
-	return backoff.RetryNotify(
-		func() error { return f(ctx) },
-		backoff.WithContext(bo, ctx),
-		func(err error, d time.Duration) {
-			log.Warn(ctx).
-				Str("name", name).
-				Err(err).
-				Dur("backoff", d).
-				Msg("retrying")
-		},
-	)
-}
(deleted file, 27 lines)

@@ -1,27 +0,0 @@
package reporter

import (
	"context"
	"errors"
	"sync"
)

type singleTask struct {
	lock   sync.Mutex
	cancel context.CancelCauseFunc
}

var ErrAnotherExecutionRequested = errors.New("another execution requested")

func (s *singleTask) Run(ctx context.Context, f func(context.Context)) {
	s.lock.Lock()
	defer s.lock.Unlock()

	if s.cancel != nil {
		s.cancel(ErrAnotherExecutionRequested)
	}

	ctx, cancel := context.WithCancelCause(ctx)
	s.cancel = cancel
	go f(ctx)
}
@@ -6,14 +6,16 @@ import (
 	"time"
 
 	"go.opentelemetry.io/otel/sdk/instrumentation"
-	"go.opentelemetry.io/otel/sdk/metric"
 	"go.opentelemetry.io/otel/sdk/metric/metricdata"
 	"golang.org/x/sync/errgroup"
+	"google.golang.org/grpc/codes"
+	"google.golang.org/grpc/status"
 
 	"github.com/pomerium/pomerium/pkg/grpc/databroker"
+	"github.com/pomerium/pomerium/pkg/slices"
 )
 
-type producer struct {
+type Producer struct {
 	scope          instrumentation.Scope
 	clientProvider func() (databroker.DataBrokerServiceClient, error)
 }
@@ -21,14 +23,14 @@ type producer struct {
 func NewProducer(
 	scope instrumentation.Scope,
 	clientProvider func() (databroker.DataBrokerServiceClient, error),
-) metric.Producer {
-	return &producer{
+) *Producer {
+	return &Producer{
 		clientProvider: clientProvider,
 		scope:          scope,
 	}
 }
 
-func (p *producer) Produce(ctx context.Context) ([]metricdata.ScopeMetrics, error) {
+func (p *Producer) Produce(ctx context.Context) ([]metricdata.ScopeMetrics, error) {
 	client, err := p.clientProvider()
 	if err != nil {
 		return nil, fmt.Errorf("error getting client: %w", err)
@@ -43,6 +45,9 @@ func (p *producer) Produce(ctx context.Context) ([]metricdata.ScopeMetrics, error) {
 		eg.Go(func() error {
 			state, err := LoadMetricState(ctx, client, ids[i])
 			if err != nil {
+				if status.Code(err) == codes.NotFound {
+					return nil
+				}
 				return err
 			}
 			metrics[i] = metricdata.Metrics{
@@ -66,6 +71,11 @@ func (p *producer) Produce(ctx context.Context) ([]metricdata.ScopeMetrics, error) {
 		return nil, err
 	}
 
+	metrics = slices.Filter(metrics, func(v metricdata.Metrics) bool { return v.Name != "" })
+	if len(metrics) == 0 {
+		return nil, nil
+	}
+
 	return []metricdata.ScopeMetrics{
 		{
 			Scope: p.scope,
internal/zero/telemetry/telemetry.go (new file, 146 lines)

@@ -0,0 +1,146 @@
package telemetry

import (
	"context"
	"fmt"
	"time"

	"github.com/rs/zerolog"
	"go.opentelemetry.io/otel/sdk/instrumentation"
	"golang.org/x/sync/errgroup"

	"github.com/pomerium/pomerium/internal/log"
	"github.com/pomerium/pomerium/internal/telemetry/prometheus"
	sdk "github.com/pomerium/pomerium/internal/zero/api"
	connect_mux "github.com/pomerium/pomerium/internal/zero/connect-mux"
	"github.com/pomerium/pomerium/internal/zero/telemetry/reporter"
	"github.com/pomerium/pomerium/internal/zero/telemetry/sessions"
	"github.com/pomerium/pomerium/pkg/grpc/databroker"
	"github.com/pomerium/pomerium/pkg/health"
	"github.com/pomerium/pomerium/pkg/zero/connect"
)

type Telemetry struct {
	api      *sdk.API
	reporter *reporter.Reporter

	envoyMetrics           *metricsProducer[*prometheus.Producer]
	sessionMetrics         *metricsProducer[*sessions.Producer]
	hasSessionMetricsLease func() bool
}

func New(
	ctx context.Context,
	api *sdk.API,
	clientProvider func() (databroker.DataBrokerServiceClient, error),
	hasSessionMetricsLease func() bool,
	envoyScrapeURL string,
) (*Telemetry, error) {
	startTime := time.Now()

	sessionMetricProducer := newMetricsProducer("sessions", buildSessionMetricsProducer(clientProvider))
	envoyMetricProducer := newMetricsProducer("envoy", buildEnvoyMetricsProducer(envoyScrapeURL, startTime))

	r, err := reporter.New(ctx, api.GetTelemetryConn(),
		reporter.WithProducer(sessionMetricProducer),
		reporter.WithProducer(envoyMetricProducer),
	)
	if err != nil {
		return nil, fmt.Errorf("error creating telemetry metrics reporter: %w", err)
	}

	return &Telemetry{
		api:                    api,
		reporter:               r,
		sessionMetrics:         sessionMetricProducer,
		envoyMetrics:           envoyMetricProducer,
		hasSessionMetricsLease: hasSessionMetricsLease,
	}, nil
}

func (srv *Telemetry) Shutdown(ctx context.Context) error {
	return srv.reporter.Shutdown(ctx)
}

func (srv *Telemetry) Run(ctx context.Context) error {
	ctx = log.WithContext(ctx, func(c zerolog.Context) zerolog.Context {
		return c.Str("service", "telemetry-reporter")
	})

	eg, ctx := errgroup.WithContext(ctx)
	eg.Go(func() error {
		health.SetProvider(srv.reporter)
		defer health.SetProvider(nil)
		return srv.reporter.Run(ctx)
	})
	eg.Go(func() error { return srv.handleRequests(ctx) })
	return eg.Wait()
}

// handleRequests watches for telemetry requests as they are received from the cloud control plane and processes them.
func (srv *Telemetry) handleRequests(ctx context.Context) error {
	requests := make(chan *connect.TelemetryRequest, 1)

	eg, ctx := errgroup.WithContext(ctx)
	eg.Go(func() error {
		return srv.api.Watch(ctx, connect_mux.WithOnTelemetryRequested(
			func(ctx context.Context, req *connect.TelemetryRequest) {
				select {
				case requests <- req:
				default:
					log.Warn(ctx).Msg("dropping telemetry request")
				}
			}))
	})
	eg.Go(func() error {
		for {
			select {
			case req := <-requests:
				srv.handleRequest(ctx, req)
			case <-ctx.Done():
				return ctx.Err()
			}
		}
	})
	return eg.Wait()
}

func (srv *Telemetry) handleRequest(ctx context.Context, req *connect.TelemetryRequest) {
	srv.configureEnvoyMetricsProducer(req.GetEnvoyMetrics())
	srv.configureSessionMetricsProducer(req.GetSessionAnalytics())

	err := srv.reporter.CollectAndExportMetrics(ctx)
	if err != nil {
		health.ReportError(health.CollectAndSendTelemetry, err)
	} else {
		health.ReportOK(health.CollectAndSendTelemetry)
	}
}

func buildSessionMetricsProducer(clientProvider func() (databroker.DataBrokerServiceClient, error)) *sessions.Producer {
	return sessions.NewProducer(instrumentation.Scope{Name: "pomerium-cluster"}, clientProvider)
}

func buildEnvoyMetricsProducer(scrapeURL string, startTime time.Time) *prometheus.Producer {
	return prometheus.NewProducer(
		prometheus.WithScope(instrumentation.Scope{Name: "envoy"}),
		prometheus.WithScrapeURL(scrapeURL),
		prometheus.WithStartTime(startTime),
	)
}

func (srv *Telemetry) configureSessionMetricsProducer(req *connect.SessionAnalyticsRequest) {
	srv.sessionMetrics.SetEnabled(req != nil && srv.hasSessionMetricsLease())
}

func (srv *Telemetry) configureEnvoyMetricsProducer(req *connect.EnvoyMetricsRequest) {
	if req == nil {
		srv.envoyMetrics.SetEnabled(false)
		return
	}
	srv.envoyMetrics.Producer().UpdateConfig(
		prometheus.WithIncludeMetrics(req.GetMetrics()...),
		prometheus.WithIncludeLabels(req.GetLabels()...),
	)
	srv.envoyMetrics.SetEnabled(true)
}
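A condensed sketch (not part of the diff) of running the new Telemetry service outside the controller; it would need to live inside the pomerium module, the shutdown timeout is illustrative, and the API/lease plumbing is assumed to exist:

package example

import (
	"context"
	"time"

	sdk "github.com/pomerium/pomerium/internal/zero/api"
	"github.com/pomerium/pomerium/internal/zero/telemetry"
	"github.com/pomerium/pomerium/pkg/grpc/databroker"
)

func runTelemetry(
	ctx context.Context,
	api *sdk.API,
	getClient func() (databroker.DataBrokerServiceClient, error),
	hasLease func() bool,
	envoyScrapeURL string,
) error {
	tm, err := telemetry.New(ctx, api, getClient, hasLease, envoyScrapeURL)
	if err != nil {
		return err
	}
	defer func() {
		// Shut down with a bounded timeout, mirroring the controller's shutdownTelemetry.
		sctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) // timeout is illustrative
		defer cancel()
		_ = tm.Shutdown(sctx)
	}()

	// Run watches for TelemetryRequest messages from the cloud control plane
	// and collects/exports metrics on demand.
	return tm.Run(ctx)
}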