mirror of
https://github.com/pomerium/pomerium.git
synced 2025-08-02 00:10:45 +02:00
zero/telemetry: internal envoy stats scraper and metrics producer (#5136)
This commit is contained in:
parent
c3534df885
commit
c1dec06afa
19 changed files with 667 additions and 301 deletions
54
internal/zero/telemetry/metrics_producer.go
Normal file
54
internal/zero/telemetry/metrics_producer.go
Normal file
|
@ -0,0 +1,54 @@
|
|||
package telemetry
|
||||
|
||||
import (
|
||||
"context"
|
||||
"sync/atomic"
|
||||
|
||||
"go.opentelemetry.io/otel/sdk/metric"
|
||||
"go.opentelemetry.io/otel/sdk/metric/metricdata"
|
||||
|
||||
"github.com/pomerium/pomerium/internal/log"
|
||||
)
|
||||
|
||||
// metricsProducer is a wrapper around a metric producer that can be enabled or disabled
|
||||
type metricsProducer[P metric.Producer] struct {
|
||||
enabled atomic.Bool
|
||||
name string
|
||||
producer P
|
||||
}
|
||||
|
||||
func newMetricsProducer[P metric.Producer](name string, p P) *metricsProducer[P] {
|
||||
return &metricsProducer[P]{
|
||||
name: name,
|
||||
producer: p,
|
||||
}
|
||||
}
|
||||
|
||||
// Produce wraps the underlying producer's Produce method and logs any errors,
|
||||
// to prevent the error from blocking the export of other metrics.
|
||||
// also checks if the producer is enabled before producing metrics
|
||||
func (p *metricsProducer[P]) Produce(ctx context.Context) ([]metricdata.ScopeMetrics, error) {
|
||||
if enabled := p.enabled.Load(); !enabled {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
data, err := p.producer.Produce(ctx)
|
||||
if err != nil {
|
||||
// we do not return the error here, as we do not want to block the export of other metrics
|
||||
log.Error(ctx).Err(err).Str("producer", p.name).Msg("failed to produce metrics")
|
||||
return nil, nil
|
||||
}
|
||||
return data, nil
|
||||
}
|
||||
|
||||
func (p *metricsProducer[P]) SetEnabled(v bool) {
|
||||
p.enabled.Store(v)
|
||||
}
|
||||
|
||||
func (p *metricsProducer[P]) Name() string {
|
||||
return p.name
|
||||
}
|
||||
|
||||
func (p *metricsProducer[P]) Producer() P {
|
||||
return p.producer
|
||||
}
|
|
@ -5,25 +5,20 @@ import (
|
|||
)
|
||||
|
||||
type config struct {
|
||||
producers map[string]*metricsProducer
|
||||
producers []metric.Producer
|
||||
}
|
||||
|
||||
type Option func(*config)
|
||||
|
||||
// WithProducer adds a metric producer to the reporter
|
||||
func WithProducer(name string, p metric.Producer) Option {
|
||||
func WithProducer(p metric.Producer) Option {
|
||||
return func(c *config) {
|
||||
if _, ok := c.producers[name]; ok {
|
||||
panic("duplicate producer name " + name)
|
||||
}
|
||||
c.producers[name] = newProducer(name, p)
|
||||
c.producers = append(c.producers, p)
|
||||
}
|
||||
}
|
||||
|
||||
func getConfig(opts ...Option) config {
|
||||
c := config{
|
||||
producers: make(map[string]*metricsProducer),
|
||||
}
|
||||
var c config
|
||||
for _, opt := range opts {
|
||||
opt(&c)
|
||||
}
|
||||
|
|
|
@ -1,47 +0,0 @@
|
|||
package reporter
|
||||
|
||||
import (
|
||||
"context"
|
||||
"sync/atomic"
|
||||
|
||||
"go.opentelemetry.io/otel/sdk/metric"
|
||||
"go.opentelemetry.io/otel/sdk/metric/metricdata"
|
||||
|
||||
"github.com/pomerium/pomerium/internal/log"
|
||||
)
|
||||
|
||||
type metricsProducer struct {
|
||||
enabled atomic.Bool
|
||||
name string
|
||||
metric.Producer
|
||||
}
|
||||
|
||||
func newProducer(name string, p metric.Producer) *metricsProducer {
|
||||
return &metricsProducer{
|
||||
name: name,
|
||||
Producer: p,
|
||||
}
|
||||
}
|
||||
|
||||
// compile-time check that *metricsProducer satisfies metric.Producer
var _ metric.Producer = (*metricsProducer)(nil)
|
||||
|
||||
// Produce wraps the underlying producer's Produce method and logs any errors,
|
||||
// to prevent the error from blocking the export of other metrics.
|
||||
// also checks if the producer is enabled before producing metrics
|
||||
func (p *metricsProducer) Produce(ctx context.Context) ([]metricdata.ScopeMetrics, error) {
|
||||
if enabled := p.enabled.Load(); !enabled {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
data, err := p.Producer.Produce(ctx)
|
||||
if err != nil {
|
||||
log.Error(ctx).Err(err).Str("producer", p.name).Msg("failed to produce metrics")
|
||||
return nil, err
|
||||
}
|
||||
return data, nil
|
||||
}
|
||||
|
||||
// SetEnabled sets the enabled state of the producer
|
||||
func (p *metricsProducer) SetEnabled(v bool) {
|
||||
p.enabled.Store(v)
|
||||
}
|
|
@ -6,38 +6,38 @@ import (
|
|||
"fmt"
|
||||
|
||||
export_grpc "go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc"
|
||||
metric_sdk "go.opentelemetry.io/otel/sdk/metric"
|
||||
"go.opentelemetry.io/otel/sdk/metric"
|
||||
"go.opentelemetry.io/otel/sdk/metric/metricdata"
|
||||
"go.opentelemetry.io/otel/sdk/resource"
|
||||
"google.golang.org/grpc"
|
||||
|
||||
"github.com/pomerium/pomerium/internal/log"
|
||||
"github.com/pomerium/pomerium/pkg/health"
|
||||
)
|
||||
|
||||
type metricsReporter struct {
|
||||
exporter *export_grpc.Exporter
|
||||
resource *resource.Resource
|
||||
reader *metric_sdk.ManualReader
|
||||
producers map[string]*metricsProducer
|
||||
singleTask
|
||||
reader *metric.ManualReader
|
||||
producers []metric.Producer
|
||||
}
|
||||
|
||||
func newMetricsReporter(
|
||||
ctx context.Context,
|
||||
conn *grpc.ClientConn,
|
||||
resource *resource.Resource,
|
||||
producers map[string]*metricsProducer,
|
||||
producers []metric.Producer,
|
||||
) (*metricsReporter, error) {
|
||||
exporter, err := export_grpc.New(ctx, export_grpc.WithGRPCConn(conn))
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("create exporter: %w", err)
|
||||
}
|
||||
readerOpts := make([]metric_sdk.ManualReaderOption, 0, len(producers))
|
||||
readerOpts := make([]metric.ManualReaderOption, 0, len(producers))
|
||||
for _, p := range producers {
|
||||
readerOpts = append(readerOpts, metric_sdk.WithProducer(p))
|
||||
readerOpts = append(readerOpts, metric.WithProducer(p))
|
||||
}
|
||||
reader := metric_sdk.NewManualReader(readerOpts...)
|
||||
reader := metric.NewManualReader(readerOpts...)
|
||||
_ = metric.NewMeterProvider(
|
||||
metric.WithResource(resource),
|
||||
metric.WithReader(reader),
|
||||
)
|
||||
return &metricsReporter{
|
||||
exporter: exporter,
|
||||
resource: resource,
|
||||
|
@ -58,40 +58,16 @@ func (r *metricsReporter) Shutdown(ctx context.Context) error {
|
|||
)
|
||||
}
|
||||
|
||||
func (r *metricsReporter) SetMetricProducerEnabled(name string, enabled bool) error {
|
||||
p, ok := r.producers[name]
|
||||
if !ok {
|
||||
return fmt.Errorf("producer %q not found", name)
|
||||
}
|
||||
p.SetEnabled(enabled)
|
||||
return nil
|
||||
}
|
||||
|
||||
func (r *metricsReporter) CollectAndExportMetrics(ctx context.Context) {
|
||||
r.singleTask.Run(ctx, func(ctx context.Context) {
|
||||
err := r.collectAndExport(ctx)
|
||||
if errors.Is(err, ErrAnotherExecutionRequested) {
|
||||
log.Warn(ctx).Msg("telemetry metrics were not sent, due to another execution requested")
|
||||
return
|
||||
}
|
||||
if err != nil {
|
||||
health.ReportError(health.CollectAndSendTelemetry, err)
|
||||
} else {
|
||||
health.ReportOK(health.CollectAndSendTelemetry)
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
func (r *metricsReporter) collectAndExport(ctx context.Context) error {
|
||||
func (r *metricsReporter) CollectAndExportMetrics(ctx context.Context) error {
|
||||
rm := &metricdata.ResourceMetrics{
|
||||
Resource: r.resource,
|
||||
}
|
||||
err := withBackoff(ctx, "collect metrics", func(ctx context.Context) error { return r.reader.Collect(ctx, rm) })
|
||||
err := r.reader.Collect(ctx, rm)
|
||||
if err != nil {
|
||||
return fmt.Errorf("collect metrics: %w", err)
|
||||
}
|
||||
|
||||
err = withBackoff(ctx, "export metrics", func(ctx context.Context) error { return r.exporter.Export(ctx, rm) })
|
||||
err = r.exporter.Export(ctx, rm)
|
||||
if err != nil {
|
||||
return fmt.Errorf("export metrics: %w", err)
|
||||
}
|
||||
|
|
|
@ -5,16 +5,14 @@ import (
|
|||
"context"
|
||||
"fmt"
|
||||
"os"
|
||||
"time"
|
||||
|
||||
"github.com/cenkalti/backoff/v4"
|
||||
"go.opentelemetry.io/otel/attribute"
|
||||
"go.opentelemetry.io/otel/sdk/resource"
|
||||
semconv "go.opentelemetry.io/otel/semconv/v1.4.0"
|
||||
"golang.org/x/sync/errgroup"
|
||||
"google.golang.org/grpc"
|
||||
|
||||
"github.com/pomerium/pomerium/internal/log"
|
||||
"github.com/pomerium/pomerium/internal/retry"
|
||||
"github.com/pomerium/pomerium/internal/version"
|
||||
)
|
||||
|
||||
|
@ -52,8 +50,8 @@ func New(
|
|||
func (r *Reporter) Run(ctx context.Context) error {
|
||||
eg, ctx := errgroup.WithContext(ctx)
|
||||
|
||||
eg.Go(func() error { return withBackoff(ctx, "metrics reporter", r.metricsReporter.Run) })
|
||||
eg.Go(func() error { return withBackoff(ctx, "health check reporter", r.healthCheckReporter.Run) })
|
||||
eg.Go(func() error { return retry.WithBackoff(ctx, "metrics reporter", r.metricsReporter.Run) })
|
||||
eg.Go(func() error { return retry.WithBackoff(ctx, "health check reporter", r.healthCheckReporter.Run) })
|
||||
|
||||
return eg.Wait()
|
||||
}
|
||||
|
@ -81,19 +79,3 @@ func getResource() *resource.Resource {
|
|||
|
||||
return resource.NewSchemaless(attr...)
|
||||
}
|
||||
|
||||
func withBackoff(ctx context.Context, name string, f func(context.Context) error) error {
|
||||
bo := backoff.NewExponentialBackOff()
|
||||
bo.MaxElapsedTime = 0
|
||||
return backoff.RetryNotify(
|
||||
func() error { return f(ctx) },
|
||||
backoff.WithContext(bo, ctx),
|
||||
func(err error, d time.Duration) {
|
||||
log.Warn(ctx).
|
||||
Str("name", name).
|
||||
Err(err).
|
||||
Dur("backoff", d).
|
||||
Msg("retrying")
|
||||
},
|
||||
)
|
||||
}
|
||||
|
|
|
@ -1,27 +0,0 @@
|
|||
package reporter
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"sync"
|
||||
)
|
||||
|
||||
// singleTask keeps track of the most recently started task so that starting
// a new one can cancel the context of the previous one.
type singleTask struct {
	// lock guards cancel.
	lock sync.Mutex
	// cancel cancels the context handed to the most recently started task;
	// it is nil until the first task has been started.
	cancel context.CancelCauseFunc
}
|
||||
|
||||
// ErrAnotherExecutionRequested is the cancellation cause singleTask.Run sets on the previous task's context when a new task is started.
var ErrAnotherExecutionRequested = errors.New("another execution requested")
|
||||
|
||||
func (s *singleTask) Run(ctx context.Context, f func(context.Context)) {
|
||||
s.lock.Lock()
|
||||
defer s.lock.Unlock()
|
||||
|
||||
if s.cancel != nil {
|
||||
s.cancel(ErrAnotherExecutionRequested)
|
||||
}
|
||||
|
||||
ctx, cancel := context.WithCancelCause(ctx)
|
||||
s.cancel = cancel
|
||||
go f(ctx)
|
||||
}
|
|
@ -6,14 +6,16 @@ import (
|
|||
"time"
|
||||
|
||||
"go.opentelemetry.io/otel/sdk/instrumentation"
|
||||
"go.opentelemetry.io/otel/sdk/metric"
|
||||
"go.opentelemetry.io/otel/sdk/metric/metricdata"
|
||||
"golang.org/x/sync/errgroup"
|
||||
"google.golang.org/grpc/codes"
|
||||
"google.golang.org/grpc/status"
|
||||
|
||||
"github.com/pomerium/pomerium/pkg/grpc/databroker"
|
||||
"github.com/pomerium/pomerium/pkg/slices"
|
||||
)
|
||||
|
||||
type producer struct {
|
||||
type Producer struct {
|
||||
scope instrumentation.Scope
|
||||
clientProvider func() (databroker.DataBrokerServiceClient, error)
|
||||
}
|
||||
|
@ -21,14 +23,14 @@ type producer struct {
|
|||
func NewProducer(
|
||||
scope instrumentation.Scope,
|
||||
clientProvider func() (databroker.DataBrokerServiceClient, error),
|
||||
) metric.Producer {
|
||||
return &producer{
|
||||
) *Producer {
|
||||
return &Producer{
|
||||
clientProvider: clientProvider,
|
||||
scope: scope,
|
||||
}
|
||||
}
|
||||
|
||||
func (p *producer) Produce(ctx context.Context) ([]metricdata.ScopeMetrics, error) {
|
||||
func (p *Producer) Produce(ctx context.Context) ([]metricdata.ScopeMetrics, error) {
|
||||
client, err := p.clientProvider()
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("error getting client: %w", err)
|
||||
|
@ -43,6 +45,9 @@ func (p *producer) Produce(ctx context.Context) ([]metricdata.ScopeMetrics, erro
|
|||
eg.Go(func() error {
|
||||
state, err := LoadMetricState(ctx, client, ids[i])
|
||||
if err != nil {
|
||||
if status.Code(err) == codes.NotFound {
|
||||
return nil
|
||||
}
|
||||
return err
|
||||
}
|
||||
metrics[i] = metricdata.Metrics{
|
||||
|
@ -66,6 +71,11 @@ func (p *producer) Produce(ctx context.Context) ([]metricdata.ScopeMetrics, erro
|
|||
return nil, err
|
||||
}
|
||||
|
||||
metrics = slices.Filter(metrics, func(v metricdata.Metrics) bool { return v.Name != "" })
|
||||
if len(metrics) == 0 {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
return []metricdata.ScopeMetrics{
|
||||
{
|
||||
Scope: p.scope,
|
||||
|
|
146
internal/zero/telemetry/telemetry.go
Normal file
146
internal/zero/telemetry/telemetry.go
Normal file
|
@ -0,0 +1,146 @@
|
|||
package telemetry
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"time"
|
||||
|
||||
"github.com/rs/zerolog"
|
||||
"go.opentelemetry.io/otel/sdk/instrumentation"
|
||||
"golang.org/x/sync/errgroup"
|
||||
|
||||
"github.com/pomerium/pomerium/internal/log"
|
||||
"github.com/pomerium/pomerium/internal/telemetry/prometheus"
|
||||
sdk "github.com/pomerium/pomerium/internal/zero/api"
|
||||
connect_mux "github.com/pomerium/pomerium/internal/zero/connect-mux"
|
||||
"github.com/pomerium/pomerium/internal/zero/telemetry/reporter"
|
||||
"github.com/pomerium/pomerium/internal/zero/telemetry/sessions"
|
||||
"github.com/pomerium/pomerium/pkg/grpc/databroker"
|
||||
"github.com/pomerium/pomerium/pkg/health"
|
||||
"github.com/pomerium/pomerium/pkg/zero/connect"
|
||||
)
|
||||
|
||||
type Telemetry struct {
|
||||
api *sdk.API
|
||||
reporter *reporter.Reporter
|
||||
|
||||
envoyMetrics *metricsProducer[*prometheus.Producer]
|
||||
sessionMetrics *metricsProducer[*sessions.Producer]
|
||||
hasSessionMetricsLease func() bool
|
||||
}
|
||||
|
||||
func New(
|
||||
ctx context.Context,
|
||||
api *sdk.API,
|
||||
clientProvider func() (databroker.DataBrokerServiceClient, error),
|
||||
hasSessionMetricsLease func() bool,
|
||||
envoyScrapeURL string,
|
||||
) (*Telemetry, error) {
|
||||
startTime := time.Now()
|
||||
|
||||
sessionMetricProducer := newMetricsProducer("sessions", buildSessionMetricsProducer(clientProvider))
|
||||
envoyMetricProducer := newMetricsProducer("envoy", buildEnvoyMetricsProducer(envoyScrapeURL, startTime))
|
||||
|
||||
r, err := reporter.New(ctx, api.GetTelemetryConn(),
|
||||
reporter.WithProducer(sessionMetricProducer),
|
||||
reporter.WithProducer(envoyMetricProducer),
|
||||
)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("error creating telemetry metrics reporter: %w", err)
|
||||
}
|
||||
|
||||
return &Telemetry{
|
||||
api: api,
|
||||
reporter: r,
|
||||
sessionMetrics: sessionMetricProducer,
|
||||
envoyMetrics: envoyMetricProducer,
|
||||
hasSessionMetricsLease: hasSessionMetricsLease,
|
||||
}, nil
|
||||
}
|
||||
|
||||
func (srv *Telemetry) Shutdown(ctx context.Context) error {
|
||||
return srv.reporter.Shutdown(ctx)
|
||||
}
|
||||
|
||||
func (srv *Telemetry) Run(ctx context.Context) error {
|
||||
ctx = log.WithContext(ctx, func(c zerolog.Context) zerolog.Context {
|
||||
return c.Str("service", "telemetry-reporter")
|
||||
})
|
||||
|
||||
eg, ctx := errgroup.WithContext(ctx)
|
||||
eg.Go(func() error {
|
||||
health.SetProvider(srv.reporter)
|
||||
defer health.SetProvider(nil)
|
||||
return srv.reporter.Run(ctx)
|
||||
})
|
||||
eg.Go(func() error { return srv.handleRequests(ctx) })
|
||||
return eg.Wait()
|
||||
}
|
||||
|
||||
// handleRequests watches for telemetry requests as they are received from the cloud control plane and processes them.
|
||||
func (srv *Telemetry) handleRequests(ctx context.Context) error {
|
||||
requests := make(chan *connect.TelemetryRequest, 1)
|
||||
|
||||
eg, ctx := errgroup.WithContext(ctx)
|
||||
eg.Go(func() error {
|
||||
return srv.api.Watch(ctx, connect_mux.WithOnTelemetryRequested(
|
||||
func(ctx context.Context, req *connect.TelemetryRequest) {
|
||||
select {
|
||||
case requests <- req:
|
||||
default:
|
||||
log.Warn(ctx).Msg("dropping telemetry request")
|
||||
}
|
||||
}))
|
||||
})
|
||||
eg.Go(func() error {
|
||||
for {
|
||||
select {
|
||||
case req := <-requests:
|
||||
srv.handleRequest(ctx, req)
|
||||
case <-ctx.Done():
|
||||
return ctx.Err()
|
||||
}
|
||||
}
|
||||
})
|
||||
return eg.Wait()
|
||||
}
|
||||
|
||||
func (srv *Telemetry) handleRequest(ctx context.Context, req *connect.TelemetryRequest) {
|
||||
srv.configureEnvoyMetricsProducer(req.GetEnvoyMetrics())
|
||||
srv.configureSessionMetricsProducer(req.GetSessionAnalytics())
|
||||
|
||||
err := srv.reporter.CollectAndExportMetrics(ctx)
|
||||
if err != nil {
|
||||
health.ReportError(health.CollectAndSendTelemetry, err)
|
||||
} else {
|
||||
health.ReportOK(health.CollectAndSendTelemetry)
|
||||
}
|
||||
}
|
||||
|
||||
func buildSessionMetricsProducer(clientProvider func() (databroker.DataBrokerServiceClient, error)) *sessions.Producer {
|
||||
return sessions.NewProducer(instrumentation.Scope{Name: "pomerium-cluster"}, clientProvider)
|
||||
}
|
||||
|
||||
func buildEnvoyMetricsProducer(scrapeURL string, startTime time.Time) *prometheus.Producer {
|
||||
return prometheus.NewProducer(
|
||||
prometheus.WithScope(instrumentation.Scope{Name: "envoy"}),
|
||||
prometheus.WithScrapeURL(scrapeURL),
|
||||
prometheus.WithStartTime(startTime),
|
||||
)
|
||||
}
|
||||
|
||||
func (srv *Telemetry) configureSessionMetricsProducer(req *connect.SessionAnalyticsRequest) {
|
||||
srv.sessionMetrics.SetEnabled(req != nil && srv.hasSessionMetricsLease())
|
||||
}
|
||||
|
||||
func (srv *Telemetry) configureEnvoyMetricsProducer(req *connect.EnvoyMetricsRequest) {
|
||||
if req == nil {
|
||||
srv.envoyMetrics.SetEnabled(false)
|
||||
return
|
||||
}
|
||||
srv.envoyMetrics.Producer().UpdateConfig(
|
||||
prometheus.WithIncludeMetrics(req.GetMetrics()...),
|
||||
prometheus.WithIncludeLabels(req.GetLabels()...),
|
||||
)
|
||||
srv.envoyMetrics.SetEnabled(true)
|
||||
}
|
Loading…
Add table
Add a link
Reference in a new issue