zero/telemetry: internal envoy stats scraper and metrics producer (#5136)

This commit is contained in:
Denis Mishin 2024-06-16 20:41:05 -04:00 committed by GitHub
parent c3534df885
commit c1dec06afa
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
19 changed files with 667 additions and 301 deletions

View file

@ -0,0 +1,54 @@
package telemetry
import (
"context"
"sync/atomic"
"go.opentelemetry.io/otel/sdk/metric"
"go.opentelemetry.io/otel/sdk/metric/metricdata"
"github.com/pomerium/pomerium/internal/log"
)
// metricsProducer decorates a metric producer of type P with a name and an
// on/off switch.
type metricsProducer[P metric.Producer] struct {
	// enabled gates Produce; its zero value is false, so a freshly
	// constructed wrapper is disabled.
	enabled atomic.Bool
	// name identifies this producer in log output.
	name string
	// producer is the wrapped producer that supplies the metrics.
	producer P
}
// newMetricsProducer wraps p under the given name. The enabled flag is not
// set here, so the returned wrapper stays disabled until SetEnabled(true).
func newMetricsProducer[P metric.Producer](name string, p P) *metricsProducer[P] {
	wrapped := new(metricsProducer[P])
	wrapped.name = name
	wrapped.producer = p
	return wrapped
}
// Produce returns the wrapped producer's metrics when this producer is enabled.
//
// While disabled it returns (nil, nil). If the wrapped producer fails, the
// error is logged together with the producer name and is not returned, so a
// single failing producer does not block the export of other metrics.
func (p *metricsProducer[P]) Produce(ctx context.Context) ([]metricdata.ScopeMetrics, error) {
	if !p.enabled.Load() {
		return nil, nil
	}

	scopeMetrics, produceErr := p.producer.Produce(ctx)
	if produceErr == nil {
		return scopeMetrics, nil
	}

	// Deliberately swallowed after logging: returning it would block the
	// export of other metrics.
	log.Error(ctx).Err(produceErr).Str("producer", p.name).Msg("failed to produce metrics")
	return nil, nil
}
// SetEnabled atomically switches the producer on (true) or off (false).
// Produce consults this flag on every call.
func (p *metricsProducer[P]) SetEnabled(v bool) {
	p.enabled.Store(v)
}
// Name returns the name this producer was created with.
func (p *metricsProducer[P]) Name() string {
	return p.name
}
// Producer returns the wrapped producer with its concrete type P, so callers
// can reach methods beyond the metric.Producer interface.
func (p *metricsProducer[P]) Producer() P {
	return p.producer
}

View file

@ -5,25 +5,20 @@ import (
)
type config struct {
producers map[string]*metricsProducer
producers []metric.Producer
}
type Option func(*config)
// WithProducer adds a metric producer to the reporter
func WithProducer(name string, p metric.Producer) Option {
func WithProducer(p metric.Producer) Option {
return func(c *config) {
if _, ok := c.producers[name]; ok {
panic("duplicate producer name " + name)
}
c.producers[name] = newProducer(name, p)
c.producers = append(c.producers, p)
}
}
func getConfig(opts ...Option) config {
c := config{
producers: make(map[string]*metricsProducer),
}
var c config
for _, opt := range opts {
opt(&c)
}

View file

@ -1,47 +0,0 @@
package reporter
import (
"context"
"sync/atomic"
"go.opentelemetry.io/otel/sdk/metric"
"go.opentelemetry.io/otel/sdk/metric/metricdata"
"github.com/pomerium/pomerium/internal/log"
)
// metricsProducer is a named metric.Producer that can be switched on and off.
type metricsProducer struct {
	// enabled gates Produce; its zero value is false.
	enabled atomic.Bool
	// name identifies this producer in log output.
	name string
	// Producer is the embedded, wrapped producer.
	metric.Producer
}
// newProducer wraps p under the given name. The enabled flag is not set here,
// so the result stays disabled until SetEnabled(true) is called.
func newProducer(name string, p metric.Producer) *metricsProducer {
	wrapped := new(metricsProducer)
	wrapped.name = name
	wrapped.Producer = p
	return wrapped
}
// Compile-time assertion that *metricsProducer satisfies metric.Producer.
var _ metric.Producer = (*metricsProducer)(nil)
// Produce returns the wrapped producer's metrics when this producer is enabled.
//
// While disabled it returns (nil, nil). If the wrapped producer fails, the
// error is logged together with the producer name and is NOT returned: handing
// it back to the reader would fail the whole collection and so block the
// export of other metrics, which is exactly what this wrapper is meant to
// prevent.
func (p *metricsProducer) Produce(ctx context.Context) ([]metricdata.ScopeMetrics, error) {
	if !p.enabled.Load() {
		return nil, nil
	}

	data, err := p.Producer.Produce(ctx)
	if err != nil {
		// Handle the error here, once: log it and report "no data" instead of
		// both logging and propagating it.
		log.Error(ctx).Err(err).Str("producer", p.name).Msg("failed to produce metrics")
		return nil, nil
	}
	return data, nil
}
// SetEnabled atomically switches the producer on (true) or off (false).
// Produce consults this flag on every call.
func (p *metricsProducer) SetEnabled(v bool) {
	p.enabled.Store(v)
}

View file

@ -6,38 +6,38 @@ import (
"fmt"
export_grpc "go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc"
metric_sdk "go.opentelemetry.io/otel/sdk/metric"
"go.opentelemetry.io/otel/sdk/metric"
"go.opentelemetry.io/otel/sdk/metric/metricdata"
"go.opentelemetry.io/otel/sdk/resource"
"google.golang.org/grpc"
"github.com/pomerium/pomerium/internal/log"
"github.com/pomerium/pomerium/pkg/health"
)
type metricsReporter struct {
exporter *export_grpc.Exporter
resource *resource.Resource
reader *metric_sdk.ManualReader
producers map[string]*metricsProducer
singleTask
reader *metric.ManualReader
producers []metric.Producer
}
func newMetricsReporter(
ctx context.Context,
conn *grpc.ClientConn,
resource *resource.Resource,
producers map[string]*metricsProducer,
producers []metric.Producer,
) (*metricsReporter, error) {
exporter, err := export_grpc.New(ctx, export_grpc.WithGRPCConn(conn))
if err != nil {
return nil, fmt.Errorf("create exporter: %w", err)
}
readerOpts := make([]metric_sdk.ManualReaderOption, 0, len(producers))
readerOpts := make([]metric.ManualReaderOption, 0, len(producers))
for _, p := range producers {
readerOpts = append(readerOpts, metric_sdk.WithProducer(p))
readerOpts = append(readerOpts, metric.WithProducer(p))
}
reader := metric_sdk.NewManualReader(readerOpts...)
reader := metric.NewManualReader(readerOpts...)
_ = metric.NewMeterProvider(
metric.WithResource(resource),
metric.WithReader(reader),
)
return &metricsReporter{
exporter: exporter,
resource: resource,
@ -58,40 +58,16 @@ func (r *metricsReporter) Shutdown(ctx context.Context) error {
)
}
// SetMetricProducerEnabled switches the producer registered under name on or
// off. It returns an error when no producer with that name is registered.
func (r *metricsReporter) SetMetricProducerEnabled(name string, enabled bool) error {
	if producer, found := r.producers[name]; found {
		producer.SetEnabled(enabled)
		return nil
	}
	return fmt.Errorf("producer %q not found", name)
}
// CollectAndExportMetrics hands a collect-and-export run to the reporter's
// singleTask and records the outcome with the health package: an error or OK
// is reported for health.CollectAndSendTelemetry, except when the error is
// ErrAnotherExecutionRequested, which is only logged as a warning.
func (r *metricsReporter) CollectAndExportMetrics(ctx context.Context) {
	collectAndReport := func(ctx context.Context) {
		err := r.collectAndExport(ctx)
		switch {
		case errors.Is(err, ErrAnotherExecutionRequested):
			log.Warn(ctx).Msg("telemetry metrics were not sent, due to another execution requested")
		case err != nil:
			health.ReportError(health.CollectAndSendTelemetry, err)
		default:
			health.ReportOK(health.CollectAndSendTelemetry)
		}
	}
	r.singleTask.Run(ctx, collectAndReport)
}
func (r *metricsReporter) collectAndExport(ctx context.Context) error {
func (r *metricsReporter) CollectAndExportMetrics(ctx context.Context) error {
rm := &metricdata.ResourceMetrics{
Resource: r.resource,
}
err := withBackoff(ctx, "collect metrics", func(ctx context.Context) error { return r.reader.Collect(ctx, rm) })
err := r.reader.Collect(ctx, rm)
if err != nil {
return fmt.Errorf("collect metrics: %w", err)
}
err = withBackoff(ctx, "export metrics", func(ctx context.Context) error { return r.exporter.Export(ctx, rm) })
err = r.exporter.Export(ctx, rm)
if err != nil {
return fmt.Errorf("export metrics: %w", err)
}

View file

@ -5,16 +5,14 @@ import (
"context"
"fmt"
"os"
"time"
"github.com/cenkalti/backoff/v4"
"go.opentelemetry.io/otel/attribute"
"go.opentelemetry.io/otel/sdk/resource"
semconv "go.opentelemetry.io/otel/semconv/v1.4.0"
"golang.org/x/sync/errgroup"
"google.golang.org/grpc"
"github.com/pomerium/pomerium/internal/log"
"github.com/pomerium/pomerium/internal/retry"
"github.com/pomerium/pomerium/internal/version"
)
@ -52,8 +50,8 @@ func New(
func (r *Reporter) Run(ctx context.Context) error {
eg, ctx := errgroup.WithContext(ctx)
eg.Go(func() error { return withBackoff(ctx, "metrics reporter", r.metricsReporter.Run) })
eg.Go(func() error { return withBackoff(ctx, "health check reporter", r.healthCheckReporter.Run) })
eg.Go(func() error { return retry.WithBackoff(ctx, "metrics reporter", r.metricsReporter.Run) })
eg.Go(func() error { return retry.WithBackoff(ctx, "health check reporter", r.healthCheckReporter.Run) })
return eg.Wait()
}
@ -81,19 +79,3 @@ func getResource() *resource.Resource {
return resource.NewSchemaless(attr...)
}
// withBackoff runs f under backoff.RetryNotify with an exponential backoff
// bound to ctx. Every retry is logged as a warning carrying name, the error
// and the delay before the next attempt.
func withBackoff(ctx context.Context, name string, f func(context.Context) error) error {
	expBackoff := backoff.NewExponentialBackOff()
	// NOTE(review): per the backoff library, a zero MaxElapsedTime means the
	// retries never time out on their own — confirm against its documentation.
	expBackoff.MaxElapsedTime = 0

	operation := func() error {
		return f(ctx)
	}
	onRetry := func(err error, d time.Duration) {
		log.Warn(ctx).
			Str("name", name).
			Err(err).
			Dur("backoff", d).
			Msg("retrying")
	}

	return backoff.RetryNotify(operation, backoff.WithContext(expBackoff, ctx), onRetry)
}

View file

@ -1,27 +0,0 @@
package reporter
import (
"context"
"errors"
"sync"
)
// singleTask starts functions in the background and, each time a new one is
// started, cancels the context handed to the previously started one.
type singleTask struct {
	// lock serializes Run calls and guards cancel.
	lock sync.Mutex
	// cancel cancels the context of the most recently started function;
	// nil until Run has been called once.
	cancel context.CancelCauseFunc
}

// ErrAnotherExecutionRequested is the cancellation cause set on a function's
// context when a later Run call supersedes it.
var ErrAnotherExecutionRequested = errors.New("another execution requested")

// Run cancels the context of the previously started function (if any) with
// ErrAnotherExecutionRequested, then starts f in a new goroutine with a fresh
// cancellable context derived from ctx. Run does not wait for f to finish.
func (s *singleTask) Run(ctx context.Context, f func(context.Context)) {
	s.lock.Lock()
	defer s.lock.Unlock()

	if previous := s.cancel; previous != nil {
		previous(ErrAnotherExecutionRequested)
	}

	taskCtx, stop := context.WithCancelCause(ctx)
	s.cancel = stop
	go f(taskCtx)
}

View file

@ -6,14 +6,16 @@ import (
"time"
"go.opentelemetry.io/otel/sdk/instrumentation"
"go.opentelemetry.io/otel/sdk/metric"
"go.opentelemetry.io/otel/sdk/metric/metricdata"
"golang.org/x/sync/errgroup"
"google.golang.org/grpc/codes"
"google.golang.org/grpc/status"
"github.com/pomerium/pomerium/pkg/grpc/databroker"
"github.com/pomerium/pomerium/pkg/slices"
)
type producer struct {
type Producer struct {
scope instrumentation.Scope
clientProvider func() (databroker.DataBrokerServiceClient, error)
}
@ -21,14 +23,14 @@ type producer struct {
func NewProducer(
scope instrumentation.Scope,
clientProvider func() (databroker.DataBrokerServiceClient, error),
) metric.Producer {
return &producer{
) *Producer {
return &Producer{
clientProvider: clientProvider,
scope: scope,
}
}
func (p *producer) Produce(ctx context.Context) ([]metricdata.ScopeMetrics, error) {
func (p *Producer) Produce(ctx context.Context) ([]metricdata.ScopeMetrics, error) {
client, err := p.clientProvider()
if err != nil {
return nil, fmt.Errorf("error getting client: %w", err)
@ -43,6 +45,9 @@ func (p *producer) Produce(ctx context.Context) ([]metricdata.ScopeMetrics, erro
eg.Go(func() error {
state, err := LoadMetricState(ctx, client, ids[i])
if err != nil {
if status.Code(err) == codes.NotFound {
return nil
}
return err
}
metrics[i] = metricdata.Metrics{
@ -66,6 +71,11 @@ func (p *producer) Produce(ctx context.Context) ([]metricdata.ScopeMetrics, erro
return nil, err
}
metrics = slices.Filter(metrics, func(v metricdata.Metrics) bool { return v.Name != "" })
if len(metrics) == 0 {
return nil, nil
}
return []metricdata.ScopeMetrics{
{
Scope: p.scope,

View file

@ -0,0 +1,146 @@
package telemetry
import (
"context"
"fmt"
"time"
"github.com/rs/zerolog"
"go.opentelemetry.io/otel/sdk/instrumentation"
"golang.org/x/sync/errgroup"
"github.com/pomerium/pomerium/internal/log"
"github.com/pomerium/pomerium/internal/telemetry/prometheus"
sdk "github.com/pomerium/pomerium/internal/zero/api"
connect_mux "github.com/pomerium/pomerium/internal/zero/connect-mux"
"github.com/pomerium/pomerium/internal/zero/telemetry/reporter"
"github.com/pomerium/pomerium/internal/zero/telemetry/sessions"
"github.com/pomerium/pomerium/pkg/grpc/databroker"
"github.com/pomerium/pomerium/pkg/health"
"github.com/pomerium/pomerium/pkg/zero/connect"
)
// Telemetry ties together the zero API client, the metrics reporter and the
// two switchable metric producers (envoy and sessions).
type Telemetry struct {
	// api is used to watch for telemetry requests.
	api *sdk.API
	// reporter collects and exports the metrics.
	reporter *reporter.Reporter

	// envoyMetrics is the switchable producer for envoy metrics.
	envoyMetrics *metricsProducer[*prometheus.Producer]
	// sessionMetrics is the switchable producer for session metrics.
	sessionMetrics *metricsProducer[*sessions.Producer]
	// hasSessionMetricsLease is consulted before session metrics are enabled.
	hasSessionMetricsLease func() bool
}
// New builds a Telemetry service.
//
// It creates the "sessions" and "envoy" metric producers (both wrapped so they
// can be switched on and off), registers them with a new reporter that uses
// the API's telemetry connection, and returns the assembled service.
// clientProvider is handed to the session metrics producer, envoyScrapeURL to
// the envoy metrics producer, and hasSessionMetricsLease is stored for later
// use. An error is returned when the reporter cannot be created.
func New(
	ctx context.Context,
	api *sdk.API,
	clientProvider func() (databroker.DataBrokerServiceClient, error),
	hasSessionMetricsLease func() bool,
	envoyScrapeURL string,
) (*Telemetry, error) {
	// Captured first; passed to the envoy producer as its start time.
	startTime := time.Now()

	sessionsWrapper := newMetricsProducer("sessions", buildSessionMetricsProducer(clientProvider))
	envoyWrapper := newMetricsProducer("envoy", buildEnvoyMetricsProducer(envoyScrapeURL, startTime))

	rep, err := reporter.New(
		ctx,
		api.GetTelemetryConn(),
		reporter.WithProducer(sessionsWrapper),
		reporter.WithProducer(envoyWrapper),
	)
	if err != nil {
		return nil, fmt.Errorf("error creating telemetry metrics reporter: %w", err)
	}

	srv := &Telemetry{
		api:                    api,
		reporter:               rep,
		envoyMetrics:           envoyWrapper,
		sessionMetrics:         sessionsWrapper,
		hasSessionMetricsLease: hasSessionMetricsLease,
	}
	return srv, nil
}
// Shutdown shuts down the underlying reporter and returns its result.
func (srv *Telemetry) Shutdown(ctx context.Context) error {
	err := srv.reporter.Shutdown(ctx)
	return err
}
// Run tags the context's logger with service "telemetry-reporter" and then
// runs two tasks in an errgroup until one fails or the context ends:
// the reporter (installed as the health provider for as long as it runs) and
// the telemetry request handler.
func (srv *Telemetry) Run(ctx context.Context) error {
	ctx = log.WithContext(ctx, func(c zerolog.Context) zerolog.Context {
		return c.Str("service", "telemetry-reporter")
	})

	group, groupCtx := errgroup.WithContext(ctx)

	group.Go(func() error {
		// The reporter acts as health provider only while it is running.
		health.SetProvider(srv.reporter)
		defer health.SetProvider(nil)

		return srv.reporter.Run(groupCtx)
	})
	group.Go(func() error {
		return srv.handleRequests(groupCtx)
	})

	return group.Wait()
}
// handleRequests watches for telemetry requests coming from the cloud control
// plane and processes them one at a time.
//
// Incoming requests are placed in a one-slot queue; when the slot is already
// occupied the new request is dropped with a warning instead of blocking the
// watcher callback.
func (srv *Telemetry) handleRequests(ctx context.Context) error {
	pending := make(chan *connect.TelemetryRequest, 1)

	group, groupCtx := errgroup.WithContext(ctx)

	// enqueue never blocks: a full queue means the request is discarded.
	enqueue := func(ctx context.Context, req *connect.TelemetryRequest) {
		select {
		case pending <- req:
		default:
			log.Warn(ctx).Msg("dropping telemetry request")
		}
	}

	group.Go(func() error {
		return srv.api.Watch(groupCtx, connect_mux.WithOnTelemetryRequested(enqueue))
	})

	group.Go(func() error {
		for {
			select {
			case <-groupCtx.Done():
				return groupCtx.Err()
			case req := <-pending:
				srv.handleRequest(groupCtx, req)
			}
		}
	})

	return group.Wait()
}
// handleRequest applies a single telemetry request: it reconfigures the envoy
// and session metric producers from the request, triggers a collect-and-export
// run, and reports the outcome for health.CollectAndSendTelemetry.
func (srv *Telemetry) handleRequest(ctx context.Context, req *connect.TelemetryRequest) {
	srv.configureEnvoyMetricsProducer(req.GetEnvoyMetrics())
	srv.configureSessionMetricsProducer(req.GetSessionAnalytics())

	if err := srv.reporter.CollectAndExportMetrics(ctx); err != nil {
		health.ReportError(health.CollectAndSendTelemetry, err)
		return
	}
	health.ReportOK(health.CollectAndSendTelemetry)
}
// buildSessionMetricsProducer creates the session metrics producer under the
// "pomerium-cluster" instrumentation scope, using clientProvider to obtain
// the databroker client.
func buildSessionMetricsProducer(clientProvider func() (databroker.DataBrokerServiceClient, error)) *sessions.Producer {
	scope := instrumentation.Scope{Name: "pomerium-cluster"}
	return sessions.NewProducer(scope, clientProvider)
}
// buildEnvoyMetricsProducer creates the prometheus-based producer for envoy
// metrics under the "envoy" instrumentation scope, configured with the given
// scrape URL and start time.
func buildEnvoyMetricsProducer(scrapeURL string, startTime time.Time) *prometheus.Producer {
	scope := instrumentation.Scope{Name: "envoy"}
	envoyProducer := prometheus.NewProducer(
		prometheus.WithScope(scope),
		prometheus.WithScrapeURL(scrapeURL),
		prometheus.WithStartTime(startTime),
	)
	return envoyProducer
}
// configureSessionMetricsProducer enables session metrics only when the
// request asks for them (req is non-nil) and hasSessionMetricsLease reports
// true; otherwise it disables them. The lease callback is not invoked when
// req is nil.
func (srv *Telemetry) configureSessionMetricsProducer(req *connect.SessionAnalyticsRequest) {
	requested := req != nil
	enable := requested && srv.hasSessionMetricsLease()
	srv.sessionMetrics.SetEnabled(enable)
}
// configureEnvoyMetricsProducer disables envoy metrics when the request does
// not ask for them (req is nil). Otherwise it first updates the envoy
// producer with the requested metrics and labels and then enables it.
func (srv *Telemetry) configureEnvoyMetricsProducer(req *connect.EnvoyMetricsRequest) {
	requested := req != nil
	if requested {
		// Apply the new filters before switching the producer on.
		srv.envoyMetrics.Producer().UpdateConfig(
			prometheus.WithIncludeMetrics(req.GetMetrics()...),
			prometheus.WithIncludeLabels(req.GetLabels()...),
		)
	}
	srv.envoyMetrics.SetEnabled(requested)
}