mirror of
https://github.com/pomerium/pomerium.git
synced 2025-08-02 16:30:17 +02:00
zero: refactor telemetry and controller (#5135)
* zero: refactor controller * refactor zero telemetry and controller * wire with connect handler * cr
This commit is contained in:
parent
cc636be707
commit
114f730dba
22 changed files with 612 additions and 342 deletions
|
@ -17,6 +17,7 @@ type controllerConfig struct {
|
|||
|
||||
reconcilerLeaseDuration time.Duration
|
||||
databrokerRequestTimeout time.Duration
|
||||
shutdownTimeout time.Duration
|
||||
}
|
||||
|
||||
// WithTmpDir sets the temporary directory to use.
|
||||
|
@ -110,6 +111,13 @@ func WithDatabrokerRequestTimeout(timeout time.Duration) Option {
|
|||
}
|
||||
}
|
||||
|
||||
// WithShutdownTimeout sets the timeout for shutting down and cleanup.
|
||||
func WithShutdownTimeout(timeout time.Duration) Option {
|
||||
return func(c *controllerConfig) {
|
||||
c.shutdownTimeout = timeout
|
||||
}
|
||||
}
|
||||
|
||||
func newControllerConfig(opts ...Option) *controllerConfig {
|
||||
c := new(controllerConfig)
|
||||
|
||||
|
@ -118,6 +126,7 @@ func newControllerConfig(opts ...Option) *controllerConfig {
|
|||
WithConnectAPIEndpoint("https://connect.pomerium.com"),
|
||||
WithDatabrokerLeaseDuration(time.Second * 30),
|
||||
WithDatabrokerRequestTimeout(time.Second * 30),
|
||||
WithShutdownTimeout(time.Second * 10),
|
||||
} {
|
||||
opt(c)
|
||||
}
|
||||
|
|
|
@ -5,22 +5,20 @@ import (
|
|||
"context"
|
||||
"errors"
|
||||
"fmt"
|
||||
"time"
|
||||
|
||||
"github.com/cenkalti/backoff/v4"
|
||||
"github.com/rs/zerolog"
|
||||
"golang.org/x/sync/errgroup"
|
||||
|
||||
"github.com/pomerium/pomerium/internal/log"
|
||||
"github.com/pomerium/pomerium/internal/zero/analytics"
|
||||
sdk "github.com/pomerium/pomerium/internal/zero/api"
|
||||
"github.com/pomerium/pomerium/internal/zero/bootstrap"
|
||||
"github.com/pomerium/pomerium/internal/zero/bootstrap/writers"
|
||||
"github.com/pomerium/pomerium/internal/zero/healthcheck"
|
||||
connect_mux "github.com/pomerium/pomerium/internal/zero/connect-mux"
|
||||
"github.com/pomerium/pomerium/internal/zero/reconciler"
|
||||
"github.com/pomerium/pomerium/internal/zero/reporter"
|
||||
"github.com/pomerium/pomerium/internal/zero/telemetry/reporter"
|
||||
"github.com/pomerium/pomerium/pkg/cmd/pomerium"
|
||||
"github.com/pomerium/pomerium/pkg/grpc/databroker"
|
||||
"github.com/pomerium/pomerium/pkg/zero/connect"
|
||||
)
|
||||
|
||||
// Run runs Pomerium in managed mode using the provided token.
|
||||
|
@ -61,7 +59,6 @@ func Run(ctx context.Context, opts ...Option) error {
|
|||
eg.Go(func() error { return run(ctx, "zero-bootstrap", c.runBootstrap) })
|
||||
eg.Go(func() error { return run(ctx, "pomerium-core", c.runPomeriumCore) })
|
||||
eg.Go(func() error { return run(ctx, "zero-control-loop", c.runZeroControlLoop) })
|
||||
eg.Go(func() error { return run(ctx, "healh-check-reporter", c.runHealthCheckReporter) })
|
||||
return eg.Wait()
|
||||
}
|
||||
|
||||
|
@ -70,7 +67,8 @@ type controller struct {
|
|||
|
||||
api *sdk.API
|
||||
|
||||
bootstrapConfig *bootstrap.Source
|
||||
bootstrapConfig *bootstrap.Source
|
||||
telemetryReporter *reporter.Reporter
|
||||
}
|
||||
|
||||
func (c *controller) initAPI(ctx context.Context) error {
|
||||
|
@ -85,7 +83,6 @@ func (c *controller) initAPI(ctx context.Context) error {
|
|||
}
|
||||
|
||||
c.api = api
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
|
@ -134,14 +131,35 @@ func (c *controller) runZeroControlLoop(ctx context.Context) error {
|
|||
r := c.NewDatabrokerRestartRunner(ctx)
|
||||
defer r.Close()
|
||||
|
||||
return r.Run(ctx,
|
||||
WithLease(
|
||||
c.runReconcilerLeased,
|
||||
c.runAnalyticsLeased,
|
||||
c.runMetricsReporterLeased,
|
||||
c.runHealthChecksLeased,
|
||||
),
|
||||
)
|
||||
err = c.initTelemetry(ctx, func() (databroker.DataBrokerServiceClient, error) {
|
||||
client, _, err := r.getDatabrokerClient()
|
||||
return client, err
|
||||
})
|
||||
if err != nil {
|
||||
return fmt.Errorf("init telemetry: %w", err)
|
||||
}
|
||||
defer c.shutdownTelemetry(ctx)
|
||||
|
||||
err = c.api.Watch(ctx, connect_mux.WithOnTelemetryRequested(func(ctx context.Context, _ *connect.TelemetryRequest) {
|
||||
c.telemetryReporter.CollectAndExportMetrics(ctx)
|
||||
}))
|
||||
if err != nil {
|
||||
return fmt.Errorf("watch telemetry: %w", err)
|
||||
}
|
||||
|
||||
eg, ctx := errgroup.WithContext(ctx)
|
||||
eg.Go(func() error {
|
||||
return r.Run(ctx,
|
||||
WithLease(
|
||||
c.runReconcilerLeased,
|
||||
c.runSessionAnalyticsLeased,
|
||||
c.enableSessionAnalyticsReporting,
|
||||
c.runHealthChecksLeased,
|
||||
),
|
||||
)
|
||||
})
|
||||
eg.Go(func() error { return c.runTelemetryReporter(ctx) })
|
||||
return eg.Wait()
|
||||
}
|
||||
|
||||
func (c *controller) runReconcilerLeased(ctx context.Context, client databroker.DataBrokerServiceClient) error {
|
||||
|
@ -154,54 +172,3 @@ func (c *controller) runReconcilerLeased(ctx context.Context, client databroker.
|
|||
reconciler.WithDataBrokerClient(client),
|
||||
)
|
||||
}
|
||||
|
||||
func (c *controller) runAnalyticsLeased(ctx context.Context, client databroker.DataBrokerServiceClient) error {
|
||||
ctx = log.WithContext(ctx, func(c zerolog.Context) zerolog.Context {
|
||||
return c.Str("service", "zero-analytics")
|
||||
})
|
||||
|
||||
err := analytics.Collect(ctx, client, time.Hour)
|
||||
if err != nil && ctx.Err() == nil {
|
||||
log.Ctx(ctx).Error().Err(err).Msg("error collecting analytics, disabling")
|
||||
return nil
|
||||
}
|
||||
|
||||
return err
|
||||
}
|
||||
|
||||
func (c *controller) runMetricsReporterLeased(ctx context.Context, client databroker.DataBrokerServiceClient) error {
|
||||
ctx = log.WithContext(ctx, func(c zerolog.Context) zerolog.Context {
|
||||
return c.Str("service", "zero-reporter")
|
||||
})
|
||||
|
||||
return c.api.ReportMetrics(ctx,
|
||||
reporter.WithCollectInterval(time.Hour),
|
||||
reporter.WithMetrics(analytics.Metrics(func() databroker.DataBrokerServiceClient { return client })...),
|
||||
)
|
||||
}
|
||||
|
||||
func (c *controller) runHealthChecksLeased(ctx context.Context, client databroker.DataBrokerServiceClient) error {
|
||||
ctx = log.WithContext(ctx, func(c zerolog.Context) zerolog.Context {
|
||||
return c.Str("service", "zero-health-checks")
|
||||
})
|
||||
|
||||
return healthcheck.RunChecks(ctx, c.bootstrapConfig, client)
|
||||
}
|
||||
|
||||
func (c *controller) runHealthCheckReporter(ctx context.Context) error {
|
||||
ctx = log.WithContext(ctx, func(c zerolog.Context) zerolog.Context {
|
||||
return c.Str("service", "zero-health-check-reporter")
|
||||
})
|
||||
|
||||
bo := backoff.NewExponentialBackOff()
|
||||
bo.MaxElapsedTime = 0
|
||||
return backoff.RetryNotify(
|
||||
func() error {
|
||||
return c.api.ReportHealthChecks(ctx)
|
||||
},
|
||||
backoff.WithContext(bo, ctx),
|
||||
func(err error, next time.Duration) {
|
||||
log.Ctx(ctx).Warn().Err(err).Dur("next", next).Msg("health check reporter backoff")
|
||||
},
|
||||
)
|
||||
}
|
||||
|
|
79
internal/zero/controller/telemetry.go
Normal file
79
internal/zero/controller/telemetry.go
Normal file
|
@ -0,0 +1,79 @@
|
|||
package controller
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"time"
|
||||
|
||||
"github.com/rs/zerolog"
|
||||
"go.opentelemetry.io/otel/sdk/instrumentation"
|
||||
|
||||
"github.com/pomerium/pomerium/internal/log"
|
||||
"github.com/pomerium/pomerium/internal/zero/healthcheck"
|
||||
"github.com/pomerium/pomerium/internal/zero/telemetry/reporter"
|
||||
"github.com/pomerium/pomerium/internal/zero/telemetry/sessions"
|
||||
"github.com/pomerium/pomerium/pkg/grpc/databroker"
|
||||
"github.com/pomerium/pomerium/pkg/health"
|
||||
)
|
||||
|
||||
// producerSessionAnalytics is the name under which the session analytics
// metric producer is registered with, and toggled on, the telemetry reporter.
const producerSessionAnalytics = "session-analytics"
|
||||
|
||||
func (c *controller) initTelemetry(ctx context.Context, clientProvider func() (databroker.DataBrokerServiceClient, error)) error {
|
||||
sessionMetricProducer := sessions.NewProducer(instrumentation.Scope{}, clientProvider)
|
||||
r, err := reporter.New(ctx, c.api.GetTelemetryConn(),
|
||||
reporter.WithProducer(producerSessionAnalytics, sessionMetricProducer),
|
||||
)
|
||||
if err != nil {
|
||||
return fmt.Errorf("error creating telemetry metrics reporter: %w", err)
|
||||
}
|
||||
c.telemetryReporter = r
|
||||
return nil
|
||||
}
|
||||
|
||||
func (c *controller) shutdownTelemetry(ctx context.Context) {
|
||||
ctx, cancel := context.WithTimeout(context.WithoutCancel(ctx), c.cfg.shutdownTimeout)
|
||||
defer cancel()
|
||||
|
||||
err := c.telemetryReporter.Shutdown(ctx)
|
||||
if err != nil {
|
||||
log.Warn(ctx).Err(err).Msg("telemetry reporter shutdown error")
|
||||
}
|
||||
}
|
||||
|
||||
func (c *controller) runSessionAnalyticsLeased(ctx context.Context, client databroker.DataBrokerServiceClient) error {
|
||||
ctx = log.WithContext(ctx, func(c zerolog.Context) zerolog.Context {
|
||||
return c.Str("service", "zero-analytics")
|
||||
})
|
||||
|
||||
return sessions.Collect(ctx, client, time.Hour)
|
||||
}
|
||||
|
||||
// those metrics are cluster-wide, so we only enable their reporting when we have the lease
|
||||
func (c *controller) enableSessionAnalyticsReporting(ctx context.Context, _ databroker.DataBrokerServiceClient) error {
|
||||
_ = c.telemetryReporter.SetMetricProducerEnabled(producerSessionAnalytics, true)
|
||||
defer func() { _ = c.telemetryReporter.SetMetricProducerEnabled(producerSessionAnalytics, false) }()
|
||||
|
||||
<-ctx.Done()
|
||||
return nil
|
||||
}
|
||||
|
||||
func (c *controller) runHealthChecksLeased(ctx context.Context, client databroker.DataBrokerServiceClient) error {
|
||||
ctx = log.WithContext(ctx, func(c zerolog.Context) zerolog.Context {
|
||||
return c.Str("service", "zero-health-checks")
|
||||
})
|
||||
|
||||
return healthcheck.RunChecks(ctx, c.bootstrapConfig, client)
|
||||
}
|
||||
|
||||
func (c *controller) runTelemetryReporter(ctx context.Context) error {
|
||||
health.SetProvider(c.telemetryReporter)
|
||||
defer health.SetProvider(nil)
|
||||
|
||||
ctx = log.WithContext(ctx, func(c zerolog.Context) zerolog.Context {
|
||||
return c.Str("service", "zero-bootstrap")
|
||||
})
|
||||
|
||||
return c.telemetryReporter.Run(ctx)
|
||||
}
|
Loading…
Add table
Add a link
Reference in a new issue