zero: refactor telemetry and controller (#5135)

* zero: refactor controller

* refactor zero telemetry and controller

* wire with connect handler

* cr
This commit is contained in:
Denis Mishin 2024-06-12 21:59:25 -04:00 committed by GitHub
parent cc636be707
commit 114f730dba
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
22 changed files with 612 additions and 342 deletions

View file

@ -5,22 +5,20 @@ import (
"context"
"errors"
"fmt"
"time"
"github.com/cenkalti/backoff/v4"
"github.com/rs/zerolog"
"golang.org/x/sync/errgroup"
"github.com/pomerium/pomerium/internal/log"
"github.com/pomerium/pomerium/internal/zero/analytics"
sdk "github.com/pomerium/pomerium/internal/zero/api"
"github.com/pomerium/pomerium/internal/zero/bootstrap"
"github.com/pomerium/pomerium/internal/zero/bootstrap/writers"
"github.com/pomerium/pomerium/internal/zero/healthcheck"
connect_mux "github.com/pomerium/pomerium/internal/zero/connect-mux"
"github.com/pomerium/pomerium/internal/zero/reconciler"
"github.com/pomerium/pomerium/internal/zero/reporter"
"github.com/pomerium/pomerium/internal/zero/telemetry/reporter"
"github.com/pomerium/pomerium/pkg/cmd/pomerium"
"github.com/pomerium/pomerium/pkg/grpc/databroker"
"github.com/pomerium/pomerium/pkg/zero/connect"
)
// Run runs Pomerium is managed mode using the provided token.
@ -61,7 +59,6 @@ func Run(ctx context.Context, opts ...Option) error {
eg.Go(func() error { return run(ctx, "zero-bootstrap", c.runBootstrap) })
eg.Go(func() error { return run(ctx, "pomerium-core", c.runPomeriumCore) })
eg.Go(func() error { return run(ctx, "zero-control-loop", c.runZeroControlLoop) })
eg.Go(func() error { return run(ctx, "healh-check-reporter", c.runHealthCheckReporter) })
return eg.Wait()
}
@ -70,7 +67,8 @@ type controller struct {
api *sdk.API
bootstrapConfig *bootstrap.Source
bootstrapConfig *bootstrap.Source
telemetryReporter *reporter.Reporter
}
func (c *controller) initAPI(ctx context.Context) error {
@ -85,7 +83,6 @@ func (c *controller) initAPI(ctx context.Context) error {
}
c.api = api
return nil
}
@ -134,14 +131,35 @@ func (c *controller) runZeroControlLoop(ctx context.Context) error {
r := c.NewDatabrokerRestartRunner(ctx)
defer r.Close()
return r.Run(ctx,
WithLease(
c.runReconcilerLeased,
c.runAnalyticsLeased,
c.runMetricsReporterLeased,
c.runHealthChecksLeased,
),
)
err = c.initTelemetry(ctx, func() (databroker.DataBrokerServiceClient, error) {
client, _, err := r.getDatabrokerClient()
return client, err
})
if err != nil {
return fmt.Errorf("init telemetry: %w", err)
}
defer c.shutdownTelemetry(ctx)
err = c.api.Watch(ctx, connect_mux.WithOnTelemetryRequested(func(ctx context.Context, _ *connect.TelemetryRequest) {
c.telemetryReporter.CollectAndExportMetrics(ctx)
}))
if err != nil {
return fmt.Errorf("watch telemetry: %w", err)
}
eg, ctx := errgroup.WithContext(ctx)
eg.Go(func() error {
return r.Run(ctx,
WithLease(
c.runReconcilerLeased,
c.runSessionAnalyticsLeased,
c.enableSessionAnalyticsReporting,
c.runHealthChecksLeased,
),
)
})
eg.Go(func() error { return c.runTelemetryReporter(ctx) })
return eg.Wait()
}
func (c *controller) runReconcilerLeased(ctx context.Context, client databroker.DataBrokerServiceClient) error {
@ -154,54 +172,3 @@ func (c *controller) runReconcilerLeased(ctx context.Context, client databroker.
reconciler.WithDataBrokerClient(client),
)
}
func (c *controller) runAnalyticsLeased(ctx context.Context, client databroker.DataBrokerServiceClient) error {
ctx = log.WithContext(ctx, func(c zerolog.Context) zerolog.Context {
return c.Str("service", "zero-analytics")
})
err := analytics.Collect(ctx, client, time.Hour)
if err != nil && ctx.Err() == nil {
log.Ctx(ctx).Error().Err(err).Msg("error collecting analytics, disabling")
return nil
}
return err
}
func (c *controller) runMetricsReporterLeased(ctx context.Context, client databroker.DataBrokerServiceClient) error {
ctx = log.WithContext(ctx, func(c zerolog.Context) zerolog.Context {
return c.Str("service", "zero-reporter")
})
return c.api.ReportMetrics(ctx,
reporter.WithCollectInterval(time.Hour),
reporter.WithMetrics(analytics.Metrics(func() databroker.DataBrokerServiceClient { return client })...),
)
}
func (c *controller) runHealthChecksLeased(ctx context.Context, client databroker.DataBrokerServiceClient) error {
ctx = log.WithContext(ctx, func(c zerolog.Context) zerolog.Context {
return c.Str("service", "zero-health-checks")
})
return healthcheck.RunChecks(ctx, c.bootstrapConfig, client)
}
func (c *controller) runHealthCheckReporter(ctx context.Context) error {
ctx = log.WithContext(ctx, func(c zerolog.Context) zerolog.Context {
return c.Str("service", "zero-health-check-reporter")
})
bo := backoff.NewExponentialBackOff()
bo.MaxElapsedTime = 0
return backoff.RetryNotify(
func() error {
return c.api.ReportHealthChecks(ctx)
},
backoff.WithContext(bo, ctx),
func(err error, next time.Duration) {
log.Ctx(ctx).Warn().Err(err).Dur("next", next).Msg("health check reporter backoff")
},
)
}