zero/telemetry: internal envoy stats scraper and metrics producer (#5136)

This commit is contained in:
Denis Mishin 2024-06-16 20:41:05 -04:00 committed by GitHub
parent c3534df885
commit c1dec06afa
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
19 changed files with 667 additions and 301 deletions

View file

@ -5,20 +5,24 @@ import (
"context"
"errors"
"fmt"
"net"
"net/url"
"time"
"github.com/rs/zerolog"
"golang.org/x/sync/errgroup"
"github.com/pomerium/pomerium/internal/log"
"github.com/pomerium/pomerium/internal/retry"
sdk "github.com/pomerium/pomerium/internal/zero/api"
"github.com/pomerium/pomerium/internal/zero/bootstrap"
"github.com/pomerium/pomerium/internal/zero/bootstrap/writers"
connect_mux "github.com/pomerium/pomerium/internal/zero/connect-mux"
"github.com/pomerium/pomerium/internal/zero/healthcheck"
"github.com/pomerium/pomerium/internal/zero/reconciler"
"github.com/pomerium/pomerium/internal/zero/telemetry/reporter"
"github.com/pomerium/pomerium/internal/zero/telemetry"
"github.com/pomerium/pomerium/internal/zero/telemetry/sessions"
"github.com/pomerium/pomerium/pkg/cmd/pomerium"
"github.com/pomerium/pomerium/pkg/grpc/databroker"
"github.com/pomerium/pomerium/pkg/zero/connect"
)
// Run runs Pomerium in managed mode using the provided token.
@ -67,8 +71,7 @@ type controller struct {
api *sdk.API
bootstrapConfig *bootstrap.Source
telemetryReporter *reporter.Reporter
bootstrapConfig *bootstrap.Source
}
func (c *controller) initAPI(ctx context.Context) error {
@ -128,47 +131,70 @@ func (c *controller) runZeroControlLoop(ctx context.Context) error {
return fmt.Errorf("waiting for config source to be ready: %w", err)
}
r := c.NewDatabrokerRestartRunner(ctx)
r := NewDatabrokerRestartRunner(ctx, c.bootstrapConfig)
defer r.Close()
err = c.initTelemetry(ctx, func() (databroker.DataBrokerServiceClient, error) {
client, _, err := r.getDatabrokerClient()
return client, err
})
var leaseStatus LeaseStatus
tm, err := telemetry.New(ctx, c.api,
r.GetDatabrokerClient,
leaseStatus.HasLease,
c.getEnvoyScrapeURL(),
)
if err != nil {
return fmt.Errorf("init telemetry: %w", err)
}
defer c.shutdownTelemetry(ctx)
err = c.api.Watch(ctx, connect_mux.WithOnTelemetryRequested(func(ctx context.Context, _ *connect.TelemetryRequest) {
c.telemetryReporter.CollectAndExportMetrics(ctx)
}))
if err != nil {
return fmt.Errorf("watch telemetry: %w", err)
}
defer c.shutdownTelemetry(ctx, tm)
eg, ctx := errgroup.WithContext(ctx)
eg.Go(func() error { return tm.Run(ctx) })
eg.Go(func() error {
return r.Run(ctx,
WithLease(
c.runReconcilerLeased,
c.runSessionAnalyticsLeased,
c.enableSessionAnalyticsReporting,
c.runHealthChecksLeased,
c.runPeriodicHealthChecksLeased,
leaseStatus.MonitorLease,
),
)
})
eg.Go(func() error { return c.runTelemetryReporter(ctx) })
return eg.Wait()
}
func (c *controller) runReconcilerLeased(ctx context.Context, client databroker.DataBrokerServiceClient) error {
ctx = log.WithContext(ctx, func(c zerolog.Context) zerolog.Context {
return c.Str("service", "zero-reconciler")
})
func (c *controller) shutdownTelemetry(ctx context.Context, tm *telemetry.Telemetry) {
ctx, cancel := context.WithTimeout(ctx, c.cfg.shutdownTimeout)
defer cancel()
return reconciler.Run(ctx,
reconciler.WithAPI(c.api),
reconciler.WithDataBrokerClient(client),
)
err := tm.Shutdown(ctx)
if err != nil {
log.Ctx(ctx).Error().Err(err).Msg("error shutting down telemetry")
}
}
// runReconcilerLeased runs the reconciler with the controller's API handle and
// the supplied databroker client. The call is wrapped in retry.WithBackoff
// under the name "zero-reconciler".
func (c *controller) runReconcilerLeased(ctx context.Context, client databroker.DataBrokerServiceClient) error {
	reconcile := func(ctx context.Context) error {
		return reconciler.Run(ctx,
			reconciler.WithAPI(c.api),
			reconciler.WithDataBrokerClient(client),
		)
	}
	return retry.WithBackoff(ctx, "zero-reconciler", reconcile)
}
// runSessionAnalyticsLeased runs sessions.Collect against the supplied
// databroker client with a time.Hour argument. The call is wrapped in
// retry.WithBackoff under the name "zero-analytics".
func (c *controller) runSessionAnalyticsLeased(ctx context.Context, client databroker.DataBrokerServiceClient) error {
	collect := func(ctx context.Context) error {
		return sessions.Collect(ctx, client, time.Hour)
	}
	return retry.WithBackoff(ctx, "zero-analytics", collect)
}
// runPeriodicHealthChecksLeased runs healthcheck.RunChecks with the bootstrap
// config source and the supplied databroker client. The call is wrapped in
// retry.WithBackoff under the name "zero-healthcheck".
func (c *controller) runPeriodicHealthChecksLeased(ctx context.Context, client databroker.DataBrokerServiceClient) error {
	check := func(ctx context.Context) error {
		return healthcheck.RunChecks(ctx, c.bootstrapConfig, client)
	}
	return retry.WithBackoff(ctx, "zero-healthcheck", check)
}
// getEnvoyScrapeURL builds the http URL on localhost, at the outbound port
// taken from the current bootstrap config, with the path
// /envoy/stats/prometheus.
func (c *controller) getEnvoyScrapeURL() string {
	port := c.bootstrapConfig.GetConfig().OutboundPort
	scrapeURL := url.URL{
		Scheme: "http",
		Host:   net.JoinHostPort("localhost", port),
		Path:   "/envoy/stats/prometheus",
	}
	return scrapeURL.String()
}

View file

@ -9,13 +9,10 @@ import (
"net/url"
"sync"
"golang.org/x/sync/errgroup"
"google.golang.org/grpc"
"github.com/pomerium/pomerium/config"
"github.com/pomerium/pomerium/internal/log"
"github.com/pomerium/pomerium/internal/retry"
"github.com/pomerium/pomerium/internal/zero/bootstrap"
"github.com/pomerium/pomerium/pkg/grpc/databroker"
"github.com/pomerium/pomerium/pkg/grpcutil"
)
@ -23,9 +20,7 @@ import (
// ErrBootstrapConfigurationChanged is returned when the bootstrap configuration has changed and the function needs to be restarted.
var ErrBootstrapConfigurationChanged = errors.New("bootstrap configuration changed")
type runner struct {
source *bootstrap.Source
type DatabrokerRestartRunner struct {
lock sync.RWMutex
cancel chan struct{}
conn *grpc.ClientConn
@ -33,31 +28,39 @@ type runner struct {
initError error
}
func (c *controller) NewDatabrokerRestartRunner(ctx context.Context) *runner {
p := &runner{
source: c.bootstrapConfig,
}
p.initLocked(ctx, c.bootstrapConfig.GetConfig())
c.bootstrapConfig.OnConfigChange(context.Background(), p.onConfigChange)
// NewDatabrokerRestartRunner is a helper to run a function that needs to be restarted when the underlying databroker configuration changes.
func NewDatabrokerRestartRunner(
ctx context.Context,
src config.Source,
) *DatabrokerRestartRunner {
p := new(DatabrokerRestartRunner)
p.initLocked(ctx, src.GetConfig())
src.OnConfigChange(ctx, p.onConfigChange)
return p
}
type DbcFunc func(context.Context, databroker.DataBrokerServiceClient) error
func (p *runner) Run(ctx context.Context, funcs ...DbcFunc) error {
return retry.WithBackoff(ctx, func(ctx context.Context) error { return p.runUntilDatabrokerChanges(ctx, funcs...) })
func (p *DatabrokerRestartRunner) Run(
ctx context.Context,
fn func(context.Context, databroker.DataBrokerServiceClient) error,
) error {
return retry.WithBackoff(ctx, "databroker-restart", func(ctx context.Context) error { return p.runUntilDatabrokerChanges(ctx, fn) })
}
// Close releases the resources used by the databroker provider.
func (p *runner) Close() {
func (p *DatabrokerRestartRunner) Close() {
p.lock.Lock()
defer p.lock.Unlock()
p.closeLocked()
}
// GetDatabrokerClient returns the databroker client currently held by the
// runner, or the error reported by getDatabrokerClient. The invalidation
// channel that getDatabrokerClient also returns is discarded.
func (p *DatabrokerRestartRunner) GetDatabrokerClient() (databroker.DataBrokerServiceClient, error) {
	dbClient, _, err := p.getDatabrokerClient()
	return dbClient, err
}
// getDatabrokerClient returns the databroker client and a channel that will be closed when the client is no longer valid.
func (p *runner) getDatabrokerClient() (databroker.DataBrokerServiceClient, <-chan struct{}, error) {
func (p *DatabrokerRestartRunner) getDatabrokerClient() (databroker.DataBrokerServiceClient, <-chan struct{}, error) {
p.lock.RLock()
defer p.lock.RUnlock()
@ -68,7 +71,7 @@ func (p *runner) getDatabrokerClient() (databroker.DataBrokerServiceClient, <-ch
return p.client, p.cancel, nil
}
func (p *runner) onConfigChange(ctx context.Context, cfg *config.Config) {
func (p *DatabrokerRestartRunner) onConfigChange(ctx context.Context, cfg *config.Config) {
p.lock.Lock()
defer p.lock.Unlock()
@ -76,7 +79,7 @@ func (p *runner) onConfigChange(ctx context.Context, cfg *config.Config) {
p.initLocked(ctx, cfg)
}
func (p *runner) initLocked(ctx context.Context, cfg *config.Config) {
func (p *DatabrokerRestartRunner) initLocked(ctx context.Context, cfg *config.Config) {
conn, err := newDataBrokerConnection(ctx, cfg)
if err != nil {
p.initError = fmt.Errorf("databroker connection: %w", err)
@ -89,7 +92,7 @@ func (p *runner) initLocked(ctx context.Context, cfg *config.Config) {
p.initError = nil
}
func (p *runner) closeLocked() {
func (p *DatabrokerRestartRunner) closeLocked() {
if p.conn != nil {
p.conn.Close()
p.conn = nil
@ -101,13 +104,10 @@ func (p *runner) closeLocked() {
p.initError = errors.New("databroker connection closed")
}
func (p *runner) runUntilDatabrokerChanges(
func (p *DatabrokerRestartRunner) runUntilDatabrokerChanges(
ctx context.Context,
funcs ...DbcFunc,
fn func(context.Context, databroker.DataBrokerServiceClient) error,
) error {
log.Debug(ctx).Msg("starting")
defer log.Debug(ctx).Msg("stop")
client, cancelCh, err := p.getDatabrokerClient()
if err != nil {
return fmt.Errorf("get databroker client: %w", err)
@ -120,18 +120,11 @@ func (p *runner) runUntilDatabrokerChanges(
select {
case <-ctx.Done():
case <-cancelCh:
log.Debug(ctx).Msg("bootstrap configuration changed, restarting...")
cancel(ErrBootstrapConfigurationChanged)
}
}()
eg, ctx := errgroup.WithContext(ctx)
for _, fn := range funcs {
eg.Go(func() error {
return retry.WithBackoff(ctx, func(ctx context.Context) error { return fn(ctx, client) })
})
}
return eg.Wait()
return fn(ctx, client)
}
func newDataBrokerConnection(ctx context.Context, cfg *config.Config) (*grpc.ClientConn, error) {

View file

@ -0,0 +1,113 @@
package controller_test
import (
"context"
"encoding/base64"
"errors"
"testing"
"github.com/stretchr/testify/mock"
"github.com/stretchr/testify/require"
"github.com/pomerium/pomerium/config"
"github.com/pomerium/pomerium/internal/zero/controller"
"github.com/pomerium/pomerium/pkg/cryptutil"
"github.com/pomerium/pomerium/pkg/grpc/databroker"
)
// mockConfigSource is a testify mock standing in for config.Source.
// config.Source is embedded only to satisfy the interface; the embedded value
// is never set, so any method not overridden below would panic if called.
type mockConfigSource struct {
	mock.Mock
	config.Source
}

// GetConfig returns the *config.Config configured as the first return value
// of the mocked GetConfig call.
func (s *mockConfigSource) GetConfig() *config.Config {
	return s.Called().Get(0).(*config.Config)
}

// OnConfigChange records the call and its arguments on the mock.
func (s *mockConfigSource) OnConfigChange(ctx context.Context, cl config.ChangeListener) {
	_ = s.Called(ctx, cl)
}
// TestDatabrokerRestart exercises DatabrokerRestartRunner.Run against a mocked
// config source in three scenarios: the function succeeds, the function fails
// once, and the configuration changes while the function is running.
func TestDatabrokerRestart(t *testing.T) {
t.Parallel()
// newConfig builds a config with a newly generated shared key and a fixed
// gRPC port; it is called again to simulate a changed configuration.
newConfig := func() *config.Config {
return &config.Config{
Options: &config.Options{
SharedKey: base64.StdEncoding.EncodeToString(cryptutil.NewKey()),
},
GRPCPort: ":12345",
}
}
// A function that returns nil makes Run return nil.
t.Run("no error", func(t *testing.T) {
t.Parallel()
src := new(mockConfigSource)
src.On("OnConfigChange", mock.Anything, mock.Anything).Once()
src.On("GetConfig").Once().Return(newConfig())
ctx := context.Background()
r := controller.NewDatabrokerRestartRunner(ctx, src)
defer r.Close()
err := r.Run(ctx, func(_ context.Context, _ databroker.DataBrokerServiceClient) error {
return nil
})
require.NoError(t, err)
})
// A function that fails on its first invocation is expected to be invoked a
// second time, after which Run returns nil.
t.Run("error, retry", func(t *testing.T) {
t.Parallel()
src := new(mockConfigSource)
src.On("OnConfigChange", mock.Anything, mock.Anything).Once()
src.On("GetConfig").Once().Return(newConfig())
ctx := context.Background()
r := controller.NewDatabrokerRestartRunner(ctx, src)
defer r.Close()
count := 0
err := r.Run(ctx, func(_ context.Context, _ databroker.DataBrokerServiceClient) error {
count++
if count == 1 {
return errors.New("simulated error")
}
return nil
})
require.NoError(t, err)
require.Equal(t, 2, count)
})
// A config change delivered while the function is running is expected to
// cancel its context with ErrBootstrapConfigurationChanged and to restart
// it with a different client.
t.Run("config changed, execution restarted", func(t *testing.T) {
t.Parallel()
src := new(mockConfigSource)
// Capture the change listener the runner registers so the test can fire
// a config change itself.
var cl config.ChangeListener
src.On("OnConfigChange", mock.Anything, mock.Anything).Once().Run(func(args mock.Arguments) {
cl = args.Get(1).(config.ChangeListener)
})
src.On("GetConfig").Once().Return(newConfig())
ctx := context.Background()
r := controller.NewDatabrokerRestartRunner(ctx, src)
defer r.Close()
count := 0
// clients records the client passed to each of the two expected invocations.
var clients [2]databroker.DataBrokerServiceClient
err := r.Run(ctx, func(ctx context.Context, client databroker.DataBrokerServiceClient) error {
clients[count] = client
count++
if count == 1 {
// First invocation: deliver a new config, then wait for the runner
// to cancel this invocation's context.
cl(context.Background(), newConfig())
<-ctx.Done()
require.ErrorIs(t, context.Cause(ctx), controller.ErrBootstrapConfigurationChanged)
return ctx.Err()
}
return nil
})
require.NoError(t, err)
require.Equal(t, 2, count)
require.NotEqual(t, clients[0], clients[1])
})
}

View file

@ -2,18 +2,17 @@ package controller
import (
"context"
"sync/atomic"
"time"
"golang.org/x/sync/errgroup"
"github.com/pomerium/pomerium/internal/log"
"github.com/pomerium/pomerium/internal/retry"
"github.com/pomerium/pomerium/pkg/grpc/databroker"
)
type leaser struct {
client databroker.DataBrokerServiceClient
funcs []DbcFunc
funcs []func(context.Context, databroker.DataBrokerServiceClient) error
}
// GetDataBrokerServiceClient implements the databroker.LeaseHandler interface.
@ -23,20 +22,18 @@ func (c *leaser) GetDataBrokerServiceClient() databroker.DataBrokerServiceClient
// RunLeased implements the databroker.LeaseHandler interface.
func (c *leaser) RunLeased(ctx context.Context) error {
log.Debug(ctx).Msg("leaser: running leased functions")
eg, ctx := errgroup.WithContext(ctx)
for _, fn := range c.funcs {
eg.Go(func() error {
return retry.WithBackoff(ctx, func(ctx context.Context) error { return fn(ctx, c.client) })
})
fn := fn
eg.Go(func() error { return fn(ctx, c.client) })
}
err := eg.Wait()
log.Debug(ctx).Err(err).Msg("leaser: done running leased functions")
return err
}
func WithLease(funcs ...DbcFunc) DbcFunc {
func WithLease(
funcs ...func(context.Context, databroker.DataBrokerServiceClient) error,
) func(context.Context, databroker.DataBrokerServiceClient) error {
return func(ctx context.Context, client databroker.DataBrokerServiceClient) error {
srv := &leaser{
client: client,
@ -46,3 +43,18 @@ func WithLease(funcs ...DbcFunc) DbcFunc {
return leaser.Run(ctx)
}
}
// LeaseStatus is an atomic flag that is true while MonitorLease is running.
type LeaseStatus struct {
	v atomic.Bool
}

// HasLease reports the current value of the flag.
func (ls *LeaseStatus) HasLease() bool {
	return ls.v.Load()
}

// MonitorLease sets the flag, blocks until ctx is done, then clears the flag
// and returns ctx.Err(). The databroker client argument is unused; it is
// present so the method has the signature accepted by WithLease.
func (ls *LeaseStatus) MonitorLease(ctx context.Context, _ databroker.DataBrokerServiceClient) error {
	ls.v.Store(true)
	defer ls.v.Store(false)

	<-ctx.Done()
	return ctx.Err()
}

View file

@ -1,79 +0,0 @@
package controller
import (
"context"
"fmt"
"time"
"github.com/rs/zerolog"
"go.opentelemetry.io/otel/sdk/instrumentation"
"github.com/pomerium/pomerium/internal/log"
"github.com/pomerium/pomerium/internal/zero/healthcheck"
"github.com/pomerium/pomerium/internal/zero/telemetry/reporter"
"github.com/pomerium/pomerium/internal/zero/telemetry/sessions"
"github.com/pomerium/pomerium/pkg/grpc/databroker"
"github.com/pomerium/pomerium/pkg/health"
)
// Names under which metric producers are registered with the telemetry reporter.
const (
// producerSessionAnalytics is the name used for the session analytics
// producer in reporter.WithProducer and SetMetricProducerEnabled.
producerSessionAnalytics = "session-analytics"
)
// initTelemetry creates the telemetry metrics reporter on the API's telemetry
// connection, registers the session analytics producer with it, and stores
// the reporter on the controller.
//
// clientProvider is handed to sessions.NewProducer. An error from reporter.New
// is returned wrapped.
func (c *controller) initTelemetry(ctx context.Context, clientProvider func() (databroker.DataBrokerServiceClient, error)) error {
	producer := sessions.NewProducer(instrumentation.Scope{}, clientProvider)
	conn := c.api.GetTelemetryConn()
	withSessions := reporter.WithProducer(producerSessionAnalytics, producer)

	rep, err := reporter.New(ctx, conn, withSessions)
	if err != nil {
		return fmt.Errorf("error creating telemetry metrics reporter: %w", err)
	}

	c.telemetryReporter = rep
	return nil
}
// shutdownTelemetry shuts down the telemetry reporter, allowing it up to the
// configured shutdown timeout. The timeout context is derived with
// context.WithoutCancel, so cancellation of the incoming ctx does not carry
// over to the shutdown call. A shutdown error is logged as a warning and not
// returned.
func (c *controller) shutdownTelemetry(ctx context.Context) {
	detached := context.WithoutCancel(ctx)
	ctx, cancel := context.WithTimeout(detached, c.cfg.shutdownTimeout)
	defer cancel()

	if err := c.telemetryReporter.Shutdown(ctx); err != nil {
		log.Warn(ctx).Err(err).Msg("telemetry reporter shutdown error")
	}
}
// runSessionAnalyticsLeased runs sessions.Collect against the supplied
// databroker client with a time.Hour argument, using a logging context tagged
// service=zero-analytics.
func (c *controller) runSessionAnalyticsLeased(ctx context.Context, client databroker.DataBrokerServiceClient) error {
	tagService := func(lc zerolog.Context) zerolog.Context {
		return lc.Str("service", "zero-analytics")
	}
	return sessions.Collect(log.WithContext(ctx, tagService), client, time.Hour)
}
// enableSessionAnalyticsReporting enables the session analytics metric
// producer, blocks until ctx is done, then disables the producer and returns
// nil. Those metrics are cluster-wide, so their reporting is only enabled
// while we have the lease. Errors from SetMetricProducerEnabled are
// deliberately ignored.
func (c *controller) enableSessionAnalyticsReporting(ctx context.Context, _ databroker.DataBrokerServiceClient) error {
	setEnabled := func(enabled bool) {
		_ = c.telemetryReporter.SetMetricProducerEnabled(producerSessionAnalytics, enabled)
	}

	setEnabled(true)
	defer setEnabled(false)

	<-ctx.Done()
	return nil
}
// runHealthChecksLeased runs healthcheck.RunChecks with the bootstrap config
// source and the supplied databroker client, using a logging context tagged
// service=zero-health-checks.
func (c *controller) runHealthChecksLeased(ctx context.Context, client databroker.DataBrokerServiceClient) error {
	tagService := func(lc zerolog.Context) zerolog.Context {
		return lc.Str("service", "zero-health-checks")
	}
	return healthcheck.RunChecks(log.WithContext(ctx, tagService), c.bootstrapConfig, client)
}
// runTelemetryReporter installs the telemetry reporter as the health provider
// for the duration of the call and runs the reporter with a logging context
// tagged service=zero-bootstrap. The health provider is reset to nil on return.
func (c *controller) runTelemetryReporter(ctx context.Context) error {
	health.SetProvider(c.telemetryReporter)
	defer health.SetProvider(nil)

	tagService := func(lc zerolog.Context) zerolog.Context {
		return lc.Str("service", "zero-bootstrap")
	}
	return c.telemetryReporter.Run(log.WithContext(ctx, tagService))
}