healthcheck: add common package, zero reporter and first xds check (#5059)

This commit is contained in:
Denis Mishin 2024-04-10 15:21:39 -04:00 committed by GitHub
parent 5af244f0e5
commit 991fca496c
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
9 changed files with 319 additions and 39 deletions

11
go.mod
View file

@ -64,11 +64,14 @@ require (
github.com/volatiletech/null/v9 v9.0.0 github.com/volatiletech/null/v9 v9.0.0
github.com/yuin/gopher-lua v1.1.1 github.com/yuin/gopher-lua v1.1.1
go.opencensus.io v0.24.0 go.opencensus.io v0.24.0
go.opentelemetry.io/otel v1.24.0 go.opentelemetry.io/otel v1.25.0
go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc v1.24.0 go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc v1.24.0
go.opentelemetry.io/otel/metric v1.24.0 go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.25.0
go.opentelemetry.io/otel/sdk v1.24.0 go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.21.0
go.opentelemetry.io/otel/metric v1.25.0
go.opentelemetry.io/otel/sdk v1.25.0
go.opentelemetry.io/otel/sdk/metric v1.24.0 go.opentelemetry.io/otel/sdk/metric v1.24.0
go.opentelemetry.io/otel/trace v1.25.0
go.uber.org/automaxprocs v1.5.3 go.uber.org/automaxprocs v1.5.3
go.uber.org/zap v1.27.0 go.uber.org/zap v1.27.0
golang.org/x/crypto v0.21.0 golang.org/x/crypto v0.21.0
@ -204,9 +207,7 @@ require (
github.com/zeebo/blake3 v0.2.3 // indirect github.com/zeebo/blake3 v0.2.3 // indirect
go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.49.0 // indirect go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.49.0 // indirect
go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.49.0 // indirect go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.49.0 // indirect
go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.22.0 // indirect
go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.22.0 // indirect go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.22.0 // indirect
go.opentelemetry.io/otel/trace v1.24.0 // indirect
go.opentelemetry.io/proto/otlp v1.1.0 // indirect go.opentelemetry.io/proto/otlp v1.1.0 // indirect
go.uber.org/multierr v1.11.0 // indirect go.uber.org/multierr v1.11.0 // indirect
golang.org/x/mod v0.14.0 // indirect golang.org/x/mod v0.14.0 // indirect

20
go.sum
View file

@ -722,24 +722,24 @@ go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.4
go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.49.0/go.mod h1:Mjt1i1INqiaoZOMGR1RIUJN+i3ChKoFRqzrRQhlkbs0= go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.49.0/go.mod h1:Mjt1i1INqiaoZOMGR1RIUJN+i3ChKoFRqzrRQhlkbs0=
go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.49.0 h1:jq9TW8u3so/bN+JPT166wjOI6/vQPF6Xe7nMNIltagk= go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.49.0 h1:jq9TW8u3so/bN+JPT166wjOI6/vQPF6Xe7nMNIltagk=
go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.49.0/go.mod h1:p8pYQP+m5XfbZm9fxtSKAbM6oIllS7s2AfxrChvc7iw= go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.49.0/go.mod h1:p8pYQP+m5XfbZm9fxtSKAbM6oIllS7s2AfxrChvc7iw=
go.opentelemetry.io/otel v1.24.0 h1:0LAOdjNmQeSTzGBzduGe/rU4tZhMwL5rWgtp9Ku5Jfo= go.opentelemetry.io/otel v1.25.0 h1:gldB5FfhRl7OJQbUHt/8s0a7cE8fbsPAtdpRaApKy4k=
go.opentelemetry.io/otel v1.24.0/go.mod h1:W7b9Ozg4nkF5tWI5zsXkaKKDjdVjpD4oAt9Qi/MArHo= go.opentelemetry.io/otel v1.25.0/go.mod h1:Wa2ds5NOXEMkCmUou1WA7ZBfLTHWIsp034OVD7AO+Vg=
go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc v1.24.0 h1:f2jriWfOdldanBwS9jNBdeOKAQN7b4ugAMaNu1/1k9g= go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc v1.24.0 h1:f2jriWfOdldanBwS9jNBdeOKAQN7b4ugAMaNu1/1k9g=
go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc v1.24.0/go.mod h1:B+bcQI1yTY+N0vqMpoZbEN7+XU4tNM0DmUiOwebFJWI= go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc v1.24.0/go.mod h1:B+bcQI1yTY+N0vqMpoZbEN7+XU4tNM0DmUiOwebFJWI=
go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.22.0 h1:9M3+rhx7kZCIQQhQRYaZCdNu1V73tm4TvXs2ntl98C4= go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.25.0 h1:dT33yIHtmsqpixFsSQPwNeY5drM9wTcoL8h0FWF4oGM=
go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.22.0/go.mod h1:noq80iT8rrHP1SfybmPiRGc9dc5M8RPmGvtwo7Oo7tc= go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.25.0/go.mod h1:h95q0LBGh7hlAC08X2DhSeyIG02YQ0UyioTCVAqRPmc=
go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.21.0 h1:tIqheXEFWAZ7O8A7m+J0aPTmpJN3YQ7qetUAdkkkKpk= go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.21.0 h1:tIqheXEFWAZ7O8A7m+J0aPTmpJN3YQ7qetUAdkkkKpk=
go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.21.0/go.mod h1:nUeKExfxAQVbiVFn32YXpXZZHZ61Cc3s3Rn1pDBGAb0= go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.21.0/go.mod h1:nUeKExfxAQVbiVFn32YXpXZZHZ61Cc3s3Rn1pDBGAb0=
go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.22.0 h1:FyjCyI9jVEfqhUh2MoSkmolPjfh5fp2hnV0b0irxH4Q= go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.22.0 h1:FyjCyI9jVEfqhUh2MoSkmolPjfh5fp2hnV0b0irxH4Q=
go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.22.0/go.mod h1:hYwym2nDEeZfG/motx0p7L7J1N1vyzIThemQsb4g2qY= go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.22.0/go.mod h1:hYwym2nDEeZfG/motx0p7L7J1N1vyzIThemQsb4g2qY=
go.opentelemetry.io/otel/metric v1.24.0 h1:6EhoGWWK28x1fbpA4tYTOWBkPefTDQnb8WSGXlc88kI= go.opentelemetry.io/otel/metric v1.25.0 h1:LUKbS7ArpFL/I2jJHdJcqMGxkRdxpPHE0VU/D4NuEwA=
go.opentelemetry.io/otel/metric v1.24.0/go.mod h1:VYhLe1rFfxuTXLgj4CBiyz+9WYBA8pNGJgDcSFRKBco= go.opentelemetry.io/otel/metric v1.25.0/go.mod h1:rkDLUSd2lC5lq2dFNrX9LGAbINP5B7WBkC78RXCpH5s=
go.opentelemetry.io/otel/sdk v1.24.0 h1:YMPPDNymmQN3ZgczicBY3B6sf9n62Dlj9pWD3ucgoDw= go.opentelemetry.io/otel/sdk v1.25.0 h1:PDryEJPC8YJZQSyLY5eqLeafHtG+X7FWnf3aXMtxbqo=
go.opentelemetry.io/otel/sdk v1.24.0/go.mod h1:KVrIYw6tEubO9E96HQpcmpTKDVn9gdv35HoYiQWGDFg= go.opentelemetry.io/otel/sdk v1.25.0/go.mod h1:oFgzCM2zdsxKzz6zwpTZYLLQsFwc+K0daArPdIhuxkw=
go.opentelemetry.io/otel/sdk/metric v1.24.0 h1:yyMQrPzF+k88/DbH7o4FMAs80puqd+9osbiBrJrz/w8= go.opentelemetry.io/otel/sdk/metric v1.24.0 h1:yyMQrPzF+k88/DbH7o4FMAs80puqd+9osbiBrJrz/w8=
go.opentelemetry.io/otel/sdk/metric v1.24.0/go.mod h1:I6Y5FjH6rvEnTTAYQz3Mmv2kl6Ek5IIrmwTLqMrrOE0= go.opentelemetry.io/otel/sdk/metric v1.24.0/go.mod h1:I6Y5FjH6rvEnTTAYQz3Mmv2kl6Ek5IIrmwTLqMrrOE0=
go.opentelemetry.io/otel/trace v1.24.0 h1:CsKnnL4dUAr/0llH9FKuc698G04IrpWV0MQA/Y1YELI= go.opentelemetry.io/otel/trace v1.25.0 h1:tqukZGLwQYRIFtSQM2u2+yfMVTgGVeqRLPUYx1Dq6RM=
go.opentelemetry.io/otel/trace v1.24.0/go.mod h1:HPc3Xr/cOApsBI154IU0OI0HJexz+aw5uPdbs3UCjNU= go.opentelemetry.io/otel/trace v1.25.0/go.mod h1:hCCs70XM/ljO+BeQkyFnbK28SBIJ/Emuha+ccrCRT7I=
go.opentelemetry.io/proto/otlp v1.1.0 h1:2Di21piLrCqJ3U3eXGCTPHE9R8Nh+0uglSnOyxikMeI= go.opentelemetry.io/proto/otlp v1.1.0 h1:2Di21piLrCqJ3U3eXGCTPHE9R8Nh+0uglSnOyxikMeI=
go.opentelemetry.io/proto/otlp v1.1.0/go.mod h1:GpBHCBWiqvVLDqmHZsoMM3C5ySeKTC7ej/RNTae6MdY= go.opentelemetry.io/proto/otlp v1.1.0/go.mod h1:GpBHCBWiqvVLDqmHZsoMM3C5ySeKTC7ej/RNTae6MdY=
go.uber.org/atomic v1.4.0/go.mod h1:gD2HeocX3+yG+ygLZcrzQJaqmWj9AIm7n08wl/qW/PE= go.uber.org/atomic v1.4.0/go.mod h1:gD2HeocX3+yG+ygLZcrzQJaqmWj9AIm7n08wl/qW/PE=

View file

@ -0,0 +1,51 @@
package xdsmgr
import (
"context"
"errors"
envoy_config_cluster_v3 "github.com/envoyproxy/go-control-plane/envoy/config/cluster/v3"
envoy_config_listener_v3 "github.com/envoyproxy/go-control-plane/envoy/config/listener/v3"
envoy_config_route_v3 "github.com/envoyproxy/go-control-plane/envoy/config/route/v3"
envoy_service_discovery_v3 "github.com/envoyproxy/go-control-plane/envoy/service/discovery/v3"
"github.com/pomerium/pomerium/internal/log"
"github.com/pomerium/pomerium/pkg/health"
"github.com/pomerium/pomerium/pkg/protoutil"
)
var (
clusterTypeURL = protoutil.GetTypeURL((*envoy_config_cluster_v3.Cluster)(nil))
listenerTypeURL = protoutil.GetTypeURL((*envoy_config_listener_v3.Listener)(nil))
routeConfigurationTypeURL = protoutil.GetTypeURL((*envoy_config_route_v3.RouteConfiguration)(nil))
)
func logNACK(req *envoy_service_discovery_v3.DeltaDiscoveryRequest) {
log.Debug(context.Background()).
Str("type-url", req.GetTypeUrl()).
Any("error-detail", req.GetErrorDetail()).
Msg("xdsmgr: nack")
health.ReportError(getHealthCheck(req.GetTypeUrl()), errors.New(req.GetErrorDetail().GetMessage()))
}
func logACK(req *envoy_service_discovery_v3.DeltaDiscoveryRequest) {
log.Debug(context.Background()).
Str("type-url", req.GetTypeUrl()).
Msg("xdsmgr: ack")
health.ReportOK(getHealthCheck(req.GetTypeUrl()))
}
func getHealthCheck(typeURL string) health.Check {
switch typeURL {
case clusterTypeURL:
return health.XDSCluster
case listenerTypeURL:
return health.XDSListener
case routeConfigurationTypeURL:
return health.XDSRouteConfiguration
default:
return health.XDSOther
}
}

View file

@ -107,26 +107,21 @@ func (mgr *Manager) DeltaAggregatedResources(
case req.GetResponseNonce() == "": case req.GetResponseNonce() == "":
// neither an ACK or a NACK // neither an ACK or a NACK
case req.GetErrorDetail() != nil: case req.GetErrorDetail() != nil:
log.Debug(ctx).
Str("type-url", req.GetTypeUrl()).
Any("error-detail", req.GetErrorDetail()).
Msg("xdsmgr: nack")
// a NACK // a NACK
// - set the client resource versions to the current resource versions // - set the client resource versions to the current resource versions
state.clientResourceVersions = make(map[string]string) state.clientResourceVersions = make(map[string]string)
for _, resource := range mgr.resources[req.GetTypeUrl()] { for _, resource := range mgr.resources[req.GetTypeUrl()] {
state.clientResourceVersions[resource.Name] = resource.Version state.clientResourceVersions[resource.Name] = resource.Version
} }
logNACK(req)
case req.GetResponseNonce() == mgr.nonce: case req.GetResponseNonce() == mgr.nonce:
log.Debug(ctx).
Str("type-url", req.GetTypeUrl()).
Msg("xdsmgr: ack")
// an ACK for the last response // an ACK for the last response
// - set the client resource versions to the current resource versions // - set the client resource versions to the current resource versions
state.clientResourceVersions = make(map[string]string) state.clientResourceVersions = make(map[string]string)
for _, resource := range mgr.resources[req.GetTypeUrl()] { for _, resource := range mgr.resources[req.GetTypeUrl()] {
state.clientResourceVersions[resource.Name] = resource.Version state.clientResourceVersions[resource.Name] = resource.Version
} }
logACK(req)
default: default:
// an ACK for a response that's not the last response // an ACK for a response that's not the last response
log.Debug(ctx). log.Debug(ctx).

View file

@ -12,7 +12,8 @@ import (
"github.com/pomerium/pomerium/internal/zero/apierror" "github.com/pomerium/pomerium/internal/zero/apierror"
connect_mux "github.com/pomerium/pomerium/internal/zero/connect-mux" connect_mux "github.com/pomerium/pomerium/internal/zero/connect-mux"
"github.com/pomerium/pomerium/internal/zero/grpcconn" "github.com/pomerium/pomerium/internal/zero/grpcconn"
"github.com/pomerium/pomerium/internal/zero/reporter" "github.com/pomerium/pomerium/internal/zero/healthcheck"
metrics_reporter "github.com/pomerium/pomerium/internal/zero/reporter"
token_api "github.com/pomerium/pomerium/internal/zero/token" token_api "github.com/pomerium/pomerium/internal/zero/token"
"github.com/pomerium/pomerium/pkg/fanout" "github.com/pomerium/pomerium/pkg/fanout"
cluster_api "github.com/pomerium/pomerium/pkg/zero/cluster" cluster_api "github.com/pomerium/pomerium/pkg/zero/cluster"
@ -23,6 +24,7 @@ import (
type API struct { type API struct {
cfg *config cfg *config
cluster cluster_api.ClientWithResponsesInterface cluster cluster_api.ClientWithResponsesInterface
telemetryConn *grpc.ClientConn
mux *connect_mux.Mux mux *connect_mux.Mux
downloadURLCache *cluster_api.URLCache downloadURLCache *cluster_api.URLCache
tokenFn func(ctx context.Context, ttl time.Duration) (string, error) tokenFn func(ctx context.Context, ttl time.Duration) (string, error)
@ -75,24 +77,31 @@ func NewAPI(ctx context.Context, opts ...Option) (*API, error) {
return nil, fmt.Errorf("error creating connect grpc client: %w", err) return nil, fmt.Errorf("error creating connect grpc client: %w", err)
} }
telemetryGRPCConn, err := grpcconn.New(ctx, cfg.otelEndpoint, func(ctx context.Context) (string, error) {
return tokenCache.GetToken(ctx, minTelemetryTokenTTL)
})
if err != nil {
return nil, fmt.Errorf("error creating OTEL exporter grpc client: %w", err)
}
return &API{ return &API{
cfg: cfg, cfg: cfg,
cluster: clusterClient, cluster: clusterClient,
mux: connect_mux.New(connect_api.NewConnectClient(connectGRPCConn)), mux: connect_mux.New(connect_api.NewConnectClient(connectGRPCConn)),
telemetryConn: telemetryGRPCConn,
downloadURLCache: cluster_api.NewURLCache(), downloadURLCache: cluster_api.NewURLCache(),
tokenFn: tokenCache.GetToken, tokenFn: tokenCache.GetToken,
}, nil }, nil
} }
// Report runs metrics reporting to the cloud // ReportMetrics runs metrics reporting to the cloud
func (api *API) Report(ctx context.Context, opts ...reporter.Option) error { func (api *API) ReportMetrics(ctx context.Context, opts ...metrics_reporter.Option) error {
conn, err := grpcconn.New(ctx, api.cfg.otelEndpoint, func(ctx context.Context) (string, error) { return metrics_reporter.Run(ctx, api.telemetryConn, opts...)
return api.tokenFn(ctx, minTelemetryTokenTTL) }
})
if err != nil { // ReportHealthChecks runs health check reporting to the cloud
return fmt.Errorf("error creating OTEL exporter grpc client: %w", err) func (api *API) ReportHealthChecks(ctx context.Context) error {
} return healthcheck.NewReporter(api.telemetryConn).Run(ctx)
return reporter.Run(ctx, conn, opts...)
} }
// Connect connects to the connect API and allows watching for changes // Connect connects to the connect API and allows watching for changes

View file

@ -7,6 +7,7 @@ import (
"fmt" "fmt"
"time" "time"
"github.com/cenkalti/backoff/v4"
"github.com/rs/zerolog" "github.com/rs/zerolog"
"golang.org/x/sync/errgroup" "golang.org/x/sync/errgroup"
@ -42,6 +43,7 @@ func Run(ctx context.Context, opts ...Option) error {
eg.Go(func() error { return run(ctx, "zero-bootstrap", c.runBootstrap) }) eg.Go(func() error { return run(ctx, "zero-bootstrap", c.runBootstrap) })
eg.Go(func() error { return run(ctx, "pomerium-core", c.runPomeriumCore) }) eg.Go(func() error { return run(ctx, "pomerium-core", c.runPomeriumCore) })
eg.Go(func() error { return run(ctx, "zero-control-loop", c.runZeroControlLoop) }) eg.Go(func() error { return run(ctx, "zero-control-loop", c.runZeroControlLoop) })
eg.Go(func() error { return run(ctx, "healh-check-reporter", c.runHealthCheckReporter) })
return eg.Wait() return eg.Wait()
} }
@ -103,13 +105,13 @@ func (c *controller) runConnect(ctx context.Context) error {
func (c *controller) runZeroControlLoop(ctx context.Context) error { func (c *controller) runZeroControlLoop(ctx context.Context) error {
return leaser.Run(ctx, c.bootstrapConfig, return leaser.Run(ctx, c.bootstrapConfig,
c.runReconciler, c.runReconcilerLeased,
c.runAnalytics, c.runAnalyticsLeased,
c.runReporter, c.runMetricsReporterLeased,
) )
} }
func (c *controller) runReconciler(ctx context.Context, client databroker.DataBrokerServiceClient) error { func (c *controller) runReconcilerLeased(ctx context.Context, client databroker.DataBrokerServiceClient) error {
ctx = log.WithContext(ctx, func(c zerolog.Context) zerolog.Context { ctx = log.WithContext(ctx, func(c zerolog.Context) zerolog.Context {
return c.Str("service", "zero-reconciler") return c.Str("service", "zero-reconciler")
}) })
@ -120,7 +122,7 @@ func (c *controller) runReconciler(ctx context.Context, client databroker.DataBr
) )
} }
func (c *controller) runAnalytics(ctx context.Context, client databroker.DataBrokerServiceClient) error { func (c *controller) runAnalyticsLeased(ctx context.Context, client databroker.DataBrokerServiceClient) error {
ctx = log.WithContext(ctx, func(c zerolog.Context) zerolog.Context { ctx = log.WithContext(ctx, func(c zerolog.Context) zerolog.Context {
return c.Str("service", "zero-analytics") return c.Str("service", "zero-analytics")
}) })
@ -134,13 +136,31 @@ func (c *controller) runAnalytics(ctx context.Context, client databroker.DataBro
return err return err
} }
func (c *controller) runReporter(ctx context.Context, client databroker.DataBrokerServiceClient) error { func (c *controller) runMetricsReporterLeased(ctx context.Context, client databroker.DataBrokerServiceClient) error {
ctx = log.WithContext(ctx, func(c zerolog.Context) zerolog.Context { ctx = log.WithContext(ctx, func(c zerolog.Context) zerolog.Context {
return c.Str("service", "zero-reporter") return c.Str("service", "zero-reporter")
}) })
return c.api.Report(ctx, return c.api.ReportMetrics(ctx,
reporter.WithCollectInterval(time.Hour), reporter.WithCollectInterval(time.Hour),
reporter.WithMetrics(analytics.Metrics(func() databroker.DataBrokerServiceClient { return client })...), reporter.WithMetrics(analytics.Metrics(func() databroker.DataBrokerServiceClient { return client })...),
) )
} }
func (c *controller) runHealthCheckReporter(ctx context.Context) error {
ctx = log.WithContext(ctx, func(c zerolog.Context) zerolog.Context {
return c.Str("service", "zero-health-check-reporter")
})
bo := backoff.NewExponentialBackOff()
bo.MaxElapsedTime = 0
return backoff.RetryNotify(
func() error {
return c.api.ReportHealthChecks(ctx)
},
backoff.WithContext(bo, ctx),
func(err error, next time.Duration) {
log.Ctx(ctx).Warn().Err(err).Dur("next", next).Msg("health check reporter backoff")
},
)
}

View file

@ -0,0 +1,126 @@
package healthcheck
import (
"context"
"fmt"
"os"
"time"
"go.opentelemetry.io/otel/attribute"
"go.opentelemetry.io/otel/codes"
"go.opentelemetry.io/otel/exporters/otlp/otlptrace"
export_grpc "go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc"
"go.opentelemetry.io/otel/sdk/resource"
trace_sdk "go.opentelemetry.io/otel/sdk/trace"
semconv "go.opentelemetry.io/otel/semconv/v1.4.0"
"go.opentelemetry.io/otel/trace"
"google.golang.org/grpc"
"github.com/pomerium/pomerium/internal/log"
"github.com/pomerium/pomerium/internal/version"
"github.com/pomerium/pomerium/pkg/health"
)
type Provider struct {
exporter *otlptrace.Exporter
tracer trace.Tracer
}
var _ health.Provider = (*Provider)(nil)
const (
shutdownTimeout = 30 * time.Second
serviceName = "pomerium-managed-core"
)
// NewReporter creates a new unstarted health check reporter
func NewReporter(
conn *grpc.ClientConn,
) *Provider {
p := new(Provider)
p.init(conn)
return p
}
func (p *Provider) Run(ctx context.Context) error {
health.SetProvider(p)
defer health.SetProvider(nil)
// we want the exporter
xc, cancel := context.WithCancel(context.WithoutCancel(ctx))
defer cancel()
err := p.exporter.Start(xc)
if err != nil {
// this should not happen for the gRPC exporter as its non-blocking
return fmt.Errorf("error starting health check exporter: %w", err)
}
<-ctx.Done()
return p.shutdown(xc)
}
func (p *Provider) shutdown(ctx context.Context) error {
ctx, cancel := context.WithTimeout(ctx, shutdownTimeout)
defer cancel()
return p.exporter.Shutdown(ctx)
}
func (p *Provider) init(conn *grpc.ClientConn) {
p.initExporter(conn)
p.initTracer()
}
func (p *Provider) initExporter(conn *grpc.ClientConn) {
p.exporter = export_grpc.NewUnstarted(export_grpc.WithGRPCConn(conn))
}
func (p *Provider) initTracer() {
processor := trace_sdk.NewBatchSpanProcessor(p.exporter)
provider := trace_sdk.NewTracerProvider(
trace_sdk.WithResource(p.getResource()),
trace_sdk.WithSampler(trace_sdk.AlwaysSample()),
trace_sdk.WithSpanProcessor(processor),
)
p.tracer = provider.Tracer(serviceName)
}
func (p *Provider) getResource() *resource.Resource {
attr := []attribute.KeyValue{
semconv.ServiceNameKey.String(serviceName),
semconv.ServiceVersionKey.String(version.FullVersion()),
}
hostname, err := os.Hostname()
if err == nil {
attr = append(attr, semconv.HostNameKey.String(hostname))
}
return resource.NewSchemaless(attr...)
}
func (p *Provider) ReportOK(check health.Check, attr ...health.Attr) {
ctx := context.Background()
log.Ctx(ctx).Debug().Str("check", string(check)).Msg("health check ok")
_, span := p.tracer.Start(ctx, string(check))
span.SetStatus(codes.Ok, "")
setAttributes(span, attr...)
span.End()
}
func (p *Provider) ReportError(check health.Check, err error, attr ...health.Attr) {
ctx := context.Background()
log.Ctx(ctx).Warn().Str("check", string(check)).Err(err).Msg("health check error")
_, span := p.tracer.Start(ctx, string(check))
span.SetStatus(codes.Error, err.Error())
setAttributes(span, attr...)
span.End()
}
func setAttributes(span trace.Span, attr ...health.Attr) {
for _, a := range attr {
span.SetAttributes(attribute.String(a.Key, a.Value))
}
}

14
pkg/health/check.go Normal file
View file

@ -0,0 +1,14 @@
package health
type Check string
const (
// XDSCluster checks whether the XDS Cluster resources were applied
XDSCluster = Check("xds.cluster")
// XDSListener checks whether the XDS Listener resources were applied
XDSListener = Check("xds.listener")
// XDSRouteConfiguration checks whether the XDS RouteConfiguration resources were applied
XDSRouteConfiguration = Check("xds.route-configuration")
// XDSOther is a catch-all for other XDS resources
XDSOther = Check("xds.other")
)

64
pkg/health/provider.go Normal file
View file

@ -0,0 +1,64 @@
package health
import (
"sync"
)
// Attr is a key-value pair that can be attached to a health check
type Attr struct {
Key string
Value string
}
// StrAttr creates a new string attribute
func StrAttr(key, value string) Attr {
return Attr{Key: key, Value: value}
}
// ReportOK reports that a check was successful
func ReportOK(check Check, attributes ...Attr) {
p := defaultProvider.Load()
if p != nil {
p.ReportOK(check, attributes...)
}
}
// ReportError reports that a check failed
func ReportError(check Check, err error, attributes ...Attr) {
p := defaultProvider.Load()
if p != nil {
p.ReportError(check, err, attributes...)
}
}
// Provider is the interface that must be implemented by a health check reporter
type Provider interface {
ReportOK(check Check, attributes ...Attr)
ReportError(check Check, err error, attributes ...Attr)
}
// SetProvider sets the health check provider
func SetProvider(p Provider) {
defaultProvider.Store(p)
}
type providerStore struct {
sync.RWMutex
provider Provider
}
func (p *providerStore) Load() Provider {
p.RLock()
defer p.RUnlock()
return p.provider
}
func (p *providerStore) Store(provider Provider) {
p.Lock()
defer p.Unlock()
p.provider = provider
}
var defaultProvider providerStore