From c19dd80fe697b37980f80657d5e48881fd1ce5b6 Mon Sep 17 00:00:00 2001 From: Denis Mishin Date: Wed, 22 Dec 2021 17:30:16 -0500 Subject: [PATCH] more idp metrics (#2842) --- docs/reference/readme.md | 32 +++- internal/identity/manager/manager.go | 5 +- internal/telemetry/metrics/info.go | 212 ++++++++++++++++++++++++++- pkg/metrics/constants.go | 28 ++++ 4 files changed, 269 insertions(+), 8 deletions(-) diff --git a/docs/reference/readme.md b/docs/reference/readme.md index bd6daa3f2..806ee197d 100644 --- a/docs/reference/readme.md +++ b/docs/reference/readme.md @@ -510,6 +510,8 @@ Expose a prometheus endpoint on the specified port. #### Pomerium Metrics Tracked +Each metric exposed by Pomerium has a `pomerium` prefix, which is omitted in the table below for brevity. + Name | Type | Description --------------------------------------------- | --------- | ----------------------------------------------------------------------- grpc_client_request_duration_ms | Histogram | GRPC client request duration by service @@ -528,21 +530,41 @@ http_server_request_duration_ms | Histogram | HTTP server request http_server_request_size_bytes | Histogram | HTTP server request size by service http_server_requests_total | Counter | Total HTTP server requests handled by service http_server_response_size_bytes | Histogram | HTTP server response size by service -pomerium_build_info | Gauge | Pomerium build metadata by git revision, service, version and goversion -pomerium_config_checksum_int64 | Gauge | Currently loaded configuration checksum by service -pomerium_config_last_reload_success | Gauge | Whether the last configuration reload succeeded by service -pomerium_config_last_reload_success_timestamp | Gauge | The timestamp of the last successful configuration reload by service +build_info | Gauge | Pomerium build metadata by git revision, service, version and goversion +config_checksum_int64 | Gauge | Currently loaded configuration checksum by service +config_last_reload_success | 
Gauge | Whether the last configuration reload succeeded by service +config_last_reload_success_timestamp | Gauge | The timestamp of the last successful configuration reload by service redis_conns | Gauge | Number of total connections in the pool redis_idle_conns | Gauge | Total number of times free connection was found in the pool redis_wait_count_total | Counter | Total number of connections waited for redis_wait_duration_ms_total | Counter | Total time spent waiting for connections storage_operation_duration_ms | Histogram | Storage operation duration by operation, result, backend and service +#### Identity Manager + +Identity manager metrics have the `pomerium_identity_manager` prefix, which is omitted in the table below for brevity. + +Name | Type | Description +--------------------------------------------- | --------- | ----------------------------------------------------------------------- +last_refresh_timestamp | Gauge | Timestamp of the last directory refresh operation. +last_user_refresh_success_timestamp | Gauge | Timestamp of the last successful user refresh. +last_user_refresh_error_timestamp | Gauge | Timestamp of the last user refresh that ended in an error. +last_user_refresh_errors | Counter | User refresh error counter. +last_user_refresh_success | Counter | User refresh success counter. +last_user_group_refresh_success_timestamp | Gauge | Timestamp of the last successful user group refresh. +last_user_group_refresh_error_timestamp | Gauge | Timestamp of the last user group refresh that ended in an error. +last_user_group_refresh_errors | Counter | User group refresh error counter. +last_user_group_refresh_success | Counter | User group refresh success counter. +last_session_refresh_success_timestamp | Gauge | Timestamp of the last successful session refresh. +last_session_refresh_error_timestamp | Gauge | Timestamp of the last session refresh that ended in an error. +last_session_refresh_errors | Counter | Session refresh error counter. +last_session_refresh_success | Counter | Session refresh success counter.
+ #### Envoy Proxy Metrics As of `v0.9`, Pomerium uses [envoy](https://www.envoyproxy.io/) for the data plane. As such, proxy related metrics are sourced from envoy, and use envoy's internal [stats data model](https://www.envoyproxy.io/docs/envoy/latest/operations/stats_overview). Please see Envoy's documentation for information about specific metrics. -All metrics coming from envoy will be labeled with `service="pomerium"` or `service="pomerium-proxy"`, depending if you're running all-in-one or distributed service mode. +All metrics coming from envoy will be labeled with `service="pomerium"` or `service="pomerium-proxy"`, depending on whether you're running in all-in-one or distributed service mode, and will have the `pomerium` prefix added to the standard envoy metric name. ### Metrics Basic Authentication diff --git a/internal/identity/manager/manager.go b/internal/identity/manager/manager.go index 7d8e6235e..dea8a91a1 100644 --- a/internal/identity/manager/manager.go +++ b/internal/identity/manager/manager.go @@ -216,6 +216,7 @@ func (mgr *Manager) refreshDirectoryUserGroups(ctx context.Context) (nextRefresh defer clearTimeout() directoryGroups, directoryUsers, err := mgr.cfg.Load().directory.UserGroups(ctx) + metrics.RecordIdentityManagerUserGroupRefresh(ctx, err) if err != nil { msg := "failed to refresh directory users and groups" if ctx.Err() != nil { @@ -234,7 +235,7 @@ func (mgr *Manager) refreshDirectoryUserGroups(ctx context.Context) (nextRefresh mgr.mergeGroups(ctx, directoryGroups) mgr.mergeUsers(ctx, directoryUsers) - metrics.RecordIdentityManagerLastRefresh() + metrics.RecordIdentityManagerLastRefresh(ctx) return mgr.cfg.Load().groupRefreshInterval } @@ -404,6 +405,7 @@ func (mgr *Manager) refreshSession(ctx context.Context, userID, sessionID string } newToken, err := mgr.cfg.Load().authenticator.Refresh(ctx, FromOAuthToken(s.OauthToken), &s) + metrics.RecordIdentityManagerSessionRefresh(ctx, err) if isTemporaryError(err) { log.Error(ctx).Err(err).
Str("user_id", s.GetUserId()). @@ -472,6 +474,7 @@ func (mgr *Manager) refreshUser(ctx context.Context, userID string) { } err := mgr.cfg.Load().authenticator.UpdateUserInfo(ctx, FromOAuthToken(s.OauthToken), &u) + metrics.RecordIdentityManagerUserRefresh(ctx, err) if isTemporaryError(err) { log.Error(ctx).Err(err). Str("user_id", s.GetUserId()). diff --git a/internal/telemetry/metrics/info.go b/internal/telemetry/metrics/info.go index 89eedcb9d..471a0bab1 100644 --- a/internal/telemetry/metrics/info.go +++ b/internal/telemetry/metrics/info.go @@ -21,6 +21,22 @@ var ( ConfigLastReloadView, ConfigLastReloadSuccessView, IdentityManagerLastRefreshView, + + IdentityManagerLastUserRefreshErrorTimestampView, + IdentityManagerLastUserRefreshErrorView, + IdentityManagerLastUserRefreshSuccessTimestampView, + IdentityManagerLastUserRefreshSuccessView, + + IdentityManagerLastUserGroupRefreshErrorTimestampView, + IdentityManagerLastUserGroupRefreshErrorView, + IdentityManagerLastUserGroupRefreshSuccessTimestampView, + IdentityManagerLastUserGroupRefreshSuccessView, + + IdentityManagerLastSessionRefreshErrorTimestampView, + IdentityManagerLastSessionRefreshErrorView, + IdentityManagerLastSessionRefreshSuccessTimestampView, + IdentityManagerLastSessionRefreshSuccessView, + ConfigDBVersionView, ConfigDBErrorsView, } @@ -49,6 +65,69 @@ var ( "seconds", ) + identityManagerLastUserRefreshSuccessTimestamp = stats.Int64( + metrics.IdentityManagerLastUserRefreshSuccessTimestamp, + "Timestamp of last successful directory user refresh success", + stats.UnitSeconds, + ) + identityManagerLastUserRefreshErrorTimestamp = stats.Int64( + metrics.IdentityManagerLastUserRefreshErrorTimestamp, + "Timestamp of last directory user refresh error", + stats.UnitSeconds, + ) + identityManagerLastUserRefreshSuccess = stats.Int64( + metrics.IdentityManagerLastUserRefreshSuccess, + "Total successful directory user refresh requests", + stats.UnitDimensionless, + ) + identityManagerLastUserRefreshError = 
stats.Int64( + metrics.IdentityManagerLastUserRefreshError, + "Total directory user refresh errors", + stats.UnitDimensionless, + ) + + identityManagerLastUserGroupRefreshSuccessTimestamp = stats.Int64( + metrics.IdentityManagerLastUserGroupRefreshSuccessTimestamp, + "Timestamp of last successful user group refresh success", + stats.UnitSeconds, + ) + identityManagerLastUserGroupRefreshErrorTimestamp = stats.Int64( + metrics.IdentityManagerLastUserGroupRefreshErrorTimestamp, + "Timestamp of last directory user group refresh error", + stats.UnitSeconds, + ) + identityManagerLastUserGroupRefreshSuccess = stats.Int64( + metrics.IdentityManagerLastUserGroupRefreshSuccess, + "Total successful directory user group refresh requests", + stats.UnitDimensionless, + ) + identityManagerLastUserGroupRefreshError = stats.Int64( + metrics.IdentityManagerLastUserGroupRefreshError, + "Total directory user group refresh errors", + stats.UnitDimensionless, + ) + + identityManagerLastSessionRefreshSuccessTimestamp = stats.Int64( + metrics.IdentityManagerLastSessionRefreshSuccessTimestamp, + "Timestamp of last successful session refresh success", + stats.UnitSeconds, + ) + identityManagerLastSessionRefreshErrorTimestamp = stats.Int64( + metrics.IdentityManagerLastSessionRefreshErrorTimestamp, + "Timestamp of last session refresh error", + stats.UnitSeconds, + ) + identityManagerLastSessionRefreshSuccess = stats.Int64( + metrics.IdentityManagerLastSessionRefreshSuccess, + "Total successful session refresh requests", + stats.UnitDimensionless, + ) + identityManagerLastSessionRefreshError = stats.Int64( + metrics.IdentityManagerLastSessionRefreshError, + "Total session refresh errors", + stats.UnitDimensionless, + ) + // ConfigDBVersionView contains last databroker config version that was processed ConfigDBVersionView = &view.View{ Name: configDBVersion.Name(), @@ -95,11 +174,140 @@ var ( Measure: identityManagerLastRefresh, Aggregation: view.LastValue(), } + +
// IdentityManagerLastUserRefreshSuccessView counts successful user refreshes. + IdentityManagerLastUserRefreshSuccessView = &view.View{ + Name: identityManagerLastUserRefreshSuccess.Name(), + Description: identityManagerLastUserRefreshSuccess.Description(), + Measure: identityManagerLastUserRefreshSuccess, + Aggregation: view.Count(), + } + // IdentityManagerLastUserRefreshErrorView counts failed user refreshes. + IdentityManagerLastUserRefreshErrorView = &view.View{ + Name: identityManagerLastUserRefreshError.Name(), + Description: identityManagerLastUserRefreshError.Description(), + Measure: identityManagerLastUserRefreshError, + Aggregation: view.Count(), + } + // IdentityManagerLastUserRefreshSuccessTimestampView holds the last recorded successful user refresh timestamp. + IdentityManagerLastUserRefreshSuccessTimestampView = &view.View{ + Name: identityManagerLastUserRefreshSuccessTimestamp.Name(), + Description: identityManagerLastUserRefreshSuccessTimestamp.Description(), + Measure: identityManagerLastUserRefreshSuccessTimestamp, + Aggregation: view.LastValue(), + } + // IdentityManagerLastUserRefreshErrorTimestampView holds the last recorded user refresh error timestamp. + IdentityManagerLastUserRefreshErrorTimestampView = &view.View{ + Name: identityManagerLastUserRefreshErrorTimestamp.Name(), + Description: identityManagerLastUserRefreshErrorTimestamp.Description(), + Measure: identityManagerLastUserRefreshErrorTimestamp, + Aggregation: view.LastValue(), + } + + // IdentityManagerLastUserGroupRefreshSuccessView counts successful user group refreshes. + IdentityManagerLastUserGroupRefreshSuccessView = &view.View{ + Name: identityManagerLastUserGroupRefreshSuccess.Name(), + Description: identityManagerLastUserGroupRefreshSuccess.Description(), + Measure: identityManagerLastUserGroupRefreshSuccess, + Aggregation: view.Count(), + } + // IdentityManagerLastUserGroupRefreshErrorView counts failed user group refreshes. + IdentityManagerLastUserGroupRefreshErrorView =
&view.View{ + Name: identityManagerLastUserGroupRefreshError.Name(), + Description: identityManagerLastUserGroupRefreshError.Description(), + Measure: identityManagerLastUserGroupRefreshError, + Aggregation: view.Count(), + } + // IdentityManagerLastUserGroupRefreshSuccessTimestampView holds the last recorded successful user group refresh timestamp. + IdentityManagerLastUserGroupRefreshSuccessTimestampView = &view.View{ + Name: identityManagerLastUserGroupRefreshSuccessTimestamp.Name(), + Description: identityManagerLastUserGroupRefreshSuccessTimestamp.Description(), + Measure: identityManagerLastUserGroupRefreshSuccessTimestamp, + Aggregation: view.LastValue(), + } + // IdentityManagerLastUserGroupRefreshErrorTimestampView holds the last recorded user group refresh error timestamp. + IdentityManagerLastUserGroupRefreshErrorTimestampView = &view.View{ + Name: identityManagerLastUserGroupRefreshErrorTimestamp.Name(), + Description: identityManagerLastUserGroupRefreshErrorTimestamp.Description(), + Measure: identityManagerLastUserGroupRefreshErrorTimestamp, + Aggregation: view.LastValue(), + } + + // IdentityManagerLastSessionRefreshSuccessView counts successful session refreshes. + IdentityManagerLastSessionRefreshSuccessView = &view.View{ + Name: identityManagerLastSessionRefreshSuccess.Name(), + Description: identityManagerLastSessionRefreshSuccess.Description(), + Measure: identityManagerLastSessionRefreshSuccess, + Aggregation: view.Count(), + } + // IdentityManagerLastSessionRefreshErrorView counts failed session refreshes; it must be built from the session error measure, not the user one. + IdentityManagerLastSessionRefreshErrorView = &view.View{ + Name: identityManagerLastSessionRefreshError.Name(), + Description: identityManagerLastSessionRefreshError.Description(), + Measure: identityManagerLastSessionRefreshError, + Aggregation: view.Count(), + } + // IdentityManagerLastSessionRefreshSuccessTimestampView holds the last recorded successful session refresh timestamp. + IdentityManagerLastSessionRefreshSuccessTimestampView = &view.View{ + Name:
identityManagerLastSessionRefreshSuccessTimestamp.Name(), + Description: identityManagerLastSessionRefreshSuccessTimestamp.Description(), + Measure: identityManagerLastSessionRefreshSuccessTimestamp, + Aggregation: view.LastValue(), + } + // IdentityManagerLastSessionRefreshErrorTimestampView holds the last recorded session refresh error timestamp. + IdentityManagerLastSessionRefreshErrorTimestampView = &view.View{ + Name: identityManagerLastSessionRefreshErrorTimestamp.Name(), + Description: identityManagerLastSessionRefreshErrorTimestamp.Description(), + Measure: identityManagerLastSessionRefreshErrorTimestamp, + Aggregation: view.LastValue(), + } ) // RecordIdentityManagerLastRefresh records that the identity manager refreshed users and groups. -func RecordIdentityManagerLastRefresh() { - stats.Record(context.Background(), identityManagerLastRefresh.M(time.Now().Unix())) +func RecordIdentityManagerLastRefresh(ctx context.Context) { + stats.Record(ctx, identityManagerLastRefresh.M(time.Now().Unix())) +} + +// RecordIdentityManagerUserRefresh records the current Unix time and a count of 1 against the user refresh success measures when err is nil, or against the error measures otherwise. +func RecordIdentityManagerUserRefresh(ctx context.Context, err error) { + counter := identityManagerLastUserRefreshSuccess + ts := identityManagerLastUserRefreshSuccessTimestamp + if err != nil { + counter = identityManagerLastUserRefreshError + ts = identityManagerLastUserRefreshErrorTimestamp + } + stats.Record(ctx, + ts.M(time.Now().Unix()), + counter.M(1), + ) +} + +// RecordIdentityManagerUserGroupRefresh records the current Unix time and a count of 1 against the user group refresh success measures when err is nil, or against the error measures otherwise. +func RecordIdentityManagerUserGroupRefresh(ctx context.Context, err error) { + counter := identityManagerLastUserGroupRefreshSuccess + ts := identityManagerLastUserGroupRefreshSuccessTimestamp + if err != nil { + counter = identityManagerLastUserGroupRefreshError + ts = identityManagerLastUserGroupRefreshErrorTimestamp + } + stats.Record(ctx, + ts.M(time.Now().Unix()), + counter.M(1), + ) +} + +// RecordIdentityManagerSessionRefresh
records the current Unix time and a count of 1 against the session refresh success measures when err is nil, or against the error measures otherwise. +func RecordIdentityManagerSessionRefresh(ctx context.Context, err error) { + counter := identityManagerLastSessionRefreshSuccess + ts := identityManagerLastSessionRefreshSuccessTimestamp + if err != nil { + counter = identityManagerLastSessionRefreshError + ts = identityManagerLastSessionRefreshErrorTimestamp + } + stats.Record(ctx, + ts.M(time.Now().Unix()), + counter.M(1), + ) } // SetDBConfigInfo records status, databroker version and error count while parsing diff --git a/pkg/metrics/constants.go b/pkg/metrics/constants.go index ecc651ee8..84a8c0741 100644 --- a/pkg/metrics/constants.go +++ b/pkg/metrics/constants.go @@ -13,6 +13,34 @@ const ( ConfigLastReloadSuccess = "config_last_reload_success" // IdentityManagerLastRefreshTimestamp is IdP sync timestamp IdentityManagerLastRefreshTimestamp = "identity_manager_last_refresh_timestamp" + + // IdentityManagerLastUserRefreshSuccessTimestamp is the metric name for the timestamp of the last successful user refresh + IdentityManagerLastUserRefreshSuccessTimestamp = "identity_manager_last_user_refresh_success_timestamp" + // IdentityManagerLastUserRefreshErrorTimestamp is the metric name for the timestamp of the last user refresh error + IdentityManagerLastUserRefreshErrorTimestamp = "identity_manager_last_user_refresh_error_timestamp" + // IdentityManagerLastUserRefreshError is the metric name for the user refresh error counter + IdentityManagerLastUserRefreshError = "identity_manager_last_user_refresh_errors" + // IdentityManagerLastUserRefreshSuccess is the metric name for the user refresh success counter + IdentityManagerLastUserRefreshSuccess = "identity_manager_last_user_refresh_success" + + // IdentityManagerLastUserGroupRefreshSuccessTimestamp is the metric name for the timestamp of the last successful user group refresh + IdentityManagerLastUserGroupRefreshSuccessTimestamp = "identity_manager_last_user_group_refresh_success_timestamp" + // IdentityManagerLastUserGroupRefreshErrorTimestamp is the metric name for the timestamp of the last user group refresh error +
IdentityManagerLastUserGroupRefreshErrorTimestamp = "identity_manager_last_user_group_refresh_error_timestamp" + // IdentityManagerLastUserGroupRefreshError is the metric name for the user group refresh error counter + IdentityManagerLastUserGroupRefreshError = "identity_manager_last_user_group_refresh_errors" + // IdentityManagerLastUserGroupRefreshSuccess is the metric name for the user group refresh success counter + IdentityManagerLastUserGroupRefreshSuccess = "identity_manager_last_user_group_refresh_success" + + // IdentityManagerLastSessionRefreshSuccessTimestamp is the metric name for the timestamp of the last successful session refresh + IdentityManagerLastSessionRefreshSuccessTimestamp = "identity_manager_last_session_refresh_success_timestamp" + // IdentityManagerLastSessionRefreshErrorTimestamp is the metric name for the timestamp of the last session refresh error + IdentityManagerLastSessionRefreshErrorTimestamp = "identity_manager_last_session_refresh_error_timestamp" + // IdentityManagerLastSessionRefreshError is the metric name for the session refresh error counter + IdentityManagerLastSessionRefreshError = "identity_manager_last_session_refresh_errors" + // IdentityManagerLastSessionRefreshSuccess is the metric name for the session refresh success counter + IdentityManagerLastSessionRefreshSuccess = "identity_manager_last_session_refresh_success" + // BuildInfo is a gauge that may be used to detect whether component is live, and also has version BuildInfo = "build_info" // PolicyCountTotal is total amount of routes currently configured