config: use stable route ids for authorize matching and order xds responses (#5618)

## Summary
Update the `RouteID` to use the `policy.ID` if it is set. This makes it
so that updated routes use a stable identifier between updates so if the
envoy control plane is updated before the authorize service's internal
definitions (or vice-versa) the authorize service will still be able to
match the route.

The current behavior results in a 404 if envoy passes the old route id.
The new behavior will result in inconsistency, but it should be quickly
remedied. To help with debugging 4 new fields were added to the
authorize check log. The `route-id` and `route-checksum` as the
authorize sees it and the `envoy-route-id` and `envoy-route-checksum` as
envoy sees it.

I also updated the way we send updates to envoy to try and model their
recommended approach:

> In general, to avoid traffic drop, sequencing of updates should follow
a make before break model, wherein:
> 
> - CDS updates (if any) must always be pushed first.
> - EDS updates (if any) must arrive after CDS updates for the
respective clusters.
> - LDS updates must arrive after corresponding CDS/EDS updates.
> - RDS updates related to the newly added listeners must arrive after
CDS/EDS/LDS updates.
> - VHDS updates (if any) related to the newly added RouteConfigurations
must arrive after RDS updates.
> - Stale CDS clusters and related EDS endpoints (ones no longer being
referenced) can then be removed.

This should help avoid 404s when configuration is being updated.

## Related issues
-
[ENG-2386](https://linear.app/pomerium/issue/ENG-2386/large-number-of-routes-leads-to-404s-and-slowness)

## Checklist
- [x] reference any related issues
- [x] updated unit tests
- [x] add appropriate label (`enhancement`, `bug`, `breaking`,
`dependencies`, `ci`)
- [x] ready for review
This commit is contained in:
Caleb Doxsey 2025-05-19 10:52:15 -06:00 committed by GitHub
parent 2f179658b6
commit 7a6d7c5a3c
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
17 changed files with 278 additions and 85 deletions

View file

@ -32,10 +32,12 @@ import (
// Request contains the inputs needed for evaluation.
type Request struct {
IsInternal bool
Policy *config.Policy
HTTP RequestHTTP
Session RequestSession
IsInternal bool
Policy *config.Policy
HTTP RequestHTTP
Session RequestSession
EnvoyRouteChecksum uint64
EnvoyRouteID string
}
// RequestHTTP is the HTTP field in the request.
@ -141,7 +143,7 @@ type Result struct {
// An Evaluator evaluates policies.
type Evaluator struct {
store *store.Store
policyEvaluators map[uint64]*PolicyEvaluator
policyEvaluators map[string]*PolicyEvaluator
headersEvaluators *HeadersEvaluator
clientCA []byte
clientCRL []byte
@ -172,7 +174,7 @@ func New(
// If there is a previous Evaluator constructed from the same settings, we
// can reuse the HeadersEvaluator along with any PolicyEvaluators for
// unchanged policies.
var cachedPolicyEvaluators map[uint64]*PolicyEvaluator
var cachedPolicyEvaluators map[string]*PolicyEvaluator
if previous != nil && previous.cfgCacheKey == e.cfgCacheKey {
e.headersEvaluators = previous.headersEvaluators
cachedPolicyEvaluators = previous.policyEvaluators
@ -188,18 +190,18 @@ func New(
}
type routeEvaluator struct {
id uint64
id string
evaluator *PolicyEvaluator
}
func getOrCreatePolicyEvaluators(
ctx context.Context, cfg *evaluatorConfig, store *store.Store,
cachedPolicyEvaluators map[uint64]*PolicyEvaluator,
) (map[uint64]*PolicyEvaluator, error) {
cachedPolicyEvaluators map[string]*PolicyEvaluator,
) (map[string]*PolicyEvaluator, error) {
now := time.Now()
var reusedCount int
m := make(map[uint64]*PolicyEvaluator)
m := make(map[string]*PolicyEvaluator)
var builders []errgrouputil.BuilderFunc[routeEvaluator]
for i := range cfg.Policies {
configPolicy := cfg.Policies[i]