From 09eab16e6efd5d093ed92b4142d65c45d98595b7 Mon Sep 17 00:00:00 2001 From: Chris Randles Date: Wed, 27 May 2026 09:43:08 -0400 Subject: [PATCH 1/8] fix+test: [healthcheck,alb] auto-probe unconfigured pool members Signed-off-by: Chris Randles --- integration/alb_missing_healthcheck_test.go | 117 ++++++++++++++++++ integration/alb_testdata_test.go | 2 +- .../alb_missing_hc/missing_hc.yaml.tmpl | 36 ++++++ pkg/backends/alb/client.go | 10 +- pkg/backends/alb/errors/errors.go | 11 ++ pkg/backends/backends.go | 49 ++++++-- pkg/backends/healthcheck/healthcheck.go | 4 +- pkg/backends/healthcheck/options/defaults.go | 6 + pkg/observability/metrics/metrics.go | 15 +++ 9 files changed, 239 insertions(+), 11 deletions(-) create mode 100644 integration/alb_missing_healthcheck_test.go create mode 100644 integration/testdata/alb_missing_hc/missing_hc.yaml.tmpl diff --git a/integration/alb_missing_healthcheck_test.go b/integration/alb_missing_healthcheck_test.go new file mode 100644 index 000000000..ef1ff2645 --- /dev/null +++ b/integration/alb_missing_healthcheck_test.go @@ -0,0 +1,117 @@ +/* + * Copyright 2018 The Trickster Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package integration + +import ( + "context" + "fmt" + "net/http" + "net/http/httptest" + "net/url" + "os" + "path/filepath" + "sync/atomic" + "testing" + "time" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + "github.com/trickstercache/trickster/v2/integration/internal/portutil" +) + +// TestALBPoolMemberWithoutHealthcheckNotRouted verifies that a non-virtual ALB +// pool member configured without a `healthcheck:` block does not receive +// fanout traffic when its upstream is unhealthy. The provider's default +// healthcheck must be auto-applied so the pool's dispatch-time filter can +// observe Failing status and exclude the member. +func TestALBPoolMemberWithoutHealthcheckNotRouted(t *testing.T) { + healthyResp := albTestdata(t, "alb_unavail/healthy.json") + + healthy := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "application/json") + w.WriteHeader(http.StatusOK) + switch r.URL.Path { + case "/api/v1/status/buildinfo": + fmt.Fprint(w, `{"status":"success","data":{"version":"2.0"}}`) + case "/api/v1/query": + fmt.Fprint(w, `{"status":"success","data":{"resultType":"vector","result":[]}}`) + default: + fmt.Fprint(w, healthyResp) + } + })) + t.Cleanup(healthy.Close) + + var brokenDataHits atomic.Int64 + broken := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + if r.URL.Path == "/api/v1/query_range" { + brokenDataHits.Add(1) + } + w.WriteHeader(http.StatusInternalServerError) + })) + t.Cleanup(broken.Close) + + ports, release := portutil.Reserve(t, 3) + frontPort, metricsPort, mgmtPort := ports[0], ports[1], ports[2] + + yaml := fmt.Sprintf(albTestdata(t, "alb_missing_hc/missing_hc.yaml.tmpl"), + frontPort, metricsPort, mgmtPort, healthy.URL, broken.URL) + + cfgPath := filepath.Join(t.TempDir(), "trickster.yaml") + require.NoError(t, os.WriteFile(cfgPath, []byte(yaml), 0644)) + + ctx, cancel := context.WithCancel(context.Background()) + t.Cleanup(cancel) + release() + go startTrickster(t, ctx, expectedStartError{}, "-config", cfgPath) + + metricsAddr := fmt.Sprintf("127.0.0.1:%d", metricsPort) + waitForTrickster(t, metricsAddr) + + // Give the auto-applied probe time to fire, trip the broken backend to + // Failing, and let the ALB pool refresh its healthy snapshot. + time.Sleep(2 * time.Second) + dataBaseline := brokenDataHits.Load() + + frontAddr := fmt.Sprintf("127.0.0.1:%d", frontPort) + const reqs = 50 + now := time.Now().Unix() + for i := range reqs { + // query_range is mergeable in TSM and therefore triggers fanout to + // every healthy pool member. Use unique query strings so the cache + // doesn't short-circuit subsequent requests. + q := url.Values{ + "query": {fmt.Sprintf("up + 0*%d", i)}, + "start": {fmt.Sprintf("%d", now-300)}, + "end": {fmt.Sprintf("%d", now)}, + "step": {"15"}, + } + u := fmt.Sprintf("http://%s/alb-tsm-test/api/v1/query_range?%s", + frontAddr, q.Encode()) + resp, err := http.Get(u) + require.NoError(t, err) + resp.Body.Close() + } + + // Allow a small drain window in case any in-flight fanout slot was still + // dispatching when the assertion fired. + require.EventuallyWithT(t, func(collect *assert.CollectT) { + dataDelta := brokenDataHits.Load() - dataBaseline + assert.Equalf(collect, int64(0), dataDelta, + "broken backend received %d data requests via the ALB after probe transitioned it to Failing", + dataDelta) + }, 3*time.Second, 200*time.Millisecond) +} diff --git a/integration/alb_testdata_test.go b/integration/alb_testdata_test.go index 308658135..d28f3b31d 100644 --- a/integration/alb_testdata_test.go +++ b/integration/alb_testdata_test.go @@ -21,7 +21,7 @@ import ( "testing" ) -//go:embed testdata/alb_cache testdata/alb_tsm_correctness testdata/alb_response_headers testdata/alb_nested testdata/alb_per_path testdata/alb_unavail testdata/alb_request_headers testdata/alb_compose +//go:embed testdata/alb_cache testdata/alb_tsm_correctness testdata/alb_response_headers testdata/alb_nested testdata/alb_per_path testdata/alb_unavail testdata/alb_missing_hc testdata/alb_request_headers testdata/alb_compose var albTestdataFS embed.FS func albTestdata(t testing.TB, name string) string { diff --git a/integration/testdata/alb_missing_hc/missing_hc.yaml.tmpl b/integration/testdata/alb_missing_hc/missing_hc.yaml.tmpl new file mode 100644 index 000000000..dc2de70c5 --- /dev/null +++ b/integration/testdata/alb_missing_hc/missing_hc.yaml.tmpl @@ -0,0 +1,36 @@ + +frontend: + listen_port: %d +metrics: + listen_port: %d +mgmt: + listen_port: %d +logging: + log_level: warn +caches: + mem1: + provider: memory +backends: + prom-healthy: + provider: prometheus + origin_url: %s + cache_name: mem1 + healthcheck: + path: /api/v1/status/buildinfo + query: "" + interval: 100ms + timeout: 500ms + failure_threshold: 1 + recovery_threshold: 1 + prom-broken: + provider: prometheus + origin_url: %s + cache_name: mem1 + alb-tsm-test: + provider: alb + alb: + mechanism: tsm + output_format: prometheus + pool: + - prom-healthy + - prom-broken diff --git a/pkg/backends/alb/client.go b/pkg/backends/alb/client.go index ae09f6038..e3ade0789 100644 --- a/pkg/backends/alb/client.go +++ b/pkg/backends/alb/client.go @@ -171,7 +171,15 @@ func (c *Client) ValidateAndStartPool(clients backends.Backends, hcs healthcheck } hc, ok := hcs[n] if !ok { - // virtual backends (rule, alb) have no health checks; treat as passing + // Only virtual backends (rule, alb) may legitimately be absent from + // the healthcheck registry. A non-virtual miss means the operator + // did not configure healthcheck on this backend and the upstream + // auto-apply path also declined to register a probe; fabricating a + // dummy Status here would silently route fanout traffic regardless + // of upstream health. + if !backends.IsVirtual(tc.Configuration().Provider) { + return alberr.NewErrMissingHealthCheck(c.Name(), n) + } hc = healthcheck.NewStatus(n, "virtual", "", healthcheck.StatusPassing, time.Time{}, nil) } targets = append(targets, pool.NewTarget(tc.Router(), hc, tc)) diff --git a/pkg/backends/alb/errors/errors.go b/pkg/backends/alb/errors/errors.go index 8335156ac..5755c69c0 100644 --- a/pkg/backends/alb/errors/errors.go +++ b/pkg/backends/alb/errors/errors.go @@ -63,3 +63,14 @@ func NewErrInvalidUserRouterCreds(albName string) error { albName), } } + +// NewErrMissingHealthCheck reports a non-virtual ALB pool member that has no +// registered healthcheck Status. Without this guard the pool would route +// traffic to the member regardless of upstream health. +func NewErrMissingHealthCheck(albName, poolMemberName string) error { + return &InvalidALBOptionsError{ + error: fmt.Errorf("alb [%s] non-virtual pool member [%s] has no health check; "+ + "configure healthcheck options on the backend", + albName, poolMemberName), + } +} diff --git a/pkg/backends/backends.go b/pkg/backends/backends.go index 28f97de06..074a50fc1 100644 --- a/pkg/backends/backends.go +++ b/pkg/backends/backends.go @@ -21,8 +21,12 @@ import ( "net/http" "github.com/trickstercache/trickster/v2/pkg/backends/healthcheck" + ho "github.com/trickstercache/trickster/v2/pkg/backends/healthcheck/options" bo "github.com/trickstercache/trickster/v2/pkg/backends/options" "github.com/trickstercache/trickster/v2/pkg/backends/providers" + "github.com/trickstercache/trickster/v2/pkg/observability/logging" + "github.com/trickstercache/trickster/v2/pkg/observability/logging/logger" + "github.com/trickstercache/trickster/v2/pkg/observability/metrics" ) // Backends represents a map of Backends keyed by Name @@ -40,20 +44,44 @@ func (b Backends) StartHealthChecks(knownStatuses healthcheck.StatusLookup) (hea } if IsVirtual(bo.Provider) { // Virtual backends have no upstream to probe; register a synthetic - // passing status so nested ALBs surface in the health page and in - // outer pools' availablePoolMembers (issue #996). + // passing status so they surface in the health page and in outer + // ALB pool reporting. hc.RegisterVirtual(k, bo.Provider) continue } hco := bo.HealthCheck - if hco == nil { + def := c.DefaultHealthCheckConfig() + if hco == nil && def == nil { + // no probe possible for this provider; the ALB pool will reject + // this backend as a non-virtual member with no Status. continue } - bo.HealthCheck = c.DefaultHealthCheckConfig() - if bo.HealthCheck == nil { - bo.HealthCheck = hco + if def != nil { + bo.HealthCheck = def + if hco != nil { + bo.HealthCheck.Overlay(hco) + } } else { - bo.HealthCheck.Overlay(hco) + bo.HealthCheck = hco + } + var autoApplied bool + if bo.HealthCheck.Interval <= 0 { + // The operator did not configure a probe interval. A registered + // target with Interval == 0 never ticks, so its Status sticks at + // StatusUnchecked and the ALB pool's dispatch-time filter keeps + // routing fanout traffic regardless of upstream health. Apply a + // fast auto-default and surface that we did so. + bo.HealthCheck.Interval = ho.DefaultAutoProbeInterval + if bo.HealthCheck.FailureThreshold <= 0 { + bo.HealthCheck.FailureThreshold = 1 + } + if bo.HealthCheck.RecoveryThreshold <= 0 { + bo.HealthCheck.RecoveryThreshold = 1 + } + autoApplied = true + metrics.BackendsDefaultHealthCheckApplied.WithLabelValues(k, bo.Provider).Inc() + logger.Warn("auto-applied default healthcheck for backend", + logging.Pairs{"backend_name": k, "provider": bo.Provider}) } st, err := hc.Register(k, bo.Provider, bo.HealthCheck, c.HealthCheckHTTPClient()) if err != nil { @@ -63,6 +91,13 @@ func (b Backends) StartHealthChecks(knownStatuses healthcheck.StatusLookup) (hea if v := oldSt.Get(); v != healthcheck.StatusInitializing { st.Set(v) } + } else if autoApplied { + // Override the registration-time Initializing with Unchecked so the + // pool's dispatch filter accepts the target during the brief window + // before the first auto-probe completes. Targets the operator + // explicitly configured keep Initializing so they're held out + // until the probe confirms them. + st.Set(healthcheck.StatusUnchecked) } c.SetHealthCheckProbe(st.Prober()) } diff --git a/pkg/backends/healthcheck/healthcheck.go b/pkg/backends/healthcheck/healthcheck.go index 6c70f426f..5cb2ad01a 100644 --- a/pkg/backends/healthcheck/healthcheck.go +++ b/pkg/backends/healthcheck/healthcheck.go @@ -32,8 +32,8 @@ type HealthChecker interface { // Register a health check Target Register(name string, description string, options *ho.Options, client *http.Client) (*Status, error) // RegisterVirtual records a synthetic always-passing Status for a virtual - // backend (rule, alb) that has no upstream to probe. Without this, - // nested ALBs render as "nc" on the health page (issue #996). + // backend (rule, alb) that has no upstream to probe, so it surfaces in + // the health page and in outer ALB pool reporting. RegisterVirtual(name, description string) *Status // Remove a health check Target Unregister(name string) diff --git a/pkg/backends/healthcheck/options/defaults.go b/pkg/backends/healthcheck/options/defaults.go index 8535bca85..5856cfc6c 100644 --- a/pkg/backends/healthcheck/options/defaults.go +++ b/pkg/backends/healthcheck/options/defaults.go @@ -23,4 +23,10 @@ import ( const ( // DefaultHealthCheckTimeout is the default duration for health check probes to wait before timing out DefaultHealthCheckTimeout = 3 * time.Second + + // DefaultAutoProbeInterval is applied when StartHealthChecks auto-installs + // a provider's default health-check config because the operator did not + // configure one. The auto-applied probe must fire on a tick or the + // downstream pool filter will never see a status transition. + DefaultAutoProbeInterval = 5 * time.Second ) diff --git a/pkg/observability/metrics/metrics.go b/pkg/observability/metrics/metrics.go index 010ef87c3..54ef8f0ae 100644 --- a/pkg/observability/metrics/metrics.go +++ b/pkg/observability/metrics/metrics.go @@ -453,6 +453,20 @@ var ( }, []string{"backend_name"}, ) + + // BackendsDefaultHealthCheckApplied counts backends for which the provider's + // DefaultHealthCheckConfig was auto-installed by StartHealthChecks because + // the operator did not configure healthcheck options. A non-zero value + // flags configs that depended on an absent probe and benefit from auditing. + BackendsDefaultHealthCheckApplied = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Namespace: metricNamespace, + Subsystem: healthSubsystem, + Name: "default_config_applied_total", + Help: "Count of backends that received an auto-applied default healthcheck config because none was configured.", + }, + []string{"backend_name", "provider"}, + ) ) func init() { @@ -478,6 +492,7 @@ func init() { prometheus.MustRegister(CacheIndexPanicRecovered) prometheus.MustRegister(HealthHandlerPanicRecovered) prometheus.MustRegister(HealthcheckStatusNotifyPanicRecovered) + prometheus.MustRegister(BackendsDefaultHealthCheckApplied) prometheus.MustRegister(CacheObjectOperations) prometheus.MustRegister(CacheByteOperations) prometheus.MustRegister(CacheEvents) From 18f777eb49a3beb25cc1595327ede6693e9d1c57 Mon Sep 17 00:00:00 2001 From: Chris Randles Date: Wed, 27 May 2026 10:37:21 -0400 Subject: [PATCH 2/8] test: demonstrate dpc chunk test wallclock regression Signed-off-by: Chris Randles From dc0c4c507c78b74485ad3f4ffe1024b5df6471e6 Mon Sep 17 00:00:00 2001 From: Chris Randles Date: Wed, 27 May 2026 10:46:10 -0400 Subject: [PATCH 3/8] test: [proxy/engines] anchor chunk-test reference time to wallclock Signed-off-by: Chris Randles --- .../engines/deltaproxycache_chunk_test.go | 43 ++++++++++++++++--- 1 file changed, 38 insertions(+), 5 deletions(-) diff --git a/pkg/proxy/engines/deltaproxycache_chunk_test.go b/pkg/proxy/engines/deltaproxycache_chunk_test.go index 85c30202b..46f9f2686 100644 --- a/pkg/proxy/engines/deltaproxycache_chunk_test.go +++ b/pkg/proxy/engines/deltaproxycache_chunk_test.go @@ -29,6 +29,32 @@ import ( "github.com/trickstercache/trickster/v2/pkg/timeseries" ) +// DeltaProxyCacheRequest rejects requests whose extent ends before +// now - (step * TimeseriesRetention) by routing them straight through +// DoProxy (engine=HTTPProxy, status=proxy-only). With default +// TimeseriesRetention=1024 and step=1h that window is ~42.67d, measured +// against real wallclock. The two helpers below anchor each test's +// reference 'now' to a recent chunk bucket so the queries stay inside +// that window regardless of when the test runs, while preserving the +// in-bucket vs cross-bucket topologies the tests need to exercise. +// +// chunkSize = step * cache/options/defaults.DefaultTimeseriesChunkFactor (420). + +func chunkBucketAnchor(step time.Duration) time.Time { + chunkSize := step * 420 + // midpoint of the most-recently-completed chunk bucket; queries within + // ±200h of this point stay inside one bucket. + return time.Now().Add(-chunkSize).Truncate(chunkSize).Add(chunkSize / 2) +} + +func chunkBucketStraddle(step time.Duration) time.Time { + chunkSize := step * 420 + // 11h after a chunk-bucket boundary; the Q1 (-30h..-12h) and + // Q2 (-10h..-8h) extents used by the cross-bucket tests fall on + // opposite sides of the boundary just before 'now'. + return time.Now().Add(-chunkSize).Truncate(chunkSize).Add(11 * time.Hour) +} + func TestDeltaProxyCacheRequestMissThenHitChunksChunks(t *testing.T) { ts, w, r, rsc, err := setupTestHarnessDPC() rsc.CacheConfig.UseCacheChunking = true @@ -608,8 +634,12 @@ func TestDeltaProxyCacheRequestRangeMissChunks(t *testing.T) { step := time.Duration(3600) * time.Second - // fixed to keep all three queries in the same 17.5d chunk bucket; see the _CrossBucket test - now := time.Date(2026, 4, 15, 12, 0, 0, 0, time.UTC) + // anchor 'now' to the midpoint of a recently-completed 17.5d chunk bucket + // so all three queries land in the same bucket while staying inside the + // DPC eviction retention window (step * TimeseriesRetention = 42.67d at + // defaults). A fixed calendar date would eventually drift outside that + // window relative to wallclock and trip the proxy-only bypass. + now := chunkBucketAnchor(step) end := now.Add(-time.Duration(12) * time.Hour) extr := timeseries.Extent{Start: end.Add(-time.Duration(18) * time.Hour), End: end} @@ -756,8 +786,11 @@ func TestDeltaProxyCacheRequestRangeMissChunks_CrossBucket(t *testing.T) { step := time.Duration(3600) * time.Second - // kmiss write in bucket ending 2026-04-20T00:00Z; high-end read in next bucket - now := time.Date(2026, 4, 20, 11, 0, 0, 0, time.UTC) + // position 'now' 11h after a chunk-bucket boundary so the Q1 kmiss write + // (now-30h..now-12h) lands in the prior bucket and the Q2 high-end read + // (now-10h..now-8h) lands in the next bucket, exercising the cross-bucket + // recovery path while staying inside DPC eviction retention. + now := chunkBucketStraddle(step) end := now.Add(-time.Duration(12) * time.Hour) extr := timeseries.Extent{Start: end.Add(-time.Duration(18) * time.Hour), End: end} @@ -837,7 +870,7 @@ func TestDeltaProxyCacheRequestRangeMissChunks_CrossBucketPreservesPriorChunks(t o.FastForwardDisable = true step := time.Duration(3600) * time.Second - now := time.Date(2026, 4, 20, 11, 0, 0, 0, time.UTC) + now := chunkBucketStraddle(step) runQuery := func(start, end time.Time) *http.Response { u := r.URL From 4c61fd597d9426a0de394a2b7eea058768a104c2 Mon Sep 17 00:00:00 2001 From: Chris Randles Date: Wed, 27 May 2026 10:59:26 -0400 Subject: [PATCH 4/8] test: [proxy/engines] pin retention so fixed chunk fixtures clear wallclock gate Signed-off-by: Chris Randles --- .../engines/deltaproxycache_chunk_test.go | 46 ++++--------------- 1 file changed, 8 insertions(+), 38 deletions(-) diff --git a/pkg/proxy/engines/deltaproxycache_chunk_test.go b/pkg/proxy/engines/deltaproxycache_chunk_test.go index 46f9f2686..23d442b71 100644 --- a/pkg/proxy/engines/deltaproxycache_chunk_test.go +++ b/pkg/proxy/engines/deltaproxycache_chunk_test.go @@ -29,32 +29,6 @@ import ( "github.com/trickstercache/trickster/v2/pkg/timeseries" ) -// DeltaProxyCacheRequest rejects requests whose extent ends before -// now - (step * TimeseriesRetention) by routing them straight through -// DoProxy (engine=HTTPProxy, status=proxy-only). With default -// TimeseriesRetention=1024 and step=1h that window is ~42.67d, measured -// against real wallclock. The two helpers below anchor each test's -// reference 'now' to a recent chunk bucket so the queries stay inside -// that window regardless of when the test runs, while preserving the -// in-bucket vs cross-bucket topologies the tests need to exercise. -// -// chunkSize = step * cache/options/defaults.DefaultTimeseriesChunkFactor (420). - -func chunkBucketAnchor(step time.Duration) time.Time { - chunkSize := step * 420 - // midpoint of the most-recently-completed chunk bucket; queries within - // ±200h of this point stay inside one bucket. - return time.Now().Add(-chunkSize).Truncate(chunkSize).Add(chunkSize / 2) -} - -func chunkBucketStraddle(step time.Duration) time.Time { - chunkSize := step * 420 - // 11h after a chunk-bucket boundary; the Q1 (-30h..-12h) and - // Q2 (-10h..-8h) extents used by the cross-bucket tests fall on - // opposite sides of the boundary just before 'now'. - return time.Now().Add(-chunkSize).Truncate(chunkSize).Add(11 * time.Hour) -} - func TestDeltaProxyCacheRequestMissThenHitChunksChunks(t *testing.T) { ts, w, r, rsc, err := setupTestHarnessDPC() rsc.CacheConfig.UseCacheChunking = true @@ -631,15 +605,12 @@ func TestDeltaProxyCacheRequestRangeMissChunks(t *testing.T) { rsc.CacheConfig.Provider = "test" o.FastForwardDisable = true + o.TimeseriesRetention = 1 << 20 // pin against wallclock drift on fixed fixture step := time.Duration(3600) * time.Second - // anchor 'now' to the midpoint of a recently-completed 17.5d chunk bucket - // so all three queries land in the same bucket while staying inside the - // DPC eviction retention window (step * TimeseriesRetention = 42.67d at - // defaults). A fixed calendar date would eventually drift outside that - // window relative to wallclock and trip the proxy-only bypass. - now := chunkBucketAnchor(step) + // fixed to keep all three queries in the same 17.5d chunk bucket; see the _CrossBucket test + now := time.Date(2026, 4, 15, 12, 0, 0, 0, time.UTC) end := now.Add(-time.Duration(12) * time.Hour) extr := timeseries.Extent{Start: end.Add(-time.Duration(18) * time.Hour), End: end} @@ -783,14 +754,12 @@ func TestDeltaProxyCacheRequestRangeMissChunks_CrossBucket(t *testing.T) { o := rsc.BackendOptions rsc.CacheConfig.Provider = "test" o.FastForwardDisable = true + o.TimeseriesRetention = 1 << 20 // pin against wallclock drift on fixed fixture step := time.Duration(3600) * time.Second - // position 'now' 11h after a chunk-bucket boundary so the Q1 kmiss write - // (now-30h..now-12h) lands in the prior bucket and the Q2 high-end read - // (now-10h..now-8h) lands in the next bucket, exercising the cross-bucket - // recovery path while staying inside DPC eviction retention. - now := chunkBucketStraddle(step) + // kmiss write in bucket ending 2026-04-20T00:00Z; high-end read in next bucket + now := time.Date(2026, 4, 20, 11, 0, 0, 0, time.UTC) end := now.Add(-time.Duration(12) * time.Hour) extr := timeseries.Extent{Start: end.Add(-time.Duration(18) * time.Hour), End: end} @@ -868,9 +837,10 @@ func TestDeltaProxyCacheRequestRangeMissChunks_CrossBucketPreservesPriorChunks(t o := rsc.BackendOptions rsc.CacheConfig.Provider = "test" o.FastForwardDisable = true + o.TimeseriesRetention = 1 << 20 // pin against wallclock drift on fixed fixture step := time.Duration(3600) * time.Second - now := chunkBucketStraddle(step) + now := time.Date(2026, 4, 20, 11, 0, 0, 0, time.UTC) runQuery := func(start, end time.Time) *http.Response { u := r.URL From baba35b8e088099aa502673de935e61834969224 Mon Sep 17 00:00:00 2001 From: Chris Randles Date: Wed, 27 May 2026 11:23:00 -0400 Subject: [PATCH 5/8] test: [integration/engines] ignore auto-applied healthcheck probes in origin counter Signed-off-by: Chris Randles --- integration/engines_test.go | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/integration/engines_test.go b/integration/engines_test.go index c39782cac..e2f9e62ac 100644 --- a/integration/engines_test.go +++ b/integration/engines_test.go @@ -70,6 +70,14 @@ func engineSetup(t *testing.T) *engineFakeOrigin { srv := &httptest.Server{ Listener: ln, Config: &http.Server{Handler: http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + // auto-applied prometheus healthcheck probes `/api/v1/query?query=up`; + // short-circuit them so per-test counters only see data traffic + if r.URL.Path == "/api/v1/query" && r.URL.Query().Get("query") == "up" { + w.Header().Set("Content-Type", "application/json") + w.WriteHeader(http.StatusOK) + _, _ = io.WriteString(w, `{"status":"success","data":{"resultType":"vector","result":[]}}`) + return + } o.mu.Lock() h := o.handler o.mu.Unlock() From 1ef6e30fdf77104e87371f61416c3ca95ede5979 Mon Sep 17 00:00:00 2001 From: Chris Randles Date: Wed, 27 May 2026 13:03:34 -0400 Subject: [PATCH 6/8] fix: [alb] warn + metric when healthy_floor admits failing members Signed-off-by: Chris Randles --- integration/alb_missing_healthcheck_test.go | 54 +++++++++++++++++++ .../alb_missing_hc/floor_warn.yaml.tmpl | 52 ++++++++++++++++++ pkg/backends/alb/client.go | 17 ++++++ pkg/observability/metrics/metrics.go | 16 ++++++ 4 files changed, 139 insertions(+) create mode 100644 integration/testdata/alb_missing_hc/floor_warn.yaml.tmpl diff --git a/integration/alb_missing_healthcheck_test.go b/integration/alb_missing_healthcheck_test.go index ef1ff2645..1d652bb1a 100644 --- a/integration/alb_missing_healthcheck_test.go +++ b/integration/alb_missing_healthcheck_test.go @@ -24,6 +24,7 @@ import ( "net/url" "os" "path/filepath" + "strings" "sync/atomic" "testing" "time" @@ -115,3 +116,56 @@ func TestALBPoolMemberWithoutHealthcheckNotRouted(t *testing.T) { dataDelta) }, 3*time.Second, 200*time.Millisecond) } + +// TestALBHealthyFloorAdmitsFailingMetric verifies the warning surface for an +// ALB whose healthy_floor admits Failing members. An operator who lowered +// healthy_floor below 0 to keep traffic flowing during the Initializing +// startup window also admits members the probe has confirmed broken; the +// `trickster_alb_pool_admits_failing` gauge surfaces that misconfiguration. +func TestALBHealthyFloorAdmitsFailingMetric(t *testing.T) { + healthy := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "application/json") + w.WriteHeader(http.StatusOK) + fmt.Fprint(w, `{"status":"success","data":{"version":"2.0"}}`) + })) + t.Cleanup(healthy.Close) + broken := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(http.StatusInternalServerError) + })) + t.Cleanup(broken.Close) + + ports, release := portutil.Reserve(t, 3) + frontPort, metricsPort, mgmtPort := ports[0], ports[1], ports[2] + + yaml := fmt.Sprintf(albTestdata(t, "alb_missing_hc/floor_warn.yaml.tmpl"), + frontPort, metricsPort, mgmtPort, healthy.URL, broken.URL) + cfgPath := filepath.Join(t.TempDir(), "trickster.yaml") + require.NoError(t, os.WriteFile(cfgPath, []byte(yaml), 0644)) + + ctx, cancel := context.WithCancel(context.Background()) + t.Cleanup(cancel) + release() + go startTrickster(t, ctx, expectedStartError{}, "-config", cfgPath) + + metricsAddr := fmt.Sprintf("127.0.0.1:%d", metricsPort) + waitForTrickster(t, metricsAddr) + + require.EventuallyWithT(t, func(collect *assert.CollectT) { + lines := checkTricksterMetrics(t, metricsAddr) + var admits, excludes string + for _, l := range lines { + if strings.HasPrefix(l, "trickster_alb_pool_admits_failing{") { + if strings.Contains(l, `backend_name="alb-admits-failing"`) { + admits = l + } + if strings.Contains(l, `backend_name="alb-excludes-failing"`) { + excludes = l + } + } + } + assert.True(collect, strings.HasSuffix(admits, " 1"), + "alb-admits-failing must report 1: %q", admits) + assert.True(collect, strings.HasSuffix(excludes, " 0"), + "alb-excludes-failing must report 0: %q", excludes) + }, 5*time.Second, 200*time.Millisecond) +} diff --git a/integration/testdata/alb_missing_hc/floor_warn.yaml.tmpl b/integration/testdata/alb_missing_hc/floor_warn.yaml.tmpl new file mode 100644 index 000000000..d11121ee8 --- /dev/null +++ b/integration/testdata/alb_missing_hc/floor_warn.yaml.tmpl @@ -0,0 +1,52 @@ + +frontend: + listen_port: %d +metrics: + listen_port: %d +mgmt: + listen_port: %d +logging: + log_level: warn +caches: + mem1: + provider: memory +backends: + prom-healthy: + provider: prometheus + origin_url: %s + cache_name: mem1 + healthcheck: + path: /api/v1/status/buildinfo + query: "" + interval: 100ms + timeout: 500ms + failure_threshold: 1 + recovery_threshold: 1 + prom-broken: + provider: prometheus + origin_url: %s + cache_name: mem1 + healthcheck: + path: /api/v1/status/buildinfo + query: "" + interval: 100ms + timeout: 500ms + failure_threshold: 1 + recovery_threshold: 1 + alb-admits-failing: + provider: alb + alb: + mechanism: tsm + output_format: prometheus + healthy_floor: -1 + pool: + - prom-healthy + - prom-broken + alb-excludes-failing: + provider: alb + alb: + mechanism: tsm + output_format: prometheus + pool: + - prom-healthy + - prom-broken diff --git a/pkg/backends/alb/client.go b/pkg/backends/alb/client.go index e3ade0789..fc68d552b 100644 --- a/pkg/backends/alb/client.go +++ b/pkg/backends/alb/client.go @@ -34,6 +34,9 @@ import ( rt "github.com/trickstercache/trickster/v2/pkg/backends/providers/registry/types" "github.com/trickstercache/trickster/v2/pkg/cache" "github.com/trickstercache/trickster/v2/pkg/errors" + "github.com/trickstercache/trickster/v2/pkg/observability/logging" + "github.com/trickstercache/trickster/v2/pkg/observability/logging/logger" + "github.com/trickstercache/trickster/v2/pkg/observability/metrics" authopt "github.com/trickstercache/trickster/v2/pkg/proxy/authenticator/options" authreg "github.com/trickstercache/trickster/v2/pkg/proxy/authenticator/registry" "github.com/trickstercache/trickster/v2/pkg/proxy/handlers" @@ -187,6 +190,20 @@ func (c *Client) ValidateAndStartPool(clients backends.Backends, hcs healthcheck if pm, ok := c.handler.(types.PoolMechanism); ok { pm.SetPool(pool.New(targets, o.HealthyFloor)) } + if o.HealthyFloor <= int(healthcheck.StatusFailing) { + // floor admits members whose probe has confirmed them down; operators + // who lowered the floor to keep traffic flowing during the startup + // Initializing window may not realize Failing slips in too. + metrics.ALBPoolAdmitsFailing.WithLabelValues(c.Name()).Set(1) + logger.Warn("alb healthy_floor admits members in Failing state", + logging.Pairs{ + "backend_name": c.Name(), + "healthy_floor": o.HealthyFloor, + "hint": "set healthy_floor: 0 to exclude probed-failing members", + }) + } else { + metrics.ALBPoolAdmitsFailing.WithLabelValues(c.Name()).Set(0) + } return nil } diff --git a/pkg/observability/metrics/metrics.go b/pkg/observability/metrics/metrics.go index 54ef8f0ae..cf2f31e8f 100644 --- a/pkg/observability/metrics/metrics.go +++ b/pkg/observability/metrics/metrics.go @@ -467,6 +467,21 @@ var ( }, []string{"backend_name", "provider"}, ) + + // ALBPoolAdmitsFailing flags ALB pools whose healthy_floor admits a Failing + // status (floor <= StatusFailing). Operators who set floor below 0 to keep + // traffic flowing during the Initializing window may not realize they're + // also admitting members the probe has confirmed broken; the gauge surfaces + // that misconfiguration without spamming logs. + ALBPoolAdmitsFailing = prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Namespace: metricNamespace, + Subsystem: albSubsystem, + Name: "pool_admits_failing", + Help: "1 when an ALB pool's healthy_floor admits members in Failing state; 0 otherwise.", + }, + []string{"backend_name"}, + ) ) func init() { @@ -493,6 +508,7 @@ func init() { prometheus.MustRegister(HealthHandlerPanicRecovered) prometheus.MustRegister(HealthcheckStatusNotifyPanicRecovered) prometheus.MustRegister(BackendsDefaultHealthCheckApplied) + prometheus.MustRegister(ALBPoolAdmitsFailing) prometheus.MustRegister(CacheObjectOperations) prometheus.MustRegister(CacheByteOperations) prometheus.MustRegister(CacheEvents) From 6c24f39fdfc6e1566f5261000b6db655665858a1 Mon Sep 17 00:00:00 2001 From: Chris Randles Date: Wed, 27 May 2026 13:36:51 -0400 Subject: [PATCH 7/8] docs: [alb,metrics] document healthy_floor pitfall and new admits-failing gauge Signed-off-by: Chris Randles --- docs/alb.md | 2 ++ docs/metrics.md | 4 ++++ pkg/backends/alb/options/options.go | 14 ++++++++------ 3 files changed, 14 insertions(+), 6 deletions(-) diff --git a/docs/alb.md b/docs/alb.md index 562aeb412..cd391db0a 100644 --- a/docs/alb.md +++ b/docs/alb.md @@ -447,6 +447,8 @@ Each ALB has a configurable `healthy_floor` value, which is the threshold for de Backends that do not have a [health check interval](./health#example+health+check+configuration+for+use+in+alb) configured will remain in a permanent state of `unknown`. Backends will also be in an `unknown` state from the time Trickster starts until the first of any configured automated health check is completed. Note that if an ALB is configured with `healthy_floor: 1`, any pool members that are not configured with an automated health check interval will never be included in the ALB's healthy pool, as their state is permanently `0`. +Setting `healthy_floor` below `0` admits members the probe has confirmed `unavailable`, not just members in the transient `unknown` state. If your goal is to keep traffic flowing during the cold-start window before the first probes complete, lower the pool members' `recovery_threshold` so they transition out of `unknown` faster -- don't lower the floor. When `healthy_floor < 0` Trickster emits a startup warning and sets the `trickster_alb_pool_admits_failing{backend_name}` gauge to `1`. + ### Example ALB Configuration Routing Only To Known Healthy Backends ```yaml diff --git a/docs/metrics.md b/docs/metrics.md index 917d55862..d95bf4e38 100644 --- a/docs/metrics.md +++ b/docs/metrics.md @@ -91,6 +91,10 @@ The following metrics are available for polling with any Trickster configuration * `operation` - the name of the operation being performed (read, write, etc.) * `status` - the result of the operation being performed +* `trickster_alb_pool_admits_failing` (Gauge) - 1 when an ALB pool's `healthy_floor` admits members in the `unavailable` state, 0 otherwise. See [alb.md](./alb.md#health-based-backend-selection) for the recommended floor. + * labels: + * `backend_name` - the name of the configured ALB backend + --- The following metrics are available only for Caches Types whose object lifecycle Trickster manages internally (Memory, Filesystem and bbolt): diff --git a/pkg/backends/alb/options/options.go b/pkg/backends/alb/options/options.go index 97023d0eb..7551fe1b5 100644 --- a/pkg/backends/alb/options/options.go +++ b/pkg/backends/alb/options/options.go @@ -38,12 +38,14 @@ type Options struct { MechanismName string `yaml:"mechanism,omitempty"` // Pool provides the list of backend names to be used by the load balancer Pool []string `yaml:"pool,omitempty"` - // HealthyFloor is the minimum health check status value to be considered Available in the pool - // -1 : all pool members are Available regardless of health check status - // 0 (default) : pool members with status of unknown (0) or healthy (1) are Available - // 1 : only pool members with status of healthy (1) are Available - // unknown means the first hc hasn't returned yet, - // or (more likely) HealthCheck Interval on target backend is not set + // HealthyFloor is the minimum health check status value admitted to the pool. + // Values below 0 admit members the probe has confirmed Failing; only set + // this below 0 if you intend to route to known-broken upstreams. + // -1 : all pool members admitted, including Failing + // 0 (default) : Unknown (0) and Healthy (1) admitted, Failing rejected + // 1 : only Healthy (1) admitted + // Unknown means the first health check hasn't returned yet, or the target + // backend has no health check interval configured. HealthyFloor int `yaml:"healthy_floor,omitempty"` // MaxCaptureBytes overrides the backend-level max_capture_bytes for this // ALB's fanout members. Set this when the ALB's expected response shape From c7780a8391a23f5df1ebae2c33a3744114180ab4 Mon Sep 17 00:00:00 2001 From: Chris Randles Date: Thu, 28 May 2026 10:48:15 -0400 Subject: [PATCH 8/8] fix: [alb] validate healthy_floor and warn on degraded single-member dispatch Signed-off-by: Chris Randles --- docs/alb.md | 2 +- docs/health.md | 2 + docs/metrics.md | 4 + integration/alb_missing_healthcheck_test.go | 203 +++++++++++------- integration/engines_test.go | 8 - ...missing_hc.yaml.tmpl => degrade.yaml.tmpl} | 17 +- .../alb_missing_hc/floor_reset.yaml.tmpl | 25 +++ pkg/backends/alb/client.go | 36 +++- pkg/backends/alb/errors/errors.go | 11 - .../alb/mech/tsm/time_series_merge.go | 42 +++- pkg/backends/alb/pool/pool.go | 8 + pkg/backends/backends.go | 45 +--- pkg/backends/healthcheck/options/defaults.go | 6 - pkg/observability/metrics/metrics.go | 30 +-- 14 files changed, 255 insertions(+), 184 deletions(-) rename integration/testdata/alb_missing_hc/{missing_hc.yaml.tmpl => degrade.yaml.tmpl} (69%) create mode 100644 integration/testdata/alb_missing_hc/floor_reset.yaml.tmpl diff --git a/docs/alb.md b/docs/alb.md index cd391db0a..c5d0a7709 100644 --- a/docs/alb.md +++ b/docs/alb.md @@ -445,7 +445,7 @@ A backend will report one of three possible health states to its ALBs: `unavaila Each ALB has a configurable `healthy_floor` value, which is the threshold for determining which pool members are included in the healthy pool, based on their instantaneous health state. The `healthy_floor` represents the minimum acceptable health state value for inclusion in the healthy pool. The default `healthy_floor` value is `0`, meaning Backends in a state `>= 0` (`unknown` and `available`) are included in the healthy pool. Setting `healthy_floor: 1` would include only `available` Backends, while a value of `-1` will include all backends in the configured pool, including those marked as `unavailable`. -Backends that do not have a [health check interval](./health#example+health+check+configuration+for+use+in+alb) configured will remain in a permanent state of `unknown`. Backends will also be in an `unknown` state from the time Trickster starts until the first of any configured automated health check is completed. Note that if an ALB is configured with `healthy_floor: 1`, any pool members that are not configured with an automated health check interval will never be included in the ALB's healthy pool, as their state is permanently `0`. +Backends that do not have a [health check interval](./health#example+health+check+configuration+for+use+in+alb) configured will remain in a permanent state of `unknown`. Backends will also be in an `unknown` state from the time Trickster starts until the first of any configured automated health check is completed. A pool member in a permanent `unknown` state can never reach `available`, so a `healthy_floor: 1` ALB whose members lack health checks would have an empty pool and return `502` for every request. To avoid that, Trickster resets such an ALB's effective floor to `0` at startup, emits a warning naming the ALB and the un-probed members, and sets the `trickster_alb_pool_floor_reset{backend_name}` gauge to `1`. Configure a health check interval on those members if you want `healthy_floor: 1` to apply. Setting `healthy_floor` below `0` admits members the probe has confirmed `unavailable`, not just members in the transient `unknown` state. If your goal is to keep traffic flowing during the cold-start window before the first probes complete, lower the pool members' `recovery_threshold` so they transition out of `unknown` faster -- don't lower the floor. When `healthy_floor < 0` Trickster emits a startup warning and sets the `trickster_alb_pool_admits_failing{backend_name}` gauge to `1`. diff --git a/docs/health.md b/docs/health.md index 950438ba4..c3345d739 100644 --- a/docs/health.md +++ b/docs/health.md @@ -102,6 +102,8 @@ backends: recovery_threshold: 3 # backend is healthy after 3 consecutive successes ``` +The Prometheus default probe is `/api/v1/query?query=up`. Some multi-tenant Prometheus gateways reject an unbounded `up` with `400 bad_data: "too many series found"`, which keeps the member out of any ALB pool it belongs to. Override `healthcheck.query` with a bounded expression the backend accepts (for example `query=vector(1)`) when probing such backends. + ## Other Ways to Monitor Health In addition to the out-of-the-box health checks to determine up-or-down status, you may want to setup alarms and thresholds based on the metrics instrumented by Trickster. See [metrics.md](metrics.md) for collecting performance metrics about Trickster. diff --git a/docs/metrics.md b/docs/metrics.md index d95bf4e38..0cc9a546e 100644 --- a/docs/metrics.md +++ b/docs/metrics.md @@ -95,6 +95,10 @@ The following metrics are available for polling with any Trickster configuration * labels: * `backend_name` - the name of the configured ALB backend +* `trickster_alb_pool_floor_reset` (Gauge) - 1 when an ALB pool's `healthy_floor` was reset to 0 at startup because pool members have no health check and could never reach the configured floor, 0 otherwise. See [alb.md](./alb.md#health-based-backend-selection). + * labels: + * `backend_name` - the name of the configured ALB backend + --- The following metrics are available only for Caches Types whose object lifecycle Trickster manages internally (Memory, Filesystem and bbolt): diff --git a/integration/alb_missing_healthcheck_test.go b/integration/alb_missing_healthcheck_test.go index 1d652bb1a..be46d9db7 100644 --- a/integration/alb_missing_healthcheck_test.go +++ b/integration/alb_missing_healthcheck_test.go @@ -18,7 +18,9 @@ package integration import ( "context" + "encoding/json" "fmt" + "io" "net/http" "net/http/httptest" "net/url" @@ -34,33 +36,19 @@ import ( "github.com/trickstercache/trickster/v2/integration/internal/portutil" ) -// TestALBPoolMemberWithoutHealthcheckNotRouted verifies that a non-virtual ALB -// pool member configured without a `healthcheck:` block does not receive -// fanout traffic when its upstream is unhealthy. The provider's default -// healthcheck must be auto-applied so the pool's dispatch-time filter can -// observe Failing status and exclude the member. -func TestALBPoolMemberWithoutHealthcheckNotRouted(t *testing.T) { - healthyResp := albTestdata(t, "alb_unavail/healthy.json") - +// TestALBHealthyFloorAdmitsFailingMetric verifies the warning surface for an +// ALB whose healthy_floor admits Failing members. An operator who lowered +// healthy_floor below 0 to keep traffic flowing during the Initializing +// startup window also admits members the probe has confirmed broken; the +// `trickster_alb_pool_admits_failing` gauge surfaces that misconfiguration. +func TestALBHealthyFloorAdmitsFailingMetric(t *testing.T) { healthy := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { w.Header().Set("Content-Type", "application/json") w.WriteHeader(http.StatusOK) - switch r.URL.Path { - case "/api/v1/status/buildinfo": - fmt.Fprint(w, `{"status":"success","data":{"version":"2.0"}}`) - case "/api/v1/query": - fmt.Fprint(w, `{"status":"success","data":{"resultType":"vector","result":[]}}`) - default: - fmt.Fprint(w, healthyResp) - } + fmt.Fprint(w, `{"status":"success","data":{"version":"2.0"}}`) })) t.Cleanup(healthy.Close) - - var brokenDataHits atomic.Int64 broken := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { - if r.URL.Path == "/api/v1/query_range" { - brokenDataHits.Add(1) - } w.WriteHeader(http.StatusInternalServerError) })) t.Cleanup(broken.Close) @@ -68,9 +56,8 @@ func TestALBPoolMemberWithoutHealthcheckNotRouted(t *testing.T) { ports, release := portutil.Reserve(t, 3) frontPort, metricsPort, mgmtPort := ports[0], ports[1], ports[2] - yaml := fmt.Sprintf(albTestdata(t, "alb_missing_hc/missing_hc.yaml.tmpl"), + yaml := fmt.Sprintf(albTestdata(t, "alb_missing_hc/floor_warn.yaml.tmpl"), frontPort, metricsPort, mgmtPort, healthy.URL, broken.URL) - cfgPath := filepath.Join(t.TempDir(), "trickster.yaml") require.NoError(t, os.WriteFile(cfgPath, []byte(yaml), 0644)) @@ -82,63 +69,100 @@ func TestALBPoolMemberWithoutHealthcheckNotRouted(t *testing.T) { metricsAddr := fmt.Sprintf("127.0.0.1:%d", metricsPort) waitForTrickster(t, metricsAddr) - // Give the auto-applied probe time to fire, trip the broken backend to - // Failing, and let the ALB pool refresh its healthy snapshot. - time.Sleep(2 * time.Second) - dataBaseline := brokenDataHits.Load() - - frontAddr := fmt.Sprintf("127.0.0.1:%d", frontPort) - const reqs = 50 - now := time.Now().Unix() - for i := range reqs { - // query_range is mergeable in TSM and therefore triggers fanout to - // every healthy pool member. Use unique query strings so the cache - // doesn't short-circuit subsequent requests. - q := url.Values{ - "query": {fmt.Sprintf("up + 0*%d", i)}, - "start": {fmt.Sprintf("%d", now-300)}, - "end": {fmt.Sprintf("%d", now)}, - "step": {"15"}, + require.EventuallyWithT(t, func(collect *assert.CollectT) { + lines := checkTricksterMetrics(t, metricsAddr) + var admits, excludes string + for _, l := range lines { + if strings.HasPrefix(l, "trickster_alb_pool_admits_failing{") { + if strings.Contains(l, `backend_name="alb-admits-failing"`) { + admits = l + } + if strings.Contains(l, `backend_name="alb-excludes-failing"`) { + excludes = l + } + } } - u := fmt.Sprintf("http://%s/alb-tsm-test/api/v1/query_range?%s", - frontAddr, q.Encode()) - resp, err := http.Get(u) - require.NoError(t, err) - resp.Body.Close() - } + assert.True(collect, strings.HasSuffix(admits, " 1"), + "alb-admits-failing must report 1: %q", admits) + assert.True(collect, strings.HasSuffix(excludes, " 0"), + "alb-excludes-failing must report 0: %q", excludes) + }, 5*time.Second, 200*time.Millisecond) +} + +// TestALBHealthyFloorResetWhenMemberHasNoHealthcheck covers #1015: an ALB with +// healthy_floor: 1 whose only member has no health check. The member is stuck +// Unchecked and would be permanently excluded (empty pool -> 502). Trickster +// resets the effective floor to 0, sets trickster_alb_pool_floor_reset, and +// keeps serving 200. +func TestALBHealthyFloorResetWhenMemberHasNoHealthcheck(t *testing.T) { + vector := albTestdata(t, "alb_unavail/healthy.json") + origin := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "application/json") + w.WriteHeader(http.StatusOK) + fmt.Fprint(w, vector) + })) + t.Cleanup(origin.Close) + + ports, release := portutil.Reserve(t, 3) + frontPort, metricsPort, mgmtPort := ports[0], ports[1], ports[2] + yaml := fmt.Sprintf(albTestdata(t, "alb_missing_hc/floor_reset.yaml.tmpl"), + frontPort, metricsPort, mgmtPort, origin.URL) + cfgPath := filepath.Join(t.TempDir(), "trickster.yaml") + require.NoError(t, os.WriteFile(cfgPath, []byte(yaml), 0644)) + + ctx, cancel := context.WithCancel(context.Background()) + t.Cleanup(cancel) + release() + go startTrickster(t, ctx, expectedStartError{}, "-config", cfgPath) + + metricsAddr := fmt.Sprintf("127.0.0.1:%d", metricsPort) + waitForTrickster(t, metricsAddr) - // Allow a small drain window in case any in-flight fanout slot was still - // dispatching when the assertion fired. require.EventuallyWithT(t, func(collect *assert.CollectT) { - dataDelta := brokenDataHits.Load() - dataBaseline - assert.Equalf(collect, int64(0), dataDelta, - "broken backend received %d data requests via the ALB after probe transitioned it to Failing", - dataDelta) - }, 3*time.Second, 200*time.Millisecond) + var line string + for _, l := range checkTricksterMetrics(t, metricsAddr) { + if strings.HasPrefix(l, "trickster_alb_pool_floor_reset{") && + strings.Contains(l, `backend_name="alb-floor1"`) { + line = l + } + } + assert.True(collect, strings.HasSuffix(line, " 1"), + "alb-floor1 floor-reset gauge must be 1: %q", line) + }, 5*time.Second, 200*time.Millisecond) + + // member admitted under the reset floor -> 200, not an empty-pool 502 + frontAddr := fmt.Sprintf("127.0.0.1:%d", frontPort) + resp, err := http.Get(fmt.Sprintf("http://%s/alb-floor1/api/v1/query?query=up", frontAddr)) + require.NoError(t, err) + defer resp.Body.Close() + require.Equal(t, http.StatusOK, resp.StatusCode) } -// TestALBHealthyFloorAdmitsFailingMetric verifies the warning surface for an -// ALB whose healthy_floor admits Failing members. An operator who lowered -// healthy_floor below 0 to keep traffic flowing during the Initializing -// startup window also admits members the probe has confirmed broken; the -// `trickster_alb_pool_admits_failing` gauge surfaces that misconfiguration. -func TestALBHealthyFloorAdmitsFailingMetric(t *testing.T) { - healthy := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { +// TestALBPoolDegradeWarnsInResponse covers thinker0's silent single-member +// degrade: a 2-member TSM pool where one member's probe fails drops to one live +// member. TSM still serves 200 from the survivor, but the response must carry a +// `warnings` entry so the caller knows the merge collapsed to a single shard. +func TestALBPoolDegradeWarnsInResponse(t *testing.T) { + vector := albTestdata(t, "alb_unavail/healthy.json") + ok := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { w.Header().Set("Content-Type", "application/json") w.WriteHeader(http.StatusOK) - fmt.Fprint(w, `{"status":"success","data":{"version":"2.0"}}`) + fmt.Fprint(w, vector) })) - t.Cleanup(healthy.Close) - broken := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + t.Cleanup(ok.Close) + var badData atomic.Int64 + bad := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + if r.URL.Path != "/api/v1/status/buildinfo" { + badData.Add(1) + } w.WriteHeader(http.StatusInternalServerError) })) - t.Cleanup(broken.Close) + t.Cleanup(bad.Close) ports, release := portutil.Reserve(t, 3) frontPort, metricsPort, mgmtPort := ports[0], ports[1], ports[2] - - yaml := fmt.Sprintf(albTestdata(t, "alb_missing_hc/floor_warn.yaml.tmpl"), - frontPort, metricsPort, mgmtPort, healthy.URL, broken.URL) + yaml := fmt.Sprintf(albTestdata(t, "alb_missing_hc/degrade.yaml.tmpl"), + frontPort, metricsPort, mgmtPort, ok.URL, bad.URL) cfgPath := filepath.Join(t.TempDir(), "trickster.yaml") require.NoError(t, os.WriteFile(cfgPath, []byte(yaml), 0644)) @@ -147,25 +171,40 @@ func TestALBHealthyFloorAdmitsFailingMetric(t *testing.T) { release() go startTrickster(t, ctx, expectedStartError{}, "-config", cfgPath) + frontAddr := fmt.Sprintf("127.0.0.1:%d", frontPort) metricsAddr := fmt.Sprintf("127.0.0.1:%d", metricsPort) waitForTrickster(t, metricsAddr) + var n atomic.Int64 require.EventuallyWithT(t, func(collect *assert.CollectT) { - lines := checkTricksterMetrics(t, metricsAddr) - var admits, excludes string - for _, l := range lines { - if strings.HasPrefix(l, "trickster_alb_pool_admits_failing{") { - if strings.Contains(l, `backend_name="alb-admits-failing"`) { - admits = l - } - if strings.Contains(l, `backend_name="alb-excludes-failing"`) { - excludes = l - } + // unique query per attempt so the cache can't mask the live merge + q := fmt.Sprintf("up + 0*%d", n.Add(1)) + u := fmt.Sprintf("http://%s/alb-degrade/api/v1/query?query=%s", frontAddr, url.QueryEscape(q)) + resp, err := http.Get(u) + if !assert.NoError(collect, err) { + return + } + defer resp.Body.Close() + b, _ := io.ReadAll(resp.Body) + if !assert.Equal(collect, http.StatusOK, resp.StatusCode, "body: %s", b) { + return + } + var pr struct { + Status string `json:"status"` + Warnings []string `json:"warnings"` + } + if !assert.NoError(collect, json.Unmarshal(b, &pr)) { + return + } + var found bool + for _, wn := range pr.Warnings { + if strings.Contains(wn, "1 of 2 pool members") { + found = true } } - assert.True(collect, strings.HasSuffix(admits, " 1"), - "alb-admits-failing must report 1: %q", admits) - assert.True(collect, strings.HasSuffix(excludes, " 0"), - "alb-excludes-failing must report 0: %q", excludes) - }, 5*time.Second, 200*time.Millisecond) + assert.True(collect, found, + "expected degrade warning in response warnings: %v", pr.Warnings) + }, 6*time.Second, 200*time.Millisecond) + + require.Zero(t, badData.Load(), "failing member must not receive data requests") } diff --git a/integration/engines_test.go b/integration/engines_test.go index e2f9e62ac..c39782cac 100644 --- a/integration/engines_test.go +++ b/integration/engines_test.go @@ -70,14 +70,6 @@ func engineSetup(t *testing.T) *engineFakeOrigin { srv := &httptest.Server{ Listener: ln, Config: &http.Server{Handler: http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { - // auto-applied prometheus healthcheck probes `/api/v1/query?query=up`; - // short-circuit them so per-test counters only see data traffic - if r.URL.Path == "/api/v1/query" && r.URL.Query().Get("query") == "up" { - w.Header().Set("Content-Type", "application/json") - w.WriteHeader(http.StatusOK) - _, _ = io.WriteString(w, `{"status":"success","data":{"resultType":"vector","result":[]}}`) - return - } o.mu.Lock() h := o.handler o.mu.Unlock() diff --git a/integration/testdata/alb_missing_hc/missing_hc.yaml.tmpl b/integration/testdata/alb_missing_hc/degrade.yaml.tmpl similarity index 69% rename from integration/testdata/alb_missing_hc/missing_hc.yaml.tmpl rename to integration/testdata/alb_missing_hc/degrade.yaml.tmpl index dc2de70c5..eee8ba57c 100644 --- a/integration/testdata/alb_missing_hc/missing_hc.yaml.tmpl +++ b/integration/testdata/alb_missing_hc/degrade.yaml.tmpl @@ -11,7 +11,7 @@ caches: mem1: provider: memory backends: - prom-healthy: + prom-ok: provider: prometheus origin_url: %s cache_name: mem1 @@ -22,15 +22,22 @@ backends: timeout: 500ms failure_threshold: 1 recovery_threshold: 1 - prom-broken: + prom-bad: provider: prometheus origin_url: %s cache_name: mem1 - alb-tsm-test: + healthcheck: + path: /api/v1/status/buildinfo + query: "" + interval: 100ms + timeout: 500ms + failure_threshold: 1 + recovery_threshold: 1 + alb-degrade: provider: alb alb: mechanism: tsm output_format: prometheus pool: - - prom-healthy - - prom-broken + - prom-ok + - prom-bad diff --git a/integration/testdata/alb_missing_hc/floor_reset.yaml.tmpl b/integration/testdata/alb_missing_hc/floor_reset.yaml.tmpl new file mode 100644 index 000000000..1497e73e7 --- /dev/null +++ b/integration/testdata/alb_missing_hc/floor_reset.yaml.tmpl @@ -0,0 +1,25 @@ + +frontend: + listen_port: %d +metrics: + listen_port: %d +mgmt: + listen_port: %d +logging: + log_level: warn +caches: + mem1: + provider: memory +backends: + prom-noprobe: + provider: prometheus + origin_url: %s + cache_name: mem1 + alb-floor1: + provider: alb + alb: + mechanism: tsm + output_format: prometheus + healthy_floor: 1 + pool: + - prom-noprobe diff --git a/pkg/backends/alb/client.go b/pkg/backends/alb/client.go index fc68d552b..dd0ef0778 100644 --- a/pkg/backends/alb/client.go +++ b/pkg/backends/alb/client.go @@ -19,6 +19,7 @@ package alb import ( "fmt" "net/http" + "strings" "time" "github.com/trickstercache/trickster/v2/pkg/backends" @@ -167,6 +168,7 @@ func (c *Client) ValidateAndStartPool(clients backends.Backends, hcs healthcheck return c.validateAndStartUserRouter(clients, hcs) } targets := make(pool.Targets, 0, len(o.Pool)) + var unprobed []string for _, n := range o.Pool { tc, ok := clients[n] if !ok { @@ -174,21 +176,35 @@ func (c *Client) ValidateAndStartPool(clients backends.Backends, hcs healthcheck } hc, ok := hcs[n] if !ok { - // Only virtual backends (rule, alb) may legitimately be absent from - // the healthcheck registry. A non-virtual miss means the operator - // did not configure healthcheck on this backend and the upstream - // auto-apply path also declined to register a probe; fabricating a - // dummy Status here would silently route fanout traffic regardless - // of upstream health. - if !backends.IsVirtual(tc.Configuration().Provider) { - return alberr.NewErrMissingHealthCheck(c.Name(), n) - } + // virtual backends (rule, alb) have no health checks; treat as passing hc = healthcheck.NewStatus(n, "virtual", "", healthcheck.StatusPassing, time.Time{}, nil) } + if mo := tc.Configuration(); !backends.IsVirtual(mo.Provider) && + (mo.HealthCheck == nil || mo.HealthCheck.Interval <= 0) { + unprobed = append(unprobed, n) + } targets = append(targets, pool.NewTarget(tc.Router(), hc, tc)) } + effectiveFloor := o.HealthyFloor + if o.HealthyFloor >= int(healthcheck.StatusPassing) && len(unprobed) > 0 { + // floor requires Passing, but these members have no probe interval and + // can never leave Unchecked, so they would be permanently excluded -- + // which can empty the pool entirely and 502 every request. Reset to 0 + // so unchecked members are admitted, and surface it loudly. + effectiveFloor = int(healthcheck.StatusUnchecked) + metrics.ALBPoolFloorReset.WithLabelValues(c.Name()).Set(1) + logger.Warn("alb healthy_floor reset to 0: pool members have no health check", + logging.Pairs{ + "backend_name": c.Name(), + "healthy_floor": o.HealthyFloor, + "members": strings.Join(unprobed, ","), + "hint": "configure healthcheck.interval on these members, or set healthy_floor: 0", + }) + } else { + metrics.ALBPoolFloorReset.WithLabelValues(c.Name()).Set(0) + } if pm, ok := c.handler.(types.PoolMechanism); ok { - pm.SetPool(pool.New(targets, o.HealthyFloor)) + pm.SetPool(pool.New(targets, effectiveFloor)) } if o.HealthyFloor <= int(healthcheck.StatusFailing) { // floor admits members whose probe has confirmed them down; operators diff --git a/pkg/backends/alb/errors/errors.go b/pkg/backends/alb/errors/errors.go index 5755c69c0..8335156ac 100644 --- a/pkg/backends/alb/errors/errors.go +++ b/pkg/backends/alb/errors/errors.go @@ -63,14 +63,3 @@ func NewErrInvalidUserRouterCreds(albName string) error { albName), } } - -// NewErrMissingHealthCheck reports a non-virtual ALB pool member that has no -// registered healthcheck Status. Without this guard the pool would route -// traffic to the member regardless of upstream health. -func NewErrMissingHealthCheck(albName, poolMemberName string) error { - return &InvalidALBOptionsError{ - error: fmt.Errorf("alb [%s] non-virtual pool member [%s] has no health check; "+ - "configure healthcheck options on the backend", - albName, poolMemberName), - } -} diff --git a/pkg/backends/alb/mech/tsm/time_series_merge.go b/pkg/backends/alb/mech/tsm/time_series_merge.go index 6b8cfcf5e..567798dc2 100644 --- a/pkg/backends/alb/mech/tsm/time_series_merge.go +++ b/pkg/backends/alb/mech/tsm/time_series_merge.go @@ -17,6 +17,7 @@ package tsm import ( + "fmt" "math" "net/http" "strconv" @@ -64,6 +65,10 @@ type handler struct { // poolVersion increments on every SetPool so cached pool-derived data // (stripKeys) can be invalidated without locking. poolVersion atomic.Uint64 + // degradeActive is true while a configured-multi-member pool is dispatching + // with only one live member. It throttles the operator WARN to once per + // healthy->degraded transition instead of once per request. + degradeActive atomic.Bool // cachedStripKeys memoizes the stripKeys slice across requests as long // as the pool hasn't been replaced. Hot path is a single atomic load // plus a uint64 compare. @@ -320,16 +325,41 @@ func (h *handler) ServeHTTP(w http.ResponseWriter, r *http.Request) { stripKeys = h.computeStripKeys(hl) } + // A configured multi-member pool serving from a single live member is + // degraded: the merge has silently collapsed to one shard. Warn once per + // healthy->degraded transition (not per request) and route through the + // merge path so the warning reaches the response `warnings` field. + configured := p.ConfiguredLen() + degraded := configured > 1 && l == 1 + if degraded { + if h.degradeActive.CompareAndSwap(false, true) { + bn := "" + if rsc.BackendOptions != nil { + bn = rsc.BackendOptions.Name + } + logger.Warn("alb tsm pool degraded to single live member", + logging.Pairs{"backend_name": bn, "configured": configured, "live": l}) + } + dw := fmt.Sprintf("trickster: served from 1 of %d pool members; results may be incomplete", configured) + if warnMsg == "" { + warnMsg = dw + } else { + warnMsg += "; " + dw + } + } else { + h.degradeActive.Store(false) + } + // Single-live-member fast path: with one shard there is nothing to merge // or dedup against, so the lone backend's response IS the answer. Skip // only when label stripping or dual-query rewriting would still need to // happen (D1 covers the strip case; weighted-avg covers the dual-query - // case). Otherwise the merge path's OnResult stripping handles solo pools - // correctly, and degraded N-pools where N-1 are unhealthy now return the - // surviving member's response directly instead of 502'ing on a one-shard - // merge that has no peer to dedup or cross-merge against. - // dual-query still needs merge even for one healthy target; one-shard fast path requires all three conditions - if l == 1 && len(stripKeys) == 0 && !needsDualQuery { + // case), or when the pool is degraded (configured > 1, one live) so the + // merge path can attach the degrade warning. Otherwise the merge path's + // OnResult stripping handles solo pools correctly, and genuinely + // single-member pools return the lone response directly instead of 502'ing + // on a one-shard merge that has no peer to dedup or cross-merge against. + if l == 1 && len(stripKeys) == 0 && !needsDualQuery && !degraded { defaultHandler.ServeHTTP(w, r) return } diff --git a/pkg/backends/alb/pool/pool.go b/pkg/backends/alb/pool/pool.go index 583239758..209e87f5c 100644 --- a/pkg/backends/alb/pool/pool.go +++ b/pkg/backends/alb/pool/pool.go @@ -32,6 +32,10 @@ type Pool interface { // between a status flip and the asynchronous healthy-list refresh, so it // is the correct method for request dispatch. Targets() Targets + // ConfiguredLen returns the number of pool members as configured, regardless + // of current health. Mechanisms compare this against len(Targets()) to + // detect a pool degraded to a subset of its configured members. + ConfiguredLen() int // SetHealthy seeds the pool's healthy set from a handler list. Intended // for tests and bootstrap paths that don't drive status updates through // healthcheck subscribers. @@ -105,6 +109,10 @@ func (p *pool) snapshot() Targets { return nil } +func (p *pool) ConfiguredLen() int { + return len(p.targets) +} + func (p *pool) Targets() Targets { if lt := p.liveTargets.Load(); lt != nil && !p.refreshPending.Load() { cached := *lt diff --git a/pkg/backends/backends.go b/pkg/backends/backends.go index 074a50fc1..c4bc618b6 100644 --- a/pkg/backends/backends.go +++ b/pkg/backends/backends.go @@ -21,12 +21,8 @@ import ( "net/http" "github.com/trickstercache/trickster/v2/pkg/backends/healthcheck" - ho "github.com/trickstercache/trickster/v2/pkg/backends/healthcheck/options" bo "github.com/trickstercache/trickster/v2/pkg/backends/options" "github.com/trickstercache/trickster/v2/pkg/backends/providers" - "github.com/trickstercache/trickster/v2/pkg/observability/logging" - "github.com/trickstercache/trickster/v2/pkg/observability/logging/logger" - "github.com/trickstercache/trickster/v2/pkg/observability/metrics" ) // Backends represents a map of Backends keyed by Name @@ -50,38 +46,14 @@ func (b Backends) StartHealthChecks(knownStatuses healthcheck.StatusLookup) (hea continue } hco := bo.HealthCheck - def := c.DefaultHealthCheckConfig() - if hco == nil && def == nil { - // no probe possible for this provider; the ALB pool will reject - // this backend as a non-virtual member with no Status. + if hco == nil { continue } - if def != nil { - bo.HealthCheck = def - if hco != nil { - bo.HealthCheck.Overlay(hco) - } - } else { + bo.HealthCheck = c.DefaultHealthCheckConfig() + if bo.HealthCheck == nil { bo.HealthCheck = hco - } - var autoApplied bool - if bo.HealthCheck.Interval <= 0 { - // The operator did not configure a probe interval. A registered - // target with Interval == 0 never ticks, so its Status sticks at - // StatusUnchecked and the ALB pool's dispatch-time filter keeps - // routing fanout traffic regardless of upstream health. Apply a - // fast auto-default and surface that we did so. - bo.HealthCheck.Interval = ho.DefaultAutoProbeInterval - if bo.HealthCheck.FailureThreshold <= 0 { - bo.HealthCheck.FailureThreshold = 1 - } - if bo.HealthCheck.RecoveryThreshold <= 0 { - bo.HealthCheck.RecoveryThreshold = 1 - } - autoApplied = true - metrics.BackendsDefaultHealthCheckApplied.WithLabelValues(k, bo.Provider).Inc() - logger.Warn("auto-applied default healthcheck for backend", - logging.Pairs{"backend_name": k, "provider": bo.Provider}) + } else { + bo.HealthCheck.Overlay(hco) } st, err := hc.Register(k, bo.Provider, bo.HealthCheck, c.HealthCheckHTTPClient()) if err != nil { @@ -91,13 +63,6 @@ func (b Backends) StartHealthChecks(knownStatuses healthcheck.StatusLookup) (hea if v := oldSt.Get(); v != healthcheck.StatusInitializing { st.Set(v) } - } else if autoApplied { - // Override the registration-time Initializing with Unchecked so the - // pool's dispatch filter accepts the target during the brief window - // before the first auto-probe completes. Targets the operator - // explicitly configured keep Initializing so they're held out - // until the probe confirms them. - st.Set(healthcheck.StatusUnchecked) } c.SetHealthCheckProbe(st.Prober()) } diff --git a/pkg/backends/healthcheck/options/defaults.go b/pkg/backends/healthcheck/options/defaults.go index 5856cfc6c..8535bca85 100644 --- a/pkg/backends/healthcheck/options/defaults.go +++ b/pkg/backends/healthcheck/options/defaults.go @@ -23,10 +23,4 @@ import ( const ( // DefaultHealthCheckTimeout is the default duration for health check probes to wait before timing out DefaultHealthCheckTimeout = 3 * time.Second - - // DefaultAutoProbeInterval is applied when StartHealthChecks auto-installs - // a provider's default health-check config because the operator did not - // configure one. The auto-applied probe must fire on a tick or the - // downstream pool filter will never see a status transition. - DefaultAutoProbeInterval = 5 * time.Second ) diff --git a/pkg/observability/metrics/metrics.go b/pkg/observability/metrics/metrics.go index cf2f31e8f..0e2791622 100644 --- a/pkg/observability/metrics/metrics.go +++ b/pkg/observability/metrics/metrics.go @@ -454,20 +454,6 @@ var ( []string{"backend_name"}, ) - // BackendsDefaultHealthCheckApplied counts backends for which the provider's - // DefaultHealthCheckConfig was auto-installed by StartHealthChecks because - // the operator did not configure healthcheck options. A non-zero value - // flags configs that depended on an absent probe and benefit from auditing. - BackendsDefaultHealthCheckApplied = prometheus.NewCounterVec( - prometheus.CounterOpts{ - Namespace: metricNamespace, - Subsystem: healthSubsystem, - Name: "default_config_applied_total", - Help: "Count of backends that received an auto-applied default healthcheck config because none was configured.", - }, - []string{"backend_name", "provider"}, - ) - // ALBPoolAdmitsFailing flags ALB pools whose healthy_floor admits a Failing // status (floor <= StatusFailing). Operators who set floor below 0 to keep // traffic flowing during the Initializing window may not realize they're @@ -482,6 +468,20 @@ var ( }, []string{"backend_name"}, ) + + // ALBPoolFloorReset flags ALB pools whose healthy_floor was reset to 0 at + // startup because one or more pool members have no health check and could + // never reach the configured floor (>= Passing), which would otherwise + // empty the pool and 502 every request. + ALBPoolFloorReset = prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Namespace: metricNamespace, + Subsystem: albSubsystem, + Name: "pool_floor_reset", + Help: "1 when an ALB pool's healthy_floor was reset to 0 because members lack health checks; 0 otherwise.", + }, + []string{"backend_name"}, + ) ) func init() { @@ -507,8 +507,8 @@ func init() { prometheus.MustRegister(CacheIndexPanicRecovered) prometheus.MustRegister(HealthHandlerPanicRecovered) prometheus.MustRegister(HealthcheckStatusNotifyPanicRecovered) - prometheus.MustRegister(BackendsDefaultHealthCheckApplied) prometheus.MustRegister(ALBPoolAdmitsFailing) + prometheus.MustRegister(ALBPoolFloorReset) prometheus.MustRegister(CacheObjectOperations) prometheus.MustRegister(CacheByteOperations) prometheus.MustRegister(CacheEvents)