diff --git a/docs/alb.md b/docs/alb.md index 562aeb412..c5d0a7709 100644 --- a/docs/alb.md +++ b/docs/alb.md @@ -445,7 +445,9 @@ A backend will report one of three possible health states to its ALBs: `unavaila Each ALB has a configurable `healthy_floor` value, which is the threshold for determining which pool members are included in the healthy pool, based on their instantaneous health state. The `healthy_floor` represents the minimum acceptable health state value for inclusion in the healthy pool. The default `healthy_floor` value is `0`, meaning Backends in a state `>= 0` (`unknown` and `available`) are included in the healthy pool. Setting `healthy_floor: 1` would include only `available` Backends, while a value of `-1` will include all backends in the configured pool, including those marked as `unavailable`. -Backends that do not have a [health check interval](./health#example+health+check+configuration+for+use+in+alb) configured will remain in a permanent state of `unknown`. Backends will also be in an `unknown` state from the time Trickster starts until the first of any configured automated health check is completed. Note that if an ALB is configured with `healthy_floor: 1`, any pool members that are not configured with an automated health check interval will never be included in the ALB's healthy pool, as their state is permanently `0`. +Backends that do not have a [health check interval](./health#example+health+check+configuration+for+use+in+alb) configured will remain in a permanent state of `unknown`. Backends will also be in an `unknown` state from the time Trickster starts until the first of any configured automated health check is completed. A pool member in a permanent `unknown` state can never reach `available`, so a `healthy_floor: 1` ALB whose members lack health checks would have an empty pool and return `502` for every request. To avoid that, Trickster resets such an ALB's effective floor to `0` at startup, emits a warning naming the ALB and the un-probed members, and sets the `trickster_alb_pool_floor_reset{backend_name}` gauge to `1`. Configure a health check interval on those members if you want `healthy_floor: 1` to apply. + +Setting `healthy_floor` below `0` admits members the probe has confirmed `unavailable`, not just members in the transient `unknown` state. If your goal is to keep traffic flowing during the cold-start window before the first probes complete, lower the pool members' `recovery_threshold` so they transition out of `unknown` faster -- don't lower the floor. When `healthy_floor < 0` Trickster emits a startup warning and sets the `trickster_alb_pool_admits_failing{backend_name}` gauge to `1`. ### Example ALB Configuration Routing Only To Known Healthy Backends diff --git a/docs/health.md b/docs/health.md index 950438ba4..c3345d739 100644 --- a/docs/health.md +++ b/docs/health.md @@ -102,6 +102,8 @@ backends: recovery_threshold: 3 # backend is healthy after 3 consecutive successes ``` +The Prometheus default probe is `/api/v1/query?query=up`. Some multi-tenant Prometheus gateways reject an unbounded `up` with `400 bad_data: "too many series found"`, which keeps the member out of any ALB pool it belongs to. Override `healthcheck.query` with a bounded expression the backend accepts (for example `query=vector(1)`) when probing such backends. + ## Other Ways to Monitor Health In addition to the out-of-the-box health checks to determine up-or-down status, you may want to setup alarms and thresholds based on the metrics instrumented by Trickster. See [metrics.md](metrics.md) for collecting performance metrics about Trickster. diff --git a/docs/metrics.md b/docs/metrics.md index 917d55862..0cc9a546e 100644 --- a/docs/metrics.md +++ b/docs/metrics.md @@ -91,6 +91,14 @@ The following metrics are available for polling with any Trickster configuration * `operation` - the name of the operation being performed (read, write, etc.) * `status` - the result of the operation being performed +* `trickster_alb_pool_admits_failing` (Gauge) - 1 when an ALB pool's `healthy_floor` admits members in the `unavailable` state, 0 otherwise. See [alb.md](./alb.md#health-based-backend-selection) for the recommended floor. + * labels: + * `backend_name` - the name of the configured ALB backend + +* `trickster_alb_pool_floor_reset` (Gauge) - 1 when an ALB pool's `healthy_floor` was reset to 0 at startup because pool members have no health check and could never reach the configured floor, 0 otherwise. See [alb.md](./alb.md#health-based-backend-selection). + * labels: + * `backend_name` - the name of the configured ALB backend + --- The following metrics are available only for Caches Types whose object lifecycle Trickster manages internally (Memory, Filesystem and bbolt): diff --git a/integration/alb_missing_healthcheck_test.go b/integration/alb_missing_healthcheck_test.go new file mode 100644 index 000000000..be46d9db7 --- /dev/null +++ b/integration/alb_missing_healthcheck_test.go @@ -0,0 +1,210 @@ +/* + * Copyright 2018 The Trickster Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package integration + +import ( + "context" + "encoding/json" + "fmt" + "io" + "net/http" + "net/http/httptest" + "net/url" + "os" + "path/filepath" + "strings" + "sync/atomic" + "testing" + "time" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + "github.com/trickstercache/trickster/v2/integration/internal/portutil" +) + +// TestALBHealthyFloorAdmitsFailingMetric verifies the warning surface for an +// ALB whose healthy_floor admits Failing members. An operator who lowered +// healthy_floor below 0 to keep traffic flowing during the Initializing +// startup window also admits members the probe has confirmed broken; the +// `trickster_alb_pool_admits_failing` gauge surfaces that misconfiguration. +func TestALBHealthyFloorAdmitsFailingMetric(t *testing.T) { + healthy := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "application/json") + w.WriteHeader(http.StatusOK) + fmt.Fprint(w, `{"status":"success","data":{"version":"2.0"}}`) + })) + t.Cleanup(healthy.Close) + broken := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(http.StatusInternalServerError) + })) + t.Cleanup(broken.Close) + + ports, release := portutil.Reserve(t, 3) + frontPort, metricsPort, mgmtPort := ports[0], ports[1], ports[2] + + yaml := fmt.Sprintf(albTestdata(t, "alb_missing_hc/floor_warn.yaml.tmpl"), + frontPort, metricsPort, mgmtPort, healthy.URL, broken.URL) + cfgPath := filepath.Join(t.TempDir(), "trickster.yaml") + require.NoError(t, os.WriteFile(cfgPath, []byte(yaml), 0644)) + + ctx, cancel := context.WithCancel(context.Background()) + t.Cleanup(cancel) + release() + go startTrickster(t, ctx, expectedStartError{}, "-config", cfgPath) + + metricsAddr := fmt.Sprintf("127.0.0.1:%d", metricsPort) + waitForTrickster(t, metricsAddr) + + require.EventuallyWithT(t, func(collect *assert.CollectT) { + lines := checkTricksterMetrics(t, metricsAddr) + var admits, excludes string + for _, l := range lines { + if strings.HasPrefix(l, "trickster_alb_pool_admits_failing{") { + if strings.Contains(l, `backend_name="alb-admits-failing"`) { + admits = l + } + if strings.Contains(l, `backend_name="alb-excludes-failing"`) { + excludes = l + } + } + } + assert.True(collect, strings.HasSuffix(admits, " 1"), + "alb-admits-failing must report 1: %q", admits) + assert.True(collect, strings.HasSuffix(excludes, " 0"), + "alb-excludes-failing must report 0: %q", excludes) + }, 5*time.Second, 200*time.Millisecond) +} + +// TestALBHealthyFloorResetWhenMemberHasNoHealthcheck covers #1015: an ALB with +// healthy_floor: 1 whose only member has no health check. The member is stuck +// Unchecked and would be permanently excluded (empty pool -> 502). Trickster +// resets the effective floor to 0, sets trickster_alb_pool_floor_reset, and +// keeps serving 200. +func TestALBHealthyFloorResetWhenMemberHasNoHealthcheck(t *testing.T) { + vector := albTestdata(t, "alb_unavail/healthy.json") + origin := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "application/json") + w.WriteHeader(http.StatusOK) + fmt.Fprint(w, vector) + })) + t.Cleanup(origin.Close) + + ports, release := portutil.Reserve(t, 3) + frontPort, metricsPort, mgmtPort := ports[0], ports[1], ports[2] + yaml := fmt.Sprintf(albTestdata(t, "alb_missing_hc/floor_reset.yaml.tmpl"), + frontPort, metricsPort, mgmtPort, origin.URL) + cfgPath := filepath.Join(t.TempDir(), "trickster.yaml") + require.NoError(t, os.WriteFile(cfgPath, []byte(yaml), 0644)) + + ctx, cancel := context.WithCancel(context.Background()) + t.Cleanup(cancel) + release() + go startTrickster(t, ctx, expectedStartError{}, "-config", cfgPath) + + metricsAddr := fmt.Sprintf("127.0.0.1:%d", metricsPort) + waitForTrickster(t, metricsAddr) + + require.EventuallyWithT(t, func(collect *assert.CollectT) { + var line string + for _, l := range checkTricksterMetrics(t, metricsAddr) { + if strings.HasPrefix(l, "trickster_alb_pool_floor_reset{") && + strings.Contains(l, `backend_name="alb-floor1"`) { + line = l + } + } + assert.True(collect, strings.HasSuffix(line, " 1"), + "alb-floor1 floor-reset gauge must be 1: %q", line) + }, 5*time.Second, 200*time.Millisecond) + + // member admitted under the reset floor -> 200, not an empty-pool 502 + frontAddr := fmt.Sprintf("127.0.0.1:%d", frontPort) + resp, err := http.Get(fmt.Sprintf("http://%s/alb-floor1/api/v1/query?query=up", frontAddr)) + require.NoError(t, err) + defer resp.Body.Close() + require.Equal(t, http.StatusOK, resp.StatusCode) +} + +// TestALBPoolDegradeWarnsInResponse covers thinker0's silent single-member +// degrade: a 2-member TSM pool where one member's probe fails drops to one live +// member. TSM still serves 200 from the survivor, but the response must carry a +// `warnings` entry so the caller knows the merge collapsed to a single shard. +func TestALBPoolDegradeWarnsInResponse(t *testing.T) { + vector := albTestdata(t, "alb_unavail/healthy.json") + ok := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "application/json") + w.WriteHeader(http.StatusOK) + fmt.Fprint(w, vector) + })) + t.Cleanup(ok.Close) + var badData atomic.Int64 + bad := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + if r.URL.Path != "/api/v1/status/buildinfo" { + badData.Add(1) + } + w.WriteHeader(http.StatusInternalServerError) + })) + t.Cleanup(bad.Close) + + ports, release := portutil.Reserve(t, 3) + frontPort, metricsPort, mgmtPort := ports[0], ports[1], ports[2] + yaml := fmt.Sprintf(albTestdata(t, "alb_missing_hc/degrade.yaml.tmpl"), + frontPort, metricsPort, mgmtPort, ok.URL, bad.URL) + cfgPath := filepath.Join(t.TempDir(), "trickster.yaml") + require.NoError(t, os.WriteFile(cfgPath, []byte(yaml), 0644)) + + ctx, cancel := context.WithCancel(context.Background()) + t.Cleanup(cancel) + release() + go startTrickster(t, ctx, expectedStartError{}, "-config", cfgPath) + + frontAddr := fmt.Sprintf("127.0.0.1:%d", frontPort) + metricsAddr := fmt.Sprintf("127.0.0.1:%d", metricsPort) + waitForTrickster(t, metricsAddr) + + var n atomic.Int64 + require.EventuallyWithT(t, func(collect *assert.CollectT) { + // unique query per attempt so the cache can't mask the live merge + q := fmt.Sprintf("up + 0*%d", n.Add(1)) + u := fmt.Sprintf("http://%s/alb-degrade/api/v1/query?query=%s", frontAddr, url.QueryEscape(q)) + resp, err := http.Get(u) + if !assert.NoError(collect, err) { + return + } + defer resp.Body.Close() + b, _ := io.ReadAll(resp.Body) + if !assert.Equal(collect, http.StatusOK, resp.StatusCode, "body: %s", b) { + return + } + var pr struct { + Status string `json:"status"` + Warnings []string `json:"warnings"` + } + if !assert.NoError(collect, json.Unmarshal(b, &pr)) { + return + } + var found bool + for _, wn := range pr.Warnings { + if strings.Contains(wn, "1 of 2 pool members") { + found = true + } + } + assert.True(collect, found, + "expected degrade warning in response warnings: %v", pr.Warnings) + }, 6*time.Second, 200*time.Millisecond) + + require.Zero(t, badData.Load(), "failing member must not receive data requests") +} diff --git a/integration/alb_testdata_test.go b/integration/alb_testdata_test.go index 308658135..d28f3b31d 100644 --- a/integration/alb_testdata_test.go +++ b/integration/alb_testdata_test.go @@ -21,7 +21,7 @@ import ( "testing" ) -//go:embed testdata/alb_cache testdata/alb_tsm_correctness testdata/alb_response_headers testdata/alb_nested testdata/alb_per_path testdata/alb_unavail testdata/alb_request_headers testdata/alb_compose +//go:embed testdata/alb_cache testdata/alb_tsm_correctness testdata/alb_response_headers testdata/alb_nested testdata/alb_per_path testdata/alb_unavail testdata/alb_missing_hc testdata/alb_request_headers testdata/alb_compose var albTestdataFS embed.FS func albTestdata(t testing.TB, name string) string { diff --git a/integration/testdata/alb_missing_hc/degrade.yaml.tmpl b/integration/testdata/alb_missing_hc/degrade.yaml.tmpl new file mode 100644 index 000000000..eee8ba57c --- /dev/null +++ b/integration/testdata/alb_missing_hc/degrade.yaml.tmpl @@ -0,0 +1,43 @@ + +frontend: + listen_port: %d +metrics: + listen_port: %d +mgmt: + listen_port: %d +logging: + log_level: warn +caches: + mem1: + provider: memory +backends: + prom-ok: + provider: prometheus + origin_url: %s + cache_name: mem1 + healthcheck: + path: /api/v1/status/buildinfo + query: "" + interval: 100ms + timeout: 500ms + failure_threshold: 1 + recovery_threshold: 1 + prom-bad: + provider: prometheus + origin_url: %s + cache_name: mem1 + healthcheck: + path: /api/v1/status/buildinfo + query: "" + interval: 100ms + timeout: 500ms + failure_threshold: 1 + recovery_threshold: 1 + alb-degrade: + provider: alb + alb: + mechanism: tsm + output_format: prometheus + pool: + - prom-ok + - prom-bad diff --git a/integration/testdata/alb_missing_hc/floor_reset.yaml.tmpl b/integration/testdata/alb_missing_hc/floor_reset.yaml.tmpl new file mode 100644 index 000000000..1497e73e7 --- /dev/null +++ b/integration/testdata/alb_missing_hc/floor_reset.yaml.tmpl @@ -0,0 +1,25 @@ + +frontend: + listen_port: %d +metrics: + listen_port: %d +mgmt: + listen_port: %d +logging: + log_level: warn +caches: + mem1: + provider: memory +backends: + prom-noprobe: + provider: prometheus + origin_url: %s + cache_name: mem1 + alb-floor1: + provider: alb + alb: + mechanism: tsm + output_format: prometheus + healthy_floor: 1 + pool: + - prom-noprobe diff --git a/integration/testdata/alb_missing_hc/floor_warn.yaml.tmpl b/integration/testdata/alb_missing_hc/floor_warn.yaml.tmpl new file mode 100644 index 000000000..d11121ee8 --- /dev/null +++ b/integration/testdata/alb_missing_hc/floor_warn.yaml.tmpl @@ -0,0 +1,52 @@ + +frontend: + listen_port: %d +metrics: + listen_port: %d +mgmt: + listen_port: %d +logging: + log_level: warn +caches: + mem1: + provider: memory +backends: + prom-healthy: + provider: prometheus + origin_url: %s + cache_name: mem1 + healthcheck: + path: /api/v1/status/buildinfo + query: "" + interval: 100ms + timeout: 500ms + failure_threshold: 1 + recovery_threshold: 1 + prom-broken: + provider: prometheus + origin_url: %s + cache_name: mem1 + healthcheck: + path: /api/v1/status/buildinfo + query: "" + interval: 100ms + timeout: 500ms + failure_threshold: 1 + recovery_threshold: 1 + alb-admits-failing: + provider: alb + alb: + mechanism: tsm + output_format: prometheus + healthy_floor: -1 + pool: + - prom-healthy + - prom-broken + alb-excludes-failing: + provider: alb + alb: + mechanism: tsm + output_format: prometheus + pool: + - prom-healthy + - prom-broken diff --git a/pkg/backends/alb/client.go b/pkg/backends/alb/client.go index ae09f6038..dd0ef0778 100644 --- a/pkg/backends/alb/client.go +++ b/pkg/backends/alb/client.go @@ -19,6 +19,7 @@ package alb import ( "fmt" "net/http" + "strings" "time" "github.com/trickstercache/trickster/v2/pkg/backends" @@ -34,6 +35,9 @@ import ( rt "github.com/trickstercache/trickster/v2/pkg/backends/providers/registry/types" "github.com/trickstercache/trickster/v2/pkg/cache" "github.com/trickstercache/trickster/v2/pkg/errors" + "github.com/trickstercache/trickster/v2/pkg/observability/logging" + "github.com/trickstercache/trickster/v2/pkg/observability/logging/logger" + "github.com/trickstercache/trickster/v2/pkg/observability/metrics" authopt "github.com/trickstercache/trickster/v2/pkg/proxy/authenticator/options" authreg "github.com/trickstercache/trickster/v2/pkg/proxy/authenticator/registry" "github.com/trickstercache/trickster/v2/pkg/proxy/handlers" @@ -164,6 +168,7 @@ func (c *Client) ValidateAndStartPool(clients backends.Backends, hcs healthcheck return c.validateAndStartUserRouter(clients, hcs) } targets := make(pool.Targets, 0, len(o.Pool)) + var unprobed []string for _, n := range o.Pool { tc, ok := clients[n] if !ok { @@ -174,10 +179,46 @@ func (c *Client) ValidateAndStartPool(clients backends.Backends, hcs healthcheck // virtual backends (rule, alb) have no health checks; treat as passing hc = healthcheck.NewStatus(n, "virtual", "", healthcheck.StatusPassing, time.Time{}, nil) } + if mo := tc.Configuration(); !backends.IsVirtual(mo.Provider) && + (mo.HealthCheck == nil || mo.HealthCheck.Interval <= 0) { + unprobed = append(unprobed, n) + } targets = append(targets, pool.NewTarget(tc.Router(), hc, tc)) } + effectiveFloor := o.HealthyFloor + if o.HealthyFloor >= int(healthcheck.StatusPassing) && len(unprobed) > 0 { + // floor requires Passing, but these members have no probe interval and + // can never leave Unchecked, so they would be permanently excluded -- + // which can empty the pool entirely and 502 every request. Reset to 0 + // so unchecked members are admitted, and surface it loudly. + effectiveFloor = int(healthcheck.StatusUnchecked) + metrics.ALBPoolFloorReset.WithLabelValues(c.Name()).Set(1) + logger.Warn("alb healthy_floor reset to 0: pool members have no health check", + logging.Pairs{ + "backend_name": c.Name(), + "healthy_floor": o.HealthyFloor, + "members": strings.Join(unprobed, ","), + "hint": "configure healthcheck.interval on these members, or set healthy_floor: 0", + }) + } else { + metrics.ALBPoolFloorReset.WithLabelValues(c.Name()).Set(0) + } if pm, ok := c.handler.(types.PoolMechanism); ok { - pm.SetPool(pool.New(targets, o.HealthyFloor)) + pm.SetPool(pool.New(targets, effectiveFloor)) + } + if o.HealthyFloor <= int(healthcheck.StatusFailing) { + // floor admits members whose probe has confirmed them down; operators + // who lowered the floor to keep traffic flowing during the startup + // Initializing window may not realize Failing slips in too. + metrics.ALBPoolAdmitsFailing.WithLabelValues(c.Name()).Set(1) + logger.Warn("alb healthy_floor admits members in Failing state", + logging.Pairs{ + "backend_name": c.Name(), + "healthy_floor": o.HealthyFloor, + "hint": "set healthy_floor: 0 to exclude probed-failing members", + }) + } else { + metrics.ALBPoolAdmitsFailing.WithLabelValues(c.Name()).Set(0) } return nil } diff --git a/pkg/backends/alb/mech/tsm/time_series_merge.go b/pkg/backends/alb/mech/tsm/time_series_merge.go index 6b8cfcf5e..567798dc2 100644 --- a/pkg/backends/alb/mech/tsm/time_series_merge.go +++ b/pkg/backends/alb/mech/tsm/time_series_merge.go @@ -17,6 +17,7 @@ package tsm import ( + "fmt" "math" "net/http" "strconv" @@ -64,6 +65,10 @@ type handler struct { // poolVersion increments on every SetPool so cached pool-derived data // (stripKeys) can be invalidated without locking. poolVersion atomic.Uint64 + // degradeActive is true while a configured-multi-member pool is dispatching + // with only one live member. It throttles the operator WARN to once per + // healthy->degraded transition instead of once per request. + degradeActive atomic.Bool // cachedStripKeys memoizes the stripKeys slice across requests as long // as the pool hasn't been replaced. Hot path is a single atomic load // plus a uint64 compare. @@ -320,16 +325,41 @@ func (h *handler) ServeHTTP(w http.ResponseWriter, r *http.Request) { stripKeys = h.computeStripKeys(hl) } + // A configured multi-member pool serving from a single live member is + // degraded: the merge has silently collapsed to one shard. Warn once per + // healthy->degraded transition (not per request) and route through the + // merge path so the warning reaches the response `warnings` field. + configured := p.ConfiguredLen() + degraded := configured > 1 && l == 1 + if degraded { + if h.degradeActive.CompareAndSwap(false, true) { + bn := "" + if rsc.BackendOptions != nil { + bn = rsc.BackendOptions.Name + } + logger.Warn("alb tsm pool degraded to single live member", + logging.Pairs{"backend_name": bn, "configured": configured, "live": l}) + } + dw := fmt.Sprintf("trickster: served from 1 of %d pool members; results may be incomplete", configured) + if warnMsg == "" { + warnMsg = dw + } else { + warnMsg += "; " + dw + } + } else { + h.degradeActive.Store(false) + } + // Single-live-member fast path: with one shard there is nothing to merge // or dedup against, so the lone backend's response IS the answer. Skip // only when label stripping or dual-query rewriting would still need to // happen (D1 covers the strip case; weighted-avg covers the dual-query - // case). Otherwise the merge path's OnResult stripping handles solo pools - // correctly, and degraded N-pools where N-1 are unhealthy now return the - // surviving member's response directly instead of 502'ing on a one-shard - // merge that has no peer to dedup or cross-merge against. - // dual-query still needs merge even for one healthy target; one-shard fast path requires all three conditions - if l == 1 && len(stripKeys) == 0 && !needsDualQuery { + // case), or when the pool is degraded (configured > 1, one live) so the + // merge path can attach the degrade warning. Otherwise the merge path's + // OnResult stripping handles solo pools correctly, and genuinely + // single-member pools return the lone response directly instead of 502'ing + // on a one-shard merge that has no peer to dedup or cross-merge against. + if l == 1 && len(stripKeys) == 0 && !needsDualQuery && !degraded { defaultHandler.ServeHTTP(w, r) return } diff --git a/pkg/backends/alb/options/options.go b/pkg/backends/alb/options/options.go index 97023d0eb..7551fe1b5 100644 --- a/pkg/backends/alb/options/options.go +++ b/pkg/backends/alb/options/options.go @@ -38,12 +38,14 @@ type Options struct { MechanismName string `yaml:"mechanism,omitempty"` // Pool provides the list of backend names to be used by the load balancer Pool []string `yaml:"pool,omitempty"` - // HealthyFloor is the minimum health check status value to be considered Available in the pool - // -1 : all pool members are Available regardless of health check status - // 0 (default) : pool members with status of unknown (0) or healthy (1) are Available - // 1 : only pool members with status of healthy (1) are Available - // unknown means the first hc hasn't returned yet, - // or (more likely) HealthCheck Interval on target backend is not set + // HealthyFloor is the minimum health check status value admitted to the pool. + // Values below 0 admit members the probe has confirmed Failing; only set + // this below 0 if you intend to route to known-broken upstreams. + // -1 : all pool members admitted, including Failing + // 0 (default) : Unknown (0) and Healthy (1) admitted, Failing rejected + // 1 : only Healthy (1) admitted + // Unknown means the first health check hasn't returned yet, or the target + // backend has no health check interval configured. HealthyFloor int `yaml:"healthy_floor,omitempty"` // MaxCaptureBytes overrides the backend-level max_capture_bytes for this // ALB's fanout members. Set this when the ALB's expected response shape diff --git a/pkg/backends/alb/pool/pool.go b/pkg/backends/alb/pool/pool.go index 583239758..209e87f5c 100644 --- a/pkg/backends/alb/pool/pool.go +++ b/pkg/backends/alb/pool/pool.go @@ -32,6 +32,10 @@ type Pool interface { // between a status flip and the asynchronous healthy-list refresh, so it // is the correct method for request dispatch. Targets() Targets + // ConfiguredLen returns the number of pool members as configured, regardless + // of current health. Mechanisms compare this against len(Targets()) to + // detect a pool degraded to a subset of its configured members. + ConfiguredLen() int // SetHealthy seeds the pool's healthy set from a handler list. Intended // for tests and bootstrap paths that don't drive status updates through // healthcheck subscribers. @@ -105,6 +109,10 @@ func (p *pool) snapshot() Targets { return nil } +func (p *pool) ConfiguredLen() int { + return len(p.targets) +} + func (p *pool) Targets() Targets { if lt := p.liveTargets.Load(); lt != nil && !p.refreshPending.Load() { cached := *lt diff --git a/pkg/backends/backends.go b/pkg/backends/backends.go index 28f97de06..c4bc618b6 100644 --- a/pkg/backends/backends.go +++ b/pkg/backends/backends.go @@ -40,8 +40,8 @@ func (b Backends) StartHealthChecks(knownStatuses healthcheck.StatusLookup) (hea } if IsVirtual(bo.Provider) { // Virtual backends have no upstream to probe; register a synthetic - // passing status so nested ALBs surface in the health page and in - // outer pools' availablePoolMembers (issue #996). + // passing status so they surface in the health page and in outer + // ALB pool reporting. hc.RegisterVirtual(k, bo.Provider) continue } diff --git a/pkg/backends/healthcheck/healthcheck.go b/pkg/backends/healthcheck/healthcheck.go index 6c70f426f..5cb2ad01a 100644 --- a/pkg/backends/healthcheck/healthcheck.go +++ b/pkg/backends/healthcheck/healthcheck.go @@ -32,8 +32,8 @@ type HealthChecker interface { // Register a health check Target Register(name string, description string, options *ho.Options, client *http.Client) (*Status, error) // RegisterVirtual records a synthetic always-passing Status for a virtual - // backend (rule, alb) that has no upstream to probe. Without this, - // nested ALBs render as "nc" on the health page (issue #996). + // backend (rule, alb) that has no upstream to probe, so it surfaces in + // the health page and in outer ALB pool reporting. RegisterVirtual(name, description string) *Status // Remove a health check Target Unregister(name string) diff --git a/pkg/observability/metrics/metrics.go b/pkg/observability/metrics/metrics.go index 010ef87c3..0e2791622 100644 --- a/pkg/observability/metrics/metrics.go +++ b/pkg/observability/metrics/metrics.go @@ -453,6 +453,35 @@ var ( }, []string{"backend_name"}, ) + + // ALBPoolAdmitsFailing flags ALB pools whose healthy_floor admits a Failing + // status (floor <= StatusFailing). Operators who set floor below 0 to keep + // traffic flowing during the Initializing window may not realize they're + // also admitting members the probe has confirmed broken; the gauge surfaces + // that misconfiguration without spamming logs. + ALBPoolAdmitsFailing = prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Namespace: metricNamespace, + Subsystem: albSubsystem, + Name: "pool_admits_failing", + Help: "1 when an ALB pool's healthy_floor admits members in Failing state; 0 otherwise.", + }, + []string{"backend_name"}, + ) + + // ALBPoolFloorReset flags ALB pools whose healthy_floor was reset to 0 at + // startup because one or more pool members have no health check and could + // never reach the configured floor (>= Passing), which would otherwise + // empty the pool and 502 every request. + ALBPoolFloorReset = prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Namespace: metricNamespace, + Subsystem: albSubsystem, + Name: "pool_floor_reset", + Help: "1 when an ALB pool's healthy_floor was reset to 0 because members lack health checks; 0 otherwise.", + }, + []string{"backend_name"}, + ) ) func init() { @@ -478,6 +507,8 @@ func init() { prometheus.MustRegister(CacheIndexPanicRecovered) prometheus.MustRegister(HealthHandlerPanicRecovered) prometheus.MustRegister(HealthcheckStatusNotifyPanicRecovered) + prometheus.MustRegister(ALBPoolAdmitsFailing) + prometheus.MustRegister(ALBPoolFloorReset) prometheus.MustRegister(CacheObjectOperations) prometheus.MustRegister(CacheByteOperations) prometheus.MustRegister(CacheEvents) diff --git a/pkg/proxy/engines/deltaproxycache_chunk_test.go b/pkg/proxy/engines/deltaproxycache_chunk_test.go index 85c30202b..23d442b71 100644 --- a/pkg/proxy/engines/deltaproxycache_chunk_test.go +++ b/pkg/proxy/engines/deltaproxycache_chunk_test.go @@ -605,6 +605,7 @@ func TestDeltaProxyCacheRequestRangeMissChunks(t *testing.T) { rsc.CacheConfig.Provider = "test" o.FastForwardDisable = true + o.TimeseriesRetention = 1 << 20 // pin against wallclock drift on fixed fixture step := time.Duration(3600) * time.Second @@ -753,6 +754,7 @@ func TestDeltaProxyCacheRequestRangeMissChunks_CrossBucket(t *testing.T) { o := rsc.BackendOptions rsc.CacheConfig.Provider = "test" o.FastForwardDisable = true + o.TimeseriesRetention = 1 << 20 // pin against wallclock drift on fixed fixture step := time.Duration(3600) * time.Second @@ -835,6 +837,7 @@ func TestDeltaProxyCacheRequestRangeMissChunks_CrossBucketPreservesPriorChunks(t o := rsc.BackendOptions rsc.CacheConfig.Provider = "test" o.FastForwardDisable = true + o.TimeseriesRetention = 1 << 20 // pin against wallclock drift on fixed fixture step := time.Duration(3600) * time.Second now := time.Date(2026, 4, 20, 11, 0, 0, 0, time.UTC)