From 5de3ee7e7adad42bd6e88baf6f4052ee2cb06981 Mon Sep 17 00:00:00 2001 From: Jonathan Haas Date: Wed, 15 Apr 2026 17:13:57 -0700 Subject: [PATCH] Add siphon production ops profile --- README.md | 1 + charts/siphon/README.md | 10 ++++++++++ charts/siphon/values-production.yaml | 6 ++++++ charts/siphon/values.schema.json | 4 ++++ charts/siphon/values.yaml | 1 + cmd/tap/run.go | 2 +- config.example.yaml | 1 + config/config.go | 7 +++++++ config/config_test.go | 21 +++++++++++++++++++++ scripts/assert-chart-render.sh | 10 +++++++++- 10 files changed, 61 insertions(+), 2 deletions(-) create mode 100644 charts/siphon/values-production.yaml diff --git a/README.md b/README.md index c957af9..4c61689 100644 --- a/README.md +++ b/README.md @@ -160,6 +160,7 @@ When any admin token is configured (`server.admin_token`, `server.admin_token_se - Replay is capped by `server.admin_replay_max_limit` (default `2000`, valid range `1..100000`); accepted response includes replay job metadata (`job_id`, `status`, `effective_limit`, `max_limit`, `capped`, `dry_run`). - Replay job metadata retention/capacity is configurable (`server.admin_replay_job_ttl`, `server.admin_replay_job_max_jobs`), and backend is configurable (`server.admin_replay_store_backend=memory|sqlite`, `server.admin_replay_sqlite_path`). - Replay execution is configurable (`server.admin_replay_job_timeout`, `server.admin_replay_max_concurrent_jobs`) for bounded runtime and concurrency. + - HTTP shutdown grace is configurable via `server.shutdown_timeout` for slower drains in production. - Queue fan-out safety rails are configurable (`server.admin_replay_max_queued_per_ip`, `server.admin_replay_max_queued_per_token`) and return `409` when exceeded. - `GET /admin/replay-dlq` - Requires header `X-Admin-Token` with read permission (`admin_token`/`admin_token_secondary`/`admin_token_read`/`admin_token_replay`/`admin_token_cancel`). 
diff --git a/charts/siphon/README.md b/charts/siphon/README.md index 010024d..6df281b 100644 --- a/charts/siphon/README.md +++ b/charts/siphon/README.md @@ -185,6 +185,7 @@ Auth notes: - `config.nats.stream_compression` supports `none|s2`; `config.nats.stream_max_consumers` and `config.nats.stream_max_msgs_per_subject` must be `>= 0`. - `config.clickhouse.consumer_backoff` values must be positive and non-decreasing; when `config.clickhouse.consumer_max_deliver > 0`, it must equal the backoff list length. - Keep `config.clickhouse.consumer_fetch_max_wait < config.clickhouse.consumer_ack_wait` and `config.clickhouse.insert_timeout + config.clickhouse.flush_interval < config.clickhouse.consumer_ack_wait`. +- `config.server.shutdown_timeout` (default `10s`, must be greater than 0) controls how long the tap server waits for in-flight work to drain before forcing HTTP shutdown. ## Ops hardening defaults @@ -197,6 +198,7 @@ Auth notes: - `networkPolicy.natsEgressTo=[]` and `networkPolicy.clickhouseEgressTo=[]` optionally scope derived transport rules to destination selectors (`namespaceSelector`, `podSelector`, `ipBlock`) for least-privilege egress. - `envSecrets` supports direct `env` values from secret key references. - `autoscaling.customMetrics` enables HPA custom metrics in addition to CPU/memory targets. +- `values-production.yaml` enables the HPA for production installs (`minReplicas=2`, `maxReplicas=10`, CPU and memory utilization targets both set to 80%). 
Example selector-based transport policy: @@ -209,6 +211,14 @@ helm upgrade --install siphon ./charts/siphon \ --set networkPolicy.clickhouseEgressTo[0].ipBlock.cidr=10.42.0.0/16 ``` +## Production profile + +```bash +helm upgrade --install siphon ./charts/siphon \ + --namespace siphon \ + -f ./charts/siphon/values-production.yaml +``` + ## Enable sqlite state persistence ```bash diff --git a/charts/siphon/values-production.yaml b/charts/siphon/values-production.yaml new file mode 100644 index 0000000..e215516 --- /dev/null +++ b/charts/siphon/values-production.yaml @@ -0,0 +1,6 @@ +autoscaling: + enabled: true + minReplicas: 2 + maxReplicas: 10 + targetCPUUtilizationPercentage: 80 + targetMemoryUtilizationPercentage: 80 diff --git a/charts/siphon/values.schema.json b/charts/siphon/values.schema.json index 4c85911..fb4a825 100644 --- a/charts/siphon/values.schema.json +++ b/charts/siphon/values.schema.json @@ -638,6 +638,10 @@ "type": "string", "description": "HTTP server write timeout." }, + "shutdown_timeout": { + "type": "string", + "description": "HTTP server shutdown grace timeout." 
+ }, "max_body_size": { "type": "integer", "minimum": 1, diff --git a/charts/siphon/values.yaml b/charts/siphon/values.yaml index cf4ce6b..5ce6e23 100644 --- a/charts/siphon/values.yaml +++ b/charts/siphon/values.yaml @@ -230,6 +230,7 @@ config: base_path: /webhooks read_timeout: 10s write_timeout: 5s + shutdown_timeout: 10s max_body_size: 1048576 admin_token: ${TAP_ADMIN_TOKEN} admin_token_secondary: ${TAP_ADMIN_TOKEN_SECONDARY} diff --git a/cmd/tap/run.go b/cmd/tap/run.go index 4b6df64..49fbf11 100644 --- a/cmd/tap/run.go +++ b/cmd/tap/run.go @@ -163,7 +163,7 @@ func run(ctx context.Context, cfg config.Config, logger *slog.Logger) error { } } - shutdownCtx, shutdownCancel := context.WithTimeout(context.Background(), 10*time.Second) + shutdownCtx, shutdownCancel := context.WithTimeout(context.Background(), cfg.Server.ShutdownTimeout) defer shutdownCancel() if err := ingressServer.Shutdown(shutdownCtx); err != nil && !errors.Is(err, http.ErrServerClosed) { diff --git a/config.example.yaml b/config.example.yaml index 34386ad..eddba1e 100644 --- a/config.example.yaml +++ b/config.example.yaml @@ -191,6 +191,7 @@ server: base_path: /webhooks read_timeout: 10s write_timeout: 5s + shutdown_timeout: 10s max_body_size: 1048576 admin_token: ${TAP_ADMIN_TOKEN} admin_token_secondary: ${TAP_ADMIN_TOKEN_SECONDARY} diff --git a/config/config.go b/config/config.go index 35fa0c4..0be4d42 100644 --- a/config/config.go +++ b/config/config.go @@ -190,6 +190,7 @@ type ServerConfig struct { BasePath string `koanf:"base_path"` ReadTimeout time.Duration `koanf:"read_timeout"` WriteTimeout time.Duration `koanf:"write_timeout"` + ShutdownTimeout time.Duration `koanf:"shutdown_timeout"` MaxBodySize int64 `koanf:"max_body_size"` AdminToken string `koanf:"admin_token"` AdminTokenSecondary string `koanf:"admin_token_secondary"` @@ -339,6 +340,9 @@ func (c *Config) ApplyDefaults() { if c.Server.WriteTimeout == 0 { c.Server.WriteTimeout = 5 * time.Second } + if c.Server.ShutdownTimeout == 0 { + 
c.Server.ShutdownTimeout = 10 * time.Second + } if c.Server.MaxBodySize == 0 { c.Server.MaxBodySize = 1 << 20 } @@ -420,6 +424,9 @@ func (c Config) Validate() error { if c.Server.AdminReplayMaxLimit <= 0 || c.Server.AdminReplayMaxLimit > maxAdminReplayMaxLimit { return fmt.Errorf("server.admin_replay_max_limit must be in range 1..%d", maxAdminReplayMaxLimit) } + if c.Server.ShutdownTimeout <= 0 { + return fmt.Errorf("server.shutdown_timeout must be greater than 0") + } if c.Server.AdminReplayJobTTL <= 0 { return fmt.Errorf("server.admin_replay_job_ttl must be greater than 0") } diff --git a/config/config_test.go b/config/config_test.go index 6421899..bb91d58 100644 --- a/config/config_test.go +++ b/config/config_test.go @@ -35,6 +35,7 @@ server: t.Setenv("STRIPE_WEBHOOK_SECRET", "whsec_123") t.Setenv("TAP_SERVER_PORT", "9091") + t.Setenv("TAP_SERVER_SHUTDOWN_TIMEOUT", "17s") cfg, err := Load(path) if err != nil { @@ -43,6 +44,9 @@ server: if got := cfg.Server.Port; got != 9091 { t.Fatalf("expected env override port 9091, got %d", got) } + if got := cfg.Server.ShutdownTimeout; got != 17*time.Second { + t.Fatalf("expected env override shutdown timeout 17s, got %s", got) + } if cfg.Providers["stripe"].Secret != "whsec_123" { t.Fatalf("expected secret expansion") } @@ -179,6 +183,9 @@ func TestLoadConfigMissingFileAppliesDefaults(t *testing.T) { if cfg.Server.BasePath != "/webhooks" { t.Fatalf("expected default base path, got %q", cfg.Server.BasePath) } + if cfg.Server.ShutdownTimeout != 10*time.Second { + t.Fatalf("expected default shutdown timeout 10s, got %s", cfg.Server.ShutdownTimeout) + } if cfg.Server.AdminReplayMaxLimit != 2000 { t.Fatalf("expected default admin replay max limit 2000, got %d", cfg.Server.AdminReplayMaxLimit) } @@ -327,6 +334,16 @@ server: } } +func TestConfigValidateRejectsNonPositiveShutdownTimeout(t *testing.T) { + cfg := Config{} + cfg.ApplyDefaults() + cfg.Server.ShutdownTimeout = 0 + + if err := cfg.Validate(); err == nil || 
!strings.Contains(err.Error(), "server.shutdown_timeout") { + t.Fatalf("expected shutdown timeout validation error, got %v", err) + } +} + func TestLoadConfigVaultReferenceRequiresAddress(t *testing.T) { t.Setenv("VAULT_ADDR", "") @@ -417,6 +434,7 @@ func TestLoadConfigSnakeCaseEnvOverrides(t *testing.T) { t.Setenv("TAP_NATS_SECURE", "true") t.Setenv("TAP_NATS_CA_FILE", "/var/run/secrets/nats/ca.crt") t.Setenv("TAP_SERVER_MAX_BODY_SIZE", "2097152") + t.Setenv("TAP_SERVER_SHUTDOWN_TIMEOUT", "11s") t.Setenv("TAP_SERVER_ADMIN_REPLAY_MAX_LIMIT", "1234") t.Setenv("TAP_SERVER_ADMIN_REPLAY_JOB_TTL", "12h") t.Setenv("TAP_SERVER_ADMIN_REPLAY_JOB_MAX_JOBS", "777") @@ -481,6 +499,9 @@ func TestLoadConfigSnakeCaseEnvOverrides(t *testing.T) { if cfg.Server.MaxBodySize != 2097152 { t.Fatalf("expected server.max_body_size override, got %d", cfg.Server.MaxBodySize) } + if cfg.Server.ShutdownTimeout != 11*time.Second { + t.Fatalf("expected server.shutdown_timeout override, got %s", cfg.Server.ShutdownTimeout) + } if cfg.Server.AdminReplayMaxLimit != 1234 { t.Fatalf("expected server.admin_replay_max_limit override, got %d", cfg.Server.AdminReplayMaxLimit) } diff --git a/scripts/assert-chart-render.sh b/scripts/assert-chart-render.sh index b04116e..9528961 100755 --- a/scripts/assert-chart-render.sh +++ b/scripts/assert-chart-render.sh @@ -26,9 +26,10 @@ rendered_default="$(mktemp)" rendered_fixture="$(mktemp)" rendered_automount="$(mktemp)" rendered_startup_disabled="$(mktemp)" +rendered_production="$(mktemp)" fixture_values="$(mktemp)" cleanup() { - rm -f "${rendered_default}" "${rendered_fixture}" "${rendered_automount}" "${rendered_startup_disabled}" "${fixture_values}" + rm -f "${rendered_default}" "${rendered_fixture}" "${rendered_automount}" "${rendered_startup_disabled}" "${rendered_production}" "${fixture_values}" } trap cleanup EXIT @@ -65,6 +66,7 @@ helm template siphon charts/siphon >"${rendered_default}" helm template siphon charts/siphon -f "${fixture_values}" 
>"${rendered_fixture}" helm template siphon charts/siphon --set serviceAccount.automount=true >"${rendered_automount}" helm template siphon charts/siphon --set startupProbe.enabled=false >"${rendered_startup_disabled}" +helm template siphon charts/siphon -f charts/siphon/values-production.yaml >"${rendered_production}" default_automount="$(yq -r 'select(.kind == "Deployment") | .spec.template.spec.automountServiceAccountToken' "${rendered_default}")" [[ "${default_automount}" == "false" ]] || fail "default automountServiceAccountToken should be false, got ${default_automount}" @@ -93,4 +95,10 @@ startup_path="$(yq -r 'select(.kind == "Deployment") | .spec.template.spec.conta startup_disabled="$(yq -r 'select(.kind == "Deployment") | .spec.template.spec.containers[] | select(.name == "tap") | has("startupProbe")' "${rendered_startup_disabled}")" [[ "${startup_disabled}" == "false" ]] || fail "startupProbe should be omitted when startupProbe.enabled=false, got ${startup_disabled}" +production_hpa="$(yq -r 'select(.kind == "HorizontalPodAutoscaler") | .kind' "${rendered_production}")" +[[ "${production_hpa}" == "HorizontalPodAutoscaler" ]] || fail "production profile should render an HPA, got ${production_hpa}" + +production_replicas="$(yq -r 'select(.kind == "Deployment") | .spec | has("replicas")' "${rendered_production}" | head -n1)" +[[ "${production_replicas}" == "false" ]] || fail "production profile should omit deployment replicas when autoscaling is enabled, got ${production_replicas}" + echo "ok: chart render assertions passed"