From 5de3ee7e7adad42bd6e88baf6f4052ee2cb06981 Mon Sep 17 00:00:00 2001
From: Jonathan Haas <jonathan@haas.holdings>
Date: Wed, 15 Apr 2026 17:13:57 -0700
Subject: [PATCH] Add siphon production ops profile

---
 README.md                            |  1 +
 charts/siphon/README.md              | 10 ++++++++++
 charts/siphon/values-production.yaml |  6 ++++++
 charts/siphon/values.schema.json     |  4 ++++
 charts/siphon/values.yaml            |  1 +
 cmd/tap/run.go                       |  2 +-
 config.example.yaml                  |  1 +
 config/config.go                     |  7 +++++++
 config/config_test.go                | 21 +++++++++++++++++++++
 scripts/assert-chart-render.sh       | 10 +++++++++-
 10 files changed, 61 insertions(+), 2 deletions(-)
 create mode 100644 charts/siphon/values-production.yaml

diff --git a/README.md b/README.md
index c957af9..4c61689 100644
--- a/README.md
+++ b/README.md
@@ -160,6 +160,7 @@ When any admin token is configured (`server.admin_token`, `server.admin_token_se
   - Replay is capped by `server.admin_replay_max_limit` (default `2000`, valid range `1..100000`); accepted response includes replay job metadata (`job_id`, `status`, `effective_limit`, `max_limit`, `capped`, `dry_run`).
   - Replay job metadata retention/capacity is configurable (`server.admin_replay_job_ttl`, `server.admin_replay_job_max_jobs`), and backend is configurable (`server.admin_replay_store_backend=memory|sqlite`, `server.admin_replay_sqlite_path`).
   - Replay execution is configurable (`server.admin_replay_job_timeout`, `server.admin_replay_max_concurrent_jobs`) for bounded runtime and concurrency.
+  - HTTP shutdown grace period is configurable via `server.shutdown_timeout` (default `10s`, must be greater than `0`); raise it when production drains need longer.
   - Queue fan-out safety rails are configurable (`server.admin_replay_max_queued_per_ip`, `server.admin_replay_max_queued_per_token`) and return `409` when exceeded.
 - `GET /admin/replay-dlq`
   - Requires header `X-Admin-Token` with read permission (`admin_token`/`admin_token_secondary`/`admin_token_read`/`admin_token_replay`/`admin_token_cancel`).
diff --git a/charts/siphon/README.md b/charts/siphon/README.md
index 010024d..6df281b 100644
--- a/charts/siphon/README.md
+++ b/charts/siphon/README.md
@@ -185,6 +185,7 @@ Auth notes:
 - `config.nats.stream_compression` supports `none|s2`; `config.nats.stream_max_consumers` and `config.nats.stream_max_msgs_per_subject` must be `>= 0`.
 - `config.clickhouse.consumer_backoff` values must be positive and non-decreasing; when `config.clickhouse.consumer_max_deliver > 0`, it must equal the backoff list length.
 - Keep `config.clickhouse.consumer_fetch_max_wait < config.clickhouse.consumer_ack_wait` and `config.clickhouse.insert_timeout + config.clickhouse.flush_interval < config.clickhouse.consumer_ack_wait`.
+- `config.server.shutdown_timeout` (default `10s`, must be greater than `0`) controls how long the tap server waits for in-flight work to drain before forcing HTTP shutdown.
 
 ## Ops hardening defaults
 
@@ -197,6 +198,7 @@ Auth notes:
 - `networkPolicy.natsEgressTo=[]` and `networkPolicy.clickhouseEgressTo=[]` optionally scope derived transport rules to destination selectors (`namespaceSelector`, `podSelector`, `ipBlock`) for least-privilege egress.
 - `envSecrets` supports direct `env` values from secret key references.
 - `autoscaling.customMetrics` enables HPA custom metrics in addition to CPU/memory targets.
+- `values-production.yaml` enables HPA by default for production installs (`minReplicas=2`, `maxReplicas=10`, CPU and memory utilization targets at 80%).
 
 Example selector-based transport policy:
 
@@ -209,6 +211,14 @@ helm upgrade --install siphon ./charts/siphon \
   --set networkPolicy.clickhouseEgressTo[0].ipBlock.cidr=10.42.0.0/16
 ```
 
+## Production profile
+
+```bash
+helm upgrade --install siphon ./charts/siphon \
+  --namespace siphon \
+  -f ./charts/siphon/values-production.yaml
+```
+
 ## Enable sqlite state persistence
 
 ```bash
diff --git a/charts/siphon/values-production.yaml b/charts/siphon/values-production.yaml
new file mode 100644
index 0000000..e215516
--- /dev/null
+++ b/charts/siphon/values-production.yaml
@@ -0,0 +1,6 @@
+autoscaling:
+  enabled: true
+  minReplicas: 2
+  maxReplicas: 10
+  targetCPUUtilizationPercentage: 80
+  targetMemoryUtilizationPercentage: 80
diff --git a/charts/siphon/values.schema.json b/charts/siphon/values.schema.json
index 4c85911..fb4a825 100644
--- a/charts/siphon/values.schema.json
+++ b/charts/siphon/values.schema.json
@@ -638,6 +638,10 @@
               "type": "string",
               "description": "HTTP server write timeout."
             },
+            "shutdown_timeout": {
+              "type": "string",
+              "description": "HTTP server shutdown grace timeout."
+            },
             "max_body_size": {
               "type": "integer",
               "minimum": 1,
diff --git a/charts/siphon/values.yaml b/charts/siphon/values.yaml
index cf4ce6b..5ce6e23 100644
--- a/charts/siphon/values.yaml
+++ b/charts/siphon/values.yaml
@@ -230,6 +230,7 @@ config:
     base_path: /webhooks
     read_timeout: 10s
     write_timeout: 5s
+    shutdown_timeout: 10s
     max_body_size: 1048576
     admin_token: ${TAP_ADMIN_TOKEN}
     admin_token_secondary: ${TAP_ADMIN_TOKEN_SECONDARY}
diff --git a/cmd/tap/run.go b/cmd/tap/run.go
index 4b6df64..49fbf11 100644
--- a/cmd/tap/run.go
+++ b/cmd/tap/run.go
@@ -163,7 +163,7 @@ func run(ctx context.Context, cfg config.Config, logger *slog.Logger) error {
 		}
 	}
 
-	shutdownCtx, shutdownCancel := context.WithTimeout(context.Background(), 10*time.Second)
+	shutdownCtx, shutdownCancel := context.WithTimeout(context.Background(), cfg.Server.ShutdownTimeout)
 	defer shutdownCancel()
 
 	if err := ingressServer.Shutdown(shutdownCtx); err != nil && !errors.Is(err, http.ErrServerClosed) {
diff --git a/config.example.yaml b/config.example.yaml
index 34386ad..eddba1e 100644
--- a/config.example.yaml
+++ b/config.example.yaml
@@ -191,6 +191,7 @@ server:
   base_path: /webhooks
   read_timeout: 10s
   write_timeout: 5s
+  shutdown_timeout: 10s
   max_body_size: 1048576
   admin_token: ${TAP_ADMIN_TOKEN}
   admin_token_secondary: ${TAP_ADMIN_TOKEN_SECONDARY}
diff --git a/config/config.go b/config/config.go
index 35fa0c4..0be4d42 100644
--- a/config/config.go
+++ b/config/config.go
@@ -190,6 +190,7 @@ type ServerConfig struct {
 	BasePath                  string        `koanf:"base_path"`
 	ReadTimeout               time.Duration `koanf:"read_timeout"`
 	WriteTimeout              time.Duration `koanf:"write_timeout"`
+	ShutdownTimeout           time.Duration `koanf:"shutdown_timeout"`
 	MaxBodySize               int64         `koanf:"max_body_size"`
 	AdminToken                string        `koanf:"admin_token"`
 	AdminTokenSecondary       string        `koanf:"admin_token_secondary"`
@@ -339,6 +340,9 @@ func (c *Config) ApplyDefaults() {
 	if c.Server.WriteTimeout == 0 {
 		c.Server.WriteTimeout = 5 * time.Second
 	}
+	if c.Server.ShutdownTimeout == 0 {
+		c.Server.ShutdownTimeout = 10 * time.Second
+	}
 	if c.Server.MaxBodySize == 0 {
 		c.Server.MaxBodySize = 1 << 20
 	}
@@ -420,6 +424,9 @@ func (c Config) Validate() error {
 	if c.Server.AdminReplayMaxLimit <= 0 || c.Server.AdminReplayMaxLimit > maxAdminReplayMaxLimit {
 		return fmt.Errorf("server.admin_replay_max_limit must be in range 1..%d", maxAdminReplayMaxLimit)
 	}
+	if c.Server.ShutdownTimeout <= 0 {
+		return fmt.Errorf("server.shutdown_timeout must be greater than 0")
+	}
 	if c.Server.AdminReplayJobTTL <= 0 {
 		return fmt.Errorf("server.admin_replay_job_ttl must be greater than 0")
 	}
diff --git a/config/config_test.go b/config/config_test.go
index 6421899..bb91d58 100644
--- a/config/config_test.go
+++ b/config/config_test.go
@@ -35,6 +35,7 @@ server:
 
 	t.Setenv("STRIPE_WEBHOOK_SECRET", "whsec_123")
 	t.Setenv("TAP_SERVER_PORT", "9091")
+	t.Setenv("TAP_SERVER_SHUTDOWN_TIMEOUT", "17s")
 
 	cfg, err := Load(path)
 	if err != nil {
@@ -43,6 +44,9 @@ server:
 	if got := cfg.Server.Port; got != 9091 {
 		t.Fatalf("expected env override port 9091, got %d", got)
 	}
+	if got := cfg.Server.ShutdownTimeout; got != 17*time.Second {
+		t.Fatalf("expected env override shutdown timeout 17s, got %s", got)
+	}
 	if cfg.Providers["stripe"].Secret != "whsec_123" {
 		t.Fatalf("expected secret expansion")
 	}
@@ -179,6 +183,9 @@ func TestLoadConfigMissingFileAppliesDefaults(t *testing.T) {
 	if cfg.Server.BasePath != "/webhooks" {
 		t.Fatalf("expected default base path, got %q", cfg.Server.BasePath)
 	}
+	if cfg.Server.ShutdownTimeout != 10*time.Second {
+		t.Fatalf("expected default shutdown timeout 10s, got %s", cfg.Server.ShutdownTimeout)
+	}
 	if cfg.Server.AdminReplayMaxLimit != 2000 {
 		t.Fatalf("expected default admin replay max limit 2000, got %d", cfg.Server.AdminReplayMaxLimit)
 	}
@@ -327,6 +334,16 @@ server:
 	}
 }
 
+func TestConfigValidateRejectsNonPositiveShutdownTimeout(t *testing.T) {
+	cfg := Config{}
+	cfg.ApplyDefaults()
+	cfg.Server.ShutdownTimeout = 0 // zeroed after ApplyDefaults, which would otherwise replace a zero value with the 10s default
+
+	if err := cfg.Validate(); err == nil || !strings.Contains(err.Error(), "server.shutdown_timeout") {
+		t.Fatalf("expected shutdown timeout validation error, got %v", err)
+	}
+}
+
 func TestLoadConfigVaultReferenceRequiresAddress(t *testing.T) {
 	t.Setenv("VAULT_ADDR", "")
 
@@ -417,6 +434,7 @@ func TestLoadConfigSnakeCaseEnvOverrides(t *testing.T) {
 	t.Setenv("TAP_NATS_SECURE", "true")
 	t.Setenv("TAP_NATS_CA_FILE", "/var/run/secrets/nats/ca.crt")
 	t.Setenv("TAP_SERVER_MAX_BODY_SIZE", "2097152")
+	t.Setenv("TAP_SERVER_SHUTDOWN_TIMEOUT", "11s")
 	t.Setenv("TAP_SERVER_ADMIN_REPLAY_MAX_LIMIT", "1234")
 	t.Setenv("TAP_SERVER_ADMIN_REPLAY_JOB_TTL", "12h")
 	t.Setenv("TAP_SERVER_ADMIN_REPLAY_JOB_MAX_JOBS", "777")
@@ -481,6 +499,9 @@ func TestLoadConfigSnakeCaseEnvOverrides(t *testing.T) {
 	if cfg.Server.MaxBodySize != 2097152 {
 		t.Fatalf("expected server.max_body_size override, got %d", cfg.Server.MaxBodySize)
 	}
+	if cfg.Server.ShutdownTimeout != 11*time.Second {
+		t.Fatalf("expected server.shutdown_timeout override, got %s", cfg.Server.ShutdownTimeout)
+	}
 	if cfg.Server.AdminReplayMaxLimit != 1234 {
 		t.Fatalf("expected server.admin_replay_max_limit override, got %d", cfg.Server.AdminReplayMaxLimit)
 	}
diff --git a/scripts/assert-chart-render.sh b/scripts/assert-chart-render.sh
index b04116e..9528961 100755
--- a/scripts/assert-chart-render.sh
+++ b/scripts/assert-chart-render.sh
@@ -26,9 +26,10 @@ rendered_default="$(mktemp)"
 rendered_fixture="$(mktemp)"
 rendered_automount="$(mktemp)"
 rendered_startup_disabled="$(mktemp)"
+rendered_production="$(mktemp)"
 fixture_values="$(mktemp)"
 cleanup() {
-  rm -f "${rendered_default}" "${rendered_fixture}" "${rendered_automount}" "${rendered_startup_disabled}" "${fixture_values}"
+  rm -f "${rendered_default}" "${rendered_fixture}" "${rendered_automount}" "${rendered_startup_disabled}" "${rendered_production}" "${fixture_values}"
 }
 trap cleanup EXIT
 
@@ -65,6 +66,7 @@ helm template siphon charts/siphon >"${rendered_default}"
 helm template siphon charts/siphon -f "${fixture_values}" >"${rendered_fixture}"
 helm template siphon charts/siphon --set serviceAccount.automount=true >"${rendered_automount}"
 helm template siphon charts/siphon --set startupProbe.enabled=false >"${rendered_startup_disabled}"
+helm template siphon charts/siphon -f charts/siphon/values-production.yaml >"${rendered_production}"
 
 default_automount="$(yq -r 'select(.kind == "Deployment") | .spec.template.spec.automountServiceAccountToken' "${rendered_default}")"
 [[ "${default_automount}" == "false" ]] || fail "default automountServiceAccountToken should be false, got ${default_automount}"
@@ -93,4 +95,10 @@ startup_path="$(yq -r 'select(.kind == "Deployment") | .spec.template.spec.conta
 startup_disabled="$(yq -r 'select(.kind == "Deployment") | .spec.template.spec.containers[] | select(.name == "tap") | has("startupProbe")' "${rendered_startup_disabled}")"
 [[ "${startup_disabled}" == "false" ]] || fail "startupProbe should be omitted when startupProbe.enabled=false, got ${startup_disabled}"
 
+production_hpa="$(yq -r 'select(.kind == "HorizontalPodAutoscaler") | .kind' "${rendered_production}")"
+[[ "${production_hpa}" == "HorizontalPodAutoscaler" ]] || fail "production profile should render an HPA, got ${production_hpa}"
+
+production_replicas="$(yq -r 'select(.kind == "Deployment") | .spec | has("replicas")' "${rendered_production}" | head -n1)"
+[[ "${production_replicas}" == "false" ]] || fail "production profile should omit deployment replicas when autoscaling is enabled, got ${production_replicas}"
+
 echo "ok: chart render assertions passed"