Skip to content
This repository was archived by the owner on Apr 16, 2026. It is now read-only.
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -160,6 +160,7 @@ When any admin token is configured (`server.admin_token`, `server.admin_token_se
- Replay is capped by `server.admin_replay_max_limit` (default `2000`, valid range `1..100000`); accepted response includes replay job metadata (`job_id`, `status`, `effective_limit`, `max_limit`, `capped`, `dry_run`).
- Replay job metadata retention/capacity is configurable (`server.admin_replay_job_ttl`, `server.admin_replay_job_max_jobs`), and backend is configurable (`server.admin_replay_store_backend=memory|sqlite`, `server.admin_replay_sqlite_path`).
- Replay execution is configurable (`server.admin_replay_job_timeout`, `server.admin_replay_max_concurrent_jobs`) for bounded runtime and concurrency.
- HTTP shutdown grace period is configurable via `server.shutdown_timeout` (default `10s`, must be greater than `0`), allowing slower drains in production.
- Queue fan-out safety rails are configurable (`server.admin_replay_max_queued_per_ip`, `server.admin_replay_max_queued_per_token`) and return `409` when exceeded.
- `GET /admin/replay-dlq`
- Requires header `X-Admin-Token` with read permission (`admin_token`/`admin_token_secondary`/`admin_token_read`/`admin_token_replay`/`admin_token_cancel`).
Expand Down
10 changes: 10 additions & 0 deletions charts/siphon/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -185,6 +185,7 @@ Auth notes:
- `config.nats.stream_compression` supports `none|s2`; `config.nats.stream_max_consumers` and `config.nats.stream_max_msgs_per_subject` must be `>= 0`.
- `config.clickhouse.consumer_backoff` values must be positive and non-decreasing; when `config.clickhouse.consumer_max_deliver > 0`, it must equal the backoff list length.
- Keep `config.clickhouse.consumer_fetch_max_wait < config.clickhouse.consumer_ack_wait` and `config.clickhouse.insert_timeout + config.clickhouse.flush_interval < config.clickhouse.consumer_ack_wait`.
- `config.server.shutdown_timeout` (default `10s`, must be greater than `0`) controls how long the tap server waits for in-flight work to drain before forcing HTTP shutdown.

## Ops hardening defaults

Expand All @@ -197,6 +198,7 @@ Auth notes:
- `networkPolicy.natsEgressTo=[]` and `networkPolicy.clickhouseEgressTo=[]` optionally scope derived transport rules to destination selectors (`namespaceSelector`, `podSelector`, `ipBlock`) for least-privilege egress.
- `envSecrets` supports direct `env` values from secret key references.
- `autoscaling.customMetrics` enables HPA custom metrics in addition to CPU/memory targets.
- `values-production.yaml` enables HPA by default for production installs (`minReplicas=2`, `maxReplicas=10`, CPU and memory utilization targets both at 80%).

Example selector-based transport policy:

Expand All @@ -209,6 +211,14 @@ helm upgrade --install siphon ./charts/siphon \
--set networkPolicy.clickhouseEgressTo[0].ipBlock.cidr=10.42.0.0/16
```

## Production profile

```bash
helm upgrade --install siphon ./charts/siphon \
--namespace siphon \
-f ./charts/siphon/values-production.yaml
```

## Enable sqlite state persistence

```bash
Expand Down
6 changes: 6 additions & 0 deletions charts/siphon/values-production.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# Production profile overrides: turn autoscaling on, scaling between 2 and 10
# replicas with CPU and memory utilization targets of 80%.
autoscaling:
enabled: true
minReplicas: 2
maxReplicas: 10
targetCPUUtilizationPercentage: 80
targetMemoryUtilizationPercentage: 80
4 changes: 4 additions & 0 deletions charts/siphon/values.schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -638,6 +638,10 @@
"type": "string",
"description": "HTTP server write timeout."
},
"shutdown_timeout": {
"type": "string",
"description": "HTTP server shutdown grace timeout."
},
"max_body_size": {
"type": "integer",
"minimum": 1,
Expand Down
1 change: 1 addition & 0 deletions charts/siphon/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -230,6 +230,7 @@ config:
base_path: /webhooks
read_timeout: 10s
write_timeout: 5s
shutdown_timeout: 10s
max_body_size: 1048576
admin_token: ${TAP_ADMIN_TOKEN}
admin_token_secondary: ${TAP_ADMIN_TOKEN_SECONDARY}
Expand Down
2 changes: 1 addition & 1 deletion cmd/tap/run.go
Original file line number Diff line number Diff line change
Expand Up @@ -163,7 +163,7 @@ func run(ctx context.Context, cfg config.Config, logger *slog.Logger) error {
}
}

shutdownCtx, shutdownCancel := context.WithTimeout(context.Background(), 10*time.Second)
shutdownCtx, shutdownCancel := context.WithTimeout(context.Background(), cfg.Server.ShutdownTimeout)
defer shutdownCancel()

if err := ingressServer.Shutdown(shutdownCtx); err != nil && !errors.Is(err, http.ErrServerClosed) {
Expand Down
1 change: 1 addition & 0 deletions config.example.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -191,6 +191,7 @@ server:
base_path: /webhooks
read_timeout: 10s
write_timeout: 5s
shutdown_timeout: 10s
max_body_size: 1048576
admin_token: ${TAP_ADMIN_TOKEN}
admin_token_secondary: ${TAP_ADMIN_TOKEN_SECONDARY}
Expand Down
7 changes: 7 additions & 0 deletions config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -190,6 +190,7 @@ type ServerConfig struct {
BasePath string `koanf:"base_path"`
ReadTimeout time.Duration `koanf:"read_timeout"`
WriteTimeout time.Duration `koanf:"write_timeout"`
ShutdownTimeout time.Duration `koanf:"shutdown_timeout"`
MaxBodySize int64 `koanf:"max_body_size"`
AdminToken string `koanf:"admin_token"`
AdminTokenSecondary string `koanf:"admin_token_secondary"`
Expand Down Expand Up @@ -339,6 +340,9 @@ func (c *Config) ApplyDefaults() {
if c.Server.WriteTimeout == 0 {
c.Server.WriteTimeout = 5 * time.Second
}
if c.Server.ShutdownTimeout == 0 {
c.Server.ShutdownTimeout = 10 * time.Second
}
if c.Server.MaxBodySize == 0 {
c.Server.MaxBodySize = 1 << 20
}
Expand Down Expand Up @@ -420,6 +424,9 @@ func (c Config) Validate() error {
if c.Server.AdminReplayMaxLimit <= 0 || c.Server.AdminReplayMaxLimit > maxAdminReplayMaxLimit {
return fmt.Errorf("server.admin_replay_max_limit must be in range 1..%d", maxAdminReplayMaxLimit)
}
if c.Server.ShutdownTimeout <= 0 {
return fmt.Errorf("server.shutdown_timeout must be greater than 0")
}
if c.Server.AdminReplayJobTTL <= 0 {
return fmt.Errorf("server.admin_replay_job_ttl must be greater than 0")
}
Expand Down
21 changes: 21 additions & 0 deletions config/config_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ server:

t.Setenv("STRIPE_WEBHOOK_SECRET", "whsec_123")
t.Setenv("TAP_SERVER_PORT", "9091")
t.Setenv("TAP_SERVER_SHUTDOWN_TIMEOUT", "17s")

cfg, err := Load(path)
if err != nil {
Expand All @@ -43,6 +44,9 @@ server:
if got := cfg.Server.Port; got != 9091 {
t.Fatalf("expected env override port 9091, got %d", got)
}
if got := cfg.Server.ShutdownTimeout; got != 17*time.Second {
t.Fatalf("expected env override shutdown timeout 17s, got %s", got)
}
if cfg.Providers["stripe"].Secret != "whsec_123" {
t.Fatalf("expected secret expansion")
}
Expand Down Expand Up @@ -179,6 +183,9 @@ func TestLoadConfigMissingFileAppliesDefaults(t *testing.T) {
if cfg.Server.BasePath != "/webhooks" {
t.Fatalf("expected default base path, got %q", cfg.Server.BasePath)
}
if cfg.Server.ShutdownTimeout != 10*time.Second {
t.Fatalf("expected default shutdown timeout 10s, got %s", cfg.Server.ShutdownTimeout)
}
if cfg.Server.AdminReplayMaxLimit != 2000 {
t.Fatalf("expected default admin replay max limit 2000, got %d", cfg.Server.AdminReplayMaxLimit)
}
Expand Down Expand Up @@ -327,6 +334,16 @@ server:
}
}

// TestConfigValidateRejectsNonPositiveShutdownTimeout verifies that Validate
// rejects every non-positive server.shutdown_timeout, covering both zero and a
// negative duration. The value is overwritten after ApplyDefaults so that the
// default cannot mask the invalid setting.
func TestConfigValidateRejectsNonPositiveShutdownTimeout(t *testing.T) {
	for _, timeout := range []time.Duration{0, -1 * time.Second} {
		t.Run(timeout.String(), func(t *testing.T) {
			cfg := Config{}
			cfg.ApplyDefaults()
			cfg.Server.ShutdownTimeout = timeout

			// The error must name the offending key so operators can locate it.
			if err := cfg.Validate(); err == nil || !strings.Contains(err.Error(), "server.shutdown_timeout") {
				t.Fatalf("expected shutdown timeout validation error for %s, got %v", timeout, err)
			}
		})
	}
}

func TestLoadConfigVaultReferenceRequiresAddress(t *testing.T) {
t.Setenv("VAULT_ADDR", "")

Expand Down Expand Up @@ -417,6 +434,7 @@ func TestLoadConfigSnakeCaseEnvOverrides(t *testing.T) {
t.Setenv("TAP_NATS_SECURE", "true")
t.Setenv("TAP_NATS_CA_FILE", "/var/run/secrets/nats/ca.crt")
t.Setenv("TAP_SERVER_MAX_BODY_SIZE", "2097152")
t.Setenv("TAP_SERVER_SHUTDOWN_TIMEOUT", "11s")
t.Setenv("TAP_SERVER_ADMIN_REPLAY_MAX_LIMIT", "1234")
t.Setenv("TAP_SERVER_ADMIN_REPLAY_JOB_TTL", "12h")
t.Setenv("TAP_SERVER_ADMIN_REPLAY_JOB_MAX_JOBS", "777")
Expand Down Expand Up @@ -481,6 +499,9 @@ func TestLoadConfigSnakeCaseEnvOverrides(t *testing.T) {
if cfg.Server.MaxBodySize != 2097152 {
t.Fatalf("expected server.max_body_size override, got %d", cfg.Server.MaxBodySize)
}
if cfg.Server.ShutdownTimeout != 11*time.Second {
t.Fatalf("expected server.shutdown_timeout override, got %s", cfg.Server.ShutdownTimeout)
}
if cfg.Server.AdminReplayMaxLimit != 1234 {
t.Fatalf("expected server.admin_replay_max_limit override, got %d", cfg.Server.AdminReplayMaxLimit)
}
Expand Down
10 changes: 9 additions & 1 deletion scripts/assert-chart-render.sh
Original file line number Diff line number Diff line change
Expand Up @@ -26,9 +26,10 @@ rendered_default="$(mktemp)"
rendered_fixture="$(mktemp)"
rendered_automount="$(mktemp)"
rendered_startup_disabled="$(mktemp)"
rendered_production="$(mktemp)"
fixture_values="$(mktemp)"
cleanup() {
rm -f "${rendered_default}" "${rendered_fixture}" "${rendered_automount}" "${rendered_startup_disabled}" "${fixture_values}"
rm -f "${rendered_default}" "${rendered_fixture}" "${rendered_automount}" "${rendered_startup_disabled}" "${rendered_production}" "${fixture_values}"
}
trap cleanup EXIT

Expand Down Expand Up @@ -65,6 +66,7 @@ helm template siphon charts/siphon >"${rendered_default}"
helm template siphon charts/siphon -f "${fixture_values}" >"${rendered_fixture}"
helm template siphon charts/siphon --set serviceAccount.automount=true >"${rendered_automount}"
helm template siphon charts/siphon --set startupProbe.enabled=false >"${rendered_startup_disabled}"
helm template siphon charts/siphon -f charts/siphon/values-production.yaml >"${rendered_production}"

default_automount="$(yq -r 'select(.kind == "Deployment") | .spec.template.spec.automountServiceAccountToken' "${rendered_default}")"
[[ "${default_automount}" == "false" ]] || fail "default automountServiceAccountToken should be false, got ${default_automount}"
Expand Down Expand Up @@ -93,4 +95,10 @@ startup_path="$(yq -r 'select(.kind == "Deployment") | .spec.template.spec.conta
startup_disabled="$(yq -r 'select(.kind == "Deployment") | .spec.template.spec.containers[] | select(.name == "tap") | has("startupProbe")' "${rendered_startup_disabled}")"
[[ "${startup_disabled}" == "false" ]] || fail "startupProbe should be omitted when startupProbe.enabled=false, got ${startup_disabled}"

# The production values file must turn autoscaling on, so the render has to contain an HPA object.
production_hpa="$(yq -r 'select(.kind == "HorizontalPodAutoscaler") | .kind' "${rendered_production}")"
[[ "${production_hpa}" == "HorizontalPodAutoscaler" ]] || fail "production profile should render an HPA, got ${production_hpa}"

# With autoscaling enabled the Deployment must not set spec.replicas; head -n1 keeps only the first yq result line.
production_replicas="$(yq -r 'select(.kind == "Deployment") | .spec | has("replicas")' "${rendered_production}" | head -n1)"
[[ "${production_replicas}" == "false" ]] || fail "production profile should omit deployment replicas when autoscaling is enabled, got ${production_replicas}"

echo "ok: chart render assertions passed"
Loading