fastly · dmichael-fastly · Jun 21, 2026 · Jun 21, 2026 · Jun 21, 2026 · Jun 21, 2026
diff --git a/.env.example b/.env.example
@@ -47,6 +47,118 @@
 # backend runs on a different host than the frontend.
 # NEXT_PUBLIC_API_URL=http://127.0.0.1:8000
 
+# ── Observability ──────────────────────────────────────────────────────────────
+# OpenTelemetry exporter. Default 'none' — no spans/metrics leave the process.
+# Set 'console' to dump spans and 60s metric snapshots to stdout (loud; useful
+# locally when chasing a perf regression). Don't set 'console' in prod — it
+# pollutes log aggregation with ~1 MB/min of JSON.
+# OTEL_EXPORTER=console
+
+# Log format. Default 'console' (colored TTY-friendly output). Set 'json' in
+# prod to emit structured JSON lines that downstream aggregators can parse.
+# STRUCTLOG_FORMAT=json
+
+# ── Security: trusted-proxy + data-dir gates ───────────────────────────────────
+# Comma-separated trusted proxy IPs. MUST be set in production alongside the
+# uvicorn flags '--proxy-headers --forwarded-allow-ips=<same-ips>' so the
+# remote-access middleware can read request.client.host as the real client IP.
+# Without this, leftmost-XFF spoofing becomes exploitable and IP-based gates
+# (rate-limit, admin detection, whitelist) silently no-op. Local dev leaves
+# this unset and the startup check downgrades to a WARNING.
+# TRUSTED_PROXY_IPS=127.0.0.1
+
+# uvicorn's own env-equivalent of '--forwarded-allow-ips'. Set in production
+# whenever TRUSTED_PROXY_IPS is set — defense in depth so a future refactor
+# that drops the CLI flag is still detected by the startup check.
+# UVICORN_FORWARDED_ALLOW_IPS=127.0.0.1
+
+# Make the proxy-headers check FATAL instead of WARNING. Set in production so
+# a misconfigured deploy refuses to start rather than running insecure.
+# REQUIRE_PROXY_HEADERS=1
+
+# Refuse to start if /app/data is not an actual mount point. Set in production
+# so that, with a broken fstab, ingest can't silently write to an ephemeral
+# location that vanishes on the next reboot. Leave unset locally (the repo
+# bind-mount isn't a real mount point).
+# STRICT_DATA_DIR_CHECK=1
+
+# Extra hostnames that count as "local" for the remote-access middleware.
+# Comma-separated. Default allowlist already includes localhost, 127.0.0.1,
+# [::1], 0.0.0.0, testserver, backend, frontend, caddy, web. Add custom
+# Docker service names here if Caddy proxies through a different upstream.
+# LOCAL_HOSTS=backend,my-custom-service
+# Legacy aliases (read for backward compat; prefer LOCAL_HOSTS):
+# LOCAL_HOST_ALLOWLIST=
+# ALLOWED_HOSTS=
+
+# ── DuckDB connection pool ─────────────────────────────────────────────────────
+# Pool is ON by default; set 0 to disable it (fresh connection per request).
+# DUCKDB_CONNECTION_POOL=0
+
+# Max concurrent connections per service in the pool. Default 8. Larger values
+# let more queries run in parallel but each pool conn carries its own DuckDB
+# memory budget — pair with DUCKDB_POOL_CONN_MEMORY_LIMIT to bound total RSS.
+# DUCKDB_POOL_MAX_SIZE=8
+
+# Per-pool-connection DuckDB memory cap (accepts '256MB', '1GB', '104857600').
+# WITHOUT this, every pool conn inherits the process-wide ~60%-of-RAM default,
+# so DUCKDB_POOL_MAX_SIZE concurrent queries can each balloon to multi-GB and
+# OOM the container under load. Recommended in production. Leave unset locally.
+# DUCKDB_POOL_CONN_MEMORY_LIMIT=1GB
+
+# Per-pool-connection DuckDB thread count. Default min(cpu_count, 8). With the
+# default 8-conn pool, that's 64 threads competing for ~8 cores — context
+# switching dominates. Set to roughly cpu_count // DUCKDB_POOL_MAX_SIZE to
+# trade single-query throughput for better tail latency under sustained load.
+# DUCKDB_POOL_CONN_THREADS=2
+
+# View-rebind lock timeout (ms) when API pool checkouts contend with cron's
+# view-update lock. Default 500ms — short so the pool never serialises behind
+# a stuck cron; falls back to the cached/persistent view on miss. Set 0 for
+# emergency-rollback to old blocking behaviour.
+# DUCKDB_POOL_API_REBIND_LOCK_TIMEOUT_MS=500
+
+# Pre-acquire pool connections at startup so the first request doesn't pay
+# ~150 ms per fresh-build conn. Default OFF (cold-start is faster); flip on
+# for low-latency-first-request workloads (parallelised /api/origin/aggregates).
+# DUCKDB_POOL_WARM_AT_BOOT=1
+# DUCKDB_POOL_WARM_AT_BOOT_COUNT=4
+
+# Drop leftover TEMP tables on connection release. Default OFF (cheap and
+# bounded; flip on if a long-running deployment is leaking TEMP entries).
+# DUCKDB_POOL_SWEEP=1
+
+# ── Local parquet compaction ───────────────────────────────────────────────────
+# Stop merging a partition once its total parquet size exceeds this (MB).
+# Default 256. Prevents a runaway single-file compaction from collapsing
+# scan parallelism (DuckDB parallelises across files). Don't lower below ~64.
+# LOCAL_COMPACT_MAX_PARTITION_MB=256
+
+# Hourly partitions older than this (days) become eligible for cross-hour
+# DAILY compaction. Recent hours stay hourly so dashboard time-range pruning
+# stays tight. Default 7 — validated empirically; do NOT lower (regressions
+# proven on scan-bound queries).
+# LOCAL_COMPACT_DAILY_TIER_DAYS=7
+
+# Daily files older than this (days) become eligible for WEEKLY compaction.
+# Only effective when log_retention_days > this. Default 30.
+# LOCAL_COMPACT_WEEKLY_TIER_DAYS=30
+
+# ── Admin query monitor ────────────────────────────────────────────────────────
+# Toggle the /api/admin/query-monitor surface. Default ON. Set to '0', 'false',
+# 'no', 'off', or '' to disable (admin endpoint then 404s; frontend treats it
+# as "missing" rather than "broken").
+# QUERY_MONITOR_ENABLED=true
+
+# ── CORS allowlist (production) ────────────────────────────────────────────────
+# Comma-separated list of allowed browser origins for credentialed XHR. Local
+# dev leaves this UNSET and FastAPI falls back to a localhost-only dev allowlist
+# (localhost:3000/3001/13002). In production set this to the public origin of
+# the analyst SPA (e.g. CORS_ORIGINS=https://logs.example.com). The analyst UI
+# is same-origin behind Caddy, so this is defense-in-depth: it keeps a hostile
+# localhost-bound sidecar from using the dev allowlist (allow_credentials=True).
+# CORS_ORIGINS=https://logs.example.com
+
 # ── Docker only ────────────────────────────────────────────────────────────────
 # Set automatically by docker-compose; not needed for local dev.
 # API_PROXY_URL=http://backend:8000
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -6,6 +6,13 @@ on:
   pull_request:
     branches: [main]
 
+# Every ref other than `main` keeps a single in-flight run: a newer push
+# (force-push, rapid PR updates) cancels the superseded one. On `main` the
+# group key is the commit SHA, so no post-merge run is ever cancelled.
+concurrency:
+  cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
+  group: ${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.sha || github.ref }}
+
 env:
   FORCE_JAVASCRIPT_ACTIONS_TO_NODE24: true
 
@@ -32,13 +39,37 @@ jobs:
       - name: Format check (ruff)
         run: uv run ruff format --check .
 
-      - name: Type check (mypy)
-        run: uv run mypy backend/
+      - name: Type check (mypy, filtered through mypy-baseline)
+        # Pre-existing errors are accepted via mypy-baseline.txt; the filter
+        # exits non-zero only on NET-NEW errors. NOTE(review): needs pipefail
+        # off (no explicit `shell: bash`), else mypy's own exit fails the step.
+        # Refresh after a burndown PR, then commit mypy-baseline.txt:
+        #   uv run mypy backend/ 2>&1 | uv run mypy-baseline sync
+        run: uv run mypy backend/ 2>&1 | uv run mypy-baseline filter
+
+      - name: Architectural contracts (import-linter, R-9)
+        # Runs the import-linter contracts, which enforce:
+        #   - Routers are independent: no router may import another router,
+        #     directly or transitively. Pre-existing cross-router edges are
+        #     baselined in pyproject.toml [tool.importlinter]; any new
+        #     edge fails the gate.
+        #   - Core does not depend on routers (no inversion of the
+        #     web ↔ analytics layering).
+        run: uv run lint-imports
 
       - name: Install falco
+        # Pinned to match backend/Dockerfile's FALCO_VERSION so CI lints VCL
+        # with the SAME falco as prod. An unpinned `latest` is a moving
+        # supply-chain target AND can accept/reject a recv snippet differently
+        # than production (silent CI-vs-prod lint drift on a security-relevant
+        # validation path). Bump deliberately alongside the Dockerfile ARG.
+        # NOTE(review): version-pinned but not checksum-verified; add a sha256?
         run: |
-          sudo curl -sL https://github.com/ysugimoto/falco/releases/latest/download/falco-linux-amd64 -o /usr/local/bin/falco
+          FALCO_VERSION=2.3.0
+          sudo curl -sSfL "https://github.com/ysugimoto/falco/releases/download/v${FALCO_VERSION}/falco-linux-amd64.tar.gz" \
+            | sudo tar -xz -C /usr/local/bin falco
           sudo chmod +x /usr/local/bin/falco
+          falco --version
 
       - name: Install gitleaks
         # Same curl-binary-to-PATH pattern as falco above. Version pinned so
@@ -59,6 +90,24 @@ jobs:
         # suppression playbook.
         run: gitleaks detect --no-banner --redact --config .gitleaks.toml --exit-code 1
 
+      - name: Install osv-scanner
+        # Same curl-binary-to-PATH pattern as falco and gitleaks above. Pinned
+        # so a scanner upgrade can't fail an unrelated PR; bump deliberately.
+        # NOTE(review): this pins the binary only — OSV advisory data stays live.
+        run: |
+          OSV_VERSION=2.2.4
+          sudo curl -sSfL "https://github.com/google/osv-scanner/releases/download/v${OSV_VERSION}/osv-scanner_linux_amd64" \
+            -o /usr/local/bin/osv-scanner
+          sudo chmod +x /usr/local/bin/osv-scanner
+          osv-scanner --version
+
+      - name: Dependency vulnerability scan (osv-scanner, CRITICAL gate)
+        # scripts/check_osv.py runs osv-scanner once and exits non-zero
+        # only on CRITICAL vulnerabilities. Lower severities are printed as
+        # a warning table but don't block; they are triaged via Dependabot.
+        # The wrapper lives in scripts/ so `make osv` can run it locally too.
+        run: uv run python scripts/check_osv.py
+
       - name: Install terraform
         # Required by tests/utils/test_terraform_gen.py — runs `terraform fmt`
         # against generator output and `validate` when TERRAFORM_VALIDATE=1.
@@ -85,19 +134,46 @@ jobs:
         env:
           FALCO_REQUIRED: "1"
           TERRAFORM_VALIDATE: "1"
-        # Gate ratcheted as milestones land:
-        #   end Milestone A: 44% (baseline 46%, -2pp buffer)
-        #   end Milestone E: 47% (current 49% — keeps the 2pp buffer)
-        #   post-Milestone E coverage backfill: 55% (current 59% — 4pp buffer)
-        #   confidence-batch (insights+admin+services+dashboard+origin+
-        #   hypothesis+regression+E2E smoke): 78% (current 83% — 5pp buffer)
+        # Coverage gate convention: ratchet --cov-fail-under to current actual − 2pp.
+        # The 2pp buffer absorbs CI-vs-local jitter so noise alone can't fail a build;
+        # raise the floor as backend coverage climbs. `make ratchet` prints actual.
         #
         # `-n auto` parallelizes via pytest-xdist (TESTING_PLAN_3 item 21).
         # Verified safe: per-service SQLite (`{id}.metadata.db`) + per-test
         # tmp_path give file isolation; autouse `_reset_module_caches` resets
         # the 8 module-level caches between tests; moto fixtures are per-test.
         # Local run: 2268 passed in 58s under `-n auto` vs ~3min serial.
-        run: uv run pytest -n auto --cov=backend --cov-report=term --cov-fail-under=78
+        run: uv run pytest -n auto --cov=backend --cov-report=term --cov-fail-under=86
+
+      - name: Observability guard (no OTEL_EXPORTER=console in deploy files)
+        # SRE-10 / ADR-08 §5: the console exporter floods prod stdout with
+        # ~1 MB/min of JSON (the 2026-06-10 incident). The code default is
+        # `none`; this guard catches a hardcoded `console` slipping into a
+        # tracked compose/Dockerfile/env file. Also runs locally via `make ci`.
+        run: bash scripts/check_no_console_otel.sh
+
+      - name: Security-regression count gate
+        # v2.0 cleanup Phase 0.8: asserts that the number of tests marked
+        # @pytest.mark.security_regression never drops below the baseline
+        # floor (24 — from the since-removed audit-findings/ verified
+        # fixes), so a refactor cannot silently delete coverage of a
+        # verified fix without surfacing the change.
+        run: bash scripts/check_security_regression_count.sh
+
+      - name: Emit perf samples (CI-scale synthetic load)
+        # Produces tests/perf/latest.json from a 100K-row in-memory
+        # DuckDB dataset (~2 s wall). The perf-gate step that follows
+        # compares it to tests/perf/baseline.json and fails on a regression
+        # of more than regression_pct_threshold % over baseline (default
+        # 50 %, tuned for GH Actions runner variance at CI scale).
+        run: uv run python scripts/emit_perf_latest.py
+
+      - name: Perf gate (load-harness baseline)
+        # Compares the just-emitted latest.json against baseline.json.
+        # The production targets (≤2800 / ≤1900 ms) are recorded in
+        # baseline.json's production_targets_comment for traceability only;
+        # the manual loadtest probe enforces them, not this CI gate.
+        run: bash scripts/perf_gate.sh
 
   frontend:
     name: Frontend (Node)
@@ -130,17 +206,94 @@ jobs:
         run: npm ci
 
       - name: Generate API types
-        # `frontend/types/api.generated.ts` is gitignored — regenerated fresh
-        # on every CI run. The tsc step below is the drift guard: if a backend
-        # model changed in a way that breaks a frontend `components['schemas']`
-        # import, tsc fails here against the just-regenerated types.
+        # `frontend/types/api.generated.ts` is regenerated fresh on every
+        # CI run. The drift-detection step below catches a contributor who
+        # bypassed the pre-commit `regen-openapi` hook, or a backend OpenAPI
+        # change that landed without a matching type regen. The backend-side
+        # guard is `tests/test_openapi_snapshot.py`; the drift step is its
+        # consumer-side mirror.
         run: npm run gen:types
 
+      - name: Detect drift in generated OpenAPI types
+        # Pre-commit runs the same generator, so this only fires when (a) someone
+        # skipped the hook with --no-verify or (b) the openapi-typescript version
+        # drifted between local and CI; fix by regenerating locally and committing.
+        # NOTE(review): `git diff` ignores untracked files; confirm both are tracked.
+        run: |
+          if ! git diff --exit-code types/api.generated.ts openapi.json; then
+            echo "::error::Generated OpenAPI types are out of sync. Run 'npm run gen:types' locally and commit the result." >&2
+            exit 1
+          fi
+
       - name: Type check (tsc)
         run: npx tsc --noEmit
 
+      - name: ESLint count-ceiling gate
+        # ESLint was previously gated nowhere (this job runs gen:types + tsc +
+        # vitest; the backend job runs the Python import-linter). The gate
+        # fails if the source ESLint error count rises above the committed
+        # ceiling, catching new `as any` / rules-of-hooks violations before
+        # runtime. Ratchet the ceiling down as violations are removed. The
+        # script is reached via ../ and resolves the repo root itself.
+        run: bash ../scripts/check_eslint_count.sh
+
       - name: Tests (vitest with coverage)
-        # Gate ratcheted as milestones land:
-        #   end Milestone A: 40% (baseline 42.7%, -2pp buffer)
-        #   end Milestone E: 44% (current 46.55% — keeps the 2pp buffer)
-        run: npx vitest run --coverage --coverage.thresholds.lines=44
+        # Coverage gate convention: each threshold sits at current actual − 2pp; the
+        # 2pp buffer absorbs CI-vs-local jitter. GATE-03 (2026-06-19): floors are
+        # enforced for statements/functions/branches too, not just lines. A newly
+        # uncovered error-path branch adds no lines, so a lines-only gate misses it;
+        # the branch floor catches happy-path-only tests. `make ratchet` prints all four.
+        run: |
+          npx vitest run --coverage \
+            --coverage.thresholds.lines=66 \
+            --coverage.thresholds.statements=65 \
+            --coverage.thresholds.functions=54 \
+            --coverage.thresholds.branches=52
+
+  scorer:
+    name: Scorer (Rust)
+    runs-on: forge-amd64-medium
+    defaults:
+      run:
+        working-directory: compute/scorer
+
+    steps:
+      - uses: actions/checkout@v6
+
+      - name: Install Rust toolchain
+        # compute/scorer/rust-toolchain.toml pins the channel (1.90). rustup is
+        # bootstrapped with that channel only when the runner image ships no
+        # cargo, so `rustc`/`cargo` exist for the cache step below; rustup then
+        # honours the toml override when cargo runs here. PATH is always updated.
+        run: |
+          if ! command -v cargo >/dev/null && [ ! -x "$HOME/.cargo/bin/cargo" ]; then
+            curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs \
+              | sh -s -- -y --default-toolchain 1.90 --profile minimal
+          fi
+          echo "$HOME/.cargo/bin" >> "$GITHUB_PATH"
+
+      - name: Cache cargo registry + build
+        uses: Swatinem/rust-cache@v2
+        with:
+          workspaces: compute/scorer
+
+      - name: Run scorer unit tests
+        # Native (host-target) tests: Python↔Rust normalize/cookie/matrix
+        # parity, session-expiry boundaries, and the scoring math. These are the
+        # 80+ `#[test]`s that no other CI job runs — a Rust-side normalizer,
+        # wire-format, or expiry regression ships green without this. `--locked`
+        # also fails if Cargo.lock drifted. No Fastly CLI needed: the dev
+        # profile builds for the host; Wasm is only built for deploy
+        # (`make scorer-package`).
+        run: cargo test --locked
+
+      - name: Audit dependencies for RustSec advisories
+        # The scorer verifies AES-GCM cookie integrity at the edge, so a future
+        # advisory in a crypto/RNG crate (aes-gcm/ghash/polyval/getrandom/time)
+        # must fail CI rather than ship green — `cargo test --locked` above does
+        # not check advisories. `cargo audit` exits non-zero on any known
+        # vulnerability in the locked tree. cargo-audit is cached by rust-cache
+        # (cache-bin) after first install. NOTE(review): its version is unpinned.
+        run: |
+          command -v cargo-audit >/dev/null || cargo install cargo-audit --locked
+          cargo audit