From 9b3b094f84953ef09b038610b25fe395d4902da7 Mon Sep 17 00:00:00 2001 From: Drew Michael Date: Sat, 20 Jun 2026 22:54:50 -0600 Subject: [PATCH 1/4] release: v2.0.0 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Architecture cleanup + feature release. The largest backend modules were carved into per-concern packages (with re-export shims), telemetry moved to OpenTelemetry + structlog, tenancy got a typed RequestContext boundary that can't be constructed without enforcing service access, and the frontend's hydration/navigation warm-up was replaced with policy. Composite analytics endpoints land as a hard cutover — frontend and backend ship together. Highlights (see CHANGELOG.md for the complete list): - Session scoring: in-UI redeploy + edge-drift warning, fail-open breakdown card, and explicit operator opt-in for edge Layer-2 enforcement (no clock-driven monitoring-to-blocking ramp; deployment age is advisory only). - Observability: every request mints a correlation id that threads through the access log (now with latency) and a persistent slow-query history; richer admin health snapshot and a deeper /api/health probe. - Human-readable PoP and ASN labels across the network, shielding, and origin views, sourced from one shared component seeded by /api/bootstrap. - Backend failures surface inline (no more silent spinners or fabricated zeros) and analytics reads are typed through the generated OpenAPI schema so a rename is a compile error. - Opt-in RUM Web Vitals; a timeout-guarded DuckDB instance-recycle job to bound the object-cache leak; self-healing reclaim of raw files stranded by an interrupted delete. - Consolidation: three SQLite pools collapse into one ThreadLocalPool, per-hour rollup writers share one path, and cron tails funnel through shared helpers. New CI gates: frontend ESLint ceiling, Rust scorer cargo-test, and import contracts. - Dependency freshness sweep across Python, frontend, and the scorer. 
Release prep: refreshed README, AGENTS, and CHANGELOG; corrected the ADR-12 version reference to 2.0.0; made `make dev` a real target; fixed a bare `./run.sh` so it honors the documented default ports (3000/8000) for fresh clones while still guarding explicitly-chosen tunnel ports; and removed the retired localhost.run mode from the share UI. Co-Authored-By: Claude Opus 4.8 --- .env.example | 112 + .github/workflows/ci.yml | 189 +- .github/workflows/cidr-refresh.yml | 53 + .github/workflows/e2e.yml | 103 + .github/workflows/perf-nightly.yml | 69 + .gitignore | 46 +- .gitleaks.toml | 19 +- .pre-commit-config.yaml | 86 +- AGENTS.md | 263 +- CHANGELOG.md | 504 +- CONTRIBUTING.md | 35 + Caddyfile | 55 +- MONKEYPATCHES.md | 37 +- Makefile | 126 +- README.md | 40 +- SECURITY.md | 42 +- backend/Dockerfile | 89 +- backend/_in_process_publisher.py | 112 + backend/config.py | 143 +- backend/core/_duckdb_status.py | 1207 + backend/core/_log_fields_data.py | 1304 + backend/core/data_migrations.py | 179 - backend/core/duckdb.py | 1360 +- backend/core/duckdb_pool.py | 654 +- backend/core/duckdb_recycle.py | 215 + backend/core/fastly/client.py | 145 +- backend/core/fastly/mock_fixtures.py | 196 + backend/core/fastly/service.py | 26 + backend/core/fastly/utils.py | 74 +- backend/core/field_registry.py | 572 + backend/core/iceberg.py | 4232 -- backend/core/iceberg/__init__.py | 173 + backend/core/iceberg/_core.py | 1180 + backend/core/iceberg/buffer.py | 1141 + backend/core/iceberg/fs.py | 530 + .../lake.py => core/iceberg/lake_info.py} | 8 +- backend/core/iceberg/manifest.py | 452 + backend/core/iceberg/sync.py | 570 + backend/core/iceberg/view.py | 1214 + backend/core/ingest.py | 309 +- backend/core/local_compaction.py | 391 +- backend/core/log_fields.py | 1351 +- backend/core/metadata/__init__.py | 332 + backend/core/metadata/alerts.py | 220 + backend/core/metadata/asn_cache.py | 49 + backend/core/metadata/base.py | 415 + backend/core/metadata/cron_log.py | 602 + 
backend/core/metadata/ingest_log.py | 926 + backend/core/metadata/reconciliation.py | 430 + backend/core/metadata/slow_queries.py | 168 + backend/core/metadata/state.py | 157 + backend/core/metadata/usage_log.py | 706 + backend/core/metadata/usage_log_db.py | 228 + backend/core/metadata/views.py | 122 + backend/core/metadata_db.py | 3168 -- backend/core/metric_snapshots.py | 288 + backend/core/query_attribution.py | 245 + backend/core/query_instrumentation.py | 471 + backend/core/query_registry.py | 614 + backend/core/request_context.py | 200 + backend/core/request_telemetry.py | 334 + backend/core/rollups.py | 1036 - backend/core/rollups/__init__.py | 195 + backend/core/rollups/_common.py | 934 + backend/core/rollups/day_bundles.py | 679 + backend/core/rollups/hour_bundles.py | 400 + backend/core/rollups/network_rtt.py | 106 + backend/core/rollups/network_speed.py | 117 + backend/core/rollups/origin_summary.py | 203 + backend/core/rollups/perf_latency.py | 152 + backend/core/rollups/recompute.py | 717 + backend/core/rollups/sessions.py | 144 + backend/core/rollups/slow_urls.py | 147 + backend/core/rollups/time_series.py | 110 + backend/core/rollups/verified_bots_ts.py | 115 + backend/core/rollups/wellknown_bots.py | 305 + backend/core/share_db.py | 1312 - backend/core/share_db/__init__.py | 177 + backend/core/share_db/audit.py | 77 + backend/core/share_db/connection.py | 200 + backend/core/share_db/invites.py | 553 + backend/core/share_db/passcode.py | 158 + backend/core/share_db/schema.py | 164 + backend/core/share_db/sessions.py | 73 + backend/core/share_db/settings.py | 40 + backend/core/share_db/tos.py | 33 + backend/core/share_db/validation.py | 280 + backend/core/sqlite_migrations.py | 187 +- backend/core/sqlite_pool.py | 344 + backend/core/web_vitals_store.py | 122 + backend/cron/__init__.py | 9 + backend/cron/decorators.py | 149 + backend/cron/jobs/__init__.py | 9 + backend/cron/jobs/_common.py | 106 + backend/cron/jobs/commit.py | 211 + 
backend/cron/jobs/compaction.py | 331 + backend/cron/jobs/duckdb_recycle.py | 31 + backend/cron/jobs/expire.py | 89 + backend/cron/jobs/insights_prewarmer.py | 95 + backend/cron/jobs/metadata.py | 721 + backend/cron/jobs/metric_snapshot.py | 174 + backend/cron/jobs/optimize.py | 140 + backend/cron/jobs/sync.py | 996 + backend/cron/schedule.py | 100 + backend/cron/scheduler.py | 1112 + backend/cron_progress.py | 58 +- backend/cron_runs_publisher.py | 34 + backend/deps.py | 76 +- backend/main.py | 601 +- backend/models/admin.py | 225 +- backend/models/admin_queries.py | 39 + backend/models/alerts.py | 4 + backend/models/common.py | 126 +- backend/models/custom_fields.py | 2 +- backend/models/dashboard.py | 149 +- backend/models/errors.py | 59 + backend/models/metrics.py | 9 - backend/models/network.py | 13 + backend/models/origin.py | 15 +- backend/models/performance.py | 13 +- backend/models/provision.py | 142 + backend/models/security.py | 9 + backend/models/services.py | 60 +- backend/models/session_scoring.py | 303 + backend/models/share_admin.py | 70 + backend/models/share_auth.py | 45 +- backend/models/usage.py | 25 +- backend/provision/__init__.py | 2 + backend/provision/cli.py | 306 +- backend/provision/fastly_api.py | 245 +- backend/provision/fos_setup.py | 30 +- backend/provision/orchestrator.py | 55 +- .../provision/session_scoring_orchestrator.py | 587 +- backend/provision/session_scoring_setup.py | 227 +- backend/provision/session_scoring_vcl.py | 149 +- backend/provision/utils.py | 72 +- backend/repositories/_base.py | 1874 +- backend/repositories/_presets_cache.py | 59 + backend/repositories/_sql/__init__.py | 18 + backend/repositories/_sql/alerts.py | 101 + backend/repositories/_sql/dashboard.py | 225 + backend/repositories/_sql/insights.py | 751 + backend/repositories/_sql/network.py | 351 + backend/repositories/_sql/origin.py | 321 + backend/repositories/_sql/query.py | 94 + backend/repositories/_sql/security.py | 326 + 
backend/repositories/_sql/sessions.py | 163 + backend/repositories/_sql/usage.py | 25 + backend/repositories/alerts.py | 88 +- backend/repositories/cron.py | 15 +- backend/repositories/dashboard.py | 743 +- backend/repositories/insights/__init__.py | 4 +- backend/repositories/insights/definitions.py | 531 +- backend/repositories/insights/registry.py | 4 +- backend/repositories/insights/repository.py | 1210 +- backend/repositories/network.py | 516 +- backend/repositories/origin.py | 1093 +- backend/repositories/performance.py | 344 +- backend/repositories/query.py | 218 +- backend/repositories/security.py | 1158 +- backend/repositories/session_scoring.py | 195 + backend/repositories/sessions.py | 610 +- backend/repositories/usage.py | 18 +- backend/repositories/utils/filters.py | 36 +- backend/repositories/utils/response_cache.py | 55 + backend/repositories/views.py | 57 +- backend/routers/_state_sync.py | 34 + backend/routers/admin.py | 1739 - backend/routers/admin/__init__.py | 99 + backend/routers/admin/_dir_size.py | 54 + backend/routers/admin/_helpers.py | 226 + backend/routers/admin/_router.py | 15 + backend/routers/admin/bot_sources.py | 37 + backend/routers/admin/compaction.py | 313 + backend/routers/admin/downloads.py | 293 + backend/routers/admin/health.py | 263 + backend/routers/admin/iceberg.py | 112 + backend/routers/admin/ingest.py | 40 + backend/routers/admin/log_accounting.py | 428 + backend/routers/admin/metric_history.py | 36 + backend/routers/admin/pop_locations.py | 49 + backend/routers/admin/sync_status.py | 304 + backend/routers/admin/system_metrics.py | 84 + backend/routers/admin/trees.py | 32 + backend/routers/admin_queries.py | 263 + backend/routers/admin_usage.py | 349 + backend/routers/alerts.py | 121 +- backend/routers/bootstrap.py | 556 +- backend/routers/dashboard.py | 369 +- backend/routers/debug.py | 3 +- backend/routers/insights.py | 75 +- backend/routers/network.py | 126 +- backend/routers/origin.py | 186 +- 
backend/routers/performance.py | 90 +- backend/routers/provision.py | 305 +- backend/routers/query.py | 159 +- backend/routers/security.py | 99 +- backend/routers/services/audit.py | 14 +- backend/routers/services/core.py | 815 +- backend/routers/services/cron.py | 83 +- backend/routers/session_scoring.py | 2291 +- backend/routers/session_scoring_admin.py | 1600 + backend/routers/sessions.py | 37 +- backend/routers/share_admin.py | 182 +- backend/routers/share_auth.py | 115 +- backend/routers/usage.py | 236 +- backend/routers/ux_events.py | 83 + backend/routers/views.py | 75 +- backend/routers/web_vitals.py | 107 + backend/scheduler.py | 2908 +- backend/scoring/evaluate.py | 24 +- backend/scoring/fixtures.py | 9 +- backend/scoring/labels.py | 72 +- backend/scoring/matrix.py | 127 +- backend/scoring/normalize.py | 86 +- backend/scoring/scorer.py | 70 +- backend/services/service_manager.py | 8 + backend/state_sync.py | 48 +- backend/sync_status_publisher.py | 26 + backend/system_metrics_sampler.py | 163 + backend/utils/active_requests.py | 135 + backend/utils/auth.py | 62 + backend/utils/bot_sources.py | 53 +- backend/utils/bounded_cache.py | 149 +- backend/utils/cache_registry.py | 84 + backend/utils/cdn.py | 52 - backend/utils/date_utils.py | 40 + backend/utils/hll.py | 300 + backend/utils/iceberg_expr.py | 41 + backend/utils/ngwaf.py | 14 +- backend/utils/ngwaf_bot_cache.py | 18 +- backend/utils/path_safety.py | 29 + backend/utils/pop_utils.py | 55 +- backend/utils/rdns_cache.py | 480 +- backend/utils/remote_access.py | 753 +- backend/utils/router_utils.py | 335 +- backend/utils/sql_validator.py | 365 +- backend/utils/sqlite_profiler.py | 67 +- backend/utils/sse_subscription.py | 74 + backend/utils/structlog_config.py | 183 + backend/utils/system_jobs.py | 12 +- backend/utils/telemetry.py | 171 +- backend/utils/telemetry_proxy.py | 194 +- .../utils/telemetry_response_middleware.py | 49 +- backend/utils/terraform_gen.py | 499 +- backend/utils/tunnel.py | 1022 - 
backend/utils/tunnel/__init__.py | 46 + backend/utils/tunnel/fingerprint.py | 31 + backend/utils/tunnel/manager.py | 556 + backend/utils/tunnel/rate_limiter.py | 97 + backend/utils/tunnel/session.py | 66 + backend/utils/tunnel/state.py | 87 + backend/utils/vcl_utils.py | 147 +- backend/utils/vcl_validator.py | 42 +- .../services/config.py => cache/.gitkeep | 0 caddy/Caddyfile.local | 56 + caddy/Dockerfile | 21 + compute/scorer/Cargo.lock | 67 +- compute/scorer/Cargo.toml | 6 +- compute/scorer/fastly.toml | 18 + compute/scorer/fixtures/local-matrix.json | 11 + compute/scorer/pkg/session-scorer.tar.gz | Bin 0 -> 195365 bytes compute/scorer/src/cookie.rs | 13 +- compute/scorer/src/main.rs | 516 +- compute/scorer/src/matrix.rs | 512 +- compute/scorer/src/normalize.rs | 289 +- compute/scorer/src/scorer.rs | 358 +- config.example.json | 18 + configs/.gitkeep | 0 configs/ssh_known_hosts | 30 - data/.gitkeep | 0 docker-compose.prod.yml | 68 + docker-compose.yml | 54 +- docs/ARCHITECTURE.md | 43 +- docs/adr/01-storage-model.md | 44 + docs/adr/02-request-lifecycle.md | 60 + docs/adr/03-tenancy.md | 54 + docs/adr/04-middleware-order.md | 83 + docs/adr/05-frontend-rendering-boundary.md | 69 + docs/adr/06-view-warming.md | 218 + docs/adr/07-feature-budgets.md | 114 + docs/adr/08-observability.md | 120 + docs/adr/09-error-handling.md | 116 + docs/adr/10-schema-evolution.md | 108 + docs/adr/11-secret-rotation.md | 114 + docs/adr/12-api-versioning.md | 129 + docs/adr/13-backup-dr.md | 145 + docs/adr/README.md | 21 + docs/deploy/README.md | 31 + docs/deploy/aws_ec2.md | 195 + docs/deploy/azure_vm.md | 205 + docs/deploy/gce.md | 193 + docs/deploy/generic_linux.md | 244 + docs/features.md | 15 +- docs/session_scoring_runbook.md | 84 +- frontend/Dockerfile | 67 +- frontend/__tests__/app/admin.test.tsx | 60 +- .../app/admin/session-scoring.test.tsx | 133 + frontend/__tests__/app/alerts.test.tsx | 15 +- .../app/alerts/AlertPreview.test.tsx | 222 + frontend/__tests__/app/charts.test.tsx 
| 82 + frontend/__tests__/app/dashboard.test.tsx | 30 +- .../app/dashboard/CardGrid.bot-error.test.tsx | 39 + frontend/__tests__/app/insights.test.tsx | 15 +- frontend/__tests__/app/logs.test.tsx | 118 + .../app/logs/QuickActionsBar.test.tsx | 73 + .../app/network.quality-error.test.tsx | 76 + frontend/__tests__/app/network.test.tsx | 75 + .../app/network/help-content.test.tsx | 147 + frontend/__tests__/app/origin.test.tsx | 68 + .../app/performance.nullcache.test.tsx | 121 + frontend/__tests__/app/performance.test.tsx | 70 + frontend/__tests__/app/query.test.tsx | 116 +- frontend/__tests__/app/security.test.tsx | 109 + frontend/__tests__/app/sessions.test.tsx | 65 + .../app/sessions/SessionDetail.error.test.tsx | 52 + .../app/share-login/acknowledge.test.tsx | 62 +- .../__tests__/app/share-login/page.test.tsx | 4 + .../app/share-login/safeReturnTarget.test.ts | 40 + .../app/usage.storage-error.test.tsx | 76 + frontend/__tests__/app/usage.test.tsx | 77 + .../components/ActiveFiltersBanner.test.tsx | 66 + .../components/AnalyticsCard.test.tsx | 112 +- .../__tests__/components/AppLayout.test.tsx | 46 +- .../components/ChartIntervalButtons.test.tsx | 69 + .../components/ChoroplethMap.test.tsx | 82 + .../__tests__/components/CronLiveLog.test.tsx | 191 + .../components/CustomFieldDrawer.test.tsx | 18 + .../components/CustomFieldsManager.test.tsx | 3 - .../__tests__/components/DataTable.test.tsx | 21 +- .../DataTable/DateTimeCell.test.tsx | 46 + .../__tests__/components/DebugPanel.test.tsx | 179 + .../components/DeltaIndicator.test.tsx | 54 + .../components/ErrorBoundary.test.tsx | 54 + .../__tests__/components/FilterBar.test.tsx | 73 +- .../components/FilterValueCell.test.tsx | 137 + .../__tests__/components/InsightCard.test.tsx | 26 +- .../Insights/sections/performance.test.tsx | 70 + .../Insights/sections/security.test.tsx | 74 + .../Insights/sections/traffic.test.tsx | 71 + .../__tests__/components/LazyMount.test.tsx | 48 + .../components/LogSettingsModal.test.tsx | 
47 +- .../Map/NetworkMap/MapLayer.test.tsx | 371 + .../Map/NetworkMap/OverlayLayer.test.tsx | 119 + .../Map/NetworkMap/controls.test.tsx | 313 + .../components/Map/NetworkMap/index.test.tsx | 285 + .../components/Map/ShieldingMap.test.tsx | 415 + .../__tests__/components/NetworkMap.test.tsx | 54 + .../components/NoServiceSelected.test.tsx | 29 + .../PlotlyChart/ChartA11yTable.test.tsx | 89 + .../PlotlyChart/PlotlyChart.test.tsx | 146 + .../components/ProvisionWizard.test.tsx | 126 +- .../ProvisionWizard/wizard-api.test.ts | 436 + .../wizard-config-helpers.test.ts | 285 + .../ProvisionWizard/wizard-deploy.test.ts | 452 + .../ProvisionWizard/wizard-draft.test.ts | 212 + .../components/ReloadLoopGuard.test.tsx | 80 + .../components/ReportLayout.test.tsx | 123 + .../__tests__/components/ReportShell.test.tsx | 139 + .../components/ServiceSwitcher.test.tsx | 113 + .../SessionScoring/L2EnforcementCard.test.tsx | 157 + .../__tests__/components/Sparkline.test.tsx | 47 + .../__tests__/components/TimeAgo.test.tsx | 44 + .../components/TimeSeriesChart.test.tsx | 111 + .../components/UpdatingBadge.test.tsx | 27 + .../components/dashboard/CardGrid.test.tsx | 166 + .../components/dashboard/GeoMap.test.tsx | 164 + .../components/dashboard/TopTenTable.test.tsx | 275 + .../dashboard/TrafficChart.test.tsx | 152 + .../components/security/BotsSection.test.tsx | 246 + .../security/HeaderAnomaliesSection.test.tsx | 141 + .../security/NetworkSection.test.tsx | 127 + .../sessions/ScoringControls.test.tsx | 155 + .../sessions/SessionDetail.test.tsx | 320 + .../sessions/SessionsTable.test.tsx | 206 + .../components/ui/review-card.test.tsx | 46 + .../components/ui/section-header.test.tsx | 32 + .../components/ui/skeleton-grid.test.tsx | 32 + .../components/ui/stat-card.test.tsx | 35 + frontend/__tests__/helpers/maplibre-mock.ts | 173 + frontend/__tests__/helpers/page-smoke.tsx | 144 + frontend/__tests__/helpers/query.tsx | 29 + .../__tests__/hooks/useActiveService.test.ts | 61 + 
.../hooks/useAnalystHeartbeat.test.ts | 110 + frontend/__tests__/hooks/useBootstrap.test.ts | 69 +- .../__tests__/hooks/useCardVisibility.test.ts | 119 + .../__tests__/hooks/useCronRunsStream.test.ts | 395 + .../hooks/useDashboardBundle.test.ts | 218 + .../__tests__/hooks/useDashboardCards.test.ts | 172 + .../hooks/useDataWindowOverlap.test.ts | 198 + .../__tests__/hooks/useFilterUrlSync.test.ts | 103 + .../__tests__/hooks/useFilteredActive.test.ts | 295 + .../hooks/useHeaderBadgeStream.test.ts | 168 + .../__tests__/hooks/useIsDataReady.test.ts | 56 + .../hooks/useKeyboardShortcuts.test.ts | 118 + frontend/__tests__/hooks/useLastSync.test.ts | 119 + .../hooks/useLogFieldsCatalog.test.ts | 90 + .../__tests__/hooks/useNowSeconds.test.ts | 112 + .../__tests__/hooks/useReportConfig.test.ts | 73 +- frontend/__tests__/hooks/useSSE.test.ts | 49 + .../__tests__/hooks/useServiceStream.test.ts | 211 + .../hooks/useShareStatusBanner.test.tsx | 126 + .../__tests__/hooks/useSyncStatus.test.ts | 101 + .../hooks/useSyncStatusStream.test.ts | 152 + .../hooks/useSystemMetricsStream.test.ts | 157 + .../__tests__/hooks/useUrlFilterSync.test.ts | 59 +- .../__tests__/hooks/useUrlServiceSync.test.ts | 80 +- .../__tests__/lib/api-admin-token.test.ts | 250 + .../__tests__/lib/api-error-paths.test.ts | 14 +- .../__tests__/lib/api/custom-fields.test.ts | 281 + frontend/__tests__/lib/chart-helpers.test.ts | 51 + frontend/__tests__/lib/date.test.ts | 2 +- frontend/__tests__/lib/error-paths.test.tsx | 11 +- frontend/__tests__/lib/format.test.ts | 31 + frontend/__tests__/lib/pop.test.ts | 31 + frontend/__tests__/lib/table-utils.test.tsx | 68 + frontend/__tests__/lib/toast.test.ts | 176 + .../__tests__/lib/urlFilterHydration.test.ts | 119 + frontend/__tests__/lib/utils.test.ts | 60 +- .../lib/workers/buildTrafficData.test.ts | 64 + .../__tests__/lib/workers/parseJson.test.ts | 71 + frontend/__tests__/middleware.test.ts | 75 + frontend/__tests__/msw-coverage.test.ts | 232 + 
.../navigation/dashboard-filter-urls.test.ts | 9 +- frontend/__tests__/preload-manifest.test.ts | 120 - frontend/__tests__/ssr/bootstrap.test.ts | 179 + frontend/__tests__/ssr/logs_usage_log.test.ts | 175 + frontend/__tests__/ssr/tos.test.ts | 198 + frontend/__tests__/stores/filterStore.test.ts | 68 +- frontend/app/_routing.md | 69 + frontend/app/admin/AdminPrefetchLinks.tsx | 105 + .../app/admin/_sections/BotSourcesPanel.tsx | 251 + .../app/admin/_sections/CredentialsDialog.tsx | 168 + .../app/admin/_sections/DiagnosticsPanel.tsx | 78 + .../app/admin/_sections/GlobalSettings.tsx | 297 + frontend/app/admin/_sections/NgwafDialog.tsx | 191 + .../admin/_sections/OperationsOverview.tsx | 263 + .../app/admin/_sections/ServicesTable.tsx | 154 + .../admin/_sections/ServicesTableColumns.tsx | 311 + frontend/app/admin/_sections/SystemStatus.tsx | 97 + frontend/app/admin/page.tsx | 1427 +- frontend/app/admin/queries/_helpers.ts | 70 + .../admin/queries/_hooks/useFilteredActive.ts | 239 + .../queries/_hooks/useKeyboardShortcuts.ts | 67 + .../queries/_hooks/useQueryMonitorUrlSync.ts | 102 + .../admin/queries/_sections/ActiveTable.tsx | 77 + .../queries/_sections/CompletedTable.tsx | 66 + .../admin/queries/_sections/DbFilterChips.tsx | 23 + .../admin/queries/_sections/FilterChipRow.tsx | 33 + .../admin/queries/_sections/FilterChips.tsx | 25 + .../queries/_sections/PollingIndicator.tsx | 30 + .../queries/_sections/RowDetailDialog.tsx | 234 + .../admin/queries/_sections/ShortcutsHelp.tsx | 50 + .../admin/queries/_sections/SummaryStrip.tsx | 63 + .../admin/queries/_sections/queryColumns.tsx | 376 + frontend/app/admin/queries/_types.ts | 97 + frontend/app/admin/queries/loading.tsx | 5 + frontend/app/admin/queries/page.tsx | 664 + .../app/admin/session-scoring/loading.tsx | 11 +- frontend/app/admin/session-scoring/page.tsx | 119 +- frontend/app/admin/share/page.tsx | 127 +- .../trends/_sections/AdminTrendsClient.tsx | 155 + frontend/app/admin/trends/loading.tsx | 5 + 
frontend/app/admin/trends/page.tsx | 35 + .../app/admin/usage-log/_sections/Filters.tsx | 100 + .../admin/usage-log/_sections/UsageChart.tsx | 180 + .../usage-log/_sections/UsageLogClient.tsx | 358 + .../admin/usage-log/_sections/UsageTable.tsx | 96 + .../app/admin/usage-log/_sections/shared.ts | 35 + frontend/app/admin/usage-log/page.tsx | 709 +- frontend/app/alerts/_sections/AlertEditor.tsx | 449 + .../app/alerts/_sections/AlertPreview.tsx | 174 + .../app/alerts/_sections/AlertsClient.tsx | 348 + frontend/app/alerts/_sections/AlertsList.tsx | 300 + frontend/app/alerts/page.tsx | 997 +- frontend/app/charts/loading.tsx | 9 +- frontend/app/charts/page.tsx | 63 +- frontend/app/dashboard/_sections/CardGrid.tsx | 263 + frontend/app/dashboard/_sections/GeoMap.tsx | 88 + .../app/dashboard/_sections/TrafficChart.tsx | 249 + .../app/dashboard/_sections/categories.ts | 77 + .../app/dashboard/_sections/chartHelpers.ts | 207 + frontend/app/dashboard/_sections/types.ts | 30 + frontend/app/dashboard/loading.tsx | 9 +- frontend/app/dashboard/page.tsx | 1185 +- frontend/app/error.tsx | 89 + frontend/app/global-error.tsx | 109 + frontend/app/globals.css | 37 +- frontend/app/insights/loading.tsx | 9 +- frontend/app/insights/page.tsx | 54 +- frontend/app/layout.tsx | 202 +- frontend/app/logs/_sections/AuditColumns.tsx | 361 + frontend/app/logs/_sections/CronColumns.tsx | 311 + .../app/logs/_sections/CronExplanations.ts | 14 + .../app/logs/_sections/CronScheduleBox.tsx | 137 + frontend/app/logs/_sections/CronTab.tsx | 215 + .../logs/_sections/FloatingOperationsDock.tsx | 258 + frontend/app/logs/_sections/IngestionTab.tsx | 43 + frontend/app/logs/_sections/LogsClient.tsx | 276 + .../app/logs/_sections/QuickActionsBar.tsx | 166 + frontend/app/logs/_sections/SSEModal.tsx | 77 + frontend/app/logs/_sections/SchemaTab.tsx | 85 + .../app/logs/_sections/ServiceHistoryTab.tsx | 120 + frontend/app/logs/_state.ts | 514 + frontend/app/logs/page.tsx | 2177 +- frontend/app/network/help-content.tsx 
| 42 +- frontend/app/network/loading.tsx | 9 +- frontend/app/network/page.tsx | 494 +- frontend/app/not-found.tsx | 32 + frontend/app/origin/_sections/Aggregates.tsx | 91 + .../app/origin/_sections/LatencyHeatmap.tsx | 213 + frontend/app/origin/_sections/Timeseries.tsx | 143 + frontend/app/origin/loading.tsx | 9 +- frontend/app/origin/page.tsx | 533 +- frontend/app/performance/help-content.tsx | 1 - frontend/app/performance/loading.tsx | 9 +- frontend/app/performance/page.tsx | 200 +- frontend/app/query/_sections/ModeToggle.tsx | 32 + frontend/app/query/_sections/QueryToolbar.tsx | 175 + frontend/app/query/_sections/RawSqlMode.tsx | 151 + frontend/app/query/_sections/ResultsTable.tsx | 76 + .../app/query/_sections/StructuredMode.tsx | 37 + frontend/app/query/_sql_builder.ts | 122 + frontend/app/query/page.tsx | 601 +- .../app/security/_sections/BotsSection.tsx | 463 + .../security/_sections/ChartEmptyState.tsx | 13 + .../_sections/HeaderAnomaliesSection.tsx | 114 + .../app/security/_sections/NetworkSection.tsx | 135 + .../app/security/_sections/securityInfo.tsx | 169 + frontend/app/security/loading.tsx | 9 +- frontend/app/security/page.tsx | 715 +- .../sessions/_sections/ScoringControls.tsx | 101 + .../app/sessions/_sections/SessionDetail.tsx | 313 + .../app/sessions/_sections/SessionsTable.tsx | 210 + frontend/app/sessions/loading.tsx | 9 +- frontend/app/sessions/page.tsx | 633 +- frontend/app/share-login/ShareLoginForm.tsx | 195 + .../acknowledge/AcknowledgeButton.tsx | 75 + .../acknowledge/AcknowledgeFallback.tsx | 74 + .../app/share-login/acknowledge/loading.tsx | 20 + frontend/app/share-login/acknowledge/page.tsx | 114 +- frontend/app/share-login/loading.tsx | 20 + frontend/app/share-login/page.tsx | 157 +- frontend/app/usage/loading.tsx | 9 +- frontend/app/usage/page.tsx | 187 +- frontend/components/AnalyticsCard.tsx | 128 +- frontend/components/AppLayout.tsx | 740 +- frontend/components/BackToAdminLink.tsx | 36 + frontend/components/ChartIntervalButtons.tsx 
| 8 +- frontend/components/CodeEditor/CodeEditor.tsx | 90 +- frontend/components/CodeEditor/index.ts | 1 - .../CostCalculator/CostCalculator.tsx | 582 +- frontend/components/CostCalculator/Inputs.tsx | 93 + .../components/CostCalculator/Pricing.tsx | 96 + .../components/CostCalculator/Results.tsx | 69 + frontend/components/CostCalculator/calc.ts | 288 + frontend/components/CostCalculator/parts.tsx | 104 + frontend/components/CronLiveLog.tsx | 52 +- .../CronSettingsModal/CronSettingsModal.tsx | 382 +- .../components/CronSettingsModal/Preview.tsx | 30 + .../components/CronSettingsModal/Schedule.tsx | 242 + .../components/CronSettingsModal/Triggers.tsx | 147 + .../components/CronSettingsModal/constants.ts | 43 + .../CustomFields/CustomFieldDrawer.tsx | 110 +- .../CustomFields/CustomFieldsManager.tsx | 14 +- .../components/Dashboard/DashboardHeader.tsx | 47 +- .../Dashboard/FieldSearchDialog.tsx | 6 +- frontend/components/Dashboard/TopTenTable.tsx | 113 +- frontend/components/DashboardLinkCell.tsx | 36 - frontend/components/DataTable/Body.tsx | 64 + .../components/DataTable/ColumnPicker.tsx | 51 + .../DataTable/ColumnVisibilityDropdown.tsx | 6 +- frontend/components/DataTable/DataTable.tsx | 483 +- .../components/DataTable/DataTableBody.tsx | 112 + .../DataTable/DataTablePagination.tsx | 72 + .../DataTable/DataTableReadonly.tsx | 212 + .../components/DataTable/DateTimeCell.tsx | 8 +- frontend/components/DataTable/Header.tsx | 118 + .../components/DataTable/StaticHeader.tsx | 60 + frontend/components/DataTable/Toolbar.tsx | 49 + .../components/DataTable/useDataTableState.ts | 56 + frontend/components/DataWindowBanner.tsx | 79 + frontend/components/DebugPanel.tsx | 68 +- frontend/components/DeltaIndicator.tsx | 10 +- frontend/components/ErrorBoundary.tsx | 29 + .../components/FileBrowser/FileBrowser.tsx | 61 +- .../FilterBar/ActiveFiltersBanner.tsx | 95 + .../components/FilterBar/AddFilterDialog.tsx | 12 +- frontend/components/FilterBar/FilterBar.tsx | 317 +- 
.../components/FilterBar/SaveViewDialog.tsx | 44 +- .../components/FilterBar/ViewSelector.tsx | 127 +- frontend/components/FilterPopover.tsx | 69 - frontend/components/FilterValueCell.tsx | 161 + frontend/components/HydrateAdminToken.tsx | 34 + .../IcebergStatus/IcebergCalendar.tsx | 37 +- .../IcebergStatus/IcebergStatus.tsx | 56 +- .../Insights/CacheCollapseModal.tsx | 289 + .../Insights/ImpossibleDistanceModal.tsx | 45 +- frontend/components/Insights/InsightCard.tsx | 70 +- .../components/Insights/InsightDataModal.tsx | 8 +- .../components/Insights/InsightHelpModal.tsx | 566 - .../Insights/InsightHelpModal/index.tsx | 73 + .../InsightHelpModal/sections/cache.tsx | 91 + .../InsightHelpModal/sections/errors.tsx | 60 + .../sections/optimization.tsx | 55 + .../InsightHelpModal/sections/performance.tsx | 105 + .../InsightHelpModal/sections/security.tsx | 219 + .../InsightHelpModal/sections/traffic.tsx | 86 + .../Insights/InsightHelpModal/types.ts | 15 + .../components/Insights/InsightItemRow.tsx | 32 +- .../InviteAnalystDialog.tsx | 83 +- .../LogSettingsModal/CustomFields.tsx | 16 + .../LogSettingsModal/FieldGroups.tsx | 372 + .../LogSettingsModal/LogSettingsModal.tsx | 522 +- .../components/LogSettingsModal/Preview.tsx | 188 + frontend/components/Map/ChoroplethMap.tsx | 304 +- frontend/components/Map/NetworkMap.tsx | 562 - .../components/Map/NetworkMap/MapLayer.tsx | 369 + .../Map/NetworkMap/OverlayLayer.tsx | 69 + .../components/Map/NetworkMap/controls.tsx | 160 + frontend/components/Map/NetworkMap/index.tsx | 185 + frontend/components/Map/ShieldingMap.tsx | 140 +- frontend/components/Map/baseLayers.ts | 101 + frontend/components/Map/colors.ts | 27 + frontend/components/MetadataStorageCard.tsx | 70 +- frontend/components/NoServiceSelected.tsx | 2 +- .../components/PlotlyChart/ChartA11yTable.tsx | 60 + .../components/PlotlyChart/PlotlyChart.tsx | 112 +- .../components/PlotlyChart/PlotlyPrewarm.tsx | 13 +- .../__tests__/tracesToTable.test.ts | 98 + 
.../components/PlotlyChart/tracesToTable.ts | 168 + frontend/components/PopLabel.tsx | 25 + .../ProvisionWizard/JsonImportSection.tsx | 88 + .../ProvisionWizard/ProvisionWizard.tsx | 3605 +- .../ProvisionWizard/ResumeBanner.tsx | 63 + .../ProvisionWizard/WizardFooter.tsx | 239 + .../ProvisionWizard/WizardHeader.tsx | 62 + .../ProvisionWizard/steps/AnalyzeStep.tsx | 204 + .../ProvisionWizard/steps/ConfirmStep.tsx | 138 + .../ProvisionWizard/steps/ExecuteStep.tsx | 348 + .../ProvisionWizard/steps/FieldsStep.tsx | 120 + .../ProvisionWizard/steps/JoinStep.tsx | 520 + .../ProvisionWizard/steps/ModeStep.tsx | 78 + .../ProvisionWizard/steps/NgwafStep.tsx | 125 + .../ProvisionWizard/steps/ServiceStep.tsx | 97 + .../ProvisionWizard/steps/SettingsStep.tsx | 103 + .../ProvisionWizard/steps/StorageStep.tsx | 422 + .../ProvisionWizard/steps/TerraformStep.tsx | 157 + .../ProvisionWizard/steps/TokenStep.tsx | 74 + frontend/components/ProvisionWizard/types.ts | 320 + .../ProvisionWizard/useWizardState.ts | 652 + .../components/ProvisionWizard/wizard-api.ts | 229 + .../ProvisionWizard/wizard-config-helpers.ts | 134 + .../ProvisionWizard/wizard-deploy.ts | 386 + .../ProvisionWizard/wizard-draft.ts | 122 + .../ProvisionWizard/wizard-effects.ts | 262 + frontend/components/QueryProvider.tsx | 55 +- frontend/components/ReloadLoopGuard.tsx | 145 + frontend/components/ReportLayout.tsx | 52 +- frontend/components/ReportShell.tsx | 54 +- frontend/components/SSEModal/SSEModal.tsx | 70 +- .../components/SSEModal/SSEProgressView.tsx | 57 +- .../ServiceSwitcher/ServiceSwitcher.tsx | 225 +- .../SessionScoring/CardErrorState.tsx | 38 + .../SessionScoring/ComplianceChart.tsx | 28 +- .../SessionScoring/ExcludeRegexCard.tsx | 3 +- .../SessionScoring/FlagSessionPopover.tsx | 43 +- .../L2EnforcementCard/index.tsx | 253 + .../components/SessionScoring/LabelsTab.tsx | 81 +- .../SessionScoring/MatrixVersionsCard.tsx | 9 +- .../SessionScoring/PerReasonAucCard.tsx | 35 +- 
.../SessionScoring/RetrainButton.tsx | 1 + .../components/SessionScoring/RocPrCurves.tsx | 29 +- .../SessionScoring/ScoreDistChart.tsx | 47 +- .../SessionScoring/ScorerErrorsChart.tsx | 63 + .../ScorerFailOpenBreakdownCard.tsx | 155 + .../SessionScoring/ScorerLatencyChart.tsx | 106 + .../SessionScoring/ScoringHealthCard.tsx | 124 +- .../SessionScoring/SessionEventsDialog.tsx | 3 +- .../SessionScoring/StackedHourlyBarChart.tsx | 8 +- .../components/SessionScoring/StatusPanel.tsx | 121 +- .../SessionScoring/ThresholdSlider/Matrix.tsx | 83 + .../ThresholdSlider/Preview.tsx | 54 + .../SessionScoring/ThresholdSlider/Slider.tsx | 196 + .../index.tsx} | 276 +- .../SessionScoring/TopFlaggedTable.tsx | 27 +- .../SessionScoring/help-content.tsx | 91 + .../SessionScoring/useScorerTimeseries.ts | 42 + .../SessionScoring/useScoringQuery.ts | 41 + frontend/components/Sparkline.tsx | 344 + frontend/components/StoreHydrator.tsx | 46 + .../SyncFromCloudModal/SyncFromCloudModal.tsx | 18 +- .../SyncStatusBadge/SyncStatusBadge.tsx | 291 +- frontend/components/SystemHealthCard.tsx | 392 +- .../TeardownDialog/TeardownDialog.tsx | 12 +- frontend/components/TimeAgo.tsx | 42 + .../TimezoneSwitcher/TimezoneSwitcher.tsx | 2 +- frontend/components/UpdatingBadge.tsx | 6 +- .../WebVitalsReporter/WebVitalsReporter.tsx | 80 + .../share-dashboard/AuditLogPanel.tsx | 13 +- .../share-dashboard/CreateInviteDialog.tsx | 9 +- .../share-dashboard/InvitationsPanel.tsx | 29 +- .../share-dashboard/SessionsPanel.tsx | 22 +- .../share-dashboard/SharingControlPanel.tsx | 144 +- .../share-dashboard/useShareMutation.ts | 32 + frontend/components/share-dashboard/utils.ts | 20 +- .../components/skeletons/PageSkeleton.tsx | 46 +- frontend/components/ui/button.tsx | 22 +- frontend/components/ui/dialog.tsx | 6 + frontend/components/ui/dropdown-menu.tsx | 2 +- frontend/components/ui/empty-state.tsx | 14 - frontend/components/ui/label-with-info.tsx | 21 +- frontend/components/ui/metadata-item.tsx | 2 +- 
frontend/components/ui/page-header.tsx | 19 +- frontend/components/ui/select.tsx | 69 +- frontend/components/ui/slider.tsx | 11 +- frontend/components/ui/stat-card.tsx | 14 +- frontend/components/ui/switch.tsx | 10 + frontend/components/ui/tooltip.tsx | 14 +- frontend/e2e/a11y-admin-routes.spec.ts | 96 + frontend/e2e/a11y-routes.spec.ts | 125 + frontend/e2e/admin-login.spec.ts | 68 + frontend/e2e/analyst-share-login.spec.ts | 65 + .../e2e/custom-field-vcl-validation.spec.ts | 82 + frontend/e2e/dashboard-card-drag-drop.spec.ts | 82 + frontend/e2e/dashboard-multi-filter.spec.ts | 29 + frontend/e2e/global-setup.ts | 130 + frontend/e2e/global-teardown.ts | 32 + frontend/e2e/hydration-smoke.spec.ts | 131 + frontend/e2e/keyboard-navigation.spec.ts | 34 + frontend/e2e/maplibre-country-filter.spec.ts | 39 + .../e2e/plotly-chart-interactions.spec.ts | 31 + frontend/e2e/provision-teardown.spec.ts | 156 + frontend/e2e/provision-wizard.spec.ts | 137 + frontend/e2e/visual-regression.spec.ts | 114 + .../geo-map-dark-chromium-darwin.png | Bin 0 -> 15612 bytes .../geo-map-light-chromium-darwin.png | Bin 0 -> 15861 bytes .../traffic-chart-dark-chromium-darwin.png | Bin 0 -> 34605 bytes .../traffic-chart-light-chromium-darwin.png | Bin 0 -> 35104 bytes frontend/eslint.config.mjs | 16 + frontend/hooks/useActiveService.ts | 14 + frontend/hooks/useAnalystHeartbeat.ts | 15 +- frontend/hooks/useAnalystLogout.ts | 46 + frontend/hooks/useBootstrap.ts | 150 +- frontend/hooks/useCardVisibility.ts | 38 +- frontend/hooks/useCopyToClipboard.ts | 32 + frontend/hooks/useCronRunsStream.ts | 203 + frontend/hooks/useDashboardBundle.ts | 109 + frontend/hooks/useDataWindowOverlap.ts | 105 + frontend/hooks/useElapsedTime.ts | 13 +- frontend/hooks/useFieldLabel.ts | 12 +- frontend/hooks/useFieldValues.ts | 6 +- frontend/hooks/useFilterPayload.ts | 15 + frontend/hooks/useFilterUrlSync.ts | 76 + frontend/hooks/useHeaderBadgeStream.ts | 82 + frontend/hooks/useIsAnalyst.ts | 26 + 
frontend/hooks/useIsDataReady.ts | 62 +- frontend/hooks/useLastSync.ts | 61 + frontend/hooks/useLogFieldsCatalog.ts | 33 +- frontend/hooks/useMounted.ts | 20 + frontend/hooks/usePageContext.ts | 42 - frontend/hooks/useReportConfig.ts | 38 +- frontend/hooks/useSSE.ts | 48 +- frontend/hooks/useScoringLabels.ts | 25 +- frontend/hooks/useServiceStream.ts | 163 + frontend/hooks/useShareStatusBanner.tsx | 50 +- frontend/hooks/useShareStream.ts | 40 + frontend/hooks/useSyncStatus.ts | 77 + frontend/hooks/useSyncStatusStream.ts | 41 + frontend/hooks/useSystemMetricsStream.ts | 85 + frontend/hooks/useTimeRange.ts | 21 + frontend/hooks/useTimeseriesToTraces.ts | 4 +- frontend/hooks/useTimezone.ts | 11 + frontend/hooks/useUrlFilterSync.ts | 114 +- frontend/hooks/useUrlServiceSync.ts | 104 +- frontend/knip.config.ts | 77 + frontend/lib/_preload-chunks.json | 14 - frontend/lib/analystFetch.ts | 28 + frontend/lib/api.ts | 224 +- frontend/lib/api/custom-fields.ts | 14 +- frontend/lib/cron-cache-bust.ts | 68 + frontend/lib/date.ts | 28 +- frontend/lib/fetchWithTimeout.ts | 35 + frontend/lib/format.ts | 45 +- frontend/lib/pop.ts | 45 + frontend/lib/preload-manifest.ts | 54 - frontend/lib/sidebar-cookie.ts | 7 + frontend/lib/sse-parser.ts | 18 + frontend/lib/ssr/_transport.ts | 217 + frontend/lib/ssr/admin_trends.ts | 33 + frontend/lib/ssr/alerts.ts | 33 + frontend/lib/ssr/bootstrap.ts | 30 + frontend/lib/ssr/logs.ts | 43 + frontend/lib/ssr/seed.ts | 30 + frontend/lib/ssr/tos.ts | 20 + frontend/lib/ssr/usage_log.ts | 41 + frontend/lib/staleViewRetry.ts | 4 +- frontend/lib/table-columns.tsx | 25 +- frontend/lib/table-utils.tsx | 65 +- frontend/lib/toast.ts | 225 + frontend/lib/urlFilterHydration.ts | 129 + frontend/lib/utils.ts | 15 +- frontend/lib/ux-telemetry.ts | 48 + frontend/lib/workers/buildTrafficData.ts | 66 + frontend/lib/workers/chartDataWorker.ts | 24 + frontend/lib/workers/json-worker.ts | 8 + frontend/lib/workers/parseJson.ts | 26 + frontend/next.config.ts | 19 +- 
frontend/openapi.json | 40826 +++++++++++---- frontend/package-lock.json | 3848 +- frontend/package.json | 37 +- frontend/playwright.config.ts | 80 + frontend/proxy.ts | 153 +- frontend/public/fastly.svg | 2 +- frontend/public/geo/dma.geojson | 2 +- frontend/public/geo/world.topo.json | 1 + frontend/public/globe.svg | 2 +- frontend/scripts/build-preload-manifest.mjs | 139 - frontend/stores/adminTokenStore.ts | 23 + frontend/stores/debugStore.ts | 4 + frontend/stores/filterStore.ts | 94 +- frontend/stores/popGeoStore.ts | 18 + frontend/stores/serviceStore.ts | 8 + frontend/stores/timezoneStore.ts | 21 +- frontend/tests/backend-contract.test.ts | 218 + frontend/tests/msw/handlers.ts | 469 +- frontend/tests/setup-backend.ts | 112 + frontend/tsconfig.json | 18 +- frontend/types/api.generated.ts | 23823 +++++++-- frontend/types/filters.ts | 30 +- frontend/vitest.config.ts | 11 + frontend/vitest.setup.ts | 86 +- mypy-baseline.txt | 0 pyproject.toml | 399 +- run.sh | 69 +- scripts/README.md | 53 + scripts/analyze_web_vitals.py | 333 + scripts/backfill_rollups.py | 19 +- scripts/backup_service_configs.sh | 142 + scripts/baseline_metrics.sh | 102 + scripts/check_eslint_count.sh | 70 + scripts/check_no_console_otel.sh | 34 + scripts/check_osv.py | 112 +- scripts/check_security_regression_count.sh | 45 + scripts/cleanup_orphan_raw_logs.py | 102 + scripts/dev/restore_dev_from_snapshot.sh | 150 + scripts/dev/snapshot_prod_to_dev.sh | 240 + scripts/dev/sync-from-remote.sh | 24 +- scripts/emit_perf_latest.py | 217 + scripts/loadtest_generator.py | 65 +- scripts/perf_gate.sh | 85 + scripts/refresh_fastly_cidrs.py | 170 + scripts/run_contract_backend.py | 74 + scripts/scoring/deploy_wasm.sh | 65 +- scripts/usage_compare.py | 2 +- tests/cassettes/fastly_429_then_success.yaml | 45 + tests/cassettes/fastly_503_then_success.yaml | 5 + .../cassettes/fastly_get_service_success.yaml | 4 + tests/conftest.py | 286 +- tests/contract/openapi_baseline.json | 41009 ++++++++++++++++ 
.../test_api_response_snapshots.ambr | 40 + .../test_repository_sql_snapshots.ambr | 727 + tests/core/test_api_response_snapshots.py | 70 + .../test_buffer_commit_double_checkpoint.py | 249 + tests/core/test_buffer_commit_idempotent.py | 198 + tests/core/test_commit_crash_recovery.py | 324 + tests/core/test_common_models.py | 20 +- tests/core/test_custom_field_cross_service.py | 180 + tests/core/test_custom_field_fuzz.py | 140 + tests/core/test_custom_field_null_handling.py | 230 + tests/core/test_custom_field_roundtrip.py | 244 + tests/core/test_custom_field_type_mismatch.py | 64 + tests/core/test_duckdb_helpers.py | 139 +- tests/core/test_duckdb_pool.py | 1218 +- tests/core/test_duckdb_pool_drain.py | 182 + tests/core/test_duckdb_recycle.py | 104 + tests/core/test_duckdb_recycle_barrier.py | 91 + tests/core/test_duckdb_status_helpers.py | 1370 + tests/core/test_duckdb_type_roundtrip.py | 107 + tests/core/test_fastly_client.py | 16 +- tests/core/test_fastly_client_vcr.py | 26 +- .../core/test_fastly_edge_writes_backfill.py | 18 +- tests/core/test_field_registry.py | 404 + tests/core/test_iceberg.py | 201 +- tests/core/test_iceberg_buffer_branches.py | 341 + tests/core/test_iceberg_fs.py | 248 + tests/core/test_iceberg_helpers.py | 94 +- tests/core/test_iceberg_self_heal.py | 157 + tests/core/test_iceberg_sync_branches.py | 801 + tests/core/test_iceberg_view_branches.py | 298 + tests/core/test_ingest.py | 70 +- tests/core/test_ingest_corrupt_row_repair.py | 145 + tests/core/test_ingest_corruption.py | 2 +- tests/core/test_ingest_crash_recovery.py | 307 + tests/core/test_ingest_discovery.py | 609 +- tests/core/test_ingest_in_flight.py | 3 +- tests/core/test_ingest_partial_failure.py | 512 + tests/core/test_ingest_stateful.py | 194 + tests/core/test_ingest_timing.py | 4 +- tests/core/test_integration_custom_fields.py | 11 +- tests/core/test_lake_info.py | 24 +- tests/core/test_local_compaction.py | 235 +- tests/core/test_local_compaction_branches.py | 209 + 
tests/core/test_log_fields.py | 44 + tests/core/test_log_line_budget.py | 41 + tests/core/test_metadata_db_audit.py | 2 +- tests/core/test_metadata_db_concurrency.py | 75 +- tests/core/test_metadata_db_crud.py | 343 +- tests/core/test_metadata_db_migrations.py | 422 +- tests/core/test_metadata_db_reap.py | 223 +- tests/core/test_metadata_db_schema.py | 68 +- tests/core/test_metadata_state.py | 87 + tests/core/test_metric_snapshots.py | 119 + tests/core/test_multi_process_ingest.py | 205 + tests/core/test_query_instrumentation.py | 230 + tests/core/test_query_registry.py | 584 + .../core/test_reconcile_fastly_stats_gate.py | 8 +- tests/core/test_reconciliation.py | 363 + tests/core/test_repository_sql_snapshots.py | 343 + tests/core/test_request_context.py | 348 + tests/core/test_request_telemetry.py | 227 + tests/core/test_rollups_compaction.py | 250 + tests/core/test_rollups_day_bundles.py | 277 + tests/core/test_rollups_hour_bundling.py | 315 +- tests/core/test_rollups_network_rtt.py | 285 + tests/core/test_rollups_network_speed.py | 240 + tests/core/test_rollups_origin_summary.py | 699 + .../core/test_rollups_origin_summary_daily.py | 326 + tests/core/test_rollups_perf_latency.py | 287 + tests/core/test_rollups_recompute.py | 893 + tests/core/test_rollups_sessions.py | 317 + tests/core/test_rollups_slow_urls.py | 351 + tests/core/test_rollups_time_series.py | 264 + tests/core/test_rollups_verified_bots_ts.py | 274 + tests/core/test_rollups_wellknown_bots.py | 255 + .../test_rollups_wellknown_bots_writer.py | 339 + tests/core/test_scheduler_timing.py | 5 +- tests/core/test_slow_queries_persist.py | 191 + tests/core/test_sqlite_pool.py | 273 + tests/core/test_sqlite_wal_crash.py | 190 + tests/core/test_vcl_semantics.py | 60 +- tests/core/test_view_rebind_race.py | 313 + tests/core/test_web_vitals_store.py | 93 + .../correctness/test_live_rollup_agreement.py | 1252 + tests/cron/test_commit.py | 216 + tests/cron/test_compaction_jobs.py | 221 + 
tests/cron/test_duckdb_recycle_job.py | 31 + tests/cron/test_expire.py | 135 + tests/cron/test_insights_prewarmer.py | 227 + tests/cron/test_metadata.py | 274 + tests/cron/test_metric_snapshot_job.py | 333 + tests/cron/test_optimize.py | 182 + tests/cron/test_scheduler_branches.py | 262 + tests/cron/test_scheduler_recycle.py | 45 + tests/cron/test_sync_job.py | 897 + tests/fixtures/fastly_stubs.vcl | 40 + tests/perf/__init__.py | 0 tests/perf/baseline.json | 31 + tests/perf/test_benchmarks_micro.py | 163 + tests/provision/__init__.py | 0 .../test_session_scoring_orchestrator.py | 806 + tests/remote_access/test_middleware.py | 393 +- .../remote_access/test_share_admin_routes.py | 50 +- tests/remote_access/test_share_auth_routes.py | 176 +- tests/remote_access/test_share_db.py | 189 +- tests/remote_access/test_tunnel.py | 147 +- tests/repositories/_sql/__init__.py | 0 tests/repositories/_sql/test_alerts.py | 75 + tests/repositories/_sql/test_dashboard.py | 283 + tests/repositories/_sql/test_insights.py | 409 + tests/repositories/_sql/test_network.py | 360 + tests/repositories/_sql/test_origin.py | 231 + tests/repositories/_sql/test_query.py | 92 + tests/repositories/_sql/test_security.py | 237 + tests/repositories/_sql/test_sessions.py | 188 + tests/repositories/_sql/test_usage.py | 22 + .../test_aggregates_timeseries_properties.py | 9 +- tests/repositories/test_alerts.py | 72 +- .../repositories/test_all_repos_properties.py | 19 +- tests/repositories/test_base.py | 913 +- tests/repositories/test_base_branches.py | 292 + tests/repositories/test_base_helpers.py | 53 +- tests/repositories/test_cron.py | 4 +- tests/repositories/test_dashboard.py | 332 +- tests/repositories/test_filter_integration.py | 11 +- tests/repositories/test_insights.py | 414 +- .../repositories/test_insights_processors.py | 51 + tests/repositories/test_network.py | 132 + tests/repositories/test_origin.py | 60 +- tests/repositories/test_origin_aggregates.py | 543 + 
tests/repositories/test_performance.py | 171 +- tests/repositories/test_query.py | 193 +- tests/repositories/test_security.py | 1012 + tests/repositories/test_security_branches.py | 247 + .../repositories/test_session_scoring_repo.py | 349 + tests/repositories/test_sessions.py | 732 +- tests/repositories/test_time_series_rollup.py | 205 + .../repositories/test_usage_storage_stats.py | 182 +- tests/repositories/test_views.py | 113 +- tests/routers/services/test_audit_router.py | 2 +- .../services/test_core_get_endpoints.py | 63 +- tests/routers/services/test_cron_router.py | 75 +- .../test_custom_field_count_limits.py | 44 + tests/routers/test_admin_compaction.py | 391 + tests/routers/test_admin_get_endpoints.py | 81 +- tests/routers/test_admin_health_snapshot.py | 249 + tests/routers/test_admin_log_accounting.py | 30 +- tests/routers/test_admin_metric_history.py | 30 + .../routers/test_admin_mutation_endpoints.py | 234 +- tests/routers/test_admin_mutations.py | 4 +- tests/routers/test_admin_queries.py | 260 + .../test_admin_system_metrics_stream.py | 234 + tests/routers/test_alerts_and_views.py | 88 +- tests/routers/test_bootstrap.py | 227 +- .../test_bootstrap_graceful_degradation.py | 156 + tests/routers/test_cron_runs_stream.py | 4 +- tests/routers/test_cross_tenant_scope.py | 218 + tests/routers/test_dashboard_router.py | 485 + tests/routers/test_debug.py | 8 +- tests/routers/test_endpoints.py | 75 +- tests/routers/test_logging_settings.py | 126 + tests/routers/test_network_router.py | 95 + tests/routers/test_origin_router.py | 59 + tests/routers/test_pages.py | 22 +- tests/routers/test_pricing_defaults.py | 2 +- tests/routers/test_provision.py | 83 +- tests/routers/test_provision_branches.py | 320 + tests/routers/test_provision_lifecycle.py | 2 +- tests/routers/test_provision_teardown_auth.py | 15 +- .../test_provision_teardown_idempotency.py | 2 +- tests/routers/test_provision_wizard_e2e.py | 197 +- tests/routers/test_query_router.py | 248 +- 
tests/routers/test_rbac_audit_fixes.py | 310 + tests/routers/test_scoring_exclude_regex.py | 18 +- tests/routers/test_security_insights.py | 176 +- tests/routers/test_service_mutations.py | 186 +- tests/routers/test_session_scoring_router.py | 1262 +- tests/routers/test_usage_endpoints.py | 60 +- tests/routers/test_usage_log.py | 10 +- tests/routers/test_usage_router.py | 30 +- tests/routers/test_ux_events.py | 66 + tests/routers/test_web_vitals.py | 125 + tests/scoring/test_evaluate.py | 43 + tests/scoring/test_matrix.py | 102 +- tests/scoring/test_normalize.py | 96 +- tests/scoring/test_normalize_parity.py | 241 + .../scoring/test_normalize_runtime_parity.py | 264 + tests/scoring/test_scorer.py | 138 +- tests/scoring/test_scoring_vcl_l2_gaps.py | 73 + .../test_session_scoring_orchestrator.py | 292 +- tests/scoring/test_session_scoring_setup.py | 77 +- tests/scoring/test_session_scoring_vcl.py | 133 +- tests/security/__init__.py | 0 tests/security/conftest.py | 155 + tests/security/test_live_rbac_probes.py | 273 + .../test_no_infra_leak_in_tracked_tree.py | 104 + tests/test_analyze_web_vitals.py | 188 + tests/test_changelog_breaking_parity.py | 81 + tests/test_cron_progress.py | 30 +- tests/test_cron_runs_sse.py | 258 + tests/test_deps.py | 47 +- tests/test_dev_mode_no_crons.py | 205 + tests/test_e2e_pipeline.py | 98 + tests/test_e2e_pyiceberg_s3.py | 136 +- tests/test_error_envelope_contract.py | 103 + tests/test_http_exception_envelope_shape.py | 130 + tests/test_main.py | 277 +- tests/test_multi_service_e2e.py | 276 + tests/test_no_trace_leakage_sweep.py | 5 +- tests/test_provision_cli.py | 269 + tests/test_provision_cli_handlers.py | 75 +- tests/test_provision_fastly_failures.py | 215 + tests/test_provision_orchestrator.py | 149 +- tests/test_proxy_headers_regression.py | 8 +- tests/test_response_contract.py | 272 +- tests/test_scheduler.py | 221 +- tests/test_scheduler_apscheduler_stress.py | 28 +- tests/test_schemathesis_smoke.py | 95 + 
tests/test_smoke_end_to_end.py | 89 +- tests/test_sre_observability.py | 280 + tests/test_sync_status_sse.py | 424 + tests/test_trust_topology.py | 171 + .../__snapshots__/test_terraform_gen.ambr | 618 + tests/utils/polling.py | 69 + tests/utils/test_active_requests.py | 343 + tests/utils/test_auth.py | 48 + tests/utils/test_bounded_cache.py | 6 +- tests/utils/test_cache_registry.py | 92 + tests/utils/test_cdn.py | 118 - tests/utils/test_cdn_miss_tracking.py | 40 +- tests/utils/test_check_osv.py | 83 + tests/utils/test_date_utils.py | 13 +- tests/utils/test_fastly_api.py | 136 + tests/utils/test_fastly_api_orchestrators.py | 7 + tests/utils/test_fastly_mock_mode.py | 74 + tests/utils/test_fastly_utils.py | 224 +- tests/utils/test_fos_setup.py | 12 +- tests/utils/test_hll.py | 437 + tests/utils/test_ngwaf_vcr.py | 6 + tests/utils/test_rdns_async.py | 166 + tests/utils/test_rdns_cache.py | 163 +- tests/utils/test_refresh_fastly_cidrs.py | 134 + tests/utils/test_remote_access_branches.py | 1417 + tests/utils/test_router_utils.py | 120 +- tests/utils/test_sql_validator.py | 310 +- tests/utils/test_sqlite_profiler.py | 100 +- tests/utils/test_state_sync.py | 26 +- tests/utils/test_structlog_config.py | 131 + tests/utils/test_system_jobs.py | 5 +- tests/utils/test_telemetry.py | 22 +- tests/utils/test_telemetry_proxy.py | 208 +- tests/utils/test_telemetry_proxy_phase2.py | 14 +- tests/utils/test_telemetry_proxy_phase3a.py | 2 +- tests/utils/test_telemetry_proxy_phase3b.py | 12 +- .../test_telemetry_response_middleware.py | 189 +- tests/utils/test_telemetry_unit.py | 202 + tests/utils/test_terraform_gen.py | 283 +- tests/utils/test_testcontainers_smoke.py | 69 + tests/utils/test_tunnel_state.py | 120 + tests/utils/test_usage_logger.py | 38 +- tests/utils/test_vcl_utils.py | 51 + uv.lock | 1971 +- 1122 files changed, 261882 insertions(+), 61050 deletions(-) create mode 100644 .github/workflows/cidr-refresh.yml create mode 100644 .github/workflows/e2e.yml create mode 100644 
.github/workflows/perf-nightly.yml create mode 100644 backend/_in_process_publisher.py create mode 100644 backend/core/_duckdb_status.py create mode 100644 backend/core/_log_fields_data.py delete mode 100644 backend/core/data_migrations.py create mode 100644 backend/core/duckdb_recycle.py create mode 100644 backend/core/fastly/mock_fixtures.py create mode 100644 backend/core/field_registry.py delete mode 100644 backend/core/iceberg.py create mode 100644 backend/core/iceberg/__init__.py create mode 100644 backend/core/iceberg/_core.py create mode 100644 backend/core/iceberg/buffer.py create mode 100644 backend/core/iceberg/fs.py rename backend/{models/lake.py => core/iceberg/lake_info.py} (97%) create mode 100644 backend/core/iceberg/manifest.py create mode 100644 backend/core/iceberg/sync.py create mode 100644 backend/core/iceberg/view.py create mode 100644 backend/core/metadata/__init__.py create mode 100644 backend/core/metadata/alerts.py create mode 100644 backend/core/metadata/asn_cache.py create mode 100644 backend/core/metadata/base.py create mode 100644 backend/core/metadata/cron_log.py create mode 100644 backend/core/metadata/ingest_log.py create mode 100644 backend/core/metadata/reconciliation.py create mode 100644 backend/core/metadata/slow_queries.py create mode 100644 backend/core/metadata/state.py create mode 100644 backend/core/metadata/usage_log.py create mode 100644 backend/core/metadata/usage_log_db.py create mode 100644 backend/core/metadata/views.py delete mode 100644 backend/core/metadata_db.py create mode 100644 backend/core/metric_snapshots.py create mode 100644 backend/core/query_attribution.py create mode 100644 backend/core/query_instrumentation.py create mode 100644 backend/core/query_registry.py create mode 100644 backend/core/request_context.py create mode 100644 backend/core/request_telemetry.py delete mode 100644 backend/core/rollups.py create mode 100644 backend/core/rollups/__init__.py create mode 100644 
backend/core/rollups/_common.py create mode 100644 backend/core/rollups/day_bundles.py create mode 100644 backend/core/rollups/hour_bundles.py create mode 100644 backend/core/rollups/network_rtt.py create mode 100644 backend/core/rollups/network_speed.py create mode 100644 backend/core/rollups/origin_summary.py create mode 100644 backend/core/rollups/perf_latency.py create mode 100644 backend/core/rollups/recompute.py create mode 100644 backend/core/rollups/sessions.py create mode 100644 backend/core/rollups/slow_urls.py create mode 100644 backend/core/rollups/time_series.py create mode 100644 backend/core/rollups/verified_bots_ts.py create mode 100644 backend/core/rollups/wellknown_bots.py delete mode 100644 backend/core/share_db.py create mode 100644 backend/core/share_db/__init__.py create mode 100644 backend/core/share_db/audit.py create mode 100644 backend/core/share_db/connection.py create mode 100644 backend/core/share_db/invites.py create mode 100644 backend/core/share_db/passcode.py create mode 100644 backend/core/share_db/schema.py create mode 100644 backend/core/share_db/sessions.py create mode 100644 backend/core/share_db/settings.py create mode 100644 backend/core/share_db/tos.py create mode 100644 backend/core/share_db/validation.py create mode 100644 backend/core/sqlite_pool.py create mode 100644 backend/core/web_vitals_store.py create mode 100644 backend/cron/__init__.py create mode 100644 backend/cron/decorators.py create mode 100644 backend/cron/jobs/__init__.py create mode 100644 backend/cron/jobs/_common.py create mode 100644 backend/cron/jobs/commit.py create mode 100644 backend/cron/jobs/compaction.py create mode 100644 backend/cron/jobs/duckdb_recycle.py create mode 100644 backend/cron/jobs/expire.py create mode 100644 backend/cron/jobs/insights_prewarmer.py create mode 100644 backend/cron/jobs/metadata.py create mode 100644 backend/cron/jobs/metric_snapshot.py create mode 100644 backend/cron/jobs/optimize.py create mode 100644 
backend/cron/jobs/sync.py create mode 100644 backend/cron/schedule.py create mode 100644 backend/cron/scheduler.py create mode 100644 backend/cron_runs_publisher.py create mode 100644 backend/models/admin_queries.py create mode 100644 backend/models/errors.py create mode 100644 backend/models/provision.py create mode 100644 backend/models/session_scoring.py create mode 100644 backend/models/share_admin.py create mode 100644 backend/repositories/_presets_cache.py create mode 100644 backend/repositories/_sql/__init__.py create mode 100644 backend/repositories/_sql/alerts.py create mode 100644 backend/repositories/_sql/dashboard.py create mode 100644 backend/repositories/_sql/insights.py create mode 100644 backend/repositories/_sql/network.py create mode 100644 backend/repositories/_sql/origin.py create mode 100644 backend/repositories/_sql/query.py create mode 100644 backend/repositories/_sql/security.py create mode 100644 backend/repositories/_sql/sessions.py create mode 100644 backend/repositories/_sql/usage.py create mode 100644 backend/repositories/session_scoring.py create mode 100644 backend/repositories/utils/response_cache.py create mode 100644 backend/routers/_state_sync.py delete mode 100644 backend/routers/admin.py create mode 100644 backend/routers/admin/__init__.py create mode 100644 backend/routers/admin/_dir_size.py create mode 100644 backend/routers/admin/_helpers.py create mode 100644 backend/routers/admin/_router.py create mode 100644 backend/routers/admin/bot_sources.py create mode 100644 backend/routers/admin/compaction.py create mode 100644 backend/routers/admin/downloads.py create mode 100644 backend/routers/admin/health.py create mode 100644 backend/routers/admin/iceberg.py create mode 100644 backend/routers/admin/ingest.py create mode 100644 backend/routers/admin/log_accounting.py create mode 100644 backend/routers/admin/metric_history.py create mode 100644 backend/routers/admin/pop_locations.py create mode 100644 
backend/routers/admin/sync_status.py create mode 100644 backend/routers/admin/system_metrics.py create mode 100644 backend/routers/admin/trees.py create mode 100644 backend/routers/admin_queries.py create mode 100644 backend/routers/admin_usage.py create mode 100644 backend/routers/session_scoring_admin.py create mode 100644 backend/routers/ux_events.py create mode 100644 backend/routers/web_vitals.py create mode 100644 backend/sync_status_publisher.py create mode 100644 backend/system_metrics_sampler.py create mode 100644 backend/utils/active_requests.py create mode 100644 backend/utils/auth.py create mode 100644 backend/utils/cache_registry.py delete mode 100644 backend/utils/cdn.py create mode 100644 backend/utils/hll.py create mode 100644 backend/utils/iceberg_expr.py create mode 100644 backend/utils/path_safety.py create mode 100644 backend/utils/sse_subscription.py create mode 100644 backend/utils/structlog_config.py delete mode 100644 backend/utils/tunnel.py create mode 100644 backend/utils/tunnel/__init__.py create mode 100644 backend/utils/tunnel/fingerprint.py create mode 100644 backend/utils/tunnel/manager.py create mode 100644 backend/utils/tunnel/rate_limiter.py create mode 100644 backend/utils/tunnel/session.py create mode 100644 backend/utils/tunnel/state.py rename backend/routers/services/config.py => cache/.gitkeep (100%) create mode 100644 caddy/Caddyfile.local create mode 100644 compute/scorer/fixtures/local-matrix.json create mode 100644 compute/scorer/pkg/session-scorer.tar.gz create mode 100644 config.example.json create mode 100644 configs/.gitkeep delete mode 100644 configs/ssh_known_hosts create mode 100644 data/.gitkeep create mode 100644 docs/adr/01-storage-model.md create mode 100644 docs/adr/02-request-lifecycle.md create mode 100644 docs/adr/03-tenancy.md create mode 100644 docs/adr/04-middleware-order.md create mode 100644 docs/adr/05-frontend-rendering-boundary.md create mode 100644 docs/adr/06-view-warming.md create mode 100644 
docs/adr/07-feature-budgets.md create mode 100644 docs/adr/08-observability.md create mode 100644 docs/adr/09-error-handling.md create mode 100644 docs/adr/10-schema-evolution.md create mode 100644 docs/adr/11-secret-rotation.md create mode 100644 docs/adr/12-api-versioning.md create mode 100644 docs/adr/13-backup-dr.md create mode 100644 docs/adr/README.md create mode 100644 docs/deploy/README.md create mode 100644 docs/deploy/aws_ec2.md create mode 100644 docs/deploy/azure_vm.md create mode 100644 docs/deploy/gce.md create mode 100644 docs/deploy/generic_linux.md create mode 100644 frontend/__tests__/app/admin/session-scoring.test.tsx create mode 100644 frontend/__tests__/app/alerts/AlertPreview.test.tsx create mode 100644 frontend/__tests__/app/charts.test.tsx create mode 100644 frontend/__tests__/app/dashboard/CardGrid.bot-error.test.tsx create mode 100644 frontend/__tests__/app/logs.test.tsx create mode 100644 frontend/__tests__/app/logs/QuickActionsBar.test.tsx create mode 100644 frontend/__tests__/app/network.quality-error.test.tsx create mode 100644 frontend/__tests__/app/network.test.tsx create mode 100644 frontend/__tests__/app/network/help-content.test.tsx create mode 100644 frontend/__tests__/app/origin.test.tsx create mode 100644 frontend/__tests__/app/performance.nullcache.test.tsx create mode 100644 frontend/__tests__/app/performance.test.tsx create mode 100644 frontend/__tests__/app/security.test.tsx create mode 100644 frontend/__tests__/app/sessions.test.tsx create mode 100644 frontend/__tests__/app/sessions/SessionDetail.error.test.tsx create mode 100644 frontend/__tests__/app/share-login/safeReturnTarget.test.ts create mode 100644 frontend/__tests__/app/usage.storage-error.test.tsx create mode 100644 frontend/__tests__/app/usage.test.tsx create mode 100644 frontend/__tests__/components/ActiveFiltersBanner.test.tsx create mode 100644 frontend/__tests__/components/ChartIntervalButtons.test.tsx create mode 100644 
frontend/__tests__/components/ChoroplethMap.test.tsx create mode 100644 frontend/__tests__/components/CronLiveLog.test.tsx create mode 100644 frontend/__tests__/components/DataTable/DateTimeCell.test.tsx create mode 100644 frontend/__tests__/components/DebugPanel.test.tsx create mode 100644 frontend/__tests__/components/DeltaIndicator.test.tsx create mode 100644 frontend/__tests__/components/ErrorBoundary.test.tsx create mode 100644 frontend/__tests__/components/FilterValueCell.test.tsx create mode 100644 frontend/__tests__/components/Insights/sections/performance.test.tsx create mode 100644 frontend/__tests__/components/Insights/sections/security.test.tsx create mode 100644 frontend/__tests__/components/Insights/sections/traffic.test.tsx create mode 100644 frontend/__tests__/components/LazyMount.test.tsx create mode 100644 frontend/__tests__/components/Map/NetworkMap/MapLayer.test.tsx create mode 100644 frontend/__tests__/components/Map/NetworkMap/OverlayLayer.test.tsx create mode 100644 frontend/__tests__/components/Map/NetworkMap/controls.test.tsx create mode 100644 frontend/__tests__/components/Map/NetworkMap/index.test.tsx create mode 100644 frontend/__tests__/components/Map/ShieldingMap.test.tsx create mode 100644 frontend/__tests__/components/NetworkMap.test.tsx create mode 100644 frontend/__tests__/components/NoServiceSelected.test.tsx create mode 100644 frontend/__tests__/components/PlotlyChart/ChartA11yTable.test.tsx create mode 100644 frontend/__tests__/components/PlotlyChart/PlotlyChart.test.tsx create mode 100644 frontend/__tests__/components/ProvisionWizard/wizard-api.test.ts create mode 100644 frontend/__tests__/components/ProvisionWizard/wizard-config-helpers.test.ts create mode 100644 frontend/__tests__/components/ProvisionWizard/wizard-deploy.test.ts create mode 100644 frontend/__tests__/components/ProvisionWizard/wizard-draft.test.ts create mode 100644 frontend/__tests__/components/ReloadLoopGuard.test.tsx create mode 100644 
frontend/__tests__/components/ReportLayout.test.tsx create mode 100644 frontend/__tests__/components/ReportShell.test.tsx create mode 100644 frontend/__tests__/components/ServiceSwitcher.test.tsx create mode 100644 frontend/__tests__/components/SessionScoring/L2EnforcementCard.test.tsx create mode 100644 frontend/__tests__/components/Sparkline.test.tsx create mode 100644 frontend/__tests__/components/TimeAgo.test.tsx create mode 100644 frontend/__tests__/components/TimeSeriesChart.test.tsx create mode 100644 frontend/__tests__/components/UpdatingBadge.test.tsx create mode 100644 frontend/__tests__/components/dashboard/CardGrid.test.tsx create mode 100644 frontend/__tests__/components/dashboard/GeoMap.test.tsx create mode 100644 frontend/__tests__/components/dashboard/TopTenTable.test.tsx create mode 100644 frontend/__tests__/components/dashboard/TrafficChart.test.tsx create mode 100644 frontend/__tests__/components/security/BotsSection.test.tsx create mode 100644 frontend/__tests__/components/security/HeaderAnomaliesSection.test.tsx create mode 100644 frontend/__tests__/components/security/NetworkSection.test.tsx create mode 100644 frontend/__tests__/components/sessions/ScoringControls.test.tsx create mode 100644 frontend/__tests__/components/sessions/SessionDetail.test.tsx create mode 100644 frontend/__tests__/components/sessions/SessionsTable.test.tsx create mode 100644 frontend/__tests__/components/ui/review-card.test.tsx create mode 100644 frontend/__tests__/components/ui/section-header.test.tsx create mode 100644 frontend/__tests__/components/ui/skeleton-grid.test.tsx create mode 100644 frontend/__tests__/components/ui/stat-card.test.tsx create mode 100644 frontend/__tests__/helpers/maplibre-mock.ts create mode 100644 frontend/__tests__/helpers/page-smoke.tsx create mode 100644 frontend/__tests__/helpers/query.tsx create mode 100644 frontend/__tests__/hooks/useActiveService.test.ts create mode 100644 frontend/__tests__/hooks/useAnalystHeartbeat.test.ts create 
mode 100644 frontend/__tests__/hooks/useCardVisibility.test.ts create mode 100644 frontend/__tests__/hooks/useCronRunsStream.test.ts create mode 100644 frontend/__tests__/hooks/useDashboardBundle.test.ts create mode 100644 frontend/__tests__/hooks/useDashboardCards.test.ts create mode 100644 frontend/__tests__/hooks/useDataWindowOverlap.test.ts create mode 100644 frontend/__tests__/hooks/useFilterUrlSync.test.ts create mode 100644 frontend/__tests__/hooks/useFilteredActive.test.ts create mode 100644 frontend/__tests__/hooks/useHeaderBadgeStream.test.ts create mode 100644 frontend/__tests__/hooks/useIsDataReady.test.ts create mode 100644 frontend/__tests__/hooks/useKeyboardShortcuts.test.ts create mode 100644 frontend/__tests__/hooks/useLastSync.test.ts create mode 100644 frontend/__tests__/hooks/useLogFieldsCatalog.test.ts create mode 100644 frontend/__tests__/hooks/useNowSeconds.test.ts create mode 100644 frontend/__tests__/hooks/useServiceStream.test.ts create mode 100644 frontend/__tests__/hooks/useShareStatusBanner.test.tsx create mode 100644 frontend/__tests__/hooks/useSyncStatus.test.ts create mode 100644 frontend/__tests__/hooks/useSyncStatusStream.test.ts create mode 100644 frontend/__tests__/hooks/useSystemMetricsStream.test.ts create mode 100644 frontend/__tests__/lib/api-admin-token.test.ts create mode 100644 frontend/__tests__/lib/api/custom-fields.test.ts create mode 100644 frontend/__tests__/lib/chart-helpers.test.ts create mode 100644 frontend/__tests__/lib/pop.test.ts create mode 100644 frontend/__tests__/lib/table-utils.test.tsx create mode 100644 frontend/__tests__/lib/toast.test.ts create mode 100644 frontend/__tests__/lib/urlFilterHydration.test.ts create mode 100644 frontend/__tests__/lib/workers/buildTrafficData.test.ts create mode 100644 frontend/__tests__/lib/workers/parseJson.test.ts create mode 100644 frontend/__tests__/msw-coverage.test.ts delete mode 100644 frontend/__tests__/preload-manifest.test.ts create mode 100644 
frontend/__tests__/ssr/bootstrap.test.ts create mode 100644 frontend/__tests__/ssr/logs_usage_log.test.ts create mode 100644 frontend/__tests__/ssr/tos.test.ts create mode 100644 frontend/app/_routing.md create mode 100644 frontend/app/admin/AdminPrefetchLinks.tsx create mode 100644 frontend/app/admin/_sections/BotSourcesPanel.tsx create mode 100644 frontend/app/admin/_sections/CredentialsDialog.tsx create mode 100644 frontend/app/admin/_sections/DiagnosticsPanel.tsx create mode 100644 frontend/app/admin/_sections/GlobalSettings.tsx create mode 100644 frontend/app/admin/_sections/NgwafDialog.tsx create mode 100644 frontend/app/admin/_sections/OperationsOverview.tsx create mode 100644 frontend/app/admin/_sections/ServicesTable.tsx create mode 100644 frontend/app/admin/_sections/ServicesTableColumns.tsx create mode 100644 frontend/app/admin/_sections/SystemStatus.tsx create mode 100644 frontend/app/admin/queries/_helpers.ts create mode 100644 frontend/app/admin/queries/_hooks/useFilteredActive.ts create mode 100644 frontend/app/admin/queries/_hooks/useKeyboardShortcuts.ts create mode 100644 frontend/app/admin/queries/_hooks/useQueryMonitorUrlSync.ts create mode 100644 frontend/app/admin/queries/_sections/ActiveTable.tsx create mode 100644 frontend/app/admin/queries/_sections/CompletedTable.tsx create mode 100644 frontend/app/admin/queries/_sections/DbFilterChips.tsx create mode 100644 frontend/app/admin/queries/_sections/FilterChipRow.tsx create mode 100644 frontend/app/admin/queries/_sections/FilterChips.tsx create mode 100644 frontend/app/admin/queries/_sections/PollingIndicator.tsx create mode 100644 frontend/app/admin/queries/_sections/RowDetailDialog.tsx create mode 100644 frontend/app/admin/queries/_sections/ShortcutsHelp.tsx create mode 100644 frontend/app/admin/queries/_sections/SummaryStrip.tsx create mode 100644 frontend/app/admin/queries/_sections/queryColumns.tsx create mode 100644 frontend/app/admin/queries/_types.ts create mode 100644 
frontend/app/admin/queries/loading.tsx create mode 100644 frontend/app/admin/queries/page.tsx create mode 100644 frontend/app/admin/trends/_sections/AdminTrendsClient.tsx create mode 100644 frontend/app/admin/trends/loading.tsx create mode 100644 frontend/app/admin/trends/page.tsx create mode 100644 frontend/app/admin/usage-log/_sections/Filters.tsx create mode 100644 frontend/app/admin/usage-log/_sections/UsageChart.tsx create mode 100644 frontend/app/admin/usage-log/_sections/UsageLogClient.tsx create mode 100644 frontend/app/admin/usage-log/_sections/UsageTable.tsx create mode 100644 frontend/app/admin/usage-log/_sections/shared.ts create mode 100644 frontend/app/alerts/_sections/AlertEditor.tsx create mode 100644 frontend/app/alerts/_sections/AlertPreview.tsx create mode 100644 frontend/app/alerts/_sections/AlertsClient.tsx create mode 100644 frontend/app/alerts/_sections/AlertsList.tsx create mode 100644 frontend/app/dashboard/_sections/CardGrid.tsx create mode 100644 frontend/app/dashboard/_sections/GeoMap.tsx create mode 100644 frontend/app/dashboard/_sections/TrafficChart.tsx create mode 100644 frontend/app/dashboard/_sections/categories.ts create mode 100644 frontend/app/dashboard/_sections/chartHelpers.ts create mode 100644 frontend/app/dashboard/_sections/types.ts create mode 100644 frontend/app/error.tsx create mode 100644 frontend/app/global-error.tsx create mode 100644 frontend/app/logs/_sections/AuditColumns.tsx create mode 100644 frontend/app/logs/_sections/CronColumns.tsx create mode 100644 frontend/app/logs/_sections/CronExplanations.ts create mode 100644 frontend/app/logs/_sections/CronScheduleBox.tsx create mode 100644 frontend/app/logs/_sections/CronTab.tsx create mode 100644 frontend/app/logs/_sections/FloatingOperationsDock.tsx create mode 100644 frontend/app/logs/_sections/IngestionTab.tsx create mode 100644 frontend/app/logs/_sections/LogsClient.tsx create mode 100644 frontend/app/logs/_sections/QuickActionsBar.tsx create mode 100644 
frontend/app/logs/_sections/SSEModal.tsx create mode 100644 frontend/app/logs/_sections/SchemaTab.tsx create mode 100644 frontend/app/logs/_sections/ServiceHistoryTab.tsx create mode 100644 frontend/app/logs/_state.ts create mode 100644 frontend/app/not-found.tsx create mode 100644 frontend/app/origin/_sections/Aggregates.tsx create mode 100644 frontend/app/origin/_sections/LatencyHeatmap.tsx create mode 100644 frontend/app/origin/_sections/Timeseries.tsx create mode 100644 frontend/app/query/_sections/ModeToggle.tsx create mode 100644 frontend/app/query/_sections/QueryToolbar.tsx create mode 100644 frontend/app/query/_sections/RawSqlMode.tsx create mode 100644 frontend/app/query/_sections/ResultsTable.tsx create mode 100644 frontend/app/query/_sections/StructuredMode.tsx create mode 100644 frontend/app/query/_sql_builder.ts create mode 100644 frontend/app/security/_sections/BotsSection.tsx create mode 100644 frontend/app/security/_sections/ChartEmptyState.tsx create mode 100644 frontend/app/security/_sections/HeaderAnomaliesSection.tsx create mode 100644 frontend/app/security/_sections/NetworkSection.tsx create mode 100644 frontend/app/security/_sections/securityInfo.tsx create mode 100644 frontend/app/sessions/_sections/ScoringControls.tsx create mode 100644 frontend/app/sessions/_sections/SessionDetail.tsx create mode 100644 frontend/app/sessions/_sections/SessionsTable.tsx create mode 100644 frontend/app/share-login/ShareLoginForm.tsx create mode 100644 frontend/app/share-login/acknowledge/AcknowledgeButton.tsx create mode 100644 frontend/app/share-login/acknowledge/AcknowledgeFallback.tsx create mode 100644 frontend/app/share-login/acknowledge/loading.tsx create mode 100644 frontend/app/share-login/loading.tsx create mode 100644 frontend/components/BackToAdminLink.tsx delete mode 100644 frontend/components/CodeEditor/index.ts create mode 100644 frontend/components/CostCalculator/Inputs.tsx create mode 100644 frontend/components/CostCalculator/Pricing.tsx 
create mode 100644 frontend/components/CostCalculator/Results.tsx create mode 100644 frontend/components/CostCalculator/calc.ts create mode 100644 frontend/components/CostCalculator/parts.tsx create mode 100644 frontend/components/CronSettingsModal/Preview.tsx create mode 100644 frontend/components/CronSettingsModal/Schedule.tsx create mode 100644 frontend/components/CronSettingsModal/Triggers.tsx create mode 100644 frontend/components/CronSettingsModal/constants.ts delete mode 100644 frontend/components/DashboardLinkCell.tsx create mode 100644 frontend/components/DataTable/Body.tsx create mode 100644 frontend/components/DataTable/ColumnPicker.tsx create mode 100644 frontend/components/DataTable/DataTableBody.tsx create mode 100644 frontend/components/DataTable/DataTablePagination.tsx create mode 100644 frontend/components/DataTable/DataTableReadonly.tsx create mode 100644 frontend/components/DataTable/Header.tsx create mode 100644 frontend/components/DataTable/StaticHeader.tsx create mode 100644 frontend/components/DataTable/Toolbar.tsx create mode 100644 frontend/components/DataTable/useDataTableState.ts create mode 100644 frontend/components/DataWindowBanner.tsx create mode 100644 frontend/components/FilterBar/ActiveFiltersBanner.tsx delete mode 100644 frontend/components/FilterPopover.tsx create mode 100644 frontend/components/FilterValueCell.tsx create mode 100644 frontend/components/HydrateAdminToken.tsx create mode 100644 frontend/components/Insights/CacheCollapseModal.tsx delete mode 100644 frontend/components/Insights/InsightHelpModal.tsx create mode 100644 frontend/components/Insights/InsightHelpModal/index.tsx create mode 100644 frontend/components/Insights/InsightHelpModal/sections/cache.tsx create mode 100644 frontend/components/Insights/InsightHelpModal/sections/errors.tsx create mode 100644 frontend/components/Insights/InsightHelpModal/sections/optimization.tsx create mode 100644 frontend/components/Insights/InsightHelpModal/sections/performance.tsx 
create mode 100644 frontend/components/Insights/InsightHelpModal/sections/security.tsx create mode 100644 frontend/components/Insights/InsightHelpModal/sections/traffic.tsx create mode 100644 frontend/components/Insights/InsightHelpModal/types.ts create mode 100644 frontend/components/LogSettingsModal/CustomFields.tsx create mode 100644 frontend/components/LogSettingsModal/FieldGroups.tsx create mode 100644 frontend/components/LogSettingsModal/Preview.tsx delete mode 100644 frontend/components/Map/NetworkMap.tsx create mode 100644 frontend/components/Map/NetworkMap/MapLayer.tsx create mode 100644 frontend/components/Map/NetworkMap/OverlayLayer.tsx create mode 100644 frontend/components/Map/NetworkMap/controls.tsx create mode 100644 frontend/components/Map/NetworkMap/index.tsx create mode 100644 frontend/components/Map/baseLayers.ts create mode 100644 frontend/components/Map/colors.ts create mode 100644 frontend/components/PlotlyChart/ChartA11yTable.tsx create mode 100644 frontend/components/PlotlyChart/__tests__/tracesToTable.test.ts create mode 100644 frontend/components/PlotlyChart/tracesToTable.ts create mode 100644 frontend/components/PopLabel.tsx create mode 100644 frontend/components/ProvisionWizard/JsonImportSection.tsx create mode 100644 frontend/components/ProvisionWizard/ResumeBanner.tsx create mode 100644 frontend/components/ProvisionWizard/WizardFooter.tsx create mode 100644 frontend/components/ProvisionWizard/WizardHeader.tsx create mode 100644 frontend/components/ProvisionWizard/steps/AnalyzeStep.tsx create mode 100644 frontend/components/ProvisionWizard/steps/ConfirmStep.tsx create mode 100644 frontend/components/ProvisionWizard/steps/ExecuteStep.tsx create mode 100644 frontend/components/ProvisionWizard/steps/FieldsStep.tsx create mode 100644 frontend/components/ProvisionWizard/steps/JoinStep.tsx create mode 100644 frontend/components/ProvisionWizard/steps/ModeStep.tsx create mode 100644 frontend/components/ProvisionWizard/steps/NgwafStep.tsx create 
mode 100644 frontend/components/ProvisionWizard/steps/ServiceStep.tsx create mode 100644 frontend/components/ProvisionWizard/steps/SettingsStep.tsx create mode 100644 frontend/components/ProvisionWizard/steps/StorageStep.tsx create mode 100644 frontend/components/ProvisionWizard/steps/TerraformStep.tsx create mode 100644 frontend/components/ProvisionWizard/steps/TokenStep.tsx create mode 100644 frontend/components/ProvisionWizard/types.ts create mode 100644 frontend/components/ProvisionWizard/useWizardState.ts create mode 100644 frontend/components/ProvisionWizard/wizard-api.ts create mode 100644 frontend/components/ProvisionWizard/wizard-config-helpers.ts create mode 100644 frontend/components/ProvisionWizard/wizard-deploy.ts create mode 100644 frontend/components/ProvisionWizard/wizard-draft.ts create mode 100644 frontend/components/ProvisionWizard/wizard-effects.ts create mode 100644 frontend/components/ReloadLoopGuard.tsx create mode 100644 frontend/components/SessionScoring/CardErrorState.tsx create mode 100644 frontend/components/SessionScoring/L2EnforcementCard/index.tsx create mode 100644 frontend/components/SessionScoring/ScorerErrorsChart.tsx create mode 100644 frontend/components/SessionScoring/ScorerFailOpenBreakdownCard.tsx create mode 100644 frontend/components/SessionScoring/ScorerLatencyChart.tsx create mode 100644 frontend/components/SessionScoring/ThresholdSlider/Matrix.tsx create mode 100644 frontend/components/SessionScoring/ThresholdSlider/Preview.tsx create mode 100644 frontend/components/SessionScoring/ThresholdSlider/Slider.tsx rename frontend/components/SessionScoring/{ThresholdSlider.tsx => ThresholdSlider/index.tsx} (59%) create mode 100644 frontend/components/SessionScoring/useScorerTimeseries.ts create mode 100644 frontend/components/SessionScoring/useScoringQuery.ts create mode 100644 frontend/components/Sparkline.tsx create mode 100644 frontend/components/StoreHydrator.tsx create mode 100644 frontend/components/TimeAgo.tsx create mode 
100644 frontend/components/WebVitalsReporter/WebVitalsReporter.tsx create mode 100644 frontend/components/share-dashboard/useShareMutation.ts delete mode 100644 frontend/components/ui/empty-state.tsx create mode 100644 frontend/e2e/a11y-admin-routes.spec.ts create mode 100644 frontend/e2e/a11y-routes.spec.ts create mode 100644 frontend/e2e/admin-login.spec.ts create mode 100644 frontend/e2e/analyst-share-login.spec.ts create mode 100644 frontend/e2e/custom-field-vcl-validation.spec.ts create mode 100644 frontend/e2e/dashboard-card-drag-drop.spec.ts create mode 100644 frontend/e2e/dashboard-multi-filter.spec.ts create mode 100644 frontend/e2e/global-setup.ts create mode 100644 frontend/e2e/global-teardown.ts create mode 100644 frontend/e2e/hydration-smoke.spec.ts create mode 100644 frontend/e2e/keyboard-navigation.spec.ts create mode 100644 frontend/e2e/maplibre-country-filter.spec.ts create mode 100644 frontend/e2e/plotly-chart-interactions.spec.ts create mode 100644 frontend/e2e/provision-teardown.spec.ts create mode 100644 frontend/e2e/provision-wizard.spec.ts create mode 100644 frontend/e2e/visual-regression.spec.ts create mode 100644 frontend/e2e/visual-regression.spec.ts-snapshots/geo-map-dark-chromium-darwin.png create mode 100644 frontend/e2e/visual-regression.spec.ts-snapshots/geo-map-light-chromium-darwin.png create mode 100644 frontend/e2e/visual-regression.spec.ts-snapshots/traffic-chart-dark-chromium-darwin.png create mode 100644 frontend/e2e/visual-regression.spec.ts-snapshots/traffic-chart-light-chromium-darwin.png create mode 100644 frontend/hooks/useActiveService.ts create mode 100644 frontend/hooks/useAnalystLogout.ts create mode 100644 frontend/hooks/useCopyToClipboard.ts create mode 100644 frontend/hooks/useCronRunsStream.ts create mode 100644 frontend/hooks/useDashboardBundle.ts create mode 100644 frontend/hooks/useDataWindowOverlap.ts create mode 100644 frontend/hooks/useFilterUrlSync.ts create mode 100644 frontend/hooks/useHeaderBadgeStream.ts 
create mode 100644 frontend/hooks/useIsAnalyst.ts create mode 100644 frontend/hooks/useLastSync.ts create mode 100644 frontend/hooks/useMounted.ts delete mode 100644 frontend/hooks/usePageContext.ts create mode 100644 frontend/hooks/useServiceStream.ts create mode 100644 frontend/hooks/useShareStream.ts create mode 100644 frontend/hooks/useSyncStatus.ts create mode 100644 frontend/hooks/useSyncStatusStream.ts create mode 100644 frontend/hooks/useSystemMetricsStream.ts create mode 100644 frontend/hooks/useTimeRange.ts create mode 100644 frontend/hooks/useTimezone.ts create mode 100644 frontend/knip.config.ts delete mode 100644 frontend/lib/_preload-chunks.json create mode 100644 frontend/lib/analystFetch.ts create mode 100644 frontend/lib/cron-cache-bust.ts create mode 100644 frontend/lib/fetchWithTimeout.ts create mode 100644 frontend/lib/pop.ts delete mode 100644 frontend/lib/preload-manifest.ts create mode 100644 frontend/lib/sidebar-cookie.ts create mode 100644 frontend/lib/sse-parser.ts create mode 100644 frontend/lib/ssr/_transport.ts create mode 100644 frontend/lib/ssr/admin_trends.ts create mode 100644 frontend/lib/ssr/alerts.ts create mode 100644 frontend/lib/ssr/bootstrap.ts create mode 100644 frontend/lib/ssr/logs.ts create mode 100644 frontend/lib/ssr/seed.ts create mode 100644 frontend/lib/ssr/tos.ts create mode 100644 frontend/lib/ssr/usage_log.ts create mode 100644 frontend/lib/toast.ts create mode 100644 frontend/lib/urlFilterHydration.ts create mode 100644 frontend/lib/ux-telemetry.ts create mode 100644 frontend/lib/workers/buildTrafficData.ts create mode 100644 frontend/lib/workers/chartDataWorker.ts create mode 100644 frontend/lib/workers/json-worker.ts create mode 100644 frontend/lib/workers/parseJson.ts create mode 100644 frontend/playwright.config.ts create mode 100644 frontend/public/geo/world.topo.json delete mode 100644 frontend/scripts/build-preload-manifest.mjs create mode 100644 frontend/stores/adminTokenStore.ts create mode 100644 
frontend/stores/popGeoStore.ts create mode 100644 frontend/tests/backend-contract.test.ts create mode 100644 frontend/tests/setup-backend.ts create mode 100644 mypy-baseline.txt create mode 100644 scripts/README.md create mode 100644 scripts/analyze_web_vitals.py create mode 100755 scripts/backup_service_configs.sh create mode 100755 scripts/baseline_metrics.sh create mode 100755 scripts/check_eslint_count.sh create mode 100755 scripts/check_no_console_otel.sh create mode 100755 scripts/check_security_regression_count.sh create mode 100755 scripts/cleanup_orphan_raw_logs.py create mode 100755 scripts/dev/restore_dev_from_snapshot.sh create mode 100755 scripts/dev/snapshot_prod_to_dev.sh create mode 100644 scripts/emit_perf_latest.py create mode 100755 scripts/perf_gate.sh create mode 100644 scripts/refresh_fastly_cidrs.py create mode 100644 scripts/run_contract_backend.py create mode 100644 tests/cassettes/fastly_429_then_success.yaml create mode 100644 tests/contract/openapi_baseline.json create mode 100644 tests/core/__snapshots__/test_api_response_snapshots.ambr create mode 100644 tests/core/__snapshots__/test_repository_sql_snapshots.ambr create mode 100644 tests/core/test_api_response_snapshots.py create mode 100644 tests/core/test_buffer_commit_double_checkpoint.py create mode 100644 tests/core/test_buffer_commit_idempotent.py create mode 100644 tests/core/test_commit_crash_recovery.py create mode 100644 tests/core/test_custom_field_cross_service.py create mode 100644 tests/core/test_custom_field_fuzz.py create mode 100644 tests/core/test_custom_field_null_handling.py create mode 100644 tests/core/test_custom_field_roundtrip.py create mode 100644 tests/core/test_duckdb_pool_drain.py create mode 100644 tests/core/test_duckdb_recycle.py create mode 100644 tests/core/test_duckdb_recycle_barrier.py create mode 100644 tests/core/test_duckdb_status_helpers.py create mode 100644 tests/core/test_field_registry.py create mode 100644 
tests/core/test_iceberg_buffer_branches.py create mode 100644 tests/core/test_iceberg_fs.py create mode 100644 tests/core/test_iceberg_self_heal.py create mode 100644 tests/core/test_iceberg_sync_branches.py create mode 100644 tests/core/test_iceberg_view_branches.py create mode 100644 tests/core/test_ingest_corrupt_row_repair.py create mode 100644 tests/core/test_ingest_crash_recovery.py create mode 100644 tests/core/test_ingest_partial_failure.py create mode 100644 tests/core/test_ingest_stateful.py create mode 100644 tests/core/test_local_compaction_branches.py create mode 100644 tests/core/test_metadata_state.py create mode 100644 tests/core/test_metric_snapshots.py create mode 100644 tests/core/test_multi_process_ingest.py create mode 100644 tests/core/test_query_instrumentation.py create mode 100644 tests/core/test_query_registry.py create mode 100644 tests/core/test_reconciliation.py create mode 100644 tests/core/test_repository_sql_snapshots.py create mode 100644 tests/core/test_request_context.py create mode 100644 tests/core/test_request_telemetry.py create mode 100644 tests/core/test_rollups_day_bundles.py create mode 100644 tests/core/test_rollups_network_rtt.py create mode 100644 tests/core/test_rollups_network_speed.py create mode 100644 tests/core/test_rollups_origin_summary.py create mode 100644 tests/core/test_rollups_origin_summary_daily.py create mode 100644 tests/core/test_rollups_perf_latency.py create mode 100644 tests/core/test_rollups_recompute.py create mode 100644 tests/core/test_rollups_sessions.py create mode 100644 tests/core/test_rollups_slow_urls.py create mode 100644 tests/core/test_rollups_time_series.py create mode 100644 tests/core/test_rollups_verified_bots_ts.py create mode 100644 tests/core/test_rollups_wellknown_bots.py create mode 100644 tests/core/test_rollups_wellknown_bots_writer.py create mode 100644 tests/core/test_slow_queries_persist.py create mode 100644 tests/core/test_sqlite_pool.py create mode 100644 
tests/core/test_sqlite_wal_crash.py create mode 100644 tests/core/test_view_rebind_race.py create mode 100644 tests/core/test_web_vitals_store.py create mode 100644 tests/correctness/test_live_rollup_agreement.py create mode 100644 tests/cron/test_commit.py create mode 100644 tests/cron/test_compaction_jobs.py create mode 100644 tests/cron/test_duckdb_recycle_job.py create mode 100644 tests/cron/test_expire.py create mode 100644 tests/cron/test_insights_prewarmer.py create mode 100644 tests/cron/test_metadata.py create mode 100644 tests/cron/test_metric_snapshot_job.py create mode 100644 tests/cron/test_optimize.py create mode 100644 tests/cron/test_scheduler_branches.py create mode 100644 tests/cron/test_scheduler_recycle.py create mode 100644 tests/cron/test_sync_job.py create mode 100644 tests/fixtures/fastly_stubs.vcl create mode 100644 tests/perf/__init__.py create mode 100644 tests/perf/baseline.json create mode 100644 tests/perf/test_benchmarks_micro.py create mode 100644 tests/provision/__init__.py create mode 100644 tests/provision/test_session_scoring_orchestrator.py create mode 100644 tests/repositories/_sql/__init__.py create mode 100644 tests/repositories/_sql/test_alerts.py create mode 100644 tests/repositories/_sql/test_dashboard.py create mode 100644 tests/repositories/_sql/test_insights.py create mode 100644 tests/repositories/_sql/test_network.py create mode 100644 tests/repositories/_sql/test_origin.py create mode 100644 tests/repositories/_sql/test_query.py create mode 100644 tests/repositories/_sql/test_security.py create mode 100644 tests/repositories/_sql/test_sessions.py create mode 100644 tests/repositories/_sql/test_usage.py create mode 100644 tests/repositories/test_base_branches.py create mode 100644 tests/repositories/test_origin_aggregates.py create mode 100644 tests/repositories/test_security_branches.py create mode 100644 tests/repositories/test_session_scoring_repo.py create mode 100644 tests/repositories/test_time_series_rollup.py 
create mode 100644 tests/routers/test_admin_compaction.py create mode 100644 tests/routers/test_admin_health_snapshot.py create mode 100644 tests/routers/test_admin_metric_history.py create mode 100644 tests/routers/test_admin_queries.py create mode 100644 tests/routers/test_admin_system_metrics_stream.py create mode 100644 tests/routers/test_bootstrap_graceful_degradation.py create mode 100644 tests/routers/test_dashboard_router.py create mode 100644 tests/routers/test_network_router.py create mode 100644 tests/routers/test_provision_branches.py create mode 100644 tests/routers/test_rbac_audit_fixes.py create mode 100644 tests/routers/test_ux_events.py create mode 100644 tests/routers/test_web_vitals.py create mode 100644 tests/scoring/test_normalize_parity.py create mode 100644 tests/scoring/test_normalize_runtime_parity.py create mode 100644 tests/scoring/test_scoring_vcl_l2_gaps.py create mode 100644 tests/security/__init__.py create mode 100644 tests/security/conftest.py create mode 100644 tests/security/test_live_rbac_probes.py create mode 100644 tests/security/test_no_infra_leak_in_tracked_tree.py create mode 100644 tests/test_analyze_web_vitals.py create mode 100644 tests/test_changelog_breaking_parity.py create mode 100644 tests/test_cron_runs_sse.py create mode 100644 tests/test_dev_mode_no_crons.py create mode 100644 tests/test_error_envelope_contract.py create mode 100644 tests/test_http_exception_envelope_shape.py create mode 100644 tests/test_multi_service_e2e.py create mode 100644 tests/test_provision_cli.py create mode 100644 tests/test_provision_fastly_failures.py create mode 100644 tests/test_schemathesis_smoke.py create mode 100644 tests/test_sre_observability.py create mode 100644 tests/test_sync_status_sse.py create mode 100644 tests/test_trust_topology.py create mode 100644 tests/utils/__snapshots__/test_terraform_gen.ambr create mode 100644 tests/utils/polling.py create mode 100644 tests/utils/test_active_requests.py create mode 100644 
tests/utils/test_auth.py create mode 100644 tests/utils/test_cache_registry.py delete mode 100644 tests/utils/test_cdn.py create mode 100644 tests/utils/test_check_osv.py create mode 100644 tests/utils/test_fastly_mock_mode.py create mode 100644 tests/utils/test_hll.py create mode 100644 tests/utils/test_rdns_async.py create mode 100644 tests/utils/test_refresh_fastly_cidrs.py create mode 100644 tests/utils/test_remote_access_branches.py create mode 100644 tests/utils/test_structlog_config.py create mode 100644 tests/utils/test_telemetry_unit.py create mode 100644 tests/utils/test_testcontainers_smoke.py create mode 100644 tests/utils/test_tunnel_state.py diff --git a/.env.example b/.env.example index 435b605f..80ef185e 100644 --- a/.env.example +++ b/.env.example @@ -47,6 +47,118 @@ # backend runs on a different host than the frontend. # NEXT_PUBLIC_API_URL=http://127.0.0.1:8000 +# ── Observability ────────────────────────────────────────────────────────────── +# OpenTelemetry exporter. Default 'none' — no spans/metrics leave the process. +# Set 'console' to dump spans and 60s metric snapshots to stdout (loud; useful +# locally when chasing a perf regression). Don't set 'console' in prod — it +# pollutes log aggregation with ~1 MB/min of JSON. +# OTEL_EXPORTER=console + +# Log format. Default 'console' (colored TTY-friendly output). Set 'json' in +# prod to emit structured JSON lines that downstream aggregators can parse. +# STRUCTLOG_FORMAT=json + +# ── Security: trusted-proxy + data-dir gates ─────────────────────────────────── +# Comma-separated trusted proxy IPs. MUST be set in production alongside the +# uvicorn flags '--proxy-headers --forwarded-allow-ips=' so the +# remote-access middleware can read request.client.host as the real client IP. +# Without this, leftmost-XFF spoofing becomes exploitable and IP-based gates +# (rate-limit, admin detection, whitelist) silently no-op. Local dev leaves +# this unset and the startup check downgrades to a WARNING. 
+# TRUSTED_PROXY_IPS=127.0.0.1 + +# uvicorn's own env-equivalent of '--forwarded-allow-ips'. Set in production +# whenever TRUSTED_PROXY_IPS is set — defense in depth so a future refactor +# that drops the CLI flag is still detected by the startup check. +# UVICORN_FORWARDED_ALLOW_IPS=127.0.0.1 + +# Make the proxy-headers check FATAL instead of WARNING. Set in production so +# a misconfigured deploy refuses to start rather than running insecure. +# REQUIRE_PROXY_HEADERS=1 + +# Refuse to start if /app/data is not an actual mount point. Set in production +# so a broken fstab can't silently ingest into an ephemeral location that +# vanishes on the next reboot. Leave unset locally (the repo bind-mount isn't +# a real mount point). +# STRICT_DATA_DIR_CHECK=1 + +# Extra hostnames that count as "local" for the remote-access middleware. +# Comma-separated. Default allowlist already includes localhost, 127.0.0.1, +# [::1], 0.0.0.0, testserver, backend, frontend, caddy, web. Add custom +# Docker service names here if Caddy proxies through a different upstream. +# LOCAL_HOSTS=backend,my-custom-service +# Legacy aliases (read for backward compat; prefer LOCAL_HOSTS): +# LOCAL_HOST_ALLOWLIST= +# ALLOWED_HOSTS= + +# ── DuckDB connection pool ───────────────────────────────────────────────────── +# Disable the pool entirely (fresh connection per request). Default ON. +# DUCKDB_CONNECTION_POOL=0 + +# Max concurrent connections per service in the pool. Default 8. Larger values +# let more queries run in parallel but each pool conn carries its own DuckDB +# memory budget — pair with DUCKDB_POOL_CONN_MEMORY_LIMIT to bound total RSS. +# DUCKDB_POOL_MAX_SIZE=8 + +# Per-pool-connection DuckDB memory cap (accepts '256MB', '1GB', '104857600'). +# WITHOUT this, every pool conn inherits the process-wide ~60%-of-RAM default, +# so DUCKDB_POOL_MAX_SIZE concurrent queries can each balloon to multi-GB and +# OOM the container under load. Recommended in production. Leave unset locally. 
+# DUCKDB_POOL_CONN_MEMORY_LIMIT=1GB + +# Per-pool-connection DuckDB thread count. Default min(cpu_count, 8). With the +# default 8-conn pool, that's 64 threads competing for ~8 cores — context +# switching dominates. Set to roughly cpu_count // DUCKDB_POOL_MAX_SIZE to +# trade single-query throughput for better tail latency under sustained load. +# DUCKDB_POOL_CONN_THREADS=2 + +# View-rebind lock timeout (ms) when API pool checkouts contend with cron's +# view-update lock. Default 500ms — short so the pool never serialises behind +# a stuck cron; falls back to the cached/persistent view on miss. Set 0 for +# emergency-rollback to old blocking behaviour. +# DUCKDB_POOL_API_REBIND_LOCK_TIMEOUT_MS=500 + +# Pre-acquire pool connections at startup so the first request doesn't pay +# ~150 ms per fresh-build conn. Default OFF (cold-start is faster); flip on +# for low-latency-first-request workloads (parallelised /api/origin/aggregates). +# DUCKDB_POOL_WARM_AT_BOOT=1 +# DUCKDB_POOL_WARM_AT_BOOT_COUNT=4 + +# Drop leftover TEMP tables on connection release. Default OFF (cheap and +# bounded; flip on if a long-running deployment is leaking TEMP entries). +# DUCKDB_POOL_SWEEP=1 + +# ── Local parquet compaction ─────────────────────────────────────────────────── +# Stop merging a partition once its total parquet size exceeds this (MB). +# Default 256. Prevents a runaway single-file compaction from collapsing +# scan parallelism (DuckDB parallelises across files). Don't lower below ~64. +# LOCAL_COMPACT_MAX_PARTITION_MB=256 + +# Hourly partitions older than this (days) become eligible for cross-hour +# DAILY compaction. Recent hours stay hourly so dashboard time-range pruning +# stays tight. Default 7 — validated empirically; do NOT lower (regressions +# proven on scan-bound queries). +# LOCAL_COMPACT_DAILY_TIER_DAYS=7 + +# Daily files older than this (days) become eligible for WEEKLY compaction. +# Only effective when log_retention_days > this. Default 30. 
+# LOCAL_COMPACT_WEEKLY_TIER_DAYS=30 + +# ── Admin query monitor ──────────────────────────────────────────────────────── +# Toggle the /api/admin/query-monitor surface. Default ON. Set to '0', 'false', +# 'no', 'off', or '' to disable (admin endpoint then 404s; frontend treats it +# as "missing" rather than "broken"). +# QUERY_MONITOR_ENABLED=true + +# ── CORS allowlist (production) ──────────────────────────────────────────────── +# Comma-separated list of allowed browser origins for credentialed XHR. Local +# dev leaves this UNSET and FastAPI falls back to a localhost-only dev allowlist +# (localhost:3000/3001/13002). In production set this to the public endpoint of +# the analyst SPA (e.g. CORS_ORIGINS=https://logs.example.com) — the analyst UI +# is same-origin behind Caddy so this is defense-in-depth, but it closes the +# door on a localhost-bound hostile sidecar holding allow_credentials=True. +# CORS_ORIGINS=https://logs.example.com + # ── Docker only ──────────────────────────────────────────────────────────────── # Set automatically by docker-compose; not needed for local dev. # API_PROXY_URL=http://backend:8000 diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 00c0e2ae..647ace5b 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -6,6 +6,13 @@ on: pull_request: branches: [main] +# Cancel superseded runs on the same ref (e.g. a force-push or rapid PR +# updates). `main` keeps each push separate (group includes SHA) so we +# never lose a post-merge run; PR refs collapse to one in-flight run. +concurrency: + group: ${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.sha || github.ref }} + cancel-in-progress: ${{ github.ref != 'refs/heads/main' }} + env: FORCE_JAVASCRIPT_ACTIONS_TO_NODE24: true @@ -32,13 +39,37 @@ jobs: - name: Format check (ruff) run: uv run ruff format --check . 
- - name: Type check (mypy) - run: uv run mypy backend/ + - name: Type check (mypy, filtered through mypy-baseline) + # Pre-existing errors accepted via mypy-baseline.txt; the filter + # exits non-zero only on NET-NEW errors. Refresh the baseline after + # a burndown PR with + # uv run mypy backend/ 2>&1 | uv run mypy-baseline sync + # and commit mypy-baseline.txt. + run: uv run mypy backend/ 2>&1 | uv run mypy-baseline filter + + - name: Architectural contracts (import-linter, R-9) + # Enforces: + # - Routers are independent (no router imports another router, + # transitively). Pre-existing cross-router edges are + # baselined in pyproject.toml [tool.importlinter]; new + # edges fail the gate. + # - Core does not depend on routers (no inversion of + # web ↔ analytics layering). + run: uv run lint-imports - name: Install falco + # Pinned to match backend/Dockerfile's FALCO_VERSION so CI lints VCL + # with the SAME falco the prod backend uses. An unpinned `latest` is a + # moving supply-chain target AND can accept/reject a recv snippet + # differently than production (silent CI-vs-prod VCL-lint drift on a + # security-relevant validation path). Bump deliberately alongside the + # Dockerfile ARG. run: | - sudo curl -sL https://github.com/ysugimoto/falco/releases/latest/download/falco-linux-amd64 -o /usr/local/bin/falco + FALCO_VERSION=2.3.0 + sudo curl -sSfL "https://github.com/ysugimoto/falco/releases/download/v${FALCO_VERSION}/falco-linux-amd64.tar.gz" \ + | sudo tar -xz -C /usr/local/bin falco sudo chmod +x /usr/local/bin/falco + falco --version - name: Install gitleaks # Same curl-binary-to-PATH pattern as falco above. Version pinned so @@ -59,6 +90,24 @@ jobs: # suppression playbook. run: gitleaks detect --no-banner --redact --config .gitleaks.toml --exit-code 1 + - name: Install osv-scanner + # Same curl-binary-to-PATH pattern as falco and gitleaks above. 
+ # Version pinned so a CVE-database refresh doesn't suddenly fail + # an unrelated PR; bump deliberately when wanted. + run: | + OSV_VERSION=2.2.4 + sudo curl -sSfL "https://github.com/google/osv-scanner/releases/download/v${OSV_VERSION}/osv-scanner_linux_amd64" \ + -o /usr/local/bin/osv-scanner + sudo chmod +x /usr/local/bin/osv-scanner + osv-scanner --version + + - name: Dependency vulnerability scan (osv-scanner, CRITICAL gate) + # scripts/check_osv.py runs osv-scanner once and exits non-zero + # only on CRITICAL vulnerabilities. Lower severities print as a + # warning table but don't block — they get triaged via Dependabot. + # Lives in scripts/ so it's also runnable locally via `make osv`. + run: uv run python scripts/check_osv.py + - name: Install terraform # Required by tests/utils/test_terraform_gen.py — runs `terraform fmt` # against generator output and `validate` when TERRAFORM_VALIDATE=1. @@ -85,19 +134,46 @@ jobs: env: FALCO_REQUIRED: "1" TERRAFORM_VALIDATE: "1" - # Gate ratcheted as milestones land: - # end Milestone A: 44% (baseline 46%, -2pp buffer) - # end Milestone E: 47% (current 49% — keeps the 2pp buffer) - # post-Milestone E coverage backfill: 55% (current 59% — 4pp buffer) - # confidence-batch (insights+admin+services+dashboard+origin+ - # hypothesis+regression+E2E smoke): 78% (current 83% — 5pp buffer) + # Coverage gate convention: ratchet --cov-fail-under to current actual − 2pp. + # The 2pp buffer absorbs CI-vs-local jitter so it can't force-fail a build; + # raise it as backend coverage clears the next floor. `make ratchet` prints actual. # # `-n auto` parallelizes via pytest-xdist (TESTING_PLAN_3 item 21). # Verified safe: per-service SQLite (`{id}.metadata.db`) + per-test # tmp_path give file isolation; autouse `_reset_module_caches` resets # the 8 module-level caches between tests; moto fixtures are per-test. # Local run: 2268 passed in 58s under `-n auto` vs ~3min serial. 
- run: uv run pytest -n auto --cov=backend --cov-report=term --cov-fail-under=78 + run: uv run pytest -n auto --cov=backend --cov-report=term --cov-fail-under=86 + + - name: Observability guard (no OTEL_EXPORTER=console in deploy files) + # SRE-10 / ADR-08 §5: the console exporter floods prod stdout with + # ~1 MB/min of JSON (the 2026-06-10 incident). The default is `none` + # in code; this catches a hardcoded `console` slipping into a tracked + # compose/Dockerfile/env. Also runnable locally via `make ci`. + run: bash scripts/check_no_console_otel.sh + + - name: Security-regression count gate + # v2.0 cleanup Phase 0.8: asserts the + # @pytest.mark.security_regression count never drops below the + # baseline floor (24 — from the since-removed audit-findings/ + # verified fixes). A refactor cannot silently delete coverage of a + # verified fix without surfacing the change. + run: bash scripts/check_security_regression_count.sh + + - name: Emit perf samples (CI-scale synthetic load) + # Produces tests/perf/latest.json from a 100K-row in-memory + # DuckDB dataset (~2 s wall). The gate below compares to + # tests/perf/baseline.json and fails on >regression_pct_threshold% + # over baseline (50 % default; tuned for GH Actions runner + # variance at CI scale). + run: uv run python scripts/emit_perf_latest.py + + - name: Perf gate (load-harness baseline) + # Compares the just-emitted latest.json against baseline.json. + # Production targets (≤2800 / ≤1900 ms) are documented in + # baseline.json's production_targets_comment for traceability + # but enforced by the manual loadtest probe, not this CI gate. + run: bash scripts/perf_gate.sh frontend: name: Frontend (Node) @@ -130,17 +206,94 @@ jobs: run: npm ci - name: Generate API types - # `frontend/types/api.generated.ts` is gitignored — regenerated fresh - # on every CI run. 
The tsc step below is the drift guard: if a backend - # model changed in a way that breaks a frontend `components['schemas']` - # import, tsc fails here against the just-regenerated types. + # `frontend/types/api.generated.ts` is regenerated fresh on every + # CI run. The drift guard below catches the case where a contributor + # bypassed the pre-commit `regen-openapi` hook (or where the backend + # OpenAPI surface changed without a corresponding type regen). The + # backend-side guard is `tests/test_openapi_snapshot.py`; this is + # the consumer-side mirror. run: npm run gen:types + - name: Detect drift in generated OpenAPI types + # Pre-commit runs the same generator, so the only way this fires + # is (a) someone bypassed the hook with --no-verify or (b) the openapi-typescript + # tool version drifted between local and CI. Either way, the right + # response is to regenerate locally and commit. + run: | + if ! git diff --exit-code types/api.generated.ts openapi.json; then + echo "::error::Generated OpenAPI types are out of sync. Run 'npm run gen:types' locally and commit the result." >&2 + exit 1 + fi + - name: Type check (tsc) run: npx tsc --noEmit + - name: ESLint count-ceiling gate + # ESLint was previously gated nowhere (this job runs gen:types + tsc + + # vitest; the backend job runs the Python import-linter). The gate + # fails if the source eslint error count rises above the committed + # ceiling, catching new `as any` / rules-of-hooks before runtime. + # Ratchet the ceiling down as violations are removed. + # The script resolves the repo root itself, so call it with ../. 
+ run: bash ../scripts/check_eslint_count.sh + - name: Tests (vitest with coverage) - # Gate ratcheted as milestones land: - # end Milestone A: 40% (baseline 42.7%, -2pp buffer) - # end Milestone E: 44% (current 46.55% — keeps the 2pp buffer) - run: npx vitest run --coverage --coverage.thresholds.lines=44 + # Coverage gate convention: ratchet each threshold to current actual − 2pp + # (the 2pp buffer absorbs CI-vs-local jitter). GATE-03 (2026-06-19): enforce + # statements/functions/branches floors too, not just lines — else an uncovered + # error-path branch (no new lines) can't drop the gate, and the branch floor is + # what catches a happy-path-only test. `make ratchet` prints all four. + run: >- + npx vitest run --coverage + --coverage.thresholds.lines=66 + --coverage.thresholds.statements=65 + --coverage.thresholds.functions=54 + --coverage.thresholds.branches=52 + + scorer: + name: Scorer (Rust) + runs-on: forge-amd64-medium + defaults: + run: + working-directory: compute/scorer + + steps: + - uses: actions/checkout@v6 + + - name: Install Rust toolchain + # compute/scorer/rust-toolchain.toml pins the channel (1.90). We install + # that toolchain explicitly so `rustc`/`cargo` exist for the cache step + # below; rustup then honours the toml override when cargo runs here. + # Install rustup itself only if the runner image doesn't ship it. + run: | + if ! command -v cargo >/dev/null && [ ! -x "$HOME/.cargo/bin/cargo" ]; then + curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs \ + | sh -s -- -y --default-toolchain 1.90 --profile minimal + fi + echo "$HOME/.cargo/bin" >> "$GITHUB_PATH" + + - name: Cache cargo registry + build + uses: Swatinem/rust-cache@v2 + with: + workspaces: compute/scorer + + - name: Run scorer unit tests + # Native (host-target) tests: Python↔Rust normalize/cookie/matrix + # parity, session-expiry boundaries, and the scoring math. 
These are the + # 80+ `#[test]`s that no other CI job runs — a Rust-side normalizer, + # wire-format, or expiry regression ships green without this. `--locked` + # also fails if Cargo.lock drifted. No Fastly CLI needed: the dev + # profile builds for the host; Wasm is only built for deploy + # (`make scorer-package`). + run: cargo test --locked + + - name: Audit dependencies for RustSec advisories + # The scorer verifies AES-GCM cookie integrity at the edge, so a future + # advisory in a crypto/RNG crate (aes-gcm/ghash/polyval/getrandom/time) + # must fail CI rather than ship green — `cargo test --locked` above does + # not check advisories. `cargo audit` exits non-zero on any known + # vulnerability in the locked tree. cargo-audit is cached by rust-cache + # (cache-bin) after the first install. + run: | + command -v cargo-audit >/dev/null || cargo install cargo-audit --locked + cargo audit diff --git a/.github/workflows/cidr-refresh.yml b/.github/workflows/cidr-refresh.yml new file mode 100644 index 00000000..47909585 --- /dev/null +++ b/.github/workflows/cidr-refresh.yml @@ -0,0 +1,53 @@ +name: Refresh Fastly CIDRs + +# Weekly refresh of the Fastly edge CIDR list in the repo-root Caddyfile. +# The @from_fastly_v4 matcher gates X-Forwarded-For rewriting on Fastly's +# published v4 ranges; a stale list silently classifies traffic from new +# POPs as direct (untrusted) until somebody refreshes it and reloads +# Caddy. The script is well-tested (scripts/refresh_fastly_cidrs.py); +# this workflow just runs it on a cadence and opens a PR if the file +# changed. Off-minute schedule on purpose so the runner pool isn't +# hammered at :00 alongside everybody else's hourly jobs. 
+ +on: + schedule: + - cron: '13 9 * * 1' # Mondays at 09:13 UTC + workflow_dispatch: {} + +permissions: + contents: write + pull-requests: write + +jobs: + refresh: + name: Fetch + open PR on diff + runs-on: forge-amd64-medium + steps: + - uses: actions/checkout@v6 + + - name: Install uv + uses: astral-sh/setup-uv@v7 + with: + enable-cache: true + python-version: "3.13" + + - name: Refresh Caddyfile + # No-op if the published list already matches what's in the + # Caddyfile (script prints "No changes …" and exits 0). Writes + # the updated matcher block otherwise; peter-evans/create-pull- + # request below only opens a PR when the working tree is dirty. + run: uv run python scripts/refresh_fastly_cidrs.py + + - name: Open PR if Caddyfile changed + uses: peter-evans/create-pull-request@v7 + with: + commit-message: 'chore: refresh Fastly edge CIDR list in Caddyfile' + branch: chore/refresh-fastly-cidrs + delete-branch: true + title: 'chore: refresh Fastly edge CIDR list' + body: | + Automated update from `scripts/refresh_fastly_cidrs.py`, triggered by the weekly `cidr-refresh.yml` workflow. + + The `@from_fastly_v4` matcher in [Caddyfile](../blob/main/Caddyfile) gates the `X-Forwarded-For` rewrite on Fastly-published edge ranges. A stale list silently classifies traffic from new POPs as direct (untrusted) until Caddy reloads. + + After merge: run `~/restart.sh caddy` (or equivalent) on the VM to pick up the new ranges. diff --git a/.github/workflows/e2e.yml b/.github/workflows/e2e.yml new file mode 100644 index 00000000..2ca11f89 --- /dev/null +++ b/.github/workflows/e2e.yml @@ -0,0 +1,103 @@ +name: E2E (Playwright) + +# R-3d (testing_suite_audit_2026-06-14.md). Runs the Playwright +# journeys from frontend/e2e/ against a Chromium build with the +# FastAPI backend booted under FASTLY_MOCK_MODE=1 (see +# scripts/run_contract_backend.py + frontend/e2e/global-setup.ts). +# +# Blocking on every PR. 
The original audit recommended a 2-week +# soft-launch with continue-on-error: true; we skipped that grace +# window deliberately — the trade-off is that the first few weeks +# may surface flakes as blocking failures (use the uploaded trace +# artifact to debug), but the alternative was forgetting to flip +# the gate. To revisit: add `continue-on-error: true` to the job +# below if a specific test needs a green window to stabilise. +# +# Branch protection on `main` now requires status checks to pass before +# merge (GATE-04, 2026-06-19): `Backend (Python)`, `Frontend (Node)`, and +# `Scorer (Rust)` are required. The three `Playwright (chromium|firefox|webkit)` +# contexts are NOT yet required because this workflow lives only on a feature +# branch — GitHub can't require a context it has never seen reported on `main` +# (it would deadlock every PR as "Expected — waiting for status"). Once this +# file merges to `main` and the matrix reports once, append the three Playwright +# contexts to required_status_checks: +# gh api -X PUT repos/{owner}/{repo}/branches/main/protection/required_status_checks \ +# -f strict=false -f 'contexts[]=Backend (Python)' -f 'contexts[]=Frontend (Node)' \ +# -f 'contexts[]=Scorer (Rust)' -f 'contexts[]=Playwright (chromium)' \ +# -f 'contexts[]=Playwright (firefox)' -f 'contexts[]=Playwright (webkit)' + +on: + push: + branches: [main] + pull_request: + branches: [main] + +# Cancel superseded runs on the same ref (see ci.yml for the same pattern +# and rationale). main keeps each push separate; PR refs collapse to one +# in-flight run so a rapid push doesn't stack three browser matrices. 
+concurrency: + group: ${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.sha || github.ref }} + cancel-in-progress: ${{ github.ref != 'refs/heads/main' }} + +env: + FORCE_JAVASCRIPT_ACTIONS_TO_NODE24: true + +jobs: + playwright: + name: Playwright (${{ matrix.browser }}) + runs-on: forge-amd64-medium + strategy: + fail-fast: false + matrix: + browser: [chromium, firefox, webkit] + + steps: + - uses: actions/checkout@v6 + + - name: Install uv + uses: astral-sh/setup-uv@v7 + with: + enable-cache: true + python-version: "3.13" + + - name: Install backend deps + run: uv sync + + - name: Setup Node + # Aligned with ci.yml (Node 24 + setup-node@v6) — package.json + # declares `engines.node: ">=24"`, so Node 22 here would silently + # run Playwright on an unsupported version. FORCE_JAVASCRIPT_ACTIONS_TO_NODE24 + # only affects GitHub's bundled actions infrastructure, not the + # Node runtime used by the test step. + uses: actions/setup-node@v6 + with: + node-version: 24 + cache: "npm" + cache-dependency-path: frontend/package-lock.json + + - name: Install frontend deps + run: cd frontend && npm ci + + - name: Install Playwright browser (${{ matrix.browser }}) + system deps + run: cd frontend && npx playwright install --with-deps ${{ matrix.browser }} + + - name: Run Playwright (${{ matrix.browser }}) + env: + CI: "1" + run: cd frontend && npx playwright test --project=${{ matrix.browser }} + + - name: Upload Playwright report + if: failure() || cancelled() + uses: actions/upload-artifact@v5 + with: + name: playwright-report-${{ matrix.browser }} + path: frontend/playwright-report + retention-days: 7 + + - name: Upload Playwright traces + if: failure() || cancelled() + uses: actions/upload-artifact@v5 + with: + name: playwright-traces-${{ matrix.browser }} + path: frontend/test-results + retention-days: 7 diff --git a/.github/workflows/perf-nightly.yml b/.github/workflows/perf-nightly.yml new file mode 100644 index 00000000..486edb1a --- /dev/null +++ 
b/.github/workflows/perf-nightly.yml @@ -0,0 +1,69 @@ +name: Perf nightly (1M rows) + +# Catches superlinear regressions the 100K PR-smoke gate can't see. +# The smoke gate (in ci.yml) is constant-factor-dominated at 100K, so a +# refactor that introduces an extra n-ary join or an accidental O(N²) +# can clear it while spiking nightly_1m by 2-3×. Runs on a daily cron +# (~06:30 UTC, just after the PR-merge zone) plus on-demand via +# workflow_dispatch. +# +# Wired against the same emit_perf_latest.py + perf_gate.sh as ci.yml +# — only PERF_NUM_ROWS differs. The scale_key embedded in latest.json +# tells perf_gate.sh which section of baseline.json's scenarios_by_scale +# to compare against, so a regression at 1M doesn't false-positive +# against the 100K thresholds. + +on: + schedule: + # Daily at 06:30 UTC. Off-peak for the GH Actions runner pool and + # well after the bulk of PR traffic has merged for the day. + - cron: "30 6 * * *" + workflow_dispatch: + inputs: + rows: + description: "Synthetic dataset row count (default 1,000,000)" + required: false + default: "1000000" + +jobs: + perf-nightly: + name: Perf gate (1M rows) + runs-on: forge-amd64-medium + timeout-minutes: 20 + + steps: + - uses: actions/checkout@v6 + + - name: Setup Python (uv) + uses: astral-sh/setup-uv@v7 + with: + enable-cache: true + + - name: Install Python deps + run: uv sync + + - name: Emit perf samples (nightly scale) + env: + # workflow_dispatch override > schedule default of 1_000_000. + PERF_NUM_ROWS: ${{ github.event.inputs.rows || '1000000' }} + # In-DuckDB synthetic generator does 1M rows in <1s; 5 cold + + # 7 warm query runs at 1M land around 60s total wall time. + run: uv run python scripts/emit_perf_latest.py + + - name: Perf gate (nightly_1m baseline) + # Compares the just-emitted latest.json against the + # scenarios_by_scale.nightly_1m section of baseline.json. + # Mismatch (e.g. 
PERF_NUM_ROWS bumped past the 1.5M boundary + # without adding nightly_5m baselines) fails fast with the + # actual scale_key emitted. + run: bash scripts/perf_gate.sh + + - name: Upload latest.json artifact + if: always() + uses: actions/upload-artifact@v5 + with: + name: perf-nightly-latest-${{ github.run_id }} + path: tests/perf/latest.json + # 14 days is enough headroom to debug a regression flagged + # on a Friday without paging anyone over the weekend. + retention-days: 14 diff --git a/.gitignore b/.gitignore index 33202558..77b76bd3 100644 --- a/.gitignore +++ b/.gitignore @@ -3,10 +3,10 @@ setup-state.json *.duckdb *.duckdb.wal /configs/* -# Security: the SSH known_hosts pin IS source-controlled — it's the -# trust anchor for the reverse-tunnel host-key check. Override the -# blanket configs/* ignore. -!/configs/ssh_known_hosts +# Keep the configs/ directory present on a fresh clone so "drop a JSON +# config here" works without a manual mkdir; the dir's real contents stay +# ignored by the blanket rule above. +!/configs/.gitkeep /data/* /data/system/* __pycache__/ @@ -57,26 +57,62 @@ frontend/*_output.txt # Reproducible via scripts/scoring/extract_traces.py against local data. tests/fixtures/scoring/ +# Per-run perf output written by scripts/emit_perf_latest.py and read by +# scripts/perf_gate.sh. The tracked baseline.json is the committed gate input. +tests/perf/latest.json + # Trained matrix.json carries real customer route names. Regenerable via # scripts/scoring/train.py against a fresh trace extract. compute/scorer/matrix.json +# Per-tenant matrices pulled from FOS on every backend startup +# (see backend/main.py:_ensure_scoring_matrix). matrix.default.json +# stays tracked — that's the in-repo fallback the scoring endpoint +# uses when neither the shared matrix.json nor a tenant matrix exists. +compute/scorer/matrix_*.json + # Rust build artifacts. 
compute/scorer/target/ compute/scorer/bin/ -compute/scorer/pkg/ +# Ignore the pkg build dir EXCEPT the committed, matrix-less Wasm package the +# backend ships in its image and deploys via the Fastly API (no toolchain on +# the VM). Rebuild with `make scorer-package` and recommit on scorer changes. +compute/scorer/pkg/* +!compute/scorer/pkg/session-scorer.tar.gz # Per-deployment secrets: AES cookie keys, deploy-time IDs the service files # might reference. NEVER commit. .scoring/ .aider* +# Snapshot of the GCE VM's restart script — drift-prone copy that also +# leaks VM name/zone/mount paths. The canonical version lives on the VM +# at ~/restart.sh; this local copy is for reference only. +/scripts/restart.sh.deployed + # Ad-hoc working directory for local profiling — HAR captures, per-page JSON # summaries, query trace dumps. The reusable harness scripts (profile.js, # split_per_page.py) live here for now; treat the whole tree as throwaway. /scratch/ +# Performance-audit campaign artifacts: HAR captures, per-sample telemetry, +# aggregated p50/p95/p99 summaries, per-page reports + improvement plans. +# Throwaway — regenerable by re-running scratch/perf_audit.mjs. +/performance-report/ + +# Architectural baseline-metrics snapshots from scripts/baseline_metrics.sh +# (make baseline). Generated, throwaway — regenerable any time. +/.metrics/ + # Local-only VS Code config (file-watcher / Pylance excludes for the # regenerating .next + cache trees). Personal to each contributor's editor # setup — not promoted to the repo by default. .vscode/ + +# Playwright artifacts — generated per-run, throwaway. The specs +# themselves live under frontend/e2e/ and ARE tracked. +/frontend/playwright-report/ +/frontend/test-results/ +# Separate Next.js dev cache for the Playwright dev server (port 13004) +# so its lockfile doesn't collide with the dev shell on 13002. 
+/frontend/.next-e2e/ diff --git a/.gitleaks.toml b/.gitleaks.toml index f09b2c4a..a9b9b075 100644 --- a/.gitleaks.toml +++ b/.gitleaks.toml @@ -38,10 +38,6 @@ paths = [ '''^tests/repositories/test_alerts\.py$''', # zeros Slack webhook fixture '''^tests/utils/test_sql_validator\.py$''', # blocked-function NAMES (e.g. "AWS_SECRET_ACCESS_KEY") - # Public SSH host key for localhost.run — sharing is the entire point - # (trust anchor for the reverse-tunnel host-key check). - '''^configs/ssh_known_hosts$''', - # Documentation: release notes and runbooks may reference example # tokens / credentials in prose. '''^docs/''', @@ -49,13 +45,18 @@ paths = [ '''^AGENTS\.md$''', # Working-tree-only artifacts (all gitignored; matter only for - # ad-hoc `--no-git` runs). gitleaks uses Go's RE2 engine, which - # doesn't support negative lookahead, so we list the per-service - # config filename pattern explicitly rather than "everything under - # configs/ except ssh_known_hosts". + # ad-hoc `--no-git` runs). We list the per-service config filename + # pattern explicitly. '''^frontend/\.next/''', # Next.js build cache '''^configs/.*\.json(\.bak.*)?$''', # real per-service Fastly configs (gitignored) - '''^data/''', # real SSH share key, share DB, runtime data + # data/ is fully gitignored, so these only affect ad-hoc `--no-git` + # runs. Scope to the actual runtime artifact types rather than the whole + # tree, so a stray plaintext secret accidentally dropped under data/ (a + # NEW file type) is still scanned instead of blanket-allowlisted. 
+ '''^data/system/share_key(\.pub)?$''', # SSH share keypair (the real secret material) + '''^data/.*\.(db|duckdb)(-wal|-shm)?$''', # SQLite/DuckDB runtime stores (binary blobs) + '''^data/.*\.tmp$''', # transient store files + '''^data/(tunnel_state|system/usage_logging)\.json$''', # runtime state blobs '''.*/__pycache__/''', # Python bytecode '''\.pyc$''', ] diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 5a150d76..a4d1ce74 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,27 +1,54 @@ repos: + # Pinned ruff version must stay reasonably close to the version in + # pyproject.toml (currently ruff>=0.11) — drift triggers pre-existing + # rule changes (UP038, E731 strictness) that the project's actual ruff + # has already retired. Bump together when bumping either side. - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.11.0 + rev: v0.15.15 hooks: - id: ruff args: [--fix] - id: ruff-format - - repo: https://github.com/pre-commit/mirrors-mypy - rev: v1.15.0 + # mypy runs via the project's own uv env (matches what CI runs) and is + # piped through mypy-baseline so pre-existing errors stay accepted and + # only NET-NEW errors fail the commit. The baseline lives in + # mypy-baseline.txt at the repo root; refresh it after a burndown PR with + # uv run mypy backend/ 2>&1 | uv run mypy-baseline sync + # and commit the updated file. + - repo: local hooks: - id: mypy - additional_dependencies: - - types-boto3 - - types-pytz - - fastapi - - pydantic + name: mypy (full backend/, filtered through mypy-baseline) + language: system + # Always check the whole backend/ tree, not just changed files — + # per-file mypy only visits a partial import graph, which makes + # mypy-baseline report unrelated baseline entries as "fixed" and + # exit non-zero. Cost: ~10s per commit; benefit: matches CI exactly. 
+ entry: bash -c 'uv run mypy backend/ 2>&1 | uv run mypy-baseline filter' + files: '^backend/.*\.py$' + pass_filenames: false - repo: https://github.com/pre-commit/pre-commit-hooks rev: v5.0.0 hooks: - id: trailing-whitespace + # syrupy snapshots embed the rendered SQL strings verbatim; many + # of the repository templates align CASE WHEN / SELECT columns + # with intentional trailing spaces. Trimming them desyncs the + # snapshot from what the source actually emits on next render. + exclude: '^tests/.*/__snapshots__/.*\.ambr$' - id: end-of-file-fixer + # openapi-typescript emits openapi.json without a trailing newline; + # end-of-file-fixer adds one, then the next regen-openapi run + # strips it. Excluding the generated artifact breaks the cycle. + exclude: '^frontend/openapi\.json$' - id: check-yaml + # Docker Compose Override files use !reset / !override custom YAML + # tags that PyYAML's default loader doesn't recognize. The compose + # spec validates them at compose-time; the check-yaml hook would + # falsely flag them here. + exclude: '^docker-compose\.prod\.yml$' - id: check-json - id: check-merge-conflict - id: debug-statements @@ -41,16 +68,21 @@ repos: - id: gitleaks # Regenerate the committed OpenAPI snapshot + typed frontend client - # whenever the FastAPI surface or the generator script changes. If the - # regenerated files differ from the staged version, pre-commit fails - # the commit and leaves the updated files in the working tree — re-stage - # them (`git add frontend/openapi.json frontend/types/api.generated.ts`) - # and re-commit. Requires `uv` and `npm` on PATH. + # whenever the FastAPI surface or the generator script changes. 
Runs at + # PRE-PUSH, not per-commit: when this fired on commit, a backend-only + # change rewrote frontend/openapi.json + types mid-commit, so every + # backend commit dirtied generated frontend files that then needed manual + # re-staging — and on a shared branch with parallel frontend work it was + # easy to sweep unrelated edits into the commit. At push, if the + # regenerated files differ, the push fails and leaves them in the working + # tree — commit them (`git add frontend/openapi.json frontend/types/api.generated.ts`) + # and re-push. Requires `uv` and `npm` on PATH. - repo: local hooks: - id: regen-openapi name: Regenerate frontend/openapi.json + types files: ^(backend/.*\.py|scripts/generate_openapi\.py)$ + stages: [pre-push] language: system pass_filenames: false entry: bash -c 'cd frontend && npm run --silent gen:types' @@ -60,3 +92,31 @@ repos: language: system pass_filenames: false entry: bash -c 'cd frontend && npx tsc --noEmit' + + # v2.0 cleanup (Phase 0.12): pre-push gate that the + # @pytest.mark.security_regression count hasn't dropped below + # the Phase 0 floor (24). Catches a refactor that silently + # removes coverage of a verified security fix before push, + # not in CI. `stages: [pre-push]` keeps it off the per-commit + # hot path (the gate takes ~2s to collect 3k+ tests). + - id: security-regression-count + name: Assert security_regression test count >= floor + stages: [pre-push] + language: system + pass_filenames: false + entry: bash scripts/check_security_regression_count.sh + + # Run the Rust scorer's native unit tests before pushing a change to + # compute/scorer/. The Python suite only pins the cross-language *tables* + # (tests/scoring/test_normalize_parity.py); the cargo `#[test]`s pin the + # normalize/cookie/matrix LOGIC + session-expiry boundaries that no Python + # test exercises. 
Pre-push (not per-commit) keeps the compile off the hot + # path; `files:` scopes it to scorer changes; skips with a notice if cargo + # isn't installed (the Scorer CI job hard-requires it). + - id: scorer-test + name: Rust scorer unit tests (cargo test) + files: ^compute/scorer/.*\.(rs|toml|lock)$ + stages: [pre-push] + language: system + pass_filenames: false + entry: bash -c 'if command -v cargo >/dev/null || [ -x "$HOME/.cargo/bin/cargo" ]; then PATH="$HOME/.cargo/bin:$PATH" cargo test --manifest-path compute/scorer/Cargo.toml --locked; else echo "skipping scorer-test - cargo not on PATH (CI enforces it)"; fi' diff --git a/AGENTS.md b/AGENTS.md index 7bf0fb01..910fa8a4 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -1,6 +1,6 @@ # AGENTS.md — AI Agent Guide -The canonical reference for any AI agent working on this project. **Read this end-to-end before your first non-trivial change; re-read the [Traps & Gotchas](#traps--gotchas) section before every change.** +The canonical reference for any contributor or AI agent working on this project. **Read this end-to-end before your first non-trivial change; re-read the [Traps & Gotchas](#traps--gotchas) section before every change.** New here? Start with [docs/ARCHITECTURE.md](docs/ARCHITECTURE.md) for the system design, then come back here for the patterns and traps. ## How to use this file @@ -39,7 +39,7 @@ User-facing pitch + features list lives in [README.md](README.md). This file doc - Storage: FOS (S3-compatible), per-service DuckDB + SQLite (operational metadata), global SQLite for NGWAF bot cache + live-share - Optional: [`falco`](https://github.com/ysugimoto/falco) VCL linter — detected via `shutil.which("falco")`, degrades gracefully to regex checks when absent -**VCL editing:** when you write or edit a log format string, a custom field `vcl_log_expression`, or any VCL snippet, it must pass `falco lint`. Use `+` for string concatenation; wrap literals in heredoc strings (`{"literal"}`). 
Call sites: [backend/utils/vcl_utils.py](backend/utils/vcl_utils.py), [backend/provision.py](backend/provision.py), [backend/routers/services/core.py](backend/routers/services/core.py). +**VCL editing:** when you write or edit a log format string, a custom field `vcl_log_expression`, or any VCL snippet, it must pass `falco lint`. Use `+` for string concatenation; wrap literals in heredoc strings (`{"literal"}`). Call sites: [backend/utils/vcl_utils.py](backend/utils/vcl_utils.py), [backend/provision/fastly_api.py](backend/provision/fastly_api.py), [backend/routers/services/core.py](backend/routers/services/core.py). ## Architecture @@ -52,26 +52,54 @@ User-facing pitch + features list lives in [README.md](README.md). This file doc | Iceberg table | `s3://{bucket}/{prefix}/iceberg/` | Durable long-term storage, hour-partitioned | | Admin state | `s3://{bucket}/{prefix}/iceberg/meta/admin_state.json` | log_format_history, audit_logs, views, custom_fields (no alerts — alerts are per-instance) | | DuckDB | `data/services/{service_id}.duckdb` | Per-service analytical engine **only**: session-scoped `logs` view + temp tables | -| Service metadata DB | `data/services/{service_id}.metadata.db` | Per-service SQLite (WAL): `alerts`, `views`, `audit_logs`, `cron_runs`, `sources`, `ingested_files`, `asn_names`, `usage_log` | +| Service metadata DB | `data/services/{service_id}.metadata.db` | Per-service SQLite (WAL): `alerts`, `views`, `audit_logs`, `cron_runs`, `sources`, `ingested_files`, `asn_names`, `slow_queries` | +| Usage-log DB | `data/services/{service_id}.usage_log.db` | Per-service SQLite (WAL): `usage_log` + `usage_log_hourly_summary`, split out of `metadata.db` so the cron writer's lock never blocks admin readers | | NGWAF bot cache | `data/ngwaf/ngwaf_bot_cache.db` | Shared SQLite for VERIFIED-BOT enrichment | | Live-share DB | `data/system/remote_share.db` | Singleton SQLite (WAL): invites, sessions, audit, TOS, lockouts | | Service configs | 
`configs/{logging_service_id}.json` | Credentials, settings, log_fields config | The DuckDB `logs` view stitches the Iceberg table and the local Parquet buffer so queries always see all data without callers caring which layer holds which row. +### Package layout (post v2.0 carve-ups) + +Several historical monoliths were split into cohesive packages with thin re-export shims at the old paths so existing imports keep working: + +| Old path | New package | Shim status | +|---|---|---| +| `backend/core/iceberg.py` | [`backend/core/iceberg/`](backend/core/iceberg/) (`_core.py`, `buffer.py`, `fs.py`, `lake_info.py`, `manifest.py`, `sync.py`, `view.py`) | package `__init__.py` re-exports the historical public surface; the monkeypatched s3fs methods are now `FosS3FileSystem` / `CachedS3FileSystem` subclasses in `fs.py` | +| `backend/core/metadata_db.py` | [`backend/core/metadata/`](backend/core/metadata/) (`base`, `alerts`, `views`, `ingest_log`, `cron_log`, `asn_cache`, `usage_log`, `usage_log_db`, `reconciliation`, `slow_queries`, `state`) | package `__init__.py` re-exports the full surface and installs a `_ShimModule` proxy so `monkeypatch.setattr(metadata, "_DATA_DIR", ...)` still flips the live binding inside `metadata.base` (legacy callers alias the package as `metadata_db`). No separate `metadata_db.py` file remains — the proxy lives on the package | +| `backend/core/share_db.py` | [`backend/core/share_db/`](backend/core/share_db/) (`connection`, `schema`, `invites`, `sessions`, `audit`, `passcode`, `tos`, `settings`, `validation`) | package `__init__.py` re-exports the historical public surface; passcode hashing is argon2id (legacy scrypt verify branch stays for transparent rehash-on-login) | +| `backend/utils/tunnel.py` | [`backend/utils/tunnel/`](backend/utils/tunnel/) (`manager`, `session`, `rate_limiter`, `state`, `fingerprint`) | package `__init__.py` re-exports `get_tunnel_manager`, `AnalystSession`, etc. 
SSH-to-localhost.run code path (`_TUNNEL_URL_RE`, sleep listener, reconnect logic, `use_tunnel=True` branches) was deleted in v2.0 — only direct-mode (HTTPS public_endpoint) is supported. The `use_tunnel=True` kwarg still exists as a back-compat keyword that raises a clear error | +| `backend/scheduler.py` | [`backend/cron/`](backend/cron/) (`scheduler.py`, `decorators.py`, `jobs/{sync,commit,compaction,optimize,expire,metadata,duckdb_recycle,insights_prewarmer,metric_snapshot}.py`) | thin shim at [`backend/scheduler.py`](backend/scheduler.py) re-exports `get_scheduler`, `Scheduler`, `cron_task`, every `_run_*` job body, and the watchdog constants | +| `backend/routers/session_scoring.py` (was 2442) | [`backend/routers/session_scoring.py`](backend/routers/session_scoring.py) (~1.7k) + [`backend/routers/session_scoring_admin.py`](backend/routers/session_scoring_admin.py) (~1.6k) | sidecar holds retrain + admin-config endpoints (enforce-threshold, exclude-regex, enforce-status-code, matrix-versions, rotate-key, audit, threshold GET/PUT, L2-enforce GET/PUT, evaluation/per-reason, dashboard composite); registers on the shared router via import-for-side-effects at the bottom of `session_scoring.py` | +| `backend/routers/admin.py` (was 1650) | [`backend/routers/admin/`](backend/routers/admin/) (`pop_locations`, `ingest`, `trees`, `downloads`, `sync_status`, `compaction`, `health`, `log_accounting`, `iceberg`, `bot_sources`, `system_metrics`, `metric_history`, `_helpers`, `_dir_size`, `_router`) + [`backend/routers/admin_usage.py`](backend/routers/admin_usage.py) (sidecar) | v2.0 carve: 15 sub-modules each < 350 lines (`system_metrics` serves the system-vitals snapshot, `metric_history` the admin metric-history trend lines). 
`admin/__init__.py` re-exports the historical public surface (`router`, `compute_sync_status_cached`, `compute_log_accounting`, `LOG_ACCOUNTING_*`, `SustainedLossAlert`, `_QueueFile`, `_stream_from_worker`, `_fetch_file_to_zip`, `_resolve_source`, `_get_dir_size`, `ClientDisconnected`). `admin_usage.py` still attaches its endpoints to the shared `router` via `importlib.import_module` from the package init | +| `backend/core/rollups.py` (was 2045) | [`backend/core/rollups/`](backend/core/rollups/) (`_common`, `time_series`, `sessions`, `hour_bundles`, `day_bundles`, `recompute`, `wellknown_bots`, plus the per-dimension rollup writers `slow_urls`, `network_rtt`, `network_speed`, `origin_summary`, `verified_bots_ts`, `perf_latency`) | v2.0 carve: 13 sub-modules. `rollups/__init__.py` re-exports the rollup surface so `from backend.core.rollups import X` (or `from backend.core import rollups; rollups.X`) keeps working unchanged. Shared bits — constants, ident validators, path helpers, query builders, `_VIRTUAL_FIELD_BACKING`, and the shared per-hour bundle writer `build_per_hour_bundles` (writer-side mirror of `compact_closed_days`) — live in `_common.py` | +| `backend/core/log_fields.py` (was 1904) | [`backend/core/log_fields.py`](backend/core/log_fields.py) (659) + [`backend/core/_log_fields_data.py`](backend/core/_log_fields_data.py) (1277) | data-only carve: `LOG_FIELD_CATALOG`, `GROUP_INFO`, `GROUP_DEPENDENCIES`, `PRESETS`, `INSIGHT_DEFINITIONS` moved to the sidecar and re-imported. 
Zero behaviour change | +| `backend/core/duckdb.py` (was 2110) | [`backend/core/duckdb.py`](backend/core/duckdb.py) (1099) + [`backend/core/_duckdb_status.py`](backend/core/_duckdb_status.py) (1119) | `get_sync_status`, `refresh_config_status`, `update_top_values`, `get_ingested_files`, `delete_ingested_files`, `get_schema`, `_clear_schema_cache`, `get_asn_names` / `format_asn_label` / `enrich_asn_labels`, `update_cron_duration`, `log_usage_calls`, `backfill_fastly_edge_writes`, `reconcile_fastly_stats`, `purge_usage_log` move to the sidecar. Re-exported back into `backend.core.duckdb`. Sidecar late-binds shared helpers from the main module via `_db_main` to dodge the circular import | + +Other new modules introduced by the cleanup: + +- [`backend/repositories/_sql/`](backend/repositories/_sql/) — named, parameterized SQL templates extracted out of inline repo strings (one file per repo concern: `dashboard`, `security`, `network`, `origin`, etc.). Repository functions keep their names and signatures; they call into the templates instead of carrying SQL inline. +- [`backend/core/field_registry.py`](backend/core/field_registry.py) — Phase 7 (shipped, including step 13) typed registry that owns per-field declarations (code, display name, type, valid aggregations, valid filter ops, derivations, security-regex hooks). All readers migrated (dashboard CTE generator, rollup spec builder, top_n logic, SQL validator, scoring matrix labels, plus 8 step-13 callers: `services/core.py`, `provision/orchestrator.py`, `provision/fastly_api.py`, `provision/cli.py`, `iceberg/_core.py`, `ingest.py`, `models/custom_fields.py`, `state_sync.py`). Same-identity re-exports of every helper + constant preserve `from log_fields import X` callers. +- [`backend/core/request_context.py`](backend/core/request_context.py) — Phase 2 single FastAPI dependency that bundles `service_id`, `source`, `con`, `telemetry`, `analyst_session`, `cached_temps`. 
Replaces the v1 `AnalyticsDeps` bundle (deleted at the v2.0 cut — Phase 8.1/8.2) and folds `require_service_access` into context construction (there is no path that builds a context without enforcing tenancy). 23 analytics endpoints across 8 routers (dashboard / query / sessions / security / network / origin / performance / insights) now take `ctx: RequestContext = Depends(build_request_context)` directly. +- [`backend/core/request_telemetry.py`](backend/core/request_telemetry.py) — Phase 1 thin wrapper around the OTel tracer that owns section spans, query attribution, call log, cache state, and the `app.thread_wait_ms` custom metric instrumented at `_Pool.acquire`. Lives on `RequestContext`. +- [`backend/core/sqlite_pool.py`](backend/core/sqlite_pool.py) — `ThreadLocalPool`, the generic thread-local SQLite pool extracted from the three previously-duplicated pools in `metadata/base.py`, `metadata/usage_log_db.py`, and `share_db/connection.py`. Each is now a thin wrapper configuring `path_fn` / `schema_fn` / `connect_fn` / `on_borrow_fn` around the one shared implementation; `share_db` queries flow through `InstrumentedConnection` for the first time and appear in the Live Query Monitor under `service=__global_share__`. +- **Env / config handling** — there is no central settings class. App-level env vars are read via `os.getenv` at their use sites (e.g. `OTEL_EXPORTER` in [request_telemetry.py](backend/core/request_telemetry.py), `STRUCTLOG_FORMAT` in [structlog_config.py](backend/utils/structlog_config.py), pool tuning in [duckdb_pool.py](backend/core/duckdb_pool.py)); per-service credentials/settings live in `configs/{id}.json` loaded by [backend/config.py](backend/config.py). +- [`backend/core/iceberg/_core.py`](backend/core/iceberg/_core.py) `execute_with_stale_view_retry(con, src, fn)` — self-heal wrapper for code paths that open raw DuckDB connections instead of going through `QueryRunner`. 
On stale-buffer "No files found" errors, busts `_view_cache` via `clear_source_caches(keep_snapshot_cache=True)` + `update_iceberg_view(force=True)` then retries `fn` once. Used by `rdns_cache` discovery, `rollups` DESCRIBE sites, and `/api/query`. Pre-fix prod incidents: ~8h of 100%-failing rdns runs + analyst-visible query errors on the same buffer-deletion race. + ### Personas (where the two onboarding paths live) The README explains the two collaboration modes for end users. Implementation pointers: - **Admin** (`access_level: "read_write"`) — full ingest/management surface. Config: `configs/{logging_service_id}.json`. - **Analyst Path A — independent instance** (durable, JSON-config join). Read-only FOS credentials, runs its own copy of the app. Components: `POST /api/services/{service_id}/generate-viewer-key` → [`api_invite_analyst()`](backend/routers/services/core.py), `GET /api/provision/join` (SSE), [`InviteAnalystDialog`](frontend/components/InviteAnalystDialog/), ProvisionWizard "join" mode. -- **Analyst Path B — live shared instance** (SSH-tunnelled). No FOS credentials, uses admin's running process. See [Live Dashboard Sharing](#live-dashboard-sharing) below for components. +- **Analyst Path B — live shared instance** (direct-mode against an HTTPS public_endpoint; the SSH-tunnel-to-localhost.run option was deleted in v2.0). No FOS credentials, uses admin's running process. See [Live Dashboard Sharing](#live-dashboard-sharing) below for components. **Both paths must keep working.** Don't remove either. Don't introduce a "unified" replacement without keeping the JSON-config flow intact — it's the only option when the admin's instance can't stay running. 
## Ingest Pipeline -APScheduler runs six job types per service: +APScheduler runs the core sync-family jobs per service (plus per-service `alerts` evaluation + `insights_prewarmer`, and process-global maintenance jobs — see the [Scheduler](#scheduler-backendcron) note): | Job | Schedule | Function | |---|---|---| @@ -108,7 +136,7 @@ The window between `iceberg.write_to_buffer` and `metadata_db.insert_ingested_fi ### Health probe -`GET /api/health` is cheap liveness. `GET /api/health?deep=1` also verifies per-service ingest freshness: reads `max(ingested_at) FROM ingested_files` and the latest terminal `sync` cron run per service; returns 503 when any service is `degraded` (last ingest older than `stale_minutes` — default 30 — or last sync errored). SQLite-only, never FOS or Fastly. Safe to wire into a load balancer. +`GET /api/health` is cheap liveness. `GET /api/health?deep=1` also verifies per-service ingest freshness: reads `max(ingested_at) FROM ingested_files` and the latest terminal `sync` cron run per service; returns 503 when any service is `degraded` (last ingest older than `stale_minutes` — default 30 — or last sync errored, or a sync row is stuck in `status='running'` past `_STUCK_SYNC_RUNNING_MINS` — the orphaned-sync-row condition — or the latest `commit` / `metadata_sync` cron errored). SQLite-only, never FOS or Fastly. Safe to wire into a load balancer. ## VCL Log Format & Custom Fields @@ -135,7 +163,7 @@ Generated by `generate_log_format()` in [backend/core/log_fields.py](backend/cor Key concepts: - `format_hash` — SHA-256 of generated format; detects drift between deployed VCL and local config. - `FASTLY_LOG_FORMAT_SAFE_MAX` ≈ 8,000 chars. Enforced before deployment. -- `generate_capture_vcl()` in [backend/provision.py](backend/provision.py) injects per-hook code (recv, miss, pass, fetch, error, deliver) to populate log variables. 
+- `generate_capture_vcl()` in [backend/provision/fastly_api.py](backend/provision/fastly_api.py) injects per-hook code (recv, miss, pass, fetch, error, deliver) to populate log variables. - `log_format_history` tracks format changes with before/after group lists, added/removed fields, actor. ### Custom fields @@ -154,8 +182,8 @@ lf = cfg.get("log_fields") or {"schema_version": 2, "custom_fields": []} Brief summaries; click through to source for details. -### Scheduler ([backend/scheduler.py](backend/scheduler.py)) -Single `BackgroundScheduler`. `_sync_jobs()` adds/removes per-service jobs on `reload()`. Per-run progress events tracked in [backend/cron_progress.py](backend/cron_progress.py) and streamed via SSE. +### Scheduler ([backend/cron/](backend/cron/)) +Single `BackgroundScheduler` owned by [backend/cron/scheduler.py](backend/cron/scheduler.py). `_sync_jobs()` adds/removes per-service jobs on `reload()`. The `@cron_task` decorator (telemetry context + usage-log flush + watchdog hard-cap) lives in [backend/cron/decorators.py](backend/cron/decorators.py). Per-job bodies live under [backend/cron/jobs/](backend/cron/jobs/) (`sync`, `commit`, `compaction`, `optimize`, `expire`, `metadata`, `insights_prewarmer`, plus the process-global `duckdb_recycle` and `metric_snapshot`). `duckdb_recycle` (bounds the DuckDB object-cache leak) and `metric_snapshot` (SRE sampler feeding `metric_history`) register once for the process, not per service. Per-run progress events tracked in [backend/cron_progress.py](backend/cron_progress.py) and streamed via SSE. [backend/scheduler.py](backend/scheduler.py) is a thin compat shim that re-exports the same public symbols. ### NGWAF Bot Detection ([backend/utils/ngwaf.py](backend/utils/ngwaf.py), [backend/utils/ngwaf_bot_cache.py](backend/utils/ngwaf_bot_cache.py)) Syncs VERIFIED-BOT requests from `GET https://api.fastly.com/ngwaf/v1/workspaces/{id}/requests`. JSON:API pagination via `meta.next_cursor`. 
Shared SQLite cache at `data/ngwaf/ngwaf_bot_cache.db`. Enriches log rows with `waf_req_id` + `waf_sig LIKE '%VERIFIED-BOT%'`. @@ -168,7 +196,7 @@ Both stored in per-service `metadata.db` (SQLite). Alerts are threshold-based wi ### State Sync ([backend/state_sync.py](backend/state_sync.py)) `export_admin_state` writes `audit_logs` + `views` from per-service SQLite, plus `log_format_history` + `custom_fields` from the config JSON, to `{prefix}/iceberg/meta/admin_state.json`. **Alerts are not synced** — each instance maintains its own. Only `read_write` services export. -### FOS Usage Logging ([backend/utils/usage_logger.py](backend/utils/usage_logger.py), [backend/core/metadata_db.py](backend/core/metadata_db.py)) +### FOS Usage Logging ([backend/utils/usage_logger.py](backend/utils/usage_logger.py), [backend/core/metadata/usage_log.py](backend/core/metadata/usage_log.py)) Every FOS Class A/B op and CDN download recorded to per-service `usage_log` SQLite for cost analysis. - Global toggle: `data/system/usage_logging.json` - Process-context tagging via `set_process_context()` in [backend/utils/telemetry.py](backend/utils/telemetry.py) — tags entries with `cron:sync:svc1` or `api:GET /api/...` @@ -176,45 +204,94 @@ Every FOS Class A/B op and CDN download recorded to per-service `usage_log` SQLi - Costs computed at query time from rate config — changing rates recomputes history. - Admin endpoints: `GET/PATCH /api/admin/usage-logging`, `GET/DELETE /api/admin/usage-log`, `GET /api/admin/usage-log/export`. Frontend: `/admin/usage-log`. -### Log-Line Accounting ([backend/routers/admin.py](backend/routers/admin.py) `api_log_accounting`) +### Log-Line Accounting ([backend/routers/admin/log_accounting.py](backend/routers/admin/log_accounting.py) `api_log_accounting`) Per-bucket reconciliation between Fastly's `/stats/service/{id}` log-emission counter and our `sum(row_count) FROM ingested_files`. 
- Field probe order: `log → log_records → log_entries → logging_requests`; first non-zero wins. All-zero logs a warning. - In-flight clamp: current bucket is in totals but excluded from sustained-loss scan (Fastly Stats lags ingest). - Sustained-loss alert: ≥2 consecutive completed buckets with `gap_pct ≥ 0.05`. - Frontend cadence: `staleTime 30s`, `refetchInterval 60s` → ≤1 Fastly Stats call/min per open admin tab. -### Iceberg Pointer + Summary Hash-Throttle ([backend/core/iceberg.py](backend/core/iceberg.py)) +### Iceberg Pointer + Summary Hash-Throttle ([backend/core/iceberg/_core.py](backend/core/iceberg/_core.py)) Every commit writes `metadata_location.txt` (unavoidable) and `table_summary.json` (skippable). The latter is content-hashed against `_table_summary_hash_cache`; identical payloads skip the PUT. Saves one FOS PUT per no-op commit in steady state. Cache is module-scope, process-lifetime. ### DuckDB Connection Pool ([backend/core/duckdb_pool.py](backend/core/duckdb_pool.py)) Per-service LIFO pool replaces per-request `duckdb.connect()` + S3 / iceberg setup + view rebind (~50ms steady-state). Pool size is `DUCKDB_POOL_MAX_SIZE` (default 8). All pool connections open with `read_only=False` — `get_connection` forces this so cron writers and pool readers don't trip DuckDB's "different configuration" error on the same file. Optional per-connection tuning: `DUCKDB_POOL_CONN_MEMORY_LIMIT` (e.g. `256MB`) caps RSS growth under concurrent large scans; `DUCKDB_POOL_CONN_THREADS` reduces context-switching when `pool_size × per_conn_threads` exceeds physical cores. View-binding happens outside the pool lock to avoid deadlocking the FastAPI thread pool when an Iceberg snapshot reload blocks. 
-### Hourly Top-N Rollups ([backend/core/rollups.py](backend/core/rollups.py), [scripts/backfill_rollups.py](scripts/backfill_rollups.py)) -Precomputes per-hour Top-N aggregates for the dashboard's most-asked fields (ip, country, url, custom fields) and writes them under `/data/rollups/`. Closed hours read from the rollup; the current ("live") hour merges the rollup with a fast scan of the buffer. Plus a per-minute time-series bundle (`rollups/timeseries/...`) used by the dashboard chart to skip the wide Iceberg scan. Skipped buckets fall back to the raw scan path. Generated by `local_compact_{id}` after each compaction pass; the global `optimize_{id}` job rebuilds the day's worth on each run. +**Pool wait observability** — `_Pool.acquire` records every checkout's wall-clock wait time to (a) the OTel `app.thread_wait_ms` histogram tagged `{outcome: reused | created | timeout, waited: true | false, service}` for off-box analysis via `docker compose logs backend | grep app.thread_wait_ms`, AND (b) a bounded in-process ring buffer (~1024 samples per service) consumed by `Pool.stats().wait` (p50/p95/p99/max/mean). `GET /api/admin/health-snapshot` exposes the per-service stats (plus `saturated_rejects_total` / `drain_rejects_total` pool-reject counters and the last-warmed timestamp); the `SystemHealthCard` on `/admin` renders top-level Pool wait p95 / Pool in-use / idle cards plus an expandable per-service table. ADR-03 escalation rule: p95 > 50ms ⇒ consider separate-process cron isolation; > 200ms flags red. Both paths are non-blocking (try/except around the recorder) so instrumentation can never break a checkout. + +### Hourly Top-N Rollups ([backend/core/rollups/](backend/core/rollups/), [scripts/backfill_rollups.py](scripts/backfill_rollups.py)) +Precomputes per-hour Top-N aggregates for the dashboard's most-asked fields (ip, country, url, custom fields) and writes them under `/rollups/`. 
Closed hours read from the rollup; the current ("live") hour merges the rollup with a fast scan of the buffer. Plus a per-minute time-series bundle (`rollups/hour_bundled/hour=H/time_series.parquet`) used by the dashboard chart to skip the wide Iceberg scan. Skipped buckets fall back to the raw scan path. Generated by `local_compact_{id}` after each compaction pass; the global `optimize_{id}` job rebuilds the day's worth on each run. + +**Bundle tiers** (cheapest first wins in the reader): +- `rollups/day_bundled/day=D/all_fields.parquet` — one parquet per day, all fields. Reader prefers this for fully-in-window closed days. +- `rollups/hour_bundled/hour=H/all_fields.parquet` — one parquet per hour, all fields. Reader uses for partial-day boundary hours + any day without a day-bundle. +- `rollups/hour/field=F/hour=H/*.parquet` — per-(field, hour). Original source of truth; the bundle writers read from here. +- `rollups/day/field=F/day=D/*.parquet` — per-(field, day). Source for the day-bundler. + +**Virtual fields** (`waf_sig_ind`, `edge_score_reason_ind` — see `_VIRTUAL_FIELD_BACKING` in `rollups/_common.py`) are CSV-unnested at WRITE time so the dashboard reader serves them through the standard rollup path instead of paying a 30-day unnest-during-query each request. Wired in `_run_per_field_copy` (rollups/recompute.py) via `_build_virtual_field_copy_query` (rollups/_common.py). Adding a new virtual field requires (a) appending to `_VIRTUAL_FIELD_BACKING`, (b) ensuring its `backing` column is on the schema, (c) a one-shot rebundle migration so existing hour/day bundles pick it up (see next point). + +**Stale-bundle hazard.** `bundle_hours` / `bundle_days` use mtime to skip up-to-date bundles, and the cron only re-bundles HOURS THAT JUST RECEIVED DATA. Closed historical hours never get re-touched. 
If you add a new field to the rollup writer (real or virtual), the per-(field, hour) parquets land but the bundled `all_fields.parquet` for closed hours stays without them — the dashboard's bundled-rollup reader returns 0 rows for the new field and the runtime fallback fires. Fix: delete the stale closed bundles and re-run the backfill — `backfill_missing_bundles` / `backfill_day_bundles` in [backend/core/rollups/](backend/core/rollups/), or the [`POST /api/admin/backfill-bundle-rollups`](backend/routers/admin/compaction.py) endpoint. + +**Live-hour batch must filter virtual fields out** before `execute_top_n_batch` (in `_base.py`'s `execute_top_n_rollups`): the SQL projects `field_name AS value` and virtual names aren't real columns on the live temp table. Passing them through raises a `BinderException` that fails the whole UNION ALL and silently drops the live-hour merge for real fields too. See `live_fields = [f for f in fields if f in actual_cols]` at the merge site. + +**`live_temp` narrow projection** ([backend/repositories/dashboard.py](backend/repositories/dashboard.py)): only `conn_requests` + `timestamp` on the `chart_metric == "requests"` path. The runtime CSV-unnest fallback for virtual fields (`_exploded_top_n`) queries the BASE table via stashed `orig_table_name` / `orig_where_clause` / `orig_params`, not the temp, so the temp doesn't need to carry `waf_sig` / `edge_score_reason`. Map_data is derived from `all_top_res` instead of a separate query on the temp, so `country` isn't needed either. If you add a new consumer that reads from the temp, add its columns to `narrow_col_set` AND verify the chart_metric branches. + +**`get_top_bots` rollup-served UAs** ([backend/repositories/security.py](backend/repositories/security.py)): on the unfiltered path (`not filters`), top UAs come from `execute_top_n_rollups(["ua"], ..., limit=50000)` instead of scanning the iceberg view for the `ua` column. 
The NGWAF JOIN still needs the raw temp because `waf_req_id` is high-cardinality and not rollup-served — but the temp is single-column (`waf_req_id` only) when the rollup path serves UAs. Filtered requests fall back to the original combined `(ua, waf_req_id)` temp. + +**When adding a new analytics panel (or a new field rendered on an existing panel), consider a rollup at PR time.** Any panel reading from the per-request temp table on 30 d windows is a candidate. Workflow: + +1. **Measure first.** Hit the endpoint on prod via the admin tunnel with a 30 d window + empty `filters: {}` and inspect `_section_timings` in the JSON response. Any per-panel section > ~1 s on 30 d is rollup-worthy. (Audit JSONs under `performance-report/` usually don't carry section_timing — prefer a live curl.) +2. **Pick the shape.** Four reference templates already cover the common cases: + - Per-dimension percentiles (weighted-average across hours) — copy [backend/core/rollups/slow_urls.py](backend/core/rollups/slow_urls.py) (per-URL p50/p95/p99) or [backend/core/rollups/network_rtt.py](backend/core/rollups/network_rtt.py) (per-ASN p95/p99). Reader returns `{..., "_approx": True}`; the FE surfaces an "Approximate" badge on the affected panel. + - Exact GROUP BY counts — copy [backend/core/rollups/network_speed.py](backend/core/rollups/network_speed.py). Math is associative across hours; no `_approx` flag. + - Exact time series (re-bucketable) — copy [backend/core/rollups/verified_bots_ts.py](backend/core/rollups/verified_bots_ts.py). Store at MINUTE granularity (`date_trunc('minute', timestamp)`); the reader re-buckets via `time_bucket` to any caller `bucket_seconds` that's a multiple of 60 (gate `% 60 == 0` — non-multiples are inexact). 
Unlike the leaderboard shapes, the **day compactor PRESERVES the bucket_ts dimension** (`GROUP BY bucket_ts, dim`) so a series can still be produced over the window, and the reader is **hybrid**: `UNION ALL` of the closed-hour rollup + a scoped live query for the in-progress active hour from the temp table (the writer never rolls up the active hour), merged by an outer `GROUP BY (bucket, dim) SUM`. No `_approx` flag. + - Single-row-per-hour summary (multiple aggregates) — copy [backend/core/rollups/origin_summary.py](backend/core/rollups/origin_summary.py). +3. **Wire the 10 seams.** Writer module → constants in `_common.py` → exports in `__init__.py` → `recompute.py` hook (best-effort `try/except`) → daily compactor in `day_bundles.py` (mirror `compact_network_rtt_closed_days_to_daily`) → cron hook in `backend/cron/jobs/compaction.py` → reader method on `QueryRunner` in `backend/repositories/_base.py` (with the standard eligibility gates: `not has_filters` + window ≥ 48 h + ≥ 50% closed-hour coverage + day-prefer/hour-fallback walk) → dispatcher at the live-SQL call site (try rollup, fall through to live on `None`) → admin backfill endpoint in `backend/routers/admin/compaction.py` → tests + extend `tests/core/test_rollups_recompute.py` to patch the new `build_*` call in both `_swallows_downstream_bundle_errors` and `_malformed_hour_skipped`. +4. **After deploy, run the post-deploy backfill.** `POST /api/admin/backfill-bundle-rollups` walks the bundle tree and produces both the per-hour and per-day files for historical hours in one shot — without this, the new rollup only covers hours touched after deploy. +5. **Re-measure on prod.** Confirm `_section_timings` shows `_query_rollup` with sub-100 ms instead of `_query` with seconds. If the dispatcher routes but timing is unchanged, the rollup file wasn't built (check backfill counts). 
+ +**Don't try these — they've been declined for documented reasons:** +- Pre-aggregating percentile sketches for cross-hour combine — DuckDB has no sketch combine. Use request-weighted averages with the count carried alongside (see `network_rtt.py` reader SQL + the no-sketch-combine comment in [backend/repositories/_base.py](backend/repositories/_base.py)). +- Collapsing rollup parquets into fewer-larger files for SCAN-bound queries — DuckDB parallelises across files. Daily-compaction for ROLLUPS is different (those are file-open-overhead-dominated, not scan-bound). +- Response-caching as the perf lever — new logs are always being ingested so the cache TTL has to be sub-minute. Real wins live on the cold-path SQL. +- `temp_table_create` itself — it's materialize-bound; prior CTE/view replacements REGRESSED downstream scans 5×. Add more rollups so fewer panels read from the temp at all instead of trying to make the temp faster. ### Response Telemetry Middleware ([backend/utils/telemetry_response_middleware.py](backend/utils/telemetry_response_middleware.py)) Backstop for endpoints that return a plain `dict` instead of going through `BaseResponse.with_telemetry`. Inspects JSON object responses, injects `_debug_queries` / `_debug_calls` / `_is_cached` from the contextvar collectors if missing. **Must be added INNER to `CompressMiddleware`** (i.e. `add_middleware(TelemetryResponseBodyMiddleware)` BEFORE `add_middleware(CompressMiddleware)`) so it sees the raw JSON, not br/zstd/gzip-encoded bytes. Skips streaming responses, non-dict bodies, and already-instrumented responses. Gated on `DEBUG_RESPONSES`; failure modes are silent + non-blocking. 
+### Live Query Monitor ([backend/core/query_registry.py](backend/core/query_registry.py), [backend/routers/admin_queries.py](backend/routers/admin_queries.py), [frontend/app/admin/queries/](frontend/app/admin/queries/)) +Real-time view of every executing DuckDB + SQLite query — attribution (analyst / admin / cron / system), caller `file:line`, pool slot, duration ticking up live, kind-aware Kill button that calls `con.interrupt()`. Page at `/admin/queries`, admin-only via `RemoteAccessMiddleware`. Polling at 300 ms; the Active panel promotes "completed in the last 10 s" rows as faded entries with an outcome badge so typical-traffic (p50 ≈ 0.2 ms, max ≈ 29 ms) queries are visible. Notable Slow Queries panel filters the completed-history ring buffer by threshold (100ms / 500ms / 1s / 2s / 5s), sorted slowest first. Queries above the persistence threshold are also written to a per-service `slow_queries` table ([backend/core/metadata/slow_queries.py](backend/core/metadata/slow_queries.py), in `metadata.db`) stamped with the request correlation id (`rid`, also emitted in the access log), so the panel can answer "what was slow yesterday?" across restarts. + +Instrumentation lives at two seams: SQLite `InstrumentedCursor` ([backend/utils/sqlite_profiler.py](backend/utils/sqlite_profiler.py)) registers/deregisters around `execute*`; DuckDB `InstrumentedDuckDBConnection` + `_InstrumentedResult` ([backend/core/query_instrumentation.py](backend/core/query_instrumentation.py)) wraps the connection returned from `checkout_connection` so deregistration happens at terminal-fetch time (fetchdf, arrow, etc.) rather than at `execute()` — DuckDB's execute returns in ~ms while fetch can run for seconds. Per-query overhead measured ~21 µs (~0.3% of dashboard bundle wall time). 
Cancel path is safe under pool reuse: a stamped `_conn_to_query[id(con)]` is verified under lock before `interrupt()` so a stale UI click never cancels a different query that's checked out the same physical connection later. + +Audit log fires on every successful cancel (`audit_log` in [backend/utils/structlog_config.py](backend/utils/structlog_config.py)) with the actor + full target attribution. OTel histograms: `app.active_queries.count`, `app.query_duration_ms`, `app.queries_cancelled_total`. Kill switches: `QUERY_MONITOR_ENABLED=0` hides the endpoints (404), `QUERY_REGISTRY_DISABLED=1` bypasses the hot path entirely for zero overhead. + ### CDN-Fronted Log Delivery FOS reads are fronted by a Fastly CDN VCL service (`cdn_service_id`, `cdn_url`, `cdn_secret`). The CDN validates a shared-secret query param to gate access; rate-limited to blunt brute-force. Separate from the logging service ID. +### Session Scoring (edge L2) ([backend/routers/session_scoring.py](backend/routers/session_scoring.py), [compute/scorer/](compute/scorer/)) +Edge-computed 0–100 risk score per request, combining cookie/timing signals (L1) with a PageRank route-transition matrix (L2). The Rust scorer at [compute/scorer/](compute/scorer/) builds to Wasm (`make scorer-package`) and runs on Fastly Compute as an instance-per-request sub-fetch; its native unit tests gate via `make scorer-test`. The matrix is **not** embedded — it's served from the `scoring_matrix` KV Store at runtime. Backend surface: read/retrain/admin-config endpoints in [session_scoring.py](backend/routers/session_scoring.py) + [session_scoring_admin.py](backend/routers/session_scoring_admin.py); deploy/teardown orchestration in [backend/provision/session_scoring_orchestrator.py](backend/provision/session_scoring_orchestrator.py); VCL generation in [backend/provision/session_scoring_vcl.py](backend/provision/session_scoring_vcl.py). 
+ +- **L2 enforcement is explicit operator opt-in** — `GET`/`PUT /scoring/l2-enforce` (`L2EnforcementCard` in the UI). L2 is always computed and logged but contributes to the *enforced* combined score only after an operator enables it; enabling fades it in over three days, disabling returns it to observe-only. There is no clock-driven auto-ramp. Deployment age is only an advisory readiness gauge. +- **NGWAF skip-inspection on the sub-fetch** — the internal scoring sub-fetch carries `x-sigsci-skip-inspection-once` so NGWAF doesn't inspect (and 406) the internal call; it's set on the compute route and unset on the scrub + restart paths so the real-origin WAF path is never bypassed. (See [session_scoring_vcl.py](backend/provision/session_scoring_vcl.py).) +- **No VCL retry of the scoring sub-fetch** — only 2 restarts (score→origin spends them); fails open on timeout/error by design. + ### Live Dashboard Sharing -Components for the live-shared-instance remote-analyst feature (Path B). Three sharing modes are exposed to the admin: +Components for the live-shared-instance remote-analyst feature (Path B). Two direct-mode sharing modes are exposed to the admin (the SSH-reverse-tunnel via localhost.run was deleted in v2.0): -1. **SSH reverse tunnel** via localhost.run (default, easiest) -2. **Admin-provided hostname** (e.g. `https://logs.example.com`) — no third-party relay -3. **Admin-provided IP** (e.g. `https://203.0.113.42:8443`) — no relay, no DNS +1. **Admin-provided hostname** (e.g. `https://logs.example.com`) +2. **Admin-provided IP** (e.g. `https://203.0.113.42:8443`) -Modes 2 and 3 share a single backend code path: `ShareStartPayload.use_tunnel=False` + `public_endpoint=`. The mode selector in the UI is presentational — the backend only cares whether `use_tunnel` is set and (when false) that `public_endpoint` starts with `https://` (cookies need `secure=true`). +Both share a single backend code path: `ShareStartPayload.use_tunnel=False` + `public_endpoint=`. 
The mode selector in the UI is presentational — the backend only cares that `public_endpoint` starts with `https://` (cookies need `secure=true`). `use_tunnel=True` still exists as a back-compat keyword and now raises a clear error. Components: -- [backend/utils/tunnel.py](backend/utils/tunnel.py) — `TunnelManager` owns `ssh -R 80:localhost:8000 nokey@localhost.run` in tunnel mode, parses assigned `https://*.lhrun.dev` hostname, tracks `TunnelState`. In direct mode (hostname / IP), no subprocess is spawned — the admin-supplied `public_endpoint` is stored and `public_url()` returns it verbatim. Process singleton via `get_tunnel_manager()`; `reset_for_tests()` for pytest. +- [backend/utils/tunnel/](backend/utils/tunnel/) — package split: `manager.py` owns the `TunnelManager` singleton (direct-mode lifecycle, sever-all panic), `session.py` holds `AnalystSession`, `rate_limiter.py` is the sliding-window `_LoginRateLimiter`, `state.py` persists `tunnel_state.json`, `fingerprint.py` computes the session fingerprint hash. Process singleton via `get_tunnel_manager()`; `reset_for_tests()` for pytest. - [backend/utils/remote_access.py](backend/utils/remote_access.py) — `RemoteAccessMiddleware` does DNS-rebinding gate (Host/Origin allow-lists, including `testclient`/`testserver` for pytest), blocks admin paths on remote requests, applies response hardening (CSP, X-Frame-Options DENY, no-store, no-referrer). `_StaticAssetLimiter` rate-limits static assets to blunt scrapes. -- [backend/core/share_db.py](backend/core/share_db.py) — singleton SQLite at `data/system/remote_share.db`: `remote_invites`, `invite_services`, `remote_sessions`, `remote_share_audit_logs`, `share_settings`, `remote_invite_claim_tokens`, `share_tos_versions`. WAL mode, numbered migrations, bcrypt passcodes, per-IP/per-email lockout. 
+- [backend/core/share_db/](backend/core/share_db/) — package split: `connection.py` (pool + corruption self-heal with quarantine), `schema.py` (own MIGRATIONS dict + `apply_pending` + `PRAGMA user_version`), `invites.py`, `sessions.py`, `audit.py`, `passcode.py` (argon2id current default; scrypt verify branch stays for transparent rehash-on-login upgrade), `tos.py`, `settings.py`, `validation.py`. Singleton SQLite at `data/system/remote_share.db`: `remote_invites`, `invite_services`, `remote_sessions`, `remote_share_audit_logs`, `share_settings`, `remote_invite_claim_tokens`, `share_tos_versions`. WAL mode, per-IP/per-email lockout. - [backend/routers/share_auth.py](backend/routers/share_auth.py) (`/api/share/*`) — analyst-facing: `login`, `logout`, `acknowledge`, `heartbeat`, `claim/{token}`. Tagged so middleware lets them through the tunnel. - [backend/routers/share_admin.py](backend/routers/share_admin.py) (`/api/admin/share/*`, **blocked over tunnel**) — admin-facing: tunnel lifecycle, invite CRUD, session evict, panic/sever-all, backup export/import, GDPR erase, settings. -- Frontend: [ShareDashboardDialog](frontend/components/ShareDashboardDialog/), [/share-login](frontend/app/share-login/) (TOS-gated), [useAnalystHeartbeat](frontend/hooks/useAnalystHeartbeat.ts), [useShareStatusBanner](frontend/hooks/useShareStatusBanner.tsx). Watermark mounts in `AppLayout` when `bootstrap.settings.is_remote_analyst === true`. +- Frontend: [share-dashboard components](frontend/components/share-dashboard/) (sharing control, invites, sessions, audit panels), [/share-login](frontend/app/share-login/) (TOS-gated), [useAnalystHeartbeat](frontend/hooks/useAnalystHeartbeat.ts), [useShareStatusBanner](frontend/hooks/useShareStatusBanner.tsx). Watermark mounts in `AppLayout` when `bootstrap.settings.is_remote_analyst === true`. 
When adding an endpoint that analysts must reach over the tunnel, **register under `/api/share/*`** (auto-allowed) or update `_is_blocked_path()` — don't punch a hole somewhere obvious. (Trap #20.) @@ -223,11 +300,14 @@ When adding an endpoint that analysts must reach over the tunnel, **register und ### UI Wizard ([frontend/components/ProvisionWizard/ProvisionWizard.tsx](frontend/components/ProvisionWizard/ProvisionWizard.tsx)) Step order: `mode → token → service → storage → ngwaf → fields → execute`. Token entered in step 2 must be threaded into every Fastly-credentialed API call (including the NGWAF fetch). `execute` streams SSE. -### CLI ([backend/provision.py](backend/provision.py)) -- `python backend/provision.py` — interactive -- `python backend/provision.py --teardown --service-id {id}` — teardown +### CLI ([backend/provision/cli.py](backend/provision/cli.py)) +- `python -m backend.provision.cli provision` — interactive wizard +- `python -m backend.provision.cli teardown --service-id {id}` — teardown +- `python -m backend.provision.cli invite-analyst --service-id {id}` — generate a read-only analyst invite +- `python -m backend.provision.cli enable-scoring --service-id {id}` — enable (or redeploy) session scoring on a service +- `python -m backend.provision.cli disable-scoring --service-id {id}` — disable session scoring on a service -CLI supports provisioning and teardown only. There is no analyst join command — that path is web-only. +Subcommands: `provision` / `teardown` / `invite-analyst` / `update-logs` / `update-cdn` / `enable-scoring` / `disable-scoring` / `list-groups` / `list-fields`. ### Teardown Removes the FOS logging endpoint from the Fastly service, the CDN VCL service, the FOS access key, local config, local DuckDB, local cache. APScheduler cleans stale jobs on the next `reload()`. 
@@ -263,6 +343,20 @@ A global middleware in [frontend/lib/api.ts](frontend/lib/api.ts) checks `respon **Streaming/binary endpoints** (SSE, blobs) use raw `fetch()` — leave a comment so future readers don't "fix" it. +### Server-side bootstrap pre-fetch ([frontend/lib/ssr/bootstrap.ts](frontend/lib/ssr/bootstrap.ts), [frontend/app/layout.tsx](frontend/app/layout.tsx)) + +The root layout SSR-fetches `/api/bootstrap`, dehydrates it into the React Query cache (via a new `HydrationBoundary` in `QueryProvider`), and ships the JSON inline in the first HTML paint. `useBootstrap` and every hook that reads `bootstrap.*` via `queryClient.getQueryData(['bootstrap'])` find the data already cached on first render — no client-side bootstrap RTT, no `'No service selected'` flash, share banner in the initial paint. + +Adding a new SSR pre-fetch (e.g., for a per-page endpoint): + +1. **Use `node:http.request`, NOT `fetch()`.** Node's `fetch()` always overrides the `Host` header from the URL. The backend's `_remote_host_allowed` gate rejects remote-classified requests whose Host isn't the public endpoint — so without preserved Host, the SSR fetch returns 400 host_not_allowed and silently falls through to the client. +2. **Trust topology is `X-Remote-Analyst: 1`, not `X-Proxied-By-Caddy`.** The SSR runtime hits the backend over loopback. `is_request_remote` ([backend/utils/remote_access.py](backend/utils/remote_access.py)) classifies based on `request.client.host` first, so a forwarded Caddy marker is IGNORED. `X-Remote-Analyst: 1` is the loopback-honored primitive (gated on `tunnel_manager.is_sharing_active()`). Forward it ONLY when the inbound request carries `X-Proxied-By-Caddy` — otherwise the admin SSH-tunnel path is mis-classified as analyst and 400'd. (See history: the 2026-06-11 SSR-leak incident reverted in `f3d8dd7` / `546c279` was the previous-attempt version that forwarded `X-Proxied-By-Caddy` directly. 
Backend ignored it, returned admin payload, dehydration leaked admin fields into public HTML.) +3. **Always wrap in try/catch + bounded timeout, return `null` on any failure.** SSR errors must NEVER propagate into a broken page — the layout falls back to client fetch when the helper returns null. 5s is generous for prod cron contention; never block SSR longer. +4. **`force-dynamic` is REQUIRED** in any layout/page that does a per-request SSR fetch via `cookies()` / `headers()` from an imported helper. Next.js's static-analysis pass only detects direct `cookies()` calls in the component file itself — calls from an imported module won't flip the route to dynamic. Without `export const dynamic = "force-dynamic"` the layout gets SSG'd at build time (when the backend isn't reachable) and the dehydrated state is permanently empty. +5. **Adversarial test required:** before deploying, hit the prod public URL anonymous AND the admin tunnel and verify the dehydrated state shape. Anonymous public must contain only the `needs_login` stub (NO `sharing_active`, NO `ngwaf_workspace_id`, NO `sync_status`). Admin must contain the full payload. + +The `serviceStore` Zustand slice hydrates from the SSR-cached bootstrap in `useBootstrap`'s post-mount `useEffect` — for the one-render window before that effect fires, use [`useEffectiveServiceId`](frontend/hooks/useIsDataReady.ts) which falls back to `bootstrap.active_service_id` from the React Query cache. Direct reads of `useServiceStore(s => s.activeServiceId)` flash "No service selected" on first paint. + ### Canonical patterns (May 2026 DRY refactor — use these in new code) 1. **`response_model=` on every router handler.** Without it the OpenAPI emits `Record`. Routes using `Depends(get_source)` should also lift `service_id: str` into the signature so it appears as a path parameter. @@ -270,11 +364,15 @@ A global middleware in [frontend/lib/api.ts](frontend/lib/api.ts) checks `respon 3. 
**`ReportLayout`** for analytics pages — bundles `usePageContext + useReportConfig + useFilterPayload + useUrlFilterSync + useServiceQuery + ChartIntervalButtons + ReportShell`. Fall back to `ReportShell` only for multi-query or non-standard chrome pages. 4. **`HelpDialog`** from [components/ui/help-dialog.tsx](frontend/components/ui/help-dialog.tsx) — don't compose `Dialog + DialogHeader + DialogTitle` by hand for help content. 5. **`useBaseMap`** for any MapLibre setup. Don't duplicate the world-layer + theming inline. -6. **`metadata_db.record_audit(service_id, event_type=..., details=...)`** — direct. The `duckdb.log_audit_event` shim and `repositories/audit.py` pass-through were removed. +6. **`metadata.record_audit(service_id, event_type=..., details=...)`** — direct (or via the `metadata_db` shim; both resolve to the same `metadata.audit` impl). The `duckdb.log_audit_event` shim and `repositories/audit.py` pass-through were removed. 7. **`date_utils.parse_iso_utc` / `iso_z` / `iso_z_now`** — don't hand-roll `datetime.fromisoformat(s.replace("Z", "+00:00"))`. -8. **`@cron_task` decorator** in [backend/scheduler.py](backend/scheduler.py) — handles `start_call_tracking`, `set_process_context`, `flush_usage_log` finally-block. +8. **`@cron_task` decorator** in [backend/cron/decorators.py](backend/cron/decorators.py) — handles `start_call_tracking`, `set_process_context`, `flush_usage_log` finally-block, watchdog hard-cap. Re-exported from [backend/scheduler.py](backend/scheduler.py) for compat. 9. **`empty_schema_response(runner)`** in [_base.py](backend/repositories/_base.py) — return this when a repo function hits a service with no logs. 10. **`origin_latency_us_expr(actual_cols)`** in `_base.py` — don't hand-roll the `COALESCE("ottfb", "ttfb" * 1000000.0)` fragment. +11. 
**`useEffectiveServiceId`** in [hooks/useIsDataReady.ts](frontend/hooks/useIsDataReady.ts) — read this instead of `useServiceStore(s => s.activeServiceId)` whenever the answer matters on FIRST PAINT (gating views, building cache keys, "no service selected" branches). It falls back to `bootstrap.active_service_id` from the SSR-hydrated React Query cache so the page doesn't flash empty before the persisted Zustand store catches up. +12. **`analystFetch`** in [frontend/lib/analystFetch.ts](frontend/lib/analystFetch.ts) — shared analyst-facing fetch + response-envelope helper. Don't hand-roll the analyst fetch/error-unwrap per consumer. +13. **`CardErrorState`** — the shared inline card-error component (alert + Retry) for any dashboard/analytics card whose query can fail. Don't fabricate zeros or leave a spinner on a 5xx; render this. +14. **Shared PoP label** in [frontend/lib/pop.ts](frontend/lib/pop.ts) — render PoP codes as `DEN (Denver, CO - USA)` via the shared helper, seeded from `bootstrap.pop_geo`. Keep the raw code for click-to-filter; the label is display-only. ### Next.js navigation + loading conventions (READ BEFORE TOUCHING FRONTEND) @@ -345,16 +443,17 @@ share-dashboard buttons follow the same shape after the recent fix. would balloon the prefetch traffic. - **Hover-prefetch data, not just bundle:** when a Link target needs an API call to render meaningfully, add `onMouseEnter` that calls - `queryClient.prefetchQuery(...)`. Example: the Admin → Share Dashboard - link in [admin/page.tsx](frontend/app/admin/page.tsx#L791) warms the - share-status query so the destination renders real content - immediately instead of skeleton-then-swap. + `queryClient.prefetchQuery(...)`. Example: the Admin prefetch links in + [AdminPrefetchLinks.tsx](frontend/app/admin/AdminPrefetchLinks.tsx) + warm the share-status (and other destination) queries on hover so the + destination renders real content immediately instead of + skeleton-then-swap. **8. 
Wrap `router.replace()` inside effects in `startTransition`.** A synchronous `router.replace()` inside `useEffect` causes a render cascade that blocks paint. Examples: [useUrlServiceSync](frontend/hooks/useUrlServiceSync.ts), -[AppLayout redirect block](frontend/components/AppLayout.tsx#L163). All +[AppLayout redirect block](frontend/components/AppLayout.tsx). All existing call sites are wrapped; new ones must follow. **9. React Query defaults are set in @@ -375,12 +474,12 @@ re-renders triggered by store subscriptions. The trace shows which. - `backend/utils/audit_helpers.py` (referenced the long-removed DuckDB `_ingested_files` table) - `backend/repositories/audit.py` (was a 27-line pass-through) - `scripts/validate_logs.py` / `.sh` (depended on removed bits) -- `backend/core/duckdb.log_audit_event` shim (call `metadata_db.record_audit` directly; test patches must target `backend.core.metadata_db.record_audit`) +- `backend/core/duckdb.log_audit_event` shim (call `metadata.record_audit` directly; test patches must target `backend.core.metadata.audit.record_audit` — or `backend.core.metadata.record_audit` via the package re-export, which the `_ShimModule` proxy mirrors onto the live binding) - `QueryRunner.safe_select` / `safe_select_list` (use `actual_cols` directly) ## Testing -**The Rule:** before committing, run `make ci`. It runs ruff check → ruff format check → mypy → pytest → typecheck-frontend → vitest → osv-scanner. Add or update tests for every change; if a change is not testable in isolation, document why. +**The Rule:** before committing, run `make ci`. 
It runs the full gate in parallel (`-j2`): backend pytest + frontend vitest + frontend typecheck (with OpenAPI type regen) + frontend ESLint ceiling (`lint-frontend`) + ruff check + ruff format check + mypy + import-contracts + VCL lint tests (`vcl-test`) + Rust scorer cargo tests (`scorer-test`) + frontend dep resolution (`verify-deps`) + secret scan + OSV scan + OTEL console-exporter guard (`otel-guard`). Add or update tests for every change; if a change is not testable in isolation, document why. ### Backend (`tests/`, mirrors source tree) @@ -405,6 +504,51 @@ Patterns: `render` + `screen.getBy*` for components; `renderHook` for hooks; dir **What to test:** pure utilities exhaustively (filters, formatters, URL builders), hook state transitions, component key states (loading/error/empty/populated), navigation/URL helpers. +### E2E ([frontend/e2e/](frontend/e2e/), Playwright) + +Cross-browser matrix: chromium + firefox + webkit, all blocking on PR. Mock backend booted by [frontend/e2e/global-setup.ts](frontend/e2e/global-setup.ts) under `FASTLY_MOCK_MODE=1`. Run locally with `cd frontend && npx playwright test --project=chromium` (or `--project=firefox` / `webkit`). Trace + screenshot auto-uploaded on failure. + +SSE-stream specs MUST split on the multi-separator regex `/\r\n\r\n|\n\n|\r\r/` — `sse-starlette` emits `\r\n\r\n`, so the naive `\n\n` parser returns zero messages (see commit `0368868`). Mirrors the production `useServiceStream` regex. + +### Visual regression (opt-in, [frontend/e2e/visual-regression.spec.ts](frontend/e2e/visual-regression.spec.ts)) + +Gated behind `RUN_VISUAL_REGRESSION=1` so default CI stays at ~20 cross-browser tests (including the `a11y-routes`, `a11y-admin-routes`, and `keyboard-navigation` specs). Baselines for chromium-darwin committed under `e2e/visual-regression.spec.ts-snapshots/`. 
To bootstrap a new platform: + +```bash +RUN_VISUAL_REGRESSION=1 npx playwright test --project=chromium \ + e2e/visual-regression.spec.ts --update-snapshots +``` + +Snapshots embed `{browser}-{platform}` — a darwin baseline fails strict-pixel comparison on linux. CI baselines for linux need a one-time `--update-snapshots` run when the env var is flipped on in the workflow. + +### Hot-path micro-benchmarks ([tests/perf/test_benchmarks_micro.py](tests/perf/test_benchmarks_micro.py)) + +Per-call cost benches for HyperLogLog + SQL utility paths. Auto-disabled under `xdist` (assertions still run as smoke); for real numbers: + +```bash +uv run pytest tests/perf/test_benchmarks_micro.py \ + -o 'addopts=-q' --benchmark-only +``` + +### Perf gate scales + +Three tiers in [tests/perf/baseline.json](tests/perf/baseline.json), driven by `PERF_NUM_ROWS`: + +- `smoke_100k` (default, PR-blocking) — `make ci` runs this on every push +- `mid_500k` (opt-in via `PERF_NUM_ROWS=500000`) — closes the 10× inflection gap; wire as a label-triggered PR job for query-shape-touching changes +- `nightly_1m` — cron-scheduled in [.github/workflows/perf-nightly.yml](.github/workflows/perf-nightly.yml) + +Refresh after legitimate perf improvements: +```bash +PERF_NUM_ROWS=500000 uv run python scripts/emit_perf_latest.py +bash scripts/perf_gate.sh +# then update the relevant scenario in baseline.json with headroom +``` + +### Stateful + property tests + +Hypothesis `RuleBasedStateMachine` pattern at [tests/core/test_ingest_stateful.py](tests/core/test_ingest_stateful.py) — first example in the repo. The `@initialize()` rule MUST call `metadata_db.teardown(service_id)` because Hypothesis runs many instances per pytest function and the per-test SQLite file is shared across instances (otherwise: `FlakyStrategyDefinition`). + ## Traps & Gotchas This is the single most valuable section. Re-read it. @@ -457,10 +601,10 @@ A job fired after the config was deleted.
The next `reload()` evicts the stale j The RHS of `~` or `!~` must be a literal. No variables, no concatenation. Use `regsub()` / `regsuball()` for dynamic logic. ### 15. Operational metadata lives in per-service SQLite, not DuckDB -Alerts, views, audit, cron history, ingested-file dedup, ASN names, source registration, usage telemetry → `data/services/{id}.metadata.db` (WAL). Read/write via [backend/core/metadata_db.py](backend/core/metadata_db.py) — never via DuckDB. JOINs against log data: ATTACH the SQLite read-only as `meta` via `attach_metadata_db()`, or pre-fetch and inline as a parameterised IN list (see `dashboard.py` ASN search). +Alerts, views, audit, cron history, ingested-file dedup, ASN names, source registration, slow-query history → `data/services/{id}.metadata.db` (WAL); usage telemetry (`usage_log` + `usage_log_hourly_summary`) now lives in the separate `data/services/{id}.usage_log.db` file so the cron writer's lock can't block admin readers. Read/write via [backend/core/metadata/](backend/core/metadata/) (legacy `from backend.core import metadata as metadata_db` call sites resolve through the package's `_ShimModule` proxy) — never via DuckDB. JOINs against log data: ATTACH the SQLite read-only as `meta` via `attach_metadata_db()`, or pre-fetch and inline as a parameterised IN list (see `dashboard.py` ASN search). SQLite connections open in WAL mode with `synchronous=NORMAL`, which lets writers and readers proceed without blocking each other under contention. ### 16. Monkeypatches → catalog in [MONKEYPATCHES.md](MONKEYPATCHES.md) -We patch six s3fs methods + one PyIceberg `SqlCatalog.load_table` at import time for telemetry-proxy routing, immutable-bytes caching, and table-object reuse. Every patch is documented in MONKEYPATCHES.md with site, motivating incident, and cleanup path. Update that file in the same commit when you add/modify/remove a patch. 
+Historically we patched six s3fs methods + one PyIceberg `SqlCatalog.load_table` at import time. Phase 4 of the v2.0 carve-up replaced the s3fs patches with `FosS3FileSystem` / `CachedS3FileSystem` subclasses in [backend/core/iceberg/fs.py](backend/core/iceberg/fs.py) registered as a pyiceberg `FileIO`. Whatever remains is documented in MONKEYPATCHES.md with site, motivating incident, and cleanup path. Update that file in the same commit when you add/modify/remove a patch. ### 17. MSW + openapi-fetch ordering — `server.listen()` must run at module load `openapi-fetch` captures `globalThis.fetch` at `createClient` time. [frontend/lib/api.ts](frontend/lib/api.ts) creates its client at module load, so MSW's `server.listen()` MUST execute at the top of [frontend/vitest.setup.ts](frontend/vitest.setup.ts) — **not inside `beforeAll`**. If listen runs after lib/api.ts is imported, the captured fetch is the unpatched original and every test silently bypasses MSW. Symptom: handlers never fire, requests hit real loopback. Don't move that call into a hook. @@ -475,11 +619,25 @@ Our [frontend/vitest.config.ts](frontend/vitest.config.ts) sets `globals: false` The tunnel exposes the same FastAPI app to the public internet. Middleware classifies by `Host` and blocks remote requests from admin paths — including `/api/admin/share/*`. When you add an endpoint analysts must reach, register under `/api/share/*` or update `_is_blocked_path()`. Don't remove the `testclient`/`testserver` allow-list entries — they're what let pytest hit admin routes. ### 21. `sync_data` orphan-cleanup vs local-compaction outputs -Local compaction writes merged rollups to three places: `/data/daily/`, `/data/weekly/`, and `/data/timestamp_hour=*/compacted_*.parquet`. None of these are tracked by the iceberg snapshot, so they are NOT in `cloud_files`/`active_paths`. 
The orphan-cleanup loop in [backend/core/iceberg.py](backend/core/iceberg.py) `sync_data()` walks the cache and deletes anything not in `active_paths`; without explicit allow-rules it nukes every compacted output, and the [`local_compacted_files` registry](backend/core/metadata_db.py) then blocks re-download of the source files — silently dropping rows from the view (production: 1.65M → 302K on 2026-05-31, then 1.66M → 1.62M on 2026-06-01 from the per-partition `compacted_*` variant). The fix is two-pronged: orphan-cleanup restricts its walk to `timestamp_hour=*` dirs AND skips `compacted_*.parquet` filenames. **If you add a new local-only output pattern, add it to both the dir skip and the file skip.** Integration coverage in [tests/core/test_local_compaction.py](tests/core/test_local_compaction.py)::`test_compaction_outputs_survive_iceberg_sync_orphan_cleanup` exercises the round-trip with real `compact_local_partitions` + real `sync_data`. +Local compaction writes merged rollups to three places: `/data/daily/`, `/data/weekly/`, and `/data/timestamp_hour=*/compacted_*.parquet`. None of these are tracked by the iceberg snapshot, so they are NOT in `cloud_files`/`active_paths`. The orphan-cleanup loop in [backend/core/iceberg/_core.py](backend/core/iceberg/_core.py) `sync_data()` walks the cache and deletes anything not in `active_paths`; without explicit allow-rules it nukes every compacted output, and the [`local_compacted_files` registry](backend/core/metadata/ingest_log.py) then blocks re-download of the source files — silently dropping rows from the view (production: 1.65M → 302K on 2026-05-31, then 1.66M → 1.62M on 2026-06-01 from the per-partition `compacted_*` variant). The fix is two-pronged: orphan-cleanup restricts its walk to `timestamp_hour=*` dirs AND skips `compacted_*.parquet` filenames. 
**If you add a new local-only output pattern, add it to both the dir skip and the file skip.** Integration coverage in [tests/core/test_local_compaction.py](tests/core/test_local_compaction.py)::`test_compaction_outputs_survive_iceberg_sync_orphan_cleanup` exercises the round-trip with real `compact_local_partitions` + real `sync_data`. ### 22. `unattended-upgrades` can OOM a memory-tight VM A 16 GB Linux VM running backend + frontend + caddy holds a steady-state working set in the 10-13 GB range. The Debian/Ubuntu nightly `apt-daily-upgrade.timer` forks a transient 1-2 GB downloader on top of that, which can trip an OOM kill that wedges the kernel (sshd dies; needs a VM reset). The mitigation is to `systemctl mask apt-daily.timer apt-daily-upgrade.timer unattended-upgrades.service` on the host and re-assert it on every restart so a re-image / apt-reinstall can't silently re-enable them. Trade-off: no automatic security patching — patch manually on a planned maintenance window with the backend container stopped. **If you provision a VM with more RAM, you may safely re-enable upgrades.** +### 23. SSR upstream fetch must use `node:http`, not `fetch()` +Node's `fetch()` always rewrites the `Host` header from the URL — there's no way to override it. The backend's `_remote_host_allowed` gate ([backend/utils/remote_access.py](backend/utils/remote_access.py)) rejects remote-classified requests whose `Host` isn't the public endpoint. SSR helpers like [frontend/lib/ssr/bootstrap.ts](frontend/lib/ssr/bootstrap.ts) use `node:http.request` which preserves arbitrary headers verbatim. If you write a new SSR helper, do NOT reach for `fetch()` — copy the `rawRequest` pattern. The 2026-06-11 SSR-leak incident (reverts `f3d8dd7` / `546c279`) was the first version using `fetch()`; the `Host` got rewritten to `127.0.0.1:8000`, the backend classified as admin-from-loopback, and the full admin bootstrap dehydrated into anonymous public HTML. + +### 24. 
Rollup writers must rebundle bundles after adding a field +`bundle_hours` / `bundle_days` use mtime to skip up-to-date bundles. The cron only re-bundles HOURS THAT JUST RECEIVED DATA. Closed historical hours are never re-touched. So a new field added to the rollup writer (real or virtual) lands as a per-(field, hour) parquet but the bundled `all_fields.parquet` for closed hours stays without it — the dashboard's bundled-rollup reader returns 0 rows for the new field and the runtime fallback fires (defeats the perf win). Fix: delete the closed `all_fields.parquet` files and re-run the backfill (`backfill_missing_bundles` / `backfill_day_bundles` in [backend/core/rollups/](backend/core/rollups/), or the [`POST /api/admin/backfill-bundle-rollups`](backend/routers/admin/compaction.py) endpoint) so they get rewritten with the new field. + +### 25. Virtual fields blow up the live-hour batch if not filtered out +`execute_top_n_rollups` in [_base.py](backend/repositories/_base.py) needs the active-hour merge to include real fields' new rows. The live-hour SQL projects `field_name AS value` and raises a `BinderException` on any name that's not a column on the live temp. Virtual fields like `waf_sig_ind` don't exist as real columns — passing them through silently kills the whole UNION ALL (the outer `except Exception: pass` swallows it) and drops the live-hour merge for REAL fields too. Always filter to `actual_cols` before the batch: +```python +live_fields = [f for f in fields if f in actual_cols] +if live_fields: + live_res, _ = self.execute_top_n_batch(live_fields, tmp_name, ...) +``` + ## AI Agent Directives These apply to every change, regardless of scope. @@ -492,6 +650,7 @@ These apply to every change, regardless of scope. 4. **Test error paths.** Missing config, external 4xx/5xx, empty DB. 5. **Frontend tests live in `frontend/__tests__/`** mirroring source structure (`app/`, `components/`, `hooks/`, `lib/`). 6.
**Verify in the real app when you can.** Start the server, drive the UI, watch the logs (we log every query and FOS call). Don't rely on green tests alone for feature correctness. +7. **Run the Playwright suite as part of the dev-verify checklist.** Alongside the `verify-dev-first` flow (`./run.sh --dev` on 18002/13002), run `cd frontend && npx playwright test --project=chromium` for any change touching the admin shell, dashboard, provision wizard, custom-field drawer, or share-login. The suite spawns its own backend on 18004 + frontend on 13004 via [frontend/playwright.config.ts](frontend/playwright.config.ts) so it doesn't collide with the dev shell on 18002/13002. Use `--project=chromium --project=firefox --project=webkit` before pushing if the change touches browser-only interactions (DnD, popovers, chart hover). ### Code Changes @@ -511,7 +670,7 @@ These apply to every change, regardless of scope. 14. **Never commit a real credential to suppress the scanner.** The point of the gate is exactly this. If a legitimate secret needs to live in the tree (e.g. an SSH public key used as a trust anchor), document why in a comment adjacent to the allowlist entry and explain why exposure is intentional. 15. **Never put real customer values in code, scripts, tests, or docs.** This includes Fastly service IDs (use `` or `${FASTLY_SERVICE_ID:?}` env vars in scripts), bucket names, real domains, real IPs (Fastly edge ranges are fine — they're published), real email addresses (use `you@example.com`), or screenshots that show the above. Test fixtures use placeholders (`TestLogSvcABC123`, `FAKE_TOKEN`, `"FROM_CONFIG"`). Real deployment values come from env vars / per-host config that's gitignored. 16.
**Files that must never be committed** (covered by `.gitignore` — verify before any new directory of generated content lands): - - `.env` (real env), `configs/*.json` except `configs/ssh_known_hosts`, `data/system/` (real SSH key + share DB), `.scoring/` (per-deployment AES keys), `tests/fixtures/scoring/` (real prod traces). The `.gitleaks.toml` allowlist also covers these so a working-tree (`--no-git`) scan stays clean for ad-hoc local runs. + - `.env` (real env), `configs/*.json`, `data/system/` (real SSH key + share DB), `.scoring/` (per-deployment AES keys), `tests/fixtures/scoring/` (real prod traces). The `.gitleaks.toml` allowlist also covers these so a working-tree (`--no-git`) scan stays clean for ad-hoc local runs. ### Provisioning Wizard @@ -526,6 +685,32 @@ These apply to every change, regardless of scope. 17. All new endpoints get at least one test in `tests/routers/`. 18. Regenerate OpenAPI types after the endpoint lands: `cd frontend && npm run gen:types`. +### Architectural choices to preserve + +The 2026-06 retrospective surfaced several structural decisions the audit specifically validated. Don't rewrite these in a future reimagining: + +- **ADR-driven architecture with decisions captured AFTER the lesson lands.** This is the velocity strategy, not a debt. Continue the cadence — write the ADR after a phase ships, not before. +- **[MONKEYPATCHES.md](MONKEYPATCHES.md) as a living inventory** with root-cause attribution per patch (incident date, why upstream can't fix, removal criteria). +- **Property-based testing** (Hypothesis) for filter/query roundtrips. Catches drift without hand-written matrices. +- **RequestContext** making tenancy structurally impossible to bypass — can't construct without `_enforce_service_access`. +- **Modular package carves with re-export shims** for backward compat during refactor (the `metadata` package `_ShimModule` proxy + the `scheduler.py` re-export shim). 
+- **Named exception classes + explicit retry policies** (vs. generic `except Exception`). +- **MVP-then-iterate cadence with phase-based cleanup.** Don't propose "spike before shipping" rewrites — solo bandwidth and information-unavailability at v1.0 time make iterate-then-cleanup the right trade-off. + +### Anti-patterns explicitly rejected + +If a refactor proposal matches one of these, push back. Each was investigated and rejected during the 2026-06 audit; the rationale is preserved here so future-you / future-agent doesn't relitigate: + +- **Generic "schema codegen" infrastructure** for FilterSpec — `openapi-typescript` already handles the 80% case; codegen can't express the procedural collision-handling logic that's the actual duplication. +- **Premature `usePagination` / `PaginationConfig` context** when there are only 2 paginated endpoints with genuinely different sort semantics. +- **Centralized `RoleProvider` context** — role is 2 orthogonal flags (`analyst_session` × `is_remote_analyst`), not a hierarchy; an enum would have locked in a false model when SHARE-INVITED was added. +- **Multi-language scoring codegen** (Python ↔ Rust) — parity is enforced cheaply by fixture tests; codegen adds versioned-schema overhead and constrains schema evolution. +- **Pre-formatted server-side response values** — `TopTenTable` needs raw values for click handlers and map ops; pre-formatting forces double payload and locks display format into the API contract. +- **Cache-coherence "state machine" abstractions** — the bottleneck is DuckDB view rebuild time, not cache layer policy; a state machine wouldn't have prevented the 2026-06-09 transient-empty-result incident. +- **Unified `QueryExecutor`** for retry — stale-view and compaction-race are different error classes with different recovery costs; collapsing them creates a leaky abstraction. 
+- **Tentacle-parameter threading** through repository signatures (e.g., passing `RequestContext.cached_temps` to every repo function) — couples request scope to data layer. +- **Custom `FsspecFileIO` subclass to "fix" the s3fs monkeypatches** — investigated 2026-05-21 and rejected; pyiceberg instantiates `S3FileSystem` directly inside its `_s3()` builder, bypassing the FileIO layer entirely. Wait for upstream `supply-your-own-FileSystem-class` hook (tracked in [MONKEYPATCHES.md](MONKEYPATCHES.md)). + ## Keeping This File Current Update this file in the same commit that introduces: diff --git a/CHANGELOG.md b/CHANGELOG.md index 3309df7c..483fa862 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,508 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog 1.1.0](https://keepachangelog.com/en/1.1.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [2.0.0] - 2026-06-20 + +### Added + +Feature work shipped alongside the cleanup sweep below. + +- **In-UI scorer redeploy.** The Session Scoring admin page gains a + **Redeploy** button plus an edge-drift warning when the deployed Wasm + lags the current scorer build, so re-shipping the edge scorer no longer + requires the CLI. +- **Scoring dashboard depth.** A fail-open breakdown card surfaces how + often the edge scorer failed open, a two-phase redeploy log shows code + vs matrix propagation, and the 1-hour window renders per-minute charts. +- **`enable-scoring` / `disable-scoring` CLI subcommands** for headless + provisioning and teardown of session scoring on a service. +- **Opt-in RUM Web Vitals collection.** Real-user Core Web Vitals can be + collected to a rotating JSONL sink, **off by default** and enabled only + via a host `.env` flag (never baked into the image). Ships with a dev + analysis script and size-based log rotation. 
+- **Edge Layer-2 enforcement is an explicit operator opt-in.** The + route-transition (L2) sub-score is always computed and logged, but its + contribution to the *enforced* combined score is now gated behind a + per-service switch instead of an automatic deployment-age ramp, so + there is no clock-driven monitoring-to-blocking transition. An + `L2EnforcementCard` near the threshold controls (confirm-dialog gated, + with a readiness banner) and a `GET`/`PUT /scoring/l2-enforce` endpoint + drive it; enabling fades L2 in over three days from the moment of + consent, disabling returns it to observe-only. Deployment age is now + only an advisory readiness gauge. +- **Request correlation + operations hardening.** Every request now mints + an app-level request id that threads through the admin/analyst access + log (which also gains per-request latency) and the slow-query + attribution, so a slow request can be pivoted to the queries it ran. + The admin health snapshot gains scheduler-tick liveness, recent + cross-service cron failures, the effective log/exporter mode, + config-backup freshness, an opt-in FOS reachability probe, DuckDB pool + saturation-reject / last-warmed counters, and a traffic-normalized + scoring fail-open rate; deep `/api/health` additionally degrades on a + stuck `running` sync row and on errored commit / metadata-sync crons. + The System Health card surfaces the new tiles and a cron-failure banner. +- **Human-readable PoP and ASN labels everywhere.** Points of presence + now render as `DEN (Denver, CO - USA)` (code prominent, location muted) + through one shared component sourced from a `pop_geo` map seeded by + `/api/bootstrap`. Wired into the Network Quality Avg RTT by PoP, the + Shielding Analysis table / map tooltip / a11y table, the dashboard + top-N PoP card, and origin by PoP. The Network Quality Avg RTT by ASN + breakdown likewise shows the ASN name alongside the number. 
+ Click-to-filter keeps the raw code / ASN while the label is + display-only. + +### Performance + +- **Filter bar** no longer causes a pre-hydration layout shift on load. +- **Session scoring** paints its status panel immediately instead of + blocking on the analytics fetch. +- **Edge scorer** ships ~13% smaller (491 KB → 429 KB) via a gated + `wasm-opt` build pass on top of the existing cargo LTO/strip, and opens + each edge ConfigStore once per request instead of up to four times + (same fail-open / reject behavior). + +### Changed + +- **CI** gates frontend ESLint with a count-ceiling ratchet so the error + count can only ratchet down. + +### Dependencies + +- **Dependency freshness sweep** across all ecosystems. Python in-range: + `fastapi 0.136.3 → 0.138.0`, `duckdb 1.5.3 → 1.5.4`, `uvicorn 0.48.0 → + 0.49.0`, `sqlalchemy 2.0.50 → 2.0.51`, `pytest 9.0.3 → 9.1.1`, `ruff + 0.15.15 → 0.15.18`. Frontend majors: `@types/node 25 → 26`, + `react-plotly.js 2 → 4` (drops the redundant `@types/react-plotly.js`); + TypeScript stays at `^5.9` (the 6.0 bump broke the Docker `npm ci` via an + `openapi-typescript` peer). Frontend in-range: `next 16.2.6 → 16.2.9`, + `lucide-react`, `@radix-ui/react-slider`, `@playwright/test`, `vitest`, + `tailwindcss`. Scorer Cargo lockfile refreshed within constraints. + +### Documentation + +- **OSS front-door tidy** — the README Quick Start uses `docker compose` + (v2) with an EOL note for the v1 binary, CONTRIBUTING gains + development-setup and test-running sections, ARCHITECTURE glosses + FOS/NGWAF on first use and links a new ADR index, and the orphaned + `configs/ssh_known_hosts` plumbing left from the removed SSH + reverse-tunnel is dropped. + +### Cleanup + +Cleanup sweep applying an in-tree code-quality review. The pattern +across the work was the same on every front: kill the dual maintenance +that survived the package carve-up. 
+ +- **Three SQLite pools collapse into one.** `metadata.base`, + `metadata.usage_log_db`, and `share_db.connection` all owned + identical thread-local pool machinery (same module globals, same + PRAGMAs, same init lock). They now share `ThreadLocalPool` in + `backend/core/sqlite_pool.py`. share_db queries flow through + `InstrumentedConnection` for the first time — they now appear in + the Live Query Monitor under `service=__global_share__`. +- **Origin summary's per-query templates collapse into one path.** + `TEMP_SUMMARY_ROLLUP` + `TEMP_SUMMARY_BY_EDGE` are gone; the live + and TEMP-table paths both use `SUMMARY_GROUPING_SETS` through a + shared `_shape_summary` helper that reads rows by column name + (`cursor.description` dict access) instead of positional indices. +- **Cron job tails consolidated.** Five `finally:` blocks ending in + the same `if run_id: update_cron_duration ... except: pass` + boilerplate route through `finalize_cron_duration`. The 16+ + `load_config / 404` preambles funnel through `load_service_config`. + Three `start_cron_run → spawn-thread → 503` triples collapse into + one `start_or_resume_cron`. Per-hour bundle walks + (`collect_hourly_bundle_paths`) and the two cross-package migration + runners (`run_pending_migrations`) get the same treatment. +- **Mixins + helpers for the small repeated shapes.** + `LogExtentsMixin` (`earliest_log_at` + `latest_log_at`), + `OkResponse` (`ok: bool = True`), `_atomic_write_json`, + `_get_cfg_field`, `client_ip`, `shim_attr`, plus iceberg + `_iceberg_root_prefix` + `_metadata_pointer_candidates`. +- **`fetch_service_name` now routes through the shared `fastly()` + client** instead of an inline urllib body. Adds a `timeout` keyword + to `fastly()` (default 30 s preserves the existing behavior of the + ~50 other call sites) and the name-fetch call site pins + `timeout=10` + `max_retries=1` so the cold-path tail caps at ~21 s + vs the client default of ~127 s. 
Caller is behind a 300 s name + cache so steady-state cost is unchanged. +- **`_run_falco_lint` absorbs the falco subprocess plumbing** shared + by `vcl_utils.lint_log_format` (logging-endpoint VCL check) and + `vcl_validator.lint_vcl` (scoring-snippet VCL check). Each caller + keeps its own falco-not-available handling, timeout budget, and + output parser — the helper only owns the tempfile lifecycle, + `subprocess.run` invocation, and tempfile-path redaction. The two + use cases stay distinct on purpose (logging is best-effort, scoring + is a security boundary). +- **Comment hygiene pass.** Removed stale, redundant, and duplicate-divider + comments across the tree and condensed embedded changelog blocks (the CI + coverage gates and the ESLint ceiling) down to their conventions. Load-bearing + rationale, incident references, and functional directives are left intact. + +### Fixed + +- `start_proxy_server` race that surfaced as + "proxy server is not running" when N reader threads called + `get_connection` simultaneously on a cold process. Concurrent + first-callers now serialise the thread-start decision and wait + on `_READY` outside the lock so every caller reads `_PORT` after + the server has bound. +- `get_metadata_storage_stats` + `cleanup_metadata` silently + ignored the `usage_log` table on every fresh service after + the v2.0 per-service-file split — the helpers still read + `metadata.db`. Routed through `usage_log_db` so admin storage + stats and the retention cleanup job actually see the rows. +- `sync.py` cron tail used to emit a misleading + "View refresh + warm: Xms" status event even on failure (the + success log sat outside the try/except). The shared + `refresh_view_and_warm_pool` puts the success log inside the + try/except so failure means no event. 
+- `start_cron_run` non-sync task types fell back to + `cron_compact.log_retention_days` via a buggy ternary; the + promoted `_TASK_TO_CRON_KEY` mapping plus a default 7-day + fallback gets the correct retention applied per task. +- `query_instrumentation._safe_weakref` silently no-op'd the + memory probe when wrapping non-weakref-able cursors; promoted + the registry-version's strong-ref-closure fallback so the probe + always tracks. +- `local_compaction` hour-tier tests were flaky on any clock more + than 30 days past the hardcoded sample dates — the fixture now + pins both `_DAILY_TIER_AGE_DAYS` and `_WEEKLY_TIER_AGE_DAYS` so + neither tier sweeps the test partitions out from under the + assertions. +- **Edge scorer** now treats encoded slashes (`%2F`) as data during + route normalization, so paths with encoded slashes are scored + correctly (the Wasm package was rebuilt to ship it). +- **Scoring sub-fetch** is no longer inspected by NGWAF, fixing + intermittent `compute-unavailable` 406s on the scoring path. +- **Analyst idle timeout** is no longer extended by background + telemetry beacons — only real analyst activity keeps a share session + alive. +- **Sync cron** never leaves an orphaned `running` row behind; a + per-task orphan timeout reaps a stalled run instead of wedging + ingestion on every subsequent tick. +- **Origin analytics** holds its pool connections until every parallel + gather worker finishes, fixing a connection released mid-query. +- **Stale browser tabs** no longer get stuck in a hard-reload loop. +- **Scoring admin** modal copy matches the orchestrator's actual + behavior, the redeploy modal text is no longer clipped, and status + fetches are type-clean. +- **Structured logging** surfaces stdlib `extra=` fields via + `ExtraAdder`. +- Restored the router-independence import contract. 
+- **Backend failures surface inline instead of silently.** Where a 5xx + previously left a forever spinner or fabricated zeros (the network map + and Network Quality section, the dashboard chart and bot cards, session + detail, the `/usage` storage + cost cards, and the logs import / commit + quick actions) the UI now shows an inline alert with a Retry. The + analytics request `sections` and several response reads are typed + through the generated OpenAPI schema so a backend rename is a compile + error rather than a blank card. +- **Interrupted-delete raw files** are now reclaimed automatically. A + restart between the dedup-ledger write and the object-storage delete + used to strand a raw `.gz` in the bucket, invisible until the daily + ledger trim re-listed it; the sync reconcile now re-issues the delete + from durable state (default-deny: only files positively proven to hold + ingested data are removed). +- **DuckDB object-cache leak** that grew unbounded under continuous + buffer + compaction file churn (and OOM-killed the container) is bounded + by a periodic, timeout-guarded instance-recycle job that briefly drains + connections — reads queue rather than fail during the drain — to free the + cache and let it re-warm lazily. Off by default + (`DUCKDB_RECYCLE_INTERVAL_MIN=0`), opt-in via deploy config. +- **Total Logs badge and ingest skipped-files counts** no longer drift + upward over time. The `ingested_files_summary` rollup is now recomputed + after a retention trim (it was incremented on ingest but never + decremented on delete), per-run `skipped_files` reports the files + actually re-seen and skipped rather than the dedup-ledger size, and the + Total Logs badge prefers the last-known-good row count during a + transient catalog rebuild. 
+- **Access logs render through structlog.** Uvicorn's private + `uvicorn.access` handler used to emit plaintext access lines (carrying no + `trace_id`) into the otherwise-structured stream; they are now bridged + through the shared root handler at startup so every post-boot line is + structured. +- **Accessibility remediation** brings every loopback-reachable admin and + usage route axe-clean: light-mode status / destructive color tokens + deepened to clear 4.5:1 contrast, a duplicate focus trap removed from + dialogs so nested selects stay open, and the cost-calculator number + inputs given accessible names. + +### Removed + +- `backend/utils/retry.py`, `backend/utils/cdn.py`, + `backend/core/settings.py` (Path-B removal of three migration + scaffolds that were never adopted in tree). `pydantic-settings` + removed as a *direct* dependency from `pyproject.toml` (it was the + sole first-party consumer; it remains in `uv.lock` transitively via + the OpenAPI spec/schema validators used in tests). +- Legacy `usage_log` DDL + 3 triggers + 4 indexes in + `metadata.base._SCHEMA` (the table moved to its own per-service + file pre-2.0). `migrate_from_metadata_db` and + `_migration_003_rebuild_usage_log_hourly_summary` deleted. +- Scrypt passcode verify path + `PASSCODE_DEFAULT_ALGO_KEY` + + `_migration_003_passcode_algo_marker` (cutover happened + pre-2.0; fresh installs have no scrypt rows). +- `TunnelState.use_tunnel` + `tunnel_url` + the + `share_admin` response keys that exposed them (always + False/None since v2.0 deleted the SSH path). +- Per-checkin `_cleanup_temp_tables` sweep in `duckdb_pool` — + the "safety net" was unreachable because the failure path + discards the connection before the sweep can run. + +### Release overview + +Architecture cleanup release. The post-`v1.2.0` perf branch closed the +worst read-path latency by stacking remediation on top of an +architecture that wasn't designed for the workload; this release pays +that down.
The largest backend files were carved into per-concern +packages, telemetry moved to OpenTelemetry + structlog, tenancy got a +typed `RequestContext` boundary, frontend hydration warm-up hacks were +replaced with policy, and the test + type gates ratcheted to a level +that catches regressions on the way in. Composite endpoints land as a +hard cutover — frontend + backend ship together, granular endpoints +deleted. + +### Architecture + +- **`backend/core/iceberg.py` (4,232 LOC)** → `iceberg/` package + (`view`, `catalog`, `warehouse`, `manifest`, `fs`, `_core`, + `buffer`, `ddl`, `snapshot_cache`, `dedup`, …). Custom + `FosFsspecFileIO(FsspecFileIO)` + `CachedFosS3FileSystem(S3FileSystem)` + subclasses replace 5 of the 6 historical `s3fs` monkeypatches; + only the `ThreadPoolExecutor.submit` ContextVar wrapper remains + (see [MONKEYPATCHES.md](MONKEYPATCHES.md)). +- **`backend/scheduler.py` (2,843 LOC)** → `backend/cron/` package + with `scheduler`, `decorators`, and per-job modules under + `cron/jobs/` (`sync`, `commit`, `compaction`, `optimize`, `expire`, + `metadata`, `gap_heal`, `rollup_compact_daily`). The scheduler + picks the **separate-pool** isolation strategy based on Phase 1 + thread-wait telemetry; the deferred-view-cache-invalidation hack + is gone. +- **`backend/core/metadata_db.py` (3,168 LOC)** → `backend/core/metadata/` + package with concern-partitioned mixins (`base`, `alerts`, `views`, + `ingest_log`, `cron_log`, `asn_cache`, `usage_log`, `reconciliation`, + `state`). `metadata_db.py` becomes a thin backward-compatible shim. +- **`backend/utils/tunnel.py` (1,022 LOC)** → `backend/utils/tunnel/` + package (`manager`, `session`, `rate_limiter`, `state`, + `fingerprint`). The SSH-to-localhost.run path is **deleted entirely** + (~400 lines): no more SSH subprocess + sleep-listener + reconnect + state machine. Direct-mode only; production has always used direct. 
+- **`backend/core/share_db.py` (1,312 LOC)** → `backend/core/share_db/` + package (`connection`, `schema`, `invites`, `sessions`, `audit`, + `passcode`, `tos`, `settings`). `argon2-cffi` replaces `scrypt` for + passcode hashing. +- **`backend/routers/admin.py` (1,650 LOC)** → `backend/routers/admin/` + package (14 sub-modules: `pop_locations`, `ingest`, `trees`, + `downloads`, `sync_status`, `compaction`, `health`, + `log_accounting`, `iceberg`, `bot_sources` + shared + `_helpers` / `_dir_size` / `_router`). +- **`backend/core/rollups.py` (2,045 LOC)** → `backend/core/rollups/` + package (8 sub-modules: `_common`, `time_series`, `sessions`, + `hour_bundles`, `day_bundles`, `recompute`, `wellknown_bots`). +- **`RequestContext` replaces `AnalyticsDeps`** ([`backend/core/request_context.py`](backend/core/request_context.py)). + Tenancy is enforced at context construction; routes never parse a + `service_id` from a path param. The security-load-bearing private + `read_only` attribute is now structurally unexposable as a query + param. +- **Composite endpoints + hard cutover** — `dashboard/bundle`, + `security/bundle`, `network/bundle` ship together with the frontend + swap. Granular per-card endpoints deleted, `_meta_con` parallel path + dropped, `is_cached/_is_cached` alias collapsed, + `AnalyticsDeps = RequestContext` shim removed. Top-5 backend files + now ≤ 1,461 LOC; no backend file > 1,500. + +### Telemetry, observability + +- **OpenTelemetry** (`opentelemetry-api/sdk` + + `fastapi`/`botocore`/`aiohttp` instrumentors) replaces the four + fragmented custom telemetry surfaces. Console exporter ships by + default; backends (Jaeger / Tempo / Honeycomb / …) are a + deploy-config decision, not part of this release. +- **`structlog`** wires `trace_id` + `span_id` into structured log + output via a custom processor. +- **`process_context_scope` + `_ACTIVE_CONTEXTS` mirror kept** at + [`backend/utils/telemetry.py`](backend/utils/telemetry.py). 
OTel context + propagation uses Python ContextVars under the hood, which inherit + the cross-thread limitation (fsspec iothread, pyiceberg + ThreadPoolExecutor) the manual mirror was built to solve; removing + the mirror would re-introduce the ~80%-NULL telemetry bucket + observed on 2026-05-20. Docstring + plan entry document the + reasoning. +- **`RequestTelemetry`** thin wrapper owns section spans, query + attribution, call log, and the custom `app.thread_wait_ms` metric + that fed the Phase 6 separate-pool decision. + +### Reliability, perf + +- **`aiodns` + `asyncio.gather` + bulk-transaction sqlite writes** in + [`backend/utils/rdns_cache.py`](backend/utils/rdns_cache.py) replace the + serial-blocking `socket.gethostbyaddr` loop that wedged the sync + worker for minutes on bulk lookups. +- **`tenacity`** decorator-based retry replaces ad-hoc try/except loops + for Fastly API + NGWAF + SQLite WAL-busy paths; centralised policy + on `Settings`. +- **`pydantic-settings`** centralises env-var reads + boot validation + (the "TRUSTED_PROXY_IPS required in prod" gate is now a pydantic + validator). +- **`cachetools`** replaces `bounded_cache` / `rdns_cache` / + `ngwaf_bot_cache` in-process LRU/TTL implementations. +- **Structured `.tf.json`** generation replaces f-string HCL + + `_hcl_escape` regex (`backend/utils/terraform_gen.py`), eliminating + the custom-HCL escaping injection vector. +- **`orjson` via FastAPI `ORJSONResponse`** for ~5–10× faster JSON + serialisation on composite endpoint payloads. +- **`rich` + `typer`** for the provision CLI; `httpx` everywhere + except `telemetry_proxy.py` (which stays on `aiohttp` for the proxy + server role). +- **`nuqs`** as the URL state source on the frontend, replacing the + custom Zustand/Effect sync hooks that produced hydration desync on + refresh. 
+- **`session_scoring._cached`** clears `_inflight` on the cache-hit + path too, not only on producer-path teardown — concurrent callers + on a hot cache key no longer leak the inflight registration when + the producer finishes before they wake up. +- **`iceberg/buffer.tombstone_buffer_files`** logs + skips on + marker-write failure (the immediate-`os.remove` fallback re-opened + the in-flight-query race the tombstone grace window exists to + close). A paired regression test pins the contract. +- **`DROP TABLE IF EXISTS` identifier quoting** at 11 temp-table + cleanup sites so the drop tolerates reserved keywords / hyphenated + service slugs that would otherwise raise. + +### Trust topology, middleware + +- **Middleware order asserted at boot AND in tests** — the + multi-paragraph prose comments in `main.py` were replaced with + one-line `# INVARIANT` markers + a boot-time crash if + `app.user_middleware` doesn't match the declared tuple. Snapshot + tests cover Caddy + docker-compose middleware order too. +- **`@pytest.mark.security_regression` marker + monotonic-count CI + gate** (floor: 24, from `audit-findings/`). Every test covering a + verified security fix carries the mark; a refactor cannot silently + drop coverage of a known fix. +- **Trust-topology snapshot tests** pin Caddy `@from_fastly` matcher, + XFF forwarding, `/share-login` rate-limit, and the backend + `--forwarded-allow-ips=127.0.0.1` flags. +- **`raise_internal(logger, exc, code, status)`** replaces + `raise HTTPException(detail={"error": str(e)})` at every backend + except site that previously echoed the original exception message + to the client. Detail is now `{"error": <code>, "error_id": <8-hex>}`; + the full exception lands in the server log with the same + `error_id` so operators triage without the upstream body / token + fragments leaking on the wire. +- **`escape_sql_literal`** applied at every `read_parquet()` / + `glob()` site that interpolates a computed path.
Closes the + injection surface a partially-validated path could open through + DuckDB's `read_parquet()` glob expansion. +- **Caddy container drops privileges** — `caddy/Dockerfile` adds + `USER caddy` (the base image ships the user). Caddy is the only + externally-facing socket and binds nothing below port 1024, so + there's no reason to keep `root` in the runtime. + +### Frontend + +- **RSC/CSR boundary** documented in `app/_routing.md`. The + hidden-Plotly + hidden-MapLibre + `setTimeout` warm-up hacks are + dropped; replaced with `modulepreload` + the styledata-event swap + pattern. +- **16 frontend files > 500 LOC split.** `ProvisionWizard.tsx` + (3,582 LOC) → `wizard/steps/*` + `state.ts` + `api.ts`; + `app/logs/page.tsx` (2,136 LOC) → `_sections/*` + `_state.ts`. + `app/admin`, `app/dashboard`, `app/alerts`, `app/security`, etc. + all post-split < 500. **No frontend file > 499 LOC.** +- **Live Query Monitor** — live-first sort, peak-memory column, + keyboard shortcuts, URL-persisted filters, per-run inline expand + for ×N cron-grouped rows, ≥ 30 s stuck-query pulse, copy-SQL, + sound notification removed. +- **Operations Overview cards** on the admin landing page surface + ingest gap + live query activity + slow-query count so the things + operators actually care about don't live three clicks deep. + Tone-coded (default → attention → warning → critical) so a + sustained_loss event jumps out. +- **Stable React keys on dynamic lists** — `DebugPanel`, `CronLiveLog`, + the network metro leaderboard, the query toolbar, and the + custom-field drawer now key off a stable identity instead of array + index. `useSSE` attaches a monotonic `_id` to each line so + append-only feeds (cron progress, query streams) keep stable keys + across re-renders. +- **Accessibility pass** — `FieldGroups` and `FileBrowser` disclosure + widgets are real `