# Sentinel-Command/.github/workflows/deploy.yml (SourceBox-LLC/Sentinel-Command, master)
name: Test & Deploy

# Runs on every push to master.
on:
  push:
    branches:
      - master

# Least privilege for the automatic GITHUB_TOKEN.  Nothing in this
# workflow writes back to the repository (the Fly.io deploy
# authenticates with its own FLY_API_TOKEN secret), so read-only
# access to repository contents — what actions/checkout needs — is
# sufficient.  Widen per job, not here, if a future step needs more.
permissions:
  contents: read

jobs:
  # Backend gate: install the toolchain, then lint, audit dependencies
  # and run pytest, all from the `backend` directory.  Any failing step
  # fails the job.
  test:
    name: Backend tests
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v5

      - name: Install uv
        # astral-sh/setup-uv doesn't publish sliding major-version
        # tags (v7, v8 etc.) the way actions/* and docker/* do — only
        # specific versions like v8.1.0.  Pin to the exact tag and
        # bump manually when there's a reason to.  v8.x is the first
        # Node.js 24-ready major.
        uses: astral-sh/setup-uv@v8.1.0

      - name: Set up Python
        run: uv python install 3.12

      # Installs the project plus its `dev` extra into the uv-managed
      # environment that the `uv run` steps below execute in.
      - name: Install dependencies
        working-directory: backend
        run: uv sync --extra dev

      # Lint before tests — ruff is sub-second and catches the kind of
      # issue (unused imports, import-sort drift, missing `from err` on
      # re-raises) that masks real bugs and clutters review diffs.  Ruleset
      # is conservative (F + E9 + W6 + I + B + UP) so the bar is "real
      # problems only" rather than "every style nit"; tighten in pyproject
      # when the team agrees on each new rule.
      - name: Lint with ruff
        working-directory: backend
        run: uv run ruff check

      # Scan our Python dep tree against the PyPA Advisory DB.
      # pip-audit exits non-zero whenever it finds a known
      # vulnerability, with or without extra flags, so a known-bad
      # transitive dep blocks the deploy.  ``--strict`` additionally
      # fails the audit when dependency collection fails for any
      # package, instead of skipping that package with a warning —
      # nothing gets silently left out of the scan.  Unlike the
      # frontend's npm-audit gate (--audit-level=high --omit=dev),
      # pip-audit doesn't currently expose a severity threshold, so
      # anything in the advisory DB counts, not just high+.  When a
      # CVE shows up with no fix yet, add ``--ignore-vuln <PYSEC-ID>``
      # here with a comment citing the upstream issue and the date we
      # plan to revisit.
      - name: Dependency scan (pip-audit)
        working-directory: backend
        run: uv run pip-audit --strict

      - name: Run tests
        working-directory: backend
        run: uv run pytest -v

  # Frontend gate: audit the production dependency tree, run the
  # Vitest suite, then produce the production bundle.  All npm
  # commands run from the `frontend` directory.
  frontend:
    name: Frontend audit + build
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v5

      # Node 20 with the npm download cache keyed on the lockfile.
      - uses: actions/setup-node@v6
        with:
          node-version: '20'
          cache: npm
          cache-dependency-path: frontend/package-lock.json

      - name: Install dependencies
        working-directory: frontend
        run: npm ci

      # Security gate on what actually ships.
      #
      # --audit-level=high: only high and critical advisories fail
      # the job.  Low/moderate findings (dev-only transitive CVEs
      # that never reach prod, advisories without a fix yet, ...)
      # must not hold up a legitimate deploy; high + critical ones
      # are real and need a fix or a documented `--omit=optional` /
      # override / waiver.
      #
      # --omit=dev: devDependencies are left out because they are
      # not part of the production bundle a user receives.
      - name: npm audit (production deps only, high+critical)
        working-directory: frontend
        run: npm audit --audit-level=high --omit=dev

      # Component tests run BEFORE the build, so a regression they
      # catch cannot ship on the back of a build that happened to
      # succeed.  The suite is ~50 tests today (HelpTooltip,
      # InstallCloudNodeCard, UpgradeModal, EmptyState, the API
      # service helpers, the docs page, and a sanity smoke test) and
      # finishes in <10s on CI, so the extra time is negligible.
      #
      # ``npm test`` is package.json's alias for ``vitest run`` — a
      # one-shot run that exits with a status and never watches.
      # Test files are auto-discovered through the
      # ``include: ["tests/**/*.test.{js,jsx}"]`` pattern in
      # vite.config.js.
      - name: Run frontend tests (vitest)
        working-directory: frontend
        run: npm test

      # Building here means a syntax or type error fails CI now
      # instead of mid-deploy — the frontend counterpart of what
      # pytest catches for the backend.
      - name: Build production bundle
        working-directory: frontend
        run: npm run build

  # Ships the pushed commit to Fly.io once both gates (`test` and
  # `frontend`) have passed: build the image on the runner, push it to
  # Fly's registry, then move the single machine onto it in place.
  deploy:
    name: Deploy to Fly.io
    runs-on: ubuntu-latest
    needs: [test, frontend]
    # All deploy jobs share one concurrency group, so a newer run's
    # deploy cancels an older one that is still in progress instead of
    # running alongside it.
    concurrency:
      group: deploy
      cancel-in-progress: true
    steps:
      - uses: actions/checkout@v5

      # NOTE(review): `@master` is a moving ref, unlike the versioned
      # pins on the other actions in this job — consider pinning to a
      # release tag or commit SHA.
      - uses: superfly/flyctl-actions/setup-flyctl@master

      # Why two-step (build → machine update) instead of plain `fly deploy`:
      #
      # We run a single Fly Machine with a single persistent volume
      # (`opensentry_data` at /data, holding the SQLite DB).  `fly deploy`
      # for this topology is non-deterministic: sometimes it sees the
      # existing machine and updates it in place, sometimes it decides
      # the image config has "drifted enough" and tries to provision a
      # NEW machine alongside the old.  The new-machine path errors
      # immediately because the volume only has one attachment slot:
      #     "creating a new machine in group 'app' requires an
      #      unattached 'opensentry_data' volume."
      # We hit this on consecutive runs 2026-04-28 with strategy=rolling
      # AND strategy=immediate, and `max_unavailable` is rolling-only so
      # it didn't help either.
      #
      # `fly machine update --image …` is the explicit in-place API.
      # It targets a specific machine ID, restarts it on the new image,
      # and the volume stays attached throughout.  It cannot try to
      # create a new machine.  ~30-60s of downtime per deploy (same as
      # `strategy = "immediate"` on a good day) but reliably works.
      #
      # Builder choice has flipped THREE times now:
      #   - Original: --depot=true (depot.dev managed builder).
      #   - 2026-04-28: depot.dev timed out 5 min × 2 in a row
      #     (~10 min wasted per deploy).  Switched to --depot=false
      #     (Fly's standard remote builder).  ~100 deploys worked.
      #   - 2026-05-04 morning: Fly's standard remote builder started
      #     returning `unauthorized` on the WireGuard heartbeat for
      #     valid deploy tokens (Request ID 01KQTE4AHWKB2PAAS8A372EKNP).
      #     Swapped tokens — same failure.  Server-side scope change
      #     or platform incident; either way, CI was wedged.  Tried
      #     --depot=true again briefly: depot built fine but tagged
      #     the manifest under its own internal namespace
      #     (vo4x1o84n7ozql5y), so the subsequent `fly machine update`
      #     got MANIFEST_UNKNOWN looking for the image at the
      #     opensentry-command path.  Mismatch between depot's push
      #     and the two-step pattern we use.
      #   - 2026-05-04 afternoon (current): build locally on the
      #     GitHub runner with docker/build-push-action and push
      #     directly to registry.fly.io.  No third-party builders.
      #     No WireGuard.  Same FLY_API_TOKEN works for the registry
      #     push (proven on the failed depot run — depot's push to
      #     registry.fly.io itself succeeded; the namespace mismatch
      #     was on its side, not the registry's).  Dockerfile is a
      #     standard multi-stage build (node:20-alpine for frontend,
      #     uv:python3.12-bookworm-slim for backend) — no special
      #     build hardware needed.
      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v4

      - name: Log in to Fly registry
        uses: docker/login-action@v4
        with:
          registry: registry.fly.io
          # Fly's registry ignores the username; only the token matters.
          username: x
          password: ${{ secrets.FLY_API_TOKEN }}

      - name: Build + push image to Fly registry
        id: build
        uses: docker/build-push-action@v7
        with:
          context: .
          push: true
          # Tag with the commit SHA so each deploy is uniquely
          # addressable + grep-friendly.  The previous flyctl-managed
          # format was deployment-<ULID> — a SHA is more useful for
          # cross-referencing the image to the source revision when
          # debugging.
          tags: registry.fly.io/opensentry-command:deployment-${{ github.sha }}
          # GitHub Actions cache for layers — ~30s saved on warm
          # builds vs. cold.  scope=deploy keeps it isolated from
          # any future workflows that might also use buildx.
          cache-from: type=gha,scope=deploy
          cache-to: type=gha,scope=deploy,mode=max

      # Re-derives the exact tag pushed by the build step and exposes
      # it as this step's `image` output.  Keep the string in sync
      # with `tags:` above.
      - name: Export image tag for next step
        id: image
        run: |
          IMAGE="registry.fly.io/opensentry-command:deployment-${{ github.sha }}"
          echo "Captured image: $IMAGE"
          echo "image=$IMAGE" >> "$GITHUB_OUTPUT"

      - name: Update machine in place
        run: |
          # With no explicit `shell:`, GitHub runs this script as
          # `bash -e {0}` — errexit but NOT pipefail.  Without pipefail
          # a failing `flyctl machines list` (bad token, API outage) is
          # hidden behind jq's zero exit status and shows up below as a
          # misleading "No machines found".  pipefail makes the step
          # fail on flyctl's own error; -u catches mistyped variables.
          set -euo pipefail
          # We currently run exactly one machine.  If we ever scale to
          # multiple machines (or migrate to LiteFS / Postgres so we
          # don't need a single volume), this script needs to loop.
          MACHINE=$(flyctl machines list -a opensentry-command --json | jq -r '.[0].id')
          # jq prints "null" when the machine list is an empty array.
          if [ -z "$MACHINE" ] || [ "$MACHINE" = "null" ]; then
            echo "::error::No machines found for opensentry-command"
            exit 1
          fi
          echo "Updating machine $MACHINE to $IMAGE"
          flyctl machine update "$MACHINE" --image "$IMAGE" --yes -a opensentry-command
        env:
          FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }}
          # Image tag comes from the `image` step (the export step
          # after docker/build-push-action), not the build step.
          IMAGE: ${{ steps.image.outputs.image }}