From 46adbd9d632e9ff6723c43cdccd45a0d8e5d1a05 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?F=C3=A9lix=20Saparelli?= Date: Tue, 30 Jun 2026 12:06:44 +1200 Subject: [PATCH 1/7] docs(restore): evaluate pgro handoff; add restore-replicas spec + canopy response MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Copy pgro's restore-verification handoff into docs/plans/. - Add .workhorse/specs/public-server/restore-replicas.md (RST): canopy as restore control plane — operator-declared replicas, worklist-driven executor, per-server targeting + restore-health. - Add docs/plans/pgro-restore-replicas-canopy-response.md: conformance verdict + the control-model inversion, for pgro sign-off. Co-Authored-By: Claude Opus 4.8 --- .../specs/public-server/restore-replicas.md | 147 ++++++ .../pgro-restore-replicas-canopy-response.md | 124 +++++ .../pgro-restore-verification-handoff.md | 483 ++++++++++++++++++ 3 files changed, 754 insertions(+) create mode 100644 .workhorse/specs/public-server/restore-replicas.md create mode 100644 docs/plans/pgro-restore-replicas-canopy-response.md create mode 100644 docs/plans/pgro-restore-verification-handoff.md diff --git a/.workhorse/specs/public-server/restore-replicas.md b/.workhorse/specs/public-server/restore-replicas.md new file mode 100644 index 00000000..aa6991ef --- /dev/null +++ b/.workhorse/specs/public-server/restore-replicas.md @@ -0,0 +1,147 @@ +--- +id: RST +--- + +# Managed restore replicas + +Canopy is the control plane for a fleet's *managed restore replicas*: standing replicas that Canopy decides should exist and keeps restored from the latest backups, driven through a restore consumer. 
+An external restore consumer — first-party infrastructure that restores backups into working Postgres replicas — is driven entirely by Canopy: Canopy declares which replicas should exist, hands out the snapshot to restore and short-lived read-only credentials for each, and records the restorability of every replica as the strongest backup-health signal. + +## Scope + +This spec covers *managed* restore replicas only: the standing replicas Canopy decides should exist and keeps current, and the restore-health signal they produce. + +It does not cover an operator restoring a backup by hand. +An operator performing disaster recovery or an ad-hoc restore selects a specific snapshot for a specific server and restores it through that server's own device tooling and credentials — the existing per-server restore path, unchanged by this spec. +That path is operator-driven and server-scoped: the operator chooses what to restore and where, and Canopy only issues the read-only credentials and snapshot information for that one server. +Managed replicas are the opposite mode: Canopy chooses what should be restored, continuously, with no operator selecting each one. +The two modes share Canopy's read-only credential issuance and snapshot authority; they differ in who decides what gets restored. + +## Why it exists + +A backup is only as good as its last successful restore. +Producing snapshots (a device backed up) and confirming they landed in the repo (a snapshot exists) are weaker guarantees than actually restoring one into a live database. +Canopy already knows every group, every server, every backup type, and the latest snapshot for each — so it is the natural authority on *what should be restored*. +Centralising that decision in Canopy eliminates the long-lived AWS keys a restore consumer would otherwise hold, makes the restore consumer a stateless executor of Canopy's intent, and closes the lifecycle loop end-to-end: produced, persisted, restorable. 
+ +## Actors + +A **restore consumer** is first-party infrastructure that restores backups and reports their health. +It holds no standing access to any backup repo and stores no list of what to restore: it asks Canopy what replicas should exist, restores them, and reports back. +It owns only the mechanics of restoration — how a replica is provisioned, where it runs, how much storage it gets, when it is torn down. + +An **operator** declares, through Canopy, which replicas should exist and why. + +Canopy owns the *what* and the *why* (which group, which server, which type, to what end, how fresh) and the *authority* (which snapshot, which credentials, is it restorable). +The consumer owns the *how*. +This boundary is load-bearing: Canopy never models a consumer's runtime placement, and a consumer never decides on its own what to restore. + +## Identity and authorization + +A restore consumer authenticates as a single device holding the `backup-restore` role. +The role is generic: any future restore consumer uses the same role with its own declared replicas. +A `backup-restore` device has no implicit server and no implicit group; it is not a member of any group it reads. + +The role is read-only by contract, enforced at the API: + +- A `backup-restore` caller requesting backup (write) credentials is rejected. + The read-only guarantee is server-enforced, so a compromised consumer cannot pivot to writing or poisoning a repo. +- A `backup-restore` caller may obtain credentials and the worklist only for a `(group, type)` it has been authorised for. + +Authorization is the set of declared replicas (below): a consumer is authorised for exactly the `(group, type)` pairs that appear in its enabled replica declarations. +There is no separate grant object — declaring a replica *is* the authorization to read what that replica needs. 
+ +A device reaches this role through one-off operator promotion, the same path a release-publishing device uses; no fleet-enrolment flow is involved. +Either transport Canopy already accepts for devices — tailnet identity or a client certificate — satisfies the role; the role, not the transport, is the contract. + +## Declared replicas + +An operator declares replicas against Canopy. +Each declaration carries: + +- the **group** whose repo holds the backups; +- the **type** of backup to restore; +- a **server** within the group, or all servers in the group when none is named; +- an **intent** describing what the replica is for; +- a human-readable **name**; +- a **freshness** bound: the maximum age of the restored snapshot before the replica is considered out of date and should be refreshed or re-verified; +- whether the declaration is **enabled**. + +Intent is an open set; unrecognised intents are preserved verbatim rather than rejected, so a consumer may advertise intents Canopy does not model. +The well-known intents are: + +- **verify** — a transient replica restored solely to prove the snapshot is restorable, then discarded; re-run on the freshness cadence. +- **analytics** — a persistent replica kept running for querying, refreshed to the latest snapshot on the freshness cadence. +- **disaster-recovery** — a periodic rehearsal of the full recovery path: a replica restored the way a real recovery would be, checked as a viable stand-in for the server, then discarded. It is the managed, automated counterpart to the operator-driven recovery in [Scope](#scope), not the recovery event itself. + +A declaration scoped to a whole group expands to one replica per current server in that group. +Servers joining or leaving a group change what the consumer is asked to maintain, with no per-server operator action. + +Declarations are managed through the operator interface (create, edit, enable/disable, delete) and are audited. 
+Deleting a declaration stops the consumer being asked to maintain that replica and revokes its authorization for that `(group, type)` if no other declaration covers it; recorded restore-health history is retained. + +## The worklist + +A restore consumer fetches its complete desired state from Canopy in one request, scoped to the calling consumer. +Canopy expands the consumer's enabled declarations against the current servers and the latest known snapshot for each, and returns one entry per concrete replica: + +- the declaration's identifier, group, server, type, intent, name, and freshness; +- the **snapshot to restore**: the snapshot identifier and its timestamp, or empty when no successful backup is yet known for that server and type; +- the repo coordinates needed to locate the backups (storage, bucket, prefix, region). + +The worklist does not carry credentials or the repo password. +The consumer reconciles the worklist against what it is actually running — creating, refreshing, and tearing down replicas to match — and is responsible for converging on the desired state over time. + +### Snapshot authority + +The snapshot Canopy hands out for a `(server, type)` is the snapshot identifier of that server's most recent successful backup run of that type. +This is the same snapshot the operator interface shows as the server's latest. +Canopy's independent repo inventory corroborates the snapshot's existence and timestamp; it is not currently the source of the identifier. + +## Credentials + +A consumer obtains credentials per `(group, type)` as it works, not for the whole fleet at once. +Canopy verifies the caller has an enabled declaration covering that `(group, type)`, then issues: + +- short-lived read-only object-storage credentials scoped to the group's repo; +- the repo password. + +The credentials permit reading the repo and nothing else; they cannot write, overwrite, or delete. +Each issuance is audited. 
+Absence of a covering declaration is a definitive refusal, not a transient error, and a consumer surfaces it as a clear failure for the operator to diagnose by inspecting the declaration in Canopy. + +The 1-hour lifetime of an issued credential does not bound restore duration: a consumer refreshes credentials as needed across a long restore. + +## Restore-health reporting + +A consumer reports the outcome of each replica back to Canopy. +A report carries: + +- the declaration, group, server, and type it concerns; +- the **snapshot** that was restored, joining the report to the produced-and-persisted record for that snapshot; +- the **outcome** — restored-and-healthy, or failed — and, on failure, an error description; +- whether the restored database came up healthy, and its Postgres major version; +- when the restore was observed; +- the object-storage traffic the restore moved. + +Restored-and-healthy means the snapshot restored, the database started, and the consumer's readiness checks passed — a stronger statement than a snapshot merely existing. +A failure covers any stage: the restore itself, the database failing to come up, or a readiness check failing. + +Reports are retained indefinitely as an audit trail. + +## Alerting + +A failed or overdue restore-health report is a group-level incident that pages regardless of any individual server's monitoring state, because an unrestorable backup is a control-plane and data-safety concern, not one server's operational noise. + +A failure raises a group-scoped restore-verification alert identifying the affected server and snapshot. +Each server's restore-health is tracked independently, so one server's failed restore does not mask or merge with another's. +The alert recovers when that server's next report for the same type is healthy. 
+ +A replica with no recent healthy report within its freshness bound is overdue and raises the same alert; Canopy detects this on a periodic sweep rather than waiting for a report that never arrives. + +## Out of scope + +- How a consumer provisions, runs, names, or tears down a replica. +- A consumer's runtime placement, storage sizing, or scheduling. +- Scoping object-storage credentials below the granularity of a group's repo: one repo holds all of a group's servers' snapshots, so credentials are necessarily group-wide while targeting and reporting are per-server. +- Longer-lived or non-chained credentials: a consumer refreshes within a restore, so the per-issuance lifetime is not a constraint. diff --git a/docs/plans/pgro-restore-replicas-canopy-response.md b/docs/plans/pgro-restore-replicas-canopy-response.md new file mode 100644 index 00000000..59e8ef8b --- /dev/null +++ b/docs/plans/pgro-restore-replicas-canopy-response.md @@ -0,0 +1,124 @@ +# Canopy response to the PGRO restore-verification handoff + +**From:** canopy +**To:** pgro +**Re:** `pgro/docs/canopy-handoff.md` (copied here as +`pgro-restore-verification-handoff.md`) +**Status:** needs pgro sign-off on the control-model inversion (§3) before +canopy freezes wire shapes and before pgro builds. Canopy will build its side +against the model below; pgro adopting the inverted executor model is pgro's +call. + +The spec for canopy's side is `.workhorse/specs/public-server/restore-replicas.md`. + +--- + +## 1. 
The handoff is conformant + +Every load-bearing claim about canopy's current code checks out: the tailnet +node-identity auth and `TAILSCALE_REQUIRED_TAG` gate, the role-gating +extractor macro (note: an `admin` device passes every role-gated route), the +`securitySchemes` block, the `/backup-credentials` + `/backup-target` + +`/backup-report` handlers and their request/response shapes, the +session-policy → per-bucket STS role → repo-password flow, the `backup_runs` / +`backup_repo_snapshots` schemas, and the per-server-vs-group alerting split. +Three small inaccuracies, none of which change the design: + +- Device roles are four, not three (`untrusted` is the auto-created pre-trust + state). The role column is plain `TEXT` with no `CHECK`, so adding + `backup-restore` is a code change, not a schema migration. "Cert minting" + for the role is just operator trust-promotion (the releaser model) — no new + enrolment machinery, exactly as you guessed. +- §4.4.1 reason 7 is wrong: `run_id` is *not* shared between + `/backup-credentials` and `/backup-report`; the issuance audit row carries + no `run_id`. The "don't reuse `/backup-report`" conclusion still holds on the + other six reasons. +- The group-level alerting plumbing you call "concrete in PR #225" is already + merged: `raise_group_event` exists and `restore-verification` is already a + defined alert ref. The alert side is one call. + +## 2. Two corrections that changed the wire shapes + +These came out of review and both made it into the model below: + +- **Canopy supplies the snapshot id.** Canopy already knows the latest snapshot + per `(server, type)` (the latest successful `backup_runs` row). You should + not list the repo to discover what to restore — canopy hands it to you. +- **Restore is per-server, not per-group.** A group holds many servers, each + with its own snapshots inside the one shared per-group repo. 
Credentials are + necessarily group-wide (one kopia repo per group bucket), but *targeting* and + *health reporting* are per-server. `backup_restore_checks` and the + restore-health report carry `server_id`. + +## 3. The inversion: canopy drives, pgro executes + +This is the part that needs your sign-off, because it changes pgro's +architecture. Rather than pgro statically defining what it restores (a +CRD-defined list of groups/servers) and pulling per-group, **canopy becomes +the source of truth for which replicas should exist, and pgro reconciles +against it.** + +- An operator declares **replicas** in canopy: `(group, [server | all], type, + intent, name, freshness)`. The declaration is both the work item and the + authorization — there is no separate grant object. +- pgro fetches its **entire desired state in one call** — + `GET /restore-worklist`, scoped to the calling consumer — and gets one entry + per concrete replica: declaration id, group, server, type, intent, freshness, + the snapshot to restore (`{snapshot_id, snapshot_at}` or empty), and the repo + coordinates. +- pgro **reconciles**: create / refresh / tear down replicas to match the + worklist, fetching `POST /restore-credentials {group, type}` per group as it + goes. +- pgro **reports health** per replica: `POST /restore-verification` with the + declaration, group, server, type, restored snapshot, outcome, replica + health, Postgres version, and S3 traffic. + +**The boundary:** canopy owns *what / why / how-fresh*; pgro owns *how* — +provisioning, placement, storage sizing, scheduling, teardown. Canopy never +models your runtime; you never decide what to restore. + +**Intents** are an open set (`verify`, `analytics`, `disaster-recovery`, plus +anything you advertise). `verify` is transient (restore, prove, discard, re-run +on cadence); `analytics` is a persistent replica refreshed to latest on +cadence; `disaster-recovery` is a rehearsal. 
If canopy modelling your +*analytics/DR* replicas (not just +verification) is more centralisation than you want, say so — that is exactly +the boundary this sign-off is about. + +What pgro keeps from the original handoff: one canopy device, promoted once; +read-only by contract (write creds rejected at the API for the role); +best-effort reporting that never blocks restore progress; no `consumer_instance` +(one device, per-replica audit lives in your own records). + +## 4. Endpoint surface (shapes to be frozen on sign-off) + +- `GET /restore-worklist` → desired replicas (expanded per server) + per-group + repo coordinates + the snapshot to restore for each. +- `POST /restore-credentials {group, type}` → short-lived read-only creds + + repo password. Authorized iff an enabled declaration covers `(group, type)`. + `purpose=backup` rejected for this role. +- `POST /restore-verification {replica, group, server, type, snapshot_id, + outcome, error?, replica_healthy, postgres_version?, observed_at, s3_*}` → + per-server restore-health; 204 on success. + +## 5. Appendix A (bestool) deltas + +The original A.2/A.3 (`restore_credentials`, `restore_target`) are replaced by +a worklist fetch plus per-group `restore_credentials`; `restore_target` +collapses into the worklist. A.1 `RestoreVerification` gains `server_id` (and a +declaration id). A.4 `restore_verification` is unchanged in spirit. Canopy will +restate the exact bestool deltas once you've signed off on §3 and the shapes +are frozen. + +## 6. What canopy is building now + +Two PRs: + +1. **Control + access** — `backup-restore` role; the declared-replica model + + operator UI; `GET /restore-worklist`; `POST /restore-credentials`. +2. **Health** — `backup_restore_checks` + `POST /restore-verification`; + per-server group-level alert routing + recovery; the overdue-freshness sweep; + restore-health surfacing in the operator UI. 
+ +Ping canopy if §3 is contentious; otherwise canopy freezes the shapes at the +end of PR1 and hands the restated Appendix A to bestool. diff --git a/docs/plans/pgro-restore-verification-handoff.md b/docs/plans/pgro-restore-verification-handoff.md new file mode 100644 index 00000000..3f5603ec --- /dev/null +++ b/docs/plans/pgro-restore-verification-handoff.md @@ -0,0 +1,483 @@ +# Handoff to canopy: PGRO restore-verification integration + +**From:** pgro +**To:** canopy (then canopy → bestool for the appendix) +**Status:** waiting on canopy. pgro will not start building until the +items in §4 land (or are contract-frozen) and the bestool additions in +§A ship in a published crate. + +This document is the actionable subset of pgro's full integration spec +(`pgro/docs/canopy-backup-integration.md`). Read that for the +why-it-looks-like-this; read this for what to build. Anything contentious +here gets bounced back to pgro before implementation. + +--- + +## 1. Context, brief + +pgro restores tamanu-postgres physical backups out of kopia repos into +working postgres replicas. Today it authenticates with hand-set, +long-lived AWS keys + repo password in a k8s Secret — the exact +long-lived-creds pattern the canopy backup-credentials system exists to +eliminate. Bringing pgro under canopy gets two things: + +1. **Eliminates the static keys** on the pgro side. canopy mediates + restore creds the same way it mediates device backup creds. +2. **Closes the lifecycle loop end-to-end.** A successful pgro restore + *proves the snapshot is restorable* — signal 3, the strongest + backup-health signal there is, stronger than signal 2's + "a snapshot exists in the repo". pgro reports per-replica restore + outcomes back to canopy; a failed/stale restorability check becomes + a high-severity group-level alert. 
+ +This is the integration the canopy backup-credentials plan calls out in +§"External restore consumers + restore-verification (PGRO)" — pgro is +ready to build its side once canopy's side exists. + +--- + +## 2. Architecture pgro is building toward + +Read this so the wire-shape and identity choices below make sense in +context. + +- **One stable pgro operator Pod** sits on the tailnet and is the + single canopy device. It speaks to canopy directly. +- **Each kopia restore is a k8s Job** spawned by the operator. Each Job + Pod runs two containers: kopia, and a pgro-published proxy sidecar. +- **The proxy sidecar runs the bestool S3P loopback re-signing proxy** + (`bestool_kopia::proxy::spawn` from the published `bestool-kopia` + crate). kopia is pointed at `127.0.0.1` with dummy keys; the proxy + holds the live STS creds and re-signs each request. Same model as + bestool device backups and canopy's own maintenance jobs. +- **The proxy's `CredentialProvider`** doesn't call canopy directly. + It calls an in-cluster HTTP endpoint on the operator + (`/internal/restore-creds`), and the operator forwards to canopy. + This is forced by the identity model (§3) — Job Pods are not canopy + devices and have no way to authenticate. +- **`bestool_canopy::CanopyClient` auto-probes tailnet vs mTLS.** pgro + uses the tailnet path (via the Tailscale sidecar on the operator + Pod); mTLS is an optional fallback if a device cert is provisioned. + +Consequences worth flagging up front: + +- **The chained-STS 1-hour cap is a non-issue.** The proxy refreshes + creds between requests; long restores are bounded by canopy + reachability, not by any single issuance lifetime. pgro does not need + non-chained / direct-IRSA creds. +- **kopia never sees real AWS credentials.** It carries dummy keys and + talks to `127.0.0.1`. The `--session-token` / `AWS_SESSION_TOKEN` + question is moot. + +--- + +## 3. 
Identity model: one operator-Pod tailnet device + +canopy's tailnet auth identifies callers by **tailscale node identity** +(`commons-servers/src/device_auth/tailnet.rs:52` — looks up the source +IP via the tailnet directory, keys into `devices.tailscale_node_id`, +auto-creates an `Untrusted` device row on first contact). Tags are only +a coarse admission gate (`TAILSCALE_REQUIRED_TAG`). + +That means **one tailnet node = one canopy device record**. Per-Job +Tailscale sidecars would create one `Untrusted` row per Job pod, +forever — unworkable. + +So pgro will run **exactly one Tailscale sidecar**, on the operator +Pod, and pgro is **one canopy device**: + +- First contact creates an `Untrusted` row. +- canopy (admin) promotes it once to role `backup-restore` (working + name — see §4.1). +- The operator brokers everything for Job Pods over the in-cluster + network, so Job Pods never need their own canopy identity. + +The mTLS path is symmetric — one operator-Pod-mounted device cert, +one canopy device, same identity. Either path works; canopy's auth +mechanism is the only thing that differs. + +--- + +## 4. What canopy needs to build + +Five items. They depend on each other roughly in the order listed. + +### 4.1 New device role: `backup-restore` + +Add a fourth role alongside `server` / `releaser` / `admin`. + +- Generic, not pgro-specific. Any future restore-only consumer (an + external auditor's verifier, a separate test-restore harness) shares + the same role with its own external-restore grant. +- No server / group binding. Like `releaser-device`, the role itself + doesn't imply membership in any group. +- Add to the device-role enum, `securitySchemes` in + `crates/public-server/openapi.json`, route-gating macros, and the + cert-issuance flow (one-off operator-driven cert minting — does not + need the TPM-bound `canopy register` enrolment flow that bestool + servers use; the `releaser-device` provisioning path is the right + model). 
+- **`purpose=backup` must be rejected at the API layer for this role.** + A `backup-restore`-role caller hitting `/backup-credentials` with + `purpose=backup` gets `403`/`409`, full stop. The role's read-only + contract is server-enforced, not consumer-promised, and a compromised + pgro can't pivot to writing/poisoning. + +This is the biggest single blocker. Until this lands pgro cannot +authenticate at all. + +### 4.2 Group-aware credentials + target endpoints + +For server-bound roles, `device → server → group_id` resolves the group +implicitly. A `backup-restore`-role device has no implicit server and +no implicit group, so the request body has to carry `group`. + +Two viable shapes; canopy picks: + +- **(a) Add `group: Uuid` to the existing `CredentialsArgs` / + `BackupTarget` paths** and accept it only from `backup-restore`-role + callers. Smaller diff; mildly violates the principle that + device-authenticated requests don't put authz fields in the body. +- **(b) Sibling endpoints**: e.g. `POST /restore-credentials` and + `GET /restore-target?group=...`. Clean separation; bestool-canopy + gets two new methods rather than overloaded ones (matches the + appendix bestool deltas). + +pgro lightly prefers (b) for clarity, but defers to canopy. + +Behaviour either way: canopy verifies the `(consumer, group, type)` +external-restore grant (§4.3), then runs the same restore session +policy + per-bucket role + repo-password lookup it does today, and +returns `BackupCredentials` + `BackupTarget` unchanged. + +### 4.3 The external-restore grant + +The operator-authorised, audited authz primitive that says "consumer C +may read group G's type T, read-only." + +- Per `(consumer_device_id, group_id, type)`. New table; canopy picks + the name (`backup_restore_grants` or similar). +- Operator-authorised via the existing private-server UI or `canopy + ctl` CLI; audited. +- Checked at request time for `/restore-credentials` (4.2) and + `/restore-verification` (4.4). 
Absence is a clear 403, not a + transient error. +- pgro will surface that 403 as a clear `Failed` phase + Warning event + on the replica; the operator who set up the replica diagnoses by + going to canopy and inspecting / creating the grant. + +### 4.4 Restore-verification ingest endpoint + `backup_restore_checks` + +#### 4.4.1 Why NOT reuse `POST /backup-report` + +`/backup-report` already accepts `{ purpose: "restore", outcome, +snapshot_id, error, run_id }` — for **devices**. The shape looks close +to what pgro wants, but reusing it is wrong for seven concrete reasons: + +1. **Identity is auth-context-derived, not body-derived.** The handler + resolves `device_id`, `server_id`, and `group_id` from the + authenticated mTLS context (`crates/public-server/src/backup.rs:495`), + not the body. A `backup-restore`-role caller has no implicit server + or group; threading them through the body would break the invariant + that a device can't report a run as some *other* group. +2. **Schema is device-shaped.** `backup_runs` has `device_id UUID NOT + NULL REFERENCES devices(id)` and `group_id NOT NULL REFERENCES + server_groups(id)`. The pgro device row exists but it's not + "running" a backup for any server; satisfying the FKs requires + either sentinel data or schema changes. +3. **Two different "restore" meanings collide on `purpose`.** A device + with `purpose=restore` (e.g. `bestool canopy restore` for clone / + DR-test on the same fleet) writes to `backup_runs`. That is NOT a + signal-3 verification — it's a normal device-side restore and + should not raise a group-level "the backup isn't restorable" + incident. `purpose=restore` alone is not a sufficient discriminator + between device-restore-runs and signal-3 verifications. +4. **Alerting paths diverge.** `/backup-report` failure feeds per-server + staleness (signal 1, server-scoped). Signal 3 must feed group-scoped + `raise_group_event(ref = "restore-verification")` bypassing + per-server `is_monitored`. +5. 
**Side-effects don't match.** The handler clears `BackupRequest` + (`backup.rs:534`) so the heartbeat stops re-emitting "back up now" + for that server. Irrelevant for a pgro report. +6. **Payload shape is wrong.** `ReportArgs` carries `bytes_uploaded` + + `s3_*_bytes` (good — pgro's proxy emits those too) but lacks + `replica_healthy`, postgres major version, `observed_at` — the + load-bearing fields that make signal 3 stronger than signal 2. +7. **`run_id` semantics don't transfer.** For devices, `run_id` is the + same UUID across `/backup-credentials` (issuance audit) and + `/backup-report`, minted at run start, dup → 409. pgro's natural + identity is the snapshot being verified, not a per-run UUID; a + pgro-minted run UUID has no cross-table linkage to + `backup_credential_issuances`. + +By the time `/backup-report` has been extended to take `group_id`, +relaxed (or split off) the FKs, branched the handler on actor type, +routed failures differently, and gated the `BackupRequest::clear` +side-effect, the handler has forked. Cleaner to expose a sibling. + +#### 4.4.2 New endpoint + +Working title `POST /restore-verification` (canopy picks the name). +Authenticated as `backup-restore`-role; gated by the external-restore +grant for the body's `(group, type)`. + +Request body (proposed): + +```json +{ + "group": "", + "type": "tamanu-postgres", + "snapshot_id": "", + "outcome": "success" | "failure", + "error": "", + "replica_healthy": true, + "postgres_version": "", + "observed_at": "", + "s3_sent_raw_bytes": 12345, + "s3_sent_payload_bytes": 12300, + "s3_received_raw_bytes": 98765, + "s3_received_payload_bytes": 98700 +} +``` + +- `snapshot_id` is the join key into `backup_repo_snapshots` / + `backup_runs`. Load-bearing for closing the loop *backed up → + persisted → restorable*. +- `outcome=success` with `replica_healthy=true` means kopia restored + successfully AND postgres came up AND the operator's readiness gate + passed. 
+- `outcome=failure` with an `error` string covers restore-job failure, + deployment-never-ready, postgres-version mismatch, etc. +- S3 byte tallies come from the bestool proxy's `TrafficStats` (already + there in `bestool-kopia`). pgro emits them on success and failure, + same as `/backup-report`. + +#### 4.4.3 New table: `backup_restore_checks` + +Roughly: + +```sql +CREATE TABLE backup_restore_checks ( + id BIGSERIAL PRIMARY KEY, + consumer_device_id UUID NOT NULL REFERENCES devices(id), + group_id UUID NOT NULL REFERENCES server_groups(id), + type TEXT NOT NULL, + snapshot_id TEXT NOT NULL, + outcome TEXT NOT NULL CHECK (outcome IN ('success','failure')), + error TEXT, + replica_healthy BOOLEAN NOT NULL, + postgres_version TEXT, + observed_at TIMESTAMPTZ NOT NULL, + s3_sent_raw_bytes BIGINT, + s3_sent_payload_bytes BIGINT, + s3_received_raw_bytes BIGINT, + s3_received_payload_bytes BIGINT, + reported_at TIMESTAMPTZ NOT NULL DEFAULT now() +); +CREATE INDEX ON backup_restore_checks (group_id, type, observed_at DESC); +CREATE INDEX ON backup_restore_checks (snapshot_id); +``` + +Exact shape is canopy's call. pgro just needs the endpoint to accept +the body in §4.4.2 and reject 4xx clearly on grant/role failure. + +#### 4.4.4 Alert routing + +Plumb `outcome=failure` (and staleness — see §6 / "Open questions") +into: + +```rust +raise_group_event( + conn, group_id, + ref: "restore-verification", // const in database::backup::refs + severity: Severity::Error, // group-level; bypasses per-server is_monitored + description: ..., + message: ..., + active: true, +); +``` + +Already concrete in PR #225, no new plumbing on the alerting side — +just call it from the new handler. Recovery (`active: false`) on the +next successful report for the same `(group, type)`. + +### 4.5 Wire-type stability + +For pgro's side: please freeze the wire shapes for §4.2 and §4.4.2 +before merging the bestool changes (Appendix A). 
Mid-flight name churn +on `BackupCredentials` / `BackupTarget` fields would also cause +collateral damage — pgro is going to consume `bestool_canopy`'s +existing types verbatim, so renames there propagate. + +--- + +## 5. What pgro is NOT asking for + +These have come up in earlier rounds and pgro has explicitly **decided +against** them: + +- **Non-chained / longer-lived STS creds for pgro.** The proxy refreshes + out-of-band; the 1-hour chained cap is fine in practice. Don't burn + effort here on pgro's account. (canopy may still want it for its own + reasons — that's a canopy call.) +- **Reusing `/backup-report` for signal 3.** §4.4.1 covers why. +- **Server-side cred caching across pgro Jobs.** pgro's operator + already caches in-process for the broker (§Architecture); canopy + doesn't need to. +- **A new auth federation (OIDC).** pgro is happy with mTLS + tailnet. + OIDC would be useful for *other* future first-party consumers and + canopy can pursue it independently, but pgro doesn't need it. + +--- + +## 6. Open questions canopy owns + +Pick before / during implementation; flag back to pgro if any of these +change pgro-visible shape. + +1. **4.2 (a) vs (b):** group in body of existing endpoints, or sibling + `/restore-*` endpoints. pgro mildly prefers (b). +2. **Naming.** Role: `backup-restore` (pgro suggestion) vs whatever + canopy prefers. Endpoint: `/restore-verification` vs + `/backup-restore-check` vs… Table name: `backup_restore_checks` vs + `restore_verifications`. pgro doesn't care, just needs them stable + before bestool ships. +3. **Staleness detection for signal 3.** A successful report is + straightforward. "Stale" (no recent successful verification for a + `(group, type)`) is a periodic check canopy needs to run — out of + pgro's scope, but in scope for the alerting story. Define the + cadence + threshold canopy-side. +4. **`backup_restore_checks` retention.** pgro suggests indefinite + (audit trail, small rows); canopy decides. +5. 
**Per-Pod identity for audit.** pgro is intentionally one canopy + device; per-Job audit lives in pgro's own k8s record (CRD status, + events). If canopy wants to split per-Pod, pgro can include a + `consumer_instance` opaque string in the body — but the cost is + real and the value is unclear. Default: don't. +6. **Cert-issuance flow for the new role.** pgro will be tailscale-only + in normal operation; mTLS cert is the fallback. If canopy doesn't + want to build cert minting for the new role at all (tailscale-only, + period), pgro is fine with that — just confirm. + +--- + +## 7. Pgro-side commitments (so canopy knows what to expect) + +- pgro will be one canopy device. First contact creates `Untrusted`; + canopy admin promotes once. +- pgro will report `outcome=success` only when the deployment actually + passes the readiness gate (not on bare-kopia-success). Failure + reporting is best-effort and never blocks restore progression. +- pgro will report at-most-once-per-restore, with retry across reconciles + until the report lands (status-tracked). +- pgro will not write or delete from any bucket. The proxy is fed by + the restore session policy; even if pgro is compromised it has no + write capability (compounded by §4.1's role-level `purpose=backup` + rejection). + +--- + +## Appendix A — Hand off to bestool + +Once §4 has landed (or shipped to a feature branch with frozen wire +shapes), canopy passes this list to bestool. All four are additive in +the published `bestool-canopy` crate; no breaking changes to existing +consumers, no new crate. `bestool-kopia` needs no changes. 
+ +### A.1 `bestool_canopy::backup::RestoreVerification` (new) + +Public wire type mirroring §4.4.2: + +```rust +#[derive(Debug, Clone, Serialize)] +pub struct RestoreVerification<'a> { + pub group: Uuid, + pub r#type: &'a str, + pub snapshot_id: &'a str, + pub outcome: RunOutcome, // reuse existing enum + pub error: Option<&'a str>, + pub replica_healthy: bool, + pub postgres_version: Option<&'a str>, + pub observed_at: jiff::Timestamp, + pub s3_sent_raw_bytes: Option<u64>, + pub s3_sent_payload_bytes: Option<u64>, + pub s3_received_raw_bytes: Option<u64>, + pub s3_received_payload_bytes: Option<u64>, +} +``` + +Field-renaming-via-serde to whatever canopy lands; the Rust shape is +indicative. + +### A.2 `CanopyClient::restore_credentials(base, type, group) -> Result<BackupCredentials>` + +Group-aware variant of `backup_credentials`. Posts to whichever +endpoint canopy picks in §4.2 (a) or (b); the response type is the +existing `BackupCredentials` unchanged. + +```rust +pub async fn restore_credentials( + &self, + base_url: &Url, + backup_type: &str, + group: Uuid, +) -> Result<BackupCredentials> { ... } +``` + +### A.3 `CanopyClient::restore_target(base, group) -> Result<TargetOutcome>` + +Same group issue for target lookup. Response is the existing +`TargetOutcome` (Ready/Dormant) — Dormant maps to grant-absent or +group-unconfigured. + +```rust +pub async fn restore_target( + &self, + base_url: &Url, + group: Uuid, +) -> Result<TargetOutcome> { ... } +``` + +### A.4 `CanopyClient::restore_verification(base, &RestoreVerification) -> Result<()>` + +Posts to canopy's new ingest endpoint. 204 on success; surface +4xx body as error. + +```rust +pub async fn restore_verification( + &self, + base_url: &Url, + report: &RestoreVerification<'_>, +) -> Result<()> { ... } +``` + +### A.5 What does NOT change in bestool + +- `bestool-kopia` — no changes. `proxy::spawn`, `CredentialProvider`, + `Credentials`, `TrafficStats` are exactly what pgro consumes. 
+- `CanopyClient::new(...)` — already accepts `device_key_pem: + Option<&str>`, so pgro's tailscale-only operator works as-is. +- `Purpose::Restore` — already there. +- `BackupCredentials` / `BackupTarget` shapes — pgro consumes these + verbatim; please don't reshape them mid-flight (see §4.5). + +### A.6 Suggested release shape + +One bestool-canopy minor version bump containing all four additions, +landing after canopy's endpoints exist on at least a feature branch +with frozen wire shapes. Tag and publish; pgro depends on `^X.Y`. + +--- + +## Next round + +Once §4 + Appendix A have shipped, ping pgro. pgro will: + +1. Read the as-implemented wire + types (any drift from this doc is + fine, just needs to be visible). +2. Re-evaluate the open questions in + `pgro/docs/canopy-backup-integration.md` and tighten the spec to + match what canopy actually shipped. +3. Start building Part 1 (canopy client wiring + CRD field + sidecar + image) and Part 2 (the restore-verification reporter) against the + real surfaces. From 95bf1362d95c4f1ba6e3d32f5baf5c626e16387f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?F=C3=A9lix=20Saparelli?= Date: Tue, 30 Jun 2026 14:10:10 +1200 Subject: [PATCH 2/7] docs(restore): invert unsupported-intent handling to capability registration MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Consumer registers supported intents on start/change; canopy persists them, constrains the declaration UX, dispatches only matching worklist entries, and surfaces capability-shrink gaps to operators — instead of the consumer reactively reporting unsupported intents (which conflated a capability mismatch with an unrestorable-backup page). Resolves pgro's post-sign-off open question. Adds POST /restore-capabilities to the PR1 surface; /restore-verification outcome stays success/failure. 
Co-Authored-By: Claude Opus 4.8 --- .../specs/public-server/restore-replicas.md | 18 +++++++- .../pgro-restore-replicas-canopy-response.md | 46 ++++++++++++++++--- 2 files changed, 56 insertions(+), 8 deletions(-) diff --git a/.workhorse/specs/public-server/restore-replicas.md b/.workhorse/specs/public-server/restore-replicas.md index aa6991ef..c76d5a36 100644 --- a/.workhorse/specs/public-server/restore-replicas.md +++ b/.workhorse/specs/public-server/restore-replicas.md @@ -54,6 +54,20 @@ There is no separate grant object — declaring a replica *is* the authorization A device reaches this role through one-off operator promotion, the same path a release-publishing device uses; no fleet-enrolment flow is involved. Either transport Canopy already accepts for devices — tailnet identity or a client certificate — satisfies the role; the role, not the transport, is the contract. +## Consumer capabilities + +A restore consumer advertises the set of intents it can satisfy, and registers it with Canopy when it starts and whenever it changes. +Canopy stores the set against the consumer and treats it as the authority on what that consumer can be asked to do. + +The registered set governs two things: + +- **What can be declared.** Canopy offers operators the intents the chosen consumer supports when they declare a replica. +- **What is dispatched.** A consumer's worklist includes only entries whose intent it currently supports; Canopy never asks a consumer to satisfy an intent it has not advertised. + +When a consumer's set grows, the new intents become available for operators to assign, so a consumer gaining a capability is reflected without operator guesswork. +When a consumer's set shrinks, any enabled declaration whose intent is no longer supported becomes a *gap*: Canopy drops it from the worklist immediately and surfaces it to operators as a declaration no consumer can currently satisfy, to reassign or retire. 
+A gap is a configuration state shown to the operator, not a restore-health incident; the backups themselves are unaffected. + ## Declared replicas An operator declares replicas against Canopy. @@ -74,6 +88,8 @@ The well-known intents are: - **analytics** — a persistent replica kept running for querying, refreshed to the latest snapshot on the freshness cadence. - **disaster-recovery** — a periodic rehearsal of the full recovery path: a replica restored the way a real recovery would be, checked as a viable stand-in for the server, then discarded. It is the managed, automated counterpart to the operator-driven recovery in [Scope](#scope), not the recovery event itself. +A declaration's intent must be one the chosen consumer supports (see [Consumer capabilities](#consumer-capabilities)); a declaration whose intent is unsupported is a gap, surfaced to the operator and never dispatched. + A declaration scoped to a whole group expands to one replica per current server in that group. Servers joining or leaving a group change what the consumer is asked to maintain, with no per-server operator action. @@ -83,7 +99,7 @@ Deleting a declaration stops the consumer being asked to maintain that replica a ## The worklist A restore consumer fetches its complete desired state from Canopy in one request, scoped to the calling consumer. 
-Canopy expands the consumer's enabled declarations against the current servers and the latest known snapshot for each, and returns one entry per concrete replica: +Canopy expands the consumer's enabled declarations — those whose intent the consumer currently supports — against the current servers and the latest known snapshot for each, and returns one entry per concrete replica: - the declaration's identifier, group, server, type, intent, name, and freshness; - the **snapshot to restore**: the snapshot identifier and its timestamp, or empty when no successful backup is yet known for that server and type; diff --git a/docs/plans/pgro-restore-replicas-canopy-response.md b/docs/plans/pgro-restore-replicas-canopy-response.md index 59e8ef8b..89b8e0a3 100644 --- a/docs/plans/pgro-restore-replicas-canopy-response.md +++ b/docs/plans/pgro-restore-replicas-canopy-response.md @@ -92,8 +92,12 @@ best-effort reporting that never blocks restore progress; no `consumer_instance` ## 4. Endpoint surface (shapes to be frozen on sign-off) +- `POST /restore-capabilities {intents: [...]}` → pgro registers the intents it + can satisfy, on start and whenever they change. Canopy persists the set and + dispatches only matching worklist entries (see §7). - `GET /restore-worklist` → desired replicas (expanded per server) + per-group - repo coordinates + the snapshot to restore for each. + repo coordinates + the snapshot to restore for each. Only entries whose intent + pgro currently supports are returned. - `POST /restore-credentials {group, type}` → short-lived read-only creds + repo password. Authorized iff an enabled declaration covers `(group, type)`. `purpose=backup` rejected for this role. @@ -106,19 +110,47 @@ best-effort reporting that never blocks restore progress; no `consumer_instance` The original A.2/A.3 (`restore_credentials`, `restore_target`) are replaced by a worklist fetch plus per-group `restore_credentials`; `restore_target` collapses into the worklist. 
A.1 `RestoreVerification` gains `server_id` (and a -declaration id). A.4 `restore_verification` is unchanged in spirit. Canopy will -restate the exact bestool deltas once you've signed off on §3 and the shapes -are frozen. +declaration id). A.4 `restore_verification` is unchanged in spirit. A new +`CanopyClient::restore_capabilities(base, &[intents])` registers the supported +intents (§7). Canopy will restate the exact bestool deltas once the shapes are +frozen. ## 6. What canopy is building now Two PRs: 1. **Control + access** — `backup-restore` role; the declared-replica model + - operator UI; `GET /restore-worklist`; `POST /restore-credentials`. + operator UI; consumer capability registration (`POST /restore-capabilities`) + + capability-aware declaration UX + gap surfacing; `GET /restore-worklist`; + `POST /restore-credentials`. 2. **Health** — `backup_restore_checks` + `POST /restore-verification`; per-server group-level alert routing + recovery; the overdue-freshness sweep; restore-health surfacing in the operator UI. -Ping canopy if §3 is contentious; otherwise canopy freezes the shapes at the -end of PR1 and hands the restated Appendix A to bestool. +Canopy freezes the shapes at the end of PR1 and hands the restated Appendix A to +bestool. + +## 7. Resolution of pgro's open question — unsupported intents + +pgro's sign-off asked how canopy should handle an intent pgro doesn't +implement, defaulting to an implicit `outcome=failure, error="unsupported"` +report. Canopy is taking the structured route instead, because the implicit one +conflates a *capability mismatch* with an *unrestorable backup* — the latter +pages a group-level incident, which is the wrong response to "pgro can't do +this intent yet." 
+ +Inverted model: **pgro registers its supported intents** (`POST +/restore-capabilities`) on start and on change; canopy persists them and: + +- offers operators only supported intents when they declare a replica; +- dispatches only matching worklist entries — pgro never receives an intent it + hasn't advertised, so there is no unsupported-intent report and no spurious + page; +- when pgro's set **grows**, the new intents become assignable; when it + **shrinks**, declarations stranded on a now-unsupported intent become *gaps* — + dropped from the worklist immediately and surfaced to operators to reassign or + retire, as configuration state, not a restore-health incident. + +Consequence for pgro: implement `restore_capabilities` registration on start; +the `/restore-verification` outcome stays just success/failure (no `unsupported` +value). From 4a88c0473aec0dca9ce7dd033e39c42c83b80bce Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?F=C3=A9lix=20Saparelli?= Date: Tue, 30 Jun 2026 14:23:08 +1200 Subject: [PATCH 3/7] feat(restore): backup-restore role, restore-replica + capability schema & models - Add the backup-restore device role (commons-types enum + auth extractor macro + openapi security scheme + drift test + UI role picker/colour). - New RestoreIntent open enum (verify/analytics/disaster-recovery/custom), mirroring BackupType. - Migration restore_replicas: restore_replicas (declared replicas) + restore_consumer_capabilities tables. - database::restore models + CRUD (RestoreReplica, RestoreConsumerCapability, capability register-as-insert-then-prune, creds authz check) and Server::list_live_in_group for worklist expansion. 
Co-Authored-By: Claude Opus 4.8 --- crates/commons-servers/src/device_auth/mod.rs | 1 + crates/commons-types/src/backup.rs | 103 +++++++ crates/commons-types/src/device.rs | 4 + crates/database/src/lib.rs | 5 +- crates/database/src/restore.rs | 263 ++++++++++++++++++ crates/database/src/schema.rs | 32 +++ crates/database/src/servers.rs | 17 ++ crates/public-server/src/openapi.rs | 1 + crates/public-server/tests/openapi_spec.rs | 7 +- .../down.sql | 2 + .../up.sql | 51 ++++ private-web/src/components/DeviceShorty.tsx | 1 + private-web/src/routes/DeviceDetail.tsx | 7 +- 13 files changed, 491 insertions(+), 3 deletions(-) create mode 100644 crates/database/src/restore.rs create mode 100644 migrations/2026-06-30-021427-0000_restore_replicas/down.sql create mode 100644 migrations/2026-06-30-021427-0000_restore_replicas/up.sql diff --git a/crates/commons-servers/src/device_auth/mod.rs b/crates/commons-servers/src/device_auth/mod.rs index cf6903c2..6c60664d 100644 --- a/crates/commons-servers/src/device_auth/mod.rs +++ b/crates/commons-servers/src/device_auth/mod.rs @@ -82,6 +82,7 @@ macro_rules! device_role_struct { device_role_struct!(AdminDevice, DeviceRole::Admin); device_role_struct!(ServerDevice, DeviceRole::Server); device_role_struct!(ReleaserDevice, DeviceRole::Releaser); +device_role_struct!(BackupRestoreDevice, DeviceRole::BackupRestore); impl axum::extract::FromRequestParts for AuthDevice where diff --git a/crates/commons-types/src/backup.rs b/crates/commons-types/src/backup.rs index 320ae347..bab57202 100644 --- a/crates/commons-types/src/backup.rs +++ b/crates/commons-types/src/backup.rs @@ -280,6 +280,109 @@ where } } +/// What a managed restore replica is for. Open by design, mirroring +/// [`BackupType`]: a restore consumer advertises the intents it can satisfy and +/// Canopy preserves any it does not model in `Custom` rather than rejecting it. +/// Stored as `TEXT`; serializes as a plain string (no DB `CHECK`). 
+#[derive(Debug, Clone, PartialEq, Eq, Hash, AsExpression, FromSqlRow)] +#[diesel(sql_type = Text)] +pub enum RestoreIntent { + /// A transient replica restored only to prove the snapshot is restorable. + Verify, + /// A persistent replica kept running for querying. + Analytics, + /// A periodic rehearsal of the full recovery path. + DisasterRecovery, + /// Any other intent name, preserved as advertised. + Custom(String), +} + +impl RestoreIntent { + const VERIFY: &'static str = "verify"; + const ANALYTICS: &'static str = "analytics"; + const DISASTER_RECOVERY: &'static str = "disaster-recovery"; + + /// The wire/DB string for this intent. + pub fn as_str(&self) -> &str { + match self { + Self::Verify => Self::VERIFY, + Self::Analytics => Self::ANALYTICS, + Self::DisasterRecovery => Self::DISASTER_RECOVERY, + Self::Custom(s) => s, + } + } +} + +impl Display for RestoreIntent { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.write_str(self.as_str()) + } +} + +impl From for RestoreIntent { + fn from(s: String) -> Self { + match s.as_str() { + Self::VERIFY => Self::Verify, + Self::ANALYTICS => Self::Analytics, + Self::DISASTER_RECOVERY => Self::DisasterRecovery, + _ => Self::Custom(s), + } + } +} + +impl From<&str> for RestoreIntent { + fn from(s: &str) -> Self { + Self::from(s.to_owned()) + } +} + +impl FromStr for RestoreIntent { + type Err = std::convert::Infallible; + fn from_str(s: &str) -> Result { + Ok(Self::from(s)) + } +} + +impl From for String { + fn from(v: RestoreIntent) -> Self { + match v { + RestoreIntent::Custom(s) => s, + other => other.as_str().to_owned(), + } + } +} + +impl Serialize for RestoreIntent { + fn serialize(&self, s: S) -> Result { + s.serialize_str(self.as_str()) + } +} + +impl<'de> Deserialize<'de> for RestoreIntent { + fn deserialize>(d: D) -> Result { + Ok(Self::from(String::deserialize(d)?)) + } +} + +impl FromSql for RestoreIntent +where + DB: Backend, + String: FromSql, +{ + fn from_sql(bytes: 
DB::RawValue<'_>) -> deserialize::Result { + Ok(Self::from(String::from_sql(bytes)?)) + } +} + +impl ToSql for RestoreIntent +where + String: ToSql, +{ + fn to_sql<'b>(&'b self, out: &mut Output<'b, '_, diesel::pg::Pg>) -> serialize::Result { + >::to_sql(self.as_str(), &mut out.reborrow()) + } +} + #[cfg(test)] mod tests { use super::*; diff --git a/crates/commons-types/src/device.rs b/crates/commons-types/src/device.rs index c48f03d0..e60ddd98 100644 --- a/crates/commons-types/src/device.rs +++ b/crates/commons-types/src/device.rs @@ -27,6 +27,8 @@ pub enum DeviceRole { Admin, Releaser, Server, + #[serde(rename = "backup-restore")] + BackupRestore, } #[derive(Debug, Clone, Copy, thiserror::Error)] @@ -42,6 +44,7 @@ impl std::str::FromStr for DeviceRole { "admin" => Ok(Self::Admin), "releaser" => Ok(Self::Releaser), "server" => Ok(Self::Server), + "backup-restore" => Ok(Self::BackupRestore), _ => Err(DeviceRoleFromStringError), } } @@ -62,6 +65,7 @@ impl std::fmt::Display for DeviceRole { DeviceRole::Admin => "admin", DeviceRole::Releaser => "releaser", DeviceRole::Server => "server", + DeviceRole::BackupRestore => "backup-restore", }; write!(f, "{}", s) } diff --git a/crates/database/src/lib.rs b/crates/database/src/lib.rs index b4812f6b..033db439 100644 --- a/crates/database/src/lib.rs +++ b/crates/database/src/lib.rs @@ -14,6 +14,7 @@ pub mod healthcheck_severities; pub mod issues; pub mod notes; pub mod pg_duration; +pub mod restore; pub mod schema; pub mod server_enrollment_challenges; pub mod server_enrollment_tokens; @@ -38,9 +39,11 @@ pub use backups::{ }; pub use bestool_snippets::{BestoolSnippet, NewBestoolSnippet}; pub use commons_types::backup::{ - BackupConfigStatus, BackupPurpose, BackupRepoMode, BackupType, MaintenanceKind, RunOutcome, + BackupConfigStatus, BackupPurpose, BackupRepoMode, BackupType, MaintenanceKind, RestoreIntent, + RunOutcome, }; pub use devices::{Device, DeviceConnection, DeviceKey, DeviceWithInfo}; +pub use 
restore::{NewRestoreReplica, RestoreConsumerCapability, RestoreReplica}; pub type Db = Pool; diff --git a/crates/database/src/restore.rs b/crates/database/src/restore.rs new file mode 100644 index 00000000..7675ce4a --- /dev/null +++ b/crates/database/src/restore.rs @@ -0,0 +1,263 @@ +//! Managed restore replicas (RST): the control-plane state for driving an +//! external restore consumer. Operators declare which replicas should exist +//! ([`RestoreReplica`]); consumers register the intents they can satisfy +//! ([`RestoreConsumerCapability`]). The worklist expansion, credential issuance, +//! and restore-health ingest live in the public-server and `jobs` components. + +use commons_errors::{AppError, Result}; +use commons_types::backup::{BackupType, RestoreIntent}; +use diesel::{ + prelude::*, + result::{DatabaseErrorKind, Error as DieselError}, +}; +use diesel_async::{AsyncPgConnection, RunQueryDsl}; +use jiff::Timestamp; +use serde::Serialize; +use uuid::Uuid; + +use crate::pg_duration::PgDuration; + +/// An operator-declared replica: a consumer should keep a replica of a +/// `(group, [server | all servers], type)` for a given intent. The declaration +/// is both the work item (it expands into worklist entries) and the +/// authorization (it grants the consumer read access to that `(group, type)`). +#[derive(Debug, Clone, Serialize, Queryable, Selectable, utoipa::ToSchema)] +#[diesel(table_name = crate::schema::restore_replicas)] +#[diesel(check_for_backend(diesel::pg::Pg))] +pub struct RestoreReplica { + pub id: Uuid, + pub consumer_device_id: Uuid, + pub group_id: Uuid, + /// `None` = all current servers in the group, expanded at worklist time. 
+ pub server_id: Option, + #[diesel(column_name = type_)] + #[schema(value_type = String)] + pub r#type: BackupType, + #[schema(value_type = String)] + pub intent: RestoreIntent, + pub name: String, + /// Max age of the restored snapshot before the replica is overdue, in + /// whole seconds; `None` = always track the latest snapshot. + #[schema(value_type = Option)] + pub freshness: Option, + pub enabled: bool, + pub created_by: Option, + #[diesel(deserialize_as = jiff_diesel::Timestamp, serialize_as = jiff_diesel::Timestamp)] + pub created_at: Timestamp, + #[diesel(deserialize_as = jiff_diesel::Timestamp, serialize_as = jiff_diesel::Timestamp)] + pub updated_at: Timestamp, +} + +#[derive(Debug, Clone, Insertable)] +#[diesel(table_name = crate::schema::restore_replicas)] +pub struct NewRestoreReplica { + pub consumer_device_id: Uuid, + pub group_id: Uuid, + pub server_id: Option, + #[diesel(column_name = type_)] + pub r#type: BackupType, + pub intent: RestoreIntent, + pub name: String, + pub freshness: Option, + pub created_by: Option, +} + +impl RestoreReplica { + /// Create a declaration. A duplicate `(consumer, group, type, intent, + /// server)` scope maps to `409`. + pub async fn create(db: &mut AsyncPgConnection, new: NewRestoreReplica) -> Result { + use crate::schema::restore_replicas::dsl; + match diesel::insert_into(dsl::restore_replicas) + .values(new) + .returning(Self::as_select()) + .get_result(db) + .await + { + Ok(row) => Ok(row), + Err(DieselError::DatabaseError(DatabaseErrorKind::UniqueViolation, _)) => Err( + AppError::Conflict("a matching restore replica is already declared".into()), + ), + Err(e) => Err(AppError::from(e)), + } + } + + /// Every declaration, newest first — the operator overview. 
+ pub async fn list_all(db: &mut AsyncPgConnection) -> Result> { + use crate::schema::restore_replicas::dsl; + dsl::restore_replicas + .select(Self::as_select()) + .order(dsl::created_at.desc()) + .load(db) + .await + .map_err(AppError::from) + } + + /// Declarations scoped to a group. + pub async fn list_for_group(db: &mut AsyncPgConnection, group_id: Uuid) -> Result> { + use crate::schema::restore_replicas::dsl; + dsl::restore_replicas + .select(Self::as_select()) + .filter(dsl::group_id.eq(group_id)) + .order(dsl::created_at.desc()) + .load(db) + .await + .map_err(AppError::from) + } + + /// Enabled declarations for a consumer — the basis of its worklist (before + /// per-server expansion and capability filtering). + pub async fn list_enabled_for_consumer( + db: &mut AsyncPgConnection, + consumer_device_id: Uuid, + ) -> Result> { + use crate::schema::restore_replicas::dsl; + dsl::restore_replicas + .select(Self::as_select()) + .filter(dsl::consumer_device_id.eq(consumer_device_id)) + .filter(dsl::enabled.eq(true)) + .order(dsl::created_at.desc()) + .load(db) + .await + .map_err(AppError::from) + } + + pub async fn get(db: &mut AsyncPgConnection, id: Uuid) -> Result { + use crate::schema::restore_replicas::dsl; + dsl::restore_replicas + .select(Self::as_select()) + .filter(dsl::id.eq(id)) + .first(db) + .await + .optional() + .map_err(AppError::from)? + .ok_or(AppError::DatabaseQuery(DieselError::NotFound)) + } + + /// Edit the non-structural fields. Scope fields (consumer, group, server, + /// type, intent) are immutable — change them by deleting and recreating. 
+ pub async fn update( + db: &mut AsyncPgConnection, + id: Uuid, + name: &str, + freshness: Option, + enabled: bool, + ) -> Result { + use crate::schema::restore_replicas::dsl; + diesel::update(dsl::restore_replicas.filter(dsl::id.eq(id))) + .set(( + dsl::name.eq(name), + dsl::freshness.eq(freshness), + dsl::enabled.eq(enabled), + )) + .returning(Self::as_select()) + .get_result(db) + .await + .optional() + .map_err(AppError::from)? + .ok_or(AppError::DatabaseQuery(DieselError::NotFound)) + } + + pub async fn delete(db: &mut AsyncPgConnection, id: Uuid) -> Result<()> { + use crate::schema::restore_replicas::dsl; + let n = diesel::delete(dsl::restore_replicas.filter(dsl::id.eq(id))) + .execute(db) + .await?; + if n == 0 { + return Err(AppError::DatabaseQuery(DieselError::NotFound)); + } + Ok(()) + } + + /// Whether an enabled declaration covers `(consumer, group, type)` — the + /// authorization check for issuing restore credentials. A server-scoped or + /// a group-wide declaration both satisfy it. + pub async fn authorizes( + db: &mut AsyncPgConnection, + consumer_device_id: Uuid, + group_id: Uuid, + r#type: &BackupType, + ) -> Result { + use crate::schema::restore_replicas::dsl; + let n: i64 = dsl::restore_replicas + .filter(dsl::consumer_device_id.eq(consumer_device_id)) + .filter(dsl::group_id.eq(group_id)) + .filter(dsl::type_.eq(r#type.as_str())) + .filter(dsl::enabled.eq(true)) + .count() + .get_result(db) + .await?; + Ok(n > 0) + } +} + +/// One intent a consumer can satisfy. The full set is registered by the +/// consumer on start and whenever it changes; Canopy dispatches only matching +/// worklist entries and constrains the declaration UX to this set. 
+#[derive(Debug, Clone, Serialize, Queryable, Selectable, utoipa::ToSchema)] +#[diesel(table_name = crate::schema::restore_consumer_capabilities)] +#[diesel(check_for_backend(diesel::pg::Pg))] +pub struct RestoreConsumerCapability { + pub consumer_device_id: Uuid, + #[schema(value_type = String)] + pub intent: RestoreIntent, + #[diesel(deserialize_as = jiff_diesel::Timestamp, serialize_as = jiff_diesel::Timestamp)] + pub registered_at: Timestamp, +} + +impl RestoreConsumerCapability { + /// Replace a consumer's capability set with `intents`. Implemented as + /// insert-then-prune (not a transaction) so there is never a window where + /// a still-valid intent is absent: new intents are inserted first, then any + /// no longer present are removed. + pub async fn register( + db: &mut AsyncPgConnection, + consumer_device_id: Uuid, + intents: &[RestoreIntent], + ) -> Result<()> { + use crate::schema::restore_consumer_capabilities::dsl; + + let strings: Vec = intents.iter().map(|i| i.as_str().to_owned()).collect(); + + let rows: Vec<_> = intents + .iter() + .map(|i| { + ( + dsl::consumer_device_id.eq(consumer_device_id), + dsl::intent.eq(i.as_str().to_owned()), + ) + }) + .collect(); + if !rows.is_empty() { + diesel::insert_into(dsl::restore_consumer_capabilities) + .values(rows) + .on_conflict((dsl::consumer_device_id, dsl::intent)) + .do_nothing() + .execute(db) + .await?; + } + + diesel::delete( + dsl::restore_consumer_capabilities + .filter(dsl::consumer_device_id.eq(consumer_device_id)) + .filter(dsl::intent.ne_all(strings)), + ) + .execute(db) + .await?; + Ok(()) + } + + /// The intents a consumer currently supports. 
+ pub async fn list_for_consumer( + db: &mut AsyncPgConnection, + consumer_device_id: Uuid, + ) -> Result> { + use crate::schema::restore_consumer_capabilities::dsl; + let rows: Vec = dsl::restore_consumer_capabilities + .filter(dsl::consumer_device_id.eq(consumer_device_id)) + .select(dsl::intent) + .order(dsl::intent.asc()) + .load(db) + .await?; + Ok(rows.into_iter().map(RestoreIntent::from).collect()) + } +} diff --git a/crates/database/src/schema.rs b/crates/database/src/schema.rs index de81559b..3a3e7e52 100644 --- a/crates/database/src/schema.rs +++ b/crates/database/src/schema.rs @@ -291,6 +291,32 @@ diesel::table! { } } +diesel::table! { + restore_consumer_capabilities (consumer_device_id, intent) { + consumer_device_id -> Uuid, + intent -> Text, + registered_at -> Timestamptz, + } +} + +diesel::table! { + restore_replicas (id) { + id -> Uuid, + consumer_device_id -> Uuid, + group_id -> Uuid, + server_id -> Nullable, + #[sql_name = "type"] + type_ -> Text, + intent -> Text, + name -> Text, + freshness -> Nullable, + enabled -> Bool, + created_by -> Nullable, + created_at -> Timestamptz, + updated_at -> Timestamptz, + } +} + diesel::table! 
{ server_backup_capabilities (server_id, type_) { server_id -> Uuid, @@ -524,6 +550,10 @@ diesel::joinable!(issue_notes -> issues (issue_id)); diesel::joinable!(issues -> devices (device_id)); diesel::joinable!(issues -> server_groups (server_group_id)); diesel::joinable!(issues -> servers (server_id)); +diesel::joinable!(restore_consumer_capabilities -> devices (consumer_device_id)); +diesel::joinable!(restore_replicas -> devices (consumer_device_id)); +diesel::joinable!(restore_replicas -> server_groups (group_id)); +diesel::joinable!(restore_replicas -> servers (server_id)); diesel::joinable!(server_backup_capabilities -> servers (server_id)); diesel::joinable!(server_enrollment_challenges -> servers (server_id)); diesel::joinable!(server_enrollment_tokens -> servers (server_id)); @@ -563,6 +593,8 @@ diesel::allow_tables_to_appear_in_same_query!( incidents, issue_notes, issues, + restore_consumer_capabilities, + restore_replicas, server_backup_capabilities, server_enrollment_challenges, server_enrollment_tokens, diff --git a/crates/database/src/servers.rs b/crates/database/src/servers.rs index 97e902f9..a4bd34b0 100644 --- a/crates/database/src/servers.rs +++ b/crates/database/src/servers.rs @@ -455,6 +455,23 @@ impl Server { .map_err(AppError::from) } + /// All live (non-archived) servers in a group, ordered by name. Used to + /// expand a group-wide restore-replica declaration into per-server entries. + pub async fn list_live_in_group( + db: &mut AsyncPgConnection, + group_id_: Uuid, + ) -> Result> { + use crate::schema::servers::dsl::*; + servers + .select(Self::as_select()) + .filter(group_id.eq(group_id_)) + .filter(deleted_at.is_null()) + .order(name.asc()) + .load(db) + .await + .map_err(AppError::from) + } + /// All servers without a group, ordered by name. Used by the Ungrouped UI tab. 
pub async fn list_ungrouped(db: &mut AsyncPgConnection) -> Result> { use crate::schema::servers::dsl::*; diff --git a/crates/public-server/src/openapi.rs b/crates/public-server/src/openapi.rs index d033f784..11001f10 100644 --- a/crates/public-server/src/openapi.rs +++ b/crates/public-server/src/openapi.rs @@ -45,6 +45,7 @@ impl Modify for SecuritySchemes { }; components.add_security_scheme("server-device", role_scheme("server")); components.add_security_scheme("releaser-device", role_scheme("releaser")); + components.add_security_scheme("backup-restore-device", role_scheme("backup-restore")); components.add_security_scheme( "admin-device", SecurityScheme::MutualTls { diff --git a/crates/public-server/tests/openapi_spec.rs b/crates/public-server/tests/openapi_spec.rs index 8f44bfd2..b2b9ab34 100644 --- a/crates/public-server/tests/openapi_spec.rs +++ b/crates/public-server/tests/openapi_spec.rs @@ -16,7 +16,12 @@ fn build_spec() -> serde_json::Value { fn spec_has_security_schemes() { let spec = build_spec(); let schemes = &spec["components"]["securitySchemes"]; - for s in ["server-device", "releaser-device", "admin-device"] { + for s in [ + "server-device", + "releaser-device", + "admin-device", + "backup-restore-device", + ] { assert!(schemes[s].is_object(), "{s} scheme present"); } } diff --git a/migrations/2026-06-30-021427-0000_restore_replicas/down.sql b/migrations/2026-06-30-021427-0000_restore_replicas/down.sql new file mode 100644 index 00000000..adcdeca5 --- /dev/null +++ b/migrations/2026-06-30-021427-0000_restore_replicas/down.sql @@ -0,0 +1,2 @@ +DROP TABLE restore_consumer_capabilities; +DROP TABLE restore_replicas; diff --git a/migrations/2026-06-30-021427-0000_restore_replicas/up.sql b/migrations/2026-06-30-021427-0000_restore_replicas/up.sql new file mode 100644 index 00000000..8a1919a9 --- /dev/null +++ b/migrations/2026-06-30-021427-0000_restore_replicas/up.sql @@ -0,0 +1,51 @@ +-- Managed restore replicas (RST): operator-declared desired replicas 
that a +-- restore consumer reconciles against, plus the set of intents each consumer +-- can satisfy. + +-- A declared replica: the operator's statement that a consumer should keep a +-- replica of a (group, [server | all servers], type) for a given intent. The +-- declaration is both the work item and the authorization to read what it +-- needs. +CREATE TABLE restore_replicas ( + id UUID PRIMARY KEY DEFAULT gen_random_uuid(), + consumer_device_id UUID NOT NULL REFERENCES devices(id), + group_id UUID NOT NULL REFERENCES server_groups(id), + -- NULL = all current servers in the group (expanded at worklist time). + server_id UUID REFERENCES servers(id), + type TEXT NOT NULL, + intent TEXT NOT NULL, + name TEXT NOT NULL, + -- Max age of the restored snapshot before the replica is overdue; NULL = + -- always track the latest snapshot. + freshness INTERVAL, + enabled BOOLEAN NOT NULL DEFAULT TRUE, + created_by TEXT, + created_at TIMESTAMPTZ NOT NULL DEFAULT now(), + updated_at TIMESTAMPTZ NOT NULL DEFAULT now() +); + +SELECT diesel_manage_updated_at('restore_replicas'); + +-- One declaration per (consumer, group, type, intent) scope. A server-specific +-- row and a group-wide (server_id NULL) row are tracked under separate partial +-- indexes because NULLs do not compare equal in a plain unique constraint. +CREATE UNIQUE INDEX restore_replicas_scope_server + ON restore_replicas (consumer_device_id, group_id, type, intent, server_id) + WHERE server_id IS NOT NULL; +CREATE UNIQUE INDEX restore_replicas_scope_group + ON restore_replicas (consumer_device_id, group_id, type, intent) + WHERE server_id IS NULL; + +CREATE INDEX restore_replicas_consumer ON restore_replicas (consumer_device_id); +CREATE INDEX restore_replicas_group ON restore_replicas (group_id); + +-- The set of intents a consumer can satisfy, registered by the consumer on +-- start and whenever it changes. 
Canopy dispatches only matching worklist +-- entries and constrains the declaration UX to this set; an enabled +-- declaration whose intent is absent here is a surfaced gap. +CREATE TABLE restore_consumer_capabilities ( + consumer_device_id UUID NOT NULL REFERENCES devices(id), + intent TEXT NOT NULL, + registered_at TIMESTAMPTZ NOT NULL DEFAULT now(), + PRIMARY KEY (consumer_device_id, intent) +); diff --git a/private-web/src/components/DeviceShorty.tsx b/private-web/src/components/DeviceShorty.tsx index a871883b..f2fc6419 100644 --- a/private-web/src/components/DeviceShorty.tsx +++ b/private-web/src/components/DeviceShorty.tsx @@ -10,6 +10,7 @@ const ROLE_COLORS: Record< server: "primary", releaser: "warning", admin: "info", + "backup-restore": "primary", }; export function deviceDisplayName(info: DeviceInfo): string { diff --git a/private-web/src/routes/DeviceDetail.tsx b/private-web/src/routes/DeviceDetail.tsx index 1646dcbc..5d038a4d 100644 --- a/private-web/src/routes/DeviceDetail.tsx +++ b/private-web/src/routes/DeviceDetail.tsx @@ -31,7 +31,12 @@ import { type DeviceRole, } from "../types"; -const TRUSTABLE_ROLES: DeviceRole[] = ["server", "releaser", "admin"]; +const TRUSTABLE_ROLES: DeviceRole[] = [ + "server", + "releaser", + "admin", + "backup-restore", +]; export default function DeviceDetail() { const { id = "" } = useParams<{ id: string }>(); From 631373cee64b585254407e6891405d444d726ee8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?F=C3=A9lix=20Saparelli?= Date: Tue, 30 Jun 2026 14:34:35 +1200 Subject: [PATCH 4/7] feat(restore): worklist, restore-credentials, capability + admin endpoints Public-server (backup-restore role): POST /restore-capabilities (register supported intents), GET /restore-worklist (enabled declarations expanded per live server, capability-filtered, with the latest snapshot + repo coords), POST /restore-credentials (read-only STS + repo password, authz via declaration). 
Private-server admin API (crate::fns::restore_replicas): list/for_group/ consumers/create/update/delete, with per-declaration gap computation. Adds Device::list_by_role; regenerates public + private openapi and api-types.ts (DeviceRole gains backup-restore). Co-Authored-By: Claude Opus 4.8 --- crates/database/src/devices.rs | 13 + crates/private-server/src/fns.rs | 2 + .../src/fns/restore_replicas.rs | 309 ++++++++++++++ crates/public-server/openapi.json | 221 ++++++++++ crates/public-server/src/backup.rs | 2 +- crates/public-server/src/lib.rs | 2 + crates/public-server/src/openapi.rs | 1 + crates/public-server/src/restore.rs | 366 +++++++++++++++++ private-web/openapi.json | 376 +++++++++++++++++- private-web/src/api-types.ts | 311 ++++++++++++++- 10 files changed, 1586 insertions(+), 17 deletions(-) create mode 100644 crates/private-server/src/fns/restore_replicas.rs create mode 100644 crates/public-server/src/restore.rs diff --git a/crates/database/src/devices.rs b/crates/database/src/devices.rs index 6273c351..197e483c 100644 --- a/crates/database/src/devices.rs +++ b/crates/database/src/devices.rs @@ -524,6 +524,19 @@ impl Device { Self::list_trusted_with_info_paginated(db, i64::MAX, 0).await } + /// All devices holding a given role, newest first. Used to list restore + /// consumers (`backup-restore` devices) for the operator's replica forms. + pub async fn list_by_role(db: &mut AsyncPgConnection, role: DeviceRole) -> Result> { + use crate::schema::devices; + devices::table + .select(Self::as_select()) + .filter(devices::role.eq(role)) + .order(devices::created_at.desc()) + .load(db) + .await + .map_err(AppError::from) + } + /// List trusted devices with pagination. 
pub async fn list_trusted_with_info_paginated( db: &mut AsyncPgConnection, diff --git a/crates/private-server/src/fns.rs b/crates/private-server/src/fns.rs index aa151194..df2e3a1b 100644 --- a/crates/private-server/src/fns.rs +++ b/crates/private-server/src/fns.rs @@ -9,6 +9,7 @@ pub mod devices; pub mod healthchecks; pub mod incidents; pub mod issues; +pub mod restore_replicas; pub mod server_groups; pub mod servers; pub mod silenced_refs; @@ -37,6 +38,7 @@ pub fn routes() -> OpenApiRouter { .nest("/healthchecks", healthchecks::routes()) .nest("/incidents", incidents::routes()) .nest("/issues", issues::routes()) + .nest("/restore_replicas", restore_replicas::routes()) .nest("/server_groups", server_groups::routes()) .nest("/servers", servers::routes()) .nest("/silenced_refs", silenced_refs::routes()) diff --git a/crates/private-server/src/fns/restore_replicas.rs b/crates/private-server/src/fns/restore_replicas.rs new file mode 100644 index 00000000..fd7d54c7 --- /dev/null +++ b/crates/private-server/src/fns/restore_replicas.rs @@ -0,0 +1,309 @@ +//! Operator-facing managed-restore endpoints (private-server, admin SPA). +//! +//! Thin wrappers over `database::restore`. Operators declare which replicas a +//! restore consumer should maintain, and see each consumer's registered +//! capabilities so the declaration UX can offer only supported intents and flag +//! declarations whose intent is currently unsupported (a *gap*). +//! +//! Reads are open to any tailnet user; mutations require admin. 
+ +use std::collections::{HashMap, HashSet}; + +use axum::Json; +use axum::extract::State; +use commons_errors::{ProblemDetailsSchema, Result}; +use commons_servers::tailscale_auth::TailscaleAdmin; +use commons_types::device::DeviceRole; +use commons_types::{ + Uuid, + backup::{BackupType, RestoreIntent}, +}; +use database::diesel_async::AsyncPgConnection; +use database::pg_duration::PgDuration; +use database::{NewRestoreReplica, RestoreConsumerCapability, RestoreReplica, devices::Device}; +use jiff::{SignedDuration, Timestamp}; +use serde::{Deserialize, Serialize}; +use utoipa::ToSchema; +use utoipa_axum::{router::OpenApiRouter, routes}; + +use crate::state::AppState; + +pub fn routes() -> OpenApiRouter { + OpenApiRouter::new() + .routes(routes!(list)) + .routes(routes!(for_group)) + .routes(routes!(consumers)) + .routes(routes!(create)) + .routes(routes!(update)) + .routes(routes!(delete)) +} + +// ── Wire types ────────────────────────────────────────────────────────────── + +/// A declared replica for the operator UI. `gap` is true when the consumer does +/// not currently advertise this declaration's intent, so Canopy is not +/// dispatching it. +#[derive(Debug, Clone, Serialize, ToSchema)] +pub struct RestoreReplicaView { + pub id: Uuid, + pub consumer_device_id: Uuid, + pub consumer_name: Option, + pub group_id: Uuid, + pub server_id: Option, + #[schema(value_type = String)] + pub r#type: BackupType, + #[schema(value_type = String)] + pub intent: RestoreIntent, + pub name: String, + pub freshness_seconds: Option, + pub enabled: bool, + pub gap: bool, + pub created_by: Option, + #[schema(value_type = String)] + pub created_at: Timestamp, + #[schema(value_type = String)] + pub updated_at: Timestamp, +} + +/// A restore consumer (a `backup-restore` device) and the intents it currently +/// supports — drives the declaration form's consumer and intent pickers. 
+#[derive(Debug, Clone, Serialize, ToSchema)]
+pub struct RestoreConsumerView {
+    pub device_id: Uuid,
+    pub name: Option<String>,
+    #[schema(value_type = Vec<String>)]
+    pub intents: Vec<RestoreIntent>,
+}
+
+#[derive(Debug, Deserialize, ToSchema)]
+pub struct GroupArgs {
+    pub server_group_id: Uuid,
+}
+
+#[derive(Debug, Deserialize, ToSchema)]
+pub struct CreateArgs {
+    pub consumer_device_id: Uuid,
+    pub group_id: Uuid,
+    /// `None` = all current servers in the group.
+    pub server_id: Option<Uuid>,
+    #[schema(value_type = String)]
+    pub r#type: BackupType,
+    #[schema(value_type = String)]
+    pub intent: RestoreIntent,
+    pub name: String,
+    /// Max snapshot age before overdue, in whole seconds; `None` = latest only.
+    pub freshness_seconds: Option<i64>,
+}
+
+#[derive(Debug, Deserialize, ToSchema)]
+pub struct UpdateArgs {
+    pub id: Uuid,
+    pub name: String,
+    pub freshness_seconds: Option<i64>,
+    pub enabled: bool,
+}
+
+#[derive(Debug, Deserialize, ToSchema)]
+pub struct IdArgs {
+    pub id: Uuid,
+}
+
+// ── Helpers ───────────────────────────────────────────────────────────────
+
+/// Convert the wire representation (whole seconds) into the column type.
+/// `None` passes through unchanged.
+fn freshness_to_pg(seconds: Option<i64>) -> Option<PgDuration> {
+    seconds.map(|s| PgDuration(SignedDuration::from_secs(s)))
+}
+
+/// Build views from declarations, resolving consumer display names and the
+/// per-consumer capability set so `gap` can be computed.
+///
+/// Issues one device listing plus one capability query per distinct consumer
+/// referenced by `replicas`. An empty input returns immediately without
+/// touching the database.
+async fn to_views(
+    conn: &mut AsyncPgConnection,
+    replicas: Vec<RestoreReplica>,
+) -> Result<Vec<RestoreReplicaView>> {
+    if replicas.is_empty() {
+        return Ok(Vec::new());
+    }
+
+    let consumer_ids: HashSet<Uuid> = replicas.iter().map(|r| r.consumer_device_id).collect();
+
+    // Consumer display names come from the set of restore-consumer devices.
+    let names: HashMap<Uuid, Option<String>> =
+        Device::list_by_role(conn, DeviceRole::BackupRestore)
+            .await?
+            .into_iter()
+            .map(|d| (d.id, d.tailscale_node_name))
+            .collect();
+
+    // Registered intents per consumer, looked up once per distinct consumer.
+    let mut caps: HashMap<Uuid, HashSet<RestoreIntent>> = HashMap::new();
+    for id in consumer_ids {
+        let set: HashSet<RestoreIntent> = RestoreConsumerCapability::list_for_consumer(conn, id)
+            .await?
+            .into_iter()
+            .collect();
+        caps.insert(id, set);
+    }
+
+    Ok(replicas
+        .into_iter()
+        .map(|r| {
+            // A gap is any declaration whose intent is missing from the
+            // consumer's registered set (including "nothing registered").
+            let supported = caps
+                .get(&r.consumer_device_id)
+                .is_some_and(|s| s.contains(&r.intent));
+            RestoreReplicaView {
+                consumer_name: names.get(&r.consumer_device_id).cloned().flatten(),
+                freshness_seconds: r.freshness.map(|f| f.0.as_secs()),
+                gap: !supported,
+                id: r.id,
+                consumer_device_id: r.consumer_device_id,
+                group_id: r.group_id,
+                server_id: r.server_id,
+                r#type: r.r#type,
+                intent: r.intent,
+                name: r.name,
+                enabled: r.enabled,
+                created_by: r.created_by,
+                created_at: r.created_at,
+                updated_at: r.updated_at,
+            }
+        })
+        .collect())
+}
+
+// ── Handlers ──────────────────────────────────────────────────────────────
+
+#[utoipa::path(
+    post,
+    path = "/list",
+    operation_id = "restore_replicas_list",
+    tag = "restore_replicas",
+    security(("tailscale-user" = [])),
+    responses((status = 200, body = Vec<RestoreReplicaView>)),
+)]
+pub async fn list(State(state): State<AppState>) -> Result<Json<Vec<RestoreReplicaView>>> {
+    let mut conn = state.db.get().await?;
+    let replicas = RestoreReplica::list_all(&mut conn).await?;
+    Ok(Json(to_views(&mut conn, replicas).await?))
+}
+
+#[utoipa::path(
+    post,
+    path = "/for_group",
+    operation_id = "restore_replicas_for_group",
+    tag = "restore_replicas",
+    security(("tailscale-user" = [])),
+    request_body = GroupArgs,
+    responses((status = 200, body = Vec<RestoreReplicaView>)),
+)]
+pub async fn for_group(
+    State(state): State<AppState>,
+    Json(args): Json<GroupArgs>,
+) -> Result<Json<Vec<RestoreReplicaView>>> {
+    let mut conn = state.db.get().await?;
+    let replicas = RestoreReplica::list_for_group(&mut conn, args.server_group_id).await?;
+    Ok(Json(to_views(&mut conn, replicas).await?))
+}
+
+#[utoipa::path(
+    post,
+    path = "/consumers",
+    operation_id = "restore_replicas_consumers",
+    tag = "restore_replicas",
+    security(("tailscale-user" = [])),
+    responses((status = 200, body = Vec<RestoreConsumerView>)),
+)]
+pub async fn consumers(State(state): State<AppState>) -> Result<Json<Vec<RestoreConsumerView>>> {
+    let mut conn = state.db.get().await?;
+    let devices = Device::list_by_role(&mut conn,
DeviceRole::BackupRestore).await?; + let mut out = Vec::with_capacity(devices.len()); + for d in devices { + let intents = RestoreConsumerCapability::list_for_consumer(&mut conn, d.id).await?; + out.push(RestoreConsumerView { + device_id: d.id, + name: d.tailscale_node_name, + intents, + }); + } + Ok(Json(out)) +} + +#[utoipa::path( + post, + path = "/create", + operation_id = "restore_replicas_create", + tag = "restore_replicas", + security(("tailscale-admin" = [])), + request_body = CreateArgs, + responses( + (status = 200, body = RestoreReplicaView), + (status = 409, description = "A matching declaration already exists.", body = ProblemDetailsSchema), + ), +)] +pub async fn create( + State(state): State, + TailscaleAdmin(admin): TailscaleAdmin, + Json(args): Json, +) -> Result> { + let mut conn = state.db.get().await?; + let replica = RestoreReplica::create( + &mut conn, + NewRestoreReplica { + consumer_device_id: args.consumer_device_id, + group_id: args.group_id, + server_id: args.server_id, + r#type: args.r#type, + intent: args.intent, + name: args.name, + freshness: freshness_to_pg(args.freshness_seconds), + created_by: Some(admin.login), + }, + ) + .await?; + let views = to_views(&mut conn, vec![replica]).await?; + Ok(Json(views.into_iter().next().expect("one view"))) +} + +#[utoipa::path( + post, + path = "/update", + operation_id = "restore_replicas_update", + tag = "restore_replicas", + security(("tailscale-admin" = [])), + request_body = UpdateArgs, + responses( + (status = 200, body = RestoreReplicaView), + (status = 404, body = ProblemDetailsSchema), + ), +)] +pub async fn update( + State(state): State, + _admin: TailscaleAdmin, + Json(args): Json, +) -> Result> { + let mut conn = state.db.get().await?; + let replica = RestoreReplica::update( + &mut conn, + args.id, + &args.name, + freshness_to_pg(args.freshness_seconds), + args.enabled, + ) + .await?; + let views = to_views(&mut conn, vec![replica]).await?; + 
Ok(Json(views.into_iter().next().expect("one view"))) +} + +#[utoipa::path( + post, + path = "/delete", + operation_id = "restore_replicas_delete", + tag = "restore_replicas", + security(("tailscale-admin" = [])), + request_body = IdArgs, + responses((status = 200), (status = 404, body = ProblemDetailsSchema)), +)] +pub async fn delete( + State(state): State, + _admin: TailscaleAdmin, + Json(args): Json, +) -> Result> { + let mut conn = state.db.get().await?; + RestoreReplica::delete(&mut conn, args.id).await?; + Ok(Json(())) +} diff --git a/crates/public-server/openapi.json b/crates/public-server/openapi.json index b468fcc4..e7301488 100644 --- a/crates/public-server/openapi.json +++ b/crates/public-server/openapi.json @@ -433,6 +433,127 @@ ] } }, + "/restore-capabilities": { + "post": { + "tags": [ + "restore" + ], + "operationId": "capabilities", + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/CapabilitiesArgs" + } + } + }, + "required": true + }, + "responses": { + "204": { + "description": "Capability set registered." 
+ } + }, + "security": [ + { + "backup-restore-device": [] + } + ] + } + }, + "/restore-credentials": { + "post": { + "tags": [ + "restore" + ], + "operationId": "credentials", + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/CredentialsArgs" + } + } + }, + "required": true + }, + "responses": { + "200": { + "description": "", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/RestoreCredentials" + } + } + } + }, + "403": { + "description": "No enabled declaration authorizes this (group, type).", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ProblemDetailsSchema" + } + } + } + }, + "409": { + "description": "Group has no ready backup config.", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ProblemDetailsSchema" + } + } + } + }, + "502": { + "description": "STS issuance or repo-password read failed or is not configured.", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ProblemDetailsSchema" + } + } + } + } + }, + "security": [ + { + "backup-restore-device": [] + } + ] + } + }, + "/restore-worklist": { + "get": { + "tags": [ + "restore" + ], + "operationId": "worklist", + "responses": { + "200": { + "description": "", + "content": { + "application/json": { + "schema": { + "type": "array", + "items": { + "$ref": "#/components/schemas/WorklistEntry" + } + } + } + } + } + }, + "security": [ + { + "backup-restore-device": [] + } + ] + } + }, "/servers": { "get": { "tags": [ @@ -1511,6 +1632,23 @@ } } }, + "RestoreCredentials": { + "type": "object", + "description": "Read-only credentials plus the repo password for one `(group, type)`. 
The\nAWS creds are the `credential_process` shape the consumer's proxy refreshes;\nthe password opens the kopia repo.", + "required": [ + "credentials", + "repo_password" + ], + "properties": { + "credentials": { + "$ref": "#/components/schemas/CredentialProcessOutput" + }, + "repo_password": { + "type": "string", + "description": "The kopia repo passphrase, read from the group's k8s Secret." + } + } + }, "RunOutcome": { "type": "string", "description": "Outcome of a reported backup/restore run.", @@ -1769,6 +1907,81 @@ "$ref": "#/components/schemas/VersionStatus" } } + }, + "WorklistEntry": { + "type": "object", + "description": "One concrete replica the consumer should maintain: a declaration expanded\nagainst a single server, carrying the snapshot to restore and the repo\ncoordinates to find it. Credentials and the repo password are obtained\nseparately via `/restore-credentials`.", + "required": [ + "replica_id", + "group_id", + "server_id", + "type", + "intent", + "name", + "storage", + "bucket", + "prefix", + "region" + ], + "properties": { + "bucket": { + "type": "string" + }, + "freshness_seconds": { + "type": [ + "integer", + "null" + ], + "format": "int64", + "description": "Max snapshot age before the replica is overdue, in whole seconds;\n`None` = always track the latest." + }, + "group_id": { + "type": "string", + "format": "uuid" + }, + "intent": { + "type": "string" + }, + "name": { + "type": "string" + }, + "prefix": { + "type": "string" + }, + "region": { + "type": "string" + }, + "replica_id": { + "type": "string", + "format": "uuid", + "description": "The declaration this entry came from." + }, + "server_id": { + "type": "string", + "format": "uuid" + }, + "snapshot_at": { + "type": [ + "string", + "null" + ], + "description": "RFC3339 timestamp of that snapshot, if known." + }, + "snapshot_id": { + "type": [ + "string", + "null" + ], + "description": "The snapshot Canopy wants restored — the latest successful backup for\nthis `(server, type)`. 
`None` when no successful backup is yet known." + }, + "storage": { + "type": "string", + "description": "Always `\"s3\"`." + }, + "type": { + "type": "string" + } + } } }, "securitySchemes": { @@ -1776,6 +1989,10 @@ "type": "mutualTLS", "description": "mTLS client certificate for a device with the `admin` role." }, + "backup-restore-device": { + "type": "mutualTLS", + "description": "mTLS client certificate for a device with the `backup-restore` role (or `admin`)." + }, "releaser-device": { "type": "mutualTLS", "description": "mTLS client certificate for a device with the `releaser` role (or `admin`)." @@ -1803,6 +2020,10 @@ "name": "events", "description": "Device-pushed events; rolled up into issues and incidents server-side." }, + { + "name": "restore", + "description": "Managed restore replicas: consumer capability registration, worklist, and read-only restore credentials." + }, { "name": "servers", "description": "Server registry — listing for the public, self-registration for server devices." diff --git a/crates/public-server/src/backup.rs b/crates/public-server/src/backup.rs index f0f326e0..4408442d 100644 --- a/crates/public-server/src/backup.rs +++ b/crates/public-server/src/backup.rs @@ -47,7 +47,7 @@ pub const REPO_PASSWORD_SECRET_KEY: &str = "password"; /// Fallback AWS region served by `GET /backup-target` when the group config's /// `region` is NULL. Read from `AWS_REGION` (the EKS pod always has it), with a /// last-resort default so the endpoint always returns a concrete region string. 
-fn deployment_default_region() -> String { +pub(crate) fn deployment_default_region() -> String { std::env::var("AWS_REGION") .or_else(|_| std::env::var("AWS_DEFAULT_REGION")) .unwrap_or_else(|_| "us-east-1".to_string()) diff --git a/crates/public-server/src/lib.rs b/crates/public-server/src/lib.rs index d740c7dc..7779f47a 100644 --- a/crates/public-server/src/lib.rs +++ b/crates/public-server/src/lib.rs @@ -12,6 +12,7 @@ pub mod openapi; #[cfg(feature = "ui")] pub mod password; pub mod ratelimit; +pub mod restore; #[cfg(feature = "ui")] pub mod server_versions; pub mod servers; @@ -27,6 +28,7 @@ pub fn routes() -> OpenApiRouter { let mut router = OpenApiRouter::new() .merge(events::routes()) .merge(backup::routes()) + .merge(restore::routes()) .nest("/artifacts", artifacts::routes()) .nest("/bestool", bestool::routes()) .nest("/servers", servers::routes()) diff --git a/crates/public-server/src/openapi.rs b/crates/public-server/src/openapi.rs index 11001f10..a1233697 100644 --- a/crates/public-server/src/openapi.rs +++ b/crates/public-server/src/openapi.rs @@ -19,6 +19,7 @@ use utoipa::{Modify, OpenApi, openapi::security::SecurityScheme}; (name = "backup", description = "Device backup credential minting, target config, capability registration, and run reporting."), (name = "bestool", description = "Bestool SQL snippet read API."), (name = "events", description = "Device-pushed events; rolled up into issues and incidents server-side."), + (name = "restore", description = "Managed restore replicas: consumer capability registration, worklist, and read-only restore credentials."), (name = "servers", description = "Server registry — listing for the public, self-registration for server devices."), (name = "statuses", description = "Heartbeat / status submissions from server devices."), (name = "versions", description = "Canopy release versions and their downloadable artifacts."), diff --git a/crates/public-server/src/restore.rs b/crates/public-server/src/restore.rs new 
file mode 100644 index 00000000..3dfe9e9b --- /dev/null +++ b/crates/public-server/src/restore.rs @@ -0,0 +1,366 @@ +//! Managed-restore endpoints (RST) — the consumer-facing side of the restore +//! control plane. All `BackupRestoreDevice`-authenticated, mounted at the root: +//! +//! - `POST /restore-capabilities` — the consumer registers the intents it can +//! satisfy. Canopy dispatches only matching worklist entries. +//! - `GET /restore-worklist` — the consumer's complete desired state: its +//! enabled declarations expanded per current server, each carrying the +//! snapshot Canopy wants restored and the repo coordinates to find it. +//! - `POST /restore-credentials` — short-lived **read-only** S3 creds plus the +//! repo password for one `(group, type)` the consumer is authorized for. +//! +//! The `backup-restore` role is read-only by construction: it cannot reach the +//! `ServerDevice`-gated `/backup-credentials`, and `/restore-credentials` only +//! ever issues the read-only [`restore_session_policy`]. 
+ +use aws_sdk_sts::operation::RequestId as _; +use axum::{Json, extract::State, http::StatusCode}; +use commons_errors::{AppError, ProblemDetailsSchema, Result}; +use commons_servers::device_auth::BackupRestoreDevice; +use commons_types::backup::{BackupPurpose, BackupType, RestoreIntent}; +use database::{ + Db, + backups::{BackupRun, NewBackupCredentialIssuance, ServerGroupBackupConfig}, + restore::{RestoreConsumerCapability, RestoreReplica}, + servers::Server, +}; +use jiff::Timestamp; +use serde::{Deserialize, Serialize}; +use std::collections::HashSet; +use utoipa::ToSchema; +use utoipa_axum::{router::OpenApiRouter, routes}; +use uuid::Uuid; + +use crate::{ + backup::{ + CredentialProcessOutput, REPO_PASSWORD_SECRET_KEY, deployment_default_region, + restore_session_policy, + }, + state::{AppState, BackupSecrets}, +}; + +pub fn routes() -> OpenApiRouter { + OpenApiRouter::new() + .routes(routes!(capabilities)) + .routes(routes!(worklist)) + .routes(routes!(credentials)) +} + +// --------------------------------------------------------------------------- +// POST /restore-capabilities +// --------------------------------------------------------------------------- + +#[derive(Debug, Deserialize, ToSchema)] +pub struct CapabilitiesArgs { + /// The intents this consumer can satisfy (e.g. `verify`, `analytics`, + /// `disaster-recovery`). Replaces the consumer's registered set wholesale. 
+ #[schema(value_type = Vec)] + pub intents: Vec, +} + +#[utoipa::path( + post, + path = "/restore-capabilities", + tag = "restore", + security(("backup-restore-device" = [])), + request_body = CapabilitiesArgs, + responses((status = 204, description = "Capability set registered.")), +)] +async fn capabilities( + State(db): State, + device: BackupRestoreDevice, + Json(args): Json, +) -> Result { + let mut conn = db.get().await?; + let consumer_device_id = device.0.0.id; + RestoreConsumerCapability::register(&mut conn, consumer_device_id, &args.intents).await?; + Ok(StatusCode::NO_CONTENT) +} + +// --------------------------------------------------------------------------- +// GET /restore-worklist +// --------------------------------------------------------------------------- + +/// One concrete replica the consumer should maintain: a declaration expanded +/// against a single server, carrying the snapshot to restore and the repo +/// coordinates to find it. Credentials and the repo password are obtained +/// separately via `/restore-credentials`. +#[derive(Debug, Serialize, ToSchema)] +pub struct WorklistEntry { + /// The declaration this entry came from. + pub replica_id: Uuid, + pub group_id: Uuid, + pub server_id: Uuid, + #[schema(value_type = String)] + pub r#type: BackupType, + #[schema(value_type = String)] + pub intent: RestoreIntent, + pub name: String, + /// Max snapshot age before the replica is overdue, in whole seconds; + /// `None` = always track the latest. + pub freshness_seconds: Option, + /// The snapshot Canopy wants restored — the latest successful backup for + /// this `(server, type)`. `None` when no successful backup is yet known. + pub snapshot_id: Option, + /// RFC3339 timestamp of that snapshot, if known. + pub snapshot_at: Option, + /// Always `"s3"`. 
+    pub storage: String,
+    pub bucket: String,
+    pub prefix: String,
+    pub region: String,
+}
+
+#[utoipa::path(
+    get,
+    path = "/restore-worklist",
+    tag = "restore",
+    security(("backup-restore-device" = [])),
+    responses((status = 200, body = Vec<WorklistEntry>)),
+)]
+async fn worklist(
+    State(db): State<Db>,
+    device: BackupRestoreDevice,
+) -> Result<Json<Vec<WorklistEntry>>> {
+    // Expands the calling consumer's enabled declarations into one entry per
+    // live server, deduplicated on (server, type, intent). Declarations are
+    // dropped when their intent is not registered by the consumer or their
+    // group has no ready backup config.
+    let mut conn = db.get().await?;
+    let consumer_device_id = device.0.0.id;
+
+    // Only intents the consumer currently supports are dispatched; a declaration
+    // on an unsupported intent is a gap, surfaced to operators, never sent here.
+    let supported: HashSet<RestoreIntent> =
+        RestoreConsumerCapability::list_for_consumer(&mut conn, consumer_device_id)
+            .await?
+            .into_iter()
+            .collect();
+
+    let mut declarations = RestoreReplica::list_enabled_for_consumer(&mut conn, consumer_device_id)
+        .await?
+        .into_iter()
+        .filter(|d| supported.contains(&d.intent))
+        .collect::<Vec<_>>();
+    // Process server-specific declarations before group-wide ones so a
+    // server-scoped declaration wins the dedup over a group-wide one covering
+    // the same (server, type, intent). `sort_by_key` is stable, so the order
+    // within each class is preserved.
+    declarations.sort_by_key(|d| d.server_id.is_none());
+
+    let mut out: Vec<WorklistEntry> = Vec::new();
+    let mut seen: HashSet<(Uuid, String, String)> = HashSet::new();
+    // Per-group caches so a group referenced by several declarations is resolved
+    // once: the backup config (including "no config", cached as `None`) and the
+    // latest-successful-run map.
+    let mut config_cache: std::collections::HashMap<Uuid, Option<ServerGroupBackupConfig>> =
+        std::collections::HashMap::new();
+    let mut snapshot_cache: std::collections::HashMap<
+        Uuid,
+        std::collections::HashMap<(Uuid, BackupType), BackupRun>,
+    > = std::collections::HashMap::new();
+
+    for d in declarations {
+        // A worklist entry needs somewhere to restore from: skip groups without
+        // a ready config (they surface elsewhere as not-yet-restorable).
+        if !config_cache.contains_key(&d.group_id) {
+            let cfg = ServerGroupBackupConfig::get(&mut conn, d.group_id).await?;
+            config_cache.insert(d.group_id, cfg);
+        }
+        let Some(cfg) = config_cache[&d.group_id].as_ref() else {
+            continue;
+        };
+        if cfg.status != commons_types::backup::BackupConfigStatus::Ready {
+            continue;
+        }
+
+        let servers = match d.server_id {
+            Some(sid) => {
+                let s = Server::get_by_id(&mut conn, sid).await?;
+                // Skip a declaration whose server has left the group or been
+                // archived; it lingers as a no-op until the operator retires it.
+                if s.group_id == Some(d.group_id) && s.deleted_at.is_none() {
+                    vec![s]
+                } else {
+                    vec![]
+                }
+            }
+            None => Server::list_live_in_group(&mut conn, d.group_id).await?,
+        };
+        // Nothing to emit for this declaration, so do not load snapshots for it.
+        if servers.is_empty() {
+            continue;
+        }
+
+        if !snapshot_cache.contains_key(&d.group_id) {
+            let map =
+                BackupRun::latest_success_by_server_type_for_group(&mut conn, d.group_id).await?;
+            snapshot_cache.insert(d.group_id, map);
+        }
+        let snapshots = &snapshot_cache[&d.group_id];
+
+        let region = cfg.region.clone().unwrap_or_else(deployment_default_region);
+        for server in servers {
+            let key = (server.id, d.r#type.to_string(), d.intent.to_string());
+            if !seen.insert(key) {
+                continue;
+            }
+            let latest = snapshots.get(&(server.id, d.r#type.clone()));
+            out.push(WorklistEntry {
+                replica_id: d.id,
+                group_id: d.group_id,
+                server_id: server.id,
+                r#type: d.r#type.clone(),
+                intent: d.intent.clone(),
+                name: d.name.clone(),
+                freshness_seconds: d.freshness.map(|f| f.0.as_secs()),
+                snapshot_id: latest.and_then(|r| r.snapshot_id.clone()),
+                snapshot_at: latest.map(|r| r.reported_at.to_string()),
+                storage: "s3".into(),
+                bucket: cfg.bucket.clone(),
+                prefix: cfg.prefix.clone(),
+                region: region.clone(),
+            });
+        }
+    }
+
+    Ok(Json(out))
+}
+
+// ---------------------------------------------------------------------------
+// POST /restore-credentials
+// ---------------------------------------------------------------------------
+
+#[derive(Debug, Deserialize, ToSchema)]
+pub struct CredentialsArgs {
+    /// The group whose repo to read.
+    pub group: Uuid,
+    /// The backup type to restore.
+ #[schema(value_type = String)] + pub r#type: BackupType, +} + +/// Read-only credentials plus the repo password for one `(group, type)`. The +/// AWS creds are the `credential_process` shape the consumer's proxy refreshes; +/// the password opens the kopia repo. +#[derive(Debug, Serialize, ToSchema)] +pub struct RestoreCredentials { + pub credentials: CredentialProcessOutput, + /// The kopia repo passphrase, read from the group's k8s Secret. + pub repo_password: String, +} + +#[utoipa::path( + post, + path = "/restore-credentials", + tag = "restore", + security(("backup-restore-device" = [])), + request_body = CredentialsArgs, + responses( + (status = 200, body = RestoreCredentials), + (status = 403, description = "No enabled declaration authorizes this (group, type).", body = ProblemDetailsSchema), + (status = 409, description = "Group has no ready backup config.", body = ProblemDetailsSchema), + (status = 502, description = "STS issuance or repo-password read failed or is not configured.", body = ProblemDetailsSchema), + ), +)] +async fn credentials( + State(db): State, + State(sts): State>, + State(kube): State>, + device: BackupRestoreDevice, + Json(args): Json, +) -> Result> { + let mut conn = db.get().await?; + let consumer_device_id = device.0.0.id; + + // Authorization is the declared replica: a consumer may read exactly the + // (group, type) pairs its enabled declarations cover. + if !RestoreReplica::authorizes(&mut conn, consumer_device_id, args.group, &args.r#type).await? { + return Err(AppError::AuthInsufficientPermissions { + required: "an enabled restore-replica declaration for this group and type".into(), + }); + } + + let cfg = ServerGroupBackupConfig::get(&mut conn, args.group) + .await? 
+ .ok_or_else(|| AppError::Conflict("group has no backup config".into()))?; + if cfg.status != commons_types::backup::BackupConfigStatus::Ready { + return Err(AppError::Conflict( + "group backup config is not ready".into(), + )); + } + + // Always read-only — this role cannot mint write creds. + let session_policy = restore_session_policy(&cfg.bucket, &cfg.prefix); + + let Some(sts) = sts else { + tracing::error!(group = %args.group, "restore-credentials: STS client not configured"); + return Err(AppError::Upstream( + "credential issuer not configured".into(), + )); + }; + + let session_name = format!("canopy-restore-{consumer_device_id}"); + let resp = sts + .assume_role() + .role_arn(&cfg.target_role_arn) + .role_session_name(session_name) + .policy(session_policy) + .duration_seconds(3600) + .send() + .await + .map_err(|err| { + let request_id = err.request_id().unwrap_or(""); + tracing::error!( + group = %args.group, + role = %cfg.target_role_arn, + request_id, + error = ?err, + "restore-credentials: AssumeRole failed", + ); + AppError::Upstream("credential issuance failed".into()) + })?; + + let sts_request_id = resp.request_id().map(str::to_owned); + let creds = resp.credentials().ok_or_else(|| { + tracing::error!(group = %args.group, "restore-credentials: AssumeRole returned no credentials"); + AppError::Upstream("credential issuance returned no credentials".into()) + })?; + + let expiry_secs = creds.expiration().secs(); + let expires_at = Timestamp::from_second(expiry_secs).map_err(|err| { + tracing::error!(group = %args.group, error = ?err, "restore-credentials: bad expiration"); + AppError::Upstream("credential issuance returned an invalid expiration".into()) + })?; + let access_key_id = creds.access_key_id().to_owned(); + + let Some(kube) = kube else { + tracing::error!(group = %args.group, "restore-credentials: kube client not configured"); + return Err(AppError::Upstream("secret store not configured".into())); + }; + let repo_password = kube + 
.read_password(&cfg.repo_password_ref, REPO_PASSWORD_SECRET_KEY) + .await + .map_err(|err| { + tracing::error!( + group = %args.group, + secret = %cfg.repo_password_ref, + error = ?err, + "restore-credentials: reading repo-password Secret failed", + ); + AppError::Upstream("repo password unavailable".into()) + })?; + + // Audit BEFORE returning — never hand out creds we didn't record. + database::backups::BackupCredentialIssuance::record( + &mut conn, + NewBackupCredentialIssuance { + device_id: consumer_device_id, + group_id: args.group, + r#type: args.r#type.clone(), + expires_at, + purpose: BackupPurpose::Restore, + sts_assumed_role: cfg.target_role_arn.clone(), + sts_request_id, + access_key_id: Some(access_key_id.clone()), + bucket: cfg.bucket.clone(), + prefix: cfg.prefix.clone(), + }, + ) + .await?; + + Ok(Json(RestoreCredentials { + credentials: CredentialProcessOutput { + version: 1, + access_key_id, + secret_access_key: creds.secret_access_key().to_owned(), + session_token: creds.session_token().to_owned(), + expiration: expires_at.to_string(), + }, + repo_password, + })) +} diff --git a/private-web/openapi.json b/private-web/openapi.json index e4adee74..904f5e37 100644 --- a/private-web/openapi.json +++ b/private-web/openapi.json @@ -3175,6 +3175,228 @@ ] } }, + "/api/restore_replicas/consumers": { + "post": { + "tags": [ + "restore_replicas" + ], + "operationId": "restore_replicas_consumers", + "responses": { + "200": { + "description": "", + "content": { + "application/json": { + "schema": { + "type": "array", + "items": { + "$ref": "#/components/schemas/RestoreConsumerView" + } + } + } + } + } + }, + "security": [ + { + "tailscale-user": [] + } + ] + } + }, + "/api/restore_replicas/create": { + "post": { + "tags": [ + "restore_replicas" + ], + "operationId": "restore_replicas_create", + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/CreateArgs" + } + } + }, + "required": true + }, + "responses": { 
+ "200": { + "description": "", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/RestoreReplicaView" + } + } + } + }, + "409": { + "description": "A matching declaration already exists.", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ProblemDetailsSchema" + } + } + } + } + }, + "security": [ + { + "tailscale-admin": [] + } + ] + } + }, + "/api/restore_replicas/delete": { + "post": { + "tags": [ + "restore_replicas" + ], + "operationId": "restore_replicas_delete", + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/IdArgs" + } + } + }, + "required": true + }, + "responses": { + "200": { + "description": "" + }, + "404": { + "description": "", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ProblemDetailsSchema" + } + } + } + } + }, + "security": [ + { + "tailscale-admin": [] + } + ] + } + }, + "/api/restore_replicas/for_group": { + "post": { + "tags": [ + "restore_replicas" + ], + "operationId": "restore_replicas_for_group", + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/GroupArgs" + } + } + }, + "required": true + }, + "responses": { + "200": { + "description": "", + "content": { + "application/json": { + "schema": { + "type": "array", + "items": { + "$ref": "#/components/schemas/RestoreReplicaView" + } + } + } + } + } + }, + "security": [ + { + "tailscale-user": [] + } + ] + } + }, + "/api/restore_replicas/list": { + "post": { + "tags": [ + "restore_replicas" + ], + "operationId": "restore_replicas_list", + "responses": { + "200": { + "description": "", + "content": { + "application/json": { + "schema": { + "type": "array", + "items": { + "$ref": "#/components/schemas/RestoreReplicaView" + } + } + } + } + } + }, + "security": [ + { + "tailscale-user": [] + } + ] + } + }, + "/api/restore_replicas/update": { + "post": { + "tags": [ + 
"restore_replicas" + ], + "operationId": "restore_replicas_update", + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/UpdateArgs" + } + } + }, + "required": true + }, + "responses": { + "200": { + "description": "", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/RestoreReplicaView" + } + } + } + }, + "404": { + "description": "", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ProblemDetailsSchema" + } + } + } + } + }, + "security": [ + { + "tailscale-admin": [] + } + ] + } + }, "/api/server_groups/create": { "post": { "tags": [ @@ -5718,25 +5940,45 @@ "CreateArgs": { "type": "object", "required": [ + "consumer_device_id", + "group_id", + "type", + "intent", "name" ], "properties": { - "name": { + "consumer_device_id": { + "type": "string", + "format": "uuid" + }, + "freshness_seconds": { + "type": [ + "integer", + "null" + ], + "format": "int64", + "description": "Max snapshot age before overdue, in whole seconds; `None` = latest only." + }, + "group_id": { + "type": "string", + "format": "uuid" + }, + "intent": { "type": "string" }, - "notes": { + "name": { "type": "string" }, - "slack_open_delay": { + "server_id": { "type": [ - "integer", + "string", "null" ], - "format": "int64", - "description": "Optional initial value (seconds) for the group's Slack open\ncooldown. Omit to let the database default apply." + "format": "uuid", + "description": "`None` = all current servers in the group." 
}, - "tags": { - "$ref": "#/components/schemas/TagMap" + "type": { + "type": "string" } } }, @@ -6117,7 +6359,8 @@ "untrusted", "admin", "releaser", - "server" + "server", + "backup-restore" ] }, "EnrollmentStatus": { @@ -6647,6 +6890,18 @@ } } }, + "IdArgs": { + "type": "object", + "required": [ + "id" + ], + "properties": { + "id": { + "type": "string", + "format": "uuid" + } + } + }, "IncidentData": { "type": "object", "required": [ @@ -8196,6 +8451,109 @@ "flapping" ] }, + "RestoreConsumerView": { + "type": "object", + "description": "A restore consumer (a `backup-restore` device) and the intents it currently\nsupports — drives the declaration form's consumer and intent pickers.", + "required": [ + "device_id", + "intents" + ], + "properties": { + "device_id": { + "type": "string", + "format": "uuid" + }, + "intents": { + "type": "array", + "items": { + "type": "string" + } + }, + "name": { + "type": [ + "string", + "null" + ] + } + } + }, + "RestoreReplicaView": { + "type": "object", + "description": "A declared replica for the operator UI. 
`gap` is true when the consumer does\nnot currently advertise this declaration's intent, so Canopy is not\ndispatching it.", + "required": [ + "id", + "consumer_device_id", + "group_id", + "type", + "intent", + "name", + "enabled", + "gap", + "created_at", + "updated_at" + ], + "properties": { + "consumer_device_id": { + "type": "string", + "format": "uuid" + }, + "consumer_name": { + "type": [ + "string", + "null" + ] + }, + "created_at": { + "type": "string" + }, + "created_by": { + "type": [ + "string", + "null" + ] + }, + "enabled": { + "type": "boolean" + }, + "freshness_seconds": { + "type": [ + "integer", + "null" + ], + "format": "int64" + }, + "gap": { + "type": "boolean" + }, + "group_id": { + "type": "string", + "format": "uuid" + }, + "id": { + "type": "string", + "format": "uuid" + }, + "intent": { + "type": "string" + }, + "name": { + "type": "string" + }, + "server_id": { + "type": [ + "string", + "null" + ], + "format": "uuid" + }, + "type": { + "type": "string" + }, + "updated_at": { + "type": "string" + } + } + }, "RetentionPolicy": { "type": "object", "description": "kopia `keep-*` retention policy. 
Org-minimum floors\n(`keep_daily ≥ 7, keep_weekly ≥ 4, keep_monthly ≥ 6`) are enforced by\n[`RetentionPolicy::validate_floor`] on create/update — unless the config\nopts out via its `allow_below_floor` flag (dangerous).", diff --git a/private-web/src/api-types.ts b/private-web/src/api-types.ts index 8b3807d0..0b138f06 100644 --- a/private-web/src/api-types.ts +++ b/private-web/src/api-types.ts @@ -1319,6 +1319,102 @@ export interface paths { patch?: never; trace?: never; }; + "/api/restore_replicas/consumers": { + parameters: { + query?: never; + header?: never; + path?: never; + cookie?: never; + }; + get?: never; + put?: never; + post: operations["restore_replicas_consumers"]; + delete?: never; + options?: never; + head?: never; + patch?: never; + trace?: never; + }; + "/api/restore_replicas/create": { + parameters: { + query?: never; + header?: never; + path?: never; + cookie?: never; + }; + get?: never; + put?: never; + post: operations["restore_replicas_create"]; + delete?: never; + options?: never; + head?: never; + patch?: never; + trace?: never; + }; + "/api/restore_replicas/delete": { + parameters: { + query?: never; + header?: never; + path?: never; + cookie?: never; + }; + get?: never; + put?: never; + post: operations["restore_replicas_delete"]; + delete?: never; + options?: never; + head?: never; + patch?: never; + trace?: never; + }; + "/api/restore_replicas/for_group": { + parameters: { + query?: never; + header?: never; + path?: never; + cookie?: never; + }; + get?: never; + put?: never; + post: operations["restore_replicas_for_group"]; + delete?: never; + options?: never; + head?: never; + patch?: never; + trace?: never; + }; + "/api/restore_replicas/list": { + parameters: { + query?: never; + header?: never; + path?: never; + cookie?: never; + }; + get?: never; + put?: never; + post: operations["restore_replicas_list"]; + delete?: never; + options?: never; + head?: never; + patch?: never; + trace?: never; + }; + "/api/restore_replicas/update": { + 
parameters: { + query?: never; + header?: never; + path?: never; + cookie?: never; + }; + get?: never; + put?: never; + post: operations["restore_replicas_update"]; + delete?: never; + options?: never; + head?: never; + patch?: never; + trace?: never; + }; "/api/server_groups/create": { parameters: { query?: never; @@ -2344,15 +2440,23 @@ export interface components { limit?: number | null; }; CreateArgs: { - name: string; - notes?: string; + /** Format: uuid */ + consumer_device_id: string; /** * Format: int64 - * @description Optional initial value (seconds) for the group's Slack open - * cooldown. Omit to let the database default apply. + * @description Max snapshot age before overdue, in whole seconds; `None` = latest only. */ - slack_open_delay?: number | null; - tags?: components["schemas"]["TagMap"]; + freshness_seconds?: number | null; + /** Format: uuid */ + group_id: string; + intent: string; + name: string; + /** + * Format: uuid + * @description `None` = all current servers in the group. + */ + server_id?: string | null; + type: string; }; CreateArtifactArgs: { artifact_type: string; @@ -2468,7 +2572,7 @@ export interface components { pem_data: string; }; /** @enum {string} */ - DeviceRole: "untrusted" | "admin" | "releaser" | "server"; + DeviceRole: "untrusted" | "admin" | "releaser" | "server" | "backup-restore"; EnrollmentStatus: { /** * Format: date-time @@ -2708,6 +2812,10 @@ export interface components { /** Format: uuid */ id: string; }; + IdArgs: { + /** Format: uuid */ + id: string; + }; IncidentData: { /** Format: date-time */ closed_at?: string | null; @@ -3314,6 +3422,42 @@ export interface components { * @enum {string} */ ResolvedReason: "fixed" | "wont_fix" | "expected" | "duplicate" | "flapping"; + /** + * @description A restore consumer (a `backup-restore` device) and the intents it currently + * supports — drives the declaration form's consumer and intent pickers. 
+ */ + RestoreConsumerView: { + /** Format: uuid */ + device_id: string; + intents: string[]; + name?: string | null; + }; + /** + * @description A declared replica for the operator UI. `gap` is true when the consumer does + * not currently advertise this declaration's intent, so Canopy is not + * dispatching it. + */ + RestoreReplicaView: { + /** Format: uuid */ + consumer_device_id: string; + consumer_name?: string | null; + created_at: string; + created_by?: string | null; + enabled: boolean; + /** Format: int64 */ + freshness_seconds?: number | null; + gap: boolean; + /** Format: uuid */ + group_id: string; + /** Format: uuid */ + id: string; + intent: string; + name: string; + /** Format: uuid */ + server_id?: string | null; + type: string; + updated_at: string; + }; /** * @description kopia `keep-*` retention policy. Org-minimum floors * (`keep_daily ≥ 7, keep_weekly ≥ 4, keep_monthly ≥ 6`) are enforced by @@ -6182,6 +6326,159 @@ export interface operations { }; }; }; + restore_replicas_consumers: { + parameters: { + query?: never; + header?: never; + path?: never; + cookie?: never; + }; + requestBody?: never; + responses: { + 200: { + headers: { + [name: string]: unknown; + }; + content: { + "application/json": components["schemas"]["RestoreConsumerView"][]; + }; + }; + }; + }; + restore_replicas_create: { + parameters: { + query?: never; + header?: never; + path?: never; + cookie?: never; + }; + requestBody: { + content: { + "application/json": components["schemas"]["CreateArgs"]; + }; + }; + responses: { + 200: { + headers: { + [name: string]: unknown; + }; + content: { + "application/json": components["schemas"]["RestoreReplicaView"]; + }; + }; + /** @description A matching declaration already exists. 
*/ + 409: { + headers: { + [name: string]: unknown; + }; + content: { + "application/json": components["schemas"]["ProblemDetailsSchema"]; + }; + }; + }; + }; + restore_replicas_delete: { + parameters: { + query?: never; + header?: never; + path?: never; + cookie?: never; + }; + requestBody: { + content: { + "application/json": components["schemas"]["IdArgs"]; + }; + }; + responses: { + 200: { + headers: { + [name: string]: unknown; + }; + content?: never; + }; + 404: { + headers: { + [name: string]: unknown; + }; + content: { + "application/json": components["schemas"]["ProblemDetailsSchema"]; + }; + }; + }; + }; + restore_replicas_for_group: { + parameters: { + query?: never; + header?: never; + path?: never; + cookie?: never; + }; + requestBody: { + content: { + "application/json": components["schemas"]["GroupArgs"]; + }; + }; + responses: { + 200: { + headers: { + [name: string]: unknown; + }; + content: { + "application/json": components["schemas"]["RestoreReplicaView"][]; + }; + }; + }; + }; + restore_replicas_list: { + parameters: { + query?: never; + header?: never; + path?: never; + cookie?: never; + }; + requestBody?: never; + responses: { + 200: { + headers: { + [name: string]: unknown; + }; + content: { + "application/json": components["schemas"]["RestoreReplicaView"][]; + }; + }; + }; + }; + restore_replicas_update: { + parameters: { + query?: never; + header?: never; + path?: never; + cookie?: never; + }; + requestBody: { + content: { + "application/json": components["schemas"]["UpdateArgs"]; + }; + }; + responses: { + 200: { + headers: { + [name: string]: unknown; + }; + content: { + "application/json": components["schemas"]["RestoreReplicaView"]; + }; + }; + 404: { + headers: { + [name: string]: unknown; + }; + content: { + "application/json": components["schemas"]["ProblemDetailsSchema"]; + }; + }; + }; + }; server_groups_create: { parameters: { query?: never; From 1e3ee66ab932a1337c8fdebcb805674c3baac3f3 Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?F=C3=A9lix=20Saparelli?= Date: Tue, 30 Jun 2026 14:40:46 +1200 Subject: [PATCH 5/7] feat(private-web): restore-replicas operator UI + e2e New /restore-replicas page: declarations table (scope, intent with gap chip, enable toggle, delete), consumers panel showing each backup-restore device's registered capabilities, and a declare dialog with consumer/group/server/type/intent pickers (intent options annotate unsupported choices). Nav entry + route. Adds backup-restore to the device trust picker (earlier commit). e2e: seedRestoreReplica + seedRestoreConsumerCapability helpers, restore tables added to resetSeededTables, and restore-replicas.spec.ts covering empty state, gap flagging, consumers panel, delete, enable toggle, and the declare dialog. Co-Authored-By: Claude Opus 4.8 --- private-web/e2e/restore-replicas.spec.ts | 172 ++++++++ private-web/e2e/seed.ts | 75 +++- private-web/src/App.tsx | 3 + private-web/src/routes/RestoreReplicas.tsx | 478 +++++++++++++++++++++ 4 files changed, 727 insertions(+), 1 deletion(-) create mode 100644 private-web/e2e/restore-replicas.spec.ts create mode 100644 private-web/src/routes/RestoreReplicas.tsx diff --git a/private-web/e2e/restore-replicas.spec.ts b/private-web/e2e/restore-replicas.spec.ts new file mode 100644 index 00000000..c63eaf96 --- /dev/null +++ b/private-web/e2e/restore-replicas.spec.ts @@ -0,0 +1,172 @@ +import { expect, test } from "./test-fixtures"; +import { + resetSeededTables, + seedDevice, + seedRestoreConsumerCapability, + seedRestoreReplica, + seedServer, + seedServerGroup, +} from "./seed"; + +// The e2e fixture runs the private-server in a debug build, so the Tailscale +// auth bypass treats every caller as `admin@localhost` (an admin). These specs +// exercise the operator-facing managed-restore UI. 
+ +test.describe("restore replicas", () => { + test.beforeEach(async ({ sql }) => { + await resetSeededTables(sql); + }); + + test("empty state shows the no-declarations banner", async ({ page }) => { + await page.goto("/restore-replicas"); + await expect( + page.getByText(/no restore replicas declared/i), + ).toBeVisible(); + }); + + test("a seeded declaration renders; an unsupported intent is flagged as a gap", async ({ + page, + sql, + }) => { + const consumer = await seedDevice(sql, { role: "backup-restore" }); + await seedRestoreConsumerCapability(sql, { + deviceId: consumer.id, + intents: ["verify"], + }); + const group = await seedServerGroup(sql, { name: "rr-group" }); + + // Supported intent — no gap. + await seedRestoreReplica(sql, { + consumerDeviceId: consumer.id, + groupId: group.id, + intent: "verify", + name: "verify-all", + }); + // Unsupported intent — gap. + await seedRestoreReplica(sql, { + consumerDeviceId: consumer.id, + groupId: group.id, + intent: "analytics", + name: "analytics-all", + }); + + await page.goto("/restore-replicas"); + + const verifyRow = page.getByRole("row", { name: /verify-all/ }); + const analyticsRow = page.getByRole("row", { name: /analytics-all/ }); + await expect(verifyRow).toBeVisible(); + await expect(analyticsRow).toBeVisible(); + // The unsupported declaration carries a gap chip; the supported one does not. + await expect(analyticsRow.getByText("gap")).toBeVisible(); + await expect(verifyRow.getByText("gap")).toHaveCount(0); + }); + + test("consumers panel lists the device and its capabilities", async ({ + page, + sql, + }) => { + const consumer = await seedDevice(sql, { role: "backup-restore" }); + await seedRestoreConsumerCapability(sql, { + deviceId: consumer.id, + intents: ["verify", "disaster-recovery"], + }); + + await page.goto("/restore-replicas"); + // The consumer's intents render as chips. 
+ await expect(page.getByText("verify").first()).toBeVisible(); + await expect(page.getByText("disaster-recovery").first()).toBeVisible(); + }); + + test("deleting a declaration removes it", async ({ page, sql }) => { + const consumer = await seedDevice(sql, { role: "backup-restore" }); + await seedRestoreConsumerCapability(sql, { + deviceId: consumer.id, + intents: ["verify"], + }); + const group = await seedServerGroup(sql, { name: "del-group" }); + await seedRestoreReplica(sql, { + consumerDeviceId: consumer.id, + groupId: group.id, + intent: "verify", + name: "doomed", + }); + + await page.goto("/restore-replicas"); + await expect(page.getByRole("row", { name: /doomed/ })).toBeVisible(); + await page.getByRole("button", { name: "delete doomed" }).click(); + await expect(page.getByRole("row", { name: /doomed/ })).toHaveCount(0); + + const rows = await sql.query<{ count: string }>( + "SELECT count(*) AS count FROM restore_replicas", + ); + expect(Number(rows[0]!.count)).toBe(0); + }); + + test("toggling enabled flips the row in the database", async ({ + page, + sql, + }) => { + const consumer = await seedDevice(sql, { role: "backup-restore" }); + await seedRestoreConsumerCapability(sql, { + deviceId: consumer.id, + intents: ["verify"], + }); + const group = await seedServerGroup(sql, { name: "tog-group" }); + const replica = await seedRestoreReplica(sql, { + consumerDeviceId: consumer.id, + groupId: group.id, + intent: "verify", + name: "togglable", + enabled: true, + }); + + await page.goto("/restore-replicas"); + await page + .getByRole("row", { name: /togglable/ }) + .locator('input[type="checkbox"]') + .click(); + + await expect + .poll(async () => { + const rows = await sql.query<{ enabled: boolean }>( + "SELECT enabled FROM restore_replicas WHERE id = $1", + [replica.id], + ); + return rows[0]?.enabled; + }) + .toBe(false); + }); + + test("declaring a replica through the dialog persists it", async ({ + page, + sql, + }) => { + const consumer = await 
seedDevice(sql, { role: "backup-restore" }); + await seedRestoreConsumerCapability(sql, { + deviceId: consumer.id, + intents: ["verify"], + }); + const group = await seedServerGroup(sql, { name: "create-group" }); + await seedServer(sql, { groupId: group.id, name: "srv-a" }); + + await page.goto("/restore-replicas"); + await page.getByRole("button", { name: /declare replica/i }).click(); + + await page.getByLabel("Consumer").click(); + await page.getByRole("option").first().click(); + await page.getByLabel("Group").click(); + await page.getByRole("option", { name: "create-group" }).click(); + await page.getByLabel("Name").fill("dialog-made"); + await page + .getByRole("button", { name: /^declare$/i }) + .click(); + + await expect( + page.getByRole("row", { name: /dialog-made/ }), + ).toBeVisible(); + const rows = await sql.query<{ name: string }>( + "SELECT name FROM restore_replicas WHERE name = 'dialog-made'", + ); + expect(rows).toHaveLength(1); + }); +}); diff --git a/private-web/e2e/seed.ts b/private-web/e2e/seed.ts index 988990da..e0b61428 100644 --- a/private-web/e2e/seed.ts +++ b/private-web/e2e/seed.ts @@ -47,7 +47,7 @@ function randomLabel(prefix: string): string { * statement with CASCADE. 
*/ export async function resetSeededTables(sql: Sql): Promise { await sql.query( - "TRUNCATE statuses, issues, device_keys, servers, server_groups, devices, versions, tailscale_users, server_group_backup_config, server_group_backup_schedule, server_backup_capabilities, backup_requests, backup_runs, backup_repo_stats, backup_maintenance_runs, backup_credential_issuances RESTART IDENTITY CASCADE", + "TRUNCATE statuses, issues, device_keys, servers, server_groups, devices, versions, tailscale_users, server_group_backup_config, server_group_backup_schedule, server_backup_capabilities, backup_requests, backup_runs, backup_repo_stats, backup_maintenance_runs, backup_credential_issuances, restore_replicas, restore_consumer_capabilities RESTART IDENTITY CASCADE", ); } @@ -573,3 +573,76 @@ export async function seedBackupRequest( ], ); } + +/** Register the intents a restore consumer (a `backup-restore` device) supports. */ +export async function seedRestoreConsumerCapability( + sql: Sql, + opts: { deviceId: string; intents: string[] }, +): Promise { + for (const intent of opts.intents) { + await sql.query( + `INSERT INTO restore_consumer_capabilities (consumer_device_id, intent) + VALUES ($1, $2)`, + [opts.deviceId, intent], + ); + } +} + +export interface SeededRestoreReplica { + id: string; +} + +/** Seed a declared restore replica. */ +export async function seedRestoreReplica( + sql: Sql, + opts: { + consumerDeviceId: string; + groupId: string; + /** Omit for a whole-group declaration. */ + serverId?: string | null; + type?: string; + intent?: string; + name?: string; + /** Whole seconds; omit for "latest only". */ + freshnessSeconds?: number | null; + enabled?: boolean; + }, +): Promise { + const id = randomUUID(); + const freshness = opts.freshnessSeconds ?? 
null; + if (freshness == null) { + await sql.query( + `INSERT INTO restore_replicas + (id, consumer_device_id, group_id, server_id, type, intent, name, enabled) + VALUES ($1, $2, $3, $4, $5, $6, $7, $8)`, + [ + id, + opts.consumerDeviceId, + opts.groupId, + opts.serverId ?? null, + opts.type ?? "tamanu-postgres", + opts.intent ?? "verify", + opts.name ?? randomLabel("replica"), + opts.enabled ?? true, + ], + ); + } else { + await sql.query( + `INSERT INTO restore_replicas + (id, consumer_device_id, group_id, server_id, type, intent, name, freshness, enabled) + VALUES ($1, $2, $3, $4, $5, $6, $7, make_interval(secs => $8), $9)`, + [ + id, + opts.consumerDeviceId, + opts.groupId, + opts.serverId ?? null, + opts.type ?? "tamanu-postgres", + opts.intent ?? "verify", + opts.name ?? randomLabel("replica"), + freshness, + opts.enabled ?? true, + ], + ); + } + return { id }; +} diff --git a/private-web/src/App.tsx b/private-web/src/App.tsx index 7ccc9caa..3d1baf87 100644 --- a/private-web/src/App.tsx +++ b/private-web/src/App.tsx @@ -17,6 +17,7 @@ import BackupConfig from "./routes/BackupConfig"; import BackupDefaults from "./routes/BackupDefaults"; import RecoveryVault from "./routes/RecoveryVault"; import BackupPanel from "./routes/BackupPanel"; +import RestoreReplicas from "./routes/RestoreReplicas"; import Bestool from "./routes/Bestool"; import BestoolSnippetDetail from "./routes/BestoolSnippetDetail"; import BestoolSnippets from "./routes/BestoolSnippets"; @@ -55,6 +56,7 @@ const BASE_NAV: NavItem[] = [ { label: "Servers", to: "/servers" }, { label: "Versions", to: "/versions" }, { label: "Devices", to: "/devices" }, + { label: "Restore", to: "/restore-replicas" }, { label: "Bestool", to: "/bestool" }, { label: "Settings", to: "/settings" }, ]; @@ -226,6 +228,7 @@ export default function App() { /> } /> + } /> }> = 1 ? 
`${hours}h` : `${seconds}s`; +} + +export default function RestoreReplicas() { + usePageTitle("Restore replicas"); + const [tick, setTick] = useState(0); + const reload = () => setTick((t) => t + 1); + + const replicas = useApi("restore_replicas", "list", {}, [tick]); + const consumers = useApi("restore_replicas", "consumers", {}, [tick]); + + const [createOpen, setCreateOpen] = useState(false); + const [error, setError] = useState(null); + + const onDelete = async (id: string) => { + try { + await callApi("restore_replicas", "delete", { id }); + reload(); + } catch (err) { + setError(formatError(err)); + } + }; + + const onToggle = async ( + id: string, + name: string, + freshnessSeconds: number | null | undefined, + enabled: boolean, + ) => { + try { + await callApi("restore_replicas", "update", { + id, + name, + freshness_seconds: freshnessSeconds ?? null, + enabled, + }); + reload(); + } catch (err) { + setError(formatError(err)); + } + }; + + return ( + + + + Restore replicas + + + + + + Canopy decides which replicas a restore consumer should keep. Each + declaration expands to one replica per matching server, restored from + the latest snapshot Canopy knows about. + + + {error && ( + setError(null)}> + {error} + + )} + + + + Declarations + + {replicas.status === "loading" || replicas.status === "idle" ? ( + + ) : replicas.status === "error" ? ( + {replicas.error.message} + ) : replicas.data.length === 0 ? ( + No restore replicas declared. + ) : ( + + + + + Name + Consumer + Scope + Type + Intent + Freshness + Enabled + Actions + + + + {replicas.data.map((r) => ( + + {r.name} + + {r.consumer_name ?? r.consumer_device_id.slice(0, 8)} + + + {r.server_id ? 
"one server" : "whole group"} + + {r.type} + + + {r.intent} + {r.gap && ( + + + + )} + + + {freshnessLabel(r.freshness_seconds)} + + + onToggle( + r.id, + r.name, + r.freshness_seconds, + e.target.checked, + ) + } + slotProps={{ + input: { "aria-label": `toggle ${r.name}` }, + }} + /> + + + onDelete(r.id)} + > + + + + + ))} + +
+
+ )} +
+ + + + Consumers + + {consumers.status === "ok" && consumers.data.length === 0 && ( + + No restore consumers. Promote a device to the{" "} + backup-restore role on its device page. + + )} + {consumers.status === "ok" && consumers.data.length > 0 && ( + + {consumers.data.map((c) => ( + + + {c.name ?? c.device_id} + + + {c.intents.length === 0 ? ( + + No capabilities registered yet. + + ) : ( + c.intents.map((i) => ( + + )) + )} + + + ))} + + )} + + + {createOpen && ( + setCreateOpen(false)} + onCreated={() => { + setCreateOpen(false); + reload(); + }} + consumers={ + consumers.status === "ok" ? consumers.data : [] + } + /> + )} +
+ ); +} + +interface ConsumerOption { + device_id: string; + name?: string | null; + intents: string[]; +} + +function CreateReplicaDialog({ + onClose, + onCreated, + consumers, +}: { + onClose: () => void; + onCreated: () => void; + consumers: ConsumerOption[]; +}) { + const groups = useApi("server_groups", "list"); + const typeDefaults = useApi("backups", "type_defaults"); + + const [consumerId, setConsumerId] = useState(""); + const [groupId, setGroupId] = useState(""); + const [serverId, setServerId] = useState(""); // "" = whole group + const [type, setType] = useState("tamanu-postgres"); + const [intent, setIntent] = useState("verify"); + const [name, setName] = useState(""); + const [freshnessHours, setFreshnessHours] = useState(""); + const [pending, setPending] = useState(false); + const [error, setError] = useState(null); + + const selectedConsumer = consumers.find((c) => c.device_id === consumerId); + const intentOptions = Array.from( + new Set([...(selectedConsumer?.intents ?? []), ...WELL_KNOWN_INTENTS]), + ); + + const onSubmit = async () => { + if (!consumerId) return setError("Pick a consumer"); + if (!groupId) return setError("Pick a group"); + if (!name.trim()) return setError("Name cannot be empty"); + const hours = freshnessHours.trim(); + const freshness_seconds = + hours === "" ? null : Math.round(Number(hours) * 3600); + if (freshness_seconds != null && !Number.isFinite(freshness_seconds)) { + return setError("Freshness must be a number of hours"); + } + setPending(true); + setError(null); + try { + await callApi("restore_replicas", "create", { + consumer_device_id: consumerId, + group_id: groupId, + server_id: serverId || null, + type, + intent, + name: name.trim(), + freshness_seconds, + }); + onCreated(); + } catch (err) { + setError(formatError(err)); + setPending(false); + } + }; + + const typeOptions = + typeDefaults.status === "ok" && typeDefaults.data.length > 0 + ? 
typeDefaults.data.map((t) => t.type) + : ["tamanu-postgres"]; + + return ( + !pending && onClose()} fullWidth maxWidth="sm"> + Declare restore replica + + + + Consumer + + + + + Group + + + + {groupId && ( + + )} + + + Type + + + + + Intent + + + + setName(e.target.value)} + /> + + setFreshnessHours(e.target.value)} + /> + + {error && {error}} + + + + + + + + ); +} + +function ServerScopeSelect({ + groupId, + value, + onChange, +}: { + groupId: string; + value: string; + onChange: (v: string) => void; +}) { + const detail = useApi( + "server_groups", + "get", + { server_group_id: groupId }, + [groupId], + ); + const servers = + detail.status === "ok" ? detail.data.servers.filter((s) => !s.archived) : []; + return ( + + Server + + + ); +} From 618a4e9737a8855d7eb919f9190713557af8af24 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?F=C3=A9lix=20Saparelli?= Date: Tue, 30 Jun 2026 15:06:31 +1200 Subject: [PATCH 6/7] test(restore): db model + public-server endpoint coverage database::restore: CRUD roundtrip, duplicate-scope 409 (server vs group scope separate), update/delete, authorizes (enabled/group/type/disabled), capability register replace semantics. public-server::restore: capability-filtered worklist, per-server expansion of a group-wide declaration, server-specific-over-group-wide dedup, empty-without-capabilities, restore-credentials 403 (no declaration) / 502 (authorized but STS unconfigured), and non-consumer-role rejection. Co-Authored-By: Claude Opus 4.8 --- crates/database/tests/restore.rs | 289 ++++++++++++++++++++++++ crates/public-server/tests/restore.rs | 308 ++++++++++++++++++++++++++ 2 files changed, 597 insertions(+) create mode 100644 crates/database/tests/restore.rs create mode 100644 crates/public-server/tests/restore.rs diff --git a/crates/database/tests/restore.rs b/crates/database/tests/restore.rs new file mode 100644 index 00000000..eb473801 --- /dev/null +++ b/crates/database/tests/restore.rs @@ -0,0 +1,289 @@ +//! 
DB-layer tests for the managed-restore models (`database::restore`). +//! Exercises the model helpers directly against a fresh migrated DB — no HTTP. + +use commons_errors::AppError; +use commons_tests::db::TestDb; +use commons_types::backup::{BackupType, RestoreIntent}; +use database::diesel_async::AsyncPgConnection; +use database::pg_duration::PgDuration; +use database::{NewRestoreReplica, RestoreConsumerCapability, RestoreReplica}; +use diesel::{sql_query, sql_types}; +use diesel_async::RunQueryDsl; +use jiff::SignedDuration; +use uuid::Uuid; + +#[derive(diesel::QueryableByName)] +struct RowId { + #[diesel(sql_type = sql_types::Uuid)] + id: Uuid, +} + +async fn insert_group(conn: &mut AsyncPgConnection, name: &str) -> Uuid { + sql_query("INSERT INTO server_groups (name) VALUES ($1) RETURNING id") + .bind::(name) + .get_result::(conn) + .await + .expect("insert group") + .id +} + +async fn insert_server(conn: &mut AsyncPgConnection, group_id: Uuid) -> Uuid { + let host = format!("http://test.invalid/{}", Uuid::new_v4()); + sql_query("INSERT INTO servers (host, kind, group_id) VALUES ($1, 'central', $2) RETURNING id") + .bind::(host) + .bind::(group_id) + .get_result::(conn) + .await + .expect("insert server") + .id +} + +async fn insert_consumer(conn: &mut AsyncPgConnection) -> Uuid { + sql_query("INSERT INTO devices (role) VALUES ('backup-restore') RETURNING id") + .get_result::(conn) + .await + .expect("insert device") + .id +} + +fn new_replica( + consumer: Uuid, + group: Uuid, + server: Option, + intent: RestoreIntent, + name: &str, +) -> NewRestoreReplica { + NewRestoreReplica { + consumer_device_id: consumer, + group_id: group, + server_id: server, + r#type: BackupType::TamanuPostgres, + intent, + name: name.into(), + freshness: None, + created_by: Some("op@example.com".into()), + } +} + +#[tokio::test(flavor = "multi_thread")] +async fn create_list_get_roundtrip() { + TestDb::run(|mut conn, _url| async move { + let consumer = insert_consumer(&mut 
conn).await; + let group = insert_group(&mut conn, "g").await; + + let created = RestoreReplica::create( + &mut conn, + new_replica(consumer, group, None, RestoreIntent::Verify, "verify-all"), + ) + .await + .expect("create"); + assert_eq!(created.name, "verify-all"); + assert_eq!(created.intent, RestoreIntent::Verify); + assert!(created.enabled, "new declarations default to enabled"); + assert_eq!(created.created_by.as_deref(), Some("op@example.com")); + + let got = RestoreReplica::get(&mut conn, created.id) + .await + .expect("get"); + assert_eq!(got.id, created.id); + + let all = RestoreReplica::list_all(&mut conn).await.expect("list_all"); + assert_eq!(all.len(), 1); + + let for_group = RestoreReplica::list_for_group(&mut conn, group) + .await + .expect("list_for_group"); + assert_eq!(for_group.len(), 1); + + let enabled = RestoreReplica::list_enabled_for_consumer(&mut conn, consumer) + .await + .expect("list_enabled"); + assert_eq!(enabled.len(), 1); + }) + .await; +} + +#[tokio::test(flavor = "multi_thread")] +async fn duplicate_scope_conflicts_but_server_scope_is_separate() { + TestDb::run(|mut conn, _url| async move { + let consumer = insert_consumer(&mut conn).await; + let group = insert_group(&mut conn, "g").await; + let server = insert_server(&mut conn, group).await; + + RestoreReplica::create( + &mut conn, + new_replica(consumer, group, None, RestoreIntent::Verify, "group-wide"), + ) + .await + .expect("group-wide"); + + // Same (consumer, group, type, intent) group-wide scope → 409. + let dup = RestoreReplica::create( + &mut conn, + new_replica(consumer, group, None, RestoreIntent::Verify, "dup"), + ) + .await; + assert!(matches!(dup, Err(AppError::Conflict(_))), "got {dup:?}"); + + // A server-scoped declaration for the same tuple is tracked separately. 
+ RestoreReplica::create( + &mut conn, + new_replica( + consumer, + group, + Some(server), + RestoreIntent::Verify, + "server-scoped", + ), + ) + .await + .expect("server-scoped coexists with group-wide"); + }) + .await; +} + +#[tokio::test(flavor = "multi_thread")] +async fn update_and_delete() { + TestDb::run(|mut conn, _url| async move { + let consumer = insert_consumer(&mut conn).await; + let group = insert_group(&mut conn, "g").await; + let r = RestoreReplica::create( + &mut conn, + new_replica(consumer, group, None, RestoreIntent::Verify, "n"), + ) + .await + .expect("create"); + + let updated = RestoreReplica::update( + &mut conn, + r.id, + "renamed", + Some(PgDuration(SignedDuration::from_secs(7200))), + false, + ) + .await + .expect("update"); + assert_eq!(updated.name, "renamed"); + assert!(!updated.enabled); + assert_eq!(updated.freshness.map(|f| f.0.as_secs()), Some(7200)); + + // Disabled declarations drop out of the consumer worklist basis. + let enabled = RestoreReplica::list_enabled_for_consumer(&mut conn, consumer) + .await + .expect("list_enabled"); + assert!(enabled.is_empty()); + + RestoreReplica::delete(&mut conn, r.id) + .await + .expect("delete"); + assert!(RestoreReplica::get(&mut conn, r.id).await.is_err()); + assert!( + RestoreReplica::delete(&mut conn, r.id).await.is_err(), + "deleting a missing declaration errors" + ); + }) + .await; +} + +#[tokio::test(flavor = "multi_thread")] +async fn authorizes_only_with_enabled_matching_declaration() { + TestDb::run(|mut conn, _url| async move { + let consumer = insert_consumer(&mut conn).await; + let group = insert_group(&mut conn, "g").await; + let other_group = insert_group(&mut conn, "other").await; + let tpg = BackupType::TamanuPostgres; + + assert!( + !RestoreReplica::authorizes(&mut conn, consumer, group, &tpg) + .await + .unwrap(), + "no declaration → not authorized" + ); + + let r = RestoreReplica::create( + &mut conn, + new_replica(consumer, group, None, RestoreIntent::Verify, "n"), + ) + 
.await + .expect("create"); + + assert!( + RestoreReplica::authorizes(&mut conn, consumer, group, &tpg) + .await + .unwrap(), + "enabled declaration → authorized" + ); + assert!( + !RestoreReplica::authorizes(&mut conn, consumer, other_group, &tpg) + .await + .unwrap(), + "different group → not authorized" + ); + assert!( + !RestoreReplica::authorizes(&mut conn, consumer, group, &BackupType::from("files")) + .await + .unwrap(), + "different type → not authorized" + ); + + // Disabling the only declaration revokes authorization. + RestoreReplica::update(&mut conn, r.id, "n", None, false) + .await + .expect("disable"); + assert!( + !RestoreReplica::authorizes(&mut conn, consumer, group, &tpg) + .await + .unwrap(), + "disabled declaration → not authorized" + ); + }) + .await; +} + +#[tokio::test(flavor = "multi_thread")] +async fn capability_register_replaces_set() { + TestDb::run(|mut conn, _url| async move { + let consumer = insert_consumer(&mut conn).await; + + RestoreConsumerCapability::register( + &mut conn, + consumer, + &[RestoreIntent::Verify, RestoreIntent::Analytics], + ) + .await + .expect("register"); + let mut got = RestoreConsumerCapability::list_for_consumer(&mut conn, consumer) + .await + .expect("list"); + got.sort_by_key(|i| i.to_string()); + assert_eq!(got, vec![RestoreIntent::Analytics, RestoreIntent::Verify]); + + // Re-register a different set: verify is kept, analytics dropped, + // disaster-recovery added. + RestoreConsumerCapability::register( + &mut conn, + consumer, + &[RestoreIntent::Verify, RestoreIntent::DisasterRecovery], + ) + .await + .expect("re-register"); + let mut got = RestoreConsumerCapability::list_for_consumer(&mut conn, consumer) + .await + .expect("list"); + got.sort_by_key(|i| i.to_string()); + assert_eq!( + got, + vec![RestoreIntent::DisasterRecovery, RestoreIntent::Verify] + ); + + // Empty set clears all capabilities. 
+ RestoreConsumerCapability::register(&mut conn, consumer, &[]) + .await + .expect("clear"); + let got = RestoreConsumerCapability::list_for_consumer(&mut conn, consumer) + .await + .expect("list"); + assert!(got.is_empty()); + }) + .await; +} diff --git a/crates/public-server/tests/restore.rs b/crates/public-server/tests/restore.rs new file mode 100644 index 00000000..fcf68230 --- /dev/null +++ b/crates/public-server/tests/restore.rs @@ -0,0 +1,308 @@ +//! HTTP tests for the managed-restore endpoints (backup-restore role). The +//! worklist/capability paths run against the standard harness (no STS/kube +//! needed); restore-credentials is covered for its authz (403) and the +//! authorized-but-unconfigured (502) paths. + +use diesel::{sql_query, sql_types}; +use diesel_async::{AsyncPgConnection, RunQueryDsl}; +use uuid::Uuid; + +async fn make_group(conn: &mut AsyncPgConnection) -> Uuid { + let id = Uuid::new_v4(); + sql_query("INSERT INTO server_groups (id, name) VALUES ($1, 'restore-test-group')") + .bind::(id) + .execute(conn) + .await + .expect("insert group"); + id +} + +async fn make_config(conn: &mut AsyncPgConnection, group_id: Uuid, status: &str) { + sql_query( + "INSERT INTO server_group_backup_config \ + (group_id, bucket, prefix, target_role_arn, maintenance_role_arn, region, repo_password_ref, status) \ + VALUES ($1, 'grp-bucket', '', 'arn:aws:iam::123456789012:role/grp', 'arn:aws:iam::123456789012:role/grp-maint', 'ap-southeast-2', 'grp-repo-pw', $2)", + ) + .bind::(group_id) + .bind::(status) + .execute(conn) + .await + .expect("insert config"); +} + +async fn make_server(conn: &mut AsyncPgConnection, group_id: Uuid) -> Uuid { + let server_id = Uuid::new_v4(); + let host = format!("https://srv-{server_id}.example.com"); + sql_query("INSERT INTO servers (id, host, kind, group_id) VALUES ($1, $2, 'central', $3)") + .bind::(server_id) + .bind::(host) + .bind::(group_id) + .execute(conn) + .await + .expect("insert server"); + server_id +} + +/// A 
successful `backup` run = the snapshot the worklist should surface. +async fn make_success_run( + conn: &mut AsyncPgConnection, + device_id: Uuid, + group_id: Uuid, + server_id: Uuid, + snapshot_id: &str, +) { + sql_query( + "INSERT INTO backup_runs (id, device_id, group_id, server_id, type, purpose, outcome, snapshot_id) \ + VALUES ($1, $2, $3, $4, 'tamanu-postgres', 'backup', 'success', $5)", + ) + .bind::(Uuid::new_v4()) + .bind::(device_id) + .bind::(group_id) + .bind::(server_id) + .bind::(snapshot_id) + .execute(conn) + .await + .expect("insert run"); +} + +async fn declare_replica( + conn: &mut AsyncPgConnection, + consumer: Uuid, + group_id: Uuid, + intent: &str, +) { + sql_query( + "INSERT INTO restore_replicas (consumer_device_id, group_id, type, intent, name) \ + VALUES ($1, $2, 'tamanu-postgres', $3, $4)", + ) + .bind::(consumer) + .bind::(group_id) + .bind::(intent) + .bind::(format!("{intent}-decl")) + .execute(conn) + .await + .expect("insert declaration"); +} + +async fn declare_replica_server( + conn: &mut AsyncPgConnection, + consumer: Uuid, + group_id: Uuid, + server_id: Uuid, + intent: &str, +) { + sql_query( + "INSERT INTO restore_replicas (consumer_device_id, group_id, server_id, type, intent, name) \ + VALUES ($1, $2, $3, 'tamanu-postgres', $4, $5)", + ) + .bind::(consumer) + .bind::(group_id) + .bind::(server_id) + .bind::(intent) + .bind::(format!("{intent}-server-decl")) + .execute(conn) + .await + .expect("insert server declaration"); +} + +#[tokio::test(flavor = "multi_thread")] +async fn capabilities_register_then_worklist_filters_by_intent() { + commons_tests::server::run_with_device_auth( + "backup-restore", + async |mut conn, cert, device_id, public, _| { + let group = make_group(&mut conn).await; + make_config(&mut conn, group, "ready").await; + let server = make_server(&mut conn, group).await; + make_success_run(&mut conn, device_id, group, server, "snap-1").await; + + // Two whole-group declarations, different intents. 
+ declare_replica(&mut conn, device_id, group, "verify").await; + declare_replica(&mut conn, device_id, group, "analytics").await; + + // Register only `verify`. + public + .post("/restore-capabilities") + .add_header("mtls-certificate", &cert) + .json(&serde_json::json!({ "intents": ["verify"] })) + .await + .assert_status(http::StatusCode::NO_CONTENT); + + let resp = public + .get("/restore-worklist") + .add_header("mtls-certificate", &cert) + .await; + resp.assert_status_ok(); + let entries: Vec = resp.json(); + // Only the `verify` declaration is dispatched; `analytics` is a gap. + assert_eq!(entries.len(), 1, "got {entries:?}"); + assert_eq!(entries[0]["intent"], "verify"); + assert_eq!(entries[0]["server_id"], server.to_string()); + assert_eq!(entries[0]["snapshot_id"], "snap-1"); + assert_eq!(entries[0]["bucket"], "grp-bucket"); + }, + ) + .await; +} + +#[tokio::test(flavor = "multi_thread")] +async fn worklist_expands_group_wide_to_each_server() { + commons_tests::server::run_with_device_auth( + "backup-restore", + async |mut conn, cert, device_id, public, _| { + let group = make_group(&mut conn).await; + make_config(&mut conn, group, "ready").await; + let server_a = make_server(&mut conn, group).await; + let server_b = make_server(&mut conn, group).await; + make_success_run(&mut conn, device_id, group, server_a, "snap-a").await; + make_success_run(&mut conn, device_id, group, server_b, "snap-b").await; + declare_replica(&mut conn, device_id, group, "verify").await; + public + .post("/restore-capabilities") + .add_header("mtls-certificate", &cert) + .json(&serde_json::json!({ "intents": ["verify"] })) + .await + .assert_status(http::StatusCode::NO_CONTENT); + + let resp = public + .get("/restore-worklist") + .add_header("mtls-certificate", &cert) + .await; + resp.assert_status_ok(); + let entries: Vec = resp.json(); + // One whole-group declaration → one entry per live server, each with + // its own latest snapshot. 
+ assert_eq!(entries.len(), 2, "got {entries:?}"); + let mut by_server: std::collections::HashMap = entries + .iter() + .map(|e| { + ( + e["server_id"].as_str().unwrap().to_owned(), + e["snapshot_id"].as_str().unwrap().to_owned(), + ) + }) + .collect(); + assert_eq!( + by_server.remove(&server_a.to_string()).as_deref(), + Some("snap-a") + ); + assert_eq!( + by_server.remove(&server_b.to_string()).as_deref(), + Some("snap-b") + ); + }, + ) + .await; +} + +#[tokio::test(flavor = "multi_thread")] +async fn worklist_dedupes_server_specific_over_group_wide() { + commons_tests::server::run_with_device_auth( + "backup-restore", + async |mut conn, cert, device_id, public, _| { + let group = make_group(&mut conn).await; + make_config(&mut conn, group, "ready").await; + let server = make_server(&mut conn, group).await; + make_success_run(&mut conn, device_id, group, server, "snap-1").await; + // Both a whole-group and a server-specific declaration of the same + // (type, intent) cover this server. + declare_replica(&mut conn, device_id, group, "verify").await; + declare_replica_server(&mut conn, device_id, group, server, "verify").await; + public + .post("/restore-capabilities") + .add_header("mtls-certificate", &cert) + .json(&serde_json::json!({ "intents": ["verify"] })) + .await + .assert_status(http::StatusCode::NO_CONTENT); + + let resp = public + .get("/restore-worklist") + .add_header("mtls-certificate", &cert) + .await; + resp.assert_status_ok(); + let entries: Vec = resp.json(); + // Deduped to a single entry for the server, not two. 
+ assert_eq!(entries.len(), 1, "got {entries:?}"); + assert_eq!(entries[0]["server_id"], server.to_string()); + }, + ) + .await; +} + +#[tokio::test(flavor = "multi_thread")] +async fn worklist_empty_without_registered_capabilities() { + commons_tests::server::run_with_device_auth( + "backup-restore", + async |mut conn, cert, device_id, public, _| { + let group = make_group(&mut conn).await; + make_config(&mut conn, group, "ready").await; + make_server(&mut conn, group).await; + declare_replica(&mut conn, device_id, group, "verify").await; + + // No capabilities registered → nothing dispatched. + let resp = public + .get("/restore-worklist") + .add_header("mtls-certificate", &cert) + .await; + resp.assert_status_ok(); + let entries: Vec = resp.json(); + assert!(entries.is_empty(), "got {entries:?}"); + }, + ) + .await; +} + +#[tokio::test(flavor = "multi_thread")] +async fn restore_credentials_without_declaration_is_403() { + commons_tests::server::run_with_device_auth( + "backup-restore", + async |mut conn, cert, _device_id, public, _| { + let group = make_group(&mut conn).await; + make_config(&mut conn, group, "ready").await; + let resp = public + .post("/restore-credentials") + .add_header("mtls-certificate", &cert) + .json(&serde_json::json!({ "group": group, "type": "tamanu-postgres" })) + .await; + resp.assert_status(http::StatusCode::FORBIDDEN); + }, + ) + .await; +} + +#[tokio::test(flavor = "multi_thread")] +async fn restore_credentials_authorized_but_unconfigured_is_502() { + commons_tests::server::run_with_device_auth( + "backup-restore", + async |mut conn, cert, device_id, public, _| { + let group = make_group(&mut conn).await; + make_config(&mut conn, group, "ready").await; + declare_replica(&mut conn, device_id, group, "verify").await; + + // Authorization passes; the harness has no STS client, so issuance + // fails upstream rather than 403. 
+ let resp = public + .post("/restore-credentials") + .add_header("mtls-certificate", &cert) + .json(&serde_json::json!({ "group": group, "type": "tamanu-postgres" })) + .await; + resp.assert_status(http::StatusCode::BAD_GATEWAY); + }, + ) + .await; +} + +#[tokio::test(flavor = "multi_thread")] +async fn restore_endpoints_reject_non_consumer_role() { + // A `server`-role device cannot reach the backup-restore endpoints. + commons_tests::server::run_with_device_auth( + "server", + async |_conn, cert, _device_id, public, _| { + let resp = public + .get("/restore-worklist") + .add_header("mtls-certificate", &cert) + .await; + resp.assert_status(http::StatusCode::FORBIDDEN); + }, + ) + .await; +} From e36c5e66d8b89e7535fdc06b6edd5c204dd668f7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?F=C3=A9lix=20Saparelli?= Date: Tue, 30 Jun 2026 16:32:06 +1200 Subject: [PATCH 7/7] docs(restore): clarify freshness is restore-cadence, not backup interval --- .workhorse/specs/public-server/restore-replicas.md | 9 ++++++++- crates/database/src/restore.rs | 5 +++-- crates/public-server/openapi.json | 2 +- crates/public-server/src/restore.rs | 5 +++-- .../2026-06-30-021427-0000_restore_replicas/up.sql | 5 +++-- 5 files changed, 18 insertions(+), 8 deletions(-) diff --git a/.workhorse/specs/public-server/restore-replicas.md b/.workhorse/specs/public-server/restore-replicas.md index c76d5a36..ebd0294f 100644 --- a/.workhorse/specs/public-server/restore-replicas.md +++ b/.workhorse/specs/public-server/restore-replicas.md @@ -78,7 +78,7 @@ Each declaration carries: - a **server** within the group, or all servers in the group when none is named; - an **intent** describing what the replica is for; - a human-readable **name**; -- a **freshness** bound: the maximum age of the restored snapshot before the replica is considered out of date and should be refreshed or re-verified; +- a **freshness** bound: the maximum time the replica may go without a fresh successful restore before it is considered 
overdue — a bound on the consumer's *restore* cadence, deliberately independent of how often backups are produced (see "Latest state, not a queue" below); - whether the declaration is **enabled**. Intent is an open set; unrecognised intents are preserved verbatim rather than rejected, so a consumer may advertise intents Canopy does not model. @@ -108,6 +108,13 @@ Canopy expands the consumer's enabled declarations — those whose intent the co The worklist does not carry credentials or the repo password. The consumer reconciles the worklist against what it is actually running — creating, refreshing, and tearing down replicas to match — and is responsible for converging on the desired state over time. +### Latest state, not a queue + +Each entry names the *latest* snapshot for its `(server, type)`, not a backlog to drain. +A consumer restores on its own cadence and skips the intermediate snapshots produced since its last restore; restoring less often than backups are produced is expected, not a failure. +A restore can take far longer than the interval between backups — the data is slow to download and restore, and a persistent replica may be held up while its workload runs — so the consumer's restore cadence is independent of, and typically much slower than, the backup cadence. +Consequently, a replica's **freshness** bound should be set to cover the consumer's restore cycle (download, restore, and any hold), not the backup interval: setting it to the backup interval would alert continuously even when restores are keeping pace as designed. + ### Snapshot authority The snapshot Canopy hands out for a `(server, type)` is the snapshot identifier of that server's most recent successful backup run of that type. 
diff --git a/crates/database/src/restore.rs b/crates/database/src/restore.rs index 7675ce4a..18d34bad 100644 --- a/crates/database/src/restore.rs +++ b/crates/database/src/restore.rs @@ -36,8 +36,9 @@ pub struct RestoreReplica { #[schema(value_type = String)] pub intent: RestoreIntent, pub name: String, - /// Max age of the restored snapshot before the replica is overdue, in - /// whole seconds; `None` = always track the latest snapshot. + /// Max time since the last healthy restore before the replica is overdue + /// — the consumer's *restore* cadence (download + restore + any hold), not + /// the backup interval. In whole seconds; `None` = no overdue bound. #[schema(value_type = Option)] pub freshness: Option, pub enabled: bool, diff --git a/crates/public-server/openapi.json b/crates/public-server/openapi.json index e7301488..d0997ddb 100644 --- a/crates/public-server/openapi.json +++ b/crates/public-server/openapi.json @@ -1933,7 +1933,7 @@ "null" ], "format": "int64", - "description": "Max snapshot age before the replica is overdue, in whole seconds;\n`None` = always track the latest." + "description": "Max time since the last healthy restore before overdue, in whole seconds\n— the consumer's restore cadence, not the backup interval; `None` = no\noverdue bound." }, "group_id": { "type": "string", diff --git a/crates/public-server/src/restore.rs b/crates/public-server/src/restore.rs index 3dfe9e9b..89c0be08 100644 --- a/crates/public-server/src/restore.rs +++ b/crates/public-server/src/restore.rs @@ -96,8 +96,9 @@ pub struct WorklistEntry { #[schema(value_type = String)] pub intent: RestoreIntent, pub name: String, - /// Max snapshot age before the replica is overdue, in whole seconds; - /// `None` = always track the latest. + /// Max time since the last healthy restore before overdue, in whole seconds + /// — the consumer's restore cadence, not the backup interval; `None` = no + /// overdue bound. 
pub freshness_seconds: Option, /// The snapshot Canopy wants restored — the latest successful backup for /// this `(server, type)`. `None` when no successful backup is yet known. diff --git a/migrations/2026-06-30-021427-0000_restore_replicas/up.sql b/migrations/2026-06-30-021427-0000_restore_replicas/up.sql index 8a1919a9..4127f0db 100644 --- a/migrations/2026-06-30-021427-0000_restore_replicas/up.sql +++ b/migrations/2026-06-30-021427-0000_restore_replicas/up.sql @@ -15,8 +15,9 @@ CREATE TABLE restore_replicas ( type TEXT NOT NULL, intent TEXT NOT NULL, name TEXT NOT NULL, - -- Max age of the restored snapshot before the replica is overdue; NULL = - -- always track the latest snapshot. + -- Max time since the last healthy restore before the replica is overdue — + -- the consumer's restore cadence (download + restore + any hold), not the + -- backup interval; NULL = no overdue bound. freshness INTERVAL, enabled BOOLEAN NOT NULL DEFAULT TRUE, created_by TEXT,