From 46adbd9d632e9ff6723c43cdccd45a0d8e5d1a05 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?F=C3=A9lix=20Saparelli?= <felix@bes.au>
Date: Tue, 30 Jun 2026 12:06:44 +1200
Subject: [PATCH 1/7] docs(restore): evaluate pgro handoff; add
 restore-replicas spec + canopy response
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Copy pgro's restore-verification handoff into docs/plans/.
- Add .workhorse/specs/public-server/restore-replicas.md (RST): canopy as
  restore control plane — operator-declared replicas, worklist-driven
  executor, per-server targeting + restore-health.
- Add docs/plans/pgro-restore-replicas-canopy-response.md: conformance
  verdict + the control-model inversion, for pgro sign-off.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 .../specs/public-server/restore-replicas.md   | 147 ++++++
 .../pgro-restore-replicas-canopy-response.md  | 124 +++++
 .../pgro-restore-verification-handoff.md      | 483 ++++++++++++++++++
 3 files changed, 754 insertions(+)
 create mode 100644 .workhorse/specs/public-server/restore-replicas.md
 create mode 100644 docs/plans/pgro-restore-replicas-canopy-response.md
 create mode 100644 docs/plans/pgro-restore-verification-handoff.md

diff --git a/.workhorse/specs/public-server/restore-replicas.md b/.workhorse/specs/public-server/restore-replicas.md
new file mode 100644
index 00000000..aa6991ef
--- /dev/null
+++ b/.workhorse/specs/public-server/restore-replicas.md
@@ -0,0 +1,147 @@
+---
+id: RST
+---
+
+# Managed restore replicas
+
+Canopy is the control plane for a fleet's *managed restore replicas*: standing replicas that Canopy decides should exist and keeps restored from the latest backups, driven through a restore consumer.
+An external restore consumer — first-party infrastructure that restores backups into working Postgres replicas — is driven entirely by Canopy: Canopy declares which replicas should exist, hands out the snapshot to restore and short-lived read-only credentials for each, and records the restorability of every replica as the strongest backup-health signal.
+
+## Scope
+
+This spec covers *managed* restore replicas only: the standing replicas Canopy decides should exist and keeps current, and the restore-health signal they produce.
+
+It does not cover an operator restoring a backup by hand.
+An operator performing disaster recovery or an ad-hoc restore selects a specific snapshot for a specific server and restores it through that server's own device tooling and credentials — the existing per-server restore path, unchanged by this spec.
+That path is operator-driven and server-scoped: the operator chooses what to restore and where, and Canopy only issues the read-only credentials and snapshot information for that one server.
+Managed replicas are the opposite mode: Canopy chooses what should be restored, continuously, with no operator selecting each one.
+The two modes share Canopy's read-only credential issuance and snapshot authority; they differ in who decides what gets restored.
+
+## Why it exists
+
+A backup is only as good as its last successful restore.
+Producing snapshots (a device backed up) and confirming they landed in the repo (a snapshot exists) are weaker guarantees than actually restoring one into a live database.
+Canopy already knows every group, every server, every backup type, and the latest snapshot for each — so it is the natural authority on *what should be restored*.
+Centralising that decision in Canopy eliminates the long-lived AWS keys a restore consumer would otherwise hold, makes the restore consumer a stateless executor of Canopy's intent, and closes the lifecycle loop end-to-end: produced, persisted, restorable.
+
+## Actors
+
+A **restore consumer** is first-party infrastructure that restores backups and reports their health.
+It holds no standing access to any backup repo and stores no list of what to restore: it asks Canopy what replicas should exist, restores them, and reports back.
+It owns only the mechanics of restoration — how a replica is provisioned, where it runs, how much storage it gets, when it is torn down.
+
+An **operator** declares, through Canopy, which replicas should exist and why.
+
+Canopy owns the *what* and the *why* (which group, which server, which type, to what end, how fresh) and the *authority* (which snapshot, which credentials, is it restorable).
+The consumer owns the *how*.
+This boundary is load-bearing: Canopy never models a consumer's runtime placement, and a consumer never decides on its own what to restore.
+
+## Identity and authorization
+
+A restore consumer authenticates as a single device holding the `backup-restore` role.
+The role is generic: any future restore consumer uses the same role with its own declared replicas.
+A `backup-restore` device has no implicit server and no implicit group; it is not a member of any group it reads.
+
+The role is read-only by contract, enforced at the API:
+
+- A `backup-restore` caller requesting backup (write) credentials is rejected.
+  The read-only guarantee is server-enforced, so a compromised consumer cannot pivot to writing or poisoning a repo.
+- A `backup-restore` caller may obtain credentials and the worklist only for a `(group, type)` it has been authorised for.
+
+Authorization is the set of declared replicas (below): a consumer is authorised for exactly the `(group, type)` pairs that appear in its enabled replica declarations.
+There is no separate grant object — declaring a replica *is* the authorization to read what that replica needs.
+
+A device reaches this role through one-off operator promotion, the same path a release-publishing device uses; no fleet-enrolment flow is involved.
+Either transport Canopy already accepts for devices — tailnet identity or a client certificate — satisfies the role; the role, not the transport, is the contract.
+
+## Declared replicas
+
+An operator declares replicas against Canopy.
+Each declaration carries:
+
+- the **group** whose repo holds the backups;
+- the **type** of backup to restore;
+- a **server** within the group, or all servers in the group when none is named;
+- an **intent** describing what the replica is for;
+- a human-readable **name**;
+- a **freshness** bound: the maximum age of the restored snapshot before the replica is considered out of date and should be refreshed or re-verified;
+- whether the declaration is **enabled**.
+
+Intent is an open set; unrecognised intents are preserved verbatim rather than rejected, so a consumer may advertise intents Canopy does not model.
+The well-known intents are:
+
+- **verify** — a transient replica restored solely to prove the snapshot is restorable, then discarded; re-run on the freshness cadence.
+- **analytics** — a persistent replica kept running for querying, refreshed to the latest snapshot on the freshness cadence.
+- **disaster-recovery** — a periodic rehearsal of the full recovery path: a replica restored the way a real recovery would be, checked as a viable stand-in for the server, then discarded. It is the managed, automated counterpart to the operator-driven recovery in [Scope](#scope), not the recovery event itself.
+
+A declaration scoped to a whole group expands to one replica per current server in that group.
+Servers joining or leaving a group change what the consumer is asked to maintain, with no per-server operator action.
+
+Declarations are managed through the operator interface (create, edit, enable/disable, delete) and are audited.
+Deleting a declaration stops the consumer being asked to maintain that replica and revokes its authorization for that `(group, type)` if no other declaration covers it; recorded restore-health history is retained.
+
+## The worklist
+
+A restore consumer fetches its complete desired state from Canopy in one request, scoped to the calling consumer.
+Canopy expands the consumer's enabled declarations against the current servers and the latest known snapshot for each, and returns one entry per concrete replica:
+
+- the declaration's identifier, group, server, type, intent, name, and freshness;
+- the **snapshot to restore**: the snapshot identifier and its timestamp, or empty when no successful backup is yet known for that server and type;
+- the repo coordinates needed to locate the backups (storage, bucket, prefix, region).
+
+The worklist does not carry credentials or the repo password.
+The consumer reconciles the worklist against what it is actually running — creating, refreshing, and tearing down replicas to match — and is responsible for converging on the desired state over time.
+
+### Snapshot authority
+
+The snapshot Canopy hands out for a `(server, type)` is the snapshot identifier of that server's most recent successful backup run of that type.
+This is the same snapshot the operator interface shows as the server's latest.
+Canopy's independent repo inventory corroborates the snapshot's existence and timestamp; it is not currently the source of the identifier.
+
+## Credentials
+
+A consumer obtains credentials per `(group, type)` as it works, not for the whole fleet at once.
+Canopy verifies the caller has an enabled declaration covering that `(group, type)`, then issues:
+
+- short-lived read-only object-storage credentials scoped to the group's repo;
+- the repo password.
+
+The credentials permit reading the repo and nothing else; they cannot write, overwrite, or delete.
+Each issuance is audited.
+Absence of a covering declaration is a definitive refusal, not a transient error, and a consumer surfaces it as a clear failure for the operator to diagnose by inspecting the declaration in Canopy.
+
+The 1-hour lifetime of an issued credential does not bound restore duration: a consumer refreshes credentials as needed across a long restore.
+
+## Restore-health reporting
+
+A consumer reports the outcome of each replica back to Canopy.
+A report carries:
+
+- the declaration, group, server, and type it concerns;
+- the **snapshot** that was restored, joining the report to the produced-and-persisted record for that snapshot;
+- the **outcome** — restored-and-healthy, or failed — and, on failure, an error description;
+- whether the restored database came up healthy, and its Postgres major version;
+- when the restore was observed;
+- the object-storage traffic the restore moved.
+
+Restored-and-healthy means the snapshot restored, the database started, and the consumer's readiness checks passed — a stronger statement than a snapshot merely existing.
+A failure covers any stage: the restore itself, the database failing to come up, or a readiness check failing.
+
+Reports are retained indefinitely as an audit trail.
+
+## Alerting
+
+A failed or overdue restore-health report is a group-level incident that pages regardless of any individual server's monitoring state, because an unrestorable backup is a control-plane and data-safety concern, not one server's operational noise.
+
+A failure raises a group-scoped restore-verification alert identifying the affected server and snapshot.
+Each server's restore-health is tracked independently, so one server's failed restore does not mask or merge with another's.
+The alert recovers when that server's next report for the same type is healthy.
+
+A replica with no healthy report within its freshness bound is overdue and raises the same alert; Canopy detects this on a periodic sweep rather than waiting for a report that never arrives.
+
+## Out of scope
+
+- How a consumer provisions, runs, names, or tears down a replica.
+- A consumer's runtime placement, storage sizing, or scheduling.
+- Scoping object-storage credentials below the granularity of a group's repo: one repo holds all of a group's servers' snapshots, so credentials are necessarily group-wide while targeting and reporting are per-server.
+- Longer-lived or non-chained credentials: a consumer refreshes within a restore, so the per-issuance lifetime is not a constraint.
diff --git a/docs/plans/pgro-restore-replicas-canopy-response.md b/docs/plans/pgro-restore-replicas-canopy-response.md
new file mode 100644
index 00000000..59e8ef8b
--- /dev/null
+++ b/docs/plans/pgro-restore-replicas-canopy-response.md
@@ -0,0 +1,124 @@
+# Canopy response to the PGRO restore-verification handoff
+
+**From:** canopy
+**To:** pgro
+**Re:** `pgro/docs/canopy-handoff.md` (copied here as
+`pgro-restore-verification-handoff.md`)
+**Status:** needs pgro sign-off on the control-model inversion (§3) before
+canopy freezes wire shapes and before pgro builds. Canopy will build its side
+against the model below; pgro adopting the inverted executor model is pgro's
+call.
+
+The spec for canopy's side is `.workhorse/specs/public-server/restore-replicas.md`.
+
+---
+
+## 1. The handoff is conformant
+
+Every load-bearing claim about canopy's current code checks out: the tailnet
+node-identity auth and `TAILSCALE_REQUIRED_TAG` gate, the role-gating
+extractor macro (note: an `admin` device passes every role-gated route), the
+`securitySchemes` block, the `/backup-credentials` + `/backup-target` +
+`/backup-report` handlers and their request/response shapes, the
+session-policy → per-bucket STS role → repo-password flow, the `backup_runs` /
+`backup_repo_snapshots` schemas, and the per-server-vs-group alerting split.
+There are three small inaccuracies, none of which changes the design:
+
+- Device roles are four, not three (`untrusted` is the auto-created pre-trust
+  state). The role column is plain `TEXT` with no `CHECK`, so adding
+  `backup-restore` is a code change, not a schema migration. "Cert minting"
+  for the role is just operator trust-promotion (the releaser model) — no new
+  enrolment machinery, exactly as you guessed.
+- §4.4.1 reason 7 is wrong: `run_id` is *not* shared between
+  `/backup-credentials` and `/backup-report`; the issuance audit row carries
+  no `run_id`. The "don't reuse `/backup-report`" conclusion still holds on the
+  other six reasons.
+- The group-level alerting plumbing you call "concrete in PR #225" is already
+  merged: `raise_group_event` exists and `restore-verification` is already a
+  defined alert ref. The alert side is one call.
+
+## 2. Two corrections that changed the wire shapes
+
+These came out of review and both made it into the model below:
+
+- **Canopy supplies the snapshot id.** Canopy already knows the latest snapshot
+  per `(server, type)` (the latest successful `backup_runs` row). You should
+  not list the repo to discover what to restore — canopy hands it to you.
+- **Restore is per-server, not per-group.** A group holds many servers, each
+  with its own snapshots inside the one shared per-group repo. Credentials are
+  necessarily group-wide (one kopia repo per group bucket), but *targeting* and
+  *health reporting* are per-server. `backup_restore_checks` and the
+  restore-health report carry `server_id`.
+
+## 3. The inversion: canopy drives, pgro executes
+
+This is the part that needs your sign-off, because it changes pgro's
+architecture. Rather than pgro statically defining what it restores (a
+CRD-defined list of groups/servers) and pulling per-group, **canopy becomes
+the source of truth for which replicas should exist, and pgro reconciles
+against it.**
+
+- An operator declares **replicas** in canopy: `(group, [server | all], type,
+  intent, name, freshness)`. The declaration is both the work item and the
+  authorization — there is no separate grant object.
+- pgro fetches its **entire desired state in one call** —
+  `GET /restore-worklist`, scoped to the calling consumer — and gets one entry
+  per concrete replica: declaration id, group, server, type, intent, freshness,
+  the snapshot to restore (`{snapshot_id, snapshot_at}` or empty), and the repo
+  coordinates.
+- pgro **reconciles**: create / refresh / tear down replicas to match the
+  worklist, fetching `POST /restore-credentials {group, type}` per group as it
+  goes.
+- pgro **reports health** per replica: `POST /restore-verification` with the
+  declaration, group, server, type, restored snapshot, outcome, replica
+  health, Postgres version, and S3 traffic.
+
+**The boundary:** canopy owns *what / why / how-fresh*; pgro owns *how* —
+provisioning, placement, storage sizing, scheduling, teardown. Canopy never
+models your runtime; you never decide what to restore.
+
+**Intents** are an open set (`verify`, `analytics`, `disaster-recovery`, plus
+anything you advertise). `verify` is transient (restore, prove, discard, re-run
+on cadence); `analytics` is a persistent replica refreshed to latest on
+cadence; `disaster-recovery` is a rehearsal. If having canopy model your
+*analytics/DR* replicas (not just verification) is more centralisation
+than you want, say so — that is exactly the boundary this sign-off is
+about.
+
+What pgro keeps from the original handoff: one canopy device, promoted once;
+read-only by contract (write creds rejected at the API for the role);
+best-effort reporting that never blocks restore progress; no `consumer_instance`
+(one device, per-replica audit lives in your own records).
+
+## 4. Endpoint surface (shapes to be frozen on sign-off)
+
+- `GET  /restore-worklist` → desired replicas (expanded per server) + per-group
+  repo coordinates + the snapshot to restore for each.
+- `POST /restore-credentials {group, type}` → short-lived read-only creds +
+  repo password. Authorized iff an enabled declaration covers `(group, type)`.
+  `purpose=backup` rejected for this role.
+- `POST /restore-verification {replica, group, server, type, snapshot_id,
+  outcome, error?, replica_healthy, postgres_version?, observed_at, s3_*}` →
+  per-server restore-health; 204 on success.
+
+## 5. Appendix A (bestool) deltas
+
+The original A.2/A.3 (`restore_credentials`, `restore_target`) are replaced by
+a worklist fetch plus per-group `restore_credentials`; `restore_target`
+collapses into the worklist. A.1 `RestoreVerification` gains `server_id` (and a
+declaration id). A.4 `restore_verification` is unchanged in spirit. Canopy will
+restate the exact bestool deltas once you've signed off on §3 and the shapes
+are frozen.
+
+## 6. What canopy is building now
+
+Two PRs:
+
+1. **Control + access** — `backup-restore` role; the declared-replica model +
+   operator UI; `GET /restore-worklist`; `POST /restore-credentials`.
+2. **Health** — `backup_restore_checks` + `POST /restore-verification`;
+   per-server group-level alert routing + recovery; the overdue-freshness sweep;
+   restore-health surfacing in the operator UI.
+
+Ping canopy if §3 is contentious; otherwise canopy freezes the shapes at the
+end of PR1 and hands the restated Appendix A to bestool.
diff --git a/docs/plans/pgro-restore-verification-handoff.md b/docs/plans/pgro-restore-verification-handoff.md
new file mode 100644
index 00000000..3f5603ec
--- /dev/null
+++ b/docs/plans/pgro-restore-verification-handoff.md
@@ -0,0 +1,483 @@
+# Handoff to canopy: PGRO restore-verification integration
+
+**From:** pgro
+**To:** canopy (then canopy → bestool for the appendix)
+**Status:** waiting on canopy. pgro will not start building until the
+items in §4 land (or are contract-frozen) and the bestool additions in
+§A ship in a published crate.
+
+This document is the actionable subset of pgro's full integration spec
+(`pgro/docs/canopy-backup-integration.md`). Read that for the
+why-it-looks-like-this; read this for what to build. Anything contentious
+here gets bounced back to pgro before implementation.
+
+---
+
+## 1. Context, brief
+
+pgro restores tamanu-postgres physical backups out of kopia repos into
+working postgres replicas. Today it authenticates with hand-set,
+long-lived AWS keys + repo password in a k8s Secret — the exact
+long-lived-creds pattern the canopy backup-credentials system exists to
+eliminate. Bringing pgro under canopy gets two things:
+
+1. **Eliminates the static keys** on the pgro side. canopy mediates
+   restore creds the same way it mediates device backup creds.
+2. **Closes the lifecycle loop end-to-end.** A successful pgro restore
+   *proves the snapshot is restorable* — signal 3, the strongest
+   backup-health signal there is, stronger than signal 2's
+   "a snapshot exists in the repo". pgro reports per-replica restore
+   outcomes back to canopy; a failed/stale restorability check becomes
+   a high-severity group-level alert.
+
+This is the integration the canopy backup-credentials plan calls out in
+§"External restore consumers + restore-verification (PGRO)" — pgro is
+ready to build its side once canopy's side exists.
+
+---
+
+## 2. Architecture pgro is building toward
+
+Read this so the wire-shape and identity choices below make sense in
+context.
+
+- **One stable pgro operator Pod** sits on the tailnet and is the
+  single canopy device. It speaks to canopy directly.
+- **Each kopia restore is a k8s Job** spawned by the operator. Each Job
+  Pod runs two containers: kopia, and a pgro-published proxy sidecar.
+- **The proxy sidecar runs the bestool S3P loopback re-signing proxy**
+  (`bestool_kopia::proxy::spawn` from the published `bestool-kopia`
+  crate). kopia is pointed at `127.0.0.1` with dummy keys; the proxy
+  holds the live STS creds and re-signs each request. Same model as
+  bestool device backups and canopy's own maintenance jobs.
+- **The proxy's `CredentialProvider`** doesn't call canopy directly.
+  It calls an in-cluster HTTP endpoint on the operator
+  (`/internal/restore-creds`), and the operator forwards to canopy.
+  This is forced by the identity model (§3) — Job Pods are not canopy
+  devices and have no way to authenticate.
+- **`bestool-canopy::CanopyClient` auto-probes tailnet vs mTLS.** pgro
+  uses the tailnet path (via the Tailscale sidecar on the operator
+  Pod); mTLS is an optional fallback if a device cert is provisioned.
+
+Consequences worth flagging up front:
+
+- **The chained-STS 1-hour cap is a non-issue.** The proxy refreshes
+  creds between requests; long restores are bounded by canopy
+  reachability, not by any single issuance lifetime. pgro does not need
+  non-chained / direct-IRSA creds.
+- **kopia never sees real AWS credentials.** It carries dummy keys and
+  talks to `127.0.0.1`. The `--session-token` / `AWS_SESSION_TOKEN`
+  question is moot.
+
+---
+
+## 3. Identity model: one operator-Pod tailnet device
+
+canopy's tailnet auth identifies callers by **tailscale node identity**
+(`commons-servers/src/device_auth/tailnet.rs:52` — looks up the source
+IP via the tailnet directory, keys into `devices.tailscale_node_id`,
+auto-creates an `Untrusted` device row on first contact). Tags are only
+a coarse admission gate (`TAILSCALE_REQUIRED_TAG`).
+
+That means **one tailnet node = one canopy device record**. Per-Job
+Tailscale sidecars would create one `Untrusted` row per Job pod,
+forever — unworkable.
+
+So pgro will run **exactly one Tailscale sidecar**, on the operator
+Pod, and pgro is **one canopy device**:
+
+- First contact creates an `Untrusted` row.
+- canopy (admin) promotes it once to role `backup-restore` (working
+  name — see §4.1).
+- The operator brokers everything for Job Pods over the in-cluster
+  network, so Job Pods never need their own canopy identity.
+
+The mTLS path is symmetric — one operator-Pod-mounted device cert,
+one canopy device, same identity. Either path works; canopy's auth
+mechanism is the only thing that differs.
+
+---
+
+## 4. What canopy needs to build
+
+Five items. They depend on each other roughly in the order listed.
+
+### 4.1 New device role: `backup-restore`
+
+Add a fourth role alongside `server` / `releaser` / `admin`.
+
+- Generic, not pgro-specific. Any future restore-only consumer (an
+  external auditor's verifier, a separate test-restore harness) shares
+  the same role with its own external-restore grant.
+- No server / group binding. Like `releaser-device`, the role itself
+  doesn't imply membership in any group.
+- Add to the device-role enum, `securitySchemes` in
+  `crates/public-server/openapi.json`, route-gating macros, and the
+  cert-issuance flow (one-off operator-driven cert minting — does not
+  need the TPM-bound `canopy register` enrolment flow that bestool
+  servers use; the `releaser-device` provisioning path is the right
+  model).
+- **`purpose=backup` must be rejected at the API layer for this role.**
+  A `backup-restore`-role caller hitting `/backup-credentials` with
+  `purpose=backup` gets `403`/`409`, full stop. The role's read-only
+  contract is server-enforced, not consumer-promised, and a compromised
+  pgro can't pivot to writing/poisoning.
+
+This is the biggest single blocker. Until this lands pgro cannot
+authenticate at all.
+
+### 4.2 Group-aware credentials + target endpoints
+
+For server-bound roles, `device → server → group_id` resolves the group
+implicitly. A `backup-restore`-role device has no implicit server and
+no implicit group, so the request body has to carry `group`.
+
+Two viable shapes; canopy picks:
+
+- **(a) Add `group: Uuid` to the existing `CredentialsArgs` /
+  `BackupTarget` paths** and accept it only from `backup-restore`-role
+  callers. Smaller diff; mildly violates the principle that
+  device-authenticated requests don't put authz fields in the body.
+- **(b) Sibling endpoints**: e.g. `POST /restore-credentials` and
+  `GET /restore-target?group=...`. Clean separation; bestool-canopy
+  gets two new methods rather than overloaded ones (matches the
+  appendix bestool deltas).
+
+pgro lightly prefers (b) for clarity, but defers to canopy.
+
+Behaviour either way: canopy verifies the `(consumer, group, type)`
+external-restore grant (§4.3), then runs the same restore session
+policy + per-bucket role + repo-password lookup it does today, and
+returns `BackupCredentials` + `BackupTarget` unchanged.
+
+### 4.3 The external-restore grant
+
+The operator-authorised, audited authz primitive that says "consumer C
+may read group G's type T, read-only."
+
+- Per `(consumer_device_id, group_id, type)`. New table; canopy picks
+  the name (`backup_restore_grants` or similar).
+- Operator-authorised via the existing private-server UI or `canopy
+  ctl` CLI; audited.
+- Checked at request time for `/restore-credentials` (4.2) and
+  `/restore-verification` (4.4). Absence is a clear 403, not a
+  transient error.
+- pgro will surface that 403 as a clear `Failed` phase + Warning event
+  on the replica; the operator who set up the replica diagnoses by
+  going to canopy and inspecting / creating the grant.
+
+### 4.4 Restore-verification ingest endpoint + `backup_restore_checks`
+
+#### 4.4.1 Why NOT reuse `POST /backup-report`
+
+`/backup-report` already accepts `{ purpose: "restore", outcome,
+snapshot_id, error, run_id }` — for **devices**. The shape looks close
+to what pgro wants, but reusing it is wrong for seven concrete reasons:
+
+1. **Identity is auth-context-derived, not body-derived.** The handler
+   resolves `device_id`, `server_id`, and `group_id` from the
+   authenticated mTLS context (`crates/public-server/src/backup.rs:495`),
+   not the body. A `backup-restore`-role caller has no implicit server
+   or group; threading them through the body would break the invariant
+   that a device can't report a run as some *other* group.
+2. **Schema is device-shaped.** `backup_runs` has `device_id UUID NOT
+   NULL REFERENCES devices(id)` and `group_id NOT NULL REFERENCES
+   server_groups(id)`. The pgro device row exists but it's not
+   "running" a backup for any server; satisfying the FKs requires
+   either sentinel data or schema changes.
+3. **Two different "restore" meanings collide on `purpose`.** A device
+   with `purpose=restore` (e.g. `bestool canopy restore` for clone /
+   DR-test on the same fleet) writes to `backup_runs`. That is NOT a
+   signal-3 verification — it's a normal device-side restore and
+   should not raise a group-level "the backup isn't restorable"
+   incident. `purpose=restore` alone is not a sufficient discriminator
+   between device-restore-runs and signal-3 verifications.
+4. **Alerting paths diverge.** `/backup-report` failure feeds per-server
+   staleness (signal 1, server-scoped). Signal 3 must feed group-scoped
+   `raise_group_event(ref = "restore-verification")` bypassing
+   per-server `is_monitored`.
+5. **Side-effects don't match.** The handler clears `BackupRequest`
+   (`backup.rs:534`) so the heartbeat stops re-emitting "back up now"
+   for that server. Irrelevant for a pgro report.
+6. **Payload shape is wrong.** `ReportArgs` carries `bytes_uploaded` +
+   `s3_*_bytes` (good — pgro's proxy emits those too) but lacks
+   `replica_healthy`, postgres major version, `observed_at` — the
+   load-bearing fields that make signal 3 stronger than signal 2.
+7. **`run_id` semantics don't transfer.** For devices, `run_id` is the
+   same UUID across `/backup-credentials` (issuance audit) and
+   `/backup-report`, minted at run start, dup → 409. pgro's natural
+   identity is the snapshot being verified, not a per-run UUID; a
+   pgro-minted run UUID has no cross-table linkage to
+   `backup_credential_issuances`.
+
+By the time `/backup-report` has been extended to take `group_id`,
+relaxed (or split off) the FKs, branched the handler on actor type,
+routed failures differently, and gated the `BackupRequest::clear`
+side-effect, the handler has forked. Cleaner to expose a sibling.
+
+#### 4.4.2 New endpoint
+
+Working title `POST /restore-verification` (canopy picks the name).
+Authenticated as `backup-restore`-role; gated by the external-restore
+grant for the body's `(group, type)`.
+
+Request body (proposed):
+
+```json
+{
+  "group": "<uuid>",
+  "type": "tamanu-postgres",
+  "snapshot_id": "<kopia snapshot id>",
+  "outcome": "success" | "failure",
+  "error": "<string, only on failure>",
+  "replica_healthy": true,
+  "postgres_version": "<major, e.g. \"15\">",
+  "observed_at": "<RFC3339>",
+  "s3_sent_raw_bytes": 12345,
+  "s3_sent_payload_bytes": 12300,
+  "s3_received_raw_bytes": 98765,
+  "s3_received_payload_bytes": 98700
+}
+```
+
+- `snapshot_id` is the join key into `backup_repo_snapshots` /
+  `backup_runs`. Load-bearing for closing the loop *backed up →
+  persisted → restorable*.
+- `outcome=success` with `replica_healthy=true` means kopia restored
+  successfully AND postgres came up AND the operator's readiness gate
+  passed.
+- `outcome=failure` with an `error` string covers restore-job failure,
+  deployment-never-ready, postgres-version mismatch, etc.
+- S3 byte tallies come from the bestool proxy's `TrafficStats` (already
+  there in `bestool-kopia`). pgro emits them on success and failure,
+  same as `/backup-report`.
+
+#### 4.4.3 New table: `backup_restore_checks`
+
+Roughly:
+
+```sql
+CREATE TABLE backup_restore_checks (
+    id              BIGSERIAL PRIMARY KEY,
+    consumer_device_id UUID NOT NULL REFERENCES devices(id),
+    group_id        UUID NOT NULL REFERENCES server_groups(id),
+    type            TEXT NOT NULL,
+    snapshot_id     TEXT NOT NULL,
+    outcome         TEXT NOT NULL CHECK (outcome IN ('success','failure')),
+    error           TEXT,
+    replica_healthy BOOLEAN NOT NULL,
+    postgres_version TEXT,
+    observed_at     TIMESTAMPTZ NOT NULL,
+    s3_sent_raw_bytes      BIGINT,
+    s3_sent_payload_bytes  BIGINT,
+    s3_received_raw_bytes  BIGINT,
+    s3_received_payload_bytes BIGINT,
+    reported_at     TIMESTAMPTZ NOT NULL DEFAULT now()
+);
+CREATE INDEX ON backup_restore_checks (group_id, type, observed_at DESC);
+CREATE INDEX ON backup_restore_checks (snapshot_id);
+```
+
+Exact shape is canopy's call. pgro just needs the endpoint to accept
+the body in §4.4.2 and to reject grant/role failures with a clear 4xx.
+
+#### 4.4.4 Alert routing
+
+Plumb `outcome=failure` (and staleness — see §6 / "Open questions")
+into:
+
+```rust
+raise_group_event(
+    conn, group_id,
+    ref: "restore-verification",     // const in database::backup::refs
+    severity: Severity::Error,        // group-level; bypasses per-server is_monitored
+    description: ...,
+    message: ...,
+    active: true,
+);
+```
+
+Already concrete in PR #225, no new plumbing on the alerting side —
+just call it from the new handler. Recovery (`active: false`) on the
+next successful report for the same `(group, type)`.
+
+### 4.5 Wire-type stability
+
+For pgro's side: please freeze the wire shapes for §4.2 and §4.4.2
+before merging the bestool changes (Appendix A). Mid-flight name churn
+on `BackupCredentials` / `BackupTarget` fields would also cause
+collateral damage — pgro is going to consume `bestool_canopy`'s
+existing types verbatim, so renames there propagate.
+
+---
+
+## 5. What pgro is NOT asking for
+
+These have come up in earlier rounds and pgro has explicitly **decided
+against** them:
+
+- **Non-chained / longer-lived STS creds for pgro.** The proxy refreshes
+  out-of-band; the 1-hour chained cap is fine in practice. Don't burn
+  effort here on pgro's account. (canopy may still want it for its own
+  reasons — that's a canopy call.)
+- **Reusing `/backup-report` for signal 3.** §4.4.1 covers why.
+- **Server-side cred caching across pgro Jobs.** pgro's operator
+  already caches in-process for the broker (§Architecture); canopy
+  doesn't need to.
+- **A new auth federation (OIDC).** pgro is happy with mTLS + tailnet.
+  OIDC would be useful for *other* future first-party consumers and
+  canopy can pursue it independently, but pgro doesn't need it.
+
+---
+
+## 6. Open questions canopy owns
+
+Pick before / during implementation; flag back to pgro if any of these
+change pgro-visible shape.
+
+1. **4.2 (a) vs (b):** group in body of existing endpoints, or sibling
+   `/restore-*` endpoints. pgro mildly prefers (b).
+2. **Naming.** Role: `backup-restore` (pgro suggestion) vs whatever
+   canopy prefers. Endpoint: `/restore-verification` vs
+   `/backup-restore-check` vs… Table name: `backup_restore_checks` vs
+   `restore_verifications`. pgro doesn't care, just needs them stable
+   before bestool ships.
+3. **Staleness detection for signal 3.** A successful report is
+   straightforward. "Stale" (no recent successful verification for a
+   `(group, type)`) is a periodic check canopy needs to run — out of
+   pgro's scope, but in scope for the alerting story. Define the
+   cadence + threshold canopy-side.
+4. **`backup_restore_checks` retention.** pgro suggests indefinite
+   (audit trail, small rows); canopy decides.
+5. **Per-Pod identity for audit.** pgro is intentionally one canopy
+   device; per-Job audit lives in pgro's own k8s record (CRD status,
+   events). If canopy wants to split per-Pod, pgro can include a
+   `consumer_instance` opaque string in the body — but the cost is
+   real and the value is unclear. Default: don't.
+6. **Cert-issuance flow for the new role.** pgro will be tailscale-only
+   in normal operation; mTLS cert is the fallback. If canopy doesn't
+   want to build cert minting for the new role at all (tailscale-only,
+   period), pgro is fine with that — just confirm.
+
+---
+
+## 7. Pgro-side commitments (so canopy knows what to expect)
+
+- pgro will be one canopy device. First contact creates `Untrusted`;
+  canopy admin promotes once.
+- pgro will report `outcome=success` only when the deployment actually
+  passes the readiness gate (not on bare-kopia-success). Failure
+  reporting is best-effort and never blocks restore progression.
+- pgro will report at-most-once-per-restore, with retry across reconciles
+  until the report lands (status-tracked).
+- pgro will not write or delete from any bucket. The proxy is fed by
+  the restore session policy; even if pgro is compromised it has no
+  write capability (compounded by §4.1's role-level `purpose=backup`
+  rejection).
+
+---
+
+## Appendix A — Hand off to bestool
+
+Once §4 has landed (or shipped to a feature branch with frozen wire
+shapes), canopy passes this list to bestool. All four are additive in
+the published `bestool-canopy` crate; no breaking changes to existing
+consumers, no new crate. `bestool-kopia` needs no changes.
+
+### A.1 `bestool_canopy::backup::RestoreVerification` (new)
+
+Public wire type mirroring §4.4.2:
+
+```rust
+#[derive(Debug, Clone, Serialize)]
+pub struct RestoreVerification<'a> {
+    pub group: Uuid,
+    pub r#type: &'a str,
+    pub snapshot_id: &'a str,
+    pub outcome: RunOutcome,                // reuse existing enum
+    pub error: Option<&'a str>,
+    pub replica_healthy: bool,
+    pub postgres_version: Option<&'a str>,
+    pub observed_at: jiff::Timestamp,
+    pub s3_sent_raw_bytes: Option<i64>,
+    pub s3_sent_payload_bytes: Option<i64>,
+    pub s3_received_raw_bytes: Option<i64>,
+    pub s3_received_payload_bytes: Option<i64>,
+}
+```
+
+Fields can be renamed via serde to match whatever canopy lands; the Rust
+shape is indicative.
+
+### A.2 `CanopyClient::restore_credentials(base, type, group) -> Result<BackupCredentials>`
+
+Group-aware variant of `backup_credentials`. Posts to whichever
+endpoint canopy picks in §4.2 (a) or (b); the response type is the
+existing `BackupCredentials` unchanged.
+
+```rust
+pub async fn restore_credentials(
+    &self,
+    base_url: &Url,
+    backup_type: &str,
+    group: Uuid,
+) -> Result<BackupCredentials> { ... }
+```
+
+### A.3 `CanopyClient::restore_target(base, group) -> Result<TargetOutcome>`
+
+Same group issue for target lookup. Response is the existing
+`TargetOutcome` (Ready/Dormant) — Dormant maps to grant-absent or
+group-unconfigured.
+
+```rust
+pub async fn restore_target(
+    &self,
+    base_url: &Url,
+    group: Uuid,
+) -> Result<TargetOutcome> { ... }
+```
+
+### A.4 `CanopyClient::restore_verification(base, &RestoreVerification) -> Result<()>`
+
+Posts to canopy's new ingest endpoint. 204 on success; surface
+4xx body as error.
+
+```rust
+pub async fn restore_verification(
+    &self,
+    base_url: &Url,
+    report: &RestoreVerification<'_>,
+) -> Result<()> { ... }
+```
+
+### A.5 What does NOT change in bestool
+
+- `bestool-kopia` — no changes. `proxy::spawn`, `CredentialProvider`,
+  `Credentials`, `TrafficStats` are exactly what pgro consumes.
+- `CanopyClient::new(...)` — already accepts `device_key_pem:
+  Option<&str>`, so pgro's tailscale-only operator works as-is.
+- `Purpose::Restore` — already there.
+- `BackupCredentials` / `BackupTarget` shapes — pgro consumes these
+  verbatim; please don't reshape them mid-flight (see §4.5).
+
+### A.6 Suggested release shape
+
+One bestool-canopy minor version bump containing all four additions,
+landing after canopy's endpoints exist on at least a feature branch
+with frozen wire shapes. Tag and publish; pgro depends on `^X.Y`.
+
+---
+
+## Next round
+
+Once §4 + Appendix A have shipped, ping pgro. pgro will:
+
+1. Read the as-implemented wire + types (any drift from this doc is
+   fine, just needs to be visible).
+2. Re-evaluate the open questions in
+   `pgro/docs/canopy-backup-integration.md` and tighten the spec to
+   match what canopy actually shipped.
+3. Start building Part 1 (canopy client wiring + CRD field + sidecar
+   image) and Part 2 (the restore-verification reporter) against the
+   real surfaces.

From 95bf1362d95c4f1ba6e3d32f5baf5c626e16387f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?F=C3=A9lix=20Saparelli?= <felix@bes.au>
Date: Tue, 30 Jun 2026 14:10:10 +1200
Subject: [PATCH 2/7] docs(restore): invert unsupported-intent handling to
 capability registration
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Consumer registers supported intents on start/change; canopy persists
them, constrains the declaration UX, dispatches only matching worklist
entries, and surfaces capability-shrink gaps to operators — instead of
the consumer reactively reporting unsupported intents (which conflated a
capability mismatch with an unrestorable-backup page).

Resolves pgro's post-sign-off open question. Adds POST /restore-capabilities
to the PR1 surface; /restore-verification outcome stays success/failure.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 .../specs/public-server/restore-replicas.md   | 18 +++++++-
 .../pgro-restore-replicas-canopy-response.md  | 46 ++++++++++++++++---
 2 files changed, 56 insertions(+), 8 deletions(-)

diff --git a/.workhorse/specs/public-server/restore-replicas.md b/.workhorse/specs/public-server/restore-replicas.md
index aa6991ef..c76d5a36 100644
--- a/.workhorse/specs/public-server/restore-replicas.md
+++ b/.workhorse/specs/public-server/restore-replicas.md
@@ -54,6 +54,20 @@ There is no separate grant object — declaring a replica *is* the authorization
 A device reaches this role through one-off operator promotion, the same path a release-publishing device uses; no fleet-enrolment flow is involved.
 Either transport Canopy already accepts for devices — tailnet identity or a client certificate — satisfies the role; the role, not the transport, is the contract.
 
+## Consumer capabilities
+
+A restore consumer advertises the set of intents it can satisfy, and registers it with Canopy when it starts and whenever it changes.
+Canopy stores the set against the consumer and treats it as the authority on what that consumer can be asked to do.
+
+The registered set governs two things:
+
+- **What can be declared.** Canopy offers operators the intents the chosen consumer supports when they declare a replica.
+- **What is dispatched.** A consumer's worklist includes only entries whose intent it currently supports; Canopy never asks a consumer to satisfy an intent it has not advertised.
+
+When a consumer's set grows, the new intents become available for operators to assign, so a consumer gaining a capability is reflected without operator guesswork.
+When a consumer's set shrinks, any enabled declaration whose intent is no longer supported becomes a *gap*: Canopy drops it from the worklist immediately and surfaces it to operators as a declaration no consumer can currently satisfy, to reassign or retire.
+A gap is a configuration state shown to the operator, not a restore-health incident; the backups themselves are unaffected.
+
 ## Declared replicas
 
 An operator declares replicas against Canopy.
@@ -74,6 +88,8 @@ The well-known intents are:
 - **analytics** — a persistent replica kept running for querying, refreshed to the latest snapshot on the freshness cadence.
 - **disaster-recovery** — a periodic rehearsal of the full recovery path: a replica restored the way a real recovery would be, checked as a viable stand-in for the server, then discarded. It is the managed, automated counterpart to the operator-driven recovery in [Scope](#scope), not the recovery event itself.
 
+A declaration's intent must be one the chosen consumer supports (see [Consumer capabilities](#consumer-capabilities)); a declaration whose intent is unsupported is a gap, surfaced to the operator and never dispatched.
+
 A declaration scoped to a whole group expands to one replica per current server in that group.
 Servers joining or leaving a group change what the consumer is asked to maintain, with no per-server operator action.
 
@@ -83,7 +99,7 @@ Deleting a declaration stops the consumer being asked to maintain that replica a
 ## The worklist
 
 A restore consumer fetches its complete desired state from Canopy in one request, scoped to the calling consumer.
-Canopy expands the consumer's enabled declarations against the current servers and the latest known snapshot for each, and returns one entry per concrete replica:
+Canopy expands the consumer's enabled declarations — those whose intent the consumer currently supports — against the current servers and the latest known snapshot for each, and returns one entry per concrete replica:
 
 - the declaration's identifier, group, server, type, intent, name, and freshness;
 - the **snapshot to restore**: the snapshot identifier and its timestamp, or empty when no successful backup is yet known for that server and type;
diff --git a/docs/plans/pgro-restore-replicas-canopy-response.md b/docs/plans/pgro-restore-replicas-canopy-response.md
index 59e8ef8b..89b8e0a3 100644
--- a/docs/plans/pgro-restore-replicas-canopy-response.md
+++ b/docs/plans/pgro-restore-replicas-canopy-response.md
@@ -92,8 +92,12 @@ best-effort reporting that never blocks restore progress; no `consumer_instance`
 
 ## 4. Endpoint surface (shapes to be frozen on sign-off)
 
+- `POST /restore-capabilities {intents: [...]}` → pgro registers the intents it
+  can satisfy, on start and whenever they change. Canopy persists the set and
+  dispatches only matching worklist entries (see §7).
 - `GET  /restore-worklist` → desired replicas (expanded per server) + per-group
-  repo coordinates + the snapshot to restore for each.
+  repo coordinates + the snapshot to restore for each. Only entries whose intent
+  pgro currently supports are returned.
 - `POST /restore-credentials {group, type}` → short-lived read-only creds +
   repo password. Authorized iff an enabled declaration covers `(group, type)`.
   `purpose=backup` rejected for this role.
@@ -106,19 +110,47 @@ best-effort reporting that never blocks restore progress; no `consumer_instance`
 The original A.2/A.3 (`restore_credentials`, `restore_target`) are replaced by
 a worklist fetch plus per-group `restore_credentials`; `restore_target`
 collapses into the worklist. A.1 `RestoreVerification` gains `server_id` (and a
-declaration id). A.4 `restore_verification` is unchanged in spirit. Canopy will
-restate the exact bestool deltas once you've signed off on §3 and the shapes
-are frozen.
+declaration id). A.4 `restore_verification` is unchanged in spirit. A new
+`CanopyClient::restore_capabilities(base, &[intents])` registers the supported
+intents (§7). Canopy will restate the exact bestool deltas once the shapes are
+frozen.
 
 ## 6. What canopy is building now
 
 Two PRs:
 
 1. **Control + access** — `backup-restore` role; the declared-replica model +
-   operator UI; `GET /restore-worklist`; `POST /restore-credentials`.
+   operator UI; consumer capability registration (`POST /restore-capabilities`)
+   + capability-aware declaration UX + gap surfacing; `GET /restore-worklist`;
+   `POST /restore-credentials`.
 2. **Health** — `backup_restore_checks` + `POST /restore-verification`;
    per-server group-level alert routing + recovery; the overdue-freshness sweep;
    restore-health surfacing in the operator UI.
 
-Ping canopy if §3 is contentious; otherwise canopy freezes the shapes at the
-end of PR1 and hands the restated Appendix A to bestool.
+Canopy freezes the shapes at the end of PR1 and hands the restated Appendix A to
+bestool.
+
+## 7. Resolution of pgro's open question — unsupported intents
+
+pgro's sign-off asked how canopy should handle an intent pgro doesn't
+implement, defaulting to an implicit `outcome=failure, error="unsupported"`
+report. Canopy is taking the structured route instead, because the implicit one
+conflates a *capability mismatch* with an *unrestorable backup* — the latter
+pages a group-level incident, which is the wrong response to "pgro can't do
+this intent yet."
+
+Inverted model: **pgro registers its supported intents** (`POST
+/restore-capabilities`) on start and on change; canopy persists them and:
+
+- offers operators only supported intents when they declare a replica;
+- dispatches only matching worklist entries — pgro never receives an intent it
+  hasn't advertised, so there is no unsupported-intent report and no spurious
+  page;
+- when pgro's set **grows**, the new intents become assignable; when it
+  **shrinks**, declarations stranded on a now-unsupported intent become *gaps* —
+  dropped from the worklist immediately and surfaced to operators to reassign or
+  retire, as configuration state, not a restore-health incident.
+
+Consequence for pgro: implement `restore_capabilities` registration on start;
+the `/restore-verification` outcome stays just success/failure (no `unsupported`
+value).

From 4a88c0473aec0dca9ce7dd033e39c42c83b80bce Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?F=C3=A9lix=20Saparelli?= <felix@bes.au>
Date: Tue, 30 Jun 2026 14:23:08 +1200
Subject: [PATCH 3/7] feat(restore): backup-restore role, restore-replica +
 capability schema & models

- Add the backup-restore device role (commons-types enum + auth extractor
  macro + openapi security scheme + drift test + UI role picker/colour).
- New RestoreIntent open enum (verify/analytics/disaster-recovery/custom),
  mirroring BackupType.
- Migration restore_replicas: restore_replicas (declared replicas) +
  restore_consumer_capabilities tables.
- database::restore models + CRUD (RestoreReplica, RestoreConsumerCapability,
  capability register-as-insert-then-prune, creds authz check) and
  Server::list_live_in_group for worklist expansion.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 crates/commons-servers/src/device_auth/mod.rs |   1 +
 crates/commons-types/src/backup.rs            | 103 +++++++
 crates/commons-types/src/device.rs            |   4 +
 crates/database/src/lib.rs                    |   5 +-
 crates/database/src/restore.rs                | 263 ++++++++++++++++++
 crates/database/src/schema.rs                 |  32 +++
 crates/database/src/servers.rs                |  17 ++
 crates/public-server/src/openapi.rs           |   1 +
 crates/public-server/tests/openapi_spec.rs    |   7 +-
 .../down.sql                                  |   2 +
 .../up.sql                                    |  51 ++++
 private-web/src/components/DeviceShorty.tsx   |   1 +
 private-web/src/routes/DeviceDetail.tsx       |   7 +-
 13 files changed, 491 insertions(+), 3 deletions(-)
 create mode 100644 crates/database/src/restore.rs
 create mode 100644 migrations/2026-06-30-021427-0000_restore_replicas/down.sql
 create mode 100644 migrations/2026-06-30-021427-0000_restore_replicas/up.sql

diff --git a/crates/commons-servers/src/device_auth/mod.rs b/crates/commons-servers/src/device_auth/mod.rs
index cf6903c2..6c60664d 100644
--- a/crates/commons-servers/src/device_auth/mod.rs
+++ b/crates/commons-servers/src/device_auth/mod.rs
@@ -82,6 +82,7 @@ macro_rules! device_role_struct {
 device_role_struct!(AdminDevice, DeviceRole::Admin);
 device_role_struct!(ServerDevice, DeviceRole::Server);
 device_role_struct!(ReleaserDevice, DeviceRole::Releaser);
+device_role_struct!(BackupRestoreDevice, DeviceRole::BackupRestore);
 
 impl<S> axum::extract::FromRequestParts<S> for AuthDevice
 where
diff --git a/crates/commons-types/src/backup.rs b/crates/commons-types/src/backup.rs
index 320ae347..bab57202 100644
--- a/crates/commons-types/src/backup.rs
+++ b/crates/commons-types/src/backup.rs
@@ -280,6 +280,109 @@ where
 	}
 }
 
+/// What a managed restore replica is for. Open by design, mirroring
+/// [`BackupType`]: a restore consumer advertises the intents it can satisfy and
+/// Canopy preserves any it does not model in `Custom` rather than rejecting it.
+/// Stored as `TEXT`; serializes as a plain string (no DB `CHECK`).
+#[derive(Debug, Clone, PartialEq, Eq, Hash, AsExpression, FromSqlRow)]
+#[diesel(sql_type = Text)]
+pub enum RestoreIntent {
+	/// A transient replica restored only to prove the snapshot is restorable.
+	Verify,
+	/// A persistent replica kept running for querying.
+	Analytics,
+	/// A periodic rehearsal of the full recovery path.
+	DisasterRecovery,
+	/// Any other intent name, preserved as advertised.
+	Custom(String),
+}
+
+impl RestoreIntent {
+	const VERIFY: &'static str = "verify";
+	const ANALYTICS: &'static str = "analytics";
+	const DISASTER_RECOVERY: &'static str = "disaster-recovery";
+
+	/// The wire/DB string for this intent.
+	pub fn as_str(&self) -> &str {
+		match self {
+			Self::Verify => Self::VERIFY,
+			Self::Analytics => Self::ANALYTICS,
+			Self::DisasterRecovery => Self::DISASTER_RECOVERY,
+			Self::Custom(s) => s,
+		}
+	}
+}
+
+impl Display for RestoreIntent {
+	fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+		f.write_str(self.as_str())
+	}
+}
+
+impl From<String> for RestoreIntent {
+	fn from(s: String) -> Self {
+		match s.as_str() {
+			Self::VERIFY => Self::Verify,
+			Self::ANALYTICS => Self::Analytics,
+			Self::DISASTER_RECOVERY => Self::DisasterRecovery,
+			_ => Self::Custom(s),
+		}
+	}
+}
+
+impl From<&str> for RestoreIntent {
+	fn from(s: &str) -> Self {
+		Self::from(s.to_owned())
+	}
+}
+
+impl FromStr for RestoreIntent {
+	type Err = std::convert::Infallible;
+	fn from_str(s: &str) -> Result<Self, Self::Err> {
+		Ok(Self::from(s))
+	}
+}
+
+impl From<RestoreIntent> for String {
+	fn from(v: RestoreIntent) -> Self {
+		match v {
+			RestoreIntent::Custom(s) => s,
+			other => other.as_str().to_owned(),
+		}
+	}
+}
+
+impl Serialize for RestoreIntent {
+	fn serialize<S: serde::Serializer>(&self, s: S) -> Result<S::Ok, S::Error> {
+		s.serialize_str(self.as_str())
+	}
+}
+
+impl<'de> Deserialize<'de> for RestoreIntent {
+	fn deserialize<D: serde::Deserializer<'de>>(d: D) -> Result<Self, D::Error> {
+		Ok(Self::from(String::deserialize(d)?))
+	}
+}
+
+impl<DB> FromSql<Text, DB> for RestoreIntent
+where
+	DB: Backend,
+	String: FromSql<Text, DB>,
+{
+	fn from_sql(bytes: DB::RawValue<'_>) -> deserialize::Result<Self> {
+		Ok(Self::from(String::from_sql(bytes)?))
+	}
+}
+
+impl ToSql<Text, diesel::pg::Pg> for RestoreIntent
+where
+	String: ToSql<Text, diesel::pg::Pg>,
+{
+	fn to_sql<'b>(&'b self, out: &mut Output<'b, '_, diesel::pg::Pg>) -> serialize::Result {
+		<str as ToSql<Text, diesel::pg::Pg>>::to_sql(self.as_str(), &mut out.reborrow())
+	}
+}
+
 #[cfg(test)]
 mod tests {
 	use super::*;
diff --git a/crates/commons-types/src/device.rs b/crates/commons-types/src/device.rs
index c48f03d0..e60ddd98 100644
--- a/crates/commons-types/src/device.rs
+++ b/crates/commons-types/src/device.rs
@@ -27,6 +27,8 @@ pub enum DeviceRole {
 	Admin,
 	Releaser,
 	Server,
+	#[serde(rename = "backup-restore")]
+	BackupRestore,
 }
 
 #[derive(Debug, Clone, Copy, thiserror::Error)]
@@ -42,6 +44,7 @@ impl std::str::FromStr for DeviceRole {
 			"admin" => Ok(Self::Admin),
 			"releaser" => Ok(Self::Releaser),
 			"server" => Ok(Self::Server),
+			"backup-restore" => Ok(Self::BackupRestore),
 			_ => Err(DeviceRoleFromStringError),
 		}
 	}
@@ -62,6 +65,7 @@ impl std::fmt::Display for DeviceRole {
 			DeviceRole::Admin => "admin",
 			DeviceRole::Releaser => "releaser",
 			DeviceRole::Server => "server",
+			DeviceRole::BackupRestore => "backup-restore",
 		};
 		write!(f, "{}", s)
 	}
diff --git a/crates/database/src/lib.rs b/crates/database/src/lib.rs
index b4812f6b..033db439 100644
--- a/crates/database/src/lib.rs
+++ b/crates/database/src/lib.rs
@@ -14,6 +14,7 @@ pub mod healthcheck_severities;
 pub mod issues;
 pub mod notes;
 pub mod pg_duration;
+pub mod restore;
 pub mod schema;
 pub mod server_enrollment_challenges;
 pub mod server_enrollment_tokens;
@@ -38,9 +39,11 @@ pub use backups::{
 };
 pub use bestool_snippets::{BestoolSnippet, NewBestoolSnippet};
 pub use commons_types::backup::{
-	BackupConfigStatus, BackupPurpose, BackupRepoMode, BackupType, MaintenanceKind, RunOutcome,
+	BackupConfigStatus, BackupPurpose, BackupRepoMode, BackupType, MaintenanceKind, RestoreIntent,
+	RunOutcome,
 };
 pub use devices::{Device, DeviceConnection, DeviceKey, DeviceWithInfo};
+pub use restore::{NewRestoreReplica, RestoreConsumerCapability, RestoreReplica};
 
 pub type Db = Pool<AsyncPgConnection>;
 
diff --git a/crates/database/src/restore.rs b/crates/database/src/restore.rs
new file mode 100644
index 00000000..7675ce4a
--- /dev/null
+++ b/crates/database/src/restore.rs
@@ -0,0 +1,263 @@
+//! Managed restore replicas (RST): the control-plane state for driving an
+//! external restore consumer. Operators declare which replicas should exist
+//! ([`RestoreReplica`]); consumers register the intents they can satisfy
+//! ([`RestoreConsumerCapability`]). The worklist expansion, credential issuance,
+//! and restore-health ingest live in the public-server and `jobs` components.
+
+use commons_errors::{AppError, Result};
+use commons_types::backup::{BackupType, RestoreIntent};
+use diesel::{
+	prelude::*,
+	result::{DatabaseErrorKind, Error as DieselError},
+};
+use diesel_async::{AsyncPgConnection, RunQueryDsl};
+use jiff::Timestamp;
+use serde::Serialize;
+use uuid::Uuid;
+
+use crate::pg_duration::PgDuration;
+
+/// An operator-declared replica: a consumer should keep a replica of a
+/// `(group, [server | all servers], type)` for a given intent. The declaration
+/// is both the work item (it expands into worklist entries) and the
+/// authorization (it grants the consumer read access to that `(group, type)`).
+#[derive(Debug, Clone, Serialize, Queryable, Selectable, utoipa::ToSchema)]
+#[diesel(table_name = crate::schema::restore_replicas)]
+#[diesel(check_for_backend(diesel::pg::Pg))]
+pub struct RestoreReplica {
+	pub id: Uuid,
+	pub consumer_device_id: Uuid,
+	pub group_id: Uuid,
+	/// `None` = all current servers in the group, expanded at worklist time.
+	pub server_id: Option<Uuid>,
+	#[diesel(column_name = type_)]
+	#[schema(value_type = String)]
+	pub r#type: BackupType,
+	#[schema(value_type = String)]
+	pub intent: RestoreIntent,
+	pub name: String,
+	/// Max age of the restored snapshot before the replica is overdue, in
+	/// whole seconds; `None` = always track the latest snapshot.
+	#[schema(value_type = Option<i64>)]
+	pub freshness: Option<PgDuration>,
+	pub enabled: bool,
+	pub created_by: Option<String>,
+	#[diesel(deserialize_as = jiff_diesel::Timestamp, serialize_as = jiff_diesel::Timestamp)]
+	pub created_at: Timestamp,
+	#[diesel(deserialize_as = jiff_diesel::Timestamp, serialize_as = jiff_diesel::Timestamp)]
+	pub updated_at: Timestamp,
+}
+
+#[derive(Debug, Clone, Insertable)]
+#[diesel(table_name = crate::schema::restore_replicas)]
+pub struct NewRestoreReplica {
+	pub consumer_device_id: Uuid,
+	pub group_id: Uuid,
+	pub server_id: Option<Uuid>,
+	#[diesel(column_name = type_)]
+	pub r#type: BackupType,
+	pub intent: RestoreIntent,
+	pub name: String,
+	pub freshness: Option<PgDuration>,
+	pub created_by: Option<String>,
+}
+
+impl RestoreReplica {
+	/// Create a declaration. A duplicate `(consumer, group, type, intent,
+	/// server)` scope maps to `409`.
+	pub async fn create(db: &mut AsyncPgConnection, new: NewRestoreReplica) -> Result<Self> {
+		use crate::schema::restore_replicas::dsl;
+		match diesel::insert_into(dsl::restore_replicas)
+			.values(new)
+			.returning(Self::as_select())
+			.get_result(db)
+			.await
+		{
+			Ok(row) => Ok(row),
+			Err(DieselError::DatabaseError(DatabaseErrorKind::UniqueViolation, _)) => Err(
+				AppError::Conflict("a matching restore replica is already declared".into()),
+			),
+			Err(e) => Err(AppError::from(e)),
+		}
+	}
+
+	/// Every declaration, newest first — the operator overview.
+	pub async fn list_all(db: &mut AsyncPgConnection) -> Result<Vec<Self>> {
+		use crate::schema::restore_replicas::dsl;
+		dsl::restore_replicas
+			.select(Self::as_select())
+			.order(dsl::created_at.desc())
+			.load(db)
+			.await
+			.map_err(AppError::from)
+	}
+
+	/// Declarations scoped to a group.
+	pub async fn list_for_group(db: &mut AsyncPgConnection, group_id: Uuid) -> Result<Vec<Self>> {
+		use crate::schema::restore_replicas::dsl;
+		dsl::restore_replicas
+			.select(Self::as_select())
+			.filter(dsl::group_id.eq(group_id))
+			.order(dsl::created_at.desc())
+			.load(db)
+			.await
+			.map_err(AppError::from)
+	}
+
+	/// Enabled declarations for a consumer — the basis of its worklist (before
+	/// per-server expansion and capability filtering).
+	pub async fn list_enabled_for_consumer(
+		db: &mut AsyncPgConnection,
+		consumer_device_id: Uuid,
+	) -> Result<Vec<Self>> {
+		use crate::schema::restore_replicas::dsl;
+		dsl::restore_replicas
+			.select(Self::as_select())
+			.filter(dsl::consumer_device_id.eq(consumer_device_id))
+			.filter(dsl::enabled.eq(true))
+			.order(dsl::created_at.desc())
+			.load(db)
+			.await
+			.map_err(AppError::from)
+	}
+
+	pub async fn get(db: &mut AsyncPgConnection, id: Uuid) -> Result<Self> {
+		use crate::schema::restore_replicas::dsl;
+		dsl::restore_replicas
+			.select(Self::as_select())
+			.filter(dsl::id.eq(id))
+			.first(db)
+			.await
+			.optional()
+			.map_err(AppError::from)?
+			.ok_or(AppError::DatabaseQuery(DieselError::NotFound))
+	}
+
+	/// Edit the non-structural fields. Scope fields (consumer, group, server,
+	/// type, intent) are immutable — change them by deleting and recreating.
+	pub async fn update(
+		db: &mut AsyncPgConnection,
+		id: Uuid,
+		name: &str,
+		freshness: Option<PgDuration>,
+		enabled: bool,
+	) -> Result<Self> {
+		use crate::schema::restore_replicas::dsl;
+		diesel::update(dsl::restore_replicas.filter(dsl::id.eq(id)))
+			.set((
+				dsl::name.eq(name),
+				dsl::freshness.eq(freshness),
+				dsl::enabled.eq(enabled),
+			))
+			.returning(Self::as_select())
+			.get_result(db)
+			.await
+			.optional()
+			.map_err(AppError::from)?
+			.ok_or(AppError::DatabaseQuery(DieselError::NotFound))
+	}
+
+	pub async fn delete(db: &mut AsyncPgConnection, id: Uuid) -> Result<()> {
+		use crate::schema::restore_replicas::dsl;
+		let n = diesel::delete(dsl::restore_replicas.filter(dsl::id.eq(id)))
+			.execute(db)
+			.await?;
+		if n == 0 {
+			return Err(AppError::DatabaseQuery(DieselError::NotFound));
+		}
+		Ok(())
+	}
+
+	/// Whether an enabled declaration covers `(consumer, group, type)` — the
+	/// authorization check for issuing restore credentials. A server-scoped or
+	/// a group-wide declaration both satisfy it.
+	pub async fn authorizes(
+		db: &mut AsyncPgConnection,
+		consumer_device_id: Uuid,
+		group_id: Uuid,
+		r#type: &BackupType,
+	) -> Result<bool> {
+		use crate::schema::restore_replicas::dsl;
+		let n: i64 = dsl::restore_replicas
+			.filter(dsl::consumer_device_id.eq(consumer_device_id))
+			.filter(dsl::group_id.eq(group_id))
+			.filter(dsl::type_.eq(r#type.as_str()))
+			.filter(dsl::enabled.eq(true))
+			.count()
+			.get_result(db)
+			.await?;
+		Ok(n > 0)
+	}
+}
+
+/// One intent a consumer can satisfy. The full set is registered by the
+/// consumer on start and whenever it changes; Canopy dispatches only matching
+/// worklist entries and constrains the declaration UX to this set.
+#[derive(Debug, Clone, Serialize, Queryable, Selectable, utoipa::ToSchema)]
+#[diesel(table_name = crate::schema::restore_consumer_capabilities)]
+#[diesel(check_for_backend(diesel::pg::Pg))]
+pub struct RestoreConsumerCapability {
+	pub consumer_device_id: Uuid,
+	#[schema(value_type = String)]
+	pub intent: RestoreIntent,
+	#[diesel(deserialize_as = jiff_diesel::Timestamp, serialize_as = jiff_diesel::Timestamp)]
+	pub registered_at: Timestamp,
+}
+
+impl RestoreConsumerCapability {
+	/// Replace a consumer's capability set with `intents`. Implemented as
+	/// insert-then-prune (not a transaction) so there is never a window where
+	/// a still-valid intent is absent: new intents are inserted first, then any
+	/// no longer present are removed.
+	pub async fn register(
+		db: &mut AsyncPgConnection,
+		consumer_device_id: Uuid,
+		intents: &[RestoreIntent],
+	) -> Result<()> {
+		use crate::schema::restore_consumer_capabilities::dsl;
+
+		let strings: Vec<String> = intents.iter().map(|i| i.as_str().to_owned()).collect();
+
+		let rows: Vec<_> = intents
+			.iter()
+			.map(|i| {
+				(
+					dsl::consumer_device_id.eq(consumer_device_id),
+					dsl::intent.eq(i.as_str().to_owned()),
+				)
+			})
+			.collect();
+		if !rows.is_empty() {
+			diesel::insert_into(dsl::restore_consumer_capabilities)
+				.values(rows)
+				.on_conflict((dsl::consumer_device_id, dsl::intent))
+				.do_nothing()
+				.execute(db)
+				.await?;
+		}
+
+		diesel::delete(
+			dsl::restore_consumer_capabilities
+				.filter(dsl::consumer_device_id.eq(consumer_device_id))
+				.filter(dsl::intent.ne_all(strings)),
+		)
+		.execute(db)
+		.await?;
+		Ok(())
+	}
+
+	/// The intents a consumer currently supports.
+	pub async fn list_for_consumer(
+		db: &mut AsyncPgConnection,
+		consumer_device_id: Uuid,
+	) -> Result<Vec<RestoreIntent>> {
+		use crate::schema::restore_consumer_capabilities::dsl;
+		let rows: Vec<String> = dsl::restore_consumer_capabilities
+			.filter(dsl::consumer_device_id.eq(consumer_device_id))
+			.select(dsl::intent)
+			.order(dsl::intent.asc())
+			.load(db)
+			.await?;
+		Ok(rows.into_iter().map(RestoreIntent::from).collect())
+	}
+}
diff --git a/crates/database/src/schema.rs b/crates/database/src/schema.rs
index de81559b..3a3e7e52 100644
--- a/crates/database/src/schema.rs
+++ b/crates/database/src/schema.rs
@@ -291,6 +291,32 @@ diesel::table! {
 	}
 }
 
+diesel::table! {
+	restore_consumer_capabilities (consumer_device_id, intent) {
+		consumer_device_id -> Uuid,
+		intent -> Text,
+		registered_at -> Timestamptz,
+	}
+}
+
+diesel::table! {
+	restore_replicas (id) {
+		id -> Uuid,
+		consumer_device_id -> Uuid,
+		group_id -> Uuid,
+		server_id -> Nullable<Uuid>,
+		#[sql_name = "type"]
+		type_ -> Text,
+		intent -> Text,
+		name -> Text,
+		freshness -> Nullable<Interval>,
+		enabled -> Bool,
+		created_by -> Nullable<Text>,
+		created_at -> Timestamptz,
+		updated_at -> Timestamptz,
+	}
+}
+
 diesel::table! {
 	server_backup_capabilities (server_id, type_) {
 		server_id -> Uuid,
@@ -524,6 +550,10 @@ diesel::joinable!(issue_notes -> issues (issue_id));
 diesel::joinable!(issues -> devices (device_id));
 diesel::joinable!(issues -> server_groups (server_group_id));
 diesel::joinable!(issues -> servers (server_id));
+diesel::joinable!(restore_consumer_capabilities -> devices (consumer_device_id));
+diesel::joinable!(restore_replicas -> devices (consumer_device_id));
+diesel::joinable!(restore_replicas -> server_groups (group_id));
+diesel::joinable!(restore_replicas -> servers (server_id));
 diesel::joinable!(server_backup_capabilities -> servers (server_id));
 diesel::joinable!(server_enrollment_challenges -> servers (server_id));
 diesel::joinable!(server_enrollment_tokens -> servers (server_id));
@@ -563,6 +593,8 @@ diesel::allow_tables_to_appear_in_same_query!(
 	incidents,
 	issue_notes,
 	issues,
+	restore_consumer_capabilities,
+	restore_replicas,
 	server_backup_capabilities,
 	server_enrollment_challenges,
 	server_enrollment_tokens,
diff --git a/crates/database/src/servers.rs b/crates/database/src/servers.rs
index 97e902f9..a4bd34b0 100644
--- a/crates/database/src/servers.rs
+++ b/crates/database/src/servers.rs
@@ -455,6 +455,23 @@ impl Server {
 			.map_err(AppError::from)
 	}
 
+	/// Live (non-archived) servers of one group, sorted by name. Lets a
+	/// group-wide restore-replica declaration be expanded into per-server entries.
+	pub async fn list_live_in_group(
+		db: &mut AsyncPgConnection,
+		group_id_: Uuid,
+	) -> Result<Vec<Self>> {
+		use crate::schema::servers::dsl;
+
+		// A server counts as live when its `deleted_at` is NULL.
+		let live_in_group = dsl::servers
+			.filter(dsl::group_id.eq(group_id_))
+			.filter(dsl::deleted_at.is_null())
+			.order(dsl::name.asc())
+			.select(Self::as_select());
+		live_in_group.load(db).await.map_err(AppError::from)
+	}
+
 	/// All servers without a group, ordered by name. Used by the Ungrouped UI tab.
 	pub async fn list_ungrouped(db: &mut AsyncPgConnection) -> Result<Vec<Self>> {
 		use crate::schema::servers::dsl::*;
diff --git a/crates/public-server/src/openapi.rs b/crates/public-server/src/openapi.rs
index d033f784..11001f10 100644
--- a/crates/public-server/src/openapi.rs
+++ b/crates/public-server/src/openapi.rs
@@ -45,6 +45,7 @@ impl Modify for SecuritySchemes {
 		};
 		components.add_security_scheme("server-device", role_scheme("server"));
 		components.add_security_scheme("releaser-device", role_scheme("releaser"));
+		components.add_security_scheme("backup-restore-device", role_scheme("backup-restore"));
 		components.add_security_scheme(
 			"admin-device",
 			SecurityScheme::MutualTls {
diff --git a/crates/public-server/tests/openapi_spec.rs b/crates/public-server/tests/openapi_spec.rs
index 8f44bfd2..b2b9ab34 100644
--- a/crates/public-server/tests/openapi_spec.rs
+++ b/crates/public-server/tests/openapi_spec.rs
@@ -16,7 +16,12 @@ fn build_spec() -> serde_json::Value {
 fn spec_has_security_schemes() {
 	let spec = build_spec();
 	let schemes = &spec["components"]["securitySchemes"];
-	for s in ["server-device", "releaser-device", "admin-device"] {
+	for s in [
+		"server-device",
+		"releaser-device",
+		"admin-device",
+		"backup-restore-device",
+	] {
 		assert!(schemes[s].is_object(), "{s} scheme present");
 	}
 }
diff --git a/migrations/2026-06-30-021427-0000_restore_replicas/down.sql b/migrations/2026-06-30-021427-0000_restore_replicas/down.sql
new file mode 100644
index 00000000..adcdeca5
--- /dev/null
+++ b/migrations/2026-06-30-021427-0000_restore_replicas/down.sql
@@ -0,0 +1,2 @@
+DROP TABLE restore_consumer_capabilities;
+DROP TABLE restore_replicas;
diff --git a/migrations/2026-06-30-021427-0000_restore_replicas/up.sql b/migrations/2026-06-30-021427-0000_restore_replicas/up.sql
new file mode 100644
index 00000000..8a1919a9
--- /dev/null
+++ b/migrations/2026-06-30-021427-0000_restore_replicas/up.sql
@@ -0,0 +1,51 @@
+-- Managed restore replicas (RST): operator-declared desired replicas that a
+-- restore consumer reconciles against, plus the set of intents each consumer
+-- can satisfy.
+
+-- A declared replica: the operator's statement that a consumer should keep a
+-- replica of a (group, [server | all servers], type) for a given intent. The
+-- declaration is both the work item and the authorization to read what it
+-- needs.
+CREATE TABLE restore_replicas (
+	id                 UUID PRIMARY KEY DEFAULT gen_random_uuid(),
+	consumer_device_id UUID NOT NULL REFERENCES devices(id),
+	group_id           UUID NOT NULL REFERENCES server_groups(id),
+	-- NULL = all current servers in the group (expanded at worklist time).
+	server_id          UUID REFERENCES servers(id),
+	type               TEXT NOT NULL,
+	intent             TEXT NOT NULL,
+	name               TEXT NOT NULL,
+	-- Max age of the restored snapshot before the replica is overdue; NULL =
+	-- always track the latest snapshot.
+	freshness          INTERVAL,
+	enabled            BOOLEAN NOT NULL DEFAULT TRUE,
+	created_by         TEXT,
+	created_at         TIMESTAMPTZ NOT NULL DEFAULT now(),
+	updated_at         TIMESTAMPTZ NOT NULL DEFAULT now()
+);
+
+SELECT diesel_manage_updated_at('restore_replicas');
+
+-- One declaration per scope. A server-specific row is unique on (consumer,
+-- group, type, intent, server); a group-wide row (server_id NULL) on the first
+-- four. Two partial indexes are needed because NULLs never compare equal.
+CREATE UNIQUE INDEX restore_replicas_scope_server
+	ON restore_replicas (consumer_device_id, group_id, type, intent, server_id)
+	WHERE server_id IS NOT NULL;
+CREATE UNIQUE INDEX restore_replicas_scope_group
+	ON restore_replicas (consumer_device_id, group_id, type, intent)
+	WHERE server_id IS NULL;
+
+CREATE INDEX restore_replicas_consumer ON restore_replicas (consumer_device_id);
+CREATE INDEX restore_replicas_group ON restore_replicas (group_id);
+
+-- The set of intents a consumer can satisfy, registered by the consumer on
+-- start and whenever it changes. Canopy dispatches only matching worklist
+-- entries and constrains the declaration UX to this set; an enabled
+-- declaration whose intent is absent here is a surfaced gap.
+CREATE TABLE restore_consumer_capabilities (
+	consumer_device_id UUID NOT NULL REFERENCES devices(id),
+	intent             TEXT NOT NULL,
+	registered_at      TIMESTAMPTZ NOT NULL DEFAULT now(),
+	PRIMARY KEY (consumer_device_id, intent)
+);
diff --git a/private-web/src/components/DeviceShorty.tsx b/private-web/src/components/DeviceShorty.tsx
index a871883b..f2fc6419 100644
--- a/private-web/src/components/DeviceShorty.tsx
+++ b/private-web/src/components/DeviceShorty.tsx
@@ -10,6 +10,7 @@ const ROLE_COLORS: Record<
 	server: "primary",
 	releaser: "warning",
 	admin: "info",
+	"backup-restore": "primary",
 };
 
 export function deviceDisplayName(info: DeviceInfo): string {
diff --git a/private-web/src/routes/DeviceDetail.tsx b/private-web/src/routes/DeviceDetail.tsx
index 1646dcbc..5d038a4d 100644
--- a/private-web/src/routes/DeviceDetail.tsx
+++ b/private-web/src/routes/DeviceDetail.tsx
@@ -31,7 +31,12 @@ import {
 	type DeviceRole,
 } from "../types";
 
-const TRUSTABLE_ROLES: DeviceRole[] = ["server", "releaser", "admin"];
+const TRUSTABLE_ROLES: DeviceRole[] = [
+	"server",
+	"releaser",
+	"admin",
+	"backup-restore",
+];
 
 export default function DeviceDetail() {
 	const { id = "" } = useParams<{ id: string }>();

From 631373cee64b585254407e6891405d444d726ee8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?F=C3=A9lix=20Saparelli?= <felix@bes.au>
Date: Tue, 30 Jun 2026 14:34:35 +1200
Subject: [PATCH 4/7] feat(restore): worklist, restore-credentials, capability
 + admin endpoints

Public-server (backup-restore role): POST /restore-capabilities (register
supported intents), GET /restore-worklist (enabled declarations expanded
per live server, capability-filtered, with the latest snapshot + repo
coords), POST /restore-credentials (read-only STS + repo password, authz
via declaration).

Private-server admin API (crate::fns::restore_replicas): list/for_group/
consumers/create/update/delete, with per-declaration gap computation.

Adds Device::list_by_role; regenerates public + private openapi and
api-types.ts (DeviceRole gains backup-restore).

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 crates/database/src/devices.rs                |  13 +
 crates/private-server/src/fns.rs              |   2 +
 .../src/fns/restore_replicas.rs               | 309 ++++++++++++++
 crates/public-server/openapi.json             | 221 ++++++++++
 crates/public-server/src/backup.rs            |   2 +-
 crates/public-server/src/lib.rs               |   2 +
 crates/public-server/src/openapi.rs           |   1 +
 crates/public-server/src/restore.rs           | 366 +++++++++++++++++
 private-web/openapi.json                      | 376 +++++++++++++++++-
 private-web/src/api-types.ts                  | 311 ++++++++++++++-
 10 files changed, 1586 insertions(+), 17 deletions(-)
 create mode 100644 crates/private-server/src/fns/restore_replicas.rs
 create mode 100644 crates/public-server/src/restore.rs

diff --git a/crates/database/src/devices.rs b/crates/database/src/devices.rs
index 6273c351..197e483c 100644
--- a/crates/database/src/devices.rs
+++ b/crates/database/src/devices.rs
@@ -524,6 +524,19 @@ impl Device {
 		Self::list_trusted_with_info_paginated(db, i64::MAX, 0).await
 	}
 
+	/// Every device whose role equals `role`, most recently created first.
+	/// Backs the operator UI's list of restore consumers (`backup-restore`).
+	pub async fn list_by_role(db: &mut AsyncPgConnection, role: DeviceRole) -> Result<Vec<Self>> {
+		use crate::schema::devices::dsl;
+
+		// The role match is the only predicate applied.
+		let with_role = dsl::devices
+			.filter(dsl::role.eq(role))
+			.order(dsl::created_at.desc())
+			.select(Self::as_select());
+		with_role.load(db).await.map_err(AppError::from)
+	}
+
 	/// List trusted devices with pagination.
 	pub async fn list_trusted_with_info_paginated(
 		db: &mut AsyncPgConnection,
diff --git a/crates/private-server/src/fns.rs b/crates/private-server/src/fns.rs
index aa151194..df2e3a1b 100644
--- a/crates/private-server/src/fns.rs
+++ b/crates/private-server/src/fns.rs
@@ -9,6 +9,7 @@ pub mod devices;
 pub mod healthchecks;
 pub mod incidents;
 pub mod issues;
+pub mod restore_replicas;
 pub mod server_groups;
 pub mod servers;
 pub mod silenced_refs;
@@ -37,6 +38,7 @@ pub fn routes() -> OpenApiRouter<crate::state::AppState> {
 			.nest("/healthchecks", healthchecks::routes())
 			.nest("/incidents", incidents::routes())
 			.nest("/issues", issues::routes())
+			.nest("/restore_replicas", restore_replicas::routes())
 			.nest("/server_groups", server_groups::routes())
 			.nest("/servers", servers::routes())
 			.nest("/silenced_refs", silenced_refs::routes())
diff --git a/crates/private-server/src/fns/restore_replicas.rs b/crates/private-server/src/fns/restore_replicas.rs
new file mode 100644
index 00000000..fd7d54c7
--- /dev/null
+++ b/crates/private-server/src/fns/restore_replicas.rs
@@ -0,0 +1,309 @@
+//! Operator-facing managed-restore endpoints (private-server, admin SPA).
+//!
+//! Thin wrappers over `database::restore`. Operators declare which replicas a
+//! restore consumer should maintain, and see each consumer's registered
+//! capabilities so the declaration UX can offer only supported intents and flag
+//! declarations whose intent is currently unsupported (a *gap*).
+//!
+//! Reads are open to any tailnet user; mutations require admin.
+
+use std::collections::{HashMap, HashSet};
+
+use axum::Json;
+use axum::extract::State;
+use commons_errors::{ProblemDetailsSchema, Result};
+use commons_servers::tailscale_auth::TailscaleAdmin;
+use commons_types::device::DeviceRole;
+use commons_types::{
+	Uuid,
+	backup::{BackupType, RestoreIntent},
+};
+use database::diesel_async::AsyncPgConnection;
+use database::pg_duration::PgDuration;
+use database::{NewRestoreReplica, RestoreConsumerCapability, RestoreReplica, devices::Device};
+use jiff::{SignedDuration, Timestamp};
+use serde::{Deserialize, Serialize};
+use utoipa::ToSchema;
+use utoipa_axum::{router::OpenApiRouter, routes};
+
+use crate::state::AppState;
+
+pub fn routes() -> OpenApiRouter<AppState> {
+	OpenApiRouter::new()
+		.routes(routes!(list)) // read: every declaration
+		.routes(routes!(for_group)) // read: declarations of one group
+		.routes(routes!(consumers)) // read: restore consumers and their intents
+		.routes(routes!(create)) // handler takes the TailscaleAdmin extractor
+		.routes(routes!(update)) // handler takes the TailscaleAdmin extractor
+		.routes(routes!(delete)) // handler takes the TailscaleAdmin extractor
+}
+
+// ── Wire types ──────────────────────────────────────────────────────────────
+
+/// A declared replica for the operator UI. `gap` is true when the consumer does
+/// not currently advertise this declaration's intent, so Canopy is not
+/// dispatching it.
+#[derive(Debug, Clone, Serialize, ToSchema)]
+pub struct RestoreReplicaView {
+	pub id: Uuid,
+	pub consumer_device_id: Uuid,
+	pub consumer_name: Option<String>,
+	pub group_id: Uuid,
+	pub server_id: Option<Uuid>,
+	#[schema(value_type = String)]
+	pub r#type: BackupType,
+	#[schema(value_type = String)]
+	pub intent: RestoreIntent,
+	pub name: String,
+	pub freshness_seconds: Option<i64>,
+	pub enabled: bool,
+	pub gap: bool,
+	pub created_by: Option<String>,
+	#[schema(value_type = String)]
+	pub created_at: Timestamp,
+	#[schema(value_type = String)]
+	pub updated_at: Timestamp,
+}
+
+/// A restore consumer (a `backup-restore` device) and the intents it currently
+/// supports — drives the declaration form's consumer and intent pickers.
+#[derive(Debug, Clone, Serialize, ToSchema)]
+pub struct RestoreConsumerView {
+	pub device_id: Uuid,
+	pub name: Option<String>,
+	#[schema(value_type = Vec<String>)]
+	pub intents: Vec<RestoreIntent>,
+}
+
+#[derive(Debug, Deserialize, ToSchema)]
+pub struct GroupArgs {
+	pub server_group_id: Uuid,
+}
+
+#[derive(Debug, Deserialize, ToSchema)]
+pub struct CreateArgs {
+	pub consumer_device_id: Uuid,
+	pub group_id: Uuid,
+	/// `None` = all current servers in the group.
+	pub server_id: Option<Uuid>,
+	#[schema(value_type = String)]
+	pub r#type: BackupType,
+	#[schema(value_type = String)]
+	pub intent: RestoreIntent,
+	pub name: String,
+	/// Max snapshot age before overdue, in whole seconds; `None` = latest only.
+	pub freshness_seconds: Option<i64>,
+}
+
+#[derive(Debug, Deserialize, ToSchema)]
+pub struct UpdateArgs {
+	pub id: Uuid,
+	pub name: String,
+	pub freshness_seconds: Option<i64>,
+	pub enabled: bool,
+}
+
+#[derive(Debug, Deserialize, ToSchema)]
+pub struct IdArgs {
+	pub id: Uuid,
+}
+
+// ── Helpers ───────────────────────────────────────────────────────────────
+
+fn freshness_to_pg(seconds: Option<i64>) -> Option<PgDuration> {
+	seconds.map(SignedDuration::from_secs).map(PgDuration)
+}
+
+/// Convert stored declarations into the views served to the operator UI.
+///
+/// Attaches each consumer's display name and sets `gap` for declarations
+/// whose intent is missing from that consumer's registered capabilities.
+async fn to_views(
+	conn: &mut AsyncPgConnection,
+	replicas: Vec<RestoreReplica>,
+) -> Result<Vec<RestoreReplicaView>> {
+	// Display names, keyed by device id; only `backup-restore` devices are loaded.
+	let devices = Device::list_by_role(conn, DeviceRole::BackupRestore).await?;
+	let names: HashMap<Uuid, Option<String>> = devices
+		.into_iter()
+		.map(|device| (device.id, device.tailscale_node_name))
+		.collect();
+
+	// One capability lookup per distinct consumer referenced by `replicas`.
+	let distinct: HashSet<Uuid> = replicas.iter().map(|r| r.consumer_device_id).collect();
+	let mut caps: HashMap<Uuid, HashSet<RestoreIntent>> = HashMap::new();
+	for consumer_id in distinct {
+		let intents = RestoreConsumerCapability::list_for_consumer(conn, consumer_id).await?;
+		caps.insert(consumer_id, intents.into_iter().collect());
+	}
+
+	// Views are produced in the same order as the input declarations.
+	let mut views = Vec::with_capacity(replicas.len());
+	for r in replicas {
+		// A consumer with no capability set, or one lacking this intent, is a gap.
+		let supported = caps
+			.get(&r.consumer_device_id)
+			.is_some_and(|intents| intents.contains(&r.intent));
+		let consumer_name = names.get(&r.consumer_device_id).and_then(|n| n.clone());
+		views.push(RestoreReplicaView {
+			id: r.id,
+			consumer_device_id: r.consumer_device_id,
+			consumer_name,
+			group_id: r.group_id,
+			server_id: r.server_id,
+			r#type: r.r#type,
+			intent: r.intent,
+			name: r.name,
+			freshness_seconds: r.freshness.map(|f| f.0.as_secs()),
+			enabled: r.enabled,
+			gap: !supported,
+			created_by: r.created_by,
+			created_at: r.created_at,
+			updated_at: r.updated_at,
+		});
+	}
+
+	Ok(views)
+}
+
+// ── Handlers ──────────────────────────────────────────────────────────────
+
+#[utoipa::path(
+	post,
+	path = "/list",
+	operation_id = "restore_replicas_list",
+	tag = "restore_replicas",
+	security(("tailscale-user" = [])),
+	responses((status = 200, body = Vec<RestoreReplicaView>)),
+)]
+pub async fn list(State(state): State<AppState>) -> Result<Json<Vec<RestoreReplicaView>>> {
+	let mut db = state.db.get().await?;
+	let declared = RestoreReplica::list_all(&mut db).await?;
+	to_views(&mut db, declared).await.map(Json)
+}
+
+#[utoipa::path(
+	post,
+	path = "/for_group",
+	operation_id = "restore_replicas_for_group",
+	tag = "restore_replicas",
+	security(("tailscale-user" = [])),
+	request_body = GroupArgs,
+	responses((status = 200, body = Vec<RestoreReplicaView>)),
+)]
+pub async fn for_group(
+	State(state): State<AppState>,
+	Json(args): Json<GroupArgs>,
+) -> Result<Json<Vec<RestoreReplicaView>>> {
+	let mut db = state.db.get().await?;
+	let declared = RestoreReplica::list_for_group(&mut db, args.server_group_id).await?;
+	to_views(&mut db, declared).await.map(Json)
+}
+
+#[utoipa::path(
+	post,
+	path = "/consumers",
+	operation_id = "restore_replicas_consumers",
+	tag = "restore_replicas",
+	security(("tailscale-user" = [])),
+	responses((status = 200, body = Vec<RestoreConsumerView>)),
+)]
+pub async fn consumers(State(state): State<AppState>) -> Result<Json<Vec<RestoreConsumerView>>> {
+	let mut db = state.db.get().await?;
+	let restore_devices = Device::list_by_role(&mut db, DeviceRole::BackupRestore).await?;
+	let mut views = Vec::with_capacity(restore_devices.len());
+	for device in restore_devices {
+		let intents = RestoreConsumerCapability::list_for_consumer(&mut db, device.id).await?;
+		views.push(RestoreConsumerView {
+			device_id: device.id,
+			name: device.tailscale_node_name,
+			intents,
+		});
+	}
+	Ok(Json(views))
+}
+
+#[utoipa::path(
+	post,
+	path = "/create",
+	operation_id = "restore_replicas_create",
+	tag = "restore_replicas",
+	security(("tailscale-admin" = [])),
+	request_body = CreateArgs,
+	responses(
+		(status = 200, body = RestoreReplicaView),
+		(status = 409, description = "A matching declaration already exists.", body = ProblemDetailsSchema),
+	),
+)]
+pub async fn create(
+	State(state): State<AppState>,
+	TailscaleAdmin(admin): TailscaleAdmin,
+	Json(args): Json<CreateArgs>,
+) -> Result<Json<RestoreReplicaView>> {
+	let mut db = state.db.get().await?;
+
+	// The creating admin's login is stored as `created_by`.
+	let declaration = NewRestoreReplica {
+		consumer_device_id: args.consumer_device_id,
+		group_id: args.group_id,
+		server_id: args.server_id,
+		r#type: args.r#type,
+		intent: args.intent,
+		name: args.name,
+		freshness: freshness_to_pg(args.freshness_seconds),
+		created_by: Some(admin.login),
+	};
+	let created = RestoreReplica::create(&mut db, declaration).await?;
+
+	let mut views = to_views(&mut db, vec![created]).await?.into_iter();
+	Ok(Json(views.next().expect("one view")))
+}
+
+#[utoipa::path(
+	post,
+	path = "/update",
+	operation_id = "restore_replicas_update",
+	tag = "restore_replicas",
+	security(("tailscale-admin" = [])),
+	request_body = UpdateArgs,
+	responses(
+		(status = 200, body = RestoreReplicaView),
+		(status = 404, body = ProblemDetailsSchema),
+	),
+)]
+pub async fn update(
+	State(state): State<AppState>,
+	_admin: TailscaleAdmin,
+	Json(args): Json<UpdateArgs>,
+) -> Result<Json<RestoreReplicaView>> {
+	let mut db = state.db.get().await?;
+	let UpdateArgs {
+		id,
+		name,
+		freshness_seconds,
+		enabled,
+	} = args;
+	let freshness = freshness_to_pg(freshness_seconds);
+	let updated = RestoreReplica::update(&mut db, id, &name, freshness, enabled).await?;
+	let mut views = to_views(&mut db, vec![updated]).await?.into_iter();
+	Ok(Json(views.next().expect("one view")))
+}
+
+#[utoipa::path(
+	post,
+	path = "/delete",
+	operation_id = "restore_replicas_delete",
+	tag = "restore_replicas",
+	security(("tailscale-admin" = [])),
+	request_body = IdArgs,
+	responses((status = 200), (status = 404, body = ProblemDetailsSchema)),
+)]
+pub async fn delete(
+	State(state): State<AppState>,
+	_admin: TailscaleAdmin, // extracted only as a gate; the value itself is unused
+	Json(args): Json<IdArgs>,
+) -> Result<Json<()>> {
+	let mut conn = state.db.get().await?;
+	RestoreReplica::delete(&mut conn, args.id).await?; // errors propagate via `?`
+	Ok(Json(())) // success carries no payload beyond the unit value
+}
diff --git a/crates/public-server/openapi.json b/crates/public-server/openapi.json
index b468fcc4..e7301488 100644
--- a/crates/public-server/openapi.json
+++ b/crates/public-server/openapi.json
@@ -433,6 +433,127 @@
         ]
       }
     },
+    "/restore-capabilities": {
+      "post": {
+        "tags": [
+          "restore"
+        ],
+        "operationId": "capabilities",
+        "requestBody": {
+          "content": {
+            "application/json": {
+              "schema": {
+                "$ref": "#/components/schemas/CapabilitiesArgs"
+              }
+            }
+          },
+          "required": true
+        },
+        "responses": {
+          "204": {
+            "description": "Capability set registered."
+          }
+        },
+        "security": [
+          {
+            "backup-restore-device": []
+          }
+        ]
+      }
+    },
+    "/restore-credentials": {
+      "post": {
+        "tags": [
+          "restore"
+        ],
+        "operationId": "credentials",
+        "requestBody": {
+          "content": {
+            "application/json": {
+              "schema": {
+                "$ref": "#/components/schemas/CredentialsArgs"
+              }
+            }
+          },
+          "required": true
+        },
+        "responses": {
+          "200": {
+            "description": "",
+            "content": {
+              "application/json": {
+                "schema": {
+                  "$ref": "#/components/schemas/RestoreCredentials"
+                }
+              }
+            }
+          },
+          "403": {
+            "description": "No enabled declaration authorizes this (group, type).",
+            "content": {
+              "application/json": {
+                "schema": {
+                  "$ref": "#/components/schemas/ProblemDetailsSchema"
+                }
+              }
+            }
+          },
+          "409": {
+            "description": "Group has no ready backup config.",
+            "content": {
+              "application/json": {
+                "schema": {
+                  "$ref": "#/components/schemas/ProblemDetailsSchema"
+                }
+              }
+            }
+          },
+          "502": {
+            "description": "STS issuance or repo-password read failed or is not configured.",
+            "content": {
+              "application/json": {
+                "schema": {
+                  "$ref": "#/components/schemas/ProblemDetailsSchema"
+                }
+              }
+            }
+          }
+        },
+        "security": [
+          {
+            "backup-restore-device": []
+          }
+        ]
+      }
+    },
+    "/restore-worklist": {
+      "get": {
+        "tags": [
+          "restore"
+        ],
+        "operationId": "worklist",
+        "responses": {
+          "200": {
+            "description": "",
+            "content": {
+              "application/json": {
+                "schema": {
+                  "type": "array",
+                  "items": {
+                    "$ref": "#/components/schemas/WorklistEntry"
+                  }
+                }
+              }
+            }
+          }
+        },
+        "security": [
+          {
+            "backup-restore-device": []
+          }
+        ]
+      }
+    },
     "/servers": {
       "get": {
         "tags": [
@@ -1511,6 +1632,23 @@
           }
         }
       },
+      "RestoreCredentials": {
+        "type": "object",
+        "description": "Read-only credentials plus the repo password for one `(group, type)`. The\nAWS creds are the `credential_process` shape the consumer's proxy refreshes;\nthe password opens the kopia repo.",
+        "required": [
+          "credentials",
+          "repo_password"
+        ],
+        "properties": {
+          "credentials": {
+            "$ref": "#/components/schemas/CredentialProcessOutput"
+          },
+          "repo_password": {
+            "type": "string",
+            "description": "The kopia repo passphrase, read from the group's k8s Secret."
+          }
+        }
+      },
       "RunOutcome": {
         "type": "string",
         "description": "Outcome of a reported backup/restore run.",
@@ -1769,6 +1907,81 @@
             "$ref": "#/components/schemas/VersionStatus"
           }
         }
+      },
+      "WorklistEntry": {
+        "type": "object",
+        "description": "One concrete replica the consumer should maintain: a declaration expanded\nagainst a single server, carrying the snapshot to restore and the repo\ncoordinates to find it. Credentials and the repo password are obtained\nseparately via `/restore-credentials`.",
+        "required": [
+          "replica_id",
+          "group_id",
+          "server_id",
+          "type",
+          "intent",
+          "name",
+          "storage",
+          "bucket",
+          "prefix",
+          "region"
+        ],
+        "properties": {
+          "bucket": {
+            "type": "string"
+          },
+          "freshness_seconds": {
+            "type": [
+              "integer",
+              "null"
+            ],
+            "format": "int64",
+            "description": "Max snapshot age before the replica is overdue, in whole seconds;\n`None` = always track the latest."
+          },
+          "group_id": {
+            "type": "string",
+            "format": "uuid"
+          },
+          "intent": {
+            "type": "string"
+          },
+          "name": {
+            "type": "string"
+          },
+          "prefix": {
+            "type": "string"
+          },
+          "region": {
+            "type": "string"
+          },
+          "replica_id": {
+            "type": "string",
+            "format": "uuid",
+            "description": "The declaration this entry came from."
+          },
+          "server_id": {
+            "type": "string",
+            "format": "uuid"
+          },
+          "snapshot_at": {
+            "type": [
+              "string",
+              "null"
+            ],
+            "description": "RFC3339 timestamp of that snapshot, if known."
+          },
+          "snapshot_id": {
+            "type": [
+              "string",
+              "null"
+            ],
+            "description": "The snapshot Canopy wants restored — the latest successful backup for\nthis `(server, type)`. `None` when no successful backup is yet known."
+          },
+          "storage": {
+            "type": "string",
+            "description": "Always `\"s3\"`."
+          },
+          "type": {
+            "type": "string"
+          }
+        }
       }
     },
     "securitySchemes": {
@@ -1776,6 +1989,10 @@
         "type": "mutualTLS",
         "description": "mTLS client certificate for a device with the `admin` role."
       },
+      "backup-restore-device": {
+        "type": "mutualTLS",
+        "description": "mTLS client certificate for a device with the `backup-restore` role (or `admin`)."
+      },
       "releaser-device": {
         "type": "mutualTLS",
         "description": "mTLS client certificate for a device with the `releaser` role (or `admin`)."
@@ -1803,6 +2020,10 @@
       "name": "events",
       "description": "Device-pushed events; rolled up into issues and incidents server-side."
     },
+    {
+      "name": "restore",
+      "description": "Managed restore replicas: consumer capability registration, worklist, and read-only restore credentials."
+    },
     {
       "name": "servers",
       "description": "Server registry — listing for the public, self-registration for server devices."
diff --git a/crates/public-server/src/backup.rs b/crates/public-server/src/backup.rs
index f0f326e0..4408442d 100644
--- a/crates/public-server/src/backup.rs
+++ b/crates/public-server/src/backup.rs
@@ -47,7 +47,7 @@ pub const REPO_PASSWORD_SECRET_KEY: &str = "password";
 /// Fallback AWS region served by `GET /backup-target` when the group config's
 /// `region` is NULL. Read from `AWS_REGION` (the EKS pod always has it), with a
 /// last-resort default so the endpoint always returns a concrete region string.
-fn deployment_default_region() -> String {
+pub(crate) fn deployment_default_region() -> String {
 	std::env::var("AWS_REGION")
 		.or_else(|_| std::env::var("AWS_DEFAULT_REGION"))
 		.unwrap_or_else(|_| "us-east-1".to_string())
diff --git a/crates/public-server/src/lib.rs b/crates/public-server/src/lib.rs
index d740c7dc..7779f47a 100644
--- a/crates/public-server/src/lib.rs
+++ b/crates/public-server/src/lib.rs
@@ -12,6 +12,7 @@ pub mod openapi;
 #[cfg(feature = "ui")]
 pub mod password;
 pub mod ratelimit;
+pub mod restore;
 #[cfg(feature = "ui")]
 pub mod server_versions;
 pub mod servers;
@@ -27,6 +28,7 @@ pub fn routes() -> OpenApiRouter<AppState> {
 	let mut router = OpenApiRouter::new()
 		.merge(events::routes())
 		.merge(backup::routes())
+		.merge(restore::routes())
 		.nest("/artifacts", artifacts::routes())
 		.nest("/bestool", bestool::routes())
 		.nest("/servers", servers::routes())
diff --git a/crates/public-server/src/openapi.rs b/crates/public-server/src/openapi.rs
index 11001f10..a1233697 100644
--- a/crates/public-server/src/openapi.rs
+++ b/crates/public-server/src/openapi.rs
@@ -19,6 +19,7 @@ use utoipa::{Modify, OpenApi, openapi::security::SecurityScheme};
 		(name = "backup", description = "Device backup credential minting, target config, capability registration, and run reporting."),
 		(name = "bestool", description = "Bestool SQL snippet read API."),
 		(name = "events", description = "Device-pushed events; rolled up into issues and incidents server-side."),
+		(name = "restore", description = "Managed restore replicas: consumer capability registration, worklist, and read-only restore credentials."),
 		(name = "servers", description = "Server registry — listing for the public, self-registration for server devices."),
 		(name = "statuses", description = "Heartbeat / status submissions from server devices."),
 		(name = "versions", description = "Canopy release versions and their downloadable artifacts."),
diff --git a/crates/public-server/src/restore.rs b/crates/public-server/src/restore.rs
new file mode 100644
index 00000000..3dfe9e9b
--- /dev/null
+++ b/crates/public-server/src/restore.rs
@@ -0,0 +1,366 @@
+//! Managed-restore endpoints (RST) — the consumer-facing side of the restore
+//! control plane. All `BackupRestoreDevice`-authenticated, mounted at the root:
+//!
+//! - `POST /restore-capabilities` — the consumer registers the intents it can
+//!   satisfy. Canopy dispatches only matching worklist entries.
+//! - `GET  /restore-worklist` — the consumer's complete desired state: its
+//!   enabled declarations expanded per current server, each carrying the
+//!   snapshot Canopy wants restored and the repo coordinates to find it.
+//! - `POST /restore-credentials` — short-lived **read-only** S3 creds plus the
+//!   repo password for one `(group, type)` the consumer is authorized for.
+//!
+//! The `backup-restore` role is read-only by construction: it cannot reach the
+//! `ServerDevice`-gated `/backup-credentials`, and `/restore-credentials` only
+//! ever issues the read-only [`restore_session_policy`].
+
+use aws_sdk_sts::operation::RequestId as _;
+use axum::{Json, extract::State, http::StatusCode};
+use commons_errors::{AppError, ProblemDetailsSchema, Result};
+use commons_servers::device_auth::BackupRestoreDevice;
+use commons_types::backup::{BackupPurpose, BackupType, RestoreIntent};
+use database::{
+	Db,
+	backups::{BackupRun, NewBackupCredentialIssuance, ServerGroupBackupConfig},
+	restore::{RestoreConsumerCapability, RestoreReplica},
+	servers::Server,
+};
+use jiff::Timestamp;
+use serde::{Deserialize, Serialize};
+use std::collections::HashSet;
+use utoipa::ToSchema;
+use utoipa_axum::{router::OpenApiRouter, routes};
+use uuid::Uuid;
+
+use crate::{
+	backup::{
+		CredentialProcessOutput, REPO_PASSWORD_SECRET_KEY, deployment_default_region,
+		restore_session_policy,
+	},
+	state::{AppState, BackupSecrets},
+};
+
+pub fn routes() -> OpenApiRouter<AppState> {
+	// Registered in the order the module docs list them; every path is root-level.
+	let consumer_api = OpenApiRouter::new().routes(routes!(capabilities));
+	let consumer_api = consumer_api.routes(routes!(worklist));
+	consumer_api.routes(routes!(credentials))
+}
+
+// ---------------------------------------------------------------------------
+// POST /restore-capabilities
+// ---------------------------------------------------------------------------
+
+#[derive(Debug, Deserialize, ToSchema)]
+pub struct CapabilitiesArgs {
+	/// Intents this consumer can satisfy (e.g. `verify`, `analytics`, `disaster-recovery`);
+	/// only matching declarations are dispatched to it. Replaces the registered set wholesale.
+	#[schema(value_type = Vec<String>)]
+	pub intents: Vec<RestoreIntent>,
+}
+
+#[utoipa::path(
+	post,
+	path = "/restore-capabilities",
+	tag = "restore",
+	security(("backup-restore-device" = [])),
+	request_body = CapabilitiesArgs,
+	responses((status = 204, description = "Capability set registered.")),
+)]
+async fn capabilities(
+	State(db): State<Db>,
+	device: BackupRestoreDevice,
+	Json(args): Json<CapabilitiesArgs>,
+) -> Result<StatusCode> {
+	// Registered against the authenticated device; the body only supplies the intents.
+	let mut conn = db.get().await?;
+	RestoreConsumerCapability::register(&mut conn, device.0.0.id, &args.intents).await?;
+	Ok(StatusCode::NO_CONTENT)
+}
+
+// ---------------------------------------------------------------------------
+// GET /restore-worklist
+// ---------------------------------------------------------------------------
+
+/// One concrete replica the consumer should maintain: a declaration expanded
+/// against a single server, carrying the snapshot to restore and the repo
+/// coordinates to find it. Credentials and the repo password are obtained
+/// separately via `/restore-credentials`.
+#[derive(Debug, Serialize, ToSchema)]
+pub struct WorklistEntry {
+	/// The declaration this entry came from.
+	pub replica_id: Uuid,
+	pub group_id: Uuid,
+	pub server_id: Uuid,
+	#[schema(value_type = String)]
+	pub r#type: BackupType,
+	#[schema(value_type = String)]
+	pub intent: RestoreIntent,
+	pub name: String,
+	/// Max snapshot age before the replica is overdue, in whole seconds;
+	/// `None` = always track the latest.
+	pub freshness_seconds: Option<i64>,
+	/// The snapshot Canopy wants restored — the latest successful backup for
+	/// this `(server, type)`. `None` when none is known or the run has no id.
+	pub snapshot_id: Option<String>,
+	/// RFC3339 time that backup run was reported (`reported_at`), if one is known.
+	pub snapshot_at: Option<String>,
+	/// Always `"s3"`.
+	pub storage: String,
+	pub bucket: String,
+	pub prefix: String,
+	pub region: String,
+}
+
+#[utoipa::path(
+	get,
+	path = "/restore-worklist",
+	tag = "restore",
+	security(("backup-restore-device" = [])),
+	responses((status = 200, body = Vec<WorklistEntry>)),
+)]
+async fn worklist(
+	State(db): State<Db>,
+	device: BackupRestoreDevice,
+) -> Result<Json<Vec<WorklistEntry>>> {
+	let mut conn = db.get().await?;
+	let consumer_device_id = device.0.0.id;
+
+	// Only intents the consumer currently supports are dispatched; a declaration
+	// on an unsupported intent is a gap, surfaced to operators, never sent here.
+	let supported: HashSet<RestoreIntent> =
+		RestoreConsumerCapability::list_for_consumer(&mut conn, consumer_device_id)
+			.await?
+			.into_iter()
+			.collect();
+
+	let mut declarations = RestoreReplica::list_enabled_for_consumer(&mut conn, consumer_device_id)
+		.await?
+		.into_iter()
+		.filter(|d| supported.contains(&d.intent))
+		.collect::<Vec<_>>();
+	// Process server-specific declarations before group-wide ones so a
+	// server-scoped declaration wins the dedup over a group-wide one covering
+	// the same (server, type, intent).
+	declarations.sort_by_key(|d| d.server_id.is_none());
+
+	let mut out: Vec<WorklistEntry> = Vec::new();
+	let mut seen: HashSet<(Uuid, BackupType, RestoreIntent)> = HashSet::new();
+	// Per-group cache of latest successful runs, so a group referenced by several
+	// declarations is queried for them only once.
+	let mut snapshot_cache: std::collections::HashMap<
+		Uuid,
+		std::collections::HashMap<(Uuid, BackupType), BackupRun>,
+	> = std::collections::HashMap::new();
+
+	for d in declarations {
+		// A worklist entry needs somewhere to restore from: skip groups without
+		// a ready config (they surface elsewhere as not-yet-restorable).
+		let Some(cfg) = ServerGroupBackupConfig::get(&mut conn, d.group_id).await? else {
+			continue;
+		};
+		if cfg.status != commons_types::backup::BackupConfigStatus::Ready {
+			continue;
+		}
+
+		let servers = match d.server_id {
+			Some(sid) => {
+				let s = Server::get_by_id(&mut conn, sid).await?;
+				// Skip a declaration whose server has left the group or been
+				// archived; it lingers as a no-op until the operator retires it.
+				if s.group_id == Some(d.group_id) && s.deleted_at.is_none() {
+					vec![s]
+				} else {
+					vec![]
+				}
+			}
+			None => Server::list_live_in_group(&mut conn, d.group_id).await?,
+		};
+
+		if !snapshot_cache.contains_key(&d.group_id) {
+			let map =
+				BackupRun::latest_success_by_server_type_for_group(&mut conn, d.group_id).await?;
+			snapshot_cache.insert(d.group_id, map);
+		}
+		let snapshots = &snapshot_cache[&d.group_id];
+
+		let region = cfg.region.clone().unwrap_or_else(deployment_default_region);
+		for server in servers {
+			let key = (server.id, d.r#type.clone(), d.intent.clone());
+			if !seen.insert(key) {
+				continue;
+			}
+			let latest = snapshots.get(&(server.id, d.r#type.clone()));
+			out.push(WorklistEntry {
+				replica_id: d.id,
+				group_id: d.group_id,
+				server_id: server.id,
+				r#type: d.r#type.clone(),
+				intent: d.intent.clone(),
+				name: d.name.clone(),
+				freshness_seconds: d.freshness.map(|f| f.0.as_secs()),
+				snapshot_id: latest.and_then(|r| r.snapshot_id.clone()),
+				snapshot_at: latest.map(|r| r.reported_at.to_string()),
+				storage: "s3".into(),
+				bucket: cfg.bucket.clone(),
+				prefix: cfg.prefix.clone(),
+				region: region.clone(),
+			});
+		}
+	}
+
+	Ok(Json(out))
+}
+
+// ---------------------------------------------------------------------------
+// POST /restore-credentials
+// ---------------------------------------------------------------------------
+
+#[derive(Debug, Deserialize, ToSchema)]
+pub struct CredentialsArgs {
+	/// The group whose repo to read; an enabled declaration must cover it.
+	pub group: Uuid,
+	/// The backup type to restore; authorized together with `group`.
+	#[schema(value_type = String)]
+	pub r#type: BackupType,
+}
+
+/// Read-only credentials (an STS session requested for one hour) plus the repo
+/// password for one `(group, type)`. The AWS creds are the `credential_process`
+/// shape the consumer's proxy refreshes; the password opens the kopia repo.
+#[derive(Debug, Serialize, ToSchema)]
+pub struct RestoreCredentials {
+	pub credentials: CredentialProcessOutput,
+	/// The kopia repo passphrase, read from the group's k8s Secret.
+	pub repo_password: String,
+}
+
+#[utoipa::path(
+	post,
+	path = "/restore-credentials",
+	tag = "restore",
+	security(("backup-restore-device" = [])),
+	request_body = CredentialsArgs,
+	responses(
+		(status = 200, body = RestoreCredentials),
+		(status = 403, description = "No enabled declaration authorizes this (group, type).", body = ProblemDetailsSchema),
+		(status = 409, description = "Group has no ready backup config.", body = ProblemDetailsSchema),
+		(status = 502, description = "STS issuance or repo-password read failed or is not configured.", body = ProblemDetailsSchema),
+	),
+)]
+async fn credentials(
+	State(db): State<Db>,
+	State(sts): State<Option<aws_sdk_sts::Client>>,
+	State(kube): State<Option<BackupSecrets>>,
+	device: BackupRestoreDevice,
+	Json(args): Json<CredentialsArgs>,
+) -> Result<Json<RestoreCredentials>> {
+	let mut conn = db.get().await?;
+	let consumer_device_id = device.0.0.id;
+
+	// Authorization is the declared replica: a consumer may read exactly the
+	// (group, type) pairs its enabled declarations cover.
+	if !RestoreReplica::authorizes(&mut conn, consumer_device_id, args.group, &args.r#type).await? {
+		return Err(AppError::AuthInsufficientPermissions {
+			required: "an enabled restore-replica declaration for this group and type".into(),
+		});
+	}
+
+	let cfg = ServerGroupBackupConfig::get(&mut conn, args.group)
+		.await?
+		.ok_or_else(|| AppError::Conflict("group has no backup config".into()))?;
+	if cfg.status != commons_types::backup::BackupConfigStatus::Ready {
+		return Err(AppError::Conflict(
+			"group backup config is not ready".into(),
+		));
+	}
+
+	// Always read-only — this role cannot mint write creds.
+	let session_policy = restore_session_policy(&cfg.bucket, &cfg.prefix);
+
+	let Some(sts) = sts else {
+		tracing::error!(group = %args.group, "restore-credentials: STS client not configured");
+		return Err(AppError::Upstream(
+			"credential issuer not configured".into(),
+		));
+	};
+	// Checked before AssumeRole: without a secret store the minted creds would be discarded.
+	let Some(kube) = kube else {
+		tracing::error!(group = %args.group, "restore-credentials: kube client not configured");
+		return Err(AppError::Upstream("secret store not configured".into()));
+	};
+
+	let session_name = format!("canopy-restore-{consumer_device_id}");
+	let resp = sts
+		.assume_role()
+		.role_arn(&cfg.target_role_arn)
+		.role_session_name(session_name)
+		.policy(session_policy)
+		.duration_seconds(3600)
+		.send()
+		.await
+		.map_err(|err| {
+			let request_id = err.request_id().unwrap_or("<none>");
+			tracing::error!(
+				group = %args.group,
+				role = %cfg.target_role_arn,
+				request_id,
+				error = ?err,
+				"restore-credentials: AssumeRole failed",
+			);
+			AppError::Upstream("credential issuance failed".into())
+		})?;
+
+	let sts_request_id = resp.request_id().map(str::to_owned);
+	let creds = resp.credentials().ok_or_else(|| {
+		tracing::error!(group = %args.group, "restore-credentials: AssumeRole returned no credentials");
+		AppError::Upstream("credential issuance returned no credentials".into())
+	})?;
+	let expiry_secs = creds.expiration().secs();
+	let expires_at = Timestamp::from_second(expiry_secs).map_err(|err| {
+		tracing::error!(group = %args.group, error = ?err, "restore-credentials: bad expiration");
+		AppError::Upstream("credential issuance returned an invalid expiration".into())
+	})?;
+	let access_key_id = creds.access_key_id().to_owned();
+
+	let repo_password = kube
+		.read_password(&cfg.repo_password_ref, REPO_PASSWORD_SECRET_KEY)
+		.await
+		.map_err(|err| {
+			tracing::error!(
+				group = %args.group,
+				secret = %cfg.repo_password_ref,
+				error = ?err,
+				"restore-credentials: reading repo-password Secret failed",
+			);
+			AppError::Upstream("repo password unavailable".into())
+		})?;
+
+	// Audit BEFORE returning — never hand out creds we didn't record.
+	database::backups::BackupCredentialIssuance::record(
+		&mut conn,
+		NewBackupCredentialIssuance {
+			device_id: consumer_device_id,
+			group_id: args.group,
+			r#type: args.r#type.clone(),
+			expires_at,
+			purpose: BackupPurpose::Restore,
+			sts_assumed_role: cfg.target_role_arn.clone(),
+			sts_request_id,
+			access_key_id: Some(access_key_id.clone()),
+			bucket: cfg.bucket.clone(),
+			prefix: cfg.prefix.clone(),
+		},
+	)
+	.await?;
+
+	Ok(Json(RestoreCredentials {
+		credentials: CredentialProcessOutput {
+			version: 1,
+			access_key_id,
+			secret_access_key: creds.secret_access_key().to_owned(),
+			session_token: creds.session_token().to_owned(),
+			expiration: expires_at.to_string(),
+		},
+		repo_password,
+	}))
+}
diff --git a/private-web/openapi.json b/private-web/openapi.json
index e4adee74..904f5e37 100644
--- a/private-web/openapi.json
+++ b/private-web/openapi.json
@@ -3175,6 +3175,228 @@
         ]
       }
     },
+    "/api/restore_replicas/consumers": {
+      "post": {
+        "tags": [
+          "restore_replicas"
+        ],
+        "operationId": "restore_replicas_consumers",
+        "responses": {
+          "200": {
+            "description": "",
+            "content": {
+              "application/json": {
+                "schema": {
+                  "type": "array",
+                  "items": {
+                    "$ref": "#/components/schemas/RestoreConsumerView"
+                  }
+                }
+              }
+            }
+          }
+        },
+        "security": [
+          {
+            "tailscale-user": []
+          }
+        ]
+      }
+    },
+    "/api/restore_replicas/create": {
+      "post": {
+        "tags": [
+          "restore_replicas"
+        ],
+        "operationId": "restore_replicas_create",
+        "requestBody": {
+          "content": {
+            "application/json": {
+              "schema": {
+                "$ref": "#/components/schemas/CreateArgs"
+              }
+            }
+          },
+          "required": true
+        },
+        "responses": {
+          "200": {
+            "description": "",
+            "content": {
+              "application/json": {
+                "schema": {
+                  "$ref": "#/components/schemas/RestoreReplicaView"
+                }
+              }
+            }
+          },
+          "409": {
+            "description": "A matching declaration already exists.",
+            "content": {
+              "application/json": {
+                "schema": {
+                  "$ref": "#/components/schemas/ProblemDetailsSchema"
+                }
+              }
+            }
+          }
+        },
+        "security": [
+          {
+            "tailscale-admin": []
+          }
+        ]
+      }
+    },
+    "/api/restore_replicas/delete": {
+      "post": {
+        "tags": [
+          "restore_replicas"
+        ],
+        "operationId": "restore_replicas_delete",
+        "requestBody": {
+          "content": {
+            "application/json": {
+              "schema": {
+                "$ref": "#/components/schemas/IdArgs"
+              }
+            }
+          },
+          "required": true
+        },
+        "responses": {
+          "200": {
+            "description": ""
+          },
+          "404": {
+            "description": "",
+            "content": {
+              "application/json": {
+                "schema": {
+                  "$ref": "#/components/schemas/ProblemDetailsSchema"
+                }
+              }
+            }
+          }
+        },
+        "security": [
+          {
+            "tailscale-admin": []
+          }
+        ]
+      }
+    },
+    "/api/restore_replicas/for_group": {
+      "post": {
+        "tags": [
+          "restore_replicas"
+        ],
+        "operationId": "restore_replicas_for_group",
+        "requestBody": {
+          "content": {
+            "application/json": {
+              "schema": {
+                "$ref": "#/components/schemas/GroupArgs"
+              }
+            }
+          },
+          "required": true
+        },
+        "responses": {
+          "200": {
+            "description": "",
+            "content": {
+              "application/json": {
+                "schema": {
+                  "type": "array",
+                  "items": {
+                    "$ref": "#/components/schemas/RestoreReplicaView"
+                  }
+                }
+              }
+            }
+          }
+        },
+        "security": [
+          {
+            "tailscale-user": []
+          }
+        ]
+      }
+    },
+    "/api/restore_replicas/list": {
+      "post": {
+        "tags": [
+          "restore_replicas"
+        ],
+        "operationId": "restore_replicas_list",
+        "responses": {
+          "200": {
+            "description": "",
+            "content": {
+              "application/json": {
+                "schema": {
+                  "type": "array",
+                  "items": {
+                    "$ref": "#/components/schemas/RestoreReplicaView"
+                  }
+                }
+              }
+            }
+          }
+        },
+        "security": [
+          {
+            "tailscale-user": []
+          }
+        ]
+      }
+    },
+    "/api/restore_replicas/update": {
+      "post": {
+        "tags": [
+          "restore_replicas"
+        ],
+        "operationId": "restore_replicas_update",
+        "requestBody": {
+          "content": {
+            "application/json": {
+              "schema": {
+                "$ref": "#/components/schemas/UpdateArgs"
+              }
+            }
+          },
+          "required": true
+        },
+        "responses": {
+          "200": {
+            "description": "",
+            "content": {
+              "application/json": {
+                "schema": {
+                  "$ref": "#/components/schemas/RestoreReplicaView"
+                }
+              }
+            }
+          },
+          "404": {
+            "description": "",
+            "content": {
+              "application/json": {
+                "schema": {
+                  "$ref": "#/components/schemas/ProblemDetailsSchema"
+                }
+              }
+            }
+          }
+        },
+        "security": [
+          {
+            "tailscale-admin": []
+          }
+        ]
+      }
+    },
     "/api/server_groups/create": {
       "post": {
         "tags": [
@@ -5718,25 +5940,45 @@
       "CreateArgs": {
         "type": "object",
         "required": [
+          "consumer_device_id",
+          "group_id",
+          "type",
+          "intent",
           "name"
         ],
         "properties": {
-          "name": {
+          "consumer_device_id": {
+            "type": "string",
+            "format": "uuid"
+          },
+          "freshness_seconds": {
+            "type": [
+              "integer",
+              "null"
+            ],
+            "format": "int64",
+            "description": "Max snapshot age before overdue, in whole seconds; `None` = latest only."
+          },
+          "group_id": {
+            "type": "string",
+            "format": "uuid"
+          },
+          "intent": {
             "type": "string"
           },
-          "notes": {
+          "name": {
             "type": "string"
           },
-          "slack_open_delay": {
+          "server_id": {
             "type": [
-              "integer",
+              "string",
               "null"
             ],
-            "format": "int64",
-            "description": "Optional initial value (seconds) for the group's Slack open\ncooldown. Omit to let the database default apply."
+            "format": "uuid",
+            "description": "`None` = all current servers in the group."
           },
-          "tags": {
-            "$ref": "#/components/schemas/TagMap"
+          "type": {
+            "type": "string"
           }
         }
       },
@@ -6117,7 +6359,8 @@
           "untrusted",
           "admin",
           "releaser",
-          "server"
+          "server",
+          "backup-restore"
         ]
       },
       "EnrollmentStatus": {
@@ -6647,6 +6890,18 @@
           }
         }
       },
+      "IdArgs": {
+        "type": "object",
+        "required": [
+          "id"
+        ],
+        "properties": {
+          "id": {
+            "type": "string",
+            "format": "uuid"
+          }
+        }
+      },
       "IncidentData": {
         "type": "object",
         "required": [
@@ -8196,6 +8451,109 @@
           "flapping"
         ]
       },
+      "RestoreConsumerView": {
+        "type": "object",
+        "description": "A restore consumer (a `backup-restore` device) and the intents it currently\nsupports — drives the declaration form's consumer and intent pickers.",
+        "required": [
+          "device_id",
+          "intents"
+        ],
+        "properties": {
+          "device_id": {
+            "type": "string",
+            "format": "uuid"
+          },
+          "intents": {
+            "type": "array",
+            "items": {
+              "type": "string"
+            }
+          },
+          "name": {
+            "type": [
+              "string",
+              "null"
+            ]
+          }
+        }
+      },
+      "RestoreReplicaView": {
+        "type": "object",
+        "description": "A declared replica for the operator UI. `gap` is true when the consumer does\nnot currently advertise this declaration's intent, so Canopy is not\ndispatching it.",
+        "required": [
+          "id",
+          "consumer_device_id",
+          "group_id",
+          "type",
+          "intent",
+          "name",
+          "enabled",
+          "gap",
+          "created_at",
+          "updated_at"
+        ],
+        "properties": {
+          "consumer_device_id": {
+            "type": "string",
+            "format": "uuid"
+          },
+          "consumer_name": {
+            "type": [
+              "string",
+              "null"
+            ]
+          },
+          "created_at": {
+            "type": "string"
+          },
+          "created_by": {
+            "type": [
+              "string",
+              "null"
+            ]
+          },
+          "enabled": {
+            "type": "boolean"
+          },
+          "freshness_seconds": {
+            "type": [
+              "integer",
+              "null"
+            ],
+            "format": "int64"
+          },
+          "gap": {
+            "type": "boolean"
+          },
+          "group_id": {
+            "type": "string",
+            "format": "uuid"
+          },
+          "id": {
+            "type": "string",
+            "format": "uuid"
+          },
+          "intent": {
+            "type": "string"
+          },
+          "name": {
+            "type": "string"
+          },
+          "server_id": {
+            "type": [
+              "string",
+              "null"
+            ],
+            "format": "uuid"
+          },
+          "type": {
+            "type": "string"
+          },
+          "updated_at": {
+            "type": "string"
+          }
+        }
+      },
       "RetentionPolicy": {
         "type": "object",
         "description": "kopia `keep-*` retention policy. Org-minimum floors\n(`keep_daily ≥ 7, keep_weekly ≥ 4, keep_monthly ≥ 6`) are enforced by\n[`RetentionPolicy::validate_floor`] on create/update — unless the config\nopts out via its `allow_below_floor` flag (dangerous).",
diff --git a/private-web/src/api-types.ts b/private-web/src/api-types.ts
index 8b3807d0..0b138f06 100644
--- a/private-web/src/api-types.ts
+++ b/private-web/src/api-types.ts
@@ -1319,6 +1319,102 @@ export interface paths {
         patch?: never;
         trace?: never;
     };
+    "/api/restore_replicas/consumers": {
+        parameters: {
+            query?: never;
+            header?: never;
+            path?: never;
+            cookie?: never;
+        };
+        get?: never;
+        put?: never;
+        post: operations["restore_replicas_consumers"];
+        delete?: never;
+        options?: never;
+        head?: never;
+        patch?: never;
+        trace?: never;
+    };
+    "/api/restore_replicas/create": {
+        parameters: {
+            query?: never;
+            header?: never;
+            path?: never;
+            cookie?: never;
+        };
+        get?: never;
+        put?: never;
+        post: operations["restore_replicas_create"];
+        delete?: never;
+        options?: never;
+        head?: never;
+        patch?: never;
+        trace?: never;
+    };
+    "/api/restore_replicas/delete": {
+        parameters: {
+            query?: never;
+            header?: never;
+            path?: never;
+            cookie?: never;
+        };
+        get?: never;
+        put?: never;
+        post: operations["restore_replicas_delete"];
+        delete?: never;
+        options?: never;
+        head?: never;
+        patch?: never;
+        trace?: never;
+    };
+    "/api/restore_replicas/for_group": {
+        parameters: {
+            query?: never;
+            header?: never;
+            path?: never;
+            cookie?: never;
+        };
+        get?: never;
+        put?: never;
+        post: operations["restore_replicas_for_group"];
+        delete?: never;
+        options?: never;
+        head?: never;
+        patch?: never;
+        trace?: never;
+    };
+    "/api/restore_replicas/list": {
+        parameters: {
+            query?: never;
+            header?: never;
+            path?: never;
+            cookie?: never;
+        };
+        get?: never;
+        put?: never;
+        post: operations["restore_replicas_list"];
+        delete?: never;
+        options?: never;
+        head?: never;
+        patch?: never;
+        trace?: never;
+    };
+    "/api/restore_replicas/update": {
+        parameters: {
+            query?: never;
+            header?: never;
+            path?: never;
+            cookie?: never;
+        };
+        get?: never;
+        put?: never;
+        post: operations["restore_replicas_update"];
+        delete?: never;
+        options?: never;
+        head?: never;
+        patch?: never;
+        trace?: never;
+    };
     "/api/server_groups/create": {
         parameters: {
             query?: never;
@@ -2344,15 +2440,23 @@ export interface components {
             limit?: number | null;
         };
         CreateArgs: {
-            name: string;
-            notes?: string;
+            /** Format: uuid */
+            consumer_device_id: string;
             /**
              * Format: int64
-             * @description Optional initial value (seconds) for the group's Slack open
-             *     cooldown. Omit to let the database default apply.
+             * @description Max snapshot age before overdue, in whole seconds; `None` = latest only.
              */
-            slack_open_delay?: number | null;
-            tags?: components["schemas"]["TagMap"];
+            freshness_seconds?: number | null;
+            /** Format: uuid */
+            group_id: string;
+            intent: string;
+            name: string;
+            /**
+             * Format: uuid
+             * @description `None` = all current servers in the group.
+             */
+            server_id?: string | null;
+            type: string;
         };
         CreateArtifactArgs: {
             artifact_type: string;
@@ -2468,7 +2572,7 @@ export interface components {
             pem_data: string;
         };
         /** @enum {string} */
-        DeviceRole: "untrusted" | "admin" | "releaser" | "server";
+        DeviceRole: "untrusted" | "admin" | "releaser" | "server" | "backup-restore";
         EnrollmentStatus: {
             /**
              * Format: date-time
@@ -2708,6 +2812,10 @@ export interface components {
             /** Format: uuid */
             id: string;
         };
+        IdArgs: {
+            /** Format: uuid */
+            id: string;
+        };
         IncidentData: {
             /** Format: date-time */
             closed_at?: string | null;
@@ -3314,6 +3422,42 @@ export interface components {
          * @enum {string}
          */
         ResolvedReason: "fixed" | "wont_fix" | "expected" | "duplicate" | "flapping";
+        /**
+         * @description A restore consumer (a `backup-restore` device) and the intents it currently
+         *     supports — drives the declaration form's consumer and intent pickers.
+         */
+        RestoreConsumerView: {
+            /** Format: uuid */
+            device_id: string;
+            intents: string[];
+            name?: string | null;
+        };
+        /**
+         * @description A declared replica for the operator UI. `gap` is true when the consumer does
+         *     not currently advertise this declaration's intent, so Canopy is not
+         *     dispatching it.
+         */
+        RestoreReplicaView: {
+            /** Format: uuid */
+            consumer_device_id: string;
+            consumer_name?: string | null;
+            created_at: string;
+            created_by?: string | null;
+            enabled: boolean;
+            /** Format: int64 */
+            freshness_seconds?: number | null;
+            gap: boolean;
+            /** Format: uuid */
+            group_id: string;
+            /** Format: uuid */
+            id: string;
+            intent: string;
+            name: string;
+            /** Format: uuid */
+            server_id?: string | null;
+            type: string;
+            updated_at: string;
+        };
         /**
          * @description kopia `keep-*` retention policy. Org-minimum floors
          *     (`keep_daily ≥ 7, keep_weekly ≥ 4, keep_monthly ≥ 6`) are enforced by
@@ -6182,6 +6326,159 @@ export interface operations {
             };
         };
     };
+    restore_replicas_consumers: {
+        parameters: {
+            query?: never;
+            header?: never;
+            path?: never;
+            cookie?: never;
+        };
+        requestBody?: never;
+        responses: {
+            200: {
+                headers: {
+                    [name: string]: unknown;
+                };
+                content: {
+                    "application/json": components["schemas"]["RestoreConsumerView"][];
+                };
+            };
+        };
+    };
+    restore_replicas_create: {
+        parameters: {
+            query?: never;
+            header?: never;
+            path?: never;
+            cookie?: never;
+        };
+        requestBody: {
+            content: {
+                "application/json": components["schemas"]["CreateArgs"];
+            };
+        };
+        responses: {
+            200: {
+                headers: {
+                    [name: string]: unknown;
+                };
+                content: {
+                    "application/json": components["schemas"]["RestoreReplicaView"];
+                };
+            };
+            /** @description A matching declaration already exists. */
+            409: {
+                headers: {
+                    [name: string]: unknown;
+                };
+                content: {
+                    "application/json": components["schemas"]["ProblemDetailsSchema"];
+                };
+            };
+        };
+    };
+    restore_replicas_delete: {
+        parameters: {
+            query?: never;
+            header?: never;
+            path?: never;
+            cookie?: never;
+        };
+        requestBody: {
+            content: {
+                "application/json": components["schemas"]["IdArgs"];
+            };
+        };
+        responses: {
+            200: {
+                headers: {
+                    [name: string]: unknown;
+                };
+                content?: never;
+            };
+            404: {
+                headers: {
+                    [name: string]: unknown;
+                };
+                content: {
+                    "application/json": components["schemas"]["ProblemDetailsSchema"];
+                };
+            };
+        };
+    };
+    restore_replicas_for_group: {
+        parameters: {
+            query?: never;
+            header?: never;
+            path?: never;
+            cookie?: never;
+        };
+        requestBody: {
+            content: {
+                "application/json": components["schemas"]["GroupArgs"];
+            };
+        };
+        responses: {
+            200: {
+                headers: {
+                    [name: string]: unknown;
+                };
+                content: {
+                    "application/json": components["schemas"]["RestoreReplicaView"][];
+                };
+            };
+        };
+    };
+    restore_replicas_list: {
+        parameters: {
+            query?: never;
+            header?: never;
+            path?: never;
+            cookie?: never;
+        };
+        requestBody?: never;
+        responses: {
+            200: {
+                headers: {
+                    [name: string]: unknown;
+                };
+                content: {
+                    "application/json": components["schemas"]["RestoreReplicaView"][];
+                };
+            };
+        };
+    };
+    restore_replicas_update: {
+        parameters: {
+            query?: never;
+            header?: never;
+            path?: never;
+            cookie?: never;
+        };
+        requestBody: {
+            content: {
+                "application/json": components["schemas"]["UpdateArgs"];
+            };
+        };
+        responses: {
+            200: {
+                headers: {
+                    [name: string]: unknown;
+                };
+                content: {
+                    "application/json": components["schemas"]["RestoreReplicaView"];
+                };
+            };
+            404: {
+                headers: {
+                    [name: string]: unknown;
+                };
+                content: {
+                    "application/json": components["schemas"]["ProblemDetailsSchema"];
+                };
+            };
+        };
+    };
     server_groups_create: {
         parameters: {
             query?: never;

From 1e3ee66ab932a1337c8fdebcb805674c3baac3f3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?F=C3=A9lix=20Saparelli?= <felix@bes.au>
Date: Tue, 30 Jun 2026 14:40:46 +1200
Subject: [PATCH 5/7] feat(private-web): restore-replicas operator UI + e2e

New /restore-replicas page: declarations table (scope, intent with gap
chip, enable toggle, delete), consumers panel showing each backup-restore
device's registered capabilities, and a declare dialog with
consumer/group/server/type/intent pickers (intent options annotate
unsupported choices). Nav entry + route. Adds backup-restore to the device
trust picker (earlier commit).

e2e: seedRestoreReplica + seedRestoreConsumerCapability helpers, restore
tables added to resetSeededTables, and restore-replicas.spec.ts covering
empty state, gap flagging, consumers panel, delete, enable toggle, and the
declare dialog.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 private-web/e2e/restore-replicas.spec.ts   | 172 ++++++++
 private-web/e2e/seed.ts                    |  75 +++-
 private-web/src/App.tsx                    |   3 +
 private-web/src/routes/RestoreReplicas.tsx | 478 +++++++++++++++++++++
 4 files changed, 727 insertions(+), 1 deletion(-)
 create mode 100644 private-web/e2e/restore-replicas.spec.ts
 create mode 100644 private-web/src/routes/RestoreReplicas.tsx

diff --git a/private-web/e2e/restore-replicas.spec.ts b/private-web/e2e/restore-replicas.spec.ts
new file mode 100644
index 00000000..c63eaf96
--- /dev/null
+++ b/private-web/e2e/restore-replicas.spec.ts
@@ -0,0 +1,172 @@
+import { expect, test } from "./test-fixtures";
+import {
+	resetSeededTables,
+	seedDevice,
+	seedRestoreConsumerCapability,
+	seedRestoreReplica,
+	seedServer,
+	seedServerGroup,
+} from "./seed";
+
+// The e2e fixture runs the private-server in a debug build, so the Tailscale
+// auth bypass treats every caller as `admin@localhost` (an admin). These specs
+// exercise the operator-facing managed-restore UI.
+
+test.describe("restore replicas", () => {
+	test.beforeEach(async ({ sql }) => {
+		await resetSeededTables(sql);
+	});
+
+	test("empty state shows the no-declarations banner", async ({ page }) => {
+		await page.goto("/restore-replicas");
+		await expect(
+			page.getByText(/no restore replicas declared/i),
+		).toBeVisible();
+	});
+
+	test("a seeded declaration renders; an unsupported intent is flagged as a gap", async ({
+		page,
+		sql,
+	}) => {
+		const consumer = await seedDevice(sql, { role: "backup-restore" });
+		await seedRestoreConsumerCapability(sql, {
+			deviceId: consumer.id,
+			intents: ["verify"],
+		});
+		const group = await seedServerGroup(sql, { name: "rr-group" });
+
+		// Supported intent — no gap.
+		await seedRestoreReplica(sql, {
+			consumerDeviceId: consumer.id,
+			groupId: group.id,
+			intent: "verify",
+			name: "verify-all",
+		});
+		// Unsupported intent — gap.
+		await seedRestoreReplica(sql, {
+			consumerDeviceId: consumer.id,
+			groupId: group.id,
+			intent: "analytics",
+			name: "analytics-all",
+		});
+
+		await page.goto("/restore-replicas");
+
+		const verifyRow = page.getByRole("row", { name: /verify-all/ });
+		const analyticsRow = page.getByRole("row", { name: /analytics-all/ });
+		await expect(verifyRow).toBeVisible();
+		await expect(analyticsRow).toBeVisible();
+		// The unsupported declaration carries a gap chip; the supported one does not.
+		await expect(analyticsRow.getByText("gap")).toBeVisible();
+		await expect(verifyRow.getByText("gap")).toHaveCount(0);
+	});
+
+	test("consumers panel lists the device and its capabilities", async ({
+		page,
+		sql,
+	}) => {
+		const consumer = await seedDevice(sql, { role: "backup-restore" });
+		await seedRestoreConsumerCapability(sql, {
+			deviceId: consumer.id,
+			intents: ["verify", "disaster-recovery"],
+		});
+
+		await page.goto("/restore-replicas");
+		// The consumer's intents render as chips.
+		await expect(page.getByText("verify").first()).toBeVisible();
+		await expect(page.getByText("disaster-recovery").first()).toBeVisible();
+	});
+
+	test("deleting a declaration removes it", async ({ page, sql }) => {
+		const consumer = await seedDevice(sql, { role: "backup-restore" });
+		await seedRestoreConsumerCapability(sql, {
+			deviceId: consumer.id,
+			intents: ["verify"],
+		});
+		const group = await seedServerGroup(sql, { name: "del-group" });
+		await seedRestoreReplica(sql, {
+			consumerDeviceId: consumer.id,
+			groupId: group.id,
+			intent: "verify",
+			name: "doomed",
+		});
+
+		await page.goto("/restore-replicas");
+		await expect(page.getByRole("row", { name: /doomed/ })).toBeVisible();
+		await page.getByRole("button", { name: "delete doomed" }).click();
+		await expect(page.getByRole("row", { name: /doomed/ })).toHaveCount(0);
+
+		const rows = await sql.query<{ count: string }>(
+			"SELECT count(*) AS count FROM restore_replicas",
+		);
+		expect(Number(rows[0]!.count)).toBe(0);
+	});
+
+	test("toggling enabled flips the row in the database", async ({
+		page,
+		sql,
+	}) => {
+		const consumer = await seedDevice(sql, { role: "backup-restore" });
+		await seedRestoreConsumerCapability(sql, {
+			deviceId: consumer.id,
+			intents: ["verify"],
+		});
+		const group = await seedServerGroup(sql, { name: "tog-group" });
+		const replica = await seedRestoreReplica(sql, {
+			consumerDeviceId: consumer.id,
+			groupId: group.id,
+			intent: "verify",
+			name: "togglable",
+			enabled: true,
+		});
+
+		await page.goto("/restore-replicas");
+		await page
+			.getByRole("row", { name: /togglable/ })
+			.locator('input[type="checkbox"]')
+			.click();
+
+		await expect
+			.poll(async () => {
+				const rows = await sql.query<{ enabled: boolean }>(
+					"SELECT enabled FROM restore_replicas WHERE id = $1",
+					[replica.id],
+				);
+				return rows[0]?.enabled;
+			})
+			.toBe(false);
+	});
+
+	test("declaring a replica through the dialog persists it", async ({
+		page,
+		sql,
+	}) => {
+		const consumer = await seedDevice(sql, { role: "backup-restore" });
+		await seedRestoreConsumerCapability(sql, {
+			deviceId: consumer.id,
+			intents: ["verify"],
+		});
+		const group = await seedServerGroup(sql, { name: "create-group" });
+		await seedServer(sql, { groupId: group.id, name: "srv-a" });
+
+		await page.goto("/restore-replicas");
+		await page.getByRole("button", { name: /declare replica/i }).click();
+
+		await page.getByLabel("Consumer").click();
+		await page.getByRole("option").first().click();
+		await page.getByLabel("Group").click();
+		await page.getByRole("option", { name: "create-group" }).click();
+		await page.getByLabel("Name").fill("dialog-made");
+		await page
+			.getByRole("button", { name: /^declare$/i })
+			.click();
+
+		await expect(
+			page.getByRole("row", { name: /dialog-made/ }),
+		).toBeVisible();
+		const rows = await sql.query<{ name: string }>(
+			"SELECT name FROM restore_replicas WHERE name = 'dialog-made'",
+		);
+		expect(rows).toHaveLength(1);
+	});
+});
diff --git a/private-web/e2e/seed.ts b/private-web/e2e/seed.ts
index 988990da..e0b61428 100644
--- a/private-web/e2e/seed.ts
+++ b/private-web/e2e/seed.ts
@@ -47,7 +47,7 @@ function randomLabel(prefix: string): string {
  * statement with CASCADE. */
 export async function resetSeededTables(sql: Sql): Promise<void> {
 	await sql.query(
-		"TRUNCATE statuses, issues, device_keys, servers, server_groups, devices, versions, tailscale_users, server_group_backup_config, server_group_backup_schedule, server_backup_capabilities, backup_requests, backup_runs, backup_repo_stats, backup_maintenance_runs, backup_credential_issuances RESTART IDENTITY CASCADE",
+		"TRUNCATE statuses, issues, device_keys, servers, server_groups, devices, versions, tailscale_users, server_group_backup_config, server_group_backup_schedule, server_backup_capabilities, backup_requests, backup_runs, backup_repo_stats, backup_maintenance_runs, backup_credential_issuances, restore_replicas, restore_consumer_capabilities RESTART IDENTITY CASCADE",
 	);
 }
 
@@ -573,3 +573,76 @@ export async function seedBackupRequest(
 		],
 	);
 }
+
+/** Register the intents a restore consumer (a `backup-restore` device) supports. */
+export async function seedRestoreConsumerCapability(
+	sql: Sql,
+	opts: { deviceId: string; intents: string[] },
+): Promise<void> {
+	for (const intent of opts.intents) {
+		await sql.query(
+			`INSERT INTO restore_consumer_capabilities (consumer_device_id, intent)
+			 VALUES ($1, $2)`,
+			[opts.deviceId, intent],
+		);
+	}
+}
+
+export interface SeededRestoreReplica {
+	id: string;
+}
+
+/** Seed a declared restore replica. */
+export async function seedRestoreReplica(
+	sql: Sql,
+	opts: {
+		consumerDeviceId: string;
+		groupId: string;
+		/** Omit for a whole-group declaration. */
+		serverId?: string | null;
+		type?: string;
+		intent?: string;
+		name?: string;
+		/** Whole seconds; omit for "latest only". */
+		freshnessSeconds?: number | null;
+		enabled?: boolean;
+	},
+): Promise<SeededRestoreReplica> {
+	const id = randomUUID();
+	const freshness = opts.freshnessSeconds ?? null;
+	if (freshness == null) {
+		await sql.query(
+			`INSERT INTO restore_replicas
+			 (id, consumer_device_id, group_id, server_id, type, intent, name, enabled)
+			 VALUES ($1, $2, $3, $4, $5, $6, $7, $8)`,
+			[
+				id,
+				opts.consumerDeviceId,
+				opts.groupId,
+				opts.serverId ?? null,
+				opts.type ?? "tamanu-postgres",
+				opts.intent ?? "verify",
+				opts.name ?? randomLabel("replica"),
+				opts.enabled ?? true,
+			],
+		);
+	} else {
+		await sql.query(
+			`INSERT INTO restore_replicas
+			 (id, consumer_device_id, group_id, server_id, type, intent, name, freshness, enabled)
+			 VALUES ($1, $2, $3, $4, $5, $6, $7, make_interval(secs => $8), $9)`,
+			[
+				id,
+				opts.consumerDeviceId,
+				opts.groupId,
+				opts.serverId ?? null,
+				opts.type ?? "tamanu-postgres",
+				opts.intent ?? "verify",
+				opts.name ?? randomLabel("replica"),
+				freshness,
+				opts.enabled ?? true,
+			],
+		);
+	}
+	return { id };
+}
diff --git a/private-web/src/App.tsx b/private-web/src/App.tsx
index 7ccc9caa..3d1baf87 100644
--- a/private-web/src/App.tsx
+++ b/private-web/src/App.tsx
@@ -17,6 +17,7 @@ import BackupConfig from "./routes/BackupConfig";
 import BackupDefaults from "./routes/BackupDefaults";
 import RecoveryVault from "./routes/RecoveryVault";
 import BackupPanel from "./routes/BackupPanel";
+import RestoreReplicas from "./routes/RestoreReplicas";
 import Bestool from "./routes/Bestool";
 import BestoolSnippetDetail from "./routes/BestoolSnippetDetail";
 import BestoolSnippets from "./routes/BestoolSnippets";
@@ -55,6 +56,7 @@ const BASE_NAV: NavItem[] = [
 	{ label: "Servers", to: "/servers" },
 	{ label: "Versions", to: "/versions" },
 	{ label: "Devices", to: "/devices" },
+	{ label: "Restore", to: "/restore-replicas" },
 	{ label: "Bestool", to: "/bestool" },
 	{ label: "Settings", to: "/settings" },
 ];
@@ -226,6 +228,7 @@ export default function App() {
 						/>
 					</Route>
 					<Route path="/devices/:id" element={<DeviceDetail />} />
+					<Route path="/restore-replicas" element={<RestoreReplicas />} />
 					<Route path="/bestool" element={<Bestool />}>
 						<Route
 							index
diff --git a/private-web/src/routes/RestoreReplicas.tsx b/private-web/src/routes/RestoreReplicas.tsx
new file mode 100644
index 00000000..fc4e7d5b
--- /dev/null
+++ b/private-web/src/routes/RestoreReplicas.tsx
@@ -0,0 +1,478 @@
+import AddIcon from "@mui/icons-material/Add";
+import DeleteIcon from "@mui/icons-material/Delete";
+import {
+	Alert,
+	Box,
+	Button,
+	Chip,
+	Dialog,
+	DialogActions,
+	DialogContent,
+	DialogTitle,
+	FormControl,
+	IconButton,
+	InputLabel,
+	LinearProgress,
+	MenuItem,
+	Paper,
+	Select,
+	Stack,
+	Switch,
+	Table,
+	TableBody,
+	TableCell,
+	TableHead,
+	TableRow,
+	TextField,
+	Tooltip,
+	Typography,
+} from "@mui/material";
+import { useState } from "react";
+import { ApiError, callApi, useApi } from "../api";
+import { usePageTitle } from "../hooks/usePageTitle";
+
+const WELL_KNOWN_INTENTS = ["verify", "analytics", "disaster-recovery"];
+
+function formatError(err: unknown): string {
+	if (!(err instanceof ApiError)) {
+		return err instanceof Error ? err.message : String(err);
+	}
+	// API failure: prefer the `title` of its detail payload, if it carries one.
+	const problem = err.detail as { title?: string } | null;
+	return problem?.title ?? err.message;
+}
+
+function freshnessLabel(seconds: number | null | undefined): string {
+	if (seconds == null) return "latest";
+	const hours = Math.round((seconds / 3600) * 100) / 100; // 2 dp: 5000s -> "1.39h"
+	return seconds >= 3600 ? `${hours}h` : `${seconds}s`;
+}
+
+export default function RestoreReplicas() {
+	usePageTitle("Restore replicas");
+	const [tick, setTick] = useState(0);
+	const reload = () => setTick((t) => t + 1);
+
+	const replicas = useApi("restore_replicas", "list", {}, [tick]);
+	const consumers = useApi("restore_replicas", "consumers", {}, [tick]);
+
+	const [createOpen, setCreateOpen] = useState(false);
+	const [error, setError] = useState<string | null>(null);
+
+	const onDelete = async (id: string) => {
+		try {
+			await callApi("restore_replicas", "delete", { id });
+			reload();
+		} catch (err) {
+			setError(formatError(err));
+		}
+	};
+
+	const onToggle = async (
+		id: string,
+		name: string,
+		freshnessSeconds: number | null | undefined,
+		enabled: boolean,
+	) => {
+		try {
+			await callApi("restore_replicas", "update", {
+				id,
+				name,
+				freshness_seconds: freshnessSeconds ?? null,
+				enabled,
+			});
+			reload();
+		} catch (err) {
+			setError(formatError(err));
+		}
+	};
+
+	return (
+		<Stack spacing={3}>
+			<Stack
+				direction="row"
+				sx={{ alignItems: "center", justifyContent: "space-between" }}
+			>
+				<Typography variant="h5" component="h1">
+					Restore replicas
+				</Typography>
+				<Button
+					variant="contained"
+					startIcon={<AddIcon />}
+					onClick={() => setCreateOpen(true)}
+				>
+					Declare replica
+				</Button>
+			</Stack>
+
+			<Typography variant="body2" color="text.secondary">
+				Canopy decides which replicas a restore consumer should keep. Each
+				declaration expands to one replica per matching server, restored from
+				the latest snapshot Canopy knows about.
+			</Typography>
+
+			{error && (
+				<Alert severity="error" onClose={() => setError(null)}>
+					{error}
+				</Alert>
+			)}
+
+			<Box>
+				<Typography variant="h6" component="h2" gutterBottom>
+					Declarations
+				</Typography>
+				{replicas.status === "loading" || replicas.status === "idle" ? (
+					<LinearProgress />
+				) : replicas.status === "error" ? (
+					<Alert severity="error">{replicas.error.message}</Alert>
+				) : replicas.data.length === 0 ? (
+					<Alert severity="info">No restore replicas declared.</Alert>
+				) : (
+					<Paper variant="outlined">
+						<Table size="small">
+							<TableHead>
+								<TableRow>
+									<TableCell>Name</TableCell>
+									<TableCell>Consumer</TableCell>
+									<TableCell>Scope</TableCell>
+									<TableCell>Type</TableCell>
+									<TableCell>Intent</TableCell>
+									<TableCell>Freshness</TableCell>
+									<TableCell>Enabled</TableCell>
+									<TableCell align="right">Actions</TableCell>
+								</TableRow>
+							</TableHead>
+							<TableBody>
+								{replicas.data.map((r) => (
+									<TableRow key={r.id}>
+										<TableCell>{r.name}</TableCell>
+										<TableCell>
+											{r.consumer_name ?? r.consumer_device_id.slice(0, 8)}
+										</TableCell>
+										<TableCell>
+											{r.server_id ? "one server" : "whole group"}
+										</TableCell>
+										<TableCell>{r.type}</TableCell>
+										<TableCell>
+											<Stack
+												direction="row"
+												spacing={0.5}
+												sx={{ alignItems: "center" }}
+											>
+												<span>{r.intent}</span>
+												{r.gap && (
+													<Tooltip title="The consumer does not currently support this intent, so Canopy is not dispatching it.">
+														<Chip label="gap" color="warning" size="small" />
+													</Tooltip>
+												)}
+											</Stack>
+										</TableCell>
+										<TableCell>{freshnessLabel(r.freshness_seconds)}</TableCell>
+										<TableCell>
+											<Switch
+												checked={r.enabled}
+												onChange={(e) =>
+													onToggle(
+														r.id,
+														r.name,
+														r.freshness_seconds,
+														e.target.checked,
+													)
+												}
+												slotProps={{
+													input: { "aria-label": `toggle ${r.name}` },
+												}}
+											/>
+										</TableCell>
+										<TableCell align="right">
+											<IconButton
+												edge="end"
+												aria-label={`delete ${r.name}`}
+												onClick={() => onDelete(r.id)}
+											>
+												<DeleteIcon />
+											</IconButton>
+										</TableCell>
+									</TableRow>
+								))}
+							</TableBody>
+						</Table>
+					</Paper>
+				)}
+			</Box>
+
+			<Box>
+				<Typography variant="h6" component="h2" gutterBottom>
+					Consumers
+				</Typography>
+				{consumers.status === "ok" && consumers.data.length === 0 && (
+					<Alert severity="info">
+						No restore consumers. Promote a device to the{" "}
+						<code>backup-restore</code> role on its device page.
+					</Alert>
+				)}
+				{consumers.status === "ok" && consumers.data.length > 0 && (
+					<Stack spacing={1}>
+						{consumers.data.map((c) => (
+							<Paper key={c.device_id} variant="outlined" sx={{ p: 1.5 }}>
+								<Typography variant="subtitle2">
+									{c.name ?? c.device_id}
+								</Typography>
+								<Stack direction="row" spacing={0.5} sx={{ mt: 0.5 }}>
+									{c.intents.length === 0 ? (
+										<Typography variant="body2" color="text.secondary">
+											No capabilities registered yet.
+										</Typography>
+									) : (
+										c.intents.map((i) => (
+											<Chip key={i} label={i} size="small" />
+										))
+									)}
+								</Stack>
+							</Paper>
+						))}
+					</Stack>
+				)}
+			</Box>
+
+			{createOpen && (
+				<CreateReplicaDialog
+					onClose={() => setCreateOpen(false)}
+					onCreated={() => {
+						setCreateOpen(false);
+						reload();
+					}}
+					consumers={
+						consumers.status === "ok" ? consumers.data : []
+					}
+				/>
+			)}
+		</Stack>
+	);
+}
+
+interface ConsumerOption {
+	device_id: string;
+	name?: string | null;
+	intents: string[];
+}
+
+function CreateReplicaDialog({
+	onClose,
+	onCreated,
+	consumers,
+}: {
+	onClose: () => void;
+	onCreated: () => void;
+	consumers: ConsumerOption[];
+}) {
+	const groups = useApi("server_groups", "list");
+	const typeDefaults = useApi("backups", "type_defaults");
+
+	const [consumerId, setConsumerId] = useState("");
+	const [groupId, setGroupId] = useState("");
+	const [serverId, setServerId] = useState(""); // "" = whole group
+	const [type, setType] = useState("tamanu-postgres");
+	const [intent, setIntent] = useState("verify");
+	const [name, setName] = useState("");
+	const [freshnessHours, setFreshnessHours] = useState("");
+	const [pending, setPending] = useState(false);
+	const [error, setError] = useState<string | null>(null);
+
+	const selectedConsumer = consumers.find((c) => c.device_id === consumerId);
+	const intentOptions = Array.from(
+		new Set([...(selectedConsumer?.intents ?? []), ...WELL_KNOWN_INTENTS]),
+	);
+
+	const onSubmit = async () => {
+		if (!consumerId) return setError("Pick a consumer");
+		if (!groupId) return setError("Pick a group");
+		if (!name.trim()) return setError("Name cannot be empty");
+		const hours = freshnessHours.trim();
+		const seconds = hours === "" ? null : Math.round(Number(hours) * 3600);
+		// Blank sends null; NaN, ±Infinity and negatives are rejected before the API call.
+		if (seconds != null && (!Number.isFinite(seconds) || seconds < 0)) {
+			return setError("Freshness must be a non-negative number of hours");
+		}
+		setPending(true);
+		setError(null);
+		try {
+			await callApi("restore_replicas", "create", {
+				consumer_device_id: consumerId,
+				group_id: groupId,
+				server_id: serverId || null,
+				type,
+				intent,
+				name: name.trim(),
+				freshness_seconds: seconds,
+			});
+			onCreated();
+		} catch (err) {
+			setError(formatError(err));
+			setPending(false);
+		}
+	};
+
+	const typeOptions =
+		typeDefaults.status === "ok" && typeDefaults.data.length > 0
+			? typeDefaults.data.map((t) => t.type)
+			: ["tamanu-postgres"];
+
+	return (
+		<Dialog open onClose={() => !pending && onClose()} fullWidth maxWidth="sm">
+			<DialogTitle>Declare restore replica</DialogTitle>
+			<DialogContent>
+				<Stack spacing={2} sx={{ mt: 1 }}>
+					<FormControl fullWidth size="small">
+						<InputLabel id="consumer-label">Consumer</InputLabel>
+						<Select
+							labelId="consumer-label"
+							label="Consumer"
+							value={consumerId}
+							onChange={(e) => {
+								setConsumerId(e.target.value);
+								setError(null);
+							}}
+						>
+							{consumers.map((c) => (
+								<MenuItem key={c.device_id} value={c.device_id}>
+									{c.name ?? c.device_id}
+								</MenuItem>
+							))}
+						</Select>
+					</FormControl>
+
+					<FormControl fullWidth size="small">
+						<InputLabel id="group-label">Group</InputLabel>
+						<Select
+							labelId="group-label"
+							label="Group"
+							value={groupId}
+							onChange={(e) => {
+								setGroupId(e.target.value);
+								setServerId("");
+							}}
+						>
+							{groups.status === "ok" &&
+								groups.data.map((g) => (
+									<MenuItem key={g.id} value={g.id}>
+										{g.name ?? g.id}
+									</MenuItem>
+								))}
+						</Select>
+					</FormControl>
+
+					{groupId && (
+						<ServerScopeSelect
+							groupId={groupId}
+							value={serverId}
+							onChange={setServerId}
+						/>
+					)}
+
+					<FormControl fullWidth size="small">
+						<InputLabel id="type-label">Type</InputLabel>
+						<Select
+							labelId="type-label"
+							label="Type"
+							value={type}
+							onChange={(e) => setType(e.target.value)}
+						>
+							{typeOptions.map((t) => (
+								<MenuItem key={t} value={t}>
+									{t}
+								</MenuItem>
+							))}
+						</Select>
+					</FormControl>
+
+					<FormControl fullWidth size="small">
+						<InputLabel id="intent-label">Intent</InputLabel>
+						<Select
+							labelId="intent-label"
+							label="Intent"
+							value={intent}
+							onChange={(e) => setIntent(e.target.value)}
+						>
+							{intentOptions.map((i) => {
+								const supported =
+									selectedConsumer?.intents.includes(i) ?? false;
+								return (
+									<MenuItem key={i} value={i}>
+										{i}
+										{!supported && " (unsupported — will be a gap)"}
+									</MenuItem>
+								);
+							})}
+						</Select>
+					</FormControl>
+
+					<TextField
+						size="small"
+						fullWidth
+						label="Name"
+						value={name}
+						onChange={(e) => setName(e.target.value)}
+					/>
+
+					<TextField
+						size="small"
+						fullWidth
+						type="number"
+						label="Freshness (hours, optional)"
+						placeholder="latest only"
+						value={freshnessHours}
+						onChange={(e) => setFreshnessHours(e.target.value)}
+					/>
+
+					{error && <Alert severity="error">{error}</Alert>}
+				</Stack>
+			</DialogContent>
+			<DialogActions>
+				<Button onClick={onClose} disabled={pending}>
+					Cancel
+				</Button>
+				<Button variant="contained" onClick={onSubmit} disabled={pending}>
+					{pending ? "Declaring…" : "Declare"}
+				</Button>
+			</DialogActions>
+		</Dialog>
+	);
+}
+
+function ServerScopeSelect({
+	groupId,
+	value,
+	onChange,
+}: {
+	groupId: string;
+	value: string;
+	onChange: (v: string) => void;
+}) {
+	const detail = useApi(
+		"server_groups",
+		"get",
+		{ server_group_id: groupId },
+		[groupId],
+	);
+	const servers =
+		detail.status === "ok" ? detail.data.servers.filter((s) => !s.archived) : [];
+	return (
+		<FormControl fullWidth size="small">
+			<InputLabel id="server-label">Server</InputLabel>
+			<Select
+				labelId="server-label"
+				label="Server"
+				value={value}
+				onChange={(e) => onChange(e.target.value)}
+			>
+				<MenuItem value="">All servers in the group</MenuItem>
+				{servers.map((s) => (
+					<MenuItem key={s.id} value={s.id}>
+						{s.name ?? s.display_host ?? s.id}
+					</MenuItem>
+				))}
+			</Select>
+		</FormControl>
+	);
+}

From 618a4e9737a8855d7eb919f9190713557af8af24 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?F=C3=A9lix=20Saparelli?= <felix@bes.au>
Date: Tue, 30 Jun 2026 15:06:31 +1200
Subject: [PATCH 6/7] test(restore): db model + public-server endpoint coverage

database::restore: CRUD roundtrip, duplicate-scope 409 (server vs group
scope separate), update/delete, authorizes (enabled/group/type/disabled),
capability register replace semantics.

public-server::restore: capability-filtered worklist, per-server expansion
of a group-wide declaration, server-specific-over-group-wide dedup,
empty-without-capabilities, restore-credentials 403 (no declaration) / 502
(authorized but STS unconfigured), and non-consumer-role rejection.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 crates/database/tests/restore.rs      | 289 ++++++++++++++++++++++++
 crates/public-server/tests/restore.rs | 308 ++++++++++++++++++++++++++
 2 files changed, 597 insertions(+)
 create mode 100644 crates/database/tests/restore.rs
 create mode 100644 crates/public-server/tests/restore.rs

diff --git a/crates/database/tests/restore.rs b/crates/database/tests/restore.rs
new file mode 100644
index 00000000..eb473801
--- /dev/null
+++ b/crates/database/tests/restore.rs
@@ -0,0 +1,289 @@
+//! DB-layer tests for the managed-restore models (`database::restore`).
+//! Exercises the model helpers directly against a fresh migrated DB — no HTTP.
+
+use commons_errors::AppError;
+use commons_tests::db::TestDb;
+use commons_types::backup::{BackupType, RestoreIntent};
+use database::diesel_async::AsyncPgConnection;
+use database::pg_duration::PgDuration;
+use database::{NewRestoreReplica, RestoreConsumerCapability, RestoreReplica};
+use diesel::{sql_query, sql_types};
+use diesel_async::RunQueryDsl;
+use jiff::SignedDuration;
+use uuid::Uuid;
+
+#[derive(diesel::QueryableByName)]
+struct RowId {
+	#[diesel(sql_type = sql_types::Uuid)]
+	id: Uuid, // the generated key read back by the `INSERT … RETURNING id` helpers below
+}
+
+/// Inserts a server group named `name`, returning the id the database assigned.
+async fn insert_group(conn: &mut AsyncPgConnection, name: &str) -> Uuid {
+	let statement = sql_query("INSERT INTO server_groups (name) VALUES ($1) RETURNING id")
+		.bind::<sql_types::Text, _>(name);
+	// Test fixture: a failed insert is a harness problem, so panic loudly.
+	let row: RowId = statement.get_result(conn).await.expect("insert group");
+	row.id
+}
+
+/// Inserts a `central` server into `group_id` under a random `test.invalid`
+/// host, returning the generated server id.
+async fn insert_server(conn: &mut AsyncPgConnection, group_id: Uuid) -> Uuid {
+	let sql = "INSERT INTO servers (host, kind, group_id) VALUES ($1, 'central', $2) RETURNING id";
+	let statement = sql_query(sql)
+		.bind::<sql_types::Text, _>(format!("http://test.invalid/{}", Uuid::new_v4()))
+		.bind::<sql_types::Uuid, _>(group_id);
+	let row: RowId = statement.get_result(conn).await.expect("insert server");
+	row.id
+}
+
+/// Inserts a device with the `backup-restore` role and returns its id.
+async fn insert_consumer(conn: &mut AsyncPgConnection) -> Uuid {
+	let statement = sql_query("INSERT INTO devices (role) VALUES ('backup-restore') RETURNING id");
+	// Unwrap eagerly: a fixture that cannot be created invalidates the test.
+	let row: RowId = statement.get_result(conn).await.expect("insert device");
+	row.id
+}
+
+fn new_replica(
+	consumer: Uuid,
+	group: Uuid,
+	server: Option<Uuid>, // `None` = group-wide declaration; `Some` = scoped to one server
+	intent: RestoreIntent,
+	name: &str,
+) -> NewRestoreReplica {
+	NewRestoreReplica { // payload the tests hand to `RestoreReplica::create`
+		consumer_device_id: consumer,
+		group_id: group,
+		server_id: server,
+		r#type: BackupType::TamanuPostgres, // fixed: every declaration built here uses this type
+		intent,
+		name: name.into(),
+		freshness: None, // no freshness bound set
+		created_by: Some("op@example.com".into()), // asserted by `create_list_get_roundtrip`
+	}
+}
+
+/// Create → get → list roundtrip for a single group-wide declaration.
+#[tokio::test(flavor = "multi_thread")]
+async fn create_list_get_roundtrip() {
+	TestDb::run(|mut conn, _url| async move {
+		let group = insert_group(&mut conn, "g").await;
+		let consumer = insert_consumer(&mut conn).await;
+
+		// Create one group-wide declaration and check what was stored.
+		let payload = new_replica(consumer, group, None, RestoreIntent::Verify, "verify-all");
+		let replica = RestoreReplica::create(&mut conn, payload)
+			.await
+			.expect("create");
+		assert_eq!(replica.name, "verify-all");
+		assert_eq!(replica.intent, RestoreIntent::Verify);
+		assert!(replica.enabled, "new declarations default to enabled");
+		assert_eq!(replica.created_by.as_deref(), Some("op@example.com"));
+
+		// Lookup by id returns the same row.
+		let fetched = RestoreReplica::get(&mut conn, replica.id)
+			.await
+			.expect("get");
+		assert_eq!(fetched.id, replica.id);
+
+		// Each listing sees exactly that one declaration.
+		let everything = RestoreReplica::list_all(&mut conn).await.expect("list_all");
+		assert_eq!(everything.len(), 1);
+		let in_group = RestoreReplica::list_for_group(&mut conn, group)
+			.await
+			.expect("list_for_group");
+		assert_eq!(in_group.len(), 1);
+		let dispatchable = RestoreReplica::list_enabled_for_consumer(&mut conn, consumer)
+			.await
+			.expect("list_enabled");
+		assert_eq!(dispatchable.len(), 1);
+	})
+	.await;
+}
+
+/// Uniqueness is per scope: repeating a group-wide declaration conflicts, while
+/// a server-scoped declaration for the same (consumer, group, type, intent)
+/// is stored alongside it.
+#[tokio::test(flavor = "multi_thread")]
+async fn duplicate_scope_conflicts_but_server_scope_is_separate() {
+	TestDb::run(|mut conn, _url| async move {
+		// Fixtures: one consumer device, one group, one server in that group.
+		let group = insert_group(&mut conn, "g").await;
+		let server = insert_server(&mut conn, group).await;
+		let consumer = insert_consumer(&mut conn).await;
+
+		// Baseline: the first group-wide declaration is accepted.
+		let first = new_replica(consumer, group, None, RestoreIntent::Verify, "group-wide");
+		RestoreReplica::create(&mut conn, first)
+			.await
+			.expect("group-wide");
+
+		// A second group-wide row for the same (consumer, group, type, intent)
+		// must be rejected as a conflict (409).
+		let second = new_replica(consumer, group, None, RestoreIntent::Verify, "dup");
+		let dup = RestoreReplica::create(&mut conn, second).await;
+		assert!(matches!(dup, Err(AppError::Conflict(_))), "got {dup:?}");
+
+		// Naming a server makes it a different scope, so the same tuple is
+		// accepted next to the group-wide row.
+		let scoped = new_replica(
+			consumer,
+			group,
+			Some(server),
+			RestoreIntent::Verify,
+			"server-scoped",
+		);
+		RestoreReplica::create(&mut conn, scoped)
+			.await
+			.expect("server-scoped coexists with group-wide");
+	})
+	.await;
+}
+
+/// Updates name, freshness and the enabled flag, then deletes the declaration.
+#[tokio::test(flavor = "multi_thread")]
+async fn update_and_delete() {
+	TestDb::run(|mut conn, _url| async move {
+		let group = insert_group(&mut conn, "g").await;
+		let consumer = insert_consumer(&mut conn).await;
+		let payload = new_replica(consumer, group, None, RestoreIntent::Verify, "n");
+		let original = RestoreReplica::create(&mut conn, payload)
+			.await
+			.expect("create");
+
+		// Rename, set a two-hour freshness bound, and disable in one call.
+		let two_hours = PgDuration(SignedDuration::from_secs(7200));
+		let changed = RestoreReplica::update(
+			&mut conn,
+			original.id,
+			"renamed",
+			Some(two_hours),
+			false,
+		)
+		.await
+		.expect("update");
+		assert_eq!(changed.name, "renamed");
+		assert!(!changed.enabled);
+		assert_eq!(changed.freshness.map(|bound| bound.0.as_secs()), Some(7200));
+
+		// Once disabled, the declaration no longer feeds the consumer's worklist.
+		let still_enabled = RestoreReplica::list_enabled_for_consumer(&mut conn, consumer)
+			.await
+			.expect("list_enabled");
+		assert!(still_enabled.is_empty());
+
+		// Delete removes the row; a second delete of the same id is an error.
+		RestoreReplica::delete(&mut conn, original.id)
+			.await
+			.expect("delete");
+		assert!(RestoreReplica::get(&mut conn, original.id).await.is_err());
+		let second_delete = RestoreReplica::delete(&mut conn, original.id).await;
+		assert!(second_delete.is_err(), "deleting a missing declaration errors");
+	})
+	.await;
+}
+
+/// `authorizes` holds only while an enabled declaration matches the consumer,
+/// group and backup type being asked about.
+#[tokio::test(flavor = "multi_thread")]
+async fn authorizes_only_with_enabled_matching_declaration() {
+	TestDb::run(|mut conn, _url| async move {
+		// Two groups, so a declaration on one can be probed against the other.
+		let group = insert_group(&mut conn, "g").await;
+		let other_group = insert_group(&mut conn, "other").await;
+		let consumer = insert_consumer(&mut conn).await;
+		let postgres = BackupType::TamanuPostgres;
+		let files = BackupType::from("files");
+
+		// Nothing declared yet.
+		let before_any = RestoreReplica::authorizes(&mut conn, consumer, group, &postgres)
+			.await
+			.unwrap();
+		assert!(!before_any, "no declaration → not authorized");
+
+		// Declare one enabled, group-wide `verify` replica on `group`.
+		let payload = new_replica(consumer, group, None, RestoreIntent::Verify, "n");
+		let declared = RestoreReplica::create(&mut conn, payload)
+			.await
+			.expect("create");
+
+		// The exact (consumer, group, type) is now authorized.
+		let exact = RestoreReplica::authorizes(&mut conn, consumer, group, &postgres)
+			.await
+			.unwrap();
+		assert!(exact, "enabled declaration → authorized");
+
+		// Same consumer and type, but a group that has no declaration.
+		let wrong_group = RestoreReplica::authorizes(&mut conn, consumer, other_group, &postgres)
+			.await
+			.unwrap();
+		assert!(!wrong_group, "different group → not authorized");
+
+		// Same consumer and group, but a type that was never declared.
+		let wrong_type = RestoreReplica::authorizes(&mut conn, consumer, group, &files)
+			.await
+			.unwrap();
+		assert!(!wrong_type, "different type → not authorized");
+
+		// Disabling the sole declaration withdraws the authorization.
+		RestoreReplica::update(&mut conn, declared.id, "n", None, false)
+			.await
+			.expect("disable");
+
+		let after_disable = RestoreReplica::authorizes(&mut conn, consumer, group, &postgres)
+			.await
+			.unwrap();
+		assert!(!after_disable, "disabled declaration → not authorized");
+	})
+	.await;
+}
+
+/// `register` replaces the consumer's capability set wholesale; it does not
+/// merge with what was registered before.
+#[tokio::test(flavor = "multi_thread")]
+async fn capability_register_replaces_set() {
+	TestDb::run(|mut conn, _url| async move {
+		let consumer = insert_consumer(&mut conn).await;
+
+		// First registration: verify + analytics.
+		let initial = [RestoreIntent::Verify, RestoreIntent::Analytics];
+		RestoreConsumerCapability::register(&mut conn, consumer, &initial)
+			.await
+			.expect("register");
+
+		let mut listed = RestoreConsumerCapability::list_for_consumer(&mut conn, consumer)
+			.await
+			.expect("list");
+		// Sort by display form so the comparison does not depend on listing order.
+		listed.sort_by_key(|intent| intent.to_string());
+		assert_eq!(listed, vec![RestoreIntent::Analytics, RestoreIntent::Verify]);
+
+		// Second registration: verify stays, analytics goes, disaster-recovery
+		// is new.
+		let replacement = [RestoreIntent::Verify, RestoreIntent::DisasterRecovery];
+		RestoreConsumerCapability::register(&mut conn, consumer, &replacement)
+			.await
+			.expect("re-register");
+
+		let mut relisted = RestoreConsumerCapability::list_for_consumer(&mut conn, consumer)
+			.await
+			.expect("list");
+		// Same order-independent comparison as above.
+		relisted.sort_by_key(|intent| intent.to_string());
+		let expected = vec![RestoreIntent::DisasterRecovery, RestoreIntent::Verify];
+		assert_eq!(relisted, expected);
+
+		// Registering the empty set wipes every capability.
+		RestoreConsumerCapability::register(&mut conn, consumer, &[])
+			.await
+			.expect("clear");
+
+		let cleared = RestoreConsumerCapability::list_for_consumer(&mut conn, consumer)
+			.await
+			.expect("list");
+		assert!(cleared.is_empty());
+	})
+	.await;
+}
diff --git a/crates/public-server/tests/restore.rs b/crates/public-server/tests/restore.rs
new file mode 100644
index 00000000..fcf68230
--- /dev/null
+++ b/crates/public-server/tests/restore.rs
@@ -0,0 +1,308 @@
+//! HTTP tests for the managed-restore endpoints (backup-restore role). The
+//! worklist/capability paths run against the standard harness (no STS/kube
+//! needed); restore-credentials is covered for its authz (403) and the
+//! authorized-but-unconfigured (502) paths.
+
+use diesel::{sql_query, sql_types};
+use diesel_async::{AsyncPgConnection, RunQueryDsl};
+use uuid::Uuid;
+
+/// Inserts a server group under a freshly generated id and returns that id.
+async fn make_group(conn: &mut AsyncPgConnection) -> Uuid {
+	let group_id = Uuid::new_v4();
+	let sql = "INSERT INTO server_groups (id, name) VALUES ($1, 'restore-test-group')";
+	let insert = sql_query(sql).bind::<sql_types::Uuid, _>(group_id);
+	// Only failure matters here; the affected-row count is discarded.
+	insert.execute(conn).await.expect("insert group");
+	group_id
+}
+
+/// Inserts a backup config row for `group_id` with fixed bucket, role ARN,
+/// region and repo-password values, and the given `status`.
+async fn make_config(conn: &mut AsyncPgConnection, group_id: Uuid, status: &str) {
+	let sql = "INSERT INTO server_group_backup_config \
+		 (group_id, bucket, prefix, target_role_arn, maintenance_role_arn, region, repo_password_ref, status) \
+		 VALUES ($1, 'grp-bucket', '', 'arn:aws:iam::123456789012:role/grp', 'arn:aws:iam::123456789012:role/grp-maint', 'ap-southeast-2', 'grp-repo-pw', $2)";
+	// `$1` is the group, `$2` the status; everything else is a literal.
+	let insert = sql_query(sql)
+		.bind::<sql_types::Uuid, _>(group_id)
+		.bind::<sql_types::Text, _>(status);
+	insert.execute(conn).await.expect("insert config");
+}
+
+/// Inserts a `central` server into `group_id` with a host derived from its
+/// freshly generated id, and returns that id.
+async fn make_server(conn: &mut AsyncPgConnection, group_id: Uuid) -> Uuid {
+	let server_id = Uuid::new_v4();
+	let sql = "INSERT INTO servers (id, host, kind, group_id) VALUES ($1, $2, 'central', $3)";
+	let insert = sql_query(sql)
+		.bind::<sql_types::Uuid, _>(server_id)
+		.bind::<sql_types::Text, _>(format!("https://srv-{server_id}.example.com"))
+		.bind::<sql_types::Uuid, _>(group_id);
+	insert.execute(conn).await.expect("insert server");
+	server_id
+}
+
+/// Records a successful `tamanu-postgres` `backup` run for the server under a
+/// random run id; its `snapshot_id` is what the worklist is expected to surface.
+async fn make_success_run(
+	conn: &mut AsyncPgConnection,
+	device_id: Uuid,
+	group_id: Uuid,
+	server_id: Uuid,
+	snapshot_id: &str,
+) {
+	let run_id = Uuid::new_v4();
+	let sql = "INSERT INTO backup_runs (id, device_id, group_id, server_id, type, purpose, outcome, snapshot_id) \
+		 VALUES ($1, $2, $3, $4, 'tamanu-postgres', 'backup', 'success', $5)";
+	// One bind per placeholder, in `$1`..`$5` order.
+	let insert = sql_query(sql)
+		.bind::<sql_types::Uuid, _>(run_id)
+		.bind::<sql_types::Uuid, _>(device_id)
+		.bind::<sql_types::Uuid, _>(group_id)
+		.bind::<sql_types::Uuid, _>(server_id)
+		.bind::<sql_types::Text, _>(snapshot_id);
+	insert.execute(conn).await.expect("insert run");
+}
+
+/// Declares a whole-group `tamanu-postgres` replica for `consumer`, named
+/// `<intent>-decl`, by inserting the row directly.
+async fn declare_replica(
+	conn: &mut AsyncPgConnection,
+	consumer: Uuid,
+	group_id: Uuid,
+	intent: &str,
+) {
+	let name = format!("{intent}-decl");
+	let sql = "INSERT INTO restore_replicas (consumer_device_id, group_id, type, intent, name) \
+		 VALUES ($1, $2, 'tamanu-postgres', $3, $4)";
+	let insert = sql_query(sql)
+		.bind::<sql_types::Uuid, _>(consumer)
+		.bind::<sql_types::Uuid, _>(group_id)
+		.bind::<sql_types::Text, _>(intent)
+		.bind::<sql_types::Text, _>(name);
+	insert.execute(conn).await.expect("insert declaration");
+}
+
+/// Declares a `tamanu-postgres` replica scoped to `server_id`, named
+/// `<intent>-server-decl`, by inserting the row directly.
+async fn declare_replica_server(
+	conn: &mut AsyncPgConnection,
+	consumer: Uuid,
+	group_id: Uuid,
+	server_id: Uuid,
+	intent: &str,
+) {
+	let name = format!("{intent}-server-decl");
+	let sql = "INSERT INTO restore_replicas (consumer_device_id, group_id, server_id, type, intent, name) \
+		 VALUES ($1, $2, $3, 'tamanu-postgres', $4, $5)";
+	let insert = sql_query(sql)
+		.bind::<sql_types::Uuid, _>(consumer)
+		.bind::<sql_types::Uuid, _>(group_id)
+		.bind::<sql_types::Uuid, _>(server_id)
+		.bind::<sql_types::Text, _>(intent)
+		.bind::<sql_types::Text, _>(name);
+	insert.execute(conn).await.expect("insert server declaration");
+}
+
+/// The worklist dispatches only declarations whose intent the consumer registered.
+#[tokio::test(flavor = "multi_thread")]
+async fn capabilities_register_then_worklist_filters_by_intent() {
+	commons_tests::server::run_with_device_auth(
+		"backup-restore",
+		async |mut conn, cert, device_id, public, _| {
+			let group_id = make_group(&mut conn).await;
+			make_config(&mut conn, group_id, "ready").await;
+			let server_id = make_server(&mut conn, group_id).await;
+			make_success_run(&mut conn, device_id, group_id, server_id, "snap-1").await;
+			// Declare two whole-group replicas with different intents …
+			declare_replica(&mut conn, device_id, group_id, "verify").await;
+			declare_replica(&mut conn, device_id, group_id, "analytics").await;
+			// … but advertise only `verify` as a capability.
+			let capabilities = serde_json::json!({ "intents": ["verify"] });
+			let registered = public
+				.post("/restore-capabilities")
+				.add_header("mtls-certificate", &cert)
+				.json(&capabilities)
+				.await;
+			registered.assert_status(http::StatusCode::NO_CONTENT);
+
+			let worklist = public
+				.get("/restore-worklist")
+				.add_header("mtls-certificate", &cert)
+				.await;
+			worklist.assert_status_ok();
+			let entries: Vec<serde_json::Value> = worklist.json();
+			// `analytics` stays undispatched; only the `verify` entry comes back.
+			assert_eq!(entries.len(), 1, "got {entries:?}");
+			assert_eq!(entries[0]["intent"], "verify");
+			assert_eq!(entries[0]["server_id"], server_id.to_string());
+			assert_eq!(entries[0]["snapshot_id"], "snap-1");
+			assert_eq!(entries[0]["bucket"], "grp-bucket");
+		},
+	)
+	.await;
+}
+
+/// A group-wide declaration expands to one worklist entry per server in the
+/// group, each naming that server's own latest snapshot.
+#[tokio::test(flavor = "multi_thread")]
+async fn worklist_expands_group_wide_to_each_server() {
+	commons_tests::server::run_with_device_auth(
+		"backup-restore",
+		async |mut conn, cert, device_id, public, _| {
+			let group_id = make_group(&mut conn).await;
+			make_config(&mut conn, group_id, "ready").await;
+			// Two servers, each with its own successful backup.
+			let server_a = make_server(&mut conn, group_id).await;
+			let server_b = make_server(&mut conn, group_id).await;
+			make_success_run(&mut conn, device_id, group_id, server_a, "snap-a").await;
+			make_success_run(&mut conn, device_id, group_id, server_b, "snap-b").await;
+			declare_replica(&mut conn, device_id, group_id, "verify").await;
+
+			// Advertise the matching capability so the declaration is dispatched.
+			let capabilities = serde_json::json!({ "intents": ["verify"] });
+			let registered = public
+				.post("/restore-capabilities")
+				.add_header("mtls-certificate", &cert)
+				.json(&capabilities)
+				.await;
+			registered.assert_status(http::StatusCode::NO_CONTENT);
+
+			let worklist = public
+				.get("/restore-worklist")
+				.add_header("mtls-certificate", &cert)
+				.await;
+			worklist.assert_status_ok();
+			let entries: Vec<serde_json::Value> = worklist.json();
+			// One whole-group declaration → one entry per server.
+			assert_eq!(entries.len(), 2, "got {entries:?}");
+
+			// Index the entries by server id, then check each server got its own snapshot.
+			let field =
+				|entry: &serde_json::Value, key: &str| entry[key].as_str().unwrap().to_owned();
+			let mut snapshot_of: std::collections::HashMap<String, String> = entries
+				.iter()
+				.map(|entry| (field(entry, "server_id"), field(entry, "snapshot_id")))
+				.collect();
+			for (server, snapshot) in [(server_a, "snap-a"), (server_b, "snap-b")] {
+				let found = snapshot_of.remove(&server.to_string());
+				assert_eq!(found.as_deref(), Some(snapshot));
+			}
+		},
+	)
+	.await;
+}
+
+#[tokio::test(flavor = "multi_thread")]
+async fn worklist_dedupes_server_specific_over_group_wide() {
+	commons_tests::server::run_with_device_auth(
+		"backup-restore",
+		async |mut conn, cert, device_id, public, _| {
+			let group = make_group(&mut conn).await;
+			make_config(&mut conn, group, "ready").await;
+			let server = make_server(&mut conn, group).await;
+			make_success_run(&mut conn, device_id, group, server, "snap-1").await;
+			// A whole-group and a server-specific declaration of the same (type, intent).
+			declare_replica(&mut conn, device_id, group, "verify").await;
+			declare_replica_server(&mut conn, device_id, group, server, "verify").await;
+			public
+				.post("/restore-capabilities")
+				.add_header("mtls-certificate", &cert)
+				.json(&serde_json::json!({ "intents": ["verify"] }))
+				.await
+				.assert_status(http::StatusCode::NO_CONTENT);
+
+			let resp = public
+				.get("/restore-worklist")
+				.add_header("mtls-certificate", &cert)
+				.await;
+			resp.assert_status_ok();
+			let entries: Vec<serde_json::Value> = resp.json();
+			// Deduped to one entry for the server — and it must be the server-specific one.
+			assert_eq!(entries.len(), 1, "got {entries:?}");
+			assert_eq!(entries[0]["server_id"], server.to_string());
+			assert_eq!(entries[0]["name"], "verify-server-decl");
+		},
+	)
+	.await;
+}
+
+/// Declarations alone dispatch nothing: the consumer must register capabilities first.
+#[tokio::test(flavor = "multi_thread")]
+async fn worklist_empty_without_registered_capabilities() {
+	commons_tests::server::run_with_device_auth(
+		"backup-restore",
+		async |mut conn, cert, device_id, public, _| {
+			let group_id = make_group(&mut conn).await;
+			make_config(&mut conn, group_id, "ready").await;
+			make_server(&mut conn, group_id).await;
+			declare_replica(&mut conn, device_id, group_id, "verify").await;
+			// Deliberately skip POST /restore-capabilities before asking for the worklist.
+			let worklist = public
+				.get("/restore-worklist")
+				.add_header("mtls-certificate", &cert)
+				.await;
+			worklist.assert_status_ok();
+			let entries: Vec<serde_json::Value> = worklist.json();
+			assert!(entries.is_empty(), "got {entries:?}");
+		},
+	)
+	.await;
+}
+
+#[tokio::test(flavor = "multi_thread")]
+async fn restore_credentials_without_declaration_is_403() {
+	commons_tests::server::run_with_device_auth(
+		"backup-restore",
+		async |mut conn, cert, _device_id, public, _| {
+			let group_id = make_group(&mut conn).await;
+			make_config(&mut conn, group_id, "ready").await; // config only, no declaration
+			public
+				.post("/restore-credentials")
+				.add_header("mtls-certificate", &cert)
+				.json(&serde_json::json!({ "group": group_id, "type": "tamanu-postgres" }))
+				.await
+				.assert_status(http::StatusCode::FORBIDDEN);
+		},
+	)
+	.await;
+}
+
+#[tokio::test(flavor = "multi_thread")]
+async fn restore_credentials_authorized_but_unconfigured_is_502() {
+	commons_tests::server::run_with_device_auth(
+		"backup-restore",
+		async |mut conn, cert, device_id, public, _| {
+			let group_id = make_group(&mut conn).await;
+			make_config(&mut conn, group_id, "ready").await;
+			declare_replica(&mut conn, device_id, group_id, "verify").await;
+			// The declaration satisfies authz, but the harness has no STS client,
+			// so issuance fails upstream (502) instead of being refused (403).
+			let request = serde_json::json!({ "group": group_id, "type": "tamanu-postgres" });
+			public
+				.post("/restore-credentials")
+				.add_header("mtls-certificate", &cert)
+				.json(&request)
+				.await
+				.assert_status(http::StatusCode::BAD_GATEWAY);
+		},
+	)
+	.await;
+}
+
+/// A `server`-role device is refused (403) by the backup-restore worklist endpoint.
+#[tokio::test(flavor = "multi_thread")]
+async fn restore_endpoints_reject_non_consumer_role() {
+	commons_tests::server::run_with_device_auth(
+		"server",
+		async |_conn, cert, _device_id, public, _| {
+			public
+				.get("/restore-worklist")
+				.add_header("mtls-certificate", &cert)
+				.await
+				.assert_status(http::StatusCode::FORBIDDEN);
+		},
+	)
+	.await;
+}

From e36c5e66d8b89e7535fdc06b6edd5c204dd668f7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?F=C3=A9lix=20Saparelli?= <felix@bes.au>
Date: Tue, 30 Jun 2026 16:32:06 +1200
Subject: [PATCH 7/7] docs(restore): clarify freshness is restore-cadence, not
 backup interval

---
 .workhorse/specs/public-server/restore-replicas.md       | 9 ++++++++-
 crates/database/src/restore.rs                           | 5 +++--
 crates/public-server/openapi.json                        | 2 +-
 crates/public-server/src/restore.rs                      | 5 +++--
 .../2026-06-30-021427-0000_restore_replicas/up.sql       | 5 +++--
 5 files changed, 18 insertions(+), 8 deletions(-)

diff --git a/.workhorse/specs/public-server/restore-replicas.md b/.workhorse/specs/public-server/restore-replicas.md
index c76d5a36..ebd0294f 100644
--- a/.workhorse/specs/public-server/restore-replicas.md
+++ b/.workhorse/specs/public-server/restore-replicas.md
@@ -78,7 +78,7 @@ Each declaration carries:
 - a **server** within the group, or all servers in the group when none is named;
 - an **intent** describing what the replica is for;
 - a human-readable **name**;
-- a **freshness** bound: the maximum age of the restored snapshot before the replica is considered out of date and should be refreshed or re-verified;
+- a **freshness** bound: the maximum time the replica may go without a fresh successful restore before it is considered overdue — a bound on the consumer's *restore* cadence, deliberately independent of how often backups are produced (below);
 - whether the declaration is **enabled**.
 
 Intent is an open set; unrecognised intents are preserved verbatim rather than rejected, so a consumer may advertise intents Canopy does not model.
@@ -108,6 +108,13 @@ Canopy expands the consumer's enabled declarations — those whose intent the co
 The worklist does not carry credentials or the repo password.
 The consumer reconciles the worklist against what it is actually running — creating, refreshing, and tearing down replicas to match — and is responsible for converging on the desired state over time.
 
+### Latest state, not a queue
+
+Each entry names the *latest* snapshot for its `(server, type)`, not a backlog to drain.
+A consumer restores on its own cadence and skips the intermediate snapshots produced since its last restore; restoring less often than backups are produced is expected, not a failure.
+A restore can take far longer than the interval between backups — the data is slow to download and restore, and a persistent replica may be held up while its workload runs — so the consumer's restore cadence is independent of, and typically much slower than, the backup cadence.
+Consequently a replica's **freshness** bound is set to cover the consumer's restore cycle (download, restore, and any hold), not the backup interval: setting it to the backup interval would alert continuously even when restores are keeping pace as designed.
+
 ### Snapshot authority
 
 The snapshot Canopy hands out for a `(server, type)` is the snapshot identifier of that server's most recent successful backup run of that type.
diff --git a/crates/database/src/restore.rs b/crates/database/src/restore.rs
index 7675ce4a..18d34bad 100644
--- a/crates/database/src/restore.rs
+++ b/crates/database/src/restore.rs
@@ -36,8 +36,9 @@ pub struct RestoreReplica {
 	#[schema(value_type = String)]
 	pub intent: RestoreIntent,
 	pub name: String,
-	/// Max age of the restored snapshot before the replica is overdue, in
-	/// whole seconds; `None` = always track the latest snapshot.
+	/// Max time since the last healthy restore before the replica is overdue
+	/// — the consumer's *restore* cadence (download + restore + any hold), not
+	/// the backup interval. In whole seconds; `None` = no overdue bound.
 	#[schema(value_type = Option<i64>)]
 	pub freshness: Option<PgDuration>,
 	pub enabled: bool,
diff --git a/crates/public-server/openapi.json b/crates/public-server/openapi.json
index e7301488..d0997ddb 100644
--- a/crates/public-server/openapi.json
+++ b/crates/public-server/openapi.json
@@ -1933,7 +1933,7 @@
               "null"
             ],
             "format": "int64",
-            "description": "Max snapshot age before the replica is overdue, in whole seconds;\n`None` = always track the latest."
+            "description": "Max time since the last healthy restore before overdue, in whole seconds\n— the consumer's restore cadence, not the backup interval; `None` = no\noverdue bound."
           },
           "group_id": {
             "type": "string",
diff --git a/crates/public-server/src/restore.rs b/crates/public-server/src/restore.rs
index 3dfe9e9b..89c0be08 100644
--- a/crates/public-server/src/restore.rs
+++ b/crates/public-server/src/restore.rs
@@ -96,8 +96,9 @@ pub struct WorklistEntry {
 	#[schema(value_type = String)]
 	pub intent: RestoreIntent,
 	pub name: String,
-	/// Max snapshot age before the replica is overdue, in whole seconds;
-	/// `None` = always track the latest.
+	/// Max time since the last healthy restore before overdue, in whole seconds
+	/// — the consumer's restore cadence, not the backup interval; `None` = no
+	/// overdue bound.
 	pub freshness_seconds: Option<i64>,
 	/// The snapshot Canopy wants restored — the latest successful backup for
 	/// this `(server, type)`. `None` when no successful backup is yet known.
diff --git a/migrations/2026-06-30-021427-0000_restore_replicas/up.sql b/migrations/2026-06-30-021427-0000_restore_replicas/up.sql
index 8a1919a9..4127f0db 100644
--- a/migrations/2026-06-30-021427-0000_restore_replicas/up.sql
+++ b/migrations/2026-06-30-021427-0000_restore_replicas/up.sql
@@ -15,8 +15,9 @@ CREATE TABLE restore_replicas (
 	type               TEXT NOT NULL,
 	intent             TEXT NOT NULL,
 	name               TEXT NOT NULL,
-	-- Max age of the restored snapshot before the replica is overdue; NULL =
-	-- always track the latest snapshot.
+	-- Max time since the last healthy restore before the replica is overdue —
+	-- the consumer's restore cadence (download + restore + any hold), not the
+	-- backup interval; NULL = no overdue bound.
 	freshness          INTERVAL,
 	enabled            BOOLEAN NOT NULL DEFAULT TRUE,
 	created_by         TEXT,