diff --git a/ARCHITECTURE.md b/ARCHITECTURE.md
index 56568a2..aa57ea9 100644
--- a/ARCHITECTURE.md
+++ b/ARCHITECTURE.md
@@ -254,14 +254,23 @@ Auto-Approve Route to Inspector
### Sync to Cloud
-Only **approved episodes** (QA score ≥ 90% or inspector-approved) are synced to cloud:
+Only **approved episodes** (QA score >= 90% or inspector-approved) are eligible for cloud sync:
```
Edge MinIO ══(push)══► Cloud S3
```
-Sync is handled via `POST /api/v1/sync/episode` with:
-- Sidecar JSON (metadata)
+Keystone should default to manual cloud sync scheduling. `KEYSTONE_SYNC_ENABLED`
+controls whether cloud sync capability and the worker are available.
+`KEYSTONE_SYNC_AUTO_SCAN_ENABLED` controls whether the worker periodically
+discovers newly eligible approved unsynced episodes, and its default is
+`false`. With the default setting, recorded data remains in edge MinIO until an
+admin manually syncs one episode or explicitly scans and queues eligible
+episodes.
+
+Sync upload is executed by the edge `SyncWorker` through the cloud data gateway
+with:
+- Sidecar JSON metadata converted to raw tags
- MCAP file (raw sensor data)
- SHA-256 checksums for integrity validation
@@ -376,10 +385,14 @@ EDGE (Source of Truth) CLOUD (Eventually Consistent Replica)
**After network restored**:
-1. Edge sync worker resumes
-2. Queued episodes uploaded (`POST /api/v1/sync/episode`)
-3. Cloud processes each asynchronously (returns `202 Accepted`)
-4. Cloud DB becomes eventually consistent with edge
+1. Edge sync worker resumes and drains persisted `pending` sync rows.
+2. Already queued or retryable sync jobs continue according to retry policy.
+3. If `KEYSTONE_SYNC_AUTO_SCAN_ENABLED=true`, newly eligible approved unsynced
+ episodes are discovered and queued automatically.
+4. If automatic discovery is disabled, new local episodes remain unsynced until
+ an admin manually syncs one episode or explicitly scans eligible episodes.
+5. Cloud processes each upload asynchronously and cloud DB becomes eventually
+ consistent with edge.
### Sync Guarantees
diff --git a/README.md b/README.md
index d1158c3..6c64b85 100644
--- a/README.md
+++ b/README.md
@@ -76,6 +76,8 @@ Configuration is loaded from environment variables. See [`docker/.env.example`](
| `KEYSTONE_MINIO_ENDPOINT` | `http://localhost:9000` | MinIO endpoint |
| `KEYSTONE_MYSQL_HOST` | `localhost` | MySQL host |
| `KEYSTONE_MYSQL_PASSWORD` | *required* | MySQL password |
+| `KEYSTONE_SYNC_ENABLED` | `true` | Enable cloud sync capability, worker, and manual sync APIs when cloud endpoints and credentials are configured |
+| `KEYSTONE_SYNC_AUTO_SCAN_ENABLED` | `false` | Enable periodic automatic discovery of newly eligible approved unsynced episodes |
### Cloud Sync Credentials
@@ -85,6 +87,14 @@ to `AuthService.ExchangeCredential` as `credential_base64`. Keystone does not
decode, split, validate, or derive `site_id` / secret values from this key; the
cloud AuthService owns credential interpretation and validation.
+Cloud sync capability and automatic scheduling are separate. Keep
+`KEYSTONE_SYNC_ENABLED=true` when admins should be able to manually sync data to
+cloud. Leave `KEYSTONE_SYNC_AUTO_SCAN_ENABLED=false` for the default manual-only
+mode, where newly recorded or newly approved episodes remain local until an
+admin triggers single-episode sync or an explicit batch scan. Set
+`KEYSTONE_SYNC_AUTO_SCAN_ENABLED=true` only when the site should automatically
+queue every newly eligible approved unsynced episode.
+
## Project Structure
```
diff --git a/cmd/keystone-edge/main.go b/cmd/keystone-edge/main.go
index 002de8d..7164bff 100644
--- a/cmd/keystone-edge/main.go
+++ b/cmd/keystone-edge/main.go
@@ -7,6 +7,7 @@ package main
import (
"context"
+ "errors"
"flag"
"fmt"
"os"
@@ -154,17 +155,18 @@ func main() {
}
syncWorker = services.NewSyncWorker(db.DB, uploader, s3Client, cfg.Storage.Bucket, services.SyncWorkerConfig{
- BatchSize: cfg.Sync.BatchSize,
- MaxConcurrent: cfg.Sync.MaxConcurrent,
- MaxRetries: cfg.Sync.MaxRetries,
- IntervalSec: cfg.Sync.WorkerIntervalSec,
- RetryBaseSec: cfg.Sync.RetryBaseSec,
- RetryMaxSec: cfg.Sync.RetryMaxSec,
- RetryJitterSec: cfg.Sync.RetryJitterSec,
+ BatchSize: cfg.Sync.BatchSize,
+ MaxConcurrent: cfg.Sync.MaxConcurrent,
+ MaxRetries: cfg.Sync.MaxRetries,
+ AutoScanEnabled: cfg.Sync.AutoScanEnabled,
+ IntervalSec: cfg.Sync.WorkerIntervalSec,
+ RetryBaseSec: cfg.Sync.RetryBaseSec,
+ RetryMaxSec: cfg.Sync.RetryMaxSec,
+ RetryJitterSec: cfg.Sync.RetryJitterSec,
}, &cfg.Sync)
syncWorker.Start()
- logger.Printf("[SYNC] Cloud sync worker started: auth=%s gateway=%s", cfg.Sync.AuthEndpoint, cfg.Sync.GatewayEndpoint)
+ logger.Printf("[SYNC] Cloud sync worker started: auth=%s gateway=%s auto_scan=%t", cfg.Sync.AuthEndpoint, cfg.Sync.GatewayEndpoint, cfg.Sync.AutoScanEnabled)
} else {
logger.Println("[SYNC] Cloud sync disabled (KEYSTONE_SYNC_ENABLED=false or missing endpoints)")
}
@@ -184,11 +186,16 @@ func main() {
logger.Println("[SERVER] Shutting down...")
- ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
+ shutdownTimeout := 30 * time.Second
+ ctx, cancel := context.WithTimeout(context.Background(), shutdownTimeout)
defer cancel()
if err := srv.Shutdown(ctx); err != nil {
- logger.Printf("[SERVER] Error during shutdown: %v", err)
+ if errors.Is(err, context.DeadlineExceeded) || errors.Is(ctx.Err(), context.DeadlineExceeded) {
+ logger.Printf("[SERVER] Error during shutdown after %s (timeout_ms=%d): %v", shutdownTimeout, shutdownTimeout.Milliseconds(), err)
+ } else {
+ logger.Printf("[SERVER] Error during shutdown: %v", err)
+ }
}
logger.Println("[SERVER] Keystone Edge stopped")
diff --git a/docs/designs/cloud-sync-persistent-queue-design.md b/docs/designs/cloud-sync-persistent-queue-design.md
index b97e4fa..0e9f678 100644
--- a/docs/designs/cloud-sync-persistent-queue-design.md
+++ b/docs/designs/cloud-sync-persistent-queue-design.md
@@ -76,7 +76,25 @@ The polling worker periodically:
Automatic retry also does not create an observable queued period.
-### 2.3 Frontend Listing
+### 2.3 Automatic Discovery
+
+The worker also has a discovery path that finds approved, unsynced episodes and
+enqueues them without an operator action. That behavior should be controlled by
+`KEYSTONE_SYNC_AUTO_SCAN_ENABLED` and should default to `false`.
+
+With the default disabled setting:
+
+- newly recorded or newly approved episodes remain local after upload/QA;
+- single-episode manual sync still works through `POST /api/v1/sync/episodes/:id`;
+- explicit batch scan still works through `POST /api/v1/sync/episodes`;
+- persisted `pending` rows and due retries from already attempted sync jobs are
+ still processed by the worker.
+
+This separates automatic discovery from retry/backoff recovery. A failed manual
+sync can continue retrying according to sync retry policy without re-enabling
+automatic discovery for unrelated new episodes.
+
+### 2.4 Frontend Listing
Cloud Sync Center lists one row per episode using the latest `sync_logs` row.
Its queued count is the number of latest rows whose status is `pending`.
@@ -90,6 +108,8 @@ Therefore, a job that only exists in memory cannot be counted as queued.
- Preserve manual retry semantics: operators can retry exhausted or backoff
failures explicitly.
- Preserve automatic retry limits and backoff behavior.
+- Disable automatic discovery of newly eligible episodes by default while keeping
+ manual enqueue and retry recovery available.
- Allow queued work to recover after Keystone restarts.
- Keep the change scoped to the existing `sync_logs` model unless stronger
queue semantics become necessary later.
@@ -164,10 +184,12 @@ The worker loop should process DB-backed queued work before discovering new work
1. Dispatch latest `pending` sync logs.
2. Promote due failed rows to `pending`, then dispatch them.
-3. Discover approved, unsynced episodes with no active/latest completed log.
+3. Discover approved, unsynced episodes with no active/latest completed log only
+ when automatic discovery is enabled.
This makes the queue restart-safe. If Keystone crashes after writing `pending`
but before placing the job into memory, the next polling cycle will pick it up.
+Disabling automatic discovery must not disable this pending-row recovery path.
### 5.4 Claiming Pending Rows
@@ -229,6 +251,24 @@ After a successful manual retry:
The frontend should continue treating the backend summary endpoint as the source
of truth.
+### 5.7 Automatic Discovery Switch
+
+Add a worker configuration field backed by an environment variable:
+
+| Field | Environment Variable | Default | Meaning |
+|-------|----------------------|---------|---------|
+| `AutoScanEnabled` | `KEYSTONE_SYNC_AUTO_SCAN_ENABLED` | `false` | Permit periodic discovery of newly eligible approved unsynced episodes |
+
+`KEYSTONE_SYNC_ENABLED` remains the cloud sync capability switch. Setting it to
+`false` disables the worker and therefore disables manual sync APIs. To support
+manual-only production mode, keep `KEYSTONE_SYNC_ENABLED=true` and leave
+`KEYSTONE_SYNC_AUTO_SCAN_ENABLED=false`.
+
+The worker polling loop should be structured so that this switch only gates the
+new-episode discovery step. It should not gate dispatch of existing `pending`
+rows, manual enqueue acceleration, or retry/backoff handling for already
+attempted sync work.
+
## 6. Risks and Mitigations
### 6.1 Duplicate Pending Rows
@@ -308,6 +348,16 @@ Mitigation:
## 7. Implementation Plan
+### Phase 0: Automatic Discovery Control
+
+- Add `KEYSTONE_SYNC_AUTO_SCAN_ENABLED` with default `false`.
+- Add matching `SyncConfig.AutoScanEnabled` and worker config fields.
+- Return `auto_scan_enabled` from `GET /api/v1/sync/config` as read-only
+ runtime state.
+- Gate only the worker's newly eligible episode discovery path.
+- Keep manual single-episode sync, explicit batch scan, pending-row recovery,
+ and retry/backoff processing available while the worker is running.
+
### Phase 1: Durable Manual Queue
- Add a worker method to create or reuse a pending sync log transactionally.
@@ -347,6 +397,12 @@ Retry 4 failed episodes with MaxConcurrent=2
### Backend Unit Tests
+- Worker polling does not auto-discover approved unsynced episodes when
+ `AutoScanEnabled=false`.
+- Worker polling still dispatches persisted pending rows when
+ `AutoScanEnabled=false`.
+- Worker polling auto-discovers approved unsynced episodes when
+ `AutoScanEnabled=true`.
- Manual retry creates a pending row for a failed exhausted episode.
- Manual retry rejects an episode whose latest row is pending.
- Manual retry rejects an episode whose latest row is in_progress.
@@ -374,8 +430,10 @@ Retry 4 failed episodes with MaxConcurrent=2
## 9. Recommendation
-Implement Phase 1 first. It fixes the operator-visible problem with a small,
-contained backend change and does not require a new database table.
+Implement Phase 0 and Phase 1 first. Phase 0 prevents newly recorded data from
+automatically leaving the edge by default, while Phase 1 preserves clear manual
+queue visibility. Both changes are small, contained backend changes and do not
+require a new database table.
Do not ship durable pending without DB-backed duplicate protection and polling
recovery. Those two pieces are required for correctness; otherwise the system
diff --git a/docs/designs/cloud-sync-ui-implementation.md b/docs/designs/cloud-sync-ui-implementation.md
index 9439228..31877cd 100644
--- a/docs/designs/cloud-sync-ui-implementation.md
+++ b/docs/designs/cloud-sync-ui-implementation.md
@@ -34,6 +34,11 @@ Synapse. Data production statistics should keep cloud sync as a metric only, but
should not be the primary place for queue control, retry diagnosis, navigation,
or future sync operations.
+Cloud sync scheduling should be explicit by default. Keystone should keep cloud
+sync capability available when configured, but automatic discovery of newly
+approved unsynced episodes should default to off. Operators or admins can still
+manually enqueue one episode or explicitly scan and enqueue eligible episodes.
+
## 2. Current Backend Capability
### 2.1 Existing Cloud Sync APIs
@@ -43,7 +48,7 @@ endpoints recommended for the episode-centered Cloud Sync Center redesign:
| Method | Path | Purpose |
|--------|------|---------|
-| `POST` | `/api/v1/sync/episodes` | Enqueue pending approved, unsynced episodes for cloud sync |
+| `POST` | `/api/v1/sync/episodes` | Manually scan and enqueue approved, unsynced episodes for cloud sync |
| `POST` | `/api/v1/sync/episodes/:id` | Enqueue one episode for cloud sync by numeric episode ID |
| `GET` | `/api/v1/sync/episodes` | Existing: list raw sync log entries for history/diagnosis |
| `GET` | `/api/v1/sync/episodes/summary` | Recommended new endpoint: list latest sync state grouped by episode |
@@ -77,7 +82,33 @@ from local MinIO, uploads it through the cloud DataGateway/OSS flow, and updates
- `episodes.cloud_mcap_path`
- `episodes.cloud_processed`
-### 2.3 Manual Trigger Semantics
+### 2.3 Scheduling Modes
+
+Keystone should separate two concerns:
+
+| Concern | Configuration | Default | Behavior |
+|---------|---------------|---------|----------|
+| Cloud sync capability | `KEYSTONE_SYNC_ENABLED` | `true` | Creates the cloud clients, `SyncWorker`, sync APIs, and manual enqueue path when cloud endpoints and credentials are present |
+| Automatic episode discovery | `KEYSTONE_SYNC_AUTO_SCAN_ENABLED` | `false` | When enabled, the worker periodically finds newly eligible approved unsynced episodes and enqueues them without an operator action |
+
+When `KEYSTONE_SYNC_AUTO_SCAN_ENABLED=false`, Keystone must not automatically
+enqueue newly completed or newly approved episodes just because they match the
+eligibility rules. This is the recommended default for controlled production
+sites where data should remain local until an admin explicitly syncs it.
+
+Manual sync remains available while `KEYSTONE_SYNC_ENABLED=true` and the worker
+is running:
+
+- `POST /api/v1/sync/episodes/:id` enqueues one approved unsynced episode.
+- `POST /api/v1/sync/episodes` explicitly scans and enqueues all currently
+ eligible approved unsynced episodes.
+
+Automatic retry is separate from automatic discovery. Once an episode has been
+manually enqueued and a `sync_logs` row exists, retry/backoff may continue using
+the existing retry rules. Disabling automatic discovery should not strand
+already queued or already attempted sync work.
+
+### 2.4 Manual Trigger Semantics
The existing manual APIs have different retry behavior.
@@ -110,7 +141,7 @@ Behavior:
For product clarity, the current batch API should be presented as "scan and
enqueue eligible pending episodes", not as "force retry all failures".
-### 2.4 Listing Model
+### 2.5 Listing Model
`sync_logs` is an audit-style attempt-chain table. One episode may have multiple
rows because manual retry can create a fresh `sync_logs` row when the latest
@@ -138,6 +169,7 @@ as an expandable row, drawer, or "View history" action.
| Admin | Retry failed cloud sync jobs safely |
| Admin | Understand why a sync failed |
| Admin | Trigger sync for all eligible unsynced episodes |
+| Admin | Keep newly recorded local data from syncing automatically by default |
| Operator | See whether collected data is available, without managing retries |
### 3.2 Design Goals
@@ -147,6 +179,7 @@ as an expandable row, drawer, or "View history" action.
- Avoid fake progress bars because Keystone does not expose upload percentage.
- Make failure recovery explicit and auditable.
- Keep destructive or expensive bulk actions deliberate.
+- Make automatic discovery opt-in; default operation is manual enqueue.
- Preserve current backend behavior while identifying API gaps for follow-up.
- Keep the data production statistics page focused on analysis, not operations.
@@ -243,7 +276,7 @@ Primary actions:
| Scan and queue eligible unsynced episodes | `POST /api/v1/sync/episodes` | Confirm before enqueue, then refresh counts/table |
| Retry one failed episode | `POST /api/v1/sync/episodes/:id` | Row-level action, then refresh the row/list |
| Retry failed episodes in bulk | Not explicit yet | TODO; requires backend API semantics |
-| Pause/resume automatic sync | Not supported yet | TODO; do not show as active control |
+| Enable/disable automatic discovery | Config only | Default off; show read-only state, do not expose as a live UI toggle |
| Edit sync configuration | Not supported yet | TODO; read-only summary for now |
The page should make queue state explicit:
@@ -730,6 +763,7 @@ TODO: extend `GET /sync/config` with non-sensitive runtime values:
```json
{
"worker_running": true,
+ "auto_scan_enabled": false,
"batch_size": 10,
"max_concurrent": 2,
"max_retries": 5,
@@ -822,6 +856,7 @@ Acceptance criteria:
- Add latest sync summary to episode list/detail responses.
- Add batch trigger body with `force` and filters.
- Extend sync config response with read-only non-sensitive values.
+- Add `auto_scan_enabled` to sync config response.
- Add admin auth guard for sync APIs.
Acceptance criteria:
@@ -841,3 +876,4 @@ Acceptance criteria:
| Should batch sync use current statistics filters? | Only after backend supports filter parameters |
| Should long-running uploads show percentage? | No; backend does not expose progress |
| Should `.env` sync config be editable in Synapse? | Not now; keep as TODO after read-only config visibility |
+| Should automatic discovery run by default? | No; default `KEYSTONE_SYNC_AUTO_SCAN_ENABLED=false` |
diff --git a/docs/designs/recorder-axon-interaction-test-plan.md b/docs/designs/recorder-axon-interaction-test-plan.md
new file mode 100644
index 0000000..9d267f0
--- /dev/null
+++ b/docs/designs/recorder-axon-interaction-test-plan.md
@@ -0,0 +1,120 @@
+# Recorder / Axon Interaction Test Plan
+
+This document defines the Keystone-side test coverage for recorder WebSocket and RPC behavior. The goal is to find protocol, state recovery, and duplicate-dispatch bugs with tests. If a test fails because product behavior is wrong, fix implementation code; do not weaken assertions only to make tests pass.
+
+## Scope
+
+Covered now:
+
+1. Keystone handler/unit behavior around recorder state, RPC result handling, task state transitions, connection replacement, and disconnect recovery.
+2. Keystone-to-fake-Axon WebSocket interaction behavior using real websocket connections, including connect-time state sync, reconnect, late messages, RPC response shape, timeout, and duplicate config prevention.
+
+Out of scope for this document:
+
+- Synapse browser/UI tests.
+- Axon C++ implementation tests.
+- Network stack integration outside in-process `httptest` WebSocket servers.
+
+## Invariants
+
+- Config may be sent only when Keystone has a trusted state snapshot for the current recorder connection and that snapshot is `idle`.
+- A recorder connection starts as unsynced. `state_synced` becomes true only after a non-empty recorder state snapshot from `state_update` or `get_state`.
+- If the recorder is `ready`, `recording`, or `paused`, any new `config` request must be rejected before Keystone sends an RPC.
+- A ping timeout or recorder WebSocket close may revert DB tasks from `ready` or `in_progress` to `pending`, but reconnect state sync must restore the same task when Axon still reports it.
+- A task that is reverted to `pending` during reconnect must not be reconfigured while Axon still reports `ready` with that task id.
+- Old/replaced recorder connections must not update DB state, satisfy pending RPCs, or override current connection state.
+- RPC success controls DB transitions. `success:false`, timeout, or disconnected recorder must not advance/revert task state as if the command succeeded.
+- Repeated state events for the same task must be idempotent.
+
+## Keystone Handler / Unit Test Matrix
+
+### State snapshot and DB reconciliation
+
+- [x] `state_update ready + task_id` advances `pending -> ready` and sets `ready_at`.
+- [x] `state_update recording + task_id` advances `pending/ready -> in_progress` and sets `started_at`.
+- [x] `state_update paused + task_id` advances `pending/ready -> in_progress` and sets `started_at`.
+- [x] `get_state ready` with `task_config.task_id` advances `pending -> ready`.
+- [x] `get_state recording` with top-level `task_id` advances `pending/ready -> in_progress`.
+- [x] `get_state paused` with `task_config.task_id` advances `pending/ready -> in_progress`.
+- [x] `idle + task_id` does not advance or revert DB state.
+- [x] non-idle state without `task_id` marks connection synced but does not change DB state.
+- [x] empty state snapshot keeps connection unsynced.
+- [x] duplicate ready/recording snapshots are idempotent and do not corrupt timestamps.
+
+### State API contract
+
+- [x] disconnected `/state` returns `connected=false`, `state_synced=false`, `syncing=false`.
+- [x] connected but unsynced `/state` returns `connected=true`, `state_synced=false`, `syncing=true`.
+- [x] connected and synced `/state` returns `state_synced=true`, `syncing=false`.
+- [x] `/devices` includes `state_synced` for each connected recorder.
+
+### Config gating
+
+- [x] `config` before initial state snapshot returns 409 and sends no RPC.
+- [x] `config` while synced recorder is `ready` returns 409 and sends no RPC.
+- [x] `config` while synced recorder is `recording` returns 409 and sends no RPC.
+- [x] `config` while synced recorder is `paused` returns 409 and sends no RPC.
+- [x] `config` while synced recorder state is unknown/non-idle returns 409 and sends no RPC.
+- [x] `config` after synced `idle` sends RPC and advances `pending -> ready` only on success.
+- [x] `config` with `success:false` keeps task `pending`.
+- [x] `config` timeout keeps task `pending`.
+- [x] `config` disconnected returns 404 and keeps task `pending`.
+
+### Command RPC side effects
+
+- [x] `begin` with `success:false` keeps task unchanged.
+- [x] `begin` success advances `pending/ready -> in_progress` and is idempotent for already `in_progress`.
+- [x] `begin` timeout or disconnect keeps task unchanged.
+- [x] `cancel` success reverts `ready/in_progress -> pending` only for provided task id.
+- [x] `cancel` `success:false`, timeout, or disconnect keeps task unchanged.
+- [x] `clear` success reverts `ready -> pending`, does not revert `in_progress`, and sends no task payload to Axon.
+- [x] `clear` `success:false`, timeout, or disconnect keeps task unchanged.
+- [x] `pause`, `resume`, `finish`, and `quit` forward the RPC action and do not mutate task state directly.
+- [x] `get_stats` disconnected returns HTTP 200 with `connected=false` and empty data.
+- [x] `get_stats` Axon failure returns HTTP 200 with `connected=true`, data, and error message.
+- [x] `get_stats` timeout returns HTTP 504.
+
+### Disconnect and replacement
+
+- [x] messages from replaced recorder connections are ignored for `state_update` and `config_applied`.
+- [x] late `rpc_response` from replaced connections cannot satisfy current pending RPCs.
+- [x] recorder disconnect reverts matching `ready` tasks to `pending` without sending recorder RPCs.
+- [x] recorder disconnect reverts matching `in_progress` tasks to `pending` without sending recorder RPCs.
+- [x] recorder disconnect does not revert `pending`, `completed`, `failed`, deleted, or other-device tasks.
+- [x] old connection disconnect after replacement does not revert tasks for the current connection.
+
+## Keystone <-> Fake Axon WebSocket Test Matrix
+
+### Connect-time sync
+
+- [x] On WebSocket connect, Keystone sends `get_state`; fake Axon replies `idle`; `/state` becomes synced idle.
+- [x] Fake Axon sends `state_update ready + task_id` before `get_state` response; task advances to `ready` and stays correct when `get_state` later returns same state.
+- [x] `get_state` returns `ready + task_id`; task advances to `ready`; direct `config` is rejected as busy.
+- [x] `get_state` returns `recording + task_id`; task advances to `in_progress`; direct `config` is rejected as busy.
+- [x] `get_state` timeout leaves the connection unsynced, `/state` keeps `syncing=true`, and `config` is rejected until a valid `state_update` arrives.
+- [x] `get_state` returns `success:false`; connection remains unsynced and `config` is rejected until a valid `state_update` arrives.
+- [x] `state_update` with empty current state does not mark connection synced.
+
+### Reconnect after timeout / close
+
+- [x] Ready task path: task is `ready`; recorder WebSocket closes; DB reverts to `pending`; fake Axon reconnects and reports `ready + same task_id`; DB returns to `ready`; no `config` RPC is sent during the window.
+- [x] Recording task path: task is `in_progress`; recorder WebSocket closes; DB reverts to `pending`; fake Axon reconnects and reports `recording + same task_id`; DB returns to `in_progress`; no `config` RPC is sent during the window.
+- [x] Idle-after-reconnect path: task is reverted to `pending`; fake Axon reconnects and reports `idle`; `config` is allowed and sends exactly one RPC.
+- [x] Reconnect replaces a still-active old connection; old connection late `state_update`, `config_applied`, and `rpc_response` do not mutate task state or satisfy new requests.
+
+### RPC action protocol
+
+- [x] `config` sends action `config` with `task_config` payload exactly once when recorder is synced idle.
+- [x] `clear` sends action `clear` without UI task payload even when HTTP body includes `task_id`.
+- [x] `begin`, `cancel`, `finish`, `pause`, `resume`, `quit`, and `get_stats` send the expected action names and payloads.
+- [x] Axon response with unmatched `request_id` is ignored and does not unblock Keystone RPC waiters.
+- [x] Axon response `success:false` returns HTTP 200 with the RPC response body but does not mutate DB state.
+- [x] Missing Axon response causes Keystone HTTP timeout and no DB mutation.
+
+## Implementation Notes
+
+- Prefer deterministic fake Axon helpers over sleeps. Helpers should expose channels for inbound RPC requests and send explicit RPC responses by request id.
+- Use real `httptest` WebSocket servers for layer 2 tests so the Keystone read loop, connect-time `get_state`, and disconnect defers are exercised.
+- Keep DB fixtures small but include `robots`, `workstations`, and `tasks` when testing disconnect joins.
+- Use `ResponseTimeout` values short enough for timeout tests, but avoid making the suite flaky.
+- Keep assertions strict: status code, response body, sent action, request payload, DB status, and timestamp side effects should be checked where relevant.
diff --git a/docs/designs/recorder-axon-interaction-tests.html b/docs/designs/recorder-axon-interaction-tests.html
new file mode 100644
index 0000000..9a225b2
--- /dev/null
+++ b/docs/designs/recorder-axon-interaction-tests.html
@@ -0,0 +1,407 @@
+
+
+
+
+
+ Keystone / Axon Recorder 交互测试说明
+
+
+
+
+
+
+
+
8条核心不变量,定义系统必须守住的边界。
+
7个测试分组,从状态同步到重连恢复逐层覆盖。
+
40+个具体断言点,检查 HTTP 状态、RPC 是否发送、DB 是否变化。
+
0当前文档中没有未覆盖的测试项。
+
+
+
+ 先看结论
系统必须守住的事
+ Keystone 和 Axon 的交互最怕“状态还没同步就下发新任务”。所以测试围绕一个核心原则展开:只有确认 Axon 当前是可信的 idle 状态,Keystone 才能发 config。
+
+
1. 新连接Recorder 连接上来时先视为状态未知,不能配置任务。
+
2. 拉状态Keystone 发 get_state,或者等待 Axon 发 state_update。
+
3. 状态可信只有拿到非空状态,才认为这条连接已经同步。
+
4. 忙碌拒绝如果 Axon 是 ready、recording、paused,新 config 必须拒绝。
+
5. 断线恢复DB 可以短暂回到 pending,但重连后要按 Axon 真实状态恢复。
+
+
+
+
+ 状态同步与 DB 恢复
10 项
+ 这些测试确认 Keystone 能正确理解 Axon 上报的状态,并把任务表恢复到正确状态;同时避免空状态、重复状态把 DB 搞乱。
+
+
状态恢复Axon 报 ready,要把 pending 任务恢复成 ready
- 模拟
- Axon 发
state_update ready + task_id,或 get_state ready。 - 期望
- DB 中对应任务从
pending 变成 ready,并写入 ready_at。 - 防止
- Recorder 已经准备好,但 Keystone 还以为任务没配置。
TestRecorderStateUpdateReadyRestoresPendingTask
TestRecorderGetStateSnapshotReadyRestoresPendingTask
TestRecorderStateSnapshotReconcilesAdditionalStates
+
状态恢复Axon 报 recording 或 paused,要恢复成 in_progress
- 模拟
- Axon 上报
recording 或 paused,并带当前 task_id。 - 期望
- DB 中任务进入
in_progress,并设置 started_at。 - 防止
- 设备已经录制中,但 Keystone 仍显示任务待开始。
TestRecorderStateUpdateRecordingAdvancesPendingTask
TestRecorderStateUpdatePausedAdvancesPendingTask
TestRecorderStateSnapshotReconcilesAdditionalStates
+
边界状态Axon 报 idle,即使带 task_id,也不能推进 DB
- 模拟
- 收到
idle + task_id 这种容易误判的状态。 - 期望
- 任务状态不变,不自动变成
ready 或 in_progress。 - 防止
- 把空闲设备误认为已经配置好任务。
TestRecorderStateSnapshotReconcilesAdditionalStates
+
无 task_id非 idle 但没有 task_id,只同步连接,不改任务
- 模拟
- Axon 报
ready 但没带任务号。 - 期望
- 连接可以标记为已同步,但 DB 任务不能被推进。
- 防止
- 不知道是哪条任务时误改 DB。
TestRecorderNonIdleWithoutTaskIDMarksSyncedWithoutDBChange
+
同步门闩空状态不能算同步完成
- 模拟
- Axon 发空的
state_update 或空快照。 - 期望
state_synced 仍是 false,前后端继续认为还在同步中。- 防止
- 空状态被当成 idle,导致错误开放 config。
TestRecorderEmptyStateSnapshotKeepsConnectionUnsynced
TestRecorderWebSocketEmptyStateUpdateDoesNotMarkSynced
+
幂等重复状态不能破坏时间戳
- 模拟
state_update ready 先到,随后 get_state ready 又到。- 期望
- 任务仍是
ready,ready_at 不被重复刷新。 - 防止
- 重复消息导致时间线被篡改,影响追踪。
TestRecorderWebSocketStateUpdateBeforeGetStateResponseIsIdempotent
+
+
+
+
+ 状态 API 契约
4 项
+ 这些测试确认 Synapse 或其它调用方看到的状态字段是可靠的,尤其是 state_synced 和 syncing。
+
+
API断开时 /state 要明确说未连接
- 模拟
- Recorder WebSocket 不存在。
- 期望
- 返回
connected=false、state_synced=false、syncing=false。 - 防止
- 前端把离线误当同步中或可操作。
TestRecorderGetStateReportsSyncingBeforeInitialStateSnapshot
相关 disconnected /state 断言
+
API连接上但没同步时,要显示 syncing
- 模拟
- WebSocket 已连接,但还没收到有效状态。
- 期望
connected=true、syncing=true。- 防止
- 重连窗口里误开放配置表单。
TestRecorderGetStateReportsSyncingBeforeInitialStateSnapshot
+
API同步完成后,/state 要显示可信状态
- 模拟
- 收到有效
idle 快照。 - 期望
state_synced=true、syncing=false。- 防止
- 设备已经可判断状态,但 UI 一直卡在同步中。
TestRecorderGetStateReportsSyncedAfterInitialStateSnapshot
TestRecorderWebSocketConnectGetStateIdleMarksStateSynced
+
API/devices 也要带 state_synced
- 模拟
- 一个 Recorder 已同步,一个未同步。
- 期望
- 设备列表能区分每台设备是否完成状态同步。
- 防止
- 列表层只知道在线,不知道能不能安全下发任务。
TestRecorderListDevicesIncludesStateSynced
+
+
+
+
+ config 下发门闩
9 项
+ 这些测试直接保护“不要重复下发任务”。只要 Keystone 还没确认 Axon 是可信 idle,就不能把 config 送到设备。
+
+
禁止重复下发初始状态未同步时,config 必须 409
- 模拟
- Recorder 刚重连,还没有有效状态。
- 期望
- HTTP 返回 409,且 fake Axon 收不到
config。 - 防止
- 重连窗口把 pending 任务重复配置给设备。
TestRecorderConfigRejectsBeforeInitialStateSnapshot
TestRecorderWebSocketUnsyncedConfigRejectedUntilValidStateUpdate
+
忙碌保护Axon 已 ready / recording / paused 时,config 必须拒绝
- 模拟
- 状态已同步,但 Axon 正持有任务或录制中。
- 期望
- 返回 409,且不发 RPC。
- 防止
- 给一个已经有任务的 Recorder 再塞一条任务。
TestRecorderConfigRejectsWhenSyncedRecorderBusy
TestRecorderConfigRejectsAdditionalBusyStates
TestRecorderWebSocketGetStateReadyRejectsConfigWithoutSendingRPC
+
允许路径只有 synced idle 才能发 config
- 模拟
- Keystone 已确认 Axon 是
idle。 - 期望
- 发出一次
config RPC;成功后 DB 从 pending 变 ready。 - 防止
- 正常生产路径被安全门闩误伤。
TestRecorderConfigAdvancesTaskAfterInitialStateSnapshot
TestRecorderWebSocketIdleReconnectAllowsSingleConfig
+
失败不改 DBconfig 返回 success:false,任务仍是 pending
- 模拟
- Axon 拒绝配置。
- 期望
- Keystone 不把任务推进到
ready。 - 防止
- 设备没接受任务,但后台显示已就绪。
TestRecorderConfigDoesNotAdvanceTaskWhenRPCResponseUnsuccessful
+
超时保护config 超时,任务仍是 pending
- 模拟
- Keystone 发了
config,Axon 没回应。 - 期望
- HTTP 504,DB 不变。
- 防止
- 不知道设备有没有收到时,后台抢先标 ready。
TestRecorderConfigTimeoutAndDisconnectedKeepTaskPending
Missing Axon response assertions
+
离线保护Recorder 离线时 config 返回 404
- 模拟
- 没有对应 WebSocket 连接。
- 期望
- 返回 404,任务保持 pending。
- 防止
- 给不存在的连接制造假成功。
TestRecorderConfigTimeoutAndDisconnectedKeepTaskPending
+
+
+
+
+ 命令 RPC 的副作用
11 项
+ 这些测试检查 begin、cancel、clear、pause、resume、finish、quit、get_stats 等命令:哪些命令应该改 DB,哪些只应该转发给 Axon。
+
+
beginbegin 成功才进入 in_progress
- 模拟
- Axon 对
begin 返回成功。 - 期望
- 任务从
pending/ready 进入 in_progress;已在 in_progress 时保持幂等。 - 防止
- 重复 begin 或状态不一致造成任务错乱。
TestRecorderBeginSuccessTimeoutAndDisconnectedTaskState
TestRecorderBeginSuccessAdvancesReadyAndLeavesInProgress
+
begin 失败begin 失败、超时或断连时不改任务
- 模拟
- Axon 返回失败、不响应、或 Recorder 不在线。
- 期望
- 任务状态保持原样。
- 防止
- 设备没开始录制,后台却显示录制中。
TestRecorderBeginDoesNotAdvanceTaskWhenRPCResponseUnsuccessful
TestRecorderBeginSuccessTimeoutAndDisconnectedTaskState
+
cancelcancel 成功才把 ready/in_progress 回退 pending
- 模拟
- Axon 接受取消命令。
- 期望
- 只有提供的
task_id 从运行态回到 pending。 - 防止
- 取消一条任务时误伤其它任务。
TestRecorderCancelAndClearTaskStateSideEffects
+
cancel 失败cancel 失败、超时或断连时不回退 DB
- 模拟
- Axon 拒绝取消、无响应、或断线。
- 期望
- DB 继续保持原状态。
- 防止
- 设备还在录制,后台却把任务放回 pending。
TestRecorderCancelAndClearTaskStateSideEffects
TestRecorderCancelTimeoutAndDisconnectedKeepTaskState
+
clearclear 成功只清 ready,不清 in_progress
- 模拟
- 对 ready 任务执行 clear,同时存在 in_progress 任务。
- 期望
- ready 任务回 pending,in_progress 任务不变;发给 Axon 的 clear 不带 UI 的 task payload。
- 防止
- 清配置时误取消录制中任务,或协议 payload 不稳定。
TestRecorderCancelAndClearTaskStateSideEffects
TestRecorderWebSocketRPCActionProtocol
+
clear 失败clear 失败、超时或断连时不改 DB
- 模拟
- Axon 拒绝 clear、不响应、或离线。
- 期望
- ready 任务仍保持 ready。
- 防止
- 设备没清掉配置,后台却显示可重新配置。
TestRecorderClearFailureTimeoutAndDisconnectedKeepTaskState
+
纯转发命令pause / resume / finish / quit 只发 RPC,不直接改 DB
- 模拟
- 这些命令返回成功。
- 期望
- Keystone 只转发命令,不擅自改变任务状态。
- 防止
- 设备实际状态还没上报,后台先行改状态。
TestRecorderForwardOnlyActionsDoNotMutateTaskState
TestRecorderWebSocketRPCActionProtocol
+
statsget_stats 的三种结果都要明确
- 模拟
- Recorder 离线、Axon 返回失败、Axon 不响应。
- 期望
- 离线返回
connected=false;设备失败返回 200 + error;超时返回 504。 - 防止
- 监控面板把离线、设备错误、网络超时混成一种情况。
TestRecorderGetStatsContracts
+
+
+
+
+ 断线、重连和旧连接污染
10 项
+ 这些测试专门覆盖 ping timeout / WebSocket close 后的危险窗口:DB 可能短暂回 pending,但不能因此重复下发,也不能让旧连接的迟到消息污染新连接。
+
+
断线回退断开时只回退本设备 runnable 任务
- 模拟
- 同一设备下有 ready、in_progress、pending、completed、deleted 任务,另一个设备也有 ready。
- 期望
- 只把本设备 ready/in_progress 回 pending,并清理运行时字段。
- 防止
- 断线清理误伤别的设备或已完成任务。
TestRecorderDisconnectRevertsOnlyRunnableTasksForDevice
+
ready 重连ready 任务断线后,重连仍恢复同一条任务
- 模拟
- DB 中 ready 任务因断线回 pending;Axon 重连后报告
ready + same task_id。 - 期望
- DB 恢复 ready;窗口内和恢复后都不重复发 config。
- 防止
- 你关心的“同一 task_id 被再次下发 config”。
TestRecorderWebSocketReadyReconnectRestoresSameTaskWithoutDuplicateConfig
+
recording 重连录制中断线后,重连恢复 in_progress
- 模拟
- DB 中 in_progress 任务断线回 pending;Axon 重连后报告
recording + same task_id。 - 期望
- DB 恢复 in_progress;不会开放新 config。
- 防止
- 设备仍在录制,后台却允许配置下一条。
TestRecorderWebSocketRecordingReconnectRestoresSameTaskWithoutDuplicateConfig
+
idle 重连如果 Axon 重连后确实 idle,才允许重新 config
- 模拟
- 断线回 pending 后,Axon 重连报告
idle。 - 期望
- Keystone 允许 config,并且只发送一次。
- 防止
- 安全门闩过严,导致设备确实空闲时无法恢复生产。
TestRecorderWebSocketIdleReconnectAllowsSingleConfig
+
旧连接污染旧连接的 state_update / config_applied 必须忽略
- 模拟
- 新连接已经替换旧连接,旧连接又迟到发状态变更消息。
- 期望
- DB 不被旧连接推进。
- 防止
- 网络迟到消息把当前真实状态覆盖掉。
TestRecorderIgnoresStateChangingMessagesFromReplacedConnection
TestRecorderWebSocketReplacementIgnoresOldConnectionLateMessages
+
旧响应污染旧连接的 rpc_response 不能完成新请求
- 模拟
- 新连接上有 pending RPC;旧连接伪造同 request_id 的响应。
- 期望
- 旧响应被忽略,只有当前连接响应才能完成请求。
- 防止
- 旧连接的成功响应让新请求误成功。
TestRecorderIgnoresRPCResponseFromReplacedConnection
TestRecorderWebSocketReplacementIgnoresOldConnectionLateMessages
+
替换断开旧连接被替换后断开,不应触发任务回退
- 模拟
- 新连接已经接管,旧连接随后 disconnect。
- 期望
- 当前 ready 任务不被旧连接断开回退。
- 防止
- 连接接管后,被旧连接清理逻辑误伤。
TestOldRecorderDisconnectAfterReplacementDoesNotRevertTasks
+
+
+
+
+ connect-time get_state 与协议细节
8 项
+ 这些测试使用真实 WebSocket 和 fake Axon,检查 Keystone 发出的 RPC action、payload、request_id 匹配和超时行为。
+
+
连接握手WebSocket 一连上,Keystone 会主动 get_state
- 模拟
- fake Axon 建立 WebSocket 连接。
- 期望
- 首先收到
get_state;回复 idle 后 Keystone 状态变 synced idle。 - 防止
- 重连后 Keystone 不主动校准设备状态。
TestRecorderWebSocketConnectGetStateIdleMarksStateSynced
+
get_state 超时get_state 超时后仍保持 syncing,config 继续阻断
- 模拟
- fake Axon 收到
get_state 但不回应,等到 ResponseTimeout 之后再操作。 - 期望
/state 仍是 syncing=true;config 返回 409 且不发 RPC;之后有效 state_update 才解锁。- 防止
- 状态拉取超时后被误当成 idle。
TestRecorderWebSocketGetStateTimeoutLeavesUnsyncedAndBlocksConfig
+
get_state 失败get_state 返回 success:false,也不能解锁 config
- 模拟
- Axon 明确回复状态不可用。
- 期望
- 连接仍 unsynced,config 仍被阻断。
- 防止
- 把失败响应当成有效状态。
TestRecorderWebSocketGetStateFailureLeavesUnsyncedAndBlocksConfig
+
RPC payload每个命令发出的 action 和 payload 要稳定
- 模拟
- 依次调用 config、begin、cancel、clear、finish、pause、resume、quit、get_stats。
- 期望
- fake Axon 收到正确 action;config 带 task_config;clear 不带 UI task_id。
- 防止
- 协议字段改错,导致 Axon 端无法解析。
TestRecorderWebSocketRPCActionProtocol
+
request_id不匹配的 rpc_response 不能解开 pending RPC
- 模拟
- Axon 返回一个不存在的
request_id。 - 期望
- Keystone 继续等待真正响应。
- 防止
- 不同请求串线,HTTP 请求拿到别人的结果。
TestRecorderUnmatchedRPCResponseDoesNotUnblockPendingRequest
+
无响应Axon 不响应时,Keystone HTTP 超时且不改 DB
- 模拟
- fake Axon 收到命令但不回 rpc_response。
- 期望
- HTTP 504,DB 不做成功态变更。
- 防止
- 设备是否执行未知时,后台提前写成功。
TestRecorderConfigTimeoutAndDisconnectedKeepTaskPending
TestRecorderBeginSuccessTimeoutAndDisconnectedTaskState
TestRecorderGetStatsContracts
+
+
+
+
+ 怎么读这些测试
实用说明
+
+
如果某个测试失败,不应该先改测试去“适配现状”。这些测试是用来暴露潜在问题的:例如任务被错误推进、断线后被重复 config、旧连接消息污染新连接、或者 Axon 超时后 Keystone 错误解锁。正确处理方式是先判断测试描述的业务预期是否成立;如果成立,就修实现。
+
这份 HTML 是给人看的说明;机器执行的测试仍以 Go 测试文件为准。详细测试清单也保留在 recorder-axon-interaction-test-plan.md。
+
+
+
+
+
+
+
diff --git a/internal/api/handlers/axon_rpc.go b/internal/api/handlers/axon_rpc.go
index 7f9ccdd..68f34fe 100644
--- a/internal/api/handlers/axon_rpc.go
+++ b/internal/api/handlers/axon_rpc.go
@@ -10,7 +10,9 @@ import (
"database/sql"
"encoding/json"
"errors"
+ "fmt"
"net/http"
+ "strconv"
"strings"
"time"
@@ -25,9 +27,11 @@ import (
// RecorderHandler handles REST and WebSocket traffic for Axon Recorder RPC.
type RecorderHandler struct {
- hub *services.RecorderHub
- cfg *config.RecorderConfig
- db *sqlx.DB
+ hub *services.RecorderHub
+ transferHub *services.TransferHub
+ stateBroker *services.DeviceStateBroker
+ cfg *config.RecorderConfig
+ db *sqlx.DB
}
// NewRecorderHandler creates a new RecorderHandler.
@@ -35,6 +39,15 @@ func NewRecorderHandler(hub *services.RecorderHub, cfg *config.RecorderConfig, d
return &RecorderHandler{hub: hub, cfg: cfg, db: db}
}
+// SetDeviceStateDeps enables device connection/state event publishing.
+func (h *RecorderHandler) SetDeviceStateDeps(transferHub *services.TransferHub, broker *services.DeviceStateBroker) {
+ if h == nil {
+ return
+ }
+ h.transferHub = transferHub
+ h.stateBroker = broker
+}
+
// ConfigRequest represents the request body for config RPC.
// @Description Request body for recorder config
type ConfigRequest struct {
@@ -129,7 +142,8 @@ func (h *RecorderHandler) HandleWebSocket(w http.ResponseWriter, r *http.Request
// Validate device exists in robots table (if DB is configured)
if h.db != nil {
// Add independent 5s timeout to avoid blocking on slow DB queries
- queryCtx, cancel := context.WithTimeout(r.Context(), 5*time.Second)
+ queryTimeout := 5 * time.Second
+ queryCtx, cancel := context.WithTimeout(r.Context(), queryTimeout)
defer cancel()
var count int
@@ -137,7 +151,11 @@ func (h *RecorderHandler) HandleWebSocket(w http.ResponseWriter, r *http.Request
if err := h.db.GetContext(queryCtx, &count,
"SELECT COUNT(1) FROM robots WHERE device_id = ? AND deleted_at IS NULL", deviceID,
); err != nil {
- logger.Printf("[RECORDER] Device %s: DB query error: %v", deviceID, err)
+ if errors.Is(err, context.DeadlineExceeded) || errors.Is(queryCtx.Err(), context.DeadlineExceeded) {
+ logger.Printf("[RECORDER] Device %s: DB query timeout after %s (timeout_ms=%d): %v", deviceID, timeoutLogValue(queryTimeout), timeoutLogMilliseconds(queryTimeout), err)
+ } else {
+ logger.Printf("[RECORDER] Device %s: DB query error: %v", deviceID, err)
+ }
}
// Check count regardless of DB error (count defaults to 0 on error)
if count == 0 {
@@ -156,17 +174,9 @@ func (h *RecorderHandler) HandleWebSocket(w http.ResponseWriter, r *http.Request
remoteIP := extractIP(r.RemoteAddr)
rc := h.hub.NewRecorderConn(conn, deviceID, remoteIP)
- staleConn, ok := h.hub.ConnectWithStaleThreshold(deviceID, rc, recorderStaleThreshold(h.cfg))
- if !ok {
- logger.Printf("[RECORDER] Device %s: connection rejected (already connected)", deviceID)
- if err := conn.Close(websocket.StatusPolicyViolation, "device already connected"); err != nil {
- if !isExpectedWebSocketCloseError(err) {
- logger.Printf("[RECORDER] Device %s: WebSocket close error: %v", deviceID, err)
- }
- }
- return
- }
- closeStaleRecorderConn(deviceID, staleConn)
+ replacedConn := h.hub.ConnectReplacingExisting(deviceID, rc)
+ closeReplacedRecorderConn(deviceID, replacedConn)
+ publishDeviceConnectionEvent(h.stateBroker, h.hub, h.transferHub, deviceID, "recorder_connected")
defer func() {
if err := conn.Close(websocket.StatusNormalClosure, ""); err != nil {
@@ -177,6 +187,7 @@ func (h *RecorderHandler) HandleWebSocket(w http.ResponseWriter, r *http.Request
}()
defer func() {
if h.hub.Disconnect(deviceID, rc) {
+ publishDeviceConnectionEvent(h.stateBroker, h.hub, h.transferHub, deviceID, "recorder_disconnected")
revertRunnableTasksOnDeviceDisconnect(h.db, deviceID, nil, 0, false)
}
}()
@@ -186,6 +197,7 @@ func (h *RecorderHandler) HandleWebSocket(w http.ResponseWriter, r *http.Request
// #nosec G706 -- Set aside for now
logger.Printf("[RECORDER] Recorder %s connected from %s", deviceID, remoteIP)
+ go h.syncRecorderStateFromDevice(ctx, rc)
for {
_, raw, err := conn.Read(ctx)
@@ -236,6 +248,10 @@ func (h *RecorderHandler) Config(c *gin.Context) {
}
}
+ if !h.requireRecorderReadyForConfig(c) {
+ return
+ }
+
if !h.callRPC(c, "config", params) {
return
}
@@ -308,6 +324,7 @@ func (h *RecorderHandler) Finish(c *gin.Context) {
if !ok {
return
}
+ params = h.fillRecorderTaskIDFromCache(c.Param("device_id"), params)
h.callRPC(c, "finish", params)
}
@@ -493,12 +510,14 @@ func (h *RecorderHandler) GetStats(c *gin.Context) {
return
}
- response, err := h.hub.SendRPC(c.Request.Context(), deviceID, "get_stats", nil, time.Duration(h.cfg.ResponseTimeout)*time.Second)
+ timeout := recorderRPCResponseTimeout(h.cfg)
+ response, err := h.hub.SendRPC(c.Request.Context(), deviceID, "get_stats", nil, timeout)
if err != nil {
switch {
case errors.Is(err, services.ErrRecorderNotConnected):
c.JSON(http.StatusOK, disconnectedRecorderStatsResponse())
case errors.Is(err, services.ErrRecorderRPCTimeout):
+ logRecorderRPCTimeout(deviceID, "get_stats", "", "stats", timeout, err)
c.JSON(http.StatusGatewayTimeout, gin.H{"error": err.Error()})
default:
c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
@@ -523,28 +542,303 @@ func disconnectedRecorderStatsResponse() gin.H {
}
}
+const defaultRecorderFreshMaxAge = time.Second
+
+func isTruthyQuery(value string) bool {
+ value = strings.ToLower(strings.TrimSpace(value))
+ return value == "1" || value == "true" || value == "yes"
+}
+
+func recorderRefreshMaxAge(c *gin.Context) time.Duration {
+ value := strings.TrimSpace(c.Query("max_age_ms"))
+ if value == "" {
+ return defaultRecorderFreshMaxAge
+ }
+ ms, err := strconv.Atoi(value)
+ if err != nil || ms < 0 {
+ return defaultRecorderFreshMaxAge
+ }
+ return time.Duration(ms) * time.Millisecond
+}
+
+func recorderStateAge(state services.RecorderState) time.Duration {
+ if state.UpdatedAt.IsZero() {
+ return time.Duration(1<<63 - 1)
+ }
+ age := time.Since(state.UpdatedAt)
+ if age < 0 {
+ return 0
+ }
+ return age
+}
+
+func recorderStateAgeMS(state services.RecorderState) int64 {
+ age := recorderStateAge(state)
+ if age == time.Duration(1<<63-1) {
+ return 0
+ }
+ return age.Milliseconds()
+}
+
+func (h *RecorderHandler) disconnectedRecorderStateResponse(deviceID string) gin.H {
+ return gin.H{
+ "connected": false,
+ "current_state": "unknown",
+ "previous_state": "",
+ "task_id": "",
+ "updated_at": time.Now().UTC(),
+ "state_synced": false,
+ "syncing": false,
+ "fresh": false,
+ "age_ms": 0,
+ "source": "disconnected",
+ "state_version": h.recorderStateVersion(deviceID),
+ }
+}
+
+func (h *RecorderHandler) recorderStateResponse(deviceID string, rc *services.RecorderConn, fresh bool) gin.H {
+ st := rc.GetState()
+ synced := rc.IsStateSynced()
+ return gin.H{
+ "connected": true,
+ "current_state": st.CurrentState,
+ "previous_state": recorderPreviousStateFromRaw(st.Raw),
+ "task_id": st.TaskID,
+ "updated_at": st.UpdatedAt,
+ "state_synced": synced,
+ "syncing": !synced,
+ "fresh": fresh,
+ "age_ms": recorderStateAgeMS(st),
+ "source": strings.TrimSpace(st.Source),
+ "state_version": h.recorderStateVersion(deviceID),
+ }
+}
+
+func (h *RecorderHandler) recorderStateVersion(deviceID string) uint64 {
+ if h == nil || h.stateBroker == nil {
+ return 0
+ }
+ return h.stateBroker.CurrentVersion(strings.TrimSpace(deviceID))
+}
+
+func (h *RecorderHandler) refreshRecorderState(ctx context.Context, deviceID string, rc *services.RecorderConn, maxAge time.Duration) (services.RecorderState, bool, error) {
+ if h == nil || h.hub == nil || rc == nil || strings.TrimSpace(deviceID) == "" {
+ return services.RecorderState{}, false, services.ErrRecorderNotConnected
+ }
+ if h.hub.Get(deviceID) != rc {
+ return services.RecorderState{}, false, services.ErrRecorderNotConnected
+ }
+
+ state := rc.GetState()
+ if maxAge >= 0 && rc.IsStateSynced() && recorderStateAge(state) <= maxAge {
+ return state, true, nil
+ }
+
+ timeout := recorderRPCResponseTimeout(h.cfg)
+ response, err := h.hub.SendRPC(ctx, deviceID, "get_state", nil, timeout)
+ if err != nil {
+ if errors.Is(err, services.ErrRecorderRPCTimeout) {
+ logRecorderRPCTimeout(deviceID, "get_state", "", "state_refresh", timeout, err)
+ }
+ h.markRecorderSyncing(rc, "get_state", err)
+ return rc.GetState(), false, err
+ }
+ if response == nil || !response.Success {
+ message := "recorder get_state returned unsuccessful response"
+ if response != nil && strings.TrimSpace(response.Message) != "" {
+ message = strings.TrimSpace(response.Message)
+ }
+ err := errors.New(message)
+ h.markRecorderSyncing(rc, "get_state", err)
+ return rc.GetState(), false, err
+ }
+ if h.hub.Get(deviceID) != rc {
+ return services.RecorderState{}, false, services.ErrRecorderNotConnected
+ }
+
+ state = recorderStateFromRPCData(response.Data)
+ if strings.TrimSpace(state.CurrentState) == "" {
+ err := errors.New("recorder get_state returned empty state")
+ h.markRecorderSyncing(rc, "get_state", err)
+ return rc.GetState(), false, err
+ }
+ if err := h.applyRecorderStateSnapshot(rc, state, "get_state"); err != nil {
+ return rc.GetState(), false, err
+ }
+ return rc.GetState(), true, nil
+}
+
+func (h *RecorderHandler) markRecorderSyncing(rc *services.RecorderConn, source string, syncErr error) {
+ if h == nil || h.hub == nil || rc == nil || h.hub.Get(rc.DeviceID) != rc {
+ return
+ }
+ rc.MarkStateSyncing(source, syncErr)
+ st := rc.GetState()
+ if h.stateBroker != nil {
+ ev := recorderStateEvent(st, "recorder_syncing", source, false)
+ ev["state_synced"] = false
+ ev["syncing"] = true
+ if syncErr != nil {
+ ev["error"] = syncErr.Error()
+ }
+ h.stateBroker.Publish(rc.DeviceID, ev)
+ }
+}
+
+func (h *RecorderHandler) publishRecorderStateSnapshot(rc *services.RecorderConn, state services.RecorderState, source string, fresh bool) {
+ if h == nil || h.stateBroker == nil || rc == nil {
+ return
+ }
+ h.stateBroker.Publish(rc.DeviceID, recorderStateEvent(state, "recorder_state", source, fresh))
+}
+
+func recorderStateEvent(state services.RecorderState, eventType string, source string, fresh bool) services.DeviceStateEvent {
+ synced := strings.TrimSpace(state.CurrentState) != ""
+ return services.DeviceStateEvent{
+ "type": eventType,
+ "current_state": state.CurrentState,
+ "previous_state": recorderPreviousStateFromRaw(state.Raw),
+ "task_id": state.TaskID,
+ "state_synced": synced,
+ "syncing": !synced,
+ "fresh": fresh,
+ "age_ms": recorderStateAgeMS(state),
+ "source": strings.TrimSpace(source),
+ }
+}
+
+func (h *RecorderHandler) handleRecorderRPCTimeout(ctx context.Context, deviceID string, action string, params map[string]interface{}, timeout time.Duration, timeoutErr error) {
+ taskID := firstNonEmptyString(params, "task_id")
+ logRecorderRPCTimeout(deviceID, action, taskID, "api_rpc", timeout, timeoutErr)
+ if rc := h.hub.Get(deviceID); rc != nil {
+ h.markRecorderSyncing(rc, "rpc_timeout:"+action, timeoutErr)
+ syncTimeout := timeout
+ if syncTimeout <= 0 {
+ syncTimeout = recorderRPCResponseTimeout(h.cfg)
+ }
+ syncCtx, cancel := context.WithTimeout(context.WithoutCancel(ctx), syncTimeout)
+ go func() {
+ defer cancel()
+ h.syncRecorderStateFromDevice(syncCtx, rc)
+ }()
+ }
+ if h.stateBroker == nil {
+ return
+ }
+ h.stateBroker.Publish(deviceID, services.DeviceStateEvent{
+ "type": "recorder_operation_unknown",
+ "action": strings.TrimSpace(action),
+ "task_id": taskID,
+ "result_unknown": true,
+ "source": "rpc_timeout:" + strings.TrimSpace(action),
+ "error": timeoutErr.Error(),
+ "timeout": timeoutLogValue(timeout),
+ "timeout_ms": timeoutLogMilliseconds(timeout),
+ })
+}
+
+func (h *RecorderHandler) requireRecorderReadyForConfig(c *gin.Context) bool {
+ deviceID := strings.TrimSpace(c.Param("device_id"))
+ if deviceID == "" {
+ c.JSON(http.StatusBadRequest, gin.H{"error": "device_id is required"})
+ return false
+ }
+ if h == nil || h.hub == nil {
+ c.JSON(http.StatusInternalServerError, gin.H{"error": "recorder hub is not configured"})
+ return false
+ }
+
+ if h.transferHub != nil && h.transferHub.Get(deviceID) == nil {
+ c.JSON(http.StatusConflict, gin.H{
+ "code": "transfer_not_connected",
+ "error": "transfer is not connected; config is rejected",
+ })
+ return false
+ }
+
+ rc := h.hub.Get(deviceID)
+ if rc == nil {
+ c.JSON(http.StatusNotFound, gin.H{"error": services.ErrRecorderNotConnected.Error()})
+ return false
+ }
+ if !rc.IsStateSynced() {
+ c.JSON(http.StatusConflict, gin.H{
+ "code": "recorder_state_syncing",
+ "error": "recorder state is syncing; retry after initial state snapshot",
+ })
+ return false
+ }
+
+ state := rc.GetState()
+ if recorderStateAge(state) > defaultRecorderFreshMaxAge {
+ refreshed, _, err := h.refreshRecorderState(c.Request.Context(), deviceID, rc, -1)
+ if err != nil {
+ status := http.StatusConflict
+ if errors.Is(err, services.ErrRecorderRPCTimeout) {
+ status = http.StatusGatewayTimeout
+ }
+ out := h.recorderStateResponse(deviceID, rc, false)
+ out["code"] = "recorder_state_refresh_failed"
+ out["error"] = err.Error()
+ c.JSON(status, out)
+ return false
+ }
+ state = refreshed
+ }
+
+ current := strings.ToLower(strings.TrimSpace(state.CurrentState))
+ if current != "idle" {
+ c.JSON(http.StatusConflict, gin.H{
+ "code": "recorder_busy",
+ "error": "recorder is not idle; config is rejected",
+ "current_state": state.CurrentState,
+ "task_id": state.TaskID,
+ })
+ return false
+ }
+ return true
+}
+
func (h *RecorderHandler) callRPC(c *gin.Context, action string, params map[string]interface{}) bool {
- deviceID := c.Param("device_id")
+ deviceID := strings.TrimSpace(c.Param("device_id"))
if deviceID == "" {
c.JSON(http.StatusBadRequest, gin.H{"error": "device_id is required"})
return false
}
- response, err := h.hub.SendRPC(c.Request.Context(), deviceID, action, params, time.Duration(h.cfg.ResponseTimeout)*time.Second)
+ timeout := recorderRPCResponseTimeout(h.cfg)
+ response, err := h.hub.SendRPC(c.Request.Context(), deviceID, action, params, timeout)
if err != nil {
switch {
case errors.Is(err, services.ErrRecorderNotConnected):
c.JSON(http.StatusNotFound, gin.H{"error": err.Error()})
case errors.Is(err, services.ErrRecorderRPCTimeout):
- c.JSON(http.StatusGatewayTimeout, gin.H{"error": err.Error()})
+ h.handleRecorderRPCTimeout(c.Request.Context(), deviceID, action, params, timeout, err)
+ c.JSON(http.StatusGatewayTimeout, gin.H{
+ "error": err.Error(),
+ "action": action,
+ "result_unknown": true,
+ })
default:
c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
}
return false
}
+ if response != nil && response.Success {
+ state := recorderStateFromRPCData(response.Data)
+ if strings.TrimSpace(state.TaskID) == "" && recorderStateKeepsTaskID(state.CurrentState) {
+ state.TaskID = recorderTaskIDFromRPCParams(params)
+ }
+ if strings.TrimSpace(state.CurrentState) != "" {
+ if rc := h.hub.Get(deviceID); rc != nil {
+ _ = h.applyRecorderStateSnapshot(rc, state, "rpc_response:"+action)
+ }
+ }
+ }
+
c.JSON(http.StatusOK, response)
- return true
+ return response != nil && response.Success
}
func (h *RecorderHandler) bindOptionalParams(c *gin.Context) (map[string]interface{}, bool) {
@@ -560,6 +854,33 @@ func (h *RecorderHandler) bindOptionalParams(c *gin.Context) (map[string]interfa
return params, true
}
+func recorderTaskIDFromRPCParams(params map[string]interface{}) string {
+ taskID := firstNonEmptyString(params, "task_id")
+ if taskID != "" {
+ return taskID
+ }
+ return stringValue(mapValue(params, "task_config"), "task_id")
+}
+
+func (h *RecorderHandler) fillRecorderTaskIDFromCache(deviceID string, params map[string]interface{}) map[string]interface{} {
+ if recorderTaskIDFromRPCParams(params) != "" || h == nil || h.hub == nil {
+ return params
+ }
+ rc := h.hub.Get(strings.TrimSpace(deviceID))
+ if rc == nil {
+ return params
+ }
+ taskID := strings.TrimSpace(rc.GetState().TaskID)
+ if taskID == "" {
+ return params
+ }
+ if params == nil {
+ params = map[string]interface{}{}
+ }
+ params["task_id"] = taskID
+ return params
+}
+
// ListDevices returns all currently connected recorder devices.
//
// @Summary List connected recorders
@@ -584,27 +905,46 @@ func (h *RecorderHandler) ListDevices(c *gin.Context) {
// @Success 200 {object} map[string]interface{} "connected=false when recorder WS is not active"
// @Router /recorder/{device_id}/state [get]
func (h *RecorderHandler) GetState(c *gin.Context) {
- deviceID := c.Param("device_id")
+ deviceID := strings.TrimSpace(c.Param("device_id"))
+ if deviceID == "" {
+ c.JSON(http.StatusBadRequest, gin.H{"error": "device_id is required"})
+ return
+ }
+ if h == nil || h.hub == nil {
+ c.JSON(http.StatusInternalServerError, gin.H{"error": "recorder hub is not configured"})
+ return
+ }
+
rc := h.hub.Get(deviceID)
if rc == nil {
- c.JSON(http.StatusOK, gin.H{
- "connected": false,
- "current_state": "unknown",
- "previous_state": "",
- "task_id": "",
- "updated_at": time.Now().UTC(),
- })
+ c.JSON(http.StatusOK, h.disconnectedRecorderStateResponse(deviceID))
return
}
- st := rc.GetState()
- prev := recorderPreviousStateFromRaw(st.Raw)
- c.JSON(http.StatusOK, gin.H{
- "connected": true,
- "current_state": st.CurrentState,
- "previous_state": prev,
- "task_id": st.TaskID,
- "updated_at": st.UpdatedAt,
- })
+
+ fresh := false
+ if isTruthyQuery(c.Query("refresh")) {
+ var err error
+ _, fresh, err = h.refreshRecorderState(c.Request.Context(), deviceID, rc, recorderRefreshMaxAge(c))
+ if err != nil {
+ if errors.Is(err, services.ErrRecorderNotConnected) {
+ c.JSON(http.StatusOK, h.disconnectedRecorderStateResponse(deviceID))
+ return
+ }
+ status := http.StatusConflict
+ if errors.Is(err, services.ErrRecorderRPCTimeout) {
+ status = http.StatusGatewayTimeout
+ }
+ out := h.recorderStateResponse(deviceID, rc, false)
+ out["error"] = err.Error()
+ if errors.Is(err, services.ErrRecorderRPCTimeout) {
+ out["result_unknown"] = true
+ }
+ c.JSON(status, out)
+ return
+ }
+ }
+
+ c.JSON(http.StatusOK, h.recorderStateResponse(deviceID, rc, fresh))
}
func recorderPreviousStateFromRaw(raw map[string]interface{}) string {
@@ -616,6 +956,11 @@ func recorderPreviousStateFromRaw(raw map[string]interface{}) string {
}
func (h *RecorderHandler) handleMessage(deviceID string, rc *services.RecorderConn, msg map[string]interface{}) {
+ if h.hub.Get(deviceID) != rc {
+ logger.Printf("[RECORDER] Recorder %s ignored message from replaced connection", deviceID)
+ return
+ }
+
msgType, _ := msg["type"].(string)
switch msgType {
case "rpc_response":
@@ -657,13 +1002,106 @@ func (h *RecorderHandler) handleStateUpdate(rc *services.RecorderConn, msg map[s
TaskID: stringValue(data, "task_id"),
Raw: data,
}
+ _ = h.applyRecorderStateSnapshot(rc, state, "state_update")
+}
+
+func (h *RecorderHandler) syncRecorderStateFromDevice(ctx context.Context, rc *services.RecorderConn) {
+ if h == nil || h.hub == nil || rc == nil || strings.TrimSpace(rc.DeviceID) == "" {
+ return
+ }
+ if h.hub.Get(rc.DeviceID) != rc {
+ return
+ }
+
+ if _, _, err := h.refreshRecorderState(ctx, rc.DeviceID, rc, -1); err != nil {
+ if !errors.Is(err, services.ErrRecorderNotConnected) && !errors.Is(err, context.Canceled) {
+ logger.Printf("[RECORDER] Recorder %s get_state after connect failed: %v", rc.DeviceID, err)
+ }
+ }
+}
+
+func (h *RecorderHandler) applyRecorderStateSnapshot(rc *services.RecorderConn, state services.RecorderState, source string) error {
+ if rc == nil {
+ return nil
+ }
+ state.Source = strings.TrimSpace(source)
+ state.TaskID = strings.TrimSpace(state.TaskID)
+ if !recorderStateKeepsTaskID(state.CurrentState) {
+ state.TaskID = ""
+ }
+ if state.TaskID == "" && recorderStateCanInheritTaskID(state.CurrentState, state.Source) {
+ state.TaskID = strings.TrimSpace(rc.GetState().TaskID)
+ }
+ if state.TaskID == "" && recorderStateRequiresTaskID(state.CurrentState, state.Source) {
+ err := fmt.Errorf("recorder %s snapshot missing task_id for state=%s", state.Source, strings.TrimSpace(state.CurrentState))
+ h.markRecorderSyncing(rc, state.Source, err)
+ logger.Printf("[RECORDER] Recorder %s ignored %s state=%s without task_id", rc.DeviceID, state.Source, state.CurrentState)
+ return err
+ }
rc.UpdateState(state)
- h.reconcileRecorderTaskState(rc.DeviceID, state)
- // #nosec G706 -- Set aside for now
- logger.Printf("[RECORDER] Recorder %s state=%s task=%s", rc.DeviceID, state.CurrentState, state.TaskID)
+ st := rc.GetState()
+ h.reconcileRecorderTaskState(rc.DeviceID, st, source)
+ h.publishRecorderStateSnapshot(rc, st, source, true)
+ if source == "state_update" {
+ // #nosec G706 -- Set aside for now
+ logger.Printf("[RECORDER] Recorder %s state=%s task=%s", rc.DeviceID, st.CurrentState, st.TaskID)
+ return nil
+ }
+ logger.Printf("[RECORDER] Recorder %s state=%s task=%s source=%s", rc.DeviceID, st.CurrentState, st.TaskID, source)
+ return nil
+}
+
+func recorderStateFromRPCData(data map[string]interface{}) services.RecorderState {
+ state := services.RecorderState{
+ CurrentState: firstNonEmptyString(data, "state", "current", "current_state"),
+ TaskID: firstNonEmptyString(data, "task_id"),
+ Raw: data,
+ }
+ if strings.TrimSpace(state.TaskID) == "" {
+ state.TaskID = stringValue(mapValue(data, "task_config"), "task_id")
+ }
+ return state
+}
+
+func recorderStateKeepsTaskID(state string) bool {
+ switch strings.ToLower(strings.TrimSpace(state)) {
+ case "ready", "recording", "paused":
+ return true
+ default:
+ return false
+ }
+}
+
+func recorderStateCanInheritTaskID(state string, source string) bool {
+ switch strings.TrimSpace(source) {
+ case "rpc_response:pause":
+ return strings.EqualFold(strings.TrimSpace(state), "paused")
+ case "rpc_response:resume":
+ return strings.EqualFold(strings.TrimSpace(state), "recording")
+ default:
+ return false
+ }
}
-func (h *RecorderHandler) reconcileRecorderTaskState(deviceID string, state services.RecorderState) {
+func recorderStateRequiresTaskID(state string, source string) bool {
+ switch strings.TrimSpace(source) {
+ case "get_state", "state_update":
+ return recorderStateKeepsTaskID(state)
+ default:
+ return false
+ }
+}
+
+func firstNonEmptyString(m map[string]interface{}, keys ...string) string {
+ for _, key := range keys {
+ if value := strings.TrimSpace(stringValue(m, key)); value != "" {
+ return value
+ }
+ }
+ return ""
+}
+
+func (h *RecorderHandler) reconcileRecorderTaskState(deviceID string, state services.RecorderState, source string) {
if h.db == nil {
return
}
@@ -672,17 +1110,21 @@ func (h *RecorderHandler) reconcileRecorderTaskState(deviceID string, state serv
return
}
- switch strings.TrimSpace(state.CurrentState) {
+ if strings.TrimSpace(source) == "" {
+ source = "state_update"
+ }
+ currentState := strings.ToLower(strings.TrimSpace(state.CurrentState))
+ switch currentState {
case "ready":
- advanceTaskPendingToReady(h.db, deviceID, taskID, "state_update ready")
- case "recording":
+ advanceTaskPendingToReady(h.db, deviceID, taskID, source+" ready")
+ case "recording", "paused":
res, err := advanceTaskPendingOrReadyToInProgress(h.db, taskID)
if err != nil {
- logger.Printf("[RECORDER] Recorder %s: failed to advance task pending/ready->in_progress after state_update recording: task=%s err=%v", deviceID, taskID, err)
+ logger.Printf("[RECORDER] Recorder %s: failed to advance task pending/ready->in_progress after %s %s: task=%s err=%v", deviceID, source, currentState, taskID, err)
return
}
if n, _ := res.RowsAffected(); n > 0 {
- logger.Printf("[RECORDER] Device %s: task status reconciled: task=%s source=state_update_recording status=in_progress", deviceID, taskID)
+ logger.Printf("[RECORDER] Device %s: task status reconciled: task=%s source=%s_%s status=in_progress", deviceID, taskID, source, currentState)
}
}
}
@@ -775,10 +1217,11 @@ func (h *RecorderHandler) pingLoop(ctx context.Context, rc *services.RecorderCon
case <-ticker.C:
pingCtx, cancel := context.WithTimeout(ctx, timeout)
err := rc.Conn.Ping(pingCtx)
+ timedOut := errors.Is(err, context.DeadlineExceeded) || errors.Is(pingCtx.Err(), context.DeadlineExceeded)
cancel()
if err != nil {
if ctx.Err() == nil {
- logger.Printf("[RECORDER] Recorder %s ping failed: %v", rc.DeviceID, err)
+ logWebSocketPingFailure("RECORDER", "Recorder", rc.DeviceID, timeout, timedOut, err)
if closeErr := rc.Conn.CloseNow(); closeErr != nil {
if !isExpectedWebSocketCloseError(closeErr) {
logger.Printf("[RECORDER] Recorder %s close after ping failure: %v", rc.DeviceID, closeErr)
@@ -794,6 +1237,56 @@ func (h *RecorderHandler) pingLoop(ctx context.Context, rc *services.RecorderCon
}
}
+func recorderRPCResponseTimeout(cfg *config.RecorderConfig) time.Duration {
+ if cfg == nil || cfg.ResponseTimeout <= 0 {
+ return 15 * time.Second
+ }
+ return time.Duration(cfg.ResponseTimeout) * time.Second
+}
+
+func timeoutLogValue(timeout time.Duration) string {
+ if timeout <= 0 {
+ return "unknown"
+ }
+ return timeout.String()
+}
+
+func timeoutLogMilliseconds(timeout time.Duration) int64 {
+ if timeout <= 0 {
+ return 0
+ }
+ return timeout.Milliseconds()
+}
+
+func logRecorderRPCTimeout(deviceID, action, taskID, source string, timeout time.Duration, err error) {
+ deviceID = strings.TrimSpace(deviceID)
+ action = strings.TrimSpace(action)
+ if action == "" {
+ action = "unknown"
+ }
+ source = strings.TrimSpace(source)
+ if source == "" {
+ source = "unknown"
+ }
+ taskID = strings.TrimSpace(taskID)
+ if taskID != "" {
+ logger.Printf("[RECORDER] Recorder %s RPC timeout after %s (timeout_ms=%d): action=%s task=%s source=%s err=%v", deviceID, timeoutLogValue(timeout), timeoutLogMilliseconds(timeout), action, taskID, source, err)
+ return
+ }
+ logger.Printf("[RECORDER] Recorder %s RPC timeout after %s (timeout_ms=%d): action=%s source=%s err=%v", deviceID, timeoutLogValue(timeout), timeoutLogMilliseconds(timeout), action, source, err)
+}
+
+func logWebSocketPingFailure(component, label, deviceID string, timeout time.Duration, timedOut bool, err error) {
+ component = strings.TrimSpace(component)
+ label = strings.TrimSpace(label)
+ deviceID = strings.TrimSpace(deviceID)
+ if timedOut {
+ logger.Printf("[%s] %s %s ping timeout after %s (timeout_ms=%d): %v", component, label, deviceID, timeoutLogValue(timeout), timeoutLogMilliseconds(timeout), err)
+ return
+ }
+ logger.Printf("[%s] %s %s ping failed (timeout=%s timeout_ms=%d): %v", component, label, deviceID, timeoutLogValue(timeout), timeoutLogMilliseconds(timeout), err)
+}
+
func recorderPingInterval(cfg *config.RecorderConfig) time.Duration {
if cfg == nil || cfg.PingInterval <= 0 {
return 0
@@ -808,21 +1301,14 @@ func recorderPingTimeout(cfg *config.RecorderConfig) time.Duration {
return time.Duration(cfg.PingTimeout) * time.Second
}
-func recorderStaleThreshold(cfg *config.RecorderConfig) time.Duration {
- if cfg == nil || cfg.StaleThreshold <= 0 {
- return 0
- }
- return time.Duration(cfg.StaleThreshold) * time.Second
-}
-
-func closeStaleRecorderConn(deviceID string, rc *services.RecorderConn) {
+func closeReplacedRecorderConn(deviceID string, rc *services.RecorderConn) {
if rc == nil || rc.Conn == nil {
return
}
- logger.Printf("[RECORDER] Device %s: closing stale WebSocket connection", deviceID)
+ logger.Printf("[RECORDER] Device %s: closing replaced WebSocket connection", deviceID)
if err := rc.Conn.CloseNow(); err != nil {
if !isExpectedWebSocketCloseError(err) {
- logger.Printf("[RECORDER] Device %s: stale WebSocket close error: %v", deviceID, err)
+ logger.Printf("[RECORDER] Device %s: replaced WebSocket close error: %v", deviceID, err)
}
}
}
diff --git a/internal/api/handlers/batch.go b/internal/api/handlers/batch.go
index de1d670..39c95b2 100644
--- a/internal/api/handlers/batch.go
+++ b/internal/api/handlers/batch.go
@@ -8,6 +8,7 @@ import (
"context"
"database/sql"
"encoding/json"
+ "errors"
"fmt"
"io"
"net/http"
@@ -1157,7 +1158,15 @@ func notifyRecorderCancelTasksWithHub(ctx context.Context, hub *services.Recorde
continue
}
if err != nil {
- logger.Printf("[BATCH] Batch %d: failed to notify recorder (status=%s): device=%s task=%s err=%v", batchID, st, deviceID, taskID, err)
+ action := "cancel"
+ if st == "ready" {
+ action = "clear"
+ }
+ if errors.Is(err, services.ErrRecorderRPCTimeout) {
+ logRecorderRPCTimeout(deviceID, action, taskID, "batch_cancel", timeout, err)
+ } else {
+ logger.Printf("[BATCH] Batch %d: failed to notify recorder (status=%s): device=%s task=%s err=%v", batchID, st, deviceID, taskID, err)
+ }
}
}
}
diff --git a/internal/api/handlers/device_state_stream.go b/internal/api/handlers/device_state_stream.go
new file mode 100644
index 0000000..25ced99
--- /dev/null
+++ b/internal/api/handlers/device_state_stream.go
@@ -0,0 +1,159 @@
+// SPDX-FileCopyrightText: 2026 ArcheBase
+//
+// SPDX-License-Identifier: MulanPSL-2.0
+
+package handlers
+
+import (
+ "encoding/json"
+ "fmt"
+ "net/http"
+ "strings"
+ "time"
+
+ "github.com/gin-gonic/gin"
+
+ "archebase.com/keystone-edge/internal/services"
+)
+
+// DeviceStateHandler streams recorder state and recorder/transfer connection events.
+type DeviceStateHandler struct {
+ broker *services.DeviceStateBroker
+ recorderHub *services.RecorderHub
+ transferHub *services.TransferHub
+}
+
+// NewDeviceStateHandler creates a DeviceStateHandler.
+func NewDeviceStateHandler(broker *services.DeviceStateBroker, recorderHub *services.RecorderHub, transferHub *services.TransferHub) *DeviceStateHandler {
+ return &DeviceStateHandler{broker: broker, recorderHub: recorderHub, transferHub: transferHub}
+}
+
+// RegisterRoutes registers device state stream routes.
+func (h *DeviceStateHandler) RegisterRoutes(apiV1 *gin.RouterGroup) {
+ apiV1.GET("/device-state/stream", h.Stream)
+}
+
+// Stream emits Server-Sent Events for device state changes.
+func (h *DeviceStateHandler) Stream(c *gin.Context) {
+ if h == nil || h.broker == nil {
+ c.JSON(http.StatusServiceUnavailable, gin.H{"error": "device state stream is not configured"})
+ return
+ }
+
+ deviceFilter := deviceStateStreamDeviceFilter(c)
+ w := c.Writer
+ w.Header().Set("Content-Type", "text/event-stream")
+ w.Header().Set("Cache-Control", "no-cache")
+ w.Header().Set("Connection", "keep-alive")
+ w.Header().Set("X-Accel-Buffering", "no")
+ w.WriteHeader(http.StatusOK)
+ w.Flush()
+
+ events, unsubscribe := h.broker.Subscribe(64)
+ defer unsubscribe()
+
+ heartbeat := time.NewTicker(15 * time.Second)
+ defer heartbeat.Stop()
+
+ for {
+ select {
+ case <-c.Request.Context().Done():
+ return
+ case ev, ok := <-events:
+ if !ok {
+ return
+ }
+ if !deviceStateEventMatchesFilter(ev, deviceFilter) {
+ continue
+ }
+ if err := writeDeviceStateSSE(w, ev); err != nil {
+ return
+ }
+ case <-heartbeat.C:
+ if _, err := fmt.Fprint(w, ": keepalive\n\n"); err != nil {
+ return
+ }
+ w.Flush()
+ }
+ }
+}
+
+func writeDeviceStateSSE(w gin.ResponseWriter, ev services.DeviceStateEvent) error {
+ payload, err := json.Marshal(ev)
+ if err != nil {
+ return err
+ }
+ eventType := strings.TrimSpace(stringFromEvent(ev, "type"))
+ if eventType == "" {
+ eventType = "device_state"
+ }
+ if version, ok := ev["state_version"]; ok {
+ if _, err := fmt.Fprintf(w, "id: %v\n", version); err != nil {
+ return err
+ }
+ }
+ if _, err := fmt.Fprintf(w, "event: %s\n", eventType); err != nil {
+ return err
+ }
+ if _, err := fmt.Fprintf(w, "data: %s\n\n", payload); err != nil {
+ return err
+ }
+ w.Flush()
+ return nil
+}
+
+func stringFromEvent(ev services.DeviceStateEvent, key string) string {
+ if ev == nil {
+ return ""
+ }
+ if s, ok := ev[key].(string); ok {
+ return s
+ }
+ return ""
+}
+
+func deviceStateStreamDeviceFilter(c *gin.Context) map[string]struct{} {
+ ids := c.QueryArray("device_id")
+ if csv := strings.TrimSpace(c.Query("device_ids")); csv != "" {
+ ids = append(ids, strings.Split(csv, ",")...)
+ }
+ filter := make(map[string]struct{}, len(ids))
+ for _, id := range ids {
+ id = strings.TrimSpace(id)
+ if id != "" {
+ filter[id] = struct{}{}
+ }
+ }
+ return filter
+}
+
+func deviceStateEventMatchesFilter(ev services.DeviceStateEvent, filter map[string]struct{}) bool {
+ if len(filter) == 0 {
+ return true
+ }
+ deviceID := strings.TrimSpace(stringFromEvent(ev, "device_id"))
+ _, ok := filter[deviceID]
+ return ok
+}
+
+func publishDeviceConnectionEvent(
+ broker *services.DeviceStateBroker,
+ recorderHub *services.RecorderHub,
+ transferHub *services.TransferHub,
+ deviceID string,
+ source string,
+) {
+ deviceID = strings.TrimSpace(deviceID)
+ if broker == nil || deviceID == "" {
+ return
+ }
+ recorderConnected := recorderHub != nil && recorderHub.Get(deviceID) != nil
+ transferConnected := transferHub != nil && transferHub.Get(deviceID) != nil
+ broker.Publish(deviceID, services.DeviceStateEvent{
+ "type": "device_connection",
+ "recorder_connected": recorderConnected,
+ "transfer_connected": transferConnected,
+ "connected": recorderConnected && transferConnected,
+ "source": strings.TrimSpace(source),
+ })
+}
diff --git a/internal/api/handlers/recorder_axon_interaction_test.go b/internal/api/handlers/recorder_axon_interaction_test.go
new file mode 100644
index 0000000..0d6cbfd
--- /dev/null
+++ b/internal/api/handlers/recorder_axon_interaction_test.go
@@ -0,0 +1,1676 @@
+// SPDX-FileCopyrightText: 2026 ArcheBase
+//
+// SPDX-License-Identifier: MulanPSL-2.0
+
+package handlers
+
+import (
+ "bytes"
+ "context"
+ "encoding/json"
+ "net/http"
+ "net/http/httptest"
+ "strings"
+ "testing"
+ "time"
+
+ "archebase.com/keystone-edge/internal/config"
+ "archebase.com/keystone-edge/internal/services"
+ "github.com/coder/websocket"
+ "github.com/coder/websocket/wsjson"
+ "github.com/gin-gonic/gin"
+ "github.com/jmoiron/sqlx"
+ _ "modernc.org/sqlite"
+)
+
+func TestRecorderStateSnapshotReconcilesAdditionalStates(t *testing.T) {
+ tests := []struct {
+ name string
+ initial string
+ state services.RecorderState
+ wantStatus string
+ wantColumn string
+ }{
+ {
+ name: "get_state recording with top-level task_id advances to in_progress",
+ initial: "pending",
+ state: recorderStateFromRPCData(map[string]interface{}{"state": "recording", "task_id": "task-state"}),
+ wantStatus: "in_progress",
+ wantColumn: "started_at",
+ },
+ {
+ name: "get_state paused with nested task_config advances to in_progress",
+ initial: "ready",
+ state: recorderStateFromRPCData(map[string]interface{}{"state": "paused", "task_config": map[string]interface{}{"task_id": "task-state"}}),
+ wantStatus: "in_progress",
+ wantColumn: "started_at",
+ },
+ {
+ name: "idle with task_id does not change pending task",
+ initial: "pending",
+ state: services.RecorderState{CurrentState: "idle", TaskID: "task-state"},
+ wantStatus: "pending",
+ },
+ {
+ name: "ready without task_id does not change pending task",
+ initial: "pending",
+ state: services.RecorderState{CurrentState: "ready"},
+ wantStatus: "pending",
+ },
+ }
+
+ for _, tt := range tests {
+ t.Run(tt.name, func(t *testing.T) {
+ db := newTaskStateRecoveryDB(t)
+ defer db.Close()
+ seedTaskStateRecoveryTask(t, db, "task-state", tt.initial)
+
+ hub := services.NewRecorderHub()
+ handler := NewRecorderHandler(hub, &config.RecorderConfig{}, db)
+ rc := hub.NewRecorderConn(nil, "robot-001", "127.0.0.1")
+
+ _ = handler.applyRecorderStateSnapshot(rc, tt.state, "get_state")
+
+ assertTaskStateRecoveryStatus(t, db, "task-state", tt.wantStatus)
+ if tt.wantColumn != "" {
+ assertTaskStateRecoveryTimestampSet(t, db, "task-state", tt.wantColumn)
+ }
+ })
+ }
+}
+
+func TestRecorderAuthoritativeNonIdleWithoutTaskIDKeepsUnsyncedWithoutDBChange(t *testing.T) {
+ db := newTaskStateRecoveryDB(t)
+ defer db.Close()
+ seedTaskStateRecoveryTask(t, db, "task-no-id", "pending")
+
+ hub := services.NewRecorderHub()
+ handler := NewRecorderHandler(hub, &config.RecorderConfig{}, db)
+ rc := hub.NewRecorderConn(nil, "robot-001", "127.0.0.1")
+ if !hub.Connect("robot-001", rc) {
+ t.Fatalf("connect recorder: initial connect failed")
+ }
+
+ if err := handler.applyRecorderStateSnapshot(rc, services.RecorderState{CurrentState: "ready"}, "state_update"); err == nil {
+ t.Fatalf("expected missing task_id error for authoritative state_update")
+ }
+
+ if rc.IsStateSynced() {
+ t.Fatalf("authoritative non-idle state without task_id marked connection synced")
+ }
+ assertTaskStateRecoveryStatus(t, db, "task-no-id", "pending")
+}
+
+func TestRecorderEmptyStateSnapshotKeepsConnectionUnsynced(t *testing.T) {
+ hub := services.NewRecorderHub()
+ handler := NewRecorderHandler(hub, &config.RecorderConfig{}, nil)
+ rc := hub.NewRecorderConn(nil, "robot-001", "127.0.0.1")
+
+ _ = handler.applyRecorderStateSnapshot(rc, services.RecorderState{}, "state_update")
+
+ if rc.IsStateSynced() {
+ t.Fatalf("empty state snapshot marked connection as synced")
+ }
+}
+
+func TestRecorderListDevicesIncludesStateSynced(t *testing.T) {
+ hub := services.NewRecorderHub()
+ unsynced := hub.NewRecorderConn(nil, "robot-unsynced", "127.0.0.1")
+ if !hub.Connect("robot-unsynced", unsynced) {
+ t.Fatalf("connect unsynced recorder failed")
+ }
+ synced := hub.NewRecorderConn(nil, "robot-synced", "127.0.0.1")
+ synced.UpdateState(services.RecorderState{CurrentState: "idle"})
+ if !hub.Connect("robot-synced", synced) {
+ t.Fatalf("connect synced recorder failed")
+ }
+ handler := NewRecorderHandler(hub, &config.RecorderConfig{}, nil)
+
+ gin.SetMode(gin.TestMode)
+ router := gin.New()
+ router.GET("/recorder/devices", handler.ListDevices)
+ w := httptest.NewRecorder()
+ router.ServeHTTP(w, httptest.NewRequest(http.MethodGet, "/recorder/devices", nil))
+
+ if w.Code != http.StatusOK {
+ t.Fatalf("status=%d want=%d body=%s", w.Code, http.StatusOK, w.Body.String())
+ }
+ var body struct {
+ Devices []struct {
+ DeviceID string `json:"device_id"`
+ StateSynced bool `json:"state_synced"`
+ } `json:"devices"`
+ }
+ if err := json.Unmarshal(w.Body.Bytes(), &body); err != nil {
+ t.Fatalf("decode list devices response: %v", err)
+ }
+ got := map[string]bool{}
+ for _, device := range body.Devices {
+ got[device.DeviceID] = device.StateSynced
+ }
+ if got["robot-unsynced"] {
+ t.Fatalf("unsynced recorder reported state_synced=true: %#v", body.Devices)
+ }
+ if !got["robot-synced"] {
+ t.Fatalf("synced recorder reported state_synced=false: %#v", body.Devices)
+ }
+}
+
+func TestRecorderConfigRejectsAdditionalBusyStates(t *testing.T) {
+ for _, state := range []string{"recording", "paused", "unknown", "maintenance"} {
+ t.Run(state, func(t *testing.T) {
+ db := newTaskStateRecoveryDB(t)
+ defer db.Close()
+ seedTaskStateRecoveryTask(t, db, "task-config-busy", "pending")
+
+ hub := services.NewRecorderHub()
+ rpcCalled := make(chan services.RPCRequest, 1)
+ rc := attachRecorderRPCResponderWithConn(t, hub, "robot-001", func(req services.RPCRequest) services.RPCResponse {
+ rpcCalled <- req
+ return services.RPCResponse{Success: true}
+ })
+ handler := NewRecorderHandler(hub, &config.RecorderConfig{ResponseTimeout: 1}, db)
+ _ = handler.applyRecorderStateSnapshot(rc, services.RecorderState{CurrentState: state, TaskID: "current-task"}, "state_update")
+
+ router := newRecorderInteractionRouter(handler)
+ w := recorderInteractionPost(t, router, "/recorder/robot-001/config", `{"task_config":{"task_id":"task-config-busy"}}`)
+
+ if w.Code != http.StatusConflict {
+ t.Fatalf("status=%d want=%d body=%s", w.Code, http.StatusConflict, w.Body.String())
+ }
+ select {
+ case req := <-rpcCalled:
+ t.Fatalf("unexpected RPC sent while recorder state=%s: %#v", state, req)
+ default:
+ }
+ assertTaskStateRecoveryStatus(t, db, "task-config-busy", "pending")
+ })
+ }
+}
+
+func TestRecorderConfigTimeoutAndDisconnectedKeepTaskPending(t *testing.T) {
+ t.Run("timeout", func(t *testing.T) {
+ db := newTaskStateRecoveryDB(t)
+ defer db.Close()
+ seedTaskStateRecoveryTask(t, db, "task-config-timeout", "pending")
+
+ hub := services.NewRecorderHub()
+ rc, requests := attachRecorderRPCObserverWithConn(t, hub, "robot-001")
+ handler := NewRecorderHandler(hub, &config.RecorderConfig{ResponseTimeout: 1}, db)
+ _ = handler.applyRecorderStateSnapshot(rc, services.RecorderState{CurrentState: "idle"}, "state_update")
+
+ router := newRecorderInteractionRouter(handler)
+ w := recorderInteractionPost(t, router, "/recorder/robot-001/config", `{"task_config":{"task_id":"task-config-timeout"}}`)
+
+ if w.Code != http.StatusGatewayTimeout {
+ t.Fatalf("status=%d want=%d body=%s", w.Code, http.StatusGatewayTimeout, w.Body.String())
+ }
+ req := receiveRecorderRPCRequest(t, requests, "config")
+ if req.Action != "config" {
+ t.Fatalf("action=%q want=config", req.Action)
+ }
+ assertTaskStateRecoveryStatus(t, db, "task-config-timeout", "pending")
+ })
+
+ t.Run("disconnected", func(t *testing.T) {
+ db := newTaskStateRecoveryDB(t)
+ defer db.Close()
+ seedTaskStateRecoveryTask(t, db, "task-config-disconnected", "pending")
+
+ hub := services.NewRecorderHub()
+ handler := NewRecorderHandler(hub, &config.RecorderConfig{ResponseTimeout: 1}, db)
+ router := newRecorderInteractionRouter(handler)
+ w := recorderInteractionPost(t, router, "/recorder/robot-001/config", `{"task_config":{"task_id":"task-config-disconnected"}}`)
+
+ if w.Code != http.StatusNotFound {
+ t.Fatalf("status=%d want=%d body=%s", w.Code, http.StatusNotFound, w.Body.String())
+ }
+ assertTaskStateRecoveryStatus(t, db, "task-config-disconnected", "pending")
+ })
+}
+
+func TestRecorderBeginSuccessTimeoutAndDisconnectedTaskState(t *testing.T) {
+ t.Run("success advances pending to in_progress", func(t *testing.T) {
+ db := newTaskStateRecoveryDB(t)
+ defer db.Close()
+ seedTaskStateRecoveryTask(t, db, "task-begin-success", "pending")
+
+ hub := services.NewRecorderHub()
+ attachRecorderRPCResponder(t, hub, "robot-001", func(req services.RPCRequest) services.RPCResponse {
+ return services.RPCResponse{Success: true}
+ })
+ handler := NewRecorderHandler(hub, &config.RecorderConfig{ResponseTimeout: 1}, db)
+ router := newRecorderInteractionRouter(handler)
+
+ w := recorderInteractionPost(t, router, "/recorder/robot-001/begin", `{"task_id":"task-begin-success"}`)
+
+ if w.Code != http.StatusOK {
+ t.Fatalf("status=%d want=%d body=%s", w.Code, http.StatusOK, w.Body.String())
+ }
+ assertTaskStateRecoveryStatus(t, db, "task-begin-success", "in_progress")
+ assertTaskStateRecoveryTimestampSet(t, db, "task-begin-success", "started_at")
+ })
+
+ t.Run("timeout keeps pending", func(t *testing.T) {
+ db := newTaskStateRecoveryDB(t)
+ defer db.Close()
+ seedTaskStateRecoveryTask(t, db, "task-begin-timeout", "pending")
+
+ hub := services.NewRecorderHub()
+ _, requests := attachRecorderRPCObserverWithConn(t, hub, "robot-001")
+ handler := NewRecorderHandler(hub, &config.RecorderConfig{ResponseTimeout: 1}, db)
+ router := newRecorderInteractionRouter(handler)
+
+ w := recorderInteractionPost(t, router, "/recorder/robot-001/begin", `{"task_id":"task-begin-timeout"}`)
+
+ if w.Code != http.StatusGatewayTimeout {
+ t.Fatalf("status=%d want=%d body=%s", w.Code, http.StatusGatewayTimeout, w.Body.String())
+ }
+ receiveRecorderRPCRequest(t, requests, "begin")
+ assertTaskStateRecoveryStatus(t, db, "task-begin-timeout", "pending")
+ })
+
+ t.Run("disconnected keeps pending", func(t *testing.T) {
+ db := newTaskStateRecoveryDB(t)
+ defer db.Close()
+ seedTaskStateRecoveryTask(t, db, "task-begin-disconnected", "pending")
+
+ hub := services.NewRecorderHub()
+ handler := NewRecorderHandler(hub, &config.RecorderConfig{ResponseTimeout: 1}, db)
+ router := newRecorderInteractionRouter(handler)
+
+ w := recorderInteractionPost(t, router, "/recorder/robot-001/begin", `{"task_id":"task-begin-disconnected"}`)
+
+ if w.Code != http.StatusNotFound {
+ t.Fatalf("status=%d want=%d body=%s", w.Code, http.StatusNotFound, w.Body.String())
+ }
+ assertTaskStateRecoveryStatus(t, db, "task-begin-disconnected", "pending")
+ })
+}
+
+func TestRecorderBeginSuccessAdvancesReadyAndLeavesInProgress(t *testing.T) {
+ for _, tt := range []struct {
+ name string
+ initial string
+ wantStatus string
+ }{
+ {name: "ready advances to in_progress", initial: "ready", wantStatus: "in_progress"},
+ {name: "in_progress stays in_progress", initial: "in_progress", wantStatus: "in_progress"},
+ } {
+ t.Run(tt.name, func(t *testing.T) {
+ db := newTaskStateRecoveryDB(t)
+ defer db.Close()
+ seedTaskStateRecoveryTask(t, db, "task-begin-idempotent", tt.initial)
+
+ hub := services.NewRecorderHub()
+ attachRecorderRPCResponder(t, hub, "robot-001", func(req services.RPCRequest) services.RPCResponse {
+ return services.RPCResponse{Success: true}
+ })
+ handler := NewRecorderHandler(hub, &config.RecorderConfig{ResponseTimeout: 1}, db)
+ router := newRecorderInteractionRouter(handler)
+
+ w := recorderInteractionPost(t, router, "/recorder/robot-001/begin", `{"task_id":"task-begin-idempotent"}`)
+
+ if w.Code != http.StatusOK {
+ t.Fatalf("status=%d want=%d body=%s", w.Code, http.StatusOK, w.Body.String())
+ }
+ assertTaskStateRecoveryStatus(t, db, "task-begin-idempotent", tt.wantStatus)
+ if tt.initial == "ready" {
+ assertTaskStateRecoveryTimestampSet(t, db, "task-begin-idempotent", "started_at")
+ }
+ })
+ }
+}
+
+func TestRecorderForwardOnlyActionsDoNotMutateTaskState(t *testing.T) {
+ for _, tt := range []struct {
+ name string
+ path string
+ body string
+ initial string
+ }{
+ {name: "finish", path: "/recorder/robot-001/finish", body: `{"task_id":"task-forward-only"}`, initial: "in_progress"},
+ {name: "pause", path: "/recorder/robot-001/pause", body: `{}`, initial: "in_progress"},
+ {name: "resume", path: "/recorder/robot-001/resume", body: `{}`, initial: "in_progress"},
+ {name: "quit", path: "/recorder/robot-001/quit", body: `{}`, initial: "ready"},
+ } {
+ t.Run(tt.name, func(t *testing.T) {
+ db := newTaskStateRecoveryDB(t)
+ defer db.Close()
+ seedTaskStateRecoveryTask(t, db, "task-forward-only", tt.initial)
+
+ hub := services.NewRecorderHub()
+ attachRecorderRPCResponder(t, hub, "robot-001", func(req services.RPCRequest) services.RPCResponse {
+ return services.RPCResponse{Success: true}
+ })
+ handler := NewRecorderHandler(hub, &config.RecorderConfig{ResponseTimeout: 1}, db)
+ router := newRecorderInteractionRouter(handler)
+
+ w := recorderInteractionPost(t, router, tt.path, tt.body)
+
+ if w.Code != http.StatusOK {
+ t.Fatalf("status=%d want=%d body=%s", w.Code, http.StatusOK, w.Body.String())
+ }
+ assertTaskStateRecoveryStatus(t, db, "task-forward-only", tt.initial)
+ })
+ }
+}
+
+func TestRecorderRPCPauseResumeWithoutTaskIDPreservesBoundTask(t *testing.T) {
+ hub := services.NewRecorderHub()
+ handler := NewRecorderHandler(hub, &config.RecorderConfig{}, nil)
+ rc := hub.NewRecorderConn(nil, "robot-001", "127.0.0.1")
+ if !hub.Connect("robot-001", rc) {
+ t.Fatalf("connect recorder: initial connect failed")
+ }
+
+ _ = handler.applyRecorderStateSnapshot(rc, services.RecorderState{CurrentState: "recording", TaskID: "task-live"}, "state_update")
+ _ = handler.applyRecorderStateSnapshot(rc, services.RecorderState{CurrentState: "paused"}, "rpc_response:pause")
+
+ st := rc.GetState()
+ if st.CurrentState != "paused" || st.TaskID != "task-live" {
+ t.Fatalf("state=(%q,%q), want paused/task-live", st.CurrentState, st.TaskID)
+ }
+
+ _ = handler.applyRecorderStateSnapshot(rc, services.RecorderState{CurrentState: "recording"}, "rpc_response:resume")
+ st = rc.GetState()
+ if st.CurrentState != "recording" || st.TaskID != "task-live" {
+ t.Fatalf("state=(%q,%q), want recording/task-live", st.CurrentState, st.TaskID)
+ }
+
+ _ = handler.applyRecorderStateSnapshot(rc, services.RecorderState{CurrentState: "idle", TaskID: "task-live"}, "rpc_response:finish")
+ st = rc.GetState()
+ if st.CurrentState != "idle" || st.TaskID != "" {
+ t.Fatalf("idle state=(%q,%q), want idle with empty task", st.CurrentState, st.TaskID)
+ }
+}
+
+func TestRecorderAuthoritativeStateWithoutTaskIDMarksUnsynced(t *testing.T) {
+ hub := services.NewRecorderHub()
+ handler := NewRecorderHandler(hub, &config.RecorderConfig{}, nil)
+ rc := hub.NewRecorderConn(nil, "robot-001", "127.0.0.1")
+ if !hub.Connect("robot-001", rc) {
+ t.Fatalf("connect recorder: initial connect failed")
+ }
+
+ _ = handler.applyRecorderStateSnapshot(rc, services.RecorderState{CurrentState: "recording", TaskID: "task-live"}, "state_update")
+ if err := handler.applyRecorderStateSnapshot(rc, services.RecorderState{CurrentState: "paused"}, "state_update"); err == nil {
+ t.Fatalf("expected missing task_id error for authoritative state_update")
+ }
+
+ st := rc.GetState()
+ if st.CurrentState != "recording" || st.TaskID != "task-live" {
+ t.Fatalf("state=(%q,%q), want previous recording/task-live", st.CurrentState, st.TaskID)
+ }
+ if rc.IsStateSynced() {
+ t.Fatalf("state should be marked unsynced after authoritative snapshot without task_id")
+ }
+}
+
+func TestRecorderFinishUsesCachedTaskIDWhenBodyEmpty(t *testing.T) {
+ hub := services.NewRecorderHub()
+ reqC := make(chan services.RPCRequest, 1)
+ rc := attachRecorderRPCResponderWithConn(t, hub, "robot-001", func(req services.RPCRequest) services.RPCResponse {
+ reqC <- req
+ return services.RPCResponse{Success: true, Data: map[string]interface{}{"state": "idle"}}
+ })
+ rc.UpdateState(services.RecorderState{CurrentState: "paused", TaskID: "task-live"})
+ handler := NewRecorderHandler(hub, &config.RecorderConfig{ResponseTimeout: 1}, nil)
+ router := newRecorderInteractionRouter(handler)
+
+ w := recorderInteractionPost(t, router, "/recorder/robot-001/finish", `{}`)
+ if w.Code != http.StatusOK {
+ t.Fatalf("status=%d want=%d body=%s", w.Code, http.StatusOK, w.Body.String())
+ }
+
+ select {
+ case req := <-reqC:
+ if got := strings.TrimSpace(stringValue(req.Params, "task_id")); got != "task-live" {
+ t.Fatalf("finish params task_id=%q want task-live; params=%#v", got, req.Params)
+ }
+ case <-time.After(time.Second):
+ t.Fatalf("timed out waiting for finish RPC")
+ }
+
+ st := rc.GetState()
+ if st.CurrentState != "idle" || st.TaskID != "" {
+ t.Fatalf("finish response state=(%q,%q), want idle with empty task", st.CurrentState, st.TaskID)
+ }
+}
+
+func TestRecorderCancelAndClearTaskStateSideEffects(t *testing.T) {
+ t.Run("cancel success reverts ready and in_progress", func(t *testing.T) {
+ for _, initial := range []string{"ready", "in_progress"} {
+ t.Run(initial, func(t *testing.T) {
+ db := newTaskStateRecoveryDB(t)
+ defer db.Close()
+ seedTaskStateRecoveryTask(t, db, "task-cancel", initial)
+
+ hub := services.NewRecorderHub()
+ attachRecorderRPCResponder(t, hub, "robot-001", func(req services.RPCRequest) services.RPCResponse {
+ return services.RPCResponse{Success: true}
+ })
+ handler := NewRecorderHandler(hub, &config.RecorderConfig{ResponseTimeout: 1}, db)
+ router := newRecorderInteractionRouter(handler)
+
+ w := recorderInteractionPost(t, router, "/recorder/robot-001/cancel", `{"task_id":"task-cancel"}`)
+
+ if w.Code != http.StatusOK {
+ t.Fatalf("status=%d want=%d body=%s", w.Code, http.StatusOK, w.Body.String())
+ }
+ assertTaskStateRecoveryStatus(t, db, "task-cancel", "pending")
+ })
+ }
+ })
+
+ t.Run("cancel failure keeps in_progress", func(t *testing.T) {
+ db := newTaskStateRecoveryDB(t)
+ defer db.Close()
+ seedTaskStateRecoveryTask(t, db, "task-cancel-false", "in_progress")
+
+ hub := services.NewRecorderHub()
+ attachRecorderRPCResponder(t, hub, "robot-001", func(req services.RPCRequest) services.RPCResponse {
+ return services.RPCResponse{Success: false, Message: "device rejected cancel"}
+ })
+ handler := NewRecorderHandler(hub, &config.RecorderConfig{ResponseTimeout: 1}, db)
+ router := newRecorderInteractionRouter(handler)
+
+ w := recorderInteractionPost(t, router, "/recorder/robot-001/cancel", `{"task_id":"task-cancel-false"}`)
+
+ if w.Code != http.StatusOK {
+ t.Fatalf("status=%d want=%d body=%s", w.Code, http.StatusOK, w.Body.String())
+ }
+ assertTaskStateRecoveryStatus(t, db, "task-cancel-false", "in_progress")
+ })
+
+ t.Run("clear success reverts only ready and strips task payload", func(t *testing.T) {
+ db := newTaskStateRecoveryDB(t)
+ defer db.Close()
+ seedTaskStateRecoveryTask(t, db, "task-clear-ready", "ready")
+ seedTaskStateRecoveryTask(t, db, "task-clear-progress", "in_progress")
+
+ hub := services.NewRecorderHub()
+ rpcRequests := make(chan services.RPCRequest, 1)
+ attachRecorderRPCResponder(t, hub, "robot-001", func(req services.RPCRequest) services.RPCResponse {
+ rpcRequests <- req
+ return services.RPCResponse{Success: true}
+ })
+ handler := NewRecorderHandler(hub, &config.RecorderConfig{ResponseTimeout: 1}, db)
+ router := newRecorderInteractionRouter(handler)
+
+ w := recorderInteractionPost(t, router, "/recorder/robot-001/clear", `{"task_id":"task-clear-ready"}`)
+
+ if w.Code != http.StatusOK {
+ t.Fatalf("status=%d want=%d body=%s", w.Code, http.StatusOK, w.Body.String())
+ }
+ req := receiveRecorderRPCRequest(t, rpcRequests, "clear")
+ if len(req.Params) != 0 {
+ t.Fatalf("clear RPC params=%#v want empty", req.Params)
+ }
+ assertTaskStateRecoveryStatus(t, db, "task-clear-ready", "pending")
+ assertTaskStateRecoveryStatus(t, db, "task-clear-progress", "in_progress")
+ })
+}
+
+func TestRecorderCancelTimeoutAndDisconnectedKeepTaskState(t *testing.T) {
+ t.Run("timeout", func(t *testing.T) {
+ db := newTaskStateRecoveryDB(t)
+ defer db.Close()
+ seedTaskStateRecoveryTask(t, db, "task-cancel-timeout", "in_progress")
+
+ hub := services.NewRecorderHub()
+ _, requests := attachRecorderRPCObserverWithConn(t, hub, "robot-001")
+ handler := NewRecorderHandler(hub, &config.RecorderConfig{ResponseTimeout: 1}, db)
+ router := newRecorderInteractionRouter(handler)
+
+ w := recorderInteractionPost(t, router, "/recorder/robot-001/cancel", `{"task_id":"task-cancel-timeout"}`)
+
+ if w.Code != http.StatusGatewayTimeout {
+ t.Fatalf("status=%d want=%d body=%s", w.Code, http.StatusGatewayTimeout, w.Body.String())
+ }
+ receiveRecorderRPCRequest(t, requests, "cancel")
+ assertTaskStateRecoveryStatus(t, db, "task-cancel-timeout", "in_progress")
+ })
+
+ t.Run("disconnected", func(t *testing.T) {
+ db := newTaskStateRecoveryDB(t)
+ defer db.Close()
+ seedTaskStateRecoveryTask(t, db, "task-cancel-disconnected", "ready")
+
+ hub := services.NewRecorderHub()
+ handler := NewRecorderHandler(hub, &config.RecorderConfig{ResponseTimeout: 1}, db)
+ router := newRecorderInteractionRouter(handler)
+
+ w := recorderInteractionPost(t, router, "/recorder/robot-001/cancel", `{"task_id":"task-cancel-disconnected"}`)
+
+ if w.Code != http.StatusNotFound {
+ t.Fatalf("status=%d want=%d body=%s", w.Code, http.StatusNotFound, w.Body.String())
+ }
+ assertTaskStateRecoveryStatus(t, db, "task-cancel-disconnected", "ready")
+ })
+}
+
+func TestRecorderClearFailureTimeoutAndDisconnectedKeepTaskState(t *testing.T) {
+ t.Run("success false", func(t *testing.T) {
+ db := newTaskStateRecoveryDB(t)
+ defer db.Close()
+ seedTaskStateRecoveryTask(t, db, "task-clear-false", "ready")
+
+ hub := services.NewRecorderHub()
+ attachRecorderRPCResponder(t, hub, "robot-001", func(req services.RPCRequest) services.RPCResponse {
+ return services.RPCResponse{Success: false, Message: "device rejected clear"}
+ })
+ handler := NewRecorderHandler(hub, &config.RecorderConfig{ResponseTimeout: 1}, db)
+ router := newRecorderInteractionRouter(handler)
+
+ w := recorderInteractionPost(t, router, "/recorder/robot-001/clear", `{"task_id":"task-clear-false"}`)
+
+ if w.Code != http.StatusOK {
+ t.Fatalf("status=%d want=%d body=%s", w.Code, http.StatusOK, w.Body.String())
+ }
+ assertTaskStateRecoveryStatus(t, db, "task-clear-false", "ready")
+ })
+
+ t.Run("timeout", func(t *testing.T) {
+ db := newTaskStateRecoveryDB(t)
+ defer db.Close()
+ seedTaskStateRecoveryTask(t, db, "task-clear-timeout", "ready")
+
+ hub := services.NewRecorderHub()
+ _, requests := attachRecorderRPCObserverWithConn(t, hub, "robot-001")
+ handler := NewRecorderHandler(hub, &config.RecorderConfig{ResponseTimeout: 1}, db)
+ router := newRecorderInteractionRouter(handler)
+
+ w := recorderInteractionPost(t, router, "/recorder/robot-001/clear", `{"task_id":"task-clear-timeout"}`)
+
+ if w.Code != http.StatusGatewayTimeout {
+ t.Fatalf("status=%d want=%d body=%s", w.Code, http.StatusGatewayTimeout, w.Body.String())
+ }
+ receiveRecorderRPCRequest(t, requests, "clear")
+ assertTaskStateRecoveryStatus(t, db, "task-clear-timeout", "ready")
+ })
+
+ t.Run("disconnected", func(t *testing.T) {
+ db := newTaskStateRecoveryDB(t)
+ defer db.Close()
+ seedTaskStateRecoveryTask(t, db, "task-clear-disconnected", "ready")
+
+ hub := services.NewRecorderHub()
+ handler := NewRecorderHandler(hub, &config.RecorderConfig{ResponseTimeout: 1}, db)
+ router := newRecorderInteractionRouter(handler)
+
+ w := recorderInteractionPost(t, router, "/recorder/robot-001/clear", `{"task_id":"task-clear-disconnected"}`)
+
+ if w.Code != http.StatusNotFound {
+ t.Fatalf("status=%d want=%d body=%s", w.Code, http.StatusNotFound, w.Body.String())
+ }
+ assertTaskStateRecoveryStatus(t, db, "task-clear-disconnected", "ready")
+ })
+}
+
+func TestRecorderGetStatsContracts(t *testing.T) {
+ t.Run("disconnected returns connected false", func(t *testing.T) {
+ hub := services.NewRecorderHub()
+ handler := NewRecorderHandler(hub, &config.RecorderConfig{ResponseTimeout: 1}, nil)
+ router := newRecorderInteractionRouter(handler)
+
+ w := httptest.NewRecorder()
+ router.ServeHTTP(w, httptest.NewRequest(http.MethodGet, "/recorder/robot-001/stats", nil))
+
+ if w.Code != http.StatusOK {
+ t.Fatalf("status=%d want=%d body=%s", w.Code, http.StatusOK, w.Body.String())
+ }
+ var body map[string]interface{}
+ if err := json.Unmarshal(w.Body.Bytes(), &body); err != nil {
+ t.Fatalf("decode stats response: %v", err)
+ }
+ if body["connected"] != false {
+ t.Fatalf("connected=%v want=false body=%v", body["connected"], body)
+ }
+ })
+
+ t.Run("axon failure returns connected true with error", func(t *testing.T) {
+ hub := services.NewRecorderHub()
+ attachRecorderRPCResponder(t, hub, "robot-001", func(req services.RPCRequest) services.RPCResponse {
+ return services.RPCResponse{Success: false, Message: "stats unavailable", Data: map[string]interface{}{"written": float64(3)}}
+ })
+ handler := NewRecorderHandler(hub, &config.RecorderConfig{ResponseTimeout: 1}, nil)
+ router := newRecorderInteractionRouter(handler)
+
+ w := httptest.NewRecorder()
+ router.ServeHTTP(w, httptest.NewRequest(http.MethodGet, "/recorder/robot-001/stats", nil))
+
+ if w.Code != http.StatusOK {
+ t.Fatalf("status=%d want=%d body=%s", w.Code, http.StatusOK, w.Body.String())
+ }
+ var body map[string]interface{}
+ if err := json.Unmarshal(w.Body.Bytes(), &body); err != nil {
+ t.Fatalf("decode stats response: %v", err)
+ }
+ if body["connected"] != true || body["error"] != "stats unavailable" {
+ t.Fatalf("unexpected stats body=%v", body)
+ }
+ })
+
+ t.Run("timeout returns gateway timeout", func(t *testing.T) {
+ hub := services.NewRecorderHub()
+ _, requests := attachRecorderRPCObserverWithConn(t, hub, "robot-001")
+ handler := NewRecorderHandler(hub, &config.RecorderConfig{ResponseTimeout: 1}, nil)
+ router := newRecorderInteractionRouter(handler)
+
+ w := httptest.NewRecorder()
+ router.ServeHTTP(w, httptest.NewRequest(http.MethodGet, "/recorder/robot-001/stats", nil))
+
+ if w.Code != http.StatusGatewayTimeout {
+ t.Fatalf("status=%d want=%d body=%s", w.Code, http.StatusGatewayTimeout, w.Body.String())
+ }
+ receiveRecorderRPCRequest(t, requests, "get_stats")
+ })
+}
+
+func TestRecorderDisconnectRevertsOnlyRunnableTasksForDevice(t *testing.T) {
+ db := newRecorderInteractionDB(t)
+ seedRecorderInteractionDevice(t, db, "robot-001", 1, 101)
+ seedRecorderInteractionDevice(t, db, "robot-002", 2, 102)
+ seedRecorderInteractionTask(t, db, 101, "task-ready", "ready", false)
+ seedRecorderInteractionTask(t, db, 101, "task-progress", "in_progress", false)
+ seedRecorderInteractionTask(t, db, 101, "task-pending", "pending", false)
+ seedRecorderInteractionTask(t, db, 101, "task-completed", "completed", false)
+ seedRecorderInteractionTask(t, db, 101, "task-deleted", "ready", true)
+ seedRecorderInteractionTask(t, db, 102, "task-other-device", "ready", false)
+
+ revertRunnableTasksOnDeviceDisconnect(db, "robot-001", nil, 0, false)
+
+ assertRecorderInteractionTaskStatus(t, db, "task-ready", "pending")
+ assertRecorderInteractionTaskStatus(t, db, "task-progress", "pending")
+ assertRecorderInteractionTaskStatus(t, db, "task-pending", "pending")
+ assertRecorderInteractionTaskStatus(t, db, "task-completed", "completed")
+ assertRecorderInteractionTaskStatus(t, db, "task-deleted", "ready")
+ assertRecorderInteractionTaskStatus(t, db, "task-other-device", "ready")
+ assertRecorderInteractionTaskRuntimeCleared(t, db, "task-ready")
+ assertRecorderInteractionTaskRuntimeCleared(t, db, "task-progress")
+}
+
+func TestOldRecorderDisconnectAfterReplacementDoesNotRevertTasks(t *testing.T) {
+ db := newRecorderInteractionDB(t)
+ seedRecorderInteractionDevice(t, db, "robot-001", 1, 101)
+ seedRecorderInteractionTask(t, db, 101, "task-current-ready", "ready", false)
+
+ hub := services.NewRecorderHub()
+ oldConn := hub.NewRecorderConn(nil, "robot-001", "127.0.0.1")
+ if !hub.Connect("robot-001", oldConn) {
+ t.Fatalf("connect old recorder failed")
+ }
+ newConn := hub.NewRecorderConn(nil, "robot-001", "127.0.0.1")
+ hub.ConnectReplacingExisting("robot-001", newConn)
+ if hub.Disconnect("robot-001", oldConn) {
+ revertRunnableTasksOnDeviceDisconnect(db, "robot-001", nil, 0, false)
+ }
+
+ assertRecorderInteractionTaskStatus(t, db, "task-current-ready", "ready")
+}
+
+func newRecorderInteractionRouter(handler *RecorderHandler) *gin.Engine {
+ gin.SetMode(gin.TestMode)
+ router := gin.New()
+ router.GET("/recorder/devices", handler.ListDevices)
+ router.GET("/recorder/:device_id/state", handler.GetState)
+ router.GET("/recorder/:device_id/stats", handler.GetStats)
+ router.POST("/recorder/:device_id/config", handler.Config)
+ router.POST("/recorder/:device_id/begin", handler.Begin)
+ router.POST("/recorder/:device_id/finish", handler.Finish)
+ router.POST("/recorder/:device_id/pause", handler.Pause)
+ router.POST("/recorder/:device_id/resume", handler.Resume)
+ router.POST("/recorder/:device_id/cancel", handler.Cancel)
+ router.POST("/recorder/:device_id/clear", handler.Clear)
+ router.POST("/recorder/:device_id/quit", handler.Quit)
+ return router
+}
+
+func recorderInteractionPost(t *testing.T, router *gin.Engine, path string, body string) *httptest.ResponseRecorder {
+ t.Helper()
+ w := httptest.NewRecorder()
+ req := httptest.NewRequest(http.MethodPost, path, bytes.NewReader([]byte(body)))
+ req.Header.Set("Content-Type", "application/json")
+ router.ServeHTTP(w, req)
+ return w
+}
+
+func attachRecorderRPCObserverWithConn(t *testing.T, hub *services.RecorderHub, deviceID string) (*services.RecorderConn, <-chan services.RPCRequest) {
+ t.Helper()
+ serverConn, clientConn := newRecorderHandlerTestWebSocketPair(t)
+ rc := hub.NewRecorderConn(serverConn, deviceID, "127.0.0.1")
+ if !hub.Connect(deviceID, rc) {
+ t.Fatalf("connect recorder: initial connect failed")
+ }
+
+ requests := make(chan services.RPCRequest, 8)
+ ctx, cancel := context.WithCancel(context.Background())
+ t.Cleanup(cancel)
+ go func() {
+ for {
+ var req services.RPCRequest
+ if err := wsjson.Read(ctx, clientConn, &req); err != nil {
+ return
+ }
+ requests <- req
+ }
+ }()
+ return rc, requests
+}
+
+func receiveRecorderRPCRequest(t *testing.T, requests <-chan services.RPCRequest, wantAction string) services.RPCRequest {
+ t.Helper()
+ select {
+ case req := <-requests:
+ if wantAction != "" && req.Action != wantAction {
+ t.Fatalf("RPC action=%q want=%q req=%#v", req.Action, wantAction, req)
+ }
+ return req
+ case <-time.After(2 * time.Second):
+ t.Fatalf("timed out waiting for recorder RPC action %q", wantAction)
+ }
+ return services.RPCRequest{}
+}
+
+func newRecorderInteractionDB(t *testing.T) *sqlx.DB {
+ t.Helper()
+ name := strings.NewReplacer("/", "_", " ", "_", "-", "_").Replace(t.Name())
+ db, err := sqlx.Open("sqlite", "file:"+name+"?mode=memory&cache=shared")
+ if err != nil {
+ t.Fatalf("open sqlite db: %v", err)
+ }
+ db.SetMaxOpenConns(8)
+ t.Cleanup(func() { _ = db.Close() })
+ if _, err := db.Exec(`CREATE TABLE robots (
+ id INTEGER PRIMARY KEY,
+ device_id TEXT NOT NULL,
+ deleted_at TIMESTAMP NULL
+ )`); err != nil {
+ t.Fatalf("create robots schema: %v", err)
+ }
+ if _, err := db.Exec(`CREATE TABLE workstations (
+ id INTEGER PRIMARY KEY,
+ robot_id INTEGER NOT NULL,
+ deleted_at TIMESTAMP NULL
+ )`); err != nil {
+ t.Fatalf("create workstations schema: %v", err)
+ }
+ if _, err := db.Exec(`CREATE TABLE tasks (
+ id INTEGER PRIMARY KEY AUTOINCREMENT,
+ task_id TEXT NOT NULL,
+ batch_id INTEGER NOT NULL DEFAULT 0,
+ workstation_id INTEGER NOT NULL,
+ status TEXT NOT NULL,
+ ready_at TIMESTAMP NULL,
+ started_at TIMESTAMP NULL,
+ completed_at TIMESTAMP NULL,
+ error_message TEXT NULL,
+ created_at TIMESTAMP NOT NULL,
+ updated_at TIMESTAMP NOT NULL,
+ deleted_at TIMESTAMP NULL
+ )`); err != nil {
+ t.Fatalf("create tasks schema: %v", err)
+ }
+ return db
+}
+
+func seedRecorderInteractionDevice(t *testing.T, db *sqlx.DB, deviceID string, robotID int64, workstationID int64) {
+ t.Helper()
+ if _, err := db.Exec(`INSERT INTO robots (id, device_id) VALUES (?, ?)`, robotID, deviceID); err != nil {
+ t.Fatalf("seed robot: %v", err)
+ }
+ if _, err := db.Exec(`INSERT INTO workstations (id, robot_id) VALUES (?, ?)`, workstationID, robotID); err != nil {
+ t.Fatalf("seed workstation: %v", err)
+ }
+}
+
+func seedRecorderInteractionTask(t *testing.T, db *sqlx.DB, workstationID int64, taskID string, status string, deleted bool) {
+ t.Helper()
+ now := time.Now().UTC()
+ var deletedAt interface{}
+ if deleted {
+ deletedAt = now
+ }
+ if _, err := db.Exec(
+ `INSERT INTO tasks (task_id, batch_id, workstation_id, status, ready_at, started_at, completed_at, error_message, created_at, updated_at, deleted_at)
+ VALUES (?, 1, ?, ?, ?, ?, ?, ?, ?, ?, ?)`,
+ taskID,
+ workstationID,
+ status,
+ now,
+ now,
+ now,
+ "previous error",
+ now,
+ now,
+ deletedAt,
+ ); err != nil {
+ t.Fatalf("seed task %s: %v", taskID, err)
+ }
+}
+
+func assertRecorderInteractionTaskStatus(t *testing.T, db *sqlx.DB, taskID string, want string) {
+ t.Helper()
+ var got string
+ if err := db.Get(&got, `SELECT status FROM tasks WHERE task_id = ?`, taskID); err != nil {
+ t.Fatalf("query task status: %v", err)
+ }
+ if got != want {
+ t.Fatalf("task %s status=%q want=%q", taskID, got, want)
+ }
+}
+
+func assertRecorderInteractionTaskRuntimeCleared(t *testing.T, db *sqlx.DB, taskID string) {
+ t.Helper()
+ var count int
+ if err := db.Get(&count, `SELECT CASE WHEN ready_at IS NULL AND started_at IS NULL AND completed_at IS NULL AND error_message IS NULL THEN 1 ELSE 0 END FROM tasks WHERE task_id = ?`, taskID); err != nil {
+ t.Fatalf("query runtime columns for %s: %v", taskID, err)
+ }
+ if count != 1 {
+ t.Fatalf("runtime columns for task %s were not cleared", taskID)
+ }
+}
+
+func TestRecorderIgnoresRPCResponseFromReplacedConnection(t *testing.T) {
+ hub := services.NewRecorderHub()
+ handler := NewRecorderHandler(hub, &config.RecorderConfig{}, nil)
+ oldConn := hub.NewRecorderConn(nil, "robot-001", "127.0.0.1")
+ if !hub.Connect("robot-001", oldConn) {
+ t.Fatalf("connect old recorder failed")
+ }
+
+ serverConn, clientConn := newRecorderHandlerTestWebSocketPair(t)
+ newConn := hub.NewRecorderConn(serverConn, "robot-001", "127.0.0.1")
+ hub.ConnectReplacingExisting("robot-001", newConn)
+
+ type rpcResult struct {
+ resp *services.RPCResponse
+ err error
+ }
+ ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
+ defer cancel()
+ resultC := make(chan rpcResult, 1)
+ go func() {
+ resp, err := hub.SendRPC(ctx, "robot-001", "begin", nil, time.Second)
+ resultC <- rpcResult{resp: resp, err: err}
+ }()
+
+ var req services.RPCRequest
+ if err := wsjson.Read(ctx, clientConn, &req); err != nil {
+ t.Fatalf("read current recorder request: %v", err)
+ }
+ handler.handleMessage("robot-001", oldConn, map[string]interface{}{
+ "type": "rpc_response",
+ "request_id": req.RequestID,
+ "success": true,
+ })
+
+ select {
+ case result := <-resultC:
+ t.Fatalf("old connection rpc_response completed current RPC: resp=%+v err=%v", result.resp, result.err)
+ case <-time.After(150 * time.Millisecond):
+ }
+
+ if !hub.HandleRPCResponse("robot-001", &services.RPCResponse{RequestID: req.RequestID, Success: false, Message: "current response"}) {
+ t.Fatalf("current rpc_response was not matched")
+ }
+ select {
+ case result := <-resultC:
+ if result.err != nil {
+ t.Fatalf("current response returned error: %v", result.err)
+ }
+ if result.resp == nil || result.resp.Success {
+ t.Fatalf("current response=%+v want success=false", result.resp)
+ }
+ case <-time.After(time.Second):
+ t.Fatalf("timed out waiting for current rpc_response")
+ }
+}
+
+func TestRecorderUnmatchedRPCResponseDoesNotUnblockPendingRequest(t *testing.T) {
+ hub := services.NewRecorderHub()
+ serverConn, clientConn := newRecorderHandlerTestWebSocketPair(t)
+ rc := hub.NewRecorderConn(serverConn, "robot-001", "127.0.0.1")
+ if !hub.Connect("robot-001", rc) {
+ t.Fatalf("connect recorder failed")
+ }
+
+ type rpcResult struct {
+ resp *services.RPCResponse
+ err error
+ }
+ ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
+ defer cancel()
+ resultC := make(chan rpcResult, 1)
+ go func() {
+ resp, err := hub.SendRPC(ctx, "robot-001", "begin", nil, time.Second)
+ resultC <- rpcResult{resp: resp, err: err}
+ }()
+
+ var req services.RPCRequest
+ if err := wsjson.Read(ctx, clientConn, &req); err != nil {
+ t.Fatalf("read recorder request: %v", err)
+ }
+ if hub.HandleRPCResponse("robot-001", &services.RPCResponse{RequestID: "bogus", Success: true}) {
+ t.Fatalf("unmatched rpc_response returned matched=true")
+ }
+ select {
+ case result := <-resultC:
+ t.Fatalf("unmatched rpc_response completed pending RPC: resp=%+v err=%v", result.resp, result.err)
+ case <-time.After(150 * time.Millisecond):
+ }
+
+ if !hub.HandleRPCResponse("robot-001", &services.RPCResponse{RequestID: req.RequestID, Success: true}) {
+ t.Fatalf("matched rpc_response returned false")
+ }
+ select {
+ case result := <-resultC:
+ if result.err != nil || result.resp == nil || !result.resp.Success {
+ t.Fatalf("matched response result resp=%+v err=%v", result.resp, result.err)
+ }
+ case <-time.After(time.Second):
+ t.Fatalf("timed out waiting for matched rpc_response")
+ }
+}
+
+func TestRecorderWebSocketEmptyStateUpdateDoesNotMarkSynced(t *testing.T) {
+ db := newRecorderInteractionDB(t)
+ seedRecorderInteractionDevice(t, db, "robot-001", 1, 101)
+
+ hub := services.NewRecorderHub()
+ handler := NewRecorderHandler(hub, &config.RecorderConfig{ResponseTimeout: 1}, db)
+ wsURL := newRecorderWebSocketTestServer(t, handler, "robot-001")
+ axon := connectFakeRecorderAxon(t, wsURL)
+ axon.receiveRPC(t, "get_state")
+
+ axon.sendStateUpdate(t, "", "")
+ waitForRecorderCachedRawState(t, hub, "robot-001", false, "", "")
+
+ router := newRecorderInteractionRouter(handler)
+ w := httptest.NewRecorder()
+ router.ServeHTTP(w, httptest.NewRequest(http.MethodGet, "/recorder/robot-001/state", nil))
+ if w.Code != http.StatusOK {
+ t.Fatalf("status=%d want=%d body=%s", w.Code, http.StatusOK, w.Body.String())
+ }
+ var body map[string]interface{}
+ if err := json.Unmarshal(w.Body.Bytes(), &body); err != nil {
+ t.Fatalf("decode state response: %v", err)
+ }
+ if body["state_synced"] != false || body["syncing"] != true {
+ t.Fatalf("empty state_update should keep syncing; body=%v", body)
+ }
+}
+
+func TestRecorderWebSocketReplacementIgnoresOldConnectionLateMessages(t *testing.T) {
+ db := newRecorderInteractionDB(t)
+ seedRecorderInteractionDevice(t, db, "robot-001", 1, 101)
+ seedRecorderInteractionTask(t, db, 101, "task-old-late", "pending", false)
+ seedRecorderInteractionTask(t, db, 101, "task-current-config", "pending", false)
+
+ hub := services.NewRecorderHub()
+ handler := NewRecorderHandler(hub, &config.RecorderConfig{ResponseTimeout: 1}, db)
+ wsURL := newRecorderWebSocketTestServer(t, handler, "robot-001")
+ oldAxon := connectFakeRecorderAxon(t, wsURL)
+ oldAxon.receiveRPC(t, "get_state")
+ oldConn := hub.Get("robot-001")
+ if oldConn == nil {
+ t.Fatalf("old recorder connection was not registered")
+ }
+
+ newAxon := connectFakeRecorderAxon(t, wsURL)
+ getState := newAxon.receiveRPC(t, "get_state")
+ newAxon.respondRPC(t, getState, true, "", map[string]interface{}{"state": "idle"})
+ waitForRecorderCachedState(t, hub, "robot-001", true, "idle", "")
+
+ handler.handleMessage("robot-001", oldConn, map[string]interface{}{
+ "type": "state_update",
+ "data": map[string]interface{}{"current": "ready", "task_id": "task-old-late"},
+ })
+ handler.handleMessage("robot-001", oldConn, map[string]interface{}{
+ "type": "config_applied",
+ "data": map[string]interface{}{"task_id": "task-old-late"},
+ })
+ assertRecorderInteractionTaskStatus(t, db, "task-old-late", "pending")
+
+ router := newRecorderInteractionRouter(handler)
+ resultC := recorderInteractionPostAsync(router, "/recorder/robot-001/config", `{"task_config":{"task_id":"task-current-config"}}`)
+ configReq := newAxon.receiveRPC(t, "config")
+ handler.handleMessage("robot-001", oldConn, map[string]interface{}{
+ "type": "rpc_response",
+ "request_id": configReq.RequestID,
+ "success": true,
+ })
+ select {
+ case w := <-resultC:
+ t.Fatalf("old rpc_response completed current config request: status=%d body=%s", w.Code, w.Body.String())
+ case <-time.After(150 * time.Millisecond):
+ }
+ newAxon.respondRPC(t, configReq, true, "", nil)
+ w := receiveRecorderHTTPResponse(t, resultC)
+ if w.Code != http.StatusOK {
+ t.Fatalf("status=%d want=%d body=%s", w.Code, http.StatusOK, w.Body.String())
+ }
+ assertRecorderInteractionTaskStatus(t, db, "task-current-config", "ready")
+}
+
+func TestRecorderWebSocketGetStateTimeoutLeavesUnsyncedAndBlocksConfig(t *testing.T) {
+ db := newRecorderInteractionDB(t)
+ seedRecorderInteractionDevice(t, db, "robot-001", 1, 101)
+ seedRecorderInteractionTask(t, db, 101, "task-timeout-blocked", "pending", false)
+
+ hub := services.NewRecorderHub()
+ handler := NewRecorderHandler(hub, &config.RecorderConfig{ResponseTimeout: 1}, db)
+ wsURL := newRecorderWebSocketTestServer(t, handler, "robot-001")
+ axon := connectFakeRecorderAxon(t, wsURL)
+ axon.receiveRPC(t, "get_state")
+
+ waitForRecorderConnectionSyncedFlagAfter(t, hub, "robot-001", false, 1200*time.Millisecond)
+
+ router := newRecorderInteractionRouter(handler)
+ stateW := httptest.NewRecorder()
+ router.ServeHTTP(stateW, httptest.NewRequest(http.MethodGet, "/recorder/robot-001/state", nil))
+ if stateW.Code != http.StatusOK {
+ t.Fatalf("state status=%d want=%d body=%s", stateW.Code, http.StatusOK, stateW.Body.String())
+ }
+ var stateBody map[string]interface{}
+ if err := json.Unmarshal(stateW.Body.Bytes(), &stateBody); err != nil {
+ t.Fatalf("decode state response: %v", err)
+ }
+ if stateBody["state_synced"] != false || stateBody["syncing"] != true {
+ t.Fatalf("timed-out get_state should keep syncing; body=%v", stateBody)
+ }
+
+ configW := recorderInteractionPost(t, router, "/recorder/robot-001/config", `{"task_config":{"task_id":"task-timeout-blocked"}}`)
+ if configW.Code != http.StatusConflict {
+ t.Fatalf("config status=%d want=%d body=%s", configW.Code, http.StatusConflict, configW.Body.String())
+ }
+ axon.assertNoRPC(t, 150*time.Millisecond)
+ assertRecorderInteractionTaskStatus(t, db, "task-timeout-blocked", "pending")
+
+ axon.sendStateUpdate(t, "idle", "")
+ waitForRecorderCachedState(t, hub, "robot-001", true, "idle", "")
+ resultC := recorderInteractionPostAsync(router, "/recorder/robot-001/config", `{"task_config":{"task_id":"task-timeout-blocked"}}`)
+ configReq := axon.receiveRPC(t, "config")
+ axon.respondRPC(t, configReq, true, "", nil)
+ configW = receiveRecorderHTTPResponse(t, resultC)
+ if configW.Code != http.StatusOK {
+ t.Fatalf("config after state_update status=%d want=%d body=%s", configW.Code, http.StatusOK, configW.Body.String())
+ }
+ assertRecorderInteractionTaskStatus(t, db, "task-timeout-blocked", "ready")
+}
+
+func TestRecorderWebSocketGetStateFailureLeavesUnsyncedAndBlocksConfig(t *testing.T) {
+ db := newRecorderInteractionDB(t)
+ seedRecorderInteractionDevice(t, db, "robot-001", 1, 101)
+ seedRecorderInteractionTask(t, db, 101, "task-blocked", "pending", false)
+
+ hub := services.NewRecorderHub()
+ handler := NewRecorderHandler(hub, &config.RecorderConfig{ResponseTimeout: 1}, db)
+ wsURL := newRecorderWebSocketTestServer(t, handler, "robot-001")
+ axon := connectFakeRecorderAxon(t, wsURL)
+
+ req := axon.receiveRPC(t, "get_state")
+ axon.respondRPC(t, req, false, "state unavailable", nil)
+ waitForRecorderConnectionSyncedFlag(t, hub, "robot-001", false)
+
+ router := newRecorderInteractionRouter(handler)
+ w := recorderInteractionPost(t, router, "/recorder/robot-001/config", `{"task_config":{"task_id":"task-blocked"}}`)
+ if w.Code != http.StatusConflict {
+ t.Fatalf("status=%d want=%d body=%s", w.Code, http.StatusConflict, w.Body.String())
+ }
+ axon.assertNoRPC(t, 150*time.Millisecond)
+ assertRecorderInteractionTaskStatus(t, db, "task-blocked", "pending")
+}
+
+func TestRecorderWebSocketRecordingReconnectRestoresSameTaskWithoutDuplicateConfig(t *testing.T) {
+ db := newRecorderInteractionDB(t)
+ seedRecorderInteractionDevice(t, db, "robot-001", 1, 101)
+ seedRecorderInteractionTask(t, db, 101, "task-recording-reconnect", "in_progress", false)
+ seedRecorderInteractionTask(t, db, 101, "task-next-recording", "pending", false)
+
+ hub := services.NewRecorderHub()
+ handler := NewRecorderHandler(hub, &config.RecorderConfig{ResponseTimeout: 1}, db)
+ wsURL := newRecorderWebSocketTestServer(t, handler, "robot-001")
+
+ first := connectFakeRecorderAxon(t, wsURL)
+ first.receiveRPC(t, "get_state")
+ first.closeNow()
+ waitForRecorderHubDisconnected(t, hub, "robot-001")
+ waitForRecorderInteractionTaskStatus(t, db, "task-recording-reconnect", "pending")
+
+ second := connectFakeRecorderAxon(t, wsURL)
+ req := second.receiveRPC(t, "get_state")
+ second.respondRPC(t, req, true, "", map[string]interface{}{"state": "recording", "task_id": "task-recording-reconnect"})
+ waitForRecorderInteractionTaskStatus(t, db, "task-recording-reconnect", "in_progress")
+ waitForRecorderCachedState(t, hub, "robot-001", true, "recording", "task-recording-reconnect")
+
+ router := newRecorderInteractionRouter(handler)
+ w := recorderInteractionPost(t, router, "/recorder/robot-001/config", `{"task_config":{"task_id":"task-next-recording"}}`)
+ if w.Code != http.StatusConflict {
+ t.Fatalf("status after recording reconnect=%d want=%d body=%s", w.Code, http.StatusConflict, w.Body.String())
+ }
+ second.assertNoRPC(t, 150*time.Millisecond)
+ assertRecorderInteractionTaskStatus(t, db, "task-next-recording", "pending")
+}
+
+func TestRecorderWebSocketRPCActionProtocol(t *testing.T) {
+ db := newRecorderInteractionDB(t)
+ seedRecorderInteractionDevice(t, db, "robot-001", 1, 101)
+ seedRecorderInteractionTask(t, db, 101, "task-protocol", "pending", false)
+
+ hub := services.NewRecorderHub()
+ handler := NewRecorderHandler(hub, &config.RecorderConfig{ResponseTimeout: 1}, db)
+ wsURL := newRecorderWebSocketTestServer(t, handler, "robot-001")
+ axon := connectFakeRecorderAxon(t, wsURL)
+ getState := axon.receiveRPC(t, "get_state")
+ axon.respondRPC(t, getState, true, "", map[string]interface{}{"state": "idle"})
+ waitForRecorderCachedState(t, hub, "robot-001", true, "idle", "")
+ router := newRecorderInteractionRouter(handler)
+
+ cases := []struct {
+ name string
+ method string
+ path string
+ body string
+ wantAction string
+ check func(t *testing.T, req services.RPCRequest)
+ }{
+ {
+ name: "config forwards task_config payload",
+ method: http.MethodPost,
+ path: "/recorder/robot-001/config",
+ body: `{"task_config":{"task_id":"task-protocol","device_id":"robot-001"}}`,
+ wantAction: "config",
+ check: func(t *testing.T, req services.RPCRequest) {
+ t.Helper()
+ tc, ok := req.Params["task_config"].(map[string]interface{})
+ if !ok || tc["task_id"] != "task-protocol" || tc["device_id"] != "robot-001" {
+ t.Fatalf("config params=%#v missing task_config", req.Params)
+ }
+ },
+ },
+ {
+ name: "begin forwards task_id",
+ method: http.MethodPost,
+ path: "/recorder/robot-001/begin",
+ body: `{"task_id":"task-protocol"}`,
+ wantAction: "begin",
+ check: func(t *testing.T, req services.RPCRequest) {
+ t.Helper()
+ if req.Params["task_id"] != "task-protocol" {
+ t.Fatalf("begin params=%#v missing task_id", req.Params)
+ }
+ },
+ },
+ {
+ name: "cancel forwards task_id",
+ method: http.MethodPost,
+ path: "/recorder/robot-001/cancel",
+ body: `{"task_id":"task-protocol"}`,
+ wantAction: "cancel",
+ check: func(t *testing.T, req services.RPCRequest) {
+ t.Helper()
+ if req.Params["task_id"] != "task-protocol" {
+ t.Fatalf("cancel params=%#v missing task_id", req.Params)
+ }
+ },
+ },
+ {
+ name: "clear strips http task_id payload",
+ method: http.MethodPost,
+ path: "/recorder/robot-001/clear",
+ body: `{"task_id":"task-protocol"}`,
+ wantAction: "clear",
+ check: func(t *testing.T, req services.RPCRequest) {
+ t.Helper()
+ if len(req.Params) != 0 {
+ t.Fatalf("clear params=%#v want empty", req.Params)
+ }
+ },
+ },
+ {name: "finish forwards task_id", method: http.MethodPost, path: "/recorder/robot-001/finish", body: `{"task_id":"task-protocol"}`, wantAction: "finish"},
+ {name: "pause action", method: http.MethodPost, path: "/recorder/robot-001/pause", body: `{}`, wantAction: "pause"},
+ {name: "resume action", method: http.MethodPost, path: "/recorder/robot-001/resume", body: `{}`, wantAction: "resume"},
+ {name: "quit action", method: http.MethodPost, path: "/recorder/robot-001/quit", body: `{}`, wantAction: "quit"},
+ {name: "get_stats action", method: http.MethodGet, path: "/recorder/robot-001/stats", wantAction: "get_stats"},
+ }
+
+ for _, tc := range cases {
+ t.Run(tc.name, func(t *testing.T) {
+ resultC := recorderInteractionRequestAsync(router, tc.method, tc.path, tc.body)
+ req := axon.receiveRPC(t, tc.wantAction)
+ if tc.check != nil {
+ tc.check(t, req)
+ }
+ axon.respondRPC(t, req, true, "", map[string]interface{}{"ok": true})
+ w := receiveRecorderHTTPResponse(t, resultC)
+ if w.Code != http.StatusOK {
+ t.Fatalf("status=%d want=%d body=%s", w.Code, http.StatusOK, w.Body.String())
+ }
+ })
+ }
+}
+
+func TestRecorderWebSocketConnectGetStateIdleMarksStateSynced(t *testing.T) {
+ db := newRecorderInteractionDB(t)
+ seedRecorderInteractionDevice(t, db, "robot-001", 1, 101)
+
+ hub := services.NewRecorderHub()
+ handler := NewRecorderHandler(hub, &config.RecorderConfig{ResponseTimeout: 1}, db)
+ wsURL := newRecorderWebSocketTestServer(t, handler, "robot-001")
+ axon := connectFakeRecorderAxon(t, wsURL)
+
+ req := axon.receiveRPC(t, "get_state")
+ axon.respondRPC(t, req, true, "", map[string]interface{}{"state": "idle"})
+
+ waitForRecorderCachedState(t, hub, "robot-001", true, "idle", "")
+
+ router := newRecorderInteractionRouter(handler)
+ w := httptest.NewRecorder()
+ router.ServeHTTP(w, httptest.NewRequest(http.MethodGet, "/recorder/robot-001/state", nil))
+ if w.Code != http.StatusOK {
+ t.Fatalf("status=%d want=%d body=%s", w.Code, http.StatusOK, w.Body.String())
+ }
+ var body map[string]interface{}
+ if err := json.Unmarshal(w.Body.Bytes(), &body); err != nil {
+ t.Fatalf("decode state response: %v", err)
+ }
+ if body["connected"] != true || body["state_synced"] != true || body["syncing"] != false || body["current_state"] != "idle" {
+ t.Fatalf("unexpected state response: %v", body)
+ }
+}
+
+func TestRecorderWebSocketStateUpdateBeforeGetStateResponseIsIdempotent(t *testing.T) {
+ db := newRecorderInteractionDB(t)
+ seedRecorderInteractionDevice(t, db, "robot-001", 1, 101)
+ seedRecorderInteractionTask(t, db, 101, "task-ready-ws", "pending", false)
+
+ hub := services.NewRecorderHub()
+ handler := NewRecorderHandler(hub, &config.RecorderConfig{ResponseTimeout: 1}, db)
+ wsURL := newRecorderWebSocketTestServer(t, handler, "robot-001")
+ axon := connectFakeRecorderAxon(t, wsURL)
+
+ axon.sendStateUpdate(t, "ready", "task-ready-ws")
+ waitForRecorderInteractionTaskStatus(t, db, "task-ready-ws", "ready")
+ readyAt := recorderInteractionTaskTimestamp(t, db, "task-ready-ws", "ready_at")
+ if readyAt == "" {
+ t.Fatalf("ready_at was not set after state_update")
+ }
+
+ req := axon.receiveRPC(t, "get_state")
+ axon.respondRPC(t, req, true, "", map[string]interface{}{"state": "ready", "task_id": "task-ready-ws"})
+
+ waitForRecorderCachedState(t, hub, "robot-001", true, "ready", "task-ready-ws")
+ assertRecorderInteractionTaskStatus(t, db, "task-ready-ws", "ready")
+ if got := recorderInteractionTaskTimestamp(t, db, "task-ready-ws", "ready_at"); got != readyAt {
+ t.Fatalf("ready_at changed after duplicate get_state: got=%q want=%q", got, readyAt)
+ }
+}
+
+func TestRecorderWebSocketGetStateReadyRejectsConfigWithoutSendingRPC(t *testing.T) {
+ db := newRecorderInteractionDB(t)
+ seedRecorderInteractionDevice(t, db, "robot-001", 1, 101)
+ seedRecorderInteractionTask(t, db, 101, "task-current", "pending", false)
+ seedRecorderInteractionTask(t, db, 101, "task-next", "pending", false)
+
+ hub := services.NewRecorderHub()
+ handler := NewRecorderHandler(hub, &config.RecorderConfig{ResponseTimeout: 1}, db)
+ wsURL := newRecorderWebSocketTestServer(t, handler, "robot-001")
+ axon := connectFakeRecorderAxon(t, wsURL)
+
+ req := axon.receiveRPC(t, "get_state")
+ axon.respondRPC(t, req, true, "", map[string]interface{}{"state": "ready", "task_id": "task-current"})
+ waitForRecorderInteractionTaskStatus(t, db, "task-current", "ready")
+ waitForRecorderCachedState(t, hub, "robot-001", true, "ready", "task-current")
+
+ router := newRecorderInteractionRouter(handler)
+ w := recorderInteractionPost(t, router, "/recorder/robot-001/config", `{"task_config":{"task_id":"task-next"}}`)
+ if w.Code != http.StatusConflict {
+ t.Fatalf("status=%d want=%d body=%s", w.Code, http.StatusConflict, w.Body.String())
+ }
+ axon.assertNoRPC(t, 150*time.Millisecond)
+ assertRecorderInteractionTaskStatus(t, db, "task-next", "pending")
+}
+
+func TestRecorderWebSocketUnsyncedConfigRejectedUntilValidStateUpdate(t *testing.T) {
+ db := newRecorderInteractionDB(t)
+ seedRecorderInteractionDevice(t, db, "robot-001", 1, 101)
+ seedRecorderInteractionTask(t, db, 101, "task-after-sync", "pending", false)
+
+ hub := services.NewRecorderHub()
+ handler := NewRecorderHandler(hub, &config.RecorderConfig{ResponseTimeout: 1}, db)
+ wsURL := newRecorderWebSocketTestServer(t, handler, "robot-001")
+ axon := connectFakeRecorderAxon(t, wsURL)
+
+ axon.receiveRPC(t, "get_state")
+ router := newRecorderInteractionRouter(handler)
+ w := recorderInteractionPost(t, router, "/recorder/robot-001/config", `{"task_config":{"task_id":"task-after-sync"}}`)
+ if w.Code != http.StatusConflict {
+ t.Fatalf("status=%d want=%d body=%s", w.Code, http.StatusConflict, w.Body.String())
+ }
+ axon.assertNoRPC(t, 150*time.Millisecond)
+ assertRecorderInteractionTaskStatus(t, db, "task-after-sync", "pending")
+
+ axon.sendStateUpdate(t, "idle", "")
+ waitForRecorderCachedState(t, hub, "robot-001", true, "idle", "")
+
+ resultC := recorderInteractionPostAsync(router, "/recorder/robot-001/config", `{"task_config":{"task_id":"task-after-sync"}}`)
+ configReq := axon.receiveRPC(t, "config")
+ axon.respondRPC(t, configReq, true, "", nil)
+ w = receiveRecorderHTTPResponse(t, resultC)
+ if w.Code != http.StatusOK {
+ t.Fatalf("status=%d want=%d body=%s", w.Code, http.StatusOK, w.Body.String())
+ }
+ assertRecorderInteractionTaskStatus(t, db, "task-after-sync", "ready")
+ axon.assertNoRPC(t, 150*time.Millisecond)
+}
+
+func TestRecorderWebSocketReadyReconnectRestoresSameTaskWithoutDuplicateConfig(t *testing.T) {
+ db := newRecorderInteractionDB(t)
+ seedRecorderInteractionDevice(t, db, "robot-001", 1, 101)
+ seedRecorderInteractionTask(t, db, 101, "task-ready-reconnect", "ready", false)
+ seedRecorderInteractionTask(t, db, 101, "task-next-reconnect", "pending", false)
+
+ hub := services.NewRecorderHub()
+ handler := NewRecorderHandler(hub, &config.RecorderConfig{ResponseTimeout: 1}, db)
+ wsURL := newRecorderWebSocketTestServer(t, handler, "robot-001")
+
+ first := connectFakeRecorderAxon(t, wsURL)
+ first.receiveRPC(t, "get_state")
+ first.closeNow()
+ waitForRecorderHubDisconnected(t, hub, "robot-001")
+ waitForRecorderInteractionTaskStatus(t, db, "task-ready-reconnect", "pending")
+
+ router := newRecorderInteractionRouter(handler)
+ w := recorderInteractionPost(t, router, "/recorder/robot-001/config", `{"task_config":{"task_id":"task-ready-reconnect"}}`)
+ if w.Code != http.StatusNotFound {
+ t.Fatalf("status during disconnect=%d want=%d body=%s", w.Code, http.StatusNotFound, w.Body.String())
+ }
+
+ second := connectFakeRecorderAxon(t, wsURL)
+ req := second.receiveRPC(t, "get_state")
+ second.respondRPC(t, req, true, "", map[string]interface{}{"state": "ready", "task_id": "task-ready-reconnect"})
+ waitForRecorderInteractionTaskStatus(t, db, "task-ready-reconnect", "ready")
+ waitForRecorderCachedState(t, hub, "robot-001", true, "ready", "task-ready-reconnect")
+
+ w = recorderInteractionPost(t, router, "/recorder/robot-001/config", `{"task_config":{"task_id":"task-next-reconnect"}}`)
+ if w.Code != http.StatusConflict {
+ t.Fatalf("status after ready reconnect=%d want=%d body=%s", w.Code, http.StatusConflict, w.Body.String())
+ }
+ second.assertNoRPC(t, 150*time.Millisecond)
+ assertRecorderInteractionTaskStatus(t, db, "task-next-reconnect", "pending")
+}
+
+func TestRecorderWebSocketIdleReconnectAllowsSingleConfig(t *testing.T) {
+ db := newRecorderInteractionDB(t)
+ seedRecorderInteractionDevice(t, db, "robot-001", 1, 101)
+ seedRecorderInteractionTask(t, db, 101, "task-idle-reconnect", "ready", false)
+
+ hub := services.NewRecorderHub()
+ handler := NewRecorderHandler(hub, &config.RecorderConfig{ResponseTimeout: 1}, db)
+ wsURL := newRecorderWebSocketTestServer(t, handler, "robot-001")
+
+ first := connectFakeRecorderAxon(t, wsURL)
+ first.receiveRPC(t, "get_state")
+ first.closeNow()
+ waitForRecorderHubDisconnected(t, hub, "robot-001")
+ waitForRecorderInteractionTaskStatus(t, db, "task-idle-reconnect", "pending")
+
+ second := connectFakeRecorderAxon(t, wsURL)
+ req := second.receiveRPC(t, "get_state")
+ second.respondRPC(t, req, true, "", map[string]interface{}{"state": "idle"})
+ waitForRecorderCachedState(t, hub, "robot-001", true, "idle", "")
+
+ router := newRecorderInteractionRouter(handler)
+ resultC := recorderInteractionPostAsync(router, "/recorder/robot-001/config", `{"task_config":{"task_id":"task-idle-reconnect"}}`)
+ configReq := second.receiveRPC(t, "config")
+ second.respondRPC(t, configReq, true, "", nil)
+ w := receiveRecorderHTTPResponse(t, resultC)
+ if w.Code != http.StatusOK {
+ t.Fatalf("status=%d want=%d body=%s", w.Code, http.StatusOK, w.Body.String())
+ }
+ assertRecorderInteractionTaskStatus(t, db, "task-idle-reconnect", "ready")
+ second.assertNoRPC(t, 150*time.Millisecond)
+}
+
+type fakeRecorderAxon struct {
+ conn *websocket.Conn
+ requests chan services.RPCRequest
+ cancel context.CancelFunc
+}
+
+func newRecorderWebSocketTestServer(t *testing.T, handler *RecorderHandler, deviceID string) string {
+ t.Helper()
+ server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+ handler.HandleWebSocket(w, r, deviceID)
+ }))
+ t.Cleanup(server.Close)
+ return "ws" + strings.TrimPrefix(server.URL, "http")
+}
+
+func connectFakeRecorderAxon(t *testing.T, wsURL string) *fakeRecorderAxon {
+ t.Helper()
+ conn, _, err := websocket.Dial(context.Background(), wsURL, nil)
+ if err != nil {
+ t.Fatalf("dial fake recorder websocket: %v", err)
+ }
+ ctx, cancel := context.WithCancel(context.Background())
+ axon := &fakeRecorderAxon{
+ conn: conn,
+ requests: make(chan services.RPCRequest, 16),
+ cancel: cancel,
+ }
+ t.Cleanup(func() {
+ cancel()
+ _ = conn.CloseNow()
+ })
+ go func() {
+ for {
+ var req services.RPCRequest
+ if err := wsjson.Read(ctx, conn, &req); err != nil {
+ return
+ }
+ axon.requests <- req
+ }
+ }()
+ return axon
+}
+
+func (f *fakeRecorderAxon) receiveRPC(t *testing.T, wantAction string) services.RPCRequest {
+ t.Helper()
+ return receiveRecorderRPCRequest(t, f.requests, wantAction)
+}
+
+func (f *fakeRecorderAxon) respondRPC(t *testing.T, req services.RPCRequest, success bool, message string, data map[string]interface{}) {
+ t.Helper()
+ ctx, cancel := context.WithTimeout(context.Background(), time.Second)
+ defer cancel()
+ resp := services.RPCResponse{
+ Type: "rpc_response",
+ RequestID: req.RequestID,
+ Success: success,
+ Message: message,
+ Data: data,
+ }
+ if err := wsjson.Write(ctx, f.conn, resp); err != nil {
+ t.Fatalf("write fake recorder rpc response: %v", err)
+ }
+}
+
+func (f *fakeRecorderAxon) sendStateUpdate(t *testing.T, current string, taskID string) {
+ t.Helper()
+ ctx, cancel := context.WithTimeout(context.Background(), time.Second)
+ defer cancel()
+ data := map[string]interface{}{"current": current}
+ if taskID != "" {
+ data["task_id"] = taskID
+ }
+ msg := map[string]interface{}{
+ "type": "state_update",
+ "data": data,
+ }
+ if err := wsjson.Write(ctx, f.conn, msg); err != nil {
+ t.Fatalf("write fake recorder state_update: %v", err)
+ }
+}
+
+func (f *fakeRecorderAxon) assertNoRPC(t *testing.T, duration time.Duration) {
+ t.Helper()
+ select {
+ case req := <-f.requests:
+ t.Fatalf("unexpected recorder RPC: %#v", req)
+ case <-time.After(duration):
+ }
+}
+
+func (f *fakeRecorderAxon) closeNow() {
+ f.cancel()
+ _ = f.conn.CloseNow()
+}
+
+func recorderInteractionPostAsync(router *gin.Engine, path string, body string) <-chan *httptest.ResponseRecorder {
+ return recorderInteractionRequestAsync(router, http.MethodPost, path, body)
+}
+
+func recorderInteractionRequestAsync(router *gin.Engine, method string, path string, body string) <-chan *httptest.ResponseRecorder {
+ resultC := make(chan *httptest.ResponseRecorder, 1)
+ go func() {
+ w := httptest.NewRecorder()
+ req := httptest.NewRequest(method, path, bytes.NewReader([]byte(body)))
+ if method != http.MethodGet {
+ req.Header.Set("Content-Type", "application/json")
+ }
+ router.ServeHTTP(w, req)
+ resultC <- w
+ }()
+ return resultC
+}
+
+func receiveRecorderHTTPResponse(t *testing.T, resultC <-chan *httptest.ResponseRecorder) *httptest.ResponseRecorder {
+ t.Helper()
+ select {
+ case w := <-resultC:
+ return w
+ case <-time.After(2 * time.Second):
+ t.Fatalf("timed out waiting for recorder HTTP response")
+ }
+ return nil
+}
+
+func waitForRecorderConnectionSyncedFlagAfter(t *testing.T, hub *services.RecorderHub, deviceID string, wantSynced bool, minWait time.Duration) {
+ t.Helper()
+ deadline := time.Now().Add(3 * time.Second)
+ readyAt := time.Now().Add(minWait)
+ for time.Now().Before(deadline) {
+ rc := hub.Get(deviceID)
+ if rc != nil && rc.IsStateSynced() == wantSynced && time.Now().After(readyAt) {
+ return
+ }
+ time.Sleep(10 * time.Millisecond)
+ }
+ rc := hub.Get(deviceID)
+ if rc == nil {
+ t.Fatalf("recorder %s not connected while waiting after %s for synced=%v", deviceID, minWait, wantSynced)
+ }
+ t.Fatalf("recorder synced=%v want=%v after %s", rc.IsStateSynced(), wantSynced, minWait)
+}
+
+func waitForRecorderCachedRawState(t *testing.T, hub *services.RecorderHub, deviceID string, wantSynced bool, wantState string, wantTaskID string) {
+ t.Helper()
+ deadline := time.Now().Add(2 * time.Second)
+ for time.Now().Before(deadline) {
+ rc := hub.Get(deviceID)
+ if rc != nil && rc.IsStateSynced() == wantSynced {
+ st := rc.GetState()
+ if strings.TrimSpace(st.CurrentState) == wantState && strings.TrimSpace(st.TaskID) == wantTaskID {
+ return
+ }
+ }
+ time.Sleep(10 * time.Millisecond)
+ }
+ rc := hub.Get(deviceID)
+ if rc == nil {
+ t.Fatalf("recorder %s not connected while waiting for raw state=%q task=%q synced=%v", deviceID, wantState, wantTaskID, wantSynced)
+ }
+ st := rc.GetState()
+ t.Fatalf("recorder raw state current=%q task=%q synced=%v, want current=%q task=%q synced=%v", st.CurrentState, st.TaskID, rc.IsStateSynced(), wantState, wantTaskID, wantSynced)
+}
+
+func waitForRecorderConnectionSyncedFlag(t *testing.T, hub *services.RecorderHub, deviceID string, wantSynced bool) {
+ t.Helper()
+ deadline := time.Now().Add(2 * time.Second)
+ for time.Now().Before(deadline) {
+ rc := hub.Get(deviceID)
+ if rc != nil && rc.IsStateSynced() == wantSynced {
+ return
+ }
+ time.Sleep(10 * time.Millisecond)
+ }
+ rc := hub.Get(deviceID)
+ if rc == nil {
+ t.Fatalf("recorder %s not connected while waiting for synced=%v", deviceID, wantSynced)
+ }
+ t.Fatalf("recorder synced=%v want=%v", rc.IsStateSynced(), wantSynced)
+}
+
+func waitForRecorderCachedState(t *testing.T, hub *services.RecorderHub, deviceID string, wantSynced bool, wantState string, wantTaskID string) {
+ t.Helper()
+ deadline := time.Now().Add(2 * time.Second)
+ for time.Now().Before(deadline) {
+ rc := hub.Get(deviceID)
+ if rc != nil && rc.IsStateSynced() == wantSynced {
+ st := rc.GetState()
+ if strings.EqualFold(strings.TrimSpace(st.CurrentState), wantState) && strings.TrimSpace(st.TaskID) == wantTaskID {
+ return
+ }
+ }
+ time.Sleep(10 * time.Millisecond)
+ }
+ rc := hub.Get(deviceID)
+ if rc == nil {
+ t.Fatalf("recorder %s not connected while waiting for state=%s task=%s synced=%v", deviceID, wantState, wantTaskID, wantSynced)
+ }
+ st := rc.GetState()
+ t.Fatalf("recorder state current=%q task=%q synced=%v, want current=%q task=%q synced=%v", st.CurrentState, st.TaskID, rc.IsStateSynced(), wantState, wantTaskID, wantSynced)
+}
+
+func waitForRecorderHubDisconnected(t *testing.T, hub *services.RecorderHub, deviceID string) {
+ t.Helper()
+ deadline := time.Now().Add(2 * time.Second)
+ for time.Now().Before(deadline) {
+ if hub.Get(deviceID) == nil {
+ return
+ }
+ time.Sleep(10 * time.Millisecond)
+ }
+ t.Fatalf("recorder %s remained connected", deviceID)
+}
+
+func waitForRecorderInteractionTaskStatus(t *testing.T, db *sqlx.DB, taskID string, want string) {
+ t.Helper()
+ deadline := time.Now().Add(2 * time.Second)
+ for time.Now().Before(deadline) {
+ var got string
+ if err := db.Get(&got, `SELECT status FROM tasks WHERE task_id = ?`, taskID); err != nil {
+ t.Fatalf("query task status: %v", err)
+ }
+ if got == want {
+ return
+ }
+ time.Sleep(10 * time.Millisecond)
+ }
+ assertRecorderInteractionTaskStatus(t, db, taskID, want)
+}
+
+func recorderInteractionTaskTimestamp(t *testing.T, db *sqlx.DB, taskID string, column string) string {
+ t.Helper()
+ if column != "ready_at" && column != "started_at" && column != "completed_at" {
+ t.Fatalf("unexpected timestamp column %q", column)
+ }
+ var got string
+ if err := db.Get(&got, `SELECT COALESCE(CAST(`+column+` AS TEXT), '') FROM tasks WHERE task_id = ?`, taskID); err != nil {
+ t.Fatalf("query %s for %s: %v", column, taskID, err)
+ }
+ return got
+}
diff --git a/internal/api/handlers/sync.go b/internal/api/handlers/sync.go
index 367c60d..9d1694c 100644
--- a/internal/api/handlers/sync.go
+++ b/internal/api/handlers/sync.go
@@ -596,14 +596,17 @@ func (h *SyncHandler) GetSyncStatus(c *gin.Context) {
// @Router /sync/config [get]
func (h *SyncHandler) GetSyncConfig(c *gin.Context) {
workerRunning := false
+ autoScanEnabled := false
maxRetries := 0
if h.syncWorker != nil {
workerRunning = h.syncWorker.IsRunning()
+ autoScanEnabled = h.syncWorker.AutoScanEnabled()
maxRetries = h.syncWorker.MaxRetries()
}
c.JSON(http.StatusOK, gin.H{
- "worker_running": workerRunning,
- "max_retries": maxRetries,
+ "worker_running": workerRunning,
+ "auto_scan_enabled": autoScanEnabled,
+ "max_retries": maxRetries,
})
}
diff --git a/internal/api/handlers/sync_test.go b/internal/api/handlers/sync_test.go
index 0c34502..a825937 100644
--- a/internal/api/handlers/sync_test.go
+++ b/internal/api/handlers/sync_test.go
@@ -11,6 +11,8 @@ import (
"testing"
"time"
+ "archebase.com/keystone-edge/internal/services"
+
"github.com/gin-gonic/gin"
"github.com/jmoiron/sqlx"
_ "modernc.org/sqlite"
@@ -26,6 +28,44 @@ func TestSyncHandlerRegisterRoutes(t *testing.T) {
handler.RegisterRoutes(api)
}
+func TestGetSyncConfigIncludesAutoScanEnabled(t *testing.T) {
+ gin.SetMode(gin.TestMode)
+
+ worker := services.NewSyncWorker(nil, nil, nil, "", services.SyncWorkerConfig{
+ MaxRetries: 5,
+ AutoScanEnabled: true,
+ }, nil)
+ router := gin.New()
+ handler := NewSyncHandler(nil, worker)
+ handler.RegisterRoutes(router.Group("/api/v1"))
+
+ req := httptest.NewRequest(http.MethodGet, "/api/v1/sync/config", nil)
+ rec := httptest.NewRecorder()
+ router.ServeHTTP(rec, req)
+
+ if rec.Code != http.StatusOK {
+ t.Fatalf("status = %d, body = %s", rec.Code, rec.Body.String())
+ }
+
+ var got struct {
+ WorkerRunning bool `json:"worker_running"`
+ AutoScanEnabled bool `json:"auto_scan_enabled"`
+ MaxRetries int `json:"max_retries"`
+ }
+ if err := json.Unmarshal(rec.Body.Bytes(), &got); err != nil {
+ t.Fatalf("decode response: %v", err)
+ }
+ if got.WorkerRunning {
+ t.Fatal("worker_running = true, want false for not-started test worker")
+ }
+ if !got.AutoScanEnabled {
+ t.Fatal("auto_scan_enabled = false, want true")
+ }
+ if got.MaxRetries != 5 {
+ t.Fatalf("max_retries = %d, want 5", got.MaxRetries)
+ }
+}
+
func TestListEpisodeSyncSummariesGroupsByEpisode(t *testing.T) {
gin.SetMode(gin.TestMode)
db := setupSyncHandlerTestDB(t)
diff --git a/internal/api/handlers/task.go b/internal/api/handlers/task.go
index 9f6db6c..73f3a3e 100644
--- a/internal/api/handlers/task.go
+++ b/internal/api/handlers/task.go
@@ -6,9 +6,11 @@
package handlers
import (
+ "context"
"crypto/rand"
"database/sql"
"encoding/hex"
+ "errors"
"fmt"
"net/http"
"strconv"
@@ -56,24 +58,42 @@ func newPublicBatchID(now time.Time, seq int) (string, error) {
), nil
}
+type transferUploadHub interface {
+ Get(deviceID string) *services.TransferConn
+ SendToDeviceWithTimeout(ctx context.Context, deviceID string, msg map[string]interface{}, timeout time.Duration) error
+}
+
// TaskHandler handles task-related HTTP requests
type TaskHandler struct {
- db *sqlx.DB
- hub *services.TransferHub
- recorderHub *services.RecorderHub
- recorderRPCTimeout time.Duration
+ db *sqlx.DB
+ hub transferUploadHub
+ recorderHub *services.RecorderHub
+ recorderRPCTimeout time.Duration
+ transferWriteTimeout time.Duration
}
-// NewTaskHandler creates a new TaskHandler
-func NewTaskHandler(db *sqlx.DB, hub *services.TransferHub, recorderHub *services.RecorderHub, recorderRPCTimeout time.Duration) *TaskHandler {
+// NewTaskHandler creates a new TaskHandler.
+func NewTaskHandler(db *sqlx.DB, hub *services.TransferHub, recorderHub *services.RecorderHub, recorderRPCTimeout time.Duration, transferWriteTimeout ...time.Duration) *TaskHandler {
+ writeTimeout := services.DefaultTransferWriteTimeout
+ if len(transferWriteTimeout) > 0 && transferWriteTimeout[0] > 0 {
+ writeTimeout = transferWriteTimeout[0]
+ }
return &TaskHandler{
- db: db,
- hub: hub,
- recorderHub: recorderHub,
- recorderRPCTimeout: recorderRPCTimeout,
+ db: db,
+ hub: hub,
+ recorderHub: recorderHub,
+ recorderRPCTimeout: recorderRPCTimeout,
+ transferWriteTimeout: writeTimeout,
}
}
+func (h *TaskHandler) axonTransferWriteTimeout() time.Duration {
+ if h == nil || h.transferWriteTimeout <= 0 {
+ return services.DefaultTransferWriteTimeout
+ }
+ return h.transferWriteTimeout
+}
+
// TaskConfig represents the task configuration response
type TaskConfig struct {
TaskID string `json:"task_id"`
@@ -1050,9 +1070,16 @@ func (h *TaskHandler) OnRecordingFinish(c *gin.Context) {
"priority": 1,
}
- if err := h.hub.SendToDevice(c.Request.Context(), deviceID, uploadRequest); err != nil {
- logger.Printf("[RECORDER] Failed to send upload_request to device %s: %v", deviceID, err)
- c.JSON(http.StatusInternalServerError, gin.H{
+ writeTimeout := h.axonTransferWriteTimeout()
+ if err := h.hub.SendToDeviceWithTimeout(c.Request.Context(), deviceID, uploadRequest, writeTimeout); err != nil {
+ status := http.StatusInternalServerError
+ if errors.Is(err, services.ErrTransferWriteTimeout) {
+ status = http.StatusGatewayTimeout
+ logger.Printf("[TRANSFER] Device %s: auto upload_request timed out after %s for task=%s: %v", deviceID, timeoutLogValue(writeTimeout), callback.TaskID, err)
+ } else {
+ logger.Printf("[RECORDER] Failed to send upload_request to device %s: %v", deviceID, err)
+ }
+ c.JSON(status, gin.H{
"error_msg": "Failed to trigger upload: " + err.Error(),
})
return
diff --git a/internal/api/handlers/task_state_recovery_test.go b/internal/api/handlers/task_state_recovery_test.go
index bb1f17e..bc824d6 100644
--- a/internal/api/handlers/task_state_recovery_test.go
+++ b/internal/api/handlers/task_state_recovery_test.go
@@ -6,14 +6,19 @@ package handlers
import (
"bytes"
+ "context"
"encoding/json"
+ "fmt"
"net/http"
"net/http/httptest"
+ "strings"
"testing"
"time"
"archebase.com/keystone-edge/internal/config"
"archebase.com/keystone-edge/internal/services"
+ "github.com/coder/websocket"
+ "github.com/coder/websocket/wsjson"
"github.com/gin-gonic/gin"
"github.com/jmoiron/sqlx"
_ "modernc.org/sqlite"
@@ -59,6 +64,323 @@ func TestRecorderStateUpdateRecordingAdvancesPendingTask(t *testing.T) {
assertTaskStateRecoveryTimestampSet(t, db, "task-recording", "started_at")
}
+func TestRecorderStateUpdatePausedAdvancesPendingTask(t *testing.T) {
+ db := newTaskStateRecoveryDB(t)
+ defer db.Close()
+ seedTaskStateRecoveryTask(t, db, "task-paused", "pending")
+
+ hub := services.NewRecorderHub()
+ handler := NewRecorderHandler(hub, &config.RecorderConfig{}, db)
+ rc := hub.NewRecorderConn(nil, "robot-001", "127.0.0.1")
+
+ handler.handleStateUpdate(rc, map[string]interface{}{
+ "data": map[string]interface{}{
+ "current": "paused",
+ "task_id": "task-paused",
+ },
+ })
+
+ assertTaskStateRecoveryStatus(t, db, "task-paused", "in_progress")
+ assertTaskStateRecoveryTimestampSet(t, db, "task-paused", "started_at")
+}
+
+func TestRecorderGetStateSnapshotReadyRestoresPendingTask(t *testing.T) {
+ db := newTaskStateRecoveryDB(t)
+ defer db.Close()
+ seedTaskStateRecoveryTask(t, db, "task-rpc-ready", "pending")
+
+ hub := services.NewRecorderHub()
+ handler := NewRecorderHandler(hub, &config.RecorderConfig{}, db)
+ rc := hub.NewRecorderConn(nil, "robot-001", "127.0.0.1")
+ state := recorderStateFromRPCData(map[string]interface{}{
+ "state": "ready",
+ "task_config": map[string]interface{}{
+ "task_id": "task-rpc-ready",
+ },
+ })
+
+ _ = handler.applyRecorderStateSnapshot(rc, state, "get_state")
+
+ assertTaskStateRecoveryStatus(t, db, "task-rpc-ready", "ready")
+ assertTaskStateRecoveryTimestampSet(t, db, "task-rpc-ready", "ready_at")
+}
+
+func TestRecorderGetStateReportsSyncingBeforeInitialStateSnapshot(t *testing.T) {
+ db := newTaskStateRecoveryDB(t)
+ defer db.Close()
+
+ hub := services.NewRecorderHub()
+ attachRecorderRPCResponder(t, hub, "robot-001", func(req services.RPCRequest) services.RPCResponse {
+ return services.RPCResponse{Success: true}
+ })
+ handler := NewRecorderHandler(hub, &config.RecorderConfig{}, db)
+
+ gin.SetMode(gin.TestMode)
+ router := gin.New()
+ router.GET("/recorder/:device_id/state", handler.GetState)
+ req := httptest.NewRequest(http.MethodGet, "/recorder/robot-001/state", nil)
+ w := httptest.NewRecorder()
+
+ router.ServeHTTP(w, req)
+
+ if w.Code != http.StatusOK {
+ t.Fatalf("status=%d want=%d body=%s", w.Code, http.StatusOK, w.Body.String())
+ }
+
+ var body map[string]interface{}
+ if err := json.Unmarshal(w.Body.Bytes(), &body); err != nil {
+ t.Fatalf("decode state response: %v", err)
+ }
+ if body["connected"] != true {
+ t.Fatalf("connected=%v want=true body=%v", body["connected"], body)
+ }
+ if body["state_synced"] != false {
+ t.Fatalf("state_synced=%v want=false body=%v", body["state_synced"], body)
+ }
+ if body["syncing"] != true {
+ t.Fatalf("syncing=%v want=true body=%v", body["syncing"], body)
+ }
+}
+
+func TestRecorderGetStateReportsSyncedAfterInitialStateSnapshot(t *testing.T) {
+ db := newTaskStateRecoveryDB(t)
+ defer db.Close()
+
+ hub := services.NewRecorderHub()
+ handler := NewRecorderHandler(hub, &config.RecorderConfig{}, db)
+ rc := hub.NewRecorderConn(nil, "robot-001", "127.0.0.1")
+ if !hub.Connect("robot-001", rc) {
+ t.Fatalf("connect recorder: initial connect failed")
+ }
+ _ = handler.applyRecorderStateSnapshot(rc, services.RecorderState{CurrentState: "idle"}, "state_update")
+
+ gin.SetMode(gin.TestMode)
+ router := gin.New()
+ router.GET("/recorder/:device_id/state", handler.GetState)
+ req := httptest.NewRequest(http.MethodGet, "/recorder/robot-001/state", nil)
+ w := httptest.NewRecorder()
+
+ router.ServeHTTP(w, req)
+
+ if w.Code != http.StatusOK {
+ t.Fatalf("status=%d want=%d body=%s", w.Code, http.StatusOK, w.Body.String())
+ }
+
+ var body map[string]interface{}
+ if err := json.Unmarshal(w.Body.Bytes(), &body); err != nil {
+ t.Fatalf("decode state response: %v", err)
+ }
+ if body["state_synced"] != true {
+ t.Fatalf("state_synced=%v want=true body=%v", body["state_synced"], body)
+ }
+ if body["syncing"] != false {
+ t.Fatalf("syncing=%v want=false body=%v", body["syncing"], body)
+ }
+}
+
+func TestRecorderConfigRejectsBeforeInitialStateSnapshot(t *testing.T) {
+ db := newTaskStateRecoveryDB(t)
+ defer db.Close()
+ seedTaskStateRecoveryTask(t, db, "task-config-syncing", "pending")
+
+ hub := services.NewRecorderHub()
+ rpcCalled := make(chan struct{}, 1)
+ attachRecorderRPCResponder(t, hub, "robot-001", func(req services.RPCRequest) services.RPCResponse {
+ rpcCalled <- struct{}{}
+ return services.RPCResponse{Success: true}
+ })
+ handler := NewRecorderHandler(hub, &config.RecorderConfig{ResponseTimeout: 1}, db)
+
+ gin.SetMode(gin.TestMode)
+ router := gin.New()
+ router.POST("/recorder/:device_id/config", handler.Config)
+ body := []byte(`{"task_config":{"task_id":"task-config-syncing"}}`)
+ req := httptest.NewRequest(http.MethodPost, "/recorder/robot-001/config", bytes.NewReader(body))
+ req.Header.Set("Content-Type", "application/json")
+ w := httptest.NewRecorder()
+
+ router.ServeHTTP(w, req)
+
+ if w.Code != http.StatusConflict {
+ t.Fatalf("status=%d want=%d body=%s", w.Code, http.StatusConflict, w.Body.String())
+ }
+ select {
+ case <-rpcCalled:
+ t.Fatalf("config RPC was sent before recorder state sync completed")
+ default:
+ }
+ assertTaskStateRecoveryStatus(t, db, "task-config-syncing", "pending")
+}
+
+func TestRecorderConfigRejectsWhenSyncedRecorderBusy(t *testing.T) {
+ db := newTaskStateRecoveryDB(t)
+ defer db.Close()
+ seedTaskStateRecoveryTask(t, db, "task-config-busy", "pending")
+
+ hub := services.NewRecorderHub()
+ rpcCalled := make(chan struct{}, 1)
+ rc := attachRecorderRPCResponderWithConn(t, hub, "robot-001", func(req services.RPCRequest) services.RPCResponse {
+ rpcCalled <- struct{}{}
+ return services.RPCResponse{Success: true}
+ })
+ handler := NewRecorderHandler(hub, &config.RecorderConfig{ResponseTimeout: 1}, db)
+ _ = handler.applyRecorderStateSnapshot(rc, services.RecorderState{
+ CurrentState: "ready",
+ TaskID: "existing-task",
+ }, "state_update")
+
+ gin.SetMode(gin.TestMode)
+ router := gin.New()
+ router.POST("/recorder/:device_id/config", handler.Config)
+ body := []byte(`{"task_config":{"task_id":"task-config-busy"}}`)
+ req := httptest.NewRequest(http.MethodPost, "/recorder/robot-001/config", bytes.NewReader(body))
+ req.Header.Set("Content-Type", "application/json")
+ w := httptest.NewRecorder()
+
+ router.ServeHTTP(w, req)
+
+ if w.Code != http.StatusConflict {
+ t.Fatalf("status=%d want=%d body=%s", w.Code, http.StatusConflict, w.Body.String())
+ }
+ select {
+ case <-rpcCalled:
+ t.Fatalf("config RPC was sent while recorder was already busy")
+ default:
+ }
+ assertTaskStateRecoveryStatus(t, db, "task-config-busy", "pending")
+}
+
+func TestRecorderConfigAdvancesTaskAfterInitialStateSnapshot(t *testing.T) {
+ db := newTaskStateRecoveryDB(t)
+ defer db.Close()
+ seedTaskStateRecoveryTask(t, db, "task-config-synced", "pending")
+
+ hub := services.NewRecorderHub()
+ rc := attachRecorderRPCResponderWithConn(t, hub, "robot-001", func(req services.RPCRequest) services.RPCResponse {
+ return services.RPCResponse{Success: true}
+ })
+ handler := NewRecorderHandler(hub, &config.RecorderConfig{ResponseTimeout: 1}, db)
+ _ = handler.applyRecorderStateSnapshot(rc, services.RecorderState{CurrentState: "idle"}, "state_update")
+
+ gin.SetMode(gin.TestMode)
+ router := gin.New()
+ router.POST("/recorder/:device_id/config", handler.Config)
+ body := []byte(`{"task_config":{"task_id":"task-config-synced"}}`)
+ req := httptest.NewRequest(http.MethodPost, "/recorder/robot-001/config", bytes.NewReader(body))
+ req.Header.Set("Content-Type", "application/json")
+ w := httptest.NewRecorder()
+
+ router.ServeHTTP(w, req)
+
+ if w.Code != http.StatusOK {
+ t.Fatalf("status=%d want=%d body=%s", w.Code, http.StatusOK, w.Body.String())
+ }
+ assertTaskStateRecoveryStatus(t, db, "task-config-synced", "ready")
+}
+
+func TestRecorderConfigDoesNotAdvanceTaskWhenRPCResponseUnsuccessful(t *testing.T) {
+ db := newTaskStateRecoveryDB(t)
+ defer db.Close()
+ seedTaskStateRecoveryTask(t, db, "task-config-false", "pending")
+
+ hub := services.NewRecorderHub()
+ rc := attachRecorderRPCResponderWithConn(t, hub, "robot-001", func(req services.RPCRequest) services.RPCResponse {
+ return services.RPCResponse{Success: false, Message: "device rejected config"}
+ })
+ handler := NewRecorderHandler(hub, &config.RecorderConfig{ResponseTimeout: 1}, db)
+ _ = handler.applyRecorderStateSnapshot(rc, services.RecorderState{CurrentState: "idle"}, "state_update")
+
+ gin.SetMode(gin.TestMode)
+ router := gin.New()
+ router.POST("/recorder/:device_id/config", handler.Config)
+ body := []byte(`{"task_config":{"task_id":"task-config-false"}}`)
+ req := httptest.NewRequest(http.MethodPost, "/recorder/robot-001/config", bytes.NewReader(body))
+ req.Header.Set("Content-Type", "application/json")
+ w := httptest.NewRecorder()
+
+ router.ServeHTTP(w, req)
+
+ if w.Code != http.StatusOK {
+ t.Fatalf("status=%d want=%d body=%s", w.Code, http.StatusOK, w.Body.String())
+ }
+ assertTaskStateRecoveryStatus(t, db, "task-config-false", "pending")
+}
+
+func TestRecorderBeginDoesNotAdvanceTaskWhenRPCResponseUnsuccessful(t *testing.T) {
+ db := newTaskStateRecoveryDB(t)
+ defer db.Close()
+ seedTaskStateRecoveryTask(t, db, "task-begin-false", "pending")
+
+ hub := services.NewRecorderHub()
+ attachRecorderRPCResponder(t, hub, "robot-001", func(req services.RPCRequest) services.RPCResponse {
+ return services.RPCResponse{Success: false, Message: "device rejected begin"}
+ })
+ handler := NewRecorderHandler(hub, &config.RecorderConfig{ResponseTimeout: 1}, db)
+
+ gin.SetMode(gin.TestMode)
+ router := gin.New()
+ router.POST("/recorder/:device_id/begin", handler.Begin)
+ body := []byte(`{"task_id":"task-begin-false"}`)
+ req := httptest.NewRequest(http.MethodPost, "/recorder/robot-001/begin", bytes.NewReader(body))
+ req.Header.Set("Content-Type", "application/json")
+ w := httptest.NewRecorder()
+
+ router.ServeHTTP(w, req)
+
+ if w.Code != http.StatusOK {
+ t.Fatalf("status=%d want=%d body=%s", w.Code, http.StatusOK, w.Body.String())
+ }
+ assertTaskStateRecoveryStatus(t, db, "task-begin-false", "pending")
+}
+
+func TestRecorderIgnoresStateChangingMessagesFromReplacedConnection(t *testing.T) {
+ tests := []struct {
+ name string
+ msg map[string]interface{}
+ }{
+ {
+ name: "state_update",
+ msg: map[string]interface{}{
+ "type": "state_update",
+ "data": map[string]interface{}{
+ "current": "ready",
+ "task_id": "task-replaced",
+ },
+ },
+ },
+ {
+ name: "config_applied",
+ msg: map[string]interface{}{
+ "type": "config_applied",
+ "data": map[string]interface{}{
+ "task_id": "task-replaced",
+ },
+ },
+ },
+ }
+
+ for _, tt := range tests {
+ t.Run(tt.name, func(t *testing.T) {
+ db := newTaskStateRecoveryDB(t)
+ defer db.Close()
+ seedTaskStateRecoveryTask(t, db, "task-replaced", "pending")
+
+ hub := services.NewRecorderHub()
+ handler := NewRecorderHandler(hub, &config.RecorderConfig{}, db)
+ oldConn := hub.NewRecorderConn(&websocket.Conn{}, "robot-001", "127.0.0.1")
+ if !hub.Connect("robot-001", oldConn) {
+ t.Fatalf("initial connect failed")
+ }
+ newConn := hub.NewRecorderConn(&websocket.Conn{}, "robot-001", "127.0.0.1")
+ hub.ConnectReplacingExisting("robot-001", newConn)
+
+ handler.handleMessage("robot-001", oldConn, tt.msg)
+
+ assertTaskStateRecoveryStatus(t, db, "task-replaced", "pending")
+ })
+ }
+}
+
func TestRecordingStartCallbackAdvancesPendingTask(t *testing.T) {
db := newTaskStateRecoveryDB(t)
defer db.Close()
@@ -90,6 +412,89 @@ func TestRecordingStartCallbackAdvancesPendingTask(t *testing.T) {
assertTaskStateRecoveryTimestampSet(t, db, "task-start", "started_at")
}
+func TestTaskHandlerAxonTransferWriteTimeout(t *testing.T) {
+ custom := 250 * time.Millisecond
+ handler := NewTaskHandler(nil, nil, nil, 0, custom)
+ if got := handler.axonTransferWriteTimeout(); got != custom {
+ t.Fatalf("axonTransferWriteTimeout()=%s want=%s", got, custom)
+ }
+
+ handler = NewTaskHandler(nil, nil, nil, 0, 0)
+ if got := handler.axonTransferWriteTimeout(); got != services.DefaultTransferWriteTimeout {
+ t.Fatalf("default axonTransferWriteTimeout()=%s want=%s", got, services.DefaultTransferWriteTimeout)
+ }
+}
+
+func TestRecordingFinishAutoUploadUsesConfiguredTransferWriteTimeout(t *testing.T) {
+ custom := 250 * time.Millisecond
+ hub := &recordingFinishTransferHub{
+ conn: &services.TransferConn{DeviceID: "robot-001"},
+ }
+ handler := &TaskHandler{hub: hub, transferWriteTimeout: custom}
+
+ gin.SetMode(gin.TestMode)
+ router := gin.New()
+ handler.RegisterCallbackRoutes(router.Group("/callbacks"))
+
+ body, err := json.Marshal(RecordingFinishCallback{
+ TaskID: "task-finish",
+ DeviceID: "robot-001",
+ Status: "finished",
+ FinishedAt: time.Now().UTC().Format(time.RFC3339),
+ OutputPath: "/data/task-finish.mcap",
+ })
+ if err != nil {
+ t.Fatalf("marshal callback: %v", err)
+ }
+
+ req := httptest.NewRequest(http.MethodPost, "/callbacks/finish", bytes.NewReader(body))
+ req.Header.Set("Content-Type", "application/json")
+ w := httptest.NewRecorder()
+ router.ServeHTTP(w, req)
+
+ if w.Code != http.StatusGatewayTimeout {
+ t.Fatalf("status=%d want=%d body=%s", w.Code, http.StatusGatewayTimeout, w.Body.String())
+ }
+ if hub.getDeviceID != "robot-001" {
+ t.Fatalf("Get device=%q want=%q", hub.getDeviceID, "robot-001")
+ }
+ if hub.sendDeviceID != "robot-001" {
+ t.Fatalf("Send device=%q want=%q", hub.sendDeviceID, "robot-001")
+ }
+ if hub.timeout != custom {
+ t.Fatalf("Send timeout=%s want=%s", hub.timeout, custom)
+ }
+ if got := fmt.Sprint(hub.msg["type"]); got != "upload_request" {
+ t.Fatalf("message type=%q want=%q", got, "upload_request")
+ }
+ if got := fmt.Sprint(hub.msg["task_id"]); got != "task-finish" {
+ t.Fatalf("message task_id=%q want=%q", got, "task-finish")
+ }
+ if !strings.Contains(w.Body.String(), custom.String()) {
+ t.Fatalf("response body %q does not mention custom timeout %s", w.Body.String(), custom)
+ }
+}
+
+type recordingFinishTransferHub struct {
+ conn *services.TransferConn
+ getDeviceID string
+ sendDeviceID string
+ msg map[string]interface{}
+ timeout time.Duration
+}
+
+func (h *recordingFinishTransferHub) Get(deviceID string) *services.TransferConn {
+ h.getDeviceID = deviceID
+ return h.conn
+}
+
+func (h *recordingFinishTransferHub) SendToDeviceWithTimeout(ctx context.Context, deviceID string, msg map[string]interface{}, timeout time.Duration) error {
+ h.sendDeviceID = deviceID
+ h.msg = msg
+ h.timeout = timeout
+ return fmt.Errorf("%w after %s: fake blocked transfer write", services.ErrTransferWriteTimeout, timeout)
+}
+
func newTaskStateRecoveryDB(t *testing.T) *sqlx.DB {
t.Helper()
db, err := sqlx.Open("sqlite", ":memory:")
@@ -150,3 +555,73 @@ func assertTaskStateRecoveryTimestampSet(t *testing.T, db *sqlx.DB, taskID strin
t.Fatalf("task %s was not set", column)
}
}
+
+func attachRecorderRPCResponder(t *testing.T, hub *services.RecorderHub, deviceID string, respond func(services.RPCRequest) services.RPCResponse) {
+ t.Helper()
+ attachRecorderRPCResponderWithConn(t, hub, deviceID, respond)
+}
+
+func attachRecorderRPCResponderWithConn(t *testing.T, hub *services.RecorderHub, deviceID string, respond func(services.RPCRequest) services.RPCResponse) *services.RecorderConn {
+ t.Helper()
+ serverConn, clientConn := newRecorderHandlerTestWebSocketPair(t)
+ rc := hub.NewRecorderConn(serverConn, deviceID, "127.0.0.1")
+ if !hub.Connect(deviceID, rc) {
+ t.Fatalf("connect recorder: initial connect failed")
+ }
+
+ ctx, cancel := context.WithCancel(context.Background())
+ t.Cleanup(cancel)
+ go func() {
+ for {
+ var req services.RPCRequest
+ if err := wsjson.Read(ctx, clientConn, &req); err != nil {
+ return
+ }
+ resp := respond(req)
+ if resp.Type == "" {
+ resp.Type = "rpc_response"
+ }
+ if resp.RequestID == "" {
+ resp.RequestID = req.RequestID
+ }
+ hub.HandleRPCResponse(deviceID, &resp)
+ }
+ }()
+ return rc
+}
+
+func newRecorderHandlerTestWebSocketPair(t *testing.T) (*websocket.Conn, *websocket.Conn) {
+ t.Helper()
+ serverConnC := make(chan *websocket.Conn, 1)
+ errC := make(chan error, 1)
+
+ server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+ conn, err := websocket.Accept(w, r, &websocket.AcceptOptions{InsecureSkipVerify: true})
+ if err != nil {
+ errC <- err
+ return
+ }
+ serverConnC <- conn
+ <-r.Context().Done()
+ }))
+ t.Cleanup(server.Close)
+
+ clientConn, _, err := websocket.Dial(context.Background(), "ws"+strings.TrimPrefix(server.URL, "http"), nil)
+ if err != nil {
+ t.Fatalf("dial websocket: %v", err)
+ }
+
+ select {
+ case serverConn := <-serverConnC:
+ t.Cleanup(func() {
+ _ = clientConn.CloseNow()
+ _ = serverConn.CloseNow()
+ })
+ return serverConn, clientConn
+ case err := <-errC:
+ t.Fatalf("accept websocket: %v", err)
+ case <-time.After(time.Second):
+ t.Fatalf("timed out waiting for websocket accept")
+ }
+ return nil, nil
+}
diff --git a/internal/api/handlers/transfer.go b/internal/api/handlers/transfer.go
index b282db7..1e70ce5 100644
--- a/internal/api/handlers/transfer.go
+++ b/internal/api/handlers/transfer.go
@@ -20,7 +20,6 @@ import (
"time"
"github.com/coder/websocket"
- "github.com/coder/websocket/wsjson"
"github.com/gin-gonic/gin"
"github.com/google/uuid"
"github.com/jmoiron/sqlx"
@@ -45,9 +44,7 @@ type TransferHandler struct {
// recorderHub is used on transfer disconnect to notify recorder (clear/cancel) before reverting tasks.
recorderHub *services.RecorderHub
recorderRPCTimeout time.Duration
-
- uploadNotFoundMu sync.Mutex
- uploadNotFoundAttempts map[string]int
+ stateBroker *services.DeviceStateBroker
}
// NewTransferHandler creates a new TransferHandler.
@@ -55,48 +52,26 @@ type TransferHandler struct {
// recorderHub may be nil (disables recorder RPC on transfer disconnect).
func NewTransferHandler(hub *services.TransferHub, cfg *config.TransferConfig, db *sqlx.DB, s3Client *s3.Client, bucket string, factoryID string, recorderHub *services.RecorderHub, recorderRPCTimeout time.Duration) *TransferHandler {
return &TransferHandler{
- hub: hub,
- cfg: cfg,
- db: db,
- s3: s3Client,
- bucket: bucket,
- factoryID: factoryID,
- recorderHub: recorderHub,
- recorderRPCTimeout: recorderRPCTimeout,
- uploadNotFoundAttempts: make(map[string]int),
+ hub: hub,
+ cfg: cfg,
+ db: db,
+ s3: s3Client,
+ bucket: bucket,
+ factoryID: factoryID,
+ recorderHub: recorderHub,
+ recorderRPCTimeout: recorderRPCTimeout,
client: &http.Client{
Timeout: 10 * time.Second,
},
}
}
-func uploadNotFoundKey(deviceID, taskID string) string {
- return deviceID + "|" + taskID
-}
-
-func (h *TransferHandler) clearUploadNotFoundAttempts(deviceID, taskID string) {
- if strings.TrimSpace(deviceID) == "" || strings.TrimSpace(taskID) == "" {
- return
- }
- key := uploadNotFoundKey(deviceID, taskID)
- h.uploadNotFoundMu.Lock()
- delete(h.uploadNotFoundAttempts, key)
- h.uploadNotFoundMu.Unlock()
-}
-
-func (h *TransferHandler) clearUploadNotFoundAttemptsByDevice(deviceID string) {
- deviceID = strings.TrimSpace(deviceID)
- if deviceID == "" {
+// SetDeviceStateBroker enables device connection event publishing.
+func (h *TransferHandler) SetDeviceStateBroker(broker *services.DeviceStateBroker) {
+ if h == nil {
return
}
- prefix := deviceID + "|"
- h.uploadNotFoundMu.Lock()
- for key := range h.uploadNotFoundAttempts {
- if strings.HasPrefix(key, prefix) {
- delete(h.uploadNotFoundAttempts, key)
- }
- }
- h.uploadNotFoundMu.Unlock()
+ h.stateBroker = broker
}
// RegisterRoutes registers all transfer-related REST routes
@@ -121,7 +96,8 @@ func (h *TransferHandler) HandleWebSocket(w http.ResponseWriter, r *http.Request
// Validate device exists in robots table (if DB is configured)
if h.db != nil {
// Add independent 5s timeout to avoid blocking on slow DB queries
- queryCtx, cancel := context.WithTimeout(r.Context(), 5*time.Second)
+ queryTimeout := 5 * time.Second
+ queryCtx, cancel := context.WithTimeout(r.Context(), queryTimeout)
defer cancel()
var count int
@@ -129,7 +105,11 @@ func (h *TransferHandler) HandleWebSocket(w http.ResponseWriter, r *http.Request
if err := h.db.GetContext(queryCtx, &count,
"SELECT COUNT(1) FROM robots WHERE device_id = ? AND deleted_at IS NULL", deviceID,
); err != nil {
- logger.Printf("[TRANSFER] Device %s: DB query error: %v", deviceID, err)
+ if errors.Is(err, context.DeadlineExceeded) || errors.Is(queryCtx.Err(), context.DeadlineExceeded) {
+ logger.Printf("[TRANSFER] Device %s: DB query timeout after %s (timeout_ms=%d): %v", deviceID, timeoutLogValue(queryTimeout), timeoutLogMilliseconds(queryTimeout), err)
+ } else {
+ logger.Printf("[TRANSFER] Device %s: DB query error: %v", deviceID, err)
+ }
}
// Check count regardless of DB error (count defaults to 0 on error)
if count == 0 {
@@ -149,17 +129,9 @@ func (h *TransferHandler) HandleWebSocket(w http.ResponseWriter, r *http.Request
remoteIP := extractIP(r.RemoteAddr)
dc := h.hub.NewTransferConn(conn, deviceID, remoteIP)
- staleConn, ok := h.hub.ConnectWithStaleThreshold(deviceID, dc, transferStaleThreshold(h.cfg))
- if !ok {
- logger.Printf("[TRANSFER] Device %s: connection rejected (already connected)", deviceID)
- if err := conn.Close(websocket.StatusPolicyViolation, "device already connected"); err != nil {
- if !isExpectedWebSocketCloseError(err) {
- logger.Printf("[TRANSFER] WebSocket close error for device %s: %v", deviceID, err)
- }
- }
- return
- }
- closeStaleTransferConn(deviceID, staleConn)
+ replacedConn := h.hub.ConnectReplacingExisting(deviceID, dc)
+ closeReplacedTransferConn(deviceID, replacedConn)
+ publishDeviceConnectionEvent(h.stateBroker, h.recorderHub, h.hub, deviceID, "transfer_connected")
defer func() {
if err := conn.Close(websocket.StatusNormalClosure, ""); err != nil {
@@ -168,9 +140,9 @@ func (h *TransferHandler) HandleWebSocket(w http.ResponseWriter, r *http.Request
}
}
}()
- defer h.clearUploadNotFoundAttemptsByDevice(deviceID)
defer func() {
if h.hub.Disconnect(deviceID, dc) {
+ publishDeviceConnectionEvent(h.stateBroker, h.recorderHub, h.hub, deviceID, "transfer_disconnected")
revertRunnableTasksOnDeviceDisconnect(h.db, deviceID, h.recorderHub, h.recorderRPCTimeout, true)
}
}()
@@ -227,10 +199,11 @@ func (h *TransferHandler) pingLoop(ctx context.Context, dc *services.TransferCon
case <-ticker.C:
pingCtx, cancel := context.WithTimeout(ctx, timeout)
err := dc.Conn.Ping(pingCtx)
+ timedOut := errors.Is(err, context.DeadlineExceeded) || errors.Is(pingCtx.Err(), context.DeadlineExceeded)
cancel()
if err != nil {
if ctx.Err() == nil {
- logger.Printf("[TRANSFER] Device %s ping failed: %v", dc.DeviceID, err)
+ logWebSocketPingFailure("TRANSFER", "Device", dc.DeviceID, timeout, timedOut, err)
if closeErr := dc.Conn.CloseNow(); closeErr != nil {
if !isExpectedWebSocketCloseError(closeErr) {
logger.Printf("[TRANSFER] Device %s close after ping failure: %v", dc.DeviceID, closeErr)
@@ -260,27 +233,62 @@ func transferPingTimeout(cfg *config.TransferConfig) time.Duration {
return time.Duration(cfg.PingTimeout) * time.Second
}
-func transferStaleThreshold(cfg *config.TransferConfig) time.Duration {
- if cfg == nil || cfg.StaleThreshold <= 0 {
- return 0
+func transferWriteTimeout(cfg *config.TransferConfig) time.Duration {
+ if cfg == nil || cfg.WriteTimeout <= 0 {
+ return services.DefaultTransferWriteTimeout
}
- return time.Duration(cfg.StaleThreshold) * time.Second
+ return time.Duration(cfg.WriteTimeout) * time.Second
}
-func closeStaleTransferConn(deviceID string, dc *services.TransferConn) {
+func transferMessageType(msg map[string]interface{}) string {
+ msgType := strings.TrimSpace(stringVal(msg, "type"))
+ if msgType == "" {
+ return "message"
+ }
+ return msgType
+}
+
+func logTransferSendFailure(deviceID string, msgType string, timeout time.Duration, err error) {
+ if errors.Is(err, services.ErrTransferWriteTimeout) {
+ logger.Printf("[TRANSFER] Device %s: send %s timed out after %s: %v", deviceID, msgType, timeoutLogValue(timeout), err)
+ return
+ }
+ logger.Printf("[TRANSFER] Device %s: failed to send %s: %v", deviceID, msgType, err)
+}
+
+func (h *TransferHandler) sendToDevice(c *gin.Context, deviceID string, msg map[string]interface{}) bool {
+ timeout := transferWriteTimeout(h.cfg)
+ if err := h.hub.SendToDeviceWithTimeout(c.Request.Context(), deviceID, msg, timeout); err != nil {
+ logTransferSendFailure(deviceID, transferMessageType(msg), timeout, err)
+ status := http.StatusNotFound
+ if errors.Is(err, services.ErrTransferWriteTimeout) {
+ status = http.StatusGatewayTimeout
+ }
+ c.JSON(status, gin.H{"error": err.Error()})
+ return false
+ }
+ return true
+}
+
+func closeReplacedTransferConn(deviceID string, dc *services.TransferConn) {
if dc == nil || dc.Conn == nil {
return
}
- logger.Printf("[TRANSFER] Device %s: closing stale WebSocket connection", deviceID)
+ logger.Printf("[TRANSFER] Device %s: closing replaced WebSocket connection", deviceID)
if err := dc.Conn.CloseNow(); err != nil {
if !isExpectedWebSocketCloseError(err) {
- logger.Printf("[TRANSFER] Device %s: stale WebSocket close error: %v", deviceID, err)
+ logger.Printf("[TRANSFER] Device %s: replaced WebSocket close error: %v", deviceID, err)
}
}
}
// handleMessage dispatches an inbound WebSocket message to the appropriate handler
func (h *TransferHandler) handleMessage(ctx context.Context, dc *services.TransferConn, msg map[string]interface{}) {
+ if h.hub.Get(dc.DeviceID) != dc {
+ logger.Printf("[TRANSFER] Device %s ignored message from replaced connection", dc.DeviceID)
+ return
+ }
+
msgType, _ := msg["type"].(string)
switch msgType {
@@ -330,7 +338,6 @@ func (h *TransferHandler) onUploadStarted(dc *services.TransferConn, msg map[str
return
}
taskID := stringVal(data, "task_id")
- h.clearUploadNotFoundAttempts(dc.DeviceID, taskID)
// #nosec G706 -- Set aside for now
logger.Printf("[TRANSFER] Device %s: upload started task=%s total_bytes=%d",
dc.DeviceID, taskID, int64Val(data, "total_bytes"))
@@ -412,7 +419,6 @@ func (h *TransferHandler) onUploadComplete(ctx context.Context, dc *services.Tra
logger.Printf("[TRANSFER] Device %s: upload complete taskID is empty", dc.DeviceID)
return
}
- h.clearUploadNotFoundAttempts(dc.DeviceID, taskID)
// #nosec G706 -- Set aside for now
logger.Printf("[TRANSFER] Device %s: upload complete for task=%s", dc.DeviceID, taskID)
@@ -679,15 +685,11 @@ func (h *TransferHandler) onUploadComplete(ctx context.Context, dc *services.Tra
"type": "upload_ack",
"task_id": taskID,
}
- dc.WriteMu.Lock()
- if err := wsjson.Write(ctx, dc.Conn, ackMsg); err != nil {
- dc.WriteMu.Unlock()
- // #nosec G706 -- Set aside for now
- logger.Printf("[TRANSFER] Device %s: failed to send upload_ack for task=%s: %v", dc.DeviceID, taskID, err)
+ writeTimeout := transferWriteTimeout(h.cfg)
+ if err := h.hub.SendToConnWithTimeout(ctx, dc, ackMsg, writeTimeout); err != nil {
+ logTransferSendFailure(dc.DeviceID, "upload_ack", writeTimeout, err)
return
}
- dc.WriteMu.Unlock()
- dc.RecordEvent("outbound", ackMsg)
// #nosec G706 -- Set aside for now
logger.Printf("[TRANSFER] Device %s: upload_ack sent for task=%s", dc.DeviceID, taskID)
@@ -726,7 +728,6 @@ func (h *TransferHandler) onUploadFailed(ctx context.Context, dc *services.Trans
taskID := stringVal(data, "task_id")
reason := stringVal(data, "reason")
retryCount := intVal(data, "retry_count")
- h.clearUploadNotFoundAttempts(dc.DeviceID, taskID)
// Log full message for debugging
// #nosec G706 -- Set aside for now
@@ -856,11 +857,19 @@ func revertRunnableTasksOnDeviceDisconnect(db *sqlx.DB, deviceID string, recorde
switch st {
case "ready":
if _, err := recorderHub.SendRPC(rpcCtx, deviceID, "clear", nil, timeout); err != nil {
- logger.Printf("[DEVICE] Device %s: recorder clear after transfer disconnect failed (task=%s): %v", deviceID, tid, err)
+ if errors.Is(err, services.ErrRecorderRPCTimeout) {
+ logRecorderRPCTimeout(deviceID, "clear", tid, "transfer_disconnect", timeout, err)
+ } else {
+ logger.Printf("[DEVICE] Device %s: recorder clear after transfer disconnect failed (task=%s): %v", deviceID, tid, err)
+ }
}
case "in_progress":
if _, err := recorderHub.SendRPC(rpcCtx, deviceID, "cancel", map[string]interface{}{"task_id": tid}, timeout); err != nil {
- logger.Printf("[DEVICE] Device %s: recorder cancel after transfer disconnect failed (task=%s): %v", deviceID, tid, err)
+ if errors.Is(err, services.ErrRecorderRPCTimeout) {
+ logRecorderRPCTimeout(deviceID, "cancel", tid, "transfer_disconnect", timeout, err)
+ } else {
+ logger.Printf("[DEVICE] Device %s: recorder cancel after transfer disconnect failed (task=%s): %v", deviceID, tid, err)
+ }
}
}
}
@@ -890,102 +899,22 @@ func revertRunnableTasksOnDeviceDisconnect(db *sqlx.DB, deviceID string, recorde
}
}
-// onUploadNotFound handles "upload_not_found" message
+// onUploadNotFound handles "upload_not_found" message. Keystone only logs this
+// report; it does not retry upload_request because the transfer side owns local
+// file discovery and retry policy.
func (h *TransferHandler) onUploadNotFound(dc *services.TransferConn, msg map[string]interface{}) {
data, _ := msg["data"].(map[string]interface{})
if data == nil {
+ logger.Printf("[TRANSFER] ERROR: Device %s reported upload_not_found with empty data", dc.DeviceID)
return
}
- taskID := stringVal(data, "task_id")
- detail := stringVal(data, "detail")
- if strings.TrimSpace(taskID) == "" {
- return
- }
-
- const maxAttempts = 5
- key := uploadNotFoundKey(dc.DeviceID, taskID)
-
- h.uploadNotFoundMu.Lock()
- attempt := h.uploadNotFoundAttempts[key] + 1
- h.uploadNotFoundAttempts[key] = attempt
- h.uploadNotFoundMu.Unlock()
-
- // #nosec G706 -- Set aside for now
- logger.Printf("[TRANSFER] Device %s: task=%s not found (attempt=%d/%d) detail=%q", dc.DeviceID, taskID, attempt, maxAttempts, detail)
-
- if attempt < maxAttempts {
- // Simple backoff: 1s, 2s, 3s, 4s (capped).
- delay := time.Duration(attempt) * time.Second
- if delay > 5*time.Second {
- delay = 5 * time.Second
- }
-
- go func(deviceID, tid string, attemptNow int, d time.Duration) {
- time.Sleep(d)
- if h.hub.Get(deviceID) == nil {
- logger.Printf("[TRANSFER] Device %s: skip retry upload_request for task=%s (device not connected)", deviceID, tid)
- return
- }
-
- keyNow := uploadNotFoundKey(deviceID, tid)
- h.uploadNotFoundMu.Lock()
- currentAttempt, ok := h.uploadNotFoundAttempts[keyNow]
- h.uploadNotFoundMu.Unlock()
- if !ok || currentAttempt != attemptNow {
- logger.Printf("[TRANSFER] Device %s: skip stale retry upload_request for task=%s attempt=%d", deviceID, tid, attemptNow)
- return
- }
-
- req := map[string]interface{}{
- "type": "upload_request",
- "task_id": tid,
- "priority": 1,
- }
- if err := h.hub.SendToDevice(context.Background(), deviceID, req); err != nil {
- logger.Printf("[TRANSFER] Device %s: retry upload_request failed for task=%s attempt=%d err=%v", deviceID, tid, attemptNow, err)
- return
- }
- logger.Printf("[TRANSFER] Device %s: retry upload_request sent for task=%s attempt=%d", deviceID, tid, attemptNow)
- }(dc.DeviceID, taskID, attempt, delay)
- return
- }
-
- // Reached max attempts: stop retrying and mark task as failed (best-effort).
- h.uploadNotFoundMu.Lock()
- delete(h.uploadNotFoundAttempts, key)
- h.uploadNotFoundMu.Unlock()
-
- if h.db == nil {
- return
- }
- now := time.Now().UTC()
- detail = strings.TrimSpace(detail)
- errMsg := fmt.Sprintf("upload_not_found after %d attempts", maxAttempts)
- if detail != "" {
- errMsg = fmt.Sprintf("%s: %s", errMsg, detail)
- }
- result, err := h.db.ExecContext(context.Background(), `
- UPDATE tasks
- SET
- status = 'failed',
- error_message = ?,
- completed_at = CASE WHEN completed_at IS NULL THEN ? ELSE completed_at END,
- updated_at = ?
- WHERE task_id = ? AND status = 'in_progress' AND deleted_at IS NULL
- `, errMsg, now, now, taskID)
- if err != nil {
- logger.Printf("[TRANSFER] Device %s: failed to mark task failed on upload_not_found: task=%s err=%v", dc.DeviceID, taskID, err)
+ taskID := strings.TrimSpace(stringVal(data, "task_id"))
+ detail := strings.TrimSpace(stringVal(data, "detail"))
+ if taskID == "" {
+ logger.Printf("[TRANSFER] ERROR: Device %s reported upload_not_found without task_id detail=%q", dc.DeviceID, detail)
return
}
- if rows, _ := result.RowsAffected(); rows > 0 {
- logger.Printf("[TRANSFER] Device %s: task=%s marked as failed due to upload_not_found", dc.DeviceID, taskID)
- var batchID int64
- if err := h.db.QueryRowContext(context.Background(),
- "SELECT batch_id FROM tasks WHERE task_id = ? AND deleted_at IS NULL", taskID,
- ).Scan(&batchID); err == nil && batchID > 0 {
- go tryAdvanceBatchStatus(h.db, batchID)
- }
- }
+ logger.Printf("[TRANSFER] ERROR: Device %s reported upload_not_found task=%s detail=%q", dc.DeviceID, taskID, detail)
}
// onStatus handles "status" message and updates the device status snapshot
@@ -1061,8 +990,7 @@ func (h *TransferHandler) UploadRequest(c *gin.Context) {
"task_id": body.TaskID,
"priority": body.Priority,
}
- if err := h.hub.SendToDevice(c.Request.Context(), deviceID, msg); err != nil {
- c.JSON(http.StatusNotFound, gin.H{"error": err.Error()})
+ if !h.sendToDevice(c, deviceID, msg) {
return
}
c.JSON(http.StatusOK, gin.H{"status": "sent"})
@@ -1098,9 +1026,7 @@ func (h *TransferHandler) UploadAll(c *gin.Context) {
msg := map[string]interface{}{"type": "upload_all"}
logger.Printf("[TRANSFER] Sending message to device %s: %+v", deviceID, msg)
- if err := h.hub.SendToDevice(c.Request.Context(), deviceID, msg); err != nil {
- logger.Printf("[TRANSFER] Failed to send message to device %s: %v", deviceID, err)
- c.JSON(http.StatusNotFound, gin.H{"error": err.Error()})
+ if !h.sendToDevice(c, deviceID, msg) {
return
}
@@ -1134,8 +1060,7 @@ func (h *TransferHandler) CancelUpload(c *gin.Context) {
"type": "cancel",
"task_id": body.TaskID,
}
- if err := h.hub.SendToDevice(c.Request.Context(), deviceID, msg); err != nil {
- c.JSON(http.StatusNotFound, gin.H{"error": err.Error()})
+ if !h.sendToDevice(c, deviceID, msg) {
return
}
c.JSON(http.StatusOK, gin.H{"status": "sent"})
@@ -1161,8 +1086,7 @@ func (h *TransferHandler) StatusQuery(c *gin.Context) {
}
msg := map[string]interface{}{"type": "status_query"}
- if err := h.hub.SendToDevice(c.Request.Context(), deviceID, msg); err != nil {
- c.JSON(http.StatusNotFound, gin.H{"error": err.Error()})
+ if !h.sendToDevice(c, deviceID, msg) {
return
}
c.JSON(http.StatusOK, gin.H{"status": "sent"})
@@ -1194,8 +1118,7 @@ func (h *TransferHandler) ManualACK(c *gin.Context) {
"type": "upload_ack",
"task_id": body.TaskID,
}
- if err := h.hub.SendToDevice(c.Request.Context(), deviceID, msg); err != nil {
- c.JSON(http.StatusNotFound, gin.H{"error": err.Error()})
+ if !h.sendToDevice(c, deviceID, msg) {
return
}
diff --git a/internal/api/handlers/transfer_connection_takeover_test.go b/internal/api/handlers/transfer_connection_takeover_test.go
new file mode 100644
index 0000000..aa553f0
--- /dev/null
+++ b/internal/api/handlers/transfer_connection_takeover_test.go
@@ -0,0 +1,111 @@
+// SPDX-FileCopyrightText: 2026 ArcheBase
+//
+// SPDX-License-Identifier: MulanPSL-2.0
+
+package handlers
+
+import (
+ "context"
+ "testing"
+ "time"
+
+ "archebase.com/keystone-edge/internal/config"
+ "archebase.com/keystone-edge/internal/services"
+ "github.com/coder/websocket"
+ "github.com/jmoiron/sqlx"
+ _ "modernc.org/sqlite"
+)
+
+func TestTransferIgnoresUploadFailedFromReplacedConnection(t *testing.T) {
+ db := newTransferTakeoverDB(t)
+ defer db.Close()
+ seedTransferTakeoverTask(t, db, "task-old-upload", "in_progress")
+
+ hub := services.NewTransferHub(10)
+ handler := NewTransferHandler(hub, &config.TransferConfig{}, db, nil, "", "", nil, 0)
+ oldConn := hub.NewTransferConn(&websocket.Conn{}, "robot-001", "127.0.0.1")
+ if !hub.Connect("robot-001", oldConn) {
+ t.Fatalf("initial connect failed")
+ }
+ newConn := hub.NewTransferConn(&websocket.Conn{}, "robot-001", "127.0.0.1")
+ hub.ConnectReplacingExisting("robot-001", newConn)
+
+ handler.handleMessage(context.Background(), oldConn, map[string]interface{}{
+ "type": "upload_failed",
+ "data": map[string]interface{}{
+ "task_id": "task-old-upload",
+ "reason": "old connection message",
+ },
+ })
+
+ assertTransferTakeoverTaskStatus(t, db, "task-old-upload", "in_progress")
+}
+
+func TestTransferProcessesUploadFailedFromCurrentConnection(t *testing.T) {
+ db := newTransferTakeoverDB(t)
+ defer db.Close()
+ seedTransferTakeoverTask(t, db, "task-current-upload", "in_progress")
+
+ hub := services.NewTransferHub(10)
+ handler := NewTransferHandler(hub, &config.TransferConfig{}, db, nil, "", "", nil, 0)
+ dc := hub.NewTransferConn(&websocket.Conn{}, "robot-001", "127.0.0.1")
+ if !hub.Connect("robot-001", dc) {
+ t.Fatalf("connect failed")
+ }
+
+ handler.handleMessage(context.Background(), dc, map[string]interface{}{
+ "type": "upload_failed",
+ "data": map[string]interface{}{
+ "task_id": "task-current-upload",
+ "reason": "current connection message",
+ },
+ })
+
+ assertTransferTakeoverTaskStatus(t, db, "task-current-upload", "failed")
+}
+
+func newTransferTakeoverDB(t *testing.T) *sqlx.DB {
+ t.Helper()
+ db, err := sqlx.Open("sqlite", ":memory:")
+ if err != nil {
+ t.Fatalf("open sqlite db: %v", err)
+ }
+ if _, err := db.Exec(`CREATE TABLE tasks (
+ id INTEGER PRIMARY KEY AUTOINCREMENT,
+ task_id TEXT NOT NULL,
+ batch_id INTEGER NOT NULL DEFAULT 0,
+ status TEXT NOT NULL,
+ completed_at TIMESTAMP NULL,
+ created_at TIMESTAMP NOT NULL,
+ updated_at TIMESTAMP NOT NULL,
+ deleted_at TIMESTAMP NULL
+ )`); err != nil {
+ t.Fatalf("create tasks schema: %v", err)
+ }
+ return db
+}
+
+func seedTransferTakeoverTask(t *testing.T, db *sqlx.DB, taskID string, status string) {
+ t.Helper()
+ now := time.Now().UTC()
+ if _, err := db.Exec(
+ `INSERT INTO tasks (task_id, status, created_at, updated_at) VALUES (?, ?, ?, ?)`,
+ taskID,
+ status,
+ now,
+ now,
+ ); err != nil {
+ t.Fatalf("seed task: %v", err)
+ }
+}
+
+func assertTransferTakeoverTaskStatus(t *testing.T, db *sqlx.DB, taskID string, want string) {
+ t.Helper()
+ var got string
+ if err := db.Get(&got, `SELECT status FROM tasks WHERE task_id = ?`, taskID); err != nil {
+ t.Fatalf("query task status: %v", err)
+ }
+ if got != want {
+ t.Fatalf("task status=%q want=%q", got, want)
+ }
+}
diff --git a/internal/cloud/auth_client.go b/internal/cloud/auth_client.go
index 4bcc5b5..dcc4d7d 100644
--- a/internal/cloud/auth_client.go
+++ b/internal/cloud/auth_client.go
@@ -9,6 +9,7 @@ package cloud
import (
"context"
+ "errors"
"fmt"
"strings"
"sync"
@@ -119,11 +120,17 @@ func (c *AuthClient) exchangeCredential(ctx context.Context, credentialBase64 st
}
client := pb.NewAuthServiceClient(conn)
+ startedAt := time.Now()
resp, err := client.ExchangeCredential(ctx, &pb.ExchangeCredentialRequest{
CredentialBase64: credentialBase64,
})
if err != nil {
- logger.Printf("[CLOUD-AUTH] ExchangeCredential RPC failed, resetting gRPC connection: %v", err)
+ timeout := timeoutDuration(ctx, startedAt, 0)
+ if isTimeoutError(err) || errors.Is(ctx.Err(), context.DeadlineExceeded) {
+ logger.Printf("[CLOUD-AUTH] ExchangeCredential RPC timeout after %s (timeout_ms=%d), resetting gRPC connection: %v", timeoutLogValue(timeout), timeoutLogMilliseconds(timeout), err)
+ } else {
+ logger.Printf("[CLOUD-AUTH] ExchangeCredential RPC failed, resetting gRPC connection: %v", err)
+ }
if closeErr := c.Close(); closeErr != nil {
logger.Printf("[CLOUD-AUTH] failed to reset gRPC connection after RPC error: %v", closeErr)
} else {
diff --git a/internal/cloud/gateway_client.go b/internal/cloud/gateway_client.go
index 454628f..c91d238 100644
--- a/internal/cloud/gateway_client.go
+++ b/internal/cloud/gateway_client.go
@@ -198,7 +198,11 @@ func (c *GatewayClient) AbortUpload(ctx context.Context, logicalUploadID string,
if err != nil {
// AbortUpload is best-effort: do not reset the shared gRPC connection on failure,
// as that would disrupt subsequent normal uploads. Only log the error.
- logger.Printf("[CLOUD-GATEWAY] AbortUpload RPC failed: %v", err)
+ if isTimeoutError(err) || errors.Is(rpcCtx.Err(), context.DeadlineExceeded) {
+ logger.Printf("[CLOUD-GATEWAY] AbortUpload RPC timeout after %s (timeout_ms=%d): %v", timeoutLogValue(c.cfg.RequestTimeout), timeoutLogMilliseconds(c.cfg.RequestTimeout), err)
+ } else {
+ logger.Printf("[CLOUD-GATEWAY] AbortUpload RPC failed: %v", err)
+ }
return fmt.Errorf("AbortUpload RPC: %w", err)
}
return nil
@@ -254,7 +258,11 @@ func (c *GatewayClient) doRPC(ctx context.Context, rpcName string, fn func(*grpc
}
// First attempt failed. Reset the stale connection and retry once with a fresh one.
- logger.Printf("[CLOUD-GATEWAY] %s RPC failed, resetting connection and retrying: %v", rpcName, err)
+ if isTimeoutError(err) {
+ logger.Printf("[CLOUD-GATEWAY] %s RPC timeout after %s (timeout_ms=%d), resetting connection and retrying: %v", rpcName, timeoutLogValue(c.cfg.RequestTimeout), timeoutLogMilliseconds(c.cfg.RequestTimeout), err)
+ } else {
+ logger.Printf("[CLOUD-GATEWAY] %s RPC failed, resetting connection and retrying: %v", rpcName, err)
+ }
if closeErr := c.Close(); closeErr != nil {
logger.Printf("[CLOUD-GATEWAY] failed to close stale connection: %v", closeErr)
}
@@ -269,7 +277,11 @@ func (c *GatewayClient) doRPC(ctx context.Context, rpcName string, fn func(*grpc
if err = fn(conn); err != nil {
// Retry also failed — this is a genuine server-side error, not a stale connection.
- logger.Printf("[CLOUD-GATEWAY] %s RPC failed after reconnect: %v", rpcName, err)
+ if isTimeoutError(err) {
+ logger.Printf("[CLOUD-GATEWAY] %s RPC timeout after %s (timeout_ms=%d) after reconnect: %v", rpcName, timeoutLogValue(c.cfg.RequestTimeout), timeoutLogMilliseconds(c.cfg.RequestTimeout), err)
+ } else {
+ logger.Printf("[CLOUD-GATEWAY] %s RPC failed after reconnect: %v", rpcName, err)
+ }
if closeErr := c.Close(); closeErr != nil {
logger.Printf("[CLOUD-GATEWAY] failed to close connection after retry failure: %v", closeErr)
}
@@ -287,6 +299,9 @@ func (c *GatewayClient) getAuthHeader(ctx context.Context) (string, error) {
defer authCancel()
header, err := c.authClient.GetAuthHeader(authCtx)
if err != nil {
+ if isTimeoutError(err) || errors.Is(authCtx.Err(), context.DeadlineExceeded) {
+ logger.Printf("[CLOUD-GATEWAY] Auth header acquisition timeout after %s (timeout_ms=%d): %v", timeoutLogValue(c.cfg.RequestTimeout), timeoutLogMilliseconds(c.cfg.RequestTimeout), err)
+ }
return "", fmt.Errorf("get auth header: %w", err)
}
return header, nil
diff --git a/internal/cloud/oss_uploader.go b/internal/cloud/oss_uploader.go
index 869c68e..25d1d16 100644
--- a/internal/cloud/oss_uploader.go
+++ b/internal/cloud/oss_uploader.go
@@ -222,9 +222,14 @@ func (u *OSSUploader) sendRequest(ctx context.Context, session *UploadSession, m
req.Header.Set(k, v)
}
- // #nosec G704 -- URL is OSS endpoint from authenticated DataGateway gRPC upload session
- resp, err := u.httpClient.Do(req)
+ startedAt := time.Now()
+ resp, err := u.httpClient.Do(req) //nolint:gosec // G704: URL is OSS endpoint from authenticated DataGateway gRPC upload session
if err != nil {
+ if isTimeoutError(err) || errors.Is(ctx.Err(), context.DeadlineExceeded) {
+ timeout := timeoutDuration(ctx, startedAt, u.httpClient.Timeout)
+ logger.Printf("[OSS] %s %s timeout after %s (timeout_ms=%d): %v", method, objectKey, timeoutLogValue(timeout), timeoutLogMilliseconds(timeout), err)
+ return nil, fmt.Errorf("http request timeout after %s: %w", timeoutLogValue(timeout), err)
+ }
return nil, fmt.Errorf("http request: %w", err)
}
diff --git a/internal/cloud/timeout_log.go b/internal/cloud/timeout_log.go
new file mode 100644
index 0000000..e16857c
--- /dev/null
+++ b/internal/cloud/timeout_log.go
@@ -0,0 +1,58 @@
+// SPDX-FileCopyrightText: 2026 ArcheBase
+//
+// SPDX-License-Identifier: MulanPSL-2.0
+
+package cloud
+
+import (
+ "context"
+ "errors"
+ "time"
+
+ "google.golang.org/grpc/codes"
+ "google.golang.org/grpc/status"
+)
+
+func isTimeoutError(err error) bool {
+ if err == nil {
+ return false
+ }
+ if errors.Is(err, context.DeadlineExceeded) {
+ return true
+ }
+ if st, ok := status.FromError(err); ok && st.Code() == codes.DeadlineExceeded {
+ return true
+ }
+ var timeoutErr interface{ Timeout() bool }
+ return errors.As(err, &timeoutErr) && timeoutErr.Timeout()
+}
+
+func timeoutDuration(ctx context.Context, start time.Time, configured time.Duration) time.Duration {
+ timeout := configured
+ if deadline, ok := ctx.Deadline(); ok {
+ if d := deadline.Sub(start); d > 0 && (timeout <= 0 || d < timeout) {
+ timeout = d.Round(time.Millisecond)
+ }
+ }
+ if timeout > 0 {
+ return timeout
+ }
+ if !start.IsZero() {
+ return time.Since(start).Round(time.Millisecond)
+ }
+ return 0
+}
+
+func timeoutLogValue(timeout time.Duration) string {
+ if timeout <= 0 {
+ return "unknown"
+ }
+ return timeout.String()
+}
+
+func timeoutLogMilliseconds(timeout time.Duration) int64 {
+ if timeout <= 0 {
+ return 0
+ }
+ return timeout.Milliseconds()
+}
diff --git a/internal/config/config.go b/internal/config/config.go
index 914a944..0f1a635 100644
--- a/internal/config/config.go
+++ b/internal/config/config.go
@@ -66,9 +66,10 @@ type QAConfig struct {
// SyncConfig synchronization configuration
type SyncConfig struct {
- Enabled bool
- BatchSize int
- MaxRetries int
+ Enabled bool
+ AutoScanEnabled bool
+ BatchSize int
+ MaxRetries int
// Cloud upload settings (data-platform integration)
AuthEndpoint string // gRPC endpoint for AuthService
@@ -119,6 +120,7 @@ type TransferConfig struct {
WSPort int
MaxEvents int
ReadTimeout int // seconds
+ WriteTimeout int // seconds
PingInterval int // seconds
PingTimeout int // seconds
StaleThreshold int // seconds
@@ -183,6 +185,7 @@ func Load() (*Config, error) {
},
Sync: SyncConfig{
Enabled: getEnvBool("KEYSTONE_SYNC_ENABLED", true),
+ AutoScanEnabled: getEnvBool("KEYSTONE_SYNC_AUTO_SCAN_ENABLED", false),
BatchSize: getEnvInt("KEYSTONE_SYNC_BATCH_SIZE", 10),
MaxRetries: getEnvInt("KEYSTONE_SYNC_MAX_RETRIES", 5),
AuthEndpoint: getEnv("KEYSTONE_CLOUD_AUTH_ENDPOINT", ""),
@@ -233,6 +236,7 @@ func Load() (*Config, error) {
WSPort: getEnvInt("KEYSTONE_AXON_TRANSFER_WS_PORT", 8090),
MaxEvents: getEnvInt("KEYSTONE_AXON_TRANSFER_MAX_EVENTS", 10000),
ReadTimeout: getEnvInt("KEYSTONE_AXON_TRANSFER_READ_TIMEOUT", 30),
+ WriteTimeout: getEnvInt("KEYSTONE_AXON_TRANSFER_WRITE_TIMEOUT", 10),
PingInterval: getEnvInt("KEYSTONE_AXON_TRANSFER_PING_INTERVAL", 25),
PingTimeout: getEnvInt("KEYSTONE_AXON_TRANSFER_PING_TIMEOUT", 10),
StaleThreshold: getEnvInt("KEYSTONE_AXON_TRANSFER_STALE_THRESHOLD", 60),
diff --git a/internal/config/config_test.go b/internal/config/config_test.go
index 7d9532d..dd918e6 100644
--- a/internal/config/config_test.go
+++ b/internal/config/config_test.go
@@ -14,12 +14,13 @@ import (
func TestLoad(t *testing.T) {
// Save original environment variables
originalEnv := map[string]string{
- "KEYSTONE_MODE": os.Getenv("KEYSTONE_MODE"),
- "KEYSTONE_MYSQL_HOST": os.Getenv("KEYSTONE_MYSQL_HOST"),
- "KEYSTONE_MYSQL_PASSWORD": os.Getenv("KEYSTONE_MYSQL_PASSWORD"),
- "KEYSTONE_MINIO_ACCESS_KEY": os.Getenv("KEYSTONE_MINIO_ACCESS_KEY"),
- "KEYSTONE_MINIO_SECRET_KEY": os.Getenv("KEYSTONE_MINIO_SECRET_KEY"),
- "KEYSTONE_FACTORY_ID": os.Getenv("KEYSTONE_FACTORY_ID"),
+ "KEYSTONE_MODE": os.Getenv("KEYSTONE_MODE"),
+ "KEYSTONE_MYSQL_HOST": os.Getenv("KEYSTONE_MYSQL_HOST"),
+ "KEYSTONE_MYSQL_PASSWORD": os.Getenv("KEYSTONE_MYSQL_PASSWORD"),
+ "KEYSTONE_MINIO_ACCESS_KEY": os.Getenv("KEYSTONE_MINIO_ACCESS_KEY"),
+ "KEYSTONE_MINIO_SECRET_KEY": os.Getenv("KEYSTONE_MINIO_SECRET_KEY"),
+ "KEYSTONE_FACTORY_ID": os.Getenv("KEYSTONE_FACTORY_ID"),
+ "KEYSTONE_SYNC_AUTO_SCAN_ENABLED": os.Getenv("KEYSTONE_SYNC_AUTO_SCAN_ENABLED"),
}
defer func() {
// Restore original environment variables
@@ -33,6 +34,7 @@ func TestLoad(t *testing.T) {
}()
// Set test environment variables
+ os.Unsetenv("KEYSTONE_SYNC_AUTO_SCAN_ENABLED")
os.Setenv("KEYSTONE_MYSQL_PASSWORD", "test-password")
os.Setenv("KEYSTONE_MINIO_ACCESS_KEY", "test-access-key")
os.Setenv("KEYSTONE_MINIO_SECRET_KEY", "test-secret-key")
@@ -64,6 +66,10 @@ func TestLoad(t *testing.T) {
t.Errorf("Load().Storage.Bucket = %v, want edge-factory-test", cfg.Storage.Bucket)
}
+ if cfg.Sync.AutoScanEnabled {
+ t.Error("Load().Sync.AutoScanEnabled should default to false")
+ }
+
// Verify QA configuration
if !cfg.QA.Enabled {
t.Error("Load().QA.Enabled should default to true")
@@ -98,6 +104,7 @@ func TestLoadWithCustomEnv(t *testing.T) {
"KEYSTONE_QA_MAX_WORKERS": os.Getenv("KEYSTONE_QA_MAX_WORKERS"),
"KEYSTONE_MAX_MEMORY_MB": os.Getenv("KEYSTONE_MAX_MEMORY_MB"),
"KEYSTONE_DASHBOARD_DISPLAY_TOKEN": os.Getenv("KEYSTONE_DASHBOARD_DISPLAY_TOKEN"),
+ "KEYSTONE_SYNC_AUTO_SCAN_ENABLED": os.Getenv("KEYSTONE_SYNC_AUTO_SCAN_ENABLED"),
}
defer func() {
for k, v := range originalEnv {
@@ -118,6 +125,7 @@ func TestLoadWithCustomEnv(t *testing.T) {
os.Setenv("KEYSTONE_QA_MAX_WORKERS", "8")
os.Setenv("KEYSTONE_MAX_MEMORY_MB", "8192")
os.Setenv("KEYSTONE_DASHBOARD_DISPLAY_TOKEN", "display-secret")
+ os.Setenv("KEYSTONE_SYNC_AUTO_SCAN_ENABLED", "true")
cfg, err := Load()
if err != nil {
@@ -139,6 +147,10 @@ func TestLoadWithCustomEnv(t *testing.T) {
if cfg.Auth.DashboardDisplayToken != "display-secret" {
t.Errorf("Load().Auth.DashboardDisplayToken = %q, want display-secret", cfg.Auth.DashboardDisplayToken)
}
+
+ if !cfg.Sync.AutoScanEnabled {
+ t.Error("Load().Sync.AutoScanEnabled = false, want true")
+ }
}
func TestConfigValidate(t *testing.T) {
diff --git a/internal/server/server.go b/internal/server/server.go
index 45e0a09..ff908c0 100644
--- a/internal/server/server.go
+++ b/internal/server/server.go
@@ -7,6 +7,7 @@ package server
import (
"context"
+ "errors"
"fmt"
"net"
"net/http"
@@ -37,6 +38,7 @@ type Server struct {
storage *handlers.StorageHandler
transfer *handlers.TransferHandler
recorder *handlers.RecorderHandler
+ deviceState *handlers.DeviceStateHandler
episode *handlers.EpisodeHandler
task *handlers.TaskHandler
batch *handlers.BatchHandler
@@ -65,6 +67,13 @@ type Server struct {
engine *gin.Engine
}
+func axonTransferWriteTimeout(cfg *config.TransferConfig) time.Duration {
+ if cfg == nil || cfg.WriteTimeout <= 0 {
+ return services.DefaultTransferWriteTimeout
+ }
+ return time.Duration(cfg.WriteTimeout) * time.Second
+}
+
// New creates a new server instance.
// db and s3Client are optional; pass nil to disable Verified ACK.
// syncWorker is optional; pass nil to disable cloud sync API.
@@ -87,6 +96,7 @@ func New(cfg *config.Config, db *sqlx.DB, s3Client *s3.Client, syncWorker *servi
}
// Recorder hub must exist before TransferHandler (transfer disconnect notifies recorder via RPC).
+ stateBroker := services.NewDeviceStateBroker()
recorderHub := services.NewRecorderHub()
recorderHandler := handlers.NewRecorderHandler(recorderHub, &cfg.AxonRecorder, db)
recorderRPCTimeout := time.Duration(cfg.AxonRecorder.ResponseTimeout) * time.Second
@@ -94,12 +104,17 @@ func New(cfg *config.Config, db *sqlx.DB, s3Client *s3.Client, syncWorker *servi
// Create TransferHub and TransferHandler for Transfer Service
transferHub := services.NewTransferHub(cfg.AxonTransfer.MaxEvents)
transferHandler := handlers.NewTransferHandler(transferHub, &cfg.AxonTransfer, db, s3Client, cfg.Storage.Bucket, cfg.AxonTransfer.FactoryID, recorderHub, recorderRPCTimeout)
+ recorderHandler.SetDeviceStateDeps(transferHub, stateBroker)
+ transferHandler.SetDeviceStateBroker(stateBroker)
+ deviceStateHandler := handlers.NewDeviceStateHandler(stateBroker, recorderHub, transferHub)
// Create EpisodeHandler for episode listing
episodeHandler := handlers.NewEpisodeHandler(db, s3Client, cfg.Storage.Bucket, &cfg.Auth)
+ transferWriteTimeout := axonTransferWriteTimeout(&cfg.AxonTransfer)
+
// Create TaskHandler for task configuration
- taskHandler := handlers.NewTaskHandler(db, transferHub, recorderHub, recorderRPCTimeout)
+ taskHandler := handlers.NewTaskHandler(db, transferHub, recorderHub, recorderRPCTimeout, transferWriteTimeout)
// Create database-dependent handlers only when DB is available
var (
@@ -152,6 +167,7 @@ func New(cfg *config.Config, db *sqlx.DB, s3Client *s3.Client, syncWorker *servi
storage: storageHandler,
transfer: transferHandler,
recorder: recorderHandler,
+ deviceState: deviceStateHandler,
episode: episodeHandler,
task: taskHandler,
batch: batchHandler,
@@ -319,6 +335,9 @@ func (s *Server) buildRoutes() http.Handler {
v1Recorder := v1Routes.Group("/recorder")
s.recorder.RegisterRoutes(v1Recorder)
+ if s.deviceState != nil {
+ s.deviceState.RegisterRoutes(v1Routes)
+ }
// Swagger documentation - serve at both root and api/v1 path
s.engine.GET("/swagger/*any", ginSwagger.WrapHandler(swaggerFiles.Handler))
@@ -414,29 +433,45 @@ func (s *Server) Shutdown(ctx context.Context) error {
s.isRunning = false
s.shutdownMu.Unlock()
- logger.Printf("[SERVER] Shutting down HTTP server")
+ startedAt := time.Now()
+ shutdownTimeout := time.Duration(s.cfg.Server.ShutdownTimeout) * time.Second
+ effectiveShutdownTimeout := shutdownTimeout
+ if deadline, ok := ctx.Deadline(); ok {
+ if d := deadline.Sub(startedAt); d > 0 && (effectiveShutdownTimeout <= 0 || d < effectiveShutdownTimeout) {
+ effectiveShutdownTimeout = d.Round(time.Millisecond)
+ }
+ }
+ logger.Printf("[SERVER] Shutting down HTTP server (timeout=%s timeout_ms=%d)", effectiveShutdownTimeout, effectiveShutdownTimeout.Milliseconds())
- ctx, cancel := context.WithTimeout(ctx, time.Duration(s.cfg.Server.ShutdownTimeout)*time.Second)
+ ctx, cancel := context.WithTimeout(ctx, shutdownTimeout)
defer cancel()
+ logShutdownError := func(component string, err error) {
+ if errors.Is(err, context.DeadlineExceeded) || errors.Is(ctx.Err(), context.DeadlineExceeded) {
+ logger.Printf("[SERVER] %s shutdown timeout after %s (timeout_ms=%d): %v", component, effectiveShutdownTimeout, effectiveShutdownTimeout.Milliseconds(), err)
+ return
+ }
+ logger.Printf("[SERVER] %s shutdown error: %v", component, err)
+ }
+
var shutdownErr error
// Shutdown both servers
if err := s.httpServer.Shutdown(ctx); err != nil {
- logger.Printf("[SERVER] HTTP server shutdown error: %v", err)
+ logShutdownError("HTTP server", err)
if shutdownErr == nil {
shutdownErr = fmt.Errorf("http server shutdown: %w", err)
}
}
if err := s.transferWSServer.Shutdown(ctx); err != nil {
- logger.Printf("[SERVER] Transfer WebSocket server shutdown error: %v", err)
+ logShutdownError("Transfer WebSocket server", err)
if shutdownErr == nil {
shutdownErr = fmt.Errorf("transfer websocket shutdown: %w", err)
}
}
if s.recorderWSServer != nil {
if err := s.recorderWSServer.Shutdown(ctx); err != nil {
- logger.Printf("[SERVER] Recorder WebSocket server shutdown error: %v", err)
+ logShutdownError("Recorder WebSocket server", err)
if shutdownErr == nil {
shutdownErr = fmt.Errorf("recorder websocket shutdown: %w", err)
}
@@ -446,7 +481,7 @@ func (s *Server) Shutdown(ctx context.Context) error {
// Stop sync worker
if s.syncWorker != nil {
if err := s.syncWorker.Stop(ctx); err != nil {
- logger.Printf("[SERVER] Sync worker shutdown error: %v", err)
+ logShutdownError("Sync worker", err)
if shutdownErr == nil {
shutdownErr = fmt.Errorf("sync worker shutdown: %w", err)
}
diff --git a/internal/server/server_test.go b/internal/server/server_test.go
new file mode 100644
index 0000000..028179e
--- /dev/null
+++ b/internal/server/server_test.go
@@ -0,0 +1,33 @@
+// SPDX-FileCopyrightText: 2026 ArcheBase
+//
+// SPDX-License-Identifier: MulanPSL-2.0
+
+package server
+
+import (
+ "testing"
+ "time"
+
+ "archebase.com/keystone-edge/internal/config"
+ "archebase.com/keystone-edge/internal/services"
+)
+
+func TestAxonTransferWriteTimeoutFromConfig(t *testing.T) {
+ tests := []struct {
+ name string
+ cfg *config.TransferConfig
+ want time.Duration
+ }{
+ {name: "nil config", cfg: nil, want: services.DefaultTransferWriteTimeout},
+ {name: "zero config", cfg: &config.TransferConfig{}, want: services.DefaultTransferWriteTimeout},
+ {name: "custom seconds", cfg: &config.TransferConfig{WriteTimeout: 7}, want: 7 * time.Second},
+ }
+
+ for _, tt := range tests {
+ t.Run(tt.name, func(t *testing.T) {
+ if got := axonTransferWriteTimeout(tt.cfg); got != tt.want {
+ t.Fatalf("axonTransferWriteTimeout()=%s want=%s", got, tt.want)
+ }
+ })
+ }
+}
diff --git a/internal/services/device_state_broker.go b/internal/services/device_state_broker.go
new file mode 100644
index 0000000..ef06fc5
--- /dev/null
+++ b/internal/services/device_state_broker.go
@@ -0,0 +1,104 @@
+// SPDX-FileCopyrightText: 2026 ArcheBase
+//
+// SPDX-License-Identifier: MulanPSL-2.0
+
+package services
+
+import (
+ "sync"
+ "time"
+)
+
+// DeviceStateEvent is a versioned event emitted when recorder state or device
+// connection state changes.
+type DeviceStateEvent map[string]interface{}
+
+// DeviceStateBroker fans out device state events to API stream subscribers.
+type DeviceStateBroker struct {
+ mu sync.RWMutex
+ nextSubID uint64
+ versions map[string]uint64
+ subscribers map[uint64]chan DeviceStateEvent
+}
+
+// NewDeviceStateBroker creates a state event broker.
+func NewDeviceStateBroker() *DeviceStateBroker {
+ return &DeviceStateBroker{
+ versions: make(map[string]uint64),
+ subscribers: make(map[uint64]chan DeviceStateEvent),
+ }
+}
+
+// CurrentVersion returns the latest emitted version for a device.
+func (b *DeviceStateBroker) CurrentVersion(deviceID string) uint64 {
+ if b == nil || deviceID == "" {
+ return 0
+ }
+ b.mu.RLock()
+ defer b.mu.RUnlock()
+ return b.versions[deviceID]
+}
+
+// Publish assigns a per-device monotonically increasing state_version and fans
+// the event out to subscribers. Slow subscribers may drop events; clients must
+// recover by fetching a full snapshot after reconnect.
+func (b *DeviceStateBroker) Publish(deviceID string, event DeviceStateEvent) DeviceStateEvent {
+ if b == nil || deviceID == "" || event == nil {
+ return event
+ }
+
+ b.mu.Lock()
+ version := b.versions[deviceID] + 1
+ b.versions[deviceID] = version
+ event["device_id"] = deviceID
+ event["state_version"] = version
+ if _, ok := event["updated_at"]; !ok {
+ event["updated_at"] = time.Now().UTC().Format(time.RFC3339Nano)
+ }
+
+ subscribers := make([]chan DeviceStateEvent, 0, len(b.subscribers))
+ for _, ch := range b.subscribers {
+ subscribers = append(subscribers, ch)
+ }
+ b.mu.Unlock()
+
+ for _, ch := range subscribers {
+ select {
+ case ch <- cloneDeviceStateEvent(event):
+ default:
+ }
+ }
+ return event
+}
+
+// Subscribe registers a subscriber and returns a channel plus cleanup function.
+func (b *DeviceStateBroker) Subscribe(buffer int) (<-chan DeviceStateEvent, func()) {
+ if buffer <= 0 {
+ buffer = 32
+ }
+ ch := make(chan DeviceStateEvent, buffer)
+ if b == nil {
+ close(ch)
+ return ch, func() {}
+ }
+
+ b.mu.Lock()
+ b.nextSubID++
+ id := b.nextSubID
+ b.subscribers[id] = ch
+ b.mu.Unlock()
+
+ return ch, func() {
+ b.mu.Lock()
+ delete(b.subscribers, id)
+ b.mu.Unlock()
+ }
+}
+
+func cloneDeviceStateEvent(src DeviceStateEvent) DeviceStateEvent {
+ out := make(DeviceStateEvent, len(src))
+ for k, v := range src {
+ out[k] = v
+ }
+ return out
+}
diff --git a/internal/services/device_state_broker_test.go b/internal/services/device_state_broker_test.go
new file mode 100644
index 0000000..1debaab
--- /dev/null
+++ b/internal/services/device_state_broker_test.go
@@ -0,0 +1,37 @@
+// SPDX-FileCopyrightText: 2026 ArcheBase
+//
+// SPDX-License-Identifier: MulanPSL-2.0
+
+package services
+
+import "testing"
+
+func TestDeviceStateBrokerUnsubscribeDoesNotCloseChannel(t *testing.T) {
+ broker := NewDeviceStateBroker()
+ events, unsubscribe := broker.Subscribe(1)
+
+ unsubscribe()
+
+ select {
+ case _, ok := <-events:
+ if !ok {
+ t.Fatalf("unsubscribe closed subscriber channel; concurrent publish can panic on send")
+ }
+ default:
+ }
+}
+
+func TestDeviceStateBrokerConcurrentPublishUnsubscribe(t *testing.T) {
+ broker := NewDeviceStateBroker()
+
+ for i := 0; i < 1000; i++ {
+ _, unsubscribe := broker.Subscribe(1)
+ done := make(chan struct{})
+ go func() {
+ defer close(done)
+ broker.Publish("robot-001", DeviceStateEvent{"type": "recorder_state"})
+ }()
+ unsubscribe()
+ <-done
+ }
+}
diff --git a/internal/services/hub.go b/internal/services/hub.go
index ee7e30b..c8516cb 100644
--- a/internal/services/hub.go
+++ b/internal/services/hub.go
@@ -91,9 +91,31 @@ func (h *Hub[T]) connectWithStaleThreshold(deviceID string, conn T, staleThresho
return zero, true
}
+// connectReplacingExisting registers conn under deviceID. If another connection
+// is already registered for the same device, the new connection takes over and
+// the previous connection is returned so the caller can close it outside the hub
+// lock.
+func (h *Hub[T]) connectReplacingExisting(deviceID string, conn T) T {
+ var replaced T
+
+ h.mu.Lock()
+ defer h.mu.Unlock()
+
+ if old, exists := h.connections[deviceID]; exists {
+ if old.GetWSConn() != conn.GetWSConn() {
+ replaced = old
+ logger.Printf("[%s] Hub: replacing existing connection for device %s", h.label, deviceID)
+ }
+ }
+
+ h.connections[deviceID] = conn
+ logger.Printf("[%s] Hub: device %s registered, total connections=%d", h.label, deviceID, len(h.connections))
+ return replaced
+}
+
// disconnect removes the connection for deviceID only if it matches conn.
-// This avoids a stale handler goroutine deleting a newer connection after
-// rejecting takeover. Returns true if an entry was removed.
+// This avoids an old handler goroutine deleting a newer connection after
+// replacement. Returns true if an entry was removed.
func (h *Hub[T]) disconnect(deviceID string, conn T) bool {
h.mu.Lock()
defer h.mu.Unlock()
diff --git a/internal/services/hub_test.go b/internal/services/hub_test.go
index 4287f4e..251200b 100644
--- a/internal/services/hub_test.go
+++ b/internal/services/hub_test.go
@@ -5,6 +5,12 @@
package services
import (
+ "context"
+ "errors"
+ "fmt"
+ "net/http"
+ "net/http/httptest"
+ "strings"
"testing"
"time"
@@ -69,3 +75,126 @@ func TestRecorderHubConnectWithStaleThresholdReplacesStaleConnection(t *testing.
t.Fatalf("hub connection=%p want nil", got)
}
}
+
+func TestRecorderHubConnectReplacingExistingTakesOverFreshConnection(t *testing.T) {
+ hub := NewRecorderHub()
+ deviceID := "robot-001"
+
+ oldConn := hub.NewRecorderConn(&websocket.Conn{}, deviceID, "127.0.0.1")
+ oldConn.LastSeenAt = time.Now()
+ if !hub.Connect(deviceID, oldConn) {
+ t.Fatalf("initial connect failed")
+ }
+
+ newConn := hub.NewRecorderConn(&websocket.Conn{}, deviceID, "127.0.0.1")
+ replaced := hub.ConnectReplacingExisting(deviceID, newConn)
+ if replaced != oldConn {
+ t.Fatalf("replaced=%p want oldConn=%p", replaced, oldConn)
+ }
+ if got := hub.Get(deviceID); got != newConn {
+ t.Fatalf("hub connection=%p want newConn=%p", got, newConn)
+ }
+ if hub.Disconnect(deviceID, oldConn) {
+ t.Fatalf("old connection disconnected the current hub entry")
+ }
+ if got := hub.Get(deviceID); got != newConn {
+ t.Fatalf("old disconnect changed current hub connection")
+ }
+ if !hub.Disconnect(deviceID, newConn) {
+ t.Fatalf("new connection did not disconnect")
+ }
+}
+
+func TestRecorderHubConnectReplacingExistingFailsPendingRPCWaitersWithError(t *testing.T) {
+ hub := NewRecorderHub()
+ deviceID := "robot-001"
+ serverConn, clientConn := newRecorderHubTestWebSocketPair(t)
+
+ ctx, cancel := context.WithCancel(context.Background())
+ t.Cleanup(cancel)
+ go func() {
+ _, _, _ = clientConn.Read(ctx)
+ }()
+
+ oldConn := hub.NewRecorderConn(serverConn, deviceID, "127.0.0.1")
+ if !hub.Connect(deviceID, oldConn) {
+ t.Fatalf("initial connect failed")
+ }
+
+ errC := make(chan error, 1)
+ go func() {
+ resp, err := hub.SendRPC(ctx, deviceID, "begin", nil, time.Second)
+ if !errors.Is(err, ErrRecorderNotConnected) {
+ errC <- fmt.Errorf("SendRPC err=%v resp=%+v, want ErrRecorderNotConnected", err, resp)
+ return
+ }
+ errC <- nil
+ }()
+
+ waitForPendingRecorderRPC(t, oldConn)
+ newConn := hub.NewRecorderConn(&websocket.Conn{}, deviceID, "127.0.0.1")
+ replaced := hub.ConnectReplacingExisting(deviceID, newConn)
+ if replaced != oldConn {
+ t.Fatalf("replaced=%p want oldConn=%p", replaced, oldConn)
+ }
+
+ select {
+ case err := <-errC:
+ if err != nil {
+ t.Fatal(err)
+ }
+ case <-time.After(time.Second):
+ t.Fatalf("SendRPC did not return after connection replacement")
+ }
+}
+
+func waitForPendingRecorderRPC(t *testing.T, rc *RecorderConn) {
+ t.Helper()
+ deadline := time.Now().Add(time.Second)
+ for time.Now().Before(deadline) {
+ rc.PendingMu.Lock()
+ n := len(rc.Pending)
+ rc.PendingMu.Unlock()
+ if n > 0 {
+ return
+ }
+ time.Sleep(10 * time.Millisecond)
+ }
+ t.Fatalf("timed out waiting for pending RPC")
+}
+
+func newRecorderHubTestWebSocketPair(t *testing.T) (*websocket.Conn, *websocket.Conn) {
+ t.Helper()
+ serverConnC := make(chan *websocket.Conn, 1)
+ errC := make(chan error, 1)
+
+ server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+ conn, err := websocket.Accept(w, r, &websocket.AcceptOptions{InsecureSkipVerify: true})
+ if err != nil {
+ errC <- err
+ return
+ }
+ serverConnC <- conn
+ <-r.Context().Done()
+ }))
+ t.Cleanup(server.Close)
+
+ clientConn, _, err := websocket.Dial(context.Background(), "ws"+strings.TrimPrefix(server.URL, "http"), nil)
+ if err != nil {
+ t.Fatalf("dial websocket: %v", err)
+ }
+
+ select {
+ case serverConn := <-serverConnC:
+ t.Cleanup(func() {
+ _ = clientConn.CloseNow()
+ _ = serverConn.CloseNow()
+ })
+ return serverConn, clientConn
+ case err := <-errC:
+ t.Fatalf("accept websocket: %v", err)
+ case <-time.After(time.Second):
+ t.Fatalf("timed out waiting for websocket accept")
+ }
+ return nil, nil
+}
diff --git a/internal/services/recorder_hub.go b/internal/services/recorder_hub.go
index 902989e..4df29ce 100644
--- a/internal/services/recorder_hub.go
+++ b/internal/services/recorder_hub.go
@@ -9,6 +9,7 @@ import (
"context"
"errors"
"fmt"
+ "strings"
"sync"
"time"
@@ -26,10 +27,13 @@ var (
// RecorderState stores the latest state snapshot reported by a recorder.
type RecorderState struct {
- CurrentState string `json:"current_state"`
- TaskID string `json:"task_id,omitempty"`
- UpdatedAt time.Time `json:"updated_at"`
- Raw map[string]interface{} `json:"raw,omitempty"`
+ CurrentState string `json:"current_state"`
+ TaskID string `json:"task_id,omitempty"`
+ UpdatedAt time.Time `json:"updated_at"`
+ Source string `json:"source,omitempty"`
+ LastSyncAttemptAt time.Time `json:"last_sync_attempt_at,omitempty"`
+ LastSyncError string `json:"last_sync_error,omitempty"`
+ Raw map[string]interface{} `json:"raw,omitempty"`
}
// RecorderInfo is a read-only snapshot of a connected recorder.
@@ -39,6 +43,7 @@ type RecorderInfo struct {
ConnectedAt time.Time `json:"connected_at"`
LastSeenAt time.Time `json:"last_seen_at"`
State RecorderState `json:"state"`
+ StateSynced bool `json:"state_synced"`
}
// RPCRequest represents a command sent from Keystone to Axon Recorder.
@@ -56,6 +61,7 @@ type RPCResponse struct {
Success bool `json:"success"`
Message string `json:"message,omitempty"`
Data map[string]interface{} `json:"data,omitempty"`
+ LocalErr error `json:"-"`
}
// PendingRPC tracks an in-flight RPC waiting for a response.
@@ -73,6 +79,7 @@ type RecorderConn struct {
ConnectedAt time.Time
LastSeenAt time.Time
State RecorderState
+ StateSynced bool
WriteMu sync.Mutex
PendingMu sync.Mutex
@@ -99,12 +106,41 @@ func (r *RecorderConn) GetState() RecorderState {
return r.State
}
+// IsStateSynced reports whether this connection has received an initial state snapshot.
+func (r *RecorderConn) IsStateSynced() bool {
+ r.StateMu.RLock()
+ defer r.StateMu.RUnlock()
+ return r.StateSynced
+}
+
// UpdateState updates the recorder state snapshot.
func (r *RecorderConn) UpdateState(state RecorderState) {
r.StateMu.Lock()
defer r.StateMu.Unlock()
- state.UpdatedAt = time.Now()
+ state.UpdatedAt = time.Now().UTC()
+ state.LastSyncError = ""
+ r.State = state
+ r.StateSynced = strings.TrimSpace(state.CurrentState) != ""
+}
+
+// MarkStateSyncing keeps the last snapshot for display but marks it unsafe for writes.
+func (r *RecorderConn) MarkStateSyncing(source string, syncErr error) {
+ r.StateMu.Lock()
+ defer r.StateMu.Unlock()
+ state := r.State
+ now := time.Now().UTC()
+ if state.UpdatedAt.IsZero() {
+ state.UpdatedAt = now
+ }
+ state.Source = strings.TrimSpace(source)
+ state.LastSyncAttemptAt = now
+ if syncErr != nil {
+ state.LastSyncError = syncErr.Error()
+ } else {
+ state.LastSyncError = ""
+ }
r.State = state
+ r.StateSynced = false
}
// RecorderHub manages all active Axon Recorder WebSocket connections.
@@ -149,14 +185,36 @@ func (h *RecorderHub) ConnectWithStaleThreshold(deviceID string, rc *RecorderCon
return h.connectWithStaleThreshold(deviceID, rc, staleThreshold)
}
+// ConnectReplacingExisting registers a recorder connection, replacing any
+// existing connection for the same device and returning it to the caller.
+func (h *RecorderHub) ConnectReplacingExisting(deviceID string, rc *RecorderConn) *RecorderConn {
+ replaced := h.connectReplacingExisting(deviceID, rc)
+ if replaced != nil {
+ h.failPendingRPCs(replaced, ErrRecorderNotConnected)
+ }
+ return replaced
+}
+
// Disconnect removes a recorder connection and drains any pending RPC waiters.
func (h *RecorderHub) Disconnect(deviceID string, rc *RecorderConn) bool {
if !h.disconnect(deviceID, rc) {
return false
}
+ h.failPendingRPCs(rc, ErrRecorderNotConnected)
+ return true
+}
+
+func (h *RecorderHub) failPendingRPCs(rc *RecorderConn, err error) {
+ if rc == nil {
+ return
+ }
+ message := ""
+ if err != nil {
+ message = err.Error()
+ }
- // Unblock any goroutines waiting for an RPC response from this device.
rc.PendingMu.Lock()
+ defer rc.PendingMu.Unlock()
for requestID, pending := range rc.Pending {
delete(rc.Pending, requestID)
// Non-blocking send: the waiter may have already timed out.
@@ -164,13 +222,12 @@ func (h *RecorderHub) Disconnect(deviceID string, rc *RecorderConn) bool {
case pending.ResponseC <- &RPCResponse{
RequestID: requestID,
Success: false,
- Message: ErrRecorderNotConnected.Error(),
+ Message: message,
+ LocalErr: err,
}:
default:
}
}
- rc.PendingMu.Unlock()
- return true
}
// Get returns the recorder connection for a device, or nil if not connected.
@@ -189,6 +246,7 @@ func (h *RecorderHub) ListDevices() []RecorderInfo {
ConnectedAt: rc.ConnectedAt,
LastSeenAt: rc.LastSeenAt,
State: rc.GetState(),
+ StateSynced: rc.IsStateSynced(),
})
}
return result
@@ -242,13 +300,24 @@ func (h *RecorderHub) SendRPC(ctx context.Context, deviceID, action string, para
Params: params,
}
+ writeCtx := ctx
+ var writeCancel context.CancelFunc
+ if timeout > 0 {
+ writeCtx, writeCancel = context.WithTimeout(ctx, timeout)
+ }
rc.WriteMu.Lock()
- writeErr := wsjson.Write(ctx, rc.Conn, req)
+ writeErr := wsjson.Write(writeCtx, rc.Conn, req)
rc.WriteMu.Unlock()
+ if writeCancel != nil {
+ writeCancel()
+ }
if writeErr != nil {
rc.PendingMu.Lock()
delete(rc.Pending, requestID)
rc.PendingMu.Unlock()
+ if errors.Is(writeErr, context.DeadlineExceeded) {
+ return nil, ErrRecorderRPCTimeout
+ }
return nil, fmt.Errorf("write rpc request: %w", writeErr)
}
@@ -257,6 +326,9 @@ func (h *RecorderHub) SendRPC(ctx context.Context, deviceID, action string, para
select {
case response := <-pending.ResponseC:
+ if response != nil && response.LocalErr != nil {
+ return nil, response.LocalErr
+ }
return response, nil
case <-waitCtx.Done():
rc.PendingMu.Lock()
diff --git a/internal/services/sync_worker.go b/internal/services/sync_worker.go
index 46e93d9..f9971bb 100644
--- a/internal/services/sync_worker.go
+++ b/internal/services/sync_worker.go
@@ -27,13 +27,14 @@ import (
// SyncWorkerConfig provides the runtime configuration for the sync worker.
type SyncWorkerConfig struct {
- BatchSize int
- MaxConcurrent int
- MaxRetries int
- IntervalSec int
- RetryBaseSec int
- RetryMaxSec int
- RetryJitterSec int
+ BatchSize int
+ MaxConcurrent int
+ MaxRetries int
+ AutoScanEnabled bool
+ IntervalSec int
+ RetryBaseSec int
+ RetryMaxSec int
+ RetryJitterSec int
}
type syncEnqueueRequest struct {
@@ -41,8 +42,8 @@ type syncEnqueueRequest struct {
manual bool
}
-// SyncWorker is a background goroutine that periodically scans for approved
-// episodes and uploads them to cloud via the data-platform gateway.
+// SyncWorker is a background goroutine that processes queued cloud sync work
+// and optionally discovers approved episodes for automatic cloud upload.
type SyncWorker struct {
db *sqlx.DB
uploader *cloud.Uploader
@@ -134,6 +135,7 @@ func (w *SyncWorker) Start() {
// Stop gracefully stops the sync worker within the provided context deadline.
func (w *SyncWorker) Stop(ctx context.Context) error {
+ startedAt := time.Now()
w.mu.Lock()
if !w.running.Load() {
done := w.stopDone
@@ -145,7 +147,7 @@ func (w *SyncWorker) Stop(ctx context.Context) error {
case <-done:
return nil
case <-ctx.Done():
- return fmt.Errorf("sync worker stop timeout: %w", ctx.Err())
+ return syncWorkerStopTimeoutError(ctx, startedAt)
}
}
@@ -159,7 +161,7 @@ func (w *SyncWorker) Stop(ctx context.Context) error {
case <-done:
return nil
case <-ctx.Done():
- return fmt.Errorf("sync worker stop timeout: %w", ctx.Err())
+ return syncWorkerStopTimeoutError(ctx, startedAt)
}
}
@@ -181,10 +183,21 @@ func (w *SyncWorker) Stop(ctx context.Context) error {
logger.Println("[SYNC-WORKER] Stopped")
return nil
case <-ctx.Done():
- return fmt.Errorf("sync worker stop timeout: %w", ctx.Err())
+ return syncWorkerStopTimeoutError(ctx, startedAt)
}
}
+func syncWorkerStopTimeoutError(ctx context.Context, startedAt time.Time) error {
+ timeout := time.Since(startedAt).Round(time.Millisecond)
+ if deadline, ok := ctx.Deadline(); ok {
+ if d := deadline.Sub(startedAt); d > 0 {
+ timeout = d.Round(time.Millisecond)
+ }
+ }
+ logger.Printf("[SYNC-WORKER] Stop timeout after %s (timeout_ms=%d): %v", timeout, timeout.Milliseconds(), ctx.Err())
+ return fmt.Errorf("sync worker stop timeout after %s: %w", timeout, ctx.Err())
+}
+
// IsRunning returns whether the worker is currently running.
func (w *SyncWorker) IsRunning() bool {
return w.running.Load()
@@ -195,6 +208,11 @@ func (w *SyncWorker) MaxRetries() int {
return w.cfg.MaxRetries
}
+// AutoScanEnabled returns whether the worker periodically discovers newly eligible episodes.
+func (w *SyncWorker) AutoScanEnabled() bool {
+ return w.cfg.AutoScanEnabled
+}
+
// EnqueueEpisode adds a specific episode ID for immediate sync processing.
func (w *SyncWorker) EnqueueEpisode(ctx context.Context, episodeID int64) error {
return w.enqueueEpisode(ctx, episodeID, false)
@@ -554,6 +572,10 @@ func (w *SyncWorker) pollAndProcess(ctx context.Context) {
// Then, retry any failed episodes that are due.
w.retryFailedEpisodes(ctx)
+ if !w.cfg.AutoScanEnabled {
+ return
+ }
+
// Finally, find newly eligible episodes and persist them as queued work.
ids, err := w.findPendingEpisodes(ctx, false)
if err != nil {
diff --git a/internal/services/sync_worker_test.go b/internal/services/sync_worker_test.go
index 2a34575..02d3151 100644
--- a/internal/services/sync_worker_test.go
+++ b/internal/services/sync_worker_test.go
@@ -315,6 +315,108 @@ func TestDispatchPendingSyncLogs_DispatchesPersistedRows(t *testing.T) {
}
}
+func TestPollAndProcess_SkipsAutoDiscoveryWhenDisabled(t *testing.T) {
+ db := newTestSyncWorkerDB(t)
+ w := &SyncWorker{
+ db: db,
+ cfg: SyncWorkerConfig{BatchSize: 10, MaxRetries: 3, AutoScanEnabled: false},
+ jobCh: make(chan syncEnqueueRequest, 1),
+ enqueuedEpisode: make(map[int64]struct{}),
+ }
+
+ insertEpisodeForSyncWorkerTest(t, db, 20, "approved", false)
+
+ w.pollAndProcess(context.Background())
+
+ if count := countSyncLogsForSyncWorkerTest(t, db, 20); count != 0 {
+ t.Fatalf("sync log count = %d, want 0 when auto scan is disabled", count)
+ }
+ select {
+ case got := <-w.jobCh:
+ t.Fatalf("unexpected job dispatched with auto scan disabled: %+v", got)
+ default:
+ }
+}
+
+func TestPollAndProcess_DispatchesPendingRowsWhenAutoDiscoveryDisabled(t *testing.T) {
+ db := newTestSyncWorkerDB(t)
+ w := &SyncWorker{
+ db: db,
+ cfg: SyncWorkerConfig{BatchSize: 10, MaxRetries: 3, AutoScanEnabled: false},
+ jobCh: make(chan syncEnqueueRequest, 1),
+ enqueuedEpisode: make(map[int64]struct{}),
+ }
+
+ insertEpisodeForSyncWorkerTest(t, db, 21, "approved", false)
+ insertSyncLogForSyncWorkerTest(t, db, 21, "pending", 0)
+
+ w.pollAndProcess(context.Background())
+
+ select {
+ case got := <-w.jobCh:
+ if got.episodeID != 21 {
+ t.Fatalf("unexpected episode id: got %d want 21", got.episodeID)
+ }
+ default:
+ t.Fatal("expected pending row to be dispatched with auto scan disabled")
+ }
+}
+
+func TestPollAndProcess_RetriesDueFailuresWhenAutoDiscoveryDisabled(t *testing.T) {
+ db := newTestSyncWorkerDB(t)
+ w := &SyncWorker{
+ db: db,
+ cfg: SyncWorkerConfig{BatchSize: 10, MaxRetries: 3, AutoScanEnabled: false},
+ jobCh: make(chan syncEnqueueRequest, 1),
+ enqueuedEpisode: make(map[int64]struct{}),
+ }
+
+ insertEpisodeForSyncWorkerTest(t, db, 22, "approved", false)
+ insertSyncLogForSyncWorkerTest(t, db, 22, "failed", 1)
+
+ w.pollAndProcess(context.Background())
+
+ latest := latestSyncLogForSyncWorkerTest(t, db, 22)
+ if latest.Status != "pending" {
+ t.Fatalf("latest status = %q, want pending", latest.Status)
+ }
+ select {
+ case got := <-w.jobCh:
+ if got.episodeID != 22 {
+ t.Fatalf("unexpected episode id: got %d want 22", got.episodeID)
+ }
+ default:
+ t.Fatal("expected retryable failure to be dispatched with auto scan disabled")
+ }
+}
+
+func TestPollAndProcess_DiscoversPendingEpisodesWhenAutoScanEnabled(t *testing.T) {
+ db := newTestSyncWorkerDB(t)
+ w := &SyncWorker{
+ db: db,
+ cfg: SyncWorkerConfig{BatchSize: 10, MaxRetries: 3, AutoScanEnabled: true},
+ jobCh: make(chan syncEnqueueRequest, 1),
+ enqueuedEpisode: make(map[int64]struct{}),
+ }
+
+ insertEpisodeForSyncWorkerTest(t, db, 23, "approved", false)
+
+ w.pollAndProcess(context.Background())
+
+ latest := latestSyncLogForSyncWorkerTest(t, db, 23)
+ if latest.Status != "pending" {
+ t.Fatalf("latest status = %q, want pending", latest.Status)
+ }
+ select {
+ case got := <-w.jobCh:
+ if got.episodeID != 23 {
+ t.Fatalf("unexpected episode id: got %d want 23", got.episodeID)
+ }
+ default:
+ t.Fatal("expected auto-discovered episode to be dispatched")
+ }
+}
+
func TestRetryFailedEpisodes_PromotesDueFailureToPendingBeforeDispatch(t *testing.T) {
db := newTestSyncWorkerDB(t)
w := &SyncWorker{
diff --git a/internal/services/transfer_hub.go b/internal/services/transfer_hub.go
index 0272fd0..1a9378d 100644
--- a/internal/services/transfer_hub.go
+++ b/internal/services/transfer_hub.go
@@ -7,6 +7,7 @@ package services
import (
"context"
+ "errors"
"fmt"
"sync"
"time"
@@ -15,6 +16,16 @@ import (
"github.com/coder/websocket/wsjson"
)
+var (
+ // ErrTransferWriteTimeout indicates that a transfer WebSocket write exceeded its deadline.
+ ErrTransferWriteTimeout = errors.New("transfer write timeout")
+)
+
+const (
+ // DefaultTransferWriteTimeout caps device writes when no handler-specific timeout is configured.
+ DefaultTransferWriteTimeout = 10 * time.Second
+)
+
// DeviceEvent represents a single event recorded for a device
type DeviceEvent struct {
Direction string `json:"direction"` // "inbound" or "outbound"
@@ -197,6 +208,12 @@ func (h *TransferHub) ConnectWithStaleThreshold(deviceID string, dc *TransferCon
return h.connectWithStaleThreshold(deviceID, dc, staleThreshold)
}
+// ConnectReplacingExisting registers a transfer connection, replacing any
+// existing connection for the same device and returning it to the caller.
+func (h *TransferHub) ConnectReplacingExisting(deviceID string, dc *TransferConn) *TransferConn {
+ return h.connectReplacingExisting(deviceID, dc)
+}
+
// Disconnect removes a device connection
func (h *TransferHub) Disconnect(deviceID string, dc *TransferConn) bool {
return h.disconnect(deviceID, dc)
@@ -223,18 +240,43 @@ func (h *TransferHub) ListDevices() []DeviceInfo {
return result
}
-// SendToDevice sends a JSON message to a connected device via WebSocket
+// SendToDevice sends a JSON message to a connected device via WebSocket.
func (h *TransferHub) SendToDevice(ctx context.Context, deviceID string, msg map[string]interface{}) error {
+ return h.SendToDeviceWithTimeout(ctx, deviceID, msg, DefaultTransferWriteTimeout)
+}
+
+// SendToDeviceWithTimeout sends a JSON message to a connected device with a bounded write deadline.
+func (h *TransferHub) SendToDeviceWithTimeout(ctx context.Context, deviceID string, msg map[string]interface{}, timeout time.Duration) error {
dc := h.Get(deviceID)
if dc == nil {
return fmt.Errorf("device %s not connected", deviceID)
}
+ return h.SendToConnWithTimeout(ctx, dc, msg, timeout)
+}
+
+// SendToConnWithTimeout sends a JSON message to an existing transfer connection.
+func (h *TransferHub) SendToConnWithTimeout(ctx context.Context, dc *TransferConn, msg map[string]interface{}, timeout time.Duration) error {
+ if dc == nil || dc.Conn == nil {
+ return fmt.Errorf("device not connected")
+ }
+ if ctx == nil {
+ ctx = context.Background()
+ }
+ if timeout <= 0 {
+ timeout = DefaultTransferWriteTimeout
+ }
+
+ writeCtx, cancel := context.WithTimeout(ctx, timeout)
+ defer cancel()
dc.WriteMu.Lock()
defer dc.WriteMu.Unlock()
- if err := wsjson.Write(ctx, dc.Conn, msg); err != nil {
- return fmt.Errorf("failed to send message to device %s: %w", deviceID, err)
+ if err := wsjson.Write(writeCtx, dc.Conn, msg); err != nil {
+ if errors.Is(err, context.DeadlineExceeded) || errors.Is(writeCtx.Err(), context.DeadlineExceeded) {
+ return fmt.Errorf("%w after %s: failed to send message to device %s", ErrTransferWriteTimeout, timeout, dc.DeviceID)
+ }
+ return fmt.Errorf("failed to send message to device %s: %w", dc.DeviceID, err)
}
dc.RecordEvent("outbound", msg)