From 6807f76522d95ac75e24fed5c679f3c90a1a738e Mon Sep 17 00:00:00 2001 From: chao liu Date: Thu, 28 May 2026 18:09:48 +0800 Subject: [PATCH 01/12] fix websocket connection takeover --- internal/api/handlers/axon_rpc.go | 19 +++++-------------- internal/api/handlers/transfer.go | 19 +++++-------------- internal/services/hub.go | 26 ++++++++++++++++++++++++-- internal/services/hub_test.go | 29 +++++++++++++++++++++++++++++ internal/services/recorder_hub.go | 24 ++++++++++++++++++++---- internal/services/transfer_hub.go | 6 ++++++ 6 files changed, 89 insertions(+), 34 deletions(-) diff --git a/internal/api/handlers/axon_rpc.go b/internal/api/handlers/axon_rpc.go index 7f9ccdd..783670d 100644 --- a/internal/api/handlers/axon_rpc.go +++ b/internal/api/handlers/axon_rpc.go @@ -156,17 +156,8 @@ func (h *RecorderHandler) HandleWebSocket(w http.ResponseWriter, r *http.Request remoteIP := extractIP(r.RemoteAddr) rc := h.hub.NewRecorderConn(conn, deviceID, remoteIP) - staleConn, ok := h.hub.ConnectWithStaleThreshold(deviceID, rc, recorderStaleThreshold(h.cfg)) - if !ok { - logger.Printf("[RECORDER] Device %s: connection rejected (already connected)", deviceID) - if err := conn.Close(websocket.StatusPolicyViolation, "device already connected"); err != nil { - if !isExpectedWebSocketCloseError(err) { - logger.Printf("[RECORDER] Device %s: WebSocket close error: %v", deviceID, err) - } - } - return - } - closeStaleRecorderConn(deviceID, staleConn) + replacedConn := h.hub.ConnectReplacingExisting(deviceID, rc) + closeReplacedRecorderConn(deviceID, replacedConn) defer func() { if err := conn.Close(websocket.StatusNormalClosure, ""); err != nil { @@ -815,14 +806,14 @@ func recorderStaleThreshold(cfg *config.RecorderConfig) time.Duration { return time.Duration(cfg.StaleThreshold) * time.Second } -func closeStaleRecorderConn(deviceID string, rc *services.RecorderConn) { +func closeReplacedRecorderConn(deviceID string, rc *services.RecorderConn) { if rc == nil || rc.Conn == nil { return } - logger.Printf("[RECORDER] Device %s: closing stale WebSocket connection", deviceID) + logger.Printf("[RECORDER] Device %s: closing replaced WebSocket connection", deviceID) if err := rc.Conn.CloseNow(); err != nil { if !isExpectedWebSocketCloseError(err) { - logger.Printf("[RECORDER] Device %s: stale WebSocket close error: %v", deviceID, err) + logger.Printf("[RECORDER] Device %s: replaced WebSocket close error: %v", deviceID, err) } } } diff --git a/internal/api/handlers/transfer.go b/internal/api/handlers/transfer.go index b282db7..232cc69 100644 --- a/internal/api/handlers/transfer.go +++ b/internal/api/handlers/transfer.go @@ -149,17 +149,8 @@ func (h *TransferHandler) HandleWebSocket(w http.ResponseWriter, r *http.Request remoteIP := extractIP(r.RemoteAddr) dc := h.hub.NewTransferConn(conn, deviceID, remoteIP) - staleConn, ok := h.hub.ConnectWithStaleThreshold(deviceID, dc, transferStaleThreshold(h.cfg)) - if !ok { - logger.Printf("[TRANSFER] Device %s: connection rejected (already connected)", deviceID) - if err := conn.Close(websocket.StatusPolicyViolation, "device already connected"); err != nil { - if !isExpectedWebSocketCloseError(err) { - logger.Printf("[TRANSFER] WebSocket close error for device %s: %v", deviceID, err) - } - } - return - } - closeStaleTransferConn(deviceID, staleConn) + replacedConn := h.hub.ConnectReplacingExisting(deviceID, dc) + closeReplacedTransferConn(deviceID, replacedConn) defer func() { if err := conn.Close(websocket.StatusNormalClosure, ""); err != nil { @@ -267,14 +258,14 @@ func transferStaleThreshold(cfg *config.TransferConfig) time.Duration { return time.Duration(cfg.StaleThreshold) * time.Second } -func closeStaleTransferConn(deviceID string, dc *services.TransferConn) { +func closeReplacedTransferConn(deviceID string, dc *services.TransferConn) { if dc == nil || dc.Conn == nil { return } - logger.Printf("[TRANSFER] Device %s: closing stale WebSocket connection", deviceID) + logger.Printf("[TRANSFER] Device %s: closing replaced WebSocket connection", deviceID) if err := dc.Conn.CloseNow(); err != nil { if !isExpectedWebSocketCloseError(err) { - logger.Printf("[TRANSFER] Device %s: stale WebSocket close error: %v", deviceID, err) + logger.Printf("[TRANSFER] Device %s: replaced WebSocket close error: %v", deviceID, err) } } } diff --git a/internal/services/hub.go b/internal/services/hub.go index ee7e30b..c8516cb 100644 --- a/internal/services/hub.go +++ b/internal/services/hub.go @@ -91,9 +91,31 @@ func (h *Hub[T]) connectWithStaleThreshold(deviceID string, conn T, staleThresho return zero, true } +// connectReplacingExisting registers conn under deviceID. If another connection +// is already registered for the same device, the new connection takes over and +// the previous connection is returned so the caller can close it outside the hub +// lock. +func (h *Hub[T]) connectReplacingExisting(deviceID string, conn T) T { + var replaced T + + h.mu.Lock() + defer h.mu.Unlock() + + if old, exists := h.connections[deviceID]; exists { + if old.GetWSConn() != conn.GetWSConn() { + replaced = old + logger.Printf("[%s] Hub: replacing existing connection for device %s", h.label, deviceID) + } + } + + h.connections[deviceID] = conn + logger.Printf("[%s] Hub: device %s registered, total connections=%d", h.label, deviceID, len(h.connections)) + return replaced +} + // disconnect removes the connection for deviceID only if it matches conn. -// This avoids a stale handler goroutine deleting a newer connection after -// rejecting takeover. Returns true if an entry was removed. +// This avoids an old handler goroutine deleting a newer connection after +// replacement. Returns true if an entry was removed. func (h *Hub[T]) disconnect(deviceID string, conn T) bool { h.mu.Lock() defer h.mu.Unlock() diff --git a/internal/services/hub_test.go b/internal/services/hub_test.go index 4287f4e..7c2bb70 100644 --- a/internal/services/hub_test.go +++ b/internal/services/hub_test.go @@ -69,3 +69,32 @@ func TestRecorderHubConnectWithStaleThresholdReplacesStaleConnection(t *testing. t.Fatalf("hub connection=%p want nil", got) } } + +func TestRecorderHubConnectReplacingExistingTakesOverFreshConnection(t *testing.T) { + hub := NewRecorderHub() + deviceID := "robot-001" + + oldConn := hub.NewRecorderConn(&websocket.Conn{}, deviceID, "127.0.0.1") + oldConn.LastSeenAt = time.Now() + if !hub.Connect(deviceID, oldConn) { + t.Fatalf("initial connect failed") + } + + newConn := hub.NewRecorderConn(&websocket.Conn{}, deviceID, "127.0.0.1") + replaced := hub.ConnectReplacingExisting(deviceID, newConn) + if replaced != oldConn { + t.Fatalf("replaced=%p want oldConn=%p", replaced, oldConn) + } + if got := hub.Get(deviceID); got != newConn { + t.Fatalf("hub connection=%p want newConn=%p", got, newConn) + } + if hub.Disconnect(deviceID, oldConn) { + t.Fatalf("old connection disconnected the current hub entry") + } + if got := hub.Get(deviceID); got != newConn { + t.Fatalf("old disconnect changed current hub connection") + } + if !hub.Disconnect(deviceID, newConn) { + t.Fatalf("new connection did not disconnect") + } +} diff --git a/internal/services/recorder_hub.go b/internal/services/recorder_hub.go index 902989e..8e15b4b 100644 --- a/internal/services/recorder_hub.go +++ b/internal/services/recorder_hub.go @@ -149,14 +149,32 @@ func (h *RecorderHub) ConnectWithStaleThreshold(deviceID string, rc *RecorderCon return h.connectWithStaleThreshold(deviceID, rc, staleThreshold) } +// ConnectReplacingExisting registers a recorder connection, replacing any +// existing connection for the same device and returning it to the caller. +func (h *RecorderHub) ConnectReplacingExisting(deviceID string, rc *RecorderConn) *RecorderConn { + replaced := h.connectReplacingExisting(deviceID, rc) + if replaced != nil { + h.failPendingRPCs(replaced, ErrRecorderNotConnected.Error()) + } + return replaced +} + // Disconnect removes a recorder connection and drains any pending RPC waiters. func (h *RecorderHub) Disconnect(deviceID string, rc *RecorderConn) bool { if !h.disconnect(deviceID, rc) { return false } + h.failPendingRPCs(rc, ErrRecorderNotConnected.Error()) + return true +} + +func (h *RecorderHub) failPendingRPCs(rc *RecorderConn, message string) { + if rc == nil { + return + } - // Unblock any goroutines waiting for an RPC response from this device. rc.PendingMu.Lock() + defer rc.PendingMu.Unlock() for requestID, pending := range rc.Pending { delete(rc.Pending, requestID) // Non-blocking send: the waiter may have already timed out. @@ -164,13 +182,11 @@ func (h *RecorderHub) Disconnect(deviceID string, rc *RecorderConn) bool { case pending.ResponseC <- &RPCResponse{ RequestID: requestID, Success: false, - Message: ErrRecorderNotConnected.Error(), + Message: message, }: default: } } - rc.PendingMu.Unlock() - return true } // Get returns the recorder connection for a device, or nil if not connected. diff --git a/internal/services/transfer_hub.go b/internal/services/transfer_hub.go index 0272fd0..dfb778f 100644 --- a/internal/services/transfer_hub.go +++ b/internal/services/transfer_hub.go @@ -197,6 +197,12 @@ func (h *TransferHub) ConnectWithStaleThreshold(deviceID string, dc *TransferCon return h.connectWithStaleThreshold(deviceID, dc, staleThreshold) } +// ConnectReplacingExisting registers a transfer connection, replacing any +// existing connection for the same device and returning it to the caller. +func (h *TransferHub) ConnectReplacingExisting(deviceID string, dc *TransferConn) *TransferConn { + return h.connectReplacingExisting(deviceID, dc) +} + // Disconnect removes a device connection func (h *TransferHub) Disconnect(deviceID string, dc *TransferConn) bool { return h.disconnect(deviceID, dc) From b4c07364b5d90ce70946dd142c0e30d396a61bce Mon Sep 17 00:00:00 2001 From: chao liu Date: Thu, 28 May 2026 19:16:47 +0800 Subject: [PATCH 02/12] reconcile recorder state after reconnect --- internal/api/handlers/axon_rpc.go | 105 ++++++++++++++++-- .../api/handlers/task_state_recovery_test.go | 41 +++++++ 2 files changed, 135 insertions(+), 11 deletions(-) diff --git a/internal/api/handlers/axon_rpc.go b/internal/api/handlers/axon_rpc.go index 783670d..934e207 100644 --- a/internal/api/handlers/axon_rpc.go +++ b/internal/api/handlers/axon_rpc.go @@ -177,6 +177,7 @@ func (h *RecorderHandler) HandleWebSocket(w http.ResponseWriter, r *http.Request // #nosec G706 -- Set aside for now logger.Printf("[RECORDER] Recorder %s connected from %s", deviceID, remoteIP) + go h.syncRecorderStateFromDevice(rc) for { _, raw, err := conn.Read(ctx) @@ -484,7 +485,7 @@ func (h *RecorderHandler) GetStats(c *gin.Context) { return } - response, err := h.hub.SendRPC(c.Request.Context(), deviceID, "get_stats", nil, time.Duration(h.cfg.ResponseTimeout)*time.Second) + response, err := h.hub.SendRPC(c.Request.Context(), deviceID, "get_stats", nil, recorderRPCResponseTimeout(h.cfg)) if err != nil { switch { case errors.Is(err, services.ErrRecorderNotConnected): @@ -521,7 +522,7 @@ func (h *RecorderHandler) callRPC(c *gin.Context, action string, params map[stri return false } - response, err := h.hub.SendRPC(c.Request.Context(), deviceID, action, params, time.Duration(h.cfg.ResponseTimeout)*time.Second) + response, err := h.hub.SendRPC(c.Request.Context(), deviceID, action, params, recorderRPCResponseTimeout(h.cfg)) if err != nil { switch { case errors.Is(err, services.ErrRecorderNotConnected): @@ -648,13 +649,84 @@ func (h *RecorderHandler) handleStateUpdate(rc *services.RecorderConn, msg map[s TaskID: stringValue(data, "task_id"), Raw: data, } + h.applyRecorderStateSnapshot(rc, state, "state_update") +} + +func (h *RecorderHandler) syncRecorderStateFromDevice(rc *services.RecorderConn) { + if h == nil || h.hub == nil || rc == nil || strings.TrimSpace(rc.DeviceID) == "" { + return + } + if h.hub.Get(rc.DeviceID) != rc { + return + } + + timeout := recorderRPCResponseTimeout(h.cfg) + ctx, cancel := context.WithTimeout(context.Background(), timeout) + defer cancel() + + response, err := h.hub.SendRPC(ctx, rc.DeviceID, "get_state", nil, timeout) + if err != nil { + if !errors.Is(err, services.ErrRecorderNotConnected) { + logger.Printf("[RECORDER] Recorder %s get_state after connect failed: %v", rc.DeviceID, err) + } + return + } + if response == nil || !response.Success { + message := "unsuccessful response" + if response != nil && strings.TrimSpace(response.Message) != "" { + message = response.Message + } + logger.Printf("[RECORDER] Recorder %s get_state after connect skipped: %s", rc.DeviceID, message) + return + } + if h.hub.Get(rc.DeviceID) != rc { + return + } + + state := recorderStateFromRPCData(response.Data) + if strings.TrimSpace(state.CurrentState) == "" { + logger.Printf("[RECORDER] Recorder %s get_state after connect returned empty state", rc.DeviceID) + return + } + h.applyRecorderStateSnapshot(rc, state, "get_state") +} + +func (h *RecorderHandler) applyRecorderStateSnapshot(rc *services.RecorderConn, state services.RecorderState, source string) { + if rc == nil { + return + } rc.UpdateState(state) - h.reconcileRecorderTaskState(rc.DeviceID, state) - // #nosec G706 -- Set aside for now - logger.Printf("[RECORDER] Recorder %s state=%s task=%s", rc.DeviceID, state.CurrentState, state.TaskID) + h.reconcileRecorderTaskState(rc.DeviceID, state, source) + if source == "state_update" { + // #nosec G706 -- Set aside for now + logger.Printf("[RECORDER] Recorder %s state=%s task=%s", rc.DeviceID, state.CurrentState, state.TaskID) + return + } + logger.Printf("[RECORDER] Recorder %s state=%s task=%s source=%s", rc.DeviceID, state.CurrentState, state.TaskID, source) } -func (h *RecorderHandler) reconcileRecorderTaskState(deviceID string, state services.RecorderState) { +func recorderStateFromRPCData(data map[string]interface{}) services.RecorderState { + state := services.RecorderState{ + CurrentState: firstNonEmptyString(data, "state", "current", "current_state"), + TaskID: firstNonEmptyString(data, "task_id"), + Raw: data, + } + if strings.TrimSpace(state.TaskID) == "" { + state.TaskID = stringValue(mapValue(data, "task_config"), "task_id") + } + return state +} + +func firstNonEmptyString(m map[string]interface{}, keys ...string) string { + for _, key := range keys { + if value := strings.TrimSpace(stringValue(m, key)); value != "" { + return value + } + } + return "" +} + +func (h *RecorderHandler) reconcileRecorderTaskState(deviceID string, state services.RecorderState, source string) { if h.db == nil { return } @@ -663,17 +735,21 @@ func (h *RecorderHandler) reconcileRecorderTaskState(deviceID string, state serv return } - switch strings.TrimSpace(state.CurrentState) { + if strings.TrimSpace(source) == "" { + source = "state_update" + } + currentState := strings.ToLower(strings.TrimSpace(state.CurrentState)) + switch currentState { case "ready": - advanceTaskPendingToReady(h.db, deviceID, taskID, "state_update ready") - case "recording": + advanceTaskPendingToReady(h.db, deviceID, taskID, source+" ready") + case "recording", "paused": res, err := advanceTaskPendingOrReadyToInProgress(h.db, taskID) if err != nil { - logger.Printf("[RECORDER] Recorder %s: failed to advance task pending/ready->in_progress after state_update recording: task=%s err=%v", deviceID, taskID, err) + logger.Printf("[RECORDER] Recorder %s: failed to advance task pending/ready->in_progress after %s %s: task=%s err=%v", deviceID, source, currentState, taskID, err) return } if n, _ := res.RowsAffected(); n > 0 { - logger.Printf("[RECORDER] Device %s: task status reconciled: task=%s source=state_update_recording status=in_progress", deviceID, taskID) + logger.Printf("[RECORDER] Device %s: task status reconciled: task=%s source=%s_%s status=in_progress", deviceID, taskID, source, currentState) } } } @@ -785,6 +861,13 @@ func (h *RecorderHandler) pingLoop(ctx context.Context, rc *services.RecorderCon } } +func recorderRPCResponseTimeout(cfg *config.RecorderConfig) time.Duration { + if cfg == nil || cfg.ResponseTimeout <= 0 { + return 15 * time.Second + } + return time.Duration(cfg.ResponseTimeout) * time.Second +} + func recorderPingInterval(cfg *config.RecorderConfig) time.Duration { if cfg == nil || cfg.PingInterval <= 0 { return 0 diff --git a/internal/api/handlers/task_state_recovery_test.go b/internal/api/handlers/task_state_recovery_test.go index bb1f17e..d1acc92 100644 --- a/internal/api/handlers/task_state_recovery_test.go +++ b/internal/api/handlers/task_state_recovery_test.go @@ -59,6 +59,47 @@ func TestRecorderStateUpdateRecordingAdvancesPendingTask(t *testing.T) { assertTaskStateRecoveryTimestampSet(t, db, "task-recording", "started_at") } +func TestRecorderStateUpdatePausedAdvancesPendingTask(t *testing.T) { + db := newTaskStateRecoveryDB(t) + defer db.Close() + seedTaskStateRecoveryTask(t, db, "task-paused", "pending") + + hub := services.NewRecorderHub() + handler := NewRecorderHandler(hub, &config.RecorderConfig{}, db) + rc := hub.NewRecorderConn(nil, "robot-001", "127.0.0.1") + + handler.handleStateUpdate(rc, map[string]interface{}{ + "data": map[string]interface{}{ + "current": "paused", + "task_id": "task-paused", + }, + }) + + assertTaskStateRecoveryStatus(t, db, "task-paused", "in_progress") + assertTaskStateRecoveryTimestampSet(t, db, "task-paused", "started_at") +} + +func TestRecorderGetStateSnapshotReadyRestoresPendingTask(t *testing.T) { + db := newTaskStateRecoveryDB(t) + defer db.Close() + seedTaskStateRecoveryTask(t, db, "task-rpc-ready", "pending") + + hub := services.NewRecorderHub() + handler := NewRecorderHandler(hub, &config.RecorderConfig{}, db) + rc := hub.NewRecorderConn(nil, "robot-001", "127.0.0.1") + state := recorderStateFromRPCData(map[string]interface{}{ + "state": "ready", + "task_config": map[string]interface{}{ + "task_id": "task-rpc-ready", + }, + }) + + handler.applyRecorderStateSnapshot(rc, state, "get_state") + + assertTaskStateRecoveryStatus(t, db, "task-rpc-ready", "ready") + assertTaskStateRecoveryTimestampSet(t, db, "task-rpc-ready", "ready_at") +} + func TestRecordingStartCallbackAdvancesPendingTask(t *testing.T) { db := newTaskStateRecoveryDB(t) defer db.Close() From 09c46481318fdb3b24de12b76f279f9e8e48c72b Mon Sep 17 00:00:00 2001 From: chao liu Date: Thu, 28 May 2026 19:33:51 +0800 Subject: [PATCH 03/12] log upload not found without retry --- internal/api/handlers/transfer.go | 151 ++++-------------------------- 1 file changed, 17 insertions(+), 134 deletions(-) diff --git a/internal/api/handlers/transfer.go b/internal/api/handlers/transfer.go index 232cc69..c64bb88 100644 --- a/internal/api/handlers/transfer.go +++ b/internal/api/handlers/transfer.go @@ -45,9 +45,6 @@ type TransferHandler struct { // recorderHub is used on transfer disconnect to notify recorder (clear/cancel) before reverting tasks. recorderHub *services.RecorderHub recorderRPCTimeout time.Duration - - uploadNotFoundMu sync.Mutex - uploadNotFoundAttempts map[string]int } // NewTransferHandler creates a new TransferHandler. @@ -55,50 +52,20 @@ type TransferHandler struct { // recorderHub may be nil (disables recorder RPC on transfer disconnect). func NewTransferHandler(hub *services.TransferHub, cfg *config.TransferConfig, db *sqlx.DB, s3Client *s3.Client, bucket string, factoryID string, recorderHub *services.RecorderHub, recorderRPCTimeout time.Duration) *TransferHandler { return &TransferHandler{ - hub: hub, - cfg: cfg, - db: db, - s3: s3Client, - bucket: bucket, - factoryID: factoryID, - recorderHub: recorderHub, - recorderRPCTimeout: recorderRPCTimeout, - uploadNotFoundAttempts: make(map[string]int), + hub: hub, + cfg: cfg, + db: db, + s3: s3Client, + bucket: bucket, + factoryID: factoryID, + recorderHub: recorderHub, + recorderRPCTimeout: recorderRPCTimeout, client: &http.Client{ Timeout: 10 * time.Second, }, } } -func uploadNotFoundKey(deviceID, taskID string) string { - return deviceID + "|" + taskID -} - -func (h *TransferHandler) clearUploadNotFoundAttempts(deviceID, taskID string) { - if strings.TrimSpace(deviceID) == "" || strings.TrimSpace(taskID) == "" { - return - } - key := uploadNotFoundKey(deviceID, taskID) - h.uploadNotFoundMu.Lock() - delete(h.uploadNotFoundAttempts, key) - h.uploadNotFoundMu.Unlock() -} - -func (h *TransferHandler) clearUploadNotFoundAttemptsByDevice(deviceID string) { - deviceID = strings.TrimSpace(deviceID) - if deviceID == "" { - return - } - prefix := deviceID + "|" - h.uploadNotFoundMu.Lock() - for key := range h.uploadNotFoundAttempts { - if strings.HasPrefix(key, prefix) { - delete(h.uploadNotFoundAttempts, key) - } - } - h.uploadNotFoundMu.Unlock() -} - // RegisterRoutes registers all transfer-related REST routes func (h *TransferHandler) RegisterRoutes(apiV1 *gin.RouterGroup) { // Note: apiV1 is already /transfer (set by server.go) @@ -159,7 +126,6 @@ func (h *TransferHandler) HandleWebSocket(w http.ResponseWriter, r *http.Request } } }() - defer h.clearUploadNotFoundAttemptsByDevice(deviceID) defer func() { if h.hub.Disconnect(deviceID, dc) { revertRunnableTasksOnDeviceDisconnect(h.db, deviceID, h.recorderHub, h.recorderRPCTimeout, true) @@ -321,7 +287,6 @@ func (h *TransferHandler) onUploadStarted(dc *services.TransferConn, msg map[str return } taskID := stringVal(data, "task_id") - h.clearUploadNotFoundAttempts(dc.DeviceID, taskID) // #nosec G706 -- Set aside for now logger.Printf("[TRANSFER] Device %s: upload started task=%s total_bytes=%d", dc.DeviceID, taskID, int64Val(data, "total_bytes")) @@ -403,7 +368,6 @@ func (h *TransferHandler) onUploadComplete(ctx context.Context, dc *services.Tra logger.Printf("[TRANSFER] Device %s: upload complete taskID is empty", dc.DeviceID) return } - h.clearUploadNotFoundAttempts(dc.DeviceID, taskID) // #nosec G706 -- Set aside for now logger.Printf("[TRANSFER] Device %s: upload complete for task=%s", dc.DeviceID, taskID) @@ -717,7 +681,6 @@ func (h *TransferHandler) onUploadFailed(ctx context.Context, dc *services.Trans taskID := stringVal(data, "task_id") reason := stringVal(data, "reason") retryCount := intVal(data, "retry_count") - h.clearUploadNotFoundAttempts(dc.DeviceID, taskID) // Log full message for debugging // #nosec G706 -- Set aside for now @@ -881,102 +844,22 @@ func revertRunnableTasksOnDeviceDisconnect(db *sqlx.DB, deviceID string, recorde } } -// onUploadNotFound handles "upload_not_found" message +// onUploadNotFound handles "upload_not_found" message. Keystone only logs this +// report; it does not retry upload_request because the transfer side owns local +// file discovery and retry policy. func (h *TransferHandler) onUploadNotFound(dc *services.TransferConn, msg map[string]interface{}) { data, _ := msg["data"].(map[string]interface{}) if data == nil { + logger.Printf("[TRANSFER] ERROR: Device %s reported upload_not_found with empty data", dc.DeviceID) return } - taskID := stringVal(data, "task_id") - detail := stringVal(data, "detail") - if strings.TrimSpace(taskID) == "" { - return - } - - const maxAttempts = 5 - key := uploadNotFoundKey(dc.DeviceID, taskID) - - h.uploadNotFoundMu.Lock() - attempt := h.uploadNotFoundAttempts[key] + 1 - h.uploadNotFoundAttempts[key] = attempt - h.uploadNotFoundMu.Unlock() - - // #nosec G706 -- Set aside for now - logger.Printf("[TRANSFER] Device %s: task=%s not found (attempt=%d/%d) detail=%q", dc.DeviceID, taskID, attempt, maxAttempts, detail) - - if attempt < maxAttempts { - // Simple backoff: 1s, 2s, 3s, 4s (capped). - delay := time.Duration(attempt) * time.Second - if delay > 5*time.Second { - delay = 5 * time.Second - } - - go func(deviceID, tid string, attemptNow int, d time.Duration) { - time.Sleep(d) - if h.hub.Get(deviceID) == nil { - logger.Printf("[TRANSFER] Device %s: skip retry upload_request for task=%s (device not connected)", deviceID, tid) - return - } - - keyNow := uploadNotFoundKey(deviceID, tid) - h.uploadNotFoundMu.Lock() - currentAttempt, ok := h.uploadNotFoundAttempts[keyNow] - h.uploadNotFoundMu.Unlock() - if !ok || currentAttempt != attemptNow { - logger.Printf("[TRANSFER] Device %s: skip stale retry upload_request for task=%s attempt=%d", deviceID, tid, attemptNow) - return - } - - req := map[string]interface{}{ - "type": "upload_request", - "task_id": tid, - "priority": 1, - } - if err := h.hub.SendToDevice(context.Background(), deviceID, req); err != nil { - logger.Printf("[TRANSFER] Device %s: retry upload_request failed for task=%s attempt=%d err=%v", deviceID, tid, attemptNow, err) - return - } - logger.Printf("[TRANSFER] Device %s: retry upload_request sent for task=%s attempt=%d", deviceID, tid, attemptNow) - }(dc.DeviceID, taskID, attempt, delay) - return - } - - // Reached max attempts: stop retrying and mark task as failed (best-effort). - h.uploadNotFoundMu.Lock() - delete(h.uploadNotFoundAttempts, key) - h.uploadNotFoundMu.Unlock() - - if h.db == nil { - return - } - now := time.Now().UTC() - detail = strings.TrimSpace(detail) - errMsg := fmt.Sprintf("upload_not_found after %d attempts", maxAttempts) - if detail != "" { - errMsg = fmt.Sprintf("%s: %s", errMsg, detail) - } - result, err := h.db.ExecContext(context.Background(), ` - UPDATE tasks - SET - status = 'failed', - error_message = ?, - completed_at = CASE WHEN completed_at IS NULL THEN ? ELSE completed_at END, - updated_at = ? - WHERE task_id = ? AND status = 'in_progress' AND deleted_at IS NULL - `, errMsg, now, now, taskID) - if err != nil { - logger.Printf("[TRANSFER] Device %s: failed to mark task failed on upload_not_found: task=%s err=%v", dc.DeviceID, taskID, err) + taskID := strings.TrimSpace(stringVal(data, "task_id")) + detail := strings.TrimSpace(stringVal(data, "detail")) + if taskID == "" { + logger.Printf("[TRANSFER] ERROR: Device %s reported upload_not_found without task_id detail=%q", dc.DeviceID, detail) return } - if rows, _ := result.RowsAffected(); rows > 0 { - logger.Printf("[TRANSFER] Device %s: task=%s marked as failed due to upload_not_found", dc.DeviceID, taskID) - var batchID int64 - if err := h.db.QueryRowContext(context.Background(), - "SELECT batch_id FROM tasks WHERE task_id = ? AND deleted_at IS NULL", taskID, - ).Scan(&batchID); err == nil && batchID > 0 { - go tryAdvanceBatchStatus(h.db, batchID) - } - } + logger.Printf("[TRANSFER] ERROR: Device %s reported upload_not_found task=%s detail=%q", dc.DeviceID, taskID, detail) } // onStatus handles "status" message and updates the device status snapshot From 7ce9d625543a7191602775f49cd0ebde2827bea9 Mon Sep 17 00:00:00 2001 From: chao liu Date: Thu, 28 May 2026 19:50:47 +0800 Subject: [PATCH 04/12] guard recorder reconnect state changes --- internal/api/handlers/axon_rpc.go | 7 +- .../api/handlers/task_state_recovery_test.go | 170 ++++++++++++++++++ internal/services/hub_test.go | 100 +++++++++++ internal/services/recorder_hub.go | 15 +- 4 files changed, 288 insertions(+), 4 deletions(-) diff --git a/internal/api/handlers/axon_rpc.go b/internal/api/handlers/axon_rpc.go index 934e207..bda9556 100644 --- a/internal/api/handlers/axon_rpc.go +++ b/internal/api/handlers/axon_rpc.go @@ -536,7 +536,7 @@ func (h *RecorderHandler) callRPC(c *gin.Context, action string, params map[stri } c.JSON(http.StatusOK, response) - return true + return response != nil && response.Success } func (h *RecorderHandler) bindOptionalParams(c *gin.Context) (map[string]interface{}, bool) { @@ -608,6 +608,11 @@ func recorderPreviousStateFromRaw(raw map[string]interface{}) string { } func (h *RecorderHandler) handleMessage(deviceID string, rc *services.RecorderConn, msg map[string]interface{}) { + if h.hub.Get(deviceID) != rc { + logger.Printf("[RECORDER] Recorder %s ignored message from replaced connection", deviceID) + return + } + msgType, _ := msg["type"].(string) switch msgType { case "rpc_response": diff --git a/internal/api/handlers/task_state_recovery_test.go b/internal/api/handlers/task_state_recovery_test.go index d1acc92..073734e 100644 --- a/internal/api/handlers/task_state_recovery_test.go +++ b/internal/api/handlers/task_state_recovery_test.go @@ -6,14 +6,18 @@ package handlers import ( "bytes" + "context" "encoding/json" "net/http" "net/http/httptest" + "strings" "testing" "time" "archebase.com/keystone-edge/internal/config" "archebase.com/keystone-edge/internal/services" + "github.com/coder/websocket" + "github.com/coder/websocket/wsjson" "github.com/gin-gonic/gin" "github.com/jmoiron/sqlx" _ "modernc.org/sqlite" @@ -100,6 +104,108 @@ func TestRecorderGetStateSnapshotReadyRestoresPendingTask(t *testing.T) { assertTaskStateRecoveryTimestampSet(t, db, "task-rpc-ready", "ready_at") } +func TestRecorderConfigDoesNotAdvanceTaskWhenRPCResponseUnsuccessful(t *testing.T) { + db := newTaskStateRecoveryDB(t) + defer db.Close() + seedTaskStateRecoveryTask(t, db, "task-config-false", "pending") + + hub := services.NewRecorderHub() + attachRecorderRPCResponder(t, hub, "robot-001", func(req services.RPCRequest) services.RPCResponse { + return services.RPCResponse{Success: false, Message: "device rejected config"} + }) + handler := NewRecorderHandler(hub, &config.RecorderConfig{ResponseTimeout: 1}, db) + + gin.SetMode(gin.TestMode) + router := gin.New() + router.POST("/recorder/:device_id/config", handler.Config) + body := []byte(`{"task_config":{"task_id":"task-config-false"}}`) + req := httptest.NewRequest(http.MethodPost, "/recorder/robot-001/config", bytes.NewReader(body)) + req.Header.Set("Content-Type", "application/json") + w := httptest.NewRecorder() + + router.ServeHTTP(w, req) + + if w.Code != http.StatusOK { + t.Fatalf("status=%d want=%d body=%s", w.Code, http.StatusOK, w.Body.String()) + } + assertTaskStateRecoveryStatus(t, db, "task-config-false", "pending") +} + +func TestRecorderBeginDoesNotAdvanceTaskWhenRPCResponseUnsuccessful(t *testing.T) { + db := newTaskStateRecoveryDB(t) + defer db.Close() + seedTaskStateRecoveryTask(t, db, "task-begin-false", "pending") + + hub := services.NewRecorderHub() + attachRecorderRPCResponder(t, hub, "robot-001", func(req services.RPCRequest) services.RPCResponse { + return services.RPCResponse{Success: false, Message: "device rejected begin"} + }) + handler := NewRecorderHandler(hub, &config.RecorderConfig{ResponseTimeout: 1}, db) + + gin.SetMode(gin.TestMode) + router := gin.New() + router.POST("/recorder/:device_id/begin", handler.Begin) + body := []byte(`{"task_id":"task-begin-false"}`) + req := httptest.NewRequest(http.MethodPost, "/recorder/robot-001/begin", bytes.NewReader(body)) + req.Header.Set("Content-Type", "application/json") + w := httptest.NewRecorder() + + router.ServeHTTP(w, req) + + if w.Code != http.StatusOK { + t.Fatalf("status=%d want=%d body=%s", w.Code, http.StatusOK, w.Body.String()) + } + assertTaskStateRecoveryStatus(t, db, "task-begin-false", "pending") +} + +func TestRecorderIgnoresStateChangingMessagesFromReplacedConnection(t *testing.T) { + tests := []struct { + name string + msg map[string]interface{} + }{ + { + name: "state_update", + msg: map[string]interface{}{ + "type": "state_update", + "data": map[string]interface{}{ + "current": "ready", + "task_id": "task-replaced", + }, + }, + }, + { + name: "config_applied", + msg: map[string]interface{}{ + "type": "config_applied", + "data": map[string]interface{}{ + "task_id": "task-replaced", + }, + }, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + db := newTaskStateRecoveryDB(t) + defer db.Close() + seedTaskStateRecoveryTask(t, db, "task-replaced", "pending") + + hub := services.NewRecorderHub() + handler := NewRecorderHandler(hub, &config.RecorderConfig{}, db) + oldConn := hub.NewRecorderConn(&websocket.Conn{}, "robot-001", "127.0.0.1") + if !hub.Connect("robot-001", oldConn) { + t.Fatalf("initial connect failed") + } + newConn := hub.NewRecorderConn(&websocket.Conn{}, "robot-001", "127.0.0.1") + hub.ConnectReplacingExisting("robot-001", newConn) + + handler.handleMessage("robot-001", oldConn, tt.msg) + + assertTaskStateRecoveryStatus(t, db, "task-replaced", "pending") + }) + } +} + func TestRecordingStartCallbackAdvancesPendingTask(t *testing.T) { db := newTaskStateRecoveryDB(t) defer db.Close() @@ -191,3 +297,67 @@ func assertTaskStateRecoveryTimestampSet(t *testing.T, db *sqlx.DB, taskID strin t.Fatalf("task %s was not set", column) } } + +func attachRecorderRPCResponder(t *testing.T, hub *services.RecorderHub, deviceID string, respond func(services.RPCRequest) services.RPCResponse) { + t.Helper() + serverConn, clientConn := newRecorderHandlerTestWebSocketPair(t) + rc := hub.NewRecorderConn(serverConn, deviceID, "127.0.0.1") + if !hub.Connect(deviceID, rc) { + t.Fatalf("connect recorder: initial connect failed") + } + + ctx, cancel := context.WithCancel(context.Background()) + t.Cleanup(cancel) + go func() { + for { + var req services.RPCRequest + if err := wsjson.Read(ctx, clientConn, &req); err != nil { + return + } + resp := respond(req) + if resp.Type == "" { + resp.Type = "rpc_response" + } + if resp.RequestID == "" { + resp.RequestID = req.RequestID + } + hub.HandleRPCResponse(deviceID, &resp) + } + }() +} + +func newRecorderHandlerTestWebSocketPair(t *testing.T) (*websocket.Conn, *websocket.Conn) { + t.Helper() + serverConnC := make(chan *websocket.Conn, 1) + errC := make(chan error, 1) + + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + conn, err := websocket.Accept(w, r, &websocket.AcceptOptions{InsecureSkipVerify: true}) + if err != nil { + errC <- err + return + } + serverConnC <- conn + <-r.Context().Done() + })) + t.Cleanup(server.Close) + + clientConn, _, err := websocket.Dial(context.Background(), "ws"+strings.TrimPrefix(server.URL, "http"), nil) + if err != nil { + t.Fatalf("dial websocket: %v", err) + } + + select { + case serverConn := <-serverConnC: + t.Cleanup(func() { + _ = clientConn.CloseNow() + _ = serverConn.CloseNow() + }) + return serverConn, clientConn + case err := <-errC: + t.Fatalf("accept websocket: %v", err) + case <-time.After(time.Second): + t.Fatalf("timed out waiting for websocket accept") + } + return nil, nil +} diff --git a/internal/services/hub_test.go b/internal/services/hub_test.go index 7c2bb70..251200b 100644 --- a/internal/services/hub_test.go +++ b/internal/services/hub_test.go @@ -5,6 +5,12 @@ package services import ( + "context" + "errors" + "fmt" + "net/http" + "net/http/httptest" + "strings" "testing" "time" @@ -98,3 +104,97 @@ func TestRecorderHubConnectReplacingExistingTakesOverFreshConnection(t *testing. t.Fatalf("new connection did not disconnect") } } + +func TestRecorderHubConnectReplacingExistingFailsPendingRPCWaitersWithError(t *testing.T) { + hub := NewRecorderHub() + deviceID := "robot-001" + serverConn, clientConn := newRecorderHubTestWebSocketPair(t) + + ctx, cancel := context.WithCancel(context.Background()) + t.Cleanup(cancel) + go func() { + _, _, _ = clientConn.Read(ctx) + }() + + oldConn := hub.NewRecorderConn(serverConn, deviceID, "127.0.0.1") + if !hub.Connect(deviceID, oldConn) { + t.Fatalf("initial connect failed") + } + + errC := make(chan error, 1) + go func() { + resp, err := hub.SendRPC(ctx, deviceID, "begin", nil, time.Second) + if !errors.Is(err, ErrRecorderNotConnected) { + errC <- fmt.Errorf("SendRPC err=%v resp=%+v, want ErrRecorderNotConnected", err, resp) + return + } + errC <- nil + }() + + waitForPendingRecorderRPC(t, oldConn) + newConn := hub.NewRecorderConn(&websocket.Conn{}, deviceID, "127.0.0.1") + replaced := hub.ConnectReplacingExisting(deviceID, newConn) + if replaced != oldConn { + t.Fatalf("replaced=%p want oldConn=%p", replaced, oldConn) + } + + select { + case err := <-errC: + if err != nil { + t.Fatal(err) + } + case <-time.After(time.Second): + t.Fatalf("SendRPC did not return after connection replacement") + } +} + +func waitForPendingRecorderRPC(t *testing.T, rc *RecorderConn) { + t.Helper() + deadline := time.Now().Add(time.Second) + for time.Now().Before(deadline) { + rc.PendingMu.Lock() + n := len(rc.Pending) + rc.PendingMu.Unlock() + if n > 0 { + return + } + time.Sleep(10 * time.Millisecond) + } + t.Fatalf("timed out waiting for pending RPC") +} + +func newRecorderHubTestWebSocketPair(t *testing.T) (*websocket.Conn, *websocket.Conn) { + t.Helper() + serverConnC := make(chan *websocket.Conn, 1) + errC := make(chan error, 1) + + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + conn, err := websocket.Accept(w, r, &websocket.AcceptOptions{InsecureSkipVerify: true}) + if err != nil { + errC <- err + return + } + serverConnC <- conn + <-r.Context().Done() + })) + t.Cleanup(server.Close) + + clientConn, _, err := websocket.Dial(context.Background(), "ws"+strings.TrimPrefix(server.URL, "http"), nil) + if err != nil { + t.Fatalf("dial websocket: %v", err) + } + + select { + case serverConn := <-serverConnC: + t.Cleanup(func() { + _ = clientConn.CloseNow() + _ = serverConn.CloseNow() + }) + return serverConn, clientConn + case err := <-errC: + t.Fatalf("accept websocket: %v", err) + case <-time.After(time.Second): + t.Fatalf("timed out waiting for websocket accept") + } + return nil, nil +} diff --git a/internal/services/recorder_hub.go b/internal/services/recorder_hub.go index 8e15b4b..6ef6606 100644 --- a/internal/services/recorder_hub.go +++ b/internal/services/recorder_hub.go @@ -56,6 +56,7 @@ type RPCResponse struct { Success bool `json:"success"` Message string `json:"message,omitempty"` Data map[string]interface{} `json:"data,omitempty"` + LocalErr error `json:"-"` } // PendingRPC tracks an in-flight RPC waiting for a response. @@ -154,7 +155,7 @@ func (h *RecorderHub) ConnectWithStaleThreshold(deviceID string, rc *RecorderCon func (h *RecorderHub) ConnectReplacingExisting(deviceID string, rc *RecorderConn) *RecorderConn { replaced := h.connectReplacingExisting(deviceID, rc) if replaced != nil { - h.failPendingRPCs(replaced, ErrRecorderNotConnected.Error()) + h.failPendingRPCs(replaced, ErrRecorderNotConnected) } return replaced } @@ -164,14 +165,18 @@ func (h *RecorderHub) Disconnect(deviceID string, rc *RecorderConn) bool { if !h.disconnect(deviceID, rc) { return false } - h.failPendingRPCs(rc, ErrRecorderNotConnected.Error()) + h.failPendingRPCs(rc, ErrRecorderNotConnected) return true } -func (h *RecorderHub) failPendingRPCs(rc *RecorderConn, message string) { +func (h *RecorderHub) failPendingRPCs(rc *RecorderConn, err error) { if rc == nil { return } + message := "" + if err != nil { + message = err.Error() + } rc.PendingMu.Lock() defer rc.PendingMu.Unlock() @@ -183,6 +188,7 @@ func (h *RecorderHub) failPendingRPCs(rc *RecorderConn, message string) { RequestID: requestID, Success: false, Message: message, + LocalErr: err, }: default: } @@ -273,6 +279,9 @@ func (h *RecorderHub) SendRPC(ctx context.Context, deviceID, action string, para select { case response := <-pending.ResponseC: + if response != nil && response.LocalErr != nil { + return nil, response.LocalErr + } return response, nil case <-waitCtx.Done(): rc.PendingMu.Lock() From 0f86b0577de067b6de6b99ed043c4a2a6b14590b Mon Sep 17 00:00:00 2001 From: chao liu Date: Thu, 28 May 2026 19:58:50 +0800 Subject: [PATCH 05/12] ignore replaced transfer connection messages --- internal/api/handlers/transfer.go | 5 + .../transfer_connection_takeover_test.go | 111 ++++++++++++++++++ 2 files changed, 116 insertions(+) create mode 100644 internal/api/handlers/transfer_connection_takeover_test.go diff --git a/internal/api/handlers/transfer.go b/internal/api/handlers/transfer.go index c64bb88..acc6870 100644 --- a/internal/api/handlers/transfer.go +++ b/internal/api/handlers/transfer.go @@ -238,6 +238,11 @@ func closeReplacedTransferConn(deviceID string, dc *services.TransferConn) { // handleMessage dispatches an inbound WebSocket message to the appropriate handler func (h *TransferHandler) handleMessage(ctx context.Context, dc *services.TransferConn, msg map[string]interface{}) { + if h.hub.Get(dc.DeviceID) != dc { + logger.Printf("[TRANSFER] Device %s ignored message from replaced connection", dc.DeviceID) + return + } + msgType, _ := msg["type"].(string) switch msgType { diff --git a/internal/api/handlers/transfer_connection_takeover_test.go b/internal/api/handlers/transfer_connection_takeover_test.go new file mode 100644 index 0000000..aa553f0 --- /dev/null +++ b/internal/api/handlers/transfer_connection_takeover_test.go @@ -0,0 +1,111 @@ +// SPDX-FileCopyrightText: 2026 ArcheBase +// +// SPDX-License-Identifier: MulanPSL-2.0 + +package handlers + +import ( + "context" + "testing" + "time" + + "archebase.com/keystone-edge/internal/config" + "archebase.com/keystone-edge/internal/services" + "github.com/coder/websocket" + "github.com/jmoiron/sqlx" + _ "modernc.org/sqlite" +) + +func TestTransferIgnoresUploadFailedFromReplacedConnection(t *testing.T) { + db := newTransferTakeoverDB(t) + defer db.Close() + seedTransferTakeoverTask(t, db, "task-old-upload", "in_progress") + + hub := services.NewTransferHub(10) + handler := NewTransferHandler(hub, &config.TransferConfig{}, db, nil, "", "", nil, 0) + oldConn := hub.NewTransferConn(&websocket.Conn{}, "robot-001", "127.0.0.1") + if !hub.Connect("robot-001", oldConn) { + t.Fatalf("initial connect failed") + } + newConn := hub.NewTransferConn(&websocket.Conn{}, "robot-001", "127.0.0.1") + hub.ConnectReplacingExisting("robot-001", newConn) + + handler.handleMessage(context.Background(), oldConn, map[string]interface{}{ + "type": "upload_failed", + "data": map[string]interface{}{ + "task_id": "task-old-upload", + "reason": "old connection message", + }, + }) + + assertTransferTakeoverTaskStatus(t, db, "task-old-upload", "in_progress") +} + +func TestTransferProcessesUploadFailedFromCurrentConnection(t *testing.T) { + db := newTransferTakeoverDB(t) + defer db.Close() + seedTransferTakeoverTask(t, db, "task-current-upload", "in_progress") + + hub := services.NewTransferHub(10) + handler := NewTransferHandler(hub, &config.TransferConfig{}, db, nil, "", "", nil, 0) + dc := hub.NewTransferConn(&websocket.Conn{}, "robot-001", "127.0.0.1") + if !hub.Connect("robot-001", dc) { + t.Fatalf("connect failed") + } + + handler.handleMessage(context.Background(), dc, map[string]interface{}{ + "type": "upload_failed", + "data": map[string]interface{}{ + "task_id": "task-current-upload", + "reason": "current connection message", + }, + }) + + assertTransferTakeoverTaskStatus(t, db, "task-current-upload", "failed") +} + +func newTransferTakeoverDB(t *testing.T) *sqlx.DB { + t.Helper() + db, err := sqlx.Open("sqlite", ":memory:") + if err != nil { + t.Fatalf("open sqlite db: %v", err) + } + if _, err := db.Exec(`CREATE TABLE tasks ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + task_id TEXT NOT NULL, + batch_id INTEGER NOT NULL DEFAULT 0, + status TEXT NOT NULL, + completed_at TIMESTAMP NULL, + created_at TIMESTAMP NOT NULL, + updated_at TIMESTAMP NOT NULL, + deleted_at TIMESTAMP NULL + )`); err != nil { + t.Fatalf("create tasks schema: %v", err) + } + return db +} + +func seedTransferTakeoverTask(t *testing.T, db *sqlx.DB, taskID string, status string) { + t.Helper() + now := time.Now().UTC() + if _, err := db.Exec( + `INSERT INTO tasks (task_id, status, created_at, updated_at) VALUES (?, ?, ?, ?)`, + taskID, + status, + now, + now, + ); err != nil { + t.Fatalf("seed task: %v", err) + } +} + +func assertTransferTakeoverTaskStatus(t *testing.T, db *sqlx.DB, taskID string, want string) { + t.Helper() + var got string + if err := db.Get(&got, `SELECT status FROM tasks WHERE task_id = ?`, taskID); err != nil { + t.Fatalf("query task status: %v", err) + } + if got != want { + t.Fatalf("task status=%q want=%q", got, want) + } +} From f3044ead2d8edaef24724d76c96d7ae1dbaa37e2 Mon Sep 17 00:00:00 2001 From: chao liu Date: Fri, 29 May 2026 00:51:59 +0800 Subject: [PATCH 06/12] Prevent recorder config during reconnect sync --- internal/api/handlers/axon_rpc.go | 47 +++++ .../api/handlers/task_state_recovery_test.go | 182 +++++++++++++++++- internal/services/recorder_hub.go | 12 ++ 3 files changed, 240 insertions(+), 1 deletion(-) diff --git a/internal/api/handlers/axon_rpc.go b/internal/api/handlers/axon_rpc.go index bda9556..d137eec 100644 --- a/internal/api/handlers/axon_rpc.go +++ b/internal/api/handlers/axon_rpc.go @@ -228,6 +228,10 @@ func (h *RecorderHandler) Config(c *gin.Context) { } } + if !h.requireRecorderReadyForConfig(c) { + return + } + if !h.callRPC(c, "config", params) { return } @@ -515,6 +519,44 @@ func disconnectedRecorderStatsResponse() gin.H { } } +func (h *RecorderHandler) requireRecorderReadyForConfig(c *gin.Context) bool { + deviceID := strings.TrimSpace(c.Param("device_id")) + if deviceID == "" { + c.JSON(http.StatusBadRequest, gin.H{"error": "device_id is required"}) + return false + } + if h == nil || h.hub == nil { + c.JSON(http.StatusInternalServerError, gin.H{"error": "recorder hub is not configured"}) + return false + } + + rc := h.hub.Get(deviceID) + if rc == nil { + c.JSON(http.StatusNotFound, gin.H{"error": services.ErrRecorderNotConnected.Error()}) + return false + } + if !rc.IsStateSynced() { + c.JSON(http.StatusConflict, gin.H{ + "code": "recorder_state_syncing", + "error": "recorder state is syncing; retry after initial state snapshot", + }) + return false + } + + state := rc.GetState() + current := strings.ToLower(strings.TrimSpace(state.CurrentState)) + if current != "idle" { + c.JSON(http.StatusConflict, gin.H{ + "code": "recorder_busy", + "error": "recorder is not idle; config is rejected", + "current_state": state.CurrentState, + "task_id": state.TaskID, + }) + return false + } + return true +} + func (h *RecorderHandler) callRPC(c *gin.Context, action string, params map[string]interface{}) bool { deviceID := c.Param("device_id") if deviceID == "" { @@ -585,10 +627,13 @@ func (h *RecorderHandler) GetState(c *gin.Context) { "previous_state": "", "task_id": "", "updated_at": time.Now().UTC(), + "state_synced": false, + "syncing": false, }) return } st := rc.GetState() + synced := rc.IsStateSynced() prev := recorderPreviousStateFromRaw(st.Raw) c.JSON(http.StatusOK, gin.H{ "connected": true, @@ -596,6 +641,8 @@ func (h *RecorderHandler) GetState(c *gin.Context) { "previous_state": prev, "task_id": st.TaskID, "updated_at": st.UpdatedAt, + "state_synced": synced, + "syncing": !synced, }) } diff --git a/internal/api/handlers/task_state_recovery_test.go b/internal/api/handlers/task_state_recovery_test.go index 073734e..9ed59c8 100644 --- a/internal/api/handlers/task_state_recovery_test.go +++ b/internal/api/handlers/task_state_recovery_test.go @@ -104,16 +104,190 @@ func TestRecorderGetStateSnapshotReadyRestoresPendingTask(t *testing.T) { assertTaskStateRecoveryTimestampSet(t, db, "task-rpc-ready", "ready_at") } +func TestRecorderGetStateReportsSyncingBeforeInitialStateSnapshot(t *testing.T) { + db := newTaskStateRecoveryDB(t) + defer db.Close() + + hub := services.NewRecorderHub() + attachRecorderRPCResponder(t, hub, "robot-001", func(req services.RPCRequest) services.RPCResponse { + return services.RPCResponse{Success: true} + }) + handler := NewRecorderHandler(hub, &config.RecorderConfig{}, db) + + gin.SetMode(gin.TestMode) + router := gin.New() + router.GET("/recorder/:device_id/state", handler.GetState) + req := httptest.NewRequest(http.MethodGet, "/recorder/robot-001/state", nil) + w := httptest.NewRecorder() + + router.ServeHTTP(w, req) + + if w.Code != http.StatusOK { + t.Fatalf("status=%d want=%d body=%s", w.Code, http.StatusOK, w.Body.String()) + } + + var body map[string]interface{} + if err := json.Unmarshal(w.Body.Bytes(), &body); err != nil { + t.Fatalf("decode state response: %v", err) + } + if body["connected"] != true { + t.Fatalf("connected=%v want=true body=%v", body["connected"], body) + } + if body["state_synced"] != false { + t.Fatalf("state_synced=%v want=false body=%v", body["state_synced"], body) + } + if body["syncing"] != true { + t.Fatalf("syncing=%v want=true body=%v", body["syncing"], body) + } +} + +func TestRecorderGetStateReportsSyncedAfterInitialStateSnapshot(t *testing.T) { + db := newTaskStateRecoveryDB(t) + defer db.Close() + + hub := services.NewRecorderHub() + handler := NewRecorderHandler(hub, &config.RecorderConfig{}, db) + rc := hub.NewRecorderConn(nil, "robot-001", "127.0.0.1") + if !hub.Connect("robot-001", rc) { + t.Fatalf("connect recorder: initial connect failed") + } + handler.applyRecorderStateSnapshot(rc, services.RecorderState{CurrentState: "idle"}, "state_update") + + gin.SetMode(gin.TestMode) + router := gin.New() + router.GET("/recorder/:device_id/state", handler.GetState) + req := httptest.NewRequest(http.MethodGet, "/recorder/robot-001/state", nil) + w := httptest.NewRecorder() + + router.ServeHTTP(w, req) + + if w.Code != http.StatusOK { + t.Fatalf("status=%d want=%d body=%s", w.Code, http.StatusOK, w.Body.String()) + } + + var body map[string]interface{} + if err := json.Unmarshal(w.Body.Bytes(), &body); err != nil { + t.Fatalf("decode state response: %v", err) + } + if body["state_synced"] != true { + t.Fatalf("state_synced=%v want=true body=%v", body["state_synced"], body) + } + if body["syncing"] != false { + t.Fatalf("syncing=%v want=false body=%v", body["syncing"], body) + } +} + +func TestRecorderConfigRejectsBeforeInitialStateSnapshot(t *testing.T) { + db := newTaskStateRecoveryDB(t) + defer db.Close() + seedTaskStateRecoveryTask(t, db, "task-config-syncing", "pending") + + hub := services.NewRecorderHub() + rpcCalled := make(chan struct{}, 1) + attachRecorderRPCResponder(t, hub, "robot-001", func(req services.RPCRequest) services.RPCResponse { + rpcCalled <- struct{}{} + return services.RPCResponse{Success: true} + }) + handler := NewRecorderHandler(hub, &config.RecorderConfig{ResponseTimeout: 1}, db) + + gin.SetMode(gin.TestMode) + router := gin.New() + router.POST("/recorder/:device_id/config", handler.Config) + body := []byte(`{"task_config":{"task_id":"task-config-syncing"}}`) + req := httptest.NewRequest(http.MethodPost, "/recorder/robot-001/config", bytes.NewReader(body)) + req.Header.Set("Content-Type", "application/json") + w := httptest.NewRecorder() + + router.ServeHTTP(w, req) + + if w.Code != http.StatusConflict { + t.Fatalf("status=%d want=%d body=%s", w.Code, http.StatusConflict, w.Body.String()) + } + select { + case <-rpcCalled: + t.Fatalf("config RPC was sent before recorder state sync completed") + default: + } + assertTaskStateRecoveryStatus(t, db, "task-config-syncing", "pending") +} + +func TestRecorderConfigRejectsWhenSyncedRecorderBusy(t *testing.T) { + db := newTaskStateRecoveryDB(t) + defer db.Close() + seedTaskStateRecoveryTask(t, db, "task-config-busy", "pending") + + hub := services.NewRecorderHub() + rpcCalled := make(chan struct{}, 1) + rc := attachRecorderRPCResponderWithConn(t, hub, "robot-001", func(req services.RPCRequest) services.RPCResponse { + rpcCalled <- struct{}{} + return services.RPCResponse{Success: true} + }) + handler := NewRecorderHandler(hub, &config.RecorderConfig{ResponseTimeout: 1}, db) + handler.applyRecorderStateSnapshot(rc, services.RecorderState{ + CurrentState: "ready", + TaskID: "existing-task", + }, "state_update") + + gin.SetMode(gin.TestMode) + router := gin.New() + router.POST("/recorder/:device_id/config", handler.Config) + body := []byte(`{"task_config":{"task_id":"task-config-busy"}}`) + req := httptest.NewRequest(http.MethodPost, "/recorder/robot-001/config", bytes.NewReader(body)) + req.Header.Set("Content-Type", "application/json") + w := httptest.NewRecorder() + + router.ServeHTTP(w, req) + + if w.Code != http.StatusConflict { + t.Fatalf("status=%d want=%d body=%s", w.Code, http.StatusConflict, w.Body.String()) + } + select { + case <-rpcCalled: + t.Fatalf("config RPC was sent while recorder was already busy") + default: + } + assertTaskStateRecoveryStatus(t, db, "task-config-busy", "pending") +} + +func TestRecorderConfigAdvancesTaskAfterInitialStateSnapshot(t *testing.T) { + db := newTaskStateRecoveryDB(t) + defer db.Close() + seedTaskStateRecoveryTask(t, db, "task-config-synced", "pending") + + hub := services.NewRecorderHub() + rc := attachRecorderRPCResponderWithConn(t, hub, "robot-001", func(req services.RPCRequest) services.RPCResponse { + return services.RPCResponse{Success: true} + }) + handler := NewRecorderHandler(hub, &config.RecorderConfig{ResponseTimeout: 1}, db) + handler.applyRecorderStateSnapshot(rc, services.RecorderState{CurrentState: "idle"}, "state_update") + + gin.SetMode(gin.TestMode) + router := gin.New() + router.POST("/recorder/:device_id/config", handler.Config) + body := []byte(`{"task_config":{"task_id":"task-config-synced"}}`) + req := httptest.NewRequest(http.MethodPost, "/recorder/robot-001/config", bytes.NewReader(body)) + req.Header.Set("Content-Type", "application/json") + w := httptest.NewRecorder() + + router.ServeHTTP(w, req) + + if w.Code != http.StatusOK { + t.Fatalf("status=%d want=%d body=%s", w.Code, http.StatusOK, w.Body.String()) + } + assertTaskStateRecoveryStatus(t, db, "task-config-synced", "ready") +} + func TestRecorderConfigDoesNotAdvanceTaskWhenRPCResponseUnsuccessful(t *testing.T) { db := newTaskStateRecoveryDB(t) defer db.Close() seedTaskStateRecoveryTask(t, db, "task-config-false", "pending") hub := services.NewRecorderHub() - attachRecorderRPCResponder(t, hub, "robot-001", func(req services.RPCRequest) services.RPCResponse { + rc := attachRecorderRPCResponderWithConn(t, hub, "robot-001", func(req services.RPCRequest) services.RPCResponse { return services.RPCResponse{Success: false, Message: "device rejected config"} }) handler := NewRecorderHandler(hub, &config.RecorderConfig{ResponseTimeout: 1}, db) + handler.applyRecorderStateSnapshot(rc, services.RecorderState{CurrentState: "idle"}, "state_update") gin.SetMode(gin.TestMode) router := gin.New() @@ -299,6 +473,11 @@ func assertTaskStateRecoveryTimestampSet(t *testing.T, db *sqlx.DB, taskID strin } func attachRecorderRPCResponder(t *testing.T, hub *services.RecorderHub, deviceID string, respond func(services.RPCRequest) services.RPCResponse) { + t.Helper() + attachRecorderRPCResponderWithConn(t, hub, deviceID, respond) +} + +func attachRecorderRPCResponderWithConn(t *testing.T, hub *services.RecorderHub, deviceID string, respond func(services.RPCRequest) services.RPCResponse) *services.RecorderConn { t.Helper() serverConn, clientConn := newRecorderHandlerTestWebSocketPair(t) rc := hub.NewRecorderConn(serverConn, deviceID, "127.0.0.1") @@ -324,6 +503,7 @@ func attachRecorderRPCResponder(t *testing.T, hub *services.RecorderHub, deviceI hub.HandleRPCResponse(deviceID, &resp) } }() + return rc } func newRecorderHandlerTestWebSocketPair(t *testing.T) (*websocket.Conn, *websocket.Conn) { diff --git a/internal/services/recorder_hub.go b/internal/services/recorder_hub.go index 6ef6606..05e5960 100644 --- a/internal/services/recorder_hub.go +++ b/internal/services/recorder_hub.go @@ -9,6 +9,7 @@ import ( "context" "errors" "fmt" + "strings" "sync" "time" @@ -39,6 +40,7 @@ type RecorderInfo struct { ConnectedAt time.Time `json:"connected_at"` LastSeenAt time.Time `json:"last_seen_at"` State RecorderState `json:"state"` + StateSynced bool `json:"state_synced"` } // RPCRequest represents a command sent from Keystone to Axon Recorder. @@ -74,6 +76,7 @@ type RecorderConn struct { ConnectedAt time.Time LastSeenAt time.Time State RecorderState + StateSynced bool WriteMu sync.Mutex PendingMu sync.Mutex @@ -100,12 +103,20 @@ func (r *RecorderConn) GetState() RecorderState { return r.State } +// IsStateSynced reports whether this connection has received an initial state snapshot. +func (r *RecorderConn) IsStateSynced() bool { + r.StateMu.RLock() + defer r.StateMu.RUnlock() + return r.StateSynced +} + // UpdateState updates the recorder state snapshot. func (r *RecorderConn) UpdateState(state RecorderState) { r.StateMu.Lock() defer r.StateMu.Unlock() state.UpdatedAt = time.Now() r.State = state + r.StateSynced = strings.TrimSpace(state.CurrentState) != "" } // RecorderHub manages all active Axon Recorder WebSocket connections. @@ -211,6 +222,7 @@ func (h *RecorderHub) ListDevices() []RecorderInfo { ConnectedAt: rc.ConnectedAt, LastSeenAt: rc.LastSeenAt, State: rc.GetState(), + StateSynced: rc.IsStateSynced(), }) } return result From f63fe8b11405561edf9942baeade0d5214e26700 Mon Sep 17 00:00:00 2001 From: chao liu Date: Fri, 29 May 2026 10:01:29 +0800 Subject: [PATCH 07/12] feat: add device state stream --- internal/api/handlers/axon_rpc.go | 333 ++++++++++++++++--- internal/api/handlers/device_state_stream.go | 159 +++++++++ internal/api/handlers/transfer.go | 11 + internal/server/server.go | 9 + internal/services/device_state_broker.go | 107 ++++++ internal/services/recorder_hub.go | 34 +- 6 files changed, 593 insertions(+), 60 deletions(-) create mode 100644 internal/api/handlers/device_state_stream.go create mode 100644 internal/services/device_state_broker.go diff --git a/internal/api/handlers/axon_rpc.go b/internal/api/handlers/axon_rpc.go index d137eec..6253ca5 100644 --- a/internal/api/handlers/axon_rpc.go +++ b/internal/api/handlers/axon_rpc.go @@ -11,6 +11,7 @@ import ( "encoding/json" "errors" "net/http" + "strconv" "strings" "time" @@ -25,9 +26,11 @@ import ( // RecorderHandler handles REST and WebSocket traffic for Axon Recorder RPC. type RecorderHandler struct { - hub *services.RecorderHub - cfg *config.RecorderConfig - db *sqlx.DB + hub *services.RecorderHub + transferHub *services.TransferHub + stateBroker *services.DeviceStateBroker + cfg *config.RecorderConfig + db *sqlx.DB } // NewRecorderHandler creates a new RecorderHandler. @@ -35,6 +38,15 @@ func NewRecorderHandler(hub *services.RecorderHub, cfg *config.RecorderConfig, d return &RecorderHandler{hub: hub, cfg: cfg, db: db} } +// SetDeviceStateDeps enables device connection/state event publishing. +func (h *RecorderHandler) SetDeviceStateDeps(transferHub *services.TransferHub, broker *services.DeviceStateBroker) { + if h == nil { + return + } + h.transferHub = transferHub + h.stateBroker = broker +} + // ConfigRequest represents the request body for config RPC. // @Description Request body for recorder config type ConfigRequest struct { @@ -158,6 +170,7 @@ func (h *RecorderHandler) HandleWebSocket(w http.ResponseWriter, r *http.Request rc := h.hub.NewRecorderConn(conn, deviceID, remoteIP) replacedConn := h.hub.ConnectReplacingExisting(deviceID, rc) closeReplacedRecorderConn(deviceID, replacedConn) + publishDeviceConnectionEvent(h.stateBroker, h.hub, h.transferHub, deviceID, "recorder_connected") defer func() { if err := conn.Close(websocket.StatusNormalClosure, ""); err != nil { @@ -168,6 +181,7 @@ func (h *RecorderHandler) HandleWebSocket(w http.ResponseWriter, r *http.Request }() defer func() { if h.hub.Disconnect(deviceID, rc) { + publishDeviceConnectionEvent(h.stateBroker, h.hub, h.transferHub, deviceID, "recorder_disconnected") revertRunnableTasksOnDeviceDisconnect(h.db, deviceID, nil, 0, false) } }() @@ -519,6 +533,184 @@ func disconnectedRecorderStatsResponse() gin.H { } } +const defaultRecorderFreshMaxAge = time.Second + +func isTruthyQuery(value string) bool { + value = strings.ToLower(strings.TrimSpace(value)) + return value == "1" || value == "true" || value == "yes" +} + +func recorderRefreshMaxAge(c *gin.Context) time.Duration { + value := strings.TrimSpace(c.Query("max_age_ms")) + if value == "" { + return defaultRecorderFreshMaxAge + } + ms, err := strconv.Atoi(value) + if err != nil || ms < 0 { + return defaultRecorderFreshMaxAge + } + return time.Duration(ms) * time.Millisecond +} + +func recorderStateAge(state services.RecorderState) time.Duration { + if state.UpdatedAt.IsZero() { + return time.Duration(1<<63 - 1) + } + age := time.Since(state.UpdatedAt) + if age < 0 { + return 0 + } + return age +} + +func recorderStateAgeMS(state services.RecorderState) int64 { + age := recorderStateAge(state) + if age == time.Duration(1<<63-1) { + return 0 + } + return age.Milliseconds() +} + +func (h *RecorderHandler) disconnectedRecorderStateResponse(deviceID string) gin.H { + return gin.H{ + "connected": false, + "current_state": "unknown", + "previous_state": "", + "task_id": "", + "updated_at": time.Now().UTC(), + "state_synced": false, + "syncing": false, + "fresh": false, + "age_ms": 0, + "source": "disconnected", + "state_version": h.recorderStateVersion(deviceID), + } +} + +func (h *RecorderHandler) recorderStateResponse(deviceID string, rc *services.RecorderConn, fresh bool) gin.H { + st := rc.GetState() + synced := rc.IsStateSynced() + return gin.H{ + "connected": true, + "current_state": st.CurrentState, + "previous_state": recorderPreviousStateFromRaw(st.Raw), + "task_id": st.TaskID, + "updated_at": st.UpdatedAt, + "state_synced": synced, + "syncing": !synced, + "fresh": fresh, + "age_ms": recorderStateAgeMS(st), + "source": strings.TrimSpace(st.Source), + "state_version": h.recorderStateVersion(deviceID), + } +} + +func (h *RecorderHandler) recorderStateVersion(deviceID string) uint64 { + if h == nil || h.stateBroker == nil { + return 0 + } + return h.stateBroker.CurrentVersion(strings.TrimSpace(deviceID)) +} + +func (h *RecorderHandler) refreshRecorderState(ctx context.Context, deviceID string, rc *services.RecorderConn, maxAge time.Duration) (services.RecorderState, bool, error) { + if h == nil || h.hub == nil || rc == nil || strings.TrimSpace(deviceID) == "" { + return services.RecorderState{}, false, services.ErrRecorderNotConnected + } + if h.hub.Get(deviceID) != rc { + return services.RecorderState{}, false, services.ErrRecorderNotConnected + } + + state := rc.GetState() + if maxAge >= 0 && rc.IsStateSynced() && recorderStateAge(state) <= maxAge { + return state, true, nil + } + + timeout := recorderRPCResponseTimeout(h.cfg) + response, err := h.hub.SendRPC(ctx, deviceID, "get_state", nil, timeout) + if err != nil { + h.markRecorderSyncing(rc, "get_state", err) + return rc.GetState(), false, err + } + if response == nil || !response.Success { + message := "recorder get_state returned unsuccessful response" + if response != nil && strings.TrimSpace(response.Message) != "" { + message = strings.TrimSpace(response.Message) + } + err := errors.New(message) + h.markRecorderSyncing(rc, "get_state", err) + return rc.GetState(), false, err + } + if h.hub.Get(deviceID) != rc { + return services.RecorderState{}, false, services.ErrRecorderNotConnected + } + + state = recorderStateFromRPCData(response.Data) + if strings.TrimSpace(state.CurrentState) == "" { + err := errors.New("recorder get_state returned empty state") + h.markRecorderSyncing(rc, "get_state", err) + return rc.GetState(), false, err + } + h.applyRecorderStateSnapshot(rc, state, "get_state") + return rc.GetState(), true, nil +} + +func (h *RecorderHandler) markRecorderSyncing(rc *services.RecorderConn, source string, syncErr error) { + if h == nil || h.hub == nil || rc == nil || h.hub.Get(rc.DeviceID) != rc { + return + } + rc.MarkStateSyncing(source, syncErr) + st := rc.GetState() + if h.stateBroker != nil { + ev := recorderStateEvent(st, "recorder_syncing", source, false) + ev["state_synced"] = false + ev["syncing"] = true + if syncErr != nil { + ev["error"] = syncErr.Error() + } + h.stateBroker.Publish(rc.DeviceID, ev) + } +} + +func (h *RecorderHandler) publishRecorderStateSnapshot(rc *services.RecorderConn, state services.RecorderState, source string, fresh bool) { + if h == nil || h.stateBroker == nil || rc == nil { + return + } + h.stateBroker.Publish(rc.DeviceID, recorderStateEvent(state, "recorder_state", source, fresh)) +} + +func recorderStateEvent(state services.RecorderState, eventType string, source string, fresh bool) services.DeviceStateEvent { + synced := strings.TrimSpace(state.CurrentState) != "" + return services.DeviceStateEvent{ + "type": eventType, + "current_state": state.CurrentState, + "previous_state": recorderPreviousStateFromRaw(state.Raw), + "task_id": state.TaskID, + "state_synced": synced, + "syncing": !synced, + "fresh": fresh, + "age_ms": recorderStateAgeMS(state), + "source": strings.TrimSpace(source), + } +} + +func (h *RecorderHandler) handleRecorderRPCTimeout(deviceID string, action string, params map[string]interface{}, timeoutErr error) { + if rc := h.hub.Get(deviceID); rc != nil { + h.markRecorderSyncing(rc, "rpc_timeout:"+action, timeoutErr) + go h.syncRecorderStateFromDevice(rc) + } + if h.stateBroker == nil { + return + } + h.stateBroker.Publish(deviceID, services.DeviceStateEvent{ + "type": "recorder_operation_unknown", + "action": strings.TrimSpace(action), + "task_id": firstNonEmptyString(params, "task_id"), + "result_unknown": true, + "source": "rpc_timeout:" + strings.TrimSpace(action), + "error": timeoutErr.Error(), + }) +} + func (h *RecorderHandler) requireRecorderReadyForConfig(c *gin.Context) bool { deviceID := strings.TrimSpace(c.Param("device_id")) if deviceID == "" { @@ -530,6 +722,14 @@ func (h *RecorderHandler) requireRecorderReadyForConfig(c *gin.Context) bool { return false } + if h.transferHub != nil && h.transferHub.Get(deviceID) == nil { + c.JSON(http.StatusConflict, gin.H{ + "code": "transfer_not_connected", + "error": "transfer is not connected; config is rejected", + }) + return false + } + rc := h.hub.Get(deviceID) if rc == nil { c.JSON(http.StatusNotFound, gin.H{"error": services.ErrRecorderNotConnected.Error()}) @@ -544,6 +744,22 @@ func (h *RecorderHandler) requireRecorderReadyForConfig(c *gin.Context) bool { } state := rc.GetState() + if recorderStateAge(state) > defaultRecorderFreshMaxAge { + refreshed, _, err := h.refreshRecorderState(c.Request.Context(), deviceID, rc, -1) + if err != nil { + status := http.StatusConflict + if errors.Is(err, services.ErrRecorderRPCTimeout) { + status = http.StatusGatewayTimeout + } + out := h.recorderStateResponse(deviceID, rc, false) + out["code"] = "recorder_state_refresh_failed" + out["error"] = err.Error() + c.JSON(status, out) + return false + } + state = refreshed + } + current := strings.ToLower(strings.TrimSpace(state.CurrentState)) if current != "idle" { c.JSON(http.StatusConflict, gin.H{ @@ -558,7 +774,7 @@ func (h *RecorderHandler) requireRecorderReadyForConfig(c *gin.Context) bool { } func (h *RecorderHandler) callRPC(c *gin.Context, action string, params map[string]interface{}) bool { - deviceID := c.Param("device_id") + deviceID := strings.TrimSpace(c.Param("device_id")) if deviceID == "" { c.JSON(http.StatusBadRequest, gin.H{"error": "device_id is required"}) return false @@ -570,13 +786,27 @@ func (h *RecorderHandler) callRPC(c *gin.Context, action string, params map[stri case errors.Is(err, services.ErrRecorderNotConnected): c.JSON(http.StatusNotFound, gin.H{"error": err.Error()}) case errors.Is(err, services.ErrRecorderRPCTimeout): - c.JSON(http.StatusGatewayTimeout, gin.H{"error": err.Error()}) + h.handleRecorderRPCTimeout(deviceID, action, params, err) + c.JSON(http.StatusGatewayTimeout, gin.H{ + "error": err.Error(), + "action": action, + "result_unknown": true, + }) default: c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()}) } return false } + if response != nil && response.Success { + state := recorderStateFromRPCData(response.Data) + if strings.TrimSpace(state.CurrentState) != "" { + if rc := h.hub.Get(deviceID); rc != nil { + h.applyRecorderStateSnapshot(rc, state, "rpc_response:"+action) + } + } + } + c.JSON(http.StatusOK, response) return response != nil && response.Success } @@ -618,32 +848,46 @@ func (h *RecorderHandler) ListDevices(c *gin.Context) { // @Success 200 {object} map[string]interface{} "connected=false when recorder WS is not active" // @Router /recorder/{device_id}/state [get] func (h *RecorderHandler) GetState(c *gin.Context) { - deviceID := c.Param("device_id") + deviceID := strings.TrimSpace(c.Param("device_id")) + if deviceID == "" { + c.JSON(http.StatusBadRequest, gin.H{"error": "device_id is required"}) + return + } + if h == nil || h.hub == nil { + c.JSON(http.StatusInternalServerError, gin.H{"error": "recorder hub is not configured"}) + return + } + rc := h.hub.Get(deviceID) if rc == nil { - c.JSON(http.StatusOK, gin.H{ - "connected": false, - "current_state": "unknown", - "previous_state": "", - "task_id": "", - "updated_at": time.Now().UTC(), - "state_synced": false, - "syncing": false, - }) + c.JSON(http.StatusOK, h.disconnectedRecorderStateResponse(deviceID)) return } - st := rc.GetState() - synced := rc.IsStateSynced() - prev := recorderPreviousStateFromRaw(st.Raw) - c.JSON(http.StatusOK, gin.H{ - "connected": true, - "current_state": st.CurrentState, - "previous_state": prev, - "task_id": st.TaskID, - "updated_at": st.UpdatedAt, - "state_synced": synced, - "syncing": !synced, - }) + + fresh := false + if isTruthyQuery(c.Query("refresh")) { + var err error + _, fresh, err = h.refreshRecorderState(c.Request.Context(), deviceID, rc, recorderRefreshMaxAge(c)) + if err != nil { + if errors.Is(err, services.ErrRecorderNotConnected) { + c.JSON(http.StatusOK, h.disconnectedRecorderStateResponse(deviceID)) + return + } + status := http.StatusConflict + if errors.Is(err, services.ErrRecorderRPCTimeout) { + status = http.StatusGatewayTimeout + } + out := h.recorderStateResponse(deviceID, rc, false) + out["error"] = err.Error() + if errors.Is(err, services.ErrRecorderRPCTimeout) { + out["result_unknown"] = true + } + c.JSON(status, out) + return + } + } + + c.JSON(http.StatusOK, h.recorderStateResponse(deviceID, rc, fresh)) } func recorderPreviousStateFromRaw(raw map[string]interface{}) string { @@ -712,49 +956,28 @@ func (h *RecorderHandler) syncRecorderStateFromDevice(rc *services.RecorderConn) return } - timeout := recorderRPCResponseTimeout(h.cfg) - ctx, cancel := context.WithTimeout(context.Background(), timeout) - defer cancel() - - response, err := h.hub.SendRPC(ctx, rc.DeviceID, "get_state", nil, timeout) - if err != nil { + if _, _, err := h.refreshRecorderState(context.Background(), rc.DeviceID, rc, -1); err != nil { if !errors.Is(err, services.ErrRecorderNotConnected) { logger.Printf("[RECORDER] Recorder %s get_state after connect failed: %v", rc.DeviceID, err) } - return - } - if response == nil || !response.Success { - message := "unsuccessful response" - if response != nil && strings.TrimSpace(response.Message) != "" { - message = response.Message - } - logger.Printf("[RECORDER] Recorder %s get_state after connect skipped: %s", rc.DeviceID, message) - return } - if h.hub.Get(rc.DeviceID) != rc { - return - } - - state := recorderStateFromRPCData(response.Data) - if strings.TrimSpace(state.CurrentState) == "" { - logger.Printf("[RECORDER] Recorder %s get_state after connect returned empty state", rc.DeviceID) - return - } - h.applyRecorderStateSnapshot(rc, state, "get_state") } func (h *RecorderHandler) applyRecorderStateSnapshot(rc *services.RecorderConn, state services.RecorderState, source string) { if rc == nil { return } + state.Source = strings.TrimSpace(source) rc.UpdateState(state) - h.reconcileRecorderTaskState(rc.DeviceID, state, source) + st := rc.GetState() + h.reconcileRecorderTaskState(rc.DeviceID, st, source) + h.publishRecorderStateSnapshot(rc, st, source, true) if source == "state_update" { // #nosec G706 -- Set aside for now - logger.Printf("[RECORDER] Recorder %s state=%s task=%s", rc.DeviceID, state.CurrentState, state.TaskID) + logger.Printf("[RECORDER] Recorder %s state=%s task=%s", rc.DeviceID, st.CurrentState, st.TaskID) return } - logger.Printf("[RECORDER] Recorder %s state=%s task=%s source=%s", rc.DeviceID, state.CurrentState, state.TaskID, source) + logger.Printf("[RECORDER] Recorder %s state=%s task=%s source=%s", rc.DeviceID, st.CurrentState, st.TaskID, source) } func recorderStateFromRPCData(data map[string]interface{}) services.RecorderState { diff --git a/internal/api/handlers/device_state_stream.go b/internal/api/handlers/device_state_stream.go new file mode 100644 index 0000000..25ced99 --- /dev/null +++ b/internal/api/handlers/device_state_stream.go @@ -0,0 +1,159 @@ +// SPDX-FileCopyrightText: 2026 ArcheBase +// +// SPDX-License-Identifier: MulanPSL-2.0 + +package handlers + +import ( + "encoding/json" + "fmt" + "net/http" + "strings" + "time" + + "github.com/gin-gonic/gin" + + "archebase.com/keystone-edge/internal/services" +) + +// DeviceStateHandler streams recorder state and recorder/transfer connection events. +type DeviceStateHandler struct { + broker *services.DeviceStateBroker + recorderHub *services.RecorderHub + transferHub *services.TransferHub +} + +// NewDeviceStateHandler creates a DeviceStateHandler. +func NewDeviceStateHandler(broker *services.DeviceStateBroker, recorderHub *services.RecorderHub, transferHub *services.TransferHub) *DeviceStateHandler { + return &DeviceStateHandler{broker: broker, recorderHub: recorderHub, transferHub: transferHub} +} + +// RegisterRoutes registers device state stream routes. +func (h *DeviceStateHandler) RegisterRoutes(apiV1 *gin.RouterGroup) { + apiV1.GET("/device-state/stream", h.Stream) +} + +// Stream emits Server-Sent Events for device state changes. +func (h *DeviceStateHandler) Stream(c *gin.Context) { + if h == nil || h.broker == nil { + c.JSON(http.StatusServiceUnavailable, gin.H{"error": "device state stream is not configured"}) + return + } + + deviceFilter := deviceStateStreamDeviceFilter(c) + w := c.Writer + w.Header().Set("Content-Type", "text/event-stream") + w.Header().Set("Cache-Control", "no-cache") + w.Header().Set("Connection", "keep-alive") + w.Header().Set("X-Accel-Buffering", "no") + w.WriteHeader(http.StatusOK) + w.Flush() + + events, unsubscribe := h.broker.Subscribe(64) + defer unsubscribe() + + heartbeat := time.NewTicker(15 * time.Second) + defer heartbeat.Stop() + + for { + select { + case <-c.Request.Context().Done(): + return + case ev, ok := <-events: + if !ok { + return + } + if !deviceStateEventMatchesFilter(ev, deviceFilter) { + continue + } + if err := writeDeviceStateSSE(w, ev); err != nil { + return + } + case <-heartbeat.C: + if _, err := fmt.Fprint(w, ": keepalive\n\n"); err != nil { + return + } + w.Flush() + } + } +} + +func writeDeviceStateSSE(w gin.ResponseWriter, ev services.DeviceStateEvent) error { + payload, err := json.Marshal(ev) + if err != nil { + return err + } + eventType := strings.TrimSpace(stringFromEvent(ev, "type")) + if eventType == "" { + eventType = "device_state" + } + if version, ok := ev["state_version"]; ok { + if _, err := fmt.Fprintf(w, "id: %v\n", version); err != nil { + return err + } + } + if _, err := fmt.Fprintf(w, "event: %s\n", eventType); err != nil { + return err + } + if _, err := fmt.Fprintf(w, "data: %s\n\n", payload); err != nil { + return err + } + w.Flush() + return nil +} + +func stringFromEvent(ev services.DeviceStateEvent, key string) string { + if ev == nil { + return "" + } + if s, ok := ev[key].(string); ok { + return s + } + return "" +} + +func deviceStateStreamDeviceFilter(c *gin.Context) map[string]struct{} { + ids := c.QueryArray("device_id") + if csv := strings.TrimSpace(c.Query("device_ids")); csv != "" { + ids = append(ids, strings.Split(csv, ",")...) + } + filter := make(map[string]struct{}, len(ids)) + for _, id := range ids { + id = strings.TrimSpace(id) + if id != "" { + filter[id] = struct{}{} + } + } + return filter +} + +func deviceStateEventMatchesFilter(ev services.DeviceStateEvent, filter map[string]struct{}) bool { + if len(filter) == 0 { + return true + } + deviceID := strings.TrimSpace(stringFromEvent(ev, "device_id")) + _, ok := filter[deviceID] + return ok +} + +func publishDeviceConnectionEvent( + broker *services.DeviceStateBroker, + recorderHub *services.RecorderHub, + transferHub *services.TransferHub, + deviceID string, + source string, +) { + deviceID = strings.TrimSpace(deviceID) + if broker == nil || deviceID == "" { + return + } + recorderConnected := recorderHub != nil && recorderHub.Get(deviceID) != nil + transferConnected := transferHub != nil && transferHub.Get(deviceID) != nil + broker.Publish(deviceID, services.DeviceStateEvent{ + "type": "device_connection", + "recorder_connected": recorderConnected, + "transfer_connected": transferConnected, + "connected": recorderConnected && transferConnected, + "source": strings.TrimSpace(source), + }) +} diff --git a/internal/api/handlers/transfer.go b/internal/api/handlers/transfer.go index acc6870..9b82d36 100644 --- a/internal/api/handlers/transfer.go +++ b/internal/api/handlers/transfer.go @@ -45,6 +45,7 @@ type TransferHandler struct { // recorderHub is used on transfer disconnect to notify recorder (clear/cancel) before reverting tasks. recorderHub *services.RecorderHub recorderRPCTimeout time.Duration + stateBroker *services.DeviceStateBroker } // NewTransferHandler creates a new TransferHandler. @@ -66,6 +67,14 @@ func NewTransferHandler(hub *services.TransferHub, cfg *config.TransferConfig, d } } +// SetDeviceStateBroker enables device connection event publishing. +func (h *TransferHandler) SetDeviceStateBroker(broker *services.DeviceStateBroker) { + if h == nil { + return + } + h.stateBroker = broker +} + // RegisterRoutes registers all transfer-related REST routes func (h *TransferHandler) RegisterRoutes(apiV1 *gin.RouterGroup) { // Note: apiV1 is already /transfer (set by server.go) @@ -118,6 +127,7 @@ func (h *TransferHandler) HandleWebSocket(w http.ResponseWriter, r *http.Request dc := h.hub.NewTransferConn(conn, deviceID, remoteIP) replacedConn := h.hub.ConnectReplacingExisting(deviceID, dc) closeReplacedTransferConn(deviceID, replacedConn) + publishDeviceConnectionEvent(h.stateBroker, h.recorderHub, h.hub, deviceID, "transfer_connected") defer func() { if err := conn.Close(websocket.StatusNormalClosure, ""); err != nil { @@ -128,6 +138,7 @@ func (h *TransferHandler) HandleWebSocket(w http.ResponseWriter, r *http.Request }() defer func() { if h.hub.Disconnect(deviceID, dc) { + publishDeviceConnectionEvent(h.stateBroker, h.recorderHub, h.hub, deviceID, "transfer_disconnected") revertRunnableTasksOnDeviceDisconnect(h.db, deviceID, h.recorderHub, h.recorderRPCTimeout, true) } }() diff --git a/internal/server/server.go b/internal/server/server.go index 45e0a09..f500efa 100644 --- a/internal/server/server.go +++ b/internal/server/server.go @@ -37,6 +37,7 @@ type Server struct { storage *handlers.StorageHandler transfer *handlers.TransferHandler recorder *handlers.RecorderHandler + deviceState *handlers.DeviceStateHandler episode *handlers.EpisodeHandler task *handlers.TaskHandler batch *handlers.BatchHandler @@ -87,6 +88,7 @@ func New(cfg *config.Config, db *sqlx.DB, s3Client *s3.Client, syncWorker *servi } // Recorder hub must exist before TransferHandler (transfer disconnect notifies recorder via RPC). + stateBroker := services.NewDeviceStateBroker() recorderHub := services.NewRecorderHub() recorderHandler := handlers.NewRecorderHandler(recorderHub, &cfg.AxonRecorder, db) recorderRPCTimeout := time.Duration(cfg.AxonRecorder.ResponseTimeout) * time.Second @@ -94,6 +96,9 @@ func New(cfg *config.Config, db *sqlx.DB, s3Client *s3.Client, syncWorker *servi // Create TransferHub and TransferHandler for Transfer Service transferHub := services.NewTransferHub(cfg.AxonTransfer.MaxEvents) transferHandler := handlers.NewTransferHandler(transferHub, &cfg.AxonTransfer, db, s3Client, cfg.Storage.Bucket, cfg.AxonTransfer.FactoryID, recorderHub, recorderRPCTimeout) + recorderHandler.SetDeviceStateDeps(transferHub, stateBroker) + transferHandler.SetDeviceStateBroker(stateBroker) + deviceStateHandler := handlers.NewDeviceStateHandler(stateBroker, recorderHub, transferHub) // Create EpisodeHandler for episode listing episodeHandler := handlers.NewEpisodeHandler(db, s3Client, cfg.Storage.Bucket, &cfg.Auth) @@ -152,6 +157,7 @@ func New(cfg *config.Config, db *sqlx.DB, s3Client *s3.Client, syncWorker *servi storage: storageHandler, transfer: transferHandler, recorder: recorderHandler, + deviceState: deviceStateHandler, episode: episodeHandler, task: taskHandler, batch: batchHandler, @@ -319,6 +325,9 @@ func (s *Server) buildRoutes() http.Handler { v1Recorder := v1Routes.Group("/recorder") s.recorder.RegisterRoutes(v1Recorder) + if s.deviceState != nil { + s.deviceState.RegisterRoutes(v1Routes) + } // Swagger documentation - serve at both root and api/v1 path s.engine.GET("/swagger/*any", ginSwagger.WrapHandler(swaggerFiles.Handler)) diff --git a/internal/services/device_state_broker.go b/internal/services/device_state_broker.go new file mode 100644 index 0000000..505c8b6 --- /dev/null +++ b/internal/services/device_state_broker.go @@ -0,0 +1,107 @@ +// SPDX-FileCopyrightText: 2026 ArcheBase +// +// SPDX-License-Identifier: MulanPSL-2.0 + +package services + +import ( + "sync" + "time" +) + +// DeviceStateEvent is a versioned event emitted when recorder state or device +// connection state changes. +type DeviceStateEvent map[string]interface{} + +// DeviceStateBroker fans out device state events to API stream subscribers. +type DeviceStateBroker struct { + mu sync.RWMutex + nextSubID uint64 + versions map[string]uint64 + subscribers map[uint64]chan DeviceStateEvent +} + +// NewDeviceStateBroker creates a state event broker. +func NewDeviceStateBroker() *DeviceStateBroker { + return &DeviceStateBroker{ + versions: make(map[string]uint64), + subscribers: make(map[uint64]chan DeviceStateEvent), + } +} + +// CurrentVersion returns the latest emitted version for a device. +func (b *DeviceStateBroker) CurrentVersion(deviceID string) uint64 { + if b == nil || deviceID == "" { + return 0 + } + b.mu.RLock() + defer b.mu.RUnlock() + return b.versions[deviceID] +} + +// Publish assigns a per-device monotonically increasing state_version and fans +// the event out to subscribers. Slow subscribers may drop events; clients must +// recover by fetching a full snapshot after reconnect. +func (b *DeviceStateBroker) Publish(deviceID string, event DeviceStateEvent) DeviceStateEvent { + if b == nil || deviceID == "" || event == nil { + return event + } + + b.mu.Lock() + version := b.versions[deviceID] + 1 + b.versions[deviceID] = version + event["device_id"] = deviceID + event["state_version"] = version + if _, ok := event["updated_at"]; !ok { + event["updated_at"] = time.Now().UTC().Format(time.RFC3339Nano) + } + + subscribers := make([]chan DeviceStateEvent, 0, len(b.subscribers)) + for _, ch := range b.subscribers { + subscribers = append(subscribers, ch) + } + b.mu.Unlock() + + for _, ch := range subscribers { + select { + case ch <- cloneDeviceStateEvent(event): + default: + } + } + return event +} + +// Subscribe registers a subscriber and returns a channel plus cleanup function. +func (b *DeviceStateBroker) Subscribe(buffer int) (<-chan DeviceStateEvent, func()) { + if buffer <= 0 { + buffer = 32 + } + ch := make(chan DeviceStateEvent, buffer) + if b == nil { + close(ch) + return ch, func() {} + } + + b.mu.Lock() + b.nextSubID++ + id := b.nextSubID + b.subscribers[id] = ch + b.mu.Unlock() + + return ch, func() { + b.mu.Lock() + if existing, ok := b.subscribers[id]; ok { + delete(b.subscribers, id) + close(existing) + } + b.mu.Unlock() + } +} + +func cloneDeviceStateEvent(src DeviceStateEvent) DeviceStateEvent { + out := make(DeviceStateEvent, len(src)) + for k, v := range src { + out[k] = v + } + return out +} diff --git a/internal/services/recorder_hub.go b/internal/services/recorder_hub.go index 05e5960..1077311 100644 --- a/internal/services/recorder_hub.go +++ b/internal/services/recorder_hub.go @@ -27,10 +27,13 @@ var ( // RecorderState stores the latest state snapshot reported by a recorder. type RecorderState struct { - CurrentState string `json:"current_state"` - TaskID string `json:"task_id,omitempty"` - UpdatedAt time.Time `json:"updated_at"` - Raw map[string]interface{} `json:"raw,omitempty"` + CurrentState string `json:"current_state"` + TaskID string `json:"task_id,omitempty"` + UpdatedAt time.Time `json:"updated_at"` + Source string `json:"source,omitempty"` + LastSyncAttemptAt time.Time `json:"last_sync_attempt_at,omitempty"` + LastSyncError string `json:"last_sync_error,omitempty"` + Raw map[string]interface{} `json:"raw,omitempty"` } // RecorderInfo is a read-only snapshot of a connected recorder. @@ -114,11 +117,32 @@ func (r *RecorderConn) IsStateSynced() bool { func (r *RecorderConn) UpdateState(state RecorderState) { r.StateMu.Lock() defer r.StateMu.Unlock() - state.UpdatedAt = time.Now() + state.UpdatedAt = time.Now().UTC() + state.LastSyncError = "" r.State = state r.StateSynced = strings.TrimSpace(state.CurrentState) != "" } +// MarkStateSyncing keeps the last snapshot for display but marks it unsafe for writes. +func (r *RecorderConn) MarkStateSyncing(source string, syncErr error) { + r.StateMu.Lock() + defer r.StateMu.Unlock() + state := r.State + now := time.Now().UTC() + if state.UpdatedAt.IsZero() { + state.UpdatedAt = now + } + state.Source = strings.TrimSpace(source) + state.LastSyncAttemptAt = now + if syncErr != nil { + state.LastSyncError = syncErr.Error() + } else { + state.LastSyncError = "" + } + r.State = state + r.StateSynced = false +} + // RecorderHub manages all active Axon Recorder WebSocket connections. // It embeds the generic Hub[*RecorderConn] for connection lifecycle and // adds the RPC request/response matching layer on top. From 592c45894dc633a5b96b47815b8c28355164a534 Mon Sep 17 00:00:00 2001 From: chao liu Date: Fri, 29 May 2026 10:49:04 +0800 Subject: [PATCH 08/12] fix: harden recorder state coordination --- .../recorder-axon-interaction-test-plan.md | 120 ++ .../recorder-axon-interaction-tests.html | 407 +++++ internal/api/handlers/axon_rpc.go | 83 +- internal/api/handlers/batch.go | 11 +- .../recorder_axon_interaction_test.go | 1589 +++++++++++++++++ internal/api/handlers/transfer.go | 22 +- internal/services/device_state_broker.go | 5 +- internal/services/device_state_broker_test.go | 37 + internal/services/recorder_hub.go | 13 +- 9 files changed, 2253 insertions(+), 34 deletions(-) create mode 100644 docs/designs/recorder-axon-interaction-test-plan.md create mode 100644 docs/designs/recorder-axon-interaction-tests.html create mode 100644 internal/api/handlers/recorder_axon_interaction_test.go create mode 100644 internal/services/device_state_broker_test.go diff --git a/docs/designs/recorder-axon-interaction-test-plan.md b/docs/designs/recorder-axon-interaction-test-plan.md new file mode 100644 index 0000000..9d267f0 --- /dev/null +++ b/docs/designs/recorder-axon-interaction-test-plan.md @@ -0,0 +1,120 @@ +# Recorder / Axon Interaction Test Plan + +This document defines the Keystone-side test coverage for recorder WebSocket and RPC behavior. The goal is to find protocol, state recovery, and duplicate-dispatch bugs with tests. If a test fails because product behavior is wrong, fix implementation code; do not weaken assertions only to make tests pass. + +## Scope + +Covered now: + +1. Keystone handler/unit behavior around recorder state, RPC result handling, task state transitions, connection replacement, and disconnect recovery. +2. Keystone-to-fake-Axon WebSocket interaction behavior using real websocket connections, including connect-time state sync, reconnect, late messages, RPC response shape, timeout, and duplicate config prevention. + +Out of scope for this document: + +- Synapse browser/UI tests. +- Axon C++ implementation tests. +- Network stack integration outside in-process `httptest` WebSocket servers. + +## Invariants + +- Config may be sent only when Keystone has a trusted state snapshot for the current recorder connection and that snapshot is `idle`. +- A recorder connection starts as unsynced. `state_synced` becomes true only after a non-empty recorder state snapshot from `state_update` or `get_state`. +- If the recorder is `ready`, `recording`, or `paused`, any new `config` request must be rejected before Keystone sends an RPC. +- A ping timeout or recorder WebSocket close may revert DB tasks from `ready` or `in_progress` to `pending`, but reconnect state sync must restore the same task when Axon still reports it. +- A task that is reverted to `pending` during reconnect must not be reconfigured while Axon still reports `ready` with that task id. +- Old/replaced recorder connections must not update DB state, satisfy pending RPCs, or override current connection state. +- RPC success controls DB transitions. `success:false`, timeout, or disconnected recorder must not advance/revert task state as if the command succeeded. +- Repeated state events for the same task must be idempotent. + +## Keystone Handler / Unit Test Matrix + +### State snapshot and DB reconciliation + +- [x] `state_update ready + task_id` advances `pending -> ready` and sets `ready_at`. +- [x] `state_update recording + task_id` advances `pending/ready -> in_progress` and sets `started_at`. +- [x] `state_update paused + task_id` advances `pending/ready -> in_progress` and sets `started_at`. +- [x] `get_state ready` with `task_config.task_id` advances `pending -> ready`. +- [x] `get_state recording` with top-level `task_id` advances `pending/ready -> in_progress`. +- [x] `get_state paused` with `task_config.task_id` advances `pending/ready -> in_progress`. +- [x] `idle + task_id` does not advance or revert DB state. +- [x] non-idle state without `task_id` marks connection synced but does not change DB state. +- [x] empty state snapshot keeps connection unsynced. +- [x] duplicate ready/recording snapshots are idempotent and do not corrupt timestamps. + +### State API contract + +- [x] disconnected `/state` returns `connected=false`, `state_synced=false`, `syncing=false`. +- [x] connected but unsynced `/state` returns `connected=true`, `state_synced=false`, `syncing=true`. +- [x] connected and synced `/state` returns `state_synced=true`, `syncing=false`. +- [x] `/devices` includes `state_synced` for each connected recorder. + +### Config gating + +- [x] `config` before initial state snapshot returns 409 and sends no RPC. +- [x] `config` while synced recorder is `ready` returns 409 and sends no RPC. +- [x] `config` while synced recorder is `recording` returns 409 and sends no RPC. +- [x] `config` while synced recorder is `paused` returns 409 and sends no RPC. +- [x] `config` while synced recorder state is unknown/non-idle returns 409 and sends no RPC. +- [x] `config` after synced `idle` sends RPC and advances `pending -> ready` only on success. +- [x] `config` with `success:false` keeps task `pending`. +- [x] `config` timeout keeps task `pending`. +- [x] `config` disconnected returns 404 and keeps task `pending`. + +### Command RPC side effects + +- [x] `begin` with `success:false` keeps task unchanged. +- [x] `begin` success advances `pending/ready -> in_progress` and is idempotent for already `in_progress`. +- [x] `begin` timeout or disconnect keeps task unchanged. +- [x] `cancel` success reverts `ready/in_progress -> pending` only for provided task id. +- [x] `cancel` `success:false`, timeout, or disconnect keeps task unchanged. +- [x] `clear` success reverts `ready -> pending`, does not revert `in_progress`, and sends no task payload to Axon. +- [x] `clear` `success:false`, timeout, or disconnect keeps task unchanged. +- [x] `pause`, `resume`, `finish`, and `quit` forward the RPC action and do not mutate task state directly. +- [x] `get_stats` disconnected returns HTTP 200 with `connected=false` and empty data. +- [x] `get_stats` Axon failure returns HTTP 200 with `connected=true`, data, and error message. +- [x] `get_stats` timeout returns HTTP 504. + +### Disconnect and replacement + +- [x] messages from replaced recorder connections are ignored for `state_update` and `config_applied`. +- [x] late `rpc_response` from replaced connections cannot satisfy current pending RPCs. +- [x] recorder disconnect reverts matching `ready` tasks to `pending` without sending recorder RPCs. +- [x] recorder disconnect reverts matching `in_progress` tasks to `pending` without sending recorder RPCs. +- [x] recorder disconnect does not revert `pending`, `completed`, `failed`, deleted, or other-device tasks. +- [x] old connection disconnect after replacement does not revert tasks for the current connection. + +## Keystone <-> Fake Axon WebSocket Test Matrix + +### Connect-time sync + +- [x] On WebSocket connect, Keystone sends `get_state`; fake Axon replies `idle`; `/state` becomes synced idle. +- [x] Fake Axon sends `state_update ready + task_id` before `get_state` response; task advances to `ready` and stays correct when `get_state` later returns same state. +- [x] `get_state` returns `ready + task_id`; task advances to `ready`; direct `config` is rejected as busy. +- [x] `get_state` returns `recording + task_id`; task advances to `in_progress`; direct `config` is rejected as busy. +- [x] `get_state` timeout leaves the connection unsynced, `/state` keeps `syncing=true`, and `config` is rejected until a valid `state_update` arrives. +- [x] `get_state` returns `success:false`; connection remains unsynced and `config` is rejected until a valid `state_update` arrives. +- [x] `state_update` with empty current state does not mark connection synced. + +### Reconnect after timeout / close + +- [x] Ready task path: task is `ready`; recorder WebSocket closes; DB reverts to `pending`; fake Axon reconnects and reports `ready + same task_id`; DB returns to `ready`; no `config` RPC is sent during the window. +- [x] Recording task path: task is `in_progress`; recorder WebSocket closes; DB reverts to `pending`; fake Axon reconnects and reports `recording + same task_id`; DB returns to `in_progress`; no `config` RPC is sent during the window. +- [x] Idle-after-reconnect path: task is reverted to `pending`; fake Axon reconnects and reports `idle`; `config` is allowed and sends exactly one RPC. +- [x] Reconnect replaces a still-active old connection; old connection late `state_update`, `config_applied`, and `rpc_response` do not mutate task state or satisfy new requests. + +### RPC action protocol + +- [x] `config` sends action `config` with `task_config` payload exactly once when recorder is synced idle. +- [x] `clear` sends action `clear` without UI task payload even when HTTP body includes `task_id`. +- [x] `begin`, `cancel`, `finish`, `pause`, `resume`, `quit`, and `get_stats` send the expected action names and payloads. +- [x] Axon response with unmatched `request_id` is ignored and does not unblock Keystone RPC waiters. +- [x] Axon response `success:false` returns HTTP 200 with the RPC response body but does not mutate DB state. +- [x] Missing Axon response causes Keystone HTTP timeout and no DB mutation. + +## Implementation Notes + +- Prefer deterministic fake Axon helpers over sleeps. Helpers should expose channels for inbound RPC requests and send explicit RPC responses by request id. +- Use real `httptest` WebSocket servers for layer 2 tests so the Keystone read loop, connect-time `get_state`, and disconnect defers are exercised. +- Keep DB fixtures small but include `robots`, `workstations`, and `tasks` when testing disconnect joins. +- Use `ResponseTimeout` values short enough for timeout tests, but avoid making the suite flaky. +- Keep assertions strict: status code, response body, sent action, request payload, DB status, and timestamp side effects should be checked where relevant. diff --git a/docs/designs/recorder-axon-interaction-tests.html b/docs/designs/recorder-axon-interaction-tests.html new file mode 100644 index 0000000..9a225b2 --- /dev/null +++ b/docs/designs/recorder-axon-interaction-tests.html @@ -0,0 +1,407 @@ + + + + + + Keystone / Axon Recorder 交互测试说明 + + + +
+
+

Keystone / Axon Recorder 交互测试说明

+

这份文档把 Keystone 和 Axon Recorder 之间的关键测试项讲清楚:每个测试模拟什么情况、系统应该怎么反应、以及它防止哪类线上事故。

+
+ 覆盖范围:Keystone handler/unit + fake Axon WebSocket + 重点风险:重复 config、断线恢复、旧连接污染、RPC 超时 + 对应文件:recorder_axon_interaction_test.go / task_state_recovery_test.go +
+
+
+ +
+
+
8条核心不变量,定义系统必须守住的边界。
+
7个测试分组,从状态同步到重连恢复逐层覆盖。
+
40+个具体断言点,检查 HTTP 状态、RPC 是否发送、DB 是否变化。
+
0当前文档中没有未覆盖的测试项。
+
+ +
+

先看结论

系统必须守住的事
+

Keystone 和 Axon 的交互最怕“状态还没同步就下发新任务”。所以测试围绕一个核心原则展开:只有确认 Axon 当前是可信的 idle 状态,Keystone 才能发 config

+
+
1. 新连接Recorder 连接上来时先视为状态未知,不能配置任务。
+
2. 拉状态Keystone 发 get_state,或者等待 Axon 发 state_update
+
3. 状态可信只有拿到非空状态,才认为这条连接已经同步。
+
4. 忙碌拒绝如果 Axon 是 readyrecordingpaused,新 config 必须拒绝。
+
5. 断线恢复DB 可以短暂回到 pending,但重连后要按 Axon 真实状态恢复。
+
+
+ +
+

状态同步与 DB 恢复

10 项
+

这些测试确认 Keystone 能正确理解 Axon 上报的状态,并把任务表恢复到正确状态;同时避免空状态、重复状态把 DB 搞乱。

+
+
状态恢复

Axon 报 ready,要把 pending 任务恢复成 ready

模拟
Axon 发 state_update ready + task_id,或 get_state ready
期望
DB 中对应任务从 pending 变成 ready,并写入 ready_at
防止
Recorder 已经准备好,但 Keystone 还以为任务没配置。
TestRecorderStateUpdateReadyRestoresPendingTask
TestRecorderGetStateSnapshotReadyRestoresPendingTask
TestRecorderStateSnapshotReconcilesAdditionalStates
+
状态恢复

Axon 报 recording 或 paused,要恢复成 in_progress

模拟
Axon 上报 recordingpaused,并带当前 task_id
期望
DB 中任务进入 in_progress,并设置 started_at
防止
设备已经录制中,但 Keystone 仍显示任务待开始。
TestRecorderStateUpdateRecordingAdvancesPendingTask
TestRecorderStateUpdatePausedAdvancesPendingTask
TestRecorderStateSnapshotReconcilesAdditionalStates
+
边界状态

Axon 报 idle,即使带 task_id,也不能推进 DB

模拟
收到 idle + task_id 这种容易误判的状态。
期望
任务状态不变,不自动变成 readyin_progress
防止
把空闲设备误认为已经配置好任务。
TestRecorderStateSnapshotReconcilesAdditionalStates
+
无 task_id

非 idle 但没有 task_id,只同步连接,不改任务

模拟
Axon 报 ready 但没带任务号。
期望
连接可以标记为已同步,但 DB 任务不能被推进。
防止
不知道是哪条任务时误改 DB。
TestRecorderNonIdleWithoutTaskIDMarksSyncedWithoutDBChange
+
同步门闩

空状态不能算同步完成

模拟
Axon 发空的 state_update 或空快照。
期望
state_synced 仍是 false,前后端继续认为还在同步中。
防止
空状态被当成 idle,导致错误开放 config。
TestRecorderEmptyStateSnapshotKeepsConnectionUnsynced
TestRecorderWebSocketEmptyStateUpdateDoesNotMarkSynced
+
幂等

重复状态不能破坏时间戳

模拟
state_update ready 先到,随后 get_state ready 又到。
期望
任务仍是 readyready_at 不被重复刷新。
防止
重复消息导致时间线被篡改,影响追踪。
TestRecorderWebSocketStateUpdateBeforeGetStateResponseIsIdempotent
+
+
+ +
+

状态 API 契约

4 项
+

这些测试确认 Synapse 或其它调用方看到的状态字段是可靠的,尤其是 state_syncedsyncing

+
+
API

断开时 /state 要明确说未连接

模拟
Recorder WebSocket 不存在。
期望
返回 connected=falsestate_synced=falsesyncing=false
防止
前端把离线误当同步中或可操作。
TestRecorderGetStateReportsSyncingBeforeInitialStateSnapshot
相关 disconnected /state 断言
+
API

连接上但没同步时,要显示 syncing

模拟
WebSocket 已连接,但还没收到有效状态。
期望
connected=truesyncing=true
防止
重连窗口里误开放配置表单。
TestRecorderGetStateReportsSyncingBeforeInitialStateSnapshot
+
API

同步完成后,/state 要显示可信状态

模拟
收到有效 idle 快照。
期望
state_synced=truesyncing=false
防止
设备已经可判断状态,但 UI 一直卡在同步中。
TestRecorderGetStateReportsSyncedAfterInitialStateSnapshot
TestRecorderWebSocketConnectGetStateIdleMarksStateSynced
+
API

/devices 也要带 state_synced

模拟
一个 Recorder 已同步,一个未同步。
期望
设备列表能区分每台设备是否完成状态同步。
防止
列表层只知道在线,不知道能不能安全下发任务。
TestRecorderListDevicesIncludesStateSynced
+
+
+ +
+

config 下发门闩

9 项
+

这些测试直接保护“不要重复下发任务”。只要 Keystone 还没确认 Axon 是可信 idle,就不能把 config 送到设备。

+
+
禁止重复下发

初始状态未同步时,config 必须 409

模拟
Recorder 刚重连,还没有有效状态。
期望
HTTP 返回 409,且 fake Axon 收不到 config
防止
重连窗口把 pending 任务重复配置给设备。
TestRecorderConfigRejectsBeforeInitialStateSnapshot
TestRecorderWebSocketUnsyncedConfigRejectedUntilValidStateUpdate
+
忙碌保护

Axon 已 ready / recording / paused 时,config 必须拒绝

模拟
状态已同步,但 Axon 正持有任务或录制中。
期望
返回 409,且不发 RPC。
防止
给一个已经有任务的 Recorder 再塞一条任务。
TestRecorderConfigRejectsWhenSyncedRecorderBusy
TestRecorderConfigRejectsAdditionalBusyStates
TestRecorderWebSocketGetStateReadyRejectsConfigWithoutSendingRPC
+
允许路径

只有 synced idle 才能发 config

模拟
Keystone 已确认 Axon 是 idle
期望
发出一次 config RPC;成功后 DB 从 pendingready
防止
正常生产路径被安全门闩误伤。
TestRecorderConfigAdvancesTaskAfterInitialStateSnapshot
TestRecorderWebSocketIdleReconnectAllowsSingleConfig
+
失败不改 DB

config 返回 success:false,任务仍是 pending

模拟
Axon 拒绝配置。
期望
Keystone 不把任务推进到 ready
防止
设备没接受任务,但后台显示已就绪。
TestRecorderConfigDoesNotAdvanceTaskWhenRPCResponseUnsuccessful
+
超时保护

config 超时,任务仍是 pending

模拟
Keystone 发了 config,Axon 没回应。
期望
HTTP 504,DB 不变。
防止
不知道设备有没有收到时,后台抢先标 ready。
TestRecorderConfigTimeoutAndDisconnectedKeepTaskPending
Missing Axon response assertions
+
离线保护

Recorder 离线时 config 返回 404

模拟
没有对应 WebSocket 连接。
期望
返回 404,任务保持 pending。
防止
给不存在的连接制造假成功。
TestRecorderConfigTimeoutAndDisconnectedKeepTaskPending
+
+
+ +
+

命令 RPC 的副作用

11 项
+

这些测试检查 begin、cancel、clear、pause、resume、finish、quit、get_stats 等命令:哪些命令应该改 DB,哪些只应该转发给 Axon。

+
+
begin

begin 成功才进入 in_progress

模拟
Axon 对 begin 返回成功。
期望
任务从 pending/ready 进入 in_progress;已在 in_progress 时保持幂等。
防止
重复 begin 或状态不一致造成任务错乱。
TestRecorderBeginSuccessTimeoutAndDisconnectedTaskState
TestRecorderBeginSuccessAdvancesReadyAndLeavesInProgress
+
begin 失败

begin 失败、超时或断连时不改任务

模拟
Axon 返回失败、不响应、或 Recorder 不在线。
期望
任务状态保持原样。
防止
设备没开始录制,后台却显示录制中。
TestRecorderBeginDoesNotAdvanceTaskWhenRPCResponseUnsuccessful
TestRecorderBeginSuccessTimeoutAndDisconnectedTaskState
+
cancel

cancel 成功才把 ready/in_progress 回退 pending

模拟
Axon 接受取消命令。
期望
只有提供的 task_id 从运行态回到 pending
防止
取消一条任务时误伤其它任务。
TestRecorderCancelAndClearTaskStateSideEffects
+
cancel 失败

cancel 失败、超时或断连时不回退 DB

模拟
Axon 拒绝取消、无响应、或断线。
期望
DB 继续保持原状态。
防止
设备还在录制,后台却把任务放回 pending。
TestRecorderCancelAndClearTaskStateSideEffects
TestRecorderCancelTimeoutAndDisconnectedKeepTaskState
+
clear

clear 成功只清 ready,不清 in_progress

模拟
对 ready 任务执行 clear,同时存在 in_progress 任务。
期望
ready 任务回 pending,in_progress 任务不变;发给 Axon 的 clear 不带 UI 的 task payload。
防止
清配置时误取消录制中任务,或协议 payload 不稳定。
TestRecorderCancelAndClearTaskStateSideEffects
TestRecorderWebSocketRPCActionProtocol
+
clear 失败

clear 失败、超时或断连时不改 DB

模拟
Axon 拒绝 clear、不响应、或离线。
期望
ready 任务仍保持 ready。
防止
设备没清掉配置,后台却显示可重新配置。
TestRecorderClearFailureTimeoutAndDisconnectedKeepTaskState
+
纯转发命令

pause / resume / finish / quit 只发 RPC,不直接改 DB

模拟
这些命令返回成功。
期望
Keystone 只转发命令,不擅自改变任务状态。
防止
设备实际状态还没上报,后台先行改状态。
TestRecorderForwardOnlyActionsDoNotMutateTaskState
TestRecorderWebSocketRPCActionProtocol
+
stats

get_stats 的三种结果都要明确

模拟
Recorder 离线、Axon 返回失败、Axon 不响应。
期望
离线返回 connected=false;设备失败返回 200 + error;超时返回 504。
防止
监控面板把离线、设备错误、网络超时混成一种情况。
TestRecorderGetStatsContracts
+
+
+ +
+

断线、重连和旧连接污染

10 项
+

这些测试专门覆盖 ping timeout / WebSocket close 后的危险窗口:DB 可能短暂回 pending,但不能因此重复下发,也不能让旧连接的迟到消息污染新连接。

+
+
断线回退

断开时只回退本设备 runnable 任务

模拟
同一设备下有 ready、in_progress、pending、completed、deleted 任务,另一个设备也有 ready。
期望
只把本设备 ready/in_progress 回 pending,并清理运行时字段。
防止
断线清理误伤别的设备或已完成任务。
TestRecorderDisconnectRevertsOnlyRunnableTasksForDevice
+
ready 重连

ready 任务断线后,重连仍恢复同一条任务

模拟
DB 中 ready 任务因断线回 pending;Axon 重连后报告 ready + same task_id
期望
DB 恢复 ready;窗口内和恢复后都不重复发 config。
防止
你关心的“同一 task_id 被再次下发 config”。
TestRecorderWebSocketReadyReconnectRestoresSameTaskWithoutDuplicateConfig
+
recording 重连

录制中断线后,重连恢复 in_progress

模拟
DB 中 in_progress 任务断线回 pending;Axon 重连后报告 recording + same task_id
期望
DB 恢复 in_progress;不会开放新 config。
防止
设备仍在录制,后台却允许配置下一条。
TestRecorderWebSocketRecordingReconnectRestoresSameTaskWithoutDuplicateConfig
+
idle 重连

如果 Axon 重连后确实 idle,才允许重新 config

模拟
断线回 pending 后,Axon 重连报告 idle
期望
Keystone 允许 config,并且只发送一次。
防止
安全门闩过严,导致设备确实空闲时无法恢复生产。
TestRecorderWebSocketIdleReconnectAllowsSingleConfig
+
旧连接污染

旧连接的 state_update / config_applied 必须忽略

模拟
新连接已经替换旧连接,旧连接又迟到发状态变更消息。
期望
DB 不被旧连接推进。
防止
网络迟到消息把当前真实状态覆盖掉。
TestRecorderIgnoresStateChangingMessagesFromReplacedConnection
TestRecorderWebSocketReplacementIgnoresOldConnectionLateMessages
+
旧响应污染

旧连接的 rpc_response 不能完成新请求

模拟
新连接上有 pending RPC;旧连接伪造同 request_id 的响应。
期望
旧响应被忽略,只有当前连接响应才能完成请求。
防止
旧连接的成功响应让新请求误成功。
TestRecorderIgnoresRPCResponseFromReplacedConnection
TestRecorderWebSocketReplacementIgnoresOldConnectionLateMessages
+
替换断开

旧连接被替换后断开,不应触发任务回退

模拟
新连接已经接管,旧连接随后 disconnect。
期望
当前 ready 任务不被旧连接断开回退。
防止
连接接管后,被旧连接清理逻辑误伤。
TestOldRecorderDisconnectAfterReplacementDoesNotRevertTasks
+
+
+ +
+

connect-time get_state 与协议细节

8 项
+

这些测试使用真实 WebSocket 和 fake Axon,检查 Keystone 发出的 RPC action、payload、request_id 匹配和超时行为。

+
+
连接握手

WebSocket 一连上,Keystone 会主动 get_state

模拟
fake Axon 建立 WebSocket 连接。
期望
首先收到 get_state;回复 idle 后 Keystone 状态变 synced idle。
防止
重连后 Keystone 不主动校准设备状态。
TestRecorderWebSocketConnectGetStateIdleMarksStateSynced
+
get_state 超时

get_state 超时后仍保持 syncing,config 继续阻断

模拟
fake Axon 收到 get_state 但不回应,等到 ResponseTimeout 之后再操作。
期望
/state 仍是 syncing=true;config 返回 409 且不发 RPC;之后有效 state_update 才解锁。
防止
状态拉取超时后被误当成 idle。
TestRecorderWebSocketGetStateTimeoutLeavesUnsyncedAndBlocksConfig
+
get_state 失败

get_state 返回 success:false,也不能解锁 config

模拟
Axon 明确回复状态不可用。
期望
连接仍 unsynced,config 仍被阻断。
防止
把失败响应当成有效状态。
TestRecorderWebSocketGetStateFailureLeavesUnsyncedAndBlocksConfig
+
RPC payload

每个命令发出的 action 和 payload 要稳定

模拟
依次调用 config、begin、cancel、clear、finish、pause、resume、quit、get_stats。
期望
fake Axon 收到正确 action;config 带 task_config;clear 不带 UI task_id。
防止
协议字段改错,导致 Axon 端无法解析。
TestRecorderWebSocketRPCActionProtocol
+
request_id

不匹配的 rpc_response 不能解开 pending RPC

模拟
Axon 返回一个不存在的 request_id
期望
Keystone 继续等待真正响应。
防止
不同请求串线,HTTP 请求拿到别人的结果。
TestRecorderUnmatchedRPCResponseDoesNotUnblockPendingRequest
+
无响应

Axon 不响应时,Keystone HTTP 超时且不改 DB

模拟
fake Axon 收到命令但不回 rpc_response。
期望
HTTP 504,DB 不做成功态变更。
防止
设备是否执行未知时,后台提前写成功。
TestRecorderConfigTimeoutAndDisconnectedKeepTaskPending
TestRecorderBeginSuccessTimeoutAndDisconnectedTaskState
TestRecorderGetStatsContracts
+
+
+ +
+

怎么读这些测试

实用说明
+
+

如果某个测试失败,不应该先改测试去“适配现状”。这些测试是用来暴露潜在问题的:例如任务被错误推进、断线后被重复 config、旧连接消息污染新连接、或者 Axon 超时后 Keystone 错误解锁。正确处理方式是先判断测试描述的业务预期是否成立;如果成立,就修实现。

+

这份 HTML 是给人看的说明;机器执行的测试仍以 Go 测试文件为准。详细测试清单也保留在 recorder-axon-interaction-test-plan.md

+
+
+
+ +
+
Generated documentation for Keystone Recorder / Axon interaction tests. Open this file directly in a browser.
+
+ + diff --git a/internal/api/handlers/axon_rpc.go b/internal/api/handlers/axon_rpc.go index 6253ca5..1f008ee 100644 --- a/internal/api/handlers/axon_rpc.go +++ b/internal/api/handlers/axon_rpc.go @@ -191,7 +191,7 @@ func (h *RecorderHandler) HandleWebSocket(w http.ResponseWriter, r *http.Request // #nosec G706 -- Set aside for now logger.Printf("[RECORDER] Recorder %s connected from %s", deviceID, remoteIP) - go h.syncRecorderStateFromDevice(rc) + go h.syncRecorderStateFromDevice(ctx, rc) for { _, raw, err := conn.Read(ctx) @@ -503,12 +503,14 @@ func (h *RecorderHandler) GetStats(c *gin.Context) { return } - response, err := h.hub.SendRPC(c.Request.Context(), deviceID, "get_stats", nil, recorderRPCResponseTimeout(h.cfg)) + timeout := recorderRPCResponseTimeout(h.cfg) + response, err := h.hub.SendRPC(c.Request.Context(), deviceID, "get_stats", nil, timeout) if err != nil { switch { case errors.Is(err, services.ErrRecorderNotConnected): c.JSON(http.StatusOK, disconnectedRecorderStatsResponse()) case errors.Is(err, services.ErrRecorderRPCTimeout): + logRecorderRPCTimeout(deviceID, "get_stats", "", "stats", timeout, err) c.JSON(http.StatusGatewayTimeout, gin.H{"error": err.Error()}) default: c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()}) @@ -628,6 +630,9 @@ func (h *RecorderHandler) refreshRecorderState(ctx context.Context, deviceID str timeout := recorderRPCResponseTimeout(h.cfg) response, err := h.hub.SendRPC(ctx, deviceID, "get_state", nil, timeout) if err != nil { + if errors.Is(err, services.ErrRecorderRPCTimeout) { + logRecorderRPCTimeout(deviceID, "get_state", "", "state_refresh", timeout, err) + } h.markRecorderSyncing(rc, "get_state", err) return rc.GetState(), false, err } @@ -693,10 +698,12 @@ func recorderStateEvent(state services.RecorderState, eventType string, source s } } -func (h *RecorderHandler) handleRecorderRPCTimeout(deviceID string, action string, params map[string]interface{}, timeoutErr error) { +func (h *RecorderHandler) handleRecorderRPCTimeout(ctx context.Context, deviceID string, action string, params map[string]interface{}, timeout time.Duration, timeoutErr error) { + taskID := firstNonEmptyString(params, "task_id") + logRecorderRPCTimeout(deviceID, action, taskID, "api_rpc", timeout, timeoutErr) if rc := h.hub.Get(deviceID); rc != nil { h.markRecorderSyncing(rc, "rpc_timeout:"+action, timeoutErr) - go h.syncRecorderStateFromDevice(rc) + go h.syncRecorderStateFromDevice(ctx, rc) } if h.stateBroker == nil { return @@ -704,10 +711,12 @@ func (h *RecorderHandler) handleRecorderRPCTimeout(deviceID string, action strin h.stateBroker.Publish(deviceID, services.DeviceStateEvent{ "type": "recorder_operation_unknown", "action": strings.TrimSpace(action), - "task_id": firstNonEmptyString(params, "task_id"), + "task_id": taskID, "result_unknown": true, "source": "rpc_timeout:" + strings.TrimSpace(action), "error": timeoutErr.Error(), + "timeout": timeoutLogValue(timeout), + "timeout_ms": timeoutLogMilliseconds(timeout), }) } @@ -780,13 +789,14 @@ func (h *RecorderHandler) callRPC(c *gin.Context, action string, params map[stri return false } - response, err := h.hub.SendRPC(c.Request.Context(), deviceID, action, params, recorderRPCResponseTimeout(h.cfg)) + timeout := recorderRPCResponseTimeout(h.cfg) + response, err := h.hub.SendRPC(c.Request.Context(), deviceID, action, params, timeout) if err != nil { switch { case errors.Is(err, services.ErrRecorderNotConnected): c.JSON(http.StatusNotFound, gin.H{"error": err.Error()}) case errors.Is(err, services.ErrRecorderRPCTimeout): - h.handleRecorderRPCTimeout(deviceID, action, params, err) + h.handleRecorderRPCTimeout(c.Request.Context(), deviceID, action, params, timeout, err) c.JSON(http.StatusGatewayTimeout, gin.H{ "error": err.Error(), "action": action, @@ -948,7 +958,7 @@ func (h *RecorderHandler) handleStateUpdate(rc *services.RecorderConn, msg map[s h.applyRecorderStateSnapshot(rc, state, "state_update") } -func (h *RecorderHandler) syncRecorderStateFromDevice(rc *services.RecorderConn) { +func (h *RecorderHandler) syncRecorderStateFromDevice(ctx context.Context, rc *services.RecorderConn) { if h == nil || h.hub == nil || rc == nil || strings.TrimSpace(rc.DeviceID) == "" { return } @@ -956,8 +966,8 @@ func (h *RecorderHandler) syncRecorderStateFromDevice(rc *services.RecorderConn) return } - if _, _, err := h.refreshRecorderState(context.Background(), rc.DeviceID, rc, -1); err != nil { - if !errors.Is(err, services.ErrRecorderNotConnected) { + if _, _, err := h.refreshRecorderState(ctx, rc.DeviceID, rc, -1); err != nil { + if !errors.Is(err, services.ErrRecorderNotConnected) && !errors.Is(err, context.Canceled) { logger.Printf("[RECORDER] Recorder %s get_state after connect failed: %v", rc.DeviceID, err) } } @@ -1117,10 +1127,11 @@ func (h *RecorderHandler) pingLoop(ctx context.Context, rc *services.RecorderCon case <-ticker.C: pingCtx, cancel := context.WithTimeout(ctx, timeout) err := rc.Conn.Ping(pingCtx) + timedOut := errors.Is(err, context.DeadlineExceeded) || errors.Is(pingCtx.Err(), context.DeadlineExceeded) cancel() if err != nil { if ctx.Err() == nil { - logger.Printf("[RECORDER] Recorder %s ping failed: %v", rc.DeviceID, err) + logWebSocketPingFailure("RECORDER", "Recorder", rc.DeviceID, timeout, timedOut, err) if closeErr := rc.Conn.CloseNow(); closeErr != nil { if !isExpectedWebSocketCloseError(closeErr) { logger.Printf("[RECORDER] Recorder %s close after ping failure: %v", rc.DeviceID, closeErr) @@ -1143,6 +1154,49 @@ func recorderRPCResponseTimeout(cfg *config.RecorderConfig) time.Duration { return time.Duration(cfg.ResponseTimeout) * time.Second } +func timeoutLogValue(timeout time.Duration) string { + if timeout <= 0 { + return "unknown" + } + return timeout.String() +} + +func timeoutLogMilliseconds(timeout time.Duration) int64 { + if timeout <= 0 { + return 0 + } + return timeout.Milliseconds() +} + +func logRecorderRPCTimeout(deviceID, action, taskID, source string, timeout time.Duration, err error) { + deviceID = strings.TrimSpace(deviceID) + action = strings.TrimSpace(action) + if action == "" { + action = "unknown" + } + source = strings.TrimSpace(source) + if source == "" { + source = "unknown" + } + taskID = strings.TrimSpace(taskID) + if taskID != "" { + logger.Printf("[RECORDER] Recorder %s RPC timeout after %s (timeout_ms=%d): action=%s task=%s source=%s err=%v", deviceID, timeoutLogValue(timeout), timeoutLogMilliseconds(timeout), action, taskID, source, err) + return + } + logger.Printf("[RECORDER] Recorder %s RPC timeout after %s (timeout_ms=%d): action=%s source=%s err=%v", deviceID, timeoutLogValue(timeout), timeoutLogMilliseconds(timeout), action, source, err) +} + +func logWebSocketPingFailure(component, label, deviceID string, timeout time.Duration, timedOut bool, err error) { + component = strings.TrimSpace(component) + label = strings.TrimSpace(label) + deviceID = strings.TrimSpace(deviceID) + if timedOut { + logger.Printf("[%s] %s %s ping timeout after %s (timeout_ms=%d): %v", component, label, deviceID, timeoutLogValue(timeout), timeoutLogMilliseconds(timeout), err) + return + } + logger.Printf("[%s] %s %s ping failed (timeout=%s timeout_ms=%d): %v", component, label, deviceID, timeoutLogValue(timeout), timeoutLogMilliseconds(timeout), err) +} + func recorderPingInterval(cfg *config.RecorderConfig) time.Duration { if cfg == nil || cfg.PingInterval <= 0 { return 0 @@ -1157,13 +1211,6 @@ func recorderPingTimeout(cfg *config.RecorderConfig) time.Duration { return time.Duration(cfg.PingTimeout) * time.Second } -func recorderStaleThreshold(cfg *config.RecorderConfig) time.Duration { - if cfg == nil || cfg.StaleThreshold <= 0 { - return 0 - } - return time.Duration(cfg.StaleThreshold) * time.Second -} - func closeReplacedRecorderConn(deviceID string, rc *services.RecorderConn) { if rc == nil || rc.Conn == nil { return diff --git a/internal/api/handlers/batch.go b/internal/api/handlers/batch.go index de1d670..39c95b2 100644 --- a/internal/api/handlers/batch.go +++ b/internal/api/handlers/batch.go @@ -8,6 +8,7 @@ import ( "context" "database/sql" "encoding/json" + "errors" "fmt" "io" "net/http" @@ -1157,7 +1158,15 @@ func notifyRecorderCancelTasksWithHub(ctx context.Context, hub *services.Recorde continue } if err != nil { - logger.Printf("[BATCH] Batch %d: failed to notify recorder (status=%s): device=%s task=%s err=%v", batchID, st, deviceID, taskID, err) + action := "cancel" + if st == "ready" { + action = "clear" + } + if errors.Is(err, services.ErrRecorderRPCTimeout) { + logRecorderRPCTimeout(deviceID, action, taskID, "batch_cancel", timeout, err) + } else { + logger.Printf("[BATCH] Batch %d: failed to notify recorder (status=%s): device=%s task=%s err=%v", batchID, st, deviceID, taskID, err) + } } } } diff --git a/internal/api/handlers/recorder_axon_interaction_test.go b/internal/api/handlers/recorder_axon_interaction_test.go new file mode 100644 index 0000000..9851cef --- /dev/null +++ b/internal/api/handlers/recorder_axon_interaction_test.go @@ -0,0 +1,1589 @@ +// SPDX-FileCopyrightText: 2026 ArcheBase +// +// SPDX-License-Identifier: MulanPSL-2.0 + +package handlers + +import ( + "bytes" + "context" + "encoding/json" + "net/http" + "net/http/httptest" + "strings" + "testing" + "time" + + "archebase.com/keystone-edge/internal/config" + "archebase.com/keystone-edge/internal/services" + "github.com/coder/websocket" + "github.com/coder/websocket/wsjson" + "github.com/gin-gonic/gin" + "github.com/jmoiron/sqlx" + _ "modernc.org/sqlite" +) + +func TestRecorderStateSnapshotReconcilesAdditionalStates(t *testing.T) { + tests := []struct { + name string + initial string + state services.RecorderState + wantStatus string + wantColumn string + }{ + { + name: "get_state recording with top-level task_id advances to in_progress", + initial: "pending", + state: recorderStateFromRPCData(map[string]interface{}{"state": "recording", "task_id": "task-state"}), + wantStatus: "in_progress", + wantColumn: "started_at", + }, + { + name: "get_state paused with nested task_config advances to in_progress", + initial: "ready", + state: recorderStateFromRPCData(map[string]interface{}{"state": "paused", "task_config": map[string]interface{}{"task_id": "task-state"}}), + wantStatus: "in_progress", + wantColumn: "started_at", + }, + { + name: "idle with task_id does not change pending task", + initial: "pending", + state: services.RecorderState{CurrentState: "idle", TaskID: "task-state"}, + wantStatus: "pending", + }, + { + name: "ready without task_id does not change pending task", + initial: "pending", + state: services.RecorderState{CurrentState: "ready"}, + wantStatus: "pending", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + db := newTaskStateRecoveryDB(t) + defer db.Close() + seedTaskStateRecoveryTask(t, db, "task-state", tt.initial) + + hub := services.NewRecorderHub() + handler := NewRecorderHandler(hub, &config.RecorderConfig{}, db) + rc := hub.NewRecorderConn(nil, "robot-001", "127.0.0.1") + + handler.applyRecorderStateSnapshot(rc, tt.state, "get_state") + + assertTaskStateRecoveryStatus(t, db, "task-state", tt.wantStatus) + if tt.wantColumn != "" { + assertTaskStateRecoveryTimestampSet(t, db, "task-state", tt.wantColumn) + } + }) + } +} + +func TestRecorderNonIdleWithoutTaskIDMarksSyncedWithoutDBChange(t *testing.T) { + db := newTaskStateRecoveryDB(t) + defer db.Close() + seedTaskStateRecoveryTask(t, db, "task-no-id", "pending") + + hub := services.NewRecorderHub() + handler := NewRecorderHandler(hub, &config.RecorderConfig{}, db) + rc := hub.NewRecorderConn(nil, "robot-001", "127.0.0.1") + + handler.applyRecorderStateSnapshot(rc, services.RecorderState{CurrentState: "ready"}, "state_update") + + if !rc.IsStateSynced() { + t.Fatalf("non-empty state without task_id did not mark connection synced") + } + assertTaskStateRecoveryStatus(t, db, "task-no-id", "pending") +} + +func TestRecorderEmptyStateSnapshotKeepsConnectionUnsynced(t *testing.T) { + hub := services.NewRecorderHub() + handler := NewRecorderHandler(hub, &config.RecorderConfig{}, nil) + rc := hub.NewRecorderConn(nil, "robot-001", "127.0.0.1") + + handler.applyRecorderStateSnapshot(rc, services.RecorderState{}, "state_update") + + if rc.IsStateSynced() { + t.Fatalf("empty state snapshot marked connection as synced") + } +} + +func TestRecorderListDevicesIncludesStateSynced(t *testing.T) { + hub := services.NewRecorderHub() + unsynced := hub.NewRecorderConn(nil, "robot-unsynced", "127.0.0.1") + if !hub.Connect("robot-unsynced", unsynced) { + t.Fatalf("connect unsynced recorder failed") + } + synced := hub.NewRecorderConn(nil, "robot-synced", "127.0.0.1") + synced.UpdateState(services.RecorderState{CurrentState: "idle"}) + if !hub.Connect("robot-synced", synced) { + t.Fatalf("connect synced recorder failed") + } + handler := NewRecorderHandler(hub, &config.RecorderConfig{}, nil) + + gin.SetMode(gin.TestMode) + router := gin.New() + router.GET("/recorder/devices", handler.ListDevices) + w := httptest.NewRecorder() + router.ServeHTTP(w, httptest.NewRequest(http.MethodGet, "/recorder/devices", nil)) + + if w.Code != http.StatusOK { + t.Fatalf("status=%d want=%d body=%s", w.Code, http.StatusOK, w.Body.String()) + } + var body struct { + Devices []struct { + DeviceID string `json:"device_id"` + StateSynced bool `json:"state_synced"` + } `json:"devices"` + } + if err := json.Unmarshal(w.Body.Bytes(), &body); err != nil { + t.Fatalf("decode list devices response: %v", err) + } + got := map[string]bool{} + for _, device := range body.Devices { + got[device.DeviceID] = device.StateSynced + } + if got["robot-unsynced"] { + t.Fatalf("unsynced recorder reported state_synced=true: %#v", body.Devices) + } + if !got["robot-synced"] { + t.Fatalf("synced recorder reported state_synced=false: %#v", body.Devices) + } +} + +func TestRecorderConfigRejectsAdditionalBusyStates(t *testing.T) { + for _, state := range []string{"recording", "paused", "unknown", "maintenance"} { + t.Run(state, func(t *testing.T) { + db := newTaskStateRecoveryDB(t) + defer db.Close() + seedTaskStateRecoveryTask(t, db, "task-config-busy", "pending") + + hub := services.NewRecorderHub() + rpcCalled := make(chan services.RPCRequest, 1) + rc := attachRecorderRPCResponderWithConn(t, hub, "robot-001", func(req services.RPCRequest) services.RPCResponse { + rpcCalled <- req + return services.RPCResponse{Success: true} + }) + handler := NewRecorderHandler(hub, &config.RecorderConfig{ResponseTimeout: 1}, db) + handler.applyRecorderStateSnapshot(rc, services.RecorderState{CurrentState: state, TaskID: "current-task"}, "state_update") + + router := newRecorderInteractionRouter(handler) + w := recorderInteractionPost(t, router, "/recorder/robot-001/config", `{"task_config":{"task_id":"task-config-busy"}}`) + + if w.Code != http.StatusConflict { + t.Fatalf("status=%d want=%d body=%s", w.Code, http.StatusConflict, w.Body.String()) + } + select { + case req := <-rpcCalled: + t.Fatalf("unexpected RPC sent while recorder state=%s: %#v", state, req) + default: + } + assertTaskStateRecoveryStatus(t, db, "task-config-busy", "pending") + }) + } +} + +func TestRecorderConfigTimeoutAndDisconnectedKeepTaskPending(t *testing.T) { + t.Run("timeout", func(t *testing.T) { + db := newTaskStateRecoveryDB(t) + defer db.Close() + seedTaskStateRecoveryTask(t, db, "task-config-timeout", "pending") + + hub := services.NewRecorderHub() + rc, requests := attachRecorderRPCObserverWithConn(t, hub, "robot-001") + handler := NewRecorderHandler(hub, &config.RecorderConfig{ResponseTimeout: 1}, db) + handler.applyRecorderStateSnapshot(rc, services.RecorderState{CurrentState: "idle"}, "state_update") + + router := newRecorderInteractionRouter(handler) + w := recorderInteractionPost(t, router, "/recorder/robot-001/config", `{"task_config":{"task_id":"task-config-timeout"}}`) + + if w.Code != http.StatusGatewayTimeout { + t.Fatalf("status=%d want=%d body=%s", w.Code, http.StatusGatewayTimeout, w.Body.String()) + } + req := receiveRecorderRPCRequest(t, requests, "config") + if req.Action != "config" { + t.Fatalf("action=%q want=config", req.Action) + } + assertTaskStateRecoveryStatus(t, db, "task-config-timeout", "pending") + }) + + t.Run("disconnected", func(t *testing.T) { + db := newTaskStateRecoveryDB(t) + defer db.Close() + seedTaskStateRecoveryTask(t, db, "task-config-disconnected", "pending") + + hub := services.NewRecorderHub() + handler := NewRecorderHandler(hub, &config.RecorderConfig{ResponseTimeout: 1}, db) + router := newRecorderInteractionRouter(handler) + w := recorderInteractionPost(t, router, "/recorder/robot-001/config", `{"task_config":{"task_id":"task-config-disconnected"}}`) + + if w.Code != http.StatusNotFound { + t.Fatalf("status=%d want=%d body=%s", w.Code, http.StatusNotFound, w.Body.String()) + } + assertTaskStateRecoveryStatus(t, db, "task-config-disconnected", "pending") + }) +} + +func TestRecorderBeginSuccessTimeoutAndDisconnectedTaskState(t *testing.T) { + t.Run("success advances pending to in_progress", func(t *testing.T) { + db := newTaskStateRecoveryDB(t) + defer db.Close() + seedTaskStateRecoveryTask(t, db, "task-begin-success", "pending") + + hub := services.NewRecorderHub() + attachRecorderRPCResponder(t, hub, "robot-001", func(req services.RPCRequest) services.RPCResponse { + return services.RPCResponse{Success: true} + }) + handler := NewRecorderHandler(hub, &config.RecorderConfig{ResponseTimeout: 1}, db) + router := newRecorderInteractionRouter(handler) + + w := recorderInteractionPost(t, router, "/recorder/robot-001/begin", `{"task_id":"task-begin-success"}`) + + if w.Code != http.StatusOK { + t.Fatalf("status=%d want=%d body=%s", w.Code, http.StatusOK, w.Body.String()) + } + assertTaskStateRecoveryStatus(t, db, "task-begin-success", "in_progress") + assertTaskStateRecoveryTimestampSet(t, db, "task-begin-success", "started_at") + }) + + t.Run("timeout keeps pending", func(t *testing.T) { + db := newTaskStateRecoveryDB(t) + defer db.Close() + seedTaskStateRecoveryTask(t, db, "task-begin-timeout", "pending") + + hub := services.NewRecorderHub() + _, requests := attachRecorderRPCObserverWithConn(t, hub, "robot-001") + handler := NewRecorderHandler(hub, &config.RecorderConfig{ResponseTimeout: 1}, db) + router := newRecorderInteractionRouter(handler) + + w := recorderInteractionPost(t, router, "/recorder/robot-001/begin", `{"task_id":"task-begin-timeout"}`) + + if w.Code != http.StatusGatewayTimeout { + t.Fatalf("status=%d want=%d body=%s", w.Code, http.StatusGatewayTimeout, w.Body.String()) + } + receiveRecorderRPCRequest(t, requests, "begin") + assertTaskStateRecoveryStatus(t, db, "task-begin-timeout", "pending") + }) + + t.Run("disconnected keeps pending", func(t *testing.T) { + db := newTaskStateRecoveryDB(t) + defer db.Close() + seedTaskStateRecoveryTask(t, db, "task-begin-disconnected", "pending") + + hub := services.NewRecorderHub() + handler := NewRecorderHandler(hub, &config.RecorderConfig{ResponseTimeout: 1}, db) + router := newRecorderInteractionRouter(handler) + + w := recorderInteractionPost(t, router, "/recorder/robot-001/begin", `{"task_id":"task-begin-disconnected"}`) + + if w.Code != http.StatusNotFound { + t.Fatalf("status=%d want=%d body=%s", w.Code, http.StatusNotFound, w.Body.String()) + } + assertTaskStateRecoveryStatus(t, db, "task-begin-disconnected", "pending") + }) +} + +func TestRecorderBeginSuccessAdvancesReadyAndLeavesInProgress(t *testing.T) { + for _, tt := range []struct { + name string + initial string + wantStatus string + }{ + {name: "ready advances to in_progress", initial: "ready", wantStatus: "in_progress"}, + {name: "in_progress stays in_progress", initial: "in_progress", wantStatus: "in_progress"}, + } { + t.Run(tt.name, func(t *testing.T) { + db := newTaskStateRecoveryDB(t) + defer db.Close() + seedTaskStateRecoveryTask(t, db, "task-begin-idempotent", tt.initial) + + hub := services.NewRecorderHub() + attachRecorderRPCResponder(t, hub, "robot-001", func(req services.RPCRequest) services.RPCResponse { + return services.RPCResponse{Success: true} + }) + handler := NewRecorderHandler(hub, &config.RecorderConfig{ResponseTimeout: 1}, db) + router := newRecorderInteractionRouter(handler) + + w := recorderInteractionPost(t, router, "/recorder/robot-001/begin", `{"task_id":"task-begin-idempotent"}`) + + if w.Code != http.StatusOK { + t.Fatalf("status=%d want=%d body=%s", w.Code, http.StatusOK, w.Body.String()) + } + assertTaskStateRecoveryStatus(t, db, "task-begin-idempotent", tt.wantStatus) + if tt.initial == "ready" { + assertTaskStateRecoveryTimestampSet(t, db, "task-begin-idempotent", "started_at") + } + }) + } +} + +func TestRecorderForwardOnlyActionsDoNotMutateTaskState(t *testing.T) { + for _, tt := range []struct { + name string + path string + body string + initial string + }{ + {name: "finish", path: "/recorder/robot-001/finish", body: `{"task_id":"task-forward-only"}`, initial: "in_progress"}, + {name: "pause", path: "/recorder/robot-001/pause", body: `{}`, initial: "in_progress"}, + {name: "resume", path: "/recorder/robot-001/resume", body: `{}`, initial: "in_progress"}, + {name: "quit", path: "/recorder/robot-001/quit", body: `{}`, initial: "ready"}, + } { + t.Run(tt.name, func(t *testing.T) { + db := newTaskStateRecoveryDB(t) + defer db.Close() + seedTaskStateRecoveryTask(t, db, "task-forward-only", tt.initial) + + hub := services.NewRecorderHub() + attachRecorderRPCResponder(t, hub, "robot-001", func(req services.RPCRequest) services.RPCResponse { + return services.RPCResponse{Success: true} + }) + handler := NewRecorderHandler(hub, &config.RecorderConfig{ResponseTimeout: 1}, db) + router := newRecorderInteractionRouter(handler) + + w := recorderInteractionPost(t, router, tt.path, tt.body) + + if w.Code != http.StatusOK { + t.Fatalf("status=%d want=%d body=%s", w.Code, http.StatusOK, w.Body.String()) + } + assertTaskStateRecoveryStatus(t, db, "task-forward-only", tt.initial) + }) + } +} + +func TestRecorderCancelAndClearTaskStateSideEffects(t *testing.T) { + t.Run("cancel success reverts ready and in_progress", func(t *testing.T) { + for _, initial := range []string{"ready", "in_progress"} { + t.Run(initial, func(t *testing.T) { + db := newTaskStateRecoveryDB(t) + defer db.Close() + seedTaskStateRecoveryTask(t, db, "task-cancel", initial) + + hub := services.NewRecorderHub() + attachRecorderRPCResponder(t, hub, "robot-001", func(req services.RPCRequest) services.RPCResponse { + return services.RPCResponse{Success: true} + }) + handler := NewRecorderHandler(hub, &config.RecorderConfig{ResponseTimeout: 1}, db) + router := newRecorderInteractionRouter(handler) + + w := recorderInteractionPost(t, router, "/recorder/robot-001/cancel", `{"task_id":"task-cancel"}`) + + if w.Code != http.StatusOK { + t.Fatalf("status=%d want=%d body=%s", w.Code, http.StatusOK, w.Body.String()) + } + assertTaskStateRecoveryStatus(t, db, "task-cancel", "pending") + }) + } + }) + + t.Run("cancel failure keeps in_progress", func(t *testing.T) { + db := newTaskStateRecoveryDB(t) + defer db.Close() + seedTaskStateRecoveryTask(t, db, "task-cancel-false", "in_progress") + + hub := services.NewRecorderHub() + attachRecorderRPCResponder(t, hub, "robot-001", func(req services.RPCRequest) services.RPCResponse { + return services.RPCResponse{Success: false, Message: "device rejected cancel"} + }) + handler := NewRecorderHandler(hub, &config.RecorderConfig{ResponseTimeout: 1}, db) + router := newRecorderInteractionRouter(handler) + + w := recorderInteractionPost(t, router, "/recorder/robot-001/cancel", `{"task_id":"task-cancel-false"}`) + + if w.Code != http.StatusOK { + t.Fatalf("status=%d want=%d body=%s", w.Code, http.StatusOK, w.Body.String()) + } + assertTaskStateRecoveryStatus(t, db, "task-cancel-false", "in_progress") + }) + + t.Run("clear success reverts only ready and strips task payload", func(t *testing.T) { + db := newTaskStateRecoveryDB(t) + defer db.Close() + seedTaskStateRecoveryTask(t, db, "task-clear-ready", "ready") + seedTaskStateRecoveryTask(t, db, "task-clear-progress", "in_progress") + + hub := services.NewRecorderHub() + rpcRequests := make(chan services.RPCRequest, 1) + attachRecorderRPCResponder(t, hub, "robot-001", func(req services.RPCRequest) services.RPCResponse { + rpcRequests <- req + return services.RPCResponse{Success: true} + }) + handler := NewRecorderHandler(hub, &config.RecorderConfig{ResponseTimeout: 1}, db) + router := newRecorderInteractionRouter(handler) + + w := recorderInteractionPost(t, router, "/recorder/robot-001/clear", `{"task_id":"task-clear-ready"}`) + + if w.Code != http.StatusOK { + t.Fatalf("status=%d want=%d body=%s", w.Code, http.StatusOK, w.Body.String()) + } + req := receiveRecorderRPCRequest(t, rpcRequests, "clear") + if len(req.Params) != 0 { + t.Fatalf("clear RPC params=%#v want empty", req.Params) + } + assertTaskStateRecoveryStatus(t, db, "task-clear-ready", "pending") + assertTaskStateRecoveryStatus(t, db, "task-clear-progress", "in_progress") + }) +} + +func TestRecorderCancelTimeoutAndDisconnectedKeepTaskState(t *testing.T) { + t.Run("timeout", func(t *testing.T) { + db := newTaskStateRecoveryDB(t) + defer db.Close() + seedTaskStateRecoveryTask(t, db, "task-cancel-timeout", "in_progress") + + hub := services.NewRecorderHub() + _, requests := attachRecorderRPCObserverWithConn(t, hub, "robot-001") + handler := NewRecorderHandler(hub, &config.RecorderConfig{ResponseTimeout: 1}, db) + router := newRecorderInteractionRouter(handler) + + w := recorderInteractionPost(t, router, "/recorder/robot-001/cancel", `{"task_id":"task-cancel-timeout"}`) + + if w.Code != http.StatusGatewayTimeout { + t.Fatalf("status=%d want=%d body=%s", w.Code, http.StatusGatewayTimeout, w.Body.String()) + } + receiveRecorderRPCRequest(t, requests, "cancel") + assertTaskStateRecoveryStatus(t, db, "task-cancel-timeout", "in_progress") + }) + + t.Run("disconnected", func(t *testing.T) { + db := newTaskStateRecoveryDB(t) + defer db.Close() + seedTaskStateRecoveryTask(t, db, "task-cancel-disconnected", "ready") + + hub := services.NewRecorderHub() + handler := NewRecorderHandler(hub, &config.RecorderConfig{ResponseTimeout: 1}, db) + router := newRecorderInteractionRouter(handler) + + w := recorderInteractionPost(t, router, "/recorder/robot-001/cancel", `{"task_id":"task-cancel-disconnected"}`) + + if w.Code != http.StatusNotFound { + t.Fatalf("status=%d want=%d body=%s", w.Code, http.StatusNotFound, w.Body.String()) + } + assertTaskStateRecoveryStatus(t, db, "task-cancel-disconnected", "ready") + }) +} + +func TestRecorderClearFailureTimeoutAndDisconnectedKeepTaskState(t *testing.T) { + t.Run("success false", func(t *testing.T) { + db := newTaskStateRecoveryDB(t) + defer db.Close() + seedTaskStateRecoveryTask(t, db, "task-clear-false", "ready") + + hub := services.NewRecorderHub() + attachRecorderRPCResponder(t, hub, "robot-001", func(req services.RPCRequest) services.RPCResponse { + return services.RPCResponse{Success: false, Message: "device rejected clear"} + }) + handler := NewRecorderHandler(hub, &config.RecorderConfig{ResponseTimeout: 1}, db) + router := newRecorderInteractionRouter(handler) + + w := recorderInteractionPost(t, router, "/recorder/robot-001/clear", `{"task_id":"task-clear-false"}`) + + if w.Code != http.StatusOK { + t.Fatalf("status=%d want=%d body=%s", w.Code, http.StatusOK, w.Body.String()) + } + assertTaskStateRecoveryStatus(t, db, "task-clear-false", "ready") + }) + + t.Run("timeout", func(t *testing.T) { + db := newTaskStateRecoveryDB(t) + defer db.Close() + seedTaskStateRecoveryTask(t, db, "task-clear-timeout", "ready") + + hub := services.NewRecorderHub() + _, requests := attachRecorderRPCObserverWithConn(t, hub, "robot-001") + handler := NewRecorderHandler(hub, &config.RecorderConfig{ResponseTimeout: 1}, db) + router := newRecorderInteractionRouter(handler) + + w := recorderInteractionPost(t, router, "/recorder/robot-001/clear", `{"task_id":"task-clear-timeout"}`) + + if w.Code != http.StatusGatewayTimeout { + t.Fatalf("status=%d want=%d body=%s", w.Code, http.StatusGatewayTimeout, w.Body.String()) + } + receiveRecorderRPCRequest(t, requests, "clear") + assertTaskStateRecoveryStatus(t, db, "task-clear-timeout", "ready") + }) + + t.Run("disconnected", func(t *testing.T) { + db := newTaskStateRecoveryDB(t) + defer db.Close() + seedTaskStateRecoveryTask(t, db, "task-clear-disconnected", "ready") + + hub := services.NewRecorderHub() + handler := NewRecorderHandler(hub, &config.RecorderConfig{ResponseTimeout: 1}, db) + router := newRecorderInteractionRouter(handler) + + w := recorderInteractionPost(t, router, "/recorder/robot-001/clear", `{"task_id":"task-clear-disconnected"}`) + + if w.Code != http.StatusNotFound { + t.Fatalf("status=%d want=%d body=%s", w.Code, http.StatusNotFound, w.Body.String()) + } + assertTaskStateRecoveryStatus(t, db, "task-clear-disconnected", "ready") + }) +} + +func TestRecorderGetStatsContracts(t *testing.T) { + t.Run("disconnected returns connected false", func(t *testing.T) { + hub := services.NewRecorderHub() + handler := NewRecorderHandler(hub, &config.RecorderConfig{ResponseTimeout: 1}, nil) + router := newRecorderInteractionRouter(handler) + + w := httptest.NewRecorder() + router.ServeHTTP(w, httptest.NewRequest(http.MethodGet, "/recorder/robot-001/stats", nil)) + + if w.Code != http.StatusOK { + t.Fatalf("status=%d want=%d body=%s", w.Code, http.StatusOK, w.Body.String()) + } + var body map[string]interface{} + if err := json.Unmarshal(w.Body.Bytes(), &body); err != nil { + t.Fatalf("decode stats response: %v", err) + } + if body["connected"] != false { + t.Fatalf("connected=%v want=false body=%v", body["connected"], body) + } + }) + + t.Run("axon failure returns connected true with error", func(t *testing.T) { + hub := services.NewRecorderHub() + attachRecorderRPCResponder(t, hub, "robot-001", func(req services.RPCRequest) services.RPCResponse { + return services.RPCResponse{Success: false, Message: "stats unavailable", Data: map[string]interface{}{"written": float64(3)}} + }) + handler := NewRecorderHandler(hub, &config.RecorderConfig{ResponseTimeout: 1}, nil) + router := newRecorderInteractionRouter(handler) + + w := httptest.NewRecorder() + router.ServeHTTP(w, httptest.NewRequest(http.MethodGet, "/recorder/robot-001/stats", nil)) + + if w.Code != http.StatusOK { + t.Fatalf("status=%d want=%d body=%s", w.Code, http.StatusOK, w.Body.String()) + } + var body map[string]interface{} + if err := json.Unmarshal(w.Body.Bytes(), &body); err != nil { + t.Fatalf("decode stats response: %v", err) + } + if body["connected"] != true || body["error"] != "stats unavailable" { + t.Fatalf("unexpected stats body=%v", body) + } + }) + + t.Run("timeout returns gateway timeout", func(t *testing.T) { + hub := services.NewRecorderHub() + _, requests := attachRecorderRPCObserverWithConn(t, hub, "robot-001") + handler := NewRecorderHandler(hub, &config.RecorderConfig{ResponseTimeout: 1}, nil) + router := newRecorderInteractionRouter(handler) + + w := httptest.NewRecorder() + router.ServeHTTP(w, httptest.NewRequest(http.MethodGet, "/recorder/robot-001/stats", nil)) + + if w.Code != http.StatusGatewayTimeout { + t.Fatalf("status=%d want=%d body=%s", w.Code, http.StatusGatewayTimeout, w.Body.String()) + } + receiveRecorderRPCRequest(t, requests, "get_stats") + }) +} + +func TestRecorderDisconnectRevertsOnlyRunnableTasksForDevice(t *testing.T) { + db := newRecorderInteractionDB(t) + seedRecorderInteractionDevice(t, db, "robot-001", 1, 101) + seedRecorderInteractionDevice(t, db, "robot-002", 2, 102) + seedRecorderInteractionTask(t, db, 101, "task-ready", "ready", false) + seedRecorderInteractionTask(t, db, 101, "task-progress", "in_progress", false) + seedRecorderInteractionTask(t, db, 101, "task-pending", "pending", false) + seedRecorderInteractionTask(t, db, 101, "task-completed", "completed", false) + seedRecorderInteractionTask(t, db, 101, "task-deleted", "ready", true) + seedRecorderInteractionTask(t, db, 102, "task-other-device", "ready", false) + + revertRunnableTasksOnDeviceDisconnect(db, "robot-001", nil, 0, false) + + assertRecorderInteractionTaskStatus(t, db, "task-ready", "pending") + assertRecorderInteractionTaskStatus(t, db, "task-progress", "pending") + assertRecorderInteractionTaskStatus(t, db, "task-pending", "pending") + assertRecorderInteractionTaskStatus(t, db, "task-completed", "completed") + assertRecorderInteractionTaskStatus(t, db, "task-deleted", "ready") + assertRecorderInteractionTaskStatus(t, db, "task-other-device", "ready") + assertRecorderInteractionTaskRuntimeCleared(t, db, "task-ready") + assertRecorderInteractionTaskRuntimeCleared(t, db, "task-progress") +} + +func TestOldRecorderDisconnectAfterReplacementDoesNotRevertTasks(t *testing.T) { + db := newRecorderInteractionDB(t) + seedRecorderInteractionDevice(t, db, "robot-001", 1, 101) + seedRecorderInteractionTask(t, db, 101, "task-current-ready", "ready", false) + + hub := services.NewRecorderHub() + oldConn := hub.NewRecorderConn(nil, "robot-001", "127.0.0.1") + if !hub.Connect("robot-001", oldConn) { + t.Fatalf("connect old recorder failed") + } + newConn := hub.NewRecorderConn(nil, "robot-001", "127.0.0.1") + hub.ConnectReplacingExisting("robot-001", newConn) + if hub.Disconnect("robot-001", oldConn) { + revertRunnableTasksOnDeviceDisconnect(db, "robot-001", nil, 0, false) + } + + assertRecorderInteractionTaskStatus(t, db, "task-current-ready", "ready") +} + +func newRecorderInteractionRouter(handler *RecorderHandler) *gin.Engine { + gin.SetMode(gin.TestMode) + router := gin.New() + router.GET("/recorder/devices", handler.ListDevices) + router.GET("/recorder/:device_id/state", handler.GetState) + router.GET("/recorder/:device_id/stats", handler.GetStats) + router.POST("/recorder/:device_id/config", handler.Config) + router.POST("/recorder/:device_id/begin", handler.Begin) + router.POST("/recorder/:device_id/finish", handler.Finish) + router.POST("/recorder/:device_id/pause", handler.Pause) + router.POST("/recorder/:device_id/resume", handler.Resume) + router.POST("/recorder/:device_id/cancel", handler.Cancel) + router.POST("/recorder/:device_id/clear", handler.Clear) + router.POST("/recorder/:device_id/quit", handler.Quit) + return router +} + +func recorderInteractionPost(t *testing.T, router *gin.Engine, path string, body string) *httptest.ResponseRecorder { + t.Helper() + w := httptest.NewRecorder() + req := httptest.NewRequest(http.MethodPost, path, bytes.NewReader([]byte(body))) + req.Header.Set("Content-Type", "application/json") + router.ServeHTTP(w, req) + return w +} + +func attachRecorderRPCObserverWithConn(t *testing.T, hub *services.RecorderHub, deviceID string) (*services.RecorderConn, <-chan services.RPCRequest) { + t.Helper() + serverConn, clientConn := newRecorderHandlerTestWebSocketPair(t) + rc := hub.NewRecorderConn(serverConn, deviceID, "127.0.0.1") + if !hub.Connect(deviceID, rc) { + t.Fatalf("connect recorder: initial connect failed") + } + + requests := make(chan services.RPCRequest, 8) + ctx, cancel := context.WithCancel(context.Background()) + t.Cleanup(cancel) + go func() { + for { + var req services.RPCRequest + if err := wsjson.Read(ctx, clientConn, &req); err != nil { + return + } + requests <- req + } + }() + return rc, requests +} + +func receiveRecorderRPCRequest(t *testing.T, requests <-chan services.RPCRequest, wantAction string) services.RPCRequest { + t.Helper() + select { + case req := <-requests: + if wantAction != "" && req.Action != wantAction { + t.Fatalf("RPC action=%q want=%q req=%#v", req.Action, wantAction, req) + } + return req + case <-time.After(2 * time.Second): + t.Fatalf("timed out waiting for recorder RPC action %q", wantAction) + } + return services.RPCRequest{} +} + +func newRecorderInteractionDB(t *testing.T) *sqlx.DB { + t.Helper() + name := strings.NewReplacer("/", "_", " ", "_", "-", "_").Replace(t.Name()) + db, err := sqlx.Open("sqlite", "file:"+name+"?mode=memory&cache=shared") + if err != nil { + t.Fatalf("open sqlite db: %v", err) + } + db.SetMaxOpenConns(8) + t.Cleanup(func() { _ = db.Close() }) + if _, err := db.Exec(`CREATE TABLE robots ( + id INTEGER PRIMARY KEY, + device_id TEXT NOT NULL, + deleted_at TIMESTAMP NULL + )`); err != nil { + t.Fatalf("create robots schema: %v", err) + } + if _, err := db.Exec(`CREATE TABLE workstations ( + id INTEGER PRIMARY KEY, + robot_id INTEGER NOT NULL, + deleted_at TIMESTAMP NULL + )`); err != nil { + t.Fatalf("create workstations schema: %v", err) + } + if _, err := db.Exec(`CREATE TABLE tasks ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + task_id TEXT NOT NULL, + batch_id INTEGER NOT NULL DEFAULT 0, + workstation_id INTEGER NOT NULL, + status TEXT NOT NULL, + ready_at TIMESTAMP NULL, + started_at TIMESTAMP NULL, + completed_at TIMESTAMP NULL, + error_message TEXT NULL, + created_at TIMESTAMP NOT NULL, + updated_at TIMESTAMP NOT NULL, + deleted_at TIMESTAMP NULL + )`); err != nil { + t.Fatalf("create tasks schema: %v", err) + } + return db +} + +func seedRecorderInteractionDevice(t *testing.T, db *sqlx.DB, deviceID string, robotID int64, workstationID int64) { + t.Helper() + if _, err := db.Exec(`INSERT INTO robots (id, device_id) VALUES (?, ?)`, robotID, deviceID); err != nil { + t.Fatalf("seed robot: %v", err) + } + if _, err := db.Exec(`INSERT INTO workstations (id, robot_id) VALUES (?, ?)`, workstationID, robotID); err != nil { + t.Fatalf("seed workstation: %v", err) + } +} + +func seedRecorderInteractionTask(t *testing.T, db *sqlx.DB, workstationID int64, taskID string, status string, deleted bool) { + t.Helper() + now := time.Now().UTC() + var deletedAt interface{} + if deleted { + deletedAt = now + } + if _, err := db.Exec( + `INSERT INTO tasks (task_id, batch_id, workstation_id, status, ready_at, started_at, completed_at, error_message, created_at, updated_at, deleted_at) + VALUES (?, 1, ?, ?, ?, ?, ?, ?, ?, ?, ?)`, + taskID, + workstationID, + status, + now, + now, + now, + "previous error", + now, + now, + deletedAt, + ); err != nil { + t.Fatalf("seed task %s: %v", taskID, err) + } +} + +func assertRecorderInteractionTaskStatus(t *testing.T, db *sqlx.DB, taskID string, want string) { + t.Helper() + var got string + if err := db.Get(&got, `SELECT status FROM tasks WHERE task_id = ?`, taskID); err != nil { + t.Fatalf("query task status: %v", err) + } + if got != want { + t.Fatalf("task %s status=%q want=%q", taskID, got, want) + } +} + +func assertRecorderInteractionTaskRuntimeCleared(t *testing.T, db *sqlx.DB, taskID string) { + t.Helper() + var count int + if err := db.Get(&count, `SELECT CASE WHEN ready_at IS NULL AND started_at IS NULL AND completed_at IS NULL AND error_message IS NULL THEN 1 ELSE 0 END FROM tasks WHERE task_id = ?`, taskID); err != nil { + t.Fatalf("query runtime columns for %s: %v", taskID, err) + } + if count != 1 { + t.Fatalf("runtime columns for task %s were not cleared", taskID) + } +} + +func TestRecorderIgnoresRPCResponseFromReplacedConnection(t *testing.T) { + hub := services.NewRecorderHub() + handler := NewRecorderHandler(hub, &config.RecorderConfig{}, nil) + oldConn := hub.NewRecorderConn(nil, "robot-001", "127.0.0.1") + if !hub.Connect("robot-001", oldConn) { + t.Fatalf("connect old recorder failed") + } + + serverConn, clientConn := newRecorderHandlerTestWebSocketPair(t) + newConn := hub.NewRecorderConn(serverConn, "robot-001", "127.0.0.1") + hub.ConnectReplacingExisting("robot-001", newConn) + + type rpcResult struct { + resp *services.RPCResponse + err error + } + ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second) + defer cancel() + resultC := make(chan rpcResult, 1) + go func() { + resp, err := hub.SendRPC(ctx, "robot-001", "begin", nil, time.Second) + resultC <- rpcResult{resp: resp, err: err} + }() + + var req services.RPCRequest + if err := wsjson.Read(ctx, clientConn, &req); err != nil { + t.Fatalf("read current recorder request: %v", err) + } + handler.handleMessage("robot-001", oldConn, map[string]interface{}{ + "type": "rpc_response", + "request_id": req.RequestID, + "success": true, + }) + + select { + case result := <-resultC: + t.Fatalf("old connection rpc_response completed current RPC: resp=%+v err=%v", result.resp, result.err) + case <-time.After(150 * time.Millisecond): + } + + if !hub.HandleRPCResponse("robot-001", &services.RPCResponse{RequestID: req.RequestID, Success: false, Message: "current response"}) { + t.Fatalf("current rpc_response was not matched") + } + select { + case result := <-resultC: + if result.err != nil { + t.Fatalf("current response returned error: %v", result.err) + } + if result.resp == nil || result.resp.Success { + t.Fatalf("current response=%+v want success=false", result.resp) + } + case <-time.After(time.Second): + t.Fatalf("timed out waiting for current rpc_response") + } +} + +func TestRecorderUnmatchedRPCResponseDoesNotUnblockPendingRequest(t *testing.T) { + hub := services.NewRecorderHub() + serverConn, clientConn := newRecorderHandlerTestWebSocketPair(t) + rc := hub.NewRecorderConn(serverConn, "robot-001", "127.0.0.1") + if !hub.Connect("robot-001", rc) { + t.Fatalf("connect recorder failed") + } + + type rpcResult struct { + resp *services.RPCResponse + err error + } + ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second) + defer cancel() + resultC := make(chan rpcResult, 1) + go func() { + resp, err := hub.SendRPC(ctx, "robot-001", "begin", nil, time.Second) + resultC <- rpcResult{resp: resp, err: err} + }() + + var req services.RPCRequest + if err := wsjson.Read(ctx, clientConn, &req); err != nil { + t.Fatalf("read recorder request: %v", err) + } + if hub.HandleRPCResponse("robot-001", &services.RPCResponse{RequestID: "bogus", Success: true}) { + t.Fatalf("unmatched rpc_response returned matched=true") + } + select { + case result := <-resultC: + t.Fatalf("unmatched rpc_response completed pending RPC: resp=%+v err=%v", result.resp, result.err) + case <-time.After(150 * time.Millisecond): + } + + if !hub.HandleRPCResponse("robot-001", &services.RPCResponse{RequestID: req.RequestID, Success: true}) { + t.Fatalf("matched rpc_response returned false") + } + select { + case result := <-resultC: + if result.err != nil || result.resp == nil || !result.resp.Success { + t.Fatalf("matched response result resp=%+v err=%v", result.resp, result.err) + } + case <-time.After(time.Second): + t.Fatalf("timed out waiting for matched rpc_response") + } +} + +func TestRecorderWebSocketEmptyStateUpdateDoesNotMarkSynced(t *testing.T) { + db := newRecorderInteractionDB(t) + seedRecorderInteractionDevice(t, db, "robot-001", 1, 101) + + hub := services.NewRecorderHub() + handler := NewRecorderHandler(hub, &config.RecorderConfig{ResponseTimeout: 1}, db) + wsURL := newRecorderWebSocketTestServer(t, handler, "robot-001") + axon := connectFakeRecorderAxon(t, wsURL) + axon.receiveRPC(t, "get_state") + + axon.sendStateUpdate(t, "", "") + waitForRecorderCachedRawState(t, hub, "robot-001", false, "", "") + + router := newRecorderInteractionRouter(handler) + w := httptest.NewRecorder() + router.ServeHTTP(w, httptest.NewRequest(http.MethodGet, "/recorder/robot-001/state", nil)) + if w.Code != http.StatusOK { + t.Fatalf("status=%d want=%d body=%s", w.Code, http.StatusOK, w.Body.String()) + } + var body map[string]interface{} + if err := json.Unmarshal(w.Body.Bytes(), &body); err != nil { + t.Fatalf("decode state response: %v", err) + } + if body["state_synced"] != false || body["syncing"] != true { + t.Fatalf("empty state_update should keep syncing; body=%v", body) + } +} + +func TestRecorderWebSocketReplacementIgnoresOldConnectionLateMessages(t *testing.T) { + db := newRecorderInteractionDB(t) + seedRecorderInteractionDevice(t, db, "robot-001", 1, 101) + seedRecorderInteractionTask(t, db, 101, "task-old-late", "pending", false) + seedRecorderInteractionTask(t, db, 101, "task-current-config", "pending", false) + + hub := services.NewRecorderHub() + handler := NewRecorderHandler(hub, &config.RecorderConfig{ResponseTimeout: 1}, db) + wsURL := newRecorderWebSocketTestServer(t, handler, "robot-001") + oldAxon := connectFakeRecorderAxon(t, wsURL) + oldAxon.receiveRPC(t, "get_state") + oldConn := hub.Get("robot-001") + if oldConn == nil { + t.Fatalf("old recorder connection was not registered") + } + + newAxon := connectFakeRecorderAxon(t, wsURL) + getState := newAxon.receiveRPC(t, "get_state") + newAxon.respondRPC(t, getState, true, "", map[string]interface{}{"state": "idle"}) + waitForRecorderCachedState(t, hub, "robot-001", true, "idle", "") + + handler.handleMessage("robot-001", oldConn, map[string]interface{}{ + "type": "state_update", + "data": map[string]interface{}{"current": "ready", "task_id": "task-old-late"}, + }) + handler.handleMessage("robot-001", oldConn, map[string]interface{}{ + "type": "config_applied", + "data": map[string]interface{}{"task_id": "task-old-late"}, + }) + assertRecorderInteractionTaskStatus(t, db, "task-old-late", "pending") + + router := newRecorderInteractionRouter(handler) + resultC := recorderInteractionPostAsync(router, "/recorder/robot-001/config", `{"task_config":{"task_id":"task-current-config"}}`) + configReq := newAxon.receiveRPC(t, "config") + handler.handleMessage("robot-001", oldConn, map[string]interface{}{ + "type": "rpc_response", + "request_id": configReq.RequestID, + "success": true, + }) + select { + case w := <-resultC: + t.Fatalf("old rpc_response completed current config request: status=%d body=%s", w.Code, w.Body.String()) + case <-time.After(150 * time.Millisecond): + } + newAxon.respondRPC(t, configReq, true, "", nil) + w := receiveRecorderHTTPResponse(t, resultC) + if w.Code != http.StatusOK { + t.Fatalf("status=%d want=%d body=%s", w.Code, http.StatusOK, w.Body.String()) + } + assertRecorderInteractionTaskStatus(t, db, "task-current-config", "ready") +} + +func TestRecorderWebSocketGetStateTimeoutLeavesUnsyncedAndBlocksConfig(t *testing.T) { + db := newRecorderInteractionDB(t) + seedRecorderInteractionDevice(t, db, "robot-001", 1, 101) + seedRecorderInteractionTask(t, db, 101, "task-timeout-blocked", "pending", false) + + hub := services.NewRecorderHub() + handler := NewRecorderHandler(hub, &config.RecorderConfig{ResponseTimeout: 1}, db) + wsURL := newRecorderWebSocketTestServer(t, handler, "robot-001") + axon := connectFakeRecorderAxon(t, wsURL) + axon.receiveRPC(t, "get_state") + + waitForRecorderConnectionSyncedFlagAfter(t, hub, "robot-001", false, 1200*time.Millisecond) + + router := newRecorderInteractionRouter(handler) + stateW := httptest.NewRecorder() + router.ServeHTTP(stateW, httptest.NewRequest(http.MethodGet, "/recorder/robot-001/state", nil)) + if stateW.Code != http.StatusOK { + t.Fatalf("state status=%d want=%d body=%s", stateW.Code, http.StatusOK, stateW.Body.String()) + } + var stateBody map[string]interface{} + if err := json.Unmarshal(stateW.Body.Bytes(), &stateBody); err != nil { + t.Fatalf("decode state response: %v", err) + } + if stateBody["state_synced"] != false || stateBody["syncing"] != true { + t.Fatalf("timed-out get_state should keep syncing; body=%v", stateBody) + } + + configW := recorderInteractionPost(t, router, "/recorder/robot-001/config", `{"task_config":{"task_id":"task-timeout-blocked"}}`) + if configW.Code != http.StatusConflict { + t.Fatalf("config status=%d want=%d body=%s", configW.Code, http.StatusConflict, configW.Body.String()) + } + axon.assertNoRPC(t, 150*time.Millisecond) + assertRecorderInteractionTaskStatus(t, db, "task-timeout-blocked", "pending") + + axon.sendStateUpdate(t, "idle", "") + waitForRecorderCachedState(t, hub, "robot-001", true, "idle", "") + resultC := recorderInteractionPostAsync(router, "/recorder/robot-001/config", `{"task_config":{"task_id":"task-timeout-blocked"}}`) + configReq := axon.receiveRPC(t, "config") + axon.respondRPC(t, configReq, true, "", nil) + configW = receiveRecorderHTTPResponse(t, resultC) + if configW.Code != http.StatusOK { + t.Fatalf("config after state_update status=%d want=%d body=%s", configW.Code, http.StatusOK, configW.Body.String()) + } + assertRecorderInteractionTaskStatus(t, db, "task-timeout-blocked", "ready") +} + +func TestRecorderWebSocketGetStateFailureLeavesUnsyncedAndBlocksConfig(t *testing.T) { + db := newRecorderInteractionDB(t) + seedRecorderInteractionDevice(t, db, "robot-001", 1, 101) + seedRecorderInteractionTask(t, db, 101, "task-blocked", "pending", false) + + hub := services.NewRecorderHub() + handler := NewRecorderHandler(hub, &config.RecorderConfig{ResponseTimeout: 1}, db) + wsURL := newRecorderWebSocketTestServer(t, handler, "robot-001") + axon := connectFakeRecorderAxon(t, wsURL) + + req := axon.receiveRPC(t, "get_state") + axon.respondRPC(t, req, false, "state unavailable", nil) + waitForRecorderConnectionSyncedFlag(t, hub, "robot-001", false) + + router := newRecorderInteractionRouter(handler) + w := recorderInteractionPost(t, router, "/recorder/robot-001/config", `{"task_config":{"task_id":"task-blocked"}}`) + if w.Code != http.StatusConflict { + t.Fatalf("status=%d want=%d body=%s", w.Code, http.StatusConflict, w.Body.String()) + } + axon.assertNoRPC(t, 150*time.Millisecond) + assertRecorderInteractionTaskStatus(t, db, "task-blocked", "pending") +} + +func TestRecorderWebSocketRecordingReconnectRestoresSameTaskWithoutDuplicateConfig(t *testing.T) { + db := newRecorderInteractionDB(t) + seedRecorderInteractionDevice(t, db, "robot-001", 1, 101) + seedRecorderInteractionTask(t, db, 101, "task-recording-reconnect", "in_progress", false) + seedRecorderInteractionTask(t, db, 101, "task-next-recording", "pending", false) + + hub := services.NewRecorderHub() + handler := NewRecorderHandler(hub, &config.RecorderConfig{ResponseTimeout: 1}, db) + wsURL := newRecorderWebSocketTestServer(t, handler, "robot-001") + + first := connectFakeRecorderAxon(t, wsURL) + first.receiveRPC(t, "get_state") + first.closeNow() + waitForRecorderHubDisconnected(t, hub, "robot-001") + waitForRecorderInteractionTaskStatus(t, db, "task-recording-reconnect", "pending") + + second := connectFakeRecorderAxon(t, wsURL) + req := second.receiveRPC(t, "get_state") + second.respondRPC(t, req, true, "", map[string]interface{}{"state": "recording", "task_id": "task-recording-reconnect"}) + waitForRecorderInteractionTaskStatus(t, db, "task-recording-reconnect", "in_progress") + waitForRecorderCachedState(t, hub, "robot-001", true, "recording", "task-recording-reconnect") + + router := newRecorderInteractionRouter(handler) + w := recorderInteractionPost(t, router, "/recorder/robot-001/config", `{"task_config":{"task_id":"task-next-recording"}}`) + if w.Code != http.StatusConflict { + t.Fatalf("status after recording reconnect=%d want=%d body=%s", w.Code, http.StatusConflict, w.Body.String()) + } + second.assertNoRPC(t, 150*time.Millisecond) + assertRecorderInteractionTaskStatus(t, db, "task-next-recording", "pending") +} + +func TestRecorderWebSocketRPCActionProtocol(t *testing.T) { + db := newRecorderInteractionDB(t) + seedRecorderInteractionDevice(t, db, "robot-001", 1, 101) + seedRecorderInteractionTask(t, db, 101, "task-protocol", "pending", false) + + hub := services.NewRecorderHub() + handler := NewRecorderHandler(hub, &config.RecorderConfig{ResponseTimeout: 1}, db) + wsURL := newRecorderWebSocketTestServer(t, handler, "robot-001") + axon := connectFakeRecorderAxon(t, wsURL) + getState := axon.receiveRPC(t, "get_state") + axon.respondRPC(t, getState, true, "", map[string]interface{}{"state": "idle"}) + waitForRecorderCachedState(t, hub, "robot-001", true, "idle", "") + router := newRecorderInteractionRouter(handler) + + cases := []struct { + name string + method string + path string + body string + wantAction string + check func(t *testing.T, req services.RPCRequest) + }{ + { + name: "config forwards task_config payload", + method: http.MethodPost, + path: "/recorder/robot-001/config", + body: `{"task_config":{"task_id":"task-protocol","device_id":"robot-001"}}`, + wantAction: "config", + check: func(t *testing.T, req services.RPCRequest) { + t.Helper() + tc, ok := req.Params["task_config"].(map[string]interface{}) + if !ok || tc["task_id"] != "task-protocol" || tc["device_id"] != "robot-001" { + t.Fatalf("config params=%#v missing task_config", req.Params) + } + }, + }, + { + name: "begin forwards task_id", + method: http.MethodPost, + path: "/recorder/robot-001/begin", + body: `{"task_id":"task-protocol"}`, + wantAction: "begin", + check: func(t *testing.T, req services.RPCRequest) { + t.Helper() + if req.Params["task_id"] != "task-protocol" { + t.Fatalf("begin params=%#v missing task_id", req.Params) + } + }, + }, + { + name: "cancel forwards task_id", + method: http.MethodPost, + path: "/recorder/robot-001/cancel", + body: `{"task_id":"task-protocol"}`, + wantAction: "cancel", + check: func(t *testing.T, req services.RPCRequest) { + t.Helper() + if req.Params["task_id"] != "task-protocol" { + t.Fatalf("cancel params=%#v missing task_id", req.Params) + } + }, + }, + { + name: "clear strips http task_id payload", + method: http.MethodPost, + path: "/recorder/robot-001/clear", + body: `{"task_id":"task-protocol"}`, + wantAction: "clear", + check: func(t *testing.T, req services.RPCRequest) { + t.Helper() + if len(req.Params) != 0 { + t.Fatalf("clear params=%#v want empty", req.Params) + } + }, + }, + {name: "finish forwards task_id", method: http.MethodPost, path: "/recorder/robot-001/finish", body: `{"task_id":"task-protocol"}`, wantAction: "finish"}, + {name: "pause action", method: http.MethodPost, path: "/recorder/robot-001/pause", body: `{}`, wantAction: "pause"}, + {name: "resume action", method: http.MethodPost, path: "/recorder/robot-001/resume", body: `{}`, wantAction: "resume"}, + {name: "quit action", method: http.MethodPost, path: "/recorder/robot-001/quit", body: `{}`, wantAction: "quit"}, + {name: "get_stats action", method: http.MethodGet, path: "/recorder/robot-001/stats", wantAction: "get_stats"}, + } + + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + resultC := recorderInteractionRequestAsync(router, tc.method, tc.path, tc.body) + req := axon.receiveRPC(t, tc.wantAction) + if tc.check != nil { + tc.check(t, req) + } + axon.respondRPC(t, req, true, "", map[string]interface{}{"ok": true}) + w := receiveRecorderHTTPResponse(t, resultC) + if w.Code != http.StatusOK { + t.Fatalf("status=%d want=%d body=%s", w.Code, http.StatusOK, w.Body.String()) + } + }) + } +} + +func TestRecorderWebSocketConnectGetStateIdleMarksStateSynced(t *testing.T) { + db := newRecorderInteractionDB(t) + seedRecorderInteractionDevice(t, db, "robot-001", 1, 101) + + hub := services.NewRecorderHub() + handler := NewRecorderHandler(hub, &config.RecorderConfig{ResponseTimeout: 1}, db) + wsURL := newRecorderWebSocketTestServer(t, handler, "robot-001") + axon := connectFakeRecorderAxon(t, wsURL) + + req := axon.receiveRPC(t, "get_state") + axon.respondRPC(t, req, true, "", map[string]interface{}{"state": "idle"}) + + waitForRecorderCachedState(t, hub, "robot-001", true, "idle", "") + + router := newRecorderInteractionRouter(handler) + w := httptest.NewRecorder() + router.ServeHTTP(w, httptest.NewRequest(http.MethodGet, "/recorder/robot-001/state", nil)) + if w.Code != http.StatusOK { + t.Fatalf("status=%d want=%d body=%s", w.Code, http.StatusOK, w.Body.String()) + } + var body map[string]interface{} + if err := json.Unmarshal(w.Body.Bytes(), &body); err != nil { + t.Fatalf("decode state response: %v", err) + } + if body["connected"] != true || body["state_synced"] != true || body["syncing"] != false || body["current_state"] != "idle" { + t.Fatalf("unexpected state response: %v", body) + } +} + +func TestRecorderWebSocketStateUpdateBeforeGetStateResponseIsIdempotent(t *testing.T) { + db := newRecorderInteractionDB(t) + seedRecorderInteractionDevice(t, db, "robot-001", 1, 101) + seedRecorderInteractionTask(t, db, 101, "task-ready-ws", "pending", false) + + hub := services.NewRecorderHub() + handler := NewRecorderHandler(hub, &config.RecorderConfig{ResponseTimeout: 1}, db) + wsURL := newRecorderWebSocketTestServer(t, handler, "robot-001") + axon := connectFakeRecorderAxon(t, wsURL) + + axon.sendStateUpdate(t, "ready", "task-ready-ws") + waitForRecorderInteractionTaskStatus(t, db, "task-ready-ws", "ready") + readyAt := recorderInteractionTaskTimestamp(t, db, "task-ready-ws", "ready_at") + if readyAt == "" { + t.Fatalf("ready_at was not set after state_update") + } + + req := axon.receiveRPC(t, "get_state") + axon.respondRPC(t, req, true, "", map[string]interface{}{"state": "ready", "task_id": "task-ready-ws"}) + + waitForRecorderCachedState(t, hub, "robot-001", true, "ready", "task-ready-ws") + assertRecorderInteractionTaskStatus(t, db, "task-ready-ws", "ready") + if got := recorderInteractionTaskTimestamp(t, db, "task-ready-ws", "ready_at"); got != readyAt { + t.Fatalf("ready_at changed after duplicate get_state: got=%q want=%q", got, readyAt) + } +} + +func TestRecorderWebSocketGetStateReadyRejectsConfigWithoutSendingRPC(t *testing.T) { + db := newRecorderInteractionDB(t) + seedRecorderInteractionDevice(t, db, "robot-001", 1, 101) + seedRecorderInteractionTask(t, db, 101, "task-current", "pending", false) + seedRecorderInteractionTask(t, db, 101, "task-next", "pending", false) + + hub := services.NewRecorderHub() + handler := NewRecorderHandler(hub, &config.RecorderConfig{ResponseTimeout: 1}, db) + wsURL := newRecorderWebSocketTestServer(t, handler, "robot-001") + axon := connectFakeRecorderAxon(t, wsURL) + + req := axon.receiveRPC(t, "get_state") + axon.respondRPC(t, req, true, "", map[string]interface{}{"state": "ready", "task_id": "task-current"}) + waitForRecorderInteractionTaskStatus(t, db, "task-current", "ready") + waitForRecorderCachedState(t, hub, "robot-001", true, "ready", "task-current") + + router := newRecorderInteractionRouter(handler) + w := recorderInteractionPost(t, router, "/recorder/robot-001/config", `{"task_config":{"task_id":"task-next"}}`) + if w.Code != http.StatusConflict { + t.Fatalf("status=%d want=%d body=%s", w.Code, http.StatusConflict, w.Body.String()) + } + axon.assertNoRPC(t, 150*time.Millisecond) + assertRecorderInteractionTaskStatus(t, db, "task-next", "pending") +} + +func TestRecorderWebSocketUnsyncedConfigRejectedUntilValidStateUpdate(t *testing.T) { + db := newRecorderInteractionDB(t) + seedRecorderInteractionDevice(t, db, "robot-001", 1, 101) + seedRecorderInteractionTask(t, db, 101, "task-after-sync", "pending", false) + + hub := services.NewRecorderHub() + handler := NewRecorderHandler(hub, &config.RecorderConfig{ResponseTimeout: 1}, db) + wsURL := newRecorderWebSocketTestServer(t, handler, "robot-001") + axon := connectFakeRecorderAxon(t, wsURL) + + axon.receiveRPC(t, "get_state") + router := newRecorderInteractionRouter(handler) + w := recorderInteractionPost(t, router, "/recorder/robot-001/config", `{"task_config":{"task_id":"task-after-sync"}}`) + if w.Code != http.StatusConflict { + t.Fatalf("status=%d want=%d body=%s", w.Code, http.StatusConflict, w.Body.String()) + } + axon.assertNoRPC(t, 150*time.Millisecond) + assertRecorderInteractionTaskStatus(t, db, "task-after-sync", "pending") + + axon.sendStateUpdate(t, "idle", "") + waitForRecorderCachedState(t, hub, "robot-001", true, "idle", "") + + resultC := recorderInteractionPostAsync(router, "/recorder/robot-001/config", `{"task_config":{"task_id":"task-after-sync"}}`) + configReq := axon.receiveRPC(t, "config") + axon.respondRPC(t, configReq, true, "", nil) + w = receiveRecorderHTTPResponse(t, resultC) + if w.Code != http.StatusOK { + t.Fatalf("status=%d want=%d body=%s", w.Code, http.StatusOK, w.Body.String()) + } + assertRecorderInteractionTaskStatus(t, db, "task-after-sync", "ready") + axon.assertNoRPC(t, 150*time.Millisecond) +} + +func TestRecorderWebSocketReadyReconnectRestoresSameTaskWithoutDuplicateConfig(t *testing.T) { + db := newRecorderInteractionDB(t) + seedRecorderInteractionDevice(t, db, "robot-001", 1, 101) + seedRecorderInteractionTask(t, db, 101, "task-ready-reconnect", "ready", false) + seedRecorderInteractionTask(t, db, 101, "task-next-reconnect", "pending", false) + + hub := services.NewRecorderHub() + handler := NewRecorderHandler(hub, &config.RecorderConfig{ResponseTimeout: 1}, db) + wsURL := newRecorderWebSocketTestServer(t, handler, "robot-001") + + first := connectFakeRecorderAxon(t, wsURL) + first.receiveRPC(t, "get_state") + first.closeNow() + waitForRecorderHubDisconnected(t, hub, "robot-001") + waitForRecorderInteractionTaskStatus(t, db, "task-ready-reconnect", "pending") + + router := newRecorderInteractionRouter(handler) + w := recorderInteractionPost(t, router, "/recorder/robot-001/config", `{"task_config":{"task_id":"task-ready-reconnect"}}`) + if w.Code != http.StatusNotFound { + t.Fatalf("status during disconnect=%d want=%d body=%s", w.Code, http.StatusNotFound, w.Body.String()) + } + + second := connectFakeRecorderAxon(t, wsURL) + req := second.receiveRPC(t, "get_state") + second.respondRPC(t, req, true, "", map[string]interface{}{"state": "ready", "task_id": "task-ready-reconnect"}) + waitForRecorderInteractionTaskStatus(t, db, "task-ready-reconnect", "ready") + waitForRecorderCachedState(t, hub, "robot-001", true, "ready", "task-ready-reconnect") + + w = recorderInteractionPost(t, router, "/recorder/robot-001/config", `{"task_config":{"task_id":"task-next-reconnect"}}`) + if w.Code != http.StatusConflict { + t.Fatalf("status after ready reconnect=%d want=%d body=%s", w.Code, http.StatusConflict, w.Body.String()) + } + second.assertNoRPC(t, 150*time.Millisecond) + assertRecorderInteractionTaskStatus(t, db, "task-next-reconnect", "pending") +} + +func TestRecorderWebSocketIdleReconnectAllowsSingleConfig(t *testing.T) { + db := newRecorderInteractionDB(t) + seedRecorderInteractionDevice(t, db, "robot-001", 1, 101) + seedRecorderInteractionTask(t, db, 101, "task-idle-reconnect", "ready", false) + + hub := services.NewRecorderHub() + handler := NewRecorderHandler(hub, &config.RecorderConfig{ResponseTimeout: 1}, db) + wsURL := newRecorderWebSocketTestServer(t, handler, "robot-001") + + first := connectFakeRecorderAxon(t, wsURL) + first.receiveRPC(t, "get_state") + first.closeNow() + waitForRecorderHubDisconnected(t, hub, "robot-001") + waitForRecorderInteractionTaskStatus(t, db, "task-idle-reconnect", "pending") + + second := connectFakeRecorderAxon(t, wsURL) + req := second.receiveRPC(t, "get_state") + second.respondRPC(t, req, true, "", map[string]interface{}{"state": "idle"}) + waitForRecorderCachedState(t, hub, "robot-001", true, "idle", "") + + router := newRecorderInteractionRouter(handler) + resultC := recorderInteractionPostAsync(router, "/recorder/robot-001/config", `{"task_config":{"task_id":"task-idle-reconnect"}}`) + configReq := second.receiveRPC(t, "config") + second.respondRPC(t, configReq, true, "", nil) + w := receiveRecorderHTTPResponse(t, resultC) + if w.Code != http.StatusOK { + t.Fatalf("status=%d want=%d body=%s", w.Code, http.StatusOK, w.Body.String()) + } + assertRecorderInteractionTaskStatus(t, db, "task-idle-reconnect", "ready") + second.assertNoRPC(t, 150*time.Millisecond) +} + +type fakeRecorderAxon struct { + conn *websocket.Conn + requests chan services.RPCRequest + cancel context.CancelFunc +} + +func newRecorderWebSocketTestServer(t *testing.T, handler *RecorderHandler, deviceID string) string { + t.Helper() + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + handler.HandleWebSocket(w, r, deviceID) + })) + t.Cleanup(server.Close) + return "ws" + strings.TrimPrefix(server.URL, "http") +} + +func connectFakeRecorderAxon(t *testing.T, wsURL string) *fakeRecorderAxon { + t.Helper() + conn, _, err := websocket.Dial(context.Background(), wsURL, nil) + if err != nil { + t.Fatalf("dial fake recorder websocket: %v", err) + } + ctx, cancel := context.WithCancel(context.Background()) + axon := &fakeRecorderAxon{ + conn: conn, + requests: make(chan services.RPCRequest, 16), + cancel: cancel, + } + t.Cleanup(func() { + cancel() + _ = conn.CloseNow() + }) + go func() { + for { + var req services.RPCRequest + if err := wsjson.Read(ctx, conn, &req); err != nil { + return + } + axon.requests <- req + } + }() + return axon +} + +func (f *fakeRecorderAxon) receiveRPC(t *testing.T, wantAction string) services.RPCRequest { + t.Helper() + return receiveRecorderRPCRequest(t, f.requests, wantAction) +} + +func (f *fakeRecorderAxon) respondRPC(t *testing.T, req services.RPCRequest, success bool, message string, data map[string]interface{}) { + t.Helper() + ctx, cancel := context.WithTimeout(context.Background(), time.Second) + defer cancel() + resp := services.RPCResponse{ + Type: "rpc_response", + RequestID: req.RequestID, + Success: success, + Message: message, + Data: data, + } + if err := wsjson.Write(ctx, f.conn, resp); err != nil { + t.Fatalf("write fake recorder rpc response: %v", err) + } +} + +func (f *fakeRecorderAxon) sendStateUpdate(t *testing.T, current string, taskID string) { + t.Helper() + ctx, cancel := context.WithTimeout(context.Background(), time.Second) + defer cancel() + data := map[string]interface{}{"current": current} + if taskID != "" { + data["task_id"] = taskID + } + msg := map[string]interface{}{ + "type": "state_update", + "data": data, + } + if err := wsjson.Write(ctx, f.conn, msg); err != nil { + t.Fatalf("write fake recorder state_update: %v", err) + } +} + +func (f *fakeRecorderAxon) assertNoRPC(t *testing.T, duration time.Duration) { + t.Helper() + select { + case req := <-f.requests: + t.Fatalf("unexpected recorder RPC: %#v", req) + case <-time.After(duration): + } +} + +func (f *fakeRecorderAxon) closeNow() { + f.cancel() + _ = f.conn.CloseNow() +} + +func recorderInteractionPostAsync(router *gin.Engine, path string, body string) <-chan *httptest.ResponseRecorder { + return recorderInteractionRequestAsync(router, http.MethodPost, path, body) +} + +func recorderInteractionRequestAsync(router *gin.Engine, method string, path string, body string) <-chan *httptest.ResponseRecorder { + resultC := make(chan *httptest.ResponseRecorder, 1) + go func() { + w := httptest.NewRecorder() + req := httptest.NewRequest(method, path, bytes.NewReader([]byte(body))) + if method != http.MethodGet { + req.Header.Set("Content-Type", "application/json") + } + router.ServeHTTP(w, req) + resultC <- w + }() + return resultC +} + +func receiveRecorderHTTPResponse(t *testing.T, resultC <-chan *httptest.ResponseRecorder) *httptest.ResponseRecorder { + t.Helper() + select { + case w := <-resultC: + return w + case <-time.After(2 * time.Second): + t.Fatalf("timed out waiting for recorder HTTP response") + } + return nil +} + +func waitForRecorderConnectionSyncedFlagAfter(t *testing.T, hub *services.RecorderHub, deviceID string, wantSynced bool, minWait time.Duration) { + t.Helper() + deadline := time.Now().Add(3 * time.Second) + readyAt := time.Now().Add(minWait) + for time.Now().Before(deadline) { + rc := hub.Get(deviceID) + if rc != nil && rc.IsStateSynced() == wantSynced && time.Now().After(readyAt) { + return + } + time.Sleep(10 * time.Millisecond) + } + rc := hub.Get(deviceID) + if rc == nil { + t.Fatalf("recorder %s not connected while waiting after %s for synced=%v", deviceID, minWait, wantSynced) + } + t.Fatalf("recorder synced=%v want=%v after %s", rc.IsStateSynced(), wantSynced, minWait) +} + +func waitForRecorderCachedRawState(t *testing.T, hub *services.RecorderHub, deviceID string, wantSynced bool, wantState string, wantTaskID string) { + t.Helper() + deadline := time.Now().Add(2 * time.Second) + for time.Now().Before(deadline) { + rc := hub.Get(deviceID) + if rc != nil && rc.IsStateSynced() == wantSynced { + st := rc.GetState() + if strings.TrimSpace(st.CurrentState) == wantState && strings.TrimSpace(st.TaskID) == wantTaskID { + return + } + } + time.Sleep(10 * time.Millisecond) + } + rc := hub.Get(deviceID) + if rc == nil { + t.Fatalf("recorder %s not connected while waiting for raw state=%q task=%q synced=%v", deviceID, wantState, wantTaskID, wantSynced) + } + st := rc.GetState() + t.Fatalf("recorder raw state current=%q task=%q synced=%v, want current=%q task=%q synced=%v", st.CurrentState, st.TaskID, rc.IsStateSynced(), wantState, wantTaskID, wantSynced) +} + +func waitForRecorderConnectionSyncedFlag(t *testing.T, hub *services.RecorderHub, deviceID string, wantSynced bool) { + t.Helper() + deadline := time.Now().Add(2 * time.Second) + for time.Now().Before(deadline) { + rc := hub.Get(deviceID) + if rc != nil && rc.IsStateSynced() == wantSynced { + return + } + time.Sleep(10 * time.Millisecond) + } + rc := hub.Get(deviceID) + if rc == nil { + t.Fatalf("recorder %s not connected while waiting for synced=%v", deviceID, wantSynced) + } + t.Fatalf("recorder synced=%v want=%v", rc.IsStateSynced(), wantSynced) +} + +func waitForRecorderCachedState(t *testing.T, hub *services.RecorderHub, deviceID string, wantSynced bool, wantState string, wantTaskID string) { + t.Helper() + deadline := time.Now().Add(2 * time.Second) + for time.Now().Before(deadline) { + rc := hub.Get(deviceID) + if rc != nil && rc.IsStateSynced() == wantSynced { + st := rc.GetState() + if strings.EqualFold(strings.TrimSpace(st.CurrentState), wantState) && strings.TrimSpace(st.TaskID) == wantTaskID { + return + } + } + time.Sleep(10 * time.Millisecond) + } + rc := hub.Get(deviceID) + if rc == nil { + t.Fatalf("recorder %s not connected while waiting for state=%s task=%s synced=%v", deviceID, wantState, wantTaskID, wantSynced) + } + st := rc.GetState() + t.Fatalf("recorder state current=%q task=%q synced=%v, want current=%q task=%q synced=%v", st.CurrentState, st.TaskID, rc.IsStateSynced(), wantState, wantTaskID, wantSynced) +} + +func waitForRecorderHubDisconnected(t *testing.T, hub *services.RecorderHub, deviceID string) { + t.Helper() + deadline := time.Now().Add(2 * time.Second) + for time.Now().Before(deadline) { + if hub.Get(deviceID) == nil { + return + } + time.Sleep(10 * time.Millisecond) + } + t.Fatalf("recorder %s remained connected", deviceID) +} + +func waitForRecorderInteractionTaskStatus(t *testing.T, db *sqlx.DB, taskID string, want string) { + t.Helper() + deadline := time.Now().Add(2 * time.Second) + for time.Now().Before(deadline) { + var got string + if err := db.Get(&got, `SELECT status FROM tasks WHERE task_id = ?`, taskID); err != nil { + t.Fatalf("query task status: %v", err) + } + if got == want { + return + } + time.Sleep(10 * time.Millisecond) + } + assertRecorderInteractionTaskStatus(t, db, taskID, want) +} + +func recorderInteractionTaskTimestamp(t *testing.T, db *sqlx.DB, taskID string, column string) string { + t.Helper() + if column != "ready_at" && column != "started_at" && column != "completed_at" { + t.Fatalf("unexpected timestamp column %q", column) + } + var got string + if err := db.Get(&got, `SELECT COALESCE(CAST(`+column+` AS TEXT), '') FROM tasks WHERE task_id = ?`, taskID); err != nil { + t.Fatalf("query %s for %s: %v", column, taskID, err) + } + return got +} diff --git a/internal/api/handlers/transfer.go b/internal/api/handlers/transfer.go index 9b82d36..ef935a4 100644 --- a/internal/api/handlers/transfer.go +++ b/internal/api/handlers/transfer.go @@ -195,10 +195,11 @@ func (h *TransferHandler) pingLoop(ctx context.Context, dc *services.TransferCon case <-ticker.C: pingCtx, cancel := context.WithTimeout(ctx, timeout) err := dc.Conn.Ping(pingCtx) + timedOut := errors.Is(err, context.DeadlineExceeded) || errors.Is(pingCtx.Err(), context.DeadlineExceeded) cancel() if err != nil { if ctx.Err() == nil { - logger.Printf("[TRANSFER] Device %s ping failed: %v", dc.DeviceID, err) + logWebSocketPingFailure("TRANSFER", "Device", dc.DeviceID, timeout, timedOut, err) if closeErr := dc.Conn.CloseNow(); closeErr != nil { if !isExpectedWebSocketCloseError(closeErr) { logger.Printf("[TRANSFER] Device %s close after ping failure: %v", dc.DeviceID, closeErr) @@ -228,13 +229,6 @@ func transferPingTimeout(cfg *config.TransferConfig) time.Duration { return time.Duration(cfg.PingTimeout) * time.Second } -func transferStaleThreshold(cfg *config.TransferConfig) time.Duration { - if cfg == nil || cfg.StaleThreshold <= 0 { - return 0 - } - return time.Duration(cfg.StaleThreshold) * time.Second -} - func closeReplacedTransferConn(deviceID string, dc *services.TransferConn) { if dc == nil || dc.Conn == nil { return @@ -826,11 +820,19 @@ func revertRunnableTasksOnDeviceDisconnect(db *sqlx.DB, deviceID string, recorde switch st { case "ready": if _, err := recorderHub.SendRPC(rpcCtx, deviceID, "clear", nil, timeout); err != nil { - logger.Printf("[DEVICE] Device %s: recorder clear after transfer disconnect failed (task=%s): %v", deviceID, tid, err) + if errors.Is(err, services.ErrRecorderRPCTimeout) { + logRecorderRPCTimeout(deviceID, "clear", tid, "transfer_disconnect", timeout, err) + } else { + logger.Printf("[DEVICE] Device %s: recorder clear after transfer disconnect failed (task=%s): %v", deviceID, tid, err) + } } case "in_progress": if _, err := recorderHub.SendRPC(rpcCtx, deviceID, "cancel", map[string]interface{}{"task_id": tid}, timeout); err != nil { - logger.Printf("[DEVICE] Device %s: recorder cancel after transfer disconnect failed (task=%s): %v", deviceID, tid, err) + if errors.Is(err, services.ErrRecorderRPCTimeout) { + logRecorderRPCTimeout(deviceID, "cancel", tid, "transfer_disconnect", timeout, err) + } else { + logger.Printf("[DEVICE] Device %s: recorder cancel after transfer disconnect failed (task=%s): %v", deviceID, tid, err) + } } } } diff --git a/internal/services/device_state_broker.go b/internal/services/device_state_broker.go index 505c8b6..ef06fc5 100644 --- a/internal/services/device_state_broker.go +++ b/internal/services/device_state_broker.go @@ -90,10 +90,7 @@ func (b *DeviceStateBroker) Subscribe(buffer int) (<-chan DeviceStateEvent, func return ch, func() { b.mu.Lock() - if existing, ok := b.subscribers[id]; ok { - delete(b.subscribers, id) - close(existing) - } + delete(b.subscribers, id) b.mu.Unlock() } } diff --git a/internal/services/device_state_broker_test.go b/internal/services/device_state_broker_test.go new file mode 100644 index 0000000..1debaab --- /dev/null +++ b/internal/services/device_state_broker_test.go @@ -0,0 +1,37 @@ +// SPDX-FileCopyrightText: 2026 ArcheBase +// +// SPDX-License-Identifier: MulanPSL-2.0 + +package services + +import "testing" + +func TestDeviceStateBrokerUnsubscribeDoesNotCloseChannel(t *testing.T) { + broker := NewDeviceStateBroker() + events, unsubscribe := broker.Subscribe(1) + + unsubscribe() + + select { + case _, ok := <-events: + if !ok { + t.Fatalf("unsubscribe closed subscriber channel; concurrent publish can panic on send") + } + default: + } +} + +func TestDeviceStateBrokerConcurrentPublishUnsubscribe(t *testing.T) { + broker := NewDeviceStateBroker() + + for i := 0; i < 1000; i++ { + _, unsubscribe := broker.Subscribe(1) + done := make(chan struct{}) + go func() { + defer close(done) + broker.Publish("robot-001", DeviceStateEvent{"type": "recorder_state"}) + }() + unsubscribe() + <-done + } +} diff --git a/internal/services/recorder_hub.go b/internal/services/recorder_hub.go index 1077311..4df29ce 100644 --- a/internal/services/recorder_hub.go +++ b/internal/services/recorder_hub.go @@ -300,13 +300,24 @@ func (h *RecorderHub) SendRPC(ctx context.Context, deviceID, action string, para Params: params, } + writeCtx := ctx + var writeCancel context.CancelFunc + if timeout > 0 { + writeCtx, writeCancel = context.WithTimeout(ctx, timeout) + } rc.WriteMu.Lock() - writeErr := wsjson.Write(ctx, rc.Conn, req) + writeErr := wsjson.Write(writeCtx, rc.Conn, req) rc.WriteMu.Unlock() + if writeCancel != nil { + writeCancel() + } if writeErr != nil { rc.PendingMu.Lock() delete(rc.Pending, requestID) rc.PendingMu.Unlock() + if errors.Is(writeErr, context.DeadlineExceeded) { + return nil, ErrRecorderRPCTimeout + } return nil, fmt.Errorf("write rpc request: %w", writeErr) } From 6d976a7c1c679ebb106227bb8c1adc8072d762f0 Mon Sep 17 00:00:00 2001 From: chao liu Date: Fri, 29 May 2026 13:01:21 +0800 Subject: [PATCH 09/12] Fix recorder state sync and transfer timeouts --- internal/api/handlers/axon_rpc.go | 99 +++++++++++++++-- .../recorder_axon_interaction_test.go | 103 ++++++++++++++++-- internal/api/handlers/task.go | 53 ++++++--- .../api/handlers/task_state_recovery_test.go | 94 +++++++++++++++- internal/api/handlers/transfer.go | 64 +++++++---- internal/config/config.go | 2 + internal/server/server.go | 11 +- internal/server/server_test.go | 33 ++++++ internal/services/transfer_hub.go | 42 ++++++- 9 files changed, 445 insertions(+), 56 deletions(-) create mode 100644 internal/server/server_test.go diff --git a/internal/api/handlers/axon_rpc.go b/internal/api/handlers/axon_rpc.go index 1f008ee..af22846 100644 --- a/internal/api/handlers/axon_rpc.go +++ b/internal/api/handlers/axon_rpc.go @@ -10,6 +10,7 @@ import ( "database/sql" "encoding/json" "errors" + "fmt" "net/http" "strconv" "strings" @@ -318,6 +319,7 @@ func (h *RecorderHandler) Finish(c *gin.Context) { if !ok { return } + params = h.fillRecorderTaskIDFromCache(c.Param("device_id"), params) h.callRPC(c, "finish", params) } @@ -655,7 +657,9 @@ func (h *RecorderHandler) refreshRecorderState(ctx context.Context, deviceID str h.markRecorderSyncing(rc, "get_state", err) return rc.GetState(), false, err } - h.applyRecorderStateSnapshot(rc, state, "get_state") + if err := h.applyRecorderStateSnapshot(rc, state, "get_state"); err != nil { + return rc.GetState(), false, err + } return rc.GetState(), true, nil } @@ -703,7 +707,15 @@ func (h *RecorderHandler) handleRecorderRPCTimeout(ctx context.Context, deviceID logRecorderRPCTimeout(deviceID, action, taskID, "api_rpc", timeout, timeoutErr) if rc := h.hub.Get(deviceID); rc != nil { h.markRecorderSyncing(rc, "rpc_timeout:"+action, timeoutErr) - go h.syncRecorderStateFromDevice(ctx, rc) + syncTimeout := timeout + if syncTimeout <= 0 { + syncTimeout = recorderRPCResponseTimeout(h.cfg) + } + syncCtx, cancel := context.WithTimeout(context.WithoutCancel(ctx), syncTimeout) + go func() { + defer cancel() + h.syncRecorderStateFromDevice(syncCtx, rc) + }() } if h.stateBroker == nil { return @@ -810,9 +822,12 @@ func (h *RecorderHandler) callRPC(c *gin.Context, action string, params map[stri if response != nil && response.Success { state := recorderStateFromRPCData(response.Data) + if strings.TrimSpace(state.TaskID) == "" && recorderStateKeepsTaskID(state.CurrentState) { + state.TaskID = recorderTaskIDFromRPCParams(params) + } if strings.TrimSpace(state.CurrentState) != "" { if rc := h.hub.Get(deviceID); rc != nil { - h.applyRecorderStateSnapshot(rc, state, "rpc_response:"+action) + _ = h.applyRecorderStateSnapshot(rc, state, "rpc_response:"+action) } } } @@ -834,6 +849,33 @@ func (h *RecorderHandler) bindOptionalParams(c *gin.Context) (map[string]interfa return params, true } +func recorderTaskIDFromRPCParams(params map[string]interface{}) string { + taskID := firstNonEmptyString(params, "task_id") + if taskID != "" { + return taskID + } + return stringValue(mapValue(params, "task_config"), "task_id") +} + +func (h *RecorderHandler) fillRecorderTaskIDFromCache(deviceID string, params map[string]interface{}) map[string]interface{} { + if recorderTaskIDFromRPCParams(params) != "" || h == nil || h.hub == nil { + return params + } + rc := h.hub.Get(strings.TrimSpace(deviceID)) + if rc == nil { + return params + } + taskID := strings.TrimSpace(rc.GetState().TaskID) + if taskID == "" { + return params + } + if params == nil { + params = map[string]interface{}{} + } + params["task_id"] = taskID + return params +} + // ListDevices returns all currently connected recorder devices. // // @Summary List connected recorders @@ -955,7 +997,7 @@ func (h *RecorderHandler) handleStateUpdate(rc *services.RecorderConn, msg map[s TaskID: stringValue(data, "task_id"), Raw: data, } - h.applyRecorderStateSnapshot(rc, state, "state_update") + _ = h.applyRecorderStateSnapshot(rc, state, "state_update") } func (h *RecorderHandler) syncRecorderStateFromDevice(ctx context.Context, rc *services.RecorderConn) { @@ -973,11 +1015,24 @@ func (h *RecorderHandler) syncRecorderStateFromDevice(ctx context.Context, rc *s } } -func (h *RecorderHandler) applyRecorderStateSnapshot(rc *services.RecorderConn, state services.RecorderState, source string) { +func (h *RecorderHandler) applyRecorderStateSnapshot(rc *services.RecorderConn, state services.RecorderState, source string) error { if rc == nil { - return + return nil } state.Source = strings.TrimSpace(source) + state.TaskID = strings.TrimSpace(state.TaskID) + if !recorderStateKeepsTaskID(state.CurrentState) { + state.TaskID = "" + } + if state.TaskID == "" && recorderStateCanInheritTaskID(state.CurrentState, state.Source) { + state.TaskID = strings.TrimSpace(rc.GetState().TaskID) + } + if state.TaskID == "" && recorderStateRequiresTaskID(state.CurrentState, state.Source) { + err := fmt.Errorf("recorder %s snapshot missing task_id for state=%s", state.Source, strings.TrimSpace(state.CurrentState)) + h.markRecorderSyncing(rc, state.Source, err) + logger.Printf("[RECORDER] Recorder %s ignored %s state=%s without task_id", rc.DeviceID, state.Source, state.CurrentState) + return err + } rc.UpdateState(state) st := rc.GetState() h.reconcileRecorderTaskState(rc.DeviceID, st, source) @@ -985,9 +1040,10 @@ func (h *RecorderHandler) applyRecorderStateSnapshot(rc *services.RecorderConn, if source == "state_update" { // #nosec G706 -- Set aside for now logger.Printf("[RECORDER] Recorder %s state=%s task=%s", rc.DeviceID, st.CurrentState, st.TaskID) - return + return nil } logger.Printf("[RECORDER] Recorder %s state=%s task=%s source=%s", rc.DeviceID, st.CurrentState, st.TaskID, source) + return nil } func recorderStateFromRPCData(data map[string]interface{}) services.RecorderState { @@ -1002,6 +1058,35 @@ func recorderStateFromRPCData(data map[string]interface{}) services.RecorderStat return state } +func recorderStateKeepsTaskID(state string) bool { + switch strings.ToLower(strings.TrimSpace(state)) { + case "ready", "recording", "paused": + return true + default: + return false + } +} + +func recorderStateCanInheritTaskID(state string, source string) bool { + switch strings.TrimSpace(source) { + case "rpc_response:pause": + return strings.EqualFold(strings.TrimSpace(state), "paused") + case "rpc_response:resume": + return strings.EqualFold(strings.TrimSpace(state), "recording") + default: + return false + } +} + +func recorderStateRequiresTaskID(state string, source string) bool { + switch strings.TrimSpace(source) { + case "get_state", "state_update": + return recorderStateKeepsTaskID(state) + default: + return false + } +} + func firstNonEmptyString(m map[string]interface{}, keys ...string) string { for _, key := range keys { if value := strings.TrimSpace(stringValue(m, key)); value != "" { diff --git a/internal/api/handlers/recorder_axon_interaction_test.go b/internal/api/handlers/recorder_axon_interaction_test.go index 9851cef..0d6cbfd 100644 --- a/internal/api/handlers/recorder_axon_interaction_test.go +++ b/internal/api/handlers/recorder_axon_interaction_test.go @@ -69,7 +69,7 @@ func TestRecorderStateSnapshotReconcilesAdditionalStates(t *testing.T) { handler := NewRecorderHandler(hub, &config.RecorderConfig{}, db) rc := hub.NewRecorderConn(nil, "robot-001", "127.0.0.1") - handler.applyRecorderStateSnapshot(rc, tt.state, "get_state") + _ = handler.applyRecorderStateSnapshot(rc, tt.state, "get_state") assertTaskStateRecoveryStatus(t, db, "task-state", tt.wantStatus) if tt.wantColumn != "" { @@ -79,7 +79,7 @@ func TestRecorderStateSnapshotReconcilesAdditionalStates(t *testing.T) { } } -func TestRecorderNonIdleWithoutTaskIDMarksSyncedWithoutDBChange(t *testing.T) { +func TestRecorderAuthoritativeNonIdleWithoutTaskIDKeepsUnsyncedWithoutDBChange(t *testing.T) { db := newTaskStateRecoveryDB(t) defer db.Close() seedTaskStateRecoveryTask(t, db, "task-no-id", "pending") @@ -87,11 +87,16 @@ func TestRecorderNonIdleWithoutTaskIDMarksSyncedWithoutDBChange(t *testing.T) { hub := services.NewRecorderHub() handler := NewRecorderHandler(hub, &config.RecorderConfig{}, db) rc := hub.NewRecorderConn(nil, "robot-001", "127.0.0.1") + if !hub.Connect("robot-001", rc) { + t.Fatalf("connect recorder: initial connect failed") + } - handler.applyRecorderStateSnapshot(rc, services.RecorderState{CurrentState: "ready"}, "state_update") + if err := handler.applyRecorderStateSnapshot(rc, services.RecorderState{CurrentState: "ready"}, "state_update"); err == nil { + t.Fatalf("expected missing task_id error for authoritative state_update") + } - if !rc.IsStateSynced() { - t.Fatalf("non-empty state without task_id did not mark connection synced") + if rc.IsStateSynced() { + t.Fatalf("authoritative non-idle state without task_id marked connection synced") } assertTaskStateRecoveryStatus(t, db, "task-no-id", "pending") } @@ -101,7 +106,7 @@ func TestRecorderEmptyStateSnapshotKeepsConnectionUnsynced(t *testing.T) { handler := NewRecorderHandler(hub, &config.RecorderConfig{}, nil) rc := hub.NewRecorderConn(nil, "robot-001", "127.0.0.1") - handler.applyRecorderStateSnapshot(rc, services.RecorderState{}, "state_update") + _ = handler.applyRecorderStateSnapshot(rc, services.RecorderState{}, "state_update") if rc.IsStateSynced() { t.Fatalf("empty state snapshot marked connection as synced") @@ -165,7 +170,7 @@ func TestRecorderConfigRejectsAdditionalBusyStates(t *testing.T) { return services.RPCResponse{Success: true} }) handler := NewRecorderHandler(hub, &config.RecorderConfig{ResponseTimeout: 1}, db) - handler.applyRecorderStateSnapshot(rc, services.RecorderState{CurrentState: state, TaskID: "current-task"}, "state_update") + _ = handler.applyRecorderStateSnapshot(rc, services.RecorderState{CurrentState: state, TaskID: "current-task"}, "state_update") router := newRecorderInteractionRouter(handler) w := recorderInteractionPost(t, router, "/recorder/robot-001/config", `{"task_config":{"task_id":"task-config-busy"}}`) @@ -192,7 +197,7 @@ func TestRecorderConfigTimeoutAndDisconnectedKeepTaskPending(t *testing.T) { hub := services.NewRecorderHub() rc, requests := attachRecorderRPCObserverWithConn(t, hub, "robot-001") handler := NewRecorderHandler(hub, &config.RecorderConfig{ResponseTimeout: 1}, db) - handler.applyRecorderStateSnapshot(rc, services.RecorderState{CurrentState: "idle"}, "state_update") + _ = handler.applyRecorderStateSnapshot(rc, services.RecorderState{CurrentState: "idle"}, "state_update") router := newRecorderInteractionRouter(handler) w := recorderInteractionPost(t, router, "/recorder/robot-001/config", `{"task_config":{"task_id":"task-config-timeout"}}`) @@ -351,6 +356,88 @@ func TestRecorderForwardOnlyActionsDoNotMutateTaskState(t *testing.T) { } } +func TestRecorderRPCPauseResumeWithoutTaskIDPreservesBoundTask(t *testing.T) { + hub := services.NewRecorderHub() + handler := NewRecorderHandler(hub, &config.RecorderConfig{}, nil) + rc := hub.NewRecorderConn(nil, "robot-001", "127.0.0.1") + if !hub.Connect("robot-001", rc) { + t.Fatalf("connect recorder: initial connect failed") + } + + _ = handler.applyRecorderStateSnapshot(rc, services.RecorderState{CurrentState: "recording", TaskID: "task-live"}, "state_update") + _ = handler.applyRecorderStateSnapshot(rc, services.RecorderState{CurrentState: "paused"}, "rpc_response:pause") + + st := rc.GetState() + if st.CurrentState != "paused" || st.TaskID != "task-live" { + t.Fatalf("state=(%q,%q), want paused/task-live", st.CurrentState, st.TaskID) + } + + _ = handler.applyRecorderStateSnapshot(rc, services.RecorderState{CurrentState: "recording"}, "rpc_response:resume") + st = rc.GetState() + if st.CurrentState != "recording" || st.TaskID != "task-live" { + t.Fatalf("state=(%q,%q), want recording/task-live", st.CurrentState, st.TaskID) + } + + _ = handler.applyRecorderStateSnapshot(rc, services.RecorderState{CurrentState: "idle", TaskID: "task-live"}, "rpc_response:finish") + st = rc.GetState() + if st.CurrentState != "idle" || st.TaskID != "" { + t.Fatalf("idle state=(%q,%q), want idle with empty task", st.CurrentState, st.TaskID) + } +} + +func TestRecorderAuthoritativeStateWithoutTaskIDMarksUnsynced(t *testing.T) { + hub := services.NewRecorderHub() + handler := NewRecorderHandler(hub, &config.RecorderConfig{}, nil) + rc := hub.NewRecorderConn(nil, "robot-001", "127.0.0.1") + if !hub.Connect("robot-001", rc) { + t.Fatalf("connect recorder: initial connect failed") + } + + _ = handler.applyRecorderStateSnapshot(rc, services.RecorderState{CurrentState: "recording", TaskID: "task-live"}, "state_update") + if err := handler.applyRecorderStateSnapshot(rc, services.RecorderState{CurrentState: "paused"}, "state_update"); err == nil { + t.Fatalf("expected missing task_id error for authoritative state_update") + } + + st := rc.GetState() + if st.CurrentState != "recording" || st.TaskID != "task-live" { + t.Fatalf("state=(%q,%q), want previous recording/task-live", st.CurrentState, st.TaskID) + } + if rc.IsStateSynced() { + t.Fatalf("state should be marked unsynced after authoritative snapshot without task_id") + } +} + +func TestRecorderFinishUsesCachedTaskIDWhenBodyEmpty(t *testing.T) { + hub := services.NewRecorderHub() + reqC := make(chan services.RPCRequest, 1) + rc := attachRecorderRPCResponderWithConn(t, hub, "robot-001", func(req services.RPCRequest) services.RPCResponse { + reqC <- req + return services.RPCResponse{Success: true, Data: map[string]interface{}{"state": "idle"}} + }) + rc.UpdateState(services.RecorderState{CurrentState: "paused", TaskID: "task-live"}) + handler := NewRecorderHandler(hub, &config.RecorderConfig{ResponseTimeout: 1}, nil) + router := newRecorderInteractionRouter(handler) + + w := recorderInteractionPost(t, router, "/recorder/robot-001/finish", `{}`) + if w.Code != http.StatusOK { + t.Fatalf("status=%d want=%d body=%s", w.Code, http.StatusOK, w.Body.String()) + } + + select { + case req := <-reqC: + if got := strings.TrimSpace(stringValue(req.Params, "task_id")); got != "task-live" { + t.Fatalf("finish params task_id=%q want task-live; params=%#v", got, req.Params) + } + case <-time.After(time.Second): + t.Fatalf("timed out waiting for finish RPC") + } + + st := rc.GetState() + if st.CurrentState != "idle" || st.TaskID != "" { + t.Fatalf("finish response state=(%q,%q), want idle with empty task", st.CurrentState, st.TaskID) + } +} + func TestRecorderCancelAndClearTaskStateSideEffects(t *testing.T) { t.Run("cancel success reverts ready and in_progress", func(t *testing.T) { for _, initial := range []string{"ready", "in_progress"} { diff --git a/internal/api/handlers/task.go b/internal/api/handlers/task.go index 9f6db6c..73f3a3e 100644 --- a/internal/api/handlers/task.go +++ b/internal/api/handlers/task.go @@ -6,9 +6,11 @@ package handlers import ( + "context" "crypto/rand" "database/sql" "encoding/hex" + "errors" "fmt" "net/http" "strconv" @@ -56,24 +58,42 @@ func newPublicBatchID(now time.Time, seq int) (string, error) { ), nil } +type transferUploadHub interface { + Get(deviceID string) *services.TransferConn + SendToDeviceWithTimeout(ctx context.Context, deviceID string, msg map[string]interface{}, timeout time.Duration) error +} + // TaskHandler handles task-related HTTP requests type TaskHandler struct { - db *sqlx.DB - hub *services.TransferHub - recorderHub *services.RecorderHub - recorderRPCTimeout time.Duration + db *sqlx.DB + hub transferUploadHub + recorderHub *services.RecorderHub + recorderRPCTimeout time.Duration + transferWriteTimeout time.Duration } -// NewTaskHandler creates a new TaskHandler -func NewTaskHandler(db *sqlx.DB, hub *services.TransferHub, recorderHub *services.RecorderHub, recorderRPCTimeout time.Duration) *TaskHandler { +// NewTaskHandler creates a new TaskHandler. +func NewTaskHandler(db *sqlx.DB, hub *services.TransferHub, recorderHub *services.RecorderHub, recorderRPCTimeout time.Duration, transferWriteTimeout ...time.Duration) *TaskHandler { + writeTimeout := services.DefaultTransferWriteTimeout + if len(transferWriteTimeout) > 0 && transferWriteTimeout[0] > 0 { + writeTimeout = transferWriteTimeout[0] + } return &TaskHandler{ - db: db, - hub: hub, - recorderHub: recorderHub, - recorderRPCTimeout: recorderRPCTimeout, + db: db, + hub: hub, + recorderHub: recorderHub, + recorderRPCTimeout: recorderRPCTimeout, + transferWriteTimeout: writeTimeout, } } +func (h *TaskHandler) axonTransferWriteTimeout() time.Duration { + if h == nil || h.transferWriteTimeout <= 0 { + return services.DefaultTransferWriteTimeout + } + return h.transferWriteTimeout +} + // TaskConfig represents the task configuration response type TaskConfig struct { TaskID string `json:"task_id"` @@ -1050,9 +1070,16 @@ func (h *TaskHandler) OnRecordingFinish(c *gin.Context) { "priority": 1, } - if err := h.hub.SendToDevice(c.Request.Context(), deviceID, uploadRequest); err != nil { - logger.Printf("[RECORDER] Failed to send upload_request to device %s: %v", deviceID, err) - c.JSON(http.StatusInternalServerError, gin.H{ + writeTimeout := h.axonTransferWriteTimeout() + if err := h.hub.SendToDeviceWithTimeout(c.Request.Context(), deviceID, uploadRequest, writeTimeout); err != nil { + status := http.StatusInternalServerError + if errors.Is(err, services.ErrTransferWriteTimeout) { + status = http.StatusGatewayTimeout + logger.Printf("[TRANSFER] Device %s: auto upload_request timed out after %s for task=%s: %v", deviceID, timeoutLogValue(writeTimeout), callback.TaskID, err) + } else { + logger.Printf("[RECORDER] Failed to send upload_request to device %s: %v", deviceID, err) + } + c.JSON(status, gin.H{ "error_msg": "Failed to trigger upload: " + err.Error(), }) return diff --git a/internal/api/handlers/task_state_recovery_test.go b/internal/api/handlers/task_state_recovery_test.go index 9ed59c8..bc824d6 100644 --- a/internal/api/handlers/task_state_recovery_test.go +++ b/internal/api/handlers/task_state_recovery_test.go @@ -8,6 +8,7 @@ import ( "bytes" "context" "encoding/json" + "fmt" "net/http" "net/http/httptest" "strings" @@ -98,7 +99,7 @@ func TestRecorderGetStateSnapshotReadyRestoresPendingTask(t *testing.T) { }, }) - handler.applyRecorderStateSnapshot(rc, state, "get_state") + _ = handler.applyRecorderStateSnapshot(rc, state, "get_state") assertTaskStateRecoveryStatus(t, db, "task-rpc-ready", "ready") assertTaskStateRecoveryTimestampSet(t, db, "task-rpc-ready", "ready_at") @@ -151,7 +152,7 @@ func TestRecorderGetStateReportsSyncedAfterInitialStateSnapshot(t *testing.T) { if !hub.Connect("robot-001", rc) { t.Fatalf("connect recorder: initial connect failed") } - handler.applyRecorderStateSnapshot(rc, services.RecorderState{CurrentState: "idle"}, "state_update") + _ = handler.applyRecorderStateSnapshot(rc, services.RecorderState{CurrentState: "idle"}, "state_update") gin.SetMode(gin.TestMode) router := gin.New() @@ -223,7 +224,7 @@ func TestRecorderConfigRejectsWhenSyncedRecorderBusy(t *testing.T) { return services.RPCResponse{Success: true} }) handler := NewRecorderHandler(hub, &config.RecorderConfig{ResponseTimeout: 1}, db) - handler.applyRecorderStateSnapshot(rc, services.RecorderState{ + _ = handler.applyRecorderStateSnapshot(rc, services.RecorderState{ CurrentState: "ready", TaskID: "existing-task", }, "state_update") @@ -259,7 +260,7 @@ func TestRecorderConfigAdvancesTaskAfterInitialStateSnapshot(t *testing.T) { return services.RPCResponse{Success: true} }) handler := NewRecorderHandler(hub, &config.RecorderConfig{ResponseTimeout: 1}, db) - handler.applyRecorderStateSnapshot(rc, services.RecorderState{CurrentState: "idle"}, "state_update") + _ = handler.applyRecorderStateSnapshot(rc, services.RecorderState{CurrentState: "idle"}, "state_update") gin.SetMode(gin.TestMode) router := gin.New() @@ -287,7 +288,7 @@ func TestRecorderConfigDoesNotAdvanceTaskWhenRPCResponseUnsuccessful(t *testing. return services.RPCResponse{Success: false, Message: "device rejected config"} }) handler := NewRecorderHandler(hub, &config.RecorderConfig{ResponseTimeout: 1}, db) - handler.applyRecorderStateSnapshot(rc, services.RecorderState{CurrentState: "idle"}, "state_update") + _ = handler.applyRecorderStateSnapshot(rc, services.RecorderState{CurrentState: "idle"}, "state_update") gin.SetMode(gin.TestMode) router := gin.New() @@ -411,6 +412,89 @@ func TestRecordingStartCallbackAdvancesPendingTask(t *testing.T) { assertTaskStateRecoveryTimestampSet(t, db, "task-start", "started_at") } +func TestTaskHandlerAxonTransferWriteTimeout(t *testing.T) { + custom := 250 * time.Millisecond + handler := NewTaskHandler(nil, nil, nil, 0, custom) + if got := handler.axonTransferWriteTimeout(); got != custom { + t.Fatalf("axonTransferWriteTimeout()=%s want=%s", got, custom) + } + + handler = NewTaskHandler(nil, nil, nil, 0, 0) + if got := handler.axonTransferWriteTimeout(); got != services.DefaultTransferWriteTimeout { + t.Fatalf("default axonTransferWriteTimeout()=%s want=%s", got, services.DefaultTransferWriteTimeout) + } +} + +func TestRecordingFinishAutoUploadUsesConfiguredTransferWriteTimeout(t *testing.T) { + custom := 250 * time.Millisecond + hub := &recordingFinishTransferHub{ + conn: &services.TransferConn{DeviceID: "robot-001"}, + } + handler := &TaskHandler{hub: hub, transferWriteTimeout: custom} + + gin.SetMode(gin.TestMode) + router := gin.New() + handler.RegisterCallbackRoutes(router.Group("/callbacks")) + + body, err := json.Marshal(RecordingFinishCallback{ + TaskID: "task-finish", + DeviceID: "robot-001", + Status: "finished", + FinishedAt: time.Now().UTC().Format(time.RFC3339), + OutputPath: "/data/task-finish.mcap", + }) + if err != nil { + t.Fatalf("marshal callback: %v", err) + } + + req := httptest.NewRequest(http.MethodPost, "/callbacks/finish", bytes.NewReader(body)) + req.Header.Set("Content-Type", "application/json") + w := httptest.NewRecorder() + router.ServeHTTP(w, req) + + if w.Code != http.StatusGatewayTimeout { + t.Fatalf("status=%d want=%d body=%s", w.Code, http.StatusGatewayTimeout, w.Body.String()) + } + if hub.getDeviceID != "robot-001" { + t.Fatalf("Get device=%q want=%q", hub.getDeviceID, "robot-001") + } + if hub.sendDeviceID != "robot-001" { + t.Fatalf("Send device=%q want=%q", hub.sendDeviceID, "robot-001") + } + if hub.timeout != custom { + t.Fatalf("Send timeout=%s want=%s", hub.timeout, custom) + } + if got := fmt.Sprint(hub.msg["type"]); got != "upload_request" { + t.Fatalf("message type=%q want=%q", got, "upload_request") + } + if got := fmt.Sprint(hub.msg["task_id"]); got != "task-finish" { + t.Fatalf("message task_id=%q want=%q", got, "task-finish") + } + if !strings.Contains(w.Body.String(), custom.String()) { + t.Fatalf("response body %q does not mention custom timeout %s", w.Body.String(), custom) + } +} + +type recordingFinishTransferHub struct { + conn *services.TransferConn + getDeviceID string + sendDeviceID string + msg map[string]interface{} + timeout time.Duration +} + +func (h *recordingFinishTransferHub) Get(deviceID string) *services.TransferConn { + h.getDeviceID = deviceID + return h.conn +} + +func (h *recordingFinishTransferHub) SendToDeviceWithTimeout(ctx context.Context, deviceID string, msg map[string]interface{}, timeout time.Duration) error { + h.sendDeviceID = deviceID + h.msg = msg + h.timeout = timeout + return fmt.Errorf("%w after %s: fake blocked transfer write", services.ErrTransferWriteTimeout, timeout) +} + func newTaskStateRecoveryDB(t *testing.T) *sqlx.DB { t.Helper() db, err := sqlx.Open("sqlite", ":memory:") diff --git a/internal/api/handlers/transfer.go b/internal/api/handlers/transfer.go index ef935a4..4715a27 100644 --- a/internal/api/handlers/transfer.go +++ b/internal/api/handlers/transfer.go @@ -20,7 +20,6 @@ import ( "time" "github.com/coder/websocket" - "github.com/coder/websocket/wsjson" "github.com/gin-gonic/gin" "github.com/google/uuid" "github.com/jmoiron/sqlx" @@ -229,6 +228,43 @@ func transferPingTimeout(cfg *config.TransferConfig) time.Duration { return time.Duration(cfg.PingTimeout) * time.Second } +func transferWriteTimeout(cfg *config.TransferConfig) time.Duration { + if cfg == nil || cfg.WriteTimeout <= 0 { + return services.DefaultTransferWriteTimeout + } + return time.Duration(cfg.WriteTimeout) * time.Second +} + +func transferMessageType(msg map[string]interface{}) string { + msgType := strings.TrimSpace(stringVal(msg, "type")) + if msgType == "" { + return "message" + } + return msgType +} + +func logTransferSendFailure(deviceID string, msgType string, timeout time.Duration, err error) { + if errors.Is(err, services.ErrTransferWriteTimeout) { + logger.Printf("[TRANSFER] Device %s: send %s timed out after %s: %v", deviceID, msgType, timeoutLogValue(timeout), err) + return + } + logger.Printf("[TRANSFER] Device %s: failed to send %s: %v", deviceID, msgType, err) +} + +func (h *TransferHandler) sendToDevice(c *gin.Context, deviceID string, msg map[string]interface{}) bool { + timeout := transferWriteTimeout(h.cfg) + if err := h.hub.SendToDeviceWithTimeout(c.Request.Context(), deviceID, msg, timeout); err != nil { + logTransferSendFailure(deviceID, transferMessageType(msg), timeout, err) + status := http.StatusNotFound + if errors.Is(err, services.ErrTransferWriteTimeout) { + status = http.StatusGatewayTimeout + } + c.JSON(status, gin.H{"error": err.Error()}) + return false + } + return true +} + func closeReplacedTransferConn(deviceID string, dc *services.TransferConn) { if dc == nil || dc.Conn == nil { return @@ -644,15 +680,11 @@ func (h *TransferHandler) onUploadComplete(ctx context.Context, dc *services.Tra "type": "upload_ack", "task_id": taskID, } - dc.WriteMu.Lock() - if err := wsjson.Write(ctx, dc.Conn, ackMsg); err != nil { - dc.WriteMu.Unlock() - // #nosec G706 -- Set aside for now - logger.Printf("[TRANSFER] Device %s: failed to send upload_ack for task=%s: %v", dc.DeviceID, taskID, err) + writeTimeout := transferWriteTimeout(h.cfg) + if err := h.hub.SendToConnWithTimeout(ctx, dc, ackMsg, writeTimeout); err != nil { + logTransferSendFailure(dc.DeviceID, "upload_ack", writeTimeout, err) return } - dc.WriteMu.Unlock() - dc.RecordEvent("outbound", ackMsg) // #nosec G706 -- Set aside for now logger.Printf("[TRANSFER] Device %s: upload_ack sent for task=%s", dc.DeviceID, taskID) @@ -953,8 +985,7 @@ func (h *TransferHandler) UploadRequest(c *gin.Context) { "task_id": body.TaskID, "priority": body.Priority, } - if err := h.hub.SendToDevice(c.Request.Context(), deviceID, msg); err != nil { - c.JSON(http.StatusNotFound, gin.H{"error": err.Error()}) + if !h.sendToDevice(c, deviceID, msg) { return } c.JSON(http.StatusOK, gin.H{"status": "sent"}) @@ -990,9 +1021,7 @@ func (h *TransferHandler) UploadAll(c *gin.Context) { msg := map[string]interface{}{"type": "upload_all"} logger.Printf("[TRANSFER] Sending message to device %s: %+v", deviceID, msg) - if err := h.hub.SendToDevice(c.Request.Context(), deviceID, msg); err != nil { - logger.Printf("[TRANSFER] Failed to send message to device %s: %v", deviceID, err) - c.JSON(http.StatusNotFound, gin.H{"error": err.Error()}) + if !h.sendToDevice(c, deviceID, msg) { return } @@ -1026,8 +1055,7 @@ func (h *TransferHandler) CancelUpload(c *gin.Context) { "type": "cancel", "task_id": body.TaskID, } - if err := h.hub.SendToDevice(c.Request.Context(), deviceID, msg); err != nil { - c.JSON(http.StatusNotFound, gin.H{"error": err.Error()}) + if !h.sendToDevice(c, deviceID, msg) { return } c.JSON(http.StatusOK, gin.H{"status": "sent"}) @@ -1053,8 +1081,7 @@ func (h *TransferHandler) StatusQuery(c *gin.Context) { } msg := map[string]interface{}{"type": "status_query"} - if err := h.hub.SendToDevice(c.Request.Context(), deviceID, msg); err != nil { - c.JSON(http.StatusNotFound, gin.H{"error": err.Error()}) + if !h.sendToDevice(c, deviceID, msg) { return } c.JSON(http.StatusOK, gin.H{"status": "sent"}) @@ -1086,8 +1113,7 @@ func (h *TransferHandler) ManualACK(c *gin.Context) { "type": "upload_ack", "task_id": body.TaskID, } - if err := h.hub.SendToDevice(c.Request.Context(), deviceID, msg); err != nil { - c.JSON(http.StatusNotFound, gin.H{"error": err.Error()}) + if !h.sendToDevice(c, deviceID, msg) { return } diff --git a/internal/config/config.go b/internal/config/config.go index 914a944..6b76d88 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -119,6 +119,7 @@ type TransferConfig struct { WSPort int MaxEvents int ReadTimeout int // seconds + WriteTimeout int // seconds PingInterval int // seconds PingTimeout int // seconds StaleThreshold int // seconds @@ -233,6 +234,7 @@ func Load() (*Config, error) { WSPort: getEnvInt("KEYSTONE_AXON_TRANSFER_WS_PORT", 8090), MaxEvents: getEnvInt("KEYSTONE_AXON_TRANSFER_MAX_EVENTS", 10000), ReadTimeout: getEnvInt("KEYSTONE_AXON_TRANSFER_READ_TIMEOUT", 30), + WriteTimeout: getEnvInt("KEYSTONE_AXON_TRANSFER_WRITE_TIMEOUT", 10), PingInterval: getEnvInt("KEYSTONE_AXON_TRANSFER_PING_INTERVAL", 25), PingTimeout: getEnvInt("KEYSTONE_AXON_TRANSFER_PING_TIMEOUT", 10), StaleThreshold: getEnvInt("KEYSTONE_AXON_TRANSFER_STALE_THRESHOLD", 60), diff --git a/internal/server/server.go b/internal/server/server.go index f500efa..5330a34 100644 --- a/internal/server/server.go +++ b/internal/server/server.go @@ -66,6 +66,13 @@ type Server struct { engine *gin.Engine } +func axonTransferWriteTimeout(cfg *config.TransferConfig) time.Duration { + if cfg == nil || cfg.WriteTimeout <= 0 { + return services.DefaultTransferWriteTimeout + } + return time.Duration(cfg.WriteTimeout) * time.Second +} + // New creates a new server instance. // db and s3Client are optional; pass nil to disable Verified ACK. // syncWorker is optional; pass nil to disable cloud sync API. @@ -103,8 +110,10 @@ func New(cfg *config.Config, db *sqlx.DB, s3Client *s3.Client, syncWorker *servi // Create EpisodeHandler for episode listing episodeHandler := handlers.NewEpisodeHandler(db, s3Client, cfg.Storage.Bucket, &cfg.Auth) + transferWriteTimeout := axonTransferWriteTimeout(&cfg.AxonTransfer) + // Create TaskHandler for task configuration - taskHandler := handlers.NewTaskHandler(db, transferHub, recorderHub, recorderRPCTimeout) + taskHandler := handlers.NewTaskHandler(db, transferHub, recorderHub, recorderRPCTimeout, transferWriteTimeout) // Create database-dependent handlers only when DB is available var ( diff --git a/internal/server/server_test.go b/internal/server/server_test.go new file mode 100644 index 0000000..028179e --- /dev/null +++ b/internal/server/server_test.go @@ -0,0 +1,33 @@ +// SPDX-FileCopyrightText: 2026 ArcheBase +// +// SPDX-License-Identifier: MulanPSL-2.0 + +package server + +import ( + "testing" + "time" + + "archebase.com/keystone-edge/internal/config" + "archebase.com/keystone-edge/internal/services" +) + +func TestAxonTransferWriteTimeoutFromConfig(t *testing.T) { + tests := []struct { + name string + cfg *config.TransferConfig + want time.Duration + }{ + {name: "nil config", cfg: nil, want: services.DefaultTransferWriteTimeout}, + {name: "zero config", cfg: &config.TransferConfig{}, want: services.DefaultTransferWriteTimeout}, + {name: "custom seconds", cfg: &config.TransferConfig{WriteTimeout: 7}, want: 7 * time.Second}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + if got := axonTransferWriteTimeout(tt.cfg); got != tt.want { + t.Fatalf("axonTransferWriteTimeout()=%s want=%s", got, tt.want) + } + }) + } +} diff --git a/internal/services/transfer_hub.go b/internal/services/transfer_hub.go index dfb778f..1a9378d 100644 --- a/internal/services/transfer_hub.go +++ b/internal/services/transfer_hub.go @@ -7,6 +7,7 @@ package services import ( "context" + "errors" "fmt" "sync" "time" @@ -15,6 +16,16 @@ import ( "github.com/coder/websocket/wsjson" ) +var ( + // ErrTransferWriteTimeout indicates that a transfer WebSocket write exceeded its deadline. + ErrTransferWriteTimeout = errors.New("transfer write timeout") +) + +const ( + // DefaultTransferWriteTimeout caps device writes when no handler-specific timeout is configured. + DefaultTransferWriteTimeout = 10 * time.Second +) + // DeviceEvent represents a single event recorded for a device type DeviceEvent struct { Direction string `json:"direction"` // "inbound" or "outbound" @@ -229,18 +240,43 @@ func (h *TransferHub) ListDevices() []DeviceInfo { return result } -// SendToDevice sends a JSON message to a connected device via WebSocket +// SendToDevice sends a JSON message to a connected device via WebSocket. func (h *TransferHub) SendToDevice(ctx context.Context, deviceID string, msg map[string]interface{}) error { + return h.SendToDeviceWithTimeout(ctx, deviceID, msg, DefaultTransferWriteTimeout) +} + +// SendToDeviceWithTimeout sends a JSON message to a connected device with a bounded write deadline. +func (h *TransferHub) SendToDeviceWithTimeout(ctx context.Context, deviceID string, msg map[string]interface{}, timeout time.Duration) error { dc := h.Get(deviceID) if dc == nil { return fmt.Errorf("device %s not connected", deviceID) } + return h.SendToConnWithTimeout(ctx, dc, msg, timeout) +} + +// SendToConnWithTimeout sends a JSON message to an existing transfer connection. +func (h *TransferHub) SendToConnWithTimeout(ctx context.Context, dc *TransferConn, msg map[string]interface{}, timeout time.Duration) error { + if dc == nil || dc.Conn == nil { + return fmt.Errorf("device not connected") + } + if ctx == nil { + ctx = context.Background() + } + if timeout <= 0 { + timeout = DefaultTransferWriteTimeout + } + + writeCtx, cancel := context.WithTimeout(ctx, timeout) + defer cancel() dc.WriteMu.Lock() defer dc.WriteMu.Unlock() - if err := wsjson.Write(ctx, dc.Conn, msg); err != nil { - return fmt.Errorf("failed to send message to device %s: %w", deviceID, err) + if err := wsjson.Write(writeCtx, dc.Conn, msg); err != nil { + if errors.Is(err, context.DeadlineExceeded) || errors.Is(writeCtx.Err(), context.DeadlineExceeded) { + return fmt.Errorf("%w after %s: failed to send message to device %s", ErrTransferWriteTimeout, timeout, dc.DeviceID) + } + return fmt.Errorf("failed to send message to device %s: %w", dc.DeviceID, err) } dc.RecordEvent("outbound", msg) From e30f7d99681a8f9e9f1bbab948b7ea56308fbab4 Mon Sep 17 00:00:00 2001 From: chao liu Date: Fri, 29 May 2026 13:19:20 +0800 Subject: [PATCH 10/12] Add timeout logging across Keystone --- cmd/keystone-edge/main.go | 10 ++++-- internal/api/handlers/axon_rpc.go | 9 +++-- internal/api/handlers/transfer.go | 9 +++-- internal/cloud/auth_client.go | 9 ++++- internal/cloud/gateway_client.go | 21 +++++++++-- internal/cloud/oss_uploader.go | 6 ++++ internal/cloud/timeout_log.go | 58 +++++++++++++++++++++++++++++++ internal/server/server.go | 29 ++++++++++++---- internal/services/sync_worker.go | 18 ++++++++-- 9 files changed, 150 insertions(+), 19 deletions(-) create mode 100644 internal/cloud/timeout_log.go diff --git a/cmd/keystone-edge/main.go b/cmd/keystone-edge/main.go index 002de8d..2dfad89 100644 --- a/cmd/keystone-edge/main.go +++ b/cmd/keystone-edge/main.go @@ -7,6 +7,7 @@ package main import ( "context" + "errors" "flag" "fmt" "os" @@ -184,11 +185,16 @@ func main() { logger.Println("[SERVER] Shutting down...") - ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) + shutdownTimeout := 30 * time.Second + ctx, cancel := context.WithTimeout(context.Background(), shutdownTimeout) defer cancel() if err := srv.Shutdown(ctx); err != nil { - logger.Printf("[SERVER] Error during shutdown: %v", err) + if errors.Is(err, context.DeadlineExceeded) || errors.Is(ctx.Err(), context.DeadlineExceeded) { + logger.Printf("[SERVER] Error during shutdown after %s (timeout_ms=%d): %v", shutdownTimeout, shutdownTimeout.Milliseconds(), err) + } else { + logger.Printf("[SERVER] Error during shutdown: %v", err) + } } logger.Println("[SERVER] Keystone Edge stopped") diff --git a/internal/api/handlers/axon_rpc.go b/internal/api/handlers/axon_rpc.go index af22846..68f34fe 100644 --- a/internal/api/handlers/axon_rpc.go +++ b/internal/api/handlers/axon_rpc.go @@ -142,7 +142,8 @@ func (h *RecorderHandler) HandleWebSocket(w http.ResponseWriter, r *http.Request // Validate device exists in robots table (if DB is configured) if h.db != nil { // Add independent 5s timeout to avoid blocking on slow DB queries - queryCtx, cancel := context.WithTimeout(r.Context(), 5*time.Second) + queryTimeout := 5 * time.Second + queryCtx, cancel := context.WithTimeout(r.Context(), queryTimeout) defer cancel() var count int @@ -150,7 +151,11 @@ func (h *RecorderHandler) HandleWebSocket(w http.ResponseWriter, r *http.Request if err := h.db.GetContext(queryCtx, &count, "SELECT COUNT(1) FROM robots WHERE device_id = ? AND deleted_at IS NULL", deviceID, ); err != nil { - logger.Printf("[RECORDER] Device %s: DB query error: %v", deviceID, err) + if errors.Is(err, context.DeadlineExceeded) || errors.Is(queryCtx.Err(), context.DeadlineExceeded) { + logger.Printf("[RECORDER] Device %s: DB query timeout after %s (timeout_ms=%d): %v", deviceID, timeoutLogValue(queryTimeout), timeoutLogMilliseconds(queryTimeout), err) + } else { + logger.Printf("[RECORDER] Device %s: DB query error: %v", deviceID, err) + } } // Check count regardless of DB error (count defaults to 0 on error) if count == 0 { diff --git a/internal/api/handlers/transfer.go b/internal/api/handlers/transfer.go index 4715a27..1e70ce5 100644 --- a/internal/api/handlers/transfer.go +++ b/internal/api/handlers/transfer.go @@ -96,7 +96,8 @@ func (h *TransferHandler) HandleWebSocket(w http.ResponseWriter, r *http.Request // Validate device exists in robots table (if DB is configured) if h.db != nil { // Add independent 5s timeout to avoid blocking on slow DB queries - queryCtx, cancel := context.WithTimeout(r.Context(), 5*time.Second) + queryTimeout := 5 * time.Second + queryCtx, cancel := context.WithTimeout(r.Context(), queryTimeout) defer cancel() var count int @@ -104,7 +105,11 @@ func (h *TransferHandler) HandleWebSocket(w http.ResponseWriter, r *http.Request if err := h.db.GetContext(queryCtx, &count, "SELECT COUNT(1) FROM robots WHERE device_id = ? AND deleted_at IS NULL", deviceID, ); err != nil { - logger.Printf("[TRANSFER] Device %s: DB query error: %v", deviceID, err) + if errors.Is(err, context.DeadlineExceeded) || errors.Is(queryCtx.Err(), context.DeadlineExceeded) { + logger.Printf("[TRANSFER] Device %s: DB query timeout after %s (timeout_ms=%d): %v", deviceID, timeoutLogValue(queryTimeout), timeoutLogMilliseconds(queryTimeout), err) + } else { + logger.Printf("[TRANSFER] Device %s: DB query error: %v", deviceID, err) + } } // Check count regardless of DB error (count defaults to 0 on error) if count == 0 { diff --git a/internal/cloud/auth_client.go b/internal/cloud/auth_client.go index 4bcc5b5..dcc4d7d 100644 --- a/internal/cloud/auth_client.go +++ b/internal/cloud/auth_client.go @@ -9,6 +9,7 @@ package cloud import ( "context" + "errors" "fmt" "strings" "sync" @@ -119,11 +120,17 @@ func (c *AuthClient) exchangeCredential(ctx context.Context, credentialBase64 st } client := pb.NewAuthServiceClient(conn) + startedAt := time.Now() resp, err := client.ExchangeCredential(ctx, &pb.ExchangeCredentialRequest{ CredentialBase64: credentialBase64, }) if err != nil { - logger.Printf("[CLOUD-AUTH] ExchangeCredential RPC failed, resetting gRPC connection: %v", err) + timeout := timeoutDuration(ctx, startedAt, 0) + if isTimeoutError(err) || errors.Is(ctx.Err(), context.DeadlineExceeded) { + logger.Printf("[CLOUD-AUTH] ExchangeCredential RPC timeout after %s (timeout_ms=%d), resetting gRPC connection: %v", timeoutLogValue(timeout), timeoutLogMilliseconds(timeout), err) + } else { + logger.Printf("[CLOUD-AUTH] ExchangeCredential RPC failed, resetting gRPC connection: %v", err) + } if closeErr := c.Close(); closeErr != nil { logger.Printf("[CLOUD-AUTH] failed to reset gRPC connection after RPC error: %v", closeErr) } else { diff --git a/internal/cloud/gateway_client.go b/internal/cloud/gateway_client.go index 454628f..c91d238 100644 --- a/internal/cloud/gateway_client.go +++ b/internal/cloud/gateway_client.go @@ -198,7 +198,11 @@ func (c *GatewayClient) AbortUpload(ctx context.Context, logicalUploadID string, if err != nil { // AbortUpload is best-effort: do not reset the shared gRPC connection on failure, // as that would disrupt subsequent normal uploads. Only log the error. - logger.Printf("[CLOUD-GATEWAY] AbortUpload RPC failed: %v", err) + if isTimeoutError(err) || errors.Is(rpcCtx.Err(), context.DeadlineExceeded) { + logger.Printf("[CLOUD-GATEWAY] AbortUpload RPC timeout after %s (timeout_ms=%d): %v", timeoutLogValue(c.cfg.RequestTimeout), timeoutLogMilliseconds(c.cfg.RequestTimeout), err) + } else { + logger.Printf("[CLOUD-GATEWAY] AbortUpload RPC failed: %v", err) + } return fmt.Errorf("AbortUpload RPC: %w", err) } return nil @@ -254,7 +258,11 @@ func (c *GatewayClient) doRPC(ctx context.Context, rpcName string, fn func(*grpc } // First attempt failed. Reset the stale connection and retry once with a fresh one. - logger.Printf("[CLOUD-GATEWAY] %s RPC failed, resetting connection and retrying: %v", rpcName, err) + if isTimeoutError(err) { + logger.Printf("[CLOUD-GATEWAY] %s RPC timeout after %s (timeout_ms=%d), resetting connection and retrying: %v", rpcName, timeoutLogValue(c.cfg.RequestTimeout), timeoutLogMilliseconds(c.cfg.RequestTimeout), err) + } else { + logger.Printf("[CLOUD-GATEWAY] %s RPC failed, resetting connection and retrying: %v", rpcName, err) + } if closeErr := c.Close(); closeErr != nil { logger.Printf("[CLOUD-GATEWAY] failed to close stale connection: %v", closeErr) } @@ -269,7 +277,11 @@ func (c *GatewayClient) doRPC(ctx context.Context, rpcName string, fn func(*grpc if err = fn(conn); err != nil { // Retry also failed — this is a genuine server-side error, not a stale connection. - logger.Printf("[CLOUD-GATEWAY] %s RPC failed after reconnect: %v", rpcName, err) + if isTimeoutError(err) { + logger.Printf("[CLOUD-GATEWAY] %s RPC timeout after %s (timeout_ms=%d) after reconnect: %v", rpcName, timeoutLogValue(c.cfg.RequestTimeout), timeoutLogMilliseconds(c.cfg.RequestTimeout), err) + } else { + logger.Printf("[CLOUD-GATEWAY] %s RPC failed after reconnect: %v", rpcName, err) + } if closeErr := c.Close(); closeErr != nil { logger.Printf("[CLOUD-GATEWAY] failed to close connection after retry failure: %v", closeErr) } @@ -287,6 +299,9 @@ func (c *GatewayClient) getAuthHeader(ctx context.Context) (string, error) { defer authCancel() header, err := c.authClient.GetAuthHeader(authCtx) if err != nil { + if isTimeoutError(err) || errors.Is(authCtx.Err(), context.DeadlineExceeded) { + logger.Printf("[CLOUD-GATEWAY] Auth header acquisition timeout after %s (timeout_ms=%d): %v", timeoutLogValue(c.cfg.RequestTimeout), timeoutLogMilliseconds(c.cfg.RequestTimeout), err) + } return "", fmt.Errorf("get auth header: %w", err) } return header, nil diff --git a/internal/cloud/oss_uploader.go b/internal/cloud/oss_uploader.go index 869c68e..797b676 100644 --- a/internal/cloud/oss_uploader.go +++ b/internal/cloud/oss_uploader.go @@ -223,8 +223,14 @@ func (u *OSSUploader) sendRequest(ctx context.Context, session *UploadSession, m } // #nosec G704 -- URL is OSS endpoint from authenticated DataGateway gRPC upload session + startedAt := time.Now() resp, err := u.httpClient.Do(req) if err != nil { + if isTimeoutError(err) || errors.Is(ctx.Err(), context.DeadlineExceeded) { + timeout := timeoutDuration(ctx, startedAt, u.httpClient.Timeout) + logger.Printf("[OSS] %s %s timeout after %s (timeout_ms=%d): %v", method, objectKey, timeoutLogValue(timeout), timeoutLogMilliseconds(timeout), err) + return nil, fmt.Errorf("http request timeout after %s: %w", timeoutLogValue(timeout), err) + } return nil, fmt.Errorf("http request: %w", err) } diff --git a/internal/cloud/timeout_log.go b/internal/cloud/timeout_log.go new file mode 100644 index 0000000..e16857c --- /dev/null +++ b/internal/cloud/timeout_log.go @@ -0,0 +1,58 @@ +// SPDX-FileCopyrightText: 2026 ArcheBase +// +// SPDX-License-Identifier: MulanPSL-2.0 + +package cloud + +import ( + "context" + "errors" + "time" + + "google.golang.org/grpc/codes" + "google.golang.org/grpc/status" +) + +func isTimeoutError(err error) bool { + if err == nil { + return false + } + if errors.Is(err, context.DeadlineExceeded) { + return true + } + if st, ok := status.FromError(err); ok && st.Code() == codes.DeadlineExceeded { + return true + } + var timeoutErr interface{ Timeout() bool } + return errors.As(err, &timeoutErr) && timeoutErr.Timeout() +} + +func timeoutDuration(ctx context.Context, start time.Time, configured time.Duration) time.Duration { + timeout := configured + if deadline, ok := ctx.Deadline(); ok { + if d := deadline.Sub(start); d > 0 && (timeout <= 0 || d < timeout) { + timeout = d.Round(time.Millisecond) + } + } + if timeout > 0 { + return timeout + } + if !start.IsZero() { + return time.Since(start).Round(time.Millisecond) + } + return 0 +} + +func timeoutLogValue(timeout time.Duration) string { + if timeout <= 0 { + return "unknown" + } + return timeout.String() +} + +func timeoutLogMilliseconds(timeout time.Duration) int64 { + if timeout <= 0 { + return 0 + } + return timeout.Milliseconds() +} diff --git a/internal/server/server.go b/internal/server/server.go index 5330a34..ff908c0 100644 --- a/internal/server/server.go +++ b/internal/server/server.go @@ -7,6 +7,7 @@ package server import ( "context" + "errors" "fmt" "net" "net/http" @@ -432,29 +433,45 @@ func (s *Server) Shutdown(ctx context.Context) error { s.isRunning = false s.shutdownMu.Unlock() - logger.Printf("[SERVER] Shutting down HTTP server") + startedAt := time.Now() + shutdownTimeout := time.Duration(s.cfg.Server.ShutdownTimeout) * time.Second + effectiveShutdownTimeout := shutdownTimeout + if deadline, ok := ctx.Deadline(); ok { + if d := deadline.Sub(startedAt); d > 0 && (effectiveShutdownTimeout <= 0 || d < effectiveShutdownTimeout) { + effectiveShutdownTimeout = d.Round(time.Millisecond) + } + } + logger.Printf("[SERVER] Shutting down HTTP server (timeout=%s timeout_ms=%d)", effectiveShutdownTimeout, effectiveShutdownTimeout.Milliseconds()) - ctx, cancel := context.WithTimeout(ctx, time.Duration(s.cfg.Server.ShutdownTimeout)*time.Second) + ctx, cancel := context.WithTimeout(ctx, shutdownTimeout) defer cancel() + logShutdownError := func(component string, err error) { + if errors.Is(err, context.DeadlineExceeded) || errors.Is(ctx.Err(), context.DeadlineExceeded) { + logger.Printf("[SERVER] %s shutdown timeout after %s (timeout_ms=%d): %v", component, effectiveShutdownTimeout, effectiveShutdownTimeout.Milliseconds(), err) + return + } + logger.Printf("[SERVER] %s shutdown error: %v", component, err) + } + var shutdownErr error // Shutdown both servers if err := s.httpServer.Shutdown(ctx); err != nil { - logger.Printf("[SERVER] HTTP server shutdown error: %v", err) + logShutdownError("HTTP server", err) if shutdownErr == nil { shutdownErr = fmt.Errorf("http server shutdown: %w", err) } } if err := s.transferWSServer.Shutdown(ctx); err != nil { - logger.Printf("[SERVER] Transfer WebSocket server shutdown error: %v", err) + logShutdownError("Transfer WebSocket server", err) if shutdownErr == nil { shutdownErr = fmt.Errorf("transfer websocket shutdown: %w", err) } } if s.recorderWSServer != nil { if err := s.recorderWSServer.Shutdown(ctx); err != nil { - logger.Printf("[SERVER] Recorder WebSocket server shutdown error: %v", err) + logShutdownError("Recorder WebSocket server", err) if shutdownErr == nil { shutdownErr = fmt.Errorf("recorder websocket shutdown: %w", err) } @@ -464,7 +481,7 @@ func (s *Server) Shutdown(ctx context.Context) error { // Stop sync worker if s.syncWorker != nil { if err := s.syncWorker.Stop(ctx); err != nil { - logger.Printf("[SERVER] Sync worker shutdown error: %v", err) + logShutdownError("Sync worker", err) if shutdownErr == nil { shutdownErr = fmt.Errorf("sync worker shutdown: %w", err) } diff --git a/internal/services/sync_worker.go b/internal/services/sync_worker.go index 46e93d9..0ecce3e 100644 --- a/internal/services/sync_worker.go +++ b/internal/services/sync_worker.go @@ -134,6 +134,7 @@ func (w *SyncWorker) Start() { // Stop gracefully stops the sync worker within the provided context deadline. func (w *SyncWorker) Stop(ctx context.Context) error { + startedAt := time.Now() w.mu.Lock() if !w.running.Load() { done := w.stopDone @@ -145,7 +146,7 @@ func (w *SyncWorker) Stop(ctx context.Context) error { case <-done: return nil case <-ctx.Done(): - return fmt.Errorf("sync worker stop timeout: %w", ctx.Err()) + return syncWorkerStopTimeoutError(ctx, startedAt) } } @@ -159,7 +160,7 @@ func (w *SyncWorker) Stop(ctx context.Context) error { case <-done: return nil case <-ctx.Done(): - return fmt.Errorf("sync worker stop timeout: %w", ctx.Err()) + return syncWorkerStopTimeoutError(ctx, startedAt) } } @@ -181,10 +182,21 @@ func (w *SyncWorker) Stop(ctx context.Context) error { logger.Println("[SYNC-WORKER] Stopped") return nil case <-ctx.Done(): - return fmt.Errorf("sync worker stop timeout: %w", ctx.Err()) + return syncWorkerStopTimeoutError(ctx, startedAt) } } +func syncWorkerStopTimeoutError(ctx context.Context, startedAt time.Time) error { + timeout := time.Since(startedAt).Round(time.Millisecond) + if deadline, ok := ctx.Deadline(); ok { + if d := deadline.Sub(startedAt); d > 0 { + timeout = d.Round(time.Millisecond) + } + } + logger.Printf("[SYNC-WORKER] Stop timeout after %s (timeout_ms=%d): %v", timeout, timeout.Milliseconds(), ctx.Err()) + return fmt.Errorf("sync worker stop timeout after %s: %w", timeout, ctx.Err()) +} + // IsRunning returns whether the worker is currently running. func (w *SyncWorker) IsRunning() bool { return w.running.Load() From 1038fd3749db37638dcde98209438689aba032e9 Mon Sep 17 00:00:00 2001 From: chaoliu Date: Fri, 29 May 2026 13:44:29 +0800 Subject: [PATCH 11/12] Add cloud sync auto scan toggle --- ARCHITECTURE.md | 27 +++-- README.md | 10 ++ cmd/keystone-edge/main.go | 17 +-- .../cloud-sync-persistent-queue-design.md | 66 +++++++++++- docs/designs/cloud-sync-ui-implementation.md | 44 +++++++- internal/api/handlers/sync.go | 7 +- internal/api/handlers/sync_test.go | 40 +++++++ internal/config/config.go | 8 +- internal/config/config_test.go | 24 +++-- internal/services/sync_worker.go | 28 +++-- internal/services/sync_worker_test.go | 102 ++++++++++++++++++ 11 files changed, 330 insertions(+), 43 deletions(-) diff --git a/ARCHITECTURE.md b/ARCHITECTURE.md index 56568a2..aa57ea9 100644 --- a/ARCHITECTURE.md +++ b/ARCHITECTURE.md @@ -254,14 +254,23 @@ Auto-Approve Route to Inspector ### Sync to Cloud -Only **approved episodes** (QA score ≥ 90% or inspector-approved) are synced to cloud: +Only **approved episodes** (QA score >= 90% or inspector-approved) are eligible for cloud sync: ``` Edge MinIO ══(push)══► Cloud S3 ``` -Sync is handled via `POST /api/v1/sync/episode` with: -- Sidecar JSON (metadata) +Keystone should default to manual cloud sync scheduling. `KEYSTONE_SYNC_ENABLED` +controls whether cloud sync capability and the worker are available. +`KEYSTONE_SYNC_AUTO_SCAN_ENABLED` controls whether the worker periodically +discovers newly eligible approved unsynced episodes, and its default is +`false`. With the default setting, recorded data remains in edge MinIO until an +admin manually syncs one episode or explicitly scans and queues eligible +episodes. + +Sync upload is executed by the edge `SyncWorker` through the cloud data gateway +with: +- Sidecar JSON metadata converted to raw tags - MCAP file (raw sensor data) - SHA-256 checksums for integrity validation @@ -376,10 +385,14 @@ EDGE (Source of Truth) CLOUD (Eventually Consistent Replica) **After network restored**: -1. Edge sync worker resumes -2. Queued episodes uploaded (`POST /api/v1/sync/episode`) -3. Cloud processes each asynchronously (returns `202 Accepted`) -4. Cloud DB becomes eventually consistent with edge +1. Edge sync worker resumes and drains persisted `pending` sync rows. +2. Already queued or retryable sync jobs continue according to retry policy. +3. If `KEYSTONE_SYNC_AUTO_SCAN_ENABLED=true`, newly eligible approved unsynced + episodes are discovered and queued automatically. +4. If automatic discovery is disabled, new local episodes remain unsynced until + an admin manually syncs one episode or explicitly scans eligible episodes. +5. Cloud processes each upload asynchronously and cloud DB becomes eventually + consistent with edge. ### Sync Guarantees diff --git a/README.md b/README.md index d1158c3..6c64b85 100644 --- a/README.md +++ b/README.md @@ -76,6 +76,8 @@ Configuration is loaded from environment variables. See [`docker/.env.example`]( | `KEYSTONE_MINIO_ENDPOINT` | `http://localhost:9000` | MinIO endpoint | | `KEYSTONE_MYSQL_HOST` | `localhost` | MySQL host | | `KEYSTONE_MYSQL_PASSWORD` | *required* | MySQL password | +| `KEYSTONE_SYNC_ENABLED` | `true` | Enable cloud sync capability, worker, and manual sync APIs when cloud endpoints and credentials are configured | +| `KEYSTONE_SYNC_AUTO_SCAN_ENABLED` | `false` | Enable periodic automatic discovery of newly eligible approved unsynced episodes | ### Cloud Sync Credentials @@ -85,6 +87,14 @@ to `AuthService.ExchangeCredential` as `credential_base64`. Keystone does not decode, split, validate, or derive `site_id` / secret values from this key; the cloud AuthService owns credential interpretation and validation. +Cloud sync capability and automatic scheduling are separate. Keep +`KEYSTONE_SYNC_ENABLED=true` when admins should be able to manually sync data to +cloud. Leave `KEYSTONE_SYNC_AUTO_SCAN_ENABLED=false` for the default manual-only +mode, where newly recorded or newly approved episodes remain local until an +admin triggers single-episode sync or an explicit batch scan. Set +`KEYSTONE_SYNC_AUTO_SCAN_ENABLED=true` only when the site should automatically +queue every newly eligible approved unsynced episode. + ## Project Structure ``` diff --git a/cmd/keystone-edge/main.go b/cmd/keystone-edge/main.go index 2dfad89..7164bff 100644 --- a/cmd/keystone-edge/main.go +++ b/cmd/keystone-edge/main.go @@ -155,17 +155,18 @@ func main() { } syncWorker = services.NewSyncWorker(db.DB, uploader, s3Client, cfg.Storage.Bucket, services.SyncWorkerConfig{ - BatchSize: cfg.Sync.BatchSize, - MaxConcurrent: cfg.Sync.MaxConcurrent, - MaxRetries: cfg.Sync.MaxRetries, - IntervalSec: cfg.Sync.WorkerIntervalSec, - RetryBaseSec: cfg.Sync.RetryBaseSec, - RetryMaxSec: cfg.Sync.RetryMaxSec, - RetryJitterSec: cfg.Sync.RetryJitterSec, + BatchSize: cfg.Sync.BatchSize, + MaxConcurrent: cfg.Sync.MaxConcurrent, + MaxRetries: cfg.Sync.MaxRetries, + AutoScanEnabled: cfg.Sync.AutoScanEnabled, + IntervalSec: cfg.Sync.WorkerIntervalSec, + RetryBaseSec: cfg.Sync.RetryBaseSec, + RetryMaxSec: cfg.Sync.RetryMaxSec, + RetryJitterSec: cfg.Sync.RetryJitterSec, }, &cfg.Sync) syncWorker.Start() - logger.Printf("[SYNC] Cloud sync worker started: auth=%s gateway=%s", cfg.Sync.AuthEndpoint, cfg.Sync.GatewayEndpoint) + logger.Printf("[SYNC] Cloud sync worker started: auth=%s gateway=%s auto_scan=%t", cfg.Sync.AuthEndpoint, cfg.Sync.GatewayEndpoint, cfg.Sync.AutoScanEnabled) } else { logger.Println("[SYNC] Cloud sync disabled (KEYSTONE_SYNC_ENABLED=false or missing endpoints)") } diff --git a/docs/designs/cloud-sync-persistent-queue-design.md b/docs/designs/cloud-sync-persistent-queue-design.md index b97e4fa..0e9f678 100644 --- a/docs/designs/cloud-sync-persistent-queue-design.md +++ b/docs/designs/cloud-sync-persistent-queue-design.md @@ -76,7 +76,25 @@ The polling worker periodically: Automatic retry also does not create an observable queued period. -### 2.3 Frontend Listing +### 2.3 Automatic Discovery + +The worker also has a discovery path that finds approved, unsynced episodes and +enqueues them without an operator action. That behavior should be controlled by +`KEYSTONE_SYNC_AUTO_SCAN_ENABLED` and should default to `false`. + +With the default disabled setting: + +- newly recorded or newly approved episodes remain local after upload/QA; +- single-episode manual sync still works through `POST /api/v1/sync/episodes/:id`; +- explicit batch scan still works through `POST /api/v1/sync/episodes`; +- persisted `pending` rows and due retries from already attempted sync jobs are + still processed by the worker. + +This separates automatic discovery from retry/backoff recovery. A failed manual +sync can continue retrying according to sync retry policy without re-enabling +automatic discovery for unrelated new episodes. + +### 2.4 Frontend Listing Cloud Sync Center lists one row per episode using the latest `sync_logs` row. Its queued count is the number of latest rows whose status is `pending`. @@ -90,6 +108,8 @@ Therefore, a job that only exists in memory cannot be counted as queued. - Preserve manual retry semantics: operators can retry exhausted or backoff failures explicitly. - Preserve automatic retry limits and backoff behavior. +- Disable automatic discovery of newly eligible episodes by default while keeping + manual enqueue and retry recovery available. - Allow queued work to recover after Keystone restarts. - Keep the change scoped to the existing `sync_logs` model unless stronger queue semantics become necessary later. @@ -164,10 +184,12 @@ The worker loop should process DB-backed queued work before discovering new work 1. Dispatch latest `pending` sync logs. 2. Promote due failed rows to `pending`, then dispatch them. -3. Discover approved, unsynced episodes with no active/latest completed log. +3. Discover approved, unsynced episodes with no active/latest completed log only + when automatic discovery is enabled. This makes the queue restart-safe. If Keystone crashes after writing `pending` but before placing the job into memory, the next polling cycle will pick it up. +Disabling automatic discovery must not disable this pending-row recovery path. ### 5.4 Claiming Pending Rows @@ -229,6 +251,24 @@ After a successful manual retry: The frontend should continue treating the backend summary endpoint as the source of truth. +### 5.7 Automatic Discovery Switch + +Add a worker configuration field backed by an environment variable: + +| Field | Environment Variable | Default | Meaning | +|-------|----------------------|---------|---------| +| `AutoScanEnabled` | `KEYSTONE_SYNC_AUTO_SCAN_ENABLED` | `false` | Permit periodic discovery of newly eligible approved unsynced episodes | + +`KEYSTONE_SYNC_ENABLED` remains the cloud sync capability switch. Setting it to +`false` disables the worker and therefore disables manual sync APIs. To support +manual-only production mode, keep `KEYSTONE_SYNC_ENABLED=true` and leave +`KEYSTONE_SYNC_AUTO_SCAN_ENABLED=false`. + +The worker polling loop should be structured so that this switch only gates the +new-episode discovery step. It should not gate dispatch of existing `pending` +rows, manual enqueue acceleration, or retry/backoff handling for already +attempted sync work. + ## 6. Risks and Mitigations ### 6.1 Duplicate Pending Rows @@ -308,6 +348,16 @@ Mitigation: ## 7. Implementation Plan +### Phase 0: Automatic Discovery Control + +- Add `KEYSTONE_SYNC_AUTO_SCAN_ENABLED` with default `false`. +- Add matching `SyncConfig.AutoScanEnabled` and worker config fields. +- Return `auto_scan_enabled` from `GET /api/v1/sync/config` as read-only + runtime state. +- Gate only the worker's newly eligible episode discovery path. +- Keep manual single-episode sync, explicit batch scan, pending-row recovery, + and retry/backoff processing available while the worker is running. + ### Phase 1: Durable Manual Queue - Add a worker method to create or reuse a pending sync log transactionally. @@ -347,6 +397,12 @@ Retry 4 failed episodes with MaxConcurrent=2 ### Backend Unit Tests +- Worker polling does not auto-discover approved unsynced episodes when + `AutoScanEnabled=false`. +- Worker polling still dispatches persisted pending rows when + `AutoScanEnabled=false`. +- Worker polling auto-discovers approved unsynced episodes when + `AutoScanEnabled=true`. - Manual retry creates a pending row for a failed exhausted episode. - Manual retry rejects an episode whose latest row is pending. - Manual retry rejects an episode whose latest row is in_progress. @@ -374,8 +430,10 @@ Retry 4 failed episodes with MaxConcurrent=2 ## 9. Recommendation -Implement Phase 1 first. It fixes the operator-visible problem with a small, -contained backend change and does not require a new database table. +Implement Phase 0 and Phase 1 first. Phase 0 prevents newly recorded data from +automatically leaving the edge by default, while Phase 1 preserves clear manual +queue visibility. Both changes are small, contained backend changes and do not +require a new database table. Do not ship durable pending without DB-backed duplicate protection and polling recovery. Those two pieces are required for correctness; otherwise the system diff --git a/docs/designs/cloud-sync-ui-implementation.md b/docs/designs/cloud-sync-ui-implementation.md index 9439228..31877cd 100644 --- a/docs/designs/cloud-sync-ui-implementation.md +++ b/docs/designs/cloud-sync-ui-implementation.md @@ -34,6 +34,11 @@ Synapse. Data production statistics should keep cloud sync as a metric only, but should not be the primary place for queue control, retry diagnosis, navigation, or future sync operations. +Cloud sync scheduling should be explicit by default. Keystone should keep cloud +sync capability available when configured, but automatic discovery of newly +approved unsynced episodes should default to off. Operators or admins can still +manually enqueue one episode or explicitly scan and enqueue eligible episodes. + ## 2. Current Backend Capability ### 2.1 Existing Cloud Sync APIs @@ -43,7 +48,7 @@ endpoints recommended for the episode-centered Cloud Sync Center redesign: | Method | Path | Purpose | |--------|------|---------| -| `POST` | `/api/v1/sync/episodes` | Enqueue pending approved, unsynced episodes for cloud sync | +| `POST` | `/api/v1/sync/episodes` | Manually scan and enqueue approved, unsynced episodes for cloud sync | | `POST` | `/api/v1/sync/episodes/:id` | Enqueue one episode for cloud sync by numeric episode ID | | `GET` | `/api/v1/sync/episodes` | Existing: list raw sync log entries for history/diagnosis | | `GET` | `/api/v1/sync/episodes/summary` | Recommended new endpoint: list latest sync state grouped by episode | @@ -77,7 +82,33 @@ from local MinIO, uploads it through the cloud DataGateway/OSS flow, and updates - `episodes.cloud_mcap_path` - `episodes.cloud_processed` -### 2.3 Manual Trigger Semantics +### 2.3 Scheduling Modes + +Keystone should separate two concerns: + +| Concern | Configuration | Default | Behavior | +|---------|---------------|---------|----------| +| Cloud sync capability | `KEYSTONE_SYNC_ENABLED` | `true` | Creates the cloud clients, `SyncWorker`, sync APIs, and manual enqueue path when cloud endpoints and credentials are present | +| Automatic episode discovery | `KEYSTONE_SYNC_AUTO_SCAN_ENABLED` | `false` | When enabled, the worker periodically finds newly eligible approved unsynced episodes and enqueues them without an operator action | + +When `KEYSTONE_SYNC_AUTO_SCAN_ENABLED=false`, Keystone must not automatically +enqueue newly completed or newly approved episodes just because they match the +eligibility rules. This is the recommended default for controlled production +sites where data should remain local until an admin explicitly syncs it. + +Manual sync remains available while `KEYSTONE_SYNC_ENABLED=true` and the worker +is running: + +- `POST /api/v1/sync/episodes/:id` enqueues one approved unsynced episode. +- `POST /api/v1/sync/episodes` explicitly scans and enqueues all currently + eligible approved unsynced episodes. + +Automatic retry is separate from automatic discovery. Once an episode has been +manually enqueued and a `sync_logs` row exists, retry/backoff may continue using +the existing retry rules. Disabling automatic discovery should not strand +already queued or already attempted sync work. + +### 2.4 Manual Trigger Semantics The existing manual APIs have different retry behavior. @@ -110,7 +141,7 @@ Behavior: For product clarity, the current batch API should be presented as "scan and enqueue eligible pending episodes", not as "force retry all failures". -### 2.4 Listing Model +### 2.5 Listing Model `sync_logs` is an audit-style attempt-chain table. One episode may have multiple rows because manual retry can create a fresh `sync_logs` row when the latest @@ -138,6 +169,7 @@ as an expandable row, drawer, or "View history" action. | Admin | Retry failed cloud sync jobs safely | | Admin | Understand why a sync failed | | Admin | Trigger sync for all eligible unsynced episodes | +| Admin | Keep newly recorded local data from syncing automatically by default | | Operator | See whether collected data is available, without managing retries | ### 3.2 Design Goals @@ -147,6 +179,7 @@ as an expandable row, drawer, or "View history" action. - Avoid fake progress bars because Keystone does not expose upload percentage. - Make failure recovery explicit and auditable. - Keep destructive or expensive bulk actions deliberate. +- Make automatic discovery opt-in; default operation is manual enqueue. - Preserve current backend behavior while identifying API gaps for follow-up. - Keep the data production statistics page focused on analysis, not operations. @@ -243,7 +276,7 @@ Primary actions: | Scan and queue eligible unsynced episodes | `POST /api/v1/sync/episodes` | Confirm before enqueue, then refresh counts/table | | Retry one failed episode | `POST /api/v1/sync/episodes/:id` | Row-level action, then refresh the row/list | | Retry failed episodes in bulk | Not explicit yet | TODO; requires backend API semantics | -| Pause/resume automatic sync | Not supported yet | TODO; do not show as active control | +| Enable/disable automatic discovery | Config only | Default off; show read-only state, do not expose as a live UI toggle | | Edit sync configuration | Not supported yet | TODO; read-only summary for now | The page should make queue state explicit: @@ -730,6 +763,7 @@ TODO: extend `GET /sync/config` with non-sensitive runtime values: ```json { "worker_running": true, + "auto_scan_enabled": false, "batch_size": 10, "max_concurrent": 2, "max_retries": 5, @@ -822,6 +856,7 @@ Acceptance criteria: - Add latest sync summary to episode list/detail responses. - Add batch trigger body with `force` and filters. - Extend sync config response with read-only non-sensitive values. +- Add `auto_scan_enabled` to sync config response. - Add admin auth guard for sync APIs. Acceptance criteria: @@ -841,3 +876,4 @@ Acceptance criteria: | Should batch sync use current statistics filters? | Only after backend supports filter parameters | | Should long-running uploads show percentage? | No; backend does not expose progress | | Should `.env` sync config be editable in Synapse? | Not now; keep as TODO after read-only config visibility | +| Should automatic discovery run by default? | No; default `KEYSTONE_SYNC_AUTO_SCAN_ENABLED=false` | diff --git a/internal/api/handlers/sync.go b/internal/api/handlers/sync.go index 367c60d..9d1694c 100644 --- a/internal/api/handlers/sync.go +++ b/internal/api/handlers/sync.go @@ -596,14 +596,17 @@ func (h *SyncHandler) GetSyncStatus(c *gin.Context) { // @Router /sync/config [get] func (h *SyncHandler) GetSyncConfig(c *gin.Context) { workerRunning := false + autoScanEnabled := false maxRetries := 0 if h.syncWorker != nil { workerRunning = h.syncWorker.IsRunning() + autoScanEnabled = h.syncWorker.AutoScanEnabled() maxRetries = h.syncWorker.MaxRetries() } c.JSON(http.StatusOK, gin.H{ - "worker_running": workerRunning, - "max_retries": maxRetries, + "worker_running": workerRunning, + "auto_scan_enabled": autoScanEnabled, + "max_retries": maxRetries, }) } diff --git a/internal/api/handlers/sync_test.go b/internal/api/handlers/sync_test.go index 0c34502..a825937 100644 --- a/internal/api/handlers/sync_test.go +++ b/internal/api/handlers/sync_test.go @@ -11,6 +11,8 @@ import ( "testing" "time" + "archebase.com/keystone-edge/internal/services" + "github.com/gin-gonic/gin" "github.com/jmoiron/sqlx" _ "modernc.org/sqlite" @@ -26,6 +28,44 @@ func TestSyncHandlerRegisterRoutes(t *testing.T) { handler.RegisterRoutes(api) } +func TestGetSyncConfigIncludesAutoScanEnabled(t *testing.T) { + gin.SetMode(gin.TestMode) + + worker := services.NewSyncWorker(nil, nil, nil, "", services.SyncWorkerConfig{ + MaxRetries: 5, + AutoScanEnabled: true, + }, nil) + router := gin.New() + handler := NewSyncHandler(nil, worker) + handler.RegisterRoutes(router.Group("/api/v1")) + + req := httptest.NewRequest(http.MethodGet, "/api/v1/sync/config", nil) + rec := httptest.NewRecorder() + router.ServeHTTP(rec, req) + + if rec.Code != http.StatusOK { + t.Fatalf("status = %d, body = %s", rec.Code, rec.Body.String()) + } + + var got struct { + WorkerRunning bool `json:"worker_running"` + AutoScanEnabled bool `json:"auto_scan_enabled"` + MaxRetries int `json:"max_retries"` + } + if err := json.Unmarshal(rec.Body.Bytes(), &got); err != nil { + t.Fatalf("decode response: %v", err) + } + if got.WorkerRunning { + t.Fatal("worker_running = true, want false for not-started test worker") + } + if !got.AutoScanEnabled { + t.Fatal("auto_scan_enabled = false, want true") + } + if got.MaxRetries != 5 { + t.Fatalf("max_retries = %d, want 5", got.MaxRetries) + } +} + func TestListEpisodeSyncSummariesGroupsByEpisode(t *testing.T) { gin.SetMode(gin.TestMode) db := setupSyncHandlerTestDB(t) diff --git a/internal/config/config.go b/internal/config/config.go index 6b76d88..0f1a635 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -66,9 +66,10 @@ type QAConfig struct { // SyncConfig synchronization configuration type SyncConfig struct { - Enabled bool - BatchSize int - MaxRetries int + Enabled bool + AutoScanEnabled bool + BatchSize int + MaxRetries int // Cloud upload settings (data-platform integration) AuthEndpoint string // gRPC endpoint for AuthService @@ -184,6 +185,7 @@ func Load() (*Config, error) { }, Sync: SyncConfig{ Enabled: getEnvBool("KEYSTONE_SYNC_ENABLED", true), + AutoScanEnabled: getEnvBool("KEYSTONE_SYNC_AUTO_SCAN_ENABLED", false), BatchSize: getEnvInt("KEYSTONE_SYNC_BATCH_SIZE", 10), MaxRetries: getEnvInt("KEYSTONE_SYNC_MAX_RETRIES", 5), AuthEndpoint: getEnv("KEYSTONE_CLOUD_AUTH_ENDPOINT", ""), diff --git a/internal/config/config_test.go b/internal/config/config_test.go index 7d9532d..dd918e6 100644 --- a/internal/config/config_test.go +++ b/internal/config/config_test.go @@ -14,12 +14,13 @@ import ( func TestLoad(t *testing.T) { // Save original environment variables originalEnv := map[string]string{ - "KEYSTONE_MODE": os.Getenv("KEYSTONE_MODE"), - "KEYSTONE_MYSQL_HOST": os.Getenv("KEYSTONE_MYSQL_HOST"), - "KEYSTONE_MYSQL_PASSWORD": os.Getenv("KEYSTONE_MYSQL_PASSWORD"), - "KEYSTONE_MINIO_ACCESS_KEY": os.Getenv("KEYSTONE_MINIO_ACCESS_KEY"), - "KEYSTONE_MINIO_SECRET_KEY": os.Getenv("KEYSTONE_MINIO_SECRET_KEY"), - "KEYSTONE_FACTORY_ID": os.Getenv("KEYSTONE_FACTORY_ID"), + "KEYSTONE_MODE": os.Getenv("KEYSTONE_MODE"), + "KEYSTONE_MYSQL_HOST": os.Getenv("KEYSTONE_MYSQL_HOST"), + "KEYSTONE_MYSQL_PASSWORD": os.Getenv("KEYSTONE_MYSQL_PASSWORD"), + "KEYSTONE_MINIO_ACCESS_KEY": os.Getenv("KEYSTONE_MINIO_ACCESS_KEY"), + "KEYSTONE_MINIO_SECRET_KEY": os.Getenv("KEYSTONE_MINIO_SECRET_KEY"), + "KEYSTONE_FACTORY_ID": os.Getenv("KEYSTONE_FACTORY_ID"), + "KEYSTONE_SYNC_AUTO_SCAN_ENABLED": os.Getenv("KEYSTONE_SYNC_AUTO_SCAN_ENABLED"), } defer func() { // Restore original environment variables @@ -33,6 +34,7 @@ func TestLoad(t *testing.T) { }() // Set test environment variables + os.Unsetenv("KEYSTONE_SYNC_AUTO_SCAN_ENABLED") os.Setenv("KEYSTONE_MYSQL_PASSWORD", "test-password") os.Setenv("KEYSTONE_MINIO_ACCESS_KEY", "test-access-key") os.Setenv("KEYSTONE_MINIO_SECRET_KEY", "test-secret-key") @@ -64,6 +66,10 @@ func TestLoad(t *testing.T) { t.Errorf("Load().Storage.Bucket = %v, want edge-factory-test", cfg.Storage.Bucket) } + if cfg.Sync.AutoScanEnabled { + t.Error("Load().Sync.AutoScanEnabled should default to false") + } + // Verify QA configuration if !cfg.QA.Enabled { t.Error("Load().QA.Enabled should default to true") @@ -98,6 +104,7 @@ func TestLoadWithCustomEnv(t *testing.T) { "KEYSTONE_QA_MAX_WORKERS": os.Getenv("KEYSTONE_QA_MAX_WORKERS"), "KEYSTONE_MAX_MEMORY_MB": os.Getenv("KEYSTONE_MAX_MEMORY_MB"), "KEYSTONE_DASHBOARD_DISPLAY_TOKEN": os.Getenv("KEYSTONE_DASHBOARD_DISPLAY_TOKEN"), + "KEYSTONE_SYNC_AUTO_SCAN_ENABLED": os.Getenv("KEYSTONE_SYNC_AUTO_SCAN_ENABLED"), } defer func() { for k, v := range originalEnv { @@ -118,6 +125,7 @@ func TestLoadWithCustomEnv(t *testing.T) { os.Setenv("KEYSTONE_QA_MAX_WORKERS", "8") os.Setenv("KEYSTONE_MAX_MEMORY_MB", "8192") os.Setenv("KEYSTONE_DASHBOARD_DISPLAY_TOKEN", "display-secret") + os.Setenv("KEYSTONE_SYNC_AUTO_SCAN_ENABLED", "true") cfg, err := Load() if err != nil { @@ -139,6 +147,10 @@ func TestLoadWithCustomEnv(t *testing.T) { if cfg.Auth.DashboardDisplayToken != "display-secret" { t.Errorf("Load().Auth.DashboardDisplayToken = %q, want display-secret", cfg.Auth.DashboardDisplayToken) } + + if !cfg.Sync.AutoScanEnabled { + t.Error("Load().Sync.AutoScanEnabled = false, want true") + } } func TestConfigValidate(t *testing.T) { diff --git a/internal/services/sync_worker.go b/internal/services/sync_worker.go index 0ecce3e..f9971bb 100644 --- a/internal/services/sync_worker.go +++ b/internal/services/sync_worker.go @@ -27,13 +27,14 @@ import ( // SyncWorkerConfig provides the runtime configuration for the sync worker. type SyncWorkerConfig struct { - BatchSize int - MaxConcurrent int - MaxRetries int - IntervalSec int - RetryBaseSec int - RetryMaxSec int - RetryJitterSec int + BatchSize int + MaxConcurrent int + MaxRetries int + AutoScanEnabled bool + IntervalSec int + RetryBaseSec int + RetryMaxSec int + RetryJitterSec int } type syncEnqueueRequest struct { @@ -41,8 +42,8 @@ type syncEnqueueRequest struct { manual bool } -// SyncWorker is a background goroutine that periodically scans for approved -// episodes and uploads them to cloud via the data-platform gateway. +// SyncWorker is a background goroutine that processes queued cloud sync work +// and optionally discovers approved episodes for automatic cloud upload. type SyncWorker struct { db *sqlx.DB uploader *cloud.Uploader @@ -207,6 +208,11 @@ func (w *SyncWorker) MaxRetries() int { return w.cfg.MaxRetries } +// AutoScanEnabled returns whether the worker periodically discovers newly eligible episodes. +func (w *SyncWorker) AutoScanEnabled() bool { + return w.cfg.AutoScanEnabled +} + // EnqueueEpisode adds a specific episode ID for immediate sync processing. func (w *SyncWorker) EnqueueEpisode(ctx context.Context, episodeID int64) error { return w.enqueueEpisode(ctx, episodeID, false) @@ -566,6 +572,10 @@ func (w *SyncWorker) pollAndProcess(ctx context.Context) { // Then, retry any failed episodes that are due. w.retryFailedEpisodes(ctx) + if !w.cfg.AutoScanEnabled { + return + } + // Finally, find newly eligible episodes and persist them as queued work. ids, err := w.findPendingEpisodes(ctx, false) if err != nil { diff --git a/internal/services/sync_worker_test.go b/internal/services/sync_worker_test.go index 2a34575..02d3151 100644 --- a/internal/services/sync_worker_test.go +++ b/internal/services/sync_worker_test.go @@ -315,6 +315,108 @@ func TestDispatchPendingSyncLogs_DispatchesPersistedRows(t *testing.T) { } } +func TestPollAndProcess_SkipsAutoDiscoveryWhenDisabled(t *testing.T) { + db := newTestSyncWorkerDB(t) + w := &SyncWorker{ + db: db, + cfg: SyncWorkerConfig{BatchSize: 10, MaxRetries: 3, AutoScanEnabled: false}, + jobCh: make(chan syncEnqueueRequest, 1), + enqueuedEpisode: make(map[int64]struct{}), + } + + insertEpisodeForSyncWorkerTest(t, db, 20, "approved", false) + + w.pollAndProcess(context.Background()) + + if count := countSyncLogsForSyncWorkerTest(t, db, 20); count != 0 { + t.Fatalf("sync log count = %d, want 0 when auto scan is disabled", count) + } + select { + case got := <-w.jobCh: + t.Fatalf("unexpected job dispatched with auto scan disabled: %+v", got) + default: + } +} + +func TestPollAndProcess_DispatchesPendingRowsWhenAutoDiscoveryDisabled(t *testing.T) { + db := newTestSyncWorkerDB(t) + w := &SyncWorker{ + db: db, + cfg: SyncWorkerConfig{BatchSize: 10, MaxRetries: 3, AutoScanEnabled: false}, + jobCh: make(chan syncEnqueueRequest, 1), + enqueuedEpisode: make(map[int64]struct{}), + } + + insertEpisodeForSyncWorkerTest(t, db, 21, "approved", false) + insertSyncLogForSyncWorkerTest(t, db, 21, "pending", 0) + + w.pollAndProcess(context.Background()) + + select { + case got := <-w.jobCh: + if got.episodeID != 21 { + t.Fatalf("unexpected episode id: got %d want 21", got.episodeID) + } + default: + t.Fatal("expected pending row to be dispatched with auto scan disabled") + } +} + +func TestPollAndProcess_RetriesDueFailuresWhenAutoDiscoveryDisabled(t *testing.T) { + db := newTestSyncWorkerDB(t) + w := &SyncWorker{ + db: db, + cfg: SyncWorkerConfig{BatchSize: 10, MaxRetries: 3, AutoScanEnabled: false}, + jobCh: make(chan syncEnqueueRequest, 1), + enqueuedEpisode: make(map[int64]struct{}), + } + + insertEpisodeForSyncWorkerTest(t, db, 22, "approved", false) + insertSyncLogForSyncWorkerTest(t, db, 22, "failed", 1) + + w.pollAndProcess(context.Background()) + + latest := latestSyncLogForSyncWorkerTest(t, db, 22) + if latest.Status != "pending" { + t.Fatalf("latest status = %q, want pending", latest.Status) + } + select { + case got := <-w.jobCh: + if got.episodeID != 22 { + t.Fatalf("unexpected episode id: got %d want 22", got.episodeID) + } + default: + t.Fatal("expected retryable failure to be dispatched with auto scan disabled") + } +} + +func TestPollAndProcess_DiscoversPendingEpisodesWhenAutoScanEnabled(t *testing.T) { + db := newTestSyncWorkerDB(t) + w := &SyncWorker{ + db: db, + cfg: SyncWorkerConfig{BatchSize: 10, MaxRetries: 3, AutoScanEnabled: true}, + jobCh: make(chan syncEnqueueRequest, 1), + enqueuedEpisode: make(map[int64]struct{}), + } + + insertEpisodeForSyncWorkerTest(t, db, 23, "approved", false) + + w.pollAndProcess(context.Background()) + + latest := latestSyncLogForSyncWorkerTest(t, db, 23) + if latest.Status != "pending" { + t.Fatalf("latest status = %q, want pending", latest.Status) + } + select { + case got := <-w.jobCh: + if got.episodeID != 23 { + t.Fatalf("unexpected episode id: got %d want 23", got.episodeID) + } + default: + t.Fatal("expected auto-discovered episode to be dispatched") + } +} + func TestRetryFailedEpisodes_PromotesDueFailureToPendingBeforeDispatch(t *testing.T) { db := newTestSyncWorkerDB(t) w := &SyncWorker{ From 26372ce2610effbac34e3c8a0b6afeedea6ff2dc Mon Sep 17 00:00:00 2001 From: chaoliu Date: Fri, 29 May 2026 14:10:15 +0800 Subject: [PATCH 12/12] Fix OSS uploader gosec lint suppression --- internal/cloud/oss_uploader.go | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/internal/cloud/oss_uploader.go b/internal/cloud/oss_uploader.go index 797b676..25d1d16 100644 --- a/internal/cloud/oss_uploader.go +++ b/internal/cloud/oss_uploader.go @@ -222,9 +222,8 @@ func (u *OSSUploader) sendRequest(ctx context.Context, session *UploadSession, m req.Header.Set(k, v) } - // #nosec G704 -- URL is OSS endpoint from authenticated DataGateway gRPC upload session startedAt := time.Now() - resp, err := u.httpClient.Do(req) + resp, err := u.httpClient.Do(req) //nolint:gosec // G704: URL is OSS endpoint from authenticated DataGateway gRPC upload session if err != nil { if isTimeoutError(err) || errors.Is(ctx.Err(), context.DeadlineExceeded) { timeout := timeoutDuration(ctx, startedAt, u.httpClient.Timeout)