diff --git a/cmd/keystone-edge/main.go b/cmd/keystone-edge/main.go index 7164bff..828437b 100644 --- a/cmd/keystone-edge/main.go +++ b/cmd/keystone-edge/main.go @@ -17,7 +17,6 @@ import ( "github.com/joho/godotenv" - "archebase.com/keystone-edge/internal/cloud" "archebase.com/keystone-edge/internal/config" "archebase.com/keystone-edge/internal/logger" "archebase.com/keystone-edge/internal/server" @@ -115,46 +114,8 @@ func main() { // Initialize cloud sync worker var syncWorker *services.SyncWorker - if cfg.Sync.Enabled && cfg.Sync.AuthEndpoint != "" && cfg.Sync.GatewayEndpoint != "" && s3Client != nil { - authClient := cloud.NewAuthClient(cloud.AuthClientConfig{ - Endpoint: cfg.Sync.AuthEndpoint, - UseTLS: cfg.Sync.CloudUseTLS, - TLSCAFile: cfg.Sync.CloudTLSCAFile, - TLSServerName: cfg.Sync.CloudTLSServerName, - APIKey: cfg.Sync.APIKey, - RefreshBefore: 60 * time.Second, - }) - - gatewayClient := cloud.NewGatewayClient(cloud.GatewayClientConfig{ - Endpoint: cfg.Sync.GatewayEndpoint, - UseTLS: cfg.Sync.CloudUseTLS, - TLSCAFile: cfg.Sync.CloudTLSCAFile, - TLSServerName: cfg.Sync.CloudTLSServerName, - RequestTimeout: time.Duration(cfg.Sync.RequestTimeoutSec) * time.Second, - }, authClient) - // Close gateway client before auth client (LIFO defer order). 
- defer func() { - if err := authClient.Close(); err != nil { - logger.Printf("[SYNC] Failed to close auth client: %v", err) - } - }() - defer func() { - if err := gatewayClient.Close(); err != nil { - logger.Printf("[SYNC] Failed to close gateway client: %v", err) - } - }() - - uploader, err := cloud.NewUploader(gatewayClient, s3Client, cfg.Storage.Bucket, cloud.UploaderConfig{ - RequestTimeout: time.Duration(cfg.Sync.RequestTimeoutSec) * time.Second, - OSSTimeout: time.Duration(cfg.Sync.OSSTimeoutSec) * time.Second, - PersistRootDir: cfg.Sync.PersistRootDir, - MaxRestartCount: uint32(cfg.Sync.MaxRestartCount), //nolint:gosec // non-negative guaranteed by config.Validate() - }) - if err != nil { - logger.Fatalf("[SYNC] Failed to initialise uploader: %v", err) - } - - syncWorker = services.NewSyncWorker(db.DB, uploader, s3Client, cfg.Storage.Bucket, services.SyncWorkerConfig{ + if cfg.Sync.Enabled && cfg.Sync.DPConfigPath != "" && s3Client != nil { + syncWorker = services.NewSyncWorker(db.DB, nil, s3Client, cfg.Storage.Bucket, services.SyncWorkerConfig{ BatchSize: cfg.Sync.BatchSize, MaxConcurrent: cfg.Sync.MaxConcurrent, MaxRetries: cfg.Sync.MaxRetries, @@ -166,9 +127,9 @@ func main() { }, &cfg.Sync) syncWorker.Start() - logger.Printf("[SYNC] Cloud sync worker started: auth=%s gateway=%s auto_scan=%t", cfg.Sync.AuthEndpoint, cfg.Sync.GatewayEndpoint, cfg.Sync.AutoScanEnabled) + logger.Printf("[SYNC] Cloud sync worker started: dp_config=%s auto_scan=%t", cfg.Sync.DPConfigPath, cfg.Sync.AutoScanEnabled) } else { - logger.Println("[SYNC] Cloud sync disabled (KEYSTONE_SYNC_ENABLED=false or missing endpoints)") + logger.Println("[SYNC] Cloud sync disabled (KEYSTONE_SYNC_ENABLED=false, missing KEYSTONE_SYNC_DP_CONFIG, or S3 unavailable)") } // Initialize and start HTTP server diff --git a/docker/.env.example b/docker/.env.example index 7d1c145..34ec451 100644 --- a/docker/.env.example +++ b/docker/.env.example @@ -42,15 +42,7 @@ KEYSTONE_MINIO_USE_SSL=false 
KEYSTONE_SYNC_ENABLED=true KEYSTONE_SYNC_BATCH_SIZE=10 KEYSTONE_SYNC_MAX_RETRIES=5 -KEYSTONE_CLOUD_AUTH_ENDPOINT=127.0.0.1:50051 -KEYSTONE_CLOUD_GATEWAY_ENDPOINT=127.0.0.1:50053 -KEYSTONE_CLOUD_USE_TLS=false -# Optional: custom CA bundle for TLS verification (PEM). -# KEYSTONE_CLOUD_TLS_CA_FILE=/etc/ssl/certs/your-ca.pem -# Optional: override TLS server name (SNI / verification), useful when endpoint is an IP. -# KEYSTONE_CLOUD_TLS_SERVER_NAME=cloud.example.com -# API key issued by the data-platform (base64url, no padding). -KEYSTONE_CLOUD_API_KEY=your-api-key-here +KEYSTONE_SYNC_DP_CONFIG=~/.archebase/config.json KEYSTONE_SYNC_WORKER_INTERVAL=15 KEYSTONE_SYNC_REQUEST_TIMEOUT=30 KEYSTONE_SYNC_OSS_TIMEOUT=120 diff --git a/docs/designs/cli-cloud-sync-sidepath.md b/docs/designs/cli-cloud-sync-sidepath.md new file mode 100644 index 0000000..14a8681 --- /dev/null +++ b/docs/designs/cli-cloud-sync-sidepath.md @@ -0,0 +1,497 @@ + + +# CLI Cloud Sync Sidepath Design + +Status: Superseded. This sidepath is not implemented in Keystone anymore. +Native cloud sync now uploads directly with the Go uploader and Data Platform +device profiles; Keystone no longer registers CLI sync APIs, starts a +`CLISyncRunner`, reads `KEYSTONE_CLI_SYNC_*` config, or creates +`cli_sync_runs` migrations. + +## 1. Overview + +This document defines a sidepath for syncing one Keystone episode to cloud by +running the data-platform `dp` CLI from Keystone, while keeping the existing +`SyncWorker -> data-platform DataGateway` flow unchanged. + +The sidepath is intended for controlled operations and emergency recovery, not +as the default production upload path. + +Target flow: + +```text +Synapse "CLI sync to cloud" button + -> Keystone CLI sync API + -> Keystone CLI sync runner + -> download MCAP from Keystone MinIO to a temporary local file + -> read sidecar JSON and flatten scalar metadata into --tag arguments + -> dp --json data upload --tag ... 
+ -> record dp result + -> mark the episode cloud_synced on success +``` + +The existing cloud sync flow remains: + +```text +Synapse normal sync action + -> Keystone SyncWorker queue + -> Keystone Go uploader + -> data-platform DataGateway + -> cloud object storage +``` + +## 2. Goals + +- Add a Synapse action named `CLI sync to cloud` for a single episode. +- Keep the current `POST /api/v1/sync/episodes/:id` behavior unchanged. +- Keep the current `SyncWorker` queue, retry, backoff, and auto-scan behavior + unchanged. +- Upload the episode MCAP through `dp data upload`. +- Read the episode sidecar JSON and pass scalar metadata through + `dp data upload --tag`. Array fields are skipped in the first version so the + existing `dp` CLI does not need to change its comma-separated tag parser. +- Persist CLI run audit data, including `fileId`, `logicalUploadId`, `uploadId`, + `objectKey`, command duration, and sanitized error output. +- On successful CLI upload, update: + - `episodes.cloud_synced = TRUE` + - `episodes.cloud_synced_at` + - `episodes.cloud_mcap_path` + - `episodes.cloud_processed = FALSE` +- On successful CLI upload, append a normal `sync_logs.completed` row so the + existing Cloud Sync Center summary can show the episode as synced. + +## 3. Non-Goals + +- Do not replace `SyncWorker`. +- Do not make CLI sync the default action. +- Do not add batch CLI sync in the first version. +- Do not retry CLI sync automatically. +- Do not let the existing `SyncWorker` process CLI pending or failed states. +- Do not upload the sidecar JSON object through the CLI sidepath in the first + version. Its scalar content is still required as upload tags for the MCAP + object. +- Do not expose `dp` command output containing secrets to the browser. + +## 4. Recommended Architecture + +Use a separate `cli_sync_runs` table for pending, in-progress, and failed CLI +runs. 
This avoids putting CLI `pending` or `failed` rows into `sync_logs`, which +would otherwise be visible to the existing `SyncWorker` polling queries. + +Only after the CLI upload succeeds should Keystone append a `sync_logs` row with +`status = 'completed'`. That completed row is terminal and will not be retried +by the existing worker. + +```text +api request + -> insert cli_sync_runs(status='pending') + -> background runner claims run + -> cli_sync_runs(status='in_progress') + -> read sidecar JSON tags + -> run dp upload + -> success: + cli_sync_runs(status='completed', dp ids...) + sync_logs(status='completed', destination_path=objectKey...) + episodes.cloud_synced = TRUE + -> failure: + cli_sync_runs(status='failed', sanitized error...) + no sync_logs write + episodes unchanged +``` + +This keeps normal sync history authoritative while still allowing CLI success to +close the episode's cloud sync state. + +## 5. Backend API + +### 5.1 Trigger CLI Sync + +```http +POST /api/v1/sync/episodes/:id/cli +``` + +Request body: + +```json +{} +``` + +Response: + +```json +{ + "status": "accepted", + "episode_id": 123, + "run_id": 456, + "message": "episode accepted for CLI cloud sync" +} +``` + +Validation: + +| Check | Response | +|---|---| +| CLI sync feature disabled | `503 Service Unavailable` | +| invalid episode id | `400 Bad Request` | +| episode missing or deleted | `404 Not Found` | +| `qa_status` is not `approved` or `inspector_approved` | `400 Bad Request` | +| `cloud_synced = TRUE` | `409 Conflict` | +| latest normal sync log is `pending` or `in_progress` | `409 Conflict` | +| existing CLI run is `pending` or `in_progress` | `409 Conflict` | +| CLI runner queue is full | `429 Too Many Requests` | + +The endpoint must return after the run is queued. It must not hold the HTTP +request open for the entire upload. 
+ +### 5.2 Get Latest CLI Sync Run + +```http +GET /api/v1/sync/episodes/:id/cli/status +``` + +Response: + +```json +{ + "id": 456, + "episode_id": 123, + "status": "in_progress", + "file_id": null, + "logical_upload_id": null, + "upload_id": null, + "object_key": null, + "file_size": null, + "started_at": "2026-06-02T08:10:00Z", + "completed_at": null, + "error_message": null +} +``` + +The frontend uses this endpoint to show button state while the sidepath is +running. The normal sync summary remains sourced from `sync_logs`. + +## 6. Data Model + +### 6.1 New Table + +```sql +CREATE TABLE IF NOT EXISTS cli_sync_runs ( + id BIGINT AUTO_INCREMENT PRIMARY KEY, + episode_id BIGINT NOT NULL, + status ENUM('pending', 'in_progress', 'completed', 'failed') NOT NULL DEFAULT 'pending', + source_path VARCHAR(1024), + temp_path VARCHAR(1024), + dp_config_path VARCHAR(1024), + file_id VARCHAR(255), + logical_upload_id VARCHAR(255), + upload_id VARCHAR(255), + bucket VARCHAR(255), + object_key VARCHAR(1024), + file_size BIGINT, + oss_object_etag VARCHAR(255), + duration_sec INT, + error_message TEXT, + stdout_json JSON DEFAULT NULL, + started_at TIMESTAMP NULL, + completed_at TIMESTAMP NULL, + created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, + updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP, + INDEX idx_cli_sync_episode (episode_id), + INDEX idx_cli_sync_status (status), + INDEX idx_cli_sync_created (created_at) +) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4; +``` + +### 6.2 Why Not Store Pending CLI Runs In `sync_logs` + +The existing worker polls latest `sync_logs.status = 'pending'` rows and +retryable `failed` rows. If CLI pending or failed rows are written to +`sync_logs`, the normal worker can claim them and run the regular data-gateway +upload path. That would mix the two channels and violate this design's goal. + +For this reason: + +- `cli_sync_runs` owns CLI pending, in-progress, and failed states. 
+- `sync_logs` receives a completed row only after CLI upload succeeds. +- `episodes.cloud_synced` is updated only after CLI upload succeeds. + +### 6.3 Successful CLI Sync Log Row + +On success, insert: + +```sql +INSERT INTO sync_logs ( + episode_id, + source_path, + destination_path, + status, + bytes_transferred, + duration_sec, + attempt_count, + started_at, + completed_at +) VALUES (?, ?, ?, 'completed', ?, ?, 1, ?, ?); +``` + +Use `destination_path = dp.objectKey`. Store `dp.fileId` and +`dp.logicalUploadId` in `cli_sync_runs`. + +## 7. CLI Runner + +### 7.1 Command Construction + +The runner must call `dp` without a shell: + +```text +exec.CommandContext(ctx, dpBin, + "--config", dpConfigPath, + "--json", + "data", "upload", tempFile, + "--device", "<robot device id>", + "--tag", "episode_id=<episode public id>", + "--tag", "keystone_episode_id=<numeric id>", + "--tag", "device_id=<robot device id>", + "--tag", "sync_channel=keystone_cli", + "--tag", "<flattened sidecar key=value>", + "--hint", "source=keystone_cli_sync", +) +``` + +Do not build a single shell command string. +The device id is resolved from the episode workstation robot +(`robots.device_id`, falling back to `workstations.robot_serial`). The selected +`dp` config must contain a matching initialized device profile in `devices[]`. 
+ +### 7.2 Tags + +Required tags: + +| Tag | Value | +|---|---| +| `episode_id` | `episodes.episode_id` | +| `keystone_episode_id` | numeric `episodes.id` | +| `device_id` | `robots.device_id` resolved through the episode workstation | +| `sync_channel` | `keystone_cli` | + +Required sidecar-derived tags: + +| Source | Handling | +|---|---| +| sidecar JSON scalar fields | Flatten to string key/value pairs and pass as repeated `--tag key=value` arguments | +| sidecar JSON arrays | Skip in the first version | +| `topics_summary` | Exclude, matching the existing worker's filtering intent | +| nested objects | Flatten with dot notation | + +Recommended tags: + +| Tag | Value | +|---|---| +| `task_id` | `episodes.task_id`, when available | +| `factory_id` | `episodes.factory_id`, when available | +| `organization_id` | `episodes.organization_id`, when available | + +The CLI sidepath uploads only the MCAP file body, but sidecar JSON metadata is +not optional. Scalar sidecar fields must be included as tags; array fields are +left out for the first version. If `sidecar_path` is missing, unreadable, or +malformed, the CLI run should fail before invoking `dp`. This is stricter than +the current worker's best-effort sidecar handling and prevents cloud objects +from being created without the metadata required for filtering. + +The implementation must enforce a max tag count and max tag size so the CLI +command line cannot exceed OS limits. + +### 7.3 Temporary File Handling + +The runner downloads the MCAP from Keystone MinIO to a temporary file before +calling `dp`. + +Requirements: + +- Use a dedicated directory such as `/var/lib/keystone/cli-sync`. +- Create temporary files with mode `0600`. +- Delete the temporary file after success or failure unless + `KEYSTONE_CLI_SYNC_KEEP_TEMP=true`. +- Refuse to start if the temp directory is not writable. +- Check free disk space before download when a disk watermark helper is + available. 
+ +### 7.4 JSON Output Parsing + +Expected `dp --json data upload` fields: + +```json +{ + "logicalUploadId": "logical-1", + "fileId": "file-1", + "bucket": "bucket-a", + "objectKey": "objects/file-1.mcap", + "fileSize": 123456789, + "ossObjectEtag": "etag", + "identity": "api-key", + "deviceId": null +} +``` + +The runner must validate that `fileId`, `logicalUploadId`, `objectKey`, and +`fileSize` are present before marking the run completed. + +## 8. Configuration + +Add a separate config group rather than reusing `SyncConfig`. + +| Environment variable | Default | Description | +|---|---|---| +| `KEYSTONE_CLI_SYNC_ENABLED` | `false` | Enables the sidepath API and runner | +| `KEYSTONE_CLI_SYNC_DP_BIN` | `dp` | Path or binary name for the data-platform CLI | +| `KEYSTONE_CLI_SYNC_DP_CONFIG` | empty | SDK config JSON path passed to `dp --config` | +| `KEYSTONE_CLI_SYNC_TEMP_DIR` | `/var/lib/keystone/cli-sync` | Temporary MCAP staging directory | +| `KEYSTONE_CLI_SYNC_MAX_CONCURRENT` | `1` | Max concurrent CLI uploads | +| `KEYSTONE_CLI_SYNC_QUEUE_SIZE` | `16` | Max queued CLI runs | +| `KEYSTONE_CLI_SYNC_TIMEOUT_SEC` | `7200` | Per-run timeout | +| `KEYSTONE_CLI_SYNC_KEEP_TEMP` | `false` | Keeps staged files for debugging | +| `KEYSTONE_CLI_SYNC_MAX_TAGS` | `128` | Max tags passed to CLI | +| `KEYSTONE_CLI_SYNC_MAX_TAG_BYTES` | `65536` | Max total encoded tag bytes | + +Startup validation when enabled: + +- `dp` binary exists and is executable. +- `KEYSTONE_CLI_SYNC_DP_CONFIG` is set and readable. +- Temp directory exists or can be created. +- Temp directory is writable. + +## 9. Frontend Behavior + +### 9.1 Cloud Sync Center + +Add a row action next to existing `Retry` and `History` actions: + +```text +CLI sync to cloud +``` + +Show it only when the feature flag from config/status says CLI sync is enabled. 
+ +Disable it when: + +- the row status is `pending` or `in_progress`; +- the row status is `completed`; +- the episode has an active CLI run; +- a row action is already running; +- the user does not have admin permission. + +After clicking: + +1. Call `POST /api/v1/sync/episodes/:id/cli`. +2. Show the row as `CLI queued` or `CLI syncing` using the CLI status endpoint. +3. Poll `GET /api/v1/sync/episodes/:id/cli/status`. +4. On CLI completion, refresh normal sync summaries. +5. On CLI failure, keep the normal sync row unchanged and show the sanitized CLI + error. + +### 9.2 Episode Detail + +Add the same action for approved, unsynced episodes. This is important because +an approved unsynced episode may not yet have any `sync_logs` row and therefore +may not appear in the Cloud Sync Center table. + +## 10. Security + +- The trigger API must require admin authorization. +- `dp` must be launched through `exec.CommandContext`, never through a shell. +- Do not pass API keys on the command line. +- Store credentials only in the `dp` config file with restrictive permissions. +- Redact stdout, stderr, paths, and errors before returning anything to the + frontend. +- Do not log full `dp` config contents. +- Do not log temporary object storage credentials or presigned URLs. +- Limit concurrent CLI runs to protect Keystone CPU, disk, and network. + +## 11. Concurrency And Races + +Keystone should prevent multiple active CLI runs for the same episode by checking +`cli_sync_runs.status IN ('pending', 'in_progress')` inside a transaction. + +Before marking success, lock the `episodes` row and re-check `cloud_synced`. 
+ +If the normal SyncWorker synced the episode while the CLI run was uploading: + +- mark the CLI run as completed with its `dp` result; +- do not overwrite `episodes.cloud_mcap_path`; +- do not insert a second `sync_logs.completed` row unless product explicitly + wants duplicate completed history; +- include a `duplicate_after_upload` marker in `cli_sync_runs.stdout_json` or a + dedicated metadata field if one is added later. + +Residual risk: if `dp` upload succeeds but Keystone crashes before recording the +result, a later manual CLI retry can upload a duplicate object. This is accepted +for the sidepath's emergency-use scope. A future implementation can reduce this +by adding a data-platform idempotency key or a server-side upload lookup by +`episode_id`. + +## 12. Rollout Plan + +1. Add `cli_sync_runs` migration and model helpers. +2. Add CLI sync config with default disabled. +3. Add the backend runner with a fake `dp` executable test fixture. +4. Add `POST /sync/episodes/:id/cli` and latest status endpoint. +5. Add Synapse API wrapper methods. +6. Add Episode Detail button. +7. Add Cloud Sync Center row button and CLI status overlay. +8. Enable only in a staging environment. +9. Run one approved small MCAP through CLI sync and verify: + - data-platform object is visible; + - expected sidecar JSON scalar fields are visible as data-platform raw tags; + - `cli_sync_runs` contains `fileId` and `logicalUploadId`; + - `sync_logs` has a completed row; + - `episodes.cloud_synced = TRUE`; + - normal SyncWorker does not retry the episode. + +## 13. 
Test Plan + +Backend unit tests: + +- rejects disabled feature; +- rejects non-approved episodes; +- rejects already cloud-synced episodes; +- rejects active normal sync rows; +- rejects active CLI runs; +- fails when sidecar JSON is missing, unreadable, or malformed; +- passes flattened sidecar JSON scalar fields as repeated `--tag` arguments; +- builds `dp` argv without a shell; +- parses valid `dp --json` output; +- rejects missing `fileId`, `logicalUploadId`, `objectKey`, or `fileSize`; +- redacts stderr before API response; +- records failed CLI runs without writing `sync_logs`; +- records successful CLI runs and inserts one completed `sync_logs` row. + +Backend integration tests: + +- fake MinIO object is staged to temp file; +- fake `dp` executable receives the expected args; +- temp file is deleted after success and failure; +- success updates `episodes.cloud_synced`; +- normal sync summary sees the completed row after success. + +Frontend tests: + +- button is hidden when CLI sync config is disabled; +- button is disabled for completed, pending, and in-progress rows; +- click calls `triggerEpisodeCli`; +- active CLI status changes row action text; +- completed CLI run refreshes normal summaries; +- failed CLI run shows sanitized error and leaves normal row state unchanged. + +## 14. Open Questions + +- Should CLI failures appear in the Cloud Sync Center main table, or only as a + per-episode CLI status/badge? +- Should a successful CLI sync always append `sync_logs.completed`, even when + the latest normal row is already completed by a race? +- Does data-platform need an explicit idempotency key for `dp data upload` so + crash-after-upload can be recovered without duplicate objects? +- Should the `dp` config use a site API key or a device profile for the Keystone + edge site? 
diff --git a/docs/designs/cli-cloud-sync-sidepath.zh.html b/docs/designs/cli-cloud-sync-sidepath.zh.html new file mode 100644 index 0000000..fad753d --- /dev/null +++ b/docs/designs/cli-cloud-sync-sidepath.zh.html @@ -0,0 +1,834 @@ + + + + + + + CLI 同步到云旁路设计 + + + +
+
+
+

Keystone / Synapse Design

+

CLI 同步到云旁路设计

+

在不改动现有 Keystone 云同步主链路的前提下,新增一个由 Synapse 触发、Keystone 后台执行 dp data upload 的单片段应急同步入口。首版只上传 MCAP 文件本体,但必须读取 sidecar JSON,并把其中标量元数据作为 --tag 传给 data-platform;数组字段先跳过,dp 本身不需要改。CLI 上传成功后回写 Keystone 云同步状态,并保留 data-platform 返回的审计 ID。

+
+ 方案 2:成功后回写 episode + 现有 SyncWorker 不变 + 默认关闭,按环境启用 +
+
+
+ 文档状态 +
状态:已废弃;当前实现改为 Keystone Go uploader 原生 direct sync,不再实现 CLI sync API、CLISyncRunner、KEYSTONE_CLI_SYNC_* 配置或 cli_sync_runs 迁移。
+
用途:实现设计 / 评审
+
范围:Keystone 后端、Synapse 前端;data-platform CLI 只作为外部命令调用
+
日期:2026-06-02
+
+
+ + + +
+

1. 设计结论

+

推荐新增独立的 CLI 同步旁路,而不是让前端直接调用 CLI,也不是把 CLI pending/failed 状态写入现有 sync_logs。核心原则是:正常同步继续归 SyncWorker 管,CLI 同步只作为手动应急通道。

+ +
+
+ 主链路不动 +

POST /sync/episodes/:id、自动扫描、重试和 backoff 都保持现状。

+
+
+ 旁路独立记账 +

用新表 cli_sync_runs 记录 CLI 的 pending、in_progress、failed 和 completed。

+
+
+ 成功后闭环 +

CLI 成功后写 episodes.cloud_synced,并追加一条 sync_logs.completed

+
+
+ +
+ 关键约束:不要把 CLI 的 pending 或 failed 行写进 sync_logs。现有 worker 会扫描最新 pending 和可重试 failed 行,如果 CLI 行进入这张表,可能被正常同步 worker 误认领。 +
+
+ +
+

2. 目标流程

+
+
+

新增 CLI 旁路

+
+
Synapse 按钮管理员点击「CLI 同步到云」。
+
Keystone API创建 cli_sync_runs.pending 并返回 202 Accepted
+
Keystone Runner从 MinIO 下载 MCAP 到临时文件,并读取 sidecar JSON。
+
dp CLI执行 dp --json data upload,将 sidecar 标量元数据作为重复 --tag 参数传入。
+
状态回写写入 CLI 审计数据、sync_logs.completed 和 episode 云同步字段。
+
+
+ +
+

现有正常同步

+
+
Synapse 正常同步仍调用 POST /api/v1/sync/episodes/:id
+
SyncWorker 队列负责 pending、in_progress、failed 和重试。
+
Go Uploader通过 data-platform DataGateway 和 OSS 上传。
+
完成态更新 sync_logsepisodes
+
+
+ 两条链路最后都可以把 episode 标记为已同步,但只有正常链路参与自动发现和自动重试。 +
+
+
+
+ +
+

3. 目标与非目标

+
+
+

目标

+
    +
  • 增加单 episode 的「CLI 同步到云」动作。
  • +
  • 上传 MCAP 文件到 data-platform 云端对象存储。
  • +
  • 读取 sidecar JSON,并把标量字段作为 dp data upload --tag 传递;数组字段首版先跳过。
  • +
  • 保存 fileIdlogicalUploadIduploadIdobjectKey 等审计信息。
  • +
  • 成功后更新 episodes.cloud_syncedcloud_synced_atcloud_mcap_pathcloud_processed
  • +
  • 成功后插入一条 sync_logs.completed,让现有 Cloud Sync Center 能看到完成态。
  • +
+
+
+

非目标

+
    +
  • 不替换 SyncWorker
  • +
  • 不提供批量 CLI 同步。
  • +
  • 不自动重试 CLI 失败任务。
  • +
  • 不把 CLI 失败任务混入正常同步主表。
  • +
  • 首版不上传 sidecar JSON 文件本体;但它的标量内容必须作为 MCAP 上传 tags。
  • +
  • 不把包含敏感信息的 CLI 输出返回给浏览器。
  • +
+
+
+
+ +
+

4. 后端接口

+

4.1 触发 CLI 同步

+
POST /api/v1/sync/episodes/:id/cli
+

请求体为空对象 {} 即可;请求被接受并入队后返回如下响应:

+
{
+  "status": "accepted",
+  "episode_id": 123,
+  "run_id": 456,
+  "message": "episode accepted for CLI cloud sync"
+}
+ + + + + + + + + + + + + + + +
校验项失败响应
CLI 同步功能未启用503 Service Unavailable
episode id 非法400 Bad Request
episode 不存在或已删除404 Not Found
qa_status 不是 approvedinspector_approved400 Bad Request
cloud_synced = TRUE409 Conflict
正常同步最新状态为 pendingin_progress409 Conflict
已有 CLI run 为 pendingin_progress409 Conflict
CLI runner 队列已满429 Too Many Requests
+ +

4.2 查询最新 CLI 状态

+
GET /api/v1/sync/episodes/:id/cli/status
+
{
+  "id": 456,
+  "episode_id": 123,
+  "status": "in_progress",
+  "file_id": null,
+  "logical_upload_id": null,
+  "upload_id": null,
+  "object_key": null,
+  "file_size": null,
+  "started_at": "2026-06-02T08:10:00Z",
+  "completed_at": null,
+  "error_message": null
+}
+
+ +
+

5. 数据模型

+

新增 cli_sync_runs,专门承载 CLI 旁路生命周期。正常同步的 sync_logs 只在 CLI 成功后接收一条 completed 审计行。

+ +

5.1 CLI run 表

+
CREATE TABLE IF NOT EXISTS cli_sync_runs (
+    id BIGINT AUTO_INCREMENT PRIMARY KEY,
+    episode_id BIGINT NOT NULL,
+    status ENUM('pending', 'in_progress', 'completed', 'failed') NOT NULL DEFAULT 'pending',
+    source_path VARCHAR(1024),
+    temp_path VARCHAR(1024),
+    dp_config_path VARCHAR(1024),
+    file_id VARCHAR(255),
+    logical_upload_id VARCHAR(255),
+    upload_id VARCHAR(255),
+    bucket VARCHAR(255),
+    object_key VARCHAR(1024),
+    file_size BIGINT,
+    oss_object_etag VARCHAR(255),
+    duration_sec INT,
+    error_message TEXT,
+    stdout_json JSON DEFAULT NULL,
+    started_at TIMESTAMP NULL,
+    completed_at TIMESTAMP NULL,
+    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
+    updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
+    INDEX idx_cli_sync_episode (episode_id),
+    INDEX idx_cli_sync_status (status),
+    INDEX idx_cli_sync_created (created_at)
+) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
+ +

5.2 成功后的 normal sync log

+
INSERT INTO sync_logs (
+    episode_id,
+    source_path,
+    destination_path,
+    status,
+    bytes_transferred,
+    duration_sec,
+    attempt_count,
+    started_at,
+    completed_at
+) VALUES (?, ?, ?, 'completed', ?, ?, 1, ?, ?);
+ +
+
1. pendingAPI 接受请求,写入 CLI 独立表。
+
2. in_progressrunner 已 claim,正在 staging 或上传。
+
3A. failed只更新 CLI 表,episode 不变。
+
3B. completedCLI 表记录 dp 返回 ID。
+
4. syncedsync_logs.completed 与 episode 云同步字段。
+
+
+ +
+

6. CLI Runner

+

6.1 命令构造

+

必须使用 exec.CommandContext 参数数组调用,不能拼 shell 字符串。

+
exec.CommandContext(ctx, dpBin,
+  "--config", dpConfigPath,
+  "--json",
+  "data", "upload", tempFile,
+  "--device", "<robot device id>",
+  "--tag", "episode_id=<episode public id>",
+  "--tag", "keystone_episode_id=<numeric id>",
+  "--tag", "device_id=<robot device id>",
+  "--tag", "sync_channel=keystone_cli",
+  "--tag", "<flattened sidecar key=value>",
+  "--hint", "source=keystone_cli_sync",
+)
+

设备 ID 通过 episode 对应工位的机器人解析,优先使用 robots.device_id,回退到 workstations.robot_serial。所选 dp config 的 devices[] 中必须已有这个 device profile。

+ +

6.2 标签

+ + + + + + + + + + + + + + + + +
标签来源要求
episode_idepisodes.episode_id必填
keystone_episode_idepisodes.id必填
device_idepisode 工位对应的 robots.device_id必填,同时作为 --device 参数
sync_channel固定 keystone_cli必填
sidecar JSON 标量字段episodes.sidecar_path 指向的 JSON必填,扁平化后作为重复 --tag
sidecar JSON 数组字段例如 topic 列表、skills首版跳过,不传给 CLI
topics_summarysidecar JSON排除,避免 tag 过大
task_idepisodes.task_id可选
factory_idepisodes.factory_id可选
organization_idepisodes.organization_id可选
+
+ CLI 首版只上传 MCAP 对象,不上传 sidecar JSON 对象。但 sidecar JSON 元数据不是可选项:标量字段必须作为 tag 传入,数组字段首版先跳过;如果 sidecar_path 缺失、对象读不到或 JSON 解析失败,本次 CLI run 应在调用 dp 前失败,避免云端产生缺少关键过滤标签的对象。 +
+

sidecar 字段扁平化应复用现有同步 worker 的意图:普通字段转成字符串 key/value,嵌套对象用点号展开;数组字段首版跳过。同时必须受 KEYSTONE_CLI_SYNC_MAX_TAGSKEYSTONE_CLI_SYNC_MAX_TAG_BYTES 限制。

+ +

6.3 临时文件

+
    +
  • 默认目录:/var/lib/keystone/cli-sync
  • +
  • 临时文件权限:0600
  • +
  • 成功或失败后删除临时文件。
  • +
  • KEYSTONE_CLI_SYNC_KEEP_TEMP=true 时保留临时文件用于排障。
  • +
  • 启动时校验目录可写。
  • +
  • 可用时检查磁盘水位。
  • +
+ +

6.4 dp JSON 输出

+
{
+  "logicalUploadId": "logical-1",
+  "fileId": "file-1",
+  "bucket": "bucket-a",
+  "objectKey": "objects/file-1.mcap",
+  "fileSize": 123456789,
+  "ossObjectEtag": "etag",
+  "identity": "api-key",
+  "deviceId": null
+}
+

标记成功前必须校验 fileIdlogicalUploadIdobjectKeyfileSize 非空且合法。

+
+ +
+

7. 配置

+ + + + + + + + + + + + + + + + +
环境变量默认值说明
KEYSTONE_CLI_SYNC_ENABLEDfalse启用旁路 API 和 runner。
KEYSTONE_CLI_SYNC_DP_BINdpdata-platform CLI 二进制路径或名称。
KEYSTONE_CLI_SYNC_DP_CONFIG传给 dp --config 的 SDK 配置文件。
KEYSTONE_CLI_SYNC_TEMP_DIR/var/lib/keystone/cli-syncMCAP staging 目录。
KEYSTONE_CLI_SYNC_MAX_CONCURRENT1最大并发 CLI 上传数。
KEYSTONE_CLI_SYNC_QUEUE_SIZE16最大排队 run 数。
KEYSTONE_CLI_SYNC_TIMEOUT_SEC7200单次 CLI run 超时时间。
KEYSTONE_CLI_SYNC_KEEP_TEMPfalse是否保留临时文件。
KEYSTONE_CLI_SYNC_MAX_TAGS128传给 CLI 的最大 tag 数。
KEYSTONE_CLI_SYNC_MAX_TAG_BYTES65536编码后 tag 总字节上限。
+
+ 启用时启动校验:dp 可执行、dp 配置文件可读、临时目录可创建且可写。 +
+
+ +
+

8. 前端交互

+

8.1 Cloud Sync Center

+

在现有「重试」「历史」旁边增加一个行级动作:

+
CLI 同步到云
+

仅在后端配置显示 CLI sync enabled 时展示。以下情况禁用:

+
    +
  • 正常同步状态是 pendingin_progress
  • +
  • 正常同步状态是 completed
  • +
  • 当前 episode 已有 active CLI run。
  • +
  • 当前行已有操作在提交。
  • +
  • 当前用户不是 admin。
  • +
+ +

8.2 Episode Detail

+

Episode 详情页也需要同一个动作。原因是 approved 但还没有任何 sync_logs 的 episode 可能不会出现在 Cloud Sync Center 列表里。

+ +

8.3 状态展示

+ + + + + + + + + + +
CLI 状态按钮文案页面行为
pendingCLI 已入队轮询 CLI status。
in_progressCLI 同步中禁用重复点击。
completedCLI 已完成刷新正常同步 summary。
failedCLI 同步失败显示脱敏错误,正常同步行不变。
+
+ +
+

9. 安全、并发与竞态

+
+
+

安全要求

+
    +
  • 触发 API 必须要求 admin 权限。
  • +
  • 只能使用 exec.CommandContext 调用 CLI。
  • +
  • 不要把 API key 放到命令行参数。
  • +
  • dp 凭证只放在权限受控的 config 文件中。
  • +
  • 返回前端的 stdout、stderr 和错误信息必须脱敏。
  • +
  • 不要记录完整 dp config、临时凭证或 presigned URL。
  • +
+
+
+

并发策略

+
    +
  • 同一 episode 同时只允许一个 active CLI run。
  • +
  • 创建 run 时在事务里检查 active normal sync 和 active CLI run。
  • +
  • 完成前锁定 episodes 行并重新检查 cloud_synced
  • +
  • 默认并发为 1,避免占满 Keystone 磁盘、CPU 和网络。
  • +
+
+
+ +
+ 如果 CLI 上传成功后 Keystone 在落库前崩溃,后续人工重试可能产生重复云端对象。首版接受这个应急通道风险,后续可通过 data-platform 上传 idempotency key 或按 episode_id 查询已上传对象来降低风险。 +
+ +

正常 worker 与 CLI 同时完成

+

如果正常 SyncWorker 在 CLI 上传期间已经把 episode 同步完成,CLI runner 完成落库时应:

+
    +
  • 将 CLI run 标记为 completed,并保留 dp 返回的审计信息。
  • +
  • 不覆盖 episodes.cloud_mcap_path
  • +
  • 默认不插入第二条 sync_logs.completed,除非产品明确需要重复完成历史。
  • +
+
+ +
+

10. 落地计划与验收

+

10.1 实施顺序

+
    +
  1. 新增 cli_sync_runs migration 和 repository helper。
  2. +
  3. 新增 CLI sync config,默认关闭。
  4. +
  5. 实现 backend runner,并用 fake dp 可执行文件做测试。
  6. +
  7. 新增 POST /sync/episodes/:id/cli 和 CLI status endpoint。
  8. +
  9. 新增 Synapse API wrapper。
  10. +
  11. Episode Detail 增加按钮。
  12. +
  13. Cloud Sync Center 增加行按钮和 CLI 状态展示。
  14. +
  15. 只在 staging 环境开启。
  16. +
+ +

10.2 验收标准

+
    +
  • 一个 approved 小 MCAP 可以通过 CLI 同步到云。
  • +
  • data-platform 对象列表可见该文件。
  • +
  • 预期 sidecar JSON 标量字段可在 data-platform raw tags 中看到。
  • +
  • cli_sync_runs 记录 fileIdlogicalUploadId
  • +
  • sync_logs 出现一条 completed 行。
  • +
  • episodes.cloud_synced = TRUE
  • +
  • 正常 SyncWorker 不会再次处理该 episode。
  • +
+ +

10.3 必测用例

+
    +
  • 功能关闭时拒绝请求。
  • +
  • 非 approved episode 被拒绝。
  • +
  • 已 cloud_synced episode 被拒绝。
  • +
  • active normal sync 行被拒绝。
  • +
  • active CLI run 被拒绝。
  • +
  • sidecar JSON 缺失、不可读或格式错误时 run 失败。
  • +
  • sidecar JSON 标量字段会作为重复 --tag 参数传给 dp,数组字段首版跳过。
  • +
  • dp argv 不经过 shell。
  • +
  • 解析合法 dp --json 输出。
  • +
  • 缺少关键字段时标记 failed。
  • +
  • 失败不写 sync_logs
  • +
  • 成功更新 episode 和 completed sync log。
  • +
  • 临时文件成功和失败后都会清理。
  • +
  • 前端失败提示只显示脱敏错误。
  • +
+
+ +
+

11. 待确认问题

+
    +
  • CLI 失败是否需要进入 Cloud Sync Center 主表,还是只在 episode 详情/CLI badge 展示?
  • +
  • 正常 worker 已经完成时,CLI completed 是否需要单独追加到 sync_logs 历史?
  • +
  • dp data upload 是否需要 data-platform 支持显式 idempotency key?
  • +
  • dp config 应使用 site API key,还是为 Keystone edge site 建一个 device profile?
  • +
+
+
+ + diff --git a/docs/designs/cloud-sync-go-direct-upload.zh.html b/docs/designs/cloud-sync-go-direct-upload.zh.html new file mode 100644 index 0000000..65f0a88 --- /dev/null +++ b/docs/designs/cloud-sync-go-direct-upload.zh.html @@ -0,0 +1,825 @@ + + + + + + + Keystone 原生云同步直连 Data Platform 上传方案 + + + +
+
+
+

Cloud Sync / Data Platform

+

Keystone 原生云同步直连 Data Platform 上传方案

+

+ 本方案采用方案 A:Keystone 读取 KEYSTONE_SYNC_DP_CONFIG 指向的 data-platform + config,按 episode 对应的 asset_id 选择 device profile,并复用现有 Go 上传器完成 + MinIO 到 Data Platform OSS 的流式上传。第一版直接改造 Keystone 原有 cloud sync 上传逻辑, + 不再依赖 dp data upload 或 Keystone 自有 cloud API key。 +

+
+
+ 设计状态 + 草案,面向第一版实现
+ 目标路径:原生 cloud sync,不下载 MCAP 到本地,不依赖 dp data upload
+ 兼容重点:robots.asset_id、device API key、device tags、raw tags 合成 +
+
+ + + +
+

目标与非目标

+
+
+

目标

+
    +
  • 更新 Keystone 原有 cloud sync worker 的上传身份和 raw tags 合成逻辑。
  • +
  • 复用既有 robots.asset_id 作为“云资产编号”,即本地 robot 与 Data Platform device 的稳定映射。
  • +
  • episode 创建时将当时的 asset_id 快照写入 episodes.metadata.asset_id
  • +
  • 读取 KEYSTONE_SYNC_DP_CONFIG 指向的 data-platform config,按 asset_id 选择 device profile。
  • +
  • 使用 device profile 的 apiKey 与 AuthService 交换 Bearer token。
  • +
  • 复用 Keystone 现有 cloud.Uploader,从 MinIO 流式上传到 Data Platform OSS。
  • +
  • 复刻 Rust SDK 的 raw tags 合并与冲突校验规则。
  • +
+
+
+

非目标

+
    +
  • 不在第一版集成 dp device init,device profile 由现场工程师提前初始化。
  • +
  • 不生成或修改 Data Platform device id,只存储自动化流程写入的 asset_id
  • +
  • 不迁移 data-platform config 到 Keystone 配置中心。
  • +
  • 删除 Keystone 后端 CLI 同步旁路,包括 CLISyncRunner、CLI sync API、CLI sync 配置和 cli_sync_runs 表迁移。
  • +
  • 不在每个 episode 上传前执行 device init;init 是一次性准备或凭证轮换动作。
  • +
  • 不做历史 episode 的自动 asset_id 回填工具;缺失时给出清晰错误并允许手动重试。
  • +
  • 不新增 direct sync raw tag 数量或总字节数限制。
  • +
  • 不把 MCAP 完整读入内存,只保持单分片缓冲。
  • +
+
+
+
+ +
+

当前行为

+

+ Keystone 原有 cloud sync 已经是 Go 直连上传:从 MinIO 流式读取 MCAP,创建 data-gateway 上传会话, + 再分片上传到 Data Platform OSS。当前差异在于它使用 Keystone 自己的 sync API key,并没有像 + dp data upload --device 一样读取 device profile、注入 device tags 和 reserved raw tags。 + 当前 worker 也没有读取 episode 的云端 device 快照,sidecar raw tag 读取失败时会 best-effort 继续上传。 +

+
Keystone DB episode
+  -> build sidecar raw tags
+  -> AuthService ExchangeCredential with Keystone sync API key
+  -> data-gateway CreateLogicalUpload
+  -> MinIO GetObject stream
+  -> OSS multipart upload
+  -> data-gateway CompleteUpload
+  -> update sync_logs / episodes
+
+

+ 所以第一版不需要新增上传链路,但必须把原有 worker 的上传身份切到 device profile, + 补齐 asset_id 解析、strict sidecar、non-retryable 错误和与 data-platform Rust SDK 一致的 raw tags 合成规则。 +

+
+
+ +
+

目标架构

+

+ 新路径保留原有 cloud sync worker 的触发、重试、状态更新和同步日志。每次处理 episode 时, + worker 先解析 episode 对应的 asset_id,再根据该值读取 data-platform device + profile,并用该 profile 的 API key 构造本次上传专用客户端。 +

+
Keystone DB episode
+  -> resolve asset_id from episodes.metadata
+     or fallback through episode.workstation_id -> workstations -> robots.asset_id
+  -> load DP config from KEYSTONE_SYNC_DP_CONFIG
+  -> select devices[].deviceId == asset_id
+  -> build effective raw tags
+  -> AuthService ExchangeCredential with device apiKey
+  -> data-gateway CreateLogicalUpload
+  -> MinIO GetObject stream
+  -> OSS multipart upload
+  -> data-gateway CompleteUpload
+  -> update sync_logs / episodes
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
模块 | 职责 | 建议位置
Robot asset mapping | 保存本地 robot 到 Data Platform device 的不可变映射。 | robots.asset_id、robot API、数据库迁移
Asset resolver | 优先读取 episodes.metadata.asset_id,缺失时按历史 workstation 反查 robot。 | internal/services/dp_asset_resolver.go
DP config loader | 解析 endpoints 和 devices[],按 device id 返回上传所需配置。 | internal/services/dp_config_loader.go
Raw tag builder | 复刻 Rust SDK 的 tag 合并顺序与冲突规则。 | internal/services/dp_raw_tags.go
Direct uploader factory | 按 episode 创建本次专用 AuthClient、GatewayClient 和 cloud.Uploader。 | internal/services/sync_worker.go
Cloud uploader | 复用现有 data-gateway 与 OSS multipart 上传能力。 | internal/cloud/uploader.go
+
+ +
+

云资产编号映射规则

+

+ Keystone 本地 robots.device_id 继续表示 Axon / Keystone 内部设备编号,不参与 Data Platform + device 身份选择。云交互只使用既有 robots.asset_id,前端文案统一显示为“云资产编号”。 +

+ +

Robot 字段规则

+
    +
  • robots.asset_id 初始允许为空,创建 robot 时可写可不写。
  • +
  • 首次设置非空后不可修改、不可清空;同值更新视为幂等。
  • +
  • active robots 的非空 asset_id 必须唯一,软删除 robot 不占用唯一性。
  • +
  • 保存前 trim;空字符串按 NULL;最大长度 100;不做 Data Platform device id 格式正则。
  • +
  • robot create / update / list / detail API 暴露 asset_id
  • +
+ +

Episode 快照规则

+
    +
  • episode 创建时,如果能从 task -> workstation -> robot 解析到非空 asset_id,写入 episodes.metadata.asset_id
  • +
  • episode 创建不因 asset_id 缺失失败,本地采集、QA 和入库继续成功。
  • +
  • 不修改 sidecar JSON,不把 asset_id 写回采集产物。
  • +
  • 第一版不提供自动历史回填工具;缺失时通过错误信息提示配置 robot 或手动回填 metadata 后再手动同步。
  • +
+ +

上传时解析优先级

+
if episodes.metadata.asset_id is non-empty:
+    use metadata.asset_id
+else:
+    load workstation by episode.workstation_id, including soft-deleted workstation rows
+    load robots.asset_id by workstation.robot_id
+    use robots.asset_id if non-empty
+if still empty:
+    fail as non-retryable configuration error
+ +
+

+ cloud sync 不 fallback 到 robots.device_id。工位当前允许直接更新 robot_id; + 后续如果换绑改为“旧工位软删 + 新工位记录”,fallback 查询也必须允许读取软删除 workstation, + 因为 episode 的 workstation_id 是历史引用。 +

+
+
+ +
+

方案 A:读取 Data Platform Config

+

+ Keystone 使用 KEYSTONE_SYNC_DP_CONFIG 指向的 data-platform config。该文件由现场工程师提前通过 + dp config 和 dp device init 生成和维护,Keystone 只解析直连上传需要的字段。 + 原生 direct sync 不再依赖 KEYSTONE_CLOUD_API_KEY、 + KEYSTONE_SYNC_AUTH_ENDPOINT 和 KEYSTONE_SYNC_GATEWAY_ENDPOINT。 +

+ +

需要解析的 JSON 字段

+
{
+  "version": 3,
+  "endpoints": {
+    "auth": "https://auth.example.com:50051",
+    "gateway": "https://gateway.example.com:50052"
+  },
+  "devices": [
+    {
+      "deviceId": "AB-F0001-T0001-000006",
+      "apiKey": "ak_v1.device_secret",
+      "tags": {
+        "778a6d83c9ec49108537542a570966ee.device_id": "AB-F0001-T0001-000006",
+        "line": "a"
+      },
+      "initializedAtUnix": 1760000000
+    }
+  ]
+}
+ +

Go 结构建议

+
type DPConfigFile struct {
+    Version   *int              `json:"version,omitempty"`
+    Endpoints DPConfigEndpoints `json:"endpoints"`
+    Devices   []DPDeviceProfile `json:"devices"`
+}
+
+type DPConfigEndpoints struct {
+    Auth    string `json:"auth"`
+    Gateway string `json:"gateway"`
+}
+
+type DPDeviceProfile struct {
+    DeviceID string            `json:"deviceId"`
+    APIKey   string            `json:"apiKey"`
+    Tags     map[string]string `json:"tags"`
+}
+
+type DPResolvedEndpoint struct {
+    Target    string
+    UseTLS    bool
+    ServerName string
+}
+ +

解析规则

+
    +
  • version 缺失或等于 3 可接受;存在且不等于 3 时失败。
  • +
  • devices[].deviceId trim 后比较,大小写敏感;重复 device id 直接失败。
  • +
  • deviceId 必须与 Keystone 解析出的 asset_id 一致。
  • +
  • apiKey 不能为空,且永不打印明文日志。
  • +
  • tags 不能为空,保持与 Rust SDK require_device_upload() 一致;tag key/value 不 trim、不改写,但 key 必须非空。
  • +
  • endpoints.auth 和 endpoints.gateway 必须来自 config 文件,不支持 ARCHEBASE_* 或 KEYSTONE_SYNC_* overlay。
  • +
  • 每个 episode 上传前重新读取 config 文件,避免长期进程缓存旧 device profile。
  • +
+ +

Endpoint 与 TLS 规则

+
    +
  • https://host[:port] 使用 TLS gRPC;未写端口时补 443;TLS CA 使用系统 CA,server name 使用 URL host。
  • +
  • http://host[:port] 使用 insecure gRPC;未写端口时补 80
  • +
  • host[:port] 兼容裸地址,按 insecure gRPC 处理,不自动补端口。
  • +
  • endpoint 禁止 path、query 和 fragment,例如 https://host:50051/foo 应视为配置错误。
  • +
  • Auth 和 Gateway 不强制使用同一种 scheme,分别按各自 endpoint 解析。
  • +
  • 第一版不支持自定义 CA 文件或 TLS server name override。
  • +
+ +
+

+ 第一版不把 device profile 写入 Keystone 数据库。Keystone 只消费 data-platform config, + 这样可以最大限度贴近当前 dp --config ... --device ... 的上传身份语义,同时避免 + Keystone 自有 sync API key 与 device API key 混用。 +

+
+ +

现场前置动作

+
dp --config /home/shark/.archebase/config.json config
+dp --config /home/shark/.archebase/config.json device init AB-F0001-T0001-000006
+

+ 初始化成功后,config 中会出现对应的 devices[] profile。后续 Keystone 上传只读取该 profile, + 不在上传前自动执行 init。凭证过期、设备迁移或平台侧 tags 变化时,由现场工程师执行 + dp device reinit ... --yes 轮换。 +

+
+ +
+

Raw Tags 合并规则

+

+ 直连上传必须复刻 data-platform Rust SDK 的 build_upload_tags() 语义。合并过程使用非冲突插入: + 如果 key 已存在且 value 不同,直接失败;如果 key 已存在且 value 相同,视为幂等。 +

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
顺序 | 来源 | 说明
1 | device profile tags | 来自 devices[].tags,由 data-platform device init 生成。
2 | device id reserved tag | 778a6d83c9ec49108537542a570966ee.device_id,值为 profile 的 deviceId。
3 | original file reserved tag | a206e337ecdf70a93bb611cf6a30c346.raw_file,值固定使用 MinIO MCAP object key 的 basename。
4 | Keystone sidecar tags | 从 sidecar JSON 扁平化得到;数组字段 JSON encode,顶层 topics_summary 排除。
5 | Keystone extra tags | episode_id、keystone_episode_id、sync_channel、task_id、factory_id、organization_id。
+ +
+

+ 原有 cloud sync 没有本地临时文件,因此 reserved raw_file 不读取 sidecar 字段, + 只使用 basename(stripBucketPrefix(episodes.mcap_path))。如果 basename 为空,本次上传失败。 +

+
+ +
+

+ Keystone 不新增普通 device_id raw tag。设备归属只通过 + 778a6d83c9ec49108537542a570966ee.device_id reserved tag 表达,并由 Keystone 本地注入、 + data-gateway 服务端二次校验。 +

+
+ +

Sidecar 规则

+
    +
  • direct device sync 改为 strict:sidecar_path 为空、对象不可读或 JSON 解析失败时,不创建 data-gateway upload session。
  • +
  • 数组字段保留为 JSON 字符串;顶层 topics_summary 继续排除。
  • +
  • 第一版不新增 raw tag 数量或总字节数限制。
  • +
+ +

合并伪代码

+
merged := map[string]string{}
+insertAllNonConflicting(merged, deviceProfile.Tags)
+insertNonConflicting(merged, deviceIDRawTagKey, deviceProfile.DeviceID)
+insertNonConflicting(merged, originalFileRawTagKey, mcapBaseName)
+insertAllNonConflicting(merged, sidecarTags)
+insertAllNonConflicting(merged, keystoneExtraTags)
+return merged
+
+ +
+

直连上传流程

+
+
1. 领取 episode:沿用原有 cloud sync worker 的自动扫描、手动触发、重试和并发控制。
+
2. 加载 episode:读取 MCAP MinIO key、sidecar path、metadata、workstation id 和任务上下文。
+
3. 解析 asset_id:优先使用 episodes.metadata.asset_id,否则通过历史 workstation 反查 robots.asset_id
+
4. 加载 DP config:从 KEYSTONE_SYNC_DP_CONFIG 读取 device profile 和 endpoints。
+
5. 构造 raw tags:合并 device tags、reserved tags、sidecar tags 和 Keystone extra tags,执行冲突校验。
+
6. 构造 direct uploader:为本次 episode 创建专用 AuthClient、GatewayClient 和 cloud.Uploader。
+
7. 执行上传:调用 cloud.Uploader.Upload(),从 MinIO 流式读取 MCAP 并上传 OSS。
+
8. 写回结果:沿用原有成功路径,更新 sync_logs、episodes.cloud_synced 和 cloud_mcap_path。
+
+ +

结果字段映射

+

+ 第一版不新增 episodes 字段,也不扩展 sync_logs 表。Data Platform 审计 ID 先通过日志输出; + 如后续 UI 或 API 需要直接按 episode 查询,再单独扩表。 +

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
结果 | Go direct 来源 | 说明
file_id | cloud.UploadResult.UploadID | 第一版不落库,只记录日志;如后续需要在 Keystone 记录 Data Platform 文件 ID,可直接使用该值。
logical_upload_id | cloud.UploadResult.LogicalUploadID | 第一版不落库,只记录日志。
upload_id | cloud.UploadResult.UploadID | 与 Data Platform SDK 返回的 fileId 当前等价,第一版不落库。
object_key | cloud.UploadResult.ObjectKey | 写入 sync_logs.destination_path 和 episodes.cloud_mcap_path。
oss_object_etag | cloud.UploadResult.OSSObjectETag | 客户端计算并回传给 data-gateway 的 multipart ETag,第一版不落库,只记录日志。
+
+ +
+

实施步骤

+
    +
  1. + 复用 robots.asset_id 字段,增加 active 非空唯一约束; + create / update 实现 trim、控制字符校验、“首次非空设置后不可修改、不可清空、同值幂等”。 +
  2. +
  3. + episode 创建时解析 task -> workstation -> robot 的 asset_id,非空时写入 + episodes.metadata.asset_id,但缺失不阻止 episode 创建。 +
  4. +
  5. + 新增 DP config loader,读取 SyncConfig.DPConfigPath / KEYSTONE_SYNC_DP_CONFIG, + 校验 version、endpoint、重复 device id、空 apiKey 和空 tags,并按 asset_id 返回 profile。 +
  6. +
  7. + 给 SyncWorker 增加 asset_id resolver:优先读 episodes.metadata.asset_id, + 缺失时允许读取软删除 workstation 并反查 robots.asset_id。 +
  8. +
  9. + 新增 raw tags builder,包含两个 reserved key 常量、非冲突插入、MinIO basename 选择、strict sidecar + 和 Keystone extra tags;不添加普通 device_id。 +
  10. +
  11. + 调整 uploader 构造方式,每个 episode 创建本次专用 AuthClient/GatewayClient/Uploader; + endpoint scheme 决定 TLS,TLS 使用系统 CA。 +
  12. +
  13. + 在 cloud.UploadRequest 和 persisted upload state 中记录 AssetID; + 恢复上传时只有 MCAP key 和 asset_id 同时匹配才允许复用旧 session。 +
  14. +
  15. + 更新 GatewayClient.CompleteUpload() 签名,complete 时回传 session.PartSizeBytes 到 + CompleteUploadRequest.part_size_bytes。 +
  16. +
  17. + 引入 retryable / non-retryable 错误分类:non-retryable failed 写 next_retry_at = NULL; + auto scan 跳过 latest failed 且 next_retry_at IS NULL 的 episode,manual sync 仍可重试。 +
  18. +
  19. + 删除 CLI 同步旁路:Synapse 不展示 CLI sync UI,Keystone 不注册 CLI sync API,不初始化 + CLISyncRunner,不读取 KEYSTONE_CLI_SYNC_* 配置,也不保留 + cli_sync_runs 迁移。 +
  20. +
+
+ +
+

风险与处理

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
风险 | 影响 | 处理策略
asset_id 缺失 | 无法选择 device profile,episode 本次 sync 失败。 | 写入 non-retryable failed,next_retry_at = NULL;错误信息包含 episode、workstation、robot 和修复方向,手动修复后手动重试。
asset_id 填错 | 本地 robot 会永久绑定错误的 Data Platform device。 | 字段首次非空设置后不可修改;未来由自动化流程写入,第一版不提供 break-glass 维护入口。
device profile 缺失或不完整 | 上传前失败,episode 本次 sync 失败。 | 错误信息包含 asset_id 和 config path,但不打印 api key;提示现场执行 dp device init 或 dp device reinit。
endpoint / TLS 配置错误 | Auth 或 Gateway 连接失败。 | endpoint 只来自 DP config;按 http/https scheme 自动解析 TLS;禁止 path/query/fragment,日志打印 target 和 TLS 标志。
secret 泄漏 | 日志或错误信息暴露 device api key。 | loader 和上传日志只打印 asset_id、config path、endpoint,不打印 api key、token、STS secret。
sidecar 缺失或格式错误 | 云端对象缺少业务 raw tags,影响检索。 | direct sync 对 sidecar strict;格式错误 non-retryable,MinIO 读对象失败可自动重试。
恢复状态身份混用 | 同一 MCAP 可能复用另一个 device 身份创建的 upload session。 | persisted upload state 记录 asset_id,恢复时必须同时匹配 MCAP key 和 asset_id。
CLI sidepath 遗留 | 用户可能继续使用旧 CLI 同步入口,产生两条语义不同的同步路径。 | 删除后端 CLI sync runner、API、配置和表迁移;只保留原生 direct sync 入口。
+
+ +
+

测试计划

+
    +
  • 单元测试 robot API / 存储:asset_id 可首次设置、同值幂等、不可修改、不可清空、active 非空唯一。
  • +
  • 单元测试 episode 创建:有 asset_id 时写入 episodes.metadata.asset_id,缺失时仍创建 episode。
  • +
  • 单元测试 asset_id resolver:metadata 优先、fallback 读取软删除 workstation、缺失时报 non-retryable 错误、不 fallback 到 robots.device_id
  • +
  • 单元测试 DP config loader:version、endpoint scheme/TLS、禁止 path/query/fragment、成功选择 device、缺失 device、空 apiKey、空 tags、重复 deviceId。
  • +
  • 单元测试 raw tags builder:合并顺序、reserved device tag 注入、raw_file 使用 MinIO basename、相同 key 相同 value 幂等、相同 key 不同 value 报错、空 value 保留。
  • +
  • 单元测试 SyncWorker 错误分类:non-retryable failed 写 next_retry_at=NULL,auto scan 跳过,manual sync 可重新尝试。
  • +
  • 单元测试 uploader 持久化恢复:同 MCAP key 但 asset_id 不同不复用旧 state。
  • +
  • 集成测试 fake gateway/OSS:验证使用 device API key、raw tags 完整、part_size_bytes 回传、object_key 写回现有 DB 字段。
  • +
  • 现场灰度:同一小 MCAP 分别跑当前原始上传和 device profile 上传,对比 raw tags、文件大小、ETag、Data Platform 可检索性。
  • +
+ +
+

+ 验收标准:原生 cloud sync 不产生本地 MCAP 临时文件,不依赖 dp data upload, + 不依赖 KEYSTONE_CLOUD_API_KEY、KEYSTONE_SYNC_AUTH_ENDPOINT 或 + KEYSTONE_SYNC_GATEWAY_ENDPOINT, + Data Platform 中的文件可通过 fileId 检索,Keystone episode 状态与现有 cloud sync 一致。 +

+
+
+
+ + diff --git a/docs/designs/data-quality-center-mvp.zh.html b/docs/designs/data-quality-center-mvp.zh.html new file mode 100644 index 0000000..673d326 --- /dev/null +++ b/docs/designs/data-quality-center-mvp.zh.html @@ -0,0 +1,892 @@ + + + + + + + 数据质检中心简化版设计 + + + +
+
+
+

Synapse / Keystone MVP

+

数据质检中心简化版设计

+

+ 简化版仍然使用 Python 脚本做质检,但首发只提供一个系统内置固定脚本,用来做 MCAP 预览可用性 smoke check。管理员在 Synapse 的数据运维模块进入“质检中心”,即可对全部 Episode 或筛选结果发起质检。 +

+
+ +
+ + + +
+

1. 设计取舍

+
+

+ 完整版的“脚本管理 + 版本 + run + job + override + 独立 Runner”能力适合长期演进,但首发上线成本偏高。简化版保留脚本执行能力,把管理面压缩成“一个内置固定脚本 + 多个质检任务”。 +

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
能力 | 完整版 | 简化版
脚本数量 | 多个脚本,支持 global / sop 范围 | 只提供一个内置脚本 builtin_mcap_preview_smoke_check,覆盖所有数据
版本管理 | 脚本定义和不可变版本分表 | 不做上传和版本管理;job 快照保存内置脚本 key、version、SHA
执行器 | 独立 keystone-quality-runner 进程 | Keystone 内置轻量 worker,默认并发 1
触发方式 | Episode 创建自动触发,支持重跑 | 支持新数据自动入队,也支持 Synapse 一键全量/筛选触发
人工覆盖 | 独立 quality_overrides | 复用现有 inspections 表和 episodes QA 字段
+

+ 这个方案的关键约束是“首发只解决预览入口的可读性问题”,包括对象 size / range read 异常和 MCAP 边界 magic 异常。如果后续需要按 SOP、机器人类型或场景配置不同脚本,再升级到完整版的脚本版本模型。 +

+
+
+ +
+

2. MVP 范围

+
+
+

包含

+
    +
  • Synapse 管理后台新增 数据运维 / 质检中心
  • +
  • Keystone 内置固定 Python 脚本 builtin_mcap_preview_smoke_check,全局适用于所有 Episode。
  • +
  • 内置脚本先检查对象 size / range read 是否可用,再检查 MCAP 边界 magic。
  • +
  • 脚本元数据以代码常量形式提供,执行批次和执行任务存 MySQL。
  • +
  • 支持对全部非删除 Episode 发起质检。
  • +
  • 支持按 QA 状态、创建时间、设备 ID、采集员工号筛选后发起质检。
  • +
  • 支持新创建 Episode 自动进入质检队列,开关可配置。
  • +
  • 脚本结果回写 episodes.qa_status、qa_score 和 quality_flag。
  • +
  • 执行异常、超时、非法输出统一进入 needs_inspection
  • +
  • Episode 详情支持查看最近一次脚本结果、手动重跑、人工通过/驳回。
  • +
+
+
+

不包含

+
    +
  • 不做多脚本并行规则。
  • +
  • 不做 SOP / 场景 / 机器人类型范围匹配。
  • +
  • 不做在线代码编辑器。
  • +
  • 不做脚本上传、替换、ZIP、多文件包、Git 脚本源或动态安装依赖。
  • +
  • 不做激活前 test run。
  • +
  • 不做复杂版本列表、版本 diff 或回滚页面。
  • +
  • 不做任务取消;误触发时让当前 job 完成,后续可重新触发质检。
  • +
  • 不自动撤回已经云同步的数据。
  • +
+
+
+
+ +
+

3. 用户流程

+
+
+ 1 +
管理员进入 Synapse 数据运维 / 质检中心,看到内置脚本、最近批次、任务列表和 QA 汇总。
+
+
+ 2 +
管理员不需要上传脚本,直接点击“质检全部数据”或设置筛选条件后点击“质检筛选结果”。
+
+
+ 3 +
Keystone 为匹配的 Episode 创建 quality_jobs,每个 job 都记录内置脚本 key、version 和 SHA。
+
+
+ 4 +
Keystone 内置 worker 领取 pending job,读取 MCAP 前 8 字节和后 8 字节,执行内置 Python 脚本,并保存 stdout、stderr、result JSON 摘要。
+
+
+ 5 +
job 完成后 Keystone 按脚本输出更新 Episode QA 状态。云同步仍只放行 approved 和 inspector_approved。
+
+
+
+ +
+

4. 架构

+
+
Synapse Admin
+  -> /api/v1/admin/quality/batches      start all-data or filtered QA
+  -> /api/v1/admin/quality/jobs         inspect job status and result
+  -> /api/v1/episodes/:id/quality-*     rerun or manual decision
+
+Keystone API
+  -> MySQL: quality_batches, quality_jobs
+  -> MySQL: episodes.qa_status, qa_score, quality_flag
+  -> MinIO: episode MCAP
+
+Keystone built-in quality worker
+  -> claims pending quality_jobs
+  -> reads MCAP leading 8 bytes and trailing 8 bytes
+  -> runs builtin_mcap_preview_smoke_check.py
+  -> updates quality_jobs and episodes
+

+ 简化版不新增独立部署进程,worker 随 Keystone 启动。默认并发为 1,避免首发时脚本执行挤占过多机器资源。 +

+
+
+ +
+

5. 数据模型

+

新增 2 张表即可支撑批量触发和执行记录。脚本不落库、不上传,脚本 key、version、SHA 由 Keystone 代码常量提供。人工复核继续复用已有 inspections 表。

+ +

quality_batches

+
id
+script_key              builtin_mcap_preview_smoke_check
+script_version          e.g. 2026.06.01
+script_sha256           sha256 of embedded script content
+trigger_type            all | filtered | episode | auto_episode
+triggered_by            admin username or system
+filter_json             actual filters used to enqueue jobs
+status                  pending | running | completed
+total_count
+pending_count
+running_count
+succeeded_count
+failed_count
+created_at
+completed_at
+

批次只用于 Synapse 展示进度。统计值可以由 quality_jobs 聚合后回写,首发也可以查询时实时计算。

+ +

quality_jobs

+
id
+batch_id
+episode_id
+script_key
+script_version
+script_sha256
+status                  pending | running | succeeded | failed | timeout | invalid_result
+decision                passed | rejected | uncertain
+score
+summary
+result_json
+stdout_excerpt
+stderr_excerpt
+error_message
+duration_ms
+locked_at
+started_at
+finished_at
+created_at
+updated_at
+

+ script_key、script_version 和 script_sha256 是执行快照。即使后续升级内置脚本,历史 job 也能看出当时实际使用的检查逻辑。 +

+
+ +
+

6. 状态规则

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
脚本/执行结果 | Episode QA 状态 | 说明
job 创建或运行中 | qa_running | 用于在列表和详情页提示正在质检。
decision = passed | approved | 同时写 auto_approved = true,可进入云同步。
decision = rejected | rejected | 数据不可用,云同步不放行。
decision = uncertain | needs_inspection | 脚本无法可靠判断,交给人工复核。
failed / timeout / invalid_result | needs_inspection | 执行失败不等于数据坏,只要求人工复核。
人工通过 | inspector_approved | 写入 inspections,可进入云同步。
人工驳回 | rejected | 写入 inspections,不进入云同步。
+

+ 对已经云同步的 Episode 重新质检时,只更新 Keystone 本地 QA 状态和质检记录,不自动删除或撤回云端对象。Synapse 在“包含已同步数据”开关旁提示这个风险。 +

+
+ +
+

7. 内置脚本

+
+

+ 首发固定脚本为 builtin_mcap_preview_smoke_check。它不是完整 MCAP 解析器,而是预览 smoke check:先检查对象是否能拿到有效 size 和必要字节范围,再用边界 magic 检查快速拦截数据预览中出现的这类错误: +

+
Expected MCAP magic '89 4d 43 41 50 30 0d 0a',
+found '2f 06 84 5c 5b ea dc 8b' [library=libmcap 2.1.2]
+

+ MCAP magic 是 8 字节:89 4d 43 41 50 30 0d 0a。MCAP 文件开头有一次 magic,文件结尾也有一次 trailing magic。 + Synapse 预览使用的 @mcap/core 会在初始化时先检查开头 magic,再读取 Header,随后检查结尾 trailing magic。 +

+

+ 因为错误里已经带了 [library=libmcap 2.1.2],说明 Header 大概率已经读成功;这种情况下更可疑的是文件结尾 trailing magic 不匹配,而不是文件开头不匹配。所以默认脚本必须同时检查开头和结尾,不能只查开头。 +

+

+ 实现时优先使用对象存储 range read:读取 offset 0..7 和 size-8..size-1 即可发现这类 magic mismatch。只有当前 S3 client 不方便做 range read 时,才退回下载到临时文件后检查边界字节。 + 边界 magic 匹配只能说明这个 smoke check 通过,不代表 MCAP 内部索引、chunk、CRC 或压缩数据一定可读。 +

+

+ Failed to fetch size: 416 Requested Range Not Satisfiable 也属于这个 smoke check 的覆盖范围。MinIO UI 里 size 显示 - 时,通常表示前端拿不到普通文件对象的有效大小,或者当前路径不是一个可按字节范围读取的 MCAP 对象。该问题发生在 magic 检查之前,应记录为对象读取/size 异常。 +

+ +

判定规则

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
检查项 | 结果 | Episode QA 状态
无法获取对象 size,或 size 为空 / 未知 / 非数字 | decision = rejected | rejected
文件大小小于 16 字节 | decision = rejected | rejected
读取开头或结尾 range 返回 416 Requested Range Not Satisfiable | decision = rejected | rejected
开头 8 字节不是 MCAP magic | decision = rejected | rejected
结尾 8 字节不是 MCAP magic | decision = rejected | rejected
开头和结尾 magic 都匹配 | decision = passed | approved,含义是通过当前内置 smoke check
对象网络超时、权限错误、MinIO 临时错误、脚本异常 | status = failed | needs_inspection
+ +

默认输出示例

+
{
+  "decision": "rejected",
+  "score": 0.0,
+  "summary": "MCAP object range is not satisfiable",
+  "findings": [
+    {
+      "severity": "error",
+      "code": "mcap_range_not_satisfiable",
+      "message": "Failed to read MCAP boundary bytes: 416 Requested Range Not Satisfiable",
+      "http_status": 416,
+      "hint": "MinIO object size is unavailable or the path is not a readable MCAP object"
+    }
+  ]
+}
+
{
+  "decision": "rejected",
+  "score": 0.0,
+  "summary": "MCAP trailing magic mismatch",
+  "findings": [
+    {
+      "severity": "error",
+      "code": "mcap_trailing_magic_mismatch",
+      "message": "Expected trailing magic 89 4d 43 41 50 30 0d 0a, found 2f 06 84 5c 5b ea dc 8b",
+      "expected_hex": "89 4d 43 41 50 30 0d 0a",
+      "actual_hex": "2f 06 84 5c 5b ea dc 8b",
+      "offset": "file_end_minus_8"
+    }
+  ]
+}
+

+ 这个脚本只能证明 MCAP 边界 magic 没有命中已知预览错误,不能证明 MCAP 内部索引、chunk、CRC 或压缩数据一定可读。后续需要更强校验时,再升级为完整 MCAP reader smoke test。 +

+
+
+ +
+

8. 脚本契约

+
+

首发不做用户上传脚本,内置脚本使用轻量输入 JSON。Keystone worker 先尝试读取 MCAP 对象大小、开头 8 字节和结尾 8 字节,再执行 Python:

+
python3 script.py --input input.json --output result.json
+

脚本必须将业务结果写入 --output 指定的 JSON 文件。stdout / stderr 只作为诊断日志保存。

+

输入文件

+
{
+  "episode_id": 42,
+  "mcap_path": "bucket/path/to/file.mcap",
+  "file_size_bytes": 123456789,
+  "object_status": "readable",
+  "object_error": null,
+  "expected_magic_hex": "89 4d 43 41 50 30 0d 0a",
+  "leading_magic_hex": "89 4d 43 41 50 30 0d 0a",
+  "trailing_magic_hex": "2f 06 84 5c 5b ea dc 8b"
+}
+

如果 worker 在获取 size 或 range read 时已经失败,也仍然生成输入 JSON 交给内置脚本输出标准化结果:

+
{
+  "episode_id": 42,
+  "mcap_path": "bucket/path/to/file.mcap",
+  "file_size_bytes": null,
+  "object_status": "range_not_satisfiable",
+  "object_error": "416 Requested Range Not Satisfiable",
+  "expected_magic_hex": "89 4d 43 41 50 30 0d 0a",
+  "leading_magic_hex": null,
+  "trailing_magic_hex": null
+}
+

最小输出

+
{
+  "decision": "passed",
+  "score": 1.0,
+  "summary": "ok",
+  "findings": []
+}
+

字段规则

+ + + + + + + + + + + + + + + + + + + + + + + + + +
字段 | 规则
decision | 必填,只允许 passed、rejected、uncertain。
score | 可选,0 到 1,写入 episodes.qa_score。
summary | 可选字符串,写入 job 摘要;当 rejected / uncertain 时同步到 episodes.quality_flag。
findings | 可选数组,完整保存在 quality_jobs.result_json。
+
    +
  • 内置脚本
  • +
  • 默认超时 30 秒
  • +
  • 默认并发 1
  • +
  • 不安装依赖
  • +
  • 不暴露数据库或 MinIO 凭证
  • +
+
+
+ +
+

9. API 草案

+
+
+

质检中心

+
GET  /api/v1/admin/quality/overview
+GET  /api/v1/admin/quality/batches
+POST /api/v1/admin/quality/batches
+GET  /api/v1/admin/quality/jobs
+
+
+

Episode 质检操作

+
GET  /api/v1/episodes/:id/quality-jobs
+POST /api/v1/episodes/:id/quality-rerun
+POST /api/v1/episodes/:id/quality-decision
+
+
+ +

质检中心概览

+
GET /api/v1/admin/quality/overview
+
+{
+  "script": {
+    "key": "builtin_mcap_preview_smoke_check",
+    "version": "2026.06.01",
+    "expected_magic_hex": "89 4d 43 41 50 30 0d 0a",
+    "checks": ["object_size", "range_read", "leading_magic", "trailing_magic"]
+  },
+  "summary": {
+    "total": 1200,
+    "pending_qa": 20,
+    "qa_running": 4,
+    "approved": 1100,
+    "needs_inspection": 30,
+    "rejected": 46
+  }
+}
+ +

触发全量质检

+
POST /api/v1/admin/quality/batches
+Content-Type: application/json
+
+{
+  "scope": "all",
+  "include_cloud_synced": true
+}
+ +

触发筛选质检

+
POST /api/v1/admin/quality/batches
+Content-Type: application/json
+
+{
+  "scope": "filtered",
+  "filters": {
+    "qa_status": ["pending_qa", "needs_inspection"],
+    "created_at_from": "2026-06-01T00:00:00Z",
+    "created_at_to": "2026-06-02T00:00:00Z",
+    "robot_device_id": "robot-001",
+    "collector_operator_id": "collector-001"
+  }
+}
+ +

人工复核

+
POST /api/v1/episodes/:id/quality-decision
+Content-Type: application/json
+
+{
+  "decision": "approved",
+  "reason": "人工预览 MCAP 后确认可用"
+}
+

所有 /admin/quality/* 和写操作首发只开放给 admin。

+
+ +
+

10. Synapse 页面

+
+
+

导航入口

+
    +
  • AdminSidebar.vue 的“数据运维”分组新增 质检中心
  • +
  • 新增路由 /admin/quality,路由名 AdminQualityCenter
  • +
  • 页面文件建议为 views/admin/quality/QualityCenter.vue
  • +
+
+
+

质检中心首屏

+
    +
  • 顶部汇总:总数据、待质检、质检中、已通过、需复核、已驳回。
  • +
  • 脚本卡片:内置脚本 key、version、检查项、预期 magic、默认超时。
  • +
  • 操作区:质检全部数据、质检筛选结果。
  • +
  • 筛选项复用数据生产统计页面口径:QA 状态、时间范围、设备、采集员。
  • +
+
+
+

列表与详情

+
    +
  • 批次列表展示发起人、范围、总数、完成数、失败数、创建时间。
  • +
  • job 列表展示 Episode、状态、decision、score、summary、耗时。
  • +
  • 点击 Episode 跳转现有 Episode 详情页。
  • +
  • Episode 详情页新增“脚本质检”卡片:最近 job、result JSON、重跑、人工通过/驳回。
  • +
+
+
+

+ 首发页面可以复用现有 ListPageLayout、DataTable、Modal、BaseInput、BaseSelect、BaseTextarea,不新建设计系统组件。 +

+
+ +
+

11. 实施顺序

+
    +
  1. Keystone 新增迁移:quality_batches、quality_jobs。
  2. +
  3. Keystone 新增内置脚本文件或嵌入式脚本常量:builtin_mcap_preview_smoke_check.py
  4. +
  5. Keystone 新增 QualityHandler:overview、batch enqueue、job 列表、Episode 重跑和人工决策。
  6. +
  7. Keystone 新增内置 quality worker:领取 pending job、执行内置脚本、落库结果、更新 Episode QA 状态。
  8. +
  9. 上传完成路径增加自动入队:当 QUALITY_AUTO_RUN_ON_UPLOAD=true 时创建单 Episode job。
  10. +
  11. Synapse 新增 api/quality.js、QualityCenter.vue、路由和侧边栏入口。
  12. +
  13. Synapse Episode 详情页新增脚本质检卡片,接入重跑和人工通过/驳回。
  14. +
  15. 验证:全量触发、筛选触发、size 不可用、range 416、开头 magic 异常、结尾 magic 异常、文件过小、执行异常、云同步资格。
  16. +
+ +

首发验收标准

+
+
    +
  • 管理员能在“质检中心”看到内置脚本 builtin_mcap_preview_smoke_check 的说明。
  • +
  • 管理员能点击一次对全部 Episode 创建质检任务。
  • +
  • 管理员能按筛选条件只质检一部分 Episode。
  • +
  • 对象 size 不可用或 MinIO 显示 size 为 - 时 Episode 变为 rejected,finding 写明 size 异常。
  • +
  • 读取边界字节返回 416 Requested Range Not Satisfiable 时 Episode 变为 rejected,finding 写明 range 异常。
  • +
  • 开头和结尾 magic 都正确时 Episode 变为 approved,表示当前内置 smoke check 通过。
  • +
  • 开头 magic 不匹配时 Episode 变为 rejected,finding 写明实际开头 8 字节。
  • +
  • 结尾 magic 不匹配时 Episode 变为 rejected,finding 写明实际结尾 8 字节。
  • +
  • 脚本超时、异常或对象读取失败后 Episode 变为 needs_inspection
  • +
  • Episode 详情能看到最近一次脚本执行结果,并能人工通过或驳回。
  • +
  • 云同步继续只允许 approved 和 inspector_approved。
  • +
+
+
+
+ + diff --git a/docs/designs/data-quality-script-management.zh.html b/docs/designs/data-quality-script-management.zh.html new file mode 100755 index 0000000..c58ae10 --- /dev/null +++ b/docs/designs/data-quality-script-management.zh.html @@ -0,0 +1,878 @@ + + + + + + + 数据质检脚本管理设计 + + + +
+
+
+

Keystone / Synapse Phase 1

+

数据质检脚本管理设计

+

+ 第一版只做一个可落地的最小闭环:上传 Python 质检脚本,Episode 入库后自动触发独立 Runner 执行,结果写回 QA 状态,并由 QA 状态控制云同步资格。 +

+
+ +
+ + + +
+

1. 一期范围

+
+
+

包含

+
    +
  • 只支持 Python 脚本。
  • +
  • 上传单个 .py 文件,最大 1 MB。
  • +
  • 脚本文件存 MinIO,元数据和执行记录存 MySQL。
  • +
  • 脚本版本不可变,上传后默认不激活。
  • +
  • 管理员显式激活版本;同一脚本只允许一个 active 版本。
  • +
  • 触发范围支持 global 和 sop。
  • +
  • Episode 新建后自动触发质检。
  • +
  • 支持单个 Episode 手动重跑。
  • +
  • 支持从 needs_inspection 或 rejected 人工覆盖。
  • +
  • 固定 Runner Runtime:python3.11-mcap
  • +
+
+
+

不包含

+
    +
  • 不做 UI 在线代码编辑器。
  • +
  • 不支持 ZIP、多文件脚本包、Git 脚本源。
  • +
  • 不支持脚本自带依赖安装。
  • +
  • 不做激活前 test run。
  • +
  • 不做上传时 Python 语法校验。
  • +
  • 不做 job cancel、自动 retry、历史批量回扫。
  • +
  • 不建批次 QA 汇总表。
  • +
  • 不做补采批次,不做任务回退。
  • +
  • 不改 tasks.status、batches.status、orders.status。
  • +
+
+
+
+ +
+

2. 已实现基础

+

以下能力已存在于当前 Keystone / Synapse 体系中,是质检脚本管理的一期实现基础,不需要从零建设。

+
+
+

Keystone 已有基础

+
    +
  • 任务、批次、订单、Episode 等生产数据模型。
  • +
  • 上传完成后创建 Episode,并保存 MCAP / sidecar 的对象路径。
  • +
  • Episode 已有 qa_status、auto_approved、quality_flag 等质检相关字段。
  • +
  • MinIO / S3 存储接入能力,可保存 MCAP、sidecar 和后续脚本 artifact。
  • +
  • 云同步 worker 已按 approved / inspector_approved 过滤可同步 Episode。
  • +
  • JWT 鉴权和 admin / data_collector 角色基础。
  • +
+
+
+

Synapse 已有基础

+
    +
  • Admin 管理后台布局、导航和 CRUD 页面模式。
  • +
  • 通用 API client、分页列表、表单、弹窗和确认对话框组件。
  • +
  • Episode 详情页和数据预览能力。
  • +
  • 任务、批次、统计、云同步等后台页面,可接入 QA 状态展示。
  • +
  • 已有数据生产统计页面,可继续按 episodes.qa_status 聚合。
  • +
+
+
+
+ +
+

3. 待实现模块

+

一期需要新增的是质检脚本管理和独立 Runner 执行闭环,生产主状态机不纳入本次改造。

+
+
+

Keystone 后端

+
    +
  • 新增 quality_scripts、quality_script_versions、quality_runs、quality_run_jobs、quality_overrides 表。
  • +
  • 新增脚本管理、版本上传、激活/停用、run/job 查询 API。
  • +
  • 在 Episode 创建后匹配 active 脚本并创建质检 run/jobs。
  • +
  • 新增手动重跑和人工覆盖 API。
  • +
  • 调整上传完成路径:有脚本时进入 qa_running,无脚本时自动 approved
  • +
+
+
+

Quality Runner

+
    +
  • 新增独立命令或服务 keystone-quality-runner
  • +
  • 从 MySQL 领取 quality_run_jobs
  • +
  • 从 MinIO 下载 MCAP、sidecar 和 Python 脚本。
  • +
  • 按固定命令执行 Python 子进程并限制超时。
  • +
  • 写回 job 结果,并在 run 完成后结算 Episode QA 状态。
  • +
+
+
+

Synapse 前端

+
    +
  • 新增“数据质检”后台入口。
  • +
  • 脚本列表、脚本元数据表单、版本上传和激活/停用页面。
  • +
  • run/job 列表和结果详情。
  • +
  • Episode 详情页增加 QA 面板、手动重跑和人工覆盖入口。
  • +
  • 批次、统计、云同步相关页面展示有效 QA 状态。
  • +
+
+
+
+ +
+

4. 架构

+
+
Synapse Admin
+  -> Keystone REST API
+      -> MySQL: scripts, versions, runs, jobs, overrides
+      -> MinIO: uploaded script files
+
+Keystone upload_complete
+  -> creates episode
+  -> matches active quality script versions
+  -> creates quality run and jobs
+
+keystone-quality-runner
+  -> polls MySQL quality_run_jobs
+  -> downloads MCAP, sidecar, and script from MinIO
+  -> runs Python script in a child process
+  -> writes job result
+  -> settles the parent quality run
+  -> updates episode QA status
+

+ Runner 是独立进程,不放在 Keystone API 进程内执行 Python。推荐同仓库、同发布包,但运行成两个命令: + keystone-edge 和 keystone-quality-runner。 +

+
+
+ +
+

5. Episode QA 状态

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
状态 | 含义
pending_qa | Episode 已创建,但质检还没有开始。
qa_running | 当前质检轮次仍有 pending 或 running job。
approved | 所有匹配脚本都通过,或没有任何匹配脚本。
needs_inspection | 脚本异常、超时、输出非法,或脚本返回 uncertain。
inspector_approved | 管理员人工确认通过。
rejected | 脚本或管理员明确驳回该数据。
+

+ 一期质检系统不写 episodes.qa_status = failed。执行失败不等于数据坏,统一进入 needs_inspection。 +

+

云同步资格仍只认 approved 和 inspector_approved。

+
+ +
+

6. 脚本版本规则

+
+
    +
  • 上传只接受一个 .py 文件,空文件拒绝,最大 1 MB。
  • +
  • 版本号必须是 SemVer:MAJOR.MINOR.PATCH,不带 v 前缀。
  • +
  • Keystone 计算并保存 SHA256。
  • +
  • MinIO 路径使用 slug/version/sha256.py,不信任原始文件名。
  • +
  • 上传版本默认 inactive,不会自动激活。
  • +
  • 激活版本是单独的 admin 操作。
  • +
  • 激活一个版本会停用同一脚本的旧 active 版本。
  • +
  • 已经排队或运行中的 job 继续使用它引用的不可变版本。
  • +
  • 一期不物理删除脚本版本或 MinIO artifact。
  • +
+

+ 版本同时携带执行策略:language、runtime、timeout_seconds、 + scope_type、scope_ref_id、default_config。修改代码、配置、超时或适用范围都需要上传新版本。 +

+
+
+ +
+

7. 数据模型

+

一期使用 5 张质检表,先不拆独立 findings 表;脚本输出里的 findings 存在 quality_run_jobs.result_json

+ +

quality_scripts

+
id
+slug
+name
+description
+status              active | inactive
+created_by
+created_at
+updated_at
+deleted_at
+ +

quality_script_versions

+
id
+script_id
+version
+language            python
+runtime             python3.11-mcap
+entrypoint          normalized uploaded filename
+artifact_uri
+artifact_sha256
+artifact_size_bytes
+timeout_seconds
+scope_type          global | sop
+scope_ref_id        null for global, sop id for sop
+default_config      JSON
+status              active | inactive
+created_by
+created_at
+deleted_at
+ +

quality_runs

+
id
+episode_id
+trigger_type        auto | manual
+triggered_by        system or admin user id/name
+status              pending | running | completed
+final_qa_status
+settlement_reason
+created_at
+started_at
+completed_at
+

同一个 Episode 一期只允许一个 active 质检轮次;如果还有 pending/running job,手动重跑返回 409 Conflict

+ +

quality_run_jobs

+
id
+quality_run_id
+episode_id
+script_version_id
+status              pending | running | succeeded | failed | timeout | invalid_result
+decision            passed | rejected | uncertain
+runner_id
+locked_at
+started_at
+finished_at
+score
+summary
+result_json         full script output, including findings
+stdout_excerpt
+stderr_excerpt
+error_message
+duration_ms
+created_at
+updated_at
+

status = failed 表示 Runner 或 job 执行失败,不表示数据被驳回。

+ +

quality_overrides

+
id
+episode_id
+previous_qa_status
+new_qa_status
+decision            approved | rejected
+reason
+operator_id
+operator_name
+created_at
+

人工覆盖只更新 episodes.qa_status,不改历史 quality_runs 和 quality_run_jobs。

+
+ +
+

8. 自动触发与结算

+
+
+ 1 +
上传完成后 Keystone 创建 Episode,初始写 qa_status = pending_qa
+
+
+ 2 +
匹配所有 active global 版本,以及 SOP 匹配的 active sop 版本。
+
+
+ 3 +
如果没有匹配脚本,直接写 approved 和 auto_approved = true。
+
+
+ 4 +
如果有匹配脚本,创建一个 quality_runs 和多条 quality_run_jobs,Episode 进入 qa_running
+
+
+ 5 +
所有 job 完成后统一结算;rejected 不短路其他脚本。
+
+
+ +

结算规则

+
if any job timed out, failed to execute, produced invalid output, or returned uncertain:
+  episode.qa_status = needs_inspection
+else if any job returned rejected:
+  episode.qa_status = rejected
+else:
+  episode.qa_status = approved
+ +

手动重跑

+
    +
  • 仅 admin 可用。
  • +
  • 使用当前 active 脚本版本。
  • +
  • 不允许临时覆盖配置。
  • +
  • 不使用历史轮次里的旧版本。
  • +
  • 如果 Episode 已有 active QA job,返回 409 Conflict
  • +
+
+ +
+

9. 脚本执行契约

+

Runner 为每个 job 准备临时目录:

+
input.mcap
+sidecar.json
+config.json
+result.json
+script.py
+

执行命令固定为:

+
python script.py --mcap input.mcap --sidecar sidecar.json --config config.json --output result.json
+

业务结果必须写入 --output 指定的 JSON 文件;stdout 和 stderr 只作为日志保存。

+ +

最小输出

+
{
+  "decision": "passed",
+  "score": 1.0,
+  "summary": "ok",
+  "findings": []
+}
+ +

结果字段

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
字段 | 规则
decision | 必填:passed、rejected、uncertain。
score | 可选,0 到 1。
summary | 可选字符串。
findings | 可选数组,保存在 result_json 中。
findings[].severity | info、warning、error。
findings[].message | 必填字符串。
+ +
    +
  • passed:脚本接受数据
  • +
  • rejected:脚本驳回数据
  • +
  • uncertain:脚本无法可靠判断
  • +
+

输出文件缺失、JSON 非法、缺少 decision 或未知 decision,job 状态写为 invalid_result

+
+ +
+

10. Runner 行为

+
+
    +
  • 轮询 MySQL quality_run_jobs
  • +
  • 用事务和行锁领取 job。
  • +
  • runner_id 和 locked_at 标识所有权。
  • +
  • 直接从 MinIO 下载 MCAP、sidecar 和脚本文件。
  • +
  • 以子进程执行 Python。
  • +
  • 执行 timeout_seconds 超时控制。
  • +
  • 截断保存 stdout/stderr。
  • +
  • 不把数据库或 MinIO 凭证传给脚本进程。
  • +
  • 不自动 retry。
  • +
  • stale running job 超过超时加宽限期后视为执行失败。
  • +
+

建议配置项:QUALITY_RUNNER_CONCURRENCY、QUALITY_RUNNER_POLL_INTERVAL_SECONDS。

+
+
+ +
+

11. 人工覆盖

+
+
+

允许来源状态

+
    +
  • needs_inspection
  • +
  • rejected
  • +
+
+
+

允许目标

+
    +
  • approved -> inspector_approved
  • +
  • rejected -> rejected
  • +
+
+
+
    +
  • reason 必填。
  • +
  • pending_qa 或 qa_running 时不允许覆盖。
  • +
  • 覆盖只更新 Episode 的 effective QA 状态。
  • +
  • 覆盖不改脚本执行历史。
  • +
  • 覆盖写入 quality_overrides
  • +
+
POST /api/v1/episodes/:id/quality-override
+Content-Type: application/json
+
+{
+  "decision": "approved",
+  "reason": "manual review confirmed the data is usable"
+}
+
+ +
+

12. API 草案

+
+
+

脚本管理

+
GET   /api/v1/quality/scripts
+POST  /api/v1/quality/scripts
+GET   /api/v1/quality/scripts/:id
+PATCH /api/v1/quality/scripts/:id
+POST  /api/v1/quality/scripts/:id/versions
+GET   /api/v1/quality/scripts/:id/versions
+POST  /api/v1/quality/script-versions/:id/activate
+POST  /api/v1/quality/script-versions/:id/deactivate
+
+
+

执行与复核

+
GET  /api/v1/quality/runs
+GET  /api/v1/quality/jobs
+POST /api/v1/episodes/:id/quality-runs
+POST /api/v1/episodes/:id/quality-override
+
+
+

所有脚本管理、手动重跑和人工覆盖 API 一期都只开放给 admin。

+
+ +
+

13. Synapse 页面范围

+
+
+

脚本管理

+
    +
  • 脚本列表。
  • +
  • 创建和编辑元数据。
  • +
  • 上传脚本版本。
  • +
  • 激活和停用版本。
  • +
+
+
+

执行记录

+
    +
  • run/job 列表。
  • +
  • 按状态、脚本、Episode 筛选。
  • +
  • 查看 stdout/stderr 摘要。
  • +
  • 查看 result_json 中的 findings。
  • +
+
+
+

Episode 详情

+
    +
  • 显示 effective QA 状态。
  • +
  • 显示最新 run 状态。
  • +
  • 展示每个脚本 job 结果。
  • +
  • 支持手动重跑和人工覆盖。
  • +
+
+
+

一期不需要在线代码编辑器。

+
+ +
+

14. 批次和统计查询

+

一期不保存批次 QA 汇总表。批次详情和统计页面需要时直接聚合 episodes.qa_status

+
SELECT
+  COUNT(*) AS total,
+  SUM(qa_status = 'approved') AS approved_count,
+  SUM(qa_status = 'inspector_approved') AS inspector_approved_count,
+  SUM(qa_status = 'rejected') AS rejected_count,
+  SUM(qa_status = 'needs_inspection') AS needs_inspection_count,
+  SUM(qa_status = 'qa_running') AS qa_running_count,
+  SUM(qa_status = 'pending_qa') AS pending_qa_count
+FROM episodes
+WHERE batch_id = ? AND deleted_at IS NULL;
+
+ +
+

15. 固定 Runtime

+
+

一期 Runtime 固定为 python3.11-mcap,脚本不能上传或安装自己的依赖。

+

初始依赖集合

+
    +
  • mcap
  • +
  • numpy
  • +
  • pandas
  • +
  • Pillow
  • +
  • opencv-python-headless
  • +
  • pyyaml
  • +
  • jsonschema
  • +
+
+
+ +
+

16. 后续演进

+
    +
  • 第二阶段支持不可变 ZIP 包,包含 main.py、requirements.lock 和可选 config.schema.json。
  • +
  • 第二阶段在执行前构建受控 Python 环境,不在 job 执行时动态安装依赖。
  • +
  • 第三阶段支持脚本专属容器镜像,用于复杂依赖、模型文件、GPU Runtime 或更强隔离。
  • +
  • 未来执行器继续复用同一套输入和输出契约。
  • +
+
+
+ + diff --git a/internal/api/handlers/robot.go b/internal/api/handlers/robot.go index a497a96..94fe64e 100644 --- a/internal/api/handlers/robot.go +++ b/internal/api/handlers/robot.go @@ -15,6 +15,8 @@ import ( "strconv" "strings" "time" + "unicode" + "unicode/utf8" "archebase.com/keystone-edge/internal/logger" "archebase.com/keystone-edge/internal/services" @@ -134,6 +136,48 @@ func robotMetadataFromDB(ns sql.NullString) interface{} { return parseJSONRaw(ns.String) } +func normalizeAssetID(raw string) (sql.NullString, error) { + value := strings.TrimSpace(raw) + if value == "" { + return sql.NullString{}, nil + } + if utf8.RuneCountInString(value) > 100 { + return sql.NullString{}, fmt.Errorf("asset_id must be at most 100 characters") + } + for _, r := range value { + if unicode.IsControl(r) { + return sql.NullString{}, fmt.Errorf("asset_id must not contain control characters") + } + } + return sql.NullString{String: value, Valid: true}, nil +} + +func assetIDValue(ns sql.NullString) string { + if !ns.Valid { + return "" + } + return strings.TrimSpace(ns.String) +} + +func (h *RobotHandler) assetIDInUse(assetID string, excludeRobotID int64) (bool, error) { + assetID = strings.TrimSpace(assetID) + if assetID == "" { + return false, nil + } + var exists bool + query := "SELECT EXISTS(SELECT 1 FROM robots WHERE asset_id = ? AND deleted_at IS NULL" + args := []interface{}{assetID} + if excludeRobotID > 0 { + query += " AND id <> ?" 
+ args = append(args, excludeRobotID) + } + query += ")" + if err := h.db.Get(&exists, query, args...); err != nil { + return false, err + } + return exists, nil +} + func (h *RobotHandler) connectionState(deviceID string) (connected bool, connectedAt string) { connected, connectedAt, _, _ = h.connectionStateDetailed(deviceID) return connected, connectedAt @@ -462,6 +506,11 @@ func (h *RobotHandler) CreateRobot(c *gin.Context) { req.RobotTypeID = strings.TrimSpace(req.RobotTypeID) req.DeviceID = strings.TrimSpace(req.DeviceID) req.FactoryID = strings.TrimSpace(req.FactoryID) + assetID, err := normalizeAssetID(req.AssetID) + if err != nil { + c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()}) + return + } if req.RobotTypeID == "" { c.JSON(http.StatusBadRequest, gin.H{"error": "robot_type_id is required"}) @@ -477,6 +526,18 @@ func (h *RobotHandler) CreateRobot(c *gin.Context) { c.JSON(http.StatusBadRequest, gin.H{"error": "factory_id is required"}) return } + if assetID.Valid { + inUse, err := h.assetIDInUse(assetID.String, 0) + if err != nil { + logger.Printf("[ROBOT] Failed to check asset_id uniqueness: %v", err) + c.JSON(http.StatusInternalServerError, gin.H{"error": "failed to create robot"}) + return + } + if inUse { + c.JSON(http.StatusConflict, gin.H{"error": "asset_id is already assigned to another robot"}) + return + } + } // Parse robot_type_id as numeric value robotTypeID, err := strconv.ParseInt(req.RobotTypeID, 10, 64) @@ -509,11 +570,6 @@ func (h *RobotHandler) CreateRobot(c *gin.Context) { now := time.Now().UTC() - var assetIDStr sql.NullString - if a := strings.TrimSpace(req.AssetID); a != "" { - assetIDStr = sql.NullString{String: a, Valid: true} - } - metadataStr := sql.NullString{String: "{}", Valid: true} if req.Metadata != nil { metadataJSON, err := json.Marshal(req.Metadata) @@ -539,7 +595,7 @@ func (h *RobotHandler) CreateRobot(c *gin.Context) { robotTypeID, req.DeviceID, factoryID, - assetIDStr, + assetID, "active", metadataStr, now, 
@@ -677,7 +733,7 @@ type UpdateRobotRequest struct { RobotTypeID *string `json:"robot_type_id,omitempty"` DeviceID *string `json:"device_id,omitempty"` FactoryID *string `json:"factory_id,omitempty"` - AssetID *string `json:"asset_id,omitempty"` + AssetID json.RawMessage `json:"asset_id,omitempty" swaggertype:"string"` Status *string `json:"status,omitempty"` Metadata json.RawMessage `json:"metadata,omitempty" swaggertype:"object"` } @@ -710,13 +766,19 @@ func (h *RobotHandler) UpdateRobot(c *gin.Context) { return } - // Check if robot exists - var exists bool - err = h.db.Get(&exists, "SELECT EXISTS(SELECT 1 FROM robots WHERE id = ? AND deleted_at IS NULL)", id) - if err != nil || !exists { + var current struct { + AssetID sql.NullString `db:"asset_id"` + } + err = h.db.Get(¤t, "SELECT asset_id FROM robots WHERE id = ? AND deleted_at IS NULL", id) + if err == sql.ErrNoRows { c.JSON(http.StatusNotFound, gin.H{"error": "robot not found"}) return } + if err != nil { + logger.Printf("[ROBOT] Failed to query robot: %v", err) + c.JSON(http.StatusInternalServerError, gin.H{"error": "failed to update robot"}) + return + } // Validate status if provided validStatuses := map[string]bool{ @@ -760,6 +822,47 @@ func (h *RobotHandler) UpdateRobot(c *gin.Context) { args = append(args, deviceID) } + if len(req.AssetID) > 0 { + var rawAssetID string + meta := bytes.TrimSpace(req.AssetID) + if bytes.Equal(meta, []byte("null")) { + rawAssetID = "" + } else if err := json.Unmarshal(req.AssetID, &rawAssetID); err != nil { + c.JSON(http.StatusBadRequest, gin.H{"error": "asset_id must be a string or null"}) + return + } + assetID, err := normalizeAssetID(rawAssetID) + if err != nil { + c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()}) + return + } + currentAssetID := assetIDValue(current.AssetID) + if currentAssetID != "" { + if !assetID.Valid { + c.JSON(http.StatusBadRequest, gin.H{"error": "asset_id cannot be cleared once set"}) + return + } + if assetID.String != 
currentAssetID { + c.JSON(http.StatusBadRequest, gin.H{"error": "asset_id cannot be changed once set"}) + return + } + } + if assetID.Valid && assetID.String != currentAssetID { + inUse, err := h.assetIDInUse(assetID.String, id) + if err != nil { + logger.Printf("[ROBOT] Failed to check asset_id uniqueness: %v", err) + c.JSON(http.StatusInternalServerError, gin.H{"error": "failed to update robot"}) + return + } + if inUse { + c.JSON(http.StatusConflict, gin.H{"error": "asset_id is already assigned to another robot"}) + return + } + } + updates = append(updates, "asset_id = ?") + args = append(args, assetID) + } + if req.FactoryID != nil { if *req.FactoryID == "" { c.JSON(http.StatusBadRequest, gin.H{"error": "factory_id cannot be empty"}) @@ -781,16 +884,6 @@ func (h *RobotHandler) UpdateRobot(c *gin.Context) { args = append(args, parsedFactoryID) } - if req.AssetID != nil { - trimmed := strings.TrimSpace(*req.AssetID) - var a sql.NullString - if trimmed != "" { - a = sql.NullString{String: trimmed, Valid: true} - } - updates = append(updates, "asset_id = ?") - args = append(args, a) - } - if req.Status != nil { status := strings.TrimSpace(*req.Status) if !validStatuses[status] { diff --git a/internal/api/handlers/robot_test.go b/internal/api/handlers/robot_test.go index 02777b1..fa7c1e1 100644 --- a/internal/api/handlers/robot_test.go +++ b/internal/api/handlers/robot_test.go @@ -5,9 +5,12 @@ package handlers import ( + "bytes" + "database/sql" "encoding/json" "net/http" "net/http/httptest" + "strings" "testing" "time" @@ -262,6 +265,164 @@ func TestRobotHandlerListRobots_ConnectedFilterUsesHubIntersection(t *testing.T) }) } +func TestRobotHandlerAssetID_CreateUpdateAndList(t *testing.T) { + db := newTestRobotHandlerDB(t) + defer db.Close() + seedRobotLookups(t, db) + + r := newTestRobotRouter(t, db) + + req := httptest.NewRequest(http.MethodPost, "/api/v1/robots", bytes.NewBufferString(`{ + "robot_type_id": "10", + "device_id": "local-device-1", + "asset_id": " 
asset-1 ", + "factory_id": "30" + }`)) + req.Header.Set("Content-Type", "application/json") + w := httptest.NewRecorder() + r.ServeHTTP(w, req) + + if w.Code != http.StatusCreated { + t.Fatalf("create status=%d want=%d body=%s", w.Code, http.StatusCreated, w.Body.String()) + } + var created CreateRobotResponse + if err := json.Unmarshal(w.Body.Bytes(), &created); err != nil { + t.Fatalf("unmarshal create response: %v", err) + } + if created.AssetID != "asset-1" { + t.Fatalf("created asset_id=%v want asset-1", created.AssetID) + } + + req = httptest.NewRequest(http.MethodGet, "/api/v1/robots", nil) + w = httptest.NewRecorder() + r.ServeHTTP(w, req) + if w.Code != http.StatusOK { + t.Fatalf("list status=%d want=%d body=%s", w.Code, http.StatusOK, w.Body.String()) + } + var list RobotListResponse + if err := json.Unmarshal(w.Body.Bytes(), &list); err != nil { + t.Fatalf("unmarshal list response: %v", err) + } + if len(list.Items) != 1 || list.Items[0].AssetID != "asset-1" { + t.Fatalf("list asset_id response=%#v", list) + } + + req = httptest.NewRequest(http.MethodPut, "/api/v1/robots/"+created.ID, bytes.NewBufferString(`{"asset_id":"asset-1"}`)) + req.Header.Set("Content-Type", "application/json") + w = httptest.NewRecorder() + r.ServeHTTP(w, req) + if w.Code != http.StatusOK { + t.Fatalf("same-value update status=%d want=%d body=%s", w.Code, http.StatusOK, w.Body.String()) + } +} + +func TestRobotHandlerAssetID_ImmutableOnceSet(t *testing.T) { + db := newTestRobotHandlerDB(t) + defer db.Close() + seedRobotLookups(t, db) + seedRobot(t, db, 1, "local-device-1", "asset-1", nil) + + r := newTestRobotRouter(t, db) + + for _, tt := range []struct { + name string + body string + }{ + {name: "change rejected", body: `{"asset_id":"asset-2"}`}, + {name: "clear rejected", body: `{"asset_id":""}`}, + {name: "blank clear rejected", body: `{"asset_id":" "}`}, + } { + t.Run(tt.name, func(t *testing.T) { + req := httptest.NewRequest(http.MethodPut, "/api/v1/robots/1", 
bytes.NewBufferString(tt.body)) + req.Header.Set("Content-Type", "application/json") + w := httptest.NewRecorder() + r.ServeHTTP(w, req) + if w.Code != http.StatusBadRequest { + t.Fatalf("status=%d want=%d body=%s", w.Code, http.StatusBadRequest, w.Body.String()) + } + }) + } + + req := httptest.NewRequest(http.MethodPut, "/api/v1/robots/1", bytes.NewBufferString(`{"asset_id":null}`)) + req.Header.Set("Content-Type", "application/json") + w := httptest.NewRecorder() + r.ServeHTTP(w, req) + if w.Code != http.StatusBadRequest { + t.Fatalf("null clear status=%d want=%d body=%s", w.Code, http.StatusBadRequest, w.Body.String()) + } +} + +func TestRobotHandlerAssetID_UniqueAmongActiveRobots(t *testing.T) { + db := newTestRobotHandlerDB(t) + defer db.Close() + seedRobotLookups(t, db) + seedRobot(t, db, 1, "local-device-1", "asset-1", nil) + deletedAt := time.Now().UTC() + seedRobot(t, db, 2, "deleted-device", "deleted-asset", &deletedAt) + + r := newTestRobotRouter(t, db) + + req := httptest.NewRequest(http.MethodPost, "/api/v1/robots", bytes.NewBufferString(`{ + "robot_type_id": "10", + "device_id": "local-device-2", + "asset_id": "asset-1", + "factory_id": "30" + }`)) + req.Header.Set("Content-Type", "application/json") + w := httptest.NewRecorder() + r.ServeHTTP(w, req) + if w.Code != http.StatusConflict { + t.Fatalf("duplicate create status=%d want=%d body=%s", w.Code, http.StatusConflict, w.Body.String()) + } + + req = httptest.NewRequest(http.MethodPost, "/api/v1/robots", bytes.NewBufferString(`{ + "robot_type_id": "10", + "device_id": "local-device-3", + "asset_id": "deleted-asset", + "factory_id": "30" + }`)) + req.Header.Set("Content-Type", "application/json") + w = httptest.NewRecorder() + r.ServeHTTP(w, req) + if w.Code != http.StatusCreated { + t.Fatalf("soft-deleted reuse status=%d want=%d body=%s", w.Code, http.StatusCreated, w.Body.String()) + } +} + +func TestRobotHandlerAssetID_Validation(t *testing.T) { + db := newTestRobotHandlerDB(t) + defer db.Close() 
+ seedRobotLookups(t, db) + + r := newTestRobotRouter(t, db) + + req := httptest.NewRequest(http.MethodPost, "/api/v1/robots", bytes.NewBufferString("{\n"+ + `"robot_type_id":"10",`+ + `"device_id":"local-device-1",`+ + `"factory_id":"30",`+ + `"asset_id":"asset\u0001id"}`)) + req.Header.Set("Content-Type", "application/json") + w := httptest.NewRecorder() + r.ServeHTTP(w, req) + if w.Code != http.StatusBadRequest { + t.Fatalf("control char status=%d want=%d body=%s", w.Code, http.StatusBadRequest, w.Body.String()) + } + + longID := strings.Repeat("a", 101) + req = httptest.NewRequest(http.MethodPost, "/api/v1/robots", bytes.NewBufferString(`{ + "robot_type_id": "10", + "device_id": "local-device-2", + "factory_id": "30", + "asset_id": "`+longID+`" + }`)) + req.Header.Set("Content-Type", "application/json") + w = httptest.NewRecorder() + r.ServeHTTP(w, req) + if w.Code != http.StatusBadRequest { + t.Fatalf("long id status=%d want=%d body=%s", w.Code, http.StatusBadRequest, w.Body.String()) + } +} + func newTestRobotRouter(t *testing.T, db *sqlx.DB) *gin.Engine { t.Helper() return newTestRobotRouterWithHubs(t, db, nil, nil) @@ -306,8 +467,8 @@ func newTestRobotHandlerDB(t *testing.T) *sqlx.DB { id INTEGER PRIMARY KEY, robot_type_id INTEGER NOT NULL, device_id TEXT NOT NULL, - factory_id INTEGER NOT NULL, asset_id TEXT, + factory_id INTEGER NOT NULL, status TEXT NOT NULL, metadata TEXT, created_at TIMESTAMP, @@ -336,3 +497,32 @@ func newTestRobotHandlerDB(t *testing.T) *sqlx.DB { return db } + +func seedRobotLookups(t *testing.T, db *sqlx.DB) { + t.Helper() + if _, err := db.Exec(`INSERT INTO robot_types (id, name, model, deleted_at) VALUES (10, 'Arm Type', 'Model-A', NULL)`); err != nil { + t.Fatalf("seed robot type: %v", err) + } + if _, err := db.Exec(`INSERT INTO factories (id, name, slug, deleted_at) VALUES (30, 'Factory 30', 'fac-30', NULL)`); err != nil { + t.Fatalf("seed factory: %v", err) + } +} + +func seedRobot(t *testing.T, db *sqlx.DB, id int64, deviceID 
string, assetID string, deletedAt *time.Time) { + t.Helper() + var asset sql.NullString + if strings.TrimSpace(assetID) != "" { + asset = sql.NullString{String: strings.TrimSpace(assetID), Valid: true} + } + var deleted sql.NullTime + if deletedAt != nil { + deleted = sql.NullTime{Time: *deletedAt, Valid: true} + } + now := time.Now().UTC() + if _, err := db.Exec(` + INSERT INTO robots (id, robot_type_id, device_id, asset_id, factory_id, status, created_at, updated_at, deleted_at) + VALUES (?, 10, ?, ?, 30, 'active', ?, ?, ?) + `, id, deviceID, asset, now, now, deleted); err != nil { + t.Fatalf("seed robot %d: %v", id, err) + } +} diff --git a/internal/api/handlers/sync.go b/internal/api/handlers/sync.go index 9d1694c..eadd585 100644 --- a/internal/api/handlers/sync.go +++ b/internal/api/handlers/sync.go @@ -32,6 +32,7 @@ func NewSyncHandler(db *sqlx.DB, syncWorker *services.SyncWorker) *SyncHandler { // RegisterRoutes registers cloud sync related routes. func (h *SyncHandler) RegisterRoutes(apiV1 *gin.RouterGroup) { apiV1.POST("/sync/episodes", h.TriggerBatchSync) + apiV1.POST("/sync/episodes/:id/resync", h.TriggerEpisodeResync) apiV1.POST("/sync/episodes/:id", h.TriggerEpisodeSync) apiV1.GET("/sync/episodes", h.ListSyncJobs) apiV1.GET("/sync/episodes/summary", h.ListEpisodeSyncSummaries) @@ -40,6 +41,103 @@ func (h *SyncHandler) RegisterRoutes(apiV1 *gin.RouterGroup) { apiV1.GET("/sync/config", h.GetSyncConfig) } +type syncEpisodeActionRow struct { + QaStatus string `db:"qa_status"` + CloudSynced bool `db:"cloud_synced"` +} + +func (h *SyncHandler) loadSyncEpisodeForAction(c *gin.Context, episodeID int64) (syncEpisodeActionRow, bool) { + var row syncEpisodeActionRow + err := h.db.Get(&row, "SELECT qa_status, cloud_synced FROM episodes WHERE id = ? 
AND deleted_at IS NULL", episodeID) + if err == sql.ErrNoRows { + c.JSON(http.StatusNotFound, gin.H{"error": "episode not found"}) + return row, false + } + if err != nil { + logger.Printf("[SYNC] Failed to query episode %d: %v", episodeID, err) + c.JSON(http.StatusInternalServerError, gin.H{"error": "failed to query episode"}) + return row, false + } + return row, true +} + +func (h *SyncHandler) enqueueSyncErrorResponse(c *gin.Context, episodeID int64, err error) { + switch { + case errors.Is(err, services.ErrSyncWorkerNotRunning): + c.JSON(http.StatusServiceUnavailable, gin.H{ + "error": err.Error(), + "episode_id": episodeID, + "status": "worker_not_running", + }) + case errors.Is(err, services.ErrEpisodeAlreadyEnqueued), errors.Is(err, services.ErrSyncAlreadyInProgress): + c.JSON(http.StatusConflict, gin.H{ + "error": err.Error(), + "episode_id": episodeID, + "status": "already_queued", + }) + case errors.Is(err, services.ErrSyncQueueFull): + c.JSON(http.StatusTooManyRequests, gin.H{ + "error": err.Error(), + "episode_id": episodeID, + "status": "queue_full", + }) + default: + logger.Printf("[SYNC] Enqueue episode %d failed: %v", episodeID, err) + c.JSON(http.StatusInternalServerError, gin.H{"error": "failed to enqueue episode"}) + } +} + +// TriggerEpisodeResync queues a new cloud upload for an already-synced episode. 
+// +// @Summary Resync episode to cloud +// @Description Enqueues a new cloud upload for an already-synced episode without clearing previous sync history +// @Tags sync +// @Produce json +// @Param id path int true "Episode ID" +// @Success 202 {object} map[string]interface{} +// @Failure 400 {object} map[string]string +// @Failure 404 {object} map[string]string +// @Failure 409 {object} map[string]string +// @Failure 500 {object} map[string]string +// @Router /sync/episodes/{id}/resync [post] +func (h *SyncHandler) TriggerEpisodeResync(c *gin.Context) { + if h.syncWorker == nil { + c.JSON(http.StatusServiceUnavailable, gin.H{"error": "sync worker is not configured"}) + return + } + + episodeID, ok := parseEpisodeIDParam(c) + if !ok { + return + } + + row, ok := h.loadSyncEpisodeForAction(c, episodeID) + if !ok { + return + } + if row.QaStatus != "approved" && row.QaStatus != "inspector_approved" { + c.JSON(http.StatusBadRequest, gin.H{ + "error": fmt.Sprintf("episode qa_status is %q, must be approved or inspector_approved", row.QaStatus), + }) + return + } + if !row.CloudSynced { + c.JSON(http.StatusBadRequest, gin.H{"error": "episode has not completed cloud sync; use normal sync instead"}) + return + } + + if err := h.syncWorker.EnqueueEpisodeResync(c.Request.Context(), episodeID); err != nil { + h.enqueueSyncErrorResponse(c, episodeID, err) + return + } + + c.JSON(http.StatusAccepted, gin.H{ + "status": "accepted", + "episode_id": episodeID, + "message": "episode enqueued for cloud resync", + }) +} + // syncLogRow represents a row from the sync_logs table. 
type syncLogRow struct { ID int64 `db:"id"` diff --git a/internal/api/handlers/transfer.go b/internal/api/handlers/transfer.go index 1e70ce5..3ebb650 100644 --- a/internal/api/handlers/transfer.go +++ b/internal/api/handlers/transfer.go @@ -391,6 +391,30 @@ func readSidecarFromS3(ctx context.Context, s3Client *s3.Client, bucket, jsonKey return &sc } +func assetIDSnapshotMetadata(ctx context.Context, tx *sql.Tx, workstationID sql.NullInt64) sql.NullString { + if tx == nil || !workstationID.Valid || workstationID.Int64 <= 0 { + return sql.NullString{} + } + var assetID sql.NullString + err := tx.QueryRowContext(ctx, ` + SELECT r.asset_id + FROM workstations ws + LEFT JOIN robots r ON r.id = ws.robot_id AND r.deleted_at IS NULL + WHERE ws.id = ? AND ws.deleted_at IS NULL + LIMIT 1 + `, workstationID.Int64).Scan(&assetID) + if err != nil || !assetID.Valid || strings.TrimSpace(assetID.String) == "" { + return sql.NullString{} + } + data, err := json.Marshal(map[string]string{ + "asset_id": strings.TrimSpace(assetID.String), + }) + if err != nil { + return sql.NullString{} + } + return sql.NullString{String: string(data), Valid: true} +} + func uploadCompleteS3Key(data map[string]interface{}) string { return strings.TrimSpace(stringVal(data, "s3_key")) } @@ -617,6 +641,7 @@ func (h *TransferHandler) onUploadComplete(ctx context.Context, dc *services.Tra checksum = sql.NullString{String: sc.Recording.ChecksumSHA256, Valid: true} } } + episodeMetadata := assetIDSnapshotMetadata(ctx, tx, taskRow.WorkstationID) _, dbErr := tx.ExecContext(ctx, `INSERT INTO episodes ( @@ -635,8 +660,9 @@ func (h *TransferHandler) onUploadComplete(ctx context.Context, dc *services.Tra duration_sec, file_size_bytes, checksum, - qa_status - ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`, + qa_status, + metadata + ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`, episodeID, taskRow.ID, taskRow.BatchID, @@ -653,6 +679,7 @@ func (h *TransferHandler) onUploadComplete(ctx 
context.Context, dc *services.Tra fileSizeBytes, checksum, "approved", + episodeMetadata, ) if dbErr != nil { // #nosec G706 -- Set aside for now diff --git a/internal/api/handlers/transfer_asset_id_snapshot_test.go b/internal/api/handlers/transfer_asset_id_snapshot_test.go new file mode 100644 index 0000000..9d71be7 --- /dev/null +++ b/internal/api/handlers/transfer_asset_id_snapshot_test.go @@ -0,0 +1,95 @@ +// SPDX-FileCopyrightText: 2026 ArcheBase +// +// SPDX-License-Identifier: MulanPSL-2.0 + +package handlers + +import ( + "context" + "database/sql" + "encoding/json" + "testing" + + _ "modernc.org/sqlite" +) + +func TestAssetIDSnapshotMetadata_WritesWhenRobotHasAssetID(t *testing.T) { + db, err := sql.Open("sqlite", ":memory:") + if err != nil { + t.Fatalf("open sqlite db: %v", err) + } + defer db.Close() + + createAssetIDSnapshotSchema(t, db) + if _, err := db.Exec(`INSERT INTO robots (id, asset_id, deleted_at) VALUES (1, ' asset-1 ', NULL)`); err != nil { + t.Fatalf("seed robot: %v", err) + } + if _, err := db.Exec(`INSERT INTO workstations (id, robot_id, deleted_at) VALUES (10, 1, NULL)`); err != nil { + t.Fatalf("seed workstation: %v", err) + } + + tx, err := db.BeginTx(context.Background(), nil) + if err != nil { + t.Fatalf("begin tx: %v", err) + } + defer tx.Rollback() + + got := assetIDSnapshotMetadata(context.Background(), tx, sql.NullInt64{Int64: 10, Valid: true}) + if !got.Valid { + t.Fatal("metadata was not written") + } + var decoded map[string]string + if err := json.Unmarshal([]byte(got.String), &decoded); err != nil { + t.Fatalf("unmarshal metadata: %v", err) + } + if decoded["asset_id"] != "asset-1" { + t.Fatalf("asset_id=%q want asset-1", decoded["asset_id"]) + } +} + +func TestAssetIDSnapshotMetadata_MissingDoesNotFailEpisodeCreationPath(t *testing.T) { + db, err := sql.Open("sqlite", ":memory:") + if err != nil { + t.Fatalf("open sqlite db: %v", err) + } + defer db.Close() + + createAssetIDSnapshotSchema(t, db) + if _, err := 
db.Exec(`INSERT INTO robots (id, asset_id, deleted_at) VALUES (1, NULL, NULL)`); err != nil { + t.Fatalf("seed robot: %v", err) + } + if _, err := db.Exec(`INSERT INTO workstations (id, robot_id, deleted_at) VALUES (10, 1, NULL)`); err != nil { + t.Fatalf("seed workstation: %v", err) + } + + tx, err := db.BeginTx(context.Background(), nil) + if err != nil { + t.Fatalf("begin tx: %v", err) + } + defer tx.Rollback() + + got := assetIDSnapshotMetadata(context.Background(), tx, sql.NullInt64{Int64: 10, Valid: true}) + if got.Valid { + t.Fatalf("metadata valid=%t value=%q, want NULL", got.Valid, got.String) + } +} + +func createAssetIDSnapshotSchema(t *testing.T, db *sql.DB) { + t.Helper() + for _, stmt := range []string{ + `CREATE TABLE robots ( + id INTEGER PRIMARY KEY, + asset_id TEXT, + deleted_at TIMESTAMP NULL + )`, + `CREATE TABLE workstations ( + id INTEGER PRIMARY KEY, + robot_id INTEGER, + deleted_at TIMESTAMP NULL + )`, + } { + if _, err := db.Exec(stmt); err != nil { + t.Fatalf("create schema: %v", err) + } + } +} diff --git a/internal/cloud/cloudpb/data_gateway.pb.go b/internal/cloud/cloudpb/data_gateway.pb.go index fb89247..2b98e31 100644 --- a/internal/cloud/cloudpb/data_gateway.pb.go +++ b/internal/cloud/cloudpb/data_gateway.pb.go @@ -870,6 +870,7 @@ type CompleteUploadRequest struct { RawTags map[string]string `protobuf:"bytes,3,rep,name=raw_tags,json=rawTags,proto3" json:"raw_tags,omitempty" protobuf_key:"bytes,1,opt,name=key" protobuf_val:"bytes,2,opt,name=value"` CompletedPartCount int32 `protobuf:"varint,4,opt,name=completed_part_count,json=completedPartCount,proto3" json:"completed_part_count,omitempty"` OssObjectEtag string `protobuf:"bytes,5,opt,name=oss_object_etag,json=ossObjectEtag,proto3" json:"oss_object_etag,omitempty"` + PartSizeBytes int64 `protobuf:"varint,6,opt,name=part_size_bytes,json=partSizeBytes,proto3" json:"part_size_bytes,omitempty"` unknownFields protoimpl.UnknownFields sizeCache protoimpl.SizeCache } @@ -939,6 +940,13 @@ func 
(x *CompleteUploadRequest) GetOssObjectEtag() string { return "" } +func (x *CompleteUploadRequest) GetPartSizeBytes() int64 { + if x != nil { + return x.PartSizeBytes + } + return 0 +} + type CompleteUploadResponse struct { state protoimpl.MessageState `protogen:"open.v1"` unknownFields protoimpl.UnknownFields @@ -1287,13 +1295,14 @@ const file_data_gateway_proto_rawDesc = "" + "\x06reason\x18\x02 \x01(\tR\x06reason\"^\n" + "\x13AbortUploadResponse\x12*\n" + "\x11logical_upload_id\x18\x01 \x01(\tR\x0flogicalUploadId\x12\x1b\n" + - "\tupload_id\x18\x02 \x01(\tR\buploadId\"\xc1\x02\n" + + "\tupload_id\x18\x02 \x01(\tR\buploadId\"\xe9\x02\n" + "\x15CompleteUploadRequest\x12\x1b\n" + "\tupload_id\x18\x01 \x01(\tR\buploadId\x12\x1b\n" + "\tfile_size\x18\x02 \x01(\x03R\bfileSize\x12X\n" + "\braw_tags\x18\x03 \x03(\v2=.archebase.data_gateway.v1.CompleteUploadRequest.RawTagsEntryR\arawTags\x120\n" + "\x14completed_part_count\x18\x04 \x01(\x05R\x12completedPartCount\x12&\n" + - "\x0foss_object_etag\x18\x05 \x01(\tR\rossObjectEtag\x1a:\n" + + "\x0foss_object_etag\x18\x05 \x01(\tR\rossObjectEtag\x12&\n" + + "\x0fpart_size_bytes\x18\x06 \x01(\x03R\rpartSizeBytes\x1a:\n" + "\fRawTagsEntry\x12\x10\n" + "\x03key\x18\x01 \x01(\tR\x03key\x12\x14\n" + "\x05value\x18\x02 \x01(\tR\x05value:\x028\x01\"\x18\n" + diff --git a/internal/cloud/cloudpb/proto/data_gateway.proto b/internal/cloud/cloudpb/proto/data_gateway.proto index e3f180c..a5be7a4 100644 --- a/internal/cloud/cloudpb/proto/data_gateway.proto +++ b/internal/cloud/cloudpb/proto/data_gateway.proto @@ -111,6 +111,7 @@ message CompleteUploadRequest { map raw_tags = 3; int32 completed_part_count = 4; string oss_object_etag = 5; + int64 part_size_bytes = 6; } message CompleteUploadResponse {} diff --git a/internal/cloud/gateway_client.go b/internal/cloud/gateway_client.go index c91d238..5cc842d 100644 --- a/internal/cloud/gateway_client.go +++ b/internal/cloud/gateway_client.go @@ -209,7 +209,7 @@ func (c *GatewayClient) 
AbortUpload(ctx context.Context, logicalUploadID string, } // CompleteUpload notifies the data-gateway that all parts have been uploaded to OSS. -func (c *GatewayClient) CompleteUpload(ctx context.Context, uploadID string, fileSize int64, rawTags map[string]string, completedPartCount int32, ossObjectEtag string) error { +func (c *GatewayClient) CompleteUpload(ctx context.Context, uploadID string, fileSize int64, rawTags map[string]string, completedPartCount int32, ossObjectEtag string, partSizeBytes int64) error { authHeader, err := c.getAuthHeader(ctx) if err != nil { return err @@ -226,6 +226,7 @@ func (c *GatewayClient) CompleteUpload(ctx context.Context, uploadID string, fil RawTags: rawTags, CompletedPartCount: completedPartCount, OssObjectEtag: ossObjectEtag, + PartSizeBytes: partSizeBytes, }) return rpcErr }) diff --git a/internal/cloud/gateway_client_test.go b/internal/cloud/gateway_client_test.go new file mode 100644 index 0000000..e8ab36a --- /dev/null +++ b/internal/cloud/gateway_client_test.go @@ -0,0 +1,83 @@ +// SPDX-FileCopyrightText: 2026 ArcheBase +// +// SPDX-License-Identifier: MulanPSL-2.0 + +package cloud + +import ( + "context" + "net" + "testing" + "time" + + pb "archebase.com/keystone-edge/internal/cloud/cloudpb" + "google.golang.org/grpc" + "google.golang.org/grpc/credentials/insecure" + "google.golang.org/grpc/test/bufconn" +) + +type completeUploadCaptureServer struct { + pb.UnimplementedDataGatewayServiceServer + req *pb.CompleteUploadRequest +} + +func (s *completeUploadCaptureServer) CompleteUpload(_ context.Context, req *pb.CompleteUploadRequest) (*pb.CompleteUploadResponse, error) { + s.req = req + return &pb.CompleteUploadResponse{}, nil +} + +func TestGatewayClientCompleteUploadSendsPartSizeBytes(t *testing.T) { + listener := bufconn.Listen(1024 * 1024) + server := grpc.NewServer() + capture := &completeUploadCaptureServer{} + pb.RegisterDataGatewayServiceServer(server, capture) + go func() { + if err := server.Serve(listener); err 
!= nil { + t.Logf("bufconn server exited: %v", err) + } + }() + t.Cleanup(func() { + server.Stop() + _ = listener.Close() + }) + + ctx, cancel := context.WithTimeout(context.Background(), time.Second) + defer cancel() + conn, err := grpc.DialContext(ctx, "bufnet", //nolint:staticcheck // bufconn tests still use DialContext. + grpc.WithContextDialer(func(context.Context, string) (net.Conn, error) { + return listener.Dial() + }), + grpc.WithTransportCredentials(insecure.NewCredentials()), + ) + if err != nil { + t.Fatalf("dial bufconn: %v", err) + } + t.Cleanup(func() { _ = conn.Close() }) + + authClient := &AuthClient{ + token: &AuthToken{ + AccessToken: "test-token", + ExpiresAt: time.Now().Add(time.Hour), + }, + } + client := &GatewayClient{ + cfg: GatewayClientConfig{ + RequestTimeout: time.Second, + }, + authClient: authClient, + conn: conn, + } + + if err := client.CompleteUpload(ctx, "upload-1", 1234, map[string]string{"k": "v"}, 2, `"etag"`, 8*1024*1024); err != nil { + t.Fatalf("CompleteUpload() error = %v", err) + } + if capture.req == nil { + t.Fatal("CompleteUpload request was not captured") + } + if capture.req.PartSizeBytes != 8*1024*1024 { + t.Fatalf("PartSizeBytes=%d want %d", capture.req.PartSizeBytes, 8*1024*1024) + } + if capture.req.RawTags["k"] != "v" { + t.Fatalf("RawTags=%+v", capture.req.RawTags) + } +} diff --git a/internal/cloud/uploader.go b/internal/cloud/uploader.go index e7cd01c..8d746da 100644 --- a/internal/cloud/uploader.go +++ b/internal/cloud/uploader.go @@ -13,6 +13,7 @@ import ( "math" "os" "path/filepath" + "strings" "time" pb "archebase.com/keystone-edge/internal/cloud/cloudpb" @@ -42,6 +43,8 @@ type UploadRequest struct { EpisodeID string // McapKey is the MinIO object key for the MCAP file (without bucket prefix). McapKey string + // AssetID is the Data Platform device id used for this upload. + AssetID string // RawTags are arbitrary key-value tags passed to the data-gateway. 
RawTags map[string]string // ClientHints are passed to CreateLogicalUpload for server-side routing. @@ -69,7 +72,9 @@ type persistedUploadState struct { Endpoint string `json:"endpoint"` ObjectKey string `json:"object_key"` McapKey string `json:"mcap_key"` + AssetID string `json:"asset_id"` FileSize int64 `json:"file_size"` + PartSizeBytes int64 `json:"part_size_bytes,omitempty"` UpdatedAt time.Time `json:"updated_at"` } @@ -99,7 +104,7 @@ type gatewayClient interface { GetUploadRecovery(ctx context.Context, logicalUploadID string) (*UploadRecoveryInfo, error) ReissueUploadCredentials(ctx context.Context, uploadID string) (*UploadSession, error) AbortUpload(ctx context.Context, logicalUploadID string, reason string) error - CompleteUpload(ctx context.Context, uploadID string, fileSize int64, rawTags map[string]string, completedPartCount int32, ossObjectEtag string) error + CompleteUpload(ctx context.Context, uploadID string, fileSize int64, rawTags map[string]string, completedPartCount int32, ossObjectEtag string, partSizeBytes int64) error } // ossClient is the subset of OSSUploader methods used by Uploader. @@ -171,8 +176,18 @@ func (u *Uploader) validatePersistDir() error { // It uses context.Background() as base to ensure the abort is independent of the // caller's context, but with a 30s timeout to prevent indefinite hanging. 
func (u *Uploader) abortMultipartUpload(session *UploadSession, multipartUploadID string) { + if session == nil { + logger.Printf("[CLOUD-UPLOAD] Warning: skip OSS abort for multipart_upload_id=%s: missing upload session", multipartUploadID) + return + } abortCtx, cancel := context.WithTimeout(context.Background(), 30*time.Second) defer cancel() + refreshed, err := u.ensureFreshUploadCredentials(abortCtx, session) + if err != nil { + logger.Printf("[CLOUD-UPLOAD] Warning: refresh credentials before abort failed (proceeding anyway): %v", err) + } else { + session = refreshed + } u.oss.AbortMultipartUpload(abortCtx, session, multipartUploadID) } @@ -205,7 +220,7 @@ func (u *Uploader) Upload(ctx context.Context, req UploadRequest) (*UploadResult logger.Printf("[CLOUD-UPLOAD] Starting upload: episode=%s mcap=%s size=%d", req.EpisodeID, req.McapKey, fileSize) // Step 2: Prepare upload session (with recovery if persisted state exists) - prepared, err := u.prepareUploadSession(ctx, hints, req.McapKey, fileSize) + prepared, err := u.prepareUploadSession(ctx, hints, req.McapKey, req.AssetID, fileSize) if err != nil { return nil, fmt.Errorf("prepare upload session: %w", err) } @@ -219,7 +234,7 @@ func (u *Uploader) Upload(ctx context.Context, req UploadRequest) (*UploadResult if prepared.ossCompleteETag != "" { logger.Printf("[CLOUD-UPLOAD] OSS object already verified (COMPLETE_ONLY): logical_upload_id=%s etag=%s parts=%d", session.LogicalUploadID, prepared.ossCompleteETag, prepared.ossCompletePartCount) - if err := u.gateway.CompleteUpload(ctx, session.UploadID, fileSize, req.RawTags, prepared.ossCompletePartCount, prepared.ossCompleteETag); err != nil { + if err := u.gateway.CompleteUpload(ctx, session.UploadID, fileSize, req.RawTags, prepared.ossCompletePartCount, prepared.ossCompleteETag, session.PartSizeBytes); err != nil { return nil, fmt.Errorf("complete upload on gateway (oss-already-complete): %w", err) } u.cleanupPersistedState(session.LogicalUploadID) @@ -246,7 
+261,9 @@ func (u *Uploader) Upload(ctx context.Context, req UploadRequest) (*UploadResult Endpoint: session.Endpoint, ObjectKey: session.ObjectKey, McapKey: req.McapKey, + AssetID: req.AssetID, FileSize: fileSize, + PartSizeBytes: session.PartSizeBytes, UpdatedAt: time.Now(), }); err != nil { return nil, fmt.Errorf("persist initial upload state: %w", err) @@ -269,7 +286,7 @@ func (u *Uploader) Upload(ctx context.Context, req UploadRequest) (*UploadResult // InitiateMultipartUpload succeeds, before streaming any parts. This requires splitting // uploadParts into an initiate step (called here, result persisted) and a stream step. // The Rust SDK has the same gap; defer fixing until the upstream SDK is updated. - multipartUploadID, parts, partMD5s, err := u.uploadParts(ctx, req, session, fileSize) + session, multipartUploadID, parts, partMD5s, err := u.uploadParts(ctx, req, session, fileSize) if err != nil { return nil, err } @@ -285,7 +302,9 @@ func (u *Uploader) Upload(ctx context.Context, req UploadRequest) (*UploadResult Endpoint: session.Endpoint, ObjectKey: session.ObjectKey, McapKey: req.McapKey, + AssetID: req.AssetID, FileSize: fileSize, + PartSizeBytes: session.PartSizeBytes, UpdatedAt: time.Now(), }); err != nil { logger.Printf("[CLOUD-UPLOAD] Warning: failed to update state with multipart_upload_id: %v", err) @@ -296,7 +315,7 @@ func (u *Uploader) Upload(ctx context.Context, req UploadRequest) (*UploadResult // Step 4: Refresh STS credentials if about to expire before CompleteUpload RPC if time.Until(session.STSExpireAt) <= u.cfg.RequestTimeout { - refreshed, err := u.gateway.ReissueUploadCredentials(ctx, session.UploadID) + refreshed, err := u.refreshUploadCredentials(ctx, session) if err != nil { logger.Printf("[CLOUD-UPLOAD] Warning: refresh credentials failed (proceeding anyway): %v", err) } else { @@ -309,7 +328,7 @@ func (u *Uploader) Upload(ctx context.Context, req UploadRequest) (*UploadResult return nil, fmt.Errorf("too many upload parts: %d", 
len(parts)) } //nolint:gosec // G115: len(parts) validated to fit into int32 above - if err := u.gateway.CompleteUpload(ctx, session.UploadID, fileSize, req.RawTags, int32(len(parts)), localETag); err != nil { + if err := u.gateway.CompleteUpload(ctx, session.UploadID, fileSize, req.RawTags, int32(len(parts)), localETag, session.PartSizeBytes); err != nil { return nil, fmt.Errorf("complete upload on gateway: %w", err) } @@ -341,8 +360,8 @@ type preparedSession struct { // prepareUploadSession checks for persisted state and either resumes or creates a new session. // It mirrors the Rust SDK's prepare_upload_session logic. -func (u *Uploader) prepareUploadSession(ctx context.Context, clientHints map[string]string, mcapKey string, fileSize int64) (preparedSession, error) { - state, err := u.findPersistedStateByKey(mcapKey) +func (u *Uploader) prepareUploadSession(ctx context.Context, clientHints map[string]string, mcapKey string, assetID string, fileSize int64) (preparedSession, error) { + state, err := u.findPersistedStateByKey(mcapKey, assetID) if err != nil { return preparedSession{}, fmt.Errorf("load persisted state: %w", err) } @@ -397,6 +416,7 @@ func (u *Uploader) prepareUploadSession(ctx context.Context, clientHints map[str Endpoint: newSession.Endpoint, ObjectKey: newSession.ObjectKey, McapKey: mcapKey, + AssetID: assetID, FileSize: fileSize, UpdatedAt: time.Now(), }); err != nil { @@ -461,6 +481,9 @@ func (u *Uploader) decideResumeAction(ctx context.Context, state *persistedUploa // Treat RPC failures as transient: preserve local state for next retry. 
return resumeContinue, nil, "", 0, fmt.Errorf("ReissueUploadCredentials: %w", err) } + if state.PartSizeBytes > 0 { + session.PartSizeBytes = state.PartSizeBytes + } if state.MultipartUploadID != "" { outcome, err := u.reconcileRemoteParts(ctx, session, state.MultipartUploadID) @@ -537,32 +560,80 @@ func (u *Uploader) reconcileCompletedObject(ctx context.Context, session *Upload return reconcileRestart, nil } +// partStreamFactory opens a stream for a specific byte range of the MCAP file. +// Each call returns an independent io.ReadCloser so that connections are not +// kept idle across part uploads. +type partStreamFactory func(ctx context.Context, offset, length int64) (io.ReadCloser, error) + +// minioRangeReader returns a partStreamFactory that reads byte ranges from +// MinIO using independent ranged GetObject requests. +func (u *Uploader) minioRangeReader(key string) partStreamFactory { + return func(ctx context.Context, offset, length int64) (io.ReadCloser, error) { + opts := minio.GetObjectOptions{} + if err := opts.SetRange(offset, offset+length-1); err != nil { + return nil, fmt.Errorf("set range %d-%d: %w", offset, offset+length-1, err) + } + obj, err := u.minioClient.GetObject(ctx, u.minioBucket, key, opts) + if err != nil { + return nil, fmt.Errorf("get minio object range %d-%d: %w", offset, offset+length-1, err) + } + return obj, nil + } +} + // uploadParts streams the MCAP from MinIO and uploads it to OSS in parts. // Returns the OSS multipart upload ID, the list of uploaded parts, per-part MD5 digests, and any error. 
-func (u *Uploader) uploadParts(ctx context.Context, req UploadRequest, session *UploadSession, fileSize int64) (string, []UploadedPart, [][16]byte, error) { +func (u *Uploader) uploadParts(ctx context.Context, req UploadRequest, session *UploadSession, fileSize int64) (*UploadSession, string, []UploadedPart, [][16]byte, error) { + fixedPartSizeBytes := normalizedPartSizeBytes(session.PartSizeBytes) + session, err := u.ensureFreshUploadCredentials(ctx, session) + if err != nil { + return nil, "", nil, nil, fmt.Errorf("refresh credentials before initiate multipart upload: %w", err) + } + session.PartSizeBytes = fixedPartSizeBytes + // Initiate multipart upload on OSS multipartUploadID, err := u.oss.InitiateMultipartUpload(ctx, session) if err != nil { - return "", nil, nil, fmt.Errorf("initiate multipart upload: %w", err) + return nil, "", nil, nil, fmt.Errorf("initiate multipart upload: %w", err) } logger.Printf("[CLOUD-UPLOAD] Multipart initiated: multipart_upload_id=%s", multipartUploadID) - // Stream from MinIO → OSS in parts - mcapStream, err := u.minioClient.GetObject(ctx, u.minioBucket, req.McapKey, minio.GetObjectOptions{}) + // Stream from MinIO to OSS in parts. + // Each part uses an independent ranged GetObject so that the MinIO HTTP + // connection is not left idle during OSS part uploads. A single streaming + // response would risk idle connection timeout (~20-25s on MinIO or network + // intermediaries) when upload speed is slow. 
+ session, parts, partMD5s, err := u.streamMultipartParts(ctx, req.EpisodeID, session, multipartUploadID, fileSize, fixedPartSizeBytes, u.minioRangeReader(req.McapKey)) if err != nil { u.abortMultipartUpload(session, multipartUploadID) - return "", nil, nil, fmt.Errorf("get minio object %s: %w", req.McapKey, err) + return nil, "", nil, nil, err } - defer func() { - _ = mcapStream.Close() - }() - partSizeBytes := session.PartSizeBytes - if partSizeBytes <= 0 { - partSizeBytes = 8 * 1024 * 1024 // 8MB default + session, err = u.ensureFreshUploadCredentials(ctx, session) + if err != nil { + u.abortMultipartUpload(session, multipartUploadID) + return nil, "", nil, nil, fmt.Errorf("refresh credentials before complete multipart upload: %w", err) + } + session.PartSizeBytes = fixedPartSizeBytes + + // Complete multipart upload on OSS + if _, err := u.oss.CompleteMultipartUpload(ctx, session, multipartUploadID, parts); err != nil { + u.abortMultipartUpload(session, multipartUploadID) + return nil, "", nil, nil, fmt.Errorf("complete multipart upload on OSS: %w", err) } - buf := make([]byte, partSizeBytes) + return session, multipartUploadID, parts, partMD5s, nil +} + +func (u *Uploader) streamMultipartParts(ctx context.Context, episodeID string, session *UploadSession, multipartUploadID string, fileSize int64, partSizeBytes int64, newPartStream partStreamFactory) (*UploadSession, []UploadedPart, [][16]byte, error) { + partSizeBytes = normalizedPartSizeBytes(partSizeBytes) + session.PartSizeBytes = partSizeBytes + partSize := int(partSizeBytes) + if int64(partSize) != partSizeBytes { + return session, nil, nil, fmt.Errorf("invalid part_size_bytes %d", partSizeBytes) + } + + buf := make([]byte, partSize) var parts []UploadedPart var partMD5s [][16]byte var offset int64 @@ -570,8 +641,7 @@ func (u *Uploader) uploadParts(ctx context.Context, req UploadRequest, session * for offset < fileSize { if err := ctx.Err(); err != nil { - u.abortMultipartUpload(session, 
multipartUploadID) - return "", nil, nil, err + return session, nil, nil, err } remaining := fileSize - offset @@ -580,19 +650,44 @@ func (u *Uploader) uploadParts(ctx context.Context, req UploadRequest, session * readSize = remaining } - n, readErr := io.ReadFull(mcapStream, buf[:readSize]) - if readErr != nil && readErr != io.ErrUnexpectedEOF { - u.abortMultipartUpload(session, multipartUploadID) - return "", nil, nil, fmt.Errorf("read part %d from minio: %w", partNumber, readErr) + // Open a new connection for each part so that the MinIO stream is not + // left idle during OSS uploads. MinIO or intervening network equipment + // may drop idle streaming connections after ~20-25s, and the OSS upload + // between part reads can easily exceed this threshold on slow networks. + partStream, err := newPartStream(ctx, offset, readSize) + if err != nil { + return session, nil, nil, fmt.Errorf("open part %d stream at offset %d: %w", partNumber, offset, err) + } + + n, readErr := io.ReadFull(partStream, buf[:int(readSize)]) + _ = partStream.Close() // close ASAP, best-effort + if readErr != nil { + return session, nil, nil, fmt.Errorf("read part %d from minio: expected %d bytes, got %d: %w", partNumber, readSize, n, readErr) + } + if int64(n) != readSize { + return session, nil, nil, fmt.Errorf("read part %d from minio: expected %d bytes, got %d", partNumber, readSize, n) } partSlice := buf[:n] partMD5s = append(partMD5s, MD5DigestBytes(partSlice)) + session, err = u.ensureFreshUploadCredentials(ctx, session) + if err != nil { + return session, nil, nil, fmt.Errorf("refresh credentials before upload part %d: %w", partNumber, err) + } + etag, err := u.oss.UploadPart(ctx, session, multipartUploadID, partNumber, partSlice) + if err != nil && isSecurityTokenExpiredError(err) { + refreshed, refreshErr := u.refreshUploadCredentials(ctx, session) + if refreshErr != nil { + return session, nil, nil, fmt.Errorf("refresh credentials after upload part %d token expiry: %w", 
partNumber, refreshErr) + } + session = refreshed + session.PartSizeBytes = partSizeBytes + etag, err = u.oss.UploadPart(ctx, session, multipartUploadID, partNumber, partSlice) + } if err != nil { - u.abortMultipartUpload(session, multipartUploadID) - return "", nil, nil, fmt.Errorf("upload part %d: %w", partNumber, err) + return session, nil, nil, fmt.Errorf("upload part %d: %w", partNumber, err) } parts = append(parts, UploadedPart{ @@ -603,19 +698,55 @@ func (u *Uploader) uploadParts(ctx context.Context, req UploadRequest, session * offset += int64(n) partNumber++ - if partNumber%10 == 0 { - logger.Printf("[CLOUD-UPLOAD] Progress: episode=%s parts=%d offset=%d/%d", - req.EpisodeID, len(parts), offset, fileSize) - } + logger.Printf("[CLOUD-UPLOAD] Progress: episode=%s parts=%d offset=%d/%d", + episodeID, len(parts), offset, fileSize) } - // Complete multipart upload on OSS - if _, err := u.oss.CompleteMultipartUpload(ctx, session, multipartUploadID, parts); err != nil { - u.abortMultipartUpload(session, multipartUploadID) - return "", nil, nil, fmt.Errorf("complete multipart upload on OSS: %w", err) + return session, parts, partMD5s, nil +} + +func (u *Uploader) ensureFreshUploadCredentials(ctx context.Context, session *UploadSession) (*UploadSession, error) { + if session == nil { + return nil, fmt.Errorf("missing upload session") + } + if time.Until(session.STSExpireAt) > u.stsRefreshWindow() { + return session, nil } + return u.refreshUploadCredentials(ctx, session) +} + +func (u *Uploader) refreshUploadCredentials(ctx context.Context, session *UploadSession) (*UploadSession, error) { + if u.gateway == nil { + return nil, fmt.Errorf("gateway client is not configured") + } + refreshed, err := u.gateway.ReissueUploadCredentials(ctx, session.UploadID) + if err != nil { + return nil, err + } + refreshed.PartSizeBytes = normalizedPartSizeBytes(session.PartSizeBytes) + return refreshed, nil +} + +func normalizedPartSizeBytes(partSizeBytes int64) int64 { + if 
partSizeBytes <= 0 { + return 8 * 1024 * 1024 + } + return partSizeBytes +} + +func (u *Uploader) stsRefreshWindow() time.Duration { + window := u.cfg.RequestTimeout + if u.cfg.OSSTimeout > window { + window = u.cfg.OSSTimeout + } + if window <= 0 { + window = 30 * time.Second + } + return window + 30*time.Second +} - return multipartUploadID, parts, partMD5s, nil +func isSecurityTokenExpiredError(err error) bool { + return err != nil && strings.Contains(err.Error(), "SecurityTokenExpired") } // abortAndCleanupSession notifies the data-gateway to abort the logical upload session @@ -680,8 +811,10 @@ func (u *Uploader) cleanupPersistedState(logicalUploadID string) { } } -// findPersistedStateByKey scans the active state directory for a state matching the given mcap key. -func (u *Uploader) findPersistedStateByKey(mcapKey string) (*persistedUploadState, error) { +// findPersistedStateByKey scans the active state directory for a state matching the given +// MCAP key and asset id. Upload sessions are device-scoped and must not be reused +// across different Data Platform devices even when the MCAP object key is identical. 
+func (u *Uploader) findPersistedStateByKey(mcapKey string, assetID string) (*persistedUploadState, error) { if u.cfg.PersistRootDir == "" { return nil, nil } @@ -707,7 +840,7 @@ func (u *Uploader) findPersistedStateByKey(mcapKey string) (*persistedUploadStat logger.Printf("[CLOUD-UPLOAD] Warning: failed to parse state file %s: %v", entry.Name(), err) continue } - if state.McapKey == mcapKey { + if state.McapKey == mcapKey && state.AssetID == assetID { return &state, nil } } diff --git a/internal/cloud/uploader_test.go b/internal/cloud/uploader_test.go index b510f5f..99e3432 100644 --- a/internal/cloud/uploader_test.go +++ b/internal/cloud/uploader_test.go @@ -5,11 +5,14 @@ package cloud import ( + "bytes" "context" "encoding/json" "errors" + "io" "os" "path/filepath" + "strings" "testing" "time" @@ -122,11 +125,12 @@ func TestFindPersistedStateByKey(t *testing.T) { LogicalUploadID: "logical-find-test", UploadID: "upload-find-test", McapKey: "episodes/7/find.mcap", + AssetID: "asset-a", FileSize: 256, UpdatedAt: time.Now(), }) - got, err := u.findPersistedStateByKey("episodes/7/find.mcap") + got, err := u.findPersistedStateByKey("episodes/7/find.mcap", "asset-a") if err != nil { t.Fatalf("findPersistedStateByKey: %v", err) } @@ -138,12 +142,36 @@ func TestFindPersistedStateByKey(t *testing.T) { } } +func TestFindPersistedStateByKey_DoesNotReuseDifferentAssetID(t *testing.T) { + dir := t.TempDir() + u := newTestUploader(dir) + + activeDir := filepath.Join(dir, "data-gateway-client", "uploads", "active") + writeTempState(t, activeDir, &persistedUploadState{ + Version: 1, + LogicalUploadID: "logical-device-a", + UploadID: "upload-device-a", + McapKey: "episodes/7/find.mcap", + AssetID: "asset-a", + FileSize: 256, + UpdatedAt: time.Now(), + }) + + got, err := u.findPersistedStateByKey("episodes/7/find.mcap", "asset-b") + if err != nil { + t.Fatalf("findPersistedStateByKey: %v", err) + } + if got != nil { + t.Fatalf("expected nil for different AssetID, got %+v", got) + 
} +} + // TestFindPersistedStateByKey_NotFound verifies nil is returned for unknown keys. func TestFindPersistedStateByKey_NotFound(t *testing.T) { dir := t.TempDir() u := newTestUploader(dir) - got, err := u.findPersistedStateByKey("episodes/99/missing.mcap") + got, err := u.findPersistedStateByKey("episodes/99/missing.mcap", "asset-a") if err != nil { t.Fatalf("unexpected error: %v", err) } @@ -156,7 +184,7 @@ func TestFindPersistedStateByKey_NotFound(t *testing.T) { func TestFindPersistedStateByKey_EmptyPersistRootDir(t *testing.T) { u := newTestUploader("") - got, err := u.findPersistedStateByKey("episodes/1/file.mcap") + got, err := u.findPersistedStateByKey("episodes/1/file.mcap", "asset-a") if err != nil { t.Fatalf("unexpected error: %v", err) } @@ -197,6 +225,7 @@ func TestPersistedStateRoundTrip(t *testing.T) { Endpoint: "https://oss.example.com", ObjectKey: "uploads/1/test", McapKey: "episodes/1/test.mcap", + AssetID: "asset-a", FileSize: 4096, UpdatedAt: now, } @@ -225,6 +254,9 @@ func TestPersistedStateRoundTrip(t *testing.T) { if decoded.McapKey != original.McapKey { t.Errorf("McapKey = %q, want %q", decoded.McapKey, original.McapKey) } + if decoded.AssetID != original.AssetID { + t.Errorf("AssetID = %q, want %q", decoded.AssetID, original.AssetID) + } } // TestPrepareUploadSession_PermanentFailure_FileSizeMismatch verifies that a persisted state @@ -241,6 +273,7 @@ func TestPrepareUploadSession_PermanentFailure_FileSizeMismatch(t *testing.T) { LogicalUploadID: "logical-size-mismatch", UploadID: "upload-size-mismatch", McapKey: "episodes/1/mismatch.mcap", + AssetID: "asset-a", FileSize: 1024, // persisted as 1024 UpdatedAt: time.Now(), }) @@ -250,6 +283,7 @@ func TestPrepareUploadSession_PermanentFailure_FileSizeMismatch(t *testing.T) { context.Background(), map[string]string{}, "episodes/1/mismatch.mcap", + "asset-a", 512, // actual size differs ) if err == nil { @@ -271,6 +305,7 @@ func 
TestPrepareUploadSession_PermanentFailure_CleanupOnSizeMismatch(t *testing. LogicalUploadID: "logical-cleanup-mismatch", UploadID: "upload-cleanup-mismatch", McapKey: "episodes/2/cleanup.mcap", + AssetID: "asset-a", FileSize: 1024, // persisted size RestartCount: 0, UpdatedAt: time.Now(), @@ -280,6 +315,7 @@ func TestPrepareUploadSession_PermanentFailure_CleanupOnSizeMismatch(t *testing. context.Background(), map[string]string{}, "episodes/2/cleanup.mcap", + "asset-a", 512, // different from persisted ) if err == nil { @@ -326,13 +362,14 @@ func TestPrepareUploadSession_Restart_OldStatePreservedOnRPCFailure(t *testing.T LogicalUploadID: "logical-old", UploadID: "upload-old", McapKey: "episodes/10/restart-rpc-fail.mcap", + AssetID: "asset-a", FileSize: 512, RestartCount: 0, UpdatedAt: time.Now(), }) u := newDecideResumeUploader(dir, gw, &fakeOSS{}) - _, err := u.prepareUploadSession(context.Background(), map[string]string{}, "episodes/10/restart-rpc-fail.mcap", 512) + _, err := u.prepareUploadSession(context.Background(), map[string]string{}, "episodes/10/restart-rpc-fail.mcap", "asset-a", 512) if err == nil { t.Fatal("expected error when CreateLogicalUpload fails, got nil") } @@ -374,13 +411,14 @@ func TestPrepareUploadSession_Restart_NewStatePersisted_OldStateRemoved(t *testi LogicalUploadID: "logical-old", UploadID: "upload-old", McapKey: "episodes/11/restart-ok.mcap", + AssetID: "asset-a", FileSize: 512, RestartCount: 0, UpdatedAt: time.Now(), }) u := newDecideResumeUploader(dir, gw, &fakeOSS{}) - prepared, err := u.prepareUploadSession(context.Background(), map[string]string{}, "episodes/11/restart-ok.mcap", 512) + prepared, err := u.prepareUploadSession(context.Background(), map[string]string{}, "episodes/11/restart-ok.mcap", "asset-a", 512) if err != nil { t.Fatalf("unexpected error: %v", err) } @@ -528,7 +566,7 @@ func (f *fakeGateway) AbortUpload(_ context.Context, _ string, _ string) error { return nil } -func (f *fakeGateway) CompleteUpload(_ 
context.Context, _ string, _ int64, _ map[string]string, _ int32, _ string) error { +func (f *fakeGateway) CompleteUpload(_ context.Context, _ string, _ int64, _ map[string]string, _ int32, _ string, _ int64) error { panic("fakeGateway.CompleteUpload called unexpectedly") } @@ -538,6 +576,8 @@ type fakeOSS struct { listPartsFn func(ctx context.Context, session *UploadSession, multipartUploadID string) error // headObjectETagFn is called by HeadObjectETag; must be set for tests that reach it. headObjectETagFn func(ctx context.Context, session *UploadSession) (string, error) + // uploadPartFn is called by UploadPart; must be set for tests that reach it. + uploadPartFn func(ctx context.Context, session *UploadSession, multipartUploadID string, partNumber int, body []byte) (string, error) } func (f *fakeOSS) ListParts(ctx context.Context, session *UploadSession, multipartUploadID string) error { @@ -558,8 +598,11 @@ func (f *fakeOSS) InitiateMultipartUpload(_ context.Context, _ *UploadSession) ( panic("fakeOSS.InitiateMultipartUpload called unexpectedly") } -func (f *fakeOSS) UploadPart(_ context.Context, _ *UploadSession, _ string, _ int, _ []byte) (string, error) { - panic("fakeOSS.UploadPart called unexpectedly") +func (f *fakeOSS) UploadPart(ctx context.Context, session *UploadSession, multipartUploadID string, partNumber int, body []byte) (string, error) { + if f.uploadPartFn == nil { + panic("fakeOSS.UploadPart called unexpectedly") + } + return f.uploadPartFn(ctx, session, multipartUploadID, partNumber, body) } func (f *fakeOSS) CompleteMultipartUpload(_ context.Context, _ *UploadSession, _ string, _ []UploadedPart) (string, error) { @@ -596,6 +639,215 @@ func makeSession(logicalID, uploadID string) *UploadSession { } } +func TestStreamMultipartParts_UploadsExpectedPartBoundaries(t *testing.T) { + var gotPartNumbers []int + var gotSizes []int + oss := &fakeOSS{ + uploadPartFn: func(_ context.Context, _ *UploadSession, _ string, partNumber int, body []byte) 
(string, error) { + gotPartNumbers = append(gotPartNumbers, partNumber) + gotSizes = append(gotSizes, len(body)) + return "etag", nil + }, + } + u := newDecideResumeUploader("", &fakeGateway{}, oss) + session := makeSession("logical-stream", "upload-stream") + session.PartSizeBytes = 10 + + payload := []byte("abcdefghijklmnopqrstuvwxy") + factory := func(_ context.Context, offset, length int64) (io.ReadCloser, error) { + end := int(offset + length) + if end > len(payload) { + end = len(payload) + } + return io.NopCloser(bytes.NewReader(payload[offset:end])), nil + } + _, parts, partMD5s, err := u.streamMultipartParts( + context.Background(), + "episode-stream", + session, + "multipart-stream", + int64(len(payload)), + session.PartSizeBytes, + factory, + ) + if err != nil { + t.Fatalf("streamMultipartParts() error = %v", err) + } + if len(parts) != 3 { + t.Fatalf("uploaded part count = %d, want 3", len(parts)) + } + if len(partMD5s) != 3 { + t.Fatalf("part MD5 count = %d, want 3", len(partMD5s)) + } + + wantPartNumbers := []int{1, 2, 3} + wantSizes := []int{10, 10, 5} + for i := range wantPartNumbers { + if gotPartNumbers[i] != wantPartNumbers[i] { + t.Fatalf("part number[%d] = %d, want %d", i, gotPartNumbers[i], wantPartNumbers[i]) + } + if gotSizes[i] != wantSizes[i] { + t.Fatalf("part size[%d] = %d, want %d", i, gotSizes[i], wantSizes[i]) + } + } +} + +func TestStreamMultipartParts_EarlyEOFStopsInsteadOfUploadingEmptyParts(t *testing.T) { + var uploadedPartNumbers []int + oss := &fakeOSS{ + uploadPartFn: func(_ context.Context, _ *UploadSession, _ string, partNumber int, body []byte) (string, error) { + uploadedPartNumbers = append(uploadedPartNumbers, partNumber) + if len(body) == 0 { + t.Fatalf("uploaded empty part %d", partNumber) + } + return "etag", nil + }, + } + u := newDecideResumeUploader("", &fakeGateway{}, oss) + session := makeSession("logical-short", "upload-short") + session.PartSizeBytes = 10 + + payload := []byte("abcdefghijkl") // 12 bytes — part 
2 will fail with short read + factory := func(_ context.Context, offset, length int64) (io.ReadCloser, error) { + end := int(offset + length) + if end > len(payload) { + end = len(payload) + } + return io.NopCloser(bytes.NewReader(payload[offset:end])), nil + } + + _, _, _, err := u.streamMultipartParts( + context.Background(), + "episode-short", + session, + "multipart-short", + 25, + session.PartSizeBytes, + factory, + ) + if err == nil { + t.Fatal("expected error for early EOF, got nil") + } + if !strings.Contains(err.Error(), "expected 10 bytes, got 2") { + t.Fatalf("error = %q, want short read details", err.Error()) + } + if len(uploadedPartNumbers) != 1 || uploadedPartNumbers[0] != 1 { + t.Fatalf("uploaded parts = %v, want only first complete part", uploadedPartNumbers) + } +} + +func TestStreamMultipartParts_RefreshesCredentialsBeforeUploadPart(t *testing.T) { + var reissueCalls int + gw := &fakeGateway{ + reissueCredentialsFn: func(_ context.Context, uploadID string) (*UploadSession, error) { + reissueCalls++ + if uploadID != "upload-expiring" { + t.Fatalf("uploadID = %q, want upload-expiring", uploadID) + } + refreshed := makeSession("logical-expiring", uploadID) + refreshed.STSAccessKeyID = "fresh-key" + refreshed.PartSizeBytes = 99 + return refreshed, nil + }, + } + + var usedAccessKeyID string + var usedPartSizeBytes int64 + oss := &fakeOSS{ + uploadPartFn: func(_ context.Context, session *UploadSession, _ string, _ int, _ []byte) (string, error) { + usedAccessKeyID = session.STSAccessKeyID + usedPartSizeBytes = session.PartSizeBytes + return "etag", nil + }, + } + u := newDecideResumeUploader("", gw, oss) + session := makeSession("logical-expiring", "upload-expiring") + session.STSAccessKeyID = "stale-key" + session.STSExpireAt = time.Now().Add(10 * time.Second) + session.PartSizeBytes = 4 + + payload := []byte("abcd") + factory := func(_ context.Context, offset, length int64) (io.ReadCloser, error) { + return 
io.NopCloser(bytes.NewReader(payload[offset : offset+length])), nil + } + + finalSession, parts, _, err := u.streamMultipartParts(context.Background(), "episode-expiring", session, "multipart-expiring", int64(len(payload)), session.PartSizeBytes, factory) + if err != nil { + t.Fatalf("streamMultipartParts() error = %v", err) + } + if len(parts) != 1 { + t.Fatalf("uploaded part count = %d, want 1", len(parts)) + } + if reissueCalls != 1 { + t.Fatalf("ReissueUploadCredentials calls = %d, want 1", reissueCalls) + } + if usedAccessKeyID != "fresh-key" { + t.Fatalf("UploadPart access key = %q, want fresh-key", usedAccessKeyID) + } + if usedPartSizeBytes != 4 { + t.Fatalf("UploadPart part size = %d, want fixed original size 4", usedPartSizeBytes) + } + if finalSession.PartSizeBytes != 4 { + t.Fatalf("final session part size = %d, want fixed original size 4", finalSession.PartSizeBytes) + } +} + +func TestStreamMultipartParts_RetriesCurrentPartAfterSecurityTokenExpired(t *testing.T) { + var reissueCalls int + gw := &fakeGateway{ + reissueCredentialsFn: func(_ context.Context, uploadID string) (*UploadSession, error) { + reissueCalls++ + refreshed := makeSession("logical-retry", uploadID) + refreshed.STSAccessKeyID = "fresh-key" + return refreshed, nil + }, + } + + var uploadPartCalls int + var partNumbers []int + var usedAccessKeyIDs []string + oss := &fakeOSS{ + uploadPartFn: func(_ context.Context, session *UploadSession, _ string, partNumber int, _ []byte) (string, error) { + uploadPartCalls++ + partNumbers = append(partNumbers, partNumber) + usedAccessKeyIDs = append(usedAccessKeyIDs, session.STSAccessKeyID) + if uploadPartCalls == 1 { + return "", errors.New("oss returned status 403: SecurityTokenExpired") + } + return "etag", nil + }, + } + u := newDecideResumeUploader("", gw, oss) + session := makeSession("logical-retry", "upload-retry") + session.STSAccessKeyID = "stale-key" + session.PartSizeBytes = 4 + + payload := []byte("abcd") + factory := func(_ 
context.Context, offset, length int64) (io.ReadCloser, error) { + return io.NopCloser(bytes.NewReader(payload[offset : offset+length])), nil + } + + _, parts, _, err := u.streamMultipartParts(context.Background(), "episode-retry", session, "multipart-retry", int64(len(payload)), session.PartSizeBytes, factory) + if err != nil { + t.Fatalf("streamMultipartParts() error = %v", err) + } + if len(parts) != 1 { + t.Fatalf("uploaded part count = %d, want 1", len(parts)) + } + if reissueCalls != 1 { + t.Fatalf("ReissueUploadCredentials calls = %d, want 1", reissueCalls) + } + if uploadPartCalls != 2 { + t.Fatalf("UploadPart calls = %d, want 2", uploadPartCalls) + } + if partNumbers[0] != 1 || partNumbers[1] != 1 { + t.Fatalf("part numbers = %v, want [1 1]", partNumbers) + } + if usedAccessKeyIDs[0] != "stale-key" || usedAccessKeyIDs[1] != "fresh-key" { + t.Fatalf("access keys = %v, want [stale-key fresh-key]", usedAccessKeyIDs) + } +} + // ============================================================================= // decideResumeAction unit tests // ============================================================================= diff --git a/internal/config/config.go b/internal/config/config.go index 0f1a635..4df907e 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -8,6 +8,7 @@ package config import ( "fmt" "os" + "path/filepath" "strconv" "strings" ) @@ -87,6 +88,7 @@ type SyncConfig struct { RetryJitterSec int // max additive jitter in seconds PersistRootDir string // root directory for persisting upload state across restarts; empty disables persistence MaxRestartCount int // max number of upload restarts before permanent failure; 0 uses uploader default (3) + DPConfigPath string // data-platform config path for direct device-profile uploads } // FeaturesConfig feature flags configuration @@ -203,6 +205,7 @@ func Load() (*Config, error) { RetryJitterSec: getEnvInt("KEYSTONE_SYNC_RETRY_JITTER_SEC", 30), PersistRootDir: 
getEnv("KEYSTONE_SYNC_PERSIST_ROOT_DIR", ""), MaxRestartCount: getEnvInt("KEYSTONE_SYNC_MAX_RESTART_COUNT", 3), + DPConfigPath: getEnv("KEYSTONE_SYNC_DP_CONFIG", defaultDPConfigPath()), }, Auth: AuthConfig{ JWTSecret: getEnv("KEYSTONE_JWT_SECRET", ""), @@ -274,17 +277,18 @@ func (c *Config) Validate() error { return fmt.Errorf("KEYSTONE_ADMIN_USERNAME and KEYSTONE_ADMIN_PASSWORD must both be set or both be empty") } if c.Sync.Enabled { - if strings.TrimSpace(c.Sync.AuthEndpoint) == "" { - return fmt.Errorf("sync auth endpoint is required when sync is enabled") + c.Sync.DPConfigPath = strings.TrimSpace(c.Sync.DPConfigPath) + if c.Sync.DPConfigPath == "" { + return fmt.Errorf("KEYSTONE_SYNC_DP_CONFIG is required when sync is enabled") } - if strings.TrimSpace(c.Sync.GatewayEndpoint) == "" { - return fmt.Errorf("sync gateway endpoint is required when sync is enabled") + expandedDPConfigPath, err := expandHomePath(c.Sync.DPConfigPath) + if err != nil { + return fmt.Errorf("KEYSTONE_SYNC_DP_CONFIG %q is invalid: %w", c.Sync.DPConfigPath, err) } - apiKey := strings.TrimSpace(c.Sync.APIKey) - if apiKey == "" { - return fmt.Errorf("KEYSTONE_CLOUD_API_KEY is required when sync is enabled") - } - c.Sync.APIKey = apiKey + c.Sync.DPConfigPath = expandedDPConfigPath + c.Sync.AuthEndpoint = strings.TrimSpace(c.Sync.AuthEndpoint) + c.Sync.GatewayEndpoint = strings.TrimSpace(c.Sync.GatewayEndpoint) + c.Sync.APIKey = strings.TrimSpace(c.Sync.APIKey) if c.Sync.BatchSize <= 0 { return fmt.Errorf("sync batch size must be greater than 0 when sync is enabled") } @@ -329,6 +333,28 @@ func getEnv(key, fallback string) string { return fallback } +func defaultDPConfigPath() string { + home, err := os.UserHomeDir() + if err != nil || strings.TrimSpace(home) == "" { + return "~/.archebase/config.json" + } + return filepath.Join(home, ".archebase", "config.json") +} + +func expandHomePath(path string) (string, error) { + if path != "~" && !strings.HasPrefix(path, "~/") { + return path, nil + } 
+ home, err := os.UserHomeDir() + if err != nil || strings.TrimSpace(home) == "" { + return "", fmt.Errorf("home directory is not available") + } + if path == "~" { + return home, nil + } + return filepath.Join(home, strings.TrimPrefix(path, "~/")), nil +} + func getEnvInt(key string, fallback int) int { if val := os.Getenv(key); val != "" { if i, err := strconv.Atoi(val); err == nil { diff --git a/internal/config/config_test.go b/internal/config/config_test.go index dd918e6..4578975 100644 --- a/internal/config/config_test.go +++ b/internal/config/config_test.go @@ -7,6 +7,7 @@ package config import ( "os" + "path/filepath" "strings" "testing" ) @@ -21,6 +22,7 @@ func TestLoad(t *testing.T) { "KEYSTONE_MINIO_SECRET_KEY": os.Getenv("KEYSTONE_MINIO_SECRET_KEY"), "KEYSTONE_FACTORY_ID": os.Getenv("KEYSTONE_FACTORY_ID"), "KEYSTONE_SYNC_AUTO_SCAN_ENABLED": os.Getenv("KEYSTONE_SYNC_AUTO_SCAN_ENABLED"), + "KEYSTONE_SYNC_DP_CONFIG": os.Getenv("KEYSTONE_SYNC_DP_CONFIG"), } defer func() { // Restore original environment variables @@ -35,6 +37,7 @@ func TestLoad(t *testing.T) { // Set test environment variables os.Unsetenv("KEYSTONE_SYNC_AUTO_SCAN_ENABLED") + os.Unsetenv("KEYSTONE_SYNC_DP_CONFIG") os.Setenv("KEYSTONE_MYSQL_PASSWORD", "test-password") os.Setenv("KEYSTONE_MINIO_ACCESS_KEY", "test-access-key") os.Setenv("KEYSTONE_MINIO_SECRET_KEY", "test-secret-key") @@ -69,6 +72,13 @@ func TestLoad(t *testing.T) { if cfg.Sync.AutoScanEnabled { t.Error("Load().Sync.AutoScanEnabled should default to false") } + home, err := os.UserHomeDir() + if err != nil { + t.Fatalf("os.UserHomeDir() error = %v", err) + } + if cfg.Sync.DPConfigPath != filepath.Join(home, ".archebase", "config.json") { + t.Errorf("Load().Sync.DPConfigPath = %q, want default ~/.archebase/config.json", cfg.Sync.DPConfigPath) + } // Verify QA configuration if !cfg.QA.Enabled { @@ -270,7 +280,7 @@ func TestConfigValidate(t *testing.T) { } } -func TestValidateSyncAPIKey(t *testing.T) { +func 
TestValidateSyncDPConfig(t *testing.T) { validBase := Config{ Server: ServerConfig{Mode: "edge"}, Database: DatabaseConfig{DSN: "user:pass@tcp(localhost:3306)/db"}, @@ -278,7 +288,7 @@ func TestValidateSyncAPIKey(t *testing.T) { Auth: AuthConfig{JWTSecret: "jwt-secret"}, } - t.Run("sync disabled — no API key required", func(t *testing.T) { + t.Run("sync disabled — no DP config required", func(t *testing.T) { cfg := validBase cfg.Sync = SyncConfig{Enabled: false} if err := cfg.Validate(); err != nil { @@ -286,13 +296,11 @@ func TestValidateSyncAPIKey(t *testing.T) { } }) - t.Run("sync enabled — missing API key", func(t *testing.T) { + t.Run("sync enabled — missing DP config", func(t *testing.T) { cfg := validBase cfg.Sync = SyncConfig{ Enabled: true, - AuthEndpoint: "auth:443", - GatewayEndpoint: "gateway:443", - APIKey: "", + DPConfigPath: "", BatchSize: 10, MaxRetries: 5, MaxConcurrent: 2, @@ -302,18 +310,38 @@ func TestValidateSyncAPIKey(t *testing.T) { RetryBaseSec: 30, RetryMaxSec: 1800, } - if err := cfg.Validate(); err == nil { - t.Error("Validate() expected error for missing API key, got nil") + if err := cfg.Validate(); err == nil || !strings.Contains(err.Error(), "KEYSTONE_SYNC_DP_CONFIG") { + t.Fatalf("Validate() error = %v, want KEYSTONE_SYNC_DP_CONFIG error", err) + } + }) + + t.Run("sync enabled — old cloud endpoint and API key are not required", func(t *testing.T) { + cfg := validBase + cfg.Sync = SyncConfig{ + Enabled: true, + DPConfigPath: "/etc/keystone/dp-config.json", + BatchSize: 10, + MaxRetries: 5, + MaxConcurrent: 2, + WorkerIntervalSec: 60, + RequestTimeoutSec: 30, + OSSTimeoutSec: 300, + RetryBaseSec: 30, + RetryMaxSec: 1800, + } + if err := cfg.Validate(); err != nil { + t.Fatalf("Validate() unexpected error = %v", err) + } + if cfg.Sync.AuthEndpoint != "" || cfg.Sync.GatewayEndpoint != "" || cfg.Sync.APIKey != "" { + t.Fatalf("legacy cloud config should remain optional and empty: %+v", cfg.Sync) } }) - t.Run("sync enabled — arbitrary 
opaque API key accepted", func(t *testing.T) { + t.Run("sync enabled — trims DP config whitespace", func(t *testing.T) { cfg := validBase cfg.Sync = SyncConfig{ Enabled: true, - AuthEndpoint: "auth:443", - GatewayEndpoint: "gateway:443", - APIKey: "notvalidbase64!!!", + DPConfigPath: " /etc/keystone/dp-config.json ", BatchSize: 10, MaxRetries: 5, MaxConcurrent: 2, @@ -326,18 +354,18 @@ func TestValidateSyncAPIKey(t *testing.T) { if err := cfg.Validate(); err != nil { t.Fatalf("Validate() unexpected error = %v", err) } - if cfg.Sync.APIKey != "notvalidbase64!!!" { - t.Errorf("APIKey = %q, want %q", cfg.Sync.APIKey, "notvalidbase64!!!") + if cfg.Sync.DPConfigPath != "/etc/keystone/dp-config.json" { + t.Errorf("DPConfigPath = %q, want trimmed path", cfg.Sync.DPConfigPath) } }) - t.Run("sync enabled — trims API key whitespace", func(t *testing.T) { + t.Run("sync enabled — expands DP config home path", func(t *testing.T) { + home := t.TempDir() + t.Setenv("HOME", home) cfg := validBase cfg.Sync = SyncConfig{ Enabled: true, - AuthEndpoint: "auth:443", - GatewayEndpoint: "gateway:443", - APIKey: " cloud-issued-key ", + DPConfigPath: "~/.archebase/config.json", BatchSize: 10, MaxRetries: 5, MaxConcurrent: 2, @@ -350,8 +378,8 @@ func TestValidateSyncAPIKey(t *testing.T) { if err := cfg.Validate(); err != nil { t.Fatalf("Validate() unexpected error = %v", err) } - if cfg.Sync.APIKey != "cloud-issued-key" { - t.Errorf("APIKey = %q, want %q", cfg.Sync.APIKey, "cloud-issued-key") + if cfg.Sync.DPConfigPath != filepath.Join(home, ".archebase", "config.json") { + t.Errorf("DPConfigPath = %q, want expanded home path", cfg.Sync.DPConfigPath) } }) } diff --git a/internal/server/server.go b/internal/server/server.go index ff908c0..30c65ed 100644 --- a/internal/server/server.go +++ b/internal/server/server.go @@ -76,7 +76,7 @@ func axonTransferWriteTimeout(cfg *config.TransferConfig) time.Duration { // New creates a new server instance. 
// db and s3Client are optional; pass nil to disable Verified ACK. -// syncWorker is optional; pass nil to disable cloud sync API. +// syncWorker is optional; pass nil to disable cloud sync APIs. func New(cfg *config.Config, db *sqlx.DB, s3Client *s3.Client, syncWorker *services.SyncWorker) *Server { // Create Gin engine gin.SetMode(gin.ReleaseMode) diff --git a/internal/services/dp_asset_resolver.go b/internal/services/dp_asset_resolver.go new file mode 100644 index 0000000..78f3e29 --- /dev/null +++ b/internal/services/dp_asset_resolver.go @@ -0,0 +1,61 @@ +// SPDX-FileCopyrightText: 2026 ArcheBase +// +// SPDX-License-Identifier: MulanPSL-2.0 + +package services + +import ( + "context" + "database/sql" + "encoding/json" + "fmt" + "strings" + + "github.com/jmoiron/sqlx" +) + +func assetIDFromEpisodeMetadata(metadata sql.NullString) string { + if !metadata.Valid || strings.TrimSpace(metadata.String) == "" { + return "" + } + var raw map[string]interface{} + if err := json.Unmarshal([]byte(metadata.String), &raw); err != nil { + return "" + } + value, _ := raw["asset_id"].(string) + return strings.TrimSpace(value) +} + +func resolveAssetIDForEpisode(ctx context.Context, db *sqlx.DB, episodeID int64, metadata sql.NullString, workstationID sql.NullInt64) (string, error) { + if assetID := assetIDFromEpisodeMetadata(metadata); assetID != "" { + return assetID, nil + } + if db == nil { + return "", fmt.Errorf("database is not available") + } + if !workstationID.Valid || workstationID.Int64 <= 0 { + return "", fmt.Errorf("episode %d has no asset_id metadata and no workstation_id", episodeID) + } + + var row struct { + AssetID sql.NullString `db:"asset_id"` + } + err := db.GetContext(ctx, &row, ` + SELECT r.asset_id + FROM workstations ws + LEFT JOIN robots r ON r.id = ws.robot_id + WHERE ws.id = ? 
+ LIMIT 1 + `, workstationID.Int64) + if err == sql.ErrNoRows { + return "", fmt.Errorf("episode %d workstation %d not found while resolving asset_id", episodeID, workstationID.Int64) + } + if err != nil { + return "", fmt.Errorf("resolve asset_id for episode %d workstation %d: %w", episodeID, workstationID.Int64, err) + } + assetID := strings.TrimSpace(row.AssetID.String) + if !row.AssetID.Valid || assetID == "" { + return "", fmt.Errorf("episode %d workstation %d has no robot asset_id", episodeID, workstationID.Int64) + } + return assetID, nil +} diff --git a/internal/services/dp_asset_resolver_test.go b/internal/services/dp_asset_resolver_test.go new file mode 100644 index 0000000..d000738 --- /dev/null +++ b/internal/services/dp_asset_resolver_test.go @@ -0,0 +1,112 @@ +// SPDX-FileCopyrightText: 2026 ArcheBase +// +// SPDX-License-Identifier: MulanPSL-2.0 + +package services + +import ( + "context" + "database/sql" + "strings" + "testing" + + "github.com/jmoiron/sqlx" + _ "modernc.org/sqlite" +) + +func newTestAssetResolverDB(t *testing.T) *sqlx.DB { + t.Helper() + db, err := sqlx.Open("sqlite", ":memory:") + if err != nil { + t.Fatalf("open sqlite db: %v", err) + } + for _, stmt := range []string{ + `CREATE TABLE robots ( + id INTEGER PRIMARY KEY, + device_id TEXT NOT NULL, + asset_id TEXT, + deleted_at TIMESTAMP NULL + )`, + `CREATE TABLE workstations ( + id INTEGER PRIMARY KEY, + robot_id INTEGER, + deleted_at TIMESTAMP NULL + )`, + } { + if _, err := db.Exec(stmt); err != nil { + _ = db.Close() + t.Fatalf("create schema: %v", err) + } + } + t.Cleanup(func() { _ = db.Close() }) + return db +} + +func TestResolveAssetIDForEpisode_MetadataWins(t *testing.T) { + db := newTestAssetResolverDB(t) + if _, err := db.Exec(`INSERT INTO robots (id, device_id, asset_id) VALUES (1, 'local-device', 'fallback-asset')`); err != nil { + t.Fatalf("seed robot: %v", err) + } + if _, err := db.Exec(`INSERT INTO workstations (id, robot_id) VALUES (10, 1)`); err != nil { + 
t.Fatalf("seed workstation: %v", err) + } + + got, err := resolveAssetIDForEpisode( + context.Background(), + db, + 1, + sql.NullString{String: `{"asset_id":" snapshot-asset "}`, Valid: true}, + sql.NullInt64{Int64: 10, Valid: true}, + ) + if err != nil { + t.Fatalf("resolveAssetIDForEpisode() error = %v", err) + } + if got != "snapshot-asset" { + t.Fatalf("asset_id=%q want snapshot-asset", got) + } +} + +func TestResolveAssetIDForEpisode_FallbackReadsSoftDeletedWorkstation(t *testing.T) { + db := newTestAssetResolverDB(t) + if _, err := db.Exec(`INSERT INTO robots (id, device_id, asset_id) VALUES (1, 'local-device', 'fallback-asset')`); err != nil { + t.Fatalf("seed robot: %v", err) + } + if _, err := db.Exec(`INSERT INTO workstations (id, robot_id, deleted_at) VALUES (10, 1, CURRENT_TIMESTAMP)`); err != nil { + t.Fatalf("seed workstation: %v", err) + } + + got, err := resolveAssetIDForEpisode( + context.Background(), + db, + 1, + sql.NullString{}, + sql.NullInt64{Int64: 10, Valid: true}, + ) + if err != nil { + t.Fatalf("resolveAssetIDForEpisode() error = %v", err) + } + if got != "fallback-asset" { + t.Fatalf("asset_id=%q want fallback-asset", got) + } +} + +func TestResolveAssetIDForEpisode_MissingDoesNotFallbackToLocalDeviceID(t *testing.T) { + db := newTestAssetResolverDB(t) + if _, err := db.Exec(`INSERT INTO robots (id, device_id, asset_id) VALUES (1, 'local-device', NULL)`); err != nil { + t.Fatalf("seed robot: %v", err) + } + if _, err := db.Exec(`INSERT INTO workstations (id, robot_id) VALUES (10, 1)`); err != nil { + t.Fatalf("seed workstation: %v", err) + } + + _, err := resolveAssetIDForEpisode( + context.Background(), + db, + 1, + sql.NullString{}, + sql.NullInt64{Int64: 10, Valid: true}, + ) + if err == nil || !strings.Contains(err.Error(), "asset_id") { + t.Fatalf("error=%v want asset_id missing error", err) + } +} diff --git a/internal/services/dp_config_loader.go b/internal/services/dp_config_loader.go new file mode 100644 index 0000000..f85f773 
--- /dev/null +++ b/internal/services/dp_config_loader.go @@ -0,0 +1,171 @@ +// SPDX-FileCopyrightText: 2026 ArcheBase +// +// SPDX-License-Identifier: MulanPSL-2.0 + +package services + +import ( + "encoding/json" + "fmt" + "net" + "net/url" + "os" + "strings" +) + +// DPConfigFile is the subset of data-platform config consumed by direct sync. +type DPConfigFile struct { + Version *int `json:"version,omitempty"` + Endpoints DPConfigEndpoints `json:"endpoints"` + Devices []DPDeviceProfile `json:"devices"` +} + +// DPConfigEndpoints contains the auth and gateway endpoints from a DP config file. +type DPConfigEndpoints struct { + Auth string `json:"auth"` + Gateway string `json:"gateway"` +} + +// DPDeviceProfile contains upload credentials and tags for one DP device. +type DPDeviceProfile struct { + DeviceID string `json:"deviceId"` + APIKey string `json:"apiKey"` // #nosec G117 -- operator-provided local DP upload config credential + Tags map[string]string `json:"tags"` +} + +// DPResolvedEndpoint is a normalized upload service endpoint. +type DPResolvedEndpoint struct { + Target string + UseTLS bool + ServerName string +} + +// DPDeviceUploadConfig contains the resolved upload config for one asset ID. 
+type DPDeviceUploadConfig struct { + ConfigPath string + Auth DPResolvedEndpoint + Gateway DPResolvedEndpoint + Profile DPDeviceProfile +} + +func loadDPDeviceUploadConfig(configPath string, assetID string) (*DPDeviceUploadConfig, error) { + configPath = strings.TrimSpace(configPath) + assetID = strings.TrimSpace(assetID) + if configPath == "" { + return nil, fmt.Errorf("KEYSTONE_SYNC_DP_CONFIG is required") + } + if assetID == "" { + return nil, fmt.Errorf("asset_id is required") + } + + data, err := os.ReadFile(configPath) //nolint:gosec // operator-controlled config path + if err != nil { + return nil, fmt.Errorf("read DP config %s: %w", configPath, err) + } + + var cfg DPConfigFile + if err := json.Unmarshal(data, &cfg); err != nil { + return nil, fmt.Errorf("parse DP config %s: %w", configPath, err) + } + if cfg.Version != nil && *cfg.Version != 3 { + return nil, fmt.Errorf("DP config %s has unsupported version %d", configPath, *cfg.Version) + } + + authEndpoint, err := parseDPResolvedEndpoint(cfg.Endpoints.Auth) + if err != nil { + return nil, fmt.Errorf("invalid endpoints.auth in DP config %s: %w", configPath, err) + } + gatewayEndpoint, err := parseDPResolvedEndpoint(cfg.Endpoints.Gateway) + if err != nil { + return nil, fmt.Errorf("invalid endpoints.gateway in DP config %s: %w", configPath, err) + } + + devices := make(map[string]DPDeviceProfile, len(cfg.Devices)) + for idx, device := range cfg.Devices { + deviceID := strings.TrimSpace(device.DeviceID) + if deviceID == "" { + return nil, fmt.Errorf("DP config %s devices[%d].deviceId is empty", configPath, idx) + } + if _, exists := devices[deviceID]; exists { + return nil, fmt.Errorf("DP config %s has duplicate deviceId %q", configPath, deviceID) + } + device.DeviceID = deviceID + devices[deviceID] = device + } + + profile, ok := devices[assetID] + if !ok { + return nil, fmt.Errorf("DP config %s has no device profile for asset_id %q", configPath, assetID) + } + profile.APIKey = 
strings.TrimSpace(profile.APIKey) + if profile.APIKey == "" { + return nil, fmt.Errorf("DP config %s device %q apiKey is empty", configPath, assetID) + } + if len(profile.Tags) == 0 { + return nil, fmt.Errorf("DP config %s device %q tags must be non-empty", configPath, assetID) + } + for key := range profile.Tags { + if key == "" { + return nil, fmt.Errorf("DP config %s device %q has an empty tag key", configPath, assetID) + } + } + + return &DPDeviceUploadConfig{ + ConfigPath: configPath, + Auth: authEndpoint, + Gateway: gatewayEndpoint, + Profile: profile, + }, nil +} + +func parseDPResolvedEndpoint(raw string) (DPResolvedEndpoint, error) { + value := strings.TrimSpace(raw) + if value == "" { + return DPResolvedEndpoint{}, fmt.Errorf("endpoint is required") + } + + if strings.Contains(value, "://") { + parsed, err := url.Parse(value) + if err != nil { + return DPResolvedEndpoint{}, err + } + if parsed.Scheme != "http" && parsed.Scheme != "https" { + return DPResolvedEndpoint{}, fmt.Errorf("unsupported scheme %q", parsed.Scheme) + } + if parsed.Host == "" || parsed.User != nil { + return DPResolvedEndpoint{}, fmt.Errorf("endpoint must be host[:port]") + } + if parsed.Path != "" || parsed.RawQuery != "" || parsed.Fragment != "" { + return DPResolvedEndpoint{}, fmt.Errorf("endpoint must not include path, query, or fragment") + } + host := parsed.Hostname() + if host == "" { + return DPResolvedEndpoint{}, fmt.Errorf("endpoint host is required") + } + target := parsed.Host + if parsed.Port() == "" { + defaultPort := "80" + if parsed.Scheme == "https" { + defaultPort = "443" + } + target = net.JoinHostPort(host, defaultPort) + } + return DPResolvedEndpoint{ + Target: target, + UseTLS: parsed.Scheme == "https", + ServerName: tlsServerNameForScheme(parsed.Scheme, host), + }, nil + } + + if strings.ContainsAny(value, "/?#") { + return DPResolvedEndpoint{}, fmt.Errorf("bare endpoint must not include path, query, or fragment") + } + return DPResolvedEndpoint{Target: value, 
UseTLS: false}, nil +} + +func tlsServerNameForScheme(scheme string, host string) string { + if scheme == "https" { + return host + } + return "" +} diff --git a/internal/services/dp_config_loader_test.go b/internal/services/dp_config_loader_test.go new file mode 100644 index 0000000..deff985 --- /dev/null +++ b/internal/services/dp_config_loader_test.go @@ -0,0 +1,176 @@ +// SPDX-FileCopyrightText: 2026 ArcheBase +// +// SPDX-License-Identifier: MulanPSL-2.0 + +package services + +import ( + "os" + "path/filepath" + "strings" + "testing" +) + +func writeDPConfigFixture(t *testing.T, body string) string { + t.Helper() + path := filepath.Join(t.TempDir(), "dp-config.json") + if err := os.WriteFile(path, []byte(body), 0o600); err != nil { + t.Fatalf("write DP config fixture: %v", err) + } + return path +} + +func validDPConfigJSON(extra string) string { + version := `"version":3,` + if extra == "missing-version" { + version = "" + } + return `{ + ` + version + ` + "endpoints": { + "auth": "https://auth.example.com", + "gateway": "gateway.example.com:7443" + }, + "devices": [{ + "deviceId": " asset-1 ", + "apiKey": " api-key-1 ", + "tags": {"line": "A", "empty_value": ""} + }] + }` +} + +func TestLoadDPDeviceUploadConfig_SelectsDeviceAndEndpoints(t *testing.T) { + for _, tt := range []struct { + name string + body string + }{ + {name: "version 3", body: validDPConfigJSON("")}, + {name: "missing version", body: validDPConfigJSON("missing-version")}, + } { + t.Run(tt.name, func(t *testing.T) { + cfg, err := loadDPDeviceUploadConfig(writeDPConfigFixture(t, tt.body), "asset-1") + if err != nil { + t.Fatalf("loadDPDeviceUploadConfig() error = %v", err) + } + if cfg.Profile.DeviceID != "asset-1" { + t.Fatalf("Profile.DeviceID=%q want asset-1", cfg.Profile.DeviceID) + } + if cfg.Profile.APIKey != "api-key-1" { + t.Fatalf("Profile.APIKey was not trimmed") + } + if cfg.Auth.Target != "auth.example.com:443" || !cfg.Auth.UseTLS || cfg.Auth.ServerName != "auth.example.com" { + 
t.Fatalf("auth endpoint=%+v", cfg.Auth) + } + if cfg.Gateway.Target != "gateway.example.com:7443" || cfg.Gateway.UseTLS { + t.Fatalf("gateway endpoint=%+v", cfg.Gateway) + } + if cfg.Profile.Tags["empty_value"] != "" { + t.Fatalf("empty tag values must be preserved: %+v", cfg.Profile.Tags) + } + }) + } +} + +func TestParseDPResolvedEndpoint(t *testing.T) { + tests := []struct { + raw string + target string + useTLS bool + serverName string + }{ + {raw: "https://dp.example.com", target: "dp.example.com:443", useTLS: true, serverName: "dp.example.com"}, + {raw: "https://dp.example.com:9443", target: "dp.example.com:9443", useTLS: true, serverName: "dp.example.com"}, + {raw: "http://dp.example.com", target: "dp.example.com:80", useTLS: false}, + {raw: "dp.example.com:7443", target: "dp.example.com:7443", useTLS: false}, + {raw: "dp.example.com", target: "dp.example.com", useTLS: false}, + } + for _, tt := range tests { + t.Run(tt.raw, func(t *testing.T) { + got, err := parseDPResolvedEndpoint(tt.raw) + if err != nil { + t.Fatalf("parseDPResolvedEndpoint() error = %v", err) + } + if got.Target != tt.target || got.UseTLS != tt.useTLS || got.ServerName != tt.serverName { + t.Fatalf("parseDPResolvedEndpoint()=%+v want target=%q tls=%t server=%q", got, tt.target, tt.useTLS, tt.serverName) + } + }) + } +} + +func TestParseDPResolvedEndpointRejectsUnsupportedForms(t *testing.T) { + for _, raw := range []string{ + "", + "https://dp.example.com/path", + "https://dp.example.com?x=1", + "https://dp.example.com#frag", + "ftp://dp.example.com", + "dp.example.com/path", + "dp.example.com?x=1", + "dp.example.com#frag", + } { + t.Run(raw, func(t *testing.T) { + if _, err := parseDPResolvedEndpoint(raw); err == nil { + t.Fatalf("parseDPResolvedEndpoint(%q) expected error", raw) + } + }) + } +} + +func TestLoadDPDeviceUploadConfigRejectsContractErrors(t *testing.T) { + tests := []struct { + name string + body string + deviceID string + want string + }{ + { + name: "unsupported 
version", + body: `{"version":2,"endpoints":{"auth":"auth:1","gateway":"gateway:2"},"devices":[{"deviceId":"asset-1","apiKey":"key","tags":{"k":"v"}}]}`, + want: "unsupported version", + }, + { + name: "missing device", + body: validDPConfigJSON(""), + deviceID: "CLOUD-device-1", + want: "no device profile", + }, + { + name: "empty api key", + body: `{"version":3,"endpoints":{"auth":"auth:1","gateway":"gateway:2"},"devices":[{"deviceId":"asset-1","apiKey":" ","tags":{"k":"v"}}]}`, + want: "apiKey is empty", + }, + { + name: "empty tags", + body: `{"version":3,"endpoints":{"auth":"auth:1","gateway":"gateway:2"},"devices":[{"deviceId":"asset-1","apiKey":"key","tags":{}}]}`, + want: "tags must be non-empty", + }, + { + name: "empty tag key", + body: `{"version":3,"endpoints":{"auth":"auth:1","gateway":"gateway:2"},"devices":[{"deviceId":"asset-1","apiKey":"key","tags":{"":"v"}}]}`, + want: "empty tag key", + }, + { + name: "duplicate device", + body: `{"version":3,"endpoints":{"auth":"auth:1","gateway":"gateway:2"},"devices":[{"deviceId":" asset-1 ","apiKey":"key","tags":{"k":"v"}},{"deviceId":"asset-1","apiKey":"key2","tags":{"k":"v"}}]}`, + want: "duplicate deviceId", + }, + { + name: "missing endpoint", + body: `{"version":3,"endpoints":{"auth":"","gateway":"gateway:2"},"devices":[{"deviceId":"asset-1","apiKey":"key","tags":{"k":"v"}}]}`, + want: "endpoints.auth", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + deviceID := tt.deviceID + if deviceID == "" { + deviceID = "asset-1" + } + _, err := loadDPDeviceUploadConfig(writeDPConfigFixture(t, tt.body), deviceID) + if err == nil || !strings.Contains(err.Error(), tt.want) { + t.Fatalf("error=%v want contains %q", err, tt.want) + } + }) + } +} diff --git a/internal/services/dp_raw_tags.go b/internal/services/dp_raw_tags.go new file mode 100644 index 0000000..925116a --- /dev/null +++ b/internal/services/dp_raw_tags.go @@ -0,0 +1,96 @@ +// SPDX-FileCopyrightText: 2026 ArcheBase +// +// 
SPDX-License-Identifier: MulanPSL-2.0 + +package services + +import ( + "database/sql" + "fmt" + "path" + "strconv" + "strings" +) + +const ( + dpReservedDeviceIDTagKey = "778a6d83c9ec49108537542a570966ee.device_id" + dpReservedRawFileTagKey = "a206e337ecdf70a93bb611cf6a30c346.raw_file" +) + +type dpRawTagsInput struct { + Profile DPDeviceProfile + McapKey string + SidecarTags map[string]string + EpisodeID int64 + EpisodePublicID string + TaskID int64 + FactoryID sql.NullInt64 + OrganizationID sql.NullInt64 +} + +func buildDPDirectRawTags(input dpRawTagsInput) (map[string]string, error) { + mcapKey := stripBucketPrefix(input.McapKey) + rawFile := path.Base(strings.TrimSpace(mcapKey)) + if rawFile == "" || rawFile == "." || rawFile == "/" { + return nil, fmt.Errorf("raw_file basename is empty for mcap key %q", input.McapKey) + } + + merged := make(map[string]string, len(input.Profile.Tags)+len(input.SidecarTags)+8) + if err := insertAllNonConflictingTags(merged, input.Profile.Tags); err != nil { + return nil, fmt.Errorf("device profile tags: %w", err) + } + if err := insertNonConflictingTag(merged, dpReservedDeviceIDTagKey, input.Profile.DeviceID); err != nil { + return nil, err + } + if err := insertNonConflictingTag(merged, dpReservedRawFileTagKey, rawFile); err != nil { + return nil, err + } + if err := insertAllNonConflictingTags(merged, input.SidecarTags); err != nil { + return nil, fmt.Errorf("sidecar tags: %w", err) + } + if err := insertAllNonConflictingTags(merged, keystoneExtraTags(input)); err != nil { + return nil, fmt.Errorf("keystone extra tags: %w", err) + } + return merged, nil +} + +func keystoneExtraTags(input dpRawTagsInput) map[string]string { + tags := map[string]string{ + "episode_id": input.EpisodePublicID, + "keystone_episode_id": strconv.FormatInt(input.EpisodeID, 10), + "sync_channel": "keystone_direct", + } + if input.TaskID > 0 { + tags["task_id"] = strconv.FormatInt(input.TaskID, 10) + } + if input.FactoryID.Valid { + tags["factory_id"] 
= strconv.FormatInt(input.FactoryID.Int64, 10) + } + if input.OrganizationID.Valid { + tags["organization_id"] = strconv.FormatInt(input.OrganizationID.Int64, 10) + } + return tags +} + +func insertAllNonConflictingTags(dst map[string]string, src map[string]string) error { + for key, value := range src { + if err := insertNonConflictingTag(dst, key, value); err != nil { + return err + } + } + return nil +} + +func insertNonConflictingTag(dst map[string]string, key string, value string) error { + if key == "" { + return fmt.Errorf("raw tag key must not be empty") + } + if existing, ok := dst[key]; ok { + if existing != value { + return fmt.Errorf("raw tag conflict for key %q", key) + } + return nil + } + dst[key] = value + return nil +} diff --git a/internal/services/dp_raw_tags_test.go b/internal/services/dp_raw_tags_test.go new file mode 100644 index 0000000..bbca857 --- /dev/null +++ b/internal/services/dp_raw_tags_test.go @@ -0,0 +1,165 @@ +// SPDX-FileCopyrightText: 2026 ArcheBase +// +// SPDX-License-Identifier: MulanPSL-2.0 + +package services + +import ( + "database/sql" + "strings" + "testing" +) + +func TestBuildDPDirectRawTags_MergesInDocumentedOrder(t *testing.T) { + got, err := buildDPDirectRawTags(dpRawTagsInput{ + Profile: DPDeviceProfile{ + DeviceID: "asset-1", + Tags: map[string]string{ + "profile": "tag", + "same": "value", + }, + }, + McapKey: "edge-factory/factory/device/task.mcap", + SidecarTags: map[string]string{ + "same": "value", + "array_field": `["a","b"]`, + "empty_value": "", + }, + EpisodeID: 42, + EpisodePublicID: "episode-public-42", + TaskID: 77, + FactoryID: sql.NullInt64{Int64: 3, Valid: true}, + OrganizationID: sql.NullInt64{Int64: 9, Valid: true}, + }) + if err != nil { + t.Fatalf("buildDPDirectRawTags() error = %v", err) + } + + cases := map[string]string{ + "profile": "tag", + "same": "value", + dpReservedDeviceIDTagKey: "asset-1", + dpReservedRawFileTagKey: "task.mcap", + "array_field": `["a","b"]`, + "empty_value": "", + 
"episode_id": "episode-public-42", + "keystone_episode_id": "42", + "sync_channel": "keystone_direct", + "task_id": "77", + "factory_id": "3", + "organization_id": "9", + } + for key, want := range cases { + if got[key] != want { + t.Fatalf("tag[%q]=%q want %q tags=%+v", key, got[key], want, got) + } + } + if _, ok := got["device_id"]; ok { + t.Fatalf("ordinary device_id raw tag must not be injected: %+v", got) + } +} + +func TestBuildDPDirectRawTags_UsesMcapKeyBasenameNotSidecarMcapFile(t *testing.T) { + got, err := buildDPDirectRawTags(dpRawTagsInput{ + Profile: DPDeviceProfile{ + DeviceID: "asset-1", + Tags: map[string]string{"profile": "tag"}, + }, + McapKey: "bucket/minio/path/actual.mcap", + SidecarTags: map[string]string{ + "mcap_file": "sidecar-claimed.mcap", + }, + EpisodeID: 1, + EpisodePublicID: "episode-1", + }) + if err != nil { + t.Fatalf("buildDPDirectRawTags() error = %v", err) + } + if got[dpReservedRawFileTagKey] != "actual.mcap" { + t.Fatalf("raw_file=%q want actual.mcap", got[dpReservedRawFileTagKey]) + } + if got["mcap_file"] != "sidecar-claimed.mcap" { + t.Fatalf("sidecar mcap_file should remain ordinary sidecar tag: %+v", got) + } +} + +func TestBuildDPDirectRawTags_ConflictingTagsFail(t *testing.T) { + tests := []struct { + name string + input dpRawTagsInput + }{ + { + name: "profile conflicts with reserved device id", + input: dpRawTagsInput{ + Profile: DPDeviceProfile{ + DeviceID: "asset-1", + Tags: map[string]string{dpReservedDeviceIDTagKey: "other-device"}, + }, + McapKey: "bucket/file.mcap", + EpisodeID: 1, + EpisodePublicID: "episode-1", + }, + }, + { + name: "sidecar conflicts with profile", + input: dpRawTagsInput{ + Profile: DPDeviceProfile{ + DeviceID: "asset-1", + Tags: map[string]string{"scene": "profile"}, + }, + McapKey: "bucket/file.mcap", + SidecarTags: map[string]string{"scene": "sidecar"}, + EpisodeID: 1, + EpisodePublicID: "episode-1", + }, + }, + { + name: "sidecar conflicts with keystone extra", + input: dpRawTagsInput{ 
+ Profile: DPDeviceProfile{ + DeviceID: "asset-1", + Tags: map[string]string{"profile": "tag"}, + }, + McapKey: "bucket/file.mcap", + SidecarTags: map[string]string{"sync_channel": "other"}, + EpisodeID: 1, + EpisodePublicID: "episode-1", + }, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + if _, err := buildDPDirectRawTags(tt.input); err == nil || !strings.Contains(err.Error(), "conflict") { + t.Fatalf("error=%v want conflict", err) + } + }) + } +} + +func TestBuildDPDirectRawTags_RejectsEmptyKeyAndRawFile(t *testing.T) { + _, err := buildDPDirectRawTags(dpRawTagsInput{ + Profile: DPDeviceProfile{ + DeviceID: "asset-1", + Tags: map[string]string{"": "value"}, + }, + McapKey: "bucket/file.mcap", + EpisodeID: 1, + EpisodePublicID: "episode-1", + }) + if err == nil || !strings.Contains(err.Error(), "key") { + t.Fatalf("empty key error=%v", err) + } + + _, err = buildDPDirectRawTags(dpRawTagsInput{ + Profile: DPDeviceProfile{ + DeviceID: "asset-1", + Tags: map[string]string{"profile": "tag"}, + }, + McapKey: "bucket/", + EpisodeID: 1, + EpisodePublicID: "episode-1", + }) + if err == nil || !strings.Contains(err.Error(), "raw_file") { + t.Fatalf("empty raw_file error=%v", err) + } +} diff --git a/internal/services/sync_errors.go b/internal/services/sync_errors.go new file mode 100644 index 0000000..c07e3ff --- /dev/null +++ b/internal/services/sync_errors.go @@ -0,0 +1,45 @@ +// SPDX-FileCopyrightText: 2026 ArcheBase +// +// SPDX-License-Identifier: MulanPSL-2.0 + +package services + +import ( + "errors" + "fmt" +) + +type syncNonRetryableError struct { + err error +} + +func (e *syncNonRetryableError) Error() string { + if e == nil || e.err == nil { + return "" + } + return e.err.Error() +} + +func (e *syncNonRetryableError) Unwrap() error { + if e == nil { + return nil + } + return e.err +} + +func newNonRetryableSyncError(format string, args ...interface{}) error { + return &syncNonRetryableError{err: fmt.Errorf(format, args...)} +} + 
+func wrapNonRetryableSyncError(err error, format string, args ...interface{}) error { + if err == nil { + return nil + } + msg := fmt.Sprintf(format, args...) + return &syncNonRetryableError{err: fmt.Errorf("%s: %w", msg, err)} +} + +func isNonRetryableSyncError(err error) bool { + var target *syncNonRetryableError + return errors.As(err, &target) +} diff --git a/internal/services/sync_worker.go b/internal/services/sync_worker.go index f9971bb..52bc7cd 100644 --- a/internal/services/sync_worker.go +++ b/internal/services/sync_worker.go @@ -40,6 +40,20 @@ type SyncWorkerConfig struct { type syncEnqueueRequest struct { episodeID int64 manual bool + resync bool +} + +type syncEpisodeUploadRow struct { + ID int64 `db:"id"` + EpisodeUUID string `db:"episode_id"` + TaskID int64 `db:"task_id"` + McapPath string `db:"mcap_path"` + SidecarPath string `db:"sidecar_path"` + CloudSynced bool `db:"cloud_synced"` + Metadata sql.NullString `db:"metadata"` + WorkstationID sql.NullInt64 `db:"workstation_id"` + FactoryID sql.NullInt64 `db:"factory_id"` + OrganizationID sql.NullInt64 `db:"organization_id"` } // SyncWorker is a background goroutine that processes queued cloud sync work @@ -85,6 +99,7 @@ var ( errSyncRetryBackoffActive = errors.New("sync retry backoff active") errSyncRetryExhausted = errors.New("sync retry max retries exceeded") errSyncAlreadyCompleted = errors.New("sync already completed") + errSyncNonRetryableFailed = errors.New("sync latest failure is non-retryable") ) // NewSyncWorker creates a new sync worker. Call Start() to begin background processing. 
@@ -224,13 +239,25 @@ func (w *SyncWorker) EnqueueEpisodeManual(ctx context.Context, episodeID int64) if !w.running.Load() { return ErrSyncWorkerNotRunning } - if err := w.persistPendingSyncLog(ctx, episodeID, true); err != nil { + if err := w.persistPendingSyncLog(ctx, episodeID, true, false); err != nil { return err } w.enqueuePersistedEpisode(ctx, syncEnqueueRequest{episodeID: episodeID, manual: true}) return nil } +// EnqueueEpisodeResync queues a new upload attempt for an episode that has already synced. +func (w *SyncWorker) EnqueueEpisodeResync(ctx context.Context, episodeID int64) error { + if !w.running.Load() { + return ErrSyncWorkerNotRunning + } + if err := w.persistResyncSyncLog(ctx, episodeID); err != nil { + return err + } + w.enqueuePersistedEpisode(ctx, syncEnqueueRequest{episodeID: episodeID, manual: true, resync: true}) + return nil +} + func (w *SyncWorker) enqueueEpisode(ctx context.Context, episodeID int64, manual bool) error { if !w.running.Load() { return ErrSyncWorkerNotRunning @@ -267,7 +294,7 @@ func (w *SyncWorker) enqueuePersistedEpisode(ctx context.Context, req syncEnqueu } } -func (w *SyncWorker) persistPendingSyncLog(ctx context.Context, episodeID int64, manual bool) error { +func (w *SyncWorker) persistPendingSyncLog(ctx context.Context, episodeID int64, manual bool, allowSynced bool) error { if w.db == nil { return nil } @@ -293,7 +320,7 @@ func (w *SyncWorker) persistPendingSyncLog(ctx context.Context, episodeID int64, } return fmt.Errorf("lock episode %d: %w", episodeID, err) } - if episode.CloudSynced { + if episode.CloudSynced && !allowSynced { return fmt.Errorf("episode %d already synced", episodeID) } @@ -340,13 +367,16 @@ func (w *SyncWorker) persistPendingSyncLog(ctx context.Context, episodeID int64, case "completed": return fmt.Errorf("%w for episode %d", errSyncAlreadyCompleted, episodeID) case "failed": - retryDue := !latest.NextRetry.Valid || !latest.NextRetry.Time.After(now) + retryDue := latest.NextRetry.Valid && 
!latest.NextRetry.Time.After(now) if latest.AttemptCount < w.cfg.MaxRetries && retryDue { if err := promoteFailedSyncLogToPending(ctx, tx, latest.ID, now); err != nil { return err } return tx.Commit() } + if !manual && !latest.NextRetry.Valid { + return fmt.Errorf("%w for episode %d", errSyncNonRetryableFailed, episodeID) + } if !manual && latest.AttemptCount >= w.cfg.MaxRetries { return fmt.Errorf("%w for episode %d", errSyncRetryExhausted, episodeID) } @@ -362,6 +392,55 @@ func (w *SyncWorker) persistPendingSyncLog(ctx context.Context, episodeID int64, } } +func (w *SyncWorker) persistResyncSyncLog(ctx context.Context, episodeID int64) error { + if w.db == nil { + return nil + } + + tx, err := w.db.BeginTxx(ctx, nil) + if err != nil { + return fmt.Errorf("begin resync sync_log transaction: %w", err) + } + defer func() { _ = tx.Rollback() }() + + lockClause := txLockClause(tx) + var episode struct { + ID int64 `db:"id"` + CloudSynced bool `db:"cloud_synced"` + } + if err := tx.GetContext(ctx, &episode, ` + SELECT id, cloud_synced + FROM episodes + WHERE id = ? AND deleted_at IS NULL + `+lockClause, episodeID); err != nil { + if err == sql.ErrNoRows { + return fmt.Errorf("episode %d not found", episodeID) + } + return fmt.Errorf("lock episode %d for resync: %w", episodeID, err) + } + if !episode.CloudSynced { + return fmt.Errorf("episode %d has not completed cloud sync", episodeID) + } + + var activeCount int + if err := tx.GetContext(ctx, &activeCount, ` + SELECT COUNT(*) + FROM sync_logs + WHERE episode_id = ? 
+ AND status IN ('pending', 'in_progress') + `, episodeID); err != nil { + return fmt.Errorf("query active resync sync_log count: %w", err) + } + if activeCount > 0 { + return fmt.Errorf("%w for episode %d", ErrSyncAlreadyInProgress, episodeID) + } + + if err := insertPendingSyncLog(ctx, tx, episodeID, time.Now().UTC(), 0); err != nil { + return err + } + return tx.Commit() +} + func insertPendingSyncLog(ctx context.Context, tx *sqlx.Tx, episodeID int64, queuedAt time.Time, attemptCount int) error { if _, err := tx.ExecContext(ctx, ` INSERT INTO sync_logs (episode_id, status, attempt_count, started_at) @@ -408,7 +487,8 @@ func isSkippablePendingError(err error) bool { return errors.Is(err, ErrSyncAlreadyInProgress) || errors.Is(err, errSyncRetryBackoffActive) || errors.Is(err, errSyncRetryExhausted) || - errors.Is(err, errSyncAlreadyCompleted) + errors.Is(err, errSyncAlreadyCompleted) || + errors.Is(err, errSyncNonRetryableFailed) } // EnqueuePendingEpisodes scans for all approved but un-synced episodes and enqueues them. 
@@ -424,7 +504,7 @@ func (w *SyncWorker) EnqueuePendingEpisodes(ctx context.Context) (int, error) { } count := 0 for _, id := range ids { - if err := w.persistPendingSyncLog(ctx, id, false); err != nil { + if err := w.persistPendingSyncLog(ctx, id, false, false); err != nil { if isSkippablePendingError(err) { continue } @@ -497,9 +577,9 @@ func (w *SyncWorker) processEnqueuedEpisode(ctx context.Context, req syncEnqueue w.processEnqueuedEpisodeWith(ctx, req, w.processEpisodeWithMode) } -func (w *SyncWorker) processEnqueuedEpisodeWith(ctx context.Context, req syncEnqueueRequest, process func(context.Context, int64, bool)) { +func (w *SyncWorker) processEnqueuedEpisodeWith(ctx context.Context, req syncEnqueueRequest, process func(context.Context, int64, bool, bool)) { defer w.unmarkEnqueued(req.episodeID) - process(ctx, req.episodeID, req.manual) + process(ctx, req.episodeID, req.manual, req.resync) } func (w *SyncWorker) dispatchJob(ctx context.Context, req syncEnqueueRequest) { @@ -590,7 +670,7 @@ func (w *SyncWorker) pollAndProcess(ctx context.Context) { logger.Printf("[SYNC-WORKER] Found %d episodes to sync", len(ids)) for _, id := range ids { - if err := w.persistPendingSyncLog(ctx, id, false); err != nil { + if err := w.persistPendingSyncLog(ctx, id, false, false); err != nil { if isSkippablePendingError(err) { continue } @@ -602,13 +682,13 @@ func (w *SyncWorker) pollAndProcess(ctx context.Context) { } func (w *SyncWorker) dispatchPendingSyncLogs(ctx context.Context) { - ids, err := w.findPendingSyncLogEpisodes(ctx) + reqs, err := w.findPendingSyncLogEpisodes(ctx) if err != nil { logger.Printf("[SYNC-WORKER] Failed to find queued sync logs: %v", err) return } - for _, id := range ids { - w.dispatchPersistedJob(ctx, syncEnqueueRequest{episodeID: id, manual: false}) + for _, req := range reqs { + w.dispatchPersistedJob(ctx, req) } } @@ -619,10 +699,13 @@ func (w *SyncWorker) dispatchPersistedJob(ctx context.Context, req syncEnqueueRe w.dispatchJob(ctx, req) } 
-func (w *SyncWorker) findPendingSyncLogEpisodes(ctx context.Context) ([]int64, error) { - var ids []int64 - if err := w.db.SelectContext(ctx, &ids, ` - SELECT latest_log.episode_id +func (w *SyncWorker) findPendingSyncLogEpisodes(ctx context.Context) ([]syncEnqueueRequest, error) { + var rows []struct { + EpisodeID int64 `db:"episode_id"` + CloudSynced bool `db:"cloud_synced"` + } + if err := w.db.SelectContext(ctx, &rows, ` + SELECT latest_log.episode_id, e.cloud_synced FROM sync_logs latest_log INNER JOIN ( SELECT episode_id, MAX(id) AS latest_id @@ -631,14 +714,17 @@ func (w *SyncWorker) findPendingSyncLogEpisodes(ctx context.Context) ([]int64, e ) latest ON latest_log.episode_id = latest.episode_id AND latest_log.id = latest.latest_id INNER JOIN episodes e ON e.id = latest_log.episode_id WHERE latest_log.status = 'pending' - AND e.cloud_synced = FALSE AND e.deleted_at IS NULL ORDER BY latest_log.started_at ASC, latest_log.id ASC LIMIT ? `, w.cfg.BatchSize); err != nil { return nil, fmt.Errorf("query pending sync logs: %w", err) } - return ids, nil + reqs := make([]syncEnqueueRequest, len(rows)) + for i, row := range rows { + reqs[i] = syncEnqueueRequest{episodeID: row.EpisodeID, resync: row.CloudSynced} + } + return reqs, nil } func (w *SyncWorker) findPendingEpisodes(ctx context.Context, includeExhaustedFailures bool) ([]int64, error) { @@ -682,6 +768,17 @@ func (w *SyncWorker) findPendingEpisodes(ctx context.Context, includeExhaustedFa WHERE sl.episode_id = e.id AND sl.status = 'failed' AND sl.attempt_count >= ? 
+ ) + AND NOT EXISTS ( + SELECT 1 FROM sync_logs sl + INNER JOIN ( + SELECT episode_id, MAX(id) AS latest_id + FROM sync_logs + GROUP BY episode_id + ) t ON sl.episode_id = t.episode_id AND sl.id = t.latest_id + WHERE sl.episode_id = e.id + AND sl.status = 'failed' + AND sl.next_retry_at IS NULL )`) err = w.db.SelectContext(ctx, &ids, query, w.cfg.MaxRetries, w.cfg.BatchSize) } else { @@ -695,19 +792,25 @@ func (w *SyncWorker) findPendingEpisodes(ctx context.Context, includeExhaustedFa } func (w *SyncWorker) retryFailedEpisodes(ctx context.Context) { - var ids []int64 + var rows []struct { + EpisodeID int64 `db:"episode_id"` + CloudSynced bool `db:"cloud_synced"` + } now := time.Now().UTC() - err := w.db.SelectContext(ctx, &ids, ` - SELECT sl.episode_id + err := w.db.SelectContext(ctx, &rows, ` + SELECT sl.episode_id, e.cloud_synced FROM sync_logs sl INNER JOIN ( SELECT episode_id, MAX(id) AS latest_id FROM sync_logs GROUP BY episode_id ) t ON sl.episode_id = t.episode_id AND sl.id = t.latest_id + INNER JOIN episodes e ON e.id = sl.episode_id WHERE sl.status = 'failed' + AND e.deleted_at IS NULL AND sl.attempt_count < ? - AND (sl.next_retry_at IS NULL OR sl.next_retry_at <= ?) + AND sl.next_retry_at IS NOT NULL + AND sl.next_retry_at <= ? 
AND NOT EXISTS ( SELECT 1 FROM sync_logs sl2 WHERE sl2.episode_id = sl.episode_id @@ -721,33 +824,36 @@ func (w *SyncWorker) retryFailedEpisodes(ctx context.Context) { return } - if len(ids) == 0 { + if len(rows) == 0 { return } - for _, id := range ids { - if err := w.persistPendingSyncLog(ctx, id, false); err != nil { + for _, row := range rows { + if err := w.persistPendingSyncLog(ctx, row.EpisodeID, false, row.CloudSynced); err != nil { if isSkippablePendingError(err) { continue } - logger.Printf("[SYNC-WORKER] Failed to queue retry for episode %d: %v", id, err) + logger.Printf("[SYNC-WORKER] Failed to queue retry for episode %d: %v", row.EpisodeID, err) continue } - w.dispatchPersistedJob(ctx, syncEnqueueRequest{episodeID: id, manual: false}) + w.dispatchPersistedJob(ctx, syncEnqueueRequest{episodeID: row.EpisodeID, manual: false, resync: row.CloudSynced}) } } -func (w *SyncWorker) processEpisodeWithMode(ctx context.Context, episodeID int64, manual bool) { - // Fetch episode details - var ep struct { - ID int64 `db:"id"` - EpisodeUUID string `db:"episode_id"` - McapPath string `db:"mcap_path"` - SidecarPath string `db:"sidecar_path"` - CloudSynced bool `db:"cloud_synced"` - } +func (w *SyncWorker) processEpisodeWithMode(ctx context.Context, episodeID int64, manual bool, resync bool) { + var ep syncEpisodeUploadRow err := w.db.GetContext(ctx, &ep, ` - SELECT id, episode_id, mcap_path, sidecar_path, cloud_synced + SELECT + id, + episode_id, + task_id, + mcap_path, + sidecar_path, + cloud_synced, + metadata, + workstation_id, + factory_id, + organization_id FROM episodes WHERE id = ? 
AND deleted_at IS NULL `, episodeID) @@ -760,55 +866,151 @@ func (w *SyncWorker) processEpisodeWithMode(ctx context.Context, episodeID int64 return } - if ep.CloudSynced { + if ep.CloudSynced && !resync { //logger.Printf("[SYNC-WORKER] Episode %d already synced, skipping", episodeID) return } - // Extract the MinIO object key from the stored path (strip bucket prefix) - mcapKey := stripBucketPrefix(ep.McapPath) + syncLogID, attemptCount, err := w.acquireSyncLogWithMode(ctx, episodeID, ep.McapPath, manual) + if err != nil { + //logger.Printf("[SYNC-WORKER] Failed to acquire sync log for episode %d: %v", episodeID, err) + return + } - if mcapKey == "" { - logger.Printf("[SYNC-WORKER] Episode %d has empty mcap_path, skipping", episodeID) + startTime := time.Now() + + result, err := w.uploadEpisodeDirect(ctx, ep) + if err != nil { + duration := int64(time.Since(startTime).Seconds()) + w.markSyncFailed(ctx, syncLogID, episodeID, duration, err, attemptCount) return } - // Build raw tags from sidecar JSON (best-effort: log and continue on failure). 
- rawTags := map[string]string{ - "episode_id": ep.EpisodeUUID, + // Success: update episode and sync_log + duration := int64(time.Since(startTime).Seconds()) + w.markSyncCompleted(ctx, syncLogID, episodeID, result, duration) +} + +func (w *SyncWorker) uploadEpisodeDirect(ctx context.Context, ep syncEpisodeUploadRow) (*cloud.UploadResult, error) { + mcapKey := stripBucketPrefix(ep.McapPath) + if mcapKey == "" { + return nil, newNonRetryableSyncError("episode %d has empty mcap_path", ep.ID) } - if sidecarTags, err := w.tagsFromSidecar(ctx, ep.SidecarPath); err != nil { - logger.Printf("[SYNC-WORKER] Episode %d: failed to read sidecar tags, uploading without them: %v", episodeID, err) - } else { - for k, v := range sidecarTags { - rawTags[k] = v - } + + assetID, err := resolveAssetIDForEpisode(ctx, w.db, ep.ID, ep.Metadata, ep.WorkstationID) + if err != nil { + return nil, wrapNonRetryableSyncError(err, "resolve asset_id for episode %d", ep.ID) } - // Reuse latest failed sync_log when retry is due, otherwise insert a new row. 
- syncLogID, attemptCount, err := w.acquireSyncLogWithMode(ctx, episodeID, ep.McapPath, manual) + if w.syncCfg == nil || strings.TrimSpace(w.syncCfg.DPConfigPath) == "" { + return nil, newNonRetryableSyncError("KEYSTONE_SYNC_DP_CONFIG is required for direct sync") + } + dpConfig, err := loadDPDeviceUploadConfig(w.syncCfg.DPConfigPath, assetID) if err != nil { - //logger.Printf("[SYNC-WORKER] Failed to acquire sync log for episode %d: %v", episodeID, err) - return + return nil, wrapNonRetryableSyncError(err, "load DP config for asset_id %s", assetID) } - startTime := time.Now() + sidecarTags, err := w.directTagsFromSidecar(ctx, ep.SidecarPath) + if err != nil { + return nil, err + } + + rawTags, err := buildDPDirectRawTags(dpRawTagsInput{ + Profile: dpConfig.Profile, + McapKey: mcapKey, + SidecarTags: sidecarTags, + EpisodeID: ep.ID, + EpisodePublicID: ep.EpisodeUUID, + TaskID: ep.TaskID, + FactoryID: ep.FactoryID, + OrganizationID: ep.OrganizationID, + }) + if err != nil { + return nil, wrapNonRetryableSyncError(err, "build raw tags for episode %d", ep.ID) + } + + uploader, cleanup, err := w.newDirectUploader(dpConfig) + if err != nil { + return nil, fmt.Errorf("create direct uploader for asset_id %s: %w", assetID, err) + } + defer cleanup() + + logger.Printf("[SYNC-WORKER] Episode %d direct sync config resolved: asset_id=%s auth=%s auth_tls=%t gateway=%s gateway_tls=%t", + ep.ID, assetID, dpConfig.Auth.Target, dpConfig.Auth.UseTLS, dpConfig.Gateway.Target, dpConfig.Gateway.UseTLS) - // Execute upload - result, err := w.uploader.Upload(ctx, cloud.UploadRequest{ + return uploader.Upload(ctx, cloud.UploadRequest{ EpisodeID: ep.EpisodeUUID, McapKey: mcapKey, + AssetID: assetID, RawTags: rawTags, }) +} + +func (w *SyncWorker) newDirectUploader(dpConfig *DPDeviceUploadConfig) (*cloud.Uploader, func(), error) { + if dpConfig == nil { + return nil, func() {}, fmt.Errorf("missing DP upload config") + } + authClient := cloud.NewAuthClient(cloud.AuthClientConfig{ + Endpoint: 
dpConfig.Auth.Target, + UseTLS: dpConfig.Auth.UseTLS, + TLSServerName: dpConfig.Auth.ServerName, + APIKey: dpConfig.Profile.APIKey, + RefreshBefore: 60 * time.Second, + }) + gatewayClient := cloud.NewGatewayClient(cloud.GatewayClientConfig{ + Endpoint: dpConfig.Gateway.Target, + UseTLS: dpConfig.Gateway.UseTLS, + TLSServerName: dpConfig.Gateway.ServerName, + RequestTimeout: w.syncRequestTimeout(), + }, authClient) + cleanup := func() { + if err := gatewayClient.Close(); err != nil { + logger.Printf("[SYNC-WORKER] Failed to close direct gateway client: %v", err) + } + if err := authClient.Close(); err != nil { + logger.Printf("[SYNC-WORKER] Failed to close direct auth client: %v", err) + } + } + + uploader, err := cloud.NewUploader(gatewayClient, w.minioClient, w.minioBucket, cloud.UploaderConfig{ + RequestTimeout: w.syncRequestTimeout(), + OSSTimeout: w.syncOSSTimeout(), + PersistRootDir: w.syncPersistRootDir(), + MaxRestartCount: uint32(w.syncMaxRestartCount()), //nolint:gosec // non-negative by helper + }) if err != nil { - duration := int64(time.Since(startTime).Seconds()) - w.markSyncFailed(ctx, syncLogID, episodeID, duration, err, attemptCount) - return + cleanup() + return nil, func() {}, err + } + return uploader, cleanup, nil +} + +func (w *SyncWorker) syncRequestTimeout() time.Duration { + if w.syncCfg != nil && w.syncCfg.RequestTimeoutSec > 0 { + return time.Duration(w.syncCfg.RequestTimeoutSec) * time.Second } + return 30 * time.Second +} - // Success: update episode and sync_log - duration := int64(time.Since(startTime).Seconds()) - w.markSyncCompleted(ctx, syncLogID, episodeID, result, duration) +func (w *SyncWorker) syncOSSTimeout() time.Duration { + if w.syncCfg != nil && w.syncCfg.OSSTimeoutSec > 0 { + return time.Duration(w.syncCfg.OSSTimeoutSec) * time.Second + } + return 300 * time.Second +} + +func (w *SyncWorker) syncPersistRootDir() string { + if w.syncCfg == nil { + return "" + } + return w.syncCfg.PersistRootDir +} + +func (w *SyncWorker) 
syncMaxRestartCount() int { + if w.syncCfg != nil && w.syncCfg.MaxRestartCount >= 0 { + return w.syncCfg.MaxRestartCount + } + return 3 } func (w *SyncWorker) acquireSyncLogWithMode(ctx context.Context, episodeID int64, sourcePath string, manual bool) (int64, int, error) { @@ -893,7 +1095,7 @@ func (w *SyncWorker) acquireSyncLogWithMode(ctx context.Context, episodeID int64 case "completed": return 0, 0, fmt.Errorf("episode %d already has completed sync_log", episodeID) case "failed": - retryDue := !latest.NextRetry.Valid || !latest.NextRetry.Time.After(now) + retryDue := latest.NextRetry.Valid && !latest.NextRetry.Time.After(now) if latest.AttemptCount < w.cfg.MaxRetries && retryDue { res, updErr := tx.ExecContext(ctx, ` UPDATE sync_logs @@ -924,6 +1126,9 @@ func (w *SyncWorker) acquireSyncLogWithMode(ctx context.Context, episodeID int64 return latest.ID, latest.AttemptCount + 1, nil } + if !manual && !latest.NextRetry.Valid { + return 0, 0, fmt.Errorf("%w for episode %d", errSyncNonRetryableFailed, episodeID) + } if !manual && latest.AttemptCount >= w.cfg.MaxRetries { return 0, 0, fmt.Errorf("max retries exceeded for episode %d", episodeID) } @@ -1006,8 +1211,11 @@ func (w *SyncWorker) markSyncFailed(ctx context.Context, syncLogID, episodeID, d now := time.Now().UTC() errMsg := uploadErr.Error() - backoff := w.nextRetryDelay(attemptCount) - nextRetry := now.Add(backoff) + var nextRetry sql.NullTime + if !isNonRetryableSyncError(uploadErr) { + backoff := w.nextRetryDelay(attemptCount) + nextRetry = sql.NullTime{Time: now.Add(backoff), Valid: true} + } if _, err := w.db.ExecContext(ctx, ` UPDATE sync_logs @@ -1021,8 +1229,13 @@ func (w *SyncWorker) markSyncFailed(ctx context.Context, syncLogID, episodeID, d logger.Printf("[SYNC-WORKER] Failed to update sync log %d as failed: %v", syncLogID, err) } - logger.Printf("[SYNC-WORKER] Episode %d sync failed: %v (attempt=%d, next_retry=%v)", - episodeID, uploadErr, attemptCount, nextRetry.Format(time.RFC3339)) + if 
nextRetry.Valid { + logger.Printf("[SYNC-WORKER] Episode %d sync failed: %v (attempt=%d, next_retry=%v)", + episodeID, uploadErr, attemptCount, nextRetry.Time.Format(time.RFC3339)) + return + } + logger.Printf("[SYNC-WORKER] Episode %d sync failed non-retryable: %v (attempt=%d)", + episodeID, uploadErr, attemptCount) } func (w *SyncWorker) nextRetryDelay(attemptCount int) time.Duration { @@ -1068,13 +1281,10 @@ func (w *SyncWorker) nextRetryDelay(attemptCount int) time.Duration { return time.Duration(totalSec * float64(time.Second)) } -// tagsFromSidecar reads the sidecar JSON from MinIO and returns it as a flat string map -// for use as RawTags. topics_summary is excluded. Returns nil map and an error if the -// sidecar path is empty, the object cannot be read, or the JSON is malformed. -func (w *SyncWorker) tagsFromSidecar(ctx context.Context, sidecarPath string) (map[string]string, error) { +func (w *SyncWorker) directTagsFromSidecar(ctx context.Context, sidecarPath string) (map[string]string, error) { key := stripBucketPrefix(sidecarPath) if key == "" { - return nil, fmt.Errorf("empty sidecar_path") + return nil, newNonRetryableSyncError("empty sidecar_path") } if w.minioClient == nil { return nil, fmt.Errorf("minio client not available") @@ -1095,7 +1305,7 @@ func (w *SyncWorker) tagsFromSidecar(ctx context.Context, sidecarPath string) (m tags, err := flattenSidecar(data) if err != nil { - return nil, fmt.Errorf("flatten sidecar %s: %w", key, err) + return nil, wrapNonRetryableSyncError(err, "flatten sidecar %s", key) } return tags, nil } diff --git a/internal/services/sync_worker_test.go b/internal/services/sync_worker_test.go index 02d3151..d5d26d6 100644 --- a/internal/services/sync_worker_test.go +++ b/internal/services/sync_worker_test.go @@ -5,11 +5,17 @@ package services import ( + "bytes" "context" + "database/sql" "errors" + "log" + "strings" "testing" "time" + "archebase.com/keystone-edge/internal/cloud" + 
"archebase.com/keystone-edge/internal/logger" "github.com/jmoiron/sqlx" _ "modernc.org/sqlite" ) @@ -129,6 +135,27 @@ func TestFindPendingEpisodes_ExcludesExhaustedFailuresFromPollingOnly(t *testing assertEpisodeIDs(t, pollIDs, []int64{1, 3}) } +func TestFindPendingEpisodes_SkipsNonRetryableFailuresFromPollingOnly(t *testing.T) { + db := newTestSyncWorkerDB(t) + w := &SyncWorker{db: db, cfg: SyncWorkerConfig{BatchSize: 10, MaxRetries: 3}} + + insertEpisodeForSyncWorkerTest(t, db, 5, "approved", false) + insertEpisodeForSyncWorkerTest(t, db, 6, "approved", false) + insertNonRetryableSyncLogForSyncWorkerTest(t, db, 6, "failed", 1) + + apiIDs, err := w.findPendingEpisodes(context.Background(), true) + if err != nil { + t.Fatalf("api pending query failed: %v", err) + } + assertEpisodeIDs(t, apiIDs, []int64{5, 6}) + + pollIDs, err := w.findPendingEpisodes(context.Background(), false) + if err != nil { + t.Fatalf("poll pending query failed: %v", err) + } + assertEpisodeIDs(t, pollIDs, []int64{5}) +} + func TestEnqueueEpisodeManual_AllowsExhaustedRetryEpisode(t *testing.T) { db := newTestSyncWorkerDB(t) w := &SyncWorker{ @@ -199,6 +226,68 @@ func TestEnqueueEpisodeManual_PromotesDueFailureToPending(t *testing.T) { } } +func TestEnqueueEpisodeResync_AllowsAlreadySyncedEpisode(t *testing.T) { + db := newTestSyncWorkerDB(t) + w := &SyncWorker{ + db: db, + cfg: SyncWorkerConfig{BatchSize: 10, MaxRetries: 3}, + enqueueCh: make(chan syncEnqueueRequest, 1), + enqueuedEpisode: make(map[int64]struct{}), + } + w.running.Store(true) + + insertEpisodeForSyncWorkerTest(t, db, 27, "approved", true) + insertSyncLogForSyncWorkerTest(t, db, 27, "completed", 1) + + if err := w.EnqueueEpisodeResync(context.Background(), 27); err != nil { + t.Fatalf("resync enqueue failed: %v", err) + } + + latest := latestSyncLogForSyncWorkerTest(t, db, 27) + if latest.Status != "pending" { + t.Fatalf("latest status = %q, want pending", latest.Status) + } + if count := countSyncLogsForSyncWorkerTest(t, db, 
27); count != 2 { + t.Fatalf("sync log count = %d, want completed history plus resync pending", count) + } + + select { + case got := <-w.enqueueCh: + if got.episodeID != 27 { + t.Fatalf("unexpected episode id: got %d want 27", got.episodeID) + } + if !got.manual || !got.resync { + t.Fatalf("enqueue flags = manual:%t resync:%t, want both true", got.manual, got.resync) + } + default: + t.Fatal("expected resync episode to be enqueued") + } +} + +func TestDispatchPendingSyncLogs_TreatsSyncedPendingRowsAsResync(t *testing.T) { + db := newTestSyncWorkerDB(t) + w := &SyncWorker{ + db: db, + cfg: SyncWorkerConfig{BatchSize: 10, MaxRetries: 3}, + jobCh: make(chan syncEnqueueRequest, 1), + enqueuedEpisode: make(map[int64]struct{}), + } + + insertEpisodeForSyncWorkerTest(t, db, 28, "approved", true) + insertSyncLogForSyncWorkerTest(t, db, 28, "pending", 0) + + w.dispatchPendingSyncLogs(context.Background()) + + select { + case got := <-w.jobCh: + if got.episodeID != 28 || !got.resync { + t.Fatalf("dispatched request = %+v, want episode 28 resync", got) + } + default: + t.Fatal("expected synced pending row to be dispatched as resync") + } +} + func TestEnqueueEpisode_RejectsInProgressEpisode(t *testing.T) { db := newTestSyncWorkerDB(t) w := &SyncWorker{ @@ -262,6 +351,35 @@ func TestEnqueueEpisodeManual_RejectsPendingEpisode(t *testing.T) { } } +func TestEnqueueEpisodeManual_AllowsNonRetryableFailure(t *testing.T) { + db := newTestSyncWorkerDB(t) + w := &SyncWorker{ + db: db, + cfg: SyncWorkerConfig{BatchSize: 10, MaxRetries: 3}, + enqueueCh: make(chan syncEnqueueRequest, 1), + enqueuedEpisode: make(map[int64]struct{}), + } + w.running.Store(true) + + insertEpisodeForSyncWorkerTest(t, db, 24, "approved", false) + insertNonRetryableSyncLogForSyncWorkerTest(t, db, 24, "failed", 1) + + if err := w.EnqueueEpisodeManual(context.Background(), 24); err != nil { + t.Fatalf("manual enqueue failed: %v", err) + } + + latest := latestSyncLogForSyncWorkerTest(t, db, 24) + if latest.Status 
!= "pending" { + t.Fatalf("latest status = %q, want pending", latest.Status) + } + if latest.AttemptCount != 0 { + t.Fatalf("latest attempt_count = %d, want fresh pending attempt count 0", latest.AttemptCount) + } + if count := countSyncLogsForSyncWorkerTest(t, db, 24); count != 2 { + t.Fatalf("sync log count = %d, want failed history plus fresh pending", count) + } +} + func TestEnqueuePendingEpisodes_PersistsPendingWhenMemoryQueueFull(t *testing.T) { db := newTestSyncWorkerDB(t) w := &SyncWorker{ @@ -448,6 +566,54 @@ func TestRetryFailedEpisodes_PromotesDueFailureToPendingBeforeDispatch(t *testin } } +func TestRetryFailedEpisodes_IgnoresMissingDeletedAndRetriesSyncedEpisodesAsResync(t *testing.T) { + db := newTestSyncWorkerDB(t) + w := &SyncWorker{ + db: db, + cfg: SyncWorkerConfig{BatchSize: 10, MaxRetries: 3}, + jobCh: make(chan syncEnqueueRequest, 2), + enqueuedEpisode: make(map[int64]struct{}), + } + + insertSyncLogForSyncWorkerTest(t, db, 2, "failed", 1) + insertEpisodeForSyncWorkerTest(t, db, 3, "approved", false) + insertSyncLogForSyncWorkerTest(t, db, 3, "failed", 1) + if _, err := db.Exec(`UPDATE episodes SET deleted_at = ? 
WHERE id = 3`, time.Now().UTC()); err != nil { + t.Fatalf("mark episode deleted: %v", err) + } + insertEpisodeForSyncWorkerTest(t, db, 4, "approved", true) + insertSyncLogForSyncWorkerTest(t, db, 4, "failed", 1) + insertEpisodeForSyncWorkerTest(t, db, 5, "approved", false) + insertSyncLogForSyncWorkerTest(t, db, 5, "failed", 1) + + var logs bytes.Buffer + previousLogger := logger.Get() + logger.Set(log.New(&logs, "", 0)) + t.Cleanup(func() { logger.Set(previousLogger) }) + + w.retryFailedEpisodes(context.Background()) + + if strings.Contains(logs.String(), "Failed to queue retry") { + t.Fatalf("unexpected retry queue failure log: %s", logs.String()) + } + + for _, episodeID := range []int64{4, 5} { + latest := latestSyncLogForSyncWorkerTest(t, db, episodeID) + if latest.Status != "pending" { + t.Fatalf("episode %d latest status = %q, want pending", episodeID, latest.Status) + } + } + + gotSynced := <-w.jobCh + if gotSynced.episodeID != 4 || !gotSynced.resync { + t.Fatalf("unexpected synced retry dispatch: got %+v want episode 4 resync", gotSynced) + } + gotUnsynced := <-w.jobCh + if gotUnsynced.episodeID != 5 || gotUnsynced.resync { + t.Fatalf("unexpected unsynced retry dispatch: got %+v want episode 5 non-resync", gotUnsynced) + } +} + func TestAcquireSyncLogWithMode_ClaimsFreshPendingRow(t *testing.T) { db := newTestSyncWorkerDB(t) w := &SyncWorker{ @@ -517,7 +683,7 @@ func TestProcessEnqueuedEpisode_HoldsMarkerUntilProcessingReturns(t *testing.T) w.processEnqueuedEpisodeWith( context.Background(), syncEnqueueRequest{episodeID: 77, manual: true}, - func(context.Context, int64, bool) { + func(context.Context, int64, bool, bool) { close(started) <-release }, @@ -616,6 +782,74 @@ func TestNextRetryDelay_IncludesBoundedJitter(t *testing.T) { } } +func TestMarkSyncFailed_NonRetryableClearsNextRetry(t *testing.T) { + db := newTestSyncWorkerDB(t) + w := &SyncWorker{ + db: db, + cfg: SyncWorkerConfig{RetryBaseSec: 30, RetryMaxSec: 1800}, + } + + 
insertEpisodeForSyncWorkerTest(t, db, 25, "approved", false) + insertSyncLogForSyncWorkerTest(t, db, 25, "in_progress", 1) + var syncLogID int64 + if err := db.Get(&syncLogID, "SELECT id FROM sync_logs WHERE episode_id = ?", 25); err != nil { + t.Fatalf("query sync log id: %v", err) + } + + w.markSyncFailed(context.Background(), syncLogID, 25, 0, newNonRetryableSyncError("asset_id missing"), 1) + + latest := latestSyncLogForSyncWorkerTest(t, db, 25) + if latest.Status != "failed" { + t.Fatalf("latest status = %q, want failed", latest.Status) + } + if latest.NextRetry.Valid { + t.Fatalf("next_retry_at valid = true, want NULL") + } +} + +func TestMarkSyncCompleted_WritesExistingCloudFields(t *testing.T) { + db := newTestSyncWorkerDB(t) + w := &SyncWorker{db: db} + + insertEpisodeForSyncWorkerTest(t, db, 26, "approved", false) + insertSyncLogForSyncWorkerTest(t, db, 26, "in_progress", 1) + var syncLogID int64 + if err := db.Get(&syncLogID, "SELECT id FROM sync_logs WHERE episode_id = ?", 26); err != nil { + t.Fatalf("query sync log id: %v", err) + } + + w.markSyncCompleted(context.Background(), syncLogID, 26, &cloud.UploadResult{ + LogicalUploadID: "logical-26", + UploadID: "upload-26", + ObjectKey: "cloud/object.mcap", + FileSize: 12345, + }, 3) + + var ep struct { + CloudSynced bool `db:"cloud_synced"` + CloudMcapPath string `db:"cloud_mcap_path"` + CloudProcessed bool `db:"cloud_processed"` + } + if err := db.Get(&ep, "SELECT cloud_synced, cloud_mcap_path, cloud_processed FROM episodes WHERE id = ?", 26); err != nil { + t.Fatalf("query episode cloud fields: %v", err) + } + if !ep.CloudSynced || ep.CloudMcapPath != "cloud/object.mcap" || ep.CloudProcessed { + t.Fatalf("episode cloud fields = %+v", ep) + } + + var logRow struct { + Status string `db:"status"` + DestinationPath string `db:"destination_path"` + BytesTransferred int64 `db:"bytes_transferred"` + } + if err := db.Get(&logRow, "SELECT status, destination_path, bytes_transferred FROM sync_logs WHERE id = 
?", syncLogID); err != nil { + t.Fatalf("query sync log completion fields: %v", err) + } + if logRow.Status != "completed" || logRow.DestinationPath != "cloud/object.mcap" || logRow.BytesTransferred != 12345 { + t.Fatalf("sync log completion fields = %+v", logRow) + } +} + func newTestSyncWorkerDB(t *testing.T) *sqlx.DB { t.Helper() @@ -629,6 +863,9 @@ func newTestSyncWorkerDB(t *testing.T) *sqlx.DB { id INTEGER PRIMARY KEY, qa_status TEXT NOT NULL, cloud_synced BOOLEAN NOT NULL DEFAULT 0, + cloud_synced_at TIMESTAMP NULL, + cloud_mcap_path TEXT, + cloud_processed BOOLEAN NOT NULL DEFAULT 0, deleted_at TIMESTAMP NULL, created_at TIMESTAMP NOT NULL )`, @@ -637,6 +874,8 @@ func newTestSyncWorkerDB(t *testing.T) *sqlx.DB { episode_id INTEGER NOT NULL, source_path TEXT, status TEXT NOT NULL, + destination_path TEXT, + bytes_transferred INTEGER, duration_sec INTEGER, error_message TEXT, attempt_count INTEGER NOT NULL DEFAULT 0, @@ -675,18 +914,35 @@ func insertEpisodeForSyncWorkerTest(t *testing.T, db *sqlx.DB, id int64, qaStatu func insertSyncLogForSyncWorkerTest(t *testing.T, db *sqlx.DB, episodeID int64, status string, attemptCount int) { t.Helper() + startedAt := time.Date(2026, 2, int(episodeID), 0, 0, 0, 0, time.UTC) + nextRetry := sql.NullTime{} + if status == "failed" { + nextRetry = sql.NullTime{Time: startedAt.Add(time.Second), Valid: true} + } + if _, err := db.Exec(` + INSERT INTO sync_logs (episode_id, status, attempt_count, started_at, next_retry_at) + VALUES (?, ?, ?, ?, ?) + `, episodeID, status, attemptCount, startedAt, nextRetry); err != nil { + t.Fatalf("insert sync log for episode %d: %v", episodeID, err) + } +} + +func insertNonRetryableSyncLogForSyncWorkerTest(t *testing.T, db *sqlx.DB, episodeID int64, status string, attemptCount int) { + t.Helper() + startedAt := time.Date(2026, 2, int(episodeID), 0, 0, 0, 0, time.UTC) if _, err := db.Exec(` - INSERT INTO sync_logs (episode_id, status, attempt_count, started_at) - VALUES (?, ?, ?, ?) 
+ INSERT INTO sync_logs (episode_id, status, attempt_count, started_at, next_retry_at) + VALUES (?, ?, ?, ?, NULL) `, episodeID, status, attemptCount, startedAt); err != nil { t.Fatalf("insert sync log for episode %d: %v", episodeID, err) } } type syncLogForSyncWorkerTest struct { - Status string `db:"status"` - AttemptCount int `db:"attempt_count"` + Status string `db:"status"` + AttemptCount int `db:"attempt_count"` + NextRetry sql.NullTime `db:"next_retry_at"` } func latestSyncLogForSyncWorkerTest(t *testing.T, db *sqlx.DB, episodeID int64) syncLogForSyncWorkerTest { @@ -694,7 +950,7 @@ func latestSyncLogForSyncWorkerTest(t *testing.T, db *sqlx.DB, episodeID int64) var row syncLogForSyncWorkerTest if err := db.Get(&row, ` - SELECT status, attempt_count + SELECT status, attempt_count, next_retry_at FROM sync_logs WHERE episode_id = ? ORDER BY id DESC diff --git a/internal/storage/database/migrations/000004_robot_asset_id.down.sql b/internal/storage/database/migrations/000004_robot_asset_id.down.sql new file mode 100644 index 0000000..c601e3d --- /dev/null +++ b/internal/storage/database/migrations/000004_robot_asset_id.down.sql @@ -0,0 +1,7 @@ +-- SPDX-FileCopyrightText: 2026 ArcheBase +-- +-- SPDX-License-Identifier: MulanPSL-2.0 + +ALTER TABLE robots + DROP INDEX idx_asset_active_unique, + DROP COLUMN _asset_unique; diff --git a/internal/storage/database/migrations/000004_robot_asset_id.up.sql b/internal/storage/database/migrations/000004_robot_asset_id.up.sql new file mode 100644 index 0000000..6bcf96c --- /dev/null +++ b/internal/storage/database/migrations/000004_robot_asset_id.up.sql @@ -0,0 +1,14 @@ +-- SPDX-FileCopyrightText: 2026 ArcheBase +-- +-- SPDX-License-Identifier: MulanPSL-2.0 + +ALTER TABLE robots + ADD COLUMN _asset_unique VARCHAR(100) + GENERATED ALWAYS AS ( + CASE + WHEN deleted_at IS NULL AND asset_id IS NOT NULL AND asset_id <> '' + THEN asset_id + ELSE NULL + END + ) STORED, + ADD UNIQUE INDEX idx_asset_active_unique (_asset_unique);