From 18a12c363ac0276dfe320bda8c760292af5e1a8b Mon Sep 17 00:00:00 2001 From: chaoliu Date: Tue, 2 Jun 2026 21:01:14 +0800 Subject: [PATCH 1/7] feat: support dp cli to run cloud sync --- cmd/keystone-edge/main.go | 26 +- docs/designs/cli-cloud-sync-sidepath.md | 491 ++++++++++ docs/designs/cli-cloud-sync-sidepath.zh.html | 833 ++++++++++++++++ docs/designs/data-quality-center-mvp.zh.html | 892 ++++++++++++++++++ .../data-quality-script-management.zh.html | 878 +++++++++++++++++ internal/api/handlers/sync.go | 192 +++- internal/config/config.go | 67 ++ internal/server/server.go | 20 +- internal/services/cli_sync_runner.go | 887 +++++++++++++++++ internal/services/sidecar_tags.go | 41 + internal/services/sidecar_tags_test.go | 39 + .../migrations/000004_cli_sync_runs.down.sql | 5 + .../migrations/000004_cli_sync_runs.up.sql | 29 + 13 files changed, 4392 insertions(+), 8 deletions(-) create mode 100644 docs/designs/cli-cloud-sync-sidepath.md create mode 100644 docs/designs/cli-cloud-sync-sidepath.zh.html create mode 100644 docs/designs/data-quality-center-mvp.zh.html create mode 100755 docs/designs/data-quality-script-management.zh.html create mode 100644 internal/services/cli_sync_runner.go create mode 100644 internal/storage/database/migrations/000004_cli_sync_runs.down.sql create mode 100644 internal/storage/database/migrations/000004_cli_sync_runs.up.sql diff --git a/cmd/keystone-edge/main.go b/cmd/keystone-edge/main.go index 7164bff..b376890 100644 --- a/cmd/keystone-edge/main.go +++ b/cmd/keystone-edge/main.go @@ -171,8 +171,32 @@ func main() { logger.Println("[SYNC] Cloud sync disabled (KEYSTONE_SYNC_ENABLED=false or missing endpoints)") } + var cliSyncRunner *services.CLISyncRunner + if cfg.CLISync.Enabled && s3Client != nil { + var err error + cliSyncRunner, err = services.NewCLISyncRunner(db.DB, s3Client, cfg.Storage.Bucket, services.CLISyncRunnerConfig{ + Enabled: cfg.CLISync.Enabled, + DPBin: cfg.CLISync.DPBin, + DPConfigPath: cfg.CLISync.DPConfigPath, + 
TempDir: cfg.CLISync.TempDir, + MaxConcurrent: cfg.CLISync.MaxConcurrent, + QueueSize: cfg.CLISync.QueueSize, + TimeoutSec: cfg.CLISync.TimeoutSec, + KeepTemp: cfg.CLISync.KeepTemp, + MaxTags: cfg.CLISync.MaxTags, + MaxTagBytes: cfg.CLISync.MaxTagBytes, + }) + if err != nil { + logger.Fatalf("[CLI-SYNC] Failed to initialise CLI sync runner: %v", err) + } + cliSyncRunner.Start() + logger.Printf("[CLI-SYNC] CLI sync runner started: dp=%s config=%s", cfg.CLISync.DPBin, cfg.CLISync.DPConfigPath) + } else if cfg.CLISync.Enabled { + logger.Println("[CLI-SYNC] CLI sync disabled because S3/MinIO is unavailable") + } + // Initialize and start HTTP server - srv := server.New(cfg, db.DB, s3Client, syncWorker) + srv := server.New(cfg, db.DB, s3Client, syncWorker, cliSyncRunner) if err := srv.Start(); err != nil { logger.Fatalf("[SERVER] Failed to start server: %v", err) } diff --git a/docs/designs/cli-cloud-sync-sidepath.md b/docs/designs/cli-cloud-sync-sidepath.md new file mode 100644 index 0000000..9bbe508 --- /dev/null +++ b/docs/designs/cli-cloud-sync-sidepath.md @@ -0,0 +1,491 @@ + + +# CLI Cloud Sync Sidepath Design + +## 1. Overview + +This document defines a sidepath for syncing one Keystone episode to cloud by +running the data-platform `dp` CLI from Keystone, while keeping the existing +`SyncWorker -> data-platform DataGateway` flow unchanged. + +The sidepath is intended for controlled operations and emergency recovery, not +as the default production upload path. + +Target flow: + +```text +Synapse "CLI sync to cloud" button + -> Keystone CLI sync API + -> Keystone CLI sync runner + -> download MCAP from Keystone MinIO to a temporary local file + -> read sidecar JSON and flatten scalar metadata into --tag arguments + -> dp --json data upload --tag ... 
+ -> record dp result + -> mark the episode cloud_synced on success +``` + +The existing cloud sync flow remains: + +```text +Synapse normal sync action + -> Keystone SyncWorker queue + -> Keystone Go uploader + -> data-platform DataGateway + -> cloud object storage +``` + +## 2. Goals + +- Add a Synapse action named `CLI sync to cloud` for a single episode. +- Keep the current `POST /api/v1/sync/episodes/:id` behavior unchanged. +- Keep the current `SyncWorker` queue, retry, backoff, and auto-scan behavior + unchanged. +- Upload the episode MCAP through `dp data upload`. +- Read the episode sidecar JSON and pass scalar metadata through + `dp data upload --tag`. Array fields are skipped in the first version so the + existing `dp` CLI does not need to change its comma-separated tag parser. +- Persist CLI run audit data, including `fileId`, `logicalUploadId`, `uploadId`, + `objectKey`, command duration, and sanitized error output. +- On successful CLI upload, update: + - `episodes.cloud_synced = TRUE` + - `episodes.cloud_synced_at` + - `episodes.cloud_mcap_path` + - `episodes.cloud_processed = FALSE` +- On successful CLI upload, append a normal `sync_logs.completed` row so the + existing Cloud Sync Center summary can show the episode as synced. + +## 3. Non-Goals + +- Do not replace `SyncWorker`. +- Do not make CLI sync the default action. +- Do not add batch CLI sync in the first version. +- Do not retry CLI sync automatically. +- Do not let the existing `SyncWorker` process CLI pending or failed states. +- Do not upload the sidecar JSON object through the CLI sidepath in the first + version. Its scalar content is still required as upload tags for the MCAP + object. +- Do not expose `dp` command output containing secrets to the browser. + +## 4. Recommended Architecture + +Use a separate `cli_sync_runs` table for pending, in-progress, and failed CLI +runs. 
This avoids putting CLI `pending` or `failed` rows into `sync_logs`, which +would otherwise be visible to the existing `SyncWorker` polling queries. + +Only after the CLI upload succeeds should Keystone append a `sync_logs` row with +`status = 'completed'`. That completed row is terminal and will not be retried +by the existing worker. + +```text +api request + -> insert cli_sync_runs(status='pending') + -> background runner claims run + -> cli_sync_runs(status='in_progress') + -> read sidecar JSON tags + -> run dp upload + -> success: + cli_sync_runs(status='completed', dp ids...) + sync_logs(status='completed', destination_path=objectKey...) + episodes.cloud_synced = TRUE + -> failure: + cli_sync_runs(status='failed', sanitized error...) + no sync_logs write + episodes unchanged +``` + +This keeps normal sync history authoritative while still allowing CLI success to +close the episode's cloud sync state. + +## 5. Backend API + +### 5.1 Trigger CLI Sync + +```http +POST /api/v1/sync/episodes/:id/cli +``` + +Request body: + +```json +{} +``` + +Response: + +```json +{ + "status": "accepted", + "episode_id": 123, + "run_id": 456, + "message": "episode accepted for CLI cloud sync" +} +``` + +Validation: + +| Check | Response | +|---|---| +| CLI sync feature disabled | `503 Service Unavailable` | +| invalid episode id | `400 Bad Request` | +| episode missing or deleted | `404 Not Found` | +| `qa_status` is not `approved` or `inspector_approved` | `400 Bad Request` | +| `cloud_synced = TRUE` | `409 Conflict` | +| latest normal sync log is `pending` or `in_progress` | `409 Conflict` | +| existing CLI run is `pending` or `in_progress` | `409 Conflict` | +| CLI runner queue is full | `429 Too Many Requests` | + +The endpoint must return after the run is queued. It must not hold the HTTP +request open for the entire upload. 
+ +### 5.2 Get Latest CLI Sync Run + +```http +GET /api/v1/sync/episodes/:id/cli/status +``` + +Response: + +```json +{ + "id": 456, + "episode_id": 123, + "status": "in_progress", + "file_id": null, + "logical_upload_id": null, + "upload_id": null, + "object_key": null, + "file_size": null, + "started_at": "2026-06-02T08:10:00Z", + "completed_at": null, + "error_message": null +} +``` + +The frontend uses this endpoint to show button state while the sidepath is +running. The normal sync summary remains sourced from `sync_logs`. + +## 6. Data Model + +### 6.1 New Table + +```sql +CREATE TABLE IF NOT EXISTS cli_sync_runs ( + id BIGINT AUTO_INCREMENT PRIMARY KEY, + episode_id BIGINT NOT NULL, + status ENUM('pending', 'in_progress', 'completed', 'failed') NOT NULL DEFAULT 'pending', + source_path VARCHAR(1024), + temp_path VARCHAR(1024), + dp_config_path VARCHAR(1024), + file_id VARCHAR(255), + logical_upload_id VARCHAR(255), + upload_id VARCHAR(255), + bucket VARCHAR(255), + object_key VARCHAR(1024), + file_size BIGINT, + oss_object_etag VARCHAR(255), + duration_sec INT, + error_message TEXT, + stdout_json JSON DEFAULT NULL, + started_at TIMESTAMP NULL, + completed_at TIMESTAMP NULL, + created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, + updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP, + INDEX idx_cli_sync_episode (episode_id), + INDEX idx_cli_sync_status (status), + INDEX idx_cli_sync_created (created_at) +) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4; +``` + +### 6.2 Why Not Store Pending CLI Runs In `sync_logs` + +The existing worker polls latest `sync_logs.status = 'pending'` rows and +retryable `failed` rows. If CLI pending or failed rows are written to +`sync_logs`, the normal worker can claim them and run the regular data-gateway +upload path. That would mix the two channels and violate this design's goal. + +For this reason: + +- `cli_sync_runs` owns CLI pending, in-progress, and failed states. 
+- `sync_logs` receives a completed row only after CLI upload succeeds. +- `episodes.cloud_synced` is updated only after CLI upload succeeds. + +### 6.3 Successful CLI Sync Log Row + +On success, insert: + +```sql +INSERT INTO sync_logs ( + episode_id, + source_path, + destination_path, + status, + bytes_transferred, + duration_sec, + attempt_count, + started_at, + completed_at +) VALUES (?, ?, ?, 'completed', ?, ?, 1, ?, ?); +``` + +Use `destination_path = dp.objectKey`. Store `dp.fileId` and +`dp.logicalUploadId` in `cli_sync_runs`. + +## 7. CLI Runner + +### 7.1 Command Construction + +The runner must call `dp` without a shell: + +```text +exec.CommandContext(ctx, dpBin, + "--config", dpConfigPath, + "--json", + "data", "upload", tempFile, + "--device", "<robot device id>", + "--tag", "episode_id=<episode public id>", + "--tag", "keystone_episode_id=<numeric id>", + "--tag", "device_id=<robot device id>", + "--tag", "sync_channel=keystone_cli", + "--tag", "<flattened sidecar key=value>", + "--hint", "source=keystone_cli_sync", +) +``` + +Do not build a single shell command string. +The device id is resolved from the episode workstation robot +(`robots.device_id`, falling back to `workstations.robot_serial`). The selected +`dp` config must contain a matching initialized device profile in `devices[]`. 
+ +### 7.2 Tags + +Required tags: + +| Tag | Value | +|---|---| +| `episode_id` | `episodes.episode_id` | +| `keystone_episode_id` | numeric `episodes.id` | +| `device_id` | `robots.device_id` resolved through the episode workstation | +| `sync_channel` | `keystone_cli` | + +Required sidecar-derived tags: + +| Source | Handling | +|---|---| +| sidecar JSON scalar fields | Flatten to string key/value pairs and pass as repeated `--tag key=value` arguments | +| sidecar JSON arrays | Skip in the first version | +| `topics_summary` | Exclude, matching the existing worker's filtering intent | +| nested objects | Flatten with dot notation | + +Recommended tags: + +| Tag | Value | +|---|---| +| `task_id` | `episodes.task_id`, when available | +| `factory_id` | `episodes.factory_id`, when available | +| `organization_id` | `episodes.organization_id`, when available | + +The CLI sidepath uploads only the MCAP file body, but sidecar JSON metadata is +not optional. Scalar sidecar fields must be included as tags; array fields are +left out for the first version. If `sidecar_path` is missing, unreadable, or +malformed, the CLI run should fail before invoking `dp`. This is stricter than +the current worker's best-effort sidecar handling and prevents cloud objects +from being created without the metadata required for filtering. + +The implementation must enforce a max tag count and max tag size so the CLI +command line cannot exceed OS limits. + +### 7.3 Temporary File Handling + +The runner downloads the MCAP from Keystone MinIO to a temporary file before +calling `dp`. + +Requirements: + +- Use a dedicated directory such as `/var/lib/keystone/cli-sync`. +- Create temporary files with mode `0600`. +- Delete the temporary file after success or failure unless + `KEYSTONE_CLI_SYNC_KEEP_TEMP=true`. +- Refuse to start if the temp directory is not writable. +- Check free disk space before download when a disk watermark helper is + available. 
+ +### 7.4 JSON Output Parsing + +Expected `dp --json data upload` fields: + +```json +{ + "logicalUploadId": "logical-1", + "fileId": "file-1", + "bucket": "bucket-a", + "objectKey": "objects/file-1.mcap", + "fileSize": 123456789, + "ossObjectEtag": "etag", + "identity": "api-key", + "deviceId": null +} +``` + +The runner must validate that `fileId`, `logicalUploadId`, `objectKey`, and +`fileSize` are present before marking the run completed. + +## 8. Configuration + +Add a separate config group rather than reusing `SyncConfig`. + +| Environment variable | Default | Description | +|---|---|---| +| `KEYSTONE_CLI_SYNC_ENABLED` | `false` | Enables the sidepath API and runner | +| `KEYSTONE_CLI_SYNC_DP_BIN` | `dp` | Path or binary name for the data-platform CLI | +| `KEYSTONE_CLI_SYNC_DP_CONFIG` | empty | SDK config JSON path passed to `dp --config` | +| `KEYSTONE_CLI_SYNC_TEMP_DIR` | `/var/lib/keystone/cli-sync` | Temporary MCAP staging directory | +| `KEYSTONE_CLI_SYNC_MAX_CONCURRENT` | `1` | Max concurrent CLI uploads | +| `KEYSTONE_CLI_SYNC_QUEUE_SIZE` | `16` | Max queued CLI runs | +| `KEYSTONE_CLI_SYNC_TIMEOUT_SEC` | `7200` | Per-run timeout | +| `KEYSTONE_CLI_SYNC_KEEP_TEMP` | `false` | Keeps staged files for debugging | +| `KEYSTONE_CLI_SYNC_MAX_TAGS` | `128` | Max tags passed to CLI | +| `KEYSTONE_CLI_SYNC_MAX_TAG_BYTES` | `65536` | Max total encoded tag bytes | + +Startup validation when enabled: + +- `dp` binary exists and is executable. +- `KEYSTONE_CLI_SYNC_DP_CONFIG` is set and readable. +- Temp directory exists or can be created. +- Temp directory is writable. + +## 9. Frontend Behavior + +### 9.1 Cloud Sync Center + +Add a row action next to existing `Retry` and `History` actions: + +```text +CLI sync to cloud +``` + +Show it only when the feature flag from config/status says CLI sync is enabled. 
+ +Disable it when: + +- the row status is `pending` or `in_progress`; +- the row status is `completed`; +- the episode has an active CLI run; +- a row action is already running; +- the user does not have admin permission. + +After clicking: + +1. Call `POST /api/v1/sync/episodes/:id/cli`. +2. Show the row as `CLI queued` or `CLI syncing` using the CLI status endpoint. +3. Poll `GET /api/v1/sync/episodes/:id/cli/status`. +4. On CLI completion, refresh normal sync summaries. +5. On CLI failure, keep the normal sync row unchanged and show the sanitized CLI + error. + +### 9.2 Episode Detail + +Add the same action for approved, unsynced episodes. This is important because +an approved unsynced episode may not yet have any `sync_logs` row and therefore +may not appear in the Cloud Sync Center table. + +## 10. Security + +- The trigger API must require admin authorization. +- `dp` must be launched through `exec.CommandContext`, never through a shell. +- Do not pass API keys on the command line. +- Store credentials only in the `dp` config file with restrictive permissions. +- Redact stdout, stderr, paths, and errors before returning anything to the + frontend. +- Do not log full `dp` config contents. +- Do not log temporary object storage credentials or presigned URLs. +- Limit concurrent CLI runs to protect Keystone CPU, disk, and network. + +## 11. Concurrency And Races + +Keystone should prevent multiple active CLI runs for the same episode by checking +`cli_sync_runs.status IN ('pending', 'in_progress')` inside a transaction. + +Before marking success, lock the `episodes` row and re-check `cloud_synced`. 
+ +If the normal SyncWorker synced the episode while the CLI run was uploading: + +- mark the CLI run as completed with its `dp` result; +- do not overwrite `episodes.cloud_mcap_path`; +- do not insert a second `sync_logs.completed` row unless product explicitly + wants duplicate completed history; +- include a `duplicate_after_upload` marker in `cli_sync_runs.stdout_json` or a + dedicated metadata field if one is added later. + +Residual risk: if `dp` upload succeeds but Keystone crashes before recording the +result, a later manual CLI retry can upload a duplicate object. This is accepted +for the sidepath's emergency-use scope. A future implementation can reduce this +by adding a data-platform idempotency key or a server-side upload lookup by +`episode_id`. + +## 12. Rollout Plan + +1. Add `cli_sync_runs` migration and model helpers. +2. Add CLI sync config with default disabled. +3. Add the backend runner with a fake `dp` executable test fixture. +4. Add `POST /sync/episodes/:id/cli` and latest status endpoint. +5. Add Synapse API wrapper methods. +6. Add Episode Detail button. +7. Add Cloud Sync Center row button and CLI status overlay. +8. Enable only in a staging environment. +9. Run one approved small MCAP through CLI sync and verify: + - data-platform object is visible; + - expected sidecar JSON scalar fields are visible as data-platform raw tags; + - `cli_sync_runs` contains `fileId` and `logicalUploadId`; + - `sync_logs` has a completed row; + - `episodes.cloud_synced = TRUE`; + - normal SyncWorker does not retry the episode. + +## 13. 
Test Plan + +Backend unit tests: + +- rejects disabled feature; +- rejects non-approved episodes; +- rejects already cloud-synced episodes; +- rejects active normal sync rows; +- rejects active CLI runs; +- fails when sidecar JSON is missing, unreadable, or malformed; +- passes flattened sidecar JSON scalar fields as repeated `--tag` arguments; +- builds `dp` argv without a shell; +- parses valid `dp --json` output; +- rejects missing `fileId`, `logicalUploadId`, `objectKey`, or `fileSize`; +- redacts stderr before API response; +- records failed CLI runs without writing `sync_logs`; +- records successful CLI runs and inserts one completed `sync_logs` row. + +Backend integration tests: + +- fake MinIO object is staged to temp file; +- fake `dp` executable receives the expected args; +- temp file is deleted after success and failure; +- success updates `episodes.cloud_synced`; +- normal sync summary sees the completed row after success. + +Frontend tests: + +- button is hidden when CLI sync config is disabled; +- button is disabled for completed, pending, and in-progress rows; +- click calls `triggerEpisodeCli`; +- active CLI status changes row action text; +- completed CLI run refreshes normal summaries; +- failed CLI run shows sanitized error and leaves normal row state unchanged. + +## 14. Open Questions + +- Should CLI failures appear in the Cloud Sync Center main table, or only as a + per-episode CLI status/badge? +- Should a successful CLI sync always append `sync_logs.completed`, even when + the latest normal row is already completed by a race? +- Does data-platform need an explicit idempotency key for `dp data upload` so + crash-after-upload can be recovered without duplicate objects? +- Should the `dp` config use a site API key or a device profile for the Keystone + edge site? 
diff --git a/docs/designs/cli-cloud-sync-sidepath.zh.html b/docs/designs/cli-cloud-sync-sidepath.zh.html new file mode 100644 index 0000000..9ac7b3f --- /dev/null +++ b/docs/designs/cli-cloud-sync-sidepath.zh.html @@ -0,0 +1,833 @@ + + + + + + + CLI 同步到云旁路设计 + + + +
+
+
+

Keystone / Synapse Design

+

CLI 同步到云旁路设计

+

在不改动现有 Keystone 云同步主链路的前提下,新增一个由 Synapse 触发、Keystone 后台执行 dp data upload 的单片段应急同步入口。首版只上传 MCAP 文件本体,但必须读取 sidecar JSON,并把其中标量元数据作为 --tag 传给 data-platform;数组字段先跳过,dp 本身不需要改。CLI 上传成功后回写 Keystone 云同步状态,并保留 data-platform 返回的审计 ID。

+
+ 方案 2:成功后回写 episode + 现有 SyncWorker 不变 + 默认关闭,按环境启用 +
+
+
+ 文档状态 +
用途:实现设计 / 评审
+
范围:Keystone 后端、Synapse 前端;data-platform CLI 只作为外部命令调用
+
日期:2026-06-02
+
+
+ + + +
+

1. 设计结论

+

推荐新增独立的 CLI 同步旁路,而不是让前端直接调用 CLI,也不是把 CLI pending/failed 状态写入现有 sync_logs。核心原则是:正常同步继续归 SyncWorker 管,CLI 同步只作为手动应急通道。

+ +
+
+ 主链路不动 +

POST /sync/episodes/:id、自动扫描、重试和 backoff 都保持现状。

+
+
+ 旁路独立记账 +

用新表 cli_sync_runs 记录 CLI 的 pending、in_progress、failed 和 completed。

+
+
+ 成功后闭环 +

CLI 成功后写 episodes.cloud_synced,并追加一条 sync_logs.completed

+
+
+ +
+ 关键约束:不要把 CLI 的 pending 或 failed 行写进 sync_logs。现有 worker 会扫描最新 pending 和可重试 failed 行,如果 CLI 行进入这张表,可能被正常同步 worker 误认领。 +
+
+ +
+

2. 目标流程

+
+
+

新增 CLI 旁路

+
+
Synapse 按钮管理员点击「CLI 同步到云」。
+
Keystone API创建 cli_sync_runs.pending 并返回 202 Accepted
+
Keystone Runner从 MinIO 下载 MCAP 到临时文件,并读取 sidecar JSON。
+
dp CLI执行 dp --json data upload,将 sidecar 标量元数据作为重复 --tag 参数传入。
+
状态回写写入 CLI 审计数据、sync_logs.completed 和 episode 云同步字段。
+
+
+ +
+

现有正常同步

+
+
Synapse 正常同步仍调用 POST /api/v1/sync/episodes/:id
+
SyncWorker 队列负责 pending、in_progress、failed 和重试。
+
Go Uploader通过 data-platform DataGateway 和 OSS 上传。
+
完成态更新 sync_logs 和 episodes。
+
+
+ 两条链路最后都可以把 episode 标记为已同步,但只有正常链路参与自动发现和自动重试。 +
+
+
+
+ +
+

3. 目标与非目标

+
+
+

目标

+
    +
  • 增加单 episode 的「CLI 同步到云」动作。
  • +
  • 上传 MCAP 文件到 data-platform 云端对象存储。
  • +
  • 读取 sidecar JSON,并把标量字段作为 dp data upload --tag 传递;数组字段首版先跳过。
  • +
  • 保存 fileId、logicalUploadId、uploadId、objectKey 等审计信息。
  • +
  • 成功后更新 episodes.cloud_synced、cloud_synced_at、cloud_mcap_path 和 cloud_processed。
  • +
  • 成功后插入一条 sync_logs.completed,让现有 Cloud Sync Center 能看到完成态。
  • +
+
+
+

非目标

+
    +
  • 不替换 SyncWorker
  • +
  • 不提供批量 CLI 同步。
  • +
  • 不自动重试 CLI 失败任务。
  • +
  • 不把 CLI 失败任务混入正常同步主表。
  • +
  • 首版不上传 sidecar JSON 文件本体;但它的标量内容必须作为 MCAP 上传 tags。
  • +
  • 不把包含敏感信息的 CLI 输出返回给浏览器。
  • +
+
+
+
+ +
+

4. 后端接口

+

4.1 触发 CLI 同步

+
POST /api/v1/sync/episodes/:id/cli
+

请求体为空对象({})即可。成功响应示例:

+
{
+  "status": "accepted",
+  "episode_id": 123,
+  "run_id": 456,
+  "message": "episode accepted for CLI cloud sync"
+}
+ + + + + + + + + + + + + + + +
校验项失败响应
CLI 同步功能未启用503 Service Unavailable
episode id 非法400 Bad Request
episode 不存在或已删除404 Not Found
qa_status 不是 approvedinspector_approved400 Bad Request
cloud_synced = TRUE409 Conflict
正常同步最新状态为 pendingin_progress409 Conflict
已有 CLI run 为 pendingin_progress409 Conflict
CLI runner 队列已满429 Too Many Requests
+ +

4.2 查询最新 CLI 状态

+
GET /api/v1/sync/episodes/:id/cli/status
+
{
+  "id": 456,
+  "episode_id": 123,
+  "status": "in_progress",
+  "file_id": null,
+  "logical_upload_id": null,
+  "upload_id": null,
+  "object_key": null,
+  "file_size": null,
+  "started_at": "2026-06-02T08:10:00Z",
+  "completed_at": null,
+  "error_message": null
+}
+
+ +
+

5. 数据模型

+

新增 cli_sync_runs,专门承载 CLI 旁路生命周期。正常同步的 sync_logs 只在 CLI 成功后接收一条 completed 审计行。

+ +

5.1 CLI run 表

+
CREATE TABLE IF NOT EXISTS cli_sync_runs (
+    id BIGINT AUTO_INCREMENT PRIMARY KEY,
+    episode_id BIGINT NOT NULL,
+    status ENUM('pending', 'in_progress', 'completed', 'failed') NOT NULL DEFAULT 'pending',
+    source_path VARCHAR(1024),
+    temp_path VARCHAR(1024),
+    dp_config_path VARCHAR(1024),
+    file_id VARCHAR(255),
+    logical_upload_id VARCHAR(255),
+    upload_id VARCHAR(255),
+    bucket VARCHAR(255),
+    object_key VARCHAR(1024),
+    file_size BIGINT,
+    oss_object_etag VARCHAR(255),
+    duration_sec INT,
+    error_message TEXT,
+    stdout_json JSON DEFAULT NULL,
+    started_at TIMESTAMP NULL,
+    completed_at TIMESTAMP NULL,
+    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
+    updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
+    INDEX idx_cli_sync_episode (episode_id),
+    INDEX idx_cli_sync_status (status),
+    INDEX idx_cli_sync_created (created_at)
+) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
+ +

5.2 成功后的 normal sync log

+
INSERT INTO sync_logs (
+    episode_id,
+    source_path,
+    destination_path,
+    status,
+    bytes_transferred,
+    duration_sec,
+    attempt_count,
+    started_at,
+    completed_at
+) VALUES (?, ?, ?, 'completed', ?, ?, 1, ?, ?);
+ +
+
1. pending:API 接受请求,写入 CLI 独立表。
+
2. in_progress:runner 已 claim,正在 staging 或上传。
+
3A. failed:只更新 CLI 表,episode 不变。
+
3B. completed:CLI 表记录 dp 返回 ID。
+
4. synced:写入 sync_logs.completed 并更新 episode 云同步字段。
+
+
+ +
+

6. CLI Runner

+

6.1 命令构造

+

必须使用 exec.CommandContext 参数数组调用,不能拼 shell 字符串。

+
exec.CommandContext(ctx, dpBin,
+  "--config", dpConfigPath,
+  "--json",
+  "data", "upload", tempFile,
+  "--device", "<robot device id>",
+  "--tag", "episode_id=<episode public id>",
+  "--tag", "keystone_episode_id=<numeric id>",
+  "--tag", "device_id=<robot device id>",
+  "--tag", "sync_channel=keystone_cli",
+  "--tag", "<flattened sidecar key=value>",
+  "--hint", "source=keystone_cli_sync",
+)
+

设备 ID 通过 episode 对应工位的机器人解析,优先使用 robots.device_id,回退到 workstations.robot_serial。所选 dp config 的 devices[] 中必须已有这个 device profile。

+ +

6.2 标签

+ + + + + + + + + + + + + + + + +
标签来源要求
episode_idepisodes.episode_id必填
keystone_episode_idepisodes.id必填
device_idepisode 工位对应的 robots.device_id必填,同时作为 --device 参数
sync_channel固定 keystone_cli必填
sidecar JSON 标量字段episodes.sidecar_path 指向的 JSON必填,扁平化后作为重复 --tag
sidecar JSON 数组字段例如 topic 列表、skills首版跳过,不传给 CLI
topics_summarysidecar JSON排除,避免 tag 过大
task_idepisodes.task_id可选
factory_idepisodes.factory_id可选
organization_idepisodes.organization_id可选
+
+ CLI 首版只上传 MCAP 对象,不上传 sidecar JSON 对象。但 sidecar JSON 元数据不是可选项:标量字段必须作为 tag 传入,数组字段首版先跳过;如果 sidecar_path 缺失、对象读不到或 JSON 解析失败,本次 CLI run 应在调用 dp 前失败,避免云端产生缺少关键过滤标签的对象。 +
+

sidecar 字段扁平化应复用现有同步 worker 的意图:普通字段转成字符串 key/value,嵌套对象用点号展开;数组字段首版跳过。同时必须受 KEYSTONE_CLI_SYNC_MAX_TAGS 和 KEYSTONE_CLI_SYNC_MAX_TAG_BYTES 限制。

+ +

6.3 临时文件

+
    +
  • 默认目录:/var/lib/keystone/cli-sync
  • +
  • 临时文件权限:0600
  • +
  • 成功或失败后删除临时文件。
  • +
  • KEYSTONE_CLI_SYNC_KEEP_TEMP=true 时保留临时文件用于排障。
  • +
  • 启动时校验目录可写。
  • +
  • 可用时检查磁盘水位。
  • +
+ +

6.4 dp JSON 输出

+
{
+  "logicalUploadId": "logical-1",
+  "fileId": "file-1",
+  "bucket": "bucket-a",
+  "objectKey": "objects/file-1.mcap",
+  "fileSize": 123456789,
+  "ossObjectEtag": "etag",
+  "identity": "api-key",
+  "deviceId": null
+}
+

标记成功前必须校验 fileId、logicalUploadId、objectKey 和 fileSize 非空且合法。

+
+ +
+

7. 配置

+ + + + + + + + + + + + + + + + +
环境变量默认值说明
KEYSTONE_CLI_SYNC_ENABLEDfalse启用旁路 API 和 runner。
KEYSTONE_CLI_SYNC_DP_BINdpdata-platform CLI 二进制路径或名称。
KEYSTONE_CLI_SYNC_DP_CONFIG传给 dp --config 的 SDK 配置文件。
KEYSTONE_CLI_SYNC_TEMP_DIR/var/lib/keystone/cli-syncMCAP staging 目录。
KEYSTONE_CLI_SYNC_MAX_CONCURRENT1最大并发 CLI 上传数。
KEYSTONE_CLI_SYNC_QUEUE_SIZE16最大排队 run 数。
KEYSTONE_CLI_SYNC_TIMEOUT_SEC7200单次 CLI run 超时时间。
KEYSTONE_CLI_SYNC_KEEP_TEMPfalse是否保留临时文件。
KEYSTONE_CLI_SYNC_MAX_TAGS128传给 CLI 的最大 tag 数。
KEYSTONE_CLI_SYNC_MAX_TAG_BYTES65536编码后 tag 总字节上限。
+
+ 启用时启动校验:dp 可执行、dp 配置文件可读、临时目录可创建且可写。 +
+
+ +
+

8. 前端交互

+

8.1 Cloud Sync Center

+

在现有「重试」「历史」旁边增加一个行级动作:

+
CLI 同步到云
+

仅在后端配置显示 CLI sync enabled 时展示。以下情况禁用:

+
    +
  • 正常同步状态是 pending 或 in_progress。
  • +
  • 正常同步状态是 completed
  • +
  • 当前 episode 已有 active CLI run。
  • +
  • 当前行已有操作在提交。
  • +
  • 当前用户不是 admin。
  • +
+ +

8.2 Episode Detail

+

Episode 详情页也需要同一个动作。原因是 approved 但还没有任何 sync_logs 的 episode 可能不会出现在 Cloud Sync Center 列表里。

+ +

8.3 状态展示

+ + + + + + + + + + +
CLI 状态按钮文案页面行为
pendingCLI 已入队轮询 CLI status。
in_progressCLI 同步中禁用重复点击。
completedCLI 已完成刷新正常同步 summary。
failedCLI 同步失败显示脱敏错误,正常同步行不变。
+
+ +
+

9. 安全、并发与竞态

+
+
+

安全要求

+
    +
  • 触发 API 必须要求 admin 权限。
  • +
  • 只能使用 exec.CommandContext 调用 CLI。
  • +
  • 不要把 API key 放到命令行参数。
  • +
  • dp 凭证只放在权限受控的 config 文件中。
  • +
  • 返回前端的 stdout、stderr 和错误信息必须脱敏。
  • +
  • 不要记录完整 dp config、临时凭证或 presigned URL。
  • +
+
+
+

并发策略

+
    +
  • 同一 episode 同时只允许一个 active CLI run。
  • +
  • 创建 run 时在事务里检查 active normal sync 和 active CLI run。
  • +
  • 完成前锁定 episodes 行并重新检查 cloud_synced
  • +
  • 默认并发为 1,避免占满 Keystone 磁盘、CPU 和网络。
  • +
+
+
+ +
+ 如果 CLI 上传成功后 Keystone 在落库前崩溃,后续人工重试可能产生重复云端对象。首版接受这个应急通道风险,后续可通过 data-platform 上传 idempotency key 或按 episode_id 查询已上传对象来降低风险。 +
+ +

正常 worker 与 CLI 同时完成

+

如果正常 SyncWorker 在 CLI 上传期间已经把 episode 同步完成,CLI runner 完成落库时应:

+
    +
  • 将 CLI run 标记为 completed,并保留 dp 返回的审计信息。
  • +
  • 不覆盖 episodes.cloud_mcap_path
  • +
  • 默认不插入第二条 sync_logs.completed,除非产品明确需要重复完成历史。
  • +
+
+ +
+

10. 落地计划与验收

+

10.1 实施顺序

+
    +
  1. 新增 cli_sync_runs migration 和 repository helper。
  2. +
  3. 新增 CLI sync config,默认关闭。
  4. +
  5. 实现 backend runner,并用 fake dp 可执行文件做测试。
  6. +
  7. 新增 POST /sync/episodes/:id/cli 和 CLI status endpoint。
  8. +
  9. 新增 Synapse API wrapper。
  10. +
  11. Episode Detail 增加按钮。
  12. +
  13. Cloud Sync Center 增加行按钮和 CLI 状态展示。
  14. +
  15. 只在 staging 环境开启。
  16. +
+ +

10.2 验收标准

+
    +
  • 一个 approved 小 MCAP 可以通过 CLI 同步到云。
  • +
  • data-platform 对象列表可见该文件。
  • +
  • 预期 sidecar JSON 标量字段可在 data-platform raw tags 中看到。
  • +
  • cli_sync_runs 记录 fileId 和 logicalUploadId。
  • +
  • sync_logs 出现一条 completed 行。
  • +
  • episodes.cloud_synced = TRUE
  • +
  • 正常 SyncWorker 不会再次处理该 episode。
  • +
+ +

10.3 必测用例

+
    +
  • 功能关闭时拒绝请求。
  • +
  • 非 approved episode 被拒绝。
  • +
  • 已 cloud_synced episode 被拒绝。
  • +
  • active normal sync 行被拒绝。
  • +
  • active CLI run 被拒绝。
  • +
  • sidecar JSON 缺失、不可读或格式错误时 run 失败。
  • +
  • sidecar JSON 标量字段会作为重复 --tag 参数传给 dp,数组字段首版跳过。
  • +
  • dp argv 不经过 shell。
  • +
  • 解析合法 dp --json 输出。
  • +
  • 缺少关键字段时标记 failed。
  • +
  • 失败不写 sync_logs
  • +
  • 成功更新 episode 和 completed sync log。
  • +
  • 临时文件成功和失败后都会清理。
  • +
  • 前端失败提示只显示脱敏错误。
  • +
+
+ +
+

11. 待确认问题

+
    +
  • CLI 失败是否需要进入 Cloud Sync Center 主表,还是只在 episode 详情/CLI badge 展示?
  • +
  • 正常 worker 已经完成时,CLI completed 是否需要单独追加到 sync_logs 历史?
  • +
  • dp data upload 是否需要 data-platform 支持显式 idempotency key?
  • +
  • dp config 应使用 site API key,还是为 Keystone edge site 建一个 device profile?
  • +
+
+
+ + diff --git a/docs/designs/data-quality-center-mvp.zh.html b/docs/designs/data-quality-center-mvp.zh.html new file mode 100644 index 0000000..673d326 --- /dev/null +++ b/docs/designs/data-quality-center-mvp.zh.html @@ -0,0 +1,892 @@ + + + + + + + 数据质检中心简化版设计 + + + +
+
+
+

Synapse / Keystone MVP

+

数据质检中心简化版设计

+

+ 简化版仍然使用 Python 脚本做质检,但首发只提供一个系统内置固定脚本,用来做 MCAP 预览可用性 smoke check。管理员在 Synapse 的数据运维模块进入“质检中心”,即可对全部 Episode 或筛选结果发起质检。 +

+
+ +
+ + + +
+

1. 设计取舍

+
+

+ 完整版的“脚本管理 + 版本 + run + job + override + 独立 Runner”能力适合长期演进,但首发上线成本偏高。简化版保留脚本执行能力,把管理面压缩成“一个内置固定脚本 + 多个质检任务”。 +

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
能力完整版简化版
脚本数量多个脚本,支持 global / sop 范围只提供一个内置脚本 builtin_mcap_preview_smoke_check,覆盖所有数据
版本管理脚本定义和不可变版本分表不做上传和版本管理;job 快照保存内置脚本 key、version、SHA
执行器独立 keystone-quality-runner 进程Keystone 内置轻量 worker,默认并发 1
触发方式Episode 创建自动触发,支持重跑支持新数据自动入队,也支持 Synapse 一键全量/筛选触发
人工覆盖独立 quality_overrides复用现有 inspections 表和 episodes QA 字段
+

+ 这个方案的关键约束是“首发只解决预览入口的可读性问题”,包括对象 size / range read 异常和 MCAP 边界 magic 异常。如果后续需要按 SOP、机器人类型或场景配置不同脚本,再升级到完整版的脚本版本模型。 +

+
+
+ +
+

2. MVP 范围

+
+
+

包含

+
    +
  • Synapse 管理后台新增 数据运维 / 质检中心
  • +
  • Keystone 内置固定 Python 脚本 builtin_mcap_preview_smoke_check,全局适用于所有 Episode。
  • +
  • 内置脚本先检查对象 size / range read 是否可用,再检查 MCAP 边界 magic。
  • +
  • 脚本元数据以代码常量形式提供,执行批次和执行任务存 MySQL。
  • +
  • 支持对全部非删除 Episode 发起质检。
  • +
  • 支持按 QA 状态、创建时间、设备 ID、采集员工号筛选后发起质检。
  • +
  • 支持新创建 Episode 自动进入质检队列,开关可配置。
  • +
  • 脚本结果回写 episodes.qa_status、qa_score、quality_flag。
  • +
  • 执行异常、超时、非法输出统一进入 needs_inspection
  • +
  • Episode 详情支持查看最近一次脚本结果、手动重跑、人工通过/驳回。
  • +
+
+
+

不包含

+
    +
  • 不做多脚本并行规则。
  • +
  • 不做 SOP / 场景 / 机器人类型范围匹配。
  • +
  • 不做在线代码编辑器。
  • +
  • 不做脚本上传、替换、ZIP、多文件包、Git 脚本源或动态安装依赖。
  • +
  • 不做激活前 test run。
  • +
  • 不做复杂版本列表、版本 diff 或回滚页面。
  • +
  • 不做任务取消;误触发时让当前 job 完成,后续可重新触发质检。
  • +
  • 不自动撤回已经云同步的数据。
  • +
+
+
+
+ +
+

3. 用户流程

+
+
+ 1 +
管理员进入 Synapse 数据运维 / 质检中心,看到内置脚本、最近批次、任务列表和 QA 汇总。
+
+
+ 2 +
管理员不需要上传脚本,直接点击“质检全部数据”或设置筛选条件后点击“质检筛选结果”。
+
+
+ 3 +
Keystone 为匹配的 Episode 创建 quality_jobs,每个 job 都记录内置脚本 key、version 和 SHA。
+
+
+ 4 +
Keystone 内置 worker 领取 pending job,先读取 MCAP 对象大小,再读取前 8 字节和后 8 字节,执行内置 Python 脚本,并保存 stdout、stderr、result JSON 摘要。
+
+
+ 5 +
job 完成后 Keystone 按脚本输出更新 Episode QA 状态。云同步仍只放行 approvedinspector_approved
+
+
+
+ +
+

4. 架构

+
+
Synapse Admin
+  -> /api/v1/admin/quality/batches      start all-data or filtered QA
+  -> /api/v1/admin/quality/jobs         inspect job status and result
+  -> /api/v1/episodes/:id/quality-*     rerun or manual decision
+
+Keystone API
+  -> MySQL: quality_batches, quality_jobs
+  -> MySQL: episodes.qa_status, qa_score, quality_flag
+  -> MinIO: episode MCAP
+
+Keystone built-in quality worker
+  -> claims pending quality_jobs
+  -> reads MCAP object size, leading 8 bytes and trailing 8 bytes
+  -> runs builtin_mcap_preview_smoke_check.py
+  -> updates quality_jobs and episodes
+

+ 简化版不新增独立部署进程,worker 随 Keystone 启动。默认并发为 1,避免首发时脚本执行挤占过多机器资源。 +

+
+
+ +
+

5. 数据模型

+

新增 2 张表即可支撑批量触发和执行记录。脚本不落库、不上传,脚本 key、version、SHA 由 Keystone 代码常量提供。人工复核继续复用已有 inspections 表。

+ +

quality_batches

+
id
+script_key              builtin_mcap_preview_smoke_check
+script_version          e.g. 2026.06.01
+script_sha256           sha256 of embedded script content
+trigger_type            all | filtered | episode | auto_episode
+triggered_by            admin username or system
+filter_json             actual filters used to enqueue jobs
+status                  pending | running | completed
+total_count
+pending_count
+running_count
+succeeded_count
+failed_count
+created_at
+completed_at
+

批次只用于 Synapse 展示进度。统计值可以由 quality_jobs 聚合后回写,首发也可以查询时实时计算。

+ +

quality_jobs

+
id
+batch_id
+episode_id
+script_key
+script_version
+script_sha256
+status                  pending | running | succeeded | failed | timeout | invalid_result
+decision                passed | rejected | uncertain
+score
+summary
+result_json
+stdout_excerpt
+stderr_excerpt
+error_message
+duration_ms
+locked_at
+started_at
+finished_at
+created_at
+updated_at
+

+ script_keyscript_versionscript_sha256 是执行快照。即使后续升级内置脚本,历史 job 也能看出当时实际使用的检查逻辑。 +

+
+ +
+

6. 状态规则

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
脚本/执行结果Episode QA 状态说明
job 创建或运行中qa_running用于在列表和详情页提示正在质检。
decision = passedapproved同时写 auto_approved = true,可进入云同步。
decision = rejectedrejected数据不可用,云同步不放行。
decision = uncertainneeds_inspection脚本无法可靠判断,交给人工复核。
failed / timeout / invalid_resultneeds_inspection执行失败不等于数据坏,只要求人工复核。
人工通过inspector_approved写入 inspections,可进入云同步。
人工驳回rejected写入 inspections,不进入云同步。
+

+ 对已经云同步的 Episode 重新质检时,只更新 Keystone 本地 QA 状态和质检记录,不自动删除或撤回云端对象。Synapse 在“包含已同步数据”开关旁提示这个风险。 +

+
+ +
+

7. 内置脚本

+
+

+ 首发固定脚本为 builtin_mcap_preview_smoke_check。它不是完整 MCAP 解析器,而是预览 smoke check:先检查对象是否能拿到有效 size 和必要字节范围,再用边界 magic 检查快速拦截数据预览中出现的这类错误: +

+
Expected MCAP magic '89 4d 43 41 50 30 0d 0a',
+found '2f 06 84 5c 5b ea dc 8b' [library=libmcap 2.1.2]
+

+ MCAP magic 是 8 字节:89 4d 43 41 50 30 0d 0a。MCAP 文件开头有一次 magic,文件结尾也有一次 trailing magic。 + Synapse 预览使用的 @mcap/core 会在初始化时先检查开头 magic,再读取 Header,随后检查结尾 trailing magic。 +

+

+ 因为错误里已经带了 [library=libmcap 2.1.2],而 library 字段记录在 MCAP Header 中,说明 Header 大概率已经读成功;这种情况下更可疑的是文件结尾 trailing magic 不匹配,而不是文件开头不匹配。所以默认脚本必须同时检查开头和结尾,不能只查开头。 +

+

+ 实现时优先使用对象存储 range read:读取 offset 0..7size-8..size-1 即可发现这类 magic mismatch。只有当前 S3 client 不方便做 range read 时,才退回下载到临时文件后检查边界字节。 + 边界 magic 匹配只能说明这个 smoke check 通过,不代表 MCAP 内部索引、chunk、CRC 或压缩数据一定可读。 +

+

+ Failed to fetch size: 416 Requested Range Not Satisfiable 也属于这个 smoke check 的覆盖范围。MinIO UI 里 size 显示 - 时,通常表示前端拿不到普通文件对象的有效大小,或者当前路径不是一个可按字节范围读取的 MCAP 对象。该问题发生在 magic 检查之前,应记录为对象读取/size 异常。 +

+ +

判定规则

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
检查项结果Episode QA 状态
无法获取对象 size,或 size 为空 / 未知 / 非数字decision = rejectedrejected
文件大小小于 16 字节decision = rejectedrejected
读取开头或结尾 range 返回 416 Requested Range Not Satisfiabledecision = rejectedrejected
开头 8 字节不是 MCAP magicdecision = rejectedrejected
结尾 8 字节不是 MCAP magicdecision = rejectedrejected
开头和结尾 magic 都匹配decision = passedapproved,含义是通过当前内置 smoke check
对象网络超时、权限错误、MinIO 临时错误、脚本异常status = failedneeds_inspection
+ +

默认输出示例

+
{
+  "decision": "rejected",
+  "score": 0.0,
+  "summary": "MCAP object range is not satisfiable",
+  "findings": [
+    {
+      "severity": "error",
+      "code": "mcap_range_not_satisfiable",
+      "message": "Failed to read MCAP boundary bytes: 416 Requested Range Not Satisfiable",
+      "http_status": 416,
+      "hint": "MinIO object size is unavailable or the path is not a readable MCAP object"
+    }
+  ]
+}
+
{
+  "decision": "rejected",
+  "score": 0.0,
+  "summary": "MCAP trailing magic mismatch",
+  "findings": [
+    {
+      "severity": "error",
+      "code": "mcap_trailing_magic_mismatch",
+      "message": "Expected trailing magic 89 4d 43 41 50 30 0d 0a, found 2f 06 84 5c 5b ea dc 8b",
+      "expected_hex": "89 4d 43 41 50 30 0d 0a",
+      "actual_hex": "2f 06 84 5c 5b ea dc 8b",
+      "offset": "file_end_minus_8"
+    }
+  ]
+}
+

+ 这个脚本只能证明 MCAP 边界 magic 没有命中已知预览错误,不能证明 MCAP 内部索引、chunk、CRC 或压缩数据一定可读。后续需要更强校验时,再升级为完整 MCAP reader smoke test。 +

+
+
+ +
+

8. 脚本契约

+
+

首发不做用户上传脚本,内置脚本使用轻量输入 JSON。Keystone worker 先尝试读取 MCAP 对象大小、开头 8 字节和结尾 8 字节,再执行 Python:

+
python3 script.py --input input.json --output result.json
+

脚本必须将业务结果写入 --output 指定的 JSON 文件。stdout / stderr 只作为诊断日志保存。

+

输入文件

+
{
+  "episode_id": 42,
+  "mcap_path": "bucket/path/to/file.mcap",
+  "file_size_bytes": 123456789,
+  "object_status": "readable",
+  "object_error": null,
+  "expected_magic_hex": "89 4d 43 41 50 30 0d 0a",
+  "leading_magic_hex": "89 4d 43 41 50 30 0d 0a",
+  "trailing_magic_hex": "2f 06 84 5c 5b ea dc 8b"
+}
+

如果 worker 在获取 size 或 range read 时已经失败,也仍然生成输入 JSON 交给内置脚本输出标准化结果:

+
{
+  "episode_id": 42,
+  "mcap_path": "bucket/path/to/file.mcap",
+  "file_size_bytes": null,
+  "object_status": "range_not_satisfiable",
+  "object_error": "416 Requested Range Not Satisfiable",
+  "expected_magic_hex": "89 4d 43 41 50 30 0d 0a",
+  "leading_magic_hex": null,
+  "trailing_magic_hex": null
+}
+

最小输出

+
{
+  "decision": "passed",
+  "score": 1.0,
+  "summary": "ok",
+  "findings": []
+}
+

字段规则

+ + + + + + + + + + + + + + + + + + + + + + + + + +
字段规则
decision必填,只允许 passedrejecteduncertain
score可选,0 到 1,写入 episodes.qa_score
summary可选字符串,写入 job 摘要;当 rejected / uncertain 时同步到 episodes.quality_flag
findings可选数组,完整保存在 quality_jobs.result_json
+
    +
  • 内置脚本
  • +
  • 默认超时 30 秒
  • +
  • 默认并发 1
  • +
  • 不安装依赖
  • +
  • 不暴露数据库或 MinIO 凭证
  • +
+
+
+ +
+

9. API 草案

+
+
+

质检中心

+
GET  /api/v1/admin/quality/overview
+GET  /api/v1/admin/quality/batches
+POST /api/v1/admin/quality/batches
+GET  /api/v1/admin/quality/jobs
+
+
+

Episode 质检操作

+
GET  /api/v1/episodes/:id/quality-jobs
+POST /api/v1/episodes/:id/quality-rerun
+POST /api/v1/episodes/:id/quality-decision
+
+
+ +

质检中心概览

+
GET /api/v1/admin/quality/overview
+
+{
+  "script": {
+    "key": "builtin_mcap_preview_smoke_check",
+    "version": "2026.06.01",
+    "expected_magic_hex": "89 4d 43 41 50 30 0d 0a",
+    "checks": ["object_size", "range_read", "leading_magic", "trailing_magic"]
+  },
+  "summary": {
+    "total": 1200,
+    "pending_qa": 20,
+    "qa_running": 4,
+    "approved": 1100,
+    "needs_inspection": 30,
+    "rejected": 46
+  }
+}
+ +

触发全量质检

+
POST /api/v1/admin/quality/batches
+Content-Type: application/json
+
+{
+  "scope": "all",
+  "include_cloud_synced": true
+}
+ +

触发筛选质检

+
POST /api/v1/admin/quality/batches
+Content-Type: application/json
+
+{
+  "scope": "filtered",
+  "filters": {
+    "qa_status": ["pending_qa", "needs_inspection"],
+    "created_at_from": "2026-06-01T00:00:00Z",
+    "created_at_to": "2026-06-02T00:00:00Z",
+    "robot_device_id": "robot-001",
+    "collector_operator_id": "collector-001"
+  }
+}
+ +

人工复核

+
POST /api/v1/episodes/:id/quality-decision
+Content-Type: application/json
+
+{
+  "decision": "approved",
+  "reason": "人工预览 MCAP 后确认可用"
+}
+

所有 /admin/quality/* 和写操作首发只开放给 admin。

+
+ +
+

10. Synapse 页面

+
+
+

导航入口

+
    +
  • AdminSidebar.vue 的“数据运维”分组新增 质检中心
  • +
  • 新增路由 /admin/quality,路由名 AdminQualityCenter
  • +
  • 页面文件建议为 views/admin/quality/QualityCenter.vue
  • +
+
+
+

质检中心首屏

+
    +
  • 顶部汇总:总数据、待质检、质检中、已通过、需复核、已驳回。
  • +
  • 脚本卡片:内置脚本 key、version、检查项、预期 magic、默认超时。
  • +
  • 操作区:质检全部数据、质检筛选结果。
  • +
  • 筛选项复用数据生产统计页面口径:QA 状态、时间范围、设备、采集员。
  • +
+
+
+

列表与详情

+
    +
  • 批次列表展示发起人、范围、总数、完成数、失败数、创建时间。
  • +
  • job 列表展示 Episode、状态、decision、score、summary、耗时。
  • +
  • 点击 Episode 跳转现有 Episode 详情页。
  • +
  • Episode 详情页新增“脚本质检”卡片:最近 job、result JSON、重跑、人工通过/驳回。
  • +
+
+
+

+ 首发页面可以复用现有 ListPageLayoutDataTableModalBaseInputBaseSelectBaseTextarea,不新建设计系统组件。 +

+
+ +
+

11. 实施顺序

+
    +
  1. Keystone 新增迁移:quality_batchesquality_jobs
  2. +
  3. Keystone 新增内置脚本文件或嵌入式脚本常量:builtin_mcap_preview_smoke_check.py
  4. +
  5. Keystone 新增 QualityHandler:overview、batch enqueue、job 列表、Episode 重跑和人工决策。
  6. +
  7. Keystone 新增内置 quality worker:领取 pending job、执行内置脚本、落库结果、更新 Episode QA 状态。
  8. +
  9. 上传完成路径增加自动入队:当 QUALITY_AUTO_RUN_ON_UPLOAD=true 时创建单 Episode job。
  10. +
  11. Synapse 新增 api/quality.jsQualityCenter.vue、路由和侧边栏入口。
  12. +
  13. Synapse Episode 详情页新增脚本质检卡片,接入重跑和人工通过/驳回。
  14. +
  15. 验证:全量触发、筛选触发、size 不可用、range 416、开头 magic 异常、结尾 magic 异常、文件过小、执行异常、云同步资格。
  16. +
+ +

首发验收标准

+
+
    +
  • 管理员能在“质检中心”看到内置脚本 builtin_mcap_preview_smoke_check 的说明。
  • +
  • 管理员能点击一次对全部 Episode 创建质检任务。
  • +
  • 管理员能按筛选条件只质检一部分 Episode。
  • +
  • 对象 size 不可用或 MinIO 显示 size 为 - 时 Episode 变为 rejected,finding 写明 size 异常。
  • +
  • 读取边界字节返回 416 Requested Range Not Satisfiable 时 Episode 变为 rejected,finding 写明 range 异常。
  • +
  • 开头和结尾 magic 都正确时 Episode 变为 approved,表示当前内置 smoke check 通过。
  • +
  • 开头 magic 不匹配时 Episode 变为 rejected,finding 写明实际开头 8 字节。
  • +
  • 结尾 magic 不匹配时 Episode 变为 rejected,finding 写明实际结尾 8 字节。
  • +
  • 脚本超时、脚本异常,或对象读取遇到网络超时、权限错误、MinIO 临时错误时,Episode 变为 needs_inspection。
  • +
  • Episode 详情能看到最近一次脚本执行结果,并能人工通过或驳回。
  • +
  • 云同步继续只允许 approvedinspector_approved
  • +
+
+
+
+ + diff --git a/docs/designs/data-quality-script-management.zh.html b/docs/designs/data-quality-script-management.zh.html new file mode 100755 index 0000000..c58ae10 --- /dev/null +++ b/docs/designs/data-quality-script-management.zh.html @@ -0,0 +1,878 @@ + + + + + + + 数据质检脚本管理设计 + + + +
+
+
+

Keystone / Synapse Phase 1

+

数据质检脚本管理设计

+

+ 第一版只做一个可落地的最小闭环:上传 Python 质检脚本,Episode 入库后自动触发独立 Runner 执行,结果写回 QA 状态,并由 QA 状态控制云同步资格。 +

+
+ +
+ + + +
+

1. 一期范围

+
+
+

包含

+
    +
  • 只支持 Python 脚本。
  • +
  • 上传单个 .py 文件,最大 1 MB。
  • +
  • 脚本文件存 MinIO,元数据和执行记录存 MySQL。
  • +
  • 脚本版本不可变,上传后默认不激活。
  • +
  • 管理员显式激活版本;同一脚本只允许一个 active 版本。
  • +
  • 触发范围支持 globalsop
  • +
  • Episode 新建后自动触发质检。
  • +
  • 支持单个 Episode 手动重跑。
  • +
  • 支持从 needs_inspectionrejected 人工覆盖。
  • +
  • 固定 Runner Runtime:python3.11-mcap
  • +
+
+
+

不包含

+
    +
  • 不做 UI 在线代码编辑器。
  • +
  • 不支持 ZIP、多文件脚本包、Git 脚本源。
  • +
  • 不支持脚本自带依赖安装。
  • +
  • 不做激活前 test run。
  • +
  • 不做上传时 Python 语法校验。
  • +
  • 不做 job cancel、自动 retry、历史批量回扫。
  • +
  • 不建批次 QA 汇总表。
  • +
  • 不做补采批次,不做任务回退。
  • +
  • 不改 tasks.statusbatches.statusorders.status
  • +
+
+
+
+ +
+

2. 已实现基础

+

以下能力已存在于当前 Keystone / Synapse 体系中,是质检脚本管理的一期实现基础,不需要从零建设。

+
+
+

Keystone 已有基础

+
    +
  • 任务、批次、订单、Episode 等生产数据模型。
  • +
  • 上传完成后创建 Episode,并保存 MCAP / sidecar 的对象路径。
  • +
  • Episode 已有 qa_statusauto_approvedquality_flag 等质检相关字段。
  • +
  • MinIO / S3 存储接入能力,可保存 MCAP、sidecar 和后续脚本 artifact。
  • +
  • 云同步 worker 已按 approved / inspector_approved 过滤可同步 Episode。
  • +
  • JWT 鉴权和 admin / data_collector 角色基础。
  • +
+
+
+

Synapse 已有基础

+
    +
  • Admin 管理后台布局、导航和 CRUD 页面模式。
  • +
  • 通用 API client、分页列表、表单、弹窗和确认对话框组件。
  • +
  • Episode 详情页和数据预览能力。
  • +
  • 任务、批次、统计、云同步等后台页面,可接入 QA 状态展示。
  • +
  • 已有数据生产统计页面,可继续按 episodes.qa_status 聚合。
  • +
+
+
+
+ +
+

3. 待实现模块

+

一期需要新增的是质检脚本管理和独立 Runner 执行闭环,生产主状态机不纳入本次改造。

+
+
+

Keystone 后端

+
    +
  • 新增 quality_scriptsquality_script_versionsquality_runsquality_run_jobsquality_overrides 表。
  • +
  • 新增脚本管理、版本上传、激活/停用、run/job 查询 API。
  • +
  • 在 Episode 创建后匹配 active 脚本并创建质检 run/jobs。
  • +
  • 新增手动重跑和人工覆盖 API。
  • +
  • 调整上传完成路径:有脚本时进入 qa_running,无脚本时自动 approved
  • +
+
+
+

Quality Runner

+
    +
  • 新增独立命令或服务 keystone-quality-runner
  • +
  • 从 MySQL 领取 quality_run_jobs
  • +
  • 从 MinIO 下载 MCAP、sidecar 和 Python 脚本。
  • +
  • 按固定命令执行 Python 子进程并限制超时。
  • +
  • 写回 job 结果,并在 run 完成后结算 Episode QA 状态。
  • +
+
+
+

Synapse 前端

+
    +
  • 新增“数据质检”后台入口。
  • +
  • 脚本列表、脚本元数据表单、版本上传和激活/停用页面。
  • +
  • run/job 列表和结果详情。
  • +
  • Episode 详情页增加 QA 面板、手动重跑和人工覆盖入口。
  • +
  • 批次、统计、云同步相关页面展示有效 QA 状态。
  • +
+
+
+
+ +
+

4. 架构

+
+
Synapse Admin
+  -> Keystone REST API
+      -> MySQL: scripts, versions, runs, jobs, overrides
+      -> MinIO: uploaded script files
+
+Keystone upload_complete
+  -> creates episode
+  -> matches active quality script versions
+  -> creates quality run and jobs
+
+keystone-quality-runner
+  -> polls MySQL quality_run_jobs
+  -> downloads MCAP, sidecar, and script from MinIO
+  -> runs Python script in a child process
+  -> writes job result
+  -> settles the parent quality run
+  -> updates episode QA status
+

+ Runner 是独立进程,不放在 Keystone API 进程内执行 Python。推荐同仓库、同发布包,但运行成两个命令: + keystone-edgekeystone-quality-runner。 +

+
+
+ +
+

5. Episode QA 状态

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
状态含义
pending_qaEpisode 已创建,但质检还没有开始。
qa_running当前质检轮次仍有 pending 或 running job。
approved所有匹配脚本都通过,或没有任何匹配脚本。
needs_inspection脚本异常、超时、输出非法,或脚本返回 uncertain
inspector_approved管理员人工确认通过。
rejected脚本或管理员明确驳回该数据。
+

+ 一期质检系统不写 episodes.qa_status = failed。执行失败不等于数据坏,统一进入 needs_inspection。 +

+

云同步资格仍只认 approvedinspector_approved

+
+ +
+

6. 脚本版本规则

+
+
    +
  • 上传只接受一个 .py 文件,空文件拒绝,最大 1 MB。
  • +
  • 版本号必须是 SemVer:MAJOR.MINOR.PATCH,不带 v 前缀。
  • +
  • Keystone 计算并保存 SHA256。
  • +
  • MinIO 路径使用 slug/version/sha256.py,不信任原始文件名。
  • +
  • 上传版本默认 inactive,不会自动激活。
  • +
  • 激活版本是单独的 admin 操作。
  • +
  • 激活一个版本会停用同一脚本的旧 active 版本。
  • +
  • 已经排队或运行中的 job 继续使用它引用的不可变版本。
  • +
  • 一期不物理删除脚本版本或 MinIO artifact。
  • +
+

+ 版本同时携带执行策略:languageruntimetimeout_seconds、 + scope_typescope_ref_iddefault_config。修改代码、配置、超时或适用范围都需要上传新版本。 +

+
+
+ +
+

7. 数据模型

+

一期使用 5 张质检表,先不拆独立 findings 表;脚本输出里的 findings 存在 quality_run_jobs.result_json

+ +

quality_scripts

+
id
+slug
+name
+description
+status              active | inactive
+created_by
+created_at
+updated_at
+deleted_at
+ +

quality_script_versions

+
id
+script_id
+version
+language            python
+runtime             python3.11-mcap
+entrypoint          normalized uploaded filename
+artifact_uri
+artifact_sha256
+artifact_size_bytes
+timeout_seconds
+scope_type          global | sop
+scope_ref_id        null for global, sop id for sop
+default_config      JSON
+status              active | inactive
+created_by
+created_at
+deleted_at
+ +

quality_runs

+
id
+episode_id
+trigger_type        auto | manual
+triggered_by        system or admin user id/name
+status              pending | running | completed
+final_qa_status
+settlement_reason
+created_at
+started_at
+completed_at
+

同一个 Episode 一期只允许一个 active 质检轮次;如果还有 pending/running job,手动重跑返回 409 Conflict

+ +

quality_run_jobs

+
id
+quality_run_id
+episode_id
+script_version_id
+status              pending | running | succeeded | failed | timeout | invalid_result
+decision            passed | rejected | uncertain
+runner_id
+locked_at
+started_at
+finished_at
+score
+summary
+result_json         full script output, including findings
+stdout_excerpt
+stderr_excerpt
+error_message
+duration_ms
+created_at
+updated_at
+

status = failed 表示 Runner 或 job 执行失败,不表示数据被驳回。

+ +

quality_overrides

+
id
+episode_id
+previous_qa_status
+new_qa_status
+decision            approved | rejected
+reason
+operator_id
+operator_name
+created_at
+

人工覆盖只更新 episodes.qa_status,不改历史 quality_runsquality_run_jobs

+
+ +
+

8. 自动触发与结算

+
+
+ 1 +
上传完成后 Keystone 创建 Episode,初始写 qa_status = pending_qa
+
+
+ 2 +
匹配所有 active global 版本,以及 SOP 匹配的 active sop 版本。
+
+
+ 3 +
如果没有匹配脚本,直接写 approvedauto_approved = true
+
+
+ 4 +
如果有匹配脚本,创建一个 quality_runs 和多条 quality_run_jobs,Episode 进入 qa_running
+
+
+ 5 +
所有 job 完成后统一结算;rejected 不短路其他脚本。
+
+
+ +

结算规则

+
if any job timed out, failed to execute, produced invalid output, or returned uncertain:
+  episode.qa_status = needs_inspection
+else if any job returned rejected:
+  episode.qa_status = rejected
+else:
+  episode.qa_status = approved
+ +

手动重跑

+
    +
  • 仅 admin 可用。
  • +
  • 使用当前 active 脚本版本。
  • +
  • 不允许临时覆盖配置。
  • +
  • 不使用历史轮次里的旧版本。
  • +
  • 如果 Episode 已有 active QA job,返回 409 Conflict
  • +
+
+ +
+

9. 脚本执行契约

+

Runner 为每个 job 准备临时目录:

+
input.mcap
+sidecar.json
+config.json
+result.json
+script.py
+

执行命令固定为:

+
python script.py --mcap input.mcap --sidecar sidecar.json --config config.json --output result.json
+

业务结果必须写入 --output 指定的 JSON 文件;stdoutstderr 只作为日志保存。

+ +

最小输出

+
{
+  "decision": "passed",
+  "score": 1.0,
+  "summary": "ok",
+  "findings": []
+}
+ +

结果字段

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
字段规则
decision必填:passedrejecteduncertain
score可选,0 到 1。
summary可选字符串。
findings可选数组,保存在 result_json 中。
findings[].severityinfowarningerror
findings[].message必填字符串。
+ +
    +
  • passed:脚本接受数据
  • +
  • rejected:脚本驳回数据
  • +
  • uncertain:脚本无法可靠判断
  • +
+

输出文件缺失、JSON 非法、缺少 decision 或未知 decision,job 状态写为 invalid_result

+
+ +
+

10. Runner 行为

+
+
    +
  • 轮询 MySQL quality_run_jobs
  • +
  • 用事务和行锁领取 job。
  • +
  • runner_idlocked_at 标识所有权。
  • +
  • 直接从 MinIO 下载 MCAP、sidecar 和脚本文件。
  • +
  • 以子进程执行 Python。
  • +
  • 按脚本版本的 timeout_seconds 做超时控制。
  • +
  • 截断保存 stdout/stderr。
  • +
  • 不把数据库或 MinIO 凭证传给脚本进程。
  • +
  • 不自动 retry。
  • +
  • 处于 running 状态的 job 如果超过 timeout_seconds 加宽限期仍未结束,视为 stale job,按执行失败处理。
  • +
+

建议配置项:QUALITY_RUNNER_CONCURRENCYQUALITY_RUNNER_POLL_INTERVAL_SECONDS

+
+
+ +
+

11. 人工覆盖

+
+
+

允许来源状态

+
    +
  • needs_inspection
  • +
  • rejected
  • +
+
+
+

允许目标

+
    +
  • approved -> inspector_approved
  • +
  • rejected -> rejected
  • +
+
+
+
    +
  • reason 必填。
  • +
  • pending_qaqa_running 时不允许覆盖。
  • +
  • 覆盖只更新 Episode 的 effective QA 状态。
  • +
  • 覆盖不改脚本执行历史。
  • +
  • 覆盖写入 quality_overrides
  • +
+
POST /api/v1/episodes/:id/quality-override
+Content-Type: application/json
+
+{
+  "decision": "approved",
+  "reason": "manual review confirmed the data is usable"
+}
+
+ +
+

12. API 草案

+
+
+

脚本管理

+
GET   /api/v1/quality/scripts
+POST  /api/v1/quality/scripts
+GET   /api/v1/quality/scripts/:id
+PATCH /api/v1/quality/scripts/:id
+POST  /api/v1/quality/scripts/:id/versions
+GET   /api/v1/quality/scripts/:id/versions
+POST  /api/v1/quality/script-versions/:id/activate
+POST  /api/v1/quality/script-versions/:id/deactivate
+
+
+

执行与复核

+
GET  /api/v1/quality/runs
+GET  /api/v1/quality/jobs
+POST /api/v1/episodes/:id/quality-runs
+POST /api/v1/episodes/:id/quality-override
+
+
+

所有脚本管理、手动重跑和人工覆盖 API 一期都只开放给 admin。

+
+ +
+

13. Synapse 页面范围

+
+
+

脚本管理

+
    +
  • 脚本列表。
  • +
  • 创建和编辑元数据。
  • +
  • 上传脚本版本。
  • +
  • 激活和停用版本。
  • +
+
+
+

执行记录

+
    +
  • run/job 列表。
  • +
  • 按状态、脚本、Episode 筛选。
  • +
  • 查看 stdout/stderr 摘要。
  • +
  • 查看 result_json 中的 findings。
  • +
+
+
+

Episode 详情

+
    +
  • 显示 effective QA 状态。
  • +
  • 显示最新 run 状态。
  • +
  • 展示每个脚本 job 结果。
  • +
  • 支持手动重跑和人工覆盖。
  • +
+
+
+

一期不需要在线代码编辑器。

+
+ +
+

14. 批次和统计查询

+

一期不保存批次 QA 汇总表。批次详情和统计页面需要时直接聚合 episodes.qa_status

+
SELECT
+  COUNT(*) AS total,
+  SUM(qa_status = 'approved') AS approved_count,
+  SUM(qa_status = 'inspector_approved') AS inspector_approved_count,
+  SUM(qa_status = 'rejected') AS rejected_count,
+  SUM(qa_status = 'needs_inspection') AS needs_inspection_count,
+  SUM(qa_status = 'qa_running') AS qa_running_count,
+  SUM(qa_status = 'pending_qa') AS pending_qa_count
+FROM episodes
+WHERE batch_id = ? AND deleted_at IS NULL;
+
+ +
+

15. 固定 Runtime

+
+

一期 Runtime 固定为 python3.11-mcap,脚本不能上传或安装自己的依赖。

+

初始依赖集合

+
    +
  • mcap
  • +
  • numpy
  • +
  • pandas
  • +
  • Pillow
  • +
  • opencv-python-headless
  • +
  • pyyaml
  • +
  • jsonschema
  • +
+
+
+ +
+

16. 后续演进

+
    +
  • 第二阶段支持不可变 ZIP 包,包含 main.pyrequirements.lock 和可选 config.schema.json
  • +
  • 第二阶段在执行前构建受控 Python 环境,不在 job 执行时动态安装依赖。
  • +
  • 第三阶段支持脚本专属容器镜像,用于复杂依赖、模型文件、GPU Runtime 或更强隔离。
  • +
  • 未来执行器继续复用同一套输入和输出契约。
  • +
+
+
+ + diff --git a/internal/api/handlers/sync.go b/internal/api/handlers/sync.go index 9d1694c..7207912 100644 --- a/internal/api/handlers/sync.go +++ b/internal/api/handlers/sync.go @@ -20,21 +20,28 @@ import ( // SyncHandler handles cloud sync related HTTP requests. type SyncHandler struct { - db *sqlx.DB - syncWorker *services.SyncWorker + db *sqlx.DB + syncWorker *services.SyncWorker + cliSyncRunner *services.CLISyncRunner } // NewSyncHandler creates a new SyncHandler. -func NewSyncHandler(db *sqlx.DB, syncWorker *services.SyncWorker) *SyncHandler { - return &SyncHandler{db: db, syncWorker: syncWorker} +func NewSyncHandler(db *sqlx.DB, syncWorker *services.SyncWorker, cliSyncRunner ...*services.CLISyncRunner) *SyncHandler { + var runner *services.CLISyncRunner + if len(cliSyncRunner) > 0 { + runner = cliSyncRunner[0] + } + return &SyncHandler{db: db, syncWorker: syncWorker, cliSyncRunner: runner} } // RegisterRoutes registers cloud sync related routes. func (h *SyncHandler) RegisterRoutes(apiV1 *gin.RouterGroup) { apiV1.POST("/sync/episodes", h.TriggerBatchSync) apiV1.POST("/sync/episodes/:id", h.TriggerEpisodeSync) + apiV1.POST("/sync/episodes/:id/cli", h.TriggerEpisodeCLISync) apiV1.GET("/sync/episodes", h.ListSyncJobs) apiV1.GET("/sync/episodes/summary", h.ListEpisodeSyncSummaries) + apiV1.GET("/sync/episodes/:id/cli/status", h.GetEpisodeCLISyncStatus) apiV1.GET("/sync/episodes/:id/logs", h.ListEpisodeSyncLogs) apiV1.GET("/sync/episodes/:id/status", h.GetSyncStatus) apiV1.GET("/sync/config", h.GetSyncConfig) @@ -114,6 +121,26 @@ type SyncEpisodeSummaryResponse struct { CompletedAt *string `json:"completed_at,omitempty"` } +// CLISyncRunResponse represents one CLI sync sidepath run. 
+type CLISyncRunResponse struct { + ID int64 `json:"id"` + EpisodeID int64 `json:"episode_id"` + Status string `json:"status"` + SourcePath *string `json:"source_path,omitempty"` + TempPath *string `json:"temp_path,omitempty"` + FileID *string `json:"file_id,omitempty"` + LogicalUploadID *string `json:"logical_upload_id,omitempty"` + UploadID *string `json:"upload_id,omitempty"` + Bucket *string `json:"bucket,omitempty"` + ObjectKey *string `json:"object_key,omitempty"` + FileSize *int64 `json:"file_size,omitempty"` + OSSObjectETag *string `json:"oss_object_etag,omitempty"` + DurationSec *int64 `json:"duration_sec,omitempty"` + ErrorMessage *string `json:"error_message,omitempty"` + StartedAt *string `json:"started_at,omitempty"` + CompletedAt *string `json:"completed_at,omitempty"` +} + // SyncJobListResponse represents the response for listing sync jobs. type SyncJobListResponse struct { Items []SyncJobResponse `json:"items"` @@ -258,6 +285,82 @@ func (h *SyncHandler) TriggerEpisodeSync(c *gin.Context) { }) } +// TriggerEpisodeCLISync triggers the dp CLI cloud sync sidepath for one episode. 
+// +// @Summary Trigger single episode CLI cloud sync +// @Description Enqueues a specific episode for cloud sync through the dp CLI sidepath +// @Tags sync +// @Produce json +// @Param id path int true "Episode ID" +// @Success 202 {object} map[string]interface{} +// @Failure 400 {object} map[string]string +// @Failure 404 {object} map[string]string +// @Failure 409 {object} map[string]string +// @Failure 429 {object} map[string]string +// @Failure 503 {object} map[string]string +// @Router /sync/episodes/{id}/cli [post] +func (h *SyncHandler) TriggerEpisodeCLISync(c *gin.Context) { + if h.cliSyncRunner == nil || !h.cliSyncRunner.IsEnabled() { + c.JSON(http.StatusServiceUnavailable, gin.H{"error": "CLI sync is not configured"}) + return + } + + episodeID, ok := parseSyncEpisodeIDParam(c) + if !ok { + return + } + + runID, err := h.cliSyncRunner.EnqueueEpisode(c.Request.Context(), episodeID) + if err != nil { + h.writeCLISyncError(c, episodeID, err) + return + } + + c.JSON(http.StatusAccepted, gin.H{ + "status": "accepted", + "episode_id": episodeID, + "run_id": runID, + "message": "episode accepted for CLI cloud sync", + }) +} + +// GetEpisodeCLISyncStatus returns the latest CLI sync sidepath run for one episode. 
+// +// @Summary Get episode CLI sync status +// @Description Returns the latest dp CLI sync run for a specific episode +// @Tags sync +// @Produce json +// @Param id path int true "Episode ID" +// @Success 200 {object} CLISyncRunResponse +// @Failure 400 {object} map[string]string +// @Failure 404 {object} map[string]string +// @Failure 503 {object} map[string]string +// @Router /sync/episodes/{id}/cli/status [get] +func (h *SyncHandler) GetEpisodeCLISyncStatus(c *gin.Context) { + if h.cliSyncRunner == nil || !h.cliSyncRunner.IsEnabled() { + c.JSON(http.StatusServiceUnavailable, gin.H{"error": "CLI sync is not configured"}) + return + } + + episodeID, ok := parseSyncEpisodeIDParam(c) + if !ok { + return + } + + run, err := h.cliSyncRunner.LatestRun(c.Request.Context(), episodeID) + if err == sql.ErrNoRows { + c.JSON(http.StatusNotFound, gin.H{"error": "no CLI sync record found for this episode"}) + return + } + if err != nil { + logger.Printf("[SYNC] Failed to query CLI sync status for episode %d: %v", episodeID, err) + c.JSON(http.StatusInternalServerError, gin.H{"error": "failed to get CLI sync status"}) + return + } + + c.JSON(http.StatusOK, cliSyncRunResponseFromRun(run)) +} + // ListSyncJobs lists sync log entries with filtering and pagination. 
// // @Summary List sync jobs @@ -603,10 +706,18 @@ func (h *SyncHandler) GetSyncConfig(c *gin.Context) { autoScanEnabled = h.syncWorker.AutoScanEnabled() maxRetries = h.syncWorker.MaxRetries() } + cliSyncEnabled := false + cliSyncRunning := false + if h.cliSyncRunner != nil { + cliSyncEnabled = h.cliSyncRunner.IsEnabled() + cliSyncRunning = h.cliSyncRunner.IsRunning() + } c.JSON(http.StatusOK, gin.H{ "worker_running": workerRunning, "auto_scan_enabled": autoScanEnabled, "max_retries": maxRetries, + "cli_sync_enabled": cliSyncEnabled, + "cli_sync_running": cliSyncRunning, }) } @@ -648,6 +759,79 @@ func syncEpisodeSummaryResponseFromRow(r syncEpisodeSummaryRow) SyncEpisodeSumma } } +func cliSyncRunResponseFromRun(r *services.CLISyncRun) CLISyncRunResponse { + if r == nil { + return CLISyncRunResponse{} + } + return CLISyncRunResponse{ + ID: r.ID, + EpisodeID: r.EpisodeID, + Status: r.Status, + SourcePath: nullableString(r.SourcePath), + TempPath: nullableString(r.TempPath), + FileID: nullableString(r.FileID), + LogicalUploadID: nullableString(r.LogicalUploadID), + UploadID: nullableString(r.UploadID), + Bucket: nullableString(r.Bucket), + ObjectKey: nullableString(r.ObjectKey), + FileSize: nullableInt64(r.FileSize), + OSSObjectETag: nullableString(r.OSSObjectETag), + DurationSec: nullableInt64(r.DurationSec), + ErrorMessage: nullableString(r.ErrorMessage), + StartedAt: nullableTime(r.StartedAt), + CompletedAt: nullableTime(r.CompletedAt), + } +} + +func parseSyncEpisodeIDParam(c *gin.Context) (int64, bool) { + idStr := c.Param("id") + episodeID, err := strconv.ParseInt(strings.TrimSpace(idStr), 10, 64) + if err != nil || episodeID <= 0 { + c.JSON(http.StatusBadRequest, gin.H{"error": "invalid episode id"}) + return 0, false + } + return episodeID, true +} + +func (h *SyncHandler) writeCLISyncError(c *gin.Context, episodeID int64, err error) { + switch { + case errors.Is(err, services.ErrCLISyncDisabled), errors.Is(err, services.ErrCLISyncNotRunning): + 
c.JSON(http.StatusServiceUnavailable, gin.H{ + "error": err.Error(), + "episode_id": episodeID, + "status": "cli_sync_unavailable", + }) + case errors.Is(err, services.ErrCLISyncEpisodeNotFound): + c.JSON(http.StatusNotFound, gin.H{ + "error": "episode not found", + "episode_id": episodeID, + }) + case errors.Is(err, services.ErrCLISyncNotEligible): + c.JSON(http.StatusBadRequest, gin.H{ + "error": err.Error(), + "episode_id": episodeID, + "status": "not_eligible", + }) + case errors.Is(err, services.ErrCLISyncAlreadySynced), + errors.Is(err, services.ErrCLISyncAlreadyActive), + errors.Is(err, services.ErrCLISyncNormalSyncActive): + c.JSON(http.StatusConflict, gin.H{ + "error": err.Error(), + "episode_id": episodeID, + "status": "already_active", + }) + case errors.Is(err, services.ErrCLISyncQueueFull): + c.JSON(http.StatusTooManyRequests, gin.H{ + "error": err.Error(), + "episode_id": episodeID, + "status": "queue_full", + }) + default: + logger.Printf("[SYNC] CLI enqueue episode %d failed: %v", episodeID, err) + c.JSON(http.StatusInternalServerError, gin.H{"error": "failed to enqueue CLI sync"}) + } +} + func nullableInt64(v sql.NullInt64) *int64 { if !v.Valid { return nil diff --git a/internal/config/config.go b/internal/config/config.go index 0f1a635..0d56399 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -8,6 +8,7 @@ package config import ( "fmt" "os" + "os/exec" "strconv" "strings" ) @@ -19,6 +20,7 @@ type Config struct { Storage StorageConfig QA QAConfig Sync SyncConfig + CLISync CLISyncConfig Auth AuthConfig Features FeaturesConfig Monitoring MonitoringConfig @@ -89,6 +91,20 @@ type SyncConfig struct { MaxRestartCount int // max number of upload restarts before permanent failure; 0 uses uploader default (3) } +// CLISyncConfig controls the emergency dp CLI cloud sync sidepath. 
+type CLISyncConfig struct { + Enabled bool + DPBin string + DPConfigPath string + TempDir string + MaxConcurrent int + QueueSize int + TimeoutSec int + KeepTemp bool + MaxTags int + MaxTagBytes int +} + // FeaturesConfig feature flags configuration type FeaturesConfig struct { StrataEnabled bool @@ -204,6 +220,18 @@ func Load() (*Config, error) { PersistRootDir: getEnv("KEYSTONE_SYNC_PERSIST_ROOT_DIR", ""), MaxRestartCount: getEnvInt("KEYSTONE_SYNC_MAX_RESTART_COUNT", 3), }, + CLISync: CLISyncConfig{ + Enabled: getEnvBool("KEYSTONE_CLI_SYNC_ENABLED", false), + DPBin: getEnv("KEYSTONE_CLI_SYNC_DP_BIN", "dp"), + DPConfigPath: getEnv("KEYSTONE_CLI_SYNC_DP_CONFIG", ""), + TempDir: getEnv("KEYSTONE_CLI_SYNC_TEMP_DIR", "/var/lib/keystone/cli-sync"), + MaxConcurrent: getEnvInt("KEYSTONE_CLI_SYNC_MAX_CONCURRENT", 1), + QueueSize: getEnvInt("KEYSTONE_CLI_SYNC_QUEUE_SIZE", 16), + TimeoutSec: getEnvInt("KEYSTONE_CLI_SYNC_TIMEOUT_SEC", 7200), + KeepTemp: getEnvBool("KEYSTONE_CLI_SYNC_KEEP_TEMP", false), + MaxTags: getEnvInt("KEYSTONE_CLI_SYNC_MAX_TAGS", 128), + MaxTagBytes: getEnvInt("KEYSTONE_CLI_SYNC_MAX_TAG_BYTES", 65536), + }, Auth: AuthConfig{ JWTSecret: getEnv("KEYSTONE_JWT_SECRET", ""), Issuer: getEnv("KEYSTONE_JWT_ISSUER", "keystone-edge"), @@ -319,6 +347,45 @@ func (c *Config) Validate() error { return fmt.Errorf("sync max restart count must be greater than or equal to 0 when sync is enabled") } } + if c.CLISync.Enabled { + c.CLISync.DPBin = strings.TrimSpace(c.CLISync.DPBin) + if c.CLISync.DPBin == "" { + return fmt.Errorf("KEYSTONE_CLI_SYNC_DP_BIN is required when CLI sync is enabled") + } + if _, err := exec.LookPath(c.CLISync.DPBin); err != nil { + return fmt.Errorf("KEYSTONE_CLI_SYNC_DP_BIN %q is not executable: %w", c.CLISync.DPBin, err) + } + c.CLISync.DPConfigPath = strings.TrimSpace(c.CLISync.DPConfigPath) + if c.CLISync.DPConfigPath == "" { + return fmt.Errorf("KEYSTONE_CLI_SYNC_DP_CONFIG is required when CLI sync is enabled") + } + info, err := 
os.Stat(c.CLISync.DPConfigPath) + if err != nil { + return fmt.Errorf("KEYSTONE_CLI_SYNC_DP_CONFIG %q is not readable: %w", c.CLISync.DPConfigPath, err) + } + if info.IsDir() { + return fmt.Errorf("KEYSTONE_CLI_SYNC_DP_CONFIG %q must be a file", c.CLISync.DPConfigPath) + } + c.CLISync.TempDir = strings.TrimSpace(c.CLISync.TempDir) + if c.CLISync.TempDir == "" { + return fmt.Errorf("KEYSTONE_CLI_SYNC_TEMP_DIR is required when CLI sync is enabled") + } + if c.CLISync.MaxConcurrent <= 0 { + return fmt.Errorf("KEYSTONE_CLI_SYNC_MAX_CONCURRENT must be greater than 0 when CLI sync is enabled") + } + if c.CLISync.QueueSize <= 0 { + return fmt.Errorf("KEYSTONE_CLI_SYNC_QUEUE_SIZE must be greater than 0 when CLI sync is enabled") + } + if c.CLISync.TimeoutSec <= 0 { + return fmt.Errorf("KEYSTONE_CLI_SYNC_TIMEOUT_SEC must be greater than 0 when CLI sync is enabled") + } + if c.CLISync.MaxTags <= 0 { + return fmt.Errorf("KEYSTONE_CLI_SYNC_MAX_TAGS must be greater than 0 when CLI sync is enabled") + } + if c.CLISync.MaxTagBytes <= 0 { + return fmt.Errorf("KEYSTONE_CLI_SYNC_MAX_TAG_BYTES must be greater than 0 when CLI sync is enabled") + } + } return nil } diff --git a/internal/server/server.go b/internal/server/server.go index ff908c0..b2ff67d 100644 --- a/internal/server/server.go +++ b/internal/server/server.go @@ -59,6 +59,7 @@ type Server struct { productionDashboard *handlers.ProductionDashboardHandler syncHandler *handlers.SyncHandler syncWorker *services.SyncWorker + cliSyncRunner *services.CLISyncRunner httpServer *http.Server transferWSServer *http.Server recorderWSServer *http.Server @@ -76,8 +77,8 @@ func axonTransferWriteTimeout(cfg *config.TransferConfig) time.Duration { // New creates a new server instance. // db and s3Client are optional; pass nil to disable Verified ACK. -// syncWorker is optional; pass nil to disable cloud sync API. 
-func New(cfg *config.Config, db *sqlx.DB, s3Client *s3.Client, syncWorker *services.SyncWorker) *Server { +// syncWorker and cliSyncRunner are optional; pass nil to disable those sync APIs. +func New(cfg *config.Config, db *sqlx.DB, s3Client *s3.Client, syncWorker *services.SyncWorker, cliSyncRunner ...*services.CLISyncRunner) *Server { // Create Gin engine gin.SetMode(gin.ReleaseMode) engine := gin.New() @@ -156,8 +157,12 @@ func New(cfg *config.Config, db *sqlx.DB, s3Client *s3.Client, syncWorker *servi // Create SyncHandler for cloud sync API var syncHandler *handlers.SyncHandler + var cliRunner *services.CLISyncRunner + if len(cliSyncRunner) > 0 { + cliRunner = cliSyncRunner[0] + } if db != nil { - syncHandler = handlers.NewSyncHandler(db, syncWorker) + syncHandler = handlers.NewSyncHandler(db, syncWorker, cliRunner) } s := &Server{ @@ -188,6 +193,7 @@ func New(cfg *config.Config, db *sqlx.DB, s3Client *s3.Client, syncWorker *servi productionDashboard: productionDashboardHandler, syncHandler: syncHandler, syncWorker: syncWorker, + cliSyncRunner: cliRunner, engine: engine, } @@ -487,6 +493,14 @@ func (s *Server) Shutdown(ctx context.Context) error { } } } + if s.cliSyncRunner != nil { + if err := s.cliSyncRunner.Stop(ctx); err != nil { + logShutdownError("CLI sync runner", err) + if shutdownErr == nil { + shutdownErr = fmt.Errorf("CLI sync runner shutdown: %w", err) + } + } + } return shutdownErr } diff --git a/internal/services/cli_sync_runner.go b/internal/services/cli_sync_runner.go new file mode 100644 index 0000000..5760c9f --- /dev/null +++ b/internal/services/cli_sync_runner.go @@ -0,0 +1,887 @@ +// SPDX-FileCopyrightText: 2026 ArcheBase +// +// SPDX-License-Identifier: MulanPSL-2.0 + +package services + +import ( + "bytes" + "context" + "database/sql" + "encoding/json" + "errors" + "fmt" + "io" + "os" + "os/exec" + "regexp" + "sort" + "strconv" + "strings" + "sync" + "sync/atomic" + "time" + + "archebase.com/keystone-edge/internal/logger" + 
"archebase.com/keystone-edge/internal/storage/s3" + + "github.com/jmoiron/sqlx" + "github.com/minio/minio-go/v7" +) + +const ( + cliSyncStatusPending = "pending" + cliSyncStatusInProgress = "in_progress" + cliSyncStatusCompleted = "completed" + cliSyncStatusFailed = "failed" + + cliSyncPollInterval = 30 * time.Second +) + +var ( + ErrCLISyncDisabled = errors.New("CLI sync is disabled") + ErrCLISyncNotRunning = errors.New("CLI sync runner is not running") + ErrCLISyncQueueFull = errors.New("CLI sync queue is full") + ErrCLISyncAlreadyActive = errors.New("CLI sync already active for episode") + ErrCLISyncNormalSyncActive = errors.New("normal cloud sync already active for episode") + ErrCLISyncEpisodeNotFound = errors.New("episode not found") + ErrCLISyncAlreadySynced = errors.New("episode already synced to cloud") + ErrCLISyncNotEligible = errors.New("episode is not eligible for CLI sync") +) + +// CLISyncRunnerConfig controls the dp CLI cloud sync sidepath. +type CLISyncRunnerConfig struct { + Enabled bool + DPBin string + DPConfigPath string + TempDir string + MaxConcurrent int + QueueSize int + TimeoutSec int + KeepTemp bool + MaxTags int + MaxTagBytes int +} + +// CLISyncRun is the API-facing representation of one CLI sync run. 
+//
+// LatestRun scans one cli_sync_runs row into this struct; the db tags name the
+// selected columns. Nullable columns use sql.Null* wrappers.
+type CLISyncRun struct {
+	ID              int64          `db:"id"`
+	EpisodeID       int64          `db:"episode_id"`
+	Status          string         `db:"status"`
+	SourcePath      sql.NullString `db:"source_path"`
+	TempPath        sql.NullString `db:"temp_path"`
+	DPConfigPath    sql.NullString `db:"dp_config_path"`
+	FileID          sql.NullString `db:"file_id"`
+	LogicalUploadID sql.NullString `db:"logical_upload_id"`
+	UploadID        sql.NullString `db:"upload_id"`
+	Bucket          sql.NullString `db:"bucket"`
+	ObjectKey       sql.NullString `db:"object_key"`
+	FileSize        sql.NullInt64  `db:"file_size"`
+	OSSObjectETag   sql.NullString `db:"oss_object_etag"`
+	DurationSec     sql.NullInt64  `db:"duration_sec"`
+	ErrorMessage    sql.NullString `db:"error_message"`
+	StartedAt       sql.NullTime   `db:"started_at"`
+	CompletedAt     sql.NullTime   `db:"completed_at"`
+}
+
+// cliSyncEpisode is the episode projection the runner loads before and during a run.
+type cliSyncEpisode struct {
+	// ID is episodes.id; EpisodePublicID is the separate episodes.episode_id column.
+	ID              int64          `db:"id"`
+	EpisodePublicID string         `db:"episode_id"`
+	QAStatus        string         `db:"qa_status"`
+	McapPath        string         `db:"mcap_path"`
+	SidecarPath     string         `db:"sidecar_path"`
+	CloudSynced     bool           `db:"cloud_synced"`
+	// RobotDeviceID is selected as the first non-blank of robots.device_id and
+	// workstations.robot_serial, so it is NULL when both are blank or missing.
+	RobotDeviceID   sql.NullString `db:"robot_device_id"`
+	TaskID          sql.NullInt64  `db:"task_id"`
+	FactoryID       sql.NullInt64  `db:"factory_id"`
+	OrganizationID  sql.NullInt64  `db:"organization_id"`
+}
+
+// cliUploadResult is the JSON object decoded from the stdout of the dp upload command.
+type cliUploadResult struct {
+	LogicalUploadID string `json:"logicalUploadId"`
+	UploadID        string `json:"uploadId"`
+	FileID          string `json:"fileId"`
+	Bucket          string `json:"bucket"`
+	ObjectKey       string `json:"objectKey"`
+	FileSize        int64  `json:"fileSize"`
+	OSSObjectETag   string `json:"ossObjectEtag"`
+}
+
+// CLISyncRunner owns the emergency dp CLI upload sidepath.
+type CLISyncRunner struct {
+	db          *sqlx.DB
+	minioClient *s3.Client
+	minioBucket string
+	cfg         CLISyncRunnerConfig
+
+	// runCh carries run IDs to the workers; it is buffered to cfg.QueueSize and
+	// stays nil on a runner constructed with Enabled=false.
+	runCh     chan int64
+	// running is true between Start and Stop.
+	running   atomic.Bool
+	// stopping is set by Stop. NOTE(review): no reader or reset is visible in this chunk — confirm.
+	stopping  atomic.Bool
+	// runCtx and runCancel are created by Start; Stop calls runCancel.
+	runCtx    context.Context
+	runCancel context.CancelFunc
+	// wg tracks the worker goroutines and the dispatcher.
+	wg        sync.WaitGroup
+	// mu serialises the Start/Stop state transitions.
+	mu        sync.Mutex
+}
+
+// NewCLISyncRunner creates a runner. Call Start before accepting enqueue requests.
+func NewCLISyncRunner(db *sqlx.DB, minioClient *s3.Client, minioBucket string, cfg CLISyncRunnerConfig) (*CLISyncRunner, error) {
+	// A disabled runner is returned without validation and without a queue.
+	if !cfg.Enabled {
+		return &CLISyncRunner{db: db, minioClient: minioClient, minioBucket: minioBucket, cfg: cfg}, nil
+	}
+	if db == nil {
+		return nil, fmt.Errorf("CLI sync requires database")
+	}
+	if minioClient == nil {
+		return nil, fmt.Errorf("CLI sync requires MinIO client")
+	}
+
+	// Non-positive limits fall back to the built-in defaults.
+	if cfg.MaxConcurrent <= 0 {
+		cfg.MaxConcurrent = 1
+	}
+	if cfg.QueueSize <= 0 {
+		cfg.QueueSize = 16
+	}
+	if cfg.TimeoutSec <= 0 {
+		cfg.TimeoutSec = 7200
+	}
+	if cfg.MaxTags <= 0 {
+		cfg.MaxTags = 128
+	}
+	if cfg.MaxTagBytes <= 0 {
+		cfg.MaxTagBytes = 65536
+	}
+
+	// Trim once so the value that is checked for emptiness is the value that is
+	// used; a whitespace-padded path would otherwise reach MkdirAll verbatim.
+	cfg.TempDir = strings.TrimSpace(cfg.TempDir)
+	if cfg.TempDir == "" {
+		cfg.TempDir = "/var/lib/keystone/cli-sync"
+	}
+	if err := os.MkdirAll(cfg.TempDir, 0o750); err != nil {
+		return nil, fmt.Errorf("create CLI sync temp dir: %w", err)
+	}
+
+	// Create and delete a probe file to fail fast when the directory is not writable.
+	probe, err := os.CreateTemp(cfg.TempDir, ".write-probe-*")
+	if err != nil {
+		return nil, fmt.Errorf("CLI sync temp dir is not writable: %w", err)
+	}
+	probePath := probe.Name()
+	if err := probe.Close(); err != nil {
+		_ = os.Remove(probePath) // best effort: the close error is the one reported
+		return nil, fmt.Errorf("close CLI sync temp probe: %w", err)
+	}
+	_ = os.Remove(probePath) // best effort: a leftover probe file is harmless
+
+	return &CLISyncRunner{
+		db:          db,
+		minioClient: minioClient,
+		minioBucket: minioBucket,
+		cfg:         cfg,
+		runCh:       make(chan int64, cfg.QueueSize),
+	}, nil
+}
+
+// IsEnabled reports whether the sidepath is configured.
+// It is safe to call on a nil runner, which reports false.
+func (r *CLISyncRunner) IsEnabled() bool {
+	return r != nil && r.cfg.Enabled
+}
+
+// IsRunning reports whether background workers are accepting runs.
+// It is safe to call on a nil runner, which reports false.
+func (r *CLISyncRunner) IsRunning() bool {
+	return r != nil && r.running.Load()
+}
+
+// Start starts background CLI sync workers.
+func (r *CLISyncRunner) Start() {
+	if r == nil || !r.cfg.Enabled {
+		return
+	}
+	r.mu.Lock()
+	// Only the caller that flips running from false to true launches goroutines.
+	if !r.running.CompareAndSwap(false, true) {
+		r.mu.Unlock()
+		return
+	}
+	r.runCtx, r.runCancel = context.WithCancel(context.Background())
+	runCtx := r.runCtx
+	// Register the workers and the dispatcher before releasing the lock. Stop
+	// takes the same lock before it waits, so its wg.Wait can no longer observe
+	// a zero counter while these goroutines are still about to be started.
+	r.wg.Add(r.cfg.MaxConcurrent + 1)
+	r.mu.Unlock()
+
+	for i := 0; i < r.cfg.MaxConcurrent; i++ {
+		go r.worker(runCtx)
+	}
+	go r.dispatcher(runCtx)
+
+	logger.Printf("[CLI-SYNC] Started (dp=%s concurrency=%d queue=%d)", r.cfg.DPBin, r.cfg.MaxConcurrent, r.cfg.QueueSize)
+}
+
+// Stop gracefully stops the runner.
+//
+// It cancels the run context and waits for the workers and the dispatcher to
+// return. If ctx ends first, Stop returns an error wrapping ctx.Err() and the
+// goroutines may still be running. Stop is a no-op on a nil, disabled or
+// already stopped runner.
+func (r *CLISyncRunner) Stop(ctx context.Context) error {
+	if r == nil || !r.cfg.Enabled {
+		return nil
+	}
+	r.mu.Lock()
+	if !r.running.Load() {
+		r.mu.Unlock()
+		return nil
+	}
+	r.running.Store(false)
+	r.stopping.Store(true)
+	cancel := r.runCancel
+	r.mu.Unlock()
+
+	if cancel != nil {
+		cancel()
+	}
+
+	// Wait in a helper goroutine so the wait can be abandoned when ctx ends.
+	done := make(chan struct{})
+	go func() {
+		r.wg.Wait()
+		close(done)
+	}()
+
+	select {
+	case <-done:
+		logger.Printf("[CLI-SYNC] Stopped")
+		return nil
+	case <-ctx.Done():
+		return fmt.Errorf("CLI sync runner shutdown: %w", ctx.Err())
+	}
+}
+
+// EnqueueEpisode creates a CLI sync run and schedules it for background processing.
+//
+// It returns the new run ID on success. It returns ErrCLISyncDisabled or
+// ErrCLISyncNotRunning when the runner cannot accept work, and passes through
+// any error from persistPendingRun. If the run was persisted but cannot be
+// handed to a worker, the run is marked failed and ctx.Err() or
+// ErrCLISyncQueueFull is returned.
+func (r *CLISyncRunner) EnqueueEpisode(ctx context.Context, episodeID int64) (int64, error) {
+	if r == nil || !r.cfg.Enabled {
+		return 0, ErrCLISyncDisabled
+	}
+	if !r.running.Load() {
+		return 0, ErrCLISyncNotRunning
+	}
+	runID, err := r.persistPendingRun(ctx, episodeID)
+	if err != nil {
+		return 0, err
+	}
+
+	// The default case makes this select non-blocking: a full queue is reported
+	// immediately instead of holding the caller.
+	select {
+	case r.runCh <- runID:
+		return runID, nil
+	case <-ctx.Done():
+		// ctx is already done, so record the failure under a fresh context.
+		r.markRunFailed(context.Background(), runID, time.Now(), ctx.Err())
+		return 0, ctx.Err()
+	default:
+		r.markRunFailed(context.Background(), runID, time.Now(), ErrCLISyncQueueFull)
+		return 0, ErrCLISyncQueueFull
+	}
+}
+
+// LatestRun returns the most recent CLI sync run for an episode.
+func (r *CLISyncRunner) LatestRun(ctx context.Context, episodeID int64) (*CLISyncRun, error) { + if r == nil || !r.cfg.Enabled { + return nil, ErrCLISyncDisabled + } + var row CLISyncRun + err := r.db.GetContext(ctx, &row, ` + SELECT + id, + episode_id, + status, + source_path, + temp_path, + dp_config_path, + file_id, + logical_upload_id, + upload_id, + bucket, + object_key, + file_size, + oss_object_etag, + duration_sec, + error_message, + started_at, + completed_at + FROM cli_sync_runs + WHERE episode_id = ? + ORDER BY id DESC + LIMIT 1 + `, episodeID) + if err == sql.ErrNoRows { + return nil, sql.ErrNoRows + } + if err != nil { + return nil, fmt.Errorf("query latest CLI sync run: %w", err) + } + return &row, nil +} + +func (r *CLISyncRunner) persistPendingRun(ctx context.Context, episodeID int64) (int64, error) { + tx, err := r.db.BeginTxx(ctx, nil) + if err != nil { + return 0, fmt.Errorf("begin CLI sync transaction: %w", err) + } + defer func() { _ = tx.Rollback() }() + + lockClause := txLockClause(tx) + var ep cliSyncEpisode + if err := tx.GetContext(ctx, &ep, ` + SELECT + e.id, + e.episode_id, + e.qa_status, + e.mcap_path, + e.sidecar_path, + e.cloud_synced, + COALESCE(NULLIF(TRIM(r.device_id), ''), NULLIF(TRIM(ws.robot_serial), '')) AS robot_device_id, + e.task_id, + e.factory_id, + e.organization_id + FROM episodes e + LEFT JOIN workstations ws ON ws.id = e.workstation_id AND ws.deleted_at IS NULL + LEFT JOIN robots r ON r.id = ws.robot_id AND r.deleted_at IS NULL + WHERE e.id = ? 
AND e.deleted_at IS NULL + `+lockClause, episodeID); err != nil { + if err == sql.ErrNoRows { + return 0, fmt.Errorf("%w: %d", ErrCLISyncEpisodeNotFound, episodeID) + } + return 0, fmt.Errorf("lock episode %d: %w", episodeID, err) + } + if ep.CloudSynced { + return 0, fmt.Errorf("%w: %d", ErrCLISyncAlreadySynced, episodeID) + } + if ep.QAStatus != "approved" && ep.QAStatus != "inspector_approved" { + return 0, fmt.Errorf("%w: qa_status=%s", ErrCLISyncNotEligible, ep.QAStatus) + } + if strings.TrimSpace(ep.McapPath) == "" { + return 0, fmt.Errorf("%w: empty mcap_path", ErrCLISyncNotEligible) + } + if strings.TrimSpace(ep.SidecarPath) == "" { + return 0, fmt.Errorf("%w: empty sidecar_path", ErrCLISyncNotEligible) + } + if strings.TrimSpace(ep.RobotDeviceID.String) == "" { + return 0, fmt.Errorf("%w: empty robot_device_id", ErrCLISyncNotEligible) + } + + var normalActive int + if err := tx.GetContext(ctx, &normalActive, ` + SELECT COUNT(*) + FROM sync_logs + WHERE episode_id = ? + AND status IN ('pending', 'in_progress') + `, episodeID); err != nil { + return 0, fmt.Errorf("query active normal sync count: %w", err) + } + if normalActive > 0 { + return 0, fmt.Errorf("%w: %d", ErrCLISyncNormalSyncActive, episodeID) + } + + var cliActive int + if err := tx.GetContext(ctx, &cliActive, ` + SELECT COUNT(*) + FROM cli_sync_runs + WHERE episode_id = ? + AND status IN ('pending', 'in_progress') + `, episodeID); err != nil { + return 0, fmt.Errorf("query active CLI sync count: %w", err) + } + if cliActive > 0 { + return 0, fmt.Errorf("%w: %d", ErrCLISyncAlreadyActive, episodeID) + } + + now := time.Now().UTC() + result, err := tx.ExecContext(ctx, ` + INSERT INTO cli_sync_runs (episode_id, status, source_path, dp_config_path, created_at, updated_at) + VALUES (?, 'pending', ?, ?, ?, ?) 
+ `, episodeID, ep.McapPath, r.cfg.DPConfigPath, now, now) + if err != nil { + return 0, fmt.Errorf("insert CLI sync run: %w", err) + } + runID, err := result.LastInsertId() + if err != nil { + return 0, fmt.Errorf("CLI sync run last insert id: %w", err) + } + if err := tx.Commit(); err != nil { + return 0, fmt.Errorf("commit CLI sync run: %w", err) + } + return runID, nil +} + +func (r *CLISyncRunner) dispatcher(ctx context.Context) { + defer r.wg.Done() + r.dispatchPendingRuns(ctx) + ticker := time.NewTicker(cliSyncPollInterval) + defer ticker.Stop() + for { + select { + case <-ctx.Done(): + return + case <-ticker.C: + r.dispatchPendingRuns(ctx) + } + } +} + +func (r *CLISyncRunner) dispatchPendingRuns(ctx context.Context) { + var ids []int64 + if err := r.db.SelectContext(ctx, &ids, ` + SELECT id + FROM cli_sync_runs + WHERE status = 'pending' + ORDER BY id ASC + LIMIT ? + `, r.cfg.QueueSize); err != nil { + if ctx.Err() == nil { + logger.Printf("[CLI-SYNC] Failed to query pending runs: %v", err) + } + return + } + for _, id := range ids { + select { + case r.runCh <- id: + default: + return + } + } +} + +func (r *CLISyncRunner) worker(ctx context.Context) { + defer r.wg.Done() + for { + select { + case <-ctx.Done(): + return + case runID := <-r.runCh: + r.processRun(ctx, runID) + } + } +} + +func (r *CLISyncRunner) processRun(parent context.Context, runID int64) { + startedAt := time.Now().UTC() + claimed, err := r.claimRun(parent, runID, startedAt) + if err != nil { + logger.Printf("[CLI-SYNC] Failed to claim run %d: %v", runID, err) + return + } + if !claimed { + return + } + logger.Printf("[CLI-SYNC] Run %d claimed", runID) + + ctx, cancel := context.WithTimeout(parent, time.Duration(r.cfg.TimeoutSec)*time.Second) + defer cancel() + + var ep cliSyncEpisode + if err := r.loadEpisodeForRun(ctx, runID, &ep); err != nil { + r.markRunFailed(context.Background(), runID, startedAt, err) + return + } + deviceID := strings.TrimSpace(ep.RobotDeviceID.String) + if 
deviceID == "" { + r.markRunFailed(context.Background(), runID, startedAt, fmt.Errorf("%w: empty robot_device_id", ErrCLISyncNotEligible)) + return + } + logger.Printf("[CLI-SYNC] Run %d loaded episode: episode_id=%d public_id=%s qa_status=%s device_id=%s mcap_path=%s sidecar_path=%s", + runID, ep.ID, ep.EpisodePublicID, ep.QAStatus, deviceID, ep.McapPath, ep.SidecarPath) + + tags, err := r.buildTagsFromEpisode(ctx, ep) + if err != nil { + r.markRunFailed(context.Background(), runID, startedAt, err) + return + } + logger.Printf("[CLI-SYNC] Run %d built upload tags: episode_id=%d tag_count=%d", runID, ep.ID, len(tags)) + + mcapKey := stripBucketPrefix(ep.McapPath) + if mcapKey == "" { + r.markRunFailed(context.Background(), runID, startedAt, fmt.Errorf("empty mcap_path")) + return + } + logger.Printf("[CLI-SYNC] Run %d staging MCAP from MinIO: episode_id=%d bucket=%s key=%s temp_dir=%s", + runID, ep.ID, r.minioBucket, mcapKey, r.cfg.TempDir) + + tempPath, fileSize, err := r.stageMcap(ctx, ep.ID, mcapKey) + if err != nil { + r.markRunFailed(context.Background(), runID, startedAt, err) + return + } + logger.Printf("[CLI-SYNC] Run %d staged MCAP: episode_id=%d temp_path=%s size=%d bytes", + runID, ep.ID, tempPath, fileSize) + if !r.cfg.KeepTemp { + defer func() { _ = os.Remove(tempPath) }() + } + if err := r.setRunTempPath(context.Background(), runID, tempPath); err != nil { + logger.Printf("[CLI-SYNC] Failed to update temp path for run %d: %v", runID, err) + } + + uploadStartedAt := time.Now() + logger.Printf("[CLI-SYNC] Run %d starting dp upload: episode_id=%d dp_bin=%s device_id=%s tag_count=%d file_size=%d", + runID, ep.ID, r.cfg.DPBin, deviceID, len(tags), fileSize) + result, stdoutJSON, err := r.runDPUpload(ctx, tempPath, tags, deviceID) + if err != nil { + r.markRunFailed(context.Background(), runID, startedAt, err) + return + } + logger.Printf("[CLI-SYNC] Run %d dp upload finished: episode_id=%d elapsed=%s file_id=%s logical_upload_id=%s object_key=%s", + 
runID, ep.ID, time.Since(uploadStartedAt).Round(time.Millisecond), result.FileID, result.LogicalUploadID, result.ObjectKey) + if result.FileSize <= 0 { + result.FileSize = fileSize + } + if err := validateCLIUploadResult(result); err != nil { + r.markRunFailed(context.Background(), runID, startedAt, err) + return + } + + if err := r.markRunCompleted(context.Background(), runID, ep, result, stdoutJSON, startedAt); err != nil { + logger.Printf("[CLI-SYNC] Failed to mark run %d completed: %v", runID, err) + r.markRunFailed(context.Background(), runID, startedAt, err) + return + } + logger.Printf("[CLI-SYNC] Episode %d CLI synced: run_id=%d file_id=%s logical_upload_id=%s object_key=%s", + ep.ID, runID, result.FileID, result.LogicalUploadID, result.ObjectKey) +} + +func (r *CLISyncRunner) loadEpisodeForRun(ctx context.Context, runID int64, ep *cliSyncEpisode) error { + if err := r.db.GetContext(ctx, ep, ` + SELECT + e.id, + e.episode_id, + e.qa_status, + e.mcap_path, + e.sidecar_path, + e.cloud_synced, + COALESCE(NULLIF(TRIM(r.device_id), ''), NULLIF(TRIM(ws.robot_serial), '')) AS robot_device_id, + e.task_id, + e.factory_id, + e.organization_id + FROM cli_sync_runs csr + INNER JOIN episodes e ON e.id = csr.episode_id AND e.deleted_at IS NULL + LEFT JOIN workstations ws ON ws.id = e.workstation_id AND ws.deleted_at IS NULL + LEFT JOIN robots r ON r.id = ws.robot_id AND r.deleted_at IS NULL + WHERE csr.id = ? + `, runID); err != nil { + if err == sql.ErrNoRows { + return fmt.Errorf("%w for CLI sync run %d", ErrCLISyncEpisodeNotFound, runID) + } + return fmt.Errorf("load episode for CLI sync run %d: %w", runID, err) + } + return nil +} + +func (r *CLISyncRunner) claimRun(ctx context.Context, runID int64, startedAt time.Time) (bool, error) { + res, err := r.db.ExecContext(ctx, ` + UPDATE cli_sync_runs + SET status = 'in_progress', + started_at = ?, + error_message = NULL, + updated_at = ? + WHERE id = ? 
+ AND status = 'pending' + `, startedAt, startedAt, runID) + if err != nil { + return false, fmt.Errorf("claim CLI sync run: %w", err) + } + n, err := res.RowsAffected() + if err != nil { + return false, fmt.Errorf("claim CLI sync rows affected: %w", err) + } + return n == 1, nil +} + +func (r *CLISyncRunner) buildTagsFromEpisode(ctx context.Context, ep cliSyncEpisode) (map[string]string, error) { + sidecarTags, err := r.tagsFromSidecar(ctx, ep.SidecarPath) + if err != nil { + return nil, err + } + + tags := make(map[string]string, len(sidecarTags)+6) + for k, v := range sidecarTags { + tags[k] = v + } + tags["episode_id"] = ep.EpisodePublicID + tags["keystone_episode_id"] = strconv.FormatInt(ep.ID, 10) + tags["sync_channel"] = "keystone_cli" + if deviceID := strings.TrimSpace(ep.RobotDeviceID.String); deviceID != "" { + tags["device_id"] = deviceID + } + if ep.TaskID.Valid { + tags["task_id"] = strconv.FormatInt(ep.TaskID.Int64, 10) + } + if ep.FactoryID.Valid { + tags["factory_id"] = strconv.FormatInt(ep.FactoryID.Int64, 10) + } + if ep.OrganizationID.Valid { + tags["organization_id"] = strconv.FormatInt(ep.OrganizationID.Int64, 10) + } + + if err := r.validateTags(tags); err != nil { + return nil, err + } + return tags, nil +} + +func (r *CLISyncRunner) tagsFromSidecar(ctx context.Context, sidecarPath string) (map[string]string, error) { + key := stripBucketPrefix(sidecarPath) + if key == "" { + return nil, fmt.Errorf("empty sidecar_path") + } + startedAt := time.Now() + logger.Printf("[CLI-SYNC] Reading sidecar from MinIO: bucket=%s key=%s", r.minioBucket, key) + obj, err := r.minioClient.GetObject(ctx, r.minioBucket, key, minio.GetObjectOptions{}) + if err != nil { + return nil, fmt.Errorf("get sidecar object %s: %w", key, err) + } + defer func() { _ = obj.Close() }() + + data, err := io.ReadAll(obj) + if err != nil { + return nil, fmt.Errorf("read sidecar object %s: %w", key, err) + } + tags, err := flattenSidecarScalars(data) + if err != nil { + return nil, 
fmt.Errorf("flatten sidecar %s: %w", key, err) + } + logger.Printf("[CLI-SYNC] Read sidecar complete: bucket=%s key=%s bytes=%d scalar_tag_count=%d elapsed=%s", + r.minioBucket, key, len(data), len(tags), time.Since(startedAt).Round(time.Millisecond)) + return tags, nil +} + +func (r *CLISyncRunner) validateTags(tags map[string]string) error { + if len(tags) > r.cfg.MaxTags { + return fmt.Errorf("too many CLI sync tags: %d > %d", len(tags), r.cfg.MaxTags) + } + totalBytes := 0 + for key, value := range tags { + key = strings.TrimSpace(key) + if key == "" { + return fmt.Errorf("CLI sync tag key is empty") + } + if strings.ContainsAny(key, ",=") { + return fmt.Errorf("CLI sync tag key %q contains unsupported characters", key) + } + totalBytes += len(key) + 1 + len(encodeDPTagValue(value)) + } + if totalBytes > r.cfg.MaxTagBytes { + return fmt.Errorf("CLI sync tags too large: %d > %d bytes", totalBytes, r.cfg.MaxTagBytes) + } + return nil +} + +func (r *CLISyncRunner) stageMcap(ctx context.Context, episodeID int64, mcapKey string) (string, int64, error) { + startedAt := time.Now() + obj, err := r.minioClient.GetObject(ctx, r.minioBucket, mcapKey, minio.GetObjectOptions{}) + if err != nil { + return "", 0, fmt.Errorf("get MCAP object %s: %w", mcapKey, err) + } + defer func() { _ = obj.Close() }() + + tmp, err := os.CreateTemp(r.cfg.TempDir, fmt.Sprintf("episode-%d-*.mcap", episodeID)) + if err != nil { + return "", 0, fmt.Errorf("create CLI sync temp file: %w", err) + } + tempPath := tmp.Name() + cleanup := true + defer func() { + _ = tmp.Close() + if cleanup { + _ = os.Remove(tempPath) + } + }() + + size, err := io.Copy(tmp, obj) + if err != nil { + return "", 0, fmt.Errorf("write CLI sync temp file: %w", err) + } + if err := tmp.Close(); err != nil { + return "", 0, fmt.Errorf("close CLI sync temp file: %w", err) + } + if size <= 0 { + return "", 0, fmt.Errorf("zero-byte MCAP cannot be CLI synced") + } + cleanup = false + logger.Printf("[CLI-SYNC] MCAP download 
complete: episode_id=%d bucket=%s key=%s temp_path=%s size=%d elapsed=%s", + episodeID, r.minioBucket, mcapKey, tempPath, size, time.Since(startedAt).Round(time.Millisecond)) + return tempPath, size, nil +} + +func (r *CLISyncRunner) runDPUpload(ctx context.Context, tempPath string, tags map[string]string, deviceID string) (*cliUploadResult, string, error) { + deviceID = strings.TrimSpace(deviceID) + if deviceID == "" { + return nil, "", fmt.Errorf("dp device id is required") + } + args := []string{ + "--config", r.cfg.DPConfigPath, + "--json", + "data", "upload", tempPath, + "--device", deviceID, + "--hint", "source=keystone_cli_sync", + } + + keys := make([]string, 0, len(tags)) + for key := range tags { + keys = append(keys, key) + } + sort.Strings(keys) + for _, key := range keys { + args = append(args, "--tag", key+"="+encodeDPTagValue(tags[key])) + } + logger.Printf("[CLI-SYNC] Prepared dp command: dp_bin=%s file=%s device_id=%s tag_count=%d hint_count=1", + r.cfg.DPBin, tempPath, deviceID, len(tags)) + + cmd := exec.CommandContext(ctx, r.cfg.DPBin, args...) 
+ var stdout bytes.Buffer + var stderr bytes.Buffer + cmd.Stdout = &stdout + cmd.Stderr = &stderr + + if err := cmd.Run(); err != nil { + output := strings.TrimSpace(stderr.String()) + if output == "" { + output = strings.TrimSpace(stdout.String()) + } + return nil, "", fmt.Errorf("dp data upload failed: %s", sanitizeCLIOutput(output, err)) + } + + stdoutText := strings.TrimSpace(stdout.String()) + var result cliUploadResult + if err := json.Unmarshal([]byte(stdoutText), &result); err != nil { + return nil, "", fmt.Errorf("parse dp upload JSON: %w", err) + } + return &result, stdoutText, nil +} + +func validateCLIUploadResult(result *cliUploadResult) error { + if result == nil { + return fmt.Errorf("dp upload result is empty") + } + if strings.TrimSpace(result.FileID) == "" { + return fmt.Errorf("dp upload result missing fileId") + } + if strings.TrimSpace(result.LogicalUploadID) == "" { + return fmt.Errorf("dp upload result missing logicalUploadId") + } + if strings.TrimSpace(result.ObjectKey) == "" { + return fmt.Errorf("dp upload result missing objectKey") + } + if result.FileSize <= 0 { + return fmt.Errorf("dp upload result has invalid fileSize") + } + return nil +} + +func (r *CLISyncRunner) setRunTempPath(ctx context.Context, runID int64, tempPath string) error { + _, err := r.db.ExecContext(ctx, ` + UPDATE cli_sync_runs + SET temp_path = ?, updated_at = ? + WHERE id = ? 
+ `, tempPath, time.Now().UTC(), runID) + return err +} + +func (r *CLISyncRunner) markRunCompleted(ctx context.Context, runID int64, ep cliSyncEpisode, result *cliUploadResult, stdoutJSON string, startedAt time.Time) error { + now := time.Now().UTC() + durationSec := int64(now.Sub(startedAt).Seconds()) + + tx, err := r.db.BeginTxx(ctx, nil) + if err != nil { + return fmt.Errorf("begin CLI sync completion transaction: %w", err) + } + defer func() { _ = tx.Rollback() }() + + lockClause := txLockClause(tx) + var cloudSynced bool + if err := tx.GetContext(ctx, &cloudSynced, ` + SELECT cloud_synced + FROM episodes + WHERE id = ? AND deleted_at IS NULL + `+lockClause, ep.ID); err != nil { + return fmt.Errorf("lock episode for CLI sync completion: %w", err) + } + + if _, err := tx.ExecContext(ctx, ` + UPDATE cli_sync_runs + SET status = 'completed', + file_id = ?, + logical_upload_id = ?, + upload_id = ?, + bucket = ?, + object_key = ?, + file_size = ?, + oss_object_etag = ?, + duration_sec = ?, + error_message = NULL, + stdout_json = ?, + completed_at = ?, + updated_at = ? + WHERE id = ? + `, result.FileID, result.LogicalUploadID, nullableStringValue(result.UploadID), result.Bucket, result.ObjectKey, + result.FileSize, result.OSSObjectETag, durationSec, stdoutJSON, now, now, runID); err != nil { + return fmt.Errorf("update CLI sync run completed: %w", err) + } + + if cloudSynced { + return tx.Commit() + } + + if _, err := tx.ExecContext(ctx, ` + INSERT INTO sync_logs (episode_id, source_path, destination_path, status, bytes_transferred, duration_sec, attempt_count, started_at, completed_at) + VALUES (?, ?, ?, 'completed', ?, ?, 1, ?, ?) + `, ep.ID, ep.McapPath, result.ObjectKey, result.FileSize, durationSec, startedAt, now); err != nil { + return fmt.Errorf("insert CLI sync completed log: %w", err) + } + + if _, err := tx.ExecContext(ctx, ` + UPDATE episodes + SET cloud_synced = TRUE, + cloud_synced_at = ?, + cloud_mcap_path = ?, + cloud_processed = FALSE + WHERE id = ? 
AND deleted_at IS NULL + `, now, result.ObjectKey, ep.ID); err != nil { + return fmt.Errorf("update episode CLI sync cloud state: %w", err) + } + + if err := tx.Commit(); err != nil { + return fmt.Errorf("commit CLI sync completion: %w", err) + } + return nil +} + +func (r *CLISyncRunner) markRunFailed(ctx context.Context, runID int64, startedAt time.Time, runErr error) { + now := time.Now().UTC() + durationSec := int64(now.Sub(startedAt).Seconds()) + msg := sanitizeCLIOutput("", runErr) + if msg == "" && runErr != nil { + msg = runErr.Error() + } + logger.Printf("[CLI-SYNC] Run %d failed: duration=%ds error=%s", runID, durationSec, msg) + if _, err := r.db.ExecContext(ctx, ` + UPDATE cli_sync_runs + SET status = 'failed', + duration_sec = ?, + error_message = ?, + completed_at = ?, + updated_at = ? + WHERE id = ? + `, durationSec, msg, now, now, runID); err != nil { + logger.Printf("[CLI-SYNC] Failed to mark run %d failed: %v", runID, err) + } +} + +func nullableStringValue(value string) interface{} { + if strings.TrimSpace(value) == "" { + return nil + } + return value +} + +var cliSecretPattern = regexp.MustCompile(`(?i)(authorization|access[_-]?key|secret|token|password|api[_-]?key)(["'=:\s]+)([^,\s"}]+)`) + +func encodeDPTagValue(value string) string { + value = strings.ReplaceAll(value, `%`, `%25`) + value = strings.ReplaceAll(value, `,`, `%2C`) + return value +} + +func sanitizeCLIOutput(output string, err error) string { + text := strings.TrimSpace(output) + if text == "" && err != nil { + text = err.Error() + } + text = cliSecretPattern.ReplaceAllString(text, `$1$2`) + if len(text) > 4096 { + text = text[:4096] + "..." 
+ } + return text +} diff --git a/internal/services/sidecar_tags.go b/internal/services/sidecar_tags.go index 9f976da..ad3fcdf 100644 --- a/internal/services/sidecar_tags.go +++ b/internal/services/sidecar_tags.go @@ -27,6 +27,19 @@ func flattenSidecar(data []byte) (map[string]string, error) { return result, nil } +// flattenSidecarScalars parses sidecar JSON for dp CLI upload tags. +// It skips arrays because the current dp CLI parser treats commas as tag separators. +func flattenSidecarScalars(data []byte) (map[string]string, error) { + var raw map[string]interface{} + if err := json.Unmarshal(data, &raw); err != nil { + return nil, fmt.Errorf("parse sidecar json: %w", err) + } + + result := make(map[string]string) + flattenScalarValue(result, "", raw) + return result, nil +} + func flattenValue(out map[string]string, prefix string, v interface{}) { switch val := v.(type) { case map[string]interface{}: @@ -60,6 +73,34 @@ func flattenValue(out map[string]string, prefix string, v interface{}) { } } +func flattenScalarValue(out map[string]string, prefix string, v interface{}) { + switch val := v.(type) { + case map[string]interface{}: + for k, child := range val { + if prefix == "" && k == "topics_summary" { + continue + } + flattenScalarValue(out, joinKey(prefix, k), child) + } + case []interface{}: + return + case nil: + out[prefix] = "" + case bool: + if val { + out[prefix] = "true" + } else { + out[prefix] = "false" + } + case float64: + out[prefix] = strconv.FormatFloat(val, 'f', -1, 64) + case string: + out[prefix] = val + default: + out[prefix] = fmt.Sprintf("%v", val) + } +} + func joinKey(prefix, key string) string { if prefix == "" { return key diff --git a/internal/services/sidecar_tags_test.go b/internal/services/sidecar_tags_test.go index d945c4b..e298673 100644 --- a/internal/services/sidecar_tags_test.go +++ b/internal/services/sidecar_tags_test.go @@ -108,6 +108,45 @@ func TestFlattenSidecar_ArraysEncodedAsJSONString(t *testing.T) { } } +func 
TestFlattenSidecarScalars_SkipsArrays(t *testing.T) { + tags, err := flattenSidecarScalars([]byte(testSidecarJSON)) + if err != nil { + t.Fatalf("flattenSidecarScalars failed: %v", err) + } + + cases := map[string]string{ + "device.device_id": "robot_01", + "recording.file_size_bytes": "147960982", + "task.data_collector_id": "刘备", + "recording.message_count": "222251", + "recording.recorder_version": "0.3.1", + } + + for key, want := range cases { + got, ok := tags[key] + if !ok { + t.Errorf("key %q missing from tags", key) + continue + } + if got != want { + t.Errorf("tags[%q] = %q, want %q", key, got, want) + } + } + + for _, key := range []string{"recording.topics_recorded", "task.skills", "topics_summary"} { + if _, ok := tags[key]; ok { + t.Errorf("array or excluded key %q should not be included", key) + } + } +} + +func TestEncodeDPTagValue(t *testing.T) { + got := encodeDPTagValue("a,b%") + if got != "a%2Cb%25" { + t.Fatalf("encodeDPTagValue() = %q, want %q", got, "a%2Cb%25") + } +} + func TestFlattenSidecar_TopicsSummaryExcluded(t *testing.T) { tags, err := flattenSidecar([]byte(testSidecarJSON)) if err != nil { diff --git a/internal/storage/database/migrations/000004_cli_sync_runs.down.sql b/internal/storage/database/migrations/000004_cli_sync_runs.down.sql new file mode 100644 index 0000000..6493ded --- /dev/null +++ b/internal/storage/database/migrations/000004_cli_sync_runs.down.sql @@ -0,0 +1,5 @@ +-- SPDX-FileCopyrightText: 2026 ArcheBase +-- +-- SPDX-License-Identifier: MulanPSL-2.0 + +DROP TABLE IF EXISTS cli_sync_runs; diff --git a/internal/storage/database/migrations/000004_cli_sync_runs.up.sql b/internal/storage/database/migrations/000004_cli_sync_runs.up.sql new file mode 100644 index 0000000..5d56f17 --- /dev/null +++ b/internal/storage/database/migrations/000004_cli_sync_runs.up.sql @@ -0,0 +1,29 @@ +-- SPDX-FileCopyrightText: 2026 ArcheBase +-- +-- SPDX-License-Identifier: MulanPSL-2.0 + +CREATE TABLE IF NOT EXISTS cli_sync_runs ( + id 
BIGINT AUTO_INCREMENT PRIMARY KEY, + episode_id BIGINT NOT NULL, + status ENUM('pending', 'in_progress', 'completed', 'failed') NOT NULL DEFAULT 'pending', + source_path VARCHAR(1024), + temp_path VARCHAR(1024), + dp_config_path VARCHAR(1024), + file_id VARCHAR(255), + logical_upload_id VARCHAR(255), + upload_id VARCHAR(255), + bucket VARCHAR(255), + object_key VARCHAR(1024), + file_size BIGINT, + oss_object_etag VARCHAR(255), + duration_sec INT, + error_message TEXT, + stdout_json JSON DEFAULT NULL, + started_at TIMESTAMP NULL, + completed_at TIMESTAMP NULL, + created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, + updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP, + INDEX idx_cli_sync_episode (episode_id), + INDEX idx_cli_sync_status (status), + INDEX idx_cli_sync_created (created_at) +) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4; From 00849b31c3319311e64b700c76a85626ea68a1c1 Mon Sep 17 00:00:00 2001 From: chaoliu Date: Thu, 4 Jun 2026 13:03:07 +0800 Subject: [PATCH 2/7] feat(sync): use direct cloud upload --- cmd/keystone-edge/main.go | 73 +- docker/.env.example | 10 +- docs/designs/cli-cloud-sync-sidepath.md | 6 + docs/designs/cli-cloud-sync-sidepath.zh.html | 1 + .../cloud-sync-go-direct-upload.zh.html | 825 ++++++++++++++++ internal/api/handlers/robot.go | 135 ++- internal/api/handlers/robot_test.go | 192 +++- internal/api/handlers/sync.go | 192 +--- internal/api/handlers/transfer.go | 31 +- .../transfer_asset_id_snapshot_test.go | 95 ++ internal/cloud/cloudpb/data_gateway.pb.go | 13 +- .../cloud/cloudpb/proto/data_gateway.proto | 1 + internal/cloud/gateway_client.go | 3 +- internal/cloud/gateway_client_test.go | 83 ++ internal/cloud/uploader.go | 189 +++- internal/cloud/uploader_test.go | 257 ++++- internal/config/config.go | 111 +-- internal/config/config_test.go | 68 +- internal/server/server.go | 20 +- internal/services/cli_sync_runner.go | 887 ------------------ internal/services/dp_asset_resolver.go | 61 ++ 
internal/services/dp_asset_resolver_test.go | 112 +++ internal/services/dp_config_loader.go | 167 ++++ internal/services/dp_config_loader_test.go | 176 ++++ internal/services/dp_raw_tags.go | 96 ++ internal/services/dp_raw_tags_test.go | 165 ++++ internal/services/sidecar_tags.go | 41 - internal/services/sidecar_tags_test.go | 39 - internal/services/sync_errors.go | 45 + internal/services/sync_worker.go | 258 ++++- internal/services/sync_worker_test.go | 201 +++- .../migrations/000004_cli_sync_runs.down.sql | 5 - .../migrations/000004_cli_sync_runs.up.sql | 29 - .../migrations/000004_robot_asset_id.down.sql | 7 + .../migrations/000004_robot_asset_id.up.sql | 14 + 35 files changed, 3110 insertions(+), 1498 deletions(-) create mode 100644 docs/designs/cloud-sync-go-direct-upload.zh.html create mode 100644 internal/api/handlers/transfer_asset_id_snapshot_test.go create mode 100644 internal/cloud/gateway_client_test.go delete mode 100644 internal/services/cli_sync_runner.go create mode 100644 internal/services/dp_asset_resolver.go create mode 100644 internal/services/dp_asset_resolver_test.go create mode 100644 internal/services/dp_config_loader.go create mode 100644 internal/services/dp_config_loader_test.go create mode 100644 internal/services/dp_raw_tags.go create mode 100644 internal/services/dp_raw_tags_test.go create mode 100644 internal/services/sync_errors.go delete mode 100644 internal/storage/database/migrations/000004_cli_sync_runs.down.sql delete mode 100644 internal/storage/database/migrations/000004_cli_sync_runs.up.sql create mode 100644 internal/storage/database/migrations/000004_robot_asset_id.down.sql create mode 100644 internal/storage/database/migrations/000004_robot_asset_id.up.sql diff --git a/cmd/keystone-edge/main.go b/cmd/keystone-edge/main.go index b376890..828437b 100644 --- a/cmd/keystone-edge/main.go +++ b/cmd/keystone-edge/main.go @@ -17,7 +17,6 @@ import ( "github.com/joho/godotenv" - "archebase.com/keystone-edge/internal/cloud" 
"archebase.com/keystone-edge/internal/config" "archebase.com/keystone-edge/internal/logger" "archebase.com/keystone-edge/internal/server" @@ -115,46 +114,8 @@ func main() { // Initialize cloud sync worker var syncWorker *services.SyncWorker - if cfg.Sync.Enabled && cfg.Sync.AuthEndpoint != "" && cfg.Sync.GatewayEndpoint != "" && s3Client != nil { - authClient := cloud.NewAuthClient(cloud.AuthClientConfig{ - Endpoint: cfg.Sync.AuthEndpoint, - UseTLS: cfg.Sync.CloudUseTLS, - TLSCAFile: cfg.Sync.CloudTLSCAFile, - TLSServerName: cfg.Sync.CloudTLSServerName, - APIKey: cfg.Sync.APIKey, - RefreshBefore: 60 * time.Second, - }) - - gatewayClient := cloud.NewGatewayClient(cloud.GatewayClientConfig{ - Endpoint: cfg.Sync.GatewayEndpoint, - UseTLS: cfg.Sync.CloudUseTLS, - TLSCAFile: cfg.Sync.CloudTLSCAFile, - TLSServerName: cfg.Sync.CloudTLSServerName, - RequestTimeout: time.Duration(cfg.Sync.RequestTimeoutSec) * time.Second, - }, authClient) - // Close gateway client before auth client (LIFO defer order). 
- defer func() { - if err := authClient.Close(); err != nil { - logger.Printf("[SYNC] Failed to close auth client: %v", err) - } - }() - defer func() { - if err := gatewayClient.Close(); err != nil { - logger.Printf("[SYNC] Failed to close gateway client: %v", err) - } - }() - - uploader, err := cloud.NewUploader(gatewayClient, s3Client, cfg.Storage.Bucket, cloud.UploaderConfig{ - RequestTimeout: time.Duration(cfg.Sync.RequestTimeoutSec) * time.Second, - OSSTimeout: time.Duration(cfg.Sync.OSSTimeoutSec) * time.Second, - PersistRootDir: cfg.Sync.PersistRootDir, - MaxRestartCount: uint32(cfg.Sync.MaxRestartCount), //nolint:gosec // non-negative guaranteed by config.Validate() - }) - if err != nil { - logger.Fatalf("[SYNC] Failed to initialise uploader: %v", err) - } - - syncWorker = services.NewSyncWorker(db.DB, uploader, s3Client, cfg.Storage.Bucket, services.SyncWorkerConfig{ + if cfg.Sync.Enabled && cfg.Sync.DPConfigPath != "" && s3Client != nil { + syncWorker = services.NewSyncWorker(db.DB, nil, s3Client, cfg.Storage.Bucket, services.SyncWorkerConfig{ BatchSize: cfg.Sync.BatchSize, MaxConcurrent: cfg.Sync.MaxConcurrent, MaxRetries: cfg.Sync.MaxRetries, @@ -166,37 +127,13 @@ func main() { }, &cfg.Sync) syncWorker.Start() - logger.Printf("[SYNC] Cloud sync worker started: auth=%s gateway=%s auto_scan=%t", cfg.Sync.AuthEndpoint, cfg.Sync.GatewayEndpoint, cfg.Sync.AutoScanEnabled) + logger.Printf("[SYNC] Cloud sync worker started: dp_config=%s auto_scan=%t", cfg.Sync.DPConfigPath, cfg.Sync.AutoScanEnabled) } else { - logger.Println("[SYNC] Cloud sync disabled (KEYSTONE_SYNC_ENABLED=false or missing endpoints)") - } - - var cliSyncRunner *services.CLISyncRunner - if cfg.CLISync.Enabled && s3Client != nil { - var err error - cliSyncRunner, err = services.NewCLISyncRunner(db.DB, s3Client, cfg.Storage.Bucket, services.CLISyncRunnerConfig{ - Enabled: cfg.CLISync.Enabled, - DPBin: cfg.CLISync.DPBin, - DPConfigPath: cfg.CLISync.DPConfigPath, - TempDir: cfg.CLISync.TempDir, 
- MaxConcurrent: cfg.CLISync.MaxConcurrent, - QueueSize: cfg.CLISync.QueueSize, - TimeoutSec: cfg.CLISync.TimeoutSec, - KeepTemp: cfg.CLISync.KeepTemp, - MaxTags: cfg.CLISync.MaxTags, - MaxTagBytes: cfg.CLISync.MaxTagBytes, - }) - if err != nil { - logger.Fatalf("[CLI-SYNC] Failed to initialise CLI sync runner: %v", err) - } - cliSyncRunner.Start() - logger.Printf("[CLI-SYNC] CLI sync runner started: dp=%s config=%s", cfg.CLISync.DPBin, cfg.CLISync.DPConfigPath) - } else if cfg.CLISync.Enabled { - logger.Println("[CLI-SYNC] CLI sync disabled because S3/MinIO is unavailable") + logger.Println("[SYNC] Cloud sync disabled (KEYSTONE_SYNC_ENABLED=false, missing KEYSTONE_SYNC_DP_CONFIG, or S3 unavailable)") } // Initialize and start HTTP server - srv := server.New(cfg, db.DB, s3Client, syncWorker, cliSyncRunner) + srv := server.New(cfg, db.DB, s3Client, syncWorker) if err := srv.Start(); err != nil { logger.Fatalf("[SERVER] Failed to start server: %v", err) } diff --git a/docker/.env.example b/docker/.env.example index 7d1c145..34ec451 100644 --- a/docker/.env.example +++ b/docker/.env.example @@ -42,15 +42,7 @@ KEYSTONE_MINIO_USE_SSL=false KEYSTONE_SYNC_ENABLED=true KEYSTONE_SYNC_BATCH_SIZE=10 KEYSTONE_SYNC_MAX_RETRIES=5 -KEYSTONE_CLOUD_AUTH_ENDPOINT=127.0.0.1:50051 -KEYSTONE_CLOUD_GATEWAY_ENDPOINT=127.0.0.1:50053 -KEYSTONE_CLOUD_USE_TLS=false -# Optional: custom CA bundle for TLS verification (PEM). -# KEYSTONE_CLOUD_TLS_CA_FILE=/etc/ssl/certs/your-ca.pem -# Optional: override TLS server name (SNI / verification), useful when endpoint is an IP. -# KEYSTONE_CLOUD_TLS_SERVER_NAME=cloud.example.com -# API key issued by the data-platform (base64url, no padding). 
-KEYSTONE_CLOUD_API_KEY=your-api-key-here +KEYSTONE_SYNC_DP_CONFIG=~/.archebase/config.json KEYSTONE_SYNC_WORKER_INTERVAL=15 KEYSTONE_SYNC_REQUEST_TIMEOUT=30 KEYSTONE_SYNC_OSS_TIMEOUT=120 diff --git a/docs/designs/cli-cloud-sync-sidepath.md b/docs/designs/cli-cloud-sync-sidepath.md index 9bbe508..14a8681 100644 --- a/docs/designs/cli-cloud-sync-sidepath.md +++ b/docs/designs/cli-cloud-sync-sidepath.md @@ -6,6 +6,12 @@ SPDX-License-Identifier: MulanPSL-2.0 # CLI Cloud Sync Sidepath Design +Status: Superseded. This sidepath is not implemented in Keystone anymore. +Native cloud sync now uploads directly with the Go uploader and Data Platform +device profiles; Keystone no longer registers CLI sync APIs, starts a +`CLISyncRunner`, reads `KEYSTONE_CLI_SYNC_*` config, or creates +`cli_sync_runs` migrations. + ## 1. Overview This document defines a sidepath for syncing one Keystone episode to cloud by diff --git a/docs/designs/cli-cloud-sync-sidepath.zh.html b/docs/designs/cli-cloud-sync-sidepath.zh.html index 9ac7b3f..fad753d 100644 --- a/docs/designs/cli-cloud-sync-sidepath.zh.html +++ b/docs/designs/cli-cloud-sync-sidepath.zh.html @@ -421,6 +421,7 @@

CLI 同步到云旁路设计

文档状态 +
状态:已废弃;当前实现改为 Keystone Go uploader 原生 direct sync,不再实现 CLI sync API、CLISyncRunner、KEYSTONE_CLI_SYNC_* 配置或 cli_sync_runs 迁移。
用途:实现设计 / 评审
范围:Keystone 后端、Synapse 前端;data-platform CLI 只作为外部命令调用
日期:2026-06-02
diff --git a/docs/designs/cloud-sync-go-direct-upload.zh.html b/docs/designs/cloud-sync-go-direct-upload.zh.html new file mode 100644 index 0000000..65f0a88 --- /dev/null +++ b/docs/designs/cloud-sync-go-direct-upload.zh.html @@ -0,0 +1,825 @@ + + + + + + + Keystone 原生云同步直连 Data Platform 上传方案 + + + +
+
+
+

Cloud Sync / Data Platform

+

Keystone 原生云同步直连 Data Platform 上传方案

+

+ 本方案采用方案 A:Keystone 读取 KEYSTONE_SYNC_DP_CONFIG 指向的 data-platform + config,按 episode 对应的 asset_id 选择 device profile,并复用现有 Go 上传器完成 + MinIO 到 Data Platform OSS 的流式上传。第一版直接改造 Keystone 原有 cloud sync 上传逻辑, + 不再依赖 dp data upload 或 Keystone 自有 cloud API key。 +

+
+
+ 设计状态 + 草案,面向第一版实现
+ 目标路径:原生 cloud sync,不下载 MCAP 到本地,不依赖 dp data upload
+ 兼容重点:robots.asset_id、device API key、device tags、raw tags 合成 +
+
+ + + +
+

目标与非目标

+
+
+

目标

+
    +
  • 更新 Keystone 原有 cloud sync worker 的上传身份和 raw tags 合成逻辑。
  • +
  • 复用既有 robots.asset_id 作为“云资产编号”,即本地 robot 与 Data Platform device 的稳定映射。
  • +
  • episode 创建时将当时的 asset_id 快照写入 episodes.metadata.asset_id
  • +
  • 读取 KEYSTONE_SYNC_DP_CONFIG 指向的 data-platform config,按 asset_id 选择 device profile。
  • +
  • 使用 device profile 的 apiKey 与 AuthService 交换 Bearer token。
  • +
  • 复用 Keystone 现有 cloud.Uploader,从 MinIO 流式上传到 Data Platform OSS。
  • +
  • 复刻 Rust SDK 的 raw tags 合并与冲突校验规则。
  • +
+
+
+

非目标

+
    +
  • 不在第一版集成 dp device init,device profile 由现场工程师提前初始化。
  • +
  • 不生成或修改 Data Platform device id,只存储自动化流程写入的 asset_id
  • +
  • 不迁移 data-platform config 到 Keystone 配置中心。
  • +
  • 删除 Keystone 后端 CLI 同步旁路,包括 CLISyncRunner、CLI sync API、CLI sync 配置和 cli_sync_runs 表迁移。
  • +
  • 不在每个 episode 上传前执行 device init;init 是一次性准备或凭证轮换动作。
  • +
  • 不做历史 episode 的自动 asset_id 回填工具;缺失时给出清晰错误并允许手动重试。
  • +
  • 不新增 direct sync raw tag 数量或总字节数限制。
  • +
  • 不把 MCAP 完整读入内存,只保持单分片缓冲。
  • +
+
+
+
+ +
+

当前行为

+

+ Keystone 原有 cloud sync 已经是 Go 直连上传:从 MinIO 流式读取 MCAP,创建 data-gateway 上传会话, + 再分片上传到 Data Platform OSS。当前差异在于它使用 Keystone 自己的 sync API key,并没有像 + dp data upload --device 一样读取 device profile、注入 device tags 和 reserved raw tags。 + 当前 worker 也没有读取 episode 的云端 device 快照,sidecar raw tag 读取失败时会 best-effort 继续上传。 +

+
Keystone DB episode
+  -> build sidecar raw tags
+  -> AuthService ExchangeCredential with Keystone sync API key
+  -> data-gateway CreateLogicalUpload
+  -> MinIO GetObject stream
+  -> OSS multipart upload
+  -> data-gateway CompleteUpload
+  -> update sync_logs / episodes
+
+

+ 所以第一版不需要新增上传链路,但必须把原有 worker 的上传身份切到 device profile, + 补齐 asset_id 解析、strict sidecar、non-retryable 错误和与 data-platform Rust SDK 一致的 raw tags 合成规则。 +

+
+
+ +
+

目标架构

+

+ 新路径保留原有 cloud sync worker 的触发、重试、状态更新和同步日志。每次处理 episode 时, + worker 先解析 episode 对应的 asset_id,再根据该值读取 data-platform device + profile,并用该 profile 的 API key 构造本次上传专用客户端。 +

+
Keystone DB episode
+  -> resolve asset_id from episodes.metadata
+     or fallback through episode.workstation_id -> workstations -> robots.asset_id
+  -> load DP config from KEYSTONE_SYNC_DP_CONFIG
+  -> select devices[].deviceId == asset_id
+  -> build effective raw tags
+  -> AuthService ExchangeCredential with device apiKey
+  -> data-gateway CreateLogicalUpload
+  -> MinIO GetObject stream
+  -> OSS multipart upload
+  -> data-gateway CompleteUpload
+  -> update sync_logs / episodes
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
模块职责建议位置
Robot asset mapping保存本地 robot 到 Data Platform device 的不可变映射。robots.asset_id、robot API、数据库迁移
Asset resolver优先读取 episodes.metadata.asset_id,缺失时按历史 workstation 反查 robot。internal/services/dp_asset_resolver.go
DP config loader解析 endpointsdevices[],按 device id 返回上传所需配置。internal/services/dp_config_loader.go
Raw tag builder复刻 Rust SDK 的 tag 合并顺序与冲突规则。internal/services/dp_raw_tags.go
Direct uploader factory按 episode 创建本次专用 AuthClientGatewayClientcloud.Uploaderinternal/services/sync_worker.go
Cloud uploader复用现有 data-gateway 与 OSS multipart 上传能力。internal/cloud/uploader.go
+
+ +
+

云资产编号映射规则

+

+ Keystone 本地 robots.device_id 继续表示 Axon / Keystone 内部设备编号,不参与 Data Platform + device 身份选择。云交互只使用既有 robots.asset_id,前端文案统一显示为“云资产编号”。 +

+ +

Robot 字段规则

+
    +
  • robots.asset_id 初始允许为空,创建 robot 时可写可不写。
  • +
  • 首次设置非空后不可修改、不可清空;同值更新视为幂等。
  • +
  • active robots 的非空 asset_id 必须唯一,软删除 robot 不占用唯一性。
  • +
  • 保存前 trim;空字符串按 NULL;最大长度 100;不做 Data Platform device id 格式正则。
  • +
  • robot create / update / list / detail API 暴露 asset_id
  • +
+ +

Episode 快照规则

+
    +
  • episode 创建时,如果能从 task -> workstation -> robot 解析到非空 asset_id,写入 episodes.metadata.asset_id
  • +
  • episode 创建不因 asset_id 缺失失败,本地采集、QA 和入库继续成功。
  • +
  • 不修改 sidecar JSON,不把 asset_id 写回采集产物。
  • +
  • 第一版不提供自动历史回填工具;缺失时通过错误信息提示配置 robot 或手动回填 metadata 后再手动同步。
  • +
+ +

上传时解析优先级

+
if episodes.metadata.asset_id is non-empty:
+    use metadata.asset_id
+else:
+    load workstation by episode.workstation_id, including soft-deleted workstation rows
+    load robots.asset_id by workstation.robot_id
+    use robots.asset_id if non-empty
+if still empty:
+    fail as non-retryable configuration error
+ +
+

+ cloud sync 不 fallback 到 robots.device_id。工位当前允许直接更新 robot_id; + 后续如果换绑改为“旧工位软删 + 新工位记录”,fallback 查询也必须允许读取软删除 workstation, + 因为 episode 的 workstation_id 是历史引用。 +

+
+
+ +
+

方案 A:读取 Data Platform Config

+

+ Keystone 使用 KEYSTONE_SYNC_DP_CONFIG 指向的 data-platform config。该文件由现场工程师提前通过 + dp config 和 dp device init 生成和维护,Keystone 只解析直连上传需要的字段。 + 原生 direct sync 不再依赖 KEYSTONE_CLOUD_API_KEY、 + KEYSTONE_CLOUD_AUTH_ENDPOINT 或 KEYSTONE_CLOUD_GATEWAY_ENDPOINT。 +

+ +

需要解析的 JSON 字段

+
{
+  "version": 3,
+  "endpoints": {
+    "auth": "https://auth.example.com:50051",
+    "gateway": "https://gateway.example.com:50052"
+  },
+  "devices": [
+    {
+      "deviceId": "AB-F0001-T0001-000006",
+      "apiKey": "ak_v1.device_secret",
+      "tags": {
+        "778a6d83c9ec49108537542a570966ee.device_id": "AB-F0001-T0001-000006",
+        "line": "a"
+      },
+      "initializedAtUnix": 1760000000
+    }
+  ]
+}
+ +

Go 结构建议

+
type DPConfigFile struct {
+    Version   *int              `json:"version,omitempty"`
+    Endpoints DPConfigEndpoints `json:"endpoints"`
+    Devices   []DPDeviceProfile `json:"devices"`
+}
+
+type DPConfigEndpoints struct {
+    Auth    string `json:"auth"`
+    Gateway string `json:"gateway"`
+}
+
+type DPDeviceProfile struct {
+    DeviceID string            `json:"deviceId"`
+    APIKey   string            `json:"apiKey"`
+    Tags     map[string]string `json:"tags"`
+}
+
+type DPResolvedEndpoint struct {
+    Target    string
+    UseTLS    bool
+    ServerName string
+}
+ +

解析规则

+
    +
  • version 缺失或等于 3 可接受;存在且不等于 3 时失败。
  • +
  • devices[].deviceId trim 后比较,大小写敏感;重复 device id 直接失败。
  • +
  • deviceId 必须与 Keystone 解析出的 asset_id 一致。
  • +
  • apiKey 不能为空,且永不打印明文日志。
  • +
  • tags 不能为空,保持与 Rust SDK require_device_upload() 一致;tag key/value 不 trim、不改写,但 key 必须非空。
  • +
  • endpoints.authendpoints.gateway 必须来自 config 文件,不支持 ARCHEBASE_*KEYSTONE_SYNC_* overlay。
  • +
  • 每个 episode 上传前重新读取 config 文件,避免长期进程缓存旧 device profile。
  • +
+ +

Endpoint 与 TLS 规则

+
    +
  • https://host[:port] 使用 TLS gRPC;未写端口时补 443;TLS CA 使用系统 CA,server name 使用 URL host。
  • +
  • http://host[:port] 使用 insecure gRPC;未写端口时补 80
  • +
  • host[:port] 兼容裸地址,按 insecure gRPC 处理,不自动补端口。
  • +
  • endpoint 禁止 path、query 和 fragment,例如 https://host:50051/foo 应视为配置错误。
  • +
  • Auth 和 Gateway 不强制使用同一种 scheme,分别按各自 endpoint 解析。
  • +
  • 第一版不支持自定义 CA 文件或 TLS server name override。
  • +
+ +
+

+ 第一版不把 device profile 写入 Keystone 数据库。Keystone 只消费 data-platform config, + 这样可以最大限度贴近当前 dp --config ... --device ... 的上传身份语义,同时避免 + Keystone 自有 sync API key 与 device API key 混用。 +

+
+ +

现场前置动作

+
dp --config /home/shark/.archebase/config.json config
+dp --config /home/shark/.archebase/config.json device init AB-F0001-T0001-000006
+

+ 初始化成功后,config 中会出现对应的 devices[] profile。后续 Keystone 上传只读取该 profile, + 不在上传前自动执行 init。凭证过期、设备迁移或平台侧 tags 变化时,由现场工程师执行 + dp device reinit ... --yes 轮换。 +

+
+ +
+

Raw Tags 合并规则

+

+ 直连上传必须复刻 data-platform Rust SDK 的 build_upload_tags() 语义。合并过程使用非冲突插入: + 如果 key 已存在且 value 不同,直接失败;如果 key 已存在且 value 相同,视为幂等。 +

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
顺序来源说明
1device profile tags来自 devices[].tags,由 data-platform device init 生成。
2device id reserved tag778a6d83c9ec49108537542a570966ee.device_id,值为 profile 的 deviceId
3original file reserved taga206e337ecdf70a93bb611cf6a30c346.raw_file,值固定使用 MinIO MCAP object key 的 basename。
4Keystone sidecar tags从 sidecar JSON 扁平化得到;数组字段 JSON encode,顶层 topics_summary 排除。
5Keystone extra tagsepisode_idkeystone_episode_idsync_channeltask_idfactory_idorganization_id
+ +
+

+ 原有 cloud sync 没有本地临时文件,因此 reserved raw_file 不读取 sidecar 字段, + 只使用 basename(stripBucketPrefix(episodes.mcap_path))。如果 basename 为空,本次上传失败。 +

+
+ +
+

+ Keystone 不新增普通 device_id raw tag。设备归属只通过 + 778a6d83c9ec49108537542a570966ee.device_id reserved tag 表达,并由 Keystone 本地注入、 + data-gateway 服务端二次校验。 +

+
+ +

Sidecar 规则

+
    +
  • direct device sync 改为 strict:sidecar_path 为空、对象不可读或 JSON 解析失败时,不创建 data-gateway upload session。
  • +
  • 数组字段保留为 JSON 字符串;顶层 topics_summary 继续排除。
  • +
  • 第一版不新增 raw tag 数量或总字节数限制。
  • +
+ +

合并伪代码

+
merged := map[string]string{}
+insertAllNonConflicting(merged, deviceProfile.Tags)
+insertNonConflicting(merged, deviceIDRawTagKey, deviceProfile.DeviceID)
+insertNonConflicting(merged, originalFileRawTagKey, mcapBaseName)
+insertAllNonConflicting(merged, sidecarTags)
+insertAllNonConflicting(merged, keystoneExtraTags)
+return merged
+
+ +
+

直连上传流程

+
+
1. 领取 episode:沿用原有 cloud sync worker 的自动扫描、手动触发、重试和并发控制。
+
2. 加载 episode:读取 MCAP MinIO key、sidecar path、metadata、workstation id 和任务上下文。
+
3. 解析 asset_id:优先使用 episodes.metadata.asset_id,否则通过历史 workstation 反查 robots.asset_id
+
4. 加载 DP config:从 KEYSTONE_SYNC_DP_CONFIG 读取 device profile 和 endpoints。
+
5. 构造 raw tags:合并 device tags、reserved tags、sidecar tags 和 Keystone extra tags,执行冲突校验。
+
6. 构造 direct uploader:为本次 episode 创建专用 AuthClientGatewayClientcloud.Uploader
+
7. 执行上传:调用 cloud.Uploader.Upload(),从 MinIO 流式读取 MCAP 并上传 OSS。
+
8. 写回结果:沿用原有成功路径,更新 sync_logsepisodes.cloud_syncedcloud_mcap_path
+
+ +

结果字段映射

+

+ 第一版不新增 episodes 字段,也不扩展 sync_logs 表。Data Platform 审计 ID 先通过日志输出; + 如后续 UI 或 API 需要直接按 episode 查询,再单独扩表。 +

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
结果Go direct 来源说明
file_idcloud.UploadResult.UploadID第一版不落库,只记录日志;如后续需要在 Keystone 记录 Data Platform 文件 ID,可直接使用该值。
logical_upload_idcloud.UploadResult.LogicalUploadID第一版不落库,只记录日志。
upload_idcloud.UploadResult.UploadID与 Data Platform SDK 返回的 fileId 当前等价,第一版不落库。
object_keycloud.UploadResult.ObjectKey写入 sync_logs.destination_pathepisodes.cloud_mcap_path
oss_object_etagcloud.UploadResult.OSSObjectETag客户端计算并回传给 data-gateway 的 multipart ETag,第一版不落库,只记录日志。
+
+ +
+

实施步骤

+
    +
  1. + 复用 robots.asset_id 字段,增加 active 非空唯一约束; + create / update 实现 trim、控制字符校验、“首次非空设置后不可修改、不可清空、同值幂等”。 +
  2. +
  3. + episode 创建时解析 task -> workstation -> robot 的 asset_id,非空时写入 + episodes.metadata.asset_id,但缺失不阻止 episode 创建。 +
  4. +
  5. + 新增 DP config loader,读取 SyncConfig.DPConfigPath / KEYSTONE_SYNC_DP_CONFIG, + 校验 version、endpoint、重复 device id、空 apiKey 和空 tags,并按 asset_id 返回 profile。 +
  6. +
  7. + 给 SyncWorker 增加 asset_id resolver:优先读 episodes.metadata.asset_id, + 缺失时允许读取软删除 workstation 并反查 robots.asset_id。 +
  8. +
  9. + 新增 raw tags builder,包含两个 reserved key 常量、非冲突插入、MinIO basename 选择、strict sidecar + 和 Keystone extra tags;不添加普通 device_id。 +
  10. +
  11. + 调整 uploader 构造方式,每个 episode 创建本次专用 AuthClient/GatewayClient/Uploader; + endpoint scheme 决定 TLS,TLS 使用系统 CA。 +
  12. +
  13. + 在 cloud.UploadRequest 和 persisted upload state 中记录 AssetID; + 恢复上传时只有 MCAP key 和 asset_id 同时匹配才允许复用旧 session。 +
  14. +
  15. + 更新 GatewayClient.CompleteUpload() 签名,complete 时回传 session.PartSizeBytes 到 + CompleteUploadRequest.part_size_bytes。 +
  16. +
  17. + 引入 retryable / non-retryable 错误分类:non-retryable failed 写 next_retry_at = NULL; + auto scan 跳过 latest failed 且 next_retry_at IS NULL 的 episode,manual sync 仍可重试。 +
  18. +
  19. + 删除 CLI 同步旁路:Synapse 不展示 CLI sync UI,Keystone 不注册 CLI sync API,不初始化 + CLISyncRunner,不读取 KEYSTONE_CLI_SYNC_* 配置,也不保留 + cli_sync_runs 迁移。 +
  20. +
+
+ +
+

风险与处理

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
风险影响处理策略
asset_id 缺失无法选择 device profile,episode 本次 sync 失败。写入 non-retryable failed,next_retry_at = NULL;错误信息包含 episode、workstation、robot 和修复方向,手动修复后手动重试。
asset_id 填错本地 robot 会永久绑定错误的 Data Platform device。字段首次非空设置后不可修改;未来由自动化流程写入,第一版不提供 break-glass 维护入口。
device profile 缺失或不完整上传前失败,episode 本次 sync 失败。错误信息包含 asset_id 和 config path,但不打印 api key;提示现场执行 dp device initdp device reinit
endpoint / TLS 配置错误Auth 或 Gateway 连接失败。endpoint 只来自 DP config;按 http/https scheme 自动解析 TLS;禁止 path/query/fragment,日志打印 target 和 TLS 标志。
secret 泄漏日志或错误信息暴露 device api key。loader 和上传日志只打印 asset_id、config path、endpoint,不打印 api key、token、STS secret。
sidecar 缺失或格式错误云端对象缺少业务 raw tags,影响检索。direct sync 对 sidecar strict;格式错误 non-retryable,MinIO 读对象失败可自动重试。
恢复状态身份混用同一 MCAP 可能复用另一个 device 身份创建的 upload session。persisted upload state 记录 asset_id,恢复时必须同时匹配 MCAP key 和 asset_id
CLI sidepath 遗留用户可能继续使用旧 CLI 同步入口,产生两条语义不同的同步路径。删除后端 CLI sync runner、API、配置和表迁移;只保留原生 direct sync 入口。
+
+ +
+

测试计划

+
    +
  • 单元测试 robot API / 存储:asset_id 可首次设置、同值幂等、不可修改、不可清空、active 非空唯一。
  • +
  • 单元测试 episode 创建:有 asset_id 时写入 episodes.metadata.asset_id,缺失时仍创建 episode。
  • +
  • 单元测试 asset_id resolver:metadata 优先、fallback 读取软删除 workstation、缺失时报 non-retryable 错误、不 fallback 到 robots.device_id
  • +
  • 单元测试 DP config loader:version、endpoint scheme/TLS、禁止 path/query/fragment、成功选择 device、缺失 device、空 apiKey、空 tags、重复 deviceId。
  • +
  • 单元测试 raw tags builder:合并顺序、reserved device tag 注入、raw_file 使用 MinIO basename、相同 key 相同 value 幂等、相同 key 不同 value 报错、空 value 保留。
  • +
  • 单元测试 SyncWorker 错误分类:non-retryable failed 写 next_retry_at=NULL,auto scan 跳过,manual sync 可重新尝试。
  • +
  • 单元测试 uploader 持久化恢复:同 MCAP key 但 asset_id 不同不复用旧 state。
  • +
  • 集成测试 fake gateway/OSS:验证使用 device API key、raw tags 完整、part_size_bytes 回传、object_key 写回现有 DB 字段。
  • +
  • 现场灰度:同一小 MCAP 分别跑当前原始上传和 device profile 上传,对比 raw tags、文件大小、ETag、Data Platform 可检索性。
  • +
+ +
+

+ 验收标准:原生 cloud sync 不产生本地 MCAP 临时文件,不依赖 dp data upload, + 不依赖 KEYSTONE_CLOUD_API_KEY、KEYSTONE_CLOUD_AUTH_ENDPOINT 或 + KEYSTONE_CLOUD_GATEWAY_ENDPOINT, + Data Platform 中的文件可通过 fileId 检索,Keystone episode 状态与现有 cloud sync 一致。 +

+
+
+
+ + diff --git a/internal/api/handlers/robot.go b/internal/api/handlers/robot.go index a497a96..94fe64e 100644 --- a/internal/api/handlers/robot.go +++ b/internal/api/handlers/robot.go @@ -15,6 +15,8 @@ import ( "strconv" "strings" "time" + "unicode" + "unicode/utf8" "archebase.com/keystone-edge/internal/logger" "archebase.com/keystone-edge/internal/services" @@ -134,6 +136,48 @@ func robotMetadataFromDB(ns sql.NullString) interface{} { return parseJSONRaw(ns.String) } +func normalizeAssetID(raw string) (sql.NullString, error) { + value := strings.TrimSpace(raw) + if value == "" { + return sql.NullString{}, nil + } + if utf8.RuneCountInString(value) > 100 { + return sql.NullString{}, fmt.Errorf("asset_id must be at most 100 characters") + } + for _, r := range value { + if unicode.IsControl(r) { + return sql.NullString{}, fmt.Errorf("asset_id must not contain control characters") + } + } + return sql.NullString{String: value, Valid: true}, nil +} + +func assetIDValue(ns sql.NullString) string { + if !ns.Valid { + return "" + } + return strings.TrimSpace(ns.String) +} + +func (h *RobotHandler) assetIDInUse(assetID string, excludeRobotID int64) (bool, error) { + assetID = strings.TrimSpace(assetID) + if assetID == "" { + return false, nil + } + var exists bool + query := "SELECT EXISTS(SELECT 1 FROM robots WHERE asset_id = ? AND deleted_at IS NULL" + args := []interface{}{assetID} + if excludeRobotID > 0 { + query += " AND id <> ?" 
+ args = append(args, excludeRobotID) + } + query += ")" + if err := h.db.Get(&exists, query, args...); err != nil { + return false, err + } + return exists, nil +} + func (h *RobotHandler) connectionState(deviceID string) (connected bool, connectedAt string) { connected, connectedAt, _, _ = h.connectionStateDetailed(deviceID) return connected, connectedAt @@ -462,6 +506,11 @@ func (h *RobotHandler) CreateRobot(c *gin.Context) { req.RobotTypeID = strings.TrimSpace(req.RobotTypeID) req.DeviceID = strings.TrimSpace(req.DeviceID) req.FactoryID = strings.TrimSpace(req.FactoryID) + assetID, err := normalizeAssetID(req.AssetID) + if err != nil { + c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()}) + return + } if req.RobotTypeID == "" { c.JSON(http.StatusBadRequest, gin.H{"error": "robot_type_id is required"}) @@ -477,6 +526,18 @@ func (h *RobotHandler) CreateRobot(c *gin.Context) { c.JSON(http.StatusBadRequest, gin.H{"error": "factory_id is required"}) return } + if assetID.Valid { + inUse, err := h.assetIDInUse(assetID.String, 0) + if err != nil { + logger.Printf("[ROBOT] Failed to check asset_id uniqueness: %v", err) + c.JSON(http.StatusInternalServerError, gin.H{"error": "failed to create robot"}) + return + } + if inUse { + c.JSON(http.StatusConflict, gin.H{"error": "asset_id is already assigned to another robot"}) + return + } + } // Parse robot_type_id as numeric value robotTypeID, err := strconv.ParseInt(req.RobotTypeID, 10, 64) @@ -509,11 +570,6 @@ func (h *RobotHandler) CreateRobot(c *gin.Context) { now := time.Now().UTC() - var assetIDStr sql.NullString - if a := strings.TrimSpace(req.AssetID); a != "" { - assetIDStr = sql.NullString{String: a, Valid: true} - } - metadataStr := sql.NullString{String: "{}", Valid: true} if req.Metadata != nil { metadataJSON, err := json.Marshal(req.Metadata) @@ -539,7 +595,7 @@ func (h *RobotHandler) CreateRobot(c *gin.Context) { robotTypeID, req.DeviceID, factoryID, - assetIDStr, + assetID, "active", metadataStr, now, 
@@ -677,7 +733,7 @@ type UpdateRobotRequest struct { RobotTypeID *string `json:"robot_type_id,omitempty"` DeviceID *string `json:"device_id,omitempty"` FactoryID *string `json:"factory_id,omitempty"` - AssetID *string `json:"asset_id,omitempty"` + AssetID json.RawMessage `json:"asset_id,omitempty" swaggertype:"string"` Status *string `json:"status,omitempty"` Metadata json.RawMessage `json:"metadata,omitempty" swaggertype:"object"` } @@ -710,13 +766,19 @@ func (h *RobotHandler) UpdateRobot(c *gin.Context) { return } - // Check if robot exists - var exists bool - err = h.db.Get(&exists, "SELECT EXISTS(SELECT 1 FROM robots WHERE id = ? AND deleted_at IS NULL)", id) - if err != nil || !exists { + var current struct { + AssetID sql.NullString `db:"asset_id"` + } + err = h.db.Get(¤t, "SELECT asset_id FROM robots WHERE id = ? AND deleted_at IS NULL", id) + if err == sql.ErrNoRows { c.JSON(http.StatusNotFound, gin.H{"error": "robot not found"}) return } + if err != nil { + logger.Printf("[ROBOT] Failed to query robot: %v", err) + c.JSON(http.StatusInternalServerError, gin.H{"error": "failed to update robot"}) + return + } // Validate status if provided validStatuses := map[string]bool{ @@ -760,6 +822,47 @@ func (h *RobotHandler) UpdateRobot(c *gin.Context) { args = append(args, deviceID) } + if len(req.AssetID) > 0 { + var rawAssetID string + meta := bytes.TrimSpace(req.AssetID) + if bytes.Equal(meta, []byte("null")) { + rawAssetID = "" + } else if err := json.Unmarshal(req.AssetID, &rawAssetID); err != nil { + c.JSON(http.StatusBadRequest, gin.H{"error": "asset_id must be a string or null"}) + return + } + assetID, err := normalizeAssetID(rawAssetID) + if err != nil { + c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()}) + return + } + currentAssetID := assetIDValue(current.AssetID) + if currentAssetID != "" { + if !assetID.Valid { + c.JSON(http.StatusBadRequest, gin.H{"error": "asset_id cannot be cleared once set"}) + return + } + if assetID.String != 
currentAssetID { + c.JSON(http.StatusBadRequest, gin.H{"error": "asset_id cannot be changed once set"}) + return + } + } + if assetID.Valid && assetID.String != currentAssetID { + inUse, err := h.assetIDInUse(assetID.String, id) + if err != nil { + logger.Printf("[ROBOT] Failed to check asset_id uniqueness: %v", err) + c.JSON(http.StatusInternalServerError, gin.H{"error": "failed to update robot"}) + return + } + if inUse { + c.JSON(http.StatusConflict, gin.H{"error": "asset_id is already assigned to another robot"}) + return + } + } + updates = append(updates, "asset_id = ?") + args = append(args, assetID) + } + if req.FactoryID != nil { if *req.FactoryID == "" { c.JSON(http.StatusBadRequest, gin.H{"error": "factory_id cannot be empty"}) @@ -781,16 +884,6 @@ func (h *RobotHandler) UpdateRobot(c *gin.Context) { args = append(args, parsedFactoryID) } - if req.AssetID != nil { - trimmed := strings.TrimSpace(*req.AssetID) - var a sql.NullString - if trimmed != "" { - a = sql.NullString{String: trimmed, Valid: true} - } - updates = append(updates, "asset_id = ?") - args = append(args, a) - } - if req.Status != nil { status := strings.TrimSpace(*req.Status) if !validStatuses[status] { diff --git a/internal/api/handlers/robot_test.go b/internal/api/handlers/robot_test.go index 02777b1..fa7c1e1 100644 --- a/internal/api/handlers/robot_test.go +++ b/internal/api/handlers/robot_test.go @@ -5,9 +5,12 @@ package handlers import ( + "bytes" + "database/sql" "encoding/json" "net/http" "net/http/httptest" + "strings" "testing" "time" @@ -262,6 +265,164 @@ func TestRobotHandlerListRobots_ConnectedFilterUsesHubIntersection(t *testing.T) }) } +func TestRobotHandlerAssetID_CreateUpdateAndList(t *testing.T) { + db := newTestRobotHandlerDB(t) + defer db.Close() + seedRobotLookups(t, db) + + r := newTestRobotRouter(t, db) + + req := httptest.NewRequest(http.MethodPost, "/api/v1/robots", bytes.NewBufferString(`{ + "robot_type_id": "10", + "device_id": "local-device-1", + "asset_id": " 
asset-1 ", + "factory_id": "30" + }`)) + req.Header.Set("Content-Type", "application/json") + w := httptest.NewRecorder() + r.ServeHTTP(w, req) + + if w.Code != http.StatusCreated { + t.Fatalf("create status=%d want=%d body=%s", w.Code, http.StatusCreated, w.Body.String()) + } + var created CreateRobotResponse + if err := json.Unmarshal(w.Body.Bytes(), &created); err != nil { + t.Fatalf("unmarshal create response: %v", err) + } + if created.AssetID != "asset-1" { + t.Fatalf("created asset_id=%v want asset-1", created.AssetID) + } + + req = httptest.NewRequest(http.MethodGet, "/api/v1/robots", nil) + w = httptest.NewRecorder() + r.ServeHTTP(w, req) + if w.Code != http.StatusOK { + t.Fatalf("list status=%d want=%d body=%s", w.Code, http.StatusOK, w.Body.String()) + } + var list RobotListResponse + if err := json.Unmarshal(w.Body.Bytes(), &list); err != nil { + t.Fatalf("unmarshal list response: %v", err) + } + if len(list.Items) != 1 || list.Items[0].AssetID != "asset-1" { + t.Fatalf("list asset_id response=%#v", list) + } + + req = httptest.NewRequest(http.MethodPut, "/api/v1/robots/"+created.ID, bytes.NewBufferString(`{"asset_id":"asset-1"}`)) + req.Header.Set("Content-Type", "application/json") + w = httptest.NewRecorder() + r.ServeHTTP(w, req) + if w.Code != http.StatusOK { + t.Fatalf("same-value update status=%d want=%d body=%s", w.Code, http.StatusOK, w.Body.String()) + } +} + +func TestRobotHandlerAssetID_ImmutableOnceSet(t *testing.T) { + db := newTestRobotHandlerDB(t) + defer db.Close() + seedRobotLookups(t, db) + seedRobot(t, db, 1, "local-device-1", "asset-1", nil) + + r := newTestRobotRouter(t, db) + + for _, tt := range []struct { + name string + body string + }{ + {name: "change rejected", body: `{"asset_id":"asset-2"}`}, + {name: "clear rejected", body: `{"asset_id":""}`}, + {name: "blank clear rejected", body: `{"asset_id":" "}`}, + } { + t.Run(tt.name, func(t *testing.T) { + req := httptest.NewRequest(http.MethodPut, "/api/v1/robots/1", 
bytes.NewBufferString(tt.body)) + req.Header.Set("Content-Type", "application/json") + w := httptest.NewRecorder() + r.ServeHTTP(w, req) + if w.Code != http.StatusBadRequest { + t.Fatalf("status=%d want=%d body=%s", w.Code, http.StatusBadRequest, w.Body.String()) + } + }) + } + + req := httptest.NewRequest(http.MethodPut, "/api/v1/robots/1", bytes.NewBufferString(`{"asset_id":null}`)) + req.Header.Set("Content-Type", "application/json") + w := httptest.NewRecorder() + r.ServeHTTP(w, req) + if w.Code != http.StatusBadRequest { + t.Fatalf("null clear status=%d want=%d body=%s", w.Code, http.StatusBadRequest, w.Body.String()) + } +} + +func TestRobotHandlerAssetID_UniqueAmongActiveRobots(t *testing.T) { + db := newTestRobotHandlerDB(t) + defer db.Close() + seedRobotLookups(t, db) + seedRobot(t, db, 1, "local-device-1", "asset-1", nil) + deletedAt := time.Now().UTC() + seedRobot(t, db, 2, "deleted-device", "deleted-asset", &deletedAt) + + r := newTestRobotRouter(t, db) + + req := httptest.NewRequest(http.MethodPost, "/api/v1/robots", bytes.NewBufferString(`{ + "robot_type_id": "10", + "device_id": "local-device-2", + "asset_id": "asset-1", + "factory_id": "30" + }`)) + req.Header.Set("Content-Type", "application/json") + w := httptest.NewRecorder() + r.ServeHTTP(w, req) + if w.Code != http.StatusConflict { + t.Fatalf("duplicate create status=%d want=%d body=%s", w.Code, http.StatusConflict, w.Body.String()) + } + + req = httptest.NewRequest(http.MethodPost, "/api/v1/robots", bytes.NewBufferString(`{ + "robot_type_id": "10", + "device_id": "local-device-3", + "asset_id": "deleted-asset", + "factory_id": "30" + }`)) + req.Header.Set("Content-Type", "application/json") + w = httptest.NewRecorder() + r.ServeHTTP(w, req) + if w.Code != http.StatusCreated { + t.Fatalf("soft-deleted reuse status=%d want=%d body=%s", w.Code, http.StatusCreated, w.Body.String()) + } +} + +func TestRobotHandlerAssetID_Validation(t *testing.T) { + db := newTestRobotHandlerDB(t) + defer db.Close() 
+ seedRobotLookups(t, db) + + r := newTestRobotRouter(t, db) + + req := httptest.NewRequest(http.MethodPost, "/api/v1/robots", bytes.NewBufferString("{\n"+ + `"robot_type_id":"10",`+ + `"device_id":"local-device-1",`+ + `"factory_id":"30",`+ + `"asset_id":"asset\u0001id"}`)) + req.Header.Set("Content-Type", "application/json") + w := httptest.NewRecorder() + r.ServeHTTP(w, req) + if w.Code != http.StatusBadRequest { + t.Fatalf("control char status=%d want=%d body=%s", w.Code, http.StatusBadRequest, w.Body.String()) + } + + longID := strings.Repeat("a", 101) + req = httptest.NewRequest(http.MethodPost, "/api/v1/robots", bytes.NewBufferString(`{ + "robot_type_id": "10", + "device_id": "local-device-2", + "factory_id": "30", + "asset_id": "`+longID+`" + }`)) + req.Header.Set("Content-Type", "application/json") + w = httptest.NewRecorder() + r.ServeHTTP(w, req) + if w.Code != http.StatusBadRequest { + t.Fatalf("long id status=%d want=%d body=%s", w.Code, http.StatusBadRequest, w.Body.String()) + } +} + func newTestRobotRouter(t *testing.T, db *sqlx.DB) *gin.Engine { t.Helper() return newTestRobotRouterWithHubs(t, db, nil, nil) @@ -306,8 +467,8 @@ func newTestRobotHandlerDB(t *testing.T) *sqlx.DB { id INTEGER PRIMARY KEY, robot_type_id INTEGER NOT NULL, device_id TEXT NOT NULL, - factory_id INTEGER NOT NULL, asset_id TEXT, + factory_id INTEGER NOT NULL, status TEXT NOT NULL, metadata TEXT, created_at TIMESTAMP, @@ -336,3 +497,32 @@ func newTestRobotHandlerDB(t *testing.T) *sqlx.DB { return db } + +func seedRobotLookups(t *testing.T, db *sqlx.DB) { + t.Helper() + if _, err := db.Exec(`INSERT INTO robot_types (id, name, model, deleted_at) VALUES (10, 'Arm Type', 'Model-A', NULL)`); err != nil { + t.Fatalf("seed robot type: %v", err) + } + if _, err := db.Exec(`INSERT INTO factories (id, name, slug, deleted_at) VALUES (30, 'Factory 30', 'fac-30', NULL)`); err != nil { + t.Fatalf("seed factory: %v", err) + } +} + +func seedRobot(t *testing.T, db *sqlx.DB, id int64, deviceID 
string, assetID string, deletedAt *time.Time) { + t.Helper() + var asset sql.NullString + if strings.TrimSpace(assetID) != "" { + asset = sql.NullString{String: strings.TrimSpace(assetID), Valid: true} + } + var deleted sql.NullTime + if deletedAt != nil { + deleted = sql.NullTime{Time: *deletedAt, Valid: true} + } + now := time.Now().UTC() + if _, err := db.Exec(` + INSERT INTO robots (id, robot_type_id, device_id, asset_id, factory_id, status, created_at, updated_at, deleted_at) + VALUES (?, 10, ?, ?, 30, 'active', ?, ?, ?) + `, id, deviceID, asset, now, now, deleted); err != nil { + t.Fatalf("seed robot %d: %v", id, err) + } +} diff --git a/internal/api/handlers/sync.go b/internal/api/handlers/sync.go index 7207912..9d1694c 100644 --- a/internal/api/handlers/sync.go +++ b/internal/api/handlers/sync.go @@ -20,28 +20,21 @@ import ( // SyncHandler handles cloud sync related HTTP requests. type SyncHandler struct { - db *sqlx.DB - syncWorker *services.SyncWorker - cliSyncRunner *services.CLISyncRunner + db *sqlx.DB + syncWorker *services.SyncWorker } // NewSyncHandler creates a new SyncHandler. -func NewSyncHandler(db *sqlx.DB, syncWorker *services.SyncWorker, cliSyncRunner ...*services.CLISyncRunner) *SyncHandler { - var runner *services.CLISyncRunner - if len(cliSyncRunner) > 0 { - runner = cliSyncRunner[0] - } - return &SyncHandler{db: db, syncWorker: syncWorker, cliSyncRunner: runner} +func NewSyncHandler(db *sqlx.DB, syncWorker *services.SyncWorker) *SyncHandler { + return &SyncHandler{db: db, syncWorker: syncWorker} } // RegisterRoutes registers cloud sync related routes. 
func (h *SyncHandler) RegisterRoutes(apiV1 *gin.RouterGroup) { apiV1.POST("/sync/episodes", h.TriggerBatchSync) apiV1.POST("/sync/episodes/:id", h.TriggerEpisodeSync) - apiV1.POST("/sync/episodes/:id/cli", h.TriggerEpisodeCLISync) apiV1.GET("/sync/episodes", h.ListSyncJobs) apiV1.GET("/sync/episodes/summary", h.ListEpisodeSyncSummaries) - apiV1.GET("/sync/episodes/:id/cli/status", h.GetEpisodeCLISyncStatus) apiV1.GET("/sync/episodes/:id/logs", h.ListEpisodeSyncLogs) apiV1.GET("/sync/episodes/:id/status", h.GetSyncStatus) apiV1.GET("/sync/config", h.GetSyncConfig) @@ -121,26 +114,6 @@ type SyncEpisodeSummaryResponse struct { CompletedAt *string `json:"completed_at,omitempty"` } -// CLISyncRunResponse represents one CLI sync sidepath run. -type CLISyncRunResponse struct { - ID int64 `json:"id"` - EpisodeID int64 `json:"episode_id"` - Status string `json:"status"` - SourcePath *string `json:"source_path,omitempty"` - TempPath *string `json:"temp_path,omitempty"` - FileID *string `json:"file_id,omitempty"` - LogicalUploadID *string `json:"logical_upload_id,omitempty"` - UploadID *string `json:"upload_id,omitempty"` - Bucket *string `json:"bucket,omitempty"` - ObjectKey *string `json:"object_key,omitempty"` - FileSize *int64 `json:"file_size,omitempty"` - OSSObjectETag *string `json:"oss_object_etag,omitempty"` - DurationSec *int64 `json:"duration_sec,omitempty"` - ErrorMessage *string `json:"error_message,omitempty"` - StartedAt *string `json:"started_at,omitempty"` - CompletedAt *string `json:"completed_at,omitempty"` -} - // SyncJobListResponse represents the response for listing sync jobs. type SyncJobListResponse struct { Items []SyncJobResponse `json:"items"` @@ -285,82 +258,6 @@ func (h *SyncHandler) TriggerEpisodeSync(c *gin.Context) { }) } -// TriggerEpisodeCLISync triggers the dp CLI cloud sync sidepath for one episode. 
-// -// @Summary Trigger single episode CLI cloud sync -// @Description Enqueues a specific episode for cloud sync through the dp CLI sidepath -// @Tags sync -// @Produce json -// @Param id path int true "Episode ID" -// @Success 202 {object} map[string]interface{} -// @Failure 400 {object} map[string]string -// @Failure 404 {object} map[string]string -// @Failure 409 {object} map[string]string -// @Failure 429 {object} map[string]string -// @Failure 503 {object} map[string]string -// @Router /sync/episodes/{id}/cli [post] -func (h *SyncHandler) TriggerEpisodeCLISync(c *gin.Context) { - if h.cliSyncRunner == nil || !h.cliSyncRunner.IsEnabled() { - c.JSON(http.StatusServiceUnavailable, gin.H{"error": "CLI sync is not configured"}) - return - } - - episodeID, ok := parseSyncEpisodeIDParam(c) - if !ok { - return - } - - runID, err := h.cliSyncRunner.EnqueueEpisode(c.Request.Context(), episodeID) - if err != nil { - h.writeCLISyncError(c, episodeID, err) - return - } - - c.JSON(http.StatusAccepted, gin.H{ - "status": "accepted", - "episode_id": episodeID, - "run_id": runID, - "message": "episode accepted for CLI cloud sync", - }) -} - -// GetEpisodeCLISyncStatus returns the latest CLI sync sidepath run for one episode. 
-// -// @Summary Get episode CLI sync status -// @Description Returns the latest dp CLI sync run for a specific episode -// @Tags sync -// @Produce json -// @Param id path int true "Episode ID" -// @Success 200 {object} CLISyncRunResponse -// @Failure 400 {object} map[string]string -// @Failure 404 {object} map[string]string -// @Failure 503 {object} map[string]string -// @Router /sync/episodes/{id}/cli/status [get] -func (h *SyncHandler) GetEpisodeCLISyncStatus(c *gin.Context) { - if h.cliSyncRunner == nil || !h.cliSyncRunner.IsEnabled() { - c.JSON(http.StatusServiceUnavailable, gin.H{"error": "CLI sync is not configured"}) - return - } - - episodeID, ok := parseSyncEpisodeIDParam(c) - if !ok { - return - } - - run, err := h.cliSyncRunner.LatestRun(c.Request.Context(), episodeID) - if err == sql.ErrNoRows { - c.JSON(http.StatusNotFound, gin.H{"error": "no CLI sync record found for this episode"}) - return - } - if err != nil { - logger.Printf("[SYNC] Failed to query CLI sync status for episode %d: %v", episodeID, err) - c.JSON(http.StatusInternalServerError, gin.H{"error": "failed to get CLI sync status"}) - return - } - - c.JSON(http.StatusOK, cliSyncRunResponseFromRun(run)) -} - // ListSyncJobs lists sync log entries with filtering and pagination. 
// // @Summary List sync jobs @@ -706,18 +603,10 @@ func (h *SyncHandler) GetSyncConfig(c *gin.Context) { autoScanEnabled = h.syncWorker.AutoScanEnabled() maxRetries = h.syncWorker.MaxRetries() } - cliSyncEnabled := false - cliSyncRunning := false - if h.cliSyncRunner != nil { - cliSyncEnabled = h.cliSyncRunner.IsEnabled() - cliSyncRunning = h.cliSyncRunner.IsRunning() - } c.JSON(http.StatusOK, gin.H{ "worker_running": workerRunning, "auto_scan_enabled": autoScanEnabled, "max_retries": maxRetries, - "cli_sync_enabled": cliSyncEnabled, - "cli_sync_running": cliSyncRunning, }) } @@ -759,79 +648,6 @@ func syncEpisodeSummaryResponseFromRow(r syncEpisodeSummaryRow) SyncEpisodeSumma } } -func cliSyncRunResponseFromRun(r *services.CLISyncRun) CLISyncRunResponse { - if r == nil { - return CLISyncRunResponse{} - } - return CLISyncRunResponse{ - ID: r.ID, - EpisodeID: r.EpisodeID, - Status: r.Status, - SourcePath: nullableString(r.SourcePath), - TempPath: nullableString(r.TempPath), - FileID: nullableString(r.FileID), - LogicalUploadID: nullableString(r.LogicalUploadID), - UploadID: nullableString(r.UploadID), - Bucket: nullableString(r.Bucket), - ObjectKey: nullableString(r.ObjectKey), - FileSize: nullableInt64(r.FileSize), - OSSObjectETag: nullableString(r.OSSObjectETag), - DurationSec: nullableInt64(r.DurationSec), - ErrorMessage: nullableString(r.ErrorMessage), - StartedAt: nullableTime(r.StartedAt), - CompletedAt: nullableTime(r.CompletedAt), - } -} - -func parseSyncEpisodeIDParam(c *gin.Context) (int64, bool) { - idStr := c.Param("id") - episodeID, err := strconv.ParseInt(strings.TrimSpace(idStr), 10, 64) - if err != nil || episodeID <= 0 { - c.JSON(http.StatusBadRequest, gin.H{"error": "invalid episode id"}) - return 0, false - } - return episodeID, true -} - -func (h *SyncHandler) writeCLISyncError(c *gin.Context, episodeID int64, err error) { - switch { - case errors.Is(err, services.ErrCLISyncDisabled), errors.Is(err, services.ErrCLISyncNotRunning): - 
c.JSON(http.StatusServiceUnavailable, gin.H{ - "error": err.Error(), - "episode_id": episodeID, - "status": "cli_sync_unavailable", - }) - case errors.Is(err, services.ErrCLISyncEpisodeNotFound): - c.JSON(http.StatusNotFound, gin.H{ - "error": "episode not found", - "episode_id": episodeID, - }) - case errors.Is(err, services.ErrCLISyncNotEligible): - c.JSON(http.StatusBadRequest, gin.H{ - "error": err.Error(), - "episode_id": episodeID, - "status": "not_eligible", - }) - case errors.Is(err, services.ErrCLISyncAlreadySynced), - errors.Is(err, services.ErrCLISyncAlreadyActive), - errors.Is(err, services.ErrCLISyncNormalSyncActive): - c.JSON(http.StatusConflict, gin.H{ - "error": err.Error(), - "episode_id": episodeID, - "status": "already_active", - }) - case errors.Is(err, services.ErrCLISyncQueueFull): - c.JSON(http.StatusTooManyRequests, gin.H{ - "error": err.Error(), - "episode_id": episodeID, - "status": "queue_full", - }) - default: - logger.Printf("[SYNC] CLI enqueue episode %d failed: %v", episodeID, err) - c.JSON(http.StatusInternalServerError, gin.H{"error": "failed to enqueue CLI sync"}) - } -} - func nullableInt64(v sql.NullInt64) *int64 { if !v.Valid { return nil diff --git a/internal/api/handlers/transfer.go b/internal/api/handlers/transfer.go index 1e70ce5..3ebb650 100644 --- a/internal/api/handlers/transfer.go +++ b/internal/api/handlers/transfer.go @@ -391,6 +391,30 @@ func readSidecarFromS3(ctx context.Context, s3Client *s3.Client, bucket, jsonKey return &sc } +func assetIDSnapshotMetadata(ctx context.Context, tx *sql.Tx, workstationID sql.NullInt64) sql.NullString { + if tx == nil || !workstationID.Valid || workstationID.Int64 <= 0 { + return sql.NullString{} + } + var assetID sql.NullString + err := tx.QueryRowContext(ctx, ` + SELECT r.asset_id + FROM workstations ws + LEFT JOIN robots r ON r.id = ws.robot_id AND r.deleted_at IS NULL + WHERE ws.id = ? 
AND ws.deleted_at IS NULL + LIMIT 1 + `, workstationID.Int64).Scan(&assetID) + if err != nil || !assetID.Valid || strings.TrimSpace(assetID.String) == "" { + return sql.NullString{} + } + data, err := json.Marshal(map[string]string{ + "asset_id": strings.TrimSpace(assetID.String), + }) + if err != nil { + return sql.NullString{} + } + return sql.NullString{String: string(data), Valid: true} +} + func uploadCompleteS3Key(data map[string]interface{}) string { return strings.TrimSpace(stringVal(data, "s3_key")) } @@ -617,6 +641,7 @@ func (h *TransferHandler) onUploadComplete(ctx context.Context, dc *services.Tra checksum = sql.NullString{String: sc.Recording.ChecksumSHA256, Valid: true} } } + episodeMetadata := assetIDSnapshotMetadata(ctx, tx, taskRow.WorkstationID) _, dbErr := tx.ExecContext(ctx, `INSERT INTO episodes ( @@ -635,8 +660,9 @@ func (h *TransferHandler) onUploadComplete(ctx context.Context, dc *services.Tra duration_sec, file_size_bytes, checksum, - qa_status - ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`, + qa_status, + metadata + ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`, episodeID, taskRow.ID, taskRow.BatchID, @@ -653,6 +679,7 @@ func (h *TransferHandler) onUploadComplete(ctx context.Context, dc *services.Tra fileSizeBytes, checksum, "approved", + episodeMetadata, ) if dbErr != nil { // #nosec G706 -- Set aside for now diff --git a/internal/api/handlers/transfer_asset_id_snapshot_test.go b/internal/api/handlers/transfer_asset_id_snapshot_test.go new file mode 100644 index 0000000..9d71be7 --- /dev/null +++ b/internal/api/handlers/transfer_asset_id_snapshot_test.go @@ -0,0 +1,95 @@ +// SPDX-FileCopyrightText: 2026 ArcheBase +// +// SPDX-License-Identifier: MulanPSL-2.0 + +package handlers + +import ( + "context" + "database/sql" + "encoding/json" + "testing" + + _ "modernc.org/sqlite" +) + +func TestAssetIDSnapshotMetadata_WritesWhenRobotHasAssetID(t *testing.T) { + db, err := sql.Open("sqlite", ":memory:") + if err != 
nil { + t.Fatalf("open sqlite db: %v", err) + } + defer db.Close() + + createAssetIDSnapshotSchema(t, db) + if _, err := db.Exec(`INSERT INTO robots (id, asset_id, deleted_at) VALUES (1, ' asset-1 ', NULL)`); err != nil { + t.Fatalf("seed robot: %v", err) + } + if _, err := db.Exec(`INSERT INTO workstations (id, robot_id, deleted_at) VALUES (10, 1, NULL)`); err != nil { + t.Fatalf("seed workstation: %v", err) + } + + tx, err := db.BeginTx(context.Background(), nil) + if err != nil { + t.Fatalf("begin tx: %v", err) + } + defer tx.Rollback() + + got := assetIDSnapshotMetadata(context.Background(), tx, sql.NullInt64{Int64: 10, Valid: true}) + if !got.Valid { + t.Fatal("metadata was not written") + } + var decoded map[string]string + if err := json.Unmarshal([]byte(got.String), &decoded); err != nil { + t.Fatalf("unmarshal metadata: %v", err) + } + if decoded["asset_id"] != "asset-1" { + t.Fatalf("asset_id=%q want asset-1", decoded["asset_id"]) + } +} + +func TestAssetIDSnapshotMetadata_MissingDoesNotFailEpisodeCreationPath(t *testing.T) { + db, err := sql.Open("sqlite", ":memory:") + if err != nil { + t.Fatalf("open sqlite db: %v", err) + } + defer db.Close() + + createAssetIDSnapshotSchema(t, db) + if _, err := db.Exec(`INSERT INTO robots (id, asset_id, deleted_at) VALUES (1, NULL, NULL)`); err != nil { + t.Fatalf("seed robot: %v", err) + } + if _, err := db.Exec(`INSERT INTO workstations (id, robot_id, deleted_at) VALUES (10, 1, NULL)`); err != nil { + t.Fatalf("seed workstation: %v", err) + } + + tx, err := db.BeginTx(context.Background(), nil) + if err != nil { + t.Fatalf("begin tx: %v", err) + } + defer tx.Rollback() + + got := assetIDSnapshotMetadata(context.Background(), tx, sql.NullInt64{Int64: 10, Valid: true}) + if got.Valid { + t.Fatalf("metadata valid=%t value=%q, want NULL", got.Valid, got.String) + } +} + +func createAssetIDSnapshotSchema(t *testing.T, db *sql.DB) { + t.Helper() + for _, stmt := range []string{ + `CREATE TABLE robots ( + id INTEGER 
PRIMARY KEY, + asset_id TEXT, + deleted_at TIMESTAMP NULL + )`, + `CREATE TABLE workstations ( + id INTEGER PRIMARY KEY, + robot_id INTEGER, + deleted_at TIMESTAMP NULL + )`, + } { + if _, err := db.Exec(stmt); err != nil { + t.Fatalf("create schema: %v", err) + } + } +} diff --git a/internal/cloud/cloudpb/data_gateway.pb.go b/internal/cloud/cloudpb/data_gateway.pb.go index fb89247..2b98e31 100644 --- a/internal/cloud/cloudpb/data_gateway.pb.go +++ b/internal/cloud/cloudpb/data_gateway.pb.go @@ -870,6 +870,7 @@ type CompleteUploadRequest struct { RawTags map[string]string `protobuf:"bytes,3,rep,name=raw_tags,json=rawTags,proto3" json:"raw_tags,omitempty" protobuf_key:"bytes,1,opt,name=key" protobuf_val:"bytes,2,opt,name=value"` CompletedPartCount int32 `protobuf:"varint,4,opt,name=completed_part_count,json=completedPartCount,proto3" json:"completed_part_count,omitempty"` OssObjectEtag string `protobuf:"bytes,5,opt,name=oss_object_etag,json=ossObjectEtag,proto3" json:"oss_object_etag,omitempty"` + PartSizeBytes int64 `protobuf:"varint,6,opt,name=part_size_bytes,json=partSizeBytes,proto3" json:"part_size_bytes,omitempty"` unknownFields protoimpl.UnknownFields sizeCache protoimpl.SizeCache } @@ -939,6 +940,13 @@ func (x *CompleteUploadRequest) GetOssObjectEtag() string { return "" } +func (x *CompleteUploadRequest) GetPartSizeBytes() int64 { + if x != nil { + return x.PartSizeBytes + } + return 0 +} + type CompleteUploadResponse struct { state protoimpl.MessageState `protogen:"open.v1"` unknownFields protoimpl.UnknownFields @@ -1287,13 +1295,14 @@ const file_data_gateway_proto_rawDesc = "" + "\x06reason\x18\x02 \x01(\tR\x06reason\"^\n" + "\x13AbortUploadResponse\x12*\n" + "\x11logical_upload_id\x18\x01 \x01(\tR\x0flogicalUploadId\x12\x1b\n" + - "\tupload_id\x18\x02 \x01(\tR\buploadId\"\xc1\x02\n" + + "\tupload_id\x18\x02 \x01(\tR\buploadId\"\xe9\x02\n" + "\x15CompleteUploadRequest\x12\x1b\n" + "\tupload_id\x18\x01 \x01(\tR\buploadId\x12\x1b\n" + "\tfile_size\x18\x02 
\x01(\x03R\bfileSize\x12X\n" + "\braw_tags\x18\x03 \x03(\v2=.archebase.data_gateway.v1.CompleteUploadRequest.RawTagsEntryR\arawTags\x120\n" + "\x14completed_part_count\x18\x04 \x01(\x05R\x12completedPartCount\x12&\n" + - "\x0foss_object_etag\x18\x05 \x01(\tR\rossObjectEtag\x1a:\n" + + "\x0foss_object_etag\x18\x05 \x01(\tR\rossObjectEtag\x12&\n" + + "\x0fpart_size_bytes\x18\x06 \x01(\x03R\rpartSizeBytes\x1a:\n" + "\fRawTagsEntry\x12\x10\n" + "\x03key\x18\x01 \x01(\tR\x03key\x12\x14\n" + "\x05value\x18\x02 \x01(\tR\x05value:\x028\x01\"\x18\n" + diff --git a/internal/cloud/cloudpb/proto/data_gateway.proto b/internal/cloud/cloudpb/proto/data_gateway.proto index e3f180c..a5be7a4 100644 --- a/internal/cloud/cloudpb/proto/data_gateway.proto +++ b/internal/cloud/cloudpb/proto/data_gateway.proto @@ -111,6 +111,7 @@ message CompleteUploadRequest { map raw_tags = 3; int32 completed_part_count = 4; string oss_object_etag = 5; + int64 part_size_bytes = 6; } message CompleteUploadResponse {} diff --git a/internal/cloud/gateway_client.go b/internal/cloud/gateway_client.go index c91d238..5cc842d 100644 --- a/internal/cloud/gateway_client.go +++ b/internal/cloud/gateway_client.go @@ -209,7 +209,7 @@ func (c *GatewayClient) AbortUpload(ctx context.Context, logicalUploadID string, } // CompleteUpload notifies the data-gateway that all parts have been uploaded to OSS. 
-func (c *GatewayClient) CompleteUpload(ctx context.Context, uploadID string, fileSize int64, rawTags map[string]string, completedPartCount int32, ossObjectEtag string) error { +func (c *GatewayClient) CompleteUpload(ctx context.Context, uploadID string, fileSize int64, rawTags map[string]string, completedPartCount int32, ossObjectEtag string, partSizeBytes int64) error { authHeader, err := c.getAuthHeader(ctx) if err != nil { return err @@ -226,6 +226,7 @@ func (c *GatewayClient) CompleteUpload(ctx context.Context, uploadID string, fil RawTags: rawTags, CompletedPartCount: completedPartCount, OssObjectEtag: ossObjectEtag, + PartSizeBytes: partSizeBytes, }) return rpcErr }) diff --git a/internal/cloud/gateway_client_test.go b/internal/cloud/gateway_client_test.go new file mode 100644 index 0000000..e8ab36a --- /dev/null +++ b/internal/cloud/gateway_client_test.go @@ -0,0 +1,83 @@ +// SPDX-FileCopyrightText: 2026 ArcheBase +// +// SPDX-License-Identifier: MulanPSL-2.0 + +package cloud + +import ( + "context" + "net" + "testing" + "time" + + pb "archebase.com/keystone-edge/internal/cloud/cloudpb" + "google.golang.org/grpc" + "google.golang.org/grpc/credentials/insecure" + "google.golang.org/grpc/test/bufconn" +) + +type completeUploadCaptureServer struct { + pb.UnimplementedDataGatewayServiceServer + req *pb.CompleteUploadRequest +} + +func (s *completeUploadCaptureServer) CompleteUpload(_ context.Context, req *pb.CompleteUploadRequest) (*pb.CompleteUploadResponse, error) { + s.req = req + return &pb.CompleteUploadResponse{}, nil +} + +func TestGatewayClientCompleteUploadSendsPartSizeBytes(t *testing.T) { + listener := bufconn.Listen(1024 * 1024) + server := grpc.NewServer() + capture := &completeUploadCaptureServer{} + pb.RegisterDataGatewayServiceServer(server, capture) + go func() { + if err := server.Serve(listener); err != nil { + t.Logf("bufconn server exited: %v", err) + } + }() + t.Cleanup(func() { + server.Stop() + _ = listener.Close() + }) + + ctx, cancel 
:= context.WithTimeout(context.Background(), time.Second) + defer cancel() + conn, err := grpc.DialContext(ctx, "bufnet", //nolint:staticcheck // bufconn tests still use DialContext. + grpc.WithContextDialer(func(context.Context, string) (net.Conn, error) { + return listener.Dial() + }), + grpc.WithTransportCredentials(insecure.NewCredentials()), + ) + if err != nil { + t.Fatalf("dial bufconn: %v", err) + } + t.Cleanup(func() { _ = conn.Close() }) + + authClient := &AuthClient{ + token: &AuthToken{ + AccessToken: "test-token", + ExpiresAt: time.Now().Add(time.Hour), + }, + } + client := &GatewayClient{ + cfg: GatewayClientConfig{ + RequestTimeout: time.Second, + }, + authClient: authClient, + conn: conn, + } + + if err := client.CompleteUpload(ctx, "upload-1", 1234, map[string]string{"k": "v"}, 2, `"etag"`, 8*1024*1024); err != nil { + t.Fatalf("CompleteUpload() error = %v", err) + } + if capture.req == nil { + t.Fatal("CompleteUpload request was not captured") + } + if capture.req.PartSizeBytes != 8*1024*1024 { + t.Fatalf("PartSizeBytes=%d want %d", capture.req.PartSizeBytes, 8*1024*1024) + } + if capture.req.RawTags["k"] != "v" { + t.Fatalf("RawTags=%+v", capture.req.RawTags) + } +} diff --git a/internal/cloud/uploader.go b/internal/cloud/uploader.go index e7cd01c..9fdbf98 100644 --- a/internal/cloud/uploader.go +++ b/internal/cloud/uploader.go @@ -13,6 +13,7 @@ import ( "math" "os" "path/filepath" + "strings" "time" pb "archebase.com/keystone-edge/internal/cloud/cloudpb" @@ -42,6 +43,8 @@ type UploadRequest struct { EpisodeID string // McapKey is the MinIO object key for the MCAP file (without bucket prefix). McapKey string + // AssetID is the Data Platform device id used for this upload. + AssetID string // RawTags are arbitrary key-value tags passed to the data-gateway. RawTags map[string]string // ClientHints are passed to CreateLogicalUpload for server-side routing. 
@@ -69,6 +72,7 @@ type persistedUploadState struct { Endpoint string `json:"endpoint"` ObjectKey string `json:"object_key"` McapKey string `json:"mcap_key"` + AssetID string `json:"asset_id"` FileSize int64 `json:"file_size"` UpdatedAt time.Time `json:"updated_at"` } @@ -99,7 +103,7 @@ type gatewayClient interface { GetUploadRecovery(ctx context.Context, logicalUploadID string) (*UploadRecoveryInfo, error) ReissueUploadCredentials(ctx context.Context, uploadID string) (*UploadSession, error) AbortUpload(ctx context.Context, logicalUploadID string, reason string) error - CompleteUpload(ctx context.Context, uploadID string, fileSize int64, rawTags map[string]string, completedPartCount int32, ossObjectEtag string) error + CompleteUpload(ctx context.Context, uploadID string, fileSize int64, rawTags map[string]string, completedPartCount int32, ossObjectEtag string, partSizeBytes int64) error } // ossClient is the subset of OSSUploader methods used by Uploader. @@ -171,8 +175,18 @@ func (u *Uploader) validatePersistDir() error { // It uses context.Background() as base to ensure the abort is independent of the // caller's context, but with a 30s timeout to prevent indefinite hanging. 
func (u *Uploader) abortMultipartUpload(session *UploadSession, multipartUploadID string) { + if session == nil { + logger.Printf("[CLOUD-UPLOAD] Warning: skip OSS abort for multipart_upload_id=%s: missing upload session", multipartUploadID) + return + } abortCtx, cancel := context.WithTimeout(context.Background(), 30*time.Second) defer cancel() + refreshed, err := u.ensureFreshUploadCredentials(abortCtx, session) + if err != nil { + logger.Printf("[CLOUD-UPLOAD] Warning: refresh credentials before abort failed (proceeding anyway): %v", err) + } else { + session = refreshed + } u.oss.AbortMultipartUpload(abortCtx, session, multipartUploadID) } @@ -205,7 +219,7 @@ func (u *Uploader) Upload(ctx context.Context, req UploadRequest) (*UploadResult logger.Printf("[CLOUD-UPLOAD] Starting upload: episode=%s mcap=%s size=%d", req.EpisodeID, req.McapKey, fileSize) // Step 2: Prepare upload session (with recovery if persisted state exists) - prepared, err := u.prepareUploadSession(ctx, hints, req.McapKey, fileSize) + prepared, err := u.prepareUploadSession(ctx, hints, req.McapKey, req.AssetID, fileSize) if err != nil { return nil, fmt.Errorf("prepare upload session: %w", err) } @@ -219,7 +233,7 @@ func (u *Uploader) Upload(ctx context.Context, req UploadRequest) (*UploadResult if prepared.ossCompleteETag != "" { logger.Printf("[CLOUD-UPLOAD] OSS object already verified (COMPLETE_ONLY): logical_upload_id=%s etag=%s parts=%d", session.LogicalUploadID, prepared.ossCompleteETag, prepared.ossCompletePartCount) - if err := u.gateway.CompleteUpload(ctx, session.UploadID, fileSize, req.RawTags, prepared.ossCompletePartCount, prepared.ossCompleteETag); err != nil { + if err := u.gateway.CompleteUpload(ctx, session.UploadID, fileSize, req.RawTags, prepared.ossCompletePartCount, prepared.ossCompleteETag, session.PartSizeBytes); err != nil { return nil, fmt.Errorf("complete upload on gateway (oss-already-complete): %w", err) } u.cleanupPersistedState(session.LogicalUploadID) @@ -246,6 
+260,7 @@ func (u *Uploader) Upload(ctx context.Context, req UploadRequest) (*UploadResult Endpoint: session.Endpoint, ObjectKey: session.ObjectKey, McapKey: req.McapKey, + AssetID: req.AssetID, FileSize: fileSize, UpdatedAt: time.Now(), }); err != nil { @@ -269,7 +284,7 @@ func (u *Uploader) Upload(ctx context.Context, req UploadRequest) (*UploadResult // InitiateMultipartUpload succeeds, before streaming any parts. This requires splitting // uploadParts into an initiate step (called here, result persisted) and a stream step. // The Rust SDK has the same gap; defer fixing until the upstream SDK is updated. - multipartUploadID, parts, partMD5s, err := u.uploadParts(ctx, req, session, fileSize) + session, multipartUploadID, parts, partMD5s, err := u.uploadParts(ctx, req, session, fileSize) if err != nil { return nil, err } @@ -285,6 +300,7 @@ func (u *Uploader) Upload(ctx context.Context, req UploadRequest) (*UploadResult Endpoint: session.Endpoint, ObjectKey: session.ObjectKey, McapKey: req.McapKey, + AssetID: req.AssetID, FileSize: fileSize, UpdatedAt: time.Now(), }); err != nil { @@ -309,7 +325,7 @@ func (u *Uploader) Upload(ctx context.Context, req UploadRequest) (*UploadResult return nil, fmt.Errorf("too many upload parts: %d", len(parts)) } //nolint:gosec // G115: len(parts) validated to fit into int32 above - if err := u.gateway.CompleteUpload(ctx, session.UploadID, fileSize, req.RawTags, int32(len(parts)), localETag); err != nil { + if err := u.gateway.CompleteUpload(ctx, session.UploadID, fileSize, req.RawTags, int32(len(parts)), localETag, session.PartSizeBytes); err != nil { return nil, fmt.Errorf("complete upload on gateway: %w", err) } @@ -341,8 +357,8 @@ type preparedSession struct { // prepareUploadSession checks for persisted state and either resumes or creates a new session. // It mirrors the Rust SDK's prepare_upload_session logic. 
-func (u *Uploader) prepareUploadSession(ctx context.Context, clientHints map[string]string, mcapKey string, fileSize int64) (preparedSession, error) { - state, err := u.findPersistedStateByKey(mcapKey) +func (u *Uploader) prepareUploadSession(ctx context.Context, clientHints map[string]string, mcapKey string, assetID string, fileSize int64) (preparedSession, error) { + state, err := u.findPersistedStateByKey(mcapKey, assetID) if err != nil { return preparedSession{}, fmt.Errorf("load persisted state: %w", err) } @@ -397,6 +413,7 @@ func (u *Uploader) prepareUploadSession(ctx context.Context, clientHints map[str Endpoint: newSession.Endpoint, ObjectKey: newSession.ObjectKey, McapKey: mcapKey, + AssetID: assetID, FileSize: fileSize, UpdatedAt: time.Now(), }); err != nil { @@ -537,32 +554,79 @@ func (u *Uploader) reconcileCompletedObject(ctx context.Context, session *Upload return reconcileRestart, nil } +// partStreamFactory opens a stream for a specific byte range of the MCAP file. +// Each call returns an independent io.ReadCloser so that connections are not +// kept idle across part uploads. +type partStreamFactory func(ctx context.Context, offset, length int64) (io.ReadCloser, error) + +// minioRangeReader returns a partStreamFactory that reads byte ranges from +// MinIO using independent ranged GetObject requests. +func (u *Uploader) minioRangeReader(key string) partStreamFactory { + return func(ctx context.Context, offset, length int64) (io.ReadCloser, error) { + opts := minio.GetObjectOptions{} + if err := opts.SetRange(offset, offset+length-1); err != nil { + return nil, fmt.Errorf("set range %d-%d: %w", offset, offset+length-1, err) + } + obj, err := u.minioClient.GetObject(ctx, u.minioBucket, key, opts) + if err != nil { + return nil, fmt.Errorf("get minio object range %d-%d: %w", offset, offset+length-1, err) + } + return obj, nil + } +} + // uploadParts streams the MCAP from MinIO and uploads it to OSS in parts. 
// Returns the OSS multipart upload ID, the list of uploaded parts, per-part MD5 digests, and any error. -func (u *Uploader) uploadParts(ctx context.Context, req UploadRequest, session *UploadSession, fileSize int64) (string, []UploadedPart, [][16]byte, error) { +func (u *Uploader) uploadParts(ctx context.Context, req UploadRequest, session *UploadSession, fileSize int64) (*UploadSession, string, []UploadedPart, [][16]byte, error) { + session, err := u.ensureFreshUploadCredentials(ctx, session) + if err != nil { + return nil, "", nil, nil, fmt.Errorf("refresh credentials before initiate multipart upload: %w", err) + } + // Initiate multipart upload on OSS multipartUploadID, err := u.oss.InitiateMultipartUpload(ctx, session) if err != nil { - return "", nil, nil, fmt.Errorf("initiate multipart upload: %w", err) + return nil, "", nil, nil, fmt.Errorf("initiate multipart upload: %w", err) } logger.Printf("[CLOUD-UPLOAD] Multipart initiated: multipart_upload_id=%s", multipartUploadID) - // Stream from MinIO → OSS in parts - mcapStream, err := u.minioClient.GetObject(ctx, u.minioBucket, req.McapKey, minio.GetObjectOptions{}) + // Stream from MinIO to OSS in parts. + // Each part uses an independent ranged GetObject so that the MinIO HTTP + // connection is not left idle during OSS part uploads. A single streaming + // response would risk idle connection timeout (~20-25s on MinIO or network + // intermediaries) when upload speed is slow. 
+ session, parts, partMD5s, err := u.streamMultipartParts(ctx, req.EpisodeID, session, multipartUploadID, fileSize, u.minioRangeReader(req.McapKey)) if err != nil { u.abortMultipartUpload(session, multipartUploadID) - return "", nil, nil, fmt.Errorf("get minio object %s: %w", req.McapKey, err) + return nil, "", nil, nil, err } - defer func() { - _ = mcapStream.Close() - }() + session, err = u.ensureFreshUploadCredentials(ctx, session) + if err != nil { + u.abortMultipartUpload(session, multipartUploadID) + return nil, "", nil, nil, fmt.Errorf("refresh credentials before complete multipart upload: %w", err) + } + + // Complete multipart upload on OSS + if _, err := u.oss.CompleteMultipartUpload(ctx, session, multipartUploadID, parts); err != nil { + u.abortMultipartUpload(session, multipartUploadID) + return nil, "", nil, nil, fmt.Errorf("complete multipart upload on OSS: %w", err) + } + + return session, multipartUploadID, parts, partMD5s, nil +} + +func (u *Uploader) streamMultipartParts(ctx context.Context, episodeID string, session *UploadSession, multipartUploadID string, fileSize int64, newPartStream partStreamFactory) (*UploadSession, []UploadedPart, [][16]byte, error) { partSizeBytes := session.PartSizeBytes if partSizeBytes <= 0 { partSizeBytes = 8 * 1024 * 1024 // 8MB default } + partSize := int(partSizeBytes) + if int64(partSize) != partSizeBytes { + return session, nil, nil, fmt.Errorf("invalid part_size_bytes %d", partSizeBytes) + } - buf := make([]byte, partSizeBytes) + buf := make([]byte, partSize) var parts []UploadedPart var partMD5s [][16]byte var offset int64 @@ -570,8 +634,7 @@ func (u *Uploader) uploadParts(ctx context.Context, req UploadRequest, session * for offset < fileSize { if err := ctx.Err(); err != nil { - u.abortMultipartUpload(session, multipartUploadID) - return "", nil, nil, err + return session, nil, nil, err } remaining := fileSize - offset @@ -580,19 +643,43 @@ func (u *Uploader) uploadParts(ctx context.Context, req 
UploadRequest, session * readSize = remaining } - n, readErr := io.ReadFull(mcapStream, buf[:readSize]) - if readErr != nil && readErr != io.ErrUnexpectedEOF { - u.abortMultipartUpload(session, multipartUploadID) - return "", nil, nil, fmt.Errorf("read part %d from minio: %w", partNumber, readErr) + // Open a new connection for each part so that the MinIO stream is not + // left idle during OSS uploads. MinIO or intervening network equipment + // may drop idle streaming connections after ~20-25s, and the OSS upload + // between part reads can easily exceed this threshold on slow networks. + partStream, err := newPartStream(ctx, offset, readSize) + if err != nil { + return session, nil, nil, fmt.Errorf("open part %d stream at offset %d: %w", partNumber, offset, err) + } + + n, readErr := io.ReadFull(partStream, buf[:int(readSize)]) + _ = partStream.Close() // close ASAP, best-effort + if readErr != nil { + return session, nil, nil, fmt.Errorf("read part %d from minio: expected %d bytes, got %d: %w", partNumber, readSize, n, readErr) + } + if int64(n) != readSize { + return session, nil, nil, fmt.Errorf("read part %d from minio: expected %d bytes, got %d", partNumber, readSize, n) } partSlice := buf[:n] partMD5s = append(partMD5s, MD5DigestBytes(partSlice)) + session, err = u.ensureFreshUploadCredentials(ctx, session) + if err != nil { + return session, nil, nil, fmt.Errorf("refresh credentials before upload part %d: %w", partNumber, err) + } + etag, err := u.oss.UploadPart(ctx, session, multipartUploadID, partNumber, partSlice) + if err != nil && isSecurityTokenExpiredError(err) { + refreshed, refreshErr := u.refreshUploadCredentials(ctx, session) + if refreshErr != nil { + return session, nil, nil, fmt.Errorf("refresh credentials after upload part %d token expiry: %w", partNumber, refreshErr) + } + session = refreshed + etag, err = u.oss.UploadPart(ctx, session, multipartUploadID, partNumber, partSlice) + } if err != nil { - u.abortMultipartUpload(session, 
multipartUploadID) - return "", nil, nil, fmt.Errorf("upload part %d: %w", partNumber, err) + return session, nil, nil, fmt.Errorf("upload part %d: %w", partNumber, err) } parts = append(parts, UploadedPart{ @@ -603,19 +690,47 @@ func (u *Uploader) uploadParts(ctx context.Context, req UploadRequest, session * offset += int64(n) partNumber++ - if partNumber%10 == 0 { - logger.Printf("[CLOUD-UPLOAD] Progress: episode=%s parts=%d offset=%d/%d", - req.EpisodeID, len(parts), offset, fileSize) - } + logger.Printf("[CLOUD-UPLOAD] Progress: episode=%s parts=%d offset=%d/%d", + episodeID, len(parts), offset, fileSize) } - // Complete multipart upload on OSS - if _, err := u.oss.CompleteMultipartUpload(ctx, session, multipartUploadID, parts); err != nil { - u.abortMultipartUpload(session, multipartUploadID) - return "", nil, nil, fmt.Errorf("complete multipart upload on OSS: %w", err) + return session, parts, partMD5s, nil +} + +func (u *Uploader) ensureFreshUploadCredentials(ctx context.Context, session *UploadSession) (*UploadSession, error) { + if session == nil { + return nil, fmt.Errorf("missing upload session") + } + if time.Until(session.STSExpireAt) > u.stsRefreshWindow() { + return session, nil } + return u.refreshUploadCredentials(ctx, session) +} + +func (u *Uploader) refreshUploadCredentials(ctx context.Context, session *UploadSession) (*UploadSession, error) { + if u.gateway == nil { + return nil, fmt.Errorf("gateway client is not configured") + } + refreshed, err := u.gateway.ReissueUploadCredentials(ctx, session.UploadID) + if err != nil { + return nil, err + } + return refreshed, nil +} + +func (u *Uploader) stsRefreshWindow() time.Duration { + window := u.cfg.RequestTimeout + if u.cfg.OSSTimeout > window { + window = u.cfg.OSSTimeout + } + if window <= 0 { + window = 30 * time.Second + } + return window + 30*time.Second +} - return multipartUploadID, parts, partMD5s, nil +func isSecurityTokenExpiredError(err error) bool { + return err != nil && 
strings.Contains(err.Error(), "SecurityTokenExpired") } // abortAndCleanupSession notifies the data-gateway to abort the logical upload session @@ -680,8 +795,10 @@ func (u *Uploader) cleanupPersistedState(logicalUploadID string) { } } -// findPersistedStateByKey scans the active state directory for a state matching the given mcap key. -func (u *Uploader) findPersistedStateByKey(mcapKey string) (*persistedUploadState, error) { +// findPersistedStateByKey scans the active state directory for a state matching the given +// MCAP key and asset id. Upload sessions are device-scoped and must not be reused +// across different Data Platform devices even when the MCAP object key is identical. +func (u *Uploader) findPersistedStateByKey(mcapKey string, assetID string) (*persistedUploadState, error) { if u.cfg.PersistRootDir == "" { return nil, nil } @@ -707,7 +824,7 @@ func (u *Uploader) findPersistedStateByKey(mcapKey string) (*persistedUploadStat logger.Printf("[CLOUD-UPLOAD] Warning: failed to parse state file %s: %v", entry.Name(), err) continue } - if state.McapKey == mcapKey { + if state.McapKey == mcapKey && state.AssetID == assetID { return &state, nil } } diff --git a/internal/cloud/uploader_test.go b/internal/cloud/uploader_test.go index b510f5f..6d38df8 100644 --- a/internal/cloud/uploader_test.go +++ b/internal/cloud/uploader_test.go @@ -5,11 +5,14 @@ package cloud import ( + "bytes" "context" "encoding/json" "errors" + "io" "os" "path/filepath" + "strings" "testing" "time" @@ -122,11 +125,12 @@ func TestFindPersistedStateByKey(t *testing.T) { LogicalUploadID: "logical-find-test", UploadID: "upload-find-test", McapKey: "episodes/7/find.mcap", + AssetID: "asset-a", FileSize: 256, UpdatedAt: time.Now(), }) - got, err := u.findPersistedStateByKey("episodes/7/find.mcap") + got, err := u.findPersistedStateByKey("episodes/7/find.mcap", "asset-a") if err != nil { t.Fatalf("findPersistedStateByKey: %v", err) } @@ -138,12 +142,36 @@ func TestFindPersistedStateByKey(t 
*testing.T) { } } +func TestFindPersistedStateByKey_DoesNotReuseDifferentAssetID(t *testing.T) { + dir := t.TempDir() + u := newTestUploader(dir) + + activeDir := filepath.Join(dir, "data-gateway-client", "uploads", "active") + writeTempState(t, activeDir, &persistedUploadState{ + Version: 1, + LogicalUploadID: "logical-device-a", + UploadID: "upload-device-a", + McapKey: "episodes/7/find.mcap", + AssetID: "asset-a", + FileSize: 256, + UpdatedAt: time.Now(), + }) + + got, err := u.findPersistedStateByKey("episodes/7/find.mcap", "asset-b") + if err != nil { + t.Fatalf("findPersistedStateByKey: %v", err) + } + if got != nil { + t.Fatalf("expected nil for different AssetID, got %+v", got) + } +} + // TestFindPersistedStateByKey_NotFound verifies nil is returned for unknown keys. func TestFindPersistedStateByKey_NotFound(t *testing.T) { dir := t.TempDir() u := newTestUploader(dir) - got, err := u.findPersistedStateByKey("episodes/99/missing.mcap") + got, err := u.findPersistedStateByKey("episodes/99/missing.mcap", "asset-a") if err != nil { t.Fatalf("unexpected error: %v", err) } @@ -156,7 +184,7 @@ func TestFindPersistedStateByKey_NotFound(t *testing.T) { func TestFindPersistedStateByKey_EmptyPersistRootDir(t *testing.T) { u := newTestUploader("") - got, err := u.findPersistedStateByKey("episodes/1/file.mcap") + got, err := u.findPersistedStateByKey("episodes/1/file.mcap", "asset-a") if err != nil { t.Fatalf("unexpected error: %v", err) } @@ -197,6 +225,7 @@ func TestPersistedStateRoundTrip(t *testing.T) { Endpoint: "https://oss.example.com", ObjectKey: "uploads/1/test", McapKey: "episodes/1/test.mcap", + AssetID: "asset-a", FileSize: 4096, UpdatedAt: now, } @@ -225,6 +254,9 @@ func TestPersistedStateRoundTrip(t *testing.T) { if decoded.McapKey != original.McapKey { t.Errorf("McapKey = %q, want %q", decoded.McapKey, original.McapKey) } + if decoded.AssetID != original.AssetID { + t.Errorf("AssetID = %q, want %q", decoded.AssetID, original.AssetID) + } } // 
TestPrepareUploadSession_PermanentFailure_FileSizeMismatch verifies that a persisted state @@ -241,6 +273,7 @@ func TestPrepareUploadSession_PermanentFailure_FileSizeMismatch(t *testing.T) { LogicalUploadID: "logical-size-mismatch", UploadID: "upload-size-mismatch", McapKey: "episodes/1/mismatch.mcap", + AssetID: "asset-a", FileSize: 1024, // persisted as 1024 UpdatedAt: time.Now(), }) @@ -250,6 +283,7 @@ func TestPrepareUploadSession_PermanentFailure_FileSizeMismatch(t *testing.T) { context.Background(), map[string]string{}, "episodes/1/mismatch.mcap", + "asset-a", 512, // actual size differs ) if err == nil { @@ -271,6 +305,7 @@ func TestPrepareUploadSession_PermanentFailure_CleanupOnSizeMismatch(t *testing. LogicalUploadID: "logical-cleanup-mismatch", UploadID: "upload-cleanup-mismatch", McapKey: "episodes/2/cleanup.mcap", + AssetID: "asset-a", FileSize: 1024, // persisted size RestartCount: 0, UpdatedAt: time.Now(), @@ -280,6 +315,7 @@ func TestPrepareUploadSession_PermanentFailure_CleanupOnSizeMismatch(t *testing. 
context.Background(), map[string]string{}, "episodes/2/cleanup.mcap", + "asset-a", 512, // different from persisted ) if err == nil { @@ -326,13 +362,14 @@ func TestPrepareUploadSession_Restart_OldStatePreservedOnRPCFailure(t *testing.T LogicalUploadID: "logical-old", UploadID: "upload-old", McapKey: "episodes/10/restart-rpc-fail.mcap", + AssetID: "asset-a", FileSize: 512, RestartCount: 0, UpdatedAt: time.Now(), }) u := newDecideResumeUploader(dir, gw, &fakeOSS{}) - _, err := u.prepareUploadSession(context.Background(), map[string]string{}, "episodes/10/restart-rpc-fail.mcap", 512) + _, err := u.prepareUploadSession(context.Background(), map[string]string{}, "episodes/10/restart-rpc-fail.mcap", "asset-a", 512) if err == nil { t.Fatal("expected error when CreateLogicalUpload fails, got nil") } @@ -374,13 +411,14 @@ func TestPrepareUploadSession_Restart_NewStatePersisted_OldStateRemoved(t *testi LogicalUploadID: "logical-old", UploadID: "upload-old", McapKey: "episodes/11/restart-ok.mcap", + AssetID: "asset-a", FileSize: 512, RestartCount: 0, UpdatedAt: time.Now(), }) u := newDecideResumeUploader(dir, gw, &fakeOSS{}) - prepared, err := u.prepareUploadSession(context.Background(), map[string]string{}, "episodes/11/restart-ok.mcap", 512) + prepared, err := u.prepareUploadSession(context.Background(), map[string]string{}, "episodes/11/restart-ok.mcap", "asset-a", 512) if err != nil { t.Fatalf("unexpected error: %v", err) } @@ -528,7 +566,7 @@ func (f *fakeGateway) AbortUpload(_ context.Context, _ string, _ string) error { return nil } -func (f *fakeGateway) CompleteUpload(_ context.Context, _ string, _ int64, _ map[string]string, _ int32, _ string) error { +func (f *fakeGateway) CompleteUpload(_ context.Context, _ string, _ int64, _ map[string]string, _ int32, _ string, _ int64) error { panic("fakeGateway.CompleteUpload called unexpectedly") } @@ -538,6 +576,8 @@ type fakeOSS struct { listPartsFn func(ctx context.Context, session *UploadSession, multipartUploadID 
string) error // headObjectETagFn is called by HeadObjectETag; must be set for tests that reach it. headObjectETagFn func(ctx context.Context, session *UploadSession) (string, error) + // uploadPartFn is called by UploadPart; must be set for tests that reach it. + uploadPartFn func(ctx context.Context, session *UploadSession, multipartUploadID string, partNumber int, body []byte) (string, error) } func (f *fakeOSS) ListParts(ctx context.Context, session *UploadSession, multipartUploadID string) error { @@ -558,8 +598,11 @@ func (f *fakeOSS) InitiateMultipartUpload(_ context.Context, _ *UploadSession) ( panic("fakeOSS.InitiateMultipartUpload called unexpectedly") } -func (f *fakeOSS) UploadPart(_ context.Context, _ *UploadSession, _ string, _ int, _ []byte) (string, error) { - panic("fakeOSS.UploadPart called unexpectedly") +func (f *fakeOSS) UploadPart(ctx context.Context, session *UploadSession, multipartUploadID string, partNumber int, body []byte) (string, error) { + if f.uploadPartFn == nil { + panic("fakeOSS.UploadPart called unexpectedly") + } + return f.uploadPartFn(ctx, session, multipartUploadID, partNumber, body) } func (f *fakeOSS) CompleteMultipartUpload(_ context.Context, _ *UploadSession, _ string, _ []UploadedPart) (string, error) { @@ -596,6 +639,204 @@ func makeSession(logicalID, uploadID string) *UploadSession { } } +func TestStreamMultipartParts_UploadsExpectedPartBoundaries(t *testing.T) { + var gotPartNumbers []int + var gotSizes []int + oss := &fakeOSS{ + uploadPartFn: func(_ context.Context, _ *UploadSession, _ string, partNumber int, body []byte) (string, error) { + gotPartNumbers = append(gotPartNumbers, partNumber) + gotSizes = append(gotSizes, len(body)) + return "etag", nil + }, + } + u := newDecideResumeUploader("", &fakeGateway{}, oss) + session := makeSession("logical-stream", "upload-stream") + session.PartSizeBytes = 10 + + payload := []byte("abcdefghijklmnopqrstuvwxy") + factory := func(_ context.Context, offset, length int64) 
(io.ReadCloser, error) { + end := int(offset + length) + if end > len(payload) { + end = len(payload) + } + return io.NopCloser(bytes.NewReader(payload[offset:end])), nil + } + _, parts, partMD5s, err := u.streamMultipartParts( + context.Background(), + "episode-stream", + session, + "multipart-stream", + int64(len(payload)), + factory, + ) + if err != nil { + t.Fatalf("streamMultipartParts() error = %v", err) + } + if len(parts) != 3 { + t.Fatalf("uploaded part count = %d, want 3", len(parts)) + } + if len(partMD5s) != 3 { + t.Fatalf("part MD5 count = %d, want 3", len(partMD5s)) + } + + wantPartNumbers := []int{1, 2, 3} + wantSizes := []int{10, 10, 5} + for i := range wantPartNumbers { + if gotPartNumbers[i] != wantPartNumbers[i] { + t.Fatalf("part number[%d] = %d, want %d", i, gotPartNumbers[i], wantPartNumbers[i]) + } + if gotSizes[i] != wantSizes[i] { + t.Fatalf("part size[%d] = %d, want %d", i, gotSizes[i], wantSizes[i]) + } + } +} + +func TestStreamMultipartParts_EarlyEOFStopsInsteadOfUploadingEmptyParts(t *testing.T) { + var uploadedPartNumbers []int + oss := &fakeOSS{ + uploadPartFn: func(_ context.Context, _ *UploadSession, _ string, partNumber int, body []byte) (string, error) { + uploadedPartNumbers = append(uploadedPartNumbers, partNumber) + if len(body) == 0 { + t.Fatalf("uploaded empty part %d", partNumber) + } + return "etag", nil + }, + } + u := newDecideResumeUploader("", &fakeGateway{}, oss) + session := makeSession("logical-short", "upload-short") + session.PartSizeBytes = 10 + + payload := []byte("abcdefghijkl") // 12 bytes — part 2 will fail with short read + factory := func(_ context.Context, offset, length int64) (io.ReadCloser, error) { + end := int(offset + length) + if end > len(payload) { + end = len(payload) + } + return io.NopCloser(bytes.NewReader(payload[offset:end])), nil + } + + _, _, _, err := u.streamMultipartParts( + context.Background(), + "episode-short", + session, + "multipart-short", + 25, + factory, + ) + if err == nil { + 
t.Fatal("expected error for early EOF, got nil") + } + if !strings.Contains(err.Error(), "expected 10 bytes, got 2") { + t.Fatalf("error = %q, want short read details", err.Error()) + } + if len(uploadedPartNumbers) != 1 || uploadedPartNumbers[0] != 1 { + t.Fatalf("uploaded parts = %v, want only first complete part", uploadedPartNumbers) + } +} + +func TestStreamMultipartParts_RefreshesCredentialsBeforeUploadPart(t *testing.T) { + var reissueCalls int + gw := &fakeGateway{ + reissueCredentialsFn: func(_ context.Context, uploadID string) (*UploadSession, error) { + reissueCalls++ + if uploadID != "upload-expiring" { + t.Fatalf("uploadID = %q, want upload-expiring", uploadID) + } + refreshed := makeSession("logical-expiring", uploadID) + refreshed.STSAccessKeyID = "fresh-key" + return refreshed, nil + }, + } + + var usedAccessKeyID string + oss := &fakeOSS{ + uploadPartFn: func(_ context.Context, session *UploadSession, _ string, _ int, _ []byte) (string, error) { + usedAccessKeyID = session.STSAccessKeyID + return "etag", nil + }, + } + u := newDecideResumeUploader("", gw, oss) + session := makeSession("logical-expiring", "upload-expiring") + session.STSAccessKeyID = "stale-key" + session.STSExpireAt = time.Now().Add(10 * time.Second) + session.PartSizeBytes = 4 + + payload := []byte("abcd") + factory := func(_ context.Context, offset, length int64) (io.ReadCloser, error) { + return io.NopCloser(bytes.NewReader(payload[offset : offset+length])), nil + } + + _, parts, _, err := u.streamMultipartParts(context.Background(), "episode-expiring", session, "multipart-expiring", int64(len(payload)), factory) + if err != nil { + t.Fatalf("streamMultipartParts() error = %v", err) + } + if len(parts) != 1 { + t.Fatalf("uploaded part count = %d, want 1", len(parts)) + } + if reissueCalls != 1 { + t.Fatalf("ReissueUploadCredentials calls = %d, want 1", reissueCalls) + } + if usedAccessKeyID != "fresh-key" { + t.Fatalf("UploadPart access key = %q, want fresh-key", 
usedAccessKeyID) + } +} + +func TestStreamMultipartParts_RetriesCurrentPartAfterSecurityTokenExpired(t *testing.T) { + var reissueCalls int + gw := &fakeGateway{ + reissueCredentialsFn: func(_ context.Context, uploadID string) (*UploadSession, error) { + reissueCalls++ + refreshed := makeSession("logical-retry", uploadID) + refreshed.STSAccessKeyID = "fresh-key" + return refreshed, nil + }, + } + + var uploadPartCalls int + var partNumbers []int + var usedAccessKeyIDs []string + oss := &fakeOSS{ + uploadPartFn: func(_ context.Context, session *UploadSession, _ string, partNumber int, _ []byte) (string, error) { + uploadPartCalls++ + partNumbers = append(partNumbers, partNumber) + usedAccessKeyIDs = append(usedAccessKeyIDs, session.STSAccessKeyID) + if uploadPartCalls == 1 { + return "", errors.New("oss returned status 403: SecurityTokenExpired") + } + return "etag", nil + }, + } + u := newDecideResumeUploader("", gw, oss) + session := makeSession("logical-retry", "upload-retry") + session.STSAccessKeyID = "stale-key" + session.PartSizeBytes = 4 + + payload := []byte("abcd") + factory := func(_ context.Context, offset, length int64) (io.ReadCloser, error) { + return io.NopCloser(bytes.NewReader(payload[offset : offset+length])), nil + } + + _, parts, _, err := u.streamMultipartParts(context.Background(), "episode-retry", session, "multipart-retry", int64(len(payload)), factory) + if err != nil { + t.Fatalf("streamMultipartParts() error = %v", err) + } + if len(parts) != 1 { + t.Fatalf("uploaded part count = %d, want 1", len(parts)) + } + if reissueCalls != 1 { + t.Fatalf("ReissueUploadCredentials calls = %d, want 1", reissueCalls) + } + if uploadPartCalls != 2 { + t.Fatalf("UploadPart calls = %d, want 2", uploadPartCalls) + } + if partNumbers[0] != 1 || partNumbers[1] != 1 { + t.Fatalf("part numbers = %v, want [1 1]", partNumbers) + } + if usedAccessKeyIDs[0] != "stale-key" || usedAccessKeyIDs[1] != "fresh-key" { + t.Fatalf("access keys = %v, want [stale-key 
fresh-key]", usedAccessKeyIDs) + } +} + // ============================================================================= // decideResumeAction unit tests // ============================================================================= diff --git a/internal/config/config.go b/internal/config/config.go index 0d56399..4df907e 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -8,7 +8,7 @@ package config import ( "fmt" "os" - "os/exec" + "path/filepath" "strconv" "strings" ) @@ -20,7 +20,6 @@ type Config struct { Storage StorageConfig QA QAConfig Sync SyncConfig - CLISync CLISyncConfig Auth AuthConfig Features FeaturesConfig Monitoring MonitoringConfig @@ -89,20 +88,7 @@ type SyncConfig struct { RetryJitterSec int // max additive jitter in seconds PersistRootDir string // root directory for persisting upload state across restarts; empty disables persistence MaxRestartCount int // max number of upload restarts before permanent failure; 0 uses uploader default (3) -} - -// CLISyncConfig controls the emergency dp CLI cloud sync sidepath. 
-type CLISyncConfig struct { - Enabled bool - DPBin string - DPConfigPath string - TempDir string - MaxConcurrent int - QueueSize int - TimeoutSec int - KeepTemp bool - MaxTags int - MaxTagBytes int + DPConfigPath string // data-platform config path for direct device-profile uploads } // FeaturesConfig feature flags configuration @@ -219,18 +205,7 @@ func Load() (*Config, error) { RetryJitterSec: getEnvInt("KEYSTONE_SYNC_RETRY_JITTER_SEC", 30), PersistRootDir: getEnv("KEYSTONE_SYNC_PERSIST_ROOT_DIR", ""), MaxRestartCount: getEnvInt("KEYSTONE_SYNC_MAX_RESTART_COUNT", 3), - }, - CLISync: CLISyncConfig{ - Enabled: getEnvBool("KEYSTONE_CLI_SYNC_ENABLED", false), - DPBin: getEnv("KEYSTONE_CLI_SYNC_DP_BIN", "dp"), - DPConfigPath: getEnv("KEYSTONE_CLI_SYNC_DP_CONFIG", ""), - TempDir: getEnv("KEYSTONE_CLI_SYNC_TEMP_DIR", "/var/lib/keystone/cli-sync"), - MaxConcurrent: getEnvInt("KEYSTONE_CLI_SYNC_MAX_CONCURRENT", 1), - QueueSize: getEnvInt("KEYSTONE_CLI_SYNC_QUEUE_SIZE", 16), - TimeoutSec: getEnvInt("KEYSTONE_CLI_SYNC_TIMEOUT_SEC", 7200), - KeepTemp: getEnvBool("KEYSTONE_CLI_SYNC_KEEP_TEMP", false), - MaxTags: getEnvInt("KEYSTONE_CLI_SYNC_MAX_TAGS", 128), - MaxTagBytes: getEnvInt("KEYSTONE_CLI_SYNC_MAX_TAG_BYTES", 65536), + DPConfigPath: getEnv("KEYSTONE_SYNC_DP_CONFIG", defaultDPConfigPath()), }, Auth: AuthConfig{ JWTSecret: getEnv("KEYSTONE_JWT_SECRET", ""), @@ -302,17 +277,18 @@ func (c *Config) Validate() error { return fmt.Errorf("KEYSTONE_ADMIN_USERNAME and KEYSTONE_ADMIN_PASSWORD must both be set or both be empty") } if c.Sync.Enabled { - if strings.TrimSpace(c.Sync.AuthEndpoint) == "" { - return fmt.Errorf("sync auth endpoint is required when sync is enabled") - } - if strings.TrimSpace(c.Sync.GatewayEndpoint) == "" { - return fmt.Errorf("sync gateway endpoint is required when sync is enabled") + c.Sync.DPConfigPath = strings.TrimSpace(c.Sync.DPConfigPath) + if c.Sync.DPConfigPath == "" { + return fmt.Errorf("KEYSTONE_SYNC_DP_CONFIG is required when sync is 
enabled") } - apiKey := strings.TrimSpace(c.Sync.APIKey) - if apiKey == "" { - return fmt.Errorf("KEYSTONE_CLOUD_API_KEY is required when sync is enabled") + expandedDPConfigPath, err := expandHomePath(c.Sync.DPConfigPath) + if err != nil { + return fmt.Errorf("KEYSTONE_SYNC_DP_CONFIG %q is invalid: %w", c.Sync.DPConfigPath, err) } - c.Sync.APIKey = apiKey + c.Sync.DPConfigPath = expandedDPConfigPath + c.Sync.AuthEndpoint = strings.TrimSpace(c.Sync.AuthEndpoint) + c.Sync.GatewayEndpoint = strings.TrimSpace(c.Sync.GatewayEndpoint) + c.Sync.APIKey = strings.TrimSpace(c.Sync.APIKey) if c.Sync.BatchSize <= 0 { return fmt.Errorf("sync batch size must be greater than 0 when sync is enabled") } @@ -347,45 +323,6 @@ func (c *Config) Validate() error { return fmt.Errorf("sync max restart count must be greater than or equal to 0 when sync is enabled") } } - if c.CLISync.Enabled { - c.CLISync.DPBin = strings.TrimSpace(c.CLISync.DPBin) - if c.CLISync.DPBin == "" { - return fmt.Errorf("KEYSTONE_CLI_SYNC_DP_BIN is required when CLI sync is enabled") - } - if _, err := exec.LookPath(c.CLISync.DPBin); err != nil { - return fmt.Errorf("KEYSTONE_CLI_SYNC_DP_BIN %q is not executable: %w", c.CLISync.DPBin, err) - } - c.CLISync.DPConfigPath = strings.TrimSpace(c.CLISync.DPConfigPath) - if c.CLISync.DPConfigPath == "" { - return fmt.Errorf("KEYSTONE_CLI_SYNC_DP_CONFIG is required when CLI sync is enabled") - } - info, err := os.Stat(c.CLISync.DPConfigPath) - if err != nil { - return fmt.Errorf("KEYSTONE_CLI_SYNC_DP_CONFIG %q is not readable: %w", c.CLISync.DPConfigPath, err) - } - if info.IsDir() { - return fmt.Errorf("KEYSTONE_CLI_SYNC_DP_CONFIG %q must be a file", c.CLISync.DPConfigPath) - } - c.CLISync.TempDir = strings.TrimSpace(c.CLISync.TempDir) - if c.CLISync.TempDir == "" { - return fmt.Errorf("KEYSTONE_CLI_SYNC_TEMP_DIR is required when CLI sync is enabled") - } - if c.CLISync.MaxConcurrent <= 0 { - return fmt.Errorf("KEYSTONE_CLI_SYNC_MAX_CONCURRENT must be greater than 0 when 
CLI sync is enabled") - } - if c.CLISync.QueueSize <= 0 { - return fmt.Errorf("KEYSTONE_CLI_SYNC_QUEUE_SIZE must be greater than 0 when CLI sync is enabled") - } - if c.CLISync.TimeoutSec <= 0 { - return fmt.Errorf("KEYSTONE_CLI_SYNC_TIMEOUT_SEC must be greater than 0 when CLI sync is enabled") - } - if c.CLISync.MaxTags <= 0 { - return fmt.Errorf("KEYSTONE_CLI_SYNC_MAX_TAGS must be greater than 0 when CLI sync is enabled") - } - if c.CLISync.MaxTagBytes <= 0 { - return fmt.Errorf("KEYSTONE_CLI_SYNC_MAX_TAG_BYTES must be greater than 0 when CLI sync is enabled") - } - } return nil } @@ -396,6 +333,28 @@ func getEnv(key, fallback string) string { return fallback } +func defaultDPConfigPath() string { + home, err := os.UserHomeDir() + if err != nil || strings.TrimSpace(home) == "" { + return "~/.archebase/config.json" + } + return filepath.Join(home, ".archebase", "config.json") +} + +func expandHomePath(path string) (string, error) { + if path != "~" && !strings.HasPrefix(path, "~/") { + return path, nil + } + home, err := os.UserHomeDir() + if err != nil || strings.TrimSpace(home) == "" { + return "", fmt.Errorf("home directory is not available") + } + if path == "~" { + return home, nil + } + return filepath.Join(home, strings.TrimPrefix(path, "~/")), nil +} + func getEnvInt(key string, fallback int) int { if val := os.Getenv(key); val != "" { if i, err := strconv.Atoi(val); err == nil { diff --git a/internal/config/config_test.go b/internal/config/config_test.go index dd918e6..4578975 100644 --- a/internal/config/config_test.go +++ b/internal/config/config_test.go @@ -7,6 +7,7 @@ package config import ( "os" + "path/filepath" "strings" "testing" ) @@ -21,6 +22,7 @@ func TestLoad(t *testing.T) { "KEYSTONE_MINIO_SECRET_KEY": os.Getenv("KEYSTONE_MINIO_SECRET_KEY"), "KEYSTONE_FACTORY_ID": os.Getenv("KEYSTONE_FACTORY_ID"), "KEYSTONE_SYNC_AUTO_SCAN_ENABLED": os.Getenv("KEYSTONE_SYNC_AUTO_SCAN_ENABLED"), + "KEYSTONE_SYNC_DP_CONFIG": os.Getenv("KEYSTONE_SYNC_DP_CONFIG"), 
} defer func() { // Restore original environment variables @@ -35,6 +37,7 @@ func TestLoad(t *testing.T) { // Set test environment variables os.Unsetenv("KEYSTONE_SYNC_AUTO_SCAN_ENABLED") + os.Unsetenv("KEYSTONE_SYNC_DP_CONFIG") os.Setenv("KEYSTONE_MYSQL_PASSWORD", "test-password") os.Setenv("KEYSTONE_MINIO_ACCESS_KEY", "test-access-key") os.Setenv("KEYSTONE_MINIO_SECRET_KEY", "test-secret-key") @@ -69,6 +72,13 @@ func TestLoad(t *testing.T) { if cfg.Sync.AutoScanEnabled { t.Error("Load().Sync.AutoScanEnabled should default to false") } + home, err := os.UserHomeDir() + if err != nil { + t.Fatalf("os.UserHomeDir() error = %v", err) + } + if cfg.Sync.DPConfigPath != filepath.Join(home, ".archebase", "config.json") { + t.Errorf("Load().Sync.DPConfigPath = %q, want default ~/.archebase/config.json", cfg.Sync.DPConfigPath) + } // Verify QA configuration if !cfg.QA.Enabled { @@ -270,7 +280,7 @@ func TestConfigValidate(t *testing.T) { } } -func TestValidateSyncAPIKey(t *testing.T) { +func TestValidateSyncDPConfig(t *testing.T) { validBase := Config{ Server: ServerConfig{Mode: "edge"}, Database: DatabaseConfig{DSN: "user:pass@tcp(localhost:3306)/db"}, @@ -278,7 +288,7 @@ func TestValidateSyncAPIKey(t *testing.T) { Auth: AuthConfig{JWTSecret: "jwt-secret"}, } - t.Run("sync disabled — no API key required", func(t *testing.T) { + t.Run("sync disabled — no DP config required", func(t *testing.T) { cfg := validBase cfg.Sync = SyncConfig{Enabled: false} if err := cfg.Validate(); err != nil { @@ -286,13 +296,11 @@ func TestValidateSyncAPIKey(t *testing.T) { } }) - t.Run("sync enabled — missing API key", func(t *testing.T) { + t.Run("sync enabled — missing DP config", func(t *testing.T) { cfg := validBase cfg.Sync = SyncConfig{ Enabled: true, - AuthEndpoint: "auth:443", - GatewayEndpoint: "gateway:443", - APIKey: "", + DPConfigPath: "", BatchSize: 10, MaxRetries: 5, MaxConcurrent: 2, @@ -302,18 +310,38 @@ func TestValidateSyncAPIKey(t *testing.T) { RetryBaseSec: 30, RetryMaxSec: 
1800, } - if err := cfg.Validate(); err == nil { - t.Error("Validate() expected error for missing API key, got nil") + if err := cfg.Validate(); err == nil || !strings.Contains(err.Error(), "KEYSTONE_SYNC_DP_CONFIG") { + t.Fatalf("Validate() error = %v, want KEYSTONE_SYNC_DP_CONFIG error", err) + } + }) + + t.Run("sync enabled — old cloud endpoint and API key are not required", func(t *testing.T) { + cfg := validBase + cfg.Sync = SyncConfig{ + Enabled: true, + DPConfigPath: "/etc/keystone/dp-config.json", + BatchSize: 10, + MaxRetries: 5, + MaxConcurrent: 2, + WorkerIntervalSec: 60, + RequestTimeoutSec: 30, + OSSTimeoutSec: 300, + RetryBaseSec: 30, + RetryMaxSec: 1800, + } + if err := cfg.Validate(); err != nil { + t.Fatalf("Validate() unexpected error = %v", err) + } + if cfg.Sync.AuthEndpoint != "" || cfg.Sync.GatewayEndpoint != "" || cfg.Sync.APIKey != "" { + t.Fatalf("legacy cloud config should remain optional and empty: %+v", cfg.Sync) } }) - t.Run("sync enabled — arbitrary opaque API key accepted", func(t *testing.T) { + t.Run("sync enabled — trims DP config whitespace", func(t *testing.T) { cfg := validBase cfg.Sync = SyncConfig{ Enabled: true, - AuthEndpoint: "auth:443", - GatewayEndpoint: "gateway:443", - APIKey: "notvalidbase64!!!", + DPConfigPath: " /etc/keystone/dp-config.json ", BatchSize: 10, MaxRetries: 5, MaxConcurrent: 2, @@ -326,18 +354,18 @@ func TestValidateSyncAPIKey(t *testing.T) { if err := cfg.Validate(); err != nil { t.Fatalf("Validate() unexpected error = %v", err) } - if cfg.Sync.APIKey != "notvalidbase64!!!" 
{ - t.Errorf("APIKey = %q, want %q", cfg.Sync.APIKey, "notvalidbase64!!!") + if cfg.Sync.DPConfigPath != "/etc/keystone/dp-config.json" { + t.Errorf("DPConfigPath = %q, want trimmed path", cfg.Sync.DPConfigPath) } }) - t.Run("sync enabled — trims API key whitespace", func(t *testing.T) { + t.Run("sync enabled — expands DP config home path", func(t *testing.T) { + home := t.TempDir() + t.Setenv("HOME", home) cfg := validBase cfg.Sync = SyncConfig{ Enabled: true, - AuthEndpoint: "auth:443", - GatewayEndpoint: "gateway:443", - APIKey: " cloud-issued-key ", + DPConfigPath: "~/.archebase/config.json", BatchSize: 10, MaxRetries: 5, MaxConcurrent: 2, @@ -350,8 +378,8 @@ func TestValidateSyncAPIKey(t *testing.T) { if err := cfg.Validate(); err != nil { t.Fatalf("Validate() unexpected error = %v", err) } - if cfg.Sync.APIKey != "cloud-issued-key" { - t.Errorf("APIKey = %q, want %q", cfg.Sync.APIKey, "cloud-issued-key") + if cfg.Sync.DPConfigPath != filepath.Join(home, ".archebase", "config.json") { + t.Errorf("DPConfigPath = %q, want expanded home path", cfg.Sync.DPConfigPath) } }) } diff --git a/internal/server/server.go b/internal/server/server.go index b2ff67d..30c65ed 100644 --- a/internal/server/server.go +++ b/internal/server/server.go @@ -59,7 +59,6 @@ type Server struct { productionDashboard *handlers.ProductionDashboardHandler syncHandler *handlers.SyncHandler syncWorker *services.SyncWorker - cliSyncRunner *services.CLISyncRunner httpServer *http.Server transferWSServer *http.Server recorderWSServer *http.Server @@ -77,8 +76,8 @@ func axonTransferWriteTimeout(cfg *config.TransferConfig) time.Duration { // New creates a new server instance. // db and s3Client are optional; pass nil to disable Verified ACK. -// syncWorker and cliSyncRunner are optional; pass nil to disable those sync APIs. 
-func New(cfg *config.Config, db *sqlx.DB, s3Client *s3.Client, syncWorker *services.SyncWorker, cliSyncRunner ...*services.CLISyncRunner) *Server { +// syncWorker is optional; pass nil to disable cloud sync APIs. +func New(cfg *config.Config, db *sqlx.DB, s3Client *s3.Client, syncWorker *services.SyncWorker) *Server { // Create Gin engine gin.SetMode(gin.ReleaseMode) engine := gin.New() @@ -157,12 +156,8 @@ func New(cfg *config.Config, db *sqlx.DB, s3Client *s3.Client, syncWorker *servi // Create SyncHandler for cloud sync API var syncHandler *handlers.SyncHandler - var cliRunner *services.CLISyncRunner - if len(cliSyncRunner) > 0 { - cliRunner = cliSyncRunner[0] - } if db != nil { - syncHandler = handlers.NewSyncHandler(db, syncWorker, cliRunner) + syncHandler = handlers.NewSyncHandler(db, syncWorker) } s := &Server{ @@ -193,7 +188,6 @@ func New(cfg *config.Config, db *sqlx.DB, s3Client *s3.Client, syncWorker *servi productionDashboard: productionDashboardHandler, syncHandler: syncHandler, syncWorker: syncWorker, - cliSyncRunner: cliRunner, engine: engine, } @@ -493,14 +487,6 @@ func (s *Server) Shutdown(ctx context.Context) error { } } } - if s.cliSyncRunner != nil { - if err := s.cliSyncRunner.Stop(ctx); err != nil { - logShutdownError("CLI sync runner", err) - if shutdownErr == nil { - shutdownErr = fmt.Errorf("CLI sync runner shutdown: %w", err) - } - } - } return shutdownErr } diff --git a/internal/services/cli_sync_runner.go b/internal/services/cli_sync_runner.go deleted file mode 100644 index 5760c9f..0000000 --- a/internal/services/cli_sync_runner.go +++ /dev/null @@ -1,887 +0,0 @@ -// SPDX-FileCopyrightText: 2026 ArcheBase -// -// SPDX-License-Identifier: MulanPSL-2.0 - -package services - -import ( - "bytes" - "context" - "database/sql" - "encoding/json" - "errors" - "fmt" - "io" - "os" - "os/exec" - "regexp" - "sort" - "strconv" - "strings" - "sync" - "sync/atomic" - "time" - - "archebase.com/keystone-edge/internal/logger" - 
"archebase.com/keystone-edge/internal/storage/s3" - - "github.com/jmoiron/sqlx" - "github.com/minio/minio-go/v7" -) - -const ( - cliSyncStatusPending = "pending" - cliSyncStatusInProgress = "in_progress" - cliSyncStatusCompleted = "completed" - cliSyncStatusFailed = "failed" - - cliSyncPollInterval = 30 * time.Second -) - -var ( - ErrCLISyncDisabled = errors.New("CLI sync is disabled") - ErrCLISyncNotRunning = errors.New("CLI sync runner is not running") - ErrCLISyncQueueFull = errors.New("CLI sync queue is full") - ErrCLISyncAlreadyActive = errors.New("CLI sync already active for episode") - ErrCLISyncNormalSyncActive = errors.New("normal cloud sync already active for episode") - ErrCLISyncEpisodeNotFound = errors.New("episode not found") - ErrCLISyncAlreadySynced = errors.New("episode already synced to cloud") - ErrCLISyncNotEligible = errors.New("episode is not eligible for CLI sync") -) - -// CLISyncRunnerConfig controls the dp CLI cloud sync sidepath. -type CLISyncRunnerConfig struct { - Enabled bool - DPBin string - DPConfigPath string - TempDir string - MaxConcurrent int - QueueSize int - TimeoutSec int - KeepTemp bool - MaxTags int - MaxTagBytes int -} - -// CLISyncRun is the API-facing representation of one CLI sync run. 
-type CLISyncRun struct { - ID int64 `db:"id"` - EpisodeID int64 `db:"episode_id"` - Status string `db:"status"` - SourcePath sql.NullString `db:"source_path"` - TempPath sql.NullString `db:"temp_path"` - DPConfigPath sql.NullString `db:"dp_config_path"` - FileID sql.NullString `db:"file_id"` - LogicalUploadID sql.NullString `db:"logical_upload_id"` - UploadID sql.NullString `db:"upload_id"` - Bucket sql.NullString `db:"bucket"` - ObjectKey sql.NullString `db:"object_key"` - FileSize sql.NullInt64 `db:"file_size"` - OSSObjectETag sql.NullString `db:"oss_object_etag"` - DurationSec sql.NullInt64 `db:"duration_sec"` - ErrorMessage sql.NullString `db:"error_message"` - StartedAt sql.NullTime `db:"started_at"` - CompletedAt sql.NullTime `db:"completed_at"` -} - -type cliSyncEpisode struct { - ID int64 `db:"id"` - EpisodePublicID string `db:"episode_id"` - QAStatus string `db:"qa_status"` - McapPath string `db:"mcap_path"` - SidecarPath string `db:"sidecar_path"` - CloudSynced bool `db:"cloud_synced"` - RobotDeviceID sql.NullString `db:"robot_device_id"` - TaskID sql.NullInt64 `db:"task_id"` - FactoryID sql.NullInt64 `db:"factory_id"` - OrganizationID sql.NullInt64 `db:"organization_id"` -} - -type cliUploadResult struct { - LogicalUploadID string `json:"logicalUploadId"` - UploadID string `json:"uploadId"` - FileID string `json:"fileId"` - Bucket string `json:"bucket"` - ObjectKey string `json:"objectKey"` - FileSize int64 `json:"fileSize"` - OSSObjectETag string `json:"ossObjectEtag"` -} - -// CLISyncRunner owns the emergency dp CLI upload sidepath. -type CLISyncRunner struct { - db *sqlx.DB - minioClient *s3.Client - minioBucket string - cfg CLISyncRunnerConfig - - runCh chan int64 - running atomic.Bool - stopping atomic.Bool - runCtx context.Context - runCancel context.CancelFunc - wg sync.WaitGroup - mu sync.Mutex -} - -// NewCLISyncRunner creates a runner. Call Start before accepting enqueue requests. 
-func NewCLISyncRunner(db *sqlx.DB, minioClient *s3.Client, minioBucket string, cfg CLISyncRunnerConfig) (*CLISyncRunner, error) { - if !cfg.Enabled { - return &CLISyncRunner{db: db, minioClient: minioClient, minioBucket: minioBucket, cfg: cfg}, nil - } - if db == nil { - return nil, fmt.Errorf("CLI sync requires database") - } - if minioClient == nil { - return nil, fmt.Errorf("CLI sync requires MinIO client") - } - if cfg.MaxConcurrent <= 0 { - cfg.MaxConcurrent = 1 - } - if cfg.QueueSize <= 0 { - cfg.QueueSize = 16 - } - if cfg.TimeoutSec <= 0 { - cfg.TimeoutSec = 7200 - } - if cfg.MaxTags <= 0 { - cfg.MaxTags = 128 - } - if cfg.MaxTagBytes <= 0 { - cfg.MaxTagBytes = 65536 - } - if strings.TrimSpace(cfg.TempDir) == "" { - cfg.TempDir = "/var/lib/keystone/cli-sync" - } - if err := os.MkdirAll(cfg.TempDir, 0o750); err != nil { - return nil, fmt.Errorf("create CLI sync temp dir: %w", err) - } - probe, err := os.CreateTemp(cfg.TempDir, ".write-probe-*") - if err != nil { - return nil, fmt.Errorf("CLI sync temp dir is not writable: %w", err) - } - probePath := probe.Name() - if err := probe.Close(); err != nil { - _ = os.Remove(probePath) - return nil, fmt.Errorf("close CLI sync temp probe: %w", err) - } - _ = os.Remove(probePath) - - return &CLISyncRunner{ - db: db, - minioClient: minioClient, - minioBucket: minioBucket, - cfg: cfg, - runCh: make(chan int64, cfg.QueueSize), - }, nil -} - -// IsEnabled reports whether the sidepath is configured. -func (r *CLISyncRunner) IsEnabled() bool { - return r != nil && r.cfg.Enabled -} - -// IsRunning reports whether background workers are accepting runs. -func (r *CLISyncRunner) IsRunning() bool { - return r != nil && r.running.Load() -} - -// Start starts background CLI sync workers. 
-func (r *CLISyncRunner) Start() { - if r == nil || !r.cfg.Enabled { - return - } - r.mu.Lock() - if !r.running.CompareAndSwap(false, true) { - r.mu.Unlock() - return - } - r.runCtx, r.runCancel = context.WithCancel(context.Background()) - runCtx := r.runCtx - r.mu.Unlock() - - for i := 0; i < r.cfg.MaxConcurrent; i++ { - r.wg.Add(1) - go r.worker(runCtx) - } - r.wg.Add(1) - go r.dispatcher(runCtx) - - logger.Printf("[CLI-SYNC] Started (dp=%s concurrency=%d queue=%d)", r.cfg.DPBin, r.cfg.MaxConcurrent, r.cfg.QueueSize) -} - -// Stop gracefully stops the runner. -func (r *CLISyncRunner) Stop(ctx context.Context) error { - if r == nil || !r.cfg.Enabled { - return nil - } - r.mu.Lock() - if !r.running.Load() { - r.mu.Unlock() - return nil - } - r.running.Store(false) - r.stopping.Store(true) - cancel := r.runCancel - r.mu.Unlock() - - if cancel != nil { - cancel() - } - - done := make(chan struct{}) - go func() { - r.wg.Wait() - close(done) - }() - - select { - case <-done: - logger.Printf("[CLI-SYNC] Stopped") - return nil - case <-ctx.Done(): - return fmt.Errorf("CLI sync runner shutdown: %w", ctx.Err()) - } -} - -// EnqueueEpisode creates a CLI sync run and schedules it for background processing. -func (r *CLISyncRunner) EnqueueEpisode(ctx context.Context, episodeID int64) (int64, error) { - if r == nil || !r.cfg.Enabled { - return 0, ErrCLISyncDisabled - } - if !r.running.Load() { - return 0, ErrCLISyncNotRunning - } - runID, err := r.persistPendingRun(ctx, episodeID) - if err != nil { - return 0, err - } - - select { - case r.runCh <- runID: - return runID, nil - case <-ctx.Done(): - r.markRunFailed(context.Background(), runID, time.Now(), ctx.Err()) - return 0, ctx.Err() - default: - r.markRunFailed(context.Background(), runID, time.Now(), ErrCLISyncQueueFull) - return 0, ErrCLISyncQueueFull - } -} - -// LatestRun returns the most recent CLI sync run for an episode. 
-func (r *CLISyncRunner) LatestRun(ctx context.Context, episodeID int64) (*CLISyncRun, error) { - if r == nil || !r.cfg.Enabled { - return nil, ErrCLISyncDisabled - } - var row CLISyncRun - err := r.db.GetContext(ctx, &row, ` - SELECT - id, - episode_id, - status, - source_path, - temp_path, - dp_config_path, - file_id, - logical_upload_id, - upload_id, - bucket, - object_key, - file_size, - oss_object_etag, - duration_sec, - error_message, - started_at, - completed_at - FROM cli_sync_runs - WHERE episode_id = ? - ORDER BY id DESC - LIMIT 1 - `, episodeID) - if err == sql.ErrNoRows { - return nil, sql.ErrNoRows - } - if err != nil { - return nil, fmt.Errorf("query latest CLI sync run: %w", err) - } - return &row, nil -} - -func (r *CLISyncRunner) persistPendingRun(ctx context.Context, episodeID int64) (int64, error) { - tx, err := r.db.BeginTxx(ctx, nil) - if err != nil { - return 0, fmt.Errorf("begin CLI sync transaction: %w", err) - } - defer func() { _ = tx.Rollback() }() - - lockClause := txLockClause(tx) - var ep cliSyncEpisode - if err := tx.GetContext(ctx, &ep, ` - SELECT - e.id, - e.episode_id, - e.qa_status, - e.mcap_path, - e.sidecar_path, - e.cloud_synced, - COALESCE(NULLIF(TRIM(r.device_id), ''), NULLIF(TRIM(ws.robot_serial), '')) AS robot_device_id, - e.task_id, - e.factory_id, - e.organization_id - FROM episodes e - LEFT JOIN workstations ws ON ws.id = e.workstation_id AND ws.deleted_at IS NULL - LEFT JOIN robots r ON r.id = ws.robot_id AND r.deleted_at IS NULL - WHERE e.id = ? 
AND e.deleted_at IS NULL - `+lockClause, episodeID); err != nil { - if err == sql.ErrNoRows { - return 0, fmt.Errorf("%w: %d", ErrCLISyncEpisodeNotFound, episodeID) - } - return 0, fmt.Errorf("lock episode %d: %w", episodeID, err) - } - if ep.CloudSynced { - return 0, fmt.Errorf("%w: %d", ErrCLISyncAlreadySynced, episodeID) - } - if ep.QAStatus != "approved" && ep.QAStatus != "inspector_approved" { - return 0, fmt.Errorf("%w: qa_status=%s", ErrCLISyncNotEligible, ep.QAStatus) - } - if strings.TrimSpace(ep.McapPath) == "" { - return 0, fmt.Errorf("%w: empty mcap_path", ErrCLISyncNotEligible) - } - if strings.TrimSpace(ep.SidecarPath) == "" { - return 0, fmt.Errorf("%w: empty sidecar_path", ErrCLISyncNotEligible) - } - if strings.TrimSpace(ep.RobotDeviceID.String) == "" { - return 0, fmt.Errorf("%w: empty robot_device_id", ErrCLISyncNotEligible) - } - - var normalActive int - if err := tx.GetContext(ctx, &normalActive, ` - SELECT COUNT(*) - FROM sync_logs - WHERE episode_id = ? - AND status IN ('pending', 'in_progress') - `, episodeID); err != nil { - return 0, fmt.Errorf("query active normal sync count: %w", err) - } - if normalActive > 0 { - return 0, fmt.Errorf("%w: %d", ErrCLISyncNormalSyncActive, episodeID) - } - - var cliActive int - if err := tx.GetContext(ctx, &cliActive, ` - SELECT COUNT(*) - FROM cli_sync_runs - WHERE episode_id = ? - AND status IN ('pending', 'in_progress') - `, episodeID); err != nil { - return 0, fmt.Errorf("query active CLI sync count: %w", err) - } - if cliActive > 0 { - return 0, fmt.Errorf("%w: %d", ErrCLISyncAlreadyActive, episodeID) - } - - now := time.Now().UTC() - result, err := tx.ExecContext(ctx, ` - INSERT INTO cli_sync_runs (episode_id, status, source_path, dp_config_path, created_at, updated_at) - VALUES (?, 'pending', ?, ?, ?, ?) 
- `, episodeID, ep.McapPath, r.cfg.DPConfigPath, now, now) - if err != nil { - return 0, fmt.Errorf("insert CLI sync run: %w", err) - } - runID, err := result.LastInsertId() - if err != nil { - return 0, fmt.Errorf("CLI sync run last insert id: %w", err) - } - if err := tx.Commit(); err != nil { - return 0, fmt.Errorf("commit CLI sync run: %w", err) - } - return runID, nil -} - -func (r *CLISyncRunner) dispatcher(ctx context.Context) { - defer r.wg.Done() - r.dispatchPendingRuns(ctx) - ticker := time.NewTicker(cliSyncPollInterval) - defer ticker.Stop() - for { - select { - case <-ctx.Done(): - return - case <-ticker.C: - r.dispatchPendingRuns(ctx) - } - } -} - -func (r *CLISyncRunner) dispatchPendingRuns(ctx context.Context) { - var ids []int64 - if err := r.db.SelectContext(ctx, &ids, ` - SELECT id - FROM cli_sync_runs - WHERE status = 'pending' - ORDER BY id ASC - LIMIT ? - `, r.cfg.QueueSize); err != nil { - if ctx.Err() == nil { - logger.Printf("[CLI-SYNC] Failed to query pending runs: %v", err) - } - return - } - for _, id := range ids { - select { - case r.runCh <- id: - default: - return - } - } -} - -func (r *CLISyncRunner) worker(ctx context.Context) { - defer r.wg.Done() - for { - select { - case <-ctx.Done(): - return - case runID := <-r.runCh: - r.processRun(ctx, runID) - } - } -} - -func (r *CLISyncRunner) processRun(parent context.Context, runID int64) { - startedAt := time.Now().UTC() - claimed, err := r.claimRun(parent, runID, startedAt) - if err != nil { - logger.Printf("[CLI-SYNC] Failed to claim run %d: %v", runID, err) - return - } - if !claimed { - return - } - logger.Printf("[CLI-SYNC] Run %d claimed", runID) - - ctx, cancel := context.WithTimeout(parent, time.Duration(r.cfg.TimeoutSec)*time.Second) - defer cancel() - - var ep cliSyncEpisode - if err := r.loadEpisodeForRun(ctx, runID, &ep); err != nil { - r.markRunFailed(context.Background(), runID, startedAt, err) - return - } - deviceID := strings.TrimSpace(ep.RobotDeviceID.String) - if 
deviceID == "" { - r.markRunFailed(context.Background(), runID, startedAt, fmt.Errorf("%w: empty robot_device_id", ErrCLISyncNotEligible)) - return - } - logger.Printf("[CLI-SYNC] Run %d loaded episode: episode_id=%d public_id=%s qa_status=%s device_id=%s mcap_path=%s sidecar_path=%s", - runID, ep.ID, ep.EpisodePublicID, ep.QAStatus, deviceID, ep.McapPath, ep.SidecarPath) - - tags, err := r.buildTagsFromEpisode(ctx, ep) - if err != nil { - r.markRunFailed(context.Background(), runID, startedAt, err) - return - } - logger.Printf("[CLI-SYNC] Run %d built upload tags: episode_id=%d tag_count=%d", runID, ep.ID, len(tags)) - - mcapKey := stripBucketPrefix(ep.McapPath) - if mcapKey == "" { - r.markRunFailed(context.Background(), runID, startedAt, fmt.Errorf("empty mcap_path")) - return - } - logger.Printf("[CLI-SYNC] Run %d staging MCAP from MinIO: episode_id=%d bucket=%s key=%s temp_dir=%s", - runID, ep.ID, r.minioBucket, mcapKey, r.cfg.TempDir) - - tempPath, fileSize, err := r.stageMcap(ctx, ep.ID, mcapKey) - if err != nil { - r.markRunFailed(context.Background(), runID, startedAt, err) - return - } - logger.Printf("[CLI-SYNC] Run %d staged MCAP: episode_id=%d temp_path=%s size=%d bytes", - runID, ep.ID, tempPath, fileSize) - if !r.cfg.KeepTemp { - defer func() { _ = os.Remove(tempPath) }() - } - if err := r.setRunTempPath(context.Background(), runID, tempPath); err != nil { - logger.Printf("[CLI-SYNC] Failed to update temp path for run %d: %v", runID, err) - } - - uploadStartedAt := time.Now() - logger.Printf("[CLI-SYNC] Run %d starting dp upload: episode_id=%d dp_bin=%s device_id=%s tag_count=%d file_size=%d", - runID, ep.ID, r.cfg.DPBin, deviceID, len(tags), fileSize) - result, stdoutJSON, err := r.runDPUpload(ctx, tempPath, tags, deviceID) - if err != nil { - r.markRunFailed(context.Background(), runID, startedAt, err) - return - } - logger.Printf("[CLI-SYNC] Run %d dp upload finished: episode_id=%d elapsed=%s file_id=%s logical_upload_id=%s object_key=%s", - 
runID, ep.ID, time.Since(uploadStartedAt).Round(time.Millisecond), result.FileID, result.LogicalUploadID, result.ObjectKey) - if result.FileSize <= 0 { - result.FileSize = fileSize - } - if err := validateCLIUploadResult(result); err != nil { - r.markRunFailed(context.Background(), runID, startedAt, err) - return - } - - if err := r.markRunCompleted(context.Background(), runID, ep, result, stdoutJSON, startedAt); err != nil { - logger.Printf("[CLI-SYNC] Failed to mark run %d completed: %v", runID, err) - r.markRunFailed(context.Background(), runID, startedAt, err) - return - } - logger.Printf("[CLI-SYNC] Episode %d CLI synced: run_id=%d file_id=%s logical_upload_id=%s object_key=%s", - ep.ID, runID, result.FileID, result.LogicalUploadID, result.ObjectKey) -} - -func (r *CLISyncRunner) loadEpisodeForRun(ctx context.Context, runID int64, ep *cliSyncEpisode) error { - if err := r.db.GetContext(ctx, ep, ` - SELECT - e.id, - e.episode_id, - e.qa_status, - e.mcap_path, - e.sidecar_path, - e.cloud_synced, - COALESCE(NULLIF(TRIM(r.device_id), ''), NULLIF(TRIM(ws.robot_serial), '')) AS robot_device_id, - e.task_id, - e.factory_id, - e.organization_id - FROM cli_sync_runs csr - INNER JOIN episodes e ON e.id = csr.episode_id AND e.deleted_at IS NULL - LEFT JOIN workstations ws ON ws.id = e.workstation_id AND ws.deleted_at IS NULL - LEFT JOIN robots r ON r.id = ws.robot_id AND r.deleted_at IS NULL - WHERE csr.id = ? - `, runID); err != nil { - if err == sql.ErrNoRows { - return fmt.Errorf("%w for CLI sync run %d", ErrCLISyncEpisodeNotFound, runID) - } - return fmt.Errorf("load episode for CLI sync run %d: %w", runID, err) - } - return nil -} - -func (r *CLISyncRunner) claimRun(ctx context.Context, runID int64, startedAt time.Time) (bool, error) { - res, err := r.db.ExecContext(ctx, ` - UPDATE cli_sync_runs - SET status = 'in_progress', - started_at = ?, - error_message = NULL, - updated_at = ? - WHERE id = ? 
- AND status = 'pending' - `, startedAt, startedAt, runID) - if err != nil { - return false, fmt.Errorf("claim CLI sync run: %w", err) - } - n, err := res.RowsAffected() - if err != nil { - return false, fmt.Errorf("claim CLI sync rows affected: %w", err) - } - return n == 1, nil -} - -func (r *CLISyncRunner) buildTagsFromEpisode(ctx context.Context, ep cliSyncEpisode) (map[string]string, error) { - sidecarTags, err := r.tagsFromSidecar(ctx, ep.SidecarPath) - if err != nil { - return nil, err - } - - tags := make(map[string]string, len(sidecarTags)+6) - for k, v := range sidecarTags { - tags[k] = v - } - tags["episode_id"] = ep.EpisodePublicID - tags["keystone_episode_id"] = strconv.FormatInt(ep.ID, 10) - tags["sync_channel"] = "keystone_cli" - if deviceID := strings.TrimSpace(ep.RobotDeviceID.String); deviceID != "" { - tags["device_id"] = deviceID - } - if ep.TaskID.Valid { - tags["task_id"] = strconv.FormatInt(ep.TaskID.Int64, 10) - } - if ep.FactoryID.Valid { - tags["factory_id"] = strconv.FormatInt(ep.FactoryID.Int64, 10) - } - if ep.OrganizationID.Valid { - tags["organization_id"] = strconv.FormatInt(ep.OrganizationID.Int64, 10) - } - - if err := r.validateTags(tags); err != nil { - return nil, err - } - return tags, nil -} - -func (r *CLISyncRunner) tagsFromSidecar(ctx context.Context, sidecarPath string) (map[string]string, error) { - key := stripBucketPrefix(sidecarPath) - if key == "" { - return nil, fmt.Errorf("empty sidecar_path") - } - startedAt := time.Now() - logger.Printf("[CLI-SYNC] Reading sidecar from MinIO: bucket=%s key=%s", r.minioBucket, key) - obj, err := r.minioClient.GetObject(ctx, r.minioBucket, key, minio.GetObjectOptions{}) - if err != nil { - return nil, fmt.Errorf("get sidecar object %s: %w", key, err) - } - defer func() { _ = obj.Close() }() - - data, err := io.ReadAll(obj) - if err != nil { - return nil, fmt.Errorf("read sidecar object %s: %w", key, err) - } - tags, err := flattenSidecarScalars(data) - if err != nil { - return nil, 
fmt.Errorf("flatten sidecar %s: %w", key, err) - } - logger.Printf("[CLI-SYNC] Read sidecar complete: bucket=%s key=%s bytes=%d scalar_tag_count=%d elapsed=%s", - r.minioBucket, key, len(data), len(tags), time.Since(startedAt).Round(time.Millisecond)) - return tags, nil -} - -func (r *CLISyncRunner) validateTags(tags map[string]string) error { - if len(tags) > r.cfg.MaxTags { - return fmt.Errorf("too many CLI sync tags: %d > %d", len(tags), r.cfg.MaxTags) - } - totalBytes := 0 - for key, value := range tags { - key = strings.TrimSpace(key) - if key == "" { - return fmt.Errorf("CLI sync tag key is empty") - } - if strings.ContainsAny(key, ",=") { - return fmt.Errorf("CLI sync tag key %q contains unsupported characters", key) - } - totalBytes += len(key) + 1 + len(encodeDPTagValue(value)) - } - if totalBytes > r.cfg.MaxTagBytes { - return fmt.Errorf("CLI sync tags too large: %d > %d bytes", totalBytes, r.cfg.MaxTagBytes) - } - return nil -} - -func (r *CLISyncRunner) stageMcap(ctx context.Context, episodeID int64, mcapKey string) (string, int64, error) { - startedAt := time.Now() - obj, err := r.minioClient.GetObject(ctx, r.minioBucket, mcapKey, minio.GetObjectOptions{}) - if err != nil { - return "", 0, fmt.Errorf("get MCAP object %s: %w", mcapKey, err) - } - defer func() { _ = obj.Close() }() - - tmp, err := os.CreateTemp(r.cfg.TempDir, fmt.Sprintf("episode-%d-*.mcap", episodeID)) - if err != nil { - return "", 0, fmt.Errorf("create CLI sync temp file: %w", err) - } - tempPath := tmp.Name() - cleanup := true - defer func() { - _ = tmp.Close() - if cleanup { - _ = os.Remove(tempPath) - } - }() - - size, err := io.Copy(tmp, obj) - if err != nil { - return "", 0, fmt.Errorf("write CLI sync temp file: %w", err) - } - if err := tmp.Close(); err != nil { - return "", 0, fmt.Errorf("close CLI sync temp file: %w", err) - } - if size <= 0 { - return "", 0, fmt.Errorf("zero-byte MCAP cannot be CLI synced") - } - cleanup = false - logger.Printf("[CLI-SYNC] MCAP download 
complete: episode_id=%d bucket=%s key=%s temp_path=%s size=%d elapsed=%s", - episodeID, r.minioBucket, mcapKey, tempPath, size, time.Since(startedAt).Round(time.Millisecond)) - return tempPath, size, nil -} - -func (r *CLISyncRunner) runDPUpload(ctx context.Context, tempPath string, tags map[string]string, deviceID string) (*cliUploadResult, string, error) { - deviceID = strings.TrimSpace(deviceID) - if deviceID == "" { - return nil, "", fmt.Errorf("dp device id is required") - } - args := []string{ - "--config", r.cfg.DPConfigPath, - "--json", - "data", "upload", tempPath, - "--device", deviceID, - "--hint", "source=keystone_cli_sync", - } - - keys := make([]string, 0, len(tags)) - for key := range tags { - keys = append(keys, key) - } - sort.Strings(keys) - for _, key := range keys { - args = append(args, "--tag", key+"="+encodeDPTagValue(tags[key])) - } - logger.Printf("[CLI-SYNC] Prepared dp command: dp_bin=%s file=%s device_id=%s tag_count=%d hint_count=1", - r.cfg.DPBin, tempPath, deviceID, len(tags)) - - cmd := exec.CommandContext(ctx, r.cfg.DPBin, args...) 
- var stdout bytes.Buffer - var stderr bytes.Buffer - cmd.Stdout = &stdout - cmd.Stderr = &stderr - - if err := cmd.Run(); err != nil { - output := strings.TrimSpace(stderr.String()) - if output == "" { - output = strings.TrimSpace(stdout.String()) - } - return nil, "", fmt.Errorf("dp data upload failed: %s", sanitizeCLIOutput(output, err)) - } - - stdoutText := strings.TrimSpace(stdout.String()) - var result cliUploadResult - if err := json.Unmarshal([]byte(stdoutText), &result); err != nil { - return nil, "", fmt.Errorf("parse dp upload JSON: %w", err) - } - return &result, stdoutText, nil -} - -func validateCLIUploadResult(result *cliUploadResult) error { - if result == nil { - return fmt.Errorf("dp upload result is empty") - } - if strings.TrimSpace(result.FileID) == "" { - return fmt.Errorf("dp upload result missing fileId") - } - if strings.TrimSpace(result.LogicalUploadID) == "" { - return fmt.Errorf("dp upload result missing logicalUploadId") - } - if strings.TrimSpace(result.ObjectKey) == "" { - return fmt.Errorf("dp upload result missing objectKey") - } - if result.FileSize <= 0 { - return fmt.Errorf("dp upload result has invalid fileSize") - } - return nil -} - -func (r *CLISyncRunner) setRunTempPath(ctx context.Context, runID int64, tempPath string) error { - _, err := r.db.ExecContext(ctx, ` - UPDATE cli_sync_runs - SET temp_path = ?, updated_at = ? - WHERE id = ? 
- `, tempPath, time.Now().UTC(), runID) - return err -} - -func (r *CLISyncRunner) markRunCompleted(ctx context.Context, runID int64, ep cliSyncEpisode, result *cliUploadResult, stdoutJSON string, startedAt time.Time) error { - now := time.Now().UTC() - durationSec := int64(now.Sub(startedAt).Seconds()) - - tx, err := r.db.BeginTxx(ctx, nil) - if err != nil { - return fmt.Errorf("begin CLI sync completion transaction: %w", err) - } - defer func() { _ = tx.Rollback() }() - - lockClause := txLockClause(tx) - var cloudSynced bool - if err := tx.GetContext(ctx, &cloudSynced, ` - SELECT cloud_synced - FROM episodes - WHERE id = ? AND deleted_at IS NULL - `+lockClause, ep.ID); err != nil { - return fmt.Errorf("lock episode for CLI sync completion: %w", err) - } - - if _, err := tx.ExecContext(ctx, ` - UPDATE cli_sync_runs - SET status = 'completed', - file_id = ?, - logical_upload_id = ?, - upload_id = ?, - bucket = ?, - object_key = ?, - file_size = ?, - oss_object_etag = ?, - duration_sec = ?, - error_message = NULL, - stdout_json = ?, - completed_at = ?, - updated_at = ? - WHERE id = ? - `, result.FileID, result.LogicalUploadID, nullableStringValue(result.UploadID), result.Bucket, result.ObjectKey, - result.FileSize, result.OSSObjectETag, durationSec, stdoutJSON, now, now, runID); err != nil { - return fmt.Errorf("update CLI sync run completed: %w", err) - } - - if cloudSynced { - return tx.Commit() - } - - if _, err := tx.ExecContext(ctx, ` - INSERT INTO sync_logs (episode_id, source_path, destination_path, status, bytes_transferred, duration_sec, attempt_count, started_at, completed_at) - VALUES (?, ?, ?, 'completed', ?, ?, 1, ?, ?) - `, ep.ID, ep.McapPath, result.ObjectKey, result.FileSize, durationSec, startedAt, now); err != nil { - return fmt.Errorf("insert CLI sync completed log: %w", err) - } - - if _, err := tx.ExecContext(ctx, ` - UPDATE episodes - SET cloud_synced = TRUE, - cloud_synced_at = ?, - cloud_mcap_path = ?, - cloud_processed = FALSE - WHERE id = ? 
AND deleted_at IS NULL - `, now, result.ObjectKey, ep.ID); err != nil { - return fmt.Errorf("update episode CLI sync cloud state: %w", err) - } - - if err := tx.Commit(); err != nil { - return fmt.Errorf("commit CLI sync completion: %w", err) - } - return nil -} - -func (r *CLISyncRunner) markRunFailed(ctx context.Context, runID int64, startedAt time.Time, runErr error) { - now := time.Now().UTC() - durationSec := int64(now.Sub(startedAt).Seconds()) - msg := sanitizeCLIOutput("", runErr) - if msg == "" && runErr != nil { - msg = runErr.Error() - } - logger.Printf("[CLI-SYNC] Run %d failed: duration=%ds error=%s", runID, durationSec, msg) - if _, err := r.db.ExecContext(ctx, ` - UPDATE cli_sync_runs - SET status = 'failed', - duration_sec = ?, - error_message = ?, - completed_at = ?, - updated_at = ? - WHERE id = ? - `, durationSec, msg, now, now, runID); err != nil { - logger.Printf("[CLI-SYNC] Failed to mark run %d failed: %v", runID, err) - } -} - -func nullableStringValue(value string) interface{} { - if strings.TrimSpace(value) == "" { - return nil - } - return value -} - -var cliSecretPattern = regexp.MustCompile(`(?i)(authorization|access[_-]?key|secret|token|password|api[_-]?key)(["'=:\s]+)([^,\s"}]+)`) - -func encodeDPTagValue(value string) string { - value = strings.ReplaceAll(value, `%`, `%25`) - value = strings.ReplaceAll(value, `,`, `%2C`) - return value -} - -func sanitizeCLIOutput(output string, err error) string { - text := strings.TrimSpace(output) - if text == "" && err != nil { - text = err.Error() - } - text = cliSecretPattern.ReplaceAllString(text, `$1$2`) - if len(text) > 4096 { - text = text[:4096] + "..." 
- } - return text -} diff --git a/internal/services/dp_asset_resolver.go b/internal/services/dp_asset_resolver.go new file mode 100644 index 0000000..78f3e29 --- /dev/null +++ b/internal/services/dp_asset_resolver.go @@ -0,0 +1,61 @@ +// SPDX-FileCopyrightText: 2026 ArcheBase +// +// SPDX-License-Identifier: MulanPSL-2.0 + +package services + +import ( + "context" + "database/sql" + "encoding/json" + "fmt" + "strings" + + "github.com/jmoiron/sqlx" +) + +func assetIDFromEpisodeMetadata(metadata sql.NullString) string { + if !metadata.Valid || strings.TrimSpace(metadata.String) == "" { + return "" + } + var raw map[string]interface{} + if err := json.Unmarshal([]byte(metadata.String), &raw); err != nil { + return "" + } + value, _ := raw["asset_id"].(string) + return strings.TrimSpace(value) +} + +func resolveAssetIDForEpisode(ctx context.Context, db *sqlx.DB, episodeID int64, metadata sql.NullString, workstationID sql.NullInt64) (string, error) { + if assetID := assetIDFromEpisodeMetadata(metadata); assetID != "" { + return assetID, nil + } + if db == nil { + return "", fmt.Errorf("database is not available") + } + if !workstationID.Valid || workstationID.Int64 <= 0 { + return "", fmt.Errorf("episode %d has no asset_id metadata and no workstation_id", episodeID) + } + + var row struct { + AssetID sql.NullString `db:"asset_id"` + } + err := db.GetContext(ctx, &row, ` + SELECT r.asset_id + FROM workstations ws + LEFT JOIN robots r ON r.id = ws.robot_id + WHERE ws.id = ? 
+ LIMIT 1 + `, workstationID.Int64) + if err == sql.ErrNoRows { + return "", fmt.Errorf("episode %d workstation %d not found while resolving asset_id", episodeID, workstationID.Int64) + } + if err != nil { + return "", fmt.Errorf("resolve asset_id for episode %d workstation %d: %w", episodeID, workstationID.Int64, err) + } + assetID := strings.TrimSpace(row.AssetID.String) + if !row.AssetID.Valid || assetID == "" { + return "", fmt.Errorf("episode %d workstation %d has no robot asset_id", episodeID, workstationID.Int64) + } + return assetID, nil +} diff --git a/internal/services/dp_asset_resolver_test.go b/internal/services/dp_asset_resolver_test.go new file mode 100644 index 0000000..d000738 --- /dev/null +++ b/internal/services/dp_asset_resolver_test.go @@ -0,0 +1,112 @@ +// SPDX-FileCopyrightText: 2026 ArcheBase +// +// SPDX-License-Identifier: MulanPSL-2.0 + +package services + +import ( + "context" + "database/sql" + "strings" + "testing" + + "github.com/jmoiron/sqlx" + _ "modernc.org/sqlite" +) + +func newTestAssetResolverDB(t *testing.T) *sqlx.DB { + t.Helper() + db, err := sqlx.Open("sqlite", ":memory:") + if err != nil { + t.Fatalf("open sqlite db: %v", err) + } + for _, stmt := range []string{ + `CREATE TABLE robots ( + id INTEGER PRIMARY KEY, + device_id TEXT NOT NULL, + asset_id TEXT, + deleted_at TIMESTAMP NULL + )`, + `CREATE TABLE workstations ( + id INTEGER PRIMARY KEY, + robot_id INTEGER, + deleted_at TIMESTAMP NULL + )`, + } { + if _, err := db.Exec(stmt); err != nil { + _ = db.Close() + t.Fatalf("create schema: %v", err) + } + } + t.Cleanup(func() { _ = db.Close() }) + return db +} + +func TestResolveAssetIDForEpisode_MetadataWins(t *testing.T) { + db := newTestAssetResolverDB(t) + if _, err := db.Exec(`INSERT INTO robots (id, device_id, asset_id) VALUES (1, 'local-device', 'fallback-asset')`); err != nil { + t.Fatalf("seed robot: %v", err) + } + if _, err := db.Exec(`INSERT INTO workstations (id, robot_id) VALUES (10, 1)`); err != nil { + 
t.Fatalf("seed workstation: %v", err) + } + + got, err := resolveAssetIDForEpisode( + context.Background(), + db, + 1, + sql.NullString{String: `{"asset_id":" snapshot-asset "}`, Valid: true}, + sql.NullInt64{Int64: 10, Valid: true}, + ) + if err != nil { + t.Fatalf("resolveAssetIDForEpisode() error = %v", err) + } + if got != "snapshot-asset" { + t.Fatalf("asset_id=%q want snapshot-asset", got) + } +} + +func TestResolveAssetIDForEpisode_FallbackReadsSoftDeletedWorkstation(t *testing.T) { + db := newTestAssetResolverDB(t) + if _, err := db.Exec(`INSERT INTO robots (id, device_id, asset_id) VALUES (1, 'local-device', 'fallback-asset')`); err != nil { + t.Fatalf("seed robot: %v", err) + } + if _, err := db.Exec(`INSERT INTO workstations (id, robot_id, deleted_at) VALUES (10, 1, CURRENT_TIMESTAMP)`); err != nil { + t.Fatalf("seed workstation: %v", err) + } + + got, err := resolveAssetIDForEpisode( + context.Background(), + db, + 1, + sql.NullString{}, + sql.NullInt64{Int64: 10, Valid: true}, + ) + if err != nil { + t.Fatalf("resolveAssetIDForEpisode() error = %v", err) + } + if got != "fallback-asset" { + t.Fatalf("asset_id=%q want fallback-asset", got) + } +} + +func TestResolveAssetIDForEpisode_MissingDoesNotFallbackToLocalDeviceID(t *testing.T) { + db := newTestAssetResolverDB(t) + if _, err := db.Exec(`INSERT INTO robots (id, device_id, asset_id) VALUES (1, 'local-device', NULL)`); err != nil { + t.Fatalf("seed robot: %v", err) + } + if _, err := db.Exec(`INSERT INTO workstations (id, robot_id) VALUES (10, 1)`); err != nil { + t.Fatalf("seed workstation: %v", err) + } + + _, err := resolveAssetIDForEpisode( + context.Background(), + db, + 1, + sql.NullString{}, + sql.NullInt64{Int64: 10, Valid: true}, + ) + if err == nil || !strings.Contains(err.Error(), "asset_id") { + t.Fatalf("error=%v want asset_id missing error", err) + } +} diff --git a/internal/services/dp_config_loader.go b/internal/services/dp_config_loader.go new file mode 100644 index 0000000..b7b0f00 
// DPConfigFile is the subset of data-platform config consumed by direct sync.
type DPConfigFile struct {
	// Version is optional; loadDPDeviceUploadConfig rejects any explicit
	// value other than the supported one.
	Version   *int              `json:"version,omitempty"`
	Endpoints DPConfigEndpoints `json:"endpoints"`
	Devices   []DPDeviceProfile `json:"devices"`
}

// DPConfigEndpoints holds the raw auth and gateway endpoint strings exactly as
// they appear in the DP config file.
type DPConfigEndpoints struct {
	Auth    string `json:"auth"`
	Gateway string `json:"gateway"`
}

// DPDeviceProfile is one entry of the "devices" list in the DP config file.
type DPDeviceProfile struct {
	DeviceID string            `json:"deviceId"`
	APIKey   string            `json:"apiKey"`
	Tags     map[string]string `json:"tags"`
}

// DPResolvedEndpoint is an endpoint string normalised by
// parseDPResolvedEndpoint.
type DPResolvedEndpoint struct {
	// Target is host:port for http(s) URLs (the scheme's default port is
	// filled in when missing) or the bare endpoint string as configured.
	Target string
	// UseTLS is true only for https URLs.
	UseTLS bool
	// ServerName is the URL host for https URLs and empty otherwise.
	ServerName string
}

// DPDeviceUploadConfig is the result of loadDPDeviceUploadConfig: the resolved
// endpoints plus the validated profile of the selected device.
type DPDeviceUploadConfig struct {
	ConfigPath string
	Auth       DPResolvedEndpoint
	Gateway    DPResolvedEndpoint
	Profile    DPDeviceProfile
}

// loadDPDeviceUploadConfig reads the DP config file at configPath and returns
// the upload configuration for the device whose deviceId equals assetID.
//
// Both arguments are trimmed before use. An error is returned when either is
// empty, the file cannot be read or parsed, an explicit version other than 3
// is present, an endpoint is invalid, any device entry has an empty or
// duplicate deviceId, or the selected profile has an empty apiKey, no tags,
// or an empty tag key.
func loadDPDeviceUploadConfig(configPath string, assetID string) (*DPDeviceUploadConfig, error) {
	const supportedVersion = 3

	configPath = strings.TrimSpace(configPath)
	assetID = strings.TrimSpace(assetID)
	if configPath == "" {
		return nil, fmt.Errorf("KEYSTONE_SYNC_DP_CONFIG is required")
	}
	if assetID == "" {
		return nil, fmt.Errorf("asset_id is required")
	}

	data, err := os.ReadFile(configPath) //nolint:gosec // operator-controlled config path
	if err != nil {
		return nil, fmt.Errorf("read DP config %s: %w", configPath, err)
	}

	var cfg DPConfigFile
	if err := json.Unmarshal(data, &cfg); err != nil {
		return nil, fmt.Errorf("parse DP config %s: %w", configPath, err)
	}
	// A missing version is accepted; only an explicit mismatch is rejected.
	if cfg.Version != nil && *cfg.Version != supportedVersion {
		return nil, fmt.Errorf("DP config %s has unsupported version %d", configPath, *cfg.Version)
	}

	authEndpoint, err := parseDPResolvedEndpoint(cfg.Endpoints.Auth)
	if err != nil {
		return nil, fmt.Errorf("invalid endpoints.auth in DP config %s: %w", configPath, err)
	}
	gatewayEndpoint, err := parseDPResolvedEndpoint(cfg.Endpoints.Gateway)
	if err != nil {
		return nil, fmt.Errorf("invalid endpoints.gateway in DP config %s: %w", configPath, err)
	}

	profile, err := selectDPDeviceProfile(configPath, cfg.Devices, assetID)
	if err != nil {
		return nil, err
	}

	return &DPDeviceUploadConfig{
		ConfigPath: configPath,
		Auth:       authEndpoint,
		Gateway:    gatewayEndpoint,
		Profile:    profile,
	}, nil
}

// selectDPDeviceProfile validates the deviceId of every entry in devices and
// returns the validated profile matching assetID; configPath is used only in
// error messages.
func selectDPDeviceProfile(configPath string, devices []DPDeviceProfile, assetID string) (DPDeviceProfile, error) {
	// Index every entry first so that empty and duplicate ids are reported
	// even when they do not belong to the requested device.
	byID := make(map[string]DPDeviceProfile, len(devices))
	for idx, device := range devices {
		deviceID := strings.TrimSpace(device.DeviceID)
		if deviceID == "" {
			return DPDeviceProfile{}, fmt.Errorf("DP config %s devices[%d].deviceId is empty", configPath, idx)
		}
		if _, exists := byID[deviceID]; exists {
			return DPDeviceProfile{}, fmt.Errorf("DP config %s has duplicate deviceId %q", configPath, deviceID)
		}
		device.DeviceID = deviceID
		byID[deviceID] = device
	}

	profile, ok := byID[assetID]
	if !ok {
		return DPDeviceProfile{}, fmt.Errorf("DP config %s has no device profile for asset_id %q", configPath, assetID)
	}
	profile.APIKey = strings.TrimSpace(profile.APIKey)
	if profile.APIKey == "" {
		return DPDeviceProfile{}, fmt.Errorf("DP config %s device %q apiKey is empty", configPath, assetID)
	}
	if len(profile.Tags) == 0 {
		return DPDeviceProfile{}, fmt.Errorf("DP config %s device %q tags must be non-empty", configPath, assetID)
	}
	// Empty tag values are allowed; only an empty key is rejected.
	for key := range profile.Tags {
		if key == "" {
			return DPDeviceProfile{}, fmt.Errorf("DP config %s device %q has an empty tag key", configPath, assetID)
		}
	}
	return profile, nil
}

// parseDPResolvedEndpoint normalises an endpoint string from the DP config.
//
// Values containing "://" must be http or https URLs without userinfo, path,
// query, or fragment. Any other non-empty value is treated as a bare endpoint
// and returned unchanged (no TLS) as long as it contains none of "/", "?",
// or "#".
func parseDPResolvedEndpoint(raw string) (DPResolvedEndpoint, error) {
	value := strings.TrimSpace(raw)
	switch {
	case value == "":
		return DPResolvedEndpoint{}, fmt.Errorf("endpoint is required")
	case strings.Contains(value, "://"):
		return parseDPURLEndpoint(value)
	case strings.ContainsAny(value, "/?#"):
		return DPResolvedEndpoint{}, fmt.Errorf("bare endpoint must not include path, query, or fragment")
	}
	return DPResolvedEndpoint{Target: value, UseTLS: false}, nil
}

// parseDPURLEndpoint handles the URL form ("scheme://host[:port]") of a DP
// endpoint, filling in port 80 or 443 when the URL carries none.
func parseDPURLEndpoint(value string) (DPResolvedEndpoint, error) {
	parsed, err := url.Parse(value)
	if err != nil {
		return DPResolvedEndpoint{}, err
	}
	if parsed.Scheme != "http" && parsed.Scheme != "https" {
		return DPResolvedEndpoint{}, fmt.Errorf("unsupported scheme %q", parsed.Scheme)
	}
	if parsed.Host == "" || parsed.User != nil {
		return DPResolvedEndpoint{}, fmt.Errorf("endpoint must be host[:port]")
	}
	if parsed.Path != "" || parsed.RawQuery != "" || parsed.Fragment != "" {
		return DPResolvedEndpoint{}, fmt.Errorf("endpoint must not include path, query, or fragment")
	}
	host := parsed.Hostname()
	if host == "" {
		return DPResolvedEndpoint{}, fmt.Errorf("endpoint host is required")
	}

	target := parsed.Host
	if parsed.Port() == "" {
		defaultPort := "80"
		if parsed.Scheme == "https" {
			defaultPort = "443"
		}
		// JoinHostPort re-adds the brackets that Hostname strips from IPv6 literals.
		target = net.JoinHostPort(host, defaultPort)
	}
	return DPResolvedEndpoint{
		Target:     target,
		UseTLS:     parsed.Scheme == "https",
		ServerName: tlsServerNameForScheme(parsed.Scheme, host),
	}, nil
}

// tlsServerNameForScheme returns host for the "https" scheme and "" for any
// other scheme.
func tlsServerNameForScheme(scheme string, host string) string {
	if scheme == "https" {
		return host
	}
	return ""
}
"empty_value": ""} + }] + }` +} + +func TestLoadDPDeviceUploadConfig_SelectsDeviceAndEndpoints(t *testing.T) { + for _, tt := range []struct { + name string + body string + }{ + {name: "version 3", body: validDPConfigJSON("")}, + {name: "missing version", body: validDPConfigJSON("missing-version")}, + } { + t.Run(tt.name, func(t *testing.T) { + cfg, err := loadDPDeviceUploadConfig(writeDPConfigFixture(t, tt.body), "asset-1") + if err != nil { + t.Fatalf("loadDPDeviceUploadConfig() error = %v", err) + } + if cfg.Profile.DeviceID != "asset-1" { + t.Fatalf("Profile.DeviceID=%q want asset-1", cfg.Profile.DeviceID) + } + if cfg.Profile.APIKey != "api-key-1" { + t.Fatalf("Profile.APIKey was not trimmed") + } + if cfg.Auth.Target != "auth.example.com:443" || !cfg.Auth.UseTLS || cfg.Auth.ServerName != "auth.example.com" { + t.Fatalf("auth endpoint=%+v", cfg.Auth) + } + if cfg.Gateway.Target != "gateway.example.com:7443" || cfg.Gateway.UseTLS { + t.Fatalf("gateway endpoint=%+v", cfg.Gateway) + } + if cfg.Profile.Tags["empty_value"] != "" { + t.Fatalf("empty tag values must be preserved: %+v", cfg.Profile.Tags) + } + }) + } +} + +func TestParseDPResolvedEndpoint(t *testing.T) { + tests := []struct { + raw string + target string + useTLS bool + serverName string + }{ + {raw: "https://dp.example.com", target: "dp.example.com:443", useTLS: true, serverName: "dp.example.com"}, + {raw: "https://dp.example.com:9443", target: "dp.example.com:9443", useTLS: true, serverName: "dp.example.com"}, + {raw: "http://dp.example.com", target: "dp.example.com:80", useTLS: false}, + {raw: "dp.example.com:7443", target: "dp.example.com:7443", useTLS: false}, + {raw: "dp.example.com", target: "dp.example.com", useTLS: false}, + } + for _, tt := range tests { + t.Run(tt.raw, func(t *testing.T) { + got, err := parseDPResolvedEndpoint(tt.raw) + if err != nil { + t.Fatalf("parseDPResolvedEndpoint() error = %v", err) + } + if got.Target != tt.target || got.UseTLS != tt.useTLS || got.ServerName != 
tt.serverName { + t.Fatalf("parseDPResolvedEndpoint()=%+v want target=%q tls=%t server=%q", got, tt.target, tt.useTLS, tt.serverName) + } + }) + } +} + +func TestParseDPResolvedEndpointRejectsUnsupportedForms(t *testing.T) { + for _, raw := range []string{ + "", + "https://dp.example.com/path", + "https://dp.example.com?x=1", + "https://dp.example.com#frag", + "ftp://dp.example.com", + "dp.example.com/path", + "dp.example.com?x=1", + "dp.example.com#frag", + } { + t.Run(raw, func(t *testing.T) { + if _, err := parseDPResolvedEndpoint(raw); err == nil { + t.Fatalf("parseDPResolvedEndpoint(%q) expected error", raw) + } + }) + } +} + +func TestLoadDPDeviceUploadConfigRejectsContractErrors(t *testing.T) { + tests := []struct { + name string + body string + deviceID string + want string + }{ + { + name: "unsupported version", + body: `{"version":2,"endpoints":{"auth":"auth:1","gateway":"gateway:2"},"devices":[{"deviceId":"asset-1","apiKey":"key","tags":{"k":"v"}}]}`, + want: "unsupported version", + }, + { + name: "missing device", + body: validDPConfigJSON(""), + deviceID: "CLOUD-device-1", + want: "no device profile", + }, + { + name: "empty api key", + body: `{"version":3,"endpoints":{"auth":"auth:1","gateway":"gateway:2"},"devices":[{"deviceId":"asset-1","apiKey":" ","tags":{"k":"v"}}]}`, + want: "apiKey is empty", + }, + { + name: "empty tags", + body: `{"version":3,"endpoints":{"auth":"auth:1","gateway":"gateway:2"},"devices":[{"deviceId":"asset-1","apiKey":"key","tags":{}}]}`, + want: "tags must be non-empty", + }, + { + name: "empty tag key", + body: `{"version":3,"endpoints":{"auth":"auth:1","gateway":"gateway:2"},"devices":[{"deviceId":"asset-1","apiKey":"key","tags":{"":"v"}}]}`, + want: "empty tag key", + }, + { + name: "duplicate device", + body: `{"version":3,"endpoints":{"auth":"auth:1","gateway":"gateway:2"},"devices":[{"deviceId":" asset-1 ","apiKey":"key","tags":{"k":"v"}},{"deviceId":"asset-1","apiKey":"key2","tags":{"k":"v"}}]}`, + want: "duplicate 
deviceId", + }, + { + name: "missing endpoint", + body: `{"version":3,"endpoints":{"auth":"","gateway":"gateway:2"},"devices":[{"deviceId":"asset-1","apiKey":"key","tags":{"k":"v"}}]}`, + want: "endpoints.auth", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + deviceID := tt.deviceID + if deviceID == "" { + deviceID = "asset-1" + } + _, err := loadDPDeviceUploadConfig(writeDPConfigFixture(t, tt.body), deviceID) + if err == nil || !strings.Contains(err.Error(), tt.want) { + t.Fatalf("error=%v want contains %q", err, tt.want) + } + }) + } +} diff --git a/internal/services/dp_raw_tags.go b/internal/services/dp_raw_tags.go new file mode 100644 index 0000000..925116a --- /dev/null +++ b/internal/services/dp_raw_tags.go @@ -0,0 +1,96 @@ +// SPDX-FileCopyrightText: 2026 ArcheBase +// +// SPDX-License-Identifier: MulanPSL-2.0 + +package services + +import ( + "database/sql" + "fmt" + "path" + "strconv" + "strings" +) + +const ( + dpReservedDeviceIDTagKey = "778a6d83c9ec49108537542a570966ee.device_id" + dpReservedRawFileTagKey = "a206e337ecdf70a93bb611cf6a30c346.raw_file" +) + +type dpRawTagsInput struct { + Profile DPDeviceProfile + McapKey string + SidecarTags map[string]string + EpisodeID int64 + EpisodePublicID string + TaskID int64 + FactoryID sql.NullInt64 + OrganizationID sql.NullInt64 +} + +func buildDPDirectRawTags(input dpRawTagsInput) (map[string]string, error) { + mcapKey := stripBucketPrefix(input.McapKey) + rawFile := path.Base(strings.TrimSpace(mcapKey)) + if rawFile == "" || rawFile == "." 
// insertAllNonConflictingTags merges every entry of src into dst as one
// all-or-nothing step.
//
// The whole batch is validated before anything is written, so dst is left
// untouched when an error is returned. When several keys are invalid, the
// error for the lexicographically smallest key is reported, which keeps the
// message stable despite Go's unspecified map iteration order.
func insertAllNonConflictingTags(dst map[string]string, src map[string]string) error {
	var (
		firstErr error
		firstKey string
	)
	for key, value := range src {
		err := checkNonConflictingTag(dst, key, value)
		if err == nil {
			continue
		}
		if firstErr == nil || key < firstKey {
			firstErr, firstKey = err, key
		}
	}
	if firstErr != nil {
		return firstErr
	}

	for key, value := range src {
		dst[key] = value
	}
	return nil
}

// insertNonConflictingTag stores key=value in dst.
//
// It returns an error when key is empty or when dst already maps key to a
// different value; re-inserting an identical pair is a no-op.
func insertNonConflictingTag(dst map[string]string, key string, value string) error {
	if err := checkNonConflictingTag(dst, key, value); err != nil {
		return err
	}
	dst[key] = value
	return nil
}

// checkNonConflictingTag reports whether key=value may be stored in dst
// without modifying dst.
func checkNonConflictingTag(dst map[string]string, key string, value string) error {
	if key == "" {
		return fmt.Errorf("raw tag key must not be empty")
	}
	if existing, ok := dst[key]; ok && existing != value {
		return fmt.Errorf("raw tag conflict for key %q", key)
	}
	return nil
}
map[string]string{ + "mcap_file": "sidecar-claimed.mcap", + }, + EpisodeID: 1, + EpisodePublicID: "episode-1", + }) + if err != nil { + t.Fatalf("buildDPDirectRawTags() error = %v", err) + } + if got[dpReservedRawFileTagKey] != "actual.mcap" { + t.Fatalf("raw_file=%q want actual.mcap", got[dpReservedRawFileTagKey]) + } + if got["mcap_file"] != "sidecar-claimed.mcap" { + t.Fatalf("sidecar mcap_file should remain ordinary sidecar tag: %+v", got) + } +} + +func TestBuildDPDirectRawTags_ConflictingTagsFail(t *testing.T) { + tests := []struct { + name string + input dpRawTagsInput + }{ + { + name: "profile conflicts with reserved device id", + input: dpRawTagsInput{ + Profile: DPDeviceProfile{ + DeviceID: "asset-1", + Tags: map[string]string{dpReservedDeviceIDTagKey: "other-device"}, + }, + McapKey: "bucket/file.mcap", + EpisodeID: 1, + EpisodePublicID: "episode-1", + }, + }, + { + name: "sidecar conflicts with profile", + input: dpRawTagsInput{ + Profile: DPDeviceProfile{ + DeviceID: "asset-1", + Tags: map[string]string{"scene": "profile"}, + }, + McapKey: "bucket/file.mcap", + SidecarTags: map[string]string{"scene": "sidecar"}, + EpisodeID: 1, + EpisodePublicID: "episode-1", + }, + }, + { + name: "sidecar conflicts with keystone extra", + input: dpRawTagsInput{ + Profile: DPDeviceProfile{ + DeviceID: "asset-1", + Tags: map[string]string{"profile": "tag"}, + }, + McapKey: "bucket/file.mcap", + SidecarTags: map[string]string{"sync_channel": "other"}, + EpisodeID: 1, + EpisodePublicID: "episode-1", + }, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + if _, err := buildDPDirectRawTags(tt.input); err == nil || !strings.Contains(err.Error(), "conflict") { + t.Fatalf("error=%v want conflict", err) + } + }) + } +} + +func TestBuildDPDirectRawTags_RejectsEmptyKeyAndRawFile(t *testing.T) { + _, err := buildDPDirectRawTags(dpRawTagsInput{ + Profile: DPDeviceProfile{ + DeviceID: "asset-1", + Tags: map[string]string{"": "value"}, + }, + McapKey: 
"bucket/file.mcap", + EpisodeID: 1, + EpisodePublicID: "episode-1", + }) + if err == nil || !strings.Contains(err.Error(), "key") { + t.Fatalf("empty key error=%v", err) + } + + _, err = buildDPDirectRawTags(dpRawTagsInput{ + Profile: DPDeviceProfile{ + DeviceID: "asset-1", + Tags: map[string]string{"profile": "tag"}, + }, + McapKey: "bucket/", + EpisodeID: 1, + EpisodePublicID: "episode-1", + }) + if err == nil || !strings.Contains(err.Error(), "raw_file") { + t.Fatalf("empty raw_file error=%v", err) + } +} diff --git a/internal/services/sidecar_tags.go b/internal/services/sidecar_tags.go index ad3fcdf..9f976da 100644 --- a/internal/services/sidecar_tags.go +++ b/internal/services/sidecar_tags.go @@ -27,19 +27,6 @@ func flattenSidecar(data []byte) (map[string]string, error) { return result, nil } -// flattenSidecarScalars parses sidecar JSON for dp CLI upload tags. -// It skips arrays because the current dp CLI parser treats commas as tag separators. -func flattenSidecarScalars(data []byte) (map[string]string, error) { - var raw map[string]interface{} - if err := json.Unmarshal(data, &raw); err != nil { - return nil, fmt.Errorf("parse sidecar json: %w", err) - } - - result := make(map[string]string) - flattenScalarValue(result, "", raw) - return result, nil -} - func flattenValue(out map[string]string, prefix string, v interface{}) { switch val := v.(type) { case map[string]interface{}: @@ -73,34 +60,6 @@ func flattenValue(out map[string]string, prefix string, v interface{}) { } } -func flattenScalarValue(out map[string]string, prefix string, v interface{}) { - switch val := v.(type) { - case map[string]interface{}: - for k, child := range val { - if prefix == "" && k == "topics_summary" { - continue - } - flattenScalarValue(out, joinKey(prefix, k), child) - } - case []interface{}: - return - case nil: - out[prefix] = "" - case bool: - if val { - out[prefix] = "true" - } else { - out[prefix] = "false" - } - case float64: - out[prefix] = strconv.FormatFloat(val, 
// syncNonRetryableError wraps an error so that isNonRetryableSyncError can
// recognise it anywhere in an error chain.
type syncNonRetryableError struct {
	err error
}

// Error returns the wrapped error's message, or "" when the receiver or the
// wrapped error is nil.
func (nre *syncNonRetryableError) Error() string {
	if inner := nre.Unwrap(); inner != nil {
		return inner.Error()
	}
	return ""
}

// Unwrap exposes the wrapped error; it is safe to call on a nil receiver.
func (nre *syncNonRetryableError) Unwrap() error {
	if nre != nil {
		return nre.err
	}
	return nil
}

// newNonRetryableSyncError builds a non-retryable error from a format string
// and its arguments.
func newNonRetryableSyncError(format string, args ...interface{}) error {
	cause := fmt.Errorf(format, args...)
	return &syncNonRetryableError{err: cause}
}

// wrapNonRetryableSyncError marks err as non-retryable and prefixes it with
// the formatted message followed by ": ". A nil err yields nil.
func wrapNonRetryableSyncError(err error, format string, args ...interface{}) error {
	if err == nil {
		return nil
	}
	cause := fmt.Errorf("%s: %w", fmt.Sprintf(format, args...), err)
	return &syncNonRetryableError{err: cause}
}

// isNonRetryableSyncError reports whether err or any error it wraps is a
// *syncNonRetryableError.
func isNonRetryableSyncError(err error) bool {
	marker := new(*syncNonRetryableError)
	return errors.As(err, marker)
}
Call Start() to begin background processing. @@ -340,13 +354,16 @@ func (w *SyncWorker) persistPendingSyncLog(ctx context.Context, episodeID int64, case "completed": return fmt.Errorf("%w for episode %d", errSyncAlreadyCompleted, episodeID) case "failed": - retryDue := !latest.NextRetry.Valid || !latest.NextRetry.Time.After(now) + retryDue := latest.NextRetry.Valid && !latest.NextRetry.Time.After(now) if latest.AttemptCount < w.cfg.MaxRetries && retryDue { if err := promoteFailedSyncLogToPending(ctx, tx, latest.ID, now); err != nil { return err } return tx.Commit() } + if !manual && !latest.NextRetry.Valid { + return fmt.Errorf("%w for episode %d", errSyncNonRetryableFailed, episodeID) + } if !manual && latest.AttemptCount >= w.cfg.MaxRetries { return fmt.Errorf("%w for episode %d", errSyncRetryExhausted, episodeID) } @@ -408,7 +425,8 @@ func isSkippablePendingError(err error) bool { return errors.Is(err, ErrSyncAlreadyInProgress) || errors.Is(err, errSyncRetryBackoffActive) || errors.Is(err, errSyncRetryExhausted) || - errors.Is(err, errSyncAlreadyCompleted) + errors.Is(err, errSyncAlreadyCompleted) || + errors.Is(err, errSyncNonRetryableFailed) } // EnqueuePendingEpisodes scans for all approved but un-synced episodes and enqueues them. @@ -682,6 +700,17 @@ func (w *SyncWorker) findPendingEpisodes(ctx context.Context, includeExhaustedFa WHERE sl.episode_id = e.id AND sl.status = 'failed' AND sl.attempt_count >= ? 
+ ) + AND NOT EXISTS ( + SELECT 1 FROM sync_logs sl + INNER JOIN ( + SELECT episode_id, MAX(id) AS latest_id + FROM sync_logs + GROUP BY episode_id + ) t ON sl.episode_id = t.episode_id AND sl.id = t.latest_id + WHERE sl.episode_id = e.id + AND sl.status = 'failed' + AND sl.next_retry_at IS NULL )`) err = w.db.SelectContext(ctx, &ids, query, w.cfg.MaxRetries, w.cfg.BatchSize) } else { @@ -705,9 +734,13 @@ func (w *SyncWorker) retryFailedEpisodes(ctx context.Context) { FROM sync_logs GROUP BY episode_id ) t ON sl.episode_id = t.episode_id AND sl.id = t.latest_id + INNER JOIN episodes e ON e.id = sl.episode_id WHERE sl.status = 'failed' + AND e.deleted_at IS NULL + AND e.cloud_synced = FALSE AND sl.attempt_count < ? - AND (sl.next_retry_at IS NULL OR sl.next_retry_at <= ?) + AND sl.next_retry_at IS NOT NULL + AND sl.next_retry_at <= ? AND NOT EXISTS ( SELECT 1 FROM sync_logs sl2 WHERE sl2.episode_id = sl.episode_id @@ -738,16 +771,19 @@ func (w *SyncWorker) retryFailedEpisodes(ctx context.Context) { } func (w *SyncWorker) processEpisodeWithMode(ctx context.Context, episodeID int64, manual bool) { - // Fetch episode details - var ep struct { - ID int64 `db:"id"` - EpisodeUUID string `db:"episode_id"` - McapPath string `db:"mcap_path"` - SidecarPath string `db:"sidecar_path"` - CloudSynced bool `db:"cloud_synced"` - } + var ep syncEpisodeUploadRow err := w.db.GetContext(ctx, &ep, ` - SELECT id, episode_id, mcap_path, sidecar_path, cloud_synced + SELECT + id, + episode_id, + task_id, + mcap_path, + sidecar_path, + cloud_synced, + metadata, + workstation_id, + factory_id, + organization_id FROM episodes WHERE id = ? 
AND deleted_at IS NULL `, episodeID) @@ -765,50 +801,146 @@ func (w *SyncWorker) processEpisodeWithMode(ctx context.Context, episodeID int64 return } - // Extract the MinIO object key from the stored path (strip bucket prefix) - mcapKey := stripBucketPrefix(ep.McapPath) + syncLogID, attemptCount, err := w.acquireSyncLogWithMode(ctx, episodeID, ep.McapPath, manual) + if err != nil { + //logger.Printf("[SYNC-WORKER] Failed to acquire sync log for episode %d: %v", episodeID, err) + return + } - if mcapKey == "" { - logger.Printf("[SYNC-WORKER] Episode %d has empty mcap_path, skipping", episodeID) + startTime := time.Now() + + result, err := w.uploadEpisodeDirect(ctx, ep) + if err != nil { + duration := int64(time.Since(startTime).Seconds()) + w.markSyncFailed(ctx, syncLogID, episodeID, duration, err, attemptCount) return } - // Build raw tags from sidecar JSON (best-effort: log and continue on failure). - rawTags := map[string]string{ - "episode_id": ep.EpisodeUUID, + // Success: update episode and sync_log + duration := int64(time.Since(startTime).Seconds()) + w.markSyncCompleted(ctx, syncLogID, episodeID, result, duration) +} + +func (w *SyncWorker) uploadEpisodeDirect(ctx context.Context, ep syncEpisodeUploadRow) (*cloud.UploadResult, error) { + mcapKey := stripBucketPrefix(ep.McapPath) + if mcapKey == "" { + return nil, newNonRetryableSyncError("episode %d has empty mcap_path", ep.ID) } - if sidecarTags, err := w.tagsFromSidecar(ctx, ep.SidecarPath); err != nil { - logger.Printf("[SYNC-WORKER] Episode %d: failed to read sidecar tags, uploading without them: %v", episodeID, err) - } else { - for k, v := range sidecarTags { - rawTags[k] = v - } + + assetID, err := resolveAssetIDForEpisode(ctx, w.db, ep.ID, ep.Metadata, ep.WorkstationID) + if err != nil { + return nil, wrapNonRetryableSyncError(err, "resolve asset_id for episode %d", ep.ID) } - // Reuse latest failed sync_log when retry is due, otherwise insert a new row. 
- syncLogID, attemptCount, err := w.acquireSyncLogWithMode(ctx, episodeID, ep.McapPath, manual) + if w.syncCfg == nil || strings.TrimSpace(w.syncCfg.DPConfigPath) == "" { + return nil, newNonRetryableSyncError("KEYSTONE_SYNC_DP_CONFIG is required for direct sync") + } + dpConfig, err := loadDPDeviceUploadConfig(w.syncCfg.DPConfigPath, assetID) if err != nil { - //logger.Printf("[SYNC-WORKER] Failed to acquire sync log for episode %d: %v", episodeID, err) - return + return nil, wrapNonRetryableSyncError(err, "load DP config for asset_id %s", assetID) } - startTime := time.Now() + sidecarTags, err := w.directTagsFromSidecar(ctx, ep.SidecarPath) + if err != nil { + return nil, err + } + + rawTags, err := buildDPDirectRawTags(dpRawTagsInput{ + Profile: dpConfig.Profile, + McapKey: mcapKey, + SidecarTags: sidecarTags, + EpisodeID: ep.ID, + EpisodePublicID: ep.EpisodeUUID, + TaskID: ep.TaskID, + FactoryID: ep.FactoryID, + OrganizationID: ep.OrganizationID, + }) + if err != nil { + return nil, wrapNonRetryableSyncError(err, "build raw tags for episode %d", ep.ID) + } - // Execute upload - result, err := w.uploader.Upload(ctx, cloud.UploadRequest{ + uploader, cleanup, err := w.newDirectUploader(dpConfig) + if err != nil { + return nil, fmt.Errorf("create direct uploader for asset_id %s: %w", assetID, err) + } + defer cleanup() + + logger.Printf("[SYNC-WORKER] Episode %d direct sync config resolved: asset_id=%s auth=%s auth_tls=%t gateway=%s gateway_tls=%t", + ep.ID, assetID, dpConfig.Auth.Target, dpConfig.Auth.UseTLS, dpConfig.Gateway.Target, dpConfig.Gateway.UseTLS) + + return uploader.Upload(ctx, cloud.UploadRequest{ EpisodeID: ep.EpisodeUUID, McapKey: mcapKey, + AssetID: assetID, RawTags: rawTags, }) +} + +func (w *SyncWorker) newDirectUploader(dpConfig *DPDeviceUploadConfig) (*cloud.Uploader, func(), error) { + if dpConfig == nil { + return nil, func() {}, fmt.Errorf("missing DP upload config") + } + authClient := cloud.NewAuthClient(cloud.AuthClientConfig{ + Endpoint: 
dpConfig.Auth.Target, + UseTLS: dpConfig.Auth.UseTLS, + TLSServerName: dpConfig.Auth.ServerName, + APIKey: dpConfig.Profile.APIKey, + RefreshBefore: 60 * time.Second, + }) + gatewayClient := cloud.NewGatewayClient(cloud.GatewayClientConfig{ + Endpoint: dpConfig.Gateway.Target, + UseTLS: dpConfig.Gateway.UseTLS, + TLSServerName: dpConfig.Gateway.ServerName, + RequestTimeout: w.syncRequestTimeout(), + }, authClient) + cleanup := func() { + if err := gatewayClient.Close(); err != nil { + logger.Printf("[SYNC-WORKER] Failed to close direct gateway client: %v", err) + } + if err := authClient.Close(); err != nil { + logger.Printf("[SYNC-WORKER] Failed to close direct auth client: %v", err) + } + } + + uploader, err := cloud.NewUploader(gatewayClient, w.minioClient, w.minioBucket, cloud.UploaderConfig{ + RequestTimeout: w.syncRequestTimeout(), + OSSTimeout: w.syncOSSTimeout(), + PersistRootDir: w.syncPersistRootDir(), + MaxRestartCount: uint32(w.syncMaxRestartCount()), //nolint:gosec // non-negative by helper + }) if err != nil { - duration := int64(time.Since(startTime).Seconds()) - w.markSyncFailed(ctx, syncLogID, episodeID, duration, err, attemptCount) - return + cleanup() + return nil, func() {}, err } + return uploader, cleanup, nil +} - // Success: update episode and sync_log - duration := int64(time.Since(startTime).Seconds()) - w.markSyncCompleted(ctx, syncLogID, episodeID, result, duration) +func (w *SyncWorker) syncRequestTimeout() time.Duration { + if w.syncCfg != nil && w.syncCfg.RequestTimeoutSec > 0 { + return time.Duration(w.syncCfg.RequestTimeoutSec) * time.Second + } + return 30 * time.Second +} + +func (w *SyncWorker) syncOSSTimeout() time.Duration { + if w.syncCfg != nil && w.syncCfg.OSSTimeoutSec > 0 { + return time.Duration(w.syncCfg.OSSTimeoutSec) * time.Second + } + return 300 * time.Second +} + +func (w *SyncWorker) syncPersistRootDir() string { + if w.syncCfg == nil { + return "" + } + return w.syncCfg.PersistRootDir +} + +func (w *SyncWorker) 
syncMaxRestartCount() int { + if w.syncCfg != nil && w.syncCfg.MaxRestartCount >= 0 { + return w.syncCfg.MaxRestartCount + } + return 3 } func (w *SyncWorker) acquireSyncLogWithMode(ctx context.Context, episodeID int64, sourcePath string, manual bool) (int64, int, error) { @@ -893,7 +1025,7 @@ func (w *SyncWorker) acquireSyncLogWithMode(ctx context.Context, episodeID int64 case "completed": return 0, 0, fmt.Errorf("episode %d already has completed sync_log", episodeID) case "failed": - retryDue := !latest.NextRetry.Valid || !latest.NextRetry.Time.After(now) + retryDue := latest.NextRetry.Valid && !latest.NextRetry.Time.After(now) if latest.AttemptCount < w.cfg.MaxRetries && retryDue { res, updErr := tx.ExecContext(ctx, ` UPDATE sync_logs @@ -924,6 +1056,9 @@ func (w *SyncWorker) acquireSyncLogWithMode(ctx context.Context, episodeID int64 return latest.ID, latest.AttemptCount + 1, nil } + if !manual && !latest.NextRetry.Valid { + return 0, 0, fmt.Errorf("%w for episode %d", errSyncNonRetryableFailed, episodeID) + } if !manual && latest.AttemptCount >= w.cfg.MaxRetries { return 0, 0, fmt.Errorf("max retries exceeded for episode %d", episodeID) } @@ -1006,8 +1141,11 @@ func (w *SyncWorker) markSyncFailed(ctx context.Context, syncLogID, episodeID, d now := time.Now().UTC() errMsg := uploadErr.Error() - backoff := w.nextRetryDelay(attemptCount) - nextRetry := now.Add(backoff) + var nextRetry sql.NullTime + if !isNonRetryableSyncError(uploadErr) { + backoff := w.nextRetryDelay(attemptCount) + nextRetry = sql.NullTime{Time: now.Add(backoff), Valid: true} + } if _, err := w.db.ExecContext(ctx, ` UPDATE sync_logs @@ -1021,8 +1159,13 @@ func (w *SyncWorker) markSyncFailed(ctx context.Context, syncLogID, episodeID, d logger.Printf("[SYNC-WORKER] Failed to update sync log %d as failed: %v", syncLogID, err) } - logger.Printf("[SYNC-WORKER] Episode %d sync failed: %v (attempt=%d, next_retry=%v)", - episodeID, uploadErr, attemptCount, nextRetry.Format(time.RFC3339)) + if 
nextRetry.Valid { + logger.Printf("[SYNC-WORKER] Episode %d sync failed: %v (attempt=%d, next_retry=%v)", + episodeID, uploadErr, attemptCount, nextRetry.Time.Format(time.RFC3339)) + return + } + logger.Printf("[SYNC-WORKER] Episode %d sync failed non-retryable: %v (attempt=%d)", + episodeID, uploadErr, attemptCount) } func (w *SyncWorker) nextRetryDelay(attemptCount int) time.Duration { @@ -1100,6 +1243,35 @@ func (w *SyncWorker) tagsFromSidecar(ctx context.Context, sidecarPath string) (m return tags, nil } +func (w *SyncWorker) directTagsFromSidecar(ctx context.Context, sidecarPath string) (map[string]string, error) { + key := stripBucketPrefix(sidecarPath) + if key == "" { + return nil, newNonRetryableSyncError("empty sidecar_path") + } + if w.minioClient == nil { + return nil, fmt.Errorf("minio client not available") + } + + obj, err := w.minioClient.GetObject(ctx, w.minioBucket, key, minio.GetObjectOptions{}) + if err != nil { + return nil, fmt.Errorf("get sidecar object %s: %w", key, err) + } + defer func() { + _ = obj.Close() + }() + + data, err := io.ReadAll(obj) + if err != nil { + return nil, fmt.Errorf("read sidecar object %s: %w", key, err) + } + + tags, err := flattenSidecar(data) + if err != nil { + return nil, wrapNonRetryableSyncError(err, "flatten sidecar %s", key) + } + return tags, nil +} + // stripBucketPrefix removes the leading "bucket/" prefix from a stored path. // Stored paths look like "edge-factory-default/factory-default/device/date/task.mcap". 
func stripBucketPrefix(path string) string { diff --git a/internal/services/sync_worker_test.go b/internal/services/sync_worker_test.go index 02d3151..9f518e0 100644 --- a/internal/services/sync_worker_test.go +++ b/internal/services/sync_worker_test.go @@ -5,11 +5,17 @@ package services import ( + "bytes" "context" + "database/sql" "errors" + "log" + "strings" "testing" "time" + "archebase.com/keystone-edge/internal/cloud" + "archebase.com/keystone-edge/internal/logger" "github.com/jmoiron/sqlx" _ "modernc.org/sqlite" ) @@ -129,6 +135,27 @@ func TestFindPendingEpisodes_ExcludesExhaustedFailuresFromPollingOnly(t *testing assertEpisodeIDs(t, pollIDs, []int64{1, 3}) } +func TestFindPendingEpisodes_SkipsNonRetryableFailuresFromPollingOnly(t *testing.T) { + db := newTestSyncWorkerDB(t) + w := &SyncWorker{db: db, cfg: SyncWorkerConfig{BatchSize: 10, MaxRetries: 3}} + + insertEpisodeForSyncWorkerTest(t, db, 5, "approved", false) + insertEpisodeForSyncWorkerTest(t, db, 6, "approved", false) + insertNonRetryableSyncLogForSyncWorkerTest(t, db, 6, "failed", 1) + + apiIDs, err := w.findPendingEpisodes(context.Background(), true) + if err != nil { + t.Fatalf("api pending query failed: %v", err) + } + assertEpisodeIDs(t, apiIDs, []int64{5, 6}) + + pollIDs, err := w.findPendingEpisodes(context.Background(), false) + if err != nil { + t.Fatalf("poll pending query failed: %v", err) + } + assertEpisodeIDs(t, pollIDs, []int64{5}) +} + func TestEnqueueEpisodeManual_AllowsExhaustedRetryEpisode(t *testing.T) { db := newTestSyncWorkerDB(t) w := &SyncWorker{ @@ -262,6 +289,35 @@ func TestEnqueueEpisodeManual_RejectsPendingEpisode(t *testing.T) { } } +func TestEnqueueEpisodeManual_AllowsNonRetryableFailure(t *testing.T) { + db := newTestSyncWorkerDB(t) + w := &SyncWorker{ + db: db, + cfg: SyncWorkerConfig{BatchSize: 10, MaxRetries: 3}, + enqueueCh: make(chan syncEnqueueRequest, 1), + enqueuedEpisode: make(map[int64]struct{}), + } + w.running.Store(true) + + 
insertEpisodeForSyncWorkerTest(t, db, 24, "approved", false) + insertNonRetryableSyncLogForSyncWorkerTest(t, db, 24, "failed", 1) + + if err := w.EnqueueEpisodeManual(context.Background(), 24); err != nil { + t.Fatalf("manual enqueue failed: %v", err) + } + + latest := latestSyncLogForSyncWorkerTest(t, db, 24) + if latest.Status != "pending" { + t.Fatalf("latest status = %q, want pending", latest.Status) + } + if latest.AttemptCount != 0 { + t.Fatalf("latest attempt_count = %d, want fresh pending attempt count 0", latest.AttemptCount) + } + if count := countSyncLogsForSyncWorkerTest(t, db, 24); count != 2 { + t.Fatalf("sync log count = %d, want failed history plus fresh pending", count) + } +} + func TestEnqueuePendingEpisodes_PersistsPendingWhenMemoryQueueFull(t *testing.T) { db := newTestSyncWorkerDB(t) w := &SyncWorker{ @@ -448,6 +504,51 @@ func TestRetryFailedEpisodes_PromotesDueFailureToPendingBeforeDispatch(t *testin } } +func TestRetryFailedEpisodes_IgnoresMissingDeletedAndSyncedEpisodes(t *testing.T) { + db := newTestSyncWorkerDB(t) + w := &SyncWorker{ + db: db, + cfg: SyncWorkerConfig{BatchSize: 10, MaxRetries: 3}, + jobCh: make(chan syncEnqueueRequest, 1), + enqueuedEpisode: make(map[int64]struct{}), + } + + insertSyncLogForSyncWorkerTest(t, db, 2, "failed", 1) + insertEpisodeForSyncWorkerTest(t, db, 3, "approved", false) + insertSyncLogForSyncWorkerTest(t, db, 3, "failed", 1) + if _, err := db.Exec(`UPDATE episodes SET deleted_at = ? 
WHERE id = 3`, time.Now().UTC()); err != nil { + t.Fatalf("mark episode deleted: %v", err) + } + insertEpisodeForSyncWorkerTest(t, db, 4, "approved", true) + insertSyncLogForSyncWorkerTest(t, db, 4, "failed", 1) + insertEpisodeForSyncWorkerTest(t, db, 5, "approved", false) + insertSyncLogForSyncWorkerTest(t, db, 5, "failed", 1) + + var logs bytes.Buffer + previousLogger := logger.Get() + logger.Set(log.New(&logs, "", 0)) + t.Cleanup(func() { logger.Set(previousLogger) }) + + w.retryFailedEpisodes(context.Background()) + + if strings.Contains(logs.String(), "Failed to queue retry") { + t.Fatalf("unexpected retry queue failure log: %s", logs.String()) + } + + latest := latestSyncLogForSyncWorkerTest(t, db, 5) + if latest.Status != "pending" { + t.Fatalf("episode 5 latest status = %q, want pending", latest.Status) + } + select { + case got := <-w.jobCh: + if got.episodeID != 5 { + t.Fatalf("unexpected retry dispatch episode id: got %d want 5", got.episodeID) + } + default: + t.Fatal("expected valid retryable episode to be dispatched") + } +} + func TestAcquireSyncLogWithMode_ClaimsFreshPendingRow(t *testing.T) { db := newTestSyncWorkerDB(t) w := &SyncWorker{ @@ -616,6 +717,74 @@ func TestNextRetryDelay_IncludesBoundedJitter(t *testing.T) { } } +func TestMarkSyncFailed_NonRetryableClearsNextRetry(t *testing.T) { + db := newTestSyncWorkerDB(t) + w := &SyncWorker{ + db: db, + cfg: SyncWorkerConfig{RetryBaseSec: 30, RetryMaxSec: 1800}, + } + + insertEpisodeForSyncWorkerTest(t, db, 25, "approved", false) + insertSyncLogForSyncWorkerTest(t, db, 25, "in_progress", 1) + var syncLogID int64 + if err := db.Get(&syncLogID, "SELECT id FROM sync_logs WHERE episode_id = ?", 25); err != nil { + t.Fatalf("query sync log id: %v", err) + } + + w.markSyncFailed(context.Background(), syncLogID, 25, 0, newNonRetryableSyncError("asset_id missing"), 1) + + latest := latestSyncLogForSyncWorkerTest(t, db, 25) + if latest.Status != "failed" { + t.Fatalf("latest status = %q, want failed", 
latest.Status) + } + if latest.NextRetry.Valid { + t.Fatalf("next_retry_at valid = true, want NULL") + } +} + +func TestMarkSyncCompleted_WritesExistingCloudFields(t *testing.T) { + db := newTestSyncWorkerDB(t) + w := &SyncWorker{db: db} + + insertEpisodeForSyncWorkerTest(t, db, 26, "approved", false) + insertSyncLogForSyncWorkerTest(t, db, 26, "in_progress", 1) + var syncLogID int64 + if err := db.Get(&syncLogID, "SELECT id FROM sync_logs WHERE episode_id = ?", 26); err != nil { + t.Fatalf("query sync log id: %v", err) + } + + w.markSyncCompleted(context.Background(), syncLogID, 26, &cloud.UploadResult{ + LogicalUploadID: "logical-26", + UploadID: "upload-26", + ObjectKey: "cloud/object.mcap", + FileSize: 12345, + }, 3) + + var ep struct { + CloudSynced bool `db:"cloud_synced"` + CloudMcapPath string `db:"cloud_mcap_path"` + CloudProcessed bool `db:"cloud_processed"` + } + if err := db.Get(&ep, "SELECT cloud_synced, cloud_mcap_path, cloud_processed FROM episodes WHERE id = ?", 26); err != nil { + t.Fatalf("query episode cloud fields: %v", err) + } + if !ep.CloudSynced || ep.CloudMcapPath != "cloud/object.mcap" || ep.CloudProcessed { + t.Fatalf("episode cloud fields = %+v", ep) + } + + var logRow struct { + Status string `db:"status"` + DestinationPath string `db:"destination_path"` + BytesTransferred int64 `db:"bytes_transferred"` + } + if err := db.Get(&logRow, "SELECT status, destination_path, bytes_transferred FROM sync_logs WHERE id = ?", syncLogID); err != nil { + t.Fatalf("query sync log completion fields: %v", err) + } + if logRow.Status != "completed" || logRow.DestinationPath != "cloud/object.mcap" || logRow.BytesTransferred != 12345 { + t.Fatalf("sync log completion fields = %+v", logRow) + } +} + func newTestSyncWorkerDB(t *testing.T) *sqlx.DB { t.Helper() @@ -629,6 +798,9 @@ func newTestSyncWorkerDB(t *testing.T) *sqlx.DB { id INTEGER PRIMARY KEY, qa_status TEXT NOT NULL, cloud_synced BOOLEAN NOT NULL DEFAULT 0, + cloud_synced_at TIMESTAMP NULL, + 
cloud_mcap_path TEXT, + cloud_processed BOOLEAN NOT NULL DEFAULT 0, deleted_at TIMESTAMP NULL, created_at TIMESTAMP NOT NULL )`, @@ -637,6 +809,8 @@ func newTestSyncWorkerDB(t *testing.T) *sqlx.DB { episode_id INTEGER NOT NULL, source_path TEXT, status TEXT NOT NULL, + destination_path TEXT, + bytes_transferred INTEGER, duration_sec INTEGER, error_message TEXT, attempt_count INTEGER NOT NULL DEFAULT 0, @@ -675,18 +849,35 @@ func insertEpisodeForSyncWorkerTest(t *testing.T, db *sqlx.DB, id int64, qaStatu func insertSyncLogForSyncWorkerTest(t *testing.T, db *sqlx.DB, episodeID int64, status string, attemptCount int) { t.Helper() + startedAt := time.Date(2026, 2, int(episodeID), 0, 0, 0, 0, time.UTC) + nextRetry := sql.NullTime{} + if status == "failed" { + nextRetry = sql.NullTime{Time: startedAt.Add(time.Second), Valid: true} + } + if _, err := db.Exec(` + INSERT INTO sync_logs (episode_id, status, attempt_count, started_at, next_retry_at) + VALUES (?, ?, ?, ?, ?) + `, episodeID, status, attemptCount, startedAt, nextRetry); err != nil { + t.Fatalf("insert sync log for episode %d: %v", episodeID, err) + } +} + +func insertNonRetryableSyncLogForSyncWorkerTest(t *testing.T, db *sqlx.DB, episodeID int64, status string, attemptCount int) { + t.Helper() + startedAt := time.Date(2026, 2, int(episodeID), 0, 0, 0, 0, time.UTC) if _, err := db.Exec(` - INSERT INTO sync_logs (episode_id, status, attempt_count, started_at) - VALUES (?, ?, ?, ?) 
+ INSERT INTO sync_logs (episode_id, status, attempt_count, started_at, next_retry_at) + VALUES (?, ?, ?, ?, NULL) `, episodeID, status, attemptCount, startedAt); err != nil { t.Fatalf("insert sync log for episode %d: %v", episodeID, err) } } type syncLogForSyncWorkerTest struct { - Status string `db:"status"` - AttemptCount int `db:"attempt_count"` + Status string `db:"status"` + AttemptCount int `db:"attempt_count"` + NextRetry sql.NullTime `db:"next_retry_at"` } func latestSyncLogForSyncWorkerTest(t *testing.T, db *sqlx.DB, episodeID int64) syncLogForSyncWorkerTest { @@ -694,7 +885,7 @@ func latestSyncLogForSyncWorkerTest(t *testing.T, db *sqlx.DB, episodeID int64) var row syncLogForSyncWorkerTest if err := db.Get(&row, ` - SELECT status, attempt_count + SELECT status, attempt_count, next_retry_at FROM sync_logs WHERE episode_id = ? ORDER BY id DESC diff --git a/internal/storage/database/migrations/000004_cli_sync_runs.down.sql b/internal/storage/database/migrations/000004_cli_sync_runs.down.sql deleted file mode 100644 index 6493ded..0000000 --- a/internal/storage/database/migrations/000004_cli_sync_runs.down.sql +++ /dev/null @@ -1,5 +0,0 @@ --- SPDX-FileCopyrightText: 2026 ArcheBase --- --- SPDX-License-Identifier: MulanPSL-2.0 - -DROP TABLE IF EXISTS cli_sync_runs; diff --git a/internal/storage/database/migrations/000004_cli_sync_runs.up.sql b/internal/storage/database/migrations/000004_cli_sync_runs.up.sql deleted file mode 100644 index 5d56f17..0000000 --- a/internal/storage/database/migrations/000004_cli_sync_runs.up.sql +++ /dev/null @@ -1,29 +0,0 @@ --- SPDX-FileCopyrightText: 2026 ArcheBase --- --- SPDX-License-Identifier: MulanPSL-2.0 - -CREATE TABLE IF NOT EXISTS cli_sync_runs ( - id BIGINT AUTO_INCREMENT PRIMARY KEY, - episode_id BIGINT NOT NULL, - status ENUM('pending', 'in_progress', 'completed', 'failed') NOT NULL DEFAULT 'pending', - source_path VARCHAR(1024), - temp_path VARCHAR(1024), - dp_config_path VARCHAR(1024), - file_id VARCHAR(255), - 
logical_upload_id VARCHAR(255), - upload_id VARCHAR(255), - bucket VARCHAR(255), - object_key VARCHAR(1024), - file_size BIGINT, - oss_object_etag VARCHAR(255), - duration_sec INT, - error_message TEXT, - stdout_json JSON DEFAULT NULL, - started_at TIMESTAMP NULL, - completed_at TIMESTAMP NULL, - created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, - updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP, - INDEX idx_cli_sync_episode (episode_id), - INDEX idx_cli_sync_status (status), - INDEX idx_cli_sync_created (created_at) -) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4; diff --git a/internal/storage/database/migrations/000004_robot_asset_id.down.sql b/internal/storage/database/migrations/000004_robot_asset_id.down.sql new file mode 100644 index 0000000..c601e3d --- /dev/null +++ b/internal/storage/database/migrations/000004_robot_asset_id.down.sql @@ -0,0 +1,7 @@ +-- SPDX-FileCopyrightText: 2026 ArcheBase +-- +-- SPDX-License-Identifier: MulanPSL-2.0 + +ALTER TABLE robots + DROP INDEX idx_asset_active_unique, + DROP COLUMN _asset_unique; diff --git a/internal/storage/database/migrations/000004_robot_asset_id.up.sql b/internal/storage/database/migrations/000004_robot_asset_id.up.sql new file mode 100644 index 0000000..6bcf96c --- /dev/null +++ b/internal/storage/database/migrations/000004_robot_asset_id.up.sql @@ -0,0 +1,14 @@ +-- SPDX-FileCopyrightText: 2026 ArcheBase +-- +-- SPDX-License-Identifier: MulanPSL-2.0 + +ALTER TABLE robots + ADD COLUMN _asset_unique VARCHAR(100) + GENERATED ALWAYS AS ( + CASE + WHEN deleted_at IS NULL AND asset_id IS NOT NULL AND asset_id <> '' + THEN asset_id + ELSE NULL + END + ) STORED, + ADD UNIQUE INDEX idx_asset_active_unique (_asset_unique); From 7fa6dc4aa2685fb70583d26014e2c4d5bb48a5d1 Mon Sep 17 00:00:00 2001 From: chaoliu Date: Thu, 4 Jun 2026 13:37:38 +0800 Subject: [PATCH 3/7] fix(sync): keep upload part size stable --- internal/cloud/uploader.go | 30 +++++++++++++++++++++++------- internal/cloud/uploader_test.go 
| 15 +++++++++++++-- 2 files changed, 36 insertions(+), 9 deletions(-) diff --git a/internal/cloud/uploader.go b/internal/cloud/uploader.go index 9fdbf98..8d746da 100644 --- a/internal/cloud/uploader.go +++ b/internal/cloud/uploader.go @@ -74,6 +74,7 @@ type persistedUploadState struct { McapKey string `json:"mcap_key"` AssetID string `json:"asset_id"` FileSize int64 `json:"file_size"` + PartSizeBytes int64 `json:"part_size_bytes,omitempty"` UpdatedAt time.Time `json:"updated_at"` } @@ -262,6 +263,7 @@ func (u *Uploader) Upload(ctx context.Context, req UploadRequest) (*UploadResult McapKey: req.McapKey, AssetID: req.AssetID, FileSize: fileSize, + PartSizeBytes: session.PartSizeBytes, UpdatedAt: time.Now(), }); err != nil { return nil, fmt.Errorf("persist initial upload state: %w", err) @@ -302,6 +304,7 @@ func (u *Uploader) Upload(ctx context.Context, req UploadRequest) (*UploadResult McapKey: req.McapKey, AssetID: req.AssetID, FileSize: fileSize, + PartSizeBytes: session.PartSizeBytes, UpdatedAt: time.Now(), }); err != nil { logger.Printf("[CLOUD-UPLOAD] Warning: failed to update state with multipart_upload_id: %v", err) @@ -312,7 +315,7 @@ func (u *Uploader) Upload(ctx context.Context, req UploadRequest) (*UploadResult // Step 4: Refresh STS credentials if about to expire before CompleteUpload RPC if time.Until(session.STSExpireAt) <= u.cfg.RequestTimeout { - refreshed, err := u.gateway.ReissueUploadCredentials(ctx, session.UploadID) + refreshed, err := u.refreshUploadCredentials(ctx, session) if err != nil { logger.Printf("[CLOUD-UPLOAD] Warning: refresh credentials failed (proceeding anyway): %v", err) } else { @@ -478,6 +481,9 @@ func (u *Uploader) decideResumeAction(ctx context.Context, state *persistedUploa // Treat RPC failures as transient: preserve local state for next retry. 
return resumeContinue, nil, "", 0, fmt.Errorf("ReissueUploadCredentials: %w", err) } + if state.PartSizeBytes > 0 { + session.PartSizeBytes = state.PartSizeBytes + } if state.MultipartUploadID != "" { outcome, err := u.reconcileRemoteParts(ctx, session, state.MultipartUploadID) @@ -578,10 +584,12 @@ func (u *Uploader) minioRangeReader(key string) partStreamFactory { // uploadParts streams the MCAP from MinIO and uploads it to OSS in parts. // Returns the OSS multipart upload ID, the list of uploaded parts, per-part MD5 digests, and any error. func (u *Uploader) uploadParts(ctx context.Context, req UploadRequest, session *UploadSession, fileSize int64) (*UploadSession, string, []UploadedPart, [][16]byte, error) { + fixedPartSizeBytes := normalizedPartSizeBytes(session.PartSizeBytes) session, err := u.ensureFreshUploadCredentials(ctx, session) if err != nil { return nil, "", nil, nil, fmt.Errorf("refresh credentials before initiate multipart upload: %w", err) } + session.PartSizeBytes = fixedPartSizeBytes // Initiate multipart upload on OSS multipartUploadID, err := u.oss.InitiateMultipartUpload(ctx, session) @@ -595,7 +603,7 @@ func (u *Uploader) uploadParts(ctx context.Context, req UploadRequest, session * // connection is not left idle during OSS part uploads. A single streaming // response would risk idle connection timeout (~20-25s on MinIO or network // intermediaries) when upload speed is slow. 
- session, parts, partMD5s, err := u.streamMultipartParts(ctx, req.EpisodeID, session, multipartUploadID, fileSize, u.minioRangeReader(req.McapKey)) + session, parts, partMD5s, err := u.streamMultipartParts(ctx, req.EpisodeID, session, multipartUploadID, fileSize, fixedPartSizeBytes, u.minioRangeReader(req.McapKey)) if err != nil { u.abortMultipartUpload(session, multipartUploadID) return nil, "", nil, nil, err @@ -606,6 +614,7 @@ func (u *Uploader) uploadParts(ctx context.Context, req UploadRequest, session * u.abortMultipartUpload(session, multipartUploadID) return nil, "", nil, nil, fmt.Errorf("refresh credentials before complete multipart upload: %w", err) } + session.PartSizeBytes = fixedPartSizeBytes // Complete multipart upload on OSS if _, err := u.oss.CompleteMultipartUpload(ctx, session, multipartUploadID, parts); err != nil { @@ -616,11 +625,9 @@ func (u *Uploader) uploadParts(ctx context.Context, req UploadRequest, session * return session, multipartUploadID, parts, partMD5s, nil } -func (u *Uploader) streamMultipartParts(ctx context.Context, episodeID string, session *UploadSession, multipartUploadID string, fileSize int64, newPartStream partStreamFactory) (*UploadSession, []UploadedPart, [][16]byte, error) { - partSizeBytes := session.PartSizeBytes - if partSizeBytes <= 0 { - partSizeBytes = 8 * 1024 * 1024 // 8MB default - } +func (u *Uploader) streamMultipartParts(ctx context.Context, episodeID string, session *UploadSession, multipartUploadID string, fileSize int64, partSizeBytes int64, newPartStream partStreamFactory) (*UploadSession, []UploadedPart, [][16]byte, error) { + partSizeBytes = normalizedPartSizeBytes(partSizeBytes) + session.PartSizeBytes = partSizeBytes partSize := int(partSizeBytes) if int64(partSize) != partSizeBytes { return session, nil, nil, fmt.Errorf("invalid part_size_bytes %d", partSizeBytes) @@ -676,6 +683,7 @@ func (u *Uploader) streamMultipartParts(ctx context.Context, episodeID string, s return session, nil, nil, 
fmt.Errorf("refresh credentials after upload part %d token expiry: %w", partNumber, refreshErr) } session = refreshed + session.PartSizeBytes = partSizeBytes etag, err = u.oss.UploadPart(ctx, session, multipartUploadID, partNumber, partSlice) } if err != nil { @@ -715,9 +723,17 @@ func (u *Uploader) refreshUploadCredentials(ctx context.Context, session *Upload if err != nil { return nil, err } + refreshed.PartSizeBytes = normalizedPartSizeBytes(session.PartSizeBytes) return refreshed, nil } +func normalizedPartSizeBytes(partSizeBytes int64) int64 { + if partSizeBytes <= 0 { + return 8 * 1024 * 1024 + } + return partSizeBytes +} + func (u *Uploader) stsRefreshWindow() time.Duration { window := u.cfg.RequestTimeout if u.cfg.OSSTimeout > window { diff --git a/internal/cloud/uploader_test.go b/internal/cloud/uploader_test.go index 6d38df8..99e3432 100644 --- a/internal/cloud/uploader_test.go +++ b/internal/cloud/uploader_test.go @@ -667,6 +667,7 @@ func TestStreamMultipartParts_UploadsExpectedPartBoundaries(t *testing.T) { session, "multipart-stream", int64(len(payload)), + session.PartSizeBytes, factory, ) if err != nil { @@ -721,6 +722,7 @@ func TestStreamMultipartParts_EarlyEOFStopsInsteadOfUploadingEmptyParts(t *testi session, "multipart-short", 25, + session.PartSizeBytes, factory, ) if err == nil { @@ -744,14 +746,17 @@ func TestStreamMultipartParts_RefreshesCredentialsBeforeUploadPart(t *testing.T) } refreshed := makeSession("logical-expiring", uploadID) refreshed.STSAccessKeyID = "fresh-key" + refreshed.PartSizeBytes = 99 return refreshed, nil }, } var usedAccessKeyID string + var usedPartSizeBytes int64 oss := &fakeOSS{ uploadPartFn: func(_ context.Context, session *UploadSession, _ string, _ int, _ []byte) (string, error) { usedAccessKeyID = session.STSAccessKeyID + usedPartSizeBytes = session.PartSizeBytes return "etag", nil }, } @@ -766,7 +771,7 @@ func TestStreamMultipartParts_RefreshesCredentialsBeforeUploadPart(t *testing.T) return 
io.NopCloser(bytes.NewReader(payload[offset : offset+length])), nil } - _, parts, _, err := u.streamMultipartParts(context.Background(), "episode-expiring", session, "multipart-expiring", int64(len(payload)), factory) + finalSession, parts, _, err := u.streamMultipartParts(context.Background(), "episode-expiring", session, "multipart-expiring", int64(len(payload)), session.PartSizeBytes, factory) if err != nil { t.Fatalf("streamMultipartParts() error = %v", err) } @@ -779,6 +784,12 @@ func TestStreamMultipartParts_RefreshesCredentialsBeforeUploadPart(t *testing.T) if usedAccessKeyID != "fresh-key" { t.Fatalf("UploadPart access key = %q, want fresh-key", usedAccessKeyID) } + if usedPartSizeBytes != 4 { + t.Fatalf("UploadPart part size = %d, want fixed original size 4", usedPartSizeBytes) + } + if finalSession.PartSizeBytes != 4 { + t.Fatalf("final session part size = %d, want fixed original size 4", finalSession.PartSizeBytes) + } } func TestStreamMultipartParts_RetriesCurrentPartAfterSecurityTokenExpired(t *testing.T) { @@ -816,7 +827,7 @@ func TestStreamMultipartParts_RetriesCurrentPartAfterSecurityTokenExpired(t *tes return io.NopCloser(bytes.NewReader(payload[offset : offset+length])), nil } - _, parts, _, err := u.streamMultipartParts(context.Background(), "episode-retry", session, "multipart-retry", int64(len(payload)), factory) + _, parts, _, err := u.streamMultipartParts(context.Background(), "episode-retry", session, "multipart-retry", int64(len(payload)), session.PartSizeBytes, factory) if err != nil { t.Fatalf("streamMultipartParts() error = %v", err) } From 2b5bf5ca6cae174826cc604444e7efd3c7a15113 Mon Sep 17 00:00:00 2001 From: chaoliu Date: Thu, 4 Jun 2026 14:08:18 +0800 Subject: [PATCH 4/7] feat(sync): support episode resync --- internal/api/handlers/sync.go | 98 ++++++++++++++++++++ internal/services/sync_worker.go | 124 ++++++++++++++++++++------ internal/services/sync_worker_test.go | 64 ++++++++++++- 3 files changed, 258 insertions(+), 28 
deletions(-) diff --git a/internal/api/handlers/sync.go b/internal/api/handlers/sync.go index 9d1694c..eadd585 100644 --- a/internal/api/handlers/sync.go +++ b/internal/api/handlers/sync.go @@ -32,6 +32,7 @@ func NewSyncHandler(db *sqlx.DB, syncWorker *services.SyncWorker) *SyncHandler { // RegisterRoutes registers cloud sync related routes. func (h *SyncHandler) RegisterRoutes(apiV1 *gin.RouterGroup) { apiV1.POST("/sync/episodes", h.TriggerBatchSync) + apiV1.POST("/sync/episodes/:id/resync", h.TriggerEpisodeResync) apiV1.POST("/sync/episodes/:id", h.TriggerEpisodeSync) apiV1.GET("/sync/episodes", h.ListSyncJobs) apiV1.GET("/sync/episodes/summary", h.ListEpisodeSyncSummaries) @@ -40,6 +41,103 @@ func (h *SyncHandler) RegisterRoutes(apiV1 *gin.RouterGroup) { apiV1.GET("/sync/config", h.GetSyncConfig) } +type syncEpisodeActionRow struct { + QaStatus string `db:"qa_status"` + CloudSynced bool `db:"cloud_synced"` +} + +func (h *SyncHandler) loadSyncEpisodeForAction(c *gin.Context, episodeID int64) (syncEpisodeActionRow, bool) { + var row syncEpisodeActionRow + err := h.db.Get(&row, "SELECT qa_status, cloud_synced FROM episodes WHERE id = ? 
AND deleted_at IS NULL", episodeID) + if err == sql.ErrNoRows { + c.JSON(http.StatusNotFound, gin.H{"error": "episode not found"}) + return row, false + } + if err != nil { + logger.Printf("[SYNC] Failed to query episode %d: %v", episodeID, err) + c.JSON(http.StatusInternalServerError, gin.H{"error": "failed to query episode"}) + return row, false + } + return row, true +} + +func (h *SyncHandler) enqueueSyncErrorResponse(c *gin.Context, episodeID int64, err error) { + switch { + case errors.Is(err, services.ErrSyncWorkerNotRunning): + c.JSON(http.StatusServiceUnavailable, gin.H{ + "error": err.Error(), + "episode_id": episodeID, + "status": "worker_not_running", + }) + case errors.Is(err, services.ErrEpisodeAlreadyEnqueued), errors.Is(err, services.ErrSyncAlreadyInProgress): + c.JSON(http.StatusConflict, gin.H{ + "error": err.Error(), + "episode_id": episodeID, + "status": "already_queued", + }) + case errors.Is(err, services.ErrSyncQueueFull): + c.JSON(http.StatusTooManyRequests, gin.H{ + "error": err.Error(), + "episode_id": episodeID, + "status": "queue_full", + }) + default: + logger.Printf("[SYNC] Enqueue episode %d failed: %v", episodeID, err) + c.JSON(http.StatusInternalServerError, gin.H{"error": "failed to enqueue episode"}) + } +} + +// TriggerEpisodeResync queues a new cloud upload for an already-synced episode. 
+// +// @Summary Resync episode to cloud +// @Description Enqueues a new cloud upload for an already-synced episode without clearing previous sync history +// @Tags sync +// @Produce json +// @Param id path int true "Episode ID" +// @Success 202 {object} map[string]interface{} +// @Failure 400 {object} map[string]string +// @Failure 404 {object} map[string]string +// @Failure 409 {object} map[string]string +// @Failure 500 {object} map[string]string +// @Router /sync/episodes/{id}/resync [post] +func (h *SyncHandler) TriggerEpisodeResync(c *gin.Context) { + if h.syncWorker == nil { + c.JSON(http.StatusServiceUnavailable, gin.H{"error": "sync worker is not configured"}) + return + } + + episodeID, ok := parseEpisodeIDParam(c) + if !ok { + return + } + + row, ok := h.loadSyncEpisodeForAction(c, episodeID) + if !ok { + return + } + if row.QaStatus != "approved" && row.QaStatus != "inspector_approved" { + c.JSON(http.StatusBadRequest, gin.H{ + "error": fmt.Sprintf("episode qa_status is %q, must be approved or inspector_approved", row.QaStatus), + }) + return + } + if !row.CloudSynced { + c.JSON(http.StatusBadRequest, gin.H{"error": "episode has not completed cloud sync; use normal sync instead"}) + return + } + + if err := h.syncWorker.EnqueueEpisodeResync(c.Request.Context(), episodeID); err != nil { + h.enqueueSyncErrorResponse(c, episodeID, err) + return + } + + c.JSON(http.StatusAccepted, gin.H{ + "status": "accepted", + "episode_id": episodeID, + "message": "episode enqueued for cloud resync", + }) +} + // syncLogRow represents a row from the sync_logs table. 
type syncLogRow struct { ID int64 `db:"id"` diff --git a/internal/services/sync_worker.go b/internal/services/sync_worker.go index 637d8a4..533d890 100644 --- a/internal/services/sync_worker.go +++ b/internal/services/sync_worker.go @@ -40,6 +40,7 @@ type SyncWorkerConfig struct { type syncEnqueueRequest struct { episodeID int64 manual bool + resync bool } type syncEpisodeUploadRow struct { @@ -238,13 +239,25 @@ func (w *SyncWorker) EnqueueEpisodeManual(ctx context.Context, episodeID int64) if !w.running.Load() { return ErrSyncWorkerNotRunning } - if err := w.persistPendingSyncLog(ctx, episodeID, true); err != nil { + if err := w.persistPendingSyncLog(ctx, episodeID, true, false); err != nil { return err } w.enqueuePersistedEpisode(ctx, syncEnqueueRequest{episodeID: episodeID, manual: true}) return nil } +// EnqueueEpisodeResync queues a new upload attempt for an episode that has already synced. +func (w *SyncWorker) EnqueueEpisodeResync(ctx context.Context, episodeID int64) error { + if !w.running.Load() { + return ErrSyncWorkerNotRunning + } + if err := w.persistResyncSyncLog(ctx, episodeID); err != nil { + return err + } + w.enqueuePersistedEpisode(ctx, syncEnqueueRequest{episodeID: episodeID, manual: true, resync: true}) + return nil +} + func (w *SyncWorker) enqueueEpisode(ctx context.Context, episodeID int64, manual bool) error { if !w.running.Load() { return ErrSyncWorkerNotRunning @@ -281,7 +294,7 @@ func (w *SyncWorker) enqueuePersistedEpisode(ctx context.Context, req syncEnqueu } } -func (w *SyncWorker) persistPendingSyncLog(ctx context.Context, episodeID int64, manual bool) error { +func (w *SyncWorker) persistPendingSyncLog(ctx context.Context, episodeID int64, manual bool, allowSynced bool) error { if w.db == nil { return nil } @@ -307,7 +320,7 @@ func (w *SyncWorker) persistPendingSyncLog(ctx context.Context, episodeID int64, } return fmt.Errorf("lock episode %d: %w", episodeID, err) } - if episode.CloudSynced { + if episode.CloudSynced && 
!allowSynced { return fmt.Errorf("episode %d already synced", episodeID) } @@ -379,6 +392,55 @@ func (w *SyncWorker) persistPendingSyncLog(ctx context.Context, episodeID int64, } } +func (w *SyncWorker) persistResyncSyncLog(ctx context.Context, episodeID int64) error { + if w.db == nil { + return nil + } + + tx, err := w.db.BeginTxx(ctx, nil) + if err != nil { + return fmt.Errorf("begin resync sync_log transaction: %w", err) + } + defer func() { _ = tx.Rollback() }() + + lockClause := txLockClause(tx) + var episode struct { + ID int64 `db:"id"` + CloudSynced bool `db:"cloud_synced"` + } + if err := tx.GetContext(ctx, &episode, ` + SELECT id, cloud_synced + FROM episodes + WHERE id = ? AND deleted_at IS NULL + `+lockClause, episodeID); err != nil { + if err == sql.ErrNoRows { + return fmt.Errorf("episode %d not found", episodeID) + } + return fmt.Errorf("lock episode %d for resync: %w", episodeID, err) + } + if !episode.CloudSynced { + return fmt.Errorf("episode %d has not completed cloud sync", episodeID) + } + + var activeCount int + if err := tx.GetContext(ctx, &activeCount, ` + SELECT COUNT(*) + FROM sync_logs + WHERE episode_id = ? 
+ AND status IN ('pending', 'in_progress') + `, episodeID); err != nil { + return fmt.Errorf("query active resync sync_log count: %w", err) + } + if activeCount > 0 { + return fmt.Errorf("%w for episode %d", ErrSyncAlreadyInProgress, episodeID) + } + + if err := insertPendingSyncLog(ctx, tx, episodeID, time.Now().UTC(), 0); err != nil { + return err + } + return tx.Commit() +} + func insertPendingSyncLog(ctx context.Context, tx *sqlx.Tx, episodeID int64, queuedAt time.Time, attemptCount int) error { if _, err := tx.ExecContext(ctx, ` INSERT INTO sync_logs (episode_id, status, attempt_count, started_at) @@ -442,7 +504,7 @@ func (w *SyncWorker) EnqueuePendingEpisodes(ctx context.Context) (int, error) { } count := 0 for _, id := range ids { - if err := w.persistPendingSyncLog(ctx, id, false); err != nil { + if err := w.persistPendingSyncLog(ctx, id, false, false); err != nil { if isSkippablePendingError(err) { continue } @@ -515,9 +577,9 @@ func (w *SyncWorker) processEnqueuedEpisode(ctx context.Context, req syncEnqueue w.processEnqueuedEpisodeWith(ctx, req, w.processEpisodeWithMode) } -func (w *SyncWorker) processEnqueuedEpisodeWith(ctx context.Context, req syncEnqueueRequest, process func(context.Context, int64, bool)) { +func (w *SyncWorker) processEnqueuedEpisodeWith(ctx context.Context, req syncEnqueueRequest, process func(context.Context, int64, bool, bool)) { defer w.unmarkEnqueued(req.episodeID) - process(ctx, req.episodeID, req.manual) + process(ctx, req.episodeID, req.manual, req.resync) } func (w *SyncWorker) dispatchJob(ctx context.Context, req syncEnqueueRequest) { @@ -608,7 +670,7 @@ func (w *SyncWorker) pollAndProcess(ctx context.Context) { logger.Printf("[SYNC-WORKER] Found %d episodes to sync", len(ids)) for _, id := range ids { - if err := w.persistPendingSyncLog(ctx, id, false); err != nil { + if err := w.persistPendingSyncLog(ctx, id, false, false); err != nil { if isSkippablePendingError(err) { continue } @@ -620,13 +682,13 @@ func (w *SyncWorker) 
pollAndProcess(ctx context.Context) { } func (w *SyncWorker) dispatchPendingSyncLogs(ctx context.Context) { - ids, err := w.findPendingSyncLogEpisodes(ctx) + reqs, err := w.findPendingSyncLogEpisodes(ctx) if err != nil { logger.Printf("[SYNC-WORKER] Failed to find queued sync logs: %v", err) return } - for _, id := range ids { - w.dispatchPersistedJob(ctx, syncEnqueueRequest{episodeID: id, manual: false}) + for _, req := range reqs { + w.dispatchPersistedJob(ctx, req) } } @@ -637,10 +699,13 @@ func (w *SyncWorker) dispatchPersistedJob(ctx context.Context, req syncEnqueueRe w.dispatchJob(ctx, req) } -func (w *SyncWorker) findPendingSyncLogEpisodes(ctx context.Context) ([]int64, error) { - var ids []int64 - if err := w.db.SelectContext(ctx, &ids, ` - SELECT latest_log.episode_id +func (w *SyncWorker) findPendingSyncLogEpisodes(ctx context.Context) ([]syncEnqueueRequest, error) { + var rows []struct { + EpisodeID int64 `db:"episode_id"` + CloudSynced bool `db:"cloud_synced"` + } + if err := w.db.SelectContext(ctx, &rows, ` + SELECT latest_log.episode_id, e.cloud_synced FROM sync_logs latest_log INNER JOIN ( SELECT episode_id, MAX(id) AS latest_id @@ -649,14 +714,17 @@ func (w *SyncWorker) findPendingSyncLogEpisodes(ctx context.Context) ([]int64, e ) latest ON latest_log.episode_id = latest.episode_id AND latest_log.id = latest.latest_id INNER JOIN episodes e ON e.id = latest_log.episode_id WHERE latest_log.status = 'pending' - AND e.cloud_synced = FALSE AND e.deleted_at IS NULL ORDER BY latest_log.started_at ASC, latest_log.id ASC LIMIT ? 
`, w.cfg.BatchSize); err != nil { return nil, fmt.Errorf("query pending sync logs: %w", err) } - return ids, nil + reqs := make([]syncEnqueueRequest, len(rows)) + for i, row := range rows { + reqs[i] = syncEnqueueRequest{episodeID: row.EpisodeID, resync: row.CloudSynced} + } + return reqs, nil } func (w *SyncWorker) findPendingEpisodes(ctx context.Context, includeExhaustedFailures bool) ([]int64, error) { @@ -724,10 +792,13 @@ func (w *SyncWorker) findPendingEpisodes(ctx context.Context, includeExhaustedFa } func (w *SyncWorker) retryFailedEpisodes(ctx context.Context) { - var ids []int64 + var rows []struct { + EpisodeID int64 `db:"episode_id"` + CloudSynced bool `db:"cloud_synced"` + } now := time.Now().UTC() - err := w.db.SelectContext(ctx, &ids, ` - SELECT sl.episode_id + err := w.db.SelectContext(ctx, &rows, ` + SELECT sl.episode_id, e.cloud_synced FROM sync_logs sl INNER JOIN ( SELECT episode_id, MAX(id) AS latest_id @@ -737,7 +808,6 @@ func (w *SyncWorker) retryFailedEpisodes(ctx context.Context) { INNER JOIN episodes e ON e.id = sl.episode_id WHERE sl.status = 'failed' AND e.deleted_at IS NULL - AND e.cloud_synced = FALSE AND sl.attempt_count < ? AND sl.next_retry_at IS NOT NULL AND sl.next_retry_at <= ? 
@@ -754,23 +824,23 @@ func (w *SyncWorker) retryFailedEpisodes(ctx context.Context) { return } - if len(ids) == 0 { + if len(rows) == 0 { return } - for _, id := range ids { - if err := w.persistPendingSyncLog(ctx, id, false); err != nil { + for _, row := range rows { + if err := w.persistPendingSyncLog(ctx, row.EpisodeID, false, row.CloudSynced); err != nil { if isSkippablePendingError(err) { continue } - logger.Printf("[SYNC-WORKER] Failed to queue retry for episode %d: %v", id, err) + logger.Printf("[SYNC-WORKER] Failed to queue retry for episode %d: %v", row.EpisodeID, err) continue } - w.dispatchPersistedJob(ctx, syncEnqueueRequest{episodeID: id, manual: false}) + w.dispatchPersistedJob(ctx, syncEnqueueRequest{episodeID: row.EpisodeID, manual: false, resync: row.CloudSynced}) } } -func (w *SyncWorker) processEpisodeWithMode(ctx context.Context, episodeID int64, manual bool) { +func (w *SyncWorker) processEpisodeWithMode(ctx context.Context, episodeID int64, manual bool, resync bool) { var ep syncEpisodeUploadRow err := w.db.GetContext(ctx, &ep, ` SELECT @@ -796,7 +866,7 @@ func (w *SyncWorker) processEpisodeWithMode(ctx context.Context, episodeID int64 return } - if ep.CloudSynced { + if ep.CloudSynced && !resync { //logger.Printf("[SYNC-WORKER] Episode %d already synced, skipping", episodeID) return } diff --git a/internal/services/sync_worker_test.go b/internal/services/sync_worker_test.go index 9f518e0..2771843 100644 --- a/internal/services/sync_worker_test.go +++ b/internal/services/sync_worker_test.go @@ -226,6 +226,68 @@ func TestEnqueueEpisodeManual_PromotesDueFailureToPending(t *testing.T) { } } +func TestEnqueueEpisodeResync_AllowsAlreadySyncedEpisode(t *testing.T) { + db := newTestSyncWorkerDB(t) + w := &SyncWorker{ + db: db, + cfg: SyncWorkerConfig{BatchSize: 10, MaxRetries: 3}, + enqueueCh: make(chan syncEnqueueRequest, 1), + enqueuedEpisode: make(map[int64]struct{}), + } + w.running.Store(true) + + insertEpisodeForSyncWorkerTest(t, db, 27, 
"approved", true) + insertSyncLogForSyncWorkerTest(t, db, 27, "completed", 1) + + if err := w.EnqueueEpisodeResync(context.Background(), 27); err != nil { + t.Fatalf("resync enqueue failed: %v", err) + } + + latest := latestSyncLogForSyncWorkerTest(t, db, 27) + if latest.Status != "pending" { + t.Fatalf("latest status = %q, want pending", latest.Status) + } + if count := countSyncLogsForSyncWorkerTest(t, db, 27); count != 2 { + t.Fatalf("sync log count = %d, want completed history plus resync pending", count) + } + + select { + case got := <-w.enqueueCh: + if got.episodeID != 27 { + t.Fatalf("unexpected episode id: got %d want 27", got.episodeID) + } + if !got.manual || !got.resync { + t.Fatalf("enqueue flags = manual:%t resync:%t, want both true", got.manual, got.resync) + } + default: + t.Fatal("expected resync episode to be enqueued") + } +} + +func TestDispatchPendingSyncLogs_TreatsSyncedPendingRowsAsResync(t *testing.T) { + db := newTestSyncWorkerDB(t) + w := &SyncWorker{ + db: db, + cfg: SyncWorkerConfig{BatchSize: 10, MaxRetries: 3}, + jobCh: make(chan syncEnqueueRequest, 1), + enqueuedEpisode: make(map[int64]struct{}), + } + + insertEpisodeForSyncWorkerTest(t, db, 28, "approved", true) + insertSyncLogForSyncWorkerTest(t, db, 28, "pending", 0) + + w.dispatchPendingSyncLogs(context.Background()) + + select { + case got := <-w.jobCh: + if got.episodeID != 28 || !got.resync { + t.Fatalf("dispatched request = %+v, want episode 28 resync", got) + } + default: + t.Fatal("expected synced pending row to be dispatched as resync") + } +} + func TestEnqueueEpisode_RejectsInProgressEpisode(t *testing.T) { db := newTestSyncWorkerDB(t) w := &SyncWorker{ @@ -618,7 +680,7 @@ func TestProcessEnqueuedEpisode_HoldsMarkerUntilProcessingReturns(t *testing.T) w.processEnqueuedEpisodeWith( context.Background(), syncEnqueueRequest{episodeID: 77, manual: true}, - func(context.Context, int64, bool) { + func(context.Context, int64, bool, bool) { close(started) <-release }, From 
7a00ed3aa823c11c03fb4e33f4d5e2a620e4f471 Mon Sep 17 00:00:00 2001 From: chaoliu Date: Thu, 4 Jun 2026 14:18:07 +0800 Subject: [PATCH 5/7] fix(sync): satisfy golangci lint --- internal/services/dp_config_loader.go | 4 ++++ internal/services/sync_worker.go | 32 --------------------------- 2 files changed, 4 insertions(+), 32 deletions(-) diff --git a/internal/services/dp_config_loader.go b/internal/services/dp_config_loader.go index b7b0f00..e68ec2c 100644 --- a/internal/services/dp_config_loader.go +++ b/internal/services/dp_config_loader.go @@ -20,23 +20,27 @@ type DPConfigFile struct { Devices []DPDeviceProfile `json:"devices"` } +// DPConfigEndpoints contains the auth and gateway endpoints from a DP config file. type DPConfigEndpoints struct { Auth string `json:"auth"` Gateway string `json:"gateway"` } +// DPDeviceProfile contains upload credentials and tags for one DP device. type DPDeviceProfile struct { DeviceID string `json:"deviceId"` APIKey string `json:"apiKey"` Tags map[string]string `json:"tags"` } +// DPResolvedEndpoint is a normalized upload service endpoint. type DPResolvedEndpoint struct { Target string UseTLS bool ServerName string } +// DPDeviceUploadConfig contains the resolved upload config for one asset ID. type DPDeviceUploadConfig struct { ConfigPath string Auth DPResolvedEndpoint diff --git a/internal/services/sync_worker.go b/internal/services/sync_worker.go index 533d890..52bc7cd 100644 --- a/internal/services/sync_worker.go +++ b/internal/services/sync_worker.go @@ -1281,38 +1281,6 @@ func (w *SyncWorker) nextRetryDelay(attemptCount int) time.Duration { return time.Duration(totalSec * float64(time.Second)) } -// tagsFromSidecar reads the sidecar JSON from MinIO and returns it as a flat string map -// for use as RawTags. topics_summary is excluded. Returns nil map and an error if the -// sidecar path is empty, the object cannot be read, or the JSON is malformed. 
-func (w *SyncWorker) tagsFromSidecar(ctx context.Context, sidecarPath string) (map[string]string, error) { - key := stripBucketPrefix(sidecarPath) - if key == "" { - return nil, fmt.Errorf("empty sidecar_path") - } - if w.minioClient == nil { - return nil, fmt.Errorf("minio client not available") - } - - obj, err := w.minioClient.GetObject(ctx, w.minioBucket, key, minio.GetObjectOptions{}) - if err != nil { - return nil, fmt.Errorf("get sidecar object %s: %w", key, err) - } - defer func() { - _ = obj.Close() - }() - - data, err := io.ReadAll(obj) - if err != nil { - return nil, fmt.Errorf("read sidecar object %s: %w", key, err) - } - - tags, err := flattenSidecar(data) - if err != nil { - return nil, fmt.Errorf("flatten sidecar %s: %w", key, err) - } - return tags, nil -} - func (w *SyncWorker) directTagsFromSidecar(ctx context.Context, sidecarPath string) (map[string]string, error) { key := stripBucketPrefix(sidecarPath) if key == "" { From c7fe736454f15d605961cf883415c63ef52a84cb Mon Sep 17 00:00:00 2001 From: chaoliu Date: Thu, 4 Jun 2026 14:23:32 +0800 Subject: [PATCH 6/7] fix(sync): document local dp api key field --- internal/services/dp_config_loader.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/internal/services/dp_config_loader.go b/internal/services/dp_config_loader.go index e68ec2c..f85f773 100644 --- a/internal/services/dp_config_loader.go +++ b/internal/services/dp_config_loader.go @@ -29,7 +29,7 @@ type DPConfigEndpoints struct { // DPDeviceProfile contains upload credentials and tags for one DP device. 
type DPDeviceProfile struct { DeviceID string `json:"deviceId"` - APIKey string `json:"apiKey"` + APIKey string `json:"apiKey"` // #nosec G117 -- operator-provided local DP upload config credential Tags map[string]string `json:"tags"` } From e8e86eb2033f0dac7b4bcffedd3d69d5e45b7801 Mon Sep 17 00:00:00 2001 From: chaoliu Date: Thu, 4 Jun 2026 14:44:53 +0800 Subject: [PATCH 7/7] test(sync): cover resync retry dispatch --- internal/services/sync_worker_test.go | 27 +++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) diff --git a/internal/services/sync_worker_test.go b/internal/services/sync_worker_test.go index 2771843..d5d26d6 100644 --- a/internal/services/sync_worker_test.go +++ b/internal/services/sync_worker_test.go @@ -566,12 +566,12 @@ func TestRetryFailedEpisodes_PromotesDueFailureToPendingBeforeDispatch(t *testin } } -func TestRetryFailedEpisodes_IgnoresMissingDeletedAndSyncedEpisodes(t *testing.T) { +func TestRetryFailedEpisodes_IgnoresMissingDeletedAndRetriesSyncedEpisodesAsResync(t *testing.T) { db := newTestSyncWorkerDB(t) w := &SyncWorker{ db: db, cfg: SyncWorkerConfig{BatchSize: 10, MaxRetries: 3}, - jobCh: make(chan syncEnqueueRequest, 1), + jobCh: make(chan syncEnqueueRequest, 2), enqueuedEpisode: make(map[int64]struct{}), } @@ -597,17 +597,20 @@ func TestRetryFailedEpisodes_IgnoresMissingDeletedAndSyncedEpisodes(t *testing.T t.Fatalf("unexpected retry queue failure log: %s", logs.String()) } - latest := latestSyncLogForSyncWorkerTest(t, db, 5) - if latest.Status != "pending" { - t.Fatalf("episode 5 latest status = %q, want pending", latest.Status) - } - select { - case got := <-w.jobCh: - if got.episodeID != 5 { - t.Fatalf("unexpected retry dispatch episode id: got %d want 5", got.episodeID) + for _, episodeID := range []int64{4, 5} { + latest := latestSyncLogForSyncWorkerTest(t, db, episodeID) + if latest.Status != "pending" { + t.Fatalf("episode %d latest status = %q, want pending", episodeID, latest.Status) } - default: - 
t.Fatal("expected valid retryable episode to be dispatched") + } + + gotSynced := <-w.jobCh + if gotSynced.episodeID != 4 || !gotSynced.resync { + t.Fatalf("unexpected synced retry dispatch: got %+v want episode 4 resync", gotSynced) + } + gotUnsynced := <-w.jobCh + if gotUnsynced.episodeID != 5 || gotUnsynced.resync { + t.Fatalf("unexpected unsynced retry dispatch: got %+v want episode 5 non-resync", gotUnsynced) } }