diff --git a/cmd/keystone-edge/main.go b/cmd/keystone-edge/main.go
index 7164bff..828437b 100644
--- a/cmd/keystone-edge/main.go
+++ b/cmd/keystone-edge/main.go
@@ -17,7 +17,6 @@ import (
 
 	"github.com/joho/godotenv"
 
-	"archebase.com/keystone-edge/internal/cloud"
 	"archebase.com/keystone-edge/internal/config"
 	"archebase.com/keystone-edge/internal/logger"
 	"archebase.com/keystone-edge/internal/server"
@@ -115,46 +114,8 @@ func main() {
 
 	// Initialize cloud sync worker
 	var syncWorker *services.SyncWorker
-	if cfg.Sync.Enabled && cfg.Sync.AuthEndpoint != "" && cfg.Sync.GatewayEndpoint != "" && s3Client != nil {
-		authClient := cloud.NewAuthClient(cloud.AuthClientConfig{
-			Endpoint:      cfg.Sync.AuthEndpoint,
-			UseTLS:        cfg.Sync.CloudUseTLS,
-			TLSCAFile:     cfg.Sync.CloudTLSCAFile,
-			TLSServerName: cfg.Sync.CloudTLSServerName,
-			APIKey:        cfg.Sync.APIKey,
-			RefreshBefore: 60 * time.Second,
-		})
-
-		gatewayClient := cloud.NewGatewayClient(cloud.GatewayClientConfig{
-			Endpoint:       cfg.Sync.GatewayEndpoint,
-			UseTLS:         cfg.Sync.CloudUseTLS,
-			TLSCAFile:      cfg.Sync.CloudTLSCAFile,
-			TLSServerName:  cfg.Sync.CloudTLSServerName,
-			RequestTimeout: time.Duration(cfg.Sync.RequestTimeoutSec) * time.Second,
-		}, authClient)
-		// Close gateway client before auth client (LIFO defer order).
-		defer func() {
-			if err := authClient.Close(); err != nil {
-				logger.Printf("[SYNC] Failed to close auth client: %v", err)
-			}
-		}()
-		defer func() {
-			if err := gatewayClient.Close(); err != nil {
-				logger.Printf("[SYNC] Failed to close gateway client: %v", err)
-			}
-		}()
-
-		uploader, err := cloud.NewUploader(gatewayClient, s3Client, cfg.Storage.Bucket, cloud.UploaderConfig{
-			RequestTimeout:  time.Duration(cfg.Sync.RequestTimeoutSec) * time.Second,
-			OSSTimeout:      time.Duration(cfg.Sync.OSSTimeoutSec) * time.Second,
-			PersistRootDir:  cfg.Sync.PersistRootDir,
-			MaxRestartCount: uint32(cfg.Sync.MaxRestartCount), //nolint:gosec // non-negative guaranteed by config.Validate()
-		})
-		if err != nil {
-			logger.Fatalf("[SYNC] Failed to initialise uploader: %v", err)
-		}
-
-		syncWorker = services.NewSyncWorker(db.DB, uploader, s3Client, cfg.Storage.Bucket, services.SyncWorkerConfig{
+	if cfg.Sync.Enabled && cfg.Sync.DPConfigPath != "" && s3Client != nil {
+		syncWorker = services.NewSyncWorker(db.DB, nil, s3Client, cfg.Storage.Bucket, services.SyncWorkerConfig{
 			BatchSize:       cfg.Sync.BatchSize,
 			MaxConcurrent:   cfg.Sync.MaxConcurrent,
 			MaxRetries:      cfg.Sync.MaxRetries,
@@ -166,9 +127,9 @@ func main() {
 		}, &cfg.Sync)
 
 		syncWorker.Start()
-		logger.Printf("[SYNC] Cloud sync worker started: auth=%s gateway=%s auto_scan=%t", cfg.Sync.AuthEndpoint, cfg.Sync.GatewayEndpoint, cfg.Sync.AutoScanEnabled)
+		logger.Printf("[SYNC] Cloud sync worker started: dp_config=%s auto_scan=%t", cfg.Sync.DPConfigPath, cfg.Sync.AutoScanEnabled)
 	} else {
-		logger.Println("[SYNC] Cloud sync disabled (KEYSTONE_SYNC_ENABLED=false or missing endpoints)")
+		logger.Println("[SYNC] Cloud sync disabled (KEYSTONE_SYNC_ENABLED=false, missing KEYSTONE_SYNC_DP_CONFIG, or S3 unavailable)")
 	}
 
 	// Initialize and start HTTP server
diff --git a/docker/.env.example b/docker/.env.example
index 7d1c145..34ec451 100644
--- a/docker/.env.example
+++ b/docker/.env.example
@@ -42,15 +42,7 @@ KEYSTONE_MINIO_USE_SSL=false
 KEYSTONE_SYNC_ENABLED=true
 KEYSTONE_SYNC_BATCH_SIZE=10
 KEYSTONE_SYNC_MAX_RETRIES=5
-KEYSTONE_CLOUD_AUTH_ENDPOINT=127.0.0.1:50051
-KEYSTONE_CLOUD_GATEWAY_ENDPOINT=127.0.0.1:50053
-KEYSTONE_CLOUD_USE_TLS=false
-# Optional: custom CA bundle for TLS verification (PEM).
-# KEYSTONE_CLOUD_TLS_CA_FILE=/etc/ssl/certs/your-ca.pem
-# Optional: override TLS server name (SNI / verification), useful when endpoint is an IP.
-# KEYSTONE_CLOUD_TLS_SERVER_NAME=cloud.example.com
-# API key issued by the data-platform (base64url, no padding).
-KEYSTONE_CLOUD_API_KEY=your-api-key-here
+KEYSTONE_SYNC_DP_CONFIG=~/.archebase/config.json
 KEYSTONE_SYNC_WORKER_INTERVAL=15
 KEYSTONE_SYNC_REQUEST_TIMEOUT=30
 KEYSTONE_SYNC_OSS_TIMEOUT=120
diff --git a/docs/designs/cli-cloud-sync-sidepath.md b/docs/designs/cli-cloud-sync-sidepath.md
new file mode 100644
index 0000000..14a8681
--- /dev/null
+++ b/docs/designs/cli-cloud-sync-sidepath.md
@@ -0,0 +1,497 @@
+<!--
+SPDX-FileCopyrightText: 2026 ArcheBase
+
+SPDX-License-Identifier: MulanPSL-2.0
+-->
+
+# CLI Cloud Sync Sidepath Design
+
+Status: Superseded. This sidepath is no longer implemented in Keystone.
+Native cloud sync now uploads directly with the Go uploader and Data Platform
+device profiles; Keystone no longer registers CLI sync APIs, starts a
+`CLISyncRunner`, reads `KEYSTONE_CLI_SYNC_*` config, or creates
+`cli_sync_runs` migrations.
+
+## 1. Overview
+
+This document defines a sidepath for syncing one Keystone episode to cloud by
+running the data-platform `dp` CLI from Keystone, while keeping the existing
+`SyncWorker -> data-platform DataGateway` flow unchanged.
+
+The sidepath is intended for controlled operations and emergency recovery, not
+as the default production upload path.
+
+Target flow:
+
+```text
+Synapse "CLI sync to cloud" button
+        -> Keystone CLI sync API
+        -> Keystone CLI sync runner
+        -> download MCAP from Keystone MinIO to a temporary local file
+        -> read sidecar JSON and flatten scalar metadata into --tag arguments
+        -> dp --json data upload <temporary-file> --tag ...
+        -> record dp result
+        -> mark the episode cloud_synced on success
+```
+
+The existing cloud sync flow remains:
+
+```text
+Synapse normal sync action
+        -> Keystone SyncWorker queue
+        -> Keystone Go uploader
+        -> data-platform DataGateway
+        -> cloud object storage
+```
+
+## 2. Goals
+
+- Add a Synapse action named `CLI sync to cloud` for a single episode.
+- Keep the current `POST /api/v1/sync/episodes/:id` behavior unchanged.
+- Keep the current `SyncWorker` queue, retry, backoff, and auto-scan behavior
+  unchanged.
+- Upload the episode MCAP through `dp data upload`.
+- Read the episode sidecar JSON and pass scalar metadata through
+  `dp data upload --tag`. Array fields are skipped in the first version so the
+  existing `dp` CLI does not need to change its comma-separated tag parser.
+- Persist CLI run audit data, including `fileId`, `logicalUploadId`, `uploadId`,
+  `objectKey`, command duration, and sanitized error output.
+- On successful CLI upload, update:
+  - `episodes.cloud_synced = TRUE`
+  - `episodes.cloud_synced_at`
+  - `episodes.cloud_mcap_path`
+  - `episodes.cloud_processed = FALSE`
+- On successful CLI upload, append a normal `sync_logs.completed` row so the
+  existing Cloud Sync Center summary can show the episode as synced.
+
+## 3. Non-Goals
+
+- Do not replace `SyncWorker`.
+- Do not make CLI sync the default action.
+- Do not add batch CLI sync in the first version.
+- Do not retry CLI sync automatically.
+- Do not let the existing `SyncWorker` process CLI pending or failed states.
+- Do not upload the sidecar JSON object through the CLI sidepath in the first
+  version. Its scalar content is still required as upload tags for the MCAP
+  object.
+- Do not expose `dp` command output containing secrets to the browser.
+
+## 4. Recommended Architecture
+
+Use a separate `cli_sync_runs` table for pending, in-progress, and failed CLI
+runs. This avoids putting CLI `pending` or `failed` rows into `sync_logs`, which
+would otherwise be visible to the existing `SyncWorker` polling queries.
+
+Only after the CLI upload succeeds should Keystone append a `sync_logs` row with
+`status = 'completed'`. That completed row is terminal and will not be retried
+by the existing worker.
+
+```text
+api request
+  -> insert cli_sync_runs(status='pending')
+  -> background runner claims run
+  -> cli_sync_runs(status='in_progress')
+  -> read sidecar JSON tags
+  -> run dp upload
+  -> success:
+       cli_sync_runs(status='completed', dp ids...)
+       sync_logs(status='completed', destination_path=objectKey...)
+       episodes.cloud_synced = TRUE
+  -> failure:
+       cli_sync_runs(status='failed', sanitized error...)
+       no sync_logs write
+       episodes unchanged
+```
+
+This keeps normal sync history authoritative while still allowing CLI success to
+close the episode's cloud sync state.
+
+## 5. Backend API
+
+### 5.1 Trigger CLI Sync
+
+```http
+POST /api/v1/sync/episodes/:id/cli
+```
+
+Request body:
+
+```json
+{}
+```
+
+Response (`202 Accepted`):
+
+```json
+{
+  "status": "accepted",
+  "episode_id": 123,
+  "run_id": 456,
+  "message": "episode accepted for CLI cloud sync"
+}
+```
+
+Validation:
+
+| Check | Response |
+|---|---|
+| CLI sync feature disabled | `503 Service Unavailable` |
+| invalid episode id | `400 Bad Request` |
+| episode missing or deleted | `404 Not Found` |
+| `qa_status` is neither `approved` nor `inspector_approved` | `400 Bad Request` |
+| `cloud_synced = TRUE` | `409 Conflict` |
+| latest normal sync log is `pending` or `in_progress` | `409 Conflict` |
+| existing CLI run is `pending` or `in_progress` | `409 Conflict` |
+| CLI runner queue is full | `429 Too Many Requests` |
+
+The endpoint must return as soon as the run is queued. It must not hold the
+HTTP request open for the duration of the upload.
+
+### 5.2 Get Latest CLI Sync Run
+
+```http
+GET /api/v1/sync/episodes/:id/cli/status
+```
+
+Response:
+
+```json
+{
+  "id": 456,
+  "episode_id": 123,
+  "status": "in_progress",
+  "file_id": null,
+  "logical_upload_id": null,
+  "upload_id": null,
+  "object_key": null,
+  "file_size": null,
+  "started_at": "2026-06-02T08:10:00Z",
+  "completed_at": null,
+  "error_message": null
+}
+```
+
+The frontend uses this endpoint to show button state while the sidepath is
+running. The normal sync summary remains sourced from `sync_logs`.
+
+## 6. Data Model
+
+### 6.1 New Table
+
+```sql
+CREATE TABLE IF NOT EXISTS cli_sync_runs (
+    id BIGINT AUTO_INCREMENT PRIMARY KEY,
+    episode_id BIGINT NOT NULL,
+    status ENUM('pending', 'in_progress', 'completed', 'failed') NOT NULL DEFAULT 'pending',
+    source_path VARCHAR(1024),
+    temp_path VARCHAR(1024),
+    dp_config_path VARCHAR(1024),
+    file_id VARCHAR(255),
+    logical_upload_id VARCHAR(255),
+    upload_id VARCHAR(255),
+    bucket VARCHAR(255),
+    object_key VARCHAR(1024),
+    file_size BIGINT,
+    oss_object_etag VARCHAR(255),
+    duration_sec INT,
+    error_message TEXT,
+    stdout_json JSON DEFAULT NULL,
+    started_at TIMESTAMP NULL,
+    completed_at TIMESTAMP NULL,
+    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
+    updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
+    INDEX idx_cli_sync_episode (episode_id),
+    INDEX idx_cli_sync_status (status),
+    INDEX idx_cli_sync_created (created_at)
+) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
+```
+
+### 6.2 Why Not Store Pending CLI Runs In `sync_logs`
+
+The existing worker polls latest `sync_logs.status = 'pending'` rows and
+retryable `failed` rows. If CLI pending or failed rows are written to
+`sync_logs`, the normal worker can claim them and run the regular data-gateway
+upload path. That would mix the two channels and violate this design's goal.
+
+For this reason:
+
+- `cli_sync_runs` owns CLI pending, in-progress, and failed states.
+- `sync_logs` receives a completed row only after CLI upload succeeds.
+- `episodes.cloud_synced` is updated only after CLI upload succeeds.
+
+### 6.3 Successful CLI Sync Log Row
+
+On success, insert:
+
+```sql
+INSERT INTO sync_logs (
+    episode_id,
+    source_path,
+    destination_path,
+    status,
+    bytes_transferred,
+    duration_sec,
+    attempt_count,
+    started_at,
+    completed_at
+) VALUES (?, ?, ?, 'completed', ?, ?, 1, ?, ?);
+```
+
+Use `destination_path = dp.objectKey`. Store `dp.fileId` and
+`dp.logicalUploadId` in `cli_sync_runs`.
+
+## 7. CLI Runner
+
+### 7.1 Command Construction
+
+The runner must call `dp` without a shell:
+
+```text
+exec.CommandContext(ctx, dpBin,
+  "--config", dpConfigPath,
+  "--json",
+  "data", "upload", tempFile,
+  "--device", "<robot device id>",
+  "--tag", "episode_id=<episode public id>",
+  "--tag", "keystone_episode_id=<numeric id>",
+  "--tag", "device_id=<robot device id>",
+  "--tag", "sync_channel=keystone_cli",
+  "--tag", "<flattened sidecar key=value>",
+  "--hint", "source=keystone_cli_sync",
+)
+```
+
+Do not build a single shell command string.
+The device id is resolved from the episode workstation robot
+(`robots.device_id`, falling back to `workstations.robot_serial`). The selected
+`dp` config must contain a matching initialized device profile in `devices[]`.
+
+### 7.2 Tags
+
+Required tags:
+
+| Tag | Value |
+|---|---|
+| `episode_id` | `episodes.episode_id` |
+| `keystone_episode_id` | numeric `episodes.id` |
+| `device_id` | `robots.device_id` resolved through the episode workstation |
+| `sync_channel` | `keystone_cli` |
+
+Required sidecar-derived tags:
+
+| Source | Handling |
+|---|---|
+| sidecar JSON scalar fields | Flatten to string key/value pairs and pass as repeated `--tag key=value` arguments |
+| sidecar JSON arrays | Skip in the first version |
+| `topics_summary` | Exclude, matching the existing worker's filtering intent |
+| nested objects | Flatten with dot notation |
+
+Recommended tags:
+
+| Tag | Value |
+|---|---|
+| `task_id` | `episodes.task_id`, when available |
+| `factory_id` | `episodes.factory_id`, when available |
+| `organization_id` | `episodes.organization_id`, when available |
+
+The CLI sidepath uploads only the MCAP file body, but sidecar JSON metadata is
+not optional. Scalar sidecar fields must be included as tags; array fields are
+left out for the first version. If `sidecar_path` is missing, unreadable, or
+malformed, the CLI run should fail before invoking `dp`. This is stricter than
+the current worker's best-effort sidecar handling and prevents cloud objects
+from being created without the metadata required for filtering.
+
+The implementation must enforce a max tag count (`KEYSTONE_CLI_SYNC_MAX_TAGS`) and a max total
+tag size (`KEYSTONE_CLI_SYNC_MAX_TAG_BYTES`) so the CLI command line cannot exceed OS limits.
+
+### 7.3 Temporary File Handling
+
+The runner downloads the MCAP from Keystone MinIO to a temporary file before
+calling `dp`.
+
+Requirements:
+
+- Use a dedicated directory such as `/var/lib/keystone/cli-sync`.
+- Create temporary files with mode `0600`.
+- Delete the temporary file after success or failure unless
+  `KEYSTONE_CLI_SYNC_KEEP_TEMP=true`.
+- Refuse to start if the temp directory is not writable.
+- Check free disk space before download when a disk watermark helper is
+  available.
+
+### 7.4 JSON Output Parsing
+
+Expected `dp --json data upload` fields:
+
+```json
+{
+  "logicalUploadId": "logical-1",
+  "fileId": "file-1",
+  "bucket": "bucket-a",
+  "objectKey": "objects/file-1.mcap",
+  "fileSize": 123456789,
+  "ossObjectEtag": "etag",
+  "identity": "api-key",
+  "deviceId": null
+}
+```
+
+The runner must validate that `fileId`, `logicalUploadId`, `objectKey`, and
+`fileSize` are present before marking the run completed.
+
+## 8. Configuration
+
+Add a separate config group rather than reusing `SyncConfig`.
+
+| Environment variable | Default | Description |
+|---|---|---|
+| `KEYSTONE_CLI_SYNC_ENABLED` | `false` | Enables the sidepath API and runner |
+| `KEYSTONE_CLI_SYNC_DP_BIN` | `dp` | Path or binary name for the data-platform CLI |
+| `KEYSTONE_CLI_SYNC_DP_CONFIG` | empty | SDK config JSON path passed to `dp --config` |
+| `KEYSTONE_CLI_SYNC_TEMP_DIR` | `/var/lib/keystone/cli-sync` | Temporary MCAP staging directory |
+| `KEYSTONE_CLI_SYNC_MAX_CONCURRENT` | `1` | Max concurrent CLI uploads |
+| `KEYSTONE_CLI_SYNC_QUEUE_SIZE` | `16` | Max queued CLI runs |
+| `KEYSTONE_CLI_SYNC_TIMEOUT_SEC` | `7200` | Per-run timeout |
+| `KEYSTONE_CLI_SYNC_KEEP_TEMP` | `false` | Keeps staged files for debugging |
+| `KEYSTONE_CLI_SYNC_MAX_TAGS` | `128` | Max tags passed to CLI |
+| `KEYSTONE_CLI_SYNC_MAX_TAG_BYTES` | `65536` | Max total encoded tag bytes |
+
+Startup validation when enabled:
+
+- `dp` binary exists and is executable.
+- `KEYSTONE_CLI_SYNC_DP_CONFIG` is set and readable.
+- Temp directory exists or can be created.
+- Temp directory is writable.
+
+## 9. Frontend Behavior
+
+### 9.1 Cloud Sync Center
+
+Add a row action next to existing `Retry` and `History` actions:
+
+```text
+CLI sync to cloud
+```
+
+Show it only when the feature flag from config/status says CLI sync is enabled.
+
+Disable it when:
+
+- the row status is `pending` or `in_progress`;
+- the row status is `completed`;
+- the episode has an active CLI run;
+- a row action is already running;
+- the user does not have admin permission.
+
+After clicking:
+
+1. Call `POST /api/v1/sync/episodes/:id/cli`.
+2. Show the row as `CLI queued` or `CLI syncing` using the CLI status endpoint.
+3. Poll `GET /api/v1/sync/episodes/:id/cli/status`.
+4. On CLI completion, refresh normal sync summaries.
+5. On CLI failure, keep the normal sync row unchanged and show the sanitized CLI
+   error.
+
+### 9.2 Episode Detail
+
+Add the same action for approved, unsynced episodes. This is important because
+an approved unsynced episode may not yet have any `sync_logs` row and therefore
+may not appear in the Cloud Sync Center table.
+
+## 10. Security
+
+- The trigger API must require admin authorization.
+- `dp` must be launched through `exec.CommandContext`, never through a shell.
+- Do not pass API keys on the command line.
+- Store credentials only in the `dp` config file with restrictive permissions.
+- Redact stdout, stderr, paths, and errors before returning anything to the
+  frontend.
+- Do not log full `dp` config contents.
+- Do not log temporary object storage credentials or presigned URLs.
+- Limit concurrent CLI runs to protect Keystone CPU, disk, and network.
+
+## 11. Concurrency And Races
+
+Keystone should prevent multiple active CLI runs for the same episode by checking
+`cli_sync_runs.status IN ('pending', 'in_progress')` inside a transaction.
+
+Before marking success, lock the `episodes` row and re-check `cloud_synced`.
+
+If the normal SyncWorker synced the episode while the CLI run was uploading:
+
+- mark the CLI run as completed with its `dp` result;
+- do not overwrite `episodes.cloud_mcap_path`;
+- do not insert a second `sync_logs.completed` row unless product explicitly
+  wants duplicate completed history;
+- include a `duplicate_after_upload` marker in `cli_sync_runs.stdout_json` or a
+  dedicated metadata field if one is added later.
+
+Residual risk: if `dp` upload succeeds but Keystone crashes before recording the
+result, a later manual CLI retry can upload a duplicate object. This is accepted
+for the sidepath's emergency-use scope. A future implementation can reduce this
+by adding a data-platform idempotency key or a server-side upload lookup by
+`episode_id`.
+
+## 12. Rollout Plan
+
+1. Add `cli_sync_runs` migration and model helpers.
+2. Add CLI sync config with default disabled.
+3. Add the backend runner with a fake `dp` executable test fixture.
+4. Add `POST /api/v1/sync/episodes/:id/cli` and the latest-status endpoint `GET /api/v1/sync/episodes/:id/cli/status`.
+5. Add Synapse API wrapper methods.
+6. Add Episode Detail button.
+7. Add Cloud Sync Center row button and CLI status overlay.
+8. Enable only in a staging environment.
+9. Run one approved small MCAP through CLI sync and verify:
+   - data-platform object is visible;
+   - expected sidecar JSON scalar fields are visible as data-platform raw tags;
+   - `cli_sync_runs` contains `fileId` and `logicalUploadId`;
+   - `sync_logs` has a completed row;
+   - `episodes.cloud_synced = TRUE`;
+   - normal SyncWorker does not retry the episode.
+
+## 13. Test Plan
+
+Backend unit tests:
+
+- rejects disabled feature;
+- rejects non-approved episodes;
+- rejects already cloud-synced episodes;
+- rejects active normal sync rows;
+- rejects active CLI runs;
+- fails when sidecar JSON is missing, unreadable, or malformed;
+- passes flattened sidecar JSON scalar fields as repeated `--tag` arguments;
+- builds `dp` argv without a shell;
+- parses valid `dp --json` output;
+- rejects missing `fileId`, `logicalUploadId`, `objectKey`, or `fileSize`;
+- redacts stderr before API response;
+- records failed CLI runs without writing `sync_logs`;
+- records successful CLI runs and inserts one completed `sync_logs` row.
+
+Backend integration tests:
+
+- fake MinIO object is staged to temp file;
+- fake `dp` executable receives the expected args;
+- temp file is deleted after success and failure;
+- success updates `episodes.cloud_synced`;
+- normal sync summary sees the completed row after success.
+
+Frontend tests:
+
+- button is hidden when CLI sync config is disabled;
+- button is disabled for completed, pending, and in-progress rows;
+- click calls `triggerEpisodeCli`;
+- active CLI status changes row action text;
+- completed CLI run refreshes normal summaries;
+- failed CLI run shows sanitized error and leaves normal row state unchanged.
+
+## 14. Open Questions
+
+- Should CLI failures appear in the Cloud Sync Center main table, or only as a
+  per-episode CLI status/badge?
+- Should a successful CLI sync always append `sync_logs.completed`, even when
+  the latest normal row is already completed by a race?
+- Does data-platform need an explicit idempotency key for `dp data upload` so
+  crash-after-upload can be recovered without duplicate objects?
+- Should the `dp` config use a site API key or a device profile for the Keystone
+  edge site?
diff --git a/docs/designs/cli-cloud-sync-sidepath.zh.html b/docs/designs/cli-cloud-sync-sidepath.zh.html
new file mode 100644
index 0000000..fad753d
--- /dev/null
+++ b/docs/designs/cli-cloud-sync-sidepath.zh.html
@@ -0,0 +1,834 @@
+<!--
+SPDX-FileCopyrightText: 2026 ArcheBase
+
+SPDX-License-Identifier: MulanPSL-2.0
+-->
+<!doctype html>
+<html lang="zh-CN">
+<head>
+  <meta charset="utf-8">
+  <meta name="viewport" content="width=device-width, initial-scale=1">
+  <title>CLI 同步到云旁路设计</title>
+  <style>
+    :root {
+      color-scheme: light;
+      --ink: #17231f;
+      --muted: #5c6b64;
+      --line: #d7e2dc;
+      --paper: #ffffff;
+      --surface: #f6faf8;
+      --surface-strong: #eaf4ef;
+      --accent: #126b5e;
+      --accent-strong: #0a4e46;
+      --blue: #205f8f;
+      --warn: #955914;
+      --danger: #a63a32;
+      --success: #1f7a4f;
+      --code-bg: #13221e;
+      --code-ink: #eef8f3;
+      --shadow: 0 20px 48px rgba(21, 42, 34, 0.09);
+    }
+
+    * {
+      box-sizing: border-box;
+    }
+
+    html {
+      scroll-behavior: smooth;
+    }
+
+    body {
+      margin: 0;
+      color: var(--ink);
+      background:
+        linear-gradient(135deg, rgba(18, 107, 94, 0.12), transparent 34%),
+        linear-gradient(180deg, #f8fbfa 0%, #ffffff 46%);
+      font: 16px/1.72 "Noto Sans SC", "Source Han Sans SC", "Microsoft YaHei", sans-serif;
+    }
+
+    main {
+      max-width: 1180px;
+      margin: 0 auto;
+      padding: 48px 24px 84px;
+    }
+
+    header {
+      display: grid;
+      grid-template-columns: minmax(0, 1fr) 320px;
+      gap: 30px;
+      align-items: end;
+      padding: 34px;
+      border: 1px solid var(--line);
+      border-radius: 18px;
+      background: rgba(255, 255, 255, 0.9);
+      box-shadow: var(--shadow);
+      backdrop-filter: blur(8px);
+    }
+
+    .eyebrow {
+      margin: 0 0 10px;
+      color: var(--accent-strong);
+      font-size: 13px;
+      font-weight: 700;
+      letter-spacing: 0;
+      text-transform: uppercase;
+    }
+
+    h1 {
+      margin: 0;
+      font-size: clamp(30px, 5vw, 48px);
+      line-height: 1.12;
+      letter-spacing: 0;
+    }
+
+    .lead {
+      max-width: 820px;
+      margin: 18px 0 0;
+      color: var(--muted);
+      font-size: 18px;
+    }
+
+    .header-meta {
+      border-left: 4px solid var(--accent);
+      padding-left: 16px;
+      color: var(--muted);
+      font-size: 14px;
+    }
+
+    .header-meta strong {
+      display: block;
+      margin-bottom: 8px;
+      color: var(--ink);
+      font-size: 16px;
+    }
+
+    nav {
+      display: flex;
+      flex-wrap: wrap;
+      gap: 10px;
+      margin: 24px 0 4px;
+    }
+
+    nav a {
+      color: var(--accent-strong);
+      background: var(--surface-strong);
+      border: 1px solid #c8ded4;
+      border-radius: 999px;
+      padding: 8px 12px;
+      text-decoration: none;
+      font-size: 14px;
+    }
+
+    section {
+      margin-top: 38px;
+    }
+
+    h2 {
+      margin: 0 0 14px;
+      font-size: 26px;
+      line-height: 1.25;
+      letter-spacing: 0;
+    }
+
+    h3 {
+      margin: 24px 0 10px;
+      font-size: 19px;
+      line-height: 1.35;
+    }
+
+    p {
+      margin: 10px 0;
+    }
+
+    ul,
+    ol {
+      padding-left: 24px;
+    }
+
+    li {
+      margin: 6px 0;
+    }
+
+    code {
+      border-radius: 5px;
+      background: #edf4ef;
+      padding: 2px 5px;
+      color: #123c35;
+      font-family: "JetBrains Mono", "SFMono-Regular", Consolas, monospace;
+      font-size: 0.92em;
+    }
+
+    pre {
+      overflow: auto;
+      margin: 14px 0;
+      border-radius: 12px;
+      background: var(--code-bg);
+      color: var(--code-ink);
+      padding: 18px;
+      font: 14px/1.65 "JetBrains Mono", "SFMono-Regular", Consolas, monospace;
+    }
+
+    pre code {
+      background: transparent;
+      color: inherit;
+      padding: 0;
+    }
+
+    table {
+      width: 100%;
+      border-collapse: collapse;
+      margin: 14px 0 22px;
+      overflow: hidden;
+      border: 1px solid var(--line);
+      border-radius: 12px;
+      background: var(--paper);
+    }
+
+    th,
+    td {
+      border-bottom: 1px solid var(--line);
+      padding: 12px 14px;
+      text-align: left;
+      vertical-align: top;
+    }
+
+    th {
+      color: var(--accent-strong);
+      background: var(--surface-strong);
+      font-weight: 700;
+    }
+
+    tr:last-child td {
+      border-bottom: 0;
+    }
+
+    .panel {
+      border: 1px solid var(--line);
+      border-radius: 16px;
+      background: rgba(255, 255, 255, 0.92);
+      box-shadow: var(--shadow);
+      padding: 24px;
+    }
+
+    .split {
+      display: grid;
+      grid-template-columns: repeat(2, minmax(0, 1fr));
+      gap: 18px;
+    }
+
+    .cards {
+      display: grid;
+      grid-template-columns: repeat(3, minmax(0, 1fr));
+      gap: 14px;
+      margin-top: 16px;
+    }
+
+    .card {
+      border: 1px solid var(--line);
+      border-radius: 14px;
+      background: var(--paper);
+      padding: 18px;
+    }
+
+    .card strong {
+      display: block;
+      margin-bottom: 8px;
+      color: var(--ink);
+      font-size: 17px;
+    }
+
+    .card p {
+      margin: 0;
+      color: var(--muted);
+      font-size: 14px;
+    }
+
+    .callout {
+      margin: 16px 0;
+      border-left: 5px solid var(--accent);
+      border-radius: 12px;
+      background: var(--surface);
+      padding: 16px 18px;
+    }
+
+    .callout.warn {
+      border-left-color: var(--warn);
+      background: #fff8ed;
+    }
+
+    .callout.danger {
+      border-left-color: var(--danger);
+      background: #fff4f2;
+    }
+
+    .callout.success {
+      border-left-color: var(--success);
+      background: #f0faf4;
+    }
+
+    .badge-row {
+      display: flex;
+      flex-wrap: wrap;
+      gap: 10px;
+      margin-top: 16px;
+    }
+
+    .badge {
+      display: inline-flex;
+      align-items: center;
+      min-height: 34px;
+      border: 1px solid #c8ded4;
+      border-radius: 999px;
+      background: var(--surface-strong);
+      padding: 6px 12px;
+      color: var(--accent-strong);
+      font-size: 14px;
+      font-weight: 700;
+    }
+
+    .flow {
+      display: grid;
+      gap: 12px;
+      margin: 16px 0 4px;
+    }
+
+    .flow-step {
+      position: relative;
+      border: 1px solid var(--line);
+      border-radius: 14px;
+      background: var(--paper);
+      padding: 14px 16px;
+    }
+
+    .flow-step strong {
+      display: block;
+      margin-bottom: 4px;
+    }
+
+    .flow-step span {
+      color: var(--muted);
+      font-size: 14px;
+    }
+
+    .sequence {
+      display: grid;
+      grid-template-columns: repeat(5, minmax(0, 1fr));
+      gap: 10px;
+      margin: 16px 0;
+    }
+
+    .sequence .node {
+      border: 1px solid var(--line);
+      border-radius: 14px;
+      background: var(--paper);
+      padding: 14px;
+      min-height: 92px;
+    }
+
+    .node b {
+      display: block;
+      color: var(--accent-strong);
+      font-size: 14px;
+      margin-bottom: 8px;
+    }
+
+    .node span {
+      color: var(--muted);
+      font-size: 13px;
+    }
+
+    .two-column-list {
+      columns: 2;
+      column-gap: 34px;
+    }
+
+    .subtle {
+      color: var(--muted);
+    }
+
+    .status {
+      display: inline-block;
+      border-radius: 999px;
+      padding: 4px 9px;
+      font-size: 13px;
+      font-weight: 700;
+    }
+
+    .status.ok {
+      color: #0d5939;
+      background: #e6f5eb;
+    }
+
+    .status.warn {
+      color: #7a430d;
+      background: #fff2dc;
+    }
+
+    .status.stop {
+      color: #84251f;
+      background: #ffe7e4;
+    }
+
+    @media (max-width: 860px) {
+      main {
+        padding: 28px 16px 64px;
+      }
+
+      header,
+      .split,
+      .cards,
+      .sequence {
+        grid-template-columns: 1fr;
+      }
+
+      header {
+        padding: 24px;
+      }
+
+      .two-column-list {
+        columns: 1;
+      }
+    }
+
+    @media print {
+      body {
+        background: #fff;
+      }
+
+      nav {
+        display: none;
+      }
+
+      .panel,
+      header {
+        box-shadow: none;
+      }
+    }
+  </style>
+</head>
+<body>
+  <main>
+    <header>
+      <div>
+        <p class="eyebrow">Keystone / Synapse Design</p>
+        <h1>CLI 同步到云旁路设计</h1>
+        <p class="lead">在不改动现有 Keystone 云同步主链路的前提下，新增一个由 Synapse 触发、Keystone 后台执行 <code>dp data upload</code> 的单片段应急同步入口。首版只上传 MCAP 文件本体，但必须读取 sidecar JSON，并把其中标量元数据作为 <code>--tag</code> 传给 data-platform；数组字段先跳过，<code>dp</code> 本身不需要改。CLI 上传成功后回写 Keystone 云同步状态，并保留 data-platform 返回的审计 ID。</p>
+        <div class="badge-row">
+          <span class="badge">方案 2：成功后回写 episode</span>
+          <span class="badge">现有 SyncWorker 不变</span>
+          <span class="badge">默认关闭，按环境启用</span>
+        </div>
+      </div>
+      <div class="header-meta">
+        <strong>文档状态</strong>
+        <div>状态：已废弃；当前实现改为 Keystone Go uploader 原生 direct sync，不再实现 CLI sync API、CLISyncRunner、KEYSTONE_CLI_SYNC_* 配置或 cli_sync_runs 迁移。</div>
+        <div>用途：实现设计 / 评审</div>
+        <div>范围：Keystone 后端、Synapse 前端；data-platform CLI 只作为外部命令调用</div>
+        <div>日期：2026-06-02</div>
+      </div>
+    </header>
+
+    <nav aria-label="目录">
+      <a href="#decision">结论</a>
+      <a href="#flow">流程</a>
+      <a href="#api">接口</a>
+      <a href="#data-model">数据模型</a>
+      <a href="#runner">CLI Runner</a>
+      <a href="#frontend">前端交互</a>
+      <a href="#security">安全与并发</a>
+      <a href="#rollout">落地计划</a>
+    </nav>
+
+    <section id="decision" class="panel">
+      <h2>1. 设计结论</h2>
+      <p>推荐新增独立的 CLI 同步旁路，而不是让前端直接调用 CLI，也不是把 CLI pending/failed 状态写入现有 <code>sync_logs</code>。核心原则是：正常同步继续归 <code>SyncWorker</code> 管，CLI 同步只作为手动应急通道。</p>
+
+      <div class="cards">
+        <div class="card">
+          <strong>主链路不动</strong>
+          <p><code>POST /sync/episodes/:id</code>、自动扫描、重试和 backoff 都保持现状。</p>
+        </div>
+        <div class="card">
+          <strong>旁路独立记账</strong>
+          <p>用新表 <code>cli_sync_runs</code> 记录 CLI 的 pending、in_progress、failed 和 completed。</p>
+        </div>
+        <div class="card">
+          <strong>成功后闭环</strong>
+          <p>CLI 成功后写 <code>episodes.cloud_synced</code>，并追加一条 <code>sync_logs.completed</code>。</p>
+        </div>
+      </div>
+
+      <div class="callout warn">
+        <strong>关键约束：</strong>不要把 CLI 的 pending 或 failed 行写进 <code>sync_logs</code>。现有 worker 会扫描最新 pending 和可重试 failed 行，如果 CLI 行进入这张表，可能被正常同步 worker 误认领。
+      </div>
+    </section>
+
+    <section id="flow">
+      <h2>2. 目标流程</h2>
+      <div class="split">
+        <div class="panel">
+          <h3>新增 CLI 旁路</h3>
+          <div class="flow">
+            <div class="flow-step"><strong>Synapse 按钮</strong><span>管理员点击「CLI 同步到云」。</span></div>
+            <div class="flow-step"><strong>Keystone API</strong><span>创建 <code>cli_sync_runs.pending</code> 并返回 <code>202 Accepted</code>。</span></div>
+            <div class="flow-step"><strong>Keystone Runner</strong><span>从 MinIO 下载 MCAP 到临时文件，并读取 sidecar JSON。</span></div>
+            <div class="flow-step"><strong>dp CLI</strong><span>执行 <code>dp --json data upload</code>，将 sidecar 标量元数据作为重复 <code>--tag</code> 参数传入。</span></div>
+            <div class="flow-step"><strong>状态回写</strong><span>写入 CLI 审计数据、<code>sync_logs.completed</code> 和 episode 云同步字段。</span></div>
+          </div>
+        </div>
+
+        <div class="panel">
+          <h3>现有正常同步</h3>
+          <div class="flow">
+            <div class="flow-step"><strong>Synapse 正常同步</strong><span>仍调用 <code>POST /api/v1/sync/episodes/:id</code>。</span></div>
+            <div class="flow-step"><strong>SyncWorker 队列</strong><span>负责 pending、in_progress、failed 和重试。</span></div>
+            <div class="flow-step"><strong>Go Uploader</strong><span>通过 data-platform DataGateway 和 OSS 上传。</span></div>
+            <div class="flow-step"><strong>完成态</strong><span>更新 <code>sync_logs</code> 和 <code>episodes</code>。</span></div>
+          </div>
+          <div class="callout success">
+            两条链路最后都可以把 episode 标记为已同步，但只有正常链路参与自动发现和自动重试。
+          </div>
+        </div>
+      </div>
+    </section>
+
+    <section id="scope" class="panel">
+      <h2>3. 目标与非目标</h2>
+      <div class="split">
+        <div>
+          <h3>目标</h3>
+          <ul>
+            <li>增加单 episode 的「CLI 同步到云」动作。</li>
+            <li>上传 MCAP 文件到 data-platform 云端对象存储。</li>
+            <li>读取 sidecar JSON，并把标量字段作为 <code>dp data upload --tag</code> 传递；数组字段首版先跳过。</li>
+            <li>保存 <code>fileId</code>、<code>logicalUploadId</code>、<code>uploadId</code>、<code>objectKey</code> 等审计信息。</li>
+            <li>成功后更新 <code>episodes.cloud_synced</code>、<code>cloud_synced_at</code>、<code>cloud_mcap_path</code>、<code>cloud_processed</code>。</li>
+            <li>成功后插入一条 <code>sync_logs.completed</code>，让现有 Cloud Sync Center 能看到完成态。</li>
+          </ul>
+        </div>
+        <div>
+          <h3>非目标</h3>
+          <ul>
+            <li>不替换 <code>SyncWorker</code>。</li>
+            <li>不提供批量 CLI 同步。</li>
+            <li>不自动重试 CLI 失败任务。</li>
+            <li>不把 CLI 失败任务混入正常同步主表。</li>
+            <li>首版不上传 sidecar JSON 文件本体；但它的标量内容必须作为 MCAP 上传 tags。</li>
+            <li>不把包含敏感信息的 CLI 输出返回给浏览器。</li>
+          </ul>
+        </div>
+      </div>
+    </section>
+
+    <section id="api" class="panel">
+      <h2>4. 后端接口</h2>
+      <h3>4.1 触发 CLI 同步</h3>
+      <pre><code>POST /api/v1/sync/episodes/:id/cli</code></pre>
+      <p>请求体为空对象即可。请求被接受时返回 <code>202 Accepted</code>，响应体示例：</p>
+      <pre><code>{
+  "status": "accepted",
+  "episode_id": 123,
+  "run_id": 456,
+  "message": "episode accepted for CLI cloud sync"
+}</code></pre>
+
+      <table>
+        <thead>
+          <tr><th>校验项</th><th>失败响应</th></tr>
+        </thead>
+        <tbody>
+          <tr><td>CLI 同步功能未启用</td><td><code>503 Service Unavailable</code></td></tr>
+          <tr><td>episode id 非法</td><td><code>400 Bad Request</code></td></tr>
+          <tr><td>episode 不存在或已删除</td><td><code>404 Not Found</code></td></tr>
+          <tr><td><code>qa_status</code> 不是 <code>approved</code> 或 <code>inspector_approved</code></td><td><code>400 Bad Request</code></td></tr>
+          <tr><td><code>cloud_synced = TRUE</code></td><td><code>409 Conflict</code></td></tr>
+          <tr><td>正常同步最新状态为 <code>pending</code> 或 <code>in_progress</code></td><td><code>409 Conflict</code></td></tr>
+          <tr><td>已有 CLI run 为 <code>pending</code> 或 <code>in_progress</code></td><td><code>409 Conflict</code></td></tr>
+          <tr><td>CLI runner 队列已满</td><td><code>429 Too Many Requests</code></td></tr>
+        </tbody>
+      </table>
+
+      <h3>4.2 查询最新 CLI 状态</h3>
+      <pre><code>GET /api/v1/sync/episodes/:id/cli/status</code></pre>
+      <pre><code>{
+  "id": 456,
+  "episode_id": 123,
+  "status": "in_progress",
+  "file_id": null,
+  "logical_upload_id": null,
+  "upload_id": null,
+  "object_key": null,
+  "file_size": null,
+  "started_at": "2026-06-02T08:10:00Z",
+  "completed_at": null,
+  "error_message": null
+}</code></pre>
+    </section>
+
+    <section id="data-model" class="panel">
+      <h2>5. 数据模型</h2>
+      <p>新增 <code>cli_sync_runs</code>，专门承载 CLI 旁路生命周期。正常同步的 <code>sync_logs</code> 只在 CLI 成功后接收一条 completed 审计行。</p>
+
+      <h3>5.1 CLI run 表</h3>
+      <pre><code>CREATE TABLE IF NOT EXISTS cli_sync_runs (
+    id BIGINT AUTO_INCREMENT PRIMARY KEY,
+    episode_id BIGINT NOT NULL,
+    status ENUM('pending', 'in_progress', 'completed', 'failed') NOT NULL DEFAULT 'pending',
+    source_path VARCHAR(1024),
+    temp_path VARCHAR(1024),
+    dp_config_path VARCHAR(1024),
+    file_id VARCHAR(255),
+    logical_upload_id VARCHAR(255),
+    upload_id VARCHAR(255),
+    bucket VARCHAR(255),
+    object_key VARCHAR(1024),
+    file_size BIGINT,
+    oss_object_etag VARCHAR(255),
+    duration_sec INT,
+    error_message TEXT,
+    stdout_json JSON DEFAULT NULL,
+    started_at TIMESTAMP NULL,
+    completed_at TIMESTAMP NULL,
+    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
+    updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
+    INDEX idx_cli_sync_episode (episode_id),
+    INDEX idx_cli_sync_status (status),
+    INDEX idx_cli_sync_created (created_at)
+) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;</code></pre>
+
+      <h3>5.2 成功后的 normal sync log</h3>
+      <pre><code>INSERT INTO sync_logs (
+    episode_id,
+    source_path,
+    destination_path,
+    status,
+    bytes_transferred,
+    duration_sec,
+    attempt_count,
+    started_at,
+    completed_at
+) VALUES (?, ?, ?, 'completed', ?, ?, 1, ?, ?);</code></pre>
+
+      <div class="sequence" aria-label="状态序列">
+        <div class="node"><b>1. pending</b><span>API 接受请求，写入 CLI 独立表。</span></div>
+        <div class="node"><b>2. in_progress</b><span>runner 已 claim，正在 staging 或上传。</span></div>
+        <div class="node"><b>3A. failed</b><span>只更新 CLI 表，episode 不变。</span></div>
+        <div class="node"><b>3B. completed</b><span>CLI 表记录 dp 返回 ID。</span></div>
+        <div class="node"><b>4. synced</b><span>写 <code>sync_logs.completed</code> 与 episode 云同步字段。</span></div>
+      </div>
+    </section>
+
+    <section id="runner" class="panel">
+      <h2>6. CLI Runner</h2>
+      <h3>6.1 命令构造</h3>
+      <p>必须使用 <code>exec.CommandContext</code> 参数数组调用，不能拼 shell 字符串。</p>
+      <pre><code>exec.CommandContext(ctx, dpBin,
+  "--config", dpConfigPath,
+  "--json",
+  "data", "upload", tempFile,
+  "--device", "&lt;robot device id&gt;",
+  "--tag", "episode_id=&lt;episode public id&gt;",
+  "--tag", "keystone_episode_id=&lt;numeric id&gt;",
+  "--tag", "device_id=&lt;robot device id&gt;",
+  "--tag", "sync_channel=keystone_cli",
+  "--tag", "&lt;flattened sidecar key=value&gt;",
+  "--hint", "source=keystone_cli_sync",
+)</code></pre>
+      <p>设备 ID 通过 episode 对应工位的机器人解析，优先使用 <code>robots.device_id</code>，回退到 <code>workstations.robot_serial</code>。所选 <code>dp</code> config 的 <code>devices[]</code> 中必须已有这个 device profile。</p>
+
+      <h3>6.2 标签</h3>
+      <table>
+        <thead>
+          <tr><th>标签</th><th>来源</th><th>要求</th></tr>
+        </thead>
+        <tbody>
+          <tr><td><code>episode_id</code></td><td><code>episodes.episode_id</code></td><td>必填</td></tr>
+          <tr><td><code>keystone_episode_id</code></td><td><code>episodes.id</code></td><td>必填</td></tr>
+          <tr><td><code>device_id</code></td><td>episode 工位对应的 <code>robots.device_id</code></td><td>必填，同时作为 <code>--device</code> 参数</td></tr>
+          <tr><td><code>sync_channel</code></td><td>固定 <code>keystone_cli</code></td><td>必填</td></tr>
+          <tr><td>sidecar JSON 标量字段</td><td><code>episodes.sidecar_path</code> 指向的 JSON</td><td>必填，扁平化后作为重复 <code>--tag</code></td></tr>
+          <tr><td>sidecar JSON 数组字段</td><td>例如 topic 列表、skills</td><td>首版跳过，不传给 CLI</td></tr>
+          <tr><td><code>topics_summary</code></td><td>sidecar JSON</td><td>排除，避免 tag 过大</td></tr>
+          <tr><td><code>task_id</code></td><td><code>episodes.task_id</code></td><td>可选</td></tr>
+          <tr><td><code>factory_id</code></td><td><code>episodes.factory_id</code></td><td>可选</td></tr>
+          <tr><td><code>organization_id</code></td><td><code>episodes.organization_id</code></td><td>可选</td></tr>
+        </tbody>
+      </table>
+      <div class="callout warn">
+        CLI 首版只上传 MCAP 对象，不上传 sidecar JSON 对象。但 sidecar JSON 元数据不是可选项：标量字段必须作为 tag 传入，数组字段首版先跳过；如果 <code>sidecar_path</code> 缺失、对象读不到或 JSON 解析失败，本次 CLI run 应在调用 <code>dp</code> 前失败，避免云端产生缺少关键过滤标签的对象。
+      </div>
+      <p>sidecar 字段扁平化应与现有同步 worker 的扁平化语义保持一致：普通字段转成字符串 key/value，嵌套对象用点号展开；数组字段首版跳过。同时必须受 <code>KEYSTONE_CLI_SYNC_MAX_TAGS</code> 和 <code>KEYSTONE_CLI_SYNC_MAX_TAG_BYTES</code> 限制。</p>
+
+      <h3>6.3 临时文件</h3>
+      <ul class="two-column-list">
+        <li>默认目录：<code>/var/lib/keystone/cli-sync</code>。</li>
+        <li>临时文件权限：<code>0600</code>。</li>
+        <li>成功或失败后删除临时文件。</li>
+        <li><code>KEYSTONE_CLI_SYNC_KEEP_TEMP=true</code> 时保留临时文件用于排障。</li>
+        <li>启动时校验目录可写。</li>
+        <li>可用时检查磁盘水位。</li>
+      </ul>
+
+      <h3>6.4 dp JSON 输出</h3>
+      <pre><code>{
+  "logicalUploadId": "logical-1",
+  "fileId": "file-1",
+  "bucket": "bucket-a",
+  "objectKey": "objects/file-1.mcap",
+  "fileSize": 123456789,
+  "ossObjectEtag": "etag",
+  "identity": "api-key",
+  "deviceId": null
+}</code></pre>
+      <p>标记成功前必须校验 <code>fileId</code>、<code>logicalUploadId</code>、<code>objectKey</code> 和 <code>fileSize</code> 非空且合法。</p>
+    </section>
+
+    <section id="config" class="panel">
+      <h2>7. 配置</h2>
+      <table>
+        <thead>
+          <tr><th>环境变量</th><th>默认值</th><th>说明</th></tr>
+        </thead>
+        <tbody>
+          <tr><td><code>KEYSTONE_CLI_SYNC_ENABLED</code></td><td><code>false</code></td><td>启用旁路 API 和 runner。</td></tr>
+          <tr><td><code>KEYSTONE_CLI_SYNC_DP_BIN</code></td><td><code>dp</code></td><td>data-platform CLI 二进制路径或名称。</td></tr>
+          <tr><td><code>KEYSTONE_CLI_SYNC_DP_CONFIG</code></td><td>空</td><td>传给 <code>dp --config</code> 的 SDK 配置文件。</td></tr>
+          <tr><td><code>KEYSTONE_CLI_SYNC_TEMP_DIR</code></td><td><code>/var/lib/keystone/cli-sync</code></td><td>MCAP staging 目录。</td></tr>
+          <tr><td><code>KEYSTONE_CLI_SYNC_MAX_CONCURRENT</code></td><td><code>1</code></td><td>最大并发 CLI 上传数。</td></tr>
+          <tr><td><code>KEYSTONE_CLI_SYNC_QUEUE_SIZE</code></td><td><code>16</code></td><td>最大排队 run 数。</td></tr>
+          <tr><td><code>KEYSTONE_CLI_SYNC_TIMEOUT_SEC</code></td><td><code>7200</code></td><td>单次 CLI run 超时时间。</td></tr>
+          <tr><td><code>KEYSTONE_CLI_SYNC_KEEP_TEMP</code></td><td><code>false</code></td><td>是否保留临时文件。</td></tr>
+          <tr><td><code>KEYSTONE_CLI_SYNC_MAX_TAGS</code></td><td><code>128</code></td><td>传给 CLI 的最大 tag 数。</td></tr>
+          <tr><td><code>KEYSTONE_CLI_SYNC_MAX_TAG_BYTES</code></td><td><code>65536</code></td><td>编码后 tag 总字节上限。</td></tr>
+        </tbody>
+      </table>
+      <div class="callout">
+        启用时启动校验：<code>dp</code> 可执行、<code>dp</code> 配置文件可读、临时目录可创建且可写。
+      </div>
+    </section>
+
+    <section id="frontend" class="panel">
+      <h2>8. 前端交互</h2>
+      <h3>8.1 Cloud Sync Center</h3>
+      <p>在现有「重试」「历史」旁边增加一个行级动作：</p>
+      <pre><code>CLI 同步到云</code></pre>
+      <p>仅在后端配置显示 CLI sync enabled 时展示。以下情况禁用：</p>
+      <ul>
+        <li>正常同步状态是 <code>pending</code> 或 <code>in_progress</code>。</li>
+        <li>正常同步状态是 <code>completed</code>。</li>
+        <li>当前 episode 已有 active CLI run。</li>
+        <li>当前行已有操作在提交。</li>
+        <li>当前用户不是 admin。</li>
+      </ul>
+
+      <h3>8.2 Episode Detail</h3>
+      <p>Episode 详情页也需要同一个动作。原因是 approved 但还没有任何 <code>sync_logs</code> 的 episode 可能不会出现在 Cloud Sync Center 列表里。</p>
+
+      <h3>8.3 状态展示</h3>
+      <table>
+        <thead>
+          <tr><th>CLI 状态</th><th>按钮文案</th><th>页面行为</th></tr>
+        </thead>
+        <tbody>
+          <tr><td><span class="status warn">pending</span></td><td>CLI 已入队</td><td>轮询 CLI status。</td></tr>
+          <tr><td><span class="status warn">in_progress</span></td><td>CLI 同步中</td><td>禁用重复点击。</td></tr>
+          <tr><td><span class="status ok">completed</span></td><td>CLI 已完成</td><td>刷新正常同步 summary。</td></tr>
+          <tr><td><span class="status stop">failed</span></td><td>CLI 同步失败</td><td>显示脱敏错误，正常同步行不变。</td></tr>
+        </tbody>
+      </table>
+    </section>
+
+    <section id="security" class="panel">
+      <h2>9. 安全、并发与竞态</h2>
+      <div class="split">
+        <div>
+          <h3>安全要求</h3>
+          <ul>
+            <li>触发 API 必须要求 admin 权限。</li>
+            <li>只能使用 <code>exec.CommandContext</code> 调用 CLI。</li>
+            <li>不要把 API key 放到命令行参数。</li>
+            <li><code>dp</code> 凭证只放在权限受控的 config 文件中。</li>
+            <li>返回前端的 stdout、stderr 和错误信息必须脱敏。</li>
+            <li>不要记录完整 <code>dp</code> config、临时凭证或 presigned URL。</li>
+          </ul>
+        </div>
+        <div>
+          <h3>并发策略</h3>
+          <ul>
+            <li>同一 episode 同时只允许一个 active CLI run。</li>
+            <li>创建 run 时在事务里检查 active normal sync 和 active CLI run。</li>
+            <li>完成前锁定 <code>episodes</code> 行并重新检查 <code>cloud_synced</code>。</li>
+            <li>默认并发为 1，避免占满 Keystone 磁盘、CPU 和网络。</li>
+          </ul>
+        </div>
+      </div>
+
+      <div class="callout danger">
+        如果 CLI 上传成功后 Keystone 在落库前崩溃，后续人工重试可能产生重复云端对象。首版接受这个应急通道风险，后续可通过 data-platform 上传 idempotency key 或按 <code>episode_id</code> 查询已上传对象来降低风险。
+      </div>
+
+      <h3>正常 worker 与 CLI 同时完成</h3>
+      <p>如果正常 <code>SyncWorker</code> 在 CLI 上传期间已经把 episode 同步完成，CLI runner 完成落库时应：</p>
+      <ul>
+        <li>将 CLI run 标记为 completed，并保留 <code>dp</code> 返回的审计信息。</li>
+        <li>不覆盖 <code>episodes.cloud_mcap_path</code>。</li>
+        <li>默认不插入第二条 <code>sync_logs.completed</code>，除非产品明确需要重复完成历史。</li>
+      </ul>
+    </section>
+
+    <section id="rollout" class="panel">
+      <h2>10. 落地计划与验收</h2>
+      <h3>10.1 实施顺序</h3>
+      <ol>
+        <li>新增 <code>cli_sync_runs</code> migration 和 repository helper。</li>
+        <li>新增 CLI sync config，默认关闭。</li>
+        <li>实现 backend runner，并用 fake <code>dp</code> 可执行文件做测试。</li>
+        <li>新增 <code>POST /sync/episodes/:id/cli</code> 和 CLI status endpoint。</li>
+        <li>新增 Synapse API wrapper。</li>
+        <li>Episode Detail 增加按钮。</li>
+        <li>Cloud Sync Center 增加行按钮和 CLI 状态展示。</li>
+        <li>只在 staging 环境开启。</li>
+      </ol>
+
+      <h3>10.2 验收标准</h3>
+      <ul>
+        <li>一个 approved 小 MCAP 可以通过 CLI 同步到云。</li>
+        <li>data-platform 对象列表可见该文件。</li>
+        <li>预期 sidecar JSON 标量字段可在 data-platform raw tags 中看到。</li>
+        <li><code>cli_sync_runs</code> 记录 <code>fileId</code> 和 <code>logicalUploadId</code>。</li>
+        <li><code>sync_logs</code> 出现一条 completed 行。</li>
+        <li><code>episodes.cloud_synced = TRUE</code>。</li>
+        <li>正常 <code>SyncWorker</code> 不会再次处理该 episode。</li>
+      </ul>
+
+      <h3>10.3 必测用例</h3>
+      <ul class="two-column-list">
+        <li>功能关闭时拒绝请求。</li>
+        <li>非 approved episode 被拒绝。</li>
+        <li>已 cloud_synced episode 被拒绝。</li>
+        <li>active normal sync 行被拒绝。</li>
+        <li>active CLI run 被拒绝。</li>
+        <li>sidecar JSON 缺失、不可读或格式错误时 run 失败。</li>
+        <li>sidecar JSON 标量字段会作为重复 <code>--tag</code> 参数传给 <code>dp</code>，数组字段首版跳过。</li>
+        <li><code>dp</code> argv 不经过 shell。</li>
+        <li>解析合法 <code>dp --json</code> 输出。</li>
+        <li>缺少关键字段时标记 failed。</li>
+        <li>失败不写 <code>sync_logs</code>。</li>
+        <li>成功更新 episode 和 completed sync log。</li>
+        <li>临时文件成功和失败后都会清理。</li>
+        <li>前端失败提示只显示脱敏错误。</li>
+      </ul>
+    </section>
+
+    <section id="questions" class="panel">
+      <h2>11. 待确认问题</h2>
+      <ul>
+        <li>CLI 失败是否需要进入 Cloud Sync Center 主表，还是只在 episode 详情/CLI badge 展示？</li>
+        <li>正常 worker 已经完成时，CLI completed 是否需要单独追加到 <code>sync_logs</code> 历史？</li>
+        <li><code>dp data upload</code> 是否需要 data-platform 支持显式 idempotency key？</li>
+        <li><code>dp</code> config 应使用 site API key，还是为 Keystone edge site 建一个 device profile？</li>
+      </ul>
+    </section>
+  </main>
+</body>
+</html>
diff --git a/docs/designs/cloud-sync-go-direct-upload.zh.html b/docs/designs/cloud-sync-go-direct-upload.zh.html
new file mode 100644
index 0000000..65f0a88
--- /dev/null
+++ b/docs/designs/cloud-sync-go-direct-upload.zh.html
@@ -0,0 +1,825 @@
+<!--
+SPDX-FileCopyrightText: 2026 ArcheBase
+
+SPDX-License-Identifier: MulanPSL-2.0
+-->
+<!doctype html>
+<html lang="zh-CN">
+<head>
+  <meta charset="utf-8">
+  <meta name="viewport" content="width=device-width, initial-scale=1">
+  <title>Keystone 原生云同步直连 Data Platform 上传方案</title>
+  <style>
+    :root {
+      color-scheme: light;
+      --ink: #17211d;
+      --muted: #5a665f;
+      --line: #d7e1dc;
+      --paper: #ffffff;
+      --surface: #f5f9f7;
+      --surface-strong: #e7f3ee;
+      --accent: #0d6f61;
+      --accent-strong: #084d44;
+      --blue: #1f5f8f;
+      --warn: #8a5a12;
+      --danger: #9e3d32;
+      --success: #1e744d;
+      --code-bg: #10201c;
+      --code-ink: #edf8f3;
+      --shadow: 0 18px 46px rgba(23, 33, 29, 0.08);
+    }
+
+    * {
+      box-sizing: border-box;
+    }
+
+    html {
+      scroll-behavior: smooth;
+    }
+
+    body {
+      margin: 0;
+      color: var(--ink);
+      background:
+        linear-gradient(135deg, rgba(13, 111, 97, 0.13), transparent 34%),
+        linear-gradient(180deg, #f8fbfa 0%, #ffffff 48%);
+      font: 16px/1.72 "Noto Sans SC", "Source Han Sans SC", "Microsoft YaHei", sans-serif;
+    }
+
+    main {
+      max-width: 1160px;
+      margin: 0 auto;
+      padding: 46px 24px 80px;
+    }
+
+    header {
+      display: grid;
+      grid-template-columns: minmax(0, 1fr) 310px;
+      gap: 28px;
+      align-items: end;
+      padding: 32px;
+      border: 1px solid var(--line);
+      border-radius: 16px;
+      background: rgba(255, 255, 255, 0.92);
+      box-shadow: var(--shadow);
+    }
+
+    .eyebrow {
+      margin: 0 0 10px;
+      color: var(--accent-strong);
+      font-size: 13px;
+      font-weight: 700;
+      letter-spacing: 0;
+      text-transform: uppercase;
+    }
+
+    h1 {
+      margin: 0;
+      font-size: clamp(30px, 4.8vw, 46px);
+      line-height: 1.14;
+      letter-spacing: 0;
+    }
+
+    .lead {
+      max-width: 800px;
+      margin: 18px 0 0;
+      color: var(--muted);
+      font-size: 18px;
+    }
+
+    .meta {
+      border-left: 4px solid var(--accent);
+      padding-left: 16px;
+      color: var(--muted);
+      font-size: 14px;
+    }
+
+    .meta strong {
+      display: block;
+      margin-bottom: 8px;
+      color: var(--ink);
+      font-size: 16px;
+    }
+
+    nav {
+      display: flex;
+      flex-wrap: wrap;
+      gap: 10px;
+      margin: 24px 0 4px;
+    }
+
+    nav a {
+      color: var(--accent-strong);
+      background: var(--surface-strong);
+      border: 1px solid #c6ddd3;
+      border-radius: 999px;
+      padding: 8px 12px;
+      text-decoration: none;
+      font-size: 14px;
+    }
+
+    section {
+      margin-top: 38px;
+    }
+
+    h2 {
+      margin: 0 0 14px;
+      font-size: 26px;
+      line-height: 1.26;
+      letter-spacing: 0;
+    }
+
+    h3 {
+      margin: 24px 0 10px;
+      font-size: 19px;
+      line-height: 1.35;
+    }
+
+    p {
+      margin: 10px 0;
+    }
+
+    ul,
+    ol {
+      padding-left: 24px;
+    }
+
+    li {
+      margin: 6px 0;
+    }
+
+    code {
+      border-radius: 5px;
+      background: #edf5f1;
+      padding: 2px 5px;
+      color: #123d35;
+      font-family: "JetBrains Mono", "SFMono-Regular", Consolas, monospace;
+      font-size: 0.92em;
+    }
+
+    pre {
+      overflow: auto;
+      margin: 14px 0;
+      border-radius: 12px;
+      background: var(--code-bg);
+      color: var(--code-ink);
+      padding: 18px;
+      font: 14px/1.65 "JetBrains Mono", "SFMono-Regular", Consolas, monospace;
+    }
+
+    pre code {
+      background: transparent;
+      color: inherit;
+      padding: 0;
+    }
+
+    table {
+      width: 100%;
+      border-collapse: collapse;
+      margin: 14px 0 22px;
+      overflow: hidden;
+      border: 1px solid var(--line);
+      border-radius: 12px;
+      background: var(--paper);
+    }
+
+    th,
+    td {
+      border-bottom: 1px solid var(--line);
+      padding: 12px 14px;
+      text-align: left;
+      vertical-align: top;
+    }
+
+    th {
+      color: var(--accent-strong);
+      background: var(--surface-strong);
+      font-weight: 700;
+    }
+
+    tr:last-child td {
+      border-bottom: 0;
+    }
+
+    .panel {
+      border: 1px solid var(--line);
+      border-radius: 14px;
+      background: rgba(255, 255, 255, 0.94);
+      box-shadow: var(--shadow);
+      padding: 22px;
+    }
+
+    .decision {
+      border-left: 4px solid var(--accent);
+      background: var(--surface);
+      padding: 14px 16px;
+      margin: 16px 0;
+    }
+
+    .warning {
+      border-left: 4px solid var(--warn);
+      background: #fff8ec;
+      padding: 14px 16px;
+      margin: 16px 0;
+    }
+
+    .danger {
+      border-left: 4px solid var(--danger);
+      background: #fff6f4;
+      padding: 14px 16px;
+      margin: 16px 0;
+    }
+
+    .success {
+      border-left: 4px solid var(--success);
+      background: #f1faf5;
+      padding: 14px 16px;
+      margin: 16px 0;
+    }
+
+    .grid {
+      display: grid;
+      grid-template-columns: repeat(2, minmax(0, 1fr));
+      gap: 18px;
+    }
+
+    .flow {
+      display: grid;
+      gap: 10px;
+      margin: 16px 0;
+    }
+
+    .step {
+      border: 1px solid var(--line);
+      border-radius: 10px;
+      background: var(--paper);
+      padding: 13px 15px;
+    }
+
+    .step strong {
+      color: var(--accent-strong);
+    }
+
+    @media (max-width: 840px) {
+      header,
+      .grid {
+        grid-template-columns: 1fr;
+      }
+
+      main {
+        padding: 28px 16px 56px;
+      }
+
+      header {
+        padding: 22px;
+      }
+    }
+  </style>
+</head>
+<body>
+  <main>
+    <header>
+      <div>
+        <p class="eyebrow">Cloud Sync / Data Platform</p>
+        <h1>Keystone 原生云同步直连 Data Platform 上传方案</h1>
+        <p class="lead">
+          本方案采用方案 A：Keystone 读取 <code>KEYSTONE_SYNC_DP_CONFIG</code> 指向的 data-platform
+          config，按 episode 对应的 <code>asset_id</code> 选择 device profile，并复用现有 Go 上传器完成
+          MinIO 到 Data Platform OSS 的流式上传。第一版直接改造 Keystone 原有 cloud sync 上传逻辑，
+          不再依赖 <code>dp data upload</code> 或 Keystone 自有 cloud API key。
+        </p>
+      </div>
+      <div class="meta">
+        <strong>设计状态</strong>
+        草案，面向第一版实现<br>
+        目标路径：原生 cloud sync，不下载 MCAP 到本地，不依赖 <code>dp data upload</code><br>
+        兼容重点：<code>robots.asset_id</code>、device API key、device tags、raw tags 合成
+      </div>
+    </header>
+
+    <nav aria-label="目录">
+      <a href="#goals">目标</a>
+      <a href="#current">当前行为</a>
+      <a href="#architecture">目标架构</a>
+      <a href="#device-mapping">设备映射</a>
+      <a href="#config">配置读取</a>
+      <a href="#tags">Raw Tags</a>
+      <a href="#flow">上传流程</a>
+      <a href="#implementation">实施步骤</a>
+      <a href="#risks">风险</a>
+      <a href="#tests">测试</a>
+    </nav>
+
+    <section id="goals" class="panel">
+      <h2>目标与非目标</h2>
+      <div class="grid">
+        <div>
+          <h3>目标</h3>
+          <ul>
+            <li>更新 Keystone 原有 cloud sync worker 的上传身份和 raw tags 合成逻辑。</li>
+            <li>复用既有 <code>robots.asset_id</code> 作为“云资产编号”，即本地 robot 与 Data Platform device 的稳定映射。</li>
+            <li>episode 创建时将当时的 <code>asset_id</code> 快照写入 <code>episodes.metadata.asset_id</code>。</li>
+            <li>读取 <code>KEYSTONE_SYNC_DP_CONFIG</code> 指向的 data-platform config，按 <code>asset_id</code> 选择 device profile。</li>
+            <li>使用 device profile 的 <code>apiKey</code> 与 AuthService 交换 Bearer token。</li>
+            <li>复用 Keystone 现有 <code>cloud.Uploader</code>，从 MinIO 流式上传到 Data Platform OSS。</li>
+            <li>复刻 Rust SDK 的 raw tags 合并与冲突校验规则。</li>
+          </ul>
+        </div>
+        <div>
+          <h3>非目标</h3>
+          <ul>
+            <li>不在第一版集成 <code>dp device init</code>，device profile 由现场工程师提前初始化。</li>
+            <li>不生成或修改 Data Platform device id，只存储自动化流程写入的 <code>asset_id</code>。</li>
+            <li>不迁移 data-platform config 到 Keystone 配置中心。</li>
+            <li>不保留 Keystone 后端 CLI 同步旁路：<code>CLISyncRunner</code>、CLI sync API、CLI sync 配置和 <code>cli_sync_runs</code> 表迁移均删除。</li>
+            <li>不在每个 episode 上传前执行 device init；init 是一次性准备或凭证轮换动作。</li>
+            <li>不做历史 episode 的自动 <code>asset_id</code> 回填工具；缺失时给出清晰错误并允许手动重试。</li>
+            <li>不新增 direct sync raw tag 数量或总字节数限制。</li>
+            <li>不把 MCAP 完整读入内存，只保持单分片缓冲。</li>
+          </ul>
+        </div>
+      </div>
+    </section>
+
+    <section id="current" class="panel">
+      <h2>当前行为</h2>
+      <p>
+        Keystone 原有 cloud sync 已经是 Go 直连上传：从 MinIO 流式读取 MCAP，创建 data-gateway 上传会话，
+        再分片上传到 Data Platform OSS。当前差异在于它使用 Keystone 自己的 sync API key，并没有像
+        <code>dp data upload --device</code> 一样读取 device profile、注入 device tags 和 reserved raw tags。
+        当前 worker 也没有读取 episode 的云端 device 快照，sidecar raw tag 读取失败时会 best-effort 继续上传。
+      </p>
+      <pre><code>Keystone DB episode
+  -> build sidecar raw tags
+  -> AuthService ExchangeCredential with Keystone sync API key
+  -> data-gateway CreateLogicalUpload
+  -> MinIO GetObject stream
+  -> OSS multipart upload
+  -> data-gateway CompleteUpload
+  -> update sync_logs / episodes</code></pre>
+      <div class="warning">
+        <p>
+          所以第一版不需要新增上传链路，但必须把原有 worker 的上传身份切到 device profile，
+          补齐 asset_id 解析、strict sidecar、non-retryable 错误和与 data-platform Rust SDK 一致的 raw tags 合成规则。
+        </p>
+      </div>
+    </section>
+
+    <section id="architecture" class="panel">
+      <h2>目标架构</h2>
+      <p>
+        新路径保留原有 cloud sync worker 的触发、重试、状态更新和同步日志。每次处理 episode 时，
+        worker 先解析 episode 对应的 <code>asset_id</code>，再根据该值读取 data-platform device
+        profile，并用该 profile 的 API key 构造本次上传专用客户端。
+      </p>
+      <pre><code>Keystone DB episode
+  -> resolve asset_id from episodes.metadata
+     or fallback through episode.workstation_id -> workstations -> robots.asset_id
+  -> load DP config from KEYSTONE_SYNC_DP_CONFIG
+  -> select devices[].deviceId == asset_id
+  -> build effective raw tags
+  -> AuthService ExchangeCredential with device apiKey
+  -> data-gateway CreateLogicalUpload
+  -> MinIO GetObject stream
+  -> OSS multipart upload
+  -> data-gateway CompleteUpload
+  -> update sync_logs / episodes</code></pre>
+
+      <table>
+        <thead>
+          <tr>
+            <th>模块</th>
+            <th>职责</th>
+            <th>建议位置</th>
+          </tr>
+        </thead>
+        <tbody>
+          <tr>
+            <td>Robot asset mapping</td>
+            <td>保存本地 robot 到 Data Platform device 的不可变映射。</td>
+            <td><code>robots.asset_id</code>、robot API、数据库迁移</td>
+          </tr>
+          <tr>
+            <td>Asset resolver</td>
+            <td>优先读取 <code>episodes.metadata.asset_id</code>，缺失时按历史 workstation 反查 robot。</td>
+            <td><code>internal/services/dp_asset_resolver.go</code></td>
+          </tr>
+          <tr>
+            <td>DP config loader</td>
+            <td>解析 <code>endpoints</code> 与 <code>devices[]</code>，按 device id 返回上传所需配置。</td>
+            <td><code>internal/services/dp_config_loader.go</code></td>
+          </tr>
+          <tr>
+            <td>Raw tag builder</td>
+            <td>复刻 Rust SDK 的 tag 合并顺序与冲突规则。</td>
+            <td><code>internal/services/dp_raw_tags.go</code></td>
+          </tr>
+          <tr>
+            <td>Direct uploader factory</td>
+            <td>按 episode 创建本次专用 <code>AuthClient</code>、<code>GatewayClient</code> 和 <code>cloud.Uploader</code>。</td>
+            <td><code>internal/services/sync_worker.go</code></td>
+          </tr>
+          <tr>
+            <td>Cloud uploader</td>
+            <td>复用现有 data-gateway 与 OSS multipart 上传能力。</td>
+            <td><code>internal/cloud/uploader.go</code></td>
+          </tr>
+        </tbody>
+      </table>
+    </section>
+
+    <section id="device-mapping" class="panel">
+      <h2>云资产编号映射规则</h2>
+      <p>
+        Keystone 本地 <code>robots.device_id</code> 继续表示 Axon / Keystone 内部设备编号，不参与 Data Platform
+        device 身份选择。云交互只使用既有 <code>robots.asset_id</code>，前端文案统一显示为“云资产编号”。
+      </p>
+
+      <h3>Robot 字段规则</h3>
+      <ul>
+        <li><code>robots.asset_id</code> 初始允许为空，创建 robot 时可写可不写。</li>
+        <li>首次设置非空后不可修改、不可清空；同值更新视为幂等。</li>
+        <li>active robots 的非空 <code>asset_id</code> 必须唯一，软删除 robot 不占用唯一性。</li>
+        <li>保存前 trim；空字符串按 NULL；最大长度 100；不做 Data Platform device id 格式正则。</li>
+        <li>robot create / update / list / detail API 暴露 <code>asset_id</code>。</li>
+      </ul>
+
+      <h3>Episode 快照规则</h3>
+      <ul>
+        <li>episode 创建时，如果能从 task -> workstation -> robot 解析到非空 <code>asset_id</code>，写入 <code>episodes.metadata.asset_id</code>。</li>
+        <li>episode 创建不因 <code>asset_id</code> 缺失失败，本地采集、QA 和入库继续成功。</li>
+        <li>不修改 sidecar JSON，不把 <code>asset_id</code> 写回采集产物。</li>
+        <li>第一版不提供自动历史回填工具；缺失时通过错误信息提示配置 robot 或手动回填 metadata 后再手动同步。</li>
+      </ul>
+
+      <h3>上传时解析优先级</h3>
+      <pre><code>if episodes.metadata.asset_id is non-empty:
+    use metadata.asset_id
+else:
+    load workstation by episode.workstation_id, including soft-deleted workstation rows
+    load robots.asset_id by workstation.robot_id
+    use robots.asset_id if non-empty
+if still empty:
+    fail as non-retryable configuration error</code></pre>
+
+      <div class="decision">
+        <p>
+          cloud sync 不 fallback 到 <code>robots.device_id</code>。工位当前允许直接更新 <code>robot_id</code>；
+          后续如果换绑改为“旧工位软删 + 新工位记录”，fallback 查询也必须允许读取软删除 workstation，
+          因为 episode 的 <code>workstation_id</code> 是历史引用。
+        </p>
+      </div>
+    </section>
+
+    <section id="config" class="panel">
+      <h2>方案 A：读取 Data Platform Config</h2>
+      <p>
+        Keystone 使用 <code>KEYSTONE_SYNC_DP_CONFIG</code> 指向的 data-platform config。该文件由现场工程师提前通过
+        <code>dp config</code> 和 <code>dp device init</code> 生成和维护，Keystone 只解析直连上传需要的字段。
+        原生 direct sync 不再依赖 <code>KEYSTONE_CLOUD_API_KEY</code>、
+        <code>KEYSTONE_SYNC_AUTH_ENDPOINT</code> 或 <code>KEYSTONE_SYNC_GATEWAY_ENDPOINT</code>。
+      </p>
+
+      <h3>需要解析的 JSON 字段</h3>
+      <pre><code>{
+  "version": 3,
+  "endpoints": {
+    "auth": "https://auth.example.com:50051",
+    "gateway": "https://gateway.example.com:50052"
+  },
+  "devices": [
+    {
+      "deviceId": "AB-F0001-T0001-000006",
+      "apiKey": "ak_v1.device_secret",
+      "tags": {
+        "778a6d83c9ec49108537542a570966ee.device_id": "AB-F0001-T0001-000006",
+        "line": "a"
+      },
+      "initializedAtUnix": 1760000000
+    }
+  ]
+}</code></pre>
+
+      <h3>Go 结构建议</h3>
+      <pre><code>type DPConfigFile struct {
+    Version   *int              `json:"version,omitempty"`
+    Endpoints DPConfigEndpoints `json:"endpoints"`
+    Devices   []DPDeviceProfile `json:"devices"`
+}
+
+type DPConfigEndpoints struct {
+    Auth    string `json:"auth"`
+    Gateway string `json:"gateway"`
+}
+
+type DPDeviceProfile struct {
+    DeviceID string            `json:"deviceId"`
+    APIKey   string            `json:"apiKey"`
+    Tags     map[string]string `json:"tags"`
+}
+
+type DPResolvedEndpoint struct {
+    Target     string
+    UseTLS     bool
+    ServerName string
+}</code></pre>
+
+      <h3>解析规则</h3>
+      <ul>
+        <li><code>version</code> 缺失或等于 3 可接受；存在且不等于 3 时失败。</li>
+        <li><code>devices[].deviceId</code> trim 后比较，大小写敏感；重复 device id 直接失败。</li>
+        <li><code>deviceId</code> 必须与 Keystone 解析出的 <code>asset_id</code> 一致。</li>
+        <li><code>apiKey</code> 不能为空，且永不打印明文日志。</li>
+        <li><code>tags</code> 不能为空，保持与 Rust SDK <code>require_device_upload()</code> 一致；tag key/value 不 trim、不改写，但 key 必须非空。</li>
+        <li><code>endpoints.auth</code> 和 <code>endpoints.gateway</code> 必须来自 config 文件，不支持 <code>ARCHEBASE_*</code> 或 <code>KEYSTONE_SYNC_*</code> overlay。</li>
+        <li>每个 episode 上传前重新读取 config 文件，避免长期进程缓存旧 device profile。</li>
+      </ul>
+
+      <h3>Endpoint 与 TLS 规则</h3>
+      <ul>
+        <li><code>https://host[:port]</code> 使用 TLS gRPC；未写端口时补 <code>443</code>；TLS CA 使用系统 CA，server name 使用 URL host。</li>
+        <li><code>http://host[:port]</code> 使用 insecure gRPC；未写端口时补 <code>80</code>。</li>
+        <li><code>host[:port]</code> 兼容裸地址，按 insecure gRPC 处理，不自动补端口。</li>
+        <li>endpoint 禁止 path、query 和 fragment，例如 <code>https://host:50051/foo</code> 应视为配置错误。</li>
+        <li>Auth 和 Gateway 不强制使用同一种 scheme，分别按各自 endpoint 解析。</li>
+        <li>第一版不支持自定义 CA 文件或 TLS server name override。</li>
+      </ul>
+
+      <div class="decision">
+        <p>
+          第一版不把 device profile 写入 Keystone 数据库。Keystone 只消费 data-platform config，
+          这样可以最大限度贴近当前 <code>dp --config ... --device ...</code> 的上传身份语义，同时避免
+          Keystone 自有 sync API key 与 device API key 混用。
+        </p>
+      </div>
+
+      <h3>现场前置动作</h3>
+      <pre><code>dp --config /home/shark/.archebase/config.json config
+dp --config /home/shark/.archebase/config.json device init AB-F0001-T0001-000006</code></pre>
+      <p>
+        初始化成功后，config 中会出现对应的 <code>devices[]</code> profile。后续 Keystone 上传只读取该 profile，
+        不在上传前自动执行 init。凭证过期、设备迁移或平台侧 tags 变化时，由现场工程师执行
+        <code>dp device reinit ... --yes</code> 轮换。
+      </p>
+    </section>
+
+    <section id="tags" class="panel">
+      <h2>Raw Tags 合并规则</h2>
+      <p>
+        直连上传必须复刻 data-platform Rust SDK 的 <code>build_upload_tags()</code> 语义。合并过程使用非冲突插入：
+        如果 key 已存在且 value 不同，直接失败；如果 key 已存在且 value 相同，视为幂等。
+      </p>
+
+      <table>
+        <thead>
+          <tr>
+            <th>顺序</th>
+            <th>来源</th>
+            <th>说明</th>
+          </tr>
+        </thead>
+        <tbody>
+          <tr>
+            <td>1</td>
+            <td>device profile tags</td>
+            <td>来自 <code>devices[].tags</code>，由 data-platform device init 生成。</td>
+          </tr>
+          <tr>
+            <td>2</td>
+            <td>device id reserved tag</td>
+            <td><code>778a6d83c9ec49108537542a570966ee.device_id</code>，值为 profile 的 <code>deviceId</code>。</td>
+          </tr>
+          <tr>
+            <td>3</td>
+            <td>original file reserved tag</td>
+            <td><code>a206e337ecdf70a93bb611cf6a30c346.raw_file</code>，值固定使用 MinIO MCAP object key 的 basename。</td>
+          </tr>
+          <tr>
+            <td>4</td>
+            <td>Keystone sidecar tags</td>
+            <td>从 sidecar JSON 扁平化得到；数组字段 JSON encode，顶层 <code>topics_summary</code> 排除。</td>
+          </tr>
+          <tr>
+            <td>5</td>
+            <td>Keystone extra tags</td>
+            <td><code>episode_id</code>、<code>keystone_episode_id</code>、<code>sync_channel</code>、<code>task_id</code>、<code>factory_id</code>、<code>organization_id</code>。</td>
+          </tr>
+        </tbody>
+      </table>
+
+      <div class="warning">
+        <p>
+          原有 cloud sync 没有本地临时文件，因此 reserved <code>raw_file</code> 不读取 sidecar 字段，
+          只使用 <code>basename(stripBucketPrefix(episodes.mcap_path))</code>。如果 basename 为空，本次上传失败。
+        </p>
+      </div>
+
+      <div class="decision">
+        <p>
+          Keystone 不新增普通 <code>device_id</code> raw tag。设备归属只通过
+          <code>778a6d83c9ec49108537542a570966ee.device_id</code> reserved tag 表达，并由 Keystone 本地注入、
+          data-gateway 服务端二次校验。
+        </p>
+      </div>
+
+      <h3>Sidecar 规则</h3>
+      <ul>
+        <li>direct device sync 改为 strict：<code>sidecar_path</code> 为空、对象不可读或 JSON 解析失败时，不创建 data-gateway upload session。</li>
+        <li>数组字段保留为 JSON 字符串；顶层 <code>topics_summary</code> 继续排除。</li>
+        <li>第一版不新增 raw tag 数量或总字节数限制。</li>
+      </ul>
+
+      <h3>合并伪代码</h3>
+      <pre><code>merged := map[string]string{}
+insertAllNonConflicting(merged, deviceProfile.Tags)
+insertNonConflicting(merged, deviceIDRawTagKey, deviceProfile.DeviceID)
+insertNonConflicting(merged, originalFileRawTagKey, mcapBaseName)
+insertAllNonConflicting(merged, sidecarTags)
+insertAllNonConflicting(merged, keystoneExtraTags)
+return merged</code></pre>
+    </section>
+
+    <section id="flow" class="panel">
+      <h2>直连上传流程</h2>
+      <div class="flow">
+        <div class="step"><strong>1. 领取 episode</strong>：沿用原有 cloud sync worker 的自动扫描、手动触发、重试和并发控制。</div>
+        <div class="step"><strong>2. 加载 episode</strong>：读取 MCAP MinIO key、sidecar path、metadata、workstation id 和任务上下文。</div>
+        <div class="step"><strong>3. 解析 asset_id</strong>：优先使用 <code>episodes.metadata.asset_id</code>，否则通过历史 workstation 反查 <code>robots.asset_id</code>。</div>
+        <div class="step"><strong>4. 加载 DP config</strong>：从 <code>KEYSTONE_SYNC_DP_CONFIG</code> 读取 device profile 和 endpoints。</div>
+        <div class="step"><strong>5. 构造 raw tags</strong>：合并 device tags、reserved tags、sidecar tags 和 Keystone extra tags，执行冲突校验。</div>
+        <div class="step"><strong>6. 构造 direct uploader</strong>：为本次 episode 创建专用 <code>AuthClient</code>、<code>GatewayClient</code> 和 <code>cloud.Uploader</code>。</div>
+        <div class="step"><strong>7. 执行上传</strong>：调用 <code>cloud.Uploader.Upload()</code>，从 MinIO 流式读取 MCAP 并上传 OSS。</div>
+        <div class="step"><strong>8. 写回结果</strong>：沿用原有成功路径，更新 <code>sync_logs</code>、<code>episodes.cloud_synced</code> 和 <code>cloud_mcap_path</code>。</div>
+      </div>
+
+      <h3>结果字段映射</h3>
+      <p>
+        第一版不新增 <code>episodes</code> 字段，也不扩展 <code>sync_logs</code> 表。Data Platform 审计 ID 先通过日志输出；
+        如后续 UI 或 API 需要直接按 episode 查询，再单独扩表。
+      </p>
+      <table>
+        <thead>
+          <tr>
+            <th>结果</th>
+            <th>Go direct 来源</th>
+            <th>说明</th>
+          </tr>
+        </thead>
+        <tbody>
+          <tr>
+            <td><code>file_id</code></td>
+            <td><code>cloud.UploadResult.UploadID</code></td>
+            <td>第一版不落库，只记录日志；如后续需要在 Keystone 记录 Data Platform 文件 ID，可直接使用该值。</td>
+          </tr>
+          <tr>
+            <td><code>logical_upload_id</code></td>
+            <td><code>cloud.UploadResult.LogicalUploadID</code></td>
+            <td>第一版不落库，只记录日志。</td>
+          </tr>
+          <tr>
+            <td><code>upload_id</code></td>
+            <td><code>cloud.UploadResult.UploadID</code></td>
+            <td>与 Data Platform SDK 返回的 <code>fileId</code> 当前等价，第一版不落库。</td>
+          </tr>
+          <tr>
+            <td><code>object_key</code></td>
+            <td><code>cloud.UploadResult.ObjectKey</code></td>
+            <td>写入 <code>sync_logs.destination_path</code> 和 <code>episodes.cloud_mcap_path</code>。</td>
+          </tr>
+          <tr>
+            <td><code>oss_object_etag</code></td>
+            <td><code>cloud.UploadResult.OSSObjectETag</code></td>
+            <td>客户端计算并回传给 data-gateway 的 multipart ETag，第一版不落库，只记录日志。</td>
+          </tr>
+        </tbody>
+      </table>
+    </section>
+
+    <section id="implementation" class="panel">
+      <h2>实施步骤</h2>
+      <ol>
+        <li>
+          复用 <code>robots.asset_id</code> 字段，增加 active 非空唯一约束；
+          create / update 实现 trim、控制字符校验、“首次非空设置后不可修改、不可清空、同值幂等”。
+        </li>
+        <li>
+          episode 创建时解析 task -> workstation -> robot 的 <code>asset_id</code>，非空时写入
+          <code>episodes.metadata.asset_id</code>，但缺失不阻止 episode 创建。
+        </li>
+        <li>
+          新增 DP config loader，读取 <code>SyncConfig.DPConfigPath</code> / <code>KEYSTONE_SYNC_DP_CONFIG</code>，
+          校验 version、endpoint、重复 device id、空 apiKey 和空 tags，并按 <code>asset_id</code> 返回 profile。
+        </li>
+        <li>
+          给 <code>SyncWorker</code> 增加 asset_id resolver：优先读 <code>episodes.metadata.asset_id</code>，
+          缺失时允许读取软删除 workstation 并反查 <code>robots.asset_id</code>。
+        </li>
+        <li>
+          新增 raw tags builder，包含两个 reserved key 常量、非冲突插入、MinIO basename 选择、strict sidecar
+          和 Keystone extra tags；不添加普通 <code>device_id</code>。
+        </li>
+        <li>
+          调整 uploader 构造方式，每个 episode 创建本次专用 <code>AuthClient</code>/<code>GatewayClient</code>/<code>Uploader</code>；
+          endpoint scheme 决定 TLS，TLS 使用系统 CA。
+        </li>
+        <li>
+          在 <code>cloud.UploadRequest</code> 和 persisted upload state 中记录 <code>AssetID</code>；
+          恢复上传时只有 MCAP key 和 <code>asset_id</code> 同时匹配才允许复用旧 session。
+        </li>
+        <li>
+          更新 <code>GatewayClient.CompleteUpload()</code> 签名，complete 时回传 <code>session.PartSizeBytes</code> 到
+          <code>CompleteUploadRequest.part_size_bytes</code>。
+        </li>
+        <li>
+          引入 retryable / non-retryable 错误分类：non-retryable failed 写 <code>next_retry_at = NULL</code>；
+          auto scan 跳过 latest failed 且 <code>next_retry_at IS NULL</code> 的 episode，manual sync 仍可重试。
+        </li>
+        <li>
+          删除 CLI 同步旁路：Synapse 不展示 CLI sync UI，Keystone 不注册 CLI sync API，不初始化
+          <code>CLISyncRunner</code>，不读取 <code>KEYSTONE_CLI_SYNC_*</code> 配置，也不保留
+          <code>cli_sync_runs</code> 迁移。
+        </li>
+      </ol>
+    </section>
+
+    <section id="risks" class="panel">
+      <h2>风险与处理</h2>
+      <table>
+        <thead>
+          <tr>
+            <th>风险</th>
+            <th>影响</th>
+            <th>处理策略</th>
+          </tr>
+        </thead>
+        <tbody>
+          <tr>
+            <td><code>asset_id</code> 缺失</td>
+            <td>无法选择 device profile，episode 本次 sync 失败。</td>
+            <td>写入 non-retryable failed，<code>next_retry_at = NULL</code>；错误信息包含 episode、workstation、robot 和修复方向，手动修复后手动重试。</td>
+          </tr>
+          <tr>
+            <td><code>asset_id</code> 填错</td>
+            <td>本地 robot 会永久绑定错误的 Data Platform device。</td>
+            <td>字段首次非空设置后不可修改；未来由自动化流程写入，第一版不提供 break-glass 维护入口。</td>
+          </tr>
+          <tr>
+            <td>device profile 缺失或不完整</td>
+            <td>上传前失败，episode 本次 sync 失败。</td>
+            <td>错误信息包含 <code>asset_id</code> 和 config path，但不打印 API key；提示现场执行 <code>dp device init</code> 或 <code>dp device reinit</code>。</td>
+          </tr>
+          <tr>
+            <td>endpoint / TLS 配置错误</td>
+            <td>Auth 或 Gateway 连接失败。</td>
+            <td>endpoint 只来自 DP config；按 <code>http</code>/<code>https</code> scheme 自动解析 TLS；禁止 path/query/fragment，日志打印 target 和 TLS 标志。</td>
+          </tr>
+          <tr>
+            <td>secret 泄漏</td>
+            <td>日志或错误信息暴露 device API key。</td>
+            <td>loader 和上传日志只打印 <code>asset_id</code>、config path、endpoint，不打印 API key、token、STS secret。</td>
+          </tr>
+          <tr>
+            <td>sidecar 缺失或格式错误</td>
+            <td>云端对象缺少业务 raw tags，影响检索。</td>
+            <td>direct sync 对 sidecar strict；格式错误 non-retryable，MinIO 读对象失败可自动重试。</td>
+          </tr>
+          <tr>
+            <td>恢复状态身份混用</td>
+            <td>同一 MCAP 可能复用另一个 device 身份创建的 upload session。</td>
+            <td>persisted upload state 记录 <code>asset_id</code>，恢复时必须同时匹配 MCAP key 和 <code>asset_id</code>。</td>
+          </tr>
+          <tr>
+            <td>CLI 同步旁路遗留</td>
+            <td>用户可能继续使用旧 CLI 同步入口，产生两条语义不同的同步路径。</td>
+            <td>删除后端 CLI sync runner、API、配置和表迁移；只保留原生 direct sync 入口。</td>
+          </tr>
+        </tbody>
+      </table>
+    </section>
+
+    <section id="tests" class="panel">
+      <h2>测试计划</h2>
+      <ul>
+        <li>单元测试 robot API / 存储：<code>asset_id</code> 可首次设置、同值幂等、不可修改、不可清空、active 非空唯一。</li>
+        <li>单元测试 episode 创建：有 <code>asset_id</code> 时写入 <code>episodes.metadata.asset_id</code>，缺失时仍创建 episode。</li>
+        <li>单元测试 asset_id resolver：metadata 优先、fallback 读取软删除 workstation、缺失时报 non-retryable 错误、不 fallback 到 <code>robots.device_id</code>。</li>
+        <li>单元测试 DP config loader：version、endpoint scheme/TLS、禁止 path/query/fragment、成功选择 device、缺失 device、空 apiKey、空 tags、重复 deviceId。</li>
+        <li>单元测试 raw tags builder：合并顺序、reserved device tag 注入、raw_file 使用 MinIO basename、相同 key 相同 value 幂等、相同 key 不同 value 报错、空 value 保留。</li>
+        <li>单元测试 SyncWorker 错误分类：non-retryable failed 写 <code>next_retry_at=NULL</code>，auto scan 跳过，manual sync 可重新尝试。</li>
+        <li>单元测试 uploader 持久化恢复：同 MCAP key 但 <code>asset_id</code> 不同不复用旧 state。</li>
+        <li>集成测试 fake gateway/OSS：验证使用 device API key、raw tags 完整、<code>part_size_bytes</code> 回传、<code>object_key</code> 写回现有 DB 字段。</li>
+        <li>现场灰度：同一小 MCAP 分别跑当前原始上传和 device profile 上传，对比 raw tags、文件大小、ETag、Data Platform 可检索性。</li>
+      </ul>
+
+      <div class="success">
+        <p>
+          验收标准：原生 cloud sync 不产生本地 MCAP 临时文件，不依赖 <code>dp data upload</code>，
+          不依赖 <code>KEYSTONE_CLOUD_API_KEY</code>、<code>KEYSTONE_SYNC_AUTH_ENDPOINT</code> 或
+          <code>KEYSTONE_SYNC_GATEWAY_ENDPOINT</code>，
+          Data Platform 中的文件可通过 <code>fileId</code> 检索，Keystone episode 状态与现有 cloud sync 一致。
+        </p>
+      </div>
+    </section>
+  </main>
+</body>
+</html>
diff --git a/docs/designs/data-quality-center-mvp.zh.html b/docs/designs/data-quality-center-mvp.zh.html
new file mode 100644
index 0000000..673d326
--- /dev/null
+++ b/docs/designs/data-quality-center-mvp.zh.html
@@ -0,0 +1,892 @@
+<!--
+SPDX-FileCopyrightText: 2026 ArcheBase
+
+SPDX-License-Identifier: MulanPSL-2.0
+-->
+<!doctype html>
+<html lang="zh-CN">
+<head>
+  <meta charset="utf-8">
+  <meta name="viewport" content="width=device-width, initial-scale=1">
+  <title>数据质检中心简化版设计</title>
+  <style>
+    :root {
+      color-scheme: light;
+      --ink: #18241f;
+      --muted: #5d6a63;
+      --line: #d8e1dc;
+      --paper: #ffffff;
+      --surface: #f7faf8;
+      --surface-strong: #edf5f1;
+      --accent: #126b5e;
+      --accent-strong: #0b4f47;
+      --warn: #9b5b16;
+      --danger: #9f352e;
+      --code-bg: #14231f;
+      --code-ink: #edf7f3;
+      --shadow: 0 18px 44px rgba(21, 42, 34, 0.08);
+    }
+
+    * {
+      box-sizing: border-box;
+    }
+
+    body {
+      margin: 0;
+      color: var(--ink);
+      background:
+        linear-gradient(135deg, rgba(18, 107, 94, 0.12), transparent 34%),
+        linear-gradient(180deg, #f8fbf9 0%, #ffffff 44%);
+      font: 16px/1.72 "Noto Sans SC", "Source Han Sans SC", "Microsoft YaHei", sans-serif;
+    }
+
+    main {
+      max-width: 1120px;
+      margin: 0 auto;
+      padding: 48px 24px 80px;
+    }
+
+    header {
+      display: grid;
+      grid-template-columns: minmax(0, 1fr) 300px;
+      gap: 28px;
+      align-items: end;
+      padding: 32px;
+      border: 1px solid var(--line);
+      border-radius: 18px;
+      background: rgba(255, 255, 255, 0.9);
+      box-shadow: var(--shadow);
+      backdrop-filter: blur(8px);
+    }
+
+    .eyebrow {
+      margin: 0 0 10px;
+      color: var(--accent-strong);
+      font-size: 13px;
+      font-weight: 700;
+      letter-spacing: 0;
+      text-transform: uppercase;
+    }
+
+    h1 {
+      margin: 0;
+      font-size: clamp(30px, 5vw, 46px);
+      line-height: 1.14;
+      letter-spacing: 0;
+    }
+
+    .lead {
+      max-width: 760px;
+      margin: 16px 0 0;
+      color: var(--muted);
+      font-size: 18px;
+    }
+
+    .header-meta {
+      border-left: 4px solid var(--accent);
+      padding-left: 16px;
+      color: var(--muted);
+      font-size: 14px;
+    }
+
+    .header-meta strong {
+      display: block;
+      margin-bottom: 8px;
+      color: var(--ink);
+      font-size: 16px;
+    }
+
+    nav {
+      display: flex;
+      flex-wrap: wrap;
+      gap: 10px;
+      margin: 24px 0 4px;
+    }
+
+    nav a {
+      color: var(--accent-strong);
+      background: var(--surface-strong);
+      border: 1px solid #c9ded5;
+      border-radius: 999px;
+      padding: 8px 12px;
+      text-decoration: none;
+      font-size: 14px;
+    }
+
+    section {
+      margin-top: 38px;
+    }
+
+    h2 {
+      margin: 0 0 14px;
+      font-size: 26px;
+      line-height: 1.25;
+      letter-spacing: 0;
+    }
+
+    h3 {
+      margin: 24px 0 10px;
+      font-size: 19px;
+      line-height: 1.35;
+    }
+
+    p {
+      margin: 10px 0;
+    }
+
+    ul,
+    ol {
+      padding-left: 24px;
+    }
+
+    li {
+      margin: 7px 0;
+    }
+
+    .band {
+      padding: 22px;
+      border: 1px solid var(--line);
+      border-radius: 14px;
+      background: var(--paper);
+      box-shadow: 0 12px 30px rgba(20, 43, 35, 0.05);
+    }
+
+    .grid {
+      display: grid;
+      grid-template-columns: repeat(2, minmax(0, 1fr));
+      gap: 18px;
+    }
+
+    .three-grid {
+      display: grid;
+      grid-template-columns: repeat(3, minmax(0, 1fr));
+      gap: 14px;
+    }
+
+    .panel {
+      padding: 18px;
+      border: 1px solid var(--line);
+      border-radius: 12px;
+      background: var(--surface);
+    }
+
+    .panel h3 {
+      margin-top: 0;
+    }
+
+    .ok {
+      border-color: #b9dacd;
+      background: #f3faf6;
+    }
+
+    .no {
+      border-color: #ead1c0;
+      background: #fff8f1;
+    }
+
+    .note {
+      border-left: 4px solid var(--accent);
+      padding: 12px 16px;
+      background: #f2faf6;
+      color: var(--muted);
+    }
+
+    .warning {
+      border-left-color: var(--warn);
+      background: #fff8ee;
+    }
+
+    table {
+      width: 100%;
+      border-collapse: collapse;
+      margin: 14px 0 22px;
+      background: var(--paper);
+    }
+
+    th,
+    td {
+      border: 1px solid var(--line);
+      padding: 10px 12px;
+      text-align: left;
+      vertical-align: top;
+    }
+
+    th {
+      background: var(--surface-strong);
+      font-weight: 700;
+    }
+
+    code {
+      font-family: "JetBrains Mono", "SFMono-Regular", Consolas, monospace;
+      background: #e7f2ec;
+      color: #12382f;
+      border-radius: 5px;
+      padding: 1px 5px;
+      font-size: 0.94em;
+    }
+
+    pre {
+      overflow-x: auto;
+      margin: 14px 0 20px;
+      padding: 16px;
+      color: var(--code-ink);
+      background: var(--code-bg);
+      border-radius: 12px;
+      line-height: 1.55;
+    }
+
+    pre code {
+      color: inherit;
+      background: transparent;
+      padding: 0;
+    }
+
+    .flow {
+      display: grid;
+      gap: 10px;
+      margin: 14px 0 4px;
+    }
+
+    .flow-step {
+      display: grid;
+      grid-template-columns: 28px minmax(0, 1fr);
+      gap: 12px;
+      align-items: start;
+      padding: 12px;
+      border: 1px solid var(--line);
+      border-radius: 10px;
+      background: var(--paper);
+    }
+
+    .flow-step span {
+      display: inline-flex;
+      width: 28px;
+      height: 28px;
+      align-items: center;
+      justify-content: center;
+      border-radius: 50%;
+      color: white;
+      background: var(--accent);
+      font-size: 13px;
+      font-weight: 800;
+    }
+
+    .tag-list {
+      display: flex;
+      flex-wrap: wrap;
+      gap: 8px;
+      padding: 0;
+      list-style: none;
+    }
+
+    .tag-list li {
+      margin: 0;
+      padding: 6px 10px;
+      border: 1px solid #c8ded4;
+      border-radius: 999px;
+      background: var(--surface-strong);
+      color: var(--accent-strong);
+      font-size: 13px;
+      font-weight: 700;
+    }
+
+    @media (max-width: 860px) {
+      main {
+        padding: 24px 16px 56px;
+      }
+
+      header,
+      .grid,
+      .three-grid {
+        grid-template-columns: 1fr;
+      }
+
+      header {
+        padding: 24px;
+      }
+    }
+  </style>
+</head>
+<body>
+  <main>
+    <header>
+      <div>
+        <p class="eyebrow">Synapse / Keystone MVP</p>
+        <h1>数据质检中心简化版设计</h1>
+        <p class="lead">
+          简化版仍然使用 Python 脚本做质检，但首发只提供一个系统内置固定脚本，用来做 MCAP 预览可用性 smoke check。管理员在 Synapse 的数据运维模块进入“质检中心”，即可对全部 Episode 或筛选结果发起质检。
+        </p>
+      </div>
+      <aside class="header-meta">
+        <strong>上线目标</strong>
+        快速形成“内置脚本 -> 批量质检 -> 写回 QA 状态 -> 人工复核 -> 云同步按 QA 状态放行”的闭环，优先拦截预览时报 MCAP magic 不匹配的坏包。
+      </aside>
+    </header>
+
+    <nav aria-label="目录">
+      <a href="#decision">设计取舍</a>
+      <a href="#scope">MVP 范围</a>
+      <a href="#user-flow">用户流程</a>
+      <a href="#architecture">架构</a>
+      <a href="#data-model">数据模型</a>
+      <a href="#status">状态规则</a>
+      <a href="#default-script">内置脚本</a>
+      <a href="#script-contract">脚本契约</a>
+      <a href="#api">API</a>
+      <a href="#synapse">Synapse 页面</a>
+      <a href="#implementation">实施顺序</a>
+    </nav>
+
+    <section id="decision">
+      <h2>1. 设计取舍</h2>
+      <div class="band">
+        <p>
+          完整版的“脚本管理 + 版本 + run + job + override + 独立 Runner”能力适合长期演进，但首发上线成本偏高。简化版保留脚本执行能力，把管理面压缩成“一个内置固定脚本 + 多个质检任务”。
+        </p>
+        <table>
+          <thead>
+            <tr>
+              <th>能力</th>
+              <th>完整版</th>
+              <th>简化版</th>
+            </tr>
+          </thead>
+          <tbody>
+            <tr>
+              <td>脚本数量</td>
+              <td>多个脚本，支持 global / sop 范围</td>
+              <td>只提供一个内置脚本 <code>builtin_mcap_preview_smoke_check</code>，覆盖所有数据</td>
+            </tr>
+            <tr>
+              <td>版本管理</td>
+              <td>脚本定义和不可变版本分表</td>
+              <td>不做上传和版本管理；job 快照保存内置脚本 key、version、SHA</td>
+            </tr>
+            <tr>
+              <td>执行器</td>
+              <td>独立 <code>keystone-quality-runner</code> 进程</td>
+              <td>Keystone 内置轻量 worker，默认并发 1</td>
+            </tr>
+            <tr>
+              <td>触发方式</td>
+              <td>Episode 创建自动触发，支持重跑</td>
+              <td>支持新数据自动入队，也支持 Synapse 一键全量/筛选触发</td>
+            </tr>
+            <tr>
+              <td>人工覆盖</td>
+              <td>独立 <code>quality_overrides</code> 表</td>
+              <td>复用现有 <code>inspections</code> 表和 <code>episodes</code> QA 字段</td>
+            </tr>
+          </tbody>
+        </table>
+        <p class="note warning">
+          这个方案的关键约束是“首发只解决预览入口的可读性问题”，包括对象 size / range read 异常和 MCAP 边界 magic 异常。如果后续需要按 SOP、机器人类型或场景配置不同脚本，再升级到完整版的脚本版本模型。
+        </p>
+      </div>
+    </section>
+
+    <section id="scope">
+      <h2>2. MVP 范围</h2>
+      <div class="grid">
+        <article class="panel ok">
+          <h3>包含</h3>
+          <ul>
+            <li>Synapse 管理后台新增 <code>数据运维 / 质检中心</code>。</li>
+            <li>Keystone 内置固定 Python 脚本 <code>builtin_mcap_preview_smoke_check</code>，全局适用于所有 Episode。</li>
+            <li>内置脚本先检查对象 size / range read 是否可用，再检查 MCAP 边界 magic。</li>
+            <li>脚本元数据以代码常量形式提供，执行批次和执行任务存 MySQL。</li>
+            <li>支持对全部非删除 Episode 发起质检。</li>
+            <li>支持按 QA 状态、创建时间、设备 ID、采集员工号筛选后发起质检。</li>
+            <li>支持新创建 Episode 自动进入质检队列，开关可配置。</li>
+            <li>脚本结果回写 <code>episodes.qa_status</code>、<code>qa_score</code>、<code>quality_flag</code>。</li>
+            <li>执行异常、超时、非法输出统一进入 <code>needs_inspection</code>。</li>
+            <li>Episode 详情支持查看最近一次脚本结果、手动重跑、人工通过/驳回。</li>
+          </ul>
+        </article>
+        <article class="panel no">
+          <h3>不包含</h3>
+          <ul>
+            <li>不做多脚本并行规则。</li>
+            <li>不做 SOP / 场景 / 机器人类型范围匹配。</li>
+            <li>不做在线代码编辑器。</li>
+            <li>不做脚本上传、替换、ZIP、多文件包、Git 脚本源或动态安装依赖。</li>
+            <li>不做激活前 test run。</li>
+            <li>不做复杂版本列表、版本 diff 或回滚页面。</li>
+            <li>不做任务取消；误触发时让当前 job 完成，后续可重新触发质检。</li>
+            <li>不自动撤回已经云同步的数据。</li>
+          </ul>
+        </article>
+      </div>
+    </section>
+
+    <section id="user-flow">
+      <h2>3. 用户流程</h2>
+      <div class="flow">
+        <div class="flow-step">
+          <span>1</span>
+          <div>管理员进入 Synapse <code>数据运维 / 质检中心</code>，看到内置脚本、最近批次、任务列表和 QA 汇总。</div>
+        </div>
+        <div class="flow-step">
+          <span>2</span>
+          <div>管理员不需要上传脚本，直接点击“质检全部数据”或设置筛选条件后点击“质检筛选结果”。</div>
+        </div>
+        <div class="flow-step">
+          <span>3</span>
+          <div>Keystone 为匹配的 Episode 创建 <code>quality_jobs</code>，每个 job 都记录内置脚本 key、version 和 SHA。</div>
+        </div>
+        <div class="flow-step">
+          <span>4</span>
+          <div>Keystone 内置 worker 领取 pending job，读取 MCAP 前 8 字节和后 8 字节，执行内置 Python 脚本，并保存 stdout、stderr、result JSON 摘要。</div>
+        </div>
+        <div class="flow-step">
+          <span>5</span>
+          <div>job 完成后 Keystone 按脚本输出更新 Episode QA 状态。云同步仍只放行 <code>approved</code> 和 <code>inspector_approved</code>。</div>
+        </div>
+      </div>
+    </section>
+
+    <section id="architecture">
+      <h2>4. 架构</h2>
+      <div class="band">
+        <pre><code>Synapse Admin
+  -> /api/v1/admin/quality/batches      start all-data or filtered QA
+  -> /api/v1/admin/quality/jobs         inspect job status and result
+  -> /api/v1/episodes/:id/quality-*     rerun or manual decision
+
+Keystone API
+  -> MySQL: quality_batches, quality_jobs
+  -> MySQL: episodes.qa_status, qa_score, quality_flag
+  -> MinIO: episode MCAP
+
+Keystone built-in quality worker
+  -> claims pending quality_jobs
+  -> reads MCAP leading 8 bytes and trailing 8 bytes
+  -> runs builtin_mcap_preview_smoke_check.py
+  -> updates quality_jobs and episodes</code></pre>
+        <p>
+          简化版不新增独立部署进程，worker 随 Keystone 启动。默认并发为 <code>1</code>，避免首发时脚本执行挤占过多机器资源。
+        </p>
+      </div>
+    </section>
+
+    <section id="data-model">
+      <h2>5. 数据模型</h2>
+      <p>新增 2 张表即可支撑批量触发和执行记录。脚本不落库、不上传，脚本 key、version、SHA 由 Keystone 代码常量提供。人工复核继续复用已有 <code>inspections</code> 表。</p>
+
+      <h3>quality_batches</h3>
+      <pre><code>id
+script_key              builtin_mcap_preview_smoke_check
+script_version          e.g. 2026.06.01
+script_sha256           sha256 of embedded script content
+trigger_type            all | filtered | episode | auto_episode
+triggered_by            admin username or system
+filter_json             actual filters used to enqueue jobs
+status                  pending | running | completed
+total_count
+pending_count
+running_count
+succeeded_count
+failed_count
+created_at
+completed_at</code></pre>
+      <p>批次只用于 Synapse 展示进度。统计值可以由 <code>quality_jobs</code> 聚合后回写，首发也可以查询时实时计算。</p>
+
+      <h3>quality_jobs</h3>
+      <pre><code>id
+batch_id
+episode_id
+script_key
+script_version
+script_sha256
+status                  pending | running | succeeded | failed | timeout | invalid_result
+decision                passed | rejected | uncertain
+score
+summary
+result_json
+stdout_excerpt
+stderr_excerpt
+error_message
+duration_ms
+locked_at
+started_at
+finished_at
+created_at
+updated_at</code></pre>
+      <p>
+        <code>script_key</code>、<code>script_version</code> 和 <code>script_sha256</code> 是执行快照。即使后续升级内置脚本，历史 job 也能看出当时实际使用的检查逻辑。
+      </p>
+    </section>
+
+    <section id="status">
+      <h2>6. 状态规则</h2>
+      <table>
+        <thead>
+          <tr>
+            <th>脚本/执行结果</th>
+            <th>Episode QA 状态</th>
+            <th>说明</th>
+          </tr>
+        </thead>
+        <tbody>
+          <tr>
+            <td>job 创建或运行中</td>
+            <td><code>qa_running</code></td>
+            <td>用于在列表和详情页提示正在质检。</td>
+          </tr>
+          <tr>
+            <td><code>decision = passed</code></td>
+            <td><code>approved</code></td>
+            <td>同时写 <code>auto_approved = true</code>，可进入云同步。</td>
+          </tr>
+          <tr>
+            <td><code>decision = rejected</code></td>
+            <td><code>rejected</code></td>
+            <td>数据不可用，云同步不放行。</td>
+          </tr>
+          <tr>
+            <td><code>decision = uncertain</code></td>
+            <td><code>needs_inspection</code></td>
+            <td>脚本无法可靠判断，交给人工复核。</td>
+          </tr>
+          <tr>
+            <td><code>failed</code> / <code>timeout</code> / <code>invalid_result</code></td>
+            <td><code>needs_inspection</code></td>
+            <td>执行失败不等于数据坏，只要求人工复核。</td>
+          </tr>
+          <tr>
+            <td>人工通过</td>
+            <td><code>inspector_approved</code></td>
+            <td>写入 <code>inspections</code>，可进入云同步。</td>
+          </tr>
+          <tr>
+            <td>人工驳回</td>
+            <td><code>rejected</code></td>
+            <td>写入 <code>inspections</code>，不进入云同步。</td>
+          </tr>
+        </tbody>
+      </table>
+      <p class="note warning">
+        对已经云同步的 Episode 重新质检时，只更新 Keystone 本地 QA 状态和质检记录，不自动删除或撤回云端对象。Synapse 在“包含已同步数据”开关旁提示这个风险。
+      </p>
+    </section>
+
+    <section id="default-script">
+      <h2>7. 内置脚本</h2>
+      <div class="band">
+        <p>
+          首发固定脚本为 <code>builtin_mcap_preview_smoke_check</code>。它不是完整 MCAP 解析器，而是预览 smoke check：先检查对象是否能拿到有效 size 和必要字节范围，再用边界 magic 检查快速拦截数据预览中出现的这类错误：
+        </p>
+        <pre><code>Expected MCAP magic '89 4d 43 41 50 30 0d 0a',
+found '2f 06 84 5c 5b ea dc 8b' [library=libmcap 2.1.2]</code></pre>
+        <p>
+          MCAP magic 是 8 字节：<code>89 4d 43 41 50 30 0d 0a</code>。MCAP 文件开头有一次 magic，文件结尾也有一次 trailing magic。
+          Synapse 预览使用的 <code>@mcap/core</code> 会在初始化时先检查开头 magic，再读取 Header，随后检查结尾 trailing magic。
+        </p>
+        <p class="note warning">
+          因为错误里已经带了 <code>[library=libmcap 2.1.2]</code>，说明 Header 大概率已经读成功；这种情况下更可疑的是文件结尾 trailing magic 不匹配，而不是文件开头不匹配。所以默认脚本必须同时检查开头和结尾，不能只查开头。
+        </p>
+        <p>
+          实现时优先使用对象存储 range read：读取 offset <code>0..7</code> 和 <code>size-8..size-1</code> 即可发现这类 magic mismatch。只有当前 S3 client 不方便做 range read 时，才退回下载到临时文件后检查边界字节。
+          边界 magic 匹配只能说明这个 smoke check 通过，不代表 MCAP 内部索引、chunk、CRC 或压缩数据一定可读。
+        </p>
+        <p class="note warning">
+          <code>Failed to fetch size: 416 Requested Range Not Satisfiable</code> 也属于这个 smoke check 的覆盖范围。MinIO UI 里 size 显示 <code>-</code> 时，通常表示前端拿不到普通文件对象的有效大小，或者当前路径不是一个可按字节范围读取的 MCAP 对象。该问题发生在 magic 检查之前，应记录为对象读取/size 异常。
+        </p>
+
+        <h3>判定规则</h3>
+        <table>
+          <thead>
+            <tr>
+              <th>检查项</th>
+              <th>结果</th>
+              <th>Episode QA 状态</th>
+            </tr>
+          </thead>
+          <tbody>
+            <tr>
+              <td>无法获取对象 size，或 size 为空 / 未知 / 非数字（由网络超时、权限错误或 MinIO 临时错误导致的除外，按 <code>status = failed</code> 处理）</td>
+              <td><code>decision = rejected</code></td>
+              <td><code>rejected</code></td>
+            </tr>
+            <tr>
+              <td>文件大小小于 16 字节</td>
+              <td><code>decision = rejected</code></td>
+              <td><code>rejected</code></td>
+            </tr>
+            <tr>
+              <td>读取开头或结尾 range 返回 <code>416 Requested Range Not Satisfiable</code></td>
+              <td><code>decision = rejected</code></td>
+              <td><code>rejected</code></td>
+            </tr>
+            <tr>
+              <td>开头 8 字节不是 MCAP magic</td>
+              <td><code>decision = rejected</code></td>
+              <td><code>rejected</code></td>
+            </tr>
+            <tr>
+              <td>结尾 8 字节不是 MCAP magic</td>
+              <td><code>decision = rejected</code></td>
+              <td><code>rejected</code></td>
+            </tr>
+            <tr>
+              <td>开头和结尾 magic 都匹配</td>
+              <td><code>decision = passed</code></td>
+              <td><code>approved</code>，含义是通过当前内置 smoke check</td>
+            </tr>
+            <tr>
+              <td>对象网络超时、权限错误、MinIO 临时错误、脚本异常</td>
+              <td><code>status = failed</code></td>
+              <td><code>needs_inspection</code></td>
+            </tr>
+          </tbody>
+        </table>
+
+        <h3>默认输出示例</h3>
+        <pre><code>{
+  "decision": "rejected",
+  "score": 0.0,
+  "summary": "MCAP object range is not satisfiable",
+  "findings": [
+    {
+      "severity": "error",
+      "code": "mcap_range_not_satisfiable",
+      "message": "Failed to read MCAP boundary bytes: 416 Requested Range Not Satisfiable",
+      "http_status": 416,
+      "hint": "MinIO object size is unavailable or the path is not a readable MCAP object"
+    }
+  ]
+}</code></pre>
+        <pre><code>{
+  "decision": "rejected",
+  "score": 0.0,
+  "summary": "MCAP trailing magic mismatch",
+  "findings": [
+    {
+      "severity": "error",
+      "code": "mcap_trailing_magic_mismatch",
+      "message": "Expected trailing magic 89 4d 43 41 50 30 0d 0a, found 2f 06 84 5c 5b ea dc 8b",
+      "expected_hex": "89 4d 43 41 50 30 0d 0a",
+      "actual_hex": "2f 06 84 5c 5b ea dc 8b",
+      "offset": "file_end_minus_8"
+    }
+  ]
+}</code></pre>
+        <p class="note">
+          这个脚本只能证明 MCAP 边界 magic 没有命中已知预览错误，不能证明 MCAP 内部索引、chunk、CRC 或压缩数据一定可读。后续需要更强校验时，再升级为完整 MCAP reader smoke test。
+        </p>
+      </div>
+    </section>
+
+    <section id="script-contract">
+      <h2>8. 脚本契约</h2>
+      <div class="band">
+        <p>首发不做用户上传脚本，内置脚本使用轻量输入 JSON。Keystone worker 先尝试读取 MCAP 对象大小、开头 8 字节和结尾 8 字节，再执行 Python：</p>
+        <pre><code>python3 script.py --input input.json --output result.json</code></pre>
+        <p>脚本必须将业务结果写入 <code>--output</code> 指定的 JSON 文件。stdout / stderr 只作为诊断日志保存。</p>
+        <h3>输入文件</h3>
+        <pre><code>{
+  "episode_id": 42,
+  "mcap_path": "bucket/path/to/file.mcap",
+  "file_size_bytes": 123456789,
+  "object_status": "readable",
+  "object_error": null,
+  "expected_magic_hex": "89 4d 43 41 50 30 0d 0a",
+  "leading_magic_hex": "89 4d 43 41 50 30 0d 0a",
+  "trailing_magic_hex": "2f 06 84 5c 5b ea dc 8b"
+}</code></pre>
+        <p>如果 worker 在获取 size 或 range read 时已经失败，也仍然生成输入 JSON 交给内置脚本输出标准化结果：</p>
+        <pre><code>{
+  "episode_id": 42,
+  "mcap_path": "bucket/path/to/file.mcap",
+  "file_size_bytes": null,
+  "object_status": "range_not_satisfiable",
+  "object_error": "416 Requested Range Not Satisfiable",
+  "expected_magic_hex": "89 4d 43 41 50 30 0d 0a",
+  "leading_magic_hex": null,
+  "trailing_magic_hex": null
+}</code></pre>
+        <h3>最小输出</h3>
+        <pre><code>{
+  "decision": "passed",
+  "score": 1.0,
+  "summary": "ok",
+  "findings": []
+}</code></pre>
+        <h3>字段规则</h3>
+        <table>
+          <thead>
+            <tr>
+              <th>字段</th>
+              <th>规则</th>
+            </tr>
+          </thead>
+          <tbody>
+            <tr>
+              <td><code>decision</code></td>
+              <td>必填，只允许 <code>passed</code>、<code>rejected</code>、<code>uncertain</code>。</td>
+            </tr>
+            <tr>
+              <td><code>score</code></td>
+              <td>可选，0 到 1，写入 <code>episodes.qa_score</code>。</td>
+            </tr>
+            <tr>
+              <td><code>summary</code></td>
+              <td>可选字符串，写入 job 摘要；当 rejected / uncertain 时同步到 <code>episodes.quality_flag</code>。</td>
+            </tr>
+            <tr>
+              <td><code>findings</code></td>
+              <td>可选数组，完整保存在 <code>quality_jobs.result_json</code>。</td>
+            </tr>
+          </tbody>
+        </table>
+        <ul class="tag-list">
+          <li>内置脚本</li>
+          <li>默认超时 30 秒</li>
+          <li>默认并发 1</li>
+          <li>不安装依赖</li>
+          <li>不暴露数据库或 MinIO 凭证</li>
+        </ul>
+      </div>
+    </section>
+
+    <section id="api">
+      <h2>9. API 草案</h2>
+      <div class="grid">
+        <article class="panel">
+          <h3>质检中心</h3>
+          <pre><code>GET  /api/v1/admin/quality/overview
+GET  /api/v1/admin/quality/batches
+POST /api/v1/admin/quality/batches
+GET  /api/v1/admin/quality/jobs</code></pre>
+        </article>
+        <article class="panel">
+          <h3>Episode 质检操作</h3>
+          <pre><code>GET  /api/v1/episodes/:id/quality-jobs
+POST /api/v1/episodes/:id/quality-rerun
+POST /api/v1/episodes/:id/quality-decision</code></pre>
+        </article>
+      </div>
+
+      <h3>质检中心概览</h3>
+      <pre><code>GET /api/v1/admin/quality/overview
+
+{
+  "script": {
+    "key": "builtin_mcap_preview_smoke_check",
+    "version": "2026.06.01",
+    "expected_magic_hex": "89 4d 43 41 50 30 0d 0a",
+    "checks": ["object_size", "range_read", "leading_magic", "trailing_magic"]
+  },
+  "summary": {
+    "total": 1200,
+    "pending_qa": 20,
+    "qa_running": 4,
+    "approved": 1100,
+    "needs_inspection": 30,
+    "rejected": 46
+  }
+}</code></pre>
+
+      <h3>触发全量质检</h3>
+      <pre><code>POST /api/v1/admin/quality/batches
+Content-Type: application/json
+
+{
+  "scope": "all",
+  "include_cloud_synced": true
+}</code></pre>
+
+      <h3>触发筛选质检</h3>
+      <pre><code>POST /api/v1/admin/quality/batches
+Content-Type: application/json
+
+{
+  "scope": "filtered",
+  "filters": {
+    "qa_status": ["pending_qa", "needs_inspection"],
+    "created_at_from": "2026-06-01T00:00:00Z",
+    "created_at_to": "2026-06-02T00:00:00Z",
+    "robot_device_id": "robot-001",
+    "collector_operator_id": "collector-001"
+  }
+}</code></pre>
+
+      <h3>人工复核</h3>
+      <pre><code>POST /api/v1/episodes/:id/quality-decision
+Content-Type: application/json
+
+{
+  "decision": "approved",
+  "reason": "人工预览 MCAP 后确认可用"
+}</code></pre>
+      <p>所有 <code>/admin/quality/*</code> 和写操作首发只开放给 admin。</p>
+    </section>
+
+    <section id="synapse">
+      <h2>10. Synapse 页面</h2>
+      <div class="three-grid">
+        <article class="panel">
+          <h3>导航入口</h3>
+          <ul>
+            <li>在 <code>AdminSidebar.vue</code> 的“数据运维”分组新增 <code>质检中心</code>。</li>
+            <li>新增路由 <code>/admin/quality</code>，路由名 <code>AdminQualityCenter</code>。</li>
+            <li>页面文件建议为 <code>views/admin/quality/QualityCenter.vue</code>。</li>
+          </ul>
+        </article>
+        <article class="panel">
+          <h3>质检中心首屏</h3>
+          <ul>
+            <li>顶部汇总：总数据、待质检、质检中、已通过、需复核、已驳回。</li>
+            <li>脚本卡片：内置脚本 key、version、检查项、预期 magic、默认超时。</li>
+            <li>操作区：质检全部数据、质检筛选结果。</li>
+            <li>筛选项复用数据生产统计页面口径：QA 状态、时间范围、设备、采集员。</li>
+          </ul>
+        </article>
+        <article class="panel">
+          <h3>列表与详情</h3>
+          <ul>
+            <li>批次列表展示发起人、范围、总数、完成数、失败数、创建时间。</li>
+            <li>job 列表展示 Episode、状态、decision、score、summary、耗时。</li>
+            <li>点击 Episode 跳转现有 Episode 详情页。</li>
+            <li>Episode 详情页新增“脚本质检”卡片：最近 job、result JSON、重跑、人工通过/驳回。</li>
+          </ul>
+        </article>
+      </div>
+      <p class="note">
+        首发页面可以复用现有 <code>ListPageLayout</code>、<code>DataTable</code>、<code>Modal</code>、<code>BaseInput</code>、<code>BaseSelect</code> 和 <code>BaseTextarea</code>，不新建设计系统组件。
+      </p>
+    </section>
+
+    <section id="implementation">
+      <h2>11. 实施顺序</h2>
+      <ol>
+        <li>Keystone 新增迁移：<code>quality_batches</code>、<code>quality_jobs</code>。</li>
+        <li>Keystone 新增内置脚本文件或嵌入式脚本常量：<code>builtin_mcap_preview_smoke_check.py</code>。</li>
+        <li>Keystone 新增 <code>QualityHandler</code>：overview、batch enqueue、job 列表、Episode 重跑和人工决策。</li>
+        <li>Keystone 新增内置 quality worker：领取 pending job、执行内置脚本、落库结果、更新 Episode QA 状态。</li>
+        <li>上传完成路径增加自动入队：当 <code>QUALITY_AUTO_RUN_ON_UPLOAD=true</code> 时创建单 Episode job。</li>
+        <li>Synapse 新增 <code>api/quality.js</code>、<code>QualityCenter.vue</code>、路由和侧边栏入口。</li>
+        <li>Synapse Episode 详情页新增脚本质检卡片，接入重跑和人工通过/驳回。</li>
+        <li>验证：全量触发、筛选触发、size 不可用、range 416、开头 magic 异常、结尾 magic 异常、文件过小、执行异常、云同步资格。</li>
+      </ol>
+
+      <h3>首发验收标准</h3>
+      <div class="band">
+        <ul>
+          <li>管理员能在“质检中心”看到内置脚本 <code>builtin_mcap_preview_smoke_check</code> 的说明。</li>
+          <li>管理员能点击一次对全部 Episode 创建质检任务。</li>
+          <li>管理员能按筛选条件只质检一部分 Episode。</li>
+          <li>对象 size 不可用或 MinIO 显示 size 为 <code>-</code> 时 Episode 变为 <code>rejected</code>，finding 写明 size 异常。</li>
+          <li>读取边界字节返回 <code>416 Requested Range Not Satisfiable</code> 时 Episode 变为 <code>rejected</code>，finding 写明 range 异常。</li>
+          <li>开头和结尾 magic 都正确时 Episode 变为 <code>approved</code>，表示当前内置 smoke check 通过。</li>
+          <li>开头 magic 不匹配时 Episode 变为 <code>rejected</code>，finding 写明实际开头 8 字节。</li>
+          <li>结尾 magic 不匹配时 Episode 变为 <code>rejected</code>，finding 写明实际结尾 8 字节。</li>
+          <li>脚本超时、脚本异常，或对象读取遇到网络超时、权限错误、MinIO 临时错误时，Episode 变为 <code>needs_inspection</code>；size 不可用和 range 416 不属于此类，按上面的规则记为 <code>rejected</code>。</li>
+          <li>Episode 详情能看到最近一次脚本执行结果，并能人工通过或驳回。</li>
+          <li>云同步继续只允许 <code>approved</code> 和 <code>inspector_approved</code>。</li>
+        </ul>
+      </div>
+    </section>
+  </main>
+</body>
+</html>
diff --git a/docs/designs/data-quality-script-management.zh.html b/docs/designs/data-quality-script-management.zh.html
new file mode 100755
index 0000000..c58ae10
--- /dev/null
+++ b/docs/designs/data-quality-script-management.zh.html
@@ -0,0 +1,878 @@
+<!--
+SPDX-FileCopyrightText: 2026 ArcheBase
+
+SPDX-License-Identifier: MulanPSL-2.0
+-->
+<!doctype html>
+<html lang="zh-CN">
+<head>
+  <meta charset="utf-8">
+  <meta name="viewport" content="width=device-width, initial-scale=1">
+  <title>数据质检脚本管理设计</title>
+  <style>
+    :root {
+      color-scheme: light;
+      --ink: #15211c;
+      --muted: #5f6d66;
+      --line: #d7e0da;
+      --paper: #ffffff;
+      --surface: #f6f9f6;
+      --surface-strong: #edf4ef;
+      --accent: #146c5f;
+      --accent-strong: #0b4f47;
+      --warn: #9b4d16;
+      --danger: #a8322a;
+      --code-bg: #13211d;
+      --code-ink: #edf7f2;
+      --shadow: 0 18px 48px rgba(22, 41, 33, 0.09);
+    }
+
+    * {
+      box-sizing: border-box;
+    }
+
+    body {
+      margin: 0;
+      color: var(--ink);
+      background:
+        linear-gradient(135deg, rgba(20, 108, 95, 0.13), transparent 36%),
+        linear-gradient(180deg, #f8fbf8 0%, #ffffff 42%);
+      font: 16px/1.72 "Noto Sans SC", "Source Han Sans SC", "Microsoft YaHei", sans-serif;
+    }
+
+    main {
+      max-width: 1160px;
+      margin: 0 auto;
+      padding: 48px 24px 80px;
+    }
+
+    header {
+      display: grid;
+      grid-template-columns: minmax(0, 1fr) 280px;
+      gap: 28px;
+      align-items: end;
+      padding: 34px;
+      border: 1px solid var(--line);
+      border-radius: 18px;
+      background: rgba(255, 255, 255, 0.86);
+      box-shadow: var(--shadow);
+      backdrop-filter: blur(8px);
+    }
+
+    .eyebrow {
+      margin: 0 0 10px;
+      color: var(--accent-strong);
+      font-size: 13px;
+      font-weight: 700;
+      letter-spacing: 0;
+      text-transform: uppercase;
+    }
+
+    h1 {
+      margin: 0;
+      font-size: clamp(30px, 5vw, 48px);
+      line-height: 1.12;
+      letter-spacing: 0;
+    }
+
+    .lead {
+      max-width: 760px;
+      margin: 18px 0 0;
+      color: var(--muted);
+      font-size: 18px;
+    }
+
+    .header-meta {
+      border-left: 4px solid var(--accent);
+      padding-left: 16px;
+      color: var(--muted);
+      font-size: 14px;
+    }
+
+    .header-meta strong {
+      display: block;
+      color: var(--ink);
+      font-size: 16px;
+      margin-bottom: 8px;
+    }
+
+    nav {
+      display: flex;
+      flex-wrap: wrap;
+      gap: 10px;
+      margin: 24px 0 6px;
+    }
+
+    nav a {
+      color: var(--accent-strong);
+      background: var(--surface-strong);
+      border: 1px solid #c8ded4;
+      border-radius: 999px;
+      padding: 8px 12px;
+      text-decoration: none;
+      font-size: 14px;
+    }
+
+    section {
+      margin-top: 38px;
+    }
+
+    h2 {
+      margin: 0 0 14px;
+      font-size: 26px;
+      line-height: 1.25;
+      letter-spacing: 0;
+    }
+
+    h3 {
+      margin: 26px 0 10px;
+      font-size: 19px;
+      line-height: 1.35;
+    }
+
+    p {
+      margin: 10px 0;
+    }
+
+    ul,
+    ol {
+      padding-left: 24px;
+    }
+
+    li {
+      margin: 7px 0;
+    }
+
+    .band {
+      padding: 24px;
+      border: 1px solid var(--line);
+      border-radius: 14px;
+      background: var(--paper);
+      box-shadow: 0 12px 30px rgba(20, 43, 35, 0.05);
+    }
+
+    .grid {
+      display: grid;
+      grid-template-columns: repeat(2, minmax(0, 1fr));
+      gap: 18px;
+    }
+
+    .three-grid {
+      display: grid;
+      grid-template-columns: repeat(3, minmax(0, 1fr));
+      gap: 14px;
+    }
+
+    .panel {
+      padding: 18px;
+      border: 1px solid var(--line);
+      border-radius: 12px;
+      background: var(--surface);
+    }
+
+    .panel h3,
+    .panel h4 {
+      margin-top: 0;
+    }
+
+    .ok {
+      border-color: #b8d9cb;
+      background: #f2faf5;
+    }
+
+    .no {
+      border-color: #ead1c0;
+      background: #fff7f1;
+    }
+
+    .note {
+      border-left: 4px solid var(--accent);
+      padding: 12px 16px;
+      background: #f2faf6;
+      color: var(--muted);
+    }
+
+    .warning {
+      border-left-color: var(--warn);
+      background: #fff8ee;
+    }
+
+    table {
+      width: 100%;
+      border-collapse: collapse;
+      margin: 14px 0 22px;
+      background: var(--paper);
+    }
+
+    th,
+    td {
+      border: 1px solid var(--line);
+      padding: 10px 12px;
+      text-align: left;
+      vertical-align: top;
+    }
+
+    th {
+      background: var(--surface-strong);
+      font-weight: 700;
+    }
+
+    code {
+      font-family: "JetBrains Mono", "SFMono-Regular", Consolas, monospace;
+      background: #e7f2ec;
+      color: #12382f;
+      border-radius: 5px;
+      padding: 1px 5px;
+      font-size: 0.94em;
+    }
+
+    pre {
+      overflow-x: auto;
+      margin: 14px 0 20px;
+      padding: 16px;
+      color: var(--code-ink);
+      background: var(--code-bg);
+      border-radius: 12px;
+      line-height: 1.55;
+    }
+
+    pre code {
+      color: inherit;
+      background: transparent;
+      padding: 0;
+    }
+
+    .tag-list {
+      display: flex;
+      flex-wrap: wrap;
+      gap: 8px;
+      padding: 0;
+      list-style: none;
+    }
+
+    .tag-list li {
+      margin: 0;
+      padding: 6px 10px;
+      border: 1px solid #c8ded4;
+      border-radius: 999px;
+      background: var(--surface-strong);
+      color: var(--accent-strong);
+      font-size: 13px;
+      font-weight: 700;
+    }
+
+    .flow {
+      display: grid;
+      gap: 10px;
+      margin: 14px 0 4px;
+    }
+
+    .flow-step {
+      display: grid;
+      grid-template-columns: 28px minmax(0, 1fr);
+      gap: 12px;
+      align-items: start;
+      padding: 12px;
+      border: 1px solid var(--line);
+      border-radius: 10px;
+      background: var(--paper);
+    }
+
+    .flow-step span {
+      display: inline-flex;
+      width: 28px;
+      height: 28px;
+      align-items: center;
+      justify-content: center;
+      border-radius: 50%;
+      color: white;
+      background: var(--accent);
+      font-size: 13px;
+      font-weight: 800;
+    }
+
+    @media (max-width: 860px) {
+      main {
+        padding: 24px 16px 56px;
+      }
+
+      header,
+      .grid,
+      .three-grid {
+        grid-template-columns: 1fr;
+      }
+
+      header {
+        padding: 24px;
+      }
+    }
+  </style>
+</head>
+<body>
+  <main>
+    <header>
+      <div>
+        <p class="eyebrow">Keystone / Synapse Phase 1</p>
+        <h1>数据质检脚本管理设计</h1>
+        <p class="lead">
+          第一版只做一个可落地的最小闭环：上传 Python 质检脚本，Episode 入库后自动触发独立 Runner 执行，结果写回 QA 状态，并由 QA 状态控制云同步资格。
+        </p>
+      </div>
+      <aside class="header-meta">
+        <strong>设计边界</strong>
+        不做补采，不回退任务，不改变订单、批次、任务的生产状态机。质检只影响 Episode 的数据可用性。
+      </aside>
+    </header>
+
+    <nav aria-label="目录">
+      <a href="#scope">一期范围</a>
+      <a href="#implemented">已实现基础</a>
+      <a href="#todo">待实现模块</a>
+      <a href="#architecture">架构</a>
+      <a href="#qa-status">QA 状态</a>
+      <a href="#versioning">脚本版本</a>
+      <a href="#data-model">数据模型</a>
+      <a href="#flow">触发与结算</a>
+      <a href="#contract">脚本契约</a>
+      <a href="#runner">Runner</a>
+      <a href="#override">人工覆盖</a>
+      <a href="#api">API</a>
+      <a href="#synapse">Synapse</a>
+      <a href="#runtime">Runtime</a>
+    </nav>
+
+    <section id="scope">
+      <h2>1. 一期范围</h2>
+      <div class="grid">
+        <article class="panel ok">
+          <h3>包含</h3>
+          <ul>
+            <li>只支持 Python 脚本。</li>
+            <li>上传单个 <code>.py</code> 文件，最大 1 MB。</li>
+            <li>脚本文件存 MinIO，元数据和执行记录存 MySQL。</li>
+            <li>脚本版本不可变，上传后默认不激活。</li>
+            <li>管理员显式激活版本；同一脚本只允许一个 active 版本。</li>
+            <li>触发范围支持 <code>global</code> 和 <code>sop</code>。</li>
+            <li>Episode 新建后自动触发质检。</li>
+            <li>支持单个 Episode 手动重跑。</li>
+            <li>支持从 <code>needs_inspection</code> 或 <code>rejected</code> 人工覆盖。</li>
+            <li>固定 Runner Runtime：<code>python3.11-mcap</code>。</li>
+          </ul>
+        </article>
+        <article class="panel no">
+          <h3>不包含</h3>
+          <ul>
+            <li>不做 UI 在线代码编辑器。</li>
+            <li>不支持 ZIP、多文件脚本包、Git 脚本源。</li>
+            <li>不支持脚本自带依赖安装。</li>
+            <li>不做激活前 test run。</li>
+            <li>不做上传时 Python 语法校验。</li>
+            <li>不做 job cancel、自动 retry、历史批量回扫。</li>
+            <li>不建批次 QA 汇总表。</li>
+            <li>不做补采批次，不做任务回退。</li>
+            <li>不改 <code>tasks.status</code>、<code>batches.status</code>、<code>orders.status</code>。</li>
+          </ul>
+        </article>
+      </div>
+    </section>
+
+    <section id="implemented">
+      <h2>2. 已实现基础</h2>
+      <p>以下能力已存在于当前 Keystone / Synapse 体系中，是质检脚本管理的一期实现基础，不需要从零建设。</p>
+      <div class="grid">
+        <article class="panel ok">
+          <h3>Keystone 已有基础</h3>
+          <ul>
+            <li>任务、批次、订单、Episode 等生产数据模型。</li>
+            <li>上传完成后创建 Episode，并保存 MCAP / sidecar 的对象路径。</li>
+            <li>Episode 已有 <code>qa_status</code>、<code>auto_approved</code>、<code>quality_flag</code> 等质检相关字段。</li>
+            <li>MinIO / S3 存储接入能力，可保存 MCAP、sidecar 和后续脚本 artifact。</li>
+            <li>云同步 worker 已按 <code>approved</code> / <code>inspector_approved</code> 过滤可同步 Episode。</li>
+            <li>JWT 鉴权和 admin / data_collector 角色基础。</li>
+          </ul>
+        </article>
+        <article class="panel ok">
+          <h3>Synapse 已有基础</h3>
+          <ul>
+            <li>Admin 管理后台布局、导航和 CRUD 页面模式。</li>
+            <li>通用 API client、分页列表、表单、弹窗和确认对话框组件。</li>
+            <li>Episode 详情页和数据预览能力。</li>
+            <li>任务、批次、统计、云同步等后台页面，可接入 QA 状态展示。</li>
+            <li>已有数据生产统计页面，可继续按 <code>episodes.qa_status</code> 聚合。</li>
+          </ul>
+        </article>
+      </div>
+    </section>
+
+    <section id="todo">
+      <h2>3. 待实现模块</h2>
+      <p>一期需要新增的是质检脚本管理和独立 Runner 执行闭环，生产主状态机不纳入本次改造。</p>
+      <div class="three-grid">
+        <article class="panel">
+          <h3>Keystone 后端</h3>
+          <ul>
+            <li>新增 <code>quality_scripts</code>、<code>quality_script_versions</code>、<code>quality_runs</code>、<code>quality_run_jobs</code>、<code>quality_overrides</code> 表。</li>
+            <li>新增脚本管理、版本上传、激活/停用、run/job 查询 API。</li>
+            <li>在 Episode 创建后匹配 active 脚本并创建质检 run/jobs。</li>
+            <li>新增手动重跑和人工覆盖 API。</li>
+            <li>调整上传完成路径：有脚本时进入 <code>qa_running</code>，无脚本时自动 <code>approved</code>。</li>
+          </ul>
+        </article>
+        <article class="panel">
+          <h3>Quality Runner</h3>
+          <ul>
+            <li>新增独立命令或服务 <code>keystone-quality-runner</code>。</li>
+            <li>从 MySQL 领取 <code>quality_run_jobs</code>。</li>
+            <li>从 MinIO 下载 MCAP、sidecar 和 Python 脚本。</li>
+            <li>按固定命令执行 Python 子进程并限制超时。</li>
+            <li>写回 job 结果，并在 run 完成后结算 Episode QA 状态。</li>
+          </ul>
+        </article>
+        <article class="panel">
+          <h3>Synapse 前端</h3>
+          <ul>
+            <li>新增“数据质检”后台入口。</li>
+            <li>脚本列表、脚本元数据表单、版本上传和激活/停用页面。</li>
+            <li>run/job 列表和结果详情。</li>
+            <li>Episode 详情页增加 QA 面板、手动重跑和人工覆盖入口。</li>
+            <li>批次、统计、云同步相关页面展示有效 QA 状态。</li>
+          </ul>
+        </article>
+      </div>
+    </section>
+
+    <section id="architecture">
+      <h2>4. 架构</h2>
+      <div class="band">
+        <pre><code>Synapse Admin
+  -> Keystone REST API
+      -> MySQL: scripts, versions, runs, jobs, overrides
+      -> MinIO: uploaded script files
+
+Keystone upload_complete
+  -> creates episode
+  -> matches active quality script versions
+  -> creates quality run and jobs
+
+keystone-quality-runner
+  -> polls MySQL quality_run_jobs
+  -> downloads MCAP, sidecar, and script from MinIO
+  -> runs Python script in a child process
+  -> writes job result
+  -> settles the parent quality run
+  -> updates episode QA status</code></pre>
+        <p>
+          Runner 是独立进程，不放在 Keystone API 进程内执行 Python。推荐同仓库、同发布包，但运行成两个命令：
+          <code>keystone-edge</code> 和 <code>keystone-quality-runner</code>。
+        </p>
+      </div>
+    </section>
+
+    <section id="qa-status">
+      <h2>5. Episode QA 状态</h2>
+      <table>
+        <thead>
+          <tr>
+            <th>状态</th>
+            <th>含义</th>
+          </tr>
+        </thead>
+        <tbody>
+          <tr>
+            <td><code>pending_qa</code></td>
+            <td>Episode 已创建，但质检还没有开始。</td>
+          </tr>
+          <tr>
+            <td><code>qa_running</code></td>
+            <td>当前质检轮次仍有 pending 或 running job。</td>
+          </tr>
+          <tr>
+            <td><code>approved</code></td>
+            <td>所有匹配脚本都通过，或没有任何匹配脚本。</td>
+          </tr>
+          <tr>
+            <td><code>needs_inspection</code></td>
+            <td>脚本异常、超时、输出非法，或脚本返回 <code>uncertain</code>。</td>
+          </tr>
+          <tr>
+            <td><code>inspector_approved</code></td>
+            <td>管理员人工确认通过。</td>
+          </tr>
+          <tr>
+            <td><code>rejected</code></td>
+            <td>脚本或管理员明确驳回该数据。</td>
+          </tr>
+        </tbody>
+      </table>
+      <p class="note warning">
+        一期质检系统不写 <code>episodes.qa_status = failed</code>。执行失败不等于数据坏，统一进入 <code>needs_inspection</code>。
+      </p>
+      <p>云同步资格仍只认 <code>approved</code> 和 <code>inspector_approved</code>。</p>
+    </section>
+
+    <section id="versioning">
+      <h2>6. 脚本版本规则</h2>
+      <div class="band">
+        <ul>
+          <li>上传只接受一个 <code>.py</code> 文件，空文件拒绝，最大 1 MB。</li>
+          <li>版本号必须是 SemVer：<code>MAJOR.MINOR.PATCH</code>，不带 <code>v</code> 前缀。</li>
+          <li>Keystone 计算并保存 SHA256。</li>
+          <li>MinIO 路径使用 <code>slug/version/sha256.py</code>，不信任原始文件名。</li>
+          <li>上传版本默认 <code>inactive</code>，不会自动激活。</li>
+          <li>激活版本是单独的 admin 操作。</li>
+          <li>激活一个版本会停用同一脚本的旧 active 版本。</li>
+          <li>已经排队或运行中的 job 继续使用它引用的不可变版本。</li>
+          <li>一期不物理删除脚本版本或 MinIO artifact。</li>
+        </ul>
+        <p>
+          版本同时携带执行策略：<code>language</code>、<code>runtime</code>、<code>timeout_seconds</code>、
+          <code>scope_type</code>、<code>scope_ref_id</code> 和 <code>default_config</code>。修改代码、配置、超时或适用范围都需要上传新版本。
+        </p>
+      </div>
+    </section>
+
+    <section id="data-model">
+      <h2>7. 数据模型</h2>
+      <p>一期使用 5 张质检表，先不拆独立 findings 表；脚本输出里的 findings 存在 <code>quality_run_jobs.result_json</code>。</p>
+
+      <h3>quality_scripts</h3>
+      <pre><code>id
+slug
+name
+description
+status              active | inactive
+created_by
+created_at
+updated_at
+deleted_at</code></pre>
+
+      <h3>quality_script_versions</h3>
+      <pre><code>id
+script_id
+version
+language            python
+runtime             python3.11-mcap
+entrypoint          normalized uploaded filename
+artifact_uri
+artifact_sha256
+artifact_size_bytes
+timeout_seconds
+scope_type          global | sop
+scope_ref_id        null for global, sop id for sop
+default_config      JSON
+status              active | inactive
+created_by
+created_at
+deleted_at</code></pre>
+
+      <h3>quality_runs</h3>
+      <pre><code>id
+episode_id
+trigger_type        auto | manual
+triggered_by        system or admin user id/name
+status              pending | running | completed
+final_qa_status
+settlement_reason
+created_at
+started_at
+completed_at</code></pre>
+      <p>一期内同一个 Episode 同一时间只允许一个未完成的质检轮次（<code>status</code> 为 <code>pending</code> 或 <code>running</code>）；如果该 Episode 还有 pending/running job，手动重跑返回 <code>409 Conflict</code>。</p>
+
+      <h3>quality_run_jobs</h3>
+      <pre><code>id
+quality_run_id
+episode_id
+script_version_id
+status              pending | running | succeeded | failed | timeout | invalid_result
+decision            passed | rejected | uncertain
+runner_id
+locked_at
+started_at
+finished_at
+score
+summary
+result_json         full script output, including findings
+stdout_excerpt
+stderr_excerpt
+error_message
+duration_ms
+created_at
+updated_at</code></pre>
+      <p><code>status = failed</code> 表示 Runner 或 job 执行失败，不表示数据被驳回。</p>
+
+      <h3>quality_overrides</h3>
+      <pre><code>id
+episode_id
+previous_qa_status
+new_qa_status
+decision            approved | rejected
+reason
+operator_id
+operator_name
+created_at</code></pre>
+      <p>人工覆盖只更新 <code>episodes.qa_status</code>，不改历史 <code>quality_runs</code> 或 <code>quality_run_jobs</code>。</p>
+    </section>
+
+    <section id="flow">
+      <h2>8. 自动触发与结算</h2>
+      <div class="flow">
+        <div class="flow-step">
+          <span>1</span>
+          <div>上传完成后 Keystone 创建 Episode，初始写 <code>qa_status = pending_qa</code>。</div>
+        </div>
+        <div class="flow-step">
+          <span>2</span>
+          <div>匹配所有 active <code>global</code> 版本，以及 <code>scope_ref_id</code> 与该 Episode 的 SOP 一致的 active <code>sop</code> 版本。</div>
+        </div>
+        <div class="flow-step">
+          <span>3</span>
+          <div>如果没有匹配脚本，直接写 <code>approved</code> 和 <code>auto_approved = true</code>。</div>
+        </div>
+        <div class="flow-step">
+          <span>4</span>
+          <div>如果有匹配脚本，创建一个 <code>quality_runs</code> 和多条 <code>quality_run_jobs</code>，Episode 进入 <code>qa_running</code>。</div>
+        </div>
+        <div class="flow-step">
+          <span>5</span>
+          <div>所有 job 完成后统一结算；<code>rejected</code> 不短路其他脚本。</div>
+        </div>
+      </div>
+
+      <h3>结算规则</h3>
+      <pre><code>if any job timed out, failed to execute, produced invalid output, or returned uncertain:
+  episode.qa_status = needs_inspection
+else if any job returned rejected:
+  episode.qa_status = rejected
+else:
+  episode.qa_status = approved</code></pre>
+
+      <h3>手动重跑</h3>
+      <ul>
+        <li>仅 admin 可用。</li>
+        <li>使用当前 active 脚本版本。</li>
+        <li>不允许临时覆盖配置。</li>
+        <li>不使用历史轮次里的旧版本。</li>
+        <li>如果 Episode 还有 <code>pending</code> 或 <code>running</code> 状态的 QA job，返回 <code>409 Conflict</code>。</li>
+      </ul>
+    </section>
+
+    <section id="contract">
+      <h2>9. 脚本执行契约</h2>
+      <p>Runner 为每个 job 准备临时目录：</p>
+      <pre><code>input.mcap
+sidecar.json
+config.json
+result.json
+script.py</code></pre>
+      <p>执行命令固定为：</p>
+      <pre><code>python script.py --mcap input.mcap --sidecar sidecar.json --config config.json --output result.json</code></pre>
+      <p>业务结果必须写入 <code>--output</code> 指定的 JSON 文件；<code>stdout</code> 和 <code>stderr</code> 只作为日志保存。</p>
+
+      <h3>最小输出</h3>
+      <pre><code>{
+  "decision": "passed",
+  "score": 1.0,
+  "summary": "ok",
+  "findings": []
+}</code></pre>
+
+      <h3>结果字段</h3>
+      <table>
+        <thead>
+          <tr>
+            <th>字段</th>
+            <th>规则</th>
+          </tr>
+        </thead>
+        <tbody>
+          <tr>
+            <td><code>decision</code></td>
+            <td>必填：<code>passed</code>、<code>rejected</code>、<code>uncertain</code></td>
+          </tr>
+          <tr>
+            <td><code>score</code></td>
+            <td>可选，0 到 1。</td>
+          </tr>
+          <tr>
+            <td><code>summary</code></td>
+            <td>可选字符串。</td>
+          </tr>
+          <tr>
+            <td><code>findings</code></td>
+            <td>可选数组，保存在 <code>result_json</code> 中。</td>
+          </tr>
+          <tr>
+            <td><code>findings[].severity</code></td>
+            <td><code>info</code>、<code>warning</code>、<code>error</code></td>
+          </tr>
+          <tr>
+            <td><code>findings[].message</code></td>
+            <td>必填字符串。</td>
+          </tr>
+        </tbody>
+      </table>
+
+      <ul class="tag-list">
+        <li>passed：脚本接受数据</li>
+        <li>rejected：脚本驳回数据</li>
+        <li>uncertain：脚本无法可靠判断</li>
+      </ul>
+      <p class="note">输出文件缺失、JSON 非法、缺少 <code>decision</code> 或未知 <code>decision</code>，job 状态写为 <code>invalid_result</code>。</p>
+    </section>
+
+    <section id="runner">
+      <h2>10. Runner 行为</h2>
+      <div class="band">
+        <ul>
+          <li>轮询 MySQL <code>quality_run_jobs</code>。</li>
+          <li>用事务和行锁领取 job。</li>
+          <li>用 <code>runner_id</code> 和 <code>locked_at</code> 标识所有权。</li>
+          <li>直接从 MinIO 下载 MCAP、sidecar 和脚本文件。</li>
+          <li>以子进程执行 Python。</li>
+          <li>按脚本版本的 <code>timeout_seconds</code> 对子进程做超时控制。</li>
+          <li>截断保存 stdout/stderr。</li>
+          <li>不把数据库或 MinIO 凭证传给脚本进程。</li>
+          <li>不自动 retry。</li>
+          <li>长时间停留在 <code>running</code> 的 job，若运行时间超过 <code>timeout_seconds</code> 加宽限期，视为执行失败。</li>
+        </ul>
+        <p>建议配置项：<code>QUALITY_RUNNER_CONCURRENCY</code>、<code>QUALITY_RUNNER_POLL_INTERVAL_SECONDS</code>。</p>
+      </div>
+    </section>
+
+    <section id="override">
+      <h2>11. 人工覆盖</h2>
+      <div class="grid">
+        <article class="panel">
+          <h3>允许来源状态</h3>
+          <ul>
+            <li><code>needs_inspection</code></li>
+            <li><code>rejected</code></li>
+          </ul>
+        </article>
+        <article class="panel">
+          <h3>允许目标</h3>
+          <ul>
+            <li>请求 <code>decision = approved</code>：Episode 写为 <code>inspector_approved</code></li>
+            <li>请求 <code>decision = rejected</code>：Episode 写为 <code>rejected</code></li>
+          </ul>
+        </article>
+      </div>
+      <ul>
+        <li><code>reason</code> 必填。</li>
+        <li><code>pending_qa</code> 或 <code>qa_running</code> 时不允许覆盖。</li>
+        <li>覆盖只更新 Episode 的 effective QA 状态。</li>
+        <li>覆盖不改脚本执行历史。</li>
+        <li>覆盖写入 <code>quality_overrides</code>。</li>
+      </ul>
+      <pre><code>POST /api/v1/episodes/:id/quality-override
+Content-Type: application/json
+
+{
+  "decision": "approved",
+  "reason": "manual review confirmed the data is usable"
+}</code></pre>
+    </section>
+
+    <section id="api">
+      <h2>12. API 草案</h2>
+      <div class="grid">
+        <article class="panel">
+          <h3>脚本管理</h3>
+          <pre><code>GET   /api/v1/quality/scripts
+POST  /api/v1/quality/scripts
+GET   /api/v1/quality/scripts/:id
+PATCH /api/v1/quality/scripts/:id
+POST  /api/v1/quality/scripts/:id/versions
+GET   /api/v1/quality/scripts/:id/versions
+POST  /api/v1/quality/script-versions/:id/activate
+POST  /api/v1/quality/script-versions/:id/deactivate</code></pre>
+        </article>
+        <article class="panel">
+          <h3>执行与复核</h3>
+          <pre><code>GET  /api/v1/quality/runs
+GET  /api/v1/quality/jobs
+POST /api/v1/episodes/:id/quality-runs
+POST /api/v1/episodes/:id/quality-override</code></pre>
+        </article>
+      </div>
+      <p>所有脚本管理、手动重跑和人工覆盖 API 一期都只开放给 admin。</p>
+    </section>
+
+    <section id="synapse">
+      <h2>13. Synapse 页面范围</h2>
+      <div class="three-grid">
+        <article class="panel">
+          <h3>脚本管理</h3>
+          <ul>
+            <li>脚本列表。</li>
+            <li>创建和编辑元数据。</li>
+            <li>上传脚本版本。</li>
+            <li>激活和停用版本。</li>
+          </ul>
+        </article>
+        <article class="panel">
+          <h3>执行记录</h3>
+          <ul>
+            <li>run/job 列表。</li>
+            <li>按状态、脚本、Episode 筛选。</li>
+            <li>查看 stdout/stderr 摘要。</li>
+            <li>查看 <code>result_json</code> 中的 findings。</li>
+          </ul>
+        </article>
+        <article class="panel">
+          <h3>Episode 详情</h3>
+          <ul>
+            <li>显示 effective QA 状态。</li>
+            <li>显示最新 run 状态。</li>
+            <li>展示每个脚本 job 结果。</li>
+            <li>支持手动重跑和人工覆盖。</li>
+          </ul>
+        </article>
+      </div>
+      <p class="note">一期不需要在线代码编辑器。</p>
+    </section>
+
+    <section id="batch-stats">
+      <h2>14. 批次和统计查询</h2>
+      <p>一期不保存批次 QA 汇总表。批次详情和统计页面需要时直接聚合 <code>episodes.qa_status</code>。</p>
+      <pre><code>SELECT
+  COUNT(*) AS total,
+  SUM(qa_status = 'approved') AS approved_count,
+  SUM(qa_status = 'inspector_approved') AS inspector_approved_count,
+  SUM(qa_status = 'rejected') AS rejected_count,
+  SUM(qa_status = 'needs_inspection') AS needs_inspection_count,
+  SUM(qa_status = 'qa_running') AS qa_running_count,
+  SUM(qa_status = 'pending_qa') AS pending_qa_count
+FROM episodes
+WHERE batch_id = ? AND deleted_at IS NULL;</code></pre>
+    </section>
+
+    <section id="runtime">
+      <h2>15. 固定 Runtime</h2>
+      <div class="band">
+        <p>一期 Runtime 固定为 <code>python3.11-mcap</code>，脚本不能上传或安装自己的依赖。</p>
+        <h3>初始依赖集合</h3>
+        <ul class="tag-list">
+          <li>mcap</li>
+          <li>numpy</li>
+          <li>pandas</li>
+          <li>Pillow</li>
+          <li>opencv-python-headless</li>
+          <li>pyyaml</li>
+          <li>jsonschema</li>
+        </ul>
+      </div>
+    </section>
+
+    <section id="roadmap">
+      <h2>16. 后续演进</h2>
+      <ul>
+        <li>第二阶段支持不可变 ZIP 包，包含 <code>main.py</code>、<code>requirements.lock</code> 和可选 <code>config.schema.json</code>。</li>
+        <li>第二阶段在执行前构建受控 Python 环境，不在 job 执行时动态安装依赖。</li>
+        <li>第三阶段支持脚本专属容器镜像，用于复杂依赖、模型文件、GPU Runtime 或更强隔离。</li>
+        <li>未来执行器继续复用同一套输入和输出契约。</li>
+      </ul>
+    </section>
+  </main>
+</body>
+</html>
diff --git a/internal/api/handlers/robot.go b/internal/api/handlers/robot.go
index a497a96..94fe64e 100644
--- a/internal/api/handlers/robot.go
+++ b/internal/api/handlers/robot.go
@@ -15,6 +15,8 @@ import (
 	"strconv"
 	"strings"
 	"time"
+	"unicode"
+	"unicode/utf8"
 
 	"archebase.com/keystone-edge/internal/logger"
 	"archebase.com/keystone-edge/internal/services"
@@ -134,6 +136,48 @@ func robotMetadataFromDB(ns sql.NullString) interface{} {
 	return parseJSONRaw(ns.String)
 }
 
+// normalizeAssetID trims raw and validates it as an asset_id. Blank input yields
+// a NULL value; more than 100 runes or any control character yields an error.
+func normalizeAssetID(raw string) (sql.NullString, error) {
+	var none sql.NullString
+	trimmed := strings.TrimSpace(raw)
+	switch {
+	case trimmed == "":
+		return none, nil
+	case utf8.RuneCountInString(trimmed) > 100:
+		return none, fmt.Errorf("asset_id must be at most 100 characters")
+	case strings.IndexFunc(trimmed, unicode.IsControl) >= 0:
+		return none, fmt.Errorf("asset_id must not contain control characters")
+	}
+	return sql.NullString{String: trimmed, Valid: true}, nil
+}
+
+func assetIDValue(ns sql.NullString) string { // trimmed asset_id, or "" when the column is NULL
+	if ns.Valid {
+		return strings.TrimSpace(ns.String)
+	}
+	return ""
+}
+
+// assetIDInUse reports whether a non-deleted robot other than excludeRobotID (ignored when <= 0) holds assetID.
+func (h *RobotHandler) assetIDInUse(assetID string, excludeRobotID int64) (bool, error) {
+	wanted := strings.TrimSpace(assetID)
+	if wanted == "" {
+		return false, nil
+	}
+	params := []interface{}{wanted}
+	stmt := "SELECT EXISTS(SELECT 1 FROM robots WHERE asset_id = ? AND deleted_at IS NULL"
+	if excludeRobotID > 0 {
+		stmt += " AND id <> ?"
+		params = append(params, excludeRobotID)
+	}
+	var found bool
+	if err := h.db.Get(&found, stmt+")", params...); err != nil {
+		return false, err
+	}
+	return found, nil
+}
+
 func (h *RobotHandler) connectionState(deviceID string) (connected bool, connectedAt string) {
 	connected, connectedAt, _, _ = h.connectionStateDetailed(deviceID)
 	return connected, connectedAt
@@ -462,6 +506,11 @@ func (h *RobotHandler) CreateRobot(c *gin.Context) {
 	req.RobotTypeID = strings.TrimSpace(req.RobotTypeID)
 	req.DeviceID = strings.TrimSpace(req.DeviceID)
 	req.FactoryID = strings.TrimSpace(req.FactoryID)
+	assetID, err := normalizeAssetID(req.AssetID)
+	if err != nil {
+		c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()})
+		return
+	}
 
 	if req.RobotTypeID == "" {
 		c.JSON(http.StatusBadRequest, gin.H{"error": "robot_type_id is required"})
@@ -477,6 +526,18 @@ func (h *RobotHandler) CreateRobot(c *gin.Context) {
 		c.JSON(http.StatusBadRequest, gin.H{"error": "factory_id is required"})
 		return
 	}
+	if assetID.Valid {
+		inUse, err := h.assetIDInUse(assetID.String, 0)
+		if err != nil {
+			logger.Printf("[ROBOT] Failed to check asset_id uniqueness: %v", err)
+			c.JSON(http.StatusInternalServerError, gin.H{"error": "failed to create robot"})
+			return
+		}
+		if inUse {
+			c.JSON(http.StatusConflict, gin.H{"error": "asset_id is already assigned to another robot"})
+			return
+		}
+	}
 
 	// Parse robot_type_id as numeric value
 	robotTypeID, err := strconv.ParseInt(req.RobotTypeID, 10, 64)
@@ -509,11 +570,6 @@ func (h *RobotHandler) CreateRobot(c *gin.Context) {
 
 	now := time.Now().UTC()
 
-	var assetIDStr sql.NullString
-	if a := strings.TrimSpace(req.AssetID); a != "" {
-		assetIDStr = sql.NullString{String: a, Valid: true}
-	}
-
 	metadataStr := sql.NullString{String: "{}", Valid: true}
 	if req.Metadata != nil {
 		metadataJSON, err := json.Marshal(req.Metadata)
@@ -539,7 +595,7 @@ func (h *RobotHandler) CreateRobot(c *gin.Context) {
 		robotTypeID,
 		req.DeviceID,
 		factoryID,
-		assetIDStr,
+		assetID,
 		"active",
 		metadataStr,
 		now,
@@ -677,7 +733,7 @@ type UpdateRobotRequest struct {
 	RobotTypeID *string         `json:"robot_type_id,omitempty"`
 	DeviceID    *string         `json:"device_id,omitempty"`
 	FactoryID   *string         `json:"factory_id,omitempty"`
-	AssetID     *string         `json:"asset_id,omitempty"`
+	AssetID     json.RawMessage `json:"asset_id,omitempty" swaggertype:"string"`
 	Status      *string         `json:"status,omitempty"`
 	Metadata    json.RawMessage `json:"metadata,omitempty" swaggertype:"object"`
 }
@@ -710,13 +766,19 @@ func (h *RobotHandler) UpdateRobot(c *gin.Context) {
 		return
 	}
 
-	// Check if robot exists
-	var exists bool
-	err = h.db.Get(&exists, "SELECT EXISTS(SELECT 1 FROM robots WHERE id = ? AND deleted_at IS NULL)", id)
-	if err != nil || !exists {
+	var current struct {
+		AssetID sql.NullString `db:"asset_id"`
+	}
+	err = h.db.Get(&current, "SELECT asset_id FROM robots WHERE id = ? AND deleted_at IS NULL", id)
+	if err == sql.ErrNoRows {
 		c.JSON(http.StatusNotFound, gin.H{"error": "robot not found"})
 		return
 	}
+	if err != nil {
+		logger.Printf("[ROBOT] Failed to query robot: %v", err)
+		c.JSON(http.StatusInternalServerError, gin.H{"error": "failed to update robot"})
+		return
+	}
 
 	// Validate status if provided
 	validStatuses := map[string]bool{
@@ -760,6 +822,47 @@ func (h *RobotHandler) UpdateRobot(c *gin.Context) {
 		args = append(args, deviceID)
 	}
 
+	if len(req.AssetID) > 0 {
+		var rawAssetID string
+		meta := bytes.TrimSpace(req.AssetID)
+		if bytes.Equal(meta, []byte("null")) {
+			rawAssetID = ""
+		} else if err := json.Unmarshal(req.AssetID, &rawAssetID); err != nil {
+			c.JSON(http.StatusBadRequest, gin.H{"error": "asset_id must be a string or null"})
+			return
+		}
+		assetID, err := normalizeAssetID(rawAssetID)
+		if err != nil {
+			c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()})
+			return
+		}
+		currentAssetID := assetIDValue(current.AssetID)
+		if currentAssetID != "" {
+			if !assetID.Valid {
+				c.JSON(http.StatusBadRequest, gin.H{"error": "asset_id cannot be cleared once set"})
+				return
+			}
+			if assetID.String != currentAssetID {
+				c.JSON(http.StatusBadRequest, gin.H{"error": "asset_id cannot be changed once set"})
+				return
+			}
+		}
+		if assetID.Valid && assetID.String != currentAssetID {
+			inUse, err := h.assetIDInUse(assetID.String, id)
+			if err != nil {
+				logger.Printf("[ROBOT] Failed to check asset_id uniqueness: %v", err)
+				c.JSON(http.StatusInternalServerError, gin.H{"error": "failed to update robot"})
+				return
+			}
+			if inUse {
+				c.JSON(http.StatusConflict, gin.H{"error": "asset_id is already assigned to another robot"})
+				return
+			}
+		}
+		updates = append(updates, "asset_id = ?")
+		args = append(args, assetID)
+	}
+
 	if req.FactoryID != nil {
 		if *req.FactoryID == "" {
 			c.JSON(http.StatusBadRequest, gin.H{"error": "factory_id cannot be empty"})
@@ -781,16 +884,6 @@ func (h *RobotHandler) UpdateRobot(c *gin.Context) {
 		args = append(args, parsedFactoryID)
 	}
 
-	if req.AssetID != nil {
-		trimmed := strings.TrimSpace(*req.AssetID)
-		var a sql.NullString
-		if trimmed != "" {
-			a = sql.NullString{String: trimmed, Valid: true}
-		}
-		updates = append(updates, "asset_id = ?")
-		args = append(args, a)
-	}
-
 	if req.Status != nil {
 		status := strings.TrimSpace(*req.Status)
 		if !validStatuses[status] {
diff --git a/internal/api/handlers/robot_test.go b/internal/api/handlers/robot_test.go
index 02777b1..fa7c1e1 100644
--- a/internal/api/handlers/robot_test.go
+++ b/internal/api/handlers/robot_test.go
@@ -5,9 +5,12 @@
 package handlers
 
 import (
+	"bytes"
+	"database/sql"
 	"encoding/json"
 	"net/http"
 	"net/http/httptest"
+	"strings"
 	"testing"
 	"time"
 
@@ -262,6 +265,164 @@ func TestRobotHandlerListRobots_ConnectedFilterUsesHubIntersection(t *testing.T)
 	})
 }
 
+func TestRobotHandlerAssetID_CreateUpdateAndList(t *testing.T) {
+	// Happy path for asset_id: create trims it, list returns it, and a same-value update is accepted.
+	db := newTestRobotHandlerDB(t)
+	defer db.Close()
+	seedRobotLookups(t, db)
+	r := newTestRobotRouter(t, db)
+
+	req := httptest.NewRequest(http.MethodPost, "/api/v1/robots", bytes.NewBufferString(`{
+		"robot_type_id": "10",
+		"device_id": "local-device-1",
+		"asset_id": "  asset-1  ",
+		"factory_id": "30"
+	}`))
+	req.Header.Set("Content-Type", "application/json")
+	w := httptest.NewRecorder()
+	r.ServeHTTP(w, req)
+
+	if w.Code != http.StatusCreated {
+		t.Fatalf("create status=%d want=%d body=%s", w.Code, http.StatusCreated, w.Body.String())
+	}
+	var created CreateRobotResponse
+	if err := json.Unmarshal(w.Body.Bytes(), &created); err != nil {
+		t.Fatalf("unmarshal create response: %v", err)
+	}
+	if created.AssetID != "asset-1" {
+		t.Fatalf("created asset_id=%v want asset-1", created.AssetID)
+	}
+	// The list endpoint must report the same trimmed value for the single robot.
+	req = httptest.NewRequest(http.MethodGet, "/api/v1/robots", nil)
+	w = httptest.NewRecorder()
+	r.ServeHTTP(w, req)
+	if w.Code != http.StatusOK {
+		t.Fatalf("list status=%d want=%d body=%s", w.Code, http.StatusOK, w.Body.String())
+	}
+	var list RobotListResponse
+	if err := json.Unmarshal(w.Body.Bytes(), &list); err != nil {
+		t.Fatalf("unmarshal list response: %v", err)
+	}
+	if len(list.Items) != 1 || list.Items[0].AssetID != "asset-1" {
+		t.Fatalf("list asset_id response=%#v", list)
+	}
+	// Sending the current asset_id again on update must succeed with 200.
+	req = httptest.NewRequest(http.MethodPut, "/api/v1/robots/"+created.ID, bytes.NewBufferString(`{"asset_id":"asset-1"}`))
+	req.Header.Set("Content-Type", "application/json")
+	w = httptest.NewRecorder()
+	r.ServeHTTP(w, req)
+	if w.Code != http.StatusOK {
+		t.Fatalf("same-value update status=%d want=%d body=%s", w.Code, http.StatusOK, w.Body.String())
+	}
+}
+
+func TestRobotHandlerAssetID_ImmutableOnceSet(t *testing.T) {
+	// Robot 1 is seeded with asset-1; every attempt to change or clear it must return 400.
+	db := newTestRobotHandlerDB(t)
+	defer db.Close()
+	seedRobotLookups(t, db)
+	seedRobot(t, db, 1, "local-device-1", "asset-1", nil)
+	r := newTestRobotRouter(t, db)
+
+	for _, tt := range []struct {
+		name string
+		body string
+	}{
+		{name: "change rejected", body: `{"asset_id":"asset-2"}`},
+		{name: "clear rejected", body: `{"asset_id":""}`},
+		{name: "blank clear rejected", body: `{"asset_id":"   "}`},
+	} {
+		t.Run(tt.name, func(t *testing.T) {
+			req := httptest.NewRequest(http.MethodPut, "/api/v1/robots/1", bytes.NewBufferString(tt.body))
+			req.Header.Set("Content-Type", "application/json")
+			w := httptest.NewRecorder()
+			r.ServeHTTP(w, req)
+			if w.Code != http.StatusBadRequest {
+				t.Fatalf("status=%d want=%d body=%s", w.Code, http.StatusBadRequest, w.Body.String())
+			}
+		})
+	}
+	// An explicit JSON null is also a clear attempt and must be rejected the same way.
+	req := httptest.NewRequest(http.MethodPut, "/api/v1/robots/1", bytes.NewBufferString(`{"asset_id":null}`))
+	req.Header.Set("Content-Type", "application/json")
+	w := httptest.NewRecorder()
+	r.ServeHTTP(w, req)
+	if w.Code != http.StatusBadRequest {
+		t.Fatalf("null clear status=%d want=%d body=%s", w.Code, http.StatusBadRequest, w.Body.String())
+	}
+}
+
+func TestRobotHandlerAssetID_UniqueAmongActiveRobots(t *testing.T) {
+	// Uniqueness is checked against non-deleted robots only: robot 1 is active, robot 2 is soft-deleted.
+	db := newTestRobotHandlerDB(t)
+	defer db.Close()
+	seedRobotLookups(t, db)
+	seedRobot(t, db, 1, "local-device-1", "asset-1", nil)
+	deletedAt := time.Now().UTC()
+	seedRobot(t, db, 2, "deleted-device", "deleted-asset", &deletedAt)
+	r := newTestRobotRouter(t, db)
+	// asset-1 belongs to active robot 1, so creating another robot with it must return 409.
+	req := httptest.NewRequest(http.MethodPost, "/api/v1/robots", bytes.NewBufferString(`{
+		"robot_type_id": "10",
+		"device_id": "local-device-2",
+		"asset_id": "asset-1",
+		"factory_id": "30"
+	}`))
+	req.Header.Set("Content-Type", "application/json")
+	w := httptest.NewRecorder()
+	r.ServeHTTP(w, req)
+	if w.Code != http.StatusConflict {
+		t.Fatalf("duplicate create status=%d want=%d body=%s", w.Code, http.StatusConflict, w.Body.String())
+	}
+	// deleted-asset is held only by soft-deleted robot 2, so reusing it must succeed with 201.
+	req = httptest.NewRequest(http.MethodPost, "/api/v1/robots", bytes.NewBufferString(`{
+		"robot_type_id": "10",
+		"device_id": "local-device-3",
+		"asset_id": "deleted-asset",
+		"factory_id": "30"
+	}`))
+	req.Header.Set("Content-Type", "application/json")
+	w = httptest.NewRecorder()
+	r.ServeHTTP(w, req)
+	if w.Code != http.StatusCreated {
+		t.Fatalf("soft-deleted reuse status=%d want=%d body=%s", w.Code, http.StatusCreated, w.Body.String())
+	}
+}
+
+func TestRobotHandlerAssetID_Validation(t *testing.T) {
+	// Create requests carrying a malformed asset_id must be rejected with 400.
+	db := newTestRobotHandlerDB(t)
+	defer db.Close()
+	seedRobotLookups(t, db)
+	r := newTestRobotRouter(t, db)
+	// The \u0001 JSON escape decodes to a control character inside the asset_id.
+	req := httptest.NewRequest(http.MethodPost, "/api/v1/robots", bytes.NewBufferString("{\n"+
+		`"robot_type_id":"10",`+
+		`"device_id":"local-device-1",`+
+		`"factory_id":"30",`+
+		`"asset_id":"asset\u0001id"}`))
+	req.Header.Set("Content-Type", "application/json")
+	w := httptest.NewRecorder()
+	r.ServeHTTP(w, req)
+	if w.Code != http.StatusBadRequest {
+		t.Fatalf("control char status=%d want=%d body=%s", w.Code, http.StatusBadRequest, w.Body.String())
+	}
+	// A 101-character asset_id must be rejected as too long.
+	longID := strings.Repeat("a", 101)
+	req = httptest.NewRequest(http.MethodPost, "/api/v1/robots", bytes.NewBufferString(`{
+		"robot_type_id": "10",
+		"device_id": "local-device-2",
+		"factory_id": "30",
+		"asset_id": "`+longID+`"
+	}`))
+	req.Header.Set("Content-Type", "application/json")
+	w = httptest.NewRecorder()
+	r.ServeHTTP(w, req)
+	if w.Code != http.StatusBadRequest {
+		t.Fatalf("long id status=%d want=%d body=%s", w.Code, http.StatusBadRequest, w.Body.String())
+	}
+}
+
 func newTestRobotRouter(t *testing.T, db *sqlx.DB) *gin.Engine {
 	t.Helper()
 	return newTestRobotRouterWithHubs(t, db, nil, nil)
@@ -306,8 +467,8 @@ func newTestRobotHandlerDB(t *testing.T) *sqlx.DB {
 			id INTEGER PRIMARY KEY,
 			robot_type_id INTEGER NOT NULL,
 		device_id TEXT NOT NULL,
-		factory_id INTEGER NOT NULL,
 		asset_id TEXT,
+		factory_id INTEGER NOT NULL,
 		status TEXT NOT NULL,
 		metadata TEXT,
 		created_at TIMESTAMP,
@@ -336,3 +497,32 @@ func newTestRobotHandlerDB(t *testing.T) *sqlx.DB {
 
 	return db
 }
+
+func seedRobotLookups(t *testing.T, db *sqlx.DB) { // inserts robot type 10 and factory 30; fails the test on any insert error
+	t.Helper()
+	if _, err := db.Exec(`INSERT INTO robot_types (id, name, model, deleted_at) VALUES (10, 'Arm Type', 'Model-A', NULL)`); err != nil {
+		t.Fatalf("seed robot type: %v", err)
+	}
+	if _, err := db.Exec(`INSERT INTO factories (id, name, slug, deleted_at) VALUES (30, 'Factory 30', 'fac-30', NULL)`); err != nil {
+		t.Fatalf("seed factory: %v", err)
+	}
+}
+
+// seedRobot inserts an active robot (type 10, factory 30). A blank assetID is stored as NULL
+// and a nil deletedAt leaves deleted_at NULL.
+func seedRobot(t *testing.T, db *sqlx.DB, id int64, deviceID string, assetID string, deletedAt *time.Time) {
+	t.Helper()
+	trimmed := strings.TrimSpace(assetID)
+	asset := sql.NullString{String: trimmed, Valid: trimmed != ""}
+	var deleted sql.NullTime
+	if deletedAt != nil {
+		deleted = sql.NullTime{Time: *deletedAt, Valid: true}
+	}
+	stamp := time.Now().UTC()
+	if _, err := db.Exec(`
+		INSERT INTO robots (id, robot_type_id, device_id, asset_id, factory_id, status, created_at, updated_at, deleted_at)
+		VALUES (?, 10, ?, ?, 30, 'active', ?, ?, ?)
+	`, id, deviceID, asset, stamp, stamp, deleted); err != nil {
+		t.Fatalf("seed robot %d: %v", id, err)
+	}
+}
diff --git a/internal/api/handlers/sync.go b/internal/api/handlers/sync.go
index 9d1694c..eadd585 100644
--- a/internal/api/handlers/sync.go
+++ b/internal/api/handlers/sync.go
@@ -32,6 +32,7 @@ func NewSyncHandler(db *sqlx.DB, syncWorker *services.SyncWorker) *SyncHandler {
 // RegisterRoutes registers cloud sync related routes.
 func (h *SyncHandler) RegisterRoutes(apiV1 *gin.RouterGroup) {
 	apiV1.POST("/sync/episodes", h.TriggerBatchSync)
+	apiV1.POST("/sync/episodes/:id/resync", h.TriggerEpisodeResync)
 	apiV1.POST("/sync/episodes/:id", h.TriggerEpisodeSync)
 	apiV1.GET("/sync/episodes", h.ListSyncJobs)
 	apiV1.GET("/sync/episodes/summary", h.ListEpisodeSyncSummaries)
@@ -40,6 +41,103 @@ func (h *SyncHandler) RegisterRoutes(apiV1 *gin.RouterGroup) {
 	apiV1.GET("/sync/config", h.GetSyncConfig)
 }
 
+type syncEpisodeActionRow struct { // episode columns read before a sync action is allowed
+	QaStatus    string `db:"qa_status"`    // value of episodes.qa_status
+	CloudSynced bool   `db:"cloud_synced"` // value of episodes.cloud_synced
+}
+
+// loadSyncEpisodeForAction loads qa_status and cloud_synced for a non-deleted episode.
+// On failure it writes the 404 or 500 JSON response itself and returns false.
+func (h *SyncHandler) loadSyncEpisodeForAction(c *gin.Context, episodeID int64) (syncEpisodeActionRow, bool) {
+	var row syncEpisodeActionRow
+	err := h.db.Get(&row, "SELECT qa_status, cloud_synced FROM episodes WHERE id = ? AND deleted_at IS NULL", episodeID)
+	switch {
+	case errors.Is(err, sql.ErrNoRows): // errors.Is also matches a wrapped ErrNoRows
+		c.JSON(http.StatusNotFound, gin.H{"error": "episode not found"})
+	case err != nil:
+		logger.Printf("[SYNC] Failed to query episode %d: %v", episodeID, err)
+		c.JSON(http.StatusInternalServerError, gin.H{"error": "failed to query episode"})
+	}
+	return row, err == nil
+}
+
+// enqueueSyncErrorResponse maps an enqueue error to its HTTP reply: 503 when the worker is not
+// running, 409 when already queued or in progress, 429 when the queue is full, otherwise log and 500.
+func (h *SyncHandler) enqueueSyncErrorResponse(c *gin.Context, episodeID int64, err error) {
+	var (
+		code   int
+		status string
+	)
+	switch {
+	case errors.Is(err, services.ErrSyncWorkerNotRunning):
+		code, status = http.StatusServiceUnavailable, "worker_not_running"
+	case errors.Is(err, services.ErrEpisodeAlreadyEnqueued), errors.Is(err, services.ErrSyncAlreadyInProgress):
+		code, status = http.StatusConflict, "already_queued"
+	case errors.Is(err, services.ErrSyncQueueFull):
+		code, status = http.StatusTooManyRequests, "queue_full"
+	default:
+		logger.Printf("[SYNC] Enqueue episode %d failed: %v", episodeID, err)
+		c.JSON(http.StatusInternalServerError, gin.H{"error": "failed to enqueue episode"})
+		return
+	}
+	c.JSON(code, gin.H{
+		"error":      err.Error(),
+		"episode_id": episodeID,
+		"status":     status,
+	})
+}
+
+// TriggerEpisodeResync queues a new cloud upload for an already-synced episode.
+//
+// @Summary      Resync episode to cloud
+// @Description  Enqueues a new cloud upload for an already-synced episode without clearing previous sync history
+// @Tags         sync
+// @Produce      json
+// @Param        id   path      int  true  "Episode ID"
+// @Success      202  {object}  map[string]interface{}
+// @Failure      400  {object}  map[string]string
+// @Failure      404  {object}  map[string]string
+// @Failure      409  {object}  map[string]string
+// @Failure      429  {object}  map[string]interface{}
+// @Failure      500  {object}  map[string]string
+// @Failure      503  {object}  map[string]interface{}
+// @Router       /sync/episodes/{id}/resync [post]
+func (h *SyncHandler) TriggerEpisodeResync(c *gin.Context) {
+	if h.syncWorker == nil {
+		c.JSON(http.StatusServiceUnavailable, gin.H{"error": "sync worker is not configured"})
+		return
+	}
+	episodeID, ok := parseEpisodeIDParam(c)
+	if !ok {
+		return
+	}
+	row, ok := h.loadSyncEpisodeForAction(c, episodeID)
+	if !ok {
+		return
+	}
+	if row.QaStatus != "approved" && row.QaStatus != "inspector_approved" {
+		c.JSON(http.StatusBadRequest, gin.H{
+			"error": fmt.Sprintf("episode qa_status is %q, must be approved or inspector_approved", row.QaStatus),
+		})
+		return
+	}
+	if !row.CloudSynced {
+		c.JSON(http.StatusBadRequest, gin.H{"error": "episode has not completed cloud sync; use normal sync instead"})
+		return
+	}
+	// Enqueue failures are answered by enqueueSyncErrorResponse (503, 409, 429 or 500).
+	if err := h.syncWorker.EnqueueEpisodeResync(c.Request.Context(), episodeID); err != nil {
+		h.enqueueSyncErrorResponse(c, episodeID, err)
+		return
+	}
+
+	c.JSON(http.StatusAccepted, gin.H{
+		"status":     "accepted",
+		"episode_id": episodeID,
+		"message":    "episode enqueued for cloud resync",
+	})
+}
+
 // syncLogRow represents a row from the sync_logs table.
 type syncLogRow struct {
 	ID               int64          `db:"id"`
diff --git a/internal/api/handlers/transfer.go b/internal/api/handlers/transfer.go
index 1e70ce5..3ebb650 100644
--- a/internal/api/handlers/transfer.go
+++ b/internal/api/handlers/transfer.go
@@ -391,6 +391,30 @@ func readSidecarFromS3(ctx context.Context, s3Client *s3.Client, bucket, jsonKey
 	return &sc
 }
 
+// assetIDSnapshotMetadata returns {"asset_id": ...} JSON for the workstation's non-deleted robot, or NULL on any miss or error.
+func assetIDSnapshotMetadata(ctx context.Context, tx *sql.Tx, workstationID sql.NullInt64) sql.NullString {
+	if tx == nil || !workstationID.Valid || workstationID.Int64 <= 0 {
+		return sql.NullString{}
+	}
+	var robotAsset sql.NullString
+	scanErr := tx.QueryRowContext(ctx, `
+		SELECT r.asset_id
+		FROM workstations ws
+		LEFT JOIN robots r ON r.id = ws.robot_id AND r.deleted_at IS NULL
+		WHERE ws.id = ? AND ws.deleted_at IS NULL
+		LIMIT 1
+	`, workstationID.Int64).Scan(&robotAsset)
+	trimmed := strings.TrimSpace(robotAsset.String)
+	if scanErr != nil || !robotAsset.Valid || trimmed == "" {
+		return sql.NullString{}
+	}
+	payload, err := json.Marshal(map[string]string{"asset_id": trimmed})
+	if err != nil {
+		return sql.NullString{}
+	}
+	return sql.NullString{String: string(payload), Valid: true}
+}
+
 func uploadCompleteS3Key(data map[string]interface{}) string {
 	return strings.TrimSpace(stringVal(data, "s3_key"))
 }
@@ -617,6 +641,7 @@ func (h *TransferHandler) onUploadComplete(ctx context.Context, dc *services.Tra
 					checksum = sql.NullString{String: sc.Recording.ChecksumSHA256, Valid: true}
 				}
 			}
+			episodeMetadata := assetIDSnapshotMetadata(ctx, tx, taskRow.WorkstationID)
 
 			_, dbErr := tx.ExecContext(ctx,
 				`INSERT INTO episodes (
@@ -635,8 +660,9 @@ func (h *TransferHandler) onUploadComplete(ctx context.Context, dc *services.Tra
 					duration_sec,
 					file_size_bytes,
 					checksum,
-					qa_status
-				) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`,
+					qa_status,
+					metadata
+				) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`,
 				episodeID,
 				taskRow.ID,
 				taskRow.BatchID,
@@ -653,6 +679,7 @@ func (h *TransferHandler) onUploadComplete(ctx context.Context, dc *services.Tra
 				fileSizeBytes,
 				checksum,
 				"approved",
+				episodeMetadata,
 			)
 			if dbErr != nil {
 				// #nosec G706 -- Set aside for now
diff --git a/internal/api/handlers/transfer_asset_id_snapshot_test.go b/internal/api/handlers/transfer_asset_id_snapshot_test.go
new file mode 100644
index 0000000..9d71be7
--- /dev/null
+++ b/internal/api/handlers/transfer_asset_id_snapshot_test.go
@@ -0,0 +1,95 @@
+// SPDX-FileCopyrightText: 2026 ArcheBase
+//
+// SPDX-License-Identifier: MulanPSL-2.0
+
+package handlers
+
+import (
+	"context"
+	"database/sql"
+	"encoding/json"
+	"testing"
+
+	_ "modernc.org/sqlite"
+)
+
+func TestAssetIDSnapshotMetadata_WritesWhenRobotHasAssetID(t *testing.T) {
+	// A robot with an asset_id must yield {"asset_id": ...} metadata carrying the trimmed value.
+	db, err := sql.Open("sqlite", ":memory:")
+	if err != nil {
+		t.Fatalf("open sqlite db: %v", err)
+	}
+	defer db.Close()
+	// Seed robot 1 with a whitespace-padded asset_id and attach it to workstation 10.
+	createAssetIDSnapshotSchema(t, db)
+	if _, err := db.Exec(`INSERT INTO robots (id, asset_id, deleted_at) VALUES (1, ' asset-1 ', NULL)`); err != nil {
+		t.Fatalf("seed robot: %v", err)
+	}
+	if _, err := db.Exec(`INSERT INTO workstations (id, robot_id, deleted_at) VALUES (10, 1, NULL)`); err != nil {
+		t.Fatalf("seed workstation: %v", err)
+	}
+	tx, err := db.BeginTx(context.Background(), nil)
+	if err != nil {
+		t.Fatalf("begin tx: %v", err)
+	}
+	defer tx.Rollback()
+
+	got := assetIDSnapshotMetadata(context.Background(), tx, sql.NullInt64{Int64: 10, Valid: true})
+	if !got.Valid {
+		t.Fatal("metadata was not written")
+	}
+	var decoded map[string]string
+	if err := json.Unmarshal([]byte(got.String), &decoded); err != nil {
+		t.Fatalf("unmarshal metadata: %v", err)
+	}
+	if decoded["asset_id"] != "asset-1" {
+		t.Fatalf("asset_id=%q want asset-1", decoded["asset_id"])
+	}
+}
+
+func TestAssetIDSnapshotMetadata_MissingDoesNotFailEpisodeCreationPath(t *testing.T) {
+	// A robot whose asset_id is NULL must produce NULL metadata (Valid=false).
+	db, err := sql.Open("sqlite", ":memory:")
+	if err != nil {
+		t.Fatalf("open sqlite db: %v", err)
+	}
+	defer db.Close()
+	// Seed robot 1 without an asset_id and attach it to workstation 10.
+	createAssetIDSnapshotSchema(t, db)
+	if _, err := db.Exec(`INSERT INTO robots (id, asset_id, deleted_at) VALUES (1, NULL, NULL)`); err != nil {
+		t.Fatalf("seed robot: %v", err)
+	}
+	if _, err := db.Exec(`INSERT INTO workstations (id, robot_id, deleted_at) VALUES (10, 1, NULL)`); err != nil {
+		t.Fatalf("seed workstation: %v", err)
+	}
+	tx, err := db.BeginTx(context.Background(), nil)
+	if err != nil {
+		t.Fatalf("begin tx: %v", err)
+	}
+	defer tx.Rollback()
+
+	got := assetIDSnapshotMetadata(context.Background(), tx, sql.NullInt64{Int64: 10, Valid: true})
+	if got.Valid {
+		t.Fatalf("metadata valid=%t value=%q, want NULL", got.Valid, got.String)
+	}
+}
+
+func createAssetIDSnapshotSchema(t *testing.T, db *sql.DB) { // creates minimal robots and workstations tables for the snapshot tests
+	t.Helper()
+	for _, stmt := range []string{
+		`CREATE TABLE robots (
+			id INTEGER PRIMARY KEY,
+			asset_id TEXT,
+			deleted_at TIMESTAMP NULL
+		)`,
+		`CREATE TABLE workstations (
+			id INTEGER PRIMARY KEY,
+			robot_id INTEGER,
+			deleted_at TIMESTAMP NULL
+		)`,
+	} {
+		if _, err := db.Exec(stmt); err != nil { // any DDL failure aborts the calling test
+			t.Fatalf("create schema: %v", err)
+		}
+	}
+}
diff --git a/internal/cloud/cloudpb/data_gateway.pb.go b/internal/cloud/cloudpb/data_gateway.pb.go
index fb89247..2b98e31 100644
--- a/internal/cloud/cloudpb/data_gateway.pb.go
+++ b/internal/cloud/cloudpb/data_gateway.pb.go
@@ -870,6 +870,7 @@ type CompleteUploadRequest struct {
 	RawTags            map[string]string      `protobuf:"bytes,3,rep,name=raw_tags,json=rawTags,proto3" json:"raw_tags,omitempty" protobuf_key:"bytes,1,opt,name=key" protobuf_val:"bytes,2,opt,name=value"`
 	CompletedPartCount int32                  `protobuf:"varint,4,opt,name=completed_part_count,json=completedPartCount,proto3" json:"completed_part_count,omitempty"`
 	OssObjectEtag      string                 `protobuf:"bytes,5,opt,name=oss_object_etag,json=ossObjectEtag,proto3" json:"oss_object_etag,omitempty"`
+	PartSizeBytes      int64                  `protobuf:"varint,6,opt,name=part_size_bytes,json=partSizeBytes,proto3" json:"part_size_bytes,omitempty"`
 	unknownFields      protoimpl.UnknownFields
 	sizeCache          protoimpl.SizeCache
 }
@@ -939,6 +940,13 @@ func (x *CompleteUploadRequest) GetOssObjectEtag() string {
 	return ""
 }
 
+func (x *CompleteUploadRequest) GetPartSizeBytes() int64 {
+	if x != nil {
+		return x.PartSizeBytes
+	}
+	return 0
+}
+
 type CompleteUploadResponse struct {
 	state         protoimpl.MessageState `protogen:"open.v1"`
 	unknownFields protoimpl.UnknownFields
@@ -1287,13 +1295,14 @@ const file_data_gateway_proto_rawDesc = "" +
 	"\x06reason\x18\x02 \x01(\tR\x06reason\"^\n" +
 	"\x13AbortUploadResponse\x12*\n" +
 	"\x11logical_upload_id\x18\x01 \x01(\tR\x0flogicalUploadId\x12\x1b\n" +
-	"\tupload_id\x18\x02 \x01(\tR\buploadId\"\xc1\x02\n" +
+	"\tupload_id\x18\x02 \x01(\tR\buploadId\"\xe9\x02\n" +
 	"\x15CompleteUploadRequest\x12\x1b\n" +
 	"\tupload_id\x18\x01 \x01(\tR\buploadId\x12\x1b\n" +
 	"\tfile_size\x18\x02 \x01(\x03R\bfileSize\x12X\n" +
 	"\braw_tags\x18\x03 \x03(\v2=.archebase.data_gateway.v1.CompleteUploadRequest.RawTagsEntryR\arawTags\x120\n" +
 	"\x14completed_part_count\x18\x04 \x01(\x05R\x12completedPartCount\x12&\n" +
-	"\x0foss_object_etag\x18\x05 \x01(\tR\rossObjectEtag\x1a:\n" +
+	"\x0foss_object_etag\x18\x05 \x01(\tR\rossObjectEtag\x12&\n" +
+	"\x0fpart_size_bytes\x18\x06 \x01(\x03R\rpartSizeBytes\x1a:\n" +
 	"\fRawTagsEntry\x12\x10\n" +
 	"\x03key\x18\x01 \x01(\tR\x03key\x12\x14\n" +
 	"\x05value\x18\x02 \x01(\tR\x05value:\x028\x01\"\x18\n" +
diff --git a/internal/cloud/cloudpb/proto/data_gateway.proto b/internal/cloud/cloudpb/proto/data_gateway.proto
index e3f180c..a5be7a4 100644
--- a/internal/cloud/cloudpb/proto/data_gateway.proto
+++ b/internal/cloud/cloudpb/proto/data_gateway.proto
@@ -111,6 +111,7 @@ message CompleteUploadRequest {
   map<string, string> raw_tags = 3;
   int32 completed_part_count = 4;
   string oss_object_etag = 5;
+  int64 part_size_bytes = 6;
 }
 
 message CompleteUploadResponse {}
diff --git a/internal/cloud/gateway_client.go b/internal/cloud/gateway_client.go
index c91d238..5cc842d 100644
--- a/internal/cloud/gateway_client.go
+++ b/internal/cloud/gateway_client.go
@@ -209,7 +209,7 @@ func (c *GatewayClient) AbortUpload(ctx context.Context, logicalUploadID string,
 }
 
 // CompleteUpload notifies the data-gateway that all parts have been uploaded to OSS.
-func (c *GatewayClient) CompleteUpload(ctx context.Context, uploadID string, fileSize int64, rawTags map[string]string, completedPartCount int32, ossObjectEtag string) error {
+func (c *GatewayClient) CompleteUpload(ctx context.Context, uploadID string, fileSize int64, rawTags map[string]string, completedPartCount int32, ossObjectEtag string, partSizeBytes int64) error {
 	authHeader, err := c.getAuthHeader(ctx)
 	if err != nil {
 		return err
@@ -226,6 +226,7 @@ func (c *GatewayClient) CompleteUpload(ctx context.Context, uploadID string, fil
 			RawTags:            rawTags,
 			CompletedPartCount: completedPartCount,
 			OssObjectEtag:      ossObjectEtag,
+			PartSizeBytes:      partSizeBytes,
 		})
 		return rpcErr
 	})
diff --git a/internal/cloud/gateway_client_test.go b/internal/cloud/gateway_client_test.go
new file mode 100644
index 0000000..e8ab36a
--- /dev/null
+++ b/internal/cloud/gateway_client_test.go
@@ -0,0 +1,83 @@
+// SPDX-FileCopyrightText: 2026 ArcheBase
+//
+// SPDX-License-Identifier: MulanPSL-2.0
+
+package cloud
+
+import (
+	"context"
+	"net"
+	"testing"
+	"time"
+
+	pb "archebase.com/keystone-edge/internal/cloud/cloudpb"
+	"google.golang.org/grpc"
+	"google.golang.org/grpc/credentials/insecure"
+	"google.golang.org/grpc/test/bufconn"
+)
+
+type completeUploadCaptureServer struct { // fake gateway server that records the CompleteUpload request it receives
+	pb.UnimplementedDataGatewayServiceServer
+	req *pb.CompleteUploadRequest // last request seen; nil until CompleteUpload is called
+}
+
+func (s *completeUploadCaptureServer) CompleteUpload(_ context.Context, req *pb.CompleteUploadRequest) (*pb.CompleteUploadResponse, error) {
+	s.req = req // keep the request so the test can inspect its fields
+	return &pb.CompleteUploadResponse{}, nil
+}
+
+func TestGatewayClientCompleteUploadSendsPartSizeBytes(t *testing.T) {
+	// Runs CompleteUpload against an in-memory gRPC server and checks part_size_bytes reaches the request.
+	listener := bufconn.Listen(1024 * 1024)
+	server := grpc.NewServer()
+	capture := &completeUploadCaptureServer{}
+	pb.RegisterDataGatewayServiceServer(server, capture)
+	go func() {
+		if err := server.Serve(listener); err != nil {
+			t.Logf("bufconn server exited: %v", err)
+		}
+	}()
+	t.Cleanup(func() {
+		server.Stop()
+		_ = listener.Close()
+	})
+	ctx, cancel := context.WithTimeout(context.Background(), time.Second)
+	defer cancel()
+	conn, err := grpc.DialContext(ctx, "bufnet", //nolint:staticcheck // bufconn tests still use DialContext.
+		grpc.WithContextDialer(func(context.Context, string) (net.Conn, error) {
+			return listener.Dial()
+		}),
+		grpc.WithTransportCredentials(insecure.NewCredentials()),
+	)
+	if err != nil {
+		t.Fatalf("dial bufconn: %v", err)
+	}
+	t.Cleanup(func() { _ = conn.Close() })
+	// Assemble the clients by hand around the bufconn connection, with a token that expires in an hour.
+	authClient := &AuthClient{
+		token: &AuthToken{
+			AccessToken: "test-token",
+			ExpiresAt:   time.Now().Add(time.Hour),
+		},
+	}
+	client := &GatewayClient{
+		cfg: GatewayClientConfig{
+			RequestTimeout: time.Second,
+		},
+		authClient: authClient,
+		conn:       conn,
+	}
+	// The 8 MiB part size and the raw tags passed here must arrive in the captured request.
+	if err := client.CompleteUpload(ctx, "upload-1", 1234, map[string]string{"k": "v"}, 2, `"etag"`, 8*1024*1024); err != nil {
+		t.Fatalf("CompleteUpload() error = %v", err)
+	}
+	if capture.req == nil {
+		t.Fatal("CompleteUpload request was not captured")
+	}
+	if capture.req.PartSizeBytes != 8*1024*1024 {
+		t.Fatalf("PartSizeBytes=%d want %d", capture.req.PartSizeBytes, 8*1024*1024)
+	}
+	if capture.req.RawTags["k"] != "v" {
+		t.Fatalf("RawTags=%+v", capture.req.RawTags)
+	}
+}
diff --git a/internal/cloud/uploader.go b/internal/cloud/uploader.go
index e7cd01c..8d746da 100644
--- a/internal/cloud/uploader.go
+++ b/internal/cloud/uploader.go
@@ -13,6 +13,7 @@ import (
 	"math"
 	"os"
 	"path/filepath"
+	"strings"
 	"time"
 
 	pb "archebase.com/keystone-edge/internal/cloud/cloudpb"
@@ -42,6 +43,8 @@ type UploadRequest struct {
 	EpisodeID string
 	// McapKey is the MinIO object key for the MCAP file (without bucket prefix).
 	McapKey string
+	// AssetID is the Data Platform device id used for this upload.
+	AssetID string
 	// RawTags are arbitrary key-value tags passed to the data-gateway.
 	RawTags map[string]string
 	// ClientHints are passed to CreateLogicalUpload for server-side routing.
@@ -69,7 +72,9 @@ type persistedUploadState struct {
 	Endpoint          string    `json:"endpoint"`
 	ObjectKey         string    `json:"object_key"`
 	McapKey           string    `json:"mcap_key"`
+	AssetID           string    `json:"asset_id"`
 	FileSize          int64     `json:"file_size"`
+	PartSizeBytes     int64     `json:"part_size_bytes,omitempty"`
 	UpdatedAt         time.Time `json:"updated_at"`
 }
 
@@ -99,7 +104,7 @@ type gatewayClient interface {
 	GetUploadRecovery(ctx context.Context, logicalUploadID string) (*UploadRecoveryInfo, error)
 	ReissueUploadCredentials(ctx context.Context, uploadID string) (*UploadSession, error)
 	AbortUpload(ctx context.Context, logicalUploadID string, reason string) error
-	CompleteUpload(ctx context.Context, uploadID string, fileSize int64, rawTags map[string]string, completedPartCount int32, ossObjectEtag string) error
+	CompleteUpload(ctx context.Context, uploadID string, fileSize int64, rawTags map[string]string, completedPartCount int32, ossObjectEtag string, partSizeBytes int64) error
 }
 
 // ossClient is the subset of OSSUploader methods used by Uploader.
@@ -171,8 +176,18 @@ func (u *Uploader) validatePersistDir() error {
 // It uses context.Background() as base to ensure the abort is independent of the
 // caller's context, but with a 30s timeout to prevent indefinite hanging.
 func (u *Uploader) abortMultipartUpload(session *UploadSession, multipartUploadID string) {
+	if session == nil { // nothing to hand to the OSS client: log a warning and give up
+		logger.Printf("[CLOUD-UPLOAD] Warning: skip OSS abort for multipart_upload_id=%s: missing upload session", multipartUploadID)
+		return
+	}
 	abortCtx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
 	defer cancel()
+	refreshed, err := u.ensureFreshUploadCredentials(abortCtx, session) // bounded by abortCtx, not by the caller's context
+	if err != nil {
+		logger.Printf("[CLOUD-UPLOAD] Warning: refresh credentials before abort failed (proceeding anyway): %v", err)
+	} else {
+		session = refreshed
+	}
 	u.oss.AbortMultipartUpload(abortCtx, session, multipartUploadID)
 }
 
@@ -205,7 +220,7 @@ func (u *Uploader) Upload(ctx context.Context, req UploadRequest) (*UploadResult
 	logger.Printf("[CLOUD-UPLOAD] Starting upload: episode=%s mcap=%s size=%d", req.EpisodeID, req.McapKey, fileSize)
 
 	// Step 2: Prepare upload session (with recovery if persisted state exists)
-	prepared, err := u.prepareUploadSession(ctx, hints, req.McapKey, fileSize)
+	prepared, err := u.prepareUploadSession(ctx, hints, req.McapKey, req.AssetID, fileSize)
 	if err != nil {
 		return nil, fmt.Errorf("prepare upload session: %w", err)
 	}
@@ -219,7 +234,7 @@ func (u *Uploader) Upload(ctx context.Context, req UploadRequest) (*UploadResult
 	if prepared.ossCompleteETag != "" {
 		logger.Printf("[CLOUD-UPLOAD] OSS object already verified (COMPLETE_ONLY): logical_upload_id=%s etag=%s parts=%d",
 			session.LogicalUploadID, prepared.ossCompleteETag, prepared.ossCompletePartCount)
-		if err := u.gateway.CompleteUpload(ctx, session.UploadID, fileSize, req.RawTags, prepared.ossCompletePartCount, prepared.ossCompleteETag); err != nil {
+		if err := u.gateway.CompleteUpload(ctx, session.UploadID, fileSize, req.RawTags, prepared.ossCompletePartCount, prepared.ossCompleteETag, session.PartSizeBytes); err != nil {
 			return nil, fmt.Errorf("complete upload on gateway (oss-already-complete): %w", err)
 		}
 		u.cleanupPersistedState(session.LogicalUploadID)
@@ -246,7 +261,9 @@ func (u *Uploader) Upload(ctx context.Context, req UploadRequest) (*UploadResult
 			Endpoint:        session.Endpoint,
 			ObjectKey:       session.ObjectKey,
 			McapKey:         req.McapKey,
+			AssetID:         req.AssetID,
 			FileSize:        fileSize,
+			PartSizeBytes:   session.PartSizeBytes,
 			UpdatedAt:       time.Now(),
 		}); err != nil {
 			return nil, fmt.Errorf("persist initial upload state: %w", err)
@@ -269,7 +286,7 @@ func (u *Uploader) Upload(ctx context.Context, req UploadRequest) (*UploadResult
 	// InitiateMultipartUpload succeeds, before streaming any parts. This requires splitting
 	// uploadParts into an initiate step (called here, result persisted) and a stream step.
 	// The Rust SDK has the same gap; defer fixing until the upstream SDK is updated.
-	multipartUploadID, parts, partMD5s, err := u.uploadParts(ctx, req, session, fileSize)
+	session, multipartUploadID, parts, partMD5s, err := u.uploadParts(ctx, req, session, fileSize)
 	if err != nil {
 		return nil, err
 	}
@@ -285,7 +302,9 @@ func (u *Uploader) Upload(ctx context.Context, req UploadRequest) (*UploadResult
 		Endpoint:          session.Endpoint,
 		ObjectKey:         session.ObjectKey,
 		McapKey:           req.McapKey,
+		AssetID:           req.AssetID,
 		FileSize:          fileSize,
+		PartSizeBytes:     session.PartSizeBytes,
 		UpdatedAt:         time.Now(),
 	}); err != nil {
 		logger.Printf("[CLOUD-UPLOAD] Warning: failed to update state with multipart_upload_id: %v", err)
@@ -296,7 +315,7 @@ func (u *Uploader) Upload(ctx context.Context, req UploadRequest) (*UploadResult
 
 	// Step 4: Refresh STS credentials if about to expire before CompleteUpload RPC
 	if time.Until(session.STSExpireAt) <= u.cfg.RequestTimeout {
-		refreshed, err := u.gateway.ReissueUploadCredentials(ctx, session.UploadID)
+		refreshed, err := u.refreshUploadCredentials(ctx, session)
 		if err != nil {
 			logger.Printf("[CLOUD-UPLOAD] Warning: refresh credentials failed (proceeding anyway): %v", err)
 		} else {
@@ -309,7 +328,7 @@ func (u *Uploader) Upload(ctx context.Context, req UploadRequest) (*UploadResult
 		return nil, fmt.Errorf("too many upload parts: %d", len(parts))
 	}
 	//nolint:gosec // G115: len(parts) validated to fit into int32 above
-	if err := u.gateway.CompleteUpload(ctx, session.UploadID, fileSize, req.RawTags, int32(len(parts)), localETag); err != nil {
+	if err := u.gateway.CompleteUpload(ctx, session.UploadID, fileSize, req.RawTags, int32(len(parts)), localETag, session.PartSizeBytes); err != nil {
 		return nil, fmt.Errorf("complete upload on gateway: %w", err)
 	}
 
@@ -341,8 +360,8 @@ type preparedSession struct {
 
 // prepareUploadSession checks for persisted state and either resumes or creates a new session.
 // It mirrors the Rust SDK's prepare_upload_session logic.
-func (u *Uploader) prepareUploadSession(ctx context.Context, clientHints map[string]string, mcapKey string, fileSize int64) (preparedSession, error) {
-	state, err := u.findPersistedStateByKey(mcapKey)
+func (u *Uploader) prepareUploadSession(ctx context.Context, clientHints map[string]string, mcapKey string, assetID string, fileSize int64) (preparedSession, error) {
+	state, err := u.findPersistedStateByKey(mcapKey, assetID)
 	if err != nil {
 		return preparedSession{}, fmt.Errorf("load persisted state: %w", err)
 	}
@@ -397,6 +416,7 @@ func (u *Uploader) prepareUploadSession(ctx context.Context, clientHints map[str
 				Endpoint:        newSession.Endpoint,
 				ObjectKey:       newSession.ObjectKey,
 				McapKey:         mcapKey,
+				AssetID:         assetID,
 				FileSize:        fileSize,
 				UpdatedAt:       time.Now(),
 			}); err != nil {
@@ -461,6 +481,9 @@ func (u *Uploader) decideResumeAction(ctx context.Context, state *persistedUploa
 			// Treat RPC failures as transient: preserve local state for next retry.
 			return resumeContinue, nil, "", 0, fmt.Errorf("ReissueUploadCredentials: %w", err)
 		}
+		if state.PartSizeBytes > 0 {
+			session.PartSizeBytes = state.PartSizeBytes
+		}
 
 		if state.MultipartUploadID != "" {
 			outcome, err := u.reconcileRemoteParts(ctx, session, state.MultipartUploadID)
@@ -537,32 +560,80 @@ func (u *Uploader) reconcileCompletedObject(ctx context.Context, session *Upload
 	return reconcileRestart, nil
 }
 
+// partStreamFactory opens a stream over length bytes of the MCAP file starting
+// at byte offset. Each call returns an independent io.ReadCloser, so no
+// connection is kept idle across part uploads; the caller closes the stream.
+type partStreamFactory func(ctx context.Context, offset, length int64) (io.ReadCloser, error)
+
+// minioRangeReader returns a partStreamFactory that reads byte ranges of the
+// object key in u.minioBucket using independent ranged GetObject requests.
+func (u *Uploader) minioRangeReader(key string) partStreamFactory {
+	return func(ctx context.Context, offset, length int64) (io.ReadCloser, error) {
+		opts := minio.GetObjectOptions{}
+		if err := opts.SetRange(offset, offset+length-1); err != nil { // end is the last byte of the range, hence the -1
+			return nil, fmt.Errorf("set range %d-%d: %w", offset, offset+length-1, err)
+		}
+		obj, err := u.minioClient.GetObject(ctx, u.minioBucket, key, opts)
+		if err != nil {
+			return nil, fmt.Errorf("get minio object range %d-%d: %w", offset, offset+length-1, err)
+		}
+		return obj, nil // handed back as an io.ReadCloser; closing it is the caller's job
+	}
+}
+
 // uploadParts streams the MCAP from MinIO and uploads it to OSS in parts.
-// Returns the OSS multipart upload ID, the list of uploaded parts, per-part MD5 digests, and any error.
-func (u *Uploader) uploadParts(ctx context.Context, req UploadRequest, session *UploadSession, fileSize int64) (string, []UploadedPart, [][16]byte, error) {
+// Returns the (possibly refreshed) upload session, the OSS multipart upload ID, the list of uploaded parts, per-part MD5 digests, and any error.
+func (u *Uploader) uploadParts(ctx context.Context, req UploadRequest, session *UploadSession, fileSize int64) (*UploadSession, string, []UploadedPart, [][16]byte, error) {
+	fixedPartSizeBytes := normalizedPartSizeBytes(session.PartSizeBytes) // pinned once so a refreshed session cannot change the part size mid-upload
+	session, err := u.ensureFreshUploadCredentials(ctx, session)
+	if err != nil {
+		return nil, "", nil, nil, fmt.Errorf("refresh credentials before initiate multipart upload: %w", err)
+	}
+	session.PartSizeBytes = fixedPartSizeBytes
+
 	// Initiate multipart upload on OSS
 	multipartUploadID, err := u.oss.InitiateMultipartUpload(ctx, session)
 	if err != nil {
-		return "", nil, nil, fmt.Errorf("initiate multipart upload: %w", err)
+		return nil, "", nil, nil, fmt.Errorf("initiate multipart upload: %w", err)
 	}
 	logger.Printf("[CLOUD-UPLOAD] Multipart initiated: multipart_upload_id=%s", multipartUploadID)
 
-	// Stream from MinIO → OSS in parts
-	mcapStream, err := u.minioClient.GetObject(ctx, u.minioBucket, req.McapKey, minio.GetObjectOptions{})
+	// Stream from MinIO to OSS in parts.
+	// Each part uses an independent ranged GetObject so that the MinIO HTTP
+	// connection is not left idle during OSS part uploads. A single streaming
+	// response would risk idle connection timeout (~20-25s on MinIO or network
+	// intermediaries) when upload speed is slow.
+	session, parts, partMD5s, err := u.streamMultipartParts(ctx, req.EpisodeID, session, multipartUploadID, fileSize, fixedPartSizeBytes, u.minioRangeReader(req.McapKey))
 	if err != nil {
 		u.abortMultipartUpload(session, multipartUploadID)
-		return "", nil, nil, fmt.Errorf("get minio object %s: %w", req.McapKey, err)
+		return nil, "", nil, nil, err
 	}
-	defer func() {
-		_ = mcapStream.Close()
-	}()
 
-	partSizeBytes := session.PartSizeBytes
-	if partSizeBytes <= 0 {
-		partSizeBytes = 8 * 1024 * 1024 // 8MB default
+	refreshed, err := u.ensureFreshUploadCredentials(ctx, session) // refresh into a temporary: a failed refresh yields a nil session
+	if err != nil {
+		u.abortMultipartUpload(session, multipartUploadID) // session is still the last usable one; a nil session would turn the abort into a no-op
+		return nil, "", nil, nil, fmt.Errorf("refresh credentials before complete multipart upload: %w", err)
+	}
+	refreshed.PartSizeBytes = fixedPartSizeBytes
+	session = refreshed
+	// Complete multipart upload on OSS
+	if _, err := u.oss.CompleteMultipartUpload(ctx, session, multipartUploadID, parts); err != nil {
+		u.abortMultipartUpload(session, multipartUploadID)
+		return nil, "", nil, nil, fmt.Errorf("complete multipart upload on OSS: %w", err)
 	}
 
-	buf := make([]byte, partSizeBytes)
+	return session, multipartUploadID, parts, partMD5s, nil
+}
+
+// streamMultipartParts uploads fileSize bytes to OSS in partSizeBytes-sized parts read via newPartStream; it never aborts the multipart upload itself and, also on error, returns the last usable session so the caller can.
+func (u *Uploader) streamMultipartParts(ctx context.Context, episodeID string, session *UploadSession, multipartUploadID string, fileSize int64, partSizeBytes int64, newPartStream partStreamFactory) (*UploadSession, []UploadedPart, [][16]byte, error) {
+	partSizeBytes = normalizedPartSizeBytes(partSizeBytes)
+	session.PartSizeBytes = partSizeBytes
+	partSize := int(partSizeBytes) // the round-trip check below rejects sizes that do not fit in int
+	if int64(partSize) != partSizeBytes {
+		return session, nil, nil, fmt.Errorf("invalid part_size_bytes %d", partSizeBytes)
+	}
+	buf := make([]byte, partSize)
 	var parts []UploadedPart
 	var partMD5s [][16]byte
 	var offset int64
@@ -570,8 +641,7 @@ func (u *Uploader) uploadParts(ctx context.Context, req UploadRequest, session *
 
 	for offset < fileSize {
 		if err := ctx.Err(); err != nil {
-			u.abortMultipartUpload(session, multipartUploadID)
-			return "", nil, nil, err
+			return session, nil, nil, err
 		}
 
 		remaining := fileSize - offset
@@ -580,19 +650,44 @@ func (u *Uploader) uploadParts(ctx context.Context, req UploadRequest, session *
 			readSize = remaining
 		}
 
-		n, readErr := io.ReadFull(mcapStream, buf[:readSize])
-		if readErr != nil && readErr != io.ErrUnexpectedEOF {
-			u.abortMultipartUpload(session, multipartUploadID)
-			return "", nil, nil, fmt.Errorf("read part %d from minio: %w", partNumber, readErr)
+		// Open a new connection for each part so that the MinIO stream is not
+		// left idle during OSS uploads. MinIO or intervening network equipment
+		// may drop idle streaming connections after ~20-25s, and the OSS upload
+		// between part reads can easily exceed this threshold on slow networks.
+		partStream, err := newPartStream(ctx, offset, readSize)
+		if err != nil {
+			return session, nil, nil, fmt.Errorf("open part %d stream at offset %d: %w", partNumber, offset, err)
+		}
+
+		n, readErr := io.ReadFull(partStream, buf[:int(readSize)])
+		_ = partStream.Close() // close ASAP, best-effort
+		if readErr != nil { // io.ReadFull returns a nil error only when the whole buffer was filled, so this covers short reads
+			return session, nil, nil, fmt.Errorf("read part %d from minio: expected %d bytes, got %d: %w", partNumber, readSize, n, readErr)
 		}
 
 		partSlice := buf[:n]
 		partMD5s = append(partMD5s, MD5DigestBytes(partSlice))
 
+		// Refresh into a temporary: a failed refresh yields a nil session, and the
+		// caller still needs a usable session to abort the multipart upload.
+		fresh, err := u.ensureFreshUploadCredentials(ctx, session)
+		if err != nil {
+			return session, nil, nil, fmt.Errorf("refresh credentials before upload part %d: %w", partNumber, err)
+		}
+		session = fresh
+
 		etag, err := u.oss.UploadPart(ctx, session, multipartUploadID, partNumber, partSlice)
+		if err != nil && isSecurityTokenExpiredError(err) {
+			refreshed, refreshErr := u.refreshUploadCredentials(ctx, session)
+			if refreshErr != nil {
+				return session, nil, nil, fmt.Errorf("refresh credentials after upload part %d token expiry: %w", partNumber, refreshErr)
+			}
+			session = refreshed
+			session.PartSizeBytes = partSizeBytes
+			etag, err = u.oss.UploadPart(ctx, session, multipartUploadID, partNumber, partSlice)
+		}
 		if err != nil {
-			u.abortMultipartUpload(session, multipartUploadID)
-			return "", nil, nil, fmt.Errorf("upload part %d: %w", partNumber, err)
+			return session, nil, nil, fmt.Errorf("upload part %d: %w", partNumber, err)
 		}
 
 		parts = append(parts, UploadedPart{
@@ -603,19 +698,55 @@ func (u *Uploader) uploadParts(ctx context.Context, req UploadRequest, session *
 		offset += int64(n)
 		partNumber++
 
-		if partNumber%10 == 0 {
-			logger.Printf("[CLOUD-UPLOAD] Progress: episode=%s parts=%d offset=%d/%d",
-				req.EpisodeID, len(parts), offset, fileSize)
-		}
+		logger.Printf("[CLOUD-UPLOAD] Progress: episode=%s parts=%d offset=%d/%d",
+			episodeID, len(parts), offset, fileSize)
 	}
 
-	// Complete multipart upload on OSS
-	if _, err := u.oss.CompleteMultipartUpload(ctx, session, multipartUploadID, parts); err != nil {
-		u.abortMultipartUpload(session, multipartUploadID)
-		return "", nil, nil, fmt.Errorf("complete multipart upload on OSS: %w", err)
+	return session, parts, partMD5s, nil
+}
+
+func (u *Uploader) ensureFreshUploadCredentials(ctx context.Context, session *UploadSession) (*UploadSession, error) { // returns session itself or a reissued replacement
+	if session == nil {
+		return nil, fmt.Errorf("missing upload session")
+	}
+	if time.Until(session.STSExpireAt) > u.stsRefreshWindow() { // expiry lies beyond the refresh window: reuse the session unchanged
+		return session, nil
 	}
+	return u.refreshUploadCredentials(ctx, session) // on failure this yields a nil session together with the error
+}
+
+func (u *Uploader) refreshUploadCredentials(ctx context.Context, session *UploadSession) (*UploadSession, error) { // reissues credentials via the gateway regardless of the current expiry
+	if u.gateway == nil {
+		return nil, fmt.Errorf("gateway client is not configured")
+	}
+	refreshed, err := u.gateway.ReissueUploadCredentials(ctx, session.UploadID) // NOTE(review): dereferences session without a nil check; callers must pass a non-nil session
+	if err != nil {
+		return nil, err
+	}
+	refreshed.PartSizeBytes = normalizedPartSizeBytes(session.PartSizeBytes) // keep the caller's part size rather than the reissued one
+	return refreshed, nil
+}
+
+func normalizedPartSizeBytes(partSizeBytes int64) int64 { // maps non-positive sizes to the default; positive sizes pass through
+	if partSizeBytes <= 0 {
+		return 8 * 1024 * 1024 // 8 MiB default
+	}
+	return partSizeBytes
+}
+
+func (u *Uploader) stsRefreshWindow() time.Duration { // remaining credential lifetime at or below which ensureFreshUploadCredentials refreshes
+	window := u.cfg.RequestTimeout
+	if u.cfg.OSSTimeout > window { // take the larger of the two configured timeouts
+		window = u.cfg.OSSTimeout
+	}
+	if window <= 0 { // no positive timeout configured: fall back to 30s
+		window = 30 * time.Second
+	}
+	return window + 30*time.Second // plus a fixed 30s margin
+}
 
-	return multipartUploadID, parts, partMD5s, nil
+func isSecurityTokenExpiredError(err error) bool { // reports whether err's message contains the text SecurityTokenExpired
+	return err != nil && strings.Contains(err.Error(), "SecurityTokenExpired") // NOTE(review): text match; assumes the OSS client includes the error code in Error()
 }
 
 // abortAndCleanupSession notifies the data-gateway to abort the logical upload session
@@ -680,8 +811,10 @@ func (u *Uploader) cleanupPersistedState(logicalUploadID string) {
 	}
 }
 
-// findPersistedStateByKey scans the active state directory for a state matching the given mcap key.
-func (u *Uploader) findPersistedStateByKey(mcapKey string) (*persistedUploadState, error) {
+// findPersistedStateByKey scans the active state directory for a state matching the given
+// MCAP key and asset id. Upload sessions are device-scoped and must not be reused
+// across different Data Platform devices even when the MCAP object key is identical.
+func (u *Uploader) findPersistedStateByKey(mcapKey string, assetID string) (*persistedUploadState, error) {
 	if u.cfg.PersistRootDir == "" {
 		return nil, nil
 	}
@@ -707,7 +840,7 @@ func (u *Uploader) findPersistedStateByKey(mcapKey string) (*persistedUploadStat
 			logger.Printf("[CLOUD-UPLOAD] Warning: failed to parse state file %s: %v", entry.Name(), err)
 			continue
 		}
-		if state.McapKey == mcapKey {
+		if state.McapKey == mcapKey && state.AssetID == assetID {
 			return &state, nil
 		}
 	}
diff --git a/internal/cloud/uploader_test.go b/internal/cloud/uploader_test.go
index b510f5f..99e3432 100644
--- a/internal/cloud/uploader_test.go
+++ b/internal/cloud/uploader_test.go
@@ -5,11 +5,14 @@
 package cloud
 
 import (
+	"bytes"
 	"context"
 	"encoding/json"
 	"errors"
+	"io"
 	"os"
 	"path/filepath"
+	"strings"
 	"testing"
 	"time"
 
@@ -122,11 +125,12 @@ func TestFindPersistedStateByKey(t *testing.T) {
 		LogicalUploadID: "logical-find-test",
 		UploadID:        "upload-find-test",
 		McapKey:         "episodes/7/find.mcap",
+		AssetID:         "asset-a",
 		FileSize:        256,
 		UpdatedAt:       time.Now(),
 	})
 
-	got, err := u.findPersistedStateByKey("episodes/7/find.mcap")
+	got, err := u.findPersistedStateByKey("episodes/7/find.mcap", "asset-a")
 	if err != nil {
 		t.Fatalf("findPersistedStateByKey: %v", err)
 	}
@@ -138,12 +142,36 @@ func TestFindPersistedStateByKey(t *testing.T) {
 	}
 }
 
+func TestFindPersistedStateByKey_DoesNotReuseDifferentAssetID(t *testing.T) { // state persisted for asset-a must not be returned for the same MCAP key under asset-b
+	dir := t.TempDir()
+	u := newTestUploader(dir)
+
+	activeDir := filepath.Join(dir, "data-gateway-client", "uploads", "active")
+	writeTempState(t, activeDir, &persistedUploadState{
+		Version:         1,
+		LogicalUploadID: "logical-device-a",
+		UploadID:        "upload-device-a",
+		McapKey:         "episodes/7/find.mcap",
+		AssetID:         "asset-a",
+		FileSize:        256,
+		UpdatedAt:       time.Now(),
+	})
+
+	got, err := u.findPersistedStateByKey("episodes/7/find.mcap", "asset-b") // same key, different asset id
+	if err != nil {
+		t.Fatalf("findPersistedStateByKey: %v", err)
+	}
+	if got != nil {
+		t.Fatalf("expected nil for different AssetID, got %+v", got)
+	}
+}
+
 // TestFindPersistedStateByKey_NotFound verifies nil is returned for unknown keys.
 func TestFindPersistedStateByKey_NotFound(t *testing.T) {
 	dir := t.TempDir()
 	u := newTestUploader(dir)
 
-	got, err := u.findPersistedStateByKey("episodes/99/missing.mcap")
+	got, err := u.findPersistedStateByKey("episodes/99/missing.mcap", "asset-a")
 	if err != nil {
 		t.Fatalf("unexpected error: %v", err)
 	}
@@ -156,7 +184,7 @@ func TestFindPersistedStateByKey_NotFound(t *testing.T) {
 func TestFindPersistedStateByKey_EmptyPersistRootDir(t *testing.T) {
 	u := newTestUploader("")
 
-	got, err := u.findPersistedStateByKey("episodes/1/file.mcap")
+	got, err := u.findPersistedStateByKey("episodes/1/file.mcap", "asset-a")
 	if err != nil {
 		t.Fatalf("unexpected error: %v", err)
 	}
@@ -197,6 +225,7 @@ func TestPersistedStateRoundTrip(t *testing.T) {
 		Endpoint:          "https://oss.example.com",
 		ObjectKey:         "uploads/1/test",
 		McapKey:           "episodes/1/test.mcap",
+		AssetID:           "asset-a",
 		FileSize:          4096,
 		UpdatedAt:         now,
 	}
@@ -225,6 +254,9 @@ func TestPersistedStateRoundTrip(t *testing.T) {
 	if decoded.McapKey != original.McapKey {
 		t.Errorf("McapKey = %q, want %q", decoded.McapKey, original.McapKey)
 	}
+	if decoded.AssetID != original.AssetID {
+		t.Errorf("AssetID = %q, want %q", decoded.AssetID, original.AssetID)
+	}
 }
 
 // TestPrepareUploadSession_PermanentFailure_FileSizeMismatch verifies that a persisted state
@@ -241,6 +273,7 @@ func TestPrepareUploadSession_PermanentFailure_FileSizeMismatch(t *testing.T) {
 		LogicalUploadID: "logical-size-mismatch",
 		UploadID:        "upload-size-mismatch",
 		McapKey:         "episodes/1/mismatch.mcap",
+		AssetID:         "asset-a",
 		FileSize:        1024, // persisted as 1024
 		UpdatedAt:       time.Now(),
 	})
@@ -250,6 +283,7 @@ func TestPrepareUploadSession_PermanentFailure_FileSizeMismatch(t *testing.T) {
 		context.Background(),
 		map[string]string{},
 		"episodes/1/mismatch.mcap",
+		"asset-a",
 		512, // actual size differs
 	)
 	if err == nil {
@@ -271,6 +305,7 @@ func TestPrepareUploadSession_PermanentFailure_CleanupOnSizeMismatch(t *testing.
 		LogicalUploadID: "logical-cleanup-mismatch",
 		UploadID:        "upload-cleanup-mismatch",
 		McapKey:         "episodes/2/cleanup.mcap",
+		AssetID:         "asset-a",
 		FileSize:        1024, // persisted size
 		RestartCount:    0,
 		UpdatedAt:       time.Now(),
@@ -280,6 +315,7 @@ func TestPrepareUploadSession_PermanentFailure_CleanupOnSizeMismatch(t *testing.
 		context.Background(),
 		map[string]string{},
 		"episodes/2/cleanup.mcap",
+		"asset-a",
 		512, // different from persisted
 	)
 	if err == nil {
@@ -326,13 +362,14 @@ func TestPrepareUploadSession_Restart_OldStatePreservedOnRPCFailure(t *testing.T
 		LogicalUploadID: "logical-old",
 		UploadID:        "upload-old",
 		McapKey:         "episodes/10/restart-rpc-fail.mcap",
+		AssetID:         "asset-a",
 		FileSize:        512,
 		RestartCount:    0,
 		UpdatedAt:       time.Now(),
 	})
 
 	u := newDecideResumeUploader(dir, gw, &fakeOSS{})
-	_, err := u.prepareUploadSession(context.Background(), map[string]string{}, "episodes/10/restart-rpc-fail.mcap", 512)
+	_, err := u.prepareUploadSession(context.Background(), map[string]string{}, "episodes/10/restart-rpc-fail.mcap", "asset-a", 512)
 	if err == nil {
 		t.Fatal("expected error when CreateLogicalUpload fails, got nil")
 	}
@@ -374,13 +411,14 @@ func TestPrepareUploadSession_Restart_NewStatePersisted_OldStateRemoved(t *testi
 		LogicalUploadID: "logical-old",
 		UploadID:        "upload-old",
 		McapKey:         "episodes/11/restart-ok.mcap",
+		AssetID:         "asset-a",
 		FileSize:        512,
 		RestartCount:    0,
 		UpdatedAt:       time.Now(),
 	})
 
 	u := newDecideResumeUploader(dir, gw, &fakeOSS{})
-	prepared, err := u.prepareUploadSession(context.Background(), map[string]string{}, "episodes/11/restart-ok.mcap", 512)
+	prepared, err := u.prepareUploadSession(context.Background(), map[string]string{}, "episodes/11/restart-ok.mcap", "asset-a", 512)
 	if err != nil {
 		t.Fatalf("unexpected error: %v", err)
 	}
@@ -528,7 +566,7 @@ func (f *fakeGateway) AbortUpload(_ context.Context, _ string, _ string) error {
 	return nil
 }
 
-func (f *fakeGateway) CompleteUpload(_ context.Context, _ string, _ int64, _ map[string]string, _ int32, _ string) error {
+func (f *fakeGateway) CompleteUpload(_ context.Context, _ string, _ int64, _ map[string]string, _ int32, _ string, _ int64) error { // trailing int64 mirrors the partSizeBytes parameter of gatewayClient.CompleteUpload
 	panic("fakeGateway.CompleteUpload called unexpectedly")
 }
 
@@ -538,6 +576,8 @@ type fakeOSS struct {
 	listPartsFn func(ctx context.Context, session *UploadSession, multipartUploadID string) error
 	// headObjectETagFn is called by HeadObjectETag; must be set for tests that reach it.
 	headObjectETagFn func(ctx context.Context, session *UploadSession) (string, error)
+	// uploadPartFn is called by UploadPart; must be set for tests that reach it.
+	uploadPartFn func(ctx context.Context, session *UploadSession, multipartUploadID string, partNumber int, body []byte) (string, error)
 }
 
 func (f *fakeOSS) ListParts(ctx context.Context, session *UploadSession, multipartUploadID string) error {
@@ -558,8 +598,11 @@ func (f *fakeOSS) InitiateMultipartUpload(_ context.Context, _ *UploadSession) (
 	panic("fakeOSS.InitiateMultipartUpload called unexpectedly")
 }
 
-func (f *fakeOSS) UploadPart(_ context.Context, _ *UploadSession, _ string, _ int, _ []byte) (string, error) {
-	panic("fakeOSS.UploadPart called unexpectedly")
+func (f *fakeOSS) UploadPart(ctx context.Context, session *UploadSession, multipartUploadID string, partNumber int, body []byte) (string, error) {
+	if f.uploadPartFn == nil { // no stub installed: the test did not expect UploadPart to be reached
+		panic("fakeOSS.UploadPart called unexpectedly")
+	}
+	return f.uploadPartFn(ctx, session, multipartUploadID, partNumber, body)
 }
 
 func (f *fakeOSS) CompleteMultipartUpload(_ context.Context, _ *UploadSession, _ string, _ []UploadedPart) (string, error) {
@@ -596,6 +639,215 @@ func makeSession(logicalID, uploadID string) *UploadSession {
 	}
 }
 
+func TestStreamMultipartParts_UploadsExpectedPartBoundaries(t *testing.T) { // 25 bytes at part size 10 must upload as parts 1..3 of 10, 10 and 5 bytes
+	var gotPartNumbers []int
+	var gotSizes []int
+	oss := &fakeOSS{
+		uploadPartFn: func(_ context.Context, _ *UploadSession, _ string, partNumber int, body []byte) (string, error) {
+			gotPartNumbers = append(gotPartNumbers, partNumber)
+			gotSizes = append(gotSizes, len(body))
+			return "etag", nil
+		},
+	}
+	u := newDecideResumeUploader("", &fakeGateway{}, oss)
+	session := makeSession("logical-stream", "upload-stream")
+	session.PartSizeBytes = 10
+
+	payload := []byte("abcdefghijklmnopqrstuvwxy") // 25 bytes
+	factory := func(_ context.Context, offset, length int64) (io.ReadCloser, error) {
+		end := int(offset + length)
+		if end > len(payload) { // clamp the requested range to the payload
+			end = len(payload)
+		}
+		return io.NopCloser(bytes.NewReader(payload[offset:end])), nil
+	}
+	_, parts, partMD5s, err := u.streamMultipartParts(
+		context.Background(),
+		"episode-stream",
+		session,
+		"multipart-stream",
+		int64(len(payload)),
+		session.PartSizeBytes,
+		factory,
+	)
+	if err != nil {
+		t.Fatalf("streamMultipartParts() error = %v", err)
+	}
+	if len(parts) != 3 {
+		t.Fatalf("uploaded part count = %d, want 3", len(parts))
+	}
+	if len(partMD5s) != 3 {
+		t.Fatalf("part MD5 count = %d, want 3", len(partMD5s))
+	}
+
+	wantPartNumbers := []int{1, 2, 3}
+	wantSizes := []int{10, 10, 5}
+	for i := range wantPartNumbers {
+		if gotPartNumbers[i] != wantPartNumbers[i] {
+			t.Fatalf("part number[%d] = %d, want %d", i, gotPartNumbers[i], wantPartNumbers[i])
+		}
+		if gotSizes[i] != wantSizes[i] {
+			t.Fatalf("part size[%d] = %d, want %d", i, gotSizes[i], wantSizes[i])
+		}
+	}
+}
+
+func TestStreamMultipartParts_EarlyEOFStopsInsteadOfUploadingEmptyParts(t *testing.T) { // a source shorter than the declared size must fail with a short-read error after only the complete part was uploaded
+	var uploadedPartNumbers []int
+	oss := &fakeOSS{
+		uploadPartFn: func(_ context.Context, _ *UploadSession, _ string, partNumber int, body []byte) (string, error) {
+			uploadedPartNumbers = append(uploadedPartNumbers, partNumber)
+			if len(body) == 0 {
+				t.Fatalf("uploaded empty part %d", partNumber)
+			}
+			return "etag", nil
+		},
+	}
+	u := newDecideResumeUploader("", &fakeGateway{}, oss)
+	session := makeSession("logical-short", "upload-short")
+	session.PartSizeBytes = 10
+
+	payload := []byte("abcdefghijkl") // 12 bytes — part 2 will fail with short read
+	factory := func(_ context.Context, offset, length int64) (io.ReadCloser, error) {
+		end := int(offset + length)
+		if end > len(payload) { // clamp, so the second range yields only 2 bytes
+			end = len(payload)
+		}
+		return io.NopCloser(bytes.NewReader(payload[offset:end])), nil
+	}
+
+	_, _, _, err := u.streamMultipartParts(
+		context.Background(),
+		"episode-short",
+		session,
+		"multipart-short",
+		25, // declared file size exceeds the 12 bytes the factory can serve
+		session.PartSizeBytes,
+		factory,
+	)
+	if err == nil {
+		t.Fatal("expected error for early EOF, got nil")
+	}
+	if !strings.Contains(err.Error(), "expected 10 bytes, got 2") {
+		t.Fatalf("error = %q, want short read details", err.Error())
+	}
+	if len(uploadedPartNumbers) != 1 || uploadedPartNumbers[0] != 1 {
+		t.Fatalf("uploaded parts = %v, want only first complete part", uploadedPartNumbers)
+	}
+}
+
+func TestStreamMultipartParts_RefreshesCredentialsBeforeUploadPart(t *testing.T) { // a session about to expire must be reissued before UploadPart while keeping the original part size
+	var reissueCalls int
+	gw := &fakeGateway{
+		reissueCredentialsFn: func(_ context.Context, uploadID string) (*UploadSession, error) {
+			reissueCalls++
+			if uploadID != "upload-expiring" {
+				t.Fatalf("uploadID = %q, want upload-expiring", uploadID)
+			}
+			refreshed := makeSession("logical-expiring", uploadID)
+			refreshed.STSAccessKeyID = "fresh-key"
+			refreshed.PartSizeBytes = 99 // deliberately differs from the session's 4 to prove it is ignored
+			return refreshed, nil
+		},
+	}
+
+	var usedAccessKeyID string
+	var usedPartSizeBytes int64
+	oss := &fakeOSS{
+		uploadPartFn: func(_ context.Context, session *UploadSession, _ string, _ int, _ []byte) (string, error) {
+			usedAccessKeyID = session.STSAccessKeyID
+			usedPartSizeBytes = session.PartSizeBytes
+			return "etag", nil
+		},
+	}
+	u := newDecideResumeUploader("", gw, oss)
+	session := makeSession("logical-expiring", "upload-expiring")
+	session.STSAccessKeyID = "stale-key"
+	session.STSExpireAt = time.Now().Add(10 * time.Second) // inside stsRefreshWindow, which is always more than 30s
+	session.PartSizeBytes = 4
+
+	payload := []byte("abcd")
+	factory := func(_ context.Context, offset, length int64) (io.ReadCloser, error) {
+		return io.NopCloser(bytes.NewReader(payload[offset : offset+length])), nil
+	}
+
+	finalSession, parts, _, err := u.streamMultipartParts(context.Background(), "episode-expiring", session, "multipart-expiring", int64(len(payload)), session.PartSizeBytes, factory)
+	if err != nil {
+		t.Fatalf("streamMultipartParts() error = %v", err)
+	}
+	if len(parts) != 1 {
+		t.Fatalf("uploaded part count = %d, want 1", len(parts))
+	}
+	if reissueCalls != 1 {
+		t.Fatalf("ReissueUploadCredentials calls = %d, want 1", reissueCalls)
+	}
+	if usedAccessKeyID != "fresh-key" {
+		t.Fatalf("UploadPart access key = %q, want fresh-key", usedAccessKeyID)
+	}
+	if usedPartSizeBytes != 4 {
+		t.Fatalf("UploadPart part size = %d, want fixed original size 4", usedPartSizeBytes)
+	}
+	if finalSession.PartSizeBytes != 4 {
+		t.Fatalf("final session part size = %d, want fixed original size 4", finalSession.PartSizeBytes)
+	}
+}
+
+func TestStreamMultipartParts_RetriesCurrentPartAfterSecurityTokenExpired(t *testing.T) { // a SecurityTokenExpired failure must trigger one reissue and a retry of the same part number
+	var reissueCalls int
+	gw := &fakeGateway{
+		reissueCredentialsFn: func(_ context.Context, uploadID string) (*UploadSession, error) {
+			reissueCalls++
+			refreshed := makeSession("logical-retry", uploadID)
+			refreshed.STSAccessKeyID = "fresh-key"
+			return refreshed, nil
+		},
+	}
+
+	var uploadPartCalls int
+	var partNumbers []int
+	var usedAccessKeyIDs []string
+	oss := &fakeOSS{
+		uploadPartFn: func(_ context.Context, session *UploadSession, _ string, partNumber int, _ []byte) (string, error) {
+			uploadPartCalls++
+			partNumbers = append(partNumbers, partNumber)
+			usedAccessKeyIDs = append(usedAccessKeyIDs, session.STSAccessKeyID)
+			if uploadPartCalls == 1 { // fail only the first attempt
+				return "", errors.New("oss returned status 403: <Code>SecurityTokenExpired</Code>")
+			}
+			return "etag", nil
+		},
+	}
+	u := newDecideResumeUploader("", gw, oss)
+	session := makeSession("logical-retry", "upload-retry") // NOTE(review): assumes makeSession sets an expiry beyond the refresh window, so no proactive reissue happens
+	session.STSAccessKeyID = "stale-key"
+	session.PartSizeBytes = 4
+
+	payload := []byte("abcd")
+	factory := func(_ context.Context, offset, length int64) (io.ReadCloser, error) {
+		return io.NopCloser(bytes.NewReader(payload[offset : offset+length])), nil
+	}
+
+	_, parts, _, err := u.streamMultipartParts(context.Background(), "episode-retry", session, "multipart-retry", int64(len(payload)), session.PartSizeBytes, factory)
+	if err != nil {
+		t.Fatalf("streamMultipartParts() error = %v", err)
+	}
+	if len(parts) != 1 {
+		t.Fatalf("uploaded part count = %d, want 1", len(parts))
+	}
+	if reissueCalls != 1 {
+		t.Fatalf("ReissueUploadCredentials calls = %d, want 1", reissueCalls)
+	}
+	if uploadPartCalls != 2 {
+		t.Fatalf("UploadPart calls = %d, want 2", uploadPartCalls)
+	}
+	if partNumbers[0] != 1 || partNumbers[1] != 1 {
+		t.Fatalf("part numbers = %v, want [1 1]", partNumbers)
+	}
+	if usedAccessKeyIDs[0] != "stale-key" || usedAccessKeyIDs[1] != "fresh-key" {
+		t.Fatalf("access keys = %v, want [stale-key fresh-key]", usedAccessKeyIDs)
+	}
+}
+
 // =============================================================================
 // decideResumeAction unit tests
 // =============================================================================
diff --git a/internal/config/config.go b/internal/config/config.go
index 0f1a635..4df907e 100644
--- a/internal/config/config.go
+++ b/internal/config/config.go
@@ -8,6 +8,7 @@ package config
 import (
 	"fmt"
 	"os"
+	"path/filepath"
 	"strconv"
 	"strings"
 )
@@ -87,6 +88,7 @@ type SyncConfig struct {
 	RetryJitterSec     int    // max additive jitter in seconds
 	PersistRootDir     string // root directory for persisting upload state across restarts; empty disables persistence
 	MaxRestartCount    int    // max number of upload restarts before permanent failure; 0 uses uploader default (3)
+	DPConfigPath       string // data-platform config path for direct device-profile uploads
 }
 
 // FeaturesConfig feature flags configuration
@@ -203,6 +205,7 @@ func Load() (*Config, error) {
 			RetryJitterSec:     getEnvInt("KEYSTONE_SYNC_RETRY_JITTER_SEC", 30),
 			PersistRootDir:     getEnv("KEYSTONE_SYNC_PERSIST_ROOT_DIR", ""),
 			MaxRestartCount:    getEnvInt("KEYSTONE_SYNC_MAX_RESTART_COUNT", 3),
+			DPConfigPath:       getEnv("KEYSTONE_SYNC_DP_CONFIG", defaultDPConfigPath()),
 		},
 		Auth: AuthConfig{
 			JWTSecret:             getEnv("KEYSTONE_JWT_SECRET", ""),
@@ -274,17 +277,18 @@ func (c *Config) Validate() error {
 		return fmt.Errorf("KEYSTONE_ADMIN_USERNAME and KEYSTONE_ADMIN_PASSWORD must both be set or both be empty")
 	}
 	if c.Sync.Enabled {
-		if strings.TrimSpace(c.Sync.AuthEndpoint) == "" {
-			return fmt.Errorf("sync auth endpoint is required when sync is enabled")
+		c.Sync.DPConfigPath = strings.TrimSpace(c.Sync.DPConfigPath)
+		if c.Sync.DPConfigPath == "" {
+			return fmt.Errorf("KEYSTONE_SYNC_DP_CONFIG is required when sync is enabled")
 		}
-		if strings.TrimSpace(c.Sync.GatewayEndpoint) == "" {
-			return fmt.Errorf("sync gateway endpoint is required when sync is enabled")
+		expandedDPConfigPath, err := expandHomePath(c.Sync.DPConfigPath)
+		if err != nil {
+			return fmt.Errorf("KEYSTONE_SYNC_DP_CONFIG %q is invalid: %w", c.Sync.DPConfigPath, err)
 		}
-		apiKey := strings.TrimSpace(c.Sync.APIKey)
-		if apiKey == "" {
-			return fmt.Errorf("KEYSTONE_CLOUD_API_KEY is required when sync is enabled")
-		}
-		c.Sync.APIKey = apiKey
+		c.Sync.DPConfigPath = expandedDPConfigPath
+		c.Sync.AuthEndpoint = strings.TrimSpace(c.Sync.AuthEndpoint)
+		c.Sync.GatewayEndpoint = strings.TrimSpace(c.Sync.GatewayEndpoint)
+		c.Sync.APIKey = strings.TrimSpace(c.Sync.APIKey)
 		if c.Sync.BatchSize <= 0 {
 			return fmt.Errorf("sync batch size must be greater than 0 when sync is enabled")
 		}
@@ -329,6 +333,28 @@ func getEnv(key, fallback string) string {
 	return fallback
 }
 
+// defaultDPConfigPath returns <home>/.archebase/config.json, or the "~/" form if home is unknown.
+func defaultDPConfigPath() string {
+	if homeDir, err := os.UserHomeDir(); err == nil && strings.TrimSpace(homeDir) != "" {
+		return filepath.Join(homeDir, ".archebase", "config.json")
+	}
+	return "~/.archebase/config.json"
+}
+
+func expandHomePath(path string) (string, error) {
+	if !(path == "~" || strings.HasPrefix(path, "~/")) {
+		return path, nil // not home-relative: hand it back untouched
+	}
+	homeDir, lookupErr := os.UserHomeDir()
+	switch {
+	case lookupErr != nil || strings.TrimSpace(homeDir) == "":
+		return "", fmt.Errorf("home directory is not available")
+	case path == "~":
+		return homeDir, nil // a bare tilde is the home directory itself
+	}
+	return filepath.Join(homeDir, path[len("~/"):]), nil
+}
+
 func getEnvInt(key string, fallback int) int {
 	if val := os.Getenv(key); val != "" {
 		if i, err := strconv.Atoi(val); err == nil {
diff --git a/internal/config/config_test.go b/internal/config/config_test.go
index dd918e6..4578975 100644
--- a/internal/config/config_test.go
+++ b/internal/config/config_test.go
@@ -7,6 +7,7 @@ package config
 
 import (
 	"os"
+	"path/filepath"
 	"strings"
 	"testing"
 )
@@ -21,6 +22,7 @@ func TestLoad(t *testing.T) {
 		"KEYSTONE_MINIO_SECRET_KEY":       os.Getenv("KEYSTONE_MINIO_SECRET_KEY"),
 		"KEYSTONE_FACTORY_ID":             os.Getenv("KEYSTONE_FACTORY_ID"),
 		"KEYSTONE_SYNC_AUTO_SCAN_ENABLED": os.Getenv("KEYSTONE_SYNC_AUTO_SCAN_ENABLED"),
+		"KEYSTONE_SYNC_DP_CONFIG":         os.Getenv("KEYSTONE_SYNC_DP_CONFIG"),
 	}
 	defer func() {
 		// Restore original environment variables
@@ -35,6 +37,7 @@ func TestLoad(t *testing.T) {
 
 	// Set test environment variables
 	os.Unsetenv("KEYSTONE_SYNC_AUTO_SCAN_ENABLED")
+	os.Unsetenv("KEYSTONE_SYNC_DP_CONFIG")
 	os.Setenv("KEYSTONE_MYSQL_PASSWORD", "test-password")
 	os.Setenv("KEYSTONE_MINIO_ACCESS_KEY", "test-access-key")
 	os.Setenv("KEYSTONE_MINIO_SECRET_KEY", "test-secret-key")
@@ -69,6 +72,13 @@ func TestLoad(t *testing.T) {
 	if cfg.Sync.AutoScanEnabled {
 		t.Error("Load().Sync.AutoScanEnabled should default to false")
 	}
+	home, err := os.UserHomeDir()
+	if err != nil {
+		t.Fatalf("os.UserHomeDir() error = %v", err)
+	}
+	if cfg.Sync.DPConfigPath != filepath.Join(home, ".archebase", "config.json") {
+		t.Errorf("Load().Sync.DPConfigPath = %q, want default ~/.archebase/config.json", cfg.Sync.DPConfigPath)
+	}
 
 	// Verify QA configuration
 	if !cfg.QA.Enabled {
@@ -270,7 +280,7 @@ func TestConfigValidate(t *testing.T) {
 	}
 }
 
-func TestValidateSyncAPIKey(t *testing.T) {
+func TestValidateSyncDPConfig(t *testing.T) {
 	validBase := Config{
 		Server:   ServerConfig{Mode: "edge"},
 		Database: DatabaseConfig{DSN: "user:pass@tcp(localhost:3306)/db"},
@@ -278,7 +288,7 @@ func TestValidateSyncAPIKey(t *testing.T) {
 		Auth:     AuthConfig{JWTSecret: "jwt-secret"},
 	}
 
-	t.Run("sync disabled — no API key required", func(t *testing.T) {
+	t.Run("sync disabled — no DP config required", func(t *testing.T) {
 		cfg := validBase
 		cfg.Sync = SyncConfig{Enabled: false}
 		if err := cfg.Validate(); err != nil {
@@ -286,13 +296,11 @@ func TestValidateSyncAPIKey(t *testing.T) {
 		}
 	})
 
-	t.Run("sync enabled — missing API key", func(t *testing.T) {
+	t.Run("sync enabled — missing DP config", func(t *testing.T) {
 		cfg := validBase
 		cfg.Sync = SyncConfig{
 			Enabled:           true,
-			AuthEndpoint:      "auth:443",
-			GatewayEndpoint:   "gateway:443",
-			APIKey:            "",
+			DPConfigPath:      "",
 			BatchSize:         10,
 			MaxRetries:        5,
 			MaxConcurrent:     2,
@@ -302,18 +310,38 @@ func TestValidateSyncAPIKey(t *testing.T) {
 			RetryBaseSec:      30,
 			RetryMaxSec:       1800,
 		}
-		if err := cfg.Validate(); err == nil {
-			t.Error("Validate() expected error for missing API key, got nil")
+		if err := cfg.Validate(); err == nil || !strings.Contains(err.Error(), "KEYSTONE_SYNC_DP_CONFIG") {
+			t.Fatalf("Validate() error = %v, want KEYSTONE_SYNC_DP_CONFIG error", err)
+		}
+	})
+
+	t.Run("sync enabled — old cloud endpoint and API key are not required", func(t *testing.T) {
+		cfg := validBase
+		cfg.Sync = SyncConfig{
+			Enabled:           true,
+			DPConfigPath:      "/etc/keystone/dp-config.json",
+			BatchSize:         10,
+			MaxRetries:        5,
+			MaxConcurrent:     2,
+			WorkerIntervalSec: 60,
+			RequestTimeoutSec: 30,
+			OSSTimeoutSec:     300,
+			RetryBaseSec:      30,
+			RetryMaxSec:       1800,
+		}
+		if err := cfg.Validate(); err != nil {
+			t.Fatalf("Validate() unexpected error = %v", err)
+		}
+		if cfg.Sync.AuthEndpoint != "" || cfg.Sync.GatewayEndpoint != "" || cfg.Sync.APIKey != "" {
+			t.Fatalf("legacy cloud config should remain optional and empty: %+v", cfg.Sync)
 		}
 	})
 
-	t.Run("sync enabled — arbitrary opaque API key accepted", func(t *testing.T) {
+	t.Run("sync enabled — trims DP config whitespace", func(t *testing.T) {
 		cfg := validBase
 		cfg.Sync = SyncConfig{
 			Enabled:           true,
-			AuthEndpoint:      "auth:443",
-			GatewayEndpoint:   "gateway:443",
-			APIKey:            "notvalidbase64!!!",
+			DPConfigPath:      "  /etc/keystone/dp-config.json  ",
 			BatchSize:         10,
 			MaxRetries:        5,
 			MaxConcurrent:     2,
@@ -326,18 +354,18 @@ func TestValidateSyncAPIKey(t *testing.T) {
 		if err := cfg.Validate(); err != nil {
 			t.Fatalf("Validate() unexpected error = %v", err)
 		}
-		if cfg.Sync.APIKey != "notvalidbase64!!!" {
-			t.Errorf("APIKey = %q, want %q", cfg.Sync.APIKey, "notvalidbase64!!!")
+		if cfg.Sync.DPConfigPath != "/etc/keystone/dp-config.json" {
+			t.Errorf("DPConfigPath = %q, want trimmed path", cfg.Sync.DPConfigPath)
 		}
 	})
 
-	t.Run("sync enabled — trims API key whitespace", func(t *testing.T) {
+	t.Run("sync enabled — expands DP config home path", func(t *testing.T) {
+		home := t.TempDir()
+		t.Setenv("HOME", home)
 		cfg := validBase
 		cfg.Sync = SyncConfig{
 			Enabled:           true,
-			AuthEndpoint:      "auth:443",
-			GatewayEndpoint:   "gateway:443",
-			APIKey:            "  cloud-issued-key  ",
+			DPConfigPath:      "~/.archebase/config.json",
 			BatchSize:         10,
 			MaxRetries:        5,
 			MaxConcurrent:     2,
@@ -350,8 +378,8 @@ func TestValidateSyncAPIKey(t *testing.T) {
 		if err := cfg.Validate(); err != nil {
 			t.Fatalf("Validate() unexpected error = %v", err)
 		}
-		if cfg.Sync.APIKey != "cloud-issued-key" {
-			t.Errorf("APIKey = %q, want %q", cfg.Sync.APIKey, "cloud-issued-key")
+		if cfg.Sync.DPConfigPath != filepath.Join(home, ".archebase", "config.json") {
+			t.Errorf("DPConfigPath = %q, want expanded home path", cfg.Sync.DPConfigPath)
 		}
 	})
 }
diff --git a/internal/server/server.go b/internal/server/server.go
index ff908c0..30c65ed 100644
--- a/internal/server/server.go
+++ b/internal/server/server.go
@@ -76,7 +76,7 @@ func axonTransferWriteTimeout(cfg *config.TransferConfig) time.Duration {
 
 // New creates a new server instance.
 // db and s3Client are optional; pass nil to disable Verified ACK.
-// syncWorker is optional; pass nil to disable cloud sync API.
+// syncWorker is optional; pass nil to disable cloud sync APIs.
 func New(cfg *config.Config, db *sqlx.DB, s3Client *s3.Client, syncWorker *services.SyncWorker) *Server {
 	// Create Gin engine
 	gin.SetMode(gin.ReleaseMode)
diff --git a/internal/services/dp_asset_resolver.go b/internal/services/dp_asset_resolver.go
new file mode 100644
index 0000000..78f3e29
--- /dev/null
+++ b/internal/services/dp_asset_resolver.go
@@ -0,0 +1,61 @@
+// SPDX-FileCopyrightText: 2026 ArcheBase
+//
+// SPDX-License-Identifier: MulanPSL-2.0
+
+package services
+
+import (
+	"context"
+	"database/sql"
+	"encoding/json"
+	"fmt"
+	"strings"
+
+	"github.com/jmoiron/sqlx"
+)
+
+func assetIDFromEpisodeMetadata(metadata sql.NullString) string {
+	var fields map[string]interface{}
+	switch {
+	case !metadata.Valid, strings.TrimSpace(metadata.String) == "":
+		return "" // NULL or blank metadata carries no asset_id
+	case json.Unmarshal([]byte(metadata.String), &fields) != nil:
+		return "" // malformed or non-object JSON counts as "no asset_id"
+	}
+	assetID, _ := fields["asset_id"].(string) // missing or non-string values yield ""
+	return strings.TrimSpace(assetID)
+}
+
+func resolveAssetIDForEpisode(ctx context.Context, db *sqlx.DB, episodeID int64, metadata sql.NullString, workstationID sql.NullInt64) (string, error) {
+	// An asset_id recorded in the episode metadata takes precedence over any database lookup.
+	if fromMetadata := assetIDFromEpisodeMetadata(metadata); fromMetadata != "" {
+		return fromMetadata, nil
+	}
+	switch {
+	case db == nil:
+		return "", fmt.Errorf("database is not available")
+	case !workstationID.Valid || workstationID.Int64 <= 0:
+		return "", fmt.Errorf("episode %d has no asset_id metadata and no workstation_id", episodeID)
+	}
+	wsID := workstationID.Int64
+	var robot struct {
+		AssetID sql.NullString `db:"asset_id"`
+	}
+	lookupErr := db.GetContext(ctx, &robot, `
+		SELECT r.asset_id
+		FROM workstations ws
+		LEFT JOIN robots r ON r.id = ws.robot_id
+		WHERE ws.id = ?
+		LIMIT 1
+	`, wsID)
+	switch {
+	case lookupErr == sql.ErrNoRows:
+		return "", fmt.Errorf("episode %d workstation %d not found while resolving asset_id", episodeID, wsID)
+	case lookupErr != nil:
+		return "", fmt.Errorf("resolve asset_id for episode %d workstation %d: %w", episodeID, wsID, lookupErr)
+	}
+	if resolved := strings.TrimSpace(robot.AssetID.String); robot.AssetID.Valid && resolved != "" {
+		return resolved, nil
+	}
+	return "", fmt.Errorf("episode %d workstation %d has no robot asset_id", episodeID, wsID)
+}
diff --git a/internal/services/dp_asset_resolver_test.go b/internal/services/dp_asset_resolver_test.go
new file mode 100644
index 0000000..d000738
--- /dev/null
+++ b/internal/services/dp_asset_resolver_test.go
@@ -0,0 +1,112 @@
+// SPDX-FileCopyrightText: 2026 ArcheBase
+//
+// SPDX-License-Identifier: MulanPSL-2.0
+
+package services
+
+import (
+	"context"
+	"database/sql"
+	"strings"
+	"testing"
+
+	"github.com/jmoiron/sqlx"
+	_ "modernc.org/sqlite"
+)
+
+func newTestAssetResolverDB(t *testing.T) *sqlx.DB { // test helper: SQLite handle with empty robots and workstations tables
+	t.Helper()
+	db, err := sqlx.Open("sqlite", ":memory:") // NOTE(review): ":memory:" is per-connection in SQLite; confirm the pool never opens a second one
+	if err != nil {
+		t.Fatalf("open sqlite db: %v", err)
+	}
+	for _, stmt := range []string{ // only the columns the resolver query and the seeds below touch
+		`CREATE TABLE robots (
+			id INTEGER PRIMARY KEY,
+			device_id TEXT NOT NULL,
+			asset_id TEXT,
+			deleted_at TIMESTAMP NULL
+		)`,
+		`CREATE TABLE workstations (
+			id INTEGER PRIMARY KEY,
+			robot_id INTEGER,
+			deleted_at TIMESTAMP NULL
+		)`,
+	} {
+		if _, err := db.Exec(stmt); err != nil {
+			_ = db.Close() // Cleanup is not registered yet, so close by hand before failing
+			t.Fatalf("create schema: %v", err)
+		}
+	}
+	t.Cleanup(func() { _ = db.Close() }) // close error deliberately ignored during teardown
+	return db
+}
+
+func TestResolveAssetIDForEpisode_MetadataWins(t *testing.T) { // metadata asset_id must win over the workstation's robot
+	db := newTestAssetResolverDB(t)
+	if _, err := db.Exec(`INSERT INTO robots (id, device_id, asset_id) VALUES (1, 'local-device', 'fallback-asset')`); err != nil {
+		t.Fatalf("seed robot: %v", err)
+	}
+	if _, err := db.Exec(`INSERT INTO workstations (id, robot_id) VALUES (10, 1)`); err != nil {
+		t.Fatalf("seed workstation: %v", err)
+	}
+
+	got, err := resolveAssetIDForEpisode(
+		context.Background(),
+		db,
+		1,
+		sql.NullString{String: `{"asset_id":" snapshot-asset "}`, Valid: true}, // padded on purpose: the result must come back trimmed
+		sql.NullInt64{Int64: 10, Valid: true},
+	)
+	if err != nil {
+		t.Fatalf("resolveAssetIDForEpisode() error = %v", err)
+	}
+	if got != "snapshot-asset" { // "fallback-asset" here would mean the database row was consulted
+		t.Fatalf("asset_id=%q want snapshot-asset", got)
+	}
+}
+
+func TestResolveAssetIDForEpisode_FallbackReadsSoftDeletedWorkstation(t *testing.T) { // lookup must not filter on deleted_at
+	db := newTestAssetResolverDB(t)
+	if _, err := db.Exec(`INSERT INTO robots (id, device_id, asset_id) VALUES (1, 'local-device', 'fallback-asset')`); err != nil {
+		t.Fatalf("seed robot: %v", err)
+	}
+	if _, err := db.Exec(`INSERT INTO workstations (id, robot_id, deleted_at) VALUES (10, 1, CURRENT_TIMESTAMP)`); err != nil { // workstation row is soft-deleted
+		t.Fatalf("seed workstation: %v", err)
+	}
+
+	got, err := resolveAssetIDForEpisode(
+		context.Background(),
+		db,
+		1,
+		sql.NullString{}, // no metadata, so the workstation lookup is the only source
+		sql.NullInt64{Int64: 10, Valid: true},
+	)
+	if err != nil {
+		t.Fatalf("resolveAssetIDForEpisode() error = %v", err)
+	}
+	if got != "fallback-asset" {
+		t.Fatalf("asset_id=%q want fallback-asset", got)
+	}
+}
+
+func TestResolveAssetIDForEpisode_MissingDoesNotFallbackToLocalDeviceID(t *testing.T) { // NULL asset_id must be an error
+	db := newTestAssetResolverDB(t)
+	if _, err := db.Exec(`INSERT INTO robots (id, device_id, asset_id) VALUES (1, 'local-device', NULL)`); err != nil { // device_id is set, asset_id is not
+		t.Fatalf("seed robot: %v", err)
+	}
+	if _, err := db.Exec(`INSERT INTO workstations (id, robot_id) VALUES (10, 1)`); err != nil {
+		t.Fatalf("seed workstation: %v", err)
+	}
+
+	_, err := resolveAssetIDForEpisode(
+		context.Background(),
+		db,
+		1,
+		sql.NullString{},
+		sql.NullInt64{Int64: 10, Valid: true},
+	)
+	if err == nil || !strings.Contains(err.Error(), "asset_id") { // only the error text is checked, not the returned value
+		t.Fatalf("error=%v want asset_id missing error", err)
+	}
+}
diff --git a/internal/services/dp_config_loader.go b/internal/services/dp_config_loader.go
new file mode 100644
index 0000000..f85f773
--- /dev/null
+++ b/internal/services/dp_config_loader.go
@@ -0,0 +1,171 @@
+// SPDX-FileCopyrightText: 2026 ArcheBase
+//
+// SPDX-License-Identifier: MulanPSL-2.0
+
+package services
+
+import (
+	"encoding/json"
+	"fmt"
+	"net"
+	"net/url"
+	"os"
+	"strings"
+)
+
+// DPConfigFile is the subset of the data-platform config JSON that direct sync decodes.
+type DPConfigFile struct {
+	Version   *int              `json:"version,omitempty"` // nil when the file has no "version" key; the loader accepts only 3 when set
+	Endpoints DPConfigEndpoints `json:"endpoints"`         // raw auth and gateway endpoint strings
+	Devices   []DPDeviceProfile `json:"devices"`           // device profiles; the loader rejects blank or duplicate deviceId values
+}
+
+// DPConfigEndpoints holds the raw auth and gateway endpoint strings of a DP config file.
+type DPConfigEndpoints struct {
+	Auth    string `json:"auth"`    // normalized by parseDPResolvedEndpoint: http(s) URL or bare host[:port]
+	Gateway string `json:"gateway"` // same accepted forms as Auth
+}
+
+// DPDeviceProfile holds the upload credential and tags configured for one DP device.
+type DPDeviceProfile struct {
+	DeviceID string            `json:"deviceId"` // trimmed by the loader, then matched against the requested asset_id
+	APIKey   string            `json:"apiKey"`   // #nosec G117 -- operator-provided local DP upload config credential
+	Tags     map[string]string `json:"tags"`     // loader requires at least one entry and no empty key; empty values are allowed
+}
+
+// DPResolvedEndpoint is an upload service endpoint as normalized by parseDPResolvedEndpoint.
+type DPResolvedEndpoint struct {
+	Target     string // host:port for URL input (default port filled in); bare input is kept as written
+	UseTLS     bool   // true only when the endpoint was given with the https scheme
+	ServerName string // URL hostname for https endpoints, otherwise empty
+}
+
+// DPDeviceUploadConfig is the result of loadDPDeviceUploadConfig for one asset ID.
+type DPDeviceUploadConfig struct {
+	ConfigPath string             // trimmed path of the file the values were read from
+	Auth       DPResolvedEndpoint // parsed from endpoints.auth
+	Gateway    DPResolvedEndpoint // parsed from endpoints.gateway
+	Profile    DPDeviceProfile    // selected device profile with DeviceID and APIKey trimmed
+}
+
+// loadDPDeviceUploadConfig reads the DP config file at configPath and returns its
+// endpoints together with the device profile whose deviceId equals assetID.
+func loadDPDeviceUploadConfig(configPath string, assetID string) (*DPDeviceUploadConfig, error) {
+	configPath, assetID = strings.TrimSpace(configPath), strings.TrimSpace(assetID)
+	switch {
+	case configPath == "":
+		return nil, fmt.Errorf("KEYSTONE_SYNC_DP_CONFIG is required")
+	case assetID == "":
+		return nil, fmt.Errorf("asset_id is required")
+	}
+
+	raw, err := os.ReadFile(configPath) //nolint:gosec // operator-controlled config path
+	if err != nil {
+		return nil, fmt.Errorf("read DP config %s: %w", configPath, err)
+	}
+
+	var file DPConfigFile
+	if err := json.Unmarshal(raw, &file); err != nil {
+		return nil, fmt.Errorf("parse DP config %s: %w", configPath, err)
+	}
+	if v := file.Version; v != nil && *v != 3 {
+		return nil, fmt.Errorf("DP config %s has unsupported version %d", configPath, *v)
+	}
+
+	auth, err := parseDPResolvedEndpoint(file.Endpoints.Auth)
+	if err != nil {
+		return nil, fmt.Errorf("invalid endpoints.auth in DP config %s: %w", configPath, err)
+	}
+	gateway, err := parseDPResolvedEndpoint(file.Endpoints.Gateway)
+	if err != nil {
+		return nil, fmt.Errorf("invalid endpoints.gateway in DP config %s: %w", configPath, err)
+	}
+
+	// Index every profile by trimmed deviceId so a blank or duplicate ID anywhere in the file is rejected.
+	byID := make(map[string]DPDeviceProfile, len(file.Devices))
+	for i, entry := range file.Devices {
+		id := strings.TrimSpace(entry.DeviceID)
+		if id == "" {
+			return nil, fmt.Errorf("DP config %s devices[%d].deviceId is empty", configPath, i)
+		}
+		if _, seen := byID[id]; seen {
+			return nil, fmt.Errorf("DP config %s has duplicate deviceId %q", configPath, id)
+		}
+		entry.DeviceID = id
+		byID[id] = entry
+	}
+
+	// Credential and tag checks apply only to the profile selected by assetID.
+	selected, found := byID[assetID]
+	if !found {
+		return nil, fmt.Errorf("DP config %s has no device profile for asset_id %q", configPath, assetID)
+	}
+	if selected.APIKey = strings.TrimSpace(selected.APIKey); selected.APIKey == "" {
+		return nil, fmt.Errorf("DP config %s device %q apiKey is empty", configPath, assetID)
+	}
+	if len(selected.Tags) == 0 {
+		return nil, fmt.Errorf("DP config %s device %q tags must be non-empty", configPath, assetID)
+	}
+	if _, hasEmptyKey := selected.Tags[""]; hasEmptyKey {
+		return nil, fmt.Errorf("DP config %s device %q has an empty tag key", configPath, assetID)
+	}
+
+	return &DPDeviceUploadConfig{
+		ConfigPath: configPath,
+		Auth:       auth,
+		Gateway:    gateway,
+		Profile:    selected,
+	}, nil
+}
+
+func parseDPResolvedEndpoint(raw string) (DPResolvedEndpoint, error) {
+	value := strings.TrimSpace(raw)
+	if value == "" {
+		return DPResolvedEndpoint{}, fmt.Errorf("endpoint is required")
+	}
+
+	if strings.Contains(value, "://") {
+		parsed, err := url.Parse(value)
+		if err != nil {
+			return DPResolvedEndpoint{}, err
+		}
+		if parsed.Scheme != "http" && parsed.Scheme != "https" {
+			return DPResolvedEndpoint{}, fmt.Errorf("unsupported scheme %q", parsed.Scheme)
+		}
+		if parsed.Host == "" || parsed.User != nil {
+			return DPResolvedEndpoint{}, fmt.Errorf("endpoint must be host[:port]")
+		}
+		if parsed.Path != "" || parsed.RawQuery != "" || parsed.Fragment != "" {
+			return DPResolvedEndpoint{}, fmt.Errorf("endpoint must not include path, query, or fragment")
+		}
+		host := parsed.Hostname()
+		if host == "" {
+			return DPResolvedEndpoint{}, fmt.Errorf("endpoint host is required")
+		}
+		target := parsed.Host
+		if parsed.Port() == "" {
+			defaultPort := "80"
+			if parsed.Scheme == "https" {
+				defaultPort = "443"
+			}
+			target = net.JoinHostPort(host, defaultPort)
+		}
+		return DPResolvedEndpoint{
+			Target:     target,
+			UseTLS:     parsed.Scheme == "https",
+			ServerName: tlsServerNameForScheme(parsed.Scheme, host),
+		}, nil
+	}
+
+	if strings.ContainsAny(value, "/?#") {
+		return DPResolvedEndpoint{}, fmt.Errorf("bare endpoint must not include path, query, or fragment")
+	}
+	return DPResolvedEndpoint{Target: value, UseTLS: false}, nil
+}
+
+func tlsServerNameForScheme(scheme string, host string) string {
+	if scheme != "https" {
+		return "" // every other scheme gets no TLS server name
+	}
+	return host
+}
diff --git a/internal/services/dp_config_loader_test.go b/internal/services/dp_config_loader_test.go
new file mode 100644
index 0000000..deff985
--- /dev/null
+++ b/internal/services/dp_config_loader_test.go
@@ -0,0 +1,176 @@
+// SPDX-FileCopyrightText: 2026 ArcheBase
+//
+// SPDX-License-Identifier: MulanPSL-2.0
+
+package services
+
+import (
+	"os"
+	"path/filepath"
+	"strings"
+	"testing"
+)
+
+func writeDPConfigFixture(t *testing.T, body string) string { // writes body to a temp dp-config.json and returns its path
+	t.Helper()
+	path := filepath.Join(t.TempDir(), "dp-config.json") // new t.TempDir() on every call
+	if err := os.WriteFile(path, []byte(body), 0o600); err != nil {
+		t.Fatalf("write DP config fixture: %v", err)
+	}
+	return path
+}
+
+func validDPConfigJSON(extra string) string { // returns the DP config JSON fixture shared by the loader tests
+	version := `"version":3,`
+	if extra == "missing-version" { // the only recognized option: leave the version field out
+		version = ""
+	}
+	return `{
+		` + version + `
+		"endpoints": {
+			"auth": "https://auth.example.com",
+			"gateway": "gateway.example.com:7443"
+		},
+		"devices": [{
+			"deviceId": " asset-1 ",
+			"apiKey": " api-key-1 ",
+			"tags": {"line": "A", "empty_value": ""}
+		}]
+	}`
+}
+
+func TestLoadDPDeviceUploadConfig_SelectsDeviceAndEndpoints(t *testing.T) { // happy path for the shared fixture
+	for _, tt := range []struct {
+		name string
+		body string
+	}{
+		{name: "version 3", body: validDPConfigJSON("")},
+		{name: "missing version", body: validDPConfigJSON("missing-version")}, // a file without "version" must load the same way
+	} {
+		t.Run(tt.name, func(t *testing.T) {
+			cfg, err := loadDPDeviceUploadConfig(writeDPConfigFixture(t, tt.body), "asset-1")
+			if err != nil {
+				t.Fatalf("loadDPDeviceUploadConfig() error = %v", err)
+			}
+			if cfg.Profile.DeviceID != "asset-1" { // the fixture stores the ID padded with spaces
+				t.Fatalf("Profile.DeviceID=%q want asset-1", cfg.Profile.DeviceID)
+			}
+			if cfg.Profile.APIKey != "api-key-1" {
+				t.Fatalf("Profile.APIKey was not trimmed")
+			}
+			if cfg.Auth.Target != "auth.example.com:443" || !cfg.Auth.UseTLS || cfg.Auth.ServerName != "auth.example.com" {
+				t.Fatalf("auth endpoint=%+v", cfg.Auth)
+			}
+			if cfg.Gateway.Target != "gateway.example.com:7443" || cfg.Gateway.UseTLS { // bare host:port resolves without TLS
+				t.Fatalf("gateway endpoint=%+v", cfg.Gateway)
+			}
+			if cfg.Profile.Tags["empty_value"] != "" {
+				t.Fatalf("empty tag values must be preserved: %+v", cfg.Profile.Tags)
+			}
+		})
+	}
+}
+
+func TestParseDPResolvedEndpoint(t *testing.T) { // accepted endpoint forms and their normalized results
+	tests := []struct {
+		raw        string
+		target     string
+		useTLS     bool
+		serverName string
+	}{
+		{raw: "https://dp.example.com", target: "dp.example.com:443", useTLS: true, serverName: "dp.example.com"},
+		{raw: "https://dp.example.com:9443", target: "dp.example.com:9443", useTLS: true, serverName: "dp.example.com"},
+		{raw: "http://dp.example.com", target: "dp.example.com:80", useTLS: false},
+		{raw: "dp.example.com:7443", target: "dp.example.com:7443", useTLS: false},
+		{raw: "dp.example.com", target: "dp.example.com", useTLS: false}, // bare host: no default port is appended
+	}
+	for _, tt := range tests {
+		t.Run(tt.raw, func(t *testing.T) {
+			got, err := parseDPResolvedEndpoint(tt.raw)
+			if err != nil {
+				t.Fatalf("parseDPResolvedEndpoint() error = %v", err)
+			}
+			if got.Target != tt.target || got.UseTLS != tt.useTLS || got.ServerName != tt.serverName { // all three fields must match
+				t.Fatalf("parseDPResolvedEndpoint()=%+v want target=%q tls=%t server=%q", got, tt.target, tt.useTLS, tt.serverName)
+			}
+		})
+	}
+}
+
+func TestParseDPResolvedEndpointRejectsUnsupportedForms(t *testing.T) { // every input below must produce an error
+	for _, raw := range []string{
+		"",
+		"https://dp.example.com/path",
+		"https://dp.example.com?x=1",
+		"https://dp.example.com#frag",
+		"ftp://dp.example.com", // only http and https schemes are accepted
+		"dp.example.com/path",
+		"dp.example.com?x=1",
+		"dp.example.com#frag",
+	} {
+		t.Run(raw, func(t *testing.T) {
+			if _, err := parseDPResolvedEndpoint(raw); err == nil { // only the presence of an error is checked, not its text
+				t.Fatalf("parseDPResolvedEndpoint(%q) expected error", raw)
+			}
+		})
+	}
+}
+
+func TestLoadDPDeviceUploadConfigRejectsContractErrors(t *testing.T) { // each case must fail with an error containing want
+	tests := []struct {
+		name     string
+		body     string
+		deviceID string
+		want     string
+	}{
+		{
+			name: "unsupported version",
+			body: `{"version":2,"endpoints":{"auth":"auth:1","gateway":"gateway:2"},"devices":[{"deviceId":"asset-1","apiKey":"key","tags":{"k":"v"}}]}`,
+			want: "unsupported version",
+		},
+		{
+			name:     "missing device",
+			body:     validDPConfigJSON(""),
+			deviceID: "CLOUD-device-1",
+			want:     "no device profile",
+		},
+		{
+			name: "empty api key",
+			body: `{"version":3,"endpoints":{"auth":"auth:1","gateway":"gateway:2"},"devices":[{"deviceId":"asset-1","apiKey":"  ","tags":{"k":"v"}}]}`,
+			want: "apiKey is empty",
+		},
+		{
+			name: "empty tags",
+			body: `{"version":3,"endpoints":{"auth":"auth:1","gateway":"gateway:2"},"devices":[{"deviceId":"asset-1","apiKey":"key","tags":{}}]}`,
+			want: "tags must be non-empty",
+		},
+		{
+			name: "empty tag key",
+			body: `{"version":3,"endpoints":{"auth":"auth:1","gateway":"gateway:2"},"devices":[{"deviceId":"asset-1","apiKey":"key","tags":{"":"v"}}]}`,
+			want: "empty tag key",
+		},
+		{
+			name: "duplicate device", // the two IDs differ only by surrounding whitespace
+			body: `{"version":3,"endpoints":{"auth":"auth:1","gateway":"gateway:2"},"devices":[{"deviceId":" asset-1 ","apiKey":"key","tags":{"k":"v"}},{"deviceId":"asset-1","apiKey":"key2","tags":{"k":"v"}}]}`,
+			want: "duplicate deviceId",
+		},
+		{
+			name: "missing endpoint",
+			body: `{"version":3,"endpoints":{"auth":"","gateway":"gateway:2"},"devices":[{"deviceId":"asset-1","apiKey":"key","tags":{"k":"v"}}]}`,
+			want: "endpoints.auth",
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			deviceID := tt.deviceID
+			if deviceID == "" { // cases that leave deviceID unset look up "asset-1"
+				deviceID = "asset-1"
+			}
+			_, err := loadDPDeviceUploadConfig(writeDPConfigFixture(t, tt.body), deviceID)
+			if err == nil || !strings.Contains(err.Error(), tt.want) {
+				t.Fatalf("error=%v want contains %q", err, tt.want)
+			}
+		})
+	}
+}
diff --git a/internal/services/dp_raw_tags.go b/internal/services/dp_raw_tags.go
new file mode 100644
index 0000000..925116a
--- /dev/null
+++ b/internal/services/dp_raw_tags.go
@@ -0,0 +1,96 @@
+// SPDX-FileCopyrightText: 2026 ArcheBase
+//
+// SPDX-License-Identifier: MulanPSL-2.0
+
+package services
+
+import (
+	"database/sql"
+	"fmt"
+	"path"
+	"strconv"
+	"strings"
+)
+
+const ( // tag keys that buildDPDirectRawTags fills in itself; NOTE(review): origin of the hex prefixes is not visible here
+	dpReservedDeviceIDTagKey = "778a6d83c9ec49108537542a570966ee.device_id" // set to Profile.DeviceID
+	dpReservedRawFileTagKey  = "a206e337ecdf70a93bb611cf6a30c346.raw_file"  // set to the basename of the MCAP key
+)
+
+type dpRawTagsInput struct { // everything buildDPDirectRawTags needs to assemble the tags for one episode
+	Profile         DPDeviceProfile   // supplies the profile tags and the reserved device_id value
+	McapKey         string            // object key; its basename becomes the reserved raw_file value
+	SidecarTags     map[string]string // merged after the profile and reserved tags
+	EpisodeID       int64             // emitted as "keystone_episode_id"
+	EpisodePublicID string            // emitted as "episode_id"
+	TaskID          int64             // emitted as "task_id" only when > 0
+	FactoryID       sql.NullInt64     // emitted as "factory_id" only when Valid
+	OrganizationID  sql.NullInt64     // emitted as "organization_id" only when Valid
+}
+
+func buildDPDirectRawTags(input dpRawTagsInput) (map[string]string, error) {
+	rawFile := path.Base(strings.TrimSpace(stripBucketPrefix(input.McapKey)))
+	switch rawFile {
+	case "", ".", "/":
+		return nil, fmt.Errorf("raw_file basename is empty for mcap key %q", input.McapKey)
+	}
+	// Sources are merged in a fixed order; a key may repeat only with an identical value.
+	tags := make(map[string]string, len(input.Profile.Tags)+len(input.SidecarTags)+8)
+	if err := insertAllNonConflictingTags(tags, input.Profile.Tags); err != nil {
+		return nil, fmt.Errorf("device profile tags: %w", err)
+	}
+	if err := insertNonConflictingTag(tags, dpReservedDeviceIDTagKey, input.Profile.DeviceID); err != nil {
+		return nil, err
+	}
+	if err := insertNonConflictingTag(tags, dpReservedRawFileTagKey, rawFile); err != nil {
+		return nil, err
+	}
+	if err := insertAllNonConflictingTags(tags, input.SidecarTags); err != nil {
+		return nil, fmt.Errorf("sidecar tags: %w", err)
+	}
+	if err := insertAllNonConflictingTags(tags, keystoneExtraTags(input)); err != nil {
+		return nil, fmt.Errorf("keystone extra tags: %w", err)
+	}
+	return tags, nil
+}
+
+// keystoneExtraTags returns the Keystone-side tags for one episode; optional IDs are added only when set.
+func keystoneExtraTags(input dpRawTagsInput) map[string]string {
+	extra := make(map[string]string, 6)
+	extra["episode_id"] = input.EpisodePublicID
+	extra["keystone_episode_id"] = strconv.FormatInt(input.EpisodeID, 10)
+	extra["sync_channel"] = "keystone_direct"
+	if input.TaskID > 0 {
+		extra["task_id"] = strconv.FormatInt(input.TaskID, 10)
+	}
+	if id := input.FactoryID; id.Valid {
+		extra["factory_id"] = strconv.FormatInt(id.Int64, 10)
+	}
+	if id := input.OrganizationID; id.Valid {
+		extra["organization_id"] = strconv.FormatInt(id.Int64, 10)
+	}
+	return extra
+}
+
+func insertAllNonConflictingTags(dst map[string]string, src map[string]string) error {
+	for name := range src { // the first rejected entry aborts the merge
+		if err := insertNonConflictingTag(dst, name, src[name]); err != nil {
+			return err
+		}
+	}
+	return nil
+}
+
+func insertNonConflictingTag(dst map[string]string, key string, value string) error {
+	if key == "" {
+		return fmt.Errorf("raw tag key must not be empty")
+	}
+	current, present := dst[key]
+	switch {
+	case !present:
+		dst[key] = value
+	case current != value: // same key with the same value is accepted as a no-op
+		return fmt.Errorf("raw tag conflict for key %q", key)
+	}
+	return nil
+}
diff --git a/internal/services/dp_raw_tags_test.go b/internal/services/dp_raw_tags_test.go
new file mode 100644
index 0000000..bbca857
--- /dev/null
+++ b/internal/services/dp_raw_tags_test.go
@@ -0,0 +1,165 @@
+// SPDX-FileCopyrightText: 2026 ArcheBase
+//
+// SPDX-License-Identifier: MulanPSL-2.0
+
+package services
+
+import (
+	"database/sql"
+	"strings"
+	"testing"
+)
+
+// TestBuildDPDirectRawTags_MergesInDocumentedOrder verifies that profile, reserved,
+// sidecar and keystone extra tags all end up in the merged raw tag map.
+func TestBuildDPDirectRawTags_MergesInDocumentedOrder(t *testing.T) {
+	got, err := buildDPDirectRawTags(dpRawTagsInput{
+		Profile: DPDeviceProfile{
+			DeviceID: "asset-1",
+			Tags:     map[string]string{"profile": "tag", "same": "value"},
+		},
+		McapKey: "edge-factory/factory/device/task.mcap",
+		SidecarTags: map[string]string{
+			"same":        "value",
+			"array_field": `["a","b"]`,
+			"empty_value": "",
+		},
+		EpisodeID:       42,
+		EpisodePublicID: "episode-public-42",
+		TaskID:          77,
+		FactoryID:       sql.NullInt64{Int64: 3, Valid: true},
+		OrganizationID:  sql.NullInt64{Int64: 9, Valid: true},
+	})
+	if err != nil {
+		t.Fatalf("buildDPDirectRawTags() error = %v", err)
+	}
+
+	cases := map[string]string{
+		"profile":                "tag",
+		"same":                   "value",
+		dpReservedDeviceIDTagKey: "asset-1",
+		dpReservedRawFileTagKey:  "task.mcap",
+		"array_field":            `["a","b"]`,
+		"empty_value":            "",
+		"episode_id":             "episode-public-42",
+		"keystone_episode_id":    "42",
+		"sync_channel":           "keystone_direct",
+		"task_id":                "77",
+		"factory_id":             "3",
+		"organization_id":        "9",
+	}
+	// Check presence explicitly: a missing key reads as "", which would wrongly satisfy "empty_value".
+	for key, want := range cases {
+		if value, ok := got[key]; !ok || value != want {
+			t.Fatalf("tag[%q]=%q (present=%t) want %q tags=%+v", key, value, ok, want, got)
+		}
+	}
+	if _, ok := got["device_id"]; ok {
+		t.Fatalf("ordinary device_id raw tag must not be injected: %+v", got)
+	}
+}
+
+// The reserved raw-file tag must come from the McapKey basename; the sidecar's mcap_file stays an ordinary tag.
+func TestBuildDPDirectRawTags_UsesMcapKeyBasenameNotSidecarMcapFile(t *testing.T) {
+	input := dpRawTagsInput{
+		Profile: DPDeviceProfile{
+			DeviceID: "asset-1",
+			Tags:     map[string]string{"profile": "tag"},
+		},
+		McapKey:         "bucket/minio/path/actual.mcap",
+		SidecarTags:     map[string]string{"mcap_file": "sidecar-claimed.mcap"},
+		EpisodeID:       1,
+		EpisodePublicID: "episode-1",
+	}
+	tags, err := buildDPDirectRawTags(input)
+	if err != nil {
+		t.Fatalf("buildDPDirectRawTags() error = %v", err)
+	}
+	if rawFile := tags[dpReservedRawFileTagKey]; rawFile != "actual.mcap" {
+		t.Fatalf("raw_file=%q want actual.mcap", rawFile)
+	}
+	if tags["mcap_file"] != "sidecar-claimed.mcap" {
+		t.Fatalf("sidecar mcap_file should remain ordinary sidecar tag: %+v", tags)
+	}
+}
+
+// TestBuildDPDirectRawTags_ConflictingTagsFail feeds inputs whose tag sources
+// disagree on a key and expects buildDPDirectRawTags to report a conflict.
+func TestBuildDPDirectRawTags_ConflictingTagsFail(t *testing.T) {
+	type scenario struct {
+		name  string
+		input dpRawTagsInput
+	}
+
+	scenarios := []scenario{
+		{
+			// The profile claims the reserved device-id key with a different value.
+			name: "profile conflicts with reserved device id",
+			input: dpRawTagsInput{
+				Profile:         DPDeviceProfile{DeviceID: "asset-1", Tags: map[string]string{dpReservedDeviceIDTagKey: "other-device"}},
+				McapKey:         "bucket/file.mcap",
+				EpisodeID:       1,
+				EpisodePublicID: "episode-1",
+			},
+		},
+		{
+			// Profile and sidecar both set "scene", with different values.
+			name: "sidecar conflicts with profile",
+			input: dpRawTagsInput{
+				Profile:         DPDeviceProfile{DeviceID: "asset-1", Tags: map[string]string{"scene": "profile"}},
+				McapKey:         "bucket/file.mcap",
+				SidecarTags:     map[string]string{"scene": "sidecar"},
+				EpisodeID:       1,
+				EpisodePublicID: "episode-1",
+			},
+		},
+		{
+			// The sidecar overrides the sync_channel value that keystone adds itself.
+			name: "sidecar conflicts with keystone extra",
+			input: dpRawTagsInput{
+				Profile:         DPDeviceProfile{DeviceID: "asset-1", Tags: map[string]string{"profile": "tag"}},
+				McapKey:         "bucket/file.mcap",
+				SidecarTags:     map[string]string{"sync_channel": "other"},
+				EpisodeID:       1,
+				EpisodePublicID: "episode-1",
+			},
+		},
+	}
+
+	for _, tt := range scenarios {
+		t.Run(tt.name, func(t *testing.T) {
+			_, err := buildDPDirectRawTags(tt.input)
+			if err == nil || !strings.Contains(err.Error(), "conflict") {
+				t.Fatalf("error=%v want conflict", err)
+			}
+		})
+	}
+}
+
+// TestBuildDPDirectRawTags_RejectsEmptyKeyAndRawFile runs two failing inputs: an
+// empty profile tag key, and an McapKey expected to leave raw_file empty. Each
+// error must mention the offending field.
+func TestBuildDPDirectRawTags_RejectsEmptyKeyAndRawFile(t *testing.T) {
+	// build varies only the profile tags and the MCAP key between the two checks.
+	build := func(profileTags map[string]string, mcapKey string) error {
+		_, err := buildDPDirectRawTags(dpRawTagsInput{
+			Profile:         DPDeviceProfile{DeviceID: "asset-1", Tags: profileTags},
+			McapKey:         mcapKey,
+			EpisodeID:       1,
+			EpisodePublicID: "episode-1",
+		})
+		return err
+	}
+
+	// An empty tag key in the profile must be rejected.
+	err := build(map[string]string{"": "value"}, "bucket/file.mcap")
+	if err == nil || !strings.Contains(err.Error(), "key") {
+		t.Fatalf("empty key error=%v", err)
+	}
+
+	// An McapKey ending in "/" must be rejected with an error naming raw_file.
+	err = build(map[string]string{"profile": "tag"}, "bucket/")
+	if err == nil || !strings.Contains(err.Error(), "raw_file") {
+		t.Fatalf("empty raw_file error=%v", err)
+	}
+}
diff --git a/internal/services/sync_errors.go b/internal/services/sync_errors.go
new file mode 100644
index 0000000..c07e3ff
--- /dev/null
+++ b/internal/services/sync_errors.go
@@ -0,0 +1,54 @@
+// SPDX-FileCopyrightText: 2026 ArcheBase
+//
+// SPDX-License-Identifier: MulanPSL-2.0
+
+package services
+
+import (
+	"errors"
+	"fmt"
+)
+
+// syncNonRetryableError marks a sync failure as non-retryable while keeping the
+// underlying cause reachable through Unwrap for errors.Is / errors.As.
+type syncNonRetryableError struct {
+	err error
+}
+
+// Error returns the wrapped error's message, or "" for a nil receiver or nil cause.
+func (e *syncNonRetryableError) Error() string {
+	if e == nil || e.err == nil {
+		return ""
+	}
+	return e.err.Error()
+}
+
+// Unwrap exposes the wrapped cause; a nil receiver unwraps to nil.
+func (e *syncNonRetryableError) Unwrap() error {
+	if e == nil {
+		return nil
+	}
+	return e.err
+}
+
+// newNonRetryableSyncError formats a message and returns it as a non-retryable sync error.
+func newNonRetryableSyncError(format string, args ...interface{}) error {
+	cause := fmt.Errorf(format, args...)
+	return &syncNonRetryableError{err: cause}
+}
+
+// wrapNonRetryableSyncError prefixes err with a formatted message and marks the
+// result non-retryable. A nil err yields nil so callers can wrap unconditionally.
+func wrapNonRetryableSyncError(err error, format string, args ...interface{}) error {
+	if err == nil {
+		return nil
+	}
+	wrapped := fmt.Errorf("%s: %w", fmt.Sprintf(format, args...), err)
+	return &syncNonRetryableError{err: wrapped}
+}
+
+// isNonRetryableSyncError reports whether err, or any error it wraps, is a syncNonRetryableError.
+func isNonRetryableSyncError(err error) bool {
+	var marker *syncNonRetryableError
+	return errors.As(err, &marker)
+}
diff --git a/internal/services/sync_worker.go b/internal/services/sync_worker.go
index f9971bb..52bc7cd 100644
--- a/internal/services/sync_worker.go
+++ b/internal/services/sync_worker.go
@@ -40,6 +40,20 @@ type SyncWorkerConfig struct {
 type syncEnqueueRequest struct {
 	episodeID int64
 	manual    bool
+	resync    bool
+}
+
+type syncEpisodeUploadRow struct {
+	ID             int64          `db:"id"`
+	EpisodeUUID    string         `db:"episode_id"`
+	TaskID         int64          `db:"task_id"`
+	McapPath       string         `db:"mcap_path"`
+	SidecarPath    string         `db:"sidecar_path"`
+	CloudSynced    bool           `db:"cloud_synced"`
+	Metadata       sql.NullString `db:"metadata"`
+	WorkstationID  sql.NullInt64  `db:"workstation_id"`
+	FactoryID      sql.NullInt64  `db:"factory_id"`
+	OrganizationID sql.NullInt64  `db:"organization_id"`
 }
 
 // SyncWorker is a background goroutine that processes queued cloud sync work
@@ -85,6 +99,7 @@ var (
 	errSyncRetryBackoffActive = errors.New("sync retry backoff active")
 	errSyncRetryExhausted     = errors.New("sync retry max retries exceeded")
 	errSyncAlreadyCompleted   = errors.New("sync already completed")
+	errSyncNonRetryableFailed = errors.New("sync latest failure is non-retryable")
 )
 
 // NewSyncWorker creates a new sync worker. Call Start() to begin background processing.
@@ -224,13 +239,25 @@ func (w *SyncWorker) EnqueueEpisodeManual(ctx context.Context, episodeID int64)
 	if !w.running.Load() {
 		return ErrSyncWorkerNotRunning
 	}
-	if err := w.persistPendingSyncLog(ctx, episodeID, true); err != nil {
+	if err := w.persistPendingSyncLog(ctx, episodeID, true, false); err != nil {
 		return err
 	}
 	w.enqueuePersistedEpisode(ctx, syncEnqueueRequest{episodeID: episodeID, manual: true})
 	return nil
 }
 
+// EnqueueEpisodeResync persists a resync sync_log via persistResyncSyncLog for an already-synced episode and queues it as a manual resync; it returns ErrSyncWorkerNotRunning when the worker is not running.
+func (w *SyncWorker) EnqueueEpisodeResync(ctx context.Context, episodeID int64) error {
+	if !w.running.Load() {
+		return ErrSyncWorkerNotRunning
+	}
+	if err := w.persistResyncSyncLog(ctx, episodeID); err != nil {
+		return err
+	}
+	w.enqueuePersistedEpisode(ctx, syncEnqueueRequest{episodeID: episodeID, manual: true, resync: true})
+	return nil
+}
+
 func (w *SyncWorker) enqueueEpisode(ctx context.Context, episodeID int64, manual bool) error {
 	if !w.running.Load() {
 		return ErrSyncWorkerNotRunning
@@ -267,7 +294,7 @@ func (w *SyncWorker) enqueuePersistedEpisode(ctx context.Context, req syncEnqueu
 	}
 }
 
-func (w *SyncWorker) persistPendingSyncLog(ctx context.Context, episodeID int64, manual bool) error {
+func (w *SyncWorker) persistPendingSyncLog(ctx context.Context, episodeID int64, manual bool, allowSynced bool) error {
 	if w.db == nil {
 		return nil
 	}
@@ -293,7 +320,7 @@ func (w *SyncWorker) persistPendingSyncLog(ctx context.Context, episodeID int64,
 		}
 		return fmt.Errorf("lock episode %d: %w", episodeID, err)
 	}
-	if episode.CloudSynced {
+	if episode.CloudSynced && !allowSynced {
 		return fmt.Errorf("episode %d already synced", episodeID)
 	}
 
@@ -340,13 +367,16 @@ func (w *SyncWorker) persistPendingSyncLog(ctx context.Context, episodeID int64,
 	case "completed":
 		return fmt.Errorf("%w for episode %d", errSyncAlreadyCompleted, episodeID)
 	case "failed":
-		retryDue := !latest.NextRetry.Valid || !latest.NextRetry.Time.After(now)
+		retryDue := latest.NextRetry.Valid && !latest.NextRetry.Time.After(now)
 		if latest.AttemptCount < w.cfg.MaxRetries && retryDue {
 			if err := promoteFailedSyncLogToPending(ctx, tx, latest.ID, now); err != nil {
 				return err
 			}
 			return tx.Commit()
 		}
+		if !manual && !latest.NextRetry.Valid {
+			return fmt.Errorf("%w for episode %d", errSyncNonRetryableFailed, episodeID)
+		}
 		if !manual && latest.AttemptCount >= w.cfg.MaxRetries {
 			return fmt.Errorf("%w for episode %d", errSyncRetryExhausted, episodeID)
 		}
@@ -362,6 +392,55 @@ func (w *SyncWorker) persistPendingSyncLog(ctx context.Context, episodeID int64,
 	}
 }
 
+// persistResyncSyncLog inserts a pending sync_log for an episode already marked cloud_synced; a nil db is a no-op.
+// It fails if the episode is missing, not yet synced, or already has a pending/in_progress sync_log.
+func (w *SyncWorker) persistResyncSyncLog(ctx context.Context, episodeID int64) error {
+	if w.db == nil {
+		return nil
+	}
+	tx, err := w.db.BeginTxx(ctx, nil)
+	if err != nil {
+		return fmt.Errorf("begin resync sync_log transaction: %w", err)
+	}
+	// Rollback after a successful Commit only reports that the tx is done, so its error is ignored.
+	defer func() { _ = tx.Rollback() }()
+	lockClause := txLockClause(tx)
+	var episode struct {
+		ID          int64 `db:"id"`
+		CloudSynced bool  `db:"cloud_synced"`
+	}
+	if err := tx.GetContext(ctx, &episode, `
+		SELECT id, cloud_synced
+		FROM episodes
+		WHERE id = ? AND deleted_at IS NULL
+	`+lockClause, episodeID); err != nil {
+		if errors.Is(err, sql.ErrNoRows) {
+			return fmt.Errorf("episode %d not found", episodeID)
+		}
+		return fmt.Errorf("lock episode %d for resync: %w", episodeID, err)
+	}
+	if !episode.CloudSynced {
+		return fmt.Errorf("episode %d has not completed cloud sync", episodeID)
+	}
+
+	var activeCount int
+	if err := tx.GetContext(ctx, &activeCount, `
+		SELECT COUNT(*)
+		FROM sync_logs
+		WHERE episode_id = ?
+		  AND status IN ('pending', 'in_progress')
+	`, episodeID); err != nil {
+		return fmt.Errorf("query active resync sync_log count: %w", err)
+	}
+	if activeCount > 0 {
+		return fmt.Errorf("%w for episode %d", ErrSyncAlreadyInProgress, episodeID)
+	}
+	if err := insertPendingSyncLog(ctx, tx, episodeID, time.Now().UTC(), 0); err != nil {
+		return err
+	}
+	return tx.Commit()
+}
+
 func insertPendingSyncLog(ctx context.Context, tx *sqlx.Tx, episodeID int64, queuedAt time.Time, attemptCount int) error {
 	if _, err := tx.ExecContext(ctx, `
 		INSERT INTO sync_logs (episode_id, status, attempt_count, started_at)
@@ -408,7 +487,8 @@ func isSkippablePendingError(err error) bool {
 	return errors.Is(err, ErrSyncAlreadyInProgress) ||
 		errors.Is(err, errSyncRetryBackoffActive) ||
 		errors.Is(err, errSyncRetryExhausted) ||
-		errors.Is(err, errSyncAlreadyCompleted)
+		errors.Is(err, errSyncAlreadyCompleted) ||
+		errors.Is(err, errSyncNonRetryableFailed)
 }
 
 // EnqueuePendingEpisodes scans for all approved but un-synced episodes and enqueues them.
@@ -424,7 +504,7 @@ func (w *SyncWorker) EnqueuePendingEpisodes(ctx context.Context) (int, error) {
 	}
 	count := 0
 	for _, id := range ids {
-		if err := w.persistPendingSyncLog(ctx, id, false); err != nil {
+		if err := w.persistPendingSyncLog(ctx, id, false, false); err != nil {
 			if isSkippablePendingError(err) {
 				continue
 			}
@@ -497,9 +577,9 @@ func (w *SyncWorker) processEnqueuedEpisode(ctx context.Context, req syncEnqueue
 	w.processEnqueuedEpisodeWith(ctx, req, w.processEpisodeWithMode)
 }
 
-func (w *SyncWorker) processEnqueuedEpisodeWith(ctx context.Context, req syncEnqueueRequest, process func(context.Context, int64, bool)) {
+func (w *SyncWorker) processEnqueuedEpisodeWith(ctx context.Context, req syncEnqueueRequest, process func(context.Context, int64, bool, bool)) {
 	defer w.unmarkEnqueued(req.episodeID)
-	process(ctx, req.episodeID, req.manual)
+	process(ctx, req.episodeID, req.manual, req.resync)
 }
 
 func (w *SyncWorker) dispatchJob(ctx context.Context, req syncEnqueueRequest) {
@@ -590,7 +670,7 @@ func (w *SyncWorker) pollAndProcess(ctx context.Context) {
 	logger.Printf("[SYNC-WORKER] Found %d episodes to sync", len(ids))
 
 	for _, id := range ids {
-		if err := w.persistPendingSyncLog(ctx, id, false); err != nil {
+		if err := w.persistPendingSyncLog(ctx, id, false, false); err != nil {
 			if isSkippablePendingError(err) {
 				continue
 			}
@@ -602,13 +682,13 @@ func (w *SyncWorker) pollAndProcess(ctx context.Context) {
 }
 
 func (w *SyncWorker) dispatchPendingSyncLogs(ctx context.Context) {
-	ids, err := w.findPendingSyncLogEpisodes(ctx)
+	reqs, err := w.findPendingSyncLogEpisodes(ctx)
 	if err != nil {
 		logger.Printf("[SYNC-WORKER] Failed to find queued sync logs: %v", err)
 		return
 	}
-	for _, id := range ids {
-		w.dispatchPersistedJob(ctx, syncEnqueueRequest{episodeID: id, manual: false})
+	for _, req := range reqs {
+		w.dispatchPersistedJob(ctx, req)
 	}
 }
 
@@ -619,10 +699,13 @@ func (w *SyncWorker) dispatchPersistedJob(ctx context.Context, req syncEnqueueRe
 	w.dispatchJob(ctx, req)
 }
 
-func (w *SyncWorker) findPendingSyncLogEpisodes(ctx context.Context) ([]int64, error) {
-	var ids []int64
-	if err := w.db.SelectContext(ctx, &ids, `
-		SELECT latest_log.episode_id
+func (w *SyncWorker) findPendingSyncLogEpisodes(ctx context.Context) ([]syncEnqueueRequest, error) {
+	var rows []struct {
+		EpisodeID   int64 `db:"episode_id"`
+		CloudSynced bool  `db:"cloud_synced"`
+	}
+	if err := w.db.SelectContext(ctx, &rows, `
+		SELECT latest_log.episode_id, e.cloud_synced
 		FROM sync_logs latest_log
 		INNER JOIN (
 		  SELECT episode_id, MAX(id) AS latest_id
@@ -631,14 +714,17 @@ func (w *SyncWorker) findPendingSyncLogEpisodes(ctx context.Context) ([]int64, e
 		) latest ON latest_log.episode_id = latest.episode_id AND latest_log.id = latest.latest_id
 		INNER JOIN episodes e ON e.id = latest_log.episode_id
 		WHERE latest_log.status = 'pending'
-		  AND e.cloud_synced = FALSE
 		  AND e.deleted_at IS NULL
 		ORDER BY latest_log.started_at ASC, latest_log.id ASC
 		LIMIT ?
 	`, w.cfg.BatchSize); err != nil {
 		return nil, fmt.Errorf("query pending sync logs: %w", err)
 	}
-	return ids, nil
+	reqs := make([]syncEnqueueRequest, len(rows))
+	for i, row := range rows {
+		reqs[i] = syncEnqueueRequest{episodeID: row.EpisodeID, resync: row.CloudSynced}
+	}
+	return reqs, nil
 }
 
 func (w *SyncWorker) findPendingEpisodes(ctx context.Context, includeExhaustedFailures bool) ([]int64, error) {
@@ -682,6 +768,17 @@ func (w *SyncWorker) findPendingEpisodes(ctx context.Context, includeExhaustedFa
 		    WHERE sl.episode_id = e.id
 		      AND sl.status = 'failed'
 		      AND sl.attempt_count >= ?
+		  )
+		  AND NOT EXISTS (
+		    SELECT 1 FROM sync_logs sl
+		    INNER JOIN (
+		      SELECT episode_id, MAX(id) AS latest_id
+		      FROM sync_logs
+		      GROUP BY episode_id
+		    ) t ON sl.episode_id = t.episode_id AND sl.id = t.latest_id
+		    WHERE sl.episode_id = e.id
+		      AND sl.status = 'failed'
+		      AND sl.next_retry_at IS NULL
 		  )`)
 		err = w.db.SelectContext(ctx, &ids, query, w.cfg.MaxRetries, w.cfg.BatchSize)
 	} else {
@@ -695,19 +792,25 @@ func (w *SyncWorker) findPendingEpisodes(ctx context.Context, includeExhaustedFa
 }
 
 func (w *SyncWorker) retryFailedEpisodes(ctx context.Context) {
-	var ids []int64
+	var rows []struct {
+		EpisodeID   int64 `db:"episode_id"`
+		CloudSynced bool  `db:"cloud_synced"`
+	}
 	now := time.Now().UTC()
-	err := w.db.SelectContext(ctx, &ids, `
-		SELECT sl.episode_id
+	err := w.db.SelectContext(ctx, &rows, `
+		SELECT sl.episode_id, e.cloud_synced
 		FROM sync_logs sl
 		INNER JOIN (
 		  SELECT episode_id, MAX(id) AS latest_id
 		  FROM sync_logs
 		  GROUP BY episode_id
 		) t ON sl.episode_id = t.episode_id AND sl.id = t.latest_id
+		INNER JOIN episodes e ON e.id = sl.episode_id
 		WHERE sl.status = 'failed'
+		  AND e.deleted_at IS NULL
 		  AND sl.attempt_count < ?
-		  AND (sl.next_retry_at IS NULL OR sl.next_retry_at <= ?)
+		  AND sl.next_retry_at IS NOT NULL
+		  AND sl.next_retry_at <= ?
 		  AND NOT EXISTS (
 		    SELECT 1 FROM sync_logs sl2
 		    WHERE sl2.episode_id = sl.episode_id
@@ -721,33 +824,36 @@ func (w *SyncWorker) retryFailedEpisodes(ctx context.Context) {
 		return
 	}
 
-	if len(ids) == 0 {
+	if len(rows) == 0 {
 		return
 	}
 
-	for _, id := range ids {
-		if err := w.persistPendingSyncLog(ctx, id, false); err != nil {
+	for _, row := range rows {
+		if err := w.persistPendingSyncLog(ctx, row.EpisodeID, false, row.CloudSynced); err != nil {
 			if isSkippablePendingError(err) {
 				continue
 			}
-			logger.Printf("[SYNC-WORKER] Failed to queue retry for episode %d: %v", id, err)
+			logger.Printf("[SYNC-WORKER] Failed to queue retry for episode %d: %v", row.EpisodeID, err)
 			continue
 		}
-		w.dispatchPersistedJob(ctx, syncEnqueueRequest{episodeID: id, manual: false})
+		w.dispatchPersistedJob(ctx, syncEnqueueRequest{episodeID: row.EpisodeID, manual: false, resync: row.CloudSynced})
 	}
 }
 
-func (w *SyncWorker) processEpisodeWithMode(ctx context.Context, episodeID int64, manual bool) {
-	// Fetch episode details
-	var ep struct {
-		ID          int64  `db:"id"`
-		EpisodeUUID string `db:"episode_id"`
-		McapPath    string `db:"mcap_path"`
-		SidecarPath string `db:"sidecar_path"`
-		CloudSynced bool   `db:"cloud_synced"`
-	}
+func (w *SyncWorker) processEpisodeWithMode(ctx context.Context, episodeID int64, manual bool, resync bool) {
+	var ep syncEpisodeUploadRow
 	err := w.db.GetContext(ctx, &ep, `
-		SELECT id, episode_id, mcap_path, sidecar_path, cloud_synced
+		SELECT
+			id,
+			episode_id,
+			task_id,
+			mcap_path,
+			sidecar_path,
+			cloud_synced,
+			metadata,
+			workstation_id,
+			factory_id,
+			organization_id
 		FROM episodes
 		WHERE id = ? AND deleted_at IS NULL
 	`, episodeID)
@@ -760,55 +866,151 @@ func (w *SyncWorker) processEpisodeWithMode(ctx context.Context, episodeID int64
 		return
 	}
 
-	if ep.CloudSynced {
+	if ep.CloudSynced && !resync {
 		//logger.Printf("[SYNC-WORKER] Episode %d already synced, skipping", episodeID)
 		return
 	}
 
-	// Extract the MinIO object key from the stored path (strip bucket prefix)
-	mcapKey := stripBucketPrefix(ep.McapPath)
+	syncLogID, attemptCount, err := w.acquireSyncLogWithMode(ctx, episodeID, ep.McapPath, manual)
+	if err != nil {
+		//logger.Printf("[SYNC-WORKER] Failed to acquire sync log for episode %d: %v", episodeID, err)
+		return
+	}
 
-	if mcapKey == "" {
-		logger.Printf("[SYNC-WORKER] Episode %d has empty mcap_path, skipping", episodeID)
+	startTime := time.Now()
+
+	result, err := w.uploadEpisodeDirect(ctx, ep)
+	if err != nil {
+		duration := int64(time.Since(startTime).Seconds())
+		w.markSyncFailed(ctx, syncLogID, episodeID, duration, err, attemptCount)
 		return
 	}
 
-	// Build raw tags from sidecar JSON (best-effort: log and continue on failure).
-	rawTags := map[string]string{
-		"episode_id": ep.EpisodeUUID,
+	// Success: update episode and sync_log
+	duration := int64(time.Since(startTime).Seconds())
+	w.markSyncCompleted(ctx, syncLogID, episodeID, result, duration)
+}
+
+func (w *SyncWorker) uploadEpisodeDirect(ctx context.Context, ep syncEpisodeUploadRow) (*cloud.UploadResult, error) {
+	mcapKey := stripBucketPrefix(ep.McapPath)
+	if mcapKey == "" {
+		return nil, newNonRetryableSyncError("episode %d has empty mcap_path", ep.ID)
 	}
-	if sidecarTags, err := w.tagsFromSidecar(ctx, ep.SidecarPath); err != nil {
-		logger.Printf("[SYNC-WORKER] Episode %d: failed to read sidecar tags, uploading without them: %v", episodeID, err)
-	} else {
-		for k, v := range sidecarTags {
-			rawTags[k] = v
-		}
+
+	assetID, err := resolveAssetIDForEpisode(ctx, w.db, ep.ID, ep.Metadata, ep.WorkstationID)
+	if err != nil {
+		return nil, wrapNonRetryableSyncError(err, "resolve asset_id for episode %d", ep.ID)
 	}
 
-	// Reuse latest failed sync_log when retry is due, otherwise insert a new row.
-	syncLogID, attemptCount, err := w.acquireSyncLogWithMode(ctx, episodeID, ep.McapPath, manual)
+	if w.syncCfg == nil || strings.TrimSpace(w.syncCfg.DPConfigPath) == "" {
+		return nil, newNonRetryableSyncError("KEYSTONE_SYNC_DP_CONFIG is required for direct sync")
+	}
+	dpConfig, err := loadDPDeviceUploadConfig(w.syncCfg.DPConfigPath, assetID)
 	if err != nil {
-		//logger.Printf("[SYNC-WORKER] Failed to acquire sync log for episode %d: %v", episodeID, err)
-		return
+		return nil, wrapNonRetryableSyncError(err, "load DP config for asset_id %s", assetID)
 	}
 
-	startTime := time.Now()
+	sidecarTags, err := w.directTagsFromSidecar(ctx, ep.SidecarPath)
+	if err != nil {
+		return nil, err
+	}
+
+	rawTags, err := buildDPDirectRawTags(dpRawTagsInput{
+		Profile:         dpConfig.Profile,
+		McapKey:         mcapKey,
+		SidecarTags:     sidecarTags,
+		EpisodeID:       ep.ID,
+		EpisodePublicID: ep.EpisodeUUID,
+		TaskID:          ep.TaskID,
+		FactoryID:       ep.FactoryID,
+		OrganizationID:  ep.OrganizationID,
+	})
+	if err != nil {
+		return nil, wrapNonRetryableSyncError(err, "build raw tags for episode %d", ep.ID)
+	}
+
+	uploader, cleanup, err := w.newDirectUploader(dpConfig)
+	if err != nil {
+		return nil, fmt.Errorf("create direct uploader for asset_id %s: %w", assetID, err)
+	}
+	defer cleanup()
+
+	logger.Printf("[SYNC-WORKER] Episode %d direct sync config resolved: asset_id=%s auth=%s auth_tls=%t gateway=%s gateway_tls=%t",
+		ep.ID, assetID, dpConfig.Auth.Target, dpConfig.Auth.UseTLS, dpConfig.Gateway.Target, dpConfig.Gateway.UseTLS)
 
-	// Execute upload
-	result, err := w.uploader.Upload(ctx, cloud.UploadRequest{
+	return uploader.Upload(ctx, cloud.UploadRequest{
 		EpisodeID: ep.EpisodeUUID,
 		McapKey:   mcapKey,
+		AssetID:   assetID,
 		RawTags:   rawTags,
 	})
+}
+
+func (w *SyncWorker) newDirectUploader(dpConfig *DPDeviceUploadConfig) (*cloud.Uploader, func(), error) {
+	if dpConfig == nil {
+		return nil, func() {}, fmt.Errorf("missing DP upload config")
+	}
+	authClient := cloud.NewAuthClient(cloud.AuthClientConfig{
+		Endpoint:      dpConfig.Auth.Target,
+		UseTLS:        dpConfig.Auth.UseTLS,
+		TLSServerName: dpConfig.Auth.ServerName,
+		APIKey:        dpConfig.Profile.APIKey,
+		RefreshBefore: 60 * time.Second,
+	})
+	gatewayClient := cloud.NewGatewayClient(cloud.GatewayClientConfig{
+		Endpoint:       dpConfig.Gateway.Target,
+		UseTLS:         dpConfig.Gateway.UseTLS,
+		TLSServerName:  dpConfig.Gateway.ServerName,
+		RequestTimeout: w.syncRequestTimeout(),
+	}, authClient)
+	cleanup := func() {
+		if err := gatewayClient.Close(); err != nil {
+			logger.Printf("[SYNC-WORKER] Failed to close direct gateway client: %v", err)
+		}
+		if err := authClient.Close(); err != nil {
+			logger.Printf("[SYNC-WORKER] Failed to close direct auth client: %v", err)
+		}
+	}
+
+	uploader, err := cloud.NewUploader(gatewayClient, w.minioClient, w.minioBucket, cloud.UploaderConfig{
+		RequestTimeout:  w.syncRequestTimeout(),
+		OSSTimeout:      w.syncOSSTimeout(),
+		PersistRootDir:  w.syncPersistRootDir(),
+		MaxRestartCount: uint32(w.syncMaxRestartCount()), //nolint:gosec // non-negative by helper
+	})
 	if err != nil {
-		duration := int64(time.Since(startTime).Seconds())
-		w.markSyncFailed(ctx, syncLogID, episodeID, duration, err, attemptCount)
-		return
+		cleanup()
+		return nil, func() {}, err
+	}
+	return uploader, cleanup, nil
+}
+
+func (w *SyncWorker) syncRequestTimeout() time.Duration {
+	if w.syncCfg != nil && w.syncCfg.RequestTimeoutSec > 0 {
+		return time.Duration(w.syncCfg.RequestTimeoutSec) * time.Second
 	}
+	return 30 * time.Second
+}
 
-	// Success: update episode and sync_log
-	duration := int64(time.Since(startTime).Seconds())
-	w.markSyncCompleted(ctx, syncLogID, episodeID, result, duration)
+func (w *SyncWorker) syncOSSTimeout() time.Duration {
+	if w.syncCfg != nil && w.syncCfg.OSSTimeoutSec > 0 {
+		return time.Duration(w.syncCfg.OSSTimeoutSec) * time.Second
+	}
+	return 300 * time.Second
+}
+
+func (w *SyncWorker) syncPersistRootDir() string {
+	if w.syncCfg == nil {
+		return ""
+	}
+	return w.syncCfg.PersistRootDir
+}
+
+func (w *SyncWorker) syncMaxRestartCount() int {
+	if w.syncCfg != nil && w.syncCfg.MaxRestartCount >= 0 {
+		return w.syncCfg.MaxRestartCount
+	}
+	return 3
 }
 
 func (w *SyncWorker) acquireSyncLogWithMode(ctx context.Context, episodeID int64, sourcePath string, manual bool) (int64, int, error) {
@@ -893,7 +1095,7 @@ func (w *SyncWorker) acquireSyncLogWithMode(ctx context.Context, episodeID int64
 		case "completed":
 			return 0, 0, fmt.Errorf("episode %d already has completed sync_log", episodeID)
 		case "failed":
-			retryDue := !latest.NextRetry.Valid || !latest.NextRetry.Time.After(now)
+			retryDue := latest.NextRetry.Valid && !latest.NextRetry.Time.After(now)
 			if latest.AttemptCount < w.cfg.MaxRetries && retryDue {
 				res, updErr := tx.ExecContext(ctx, `
 					UPDATE sync_logs
@@ -924,6 +1126,9 @@ func (w *SyncWorker) acquireSyncLogWithMode(ctx context.Context, episodeID int64
 				return latest.ID, latest.AttemptCount + 1, nil
 			}
 
+			if !manual && !latest.NextRetry.Valid {
+				return 0, 0, fmt.Errorf("%w for episode %d", errSyncNonRetryableFailed, episodeID)
+			}
 			if !manual && latest.AttemptCount >= w.cfg.MaxRetries {
 				return 0, 0, fmt.Errorf("max retries exceeded for episode %d", episodeID)
 			}
@@ -1006,8 +1211,11 @@ func (w *SyncWorker) markSyncFailed(ctx context.Context, syncLogID, episodeID, d
 	now := time.Now().UTC()
 	errMsg := uploadErr.Error()
 
-	backoff := w.nextRetryDelay(attemptCount)
-	nextRetry := now.Add(backoff)
+	var nextRetry sql.NullTime
+	if !isNonRetryableSyncError(uploadErr) {
+		backoff := w.nextRetryDelay(attemptCount)
+		nextRetry = sql.NullTime{Time: now.Add(backoff), Valid: true}
+	}
 
 	if _, err := w.db.ExecContext(ctx, `
 		UPDATE sync_logs
@@ -1021,8 +1229,13 @@ func (w *SyncWorker) markSyncFailed(ctx context.Context, syncLogID, episodeID, d
 		logger.Printf("[SYNC-WORKER] Failed to update sync log %d as failed: %v", syncLogID, err)
 	}
 
-	logger.Printf("[SYNC-WORKER] Episode %d sync failed: %v (attempt=%d, next_retry=%v)",
-		episodeID, uploadErr, attemptCount, nextRetry.Format(time.RFC3339))
+	if nextRetry.Valid {
+		logger.Printf("[SYNC-WORKER] Episode %d sync failed: %v (attempt=%d, next_retry=%v)",
+			episodeID, uploadErr, attemptCount, nextRetry.Time.Format(time.RFC3339))
+		return
+	}
+	logger.Printf("[SYNC-WORKER] Episode %d sync failed non-retryable: %v (attempt=%d)",
+		episodeID, uploadErr, attemptCount)
 }
 
 func (w *SyncWorker) nextRetryDelay(attemptCount int) time.Duration {
@@ -1068,13 +1281,10 @@ func (w *SyncWorker) nextRetryDelay(attemptCount int) time.Duration {
 	return time.Duration(totalSec * float64(time.Second))
 }
 
-// tagsFromSidecar reads the sidecar JSON from MinIO and returns it as a flat string map
-// for use as RawTags. topics_summary is excluded. Returns nil map and an error if the
-// sidecar path is empty, the object cannot be read, or the JSON is malformed.
-func (w *SyncWorker) tagsFromSidecar(ctx context.Context, sidecarPath string) (map[string]string, error) {
+func (w *SyncWorker) directTagsFromSidecar(ctx context.Context, sidecarPath string) (map[string]string, error) {
 	key := stripBucketPrefix(sidecarPath)
 	if key == "" {
-		return nil, fmt.Errorf("empty sidecar_path")
+		return nil, newNonRetryableSyncError("empty sidecar_path")
 	}
 	if w.minioClient == nil {
 		return nil, fmt.Errorf("minio client not available")
@@ -1095,7 +1305,7 @@ func (w *SyncWorker) tagsFromSidecar(ctx context.Context, sidecarPath string) (m
 
 	tags, err := flattenSidecar(data)
 	if err != nil {
-		return nil, fmt.Errorf("flatten sidecar %s: %w", key, err)
+		return nil, wrapNonRetryableSyncError(err, "flatten sidecar %s", key)
 	}
 	return tags, nil
 }
diff --git a/internal/services/sync_worker_test.go b/internal/services/sync_worker_test.go
index 02d3151..d5d26d6 100644
--- a/internal/services/sync_worker_test.go
+++ b/internal/services/sync_worker_test.go
@@ -5,11 +5,17 @@
 package services
 
 import (
+	"bytes"
 	"context"
+	"database/sql"
 	"errors"
+	"log"
+	"strings"
 	"testing"
 	"time"
 
+	"archebase.com/keystone-edge/internal/cloud"
+	"archebase.com/keystone-edge/internal/logger"
 	"github.com/jmoiron/sqlx"
 	_ "modernc.org/sqlite"
 )
@@ -129,6 +135,27 @@ func TestFindPendingEpisodes_ExcludesExhaustedFailuresFromPollingOnly(t *testing
 	assertEpisodeIDs(t, pollIDs, []int64{1, 3})
 }
 
+// TestFindPendingEpisodes_SkipsNonRetryableFailuresFromPollingOnly seeds episode 6 with a
+// non-retryable failure and expects the API query to list it while polling omits it.
+func TestFindPendingEpisodes_SkipsNonRetryableFailuresFromPollingOnly(t *testing.T) {
+	db := newTestSyncWorkerDB(t)
+	worker := &SyncWorker{db: db, cfg: SyncWorkerConfig{BatchSize: 10, MaxRetries: 3}}
+	insertEpisodeForSyncWorkerTest(t, db, 5, "approved", false)
+	insertEpisodeForSyncWorkerTest(t, db, 6, "approved", false)
+	insertNonRetryableSyncLogForSyncWorkerTest(t, db, 6, "failed", 1)
+
+	apiIDs, err := worker.findPendingEpisodes(context.Background(), true)
+	if err != nil {
+		t.Fatalf("api pending query failed: %v", err)
+	}
+	assertEpisodeIDs(t, apiIDs, []int64{5, 6})
+	pollIDs, err := worker.findPendingEpisodes(context.Background(), false)
+	if err != nil {
+		t.Fatalf("poll pending query failed: %v", err)
+	}
+	assertEpisodeIDs(t, pollIDs, []int64{5})
+}
+
 func TestEnqueueEpisodeManual_AllowsExhaustedRetryEpisode(t *testing.T) {
 	db := newTestSyncWorkerDB(t)
 	w := &SyncWorker{
@@ -199,6 +226,68 @@ func TestEnqueueEpisodeManual_PromotesDueFailureToPending(t *testing.T) {
 	}
 }
 
+// TestEnqueueEpisodeResync_AllowsAlreadySyncedEpisode verifies that a resync of a
+// synced episode adds a pending sync_log and enqueues a manual resync request.
+func TestEnqueueEpisodeResync_AllowsAlreadySyncedEpisode(t *testing.T) {
+	db := newTestSyncWorkerDB(t)
+	worker := &SyncWorker{
+		db:              db,
+		cfg:             SyncWorkerConfig{BatchSize: 10, MaxRetries: 3},
+		enqueueCh:       make(chan syncEnqueueRequest, 1),
+		enqueuedEpisode: make(map[int64]struct{}),
+	}
+	worker.running.Store(true)
+	insertEpisodeForSyncWorkerTest(t, db, 27, "approved", true)
+	insertSyncLogForSyncWorkerTest(t, db, 27, "completed", 1)
+
+	if err := worker.EnqueueEpisodeResync(context.Background(), 27); err != nil {
+		t.Fatalf("resync enqueue failed: %v", err)
+	}
+	if latest := latestSyncLogForSyncWorkerTest(t, db, 27); latest.Status != "pending" {
+		t.Fatalf("latest status = %q, want pending", latest.Status)
+	}
+	if count := countSyncLogsForSyncWorkerTest(t, db, 27); count != 2 {
+		t.Fatalf("sync log count = %d, want completed history plus resync pending", count)
+	}
+
+	var got syncEnqueueRequest
+	select {
+	case got = <-worker.enqueueCh:
+	default:
+		t.Fatal("expected resync episode to be enqueued")
+	}
+	if got.episodeID != 27 {
+		t.Fatalf("unexpected episode id: got %d want 27", got.episodeID)
+	}
+	if !(got.manual && got.resync) {
+		t.Fatalf("enqueue flags = manual:%t resync:%t, want both true", got.manual, got.resync)
+	}
+}
+
+// A pending sync_log on an already-synced episode must be dispatched with the resync flag set.
+func TestDispatchPendingSyncLogs_TreatsSyncedPendingRowsAsResync(t *testing.T) {
+	db := newTestSyncWorkerDB(t)
+	worker := &SyncWorker{
+		db:              db,
+		cfg:             SyncWorkerConfig{BatchSize: 10, MaxRetries: 3},
+		jobCh:           make(chan syncEnqueueRequest, 1),
+		enqueuedEpisode: make(map[int64]struct{}),
+	}
+	insertEpisodeForSyncWorkerTest(t, db, 28, "approved", true)
+	insertSyncLogForSyncWorkerTest(t, db, 28, "pending", 0)
+
+	worker.dispatchPendingSyncLogs(context.Background())
+	var got syncEnqueueRequest
+	select {
+	case got = <-worker.jobCh:
+	default:
+		t.Fatal("expected synced pending row to be dispatched as resync")
+	}
+	if got.episodeID != 28 || !got.resync {
+		t.Fatalf("dispatched request = %+v, want episode 28 resync", got)
+	}
+}
+
 func TestEnqueueEpisode_RejectsInProgressEpisode(t *testing.T) {
 	db := newTestSyncWorkerDB(t)
 	w := &SyncWorker{
@@ -262,6 +351,35 @@ func TestEnqueueEpisodeManual_RejectsPendingEpisode(t *testing.T) {
 	}
 }
 
+func TestEnqueueEpisodeManual_AllowsNonRetryableFailure(t *testing.T) { // Checks that EnqueueEpisodeManual succeeds when the latest sync log is a failed row and adds a new pending row.
+	db := newTestSyncWorkerDB(t)
+	w := &SyncWorker{
+		db:              db,
+		cfg:             SyncWorkerConfig{BatchSize: 10, MaxRetries: 3},
+		enqueueCh:       make(chan syncEnqueueRequest, 1),
+		enqueuedEpisode: make(map[int64]struct{}),
+	}
+	w.running.Store(true) // NOTE(review): presumably enqueue is refused unless the worker reports running — confirm
+
+	insertEpisodeForSyncWorkerTest(t, db, 24, "approved", false)
+	insertNonRetryableSyncLogForSyncWorkerTest(t, db, 24, "failed", 1) // failed row with next_retry_at NULL and attempt_count 1
+
+	if err := w.EnqueueEpisodeManual(context.Background(), 24); err != nil {
+		t.Fatalf("manual enqueue failed: %v", err)
+	}
+
+	latest := latestSyncLogForSyncWorkerTest(t, db, 24)
+	if latest.Status != "pending" {
+		t.Fatalf("latest status = %q, want pending", latest.Status)
+	}
+	if latest.AttemptCount != 0 { // the newest row must start from 0 rather than inherit the failed row's count of 1
+		t.Fatalf("latest attempt_count = %d, want fresh pending attempt count 0", latest.AttemptCount)
+	}
+	if count := countSyncLogsForSyncWorkerTest(t, db, 24); count != 2 { // the failed row is kept and one pending row is added
+		t.Fatalf("sync log count = %d, want failed history plus fresh pending", count)
+	}
+}
+
 func TestEnqueuePendingEpisodes_PersistsPendingWhenMemoryQueueFull(t *testing.T) {
 	db := newTestSyncWorkerDB(t)
 	w := &SyncWorker{
@@ -448,6 +566,54 @@ func TestRetryFailedEpisodes_PromotesDueFailureToPendingBeforeDispatch(t *testin
 	}
 }
 
+func TestRetryFailedEpisodes_IgnoresMissingDeletedAndRetriesSyncedEpisodesAsResync(t *testing.T) { // Runs retryFailedEpisodes over failed logs for episodes 2-5 and expects only 4 (resync) and 5 (non-resync) to be dispatched.
+	db := newTestSyncWorkerDB(t)
+	w := &SyncWorker{
+		db:              db,
+		cfg:             SyncWorkerConfig{BatchSize: 10, MaxRetries: 3},
+		jobCh:           make(chan syncEnqueueRequest, 2), // capacity 2: one slot each for episodes 4 and 5
+		enqueuedEpisode: make(map[int64]struct{}),
+	}
+
+	insertSyncLogForSyncWorkerTest(t, db, 2, "failed", 1) // episode 2 is never inserted, so this log references a missing episode
+	insertEpisodeForSyncWorkerTest(t, db, 3, "approved", false)
+	insertSyncLogForSyncWorkerTest(t, db, 3, "failed", 1)
+	if _, err := db.Exec(`UPDATE episodes SET deleted_at = ? WHERE id = 3`, time.Now().UTC()); err != nil { // soft-delete episode 3
+		t.Fatalf("mark episode deleted: %v", err)
+	}
+	insertEpisodeForSyncWorkerTest(t, db, 4, "approved", true)
+	insertSyncLogForSyncWorkerTest(t, db, 4, "failed", 1)
+	insertEpisodeForSyncWorkerTest(t, db, 5, "approved", false)
+	insertSyncLogForSyncWorkerTest(t, db, 5, "failed", 1)
+
+	var logs bytes.Buffer // receives logger output so the retry-failure message can be asserted absent
+	previousLogger := logger.Get()
+	logger.Set(log.New(&logs, "", 0))
+	t.Cleanup(func() { logger.Set(previousLogger) })
+
+	w.retryFailedEpisodes(context.Background())
+
+	if strings.Contains(logs.String(), "Failed to queue retry") {
+		t.Fatalf("unexpected retry queue failure log: %s", logs.String())
+	}
+
+	for _, episodeID := range []int64{4, 5} {
+		latest := latestSyncLogForSyncWorkerTest(t, db, episodeID)
+		if latest.Status != "pending" {
+			t.Fatalf("episode %d latest status = %q, want pending", episodeID, latest.Status)
+		}
+	}
+
+	gotSynced := <-w.jobCh // NOTE(review): blocking receive; if nothing was dispatched this hangs until the go test timeout
+	if gotSynced.episodeID != 4 || !gotSynced.resync {
+		t.Fatalf("unexpected synced retry dispatch: got %+v want episode 4 resync", gotSynced)
+	}
+	gotUnsynced := <-w.jobCh
+	if gotUnsynced.episodeID != 5 || gotUnsynced.resync {
+		t.Fatalf("unexpected unsynced retry dispatch: got %+v want episode 5 non-resync", gotUnsynced)
+	}
+}
+
 func TestAcquireSyncLogWithMode_ClaimsFreshPendingRow(t *testing.T) {
 	db := newTestSyncWorkerDB(t)
 	w := &SyncWorker{
@@ -517,7 +683,7 @@ func TestProcessEnqueuedEpisode_HoldsMarkerUntilProcessingReturns(t *testing.T)
 		w.processEnqueuedEpisodeWith(
 			context.Background(),
 			syncEnqueueRequest{episodeID: 77, manual: true},
-			func(context.Context, int64, bool) {
+			func(context.Context, int64, bool, bool) {
 				close(started)
 				<-release
 			},
@@ -616,6 +782,74 @@ func TestNextRetryDelay_IncludesBoundedJitter(t *testing.T) {
 	}
 }
 
+func TestMarkSyncFailed_NonRetryableClearsNextRetry(t *testing.T) { // Checks that markSyncFailed with a newNonRetryableSyncError leaves the row failed with next_retry_at NULL.
+	db := newTestSyncWorkerDB(t)
+	w := &SyncWorker{
+		db:  db,
+		cfg: SyncWorkerConfig{RetryBaseSec: 30, RetryMaxSec: 1800},
+	}
+
+	insertEpisodeForSyncWorkerTest(t, db, 25, "approved", false)
+	insertSyncLogForSyncWorkerTest(t, db, 25, "in_progress", 1)
+	var syncLogID int64
+	if err := db.Get(&syncLogID, "SELECT id FROM sync_logs WHERE episode_id = ?", 25); err != nil { // fetch the id of the row inserted above, keyed by episode_id
+		t.Fatalf("query sync log id: %v", err)
+	}
+
+	w.markSyncFailed(context.Background(), syncLogID, 25, 0, newNonRetryableSyncError("asset_id missing"), 1)
+
+	latest := latestSyncLogForSyncWorkerTest(t, db, 25)
+	if latest.Status != "failed" {
+		t.Fatalf("latest status = %q, want failed", latest.Status)
+	}
+	if latest.NextRetry.Valid { // sql.NullTime.Valid is false when the scanned column was NULL
+		t.Fatalf("next_retry_at valid = true, want NULL")
+	}
+}
+
+func TestMarkSyncCompleted_WritesExistingCloudFields(t *testing.T) { // Checks that markSyncCompleted writes the upload result into the episodes and sync_logs rows.
+	db := newTestSyncWorkerDB(t)
+	w := &SyncWorker{db: db}
+
+	insertEpisodeForSyncWorkerTest(t, db, 26, "approved", false)
+	insertSyncLogForSyncWorkerTest(t, db, 26, "in_progress", 1)
+	var syncLogID int64
+	if err := db.Get(&syncLogID, "SELECT id FROM sync_logs WHERE episode_id = ?", 26); err != nil { // fetch the id of the row inserted above, keyed by episode_id
+		t.Fatalf("query sync log id: %v", err)
+	}
+
+	w.markSyncCompleted(context.Background(), syncLogID, 26, &cloud.UploadResult{
+		LogicalUploadID: "logical-26",
+		UploadID:        "upload-26",
+		ObjectKey:       "cloud/object.mcap",
+		FileSize:        12345,
+	}, 3)
+
+	var ep struct {
+		CloudSynced    bool   `db:"cloud_synced"`
+		CloudMcapPath  string `db:"cloud_mcap_path"`
+		CloudProcessed bool   `db:"cloud_processed"`
+	}
+	if err := db.Get(&ep, "SELECT cloud_synced, cloud_mcap_path, cloud_processed FROM episodes WHERE id = ?", 26); err != nil {
+		t.Fatalf("query episode cloud fields: %v", err)
+	}
+	if !ep.CloudSynced || ep.CloudMcapPath != "cloud/object.mcap" || ep.CloudProcessed { // want cloud_synced true, path equal to ObjectKey, cloud_processed still false
+		t.Fatalf("episode cloud fields = %+v", ep)
+	}
+
+	var logRow struct {
+		Status           string `db:"status"`
+		DestinationPath  string `db:"destination_path"`
+		BytesTransferred int64  `db:"bytes_transferred"`
+	}
+	if err := db.Get(&logRow, "SELECT status, destination_path, bytes_transferred FROM sync_logs WHERE id = ?", syncLogID); err != nil {
+		t.Fatalf("query sync log completion fields: %v", err)
+	}
+	if logRow.Status != "completed" || logRow.DestinationPath != "cloud/object.mcap" || logRow.BytesTransferred != 12345 { // want status completed, destination equal to ObjectKey, bytes equal to FileSize
+		t.Fatalf("sync log completion fields = %+v", logRow)
+	}
+}
+
 func newTestSyncWorkerDB(t *testing.T) *sqlx.DB {
 	t.Helper()
 
@@ -629,6 +863,9 @@ func newTestSyncWorkerDB(t *testing.T) *sqlx.DB {
 			id INTEGER PRIMARY KEY,
 			qa_status TEXT NOT NULL,
 			cloud_synced BOOLEAN NOT NULL DEFAULT 0,
+			cloud_synced_at TIMESTAMP NULL,
+			cloud_mcap_path TEXT,
+			cloud_processed BOOLEAN NOT NULL DEFAULT 0,
 			deleted_at TIMESTAMP NULL,
 			created_at TIMESTAMP NOT NULL
 		)`,
@@ -637,6 +874,8 @@ func newTestSyncWorkerDB(t *testing.T) *sqlx.DB {
 				episode_id INTEGER NOT NULL,
 				source_path TEXT,
 				status TEXT NOT NULL,
+				destination_path TEXT,
+				bytes_transferred INTEGER,
 				duration_sec INTEGER,
 				error_message TEXT,
 				attempt_count INTEGER NOT NULL DEFAULT 0,
@@ -675,18 +914,35 @@ func insertEpisodeForSyncWorkerTest(t *testing.T, db *sqlx.DB, id int64, qaStatu
 func insertSyncLogForSyncWorkerTest(t *testing.T, db *sqlx.DB, episodeID int64, status string, attemptCount int) {
 	t.Helper()
 
+	startedAt := time.Date(2026, 2, int(episodeID), 0, 0, 0, 0, time.UTC) // fixed timestamp: the episode ID is used as the day of February 2026 (time.Date normalizes out-of-range days)
+	nextRetry := sql.NullTime{}
+	if status == "failed" { // only failed rows get a next_retry_at (started_at + 1s); any other status stores NULL
+		nextRetry = sql.NullTime{Time: startedAt.Add(time.Second), Valid: true}
+	}
+	if _, err := db.Exec(`
+		INSERT INTO sync_logs (episode_id, status, attempt_count, started_at, next_retry_at)
+		VALUES (?, ?, ?, ?, ?)
+	`, episodeID, status, attemptCount, startedAt, nextRetry); err != nil {
+		t.Fatalf("insert sync log for episode %d: %v", episodeID, err)
+	}
+}
+
+func insertNonRetryableSyncLogForSyncWorkerTest(t *testing.T, db *sqlx.DB, episodeID int64, status string, attemptCount int) { // Inserts a sync_logs row whose next_retry_at is always NULL, whatever the status.
+	t.Helper()
+
 	startedAt := time.Date(2026, 2, int(episodeID), 0, 0, 0, 0, time.UTC)
 	if _, err := db.Exec(`
-		INSERT INTO sync_logs (episode_id, status, attempt_count, started_at)
-		VALUES (?, ?, ?, ?)
+		INSERT INTO sync_logs (episode_id, status, attempt_count, started_at, next_retry_at)
+		VALUES (?, ?, ?, ?, NULL)
 	`, episodeID, status, attemptCount, startedAt); err != nil {
 		t.Fatalf("insert sync log for episode %d: %v", episodeID, err)
 	}
 }
 
 type syncLogForSyncWorkerTest struct {
-	Status       string `db:"status"`
-	AttemptCount int    `db:"attempt_count"`
+	Status       string       `db:"status"`
+	AttemptCount int          `db:"attempt_count"`
+	NextRetry    sql.NullTime `db:"next_retry_at"` // Valid is false when next_retry_at is NULL
 }
 
 func latestSyncLogForSyncWorkerTest(t *testing.T, db *sqlx.DB, episodeID int64) syncLogForSyncWorkerTest {
@@ -694,7 +950,7 @@ func latestSyncLogForSyncWorkerTest(t *testing.T, db *sqlx.DB, episodeID int64)
 
 	var row syncLogForSyncWorkerTest
 	if err := db.Get(&row, `
-		SELECT status, attempt_count
+		SELECT status, attempt_count, next_retry_at
 		FROM sync_logs
 		WHERE episode_id = ?
 		ORDER BY id DESC
diff --git a/internal/storage/database/migrations/000004_robot_asset_id.down.sql b/internal/storage/database/migrations/000004_robot_asset_id.down.sql
new file mode 100644
index 0000000..c601e3d
--- /dev/null
+++ b/internal/storage/database/migrations/000004_robot_asset_id.down.sql
@@ -0,0 +1,7 @@
+-- SPDX-FileCopyrightText: 2026 ArcheBase
+--
+-- SPDX-License-Identifier: MulanPSL-2.0
+
+ALTER TABLE robots -- Down migration: removes the idx_asset_active_unique index and the _asset_unique column from robots.
+    DROP INDEX idx_asset_active_unique,
+    DROP COLUMN _asset_unique;
diff --git a/internal/storage/database/migrations/000004_robot_asset_id.up.sql b/internal/storage/database/migrations/000004_robot_asset_id.up.sql
new file mode 100644
index 0000000..6bcf96c
--- /dev/null
+++ b/internal/storage/database/migrations/000004_robot_asset_id.up.sql
@@ -0,0 +1,14 @@
+-- SPDX-FileCopyrightText: 2026 ArcheBase
+--
+-- SPDX-License-Identifier: MulanPSL-2.0
+
+ALTER TABLE robots -- Up migration: adds a stored generated column plus a unique index over it.
+    ADD COLUMN _asset_unique VARCHAR(100)
+        GENERATED ALWAYS AS (
+            CASE
+                WHEN deleted_at IS NULL AND asset_id IS NOT NULL AND asset_id <> '' -- only non-deleted rows with a non-empty asset_id carry a value
+                THEN asset_id
+                ELSE NULL -- NOTE(review): assumes MySQL-style UNIQUE indexes, which permit multiple NULLs, so these rows never conflict
+            END
+        ) STORED,
+    ADD UNIQUE INDEX idx_asset_active_unique (_asset_unique);