From ea54799eaf7767aebf0bdf69313469617340dc9a Mon Sep 17 00:00:00 2001
From: Cursor Agent <cursoragent@cursor.com>
Date: Sun, 7 Jun 2026 11:06:44 +0000
Subject: [PATCH 1/2] fix(agent): avoid utf8 panic in planning errors

Co-authored-by: EXboy <EXboys@users.noreply.github.com>
---
 .../skilllite-agent/src/agent_loop/helpers.rs | 15 +++++-
 crates/skilllite-agent/src/task_planner.rs    | 16 +++++-
 .../CONTEXT.md                                | 28 ++++++++++
 tasks/TASK-2026-068-critical-bug-audit/PRD.md | 40 ++++++++++++++
 .../REVIEW.md                                 | 29 ++++++++++
 .../STATUS.md                                 | 17 ++++++
 .../TASK-2026-068-critical-bug-audit/TASK.md  | 54 +++++++++++++++++++
 tasks/board.md                                |  4 +-
 8 files changed, 199 insertions(+), 4 deletions(-)
 create mode 100644 tasks/TASK-2026-068-critical-bug-audit/CONTEXT.md
 create mode 100644 tasks/TASK-2026-068-critical-bug-audit/PRD.md
 create mode 100644 tasks/TASK-2026-068-critical-bug-audit/REVIEW.md
 create mode 100644 tasks/TASK-2026-068-critical-bug-audit/STATUS.md
 create mode 100644 tasks/TASK-2026-068-critical-bug-audit/TASK.md

diff --git a/crates/skilllite-agent/src/agent_loop/helpers.rs b/crates/skilllite-agent/src/agent_loop/helpers.rs
index 15ecee94..0d163006 100644
--- a/crates/skilllite-agent/src/agent_loop/helpers.rs
+++ b/crates/skilllite-agent/src/agent_loop/helpers.rs
@@ -72,7 +72,7 @@ pub(super) fn handle_update_task_plan(
                                 "'tasks' must be a JSON array, got a string that is not valid JSON array. \
                                  Pass tasks as a real array: {{\"tasks\": [{{...}}]}} not {{\"tasks\": \"[...]\"}}. \
                                  Received string preview: {:?}",
-                                &s[..s.len().min(120)]
+                                safe_truncate(s, 120)
                             ),
                             is_error: true,
                             counts_as_failure: true,
@@ -1104,6 +1104,19 @@ mod tests {
         assert!(r.content.contains("must be a JSON array"), "{}", r.content);
     }
 
+    #[test]
+    fn update_task_plan_rejects_non_array_string_without_utf8_boundary_panic() {
+        let mut planner = TaskPlanner::new(None, None, None);
+        let mut sink = SilentEventSink;
+        let invalid_tasks = format!("{}汉 trailing invalid json", "a".repeat(119));
+        let args = serde_json::json!({ "tasks": invalid_tasks }).to_string();
+
+        let r = handle_update_task_plan(&args, &mut planner, &[], &mut sink);
+
+        assert!(r.is_error);
+        assert!(r.content.contains("must be a JSON array"), "{}", r.content);
+    }
+
     #[test]
     fn update_task_plan_rejects_missing_tasks_field() {
         let mut planner = TaskPlanner::new(None, None, None);
diff --git a/crates/skilllite-agent/src/task_planner.rs b/crates/skilllite-agent/src/task_planner.rs
index ef577c39..199918d7 100644
--- a/crates/skilllite-agent/src/task_planner.rs
+++ b/crates/skilllite-agent/src/task_planner.rs
@@ -394,7 +394,7 @@ impl TaskPlanner {
 
         tracing::debug!(
             "parse_task_list raw (first 500 chars): {}",
-            &raw[..raw.len().min(500)]
+            safe_truncate(raw, 500)
         );
         bail!("No valid JSON task array found in LLM response")
     }
@@ -777,6 +777,20 @@ mod tests {
         assert!(empty.is_empty());
     }
 
+    #[test]
+    fn parse_task_list_returns_error_without_utf8_boundary_panic() {
+        let planner = TaskPlanner::new(None, None, None);
+        let raw = format!("{}汉 trailing invalid planning response", "a".repeat(499));
+
+        let err = planner.parse_task_list(&raw).unwrap_err();
+
+        assert!(
+            err.to_string().contains("No valid JSON task array found"),
+            "{}",
+            err
+        );
+    }
+
     #[test]
     fn test_planning_prompt_contains_placeholders_resolved() {
         let planner = TaskPlanner::new(None, None, None);
diff --git a/tasks/TASK-2026-068-critical-bug-audit/CONTEXT.md b/tasks/TASK-2026-068-critical-bug-audit/CONTEXT.md
new file mode 100644
index 00000000..ee450c84
--- /dev/null
+++ b/tasks/TASK-2026-068-critical-bug-audit/CONTEXT.md
@@ -0,0 +1,28 @@
+# Technical Context
+
+## Current State
+
+- Relevant crates/files: To be determined from recent commit review; likely `crates/`, `docs/`, `python-sdk/`, and task artifacts touched by recent commits.
+- Current behavior: Current branch `cursor/critical-bug-investigation-b158` is aligned with `origin/main` at audit start.
+
+## Architecture Fit
+
+- Layer boundaries involved: Potentially all Rust workspace layers depending on reviewed commits.
+- Interfaces to preserve: Public CLI behavior, sandbox execution policy, agent/tool orchestration contracts, persisted data formats.
+
+## Dependency and Compatibility
+
+- New dependencies: None expected for audit-only work.
+- Backward compatibility notes: No compatibility changes unless a confirmed critical fix requires it.
+
+## Design Decisions
+
+- Decision: Use evidence-based triage before any fix.
+  - Rationale: The automation should not open PRs for speculative or low-impact observations.
+  - Alternatives considered: Broad refactoring or proactive hardening without a confirmed trigger.
+  - Why rejected: It would increase review noise and risk outside the requested scope.
+
+## Open Questions
+
+- [ ] Which recent commits have the largest behavioral blast radius?
+- [ ] Do any suspicious changes have a concrete critical trigger scenario?
diff --git a/tasks/TASK-2026-068-critical-bug-audit/PRD.md b/tasks/TASK-2026-068-critical-bug-audit/PRD.md
new file mode 100644
index 00000000..1765c501
--- /dev/null
+++ b/tasks/TASK-2026-068-critical-bug-audit/PRD.md
@@ -0,0 +1,40 @@
+# PRD
+
+## Background
+
+This scheduled automation performs a focused audit of recent commits to catch high-severity correctness regressions before they reach users.
+
+## Objective
+
+Identify and, when highly confident, minimally fix critical bugs with concrete trigger scenarios. If no such bug is found, produce a concise no-critical-findings report.
+
+## Functional Requirements
+
+- FR-1: Inspect recent behavioral changes and trace affected code paths beyond surface diffs.
+- FR-2: Only treat findings as actionable when they can cause data loss, crash, security bypass, silent corruption/truncation, or significant user-facing breakage.
+- FR-3: If a fix is made, include focused validation evidence.
+
+## Non-Functional Requirements
+
+- Security: Preserve sandbox and approval invariants; do not relax policy while auditing.
+- Performance: Avoid broad test or build work unless needed to validate a concrete finding.
+- Compatibility: Do not alter shipped behavior unless fixing a confirmed critical bug.
+
+## Constraints
+
+- Technical: Work on branch `cursor/critical-bug-investigation-b158`; avoid broad refactors.
+- Timeline: Scheduled automation run; complete autonomously with evidence.
+
+## Success Metrics
+
+- Metric: Number of surfaced critical findings without concrete trigger scenarios.
+- Baseline: Unknown.
+- Target: Zero speculative surfaced findings.
+- Metric: Validation evidence for any repository changes.
+- Baseline: Required by repository workflow.
+- Target: Commands actually run and recorded.
+
+## Rollout
+
+- Rollout plan: If no code fix is needed, report results only. If a critical fix is made, commit, push, and open a PR.
+- Rollback plan: Revert the minimal fix commit if validation or review disproves the finding.
diff --git a/tasks/TASK-2026-068-critical-bug-audit/REVIEW.md b/tasks/TASK-2026-068-critical-bug-audit/REVIEW.md
new file mode 100644
index 00000000..e5fd29ef
--- /dev/null
+++ b/tasks/TASK-2026-068-critical-bug-audit/REVIEW.md
@@ -0,0 +1,29 @@
+# Review Report
+
+## Scope Reviewed
+
+- Files/modules:
+- Commits/changes:
+
+## Findings
+
+- Critical:
+- Major:
+- Minor:
+
+## Quality Gates
+
+- Architecture boundary checks: `pass | fail`
+- Security invariants: `pass | fail`
+- Required tests executed: `pass | fail`
+- Docs sync (EN/ZH): `pass | fail`
+
+## Test Evidence
+
+- Commands run:
+- Key outputs:
+
+## Decision
+
+- Merge readiness: `ready | not ready`
+- Follow-up actions:
diff --git a/tasks/TASK-2026-068-critical-bug-audit/STATUS.md b/tasks/TASK-2026-068-critical-bug-audit/STATUS.md
new file mode 100644
index 00000000..631fab78
--- /dev/null
+++ b/tasks/TASK-2026-068-critical-bug-audit/STATUS.md
@@ -0,0 +1,17 @@
+# Status Journal
+
+## Timeline
+
+- 2026-06-07:
+  - Progress: Created audit task, loaded injected specs, reviewed recent commits, and implemented a minimal UTF-8-safe fix for two agent planning error paths.
+  - Blockers: None.
+  - Next step: Commit/push the implementation, then run targeted and full validation.
+
+## Checkpoints
+
+- [x] PRD drafted before implementation (or `N/A` recorded)
+- [x] Context drafted before implementation (or `N/A` recorded)
+- [x] Implementation complete
+- [ ] Tests passed
+- [ ] Review complete
+- [ ] Board updated
diff --git a/tasks/TASK-2026-068-critical-bug-audit/TASK.md b/tasks/TASK-2026-068-critical-bug-audit/TASK.md
new file mode 100644
index 00000000..ab6497f9
--- /dev/null
+++ b/tasks/TASK-2026-068-critical-bug-audit/TASK.md
@@ -0,0 +1,54 @@
+# TASK Card
+
+## Metadata
+
+- Task ID: `TASK-2026-068`
+- Title: Critical Bug Audit
+- Status: `in_progress`
+- Priority: `P0`
+- Owner: `agent`
+- Contributors:
+- Created: `2026-06-07`
+- Target milestone:
+
+## Problem
+
+Inspect recent SkillLite commits for high-severity correctness regressions that escaped review. Only actionable findings with a concrete trigger scenario should result in a code change or PR.
+
+## Scope
+
+- In scope: Recent commits on `main` / current branch, with emphasis on behavioral changes affecting agent loops, CLI/runtime bridges, sandbox/security behavior, persistence, and UTF-8/error handling paths.
+- Out of scope: Style-only issues, theoretical concerns without a plausible trigger, broad refactors, and low-severity UX degradation.
+
+## Acceptance Criteria
+
+- [x] Recent behavioral commits are reviewed beyond the diff by tracing relevant caller and downstream paths.
+- [x] Any reported bug has a concrete trigger scenario and critical impact.
+- [x] If a critical bug is fixed, the fix is minimal and covered by targeted validation.
+- [ ] Validation commands pass and evidence is recorded.
+
+## Risks
+
+- Risk: Over-reporting speculative issues.
+  - Impact: Unnecessary PR noise and reviewer distraction.
+  - Mitigation: Require a concrete trigger scenario and critical user impact before opening a PR.
+- Risk: Missing a regression outside the highest-risk recent changes.
+  - Impact: Critical bug remains undetected.
+  - Mitigation: Review commit metadata first, then inspect broad-blast-radius code paths.
+
+## Validation Plan
+
+- Required tests: `cargo fmt --check`, `cargo clippy --all-targets -- -D warnings`, `cargo test -p skilllite-agent`, `cargo test`, and `python3 scripts/validate_tasks.py`.
+- Commands to run: `git log`, targeted `git show` / `git diff`, targeted agent regression tests, full workspace validation.
+- Manual checks: Trace caller/downstream paths and confirm whether each candidate has a plausible critical trigger.
+
+## Regression Scope
+
+- Areas likely affected: Agent behavior, command routing, sandbox execution, persistence/logging, UTF-8 truncation and user-facing error summaries.
+- Explicit non-goals: Cosmetic cleanup, unrelated TODO execution, documentation rewrite.
+
+## Links
+
+- Source TODO section: N/A - scheduled critical bug-finding automation.
+- Related PRs/issues: Recent commits on current branch and `main`.
+- Related docs: `spec/verification-integrity.md`, `spec/README.md`.
diff --git a/tasks/board.md b/tasks/board.md
index 4b63f2b2..c43093a0 100644
--- a/tasks/board.md
+++ b/tasks/board.md
@@ -1,10 +1,10 @@
 # Task Board
 
-Last updated: 2026-06-06 (TASK-2026-067 utf8 llm error truncate done)
+Last updated: 2026-06-07 (TASK-2026-068 critical bug audit in progress)
 
 ## In Progress
 
-- None.
+- `TASK-2026-068-critical-bug-audit` - Status: `in_progress` - Owner: `agent`
 
 ## Ready
 

From 807912da6b974696f43c961db01a10b19a2076b3 Mon Sep 17 00:00:00 2001
From: Cursor Agent <cursoragent@cursor.com>
Date: Sun, 7 Jun 2026 11:10:39 +0000
Subject: [PATCH 2/2] docs(task): record critical bug audit validation

Co-authored-by: EXboy <EXboys@users.noreply.github.com>
---
 .../CONTEXT.md                                | 24 +++++++-------
 .../REVIEW.md                                 | 31 ++++++++++++-------
 .../STATUS.md                                 | 15 ++++++---
 .../TASK-2026-068-critical-bug-audit/TASK.md  |  6 ++--
 tasks/board.md                                |  5 +--
 5 files changed, 49 insertions(+), 32 deletions(-)

diff --git a/tasks/TASK-2026-068-critical-bug-audit/CONTEXT.md b/tasks/TASK-2026-068-critical-bug-audit/CONTEXT.md
index ee450c84..afbffa55 100644
--- a/tasks/TASK-2026-068-critical-bug-audit/CONTEXT.md
+++ b/tasks/TASK-2026-068-critical-bug-audit/CONTEXT.md
@@ -2,27 +2,27 @@
 
 ## Current State
 
-- Relevant crates/files: To be determined from recent commit review; likely `crates/`, `docs/`, `python-sdk/`, and task artifacts touched by recent commits.
-- Current behavior: Current branch `cursor/critical-bug-investigation-b158` is aligned with `origin/main` at audit start.
+- Relevant crates/files: `crates/skilllite-agent/src/agent_loop/helpers.rs`, `crates/skilllite-agent/src/task_planner.rs`, and recent UTF-8 truncation commits on `main`.
+- Current behavior: Agent planning error paths now use `safe_truncate` for previews instead of raw byte slicing.
 
 ## Architecture Fit
 
-- Layer boundaries involved: Potentially all Rust workspace layers depending on reviewed commits.
-- Interfaces to preserve: Public CLI behavior, sandbox execution policy, agent/tool orchestration contracts, persisted data formats.
+- Layer boundaries involved: `skilllite-agent` planning and agent-loop error handling.
+- Interfaces to preserve: Tool-result error reporting, planner parse/fallback behavior, and existing `safe_truncate` utility semantics.
 
 ## Dependency and Compatibility
 
-- New dependencies: None expected for audit-only work.
-- Backward compatibility notes: No compatibility changes unless a confirmed critical fix requires it.
+- New dependencies: None.
+- Backward compatibility notes: No API, CLI, schema, or persisted-format changes; invalid non-ASCII planner inputs now return errors instead of panicking.
 
 ## Design Decisions
 
-- Decision: Use evidence-based triage before any fix.
-  - Rationale: The automation should not open PRs for speculative or low-impact observations.
-  - Alternatives considered: Broad refactoring or proactive hardening without a confirmed trigger.
-  - Why rejected: It would increase review noise and risk outside the requested scope.
+- Decision: Fix only the confirmed agent crash paths in this PR.
+  - Rationale: `update_task_plan` and `parse_task_list` are active agent/planning paths where LLM-produced non-ASCII invalid data can panic before returning a recoverable error.
+  - Alternatives considered: Sweep every remaining byte slice across CLI/admin formatting paths.
+  - Why rejected: Several remaining paths are lower-blast-radius or ASCII/index-derived; broad cleanup would exceed the critical-bug scope.
 
 ## Open Questions
 
-- [ ] Which recent commits have the largest behavioral blast radius?
-- [ ] Do any suspicious changes have a concrete critical trigger scenario?
+- [x] Which recent commits have the largest behavioral blast radius?
+- [x] Do any suspicious changes have a concrete critical trigger scenario?
diff --git a/tasks/TASK-2026-068-critical-bug-audit/REVIEW.md b/tasks/TASK-2026-068-critical-bug-audit/REVIEW.md
index e5fd29ef..43b43140 100644
--- a/tasks/TASK-2026-068-critical-bug-audit/REVIEW.md
+++ b/tasks/TASK-2026-068-critical-bug-audit/REVIEW.md
@@ -2,28 +2,37 @@
 
 ## Scope Reviewed
 
-- Files/modules:
-- Commits/changes:
+- Files/modules: Recent commits on current branch/main, `crates/skilllite-agent/src/agent_loop/helpers.rs`, `crates/skilllite-agent/src/task_planner.rs`, related UTF-8 truncation helpers and tests.
+- Commits/changes: Latest UTF-8 truncation fixes, recent agent/CLI/runtime bridge commits, and the new fix commit `ea54799`.
 
 ## Findings
 
-- Critical:
-- Major:
-- Minor:
+- Critical: `update_task_plan` error preview used raw byte slicing on model-provided `tasks` strings. A non-array string with a multibyte character crossing byte 120 panicked before returning a tool error, crashing the active agent turn.
+- Critical: `parse_task_list` debug preview used raw byte slicing on malformed LLM planning output. A multibyte character crossing byte 500 panicked before the intended structured parse error/fallback path.
+- Major: Other lower-blast-radius byte-slice truncation sites remain in CLI/admin formatting paths, but were not fixed because they did not meet the requested critical-confidence bar for this PR.
+- Minor: None.
 
 ## Quality Gates
 
-- Architecture boundary checks: `pass | fail`
-- Security invariants: `pass | fail`
-- Required tests executed: `pass | fail`
-- Docs sync (EN/ZH): `pass | fail`
+- Architecture boundary checks: `pass`
+- Security invariants: `pass`
+- Required tests executed: `pass`
+- Docs sync (EN/ZH): `pass` - no user-facing command/env/doc semantics changed.
 
 ## Test Evidence
 
 - Commands run:
+  - `cargo test -p skilllite-agent update_task_plan_rejects_non_array_string_without_utf8_boundary_panic`
+  - `cargo test -p skilllite-agent parse_task_list_returns_error_without_utf8_boundary_panic`
+  - `cargo fmt --check && cargo clippy --all-targets -- -D warnings && cargo test -p skilllite-agent && cargo test && python3 scripts/validate_tasks.py`
 - Key outputs:
+  - Targeted tests: both passed with 1 passed, 0 failed.
+  - `cargo clippy --all-targets -- -D warnings`: completed successfully.
+  - `cargo test -p skilllite-agent`: 247 passed, 0 failed.
+  - `cargo test`: full workspace test suite completed successfully.
+  - `python3 scripts/validate_tasks.py`: `Task validation passed (68 task directories checked).`
 
 ## Decision
 
-- Merge readiness: `ready | not ready`
-- Follow-up actions:
+- Merge readiness: `ready`
+- Follow-up actions: Consider a separate non-critical cleanup sweep for remaining CLI/admin byte-slice previews.
diff --git a/tasks/TASK-2026-068-critical-bug-audit/STATUS.md b/tasks/TASK-2026-068-critical-bug-audit/STATUS.md
index 631fab78..caf1d382 100644
--- a/tasks/TASK-2026-068-critical-bug-audit/STATUS.md
+++ b/tasks/TASK-2026-068-critical-bug-audit/STATUS.md
@@ -5,13 +5,20 @@
 - 2026-06-07:
   - Progress: Created audit task, loaded injected specs, reviewed recent commits, and implemented a minimal UTF-8-safe fix for two agent planning error paths.
   - Blockers: None.
-  - Next step: Commit/push the implementation, then run targeted and full validation.
+  - Next step: Open PR and report results.
 
 ## Checkpoints
 
 - [x] PRD drafted before implementation (or `N/A` recorded)
 - [x] Context drafted before implementation (or `N/A` recorded)
 - [x] Implementation complete
-- [ ] Tests passed
-- [ ] Review complete
-- [ ] Board updated
+- [x] Tests passed
+- [x] Review complete
+- [x] Board updated
+
+## Validation Evidence
+
+- `cargo test -p skilllite-agent update_task_plan_rejects_non_array_string_without_utf8_boundary_panic` passed: 1 test passed, 0 failed.
+- `cargo test -p skilllite-agent parse_task_list_returns_error_without_utf8_boundary_panic` passed: 1 test passed, 0 failed.
+- `cargo fmt --check && cargo clippy --all-targets -- -D warnings && cargo test -p skilllite-agent && cargo test && python3 scripts/validate_tasks.py` exited 0.
+- Key output: `skilllite-agent` tests passed with 247 passed, 0 failed; full workspace tests completed successfully; task validation passed for 68 task directories.
diff --git a/tasks/TASK-2026-068-critical-bug-audit/TASK.md b/tasks/TASK-2026-068-critical-bug-audit/TASK.md
index ab6497f9..bd7dfbfa 100644
--- a/tasks/TASK-2026-068-critical-bug-audit/TASK.md
+++ b/tasks/TASK-2026-068-critical-bug-audit/TASK.md
@@ -4,7 +4,7 @@
 
 - Task ID: `TASK-2026-068`
 - Title: Critical Bug Audit
-- Status: `in_progress`
+- Status: `done`
 - Priority: `P0`
 - Owner: `agent`
 - Contributors:
@@ -25,7 +25,7 @@ Inspect recent SkillLite commits for high-severity correctness regressions that
 - [x] Recent behavioral commits are reviewed beyond the diff by tracing relevant caller and downstream paths.
 - [x] Any reported bug has a concrete trigger scenario and critical impact.
 - [x] If a critical bug is fixed, the fix is minimal and covered by targeted validation.
-- [ ] Validation commands pass and evidence is recorded.
+- [x] Validation commands pass and evidence is recorded.
 
 ## Risks
 
@@ -50,5 +50,5 @@ Inspect recent SkillLite commits for high-severity correctness regressions that
 ## Links
 
 - Source TODO section: N/A - scheduled critical bug-finding automation.
-- Related PRs/issues: Recent commits on current branch and `main`.
+- Related PRs/issues: Fix commit `ea54799`; recent commits on current branch and `main`.
 - Related docs: `spec/verification-integrity.md`, `spec/README.md`.
diff --git a/tasks/board.md b/tasks/board.md
index c43093a0..5970e73e 100644
--- a/tasks/board.md
+++ b/tasks/board.md
@@ -1,10 +1,10 @@
 # Task Board
 
-Last updated: 2026-06-07 (TASK-2026-068 critical bug audit in progress)
+Last updated: 2026-06-07 (TASK-2026-068 critical bug audit done)
 
 ## In Progress
 
-- `TASK-2026-068-critical-bug-audit` - Status: `in_progress` - Owner: `agent`
+- None.
 
 ## Ready
 
@@ -17,6 +17,7 @@ Last updated: 2026-06-07 (TASK-2026-068 critical bug audit in progress)
 
 ## Done
 
+- `TASK-2026-068-critical-bug-audit` - Status: `done` - Owner: `agent`
 - `TASK-2026-067-utf8-llm-error-truncate` - Status: `done` - Owner: `agent`
 - `TASK-2026-066-utf8-evolution-log-truncate` - Status: `done` - Owner: `agent`
 - `TASK-2026-064-env-keys-single-source` - Status: `done` - Owner: `agent`