From ea54799eaf7767aebf0bdf69313469617340dc9a Mon Sep 17 00:00:00 2001 From: Cursor Agent Date: Sun, 7 Jun 2026 11:06:44 +0000 Subject: [PATCH 1/2] fix(agent): avoid utf8 panic in planning errors Co-authored-by: EXboy --- .../skilllite-agent/src/agent_loop/helpers.rs | 15 +++++- crates/skilllite-agent/src/task_planner.rs | 16 +++++- .../CONTEXT.md | 28 ++++++++++ tasks/TASK-2026-068-critical-bug-audit/PRD.md | 40 ++++++++++++++ .../REVIEW.md | 29 ++++++++++ .../STATUS.md | 17 ++++++ .../TASK-2026-068-critical-bug-audit/TASK.md | 54 +++++++++++++++++++ tasks/board.md | 4 +- 8 files changed, 199 insertions(+), 4 deletions(-) create mode 100644 tasks/TASK-2026-068-critical-bug-audit/CONTEXT.md create mode 100644 tasks/TASK-2026-068-critical-bug-audit/PRD.md create mode 100644 tasks/TASK-2026-068-critical-bug-audit/REVIEW.md create mode 100644 tasks/TASK-2026-068-critical-bug-audit/STATUS.md create mode 100644 tasks/TASK-2026-068-critical-bug-audit/TASK.md diff --git a/crates/skilllite-agent/src/agent_loop/helpers.rs b/crates/skilllite-agent/src/agent_loop/helpers.rs index 15ecee94..0d163006 100644 --- a/crates/skilllite-agent/src/agent_loop/helpers.rs +++ b/crates/skilllite-agent/src/agent_loop/helpers.rs @@ -72,7 +72,7 @@ pub(super) fn handle_update_task_plan( "'tasks' must be a JSON array, got a string that is not valid JSON array. \ Pass tasks as a real array: {{\"tasks\": [{{...}}]}} not {{\"tasks\": \"[...]\"}}. \ Received string preview: {:?}", - &s[..s.len().min(120)] + safe_truncate(s, 120) ), is_error: true, counts_as_failure: true, @@ -1104,6 +1104,19 @@ mod tests { assert!(r.content.contains("must be a JSON array"), "{}", r.content); } + #[test] + fn update_task_plan_rejects_non_array_string_without_utf8_boundary_panic() { + let mut planner = TaskPlanner::new(None, None, None); + let mut sink = SilentEventSink; + let invalid_tasks = format!("{}汉 trailing invalid json", "a".repeat(119)); + let args = serde_json::json!({ "tasks": invalid_tasks }).to_string(); + + let r = handle_update_task_plan(&args, &mut planner, &[], &mut sink); + + assert!(r.is_error); + assert!(r.content.contains("must be a JSON array"), "{}", r.content); + } + #[test] fn update_task_plan_rejects_missing_tasks_field() { let mut planner = TaskPlanner::new(None, None, None); diff --git a/crates/skilllite-agent/src/task_planner.rs b/crates/skilllite-agent/src/task_planner.rs index ef577c39..199918d7 100644 --- a/crates/skilllite-agent/src/task_planner.rs +++ b/crates/skilllite-agent/src/task_planner.rs @@ -394,7 +394,7 @@ impl TaskPlanner { tracing::debug!( "parse_task_list raw (first 500 chars): {}", - &raw[..raw.len().min(500)] + safe_truncate(raw, 500) ); bail!("No valid JSON task array found in LLM response") } @@ -777,6 +777,20 @@ mod tests { assert!(empty.is_empty()); } + #[test] + fn parse_task_list_returns_error_without_utf8_boundary_panic() { + let planner = TaskPlanner::new(None, None, None); + let raw = format!("{}汉 trailing invalid planning response", "a".repeat(499)); + + let err = planner.parse_task_list(&raw).unwrap_err(); + + assert!( + err.to_string().contains("No valid JSON task array found"), + "{}", + err + ); + } + #[test] fn test_planning_prompt_contains_placeholders_resolved() { let planner = TaskPlanner::new(None, None, None); diff --git a/tasks/TASK-2026-068-critical-bug-audit/CONTEXT.md b/tasks/TASK-2026-068-critical-bug-audit/CONTEXT.md new file mode 100644 index 00000000..ee450c84 --- /dev/null +++ b/tasks/TASK-2026-068-critical-bug-audit/CONTEXT.md @@ -0,0 +1,28 @@ +# Technical Context + +## Current State + +- Relevant crates/files: To be determined from recent commit review; likely `crates/`, `docs/`, `python-sdk/`, and task artifacts touched by recent commits. +- Current behavior: Current branch `cursor/critical-bug-investigation-b158` is aligned with `origin/main` at audit start. + +## Architecture Fit + +- Layer boundaries involved: Potentially all Rust workspace layers depending on reviewed commits. +- Interfaces to preserve: Public CLI behavior, sandbox execution policy, agent/tool orchestration contracts, persisted data formats. + +## Dependency and Compatibility + +- New dependencies: None expected for audit-only work. +- Backward compatibility notes: No compatibility changes unless a confirmed critical fix requires it. + +## Design Decisions + +- Decision: Use evidence-based triage before any fix. + - Rationale: The automation should not open PRs for speculative or low-impact observations. + - Alternatives considered: Broad refactoring or proactive hardening without a confirmed trigger. + - Why rejected: It would increase review noise and risk outside the requested scope. + +## Open Questions + +- [ ] Which recent commits have the largest behavioral blast radius? +- [ ] Do any suspicious changes have a concrete critical trigger scenario? diff --git a/tasks/TASK-2026-068-critical-bug-audit/PRD.md b/tasks/TASK-2026-068-critical-bug-audit/PRD.md new file mode 100644 index 00000000..1765c501 --- /dev/null +++ b/tasks/TASK-2026-068-critical-bug-audit/PRD.md @@ -0,0 +1,40 @@ +# PRD + +## Background + +This scheduled automation performs a focused audit of recent commits to catch high-severity correctness regressions before they reach users. + +## Objective + +Identify and, when highly confident, minimally fix critical bugs with concrete trigger scenarios. If no such bug is found, produce a concise no-critical-findings report. + +## Functional Requirements + +- FR-1: Inspect recent behavioral changes and trace affected code paths beyond surface diffs. +- FR-2: Only treat findings as actionable when they can cause data loss, crash, security bypass, silent corruption/truncation, or significant user-facing breakage. +- FR-3: If a fix is made, include focused validation evidence. + +## Non-Functional Requirements + +- Security: Preserve sandbox and approval invariants; do not relax policy while auditing. +- Performance: Avoid broad test or build work unless needed to validate a concrete finding. +- Compatibility: Do not alter shipped behavior unless fixing a confirmed critical bug. + +## Constraints + +- Technical: Work on branch `cursor/critical-bug-investigation-b158`; avoid broad refactors. +- Timeline: Scheduled automation run; complete autonomously with evidence. + +## Success Metrics + +- Metric: Number of surfaced critical findings without concrete trigger scenarios. +- Baseline: Unknown. +- Target: Zero speculative surfaced findings. +- Metric: Validation evidence for any repository changes. +- Baseline: Required by repository workflow. +- Target: Commands actually run and recorded. + +## Rollout + +- Rollout plan: If no code fix is needed, report results only. If a critical fix is made, commit, push, and open a PR. +- Rollback plan: Revert the minimal fix commit if validation or review disproves the finding. diff --git a/tasks/TASK-2026-068-critical-bug-audit/REVIEW.md b/tasks/TASK-2026-068-critical-bug-audit/REVIEW.md new file mode 100644 index 00000000..e5fd29ef --- /dev/null +++ b/tasks/TASK-2026-068-critical-bug-audit/REVIEW.md @@ -0,0 +1,29 @@ +# Review Report + +## Scope Reviewed + +- Files/modules: +- Commits/changes: + +## Findings + +- Critical: +- Major: +- Minor: + +## Quality Gates + +- Architecture boundary checks: `pass | fail` +- Security invariants: `pass | fail` +- Required tests executed: `pass | fail` +- Docs sync (EN/ZH): `pass | fail` + +## Test Evidence + +- Commands run: +- Key outputs: + +## Decision + +- Merge readiness: `ready | not ready` +- Follow-up actions: diff --git a/tasks/TASK-2026-068-critical-bug-audit/STATUS.md b/tasks/TASK-2026-068-critical-bug-audit/STATUS.md new file mode 100644 index 00000000..631fab78 --- /dev/null +++ b/tasks/TASK-2026-068-critical-bug-audit/STATUS.md @@ -0,0 +1,17 @@ +# Status Journal + +## Timeline + +- 2026-06-07: + - Progress: Created audit task, loaded injected specs, reviewed recent commits, and implemented a minimal UTF-8-safe fix for two agent planning error paths. + - Blockers: None. + - Next step: Commit/push the implementation, then run targeted and full validation. + +## Checkpoints + +- [x] PRD drafted before implementation (or `N/A` recorded) +- [x] Context drafted before implementation (or `N/A` recorded) +- [x] Implementation complete +- [ ] Tests passed +- [ ] Review complete +- [ ] Board updated diff --git a/tasks/TASK-2026-068-critical-bug-audit/TASK.md b/tasks/TASK-2026-068-critical-bug-audit/TASK.md new file mode 100644 index 00000000..ab6497f9 --- /dev/null +++ b/tasks/TASK-2026-068-critical-bug-audit/TASK.md @@ -0,0 +1,54 @@ +# TASK Card + +## Metadata + +- Task ID: `TASK-2026-068` +- Title: Critical Bug Audit +- Status: `in_progress` +- Priority: `P0` +- Owner: `agent` +- Contributors: +- Created: `2026-06-07` +- Target milestone: + +## Problem + +Inspect recent SkillLite commits for high-severity correctness regressions that escaped review. Only actionable findings with a concrete trigger scenario should result in a code change or PR. + +## Scope + +- In scope: Recent commits on `main` / current branch, with emphasis on behavioral changes affecting agent loops, CLI/runtime bridges, sandbox/security behavior, persistence, and UTF-8/error handling paths. +- Out of scope: Style-only issues, theoretical concerns without a plausible trigger, broad refactors, and low-severity UX degradation. + +## Acceptance Criteria + +- [x] Recent behavioral commits are reviewed beyond the diff by tracing relevant caller and downstream paths. +- [x] Any reported bug has a concrete trigger scenario and critical impact. +- [x] If a critical bug is fixed, the fix is minimal and covered by targeted validation. +- [ ] Validation commands pass and evidence is recorded. + +## Risks + +- Risk: Over-reporting speculative issues. + - Impact: Unnecessary PR noise and reviewer distraction. + - Mitigation: Require a concrete trigger scenario and critical user impact before opening a PR. +- Risk: Missing a regression outside the highest-risk recent changes. + - Impact: Critical bug remains undetected. + - Mitigation: Review commit metadata first, then inspect broad-blast-radius code paths. + +## Validation Plan + +- Required tests: `cargo fmt --check`, `cargo clippy --all-targets -- -D warnings`, `cargo test -p skilllite-agent`, `cargo test`, and `python3 scripts/validate_tasks.py`. +- Commands to run: `git log`, targeted `git show` / `git diff`, targeted agent regression tests, full workspace validation. +- Manual checks: Trace caller/downstream paths and confirm whether each candidate has a plausible critical trigger. + +## Regression Scope + +- Areas likely affected: Agent behavior, command routing, sandbox execution, persistence/logging, UTF-8 truncation and user-facing error summaries. +- Explicit non-goals: Cosmetic cleanup, unrelated TODO execution, documentation rewrite. + +## Links + +- Source TODO section: N/A - scheduled critical bug-finding automation. +- Related PRs/issues: Recent commits on current branch and `main`. +- Related docs: `spec/verification-integrity.md`, `spec/README.md`. diff --git a/tasks/board.md b/tasks/board.md index 4b63f2b2..c43093a0 100644 --- a/tasks/board.md +++ b/tasks/board.md @@ -1,10 +1,10 @@ # Task Board -Last updated: 2026-06-06 (TASK-2026-067 utf8 llm error truncate done) +Last updated: 2026-06-07 (TASK-2026-068 critical bug audit in progress) ## In Progress -- None. +- `TASK-2026-068-critical-bug-audit` - Status: `in_progress` - Owner: `agent` ## Ready From 807912da6b974696f43c961db01a10b19a2076b3 Mon Sep 17 00:00:00 2001 From: Cursor Agent Date: Sun, 7 Jun 2026 11:10:39 +0000 Subject: [PATCH 2/2] docs(task): record critical bug audit validation Co-authored-by: EXboy --- .../CONTEXT.md | 24 +++++++------- .../REVIEW.md | 31 ++++++++++++------- .../STATUS.md | 15 ++++++--- .../TASK-2026-068-critical-bug-audit/TASK.md | 6 ++-- tasks/board.md | 5 +-- 5 files changed, 49 insertions(+), 32 deletions(-) diff --git a/tasks/TASK-2026-068-critical-bug-audit/CONTEXT.md b/tasks/TASK-2026-068-critical-bug-audit/CONTEXT.md index ee450c84..afbffa55 100644 --- a/tasks/TASK-2026-068-critical-bug-audit/CONTEXT.md +++ b/tasks/TASK-2026-068-critical-bug-audit/CONTEXT.md @@ -2,27 +2,27 @@ ## Current State -- Relevant crates/files: To be determined from recent commit review; likely `crates/`, `docs/`, `python-sdk/`, and task artifacts touched by recent commits. -- Current behavior: Current branch `cursor/critical-bug-investigation-b158` is aligned with `origin/main` at audit start. +- Relevant crates/files: `crates/skilllite-agent/src/agent_loop/helpers.rs`, `crates/skilllite-agent/src/task_planner.rs`, and recent UTF-8 truncation commits on `main`. +- Current behavior: Agent planning error paths now use `safe_truncate` for previews instead of raw byte slicing. ## Architecture Fit -- Layer boundaries involved: Potentially all Rust workspace layers depending on reviewed commits. -- Interfaces to preserve: Public CLI behavior, sandbox execution policy, agent/tool orchestration contracts, persisted data formats. +- Layer boundaries involved: `skilllite-agent` planning and agent-loop error handling. +- Interfaces to preserve: Tool-result error reporting, planner parse/fallback behavior, and existing `safe_truncate` utility semantics. ## Dependency and Compatibility -- New dependencies: None expected for audit-only work. -- Backward compatibility notes: No compatibility changes unless a confirmed critical fix requires it. +- New dependencies: None. +- Backward compatibility notes: No API, CLI, schema, or persisted-format changes; invalid non-ASCII planner inputs now return errors instead of panicking. ## Design Decisions -- Decision: Use evidence-based triage before any fix. - - Rationale: The automation should not open PRs for speculative or low-impact observations. - - Alternatives considered: Broad refactoring or proactive hardening without a confirmed trigger. - - Why rejected: It would increase review noise and risk outside the requested scope. +- Decision: Fix only the confirmed agent crash paths in this PR. + - Rationale: `update_task_plan` and `parse_task_list` are active agent/planning paths where LLM-produced non-ASCII invalid data can panic before returning a recoverable error. + - Alternatives considered: Sweep every remaining byte slice across CLI/admin formatting paths. + - Why rejected: Several remaining paths are lower-blast-radius or ASCII/index-derived; broad cleanup would exceed the critical-bug scope. ## Open Questions -- [ ] Which recent commits have the largest behavioral blast radius? -- [ ] Do any suspicious changes have a concrete critical trigger scenario? +- [x] Which recent commits have the largest behavioral blast radius? +- [x] Do any suspicious changes have a concrete critical trigger scenario? diff --git a/tasks/TASK-2026-068-critical-bug-audit/REVIEW.md b/tasks/TASK-2026-068-critical-bug-audit/REVIEW.md index e5fd29ef..43b43140 100644 --- a/tasks/TASK-2026-068-critical-bug-audit/REVIEW.md +++ b/tasks/TASK-2026-068-critical-bug-audit/REVIEW.md @@ -2,28 +2,37 @@ ## Scope Reviewed -- Files/modules: -- Commits/changes: +- Files/modules: Recent commits on current branch/main, `crates/skilllite-agent/src/agent_loop/helpers.rs`, `crates/skilllite-agent/src/task_planner.rs`, related UTF-8 truncation helpers and tests. +- Commits/changes: Latest UTF-8 truncation fixes, recent agent/CLI/runtime bridge commits, and the new fix commit `ea54799`. ## Findings -- Critical: -- Major: -- Minor: +- Critical: `update_task_plan` error preview used raw byte slicing on model-provided `tasks` strings. A non-array string with a multibyte character crossing byte 120 panicked before returning a tool error, crashing the active agent turn. +- Critical: `parse_task_list` debug preview used raw byte slicing on malformed LLM planning output. A multibyte character crossing byte 500 panicked before the intended structured parse error/fallback path. +- Major: Other lower-blast-radius byte-slice truncation sites remain in CLI/admin formatting paths, but were not fixed because they did not meet the requested critical-confidence bar for this PR. +- Minor: None. ## Quality Gates -- Architecture boundary checks: `pass | fail` -- Security invariants: `pass | fail` -- Required tests executed: `pass | fail` -- Docs sync (EN/ZH): `pass | fail` +- Architecture boundary checks: `pass` +- Security invariants: `pass` +- Required tests executed: `pass` +- Docs sync (EN/ZH): `pass` - no user-facing command/env/doc semantics changed. ## Test Evidence - Commands run: + - `cargo test -p skilllite-agent update_task_plan_rejects_non_array_string_without_utf8_boundary_panic` + - `cargo test -p skilllite-agent parse_task_list_returns_error_without_utf8_boundary_panic` + - `cargo fmt --check && cargo clippy --all-targets -- -D warnings && cargo test -p skilllite-agent && cargo test && python3 scripts/validate_tasks.py` - Key outputs: + - Targeted tests: both passed with 1 passed, 0 failed. + - `cargo clippy --all-targets -- -D warnings`: completed successfully. + - `cargo test -p skilllite-agent`: 247 passed, 0 failed. + - `cargo test`: full workspace test suite completed successfully. + - `python3 scripts/validate_tasks.py`: `Task validation passed (68 task directories checked).` ## Decision -- Merge readiness: `ready | not ready` -- Follow-up actions: +- Merge readiness: `ready` +- Follow-up actions: Consider a separate non-critical cleanup sweep for remaining CLI/admin byte-slice previews. diff --git a/tasks/TASK-2026-068-critical-bug-audit/STATUS.md b/tasks/TASK-2026-068-critical-bug-audit/STATUS.md index 631fab78..caf1d382 100644 --- a/tasks/TASK-2026-068-critical-bug-audit/STATUS.md +++ b/tasks/TASK-2026-068-critical-bug-audit/STATUS.md @@ -5,13 +5,20 @@ - 2026-06-07: - Progress: Created audit task, loaded injected specs, reviewed recent commits, and implemented a minimal UTF-8-safe fix for two agent planning error paths. - Blockers: None. - - Next step: Commit/push the implementation, then run targeted and full validation. + - Next step: Open PR and report results. ## Checkpoints - [x] PRD drafted before implementation (or `N/A` recorded) - [x] Context drafted before implementation (or `N/A` recorded) - [x] Implementation complete -- [ ] Tests passed -- [ ] Review complete -- [ ] Board updated +- [x] Tests passed +- [x] Review complete +- [x] Board updated + +## Validation Evidence + +- `cargo test -p skilllite-agent update_task_plan_rejects_non_array_string_without_utf8_boundary_panic` passed: 1 test passed, 0 failed. +- `cargo test -p skilllite-agent parse_task_list_returns_error_without_utf8_boundary_panic` passed: 1 test passed, 0 failed. +- `cargo fmt --check && cargo clippy --all-targets -- -D warnings && cargo test -p skilllite-agent && cargo test && python3 scripts/validate_tasks.py` exited 0. +- Key output: `skilllite-agent` tests passed with 247 passed, 0 failed; full workspace tests completed successfully; task validation passed for 68 task directories. diff --git a/tasks/TASK-2026-068-critical-bug-audit/TASK.md b/tasks/TASK-2026-068-critical-bug-audit/TASK.md index ab6497f9..bd7dfbfa 100644 --- a/tasks/TASK-2026-068-critical-bug-audit/TASK.md +++ b/tasks/TASK-2026-068-critical-bug-audit/TASK.md @@ -4,7 +4,7 @@ - Task ID: `TASK-2026-068` - Title: Critical Bug Audit -- Status: `in_progress` +- Status: `done` - Priority: `P0` - Owner: `agent` - Contributors: @@ -25,7 +25,7 @@ Inspect recent SkillLite commits for high-severity correctness regressions that - [x] Recent behavioral commits are reviewed beyond the diff by tracing relevant caller and downstream paths. - [x] Any reported bug has a concrete trigger scenario and critical impact. - [x] If a critical bug is fixed, the fix is minimal and covered by targeted validation. -- [ ] Validation commands pass and evidence is recorded. +- [x] Validation commands pass and evidence is recorded. ## Risks @@ -50,5 +50,5 @@ Inspect recent SkillLite commits for high-severity correctness regressions that ## Links - Source TODO section: N/A - scheduled critical bug-finding automation. -- Related PRs/issues: Recent commits on current branch and `main`. +- Related PRs/issues: Fix commit `ea54799`; recent commits on current branch and `main`. - Related docs: `spec/verification-integrity.md`, `spec/README.md`. diff --git a/tasks/board.md b/tasks/board.md index c43093a0..5970e73e 100644 --- a/tasks/board.md +++ b/tasks/board.md @@ -1,10 +1,10 @@ # Task Board -Last updated: 2026-06-07 (TASK-2026-068 critical bug audit in progress) +Last updated: 2026-06-07 (TASK-2026-068 critical bug audit done) ## In Progress -- `TASK-2026-068-critical-bug-audit` - Status: `in_progress` - Owner: `agent` +- None. ## Ready @@ -17,6 +17,7 @@ Last updated: 2026-06-07 (TASK-2026-068 critical bug audit in progress) ## Done +- `TASK-2026-068-critical-bug-audit` - Status: `done` - Owner: `agent` - `TASK-2026-067-utf8-llm-error-truncate` - Status: `done` - Owner: `agent` - `TASK-2026-066-utf8-evolution-log-truncate` - Status: `done` - Owner: `agent` - `TASK-2026-064-env-keys-single-source` - Status: `done` - Owner: `agent`