diff --git a/CHANGELOG.md b/CHANGELOG.md index 0816c5e..75eceee 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,46 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +## [0.23.0] - 2026-06-13 + +Finalize, retuned after running `complete` on real 12-session tasks: the fast, +reliable judge-only path is now the default, and the slow session-enrich pass is +opt-in. + +### Changed +- **`complete` is judge-only by default; enrich is opt-in via `--enrich`.** + Finalizing through the model's judgment (retitle + close + outcome) takes + seconds and is what gives ~90% of the value. The session-backfill pass — one + `claude -p` call per session, minutes on a big multi-session task — proved too + slow to be the default, so it now runs only with `--enrich`. (The old `--quick` + flag is gone: its behaviour is the default. Replace `complete <id> --quick` + with `complete <id>`, and `complete <id>` with `complete <id> --enrich` if you + want the old full behaviour.) + +### Fixed +- **`complete` survives a non-JSON enrich reply.** When the backfill model + answered with prose instead of the requested JSON array — e.g. continuing the + transcript's own dialogue ("Контекст в норме… Что дальше?") — the parse error + aborted the whole `complete`, losing the retitle and close. Backfill now skips + an unparseable chunk reply (with a warning), the parser extracts a JSON array + even when wrapped in prose, and the prompt re-asserts "output ONLY the JSON + array, do not continue the transcript" after the transcript. +- **Enrich chunks are sized for `claude -p`'s overhead.** `claude -p` is a full + Claude Code instance whose system prompt + tool definitions cost ~113k tokens + before our content, so the earlier 360k-char chunk still 400'd at ~204k total.
+ The per-call transcript budget drops to 150k chars (~37k tokens), and **any** + per-chunk failure (over-budget 400, transient error, non-JSON) is skipped + rather than aborting — a genuinely broken backend still surfaces at the judge + step. +- **No more apparent hang.** A big task makes many sequential `claude -p` calls; + without a timeout one wedged call hung the whole command with no output. Each + call now has a wall-clock timeout (90s, `TJ_CLAUDE_TIMEOUT_SECS`) that kills a + stuck `claude` (pipes drained in threads to avoid buffer deadlock), and enrich + prints an "enriching N session(s)…" progress line so a long run shows activity. +- **Legible `claude -p` errors** (carried from the same investigation): a + non-zero exit now surfaces the JSON error claude prints on stdout, so failures + read as "Prompt is too long · ~204261 tokens" instead of a bare "exit 1". + ## [0.22.1] - 2026-06-13 ### Fixed diff --git a/Cargo.lock b/Cargo.lock index 1bfc571..bec2953 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2572,7 +2572,7 @@ dependencies = [ [[package]] name = "task-journal-cli" -version = "0.22.1" +version = "0.23.0" dependencies = [ "anyhow", "assert_cmd", @@ -2596,7 +2596,7 @@ dependencies = [ [[package]] name = "task-journal-core" -version = "0.22.1" +version = "0.23.0" dependencies = [ "anyhow", "chrono", @@ -2621,7 +2621,7 @@ dependencies = [ [[package]] name = "task-journal-mcp" -version = "0.22.1" +version = "0.23.0" dependencies = [ "anyhow", "chrono", diff --git a/Cargo.toml b/Cargo.toml index f9c65b5..acf8fae 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -7,7 +7,7 @@ members = [ ] [workspace.package] -version = "0.22.1" +version = "0.23.0" edition = "2021" rust-version = "1.88" license = "MIT" diff --git a/crates/tj-cli/Cargo.toml b/crates/tj-cli/Cargo.toml index 1ef884f..6f3ce8b 100644 --- a/crates/tj-cli/Cargo.toml +++ b/crates/tj-cli/Cargo.toml @@ -23,7 +23,7 @@ default = ["embed"] embed = ["tj-core/embed"] [dependencies] -tj-core = { package =
"task-journal-core", version = "0.22.1", path = "../tj-core", default-features = false } +tj-core = { package = "task-journal-core", version = "0.23.0", path = "../tj-core", default-features = false } anyhow = { workspace = true } clap = { workspace = true } tracing = { workspace = true } diff --git a/crates/tj-cli/src/main.rs b/crates/tj-cli/src/main.rs index 3894399..28d49a0 100644 --- a/crates/tj-cli/src/main.rs +++ b/crates/tj-cli/src/main.rs @@ -871,21 +871,22 @@ enum Commands { #[arg(long)] backend: Option, }, - /// Finalize a task: enrich its memory from the sessions it touched, fix a - /// junk auto-title, and close it IF the events clearly show it is done — - /// the model decides from the content. Omit the id to finalize every open - /// task in the project (batch, with a reviewable list). One LLM call per - /// session for enrich + one judge call per task, via the chosen backend - /// (free with `--backend ollama`). + /// Finalize a task: fix a junk auto-title and close it IF the events + /// clearly show it is done — the model decides from the content, in + /// seconds. Omit the id to finalize every open task (batch, with a + /// reviewable list). Add `--enrich` to also re-read the task's sessions and + /// backfill missed events first — thorough but slow (one `claude -p` call + /// per session; minutes on a big multi-session task). Complete { /// The task id to finalize. Omit to finalize all open tasks (batch). task: Option, /// Show scope and planned actions without calling the model or writing. #[arg(long)] dry_run: bool, - /// Skip the (heavy) enrich pass; judge/retitle/close from stored events only. + /// Also backfill missed events from the task's sessions before judging. + /// Thorough but slow (one `claude -p` call per session). #[arg(long)] - quick: bool, + enrich: bool, /// Required for batch finalize when stdin is not an interactive terminal. 
#[arg(long)] yes: bool, @@ -2784,12 +2785,12 @@ fn main() -> Result<()> { Commands::Complete { task, dry_run, - quick, + enrich, yes, backend, } => match task { - Some(id) => run_complete_single(&id, dry_run, quick, backend.as_deref())?, - None => run_complete_batch(dry_run, quick, yes, backend.as_deref())?, + Some(id) => run_complete_single(&id, dry_run, enrich, backend.as_deref())?, + None => run_complete_batch(dry_run, enrich, yes, backend.as_deref())?, }, Commands::Export { format, @@ -4153,6 +4154,14 @@ fn enrich_task( if sessions.is_empty() { return Ok(0); } + // Enrich is the slow part — one (or more, for big transcripts) `claude -p` + // call per session. Announce it so a multi-minute run doesn't look hung; + // omitting `--enrich` skips this entirely. + eprintln!( + "complete: enriching {} session(s) via {} — can take a few minutes (omit --enrich to skip)…", + sessions.len(), + llm.name() + ); let run_id = ulid::Ulid::new().to_string(); let dream_backend = tj_core::dream::llm_backend::LlmDreamBackend::new(llm); let opts = tj_core::dream::DreamOptions { @@ -4206,7 +4215,7 @@ fn task_event_lines(conn: &rusqlite::Connection, task_id: &str) -> anyhow::Resul fn finalize_one_task( ctx: &ProjectCtx<'_>, task_id: &str, - quick: bool, + enrich: bool, dry_run: bool, backend: Option<&str>, ) -> anyhow::Result { @@ -4215,8 +4224,9 @@ fn finalize_one_task( let events_path = ctx.events_path; let project_hash = ctx.project_hash; - // 1. Enrich (unless quick / dry-run) — needs sessions and a backend. - if !quick && !dry_run { + // 1. Enrich (only when asked, and not on a dry-run) — needs sessions and a + // backend. Off by default because it is slow (one claude -p per session). + if enrich && !dry_run { if let Some(dir) = ctx.project_dir { if let Some(llm) = tj_core::llm::backend_from_env(backend)?
{ out.enriched = enrich_task(conn, events_path, project_hash, dir, task_id, llm)?; @@ -4331,7 +4341,7 @@ PATH; or pick one via --backend / TJ_BACKEND: anthropic, openai, ollama (free, l fn run_complete_single( task_id: &str, dry_run: bool, - quick: bool, + enrich: bool, backend: Option<&str>, ) -> anyhow::Result<()> { let cwd = std::env::current_dir()?; @@ -4352,7 +4362,7 @@ fn run_complete_single( project_hash: &project_hash, project_dir: project_dir.as_deref(), }; - let out = finalize_one_task(&ctx, task_id, quick, dry_run, backend)?; + let out = finalize_one_task(&ctx, task_id, enrich, dry_run, backend)?; print_finalize_outcome(task_id, &out); Ok(()) } @@ -4361,7 +4371,7 @@ fn run_complete_single( /// user can prune before confirming. Refuses without a TTY unless `--yes`. fn run_complete_batch( dry_run: bool, - quick: bool, + enrich: bool, yes: bool, backend: Option<&str>, ) -> anyhow::Result<()> { @@ -4417,7 +4427,7 @@ fn run_complete_batch( if dry_run { println!(); for (id, _) in &open { - finalize_one_task(&ctx, id, quick, true, backend)?; + finalize_one_task(&ctx, id, enrich, true, backend)?; } return Ok(()); } @@ -4457,7 +4467,11 @@ fn run_complete_batch( println!( "\nWill finalize {} task(s){}. Proceed? 
[y/N]", targets.len(), - if quick { " (quick: no enrich)" } else { "" } + if enrich { + " (with --enrich: slow, reads sessions)" + } else { + "" + } ); let mut buf = String::new(); std::io::stdin().read_line(&mut buf)?; @@ -4469,7 +4483,7 @@ fn run_complete_batch( let mut left_open: Vec<(String, String)> = Vec::new(); for (id, _) in &targets { - let out = finalize_one_task(&ctx, id, quick, false, backend)?; + let out = finalize_one_task(&ctx, id, enrich, false, backend)?; print_finalize_outcome(id, &out); if out.skipped_no_backend { println!("complete: stopping batch — no LLM backend available."); diff --git a/crates/tj-cli/tests/cli.rs b/crates/tj-cli/tests/cli.rs index c90699c..645b804 100644 --- a/crates/tj-cli/tests/cli.rs +++ b/crates/tj-cli/tests/cli.rs @@ -5549,10 +5549,10 @@ fn complete_batch_dry_run_lists_open_tasks() { /// `claude` on PATH returning a canned judgment. Proves the wiring: junk /// title → Rename, done verdict → Close with a persisted outcome. Unix-only /// (shell-script stub); the logic itself is covered cross-platform by the -/// finalize.rs unit tests. +/// finalize.rs unit tests. Default mode (judge-only, no `--enrich`). #[cfg(unix)] #[test] -fn complete_quick_retitles_and_closes_via_fake_backend() { +fn complete_retitles_and_closes_via_fake_backend() { use std::os::unix::fs::PermissionsExt; let dir = assert_fs::TempDir::new().unwrap(); @@ -5609,14 +5609,14 @@ fn complete_quick_retitles_and_closes_via_fake_backend() { .trim() .to_string(); - // --quick: skip enrich (no sessions), exercise judge → retitle → close. + // Default mode (judge-only): exercise judge → retitle → close. 
Command::cargo_bin("task-journal") .unwrap() .current_dir(proj.path()) .env("XDG_DATA_HOME", dir.path()) .env("PATH", &path_env) .env_remove("ANTHROPIC_API_KEY") - .args(["complete", &task_id, "--quick"]) + .args(["complete", &task_id]) .assert() .success() .stdout(contains("retitled")) diff --git a/crates/tj-core/src/classifier/agent_sdk.rs b/crates/tj-core/src/classifier/agent_sdk.rs index 69a5b43..239489d 100644 --- a/crates/tj-core/src/classifier/agent_sdk.rs +++ b/crates/tj-core/src/classifier/agent_sdk.rs @@ -93,12 +93,71 @@ fn claude_exit_error( anyhow!("`claude -p` exited with {status}: {detail}") } +/// Per-call wall-clock ceiling for a `claude -p` invocation. A spawned full +/// Claude Code instance normally answers in seconds; this kills a wedged one so +/// a multi-chunk enrich can't hang the whole `complete`. Override with +/// `TJ_CLAUDE_TIMEOUT_SECS`. +fn claude_timeout() -> std::time::Duration { + let secs = std::env::var("TJ_CLAUDE_TIMEOUT_SECS") + .ok() + .and_then(|s| s.parse::().ok()) + .filter(|n| *n > 0) + .unwrap_or(90); + std::time::Duration::from_secs(secs) +} + +/// Wait for `child` up to `timeout`, draining stdout/stderr concurrently so a +/// full pipe can't deadlock the wait. On timeout the child is killed and an +/// error returned; otherwise the captured output is handed back. +fn wait_with_timeout( + mut child: std::process::Child, + timeout: std::time::Duration, +) -> anyhow::Result { + use std::io::Read; + let mut out_pipe = child.stdout.take(); + let mut err_pipe = child.stderr.take(); + let so = std::thread::spawn(move || { + let mut b = Vec::new(); + if let Some(p) = out_pipe.as_mut() { + let _ = p.read_to_end(&mut b); + } + b + }); + let se = std::thread::spawn(move || { + let mut b = Vec::new(); + if let Some(p) = err_pipe.as_mut() { + let _ = p.read_to_end(&mut b); + } + b + }); + let start = std::time::Instant::now(); + let status = loop { + if let Some(status) = child.try_wait()? 
{ + break status; + } + if start.elapsed() >= timeout { + let _ = child.kill(); + let _ = child.wait(); + anyhow::bail!("`claude -p` timed out after {}s", timeout.as_secs()); + } + std::thread::sleep(std::time::Duration::from_millis(150)); + }; + Ok(std::process::Output { + status, + stdout: so.join().unwrap_or_default(), + stderr: se.join().unwrap_or_default(), + }) +} + impl CommandRunner for ClaudeBinaryRunner { fn run(&self, model: &str, prompt: &str) -> anyhow::Result { - let output = base_claude_command(model) + let child = base_claude_command(model) .arg(prompt) - .output() + .stdout(std::process::Stdio::piped()) + .stderr(std::process::Stdio::piped()) + .spawn() .context("failed to spawn `claude` (is Claude Code installed and on PATH?)")?; + let output = wait_with_timeout(child, claude_timeout())?; if !output.status.success() { return Err(claude_exit_error( output.status, @@ -135,9 +194,7 @@ impl CommandRunner for ClaudeBinaryStdinRunner { .context("claude stdin was not captured")? .write_all(prompt.as_bytes()) .context("failed to write prompt to claude stdin")?; - let output = child - .wait_with_output() - .context("failed to wait for `claude`")?; + let output = wait_with_timeout(child, claude_timeout())?; if !output.status.success() { return Err(claude_exit_error( output.status, diff --git a/crates/tj-core/src/dream/llm_backend.rs b/crates/tj-core/src/dream/llm_backend.rs index 7ad63fc..11e1042 100644 --- a/crates/tj-core/src/dream/llm_backend.rs +++ b/crates/tj-core/src/dream/llm_backend.rs @@ -25,10 +25,13 @@ impl LlmDreamBackend { } /// Max transcript characters fed to the model in one call. The hard wall is -/// the ~200k-token context limit (a real session hit ~220k tokens and `claude -/// -p` returned HTTP 400). We stay well under it and split oversized -/// transcripts across several calls, merging the events (run_dream dedups). 
-const TRANSCRIPT_CHAR_BUDGET: usize = 360_000; +/// the ~200k-token context window, but `claude -p` is a full Claude Code +/// instance: its system prompt + tool definitions alone cost ~113k tokens +/// before our content (measured: a 360k-char chunk was ~91k tokens, yet the +/// request totalled ~204k and 400'd). So the usable budget is far below the +/// nominal limit — keep each chunk well under it (~37k tokens) and split the +/// rest across calls, merging the events (run_dream dedups). +const TRANSCRIPT_CHAR_BUDGET: usize = 150_000; impl DreamBackend for LlmDreamBackend { fn backfill(&self, input: &BackfillInput) -> anyhow::Result> { @@ -39,8 +42,19 @@ impl DreamBackend for LlmDreamBackend { transcript: chunk, }; let prompt = crate::dream::prompt::build_prompt(&chunk_input); - let text = self.llm.complete(&prompt, 1024)?; - out.extend(parse_backfill_json(&text)?); + // Backfill is strictly best-effort: ANY per-chunk failure — an + // over-budget 400, a transient backend error, or a non-JSON reply + // (model continued the transcript dialogue) — is skipped, never + // aborting the finalize. A genuinely broken backend still surfaces + // at the judge step, which has its own (small, always-sized) call. + match self + .llm + .complete(&prompt, 1024) + .and_then(|text| parse_backfill_json(&text)) + { + Ok(evs) => out.extend(evs), + Err(e) => tracing::warn!(error = %e, "dream backfill: skipping chunk"), + } } Ok(out) } @@ -86,8 +100,12 @@ pub fn parse_backfill_json(text: &str) -> anyhow::Result> { .trim_start_matches("```") .trim_end_matches("```") .trim(); - serde_json::from_str(json_str) - .with_context(|| format!("dream JSON parse failed; got: {json_str}")) + // Tolerate a JSON array wrapped in prose by slicing to the outer brackets. 
+ let slice = match (json_str.find('['), json_str.rfind(']')) { + (Some(a), Some(b)) if b > a => &json_str[a..=b], + _ => json_str, + }; + serde_json::from_str(slice).with_context(|| format!("dream JSON parse failed; got: {json_str}")) } #[cfg(test)] @@ -111,6 +129,67 @@ mod tests { assert!(parse_backfill_json("[]").unwrap().is_empty()); } + #[test] + fn parse_extracts_array_wrapped_in_prose() { + let reply = "Here are the missed events:\n[{\"event_type\":\"finding\",\ +\"task_id\":\"tj-1\",\"text\":\"found\",\"timestamp\":\"2026-06-13T00:00:00Z\"}]\nHope that helps!"; + let evs = parse_backfill_json(reply).unwrap(); + assert_eq!(evs.len(), 1); + } + + #[test] + fn parse_errors_on_pure_prose() { + // A conversational reply with no array at all must be an Err so the + // backfill loop can skip the chunk instead of inventing events. + assert!(parse_backfill_json("Контекст в норме. Что дальше?").is_err()); + } + + #[test] + fn backfill_skips_unparseable_chunk_reply() { + // Model replies with prose, not JSON → backfill yields nothing but does + // NOT error, so the surrounding finalize (retitle/close) still runs. + struct ChattyLlm; + impl LlmBackend for ChattyLlm { + fn complete(&self, _prompt: &str, _max: u32) -> anyhow::Result { + Ok("Контекст в норме. 566.5k/1M использовано. Что дальше?".to_string()) + } + fn name(&self) -> &'static str { + "chatty" + } + } + let b = LlmDreamBackend::new(Box::new(ChattyLlm)); + let input = BackfillInput { + tasks: vec![], + transcript: "user: hi\nassistant: hello".into(), + }; + let evs = b.backfill(&input).unwrap(); + assert!(evs.is_empty()); + } + + #[test] + fn backfill_skips_chunk_whose_call_errors() { + // An over-budget 400 / transient backend error on a chunk must be + // swallowed so the surrounding finalize (retitle/close) still runs. 
+ struct FailingLlm; + impl LlmBackend for FailingLlm { + fn complete(&self, _prompt: &str, _max: u32) -> anyhow::Result { + Err(anyhow::anyhow!( + "`claude -p` exited with status 1: Prompt is too long" + )) + } + fn name(&self) -> &'static str { + "failing" + } + } + let b = LlmDreamBackend::new(Box::new(FailingLlm)); + let input = BackfillInput { + tasks: vec![], + transcript: "user: hi\nassistant: hello".into(), + }; + let evs = b.backfill(&input).unwrap(); + assert!(evs.is_empty()); + } + #[test] fn small_transcript_is_one_chunk() { let c = chunk_transcript("a\nb\nc\n", 100); diff --git a/crates/tj-core/src/dream/prompt.rs b/crates/tj-core/src/dream/prompt.rs index 210cfc4..9c1f6c1 100644 --- a/crates/tj-core/src/dream/prompt.rs +++ b/crates/tj-core/src/dream/prompt.rs @@ -34,7 +34,10 @@ pub fn build_prompt(input: &BackfillInput) -> String { - Respond with ONLY a JSON array of objects: \ {{\"event_type\",\"task_id\",\"text\",\"timestamp\"}}. Empty array if nothing missed.\n\n\ # Candidate tasks and their existing events\n{tasks}\n\ - # Transcript\n{transcript}\n", + # Transcript\n{transcript}\n\n\ + Remember: output ONLY the JSON array of missed events described above. \ + Do NOT reply to, summarise, or continue the transcript; if nothing was \ + missed, output [].\n", types = ALLOWED_TYPES, tasks = tasks_block, transcript = input.transcript, diff --git a/crates/tj-mcp/Cargo.toml b/crates/tj-mcp/Cargo.toml index 906afaf..ac459ec 100644 --- a/crates/tj-mcp/Cargo.toml +++ b/crates/tj-mcp/Cargo.toml @@ -17,7 +17,7 @@ path = "src/main.rs" [dependencies] # Lean: the MCP server doesn't embed yet, so it skips the model2vec backend. 
-tj-core = { package = "task-journal-core", version = "0.22.1", path = "../tj-core", default-features = false } +tj-core = { package = "task-journal-core", version = "0.23.0", path = "../tj-core", default-features = false } anyhow = { workspace = true } tokio = { workspace = true } tracing = { workspace = true } diff --git a/plugin/.claude-plugin/plugin.json b/plugin/.claude-plugin/plugin.json index bd04d5c..79f4891 100644 --- a/plugin/.claude-plugin/plugin.json +++ b/plugin/.claude-plugin/plugin.json @@ -1,6 +1,6 @@ { "name": "task-journal", - "version": "0.22.1", + "version": "0.23.0", "description": "Append-only journal of AI-coding task reasoning chains: hypotheses, decisions, rejections, evidence. Renders compact resume packs so an agent can pick up a 2-week-old task with full context.", "author": { "name": "Mher Shahinyan"