diff --git a/src/maestro/db/client.py b/src/maestro/db/client.py index 420daa7..bbbd161 100644 --- a/src/maestro/db/client.py +++ b/src/maestro/db/client.py @@ -42,6 +42,12 @@ CREATE TABLE IF NOT EXISTS run_results ( run_id TEXT PRIMARY KEY, output_diagram_code TEXT, + -- Unprocessed model output as returned by the provider, kept even when the + -- cell fails (output_diagram_code is None on a failure). This is what makes + -- a failure analysable after the fact: the malformed JSON or Mermaid the + -- model actually produced, not just the error string. Nullable: a provider + -- that returns nothing (safety block, no candidate) has no raw text. + raw_response TEXT, prompt_tokens INTEGER NOT NULL, completion_tokens INTEGER NOT NULL, duration_ms INTEGER NOT NULL, @@ -57,6 +63,10 @@ step_number INTEGER NOT NULL, step_name TEXT NOT NULL, output_text TEXT, + -- Raw model output for this step, kept even when the step fails validation + -- (output_text is None then). This is where a rejected step-1 JSON or a + -- malformed step-3 diagram is recoverable for diagnosis. See run_results. + raw_response TEXT, prompt_tokens INTEGER NOT NULL, completion_tokens INTEGER NOT NULL, duration_ms INTEGER NOT NULL, diff --git a/src/maestro/db/queries.py b/src/maestro/db/queries.py index 12c1708..1658b03 100644 --- a/src/maestro/db/queries.py +++ b/src/maestro/db/queries.py @@ -71,14 +71,15 @@ def insert_run_result(conn: sqlite3.Connection, result: RunResult) -> None: conn.execute( """ INSERT INTO run_results - (run_id, output_diagram_code, prompt_tokens, completion_tokens, - duration_ms, cost_usd, error, retry_count) + (run_id, output_diagram_code, raw_response, prompt_tokens, + completion_tokens, duration_ms, cost_usd, error, retry_count) VALUES - (?, ?, ?, ?, ?, ?, ?, ?) + (?, ?, ?, ?, ?, ?, ?, ?, ?) """, ( str(result.run_id), result.output_diagram_code, + result.raw_response, result.prompt_tokens, result.completion_tokens, result.duration_ms, @@ -94,11 +95,11 @@ def insert_sub_result(conn: sqlite3.Connection, sub: SubResult) -> None: conn.execute( """ INSERT INTO sub_results - (sub_id, run_id, step_number, step_name, output_text, + (sub_id, run_id, step_number, step_name, output_text, raw_response, prompt_tokens, completion_tokens, duration_ms, cost_usd, error, retry_count) VALUES - (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) + (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) """, ( str(sub.sub_id), @@ -106,6 +107,7 @@ def insert_sub_result(conn: sqlite3.Connection, sub: SubResult) -> None: sub.step_number, sub.step_name, sub.output_text, + sub.raw_response, sub.prompt_tokens, sub.completion_tokens, sub.duration_ms, diff --git a/src/maestro/providers/anthropic.py b/src/maestro/providers/anthropic.py index fd6b835..37a7216 100644 --- a/src/maestro/providers/anthropic.py +++ b/src/maestro/providers/anthropic.py @@ -129,6 +129,7 @@ def _do_call(): return RunResult( run_id=config.run_id, output_diagram_code=output, + raw_response=output, prompt_tokens=prompt_tokens, completion_tokens=completion_tokens, duration_ms=duration_ms, diff --git a/src/maestro/providers/gemini.py b/src/maestro/providers/gemini.py index 2506950..5b7e9bc 100644 --- a/src/maestro/providers/gemini.py +++ b/src/maestro/providers/gemini.py @@ -121,6 +121,7 @@ def _do_call(): return RunResult( run_id=config.run_id, output_diagram_code=output, + raw_response=output, prompt_tokens=prompt_tokens, completion_tokens=completion_tokens, duration_ms=duration_ms, diff --git a/src/maestro/providers/mistral.py b/src/maestro/providers/mistral.py index 06725ca..1f1ac09 100644 --- a/src/maestro/providers/mistral.py +++ b/src/maestro/providers/mistral.py @@ -125,6 +125,7 @@ def _do_call(): return RunResult( run_id=config.run_id, output_diagram_code=output, + raw_response=output, prompt_tokens=prompt_tokens, completion_tokens=completion_tokens, duration_ms=duration_ms, diff --git a/src/maestro/providers/openai.py b/src/maestro/providers/openai.py index 93c0860..fb40efc 100644 --- a/src/maestro/providers/openai.py +++ b/src/maestro/providers/openai.py @@ -149,6 +149,7 @@ def _do_call(): return RunResult( run_id=config.run_id, output_diagram_code=output, + raw_response=output, prompt_tokens=prompt_tokens, completion_tokens=completion_tokens, duration_ms=duration_ms, diff --git a/src/maestro/schemas.py b/src/maestro/schemas.py index a181328..31bff44 100644 --- a/src/maestro/schemas.py +++ b/src/maestro/schemas.py @@ -176,6 +176,12 @@ class RunResult(BaseModel): # Output output_diagram_code: str | None = None # Generated Mermaid / PlantUML / etc. + # Unprocessed model output as the provider returned it, retained even on a + # failure (when output_diagram_code is None). Lets a failed cell be diagnosed + # after the run from the text the model actually produced. None when the + # provider returned no text at all (safety block, no candidate). + raw_response: str | None = None + # Token usage prompt_tokens: int completion_tokens: int @@ -221,6 +227,9 @@ class SubResult(BaseModel): step_number: int # 1, 2, 3... step_name: str # "extract_entities", "extract_relationships", etc. output_text: str | None = None + # Raw model output for this step, kept even when the step fails validation + # (output_text is None then), so a rejected JSON/diagram stays diagnosable. + raw_response: str | None = None prompt_tokens: int completion_tokens: int duration_ms: int diff --git a/src/maestro/strategies/crew.py b/src/maestro/strategies/crew.py index a8a743f..f01c068 100644 --- a/src/maestro/strategies/crew.py +++ b/src/maestro/strategies/crew.py @@ -341,6 +341,7 @@ def _execute_step( SOP. """ last_error: str | None = None + last_raw: str | None = None recorder = _Recorder() # Accumulators across retry attempts: match SOP's metric model @@ -416,6 +417,8 @@ def _execute_step( total_completion_tokens += call.completion_tokens total_duration_ms += call.duration_ms total_cost_usd += call.cost_usd + # Keep the last raw output so a failed step stays diagnosable. + last_raw = call.output_text if call.error is not None: last_error = call.error @@ -436,6 +439,7 @@ def _execute_step( step_number=step_number, step_name=step_name, output_text=output, + raw_response=call.output_text, prompt_tokens=total_prompt_tokens, completion_tokens=total_completion_tokens, duration_ms=total_duration_ms, @@ -453,6 +457,7 @@ def _execute_step( step_number=step_number, step_name=step_name, output_text=None, + raw_response=last_raw, prompt_tokens=total_prompt_tokens, completion_tokens=total_completion_tokens, duration_ms=total_duration_ms, diff --git a/src/maestro/strategies/langgraph.py b/src/maestro/strategies/langgraph.py index 71fc987..20fae86 100644 --- a/src/maestro/strategies/langgraph.py +++ b/src/maestro/strategies/langgraph.py @@ -101,6 +101,7 @@ def _run_step( unobstructed. """ last_error: str | None = None + last_raw: str | None = None total_prompt_tokens = 0 total_completion_tokens = 0 @@ -116,9 +117,14 @@ def _run_step( total_completion_tokens += result.completion_tokens total_duration_ms += result.duration_ms total_cost_usd += result.cost_usd + # Keep the last raw output so a failed step stays diagnosable. + last_raw = result.raw_response if not result.success: - last_error = result.error + # success is False with no error string means an empty/blank + # diagram from the provider, not an SDK error; name it so the + # failed row does not read "No attempts executed". + last_error = result.error or "empty output from provider" continue output = strip_fences(result.output_diagram_code) @@ -135,6 +141,7 @@ def _run_step( step_number=step_number, step_name=step_name, output_text=output, + raw_response=result.raw_response, prompt_tokens=total_prompt_tokens, completion_tokens=total_completion_tokens, duration_ms=total_duration_ms, @@ -148,6 +155,7 @@ def _run_step( step_number=step_number, step_name=step_name, output_text=None, + raw_response=last_raw, prompt_tokens=total_prompt_tokens, completion_tokens=total_completion_tokens, duration_ms=total_duration_ms, diff --git a/src/maestro/strategies/sop.py b/src/maestro/strategies/sop.py index 7acf63f..ea5943a 100644 --- a/src/maestro/strategies/sop.py +++ b/src/maestro/strategies/sop.py @@ -187,6 +187,7 @@ def _execute_step( output_text is None if the step failed. """ last_error = None + last_raw = None result = None # Accumulate metrics across all attempts (including failed ones) @@ -206,6 +207,9 @@ def _execute_step( total_duration_ms += result.duration_ms total_cost_usd += result.cost_usd actual_retries = attempt + # Keep the last raw output so a failed step stays diagnosable (e.g. + # the malformed JSON a weak model emitted, which validation rejects). + last_raw = result.raw_response if result.success: # Validate the fenced-stripped output: empty on any step (a @@ -225,6 +229,7 @@ def _execute_step( step_number=step_number, step_name=step_name, output_text=output, + raw_response=result.raw_response, prompt_tokens=total_prompt_tokens, completion_tokens=total_completion_tokens, duration_ms=total_duration_ms, @@ -235,7 +240,10 @@ def _execute_step( output, ) else: - last_error = result.error + # success is False with no error string means the provider + # returned an empty/blank diagram (not an SDK error). Name that + # so the failed row does not read "No attempts executed". + last_error = result.error or "empty output from provider" # All attempts failed return ( @@ -244,6 +252,7 @@ def _execute_step( step_number=step_number, step_name=step_name, output_text=None, + raw_response=last_raw, prompt_tokens=total_prompt_tokens, completion_tokens=total_completion_tokens, duration_ms=total_duration_ms, diff --git a/tests/db/test_client.py b/tests/db/test_client.py index 98d942f..48923dc 100644 --- a/tests/db/test_client.py +++ b/tests/db/test_client.py @@ -59,3 +59,63 @@ def test_writer_connection_allows_writes(tmp_path): with get_readonly_connection(db_path) as conn: row = conn.execute("SELECT environment_id FROM run_environments").fetchone() assert row[0] == "env-1" + + +def test_raw_response_survives_a_failed_cell(tmp_path): + """A failed cell keeps the raw model output for diagnosis even though the + cleaned output (output_diagram_code / output_text) is None. This is what + lets a "invalid JSON" failure be inspected after the run without re-calling + the model.""" + from maestro.db.client import init_db + from maestro.db.queries import ( + insert_run_config, + insert_run_result, + insert_sub_result, + ) + from maestro.schemas import RunConfig, RunResult, Strategy, SubResult + + db_path = tmp_path / "raw.db" + init_db(db_path) + cfg = RunConfig( + strategy=Strategy.SOP_BASED, + model="m", + example_id="e", + tier=1, + run_number=1, + ) + failed = RunResult( + run_id=cfg.run_id, + output_diagram_code=None, + raw_response="{ malformed json the model emitted", + prompt_tokens=1, + completion_tokens=1, + duration_ms=1, + cost_usd=0.0, + error="invalid JSON", + ) + sub = SubResult( + run_id=cfg.run_id, + step_number=1, + step_name="extract_entities", + output_text=None, + raw_response="{ bad json from step 1", + prompt_tokens=1, + completion_tokens=1, + duration_ms=1, + cost_usd=0.0, + error="rejected", + ) + with get_connection(db_path) as conn: + insert_run_config(conn, cfg) + insert_run_result(conn, failed) + insert_sub_result(conn, sub) + + with get_readonly_connection(db_path) as conn: + rr = conn.execute( + "SELECT output_diagram_code, raw_response FROM run_results" + ).fetchone() + sr = conn.execute( + "SELECT output_text, raw_response FROM sub_results" + ).fetchone() + assert rr[0] is None and rr[1] == "{ malformed json the model emitted" + assert sr[0] is None and sr[1] == "{ bad json from step 1"