Colinho22 · Colinho22 · Jun 20, 2026 · Jun 20, 2026
diff --git a/src/maestro/db/client.py b/src/maestro/db/client.py
@@ -42,6 +42,12 @@
 CREATE TABLE IF NOT EXISTS run_results (
     run_id               TEXT PRIMARY KEY,
     output_diagram_code  TEXT,
+    -- Unprocessed model output as returned by the provider, kept even when the
+    -- cell fails (output_diagram_code is None on a failure). This is what makes
+    -- a failure analysable after the fact: the malformed JSON or Mermaid the
+    -- model actually produced, not just the error string. Nullable: a provider
+    -- that returns nothing (safety block, no candidate) has no raw text.
+    raw_response         TEXT,
     prompt_tokens        INTEGER NOT NULL,
     completion_tokens    INTEGER NOT NULL,
     duration_ms          INTEGER NOT NULL,
@@ -57,6 +63,10 @@
     step_number       INTEGER NOT NULL,
     step_name         TEXT NOT NULL,
     output_text       TEXT,
+    -- Raw model output for this step, kept even when the step fails validation
+    -- (output_text is None then). This is where a rejected step-1 JSON or a
+    -- malformed step-3 diagram is recoverable for diagnosis. See run_results.
+    raw_response      TEXT,
     prompt_tokens     INTEGER NOT NULL,
     completion_tokens INTEGER NOT NULL,
     duration_ms       INTEGER NOT NULL,

diff --git a/src/maestro/db/queries.py b/src/maestro/db/queries.py
@@ -71,14 +71,15 @@ def insert_run_result(conn: sqlite3.Connection, result: RunResult) -> None:
     conn.execute(
         """
         INSERT INTO run_results
-            (run_id, output_diagram_code, prompt_tokens, completion_tokens,
-             duration_ms, cost_usd, error, retry_count)
+            (run_id, output_diagram_code, raw_response, prompt_tokens,
+             completion_tokens, duration_ms, cost_usd, error, retry_count)
         VALUES
-            (?, ?, ?, ?, ?, ?, ?, ?)
+            (?, ?, ?, ?, ?, ?, ?, ?, ?)
         """,
         (
             str(result.run_id),
             result.output_diagram_code,
+            result.raw_response,
             result.prompt_tokens,
             result.completion_tokens,
             result.duration_ms,
@@ -94,18 +95,19 @@ def insert_sub_result(conn: sqlite3.Connection, sub: SubResult) -> None:
     conn.execute(
         """
         INSERT INTO sub_results
-            (sub_id, run_id, step_number, step_name, output_text,
+            (sub_id, run_id, step_number, step_name, output_text, raw_response,
              prompt_tokens, completion_tokens, duration_ms, cost_usd,
              error, retry_count)
         VALUES
-            (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
+            (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
         """,
         (
             str(sub.sub_id),
             str(sub.run_id),
             sub.step_number,
             sub.step_name,
             sub.output_text,
+            sub.raw_response,
             sub.prompt_tokens,
             sub.completion_tokens,
             sub.duration_ms,

diff --git a/src/maestro/providers/anthropic.py b/src/maestro/providers/anthropic.py
@@ -129,6 +129,7 @@ def _do_call():
             return RunResult(
                 run_id=config.run_id,
                 output_diagram_code=output,
+                raw_response=output,
                 prompt_tokens=prompt_tokens,
                 completion_tokens=completion_tokens,
                 duration_ms=duration_ms,

diff --git a/src/maestro/providers/gemini.py b/src/maestro/providers/gemini.py
@@ -121,6 +121,7 @@ def _do_call():
             return RunResult(
                 run_id=config.run_id,
                 output_diagram_code=output,
+                raw_response=output,
                 prompt_tokens=prompt_tokens,
                 completion_tokens=completion_tokens,
                 duration_ms=duration_ms,

diff --git a/src/maestro/providers/mistral.py b/src/maestro/providers/mistral.py
@@ -125,6 +125,7 @@ def _do_call():
             return RunResult(
                 run_id=config.run_id,
                 output_diagram_code=output,
+                raw_response=output,
                 prompt_tokens=prompt_tokens,
                 completion_tokens=completion_tokens,
                 duration_ms=duration_ms,

diff --git a/src/maestro/providers/openai.py b/src/maestro/providers/openai.py
@@ -149,6 +149,7 @@ def _do_call():
             return RunResult(
                 run_id=config.run_id,
                 output_diagram_code=output,
+                raw_response=output,
                 prompt_tokens=prompt_tokens,
                 completion_tokens=completion_tokens,
                 duration_ms=duration_ms,

diff --git a/src/maestro/schemas.py b/src/maestro/schemas.py
@@ -176,6 +176,12 @@ class RunResult(BaseModel):
     # Output
     output_diagram_code: str | None = None  # Generated Mermaid / PlantUML / etc.
 
+    # Unprocessed model output as the provider returned it, retained even on a
+    # failure (when output_diagram_code is None). Lets a failed cell be diagnosed
+    # after the run from the text the model actually produced. None when the
+    # provider returned no text at all (safety block, no candidate).
+    raw_response: str | None = None
+
     # Token usage
     prompt_tokens: int
     completion_tokens: int
@@ -221,6 +227,9 @@ class SubResult(BaseModel):
     step_number: int  # 1, 2, 3...
     step_name: str  # "extract_entities", "extract_relationships", etc.
     output_text: str | None = None
+    # Raw model output for this step, kept even when the step fails validation
+    # (output_text is None then), so a rejected JSON/diagram stays diagnosable.
+    raw_response: str | None = None
     prompt_tokens: int
     completion_tokens: int
     duration_ms: int

diff --git a/src/maestro/strategies/crew.py b/src/maestro/strategies/crew.py
@@ -341,6 +341,7 @@ def _execute_step(
         SOP.
         """
         last_error: str | None = None
+        last_raw: str | None = None
         recorder = _Recorder()
 
         # Accumulators across retry attempts: match SOP's metric model
@@ -416,6 +417,8 @@ def _execute_step(
             total_completion_tokens += call.completion_tokens
             total_duration_ms += call.duration_ms
             total_cost_usd += call.cost_usd
+            # Keep the last raw output so a failed step stays diagnosable.
+            last_raw = call.output_text
 
             if call.error is not None:
                 last_error = call.error
@@ -436,6 +439,7 @@ def _execute_step(
                     step_number=step_number,
                     step_name=step_name,
                     output_text=output,
+                    raw_response=call.output_text,
                     prompt_tokens=total_prompt_tokens,
                     completion_tokens=total_completion_tokens,
                     duration_ms=total_duration_ms,
@@ -453,6 +457,7 @@ def _execute_step(
                 step_number=step_number,
                 step_name=step_name,
                 output_text=None,
+                raw_response=last_raw,
                 prompt_tokens=total_prompt_tokens,
                 completion_tokens=total_completion_tokens,
                 duration_ms=total_duration_ms,

diff --git a/src/maestro/strategies/langgraph.py b/src/maestro/strategies/langgraph.py
@@ -101,6 +101,7 @@ def _run_step(
     unobstructed.
     """
     last_error: str | None = None
+    last_raw: str | None = None
 
     total_prompt_tokens = 0
     total_completion_tokens = 0
@@ -116,9 +117,14 @@ def _run_step(
         total_completion_tokens += result.completion_tokens
         total_duration_ms += result.duration_ms
         total_cost_usd += result.cost_usd
+        # Keep the last raw output so a failed step stays diagnosable.
+        last_raw = result.raw_response
 
         if not result.success:
-            last_error = result.error
+            # success is False with no error string means an empty/blank
+            # diagram from the provider, not an SDK error; name it so the
+            # failed row does not read "No attempts executed".
+            last_error = result.error or "empty output from provider"
             continue
 
         output = strip_fences(result.output_diagram_code)
@@ -135,6 +141,7 @@ def _run_step(
             step_number=step_number,
             step_name=step_name,
             output_text=output,
+            raw_response=result.raw_response,
             prompt_tokens=total_prompt_tokens,
             completion_tokens=total_completion_tokens,
             duration_ms=total_duration_ms,
@@ -148,6 +155,7 @@ def _run_step(
         step_number=step_number,
         step_name=step_name,
         output_text=None,
+        raw_response=last_raw,
         prompt_tokens=total_prompt_tokens,
         completion_tokens=total_completion_tokens,
         duration_ms=total_duration_ms,

diff --git a/src/maestro/strategies/sop.py b/src/maestro/strategies/sop.py
@@ -187,6 +187,7 @@ def _execute_step(
         output_text is None if the step failed.
         """
         last_error = None
+        last_raw = None
         result = None
 
         # Accumulate metrics across all attempts (including failed ones)
@@ -206,6 +207,9 @@ def _execute_step(
             total_duration_ms += result.duration_ms
             total_cost_usd += result.cost_usd
             actual_retries = attempt
+            # Keep the last raw output so a failed step stays diagnosable (e.g.
+            # the malformed JSON a weak model emitted, which validation rejects).
+            last_raw = result.raw_response
 
             if result.success:
                 # Validate the fenced-stripped output: empty on any step (a
@@ -225,6 +229,7 @@ def _execute_step(
                         step_number=step_number,
                         step_name=step_name,
                         output_text=output,
+                        raw_response=result.raw_response,
                         prompt_tokens=total_prompt_tokens,
                         completion_tokens=total_completion_tokens,
                         duration_ms=total_duration_ms,
@@ -235,7 +240,10 @@ def _execute_step(
                     output,
                 )
             else:
-                last_error = result.error
+                # success is False with no error string means the provider
+                # returned an empty/blank diagram (not an SDK error). Name that
+                # so the failed row does not read "No attempts executed".
+                last_error = result.error or "empty output from provider"
 
         # All attempts failed
         return (
@@ -244,6 +252,7 @@ def _execute_step(
                 step_number=step_number,
                 step_name=step_name,
                 output_text=None,
+                raw_response=last_raw,
                 prompt_tokens=total_prompt_tokens,
                 completion_tokens=total_completion_tokens,
                 duration_ms=total_duration_ms,

diff --git a/tests/db/test_client.py b/tests/db/test_client.py
@@ -59,3 +59,63 @@ def test_writer_connection_allows_writes(tmp_path):
     with get_readonly_connection(db_path) as conn:
         row = conn.execute("SELECT environment_id FROM run_environments").fetchone()
     assert row[0] == "env-1"
+
+
+def test_raw_response_survives_a_failed_cell(tmp_path):
+    """A failed cell keeps the raw model output for diagnosis even though the
+    cleaned output (output_diagram_code / output_text) is None. This is what
+    lets a "invalid JSON" failure be inspected after the run without re-calling
+    the model."""
+    from maestro.db.client import init_db
+    from maestro.db.queries import (
+        insert_run_config,
+        insert_run_result,
+        insert_sub_result,
+    )
+    from maestro.schemas import RunConfig, RunResult, Strategy, SubResult
+
+    db_path = tmp_path / "raw.db"
+    init_db(db_path)
+    cfg = RunConfig(
+        strategy=Strategy.SOP_BASED,
+        model="m",
+        example_id="e",
+        tier=1,
+        run_number=1,
+    )
+    failed = RunResult(
+        run_id=cfg.run_id,
+        output_diagram_code=None,
+        raw_response="{ malformed json the model emitted",
+        prompt_tokens=1,
+        completion_tokens=1,
+        duration_ms=1,
+        cost_usd=0.0,
+        error="invalid JSON",
+    )
+    sub = SubResult(
+        run_id=cfg.run_id,
+        step_number=1,
+        step_name="extract_entities",
+        output_text=None,
+        raw_response="{ bad json from step 1",
+        prompt_tokens=1,
+        completion_tokens=1,
+        duration_ms=1,
+        cost_usd=0.0,
+        error="rejected",
+    )
+    with get_connection(db_path) as conn:
+        insert_run_config(conn, cfg)
+        insert_run_result(conn, failed)
+        insert_sub_result(conn, sub)
+
+    with get_readonly_connection(db_path) as conn:
+        rr = conn.execute(
+            "SELECT output_diagram_code, raw_response FROM run_results"
+        ).fetchone()
+        sr = conn.execute(
+            "SELECT output_text, raw_response FROM sub_results"
+        ).fetchone()
+    assert rr[0] is None and rr[1] == "{ malformed json the model emitted"
+    assert sr[0] is None and sr[1] == "{ bad json from step 1"