Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions src/maestro/db/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,12 @@
CREATE TABLE IF NOT EXISTS run_results (
run_id TEXT PRIMARY KEY,
output_diagram_code TEXT,
-- Unprocessed model output as returned by the provider, kept even when the
-- cell fails (output_diagram_code is None on a failure). This is what makes
-- a failure analysable after the fact: the malformed JSON or Mermaid the
-- model actually produced, not just the error string. Nullable: a provider
-- that returns nothing (safety block, no candidate) has no raw text.
raw_response TEXT,
prompt_tokens INTEGER NOT NULL,
completion_tokens INTEGER NOT NULL,
duration_ms INTEGER NOT NULL,
Expand All @@ -57,6 +63,10 @@
step_number INTEGER NOT NULL,
step_name TEXT NOT NULL,
output_text TEXT,
-- Raw model output for this step, kept even when the step fails validation
-- (output_text is None then). This is where a rejected step-1 JSON or a
-- malformed step-3 diagram is recoverable for diagnosis. See run_results.
raw_response TEXT,
prompt_tokens INTEGER NOT NULL,
completion_tokens INTEGER NOT NULL,
duration_ms INTEGER NOT NULL,
Expand Down
12 changes: 7 additions & 5 deletions src/maestro/db/queries.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,14 +71,15 @@ def insert_run_result(conn: sqlite3.Connection, result: RunResult) -> None:
conn.execute(
"""
INSERT INTO run_results
(run_id, output_diagram_code, prompt_tokens, completion_tokens,
duration_ms, cost_usd, error, retry_count)
(run_id, output_diagram_code, raw_response, prompt_tokens,
completion_tokens, duration_ms, cost_usd, error, retry_count)
VALUES
(?, ?, ?, ?, ?, ?, ?, ?)
(?, ?, ?, ?, ?, ?, ?, ?, ?)
""",
(
str(result.run_id),
result.output_diagram_code,
result.raw_response,
result.prompt_tokens,
result.completion_tokens,
result.duration_ms,
Expand All @@ -94,18 +95,19 @@ def insert_sub_result(conn: sqlite3.Connection, sub: SubResult) -> None:
conn.execute(
"""
INSERT INTO sub_results
(sub_id, run_id, step_number, step_name, output_text,
(sub_id, run_id, step_number, step_name, output_text, raw_response,
prompt_tokens, completion_tokens, duration_ms, cost_usd,
error, retry_count)
VALUES
(?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
(?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
""",
(
str(sub.sub_id),
str(sub.run_id),
sub.step_number,
sub.step_name,
sub.output_text,
sub.raw_response,
sub.prompt_tokens,
sub.completion_tokens,
sub.duration_ms,
Expand Down
1 change: 1 addition & 0 deletions src/maestro/providers/anthropic.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,7 @@ def _do_call():
return RunResult(
run_id=config.run_id,
output_diagram_code=output,
raw_response=output,
prompt_tokens=prompt_tokens,
completion_tokens=completion_tokens,
duration_ms=duration_ms,
Expand Down
1 change: 1 addition & 0 deletions src/maestro/providers/gemini.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,7 @@ def _do_call():
return RunResult(
run_id=config.run_id,
output_diagram_code=output,
raw_response=output,
prompt_tokens=prompt_tokens,
completion_tokens=completion_tokens,
duration_ms=duration_ms,
Expand Down
1 change: 1 addition & 0 deletions src/maestro/providers/mistral.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,7 @@ def _do_call():
return RunResult(
run_id=config.run_id,
output_diagram_code=output,
raw_response=output,
prompt_tokens=prompt_tokens,
completion_tokens=completion_tokens,
duration_ms=duration_ms,
Expand Down
1 change: 1 addition & 0 deletions src/maestro/providers/openai.py
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,7 @@ def _do_call():
return RunResult(
run_id=config.run_id,
output_diagram_code=output,
raw_response=output,
prompt_tokens=prompt_tokens,
completion_tokens=completion_tokens,
duration_ms=duration_ms,
Expand Down
9 changes: 9 additions & 0 deletions src/maestro/schemas.py
Original file line number Diff line number Diff line change
Expand Up @@ -176,6 +176,12 @@ class RunResult(BaseModel):
# Output
output_diagram_code: str | None = None # Generated Mermaid / PlantUML / etc.

# Unprocessed model output as the provider returned it, retained even on a
# failure (when output_diagram_code is None). Lets a failed cell be diagnosed
# after the run from the text the model actually produced. None when the
# provider returned no text at all (safety block, no candidate).
raw_response: str | None = None

# Token usage
prompt_tokens: int
completion_tokens: int
Expand Down Expand Up @@ -221,6 +227,9 @@ class SubResult(BaseModel):
step_number: int # 1, 2, 3...
step_name: str # "extract_entities", "extract_relationships", etc.
output_text: str | None = None
# Raw model output for this step, kept even when the step fails validation
# (output_text is None then), so a rejected JSON/diagram stays diagnosable.
raw_response: str | None = None
prompt_tokens: int
completion_tokens: int
duration_ms: int
Expand Down
5 changes: 5 additions & 0 deletions src/maestro/strategies/crew.py
Original file line number Diff line number Diff line change
Expand Up @@ -341,6 +341,7 @@ def _execute_step(
SOP.
"""
last_error: str | None = None
last_raw: str | None = None
recorder = _Recorder()

# Accumulators across retry attempts: match SOP's metric model
Expand Down Expand Up @@ -416,6 +417,8 @@ def _execute_step(
total_completion_tokens += call.completion_tokens
total_duration_ms += call.duration_ms
total_cost_usd += call.cost_usd
# Keep the last raw output so a failed step stays diagnosable.
last_raw = call.output_text

if call.error is not None:
last_error = call.error
Expand All @@ -436,6 +439,7 @@ def _execute_step(
step_number=step_number,
step_name=step_name,
output_text=output,
raw_response=call.output_text,
prompt_tokens=total_prompt_tokens,
completion_tokens=total_completion_tokens,
duration_ms=total_duration_ms,
Expand All @@ -453,6 +457,7 @@ def _execute_step(
step_number=step_number,
step_name=step_name,
output_text=None,
raw_response=last_raw,
prompt_tokens=total_prompt_tokens,
completion_tokens=total_completion_tokens,
duration_ms=total_duration_ms,
Expand Down
10 changes: 9 additions & 1 deletion src/maestro/strategies/langgraph.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,7 @@ def _run_step(
unobstructed.
"""
last_error: str | None = None
last_raw: str | None = None

total_prompt_tokens = 0
total_completion_tokens = 0
Expand All @@ -116,9 +117,14 @@ def _run_step(
total_completion_tokens += result.completion_tokens
total_duration_ms += result.duration_ms
total_cost_usd += result.cost_usd
# Keep the last raw output so a failed step stays diagnosable.
last_raw = result.raw_response

if not result.success:
last_error = result.error
# success is False with no error string means an empty/blank
# diagram from the provider, not an SDK error; name it so the
# failed row does not read "No attempts executed".
last_error = result.error or "empty output from provider"
continue

output = strip_fences(result.output_diagram_code)
Expand All @@ -135,6 +141,7 @@ def _run_step(
step_number=step_number,
step_name=step_name,
output_text=output,
raw_response=result.raw_response,
prompt_tokens=total_prompt_tokens,
completion_tokens=total_completion_tokens,
duration_ms=total_duration_ms,
Expand All @@ -148,6 +155,7 @@ def _run_step(
step_number=step_number,
step_name=step_name,
output_text=None,
raw_response=last_raw,
prompt_tokens=total_prompt_tokens,
completion_tokens=total_completion_tokens,
duration_ms=total_duration_ms,
Expand Down
11 changes: 10 additions & 1 deletion src/maestro/strategies/sop.py
Original file line number Diff line number Diff line change
Expand Up @@ -187,6 +187,7 @@ def _execute_step(
output_text is None if the step failed.
"""
last_error = None
last_raw = None
result = None

# Accumulate metrics across all attempts (including failed ones)
Expand All @@ -206,6 +207,9 @@ def _execute_step(
total_duration_ms += result.duration_ms
total_cost_usd += result.cost_usd
actual_retries = attempt
# Keep the last raw output so a failed step stays diagnosable (e.g.
# the malformed JSON a weak model emitted, which validation rejects).
last_raw = result.raw_response

if result.success:
# Validate the fenced-stripped output: empty on any step (a
Expand All @@ -225,6 +229,7 @@ def _execute_step(
step_number=step_number,
step_name=step_name,
output_text=output,
raw_response=result.raw_response,
prompt_tokens=total_prompt_tokens,
completion_tokens=total_completion_tokens,
duration_ms=total_duration_ms,
Expand All @@ -235,7 +240,10 @@ def _execute_step(
output,
)
else:
last_error = result.error
# success is False with no error string means the provider
# returned an empty/blank diagram (not an SDK error). Name that
# so the failed row does not read "No attempts executed".
last_error = result.error or "empty output from provider"

# All attempts failed
return (
Expand All @@ -244,6 +252,7 @@ def _execute_step(
step_number=step_number,
step_name=step_name,
output_text=None,
raw_response=last_raw,
prompt_tokens=total_prompt_tokens,
completion_tokens=total_completion_tokens,
duration_ms=total_duration_ms,
Expand Down
60 changes: 60 additions & 0 deletions tests/db/test_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,3 +59,63 @@ def test_writer_connection_allows_writes(tmp_path):
with get_readonly_connection(db_path) as conn:
row = conn.execute("SELECT environment_id FROM run_environments").fetchone()
assert row[0] == "env-1"


def test_raw_response_survives_a_failed_cell(tmp_path):
"""A failed cell keeps the raw model output for diagnosis even though the
cleaned output (output_diagram_code / output_text) is None. This is what
lets a "invalid JSON" failure be inspected after the run without re-calling
the model."""
from maestro.db.client import init_db
from maestro.db.queries import (
insert_run_config,
insert_run_result,
insert_sub_result,
)
from maestro.schemas import RunConfig, RunResult, Strategy, SubResult

db_path = tmp_path / "raw.db"
init_db(db_path)
cfg = RunConfig(
strategy=Strategy.SOP_BASED,
model="m",
example_id="e",
tier=1,
run_number=1,
)
failed = RunResult(
run_id=cfg.run_id,
output_diagram_code=None,
raw_response="{ malformed json the model emitted",
prompt_tokens=1,
completion_tokens=1,
duration_ms=1,
cost_usd=0.0,
error="invalid JSON",
)
sub = SubResult(
run_id=cfg.run_id,
step_number=1,
step_name="extract_entities",
output_text=None,
raw_response="{ bad json from step 1",
prompt_tokens=1,
completion_tokens=1,
duration_ms=1,
cost_usd=0.0,
error="rejected",
)
with get_connection(db_path) as conn:
insert_run_config(conn, cfg)
insert_run_result(conn, failed)
insert_sub_result(conn, sub)

with get_readonly_connection(db_path) as conn:
rr = conn.execute(
"SELECT output_diagram_code, raw_response FROM run_results"
).fetchone()
sr = conn.execute(
"SELECT output_text, raw_response FROM sub_results"
).fetchone()
assert rr[0] is None and rr[1] == "{ malformed json the model emitted"
assert sr[0] is None and sr[1] == "{ bad json from step 1"