From c58f8d6bbb7a153ea7c4f1756ca9376dcc1afdb9 Mon Sep 17 00:00:00 2001 From: Colinho22 <48288595+Colinho22@users.noreply.github.com> Date: Fri, 19 Jun 2026 23:17:13 +0200 Subject: [PATCH 1/7] fix(run): suppress CrewAI trace prompt that blocks batch runs CREWAI_TRACING_ENABLED=false governs only the enabled path, not the first-execution 'view traces? [y/N]' prompt, which blocks on stdin for a 20s timeout per crew. The file-based declined preference is wiped on every docker --rm, so it re-fires on every cell: ~1500 crew_ai cells means hours of dead waiting and corrupted duration_ms. CREWAI_TESTING=true short circuits the prompt before any stdin read. --- src/maestro/run.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/src/maestro/run.py b/src/maestro/run.py index 2f0c90e..27ae9e8 100644 --- a/src/maestro/run.py +++ b/src/maestro/run.py @@ -57,8 +57,19 @@ # Silence CrewAI's interactive tracing prompt and telemetry so batch runs # stay non-interactive on fresh checkouts (the user's preference file is # machine-local and won't exist in CI / on a fresh clone). +# +# CREWAI_TRACING_ENABLED=false only governs the *enabled* path; it does NOT +# stop the first-execution "view your traces? [y/N]" prompt, which blocks on +# stdin for a 20s timeout per crew. In a headless container that file-based +# "declined" preference is wiped every `--rm`, so the prompt fires on every +# cell: ~1500 crew_ai cells x 20s is hours of dead waiting, and it corrupts +# the measured duration_ms. CREWAI_TESTING=true is the only flag that short +# circuits the prompt before any stdin read (crewai .../tracing/utils.py: +# _is_test_environment guards both the auto-collect check and the prompt). It +# disables trace prompts/telemetry only; it does not alter agent execution. os.environ.setdefault("CREWAI_TRACING_ENABLED", "false") os.environ.setdefault("CREWAI_DISABLE_TELEMETRY", "true") +os.environ.setdefault("CREWAI_TESTING", "true") from maestro.analysis.metrics import evaluate_run from maestro.db.client import get_connection, init_db From be99dd99e8e4b0d85aaf0b2dfe92cf7ef6b01198 Mon Sep 17 00:00:00 2001 From: Colinho22 <48288595+Colinho22@users.noreply.github.com> Date: Sat, 20 Jun 2026 12:17:43 +0200 Subject: [PATCH 2/7] fix(prompts): notation-aware labels, IR fidelity, subgraph labels, step-3 validation Empirical review of strategy outputs found the multi-step strategies scored entity_name_f1 near 0 on C4 inputs: their JSON IR flattened the name/[Type]/tech label structure the ground truth uses, and the shared rules never taught that format. Fixes, verified to lift name_f1 to ~1.0 on both C4 and BPMN across all four strategies: - Label rule is now notation-aware: C4 and network-topology diagrams get the name/[Type]/tech multi-line label; BPMN process and collaboration diagrams keep bare names. diagram_type is read from the input metadata and passed to every strategy's render step as task context (the baseline already saw it; this makes it uniform and explicit), so the convention is applied consistently rather than guessed. - Step-1 IR carries a short tech label (not the long description) and extracts every referenced parent group as a named entity, so container and boundary subgraphs render with their labels instead of empty brackets. The hierarchy rule now covers C4 boundaries and deployment environments and requires a quoted subgraph label. - Step-3 gains a light structural check that rejects empty-label brackets and concatenated node defs, so a malformed diagram consumes the retry budget instead of landing as a scored parse failure. - Strip CrewAI's expected-output scaffolding so its delivered prompt is byte-identical to SOP, removing a confound between the two strategies. --- src/maestro/prompts.py | 3 +- src/maestro/strategies/_extraction.py | 66 ++++++++++++++++++++++-- src/maestro/strategies/crew.py | 47 +++++++++++++---- src/maestro/strategies/langgraph.py | 10 +++- src/maestro/strategies/single.py | 14 +++-- src/maestro/strategies/sop.py | 7 ++- tests/strategies/test_step_validation.py | 33 +++++++++++- tests/test_prompts.py | 25 +++++++-- 8 files changed, 178 insertions(+), 27 deletions(-) diff --git a/src/maestro/prompts.py b/src/maestro/prompts.py index 005b682..4f3e83e 100644 --- a/src/maestro/prompts.py +++ b/src/maestro/prompts.py @@ -37,9 +37,10 @@ - Output only valid Mermaid syntax - Wrap node labels in double quotes, e.g. node_id["My Label"], so labels with spaces, parentheses, slashes, or line breaks stay parseable - If a node has no label, write just its id (e.g. gw_result): never an empty bracket like node_id[""] +- Build each node label according to the diagram notation. For architecture and infrastructure diagrams (C4 container, network topology), put the entity name on the first line, the type on a second line wrapped in square brackets and title-cased (external_system becomes [External System], device becomes [Device]), and a short technology or kind label on a third line only when the input gives one, joined with a literal \\n inside the quotes, e.g. node_id["SomeApp\\n[Container]\\nWeb Application"]. Keep the third line to a few words; never put a full sentence or description in a label. For process diagrams (BPMN process, BPMN collaboration), use the entity name alone with no type line, e.g. task_1["Task 1"] - Quote edge labels the same way, with no spaces inside the pipes, e.g. a -->|"My edge"| b; for an unlabelled edge use a plain arrow a --> b and never an empty label like -->|| or -->| | - Include every entity and relationship from the input -- Preserve hierarchy using subgraphs for pools, lanes, and subprocesses +- Preserve hierarchy using subgraphs for any grouping the input gives, such as pools, lanes, subprocesses, system boundaries, or deployment environments. Always give a subgraph a quoted label from the group's name, e.g. subgraph infomaniak["Infomaniak Public Cloud"]; never write an empty subgraph label like subgraph id[""] - Do not invent entities or relationships not present in the input - Do not include explanations or markdown code fences - Do not use internal or relationship IDs as edge labels""" diff --git a/src/maestro/strategies/_extraction.py b/src/maestro/strategies/_extraction.py index a2f0288..79609ae 100644 --- a/src/maestro/strategies/_extraction.py +++ b/src/maestro/strategies/_extraction.py @@ -19,6 +19,7 @@ from __future__ import annotations import json +import re from maestro.prompts import render_rules @@ -32,8 +33,18 @@ Rules: - Output valid JSON only, no explanations, no markdown fencing +- Extract only the actual elements or nodes; never turn a metadata or summary + field into an entity - Include every entity from the input -- Capture parent-child relationships (pools, lanes, subprocesses) +- Capture parent-child relationships (pools, lanes, subprocesses, system + boundaries, deployment environments). When an element names a parent group + (a lane, pool, boundary, or environment), include that group itself as an + entity with its own id and name, so the group can be drawn as a labeled + subgraph; never leave a referenced parent without a name. +- Copy name and type from the input verbatim. For tech, copy a short technology + or kind label if the input gives one (a "technology" field or similar), not a + long sentence; leave it null when the input has none. This is the third label + line, so keep it short. - Use this exact schema: {{ "entities": [ @@ -41,6 +52,7 @@ "id": "string", "name": "string", "type": "string", + "tech": "string or null", "parent_id": "string or null" }} ] @@ -91,11 +103,17 @@ # Rules come from the canonical contract (maestro.prompts) so step 3 and the # single-agent baseline are given a byte-identical output contract. The runtime # placeholders are escaped (``{{...}}``) so they survive this f-string and are -# filled later by ``.format(entities_json=..., relationships_json=...)``. +# filled later by ``.format(diagram_type=..., entities_json=..., +# relationships_json=...)``. The diagram type is task context (it is in the +# input metadata, which the single-agent baseline already sees): the label +# rules differ by notation, so the render step must know which notation it is +# producing, the same way a person would be told "draw this as a C4 diagram". STEP_3_PROMPT = f"""\ You are given a set of entities and relationships extracted from a dataset. Your task is to generate a Mermaid diagram that accurately represents them. +The diagram notation is: {{diagram_type}} + Rules: {render_rules()} @@ -135,6 +153,23 @@ # --------------------------------------------------------------------------- +def extract_diagram_type(raw_input: str) -> str: + """ + Read ``metadata.diagram_type`` from a raw input JSON string. + + Shared by every strategy so the render step's notation context is derived + identically. Falls back to "unspecified" if the field is missing or the + input is unparseable, so a malformed input still renders rather than + crashing the strategy (the diagram simply gets no notation hint). + """ + try: + return ( + json.loads(raw_input).get("metadata", {}).get("diagram_type", "unspecified") + ) + except (json.JSONDecodeError, TypeError, AttributeError): + return "unspecified" + + def strip_fences(text: str | None) -> str | None: """ Remove markdown code fences if present. @@ -187,8 +222,9 @@ def validate_step_output(text: str | None, step_number: int) -> tuple[bool, str Rejects empty output on every step (strip_fences can empty an otherwise non-empty response, e.g. bare ``` fences, which would otherwise pass the provider success check and leave step 3 with no diagram), then applies the - step-1/step-2 JSON shape check via validate_step_payload. Step 3 has no - further shape rule once it is non-empty. + step-1/step-2 JSON shape check via validate_step_payload. Step 3 gets a + light structural check so a malformed diagram consumes the retry budget + instead of passing as "non-empty" and landing as a scored parse failure. Returns (is_valid, error_message); error_message is None on success. """ @@ -196,4 +232,26 @@ def validate_step_output(text: str | None, step_number: int) -> tuple[bool, str return False, "empty output" if step_number < 3: return validate_step_payload(text, step_number) + return _validate_mermaid_shape(text) + + +# Empty-label bracket (node_id[""]) and two node defs concatenated with no +# separator (][ as in `a[""]b["B"]`) are the exact malformations a weak model +# emitted at step 3. They are cheap to detect with a regex and forbidden by the +# output contract, so catching them here lets the existing MAX_RETRIES retry +# rather than scoring a guaranteed mmdc parse failure. This is a narrow +# structural smoke check, not full Mermaid validation (that lives in the metric +# via mmdc); it must never reject a well-formed diagram. +_EMPTY_LABEL = re.compile(r"""[\[\(\{]+\s*(["'])\1\s*[\]\)\}]+""") +_CONCATENATED_NODES = re.compile(r"[\]\)\}]\s*\w+\s*[\[\(\{]") + + +def _validate_mermaid_shape(text: str) -> tuple[bool, str | None]: + """Reject the two step-3 malformations we have actually observed.""" + if _EMPTY_LABEL.search(text): + return False, 'empty node label bracket (e.g. node_id[""])' + for raw in text.splitlines(): + line = raw.strip() + if _CONCATENATED_NODES.search(line): + return False, "node definitions concatenated without a separator" return True, None diff --git a/src/maestro/strategies/crew.py b/src/maestro/strategies/crew.py index a492b2d..a19b549 100644 --- a/src/maestro/strategies/crew.py +++ b/src/maestro/strategies/crew.py @@ -47,6 +47,7 @@ STEP_1_PROMPT, STEP_2_PROMPT, STEP_3_PROMPT, + extract_diagram_type, strip_fences, validate_step_output, ) @@ -153,19 +154,41 @@ def call( # recorder and surfaces it through SubResult.error. return result.output_diagram_code or "" - @staticmethod - def _collapse_messages(messages) -> str: + # CrewAI appends an "expected output" block after our task description, + # built from its en.json "expected_output" slice. That trailing text is + # CrewAI scaffolding SOP never sends, so leaving it in would make crew_ai's + # delivered prompt differ from SOP's and confound the comparison (the two + # strategies must differ only in orchestration, not prompt content). We cut + # from this stable marker onward so the provider receives exactly the step + # prompt, byte-identical to SOP. The marker is verbatim from CrewAI's + # en.json "expected_output" slice. + _CREW_EXPECTED_OUTPUT_MARKER = ( + "\nThis is the expected criteria for your final answer:" + ) + + @classmethod + def _collapse_messages(cls, messages) -> str: """ CrewAI's ``Agent`` + ``Task`` machinery composes a list of messages - with role 'system' (agent persona) and 'user' (task description). We - ignore CrewAI's system message (our system prompt is set explicitly - per step via ``system_prompt_override``) and concatenate the user - portion as the prompt. + with role 'system' (agent persona) and 'user' (task description plus an + expected-output block). We ignore CrewAI's system message (our system + prompt is set explicitly per step via ``system_prompt_override``), + concatenate the user portion, and strip CrewAI's expected-output + scaffolding so the delivered prompt matches SOP byte-for-byte. """ if isinstance(messages, str): - return messages - user_parts = [m.get("content", "") for m in messages if m.get("role") == "user"] - return "\n\n".join(p for p in user_parts if p) + text = messages + else: + user_parts = [ + m.get("content", "") for m in messages if m.get("role") == "user" + ] + text = "\n\n".join(p for p in user_parts if p) + # Drop CrewAI's expected-output block (and anything after it) so only + # our task prompt reaches the provider. + marker = text.find(cls._CREW_EXPECTED_OUTPUT_MARKER) + if marker != -1: + text = text[:marker] + return text.rstrip() # --------------------------------------------------------------------------- @@ -246,6 +269,7 @@ def run( try: raw = input_file.file_path.read_text(encoding="utf-8") input_data = json.dumps(json.loads(raw), indent=2) + diagram_type = extract_diagram_type(raw) except FileNotFoundError: return self._error_result( config, f"Input file not found: {input_file.file_path}" @@ -261,7 +285,7 @@ def run( for step in STEPS: # Build the prompt with outputs from previous steps (same logic as SOP) - prompt = self._build_prompt(step, input_data, step_outputs) + prompt = self._build_prompt(step, input_data, diagram_type, step_outputs) # Execute this step as a single-task Crew, with retries sub, output_text = self._execute_step( @@ -445,11 +469,12 @@ def _build_prompt( self, step: dict, input_data: str, + diagram_type: str, step_outputs: dict[str, str], ) -> str: """Like SOP._build_prompt: fill template vars from prior outputs.""" template = step["task_prompt"] - fmt = {"input_data": input_data} + fmt = {"input_data": input_data, "diagram_type": diagram_type} if "extract_entities" in step_outputs: fmt["entities_json"] = step_outputs["extract_entities"] if "extract_relationships" in step_outputs: diff --git a/src/maestro/strategies/langgraph.py b/src/maestro/strategies/langgraph.py index 45637ac..71fc987 100644 --- a/src/maestro/strategies/langgraph.py +++ b/src/maestro/strategies/langgraph.py @@ -47,6 +47,7 @@ STEP_1_PROMPT, STEP_2_PROMPT, STEP_3_PROMPT, + extract_diagram_type, strip_fences, validate_step_output, ) @@ -67,6 +68,7 @@ class GraphState(TypedDict, total=False): """ input_data: str + diagram_type: str entities_json: str relationships_json: str diagram_code: str @@ -209,6 +211,7 @@ def extract_relationships_node(state: GraphState) -> GraphState: def generate_mermaid_node(state: GraphState) -> GraphState: prompt = STEP_3_PROMPT.format( + diagram_type=state["diagram_type"], entities_json=state["entities_json"], relationships_json=state["relationships_json"], ) @@ -287,6 +290,7 @@ def run( try: raw = input_file.file_path.read_text(encoding="utf-8") input_data = json.dumps(json.loads(raw), indent=2) + diagram_type = extract_diagram_type(raw) except FileNotFoundError: return self._error_result( config, f"Input file not found: {input_file.file_path}" @@ -328,7 +332,11 @@ def run( # Execute the graph. LangGraph collects partial updates from each # node into a single final state dict. final_state: GraphState = compiled.invoke( - {"input_data": input_data, "sub_results": []} + { + "input_data": input_data, + "diagram_type": diagram_type, + "sub_results": [], + } ) sub_results: list[SubResult] = final_state.get("sub_results", []) diff --git a/src/maestro/strategies/single.py b/src/maestro/strategies/single.py index 282c793..0264d44 100644 --- a/src/maestro/strategies/single.py +++ b/src/maestro/strategies/single.py @@ -27,12 +27,17 @@ # Rules come from the canonical contract (maestro.prompts) so single-agent and # the multi-step strategies are given a byte-identical output contract. The -# runtime placeholder is escaped as ``{{input_data}}`` so it survives this -# f-string and is filled later by ``.format(input_data=...)``. +# runtime placeholders are escaped (``{{...}}``) so they survive this f-string +# and are filled later by ``.format(diagram_type=..., input_data=...)``. The +# diagram type is task context, stated explicitly to every strategy so the +# notation-dependent label rules are applied uniformly (the baseline would +# otherwise have to infer it from the input metadata on its own). PROMPT_TEMPLATE = f"""\ You are given a dataset describing entities and their relationships. Your task is to generate a Mermaid diagram that accurately represents this data. +The diagram notation is: {{diagram_type}} + Rules: {render_rules()} @@ -73,7 +78,10 @@ def run( return self._error_result(config, f"Failed to read input file: {e}") formatted_input = json.dumps(input_data, indent=2) - prompt = PROMPT_TEMPLATE.format(input_data=formatted_input) + diagram_type = input_data.get("metadata", {}).get("diagram_type", "unspecified") + prompt = PROMPT_TEMPLATE.format( + diagram_type=diagram_type, input_data=formatted_input + ) # Single call: wrap result in tuple with empty sub_results result = self.provider.complete(prompt, config) diff --git a/src/maestro/strategies/sop.py b/src/maestro/strategies/sop.py index cfe2160..7acf63f 100644 --- a/src/maestro/strategies/sop.py +++ b/src/maestro/strategies/sop.py @@ -31,6 +31,7 @@ STEP_1_PROMPT, STEP_2_PROMPT, STEP_3_PROMPT, + extract_diagram_type, strip_fences, validate_step_output, ) @@ -97,6 +98,7 @@ def run( try: raw = input_file.file_path.read_text(encoding="utf-8") input_data = json.dumps(json.loads(raw), indent=2) + diagram_type = extract_diagram_type(raw) except FileNotFoundError: return self._error_result( config, f"Input file not found: {input_file.file_path}" @@ -117,7 +119,7 @@ def run( step_system_prompt = step["system_prompt"] # Build prompt with outputs from previous steps - prompt = self._build_prompt(step, input_data, step_outputs) + prompt = self._build_prompt(step, input_data, diagram_type, step_outputs) # Execute with retry sub, output_text = self._execute_step( @@ -153,6 +155,7 @@ def _build_prompt( self, step: dict, input_data: str, + diagram_type: str, step_outputs: dict[str, str], ) -> str: """ @@ -160,7 +163,7 @@ def _build_prompt( Each step gets different variables depending on what's available. """ template = step["prompt"] - fmt = {"input_data": input_data} + fmt = {"input_data": input_data, "diagram_type": diagram_type} if "extract_entities" in step_outputs: fmt["entities_json"] = step_outputs["extract_entities"] diff --git a/tests/strategies/test_step_validation.py b/tests/strategies/test_step_validation.py index 7abb3b6..2439764 100644 --- a/tests/strategies/test_step_validation.py +++ b/tests/strategies/test_step_validation.py @@ -30,10 +30,39 @@ def test_empty_output_rejected_on_every_step(): assert ok is False and err == "empty output" -def test_step3_accepts_any_nonempty_text(): - """Step 3 is free-form Mermaid: non-empty is the only rule.""" +def test_step3_accepts_wellformed_mermaid(): + """Step 3 passes any non-empty, structurally sound Mermaid.""" ok, err = validate_step_output("flowchart LR\n a --> b", 3) assert ok is True and err is None + # Multi-line labels, cylinders, and subgraphs must not trip the shape check. + diagram = ( + "flowchart LR\n" + ' user["User\\n[Person]"]\n' + ' store[("Object Storage\\n[Container]")]\n' + ' subgraph infomaniak["Infomaniak"]\n' + ' web["SomeApp"]\n' + " end\n" + " user --> web" + ) + ok, err = validate_step_output(diagram, 3) + assert ok is True and err is None + + +def test_step3_rejects_empty_label_bracket(): + """The empty-label malformation a weak model emitted must consume a retry, + not pass as non-empty and land as a scored parse failure.""" + for bad in ('a[""]', "a['']", 'gw{""}', 'n(("")) '): + ok, err = validate_step_output(f"flowchart LR\n {bad}", 3) + assert ok is False and "empty node label" in err + + +def test_step3_rejects_concatenated_nodes(): + """Two node defs concatenated without a separator (the other observed + failure, e.g. ``a[""]b["B"]``) is rejected.""" + ok, err = validate_step_output('flowchart LR\n pool[""]web["W"]', 3) + assert ok is False + # Either malformation may match first; both are real defects in this line. + assert "empty node label" in err or "concatenated" in err def test_steps_1_2_still_apply_json_shape(): diff --git a/tests/test_prompts.py b/tests/test_prompts.py index 1d210a2..b49fe16 100644 --- a/tests/test_prompts.py +++ b/tests/test_prompts.py @@ -64,11 +64,26 @@ def test_rules_snapshot(): "parseable\n" "- If a node has no label, write just its id (e.g. gw_result): " 'never an empty bracket like node_id[""]\n' + "- Build each node label according to the diagram notation. For " + "architecture and infrastructure diagrams (C4 container, network " + "topology), put the entity name on the first line, the type on a " + "second line wrapped in square brackets and title-cased " + "(external_system becomes [External System], device becomes [Device]), " + "and a short technology or kind label on a third line only when the " + "input gives one, joined with a literal \\n inside the quotes, e.g. " + 'node_id["SomeApp\\n[Container]\\nWeb Application"]. Keep the third ' + "line to a few words; never put a full sentence or description in a " + "label. For process diagrams (BPMN process, BPMN collaboration), use " + 'the entity name alone with no type line, e.g. task_1["Task 1"]\n' "- Quote edge labels the same way, with no spaces inside the pipes, " 'e.g. a -->|"My edge"| b; for an unlabelled edge use a plain arrow ' "a --> b and never an empty label like -->|| or -->| |\n" "- Include every entity and relationship from the input\n" - "- Preserve hierarchy using subgraphs for pools, lanes, and subprocesses\n" + "- Preserve hierarchy using subgraphs for any grouping the input " + "gives, such as pools, lanes, subprocesses, system boundaries, or " + "deployment environments. Always give a subgraph a quoted label from " + "the group's name, e.g. subgraph infomaniak[\"Infomaniak Public " + 'Cloud"]; never write an empty subgraph label like subgraph id[""]\n' "- Do not invent entities or relationships not present in the input\n" "- Do not include explanations or markdown code fences\n" "- Do not use internal or relationship IDs as edge labels" @@ -133,13 +148,17 @@ def test_single_and_step3_inject_identical_rules(): def test_templates_keep_runtime_placeholders(): """The .format() placeholders must survive the f-string composition.""" assert "{input_data}" in PROMPT_TEMPLATE + assert "{diagram_type}" in PROMPT_TEMPLATE assert "{entities_json}" in STEP_3_PROMPT assert "{relationships_json}" in STEP_3_PROMPT + assert "{diagram_type}" in STEP_3_PROMPT def test_templates_format_without_stray_braces(): - PROMPT_TEMPLATE.format(input_data="{}") - STEP_3_PROMPT.format(entities_json="[]", relationships_json="[]") + PROMPT_TEMPLATE.format(diagram_type="c4_container", input_data="{}") + STEP_3_PROMPT.format( + diagram_type="bpmn_process", entities_json="[]", relationships_json="[]" + ) # --------------------------------------------------------------------------- From 24cee1b74fba72cdc326f117a94f329d3fd493c7 Mon Sep 17 00:00:00 2001 From: Colinho22 <48288595+Colinho22@users.noreply.github.com> Date: Sat, 20 Jun 2026 12:48:52 +0200 Subject: [PATCH 3/7] fix(strategies): harden diagram_type extraction and step-3 validation - extract_diagram_type normalises present-but-malformed values (null, a number, blank) to "unspecified"; get()'s default only guarded a missing key. - step-3 concatenation check blanks quoted labels first, so brackets inside a label (e.g. a["Service [v1] Gateway"]) are no longer a false positive. - single-agent shares extract_diagram_type instead of duplicating the lookup, so the baseline gets the same edge-case handling as the multi-step strategies. - crew marker strip uses rfind so an appended scaffold is cut even if the marker phrase ever appears in the prompt body. --- src/maestro/strategies/_extraction.py | 28 ++++++++++++++++-------- src/maestro/strategies/crew.py | 6 +++-- src/maestro/strategies/single.py | 3 ++- tests/strategies/test_step_validation.py | 23 +++++++++++++++++++ 4 files changed, 48 insertions(+), 12 deletions(-) diff --git a/src/maestro/strategies/_extraction.py b/src/maestro/strategies/_extraction.py index 79609ae..1ec7d58 100644 --- a/src/maestro/strategies/_extraction.py +++ b/src/maestro/strategies/_extraction.py @@ -7,8 +7,10 @@ differs between strategies: what each strategy file expresses in its own shape. -Single-agent does not import from here: its baseline prompt is intentionally -distinct (one shot -> diagram, no decomposition) and lives in `single.py`. +Single-agent does not reuse the multi-step *contract* from here: its baseline +prompt is intentionally distinct (one shot -> diagram, no decomposition) and +lives in `single.py`. It does share the small ``extract_diagram_type`` utility +so the notation context is read identically for every strategy. Adding a new multi-step strategy? - Reuse the prompts, schemas and validators from this module unchanged. @@ -158,16 +160,19 @@ def extract_diagram_type(raw_input: str) -> str: Read ``metadata.diagram_type`` from a raw input JSON string. Shared by every strategy so the render step's notation context is derived - identically. Falls back to "unspecified" if the field is missing or the - input is unparseable, so a malformed input still renders rather than - crashing the strategy (the diagram simply gets no notation hint). + identically. Falls back to "unspecified" if the field is missing, blank, not + a string, or the input is unparseable, so a malformed input still renders + rather than crashing the strategy (the diagram simply gets no notation hint). + The get() default only guards a missing key, so a present-but-malformed value + (null, a number, "") is normalised here before it reaches a prompt. """ try: - return ( - json.loads(raw_input).get("metadata", {}).get("diagram_type", "unspecified") - ) + value = json.loads(raw_input).get("metadata", {}).get("diagram_type") except (json.JSONDecodeError, TypeError, AttributeError): return "unspecified" + if not isinstance(value, str) or not value.strip(): + return "unspecified" + return value def strip_fences(text: str | None) -> str | None: @@ -244,6 +249,9 @@ def validate_step_output(text: str | None, step_number: int) -> tuple[bool, str # via mmdc); it must never reject a well-formed diagram. _EMPTY_LABEL = re.compile(r"""[\[\(\{]+\s*(["'])\1\s*[\]\)\}]+""") _CONCATENATED_NODES = re.compile(r"[\]\)\}]\s*\w+\s*[\[\(\{]") +# Quoted label spans, blanked before the concatenation scan so brackets INSIDE +# a label (e.g. a["Service [v1] Gateway"]) cannot be misread as two node defs. +_QUOTED_SPAN = re.compile(r"\"[^\"]*\"|'[^']*'") def _validate_mermaid_shape(text: str) -> tuple[bool, str | None]: @@ -251,7 +259,9 @@ def _validate_mermaid_shape(text: str) -> tuple[bool, str | None]: if _EMPTY_LABEL.search(text): return False, 'empty node label bracket (e.g. node_id[""])' for raw in text.splitlines(): - line = raw.strip() + # Blank quoted labels first: only structural brackets (not bracket + # characters inside a label) should count toward concatenation. + line = _QUOTED_SPAN.sub('""', raw.strip()) if _CONCATENATED_NODES.search(line): return False, "node definitions concatenated without a separator" return True, None diff --git a/src/maestro/strategies/crew.py b/src/maestro/strategies/crew.py index a19b549..a8a743f 100644 --- a/src/maestro/strategies/crew.py +++ b/src/maestro/strategies/crew.py @@ -184,8 +184,10 @@ def _collapse_messages(cls, messages) -> str: ] text = "\n\n".join(p for p in user_parts if p) # Drop CrewAI's expected-output block (and anything after it) so only - # our task prompt reaches the provider. - marker = text.find(cls._CREW_EXPECTED_OUTPUT_MARKER) + # our task prompt reaches the provider. rfind, not find: the scaffold is + # appended after the task description, so the last occurrence is the one + # to cut at even if the marker phrase ever appears in the prompt itself. + marker = text.rfind(cls._CREW_EXPECTED_OUTPUT_MARKER) if marker != -1: text = text[:marker] return text.rstrip() diff --git a/src/maestro/strategies/single.py b/src/maestro/strategies/single.py index 0264d44..6525dc3 100644 --- a/src/maestro/strategies/single.py +++ b/src/maestro/strategies/single.py @@ -23,6 +23,7 @@ from maestro.prompts import render_rules from maestro.schemas import InputFile, RunConfig, RunResult, SubResult +from maestro.strategies._extraction import extract_diagram_type from maestro.strategies.base import BaseStrategy # Rules come from the canonical contract (maestro.prompts) so single-agent and @@ -78,7 +79,7 @@ def run( return self._error_result(config, f"Failed to read input file: {e}") formatted_input = json.dumps(input_data, indent=2) - diagram_type = input_data.get("metadata", {}).get("diagram_type", "unspecified") + diagram_type = extract_diagram_type(raw) prompt = PROMPT_TEMPLATE.format( diagram_type=diagram_type, input_data=formatted_input ) diff --git a/tests/strategies/test_step_validation.py b/tests/strategies/test_step_validation.py index 2439764..a6cb842 100644 --- a/tests/strategies/test_step_validation.py +++ b/tests/strategies/test_step_validation.py @@ -9,11 +9,34 @@ from __future__ import annotations from maestro.strategies._extraction import ( + extract_diagram_type, validate_step_output, validate_step_payload, ) +def test_extract_diagram_type_normalizes_malformed_values(): + """get()'s default only guards a missing key; a present-but-malformed value + (null, a number, blank) must still fall back to the string "unspecified".""" + assert extract_diagram_type('{"metadata": {"diagram_type": "c4_container"}}') == ( + "c4_container" + ) + assert extract_diagram_type('{"metadata": {"diagram_type": null}}') == "unspecified" + assert extract_diagram_type('{"metadata": {"diagram_type": 123}}') == "unspecified" + assert extract_diagram_type('{"metadata": {"diagram_type": ""}}') == "unspecified" + assert extract_diagram_type('{"metadata": null}') == "unspecified" + assert extract_diagram_type("not json") == "unspecified" + + +def test_step3_allows_brackets_inside_quoted_labels(): + """Bracket characters inside a quoted label (e.g. a version tag) must not be + misread as concatenated node definitions.""" + ok, err = validate_step_output( + 'flowchart LR\n a["Service [v1] Gateway [public]"]', 3 + ) + assert ok is True and err is None + + def test_empty_output_rejected_on_every_step(): """Empty or whitespace output is a failure on all three steps. From 625bb81603e1671575d76b65617a030482bad91e Mon Sep 17 00:00:00 2001 From: Colinho22 <48288595+Colinho22@users.noreply.github.com> Date: Sat, 20 Jun 2026 15:25:17 +0200 Subject: [PATCH 4/7] fix(strategies): catch unbalanced subgraph/end in step-3 validation A dropped `end` leaves a subgraph open: invalid Mermaid that mmdc rejects but the prior checks passed, so it scored as a parse failure instead of consuming the retry budget. Deeply nested diagrams (pools > lanes > subprocesses, network zones) are where a weaker model drops one. Count `subgraph` openers against standalone `end` closers and reject an imbalance; `end` is matched only as a whole-line closer so node ids like end_event_1 and labels like "End Event" never count. Verified against all 30 ground truths (zero false positives). Also pins extract_diagram_type on a whitespace-only diagram_type value. --- src/maestro/strategies/_extraction.py | 18 +++++++++++++++--- tests/strategies/test_step_validation.py | 24 ++++++++++++++++++++++++ 2 files changed, 39 insertions(+), 3 deletions(-) diff --git a/src/maestro/strategies/_extraction.py b/src/maestro/strategies/_extraction.py index 1ec7d58..d694816 100644 --- a/src/maestro/strategies/_extraction.py +++ b/src/maestro/strategies/_extraction.py @@ -255,13 +255,25 @@ def validate_step_output(text: str | None, step_number: int) -> tuple[bool, str def _validate_mermaid_shape(text: str) -> tuple[bool, str | None]: - """Reject the two step-3 malformations we have actually observed.""" + """Reject the structural malformations we have actually observed.""" if _EMPTY_LABEL.search(text): return False, 'empty node label bracket (e.g. node_id[""])' + # Subgraph nesting must balance: an unclosed subgraph (a dropped `end`) is + # invalid Mermaid that mmdc rejects, so flag it here to consume the retry + # rather than scoring as a parse failure. Deeply nested diagrams (pools > + # lanes > subprocesses, network zones) are where a model drops one. `end` + # is counted only as a standalone closer line so node ids and labels that + # merely contain "end" (end_event_1, "End Event") never match. + opens = closes = 0 for raw in text.splitlines(): - # Blank quoted labels first: only structural brackets (not bracket - # characters inside a label) should count toward concatenation. line = _QUOTED_SPAN.sub('""', raw.strip()) if _CONCATENATED_NODES.search(line): return False, "node definitions concatenated without a separator" + stripped = raw.strip() + if stripped.startswith("subgraph "): + opens += 1 + elif stripped == "end": + closes += 1 + if opens != closes: + return False, f"unbalanced subgraph/end ({opens} subgraph, {closes} end)" return True, None diff --git a/tests/strategies/test_step_validation.py b/tests/strategies/test_step_validation.py index a6cb842..fcf1eb5 100644 --- a/tests/strategies/test_step_validation.py +++ b/tests/strategies/test_step_validation.py @@ -24,6 +24,9 @@ def test_extract_diagram_type_normalizes_malformed_values(): assert extract_diagram_type('{"metadata": {"diagram_type": null}}') == "unspecified" assert extract_diagram_type('{"metadata": {"diagram_type": 123}}') == "unspecified" assert extract_diagram_type('{"metadata": {"diagram_type": ""}}') == "unspecified" + assert extract_diagram_type('{"metadata": {"diagram_type": " "}}') == ( + "unspecified" + ) assert extract_diagram_type('{"metadata": null}') == "unspecified" assert extract_diagram_type("not json") == "unspecified" @@ -37,6 +40,27 @@ def test_step3_allows_brackets_inside_quoted_labels(): assert ok is True and err is None +def test_step3_rejects_unbalanced_subgraph(): + """A dropped `end` is invalid Mermaid mmdc would reject; flag it so the + retry budget applies instead of scoring a parse failure.""" + missing_end = ( + 'flowchart LR\nsubgraph p1["Pool"]\n a["A"]\nsubgraph p2["P2"]\n b["B"]\nend' + ) + ok, err = validate_step_output(missing_end, 3) + assert ok is False and "unbalanced subgraph" in err + + +def test_step3_balance_ignores_end_in_ids_and_labels(): + """`end` counts only as a standalone closer line: node ids like end_event_1 + and labels like "End Event" must not be read as closers.""" + diagram = ( + 'flowchart LR\nsubgraph p["Pool"]\n' + ' end_event_1(["End Event 1"])\n task_1["Task end here"]\nend' + ) + ok, err = validate_step_output(diagram, 3) + assert ok is True and err is None + + def test_empty_output_rejected_on_every_step(): """Empty or whitespace output is a failure on all three steps. From fccc5ac94ab8d9c6669c492ff04407e50c148f65 Mon Sep 17 00:00:00 2001 From: Colinho22 <48288595+Colinho22@users.noreply.github.com> Date: Sat, 20 Jun 2026 15:25:54 +0200 Subject: [PATCH 5/7] fix(metrics): score only the input-derivable part of a node label Two ground-truth authoring conventions are not derivable from the input, so scoring them penalised models for an unreachable target: - The optional third descriptor line of a C4 / network label is authored inconsistently (network topology includes it for some nodes and omits it for others, though the input always carries the field). The entity-name metric now compares on the name and [Type] lines only. - Nodes the input leaves unnamed (BPMN gateways/events with name "") are labelled in the ground truth from convention the input never supplies (type name, unicode symbols, split/join). These are scored by id match only. This is conditional on the input: a node the input did name is still scored on its label, so blanking a nameable node is still penalised. evaluate_run takes an optional input_path to read the unnamed-id set; it is backward compatible (omitted means strict label scoring) and fails soft on a missing or malformed input. On the pre-flight, network name_f1 rose from ~0.28-0.38 to ~0.88-1.0 with no regression on BPMN or C4. --- src/maestro/analysis/metrics.py | 126 ++++++++++++++++++++++++--- src/maestro/run.py | 1 + tests/analysis/test_label_scoring.py | 70 +++++++++++++++ 3 files changed, 184 insertions(+), 13 deletions(-) create mode 100644 tests/analysis/test_label_scoring.py diff --git a/src/maestro/analysis/metrics.py b/src/maestro/analysis/metrics.py index 48cac95..08b2d98 100644 --- a/src/maestro/analysis/metrics.py +++ b/src/maestro/analysis/metrics.py @@ -10,6 +10,7 @@ from __future__ import annotations +import json import os import re import shutil @@ -106,20 +107,70 @@ def check_mermaid_valid(diagram_code: str) -> tuple[bool | None, str | None]: # --------------------------------------------------------------------------- +def extract_input_unnamed_ids(input_path: Path | None) -> set[str]: + """ + Ids of input elements with an empty ``name`` (e.g. BPMN gateways/events the + source leaves unnamed). Their ground-truth label is authoring convention the + input does not provide, so the entity-name metric scores them by id only. + + Returns an empty set when no input path is given or the input cannot be read + or parsed, so scoring degrades to the strict label comparison rather than + crashing (observability fails soft). + """ + if input_path is None: + return set() + try: + data = json.loads(Path(input_path).read_text(encoding="utf-8")) + except (OSError, json.JSONDecodeError): + return set() + elements = data.get("elements") or data.get("nodes") or [] + if not isinstance(elements, list): + return set() + return { + e["id"] + for e in elements + if isinstance(e, dict) and e.get("id") and not (e.get("name") or "").strip() + } + + +def _label_core(label: str) -> str: + r""" + Keep the scored part of a multi-line node label: the name and the bracketed + ``[Type]`` line, dropping any trailing descriptor line. + + Labels are ``name`` (BPMN), ``name\n[Type]`` or ``name\n[Type]\ndescriptor`` + (C4 / network), with ``\n`` as a literal two-character separator. The third + descriptor line is authored inconsistently in the ground truth (network + topology includes it for some nodes and not others, though the input always + carries the field), so no model can predict it. Scoring it would penalise a + correct name and type for an unpredictable authoring choice, so the entity + name metric compares on name + type only. The descriptor is out of the + scored contract by design; this is applied to output and truth identically. + """ + parts = label.split("\\n") + kept = [parts[0]] + for p in parts[1:]: + if p.strip().startswith("["): # the [Type] line; descriptor follows + kept.append(p) + break + return "\\n".join(kept) + + def _normalize_label(label: str) -> str: """ - Basic normalization: lowercase, strip whitespace. + Basic normalization: drop the descriptor line, lowercase, strip whitespace. Used for raw fuzzy matching: no linguistic processing. """ - return label.strip().lower() + return _label_core(label).strip().lower() def _lemmatize_label(label: str) -> str: """ - Normalize + lemmatize: lowercase, strip plurals, collapse separators. - Catches 'Tasks' -> 'task', 'start_event_1' -> 'start event 1'. + Normalize + lemmatize: drop the descriptor line, lowercase, strip plurals, + collapse separators. Catches 'Tasks' -> 'task', 'start_event_1' -> 'start + event 1'. """ - text = label.strip().lower() + text = _label_core(label).strip().lower() # Replace underscores and hyphens with spaces text = re.sub(r"[_\-]", " ", text) # Strip trailing 's' for basic plural handling @@ -372,18 +423,37 @@ def _fuzzy_match( output_nodes: list[dict], truth_nodes: list[dict], normalizer, + input_unnamed_ids: set[str] | None = None, ) -> tuple[float, float, float]: """ Fuzzy name matching with a configurable normalizer function. Used for both raw and lemmatized matching. + + ``input_unnamed_ids`` are node ids the *input* left without a name (e.g. a + BPMN gateway with name ""). The ground truth labels these from convention + (type name, unicode symbols, split/join) that the input does not provide, so + the model cannot derive the label. For such a node, an id match counts as a + name match regardless of the produced label, including a blank one. This is + conditional on the input: a node the input *did* name is still scored on its + label, so a model that blanks a nameable node is still penalised. """ if not output_nodes or not truth_nodes: return (0.0, 0.0, 0.0) + unnamed = input_unnamed_ids or set() + truth_ids = {t["id"]: i for i, t in enumerate(truth_nodes)} matched_truth = set() correct = 0 for out_node in output_nodes: + # Input-unnamed node: an id match is a name match, label not scored. + if out_node["id"] in unnamed and out_node["id"] in truth_ids: + idx = truth_ids[out_node["id"]] + if idx not in matched_truth: + correct += 1 + matched_truth.add(idx) + continue + out_label = normalizer(out_node["label"]) best_score = 0.0 best_idx = None @@ -407,17 +477,21 @@ def _fuzzy_match( def compute_entity_metrics_fuzzy( - output_nodes: list[dict], truth_nodes: list[dict] + output_nodes: list[dict], + truth_nodes: list[dict], + input_unnamed_ids: set[str] | None = None, ) -> tuple[float, float, float]: """Fuzzy name match with basic normalization (lowercase only).""" - return _fuzzy_match(output_nodes, truth_nodes, _normalize_label) + return _fuzzy_match(output_nodes, truth_nodes, _normalize_label, input_unnamed_ids) def compute_entity_metrics_lemma( - output_nodes: list[dict], truth_nodes: list[dict] + output_nodes: list[dict], + truth_nodes: list[dict], + input_unnamed_ids: set[str] | None = None, ) -> tuple[float, float, float]: """Fuzzy name match with lemmatization (lowercase + strip plurals).""" - return _fuzzy_match(output_nodes, truth_nodes, _lemmatize_label) + return _fuzzy_match(output_nodes, truth_nodes, _lemmatize_label, input_unnamed_ids) # --------------------------------------------------------------------------- @@ -513,11 +587,21 @@ def compute_attachment_metrics( # --------------------------------------------------------------------------- -def compute_entity_taxonomy(output_nodes: list[dict], truth_nodes: list[dict]) -> dict: +def compute_entity_taxonomy( + output_nodes: list[dict], + truth_nodes: list[dict], + input_unnamed_ids: set[str] | None = None, +) -> dict: """ Count entity-level errors by taxonomy category. Returns: {"missing": int, "extra": int, "false": int, "duplicate": int} + + A "false" entity is an id match with a mismatched label. Nodes the input + left unnamed (``input_unnamed_ids``) are exempt: their ground-truth label is + convention the input does not provide, so a label mismatch there is not a + model error (see ``_fuzzy_match``). """ + unnamed = input_unnamed_ids or set() output_ids = [n["id"] for n in output_nodes] truth_ids = {n["id"] for n in truth_nodes} @@ -540,6 +624,8 @@ def compute_entity_taxonomy(output_nodes: list[dict], truth_nodes: list[dict]) - false_count = 0 for nid in shared_ids: + if nid in unnamed: + continue # input gave no name; label is not the model's to get right similarity = _fuzzy_score( _normalize_label(output_labels[nid]), _normalize_label(truth_labels[nid]), @@ -604,10 +690,16 @@ def evaluate_run( run_id: UUID, output_diagram_code: str, ground_truth_path: Path, + input_path: Path | None = None, ) -> MetricResult: """ Full evaluation pipeline for one run. Compares generated diagram against ground truth file. + + ``input_path`` is the source JSON. When given, elements it leaves unnamed + are scored by id only for the entity-name metric, since their ground-truth + label is convention the input does not supply. Optional and backward + compatible: omitted means strict label scoring for every node. """ try: truth_code = ground_truth_path.read_text(encoding="utf-8") @@ -659,10 +751,18 @@ def evaluate_run( output_attachments = extract_attachments(output_diagram_code) truth_attachments = extract_attachments(truth_code) + # Ids the input left unnamed: their label is GT convention, not derivable, + # so the name/lemma metrics and the false-entity count score them by id. + unnamed_ids = extract_input_unnamed_ids(input_path) + # 3. Entity metrics: three levels id_p, id_r, id_f1 = compute_entity_metrics_exact(output_nodes, truth_nodes) - name_p, name_r, name_f1 = compute_entity_metrics_fuzzy(output_nodes, truth_nodes) - lemma_p, lemma_r, lemma_f1 = compute_entity_metrics_lemma(output_nodes, truth_nodes) + name_p, name_r, name_f1 = compute_entity_metrics_fuzzy( + output_nodes, truth_nodes, unnamed_ids + ) + lemma_p, lemma_r, lemma_f1 = compute_entity_metrics_lemma( + output_nodes, truth_nodes, unnamed_ids + ) # 4. Relationship metrics: two levels rel_p, rel_r, rel_f1 = compute_relationship_metrics_relaxed( @@ -673,7 +773,7 @@ def evaluate_run( ) # 5. Error taxonomy - entity_tax = compute_entity_taxonomy(output_nodes, truth_nodes) + entity_tax = compute_entity_taxonomy(output_nodes, truth_nodes, unnamed_ids) relationship_tax = compute_relationship_taxonomy( output_relationships, truth_relationships ) diff --git a/src/maestro/run.py b/src/maestro/run.py index 27ae9e8..2488af4 100644 --- a/src/maestro/run.py +++ b/src/maestro/run.py @@ -615,6 +615,7 @@ def _execute_cell( run_id=config.run_id, output_diagram_code=result.output_diagram_code, ground_truth_path=input_file.ground_truth_path, + input_path=input_file.file_path, ) except Exception as exc: traceback.print_exc(file=sys.stderr) diff --git a/tests/analysis/test_label_scoring.py b/tests/analysis/test_label_scoring.py new file mode 100644 index 0000000..94edf46 --- /dev/null +++ b/tests/analysis/test_label_scoring.py @@ -0,0 +1,70 @@ +""" +Label-scoring policy: what the entity-name metric does and does not score. + +Two ground-truth authoring conventions are not derivable from the input, so the +metric excludes them rather than penalising a model for an unreachable target: + + * the optional third descriptor line of a C4 / network label, authored + inconsistently across nodes, is dropped before comparison; + * nodes the input leaves unnamed (BPMN gateways/events with name "") are + scored by id only, since their ground-truth label is pure convention. + +The second is conditional on the input: a node the input *did* name is still +scored on its label, so a model that blanks a nameable node is still penalised. +""" + +from __future__ import annotations + +from maestro.analysis.metrics import ( + _label_core, + compute_entity_metrics_fuzzy, + extract_input_unnamed_ids, +) + + +def test_label_core_keeps_name_and_type_drops_descriptor(): + assert _label_core("Task 1") == "Task 1" + assert _label_core("User\\n[Person]") == "User\\n[Person]" + # third descriptor line dropped; name + [Type] kept + assert _label_core("SomeApp\\n[Container]\\nWeb Application") == ( + "SomeApp\\n[Container]" + ) + + +def test_descriptor_line_not_scored(): + """A mismatched third line must not sink an otherwise correct label.""" + out = [{"id": "a", "label": "Router\\n[Device]\\nNetwork Switch"}] + truth = [{"id": "a", "label": "Router\\n[Device]"}] + _, _, f1 = compute_entity_metrics_fuzzy(out, truth) + assert f1 == 1.0 + + +def test_input_unnamed_node_scored_by_id_only(): + """A gateway the input left unnamed: id match counts even though the model's + label cannot match the convention-authored ground-truth label.""" + out = [{"id": "pgw_3", "label": ""}] + truth = [{"id": "pgw_3", "label": "+ Parallel Split"}] + # Without the exemption, the empty label would never reach threshold. + _, _, f1_strict = compute_entity_metrics_fuzzy(out, truth) + assert f1_strict < 1.0 + # With the input marking pgw_3 unnamed, the id match is a name match. + _, _, f1_exempt = compute_entity_metrics_fuzzy(out, truth, {"pgw_3"}) + assert f1_exempt == 1.0 + + +def test_exemption_does_not_excuse_a_nameable_node(): + """The guardrail: a node the input *named* is still scored on its label, so a + model that blanks it is still penalised even if other nodes are exempt.""" + out = [{"id": "task_1", "label": ""}] + truth = [{"id": "task_1", "label": "Receive Travel Request"}] + # task_1 is NOT in the unnamed set, so the blank label is still wrong. + _, _, f1 = compute_entity_metrics_fuzzy(out, truth, {"pgw_3"}) + assert f1 < 1.0 + + +def test_extract_input_unnamed_ids_handles_missing_and_malformed(): + assert extract_input_unnamed_ids(None) == set() + # nonexistent path fails soft to an empty set + from pathlib import Path + + assert extract_input_unnamed_ids(Path("/no/such/input.json")) == set() From 53780cd7191a8105ad097b054139d7fec32177ba Mon Sep 17 00:00:00 2001 From: Colinho22 <48288595+Colinho22@users.noreply.github.com> Date: Sat, 20 Jun 2026 17:07:15 +0200 Subject: [PATCH 6/7] fix(strategies): count anonymous subgraph openers in balance check A bare `subgraph` (no id) is a valid opener, but the balance check only matched `subgraph ` with a trailing space, so an anonymous subgraph would be counted as 0 opens against its 1 `end` and falsely rejected as unbalanced. Match the bare form too. None appear in the corpus, but a model emitting one during a run should not produce a spurious failed cell. --- src/maestro/strategies/_extraction.py | 5 ++++- tests/strategies/test_step_validation.py | 8 ++++++++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/src/maestro/strategies/_extraction.py b/src/maestro/strategies/_extraction.py index d694816..33eb2c0 100644 --- a/src/maestro/strategies/_extraction.py +++ b/src/maestro/strategies/_extraction.py @@ -270,7 +270,10 @@ def _validate_mermaid_shape(text: str) -> tuple[bool, str | None]: if _CONCATENATED_NODES.search(line): return False, "node definitions concatenated without a separator" stripped = raw.strip() - if stripped.startswith("subgraph "): + # An anonymous subgraph (bare "subgraph", no id) is valid Mermaid and + # still opens a block, so count it too: missing it would falsely reject + # a balanced diagram as unbalanced. + if stripped.startswith("subgraph ") or stripped == "subgraph": opens += 1 elif stripped == "end": closes += 1 diff --git a/tests/strategies/test_step_validation.py b/tests/strategies/test_step_validation.py index fcf1eb5..7d48775 100644 --- a/tests/strategies/test_step_validation.py +++ b/tests/strategies/test_step_validation.py @@ -61,6 +61,14 @@ def test_step3_balance_ignores_end_in_ids_and_labels(): assert ok is True and err is None +def test_step3_balance_counts_anonymous_subgraph(): + """A bare `subgraph` (no id) is a valid opener; its `end` must balance it, + not be read as an extra closer.""" + diagram = 'flowchart LR\nsubgraph\n a["A"]\nend' + ok, err = validate_step_output(diagram, 3) + assert ok is True and err is None + + def test_empty_output_rejected_on_every_step(): """Empty or whitespace output is a failure on all three steps. From cbdbe7d739e9ddd504a86309bc3130221eb65023 Mon Sep 17 00:00:00 2001 From: Colinho22 <48288595+Colinho22@users.noreply.github.com> Date: Sat, 20 Jun 2026 21:11:10 +0200 Subject: [PATCH 7/7] fix(metrics): drop redundant Path() wrapper in extract_input_unnamed_ids input_path is already Path after the None guard, so wrapping it in Path() before read_text() was a no-op. --- src/maestro/analysis/metrics.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/maestro/analysis/metrics.py b/src/maestro/analysis/metrics.py index 08b2d98..8c2908a 100644 --- a/src/maestro/analysis/metrics.py +++ b/src/maestro/analysis/metrics.py @@ -120,7 +120,7 @@ def extract_input_unnamed_ids(input_path: Path | None) -> set[str]: if input_path is None: return set() try: - data = json.loads(Path(input_path).read_text(encoding="utf-8")) + data = json.loads(input_path.read_text(encoding="utf-8")) except (OSError, json.JSONDecodeError): return set() elements = data.get("elements") or data.get("nodes") or []