From c58f8d6bbb7a153ea7c4f1756ca9376dcc1afdb9 Mon Sep 17 00:00:00 2001
From: Colinho22 <48288595+Colinho22@users.noreply.github.com>
Date: Fri, 19 Jun 2026 23:17:13 +0200
Subject: [PATCH 1/7] fix(run): suppress CrewAI trace prompt that blocks batch
 runs

CREWAI_TRACING_ENABLED=false governs only the enabled path, not the
first-execution 'view traces? [y/N]' prompt, which blocks on stdin for a
20s timeout per crew. The file-based declined preference is wiped on every
docker --rm, so it re-fires on every cell: ~1500 crew_ai cells means hours
of dead waiting and corrupted duration_ms. CREWAI_TESTING=true short
circuits the prompt before any stdin read.
---
 src/maestro/run.py | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/src/maestro/run.py b/src/maestro/run.py
index 2f0c90e..27ae9e8 100644
--- a/src/maestro/run.py
+++ b/src/maestro/run.py
@@ -57,8 +57,19 @@
 # Silence CrewAI's interactive tracing prompt and telemetry so batch runs
 # stay non-interactive on fresh checkouts (the user's preference file is
 # machine-local and won't exist in CI / on a fresh clone).
+#
+# CREWAI_TRACING_ENABLED=false only governs the *enabled* path; it does NOT
+# stop the first-execution "view your traces? [y/N]" prompt, which blocks on
+# stdin for a 20s timeout per crew. In a headless container that file-based
+# "declined" preference is wiped every `--rm`, so the prompt fires on every
+# cell: ~1500 crew_ai cells x 20s is hours of dead waiting, and it corrupts
+# the measured duration_ms. CREWAI_TESTING=true is the only flag that short
+# circuits the prompt before any stdin read (crewai .../tracing/utils.py:
+# _is_test_environment guards both the auto-collect check and the prompt). It
+# disables trace prompts/telemetry only; it does not alter agent execution.
 os.environ.setdefault("CREWAI_TRACING_ENABLED", "false")
 os.environ.setdefault("CREWAI_DISABLE_TELEMETRY", "true")
+os.environ.setdefault("CREWAI_TESTING", "true")
 
 from maestro.analysis.metrics import evaluate_run
 from maestro.db.client import get_connection, init_db

From be99dd99e8e4b0d85aaf0b2dfe92cf7ef6b01198 Mon Sep 17 00:00:00 2001
From: Colinho22 <48288595+Colinho22@users.noreply.github.com>
Date: Sat, 20 Jun 2026 12:17:43 +0200
Subject: [PATCH 2/7] fix(prompts): notation-aware labels, IR fidelity,
 subgraph labels, step-3 validation

Empirical review of strategy outputs found the multi-step strategies
scored entity_name_f1 near 0 on C4 inputs: their JSON IR flattened the
name/[Type]/tech label structure the ground truth uses, and the shared
rules never taught that format. Fixes, verified to lift name_f1 to ~1.0
on both C4 and BPMN across all four strategies:

- Label rule is now notation-aware: C4 and network-topology diagrams get
  the name/[Type]/tech multi-line label; BPMN process and collaboration
  diagrams keep bare names. diagram_type is read from the input metadata
  and passed to every strategy's render step as task context (the baseline
  already saw it; this makes it uniform and explicit), so the convention
  is applied consistently rather than guessed.
- Step-1 IR carries a short tech label (not the long description) and
  extracts every referenced parent group as a named entity, so container
  and boundary subgraphs render with their labels instead of empty
  brackets. The hierarchy rule now covers C4 boundaries and deployment
  environments and requires a quoted subgraph label.
- Step-3 gains a light structural check that rejects empty-label brackets
  and concatenated node defs, so a malformed diagram consumes the retry
  budget instead of landing as a scored parse failure.
- Strip CrewAI's expected-output scaffolding so its delivered prompt is
  byte-identical to SOP, removing a confound between the two strategies.
---
 src/maestro/prompts.py                   |  3 +-
 src/maestro/strategies/_extraction.py    | 66 ++++++++++++++++++++++--
 src/maestro/strategies/crew.py           | 47 +++++++++++++----
 src/maestro/strategies/langgraph.py      | 10 +++-
 src/maestro/strategies/single.py         | 14 +++--
 src/maestro/strategies/sop.py            |  7 ++-
 tests/strategies/test_step_validation.py | 33 +++++++++++-
 tests/test_prompts.py                    | 25 +++++++--
 8 files changed, 178 insertions(+), 27 deletions(-)

diff --git a/src/maestro/prompts.py b/src/maestro/prompts.py
index 005b682..4f3e83e 100644
--- a/src/maestro/prompts.py
+++ b/src/maestro/prompts.py
@@ -37,9 +37,10 @@
 - Output only valid Mermaid syntax
 - Wrap node labels in double quotes, e.g. node_id["My Label"], so labels with spaces, parentheses, slashes, or line breaks stay parseable
 - If a node has no label, write just its id (e.g. gw_result): never an empty bracket like node_id[""]
+- Build each node label according to the diagram notation. For architecture and infrastructure diagrams (C4 container, network topology), put the entity name on the first line, the type on a second line wrapped in square brackets and title-cased (external_system becomes [External System], device becomes [Device]), and a short technology or kind label on a third line only when the input gives one, joined with a literal \\n inside the quotes, e.g. node_id["SomeApp\\n[Container]\\nWeb Application"]. Keep the third line to a few words; never put a full sentence or description in a label. For process diagrams (BPMN process, BPMN collaboration), use the entity name alone with no type line, e.g. task_1["Task 1"]
 - Quote edge labels the same way, with no spaces inside the pipes, e.g. a -->|"My edge"| b; for an unlabelled edge use a plain arrow a --> b and never an empty label like -->|| or -->| |
 - Include every entity and relationship from the input
-- Preserve hierarchy using subgraphs for pools, lanes, and subprocesses
+- Preserve hierarchy using subgraphs for any grouping the input gives, such as pools, lanes, subprocesses, system boundaries, or deployment environments. Always give a subgraph a quoted label from the group's name, e.g. subgraph infomaniak["Infomaniak Public Cloud"]; never write an empty subgraph label like subgraph id[""]
 - Do not invent entities or relationships not present in the input
 - Do not include explanations or markdown code fences
 - Do not use internal or relationship IDs as edge labels"""
diff --git a/src/maestro/strategies/_extraction.py b/src/maestro/strategies/_extraction.py
index a2f0288..79609ae 100644
--- a/src/maestro/strategies/_extraction.py
+++ b/src/maestro/strategies/_extraction.py
@@ -19,6 +19,7 @@
 from __future__ import annotations
 
 import json
+import re
 
 from maestro.prompts import render_rules
 
@@ -32,8 +33,18 @@
 
 Rules:
 - Output valid JSON only, no explanations, no markdown fencing
+- Extract only the actual elements or nodes; never turn a metadata or summary
+  field into an entity
 - Include every entity from the input
-- Capture parent-child relationships (pools, lanes, subprocesses)
+- Capture parent-child relationships (pools, lanes, subprocesses, system
+  boundaries, deployment environments). When an element names a parent group
+  (a lane, pool, boundary, or environment), include that group itself as an
+  entity with its own id and name, so the group can be drawn as a labeled
+  subgraph; never leave a referenced parent without a name.
+- Copy name and type from the input verbatim. For tech, copy a short technology
+  or kind label if the input gives one (a "technology" field or similar), not a
+  long sentence; leave it null when the input has none. This is the third label
+  line, so keep it short.
 - Use this exact schema:
 {{
   "entities": [
@@ -41,6 +52,7 @@
       "id": "string",
       "name": "string",
       "type": "string",
+      "tech": "string or null",
       "parent_id": "string or null"
     }}
   ]
@@ -91,11 +103,17 @@
 # Rules come from the canonical contract (maestro.prompts) so step 3 and the
 # single-agent baseline are given a byte-identical output contract. The runtime
 # placeholders are escaped (``{{...}}``) so they survive this f-string and are
-# filled later by ``.format(entities_json=..., relationships_json=...)``.
+# filled later by ``.format(diagram_type=..., entities_json=...,
+# relationships_json=...)``. The diagram type is task context (it is in the
+# input metadata, which the single-agent baseline already sees): the label
+# rules differ by notation, so the render step must know which notation it is
+# producing, the same way a person would be told "draw this as a C4 diagram".
 STEP_3_PROMPT = f"""\
 You are given a set of entities and relationships extracted from a dataset.
 Your task is to generate a Mermaid diagram that accurately represents them.
 
+The diagram notation is: {{diagram_type}}
+
 Rules:
 {render_rules()}
 
@@ -135,6 +153,23 @@
 # ---------------------------------------------------------------------------
 
 
+def extract_diagram_type(raw_input: str) -> str:
+    """
+    Read ``metadata.diagram_type`` from a raw input JSON string.
+
+    Shared by every strategy so the render step's notation context is derived
+    identically. Falls back to "unspecified" if the field is missing or the
+    input is unparseable, so a malformed input still renders rather than
+    crashing the strategy (the diagram simply gets no notation hint).
+    """
+    try:
+        return (
+            json.loads(raw_input).get("metadata", {}).get("diagram_type", "unspecified")
+        )
+    except (json.JSONDecodeError, TypeError, AttributeError):
+        return "unspecified"
+
+
 def strip_fences(text: str | None) -> str | None:
     """
     Remove markdown code fences if present.
@@ -187,8 +222,9 @@ def validate_step_output(text: str | None, step_number: int) -> tuple[bool, str
     Rejects empty output on every step (strip_fences can empty an otherwise
     non-empty response, e.g. bare ``` fences, which would otherwise pass the
     provider success check and leave step 3 with no diagram), then applies the
-    step-1/step-2 JSON shape check via validate_step_payload. Step 3 has no
-    further shape rule once it is non-empty.
+    step-1/step-2 JSON shape check via validate_step_payload. Step 3 gets a
+    light structural check so a malformed diagram consumes the retry budget
+    instead of passing as "non-empty" and landing as a scored parse failure.
 
     Returns (is_valid, error_message); error_message is None on success.
     """
@@ -196,4 +232,26 @@ def validate_step_output(text: str | None, step_number: int) -> tuple[bool, str
         return False, "empty output"
     if step_number < 3:
         return validate_step_payload(text, step_number)
+    return _validate_mermaid_shape(text)
+
+
+# Empty-label bracket (node_id[""]) and two node defs concatenated with no
+# separator (][ as in `a[""]b["B"]`) are the exact malformations a weak model
+# emitted at step 3. They are cheap to detect with a regex and forbidden by the
+# output contract, so catching them here lets the existing MAX_RETRIES retry
+# rather than scoring a guaranteed mmdc parse failure. This is a narrow
+# structural smoke check, not full Mermaid validation (that lives in the metric
+# via mmdc); it must never reject a well-formed diagram.
+_EMPTY_LABEL = re.compile(r"""[\[\(\{]+\s*(["'])\1\s*[\]\)\}]+""")
+_CONCATENATED_NODES = re.compile(r"[\]\)\}]\s*\w+\s*[\[\(\{]")
+
+
+def _validate_mermaid_shape(text: str) -> tuple[bool, str | None]:
+    """Reject the two step-3 malformations we have actually observed."""
+    if _EMPTY_LABEL.search(text):
+        return False, 'empty node label bracket (e.g. node_id[""])'
+    for raw in text.splitlines():
+        line = raw.strip()
+        if _CONCATENATED_NODES.search(line):
+            return False, "node definitions concatenated without a separator"
     return True, None
diff --git a/src/maestro/strategies/crew.py b/src/maestro/strategies/crew.py
index a492b2d..a19b549 100644
--- a/src/maestro/strategies/crew.py
+++ b/src/maestro/strategies/crew.py
@@ -47,6 +47,7 @@
     STEP_1_PROMPT,
     STEP_2_PROMPT,
     STEP_3_PROMPT,
+    extract_diagram_type,
     strip_fences,
     validate_step_output,
 )
@@ -153,19 +154,41 @@ def call(
         # recorder and surfaces it through SubResult.error.
         return result.output_diagram_code or ""
 
-    @staticmethod
-    def _collapse_messages(messages) -> str:
+    # CrewAI appends an "expected output" block after our task description,
+    # built from its en.json "expected_output" slice. That trailing text is
+    # CrewAI scaffolding SOP never sends, so leaving it in would make crew_ai's
+    # delivered prompt differ from SOP's and confound the comparison (the two
+    # strategies must differ only in orchestration, not prompt content). We cut
+    # from this stable marker onward so the provider receives exactly the step
+    # prompt, byte-identical to SOP. The marker is verbatim from CrewAI's
+    # en.json "expected_output" slice.
+    _CREW_EXPECTED_OUTPUT_MARKER = (
+        "\nThis is the expected criteria for your final answer:"
+    )
+
+    @classmethod
+    def _collapse_messages(cls, messages) -> str:
         """
         CrewAI's ``Agent`` + ``Task`` machinery composes a list of messages
-        with role 'system' (agent persona) and 'user' (task description). We
-        ignore CrewAI's system message (our system prompt is set explicitly
-        per step via ``system_prompt_override``) and concatenate the user
-        portion as the prompt.
+        with role 'system' (agent persona) and 'user' (task description plus an
+        expected-output block). We ignore CrewAI's system message (our system
+        prompt is set explicitly per step via ``system_prompt_override``),
+        concatenate the user portion, and strip CrewAI's expected-output
+        scaffolding so the delivered prompt matches SOP byte-for-byte.
         """
         if isinstance(messages, str):
-            return messages
-        user_parts = [m.get("content", "") for m in messages if m.get("role") == "user"]
-        return "\n\n".join(p for p in user_parts if p)
+            text = messages
+        else:
+            user_parts = [
+                m.get("content", "") for m in messages if m.get("role") == "user"
+            ]
+            text = "\n\n".join(p for p in user_parts if p)
+        # Drop CrewAI's expected-output block (and anything after it) so only
+        # our task prompt reaches the provider.
+        marker = text.find(cls._CREW_EXPECTED_OUTPUT_MARKER)
+        if marker != -1:
+            text = text[:marker]
+        return text.rstrip()
 
 
 # ---------------------------------------------------------------------------
@@ -246,6 +269,7 @@ def run(
         try:
             raw = input_file.file_path.read_text(encoding="utf-8")
             input_data = json.dumps(json.loads(raw), indent=2)
+            diagram_type = extract_diagram_type(raw)
         except FileNotFoundError:
             return self._error_result(
                 config, f"Input file not found: {input_file.file_path}"
@@ -261,7 +285,7 @@ def run(
 
         for step in STEPS:
             # Build the prompt with outputs from previous steps (same logic as SOP)
-            prompt = self._build_prompt(step, input_data, step_outputs)
+            prompt = self._build_prompt(step, input_data, diagram_type, step_outputs)
 
             # Execute this step as a single-task Crew, with retries
             sub, output_text = self._execute_step(
@@ -445,11 +469,12 @@ def _build_prompt(
         self,
         step: dict,
         input_data: str,
+        diagram_type: str,
         step_outputs: dict[str, str],
     ) -> str:
         """Like SOP._build_prompt: fill template vars from prior outputs."""
         template = step["task_prompt"]
-        fmt = {"input_data": input_data}
+        fmt = {"input_data": input_data, "diagram_type": diagram_type}
         if "extract_entities" in step_outputs:
             fmt["entities_json"] = step_outputs["extract_entities"]
         if "extract_relationships" in step_outputs:
diff --git a/src/maestro/strategies/langgraph.py b/src/maestro/strategies/langgraph.py
index 45637ac..71fc987 100644
--- a/src/maestro/strategies/langgraph.py
+++ b/src/maestro/strategies/langgraph.py
@@ -47,6 +47,7 @@
     STEP_1_PROMPT,
     STEP_2_PROMPT,
     STEP_3_PROMPT,
+    extract_diagram_type,
     strip_fences,
     validate_step_output,
 )
@@ -67,6 +68,7 @@ class GraphState(TypedDict, total=False):
     """
 
     input_data: str
+    diagram_type: str
     entities_json: str
     relationships_json: str
     diagram_code: str
@@ -209,6 +211,7 @@ def extract_relationships_node(state: GraphState) -> GraphState:
 
     def generate_mermaid_node(state: GraphState) -> GraphState:
         prompt = STEP_3_PROMPT.format(
+            diagram_type=state["diagram_type"],
             entities_json=state["entities_json"],
             relationships_json=state["relationships_json"],
         )
@@ -287,6 +290,7 @@ def run(
         try:
             raw = input_file.file_path.read_text(encoding="utf-8")
             input_data = json.dumps(json.loads(raw), indent=2)
+            diagram_type = extract_diagram_type(raw)
         except FileNotFoundError:
             return self._error_result(
                 config, f"Input file not found: {input_file.file_path}"
@@ -328,7 +332,11 @@ def run(
         # Execute the graph. LangGraph collects partial updates from each
         # node into a single final state dict.
         final_state: GraphState = compiled.invoke(
-            {"input_data": input_data, "sub_results": []}
+            {
+                "input_data": input_data,
+                "diagram_type": diagram_type,
+                "sub_results": [],
+            }
         )
 
         sub_results: list[SubResult] = final_state.get("sub_results", [])
diff --git a/src/maestro/strategies/single.py b/src/maestro/strategies/single.py
index 282c793..0264d44 100644
--- a/src/maestro/strategies/single.py
+++ b/src/maestro/strategies/single.py
@@ -27,12 +27,17 @@
 
 # Rules come from the canonical contract (maestro.prompts) so single-agent and
 # the multi-step strategies are given a byte-identical output contract. The
-# runtime placeholder is escaped as ``{{input_data}}`` so it survives this
-# f-string and is filled later by ``.format(input_data=...)``.
+# runtime placeholders are escaped (``{{...}}``) so they survive this f-string
+# and are filled later by ``.format(diagram_type=..., input_data=...)``. The
+# diagram type is task context, stated explicitly to every strategy so the
+# notation-dependent label rules are applied uniformly (the baseline would
+# otherwise have to infer it from the input metadata on its own).
 PROMPT_TEMPLATE = f"""\
 You are given a dataset describing entities and their relationships.
 Your task is to generate a Mermaid diagram that accurately represents this data.
 
+The diagram notation is: {{diagram_type}}
+
 Rules:
 {render_rules()}
 
@@ -73,7 +78,10 @@ def run(
             return self._error_result(config, f"Failed to read input file: {e}")
 
         formatted_input = json.dumps(input_data, indent=2)
-        prompt = PROMPT_TEMPLATE.format(input_data=formatted_input)
+        diagram_type = input_data.get("metadata", {}).get("diagram_type", "unspecified")
+        prompt = PROMPT_TEMPLATE.format(
+            diagram_type=diagram_type, input_data=formatted_input
+        )
 
         # Single call: wrap result in tuple with empty sub_results
         result = self.provider.complete(prompt, config)
diff --git a/src/maestro/strategies/sop.py b/src/maestro/strategies/sop.py
index cfe2160..7acf63f 100644
--- a/src/maestro/strategies/sop.py
+++ b/src/maestro/strategies/sop.py
@@ -31,6 +31,7 @@
     STEP_1_PROMPT,
     STEP_2_PROMPT,
     STEP_3_PROMPT,
+    extract_diagram_type,
     strip_fences,
     validate_step_output,
 )
@@ -97,6 +98,7 @@ def run(
         try:
             raw = input_file.file_path.read_text(encoding="utf-8")
             input_data = json.dumps(json.loads(raw), indent=2)
+            diagram_type = extract_diagram_type(raw)
         except FileNotFoundError:
             return self._error_result(
                 config, f"Input file not found: {input_file.file_path}"
@@ -117,7 +119,7 @@ def run(
             step_system_prompt = step["system_prompt"]
 
             # Build prompt with outputs from previous steps
-            prompt = self._build_prompt(step, input_data, step_outputs)
+            prompt = self._build_prompt(step, input_data, diagram_type, step_outputs)
 
             # Execute with retry
             sub, output_text = self._execute_step(
@@ -153,6 +155,7 @@ def _build_prompt(
         self,
         step: dict,
         input_data: str,
+        diagram_type: str,
         step_outputs: dict[str, str],
     ) -> str:
         """
@@ -160,7 +163,7 @@ def _build_prompt(
         Each step gets different variables depending on what's available.
         """
         template = step["prompt"]
-        fmt = {"input_data": input_data}
+        fmt = {"input_data": input_data, "diagram_type": diagram_type}
 
         if "extract_entities" in step_outputs:
             fmt["entities_json"] = step_outputs["extract_entities"]
diff --git a/tests/strategies/test_step_validation.py b/tests/strategies/test_step_validation.py
index 7abb3b6..2439764 100644
--- a/tests/strategies/test_step_validation.py
+++ b/tests/strategies/test_step_validation.py
@@ -30,10 +30,39 @@ def test_empty_output_rejected_on_every_step():
         assert ok is False and err == "empty output"
 
 
-def test_step3_accepts_any_nonempty_text():
-    """Step 3 is free-form Mermaid: non-empty is the only rule."""
+def test_step3_accepts_wellformed_mermaid():
+    """Step 3 passes any non-empty, structurally sound Mermaid."""
     ok, err = validate_step_output("flowchart LR\n  a --> b", 3)
     assert ok is True and err is None
+    # Multi-line labels, cylinders, and subgraphs must not trip the shape check.
+    diagram = (
+        "flowchart LR\n"
+        '  user["User\\n[Person]"]\n'
+        '  store[("Object Storage\\n[Container]")]\n'
+        '  subgraph infomaniak["Infomaniak"]\n'
+        '    web["SomeApp"]\n'
+        "  end\n"
+        "  user --> web"
+    )
+    ok, err = validate_step_output(diagram, 3)
+    assert ok is True and err is None
+
+
+def test_step3_rejects_empty_label_bracket():
+    """The empty-label malformation a weak model emitted must consume a retry,
+    not pass as non-empty and land as a scored parse failure."""
+    for bad in ('a[""]', "a['']", 'gw{""}', 'n(("")) '):
+        ok, err = validate_step_output(f"flowchart LR\n  {bad}", 3)
+        assert ok is False and "empty node label" in err
+
+
+def test_step3_rejects_concatenated_nodes():
+    """Two node defs concatenated without a separator (the other observed
+    failure, e.g. ``a[""]b["B"]``) is rejected."""
+    ok, err = validate_step_output('flowchart LR\n  pool[""]web["W"]', 3)
+    assert ok is False
+    # Either malformation may match first; both are real defects in this line.
+    assert "empty node label" in err or "concatenated" in err
 
 
 def test_steps_1_2_still_apply_json_shape():
diff --git a/tests/test_prompts.py b/tests/test_prompts.py
index 1d210a2..b49fe16 100644
--- a/tests/test_prompts.py
+++ b/tests/test_prompts.py
@@ -64,11 +64,26 @@ def test_rules_snapshot():
         "parseable\n"
         "- If a node has no label, write just its id (e.g. gw_result): "
         'never an empty bracket like node_id[""]\n'
+        "- Build each node label according to the diagram notation. For "
+        "architecture and infrastructure diagrams (C4 container, network "
+        "topology), put the entity name on the first line, the type on a "
+        "second line wrapped in square brackets and title-cased "
+        "(external_system becomes [External System], device becomes [Device]), "
+        "and a short technology or kind label on a third line only when the "
+        "input gives one, joined with a literal \\n inside the quotes, e.g. "
+        'node_id["SomeApp\\n[Container]\\nWeb Application"]. Keep the third '
+        "line to a few words; never put a full sentence or description in a "
+        "label. For process diagrams (BPMN process, BPMN collaboration), use "
+        'the entity name alone with no type line, e.g. task_1["Task 1"]\n'
         "- Quote edge labels the same way, with no spaces inside the pipes, "
         'e.g. a -->|"My edge"| b; for an unlabelled edge use a plain arrow '
         "a --> b and never an empty label like -->|| or -->| |\n"
         "- Include every entity and relationship from the input\n"
-        "- Preserve hierarchy using subgraphs for pools, lanes, and subprocesses\n"
+        "- Preserve hierarchy using subgraphs for any grouping the input "
+        "gives, such as pools, lanes, subprocesses, system boundaries, or "
+        "deployment environments. Always give a subgraph a quoted label from "
+        "the group's name, e.g. subgraph infomaniak[\"Infomaniak Public "
+        'Cloud"]; never write an empty subgraph label like subgraph id[""]\n'
         "- Do not invent entities or relationships not present in the input\n"
         "- Do not include explanations or markdown code fences\n"
         "- Do not use internal or relationship IDs as edge labels"
@@ -133,13 +148,17 @@ def test_single_and_step3_inject_identical_rules():
 def test_templates_keep_runtime_placeholders():
     """The .format() placeholders must survive the f-string composition."""
     assert "{input_data}" in PROMPT_TEMPLATE
+    assert "{diagram_type}" in PROMPT_TEMPLATE
     assert "{entities_json}" in STEP_3_PROMPT
     assert "{relationships_json}" in STEP_3_PROMPT
+    assert "{diagram_type}" in STEP_3_PROMPT
 
 
 def test_templates_format_without_stray_braces():
-    PROMPT_TEMPLATE.format(input_data="{}")
-    STEP_3_PROMPT.format(entities_json="[]", relationships_json="[]")
+    PROMPT_TEMPLATE.format(diagram_type="c4_container", input_data="{}")
+    STEP_3_PROMPT.format(
+        diagram_type="bpmn_process", entities_json="[]", relationships_json="[]"
+    )
 
 
 # ---------------------------------------------------------------------------

From 24cee1b74fba72cdc326f117a94f329d3fd493c7 Mon Sep 17 00:00:00 2001
From: Colinho22 <48288595+Colinho22@users.noreply.github.com>
Date: Sat, 20 Jun 2026 12:48:52 +0200
Subject: [PATCH 3/7] fix(strategies): harden diagram_type extraction and
 step-3 validation

- extract_diagram_type normalises present-but-malformed values (null, a
  number, blank) to "unspecified"; get()'s default only guarded a missing key.
- step-3 concatenation check blanks quoted labels first, so brackets inside a
  label (e.g. a["Service [v1] Gateway"]) are no longer a false positive.
- single-agent shares extract_diagram_type instead of duplicating the lookup,
  so the baseline gets the same edge-case handling as the multi-step strategies.
- crew marker strip uses rfind so an appended scaffold is cut even if the
  marker phrase ever appears in the prompt body.
---
 src/maestro/strategies/_extraction.py    | 28 ++++++++++++++++--------
 src/maestro/strategies/crew.py           |  6 +++--
 src/maestro/strategies/single.py         |  3 ++-
 tests/strategies/test_step_validation.py | 23 +++++++++++++++++++
 4 files changed, 48 insertions(+), 12 deletions(-)

diff --git a/src/maestro/strategies/_extraction.py b/src/maestro/strategies/_extraction.py
index 79609ae..1ec7d58 100644
--- a/src/maestro/strategies/_extraction.py
+++ b/src/maestro/strategies/_extraction.py
@@ -7,8 +7,10 @@
 differs between strategies: what each strategy file expresses in its own
 shape.
 
-Single-agent does not import from here: its baseline prompt is intentionally
-distinct (one shot -> diagram, no decomposition) and lives in `single.py`.
+Single-agent does not reuse the multi-step *contract* from here: its baseline
+prompt is intentionally distinct (one shot -> diagram, no decomposition) and
+lives in `single.py`. It does share the small ``extract_diagram_type`` utility
+so the notation context is read identically for every strategy.
 
 Adding a new multi-step strategy?
 - Reuse the prompts, schemas and validators from this module unchanged.
@@ -158,16 +160,19 @@ def extract_diagram_type(raw_input: str) -> str:
     Read ``metadata.diagram_type`` from a raw input JSON string.
 
     Shared by every strategy so the render step's notation context is derived
-    identically. Falls back to "unspecified" if the field is missing or the
-    input is unparseable, so a malformed input still renders rather than
-    crashing the strategy (the diagram simply gets no notation hint).
+    identically. Falls back to "unspecified" if the field is missing, blank, not
+    a string, or the input is unparseable, so a malformed input still renders
+    rather than crashing the strategy (the diagram simply gets no notation hint).
+    The get() default only guards a missing key, so a present-but-malformed value
+    (null, a number, "") is normalised here before it reaches a prompt.
     """
     try:
-        return (
-            json.loads(raw_input).get("metadata", {}).get("diagram_type", "unspecified")
-        )
+        value = json.loads(raw_input).get("metadata", {}).get("diagram_type")
     except (json.JSONDecodeError, TypeError, AttributeError):
         return "unspecified"
+    if not isinstance(value, str) or not value.strip():
+        return "unspecified"
+    return value
 
 
 def strip_fences(text: str | None) -> str | None:
@@ -244,6 +249,9 @@ def validate_step_output(text: str | None, step_number: int) -> tuple[bool, str
 # via mmdc); it must never reject a well-formed diagram.
 _EMPTY_LABEL = re.compile(r"""[\[\(\{]+\s*(["'])\1\s*[\]\)\}]+""")
 _CONCATENATED_NODES = re.compile(r"[\]\)\}]\s*\w+\s*[\[\(\{]")
+# Quoted label spans, blanked before the concatenation scan so brackets INSIDE
+# a label (e.g. a["Service [v1] Gateway"]) cannot be misread as two node defs.
+_QUOTED_SPAN = re.compile(r"\"[^\"]*\"|'[^']*'")
 
 
 def _validate_mermaid_shape(text: str) -> tuple[bool, str | None]:
@@ -251,7 +259,9 @@ def _validate_mermaid_shape(text: str) -> tuple[bool, str | None]:
     if _EMPTY_LABEL.search(text):
         return False, 'empty node label bracket (e.g. node_id[""])'
     for raw in text.splitlines():
-        line = raw.strip()
+        # Blank quoted labels first: only structural brackets (not bracket
+        # characters inside a label) should count toward concatenation.
+        line = _QUOTED_SPAN.sub('""', raw.strip())
         if _CONCATENATED_NODES.search(line):
             return False, "node definitions concatenated without a separator"
     return True, None
diff --git a/src/maestro/strategies/crew.py b/src/maestro/strategies/crew.py
index a19b549..a8a743f 100644
--- a/src/maestro/strategies/crew.py
+++ b/src/maestro/strategies/crew.py
@@ -184,8 +184,10 @@ def _collapse_messages(cls, messages) -> str:
             ]
             text = "\n\n".join(p for p in user_parts if p)
         # Drop CrewAI's expected-output block (and anything after it) so only
-        # our task prompt reaches the provider.
-        marker = text.find(cls._CREW_EXPECTED_OUTPUT_MARKER)
+        # our task prompt reaches the provider. rfind, not find: the scaffold is
+        # appended after the task description, so the last occurrence is the one
+        # to cut at even if the marker phrase ever appears in the prompt itself.
+        marker = text.rfind(cls._CREW_EXPECTED_OUTPUT_MARKER)
         if marker != -1:
             text = text[:marker]
         return text.rstrip()
diff --git a/src/maestro/strategies/single.py b/src/maestro/strategies/single.py
index 0264d44..6525dc3 100644
--- a/src/maestro/strategies/single.py
+++ b/src/maestro/strategies/single.py
@@ -23,6 +23,7 @@
 
 from maestro.prompts import render_rules
 from maestro.schemas import InputFile, RunConfig, RunResult, SubResult
+from maestro.strategies._extraction import extract_diagram_type
 from maestro.strategies.base import BaseStrategy
 
 # Rules come from the canonical contract (maestro.prompts) so single-agent and
@@ -78,7 +79,7 @@ def run(
             return self._error_result(config, f"Failed to read input file: {e}")
 
         formatted_input = json.dumps(input_data, indent=2)
-        diagram_type = input_data.get("metadata", {}).get("diagram_type", "unspecified")
+        diagram_type = extract_diagram_type(raw)
         prompt = PROMPT_TEMPLATE.format(
             diagram_type=diagram_type, input_data=formatted_input
         )
diff --git a/tests/strategies/test_step_validation.py b/tests/strategies/test_step_validation.py
index 2439764..a6cb842 100644
--- a/tests/strategies/test_step_validation.py
+++ b/tests/strategies/test_step_validation.py
@@ -9,11 +9,34 @@
 from __future__ import annotations
 
 from maestro.strategies._extraction import (
+    extract_diagram_type,
     validate_step_output,
     validate_step_payload,
 )
 
 
+def test_extract_diagram_type_normalizes_malformed_values():
+    """get()'s default only guards a missing key; a present-but-malformed value
+    (null, a number, blank) must still fall back to the string "unspecified"."""
+    assert extract_diagram_type('{"metadata": {"diagram_type": "c4_container"}}') == (
+        "c4_container"
+    )
+    assert extract_diagram_type('{"metadata": {"diagram_type": null}}') == "unspecified"
+    assert extract_diagram_type('{"metadata": {"diagram_type": 123}}') == "unspecified"
+    assert extract_diagram_type('{"metadata": {"diagram_type": ""}}') == "unspecified"
+    assert extract_diagram_type('{"metadata": null}') == "unspecified"
+    assert extract_diagram_type("not json") == "unspecified"
+
+
+def test_step3_allows_brackets_inside_quoted_labels():
+    """Bracket characters inside a quoted label (e.g. a version tag) must not be
+    misread as concatenated node definitions."""
+    ok, err = validate_step_output(
+        'flowchart LR\n  a["Service [v1] Gateway [public]"]', 3
+    )
+    assert ok is True and err is None
+
+
 def test_empty_output_rejected_on_every_step():
     """Empty or whitespace output is a failure on all three steps.
 

From 625bb81603e1671575d76b65617a030482bad91e Mon Sep 17 00:00:00 2001
From: Colinho22 <48288595+Colinho22@users.noreply.github.com>
Date: Sat, 20 Jun 2026 15:25:17 +0200
Subject: [PATCH 4/7] fix(strategies): catch unbalanced subgraph/end in step-3
 validation

A dropped `end` leaves a subgraph open: invalid Mermaid that mmdc rejects
but the prior checks passed, so it scored as a parse failure instead of
consuming the retry budget. Deeply nested diagrams (pools > lanes >
subprocesses, network zones) are where a weaker model drops one. Count
`subgraph` openers against standalone `end` closers and reject an
imbalance; `end` is matched only as a whole-line closer so node ids like
end_event_1 and labels like "End Event" never count. Verified against all
30 ground truths (zero false positives).

Also pins extract_diagram_type on a whitespace-only diagram_type value.
---
 src/maestro/strategies/_extraction.py    | 18 +++++++++++++++---
 tests/strategies/test_step_validation.py | 24 ++++++++++++++++++++++++
 2 files changed, 39 insertions(+), 3 deletions(-)

diff --git a/src/maestro/strategies/_extraction.py b/src/maestro/strategies/_extraction.py
index 1ec7d58..d694816 100644
--- a/src/maestro/strategies/_extraction.py
+++ b/src/maestro/strategies/_extraction.py
@@ -255,13 +255,25 @@ def validate_step_output(text: str | None, step_number: int) -> tuple[bool, str
 
 
 def _validate_mermaid_shape(text: str) -> tuple[bool, str | None]:
-    """Reject the two step-3 malformations we have actually observed."""
+    """Reject the structural malformations we have actually observed."""
     if _EMPTY_LABEL.search(text):
         return False, 'empty node label bracket (e.g. node_id[""])'
+    # Subgraph nesting must balance: an unclosed subgraph (a dropped `end`) is
+    # invalid Mermaid that mmdc rejects, so flag it here to consume the retry
+    # rather than scoring as a parse failure. Deeply nested diagrams (pools >
+    # lanes > subprocesses, network zones) are where a model drops one. `end`
+    # is counted only as a standalone closer line so node ids and labels that
+    # merely contain "end" (end_event_1, "End Event") never match.
+    opens = closes = 0
     for raw in text.splitlines():
-        # Blank quoted labels first: only structural brackets (not bracket
-        # characters inside a label) should count toward concatenation.
         line = _QUOTED_SPAN.sub('""', raw.strip())
         if _CONCATENATED_NODES.search(line):
             return False, "node definitions concatenated without a separator"
+        stripped = raw.strip()
+        if stripped.startswith("subgraph "):
+            opens += 1
+        elif stripped == "end":
+            closes += 1
+    if opens != closes:
+        return False, f"unbalanced subgraph/end ({opens} subgraph, {closes} end)"
     return True, None
diff --git a/tests/strategies/test_step_validation.py b/tests/strategies/test_step_validation.py
index a6cb842..fcf1eb5 100644
--- a/tests/strategies/test_step_validation.py
+++ b/tests/strategies/test_step_validation.py
@@ -24,6 +24,9 @@ def test_extract_diagram_type_normalizes_malformed_values():
     assert extract_diagram_type('{"metadata": {"diagram_type": null}}') == "unspecified"
     assert extract_diagram_type('{"metadata": {"diagram_type": 123}}') == "unspecified"
     assert extract_diagram_type('{"metadata": {"diagram_type": ""}}') == "unspecified"
+    assert extract_diagram_type('{"metadata": {"diagram_type": "   "}}') == (
+        "unspecified"
+    )
     assert extract_diagram_type('{"metadata": null}') == "unspecified"
     assert extract_diagram_type("not json") == "unspecified"
 
@@ -37,6 +40,27 @@ def test_step3_allows_brackets_inside_quoted_labels():
     assert ok is True and err is None
 
 
+def test_step3_rejects_unbalanced_subgraph():
+    """A dropped `end` is invalid Mermaid mmdc would reject; flag it so the
+    retry budget applies instead of scoring a parse failure."""
+    missing_end = (
+        'flowchart LR\nsubgraph p1["Pool"]\n  a["A"]\nsubgraph p2["P2"]\n  b["B"]\nend'
+    )
+    ok, err = validate_step_output(missing_end, 3)
+    assert ok is False and "unbalanced subgraph" in err
+
+
+def test_step3_balance_ignores_end_in_ids_and_labels():
+    """`end` counts only as a standalone closer line: node ids like end_event_1
+    and labels like "End Event" must not be read as closers."""
+    diagram = (
+        'flowchart LR\nsubgraph p["Pool"]\n'
+        '  end_event_1(["End Event 1"])\n  task_1["Task end here"]\nend'
+    )
+    ok, err = validate_step_output(diagram, 3)
+    assert ok is True and err is None
+
+
 def test_empty_output_rejected_on_every_step():
     """Empty or whitespace output is a failure on all three steps.
 

From fccc5ac94ab8d9c6669c492ff04407e50c148f65 Mon Sep 17 00:00:00 2001
From: Colinho22 <48288595+Colinho22@users.noreply.github.com>
Date: Sat, 20 Jun 2026 15:25:54 +0200
Subject: [PATCH 5/7] fix(metrics): score only the input-derivable part of a
 node label

Two ground-truth authoring conventions are not derivable from the input, so
scoring them penalised models for an unreachable target:

- The optional third descriptor line of a C4 / network label is authored
  inconsistently (network topology includes it for some nodes and omits it
  for others, though the input always carries the field). The entity-name
  metric now compares on the name and [Type] lines only.
- Nodes the input leaves unnamed (BPMN gateways/events with name "") are
  labelled in the ground truth from convention the input never supplies
  (type name, unicode symbols, split/join). These are scored by id match
  only. This is conditional on the input: a node the input did name is still
  scored on its label, so blanking a nameable node is still penalised.

evaluate_run takes an optional input_path to read the unnamed-id set; it is
backward compatible (omitted means strict label scoring) and fails soft on a
missing or malformed input. On the pre-flight, network name_f1 rose from
~0.28-0.38 to ~0.88-1.0 with no regression on BPMN or C4.
---
 src/maestro/analysis/metrics.py      | 126 ++++++++++++++++++++++++---
 src/maestro/run.py                   |   1 +
 tests/analysis/test_label_scoring.py |  70 +++++++++++++++
 3 files changed, 184 insertions(+), 13 deletions(-)
 create mode 100644 tests/analysis/test_label_scoring.py

diff --git a/src/maestro/analysis/metrics.py b/src/maestro/analysis/metrics.py
index 48cac95..08b2d98 100644
--- a/src/maestro/analysis/metrics.py
+++ b/src/maestro/analysis/metrics.py
@@ -10,6 +10,7 @@
 
 from __future__ import annotations
 
+import json
 import os
 import re
 import shutil
@@ -106,20 +107,70 @@ def check_mermaid_valid(diagram_code: str) -> tuple[bool | None, str | None]:
 # ---------------------------------------------------------------------------
 
 
+def extract_input_unnamed_ids(input_path: Path | None) -> set[str]:
+    """
+    Ids of input elements with an empty ``name`` (e.g. BPMN gateways/events the
+    source leaves unnamed). Their ground-truth label is authoring convention the
+    input does not provide, so the entity-name metric scores them by id only.
+
+    Returns an empty set when no input path is given or the input cannot be read
+    or parsed, so scoring degrades to the strict label comparison rather than
+    crashing (observability fails soft).
+    """
+    if input_path is None:
+        return set()
+    try:
+        data = json.loads(Path(input_path).read_text(encoding="utf-8"))
+    except (OSError, json.JSONDecodeError):
+        return set()
+    elements = data.get("elements") or data.get("nodes") or []
+    if not isinstance(elements, list):
+        return set()
+    return {
+        e["id"]
+        for e in elements
+        if isinstance(e, dict) and e.get("id") and not (e.get("name") or "").strip()
+    }
+
+
+def _label_core(label: str) -> str:
+    r"""
+    Keep the scored part of a multi-line node label: the name and the bracketed
+    ``[Type]`` line, dropping any trailing descriptor line.
+
+    Labels are ``name`` (BPMN), ``name\n[Type]`` or ``name\n[Type]\ndescriptor``
+    (C4 / network), with ``\n`` as a literal two-character separator. The third
+    descriptor line is authored inconsistently in the ground truth (network
+    topology includes it for some nodes and not others, though the input always
+    carries the field), so no model can predict it. Scoring it would penalise a
+    correct name and type for an unpredictable authoring choice, so the entity
+    name metric compares on name + type only. The descriptor is out of the
+    scored contract by design; this is applied to output and truth identically.
+    """
+    parts = label.split("\\n")
+    kept = [parts[0]]
+    for p in parts[1:]:
+        if p.strip().startswith("["):  # the [Type] line; descriptor follows
+            kept.append(p)
+            break
+    return "\\n".join(kept)
+
+
 def _normalize_label(label: str) -> str:
     """
-    Basic normalization: lowercase, strip whitespace.
+    Basic normalization: drop the descriptor line, lowercase, strip whitespace.
     Used for raw fuzzy matching: no linguistic processing.
     """
-    return label.strip().lower()
+    return _label_core(label).strip().lower()
 
 
 def _lemmatize_label(label: str) -> str:
     """
-    Normalize + lemmatize: lowercase, strip plurals, collapse separators.
-    Catches 'Tasks' -> 'task', 'start_event_1' -> 'start event 1'.
+    Normalize + lemmatize: drop the descriptor line, lowercase, strip plurals,
+    collapse separators. Catches 'Tasks' -> 'task', 'start_event_1' -> 'start
+    event 1'.
     """
-    text = label.strip().lower()
+    text = _label_core(label).strip().lower()
     # Replace underscores and hyphens with spaces
     text = re.sub(r"[_\-]", " ", text)
     # Strip trailing 's' for basic plural handling
@@ -372,18 +423,37 @@ def _fuzzy_match(
     output_nodes: list[dict],
     truth_nodes: list[dict],
     normalizer,
+    input_unnamed_ids: set[str] | None = None,
 ) -> tuple[float, float, float]:
     """
     Fuzzy name matching with a configurable normalizer function.
     Used for both raw and lemmatized matching.
+
+    ``input_unnamed_ids`` are node ids the *input* left without a name (e.g. a
+    BPMN gateway with name ""). The ground truth labels these from convention
+    (type name, unicode symbols, split/join) that the input does not provide, so
+    the model cannot derive the label. For such a node, an id match counts as a
+    name match regardless of the produced label, including a blank one. This is
+    conditional on the input: a node the input *did* name is still scored on its
+    label, so a model that blanks a nameable node is still penalised.
     """
     if not output_nodes or not truth_nodes:
         return (0.0, 0.0, 0.0)
 
+    unnamed = input_unnamed_ids or set()
+    truth_ids = {t["id"]: i for i, t in enumerate(truth_nodes)}
     matched_truth = set()
     correct = 0
 
     for out_node in output_nodes:
+        # Input-unnamed node: an id match is a name match, label not scored.
+        if out_node["id"] in unnamed and out_node["id"] in truth_ids:
+            idx = truth_ids[out_node["id"]]
+            if idx not in matched_truth:
+                correct += 1
+                matched_truth.add(idx)
+                continue
+
         out_label = normalizer(out_node["label"])
         best_score = 0.0
         best_idx = None
@@ -407,17 +477,21 @@ def _fuzzy_match(
 
 
 def compute_entity_metrics_fuzzy(
-    output_nodes: list[dict], truth_nodes: list[dict]
+    output_nodes: list[dict],
+    truth_nodes: list[dict],
+    input_unnamed_ids: set[str] | None = None,
 ) -> tuple[float, float, float]:
     """Fuzzy name match with basic normalization (lowercase only)."""
-    return _fuzzy_match(output_nodes, truth_nodes, _normalize_label)
+    return _fuzzy_match(output_nodes, truth_nodes, _normalize_label, input_unnamed_ids)
 
 
 def compute_entity_metrics_lemma(
-    output_nodes: list[dict], truth_nodes: list[dict]
+    output_nodes: list[dict],
+    truth_nodes: list[dict],
+    input_unnamed_ids: set[str] | None = None,
 ) -> tuple[float, float, float]:
     """Fuzzy name match with lemmatization (lowercase + strip plurals)."""
-    return _fuzzy_match(output_nodes, truth_nodes, _lemmatize_label)
+    return _fuzzy_match(output_nodes, truth_nodes, _lemmatize_label, input_unnamed_ids)
 
 
 # ---------------------------------------------------------------------------
@@ -513,11 +587,21 @@ def compute_attachment_metrics(
 # ---------------------------------------------------------------------------
 
 
-def compute_entity_taxonomy(output_nodes: list[dict], truth_nodes: list[dict]) -> dict:
+def compute_entity_taxonomy(
+    output_nodes: list[dict],
+    truth_nodes: list[dict],
+    input_unnamed_ids: set[str] | None = None,
+) -> dict:
     """
     Count entity-level errors by taxonomy category.
     Returns: {"missing": int, "extra": int, "false": int, "duplicate": int}
+
+    A "false" entity is an id match with a mismatched label. Nodes the input
+    left unnamed (``input_unnamed_ids``) are exempt: their ground-truth label is
+    convention the input does not provide, so a label mismatch there is not a
+    model error (see ``_fuzzy_match``).
     """
+    unnamed = input_unnamed_ids or set()
     output_ids = [n["id"] for n in output_nodes]
     truth_ids = {n["id"] for n in truth_nodes}
 
@@ -540,6 +624,8 @@ def compute_entity_taxonomy(output_nodes: list[dict], truth_nodes: list[dict]) -
 
     false_count = 0
     for nid in shared_ids:
+        if nid in unnamed:
+            continue  # input gave no name; label is not the model's to get right
         similarity = _fuzzy_score(
             _normalize_label(output_labels[nid]),
             _normalize_label(truth_labels[nid]),
@@ -604,10 +690,16 @@ def evaluate_run(
     run_id: UUID,
     output_diagram_code: str,
     ground_truth_path: Path,
+    input_path: Path | None = None,
 ) -> MetricResult:
     """
     Full evaluation pipeline for one run.
     Compares generated diagram against ground truth file.
+
+    ``input_path`` is the source JSON. When given, elements it leaves unnamed
+    are scored by id only for the entity-name metric, since their ground-truth
+    label is convention the input does not supply. Optional and backward
+    compatible: omitted means strict label scoring for every node.
     """
     try:
         truth_code = ground_truth_path.read_text(encoding="utf-8")
@@ -659,10 +751,18 @@ def evaluate_run(
     output_attachments = extract_attachments(output_diagram_code)
     truth_attachments = extract_attachments(truth_code)
 
+    # Ids the input left unnamed: their label is GT convention, not derivable,
+    # so the name/lemma metrics and the false-entity count score them by id.
+    unnamed_ids = extract_input_unnamed_ids(input_path)
+
     # 3. Entity metrics: three levels
     id_p, id_r, id_f1 = compute_entity_metrics_exact(output_nodes, truth_nodes)
-    name_p, name_r, name_f1 = compute_entity_metrics_fuzzy(output_nodes, truth_nodes)
-    lemma_p, lemma_r, lemma_f1 = compute_entity_metrics_lemma(output_nodes, truth_nodes)
+    name_p, name_r, name_f1 = compute_entity_metrics_fuzzy(
+        output_nodes, truth_nodes, unnamed_ids
+    )
+    lemma_p, lemma_r, lemma_f1 = compute_entity_metrics_lemma(
+        output_nodes, truth_nodes, unnamed_ids
+    )
 
     # 4. Relationship metrics: two levels
     rel_p, rel_r, rel_f1 = compute_relationship_metrics_relaxed(
@@ -673,7 +773,7 @@ def evaluate_run(
     )
 
     # 5. Error taxonomy
-    entity_tax = compute_entity_taxonomy(output_nodes, truth_nodes)
+    entity_tax = compute_entity_taxonomy(output_nodes, truth_nodes, unnamed_ids)
     relationship_tax = compute_relationship_taxonomy(
         output_relationships, truth_relationships
     )
diff --git a/src/maestro/run.py b/src/maestro/run.py
index 27ae9e8..2488af4 100644
--- a/src/maestro/run.py
+++ b/src/maestro/run.py
@@ -615,6 +615,7 @@ def _execute_cell(
                 run_id=config.run_id,
                 output_diagram_code=result.output_diagram_code,
                 ground_truth_path=input_file.ground_truth_path,
+                input_path=input_file.file_path,
             )
         except Exception as exc:
             traceback.print_exc(file=sys.stderr)
diff --git a/tests/analysis/test_label_scoring.py b/tests/analysis/test_label_scoring.py
new file mode 100644
index 0000000..94edf46
--- /dev/null
+++ b/tests/analysis/test_label_scoring.py
@@ -0,0 +1,70 @@
+"""
+Label-scoring policy: what the entity-name metric does and does not score.
+
+Two ground-truth authoring conventions are not derivable from the input, so the
+metric excludes them rather than penalising a model for an unreachable target:
+
+  * the optional third descriptor line of a C4 / network label, authored
+    inconsistently across nodes, is dropped before comparison;
+  * nodes the input leaves unnamed (BPMN gateways/events with name "") are
+    scored by id only, since their ground-truth label is pure convention.
+
+The second is conditional on the input: a node the input *did* name is still
+scored on its label, so a model that blanks a nameable node is still penalised.
+"""
+
+from __future__ import annotations
+
+from maestro.analysis.metrics import (
+    _label_core,
+    compute_entity_metrics_fuzzy,
+    extract_input_unnamed_ids,
+)
+
+
+def test_label_core_keeps_name_and_type_drops_descriptor():
+    assert _label_core("Task 1") == "Task 1"
+    assert _label_core("User\\n[Person]") == "User\\n[Person]"
+    # third descriptor line dropped; name + [Type] kept
+    assert _label_core("SomeApp\\n[Container]\\nWeb Application") == (
+        "SomeApp\\n[Container]"
+    )
+
+
+def test_descriptor_line_not_scored():
+    """A mismatched third line must not sink an otherwise correct label."""
+    out = [{"id": "a", "label": "Router\\n[Device]\\nNetwork Switch"}]
+    truth = [{"id": "a", "label": "Router\\n[Device]"}]
+    _, _, f1 = compute_entity_metrics_fuzzy(out, truth)
+    assert f1 == 1.0
+
+
+def test_input_unnamed_node_scored_by_id_only():
+    """A gateway the input left unnamed: id match counts even though the model's
+    label cannot match the convention-authored ground-truth label."""
+    out = [{"id": "pgw_3", "label": ""}]
+    truth = [{"id": "pgw_3", "label": "+ Parallel Split"}]
+    # Without the exemption, the empty label would never reach threshold.
+    _, _, f1_strict = compute_entity_metrics_fuzzy(out, truth)
+    assert f1_strict < 1.0
+    # With the input marking pgw_3 unnamed, the id match is a name match.
+    _, _, f1_exempt = compute_entity_metrics_fuzzy(out, truth, {"pgw_3"})
+    assert f1_exempt == 1.0
+
+
+def test_exemption_does_not_excuse_a_nameable_node():
+    """The guardrail: a node the input *named* is still scored on its label, so a
+    model that blanks it is still penalised even if other nodes are exempt."""
+    out = [{"id": "task_1", "label": ""}]
+    truth = [{"id": "task_1", "label": "Receive Travel Request"}]
+    # task_1 is NOT in the unnamed set, so the blank label is still wrong.
+    _, _, f1 = compute_entity_metrics_fuzzy(out, truth, {"pgw_3"})
+    assert f1 < 1.0
+
+
+def test_extract_input_unnamed_ids_handles_missing_and_malformed():
+    assert extract_input_unnamed_ids(None) == set()
+    # nonexistent path fails soft to an empty set
+    from pathlib import Path
+
+    assert extract_input_unnamed_ids(Path("/no/such/input.json")) == set()

From 53780cd7191a8105ad097b054139d7fec32177ba Mon Sep 17 00:00:00 2001
From: Colinho22 <48288595+Colinho22@users.noreply.github.com>
Date: Sat, 20 Jun 2026 17:07:15 +0200
Subject: [PATCH 6/7] fix(strategies): count anonymous subgraph openers in
 balance check

A bare `subgraph` (no id) is a valid opener, but the balance check only
matched `subgraph ` with a trailing space, so an anonymous subgraph would
be counted as 0 opens against its 1 `end` and falsely rejected as
unbalanced. Match the bare form too. None appear in the corpus, but a model
emitting one during a run should not produce a spurious failed cell.
---
 src/maestro/strategies/_extraction.py    | 5 ++++-
 tests/strategies/test_step_validation.py | 8 ++++++++
 2 files changed, 12 insertions(+), 1 deletion(-)

diff --git a/src/maestro/strategies/_extraction.py b/src/maestro/strategies/_extraction.py
index d694816..33eb2c0 100644
--- a/src/maestro/strategies/_extraction.py
+++ b/src/maestro/strategies/_extraction.py
@@ -270,7 +270,10 @@ def _validate_mermaid_shape(text: str) -> tuple[bool, str | None]:
         if _CONCATENATED_NODES.search(line):
             return False, "node definitions concatenated without a separator"
         stripped = raw.strip()
-        if stripped.startswith("subgraph "):
+        # An anonymous subgraph (bare "subgraph", no id) is valid Mermaid and
+        # still opens a block, so count it too: missing it would falsely reject
+        # a balanced diagram as unbalanced.
+        if stripped.startswith("subgraph ") or stripped == "subgraph":
             opens += 1
         elif stripped == "end":
             closes += 1
diff --git a/tests/strategies/test_step_validation.py b/tests/strategies/test_step_validation.py
index fcf1eb5..7d48775 100644
--- a/tests/strategies/test_step_validation.py
+++ b/tests/strategies/test_step_validation.py
@@ -61,6 +61,14 @@ def test_step3_balance_ignores_end_in_ids_and_labels():
     assert ok is True and err is None
 
 
+def test_step3_balance_counts_anonymous_subgraph():
+    """A bare `subgraph` (no id) is a valid opener; its `end` must balance it,
+    not be read as an extra closer."""
+    diagram = 'flowchart LR\nsubgraph\n  a["A"]\nend'
+    ok, err = validate_step_output(diagram, 3)
+    assert ok is True and err is None
+
+
 def test_empty_output_rejected_on_every_step():
     """Empty or whitespace output is a failure on all three steps.
 

From cbdbe7d739e9ddd504a86309bc3130221eb65023 Mon Sep 17 00:00:00 2001
From: Colinho22 <48288595+Colinho22@users.noreply.github.com>
Date: Sat, 20 Jun 2026 21:11:10 +0200
Subject: [PATCH 7/7] fix(metrics): drop redundant Path() wrapper in
 extract_input_unnamed_ids

input_path is already Path after the None guard, so wrapping it in Path()
before read_text() was a no-op.
---
 src/maestro/analysis/metrics.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/maestro/analysis/metrics.py b/src/maestro/analysis/metrics.py
index 08b2d98..8c2908a 100644
--- a/src/maestro/analysis/metrics.py
+++ b/src/maestro/analysis/metrics.py
@@ -120,7 +120,7 @@ def extract_input_unnamed_ids(input_path: Path | None) -> set[str]:
     if input_path is None:
         return set()
     try:
-        data = json.loads(Path(input_path).read_text(encoding="utf-8"))
+        data = json.loads(input_path.read_text(encoding="utf-8"))
     except (OSError, json.JSONDecodeError):
         return set()
     elements = data.get("elements") or data.get("nodes") or []