diff --git a/.gitignore b/.gitignore
index a6e56d75..d69b09a9 100644
--- a/.gitignore
+++ b/.gitignore
@@ -80,10 +80,11 @@ mlartifacts/
 # Cookie jars / curl session dumps (may contain live session + CSRF tokens)
 cookies*.txt
 *.cookiejar
-tests/e2e/_e2e_server.log
 
+# Local git worktrees for isolated feature work
+.worktrees/
+
+tests/e2e/_e2e_server.log
 
 .claude/
 .understand-anything/
-
-
diff --git a/README.md b/README.md
index c4f2f99d..262e601e 100644
--- a/README.md
+++ b/README.md
@@ -256,6 +256,39 @@ require installing the optional extra:
 uv sync --extra pitfalls
 ```
 
+### PGE Intrinsic Evaluation (quality scorecard)
+
+The PGE pipeline (ontology + mapping generation) is scored by a
+**usecase-agnostic, gold-free** scorecard (`src/agents/pge_eval/`). It uses
+intrinsic structural/self-consistency metrics — table/column footprint
+coverage, orphan classes, dangling domain/range, naming/duplicate hygiene,
+mapping completeness, id-integrity, dangling-FK fractions — across three gate
+tiers (absolute / ratio / self-baseline regression), plus an **advisory**
+LLM-judge that never gates. No domain reference answer is encoded, so it works
+for any domain.
+
+It runs in two places:
+
+- **In-app** — after you generate an ontology or mappings, a scorecard is
+  attached to the task result (`pge_scorecard`) and the verdict is shown in the
+  completion message (e.g. `… · quality GREEN`). Deterministic; no extra LLM call.
+- **CLI** — `scripts/goals_eval.py`:
+
+  ```bash
+  # Score a captured artifact (offline, deterministic; --no-judge = zero network)
+  .venv/bin/python scripts/goals_eval.py score <artifact.json> [--no-judge] [--gate-ratios]
+
+  # Run the pipeline live for ANY domain, then score it (domain-agnostic)
+  .venv/bin/python scripts/goals_eval.py run --registry-json <domain-export.json> [--version V]
+  .venv/bin/python scripts/goals_eval.py run --ontology <ontology.json> --metadata <metadata.json>
+  ```
+
+  Exit code is the verdict: `0` GREEN, non-zero RED.
+
+The OWL generator also gains an **Evaluator stage**: the Stage-1 deterministic
+checks run in its generation loop and feed retry-hints back (a real PGE loop),
+bounded by a retry cap.
+
 ### Documentation
 
 Full documentation is available in [`docs/`](docs/README.md). For a comprehensive feature list and architecture details, see [INFO.md](docs/INFO.md).
diff --git a/changelogs/v0.3.1/2026-05-28.log b/changelogs/v0.3.1/2026-05-28.log
new file mode 100644
index 00000000..01b97cbb
--- /dev/null
+++ b/changelogs/v0.3.1/2026-05-28.log
@@ -0,0 +1,371 @@
+# 2026-05-28 — v0.3.1
+
+## feat(mapping-pge): Sprint 1 — scaffolding + contracts + deterministic evaluator
+
+Context: First sprint of the Planner -> Generator -> Evaluator (PGE)
+redesign that will replace the single-loop ReAct
+`agents/agent_auto_assignment`. This sprint is foundation-only: no LLM
+code, nothing wired into Mapping.py, the old agent untouched.
+
+Changes:
+1. `src/agents/agent_mapping_pge/__init__.py` — package marker re-exporting
+   all contract dataclasses.
+2. `src/agents/agent_mapping_pge/contracts.py` — typed dataclasses
+   (`TableRoleCandidate`, `TableRole`, `CanonicalId`, `JoinKey`, `SkipItem`,
+   `MappingPlan`, `SourceModel`, `EvalFailure`, `EvalReport`, `RetryState`)
+   with `to_dict` / `from_dict` JSON round-trip.
+3. `src/agents/agent_mapping_pge/evaluator/__init__.py` — re-exports the
+   two public evaluator entry points.
+4. `src/agents/agent_mapping_pge/evaluator/report.py` — `build_report`
+   helper that derives `status` from the failure list.
+5. `src/agents/agent_mapping_pge/evaluator/deterministic.py` — pure-Python
+   stage-1 evaluator: `evaluate_entity_mapping` and
+   `evaluate_relationship_mapping`. Caller injects an `execute_sql_fn`
+   returning `{"columns": [...], "rows": [...]}` so the evaluator is
+   trivially testable without a Databricks connection.
+6. `tests/agents/agent_mapping_pge/test_deterministic_evaluator.py` — 14
+   tests covering entity PASS/FAIL paths (row_count zero, duplicate ids,
+   null ids, unmapped attribute, declared-as-unmapped) and relationship
+   PASS/FAIL paths (3% and 47% dangling source, >50% bubbling, zero edges,
+   cross-source overlap band inside/outside).
+7. `tests/agents/agent_mapping_pge/test_contracts.py` — 3 round-trip
+   smoke tests for `SourceModel`, `EvalReport`, `RetryState`.
+
+Modified files:
+- src/agents/agent_mapping_pge/__init__.py (new)
+- src/agents/agent_mapping_pge/contracts.py (new)
+- src/agents/agent_mapping_pge/evaluator/__init__.py (new)
+- src/agents/agent_mapping_pge/evaluator/report.py (new)
+- src/agents/agent_mapping_pge/evaluator/deterministic.py (new)
+- tests/agents/__init__.py (new)
+- tests/agents/agent_mapping_pge/__init__.py (new)
+- tests/agents/agent_mapping_pge/test_deterministic_evaluator.py (new)
+- tests/agents/agent_mapping_pge/test_contracts.py (new)
+- changelogs/v0.3.1/2026-05-28.log (this file)
+
+Test result: 17/17 new tests pass. Full suite: 2075 passed, 3 pre-existing
+failures in `test_settings_lakebase_status.py` (unrelated to this change,
+already failing on master).
+
+Deviation from spec: when an explicit
+`expected_cross_source_overlap_band` is supplied to
+`evaluate_relationship_mapping`, the `dangling_target_pct < 0.05` check
+is skipped. Rationale: cross-source FKs are *expected* to be partial
+(that is the wedge), so a band check is the correct semantic and the
+strict dangling check would always fail. Bubble-on-source-dangling and
+edge-count checks still apply.
+
+
+## chore(mapping-pge): cache seam + bubble-demotion warning + test rename
+
+Context: Three targeted code-quality fixes on Sprint 1 ahead of the
+Sprint 7 orchestrator. Scope is narrow — no contract / dataclass
+changes, no behavioural changes beyond logging.
+
+Changes:
+1. `src/agents/agent_mapping_pge/evaluator/deterministic.py` — added an
+   opt-in `id_universe_cache: Optional[Dict[str, set]] = None` kwarg to
+   `evaluate_relationship_mapping` (and to the private `_distinct_id_set`
+   helper). When provided, source/target id universes are looked up by
+   the entity mapping's SQL string and stored on miss. When `None`,
+   behaviour is unchanged. Cache is caller-managed; no module-level
+   state. Lets the Sprint 7 orchestrator avoid N×2 redundant entity
+   universe re-fetches when many relationships share endpoint classes.
+2. `src/agents/agent_mapping_pge/evaluator/report.py` — `build_report`
+   now emits a `logger.warning` when the caller passed
+   `bubble_to_planner=True` but `status` resolves to PASS. The silent
+   demotion behaviour itself is unchanged.
+3. `tests/agents/agent_mapping_pge/test_deterministic_evaluator.py` —
+   renamed misnamed `test_cross_source_band_pass_inside` (which actually
+   asserted FAIL on overlap outside the band) to
+   `test_cross_source_band_fail_when_outside`; assertions unchanged.
+   Added `test_relationship_evaluator_uses_id_universe_cache` proving
+   the cache short-circuits entity-SQL execution by counting wrapped
+   SQL calls. Added `test_build_report_warns_when_bubble_demoted`
+   asserting the warning fires under PASS+bubble demotion and stays
+   silent otherwise (PASS+no-bubble, FAIL+bubble).
+
+Modified files:
+- src/agents/agent_mapping_pge/evaluator/deterministic.py
+- src/agents/agent_mapping_pge/evaluator/report.py
+- tests/agents/agent_mapping_pge/test_deterministic_evaluator.py
+- changelogs/v0.3.1/2026-05-28.log (this file)
+
+Test result: 22/22 pass in `tests/agents/agent_mapping_pge/` (net +2 vs
+prior 20: cache test, warning test).
+
+
+## feat(mapping-pge): Sprint 2 — planner tools
+
+Context: Second sprint of the Planner -> Generator -> Evaluator (PGE)
+redesign. Adds the Planner's tool surface: four OpenAI function-calling
+tools the Planner LLM (Sprint 3) will use to probe source tables and
+submit a validated `SourceModel`. No LLM code in this sprint either —
+just the tool defs + handlers + a new context slot.
+
+Changes:
+1. `src/agents/tools/planner.py` (new) — four tools matching the
+   `mapping.py` / `sql.py` convention:
+   * `sample_table(full_name, n=20)` — `SELECT * ORDER BY RAND() LIMIT n`,
+     n capped at 100. Stringifies values for the LLM-facing surface.
+   * `column_value_overlap(from_table, from_column, to_table, to_column)` —
+     one-sided overlap `|distinct(from) ∩ distinct(to)| / |distinct(from)|`
+     in a single CTE-based query; returns 0.0 with a note when the
+     denominator is zero.
+   * `distinct_count(full_name, column)` — row/distinct/null counts with
+     `is_unique` and `is_complete` flags derived in Python.
+   * `submit_source_model(model)` — terminal tool: round-trips `model`
+     through `SourceModel.from_dict` and stores the dataclass on
+     `ctx.source_model`. Catches `KeyError` / `TypeError` / `ValueError`
+     and returns `success: False` with the error message — never raises.
+   Module exports `PLANNER_TOOL_DEFINITIONS` and `PLANNER_TOOL_HANDLERS`
+   matching the `MAPPING_TOOL_*` shape.
+2. `src/agents/tools/context.py` — added a single optional field
+   `source_model: Optional["SourceModel"] = None` with a string-forward-ref
+   and a `TYPE_CHECKING` import to avoid a circular import between
+   `agents.tools` and `agents.agent_mapping_pge`.
+3. `tests/agents/test_planner_tools.py` (new) — 16 tests against a
+   `FakeClient` whose `execute_query` is a per-test closure. Covers each
+   tool's happy path, error path, and the boundary cases called out in
+   the Sprint 2 spec (n-cap at 100, division-by-zero guard, structural
+   validation only on submit).
+
+Modified files:
+- src/agents/tools/planner.py (new)
+- src/agents/tools/context.py
+- tests/agents/test_planner_tools.py (new)
+- changelogs/v0.3.1/2026-05-28.log (this file)
+
+Test result: 38/38 pass in `tests/agents/` (22 Sprint 1 + 16 new). Full
+suite unaffected — same 3 pre-existing failures in
+`test_settings_lakebase_status.py` as before.
+
+## feat(mapping-pge): Sprint 3 — planner agent
+
+Context: Third sprint of the PGE redesign. The Planner is the first LLM-
+backed stage: a single-invocation ReAct-style agent that consumes the
+ontology + table metadata + imported documents, probes the source data via
+the Sprint 2 tools (sample_table, column_value_overlap, distinct_count),
+and emits a validated `SourceModel` via the terminal `submit_source_model`
+tool. Re-invocations are driven by the orchestrator in Sprint 7 — the
+Planner itself has no internal retry loop. The auto_assignment agent and
+all Sprint 1 / Sprint 2 modules are untouched.
+
+Changes:
+1. `src/agents/agent_mapping_pge/planner.py` (new) — `run_planner()`
+   entry point + `PlannerResult` / `PlannerStep` dataclasses. ReAct loop
+   mirrors `agents/agent_auto_assignment/engine.py` (same
+   `call_serving_endpoint` + `dispatch_tool` cycle, same 3-second
+   inter-iteration delay, same usage accumulation, same `@trace_agent`
+   decorator). Differences vs auto_assignment: smaller default
+   `max_iterations=25`; NO single-shot fallback on 400/422 (the Planner
+   needs tools); terminates immediately after a successful
+   `submit_source_model` (no chatty wrap-up turn). The system prompt
+   frames the role as "senior data architect", enumerates each tool's
+   purpose, lays out the canonical SourceModel workflow, and pins the
+   invariants the orchestrator depends on (URI existence, ordering
+   constraint on relationship_order, confidence range, `kind` enum).
+2. `tests/agents/agent_mapping_pge/test_planner.py` (new) — 6 tests
+   exercising the five termination conditions (single-shot submit,
+   multi-step ReAct then submit, submit-failure-then-retry, free-text
+   without terminal, iteration budget exhaustion) and the step-recording
+   invariants. Uses a `FakeLLM` / `CyclingFakeLLM` stub injected via
+   `monkeypatch.setattr(planner_mod, "call_serving_endpoint", ...)` —
+   no real HTTP, no Databricks, no MLflow.
+
+Modified files:
+- src/agents/agent_mapping_pge/planner.py (new)
+- tests/agents/agent_mapping_pge/test_planner.py (new)
+- changelogs/v0.3.1/2026-05-28.log (this file)
+
+Test result: 48/48 pass in `tests/agents/` (42 Sprint 1+2 baseline + 6
+new Sprint 3 tests). No edits to out-of-scope files (engine_base.py,
+tracing.py, llm_utils.py, contracts.py, evaluator/, tools/planner.py,
+or agent_auto_assignment/). Same pre-existing failures elsewhere in the
+full suite as before.
+
+## feat(mapping-pge): Sprint 4 — EntityGenerator agent + unmapped_attributes
+
+Context: Sprint 4 of the PGE redesign. The EntityGenerator is a narrow,
+focused LLM agent that maps ONE ontology class at a time. The orchestrator
+(Sprint 7) calls it per item with a filtered SourceModel slice. It does
+NOT see the full ontology or full metadata — only what's relevant to the
+class being mapped. Same loop machinery as the Planner (Sprint 3) with a
+smaller default budget (12 vs 25) and a narrower tool set.
+
+Changes:
+1. `src/agents/agent_mapping_pge/generators/__init__.py` (new) — empty
+   package marker for the Generator submodule.
+2. `src/agents/agent_mapping_pge/generators/entity.py` (new) — the
+   `run_entity_generator` agent. Builds a 3-tool surface (`execute_sql`,
+   `sample_table`, `submit_entity_mapping`), a per-class user prompt
+   carrying the ontology class + source-model slice + optional retry hint,
+   and a ReAct loop terminated by `submit_entity_mapping`. The system
+   prompt lifts the SQL RULES FOR ENTITIES section from
+   `agent_auto_assignment/engine.py` and adds the slice-consumption rules
+   plus the NO SILENT DROPS invariant.
+3. `src/agents/tools/mapping.py` — added `unmapped_attributes` kwarg to
+   `tool_submit_entity_mapping` and to its OpenAI function definition.
+   The field accepts either `[{"name", "reason"}]` dicts (preferred) or
+   bare strings (fallback); it persists on the mapping dict under the
+   same key. ~30 lines added; no behaviour change to existing callers.
+4. `tests/agents/agent_mapping_pge/test_entity_generator.py` (new) —
+   7 tests covering the three termination conditions
+   (`test_terminates_on_submit`, `test_text_without_terminal_fails`,
+   `test_exhausts_iteration_budget`), the multi-step ReAct trajectory
+   (`test_validates_sql_then_submits`), the new field round-trip
+   (`test_unmapped_attributes_round_trip` — both dict and string forms),
+   the retry-hint surfacing (`test_retry_hint_surfaces_in_user_prompt`),
+   and step-recording invariants (`test_records_steps`). Uses the same
+   `FakeLLM` / `CyclingFakeLLM` stub pattern as `test_planner.py`.
+5. `tests/agents/test_mapping_tools.py` (new) — 3 minimal tests directly
+   covering `tool_submit_entity_mapping`: dict-form unmapped_attributes
+   round-trips, string-form round-trips, default value is an empty list.
+   These are the first direct tests for `agents.tools.mapping`; previously
+   it was exercised only indirectly via the auto-mapping agent.
+
+Modified files:
+- src/agents/agent_mapping_pge/generators/__init__.py (new)
+- src/agents/agent_mapping_pge/generators/entity.py (new)
+- src/agents/tools/mapping.py (added unmapped_attributes plumbing)
+- tests/agents/agent_mapping_pge/test_entity_generator.py (new)
+- tests/agents/test_mapping_tools.py (new)
+- changelogs/v0.3.1/2026-05-28.log (this file)
+
+Test result: 58/58 pass in `tests/agents/` (48 Sprint 1+2+3 baseline +
+7 new Sprint 4 generator tests + 3 new direct tool tests). No edits to
+out-of-scope files (engine_base.py, tracing.py, llm_utils.py,
+contracts.py, evaluator/, planner.py, agent_auto_assignment/,
+tools/sql.py, tools/metadata.py, tools/documents.py, tools/ontology.py,
+tools/context.py, tools/planner.py). Same pre-existing failures
+elsewhere in the full suite as before (3 lakebase tests, 80 e2e errors,
+all unrelated and present on baseline).
+
+
+====================================================================
+Sprint 6 — Semantic Critic + submit_evaluation tool
+====================================================================
+
+Context: Sprint 6 of the mapping-PGE redesign — the Semantic Critic
+(Evaluator stage 2). Sibling of the deterministic evaluator: runs only
+when stage 1 PASSES and audits ONE submitted mapping for semantic
+correctness. Bubble-to-planner signal sharpens around the wrong-table
+vs wrong-column distinction: wrong column = Generator retry, wrong
+table = Planner re-invocation.
+
+Changes:
+1. `src/agents/tools/evaluation.py` (new) — `tool_submit_evaluation`
+   terminal handler + `SUBMIT_EVALUATION_DEF` OpenAI function definition.
+   Validates `status` in {"PASS", "FAIL"} (invalid → success=False,
+   loop continues), synthesises a generic `semantic_audit` failure when
+   status=FAIL with empty failures[] so the report stays coherent, and
+   demotes `bubble_to_planner=True` when status=PASS (mirrors
+   `evaluator.report.build_report`). Stamps the resulting `EvalReport`
+   (stage="semantic") onto `ctx.semantic_eval_report`. Exports
+   `EVALUATION_TOOL_DEFINITIONS` / `EVALUATION_TOOL_HANDLERS` aggregates.
+2. `src/agents/tools/context.py` — added `semantic_eval_report:
+   Optional["EvalReport"] = None` using the same `TYPE_CHECKING`
+   forward-ref pattern already used for `source_model`. 5-line touch.
+3. `src/agents/agent_mapping_pge/evaluator/critic.py` (new) — the
+   `run_critic` agent. 4-tool surface (`sample_table`,
+   `get_documents_context`, `execute_sql`, `submit_evaluation`), under-3KB
+   system prompt with PASS / FAIL(no-bubble) / FAIL(bubble) rubric,
+   default `max_iterations=6`, 3-second inter-iteration sleep, MLflow
+   `@trace_agent` decorator, no single-shot fallback. The user prompt
+   surfaces AUDIT TARGET (kind/uri/label/comment; attributes for
+   entities, domain/range for relationships), SUBMITTED MAPPING,
+   PLANNER'S PREDICTION, STRUCTURAL CHECK METRICS (PASSED), and a
+   YOUR TASK reminder.
+4. `tests/agents/test_evaluation_tool.py` (new) — 5 direct tests
+   for `tool_submit_evaluation`: valid PASS round-trip, valid FAIL with
+   failures round-trip, invalid status rejection (no report stamped),
+   plus 2 export-aggregate sanity checks.
+5. `tests/agents/agent_mapping_pge/test_critic.py` (new) — 11 tests
+   covering the full Critic loop with a FakeLLM stub: PASS verdict,
+   FAIL-column (no bubble), FAIL-table (bubbles), PASS+bubble demotion,
+   FAIL with no failures synthesises one, invalid-status non-termination
+   then valid retry, text-only failure, iteration-budget exhaustion,
+   user prompt surfaces stage1 metrics, user prompt distinguishes
+   entity vs relationship (domain/range lines), and step-recording
+   invariants.
+
+Modified files:
+- src/agents/tools/evaluation.py (new)
+- src/agents/tools/context.py (added semantic_eval_report field)
+- src/agents/agent_mapping_pge/evaluator/critic.py (new)
+- tests/agents/test_evaluation_tool.py (new)
+- tests/agents/agent_mapping_pge/test_critic.py (new)
+- changelogs/v0.3.1/2026-05-28.log (this entry)
+
+Test result: 83/83 pass in `tests/agents/` (67 Sprint 1–5 baseline +
+11 new Critic tests + 5 new evaluation-tool tests). No edits to
+out-of-scope files: contracts.py, evaluator/{deterministic,report}.py,
+generators/, planner.py, engine_base.py, tracing.py, llm_utils.py,
+tools/{sql,metadata,documents,ontology,mapping,planner}.py,
+agent_auto_assignment/.
+
+## feat(mapping-pge): Sprint 8 — wire PGE engine into Mapping.py + remove legacy agent
+
+Context: Final sprint of the Planner -> Generator -> Evaluator redesign.
+Sprints 1-7 built `agents/agent_mapping_pge/` with a drop-in `run_agent`
+matching the legacy `agent_auto_assignment` signature. This sprint flips
+the switch: `Mapping.py` and `AgentClient.py` now import from the new
+engine, the PGE-specific extras (`source_model`, `mapping_evaluations`,
+`mapping_run_log`) are persisted on the session, and the legacy package
+is deleted.
+
+Changes:
+1. `src/back/objects/mapping/Mapping.py` — switched the
+   `auto_assign_with_agent` import to `agents.agent_mapping_pge`. Updated
+   the TYPE_CHECKING alias likewise. Accumulated the new PGE-extra
+   fields across chunks in `run_auto_assign_task` and passed them
+   through `save_mappings_to_session` (single-item flow does the same
+   in `run_single_auto_assign_task`).
+2. `src/back/objects/mapping/Mapping.py::save_mappings_to_session` —
+   extended the signature with three optional kwargs (`source_model`,
+   `mapping_evaluations`, `mapping_run_log`) and persisted them as
+   siblings of `entities`/`relationships` under the session
+   `assignment` bucket. `mapping_evaluations` merges by item key,
+   `mapping_run_log` appends; both stay backwards-compatible (None ->
+   no-op) so existing callers (and the R2RML parser) are unaffected.
+3. `src/back/core/agents/AgentClient.py` — switched
+   `run_auto_assignment` to import the new engine; updated the
+   TYPE_CHECKING alias and docstring.
+4. `src/agents/agent_auto_assignment/` — deleted (`__init__.py`,
+   `engine.py`, `tools.py`). The PGE pipeline is now the only mapping
+   agent in the repo.
+5. Scrubbed stale "agent_auto_assignment" mentions from `agent_mapping_pge`
+   docstrings/comments (engine.py, planner.py, generators/entity.py,
+   generators/relationship.py, __init__.py) so `grep -rn
+   "agent_auto_assignment" src/ tests/` returns zero hits.
+
+Modified files:
+- src/back/objects/mapping/Mapping.py (import swap + PGE-extras persistence)
+- src/back/core/agents/AgentClient.py (import swap)
+- src/agents/agent_mapping_pge/__init__.py (docstring scrub)
+- src/agents/agent_mapping_pge/engine.py (docstring scrub)
+- src/agents/agent_mapping_pge/planner.py (docstring + comment scrub)
+- src/agents/agent_mapping_pge/generators/entity.py (comment scrub)
+- src/agents/agent_mapping_pge/generators/relationship.py (comment scrub)
+- src/agents/agent_auto_assignment/__init__.py (deleted)
+- src/agents/agent_auto_assignment/engine.py (deleted)
+- src/agents/agent_auto_assignment/tools.py (deleted)
+- changelogs/v0.3.1/2026-05-28.log (this entry)
+
+Test result: 99/99 pass in `tests/agents/` (PGE-agent suite intact).
+40/40 pass in `tests/test_mapping_service.py` +
+`tests/test_workflow_mapping.py`. Full suite: 2157 passed; the 3
+pre-existing `test_settings_lakebase_status.py` failures and the
+Playwright `tests/e2e/` collection errors are unrelated to this
+change (also failing on the prior Sprint 1-7 commits per earlier
+changelog entries).
+
+Acceptance:
+- `grep -rn "agent_auto_assignment" src/ tests/` returns zero hits.
+- `from agents.agent_mapping_pge.engine import run_agent` is the new
+  import in `Mapping.py`.
+- Three PGE extras reach the session via `assignment.source_model`,
+  `assignment.mapping_evaluations`, `assignment.mapping_run_log`
+  (durable through `session_path.write_text`).
+- Public `Mapping.auto_assign_with_agent` signature unchanged.
+
diff --git a/changelogs/v0.5.0/2026-06-02.log b/changelogs/v0.5.0/2026-06-02.log
new file mode 100644
index 00000000..3581e0d3
--- /dev/null
+++ b/changelogs/v0.5.0/2026-06-02.log
@@ -0,0 +1,108 @@
+# v0.5.0 — 2026-06-02
+
+## PGE generation quality: exhaustive attributes + value harmonization
+
+### Context
+`newdomain` (ontobricks_pge_registry) was attribute-sparse: the OWL generator
+emitted a curated ~28-property ontology subset (vs V1.1's ~53), so most classes
+had empty `dataProperties` and the EntityGenerator produced ID+Label-only
+entities — unfit as the KPI/analytical source of truth. Root cause is generation
+QUALITY in two LLM-driven agents, not plumbing (the sync/finalize infra in
+`Ontology.finalize_class_attributes` already works and runs at session load).
+This change strengthens both generators so the PGE pipeline reproduces V1.1-level
+coverage from source introspection.
+
+### Changes
+1. `src/agents/agent_owl_generator/engine.py` — Part 1: exhaustive datatype-property
+   coverage.
+   - Added a `# ATTRIBUTE COVERAGE (CRITICAL — exhaustive, NOT curated)` section
+     to the system prompt: emit a DatatypeProperty for EVERY meaningful source
+     column per class across ALL covering trust tables; exclude only surrogate
+     keys / audit columns / FK columns; collapse cross-trust synonyms to one
+     property; lowerCamelCase `[a-z][A-Za-z0-9]*` names; rich clinical entities
+     warrant 6–11 properties; "2 props" is a floor not a target.
+   - Workflow now instructs `get_table_detail` on every covering table (so the
+     LLM sees the full column list past the 80-column get_metadata truncation).
+   - Softened the conflicting `GENERIC_GUIDELINES §2.1` minimalism line to defer
+     to the new coverage rule.
+2. `src/agents/agent_mapping_pge/generators/entity.py` — Part 2: value harmonization
+   + regex safety.
+   - Added a `VALUE HARMONIZATION` section: for coded attributes (method/status/
+     type/mode/outcome), discover raw distinct values (`SELECT DISTINCT`) then map
+     to ONE canonical lowercase token set with a CASE expression aliased to the
+     clean attribute name, using the SAME tokens across all UNION branches
+     (delivery method → caesarean/instrumental/vaginal worked example).
+   - Added a `REGEX SAFETY` section: always `[0-9]`/`[a-z]`, NEVER `\d`/`\w`/`\s`
+     (the build strips a lone backslash → `\d` degrades to literal `d`).
+   - Workflow step 3 now references harmonization.
+3. `src/agents/agent_mapping_pge/planner.py` — Part 2: multi-trust completeness.
+   - Added a COMPLETENESS rule to CANONICAL-KEY NORMALIZATION: include EVERY
+     covering trust table in `canonical_column_per_table`, not just the two
+     checked for overlap (omitting one drops 30–60% of instances and dangles
+     relationships). Planner already emits `[0-9]`-safe regex and mandates
+     multi-trust UNION — this reinforces full coverage during candidate discovery.
+4. `tests/agents/agent_mapping_pge/test_entity_generator.py` — added
+   `test_system_prompt_mandates_value_harmonization` (canonical token set,
+   discover-before-harmonize, `[0-9]`-not-`\d` regex safety).
+
+### Modified files
+- src/agents/agent_owl_generator/engine.py
+- src/agents/agent_mapping_pge/generators/entity.py
+- src/agents/agent_mapping_pge/planner.py
+- tests/agents/agent_mapping_pge/test_entity_generator.py
+
+### Test result
+`uv run --offline pytest tests/agents/agent_mapping_pge/ tests/units/ontology/test_owl_generator.py`
+— 101 passed → 102 passed with the new harmonization test. No regressions.
+
+---
+
+# v0.5.0 — 2026-06-03
+
+## OWL generator output ceiling + live newdomain regeneration
+
+### Context
+Exercising the v0.5.0 generation-quality changes end-to-end against the live
+`ontobricks-pge` registry surfaced one defect and confirmed the rest.
+
+### Changes
+1. `src/agents/agent_owl_generator/engine.py` — raised the LLM completion ceiling
+   from a hardcoded `max_tokens=4096` to a named `MAX_OUTPUT_TOKENS = 16000`
+   (both the main-loop and tools-unsupported fallback call sites). The exhaustive
+   `# ATTRIBUTE COVERAGE` prompt makes the Turtle output larger; at 4096 the final
+   statement was silently truncated (`finish_reason=length`) and OWL parsing failed.
+   Claude Opus supports large completions; 16k fits a full maternity ontology.
+
+### Live regeneration result (newdomain, ontobricks-pge / ontobricks_pge_registry)
+Ran the real PGE pipeline against the live workspace (endpoint
+`databricks-claude-opus-4-7`, warehouse `e6b70b0c07bbaa10`,
+`fiifi_cdm_demo_catalog`):
+- OWL generator → 17 classes, **60 datatype properties** (was 22), all KPI props.
+- Mapping-PGE → 15 entities + 12 relationships, every item passing the deterministic
+  evaluator, **72 attribute mappings**, multi-trust UNIONs + value harmonization,
+  `[0-9]`-safe canonical key, GROUP BY dedup. (ClinicalFinding / ClinicalProvider
+  honestly skipped — no source rows; they were ID+Label stubs in V1.1.)
+- One targeted re-run of Pregnancy via a retry-hint to DERIVE `bookingGestationWeeks`
+  (= `DATEDIFF(booking_date, lmp_date)/7`, a computed feature with no source column).
+- Overwrote `newdomain` v1 in Lakebase (rollback backup
+  `/tmp/newdomain_live_backup_2026-06-02.json`).
+
+### Independent verification (warehouse audit, round-tripped from Lakebase)
+- Entity-join dangling: **0.00% across all 12 relationships** (worst case 0.00%).
+- Delivery method mix vaginal 1227 / caesarean 849 / instrumental 297 (matches V1.1).
+- Pregnancy outcome delivered 1658 / transferred 715 / ongoing 74; feeding_status
+  mixed 549 / formula 543 / breast 542; booking gestation avg 10.5 wks; Pregnancy
+  hub 2463 rows = 2463 distinct IDs.
+- All 5 KPI columns present: deliveryMethod, outcomeStatus, bookingGestationWeeks,
+  contactType (=contact_type), feedingStatus (=feeding_status).
+
+NOTE (next phase, out of scope here): the downstream `canonical.*` UC views +
+metric views were built from the prior hand-copy's entity SQLs and must be
+republished from the regenerated newdomain SQLs.
+
+### Modified files
+- src/agents/agent_owl_generator/engine.py
+
+### Test result
+`uv run --offline pytest tests/agents/agent_mapping_pge/ tests/units/ontology/test_owl_generator.py`
+— 102 passed. No regressions.
diff --git a/changelogs/v0.5.0/FiifiB_2026-06-16.log b/changelogs/v0.5.0/FiifiB_2026-06-16.log
new file mode 100644
index 00000000..90d2ebaf
--- /dev/null
+++ b/changelogs/v0.5.0/FiifiB_2026-06-16.log
@@ -0,0 +1,73 @@
+# v0.5.0 — 2026-06-16 (FiifiB)
+
+## PGE intrinsic-evaluation toolkit + ontology-gen Evaluator stage + in-app scorecard
+
+### Context
+The PGE pipeline (ontology + mapping generation) had no measurable, gold-free
+stopping condition. This adds a **usecase-agnostic intrinsic scorecard**
+(`src/agents/pge_eval/`), wires it to run **in-app** after generation/mapping,
+turns `agent_owl_generator` into a real PGE loop via a new Evaluator stage, and
+ships a CLI. No golden/reference labels are encoded (correctness is measured by
+internal consistency against the actual runtime inputs), so it works for any
+domain — not just the NHS CDM/maternity demo it was developed against.
+
+### Changes
+1. `src/agents/pge_eval/` (new package) — the scorer:
+   - `scorecard.py` — `score_artifact`, the single offline-testable scoring
+     core; emits the scorecard JSON + GREEN/RED verdict + exit code.
+   - `ontology_metrics.py` / `mapping_metrics.py` / `pipeline_metrics.py` —
+     Stage-1 (footprint coverage, orphan/dangling/naming/duplicate), Stage-2
+     (entity/rel/attribute completeness, dangling-FK max, id-integrity, sql-exec
+     failures, conditional cross-source band), and pipeline (coverage_loss +
+     convergence, advisory/tracked only).
+   - `gates.py` — three tiers: absolute / ratio (`--gate-ratios`) / direction-
+     aware self-baseline regression.
+   - `baseline.py` — Tier-3 self-baseline store in `logs/goals/` (baseline = most
+     recent GREEN; RED runs never baseline).
+   - `judge.py` — advisory LLM-judge; the only network path; never gates; skipped
+     under `--no-judge` (zero network).
+   - `normalize.py` / `loaders.py` — shape-agnostic ontology/metadata
+     normalisation + domain-agnostic live-run input loaders.
+   - `inapp.py` — fail-safe in-app hooks (deterministic, never raise).
+2. `scripts/goals_eval.py` (new) — CLI with `score` / `run` subcommands and
+   `--no-judge` / `--gate-ratios`; `run` is domain-agnostic
+   (`--registry-json`/`--version` or `--ontology`/`--metadata`). Exit code =
+   verdict.
+3. `src/agents/agent_owl_generator/engine.py` — new **Evaluator stage**: after
+   the (upstream) pitfall-tool loop settles, run the Stage-1 deterministic checks
+   and feed retry-hints back into generation, bounded by `MAX_OWL_EVAL_ROUNDS`;
+   cleans prose/markdown preambles before parsing; de-maternified the ATTRIBUTE
+   COVERAGE prompt (domain-neutral examples).
+4. In-app scorecard hooks — `src/api/routers/internal/ontology.py` and
+   `src/back/objects/mapping/Mapping.py`: after generation/mapping, attach
+   `pge_scorecard` to the task result and append the verdict to the completion
+   message (kept alongside upstream's per-iteration generation score).
+5. Generality hardening — de-maternified the mapping VALUE HARMONIZATION
+   (`generators/entity.py`) and planner CANONICAL-KEY NORMALIZATION
+   (`planner.py`) prompts; broadened `test_no_domain_hardcoding` to also scan the
+   generator prompt files.
+6. `scripts/smoke_pge.py` — artifact dump embeds `ontology`/`metadata`/
+   `elapsed_s` so score-only is self-contained.
+7. Docs — Sphinx `docs/sphinx/api/agents.rst` "PGE Intrinsic Evaluation" section
+   (all 11 modules) + README "PGE Intrinsic Evaluation (quality scorecard)"
+   section.
+8. Integrated 65 upstream `master` commits (mid-0.5.0) via merge `198f38e`;
+   re-landed the Evaluator stage onto upstream's new tool-driven pitfall loop.
+
+### Modified / added files
+- src/agents/pge_eval/*.py (new: __init__, scorecard, normalize, ontology_metrics,
+  mapping_metrics, pipeline_metrics, gates, baseline, judge, inapp, loaders)
+- scripts/goals_eval.py (new), scripts/smoke_pge.py
+- src/agents/agent_owl_generator/engine.py
+- src/agents/agent_mapping_pge/generators/entity.py, src/agents/agent_mapping_pge/planner.py
+- src/api/routers/internal/ontology.py, src/back/objects/mapping/Mapping.py
+- docs/sphinx/api/agents.rst, README.md
+- tests/units/pge_eval/* (new), tests/agents/agent_mapping_pge/{test_entity_generator,test_planner}.py
+
+### Test result
+- `pytest tests/units tests/agents` → **2528 passed, 11 skipped, 0 failed**
+  (post-merge with origin/master).
+- Evidence captured during development: clean artifact → GREEN/exit 0; seeded
+  dangling-FK → RED/exit 1; Tier-3 regression → RED; `--no-judge` zero-network
+  verified; grep clean for trust_a/b/c, preg, maternity, NHS, spr in scorer +
+  generator prompts.
diff --git a/changelogs/v0.5.0/FiifiB_2026-06-19.log b/changelogs/v0.5.0/FiifiB_2026-06-19.log
new file mode 100644
index 00000000..19383ea0
--- /dev/null
+++ b/changelogs/v0.5.0/FiifiB_2026-06-19.log
@@ -0,0 +1,67 @@
+# v0.5.0 — PGE Run-Visualizer (make the mapping PGE loop demoable in the UI)
+
+## Context
+
+The mapping PGE pipeline (Planner → Generator → Evaluator → Critic) already
+computes rich artifacts — the planner's `source_model`, per-item
+`mapping_evaluations` (EvalReports), an attempt-by-attempt `mapping_run_log`,
+and an intrinsic-eval `pge_scorecard` — but the UI surfaced almost none of it.
+Auto-Map showed a progress bar, a flat per-item results table, and appended
+the scorecard verdict to a toast string. The orchestration (planner picks,
+generator attempts, evaluator/critic verdicts, retry hints, re-plan
+escalations) was invisible, which made the agentic loop "hard to demo on the
+OntoBricks UI."
+
+This change exposes the PGE loop in the Batch Auto-Map report without touching
+any agent logic: a new client-side run-visualizer renders the loop from the
+artifacts the engine already produces, and the two artifacts that were
+persisted-to-session-only are now also surfaced on the polled task result.
+
+## Changes
+
+1. `src/back/objects/mapping/Mapping.py` — `run_auto_assign_task` (batch) and
+   `run_single_auto_assign_task` (single): added `source_model`,
+   `mapping_evaluations`, and `mapping_run_log` to the `complete_task` result
+   dict. These were already accumulated and persisted to the session via
+   `save_mappings_to_session`; they are now also returned on the polled task so
+   the UI can render the loop from `GET /tasks/{id}` with no extra round-trip.
+   `pge_scorecard` was already present (computed by `score_mapping_run`).
+2. `src/front/static/mapping/js/mapping-pge-visualizer.js` — NEW. `PgeVisualizer`
+   module: renders the verdict pill + KPI strip (entity/relationship
+   completeness, id-integrity, sql-exec failures, coverage-loss), the three
+   gate tiers (absolute / ratio / regression), a per-item loop trace
+   (Generator › Evaluator › Critic chain with attempt numbers, retry hints, and
+   re-plan markers) enriched with per-item eval metrics, and a collapsible
+   planner source-model panel (table→class candidates with confidence bars,
+   canonical ids, join keys with overlap, planner skips). Fully defensive — any
+   field may be missing; renders nothing if there is no PGE payload.
+3. `src/front/static/mapping/css/mapping-pge-visualizer.css` — NEW. Component
+   styles (ob-pge-* classes) for the card, KPI strip, gate chips, loop-trace
+   timeline, and source-model tables. Bootstrap 5.3 for everything else.
+4. `src/front/static/mapping/js/mapping-autoassign.js` — stash `this.taskResult`
+   in both completion paths (live monitor + resumed task) and call
+   `PgeVisualizer.render(...)` from `showReport()`; clear it in `reset()`.
+5. `src/front/templates/partials/mapping/_mapping_autoassign.html` — added the
+   `#autoAssignPgeVisualizer` mount container inside the report section.
+6. `src/front/templates/mapping.html` — included the new CSS + JS assets.
+
+## Modified / new files
+
+- src/back/objects/mapping/Mapping.py (modified)
+- src/front/static/mapping/js/mapping-pge-visualizer.js (new)
+- src/front/static/mapping/css/mapping-pge-visualizer.css (new)
+- src/front/static/mapping/js/mapping-autoassign.js (modified)
+- src/front/templates/partials/mapping/_mapping_autoassign.html (modified)
+- src/front/templates/mapping.html (modified)
+
+## Tests
+
+- `pytest tests/agents/agent_mapping_pge/` — 82 passed, 1 warning (no regression
+  from the task-result additions; the engine/contracts are untouched).
+- `node --check` passes on both modified/new JS modules.
+- `python -m py_compile` passes on Mapping.py.
+- Visual verification: rendered the visualizer against a realistic synthetic
+  task result via headless Chrome (file:// harness). Confirmed: verdict pill,
+  KPI strip, three gate tiers, per-item loop chains (incl. a 2-attempt
+  null-id-hint retry and a re-plan escalation), per-item eval metrics, and the
+  planner source-model panel all render correctly. Screenshots captured.
diff --git a/changelogs/v0.5.0/FiifiB_2026-06-20.log b/changelogs/v0.5.0/FiifiB_2026-06-20.log
new file mode 100644
index 00000000..ccb59797
--- /dev/null
+++ b/changelogs/v0.5.0/FiifiB_2026-06-20.log
@@ -0,0 +1,42 @@
+# v0.5.0 — PGE Run-Visualizer: reasoning-wrap fix + live verification
+
+## Context
+
+Continuation of the PGE run-visualizer (see FiifiB_2026-06-19.log). Verified the
+visualizer end-to-end with a real, live PGE auto-map run on the deployed app
+(ontobricks-pgeviz). The live run surfaced one rendering issue not present in the
+synthetic-data tests: the semantic critic's `reasoning` (carried in
+`EvalReport.metrics.reasoning`) is long free text that rendered inline and
+overflowed the viewport.
+
+## Changes
+
+1. `src/front/static/mapping/js/mapping-pge-visualizer.js` — `renderItem()`:
+   split per-item eval metrics into short scalar metrics (rendered inline as
+   before) and long free-text fields >60 chars such as `reasoning` (now rendered
+   as a wrapped `.ob-pge-reasoning` block instead of a nowrap inline span).
+2. `src/front/static/mapping/css/mapping-pge-visualizer.css` — added
+   `.ob-pge-reasoning` (white-space: normal, overflow-wrap: anywhere) so long
+   critic reasoning wraps cleanly.
+
+## Modified files
+
+- src/front/static/mapping/js/mapping-pge-visualizer.js
+- src/front/static/mapping/css/mapping-pge-visualizer.css
+
+## Tests / verification
+
+- `node --check` passes on the visualizer JS.
+- LIVE end-to-end verification on deployed app `ontobricks-pgeviz`
+  (https://ontobricks-pgeviz-7474646666236453.aws.databricksapps.com):
+  loaded trust_a.maternity_episode metadata → applied a maternity ontology
+  (MaternityEpisode, Baby) → ran a real PGE auto-assign → both entities PASS in
+  1 attempt. The deployed PgeVisualizer rendered the real run: stage chips,
+  RED verdict, KPI strip (entity/rel/id-integrity 100%, 0 sql failures,
+  0% coverage-loss), gate tiers (Tier-1 absolute FAILED on orphan_class_count=2,
+  Tier-2/3 pass), per-item Generator→Evaluator→Critic chains with full wrapped
+  critic reasoning, per-item metrics, and the planner source-model (candidate
+  confidences 98%/90%, canonical-id analysis with real row counts 1228/776).
+  Zero console errors. Screenshot captured.
+- Deployment used the dev-lakebase target against a dedicated Lakebase project
+  (ontobricks-pgeviz-db) for full isolation from shared production.
diff --git a/docs/INFO.md b/docs/INFO.md
index 1b68e3cb..06ba8196 100644
--- a/docs/INFO.md
+++ b/docs/INFO.md
@@ -323,7 +323,7 @@ src/
 │   ├── llm_utils.py             # Shared LLM call with retry
 │   ├── tools/                   # Shared agent tools (context, metadata, SQL, …)
 │   ├── agent_owl_generator/     # OWL ontology generation agent
-│   ├── agent_auto_assignment/   # Entity/relationship → SQL mapping agent
+│   ├── agent_mapping_pge/       # Mapping PGE pipeline (Planner → Generator → Evaluator)
 │   ├── agent_auto_icon_assign/  # Emoji icon mapping agent
 │   └── agent_ontology_assistant/# Conversational assistant + ResponsesAgent wrapper
 │
diff --git a/docs/architecture.md b/docs/architecture.md
index 5a87799a..5fc9baf0 100644
--- a/docs/architecture.md
+++ b/docs/architecture.md
@@ -20,7 +20,7 @@ Under the hood, SPARQL translates ontology mappings into Spark SQL — users nev
 | **User Interface** | Bootstrap 5.3 + OntoViz visual editor + Sigma.js / D3.js graph views |
 | **MCP Server** | Separate Databricks App (`mcp-ontobricks`) exposing knowledge-graph tools to LLM clients (Cursor, Claude Desktop, Playground) |
 | **FastAPI Application** | Routes → Domain Objects → Core layered architecture with GlobalConfigService, PermissionService, and BuildScheduler |
-| **LLM Agents** | MLflow-traced agentic loops for ontology generation, auto-mapping, icon mapping, and conversational assistance |
+| **LLM Agents** | MLflow-traced agentic loops for ontology generation, mapping (Planner/Generator/Evaluator), icon mapping, and conversational assistance |
 | **Reasoning Engine** | OWL 2 RL deductive closure, SWRL rules (compiled to SQL), graph reasoning, and constraint validation |
 | **Triple Store Backends** | Delta-backed view in Unity Catalog plus a pluggable Graph DB engine (currently Lakebase Postgres) via the `GraphDBFactory` pattern, with BFS, shortest path, and transitive closure built in |
 | **Databricks Platform** | Unity Catalog (metadata & governance), SQL Warehouse (query execution), UC Volumes (shared storage) |
@@ -448,7 +448,7 @@ src/
 │   ├── tracing.py                      # MLflow tracing setup & decorators
 │   ├── tools/                          # Shared agent tools (ontology, mapping, metadata, SQL, etc.)
 │   ├── agent_owl_generator/            # OWL ontology generation agent
-│   ├── agent_auto_assignment/          # Entity/relationship → SQL mapping agent
+│   ├── agent_mapping_pge/              # Mapping PGE pipeline (Planner → Generator → Evaluator)
 │   ├── agent_auto_icon_assign/         # Emoji icon mapping agent
 │   └── agent_ontology_assistant/       # Conversational assistant + ResponsesAgent wrapper
 │
@@ -1512,36 +1512,41 @@ In addition to the UI-driven agents, OntoBricks provides an **MCP server** (`mcp
 
 ---
 
-#### 2. Auto-Mapping Agent (`agent_auto_assignment`)
+#### 2. Mapping PGE Pipeline (`agent_mapping_pge`)
 
-**Purpose**: Autonomously map ontology entities and relationships to SQL queries against the domain's Databricks tables. The agent writes SQL, validates it by executing queries, and submits the finalized mappings.
+**Purpose**: Autonomously map ontology entities and relationships to SQL queries against the domain's Databricks tables. The pipeline replaces the legacy single-loop agent with a Planner → Generator → Evaluator (PGE) decomposition: a global planner produces a typed `SourceModel`, narrow per-item generators emit SQL against that plan, and a two-stage evaluator gates every mapping with deterministic checks plus a semantic critic.
 
-| Parameter | Value |
-|-----------|-------|
-| Max iterations | 60 (batch) / 15 (single-item) |
-| LLM timeout | 180s |
-| Max tokens | 2048 |
-| Temperature | 0.1 |
-| Iteration delay | 3s between LLM calls |
-| Chunk size | 5 items per agent run (`AUTO_ASSIGN_CHUNK_SIZE`) |
-| Chunk cooldown | 15s between chunks (`AUTO_ASSIGN_CHUNK_COOLDOWN`) |
+**Components**:
 
-**Workflow**:
-1. Calls `get_ontology` to see entities, relationships, and their attributes
-2. Calls `get_metadata` to understand available tables and columns
-3. For each entity/relationship:
-   - Writes a SQL query using `execute_sql` to validate it
-   - Iterates on SQL errors until the query succeeds
-   - Calls `submit_entity_mapping` or `submit_relationship_mapping` to finalize
-4. Repeats until all items are mapped or iteration limit is reached
+| Component | Module | Role |
+|-----------|--------|------|
+| **Planner** | `agent_mapping_pge/planner.py::run_planner` | Single-invocation agent. Consumes ontology + table metadata + imported documents, probes data with `sample_table` / `column_value_overlap` / `distinct_count`, and emits a validated `SourceModel` (per-table role candidates, canonical-ID map, intra-trust and cross-source join keys, ordered mapping plan). |
+| **EntityGenerator** | `agent_mapping_pge/generators/entity.py::run_entity_generator` | Narrow agent that maps ONE ontology class given a filtered `SourceModel` slice. Emits SQL with `AS ID` aliasing the canonical ID column; populates `attribute_mappings` or explicit `unmapped_attributes` (no silent drops). |
+| **RelationshipGenerator** | `agent_mapping_pge/generators/relationship.py::run_relationship_generator` | Sibling generator for ontology properties. Constrains endpoint columns to the IDs the source/target entities were already mapped on. |
+| **Deterministic Evaluator** | `agent_mapping_pge/evaluator/deterministic.py` | Pure-Python Stage 1 checks: `row_count`, distinct IDs, null IDs, dangling source/target percentages, cross-source overlap band. Fast and reproducible — gates Stage 2. |
+| **Semantic Critic** | `agent_mapping_pge/evaluator/critic.py` | LLM agent that runs ONLY when Stage 1 passes. Audits semantic correctness with `sample_table`, `execute_sql`, `get_documents_context`. Submits a verdict via the `submit_evaluation` terminal tool. |
+| **Orchestrator** | `agent_mapping_pge/engine.py::run_agent` | Drop-in replacement for the legacy `run_agent`. Persists the `SourceModel`, per-item `EvalReport`s, and a `mapping_run_log` on the session via `Mapping.save_mappings_to_session`. |
+| **Contracts** | `agent_mapping_pge/contracts.py` | Pydantic models: `SourceModel`, `EntityMappingDraft`, `RelationshipMappingDraft`, `EvalReport`. |
 
-**Tools used**: `get_ontology`, `get_metadata`, `execute_sql`, `submit_entity_mapping`, `submit_relationship_mapping`
+**Per-item loop**:
+1. Generator emits a draft mapping against the Planner's slice for the target class/property.
+2. Deterministic Evaluator runs Stage 1 checks. On failure → return hints to the Generator.
+3. If Stage 1 passes, Semantic Critic audits the draft and returns a verdict via `submit_evaluation`.
+4. Up to 3 Generator → Evaluator attempts per item with hint-driven retry.
+5. Persistent semantic or structural failure → bubble up to Planner; the orchestrator triggers a global replan (max 2 replans across the run).
+
+**Tools used**:
+- Planner: `sample_table`, `column_value_overlap`, `distinct_count`, `submit_source_model`, `get_metadata`, `get_documents_context`
+- Generators: `get_ontology`, `get_metadata`, `execute_sql`, `submit_entity_mapping`, `submit_relationship_mapping`
+- Critic: `sample_table`, `execute_sql`, `get_documents_context`, `submit_evaluation`
 
 **Invoked by**:
-- **Batch**: `POST /mapping/auto-assign/start` → background thread → TaskManager. Large jobs are split into chunks of `AUTO_ASSIGN_CHUNK_SIZE` items; each chunk runs its own agent loop with a `AUTO_ASSIGN_CHUNK_COOLDOWN` pause between chunks to avoid LLM rate limits (429 errors). Partial results accumulate across chunks.
-- **Single-item**: `POST /mapping/auto-assign/single` → background thread → TaskManager (processes one entity or relationship)
+- **Batch**: `POST /mapping/auto-assign/start` → background thread → TaskManager.
+- **Single-item**: `POST /mapping/auto-assign/single` → background thread → TaskManager (processes one entity or relationship).
+
+The public `Mapping.auto_assign_with_agent` API and the `on_step(msg, pct)` progress callback are unchanged — this is a transparent under-the-hood swap. New persisted artifacts (`source_model`, `mapping_evaluations`, `mapping_run_log`) are written to the session and also returned on the polled task result (`GET /tasks/{id}`), where the Batch Auto-Map report's PGE run-visualizer renders them.
 
-**Single-item mode**: The same agent engine is used with `max_iterations=15`. The ontology payload is scoped to the single target item. The frontend fires the request, polls `/tasks/{id}`, and saves the result directly to `MappingState.config` by URI — enabling concurrent auto-maps on different items.
+**Single-item mode**: The orchestrator scopes the Planner's `SourceModel` to the target class/property and runs the same Generator → Evaluator loop. The frontend fires the request, polls `/tasks/{id}`, and saves the result directly to `MappingState.config` by URI — enabling concurrent maps on different items.
 
 ---
 
@@ -1608,10 +1613,15 @@ All tools live in `src/agents/tools/` and follow a consistent pattern:
 | `get_table_detail` | `metadata.py` | Returns detailed schema for a specific table | OWL Generator |
 | `list_documents` | `documents.py` | Lists uploaded domain documents from Unity Catalog | OWL Generator |
 | `read_document` | `documents.py` | Reads content of a specific document | OWL Generator |
-| `get_ontology` | `ontology.py` | Returns current ontology (entities, relationships, attributes) | Auto-Mapping, Icon Mapping, Ontology Assistant |
-| `execute_sql` | `sql.py` | Executes a SQL query via Databricks SQL Warehouse | Auto-Mapping |
-| `submit_entity_mapping` | `mapping.py` | Saves a validated entity → SQL mapping | Auto-Mapping |
-| `submit_relationship_mapping` | `mapping.py` | Saves a validated relationship → SQL mapping | Auto-Mapping |
+| `get_ontology` | `ontology.py` | Returns current ontology (entities, relationships, attributes) | Mapping PGE (Generators), Icon Mapping, Ontology Assistant |
+| `execute_sql` | `sql.py` | Executes a SQL query via Databricks SQL Warehouse | Mapping PGE (Generators, Critic) |
+| `submit_entity_mapping` | `mapping.py` | Saves a validated entity → SQL mapping | Mapping PGE (EntityGenerator) |
+| `submit_relationship_mapping` | `mapping.py` | Saves a validated relationship → SQL mapping | Mapping PGE (RelationshipGenerator) |
+| `sample_table` | `planner.py` | Returns N sample rows from a table | Mapping PGE (Planner, Critic) |
+| `column_value_overlap` | `planner.py` | Reports value overlap between two columns (cross-source join probe) | Mapping PGE (Planner) |
+| `distinct_count` | `planner.py` | Returns the distinct-value count for a column | Mapping PGE (Planner) |
+| `submit_source_model` | `planner.py` | Terminal tool — submits the Planner's validated `SourceModel` | Mapping PGE (Planner) |
+| `submit_evaluation` | `evaluation.py` | Terminal tool — submits the Critic's `EvalReport` verdict | Mapping PGE (Critic) |
 | `assign_icons` | `icons.py` | Saves entity → emoji icon mapping | Icon Mapping |
 
 #### ToolContext
@@ -1629,11 +1639,12 @@ class ToolContext:
     # OWL Generator
     uc_location: dict        # Unity Catalog file location
 
-    # Auto-Mapping
+    # Mapping PGE (Planner, Generators, Critic)
     client: Any              # DatabricksClient for SQL execution
     ontology: dict           # Current ontology data
     entity_mappings: list    # Accumulated entity mapping results
     relationship_mappings: list  # Accumulated relationship mapping results
+    source_model: Any        # Planner-emitted SourceModel (set after planning)
 
     # Icon Assign
     icon_results: dict       # Accumulated icon assignments
@@ -1645,7 +1656,7 @@ Each agent populates only the fields it needs; unused fields remain at their def
 
 ### Agent Engine Pattern
 
-All three agents share the same engine structure (defined independently in each `engine.py`):
+Each agent (OWL Generator, Icon Assign, Ontology Assistant, and the Planner, Generators, and Critic inside the Mapping PGE pipeline) shares the same ReAct-style engine structure (defined independently in each agent's own module):
 
 #### Core Loop
 
@@ -1674,7 +1685,7 @@ All three agents share the same engine structure (defined independently in each
 
 #### Fallback Mode
 
-If the LLM endpoint returns HTTP 400/422 (indicating it doesn't support the `tools` parameter), the OWL Generator and Icon Assign agents automatically retry without tools, falling back to single-shot generation. The Auto-Mapping agent does not fall back because its workflow fundamentally requires tool calls (SQL execution, mapping submission).
+If the LLM endpoint returns HTTP 400/422 (indicating it doesn't support the `tools` parameter), the OWL Generator and Icon Assign agents automatically retry without tools, falling back to single-shot generation. The Mapping PGE pipeline does not fall back because every stage (Planner, Generators, Critic) fundamentally requires tool calls (data probing, SQL execution, terminal submission tools).
 
 #### Task Integration
 
diff --git a/docs/code_organization.md b/docs/code_organization.md
index aa27fb3d..2fa35244 100644
--- a/docs/code_organization.md
+++ b/docs/code_organization.md
@@ -288,7 +288,7 @@ Examples under `src/agents/`:
 - **`agent_ontology_assistant`** — conversational edits to the loaded ontology (exposed via e.g. `ontology_assistant_chat` in `front/routes/ontology.py`).
 - **`agent_owl_generator`** — generates OWL from natural language (async task wrapper in ontology routes).
 - **`agent_auto_icon_assign`** — suggests emoji icons for entities.
-- **`agent_auto_assignment`** — automated mapping or assignment workflows (see package for details).
+- **`agent_mapping_pge`** — Mapping PGE pipeline (Planner → Generators → Evaluator) that maps ontology entities and relationships to Spark SQL. Composed of `planner.py`, `generators/{entity,relationship}.py`, and `evaluator/{deterministic,critic}.py`, orchestrated by `engine.py::run_agent`.
 
 Shared utilities include **`agents.engine_base`** (shared `AgentStep` data class, `call_serving_endpoint` for LLM calls, `dispatch_tool` for tool execution, `extract_message_content` for response parsing, and `accumulate_usage` for token tracking), **`agents.llm_utils`** (retry/backoff logic), **`agents.tools.context`** (`ToolContext` for domain/session-aware tool execution), and **`agents.tracing`** (initialized from app `lifespan` in `src/shared/fastapi/main.py` via `setup_tracing()`).
 
diff --git a/docs/data-access.md b/docs/data-access.md
index a70ebf2e..db9b3876 100644
--- a/docs/data-access.md
+++ b/docs/data-access.md
@@ -217,7 +217,7 @@ These agents do not query the triple store at runtime; they operate on the
 | Agent | Purpose | Tools call | Wrapper | Engine |
 |---|---|---|---|---|
 | `agent_owl_generator` | Build an OWL ontology from metadata + documents | `metadata.list_tables`, `metadata.preview_table`, `documents.read`, `ontology.write_owl` | REST + Spark SQL (samples) | `databricks-sql-connector` against UC tables, plus rdflib write |
-| `agent_auto_assignment` | Map ontology entities to Spark SQL queries | `tables.list`, `tables.sample`, `mapping.write` | REST + Spark SQL (samples) | Same as above; output stored as R2RML |
+| `agent_mapping_pge` | Map ontology entities and relationships to Spark SQL queries via a Planner → Generator → Evaluator pipeline | `tables.list`, `tables.sample`, `column_value_overlap`, `distinct_count`, `execute_sql`, `mapping.write`, `submit_evaluation` | REST + Spark SQL (samples + validation) | Same as above; output stored as R2RML |
 | `agent_auto_icon_assign` | Pick emojis for entities | Inspects ontology + metadata | REST | None — generation only |
 | `agent_ontology_assistant` | Conversational ontology editing | Dozens of tools mutating the in-session ontology | REST | Python ontology object model |
 | `agent_dtwin_chat` | Conversational graph querying | See §6 | REST + **GraphQL** + **SPARQL** | **Spark SQL** + **Cypher** (engine-side) |
diff --git a/docs/deployment.md b/docs/deployment.md
index 210ce8e5..3ab7f0f2 100644
--- a/docs/deployment.md
+++ b/docs/deployment.md
@@ -1299,7 +1299,7 @@ OntoBricks agents are instrumented with MLflow tracing. When deployed to Databri
 
 - `MLFLOW_TRACKING_URI=databricks` is set in `app.yaml`
 - Application startup in `src/shared/fastapi/main.py` calls `setup_tracing()`, which creates the `/Shared/ontobricks-agents` experiment
-- Every agent call (OWL Generator, Auto-Mapping, Auto Icon Assign, Ontology Assistant) produces a span tree:
+- Every agent call — OWL Generator, Mapping PGE pipeline (Planner, Generators, Critic), Auto Icon Assign, Ontology Assistant — produces a span tree:
 
 ```
 AGENT (run_agent)
diff --git a/docs/sphinx/api/agents.rst b/docs/sphinx/api/agents.rst
index 311afa26..35450564 100644
--- a/docs/sphinx/api/agents.rst
+++ b/docs/sphinx/api/agents.rst
@@ -22,24 +22,59 @@ Tracing
    :undoc-members:
    :show-inheritance:
 
-Auto Assignment Agent
----------------------
+Mapping PGE Pipeline
+--------------------
 
-.. automodule:: agents.agent_auto_assignment
+The Mapping PGE pipeline replaces the legacy single-loop auto-assignment agent
+with a Planner → Generator → Evaluator decomposition: a global planner emits a
+typed ``SourceModel``, narrow per-item generators produce SQL against that
+plan, and a two-stage evaluator (deterministic checks + semantic critic) gates
+every mapping.
+
+.. automodule:: agents.agent_mapping_pge
    :members:
    :undoc-members:
    :show-inheritance:
 
-.. automodule:: agents.agent_auto_assignment.engine
+.. automodule:: agents.agent_mapping_pge.engine
    :members:
    :undoc-members:
    :show-inheritance:
 
-.. automodule:: agents.agent_auto_assignment.tools
+.. automodule:: agents.agent_mapping_pge.planner
+   :members:
+   :undoc-members:
+   :show-inheritance:
+
+.. automodule:: agents.agent_mapping_pge.contracts
+   :members:
+   :undoc-members:
+   :show-inheritance:
+
+.. automodule:: agents.agent_mapping_pge.generators.entity
+   :members:
+   :undoc-members:
+   :show-inheritance:
+
+.. automodule:: agents.agent_mapping_pge.generators.relationship
+   :members:
+   :undoc-members:
+   :show-inheritance:
+
+.. automodule:: agents.agent_mapping_pge.evaluator.deterministic
+   :members:
+   :undoc-members:
+   :show-inheritance:
+
+.. automodule:: agents.agent_mapping_pge.evaluator.critic
+   :members:
+   :undoc-members:
+   :show-inheritance:
+
+.. automodule:: agents.agent_mapping_pge.evaluator.report
    :members:
    :undoc-members:
    :show-inheritance:
-   :exclude-members: ToolContext
 
 Auto Icon Assignment Agent
 --------------------------
@@ -168,6 +203,16 @@ Shared Tools
    :undoc-members:
    :show-inheritance:
 
+.. automodule:: agents.tools.planner
+   :members:
+   :undoc-members:
+   :show-inheritance:
+
+.. automodule:: agents.tools.evaluation
+   :members:
+   :undoc-members:
+   :show-inheritance:
+
 .. automodule:: agents.tools.sql
    :members:
    :undoc-members:
@@ -177,3 +222,68 @@ Shared Tools
    :members:
    :undoc-members:
    :show-inheritance:
+
+PGE Intrinsic Evaluation
+------------------------
+
+A usecase-agnostic, gold-free scorecard for the PGE pipeline (ontology +
+mapping generation). Intrinsic structural/self-consistency metrics plus an
+advisory LLM-judge — no stored reference answer. The deterministic core
+(``score_artifact``) ingests a captured ``AgentResult`` artifact and emits the
+scorecard JSON with zero LLM calls; the in-app hooks run it live after
+generation/mapping; the CLI lives in ``scripts/goals_eval.py``.
+
+.. automodule:: agents.pge_eval
+   :members:
+   :undoc-members:
+   :show-inheritance:
+
+.. automodule:: agents.pge_eval.scorecard
+   :members:
+   :undoc-members:
+   :show-inheritance:
+
+.. automodule:: agents.pge_eval.normalize
+   :members:
+   :undoc-members:
+   :show-inheritance:
+
+.. automodule:: agents.pge_eval.ontology_metrics
+   :members:
+   :undoc-members:
+   :show-inheritance:
+
+.. automodule:: agents.pge_eval.mapping_metrics
+   :members:
+   :undoc-members:
+   :show-inheritance:
+
+.. automodule:: agents.pge_eval.pipeline_metrics
+   :members:
+   :undoc-members:
+   :show-inheritance:
+
+.. automodule:: agents.pge_eval.gates
+   :members:
+   :undoc-members:
+   :show-inheritance:
+
+.. automodule:: agents.pge_eval.baseline
+   :members:
+   :undoc-members:
+   :show-inheritance:
+
+.. automodule:: agents.pge_eval.judge
+   :members:
+   :undoc-members:
+   :show-inheritance:
+
+.. automodule:: agents.pge_eval.inapp
+   :members:
+   :undoc-members:
+   :show-inheritance:
+
+.. automodule:: agents.pge_eval.loaders
+   :members:
+   :undoc-members:
+   :show-inheritance:
diff --git a/scripts/goals_eval.py b/scripts/goals_eval.py
new file mode 100644
index 00000000..f354170e
--- /dev/null
+++ b/scripts/goals_eval.py
@@ -0,0 +1,238 @@
+"""goals_eval — OntoBricks PGE intrinsic-evaluation CLI.
+
+Two subcommands:
+
+    score   evaluate a captured AgentResult artifact (cheap, deterministic,
+            re-runnable). Consumes the JSON dumped by scripts/smoke_pge.py.
+
+                $ .venv/bin/python scripts/goals_eval.py score <artifact.json> \
+                    [--no-judge] [--gate-ratios]
+
+    run     run the mapping PGE pipeline live, dump an artifact, then score it.
+            A thin wrapper around score-only (D6).
+
+                $ .venv/bin/python scripts/goals_eval.py run [--gate-ratios] \
+                    [--no-judge]
+
+Flags:
+    --no-judge      skip the advisory LLM-judge (the ONLY LLM/network path).
+                    Deterministic metrics always run with zero LLM calls.
+    --gate-ratios   promote Tier-2 ratio warnings to hard gates for this run.
+
+The process exit code is the scorecard verdict: 0 == GREEN, non-zero == RED.
+
+The scorecard is usecase-agnostic and uses no gold/reference labels.
+"""
+
+import argparse
+import json
+import os
+import sys
+import time
+from datetime import datetime, timezone
+from pathlib import Path
+
+# Make ``src/`` importable without a packaged install.
+REPO_ROOT = Path(__file__).resolve().parents[1]
+sys.path.insert(0, str(REPO_ROOT / "src"))
+
+from agents.pge_eval.baseline import DEFAULT_BASELINE_DIR, save_scorecard  # noqa: E402
+from agents.pge_eval.scorecard import score_artifact  # noqa: E402
+
+LLM_ENDPOINT = os.environ.get("PGE_EVAL_ENDPOINT", "databricks-claude-opus-4-7")
+
+
+def _now_ids():
+    t = time.time()
+    dt = datetime.fromtimestamp(t, tz=timezone.utc)
+    # Microsecond-precise run_id so rapid successive runs never collide
+    # (a collision would make a run baseline against itself and skip Tier-3).
+    run_id = dt.strftime("%Y%m%dT%H%M%S_%f")
+    ts = dt.isoformat()
+    return run_id, ts
+
+
+def _load_json(path: str) -> dict:
+    with open(path) as f:
+        return json.load(f)
+
+
+def _resolve_judge_creds(args):
+    """Return (host, token, endpoint) for the judge, or (None, None, None).
+
+    Only touched when the judge is enabled — keeps ``--no-judge`` offline.
+    """
+    endpoint = args.endpoint or LLM_ENDPOINT
+    try:
+        from back.core.databricks.DatabricksClient import DatabricksClient
+
+        client = DatabricksClient()
+        return client.host, client.token, endpoint
+    except Exception as exc:  # noqa: BLE001
+        print(f"  (judge disabled — no Databricks credentials: {exc})", file=sys.stderr)
+        return None, None, None
+
+
+def _emit(scorecard: dict, out_path: str) -> None:
+    text = json.dumps(scorecard, indent=2, default=str)
+    if out_path:
+        Path(out_path).parent.mkdir(parents=True, exist_ok=True)
+        with open(out_path, "w") as f:
+            f.write(text)
+        print(f"Scorecard written to {out_path}", file=sys.stderr)
+    print(text)
+
+
+def _score_common(artifact, args, *, mode, ontology=None, metadata=None):
+    run_id, ts = _now_ids()
+
+    host = token = endpoint = None
+    if not args.no_judge:
+        host, token, endpoint = _resolve_judge_creds(args)
+        if not host:
+            # No creds resolved → degrade to deterministic-only, still no net.
+            args.no_judge = True
+
+    scorecard = score_artifact(
+        artifact,
+        ontology=ontology,
+        metadata=metadata,
+        gate_ratios=args.gate_ratios,
+        no_judge=args.no_judge,
+        mode=mode,
+        run_id=run_id,
+        timestamp=ts,
+        endpoint=endpoint,
+        host=host,
+        token=token,
+        baseline_dir=args.baseline_dir,
+        use_baseline=not args.no_baseline,
+    )
+
+    if not args.no_save:
+        path = save_scorecard(scorecard, args.baseline_dir)
+        print(f"  (scorecard persisted to {path})", file=sys.stderr)
+
+    _emit(scorecard, args.out)
+    return scorecard
+
+
+def cmd_score(args) -> int:
+    artifact = _load_json(args.artifact)
+    ontology = _load_json(args.ontology) if args.ontology else None
+    metadata = _load_json(args.metadata) if args.metadata else None
+    scorecard = _score_common(
+        artifact, args, mode="score-only", ontology=ontology, metadata=metadata
+    )
+    return int(scorecard["exit_code"])
+
+
+def cmd_run(args) -> int:
+    """Live mode: run the mapping PGE for ANY domain, dump an artifact, score it.
+
+    Nothing domain-specific is hard-coded: the ontology and source metadata
+    are loaded from a registry export (``--registry-json`` [+``--version``],
+    else ``$PGE_EVAL_REGISTRY_JSON``) or from plain JSON files
+    (``--ontology`` [+``--metadata``]). Returns the scorecard's exit code.
+    """
+    from back.core.databricks.DatabricksClient import DatabricksClient
+    from agents.agent_mapping_pge.engine import run_agent
+    from agents.pge_eval.loaders import load_run_inputs
+
+    def _progress(message, percent):
+        # Progress is reported on stderr.
+        print(f"  [{percent:3d}%] {message}", file=sys.stderr)
+
+    export_path = args.registry_json or os.environ.get("PGE_EVAL_REGISTRY_JSON")
+    ontology, metadata = load_run_inputs(
+        registry_json=export_path,
+        version=args.version,
+        ontology_path=args.ontology,
+        metadata_path=args.metadata,
+    )
+
+    client = DatabricksClient()
+    started = time.time()
+    result = run_agent(
+        host=client.host,
+        token=client.token,
+        endpoint_name=args.endpoint or LLM_ENDPOINT,
+        client=client,
+        metadata=metadata,
+        ontology=ontology,
+        documents=[],
+        on_step=_progress,
+        skip_semantic_critic=args.no_judge,
+    )
+    elapsed = time.time() - started
+
+    # Result fields copied verbatim, in the order the artifact lists them.
+    copied = (
+        "success", "iterations", "error", "usage", "stats",
+        "entity_mappings", "relationship_mappings", "source_model",
+        "mapping_evaluations", "mapping_run_log",
+    )
+    artifact = {name: getattr(result, name) for name in copied}
+    # Each step is reduced to its type, tool name and duration.
+    artifact["steps"] = [
+        {"step_type": s.step_type, "tool_name": s.tool_name, "duration_ms": s.duration_ms}
+        for s in result.steps
+    ]
+    artifact.update(ontology=ontology, metadata=metadata, elapsed_s=round(elapsed, 3))
+
+    scorecard = _score_common(
+        artifact, args, mode="live", ontology=ontology, metadata=metadata
+    )
+    return int(scorecard["exit_code"])
+
+
+def _add_common_flags(p):
+    """Register the options shared by the ``score`` and ``run`` subparsers."""
+    add = p.add_argument
+    add("--no-judge", help="skip the advisory LLM-judge (no network calls)", action="store_true")
+    add("--gate-ratios", help="promote Tier-2 ratio warnings to hard gates", action="store_true")
+    add("--endpoint", help="serving endpoint for the judge", default=None)
+    add(
+        "--baseline-dir", dest="baseline_dir", default=DEFAULT_BASELINE_DIR,
+        help="directory for Tier-3 self-baseline scorecards",
+    )
+    add("--no-baseline", help="skip the Tier-3 self-baseline regression gate", action="store_true")
+    add("--no-save", help="do not persist this scorecard to the baseline dir", action="store_true")
+    add("--out", help="also write the scorecard JSON here", default=None)
+
+
+def main(argv=None) -> int:
+    """Parse ``argv`` and dispatch to the chosen subcommand; return its result."""
+    parser = argparse.ArgumentParser(prog="goals_eval", description=__doc__)
+    commands = parser.add_subparsers(dest="command", required=True)
+
+    score = commands.add_parser("score", help="score a captured artifact")
+    score.add_argument("artifact", help="path to a smoke_pge AgentResult artifact JSON")
+    score.add_argument("--ontology", default=None,
+                       help="ontology JSON (defaults to artifact['ontology'])")
+    score.add_argument("--metadata", default=None,
+                       help="source metadata JSON (defaults to artifact['metadata'])")
+    _add_common_flags(score)
+    score.set_defaults(func=cmd_score)
+
+    live = commands.add_parser("run", help="run the PGE pipeline live, then score it")
+    live.add_argument("--registry-json", dest="registry_json", default=None,
+                      help="exported registry version dump for ANY domain "
+                           "({versions:{<ver>:{ontology,metadata}}}); "
+                           "defaults to $PGE_EVAL_REGISTRY_JSON")
+    live.add_argument("--version", default=None,
+                      help="version key to pick from --registry-json "
+                           "(required only when the dump has >1 version)")
+    live.add_argument("--ontology", default=None,
+                      help="ontology JSON (registry or agent shape) — "
+                           "alternative to --registry-json")
+    live.add_argument("--metadata", default=None, help="source metadata JSON (used with --ontology)")
+    _add_common_flags(live)
+    live.set_defaults(func=cmd_run)
+
+    namespace = parser.parse_args(argv)
+    return namespace.func(namespace)
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/scripts/smoke_pge.py b/scripts/smoke_pge.py
new file mode 100644
index 00000000..02589881
--- /dev/null
+++ b/scripts/smoke_pge.py
@@ -0,0 +1,206 @@
+"""Smoke test: PGE pipeline on CDM V1.1 maternity ontology.
+
+Runs the new Planner/Generator/Evaluator pipeline against the live
+fe-vm-fiifi-cdm-demo workspace. Compares per-item PASS/FAIL to the
+V1.1 baseline (17 entities + 18 relationships already in registry).
+
+Usage from repo root with env vars set:
+
+    .venv/bin/python scripts/smoke_pge.py [--items N] [--no-critic] [--scope {all,entities}]
+
+--items N         restrict to the first N entities (default: all 17); only
+                  relationships whose domain and range are both kept remain
+--no-critic       skip the semantic critic stage 2 (faster, cheaper)
+--scope=entities  only run entities (skip relationships)
+"""
+
+import argparse
+import json
+import logging
+import os
+import sys
+import time
+from pathlib import Path
+
+# Make ``src/`` importable without a packaged install.
+REPO_ROOT = Path(__file__).resolve().parents[1]
+sys.path.insert(0, str(REPO_ROOT / "src"))
+
+# Route the OntoBricks loggers (which use back.core.logging.get_logger) to
+# stdout at INFO so per-iteration agent traces appear in the smoke output.
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s %(levelname)-5s %(name)s | %(message)s",
+    stream=sys.stdout,
+)
+
+from agents.agent_mapping_pge.engine import run_agent  # noqa: E402
+from back.core.databricks.DatabricksClient import DatabricksClient  # noqa: E402
+
+REGISTRY_JSON = "/tmp/V1_1.json"
+LLM_ENDPOINT = "databricks-claude-opus-4-7"
+
+
+def load_v1_1():
+    """Return the V1.1 ``(ontology, metadata, assignment)`` from ``REGISTRY_JSON``."""
+    with open(REGISTRY_JSON, encoding="utf-8") as f:  # don't depend on the locale's default encoding
+        v = json.load(f)["versions"]["1_1"]
+    return v["ontology"], v["metadata"], v["assignment"]
+
+
+def to_agent_shape(ontology):
+    """Convert a V1.1 ontology (classes + properties) to the agent's shape.
+
+    Returns ``{"entities", "relationships"}``: one entity per class, one
+    relationship per ``ObjectProperty`` with short-name domain/range resolved
+    to class URIs (values starting with ``http`` and unknown names pass through).
+    """
+    classes = ontology.get("classes", [])
+    properties = ontology.get("properties", [])
+
+    # ``.get("name")``: a class with a URI but no name is simply not
+    # resolvable by short name, instead of aborting the run with KeyError.
+    name_to_uri = {c["name"]: c["uri"] for c in classes if c.get("uri") and c.get("name")}
+
+    def resolve(short_or_uri):
+        if not short_or_uri or short_or_uri.startswith("http"):
+            return short_or_uri
+        return name_to_uri.get(short_or_uri, short_or_uri)
+
+    entities = [
+        {
+            "uri": c.get("uri", ""),
+            "name": c.get("name", ""),
+            "label": c.get("label", ""),
+            "comment": c.get("comment", ""),
+            "parent": c.get("parent", ""),
+            "attributes": list(c.get("dataProperties", [])),
+        }
+        for c in classes
+    ]
+
+    relationships = [
+        {
+            "uri": p.get("uri", ""),
+            "name": p.get("name", ""),
+            "label": p.get("label", p.get("name", "")),
+            "comment": p.get("comment", ""),
+            "domain": resolve(p.get("domain", "")),
+            "range": resolve(p.get("range", "")),
+        }
+        for p in properties
+        if p.get("type") == "ObjectProperty"
+    ]
+    return {"entities": entities, "relationships": relationships}
+
+
+def filter_agent_ontology(agent_ont, item_limit, scope):
+    """Trim an agent-shape ontology to the first ``item_limit`` entities.
+
+    ``item_limit=None`` keeps every entity. With ``scope == "entities"`` all
+    relationships are dropped; otherwise only those whose domain and range
+    are both URIs of kept entities survive.
+    """
+    kept = agent_ont["entities"]
+    candidates = agent_ont["relationships"]
+    kept = kept if item_limit is None else kept[:item_limit]
+    uris = {entity["uri"] for entity in kept}
+    links = [] if scope == "entities" else [
+        r for r in candidates if r["domain"] in uris and r["range"] in uris]
+    return {"entities": kept, "relationships": links}
+
+
+def on_step(msg, pct: int) -> None:  # progress callback handed to run_agent by main()
+    print(f"  [{pct:3d}%] {msg}", flush=True)  # flush=True: emit each progress line immediately
+
+
+def main() -> int:
+    """Run the PGE smoke test: map, report, dump the result; 0 on success, else 1."""
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--items", type=int, default=None, help="Cap entity count")
+    parser.add_argument("--no-critic", action="store_true", help="Skip semantic critic")
+    parser.add_argument(
+        "--scope", choices=["all", "entities"], default="all",
+    )
+    args = parser.parse_args()
+
+    print(f"=== PGE smoke test — endpoint={LLM_ENDPOINT} ===")
+    print(f"items={args.items}, no-critic={args.no_critic}, scope={args.scope}")
+    print()
+    print("Loading V1.1 ontology…")
+    raw_ont, metadata, baseline = load_v1_1()  # baseline: the export's "assignment" section
+    agent_ont = to_agent_shape(raw_ont)
+    ontology = filter_agent_ontology(agent_ont, args.items, args.scope)
+    print(f"  ontology: {len(ontology['entities'])} entities, {len(ontology['relationships'])} relationships")
+    print(f"  metadata: {len(metadata.get('tables', []))} tables")
+    print(f"  baseline: {len(baseline.get('entities', []))} entity mappings + "
+          f"{len(baseline.get('relationships', []))} relationships")
+    print()
+
+    client = DatabricksClient()
+    print(f"DatabricksClient: host={client.host}, warehouse={client.warehouse_id}")
+    print()
+
+    print("Invoking run_agent…")
+    t0 = time.time()  # start of the agent run; also used to name the dump file
+    result = run_agent(
+        host=client.host,
+        token=client.token,
+        endpoint_name=LLM_ENDPOINT,
+        client=client,
+        metadata=metadata,
+        ontology=ontology,
+        documents=[],
+        on_step=on_step,
+        skip_semantic_critic=args.no_critic,
+    )
+    elapsed = time.time() - t0
+    print()
+    print(f"=== Run finished in {elapsed:.1f}s ===")
+    print(f"success={result.success}, iterations={result.iterations}, error={result.error!r}")
+    print(f"usage={result.usage}")
+    print()
+
+    print("Per-item run log:")
+    for entry in result.mapping_run_log:
+        attempts = len(entry.get("attempts", []))
+        print(f"  {entry['kind']:<13}  {entry['item']:<60}  "
+              f"attempts={attempts}  final={entry['final_status']}")
+    print()
+
+    # Mapped counts vs. the (possibly filtered) ontology, with the baseline counts for reference.
+    print(f"entity_mappings: {len(result.entity_mappings)} / {len(ontology['entities'])} "
+          f"(baseline {len(baseline.get('entities', []))})")
+    print(f"relationship_mappings: {len(result.relationship_mappings)} / "
+          f"{len(ontology['relationships'])} (baseline {len(baseline.get('relationships', []))})")
+
+    # Dump the full result for inspection
+    out = {
+        "success": result.success,
+        "iterations": result.iterations,
+        "error": result.error,
+        "usage": result.usage,
+        "stats": result.stats,
+        "entity_mappings": result.entity_mappings,
+        "relationship_mappings": result.relationship_mappings,
+        "source_model": result.source_model,
+        "mapping_evaluations": result.mapping_evaluations,
+        "mapping_run_log": result.mapping_run_log,
+        "steps": [{"step_type": s.step_type, "tool_name": s.tool_name, "duration_ms": s.duration_ms} for s in result.steps],
+        # Embed the generated ontology + source metadata so the intrinsic
+        # evaluator (scripts/goals_eval.py score) can compute Stage-1 ontology
+        # metrics offline from this artifact alone.
+        "ontology": ontology,
+        "metadata": metadata,
+        "elapsed_s": round(elapsed, 3),
+    }
+    out_path = REPO_ROOT / "logs" / f"smoke_pge_{int(t0)}.json"  # named after the run's start time (epoch seconds)
+    out_path.parent.mkdir(exist_ok=True)  # create <repo>/logs on first use
+    with open(out_path, "w") as f:
+        json.dump(out, f, indent=2, default=str)  # default=str: stringify values json cannot encode
+    print(f"\nFull result written to {out_path}")
+
+    return 0 if result.success else 1  # handed to sys.exit() by the __main__ guard
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/src/agents/agent_auto_assignment/__init__.py b/src/agents/agent_auto_assignment/__init__.py
deleted file mode 100644
index 7d98e82b..00000000
--- a/src/agents/agent_auto_assignment/__init__.py
+++ /dev/null
@@ -1,5 +0,0 @@
-"""
-Auto-Mapping Agent – autonomous SQL mapping generation using MCP-style tools.
-"""
-
-from agents.agent_auto_assignment.engine import run_agent, AgentResult  # noqa: F401
diff --git a/src/agents/agent_auto_assignment/engine.py b/src/agents/agent_auto_assignment/engine.py
deleted file mode 100644
index 6de853d1..00000000
--- a/src/agents/agent_auto_assignment/engine.py
+++ /dev/null
@@ -1,510 +0,0 @@
-"""
-OntoBricks Auto-Mapping Agent Engine.
-
-Implements an agentic loop that uses the Databricks Foundation Model API
-with function calling to autonomously map ontology entities and relationships
-to SQL queries against domain tables.
-
-Fallback: if the LLM endpoint does not support the ``tools`` parameter the
-engine transparently degrades to a single-shot generation (no tool calls).
-"""
-
-import json
-import time
-from dataclasses import dataclass, field
-from typing import Any, Callable, Dict, List, Optional
-
-import requests
-
-from back.core.logging import get_logger
-from agents.agent_auto_assignment.tools import (
-    ToolContext,
-    TOOL_DEFINITIONS,
-    TOOL_HANDLERS,
-)
-from agents.engine_base import (
-    AgentStep,
-    call_serving_endpoint,
-    dispatch_tool,
-    extract_message_content,
-    accumulate_usage,
-)
-from agents.tracing import trace_agent
-
-logger = get_logger(__name__)
-
-MAX_ITERATIONS = 60
-LLM_TIMEOUT = 180
-_ITERATION_DELAY_SEC = 3
-
-_TRACE_NAME = "auto_assignment"
-
-
-# =====================================================
-# Data classes
-# =====================================================
-
-
-@dataclass
-class AgentResult:
-    """Outcome of a full auto-mapping agent run."""
-
-    success: bool
-    entity_mappings: list = field(default_factory=list)
-    relationship_mappings: list = field(default_factory=list)
-    steps: List[AgentStep] = field(default_factory=list)
-    iterations: int = 0
-    error: str = ""
-    usage: Dict[str, int] = field(default_factory=dict)
-    stats: Dict[str, int] = field(default_factory=dict)
-
-
-# =====================================================
-# System prompt
-# =====================================================
-
-SYSTEM_PROMPT = """\
-You are an expert data engineer. Your task is to map ontology entities \
-and relationships to SQL queries against Databricks tables.
-
-TOOLS
-You have six tools:
-  • get_metadata           – get imported table schemas (full names, columns, types) — no UC query
-  • get_documents_context   – get imported domain documents to enrich domain context — no UC query
-  • get_ontology           – get entities (with attributes) and relationships to map
-  • execute_sql            – run a SQL query to validate it and see columns + sample data
-  • submit_entity_mapping       – record a validated entity mapping
-  • submit_relationship_mapping – record a validated relationship mapping
-
-WORKFLOW
-1. Call get_ontology AND get_metadata to understand what needs mapping and what data is available.
-2. Call get_documents_context to read any imported documents — use them to enrich domain knowledge for better mapping decisions.
-3. For EACH entity:
-   a. Compose a SELECT query using the table schemas.
-   b. Call execute_sql to validate the query works and see the columns.
-   c. If the query fails, fix the SQL and try execute_sql again.
-   d. Once validated, call submit_entity_mapping with the correct column assignments.
-4. For EACH relationship:
-   a. Compose a SELECT query returning source and target identifiers.
-   b. Call execute_sql to validate the query.
-   c. Once validated, call submit_relationship_mapping.
-5. After all mappings are submitted, output a brief summary.
-
-SQL RULES FOR ENTITIES (CRITICAL)
-• Always use full table names from get_metadata (catalog.schema.table).
-• The FIRST column MUST be aliased AS ID (the entity identifier).
-• The SECOND column MUST be aliased AS Label (human-readable name).
-• If the entity has attributes (non-empty "attributes" list), add one column per \
-attribute after ID and Label.
-• If the entity has NO attributes, select ONLY ID and Label — no extra columns.
-• If the same column serves as both an alias and an attribute, include it twice: \
-once with the alias (AS ID) and once with its original name.
-• Add WHERE <id_column> IS NOT NULL to filter null keys.
-• Do NOT add LIMIT — the mapping query must return ALL rows.
-• Do NOT use ORDER BY, CTEs, or subqueries unless absolutely necessary.
-• Write simple, flat SELECT statements.
-
-COLUMN NAME QUOTING (CRITICAL)
-• In SQL, ALWAYS wrap EVERY source column name in backticks: \
-`customer_id`, `name`, `first_name`, `column name`, `my-col`.
-• Alias names (after AS) must NEVER be backtick-quoted: write AS ID, AS Label, \
-AS customer_name — NOT AS `ID`, NOT AS `Label`.
-• When a source column name contains spaces or non-alphanumeric characters, alias \
-it to a safe snake_case name: `customer name` AS customer_name.
-• The values you pass to submit_entity_mapping for id_column, label_column, and \
-attribute_mappings values are the alias names (no backticks). \
-Example: id_column="ID", label_column="Label", attribute_mappings={"name": "name"}.
-• Never pass a value with backticks to id_column, label_column, source_id_column, \
-target_id_column, or attribute_mappings — always use the plain alias name.
-
-SQL RULES FOR RELATIONSHIPS (CRITICAL)
-• SELECT exactly 2 columns: source identifier AS source_id, target identifier AS target_id.
-• If both columns are in the SAME table, query only that table (no joins).
-• Do NOT add LIMIT or ORDER BY.
-• Apply the same always-backtick-quote rule as for entity SQL.
-
-ATTRIBUTE MAPPING
-• In submit_entity_mapping, provide attribute_mappings: a JSON object mapping each \
-ontology attribute name to the corresponding SQL column name.
-• Match by name similarity (e.g. ontology "firstName" → column "first_name").
-• Map ONLY attributes listed in the entity's "attributes" list from get_ontology. \
-If that list is empty, submit attribute_mappings: {} and do NOT add extra SQL columns.
-• NEVER invent attribute mappings for columns not listed as ontology attributes.
-
-GENERAL RULES
-• Process ALL entities and ALL relationships — do not skip any.
-• If execute_sql fails, read the error and fix the SQL.
-• You may batch multiple independent tool calls in a single response.
-• Only ever pass row-returning queries (SELECT / WITH …) to execute_sql. \
-Never pass DESCRIBE, SHOW, EXPLAIN or other metadata statements — \
-use get_metadata for schema introspection instead.
-• After submitting all mappings, output ONLY a brief text summary of what was mapped."""
-
-
-# =====================================================
-# Internal helpers
-# =====================================================
-
-
-def _build_user_prompt(entities: List[dict], relationships: List[dict]) -> str:
-    parts = []
-    parts.append(
-        f"Please map {len(entities)} entities and {len(relationships)} relationships "
-        "to SQL queries. Start by calling get_ontology, get_metadata, and get_documents_context "
-        "(documents enrich domain context for better mapping decisions)."
-    )
-    if entities:
-        names = ", ".join(e.get("name", "?") for e in entities)
-        parts.append(f"Entities to map: {names}")
-    if relationships:
-        names = ", ".join(r.get("name", "?") for r in relationships)
-        parts.append(f"Relationships to map: {names}")
-    prompt = "\n".join(parts)
-    logger.debug("_build_user_prompt (%d chars):\n%s", len(prompt), prompt)
-    return prompt
-
-
-# =====================================================
-# Public entry point
-# =====================================================
-
-
-@trace_agent(name="auto_assignment")
-def run_agent(
-    host: str,
-    token: str,
-    endpoint_name: str,
-    client: Any,
-    metadata: dict,
-    ontology: dict,
-    entity_mappings: Optional[list] = None,
-    relationship_mappings: Optional[list] = None,
-    documents: Optional[list] = None,
-    on_step: Optional[Callable[[str, int], None]] = None,
-    max_iterations: Optional[int] = None,
-) -> AgentResult:
-    """Run the auto-mapping agent.
-
-    The agent autonomously maps ontology entities and relationships to SQL
-    queries by composing, validating, and submitting mappings via tools.
-
-    Args:
-        max_iterations: Override the default iteration budget.  Use a smaller
-            value (e.g. 15) when mapping a single item to keep latency low.
-    """
-    iteration_limit = max_iterations if max_iterations is not None else MAX_ITERATIONS
-
-    entities = ontology.get("entities", [])
-    relationships = ontology.get("relationships", [])
-    total_items = len(entities) + len(relationships)
-
-    logger.info(
-        "===== AUTO-ASSIGN AGENT START ===== endpoint=%s, entities=%d, relationships=%d, max_iter=%d",
-        endpoint_name,
-        len(entities),
-        len(relationships),
-        iteration_limit,
-    )
-    logger.debug(
-        "run_agent: metadata tables=%d", len((metadata or {}).get("tables", []))
-    )
-
-    ctx = ToolContext(
-        host=host.rstrip("/"),
-        token=token,
-        client=client,
-        metadata=metadata or {},
-        ontology=ontology,
-        entity_mappings=list(entity_mappings or []),
-        relationships=list(relationship_mappings or []),
-        documents=list(documents or []),
-    )
-
-    result = AgentResult(success=False)
-
-    # Build conversation
-    user_prompt = _build_user_prompt(entities, relationships)
-    messages: List[dict] = [
-        {"role": "system", "content": SYSTEM_PROMPT},
-        {"role": "user", "content": user_prompt},
-    ]
-    logger.info(
-        "Agent conversation initialized: system=%d chars, user=%d chars",
-        len(SYSTEM_PROMPT),
-        len(user_prompt),
-    )
-
-    total_usage: Dict[str, int] = {"prompt_tokens": 0, "completion_tokens": 0}
-    current_iteration = 0
-
-    def _progress_pct() -> int:
-        mapped = len(ctx.entity_mappings) + len(ctx.relationships)
-        if total_items <= 0:
-            return 5
-        return min(5 + int((mapped / total_items) * 90), 95)
-
-    def notify(msg: str, *, pct: Optional[int] = None):
-        actual_pct = pct if pct is not None else _progress_pct()
-        logger.info("STEP [%d%%] %s", actual_pct, msg)
-        if on_step:
-            on_step(msg, actual_pct)
-
-    notify("Starting auto-mapping agent…", pct=1)
-
-    # ------------------------------------------------------------------
-    # Agent loop
-    # ------------------------------------------------------------------
-    tools_supported = True
-
-    for iteration in range(iteration_limit):
-        # Delay between iterations to avoid "too many requests" rate limits
-        if iteration > 0:
-            logger.debug(
-                "Iteration %d: waiting %ds before LLM call (rate limit mitigation)",
-                iteration + 1,
-                _ITERATION_DELAY_SEC,
-            )
-            time.sleep(_ITERATION_DELAY_SEC)
-
-        current_iteration = iteration + 1
-        logger.info(
-            "----- Iteration %d/%d — %d messages, %d entity mappings, %d rel mappings -----",
-            current_iteration,
-            iteration_limit,
-            len(messages),
-            len(ctx.entity_mappings),
-            len(ctx.relationships),
-        )
-        mapped = len(ctx.entity_mappings) + len(ctx.relationships)
-        notify(f"Mapped {mapped}/{total_items} — thinking…")
-
-        is_last = iteration >= iteration_limit - 1
-        send_tools = TOOL_DEFINITIONS if (tools_supported and not is_last) else None
-
-        t0 = time.time()
-        try:
-            llm_response = call_serving_endpoint(
-                host,
-                token,
-                endpoint_name,
-                messages,
-                tools=send_tools,
-                max_tokens=2048,
-                temperature=0.1,
-                timeout=LLM_TIMEOUT,
-                trace_name=_TRACE_NAME,
-            )
-        except requests.exceptions.HTTPError as exc:
-            status = exc.response.status_code if exc.response is not None else "?"
-            logger.warning("Iteration %d: HTTPError status=%s", iteration + 1, status)
-            logger.debug(
-                "Iteration %d: HTTPError body: %.500s",
-                iteration + 1,
-                exc.response.text if exc.response is not None else "N/A",
-            )
-            if exc.response is not None and status in (400, 422) and tools_supported:
-                logger.warning(
-                    "Agent: endpoint rejected tools — falling back to direct mode"
-                )
-                tools_supported = False
-                notify("Endpoint does not support tools – aborting.")
-                result.error = "LLM endpoint does not support function calling"
-                return result
-            result.error = f"LLM request failed: {exc}"
-            logger.error(
-                "Agent: LLM request failed at iteration %d: %s", iteration + 1, exc
-            )
-            return result
-        except requests.exceptions.ReadTimeout:
-            result.error = f"LLM request timed out after {LLM_TIMEOUT}s"
-            logger.error("Agent: timeout at iteration %d", iteration + 1)
-            return result
-        except requests.exceptions.RequestException as exc:
-            result.error = f"LLM request failed: {exc}"
-            logger.error(
-                "Agent: request exception at iteration %d: %s", iteration + 1, exc
-            )
-            return result
-
-        elapsed_ms = int((time.time() - t0) * 1000)
-        logger.info("Iteration %d: LLM responded in %dms", iteration + 1, elapsed_ms)
-
-        accumulate_usage(total_usage, llm_response.get("usage", {}))
-
-        # Parse response
-        choice = llm_response.get("choices", [{}])[0]
-        finish_reason = choice.get("finish_reason", "?")
-        message = choice.get("message", {})
-        tool_calls = message.get("tool_calls", [])
-        has_content = bool(message.get("content"))
-        logger.info(
-            "Iteration %d: finish_reason=%s, tool_calls=%d, has_content=%s",
-            iteration + 1,
-            finish_reason,
-            len(tool_calls),
-            has_content,
-        )
-
-        if tool_calls:
-            logger.info(
-                "Iteration %d: processing %d tool call(s): [%s]",
-                iteration + 1,
-                len(tool_calls),
-                ", ".join(tc.get("function", {}).get("name", "?") for tc in tool_calls),
-            )
-            messages.append(message)
-
-            for tc_idx, tc in enumerate(tool_calls, 1):
-                func = tc.get("function", {})
-                tool_name = func.get("name", "")
-                raw_args = func.get("arguments", "{}")
-                tool_id = tc.get("id", "")
-
-                try:
-                    arguments = json.loads(raw_args)
-                except json.JSONDecodeError:
-                    arguments = {}
-
-                logger.info(
-                    "Iteration %d: calling tool '%s' (%d/%d)",
-                    iteration + 1,
-                    tool_name,
-                    tc_idx,
-                    len(tool_calls),
-                )
-
-                if tool_name == "submit_entity_mapping":
-                    name = arguments.get("class_name", "?")
-                    notify(f"Mapping entity: {name}")
-                elif tool_name == "submit_relationship_mapping":
-                    name = arguments.get("property_name", "?")
-                    notify(f"Mapping relationship: {name}")
-                elif tool_name == "execute_sql":
-                    sql_preview = arguments.get("sql", "")[:80]
-                    notify(f"Validating SQL: {sql_preview}…")
-                elif tool_name == "get_metadata":
-                    notify("Retrieving table metadata…")
-                elif tool_name == "get_documents_context":
-                    notify("Retrieving imported documents…")
-                elif tool_name == "get_ontology":
-                    notify("Retrieving ontology to map…")
-                else:
-                    notify(f"Calling {tool_name}…")
-
-                result.steps.append(
-                    AgentStep(
-                        step_type="tool_call",
-                        content=json.dumps(arguments, default=str)[:500],
-                        tool_name=tool_name,
-                    )
-                )
-
-                t1 = time.time()
-                tool_result = dispatch_tool(
-                    TOOL_HANDLERS, ctx, tool_name, arguments, trace_name=_TRACE_NAME
-                )
-                tool_ms = int((time.time() - t1) * 1000)
-
-                logger.info(
-                    "Iteration %d: tool '%s' returned %d chars in %dms",
-                    iteration + 1,
-                    tool_name,
-                    len(tool_result),
-                    tool_ms,
-                )
-
-                result.steps.append(
-                    AgentStep(
-                        step_type="tool_result",
-                        content=(
-                            (tool_result[:500] + "…")
-                            if len(tool_result) > 500
-                            else tool_result
-                        ),
-                        tool_name=tool_name,
-                        duration_ms=tool_ms,
-                    )
-                )
-
-                messages.append(
-                    {
-                        "role": "tool",
-                        "tool_call_id": tool_id,
-                        "content": tool_result,
-                    }
-                )
-
-            mapped = len(ctx.entity_mappings) + len(ctx.relationships)
-            notify(f"Mapped {mapped}/{total_items} items")
-            logger.info(
-                "Iteration %d: tool calls done, conversation=%d messages, mappings=%d/%d",
-                iteration + 1,
-                len(messages),
-                mapped,
-                total_items,
-            )
-        else:
-            # Agent produced text — should be the final summary
-            content = extract_message_content(llm_response)
-            logger.info(
-                "Iteration %d: agent produced text output — %d chars",
-                iteration + 1,
-                len(content),
-            )
-
-            result.steps.append(
-                AgentStep(
-                    step_type="output",
-                    content=(content[:500] + "…") if len(content) > 500 else content,
-                    duration_ms=elapsed_ms,
-                )
-            )
-
-            result.success = True
-            result.entity_mappings = ctx.entity_mappings
-            result.relationship_mappings = ctx.relationships
-            result.iterations = iteration + 1
-            result.usage = total_usage
-            result.stats = {
-                "total": total_items,
-                "entities": len(ctx.entity_mappings),
-                "relationships": len(ctx.relationships),
-            }
-
-            logger.info(
-                "===== AUTO-ASSIGN AGENT COMPLETE ===== iterations=%d, "
-                "entity_mappings=%d, rel_mappings=%d, "
-                "prompt_tokens=%d, completion_tokens=%d",
-                result.iterations,
-                len(ctx.entity_mappings),
-                len(ctx.relationships),
-                total_usage["prompt_tokens"],
-                total_usage["completion_tokens"],
-            )
-            notify("Agent completed!", pct=100)
-            return result
-
-    # Exhausted iterations — still return what we have
-    result.entity_mappings = ctx.entity_mappings
-    result.relationship_mappings = ctx.relationships
-    result.iterations = iteration_limit
-    result.usage = total_usage
-    result.stats = {
-        "total": total_items,
-        "entities": len(ctx.entity_mappings),
-        "relationships": len(ctx.relationships),
-    }
-    if ctx.entity_mappings or ctx.relationships:
-        result.success = True
-        result.error = f"Agent used all {iteration_limit} iterations but submitted partial mappings"
-        logger.warning(
-            "===== AUTO-ASSIGN AGENT PARTIAL ===== %s — entity=%d, rel=%d",
-            result.error,
-            len(ctx.entity_mappings),
-            len(ctx.relationships),
-        )
-    else:
-        result.error = f"Agent reached maximum iterations ({iteration_limit}) without producing mappings"
-        logger.error("===== AUTO-ASSIGN AGENT FAILED ===== %s", result.error)
-
-    return result
diff --git a/src/agents/agent_auto_assignment/tools.py b/src/agents/agent_auto_assignment/tools.py
deleted file mode 100644
index 94ceee49..00000000
--- a/src/agents/agent_auto_assignment/tools.py
+++ /dev/null
@@ -1,47 +0,0 @@
-"""
-Tool assembly for the Auto-Mapping Agent.
-
-Composes the set of tools available to this agent from the shared
-``agents.tools`` package.
-"""
-
-from typing import Callable, Dict, List
-
-from agents.tools.context import ToolContext
-from agents.tools.metadata import (
-    GET_METADATA_DEF,
-    tool_get_metadata,
-)
-from agents.tools.ontology import (
-    ONTOLOGY_TOOL_DEFINITIONS,
-    ONTOLOGY_TOOL_HANDLERS,
-)
-from agents.tools.sql import (
-    SQL_TOOL_DEFINITIONS,
-    SQL_TOOL_HANDLERS,
-)
-from agents.tools.mapping import (
-    MAPPING_TOOL_DEFINITIONS,
-    MAPPING_TOOL_HANDLERS,
-)
-from agents.tools.documents import (
-    GET_DOCUMENTS_CONTEXT_DEF,
-    tool_get_documents_context,
-)
-
-__all__ = ["ToolContext", "TOOL_DEFINITIONS", "TOOL_HANDLERS"]
-
-TOOL_DEFINITIONS: List[dict] = (
-    [GET_METADATA_DEF, GET_DOCUMENTS_CONTEXT_DEF]
-    + ONTOLOGY_TOOL_DEFINITIONS
-    + SQL_TOOL_DEFINITIONS
-    + MAPPING_TOOL_DEFINITIONS
-)
-
-TOOL_HANDLERS: Dict[str, Callable] = {
-    "get_metadata": tool_get_metadata,
-    "get_documents_context": tool_get_documents_context,
-    **ONTOLOGY_TOOL_HANDLERS,
-    **SQL_TOOL_HANDLERS,
-    **MAPPING_TOOL_HANDLERS,
-}
diff --git a/src/agents/agent_dtwin_chat/tools.py b/src/agents/agent_dtwin_chat/tools.py
index 155bf6bd..2ad945a9 100644
--- a/src/agents/agent_dtwin_chat/tools.py
+++ b/src/agents/agent_dtwin_chat/tools.py
@@ -47,7 +47,11 @@
 
 logger = get_logger(__name__)
 
-_HTTP_TIMEOUT = 120
+# Interactive chat tools: warm Lakebase graph queries are sub-second; a long
+# wait almost always means the autoscaling Lakebase instance is cold (scaled
+# to zero) and waking. Fail fast with a graceful message rather than make the
+# user wait minutes — a retry once the instance is warm succeeds quickly.
+_HTTP_TIMEOUT = 60
 _MAX_DEPTH = 1
 _SPARQL_DANGEROUS = re.compile(
     r"\b(DROP|DELETE|INSERT|CREATE|CLEAR|LOAD|COPY|MOVE|ADD)\b",
@@ -195,6 +199,11 @@ def tool_describe_entity(
         "depth": min(max(int(depth or _MAX_DEPTH), 1), 10),
         "limit": 500,
         "offset": 0,
+        # Cap BFS seeds: a broad search ("mother") otherwise seeds every match
+        # and the recursive traversal can run for minutes. The agent only needs
+        # a handful of concrete examples to describe, so 25 seeds is plenty and
+        # keeps the query fast.
+        "seed_limit": 25,
     }
     if search:
         params["search"] = search
diff --git a/src/agents/agent_mapping_pge/__init__.py b/src/agents/agent_mapping_pge/__init__.py
new file mode 100644
index 00000000..0da0713e
--- /dev/null
+++ b/src/agents/agent_mapping_pge/__init__.py
@@ -0,0 +1,50 @@
+"""Planner -> Generator -> Evaluator (PGE) mapping agent.
+
+Three-stage mapping pipeline that replaces the prior single-loop ReAct
+mapping agent:
+
+* **Planner** — proposes a :class:`SourceModel` (table roles, canonical IDs,
+  join keys, ordered mapping plan).
+* **Generator** — produces individual entity/relationship mappings given the
+  plan.
+* **Evaluator** — checks each submitted mapping; stage 1 is deterministic
+  (pure SQL counts), stage 2 is semantic.
+
+Sprint 1 laid the foundation: the typed contracts plus the deterministic
+evaluator.  Later sprints added the LLM-backed Planner, Generator, semantic
+Evaluator, and the orchestrating loop exported here as ``run_agent``.
+"""
+
+from agents.agent_mapping_pge.contracts import (
+    CanonicalId,
+    EvalFailure,
+    EvalReport,
+    JoinKey,
+    MappingPlan,
+    RetryState,
+    SkipItem,
+    SourceModel,
+    TableRole,
+    TableRoleCandidate,
+)
+from agents.agent_mapping_pge.engine import (
+    AgentResult,
+    AgentStep,
+    run_agent,
+)
+
+__all__ = [
+    "AgentResult",
+    "AgentStep",
+    "CanonicalId",
+    "EvalFailure",
+    "EvalReport",
+    "JoinKey",
+    "MappingPlan",
+    "RetryState",
+    "SkipItem",
+    "SourceModel",
+    "TableRole",
+    "TableRoleCandidate",
+    "run_agent",
+]
diff --git a/src/agents/agent_mapping_pge/contracts.py b/src/agents/agent_mapping_pge/contracts.py
new file mode 100644
index 00000000..8172e431
--- /dev/null
+++ b/src/agents/agent_mapping_pge/contracts.py
@@ -0,0 +1,344 @@
+"""Typed contracts for the mapping PGE pipeline.
+
+These dataclasses are the load-bearing interface between Planner, Generator,
+Evaluator, and the orchestrator (added in later sprints).  All shapes here
+are JSON round-trippable via ``to_dict()`` / ``from_dict()`` so they can be
+persisted as artefacts, attached to MLflow traces, or shipped over the wire
+to the UI.
+
+No LLM code lives here; this is a pure-data module.
+"""
+
+from dataclasses import dataclass, field, fields, is_dataclass
+from typing import Any, Dict, List, Optional
+
+
+# =====================================================
+# SourceModel — Planner output
+# =====================================================
+
+
+@dataclass
+class TableRoleCandidate:
+    """One ontology class proposed as the role of a source table."""
+
+    uri: str  # URI of the proposed ontology class
+    confidence: float  # 0.0 .. 1.0
+    reason: str = ""  # optional rationale; empty when none was given
+
+    def to_dict(self) -> Dict[str, Any]:
+        """Return the three fields as a plain dict, in declaration order."""
+        return {f.name: getattr(self, f.name) for f in fields(self)}
+
+    @classmethod
+    def from_dict(cls, data: Dict[str, Any]) -> "TableRoleCandidate":
+        """Rebuild a candidate; ``uri`` and ``confidence`` are required keys."""
+        kwargs = {"uri": data["uri"], "confidence": float(data["confidence"])}
+        kwargs["reason"] = data.get("reason", "")
+        return cls(**kwargs)
+
+
+@dataclass
+class TableRole:
+    """A source table and the ranked ontology-class candidates proposed for it."""
+
+    table: str  # full name catalog.schema.table
+    ontology_class_candidates: List[TableRoleCandidate] = field(default_factory=list)
+
+    def to_dict(self) -> Dict[str, Any]:
+        """Serialise the table name and every candidate to plain dicts."""
+        candidates = [cand.to_dict() for cand in self.ontology_class_candidates]
+        return {"table": self.table, "ontology_class_candidates": candidates}
+
+    @classmethod
+    def from_dict(cls, data: Dict[str, Any]) -> "TableRole":
+        """Rebuild a role from ``to_dict()`` output.
+
+        ``table`` is required; a missing ``ontology_class_candidates`` key
+        yields an empty candidate list.
+        """
+        # Required key first so a missing table fails before candidates parse.
+        table_name = data["table"]
+        raw_candidates = data.get("ontology_class_candidates", [])
+        parsed = [TableRoleCandidate.from_dict(raw) for raw in raw_candidates]
+        return cls(table=table_name, ontology_class_candidates=parsed)
+
+
+@dataclass
+class CanonicalId:
+    """Which column identifies an ontology class in each of its source tables.
+
+    ``canonical_column_per_table`` is keyed by full table name; each value is
+    the column to treat as the canonical identifier in that table (e.g. NHS
+    number rather than the trust-local patient id).
+    """
+
+    ontology_class: str  # class URI
+    canonical_column_per_table: Dict[str, str] = field(default_factory=dict)
+    format_note: str = ""
+
+    def to_dict(self) -> Dict[str, Any]:
+        """Return a plain-dict form; the column map is shallow-copied."""
+        payload: Dict[str, Any] = {"ontology_class": self.ontology_class}
+        payload["canonical_column_per_table"] = dict(self.canonical_column_per_table)
+        payload["format_note"] = self.format_note
+        return payload
+
+    @classmethod
+    def from_dict(cls, data: Dict[str, Any]) -> "CanonicalId":
+        """Rebuild from a dict; only ``ontology_class`` is a required key."""
+        class_uri = data["ontology_class"]
+        # Copy so the new instance never aliases the caller's mapping.
+        column_map = dict(data.get("canonical_column_per_table", {}))
+        return cls(
+            class_uri, column_map, format_note=data.get("format_note", "")
+        )
+
+
+@dataclass
+class JoinKey:
+    """A candidate join between two ``table.col`` references.
+
+    ``kind`` separates within-trust foreign keys from value-matched
+    cross-source joins (e.g. NHS-number-to-NHS-number across trusts).
+    """
+
+    from_ref: str  # "table.col"
+    to_ref: str  # "table.col"
+    confidence: float  # 0..1
+    overlap_pct: float  # 0..1
+    kind: str  # "same_trust_fk" | "cross_source_value_match"
+
+    def to_dict(self) -> Dict[str, Any]:
+        """Return all five fields as a plain dict, in declaration order."""
+        names = ("from_ref", "to_ref", "confidence", "overlap_pct", "kind")
+        return {name: getattr(self, name) for name in names}
+
+    @classmethod
+    def from_dict(cls, data: Dict[str, Any]) -> "JoinKey":
+        """Rebuild a join key from a dict.
+
+        Every key is required (``KeyError`` otherwise); ``confidence`` and
+        ``overlap_pct`` are coerced with ``float()``.
+        """
+        from_ref = data["from_ref"]
+        to_ref = data["to_ref"]
+        # float() normalises ints / numeric strings to the declared type.
+        confidence = float(data["confidence"])
+        overlap = float(data["overlap_pct"])
+        return cls(from_ref, to_ref, confidence, overlap, data["kind"])
+
+
+@dataclass
+class SkipItem:
+    """An ontology entity/relationship URI the planner chose not to map."""
+
+    item: str  # uri
+    reason: str = ""  # why it was skipped; may be empty
+
+    def to_dict(self) -> Dict[str, Any]:
+        return dict(item=self.item, reason=self.reason)
+
+    @classmethod
+    def from_dict(cls, data: Dict[str, Any]) -> "SkipItem":
+        return cls(data["item"], data.get("reason", ""))
+
+
+@dataclass
+class MappingPlan:
+    """Ordered work list for the Generator: entity URIs, relationship URIs,
+    and the items the planner decided to drop."""
+
+    entity_order: List[str] = field(default_factory=list)
+    relationship_order: List[str] = field(default_factory=list)
+    skip: List[SkipItem] = field(default_factory=list)
+
+    def to_dict(self) -> Dict[str, Any]:
+        """Return a plain-dict copy; both order lists are shallow-copied."""
+        payload: Dict[str, Any] = {"entity_order": list(self.entity_order)}
+        payload["relationship_order"] = list(self.relationship_order)
+        payload["skip"] = [entry.to_dict() for entry in self.skip]
+        return payload
+
+    @classmethod
+    def from_dict(cls, data: Dict[str, Any]) -> "MappingPlan":
+        """Rebuild a plan; every key is optional and defaults to empty."""
+        entities = list(data.get("entity_order", []))
+        relationships = list(data.get("relationship_order", []))
+        skipped = [SkipItem.from_dict(raw) for raw in data.get("skip", [])]
+        return cls(entities, relationships, skipped)
+
+
+@dataclass
+class SourceModel:
+    """What the Planner hands to the Generator.
+
+    Bundles the planner's reading of the source schema (table roles,
+    canonical ids, join keys) with the ordered plan of work the Generator
+    should follow.
+    """
+
+    table_roles: List[TableRole] = field(default_factory=list)
+    canonical_ids: List[CanonicalId] = field(default_factory=list)
+    join_keys: List[JoinKey] = field(default_factory=list)
+    mapping_plan: MappingPlan = field(default_factory=MappingPlan)
+
+    def to_dict(self) -> Dict[str, Any]:
+        """Serialise every nested contract to plain dicts and lists."""
+        roles = [role.to_dict() for role in self.table_roles]
+        ids = [cid.to_dict() for cid in self.canonical_ids]
+        joins = [join.to_dict() for join in self.join_keys]
+        return {
+            "table_roles": roles,
+            "canonical_ids": ids,
+            "join_keys": joins,
+            "mapping_plan": self.mapping_plan.to_dict(),
+        }
+
+    @classmethod
+    def from_dict(cls, data: Dict[str, Any]) -> "SourceModel":
+        """Rebuild a model; absent keys fall back to empty defaults."""
+        roles = [TableRole.from_dict(raw) for raw in data.get("table_roles", [])]
+        ids = [CanonicalId.from_dict(raw) for raw in data.get("canonical_ids", [])]
+        joins = [JoinKey.from_dict(raw) for raw in data.get("join_keys", [])]
+        plan = MappingPlan.from_dict(data.get("mapping_plan", {}))
+        return cls(roles, ids, joins, plan)
+
+
+# =====================================================
+# EvalReport — Evaluator output
+# =====================================================
+
+
+@dataclass
+class EvalFailure:
+    """One check that did not pass, as recorded in an :class:`EvalReport`.
+
+    ``hint`` carries the actionable correction text handed back to the
+    Generator on retry; keep it concrete and template-y, not an essay.
+    """
+
+    kind: str  # "structural" | "semantic"
+    check: str  # e.g. "dangling_source_pct"
+    expected: str  # e.g. "< 0.05"
+    observed: str  # e.g. "0.47"
+    hint: str = ""
+
+    def to_dict(self) -> Dict[str, Any]:
+        """Return the five fields as a plain dict, in declaration order."""
+        return {f.name: getattr(self, f.name) for f in fields(self)}
+
+    @classmethod
+    def from_dict(cls, data: Dict[str, Any]) -> "EvalFailure":
+        """Rebuild a failure from a dict.
+
+        ``kind``, ``check``, ``expected`` and ``observed`` are required
+        (``KeyError`` when absent); ``hint`` defaults to an empty string.
+        """
+        # Required keys are read in declaration order, then the optional hint.
+        kwargs = {
+            name: data[name]
+            for name in ("kind", "check", "expected", "observed")
+        }
+        kwargs["hint"] = data.get("hint", "")
+        return cls(**kwargs)
+
+
+@dataclass
+class EvalReport:
+    """Result of evaluating one submitted mapping.
+
+    ``bubble_to_planner`` flags a failure the Generator cannot reasonably
+    repair on its own, so re-planning is warranted (e.g. wrong canonical
+    id column, table assigned to wrong ontology class).
+    """
+
+    status: str  # "PASS" | "FAIL"
+    stage: str  # "deterministic" | "semantic"
+    metrics: Dict[str, Any] = field(default_factory=dict)
+    failures: List[EvalFailure] = field(default_factory=list)
+    bubble_to_planner: bool = False
+
+    def to_dict(self) -> Dict[str, Any]:
+        """Return a plain-dict form; ``metrics`` is shallow-copied."""
+        payload: Dict[str, Any] = {"status": self.status, "stage": self.stage}
+        payload["metrics"] = dict(self.metrics)
+        payload["failures"] = [failure.to_dict() for failure in self.failures]
+        payload["bubble_to_planner"] = self.bubble_to_planner
+        return payload
+
+    @classmethod
+    def from_dict(cls, data: Dict[str, Any]) -> "EvalReport":
+        """Rebuild a report; only ``status`` and ``stage`` are required."""
+        status = data["status"]
+        stage = data["stage"]
+        metrics = dict(data.get("metrics", {}))
+        failures = [EvalFailure.from_dict(raw) for raw in data.get("failures", [])]
+        # bool() collapses any truthy/falsy stored value to a real boolean.
+        escalate = bool(data.get("bubble_to_planner", False))
+        return cls(status, stage, metrics, failures, escalate)
+
+
+# =====================================================
+# RetryState — orchestrator bookkeeping (used in Sprint 7)
+# =====================================================
+
+
+@dataclass
+class RetryState:
+    """Retry bookkeeping the orchestrator keeps for a single item.
+
+    The orchestrator caps the Generator at 3 attempts per item before
+    giving up, and bumps the Planner at most twice per item if the
+    evaluator keeps bubbling failures upstream.
+    """
+
+    item_uri: str
+    generator_attempts: int = 0
+    planner_reinvocations: int = 0
+    last_eval_report: Optional[EvalReport] = None
+
+    def to_dict(self) -> Dict[str, Any]:
+        """Return a plain-dict form; a missing report serialises as ``None``."""
+        report = self.last_eval_report
+        return {
+            "item_uri": self.item_uri,
+            "generator_attempts": self.generator_attempts,
+            "planner_reinvocations": self.planner_reinvocations,
+            "last_eval_report": None if report is None else report.to_dict(),
+        }
+
+    @classmethod
+    def from_dict(cls, data: Dict[str, Any]) -> "RetryState":
+        """Rebuild retry state; only ``item_uri`` is a required key."""
+        raw_report = data.get("last_eval_report")
+        item_uri = data["item_uri"]
+        # Counters default to 0 and are normalised with int().
+        attempts = int(data.get("generator_attempts", 0))
+        replans = int(data.get("planner_reinvocations", 0))
+        # An absent or None report stays None instead of being parsed.
+        report = None if raw_report is None else EvalReport.from_dict(raw_report)
+        return cls(item_uri, attempts, replans, report)
+
+
+# =====================================================
+# Sanity check — keep dataclass discovery introspectable
+# =====================================================
+
+_ALL_CONTRACTS = (  # every contract class defined in this module
+    TableRoleCandidate,
+    TableRole,
+    CanonicalId,
+    JoinKey,
+    SkipItem,
+    MappingPlan,
+    SourceModel,
+    EvalFailure,
+    EvalReport,
+    RetryState,
+)
+for _cls in _ALL_CONTRACTS:
+    assert is_dataclass(_cls), f"{_cls.__name__} must be a dataclass"
+    # ``fields`` raises TypeError on a non-dataclass, so this still guards under -O.
+    fields(_cls)
+del _cls
diff --git a/src/agents/agent_mapping_pge/engine.py b/src/agents/agent_mapping_pge/engine.py
new file mode 100644
index 00000000..80d4c8ad
--- /dev/null
+++ b/src/agents/agent_mapping_pge/engine.py
@@ -0,0 +1,1281 @@
+"""
+OntoBricks Mapping-PGE Orchestrator.
+
+Wires the Planner, the Entity/Relationship Generators, and the two-stage
+Evaluator (deterministic + semantic critic) into a single ``run_agent``
+entry point.
+
+The public ``run_agent`` signature and :class:`AgentResult` shape match the
+prior in-house single-loop mapping agent so ``back/objects/mapping/Mapping.py``
+can call this engine without other changes.
+
+Control flow per item (entity or relationship)
+==============================================
+
+1. Build a focused slice from the Planner's :class:`SourceModel`.
+2. Run the appropriate Generator with ``retry_hint=None``.
+3. Run the deterministic evaluator. On FAIL:
+   * if ``bubble_to_planner=True`` -> escalate to Planner (capped at 2 global
+     replans across the whole run);
+   * else retry the Generator with the first failure's hint.
+4. On stage-1 PASS, run the semantic critic (unless ``skip_semantic_critic``
+   is set).  Same bubble / hint logic on FAIL.
+5. After 3 unsuccessful attempts, the item is recorded as ``FAIL_BUDGET`` and
+   the orchestrator moves on to the next item.
+
+Step-log design
+===============
+
+``AgentResult.steps`` is a HIGH-LEVEL log — one entry per stage transition
+(planner-start, generator-start, evaluator-result, critic-result, item-done).
+The detailed per-tool steps emitted by each sub-agent stay on the sub-agent's
+own result dataclass (``PlannerResult.steps``, ``EntityGenResult.steps``, …)
+and are NOT merged into the orchestrator's ``steps`` field. This keeps the
+top-level log readable in the UI; the persistence layer can attach sub-agent
+step lists separately when needed.
+"""
+
+import time
+from dataclasses import dataclass, field
+from typing import Any, Callable, Dict, List, Optional, Tuple
+
+from back.core.logging import get_logger
+from agents.agent_mapping_pge.contracts import EvalReport, SourceModel
+from agents.agent_mapping_pge.evaluator.critic import run_critic
+from agents.agent_mapping_pge.evaluator.deterministic import (
+    evaluate_entity_mapping,
+    evaluate_relationship_mapping,
+)
+from agents.agent_mapping_pge.generators.entity import run_entity_generator
+from agents.agent_mapping_pge.generators.relationship import (
+    run_relationship_generator,
+)
+from agents.agent_mapping_pge.planner import run_planner
+from agents.tracing import trace_agent
+
+logger = get_logger(__name__)
+
+# Per-item retry budget for the Generator->Evaluator inner loop.
+_PER_ITEM_GENERATOR_ATTEMPTS = 3
+# Global cap on Planner re-invocations triggered by escalated failures.
+_PLANNER_REINVOCATION_BUDGET = 2
+
+
+# =====================================================
+# Public dataclasses — mirror the prior mapping agent's shapes
+# =====================================================
+
+
+@dataclass
+class AgentStep:
+    """One observable step of the orchestrator's execution.
+
+    Same shape as :class:`agents.engine_base.AgentStep` plus a few extra
+    ``step_type`` values used by the PGE orchestrator:
+
+    * ``"planner"`` / ``"generator"`` / ``"evaluator"`` / ``"critic"`` for
+      stage transitions; the legacy ``"tool_call"`` / ``"tool_result"`` /
+      ``"output"`` types remain valid, so this struct is a drop-in
+      replacement for the prior orchestrator's step type.
+    """
+
+    step_type: str  # one of the stage / legacy type tags listed above
+    content: str  # free-text description of the step
+    tool_name: str = ""  # presumably set only for tool steps — TODO confirm
+    duration_ms: int = 0  # elapsed time of the step, in milliseconds
+
+
+@dataclass
+class AgentResult:
+    """Outcome of a full PGE orchestration run.
+
+    The first eight fields (``success`` .. ``stats``) mirror the prior
+    in-house mapping-agent's result dataclass exactly so callers can swap
+    engines without touching their downstream code. The last three are
+    PGE-specific extras the caller can choose to persist.
+    """
+
+    success: bool
+    entity_mappings: list = field(default_factory=list)
+    relationship_mappings: list = field(default_factory=list)
+    steps: List[AgentStep] = field(default_factory=list)  # high-level stage log
+    iterations: int = 0
+    error: str = ""  # empty when no error was recorded
+    usage: Dict[str, int] = field(default_factory=dict)  # usage counters by name
+    stats: Dict[str, int] = field(default_factory=dict)
+    # PGE-specific extras
+    source_model: Optional[dict] = None  # presumably SourceModel.to_dict() — confirm
+    mapping_evaluations: Dict[str, dict] = field(default_factory=dict)
+    mapping_run_log: List[dict] = field(default_factory=list)  # one entry per item
+
+
+# =====================================================
+# Internal helpers
+# =====================================================
+
+
+def _ontology_index(ontology: dict) -> Dict[str, dict]:
+    """Index ontology entities by ``uri`` (falling back to ``name``).
+
+    Entities with neither key are dropped; later duplicates win.
+    """
+    entities = (ontology or {}).get("entities", []) or []
+    keyed = ((ent.get("uri") or ent.get("name"), ent) for ent in entities)
+    return {key: ent for key, ent in keyed if key}
+
+
+def _relationship_index(ontology: dict) -> Dict[str, dict]:
+    """Index ontology relationships by ``uri`` (falling back to ``name``).
+
+    Relationships with neither key are dropped; later duplicates win.
+    """
+    rels = (ontology or {}).get("relationships", []) or []
+    keyed = ((rel.get("uri") or rel.get("name"), rel) for rel in rels)
+    return {key: rel for key, rel in keyed if key}
+
+
+def _slice_for_entity(source_model: SourceModel, class_uri: str) -> dict:
+    """Render the SourceModel slice consumed by the EntityGenerator.
+
+    Returns ``candidate_tables`` (one entry per table role naming
+    ``class_uri``), ``canonical_id`` (first matching entry, else an empty
+    placeholder) and ``relevant_joins`` (joins naming a candidate table on
+    at least one side).
+    """
+    candidate_tables: List[dict] = []
+    candidate_table_names: set = set()
+    for role in source_model.table_roles:
+        for cand in role.ontology_class_candidates:
+            if cand.uri == class_uri:
+                entry = {"table": role.table, "confidence": cand.confidence}
+                entry["reason"] = cand.reason
+                candidate_tables.append(entry)
+                candidate_table_names.add(role.table)
+                break  # one entry per role is enough
+
+    canonical_id_obj: Dict[str, Any] = {
+        "ontology_class": class_uri,
+        "canonical_column_per_table": {},
+        "format_note": "",
+    }
+    for c in source_model.canonical_ids:
+        if c.ontology_class == class_uri:
+            canonical_id_obj = c.to_dict()
+            break
+
+    def _names_candidate(ref: str) -> bool:
+        # Drop only the trailing column: "catalog.schema.table.col" must keep
+        # its table part. ``split(".")[0]`` returned just the catalog, so
+        # fully-qualified join refs never matched any candidate table.
+        table = ref.rsplit(".", 1)[0] if ref else ""
+        return any(
+            name == table or name.endswith("." + table)
+            for name in candidate_table_names
+        )
+
+    relevant_joins = [
+        j.to_dict()
+        for j in source_model.join_keys
+        if _names_candidate(j.from_ref) or _names_candidate(j.to_ref)
+    ]
+
+    return {
+        "candidate_tables": candidate_tables,
+        "canonical_id": canonical_id_obj,
+        "relevant_joins": relevant_joins,
+    }
+
+
+def _slice_for_relationship(
+    source_model: SourceModel,
+    property_uri: str,
+    source_entity_mapping: dict,
+    target_entity_mapping: dict,
+) -> dict:
+    """Build the SourceModel slice handed to the RelationshipGenerator.
+
+    Every join key the Planner produced is included (the Generator picks
+    among them). Candidate tables are filtered to the source/target
+    classes when at least one is known; otherwise each table role
+    contributes its first candidate.
+    """
+
+    def _class_of(mapping: dict) -> str:
+        # Accept either key spelling; a missing mapping yields "".
+        mapping = mapping or {}
+        return mapping.get("ontology_class") or mapping.get("class_uri", "")
+
+    src_class = _class_of(source_entity_mapping)
+    tgt_class = _class_of(target_entity_mapping)
+    endpoint_classes = {c for c in (src_class, tgt_class) if c}
+
+    candidate_tables: List[dict] = []
+    for role in source_model.table_roles:
+        for cand in role.ontology_class_candidates:
+            if endpoint_classes and cand.uri not in endpoint_classes:
+                continue
+            entry = {"table": role.table, "ontology_class": cand.uri}
+            entry.update(confidence=cand.confidence, reason=cand.reason)
+            candidate_tables.append(entry)
+            break  # at most one entry per table role
+
+    # Joins are passed through unfiltered.
+    relevant_joins = [j.to_dict() for j in source_model.join_keys]
+
+    return {
+        "property_uri": property_uri,
+        "relevant_joins": relevant_joins,
+        "candidate_tables": candidate_tables,
+    }
+
+
+def _wrap_execute_sql(client: Any) -> Callable[[str], dict]:
+    """Wrap ``client.execute_query`` so it yields the evaluator's result shape.
+
+    The deterministic evaluator wants ``{"columns": [...], "rows": [{...}]}``
+    with FULL rows, whereas ``client.execute_query`` returns
+    ``List[Dict[str, Any]]``; columns are derived from the first row's keys.
+    Going to the client directly (not the sampling ``tool_execute_sql``) is
+    load-bearing: the count-based checks need real values, not stringified
+    ones.
+    """
+
+    def _run(sql: str) -> dict:
+        result = client.execute_query(sql) or []
+        # Pass through a client that already speaks the evaluator's shape.
+        if isinstance(result, dict) and "rows" in result:
+            return result
+        has_dict_rows = bool(result) and isinstance(result[0], dict)
+        columns: List[str] = list(result[0].keys()) if has_dict_rows else []
+        return {"columns": columns, "rows": list(result)}
+
+    return _run
+
+
+def _first_hint(report: EvalReport) -> Optional[str]:
+    """Return the first non-empty hint among the report's failures.
+
+    Failures with an empty hint are skipped; ``None`` means no hint exists.
+    """
+    return next((f.hint for f in report.failures if f.hint), None)
+
+
+def _resolve_endpoint_em(
+    ref: str,
+    by_uri: Dict[str, dict],
+    entity_index: Dict[str, dict],
+) -> Optional[dict]:
+    """Find the entity mapping for a relationship endpoint, if one exists.
+
+    ``ref`` may be a full URI or a short name: ``by_uri`` is tried first,
+    then ``entity_index`` is scanned for a matching ``name`` / ``label``.
+    """
+    if not ref:
+        return None
+    if ref in by_uri:
+        return by_uri[ref]
+    for uri, ent in entity_index.items():
+        names_match = ent.get("name") == ref or ent.get("label") == ref
+        if names_match and uri in by_uri:
+            return by_uri[uri]
+    return None
+
+
+# =====================================================
+# Public entry point
+# =====================================================
+
+
+@trace_agent(name="mapping_pge_engine")
+def run_agent(
+    host: str,
+    token: str,
+    endpoint_name: str,
+    client: Any,
+    metadata: dict,
+    ontology: dict,
+    entity_mappings: Optional[list] = None,
+    relationship_mappings: Optional[list] = None,
+    documents: Optional[list] = None,
+    on_step: Optional[Callable[[str, int], None]] = None,
+    max_iterations: Optional[int] = None,
+    *,
+    skip_semantic_critic: bool = False,
+) -> AgentResult:
+    """Run the PGE mapping orchestrator.
+
+    Drop-in replacement for the prior in-house single-loop mapping agent —
+    same positional/keyword signature, same :class:`AgentResult` shape.
+
+    Args:
+        host: Databricks workspace URL.
+        token: Bearer token for the serving endpoint.
+        endpoint_name: Foundation Model serving endpoint name.
+        client: Databricks SQL client exposing ``execute_query(sql)``.
+        metadata: Imported table metadata to hand to the Planner.
+        ontology: Ontology dict with ``entities`` and ``relationships``.
+        entity_mappings: Pre-seeded entity mappings (URI matched -> skipped).
+        relationship_mappings: Pre-seeded relationship mappings (likewise).
+        documents: Optional pre-loaded domain documents.
+        on_step: Optional progress callback ``(msg, pct)``.
+        max_iterations: Per-item override for the Generator's iteration cap.
+            Kept for API parity with the legacy engine; ``None`` uses each
+            sub-agent's default.
+        skip_semantic_critic: When ``True``, the orchestrator skips the
+            stage-2 critic and accepts every stage-1 PASS as a final PASS.
+            Production callers leave this ``False``; tests flip it ``True``
+            to avoid LLM calls in the orchestrator's unit tests.
+
+    Returns:
+        An :class:`AgentResult` with the submitted mappings, a high-level
+        ``steps`` log, per-item ``mapping_run_log``, and PGE-specific
+        extras (``source_model``, ``mapping_evaluations``).
+    """
+    # ------------------------------------------------------------------
+    # Per-call state lives entirely on this RunState object — no module-
+    # level mutables, so concurrent calls (and tests) cannot collide.
+    # ------------------------------------------------------------------
+    state = _RunState(
+        host=host,
+        token=token,
+        endpoint_name=endpoint_name,
+        client=client,
+        metadata=metadata or {},
+        ontology=ontology or {},
+        documents=list(documents or []),
+        on_step=on_step,
+        max_iterations=max_iterations,
+        skip_semantic_critic=skip_semantic_critic,
+    )
+
+    # Pre-seeded mappings carry over verbatim — we never overwrite a URI the
+    # caller already mapped.
+    pre_entity_list = list(entity_mappings or [])
+    pre_rel_list = list(relationship_mappings or [])
+    preseeded_entity_uris = {
+        m.get("ontology_class") or m.get("class_uri") or "" for m in pre_entity_list
+    }
+    preseeded_entity_uris.discard("")
+    preseeded_rel_uris = {
+        m.get("property") or m.get("property_uri") or "" for m in pre_rel_list
+    }
+    preseeded_rel_uris.discard("")
+
+    state.entity_mappings.extend(pre_entity_list)
+    state.relationship_mappings.extend(pre_rel_list)
+    for m in pre_entity_list:
+        uri = m.get("ontology_class") or m.get("class_uri")
+        if uri:
+            state.entity_mapping_by_uri[uri] = m
+
+    entities_in_scope = state.ontology.get("entities", []) or []
+    relationships_in_scope = state.ontology.get("relationships", []) or []
+
+    logger.info(
+        "===== MAPPING-PGE ENGINE START ===== endpoint=%s, entities=%d, "
+        "relationships=%d, preseeded_entities=%d, preseeded_rels=%d, "
+        "skip_critic=%s",
+        endpoint_name,
+        len(entities_in_scope),
+        len(relationships_in_scope),
+        len(preseeded_entity_uris),
+        len(preseeded_rel_uris),
+        skip_semantic_critic,
+    )
+
+    # ------------------------------------------------------------------
+    # 1. Planner
+    # ------------------------------------------------------------------
+    state.notify("Planning…", pct=2)
+    state.add_step("planner", "planner-start")
+
+    t0 = time.time()
+    try:
+        planner_result = run_planner(
+            host=host,
+            token=token,
+            endpoint_name=endpoint_name,
+            client=client,
+            metadata=state.metadata,
+            ontology=state.ontology,
+            documents=state.documents,
+            on_step=None,
+        )
+    except Exception as exc:  # noqa: BLE001 — surface any failure as run failure
+        logger.error("Planner raised an exception: %s", exc, exc_info=True)
+        return state.finalise(error=f"planner exception: {exc}")
+
+    planner_ms = int((time.time() - t0) * 1000)
+    state.add_iterations(planner_result.iterations)
+    state.accumulate_usage(planner_result.usage)
+
+    if not planner_result.success or planner_result.source_model is None:
+        state.add_step(
+            "planner",
+            f"planner-fail: {planner_result.error}",
+            duration_ms=planner_ms,
+        )
+        logger.error("===== MAPPING-PGE ENGINE FAILED ===== planner failed")
+        state.notify("Planner failed — aborting.", pct=10)
+        return state.finalise(
+            error=f"planner failed: {planner_result.error or 'no source model'}"
+        )
+
+    state.source_model = planner_result.source_model
+    state.refresh_plan()
+    state.add_step(
+        "planner",
+        f"planner-done: entities={len(state.entity_order)}, "
+        f"relationships={len(state.relationship_order)}",
+        duration_ms=planner_ms,
+    )
+
+    # ------------------------------------------------------------------
+    # 2. Walk the plan — entities first, then relationships.
+    # ------------------------------------------------------------------
+    state.entity_index = _ontology_index(state.ontology)
+    state.rel_index = _relationship_index(state.ontology)
+    state.execute_sql_fn = _wrap_execute_sql(client)
+    state.total_items_planned = len(state.entity_order) + len(
+        state.relationship_order
+    )
+
+    # ------------------ Entity walk ------------------
+    for entity_uri in list(state.entity_order):
+        ontology_class = state.entity_index.get(entity_uri, {"uri": entity_uri})
+        label = ontology_class.get("label") or ontology_class.get(
+            "name", entity_uri
+        )
+
+        if entity_uri in preseeded_entity_uris:
+            state.mapping_run_log.append(
+                {
+                    "item": entity_uri,
+                    "kind": "entity",
+                    "attempts": [],
+                    "final_status": "PRESEEDED",
+                }
+            )
+            state.notify(f"Skipping pre-seeded {label}")
+            state.items_done += 1
+            continue
+
+        if entity_uri in state.skip_reasons:
+            state.mapping_run_log.append(
+                {
+                    "item": entity_uri,
+                    "kind": "entity",
+                    "attempts": [],
+                    "final_status": "SKIPPED",
+                }
+            )
+            state.notify(
+                f"Skipped {label}: {state.skip_reasons[entity_uri]}"
+            )
+            state.items_done += 1
+            continue
+
+        final_status, attempts_log, last_mapping, last_report = _run_entity_item(
+            state, ontology_class
+        )
+
+        state.mapping_run_log.append(
+            {
+                "item": entity_uri,
+                "kind": "entity",
+                "attempts": attempts_log,
+                "final_status": final_status,
+            }
+        )
+        if final_status == "PASS" and last_mapping is not None:
+            state.entity_mappings.append(last_mapping)
+            state.entity_mapping_by_uri[entity_uri] = last_mapping
+            state.submitted_any = True
+            if last_report is not None:
+                state.mapping_evaluations[entity_uri] = last_report.to_dict()
+            state.notify(f"Mapped {label}")
+        state.items_done += 1
+
+    # ------------------ Relationship walk ------------------
+    for property_uri in list(state.relationship_order):
+        prop = state.rel_index.get(property_uri, {"uri": property_uri})
+        label = prop.get("label") or prop.get("name", property_uri)
+
+        if property_uri in preseeded_rel_uris:
+            state.mapping_run_log.append(
+                {
+                    "item": property_uri,
+                    "kind": "relationship",
+                    "attempts": [],
+                    "final_status": "PRESEEDED",
+                }
+            )
+            state.notify(f"Skipping pre-seeded {label}")
+            state.items_done += 1
+            continue
+
+        if property_uri in state.skip_reasons:
+            state.mapping_run_log.append(
+                {
+                    "item": property_uri,
+                    "kind": "relationship",
+                    "attempts": [],
+                    "final_status": "SKIPPED",
+                }
+            )
+            state.notify(f"Skipped {label}: {state.skip_reasons[property_uri]}")
+            state.items_done += 1
+            continue
+
+        domain_ref = prop.get("domain", "") or ""
+        range_ref = prop.get("range", "") or ""
+        source_em = state.entity_mapping_by_uri.get(
+            domain_ref
+        ) or _resolve_endpoint_em(
+            domain_ref, state.entity_mapping_by_uri, state.entity_index
+        )
+        target_em = state.entity_mapping_by_uri.get(
+            range_ref
+        ) or _resolve_endpoint_em(
+            range_ref, state.entity_mapping_by_uri, state.entity_index
+        )
+        if source_em is None or target_em is None:
+            state.mapping_run_log.append(
+                {
+                    "item": property_uri,
+                    "kind": "relationship",
+                    "attempts": [],
+                    "final_status": "FAIL_BUDGET",
+                }
+            )
+            state.add_step(
+                "evaluator",
+                f"relationship {property_uri}: endpoint mapping missing — skipped",
+            )
+            state.notify(f"Cannot map {label}: endpoint entity not available")
+            state.items_done += 1
+            continue
+
+        final_status, attempts_log, last_mapping, last_report = _run_relationship_item(
+            state, prop, source_em, target_em
+        )
+
+        state.mapping_run_log.append(
+            {
+                "item": property_uri,
+                "kind": "relationship",
+                "attempts": attempts_log,
+                "final_status": final_status,
+            }
+        )
+        if final_status == "PASS" and last_mapping is not None:
+            state.relationship_mappings.append(last_mapping)
+            state.submitted_any = True
+            if last_report is not None:
+                state.mapping_evaluations[property_uri] = last_report.to_dict()
+            state.notify(f"Mapped {label}")
+        state.items_done += 1
+
+    state.notify("Agent completed!", pct=100)
+    return state.finalise()
+
+
+# =====================================================
+# Run-scoped mutable state
+# =====================================================
+
+
+@dataclass
+class _RunState:
+    """Encapsulates per-call mutable state — keeps ``run_agent`` re-entrant.
+
+    All counters, mapping lists, and accumulators that need to evolve as the
+    walk progresses live here so the orchestrator never relies on module-
+    level globals.  The per-item helpers (``_run_*_item``) are not pure: they
+    update this object in place (steps, usage, iterations, replans).
+    """
+
+    host: str
+    token: str
+    endpoint_name: str
+    client: Any
+    metadata: dict
+    ontology: dict
+    documents: List[Any]
+    on_step: Optional[Callable[[str, int], None]]
+    max_iterations: Optional[int]
+    skip_semantic_critic: bool
+
+    # Output accumulators
+    entity_mappings: List[dict] = field(default_factory=list)
+    relationship_mappings: List[dict] = field(default_factory=list)
+    entity_mapping_by_uri: Dict[str, dict] = field(default_factory=dict)
+    mapping_run_log: List[dict] = field(default_factory=list)
+    mapping_evaluations: Dict[str, dict] = field(default_factory=dict)
+    steps: List[AgentStep] = field(default_factory=list)
+    usage: Dict[str, int] = field(
+        default_factory=lambda: {"prompt_tokens": 0, "completion_tokens": 0}
+    )
+    iterations: int = 0
+    submitted_any: bool = False
+
+    # Plan-derived state — refreshed on (re)plan.
+    source_model: Optional[SourceModel] = None
+    entity_order: List[str] = field(default_factory=list)
+    relationship_order: List[str] = field(default_factory=list)
+    skip_reasons: Dict[str, str] = field(default_factory=dict)
+    planner_reinvocations: int = 0
+
+    # Walk progress
+    items_done: int = 0
+    total_items_planned: int = 0
+
+    # Per-run caches & lookups
+    id_universe_cache: Dict[str, set] = field(default_factory=dict)
+    entity_index: Dict[str, dict] = field(default_factory=dict)
+    rel_index: Dict[str, dict] = field(default_factory=dict)
+    execute_sql_fn: Optional[Callable[[str], dict]] = None
+
+    # -- helpers ----------------------------------------------------------
+
+    def add_step(
+        self,
+        step_type: str,
+        content: str,
+        *,
+        tool_name: str = "",
+        duration_ms: int = 0,
+    ) -> None:
+        self.steps.append(
+            AgentStep(
+                step_type=step_type,
+                content=content,
+                tool_name=tool_name,
+                duration_ms=duration_ms,
+            )
+        )
+
+    def pct(self) -> int:
+        total = max(self.total_items_planned, 1)
+        return min(5 + int((self.items_done / total) * 90), 95)
+
+    def notify(self, msg: str, *, pct: Optional[int] = None) -> None:
+        actual_pct = pct if pct is not None else self.pct()
+        logger.info("PGE STEP [%d%%] %s", actual_pct, msg)
+        if self.on_step:
+            self.on_step(msg, actual_pct)
+
+    def add_iterations(self, n: int) -> None:
+        self.iterations += int(n or 0)
+
+    def accumulate_usage(self, src: Dict[str, int]) -> None:
+        for k in ("prompt_tokens", "completion_tokens"):
+            self.usage[k] = self.usage.get(k, 0) + int((src or {}).get(k, 0))
+
+    def refresh_plan(self) -> None:
+        sm = self.source_model
+        if sm is None:
+            return
+        self.entity_order = list(sm.mapping_plan.entity_order)
+        self.relationship_order = list(sm.mapping_plan.relationship_order)
+        self.skip_reasons = {s.item: s.reason for s in sm.mapping_plan.skip}
+
+    def replan_once(self) -> bool:
+        """Re-invoke the Planner once (subject to the global budget).
+
+        Returns ``True`` on success (and updates the plan in place), ``False``
+        when the budget is exhausted or the new Planner run failed.
+        """
+        if self.planner_reinvocations >= _PLANNER_REINVOCATION_BUDGET:
+            return False
+        self.planner_reinvocations += 1
+        self.notify("Re-planning (escalated)…", pct=self.pct())
+        self.add_step(
+            "planner",
+            f"replan-start (reinvocation #{self.planner_reinvocations})",
+        )
+        t_rp = time.time()
+        try:
+            new_result = run_planner(
+                host=self.host,
+                token=self.token,
+                endpoint_name=self.endpoint_name,
+                client=self.client,
+                metadata=self.metadata,
+                ontology=self.ontology,
+                documents=self.documents,
+                on_step=None,
+            )
+        except Exception as exc:  # noqa: BLE001
+            logger.error("Replan raised an exception: %s", exc, exc_info=True)
+            self.add_step("planner", f"replan-exception: {exc}")
+            return False
+        replan_ms = int((time.time() - t_rp) * 1000)
+        self.add_iterations(new_result.iterations)
+        self.accumulate_usage(new_result.usage)
+        if not new_result.success or new_result.source_model is None:
+            self.add_step(
+                "planner",
+                f"replan-fail: {new_result.error}",
+                duration_ms=replan_ms,
+            )
+            return False
+        self.source_model = new_result.source_model
+        self.refresh_plan()
+        self.add_step("planner", "replan-done", duration_ms=replan_ms)
+        return True
+
+    def finalise(self, *, error: str = "") -> AgentResult:
+        """Build the final :class:`AgentResult`."""
+        result = AgentResult(success=False)
+        result.entity_mappings = list(self.entity_mappings)
+        result.relationship_mappings = list(self.relationship_mappings)
+        result.steps = list(self.steps)
+        result.iterations = self.iterations
+        result.usage = dict(self.usage)
+        result.mapping_run_log = list(self.mapping_run_log)
+        result.mapping_evaluations = dict(self.mapping_evaluations)
+        result.source_model = (
+            self.source_model.to_dict() if self.source_model is not None else None
+        )
+        result.stats = {
+            "total": len(self.entity_order) + len(self.relationship_order),
+            "entities": len(self.entity_mappings),
+            "relationships": len(self.relationship_mappings),
+            "planner_reinvocations": self.planner_reinvocations,
+        }
+        if error:
+            result.error = error
+            result.success = False
+            return result
+
+        # Success when at least one mapping was submitted, OR when there was
+        # nothing to map (legitimate empty run).
+        nothing_to_map = (
+            not self.entity_order and not self.relationship_order
+        )
+        result.success = self.submitted_any or nothing_to_map
+        if not result.success:
+            result.error = (
+                "no mappings submitted (all items failed or were skipped)"
+            )
+        logger.info(
+            "===== MAPPING-PGE ENGINE COMPLETE ===== success=%s, entities=%d, "
+            "relationships=%d, iterations=%d, replans=%d",
+            result.success,
+            len(self.entity_mappings),
+            len(self.relationship_mappings),
+            self.iterations,
+            self.planner_reinvocations,
+        )
+        return result
+
+
+# =====================================================
+# Per-item walk helpers
+# =====================================================
+
+
+def _run_entity_item(
+    state: _RunState,
+    ontology_class: dict,
+) -> Tuple[str, List[dict], Optional[dict], Optional[EvalReport]]:
+    """Run the G->E loop for one entity.
+
+    Returns ``(final_status, attempts_log, last_mapping, last_report)``.
+    The outer ``while True`` lets a successful replan restart the inner
+    retry budget fresh, which is the intent of the bubble-to-planner path.
+    """
+    class_uri = ontology_class.get("uri", "")
+    class_label = ontology_class.get("label") or ontology_class.get(
+        "name", class_uri
+    )
+    attempts_log: List[dict] = []
+    last_mapping: Optional[dict] = None
+    last_report: Optional[EvalReport] = None
+
+    while True:
+        retry_hint: Optional[str] = None
+        bubble_requested = False
+        for attempt_idx in range(_PER_ITEM_GENERATOR_ATTEMPTS):
+            attempt_num = attempt_idx + 1
+            slice_dict = _slice_for_entity(state.source_model, class_uri)
+            state.notify(
+                f"Mapping {class_label} (attempt {attempt_num}/{_PER_ITEM_GENERATOR_ATTEMPTS})…",
+                pct=state.pct(),
+            )
+            state.add_step(
+                "generator",
+                f"entity-gen-start: {class_uri} attempt {attempt_num}",
+            )
+            t_g = time.time()
+            try:
+                gen_result = run_entity_generator(
+                    host=state.host,
+                    token=state.token,
+                    endpoint_name=state.endpoint_name,
+                    client=state.client,
+                    ontology_class=ontology_class,
+                    source_model_slice=slice_dict,
+                    retry_hint=retry_hint,
+                    on_step=None,
+                    **(
+                        {"max_iterations": state.max_iterations}
+                        if state.max_iterations is not None
+                        else {}
+                    ),
+                )
+            except Exception as exc:  # noqa: BLE001
+                logger.error(
+                    "EntityGenerator raised on %s attempt %d: %s",
+                    class_uri,
+                    attempt_num,
+                    exc,
+                    exc_info=True,
+                )
+                attempts_log.append(
+                    {
+                        "attempt": attempt_num,
+                        "generator_ms": int((time.time() - t_g) * 1000),
+                        "stage1_status": "skipped",
+                        "critic_status": "skipped",
+                        "bubble": False,
+                        "hint": None,
+                        "error": f"generator exception: {exc}",
+                    }
+                )
+                continue
+            gen_ms = int((time.time() - t_g) * 1000)
+            state.add_iterations(gen_result.iterations)
+            state.accumulate_usage(gen_result.usage)
+
+            if not gen_result.success or gen_result.mapping is None:
+                attempts_log.append(
+                    {
+                        "attempt": attempt_num,
+                        "generator_ms": gen_ms,
+                        "stage1_status": "skipped",
+                        "critic_status": "skipped",
+                        "bubble": False,
+                        "hint": None,
+                        "error": gen_result.error or "generator failed",
+                    }
+                )
+                state.add_step(
+                    "generator",
+                    f"entity-gen-fail: {class_uri} attempt {attempt_num}: "
+                    f"{gen_result.error}",
+                    duration_ms=gen_ms,
+                )
+                retry_hint = gen_result.error or retry_hint
+                continue
+
+            mapping = gen_result.mapping
+            last_mapping = mapping
+
+            state.notify(f"Evaluating {class_label}…", pct=state.pct())
+            t_e = time.time()
+            stage1_report = evaluate_entity_mapping(
+                mapping=mapping,
+                ontology_class=ontology_class,
+                execute_sql_fn=state.execute_sql_fn,
+            )
+            eval_ms = int((time.time() - t_e) * 1000)
+            last_report = stage1_report
+            state.add_step(
+                "evaluator",
+                f"entity-stage1: {class_uri} status={stage1_report.status} "
+                f"bubble={stage1_report.bubble_to_planner}",
+                duration_ms=eval_ms,
+            )
+
+            if stage1_report.status == "FAIL":
+                hint = _first_hint(stage1_report)
+                bubble = bool(stage1_report.bubble_to_planner)
+                attempts_log.append(
+                    {
+                        "attempt": attempt_num,
+                        "generator_ms": gen_ms,
+                        "stage1_status": "FAIL",
+                        "critic_status": "skipped",
+                        "bubble": bubble,
+                        "hint": hint,
+                    }
+                )
+                if bubble:
+                    bubble_requested = True
+                    break
+                retry_hint = hint or retry_hint
+                continue
+
+            # Stage 1 PASS — optionally run the critic.
+            if state.skip_semantic_critic:
+                attempts_log.append(
+                    {
+                        "attempt": attempt_num,
+                        "generator_ms": gen_ms,
+                        "stage1_status": "PASS",
+                        "critic_status": "skipped",
+                        "bubble": False,
+                        "hint": None,
+                    }
+                )
+                return "PASS", attempts_log, mapping, stage1_report
+
+            state.notify(f"Critiquing {class_label}…", pct=state.pct())
+            t_c = time.time()
+            try:
+                critic_result = run_critic(
+                    host=state.host,
+                    token=state.token,
+                    endpoint_name=state.endpoint_name,
+                    client=state.client,
+                    item_kind="entity",
+                    item_uri=class_uri,
+                    item_definition=ontology_class,
+                    submitted_mapping=mapping,
+                    source_model_slice=slice_dict,
+                    stage1_metrics=dict(stage1_report.metrics),
+                )
+            except Exception as exc:  # noqa: BLE001
+                logger.error(
+                    "Critic raised on %s attempt %d: %s",
+                    class_uri,
+                    attempt_num,
+                    exc,
+                    exc_info=True,
+                )
+                attempts_log.append(
+                    {
+                        "attempt": attempt_num,
+                        "generator_ms": gen_ms,
+                        "stage1_status": "PASS",
+                        "critic_status": "skipped",
+                        "bubble": False,
+                        "hint": None,
+                        "error": f"critic exception: {exc}",
+                    }
+                )
+                return "PASS", attempts_log, mapping, stage1_report
+            critic_ms = int((time.time() - t_c) * 1000)
+            state.add_iterations(critic_result.iterations)
+            state.accumulate_usage(critic_result.usage)
+
+            critic_report = critic_result.report
+            state.add_step(
+                "critic",
+                f"entity-critic: {class_uri} status="
+                f"{critic_report.status if critic_report else '?'} "
+                f"bubble="
+                f"{critic_report.bubble_to_planner if critic_report else '?'}",
+                duration_ms=critic_ms,
+            )
+
+            if not critic_result.success or critic_report is None:
+                attempts_log.append(
+                    {
+                        "attempt": attempt_num,
+                        "generator_ms": gen_ms,
+                        "stage1_status": "PASS",
+                        "critic_status": "skipped",
+                        "bubble": False,
+                        "hint": None,
+                        "error": critic_result.error or "critic failed",
+                    }
+                )
+                return "PASS", attempts_log, mapping, stage1_report
+
+            if critic_report.status == "PASS":
+                attempts_log.append(
+                    {
+                        "attempt": attempt_num,
+                        "generator_ms": gen_ms,
+                        "stage1_status": "PASS",
+                        "critic_status": "PASS",
+                        "bubble": False,
+                        "hint": None,
+                    }
+                )
+                return "PASS", attempts_log, mapping, critic_report
+
+            hint = _first_hint(critic_report)
+            bubble = bool(critic_report.bubble_to_planner)
+            attempts_log.append(
+                {
+                    "attempt": attempt_num,
+                    "generator_ms": gen_ms,
+                    "stage1_status": "PASS",
+                    "critic_status": "FAIL",
+                    "bubble": bubble,
+                    "hint": hint,
+                }
+            )
+            last_report = critic_report
+            if bubble:
+                bubble_requested = True
+                break
+            retry_hint = hint or retry_hint
+            continue
+
+        if bubble_requested:
+            if state.replan_once():
+                continue  # restart the item with the new plan
+            return "FAIL_BUBBLE", attempts_log, last_mapping, last_report
+        return "FAIL_BUDGET", attempts_log, last_mapping, last_report
+
+
+def _run_relationship_item(
+    state: _RunState,
+    ontology_property: dict,
+    source_em: dict,
+    target_em: dict,
+) -> Tuple[str, List[dict], Optional[dict], Optional[EvalReport]]:
+    """Run the G->E loop for one relationship.
+
+    Returns ``(final_status, attempts_log, last_mapping, last_report)``.
+    """
+    property_uri = ontology_property.get("uri", "")
+    property_label = ontology_property.get("label") or ontology_property.get(
+        "name", property_uri
+    )
+    attempts_log: List[dict] = []
+    last_mapping: Optional[dict] = None
+    last_report: Optional[EvalReport] = None
+
+    while True:
+        retry_hint: Optional[str] = None
+        bubble_requested = False
+        for attempt_idx in range(_PER_ITEM_GENERATOR_ATTEMPTS):
+            attempt_num = attempt_idx + 1
+            slice_dict = _slice_for_relationship(
+                state.source_model,
+                property_uri,
+                source_em,
+                target_em,
+            )
+            state.notify(
+                f"Mapping {property_label} (attempt {attempt_num}/"
+                f"{_PER_ITEM_GENERATOR_ATTEMPTS})…",
+                pct=state.pct(),
+            )
+            state.add_step(
+                "generator",
+                f"rel-gen-start: {property_uri} attempt {attempt_num}",
+            )
+            t_g = time.time()
+            try:
+                gen_result = run_relationship_generator(
+                    host=state.host,
+                    token=state.token,
+                    endpoint_name=state.endpoint_name,
+                    client=state.client,
+                    ontology_property=ontology_property,
+                    source_entity_mapping=source_em,
+                    target_entity_mapping=target_em,
+                    source_model_slice=slice_dict,
+                    retry_hint=retry_hint,
+                    on_step=None,
+                    **(
+                        {"max_iterations": state.max_iterations}
+                        if state.max_iterations is not None
+                        else {}
+                    ),
+                )
+            except Exception as exc:  # noqa: BLE001
+                logger.error(
+                    "RelationshipGenerator raised on %s attempt %d: %s",
+                    property_uri,
+                    attempt_num,
+                    exc,
+                    exc_info=True,
+                )
+                attempts_log.append(
+                    {
+                        "attempt": attempt_num,
+                        "generator_ms": int((time.time() - t_g) * 1000),
+                        "stage1_status": "skipped",
+                        "critic_status": "skipped",
+                        "bubble": False,
+                        "hint": None,
+                        "error": f"generator exception: {exc}",
+                    }
+                )
+                continue
+            gen_ms = int((time.time() - t_g) * 1000)
+            state.add_iterations(gen_result.iterations)
+            state.accumulate_usage(gen_result.usage)
+
+            if not gen_result.success or gen_result.mapping is None:
+                attempts_log.append(
+                    {
+                        "attempt": attempt_num,
+                        "generator_ms": gen_ms,
+                        "stage1_status": "skipped",
+                        "critic_status": "skipped",
+                        "bubble": False,
+                        "hint": None,
+                        "error": gen_result.error or "generator failed",
+                    }
+                )
+                state.add_step(
+                    "generator",
+                    f"rel-gen-fail: {property_uri} attempt {attempt_num}: "
+                    f"{gen_result.error}",
+                    duration_ms=gen_ms,
+                )
+                retry_hint = gen_result.error or retry_hint
+                continue
+
+            mapping = gen_result.mapping
+            last_mapping = mapping
+
+            state.notify(f"Evaluating {property_label}…", pct=state.pct())
+            t_e = time.time()
+            stage1_report = evaluate_relationship_mapping(
+                mapping=mapping,
+                source_entity_mapping=source_em,
+                target_entity_mapping=target_em,
+                execute_sql_fn=state.execute_sql_fn,
+                id_universe_cache=state.id_universe_cache,
+            )
+            eval_ms = int((time.time() - t_e) * 1000)
+            last_report = stage1_report
+            state.add_step(
+                "evaluator",
+                f"rel-stage1: {property_uri} status={stage1_report.status} "
+                f"bubble={stage1_report.bubble_to_planner}",
+                duration_ms=eval_ms,
+            )
+
+            if stage1_report.status == "FAIL":
+                hint = _first_hint(stage1_report)
+                bubble = bool(stage1_report.bubble_to_planner)
+                attempts_log.append(
+                    {
+                        "attempt": attempt_num,
+                        "generator_ms": gen_ms,
+                        "stage1_status": "FAIL",
+                        "critic_status": "skipped",
+                        "bubble": bubble,
+                        "hint": hint,
+                    }
+                )
+                if bubble:
+                    bubble_requested = True
+                    break
+                retry_hint = hint or retry_hint
+                continue
+
+            if state.skip_semantic_critic:
+                attempts_log.append(
+                    {
+                        "attempt": attempt_num,
+                        "generator_ms": gen_ms,
+                        "stage1_status": "PASS",
+                        "critic_status": "skipped",
+                        "bubble": False,
+                        "hint": None,
+                    }
+                )
+                return "PASS", attempts_log, mapping, stage1_report
+
+            state.notify(f"Critiquing {property_label}…", pct=state.pct())
+            t_c = time.time()
+            try:
+                critic_result = run_critic(
+                    host=state.host,
+                    token=state.token,
+                    endpoint_name=state.endpoint_name,
+                    client=state.client,
+                    item_kind="relationship",
+                    item_uri=property_uri,
+                    item_definition=ontology_property,
+                    submitted_mapping=mapping,
+                    source_model_slice=slice_dict,
+                    stage1_metrics=dict(stage1_report.metrics),
+                )
+            except Exception as exc:  # noqa: BLE001
+                logger.error(
+                    "Critic raised on %s attempt %d: %s",
+                    property_uri,
+                    attempt_num,
+                    exc,
+                    exc_info=True,
+                )
+                attempts_log.append(
+                    {
+                        "attempt": attempt_num,
+                        "generator_ms": gen_ms,
+                        "stage1_status": "PASS",
+                        "critic_status": "skipped",
+                        "bubble": False,
+                        "hint": None,
+                        "error": f"critic exception: {exc}",
+                    }
+                )
+                return "PASS", attempts_log, mapping, stage1_report
+            critic_ms = int((time.time() - t_c) * 1000)
+            state.add_iterations(critic_result.iterations)
+            state.accumulate_usage(critic_result.usage)
+
+            critic_report = critic_result.report
+            state.add_step(
+                "critic",
+                f"rel-critic: {property_uri} status="
+                f"{critic_report.status if critic_report else '?'} "
+                f"bubble="
+                f"{critic_report.bubble_to_planner if critic_report else '?'}",
+                duration_ms=critic_ms,
+            )
+
+            if not critic_result.success or critic_report is None:
+                attempts_log.append(
+                    {
+                        "attempt": attempt_num,
+                        "generator_ms": gen_ms,
+                        "stage1_status": "PASS",
+                        "critic_status": "skipped",
+                        "bubble": False,
+                        "hint": None,
+                        "error": critic_result.error or "critic failed",
+                    }
+                )
+                return "PASS", attempts_log, mapping, stage1_report
+
+            if critic_report.status == "PASS":
+                attempts_log.append(
+                    {
+                        "attempt": attempt_num,
+                        "generator_ms": gen_ms,
+                        "stage1_status": "PASS",
+                        "critic_status": "PASS",
+                        "bubble": False,
+                        "hint": None,
+                    }
+                )
+                return "PASS", attempts_log, mapping, critic_report
+
+            hint = _first_hint(critic_report)
+            bubble = bool(critic_report.bubble_to_planner)
+            attempts_log.append(
+                {
+                    "attempt": attempt_num,
+                    "generator_ms": gen_ms,
+                    "stage1_status": "PASS",
+                    "critic_status": "FAIL",
+                    "bubble": bubble,
+                    "hint": hint,
+                }
+            )
+            last_report = critic_report
+            if bubble:
+                bubble_requested = True
+                break
+            retry_hint = hint or retry_hint
+            continue
+
+        if bubble_requested:
+            if state.replan_once():
+                continue
+            return "FAIL_BUBBLE", attempts_log, last_mapping, last_report
+        return "FAIL_BUDGET", attempts_log, last_mapping, last_report
diff --git a/src/agents/agent_mapping_pge/evaluator/__init__.py b/src/agents/agent_mapping_pge/evaluator/__init__.py
new file mode 100644
index 00000000..41e7ef5e
--- /dev/null
+++ b/src/agents/agent_mapping_pge/evaluator/__init__.py
@@ -0,0 +1,18 @@
+"""Evaluator stage of the mapping PGE pipeline.
+
+Stage 1 (this module) is the *deterministic* evaluator — pure-Python checks
+backed by SQL counts.  Stage 2 (added in a later sprint) is the semantic
+evaluator that uses an LLM to judge naming/semantic fidelity.
+
+The deterministic checks live in :mod:`agents.agent_mapping_pge.evaluator.deterministic`.
+"""
+
+from agents.agent_mapping_pge.evaluator.deterministic import (
+    evaluate_entity_mapping,
+    evaluate_relationship_mapping,
+)
+
+__all__ = [
+    "evaluate_entity_mapping",
+    "evaluate_relationship_mapping",
+]
diff --git a/src/agents/agent_mapping_pge/evaluator/critic.py b/src/agents/agent_mapping_pge/evaluator/critic.py
new file mode 100644
index 00000000..a41fe1b0
--- /dev/null
+++ b/src/agents/agent_mapping_pge/evaluator/critic.py
@@ -0,0 +1,747 @@
+"""
+OntoBricks Mapping-PGE Semantic Critic Agent.
+
+Sprint 6 of the Planner-Generator-Evaluator (PGE) redesign — stage 2 of the
+Evaluator. Runs ONLY after the deterministic (stage-1) evaluator has passed.
+
+The Critic audits ONE submitted mapping for SEMANTIC correctness — issues that
+pure structural checks cannot catch:
+
+* the WRONG TABLE was picked (e.g. ``antenatal_visits`` chosen to realise
+  the ``Delivery`` class), or
+* the wrong COLUMN within the right table (e.g. ``appointment_date`` used
+  for ``deliveryDate``).
+
+The Critic's "bubble" signal is sharp: if the wrong TABLE was chosen, the
+verdict bubbles to the Planner (which must revise the source model); if just
+a wrong column inside the right table, the verdict stays with the Generator
+which can retry against the same table.
+
+The loop shape mirrors :mod:`agents.agent_mapping_pge.generators.entity` —
+same ``call_serving_endpoint`` + ``dispatch_tool`` ReAct cycle, same 3-second
+inter-iteration delay, same MLflow trace decorator. Differences:
+
+* Smaller default budget (6) — auditing is bounded work; the prompt tells the
+  Critic to defer (PASS with a reasoning note) rather than falsely escalate,
+  and ``run_critic`` reports ``success=False`` if the budget still runs out.
+* Different tool set: only ``sample_table``, ``execute_sql``,
+  ``get_documents_context``, and the terminal ``submit_evaluation``.
+* No single-shot fallback — the Critic produces structured output through
+  ``submit_evaluation`` only.
+"""
+
+import json
+import time
+from dataclasses import dataclass, field
+from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional
+
+import requests
+
+if TYPE_CHECKING:
+    from agents.agent_mapping_pge.contracts import EvalReport
+
+from back.core.logging import get_logger
+from agents.engine_base import (
+    accumulate_usage,
+    call_serving_endpoint,
+    dispatch_tool,
+)
+from agents.tools.context import ToolContext
+from agents.tools.documents import (
+    GET_DOCUMENTS_CONTEXT_DEF,
+    tool_get_documents_context,
+)
+from agents.tools.evaluation import (
+    EVALUATION_TOOL_DEFINITIONS,
+    EVALUATION_TOOL_HANDLERS,
+)
+from agents.tools.planner import (
+    SAMPLE_TABLE_DEF,
+    tool_sample_table,
+)
+from agents.tools.sql import (
+    SQL_TOOL_DEFINITIONS,
+    SQL_TOOL_HANDLERS,
+)
+from agents.tracing import trace_agent
+
+logger = get_logger(__name__)
+
+MAX_ITERATIONS = 6  # default for run_critic(max_iterations=...)
+LLM_TIMEOUT = 180  # seconds; passed as `timeout` to call_serving_endpoint
+_ITERATION_DELAY_SEC = 3  # seconds slept before every LLM call after the first
+# See planner._MAX_TOKENS comment — same rationale for submit_evaluation.
+_MAX_TOKENS = 50000  # passed as `max_tokens` on every Critic LLM call
+
+_TRACE_NAME = "mapping_pge_critic"  # trace label for LLM calls and tool dispatch
+
+
+# =====================================================
+# Tool aggregation
+# =====================================================
+#
+# The Critic only needs:
+#   * sample_table           – peek at actual values to verify the column
+#                              picked really represents the ontology concept.
+#   * execute_sql            – targeted probes for "is this column really
+#                              what it claims" sanity checks.
+#   * get_documents_context  – consult any imported domain glossary.
+#   * submit_evaluation      – TERMINAL.
+#
+# We deliberately exclude:
+#   * get_metadata / get_ontology — the audit target is supplied in the user
+#     prompt; broad re-fetches just inflate context.
+#   * column_value_overlap / distinct_count — those are structural, already
+#     covered by the deterministic stage.
+#   * submit_source_model / submit_entity_mapping / submit_relationship_mapping
+#     — wrong stage.
+
+TOOL_DEFINITIONS: List[dict] = (  # tool schemas sent to the LLM via `tools=`
+    [SAMPLE_TABLE_DEF, GET_DOCUMENTS_CONTEXT_DEF]
+    + SQL_TOOL_DEFINITIONS
+    + EVALUATION_TOOL_DEFINITIONS
+)
+
+TOOL_HANDLERS: Dict[str, Callable] = {  # tool name -> handler, given to dispatch_tool
+    "sample_table": tool_sample_table,
+    "get_documents_context": tool_get_documents_context,
+    **SQL_TOOL_HANDLERS,
+    **EVALUATION_TOOL_HANDLERS,
+}
+
+
+# =====================================================
+# Data classes
+# =====================================================
+
+
+@dataclass
+class CriticStep:
+    """One observable step of the Critic's execution.
+
+    Mirrors :class:`agents.agent_mapping_pge.generators.entity.EntityGenStep`
+    so the orchestrator (Sprint 7) can render a per-audit timeline in the UI.
+    """
+
+    step_type: str  # "tool_call" | "tool_result" | "output"
+    content: str  # arguments JSON, tool result, or LLM text; run_critic caps it near 500 chars
+    tool_name: str = ""  # tool involved; run_critic leaves it "" for "output" steps
+    duration_ms: int = 0  # ms of the tool or LLM call; run_critic leaves 0 for "tool_call" steps
+
+
+@dataclass
+class CriticResult:
+    """Outcome of a single Critic invocation.
+
+    ``report`` is populated when the agent terminated by submitting a verdict
+    via ``submit_evaluation``. ``success`` here is the agent-level success
+    flag — it indicates a *clean termination*, NOT a PASS verdict. A FAIL
+    verdict with ``bubble_to_planner=True`` still has ``success=True``.
+    ``success=False`` is reserved for budget exhaustion, text-only output,
+    and LLM/transport errors.
+    """
+
+    success: bool  # True only after submit_evaluation succeeded and stored a report
+    report: Optional["EvalReport"] = None  # the submitted verdict; stays None on failure
+    steps: List[CriticStep] = field(default_factory=list)  # step log in execution order
+    iterations: int = 0  # 1-based number of the last LLM iteration reached
+    error: str = ""  # failure reason set by run_critic; "" on success
+    usage: Dict[str, int] = field(default_factory=dict)  # accumulated prompt/completion tokens
+
+
+# =====================================================
+# System prompt
+# =====================================================
+#
+# Kept under 3KB. Frames the Critic as a senior data engineer auditing ONE
+# submitted mapping for SEMANTIC correctness — the structural checks have
+# already passed. The decision rubric (PASS / FAIL+no-bubble / FAIL+bubble)
+# is load-bearing: it determines whether the orchestrator retries the
+# Generator or re-invokes the Planner.
+
+SYSTEM_PROMPT = """\
+You are a senior data engineer auditing ONE submitted mapping for SEMANTIC \
+correctness. The structural checks (row counts, distinct IDs, dangling FKs) \
+have ALREADY PASSED — your job is to catch wrong-concept errors that pure \
+structural checks cannot see.
+
+WHAT YOU AUDIT
+• Did the mapping pick the RIGHT TABLE for the ontology class/property?
+• Do sampled values in the chosen column(s) actually represent what the \
+ontology attribute means? (e.g. "delivery_date" should be a delivery date, \
+not a booking date.)
+• Does the column's semantics match the ontology comment / label?
+
+TOOLS
+You have these tools:
+  • sample_table          – Up to N random rows from a table. Use to peek at \
+actual values and check they match the concept.
+  • execute_sql           – Targeted SQL for "is this column really what it \
+claims" probes (e.g. value ranges, distinct categories, null patterns).
+  • get_documents_context – Imported domain glossaries / data dictionaries. \
+Check against these when the column's role is non-obvious.
+  • submit_evaluation     – TERMINAL. Call EXACTLY ONCE when you have a \
+confident verdict.
+
+DECISION RUBRIC
+• PASS — sampled values, column semantics, and domain context all support \
+the mapping. status="PASS", failures=[], bubble_to_planner=false.
+• FAIL with bubble_to_planner=false — the WRONG COLUMN was picked within \
+the RIGHT TABLE. The Generator can fix this on retry. Populate failures[] \
+with the specific column-level issue and a concrete hint.
+• FAIL with bubble_to_planner=true — the WRONG TABLE was chosen entirely. \
+The Planner must revise the source model. Populate failures[] and set the \
+bubble flag.
+
+HINT DISCIPLINE
+• Hints must be CONCRETE, ACTIONABLE, single-sentence corrections.
+• Good column-level hint: "Sampled rows show `appointment_date` is the \
+booking date, not delivery date. Use `delivery_dttm` instead."
+• Good table-level hint: "This mapping uses `antenatal_visits`, but the \
+chosen class is Delivery. Switch to the `labour_delivery` table."
+• Bad hint (vague): "consider using a different column"
+• Bad hint (chatty): "I think there might be an issue here, you should look \
+into it more carefully"
+
+HARD RULES
+• You are bounded by max_iterations=6. Keep your audit FOCUSED — pick the \
+one or two probes that would change your verdict, not exhaustive ones.
+• Call submit_evaluation EXACTLY ONCE.
+• If you cannot determine a verdict within 6 iterations, submit PASS with a \
+reasoning note explaining the uncertainty. Do NOT bubble — better to defer \
+than to falsely escalate.
+• Do not call get_metadata, get_ontology, column_value_overlap, \
+distinct_count, submit_source_model, submit_entity_mapping, or \
+submit_relationship_mapping — they are not available to you. The audit \
+target and structural metrics are already in the user message.
+"""
+
+
+# =====================================================
+# Internal helpers
+# =====================================================
+
+
+def _format_entity_definition(item_definition: dict) -> List[str]:
+    """Render the label, comment and attribute list of an ontology class."""
+    heading = item_definition.get("label") or item_definition.get("name", "")
+    lines: List[str] = [f"  label:   {heading}"]
+    note = item_definition.get("comment", "") or ""
+    if note:
+        lines.append(f"  comment: {note}")
+    attrs = item_definition.get("attributes", []) or []
+    if not attrs:
+        return lines
+    lines.append(f"  attributes ({len(attrs)}):")
+    for entry in attrs:
+        if not isinstance(entry, dict):
+            # Non-dict attributes (e.g. plain names) are printed verbatim.
+            lines.append(f"    - {entry}")
+            continue
+        name = entry.get("name") or entry.get("label") or entry.get("uri", "?")
+        kind = entry.get("type") or entry.get("range") or ""
+        lines.append(f"    - {name} ({kind})" if kind else f"    - {name}")
+    return lines
+
+
+def _format_relationship_definition(item_definition: dict) -> List[str]:
+    """Render the label, comment, domain and range of an ontology property.
+
+    The ``domain`` and ``range`` lines are emitted unconditionally (blank
+    when the value is missing or falsy); the ``comment`` line appears only
+    when a comment is set.
+    """
+
+    def _text(key: str) -> str:
+        # Collapse missing / falsy values to "" so the layout stays stable.
+        return item_definition.get(key, "") or ""
+
+    heading = item_definition.get("label") or item_definition.get("name", "")
+    note = _text("comment")
+    lines: List[str] = [f"  label:   {heading}"]
+    if note:
+        lines.append(f"  comment: {note}")
+    lines += [f"  domain:  {_text('domain')}", f"  range:   {_text('range')}"]
+    return lines
+
+
+def _format_submitted_entity_mapping(submitted_mapping: dict) -> List[str]:
+    """Summarise the entity mapping under audit as indented prompt lines."""
+    fetch = submitted_mapping.get
+    lines: List[str] = [
+        "SUBMITTED MAPPING (entity)",
+        f"  sql_query:       {fetch('sql_query', '')}",
+        f"  id_column:       {fetch('id_column', '')}",
+        f"  label_column:    {fetch('label_column', '')}",
+    ]
+    column_by_attribute = fetch("attribute_mappings", {}) or {}
+    if column_by_attribute:
+        lines.append("  attribute_mappings:")
+        lines.extend(f"    {a} -> {c}" for a, c in column_by_attribute.items())
+    skipped = fetch("unmapped_attributes", []) or []
+    if skipped:
+        lines.append("  unmapped_attributes:")
+    for item in skipped:
+        if isinstance(item, dict):
+            lines.append(f"    - {item.get('name', '?')}: {item.get('reason', '')}")
+        else:
+            lines.append(f"    - {item}")
+    return lines
+
+
+def _format_submitted_relationship_mapping(submitted_mapping: dict) -> List[str]:
+    """Lines summarising a relationship mapping under audit.
+
+    ``source_class`` falls back to ``domain``; ``target_class`` falls back to
+    ``range_class`` and then ``range`` (the key ontology property dicts use).
+    """
+    get = submitted_mapping.get
+    source_class = get("source_class", "") or get("domain", "")
+    target_class = get("target_class", "") or get("range_class", "") or get("range", "")
+    return [
+        "SUBMITTED MAPPING (relationship)",
+        f"  sql_query:        {get('sql_query', '')}",
+        f"  source_id_column: {get('source_id_column', '')}",
+        f"  target_id_column: {get('target_id_column', '')}",
+        f"  source_class:     {source_class}",
+        f"  target_class:     {target_class}",
+    ]
+
+
+def _build_user_prompt(
+    item_kind: str,
+    item_uri: str,
+    item_definition: dict,
+    submitted_mapping: dict,
+    source_model_slice: dict,
+    stage1_metrics: dict,
+) -> str:
+    """Assemble the user message that describes one audit to the Critic.
+
+    Sections, in order and separated by blank lines:
+      1. AUDIT TARGET — kind, URI and the ontology definition (attributes
+         for entities; domain/range for relationships).
+      2. SUBMITTED MAPPING — the mapping being audited.
+      3. PLANNER'S PREDICTION — ``source_model_slice`` as indented JSON.
+      4. STRUCTURAL CHECK METRICS (PASSED) — ``stage1_metrics`` as JSON.
+      5. YOUR TASK — short reminder of the rubric.
+
+    Any ``item_kind`` other than ``"relationship"`` uses the entity layout;
+    ``None`` dict arguments are treated as empty. The finished prompt is
+    also logged at DEBUG level.
+    """
+    # Pick the formatter pair once; everything below is pure layout.
+    if item_kind == "relationship":
+        describe_target = _format_relationship_definition
+        describe_mapping = _format_submitted_relationship_mapping
+    else:
+        describe_target = _format_entity_definition
+        describe_mapping = _format_submitted_entity_mapping
+
+    task_reminder = (
+        "Audit the SEMANTIC correctness of the submitted mapping. Use "
+        "sample_table / execute_sql / get_documents_context as needed, then "
+        "call submit_evaluation EXACTLY ONCE with your verdict. Follow the "
+        "PASS / FAIL(no bubble) / FAIL(bubble) rubric in the system prompt."
+    )
+    sections: List[str] = [
+        "AUDIT TARGET",
+        f"  kind:    {item_kind}",
+        f"  uri:     {item_uri}",
+        *describe_target(item_definition or {}),
+        "",
+        *describe_mapping(submitted_mapping or {}),
+        "",
+        "PLANNER'S PREDICTION",
+        json.dumps(source_model_slice or {}, indent=2, default=str),
+        "",
+        "STRUCTURAL CHECK METRICS (PASSED)",
+        json.dumps(stage1_metrics or {}, indent=2, default=str),
+        "",
+        "YOUR TASK",
+        task_reminder,
+    ]
+
+    rendered = "\n".join(sections)
+    logger.debug(
+        "_build_user_prompt for %s=%s (%d chars):\n%s",
+        item_kind, item_uri, len(rendered), rendered,
+    )
+    return rendered
+
+
+# =====================================================
+# Public entry point
+# =====================================================
+
+
+@trace_agent(name=_TRACE_NAME)
+def run_critic(
+    host: str,
+    token: str,
+    endpoint_name: str,
+    client: Any,
+    *,
+    item_kind: str,
+    item_uri: str,
+    item_definition: dict,
+    submitted_mapping: dict,
+    source_model_slice: dict,
+    stage1_metrics: dict,
+    documents: Optional[list] = None,
+    on_step: Optional[Callable[[str, int], None]] = None,
+    max_iterations: int = MAX_ITERATIONS,
+) -> CriticResult:
+    """Run the Semantic Critic agent for one submitted mapping.
+
+    The Critic autonomously audits ``submitted_mapping`` for semantic
+    correctness using ``sample_table`` / ``execute_sql`` /
+    ``get_documents_context``, then submits a verdict via the terminal
+    ``submit_evaluation`` tool. The resulting :class:`EvalReport` (stage
+    ``"semantic"``) is stored on ``ctx.semantic_eval_report`` and returned in
+    ``CriticResult.report``.
+
+    Args:
+        host: Databricks workspace URL.
+        token: Bearer token for the serving endpoint.
+        endpoint_name: Foundation Model serving endpoint name.
+        client: Databricks SQL client (must expose ``execute_query(sql)``).
+        item_kind: ``"entity"`` or ``"relationship"``.
+        item_uri: The ontology class or property URI under audit.
+        item_definition: Full ontology dict for the item (label/comment,
+            plus attributes for entities or domain/range for relationships).
+        submitted_mapping: The mapping under audit (handler dict shape).
+        source_model_slice: The Planner's slice for this item.
+        stage1_metrics: Metrics from the deterministic evaluator, for
+            context.
+        documents: Optional pre-loaded domain documents — surfaced via
+            ``get_documents_context``.
+        on_step: Optional progress callback ``(msg, pct)`` for UI updates.
+        max_iterations: Upper bound on tool-call iterations (default 6 —
+            smaller than the Generators because auditing is bounded work).
+
+    Returns:
+        A :class:`CriticResult`. ``success`` is True iff the Critic
+        terminated by submitting a verdict; in that case ``report`` holds
+        the resulting :class:`EvalReport`. On failure (budget exhaustion,
+        text-only output, transport error), ``error`` explains why.
+    """
+    iteration_limit = max_iterations if max_iterations is not None else MAX_ITERATIONS
+
+    logger.info(
+        "===== CRITIC START ===== endpoint=%s, kind=%s, uri=%s, max_iter=%d",
+        endpoint_name,
+        item_kind,
+        item_uri,
+        iteration_limit,
+    )
+
+    ctx = ToolContext(
+        host=host.rstrip("/"),
+        token=token,
+        client=client,
+        # The audit target is in the user prompt; metadata/ontology are not
+        # needed by the Critic's tools.
+        metadata={},
+        ontology={},
+        documents=list(documents or []),
+    )
+
+    result = CriticResult(success=False)
+
+    user_prompt = _build_user_prompt(
+        item_kind=item_kind,
+        item_uri=item_uri,
+        item_definition=item_definition or {},
+        submitted_mapping=submitted_mapping or {},
+        source_model_slice=source_model_slice or {},
+        stage1_metrics=stage1_metrics or {},
+    )
+    messages: List[dict] = [
+        {"role": "system", "content": SYSTEM_PROMPT},
+        {"role": "user", "content": user_prompt},
+    ]
+    logger.info(
+        "Critic conversation initialized: system=%d chars, user=%d chars",
+        len(SYSTEM_PROMPT),
+        len(user_prompt),
+    )
+
+    total_usage: Dict[str, int] = {"prompt_tokens": 0, "completion_tokens": 0}
+
+    def _progress_pct(iteration_idx: int) -> int:
+        ratio = (iteration_idx + 1) / max(iteration_limit, 1)
+        return min(5 + int(ratio * 90), 95)
+
+    def notify(msg: str, *, pct: Optional[int] = None) -> None:
+        actual_pct = pct if pct is not None else 5
+        logger.info("CRITIC STEP [%d%%] %s", actual_pct, msg)
+        if on_step:
+            on_step(msg, actual_pct)
+
+    notify(f"Auditing {item_kind} {item_uri}…", pct=1)
+
+    # ------------------------------------------------------------------
+    # Agent loop
+    # ------------------------------------------------------------------
+    for iteration in range(iteration_limit):
+        if iteration > 0:
+            logger.debug(
+                "Iteration %d: waiting %ds before LLM call (rate limit mitigation)",
+                iteration + 1,
+                _ITERATION_DELAY_SEC,
+            )
+            time.sleep(_ITERATION_DELAY_SEC)
+
+        current_iteration = iteration + 1
+        pct = _progress_pct(iteration)
+        logger.info(
+            "----- Critic iteration %d/%d — %d messages, report=%s -----",
+            current_iteration,
+            iteration_limit,
+            len(messages),
+            "set" if ctx.semantic_eval_report is not None else "unset",
+        )
+        notify(
+            f"Critic iteration {current_iteration}/{iteration_limit}…",
+            pct=pct,
+        )
+
+        t0 = time.time()
+        try:
+            llm_response = call_serving_endpoint(
+                host,
+                token,
+                endpoint_name,
+                messages,
+                tools=TOOL_DEFINITIONS,
+                max_tokens=_MAX_TOKENS,
+                temperature=0.1,
+                timeout=LLM_TIMEOUT,
+                trace_name=_TRACE_NAME,
+            )
+        except requests.exceptions.HTTPError as exc:
+            status = exc.response.status_code if exc.response is not None else "?"
+            logger.warning(
+                "Critic iteration %d: HTTPError status=%s",
+                current_iteration,
+                status,
+            )
+            logger.debug(
+                "Critic iteration %d: HTTPError body: %.500s",
+                current_iteration,
+                exc.response.text if exc.response is not None else "N/A",
+            )
+            if exc.response is not None and status in (400, 422):
+                result.error = "LLM endpoint does not support function calling"
+                result.iterations = current_iteration
+                result.usage = total_usage
+                logger.error(
+                    "Critic: endpoint refused tools — cannot produce an evaluation"
+                )
+                return result
+            result.error = f"LLM request failed: {exc}"
+            result.iterations = current_iteration
+            result.usage = total_usage
+            logger.error(
+                "Critic: LLM request failed at iteration %d: %s",
+                current_iteration,
+                exc,
+            )
+            return result
+        except requests.exceptions.ReadTimeout:
+            result.error = f"LLM request timed out after {LLM_TIMEOUT}s"
+            result.iterations = current_iteration
+            result.usage = total_usage
+            logger.error("Critic: timeout at iteration %d", current_iteration)
+            return result
+        except requests.exceptions.RequestException as exc:
+            result.error = f"LLM request failed: {exc}"
+            result.iterations = current_iteration
+            result.usage = total_usage
+            logger.error(
+                "Critic: request exception at iteration %d: %s",
+                current_iteration,
+                exc,
+            )
+            return result
+
+        elapsed_ms = int((time.time() - t0) * 1000)
+        logger.info(
+            "Critic iteration %d: LLM responded in %dms",
+            current_iteration,
+            elapsed_ms,
+        )
+
+        accumulate_usage(total_usage, llm_response.get("usage", {}))
+
+        # Some OpenAI-compatible endpoints send explicit nulls ("choices",
+        # "message", "tool_calls"); coerce them so the loop never crashes.
+        choice = (llm_response.get("choices") or [{}])[0]
+        finish_reason = choice.get("finish_reason", "?")
+        message = choice.get("message") or {}
+        tool_calls = message.get("tool_calls") or []
+        has_content = bool(message.get("content"))
+        logger.info(
+            "Critic iteration %d: finish_reason=%s, tool_calls=%d, has_content=%s",
+            current_iteration,
+            finish_reason,
+            len(tool_calls),
+            has_content,
+        )
+
+        if not tool_calls:
+            # The Critic must terminate via submit_evaluation, never via
+            # free text. Text-only output is a failure.
+            content = (message.get("content") or "")[:500]
+            logger.warning(
+                "Critic iteration %d: produced text without submitting evaluation — %d chars",
+                current_iteration,
+                len(message.get("content") or ""),
+            )
+            result.steps.append(
+                CriticStep(
+                    step_type="output",
+                    content=content,
+                    duration_ms=elapsed_ms,
+                )
+            )
+            result.error = "critic produced text without submitting evaluation"
+            result.iterations = current_iteration
+            result.usage = total_usage
+            notify("Critic produced text without submitting evaluation.", pct=pct)
+            return result
+
+        logger.info(
+            "Critic iteration %d: processing %d tool call(s): [%s]",
+            current_iteration,
+            len(tool_calls),
+            ", ".join(tc.get("function", {}).get("name", "?") for tc in tool_calls),
+        )
+        messages.append(message)
+
+        terminal_success = False
+        for tc_idx, tc in enumerate(tool_calls, 1):
+            func = tc.get("function", {})
+            tool_name = func.get("name", "")
+            raw_args = func.get("arguments", "{}")
+            tool_id = tc.get("id", "")
+
+            # Arguments normally arrive as a JSON string; tolerate null,
+            # malformed, or non-object payloads instead of crashing the audit.
+            try:
+                arguments = json.loads(raw_args) if raw_args else {}
+            except (json.JSONDecodeError, TypeError):
+                arguments = {}
+            if not isinstance(arguments, dict):
+                arguments = {}
+
+            logger.info(
+                "Critic iteration %d: calling tool '%s' (%d/%d)",
+                current_iteration,
+                tool_name,
+                tc_idx,
+                len(tool_calls),
+            )
+
+            if tool_name == "submit_evaluation":
+                notify(f"Submitting evaluation for {item_uri}…", pct=pct)
+            elif tool_name == "sample_table":
+                fn = arguments.get("full_name", "?")
+                notify(f"Sampling {fn}…", pct=pct)
+            elif tool_name == "execute_sql":
+                sql_preview = str(arguments.get("sql") or "")[:80]
+                notify(f"Running SQL: {sql_preview}…", pct=pct)
+            elif tool_name == "get_documents_context":
+                notify("Retrieving documents…", pct=pct)
+            else:
+                notify(f"Calling {tool_name}…", pct=pct)
+
+            result.steps.append(
+                CriticStep(
+                    step_type="tool_call",
+                    content=json.dumps(arguments, default=str)[:500],
+                    tool_name=tool_name,
+                )
+            )
+
+            t1 = time.time()
+            tool_result = dispatch_tool(
+                TOOL_HANDLERS, ctx, tool_name, arguments, trace_name=_TRACE_NAME
+            )
+            tool_ms = int((time.time() - t1) * 1000)
+
+            logger.info(
+                "Critic iteration %d: tool '%s' returned %d chars in %dms",
+                current_iteration,
+                tool_name,
+                len(tool_result),
+                tool_ms,
+            )
+
+            result.steps.append(
+                CriticStep(
+                    step_type="tool_result",
+                    content=(
+                        (tool_result[:500] + "…")
+                        if len(tool_result) > 500
+                        else tool_result
+                    ),
+                    tool_name=tool_name,
+                    duration_ms=tool_ms,
+                )
+            )
+
+            messages.append(
+                {
+                    "role": "tool",
+                    "tool_call_id": tool_id,
+                    "content": tool_result,
+                }
+            )
+
+            # Detect terminal success: submit_evaluation returned success=True
+            # AND stamped an EvalReport onto the context. An invalid status
+            # (the handler returns success=False) does NOT terminate the
+            # loop — the agent continues so it can resubmit a valid verdict.
+            if tool_name == "submit_evaluation":
+                try:
+                    parsed = json.loads(tool_result)
+                except json.JSONDecodeError:
+                    parsed = {}
+                if (
+                    isinstance(parsed, dict)
+                    and parsed.get("success") is True
+                    and ctx.semantic_eval_report is not None
+                ):
+                    terminal_success = True
+                    logger.info(
+                        "Critic iteration %d: submit_evaluation succeeded — terminating",
+                        current_iteration,
+                    )
+
+        if terminal_success:
+            result.success = True
+            result.report = ctx.semantic_eval_report
+            result.iterations = current_iteration
+            result.usage = total_usage
+            logger.info(
+                "===== CRITIC COMPLETE ===== uri=%s, status=%s, bubble=%s, "
+                "iterations=%d, prompt_tokens=%d, completion_tokens=%d",
+                item_uri,
+                result.report.status if result.report else "?",
+                result.report.bubble_to_planner if result.report else "?",
+                result.iterations,
+                total_usage["prompt_tokens"],
+                total_usage["completion_tokens"],
+            )
+            notify(f"Critic verdict submitted for {item_uri}.", pct=100)
+            return result
+
+    # Budget exhausted without a successful submit.
+    result.iterations = iteration_limit
+    result.usage = total_usage
+    result.error = "critic exhausted iteration budget"
+    logger.error("===== CRITIC FAILED ===== %s", result.error)
+    notify(result.error, pct=95)
+    return result
diff --git a/src/agents/agent_mapping_pge/evaluator/deterministic.py b/src/agents/agent_mapping_pge/evaluator/deterministic.py
new file mode 100644
index 00000000..b8e72951
--- /dev/null
+++ b/src/agents/agent_mapping_pge/evaluator/deterministic.py
@@ -0,0 +1,539 @@
+"""Deterministic (stage-1) evaluator for submitted mappings.
+
+This module is pure-Python and has no LLM dependency.  It runs the
+submitted mapping's SQL through a caller-supplied ``execute_sql_fn`` and
+checks structural invariants (row count, distinct id count, dangling
+foreign-key fractions, etc.).
+
+``execute_sql_fn`` contract::
+
+    def execute_sql_fn(sql: str) -> dict
+
+It returns ``{"columns": [...], "rows": [{col: value, ...}, ...]}``.
+Important: this is the *full* result set, not the 3-row sample emitted by
+:func:`agents.tools.sql.tool_execute_sql`.  The orchestrator (Sprint 7) is
+responsible for plugging in a runner that returns full rows — typically a
+thin wrapper around ``DatabricksClient.execute_query``.
+
+All checks compute every metric even when some fail; the resulting
+:class:`~agents.agent_mapping_pge.contracts.EvalReport` lists every failure
+so the Generator/Planner can address them in one shot.
+"""
+
+from typing import Any, Callable, Dict, List, Optional, Tuple
+
+from back.core.logging import get_logger
+from agents.agent_mapping_pge.contracts import EvalFailure, EvalReport
+from agents.agent_mapping_pge.evaluator.report import build_report
+
+logger = get_logger(__name__)
+
+# Thresholds for stage-1 checks.  These are intentionally lax — the
+# semantic evaluator (stage 2) catches subtler issues.
+_DANGLING_FK_FAIL_THRESHOLD = 0.05  # dangling fraction >= this is a failure
+_DANGLING_FK_BUBBLE_THRESHOLD = 0.5  # dangling fraction > this can bubble up
+
+
+SqlFn = Callable[[str], dict]  # signature of the caller-supplied SQL runner
+
+
+# =====================================================
+# Helpers
+# =====================================================
+
+
+def _resolve_id_col(mapping: dict, fallback: str = "ID") -> str:
+    """Name of the row-dict key carrying the entity id (``fallback`` if unset)."""
+    return fallback if not mapping.get("id_column") else mapping["id_column"]
+
+
+def _extract_id_values(rows: List[dict], id_col: str) -> List[Any]:
+    """Collect each row's ``id_col`` value; an absent key yields ``None``."""
+    return list(map(lambda row: row.get(id_col), rows))
+
+
+def _attribute_names(ontology_class: dict) -> List[str]:
+    """Flatten the class's ``attributes`` (strings or dicts) into a list of names."""
+    names: List[str] = []
+    for entry in ontology_class.get("attributes") or []:
+        if isinstance(entry, dict):
+            # Dict-shaped attribute: first truthy of name / uri / label wins;
+            # a dict offering none of them is skipped.
+            picked = entry.get("name") or entry.get("uri") or entry.get("label")
+            names.extend([picked] if picked else [])
+        elif isinstance(entry, str):
+            names.append(entry)
+    return names
+
+
+def _fail(
+    *,
+    check: str,
+    expected: str,
+    observed: str,
+    hint: str,
+    kind: str = "structural",
+) -> EvalFailure:
+    """Build one :class:`EvalFailure` row; ``kind`` defaults to ``"structural"``."""
+    fields = dict(check=check, expected=expected, observed=observed, hint=hint)
+    return EvalFailure(kind=kind, **fields)
+
+
+class _SqlExecError(Exception):
+    """Raised by ``_exec`` when the caller-supplied SQL runner throws.
+
+    Carries ``str()`` of the original exception (chained as ``__cause__``) so
+    the evaluators can turn it into a FAIL report instead of crashing the run.
+    """
+
+
+def _exec(execute_sql_fn: SqlFn, sql: str) -> dict:
+    """Invoke the SQL runner, re-raising any failure as ``_SqlExecError``.
+
+    SQL from generated mappings often parses yet breaks when executed
+    (UNION column-type mismatch, invalid CAST, unknown column).  Such
+    errors are feedback for the generator, so they are funnelled into one
+    exception type the evaluators convert to a FAIL report.
+    """
+    try:
+        outcome = execute_sql_fn(sql) or {}
+    except Exception as exc:  # noqa: BLE001 — any driver error becomes feedback
+        raise _SqlExecError(str(exc)) from exc
+    return outcome
+
+
+def _sql_error_report(*, item: str, sql_error: str) -> EvalReport:
+    """Build a FAIL report for a mapping whose SQL failed to execute.
+
+    ``bubble_to_planner`` stays False: a runtime SQL error is the Generator's
+    to fix (align types, correct columns), not a Planner source-model issue.
+    """
+    # Keep the hint compact: first line only, capped at 300 chars.  An empty
+    # or whitespace-only message has no first line — use a placeholder.
+    err = ((sql_error or "").strip().splitlines() or ["unknown error"])[0][:300]
+    return build_report(
+        stage="deterministic",
+        metrics={"sql_error": err},
+        failures=[
+            _fail(
+                check="sql_execution",
+                expected="SQL executes without error",
+                observed="execution error",
+                hint=(
+                    f"The mapping SQL for '{item}' failed to execute: {err}. "
+                    "Fix the SQL — e.g. align UNION branch column types with "
+                    "explicit CAST (a common cause is one branch typing a "
+                    "column as BIGINT and another as STRING/NULL), correct "
+                    "column names, or use try_cast for malformed values."
+                ),
+            )
+        ],
+        bubble_to_planner=False,
+    )
+
+
+# =====================================================
+# Entity evaluator
+# =====================================================
+
+
+def evaluate_entity_mapping(
+    *,
+    mapping: dict,
+    ontology_class: dict,
+    execute_sql_fn: SqlFn,
+) -> EvalReport:
+    """Run the stage-1 deterministic checks on a submitted entity mapping.
+
+    Args:
+        mapping: Entity mapping as produced by ``tool_submit_entity_mapping``.
+        ontology_class: The targeted ontology-class dict; its ``attributes``
+            list holds name strings or dicts with a ``name`` key.
+        execute_sql_fn: Caller-supplied SQL runner — see module docstring.
+
+    Returns:
+        An :class:`EvalReport` summarising the metrics and any failures.
+        ``bubble_to_planner`` is set when ``row_count == 0`` (typically
+        means the mapping is querying the wrong table altogether).
+    """
+    class_name = mapping.get("class_name") or ontology_class.get("name") or "?"
+    sql = mapping.get("sql_query") or ""  # None-safe: len(sql) is logged below
+    id_col = _resolve_id_col(mapping)
+    logger.info(
+        "evaluate_entity_mapping: class=%s, id_col=%s, sql_len=%d",
+        class_name,
+        id_col,
+        len(sql),
+    )
+
+    try:
+        result = _exec(execute_sql_fn, sql)
+    except _SqlExecError as exc:
+        logger.warning(
+            "evaluate_entity_mapping: class=%s SQL failed to execute: %s",
+            class_name,
+            exc,
+        )
+        return _sql_error_report(item=class_name, sql_error=str(exc))
+    rows = result.get("rows", []) or []
+    row_count = len(rows)
+
+    id_values = _extract_id_values(rows, id_col)
+    null_id_count = sum(1 for v in id_values if v is None)
+    distinct_id_count = len({v for v in id_values if v is not None})
+
+    raw_unmapped = mapping.get("unmapped_attributes") or []
+    declared_unmapped: set = set()
+    for item in raw_unmapped:
+        if isinstance(item, dict):
+            name = item.get("name")
+            if name:
+                declared_unmapped.add(str(name))
+        elif item is not None:
+            declared_unmapped.add(str(item))
+    declared_mapped = set((mapping.get("attribute_mappings") or {}).keys())
+    all_attrs = _attribute_names(ontology_class)
+    unmapped_attrs = [
+        a for a in all_attrs if a not in declared_mapped and a not in declared_unmapped
+    ]
+    unmapped_pct = (len(unmapped_attrs) / len(all_attrs)) if all_attrs else 0.0
+
+    metrics: Dict[str, Any] = {
+        "row_count": row_count,
+        "distinct_id_count": distinct_id_count,
+        "null_id_count": null_id_count,
+        "unmapped_attribute_pct": unmapped_pct,
+        "unmapped_attributes": unmapped_attrs,
+    }
+
+    failures: List[EvalFailure] = []
+    bubble = False
+
+    if row_count == 0:
+        failures.append(
+            _fail(
+                check="row_count",
+                expected="> 0",
+                observed="0",
+                hint=(
+                    f"Entity '{class_name}' SQL returned 0 rows. Check the FROM "
+                    "table is correct and the WHERE clause is not over-filtering."
+                ),
+            )
+        )
+        bubble = True
+
+    # Count duplicates among non-NULL ids only: NULL ids are reported by the
+    # null_id_count check and must not be presented as duplicate values.
+    dupes = (row_count - null_id_count) - distinct_id_count
+    if dupes > 0:
+        failures.append(
+            _fail(
+                check="distinct_id_count",
+                expected=f"== row_count ({row_count})",
+                observed=str(distinct_id_count),
+                hint=(
+                    f"{dupes} duplicate '{id_col}' value(s) in entity '{class_name}'. "
+                    "Add DISTINCT or use a stricter id column."
+                ),
+            )
+        )
+
+    if null_id_count > 0:
+        failures.append(
+            _fail(
+                check="null_id_count",
+                expected="== 0",
+                observed=str(null_id_count),
+                hint=(
+                    f"{null_id_count} row(s) have NULL '{id_col}' in entity "
+                    f"'{class_name}'. Add 'WHERE {id_col} IS NOT NULL' to the SQL."
+                ),
+            )
+        )
+
+    if unmapped_pct > 0:
+        failures.append(
+            _fail(
+                check="unmapped_attribute_pct",
+                expected="== 0",
+                observed=f"{unmapped_pct:.3f}",
+                hint=(
+                    f"{len(unmapped_attrs)} attribute(s) of '{class_name}' are "
+                    f"neither in attribute_mappings nor declared in "
+                    f"unmapped_attributes: {unmapped_attrs}. Map them, or list "
+                    "them explicitly under 'unmapped_attributes'."
+                ),
+            )
+        )
+
+    logger.info(
+        "evaluate_entity_mapping: class=%s -> %s (%d failure(s), bubble=%s)",
+        class_name,
+        "PASS" if not failures else "FAIL",
+        len(failures),
+        bubble,
+    )
+    return build_report(
+        stage="deterministic",
+        metrics=metrics,
+        failures=failures,
+        bubble_to_planner=bubble,
+    )
+
+
+# =====================================================
+# Relationship evaluator
+# =====================================================
+
+
+def _distinct_id_set(
+    entity_mapping: dict,
+    execute_sql_fn: SqlFn,
+    id_universe_cache: Optional[Dict[str, set]] = None,
+) -> set:
+    """Return the non-null id values produced by an entity mapping's SQL.
+
+    ``id_universe_cache``, when given, is keyed by the SQL string: a hit
+    skips execution entirely and a miss stores the freshly built set.
+    ``_SqlExecError`` from ``_exec`` propagates to the caller.
+    """
+    query = entity_mapping.get("sql_query", "")
+    use_cache = id_universe_cache is not None
+    if use_cache and query in id_universe_cache:
+        return id_universe_cache[query]
+    key = _resolve_id_col(entity_mapping)
+    fetched = _exec(execute_sql_fn, query).get("rows", []) or []
+    universe = {row.get(key) for row in fetched} - {None}
+    if use_cache:
+        id_universe_cache[query] = universe
+    return universe
+
+
+def _resolve_edge_columns(mapping: dict) -> Tuple[str, str]:
+    """Pick the edge endpoint column names, defaulting to source_id/target_id."""
+    source_col = mapping.get("source_id_column") or "source_id"
+    target_col = mapping.get("target_id_column") or "target_id"
+    # Order matters: callers unpack as (source, target).
+    return source_col, target_col
+
+
+def evaluate_relationship_mapping(
+    *,
+    mapping: dict,
+    source_entity_mapping: dict,
+    target_entity_mapping: dict,
+    execute_sql_fn: SqlFn,
+    expected_cross_source_overlap_band: Optional[Tuple[float, float]] = None,
+    id_universe_cache: Optional[Dict[str, set]] = None,
+) -> EvalReport:
+    """Run stage-1 deterministic checks on a relationship mapping.
+
+    Checks:
+
+    * ``total_edges > 0``
+    * ``dangling_source_pct < 0.05`` — fraction of edges whose source id is
+      NULL or missing from the source entity's id universe.
+    * ``dangling_target_pct < 0.05`` — same for targets; skipped when an
+      ``expected_cross_source_overlap_band`` is supplied.
+    * With a band ``(lo, hi)``, ``overlap_pct`` (``1 - dangling_target_pct``)
+      must satisfy ``lo <= overlap_pct <= hi``.
+
+    ``bubble_to_planner`` is set when ``total_edges == 0``, when the source
+    dangling fraction exceeds ``0.5``, or when the target dangling fraction
+    exceeds ``0.5`` and either no band was supplied or the band check itself
+    failed.  If any SQL fails to execute, a ``sql_execution`` FAIL report is
+    returned instead (no bubble).
+
+    Args:
+        mapping: Relationship mapping; its ``sql_query`` yields the edge rows.
+        source_entity_mapping: Entity mapping supplying the source id universe.
+        target_entity_mapping: Entity mapping supplying the target id universe.
+        execute_sql_fn: Caller-supplied SQL runner — see module docstring.
+        expected_cross_source_overlap_band: Optional ``(lo, hi)`` overlap band.
+        id_universe_cache: Optional caller-managed dict mapping an entity
+            mapping's ``sql_query`` string to its set of ids, reused across
+            calls instead of re-running the entity SQL.  ``None`` = no caching.
+    """
+    name = mapping.get("property_name") or mapping.get("property") or "?"
+    sql = mapping.get("sql_query", "")
+    src_col, tgt_col = _resolve_edge_columns(mapping)
+    logger.info(
+        "evaluate_relationship_mapping: property=%s, src_col=%s, tgt_col=%s",
+        name,
+        src_col,
+        tgt_col,
+    )
+
+    try:
+        edges_result = _exec(execute_sql_fn, sql)
+        edge_rows = edges_result.get("rows", []) or []
+        total_edges = len(edge_rows)
+
+        source_universe = _distinct_id_set(
+            source_entity_mapping, execute_sql_fn, id_universe_cache
+        )
+        target_universe = _distinct_id_set(
+            target_entity_mapping, execute_sql_fn, id_universe_cache
+        )
+    except _SqlExecError as exc:
+        logger.warning(
+            "evaluate_relationship_mapping: property=%s SQL failed to execute: %s",
+            name,
+            exc,
+        )
+        return _sql_error_report(item=name, sql_error=str(exc))
+
+    src_values = [r.get(src_col) for r in edge_rows]
+    tgt_values = [r.get(tgt_col) for r in edge_rows]
+
+    if total_edges > 0:
+        dangling_src = sum(
+            1 for v in src_values if v is None or v not in source_universe
+        )
+        dangling_tgt = sum(
+            1 for v in tgt_values if v is None or v not in target_universe
+        )
+        dangling_src_pct = dangling_src / total_edges
+        dangling_tgt_pct = dangling_tgt / total_edges
+        overlap_pct = 1.0 - dangling_tgt_pct
+    else:
+        dangling_src_pct = 0.0
+        dangling_tgt_pct = 0.0
+        overlap_pct = 0.0
+
+    metrics: Dict[str, Any] = {
+        "total_edges": total_edges,
+        "dangling_source_pct": dangling_src_pct,
+        "dangling_target_pct": dangling_tgt_pct,
+        "cross_source_overlap_pct": overlap_pct,
+        "source_universe_size": len(source_universe),
+        "target_universe_size": len(target_universe),
+    }
+
+    failures: List[EvalFailure] = []
+    bubble = False
+
+    if total_edges == 0:
+        failures.append(
+            _fail(
+                check="total_edges",
+                expected="> 0",
+                observed="0",
+                hint=(
+                    f"Relationship '{name}' produced 0 edges. Confirm the join "
+                    "predicate is on the right columns and rows are not being "
+                    "filtered away."
+                ),
+            )
+        )
+        bubble = True
+
+    if total_edges > 0 and dangling_src_pct >= _DANGLING_FK_FAIL_THRESHOLD:
+        failures.append(
+            _fail(
+                check="dangling_source_pct",
+                expected=f"< {_DANGLING_FK_FAIL_THRESHOLD}",
+                observed=f"{dangling_src_pct:.3f}",
+                hint=(
+                    f"{dangling_src_pct:.1%} of source_id values in relationship "
+                    f"'{name}' are absent from the mapped source entity. The "
+                    "source entity's id_column is usually an ALIAS for a derived "
+                    "expression (e.g. CONCAT(regexp_extract(<col>,'...'),'-x')). "
+                    "Reproduce that exact id expression from the source entity's "
+                    "SQL for source_id — do not select a raw/trust-local column."
+                ),
+            )
+        )
+        if dangling_src_pct > _DANGLING_FK_BUBBLE_THRESHOLD:
+            bubble = True
+
+    # When an explicit cross-source overlap band is provided the relationship
+    # is *expected* to be partial (e.g. trust_a-only IDs vs the cross-trust
+    # canonical universe).  In that case we trust the band check and skip
+    # the standard ``dangling_target_pct`` strictness — the partiality is
+    # the point.  The catastrophic-dangling bubble below still fires, but
+    # only when the band itself ALSO fails (i.e. the realised overlap is
+    # materially worse than the Planner predicted).
+    if (
+        total_edges > 0
+        and dangling_tgt_pct >= _DANGLING_FK_FAIL_THRESHOLD
+        and expected_cross_source_overlap_band is None
+    ):
+        failures.append(
+            _fail(
+                check="dangling_target_pct",
+                expected=f"< {_DANGLING_FK_FAIL_THRESHOLD}",
+                observed=f"{dangling_tgt_pct:.3f}",
+                hint=(
+                    f"{dangling_tgt_pct:.1%} of target_id values in relationship "
+                    f"'{name}' are absent from the mapped target entity. The "
+                    "target entity's id_column is usually an ALIAS for a derived "
+                    "expression; reproduce that exact id expression from the "
+                    "target entity's SQL for target_id — not a raw join column."
+                ),
+            )
+        )
+
+    band_failed = False
+    if expected_cross_source_overlap_band is not None:
+        lo, hi = expected_cross_source_overlap_band
+        if not (lo <= overlap_pct <= hi):
+            band_failed = True
+            failures.append(
+                _fail(
+                    check="cross_source_overlap_pct",
+                    expected=f"in [{lo:.3f}, {hi:.3f}]",
+                    observed=f"{overlap_pct:.3f}",
+                    hint=(
+                        f"Cross-source overlap for '{name}' is {overlap_pct:.1%}, "
+                        f"outside the expected band [{lo:.1%}, {hi:.1%}]. "
+                        "Check the join key and the source/target trust assignments."
+                    ),
+                )
+            )
+
+    # Bubble-to-planner on catastrophic target-dangling, with a band-aware gate.
+    #
+    # * Band absent + dangling > 0.5: the strict dangling_target_pct failure
+    #   above already fired; we just flip the bubble flag (no new row needed).
+    # * Band present + band PASSED: the Planner predicted this overlap and
+    #   was right — do NOT bubble, even if dangling > 0.5 (the partiality
+    #   was expected).
+    # * Band present + band FAILED + dangling > 0.5: the realised overlap
+    #   is materially worse than predicted.  Bubble, and emit a dedicated
+    #   ``dangling_target_pct_catastrophic`` failure so the FAIL report has
+    #   a concrete structural row alongside the band-check failure.
+    if total_edges > 0 and dangling_tgt_pct > _DANGLING_FK_BUBBLE_THRESHOLD:
+        if expected_cross_source_overlap_band is None:
+            bubble = True
+        elif band_failed:
+            bubble = True
+            failures.append(
+                _fail(
+                    check="dangling_target_pct_catastrophic",
+                    expected=f"<= {_DANGLING_FK_BUBBLE_THRESHOLD}",
+                    observed=f"{dangling_tgt_pct:.3f}",
+                    hint=(
+                        f"{dangling_tgt_pct:.1%} of target_id values in "
+                        f"relationship '{name}' are absent from the mapped "
+                        "target entity AND the realised overlap is outside "
+                        "the predicted band.  Re-plan the join key and the "
+                        "source/target trust assignments."
+                    ),
+                )
+            )
+
+    logger.info(
+        "evaluate_relationship_mapping: %s -> %s (%d failure(s), bubble=%s)",
+        name,
+        "PASS" if not failures else "FAIL",
+        len(failures),
+        bubble,
+    )
+    return build_report(
+        stage="deterministic",
+        metrics=metrics,
+        failures=failures,
+        bubble_to_planner=bubble,
+    )
diff --git a/src/agents/agent_mapping_pge/evaluator/report.py b/src/agents/agent_mapping_pge/evaluator/report.py
new file mode 100644
index 00000000..532f0a7d
--- /dev/null
+++ b/src/agents/agent_mapping_pge/evaluator/report.py
@@ -0,0 +1,37 @@
+"""Small helpers for assembling :class:`EvalReport` objects.
+
+The dataclasses themselves live in
+:mod:`agents.agent_mapping_pge.contracts`; this module just centralises the
+"compose a report from a list of failures" boilerplate so the deterministic
+and (future) semantic evaluators stay short.
+"""
+
+from typing import Any, Dict, List
+
+from back.core.logging import get_logger
+from agents.agent_mapping_pge.contracts import EvalFailure, EvalReport
+
+logger = get_logger(__name__)
+
+
+def build_report(
+    *,
+    stage: str,
+    metrics: Dict[str, Any],
+    failures: List[EvalFailure],
+    bubble_to_planner: bool,
+) -> EvalReport:
+    """Compose an :class:`EvalReport`: FAIL iff ``failures`` is non-empty."""
+    passed = not failures
+    if passed and bubble_to_planner:
+        logger.warning(
+            "build_report: bubble_to_planner=True but no failures → demoted "
+            "to False; check caller logic"
+        )
+    return EvalReport(
+        status="PASS" if passed else "FAIL",
+        stage=stage,
+        metrics=dict(metrics),  # copies, so the report owns its containers
+        failures=list(failures),
+        bubble_to_planner=(not passed) and bool(bubble_to_planner),
+    )
diff --git a/src/agents/agent_mapping_pge/generators/__init__.py b/src/agents/agent_mapping_pge/generators/__init__.py
new file mode 100644
index 00000000..575f858c
--- /dev/null
+++ b/src/agents/agent_mapping_pge/generators/__init__.py
@@ -0,0 +1,31 @@
+"""Generator agents for the mapping-PGE pipeline.
+
+Each Generator is a narrow tool-calling agent that maps ONE ontology item
+(class or relationship) at a time. The orchestrator (Sprint 7) calls them
+per-item with a filtered slice of the Planner's :class:`SourceModel` — the
+Generators never see the full ontology or full metadata, keeping each
+decision cheap and local.
+
+* Sprint 4 — :mod:`agents.agent_mapping_pge.generators.entity`.
+* Sprint 5 — :mod:`agents.agent_mapping_pge.generators.relationship`.
+"""
+
+from agents.agent_mapping_pge.generators.entity import (
+    EntityGenResult,
+    EntityGenStep,
+    run_entity_generator,
+)
+from agents.agent_mapping_pge.generators.relationship import (
+    RelationshipGenResult,
+    RelationshipGenStep,
+    run_relationship_generator,
+)
+
+__all__ = [
+    "EntityGenResult",
+    "EntityGenStep",
+    "run_entity_generator",
+    "RelationshipGenResult",
+    "RelationshipGenStep",
+    "run_relationship_generator",
+]
diff --git a/src/agents/agent_mapping_pge/generators/entity.py b/src/agents/agent_mapping_pge/generators/entity.py
new file mode 100644
index 00000000..2062f5e5
--- /dev/null
+++ b/src/agents/agent_mapping_pge/generators/entity.py
@@ -0,0 +1,852 @@
+"""
+OntoBricks Mapping-PGE EntityGenerator Agent.
+
+Sprint 4 of the Planner-Generator-Evaluator (PGE) redesign.
+
+The EntityGenerator is a narrow, focused LLM agent that maps **one** ontology
+class at a time. The orchestrator (Sprint 7) calls it per item with a
+filtered slice of the Planner's :class:`SourceModel`:
+
+* the single ontology class to map, with its full attribute list, and
+* a small SourceModel slice — only the candidate tables / canonical IDs /
+  joins that are relevant to *this* class.
+
+The Generator does NOT see the full ontology or full metadata. That is the
+core design contract: keep its context bounded and each decision cheap.
+
+The loop shape mirrors :mod:`agents.agent_mapping_pge.planner` — same
+``call_serving_endpoint`` + ``dispatch_tool`` ReAct cycle, same 3-second
+inter-iteration delay, same MLflow trace decorator — with these differences:
+
+* Smaller default budget (12 vs 25): mapping one class is bounded work.
+* Different tool set: only ``execute_sql``, ``sample_table``, and the
+  terminal ``submit_entity_mapping``. The slice already carries every piece
+  of context the Generator needs.
+* No single-shot fallback: if the endpoint refuses tools, the Generator
+  reports failure — it produces structured output through
+  ``submit_entity_mapping`` only.
+* The "NO SILENT DROPS" invariant: every ontology attribute must be either
+  in ``attribute_mappings`` or in ``unmapped_attributes`` with a one-sentence
+  reason. The system prompt enforces this; the tool persists it.
+"""
+
+import json
+import time
+from dataclasses import dataclass, field
+from typing import Any, Callable, Dict, List, Optional
+
+import requests
+
+from back.core.logging import get_logger
+from agents.engine_base import (
+    call_serving_endpoint,
+    dispatch_tool,
+    accumulate_usage,
+)
+from agents.tools.context import ToolContext
+from agents.tools.mapping import (
+    MAPPING_TOOL_DEFINITIONS_BY_NAME,
+    MAPPING_TOOL_HANDLERS,
+)
+from agents.tools.planner import (
+    SAMPLE_TABLE_DEF,
+    tool_sample_table,
+)
+from agents.tools.sql import (
+    SQL_TOOL_DEFINITIONS,
+    SQL_TOOL_HANDLERS,
+)
+from agents.tracing import trace_agent
+
+logger = get_logger(__name__)
+
+MAX_ITERATIONS = 12  # default budget; smaller than the Planner's 25
+LLM_TIMEOUT = 180  # presumably seconds per LLM call — TODO confirm at call site
+_ITERATION_DELAY_SEC = 3  # pause between ReAct iterations, same as the Planner
+# See planner._MAX_TOKENS comment — same rationale for the Generator's
+# submit_entity_mapping JSON (SQL + attribute_mappings can be large).
+_MAX_TOKENS = 50000
+
+_TRACE_NAME = "mapping_pge_entity_generator"
+
+
+# =====================================================
+# Tool aggregation
+# =====================================================
+#
+# The EntityGenerator only needs:
+#   * execute_sql        – validate the composed SELECT before submitting.
+#   * sample_table       – disambiguate when two candidate tables are equally
+#                          plausible (e.g. same confidence in the slice).
+#   * submit_entity_mapping – TERMINAL.
+#
+# We deliberately exclude:
+#   * get_ontology / get_metadata / get_documents_context — the Planner's
+#     view; the slice already has what's needed.
+#   * column_value_overlap / distinct_count — those validate join keys and
+#     canonical IDs, which the Planner already locked in.
+#   * submit_relationship_mapping / submit_source_model — wrong stage.
+
+# Filter MAPPING_TOOL_DEFINITIONS down to just submit_entity_mapping. We
+# look up by name from the by-name index in ``mapping.py`` rather than
+# scanning the list inline. Sprint 5 will reuse the same pattern for
+# ``submit_relationship_mapping``.
+_SUBMIT_ENTITY_DEF: dict = MAPPING_TOOL_DEFINITIONS_BY_NAME["submit_entity_mapping"]
+
+TOOL_DEFINITIONS: List[dict] = [
+    *SQL_TOOL_DEFINITIONS,
+    SAMPLE_TABLE_DEF,
+    _SUBMIT_ENTITY_DEF,
+]
+
+TOOL_HANDLERS: Dict[str, Callable] = dict(
+    SQL_TOOL_HANDLERS,
+    sample_table=tool_sample_table,
+    submit_entity_mapping=MAPPING_TOOL_HANDLERS["submit_entity_mapping"],
+)
+
+
+# =====================================================
+# Data classes
+# =====================================================
+
+
+@dataclass
+class EntityGenStep:
+    """One observable step of the EntityGenerator's execution.
+
+    Mirrors :class:`agents.agent_mapping_pge.planner.PlannerStep` but is
+    scoped to the Generator so the orchestrator (Sprint 7) can render a
+    per-class timeline in the UI.
+    """
+
+    step_type: str  # "tool_call" | "tool_result" | "output"
+    content: str  # text recorded for this step
+    tool_name: str = ""  # presumably set only for tool steps — TODO confirm
+    duration_ms: int = 0  # presumably elapsed milliseconds — TODO confirm
+
+
+@dataclass
+class EntityGenResult:
+    """Outcome of a single EntityGenerator invocation.
+
+    ``mapping`` holds the submitted entity-mapping dict (the same shape the
+    handler appends to ``ctx.entity_mappings``) when ``success`` is True.
+    """
+
+    success: bool
+    mapping: Optional[dict] = None  # stays None unless a mapping was submitted
+    steps: List[EntityGenStep] = field(default_factory=list)  # fresh list each
+    iterations: int = 0  # presumably loop iterations consumed — TODO confirm
+    error: str = ""  # presumably empty on success — TODO confirm
+    usage: Dict[str, int] = field(default_factory=dict)  # token counts? confirm
+
+
+# =====================================================
+# System prompt
+# =====================================================
+#
+# The ENTITY SQL RULES section is lifted verbatim from the legacy in-house
+# mapping agent (the section starting "SQL RULES FOR ENTITIES") because
+# those rules are correct and load-bearing — every mapping query must
+# follow them or downstream SPARQL translation breaks.
+#
+# The PGE-specific additions are the slice-consumption rules: pick the
+# best candidate table from the slice, use the canonical ID exactly as
+# the Planner specified it, and account for every ontology attribute.
+
+SYSTEM_PROMPT = """\
+You are a senior data engineer. Your job is to map ONE ontology class to a \
+single SQL SELECT query against a Databricks source table, validated against \
+real data via execute_sql, and submitted via submit_entity_mapping.
+
+YOU WILL BE GIVEN
+• ontology_class: the class to map (uri, label, comment, attributes list).
+• source_model_slice: a small JSON object the Planner already curated for \
+this class:
+  - candidate_tables[]: {table, confidence, reason} — the tables that could \
+realise this class.
+  - canonical_id.canonical_column_per_table[<table>]: the expression that \
+MUST be aliased AS ID for each table. THIS VALUE MAY BE A BARE COLUMN \
+NAME ("CUSTOMER_ID") OR A FULL SQL EXPRESSION \
+("regexp_extract(order_ref, '([a-f0-9-]+-ord-[0-9]+)')"). Drop it \
+verbatim into the SELECT and alias it AS ID — do NOT rewrite it, do NOT \
+pick a different column, do NOT strip the function call. The Planner emits \
+SQL expressions when raw column values across sources are in different \
+formats and need to be normalized to a common canonical key.
+  - canonical_id.format_note: a one-sentence note describing the canonical \
+key (may be empty). Read it to understand what each row's ID represents.
+  - relevant_joins[]: optional — any joins the Planner thinks may apply.
+
+SINGLE-SOURCE vs CROSS-SOURCE (CRITICAL — read carefully)
+The number of entries in canonical_id.canonical_column_per_table is the \
+authoritative signal for how to shape your SELECT:
+
+  • If canonical_column_per_table has EXACTLY ONE table → single-source \
+class. Write a flat SELECT from that one table. Pick it from the matching \
+candidate_tables entry.
+
+  • If canonical_column_per_table has TWO OR MORE tables → CROSS-SOURCE \
+class (e.g. the same customer or order realised across multiple sources). \
+You MUST emit a UNION ALL across ALL listed tables, NOT pick one. Each \
+branch uses that table's canonical-ID column AS ID. Picking just one would \
+produce an entity missing a large fraction of its real instances, and every \
+relationship pointing at it would then dangle. \
+This is the #1 failure mode the orchestrator catches — do not produce it.
+
+  UNION shape (use exactly this pattern — substitute the canonical-ID \
+EXPRESSION exactly as the Planner specified it for that table; do NOT \
+rewrite it):
+    SELECT <canonical_expr_A> AS ID, <label_col_A> AS Label, \
+<attr cols from A> FROM <table_A> WHERE <canonical_expr_A> IS NOT NULL
+    UNION ALL
+    SELECT <canonical_expr_B> AS ID, <label_col_B> AS Label, \
+<attr cols from B> FROM <table_B> WHERE <canonical_expr_B> IS NOT NULL
+    UNION ALL
+    ...
+
+  All branches must return the SAME columns in the SAME order, AND each \
+column must have the SAME TYPE in every branch. If a branch lacks a column \
+another branch has, project a NULL with a matching alias **cast to the same \
+type the real branch uses** (e.g. if branch A has ``ACCOUNT_ID`` typed \
+BIGINT, branch B must use ``CAST(NULL AS BIGINT) AS ACCOUNT_ID`` — not \
+``AS STRING``). When two branches hold the column with DIFFERENT types, cast \
+BOTH to a common type (``CAST(... AS STRING)`` is the safe default). A \
+``CAST_INVALID_INPUT`` / type-mismatch error from execute_sql always means a \
+column's types differ across branches — fix the casts, do not change the ID.
+
+TOOLS
+You have three tools:
+  • execute_sql           – Validate the composed SELECT before submitting. \
+The tool runs your query with a small LIMIT and returns columns + sample \
+rows; the persisted mapping has no LIMIT.
+  • sample_table          – Up to N random rows from a table. Use only when \
+two candidate tables are equally plausible and you need to peek at real \
+values to disambiguate.
+  • submit_entity_mapping – TERMINAL. Call exactly once, after execute_sql \
+succeeds, with the full mapping payload.
+
+SQL RULES FOR ENTITIES (CRITICAL)
+• Always use the full table name from the slice (catalog.schema.table).
+• The FIRST column MUST be aliased AS ID — it MUST be the canonical-ID \
+column the slice specifies for the chosen table.
+• The SECOND column MUST be aliased AS Label — pick the most human-readable \
+available column (typically ``name``, ``label``, ``display_name``, or \
+similar). If no human-readable column exists, fall back to the canonical \
+ID column itself aliased AS Label.
+• Add one column per ontology data-property attribute you can satisfy from \
+the chosen table. Use the column's original name (no alias).
+• If the same column serves as both an alias and an attribute, include it \
+twice: once with the alias (AS ID or AS Label) and once with its original \
+name so it appears in attribute_mappings.
+• Add WHERE <id_column> IS NOT NULL to filter null keys. When the ID is a \
+derived expression, also exclude empty extractions (e.g. \
+``WHERE regexp_extract(...) <> ''``).
+• DEDUP COLLAPSED KEYS: when the canonical-ID is a derived EXPRESSION that \
+can repeat across rows (e.g. a ``<core>-line`` key where several child rows \
+share one parent core), the same ID will appear on multiple rows and the \
+evaluator FAILs on "duplicate ID values". Make each node id unique: wrap the \
+UNION in ``SELECT ... FROM (<union>) GROUP BY ID`` (taking MAX() of each \
+attribute) or use ``SELECT DISTINCT`` when there are no attributes. The id \
+column must have exactly one row per distinct value.
+• Do NOT add LIMIT — the persisted mapping query must return ALL rows. \
+execute_sql adds a small LIMIT internally for validation only.
+• Do NOT use ORDER BY, CTEs, or subqueries unless absolutely necessary.
+• Write simple, flat SELECT statements.
+
+REGEX SAFETY (CRITICAL — applies to EVERY regex you write)
+• ALWAYS use explicit character classes: ``[0-9]`` for digits, ``[a-z]`` / \
+``[A-Za-z]`` for letters. NEVER use the backslash escapes ``\\d``, ``\\w``, \
+``\\s``. The OntoBricks build pipeline strips a lone backslash, so ``\\d`` \
+silently degrades to the literal ``d`` and the mapping breaks AFTER it has \
+already passed validation here. This applies to the canonical-ID expression \
+(use it verbatim from the slice — the Planner already emits ``[0-9]``) AND to \
+any CASE/RLIKE you write for value harmonization below.
+
+VALUE HARMONIZATION (controlled-vocabulary attributes)
+Some attributes are CODED: the same real-world value is spelled differently \
+across sources (e.g. a status as 'A' / 'Active' / 'ACTIVE'; a category code as \
+'CS' / 'C-Section' / 'cs'; a flag as 'Y' / 'true' / '1'). A raw column copied \
+verbatim then has a source-fractured, un-aggregatable vocabulary and the KPI it \
+feeds is garbage.
+When an attribute is a controlled vocabulary (the class/attribute name implies \
+a small fixed value set — method, status, type, mode, outcome, category, \
+classification — or sampling reveals a handful of distinct codes):
+  1. DISCOVER the raw distinct values first. For each covering table run \
+``SELECT DISTINCT <col> FROM <table> LIMIT 50`` via execute_sql (or \
+sample_table). Do NOT guess the value set — harmonize what is actually there.
+  2. Map every raw spelling to ONE canonical lowercase token with a CASE \
+expression aliased to the attribute's clean name. Use the SAME token set in \
+EVERY UNION branch so the entity carries one coherent vocabulary regardless of \
+source system. Domain-neutral example (a status attribute):
+        CASE
+          WHEN lower(STATUS_CODE) RLIKE 'a|active|open' THEN 'active'
+          WHEN lower(STATUS_CODE) RLIKE 'c|closed|done' THEN 'closed'
+          WHEN lower(STATUS_CODE) RLIKE 'p|pending|hold' THEN 'pending'
+          ELSE NULL
+        END AS status
+  3. Record it in attribute_mappings: ontology attribute name → the alias you \
+chose (e.g. "status" → "status").
+  4. This is a LEGITIMATE exception to "use the column's original name": a \
+harmonized attribute is a CASE expression aliased to the clean attribute name. \
+Plain, non-coded attributes (dates, numbers, names, free-text) still use their \
+original column name unaliased.
+
+ATTRIBUTE COVERAGE — NO SILENT DROPS (CRITICAL)
+For EACH ontology attribute on the class, you must do ONE of:
+  (a) include a SQL column for it in the SELECT, AND add an entry to \
+attribute_mappings mapping the ontology attribute name to the SQL column \
+name (case-sensitive); OR
+  (b) add it to unmapped_attributes with a one-sentence reason, using the \
+shape {"name": "<attr>", "reason": "<why>"}.
+
+You may NOT silently drop an attribute. The orchestrator will reject any \
+mapping where some ontology attributes appear in neither list. If a column \
+genuinely does not exist on the chosen table, that's an honest unmapped — \
+say so in the reason.
+
+WORKFLOW
+1. Read the ontology class and the source_model_slice carefully.
+2. COUNT the entries in canonical_id.canonical_column_per_table:
+   - one → single-source: pick that table, compose a flat SELECT.
+   - two or more → cross-source: compose a UNION ALL across ALL of them \
+(see the SINGLE-SOURCE vs CROSS-SOURCE block above). Do NOT pick one.
+3. Compose the SELECT (or UNION ALL) following the SQL RULES above. For \
+each branch, the value of canonical_column_per_table[<that_table>] is what \
+gets aliased AS ID — drop it in verbatim. It may already be a SQL \
+expression (e.g. ``regexp_extract(...)``); do not rewrite it. For any coded \
+attribute, apply VALUE HARMONIZATION (sample the distinct values, then a CASE \
+to a shared canonical token set across all branches).
+4. Call execute_sql to validate the SELECT. If it fails, READ the error and \
+fix the SQL (typically a typo'd column name, mismatched column lists in a \
+UNION, or wrong full_name). Retry as needed. Never submit an un-validated \
+query.
+5. Once execute_sql succeeds, call submit_entity_mapping EXACTLY ONCE with:
+     class_uri, class_name, sql_query (no LIMIT), id_column, label_column, \
+attribute_mappings, unmapped_attributes.
+6. That's the terminal step. Do not emit any free text after submitting.
+
+GENERAL RULES
+• Only ever pass row-returning queries (SELECT / WITH …) to execute_sql.
+• Do not call get_metadata, get_ontology, or any other tool — they are not \
+available to you. The slice carries everything you need.
+• If a retry_hint is present at the top of the user message, treat it as \
+authoritative — your previous attempt failed for the reason stated and you \
+should NOT repeat the same mistake.
+"""
+
+
+# =====================================================
+# Internal helpers
+# =====================================================
+
+
+def _build_user_prompt(
+    ontology_class: dict,
+    source_model_slice: dict,
+    retry_hint: Optional[str] = None,
+) -> str:
+    """Render the per-class user prompt.
+
+    Layout, top to bottom: the ``retry_hint`` (only when truthy), the class
+    uri / label / optional comment, one bullet per attribute so the LLM
+    cannot forget any, ``source_model_slice`` dumped as indented JSON, and a
+    closing task reminder. The reminder names both query shapes (flat SELECT
+    and UNION ALL) so it agrees with the system prompt's cross-source rule.
+    Returns the prompt text, which is also logged in full at DEBUG level.
+    """
+    parts: List[str] = []
+
+    if retry_hint:
+        parts.extend([f"RETRY HINT (authoritative): {retry_hint}", ""])
+
+    class_uri = ontology_class.get("uri", "")
+    class_label = ontology_class.get("label") or ontology_class.get("name", "")
+    class_comment = ontology_class.get("comment", "") or ""
+    attributes = ontology_class.get("attributes", []) or []
+
+    attr_summary_lines: List[str] = []
+    for attr in attributes:
+        if isinstance(attr, dict):
+            # Chain with ``or`` so a present-but-empty uri still renders as "?".
+            attr_name = attr.get("name") or attr.get("label") or attr.get("uri") or "?"
+            attr_type = attr.get("type") or attr.get("range") or ""
+            type_suffix = f" ({attr_type})" if attr_type else ""
+            attr_summary_lines.append(f"  - {attr_name}{type_suffix}")
+        else:
+            attr_summary_lines.append(f"  - {attr}")
+
+    parts.append("ONTOLOGY CLASS")
+    parts.append(f"  uri:     {class_uri}")
+    parts.append(f"  label:   {class_label}")
+    if class_comment:
+        parts.append(f"  comment: {class_comment}")
+    if attr_summary_lines:
+        parts.append("  attributes ({} total):".format(len(attributes)))
+        parts.extend(attr_summary_lines)
+    else:
+        parts.append("  attributes: (none — only ID and Label required)")
+
+    parts.extend(["", "SOURCE MODEL SLICE"])
+    parts.append(json.dumps(source_model_slice, indent=2, default=str))
+
+    # The reminder must cover the cross-source case too: telling the LLM to
+    # "pick one table" here would contradict the system prompt's UNION ALL rule.
+    parts.append("")
+    parts.append(
+        "Compose the SELECT following the SQL RULES: a flat SELECT from the "
+        "best candidate table when canonical_id.canonical_column_per_table "
+        "lists a single table, or a UNION ALL across ALL listed tables when it "
+        "lists two or more (do NOT pick just one). Validate with execute_sql, "
+        "then call submit_entity_mapping exactly once. Every ontology attribute "
+        "must appear in either attribute_mappings or unmapped_attributes — no "
+        "silent drops."
+    )
+
+    prompt = "\n".join(parts)
+    logger.debug(
+        "_build_user_prompt for class=%s (%d chars):\n%s",
+        class_uri, len(prompt), prompt,
+    )
+    return prompt
+
+
+# =====================================================
+# Public entry point
+# =====================================================
+
+
+@trace_agent(name="mapping_pge_entity_generator")
+def run_entity_generator(
+    host: str,
+    token: str,
+    endpoint_name: str,
+    client: Any,
+    *,
+    ontology_class: dict,
+    source_model_slice: dict,
+    retry_hint: Optional[str] = None,
+    on_step: Optional[Callable[[str, int], None]] = None,
+    max_iterations: int = MAX_ITERATIONS,
+) -> EntityGenResult:
+    """Run the EntityGenerator agent for a single ontology class.
+
+    The agent autonomously composes a SQL SELECT for ``ontology_class``
+    against the candidate table(s) in ``source_model_slice``, validates the
+    SQL with ``execute_sql``, and submits the validated mapping via the
+    terminal ``submit_entity_mapping`` tool.
+
+    Args:
+        host: Databricks workspace URL.
+        token: Bearer token for the serving endpoint.
+        endpoint_name: Foundation Model serving endpoint name.
+        client: Databricks SQL client (must expose ``execute_query(sql)``).
+        ontology_class: Full dict for the SINGLE class to map (uri, label,
+            comment, attributes list).
+        source_model_slice: Filtered SourceModel slice with candidate_tables,
+            canonical_id, and optional relevant_joins.
+        retry_hint: Optional one-sentence hint from the orchestrator's
+            previous-attempt evaluation. When present, surfaced at the top of
+            the user prompt.
+        on_step: Optional progress callback ``(msg, pct)`` for UI updates.
+        max_iterations: Upper bound on tool-call iterations (default 12 —
+            smaller than the Planner because the scope is one class).
+
+    Returns:
+        An :class:`EntityGenResult`. ``success`` is True iff a mapping was
+        successfully submitted; in that case ``mapping`` holds the submitted
+        dict. On failure, ``error`` explains why and ``mapping`` is None.
+    """
+    iteration_limit = max_iterations if max_iterations is not None else MAX_ITERATIONS
+
+    class_uri = (ontology_class or {}).get("uri", "")
+    class_label = (
+        (ontology_class or {}).get("label")
+        or (ontology_class or {}).get("name", "")
+    )
+    n_attrs = len(((ontology_class or {}).get("attributes") or []))
+    n_candidates = len(((source_model_slice or {}).get("candidate_tables") or []))
+
+    logger.info(
+        "===== ENTITY GENERATOR START ===== endpoint=%s, class=%s (%s), "
+        "attributes=%d, candidate_tables=%d, retry_hint=%s, max_iter=%d",
+        endpoint_name,
+        class_label,
+        class_uri,
+        n_attrs,
+        n_candidates,
+        "yes" if retry_hint else "no",
+        iteration_limit,
+    )
+
+    ctx = ToolContext(
+        host=host.rstrip("/"),
+        token=token,
+        client=client,
+        # The slice (delivered via the user prompt) subsumes metadata/ontology
+        # for this agent; the unified ToolContext still needs these fields, so
+        # they are passed empty. NOTE(review): presumably no handler reads them.
+        metadata={},
+        ontology={},
+        documents=[],
+    )
+
+    result = EntityGenResult(success=False)
+
+    user_prompt = _build_user_prompt(
+        ontology_class or {}, source_model_slice or {}, retry_hint=retry_hint
+    )
+    messages: List[dict] = [
+        {"role": "system", "content": SYSTEM_PROMPT},
+        {"role": "user", "content": user_prompt},
+    ]
+    logger.info(
+        "EntityGenerator conversation initialized: system=%d chars, user=%d chars",
+        len(SYSTEM_PROMPT),
+        len(user_prompt),
+    )
+
+    total_usage: Dict[str, int] = {"prompt_tokens": 0, "completion_tokens": 0}
+
+    def _progress_pct(iteration_idx: int) -> int:
+        # Linear ramp 5 → 95 across the iteration budget. submit hits 100.
+        ratio = (iteration_idx + 1) / max(iteration_limit, 1)
+        return min(5 + int(ratio * 90), 95)
+
+    def notify(msg: str, *, pct: Optional[int] = None) -> None:
+        actual_pct = pct if pct is not None else 5
+        logger.info("ENTITY GEN STEP [%d%%] %s", actual_pct, msg)
+        if on_step:
+            on_step(msg, actual_pct)
+
+    notify(f"Generating mapping for {class_label or class_uri}…", pct=1)
+
+    # Snapshot the pre-existing mapping count so we can detect "this run
+    # added a mapping" without relying on absolute counters. (The orchestrator
+    # in Sprint 7 may reuse a ToolContext across calls; today's `ctx` is
+    # fresh, but the assertion is cheap and future-proof.)
+    pre_run_mapping_count = len(ctx.entity_mappings)
+
+    # ------------------------------------------------------------------
+    # Agent loop
+    # ------------------------------------------------------------------
+    for iteration in range(iteration_limit):
+        if iteration > 0:
+            logger.debug(
+                "Iteration %d: waiting %ds before LLM call (rate limit mitigation)",
+                iteration + 1,
+                _ITERATION_DELAY_SEC,
+            )
+            time.sleep(_ITERATION_DELAY_SEC)
+
+        current_iteration = iteration + 1
+        pct = _progress_pct(iteration)
+        logger.info(
+            "----- EntityGenerator iteration %d/%d — %d messages, mapping=%s -----",
+            current_iteration,
+            iteration_limit,
+            len(messages),
+            "set" if len(ctx.entity_mappings) > pre_run_mapping_count else "unset",
+        )
+        notify(
+            f"Mapping iteration {current_iteration}/{iteration_limit}…",
+            pct=pct,
+        )
+
+        t0 = time.time()
+        try:
+            llm_response = call_serving_endpoint(
+                host,
+                token,
+                endpoint_name,
+                messages,
+                tools=TOOL_DEFINITIONS,
+                max_tokens=_MAX_TOKENS,
+                temperature=0.1,
+                timeout=LLM_TIMEOUT,
+                trace_name=_TRACE_NAME,
+            )
+        except requests.exceptions.HTTPError as exc:
+            status = exc.response.status_code if exc.response is not None else "?"
+            logger.warning(
+                "EntityGenerator iteration %d: HTTPError status=%s",
+                current_iteration,
+                status,
+            )
+            logger.debug(
+                "EntityGenerator iteration %d: HTTPError body: %.500s",
+                current_iteration,
+                exc.response.text if exc.response is not None else "N/A",
+            )
+            if exc.response is not None and status in (400, 422):
+                result.error = "LLM endpoint does not support function calling"
+                result.iterations = current_iteration
+                result.usage = total_usage
+                logger.error(
+                    "EntityGenerator: endpoint refused tools — cannot produce a mapping"
+                )
+                return result
+            result.error = f"LLM request failed: {exc}"
+            result.iterations = current_iteration
+            result.usage = total_usage
+            logger.error(
+                "EntityGenerator: LLM request failed at iteration %d: %s",
+                current_iteration,
+                exc,
+            )
+            return result
+        except requests.exceptions.ReadTimeout:
+            result.error = f"LLM request timed out after {LLM_TIMEOUT}s"
+            result.iterations = current_iteration
+            result.usage = total_usage
+            logger.error("EntityGenerator: timeout at iteration %d", current_iteration)
+            return result
+        except requests.exceptions.RequestException as exc:
+            result.error = f"LLM request failed: {exc}"
+            result.iterations = current_iteration
+            result.usage = total_usage
+            logger.error(
+                "EntityGenerator: request exception at iteration %d: %s",
+                current_iteration,
+                exc,
+            )
+            return result
+
+        elapsed_ms = int((time.time() - t0) * 1000)
+        logger.info(
+            "EntityGenerator iteration %d: LLM responded in %dms",
+            current_iteration,
+            elapsed_ms,
+        )
+
+        accumulate_usage(total_usage, llm_response.get("usage", {}))
+
+        choice = (llm_response.get("choices") or [{}])[0]  # tolerate null / []
+        finish_reason = choice.get("finish_reason", "?")
+        message = choice.get("message") or {}
+        tool_calls = message.get("tool_calls") or []  # some endpoints send null
+        has_content = bool(message.get("content"))
+        logger.info(
+            "EntityGenerator iteration %d: finish_reason=%s, tool_calls=%d, has_content=%s",
+            current_iteration,
+            finish_reason,
+            len(tool_calls),
+            has_content,
+        )
+
+        if not tool_calls:
+            # The Generator must terminate via submit_entity_mapping, never
+            # via free text.
+            content = (message.get("content") or "")[:500]
+            logger.warning(
+                "EntityGenerator iteration %d: produced text without submitting mapping — %d chars",
+                current_iteration,
+                len(message.get("content") or ""),
+            )
+            result.steps.append(
+                EntityGenStep(
+                    step_type="output",
+                    content=content,
+                    duration_ms=elapsed_ms,
+                )
+            )
+            result.error = "entity generator produced text without submitting mapping"
+            result.iterations = current_iteration
+            result.usage = total_usage
+            notify(
+                "Entity generator produced text without submitting mapping.",
+                pct=pct,
+            )
+            return result
+
+        logger.info(
+            "EntityGenerator iteration %d: processing %d tool call(s): [%s]",
+            current_iteration,
+            len(tool_calls),
+            ", ".join(
+                tc.get("function", {}).get("name", "?") for tc in tool_calls
+            ),
+        )
+        messages.append(message)
+
+        terminal_success = False
+        for tc_idx, tc in enumerate(tool_calls, 1):
+            func = tc.get("function", {})
+            tool_name = func.get("name", "")
+            raw_args = func.get("arguments", "{}")
+            tool_id = tc.get("id", "")
+
+            try:
+                parsed_args = json.loads(raw_args)
+            except (json.JSONDecodeError, TypeError):
+                parsed_args = None  # malformed / null arguments
+            arguments = parsed_args if isinstance(parsed_args, dict) else {}
+
+            logger.info(
+                "EntityGenerator iteration %d: calling tool '%s' (%d/%d)",
+                current_iteration,
+                tool_name,
+                tc_idx,
+                len(tool_calls),
+            )
+
+            # Human-readable progress messages per tool.
+            if tool_name == "submit_entity_mapping":
+                notify(f"Submitting mapping for {class_label or class_uri}…", pct=pct)
+            elif tool_name == "sample_table":
+                notify(f"Sampling {arguments.get('full_name', '?')}…", pct=pct)
+            elif tool_name == "execute_sql":
+                sql_preview = str(arguments.get("sql") or "")[:80]
+                notify(f"Running SQL: {sql_preview}…", pct=pct)
+            else:
+                notify(f"Calling {tool_name}…", pct=pct)
+
+            result.steps.append(
+                EntityGenStep(
+                    step_type="tool_call",
+                    content=json.dumps(arguments, default=str)[:500],
+                    tool_name=tool_name,
+                )
+            )
+
+            t1 = time.time()
+            tool_result = dispatch_tool(
+                TOOL_HANDLERS, ctx, tool_name, arguments, trace_name=_TRACE_NAME
+            )
+            tool_ms = int((time.time() - t1) * 1000)
+
+            logger.info(
+                "EntityGenerator iteration %d: tool '%s' returned %d chars in %dms",
+                current_iteration,
+                tool_name,
+                len(tool_result),
+                tool_ms,
+            )
+
+            result.steps.append(
+                EntityGenStep(
+                    step_type="tool_result",
+                    content=(
+                        (tool_result[:500] + "…")
+                        if len(tool_result) > 500
+                        else tool_result
+                    ),
+                    tool_name=tool_name,
+                    duration_ms=tool_ms,
+                )
+            )
+
+            messages.append(
+                {
+                    "role": "tool",
+                    "tool_call_id": tool_id,
+                    "content": tool_result,
+                }
+            )
+
+            # Detect terminal success: submit_entity_mapping returned
+            # success=True AND a mapping for THIS class_uri is present in
+            # ctx.entity_mappings. A submit with a mismatched class_uri (the
+            # LLM mapped a different class than requested) is NOT terminal —
+            # we coach the LLM via a corrective tool message and let the loop
+            # continue so it can resubmit with the right URI.
+            if tool_name == "submit_entity_mapping":
+                try:
+                    parsed = json.loads(tool_result)
+                except json.JSONDecodeError:
+                    parsed = {}
+                if isinstance(parsed, dict) and parsed.get("success") is True:
+                    matched = any(
+                        m.get("ontology_class") == class_uri
+                        for m in ctx.entity_mappings
+                    )
+                    if matched:
+                        terminal_success = True
+                        logger.info(
+                            "EntityGenerator iteration %d: submit_entity_mapping succeeded — terminating",
+                            current_iteration,
+                        )
+                    else:
+                        submitted_uri = arguments.get("class_uri", "")
+                        mismatch_msg = (
+                            f"submitted class_uri '{submitted_uri}' does not "
+                            f"match requested class_uri '{class_uri}'; "
+                            f"resubmit with class_uri='{class_uri}'"
+                        )
+                        logger.warning(
+                            "EntityGenerator iteration %d: submit_entity_mapping "
+                            "class_uri mismatch — submitted=%s, requested=%s",
+                            current_iteration,
+                            submitted_uri,
+                            class_uri,
+                        )
+                        corrective_payload = json.dumps(
+                            {"success": False, "error": mismatch_msg}
+                        )
+                        # Replace the recorded tool_result step's content so
+                        # the UI / trace reflects the corrective signal
+                        # rather than the original (misleading) success
+                        # response.
+                        result.steps[-1] = EntityGenStep(
+                            step_type="tool_result",
+                            content=corrective_payload,
+                            tool_name=tool_name,
+                            duration_ms=result.steps[-1].duration_ms,
+                        )
+                        # Replace the tool message just appended to
+                        # ``messages`` so the LLM sees the corrective
+                        # payload on the next turn (one tool message per
+                        # tool_call_id — keep the protocol clean).
+                        messages[-1] = {
+                            "role": "tool",
+                            "tool_call_id": tool_id,
+                            "content": corrective_payload,
+                        }
+
+        if terminal_success:
+            # Pull the mapping for this class by strict URI match. The
+            # terminal-success guard above already verified an entry with
+            # this URI exists; if we somehow can't find one here that's an
+            # internal invariant violation, not a recoverable failure.
+            submitted = next(
+                (
+                    m
+                    for m in reversed(ctx.entity_mappings)
+                    if m.get("ontology_class") == class_uri
+                ),
+                None,
+            )
+            if submitted is None:
+                result.error = (
+                    "internal: submit succeeded but mapping not found for class_uri"
+                )
+                result.iterations = current_iteration
+                result.usage = total_usage
+                logger.error(
+                    "===== ENTITY GENERATOR FAILED ===== %s (class=%s)",
+                    result.error,
+                    class_uri,
+                )
+                return result
+            result.success = True
+            result.mapping = submitted
+            result.iterations = current_iteration
+            result.usage = total_usage
+            logger.info(
+                "===== ENTITY GENERATOR COMPLETE ===== class=%s, iterations=%d, "
+                "prompt_tokens=%d, completion_tokens=%d",
+                class_uri,
+                result.iterations,
+                total_usage["prompt_tokens"],
+                total_usage["completion_tokens"],
+            )
+            notify(f"Mapping for {class_label or class_uri} complete!", pct=100)
+            return result
+
+    # Budget exhausted without a successful submit.
+    result.iterations = iteration_limit
+    result.usage = total_usage
+    result.error = "entity generator exhausted iteration budget"
+    logger.error("===== ENTITY GENERATOR FAILED ===== %s", result.error)
+    notify(result.error, pct=95)
+    return result
diff --git a/src/agents/agent_mapping_pge/generators/relationship.py b/src/agents/agent_mapping_pge/generators/relationship.py
new file mode 100644
index 00000000..7d32ad63
--- /dev/null
+++ b/src/agents/agent_mapping_pge/generators/relationship.py
@@ -0,0 +1,875 @@
+"""
+OntoBricks Mapping-PGE RelationshipGenerator Agent.
+
+Sprint 5 of the Planner-Generator-Evaluator (PGE) redesign.
+
+The RelationshipGenerator is the sibling of :mod:`.entity` — same ReAct
+loop shape and tooling discipline, narrower scope. It maps **one** ontology
+property (relationship) at a time, given:
+
+* the property to map (uri, label, comment, domain, range),
+* the source and target **entity mappings already produced by the
+  EntityGenerator** — crucially, the ``id_column`` each side mapped on, and
+* a small SourceModel slice that surfaces the relevant join-key subgraph.
+
+The system prompt FORBIDS picking endpoint columns that do not match the
+already-mapped entity IDs: the source/target endpoint columns are GIVEN.
+This keeps relationships consistent with the entities they connect — if a
+relationship's ``source_id`` doesn't match the source entity's ``id_column``,
+the resulting SPARQL graph cannot join.
+
+The loop semantics mirror :mod:`.entity`:
+
+* Same default budget (12).
+* Same 3-second inter-iteration delay.
+* Same MLflow trace decorator.
+* No single-shot fallback (terminate via tool call only).
+* Strict ``property_uri`` match on terminal detection — a submit with the
+  wrong URI is coached via a corrective tool message, not accepted.
+"""
+
+import json
+import time
+from dataclasses import dataclass, field
+from typing import Any, Callable, Dict, List, Optional
+
+import requests
+
+from back.core.logging import get_logger
+from agents.engine_base import (
+    call_serving_endpoint,
+    dispatch_tool,
+    accumulate_usage,
+)
+from agents.tools.context import ToolContext
+from agents.tools.mapping import (
+    MAPPING_TOOL_DEFINITIONS_BY_NAME,
+    MAPPING_TOOL_HANDLERS,
+)
+from agents.tools.planner import (
+    SAMPLE_TABLE_DEF,
+    tool_sample_table,
+)
+from agents.tools.sql import (
+    SQL_TOOL_DEFINITIONS,
+    SQL_TOOL_HANDLERS,
+)
+from agents.tracing import trace_agent
+
+logger = get_logger(__name__)
+
+MAX_ITERATIONS = 12  # default max_iterations of run_relationship_generator
+LLM_TIMEOUT = 180  # seconds; passed as timeout= on every LLM call
+_ITERATION_DELAY_SEC = 3  # sleep before every LLM call except the first
+# See planner._MAX_TOKENS comment — large UNION ALL queries for cross-source
+# relationships can exceed a small ceiling.
+_MAX_TOKENS = 50000  # passed as max_tokens= on every LLM call
+
+_TRACE_NAME = "mapping_pge_relationship_generator"  # trace_name= on LLM/tool calls
+
+
+# =====================================================
+# Tool aggregation
+# =====================================================
+#
+# The RelationshipGenerator only needs:
+#   * execute_sql                 – validate the composed two-column SELECT.
+#   * sample_table                – peek at endpoint columns when the join is
+#                                   ambiguous (rare; usually unnecessary).
+#   * submit_relationship_mapping – TERMINAL.
+#
+# We deliberately exclude:
+#   * get_ontology / get_metadata / get_documents_context — wrong stage.
+#   * column_value_overlap / distinct_count — already locked by the Planner.
+#   * submit_source_model / submit_entity_mapping — wrong stage.
+
+# Single spelling of the terminal tool's name for both lookups below.
+_SUBMIT_TOOL_NAME = "submit_relationship_mapping"
+_SUBMIT_RELATIONSHIP_DEF: dict = MAPPING_TOOL_DEFINITIONS_BY_NAME[_SUBMIT_TOOL_NAME]
+
+# Tool schemas in the order given to the LLM: the shared SQL tools, then
+# sample_table, then the terminal submit tool.
+TOOL_DEFINITIONS: List[dict] = [
+    *SQL_TOOL_DEFINITIONS,
+    SAMPLE_TABLE_DEF,
+    _SUBMIT_RELATIONSHIP_DEF,
+]
+
+# Tool name -> handler; the two explicit entries win over any same-named
+# key in SQL_TOOL_HANDLERS.
+TOOL_HANDLERS: Dict[str, Callable] = dict(SQL_TOOL_HANDLERS)
+TOOL_HANDLERS["sample_table"] = tool_sample_table
+TOOL_HANDLERS[_SUBMIT_TOOL_NAME] = MAPPING_TOOL_HANDLERS[_SUBMIT_TOOL_NAME]
+
+
+# =====================================================
+# Data classes
+# =====================================================
+
+
+@dataclass
+class RelationshipGenStep:
+    """One observable step of the RelationshipGenerator's execution.
+
+    Mirrors :class:`.entity.EntityGenStep` — scoped to the relationship
+    generator so the orchestrator (Sprint 7) can render a per-property
+    timeline in the UI.
+    """
+
+    step_type: str  # "tool_call" | "tool_result" | "output"
+    content: str  # JSON-encoded tool arguments, tool result text, or LLM text
+    tool_name: str = ""  # left empty on "output" steps
+    duration_ms: int = 0  # elapsed milliseconds; left 0 on "tool_call" steps
+
+
+@dataclass
+class RelationshipGenResult:
+    """Outcome of a single RelationshipGenerator invocation.
+
+    ``mapping`` holds the submitted relationship-mapping dict (the same
+    shape the handler appends to ``ctx.relationships``) when ``success`` is
+    True.
+    """
+
+    success: bool  # set True only after a submit matching the requested property_uri
+    mapping: Optional[dict] = None  # stays None on every failure path
+    steps: List[RelationshipGenStep] = field(default_factory=list)  # in execution order
+    iterations: int = 0  # iteration number at return (the limit on budget exhaustion)
+    error: str = ""  # failure reason; left empty on success
+    usage: Dict[str, int] = field(default_factory=dict)  # prompt_tokens / completion_tokens
+
+
+# =====================================================
+# System prompt
+# =====================================================
+#
+# The RELATIONSHIP SQL RULES section is lifted verbatim from the legacy
+# in-house mapping agent (the section starting "SQL RULES FOR
+# RELATIONSHIPS"). To those rules we add the Sprint 5 constraints: the
+# source and target ID columns are GIVEN by the already-produced entity
+# mappings; the LLM may not pick different endpoint columns.
+
+SYSTEM_PROMPT = """\
+You are a senior data engineer. Your job is to map ONE ontology property \
+(relationship) to a single SQL SELECT query against Databricks source \
+table(s), validated against real data via execute_sql, and submitted via \
+submit_relationship_mapping.
+
+YOU WILL BE GIVEN
+• ontology_property: the property to map (uri, label, comment, domain, range).
+• source_entity_mapping / target_entity_mapping: the ALREADY-MAPPED endpoint \
+entities — each with its class_uri, id_column, and the exact SQL it ran. \
+READ BOTH SQLs: they are the source of truth for your endpoint values.
+• source_model_slice: relevant_joins[] {from_ref, to_ref, confidence, \
+overlap_pct, kind} and candidate_tables[] the Planner curated. Prefer \
+high-overlap, high-confidence joins.
+
+THE EDGE MUST CONNECT EXISTING NODES
+An edge row is (source_id, target_id). Each value MUST already exist as a \
+node id in the corresponding entity, or it "dangles" and the mapping is \
+rejected (the evaluator fails any mapping with >5% dangling on either side, \
+unless the Planner predicted a cross-source band). Three traps cause almost \
+all dangling — avoid all three:
+
+TRAP 1 — id_column is an ALIAS FOR A DERIVED EXPRESSION, not a real column.
+Each entity mints its id with ``SELECT <expression> AS <id_column>`` (the \
+id_column is usually just ``ID``). That expression is often a canonical-key \
+normalization, e.g.::
+
+    CONCAT(regexp_extract(EPISODE_ID, '([a-f0-9][a-f0-9-]+-preg-[0-9]+)', 1), '-baby')
+
+There is no ``ID`` column to select. You MUST **reproduce the entity's id \
+EXPRESSION verbatim** (copied from its SQL), applied to your table, for the \
+endpoint. A raw column (a ``*_id`` join key, a trust-local id) will NOT match \
+→ 100% dangling.
+
+TRAP 2 — building from a table only ONE endpoint entity covers.
+The two entities may be sourced from different trusts (compare the FROM \
+tables in each entity's SQL). Their id universes overlap only on the trust(s) \
+BOTH cover. Build the edge from a table present in BOTH entities' FROM lists \
+(the shared-coverage table). Building from a table only the target covers \
+makes every source_id absent from the source → 100% source-dangling (and \
+vice-versa).
+
+TRAP 3 — column-name / alias mismatch on submit.
+Your SELECT MUST alias the two columns exactly ``AS source_id`` and \
+``AS target_id``, and you MUST submit ``source_id_column="source_id"`` and \
+``target_id_column="target_id"``. These name the columns IN YOUR EDGE OUTPUT, \
+NOT the entity's id_column. If they disagree with your SELECT aliases the \
+evaluator reads nothing and every edge dangles.
+
+WORKED EXAMPLE — ``Baby --hasApgarScore--> Apgar Score``
+Baby is sourced from {trust_a.maternity_episode, trust_b.delivery}; Apgar \
+Score from {trust_a.maternity_episode, trust_c.maternity_event}. Shared \
+coverage = trust_a only (Trap 2). Both ids share the canonical pregnancy core \
+with role suffixes (Trap 1). So build from trust_a, reproducing both \
+expressions from one row::
+
+    SELECT CONCAT(regexp_extract(EPISODE_ID, '([a-f0-9][a-f0-9-]+-preg-[0-9]+)', 1), '-baby')  AS source_id,
+           CONCAT(regexp_extract(EPISODE_ID, '([a-f0-9][a-f0-9-]+-preg-[0-9]+)', 1), '-apgar') AS target_id
+    FROM   fiifi_cdm_demo_catalog.trust_a.maternity_episode
+    WHERE  regexp_extract(EPISODE_ID, '([a-f0-9][a-f0-9-]+-preg-[0-9]+)', 1) <> ''
+
+Building from trust_c (Apgar's natural home) would dangle 100% on the Baby \
+side, because Baby has no trust_c rows.
+
+TOOLS
+  • execute_sql                 – validate / probe your SELECT (runs with a \
+small LIMIT; the persisted mapping has none).
+  • sample_table                – peek at real values when a column is \
+ambiguous.
+  • submit_relationship_mapping – TERMINAL. Call EXACTLY ONCE, only after a \
+clean dangling probe (see WORKFLOW step 4).
+
+SQL RULES
+• SELECT exactly two columns: ``<source id expr> AS source_id, <target id \
+expr> AS target_id`` (Trap 1 + Trap 3).
+• Build FROM a table both entities cover (Trap 2). Same-trust FK joins: one \
+table, no join. Cross-source: a UNION ALL of per-source SELECTs (each source \
+that holds both cores), or a JOIN on the shared canonical key.
+• No LIMIT, no ORDER BY. Always full table names (catalog.schema.table).
+
+WORKFLOW
+1. Read BOTH entity SQLs. Extract each entity's id EXPRESSION (the \
+``SELECT <expr> AS <id_column>``) and its set of FROM tables.
+2. Pick a shared-coverage table (Trap 2). Compose the two-column SELECT, \
+setting source_id to the source entity's id EXPRESSION and target_id to the \
+target entity's id EXPRESSION, reproduced verbatim and aliased ``AS \
+source_id`` / ``AS target_id``.
+3. Call execute_sql to confirm the query parses and returns two columns of \
+rows. Read any error and fix it; never submit an un-validated query.
+4. SELF-VERIFY THE VALUES BEFORE SUBMITTING (MANDATORY GATE). Run this probe \
+via execute_sql:
+
+  WITH rel AS (<your two-column SELECT>),
+       src AS (<source entity's SQL, its id aliased AS ID>),
+       tgt AS (<target entity's SQL, its id aliased AS ID>)
+  SELECT
+    (SELECT COUNT(*) FROM rel) AS edges,
+    (SELECT COUNT(*) FROM rel r WHERE NOT EXISTS (SELECT 1 FROM src s WHERE s.ID = r.source_id)) AS dangling_src,
+    (SELECT COUNT(*) FROM rel r WHERE NOT EXISTS (SELECT 1 FROM tgt t WHERE t.ID = r.target_id)) AS dangling_tgt
+
+  You may submit ONLY when ``dangling_src`` AND ``dangling_tgt`` are both 0 \
+(or a tiny fraction of edges). If either is high you hit Trap 1 or Trap 2 — \
+fix the endpoint expression or switch to the shared-coverage table, then \
+re-run this probe. Do NOT submit on an unrun or failing probe.
+5. submit_relationship_mapping EXACTLY ONCE: property_uri, property_name, \
+sql_query (no LIMIT), source_id_column="source_id", target_id_column=\
+"target_id", domain, range_class.
+6. Terminal — emit no free text after submitting.
+
+GENERAL RULES
+• Only ever pass row-returning queries (SELECT / WITH …) to execute_sql.
+• Do not call get_metadata, get_ontology, column_value_overlap, \
+distinct_count, submit_entity_mapping, or submit_source_model — they are \
+not available to you. The slice plus the entity mappings carry everything \
+you need.
+• If a retry_hint is present at the top of the user message, treat it as \
+authoritative — your previous attempt failed for the reason stated; do NOT \
+repeat the same mistake.
+"""
+
+
+# =====================================================
+# Internal helpers
+# =====================================================
+
+
+def _summarise_entity_mapping(em: dict, side: str) -> List[str]:
+    """Render one endpoint entity mapping as a block of prompt lines.
+
+    The block is a ``<SIDE> ENTITY MAPPING`` header followed by the class
+    URI, the ``id_column`` and the ``sql_query`` of the mapping; no other
+    mapping field is rendered.
+
+    The class URI is the first truthy value among the ``ontology_class``,
+    ``class_uri`` and ``class`` keys, or an empty string. A falsy ``em`` is
+    summarised as an empty mapping instead of raising.
+    """
+    mapping = em or {}
+    # Key spellings accepted for the class URI, in priority order.
+    uri_keys = ("ontology_class", "class_uri", "class")
+    uri = next((mapping[key] for key in uri_keys if mapping.get(key)), "")
+    block = [f"{side.upper()} ENTITY MAPPING"]
+    block.append(f"  class_uri: {uri}")
+    block.append(f"  id_column: {mapping.get('id_column', '')}")
+    block.append(f"  sql:       {mapping.get('sql_query', '')}")
+    return block
+
+
+def _format_join(j: dict) -> str:
+    """Format one join entry of the slice as an indented bullet line.
+
+    The line reads ``  - <from_ref> -> <to_ref>  [<kind>]``; a missing field
+    renders as ``?``. ``confidence`` and ``overlap_pct`` are appended after
+    an em dash when present (not None): two decimals when the value converts
+    to float, echoed unchanged otherwise.
+
+    A partially-filled join therefore still renders a usable line.
+    """
+    metrics: List[str] = []
+    for key in ("confidence", "overlap_pct"):
+        raw = j.get(key)
+        if raw is None:
+            continue
+        try:
+            rendered = f"{float(raw):.2f}"
+        except (TypeError, ValueError):
+            # Not numeric: keep whatever the slice carried.
+            rendered = f"{raw}"
+        metrics.append(f"{key}={rendered}")
+    tail = f" — {', '.join(metrics)}" if metrics else ""
+    endpoints = f"{j.get('from_ref', '?')} -> {j.get('to_ref', '?')}"
+    return f"  - {endpoints}  [{j.get('kind', '?')}]{tail}"
+
+
+def _build_user_prompt(
+    ontology_property: dict,
+    source_entity_mapping: dict,
+    target_entity_mapping: dict,
+    source_model_slice: dict,
+    retry_hint: Optional[str] = None,
+) -> str:
+    """Render the per-property user prompt.
+
+    Sections, in order: retry hint (only when given), ontology property,
+    source then target entity-mapping summaries, relevant joins (one line
+    each), candidate tables (raw JSON, only when non-empty), reminders.
+
+    The reminders repeat the system prompt's endpoint rule — reproduce each
+    entity's id EXPRESSION; ``id_column`` is only the alias it is selected
+    under — so the two prompts never give the LLM conflicting instructions.
+    Falsy dict arguments are treated as empty dicts.
+    """
+    parts: List[str] = []
+
+    if retry_hint:
+        parts.append("RETRY HINT (authoritative — your previous attempt FAILED):")
+        parts.append(retry_hint)
+        parts.append(
+            "DO NOT repeat the same endpoint choice. If the hint mentions "
+            "'dangling' or 'canonical id': re-read BOTH entity SQLs, copy "
+            "each entity's id EXPRESSION verbatim, and build from a table "
+            "both entities cover. Run the dangling-edge probe (step 4 of "
+            "WORKFLOW) BEFORE submitting this time.\n"
+        )
+
+    # Tolerate a None property the same way the mappings and slice are.
+    prop = ontology_property or {}
+    prop_uri = prop.get("uri", "")
+    prop_label = prop.get("label") or prop.get("name", "")
+    prop_comment = prop.get("comment", "") or ""
+    prop_domain = prop.get("domain", "") or ""
+    prop_range = prop.get("range", "") or ""
+
+    parts.append("ONTOLOGY PROPERTY")
+    parts.append(f"  uri:     {prop_uri}")
+    parts.append(f"  label:   {prop_label}")
+    if prop_comment:
+        parts.append(f"  comment: {prop_comment}")
+    parts.append(f"  domain:  {prop_domain}")
+    parts.append(f"  range:   {prop_range}")
+
+    parts.append("")
+    parts.extend(_summarise_entity_mapping(source_entity_mapping, side="source"))
+
+    parts.append("")
+    parts.extend(_summarise_entity_mapping(target_entity_mapping, side="target"))
+
+    slice_obj = source_model_slice or {}
+    joins = slice_obj.get("relevant_joins") or []
+    candidates = slice_obj.get("candidate_tables") or []
+
+    parts.append("")
+    parts.append("RELEVANT JOINS")
+    if joins:
+        for j in joins:
+            parts.append(_format_join(j))
+    else:
+        parts.append("  (none surfaced by the Planner — fall back to a single-table SELECT if possible)")
+
+    if candidates:
+        parts.append("")
+        parts.append("CANDIDATE TABLES")
+        parts.append(json.dumps(candidates, indent=2, default=str))
+
+    src_id = (source_entity_mapping or {}).get("id_column", "")
+    tgt_id = (target_entity_mapping or {}).get("id_column", "")
+
+    parts.append("")
+    parts.append("REMINDERS (CRITICAL)")
+    parts.append(
+        "  • The persisted SQL MUST return EXACTLY two columns aliased "
+        "AS source_id and AS target_id."
+    )
+    parts.append(
+        "  • source_id values MUST equal the ids the source entity mints: "
+        f"reproduce, verbatim, the expression its SQL selects AS '{src_id}' "
+        "(a raw column will not match)."
+    )
+    parts.append(
+        "  • target_id values MUST equal the ids the target entity mints: "
+        f"reproduce the expression its SQL selects AS '{tgt_id}' — same rule."
+    )
+    parts.append(
+        "  • Validate with execute_sql, then call submit_relationship_mapping "
+        "exactly once."
+    )
+
+    prompt = "\n".join(parts)
+    logger.debug(
+        "_build_user_prompt for property=%s (%d chars):\n%s",
+        prop_uri,
+        len(prompt),
+        prompt,
+    )
+    return prompt
+
+
+# =====================================================
+# Public entry point
+# =====================================================
+
+
+@trace_agent(name=_TRACE_NAME)
+def run_relationship_generator(
+    host: str,
+    token: str,
+    endpoint_name: str,
+    client: Any,
+    *,
+    ontology_property: dict,
+    source_entity_mapping: dict,
+    target_entity_mapping: dict,
+    source_model_slice: dict,
+    retry_hint: Optional[str] = None,
+    on_step: Optional[Callable[[str, int], None]] = None,
+    max_iterations: int = MAX_ITERATIONS,
+) -> RelationshipGenResult:
+    """Run the RelationshipGenerator agent for a single ontology property.
+
+    The agent composes a two-column SQL SELECT (``source_id`` / ``target_id``)
+    that realises the relationship between the source and target entities
+    using the join-key subgraph in ``source_model_slice``, validates the
+    SQL via ``execute_sql``, and submits the validated mapping via the
+    terminal ``submit_relationship_mapping`` tool.
+
+    Args:
+        host: Databricks workspace URL.
+        token: Bearer token for the serving endpoint.
+        endpoint_name: Foundation Model serving endpoint name.
+        client: Databricks SQL client (must expose ``execute_query(sql)``).
+        ontology_property: Full dict for the SINGLE property to map (uri,
+            label, comment, domain, range).
+        source_entity_mapping: The ALREADY-MAPPED source entity (carries the
+            ``id_column`` the source endpoint must align with).
+        target_entity_mapping: The ALREADY-MAPPED target entity (same).
+        source_model_slice: Filtered SourceModel slice with relevant_joins
+            and optional candidate_tables.
+        retry_hint: Optional one-sentence hint from the orchestrator's
+            previous-attempt evaluation. When present, surfaced at the top
+            of the user prompt.
+        on_step: Optional progress callback ``(msg, pct)`` for UI updates.
+        max_iterations: Upper bound on tool-call iterations (default 12 —
+            same as the EntityGenerator).
+
+    Returns:
+        A :class:`RelationshipGenResult`. ``success`` is True iff a mapping
+        was successfully submitted with the requested ``property_uri``; in
+        that case ``mapping`` holds the submitted dict. On failure, ``error``
+        explains why and ``mapping`` is None.
+    """
+    iteration_limit = max_iterations if max_iterations is not None else MAX_ITERATIONS
+
+    property_uri = (ontology_property or {}).get("uri", "")
+    property_label = (
+        (ontology_property or {}).get("label")
+        or (ontology_property or {}).get("name", "")
+    )
+    n_joins = len(((source_model_slice or {}).get("relevant_joins") or []))
+    n_candidates = len(((source_model_slice or {}).get("candidate_tables") or []))
+
+    logger.info(
+        "===== RELATIONSHIP GENERATOR START ===== endpoint=%s, property=%s (%s), "
+        "joins=%d, candidate_tables=%d, retry_hint=%s, max_iter=%d",
+        endpoint_name,
+        property_label,
+        property_uri,
+        n_joins,
+        n_candidates,
+        "yes" if retry_hint else "no",
+        iteration_limit,
+    )
+
+    ctx = ToolContext(
+        host=host.rstrip("/"),
+        token=token,
+        client=client,
+        # The slice + entity mappings subsume metadata/ontology for this
+        # agent; the unified ToolContext still wants these fields, so we
+        # leave them empty.
+        metadata={},
+        ontology={},
+        documents=[],
+    )
+
+    result = RelationshipGenResult(success=False)
+
+    user_prompt = _build_user_prompt(
+        ontology_property or {},
+        source_entity_mapping or {},
+        target_entity_mapping or {},
+        source_model_slice or {},
+        retry_hint=retry_hint,
+    )
+    messages: List[dict] = [
+        {"role": "system", "content": SYSTEM_PROMPT},
+        {"role": "user", "content": user_prompt},
+    ]
+    logger.info(
+        "RelationshipGenerator conversation initialized: system=%d chars, user=%d chars",
+        len(SYSTEM_PROMPT),
+        len(user_prompt),
+    )
+
+    total_usage: Dict[str, int] = {"prompt_tokens": 0, "completion_tokens": 0}
+
+    def _progress_pct(iteration_idx: int) -> int:
+        ratio = (iteration_idx + 1) / max(iteration_limit, 1)
+        return min(5 + int(ratio * 90), 95)
+
+    def notify(msg: str, *, pct: Optional[int] = None) -> None:
+        actual_pct = pct if pct is not None else 5
+        logger.info("RELATIONSHIP GEN STEP [%d%%] %s", actual_pct, msg)
+        if on_step:
+            on_step(msg, actual_pct)
+
+    notify(f"Generating mapping for {property_label or property_uri}…", pct=1)
+
+    # Snapshot the pre-existing relationship count so we can detect "this
+    # run added a mapping" without relying on absolute counters. Future-proof
+    # for an orchestrator that reuses a ToolContext across calls.
+    pre_run_count = len(ctx.relationships)
+
+    # ------------------------------------------------------------------
+    # Agent loop
+    # ------------------------------------------------------------------
+    for iteration in range(iteration_limit):
+        if iteration > 0:
+            logger.debug(
+                "Iteration %d: waiting %ds before LLM call (rate limit mitigation)",
+                iteration + 1,
+                _ITERATION_DELAY_SEC,
+            )
+            time.sleep(_ITERATION_DELAY_SEC)
+
+        current_iteration = iteration + 1
+        pct = _progress_pct(iteration)
+        logger.info(
+            "----- RelationshipGenerator iteration %d/%d — %d messages, mapping=%s -----",
+            current_iteration,
+            iteration_limit,
+            len(messages),
+            "set" if len(ctx.relationships) > pre_run_count else "unset",
+        )
+        notify(
+            f"Mapping iteration {current_iteration}/{iteration_limit}…",
+            pct=pct,
+        )
+
+        t0 = time.time()
+        try:
+            llm_response = call_serving_endpoint(
+                host,
+                token,
+                endpoint_name,
+                messages,
+                tools=TOOL_DEFINITIONS,
+                max_tokens=_MAX_TOKENS,
+                temperature=0.1,
+                timeout=LLM_TIMEOUT,
+                trace_name=_TRACE_NAME,
+            )
+        except requests.exceptions.HTTPError as exc:
+            status = exc.response.status_code if exc.response is not None else "?"
+            logger.warning(
+                "RelationshipGenerator iteration %d: HTTPError status=%s",
+                current_iteration,
+                status,
+            )
+            logger.debug(
+                "RelationshipGenerator iteration %d: HTTPError body: %.500s",
+                current_iteration,
+                exc.response.text if exc.response is not None else "N/A",
+            )
+            if exc.response is not None and status in (400, 422):
+                result.error = "LLM endpoint does not support function calling"
+                result.iterations = current_iteration
+                result.usage = total_usage
+                logger.error(
+                    "RelationshipGenerator: endpoint refused tools — cannot produce a mapping"
+                )
+                return result
+            result.error = f"LLM request failed: {exc}"
+            result.iterations = current_iteration
+            result.usage = total_usage
+            logger.error(
+                "RelationshipGenerator: LLM request failed at iteration %d: %s",
+                current_iteration,
+                exc,
+            )
+            return result
+        except requests.exceptions.ReadTimeout:
+            result.error = f"LLM request timed out after {LLM_TIMEOUT}s"
+            result.iterations = current_iteration
+            result.usage = total_usage
+            logger.error(
+                "RelationshipGenerator: timeout at iteration %d", current_iteration
+            )
+            return result
+        except requests.exceptions.RequestException as exc:
+            result.error = f"LLM request failed: {exc}"
+            result.iterations = current_iteration
+            result.usage = total_usage
+            logger.error(
+                "RelationshipGenerator: request exception at iteration %d: %s",
+                current_iteration,
+                exc,
+            )
+            return result
+
+        elapsed_ms = int((time.time() - t0) * 1000)
+        logger.info(
+            "RelationshipGenerator iteration %d: LLM responded in %dms",
+            current_iteration,
+            elapsed_ms,
+        )
+
+        accumulate_usage(total_usage, llm_response.get("usage") or {})
+
+        choice = (llm_response.get("choices") or [{}])[0]  # tolerate [] / null
+        finish_reason = choice.get("finish_reason", "?")
+        message = choice.get("message") or {}
+        tool_calls = message.get("tool_calls") or []  # key may be present but null
+        has_content = bool(message.get("content"))
+        logger.info(
+            "RelationshipGenerator iteration %d: finish_reason=%s, tool_calls=%d, has_content=%s",
+            current_iteration,
+            finish_reason,
+            len(tool_calls),
+            has_content,
+        )
+
+        if not tool_calls:
+            # The Generator must terminate via submit_relationship_mapping,
+            # never via free text.
+            content = (message.get("content") or "")[:500]
+            logger.warning(
+                "RelationshipGenerator iteration %d: produced text without submitting mapping — %d chars",
+                current_iteration,
+                len(message.get("content") or ""),
+            )
+            result.steps.append(
+                RelationshipGenStep(
+                    step_type="output",
+                    content=content,
+                    duration_ms=elapsed_ms,
+                )
+            )
+            result.error = "relationship generator produced text without submitting mapping"
+            result.iterations = current_iteration
+            result.usage = total_usage
+            notify(
+                "Relationship generator produced text without submitting mapping.",
+                pct=pct,
+            )
+            return result
+
+        logger.info(
+            "RelationshipGenerator iteration %d: processing %d tool call(s): [%s]",
+            current_iteration,
+            len(tool_calls),
+            ", ".join(
+                (tc.get("function") or {}).get("name", "?") for tc in tool_calls
+            ),
+        )
+        messages.append(message)
+
+        terminal_success = False
+        for tc_idx, tc in enumerate(tool_calls, 1):
+            func = tc.get("function") or {}
+            tool_name = func.get("name", "")
+            raw_args = func.get("arguments", "{}")
+            tool_id = tc.get("id", "")
+
+            try:
+                decoded = json.loads(raw_args or "{}")
+            except (json.JSONDecodeError, TypeError):
+                decoded = None  # unparsable: handled like a non-object below
+            arguments = decoded if isinstance(decoded, dict) else {}
+            logger.info(
+                "RelationshipGenerator iteration %d: calling tool '%s' (%d/%d)",
+                current_iteration,
+                tool_name,
+                tc_idx,
+                len(tool_calls),
+            )
+
+            if tool_name == "submit_relationship_mapping":
+                notify(
+                    f"Submitting mapping for {property_label or property_uri}…",
+                    pct=pct,
+                )
+            elif tool_name == "sample_table":
+                fn = arguments.get("full_name", "?")
+                notify(f"Sampling {fn}…", pct=pct)
+            elif tool_name == "execute_sql":
+                sql_preview = str(arguments.get("sql") or "")[:80]
+                notify(f"Running SQL: {sql_preview}…", pct=pct)
+            else:
+                notify(f"Calling {tool_name}…", pct=pct)
+
+            result.steps.append(
+                RelationshipGenStep(
+                    step_type="tool_call",
+                    content=json.dumps(arguments, default=str)[:500],
+                    tool_name=tool_name,
+                )
+            )
+
+            t1 = time.time()
+            tool_result = dispatch_tool(
+                TOOL_HANDLERS, ctx, tool_name, arguments, trace_name=_TRACE_NAME
+            )
+            tool_ms = int((time.time() - t1) * 1000)
+
+            logger.info(
+                "RelationshipGenerator iteration %d: tool '%s' returned %d chars in %dms",
+                current_iteration,
+                tool_name,
+                len(tool_result),
+                tool_ms,
+            )
+
+            result.steps.append(
+                RelationshipGenStep(
+                    step_type="tool_result",
+                    content=(
+                        (tool_result[:500] + "…")
+                        if len(tool_result) > 500
+                        else tool_result
+                    ),
+                    tool_name=tool_name,
+                    duration_ms=tool_ms,
+                )
+            )
+
+            messages.append(
+                {
+                    "role": "tool",
+                    "tool_call_id": tool_id,
+                    "content": tool_result,
+                }
+            )
+
+            # Detect terminal success: submit_relationship_mapping returned
+            # success=True AND a mapping for THIS property_uri is present in
+            # ctx.relationships. A submit with a mismatched property_uri is
+            # NOT terminal — we coach the LLM via a corrective tool message
+            # and let the loop continue.
+            if tool_name == "submit_relationship_mapping":
+                try:
+                    parsed = json.loads(tool_result)
+                except json.JSONDecodeError:
+                    parsed = {}
+                if isinstance(parsed, dict) and parsed.get("success") is True:
+                    matched = any(
+                        m.get("property") == property_uri
+                        for m in ctx.relationships
+                    )
+                    if matched:
+                        terminal_success = True
+                        logger.info(
+                            "RelationshipGenerator iteration %d: submit_relationship_mapping succeeded — terminating",
+                            current_iteration,
+                        )
+                    else:
+                        submitted_uri = arguments.get("property_uri", "")
+                        mismatch_msg = (
+                            f"submitted property_uri '{submitted_uri}' does "
+                            f"not match requested property_uri "
+                            f"'{property_uri}'; resubmit with "
+                            f"property_uri='{property_uri}'"
+                        )
+                        logger.warning(
+                            "RelationshipGenerator iteration %d: submit_relationship_mapping "
+                            "property_uri mismatch — submitted=%s, requested=%s",
+                            current_iteration,
+                            submitted_uri,
+                            property_uri,
+                        )
+                        corrective_payload = json.dumps(
+                            {"success": False, "error": mismatch_msg}
+                        )
+                        # Replace the recorded tool_result step's content so
+                        # the UI / trace shows the corrective signal.
+                        result.steps[-1] = RelationshipGenStep(
+                            step_type="tool_result",
+                            content=corrective_payload,
+                            tool_name=tool_name,
+                            duration_ms=result.steps[-1].duration_ms,
+                        )
+                        # Replace the tool message on the conversation so
+                        # the LLM sees the corrective payload next turn.
+                        messages[-1] = {
+                            "role": "tool",
+                            "tool_call_id": tool_id,
+                            "content": corrective_payload,
+                        }
+
+        if terminal_success:
+            # Pull the mapping for this property by strict URI match.
+            submitted = next(
+                (
+                    m
+                    for m in reversed(ctx.relationships)
+                    if m.get("property") == property_uri
+                ),
+                None,
+            )
+            if submitted is None:
+                result.error = (
+                    "internal: submit succeeded but mapping not found for property_uri"
+                )
+                result.iterations = current_iteration
+                result.usage = total_usage
+                logger.error(
+                    "===== RELATIONSHIP GENERATOR FAILED ===== %s (property=%s)",
+                    result.error,
+                    property_uri,
+                )
+                return result
+            result.success = True
+            result.mapping = submitted
+            result.iterations = current_iteration
+            result.usage = total_usage
+            logger.info(
+                "===== RELATIONSHIP GENERATOR COMPLETE ===== property=%s, iterations=%d, "
+                "prompt_tokens=%d, completion_tokens=%d",
+                property_uri,
+                result.iterations,
+                total_usage["prompt_tokens"],
+                total_usage["completion_tokens"],
+            )
+            notify(
+                f"Mapping for {property_label or property_uri} complete!", pct=100
+            )
+            return result
+
+    # Budget exhausted without a successful submit.
+    result.iterations = iteration_limit
+    result.usage = total_usage
+    result.error = "relationship generator exhausted iteration budget"
+    logger.error("===== RELATIONSHIP GENERATOR FAILED ===== %s", result.error)
+    notify(result.error, pct=95)
+    return result
diff --git a/src/agents/agent_mapping_pge/planner.py b/src/agents/agent_mapping_pge/planner.py
new file mode 100644
index 00000000..a65babf7
--- /dev/null
+++ b/src/agents/agent_mapping_pge/planner.py
@@ -0,0 +1,738 @@
+"""
+OntoBricks Mapping-PGE Planner Agent.
+
+Sprint 3 of the Planner-Generator-Evaluator (PGE) redesign.
+
+The Planner is a single-invocation agent (no internal retry loop — re-
+invocations come from the orchestrator on Evaluator escalation in Sprint 7).
+It consumes the ontology, table metadata, and any imported domain documents,
+probes the source data via the planner tools (sample_table, column_value_overlap,
+normalized_value_overlap, distinct_count) plus the shared tools (get_metadata,
+get_ontology, get_documents_context, execute_sql), and emits a validated
+:class:`SourceModel` via the ``submit_source_model`` terminal tool.
+
+The loop semantics mirror the prior single-loop mapping agent — same
+``call_serving_endpoint`` + ``dispatch_tool`` ReAct cycle, same 3-second
+inter-iteration delay, same accumulated usage tracking, same MLflow trace
+decorator — with two key differences:
+
+* No fallback to single-shot generation. If the endpoint refuses tools, the
+  Planner returns failure (the Planner *needs* tools — it produces structured
+  output through ``submit_source_model``).
+* Smaller default iteration budget (``MAX_ITERATIONS`` = 50 instead of 60) —
+  the Planner is more focused than the auto-mapping agent.
+"""
+
+import json
+import time
+from dataclasses import dataclass, field
+from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional
+
+import requests
+
+if TYPE_CHECKING:
+    from agents.agent_mapping_pge.contracts import SourceModel
+
+from back.core.logging import get_logger
+from agents.engine_base import (
+    call_serving_endpoint,
+    dispatch_tool,
+    accumulate_usage,
+)
+from agents.tools.context import ToolContext
+from agents.tools.documents import (
+    GET_DOCUMENTS_CONTEXT_DEF,
+    tool_get_documents_context,
+)
+from agents.tools.metadata import (
+    GET_METADATA_DEF,
+    tool_get_metadata,
+)
+from agents.tools.ontology import (
+    ONTOLOGY_TOOL_DEFINITIONS,
+    ONTOLOGY_TOOL_HANDLERS,
+)
+from agents.tools.planner import (
+    PLANNER_TOOL_DEFINITIONS,
+    PLANNER_TOOL_HANDLERS,
+)
+from agents.tools.sql import (
+    SQL_TOOL_DEFINITIONS,
+    SQL_TOOL_HANDLERS,
+)
+from agents.tracing import trace_agent
+
+logger = get_logger(__name__)
+
+MAX_ITERATIONS = 50
+LLM_TIMEOUT = 180
+_ITERATION_DELAY_SEC = 3
+
+# The submit_source_model JSON for a real-world ontology can run several KB
+# (17+ classes × multiple candidates + canonical_ids + join_keys + plan).
+# A small ceiling silently truncates the call (finish_reason=length) and the
+# dataclass validation fails with no clue to the LLM as to why. 50k removes
+# the practical ceiling for any ontology size; you only pay for tokens
+# actually generated, so the cost stays bounded by output complexity.
+_MAX_TOKENS = 50000
+
+_TRACE_NAME = "mapping_pge_planner"
+
+
+# =====================================================
+# Tool aggregation
+# =====================================================
+#
+# The Planner uses every read tool the auto-mapping agent has — ontology,
+# metadata, documents, execute_sql — *plus* the four planner-specific tools.
+# It deliberately does NOT receive ``submit_entity_mapping`` /
+# ``submit_relationship_mapping``: those belong to the Generator (Sprints 4
+# and 5). The Planner's only terminal tool is ``submit_source_model``.
+
+TOOL_DEFINITIONS: List[dict] = (  # passed as ``tools=`` on each LLM request
+    [GET_METADATA_DEF, GET_DOCUMENTS_CONTEXT_DEF]
+    + ONTOLOGY_TOOL_DEFINITIONS
+    + SQL_TOOL_DEFINITIONS
+    + PLANNER_TOOL_DEFINITIONS
+)
+
+TOOL_HANDLERS: Dict[str, Callable] = {  # name -> handler, used by dispatch_tool
+    "get_metadata": tool_get_metadata,
+    "get_documents_context": tool_get_documents_context,
+    **ONTOLOGY_TOOL_HANDLERS,
+    **SQL_TOOL_HANDLERS,
+    **PLANNER_TOOL_HANDLERS,  # later spreads win if a tool name is duplicated
+}
+
+
+# =====================================================
+# Data classes
+# =====================================================
+
+
+@dataclass
+class PlannerStep:
+    """One observable step of the Planner's execution.
+
+    Mirrors :class:`agents.engine_base.AgentStep` but is scoped to the Planner
+    so the orchestrator (Sprint 7) can present a stage-specific timeline in
+    the UI.
+    """
+
+    step_type: str  # tool_call | tool_result | output
+    content: str  # arguments JSON, tool result, or LLM text (capped near 500 chars)
+    tool_name: str = ""  # left empty for "output" steps
+    duration_ms: int = 0  # LLM latency ("output") or tool run time ("tool_result")
+
+
+@dataclass
+class PlannerResult:
+    """Outcome of a single Planner invocation.
+
+    ``source_model`` is populated only when the LLM successfully called
+    ``submit_source_model`` with a structurally-valid payload. ``error`` is
+    the short reason string when ``success`` is ``False``.
+    """
+
+    success: bool  # True only after a successful submit_source_model
+    source_model: Optional["SourceModel"] = None
+    steps: List[PlannerStep] = field(default_factory=list)  # chronological step log
+    iterations: int = 0  # iterations used; the full budget when it was exhausted
+    error: str = ""
+    usage: Dict[str, int] = field(default_factory=dict)  # prompt/completion token totals
+
+
+# =====================================================
+# System prompt
+# =====================================================
+
+SYSTEM_PROMPT = """\
+You are a senior data architect. Your job is to build a SourceModel that \
+bridges a set of source tables to an OWL ontology, so a downstream Generator \
+agent can mechanically emit entity- and relationship-mapping SQL.
+
+TOOLS
+You have these tools available:
+  • get_ontology           – load classes (with attributes) and object \
+properties to be mapped.
+  • get_metadata           – load imported table schemas (full names, \
+columns, types).
+  • get_documents_context  – load any imported domain documents (glossaries, \
+schema docs).
+  • sample_table           – return up to N random rows so you can see \
+actual values, not just column types. Use when a column's role is unclear \
+from its name/type alone.
+  • column_value_overlap   – measure |distinct(from) ∩ distinct(to)| / \
+|distinct(from)| for two bare COLUMNS. Use to VALIDATE a candidate join key \
+with real data — never propose a join_key on the strength of name similarity \
+alone.
+  • normalized_value_overlap – the same overlap metric, but each side is a \
+scalar SQL EXPRESSION. This is how you PROVE a canonical-key normalization: \
+when two tables for the same class have 0% raw overlap, propose a \
+normalization expression per table and confirm overlap_pct > 0 here BEFORE \
+you submit. A still-zero result means your expression is wrong — fix it.
+  • distinct_count         – row / distinct / null counts plus is_unique \
+and is_complete flags. Use to confirm a candidate canonical-ID column is \
+actually unique and complete.
+  • execute_sql            – escape hatch for any check the four tools above \
+do not cover. Use sparingly — prefer the focused tools.
+  • submit_source_model    – TERMINAL. Call exactly once, when the \
+SourceModel is complete and you are ready to hand off to the Generator.
+
+WORKFLOW
+1. Call get_ontology AND get_metadata first to see what needs mapping and \
+what data is available.
+2. Call get_documents_context to pick up any pre-loaded domain documents — \
+they often disambiguate column semantics.
+3. For each table, decide which ontology class(es) it could realise — these \
+become table_roles[].ontology_class_candidates with a confidence and a one- \
+sentence reason.
+4. For each ontology class, decide which column serves as its canonical \
+identifier in each table — record under canonical_ids[]. When you are \
+uncertain, run distinct_count to confirm uniqueness/completeness.
+5. For each pair of tables that should join (intra-source FK or cross-source \
+value match), run column_value_overlap and only record join_keys[] when the \
+realised overlap_pct supports it. Use kind="same_trust_fk" for FK joins and \
+kind="cross_source_value_match" for value-matched joins across sources. \
+For any class mapped to 2+ tables, follow CANONICAL-KEY NORMALIZATION below \
+and PROVE the chosen keys overlap with normalized_value_overlap.
+6. Build mapping_plan.entity_order so that BASE classes come first \
+(i.e. classes that are referenced by other classes through object properties \
+should be mapped before their referencers). Build \
+mapping_plan.relationship_order so that, by the time each relationship is \
+attempted, BOTH its domain and range classes have already appeared in \
+entity_order. List anything you cannot reasonably map under \
+mapping_plan.skip[] with a short reason.
+7. Finally, call submit_source_model exactly once with the full JSON. The \
+call returns success=true when the model is structurally valid; if it \
+returns success=false, fix the indicated problem and call it again.
+
+CANONICAL-KEY NORMALIZATION (CRITICAL — this is the #1 cause of relationship dangling)
+For any class whose canonical_id lists MORE THAN ONE table, run \
+column_value_overlap on a representative column pair to see whether the raw \
+values already share a format:
+
+  • If overlap_pct > 0 → values are in compatible formats. Record bare \
+column names in canonical_column_per_table (e.g. ``"CUSTOMER_ID"``). \
+A UNION across the tables produces a coherent ID universe.
+
+  • If overlap_pct == 0 → DO NOT conclude these are "different" or \
+"source-scoped" entities. When two tables both map to the SAME ontology \
+class, 0% overlap almost always means the SAME real-world key wrapped in \
+DIFFERENT source-local encodings (prefixes, suffixes, embedded sub-IDs). \
+Leaving them disjoint makes every relationship pointing at this class 100% \
+dangle — that is a FAILURE, not an acceptable outcome. You MUST normalize:
+
+    STEP 1 — sample_table BOTH columns and read the raw values. Look for a \
+shared embedded substring across the sources — a stable inner identifier \
+(UUID, account number, ``...-ord-<n>`` core) that appears in every source's \
+value with only the surrounding prefix/suffix differing.
+
+    STEP 2 — write ONE scalar SQL expression PER TABLE that strips the \
+source-specific wrapping and exposes that shared core in an identical form. \
+Prefer extracting the shared core over stripping a single known prefix \
+(extraction is robust to multiple prefixes). When matching a hex/UUID core, \
+ALWAYS anchor the regex with a leading character class so a preceding dash \
+is not captured:
+          ✗ WRONG: regexp_extract(ORDER_REF, '([a-f0-9-]+-ord-[0-9]+)', 1)
+                   → returns "-<uuid>-ord-1" (leading dash) — will NOT match
+          ✓ RIGHT: regexp_extract(ORDER_REF, '([a-f0-9][a-f0-9-]+-ord-[0-9]+)', 1)
+                   → returns "<uuid>-ord-1"
+
+    STEP 3 — for a DERIVED / child key (e.g. an OrderLine, Shipment or Payment \
+that hangs off an order), DO NOT concatenate a suffix onto the RAW prefixed \
+local id — that re-introduces the source prefix and the keys stay disjoint. \
+Extract the shared core FIRST, then append the role suffix, so every source \
+yields the identical synthetic key:
+          ✗ WRONG: source_a "CONCAT(ORDER_REF, '-line')"  (→ SA-<uuid>-ord-1-line)
+                   source_b "line_id"                     (→ SB-LN-SB-<uuid>-ord-1)
+          ✓ RIGHT: source_a "CONCAT(regexp_extract(ORDER_REF, '([a-f0-9][a-f0-9-]+-ord-[0-9]+)', 1), '-line')"
+                   source_b "CONCAT(regexp_extract(line_id, '([a-f0-9][a-f0-9-]+-ord-[0-9]+)', 1), '-line')"
+                   (both → <uuid>-ord-1-line)
+
+    STEP 4 — PROVE IT. Call normalized_value_overlap with your two \
+expressions. It MUST return overlap_pct > 0. If it is still 0, your \
+expressions land in different value spaces — go back to STEP 1 and fix them. \
+Do NOT call submit_source_model with an unverified normalization.
+
+    (If, after sampling, a table genuinely cannot expose the shared core at \
+all, omit that table from canonical_column_per_table and note why — but this \
+is rare; exhaust STEP 1–4 first.)
+
+  • COMPLETENESS — list EVERY covering source. When more than one source table \
+realises the SAME class, include ALL of them in canonical_column_per_table, \
+not just the two you checked overlap on. The same real-world entity is \
+typically present across multiple sources (e.g. source_a AND source_b AND \
+source_c); omitting one drops a large fraction of that entity's real instances \
+and makes every relationship pointing at it partially dangle. During candidate \
+discovery (step 3) actively look for the class across all source schemas before \
+you settle on its canonical_ids.
+
+  • Whatever expression you record, the EntityGenerator drops it verbatim \
+into the SELECT aliased AS ID. Bare column names and SQL expressions are \
+both valid here.
+
+  • Always update format_note to one sentence describing what the canonical \
+key looks like (e.g. ``"<account-uuid>-ord-<ordinal> core extracted from each \
+source's local order id"``). Downstream agents read this.
+
+SOURCEMODEL JSON SCHEMA (these key names are LOAD-BEARING — do not improvise)
+The `model` argument to submit_source_model has exactly this shape:
+
+{
+  "table_roles": [
+    {
+      "table": "<catalog.schema.table>",                       // STRING — required key is "table"
+      "ontology_class_candidates": [
+        {"uri": "<class URI>", "confidence": 0.0, "reason": "<one sentence>"}
+      ]
+    }
+  ],
+  "canonical_ids": [
+    {
+      "ontology_class": "<class URI>",                          // STRING — required key is "ontology_class"
+      // VALUES may be either a bare column name OR a SQL expression that
+      // produces the canonical key for that table. Use a SQL expression
+      // when raw column values across the listed tables are in different
+      // formats (see CANONICAL-KEY NORMALIZATION above).
+      "canonical_column_per_table": {"<catalog.schema.table>": "<column or SQL expression>"},
+      "format_note": "<one-sentence description of the canonical-key format>"
+    }
+  ],
+  "join_keys": [
+    {
+      "from_ref": "<table>.<col>",                              // STRING — required key is "from_ref"
+      "to_ref":   "<table>.<col>",                              // STRING — required key is "to_ref"
+      "confidence": 0.0,
+      "overlap_pct": 0.0,
+      "kind": "same_trust_fk"                                   // or "cross_source_value_match"
+    }
+  ],
+  "mapping_plan": {
+    "entity_order":       ["<class URI>", "..."],
+    "relationship_order": ["<property URI>", "..."],
+    "skip": [
+      {"item": "<class or property URI, or 'all'>", "reason": "<short reason>"}   // required keys: "item", "reason"
+    ]
+  }
+}
+
+Key-name traps to avoid:
+• Use "table" (not "name", "table_name", "uri") in each table_roles[] entry.
+• Use "ontology_class" (not "class", "uri") in each canonical_ids[] entry.
+• Use "from_ref" / "to_ref" (not "from" / "to" / "source" / "target") in each join_keys[] entry.
+• Use "item" (not "uri", "property") in each mapping_plan.skip[] entry.
+
+INVARIANTS (the orchestrator will enforce these)
+• Every URI in entity_order MUST exist in the ontology AND have at least one \
+candidate in table_roles[].ontology_class_candidates.
+• Every URI in relationship_order MUST reference a property whose domain \
+class and range class both appear in entity_order at an EARLIER position.
+• All confidence values are floats in [0.0, 1.0].
+• kind on each join_key is EXACTLY one of: "same_trust_fk", \
+"cross_source_value_match".
+• Call submit_source_model EXACTLY ONCE, at the end. Do not emit a free-text \
+summary afterwards — submit_source_model is the terminal step.
+
+GENERAL RULES
+• Prefer the focused tools (sample_table, column_value_overlap, \
+normalized_value_overlap, distinct_count) over execute_sql.
+• Validate candidate join keys with column_value_overlap before adding them \
+to join_keys[].
+• You may batch multiple independent tool calls in a single response.
+• Only ever pass row-returning queries (SELECT / WITH …) to execute_sql.
+"""
+
+
+# =====================================================
+# Internal helpers
+# =====================================================
+
+
+def _build_user_prompt(
+    entities: List[dict], relationships: List[dict], n_tables: int
+) -> str:
+    """Compose the opening user message: scope counts plus in-scope names."""
+    sections = [
+        f"Build a SourceModel for {n_tables} table(s), {len(entities)} ontology "
+        f"entity/entities, and {len(relationships)} relationship(s). "
+        "Start by calling get_ontology, get_metadata, and get_documents_context."
+    ]
+    # One extra line per non-empty scope list; a missing "name" shows as "?".
+    for heading, items in (
+        ("Entities in scope", entities),
+        ("Relationships in scope", relationships),
+    ):
+        if items:
+            sections.append(f"{heading}: " + ", ".join(i.get("name", "?") for i in items))
+    user_prompt = "\n".join(sections)
+    logger.debug("_build_user_prompt (%d chars):\n%s", len(user_prompt), user_prompt)
+    return user_prompt
+
+
+# =====================================================
+# Public entry point
+# =====================================================
+
+
+@trace_agent(name=_TRACE_NAME)
+def run_planner(
+    host: str,
+    token: str,
+    endpoint_name: str,
+    client: Any,
+    metadata: dict,
+    ontology: dict,
+    *,
+    documents: Optional[list] = None,
+    on_step: Optional[Callable[[str, int], None]] = None,
+    max_iterations: int = MAX_ITERATIONS,
+) -> PlannerResult:
+    """Run the Planner agent.
+
+    The Planner autonomously produces a :class:`SourceModel` by exploring the
+    ontology, metadata, documents, and source data via tool calls. It
+    terminates as soon as it submits a structurally-valid SourceModel via the
+    terminal ``submit_source_model`` tool.
+
+    Args:
+        host: Databricks workspace URL.
+        token: Bearer token for the serving endpoint.
+        endpoint_name: Foundation Model serving endpoint name.
+        client: Databricks SQL client (must expose ``execute_query(sql)``).
+        metadata: Imported domain metadata (``{"tables": [...]}``).
+        ontology: Imported ontology (``{"entities": [...], "relationships": [...]}``).
+        documents: Optional pre-loaded domain documents.
+        on_step: Optional progress callback ``(msg, pct)`` for UI updates.
+        max_iterations: Upper bound on iterations (default ``MAX_ITERATIONS``).
+
+    Returns:
+        A :class:`PlannerResult`. ``success`` is True iff a SourceModel was
+        successfully submitted; in that case ``source_model`` holds the
+        validated dataclass. On failure, ``error`` explains why and
+        ``source_model`` is None.
+    """
+    iteration_limit = max_iterations if max_iterations is not None else MAX_ITERATIONS
+
+    entities = (ontology or {}).get("entities", [])
+    relationships = (ontology or {}).get("relationships", [])
+    n_tables = len((metadata or {}).get("tables", []))
+
+    logger.info(
+        "===== PLANNER START ===== endpoint=%s, tables=%d, entities=%d, relationships=%d, max_iter=%d",
+        endpoint_name,
+        n_tables,
+        len(entities),
+        len(relationships),
+        iteration_limit,
+    )
+
+    ctx = ToolContext(
+        host=host.rstrip("/"),
+        token=token,
+        client=client,
+        metadata=metadata or {},
+        ontology=ontology or {},
+        documents=list(documents or []),
+    )
+
+    result = PlannerResult(success=False)
+
+    user_prompt = _build_user_prompt(entities, relationships, n_tables)
+    messages: List[dict] = [
+        {"role": "system", "content": SYSTEM_PROMPT},
+        {"role": "user", "content": user_prompt},
+    ]
+    logger.info(
+        "Planner conversation initialized: system=%d chars, user=%d chars",
+        len(SYSTEM_PROMPT),
+        len(user_prompt),
+    )
+
+    total_usage: Dict[str, int] = {"prompt_tokens": 0, "completion_tokens": 0}
+
+    def _progress_pct(iteration_idx: int) -> int:
+        # Linear ramp from 5 → 95 across the iteration budget. The terminal
+        # submit_source_model call is what sets 100.
+        ratio = (iteration_idx + 1) / max(iteration_limit, 1)
+        return min(5 + int(ratio * 90), 95)
+
+    def notify(msg: str, *, pct: Optional[int] = None):
+        actual_pct = pct if pct is not None else 5
+        logger.info("PLANNER STEP [%d%%] %s", actual_pct, msg)
+        if on_step:
+            on_step(msg, actual_pct)
+
+    notify("Starting planner…", pct=1)
+
+    # ------------------------------------------------------------------
+    # Agent loop
+    # ------------------------------------------------------------------
+    for iteration in range(iteration_limit):
+        # Rate-limit mitigation — same 3s delay as the legacy mapping agent.
+        if iteration > 0:
+            logger.debug(
+                "Iteration %d: waiting %ds before LLM call (rate limit mitigation)",
+                iteration + 1,
+                _ITERATION_DELAY_SEC,
+            )
+            time.sleep(_ITERATION_DELAY_SEC)
+
+        current_iteration = iteration + 1
+        pct = _progress_pct(iteration)
+        logger.info(
+            "----- Planner iteration %d/%d — %d messages, source_model=%s -----",
+            current_iteration,
+            iteration_limit,
+            len(messages),
+            "set" if ctx.source_model is not None else "unset",
+        )
+        notify(f"Planning iteration {current_iteration}/{iteration_limit}…", pct=pct)
+
+        t0 = time.time()
+        try:
+            llm_response = call_serving_endpoint(
+                host,
+                token,
+                endpoint_name,
+                messages,
+                tools=TOOL_DEFINITIONS,
+                max_tokens=_MAX_TOKENS,
+                temperature=0.1,
+                timeout=LLM_TIMEOUT,
+                trace_name=_TRACE_NAME,
+            )
+        except requests.exceptions.HTTPError as exc:
+            status = exc.response.status_code if exc.response is not None else "?"
+            logger.warning(
+                "Planner iteration %d: HTTPError status=%s", current_iteration, status
+            )
+            logger.debug(
+                "Planner iteration %d: HTTPError body: %.500s",
+                current_iteration,
+                exc.response.text if exc.response is not None else "N/A",
+            )
+            # Tools are non-negotiable for the Planner — no single-shot fallback.
+            if exc.response is not None and status in (400, 422):
+                result.error = "LLM endpoint does not support function calling"
+                result.iterations = current_iteration
+                result.usage = total_usage
+                logger.error(
+                    "Planner: endpoint refused tools — cannot produce a SourceModel"
+                )
+                return result
+            result.error = f"LLM request failed: {exc}"
+            result.iterations = current_iteration
+            result.usage = total_usage
+            logger.error(
+                "Planner: LLM request failed at iteration %d: %s",
+                current_iteration,
+                exc,
+            )
+            return result
+        except requests.exceptions.ReadTimeout:
+            result.error = f"LLM request timed out after {LLM_TIMEOUT}s"
+            result.iterations = current_iteration
+            result.usage = total_usage
+            logger.error("Planner: timeout at iteration %d", current_iteration)
+            return result
+        except requests.exceptions.RequestException as exc:
+            result.error = f"LLM request failed: {exc}"
+            result.iterations = current_iteration
+            result.usage = total_usage
+            logger.error(
+                "Planner: request exception at iteration %d: %s",
+                current_iteration,
+                exc,
+            )
+            return result
+
+        elapsed_ms = int((time.time() - t0) * 1000)
+        logger.info(
+            "Planner iteration %d: LLM responded in %dms", current_iteration, elapsed_ms
+        )
+
+        accumulate_usage(total_usage, llm_response.get("usage", {}))
+
+        choice = (llm_response.get("choices") or [{}])[0]  # tolerate [] / null
+        finish_reason = choice.get("finish_reason", "?")
+        message = choice.get("message") or {}
+        tool_calls = message.get("tool_calls") or []  # endpoints may send null
+        has_content = bool(message.get("content"))
+        logger.info(
+            "Planner iteration %d: finish_reason=%s, tool_calls=%d, has_content=%s",
+            current_iteration,
+            finish_reason,
+            len(tool_calls),
+            has_content,
+        )
+        # A tool call truncated by the max_tokens ceiling produces malformed
+        # arguments and the tool can't recover. Flag it loudly so future runs
+        # don't silently waste iterations resubmitting the same broken JSON.
+        if finish_reason == "length" and tool_calls:
+            logger.error(
+                "Planner iteration %d: finish_reason=length on a tool call — "
+                "arguments were likely truncated. Consider bumping max_tokens.",
+                current_iteration,
+            )
+
+        if not tool_calls:
+            # The Planner must end with submit_source_model, not free text.
+            # If we see text without a terminal call, that's a failure.
+            content = (message.get("content") or "")[:500]
+            logger.warning(
+                "Planner iteration %d: produced text without submitting source model — %d chars",
+                current_iteration,
+                len(message.get("content") or ""),
+            )
+            result.steps.append(
+                PlannerStep(
+                    step_type="output",
+                    content=content,
+                    duration_ms=elapsed_ms,
+                )
+            )
+            result.error = "planner produced text without submitting source model"
+            result.iterations = current_iteration
+            result.usage = total_usage
+            notify("Planner produced text without submitting source model.", pct=pct)
+            return result
+
+        # Tool-call branch — dispatch each call and accumulate steps.
+        logger.info(
+            "Planner iteration %d: processing %d tool call(s): [%s]",
+            current_iteration,
+            len(tool_calls),
+            ", ".join(
+                tc.get("function", {}).get("name", "?") for tc in tool_calls
+            ),
+        )
+        messages.append(message)
+
+        terminal_success = False
+        for tc_idx, tc in enumerate(tool_calls, 1):
+            func = tc.get("function", {})
+            tool_name = func.get("name", "")
+            raw_args = func.get("arguments", "{}")
+            tool_id = tc.get("id", "")
+
+            # Arguments may be null, truncated, or non-object JSON — fall back to {}.
+            try:
+                decoded = json.loads(raw_args or "{}")
+            except (TypeError, json.JSONDecodeError):
+                decoded = None
+            arguments = decoded if isinstance(decoded, dict) else {}
+
+            logger.info(
+                "Planner iteration %d: calling tool '%s' (%d/%d)",
+                current_iteration,
+                tool_name,
+                tc_idx,
+                len(tool_calls),
+            )
+
+            # Per-tool progress messages — same pattern as the legacy mapping agent.
+            if tool_name == "submit_source_model":
+                notify("Submitting source model…", pct=pct)
+            elif tool_name == "get_metadata":
+                notify("Retrieving table metadata…", pct=pct)
+            elif tool_name == "get_ontology":
+                notify("Retrieving ontology…", pct=pct)
+            elif tool_name == "get_documents_context":
+                notify("Retrieving documents…", pct=pct)
+            elif tool_name == "sample_table":
+                fn = arguments.get("full_name", "?")
+                notify(f"Sampling {fn}…", pct=pct)
+            elif tool_name == "column_value_overlap":
+                notify("Checking column overlap…", pct=pct)
+            elif tool_name == "normalized_value_overlap":
+                notify("Verifying canonical-key normalization…", pct=pct)
+            elif tool_name == "distinct_count":
+                notify("Checking distinct count…", pct=pct)
+            elif tool_name == "execute_sql":
+                sql_preview = str(arguments.get("sql") or "")[:80]
+                notify(f"Running SQL: {sql_preview}…", pct=pct)
+            else:
+                notify(f"Calling {tool_name}…", pct=pct)
+
+            result.steps.append(
+                PlannerStep(
+                    step_type="tool_call",
+                    content=json.dumps(arguments, default=str)[:500],
+                    tool_name=tool_name,
+                )
+            )
+
+            t1 = time.time()
+            tool_result = dispatch_tool(
+                TOOL_HANDLERS, ctx, tool_name, arguments, trace_name=_TRACE_NAME
+            )
+            tool_ms = int((time.time() - t1) * 1000)
+
+            logger.info(
+                "Planner iteration %d: tool '%s' returned %d chars in %dms",
+                current_iteration,
+                tool_name,
+                len(tool_result),
+                tool_ms,
+            )
+
+            result.steps.append(
+                PlannerStep(
+                    step_type="tool_result",
+                    content=(
+                        (tool_result[:500] + "…")
+                        if len(tool_result) > 500
+                        else tool_result
+                    ),
+                    tool_name=tool_name,
+                    duration_ms=tool_ms,
+                )
+            )
+
+            messages.append(
+                {
+                    "role": "tool",
+                    "tool_call_id": tool_id,
+                    "content": tool_result,
+                }
+            )
+
+            # Terminal success: submit_source_model returned success=True *and*
+            # stamped a SourceModel on the context. Checked after this loop, so
+            # every tool call in the response still gets its result message.
+            if tool_name == "submit_source_model":
+                try:
+                    parsed = json.loads(tool_result)
+                except json.JSONDecodeError:
+                    parsed = {}
+                if parsed.get("success") is True and ctx.source_model is not None:
+                    terminal_success = True
+                    logger.info(
+                        "Planner iteration %d: submit_source_model succeeded — terminating",
+                        current_iteration,
+                    )
+
+        if terminal_success:
+            result.success = True
+            result.source_model = ctx.source_model
+            result.iterations = current_iteration
+            result.usage = total_usage
+            logger.info(
+                "===== PLANNER COMPLETE ===== iterations=%d, "
+                "prompt_tokens=%d, completion_tokens=%d",
+                result.iterations,
+                total_usage["prompt_tokens"],
+                total_usage["completion_tokens"],
+            )
+            notify("Planner completed!", pct=100)
+            return result
+
+    # Exhausted the iteration budget without ever calling submit_source_model
+    # successfully (or the LLM kept calling other tools forever).
+    result.iterations = iteration_limit
+    result.usage = total_usage
+    result.error = "planner exhausted iteration budget without submitting source model"
+    logger.error("===== PLANNER FAILED ===== %s", result.error)
+    notify(result.error, pct=95)
+    return result
diff --git a/src/agents/agent_owl_generator/engine.py b/src/agents/agent_owl_generator/engine.py
index 9d755d48..c16f4ad1 100644
--- a/src/agents/agent_owl_generator/engine.py
+++ b/src/agents/agent_owl_generator/engine.py
@@ -37,6 +37,18 @@
 
 MAX_ITERATIONS = 10
 LLM_TIMEOUT = 180
+# Exhaustive per-class datatype-property coverage (see # ATTRIBUTE COVERAGE in
+# the system prompt) makes the Turtle output large — a large domain ontology
+# with dozens of classes and 50+ datatype properties runs well past the old 4096
+# ceiling, which silently truncated the final statement and broke parsing.
+# Claude Opus supports large completions; 16k tokens fits an exhaustive
+# domain ontology with headroom.
+MAX_OUTPUT_TOKENS = 16000
+
+# Bounded PGE retry cap for the Evaluator stage (§3.5): how many times the
+# deterministic Stage-1 ontology checks may feed retry_hints back into
+# generation before owl delivery proceeds regardless.
+MAX_OWL_EVAL_ROUNDS = 2
 
 _TRACE_NAME = "owl_generator"
 
@@ -95,9 +107,12 @@ def _load_pitfall_rules() -> str:
 
 # WORKFLOW
 1. Call get_metadata to understand the database schema.
-2. Call list_documents to discover available documents.
-3. Read relevant documents with read_document.
-4. Output ONLY the final Turtle ontology as plain text (starting with @prefix).
+2. Call get_table_detail on EVERY table you intend to map a class to — get_metadata
+   truncates wide tables at 80 columns, and you must see the FULL column list to give
+   each class exhaustive attribute coverage (see # ATTRIBUTE COVERAGE).
+3. Call list_documents to discover available documents.
+4. Read relevant documents with read_document.
+5. Output ONLY the final Turtle ontology as plain text (starting with @prefix).
 
 # NAMING RULES (CRITICAL – NO EXCEPTIONS)
 • Classes: PascalCase (Customer, SalesOrder)
@@ -127,6 +142,34 @@ def _load_pitfall_rules() -> str:
 • For EVERY DatatypeProperty you MUST declare rdfs:domain on the property itself
   (do not rely on owl:Restriction alone — the platform reads attributes from rdfs:domain)
 
+# ATTRIBUTE COVERAGE (CRITICAL — exhaustive, NOT curated)
+The downstream mapping pipeline can only bind a SQL column to a class when that
+column has a matching owl:DatatypeProperty with rdfs:domain on the class. A class
+with few datatype properties produces an ID+Label-only entity that is USELESS for
+analytics. So model attributes EXHAUSTIVELY, not minimally:
+• For EVERY class, emit a DatatypeProperty for EVERY meaningful source column that
+  describes an instance of that class — across ALL tables that realise the class.
+  A single class is often realised by several source tables (e.g. one per source
+  system, region, or tenant) that each hold the same real-world entity in a local
+  schema; UNION their columns mentally and cover the full set. Use get_table_detail
+  on each covering table to see every column.
+• "Meaningful" = a genuine attribute of the entity: dates, measurements, codes,
+  scores, names, statuses, flags, free-text notes. EXCLUDE ONLY: surrogate/auto-
+  increment row keys with no analytical value, audit columns (created_at, updated_by,
+  etl_*, _ingest_*), and the foreign-key columns that ObjectProperty relationships
+  already carry.
+• When two sources expose the SAME attribute under different column names
+  (e.g. total_amount vs TOTAL_AMT; status vs STATUS_CODE), emit ONE datatype
+  property — do NOT emit a per-source duplicate. The mapping layer reconciles the
+  source columns.
+• Name datatype properties in lowerCamelCase derived from business meaning
+  (order_date → orderDate, TOTAL_AMT → totalAmount).
+  Use ONLY [a-z][A-Za-z0-9]* — never underscores, hyphens, or backslash escapes.
+• The "at least 2 datatype properties" floor in the guidelines is a MINIMUM, not a
+  target. Rich, real-world entities (a transaction, an encounter, an event, a core
+  business object) typically warrant 6–11 datatype properties. Aim for full column
+  coverage, not a tidy subset.
+
 # RELATIONSHIP RULES
 • NEVER create bidirectional relationships.
 • Between any two classes A and B create at most ONE ObjectProperty.
@@ -160,10 +203,12 @@ def _load_pitfall_rules() -> str:
 
 ## 2. Class and property design rules
 For each **class** you create:[1][2][3][4]
-1. Provide:  
-   - A short, clear natural-language definition (1–2 sentences).  
-   - At least 1 object property (unless the class is explicitly abstract).  
-   - At least 2 datatype properties, when meaningful in the domain.  
+1. Provide:
+   - A short, clear natural-language definition (1–2 sentences).
+   - At least 1 object property (unless the class is explicitly abstract).
+   - Datatype properties covering EVERY meaningful source column for the class
+     (see "# ATTRIBUTE COVERAGE" in the system prompt — exhaustive, not curated;
+     2 is a floor, full column coverage is the goal).
 2. Naming conventions:  
    - Classes: UpperCamelCase (e.g., `CustomerOrder`).  
    - Object properties: lowerCamelCase verbs or verb-like phrases (e.g., `placesOrder`).  
@@ -241,6 +286,80 @@ def _parse_pitfall_tool_result(tool_result_json: str) -> Optional[Dict]:
         return None
 
 
+# Stage-1 absolute (Tier-1) ontology checks that make the Evaluator force a
+# retry (the same four keys gates.py lists as Tier-1 ontology METRIC_SPECS).
+# Coverage ratios are computed and logged but are only advisory here (Tier-2
+# in the scorecard), so they never trigger a regeneration on their own.
+_EVAL_ABSOLUTE_CHECKS = (
+    "orphan_class_count",
+    "dangling_domain_range_count",
+    "naming_violation_count",
+    "duplicate_class_count",
+)
+
+
+def _evaluate_ontology_stage(
+    turtle_text: str, metadata: dict, iteration: int
+) -> Optional[str]:
+    """Run the Stage-1 deterministic ontology checks (§3.2) on *turtle_text*.
+
+    Parses the Turtle into the registry shape, runs the shared intrinsic
+    checks, and returns a concrete ``retry_hint`` feedback string when any
+    Tier-1 absolute defect (orphan class, dangling domain/range, naming
+    violation, duplicate class) is present — turning owl-gen into a real
+    PGE loop.  Returns ``None`` when the ontology is structurally clean.
+
+    Fails open: any parse/dep error returns ``None`` so a check failure
+    never blocks OWL delivery (mirrors the pitfall-tool check).
+    """
+    try:
+        from back.core.w3c.owl.OntologyParser import OntologyParser
+        from back.objects.ontology.Ontology import Ontology
+        from agents.pge_eval.ontology_metrics import evaluate_ontology
+
+        # The model sometimes prepends a prose sentence or wraps the Turtle in
+        # a markdown fence; strip that the same way the downstream registry
+        # does, so the Evaluator parses real output instead of skipping.
+        turtle_text = Ontology.clean_owl_output(turtle_text)
+        parser = OntologyParser(turtle_text)
+        ontology = {
+            "classes": parser.get_classes(),
+            "properties": parser.get_properties(),
+        }
+        metrics, issues, _footprint = evaluate_ontology(ontology, metadata or {})
+        logger.info(
+            "Iteration %d: ontology evaluator — metrics=%s",
+            iteration,
+            metrics,
+        )
+
+        absolute_issues = [
+            i for i in issues if i.get("check") in _EVAL_ABSOLUTE_CHECKS
+        ]
+        if not absolute_issues:
+            logger.info(
+                "Iteration %d: ontology evaluator — no Tier-1 defects", iteration
+            )
+            return None
+
+        lines = [
+            "The ontology you produced has structural defects. Fix ALL of them "
+            "and output ONLY the corrected Turtle (no markdown, no comments, "
+            "starting with @prefix declarations):\n"
+        ]
+        # Cap feedback; a hint-less issue falls back to its check name (no KeyError).
+        for issue in absolute_issues[:12]:
+            lines.append(f"  • {issue.get('hint') or issue['check']}")
+        return "\n".join(lines)
+    except Exception as exc:  # noqa: BLE001
+        logger.warning(
+            "Iteration %d: ontology evaluator skipped due to error: %s",
+            iteration,
+            exc,
+        )
+        return None
+
+
 def _build_user_prompt(
     guidelines: str,
     options: dict,
@@ -443,6 +562,7 @@ def notify(msg: str):
     # ------------------------------------------------------------------
     tools_supported = True
     _owl_fix_rounds = 0   # pitfall-fix rounds consumed so far
+    _owl_eval_rounds = 0  # Evaluator (Stage-1 PGE) retry rounds consumed
 
     for iteration in range(MAX_ITERATIONS):
         logger.info(
@@ -477,7 +597,7 @@ def notify(msg: str):
                 endpoint_name,
                 messages,
                 tools=send_tools,
-                max_tokens=4096,
+                max_tokens=MAX_OUTPUT_TOKENS,
                 temperature=0.1,
                 timeout=LLM_TIMEOUT,
                 trace_name=_TRACE_NAME,
@@ -509,7 +629,7 @@ def notify(msg: str):
                         endpoint_name,
                         messages,
                         tools=None,
-                        max_tokens=4096,
+                        max_tokens=MAX_OUTPUT_TOKENS,
                         temperature=0.1,
                         timeout=LLM_TIMEOUT,
                         trace_name=_TRACE_NAME,
@@ -749,6 +869,41 @@ def notify(msg: str):
                         _owl_fix_rounds, max_fix_rounds,
                     )
 
+            # --------------------------------------------------------------
+            # Evaluator stage (PGE loop) — after the pitfall-tool loop is
+            # clean/maxed, run the Stage-1 deterministic ontology checks (§3.2).
+            # On a Tier-1 structural defect, feed concrete retry_hints back to
+            # the generator, bounded by MAX_OWL_EVAL_ROUNDS. Only retry when
+            # there's another iteration left, so a usable ontology is never
+            # discarded by exhausting MAX_ITERATIONS.
+            # --------------------------------------------------------------
+            eval_feedback = _evaluate_ontology_stage(content, ctx.metadata, iteration + 1)
+            if (
+                eval_feedback
+                and _owl_eval_rounds < MAX_OWL_EVAL_ROUNDS
+                and iteration < MAX_ITERATIONS - 1
+            ):
+                _owl_eval_rounds += 1
+                notify(
+                    f"Ontology defects found — eval round "
+                    f"{_owl_eval_rounds}/{MAX_OWL_EVAL_ROUNDS}…"
+                )
+                result.steps.append(
+                    AgentStep(
+                        step_type="evaluator",
+                        content=eval_feedback[:200],
+                        duration_ms=0,
+                    )
+                )
+                messages.append({"role": "assistant", "content": content})
+                messages.append({"role": "user", "content": eval_feedback})
+                logger.info(
+                    "Iteration %d: ontology evaluator found defects — eval round %d",
+                    iteration + 1,
+                    _owl_eval_rounds,
+                )
+                continue   # next iteration will produce corrected OWL
+
             # ── Accept this text as the final OWL ────────────────────────────
             result.success = True
             result.owl_content = content
diff --git a/src/agents/pge_eval/__init__.py b/src/agents/pge_eval/__init__.py
new file mode 100644
index 00000000..bd997701
--- /dev/null
+++ b/src/agents/pge_eval/__init__.py
@@ -0,0 +1,18 @@
+"""OntoBricks PGE intrinsic-evaluation toolkit.
+
+A usecase-agnostic, gold-free scorecard for the PGE pipeline (ontology
+generation + mapping generation).  Intrinsic structural/self-consistency
+metrics + an advisory LLM-judge — never a stored reference answer (D1).
+
+Public surface:
+
+* :func:`agents.pge_eval.scorecard.score_artifact` — the offline-testable
+  scoring core (D6).
+* :func:`agents.pge_eval.ontology_metrics.evaluate_ontology` — Stage-1
+  deterministic ontology checks, shared with the owl-generator Evaluator
+  stage (§3.5).
+"""
+
+from agents.pge_eval.scorecard import score_artifact  # noqa: F401
+
+__all__ = ["score_artifact"]
diff --git a/src/agents/pge_eval/baseline.py b/src/agents/pge_eval/baseline.py
new file mode 100644
index 00000000..a979acbe
--- /dev/null
+++ b/src/agents/pge_eval/baseline.py
@@ -0,0 +1,68 @@
+"""Tier-3 self-baseline storage (§3.4).
+
+Each scored run is persisted under ``logs/goals/``.  The baseline for the
+next run is the pipeline's own *most recent accepted* (GREEN) scorecard —
+never a domain answer key.  This is how "did it get worse" is detected
+without gold labels.
+"""
+
+from __future__ import annotations
+
+import glob
+import json
+import os
+from typing import Any, Dict, List, Optional
+
+DEFAULT_BASELINE_DIR = "logs/goals"
+_PREFIX = "scorecard_"
+
+
+def _sort_key(card: Dict[str, Any]) -> Any:
+    return tuple(card.get(field) or "" for field in ("timestamp", "run_id"))
+
+
+def save_scorecard(
+    scorecard: Dict[str, Any], baseline_dir: str = DEFAULT_BASELINE_DIR
+) -> str:
+    """Write *scorecard* as JSON under *baseline_dir* and return the file path."""
+    os.makedirs(baseline_dir, exist_ok=True)
+    raw_id = str(scorecard.get("run_id") or "run")  # "run" when no id is set
+    stem = "".join(c if (c.isalnum() or c in "-_.") else "_" for c in raw_id)
+    target = os.path.join(baseline_dir, f"{_PREFIX}{stem}.json")
+    with open(target, "w") as out:
+        json.dump(scorecard, out, indent=2, default=str)
+    return target
+
+
+def _load_all(baseline_dir: str) -> List[Dict[str, Any]]:
+    cards: List[Dict[str, Any]] = []
+    for p in glob.glob(os.path.join(baseline_dir, f"{_PREFIX}*.json")):
+        try:
+            with open(p) as f:
+                cards.append(json.load(f))
+        except (OSError, ValueError):  # ValueError also covers bad JSON / bad text
+            continue
+    return [c for c in cards if isinstance(c, dict)]  # callers use .get()
+
+
+def load_baseline(
+    baseline_dir: str = DEFAULT_BASELINE_DIR,
+    *,
+    exclude_run_id: Optional[str] = None,
+) -> Optional[Dict[str, Any]]:
+    """Return the most recent accepted (GREEN) scorecard, or ``None``.
+
+    A RED run never becomes a baseline — otherwise a regression would silently
+    reset the bar.  ``exclude_run_id`` drops the current run so a scorecard never
+    baselines against itself; ``None`` excludes nothing (id-less cards stay eligible).
+    """
+    cards = [
+        c
+        for c in _load_all(baseline_dir)
+        if c.get("verdict") == "GREEN"
+        and (exclude_run_id is None or c.get("run_id") != exclude_run_id)
+    ]
+    if not cards:
+        return None
+    cards.sort(key=_sort_key)
+    return cards[-1]
diff --git a/src/agents/pge_eval/gates.py b/src/agents/pge_eval/gates.py
new file mode 100644
index 00000000..826d8b3f
--- /dev/null
+++ b/src/agents/pge_eval/gates.py
@@ -0,0 +1,182 @@
+"""The three gate tiers (§3.4) + metric directionality.
+
+* **Tier 1 — absolute hard gates** (always active).  Integrity / hygiene /
+  executability invariants that hold for any domain; non-zero exit on fail.
+* **Tier 2 — ratio thresholds**.  Warn by default; promotable to hard gates
+  per run via ``--gate-ratios``.  The 0.90 default is a starting heuristic,
+  overridable, never an absolute truth.
+* **Tier 3 — self-baseline regression** (active when a baseline exists).
+  Any Tier-1/Tier-2 metric that drops vs the last accepted baseline beyond
+  its tolerance fails the run, even if still above its absolute bar.
+
+The LLM judge is Tier-exempt — it never appears here.
+
+No domain identifiers, table names, or counts are encoded; every threshold
+is a generic structural bar.
+"""
+
+from __future__ import annotations
+
+from typing import Any, Dict, List, Optional
+
+DEFAULT_RATIO_THRESHOLD = 0.90
+
+# Directionality for the Tier-3 regression gate.
+HIGHER_BETTER = "higher_better"
+LOWER_BETTER = "lower_better"
+
+
+# Each spec: stage, key, tier, direction.  Tier-1 adds ``op``/``bound`` (absolute
+# gate); ``tol`` is the Tier-3 slack; ``conditional`` names a runtime activity flag.
+METRIC_SPECS: List[Dict[str, Any]] = [
+    # ---- Tier-1 absolute (ontology) ----
+    {"stage": "ontology", "key": "orphan_class_count", "tier": 1,
+     "direction": LOWER_BETTER, "op": "==", "bound": 0, "tol": 0},
+    {"stage": "ontology", "key": "dangling_domain_range_count", "tier": 1,
+     "direction": LOWER_BETTER, "op": "==", "bound": 0, "tol": 0},
+    {"stage": "ontology", "key": "naming_violation_count", "tier": 1,
+     "direction": LOWER_BETTER, "op": "==", "bound": 0, "tol": 0},
+    {"stage": "ontology", "key": "duplicate_class_count", "tier": 1,
+     "direction": LOWER_BETTER, "op": "==", "bound": 0, "tol": 0},
+    # ---- Tier-1 absolute (mapping) ----
+    {"stage": "mapping", "key": "dangling_target_pct_max", "tier": 1,
+     "direction": LOWER_BETTER, "op": "<", "bound": 0.05, "tol": 0.01},
+    {"stage": "mapping", "key": "dangling_source_pct_max", "tier": 1,
+     "direction": LOWER_BETTER, "op": "<", "bound": 0.05, "tol": 0.01},
+    {"stage": "mapping", "key": "id_integrity", "tier": 1,
+     "direction": HIGHER_BETTER, "op": "==", "bound": 1.0, "tol": 0.0},
+    {"stage": "mapping", "key": "sql_exec_failures", "tier": 1,
+     "direction": LOWER_BETTER, "op": "==", "bound": 0, "tol": 0},
+    {"stage": "mapping", "key": "cross_source_band_compliance", "tier": 1,
+     "direction": HIGHER_BETTER, "op": "==", "bound": 1.0, "tol": 0.0,
+     "conditional": "band_active"},
+    # ---- Tier-2 ratio ----
+    {"stage": "ontology", "key": "table_footprint_coverage", "tier": 2,
+     "direction": HIGHER_BETTER, "tol": 0.02},
+    {"stage": "ontology", "key": "column_footprint_coverage", "tier": 2,
+     "direction": HIGHER_BETTER, "tol": 0.02},
+    {"stage": "mapping", "key": "entity_completeness", "tier": 2,
+     "direction": HIGHER_BETTER, "tol": 0.02},
+    {"stage": "mapping", "key": "relationship_completeness", "tier": 2,
+     "direction": HIGHER_BETTER, "tol": 0.02},
+    {"stage": "mapping", "key": "attribute_coverage", "tier": 2,
+     "direction": HIGHER_BETTER, "tol": 0.02},
+]
+
+
+def get_metric(stages: Dict[str, Any], stage: str, key: str) -> Any:
+    return ((stages.get(stage) or {}).get("metrics") or {}).get(key)
+
+
+def _abs_pass(op: str, value: float, bound: float) -> bool:
+    """Return whether ``value <op> bound`` holds; a ``None`` value never passes."""
+    if value is None:  # missing metric fails, even before *op* is validated
+        return False
+    if op not in ("==", "<", "<=", ">="):
+        raise ValueError(f"unknown op {op!r}")
+    if op == "==":
+        return value == bound
+    if op == "<":
+        return value < bound
+    # Only "<=" and ">=" remain after the membership check above.
+    return value <= bound if op == "<=" else value >= bound
+
+
+def evaluate_tier1(
+    stages: Dict[str, Any], *, active_conditionals: Optional[Dict[str, bool]] = None
+) -> Dict[str, Any]:
+    """Check each Tier-1 spec's absolute bound and collect the failures.
+
+    A spec with ``conditional`` is skipped unless that flag is truthy in
+    *active_conditionals*.  Returns ``{"passed": bool, "failures": [...]}``.
+    """
+    flags = active_conditionals or {}
+    failures: List[Dict[str, Any]] = []
+    for spec in (s for s in METRIC_SPECS if s["tier"] == 1):
+        gate_flag = spec.get("conditional")
+        if gate_flag and not flags.get(gate_flag, False):
+            continue  # conditional gate not active for this run
+        stage, key, op, bound = (spec[k] for k in ("stage", "key", "op", "bound"))
+        observed = get_metric(stages, stage, key)
+        if not _abs_pass(op, observed, bound):
+            failure = {"metric": f"{stage}.{key}", "observed": observed}
+            failure["expected"] = f"{op} {bound}"
+            failures.append(failure)
+    return {"passed": not failures, "failures": failures}
+
+
+def evaluate_tier2(
+    stages: Dict[str, Any],
+    *,
+    gate_ratios: bool,
+    threshold: float = DEFAULT_RATIO_THRESHOLD,
+) -> Dict[str, Any]:
+    """Warn on every Tier-2 ratio that is missing, NaN, or below *threshold*."""
+    warnings: List[Dict[str, Any]] = []
+    for spec in METRIC_SPECS:
+        if spec["tier"] != 2:
+            continue
+        value = get_metric(stages, spec["stage"], spec["key"])
+        # ``not value >= threshold`` instead of ``value < threshold``: NaN compares
+        # False both ways, so only the negated form flags it (fail closed).
+        if value is None or not value >= threshold:
+            metric = f"{spec['stage']}.{spec['key']}"
+            warnings.append(
+                {"metric": metric, "observed": value, "expected": f">= {threshold}"}
+            )
+    # When --gate-ratios is set, the warnings become hard failures.
+    passed = (not warnings) if gate_ratios else True
+    return {"gated": gate_ratios, "passed": passed, "warnings": warnings}
+
+
+def evaluate_tier3(
+    stages: Dict[str, Any],
+    baseline: Optional[Dict[str, Any]],
+) -> Dict[str, Any]:
+    """Detect Tier-1/Tier-2 metrics that got worse than the baseline scorecard.
+
+    A regression is a move in the wrong direction by more than the spec's
+    ``tol``: a drop for higher-better metrics, a rise for lower-better ones.
+    Without a baseline the gate passes trivially.
+    """
+    if not baseline:
+        return {"baseline_run_id": None, "passed": True, "regressions": []}
+
+    prior_stages = baseline.get("stages", {})
+    regressions: List[Dict[str, Any]] = []
+    for spec in METRIC_SPECS:
+        stage, key = spec["stage"], spec["key"]
+        flag = spec.get("conditional")
+        if flag:
+            # Conditional metrics (e.g. cross-source band compliance) are only
+            # comparable when measured in BOTH runs; otherwise the first real
+            # measurement after an inactive 1.0 would look like a drop.
+            active_now = (stages.get(stage, {}) or {}).get(flag, False)
+            active_then = (prior_stages.get(stage, {}) or {}).get(flag, False)
+            if not active_now or not active_then:
+                continue
+        current = get_metric(stages, stage, key)
+        previous = get_metric(prior_stages, stage, key)
+        if current is None or previous is None:
+            continue  # nothing to compare against
+        slack = spec.get("tol", 0)
+        if spec["direction"] == HIGHER_BETTER:
+            worse = current < previous - slack
+        else:  # LOWER_BETTER
+            worse = current > previous + slack
+        if not worse:
+            continue
+        regressions.append(
+            {
+                "metric": f"{stage}.{key}",
+                "observed": current,
+                "baseline": previous,
+                "direction": spec["direction"],
+                "tolerance": slack,
+            }
+        )
+    return {
+        "baseline_run_id": baseline.get("run_id"),
+        "passed": not regressions,
+        "regressions": regressions,
+    }
diff --git a/src/agents/pge_eval/inapp.py b/src/agents/pge_eval/inapp.py
new file mode 100644
index 00000000..a7bbfc46
--- /dev/null
+++ b/src/agents/pge_eval/inapp.py
@@ -0,0 +1,135 @@
+"""In-app scorecard hooks — run the PGE intrinsic evaluator *inside* the
+Databricks app, right after ontology generation or mapping generation.
+
+These are thin, fail-safe wrappers around
+:func:`agents.pge_eval.scorecard.score_artifact`:
+
+* **Deterministic only** — ``no_judge=True``: no extra LLM/network calls are
+  added to the user-facing generation/mapping latency.
+* **No baseline side-effects** — ``use_baseline=False``: the app server never
+  reads/writes the Tier-3 ``logs/goals`` baseline store (that is a CI/CLI
+  concern). The in-app scorecard is a per-run quality snapshot surfaced to
+  the user.
+* **Never raises** — scoring must never break a generation/mapping run; any
+  failure logs a warning and returns ``None``.
+
+The result is the §3.6 scorecard dict, attached to the background task's
+``result`` so the UI can surface verdict + metrics.
+"""
+
+from __future__ import annotations
+
+from datetime import datetime, timezone
+from typing import Any, Dict, List, Optional
+
+from back.core.logging import get_logger
+
+logger = get_logger(__name__)
+
+
+def _now() -> tuple[str, str]:
+    t = datetime.now(timezone.utc)  # one UTC instant feeds both values
+    return t.strftime("%Y%m%dT%H%M%S_%f"), t.isoformat()  # (run_id, timestamp)
+
+
+def _turtle_to_ontology(owl_content: str) -> Dict[str, Any]:
+    """Turn generated Turtle text into a ``{classes, properties}`` dict.
+
+    The back/ imports are function-local, keeping them out of module import time.
+    """
+    from back.core.w3c.owl.OntologyParser import OntologyParser
+    from back.objects.ontology.Ontology import Ontology
+
+    parsed = OntologyParser(Ontology.clean_owl_output(owl_content or ""))
+    classes = parsed.get_classes()
+    return {"classes": classes, "properties": parsed.get_properties()}
+
+
+def score_generated_ontology(
+    owl_content: str,
+    metadata: Optional[dict],
+) -> Optional[Dict[str, Any]]:
+    """Build an ontology-only artifact from *owl_content* and score it.
+
+    Returns the §3.6 scorecard dict; any exception is logged and yields ``None``.
+    """
+    try:
+        from agents.pge_eval.scorecard import score_artifact
+
+        # Only the ontology and metadata are populated; mapping inputs stay empty.
+        artifact = {
+            "ontology": _turtle_to_ontology(owl_content),
+            "metadata": metadata or {"tables": []},
+            "mapping_run_log": [],
+            "mapping_evaluations": {},
+            "entity_mappings": [],
+        }
+        run_id, ts = _now()
+        scorecard = score_artifact(
+            artifact, no_judge=True, use_baseline=False, mode="live",
+            run_id=run_id, timestamp=ts,
+        )
+        verdict = scorecard["verdict"]
+        onto = scorecard["stages"]["ontology"]["metrics"]
+        logger.info(
+            "in-app ontology scorecard: verdict=%s (orphans=%s, dangling=%s, "
+            "naming=%s, dupes=%s)",
+            verdict,
+            onto["orphan_class_count"],
+            onto["dangling_domain_range_count"],
+            onto["naming_violation_count"],
+            onto["duplicate_class_count"],
+        )
+        return scorecard
+    except Exception as exc:  # noqa: BLE001 — scoring must never break generation
+        logger.warning("in-app ontology scoring failed (ignored): %s", exc)
+        return None
+
+
+def score_mapping_run(
+    *,
+    ontology: dict,
+    metadata: Optional[dict],
+    mapping_run_log: Optional[List[dict]],
+    mapping_evaluations: Optional[Dict[str, dict]],
+    entity_mappings: Optional[List[dict]],
+    relationship_mappings: Optional[List[dict]] = None,
+    usage: Optional[dict] = None,
+) -> Optional[Dict[str, Any]]:
+    """Score a finished mapping run from its ontology, metadata and run outputs.
+
+    Returns the §3.6 scorecard dict; any exception is logged and yields ``None``.
+    """
+    try:
+        from agents.pge_eval.scorecard import score_artifact
+
+        # Falsy (None/empty) inputs are replaced by empty containers so the
+        # artifact always carries every key.
+        artifact = {
+            "ontology": ontology or {},
+            "metadata": metadata or {"tables": []},
+            "mapping_run_log": mapping_run_log or [],
+            "mapping_evaluations": mapping_evaluations or {},
+            "entity_mappings": entity_mappings or [],
+            "relationship_mappings": relationship_mappings or [],
+            "usage": usage or {},
+        }
+        run_id, ts = _now()
+        scorecard = score_artifact(
+            artifact, no_judge=True, use_baseline=False, mode="live",
+            run_id=run_id, timestamp=ts,
+        )
+        mapping_metrics = scorecard["stages"]["mapping"]["metrics"]
+        logger.info(
+            "in-app mapping scorecard: verdict=%s (entity_completeness=%s, "
+            "rel_completeness=%s, id_integrity=%s, sql_failures=%s)",
+            scorecard["verdict"],
+            mapping_metrics["entity_completeness"],
+            mapping_metrics["relationship_completeness"],
+            mapping_metrics["id_integrity"],
+            mapping_metrics["sql_exec_failures"],
+        )
+        return scorecard
+    except Exception as exc:  # noqa: BLE001 — scoring must never break mapping
+        logger.warning("in-app mapping scoring failed (ignored): %s", exc)
+        return None
diff --git a/src/agents/pge_eval/judge.py b/src/agents/pge_eval/judge.py
new file mode 100644
index 00000000..98479319
--- /dev/null
+++ b/src/agents/pge_eval/judge.py
@@ -0,0 +1,138 @@
+"""Advisory LLM-judge (§3.2 / §3.3, D5).
+
+The judge is **advisory only** — it never gates a run (Tier-exempt).  It
+emits a 0–1 score per axis plus flagged issues that inform ``retry_hint``s.
+
+This module is the **only** place the scorer touches the network.  The
+deterministic metrics never import it, and the orchestrator only calls
+:func:`run_judge` when ``--no-judge`` is NOT set — guaranteeing zero network
+traffic in ``--no-judge`` mode.  ``requests``/serving imports are lazy so
+merely importing this module makes no connection either.
+
+The judge is usecase-agnostic: it asks generic coherence questions and is
+handed the *actual* runtime ontology/mapping, never a reference answer.
+"""
+
+from __future__ import annotations
+
+import json
+from typing import Any, Dict, List, Optional
+
+from back.core.logging import get_logger
+
+logger = get_logger(__name__)
+
+_JUDGE_TIMEOUT = 60
+_MAX_TOKENS = 1024
+
+
+def _empty_axis(reason: str = "") -> Dict[str, Any]:
+    """Return a score-less axis whose only flag is *reason* (if non-empty)."""
+    return {"score": None, "flags": [reason] if reason else []}
+
+
+def _ontology_summary(ontology: dict) -> str:
+    """Render a two-line class/object-property digest (first 60 of each)."""
+    from agents.pge_eval.normalize import normalize_ontology
+
+    norm = normalize_ontology(ontology)
+    class_items = [
+        f"{cls['name']}({len(cls.get('data_properties', []))} dp)"
+        for cls in norm.classes
+    ]
+    rel_items = [
+        f"{prop['name']}: {prop.get('domain', '?')}->{prop.get('range', '?')}"
+        for prop in norm.object_properties
+    ]
+    head = f"Classes ({len(class_items)}): {', '.join(class_items[:60])}\n"
+    return head + f"ObjectProperties ({len(rel_items)}): {', '.join(rel_items[:60])}"
+
+
+def _mapping_summary(artifact: dict) -> str:
+    """List the first 80 run-log entries as ``kind: item -> final_status`` lines."""
+    entries = (artifact.get("mapping_run_log") or [])[:80]
+    return "\n".join(
+        f"{entry.get('kind')}: {entry.get('item')} -> {entry.get('final_status')}"
+        for entry in entries
+    )
+
+
+def _parse_axis(text: str) -> Dict[str, Any]:
+    """Pull a ``{"score": float, "flags": [str]}`` object out of LLM text."""
+    try:
+        obj = json.loads(text[text.index("{") : text.rindex("}") + 1])
+        score = obj.get("score")
+        score = float(score) if score is not None else None
+        raw_flags = obj.get("flags") or []
+        # A bare string must stay one flag, not be split into characters.
+        flags = [raw_flags] if isinstance(raw_flags, str) else list(raw_flags)
+        return {"score": score, "flags": [str(f) for f in flags]}
+    except (ValueError, TypeError, json.JSONDecodeError):
+        return _empty_axis("judge response could not be parsed")
+
+
+def _ask(host: str, token: str, endpoint_name: str, system: str, user: str) -> Dict[str, Any]:
+    """Send one system+user exchange to the judge endpoint and parse the axis.
+
+    Never raises: any failure (including the lazy import) becomes an empty axis.
+    """
+    try:
+        # Lazy import: no network dependency unless the judge actually runs.
+        # Kept inside the ``try`` so an import failure also degrades gracefully.
+        from agents.engine_base import call_serving_endpoint, extract_message_content
+
+        chat = [
+            {"role": "system", "content": system},
+            {"role": "user", "content": user},
+        ]
+        resp = call_serving_endpoint(
+            host, token, endpoint_name, chat,
+            tools=None, max_tokens=_MAX_TOKENS, temperature=0.0,
+            timeout=_JUDGE_TIMEOUT, trace_name="pge_eval:judge",
+        )
+        return _parse_axis(extract_message_content(resp))
+    except Exception as exc:  # noqa: BLE001 — advisory, must never crash scoring
+        logger.warning("pge_eval judge call failed (advisory, ignored): %s", exc)
+        return _empty_axis(f"judge unavailable: {exc}")
+
+
+_ONTOLOGY_SYSTEM = (
+    "You are an ontology reviewer. Judge whether the classes and properties "
+    "are coherent and non-redundant for the implied domain. Reply ONLY with a "
+    'JSON object: {"score": <0..1 float>, "flags": ["short issue", ...]}. '
+    "score=1 means fully coherent; flags list concrete redundancy/incoherence "
+    "issues. Do not compare against any reference ontology."
+)
+
+_MAPPING_SYSTEM = (
+    "You are a data-mapping reviewer. Given per-item mapping outcomes, judge "
+    "holistically what the mapping likely missed or got wrong. Reply ONLY with "
+    'a JSON object: {"score": <0..1 float>, "flags": ["short issue", ...]}. '
+    "score=1 means the mapping looks complete and correct."
+)
+
+
+def run_judge(
+    *,
+    host: str,
+    token: str,
+    endpoint_name: str,
+    ontology: dict,
+    artifact: dict,
+    stage1_issues: Optional[List[Dict[str, str]]] = None,
+) -> Dict[str, Dict[str, Any]]:
+    """Run both advisory axes; returns ``{"ontology": {...}, "mapping": {...}}``.
+    Never raises: any failure degrades to an empty axis with a flag."""
+    if not endpoint_name:
+        return {"ontology": _empty_axis("no endpoint"), "mapping": _empty_axis("no endpoint")}
+    try:  # prompt assembly runs project code, so it must not break the contract
+        onto_user = _ontology_summary(ontology)
+        if stage1_issues:
+            onto_user += "\n\nDeterministic issues already found:\n" + "\n".join(
+                f"- {i.get('check')}: {i.get('observed')}" for i in stage1_issues[:20]
+            )
+        map_user = _mapping_summary(artifact)
+    except Exception as exc:  # noqa: BLE001 — advisory, must never crash scoring
+        return {k: _empty_axis(f"judge input error: {exc}") for k in ("ontology", "mapping")}
+    prompts = {"ontology": (_ONTOLOGY_SYSTEM, onto_user), "mapping": (_MAPPING_SYSTEM, map_user)}
+    return {axis: _ask(host, token, endpoint_name, *p) for axis, p in prompts.items()}
diff --git a/src/agents/pge_eval/loaders.py b/src/agents/pge_eval/loaders.py
new file mode 100644
index 00000000..abf10c64
--- /dev/null
+++ b/src/agents/pge_eval/loaders.py
@@ -0,0 +1,130 @@
+"""Domain-agnostic input loaders for the live ``goals_eval.py run`` path.
+
+The score-only path is already usecase-agnostic (it ingests a captured
+artifact). Live ``run`` previously reused the smoke-test loader, which
+hard-pinned a single demo domain (a fixed ``/tmp`` dump + a fixed version key).
+These helpers replace that: they load the ontology + source metadata for ANY
+domain, from either an exported registry version dump or plain ontology/metadata
+JSON files — no domain, table, or version is baked in.
+
+Pure functions (file IO + dict reshaping only) — no LLM, no DB, no domain
+knowledge — so they are unit-testable offline.
+"""
+
+from __future__ import annotations
+
+import json
+from typing import Any, Dict, Optional, Tuple
+
+
+def to_agent_shape(ontology: dict) -> dict:
+    """Convert a registry-shape ontology (``{classes, properties}``) to the
+    ``{entities, relationships}`` shape the mapping-PGE engine consumes.
+
+    Input already in agent shape (has ``entities`` or ``relationships``) is
+    re-wrapped as a new dict holding shallow list copies of just those two keys.
+    """
+    ontology = ontology or {}
+    if "entities" in ontology or "relationships" in ontology:
+        return {
+            "entities": list(ontology.get("entities", []) or []),
+            "relationships": list(ontology.get("relationships", []) or []),
+        }
+
+    classes = ontology.get("classes", []) or []
+    properties = ontology.get("properties", []) or []
+    name_to_uri = {c["name"]: c["uri"] for c in classes if c.get("name") and c.get("uri")}
+
+    def _resolve(ref: str) -> str:
+        if not ref or str(ref).startswith("http"):
+            return ref
+        return name_to_uri.get(ref, ref)
+
+    entities = [
+        {
+            "uri": c.get("uri", ""),
+            "name": c.get("name", ""),
+            "label": c.get("label", ""),
+            "comment": c.get("comment", ""),
+            "parent": c.get("parent", ""),
+            "attributes": list(c.get("dataProperties", []) or []),
+        }
+        for c in classes
+    ]
+    relationships = [
+        {
+            "uri": p.get("uri", ""),
+            "name": p.get("name", ""),
+            "label": p.get("label", p.get("name", "")),
+            "comment": p.get("comment", ""),
+            "domain": _resolve(p.get("domain", "")),
+            "range": _resolve(p.get("range", "")),
+        }
+        for p in properties
+        if p.get("type", "ObjectProperty") == "ObjectProperty"
+    ]
+    return {"entities": entities, "relationships": relationships}
+
+
+def _pick_version(versions: Dict[str, Any], version: Optional[str]) -> str:
+    """Resolve which key of a registry dump's ``versions`` map to load.
+
+    An explicit ``version`` must exist in the map and is returned as-is. With
+    no explicit choice the map must hold exactly one key, which is returned.
+    Anything else raises ``ValueError``; no default version is ever assumed.
+    """
+    if version is None:
+        candidates = list(versions)
+        if len(candidates) != 1:
+            raise ValueError(
+                f"registry dump has {len(candidates)} versions {sorted(candidates)}; "
+                "pass --version to choose one"
+            )
+        return candidates[0]
+    if version in versions:
+        return version
+    raise ValueError(
+        f"version {version!r} not in registry dump; available: "
+        f"{sorted(versions)}"
+    )
+
+
+def load_run_inputs(
+    *,
+    registry_json: Optional[str] = None,
+    version: Optional[str] = None,
+    ontology_path: Optional[str] = None,
+    metadata_path: Optional[str] = None,
+) -> Tuple[dict, dict]:
+    """Resolve ``(ontology_agent_shape, metadata)`` for a live run, domain-agnostic.
+
+    One source is required; ``registry_json`` wins when both are passed:
+
+    * ``registry_json`` (+ optional ``version``) — an exported registry version
+      dump shaped ``{"versions": {<ver>: {"ontology": ..., "metadata": ...}}}``.
+    * ``ontology_path`` (+ optional ``metadata_path``) — plain JSON files holding
+      the ontology (registry or agent shape) and source metadata (read as UTF-8).
+    """
+    if registry_json:
+        with open(registry_json, encoding="utf-8") as f:
+            doc = json.load(f)
+        versions = doc.get("versions") or {}
+        if not versions:
+            raise ValueError(f"{registry_json} has no 'versions' map")
+        ver = _pick_version(versions, version)
+        v = versions[ver]
+        return to_agent_shape(v.get("ontology", {})), (v.get("metadata", {}) or {})
+
+    if ontology_path:
+        with open(ontology_path, encoding="utf-8") as f:
+            ontology = json.load(f)
+        metadata: dict = {}
+        if metadata_path:
+            with open(metadata_path, encoding="utf-8") as f:
+                metadata = json.load(f)
+        return to_agent_shape(ontology), metadata
+
+    raise ValueError(
+        "live run needs an ontology source: pass --registry-json (+--version) "
+        "or --ontology (+--metadata)"
+    )
diff --git a/src/agents/pge_eval/mapping_metrics.py b/src/agents/pge_eval/mapping_metrics.py
new file mode 100644
index 00000000..ecc588f3
--- /dev/null
+++ b/src/agents/pge_eval/mapping_metrics.py
@@ -0,0 +1,181 @@
+"""Stage-2 — mapping-generation quality (deterministic, no LLM).
+
+Computed from a captured PGE ``AgentResult`` artifact (the JSON dumped by
+``scripts/smoke_pge.py``).  Stage-2 reads two artifact fields:
+
+* ``mapping_run_log`` — authoritative per-item ``final_status`` (drives the
+  completeness ratios and pass/fail accounting).
+* ``mapping_evaluations`` — the per-item deterministic ``EvalReport`` dicts
+  the run captured (drives the numeric metrics: dangling fractions, id
+  integrity, sql-execution failures).  Defect detection keys off the
+  structured ``failures[].check`` field, never on prose.
+
+This makes score-only fully offline: no DB round-trip, no LLM, no network.
+Live mode produces the same artifact first, then calls this.  Nothing here
+is domain-specific.
+"""
+
+from __future__ import annotations
+
+import re
+from typing import Any, Dict, List, Set, Tuple
+
+from agents.pge_eval.normalize import (
+    NormalizedOntology,
+    local_name,
+    normalize_name,
+    normalize_ontology,
+)
+
+_PASS_STATUSES = {"PASS", "PRESEEDED"}  # final_status values counted as passed
+_OUT_OF_SCOPE = {"SKIPPED", "FAIL_BUDGET"}  # final_status values left out of the ratios
+
+_IDENT_RE = re.compile(r"[A-Za-z_][A-Za-z0-9_]*")  # identifier-like tokens
+
+
+def _ratio(num: int, denom: int) -> float:
+    return 1.0 if not denom else round(num / denom, 6)
+
+
+def _is_rel_report(metrics: dict) -> bool:
+    return any(key in metrics for key in ("dangling_target_pct", "total_edges"))
+
+
+def _is_entity_report(metrics: dict) -> bool:  # True iff "row_count" is a key
+    return "row_count" in metrics
+
+
+def _has_sql_failure(report: dict) -> bool:
+    """True when any ``failures`` entry of *report* has check ``sql_execution``."""
+    failures = report.get("failures", []) or []
+    checks = (failure.get("check") for failure in failures)
+    return "sql_execution" in checks
+
+
+def _class_dp_counts(norm: NormalizedOntology) -> Dict[str, int]:
+    """Index each class's data-property count by its uri and name, and by the
+    local name of each of those identifiers."""
+    counts: Dict[str, int] = {}
+    for cls in norm.classes:
+        total = len(cls.get("data_properties", []))
+        for ident in filter(None, (cls.get("uri"), cls.get("name"))):
+            counts[ident] = total
+            counts[local_name(ident)] = total
+    return counts
+
+
+def evaluate_mapping(
+    artifact: dict,
+    ontology: dict,
+) -> Tuple[Dict[str, Any], Dict[str, Any]]:
+    """Run the deterministic Stage-2 checks (missing/``null`` numbers count as 0).
+
+    Returns ``(metrics, extras)`` where ``extras`` carries the mapped
+    column footprint reused by ``pipeline.coverage_loss``.
+    """
+    norm = normalize_ontology(ontology)
+    run_log = artifact.get("mapping_run_log", []) or []
+    evaluations = artifact.get("mapping_evaluations", {}) or {}
+    entity_mappings = artifact.get("entity_mappings", []) or []
+
+    # ---- completeness from the run log -----------------------------
+    ent_inscope = ent_pass = 0
+    rel_inscope = rel_pass = 0
+    for entry in run_log:
+        status = entry.get("final_status", "")
+        if status in _OUT_OF_SCOPE:
+            continue
+        if entry.get("kind") == "entity":
+            ent_inscope += 1
+            if status in _PASS_STATUSES:
+                ent_pass += 1
+        elif entry.get("kind") == "relationship":
+            rel_inscope += 1
+            if status in _PASS_STATUSES:
+                rel_pass += 1
+
+    entity_completeness = _ratio(ent_pass, ent_inscope)
+    relationship_completeness = _ratio(rel_pass, rel_inscope)
+
+    # ---- numeric metrics from captured eval reports ----------------
+    dangling_target_pcts: List[float] = []
+    dangling_source_pcts: List[float] = []
+    id_ok = id_total = 0
+    sql_exec_failures = 0
+    band_declared = band_compliant = 0
+
+    for report in evaluations.values():
+        metrics = report.get("metrics", {}) or {}
+        if _has_sql_failure(report):
+            sql_exec_failures += 1
+        if _is_rel_report(metrics):
+            dangling_target_pcts.append(float(metrics.get("dangling_target_pct") or 0.0))
+            dangling_source_pcts.append(float(metrics.get("dangling_source_pct") or 0.0))
+            band = metrics.get("expected_cross_source_overlap_band")
+            if band and isinstance(band, (list, tuple)) and len(band) == 2:
+                band_declared += 1
+                lo, hi = float(band[0]), float(band[1])
+                overlap = float(metrics.get("cross_source_overlap_pct") or 0.0)
+                if lo <= overlap <= hi:
+                    band_compliant += 1
+        if _is_entity_report(metrics):
+            row_count = int(metrics.get("row_count") or 0)
+            # A legitimately empty (0-row) entity is id-vacuous: it has no ids to
+            # be (non-)unique, so it neither passes nor fails id-integrity.
+            # Counting it as a failure would RED a clean run on empty source data.
+            if row_count == 0:
+                continue
+            id_total += 1
+            distinct = int(metrics.get("distinct_id_count") or 0)
+            null_id = int(metrics.get("null_id_count") or 0)
+            if distinct == row_count and null_id == 0:
+                id_ok += 1
+
+    dangling_target_pct_max = round(max(dangling_target_pcts), 6) if dangling_target_pcts else 0.0
+    dangling_source_pct_max = round(max(dangling_source_pcts), 6) if dangling_source_pcts else 0.0
+    id_integrity = _ratio(id_ok, id_total)
+
+    # cross_source_band_compliance is conditional: only active when >=1 band
+    # was declared.  When inactive it reports 1.0 and is flagged so the gate
+    # skips it.
+    band_active = band_declared > 0
+    cross_source_band_compliance = (
+        _ratio(band_compliant, band_declared) if band_active else 1.0
+    )
+
+    # ---- attribute coverage + mapped footprint ---------------------
+    dp_counts = _class_dp_counts(norm)
+    attrs_emitted = 0
+    dp_denominator = 0
+    mapped_cols: Set[str] = set()
+    counted_classes: Set[str] = set()
+    for em in entity_mappings:
+        am = em.get("attribute_mappings", {}) or {}
+        attrs_emitted += len(am)
+        cls = em.get("ontology_class") or em.get("class_name") or ""
+        if cls and cls not in counted_classes:
+            counted_classes.add(cls)
+            dp_denominator += dp_counts.get(cls, dp_counts.get(local_name(cls), 0))
+        for value in am.values():
+            for tok in _IDENT_RE.findall(str(value)):
+                k = normalize_name(tok)
+                if k:
+                    mapped_cols.add(k)
+
+    attribute_coverage = _ratio(attrs_emitted, dp_denominator)
+
+    metrics_out: Dict[str, Any] = {
+        "entity_completeness": entity_completeness,
+        "relationship_completeness": relationship_completeness,
+        "attribute_coverage": attribute_coverage,
+        "dangling_target_pct_max": dangling_target_pct_max,
+        "dangling_source_pct_max": dangling_source_pct_max,
+        "id_integrity": id_integrity,
+        "sql_exec_failures": sql_exec_failures,
+        "cross_source_band_compliance": cross_source_band_compliance,
+    }
+    extras = {
+        "mapped_cols": mapped_cols,
+        "band_active": band_active,
+    }
+    return metrics_out, extras
diff --git a/src/agents/pge_eval/normalize.py b/src/agents/pge_eval/normalize.py
new file mode 100644
index 00000000..3e20baab
--- /dev/null
+++ b/src/agents/pge_eval/normalize.py
@@ -0,0 +1,262 @@
+"""Shape normalisation + footprint helpers for the PGE intrinsic evaluator.
+
+Everything in this module is pure Python — no LLM, no DB, no domain
+knowledge.  It exists so the rest of the scorer can reason over one stable
+in-memory shape regardless of whether the caller handed it the *agent*
+ontology shape (``{entities, relationships}``), the *registry* ontology
+shape (``{classes, properties}``), or raw source metadata.
+
+Design constraints (see docs/plans/2026-06-10-goal-loop-and-pge-eval-design.md):
+
+* **Usecase-agnostic.**  No table name, identifier, or count from any
+  particular domain is encoded here.  The only constants are generic
+  audit/surrogate column heuristics that hold for any relational source.
+* **Deterministic.**  Pure functions of their inputs; no randomness, no
+  wall-clock, no network.
+"""
+
+from __future__ import annotations
+
+import re
+from typing import Any, Dict, List, Optional, Set
+
+
+# =====================================================
+# Name normalisation
+# =====================================================
+
+
+def normalize_name(name: Optional[str]) -> str:
+    """Reduce a column / property / class name to its comparison key.
+
+    The name is lower-cased and every character outside ``a-z0-9`` is removed,
+    so ``first_name``, ``firstName`` and ``FirstName`` all collapse to
+    ``firstname``.  A falsy name yields ``""``.  This is the footprint-matching
+    key used to decide whether a source column "became" a data property
+    without consulting the mapping (Stage-1 is mapping-independent — see
+    D2/D3).
+    """
+    key = re.sub(r"[^a-z0-9]", "", str(name).lower()) if name else ""
+    return key
+
+
+def local_name(uri_or_name: Optional[str]) -> str:
+    """Strip any namespace from a URI/CURIE and return its local name.
+
+    ``http://x/Customer`` -> ``Customer``; ``ex:Customer`` -> ``Customer``;
+    ``Customer`` -> ``Customer``.  A falsy input yields ``""``.
+    """
+    if not uri_or_name:
+        return ""
+    tail = str(uri_or_name)
+    # Keep what follows the last "#", then what follows the last "/".
+    for sep in "#/":
+        tail = tail.rsplit(sep, 1)[-1]
+    if ":" in tail and not tail.startswith("http"):
+        tail = tail.rsplit(":", 1)[-1]
+    return tail
+
+
+# =====================================================
+# Audit / surrogate column heuristics (generic, not domain-specific)
+# =====================================================
+
+# Audit tokens that mark a column as non-analytical bookkeeping.  These are
+# generic ETL/CDC conventions, not tied to any domain.
+_AUDIT_TOKENS = (
+    "createdat",
+    "updatedat",
+    "createdon",
+    "updatedon",
+    "createdby",
+    "updatedby",
+    "modifiedat",
+    "modifiedby",
+    "deletedat",
+    "ingestedat",
+    "loadedat",
+    "loadts",
+    "etltimestamp",
+    "dwcreated",
+    "dwupdated",
+)
+_AUDIT_PREFIXES = ("etl", "ingest", "_ingest", "dw")
+# Exact surrogate row-key names + suffixes for warehouse surrogate keys.
+_SURROGATE_EXACT = ("id", "rowid", "rownum", "rownumber")
+_SURROGATE_SUFFIXES = ("sk", "surrogatekey")
+
+
+def is_surrogate_or_audit(column_name: str) -> bool:
+    """Heuristic: True when *column_name* is a surrogate row key or audit
+    column with no analytical value.
+
+    The OWL generator is instructed to drop exactly these, so they are
+    excluded from coverage denominators (D3).  Intentionally conservative:
+    ``*_id`` foreign keys are kept, and a short suffix such as ``sk`` only
+    counts on a word boundary (``customer_sk``, ``customerSk``, ``sk``) so
+    ordinary words like ``risk`` or ``task`` are not dropped.
+    """
+    norm = normalize_name(column_name)
+    if not norm or norm in _SURROGATE_EXACT:
+        return True
+    text = str(column_name).strip()
+    for sfx in _SURROGATE_SUFFIXES:
+        edge = rf"(?:^|[^A-Za-z0-9]|(?<=[a-z0-9])(?=[A-Z]))(?i:{sfx})$"
+        if norm.endswith(sfx) and (len(sfx) > 3 or re.search(edge, text)):
+            return True
+    if any(tok in norm for tok in _AUDIT_TOKENS):
+        return True
+    raw = re.sub(r"[^a-z0-9_]", "", text.lower())
+    return any(raw.startswith(p) for p in _AUDIT_PREFIXES)
+
+
+# =====================================================
+# Ontology normalisation
+# =====================================================
+
+
+def _attr_names(raw_attrs: Any) -> List[str]:
+    """Flatten an attribute container into a list of attribute names.
+
+    Strings are kept verbatim; dicts contribute ``local_name`` of their first
+    truthy ``name`` / ``localName`` / ``uri`` / ``label``; the rest is skipped.
+    """
+    names: List[str] = []
+    for item in raw_attrs or []:
+        if isinstance(item, str):
+            names.append(item)
+        elif isinstance(item, dict):
+            ident = next(filter(None, map(item.get, ("name", "localName", "uri", "label"))), None)
+            if ident:
+                names.append(local_name(ident))
+    return names
+
+
+class NormalizedOntology:
+    """Shape-agnostic, flattened view of a generated ontology.
+
+    Attributes:
+        classes: list of ``{"name", "uri", "data_properties": [str]}``.
+        object_properties: list of ``{"name", "uri", "domain", "range"}``
+            where domain/range are the raw refs as authored (URI or local).
+
+    The derived sets below are uncached and rebuilt on every access.
+    """
+
+    def __init__(self, classes: List[dict], object_properties: List[dict]):
+        self.classes = classes
+        self.object_properties = object_properties
+
+    # --- derived sets, rebuilt from ``classes`` on each access ------
+
+    @property
+    def class_resolution_set(self) -> Set[str]:
+        """Every token a domain/range ref could legitimately resolve to:
+        each class's uri and name, plus the local name of each.
+        """
+        tokens: Set[str] = set()
+        for cls in self.classes:
+            for ident in (cls.get("uri"), cls.get("name")):
+                if ident:
+                    tokens.update((ident, local_name(ident)))
+        return tokens
+
+    @property
+    def all_data_property_keys(self) -> Set[str]:
+        """Non-empty normalised keys of every data property of every class."""
+        normalised = (
+            normalize_name(local_name(dp))
+            for cls in self.classes
+            for dp in cls.get("data_properties", [])
+        )
+        return {key for key in normalised if key}
+
+    @property
+    def class_name_keys(self) -> Set[str]:
+        """Non-empty normalised key of each class's name (uri when unnamed)."""
+        normalised = (
+            normalize_name(local_name(cls.get("name") or cls.get("uri")))
+            for cls in self.classes
+        )
+        return {key for key in normalised if key}
+
+
+def normalize_ontology(ontology: dict) -> NormalizedOntology:
+    """Normalise either ontology shape into a ``NormalizedOntology``.
+
+    * Agent shape:    ``{"entities": [...], "relationships": [...]}``
+    * Registry shape: ``{"classes": [...], "properties": [...]}``
+
+    The agent shape is assumed as soon as either of its two keys is present;
+    otherwise the registry keys are read.  Names fall back to the local name
+    of the uri.  Registry properties whose ``type`` is set to anything other
+    than ``ObjectProperty`` are dropped; agent relationships are not filtered.
+
+    Args:
+        ontology: ontology dict in either shape; a falsy value is treated as
+            an empty ontology.
+
+    Returns:
+        ``NormalizedOntology`` holding the flattened classes and the
+        flattened object properties.
+    """
+    ontology = ontology or {}
+    agent_shape = "entities" in ontology or "relationships" in ontology
+    # Select the container / attribute key names of the detected shape.
+    if agent_shape:
+        class_key, attr_key, prop_key = "entities", "attributes", "relationships"
+    else:
+        class_key, attr_key, prop_key = "classes", "dataProperties", "properties"
+
+    # Classes: the name falls back to the local name of the uri.
+    classes: List[dict] = [
+        {
+            "name": c.get("name") or local_name(c.get("uri")),
+            "uri": c.get("uri", ""),
+            "data_properties": _attr_names(c.get(attr_key)),
+        }
+        for c in ontology.get(class_key, []) or []
+    ]
+    # Registry properties with a non-ObjectProperty ``type`` are skipped;
+    # agent relationships are taken as-is.
+    object_props: List[dict] = [
+        {
+            "name": p.get("name") or local_name(p.get("uri")),
+            "uri": p.get("uri", ""),
+            "domain": p.get("domain", ""),
+            "range": p.get("range", ""),
+        }
+        for p in ontology.get(prop_key, []) or []
+        if agent_shape or not p.get("type") or p.get("type") == "ObjectProperty"
+    ]
+
+    return NormalizedOntology(classes=classes, object_properties=object_props)
+
+
+# =====================================================
+# Source-metadata normalisation
+# =====================================================
+
+
+def normalize_metadata(metadata: dict) -> List[dict]:
+    """Flatten domain metadata into ``[{"name", "columns": [str]}]``.
+
+    Reads the ``{"tables": [{"name"|"full_name", "columns": [...]}]}`` shape.
+    A table is named by ``full_name``, else ``name``, else ``""``.  Column
+    entries may be plain strings (kept as-is) or dicts with a truthy
+    ``name``; any other entry is skipped.  A falsy ``metadata`` or a missing
+    ``tables`` key yields ``[]``.
+    """
+    flattened: List[dict] = []
+    for table in (metadata or {}).get("tables", []) or []:
+        entries = table.get("columns", []) or []
+        # Strings pass through; dicts contribute their truthy "name".
+        columns = [
+            entry if isinstance(entry, str) else entry["name"]
+            for entry in entries
+            if isinstance(entry, str)
+            or (isinstance(entry, dict) and entry.get("name"))
+        ]
+        title = table.get("full_name") or table.get("name") or ""
+        flattened.append({"name": title, "columns": columns})
+    return flattened
diff --git a/src/agents/pge_eval/ontology_metrics.py b/src/agents/pge_eval/ontology_metrics.py
new file mode 100644
index 00000000..21ba2c16
--- /dev/null
+++ b/src/agents/pge_eval/ontology_metrics.py
@@ -0,0 +1,270 @@
+"""Stage-1 — ontology-generation quality (deterministic, no LLM).
+
+Computed purely from the generated ontology + source metadata.  No mapping
+dependency (D2) and no LLM for the deterministic part (§3.2).  The same
+checks back the new owl-generator Evaluator stage (§3.5): each issue carries
+a concrete ``hint`` that becomes a generator retry_hint.
+
+All metrics are usecase-agnostic: nothing about any particular domain is
+hard-coded here.
+"""
+
+from __future__ import annotations
+
+import re
+from typing import Any, Dict, List, Set, Tuple
+
+from agents.pge_eval.normalize import (
+    NormalizedOntology,
+    is_surrogate_or_audit,
+    local_name,
+    normalize_metadata,
+    normalize_name,
+    normalize_ontology,
+)
+
+# Naming conventions (mirror the OWL generator's NAMING RULES, domain-free).
+_CLASS_RE = re.compile(r"^[A-Z][A-Za-z0-9]*$")
+_PROPERTY_RE = re.compile(r"^[a-z][A-Za-z0-9]*$")
+
+
+def _issue(check: str, expected: str, observed: str, hint: str) -> Dict[str, str]:
+    return dict(check=check, expected=expected, observed=observed, hint=hint)
+
+
+# =====================================================
+# Footprint computation (shared with pipeline.coverage_loss)
+# =====================================================
+
+
+def _column_key(table_name: str, column_name: str) -> str:
+    return "::".join(normalize_name(part) for part in (table_name, column_name))
+
+
+def compute_footprint(
+    ontology: NormalizedOntology, tables: List[dict]
+) -> Dict[str, Any]:
+    """Measure how much of the source metadata the ontology touches.
+
+    A *column* is covered when its normalised name matches some data
+    property's normalised name.  A *table* is covered when its name matches
+    a class name OR ≥1 of its non-surrogate columns is covered (D3).
+    Surrogate/audit columns never enter the column denominator.
+
+    Returns a dict with ``total_tables`` (``len(tables)``), ``covered_tables``
+    (set of table names), ``total_columns`` (count of non-surrogate columns)
+    and ``covered_columns`` (set of ``table::column`` keys).
+    """
+    property_keys = ontology.all_data_property_keys
+    class_keys = ontology.class_name_keys
+    countable = 0
+    hit_columns: Set[str] = set()
+    hit_tables: Set[str] = set()
+
+    for table in tables:
+        table_name = table["name"]
+        analytical = [c for c in table["columns"] if not is_surrogate_or_audit(c)]
+        matched = {
+            _column_key(table_name, c)
+            for c in analytical
+            if normalize_name(c) in property_keys
+        }
+        countable += len(analytical)
+        hit_columns |= matched
+        # A class-name match covers the table even with no covered column.
+        if matched or normalize_name(local_name(table_name)) in class_keys:
+            hit_tables.add(table_name)
+
+    return {
+        "total_tables": len(tables),
+        "covered_tables": hit_tables,
+        "total_columns": countable,
+        "covered_columns": hit_columns,
+    }
+
+
+# =====================================================
+# Stage-1 metrics + issues
+# =====================================================
+
+
+def evaluate_ontology(
+    ontology: dict,
+    metadata: dict,
+) -> Tuple[Dict[str, Any], List[Dict[str, str]], Dict[str, Any]]:
+    """Run the deterministic Stage-1 checks on an ontology + source metadata.
+
+    Returns ``(metrics, issues, footprint)``:
+
+    * ``metrics`` — two coverage ratios (rounded to 6 dp) + four defect counts.
+    * ``issues`` — one ``check/expected/observed/hint`` dict per uncovered
+      table, orphan class, dangling ref, naming violation or duplicate key.
+    * ``footprint`` — the ``compute_footprint`` result (covered tables/columns
+      sets and totals).
+    """
+    norm = normalize_ontology(ontology)
+    tables = normalize_metadata(metadata)
+    footprint = compute_footprint(norm, tables)
+
+    issues: List[Dict[str, str]] = []
+
+    # ---- coverage ratios (Tier-2 warn) -----------------------------
+    table_cov = (
+        len(footprint["covered_tables"]) / footprint["total_tables"]
+        if footprint["total_tables"]
+        else 1.0
+    )
+    column_cov = (
+        len(footprint["covered_columns"]) / footprint["total_columns"]
+        if footprint["total_columns"]
+        else 1.0
+    )
+
+    uncovered_tables = [
+        t["name"]
+        for t in tables
+        if t["name"] not in footprint["covered_tables"]
+    ]
+    for tname in uncovered_tables:
+        issues.append(
+            _issue(
+                "table_footprint_coverage",
+                "table maps to a class or contributes >=1 data property",
+                "no footprint",
+                f"source table '{tname}' has no class and contributes no data "
+                "property — model it as a class, attach its columns as data "
+                "properties on an existing class, or justify the omission.",
+            )
+        )
+
+    # ---- orphan classes (Tier-1 absolute = 0) ----------------------
+    related: Set[str] = set()
+    for op in norm.object_properties:
+        for ref in (op.get("domain"), op.get("range")):
+            if ref:
+                related.add(local_name(ref))
+                related.add(str(ref))
+    orphan_classes: List[str] = []
+    for c in norm.classes:
+        has_props = bool(c.get("data_properties"))
+        name = c.get("name") or local_name(c.get("uri"))
+        in_rel = name in related or local_name(c.get("uri")) in related
+        if not has_props and not in_rel:
+            orphan_classes.append(name)
+            issues.append(
+                _issue(
+                    "orphan_class_count",
+                    "0 orphan classes",
+                    name,
+                    f"class '{name}' is an orphan (no data properties and no "
+                    "object-property domain/range) — attach properties, relate "
+                    "it to another class, or remove it.",
+                )
+            )
+
+    # ---- dangling domain/range (Tier-1 absolute = 0) ---------------
+    resolvable = norm.class_resolution_set
+    dangling_dr: List[str] = []
+    for op in norm.object_properties:
+        opname = op.get("name") or local_name(op.get("uri"))
+        for role in ("domain", "range"):
+            ref = op.get(role)
+            if not ref:
+                dangling_dr.append(f"{opname}.{role}")
+                issues.append(
+                    _issue(
+                        "dangling_domain_range_count",
+                        f"ObjectProperty {role} resolves to a class",
+                        f"{opname}.{role}=<missing>",
+                        f"ObjectProperty '{opname}' has no {role} — declare an "
+                        f"rdfs:{role} pointing at an existing class.",
+                    )
+                )
+                continue
+            if ref not in resolvable and local_name(ref) not in resolvable:
+                dangling_dr.append(f"{opname}.{role}")
+                issues.append(
+                    _issue(
+                        "dangling_domain_range_count",
+                        f"ObjectProperty {role} resolves to a class",
+                        f"{opname}.{role}={local_name(ref)}",
+                        f"ObjectProperty '{opname}' has {role} "
+                        f"'{local_name(ref)}' which resolves to no class — fix "
+                        "the reference or add the missing class.",
+                    )
+                )
+
+    # ---- naming violations (Tier-1 absolute = 0) -------------------
+    naming_violations: List[str] = []
+    for c in norm.classes:
+        nm = local_name(c.get("name") or c.get("uri"))
+        if nm and not _CLASS_RE.match(nm):
+            naming_violations.append(f"class:{nm}")
+            issues.append(
+                _issue(
+                    "naming_violation_count",
+                    "class name is PascalCase [A-Z][A-Za-z0-9]*",
+                    nm,
+                    f"class '{nm}' violates PascalCase — remove spaces / "
+                    "underscores / hyphens and capitalise (e.g. sales_order -> "
+                    "SalesOrder).",
+                )
+            )
+    for op in norm.object_properties:
+        nm = local_name(op.get("name") or op.get("uri"))
+        if nm and not _PROPERTY_RE.match(nm):
+            naming_violations.append(f"property:{nm}")
+            issues.append(
+                _issue(
+                    "naming_violation_count",
+                    "property name is lowerCamelCase [a-z][A-Za-z0-9]*",
+                    nm,
+                    f"property '{nm}' violates lowerCamelCase — use "
+                    "[a-z][A-Za-z0-9]* with no underscores/hyphens/escapes.",
+                )
+            )
+    # data properties are held to the same pattern as object properties
+    for c in norm.classes:
+        for dp in c.get("data_properties", []):
+            nm = local_name(dp)
+            if nm and not _PROPERTY_RE.match(nm):
+                naming_violations.append(f"dataproperty:{nm}")
+                issues.append(
+                    _issue(
+                        "naming_violation_count",
+                        "data property name is lowerCamelCase",
+                        nm,
+                        f"data property '{nm}' violates lowerCamelCase — use "
+                        "[a-z][A-Za-z0-9]* with no underscores/hyphens/escapes.",
+                    )
+                )
+
+    # ---- duplicate classes (Tier-1 absolute = 0) -------------------
+    seen: Dict[str, int] = {}
+    for c in norm.classes:
+        key = normalize_name(local_name(c.get("name") or c.get("uri")))
+        if not key:
+            continue
+        seen[key] = seen.get(key, 0) + 1
+    duplicate_class_count = sum(n - 1 for n in seen.values() if n > 1)
+    for key, n in seen.items():
+        if n > 1:
+            issues.append(
+                _issue(
+                    "duplicate_class_count",
+                    "0 duplicate class local names",
+                    f"{key} x{n}",
+                    f"{n} classes collapse to the local name '{key}' — merge "
+                    "them or differentiate their names/definitions.",
+                )
+            )
+
+    metrics: Dict[str, Any] = {
+        "table_footprint_coverage": round(table_cov, 6),
+        "column_footprint_coverage": round(column_cov, 6),
+        "orphan_class_count": len(orphan_classes),
+        "dangling_domain_range_count": len(dangling_dr),
+        "naming_violation_count": len(naming_violations),
+        "duplicate_class_count": duplicate_class_count,
+    }
+    return metrics, issues, footprint
diff --git a/src/agents/pge_eval/pipeline_metrics.py b/src/agents/pge_eval/pipeline_metrics.py
new file mode 100644
index 00000000..ac5ae87e
--- /dev/null
+++ b/src/agents/pge_eval/pipeline_metrics.py
@@ -0,0 +1,85 @@
+"""Pipeline-level metrics (deterministic, no LLM).
+
+* ``coverage_loss`` — source concepts the ontology surfaced but that never
+  reached a mapping (ontology footprint − mapped footprint).  The gap
+  between the two complementary denominators of D2.
+* ``convergence`` — effort signals (mean generator attempts, planner
+  reinvocations, total tokens, wall-clock).
+
+Both pipeline metrics are **tracked/advisory only** — they are reported on the
+scorecard for inspection and trend-watching but are not wired into any gate
+tier (no ``METRIC_SPECS`` entry references the ``pipeline`` stage). Treat them
+as observability, not pass/fail.
+
+Usecase-agnostic.
+"""
+
+from __future__ import annotations
+
+from typing import Any, Dict, List, Set
+
+from agents.pge_eval.normalize import normalize_name
+
+
+def _surfaced_column_keys(footprint: Dict[str, Any]) -> Set[str]:
+    """Bare column names of every ontology-covered column.
+
+    ``footprint['covered_columns']`` holds ``table::col`` keys; only the text
+    after the first ``::`` is kept so the result lines up with the mapped
+    footprint (which has no reliable table qualifier). Nothing is normalised
+    here — presumably the keys arrive already normalised; verify upstream.
+    """
+    column_parts = (
+        key.split("::", 1)[-1] for key in footprint.get("covered_columns", set())
+    )
+    # Keys that end in "::" leave an empty column part; those are dropped.
+    return {col for col in column_parts if col}
+
+
+def evaluate_pipeline(
+    artifact: dict,
+    ontology_footprint: Dict[str, Any],
+    mapped_cols: Set[str],
+) -> Dict[str, Any]:
+    """Return coverage loss plus convergence signals; absent/None fields count as 0."""
+    surfaced = _surfaced_column_keys(ontology_footprint)
+    coverage_loss = len({c for c in surfaced if c not in mapped_cols})
+
+    # ---- convergence -----------------------------------------------
+    run_log = artifact.get("mapping_run_log", []) or []
+    attempt_counts: List[int] = [
+        len(entry["attempts"]) for entry in run_log if entry.get("attempts")
+    ]
+    mean_attempts = (
+        round(sum(attempt_counts) / len(attempt_counts), 6) if attempt_counts else 0.0
+    )
+
+    stats = artifact.get("stats", {}) or {}
+    planner_reinvocations = int(
+        stats.get("planner_reinvocations", artifact.get("planner_reinvocations", 0)) or 0
+    )
+
+    usage = artifact.get("usage", {}) or {}
+    # Fallback sum: ``or 0`` per part, because a key that is present with a
+    # ``None`` value would otherwise raise TypeError on the addition.
+    total_tokens = int(
+        usage.get("total_tokens", 0)
+        or ((usage.get("prompt_tokens") or 0) + (usage.get("completion_tokens") or 0))
+    )
+
+    wall_clock_s = float(artifact.get("elapsed_s", 0.0) or 0.0)
+    if not wall_clock_s:
+        step_ms = sum(
+            int(s.get("duration_ms", 0) or 0) for s in artifact.get("steps", []) or []
+        )
+        wall_clock_s = round(step_ms / 1000.0, 3)
+
+    return {
+        "coverage_loss": coverage_loss,
+        "convergence": {
+            "mean_generator_attempts": mean_attempts,
+            "planner_reinvocations": planner_reinvocations,
+            "total_tokens": total_tokens,
+            "wall_clock_s": wall_clock_s,
+        },
+    }
diff --git a/src/agents/pge_eval/scorecard.py b/src/agents/pge_eval/scorecard.py
new file mode 100644
index 00000000..cdb4e05b
--- /dev/null
+++ b/src/agents/pge_eval/scorecard.py
@@ -0,0 +1,161 @@
+"""Scorecard assembly + verdict (§3.6).
+
+``score_artifact`` is the single offline-testable code path (D6): it ingests
+a captured ``AgentResult`` artifact (plus the generated ontology and source
+metadata) and emits the §3.6 scorecard JSON.  Every deterministic metric is
+computed with zero LLM calls; the advisory judge is the only network path
+and runs only when ``no_judge`` is False.
+
+Live mode (``scripts/goals_eval.py run``) is a thin wrapper: it produces the
+artifact first, then calls this.
+"""
+
+from __future__ import annotations
+
+import hashlib
+import json
+from typing import Any, Dict, List, Optional
+
+from agents.pge_eval import gates as gates_mod
+from agents.pge_eval.baseline import DEFAULT_BASELINE_DIR, load_baseline
+from agents.pge_eval.mapping_metrics import evaluate_mapping
+from agents.pge_eval.normalize import normalize_metadata, normalize_ontology
+from agents.pge_eval.ontology_metrics import evaluate_ontology
+from agents.pge_eval.pipeline_metrics import evaluate_pipeline
+
+SCHEMA_VERSION = "1.0"
+
+
+def _digest(obj: Any) -> str:
+    canonical = json.dumps(obj, sort_keys=True, default=str)  # stable key order
+    return hashlib.sha256(canonical.encode("utf-8")).hexdigest()[:16]
+
+
+def _ontology_digest(ontology: dict) -> str:
+    """Fingerprint the normalised ontology's classes and object properties."""
+    norm = normalize_ontology(ontology)
+    # Both lists are sorted so the digest does not depend on declaration order.
+    class_sigs = [
+        (c.get("name", ""), tuple(sorted(c.get("data_properties", []))))
+        for c in norm.classes
+    ]
+    op_sigs = [
+        (op.get("name", ""), op.get("domain", ""), op.get("range", ""))
+        for op in norm.object_properties
+    ]
+    return _digest({"classes": sorted(class_sigs), "object_properties": sorted(op_sigs)})
+
+
+def _metadata_digest(metadata: dict) -> str:
+    # Tables and their columns are both sorted, so input order never matters.
+    rows = ((t["name"], tuple(sorted(t["columns"]))) for t in normalize_metadata(metadata))
+    return _digest(sorted(rows))
+
+
+def _resolve_inputs(artifact: dict, ontology, metadata):
+    """Default ``None`` ontology/metadata from the artifact (``{}`` if absent)."""
+    if ontology is not None and metadata is not None:
+        return ontology, metadata
+    if ontology is None:
+        ontology = artifact.get("ontology") or {}
+    if metadata is None:
+        embedded = artifact.get("metadata") or artifact.get("source_metadata")
+        metadata = embedded or {}
+    return ontology, metadata
+
+
+def score_artifact(
+    artifact: dict,
+    *,
+    ontology: Optional[dict] = None,
+    metadata: Optional[dict] = None,
+    gate_ratios: bool = False,
+    no_judge: bool = True,
+    mode: str = "score-only",
+    run_id: Optional[str] = None,
+    timestamp: Optional[str] = None,
+    endpoint: Optional[str] = None,
+    host: Optional[str] = None,
+    token: Optional[str] = None,
+    baseline_dir: str = DEFAULT_BASELINE_DIR,
+    baseline: Optional[Dict[str, Any]] = None,
+    use_baseline: bool = True,
+    ratio_threshold: float = gates_mod.DEFAULT_RATIO_THRESHOLD,
+) -> Dict[str, Any]:
+    """Score a captured artifact and return the §3.6 scorecard dict.
+
+    ``ontology``/``metadata`` left as None are read from ``artifact``; the
+    verdict is GREEN (``exit_code`` 0) only when all three gate tiers pass.
+    Deterministic unless ``no_judge`` is False (caller stamps run_id/timestamp).
+    """
+    ontology, metadata = _resolve_inputs(artifact, ontology, metadata)
+
+    onto_metrics, stage1_issues, footprint = evaluate_ontology(ontology, metadata)
+    map_metrics, map_extras = evaluate_mapping(artifact, ontology)
+    pipeline = evaluate_pipeline(artifact, footprint, map_extras["mapped_cols"])
+
+    # ---- advisory judge (only LLM path) ----------------------------
+    if no_judge:
+        onto_judge = {"score": None, "flags": []}
+        map_judge = {"score": None, "flags": []}
+    else:
+        from agents.pge_eval.judge import run_judge  # imported only on the judge path
+
+        verdicts = run_judge(
+            host=host or "",
+            token=token or "",
+            endpoint_name=endpoint or "",
+            ontology=ontology,
+            artifact=artifact,
+            stage1_issues=stage1_issues,
+        )
+        onto_judge = verdicts["ontology"]
+        map_judge = verdicts["mapping"]
+
+    stages = {
+        "ontology": {"metrics": onto_metrics, "judge": onto_judge},
+        "mapping": {
+            "metrics": map_metrics,
+            "judge": map_judge,
+            # Persisted so Tier-3 can tell an inactive-1.0 band (no band declared)
+            # from an active measurement, and not flag the first real band reading
+            # as a regression.
+            "band_active": bool(map_extras.get("band_active")),
+        },
+        "pipeline": pipeline,
+    }
+
+    # ---- gates -----------------------------------------------------
+    active_conditionals = {"band_active": bool(map_extras.get("band_active"))}
+    tier1 = gates_mod.evaluate_tier1(stages, active_conditionals=active_conditionals)
+    tier2 = gates_mod.evaluate_tier2(
+        stages, gate_ratios=gate_ratios, threshold=ratio_threshold
+    )
+
+    if baseline is None and use_baseline:  # an explicitly passed baseline is never replaced
+        baseline = load_baseline(baseline_dir, exclude_run_id=run_id)
+    tier3 = gates_mod.evaluate_tier3(stages, baseline)
+
+    passed = tier1["passed"] and tier2["passed"] and tier3["passed"]
+    verdict = "GREEN" if passed else "RED"
+
+    return {
+        "schema_version": SCHEMA_VERSION,
+        "run_id": run_id,
+        "timestamp": timestamp,
+        "mode": mode,
+        "inputs": {
+            "source_metadata_digest": _metadata_digest(metadata),
+            "ontology_digest": _ontology_digest(ontology),
+            "endpoint": None if no_judge else endpoint,  # recorded only when the judge ran
+        },
+        "stages": stages,
+        "stage1_issues": stage1_issues,
+        "gates": {
+            "tier1_absolute": tier1,
+            "tier2_ratio": tier2,
+            "tier3_regression": tier3,
+        },
+        "verdict": verdict,
+        "exit_code": 0 if verdict == "GREEN" else 1,
+    }
diff --git a/src/agents/tools/context.py b/src/agents/tools/context.py
index 3b88df82..6edf3919 100644
--- a/src/agents/tools/context.py
+++ b/src/agents/tools/context.py
@@ -6,7 +6,10 @@
 """
 
 from dataclasses import dataclass, field
-from typing import Any, Optional
+from typing import TYPE_CHECKING, Any, Optional
+
+if TYPE_CHECKING:
+    from agents.agent_mapping_pge.contracts import EvalReport, SourceModel
 
 
 @dataclass
@@ -53,3 +56,13 @@ class ToolContext:
     dtwin_registry_params: dict = field(default_factory=dict)
     dtwin_domain_name: str = ""
     dtwin_ontology_labels: dict = field(default_factory=dict)  # uri/name → display label
+
+    # Mapping PGE planner output (``agent_mapping_pge``) – populated by the
+    # ``submit_source_model`` terminal tool. Forward-ref string typing avoids a
+    # circular import between ``agents.tools`` and ``agents.agent_mapping_pge``.
+    source_model: Optional["SourceModel"] = None
+
+    # Mapping PGE semantic critic output (``agent_mapping_pge``) – populated by
+    # the ``submit_evaluation`` terminal tool of the Sprint 6 Critic agent.
+    # Same forward-ref pattern as ``source_model`` to avoid a circular import.
+    semantic_eval_report: Optional["EvalReport"] = None
diff --git a/src/agents/tools/evaluation.py b/src/agents/tools/evaluation.py
new file mode 100644
index 00000000..e4f62f94
--- /dev/null
+++ b/src/agents/tools/evaluation.py
@@ -0,0 +1,205 @@
+"""Terminal tool for the mapping-PGE Semantic Critic (Sprint 6).
+
+The Critic audits ONE submitted mapping for semantic correctness after the
+deterministic (stage-1) evaluator has already passed. It submits its verdict
+through ``submit_evaluation`` — the terminal tool defined here — which
+constructs an :class:`EvalReport` (stage="semantic") and stamps it onto
+``ctx.semantic_eval_report``.
+
+This module deliberately mirrors the shape of the other terminal tools
+(``submit_source_model``, ``submit_entity_mapping``, …) — pure-Python handler
+with a JSON-schema definition for OpenAI function calling, exported via
+``EVALUATION_TOOL_DEFINITIONS`` / ``EVALUATION_TOOL_HANDLERS`` aggregates.
+"""
+
+import json
+from typing import Callable, Dict, List, Optional
+
+from back.core.logging import get_logger
+from agents.tools.context import ToolContext
+
+logger = get_logger(__name__)
+
+
+# =====================================================
+# OpenAI function-calling definition
+# =====================================================
+
+SUBMIT_EVALUATION_DEF: dict = {
+    "type": "function",
+    "function": {
+        "name": "submit_evaluation",
+        "description": (
+            "Submit the final semantic evaluation. Terminal tool — call exactly once "
+            "when you have a confident verdict. status MUST be 'PASS' or 'FAIL'. "
+            "If failing, populate failures[] with at least one entry. "
+            "Set bubble_to_planner=true ONLY when the wrong TABLE was chosen "
+            "(not just a wrong column within the right table)."
+        ),
+        "parameters": {
+            "type": "object",
+            "properties": {
+                "status": {"type": "string", "enum": ["PASS", "FAIL"]},
+                "failures": {
+                    "type": "array",
+                    "items": {
+                        "type": "object",
+                        "properties": {
+                            "check": {"type": "string"},
+                            "expected": {"type": "string"},
+                            "observed": {"type": "string"},
+                            "hint": {"type": "string"},
+                        },
+                        # The handler still tolerates missing keys: they become "".
+                        "required": ["check", "expected", "observed", "hint"],
+                    },
+                    "description": "Empty when status is PASS.",
+                },
+                "bubble_to_planner": {"type": "boolean"},  # handler demotes it to false on PASS
+                "reasoning": {
+                    "type": "string",
+                    "description": "One-paragraph summary of the audit reasoning.",
+                },
+            },
+            "required": ["status"],  # the handler has defaults for every other field
+        },
+    },
+}
+
+
+# =====================================================
+# Handler
+# =====================================================
+
+
+def tool_submit_evaluation(
+    ctx: ToolContext,
+    *,
+    status: str = "",
+    failures: Optional[list] = None,
+    bubble_to_planner: bool = False,
+    reasoning: str = "",
+    **_kwargs,
+) -> str:
+    """Construct an EvalReport from the critic's submission and store on ctx.
+
+    Contract:
+      * ``status`` MUST be one of ``"PASS"`` or ``"FAIL"`` — anything else is
+        rejected as a JSON error so the agent loop can coach the LLM and
+        continue (it does NOT terminate the loop).
+      * On ``FAIL`` with an empty ``failures`` list, a generic
+        ``semantic_audit`` failure is synthesised so the resulting report is
+        coherent (status=FAIL <=> failures non-empty, matching
+        :func:`evaluator.report.build_report` semantics).
+      * ``bubble_to_planner=True`` is demoted to False when status is PASS —
+        same invariant the deterministic evaluator's :func:`build_report`
+        enforces (a passing evaluation should not escalate).
+      * A textual ``bubble_to_planner`` (LLMs sometimes send ``"false"``) is
+        read by value: only ``"true"``/``"1"``/``"yes"`` count as True.
+    """
+    logger.info(
+        "tool_submit_evaluation: status=%s, failures=%d, bubble=%s, reasoning=%d chars",
+        status,
+        len(failures or []),
+        bubble_to_planner,
+        len(reasoning or ""),
+    )
+
+    if status not in ("PASS", "FAIL"):
+        logger.warning("tool_submit_evaluation: invalid status=%r", status)
+        return json.dumps(
+            {
+                "success": False,
+                "error": f"invalid status: {status!r} (must be PASS or FAIL)",
+            }
+        )
+
+    # Lazy import — loading these agent_mapping_pge contracts at module import
+    # time would create a cycle through ``agents.tools.context``.
+    from agents.agent_mapping_pge.contracts import EvalFailure, EvalReport
+
+    eval_failures: List[EvalFailure] = [
+        EvalFailure(
+            kind="semantic",
+            check=str(f.get("check") or ""),
+            expected=str(f.get("expected") or ""),
+            observed=str(f.get("observed") or ""),
+            hint=str(f.get("hint") or ""),
+        )
+        for f in failures or []
+        if isinstance(f, dict)
+    ]
+
+    # status=PASS <=> failures empty: drop any the LLM sent alongside a PASS.
+    if status == "PASS" and eval_failures:
+        logger.warning(
+            "tool_submit_evaluation: status=PASS with %d failures — clamping to []",
+            len(eval_failures),
+        )
+        eval_failures = []
+
+    # If status=FAIL but no failures, synthesise a generic one so the report
+    # is coherent (status=FAIL <=> failures non-empty).
+    if status == "FAIL" and not eval_failures:
+        logger.debug(
+            "tool_submit_evaluation: synthesising semantic_audit failure for "
+            "FAIL with no failures[]"
+        )
+        eval_failures.append(
+            EvalFailure(
+                kind="semantic",
+                check="semantic_audit",
+                expected="PASS",
+                observed="FAIL",
+                hint=reasoning or "critic returned FAIL without specific failures",
+            )
+        )
+
+    # bool("false") is True, so a textual flag from the LLM is read by value.
+    if isinstance(bubble_to_planner, str):
+        bubble_to_planner = bubble_to_planner.strip().lower() in ("true", "1", "yes")
+    # A passing evaluation never escalates to the Planner (same invariant as
+    # ``build_report``), so demote the flag when status is PASS.
+    if status == "PASS" and bubble_to_planner:
+        logger.warning(
+            "tool_submit_evaluation: bubble_to_planner=True with status=PASS — "
+            "demoting to False"
+        )
+        bubble_to_planner = False
+
+    metrics: Dict[str, str] = {"reasoning": reasoning} if reasoning else {}
+
+    report = EvalReport(
+        status=status,
+        stage="semantic",
+        metrics=metrics,
+        failures=eval_failures,
+        bubble_to_planner=bool(bubble_to_planner),
+    )
+    ctx.semantic_eval_report = report
+
+    logger.info(
+        "tool_submit_evaluation: stored EvalReport status=%s, failures=%d, bubble=%s",
+        report.status,
+        len(report.failures),
+        report.bubble_to_planner,
+    )
+
+    return json.dumps(
+        {
+            "success": True,
+            "status": status,
+            "failures": len(eval_failures),
+            "bubble_to_planner": report.bubble_to_planner,
+        }
+    )
+
+
+# =====================================================
+# Aggregates
+# =====================================================
+
+EVALUATION_TOOL_DEFINITIONS: List[dict] = [SUBMIT_EVALUATION_DEF]
+
+EVALUATION_TOOL_HANDLERS: Dict[str, Callable] = {
+    "submit_evaluation": tool_submit_evaluation,
+}
diff --git a/src/agents/tools/mapping.py b/src/agents/tools/mapping.py
index e54ff80f..82b11279 100644
--- a/src/agents/tools/mapping.py
+++ b/src/agents/tools/mapping.py
@@ -35,15 +35,25 @@ def tool_submit_entity_mapping(
     id_column: str = "",
     label_column: str = "",
     attribute_mappings: Optional[dict] = None,
+    unmapped_attributes: Optional[list] = None,
     **_kwargs,
 ) -> str:
-    """Record a completed entity mapping."""
+    """Record a completed entity mapping.
+
+    ``unmapped_attributes`` lets the Generator stage declare ontology attributes
+    it intentionally did not map to a column, with a one-sentence ``reason``.
+    Items may be either bare strings (attribute name only) or dicts of shape
+    ``{"name": str, "reason": str}`` — the richer dict form is preferred for
+    downstream consumption but bare strings round-trip too. Anything else is
+    coerced to a string for safety. This supports the PGE "no silent drops"
+    invariant (every ontology attribute is either in ``attribute_mappings`` or
+    in ``unmapped_attributes``); the coverage check itself runs downstream.
+    """
     # Normalise column names: strip any surrounding backticks the LLM may have added.
     id_column = _strip_backticks(id_column)
     label_column = _strip_backticks(label_column)
     if attribute_mappings:
         attribute_mappings = {k: _strip_backticks(v) for k, v in attribute_mappings.items()}
-
     logger.info("tool_submit_entity_mapping: '%s' (uri=%s)", class_name, class_uri)
     if not class_uri or not sql_query:
         logger.warning("tool_submit_entity_mapping: missing required fields")
@@ -55,6 +65,22 @@ def tool_submit_entity_mapping(
         .rstrip(";")
     )
 
+    # Normalise ``unmapped_attributes`` — accept either form, persist as-is for
+    # dicts, leave bare strings as strings (validation/coverage is downstream).
+    normalised_unmapped: List = []
+    for item in unmapped_attributes or []:
+        if isinstance(item, dict) and "name" in item:
+            normalised_unmapped.append(
+                {
+                    "name": str(item.get("name", "")),
+                    "reason": str(item.get("reason", "")),
+                }
+            )
+        elif isinstance(item, str):
+            normalised_unmapped.append(item)
+        else:
+            normalised_unmapped.append(str(item))
+
     # Restrict attribute_mappings to attributes declared in the ontology for this entity.
     # This prevents the LLM from inventing mappings for columns that are not ontology
     # data properties (e.g. mapping all table columns when the entity has none).
@@ -107,6 +133,7 @@ def tool_submit_entity_mapping(
         "id_column": id_column,
         "label_column": label_column,
         "attribute_mappings": filtered_mappings,
+        "unmapped_attributes": normalised_unmapped,
     }
     # Preserve user-set excluded_attributes across auto-map runs.
     if existing_excl:
@@ -132,12 +159,14 @@ def tool_submit_entity_mapping(
         logger.debug("tool_submit_entity_mapping: appended new mapping")
 
     mapped_attrs = len(mapping["attribute_mappings"])
+    unmapped_count = len(mapping["unmapped_attributes"])
     logger.info(
-        "tool_submit_entity_mapping: '%s' recorded — ID=%s, Label=%s, %d attr(s) mapped",
+        "tool_submit_entity_mapping: '%s' recorded — ID=%s, Label=%s, %d attr(s) mapped, %d unmapped",
         class_name,
         id_column,
         label_column,
         mapped_attrs,
+        unmapped_count,
     )
     return json.dumps(
         {
@@ -146,6 +175,7 @@ def tool_submit_entity_mapping(
             "id_column": id_column,
             "label_column": label_column,
             "attributes_mapped": mapped_attrs,
+            "attributes_unmapped": unmapped_count,
             "total_entity_mappings": len(ctx.entity_mappings),
         }
     )
@@ -311,6 +341,30 @@ def _extract_label(value: str) -> str:
                         ),
                         "additionalProperties": {"type": "string"},
                     },
+                    "unmapped_attributes": {
+                        "type": "array",
+                        "description": (
+                            "Ontology attributes you intentionally did NOT map to a column, "
+                            "each with a one-sentence reason. Use this to satisfy the "
+                            'no-silent-drops invariant. Preferred shape: '
+                            '[{"name": "apgarScore", "reason": "absent from source table"}]. '
+                            "Bare strings are also accepted but discouraged."
+                        ),
+                        "items": {
+                            "type": "object",
+                            "properties": {
+                                "name": {
+                                    "type": "string",
+                                    "description": "Ontology attribute name.",
+                                },
+                                "reason": {
+                                    "type": "string",
+                                    "description": "Why this attribute was not mapped.",
+                                },
+                            },
+                            "required": ["name", "reason"],
+                        },
+                    },
                 },
                 "required": [
                     "class_uri",
@@ -347,11 +401,19 @@ def _extract_label(value: str) -> str:
                     },
                     "source_id_column": {
                         "type": "string",
-                        "description": "Column name for the source entity identifier.",
+                        "description": (
+                            "The output-column alias in sql_query that holds the "
+                            "source id — alias it AS source_id and pass "
+                            '"source_id" here (NOT the entity\'s id_column).'
+                        ),
                     },
                     "target_id_column": {
                         "type": "string",
-                        "description": "Column name for the target entity identifier.",
+                        "description": (
+                            "The output-column alias in sql_query that holds the "
+                            "target id — alias it AS target_id and pass "
+                            '"target_id" here (NOT the entity\'s id_column).'
+                        ),
                     },
                     "domain": {
                         "type": "string",
@@ -385,3 +447,10 @@ def _extract_label(value: str) -> str:
     "submit_entity_mapping": tool_submit_entity_mapping,
     "submit_relationship_mapping": tool_submit_relationship_mapping,
 }
+
+# Name-indexed view of MAPPING_TOOL_DEFINITIONS so callers needing a single
+# definition (e.g. the EntityGenerator, which only exposes the entity submit
+# tool) can look it up by name without re-scanning the list.
+MAPPING_TOOL_DEFINITIONS_BY_NAME: Dict[str, dict] = {
+    d["function"]["name"]: d for d in MAPPING_TOOL_DEFINITIONS
+}
diff --git a/src/agents/tools/planner.py b/src/agents/tools/planner.py
new file mode 100644
index 00000000..a1de9af9
--- /dev/null
+++ b/src/agents/tools/planner.py
@@ -0,0 +1,707 @@
+"""
+Planner tools – used by the mapping-PGE Planner agent (Sprint 2+).
+
+Exposes the OpenAI function-calling tools that let the Planner LLM probe
+source tables and submit a validated ``SourceModel`` artefact:
+
+* ``sample_table``         — N random rows from a table (n capped at 100).
+* ``column_value_overlap`` — one-sided distinct-value overlap between two columns.
+* ``normalized_value_overlap`` — same metric, but each side is a scalar SQL
+  expression, so canonical-key normalizations can be proven before commit.
+* ``distinct_count``       — uniqueness / completeness of a candidate canonical id.
+* ``submit_source_model``  — terminal tool: validates the candidate SourceModel
+  JSON against :class:`agents.agent_mapping_pge.contracts.SourceModel` and stores
+  the dataclass instance on :attr:`ToolContext.source_model`.
+
+All handlers return JSON strings (same convention as ``agents.tools.sql``)
+and stringify scalar values for the LLM-facing surface.
+"""
+
+import json
+import re
+from typing import Any, Callable, Dict, List, Optional, Tuple
+
+from back.core.logging import get_logger
+from agents.tools.context import ToolContext
+
+logger = get_logger(__name__)
+
+
+# Cap on ``n`` in ``sample_table`` to keep the LLM context bounded.
+_SAMPLE_TABLE_MAX_N = 100
+_SAMPLE_TABLE_DEFAULT_N = 20
+
+
+# Permissive but injection-safe SQL identifier shape. We allow dots (for
+# fully-qualified ``catalog.schema.table``) and backticks (for quoted
+# identifiers), plus the usual alphanumerics + underscore. Anything else
+# — semicolons, whitespace, quotes, comment markers — is rejected.
+_IDENTIFIER_RE = re.compile(r"^[A-Za-z0-9_.`]+$")
+
+
+# SQL keywords whose presence in a "normalization expression" indicates the
+# string is no longer a scalar expression but a smuggled clause / subquery /
+# DDL. A legitimate canonical-key expression (regexp_extract, regexp_replace,
+# concat, substring, lower, upper, trim, coalesce, ||, string literals) needs
+# none of these. Matched case-insensitively as whole words.
+_EXPR_FORBIDDEN_WORDS = frozenset(
+    {
+        "select",
+        "from",
+        "where",
+        "join",
+        "union",
+        "intersect",
+        "except",
+        "insert",
+        "update",
+        "delete",
+        "drop",
+        "alter",
+        "create",
+        "grant",
+        "revoke",
+        "table",
+        "into",
+        "exec",
+        "execute",
+        "call",
+        "merge",
+        "values",
+        "having",
+        "group",
+        "order",
+        "limit",
+    }
+)
+_EXPR_WORD_RE = re.compile(r"[A-Za-z_][A-Za-z0-9_]*")
+
+
+def _validate_safe_expression(expr: str, *, role: str) -> Optional[str]:
+    """Check that ``expr`` is a safe scalar SQL expression.
+
+    Returns ``None`` when the expression is acceptable, otherwise an error
+    message naming ``role``. The expression is later interpolated into SQL
+    with an f-string, so three things are refused: empty/non-string input,
+    statement terminators or comment markers (``;``, ``--``, ``/*``, ``*/``),
+    and any whole word listed in ``_EXPR_FORBIDDEN_WORDS`` (case-insensitive)
+    — those would turn the scalar into a clause, subquery or DDL. Parentheses,
+    commas, quotes and operators stay allowed, which is what separates this
+    check from :func:`_validate_identifier`, e.g.
+    ``concat(regexp_extract(delivery_id, '...', 1), '-del')`` passes.
+    """
+    if not (isinstance(expr, str) and expr.strip()):
+        return f"invalid {role}: must be a non-empty string"
+
+    if any(marker in expr for marker in (";", "--", "/*", "*/")):
+        return (
+            f"invalid {role}: must not contain ';' or SQL comment markers "
+            f"(got {expr!r})"
+        )
+
+    # Whole-word scan, lower-cased so keyword case does not matter.
+    words = {token.lower() for token in _EXPR_WORD_RE.findall(expr)}
+    forbidden = sorted(words & _EXPR_FORBIDDEN_WORDS)
+    if not forbidden:
+        return None
+    return (
+        f"invalid {role}: a canonical-key expression must be a single scalar "
+        f"expression, not a clause/subquery. Forbidden keyword(s): "
+        f"{', '.join(forbidden)} (got {expr!r})"
+    )
+
+
+def _validate_identifier(name: str, *, role: str) -> Optional[str]:
+    """Check that ``name`` is safe to interpolate into SQL as an identifier.
+
+    Returns ``None`` for a string made only of the characters allowed by
+    ``_IDENTIFIER_RE``; any other value (including non-strings) yields an
+    error message naming ``role``. This stops a hallucinated identifier such
+    as ``t; DROP TABLE x`` from ever reaching an f-string-built query.
+    """
+    if isinstance(name, str) and _IDENTIFIER_RE.fullmatch(name):
+        return None
+    return f"invalid {role}: {name!r}"
+
+
+def _run_query(
+    ctx: ToolContext,
+    sql: str,
+    *,
+    tool_name: str,
+) -> Tuple[Optional[List[Dict[str, Any]]], Optional[str]]:
+    """Run ``sql`` via ``ctx.client.execute_query``. Returns ``(result, None)``
+    on success, or ``(None, str(exc))`` once the failure, its traceback and
+    the offending SQL have been logged at ERROR level.
+    """
+    try:
+        rows = ctx.client.execute_query(sql)
+    except Exception as exc:
+        logger.error(
+            "%s: query failed: %s\nSQL: %s", tool_name, exc, sql, exc_info=True
+        )
+        return None, str(exc)
+    return rows, None
+
+
+# =====================================================
+# Tool implementations
+# =====================================================
+
+
+def tool_sample_table(
+    ctx: ToolContext, *, full_name: str = "", n: Any = _SAMPLE_TABLE_DEFAULT_N, **_kwargs
+) -> str:
+    """Return N random sample rows from ``full_name`` so the agent can see
+    real values (not just column types). ``n`` is clamped to 1..100; a value
+    that cannot be converted to an int yields a JSON error instead.
+    """
+    logger.info("tool_sample_table: full_name=%s, n=%s", full_name, n)
+    if not full_name:
+        return json.dumps({"success": False, "error": "full_name is required"})
+
+    err = _validate_identifier(full_name, role="full_name")
+    if err is not None:
+        return json.dumps({"success": False, "error": err})
+
+    # Strict ``n`` parsing: a malformed value is a tool-call error, not a
+    # silent fallback. OverflowError covers an infinite float, which
+    # ``json.loads`` produces for inputs such as ``1e999``.
+    try:
+        n_int = int(n)
+    except (TypeError, ValueError, OverflowError):
+        return json.dumps({"success": False, "error": f"invalid n: {n!r}"})
+    capped_n = max(1, min(n_int, _SAMPLE_TABLE_MAX_N))
+
+    sql = f"SELECT * FROM {full_name} ORDER BY RAND() LIMIT {capped_n}"
+    logger.debug("tool_sample_table: SQL=%s", sql)
+
+    rows, err = _run_query(ctx, sql, tool_name="tool_sample_table")
+    if err is not None:
+        return json.dumps({"success": False, "error": err})
+
+    rows = rows or []
+    columns: List[str] = list(rows[0].keys()) if rows else []
+    stringified_rows: List[List[Optional[str]]] = [
+        [str(row[c]) if row.get(c) is not None else None for c in columns]
+        for row in rows
+    ]
+    logger.info(
+        "tool_sample_table: %d row(s) × %d column(s)",
+        len(stringified_rows),
+        len(columns),
+    )
+    return json.dumps(
+        {
+            "success": True,
+            "columns": columns,
+            "rows": stringified_rows,
+            "row_count": len(stringified_rows),
+        }
+    )
+
+
+def tool_column_value_overlap(
+    ctx: ToolContext,
+    *,
+    from_table: str = "",
+    from_column: str = "",
+    to_table: str = "",
+    to_column: str = "",
+    **_kwargs,
+) -> str:
+    """Compute the one-sided overlap
+    ``|distinct(from) ∩ distinct(to)| / |distinct(from)|`` as a JSON string.
+
+    NULLs are excluded and both sides deduplicated. ``overlap_pct`` is a
+    fraction in [0, 1]; it is 0.0 (with a ``note``) when ``from`` is empty.
+    """
+    logger.info(
+        "tool_column_value_overlap: %s.%s ↔ %s.%s",
+        from_table,
+        from_column,
+        to_table,
+        to_column,
+    )
+    if not (from_table and from_column and to_table and to_column):
+        return json.dumps(
+            {
+                "success": False,
+                "error": "from_table, from_column, to_table, to_column are all required",
+            }
+        )
+
+    for value, role in (
+        (from_table, "from_table"),
+        (from_column, "from_column"),
+        (to_table, "to_table"),
+        (to_column, "to_column"),
+    ):
+        err = _validate_identifier(value, role=role)
+        if err is not None:
+            return json.dumps({"success": False, "error": err})
+
+    sql = (
+        "WITH from_distinct AS ("
+        f"  SELECT DISTINCT {from_column} AS v FROM {from_table} "
+        f"  WHERE {from_column} IS NOT NULL"
+        "),"
+        " to_distinct AS ("
+        f"  SELECT DISTINCT {to_column} AS v FROM {to_table} "
+        f"  WHERE {to_column} IS NOT NULL"
+        "),"
+        " inter AS ("
+        "  SELECT v FROM from_distinct INTERSECT SELECT v FROM to_distinct"
+        ") "
+        "SELECT (SELECT COUNT(*) FROM from_distinct) AS from_distinct_count, "
+        "       (SELECT COUNT(*) FROM to_distinct)   AS to_distinct_count, "
+        "       (SELECT COUNT(*) FROM inter)         AS intersection_count"
+    )
+    logger.debug("tool_column_value_overlap: SQL=%s", sql)
+
+    rows, err = _run_query(ctx, sql, tool_name="tool_column_value_overlap")
+    if err is not None:
+        return json.dumps({"success": False, "error": err})
+    if not rows:
+        return json.dumps(
+            {"success": False, "error": "overlap query returned no rows"}
+        )
+
+    row = rows[0]
+    from_distinct = int(row.get("from_distinct_count", 0) or 0)
+    to_distinct = int(row.get("to_distinct_count", 0) or 0)
+    intersection = int(row.get("intersection_count", 0) or 0)
+
+    if from_distinct == 0:
+        result: Dict[str, Any] = {
+            "success": True,
+            "overlap_pct": 0.0,
+            "from_distinct_count": 0,
+            "to_distinct_count": to_distinct,
+            "intersection_count": 0,
+            "note": (
+                f"{from_table}.{from_column} has zero distinct non-null values; "
+                "overlap_pct defaulted to 0.0 (no division by zero)."
+            ),
+        }
+    else:
+        result = {
+            "success": True,
+            "overlap_pct": intersection / from_distinct,
+            "from_distinct_count": from_distinct,
+            "to_distinct_count": to_distinct,
+            "intersection_count": intersection,
+            # Symmetric shape with the zero-denom branch: downstream consumers
+            # can read ``note`` unconditionally.
+            "note": "",
+        }
+    logger.info(
+        "tool_column_value_overlap: overlap_pct=%.4f (%d/%d)",
+        result["overlap_pct"],
+        intersection,
+        from_distinct,
+    )
+    return json.dumps(result)
+
+def tool_normalized_value_overlap(
+    ctx: ToolContext,
+    *,
+    from_table: str = "",
+    from_expr: str = "",
+    to_table: str = "",
+    to_expr: str = "",
+    **_kwargs,
+) -> str:
+    """Like :func:`tool_column_value_overlap`, but each side is an arbitrary
+    scalar SQL *expression* rather than a bare column.
+
+    The Planner uses this to PROVE a canonical-key normalization before
+    committing it: when two tables mapped to the same ontology class show 0%
+    raw-column overlap, it proposes one expression per table (e.g.
+    ``regexp_extract(EPISODE_ID, '([a-f0-9][a-f0-9-]+-preg-\\d+)', 1)``) and
+    checks they land in a common value space (``overlap_pct > 0``). NULL and
+    empty-string results are ignored; a still-zero overlap means "revise it".
+    """
+    logger.info(
+        "tool_normalized_value_overlap: %s[%s] ↔ %s[%s]",
+        from_table,
+        from_expr,
+        to_table,
+        to_expr,
+    )
+    if not (from_table and from_expr and to_table and to_expr):
+        return json.dumps(
+            {
+                "success": False,
+                "error": "from_table, from_expr, to_table, to_expr are all required",
+            }
+        )
+
+    for value, role in ((from_table, "from_table"), (to_table, "to_table")):
+        err = _validate_identifier(value, role=role)
+        if err is not None:
+            return json.dumps({"success": False, "error": err})
+    for value, role in ((from_expr, "from_expr"), (to_expr, "to_expr")):
+        err = _validate_safe_expression(value, role=role)
+        if err is not None:
+            return json.dumps({"success": False, "error": err})
+
+    # CAST(... AS STRING) before comparing with '': for a numeric expression,
+    # Spark SQL would coerce '' to a number (NULL / ANSI cast error) instead.
+    sql = (
+        "WITH from_distinct AS ("
+        f"  SELECT DISTINCT {from_expr} AS v FROM {from_table} "
+        f"  WHERE {from_expr} IS NOT NULL AND CAST({from_expr} AS STRING) <> ''"
+        "),"
+        " to_distinct AS ("
+        f"  SELECT DISTINCT {to_expr} AS v FROM {to_table} "
+        f"  WHERE {to_expr} IS NOT NULL AND CAST({to_expr} AS STRING) <> ''"
+        "),"
+        " inter AS ("
+        "  SELECT v FROM from_distinct INTERSECT SELECT v FROM to_distinct"
+        ") "
+        "SELECT (SELECT COUNT(*) FROM from_distinct) AS from_distinct_count, "
+        "       (SELECT COUNT(*) FROM to_distinct)   AS to_distinct_count, "
+        "       (SELECT COUNT(*) FROM inter)         AS intersection_count"
+    )
+    logger.debug("tool_normalized_value_overlap: SQL=%s", sql)
+
+    rows, err = _run_query(ctx, sql, tool_name="tool_normalized_value_overlap")
+    if err is not None:
+        return json.dumps({"success": False, "error": err})
+    if not rows:
+        return json.dumps(
+            {"success": False, "error": "overlap query returned no rows"}
+        )
+
+    row = rows[0]
+    from_distinct = int(row.get("from_distinct_count", 0) or 0)
+    to_distinct = int(row.get("to_distinct_count", 0) or 0)
+    intersection = int(row.get("intersection_count", 0) or 0)
+
+    if from_distinct == 0:
+        result: Dict[str, Any] = {
+            "success": True,
+            "overlap_pct": 0.0,
+            "from_distinct_count": 0,
+            "to_distinct_count": to_distinct,
+            "intersection_count": 0,
+            "note": (
+                f"{from_expr} over {from_table} produced zero distinct non-empty "
+                "values; the expression likely does not match the data — revise it."
+            ),
+        }
+    else:
+        result = {
+            "success": True,
+            "overlap_pct": intersection / from_distinct,
+            "from_distinct_count": from_distinct,
+            "to_distinct_count": to_distinct,
+            "intersection_count": intersection,
+            "note": "",
+        }
+    logger.info(
+        "tool_normalized_value_overlap: overlap_pct=%.4f (%d/%d)",
+        result["overlap_pct"],
+        intersection,
+        from_distinct,
+    )
+    return json.dumps(result)
+
+def tool_distinct_count(
+    ctx: ToolContext, *, full_name: str = "", column: str = "", **_kwargs
+) -> str:
+    """Report row / distinct / null counts for ``full_name.column`` as a JSON
+    string, plus derived ``is_unique`` and ``is_complete`` flags.
+
+    * ``is_unique = distinct_count == row_count - null_count`` — i.e. the
+      non-null subset has no duplicates (an empty table also reports True).
+    * ``is_complete = null_count == 0`` — no missing values.
+    """
+    logger.info("tool_distinct_count: %s.%s", full_name, column)
+    if not (full_name and column):
+        return json.dumps(
+            {"success": False, "error": "full_name and column are required"}
+        )
+
+    for value, role in ((full_name, "full_name"), (column, "column")):
+        err = _validate_identifier(value, role=role)
+        if err is not None:
+            return json.dumps({"success": False, "error": err})
+
+    sql = (
+        f"SELECT COUNT(*) AS row_count, "
+        f"       COUNT(DISTINCT {column}) AS distinct_count, "
+        f"       COUNT(*) - COUNT({column}) AS null_count "
+        f"FROM {full_name}"
+    )
+    logger.debug("tool_distinct_count: SQL=%s", sql)
+
+    rows, err = _run_query(ctx, sql, tool_name="tool_distinct_count")
+    if err is not None:
+        return json.dumps({"success": False, "error": err})
+    if not rows:
+        return json.dumps(
+            {"success": False, "error": "distinct_count query returned no rows"}
+        )
+
+    row = rows[0]
+    row_count = int(row.get("row_count", 0) or 0)
+    distinct_count = int(row.get("distinct_count", 0) or 0)
+    null_count = int(row.get("null_count", 0) or 0)
+    non_null_rows = row_count - null_count
+
+    result = {
+        "success": True,
+        "row_count": row_count,
+        "distinct_count": distinct_count,
+        "null_count": null_count,
+        "is_unique": distinct_count == non_null_rows,
+        "is_complete": null_count == 0,
+    }
+    logger.info(
+        "tool_distinct_count: rows=%d, distinct=%d, nulls=%d, unique=%s, complete=%s",
+        row_count,
+        distinct_count,
+        null_count,
+        result["is_unique"],
+        result["is_complete"],
+    )
+    return json.dumps(result)
+
+
+def tool_submit_source_model(
+    ctx: ToolContext, *, model: Optional[dict] = None, **_kwargs
+) -> str:
+    """Terminal Planner tool: validate ``model`` via ``SourceModel.from_dict``
+    and, on success, stash the result on ``ctx.source_model``.
+
+    Returns JSON: ``{"success": true, "summary": {...}}`` with per-section
+    counts, or ``{"success": false, "error": ...}`` leaving ``ctx`` untouched.
+    Only structure is validated; semantic checks are the orchestrator's job.
+    """
+    # Local import to keep ``agents.tools`` importable without
+    # ``agents.agent_mapping_pge`` (avoids circular imports during pkg init).
+    from agents.agent_mapping_pge.contracts import SourceModel
+
+    logger.info("tool_submit_source_model: validating candidate model")
+    if model is None or not isinstance(model, dict):
+        return json.dumps(
+            {"success": False, "error": "model must be a JSON object"}
+        )
+
+    try:
+        source_model = SourceModel.from_dict(model)
+    except (KeyError, TypeError, ValueError) as exc:
+        # ``KeyError`` for missing required fields; ``TypeError`` / ``ValueError``
+        # for bad coercions (e.g. confidence not float-parseable).
+        logger.warning(
+            "tool_submit_source_model: validation failed: %s: %s",
+            type(exc).__name__,
+            exc,
+        )
+        return json.dumps(
+            {
+                "success": False,
+                "error": f"SourceModel validation failed: {type(exc).__name__}: {exc}",
+            }
+        )
+
+    ctx.source_model = source_model
+    summary = {
+        "table_roles": len(source_model.table_roles),
+        "canonical_ids": len(source_model.canonical_ids),
+        "join_keys": len(source_model.join_keys),
+        "entity_order_len": len(source_model.mapping_plan.entity_order),
+        "relationship_order_len": len(source_model.mapping_plan.relationship_order),
+    }
+    logger.info("tool_submit_source_model: stored — %s", summary)
+    return json.dumps({"success": True, "summary": summary})
+
+
+# =====================================================
+# OpenAI function-calling definitions
+# =====================================================
+
+
+SAMPLE_TABLE_DEF: Dict[str, Any] = {
+    "type": "function",
+    "function": {
+        "name": "sample_table",
+        "description": (
+            "Return up to N random sample rows from a table so you can see actual values "
+            "(not just column types). n defaults to 20 and is capped at 100."
+        ),
+        "parameters": {
+            "type": "object",
+            "properties": {
+                "full_name": {
+                    "type": "string",
+                    "description": "Fully-qualified table name (catalog.schema.table).",
+                },
+                "n": {
+                    "type": "integer",
+                    "description": "Sample size (default 20, max 100).",
+                },
+            },
+            "required": ["full_name"],
+        },
+    },
+}
+
+
+COLUMN_VALUE_OVERLAP_DEF: dict = {
+    "type": "function",
+    "function": {
+        "name": "column_value_overlap",
+        "description": (
+            "Compute the one-sided overlap |distinct(from) ∩ distinct(to)| / |distinct(from)|. "
+            "Use this to validate a candidate join key before committing it to the SourceModel."
+        ),
+        "parameters": {
+            "type": "object",
+            "properties": {
+                "from_table": {
+                    "type": "string",
+                    "description": "Fully-qualified source table.",
+                },
+                "from_column": {
+                    "type": "string",
+                    "description": "Column on the source side (the ratio's denominator).",
+                },
+                "to_table": {
+                    "type": "string",
+                    "description": "Fully-qualified target table.",
+                },
+                "to_column": {
+                    "type": "string",
+                    "description": "Column on the target side.",
+                },
+            },
+            "required": ["from_table", "from_column", "to_table", "to_column"],
+        },
+    },
+}
+
+
+NORMALIZED_VALUE_OVERLAP_DEF: Dict[str, Any] = {
+    "type": "function",
+    "function": {
+        "name": "normalized_value_overlap",
+        "description": (
+            "Same overlap metric as column_value_overlap, but each side is a "
+            "scalar SQL EXPRESSION instead of a bare column. Use this to PROVE a "
+            "canonical-key normalization before committing it: when two tables "
+            "that map to the same ontology class have 0% raw-column overlap, "
+            "propose a normalization expression per table (e.g. "
+            "regexp_extract(EPISODE_ID, '([a-f0-9][a-f0-9-]+-preg-\\d+)', 1)) and "
+            "call this to confirm overlap_pct > 0. A still-zero result means the "
+            "expression is wrong — fix it before submit_source_model. Expressions "
+            "must be a single scalar (functions/literals/operators only); "
+            "subqueries and SQL keywords are rejected."
+        ),
+        "parameters": {
+            "type": "object",
+            "properties": {
+                "from_table": {
+                    "type": "string",
+                    "description": "Fully-qualified source table.",
+                },
+                "from_expr": {
+                    "type": "string",
+                    "description": (
+                        "Scalar SQL expression over the source table that "
+                        "produces the canonical key (e.g. a regexp_extract / "
+                        "concat). Bare column names are also accepted."
+                    ),
+                },
+                "to_table": {
+                    "type": "string",
+                    "description": "Fully-qualified target table.",
+                },
+                "to_expr": {
+                    "type": "string",
+                    "description": "Scalar SQL expression over the target table.",
+                },
+            },
+            "required": ["from_table", "from_expr", "to_table", "to_expr"],
+        },
+    },
+}
+
+
+DISTINCT_COUNT_DEF: Dict[str, Any] = {
+    "type": "function",
+    "function": {
+        "name": "distinct_count",
+        "description": (
+            "Report row_count / distinct_count / null_count for a column, with is_unique "
+            "and is_complete flags. Use this to vet a candidate canonical-ID column."
+        ),
+        "parameters": {
+            "type": "object",
+            "properties": {
+                "full_name": {
+                    "type": "string",
+                    "description": "Fully-qualified table name (catalog.schema.table).",
+                },
+                "column": {
+                    "type": "string",
+                    "description": "Column to characterise.",
+                },
+            },
+            "required": ["full_name", "column"],
+        },
+    },
+}
+
+
+SUBMIT_SOURCE_MODEL_DEF: dict = {
+    "type": "function",
+    "function": {
+        "name": "submit_source_model",
+        "description": (
+            "Terminal Planner tool. Submit the final SourceModel JSON (matching "
+            "SourceModel.to_dict() shape). Validates the structure and stores the "
+            "dataclass on the ToolContext for the Generator stage to consume."
+        ),
+        "parameters": {
+            "type": "object",
+            "properties": {
+                "model": {
+                    "type": "object",
+                    "description": (
+                        "SourceModel as a JSON object (not a JSON-encoded string) with "
+                        "table_roles, canonical_ids, join_keys, and mapping_plan."
+                    ),
+                }
+            },
+            "required": ["model"],
+        },
+    },
+}
+
+
+# =====================================================
+# Aggregate exports
+# =====================================================
+
+
+PLANNER_TOOL_DEFINITIONS: List[Dict[str, Any]] = [
+    SAMPLE_TABLE_DEF,
+    COLUMN_VALUE_OVERLAP_DEF,
+    NORMALIZED_VALUE_OVERLAP_DEF,
+    DISTINCT_COUNT_DEF,
+    SUBMIT_SOURCE_MODEL_DEF,
+]
+
+
+PLANNER_TOOL_HANDLERS: Dict[str, Callable[..., str]] = {
+    "sample_table": tool_sample_table,
+    "column_value_overlap": tool_column_value_overlap,
+    "normalized_value_overlap": tool_normalized_value_overlap,
+    "distinct_count": tool_distinct_count,
+    "submit_source_model": tool_submit_source_model,
+}
diff --git a/src/api/routers/internal/dtwin.py b/src/api/routers/internal/dtwin.py
index 65a78cdd..3256fe88 100644
--- a/src/api/routers/internal/dtwin.py
+++ b/src/api/routers/internal/dtwin.py
@@ -1458,10 +1458,34 @@ def _is_ready(ep: dict) -> bool:
         state = (ep.get("state") or "").upper()
         return state in ("READY", "TRUE", "UP")
 
+    def _is_tool_incompatible(name: str) -> bool:
+        """Return True if ``name`` contains (case-insensitively) a marker of a
+        reasoning-first model. Such models reject function tools on the
+        standard /v1/chat/completions path (they require /v1/responses), which
+        breaks Graph Chat, a tool-calling agent — so auto-discovery skips
+        them. e.g. ``databricks-gpt-5-5`` returns HTTP 400 "Function tools
+        with reasoning_effort are not supported ... use /v1/responses instead".
+        """
+        n = (name or "").lower()
+        markers = ("gpt-5", "gpt5", "-o1", "-o3", "-o4-", "reasoning")
+        return any(m in n for m in markers)
+
+    # Preferred: a tool-capable Databricks foundation model.
     for ep in endpoints:
         name = ep.get("name") or ""
-        if name.startswith("databricks-") and _is_ready(ep):
+        if (
+            name.startswith("databricks-")
+            and _is_ready(ep)
+            and not _is_tool_incompatible(name)
+        ):
             return name
+    # Next: any ready endpoint that isn't a known tool-incompatible model.
+    for ep in endpoints:
+        name = ep.get("name") or ""
+        if name and _is_ready(ep) and not _is_tool_incompatible(name):
+            return name
+    # Last resort: a ready endpoint even if it may be tool-incompatible
+    # (better to try than to return nothing).
     for ep in endpoints:
         if _is_ready(ep) and ep.get("name"):
             return ep["name"]
@@ -1724,8 +1748,20 @@ async def dtwin_assistant_chat_stream(
     event_queue: asyncio.Queue = asyncio.Queue()
 
     def _on_event(step: AgentStep) -> None:
-        """Forward an AgentStep from the sync thread to the async generator."""
-        asyncio.run_coroutine_threadsafe(event_queue.put(step), loop).result(timeout=10)
+        """Forward an AgentStep from the sync thread to the async generator.
+
+        Best-effort: steps only drive the live progress UI (the final reply
+        arrives via the ``done`` event), so a slow consumer must NOT raise and
+        crash the agent turn — on failure the step is cancelled and dropped.
+        """
+        future = None
+        try:
+            future = asyncio.run_coroutine_threadsafe(event_queue.put(step), loop)
+            future.result(timeout=10)
+        except Exception as exc:  # noqa: BLE001 — progress delivery is non-critical
+            if future is not None:
+                future.cancel()  # otherwise the "dropped" step is enqueued late
+            logger.debug("GraphChat/stream: dropped progress event: %s", exc)
 
     async def _run_agent_task() -> None:
         try:
@@ -1986,6 +2022,7 @@ async def dtwin_triples_find(
     depth: int = 1,
     limit: int = 1000,
     offset: int = 0,
+    seed_limit: int = 0,
     session_mgr: SessionManager = Depends(get_session_manager),
     settings: Settings = Depends(get_settings),
 ):
@@ -2004,6 +2041,10 @@ async def dtwin_triples_find(
     depth = max(1, min(int(depth or 1), 10))
     limit = max(1, min(int(limit or 1000), 10000))
     offset = max(0, int(offset or 0))
+    # 0 = unbounded (back-compat for callers that paginate over all matches);
+    # >0 caps BFS seeds so a broad search ("mother") can't seed hundreds of
+    # subjects and blow up the recursive traversal.
+    seed_limit = max(0, min(int(seed_limit or 0), 1000))
 
     domain = get_domain(session_mgr)
     table = effective_graph_name(domain)
@@ -2044,6 +2085,7 @@ async def dtwin_triples_find(
             depth,
             search=search or "",
             entity_type=entity_type or "",
+            seed_limit=seed_limit,
         )
 
         if not bfs_rows:
diff --git a/src/api/routers/internal/ontology.py b/src/api/routers/internal/ontology.py
index 2e6cf990..0785856a 100644
--- a/src/api/routers/internal/ontology.py
+++ b/src/api/routers/internal/ontology.py
@@ -240,11 +240,19 @@ async def export_owl(session_mgr: SessionManager = Depends(get_session_manager))
 async def get_loaded_ontology(
     session_mgr: SessionManager = Depends(get_session_manager),
 ):
-    """Get currently loaded ontology from session."""
+    """Get currently loaded ontology from session.
+
+    Returns ``success: false`` (HTTP 200) when no ontology is loaded rather
+    than a 404 — "no ontology yet" is a normal empty state on pages that load
+    before a domain is selected (e.g. landing on Digital Twin / Graph Chat in
+    a fresh session). All callers branch on ``data.success``, so a 200
+    empty-state is handled identically while avoiding noisy 404s in the
+    browser console and server logs.
+    """
     domain = get_domain(session_mgr)
     if domain.get_classes():
         return {"success": True, "ontology": domain.ontology}
-    raise NotFoundError("No ontology loaded")
+    return {"success": False, "ontology": None, "message": "No ontology loaded"}
 
 
 @router.post("/parse-owl")
@@ -1671,6 +1679,8 @@ def on_step(msg: str):
 
             tm.advance_step(task.id, "Finalizing…")
 
+            # Upstream's per-iteration generation score (from the agent's
+            # pitfall-tool quality loop).
             iteration_summary = agent_result.iteration_summary or []
             final_score = (
                 iteration_summary[-1]["score"] if iteration_summary else None
@@ -1680,6 +1690,27 @@ def on_step(msg: str):
                 and iteration_summary[-1]["status"] in ("passed", "max_rounds_reached")
             )
 
+            # Run the PGE intrinsic evaluator in-app (deterministic, no extra
+            # LLM) — a complementary structural scorecard + GREEN/RED verdict.
+            # Never breaks generation: the import + call are guarded so even an
+            # import-time failure can't fail an already-good run.
+            scorecard = None
+            try:
+                from agents.pge_eval.inapp import score_generated_ontology
+
+                scorecard = score_generated_ontology(owl_content, metadata)
+            except Exception as score_exc:  # noqa: BLE001
+                logger.warning("Wizard: in-app scoring unavailable: %s", score_exc)
+
+            message = (
+                f"Generated {stats.get('classes', 0)} classes, "
+                f"{stats.get('properties', 0)} properties "
+                f"({agent_result.iterations} agent iterations)"
+                + (f" — quality score {final_score}/100" if final_score is not None else "")
+            )
+            if scorecard:
+                message += f" · PGE {scorecard['verdict']}"
+
             tm.complete_task(
                 task.id,
                 result={
@@ -1691,13 +1722,9 @@ def on_step(msg: str):
                     "iteration_summary": iteration_summary,
                     "generation_score": final_score,
                     "generation_converged": converged,
+                    "pge_scorecard": scorecard,
                 },
-                message=(
-                    f"Generated {stats.get('classes', 0)} classes, "
-                    f"{stats.get('properties', 0)} properties "
-                    f"({agent_result.iterations} agent iterations)"
-                    + (f" — quality score {final_score}/100" if final_score is not None else "")
-                ),
+                message=message,
             )
 
         except Exception as e:
diff --git a/src/back/core/agents/AgentClient.py b/src/back/core/agents/AgentClient.py
index 5d7ab263..de8a63a3 100644
--- a/src/back/core/agents/AgentClient.py
+++ b/src/back/core/agents/AgentClient.py
@@ -11,7 +11,7 @@
 
 if TYPE_CHECKING:
     from agents.agent_owl_generator.engine import AgentResult
-    from agents.agent_auto_assignment.engine import AgentResult as AutoAssignAgentResult
+    from agents.agent_mapping_pge.engine import AgentResult as AutoAssignAgentResult
     from agents.agent_auto_icon_assign.engine import (
         AgentResult as IconAssignAgentResult,
     )
@@ -109,13 +109,13 @@ def run_auto_assignment(
             max_iterations: Upper bound on agent refinement iterations.
 
         Returns:
-            Structured result from ``agents.agent_auto_assignment`` describing
+            Structured result from ``agents.agent_mapping_pge`` describing
             proposed mappings and per-item status.
 
         Raises:
             Exception: Propagates any failure raised by ``run_agent``.
         """
-        from agents.agent_auto_assignment import run_agent
+        from agents.agent_mapping_pge import run_agent
 
         return run_agent(
             host=host,
diff --git a/src/back/core/databricks/DatabricksAuth.py b/src/back/core/databricks/DatabricksAuth.py
index 2f3e35df..cb568275 100644
--- a/src/back/core/databricks/DatabricksAuth.py
+++ b/src/back/core/databricks/DatabricksAuth.py
@@ -258,7 +258,25 @@ def can_use_cloud_fetch(self) -> bool:
         return True
 
     def probe_cloud_fetch_capability(self) -> Tuple[bool, str]:
-        """Issue a tiny ``SELECT 1`` with ``use_cloud_fetch=True`` and cache the outcome.
+        """Probe whether the runtime can actually download CloudFetch result
+        blobs from the storage host, and cache the outcome.
+
+        Two-stage probe so a blocked-egress Apps sandbox is caught quickly
+        without burning 40 MB of bandwidth on every cache miss:
+
+        1. **TCP reachability** to known AWS CloudFetch storage hosts. The
+           Databricks Apps egress firewall blocks the whole
+           ``*.storage.cloud.databricks.com`` family at L3/L4, so a plain
+           TCP connect with a short timeout returns connection-refused
+           almost instantly. This is the fast, accurate path on AWS.
+
+        2. **SQL load-test** as a backstop: ``SELECT id FROM range(N)`` with
+           N large enough that the warehouse returns presigned-URL
+           CloudFetch links instead of inline Thrift rows. Has to be on
+           the order of millions of BIGINTs to clear the typical 10-20 MB
+           inline threshold — smaller queries get returned inline and the
+           probe never touches storage at all (this was the
+           original-probe bug, where ``SELECT 1`` always reported "ok").
 
         Returns ``(capable, reason)``. The result is cached at the class
         level for ``_CLOUD_FETCH_PROBE_TTL_SECONDS`` so subsequent SQL
@@ -276,6 +294,14 @@ def probe_cloud_fetch_capability(self) -> Tuple[bool, str]:
             self._record_cloud_fetch(False, prereq_msg)
             return False, prereq_msg
 
+        # ── Stage 1: direct TCP egress check ────────────────────────────
+        tcp_ok, tcp_reason = self._probe_cloud_fetch_storage_egress()
+        if not tcp_ok:
+            self._record_cloud_fetch(False, tcp_reason)
+            logger.info("CloudFetch probe: not capable (%s)", tcp_reason)
+            return False, tcp_reason
+
+        # ── Stage 2: SQL load-test (large enough to force CloudFetch) ──
         try:
             from databricks import sql
 
@@ -292,20 +318,66 @@ def probe_cloud_fetch_capability(self) -> Tuple[bool, str]:
             elif self.token:
                 probe_params["access_token"] = self.token
 
+            # 5M BIGINTs ≈ 40 MB raw, ~10-20 MB Arrow-compressed — over the
+            # typical warehouse inline threshold, so the warehouse returns
+            # CloudFetch presigned URLs which the connector downloads
+            # during ``fetchmany``. A blocked storage host raises there.
+            probe_sql = "SELECT id FROM range(5000000)"
             with sql.connect(**probe_params) as conn:
                 with conn.cursor() as cur:
-                    cur.execute("SELECT 1")
-                    cur.fetchall()
-            msg = "Probe SELECT 1 succeeded with use_cloud_fetch=True"
+                    cur.execute(probe_sql)
+                    cur.fetchmany(1)
+            msg = (
+                "Probe SELECT id FROM range(5000000) succeeded "
+                "with use_cloud_fetch=True (TCP egress + CloudFetch reachable)"
+            )
             self._record_cloud_fetch(True, msg)
             logger.info("CloudFetch probe: capable (%s)", msg)
             return True, msg
         except Exception as exc:  # noqa: BLE001 - vendor/network surface
-            msg = f"Probe SELECT 1 failed with use_cloud_fetch=True: {exc}"
+            msg = (
+                "Probe SELECT id FROM range(5000000) failed with "
+                f"use_cloud_fetch=True: {exc}"
+            )
             self._record_cloud_fetch(False, msg)
             logger.info("CloudFetch probe: not capable (%s)", msg)
             return False, msg
 
+    # AWS CloudFetch presigned-URL storage hosts, TCP-probed on port 443 by
+    # ``_probe_cloud_fetch_storage_egress``. Databricks Apps blocks the whole
+    # family at the L3/L4 egress firewall, so a short-timeout connect fails
+    # almost instantly there. Two common regions are listed; one blocked host
+    # means egress is treated as blocked (Apps doesn't allow regions selectively).
+    _CLOUD_FETCH_STORAGE_HOSTS = (
+        "us-east-1.storage.cloud.databricks.com",
+        "us-west-2.storage.cloud.databricks.com",
+    )
+
+    def _probe_cloud_fetch_storage_egress(self) -> Tuple[bool, str]:
+        """Check raw TCP reachability of the AWS CloudFetch storage hosts.
+
+        Returns ``(False, reason)`` at the first host that cannot be reached
+        and ``(True, msg)`` once all hosts connect (or the check is skipped).
+        """
+        import socket
+
+        # The host list is AWS-specific, so any workspace whose URL lacks
+        # the AWS domain skips this stage and relies on the SQL load-test.
+        if "cloud.databricks.com" not in self.host:
+            return True, "Workspace is not AWS — skipping TCP egress probe"
+
+        for storage_host in self._CLOUD_FETCH_STORAGE_HOSTS:
+            try:
+                # Open and immediately close: only reachability matters.
+                socket.create_connection((storage_host, 443), timeout=3).close()
+            except (OSError, socket.timeout) as err:
+                reason = (
+                    f"CloudFetch storage host {storage_host} unreachable "
+                    f"(TCP egress blocked): {err}"
+                )
+                return False, reason
+        return True, "TCP egress to CloudFetch storage hosts is reachable"
+
     def _record_cloud_fetch(self, capable: bool, reason: str) -> None:
         DatabricksAuth._cloud_fetch_cache[(self.host, self.warehouse_id)] = (
             capable,
diff --git a/src/back/core/triplestore/TripleStoreBackend.py b/src/back/core/triplestore/TripleStoreBackend.py
index 2d6e9a26..c60bc1c9 100644
--- a/src/back/core/triplestore/TripleStoreBackend.py
+++ b/src/back/core/triplestore/TripleStoreBackend.py
@@ -314,6 +314,7 @@ def bfs_traversal(
         depth: int,
         search: str = "",
         entity_type: str = "",
+        seed_limit: int = 0,
     ) -> List[Dict[str, Any]]:
         """BFS traversal from seed entities.
 
@@ -323,6 +324,14 @@ def bfs_traversal(
         *search* and *entity_type* are structured parameters for future
         non-SQL backends (Cypher, Gremlin) that cannot use raw SQL fragments.
 
+        *seed_limit* (when > 0) caps the number of seed entities the BFS
+        starts from. A broad search (e.g. "mother" matching every Mother)
+        otherwise seeds hundreds of subjects and the recursive OR-join
+        expansion over the whole graph becomes very expensive. Capping seeds
+        bounds the entire traversal+fetch pipeline — ideal for "describe a few
+        matching entities" (the Graph Chat agent's use), which never needs all
+        matches at once.
+
         Returns rows with ``entity`` and ``min_lvl`` columns.
         """
         edge_filters = (
@@ -332,9 +341,10 @@ def bfs_traversal(
             f"AND t.predicate != '{RDFS_LABEL}' "
             f"AND (t.object LIKE 'http://%' OR t.object LIKE 'https://%')"
         )
+        seed_cap = f" LIMIT {int(seed_limit)}" if seed_limit and seed_limit > 0 else ""
         sql = (
             f"WITH RECURSIVE seeds AS (\n"
-            f"  SELECT DISTINCT subject AS entity FROM {self._sql_relation(table_name)}{seed_where}\n"
+            f"  SELECT DISTINCT subject AS entity FROM {self._sql_relation(table_name)}{seed_where}{seed_cap}\n"
             f"), bfs(entity, lvl) AS (\n"
             f"  SELECT entity, 0 FROM seeds\n"
             f"  UNION ALL\n"
diff --git a/src/back/objects/mapping/Mapping.py b/src/back/objects/mapping/Mapping.py
index 61ebdf76..fc7ccbb5 100644
--- a/src/back/objects/mapping/Mapping.py
+++ b/src/back/objects/mapping/Mapping.py
@@ -27,7 +27,7 @@
 _MAX_DOC_CHARS = 50_000
 
 if TYPE_CHECKING:
-    from agents.agent_auto_assignment.engine import AgentResult as AutoAssignAgentResult
+    from agents.agent_mapping_pge.engine import AgentResult as AutoAssignAgentResult
 
 SINGLE_ITEM_MAX_ITERATIONS = 15
 
@@ -78,13 +78,18 @@ def auto_assign_with_agent(
         on_step: Optional[Callable[[str, int], None]] = None,
         max_iterations: Optional[int] = None,
     ) -> "AutoAssignAgentResult":
-        """Run ``agent_auto_assignment`` (blocking).
+        """Run the mapping-PGE agent (``agent_mapping_pge``) — blocking.
+
+        Returns an :class:`AgentResult` with the standard ``entity_mappings``
+        and ``relationship_mappings`` plus three PGE-specific extras
+        (``source_model``, ``mapping_evaluations``, ``mapping_run_log``) that
+        the caller can persist on the session.
 
         ``client`` is typically a :class:`~back.core.databricks.DatabricksClient`
         built with the domain warehouse. Call from a background thread when
         started from HTTP.
         """
-        from agents.agent_auto_assignment import run_agent
+        from agents.agent_mapping_pge import run_agent
 
         return run_agent(
             host=host,
@@ -165,6 +170,12 @@ def run_auto_assign_task(
             total_iterations = 0
             total_usage = {"prompt_tokens": 0, "completion_tokens": 0}
             chunk_errors: List[str] = []
+            # PGE-specific extras accumulated across chunks. Each chunk
+            # re-plans, so ``last_source_model`` reflects the most recent
+            # plan; per-item evaluations / run logs concatenate cleanly.
+            last_source_model: Optional[Dict[str, Any]] = None
+            merged_mapping_evaluations: Dict[str, Any] = {}
+            merged_mapping_run_log: List[Any] = []
 
             for chunk_idx, chunk in enumerate(chunks):
                 chunk_num = chunk_idx + 1
@@ -261,6 +272,19 @@ def on_step(msg: str, progress_pct: int = 0) -> None:
                 for k in total_usage:
                     total_usage[k] += agent_result.usage.get(k, 0)
 
+                # PGE extras — accumulate. The new engine returns these as
+                # dicts/lists (drop-in compatible). The legacy engine omitted
+                # them; ``getattr`` with defaults keeps us tolerant.
+                chunk_source_model = getattr(agent_result, "source_model", None)
+                if chunk_source_model:
+                    last_source_model = chunk_source_model
+                chunk_evals = getattr(agent_result, "mapping_evaluations", None) or {}
+                if chunk_evals:
+                    merged_mapping_evaluations.update(chunk_evals)
+                chunk_run_log = getattr(agent_result, "mapping_run_log", None) or []
+                if chunk_run_log:
+                    merged_mapping_run_log.extend(chunk_run_log)
+
                 e_done = len(entity_mapping_by_uri)
                 r_done = len(rel_mapping_by_uri)
 
@@ -345,12 +369,37 @@ def on_step(msg: str, progress_pct: int = 0) -> None:
                 all_relationship_mappings,
                 existing_entity_mappings=entity_mappings,
                 existing_relationship_mappings=relationship_mappings,
+                source_model=last_source_model,
+                mapping_evaluations=merged_mapping_evaluations or None,
+                mapping_run_log=merged_mapping_run_log or None,
             )
 
             message = f"Completed: {e_count} entities, {r_count} relationships mapped"
             if chunk_errors:
                 message += f" ({len(chunk_errors)} chunk(s) had errors)"
 
+            # Run the PGE intrinsic evaluator in-app on the completed mapping
+            # run (deterministic — re-uses the captured per-item evaluations,
+            # no extra LLM). Never breaks the run; the import + call are guarded
+            # so even an import-time failure can't fail an already-good run.
+            scorecard = None
+            try:
+                from agents.pge_eval.inapp import score_mapping_run
+
+                scorecard = score_mapping_run(
+                    ontology={"entities": entities, "relationships": relationships},
+                    metadata=schema_context,
+                    mapping_run_log=merged_mapping_run_log,
+                    mapping_evaluations=merged_mapping_evaluations,
+                    entity_mappings=all_entity_mappings,
+                    relationship_mappings=all_relationship_mappings,
+                    usage=total_usage,
+                )
+            except Exception as score_exc:  # noqa: BLE001
+                logger.warning("Auto-assign: in-app scoring unavailable: %s", score_exc)
+            if scorecard:
+                message += f" · quality {scorecard['verdict']}"
+
             tm.complete_task(
                 task.id,
                 result={
@@ -365,6 +414,15 @@ def on_step(msg: str, progress_pct: int = 0) -> None:
                     "agent_steps": serialize_agent_steps(all_steps),
                     "agent_iterations": total_iterations,
                     "agent_usage": total_usage,
+                    "pge_scorecard": scorecard,
+                    # PGE run-visualizer payload — the planner's source model,
+                    # per-item evaluator verdicts, and the attempt-by-attempt run
+                    # log. Persisted to the session already; also surfaced here so
+                    # the UI can render the planner→generator→evaluator→critic loop
+                    # from the polled task result without a second round-trip.
+                    "source_model": last_source_model,
+                    "mapping_evaluations": merged_mapping_evaluations or None,
+                    "mapping_run_log": merged_mapping_run_log or None,
                 },
                 message=message,
             )
@@ -449,6 +507,11 @@ def on_step(msg: str, progress_pct: int = 0) -> None:
                 tm.fail_task(task.id, "Agent completed but produced no mapping")
                 return
 
+            # PGE extras from this single-item run — passed through verbatim.
+            single_source_model = getattr(agent_result, "source_model", None)
+            single_evals = getattr(agent_result, "mapping_evaluations", None) or None
+            single_run_log = getattr(agent_result, "mapping_run_log", None) or None
+
             if item_type == "entity":
                 Mapping.save_mappings_to_session(
                     session_id,
@@ -456,6 +519,9 @@ def on_step(msg: str, progress_pct: int = 0) -> None:
                     agent_result.entity_mappings,
                     None,
                     existing_entity_mappings=existing_entity_mappings,
+                    source_model=single_source_model,
+                    mapping_evaluations=single_evals,
+                    mapping_run_log=single_run_log,
                 )
             else:
                 Mapping.save_mappings_to_session(
@@ -464,6 +530,9 @@ def on_step(msg: str, progress_pct: int = 0) -> None:
                     None,
                     agent_result.relationship_mappings,
                     existing_relationship_mappings=existing_relationship_mappings,
+                    source_model=single_source_model,
+                    mapping_evaluations=single_evals,
+                    mapping_run_log=single_run_log,
                 )
 
             tm.complete_task(
@@ -472,6 +541,12 @@ def on_step(msg: str, progress_pct: int = 0) -> None:
                     "item_type": item_type,
                     "mapping": mapping,
                     "iterations": agent_result.iterations,
+                    # PGE run-visualizer payload (see batch path) — surface the
+                    # single-item planner model, evaluator verdicts and run log
+                    # so the UI renders the same loop view for one-off re-maps.
+                    "source_model": single_source_model,
+                    "mapping_evaluations": single_evals,
+                    "mapping_run_log": single_run_log,
                 },
                 message=f"Assigned {item_type}: {item_name}",
             )
@@ -973,6 +1048,9 @@ def save_mappings_to_session(
         *,
         existing_entity_mappings: Optional[list] = None,
         existing_relationship_mappings: Optional[list] = None,
+        source_model: Optional[Dict[str, Any]] = None,
+        mapping_evaluations: Optional[Dict[str, Any]] = None,
+        mapping_run_log: Optional[List[Any]] = None,
     ) -> None:
         if not session_id:
             logger.warning("save_mappings_to_session: no session_id — skipping")
@@ -1010,6 +1088,21 @@ def save_mappings_to_session(
                 else:
                     assignment["relationships"] = relationship_mappings
 
+            # Mapping-PGE extras — persisted alongside the assignment so the
+            # UI (future work) and downstream observability can surface
+            # planner state, per-item evaluation reports, and the per-item
+            # attempt log without re-running the agent.
+            if source_model is not None:
+                assignment["source_model"] = source_model
+            if mapping_evaluations is not None:
+                merged_evals = dict(assignment.get("mapping_evaluations") or {})
+                merged_evals.update(mapping_evaluations)
+                assignment["mapping_evaluations"] = merged_evals
+            if mapping_run_log is not None:
+                existing_log = list(assignment.get("mapping_run_log") or [])
+                existing_log.extend(mapping_run_log)
+                assignment["mapping_run_log"] = existing_log
+
             domain_node = bucket.setdefault("domain", {})
             domain_node["assignment_changed"] = True
 
diff --git a/src/front/static/mapping/css/mapping-pge-visualizer.css b/src/front/static/mapping/css/mapping-pge-visualizer.css
new file mode 100644
index 00000000..c0c6dee5
--- /dev/null
+++ b/src/front/static/mapping/css/mapping-pge-visualizer.css
@@ -0,0 +1,147 @@
+/* PGE Run-Visualizer — surfaces the Planner→Generator→Evaluator→Critic loop.
+ * JS: /static/mapping/js/mapping-pge-visualizer.js
+ * Component-level classes only (ob-pge-*), Bootstrap 5.3 for everything else. */
+
+.ob-pge-card {
+    border: 1px solid var(--bs-border-color, #dee2e6);
+    border-radius: 0.5rem;
+    overflow: hidden;
+}
+
+.ob-pge-header {
+    display: flex;
+    align-items: center;
+    justify-content: space-between;
+    gap: 0.75rem;
+    padding: 0.75rem 1rem;
+    background: linear-gradient(90deg, #f3f0ff 0%, #eef5ff 100%);
+    border-bottom: 1px solid var(--bs-border-color, #dee2e6);
+}
+
+.ob-pge-stages {
+    display: flex;
+    align-items: center;
+    gap: 0.35rem;
+    font-weight: 600;
+    font-size: 0.95rem;
+    color: #343a40;
+    flex-wrap: wrap;
+}
+.ob-pge-stage-chip {
+    display: inline-flex;
+    align-items: center;
+    gap: 0.3rem;
+    padding: 0.15rem 0.55rem;
+    border-radius: 999px;
+    background: #fff;
+    border: 1px solid #d7d2f0;
+    font-size: 0.8rem;
+}
+.ob-pge-stage-arrow { color: #adb5bd; }
+
+/* Verdict pill */
+.ob-pge-verdict {
+    font-size: 0.85rem;
+    font-weight: 700;
+    letter-spacing: 0.02em;
+    padding: 0.3rem 0.7rem;
+    border-radius: 999px;
+}
+.ob-pge-verdict-green { background: #d1e7dd; color: #0f5132; }
+.ob-pge-verdict-red   { background: #f8d7da; color: #842029; }
+.ob-pge-verdict-na    { background: #e2e3e5; color: #41464b; }
+
+/* KPI strip */
+.ob-pge-kpis {
+    display: flex;
+    flex-wrap: wrap;
+    gap: 0.6rem;
+    padding: 0.85rem 1rem;
+    border-bottom: 1px solid var(--bs-border-color, #dee2e6);
+    background: #fcfcfd;
+}
+.ob-pge-kpi {
+    flex: 1 1 120px;
+    min-width: 110px;
+    background: #fff;
+    border: 1px solid #eef0f2;
+    border-radius: 0.4rem;
+    padding: 0.5rem 0.7rem;
+}
+.ob-pge-kpi-val { font-size: 1.15rem; font-weight: 700; line-height: 1.1; }
+.ob-pge-kpi-lbl { font-size: 0.72rem; color: #6c757d; text-transform: uppercase; letter-spacing: 0.03em; }
+
+.ob-pge-gates { display: flex; gap: 0.4rem; flex-wrap: wrap; padding: 0.6rem 1rem 0.85rem; }
+.ob-pge-gate {
+    display: inline-flex; align-items: center; gap: 0.35rem;
+    font-size: 0.78rem; padding: 0.25rem 0.6rem; border-radius: 0.35rem;
+    border: 1px solid transparent;
+}
+.ob-pge-gate-pass { background: #e8f6ee; color: #0f5132; border-color: #b6ddc6; }
+.ob-pge-gate-fail { background: #fdecea; color: #842029; border-color: #f1b0b7; }
+
+/* Per-item loop trace */
+.ob-pge-item {
+    border-top: 1px solid #f0f1f3;
+    padding: 0.6rem 1rem;
+}
+.ob-pge-item:first-child { border-top: none; }
+.ob-pge-item-head {
+    display: flex; align-items: center; gap: 0.5rem;
+    cursor: pointer;
+}
+.ob-pge-item-name { font-weight: 600; flex: 1 1 auto; }
+.ob-pge-item-name code { font-weight: 600; color: #4530a8; background: none; padding: 0; }
+
+/* Attempt chain */
+.ob-pge-attempts { margin: 0.5rem 0 0.2rem; padding-left: 0.25rem; }
+.ob-pge-attempt {
+    display: flex; align-items: flex-start; gap: 0.5rem;
+    padding: 0.35rem 0;
+    border-left: 2px solid #e9ecef;
+    padding-left: 0.75rem;
+    margin-left: 0.4rem;
+}
+.ob-pge-attempt-num {
+    flex: 0 0 auto; font-size: 0.72rem; color: #6c757d;
+    background: #f1f3f5; border-radius: 999px; padding: 0.05rem 0.45rem; margin-top: 0.1rem;
+}
+.ob-pge-chain { display: flex; align-items: center; gap: 0.3rem; flex-wrap: wrap; }
+.ob-pge-step {
+    display: inline-flex; align-items: center; gap: 0.25rem;
+    font-size: 0.76rem; padding: 0.1rem 0.45rem; border-radius: 0.3rem;
+    border: 1px solid #e3e6ea; background: #fff;
+}
+.ob-pge-step-pass { border-color: #b6ddc6; background: #f0faf4; color: #0f5132; }
+.ob-pge-step-fail { border-color: #f1b0b7; background: #fdf2f3; color: #842029; }
+.ob-pge-step-skip { color: #868e96; }
+.ob-pge-step-bubble { border-color: #ffe0a6; background: #fff8ec; color: #8a5a00; }
+.ob-pge-arrow { color: #ced4da; font-size: 0.7rem; }
+.ob-pge-hint {
+    font-size: 0.76rem; color: #6c4a00; background: #fff8ec;
+    border: 1px solid #ffe9c2; border-radius: 0.3rem; padding: 0.3rem 0.5rem;
+    margin: 0.25rem 0 0.1rem; display: block;
+}
+.ob-pge-metrics-inline {
+    font-size: 0.74rem; color: #495057; margin-top: 0.2rem; display: flex; flex-wrap: wrap; gap: 0.5rem;
+}
+.ob-pge-metrics-inline span { white-space: nowrap; }
+
+/* Long free-text eval fields (e.g. critic reasoning) — wrap, don't overflow. */
+.ob-pge-reasoning {
+    font-size: 0.78rem;
+    color: #495057;
+    margin-top: 0.3rem;
+    line-height: 1.4;
+    white-space: normal;
+    overflow-wrap: anywhere;
+}
+.ob-pge-reasoning .text-muted { font-weight: 600; }
+
+/* Source model panel */
+.ob-pge-sm-table { font-size: 0.8rem; }
+.ob-pge-sm-table th { white-space: nowrap; }
+.ob-pge-conf-bar {
+    display: inline-block; height: 6px; border-radius: 3px; background: #6f42c1;
+    vertical-align: middle; margin-right: 0.35rem;
+}
diff --git a/src/front/static/mapping/js/mapping-autoassign.js b/src/front/static/mapping/js/mapping-autoassign.js
index 1b7546df..145f4d95 100644
--- a/src/front/static/mapping/js/mapping-autoassign.js
+++ b/src/front/static/mapping/js/mapping-autoassign.js
@@ -189,6 +189,7 @@ window.AutoAssignModule = {
                 console.log('[AutoAssign] Task completed, applying results');
                 sessionStorage.removeItem(AUTO_ASSIGN_TASK_KEY);
                 this.results = task.result.results || [];
+                this.taskResult = task.result;
                 await this.saveMappingsFromTask(task.result);
                 this.showReport();
                 await this.refreshMappingConfig();
@@ -449,6 +450,7 @@ window.AutoAssignModule = {
                     
                     if (task.result) {
                         this.results = task.result.results || [];
+                        this.taskResult = task.result;
                         await this.saveMappingsFromTask(task.result);
                     }
                     
@@ -773,6 +775,17 @@ window.AutoAssignModule = {
             `;
         }).join('');
         
+        // Render the PGE run-visualizer (planner→generator→evaluator→critic
+        // loop + scorecard) from the captured task result. Defensive: never
+        // let a visualizer error break the report.
+        if (window.PgeVisualizer) {
+            try {
+                PgeVisualizer.render(this.taskResult || {}, 'autoAssignPgeVisualizer');
+            } catch (e) {
+                console.error('[AutoAssign] PGE visualizer render failed:', e);
+            }
+        }
+
         // Show notification
         if (successCount > 0) {
             showNotification(`Auto-mapped ${successCount} item(s) successfully`, 'success', 3000);
@@ -784,6 +797,9 @@ window.AutoAssignModule = {
      */
     reset: function() {
         this.results = [];
+        this.taskResult = null;
+        const pgeEl = document.getElementById('autoAssignPgeVisualizer');
+        if (pgeEl) { pgeEl.style.display = 'none'; pgeEl.innerHTML = ''; }
         document.getElementById('autoAssignProgressSection').style.display = 'none';
         document.getElementById('autoAssignReportSection').style.display = 'none';
         document.getElementById('startAutoAssignBtn').style.display = 'inline-block';
diff --git a/src/front/static/mapping/js/mapping-pge-visualizer.js b/src/front/static/mapping/js/mapping-pge-visualizer.js
new file mode 100644
index 00000000..b0a730dc
--- /dev/null
+++ b/src/front/static/mapping/js/mapping-pge-visualizer.js
@@ -0,0 +1,347 @@
+/*
+ * PGE Run-Visualizer
+ * -------------------
+ * Renders the Planner → Generator → Evaluator → Critic loop from a completed
+ * auto-map task result. Consumes the PGE artifacts surfaced on `task.result`:
+ *   - pge_scorecard        : intrinsic-eval scorecard (verdict + gate tiers + metrics)
+ *   - source_model         : the Planner's output (table roles, canonical ids, joins, plan)
+ *   - mapping_run_log[]     : per-item attempt-by-attempt trace
+ *   - mapping_evaluations{} : per-item final EvalReport (metrics + failures)
+ *
+ * Entirely defensive — any field may be missing (legacy engine, partial run).
+ * If there is nothing PGE-specific to show, render() hides the container.
+ *
+ * Public API:  PgeVisualizer.render(taskResult, containerId)
+ */
+const PgeVisualizer = (function () {
+    'use strict';
+
+    // ---- small helpers -------------------------------------------------
+    function esc(s) {
+        // Null/undefined render as empty; everything else is stringified first.
+        if (s == null) return '';
+        const entities = { '&': '&amp;', '<': '&lt;', '>': '&gt;', '"': '&quot;', "'": '&#39;' };
+        return String(s).replace(/[&<>"']/g, (ch) => entities[ch]);
+    }
+
+    function humanize(key) {
+        // Underscores become spaces, a standalone "pct" becomes "%", words are capitalized.
+        const spaced = String(key).replace(/_/g, ' ');
+        const withPercent = spaced.replace(/\bpct\b/gi, '%');
+        return withPercent.replace(/\b(\w)/g, (first) => first.toUpperCase());
+    }
+
+    // Render a ratio as a percentage with at most one decimal. Needed because
+    // fmtMetric prints integer-valued numbers (e.g. an exact 1.0) verbatim.
+    function fmtRatio(v) {
+        const pct = typeof v === 'number' ? (v * 100).toFixed(1) : null;
+        return pct === null ? fmtMetric(v) : pct.replace(/\.0$/, '') + '%';
+    }
+
+    // Format a metric as HTML-safe text; floats in [0,1] become percentages.
+    function fmtMetric(v) {
+        if (typeof v === 'number') {
+            if (Number.isInteger(v)) return String(v);
+            if (v >= 0 && v <= 1) return (v * 100).toFixed(1) + '%';
+            return v.toFixed(3);
+        }
+        if (Array.isArray(v)) return v.length ? esc(v.join(', ')) : '—';
+        if (v === null || v === undefined) return '—';
+        return esc(v);
+    }
+
+    function shortUri(uri) {
+        if (!uri) return '';
+        const text = String(uri);
+        const sep = Math.max(text.lastIndexOf('#'), text.lastIndexOf('/'));
+        // No separator, or a trailing one: there is no local name to cut out.
+        if (sep < 0 || sep >= text.length - 1) return text;
+        return text.slice(sep + 1);
+    }
+
+    function statusBadge(status) {
+        const known = { // status → [badge class, icon name, label]
+            PASS: ['bg-success', 'check-circle-fill', 'Pass'],
+            PRESEEDED: ['bg-info', 'bookmark-check-fill', 'Pre-seeded'],
+            SKIPPED: ['bg-secondary', 'dash-circle-fill', 'Skipped'],
+            FAIL_BUDGET: ['bg-danger', 'x-circle-fill', 'Failed (budget)'],
+            FAIL_BUBBLE: ['bg-danger', 'x-circle-fill', 'Failed (re-plan)'],
+        };
+        const [bg, glyph, text] = known[status] || ['bg-secondary', 'question-circle', status || '?'];
+        return `<span class="badge ${bg}"><i class="bi bi-${glyph} me-1"></i>${esc(text)}</span>`;
+    }
+
+    // ---- scorecard: verdict pill (goes inside the header) --------------
+    function renderVerdictPill(sc) {
+        // Anything other than GREEN/RED (including no scorecard) looks neutral.
+        const looks = { GREEN: ['green', 'shield-check'], RED: ['red', 'shield-exclamation'] };
+        const verdict = (sc && sc.verdict) || 'N/A';
+        const isKnown = verdict === 'GREEN' || verdict === 'RED';
+        const [tone, glyph] = isKnown ? looks[verdict] : ['na', 'shield'];
+        const text = sc ? verdict : 'no scorecard';
+        return `<span class="ob-pge-verdict ob-pge-verdict-${tone}"><i class="bi bi-${glyph} me-1"></i>${esc(text)}</span>`;
+    }
+
+    // ---- scorecard: KPI strip + gate tiers (go below the header) -------
+    function renderScorecardBody(sc) {
+        if (!sc) return '';
+        // KPI chips from the mapping stage (the most demo-relevant metrics).
+        const mapMetrics = (sc.stages && sc.stages.mapping && sc.stages.mapping.metrics) || {};
+        // ratio metrics render as %, count metrics as integers.
+        const kpiKeys = [
+            { k: 'entity_completeness', ratio: true },
+            { k: 'relationship_completeness', ratio: true },
+            { k: 'id_integrity', ratio: true },
+            { k: 'sql_exec_failures', ratio: false },
+        ];
+        let kpis = kpiKeys
+            .filter((spec) => spec.k in mapMetrics)
+            .map((spec) => `
+                <div class="ob-pge-kpi">
+                    <div class="ob-pge-kpi-val">${spec.ratio ? fmtRatio(mapMetrics[spec.k]) : fmtMetric(mapMetrics[spec.k])}</div>
+                    <div class="ob-pge-kpi-lbl">${esc(humanize(spec.k))}</div>
+                </div>`).join('');
+        // Pipeline coverage-loss is the anti-circularity metric — show it if present.
+        const pipe = (sc.stages && sc.stages.pipeline) || {};
+        const pipeMetrics = pipe.metrics || pipe;
+        if (pipeMetrics && 'coverage_loss' in pipeMetrics) {
+            kpis += `
+                <div class="ob-pge-kpi">
+                    <div class="ob-pge-kpi-val">${fmtRatio(pipeMetrics.coverage_loss)}</div>
+                    <div class="ob-pge-kpi-lbl">Coverage Loss</div>
+                </div>`;
+        }
+
+        const gates = sc.gates || {};
+        function gateChip(label, tier) {
+            if (!tier) return '';
+            const pass = tier.passed;
+            // Empty arrays are truthy in JS, so `a || b` would stop at `failures: []`; take the first non-empty list.
+            const detail = [tier.failures, tier.regressions, tier.warnings].find((l) => Array.isArray(l) && l.length) || [];
+            const [cls, icon] = pass ? ['ob-pge-gate-pass', 'check-lg'] : ['ob-pge-gate-fail', 'exclamation-triangle-fill'];
+            const title = detail.length ? esc(detail.map((d) => (typeof d === 'string' ? d : ((d && (d.check || d.metric)) || JSON.stringify(d)))).join(' · ')) : '';
+            return `<span class="ob-pge-gate ${cls}" title="${title}">
+                <i class="bi bi-${icon}"></i>${esc(label)}${detail.length ? ` (${detail.length})` : ''}</span>`;
+        }
+        const gatesHtml = `
+            <div class="ob-pge-gates">
+                ${gateChip('Tier 1 · absolute', gates.tier1_absolute)}
+                ${gateChip('Tier 2 · ratio', gates.tier2_ratio)}
+                ${gateChip('Tier 3 · regression', gates.tier3_regression)}
+            </div>`;
+
+        return `
+            ${kpis ? `<div class="ob-pge-kpis">${kpis}</div>` : ''}
+            ${gatesHtml}`;
+    }
+
+    // ---- planner source-model panel: table roles, canonical ids, join keys, skips ----
+    function renderSourceModel(sm) {
+        if (!sm) return '<p class="text-muted small mb-0">No planner source-model captured.</p>';
+        let html = '';
+
+        const roles = sm.table_roles || [];
+        if (roles.length) {
+            html += '<h6 class="mt-1">Table → class candidates</h6>';
+            html += '<table class="table table-sm ob-pge-sm-table"><tbody>';
+            roles.forEach((r) => {
+                const cands = (r.ontology_class_candidates || []).map((c) => {
+                    const conf = typeof c.confidence === 'number' ? c.confidence : 0;
+                    const w = Math.min(40, Math.max(6, Math.round(conf * 40))); // bar stays 6–40px even if confidence > 1
+                    return `<div title="${esc(c.reason || '')}">
+                        <span class="ob-pge-conf-bar" style="width:${w}px"></span>
+                        <code>${esc(shortUri(c.uri))}</code>
+                        <span class="text-muted small">${(conf * 100).toFixed(0)}%</span></div>`;
+                }).join('');
+                html += `<tr><td class="text-nowrap"><code>${esc(r.table)}</code></td><td>${cands || '<span class="text-muted">—</span>'}</td></tr>`;
+            });
+            html += '</tbody></table>';
+        }
+
+        const cids = sm.canonical_ids || [];
+        if (cids.length) {
+            html += '<h6 class="mt-2">Canonical identifiers</h6><ul class="small mb-2">';
+            cids.forEach((c) => {
+                const perTable = c.canonical_column_per_table || {};
+                const cols = Object.entries(perTable)
+                    .map(([t, col]) => `<code>${esc(shortUri(t))}</code>→<code>${esc(col)}</code>`).join(', ');
+                html += `<li><code>${esc(shortUri(c.ontology_class))}</code>: ${cols || '—'}${c.format_note ? ` <span class="text-muted">(${esc(c.format_note)})</span>` : ''}</li>`;
+            });
+            html += '</ul>';
+        }
+
+        const joins = sm.join_keys || [];
+        if (joins.length) {
+            html += '<h6 class="mt-2">Join keys</h6><table class="table table-sm ob-pge-sm-table"><thead><tr><th>From</th><th>To</th><th>Kind</th><th>Overlap</th></tr></thead><tbody>';
+            joins.forEach((j) => {
+                html += `<tr><td><code>${esc(j.from_ref)}</code></td><td><code>${esc(j.to_ref)}</code></td>
+                    <td><span class="badge bg-light text-dark">${esc(j.kind)}</span></td>
+                    <td>${fmtMetric(j.overlap_pct)}</td></tr>`;
+            });
+            html += '</tbody></table>';
+        }
+
+        const plan = sm.mapping_plan || {};
+        const skips = plan.skip || [];
+        if (skips.length) {
+            html += '<h6 class="mt-2">Planner skipped</h6><ul class="small mb-0">';
+            skips.forEach((s) => {
+                html += `<li><code>${esc(shortUri(s.item))}</code> — ${esc(s.reason || 'no reason given')}</li>`;
+            });
+            html += '</ul>';
+        }
+        return html || '<p class="text-muted small mb-0">Planner produced an empty source-model.</p>';
+    }
+
+    // ---- per-item loop trace -------------------------------------------
+    function renderAttempt(a) { // Builds the HTML for one attempt row: numbered step chain plus optional error/hint lines.
+        function step(label, status, extraClass) { // One chip; PASS / FAIL / skipped statuses each add a modifier class.
+            let cls = 'ob-pge-step';
+            if (status === 'PASS') cls += ' ob-pge-step-pass';
+            else if (status === 'FAIL') cls += ' ob-pge-step-fail';
+            else if (status === 'skipped' || status === 'skip') cls += ' ob-pge-step-skip';
+            if (extraClass) cls += ' ' + extraClass;
+            const label2 = status && status !== 'skipped' ? `${label}: ${status}` : label; // NOTE(review): 'skip' gets the skip style above but is still appended to the label here — confirm intended.
+            return `<span class="${cls}">${esc(label2)}</span>`;
+        }
+        const gen = `<span class="ob-pge-step">Generator</span>`; // The Generator chip is static: it carries no status.
+        const stage1 = step('Evaluator', a.stage1_status);
+        const showCritic = a.critic_status && a.critic_status !== 'skipped'; // Omit the Critic chip when its status is missing or 'skipped'.
+        const critic = showCritic ? `<span class="ob-pge-arrow">›</span>${step('Critic', a.critic_status)}` : '';
+        const bubble = a.bubble ? `<span class="ob-pge-step ob-pge-step-bubble"><i class="bi bi-arrow-up-circle me-1"></i>re-plan</span>` : ''; // Rendered only when a.bubble is truthy.
+        const err = a.error ? `<span class="ob-pge-hint"><i class="bi bi-bug me-1"></i>${esc(a.error)}</span>` : ''; // a.error and a.hint are escaped before insertion.
+        const hint = a.hint ? `<span class="ob-pge-hint"><i class="bi bi-lightbulb me-1"></i>${esc(a.hint)}</span>` : '';
+        return `
+            <div class="ob-pge-attempt">
+                <span class="ob-pge-attempt-num">#${esc(a.attempt)}</span>
+                <div class="flex-grow-1">
+                    <div class="ob-pge-chain">
+                        ${gen}<span class="ob-pge-arrow">›</span>${stage1}${critic}${bubble}
+                    </div>
+                    ${err}${hint}
+                </div>
+            </div>`;
+    }
+
+    function renderItem(entry, evals, idx) { // Builds one collapsible run-log item: header row, attempts, and metrics.
+        const evalReport = evals[entry.item]; // The evaluation report is looked up by the entry's item identifier.
+        const metrics = evalReport && evalReport.metrics ? evalReport.metrics : null;
+        let metricsInline = '';
+        if (metrics) {
+            // Long free-text fields (e.g. the critic's reasoning) render as a
+            // wrapped block; short scalar metrics render inline.
+            const isLongText = (v) => typeof v === 'string' && v.length > 60; // "Long" means a string of more than 60 characters.
+            const longKeys = Object.keys(metrics).filter((k) => isLongText(metrics[k]));
+            const scalarKeys = Object.keys(metrics)
+                .filter((k) => !Array.isArray(metrics[k]) && typeof metrics[k] !== 'object' && !isLongText(metrics[k]))
+                .slice(0, 6); // At most six scalar metrics are shown inline.
+            if (scalarKeys.length) {
+                metricsInline += `<div class="ob-pge-metrics-inline">` +
+                    scalarKeys.map((k) => {
+                        const val = /pct$|_pct|overlap/i.test(k) ? fmtRatio(metrics[k]) : fmtMetric(metrics[k]); // Keys matching pct/overlap go through fmtRatio; all others through fmtMetric.
+                        return `<span><span class="text-muted">${esc(humanize(k))}:</span> <strong>${val}</strong></span>`;
+                    }).join('') +
+                    `</div>`;
+            }
+            metricsInline += longKeys.map((k) =>
+                `<div class="ob-pge-reasoning"><span class="text-muted">${esc(humanize(k))}:</span> ${esc(metrics[k])}</div>`
+            ).join('');
+        }
+        const attempts = entry.attempts || []; // A missing attempts list is treated as empty.
+        const attemptsHtml = attempts.length
+            ? `<div class="ob-pge-attempts">${attempts.map(renderAttempt).join('')}</div>`
+            : '<div class="text-muted small ms-3 mt-1">No generator attempts (pre-seeded or skipped).</div>';
+        const kindIcon = entry.kind === 'relationship' ? 'arrow-left-right' : 'box'; // Any kind other than 'relationship' gets the box icon.
+        const collId = `obPgeItem${idx}`; // Collapse target id derived from the run-log index.
+        return `
+            <div class="ob-pge-item">
+                <div class="ob-pge-item-head" data-bs-toggle="collapse" data-bs-target="#${collId}" aria-expanded="false">
+                    <i class="bi bi-${kindIcon} text-muted"></i>
+                    <span class="ob-pge-item-name"><code>${esc(shortUri(entry.item))}</code></span>
+                    ${attempts.length > 1 ? `<span class="badge bg-light text-dark">${attempts.length} attempts</span>` : ''}
+                    ${statusBadge(entry.final_status)}
+                    <i class="bi bi-chevron-down text-muted small"></i>
+                </div>
+                <div class="collapse" id="${collId}">
+                    ${attemptsHtml}
+                    ${metricsInline}
+                </div>
+            </div>`;
+    }
+
+    function renderTrace(runLog, evals) { // Renders every run-log entry via renderItem, or a placeholder when the log is missing/empty.
+        if (!runLog || !runLog.length) return '<p class="text-muted small mb-0">No per-item run log captured.</p>';
+        return runLog.map((e, i) => renderItem(e, evals || {}, i)).join(''); // evals falls back to {} so renderItem's keyed lookup never hits null.
+    }
+
+    // ---- main entrypoint -----------------------------------------------
+    function render(taskResult, containerId) { // Entry point: fills the element with id containerId from taskResult, or empties and hides it.
+        const container = document.getElementById(containerId);
+        if (!container) return; // No-op when the target element is not in the page.
+
+        const tr = taskResult || {}; // Tolerate a null/undefined task result.
+        const sc = tr.pge_scorecard || null;
+        const sm = tr.source_model || null;
+        const runLog = tr.mapping_run_log || null;
+        const evals = tr.mapping_evaluations || {};
+
+        // Nothing PGE-specific → keep the container empty/hidden.
+        if (!sc && !sm && !(runLog && runLog.length)) {
+            container.innerHTML = '';
+            container.style.display = 'none';
+            return;
+        }
+        container.style.display = 'block'; // The container may have been hidden earlier, so show it explicitly.
+
+        const headerStages = `
+            <div class="ob-pge-stages">
+                <span class="ob-pge-stage-chip"><i class="bi bi-diagram-2"></i>Planner</span>
+                <span class="ob-pge-stage-arrow">›</span>
+                <span class="ob-pge-stage-chip"><i class="bi bi-cpu"></i>Generator</span>
+                <span class="ob-pge-stage-arrow">›</span>
+                <span class="ob-pge-stage-chip"><i class="bi bi-rulers"></i>Evaluator</span>
+                <span class="ob-pge-stage-arrow">›</span>
+                <span class="ob-pge-stage-chip"><i class="bi bi-search"></i>Critic</span>
+            </div>`;
+
+        const itemCount = runLog ? runLog.length : 0; // Drives the "· N item(s)" suffix in the trace header.
+        const accordion = `
+            <div class="accordion accordion-flush" id="obPgeAccordion">
+                <div class="accordion-item">
+                    <h2 class="accordion-header">
+                        <button class="accordion-button" type="button" data-bs-toggle="collapse" data-bs-target="#obPgeTracePanel">
+                            <i class="bi bi-list-check me-2"></i>Loop trace${itemCount ? ` · ${itemCount} item${itemCount > 1 ? 's' : ''}` : ''}
+                        </button>
+                    </h2>
+                    <div id="obPgeTracePanel" class="accordion-collapse collapse show" data-bs-parent="#obPgeAccordion">
+                        <div class="accordion-body p-0">${renderTrace(runLog, evals)}</div>
+                    </div>
+                </div>
+                <div class="accordion-item">
+                    <h2 class="accordion-header">
+                        <button class="accordion-button collapsed" type="button" data-bs-toggle="collapse" data-bs-target="#obPgeSmPanel">
+                            <i class="bi bi-diagram-2 me-2"></i>Planner source-model
+                        </button>
+                    </h2>
+                    <div id="obPgeSmPanel" class="accordion-collapse collapse" data-bs-parent="#obPgeAccordion">
+                        <div class="accordion-body">${renderSourceModel(sm)}</div>
+                    </div>
+                </div>
+            </div>`;
+
+        container.innerHTML = `
+            <div class="ob-pge-card mt-3">
+                <div class="ob-pge-header">
+                    ${headerStages}
+                    ${renderVerdictPill(sc)}
+                </div>
+                ${renderScorecardBody(sc)}
+                ${accordion}
+            </div>`;
+    }
+
+    return { render: render };
+})();
+
+// Expose globally (non-module script include).
+window.PgeVisualizer = PgeVisualizer;
diff --git a/src/front/static/query/js/query-chat.js b/src/front/static/query/js/query-chat.js
index 6fa7fec9..7382dbb8 100644
--- a/src/front/static/query/js/query-chat.js
+++ b/src/front/static/query/js/query-chat.js
@@ -364,8 +364,10 @@
             bodyEl.removeChild(stepsEl);
         }
 
-        // Render the final markdown reply
-        const reply = event.reply || '(no reply)';
+        // Render the final markdown reply. Empty replies are normally routed
+        // to errorStreamingBubble by the caller; this is a defensive fallback.
+        const reply = (event.reply || '').trim()
+            || '_No response was generated. Please try again._';
         bodyEl.innerHTML = renderMarkdown(reply);
         enhanceEntityLinks(bodyEl);
 
@@ -479,12 +481,23 @@
 
             const doneEvent = await _consumeStream(bubble, response);
 
-            if (doneEvent) {
+            if (doneEvent && (doneEvent.reply || '').trim()) {
                 finalizeStreamingBubble(bubble, doneEvent);
                 conversationHistory.push({
                     role: 'assistant',
-                    content: doneEvent.reply || '',
+                    content: doneEvent.reply,
                 });
+            } else if (doneEvent) {
+                // The turn completed but produced no text — e.g. a transient
+                // model error (success:false) or an empty generation. Surface
+                // an actionable message instead of a cryptic "(no reply)", and
+                // do NOT persist the empty turn to history.
+                errorStreamingBubble(
+                    bubble,
+                    doneEvent.success === false
+                        ? "The assistant didn't return a response — the model may have hit a transient error. Please try again."
+                        : 'No response was generated. Try rephrasing your question, or ask again.'
+                );
             } else {
                 errorStreamingBubble(bubble, 'Stream ended without a final response.');
             }
diff --git a/src/front/templates/mapping.html b/src/front/templates/mapping.html
index 60c063fe..5d893a5e 100644
--- a/src/front/templates/mapping.html
+++ b/src/front/templates/mapping.html
@@ -10,6 +10,7 @@
 <link rel="stylesheet" href="{{ url_for('static', filename='mapping/css/mapping-shared.css') }}?v={{ asset_version }}">
 <link rel="stylesheet" href="{{ url_for('static', filename='mapping/css/mapping-manual.css') }}?v={{ asset_version }}">
 <link rel="stylesheet" href="{{ url_for('static', filename='mapping/css/mapping-diagnostics.css') }}?v={{ asset_version }}">
+<link rel="stylesheet" href="{{ url_for('static', filename='mapping/css/mapping-pge-visualizer.css') }}?v={{ asset_version }}">
 {% endblock %}
 
 {% block content %}
@@ -30,6 +31,7 @@
 <script src="{{ url_for('static', filename='mapping/js/mapping-import.js') }}?v={{ asset_version }}"></script>
 <script src="{{ url_for('static', filename='mapping/js/mapping-r2rml.js') }}?v={{ asset_version }}"></script>
 <script src="{{ url_for('static', filename='mapping/js/mapping-manual.js') }}?v={{ asset_version }}"></script>
+<script src="{{ url_for('static', filename='mapping/js/mapping-pge-visualizer.js') }}?v={{ asset_version }}"></script>
 <script src="{{ url_for('static', filename='mapping/js/mapping-autoassign.js') }}?v={{ asset_version }}"></script>
 <script src="{{ url_for('static', filename='mapping/js/mapping-diagnostics.js') }}?v={{ asset_version }}"></script>
 
diff --git a/src/front/templates/partials/mapping/_mapping_autoassign.html b/src/front/templates/partials/mapping/_mapping_autoassign.html
index 5b839fb5..ca047d81 100644
--- a/src/front/templates/partials/mapping/_mapping_autoassign.html
+++ b/src/front/templates/partials/mapping/_mapping_autoassign.html
@@ -125,6 +125,10 @@ <h4 class="mb-0 text-secondary" id="reportSkippedCount">0</h4>
                 </div>
             </div>
             
+            <!-- PGE Run-Visualizer — Planner→Generator→Evaluator→Critic loop + scorecard.
+                 Populated by PgeVisualizer.render() from the completed task result. -->
+            <div id="autoAssignPgeVisualizer" style="display:none;"></div>
+
             <!-- Post-Report Actions -->
             <div class="mt-3">
                 <button type="button" class="btn btn-outline-primary" onclick="AutoAssignModule.reset()">
diff --git a/tests/agents/__init__.py b/tests/agents/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/agents/agent_mapping_pge/__init__.py b/tests/agents/agent_mapping_pge/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/agents/agent_mapping_pge/test_contracts.py b/tests/agents/agent_mapping_pge/test_contracts.py
new file mode 100644
index 00000000..35498c15
--- /dev/null
+++ b/tests/agents/agent_mapping_pge/test_contracts.py
@@ -0,0 +1,126 @@
+"""Smoke tests for the PGE contracts.
+
+These are intentionally narrow — they only assert that every contract
+dataclass round-trips cleanly through ``to_dict`` / ``from_dict`` so that
+downstream sprints (Planner, Generator, orchestrator) can rely on
+JSON-safe serialisation for MLflow artefacts and registry persistence.
+"""
+
+import json
+
+from agents.agent_mapping_pge.contracts import (
+    CanonicalId,
+    EvalFailure,
+    EvalReport,
+    JoinKey,
+    MappingPlan,
+    RetryState,
+    SkipItem,
+    SourceModel,
+    TableRole,
+    TableRoleCandidate,
+)
+
+
+def _roundtrip(obj) -> tuple:
+    """Round-trip ``obj``: to_dict -> JSON text -> from_dict. Returns (rebuilt, dict)."""
+    cls = type(obj)  # rebuild with the same concrete class that was passed in
+    d = obj.to_dict()
+    encoded = json.dumps(d)  # raises if to_dict() produced a non-JSON value
+    back = cls.from_dict(json.loads(encoded))
+    return back, d
+
+
+def test_source_model_roundtrip() -> None:  # every SourceModel section populated
+    sm = SourceModel(
+        table_roles=[
+            TableRole(
+                table="cat.sch.mothers",
+                ontology_class_candidates=[
+                    TableRoleCandidate(
+                        uri="http://ex.org#Mother", confidence=0.92, reason="row match"
+                    ),
+                ],
+            ),
+        ],
+        canonical_ids=[
+            CanonicalId(
+                ontology_class="http://ex.org#Mother",
+                canonical_column_per_table={"cat.sch.mothers": "nhs_number"},
+                format_note="NHS number, 10 digits, no separators",
+            )
+        ],
+        join_keys=[
+            JoinKey(
+                from_ref="cat.sch.babies.mother_nhs",
+                to_ref="cat.sch.mothers.nhs_number",
+                confidence=0.88,
+                overlap_pct=0.97,
+                kind="same_trust_fk",
+            )
+        ],
+        mapping_plan=MappingPlan(
+            entity_order=["http://ex.org#Mother", "http://ex.org#Baby"],
+            relationship_order=["http://ex.org#hasBaby"],
+            skip=[SkipItem(item="http://ex.org#Ghost", reason="no source table")],
+        ),
+    )
+    back, d = _roundtrip(sm)
+    assert back.to_dict() == d  # the whole payload survives the round-trip
+    assert back.table_roles[0].table == "cat.sch.mothers"  # nested spot checks
+    assert back.canonical_ids[0].canonical_column_per_table["cat.sch.mothers"] == "nhs_number"
+    assert back.join_keys[0].kind == "same_trust_fk"
+    assert back.mapping_plan.skip[0].item == "http://ex.org#Ghost"
+
+
+def test_eval_report_roundtrip() -> None:  # FAIL report carrying one failure
+    report = EvalReport(
+        status="FAIL",
+        stage="deterministic",
+        metrics={"row_count": 0},
+        failures=[
+            EvalFailure(
+                kind="structural",
+                check="row_count",
+                expected="> 0",
+                observed="0",
+                hint="fix the FROM clause",
+            )
+        ],
+        bubble_to_planner=True,
+    )
+    back, d = _roundtrip(report)
+    assert back.to_dict() == d  # the whole payload survives the round-trip
+    assert back.status == "FAIL"
+    assert back.failures[0].check == "row_count"  # nested failure was rebuilt
+
+
+def test_retry_state_roundtrip_with_and_without_report() -> None:
+    rs_empty = RetryState(item_uri="http://ex.org#Mother")  # only item_uri given
+    back, d = _roundtrip(rs_empty)
+    assert back.to_dict() == d
+    assert back.last_eval_report is None  # an absent report stays None
+
+    rs = RetryState(  # second case: counters set and a nested EvalReport attached
+        item_uri="http://ex.org#Baby",
+        generator_attempts=2,
+        planner_reinvocations=1,
+        last_eval_report=EvalReport(
+            status="FAIL",
+            stage="deterministic",
+            failures=[
+                EvalFailure(
+                    kind="structural",
+                    check="total_edges",
+                    expected="> 0",
+                    observed="0",
+                    hint="fix join",
+                )
+            ],
+            bubble_to_planner=True,
+        ),
+    )
+    back, d = _roundtrip(rs)
+    assert back.to_dict() == d
+    assert back.last_eval_report is not None
+    assert back.last_eval_report.failures[0].check == "total_edges"
diff --git a/tests/agents/agent_mapping_pge/test_critic.py b/tests/agents/agent_mapping_pge/test_critic.py
new file mode 100644
index 00000000..cad85d79
--- /dev/null
+++ b/tests/agents/agent_mapping_pge/test_critic.py
@@ -0,0 +1,684 @@
+"""Tests for the mapping-PGE Semantic Critic agent (Sprint 6).
+
+Mirrors the structure of ``test_relationship_generator.py``. The Critic is a
+narrow tool-calling ReAct loop terminated by ``submit_evaluation``. These
+tests exercise the loop's control flow with a *fake LLM* — a stub that
+replaces ``call_serving_endpoint`` at module level and returns canned
+responses on a per-call basis.
+
+No real HTTP, no real Databricks, no MLflow tracing.
+
+What we DO exercise:
+* PASS verdict terminates immediately.
+* FAIL with bubble_to_planner=False (column-level).
+* FAIL with bubble_to_planner=True (table-level) — the bubble flag survives.
+* PASS+bubble is demoted (matches build_report behaviour).
+* FAIL with empty failures[] synthesises a generic semantic failure.
+* Invalid status does NOT terminate — loop continues, accepts a valid retry.
+* Text-only response → failure with "without submitting evaluation".
+* Iteration-budget exhaustion → failure with "iteration budget".
+* User prompt surfaces structural-stage metrics.
+* User prompt for relationships includes domain/range sections.
+"""
+
+import json
+from typing import Any, Callable, Dict, List, Optional
+
+import pytest
+
+from agents.agent_mapping_pge.evaluator import critic as critic_mod
+from agents.agent_mapping_pge.evaluator.critic import (
+    CriticResult,
+    CriticStep,
+    run_critic,
+)
+
+
+# =====================================================
+# Fake LLM scaffolding
+# =====================================================
+
+
+_ENTITY_URI = "http://ex.org/maternity#Mother"
+_REL_URI = "http://ex.org/maternity#motherOf"
+
+
+def _make_tool_call(name: str, arguments: dict, *, tc_id: str = "tc1") -> dict:
+    return {  # one tool_call entry for an assistant message
+        "id": tc_id,
+        "type": "function",
+        "function": {"name": name, "arguments": json.dumps(arguments)},  # JSON text
+    }
+
+
+def _llm_response(  # builds one canned LLM response: a single choice plus usage
+    *,
+    tool_calls: Optional[List[dict]] = None,
+    content: Optional[str] = None,
+    finish_reason: str = "tool_calls",
+    usage: Optional[Dict[str, int]] = None,
+) -> dict:
+    message: Dict[str, Any] = {"role": "assistant"}  # optional keys added below
+    if tool_calls:  # None and [] both leave the key out
+        message["tool_calls"] = tool_calls
+    if content is not None:  # an empty string is still emitted
+        message["content"] = content
+    return {
+        "choices": [{"finish_reason": finish_reason, "message": message}],
+        "usage": usage or {"prompt_tokens": 10, "completion_tokens": 5},  # default
+    }
+
+
+class FakeLLM:  # scripted callable: returns canned responses in order, one per call
+    def __init__(self, responses: List[dict]) -> None:
+        self.responses = list(responses)  # copy: pop(0) must not touch the caller's list
+        self.calls = 0
+        self.last_messages: Optional[List[dict]] = None  # messages of the latest call
+        self.first_messages: Optional[List[dict]] = None  # messages of the first call
+
+    def __call__(self, *args, **kwargs) -> dict:
+        self.calls += 1
+        msgs: Optional[List[dict]] = None
+        if len(args) >= 4 and isinstance(args[3], list):  # 4th positional argument
+            msgs = args[3]
+        elif "messages" in kwargs:
+            msgs = kwargs["messages"]
+        if msgs is not None:
+            snapshot = [dict(m) for m in msgs]  # shallow copy of each message dict
+            if self.first_messages is None:
+                self.first_messages = snapshot
+            self.last_messages = snapshot
+
+        if not self.responses:  # more calls than canned responses: fail loudly
+            raise AssertionError(
+                f"FakeLLM: ran out of canned responses on call #{self.calls}"
+            )
+        return self.responses.pop(0)
+
+
+class CyclingFakeLLM:
+    """Like FakeLLM but cycles through a fixed list forever (no message capture)."""
+
+    def __init__(self, responses: List[dict]) -> None:
+        self.responses = list(responses)
+        self.calls = 0
+
+    def __call__(self, *args, **kwargs) -> dict:
+        # NOTE: an empty ``responses`` list raises ZeroDivisionError on the modulo.
+        resp = self.responses[self.calls % len(self.responses)]
+        self.calls += 1
+        return resp
+
+
+@pytest.fixture
+def no_sleep(monkeypatch) -> None:
+    """Neutralise the 3-second inter-iteration delay so tests run fast."""
+    monkeypatch.setattr(critic_mod.time, "sleep", lambda *_a, **_k: None)  # no-op
+
+
+def _patch_llm(monkeypatch, fake: Callable[..., dict]) -> None:
+    monkeypatch.setattr(critic_mod, "call_serving_endpoint", fake)  # module-level
+
+
+# =====================================================
+# Fixtures
+# =====================================================
+
+
+def _entity_definition() -> dict:
+    return {  # ontology-class definition fixture for the Mother entity
+        "uri": _ENTITY_URI,
+        "label": "Mother",
+        "name": "Mother",
+        "comment": "A pregnant woman in the maternity dataset.",
+        "attributes": [  # two attributes
+            {"name": "nhsNumber", "type": "string"},
+            {"name": "dateOfBirth", "type": "date"},
+        ],
+    }
+
+
+def _relationship_definition() -> dict:
+    return {  # ontology-property definition fixture for motherOf
+        "uri": _REL_URI,
+        "label": "motherOf",
+        "name": "motherOf",
+        "comment": "Links a Mother to each of her babies.",
+        "domain": _ENTITY_URI,  # domain is the Mother class
+        "range": "http://ex.org/maternity#Baby",
+    }
+
+
+def _entity_submitted_mapping() -> dict:
+    return {  # entity mapping fixture: nhs_number is both the ID and the label
+        "ontology_class": _ENTITY_URI,
+        "class_name": "Mother",
+        "sql_query": "SELECT nhs_number AS ID, nhs_number AS Label FROM cat.sch.mothers WHERE nhs_number IS NOT NULL",
+        "id_column": "nhs_number",
+        "label_column": "nhs_number",
+        "attribute_mappings": {"nhsNumber": "nhs_number"},
+        "unmapped_attributes": [  # dateOfBirth is deliberately left unmapped
+            {"name": "dateOfBirth", "reason": "column absent from this table"}
+        ],
+    }
+
+
+def _relationship_submitted_mapping() -> dict:
+    return {  # relationship mapping fixture for motherOf
+        "property": _REL_URI,
+        "property_name": "motherOf",
+        "sql_query": "SELECT mother_nhs_number AS source_id, baby_id AS target_id FROM cat.sch.babies",
+        "source_id_column": "nhs_number",  # NOTE(review): SQL selects mother_nhs_number
+        "target_id_column": "baby_id",
+        "domain": _ENTITY_URI,
+        "range_class": "http://ex.org/maternity#Baby",
+    }
+
+
+def _source_model_slice() -> dict:
+    return {  # source-model slice fixture: one candidate table plus its canonical id
+        "candidate_tables": [
+            {
+                "table": "cat.sch.mothers",
+                "confidence": 0.9,
+                "reason": "row per mother, nhs_number as PK",
+            }
+        ],
+        "canonical_id": {  # keyed by the same table as the candidate above
+            "canonical_column_per_table": {"cat.sch.mothers": "nhs_number"},
+            "format_note": "10-digit NHS number",
+        },
+    }
+
+
+def _stage1_metrics(**overrides: Any) -> dict:
+    base = {  # defaults: 100 rows, all ids distinct, none null
+        "row_count": 100,
+        "distinct_ids": 100,
+        "null_ids": 0,
+    }
+    base.update(overrides)  # keyword overrides replace defaults or add new keys
+    return base
+
+
+def _valid_pass_submit() -> dict:
+    return {  # submit_evaluation arguments for a clean PASS (fresh dict per call)
+        "status": "PASS",
+        "failures": [],
+        "bubble_to_planner": False,
+        "reasoning": "Sampled values match the Mother concept; column semantics OK.",
+    }
+
+
+def _valid_fail_column_submit() -> dict:
+    return {  # submit_evaluation arguments: FAIL for a wrong column, no bubble
+        "status": "FAIL",
+        "failures": [
+            {
+                "check": "column_semantics",
+                "expected": "delivery date",
+                "observed": "appointment_date is a booking date",
+                "hint": "Use `delivery_dttm` instead of `appointment_date`.",
+            }
+        ],
+        "bubble_to_planner": False,  # column-level problem: no re-plan requested
+        "reasoning": "Wrong column within the right table.",
+    }
+
+
+def _valid_fail_table_submit() -> dict:
+    return {  # submit_evaluation arguments: FAIL for a wrong table, with bubble
+        "status": "FAIL",
+        "failures": [
+            {
+                "check": "table_selection",
+                "expected": "labour_delivery",
+                "observed": "antenatal_visits",
+                "hint": "Switch to `labour_delivery` table for the Delivery class.",
+            }
+        ],
+        "bubble_to_planner": True,  # table-level problem: re-plan requested
+        "reasoning": "Wrong table chosen — bubble to Planner.",
+    }
+
+
+def _run_entity_critic(  # calls run_critic with fixture defaults for omitted inputs
+    fake: Callable[..., dict],  # never referenced in the body
+    *,
+    max_iterations: int = 6,
+    item_kind: str = "entity",
+    item_uri: str = _ENTITY_URI,
+    item_definition: Optional[dict] = None,
+    submitted_mapping: Optional[dict] = None,
+    stage1_metrics: Optional[dict] = None,
+    client: Any = None,
+) -> CriticResult:
+    return run_critic(
+        host="https://x",  # placeholder connection values
+        token="t",
+        endpoint_name="ep",
+        client=client,
+        item_kind=item_kind,
+        item_uri=item_uri,
+        item_definition=item_definition
+        if item_definition is not None  # only None triggers the default; {} is kept
+        else _entity_definition(),
+        submitted_mapping=submitted_mapping
+        if submitted_mapping is not None
+        else _entity_submitted_mapping(),
+        source_model_slice=_source_model_slice(),  # always the fixture slice
+        stage1_metrics=stage1_metrics
+        if stage1_metrics is not None
+        else _stage1_metrics(),
+        max_iterations=max_iterations,
+    )
+
+
+# =====================================================
+# 1. PASS verdict terminates immediately
+# =====================================================
+
+
+def test_pass_verdict(monkeypatch, no_sleep) -> None:
+    """First LLM turn submits PASS → success=True, status=PASS, iterations=1."""
+    fake = FakeLLM(
+        [
+            _llm_response(
+                tool_calls=[
+                    _make_tool_call("submit_evaluation", _valid_pass_submit())
+                ]
+            )
+        ]
+    )
+    _patch_llm(monkeypatch, fake)
+
+    result = _run_entity_critic(fake)
+
+    assert isinstance(result, CriticResult)
+    assert result.success is True
+    assert result.iterations == 1  # FakeLLM holds one response; a 2nd call would raise
+    assert result.report is not None
+    assert result.report.status == "PASS"
+    assert result.report.stage == "semantic"
+    assert result.report.failures == []
+    assert result.report.bubble_to_planner is False
+    assert result.error == ""
+    # Step recording: one tool_call + one tool_result.
+    assert [s.step_type for s in result.steps] == ["tool_call", "tool_result"]
+    assert result.steps[0].tool_name == "submit_evaluation"
+
+
+# =====================================================
+# 2. FAIL with bubble_to_planner=False (column-level)
+# =====================================================
+
+
+def test_fail_column_level(monkeypatch, no_sleep) -> None:
+    """status=FAIL, bubble_to_planner=False → column-level failure preserved."""
+    fake = FakeLLM(
+        [
+            _llm_response(
+                tool_calls=[
+                    _make_tool_call("submit_evaluation", _valid_fail_column_submit())
+                ]
+            )
+        ]
+    )
+    _patch_llm(monkeypatch, fake)
+
+    result = _run_entity_critic(fake)
+
+    assert result.success is True  # success is expected even for a FAIL verdict
+    assert result.report is not None
+    assert result.report.status == "FAIL"
+    assert result.report.bubble_to_planner is False
+    assert len(result.report.failures) == 1
+    failure = result.report.failures[0]
+    assert failure.kind == "semantic"  # "kind" is not part of the submitted payload
+    assert failure.check == "column_semantics"
+    assert "delivery_dttm" in failure.hint
+
+
+# =====================================================
+# 3. FAIL with bubble_to_planner=True (table-level)
+# =====================================================
+
+
+def test_fail_table_level_bubbles(monkeypatch, no_sleep) -> None:
+    """status=FAIL, bubble_to_planner=True → bubble flag preserved on report."""
+    fake = FakeLLM(
+        [
+            _llm_response(
+                tool_calls=[
+                    _make_tool_call("submit_evaluation", _valid_fail_table_submit())
+                ]
+            )
+        ]
+    )
+    _patch_llm(monkeypatch, fake)
+
+    result = _run_entity_critic(fake)
+
+    assert result.success is True  # success is expected even for a FAIL verdict
+    assert result.report is not None
+    assert result.report.status == "FAIL"
+    assert result.report.bubble_to_planner is True  # flag is kept on a FAIL report
+    assert len(result.report.failures) == 1
+    failure = result.report.failures[0]
+    assert failure.check == "table_selection"
+    assert "labour_delivery" in failure.hint
+
+
+# =====================================================
+# 4. PASS with bubble_to_planner=True is demoted
+# =====================================================
+
+
+def test_demotes_pass_with_bubble(monkeypatch, no_sleep) -> None:
+    """A PASS verdict that asks to bubble is demoted to bubble=False."""
+    bad_pass = _valid_pass_submit()  # fresh dict, so mutating it is safe
+    bad_pass["bubble_to_planner"] = True  # contradictory: PASS that asks to bubble
+
+    fake = FakeLLM(
+        [
+            _llm_response(
+                tool_calls=[_make_tool_call("submit_evaluation", bad_pass)]
+            )
+        ]
+    )
+    _patch_llm(monkeypatch, fake)
+
+    result = _run_entity_critic(fake)
+
+    assert result.success is True
+    assert result.report is not None
+    assert result.report.status == "PASS"  # the verdict itself is kept
+    # The bubble flag must have been demoted.
+    assert result.report.bubble_to_planner is False
+
+
+# =====================================================
+# 5. FAIL with empty failures[] synthesises one
+# =====================================================
+
+
+def test_fail_without_failures_synthesises_one(monkeypatch, no_sleep) -> None:
+    """status=FAIL with empty failures[] gets a generic semantic failure
+    synthesised so the report stays coherent."""
+    fail_no_failures = {  # FAIL verdict that gives reasoning but lists no failures
+        "status": "FAIL",
+        "failures": [],
+        "bubble_to_planner": False,
+        "reasoning": "Something is off but I can't pinpoint it.",
+    }
+
+    fake = FakeLLM(
+        [
+            _llm_response(
+                tool_calls=[
+                    _make_tool_call("submit_evaluation", fail_no_failures)
+                ]
+            )
+        ]
+    )
+    _patch_llm(monkeypatch, fake)
+
+    result = _run_entity_critic(fake)
+
+    assert result.success is True
+    assert result.report is not None
+    assert result.report.status == "FAIL"
+    assert len(result.report.failures) == 1  # exactly one synthetic entry expected
+    f = result.report.failures[0]
+    assert f.kind == "semantic"
+    assert f.check == "semantic_audit"
+    # The reasoning is folded into the synthetic failure's hint when present.
+    assert "Something is off" in f.hint
+
+
+# =====================================================
+# 6. Invalid status does NOT terminate — agent retries
+# =====================================================
+
+
+def test_invalid_status_rejected(monkeypatch, no_sleep):
+    """A submit with status='UNKNOWN' must NOT terminate the loop; the
+    Critic must keep going and a follow-up submit with a valid status
+    should succeed.
+    """
+    fake = FakeLLM(
+        [
+            # Turn 1: invalid status → handler returns success=False, loop continues.
+            _llm_response(
+                tool_calls=[
+                    _make_tool_call(
+                        "submit_evaluation",
+                        {
+                            "status": "UNKNOWN",
+                            "failures": [],
+                            "bubble_to_planner": False,
+                            "reasoning": "n/a",
+                        },
+                        tc_id="bad",
+                    )
+                ]
+            ),
+            # Turn 2: valid PASS submit → terminates.
+            _llm_response(
+                tool_calls=[
+                    _make_tool_call(
+                        "submit_evaluation",
+                        _valid_pass_submit(),
+                        tc_id="good",
+                    )
+                ]
+            ),
+        ]
+    )
+    _patch_llm(monkeypatch, fake)
+
+    result = _run_entity_critic(fake)
+
+    assert result.success is True
+    assert result.iterations == 2
+    assert result.report is not None
+    assert result.report.status == "PASS"
+    # Both submit attempts left tool_call + tool_result steps (4 total).
+    assert len(result.steps) == 4
+
+    # The corrective tool message on the 2nd LLM call must contain the
+    # "invalid status" error so the LLM sees why its first attempt failed.
+    assert fake.last_messages is not None
+    tool_messages = [m for m in fake.last_messages if m.get("role") == "tool"]
+    assert tool_messages, "expected at least one tool message on the 2nd call"
+    first_tool_msg = tool_messages[0].get("content", "")
+    parsed = json.loads(first_tool_msg)
+    assert parsed.get("success") is False
+    assert "invalid status" in parsed.get("error", "")
+
+
+# =====================================================
+# 7. Text without terminal call → failure
+# =====================================================
+
+
+def test_text_without_terminal_fails(monkeypatch, no_sleep):
+    """A plain-text response is treated as failure — the Critic must
+    terminate via submit_evaluation.
+    """
+    fake = FakeLLM(
+        [_llm_response(content="I am thinking…", finish_reason="stop")]
+    )
+    _patch_llm(monkeypatch, fake)
+
+    result = _run_entity_critic(fake)
+
+    assert result.success is False
+    assert result.iterations == 1
+    assert result.report is None
+    assert "without submitting evaluation" in result.error
+    assert any(s.step_type == "output" for s in result.steps)
+
+
+# =====================================================
+# 8. Iteration-budget exhaustion → failure
+# =====================================================
+
+
+def test_exhausts_budget(monkeypatch, no_sleep):
+    """Endless sample_table calls with max_iterations=3 → fail with
+    ``iteration budget`` and three iterations of steps recorded."""
+    fake = CyclingFakeLLM(
+        [
+            _llm_response(
+                tool_calls=[
+                    _make_tool_call(
+                        "sample_table",
+                        {"full_name": "cat.sch.mothers"},
+                        tc_id="probe",
+                    )
+                ]
+            )
+        ]
+    )
+    _patch_llm(monkeypatch, fake)
+
+    class FakeClient:
+        def execute_query(self, sql):
+            return [{"nhs_number": "1234567890"}]
+
+    result = _run_entity_critic(fake, max_iterations=3, client=FakeClient())
+
+    assert result.success is False
+    assert result.iterations == 3
+    assert result.report is None
+    assert "iteration budget" in result.error
+    # 3 iterations × (tool_call + tool_result) = 6 steps.
+    assert len(result.steps) == 6
+
+
+# =====================================================
+# 9. User prompt surfaces stage1 metrics
+# =====================================================
+
+
+def test_user_prompt_includes_stage1_metrics(monkeypatch, no_sleep):
+    """The first LLM call's user message must contain stage1 metric values
+    so the Critic sees the structural context."""
+    fake = FakeLLM(
+        [
+            _llm_response(
+                tool_calls=[
+                    _make_tool_call("submit_evaluation", _valid_pass_submit())
+                ]
+            )
+        ]
+    )
+    _patch_llm(monkeypatch, fake)
+
+    _run_entity_critic(
+        fake,
+        stage1_metrics={"row_count": 1234, "distinct_ids": 1234, "null_ids": 0},
+    )
+
+    assert fake.first_messages is not None
+    assert fake.first_messages[0]["role"] == "system"
+    assert fake.first_messages[1]["role"] == "user"
+    user_content = fake.first_messages[1]["content"]
+    assert "1234" in user_content
+    assert "STRUCTURAL CHECK METRICS" in user_content
+
+
+# =====================================================
+# 10. Relationship audit surfaces domain/range
+# =====================================================
+
+
+def test_user_prompt_distinguishes_entity_vs_relationship(monkeypatch, no_sleep):
+    """When item_kind='relationship', the user prompt must include the
+    'domain' and 'range' lines that an entity prompt would not have."""
+    fake = FakeLLM(
+        [
+            _llm_response(
+                tool_calls=[
+                    _make_tool_call("submit_evaluation", _valid_pass_submit())
+                ]
+            )
+        ]
+    )
+    _patch_llm(monkeypatch, fake)
+
+    run_critic(
+        host="https://x",
+        token="t",
+        endpoint_name="ep",
+        client=None,
+        item_kind="relationship",
+        item_uri=_REL_URI,
+        item_definition=_relationship_definition(),
+        submitted_mapping=_relationship_submitted_mapping(),
+        source_model_slice=_source_model_slice(),
+        stage1_metrics=_stage1_metrics(),
+    )
+
+    assert fake.first_messages is not None
+    user_content = fake.first_messages[1]["content"]
+    # The kind is surfaced explicitly.
+    assert "relationship" in user_content
+    # The relationship-specific domain/range sections appear.
+    assert "domain:" in user_content
+    assert "range:" in user_content
+    # And it is framed as a relationship submitted mapping.
+    assert "SUBMITTED MAPPING (relationship)" in user_content
+    # The relationship endpoint columns are surfaced too.
+    assert "source_id_column" in user_content
+    assert "target_id_column" in user_content
+
+
+# =====================================================
+# 11. Step recording invariants
+# =====================================================
+
+
+def test_records_steps(monkeypatch, no_sleep):
+    """Every tool-calling iteration produces one ``tool_call`` step
+    immediately followed by one ``tool_result`` step with the same tool_name."""
+    fake = FakeLLM(
+        [
+            _llm_response(
+                tool_calls=[
+                    _make_tool_call(
+                        "sample_table",
+                        {"full_name": "cat.sch.mothers"},
+                        tc_id="a",
+                    )
+                ]
+            ),
+            _llm_response(
+                tool_calls=[
+                    _make_tool_call(
+                        "submit_evaluation",
+                        _valid_pass_submit(),
+                        tc_id="b",
+                    )
+                ]
+            ),
+        ]
+    )
+    _patch_llm(monkeypatch, fake)
+
+    class FakeClient:
+        def execute_query(self, sql):
+            return [{"nhs_number": "1234567890"}]
+
+    result = _run_entity_critic(fake, client=FakeClient())
+
+    assert result.success is True
+    steps = list(result.steps)
+    # Guard first: an empty step list would make the pairing loop pass vacuously.
+    assert steps and len(steps) % 2 == 0
+    for call_step, result_step in zip(steps[::2], steps[1::2]):
+        assert isinstance(call_step, CriticStep)
+        assert isinstance(result_step, CriticStep)
+        assert call_step.step_type == "tool_call"
+        assert result_step.step_type == "tool_result"
+        assert call_step.tool_name == result_step.tool_name
diff --git a/tests/agents/agent_mapping_pge/test_deterministic_evaluator.py b/tests/agents/agent_mapping_pge/test_deterministic_evaluator.py
new file mode 100644
index 00000000..94ec3f35
--- /dev/null
+++ b/tests/agents/agent_mapping_pge/test_deterministic_evaluator.py
@@ -0,0 +1,916 @@
+"""Tests for the deterministic (stage-1) evaluator of the mapping PGE pipeline.
+
+The evaluator is a pure function: it takes a submitted mapping plus an
+injectable ``execute_sql_fn`` and returns an ``EvalReport`` summarising
+structural failures.  No LLM, no Databricks connection.
+
+``execute_sql_fn`` contract (for the evaluator):
+    def execute_sql_fn(sql: str) -> dict
+returning::
+
+    {"columns": [...], "rows": [{col: value, ...}, ...]}
+
+This is the full result set — not the 3-row sample emitted by
+``agents.tools.sql.tool_execute_sql``.  The PGE orchestrator (Sprint 7) is
+responsible for wiring a runner that yields full rows.
+"""
+
+from typing import Dict, List
+
+import pytest
+
+from agents.agent_mapping_pge.contracts import EvalFailure, EvalReport
+from agents.agent_mapping_pge.evaluator.deterministic import (
+    evaluate_entity_mapping,
+    evaluate_relationship_mapping,
+)
+from agents.agent_mapping_pge.evaluator.report import build_report
+
+
+# =====================================================
+# Fixtures
+# =====================================================
+
+
+MOTHER_CLASS = {
+    "uri": "http://ex.org/maternity#Mother",
+    "name": "Mother",
+    "attributes": [
+        {"name": "firstName"},
+        {"name": "lastName"},
+        {"name": "nhsNumber"},
+    ],
+}
+
+BABY_CLASS = {
+    "uri": "http://ex.org/maternity#Baby",
+    "name": "Baby",
+    "attributes": [
+        {"name": "birthWeight"},
+    ],
+}
+
+
+def _mother_mapping(*, attribute_mappings=None, unmapped_attributes=None):
+    mapping = {
+        "ontology_class": MOTHER_CLASS["uri"],
+        "class_name": "Mother",
+        "sql_query": "SELECT nhs_number AS ID, full_name AS Label, first_name, last_name, nhs_number FROM cat.sch.mothers",
+        "id_column": "ID",
+        "label_column": "Label",
+        "attribute_mappings": attribute_mappings
+        if attribute_mappings is not None
+        else {
+            "firstName": "first_name",
+            "lastName": "last_name",
+            "nhsNumber": "nhs_number",
+        },
+    }
+    if unmapped_attributes is not None:
+        mapping["unmapped_attributes"] = unmapped_attributes
+    return mapping
+
+
+def _baby_mapping():
+    return {
+        "ontology_class": BABY_CLASS["uri"],
+        "class_name": "Baby",
+        "sql_query": "SELECT baby_id AS ID, baby_id AS Label, birth_weight FROM cat.sch.babies",
+        "id_column": "ID",
+        "label_column": "Label",
+        "attribute_mappings": {"birthWeight": "birth_weight"},
+    }
+
+
+def _mother_to_baby_relationship():
+    return {
+        "property": "http://ex.org/maternity#hasBaby",
+        "property_name": "hasBaby",
+        "sql_query": (
+            "SELECT mother_nhs AS source_id, baby_id AS target_id "
+            "FROM cat.sch.babies"
+        ),
+        "source_id_column": "source_id",
+        "target_id_column": "target_id",
+        "source_class": MOTHER_CLASS["uri"],
+        "target_class": BABY_CLASS["uri"],
+    }
+
+
+def _make_sql_fn(table: dict):
+    """Build an ``execute_sql_fn`` stub that answers queries by SQL substring.
+
+    ``table`` maps a needle -> {"columns": [...], "rows": [...]}; the first needle
+    (in dict order) found in the SQL wins, and unmatched SQL raises AssertionError.
+    """
+
+    def fn(sql: str) -> dict:
+        hits = [payload for needle, payload in table.items() if needle in sql]
+        if not hits:
+            raise AssertionError(f"unexpected SQL in test: {sql}")
+        return hits[0]
+    return fn
+
+
+# =====================================================
+# Entity evaluator
+# =====================================================
+
+
+class TestEvaluateEntityMapping:
+    def test_pass_happy_path(self):
+        mapping = _mother_mapping()
+        sql_fn = _make_sql_fn(
+            {
+                "mothers": {
+                    "columns": ["ID", "Label", "first_name", "last_name", "nhs_number"],
+                    "rows": [
+                        {
+                            "ID": "NHS-001",
+                            "Label": "Alice Smith",
+                            "first_name": "Alice",
+                            "last_name": "Smith",
+                            "nhs_number": "NHS-001",
+                        },
+                        {
+                            "ID": "NHS-002",
+                            "Label": "Bob Jones",
+                            "first_name": "Bob",
+                            "last_name": "Jones",
+                            "nhs_number": "NHS-002",
+                        },
+                    ],
+                }
+            }
+        )
+
+        report = evaluate_entity_mapping(
+            mapping=mapping,
+            ontology_class=MOTHER_CLASS,
+            execute_sql_fn=sql_fn,
+        )
+
+        assert isinstance(report, EvalReport)
+        assert report.status == "PASS"
+        assert report.stage == "deterministic"
+        assert report.failures == []
+        assert report.bubble_to_planner is False
+        assert report.metrics["row_count"] == 2
+        assert report.metrics["distinct_id_count"] == 2
+        assert report.metrics["null_id_count"] == 0
+        assert report.metrics["unmapped_attribute_pct"] == 0.0
+
+    def test_fail_row_count_zero_bubbles_to_planner(self):
+        mapping = _mother_mapping()
+        sql_fn = _make_sql_fn(
+            {"mothers": {"columns": ["ID", "Label"], "rows": []}}
+        )
+
+        report = evaluate_entity_mapping(
+            mapping=mapping,
+            ontology_class=MOTHER_CLASS,
+            execute_sql_fn=sql_fn,
+        )
+
+        assert report.status == "FAIL"
+        assert report.bubble_to_planner is True
+        check_names = [f.check for f in report.failures]
+        assert "row_count" in check_names
+
+    def test_fail_duplicate_ids(self):
+        mapping = _mother_mapping()
+        sql_fn = _make_sql_fn(
+            {
+                "mothers": {
+                    "columns": ["ID", "Label", "first_name", "last_name", "nhs_number"],
+                    "rows": [
+                        {
+                            "ID": "NHS-001",
+                            "Label": "Alice",
+                            "first_name": "Alice",
+                            "last_name": "Smith",
+                            "nhs_number": "NHS-001",
+                        },
+                        {
+                            "ID": "NHS-001",
+                            "Label": "Alice dup",
+                            "first_name": "Alice",
+                            "last_name": "Smith",
+                            "nhs_number": "NHS-001",
+                        },
+                    ],
+                }
+            }
+        )
+
+        report = evaluate_entity_mapping(
+            mapping=mapping,
+            ontology_class=MOTHER_CLASS,
+            execute_sql_fn=sql_fn,
+        )
+
+        assert report.status == "FAIL"
+        assert report.bubble_to_planner is False
+        check_names = [f.check for f in report.failures]
+        assert "distinct_id_count" in check_names
+
+    def test_fail_null_ids(self):
+        mapping = _mother_mapping()
+        sql_fn = _make_sql_fn(
+            {
+                "mothers": {
+                    "columns": ["ID", "Label", "first_name", "last_name", "nhs_number"],
+                    "rows": [
+                        {
+                            "ID": None,
+                            "Label": "Alice",
+                            "first_name": "Alice",
+                            "last_name": "Smith",
+                            "nhs_number": None,
+                        },
+                        {
+                            "ID": "NHS-002",
+                            "Label": "Bob",
+                            "first_name": "Bob",
+                            "last_name": "Jones",
+                            "nhs_number": "NHS-002",
+                        },
+                    ],
+                }
+            }
+        )
+
+        report = evaluate_entity_mapping(
+            mapping=mapping,
+            ontology_class=MOTHER_CLASS,
+            execute_sql_fn=sql_fn,
+        )
+
+        assert report.status == "FAIL"
+        check_names = [f.check for f in report.failures]
+        assert "null_id_count" in check_names
+
+    def test_fail_unmapped_attribute(self):
+        # Omit lastName from attribute_mappings, no unmapped_attributes list.
+        mapping = _mother_mapping(
+            attribute_mappings={
+                "firstName": "first_name",
+                "nhsNumber": "nhs_number",
+            },
+        )
+        sql_fn = _make_sql_fn(
+            {
+                "mothers": {
+                    "columns": ["ID", "Label", "first_name", "last_name", "nhs_number"],
+                    "rows": [
+                        {
+                            "ID": "NHS-001",
+                            "Label": "Alice",
+                            "first_name": "Alice",
+                            "last_name": "Smith",
+                            "nhs_number": "NHS-001",
+                        },
+                    ],
+                }
+            }
+        )
+
+        report = evaluate_entity_mapping(
+            mapping=mapping,
+            ontology_class=MOTHER_CLASS,
+            execute_sql_fn=sql_fn,
+        )
+
+        assert report.status == "FAIL"
+        check_names = [f.check for f in report.failures]
+        assert "unmapped_attribute_pct" in check_names
+        # 1 of 3 attributes missing -> ~0.333
+        assert report.metrics["unmapped_attribute_pct"] == pytest.approx(1 / 3)
+
+    def test_pass_when_unmapped_attribute_is_declared(self):
+        mapping = _mother_mapping(
+            attribute_mappings={
+                "firstName": "first_name",
+                "nhsNumber": "nhs_number",
+            },
+            unmapped_attributes=["lastName"],
+        )
+        sql_fn = _make_sql_fn(
+            {
+                "mothers": {
+                    "columns": ["ID", "Label", "first_name", "last_name", "nhs_number"],
+                    "rows": [
+                        {
+                            "ID": "NHS-001",
+                            "Label": "Alice",
+                            "first_name": "Alice",
+                            "last_name": "Smith",
+                            "nhs_number": "NHS-001",
+                        },
+                    ],
+                }
+            }
+        )
+
+        report = evaluate_entity_mapping(
+            mapping=mapping,
+            ontology_class=MOTHER_CLASS,
+            execute_sql_fn=sql_fn,
+        )
+
+        assert report.status == "PASS"
+        assert report.metrics["unmapped_attribute_pct"] == 0.0
+
+    def test_pass_when_unmapped_attribute_is_declared_as_dict(self):
+        """The Generator may emit unmapped_attributes as [{name, reason}, ...].
+        Hashing dicts would crash the evaluator — names must be extracted."""
+        mapping = _mother_mapping(
+            attribute_mappings={
+                "firstName": "first_name",
+                "nhsNumber": "nhs_number",
+            },
+            unmapped_attributes=[
+                {"name": "lastName", "reason": "no source column"}
+            ],
+        )
+        sql_fn = _make_sql_fn(
+            {
+                "mothers": {
+                    "columns": ["ID", "Label", "first_name", "last_name", "nhs_number"],
+                    "rows": [
+                        {
+                            "ID": "NHS-001",
+                            "Label": "Alice",
+                            "first_name": "Alice",
+                            "last_name": "Smith",
+                            "nhs_number": "NHS-001",
+                        },
+                    ],
+                }
+            }
+        )
+
+        report = evaluate_entity_mapping(
+            mapping=mapping,
+            ontology_class=MOTHER_CLASS,
+            execute_sql_fn=sql_fn,
+        )
+
+        assert report.status == "PASS"
+        assert report.metrics["unmapped_attribute_pct"] == 0.0
+
+    def test_report_is_json_serialisable(self):
+        mapping = _mother_mapping()
+        sql_fn = _make_sql_fn(
+            {
+                "mothers": {
+                    "columns": ["ID", "Label", "first_name", "last_name", "nhs_number"],
+                    "rows": [
+                        {
+                            "ID": "NHS-001",
+                            "Label": "Alice",
+                            "first_name": "Alice",
+                            "last_name": "Smith",
+                            "nhs_number": "NHS-001",
+                        },
+                    ],
+                }
+            }
+        )
+
+        report = evaluate_entity_mapping(
+            mapping=mapping,
+            ontology_class=MOTHER_CLASS,
+            execute_sql_fn=sql_fn,
+        )
+        d = report.to_dict()
+        assert d["status"] == "PASS"
+        assert d["stage"] == "deterministic"
+        assert isinstance(d["metrics"], dict)
+        assert isinstance(d["failures"], list)
+
+    def test_sql_execution_error_becomes_fail_not_crash(self):
+        """A mapping whose SQL parses but fails at runtime (e.g. a UNION
+        type mismatch) must yield a FAIL report with the error as a hint —
+        never propagate and crash the agent run.
+        """
+        mapping = _mother_mapping()
+
+        def boom(sql: str) -> dict:
+            raise RuntimeError(
+                "[CAST_INVALID_INPUT] The value 'x-preg-1-baby' of the type "
+                '"STRING" cannot be cast to "BIGINT"'
+            )
+
+        report = evaluate_entity_mapping(
+            mapping=mapping,
+            ontology_class=MOTHER_CLASS,
+            execute_sql_fn=boom,
+        )
+
+        assert report.status == "FAIL"
+        # A runtime SQL error is the Generator's to fix, not a re-plan trigger.
+        assert report.bubble_to_planner is False
+        checks = [f.check for f in report.failures]
+        assert "sql_execution" in checks
+        # The underlying DB error is surfaced for the generator to act on.
+        assert "CAST_INVALID_INPUT" in report.metrics.get("sql_error", "")
+        # Report must remain JSON-serialisable.
+        import json as _json
+
+        _json.dumps(report.to_dict())
+
+
+# =====================================================
+# Relationship evaluator
+# =====================================================
+
+
+def _entity_rows(ids):
+    """Return a two-column result set in which each row's Label equals its ID."""
+    columns = ["ID", "Label"]
+    rows = [dict.fromkeys(columns, entity_id) for entity_id in ids]
+    return {"columns": columns, "rows": rows}
+
+
+class TestEvaluateRelationshipMapping:
+    def test_pass_happy_path(self):
+        rel = _mother_to_baby_relationship()
+        sql_fn = _make_sql_fn(
+            {
+                # Relationship edges
+                "source_id": {
+                    "columns": ["source_id", "target_id"],
+                    "rows": [
+                        {"source_id": "NHS-001", "target_id": "B-1"},
+                        {"source_id": "NHS-002", "target_id": "B-2"},
+                    ],
+                },
+                # Source entity universe
+                "mothers": _entity_rows(["NHS-001", "NHS-002", "NHS-003"]),
+                # Target entity universe
+                "babies": _entity_rows(["B-1", "B-2", "B-3"]),
+            }
+        )
+
+        report = evaluate_relationship_mapping(
+            mapping=rel,
+            source_entity_mapping=_mother_mapping(),
+            target_entity_mapping=_baby_mapping(),
+            execute_sql_fn=sql_fn,
+        )
+
+        assert report.status == "PASS"
+        assert report.bubble_to_planner is False
+        assert report.metrics["total_edges"] == 2
+        assert report.metrics["dangling_source_pct"] == 0.0
+        assert report.metrics["dangling_target_pct"] == 0.0
+
+    def test_sql_execution_error_becomes_fail_not_crash(self):
+        """A relationship (or its endpoint-universe) SQL that errors at
+        runtime must yield a FAIL report, not crash the agent run.
+        """
+        rel = _mother_to_baby_relationship()
+
+        def boom(sql: str) -> dict:
+            raise RuntimeError("[UNRESOLVED_COLUMN] cannot resolve `target_id`")
+
+        report = evaluate_relationship_mapping(
+            mapping=rel,
+            source_entity_mapping=_mother_mapping(),
+            target_entity_mapping=_baby_mapping(),
+            execute_sql_fn=boom,
+        )
+
+        assert report.status == "FAIL"
+        assert report.bubble_to_planner is False
+        assert "sql_execution" in [f.check for f in report.failures]
+        assert "UNRESOLVED_COLUMN" in report.metrics.get("sql_error", "")
+
+    def test_fail_47_pct_dangling_source_bubbles(self):
+        rel = _mother_to_baby_relationship()
+        # 100 edges, 47 source_ids unknown to source universe (NHS-054..NHS-100).
+        edge_rows = [
+            {"source_id": f"NHS-{i:03d}", "target_id": f"B-{i}"}
+            for i in range(1, 101)
+        ]
+        # Only NHS-001..NHS-053 exist as mothers.
+        mother_ids = [f"NHS-{i:03d}" for i in range(1, 54)]
+        baby_ids = [f"B-{i}" for i in range(1, 201)]
+
+        sql_fn = _make_sql_fn(
+            {
+                "source_id": {
+                    "columns": ["source_id", "target_id"],
+                    "rows": edge_rows,
+                },
+                "mothers": _entity_rows(mother_ids),
+                "babies": _entity_rows(baby_ids),
+            }
+        )
+
+        report = evaluate_relationship_mapping(
+            mapping=rel,
+            source_entity_mapping=_mother_mapping(),
+            target_entity_mapping=_baby_mapping(),
+            execute_sql_fn=sql_fn,
+        )
+
+        assert report.status == "FAIL"
+        assert report.bubble_to_planner is False  # 0.47 < 0.5: must NOT bubble, despite the test name
+        check_names = [f.check for f in report.failures]
+        assert "dangling_source_pct" in check_names
+        assert report.metrics["dangling_source_pct"] == pytest.approx(0.47)
+
+    def test_fail_above_50_pct_dangling_source_bubbles_to_planner(self):
+        rel = _mother_to_baby_relationship()
+        edge_rows = [
+            {"source_id": f"NHS-{i:03d}", "target_id": f"B-{i}"}
+            for i in range(1, 101)
+        ]
+        # Only NHS-001..NHS-040 are known mothers -> 60% dangling
+        mother_ids = [f"NHS-{i:03d}" for i in range(1, 41)]
+        baby_ids = [f"B-{i}" for i in range(1, 201)]
+
+        sql_fn = _make_sql_fn(
+            {
+                "source_id": {
+                    "columns": ["source_id", "target_id"],
+                    "rows": edge_rows,
+                },
+                "mothers": _entity_rows(mother_ids),
+                "babies": _entity_rows(baby_ids),
+            }
+        )
+
+        report = evaluate_relationship_mapping(
+            mapping=rel,
+            source_entity_mapping=_mother_mapping(),
+            target_entity_mapping=_baby_mapping(),
+            execute_sql_fn=sql_fn,
+        )
+
+        assert report.status == "FAIL"
+        assert report.bubble_to_planner is True
+
+    def test_pass_3_pct_dangling_source_under_threshold(self):
+        rel = _mother_to_baby_relationship()
+        # 100 edges, only 3 source ids not in mother universe -> 3%.
+        edge_rows = [
+            {"source_id": f"NHS-{i:03d}", "target_id": f"B-{i}"}
+            for i in range(1, 101)
+        ]
+        mother_ids = [f"NHS-{i:03d}" for i in range(1, 98)]  # 97 known, 3 dangling
+        baby_ids = [f"B-{i}" for i in range(1, 201)]
+
+        sql_fn = _make_sql_fn(
+            {
+                "source_id": {
+                    "columns": ["source_id", "target_id"],
+                    "rows": edge_rows,
+                },
+                "mothers": _entity_rows(mother_ids),
+                "babies": _entity_rows(baby_ids),
+            }
+        )
+
+        report = evaluate_relationship_mapping(
+            mapping=rel,
+            source_entity_mapping=_mother_mapping(),
+            target_entity_mapping=_baby_mapping(),
+            execute_sql_fn=sql_fn,
+        )
+
+        assert report.status == "PASS"
+        assert report.bubble_to_planner is False
+        assert report.metrics["dangling_source_pct"] == pytest.approx(0.03)
+
+    def test_fail_zero_edges_bubbles_to_planner(self):
+        rel = _mother_to_baby_relationship()
+        sql_fn = _make_sql_fn(
+            {
+                "source_id": {"columns": ["source_id", "target_id"], "rows": []},
+                "mothers": _entity_rows(["NHS-001"]),
+                "babies": _entity_rows(["B-1"]),
+            }
+        )
+
+        report = evaluate_relationship_mapping(
+            mapping=rel,
+            source_entity_mapping=_mother_mapping(),
+            target_entity_mapping=_baby_mapping(),
+            execute_sql_fn=sql_fn,
+        )
+
+        assert report.status == "FAIL"
+        assert report.bubble_to_planner is True
+        check_names = [f.check for f in report.failures]
+        assert "total_edges" in check_names
+
+    def test_cross_source_band_fail_when_outside(self):
+        rel = _mother_to_baby_relationship()
+        # 100 edges, all source ids in mother universe.
+        edge_rows = [
+            {"source_id": f"NHS-{i:03d}", "target_id": f"B-{i}"}
+            for i in range(1, 101)
+        ]
+        sql_fn = _make_sql_fn(
+            {
+                "source_id": {
+                    "columns": ["source_id", "target_id"],
+                    "rows": edge_rows,
+                },
+                "mothers": _entity_rows([f"NHS-{i:03d}" for i in range(1, 101)]),
+                "babies": _entity_rows([f"B-{i}" for i in range(1, 101)]),
+            }
+        )
+
+        report = evaluate_relationship_mapping(
+            mapping=rel,
+            source_entity_mapping=_mother_mapping(),
+            target_entity_mapping=_baby_mapping(),
+            execute_sql_fn=sql_fn,
+            expected_cross_source_overlap_band=(0.25, 0.4),
+        )
+        # overlap_pct = 1.0 (every source row matches a target id); outside band.
+        assert report.status == "FAIL"
+        check_names = [f.check for f in report.failures]
+        assert "cross_source_overlap_pct" in check_names
+
+    def test_cross_source_band_pass_when_inside(self):
+        rel = _mother_to_baby_relationship()
+        # Only the first 30 of 100 edges target a known baby id (B-1..B-30); the
+        # remaining 70 target ids (X-31..X-100) fall outside the babies universe.
+        edge_rows = [
+            {
+                "source_id": f"NHS-{i:03d}",
+                "target_id": f"B-{i}" if i <= 30 else f"X-{i}",
+            }
+            for i in range(1, 101)
+        ]
+        sql_fn = _make_sql_fn(
+            {
+                "source_id": {
+                    "columns": ["source_id", "target_id"],
+                    "rows": edge_rows,
+                },
+                "mothers": _entity_rows([f"NHS-{i:03d}" for i in range(1, 101)]),
+                "babies": _entity_rows([f"B-{i}" for i in range(1, 101)]),
+            }
+        )
+
+        report = evaluate_relationship_mapping(
+            mapping=rel,
+            source_entity_mapping=_mother_mapping(),
+            target_entity_mapping=_baby_mapping(),
+            execute_sql_fn=sql_fn,
+            expected_cross_source_overlap_band=(0.25, 0.4),
+        )
+        # overlap = 30/100 = 0.3, inside the (0.25, 0.4) band.
+        assert report.status == "PASS"
+        assert report.metrics["cross_source_overlap_pct"] == pytest.approx(0.3)
+
+    def test_band_present_overlap_outside_band_with_catastrophic_dangling_bubbles(self):
+        """Band miss plus dangling above 0.5 → FAIL that bubbles to the Planner.
+
+        With a (0.25, 0.4) band and a realised overlap of only 0.05, the band
+        check fails. The dangling share (0.95) is also above the 0.5 bubble
+        threshold, so the catastrophic-dangling structural failure is
+        reported alongside it and ``bubble_to_planner`` is True.
+        """
+        relationship = _mother_to_baby_relationship()
+        # Of the 100 edges only n = 1..5 use a B-<n> target id, i.e. one that
+        # exists in the babies universe: overlap 0.05, dangling_target 0.95.
+        edges = [
+            {
+                "source_id": f"NHS-{n:03d}",
+                "target_id": f"B-{n}" if n <= 5 else f"X-{n}",
+            }
+            for n in range(1, 101)
+        ]
+        fake_sql = _make_sql_fn(
+            {
+                "source_id": {
+                    "columns": ["source_id", "target_id"],
+                    "rows": edges,
+                },
+                "mothers": _entity_rows([f"NHS-{n:03d}" for n in range(1, 101)]),
+                "babies": _entity_rows([f"B-{n}" for n in range(1, 101)]),
+            }
+        )
+
+        report = evaluate_relationship_mapping(
+            mapping=relationship,
+            source_entity_mapping=_mother_mapping(),
+            target_entity_mapping=_baby_mapping(),
+            execute_sql_fn=fake_sql,
+            expected_cross_source_overlap_band=(0.25, 0.4),
+        )
+
+        assert report.status == "FAIL"
+        assert report.bubble_to_planner is True
+        assert report.metrics["dangling_target_pct"] == pytest.approx(0.95)
+        failed_checks = [failure.check for failure in report.failures]
+        # Both the band failure AND the catastrophic-dangling row must surface.
+        assert "cross_source_overlap_pct" in failed_checks
+        assert "dangling_target_pct_catastrophic" in failed_checks
+        # The strict 0.05 dangling_target_pct row is only evaluated when no
+        # band is supplied, so it must NOT appear here.
+        assert "dangling_target_pct" not in failed_checks
+
+    def test_band_present_overlap_outside_band_with_mild_dangling_does_not_bubble(self):
+        """Band miss with dangling exactly at 0.5 → FAIL without bubbling.
+
+        Only the band row fails; ``bubble_to_planner`` stays False.
+        """
+        relationship = _mother_to_baby_relationship()
+        # Half of the 100 edges (n = 1..50) hit the target universe, so overlap
+        # and dangling are both 0.50: below the (0.6, 0.8) band, yet not > 0.5.
+        edges = [
+            {
+                "source_id": f"NHS-{n:03d}",
+                "target_id": f"B-{n}" if n <= 50 else f"X-{n}",
+            }
+            for n in range(1, 101)
+        ]
+        fake_sql = _make_sql_fn(
+            {
+                "source_id": {
+                    "columns": ["source_id", "target_id"],
+                    "rows": edges,
+                },
+                "mothers": _entity_rows([f"NHS-{n:03d}" for n in range(1, 101)]),
+                "babies": _entity_rows([f"B-{n}" for n in range(1, 101)]),
+            }
+        )
+
+        report = evaluate_relationship_mapping(
+            mapping=relationship,
+            source_entity_mapping=_mother_mapping(),
+            target_entity_mapping=_baby_mapping(),
+            execute_sql_fn=fake_sql,
+            expected_cross_source_overlap_band=(0.6, 0.8),
+        )
+
+        assert report.status == "FAIL"
+        assert report.bubble_to_planner is False
+        assert report.metrics["dangling_target_pct"] == pytest.approx(0.5)
+        failed_checks = [failure.check for failure in report.failures]
+        assert "cross_source_overlap_pct" in failed_checks
+        # Dangling is not strictly > 0.5, so no catastrophic row is expected.
+        assert "dangling_target_pct_catastrophic" not in failed_checks
+
+    def test_relationship_evaluator_uses_id_universe_cache(self):
+        """A cache shared between calls spares the entity SQLs a second run."""
+        relationship = _mother_to_baby_relationship()
+        canned_fn = _make_sql_fn(
+            {
+                "source_id": {
+                    "columns": ["source_id", "target_id"],
+                    "rows": [
+                        {"source_id": "NHS-001", "target_id": "B-1"},
+                        {"source_id": "NHS-002", "target_id": "B-2"},
+                    ],
+                },
+                "mothers": _entity_rows(["NHS-001", "NHS-002", "NHS-003"]),
+                "babies": _entity_rows(["B-1", "B-2", "B-3"]),
+            }
+        )
+
+        executed: List[str] = []
+
+        def recording_fn(sql: str) -> dict:
+            executed.append(sql)
+            return canned_fn(sql)
+
+        universe_cache: Dict[str, set] = {}
+
+        # Cold cache: both entity SQLs plus the relationship SQL = 3 calls.
+        evaluate_relationship_mapping(
+            mapping=relationship,
+            source_entity_mapping=_mother_mapping(),
+            target_entity_mapping=_baby_mapping(),
+            execute_sql_fn=recording_fn,
+            id_universe_cache=universe_cache,
+        )
+        cold_run_calls = len(executed)
+        assert cold_run_calls == 3
+
+        mother_sql = _mother_mapping()["sql_query"]
+        baby_sql = _baby_mapping()["sql_query"]
+        assert mother_sql in universe_cache
+        assert baby_sql in universe_cache
+
+        # Warm cache: only the relationship SQL should run again; both
+        # entity universes are served from the cache.
+        evaluate_relationship_mapping(
+            mapping=relationship,
+            source_entity_mapping=_mother_mapping(),
+            target_entity_mapping=_baby_mapping(),
+            execute_sql_fn=recording_fn,
+            id_universe_cache=universe_cache,
+        )
+
+        warm_run_sqls = executed[cold_run_calls:]
+        assert len(warm_run_sqls) == 1
+        assert mother_sql not in warm_run_sqls
+        assert baby_sql not in warm_run_sqls
+
+    def test_band_absent_catastrophic_target_dangling_bubbles(self):
+        """Without a band, dangling_target > 0.5 fails the strict check and bubbles."""
+        relationship = _mother_to_baby_relationship()
+        # Only n = 1..20 of the 100 edges use a target id from the babies
+        # universe, so dangling_target is 0.80.
+        edges = [
+            {
+                "source_id": f"NHS-{n:03d}",
+                "target_id": f"B-{n}" if n <= 20 else f"X-{n}",
+            }
+            for n in range(1, 101)
+        ]
+        fake_sql = _make_sql_fn(
+            {
+                "source_id": {
+                    "columns": ["source_id", "target_id"],
+                    "rows": edges,
+                },
+                "mothers": _entity_rows([f"NHS-{n:03d}" for n in range(1, 101)]),
+                "babies": _entity_rows([f"B-{n}" for n in range(1, 101)]),
+            }
+        )
+
+        report = evaluate_relationship_mapping(
+            mapping=relationship,
+            source_entity_mapping=_mother_mapping(),
+            target_entity_mapping=_baby_mapping(),
+            execute_sql_fn=fake_sql,
+        )
+
+        assert report.status == "FAIL"
+        assert report.bubble_to_planner is True
+        assert report.metrics["dangling_target_pct"] == pytest.approx(0.8)
+        failed_checks = [failure.check for failure in report.failures]
+        assert "dangling_target_pct" in failed_checks
+
+
+# =====================================================
+# build_report — bubble demotion warning
+# =====================================================
+
+
+def test_build_report_warns_when_bubble_demoted(caplog):
+    """A PASS report cannot bubble: the flag is demoted and a warning logged.
+
+    Reports that never asked to bubble, and FAIL reports that keep their
+    bubble flag, must not log that warning.
+    """
+    import logging
+
+    marker = "bubble_to_planner=True"
+
+    def demotion_records():
+        return [rec for rec in caplog.records if marker in rec.message]
+
+    # PASS + bubble_to_planner=True → warning expected, bubble demoted.
+    caplog.clear()
+    with caplog.at_level(logging.WARNING):
+        demoted = build_report(
+            stage="deterministic",
+            metrics={"row_count": 1},
+            failures=[],
+            bubble_to_planner=True,
+        )
+    assert demoted.status == "PASS"
+    assert demoted.bubble_to_planner is False
+    assert any(rec.levelname == "WARNING" for rec in demotion_records())
+
+    # PASS + bubble_to_planner=False → no warning.
+    caplog.clear()
+    with caplog.at_level(logging.WARNING):
+        build_report(
+            stage="deterministic",
+            metrics={"row_count": 1},
+            failures=[],
+            bubble_to_planner=False,
+        )
+    assert not demotion_records()
+
+    # FAIL + bubble_to_planner=True → no demotion, no warning.
+    caplog.clear()
+    structural_failure = EvalFailure(
+        kind="structural",
+        check="row_count",
+        expected="> 0",
+        observed="0",
+        hint="",
+    )
+    with caplog.at_level(logging.WARNING):
+        kept = build_report(
+            stage="deterministic",
+            metrics={"row_count": 0},
+            failures=[structural_failure],
+            bubble_to_planner=True,
+        )
+    assert kept.status == "FAIL"
+    assert kept.bubble_to_planner is True
+    assert not demotion_records()
diff --git a/tests/agents/agent_mapping_pge/test_engine.py b/tests/agents/agent_mapping_pge/test_engine.py
new file mode 100644
index 00000000..281e2fba
--- /dev/null
+++ b/tests/agents/agent_mapping_pge/test_engine.py
@@ -0,0 +1,1123 @@
+"""Tests for the mapping-PGE orchestrator (Sprint 7).
+
+The orchestrator wires Planner -> Generator(s) -> Evaluator(s) into a single
+``run_agent`` entry. These tests exercise the control flow with fake versions
+of each sub-agent — no real LLM, no real Databricks. Each test patches the
+module-level references in :mod:`engine` so the orchestrator calls the fakes
+instead of the production functions.
+
+What we DO exercise:
+* Happy path with both entities and relationships.
+* Planner failure aborts cleanly.
+* Generator failure records FAIL but continues.
+* Evaluator FAIL (non-bubble) drives a retry with a hint.
+* Bubble-to-planner triggers Planner re-invocation; budget is global.
+* 3-attempt retry budget exhaustion records FAIL_BUDGET.
+* Critic PASS / FAIL paths, and the ``skip_semantic_critic`` short-circuit.
+* Pre-seeded entity mappings and Planner skip[] entries.
+* on_step pct stays non-decreasing across the run.
+* Id-universe cache shares entity universes across relationships.
+"""
+
+from typing import Any, Dict, List, Optional, Tuple
+
+import pytest
+
+from agents.agent_mapping_pge import engine as engine_mod
+from agents.agent_mapping_pge.contracts import (
+    CanonicalId,
+    EvalFailure,
+    EvalReport,
+    JoinKey,
+    MappingPlan,
+    SkipItem,
+    SourceModel,
+    TableRole,
+    TableRoleCandidate,
+)
+from agents.agent_mapping_pge.engine import AgentResult, run_agent
+from agents.agent_mapping_pge.evaluator.critic import CriticResult
+from agents.agent_mapping_pge.generators.entity import EntityGenResult
+from agents.agent_mapping_pge.generators.relationship import RelationshipGenResult
+from agents.agent_mapping_pge.planner import PlannerResult
+
+
+# =====================================================
+# Ontology + SourceModel fixtures
+# =====================================================
+
+
+CUSTOMER_URI = "http://test.org/ontology#Customer"  # entity class
+ORDER_URI = "http://test.org/ontology#Order"  # entity class
+HAS_ORDER_URI = "http://test.org/ontology#hasOrder"  # relationship: Customer -> Order
+ITEM_URI = "http://test.org/ontology#Item"  # entity class
+CONTAINS_URI = "http://test.org/ontology#contains"  # relationship: Order -> Item
+
+T_CUSTOMERS = "cat.sch.customers"  # table paired with CUSTOMER_URI in _source_model
+T_ORDERS = "cat.sch.orders"  # table paired with ORDER_URI in _source_model
+T_ITEMS = "cat.sch.items"  # table paired with ITEM_URI when with_items=True
+
+
+def _ontology() -> dict:
+    """Return the toy ontology shared by the tests in this module.
+
+    Three entities (Customer, Order, Item) and two relationships:
+    ``hasOrder`` (Customer -> Order) and ``contains`` (Order -> Item).
+    Every ``label`` repeats the ``name``.
+    """
+
+    def entity(uri: str, name: str, attributes: List[dict]) -> dict:
+        return {
+            "uri": uri,
+            "name": name,
+            "label": name,
+            "attributes": attributes,
+        }
+
+    def relationship(uri: str, name: str, domain: str, range_: str) -> dict:
+        return {
+            "uri": uri,
+            "name": name,
+            "label": name,
+            "domain": domain,
+            "range": range_,
+        }
+
+    # Customer and Order each carry one string attribute; Item has none.
+    first_name = {"name": "firstName", "type": "xsd:string"}
+    order_date = {"name": "orderDate", "type": "xsd:string"}
+    return {
+        "entities": [
+            entity(CUSTOMER_URI, "Customer", [first_name]),
+            entity(ORDER_URI, "Order", [order_date]),
+            entity(ITEM_URI, "Item", []),
+        ],
+        "relationships": [
+            relationship(HAS_ORDER_URI, "hasOrder", CUSTOMER_URI, ORDER_URI),
+            relationship(CONTAINS_URI, "contains", ORDER_URI, ITEM_URI),
+        ],
+    }
+
+
+def _source_model(*, with_items: bool = False) -> SourceModel:
+    """Build the Planner output for Customer and Order; Item is opt-in."""
+    roles = [
+        TableRole(
+            table=T_CUSTOMERS,
+            ontology_class_candidates=[
+                TableRoleCandidate(uri=CUSTOMER_URI, confidence=0.9, reason="ok")
+            ],
+        ),
+        TableRole(
+            table=T_ORDERS,
+            ontology_class_candidates=[
+                TableRoleCandidate(uri=ORDER_URI, confidence=0.9, reason="ok")
+            ],
+        ),
+    ]
+    id_columns = [
+        CanonicalId(
+            ontology_class=CUSTOMER_URI,
+            canonical_column_per_table={T_CUSTOMERS: "customer_id"},
+        ),
+        CanonicalId(
+            ontology_class=ORDER_URI,
+            canonical_column_per_table={T_ORDERS: "order_id"},
+        ),
+    ]
+    joins = [
+        JoinKey(
+            from_ref=f"{T_ORDERS}.customer_id",
+            to_ref=f"{T_CUSTOMERS}.customer_id",
+            confidence=0.9,
+            overlap_pct=0.95,
+            kind="same_trust_fk",
+        ),
+    ]
+    entities = [CUSTOMER_URI, ORDER_URI]
+    relationships = [HAS_ORDER_URI]
+
+    if with_items:
+        # Extend every section with its Item counterpart in one place: table
+        # role, canonical id, items -> orders join key, and plan entries.
+        roles.append(
+            TableRole(
+                table=T_ITEMS,
+                ontology_class_candidates=[
+                    TableRoleCandidate(uri=ITEM_URI, confidence=0.9, reason="ok")
+                ],
+            )
+        )
+        id_columns.append(
+            CanonicalId(
+                ontology_class=ITEM_URI,
+                canonical_column_per_table={T_ITEMS: "item_id"},
+            )
+        )
+        joins.append(
+            JoinKey(
+                from_ref=f"{T_ITEMS}.order_id",
+                to_ref=f"{T_ORDERS}.order_id",
+                confidence=0.9,
+                overlap_pct=0.95,
+                kind="same_trust_fk",
+            )
+        )
+        entities.append(ITEM_URI)
+        relationships.append(CONTAINS_URI)
+
+    return SourceModel(
+        table_roles=roles,
+        canonical_ids=id_columns,
+        join_keys=joins,
+        mapping_plan=MappingPlan(
+            entity_order=entities,
+            relationship_order=relationships,
+            skip=[],
+        ),
+    )
+
+
+def _entity_mapping(class_uri: str, id_col: str, sql: str) -> dict:
+    """Entity mapping dict in the shape the EntityGenerator submit handler emits."""
+    return {
+        "ontology_class": class_uri,
+        "class_name": class_uri.rpartition("#")[2],  # fragment after the last "#"
+        "sql_query": sql,
+        "id_column": id_col,
+        "label_column": id_col,  # the id column doubles as the label
+        "attribute_mappings": {},
+        "unmapped_attributes": [],
+    }
+
+
+def _relationship_mapping(
+    prop_uri: str, source_col: str, target_col: str, sql: str
+) -> dict:
+    endpoints = {"domain": CUSTOMER_URI, "range_class": ORDER_URI}  # prop-independent
+    return {
+        "property": prop_uri,
+        "property_name": prop_uri.rpartition("#")[2],
+        "sql_query": sql,
+        "source_id_column": source_col,
+        "target_id_column": target_col,
+        **endpoints,
+    }
+
+
+# =====================================================
+# Fake sub-agent factories
+# =====================================================
+
+
+class FakePlanner:
+    """Stand-in for ``run_planner`` that replays canned results in order."""
+
+    def __init__(self, results: List[PlannerResult]):
+        self.results = list(results)  # private copy; drained by __call__
+        self.calls = 0
+
+    def __call__(self, *args: Any, **kwargs: Any) -> PlannerResult:
+        self.calls += 1
+        if self.results:
+            return self.results.pop(0)
+        # An invocation beyond the canned results fails the test loudly.
+        message = f"FakePlanner ran out of canned results on call #{self.calls}"
+        raise AssertionError(message)
+
+
+class FakeEntityGenerator:
+    """Stand-in entity generator replaying a per-class-URI queue of results."""
+
+    def __init__(self, results_by_uri: Dict[str, List[EntityGenResult]]):
+        self.results_by_uri = {u: list(rs) for u, rs in results_by_uri.items()}
+        # One (uri, retry_hint) pair per invocation, in call order.
+        self.calls: List[Tuple[str, Optional[str]]] = []
+
+    def __call__(self, *args: Any, **kwargs: Any) -> EntityGenResult:
+        uri = kwargs["ontology_class"].get("uri", "")
+        self.calls.append((uri, kwargs.get("retry_hint")))
+        pending = self.results_by_uri.get(uri)
+        if pending:
+            return pending.pop(0)
+        # Unknown URI or drained queue: the test asked for too many calls.
+        raise AssertionError(
+            f"FakeEntityGenerator: no canned result for {uri} (call "
+            f"#{len(self.calls)})"
+        )
+
+
+class FakeRelationshipGenerator:
+    """Stand-in relationship generator keyed by the ontology property URI."""
+
+    def __init__(self, results_by_uri: Dict[str, List[RelationshipGenResult]]):
+        self.results_by_uri = {u: list(rs) for u, rs in results_by_uri.items()}
+        # One (uri, retry_hint) pair per invocation, in call order.
+        self.calls: List[Tuple[str, Optional[str]]] = []
+
+    def __call__(self, *args: Any, **kwargs: Any) -> RelationshipGenResult:
+        uri = kwargs["ontology_property"].get("uri", "")
+        self.calls.append((uri, kwargs.get("retry_hint")))
+        pending = self.results_by_uri.get(uri)
+        if pending:
+            return pending.pop(0)
+        # Unknown URI or drained queue: the test asked for too many calls.
+        raise AssertionError(
+            f"FakeRelationshipGenerator: no canned result for {uri}"
+        )
+
+
+class FakeCritic:
+    """Stand-in critic keyed by ``item_uri``; PASSes when nothing is queued."""
+
+    def __init__(self, reports_by_uri: Dict[str, List[CriticResult]]):
+        # Copy each queue so popping never touches the caller's lists.
+        self.reports_by_uri = {u: list(rs) for u, rs in reports_by_uri.items()}
+        # Item URIs in invocation order.
+        self.calls: List[str] = []
+
+    def __call__(self, *args: Any, **kwargs: Any) -> CriticResult:
+        uri = kwargs["item_uri"]
+        self.calls.append(uri)
+        pending = self.reports_by_uri.get(uri)
+        if pending:
+            return pending.pop(0)
+        # Default: PASS so tests that don't care about the critic still work.
+        default_report = EvalReport(
+            status="PASS",
+            stage="semantic",
+            metrics={},
+            failures=[],
+            bubble_to_planner=False,
+        )
+        return CriticResult(success=True, report=default_report)
+
+
+class FakeDeterministicEvaluator:
+    """Stage-1 evaluator stub; queues are keyed by class or property URI."""
+
+    def __init__(self, reports_by_uri: Dict[str, List[EvalReport]]):
+        # Copy each queue so popping never touches the caller's lists.
+        self.reports_by_uri = {u: list(rs) for u, rs in reports_by_uri.items()}
+        self.calls: List[str] = []
+
+    def for_entity(self, *args: Any, **kwargs: Any) -> EvalReport:
+        # Entity mappings are looked up by their ``ontology_class``.
+        return self._next(kwargs["mapping"].get("ontology_class", ""))
+
+    def for_relationship(self, *args: Any, **kwargs: Any) -> EvalReport:
+        # Relationship mappings are looked up by their ``property``.
+        return self._next(kwargs["mapping"].get("property", ""))
+
+    def _next(self, uri: str) -> EvalReport:
+        self.calls.append(uri)
+        pending = self.reports_by_uri.get(uri)
+        if pending:
+            return pending.pop(0)
+        # Nothing queued for this URI: fall back to a deterministic PASS.
+        return EvalReport(
+            status="PASS",
+            stage="deterministic",
+            metrics={},
+            failures=[],
+            bubble_to_planner=False,
+        )
+
+
+# =====================================================
+# Helpers — build typical canned results
+# =====================================================
+
+
+def _ok_entity_gen(class_uri: str, sql: Optional[str] = None) -> EntityGenResult:
+    """Successful entity-generator result for ``class_uri``."""
+    id_columns = {
+        CUSTOMER_URI: "customer_id",
+        ORDER_URI: "order_id",
+        ITEM_URI: "item_id",
+    }
+    id_col = id_columns.get(class_uri, "id")
+    if not sql:
+        sql = f"SELECT {id_col} AS ID, {id_col} AS Label FROM tbl_for_{class_uri[-3:]}"
+    mapping = _entity_mapping(class_uri, id_col, sql)
+    usage = {"prompt_tokens": 10, "completion_tokens": 5}
+    return EntityGenResult(success=True, mapping=mapping, iterations=2, usage=usage)
+
+
+def _ok_rel_gen(prop_uri: str) -> RelationshipGenResult:
+    """Successful relationship-generator result with a fixed orders query."""
+    query = "SELECT customer_id AS source_id, order_id AS target_id FROM orders"
+    mapping = _relationship_mapping(prop_uri, "customer_id", "order_id", query)
+    usage = {"prompt_tokens": 10, "completion_tokens": 5}
+    return RelationshipGenResult(
+        success=True, mapping=mapping, iterations=2, usage=usage
+    )
+
+
+def _pass_report(stage: str = "deterministic") -> EvalReport:
+    passing = dict(  # 100 rows, no failures, nothing to bubble
+        status="PASS",
+        metrics={"row_count": 100},
+        failures=[],
+        bubble_to_planner=False,
+    )
+    return EvalReport(stage=stage, **passing)
+
+
+def _fail_report(
+    *,
+    stage: str = "deterministic",
+    hint: str = "wrong column",
+    bubble: bool = False,
+) -> EvalReport:
+    """FAIL report for ``stage`` carrying a single failure with ``hint``."""
+    failure = EvalFailure(
+        kind="structural" if stage == "deterministic" else "semantic",
+        check="some_check",
+        expected=">0",
+        observed="0",
+        hint=hint,
+    )
+    return EvalReport(
+        status="FAIL",
+        stage=stage,
+        metrics={"row_count": 5},
+        failures=[failure],
+        bubble_to_planner=bubble,
+    )
+
+
+# =====================================================
+# Common fixtures
+# =====================================================
+
+
+@pytest.fixture
+def fake_client() -> Any:
+    """Minimal client stub exposing the ``execute_query`` method."""
+
+    class _Client:
+        def __init__(self):
+            self.calls: List[str] = []  # every SQL string received, in order
+
+        def execute_query(self, sql: str):
+            self.calls.append(sql)
+            # Always answer with three rows so ``row_count > 0`` holds if the
+            # real evaluator is invoked. (Most tests stub the evaluator and
+            # never reach this.)
+            return [
+                {"customer_id": key, "order_id": key * 10}
+                for key in (1, 2, 3)
+            ]
+
+    return _Client()
+
+
+def _patch_sub_agents(
+    monkeypatch,
+    *,
+    planner: Any,
+    entity_gen: Any = None,
+    rel_gen: Any = None,
+    critic: Any = None,
+    det_eval: Optional[FakeDeterministicEvaluator] = None,
+) -> None:
+    """Swap the engine's sub-agent references for the supplied fakes.
+
+    Only the planner is mandatory; fakes left as ``None`` are not patched.
+    """
+    monkeypatch.setattr(engine_mod, "run_planner", planner)
+    optional = {
+        "run_entity_generator": entity_gen,
+        "run_relationship_generator": rel_gen,
+        "run_critic": critic,
+    }
+    if det_eval is not None:
+        optional["evaluate_entity_mapping"] = det_eval.for_entity
+        optional["evaluate_relationship_mapping"] = det_eval.for_relationship
+    for attr_name, fake in optional.items():
+        if fake is not None:
+            monkeypatch.setattr(engine_mod, attr_name, fake)
+
+
+def _run(client: Any, **overrides) -> AgentResult:
+    """Call ``run_agent`` with test defaults; ``overrides`` take precedence."""
+    defaults = {
+        "host": "https://test",
+        "token": "t",
+        "endpoint_name": "ep",
+        "client": client,
+        "metadata": {},
+        "ontology": _ontology(),
+        "skip_semantic_critic": True,
+    }
+    return run_agent(**{**defaults, **overrides})
+
+
+# =====================================================
+# Tests
+# =====================================================
+
+
+def test_happy_path_two_entities_one_relationship(monkeypatch, fake_client):
+    """Two entities and one relationship all PASS on the first attempt."""
+    # The planner and every generator hold exactly one canned result, so an
+    # unexpected retry would drain a fake and raise AssertionError.
+    source_model = _source_model()
+    _patch_sub_agents(
+        monkeypatch,
+        planner=FakePlanner(
+            [PlannerResult(success=True, source_model=source_model, iterations=1)]
+        ),
+        entity_gen=FakeEntityGenerator(
+            {
+                CUSTOMER_URI: [_ok_entity_gen(CUSTOMER_URI)],
+                ORDER_URI: [_ok_entity_gen(ORDER_URI)],
+            }
+        ),
+        rel_gen=FakeRelationshipGenerator(
+            {HAS_ORDER_URI: [_ok_rel_gen(HAS_ORDER_URI)]}
+        ),
+        det_eval=FakeDeterministicEvaluator({}),  # every report defaults to PASS
+    )
+
+    result = _run(fake_client)
+
+    assert result.success is True
+    assert len(result.entity_mappings) == 2
+    assert len(result.relationship_mappings) == 1
+    mapped_classes = {m["ontology_class"] for m in result.entity_mappings}
+    assert mapped_classes == {CUSTOMER_URI, ORDER_URI}
+    # One run-log entry per item (2 entities + 1 relationship), all PASS.
+    final_statuses = [entry["final_status"] for entry in result.mapping_run_log]
+    assert final_statuses == ["PASS"] * 3
+    # The source model is serialised onto the result.
+    assert result.source_model is not None
+    assert "table_roles" in result.source_model
+
+
+def test_planner_failure_aborts(monkeypatch, fake_client):
+    """A failed Planner aborts the run and surfaces its error message."""
+    planner_error = "LLM rejected tools"
+    failed_plan = PlannerResult(success=False, source_model=None, error=planner_error)
+    _patch_sub_agents(monkeypatch, planner=FakePlanner([failed_plan]))
+
+    result = _run(fake_client)
+
+    assert result.success is False
+    assert planner_error in result.error
+    assert result.entity_mappings == []
+    assert result.relationship_mappings == []
+
+
+def test_generator_failure_records_item_failure_continues_run(
+    monkeypatch, fake_client
+):
+    """A generator that keeps failing costs only its own item, not the run."""
+    planner = FakePlanner(
+        [PlannerResult(success=True, source_model=_source_model(), iterations=1)]
+    )
+    # Customer's generator fails on all three attempts; Order's succeeds.
+    crashed = EntityGenResult(success=False, mapping=None, error="generator crashed")
+    entity_gen = FakeEntityGenerator(
+        {
+            CUSTOMER_URI: [crashed] * 3,
+            ORDER_URI: [_ok_entity_gen(ORDER_URI)],
+        }
+    )
+    rel_gen = FakeRelationshipGenerator({HAS_ORDER_URI: [_ok_rel_gen(HAS_ORDER_URI)]})
+    _patch_sub_agents(
+        monkeypatch,
+        planner=planner,
+        entity_gen=entity_gen,
+        rel_gen=rel_gen,
+        det_eval=FakeDeterministicEvaluator({}),
+    )
+
+    result = _run(fake_client)
+
+    mapped_classes = {m["ontology_class"] for m in result.entity_mappings}
+    # Order is mapped even though Customer never produced a mapping.
+    assert ORDER_URI in mapped_classes
+    assert CUSTOMER_URI not in mapped_classes
+    customer_log = next(e for e in result.mapping_run_log if e["item"] == CUSTOMER_URI)
+    assert customer_log["final_status"] == "FAIL_BUDGET"
+    # hasOrder needs the Customer endpoint, which is missing here.
+    # NOTE(review): PASS is also accepted below; tighten this once the
+    # engine's behaviour for a missing endpoint is confirmed.
+    rel_log = next(e for e in result.mapping_run_log if e["item"] == HAS_ORDER_URI)
+    assert rel_log["final_status"] in {"FAIL_BUDGET", "PASS"}
+
+
+def test_evaluator_fail_retry_with_hint(monkeypatch, fake_client):
+    """A non-bubbling FAIL triggers a retry that receives the failure hint."""
+    retry_hint = "use customer_id, not bad_col"
+    planner = FakePlanner(
+        [PlannerResult(success=True, source_model=_source_model(), iterations=1)]
+    )
+    entity_gen = FakeEntityGenerator(
+        {
+            CUSTOMER_URI: [
+                _ok_entity_gen(CUSTOMER_URI, "SELECT bad_col AS ID FROM x"),
+                _ok_entity_gen(CUSTOMER_URI),
+            ],
+            ORDER_URI: [_ok_entity_gen(ORDER_URI)],
+        }
+    )
+    rel_gen = FakeRelationshipGenerator({HAS_ORDER_URI: [_ok_rel_gen(HAS_ORDER_URI)]})
+    # Customer's first evaluation fails without bubbling; the second passes.
+    det = FakeDeterministicEvaluator(
+        {
+            CUSTOMER_URI: [
+                _fail_report(hint=retry_hint, bubble=False),
+                _pass_report(),
+            ]
+        }
+    )
+    _patch_sub_agents(
+        monkeypatch,
+        planner=planner,
+        entity_gen=entity_gen,
+        rel_gen=rel_gen,
+        det_eval=det,
+    )
+
+    result = _run(fake_client)
+
+    assert result.success is True
+    customer_log = next(e for e in result.mapping_run_log if e["item"] == CUSTOMER_URI)
+    assert customer_log["final_status"] == "PASS"
+    assert len(customer_log["attempts"]) == 2
+    assert customer_log["attempts"][0]["stage1_status"] == "FAIL"
+    assert customer_log["attempts"][0]["hint"] == retry_hint
+    # The retry must have been handed the hint; the first call had none.
+    hints_seen = [hint for uri, hint in entity_gen.calls if uri == CUSTOMER_URI]
+    assert hints_seen[0] is None
+    assert hints_seen[1] == retry_hint
+
+
+def test_bubble_to_planner_triggers_replanning(monkeypatch, fake_client):
+    """A bubbling FAIL re-invokes the Planner; the item passes afterwards."""
+    planner = FakePlanner(
+        [
+            PlannerResult(success=True, source_model=_source_model(), iterations=1)
+            for _ in range(2)
+        ]
+    )
+    entity_gen = FakeEntityGenerator(
+        {
+            CUSTOMER_URI: [
+                _ok_entity_gen(CUSTOMER_URI),  # attempt 1 (bubbles)
+                _ok_entity_gen(CUSTOMER_URI),  # attempt 1 of replan iteration
+            ],
+            ORDER_URI: [_ok_entity_gen(ORDER_URI)],
+        }
+    )
+    rel_gen = FakeRelationshipGenerator({HAS_ORDER_URI: [_ok_rel_gen(HAS_ORDER_URI)]})
+    # Customer's first report bubbles; the one after the replan passes.
+    det = FakeDeterministicEvaluator(
+        {
+            CUSTOMER_URI: [
+                _fail_report(hint="wrong table", bubble=True),
+                _pass_report(),
+            ]
+        }
+    )
+    _patch_sub_agents(
+        monkeypatch,
+        planner=planner,
+        entity_gen=entity_gen,
+        rel_gen=rel_gen,
+        det_eval=det,
+    )
+
+    result = _run(fake_client)
+
+    assert result.success is True
+    customer_log = next(e for e in result.mapping_run_log if e["item"] == CUSTOMER_URI)
+    assert customer_log["final_status"] == "PASS"
+    # Planner was invoked twice (initial + 1 replan).
+    assert planner.calls == 2
+    assert result.stats["planner_reinvocations"] == 1
+
+
+def test_planner_reinvocation_budget_exhausted(monkeypatch, fake_client):
+    """Once both replans are spent, a still-bubbling item ends as FAIL_BUBBLE
+    while the remaining items are attempted as usual."""
+    # 3 planner results in total: initial + 2 replans (matches the global budget).
+    planner = FakePlanner(
+        [
+            PlannerResult(success=True, source_model=_source_model(), iterations=1)
+            for _ in range(3)
+        ]
+    )
+    bubbling = _fail_report(hint="wrong table", bubble=True)
+    # Customer bubbles on every attempt; 20 canned results are plenty.
+    entity_gen = FakeEntityGenerator(
+        {
+            CUSTOMER_URI: [_ok_entity_gen(CUSTOMER_URI) for _ in range(20)],
+            ORDER_URI: [_ok_entity_gen(ORDER_URI)],
+        }
+    )
+    rel_gen = FakeRelationshipGenerator({HAS_ORDER_URI: [_ok_rel_gen(HAS_ORDER_URI)]})
+    det = FakeDeterministicEvaluator(
+        {
+            CUSTOMER_URI: [bubbling] * 20,
+        }
+    )
+    _patch_sub_agents(
+        monkeypatch,
+        planner=planner,
+        entity_gen=entity_gen,
+        rel_gen=rel_gen,
+        det_eval=det,
+    )
+
+    result = _run(fake_client)
+
+    customer_log = next(e for e in result.mapping_run_log if e["item"] == CUSTOMER_URI)
+    assert customer_log["final_status"] == "FAIL_BUBBLE"
+    # 1 initial planner call + exactly 2 replans = 3 total.
+    assert planner.calls == 3
+    assert result.stats["planner_reinvocations"] == 2
+    # Other items are still attempted; Order succeeded.
+    assert ORDER_URI in {m["ontology_class"] for m in result.entity_mappings}
+
+
+def test_retry_budget_exhausted_marks_item_fail_budget(monkeypatch, fake_client):
+    planner = FakePlanner(
+        [PlannerResult(success=True, source_model=_source_model(), iterations=1)]
+    )
+    entity_gen = FakeEntityGenerator(
+        {
+            CUSTOMER_URI: [_ok_entity_gen(CUSTOMER_URI) for _ in range(5)],
+            ORDER_URI: [_ok_entity_gen(ORDER_URI)],
+        }
+    )
+    rel_gen = FakeRelationshipGenerator({HAS_ORDER_URI: [_ok_rel_gen(HAS_ORDER_URI)]})
+    det = FakeDeterministicEvaluator(
+        {
+            CUSTOMER_URI: [
+                _fail_report(hint="hint-1", bubble=False),
+                _fail_report(hint="hint-2", bubble=False),
+                _fail_report(hint="hint-3", bubble=False),
+            ],
+        }
+    )
+    _patch_sub_agents(
+        monkeypatch,
+        planner=planner,
+        entity_gen=entity_gen,
+        rel_gen=rel_gen,
+        det_eval=det,
+    )
+
+    result = _run(fake_client)
+
+    customer_log = next(
+        e for e in result.mapping_run_log if e["item"] == CUSTOMER_URI
+    )
+    assert customer_log["final_status"] == "FAIL_BUDGET"
+    assert len(customer_log["attempts"]) == 3
+    assert all(a["stage1_status"] == "FAIL" for a in customer_log["attempts"])
+    assert planner.calls == 1
+    # Order still mapped.
+    assert any(m["ontology_class"] == ORDER_URI for m in result.entity_mappings)
+
+
+def test_critic_pass_full_pipeline(monkeypatch, fake_client):
+    planner = FakePlanner(
+        [PlannerResult(success=True, source_model=_source_model(), iterations=1)]
+    )
+    entity_gen = FakeEntityGenerator(
+        {
+            CUSTOMER_URI: [_ok_entity_gen(CUSTOMER_URI)],
+            ORDER_URI: [_ok_entity_gen(ORDER_URI)],
+        }
+    )
+    rel_gen = FakeRelationshipGenerator({HAS_ORDER_URI: [_ok_rel_gen(HAS_ORDER_URI)]})
+    det = FakeDeterministicEvaluator({})  # default PASS
+    critic = FakeCritic(
+        {
+            CUSTOMER_URI: [CriticResult(success=True, report=_pass_report("semantic"))],
+            ORDER_URI: [CriticResult(success=True, report=_pass_report("semantic"))],
+            HAS_ORDER_URI: [
+                CriticResult(success=True, report=_pass_report("semantic"))
+            ],
+        }
+    )
+    _patch_sub_agents(
+        monkeypatch,
+        planner=planner,
+        entity_gen=entity_gen,
+        rel_gen=rel_gen,
+        critic=critic,
+        det_eval=det,
+    )
+
+    result = _run(fake_client, skip_semantic_critic=False)
+
+    assert result.success is True
+    customer_log = next(
+        e for e in result.mapping_run_log if e["item"] == CUSTOMER_URI
+    )
+    last_attempt = customer_log["attempts"][-1]
+    assert last_attempt["stage1_status"] == "PASS"
+    assert last_attempt["critic_status"] == "PASS"
+    # Critic was actually called.
+    assert CUSTOMER_URI in critic.calls
+
+
+def test_critic_fail_with_bubble(monkeypatch, fake_client):
+    planner = FakePlanner(
+        [
+            PlannerResult(success=True, source_model=_source_model(), iterations=1),
+            PlannerResult(success=True, source_model=_source_model(), iterations=1),
+        ]
+    )
+    entity_gen = FakeEntityGenerator(
+        {
+            CUSTOMER_URI: [
+                _ok_entity_gen(CUSTOMER_URI),  # initial attempt — critic bubbles
+                _ok_entity_gen(CUSTOMER_URI),  # post-replan attempt — passes
+            ],
+            ORDER_URI: [_ok_entity_gen(ORDER_URI)],
+        }
+    )
+    rel_gen = FakeRelationshipGenerator({HAS_ORDER_URI: [_ok_rel_gen(HAS_ORDER_URI)]})
+    det = FakeDeterministicEvaluator({})  # default PASS on stage 1
+    critic = FakeCritic(
+        {
+            CUSTOMER_URI: [
+                CriticResult(
+                    success=True,
+                    report=_fail_report(
+                        stage="semantic", hint="wrong table", bubble=True
+                    ),
+                ),
+                CriticResult(success=True, report=_pass_report("semantic")),
+            ],
+            ORDER_URI: [CriticResult(success=True, report=_pass_report("semantic"))],
+            HAS_ORDER_URI: [
+                CriticResult(success=True, report=_pass_report("semantic"))
+            ],
+        }
+    )
+    _patch_sub_agents(
+        monkeypatch,
+        planner=planner,
+        entity_gen=entity_gen,
+        rel_gen=rel_gen,
+        critic=critic,
+        det_eval=det,
+    )
+
+    result = _run(fake_client, skip_semantic_critic=False)
+
+    assert result.success is True
+    assert planner.calls == 2
+    customer_log = next(
+        e for e in result.mapping_run_log if e["item"] == CUSTOMER_URI
+    )
+    assert customer_log["final_status"] == "PASS"
+
+
+def test_skip_semantic_critic_true_skips_critic(monkeypatch, fake_client):
+    planner = FakePlanner(
+        [PlannerResult(success=True, source_model=_source_model(), iterations=1)]
+    )
+    entity_gen = FakeEntityGenerator(
+        {
+            CUSTOMER_URI: [_ok_entity_gen(CUSTOMER_URI)],
+            ORDER_URI: [_ok_entity_gen(ORDER_URI)],
+        }
+    )
+    rel_gen = FakeRelationshipGenerator({HAS_ORDER_URI: [_ok_rel_gen(HAS_ORDER_URI)]})
+    det = FakeDeterministicEvaluator({})
+    critic = FakeCritic({})  # would default-PASS if called
+    _patch_sub_agents(
+        monkeypatch,
+        planner=planner,
+        entity_gen=entity_gen,
+        rel_gen=rel_gen,
+        critic=critic,
+        det_eval=det,
+    )
+
+    result = _run(fake_client, skip_semantic_critic=True)
+
+    assert result.success is True
+    # Critic was never called.
+    assert critic.calls == []
+    # Every attempt records critic_status="skipped".
+    for entry in result.mapping_run_log:
+        for attempt in entry["attempts"]:
+            assert attempt["critic_status"] == "skipped"
+
+
+def test_preseeded_entity_skipped(monkeypatch, fake_client):
+    planner = FakePlanner(
+        [PlannerResult(success=True, source_model=_source_model(), iterations=1)]
+    )
+    entity_gen = FakeEntityGenerator(
+        {
+            # Customer must NOT be generated — it's pre-seeded.
+            ORDER_URI: [_ok_entity_gen(ORDER_URI)],
+        }
+    )
+    rel_gen = FakeRelationshipGenerator({HAS_ORDER_URI: [_ok_rel_gen(HAS_ORDER_URI)]})
+    det = FakeDeterministicEvaluator({})
+    _patch_sub_agents(
+        monkeypatch,
+        planner=planner,
+        entity_gen=entity_gen,
+        rel_gen=rel_gen,
+        det_eval=det,
+    )
+
+    pre = [
+        _entity_mapping(
+            CUSTOMER_URI,
+            "customer_id",
+            "SELECT customer_id AS ID FROM cat.sch.customers",
+        )
+    ]
+
+    result = _run(fake_client, entity_mappings=pre)
+
+    assert result.success is True
+    customer_log = next(
+        e for e in result.mapping_run_log if e["item"] == CUSTOMER_URI
+    )
+    assert customer_log["final_status"] == "PRESEEDED"
+    assert customer_log["attempts"] == []
+    # The pre-seeded mapping is still in the result list.
+    assert any(m["ontology_class"] == CUSTOMER_URI for m in result.entity_mappings)
+    # EntityGenerator never called for Customer.
+    assert not any(c[0] == CUSTOMER_URI for c in entity_gen.calls)
+
+
+def test_skip_list_honoured(monkeypatch, fake_client):
+    sm = _source_model()
+    # Planner asks the orchestrator to skip Order entirely.
+    sm.mapping_plan.skip.append(SkipItem(item=ORDER_URI, reason="no source table"))
+    sm.mapping_plan.entity_order = [CUSTOMER_URI, ORDER_URI]
+    sm.mapping_plan.relationship_order = []  # nothing depending on Order
+
+    planner = FakePlanner([PlannerResult(success=True, source_model=sm, iterations=1)])
+    entity_gen = FakeEntityGenerator(
+        {
+            CUSTOMER_URI: [_ok_entity_gen(CUSTOMER_URI)],
+            # Order MUST NOT be generated.
+        }
+    )
+    det = FakeDeterministicEvaluator({})
+    _patch_sub_agents(
+        monkeypatch,
+        planner=planner,
+        entity_gen=entity_gen,
+        rel_gen=FakeRelationshipGenerator({}),
+        det_eval=det,
+    )
+
+    result = _run(fake_client)
+
+    order_log = next(e for e in result.mapping_run_log if e["item"] == ORDER_URI)
+    assert order_log["final_status"] == "SKIPPED"
+    assert not any(c[0] == ORDER_URI for c in entity_gen.calls)
+
+
+def test_on_step_pct_monotonic(monkeypatch, fake_client):
+    planner = FakePlanner(
+        [PlannerResult(success=True, source_model=_source_model(), iterations=1)]
+    )
+    entity_gen = FakeEntityGenerator(
+        {
+            CUSTOMER_URI: [_ok_entity_gen(CUSTOMER_URI)],
+            ORDER_URI: [_ok_entity_gen(ORDER_URI)],
+        }
+    )
+    rel_gen = FakeRelationshipGenerator({HAS_ORDER_URI: [_ok_rel_gen(HAS_ORDER_URI)]})
+    det = FakeDeterministicEvaluator({})
+    _patch_sub_agents(
+        monkeypatch,
+        planner=planner,
+        entity_gen=entity_gen,
+        rel_gen=rel_gen,
+        det_eval=det,
+    )
+
+    pcts: List[int] = []
+
+    def on_step(msg: str, pct: int) -> None:
+        pcts.append(pct)
+
+    result = _run(fake_client, on_step=on_step)
+
+    assert result.success is True
+    assert pcts, "expected at least one on_step call"
+    # Monotonic non-decreasing — captures the documented design contract
+    # (we only replan on bubble, which this test does not trigger).
+    for prev, curr in zip(pcts, pcts[1:]):
+        assert curr >= prev, f"pct went backwards: {prev} -> {curr}"
+    # First call is planning at a low pct; last call is completion at 100.
+    assert pcts[0] <= 5
+    assert pcts[-1] == 100
+
+
+def test_id_universe_cache_used_across_relationships(monkeypatch, fake_client):
+    # Bare ontology with no attributes so the real deterministic evaluator
+    # doesn't fire on unmapped_attribute_pct.
+    bare_ontology = {
+        "entities": [
+            {"uri": CUSTOMER_URI, "name": "Customer", "label": "Customer", "attributes": []},
+            {"uri": ORDER_URI, "name": "Order", "label": "Order", "attributes": []},
+            {"uri": ITEM_URI, "name": "Item", "label": "Item", "attributes": []},
+        ],
+        "relationships": [
+            {
+                "uri": HAS_ORDER_URI,
+                "name": "hasOrder",
+                "label": "hasOrder",
+                "domain": CUSTOMER_URI,
+                "range": ORDER_URI,
+            },
+            {
+                "uri": CONTAINS_URI,
+                "name": "contains",
+                "label": "contains",
+                "domain": ORDER_URI,
+                "range": ITEM_URI,
+            },
+        ],
+    }
+
+    # Distinct, recognisable SQL strings per entity — used both as cache keys
+    # and as a discriminator for the CountingClient routing below.
+    customer_sql = "SELECT customer_id AS ID, customer_id AS Label FROM cat.sch.customers"
+    order_sql = "SELECT order_id AS ID, order_id AS Label FROM cat.sch.orders"
+    item_sql = "SELECT item_id AS ID, item_id AS Label FROM cat.sch.items"
+
+    planner = FakePlanner(
+        [
+            PlannerResult(
+                success=True, source_model=_source_model(with_items=True), iterations=1
+            )
+        ]
+    )
+    entity_gen = FakeEntityGenerator(
+        {
+            CUSTOMER_URI: [
+                EntityGenResult(
+                    success=True,
+                    mapping=_entity_mapping(CUSTOMER_URI, "customer_id", customer_sql),
+                    iterations=1,
+                )
+            ],
+            ORDER_URI: [
+                EntityGenResult(
+                    success=True,
+                    mapping=_entity_mapping(ORDER_URI, "order_id", order_sql),
+                    iterations=1,
+                )
+            ],
+            ITEM_URI: [
+                EntityGenResult(
+                    success=True,
+                    mapping=_entity_mapping(ITEM_URI, "item_id", item_sql),
+                    iterations=1,
+                )
+            ],
+        }
+    )
+
+    # Relationship edges return rows whose source/target values fall inside
+    # the entity universes so the deterministic evaluator passes.
+    has_order_sql = "SELECT customer_id AS source_id, order_id AS target_id FROM has_order_edge"
+    contains_sql = "SELECT order_id AS source_id, item_id AS target_id FROM contains_edge"
+
+    rel_gen = FakeRelationshipGenerator(
+        {
+            HAS_ORDER_URI: [
+                RelationshipGenResult(
+                    success=True,
+                    mapping={
+                        "property": HAS_ORDER_URI,
+                        "property_name": "hasOrder",
+                        "sql_query": has_order_sql,
+                        "source_id_column": "source_id",
+                        "target_id_column": "target_id",
+                    },
+                    iterations=1,
+                )
+            ],
+            CONTAINS_URI: [
+                RelationshipGenResult(
+                    success=True,
+                    mapping={
+                        "property": CONTAINS_URI,
+                        "property_name": "contains",
+                        "sql_query": contains_sql,
+                        "source_id_column": "source_id",
+                        "target_id_column": "target_id",
+                    },
+                    iterations=1,
+                )
+            ],
+        }
+    )
+    # Use the REAL deterministic evaluators here so the cache codepath is
+    # actually exercised against execute_sql_fn.
+    _patch_sub_agents(
+        monkeypatch,
+        planner=planner,
+        entity_gen=entity_gen,
+        rel_gen=rel_gen,
+        # No det_eval override -> real evaluators used.
+    )
+
+    class CountingClient:
+        """Records every ``execute_query`` call so we can count cache hits."""
+
+        def __init__(self):
+            self.sql_calls: List[str] = []
+
+        def execute_query(self, sql: str):
+            self.sql_calls.append(sql)
+            # Entity-universe queries return rows keyed by the entity's id_column.
+            if sql == customer_sql:
+                return [{"customer_id": i, "ID": i} for i in range(1, 4)]
+            if sql == order_sql:
+                return [{"order_id": i, "ID": i} for i in range(1, 4)]
+            if sql == item_sql:
+                return [{"item_id": i, "ID": i} for i in range(1, 4)]
+            # Edge SQLs: values must overlap with the entity universes so
+            # dangling_*_pct stays low and the report PASSes.
+            if sql == has_order_sql:
+                return [
+                    {"source_id": i, "target_id": i, "customer_id": i, "order_id": i}
+                    for i in range(1, 4)
+                ]
+            if sql == contains_sql:
+                return [
+                    {"source_id": i, "target_id": i, "order_id": i, "item_id": i}
+                    for i in range(1, 4)
+                ]
+            return []
+
+    client = CountingClient()
+    result = _run(client, ontology=bare_ontology)
+
+    assert len(result.entity_mappings) == 3, result.mapping_run_log
+    assert len(result.relationship_mappings) == 2, result.mapping_run_log
+
+    # Each unique entity SQL is run by the entity evaluator (1) + at most
+    # ONCE more from the first relationship that references it (cached for
+    # subsequent relationships).  Without the cache, each entity SQL would
+    # fire from EVERY relationship that touches it — order_sql in
+    # particular would run 1 (entity) + 2 (hasOrder + contains) = 3 times.
+    for sql in (customer_sql, order_sql, item_sql):
+        count = sum(1 for c in client.sql_calls if c == sql)
+        assert count <= 2, (
+            f"entity SQL ran {count} times — id_universe_cache failed:\n{sql}"
+        )
diff --git a/tests/agents/agent_mapping_pge/test_entity_generator.py b/tests/agents/agent_mapping_pge/test_entity_generator.py
new file mode 100644
index 00000000..f4bf4a61
--- /dev/null
+++ b/tests/agents/agent_mapping_pge/test_entity_generator.py
@@ -0,0 +1,702 @@
+"""Tests for the mapping-PGE EntityGenerator agent (Sprint 4).
+
+The Generator is a narrow tool-calling ReAct loop terminated by
+``submit_entity_mapping``. These tests exercise the loop's control flow with
+a *fake LLM* — a stub that replaces ``call_serving_endpoint`` at module
+level and returns canned tool-call responses on a per-call basis.
+
+No real HTTP, no real Databricks, no MLflow tracing.
+
+What we DO exercise:
+* Termination on a single submit call.
+* Multi-step trajectory (execute_sql → submit).
+* ``unmapped_attributes`` round-trips through the tool to the result.
+* Text-only output is treated as failure (no terminal call).
+* Iteration-budget exhaustion is treated as failure.
+* ``retry_hint`` surfaces inside the user message.
+* Step recording: every tool call produces both tool_call and tool_result
+  steps in the right order.
+"""
+
+import json
+from typing import Any, Callable, Dict, List, Optional
+
+import pytest
+
+from agents.agent_mapping_pge.generators import entity as entity_mod
+from agents.agent_mapping_pge.generators.entity import (
+    EntityGenResult,
+    EntityGenStep,
+    run_entity_generator,
+)
+
+
+# =====================================================
+# Fake LLM scaffolding (mirrors test_planner.py)
+# =====================================================
+
+
+_CLASS_URI = "http://ex.org/maternity#Mother"
+
+
+def _make_tool_call(name: str, arguments: dict, *, tc_id: str = "tc1") -> dict:
+    return {
+        "id": tc_id,
+        "type": "function",
+        "function": {"name": name, "arguments": json.dumps(arguments)},
+    }
+
+
+def _llm_response(
+    *,
+    tool_calls: Optional[List[dict]] = None,
+    content: Optional[str] = None,
+    finish_reason: str = "tool_calls",
+    usage: Optional[Dict[str, int]] = None,
+) -> dict:
+    message: Dict[str, Any] = {"role": "assistant"}
+    if tool_calls:
+        message["tool_calls"] = tool_calls
+    if content is not None:
+        message["content"] = content
+    return {
+        "choices": [{"finish_reason": finish_reason, "message": message}],
+        "usage": usage or {"prompt_tokens": 10, "completion_tokens": 5},
+    }
+
+
+class FakeLLM:
+    def __init__(self, responses: List[dict]):
+        self.responses = list(responses)
+        self.calls = 0
+        # Snapshots of the messages list from the first and the most recent
+        # call, so tests can introspect what the agent put into the prompt.
+        self.last_messages: Optional[List[dict]] = None
+        self.first_messages: Optional[List[dict]] = None
+
+    def __call__(self, *args, **kwargs) -> dict:
+        self.calls += 1
+        # ``call_serving_endpoint(host, token, endpoint, messages, ...)`` —
+        # the messages list is positional arg #3 (zero-indexed). Capture
+        # defensively in case the call site changes to kwargs.
+        msgs: Optional[List[dict]] = None
+        if len(args) >= 4 and isinstance(args[3], list):
+            msgs = args[3]
+        elif "messages" in kwargs:
+            msgs = kwargs["messages"]
+        if msgs is not None:
+            # Shallow per-message copy: later appends or top-level key edits by
+            # the loop do not alter the capture (nested values stay shared).
+            snapshot = [dict(m) for m in msgs]
+            if self.first_messages is None:
+                self.first_messages = snapshot
+            self.last_messages = snapshot
+
+        if not self.responses:
+            raise AssertionError(
+                f"FakeLLM: ran out of canned responses on call #{self.calls}"
+            )
+        return self.responses.pop(0)
+
+
+class CyclingFakeLLM:
+    """Like FakeLLM but cycles through a fixed list forever."""
+
+    def __init__(self, responses: List[dict]):
+        self.responses = list(responses)
+        self.calls = 0
+
+    def __call__(self, *args, **kwargs) -> dict:
+        resp = self.responses[self.calls % len(self.responses)]
+        self.calls += 1
+        return resp
+
+
+@pytest.fixture
+def no_sleep(monkeypatch):
+    """Neutralise the 3-second inter-iteration delay so tests run fast."""
+    monkeypatch.setattr(entity_mod.time, "sleep", lambda *_a, **_k: None)
+
+
+def _patch_llm(monkeypatch, fake: Callable[..., dict]) -> None:
+    monkeypatch.setattr(entity_mod, "call_serving_endpoint", fake)
+
+
+# =====================================================
+# Fixtures
+# =====================================================
+
+
+def _ontology_class() -> dict:
+    return {
+        "uri": _CLASS_URI,
+        "label": "Mother",
+        "name": "Mother",
+        "comment": "A mother in the maternity trust dataset.",
+        "attributes": [
+            {"name": "nhsNumber", "type": "string"},
+            {"name": "dateOfBirth", "type": "date"},
+            {"name": "ethnicity", "type": "string"},
+        ],
+    }
+
+
+def _source_model_slice() -> dict:
+    return {
+        "candidate_tables": [
+            {
+                "table": "cat.sch.mothers",
+                "confidence": 0.92,
+                "reason": "row per NHS — mother demographics",
+            }
+        ],
+        "canonical_id": {
+            "canonical_column_per_table": {"cat.sch.mothers": "nhs_number"},
+            "format_note": "10-digit NHS",
+        },
+        "relevant_joins": [],
+    }
+
+
+def _valid_submit_args(
+    *,
+    unmapped: Optional[list] = None,
+) -> dict:
+    args: Dict[str, Any] = {
+        "class_uri": _CLASS_URI,
+        "class_name": "Mother",
+        "sql_query": (
+            "SELECT nhs_number AS ID, nhs_number AS Label, nhs_number, dob, ethnicity "
+            "FROM cat.sch.mothers WHERE nhs_number IS NOT NULL"
+        ),
+        "id_column": "nhs_number",
+        "label_column": "nhs_number",
+        "attribute_mappings": {
+            "nhsNumber": "nhs_number",
+            "dateOfBirth": "dob",
+            "ethnicity": "ethnicity",
+        },
+    }
+    if unmapped is not None:
+        args["unmapped_attributes"] = unmapped
+    return args
+
+
+# =====================================================
+# 1. Single-shot submit terminates immediately
+# =====================================================
+
+
+def test_terminates_on_submit(monkeypatch, no_sleep):
+    """First LLM turn submits a valid mapping → success, iterations=1."""
+    fake = FakeLLM(
+        [
+            _llm_response(
+                tool_calls=[
+                    _make_tool_call("submit_entity_mapping", _valid_submit_args())
+                ]
+            )
+        ]
+    )
+    _patch_llm(monkeypatch, fake)
+
+    result = run_entity_generator(
+        host="https://x",
+        token="t",
+        endpoint_name="ep",
+        client=None,
+        ontology_class=_ontology_class(),
+        source_model_slice=_source_model_slice(),
+    )
+
+    assert isinstance(result, EntityGenResult)
+    assert result.success is True
+    assert result.iterations == 1
+    assert result.mapping is not None
+    assert result.mapping["ontology_class"] == _CLASS_URI
+    assert result.mapping["id_column"] == "nhs_number"
+    assert result.error == ""
+    step_kinds = [s.step_type for s in result.steps]
+    assert step_kinds == ["tool_call", "tool_result"]
+    assert result.steps[0].tool_name == "submit_entity_mapping"
+
+
+# =====================================================
+# 2. execute_sql validation, then submit
+# =====================================================
+
+
+def test_validates_sql_then_submits(monkeypatch, no_sleep):
+    """execute_sql → submit_entity_mapping → success, iterations=2."""
+    fake = FakeLLM(
+        [
+            _llm_response(
+                tool_calls=[
+                    _make_tool_call(
+                        "execute_sql",
+                        {
+                            "sql": (
+                                "SELECT nhs_number AS ID, nhs_number AS Label, "
+                                "nhs_number, dob, ethnicity FROM cat.sch.mothers "
+                                "WHERE nhs_number IS NOT NULL"
+                            )
+                        },
+                        tc_id="a",
+                    )
+                ]
+            ),
+            _llm_response(
+                tool_calls=[
+                    _make_tool_call(
+                        "submit_entity_mapping", _valid_submit_args(), tc_id="b"
+                    )
+                ]
+            ),
+        ]
+    )
+    _patch_llm(monkeypatch, fake)
+
+    class FakeClient:
+        def execute_query(self, sql):
+            return [
+                {
+                    "ID": "1234567890",
+                    "Label": "1234567890",
+                    "nhs_number": "1234567890",
+                    "dob": "1990-01-01",
+                    "ethnicity": "white",
+                }
+            ]
+
+    result = run_entity_generator(
+        host="https://x",
+        token="t",
+        endpoint_name="ep",
+        client=FakeClient(),
+        ontology_class=_ontology_class(),
+        source_model_slice=_source_model_slice(),
+    )
+
+    assert result.success is True
+    assert result.iterations == 2
+    assert result.mapping is not None
+    # Sequence: tool_call(execute_sql), tool_result(execute_sql),
+    # tool_call(submit), tool_result(submit) — 4 steps.
+    assert len(result.steps) == 4
+    assert [s.tool_name for s in result.steps] == [
+        "execute_sql",
+        "execute_sql",
+        "submit_entity_mapping",
+        "submit_entity_mapping",
+    ]
+
+
+# =====================================================
+# 3. unmapped_attributes round-trips
+# =====================================================
+
+
+def test_unmapped_attributes_round_trip(monkeypatch, no_sleep):
+    """Submit with ``unmapped_attributes`` — the field must appear on the
+    resulting mapping dict in the same (normalised) shape."""
+    unmapped_payload = [
+        {"name": "ethnicity", "reason": "no ethnicity column in this table"}
+    ]
+    args = _valid_submit_args(unmapped=unmapped_payload)
+    # Strip ethnicity from attribute_mappings to make the example coherent.
+    args["attribute_mappings"].pop("ethnicity", None)
+
+    fake = FakeLLM(
+        [
+            _llm_response(
+                tool_calls=[_make_tool_call("submit_entity_mapping", args)]
+            )
+        ]
+    )
+    _patch_llm(monkeypatch, fake)
+
+    result = run_entity_generator(
+        host="https://x",
+        token="t",
+        endpoint_name="ep",
+        client=None,
+        ontology_class=_ontology_class(),
+        source_model_slice=_source_model_slice(),
+    )
+
+    assert result.success is True
+    assert result.mapping is not None
+    assert result.mapping["unmapped_attributes"] == [
+        {"name": "ethnicity", "reason": "no ethnicity column in this table"}
+    ]
+    # Plain-string form is also documented; make sure it survives too.
+    fake2 = FakeLLM(
+        [
+            _llm_response(
+                tool_calls=[
+                    _make_tool_call(
+                        "submit_entity_mapping",
+                        _valid_submit_args(unmapped=["ethnicity"]),
+                    )
+                ]
+            )
+        ]
+    )
+    _patch_llm(monkeypatch, fake2)
+    result2 = run_entity_generator(
+        host="https://x",
+        token="t",
+        endpoint_name="ep",
+        client=None,
+        ontology_class=_ontology_class(),
+        source_model_slice=_source_model_slice(),
+    )
+    assert result2.success is True
+    assert result2.mapping["unmapped_attributes"] == ["ethnicity"]
+
+
+# =====================================================
+# 4. Text without terminal call → failure
+# =====================================================
+
+
+def test_text_without_terminal_fails(monkeypatch, no_sleep):
+    """A plain-text response is treated as failure — the Generator must
+    terminate via submit_entity_mapping.
+    """
+    fake = FakeLLM(
+        [_llm_response(content="I am thinking…", finish_reason="stop")]
+    )
+    _patch_llm(monkeypatch, fake)
+
+    result = run_entity_generator(
+        host="https://x",
+        token="t",
+        endpoint_name="ep",
+        client=None,
+        ontology_class=_ontology_class(),
+        source_model_slice=_source_model_slice(),
+    )
+
+    assert result.success is False
+    assert result.iterations == 1
+    assert result.mapping is None
+    assert "without submitting mapping" in result.error
+    assert any(s.step_type == "output" for s in result.steps)
+
+
+# =====================================================
+# 5. Iteration-budget exhaustion → failure
+# =====================================================
+
+
+def test_exhausts_iteration_budget(monkeypatch, no_sleep):
+    """Endless sample_table calls with max_iterations=3 → fail with
+    ``iteration budget`` and three iterations of steps recorded."""
+    fake = CyclingFakeLLM(
+        [
+            _llm_response(
+                tool_calls=[
+                    _make_tool_call(
+                        "sample_table",
+                        {"full_name": "cat.sch.mothers"},
+                        tc_id="a",
+                    )
+                ]
+            )
+        ]
+    )
+    _patch_llm(monkeypatch, fake)
+
+    class FakeClient:
+        def execute_query(self, sql):
+            return [{"nhs_number": "1234567890"}]
+
+    result = run_entity_generator(
+        host="https://x",
+        token="t",
+        endpoint_name="ep",
+        client=FakeClient(),
+        ontology_class=_ontology_class(),
+        source_model_slice=_source_model_slice(),
+        max_iterations=3,
+    )
+
+    assert result.success is False
+    assert result.iterations == 3
+    assert result.mapping is None
+    assert "iteration budget" in result.error
+    # 3 iterations × (tool_call + tool_result) = 6 steps.
+    assert len(result.steps) == 6
+
+
+# =====================================================
+# 6. retry_hint surfaces in the user prompt
+# =====================================================
+
+
+def test_retry_hint_surfaces_in_user_prompt(monkeypatch, no_sleep):
+    """If ``retry_hint`` is provided, the FIRST LLM call's user message must
+    contain the hint verbatim."""
+    fake = FakeLLM(
+        [
+            _llm_response(
+                tool_calls=[
+                    _make_tool_call("submit_entity_mapping", _valid_submit_args())
+                ]
+            )
+        ]
+    )
+    _patch_llm(monkeypatch, fake)
+
+    result = run_entity_generator(
+        host="https://x",
+        token="t",
+        endpoint_name="ep",
+        client=None,
+        ontology_class=_ontology_class(),
+        source_model_slice=_source_model_slice(),
+        retry_hint="Use NHS column, not patient_id.",
+    )
+
+    assert result.success is True
+    assert fake.first_messages is not None
+    # messages[0] is system, messages[1] is user.
+    assert fake.first_messages[0]["role"] == "system"
+    assert fake.first_messages[1]["role"] == "user"
+    user_content = fake.first_messages[1]["content"]
+    assert "Use NHS column, not patient_id." in user_content
+    # RETRY HINT label is present so the LLM understands its provenance.
+    assert "RETRY HINT" in user_content
+
+
+def test_system_prompt_treats_canonical_value_as_sql_expression(monkeypatch, no_sleep):
+    """canonical_column_per_table values may be SQL expressions (e.g.
+    ``regexp_extract(...)``), not just bare columns. The Generator must use
+    them verbatim, without rewriting. Live V1.1 smoke surfaced 100% dangling
+    on cross-trust entities whose canonical IDs needed regex normalization to
+    a common format."""
+    fake = FakeLLM(
+        [
+            _llm_response(
+                tool_calls=[
+                    _make_tool_call("submit_entity_mapping", _valid_submit_args())
+                ]
+            )
+        ]
+    )
+    _patch_llm(monkeypatch, fake)
+
+    run_entity_generator(
+        host="https://x",
+        token="t",
+        endpoint_name="ep",
+        client=None,
+        ontology_class=_ontology_class(),
+        source_model_slice=_source_model_slice(),
+    )
+
+    system_content = fake.first_messages[0]["content"]
+    assert "SQL EXPRESSION" in system_content
+    assert "regexp_extract" in system_content
+    assert "verbatim" in system_content
+    assert "do NOT rewrite" in system_content
+
+
+def test_system_prompt_mandates_union_for_cross_source(monkeypatch, no_sleep):
+    """When canonical_id.canonical_column_per_table lists 2+ tables, the
+    Generator MUST UNION across all of them. Single-trust selection on a
+    cross-source class makes relationship dangling 100% — the failure mode
+    surfaced by the live V1.1 smoke."""
+    fake = FakeLLM(
+        [
+            _llm_response(
+                tool_calls=[
+                    _make_tool_call("submit_entity_mapping", _valid_submit_args())
+                ]
+            )
+        ]
+    )
+    _patch_llm(monkeypatch, fake)
+
+    run_entity_generator(
+        host="https://x",
+        token="t",
+        endpoint_name="ep",
+        client=None,
+        ontology_class=_ontology_class(),
+        source_model_slice=_source_model_slice(),
+    )
+
+    system_content = fake.first_messages[0]["content"]
+    assert "SINGLE-SOURCE vs CROSS-SOURCE" in system_content
+    assert "UNION ALL" in system_content
+    assert "TWO OR MORE tables" in system_content
+    # Anti-pattern check. NOTE(review): the generic "missing" fallback weakens it.
+    assert "Picking just one" in system_content or "missing" in system_content
+
+
+def test_system_prompt_mandates_value_harmonization(monkeypatch, no_sleep):
+    """Coded (controlled-vocabulary) attributes are spelled differently across
+    sources. The Generator must harmonize them to one canonical token set with a
+    CASE expression — copying the raw column verbatim yields a source-fractured
+    vocabulary that breaks KPI aggregation. Also guards the [0-9]-not-\\d
+    regex-safety rule that survives the OntoBricks build's backslash-strip.
+
+    Asserts on the domain-neutral *technique* markers, not example vocabulary, so
+    the prompt stays usecase-agnostic and the test is not brittle to rewordings."""
+    fake = FakeLLM(
+        [
+            _llm_response(
+                tool_calls=[
+                    _make_tool_call("submit_entity_mapping", _valid_submit_args())
+                ]
+            )
+        ]
+    )
+    _patch_llm(monkeypatch, fake)
+
+    run_entity_generator(
+        host="https://x",
+        token="t",
+        endpoint_name="ep",
+        client=None,
+        ontology_class=_ontology_class(),
+        source_model_slice=_source_model_slice(),
+    )
+
+    system_content = fake.first_messages[0]["content"]
+    assert "VALUE HARMONIZATION" in system_content
+    assert "controlled vocabulary" in system_content
+    # Canonical token set + the discover-before-harmonize discipline.
+    assert "SELECT DISTINCT" in system_content
+    assert "CASE" in system_content and "canonical lowercase token" in system_content
+    # Regex-safety rule: explicit char classes, never backslash escapes.
+    assert "[0-9]" in system_content
+    assert "\\d" in system_content  # mentioned only to forbid it
+
+
+# =====================================================
+# 7. Step recording invariants
+# =====================================================
+
+
+def test_wrong_class_uri_submission_does_not_terminate(monkeypatch, no_sleep):
+    """A submit_entity_mapping call with a class_uri that doesn't match the
+    requested one must NOT terminate the loop. The Generator must keep going
+    so a follow-up submit (with the correct URI) can succeed, and the LLM
+    must see a corrective tool message describing the mismatch.
+    """
+    requested_uri = _CLASS_URI
+    other_uri = "http://ex.org/maternity#Baby"
+
+    wrong_args = _valid_submit_args()
+    wrong_args["class_uri"] = other_uri
+    wrong_args["class_name"] = "Baby"
+
+    fake = FakeLLM(
+        [
+            # Turn 1: submit with the WRONG class_uri — must NOT terminate.
+            _llm_response(
+                tool_calls=[
+                    _make_tool_call(
+                        "submit_entity_mapping", wrong_args, tc_id="wrong"
+                    )
+                ]
+            ),
+            # Turn 2: submit with the correct class_uri — should terminate.
+            _llm_response(
+                tool_calls=[
+                    _make_tool_call(
+                        "submit_entity_mapping",
+                        _valid_submit_args(),
+                        tc_id="right",
+                    )
+                ]
+            ),
+        ]
+    )
+    _patch_llm(monkeypatch, fake)
+
+    result = run_entity_generator(
+        host="https://x",
+        token="t",
+        endpoint_name="ep",
+        client=None,
+        ontology_class=_ontology_class(),
+        source_model_slice=_source_model_slice(),
+    )
+
+    assert result.success is True
+    assert result.iterations == 2
+    assert result.mapping is not None
+    assert result.mapping["ontology_class"] == requested_uri
+
+    # The LLM's second call must have seen a corrective tool message
+    # describing the mismatch, surfaced through ``messages``.
+    assert fake.last_messages is not None
+    tool_messages = [m for m in fake.last_messages if m.get("role") == "tool"]
+    assert tool_messages, "expected at least one tool message on the 2nd call"
+    corrective = tool_messages[-1]
+    corrective_content = corrective.get("content", "")
+    assert other_uri in corrective_content
+    assert requested_uri in corrective_content
+    assert "does not match" in corrective_content
+    # Sanity: the corrective payload is a JSON error (not the original
+    # success=True response).
+    parsed = json.loads(corrective_content)
+    assert parsed.get("success") is False
+    assert "error" in parsed
+
+
+def test_records_steps(monkeypatch, no_sleep):
+    """Every tool-calling iteration produces exactly one ``tool_call`` step
+    immediately followed by one ``tool_result`` step with the same tool_name.
+    """
+    fake = FakeLLM(
+        [
+            _llm_response(
+                tool_calls=[
+                    _make_tool_call(
+                        "sample_table",
+                        {"full_name": "cat.sch.mothers"},
+                        tc_id="a",
+                    )
+                ]
+            ),
+            _llm_response(
+                tool_calls=[
+                    _make_tool_call(
+                        "submit_entity_mapping", _valid_submit_args(), tc_id="b"
+                    )
+                ]
+            ),
+        ]
+    )
+    _patch_llm(monkeypatch, fake)
+
+    class FakeClient:
+        def execute_query(self, sql):
+            return [{"nhs_number": "1234567890"}]
+
+    result = run_entity_generator(
+        host="https://x",
+        token="t",
+        endpoint_name="ep",
+        client=FakeClient(),
+        ontology_class=_ontology_class(),
+        source_model_slice=_source_model_slice(),
+    )
+
+    assert result.success is True
+    assert len(result.steps) % 2 == 0
+    for i in range(0, len(result.steps), 2):
+        call_step = result.steps[i]
+        result_step = result.steps[i + 1]
+        assert call_step.step_type == "tool_call"
+        assert result_step.step_type == "tool_result"
+        assert call_step.tool_name == result_step.tool_name
+        assert call_step.content != ""
+        assert result_step.content != ""
+        assert isinstance(call_step, EntityGenStep)
+        assert isinstance(result_step, EntityGenStep)
diff --git a/tests/agents/agent_mapping_pge/test_planner.py b/tests/agents/agent_mapping_pge/test_planner.py
new file mode 100644
index 00000000..21684acd
--- /dev/null
+++ b/tests/agents/agent_mapping_pge/test_planner.py
@@ -0,0 +1,522 @@
+"""Tests for the mapping-PGE Planner agent (Sprint 3).
+
+The Planner is a tool-calling ReAct loop terminated by ``submit_source_model``.
+These tests exercise the loop's control flow with a *fake LLM* — a stub that
+replaces ``call_serving_endpoint`` at module level and returns canned tool-
+call responses on a per-call basis.
+
+No real HTTP, no real Databricks, no MLflow tracing. The tracing decorator
+is a no-op when MLflow isn't configured (see ``_TRACING_READY`` in
+``agents.tracing``), so it runs cleanly here.
+
+What we DO exercise:
+* The four loop outcomes (three terminal, one deliberately non-terminal)
+  — terminal submit_source_model with success=True breaks the loop
+  — text content with no tool calls is treated as failure
+  — iteration budget exhaustion is treated as failure
+  — submit returning success=False is NOT terminal (allows retry)
+* Step recording: every tool call produces both tool_call and tool_result
+  steps in the right order.
+* Iteration counter accuracy.
+
+What we do NOT exercise (covered elsewhere or out of scope):
+* The actual content of the SourceModel — that's Sprint 1's contracts tests.
+* The four planner tool handlers — that's Sprint 2's test_planner_tools.py.
+* MLflow tracing semantics — the decorator is wrapped in an ``if`` guard.
+"""
+
+import json
+from typing import Any, Callable, Dict, List, Optional
+
+import pytest
+
+from agents.agent_mapping_pge import planner as planner_mod
+from agents.agent_mapping_pge.contracts import SourceModel
+from agents.agent_mapping_pge.planner import (
+    PlannerResult,
+    PlannerStep,
+    run_planner,
+)
+
+
+# =====================================================
+# Fake LLM scaffolding
+# =====================================================
+
+
+def _make_tool_call(name: str, arguments: dict, *, tc_id: str = "tc1") -> dict:
+    """Build an OpenAI-style tool_calls entry."""
+    return {
+        "id": tc_id,
+        "type": "function",
+        "function": {"name": name, "arguments": json.dumps(arguments)},
+    }
+
+
+def _llm_response(
+    *,
+    tool_calls: Optional[List[dict]] = None,
+    content: Optional[str] = None,
+    finish_reason: str = "tool_calls",
+    usage: Optional[Dict[str, int]] = None,
+) -> dict:
+    """Build a minimal OpenAI-style chat-completions response."""
+    message: Dict[str, Any] = {"role": "assistant"}
+    if tool_calls:
+        message["tool_calls"] = tool_calls
+    if content is not None:
+        message["content"] = content
+    return {
+        "choices": [{"finish_reason": finish_reason, "message": message}],
+        "usage": usage or {"prompt_tokens": 10, "completion_tokens": 5},
+    }
+
+
+class FakeLLM:
+    """A stub for ``call_serving_endpoint`` that returns canned responses.
+
+    The list is consumed front-to-back, one response per call. If a test
+    exhausts the list, the stub raises — that's almost always a test bug
+    (the loop iterated more times than the test author expected).
+    """
+
+    def __init__(self, responses: List[dict]):
+        self.responses = list(responses)
+        self.calls = 0
+
+    def __call__(self, *args, **kwargs) -> dict:
+        self.calls += 1
+        if not self.responses:
+            raise AssertionError(
+                f"FakeLLM: ran out of canned responses on call #{self.calls}"
+            )
+        return self.responses.pop(0)
+
+
+class CyclingFakeLLM:
+    """Like FakeLLM but cycles through a fixed list forever.
+
+    Used for the iteration-budget-exhaustion test, where the LLM is supposed
+    to be stuck in an infinite loop until the engine cuts it off.
+    """
+
+    def __init__(self, responses: List[dict]):
+        self.responses = list(responses)
+        self.calls = 0
+
+    def __call__(self, *args, **kwargs) -> dict:
+        resp = self.responses[self.calls % len(self.responses)]
+        self.calls += 1
+        return resp
+
+
+@pytest.fixture
+def no_sleep(monkeypatch):
+    """Neutralise the 3-second inter-iteration delay so tests run fast."""
+    monkeypatch.setattr(planner_mod.time, "sleep", lambda *_a, **_k: None)
+
+
+def _patch_llm(monkeypatch, fake: Callable[..., dict]) -> None:
+    """Replace the planner's reference to ``call_serving_endpoint``."""
+    monkeypatch.setattr(planner_mod, "call_serving_endpoint", fake)
+
+
+# =====================================================
+# Fixtures: a minimal valid SourceModel payload
+# =====================================================
+
+
+def _valid_source_model_dict() -> Dict[str, Any]:
+    """Same shape as test_planner_tools._valid_source_model_dict — duplicated
+    here on purpose so the two test files stay decoupled."""
+    return {
+        "table_roles": [
+            {
+                "table": "cat.sch.mothers",
+                "ontology_class_candidates": [
+                    {
+                        "uri": "http://ex.org/maternity#Mother",
+                        "confidence": 0.9,
+                        "reason": "row per NHS",
+                    }
+                ],
+            }
+        ],
+        "canonical_ids": [
+            {
+                "ontology_class": "http://ex.org/maternity#Mother",
+                "canonical_column_per_table": {"cat.sch.mothers": "nhs_number"},
+                "format_note": "",
+            }
+        ],
+        "join_keys": [],
+        "mapping_plan": {
+            "entity_order": ["http://ex.org/maternity#Mother"],
+            "relationship_order": [],
+            "skip": [],
+        },
+    }
+
+
+def _minimal_metadata() -> dict:
+    return {
+        "tables": [
+            {
+                "name": "mothers",
+                "full_name": "cat.sch.mothers",
+                "columns": [
+                    {"name": "nhs_number", "type": "STRING"},
+                    {"name": "dob", "type": "DATE"},
+                ],
+            }
+        ]
+    }
+
+
+def _minimal_ontology() -> dict:
+    return {
+        "entities": [{"name": "Mother", "uri": "http://ex.org/maternity#Mother"}],
+        "relationships": [],
+    }
+
+
+# =====================================================
+# 1. Single-shot submit terminates immediately
+# =====================================================
+
+
+def test_planner_terminates_on_submit_source_model(monkeypatch, no_sleep):
+    """First LLM turn calls submit_source_model with a valid model — Planner
+    must return success=True with iterations=1 and source_model populated.
+    """
+    sm = _valid_source_model_dict()
+    fake = FakeLLM(
+        [
+            _llm_response(
+                tool_calls=[_make_tool_call("submit_source_model", {"model": sm})]
+            )
+        ]
+    )
+    _patch_llm(monkeypatch, fake)
+
+    result = run_planner(
+        host="https://x",
+        token="t",
+        endpoint_name="ep",
+        client=None,  # not used in this scenario
+        metadata=_minimal_metadata(),
+        ontology=_minimal_ontology(),
+    )
+
+    assert isinstance(result, PlannerResult)
+    assert result.success is True
+    assert result.iterations == 1
+    assert isinstance(result.source_model, SourceModel)
+    assert len(result.source_model.table_roles) == 1
+    assert result.error == ""
+    assert result.usage["prompt_tokens"] >= 0
+    # Exactly one tool_call + one tool_result step.
+    step_kinds = [s.step_type for s in result.steps]
+    assert step_kinds == ["tool_call", "tool_result"]
+    assert result.steps[0].tool_name == "submit_source_model"
+    assert result.steps[1].tool_name == "submit_source_model"
+
+
+# =====================================================
+# 2. Multi-step ReAct trajectory followed by submit
+# =====================================================
+
+
+def test_planner_multi_step_then_submit(monkeypatch, no_sleep):
+    """get_metadata → get_ontology → sample_table → submit_source_model."""
+    sm = _valid_source_model_dict()
+    fake = FakeLLM(
+        [
+            _llm_response(
+                tool_calls=[_make_tool_call("get_metadata", {}, tc_id="a")]
+            ),
+            _llm_response(
+                tool_calls=[_make_tool_call("get_ontology", {}, tc_id="b")]
+            ),
+            _llm_response(
+                tool_calls=[
+                    _make_tool_call(
+                        "sample_table",
+                        {"full_name": "cat.sch.mothers"},
+                        tc_id="c",
+                    )
+                ]
+            ),
+            _llm_response(
+                tool_calls=[
+                    _make_tool_call(
+                        "submit_source_model", {"model": sm}, tc_id="d"
+                    )
+                ]
+            ),
+        ]
+    )
+    _patch_llm(monkeypatch, fake)
+
+    # sample_table needs a client — return one row.
+    class FakeClient:
+        def execute_query(self, sql):
+            return [{"nhs_number": "1234567890", "dob": "1990-01-01"}]
+
+    result = run_planner(
+        host="https://x",
+        token="t",
+        endpoint_name="ep",
+        client=FakeClient(),
+        metadata=_minimal_metadata(),
+        ontology=_minimal_ontology(),
+    )
+
+    assert result.success is True
+    assert result.iterations == 4
+    assert isinstance(result.source_model, SourceModel)
+
+    # Every iteration produces both a tool_call and a tool_result step.
+    assert len(result.steps) == 8
+    expected_tool_names = [
+        "get_metadata",
+        "get_metadata",
+        "get_ontology",
+        "get_ontology",
+        "sample_table",
+        "sample_table",
+        "submit_source_model",
+        "submit_source_model",
+    ]
+    assert [s.tool_name for s in result.steps] == expected_tool_names
+    assert [s.step_type for s in result.steps] == [
+        "tool_call",
+        "tool_result",
+        "tool_call",
+        "tool_result",
+        "tool_call",
+        "tool_result",
+        "tool_call",
+        "tool_result",
+    ]
+
+
+# =====================================================
+# 3. submit returning success=False does NOT terminate
+# =====================================================
+
+
+def test_planner_invalid_source_model_does_not_terminate(monkeypatch, no_sleep):
+    """First submit is malformed (missing 'table' on a table_role) — the
+    tool returns success=False and the Planner keeps going. Second submit
+    is valid and terminates the loop.
+    """
+    bad = _valid_source_model_dict()
+    del bad["table_roles"][0]["table"]  # break it
+
+    good = _valid_source_model_dict()
+    # NOTE(review): same value as the default, so it cannot prove which one stuck.
+    good["mapping_plan"]["entity_order"] = ["http://ex.org/maternity#Mother"]
+
+    fake = FakeLLM(
+        [
+            _llm_response(
+                tool_calls=[
+                    _make_tool_call("submit_source_model", {"model": bad}, tc_id="x")
+                ]
+            ),
+            _llm_response(
+                tool_calls=[
+                    _make_tool_call(
+                        "submit_source_model", {"model": good}, tc_id="y"
+                    )
+                ]
+            ),
+        ]
+    )
+    _patch_llm(monkeypatch, fake)
+
+    result = run_planner(
+        host="https://x",
+        token="t",
+        endpoint_name="ep",
+        client=None,
+        metadata=_minimal_metadata(),
+        ontology=_minimal_ontology(),
+    )
+
+    assert result.success is True
+    assert result.iterations == 2
+    assert isinstance(result.source_model, SourceModel)
+    # The valid one is what landed on ctx — pull a field from it.
+    assert result.source_model.mapping_plan.entity_order == [
+        "http://ex.org/maternity#Mother"
+    ]
+    # Both submit attempts were recorded; the first tool_result must signal
+    # failure so the orchestrator can attribute the retry.
+    first_submit_result = result.steps[1]
+    assert first_submit_result.step_type == "tool_result"
+    assert first_submit_result.tool_name == "submit_source_model"
+    payload = json.loads(first_submit_result.content)
+    assert payload["success"] is False
+
+
+# =====================================================
+# 4. Free-text output without a terminal tool call → failure
+# =====================================================
+
+
+def test_planner_text_without_terminal_fails(monkeypatch, no_sleep):
+    """The Planner must terminate via submit_source_model. A plain-text
+    response is treated as failure.
+    """
+    fake = FakeLLM(
+        [_llm_response(content="I think we are done.", finish_reason="stop")]
+    )
+    _patch_llm(monkeypatch, fake)
+
+    result = run_planner(
+        host="https://x",
+        token="t",
+        endpoint_name="ep",
+        client=None,
+        metadata=_minimal_metadata(),
+        ontology=_minimal_ontology(),
+    )
+
+    assert result.success is False
+    assert result.iterations == 1
+    assert result.source_model is None
+    assert "without submitting source model" in result.error
+    # The text was recorded as an output step for debuggability.
+    assert any(s.step_type == "output" for s in result.steps)
+
+
+# =====================================================
+# 5. Iteration budget exhaustion → failure
+# =====================================================
+
+
+def test_planner_exhausts_iteration_budget(monkeypatch, no_sleep):
+    """Fake LLM keeps calling get_metadata forever. With max_iterations=3
+    the Planner must give up cleanly and report budget exhaustion.
+    """
+    fake = CyclingFakeLLM(
+        [
+            _llm_response(
+                tool_calls=[_make_tool_call("get_metadata", {}, tc_id="a")]
+            )
+        ]
+    )
+    _patch_llm(monkeypatch, fake)
+
+    result = run_planner(
+        host="https://x",
+        token="t",
+        endpoint_name="ep",
+        client=None,
+        metadata=_minimal_metadata(),
+        ontology=_minimal_ontology(),
+        max_iterations=3,
+    )
+
+    assert result.success is False
+    assert result.iterations == 3
+    assert result.source_model is None
+    assert "iteration budget" in result.error
+    # Three iterations × (tool_call + tool_result) = 6 steps.
+    assert len(result.steps) == 6
+
+
+# =====================================================
+# 6. Step recording invariants
+# =====================================================
+
+
+def test_planner_records_steps(monkeypatch, no_sleep):
+    """For each tool-calling iteration, the Planner must record exactly one
+    ``tool_call`` step (with non-empty arguments-as-content) and one
+    ``tool_result`` step (with non-empty content) — in that order, paired by
+    ``tool_name``.
+    """
+    sm = _valid_source_model_dict()
+    fake = FakeLLM(
+        [
+            _llm_response(
+                tool_calls=[_make_tool_call("get_metadata", {}, tc_id="a")]
+            ),
+            _llm_response(
+                tool_calls=[
+                    _make_tool_call("submit_source_model", {"model": sm}, tc_id="b")
+                ]
+            ),
+        ]
+    )
+    _patch_llm(monkeypatch, fake)
+
+    result = run_planner(
+        host="https://x",
+        token="t",
+        endpoint_name="ep",
+        client=None,
+        metadata=_minimal_metadata(),
+        ontology=_minimal_ontology(),
+    )
+
+    assert result.success is True
+    # Verify the pairing: every even-indexed step (tool_call) is immediately
+    # followed by an odd-indexed step (tool_result) with the same tool_name.
+    assert len(result.steps) % 2 == 0
+    for i in range(0, len(result.steps), 2):
+        call_step = result.steps[i]
+        result_step = result.steps[i + 1]
+        assert call_step.step_type == "tool_call"
+        assert result_step.step_type == "tool_result"
+        assert call_step.tool_name == result_step.tool_name
+        assert call_step.content != ""
+        assert result_step.content != ""
+        # PlannerStep is the right type.
+        assert isinstance(call_step, PlannerStep)
+        assert isinstance(result_step, PlannerStep)
+
+
+# =====================================================
+# Prompt contract — canonical-key normalization guidance
+# =====================================================
+
+
+class TestCanonicalKeyNormalizationPrompt:
+    """Pin the load-bearing canonical-key guidance in the system prompt.
+
+    Issue 2 root cause: the Planner left cross-trust keys disjoint (0%
+    overlap rationalized as "trust-scoped"), and when it did normalize it
+    copied a non-anchored regex that returns a leading-dash key. These
+    assertions keep the corrective guidance from silently regressing.
+    """
+
+    def test_offers_expression_overlap_verification_tool(self):
+        assert "normalized_value_overlap" in planner_mod.SYSTEM_PROMPT
+
+    def test_zero_overlap_is_not_a_terminal_state(self):
+        prompt = planner_mod.SYSTEM_PROMPT
+        # The prompt must steer the model AWAY from accepting disjoint keys.
+        # (Domain-neutral wording: "source-scoped", not "trust-scoped".)
+        assert "source-scoped" in prompt  # names the trap explicitly
+        assert "100%" in prompt and "dangle" in prompt
+
+    def test_regex_example_is_anchored(self):
+        prompt = planner_mod.SYSTEM_PROMPT
+        # The correct, anchored pattern must be present (leading char-class so a
+        # preceding dash is not captured) — asserted on the structure, not on a
+        # domain-specific token, so the example stays usecase-agnostic.
+        assert "[a-f0-9][a-f0-9-]+-ord-" in prompt
+        # ...and it must be flagged as the RIGHT one (the WRONG/RIGHT contrast
+        # teaches the leading-dash pitfall).
+        assert "✓ RIGHT" in prompt and "✗ WRONG" in prompt
+
+    def test_derived_key_extracts_core_before_suffix(self):
+        prompt = planner_mod.SYSTEM_PROMPT
+        # Derived child keys must extract the shared core, then append suffix —
+        # not concat onto the raw prefixed local id.
+        assert "regexp_extract" in prompt
+        assert "-line" in prompt  # the worked (domain-neutral) child-key example
diff --git a/tests/agents/agent_mapping_pge/test_relationship_generator.py b/tests/agents/agent_mapping_pge/test_relationship_generator.py
new file mode 100644
index 00000000..d18e4a71
--- /dev/null
+++ b/tests/agents/agent_mapping_pge/test_relationship_generator.py
@@ -0,0 +1,736 @@
+"""Tests for the mapping-PGE RelationshipGenerator agent (Sprint 5).
+
+Mirrors the structure of ``test_entity_generator.py``. The Generator is a
+narrow tool-calling ReAct loop terminated by ``submit_relationship_mapping``.
+These tests exercise the loop's control flow with a *fake LLM* — a stub that
+replaces ``call_serving_endpoint`` at module level and returns canned
+responses on a per-call basis.
+
+No real HTTP, no real Databricks, no MLflow tracing.
+
+What we DO exercise:
+* Termination on a single submit call.
+* Multi-step trajectory (execute_sql → submit).
+* Text-only output is treated as failure (no terminal call).
+* Iteration-budget exhaustion is treated as failure.
+* ``retry_hint`` surfaces inside the user message.
+* Strict ``property_uri`` match — submit with a wrong URI is coached, not
+  terminal.
+* Step recording invariants.
+* The user prompt surfaces the source/target id_columns verbatim — pins the
+  Sprint 5 contract that the LLM sees the endpoint columns.
+"""
+
+import json
+from typing import Any, Callable, Dict, List, Optional
+
+import pytest
+
+from agents.agent_mapping_pge.generators import relationship as rel_mod
+from agents.agent_mapping_pge.generators.relationship import (
+    RelationshipGenResult,
+    RelationshipGenStep,
+    run_relationship_generator,
+)
+
+
+# =====================================================
+# Fake LLM scaffolding (mirrors test_entity_generator.py)
+# =====================================================
+
+
+_PROP_URI = "http://ex.org/maternity#motherOf"  # the property URI the fixtures request a mapping for
+_SOURCE_CLASS = "http://ex.org/maternity#Mother"  # used as the property's domain in the fixtures
+_TARGET_CLASS = "http://ex.org/maternity#Baby"  # used as the property's range in the fixtures
+
+
+def _make_tool_call(name: str, arguments: dict, *, tc_id: str = "tc1") -> dict:
+    return {  # one tool-call entry for an assistant message's "tool_calls" list
+        "id": tc_id,
+        "type": "function",
+        "function": {"name": name, "arguments": json.dumps(arguments)},  # arguments travel as a JSON string
+    }
+
+
+def _llm_response(
+    *,
+    tool_calls: Optional[List[dict]] = None,
+    content: Optional[str] = None,
+    finish_reason: str = "tool_calls",
+    usage: Optional[Dict[str, int]] = None,
+) -> dict:
+    message: Dict[str, Any] = {"role": "assistant"}
+    if tool_calls:  # None or an empty list leaves the "tool_calls" key out entirely
+        message["tool_calls"] = tool_calls
+    if content is not None:  # an empty string is still recorded as content
+        message["content"] = content
+    return {  # single-choice response envelope
+        "choices": [{"finish_reason": finish_reason, "message": message}],
+        "usage": usage or {"prompt_tokens": 10, "completion_tokens": 5},  # fallback counts when none given
+    }
+
+
+class FakeLLM:  # callable stub: returns canned responses in order and records the messages it saw
+    def __init__(self, responses: List[dict]):
+        self.responses = list(responses)  # private copy; the caller's list is not consumed
+        self.calls = 0
+        self.last_messages: Optional[List[dict]] = None
+        self.first_messages: Optional[List[dict]] = None
+
+    def __call__(self, *args, **kwargs) -> dict:
+        self.calls += 1
+        msgs: Optional[List[dict]] = None
+        if len(args) >= 4 and isinstance(args[3], list):  # messages passed as the 4th positional argument
+            msgs = args[3]
+        elif "messages" in kwargs:  # ...or as the ``messages`` keyword
+            msgs = kwargs["messages"]
+        if msgs is not None:
+            snapshot = [dict(m) for m in msgs]  # new list + shallow dict copies, decoupled from later list mutation
+            if self.first_messages is None:  # only the very first call populates first_messages
+                self.first_messages = snapshot
+            self.last_messages = snapshot  # overwritten on every call that carried messages
+
+        if not self.responses:  # more calls than canned responses is a test failure
+            raise AssertionError(
+                f"FakeLLM: ran out of canned responses on call #{self.calls}"
+            )
+        return self.responses.pop(0)  # consume responses in FIFO order
+
+
+class CyclingFakeLLM:
+    """Like FakeLLM but cycles through a fixed list forever."""
+
+    def __init__(self, responses: List[dict]):
+        self.responses = list(responses)  # must be non-empty: the modulo below divides by its length
+        self.calls = 0
+
+    def __call__(self, *args, **kwargs) -> dict:  # arguments are accepted but ignored
+        resp = self.responses[self.calls % len(self.responses)]  # wrap around; nothing is consumed
+        self.calls += 1
+        return resp
+
+
+@pytest.fixture
+def no_sleep(monkeypatch):
+    """Neutralise the 3-second inter-iteration delay so tests run fast."""
+    monkeypatch.setattr(rel_mod.time, "sleep", lambda *_a, **_k: None)  # no-op replacement for sleep
+
+
+def _patch_llm(monkeypatch, fake: Callable[..., dict]) -> None:
+    monkeypatch.setattr(rel_mod, "call_serving_endpoint", fake)  # swap the module-level LLM entry point for the fake
+
+
+# =====================================================
+# Fixtures
+# =====================================================
+
+
+def _ontology_property() -> dict:  # canned ontology property: motherOf, Mother → Baby
+    return {
+        "uri": _PROP_URI,
+        "label": "motherOf",
+        "name": "motherOf",
+        "comment": "Links a Mother to each of her babies.",
+        "domain": _SOURCE_CLASS,
+        "range": _TARGET_CLASS,
+    }
+
+
+def _source_entity_mapping() -> dict:  # canned entity mapping for the source (Mother) endpoint
+    return {
+        "ontology_class": _SOURCE_CLASS,
+        "class_name": "Mother",
+        "id_column": "nhs_number",
+        "label_column": "nhs_number",  # same column serves as both id and label
+        "sql_query": (
+            "SELECT nhs_number AS ID, nhs_number AS Label FROM cat.sch.mothers "
+            "WHERE nhs_number IS NOT NULL"
+        ),
+    }
+
+
+def _target_entity_mapping() -> dict:  # canned entity mapping for the target (Baby) endpoint
+    return {
+        "ontology_class": _TARGET_CLASS,
+        "class_name": "Baby",
+        "id_column": "baby_id",
+        "label_column": "baby_id",  # same column serves as both id and label
+        "sql_query": (
+            "SELECT baby_id AS ID, baby_id AS Label FROM cat.sch.babies "
+            "WHERE baby_id IS NOT NULL"
+        ),
+    }
+
+
+def _source_model_slice() -> dict:  # canned source-model slice: one join plus one candidate table
+    return {
+        "relevant_joins": [
+            {
+                "from_ref": "cat.sch.babies.mother_nhs_number",  # FK column on the babies table
+                "to_ref": "cat.sch.mothers.nhs_number",
+                "confidence": 0.95,
+                "overlap_pct": 0.98,
+                "kind": "same_trust_fk",
+            }
+        ],
+        "candidate_tables": [
+            {"table": "cat.sch.babies", "reason": "row per baby, has mother FK"}
+        ],
+    }
+
+
+def _valid_submit_args() -> dict:  # submit payload whose property_uri equals _PROP_URI
+    return {
+        "property_uri": _PROP_URI,
+        "property_name": "motherOf",
+        "sql_query": (
+            "SELECT mother_nhs_number AS source_id, baby_id AS target_id "
+            "FROM cat.sch.babies WHERE mother_nhs_number IS NOT NULL"
+        ),
+        "source_id_column": "nhs_number",  # matches _source_entity_mapping()["id_column"]
+        "target_id_column": "baby_id",  # matches _target_entity_mapping()["id_column"]
+        "domain": _SOURCE_CLASS,
+        "range_class": _TARGET_CLASS,
+        "direction": "forward",
+    }
+
+
+# =====================================================
+# 1. Single-shot submit terminates immediately
+# =====================================================
+
+
+def test_terminates_on_submit(monkeypatch, no_sleep):
+    """First LLM turn submits a valid mapping → success, iterations=1."""
+    fake = FakeLLM(  # exactly one canned turn; a second LLM call would raise inside FakeLLM
+        [
+            _llm_response(
+                tool_calls=[
+                    _make_tool_call(
+                        "submit_relationship_mapping", _valid_submit_args()
+                    )
+                ]
+            )
+        ]
+    )
+    _patch_llm(monkeypatch, fake)
+
+    result = run_relationship_generator(
+        host="https://x",
+        token="t",
+        endpoint_name="ep",
+        client=None,  # no SQL client is supplied for this scenario
+        ontology_property=_ontology_property(),
+        source_entity_mapping=_source_entity_mapping(),
+        target_entity_mapping=_target_entity_mapping(),
+        source_model_slice=_source_model_slice(),
+    )
+
+    assert isinstance(result, RelationshipGenResult)
+    assert result.success is True
+    assert result.iterations == 1
+    assert result.mapping is not None
+    assert result.mapping["property"] == _PROP_URI  # stored under "property", submitted as "property_uri"
+    assert result.mapping["source_id_column"] == "nhs_number"
+    assert result.mapping["target_id_column"] == "baby_id"
+    assert result.error == ""
+    step_kinds = [s.step_type for s in result.steps]
+    assert step_kinds == ["tool_call", "tool_result"]  # one call step followed by its result step
+    assert result.steps[0].tool_name == "submit_relationship_mapping"
+
+
+# =====================================================
+# 2. execute_sql validation, then submit
+# =====================================================
+
+
+def test_validates_sql_then_submits(monkeypatch, no_sleep):
+    """execute_sql → submit_relationship_mapping → success, iterations=2."""
+    fake = FakeLLM(  # two canned turns, consumed in order
+        [
+            _llm_response(
+                tool_calls=[
+                    _make_tool_call(
+                        "execute_sql",
+                        {
+                            "sql": (
+                                "SELECT mother_nhs_number AS source_id, baby_id "
+                                "AS target_id FROM cat.sch.babies "
+                                "WHERE mother_nhs_number IS NOT NULL"
+                            )
+                        },
+                        tc_id="a",
+                    )
+                ]
+            ),
+            _llm_response(
+                tool_calls=[
+                    _make_tool_call(
+                        "submit_relationship_mapping",
+                        _valid_submit_args(),
+                        tc_id="b",
+                    )
+                ]
+            ),
+        ]
+    )
+    _patch_llm(monkeypatch, fake)
+
+    class FakeClient:  # stub exposing only execute_query
+        def execute_query(self, sql):
+            return [{"source_id": "1234567890", "target_id": "b-1"}]  # same single row for any SQL
+
+    result = run_relationship_generator(
+        host="https://x",
+        token="t",
+        endpoint_name="ep",
+        client=FakeClient(),
+        ontology_property=_ontology_property(),
+        source_entity_mapping=_source_entity_mapping(),
+        target_entity_mapping=_target_entity_mapping(),
+        source_model_slice=_source_model_slice(),
+    )
+
+    assert result.success is True
+    assert result.iterations == 2
+    assert result.mapping is not None
+    # Sequence: tool_call(execute_sql), tool_result(execute_sql),
+    # tool_call(submit), tool_result(submit) — 4 steps.
+    assert len(result.steps) == 4
+    assert [s.tool_name for s in result.steps] == [
+        "execute_sql",
+        "execute_sql",
+        "submit_relationship_mapping",
+        "submit_relationship_mapping",
+    ]
+
+
+# =====================================================
+# 3. Text without terminal call → failure
+# =====================================================
+
+
+def test_text_without_terminal_fails(monkeypatch, no_sleep):
+    """A plain-text response is treated as failure — the Generator must
+    terminate via submit_relationship_mapping.
+    """
+    fake = FakeLLM(  # single text-only turn: content set, no tool_calls key
+        [_llm_response(content="I am thinking…", finish_reason="stop")]
+    )
+    _patch_llm(monkeypatch, fake)
+
+    result = run_relationship_generator(
+        host="https://x",
+        token="t",
+        endpoint_name="ep",
+        client=None,
+        ontology_property=_ontology_property(),
+        source_entity_mapping=_source_entity_mapping(),
+        target_entity_mapping=_target_entity_mapping(),
+        source_model_slice=_source_model_slice(),
+    )
+
+    assert result.success is False
+    assert result.iterations == 1
+    assert result.mapping is None
+    assert "without submitting mapping" in result.error  # substring of the reported error text
+    assert any(s.step_type == "output" for s in result.steps)  # the text turn is recorded as an "output" step
+
+
+# =====================================================
+# 4. Iteration-budget exhaustion → failure
+# =====================================================
+
+
+def test_exhausts_iteration_budget(monkeypatch, no_sleep):
+    """Endless sample_table calls with max_iterations=3 → fail with
+    ``iteration budget`` and three iterations of steps recorded."""
+    fake = CyclingFakeLLM(  # repeats the same response forever, unlike FakeLLM which runs dry
+        [
+            _llm_response(
+                tool_calls=[
+                    _make_tool_call(
+                        "sample_table",
+                        {"full_name": "cat.sch.babies"},
+                        tc_id="a",
+                    )
+                ]
+            )
+        ]
+    )
+    _patch_llm(monkeypatch, fake)
+
+    class FakeClient:  # stub exposing only execute_query
+        def execute_query(self, sql):
+            return [{"mother_nhs_number": "1234567890", "baby_id": "b-1"}]  # same single row for any SQL
+
+    result = run_relationship_generator(
+        host="https://x",
+        token="t",
+        endpoint_name="ep",
+        client=FakeClient(),
+        ontology_property=_ontology_property(),
+        source_entity_mapping=_source_entity_mapping(),
+        target_entity_mapping=_target_entity_mapping(),
+        source_model_slice=_source_model_slice(),
+        max_iterations=3,  # the budget under test
+    )
+
+    assert result.success is False
+    assert result.iterations == 3
+    assert result.mapping is None
+    assert "iteration budget" in result.error
+    # 3 iterations × (tool_call + tool_result) = 6 steps.
+    assert len(result.steps) == 6
+
+
+# =====================================================
+# 5. retry_hint surfaces in the user prompt
+# =====================================================
+
+
+def test_retry_hint_surfaces_in_user_prompt(monkeypatch, no_sleep):
+    """If ``retry_hint`` is provided, the FIRST LLM call's user message must
+    contain the hint verbatim and the RETRY HINT label."""
+    fake = FakeLLM(  # one accepted submit ends the loop; the captured prompt is what is inspected
+        [
+            _llm_response(
+                tool_calls=[
+                    _make_tool_call(
+                        "submit_relationship_mapping", _valid_submit_args()
+                    )
+                ]
+            )
+        ]
+    )
+    _patch_llm(monkeypatch, fake)
+
+    result = run_relationship_generator(
+        host="https://x",
+        token="t",
+        endpoint_name="ep",
+        client=None,
+        ontology_property=_ontology_property(),
+        source_entity_mapping=_source_entity_mapping(),
+        target_entity_mapping=_target_entity_mapping(),
+        source_model_slice=_source_model_slice(),
+        retry_hint="Use mother_nhs_number, not patient_id.",
+    )
+
+    assert result.success is True
+    assert fake.first_messages is not None
+    assert fake.first_messages[0]["role"] == "system"  # message order: system first...
+    assert fake.first_messages[1]["role"] == "user"  # ...then the user prompt
+    user_content = fake.first_messages[1]["content"]
+    assert "Use mother_nhs_number, not patient_id." in user_content
+    assert "RETRY HINT" in user_content
+    # Retry-hint corrective workflow surfaces the dangling-edge probe.
+    assert "dangling-edge probe" in user_content
+    assert "DO NOT repeat the same column choice" in user_content
+
+
+def test_system_prompt_mandates_dangling_edge_self_check(monkeypatch, no_sleep):
+    """The system prompt must instruct the model to run a dangling-edge
+    probe with execute_sql BEFORE submitting — name-similarity alone is
+    insufficient and was the root cause of the live smoke failure on
+    hasapgarscore."""
+    fake = FakeLLM(  # one accepted submit ends the loop; the captured prompt is what is inspected
+        [
+            _llm_response(
+                tool_calls=[
+                    _make_tool_call(
+                        "submit_relationship_mapping", _valid_submit_args()
+                    )
+                ]
+            )
+        ]
+    )
+    _patch_llm(monkeypatch, fake)
+
+    run_relationship_generator(  # return value unused; only the messages sent to the fake matter
+        host="https://x",
+        token="t",
+        endpoint_name="ep",
+        client=None,
+        ontology_property=_ontology_property(),
+        source_entity_mapping=_source_entity_mapping(),
+        target_entity_mapping=_target_entity_mapping(),
+        source_model_slice=_source_model_slice(),
+    )
+
+    system_content = fake.first_messages[0]["content"]  # first message of the first LLM call
+    assert "SELF-VERIFY THE VALUES BEFORE SUBMITTING" in system_content
+    assert "dangling_src" in system_content
+    assert "dangling_tgt" in system_content
+    # The probe must reference both endpoint universes via the entity SQLs.
+    assert "source entity's SQL" in system_content
+    assert "target entity's SQL" in system_content
+
+
+def test_system_prompt_teaches_reproducing_derived_id_expression(monkeypatch, no_sleep):
+    """The id_column is an alias for a derived canonical expression; the
+    prompt must instruct the model to REPRODUCE that expression for the
+    endpoints (not select a raw column) — the root cause of the 100%
+    source-dangling on hasapgarscore/deliveredbaby in the live smoke.
+    """
+    fake = FakeLLM(  # one accepted submit ends the loop; the captured prompt is what is inspected
+        [
+            _llm_response(
+                tool_calls=[
+                    _make_tool_call(
+                        "submit_relationship_mapping", _valid_submit_args()
+                    )
+                ]
+            )
+        ]
+    )
+    _patch_llm(monkeypatch, fake)
+
+    run_relationship_generator(  # return value unused; only the messages sent to the fake matter
+        host="https://x",
+        token="t",
+        endpoint_name="ep",
+        client=None,
+        ontology_property=_ontology_property(),
+        source_entity_mapping=_source_entity_mapping(),
+        target_entity_mapping=_target_entity_mapping(),
+        source_model_slice=_source_model_slice(),
+    )
+
+    system_content = fake.first_messages[0]["content"]  # first message of the first LLM call
+    assert "ALIAS FOR A DERIVED EXPRESSION" in system_content
+    assert "regexp_extract" in system_content
+    # Must steer away from selecting a raw column for the endpoint.
+    assert "reproduce" in system_content.lower()  # case-insensitive match
+
+
+def test_system_prompt_teaches_shared_coverage_table_rule(monkeypatch, no_sleep):
+    """Cross-trust endpoint entities only overlap on shared trusts; building
+    the edge from a table only one entity covers yields 100% dangling on the
+    other side. The prompt must teach picking a BOTH-covered source table.
+    """
+    fake = FakeLLM(  # one accepted submit ends the loop; the captured prompt is what is inspected
+        [
+            _llm_response(
+                tool_calls=[
+                    _make_tool_call(
+                        "submit_relationship_mapping", _valid_submit_args()
+                    )
+                ]
+            )
+        ]
+    )
+    _patch_llm(monkeypatch, fake)
+
+    run_relationship_generator(  # return value unused; only the messages sent to the fake matter
+        host="https://x",
+        token="t",
+        endpoint_name="ep",
+        client=None,
+        ontology_property=_ontology_property(),
+        source_entity_mapping=_source_entity_mapping(),
+        target_entity_mapping=_target_entity_mapping(),
+        source_model_slice=_source_model_slice(),
+    )
+
+    system_content = fake.first_messages[0]["content"]  # first message of the first LLM call
+    assert "BOTH" in system_content  # case-sensitive: the emphasised upper-case word
+    assert "coverage" in system_content.lower()  # case-insensitive match
+    assert "100%" in system_content  # the old '"100% source-dangling" or "100%"' check reduced to exactly this
+
+
+# =====================================================
+# 6. Wrong property_uri submission does NOT terminate
+# =====================================================
+
+
+def test_wrong_property_uri_submission_does_not_terminate(monkeypatch, no_sleep):
+    """A submit_relationship_mapping call with a property_uri that doesn't
+    match the requested one must NOT terminate the loop. The Generator must
+    keep going so a follow-up submit (with the correct URI) can succeed, and
+    the LLM must see a corrective tool message describing the mismatch.
+    """
+    requested_uri = _PROP_URI
+    other_uri = "http://ex.org/maternity#fatherOf"
+
+    wrong_args = _valid_submit_args()  # fresh dict per call, so mutating it is safe
+    wrong_args["property_uri"] = other_uri
+    wrong_args["property_name"] = "fatherOf"
+
+    fake = FakeLLM(
+        [
+            # Turn 1: submit with the WRONG property_uri — must NOT terminate.
+            _llm_response(
+                tool_calls=[
+                    _make_tool_call(
+                        "submit_relationship_mapping",
+                        wrong_args,
+                        tc_id="wrong",
+                    )
+                ]
+            ),
+            # Turn 2: submit with the correct property_uri — should terminate.
+            _llm_response(
+                tool_calls=[
+                    _make_tool_call(
+                        "submit_relationship_mapping",
+                        _valid_submit_args(),
+                        tc_id="right",
+                    )
+                ]
+            ),
+        ]
+    )
+    _patch_llm(monkeypatch, fake)
+
+    result = run_relationship_generator(
+        host="https://x",
+        token="t",
+        endpoint_name="ep",
+        client=None,
+        ontology_property=_ontology_property(),
+        source_entity_mapping=_source_entity_mapping(),
+        target_entity_mapping=_target_entity_mapping(),
+        source_model_slice=_source_model_slice(),
+    )
+
+    assert result.success is True
+    assert result.iterations == 2  # both turns were consumed
+    assert result.mapping is not None
+    assert result.mapping["property"] == requested_uri
+
+    # The LLM's second call must have seen a corrective tool message
+    # describing the mismatch, surfaced through ``messages``.
+    assert fake.last_messages is not None
+    tool_messages = [m for m in fake.last_messages if m.get("role") == "tool"]
+    assert tool_messages, "expected at least one tool message on the 2nd call"
+    corrective = tool_messages[-1]  # most recent tool message in the 2nd call's history
+    corrective_content = corrective.get("content", "")
+    assert other_uri in corrective_content
+    assert requested_uri in corrective_content
+    assert "does not match" in corrective_content
+    # Sanity: the corrective payload is a JSON error (not the original
+    # success=True response).
+    parsed = json.loads(corrective_content)
+    assert parsed.get("success") is False
+    assert "error" in parsed
+
+
+# =====================================================
+# 7. Step recording invariants
+# =====================================================
+
+
+def test_records_steps(monkeypatch, no_sleep):
+    """Every tool-calling iteration produces exactly one ``tool_call`` step
+    immediately followed by one ``tool_result`` step with the same tool_name.
+    """
+    fake = FakeLLM(  # two canned turns: sample_table, then an accepted submit
+        [
+            _llm_response(
+                tool_calls=[
+                    _make_tool_call(
+                        "sample_table",
+                        {"full_name": "cat.sch.babies"},
+                        tc_id="a",
+                    )
+                ]
+            ),
+            _llm_response(
+                tool_calls=[
+                    _make_tool_call(
+                        "submit_relationship_mapping",
+                        _valid_submit_args(),
+                        tc_id="b",
+                    )
+                ]
+            ),
+        ]
+    )
+    _patch_llm(monkeypatch, fake)
+
+    class FakeClient:  # stub exposing only execute_query
+        def execute_query(self, sql):
+            return [{"mother_nhs_number": "1234567890", "baby_id": "b-1"}]  # same single row for any SQL
+
+    result = run_relationship_generator(
+        host="https://x",
+        token="t",
+        endpoint_name="ep",
+        client=FakeClient(),
+        ontology_property=_ontology_property(),
+        source_entity_mapping=_source_entity_mapping(),
+        target_entity_mapping=_target_entity_mapping(),
+        source_model_slice=_source_model_slice(),
+    )
+
+    assert result.success is True
+    assert len(result.steps) % 2 == 0  # steps must come in complete pairs
+    for i in range(0, len(result.steps), 2):  # walk the steps as (call, result) pairs
+        call_step = result.steps[i]
+        result_step = result.steps[i + 1]
+        assert call_step.step_type == "tool_call"
+        assert result_step.step_type == "tool_result"
+        assert call_step.tool_name == result_step.tool_name
+        assert call_step.content != ""
+        assert result_step.content != ""
+        assert isinstance(call_step, RelationshipGenStep)
+        assert isinstance(result_step, RelationshipGenStep)
+
+
+# =====================================================
+# 8. User prompt surfaces source/target id_columns
+# =====================================================
+
+
+def test_user_prompt_includes_source_and_target_id_columns(monkeypatch, no_sleep):
+    """The FIRST call's user message must contain both id_column names
+    verbatim. This pins the Sprint 5 contract that the Generator surfaces
+    the endpoint columns to the LLM, so the LLM cannot silently pick
+    different endpoints.
+    """
+    # Use distinctive id_column names that won't appear anywhere else in
+    # the slice (mothers/babies join etc.), to make the assertion strict.
+    src_em = {
+        "ontology_class": _SOURCE_CLASS,
+        "class_name": "Mother",
+        "id_column": "weirdly_named_mother_pk",
+        "label_column": "weirdly_named_mother_pk",
+        "sql_query": "SELECT weirdly_named_mother_pk AS ID FROM cat.sch.mothers",
+    }
+    tgt_em = {
+        "ontology_class": _TARGET_CLASS,
+        "class_name": "Baby",
+        "id_column": "weirdly_named_baby_pk",
+        "label_column": "weirdly_named_baby_pk",
+        "sql_query": "SELECT weirdly_named_baby_pk AS ID FROM cat.sch.babies",
+    }
+
+    fake = FakeLLM(  # one accepted submit ends the loop; the captured prompt is what is inspected
+        [
+            _llm_response(
+                tool_calls=[
+                    _make_tool_call(
+                        "submit_relationship_mapping", _valid_submit_args()
+                    )
+                ]
+            )
+        ]
+    )
+    _patch_llm(monkeypatch, fake)
+
+    run_relationship_generator(  # return value unused; only the messages sent to the fake matter
+        host="https://x",
+        token="t",
+        endpoint_name="ep",
+        client=None,
+        ontology_property=_ontology_property(),
+        source_entity_mapping=src_em,
+        target_entity_mapping=tgt_em,
+        source_model_slice=_source_model_slice(),
+    )
+
+    assert fake.first_messages is not None
+    user_content = fake.first_messages[1]["content"]  # index 1: the user message of the first call
+    assert "weirdly_named_mother_pk" in user_content
+    assert "weirdly_named_baby_pk" in user_content
diff --git a/tests/agents/test_evaluation_tool.py b/tests/agents/test_evaluation_tool.py
new file mode 100644
index 00000000..b7e244d8
--- /dev/null
+++ b/tests/agents/test_evaluation_tool.py
@@ -0,0 +1,144 @@
+"""Tests for ``agents.tools.evaluation`` — the Sprint 6 terminal tool.
+
+These are direct unit tests for ``tool_submit_evaluation``. The agent-level
+loop semantics are covered in ``test_critic.py``.
+"""
+
+import json
+
+from agents.agent_mapping_pge.contracts import EvalReport
+from agents.tools.context import ToolContext
+from agents.tools.evaluation import (
+    EVALUATION_TOOL_DEFINITIONS,
+    EVALUATION_TOOL_HANDLERS,
+    SUBMIT_EVALUATION_DEF,
+    tool_submit_evaluation,
+)
+
+
+def _ctx() -> ToolContext:  # fresh context per test, built with placeholder host/token values
+    return ToolContext(host="https://x", token="t")
+
+
+class TestSubmitEvaluation:
+    """Direct handler tests — no LLM, no loop."""
+
+    def test_valid_pass_stores_report(self):
+        """status=PASS, no failures → report stored, success=True."""
+        ctx = _ctx()
+        payload = tool_submit_evaluation(
+            ctx,
+            status="PASS",
+            failures=[],
+            bubble_to_planner=False,
+            reasoning="Sampled values match the Mother concept.",
+        )
+        body = json.loads(payload)  # the handler returns a JSON string
+        assert body["success"] is True
+        assert body["status"] == "PASS"
+        assert body["failures"] == 0  # the payload reports a count, not the list
+        assert body["bubble_to_planner"] is False
+
+        assert isinstance(ctx.semantic_eval_report, EvalReport)
+        rep = ctx.semantic_eval_report
+        assert rep.status == "PASS"
+        assert rep.stage == "semantic"  # stage is not passed by the caller; the stored report carries it
+        assert rep.failures == []
+        assert rep.bubble_to_planner is False
+        # reasoning is preserved in metrics
+        assert rep.metrics.get("reasoning") == "Sampled values match the Mother concept."
+
+    def test_valid_fail_with_failures(self):
+        """status=FAIL with failures[] → report stored with semantic-kind failures."""
+        ctx = _ctx()
+        payload = tool_submit_evaluation(
+            ctx,
+            status="FAIL",
+            failures=[
+                {
+                    "check": "column_semantics",
+                    "expected": "delivery date",
+                    "observed": "booking date",
+                    "hint": "Use `delivery_dttm` instead of `appointment_date`.",
+                }
+            ],
+            bubble_to_planner=False,
+            reasoning="Wrong column within the right table.",
+        )
+        body = json.loads(payload)
+        assert body["success"] is True  # success means the call was accepted, not that status is PASS
+        assert body["status"] == "FAIL"
+        assert body["failures"] == 1
+        assert body["bubble_to_planner"] is False
+
+        rep = ctx.semantic_eval_report
+        assert rep is not None
+        assert rep.status == "FAIL"
+        assert len(rep.failures) == 1
+        f = rep.failures[0]
+        assert f.kind == "semantic"  # no "kind" key was supplied above; the stored failure still has one
+        assert f.check == "column_semantics"
+        assert f.expected == "delivery date"
+        assert f.observed == "booking date"
+        assert "delivery_dttm" in f.hint
+
+    def test_pass_with_failures_is_clamped(self):
+        """status=PASS with non-empty failures[] → failures clamped to []."""
+        ctx = _ctx()
+        payload = tool_submit_evaluation(  # bubble_to_planner and reasoning are omitted here
+            ctx,
+            status="PASS",
+            failures=[
+                {"check": "x", "expected": "y", "observed": "z", "hint": "h"}
+            ],
+        )
+        body = json.loads(payload)
+        assert body["success"] is True
+        assert body["failures"] == 0
+        assert ctx.semantic_eval_report.failures == []
+
+    def test_none_fields_coerce_to_empty_string(self):
+        """If the LLM passes a failure with None fields, they coerce to ''
+        (not the literal string 'None')."""
+        ctx = _ctx()
+        tool_submit_evaluation(  # return payload unused; only the stored report is checked
+            ctx,
+            status="FAIL",
+            failures=[
+                {"check": None, "expected": None, "observed": None, "hint": None}
+            ],
+        )
+        f = ctx.semantic_eval_report.failures[0]
+        assert f.check == ""
+        assert f.expected == ""
+        assert f.observed == ""
+        assert f.hint == ""
+
+    def test_invalid_status_rejected_no_report_stored(self):
+        """status not in {PASS,FAIL} → handler returns success=False, no
+        report is stamped on ctx."""
+        ctx = _ctx()
+        payload = tool_submit_evaluation(
+            ctx,
+            status="UNKNOWN",
+            failures=[],
+        )
+        body = json.loads(payload)
+        assert body["success"] is False
+        assert "invalid status" in body["error"]  # substring of the reported error text
+        assert ctx.semantic_eval_report is None  # the attribute is still None after the rejected call
+
+
+class TestExports:
+    """Sanity-check the aggregates the Critic agent imports."""
+
+    def test_definitions_include_submit_evaluation(self):
+        names = [  # each definition exposes its tool name under ["function"]["name"]
+            d["function"]["name"] for d in EVALUATION_TOOL_DEFINITIONS
+        ]
+        assert "submit_evaluation" in names
+        assert SUBMIT_EVALUATION_DEF in EVALUATION_TOOL_DEFINITIONS
+
+    def test_handlers_match_definitions(self):
+        assert set(EVALUATION_TOOL_HANDLERS.keys()) == {"submit_evaluation"}  # exactly one handler key
+        assert EVALUATION_TOOL_HANDLERS["submit_evaluation"] is tool_submit_evaluation  # identity, not equality
diff --git a/tests/agents/test_mapping_tools.py b/tests/agents/test_mapping_tools.py
new file mode 100644
index 00000000..38fc9724
--- /dev/null
+++ b/tests/agents/test_mapping_tools.py
@@ -0,0 +1,75 @@
+"""Tests for ``agents.tools.mapping`` (the shared mapping-submission tools).
+
+The historical mapping-submission tools were exercised only indirectly via
+the auto-mapping agent's integration tests. With the Sprint 4 addition of
+``unmapped_attributes`` to ``tool_submit_entity_mapping`` we need a direct
+assertion that the new field round-trips through the mapping dict that
+lands on ``ctx.entity_mappings``.
+"""
+
+import json
+
+from agents.tools.context import ToolContext
+from agents.tools.mapping import tool_submit_entity_mapping
+
+
+def _ctx() -> ToolContext:  # fresh context built from placeholder host/token only
+    return ToolContext(token="t", host="https://x")
+
+
+class TestSubmitEntityMappingUnmappedAttributes:
+    """The Sprint 4 NO-SILENT-DROPS invariant requires capturing attributes
+    that the Generator intentionally did not map. The dict form
+    ({"name", "reason"}) is the preferred shape; bare strings are accepted
+    as a fallback."""
+
+    def test_dict_form_round_trips(self):  # preferred {"name", "reason"} entries
+        ctx = _ctx()
+        payload = tool_submit_entity_mapping(
+            ctx,
+            class_uri="http://ex.org/maternity#Mother",
+            class_name="Mother",
+            sql_query="SELECT nhs_number AS ID, nhs_number AS Label FROM cat.sch.mothers",
+            id_column="nhs_number",
+            label_column="nhs_number",
+            attribute_mappings={"nhsNumber": "nhs_number"},
+            unmapped_attributes=[
+                {"name": "ethnicity", "reason": "column absent from this table"}
+            ],
+        )
+        body = json.loads(payload)
+        assert body["success"] is True
+        assert body["attributes_unmapped"] == 1  # one unmapped entry was submitted
+        # The mapping dict stored on the context carries the new field unchanged.
+        assert len(ctx.entity_mappings) == 1
+        mapping = ctx.entity_mappings[0]
+        assert mapping["unmapped_attributes"] == [
+            {"name": "ethnicity", "reason": "column absent from this table"}
+        ]
+
+    def test_string_form_round_trips(self):  # bare-string fallback entries
+        ctx = _ctx()
+        tool_submit_entity_mapping(
+            ctx,
+            class_uri="http://ex.org/maternity#Mother",
+            class_name="Mother",
+            sql_query="SELECT nhs_number AS ID, nhs_number AS Label FROM cat.sch.mothers",
+            id_column="nhs_number",
+            label_column="nhs_number",
+            attribute_mappings={"nhsNumber": "nhs_number"},
+            unmapped_attributes=["ethnicity"],
+        )
+        assert ctx.entity_mappings[0]["unmapped_attributes"] == ["ethnicity"]
+
+    def test_default_empty_list(self):  # unmapped_attributes omitted from the call
+        ctx = _ctx()
+        tool_submit_entity_mapping(
+            ctx,
+            class_uri="http://ex.org/maternity#Mother",
+            class_name="Mother",
+            sql_query="SELECT nhs_number AS ID, nhs_number AS Label FROM cat.sch.mothers",
+            id_column="nhs_number",
+            label_column="nhs_number",
+            attribute_mappings={"nhsNumber": "nhs_number"},
+        )
+        assert ctx.entity_mappings[0]["unmapped_attributes"] == []
diff --git a/tests/agents/test_planner_tools.py b/tests/agents/test_planner_tools.py
new file mode 100644
index 00000000..e09c2563
--- /dev/null
+++ b/tests/agents/test_planner_tools.py
@@ -0,0 +1,583 @@
+"""Tests for the Planner tool surface (Sprint 2 of mapping-PGE).
+
+The five planner tools wrap Databricks SQL queries behind a uniform
+JSON-string return contract suitable for an LLM function-calling loop:
+
+* ``sample_table``         — `SELECT * ORDER BY RAND() LIMIT n` with n capped at 100.
+* ``column_value_overlap`` — one-sided distinct-value overlap between two columns.
+* ``normalized_value_overlap`` — the same overlap, over validated SQL expressions.
+* ``distinct_count``       — row / distinct / null counts for a candidate canonical id.
+* ``submit_source_model``  — terminal tool: validates a ``SourceModel`` dict, stashes it on ``ToolContext``.
+
+Tests use a ``FakeClient`` whose ``execute_query(sql)`` is parameterised per-test;
+handlers are exercised at the JSON-string boundary (parse the return, assert keys).
+No real Databricks connection, no LLM.
+"""
+
+import json
+from typing import Any, Callable, Dict, List, Optional
+
+import pytest
+
+from agents.agent_mapping_pge.contracts import SourceModel
+from agents.tools.context import ToolContext
+from agents.tools.planner import (
+    PLANNER_TOOL_DEFINITIONS,
+    PLANNER_TOOL_HANDLERS,
+    tool_column_value_overlap,
+    tool_distinct_count,
+    tool_normalized_value_overlap,
+    tool_sample_table,
+    tool_submit_source_model,
+)
+
+
+# =====================================================
+# Fakes
+# =====================================================
+
+
+class FakeClient:
+    """Minimal stand-in for ``DatabricksClient`` — records every SQL string
+    passed to ``execute_query`` and dispatches to a per-test ``handler`` closure.
+    """
+
+    def __init__(self, handler: Callable[[str], Any]):
+        self._handler = handler
+        self.calls: List[str] = []  # every SQL string received, in call order
+
+    def execute_query(self, sql: str):
+        self.calls.append(sql)  # recorded before the handler runs, even if it raises
+        result = self._handler(sql)
+        if isinstance(result, Exception):  # a returned exception instance is raised
+            raise result
+        return result
+
+
+def _ctx(handler: Callable[[str], Any]) -> ToolContext:
+    return ToolContext(client=FakeClient(handler), host="x", token="y")
+
+
+# =====================================================
+# sample_table
+# =====================================================
+
+
+class TestSampleTable:  # tool_sample_table, checked via its parsed JSON reply
+    def test_returns_stringified_rows_and_row_count(self):
+        def handler(sql: str):  # also checks the shape of the generated SQL
+            assert "ORDER BY RAND()" in sql
+            assert "LIMIT 20" in sql
+            assert "cat.sch.t" in sql
+            return [
+                {"id": 1, "name": "alice", "age": None},
+                {"id": 2, "name": "bob", "age": 42},
+            ]
+
+        ctx = _ctx(handler)
+        out = json.loads(tool_sample_table(ctx, full_name="cat.sch.t"))
+
+        assert out["success"] is True
+        assert out["columns"] == ["id", "name", "age"]
+        assert out["row_count"] == 2
+        # Non-null values are expected as strings; nulls stay None.
+        assert out["rows"] == [
+            ["1", "alice", None],
+            ["2", "bob", "42"],
+        ]
+
+    def test_caps_n_at_100_when_500_requested(self):
+        captured = {}  # the handler stores the SQL it received here
+
+        def handler(sql: str):
+            captured["sql"] = sql
+            return []
+
+        ctx = _ctx(handler)
+        out = json.loads(tool_sample_table(ctx, full_name="cat.sch.t", n=500))
+
+        assert out["success"] is True
+        assert "LIMIT 100" in captured["sql"]
+        # The requested 500 must not appear anywhere in the SQL.
+        assert "500" not in captured["sql"]
+
+    def test_default_n_is_20(self):  # n omitted from the call
+        captured = {}
+
+        def handler(sql: str):
+            captured["sql"] = sql
+            return []
+
+        ctx = _ctx(handler)
+        tool_sample_table(ctx, full_name="cat.sch.t")
+        assert "LIMIT 20" in captured["sql"]
+
+    def test_catches_exception_returns_success_false(self):
+        def handler(sql: str):  # simulates a failing query
+            raise RuntimeError("table not found")
+
+        ctx = _ctx(handler)
+        out = json.loads(tool_sample_table(ctx, full_name="cat.sch.missing"))
+
+        assert out["success"] is False
+        assert "table not found" in out["error"]  # the exception text is surfaced
+
+    def test_sample_table_rejects_invalid_full_name(self):
+        """Identifier validator must catch SQL-injection-shaped names
+        *before* any SQL is composed or executed.
+        """
+
+        def handler(sql: str):  # pragma: no cover — must not be called
+            raise AssertionError("execute_query should not have been called")
+
+        ctx = _ctx(handler)
+        out = json.loads(tool_sample_table(ctx, full_name="t; DROP TABLE x"))
+
+        assert out["success"] is False
+        assert "invalid full_name" in out["error"]
+        # Confirm no SQL ever reached the client.
+        assert ctx.client.calls == []
+
+    def test_sample_table_returns_error_on_non_integer_n(self):
+        """Strict ``n`` parsing — a non-coercible value is a tool-call error,
+        not a silent fallback to the default.
+        """
+
+        def handler(sql: str):  # pragma: no cover — must not be called
+            raise AssertionError("execute_query should not have been called")
+
+        ctx = _ctx(handler)
+        out = json.loads(tool_sample_table(ctx, full_name="cat.sch.t", n="abc"))
+
+        assert out["success"] is False
+        assert "invalid n" in out["error"]
+        assert ctx.client.calls == []  # rejected before any query was issued
+
+
+# =====================================================
+# column_value_overlap
+# =====================================================
+
+
+class TestColumnValueOverlap:  # tool_column_value_overlap, checked via its JSON reply
+    def test_happy_path_percentage(self):
+        # 100 distinct from-values, 80 distinct to-values, 60 in intersection.
+        def handler(sql: str):
+            return [
+                {
+                    "from_distinct_count": 100,
+                    "to_distinct_count": 80,
+                    "intersection_count": 60,
+                }
+            ]
+
+        ctx = _ctx(handler)
+        out = json.loads(
+            tool_column_value_overlap(
+                ctx,
+                from_table="cat.sch.a",
+                from_column="nhs",
+                to_table="cat.sch.b",
+                to_column="nhs_number",
+            )
+        )
+
+        assert out["success"] is True
+        assert out["from_distinct_count"] == 100
+        assert out["to_distinct_count"] == 80
+        assert out["intersection_count"] == 60
+        assert out["overlap_pct"] == pytest.approx(0.6)  # 60 / 100 (from side)
+        # Symmetric shape with the zero-denom branch: ``note`` is always
+        # present so downstream consumers can read it unconditionally.
+        assert "note" in out and out["note"] == ""
+
+    def test_zero_from_distinct_no_division_by_zero(self):
+        def handler(sql: str):  # empty from-side: the overlap denominator is 0
+            return [
+                {
+                    "from_distinct_count": 0,
+                    "to_distinct_count": 50,
+                    "intersection_count": 0,
+                }
+            ]
+
+        ctx = _ctx(handler)
+        out = json.loads(
+            tool_column_value_overlap(
+                ctx,
+                from_table="cat.sch.empty",
+                from_column="x",
+                to_table="cat.sch.b",
+                to_column="y",
+            )
+        )
+
+        assert out["success"] is True
+        assert out["overlap_pct"] == 0.0
+        assert out["from_distinct_count"] == 0
+        # ``note`` is always present, so require it to be non-empty here.
+        assert out["note"]
+
+    def test_catches_exception(self):
+        def handler(sql: str):  # simulates a failing query
+            raise RuntimeError("permission denied")
+
+        ctx = _ctx(handler)
+        out = json.loads(
+            tool_column_value_overlap(
+                ctx,
+                from_table="cat.sch.a",
+                from_column="x",
+                to_table="cat.sch.b",
+                to_column="y",
+            )
+        )
+
+        assert out["success"] is False
+        assert "permission denied" in out["error"]  # the exception text is surfaced
+
+    def test_column_value_overlap_rejects_invalid_from_column(self):
+        """Injection-shaped identifier on any of the four args short-circuits
+        before SQL is composed.
+        """
+
+        def handler(sql: str):  # pragma: no cover — must not be called
+            raise AssertionError("execute_query should not have been called")
+
+        ctx = _ctx(handler)
+        out = json.loads(
+            tool_column_value_overlap(
+                ctx,
+                from_table="cat.sch.a",
+                from_column="nhs FROM secrets--",
+                to_table="cat.sch.b",
+                to_column="y",
+            )
+        )
+
+        assert out["success"] is False
+        assert "invalid from_column" in out["error"]
+        assert ctx.client.calls == []  # rejected before any query was issued
+
+
+# =====================================================
+# normalized_value_overlap
+# =====================================================
+
+
+class TestNormalizedValueOverlap:  # overlap over SQL expressions, not bare columns
+    def test_happy_path_interpolates_expressions(self):
+        captured = {}  # the handler stores the SQL it received here
+
+        def handler(sql: str):
+            captured["sql"] = sql
+            return [
+                {
+                    "from_distinct_count": 100,
+                    "to_distinct_count": 40,
+                    "intersection_count": 35,
+                }
+            ]
+
+        ctx = _ctx(handler)
+        out = json.loads(
+            tool_normalized_value_overlap(
+                ctx,
+                from_table="cat.trust_a.maternity_episode",
+                from_expr="regexp_extract(EPISODE_ID, '([a-f0-9][a-f0-9-]+-preg-\\d+)', 1)",
+                to_table="cat.trust_b.delivery",
+                to_expr="regexp_extract(delivery_id, '([a-f0-9][a-f0-9-]+-preg-\\d+)', 1)",
+            )
+        )
+
+        assert out["success"] is True
+        assert out["overlap_pct"] == pytest.approx(0.35)  # 35 / 100 (from side)
+        # The expressions reach the SQL verbatim (not stripped to columns).
+        assert "regexp_extract(EPISODE_ID" in captured["sql"]
+        assert "regexp_extract(delivery_id" in captured["sql"]
+
+    def test_zero_distinct_surfaces_revise_note(self):
+        def handler(sql: str):  # from-side expression matched nothing
+            return [
+                {
+                    "from_distinct_count": 0,
+                    "to_distinct_count": 40,
+                    "intersection_count": 0,
+                }
+            ]
+
+        ctx = _ctx(handler)
+        out = json.loads(
+            tool_normalized_value_overlap(
+                ctx,
+                from_table="cat.trust_a.t",
+                from_expr="regexp_extract(EPISODE_ID, 'nomatch', 1)",
+                to_table="cat.trust_b.t",
+                to_expr="delivery_id",
+            )
+        )
+
+        assert out["success"] is True
+        assert out["overlap_pct"] == 0.0
+        assert "revise" in out["note"].lower()  # note must mention revising
+
+    def test_rejects_injection_in_expression(self):
+        def handler(sql: str):  # pragma: no cover — must not be called
+            raise AssertionError("execute_query should not have been called")
+
+        ctx = _ctx(handler)
+        out = json.loads(
+            tool_normalized_value_overlap(
+                ctx,
+                from_table="cat.sch.a",
+                from_expr="x) AS v FROM cat.sch.a; DROP TABLE secrets--",
+                to_table="cat.sch.b",
+                to_expr="y",
+            )
+        )
+
+        assert out["success"] is False
+        assert "invalid from_expr" in out["error"]
+        assert ctx.client.calls == []  # rejected before any query was issued
+
+    def test_rejects_subquery_keyword_in_expression(self):
+        def handler(sql: str):  # pragma: no cover — must not be called
+            raise AssertionError("execute_query should not have been called")
+
+        ctx = _ctx(handler)
+        out = json.loads(
+            tool_normalized_value_overlap(
+                ctx,
+                from_table="cat.sch.a",
+                from_expr="(SELECT max(id) FROM cat.sch.other)",
+                to_table="cat.sch.b",
+                to_expr="y",
+            )
+        )
+
+        assert out["success"] is False
+        assert "invalid from_expr" in out["error"]
+        assert ctx.client.calls == []  # rejected before any query was issued
+
+    def test_requires_all_four_args(self):  # to_expr is left out of the call
+        ctx = _ctx(lambda sql: [])
+        out = json.loads(
+            tool_normalized_value_overlap(
+                ctx, from_table="cat.sch.a", from_expr="x", to_table="cat.sch.b"
+            )
+        )
+        assert out["success"] is False
+        assert "required" in out["error"]
+
+
+# =====================================================
+# distinct_count
+# =====================================================
+
+
+class TestDistinctCount:  # tool_distinct_count, checked via its parsed JSON reply
+    def test_unique_and_complete(self):
+        def handler(sql: str):  # 100 rows, all distinct, no nulls
+            return [{"row_count": 100, "distinct_count": 100, "null_count": 0}]
+
+        ctx = _ctx(handler)
+        out = json.loads(
+            tool_distinct_count(ctx, full_name="cat.sch.mothers", column="nhs_number")
+        )
+
+        assert out["success"] is True
+        assert out["row_count"] == 100
+        assert out["distinct_count"] == 100
+        assert out["null_count"] == 0
+        assert out["is_unique"] is True
+        assert out["is_complete"] is True
+
+    def test_with_nulls_is_not_complete(self):
+        # 100 rows, 10 nulls, 90 distinct values in the non-null subset.
+        def handler(sql: str):
+            return [{"row_count": 100, "distinct_count": 90, "null_count": 10}]
+
+        ctx = _ctx(handler)
+        out = json.loads(
+            tool_distinct_count(ctx, full_name="cat.sch.t", column="maybe_nullable")
+        )
+
+        assert out["success"] is True
+        assert out["null_count"] == 10
+        assert out["is_complete"] is False  # nulls present
+        # 90 distinct out of (100 - 10) = 90 non-null rows -> unique.
+        assert out["is_unique"] is True
+
+    def test_with_duplicates_is_not_unique(self):
+        # 100 rows, no nulls, only 70 distinct -> duplicates exist.
+        def handler(sql: str):
+            return [{"row_count": 100, "distinct_count": 70, "null_count": 0}]
+
+        ctx = _ctx(handler)
+        out = json.loads(
+            tool_distinct_count(ctx, full_name="cat.sch.t", column="trust_local_id")
+        )
+
+        assert out["success"] is True
+        assert out["is_unique"] is False
+        assert out["is_complete"] is True  # no nulls, despite the duplicates
+
+    def test_catches_exception(self):
+        def handler(sql: str):  # simulates a failing query
+            raise RuntimeError("column does not exist")
+
+        ctx = _ctx(handler)
+        out = json.loads(
+            tool_distinct_count(ctx, full_name="cat.sch.t", column="missing")
+        )
+
+        assert out["success"] is False
+        assert "column does not exist" in out["error"]  # exception text is surfaced
+
+    def test_distinct_count_rejects_invalid_column(self):
+        """Injection-shaped column name short-circuits before SQL runs."""
+
+        def handler(sql: str):  # pragma: no cover — must not be called
+            raise AssertionError("execute_query should not have been called")
+
+        ctx = _ctx(handler)
+        out = json.loads(
+            tool_distinct_count(
+                ctx, full_name="cat.sch.t", column="nhs; DROP TABLE x"
+            )
+        )
+
+        assert out["success"] is False
+        assert "invalid column" in out["error"]
+        assert ctx.client.calls == []  # rejected before any query was issued
+
+
+# =====================================================
+# submit_source_model
+# =====================================================
+
+
+def _valid_source_model_dict() -> Dict[str, Any]:  # fresh dict on every call
+    return {
+        "table_roles": [  # two tables, one candidate class each
+            {
+                "table": "cat.sch.mothers",
+                "ontology_class_candidates": [
+                    {
+                        "uri": "http://ex.org/maternity#Mother",
+                        "confidence": 0.92,
+                        "reason": "row per NHS number",
+                    }
+                ],
+            },
+            {
+                "table": "cat.sch.babies",
+                "ontology_class_candidates": [
+                    {
+                        "uri": "http://ex.org/maternity#Baby",
+                        "confidence": 0.88,
+                        "reason": "row per delivery",
+                    }
+                ],
+            },
+        ],
+        "canonical_ids": [  # a single canonical id, for Mother
+            {
+                "ontology_class": "http://ex.org/maternity#Mother",
+                "canonical_column_per_table": {"cat.sch.mothers": "nhs_number"},
+                "format_note": "NHS number, 10 digits",
+            }
+        ],
+        "join_keys": [  # a single babies -> mothers key
+            {
+                "from_ref": "cat.sch.babies.mother_nhs",
+                "to_ref": "cat.sch.mothers.nhs_number",
+                "confidence": 0.9,
+                "overlap_pct": 0.95,
+                "kind": "same_trust_fk",
+            }
+        ],
+        "mapping_plan": {  # two entities, one relationship, nothing skipped
+            "entity_order": [
+                "http://ex.org/maternity#Mother",
+                "http://ex.org/maternity#Baby",
+            ],
+            "relationship_order": ["http://ex.org/maternity#hasBaby"],
+            "skip": [],
+        },
+    }
+
+
+class TestSubmitSourceModel:  # contexts here are built without a client
+    def test_valid_model_stores_and_returns_summary(self):
+        ctx = ToolContext(host="x", token="y")
+        model = _valid_source_model_dict()
+
+        out = json.loads(tool_submit_source_model(ctx, model=model))
+
+        assert out["success"] is True
+        assert isinstance(ctx.source_model, SourceModel)  # parsed, not the raw dict
+        assert len(ctx.source_model.table_roles) == 2
+        assert ctx.source_model.canonical_ids[0].ontology_class == (
+            "http://ex.org/maternity#Mother"
+        )
+        # The summary counts match the sizes of the submitted dict's sections.
+        summary = out["summary"]
+        assert summary["table_roles"] == 2
+        assert summary["canonical_ids"] == 1
+        assert summary["join_keys"] == 1
+        assert summary["entity_order_len"] == 2
+        assert summary["relationship_order_len"] == 1
+
+    def test_malformed_missing_required_field_returns_failure(self):
+        ctx = ToolContext(host="x", token="y")
+        # ``TableRole.from_dict`` requires the ``table`` key — drop it.
+        bad = _valid_source_model_dict()
+        del bad["table_roles"][0]["table"]
+
+        out = json.loads(tool_submit_source_model(ctx, model=bad))
+
+        assert out["success"] is False
+        assert isinstance(out["error"], str) and out["error"]  # non-empty message
+        # ctx.source_model unchanged (still None).
+        assert ctx.source_model is None
+
+    def test_empty_table_roles_is_still_stored(self):
+        """Structural validity only — semantic emptiness is the
+        orchestrator's call, not the tool layer's.
+        """
+        ctx = ToolContext(host="x", token="y")
+        model = _valid_source_model_dict()
+        model["table_roles"] = []  # structurally valid, semantically empty
+
+        out = json.loads(tool_submit_source_model(ctx, model=model))
+
+        assert out["success"] is True
+        assert isinstance(ctx.source_model, SourceModel)
+        assert ctx.source_model.table_roles == []
+        assert out["summary"]["table_roles"] == 0
+
+
+# =====================================================
+# Aggregate exports
+# =====================================================
+
+
+class TestPlannerExports:  # both aggregates must expose exactly these tools
+    _EXPECTED_TOOLS = {
+        "sample_table",
+        "column_value_overlap",
+        "normalized_value_overlap",
+        "distinct_count",
+        "submit_source_model",
+    }
+
+    def test_definitions_cover_all_tools(self):
+        declared = {spec["function"]["name"] for spec in PLANNER_TOOL_DEFINITIONS}
+        assert declared == self._EXPECTED_TOOLS
+
+    def test_handlers_match_definitions(self):
+        handlers = PLANNER_TOOL_HANDLERS
+        assert set(handlers) == self._EXPECTED_TOOLS
+        # Every registered handler has to be callable.
+        assert all(callable(handler) for handler in handlers.values())
diff --git a/tests/fixtures/pge_eval/clean_artifact.json b/tests/fixtures/pge_eval/clean_artifact.json
new file mode 100644
index 00000000..ca0818e7
--- /dev/null
+++ b/tests/fixtures/pge_eval/clean_artifact.json
@@ -0,0 +1,223 @@
+{
+  "success": true,
+  "iterations": 3,
+  "usage": {
+    "prompt_tokens": 1000,
+    "completion_tokens": 400
+  },
+  "stats": {
+    "planner_reinvocations": 0
+  },
+  "mapping_run_log": [
+    {
+      "item": "ex:Customer",
+      "kind": "entity",
+      "attempts": [
+        {}
+      ],
+      "final_status": "PASS"
+    },
+    {
+      "item": "ex:Order",
+      "kind": "entity",
+      "attempts": [
+        {}
+      ],
+      "final_status": "PASS"
+    },
+    {
+      "item": "ex:Product",
+      "kind": "entity",
+      "attempts": [
+        {}
+      ],
+      "final_status": "PASS"
+    },
+    {
+      "item": "ex:placesOrder",
+      "kind": "relationship",
+      "attempts": [
+        {}
+      ],
+      "final_status": "PASS"
+    },
+    {
+      "item": "ex:containsProduct",
+      "kind": "relationship",
+      "attempts": [
+        {}
+      ],
+      "final_status": "PASS"
+    }
+  ],
+  "mapping_evaluations": {
+    "ex:Customer": {
+      "metrics": {
+        "row_count": 100,
+        "distinct_id_count": 100,
+        "null_id_count": 0
+      },
+      "failures": []
+    },
+    "ex:Order": {
+      "metrics": {
+        "row_count": 500,
+        "distinct_id_count": 500,
+        "null_id_count": 0
+      },
+      "failures": []
+    },
+    "ex:Product": {
+      "metrics": {
+        "row_count": 50,
+        "distinct_id_count": 50,
+        "null_id_count": 0
+      },
+      "failures": []
+    },
+    "ex:placesOrder": {
+      "metrics": {
+        "total_edges": 500,
+        "dangling_source_pct": 0.0,
+        "dangling_target_pct": 0.0
+      },
+      "failures": []
+    },
+    "ex:containsProduct": {
+      "metrics": {
+        "total_edges": 800,
+        "dangling_source_pct": 0.0,
+        "dangling_target_pct": 0.0
+      },
+      "failures": []
+    }
+  },
+  "entity_mappings": [
+    {
+      "ontology_class": "ex:Customer",
+      "attribute_mappings": {
+        "firstName": "first_name",
+        "lastName": "last_name",
+        "email": "email"
+      }
+    },
+    {
+      "ontology_class": "ex:Order",
+      "attribute_mappings": {
+        "orderDate": "order_date",
+        "totalAmount": "total_amount"
+      }
+    },
+    {
+      "ontology_class": "ex:Product",
+      "attribute_mappings": {
+        "sku": "sku",
+        "unitPrice": "unit_price"
+      }
+    }
+  ],
+  "relationship_mappings": [],
+  "steps": [
+    {
+      "step_type": "planner",
+      "tool_name": "",
+      "duration_ms": 1200
+    }
+  ],
+  "ontology": {
+    "entities": [
+      {
+        "uri": "ex:Customer",
+        "name": "Customer",
+        "attributes": [
+          "firstName",
+          "lastName",
+          "email"
+        ]
+      },
+      {
+        "uri": "ex:Order",
+        "name": "Order",
+        "attributes": [
+          "orderDate",
+          "totalAmount"
+        ]
+      },
+      {
+        "uri": "ex:Product",
+        "name": "Product",
+        "attributes": [
+          "sku",
+          "unitPrice"
+        ]
+      }
+    ],
+    "relationships": [
+      {
+        "uri": "ex:placesOrder",
+        "name": "placesOrder",
+        "domain": "ex:Customer",
+        "range": "ex:Order"
+      },
+      {
+        "uri": "ex:containsProduct",
+        "name": "containsProduct",
+        "domain": "ex:Order",
+        "range": "ex:Product"
+      }
+    ]
+  },
+  "metadata": {
+    "tables": [
+      {
+        "name": "customers",
+        "columns": [
+          {
+            "name": "id"
+          },
+          {
+            "name": "first_name"
+          },
+          {
+            "name": "last_name"
+          },
+          {
+            "name": "email"
+          },
+          {
+            "name": "created_at"
+          }
+        ]
+      },
+      {
+        "name": "orders",
+        "columns": [
+          {
+            "name": "id"
+          },
+          {
+            "name": "order_date"
+          },
+          {
+            "name": "total_amount"
+          }
+        ]
+      },
+      {
+        "name": "products",
+        "columns": [
+          {
+            "name": "id"
+          },
+          {
+            "name": "sku"
+          },
+          {
+            "name": "unit_price"
+          }
+        ]
+      }
+    ]
+  },
+  "elapsed_s": 42.5
+}
\ No newline at end of file
diff --git a/tests/fixtures/pge_eval/defect_dangling_fk.json b/tests/fixtures/pge_eval/defect_dangling_fk.json
new file mode 100644
index 00000000..b30982f2
--- /dev/null
+++ b/tests/fixtures/pge_eval/defect_dangling_fk.json
@@ -0,0 +1,223 @@
+{
+  "success": true,
+  "iterations": 3,
+  "usage": {
+    "prompt_tokens": 1000,
+    "completion_tokens": 400
+  },
+  "stats": {
+    "planner_reinvocations": 0
+  },
+  "mapping_run_log": [
+    {
+      "item": "ex:Customer",
+      "kind": "entity",
+      "attempts": [
+        {}
+      ],
+      "final_status": "PASS"
+    },
+    {
+      "item": "ex:Order",
+      "kind": "entity",
+      "attempts": [
+        {}
+      ],
+      "final_status": "PASS"
+    },
+    {
+      "item": "ex:Product",
+      "kind": "entity",
+      "attempts": [
+        {}
+      ],
+      "final_status": "PASS"
+    },
+    {
+      "item": "ex:placesOrder",
+      "kind": "relationship",
+      "attempts": [
+        {}
+      ],
+      "final_status": "PASS"
+    },
+    {
+      "item": "ex:containsProduct",
+      "kind": "relationship",
+      "attempts": [
+        {}
+      ],
+      "final_status": "PASS"
+    }
+  ],
+  "mapping_evaluations": {
+    "ex:Customer": {
+      "metrics": {
+        "row_count": 100,
+        "distinct_id_count": 100,
+        "null_id_count": 0
+      },
+      "failures": []
+    },
+    "ex:Order": {
+      "metrics": {
+        "row_count": 500,
+        "distinct_id_count": 500,
+        "null_id_count": 0
+      },
+      "failures": []
+    },
+    "ex:Product": {
+      "metrics": {
+        "row_count": 50,
+        "distinct_id_count": 50,
+        "null_id_count": 0
+      },
+      "failures": []
+    },
+    "ex:placesOrder": {
+      "metrics": {
+        "total_edges": 500,
+        "dangling_source_pct": 0.0,
+        "dangling_target_pct": 0.47
+      },
+      "failures": []
+    },
+    "ex:containsProduct": {
+      "metrics": {
+        "total_edges": 800,
+        "dangling_source_pct": 0.0,
+        "dangling_target_pct": 0.0
+      },
+      "failures": []
+    }
+  },
+  "entity_mappings": [
+    {
+      "ontology_class": "ex:Customer",
+      "attribute_mappings": {
+        "firstName": "first_name",
+        "lastName": "last_name",
+        "email": "email"
+      }
+    },
+    {
+      "ontology_class": "ex:Order",
+      "attribute_mappings": {
+        "orderDate": "order_date",
+        "totalAmount": "total_amount"
+      }
+    },
+    {
+      "ontology_class": "ex:Product",
+      "attribute_mappings": {
+        "sku": "sku",
+        "unitPrice": "unit_price"
+      }
+    }
+  ],
+  "relationship_mappings": [],
+  "steps": [
+    {
+      "step_type": "planner",
+      "tool_name": "",
+      "duration_ms": 1200
+    }
+  ],
+  "ontology": {
+    "entities": [
+      {
+        "uri": "ex:Customer",
+        "name": "Customer",
+        "attributes": [
+          "firstName",
+          "lastName",
+          "email"
+        ]
+      },
+      {
+        "uri": "ex:Order",
+        "name": "Order",
+        "attributes": [
+          "orderDate",
+          "totalAmount"
+        ]
+      },
+      {
+        "uri": "ex:Product",
+        "name": "Product",
+        "attributes": [
+          "sku",
+          "unitPrice"
+        ]
+      }
+    ],
+    "relationships": [
+      {
+        "uri": "ex:placesOrder",
+        "name": "placesOrder",
+        "domain": "ex:Customer",
+        "range": "ex:Order"
+      },
+      {
+        "uri": "ex:containsProduct",
+        "name": "containsProduct",
+        "domain": "ex:Order",
+        "range": "ex:Product"
+      }
+    ]
+  },
+  "metadata": {
+    "tables": [
+      {
+        "name": "customers",
+        "columns": [
+          {
+            "name": "id"
+          },
+          {
+            "name": "first_name"
+          },
+          {
+            "name": "last_name"
+          },
+          {
+            "name": "email"
+          },
+          {
+            "name": "created_at"
+          }
+        ]
+      },
+      {
+        "name": "orders",
+        "columns": [
+          {
+            "name": "id"
+          },
+          {
+            "name": "order_date"
+          },
+          {
+            "name": "total_amount"
+          }
+        ]
+      },
+      {
+        "name": "products",
+        "columns": [
+          {
+            "name": "id"
+          },
+          {
+            "name": "sku"
+          },
+          {
+            "name": "unit_price"
+          }
+        ]
+      }
+    ]
+  },
+  "elapsed_s": 42.5
+}
\ No newline at end of file
diff --git a/tests/units/auth/test_auth.py b/tests/units/auth/test_auth.py
index f41d5562..17d7c334 100644
--- a/tests/units/auth/test_auth.py
+++ b/tests/units/auth/test_auth.py
@@ -241,8 +241,11 @@ def test_cloud_fetch_disabled_when_pyarrow_missing(self, monkeypatch):
             assert ok is False
             assert "pyarrow" in reason.lower()
 
+    @patch("socket.create_connection")
     @patch("databricks.sql.connect")
-    def test_probe_cloud_fetch_capability_success(self, mock_connect, monkeypatch):
+    def test_probe_cloud_fetch_capability_success(
+        self, mock_connect, mock_socket, monkeypatch
+    ):
         _clear_databricks_env(monkeypatch)
         self._reset_cache()
         auth = DatabricksAuth(
@@ -251,11 +254,15 @@ def test_probe_cloud_fetch_capability_success(self, mock_connect, monkeypatch):
             warehouse_id="wh-1",
         )
 
+        # Stage-1 TCP egress check: every probed host accepts the connect.
+        mock_socket.return_value.__enter__.return_value = MagicMock()
+
+        # Stage-2 SQL load-test.
         conn_cm = MagicMock()
         cur_cm = MagicMock()
         mock_connect.return_value.__enter__.return_value = conn_cm
         conn_cm.cursor.return_value.__enter__.return_value = cur_cm
-        cur_cm.fetchall.return_value = [(1,)]
+        cur_cm.fetchmany.return_value = [(0,)]
 
         ok, reason = auth.probe_cloud_fetch_capability()
         assert ok is True
@@ -263,12 +270,43 @@ def test_probe_cloud_fetch_capability_success(self, mock_connect, monkeypatch):
         kwargs = mock_connect.call_args.kwargs
         assert kwargs["use_cloud_fetch"] is True
         assert kwargs["server_hostname"] == "ws.cloud.databricks.com"
+        # Probe must run a query large enough to actually trigger CloudFetch;
+        # SELECT 1 returns inline via Thrift and never exercises the
+        # blocked-egress path (regression: the previous SELECT 1 probe
+        # falsely reported "ok" on Databricks Apps that block storage egress).
+        executed_sql = cur_cm.execute.call_args.args[0]
+        assert "range(5000000)" in executed_sql
+        cur_cm.fetchmany.assert_called_once_with(1)
 
         # Cached result drives can_use_cloud_fetch with no extra connect.
         mock_connect.reset_mock()
         assert auth.can_use_cloud_fetch() is True
         mock_connect.assert_not_called()
 
+    @patch("databricks.sql.connect")
+    @patch("socket.create_connection", side_effect=ConnectionRefusedError("blocked"))
+    def test_probe_cloud_fetch_tcp_egress_blocked(
+        self, _mock_socket, mock_connect, monkeypatch
+    ):
+        """The fast TCP probe to the CloudFetch storage host must catch
+        the Databricks Apps egress block BEFORE running an expensive
+        SQL load-test. Regression for the case where Apps egress is
+        blocked at L3/L4 — no point burning a 40 MB SQL query."""
+        _clear_databricks_env(monkeypatch)
+        self._reset_cache()
+        auth = DatabricksAuth(
+            host="https://ws.cloud.databricks.com",
+            token="sql-pat",
+            warehouse_id="wh-1",
+        )
+        ok, reason = auth.probe_cloud_fetch_capability()
+        assert ok is False
+        assert "storage.cloud.databricks.com" in reason
+        assert "TCP egress blocked" in reason
+        # SQL load-test must NOT run if the cheap TCP probe already failed.
+        mock_connect.assert_not_called()
+        assert auth.can_use_cloud_fetch() is False
+
     @patch("databricks.sql.connect", side_effect=RuntimeError("blocked"))
     def test_probe_cloud_fetch_capability_failure(self, _mock_connect, monkeypatch):
         _clear_databricks_env(monkeypatch)
@@ -284,6 +322,42 @@ def test_probe_cloud_fetch_capability_failure(self, _mock_connect, monkeypatch):
 
         assert auth.can_use_cloud_fetch() is False
 
+    @patch("socket.create_connection")
+    @patch("databricks.sql.connect")
+    def test_probe_cloud_fetch_blocked_egress_caught_at_fetchmany(
+        self, mock_connect, mock_socket, monkeypatch
+    ):
+        """Backstop case: TCP probe passes (e.g. an Apps env that opens
+        port 443 to the storage host but blocks the actual HTTP download
+        at L7), then the SQL load-test catches the failure during
+        fetchmany when the connector tries to download the first chunk."""
+        _clear_databricks_env(monkeypatch)
+        self._reset_cache()
+        auth = DatabricksAuth(
+            host="https://ws.cloud.databricks.com",
+            token="sql-pat",
+            warehouse_id="wh-1",
+        )
+
+        # TCP probe passes.
+        mock_socket.return_value.__enter__.return_value = MagicMock()
+
+        # SQL load-test: connect + execute work, fetchmany raises during
+        # the first CloudFetch chunk download.
+        conn_cm = MagicMock()
+        cur_cm = MagicMock()
+        mock_connect.return_value.__enter__.return_value = conn_cm
+        conn_cm.cursor.return_value.__enter__.return_value = cur_cm
+        cur_cm.fetchmany.side_effect = ConnectionError(
+            "HTTPSConnectionPool(host='us-east-1.storage.cloud.databricks.com', "
+            "port=443): Connection refused"
+        )
+
+        ok, reason = auth.probe_cloud_fetch_capability()
+        assert ok is False
+        assert "storage.cloud.databricks.com" in reason
+        assert auth.can_use_cloud_fetch() is False
+
     def test_can_use_cloud_fetch_default_app_mode_enabled(self, monkeypatch):
         _clear_databricks_env(monkeypatch)
         self._reset_cache()
diff --git a/tests/units/pge_eval/__init__.py b/tests/units/pge_eval/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/units/pge_eval/_fixtures.py b/tests/units/pge_eval/_fixtures.py
new file mode 100644
index 00000000..71ee60d6
--- /dev/null
+++ b/tests/units/pge_eval/_fixtures.py
@@ -0,0 +1,178 @@
+"""Synthetic, usecase-agnostic fixtures for the PGE evaluator unit tests.
+
+Deliberately uses a generic e-commerce-ish toy domain (Customer / Order /
+Product) so the tests prove the scorer is domain-free — none of these names
+appear in the scorer code.
+"""
+
+from copy import deepcopy
+
+
+def clean_ontology() -> dict:
+    """Agent-shape ontology that is fully structurally clean."""
+    return {
+        "entities": [
+            {
+                "uri": "ex:Customer",
+                "name": "Customer",
+                "attributes": ["firstName", "lastName", "email"],
+            },
+            {
+                "uri": "ex:Order",
+                "name": "Order",
+                "attributes": ["orderDate", "totalAmount"],
+            },
+            {
+                "uri": "ex:Product",
+                "name": "Product",
+                "attributes": ["sku", "unitPrice"],
+            },
+        ],
+        "relationships": [
+            {
+                "uri": "ex:placesOrder",
+                "name": "placesOrder",
+                "domain": "ex:Customer",
+                "range": "ex:Order",
+            },
+            {
+                "uri": "ex:containsProduct",
+                "name": "containsProduct",
+                "domain": "ex:Order",
+                "range": "ex:Product",
+            },
+        ],
+    }
+
+
+def clean_metadata() -> dict:
+    return {
+        "tables": [
+            {
+                "name": "customers",
+                "columns": [
+                    {"name": "id"},
+                    {"name": "first_name"},
+                    {"name": "last_name"},
+                    {"name": "email"},
+                    {"name": "created_at"},
+                ],
+            },
+            {
+                "name": "orders",
+                "columns": [
+                    {"name": "id"},
+                    {"name": "order_date"},
+                    {"name": "total_amount"},
+                ],
+            },
+            {
+                "name": "products",
+                "columns": [
+                    {"name": "id"},
+                    {"name": "sku"},
+                    {"name": "unit_price"},
+                ],
+            },
+        ]
+    }
+
+
+def clean_artifact() -> dict:
+    onto = clean_ontology()
+    meta = clean_metadata()
+    return {
+        "success": True,
+        "iterations": 3,
+        "usage": {"prompt_tokens": 1000, "completion_tokens": 400},
+        "stats": {"planner_reinvocations": 0},
+        "mapping_run_log": [
+            {"item": "ex:Customer", "kind": "entity", "attempts": [{}], "final_status": "PASS"},
+            {"item": "ex:Order", "kind": "entity", "attempts": [{}], "final_status": "PASS"},
+            {"item": "ex:Product", "kind": "entity", "attempts": [{}], "final_status": "PASS"},
+            {"item": "ex:placesOrder", "kind": "relationship", "attempts": [{}], "final_status": "PASS"},
+            {"item": "ex:containsProduct", "kind": "relationship", "attempts": [{}], "final_status": "PASS"},
+        ],
+        "mapping_evaluations": {
+            "ex:Customer": {"metrics": {"row_count": 100, "distinct_id_count": 100, "null_id_count": 0}, "failures": []},
+            "ex:Order": {"metrics": {"row_count": 500, "distinct_id_count": 500, "null_id_count": 0}, "failures": []},
+            "ex:Product": {"metrics": {"row_count": 50, "distinct_id_count": 50, "null_id_count": 0}, "failures": []},
+            "ex:placesOrder": {"metrics": {"total_edges": 500, "dangling_source_pct": 0.0, "dangling_target_pct": 0.0}, "failures": []},
+            "ex:containsProduct": {"metrics": {"total_edges": 800, "dangling_source_pct": 0.0, "dangling_target_pct": 0.0}, "failures": []},
+        },
+        "entity_mappings": [
+            {"ontology_class": "ex:Customer", "attribute_mappings": {"firstName": "first_name", "lastName": "last_name", "email": "email"}},
+            {"ontology_class": "ex:Order", "attribute_mappings": {"orderDate": "order_date", "totalAmount": "total_amount"}},
+            {"ontology_class": "ex:Product", "attribute_mappings": {"sku": "sku", "unitPrice": "unit_price"}},
+        ],
+        "relationship_mappings": [],
+        "steps": [{"step_type": "planner", "tool_name": "", "duration_ms": 1200}],
+        "ontology": onto,
+        "metadata": meta,
+        "elapsed_s": 42.5,
+    }
+
+
+def artifact_with_dangling_fk() -> dict:
+    """Clean except ex:placesOrder has a dangling target FK fraction of 0.47 (> 5%)."""
+    art = clean_artifact()
+    art["mapping_evaluations"]["ex:placesOrder"]["metrics"]["dangling_target_pct"] = 0.47
+    return art
+
+
+def artifact_with_sql_failure() -> dict:
+    """Clean except one entity's SQL failed to execute."""
+    art = clean_artifact()
+    art["mapping_evaluations"]["ex:Order"] = {
+        "metrics": {"sql_error": "UNION type mismatch"},
+        "failures": [
+            {
+                "check": "sql_execution",
+                "expected": "SQL executes without error",
+                "observed": "execution error",
+                "hint": "fix the SQL",
+            }
+        ],
+    }
+    # The entity drops out of PASS in the run log too (in-scope but failed).
+    for entry in art["mapping_run_log"]:
+        if entry["item"] == "ex:Order":
+            entry["final_status"] = "FAIL"
+    return art
+
+
+def ontology_with_orphan() -> dict:
+    """Add a class with no data properties and no relationships."""
+    onto = clean_ontology()
+    onto["entities"].append({"uri": "ex:Ghost", "name": "Ghost", "attributes": []})
+    return onto
+
+
+def artifact_with_orphan_class() -> dict:
+    art = clean_artifact()
+    art["ontology"] = ontology_with_orphan()
+    return art
+
+
+def ontology_with_dangling_range() -> dict:
+    onto = clean_ontology()
+    onto["relationships"].append(
+        {"uri": "ex:refersTo", "name": "refersTo", "domain": "ex:Order", "range": "ex:Nonexistent"}
+    )
+    return onto
+
+
+def ontology_with_naming_violation() -> dict:
+    onto = clean_ontology()
+    onto["entities"].append(
+        {"uri": "ex:bad_class", "name": "bad_class", "attributes": ["someAttr"]}
+    )
+    return onto
+
+
+def ontology_with_duplicate_class() -> dict:
+    onto = clean_ontology()
+    onto["entities"].append(
+        {"uri": "ex:Customer2", "name": "Customer", "attributes": ["nickname"]}
+    )
+    return onto
diff --git a/tests/units/pge_eval/test_baseline_regression.py b/tests/units/pge_eval/test_baseline_regression.py
new file mode 100644
index 00000000..11235c7b
--- /dev/null
+++ b/tests/units/pge_eval/test_baseline_regression.py
@@ -0,0 +1,59 @@
+"""Tier-3 self-baseline regression: store a GREEN run, then a worse run scores RED."""
+
+from agents.pge_eval.baseline import load_baseline, save_scorecard
+from agents.pge_eval.scorecard import score_artifact
+
+from tests.units.pge_eval import _fixtures as fx
+
+
+def test_worse_run_regresses_against_stored_baseline(tmp_path):
+    baseline_dir = str(tmp_path / "goals")
+
+    # First run: clean -> GREEN, stored as the baseline.
+    first = score_artifact(
+        fx.clean_artifact(),
+        no_judge=True,
+        baseline_dir=baseline_dir,
+        run_id="run-001",
+        timestamp="2026-06-10T00:00:00Z",
+    )
+    assert first["verdict"] == "GREEN"
+    save_scorecard(first, baseline_dir)
+
+    # A GREEN baseline is now discoverable.
+    base = load_baseline(baseline_dir, exclude_run_id="run-002")
+    assert base is not None and base["run_id"] == "run-001"
+
+    # Second run: a worse artifact (entity completeness drops) scored against
+    # the stored baseline -> Tier-3 regression -> RED on the regressed metric.
+    worse = fx.clean_artifact()
+    for entry in worse["mapping_run_log"]:
+        if entry["item"] in ("ex:Order", "ex:Product"):
+            entry["final_status"] = "FAIL"
+    second = score_artifact(
+        worse,
+        no_judge=True,
+        baseline_dir=baseline_dir,
+        run_id="run-002",
+        timestamp="2026-06-10T01:00:00Z",
+    )
+    assert second["verdict"] == "RED"
+    regressions = second["gates"]["tier3_regression"]["regressions"]
+    assert second["gates"]["tier3_regression"]["baseline_run_id"] == "run-001"
+    assert any(r["metric"] == "mapping.entity_completeness" for r in regressions)
+
+
+def test_red_run_does_not_become_baseline(tmp_path):
+    baseline_dir = str(tmp_path / "goals")
+    red = score_artifact(
+        fx.artifact_with_dangling_fk(),
+        no_judge=True,
+        baseline_dir=baseline_dir,
+        run_id="red-001",
+        timestamp="2026-06-10T00:00:00Z",
+        use_baseline=False,
+    )
+    assert red["verdict"] == "RED"
+    save_scorecard(red, baseline_dir)
+    # RED runs are never selected as a baseline.
+    assert load_baseline(baseline_dir) is None
diff --git a/tests/units/pge_eval/test_gates.py b/tests/units/pge_eval/test_gates.py
new file mode 100644
index 00000000..bd28cc72
--- /dev/null
+++ b/tests/units/pge_eval/test_gates.py
@@ -0,0 +1,136 @@
+"""Gate-tier tests (Tier-1 absolute, Tier-2 ratio, Tier-3 regression)."""
+
+from agents.pge_eval import gates
+
+
+def _stages(onto=None, mapping=None):
+    base_onto = {
+        "table_footprint_coverage": 1.0,
+        "column_footprint_coverage": 1.0,
+        "orphan_class_count": 0,
+        "dangling_domain_range_count": 0,
+        "naming_violation_count": 0,
+        "duplicate_class_count": 0,
+    }
+    base_map = {
+        "entity_completeness": 1.0,
+        "relationship_completeness": 1.0,
+        "attribute_coverage": 1.0,
+        "dangling_target_pct_max": 0.0,
+        "dangling_source_pct_max": 0.0,
+        "id_integrity": 1.0,
+        "sql_exec_failures": 0,
+        "cross_source_band_compliance": 1.0,
+    }
+    base_onto.update(onto or {})
+    base_map.update(mapping or {})
+    return {
+        "ontology": {"metrics": base_onto},
+        "mapping": {"metrics": base_map},
+    }
+
+
+def test_tier1_passes_when_clean():
+    res = gates.evaluate_tier1(_stages())
+    assert res["passed"] is True
+    assert res["failures"] == []
+
+
+def test_tier1_fails_on_orphan():
+    res = gates.evaluate_tier1(_stages(onto={"orphan_class_count": 2}))
+    assert res["passed"] is False
+    assert any(f["metric"] == "ontology.orphan_class_count" for f in res["failures"])
+
+
+def test_tier1_fails_on_dangling_fk():
+    res = gates.evaluate_tier1(_stages(mapping={"dangling_target_pct_max": 0.47}))
+    assert res["passed"] is False
+
+
+def test_tier1_fails_on_sql_exec():
+    res = gates.evaluate_tier1(_stages(mapping={"sql_exec_failures": 1}))
+    assert res["passed"] is False
+
+
+def test_tier1_band_skipped_when_inactive():
+    # band compliance < 1 but conditional inactive -> not a failure.
+    res = gates.evaluate_tier1(
+        _stages(mapping={"cross_source_band_compliance": 0.5}),
+        active_conditionals={"band_active": False},
+    )
+    assert res["passed"] is True
+
+
+def test_tier1_band_gated_when_active():
+    res = gates.evaluate_tier1(
+        _stages(mapping={"cross_source_band_compliance": 0.5}),
+        active_conditionals={"band_active": True},
+    )
+    assert res["passed"] is False
+
+
+def test_tier2_warns_but_does_not_gate_by_default():
+    res = gates.evaluate_tier2(
+        _stages(onto={"column_footprint_coverage": 0.5}), gate_ratios=False
+    )
+    assert res["passed"] is True  # warn only
+    assert res["warnings"]
+
+
+def test_tier2_gates_when_requested():
+    res = gates.evaluate_tier2(
+        _stages(onto={"column_footprint_coverage": 0.5}), gate_ratios=True
+    )
+    assert res["passed"] is False
+
+
+def test_tier3_no_baseline_passes():
+    res = gates.evaluate_tier3(_stages(), None)
+    assert res["passed"] is True
+    assert res["baseline_run_id"] is None
+
+
+def test_tier3_detects_ratio_regression():
+    baseline = {"run_id": "b1", "stages": _stages(mapping={"entity_completeness": 1.0})}
+    current = _stages(mapping={"entity_completeness": 0.6})
+    res = gates.evaluate_tier3(current, baseline)
+    assert res["passed"] is False
+    assert any(r["metric"] == "mapping.entity_completeness" for r in res["regressions"])
+
+
+def test_tier3_detects_count_regression():
+    baseline = {"run_id": "b1", "stages": _stages(onto={"orphan_class_count": 0})}
+    current = _stages(onto={"orphan_class_count": 1})
+    res = gates.evaluate_tier3(current, baseline)
+    assert res["passed"] is False
+
+
+def test_tier3_tolerance_absorbs_tiny_drop():
+    baseline = {"run_id": "b1", "stages": _stages(mapping={"entity_completeness": 1.0})}
+    current = _stages(mapping={"entity_completeness": 0.99})  # within 0.02 tol
+    res = gates.evaluate_tier3(current, baseline)
+    assert res["passed"] is True
+
+
+def test_tier3_conditional_band_not_flagged_when_inactive():
+    # Baseline had no band (inactive 1.0); current introduces a band < 1.0.
+    # This is a first measurement, NOT a regression — must NOT flag.
+    base = _stages(mapping={"cross_source_band_compliance": 1.0})
+    base["mapping"]["band_active"] = False
+    baseline = {"run_id": "b1", "stages": base}
+    cur = _stages(mapping={"cross_source_band_compliance": 0.6})
+    cur["mapping"]["band_active"] = True
+    res = gates.evaluate_tier3(cur, baseline)
+    assert res["passed"] is True
+    assert not any(r["metric"].endswith("cross_source_band_compliance") for r in res["regressions"])
+
+
+def test_tier3_conditional_band_flagged_when_active_in_both():
+    base = _stages(mapping={"cross_source_band_compliance": 1.0})
+    base["mapping"]["band_active"] = True
+    baseline = {"run_id": "b1", "stages": base}
+    cur = _stages(mapping={"cross_source_band_compliance": 0.6})
+    cur["mapping"]["band_active"] = True
+    res = gates.evaluate_tier3(cur, baseline)
+    assert res["passed"] is False
+    assert any(r["metric"].endswith("cross_source_band_compliance") for r in res["regressions"])
diff --git a/tests/units/pge_eval/test_inapp.py b/tests/units/pge_eval/test_inapp.py
new file mode 100644
index 00000000..de250725
--- /dev/null
+++ b/tests/units/pge_eval/test_inapp.py
@@ -0,0 +1,65 @@
+"""In-app scorecard hooks — run inside the app after generation/mapping."""
+
+from agents.pge_eval import inapp
+
+from tests.units.pge_eval import _fixtures as fx
+
+_TTL = """@prefix owl: <http://www.w3.org/2002/07/owl#> .
+@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
+@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .
+@prefix : <http://ex.org/o#> .
+:Vehicle a owl:Class ; rdfs:label "Vehicle" .
+:Depot a owl:Class ; rdfs:label "Depot" .
+:stationedAt a owl:ObjectProperty ; rdfs:domain :Vehicle ; rdfs:range :Depot .
+:plate a owl:DatatypeProperty ; rdfs:domain :Vehicle ; rdfs:range xsd:string .
+:depotName a owl:DatatypeProperty ; rdfs:domain :Depot ; rdfs:range xsd:string .
+"""
+
+# The model sometimes prepends prose; the hook must still score it (see clean_owl_output).
+_TTL_PROSE = "Here is the ontology you asked for.\n\n" + _TTL
+
+
+def test_score_generated_ontology_clean_is_green():
+    sc = inapp.score_generated_ontology(_TTL, {"tables": []})
+    assert sc is not None
+    assert sc["verdict"] == "GREEN"
+    assert sc["mode"] == "live"
+    assert sc["stages"]["ontology"]["metrics"]["orphan_class_count"] == 0
+
+
+def test_score_generated_ontology_handles_prose_preamble():
+    sc = inapp.score_generated_ontology(_TTL_PROSE, {"tables": []})
+    assert sc is not None and sc["verdict"] == "GREEN"
+
+
+def test_score_generated_ontology_never_raises_on_garbage():
+    # Fails open -> None, never throws into the generation path.
+    assert inapp.score_generated_ontology("not turtle {{{", {}) is None
+
+
+def test_score_mapping_run_clean_is_green():
+    art = fx.clean_artifact()
+    sc = inapp.score_mapping_run(
+        ontology=art["ontology"],
+        metadata=art["metadata"],
+        mapping_run_log=art["mapping_run_log"],
+        mapping_evaluations=art["mapping_evaluations"],
+        entity_mappings=art["entity_mappings"],
+        relationship_mappings=art.get("relationship_mappings"),
+        usage=art.get("usage"),
+    )
+    assert sc is not None
+    assert sc["verdict"] == "GREEN"
+    assert sc["stages"]["mapping"]["metrics"]["id_integrity"] == 1.0
+
+
+def test_score_mapping_run_red_on_seeded_dangling_fk():
+    art = fx.artifact_with_dangling_fk()
+    sc = inapp.score_mapping_run(
+        ontology=art["ontology"],
+        metadata=art["metadata"],
+        mapping_run_log=art["mapping_run_log"],
+        mapping_evaluations=art["mapping_evaluations"],
+        entity_mappings=art["entity_mappings"],
+    )
+    assert sc is not None and sc["verdict"] == "RED"
diff --git a/tests/units/pge_eval/test_judge.py b/tests/units/pge_eval/test_judge.py
new file mode 100644
index 00000000..5b7b0beb
--- /dev/null
+++ b/tests/units/pge_eval/test_judge.py
@@ -0,0 +1,65 @@
+"""Advisory LLM-judge tests (the judge is the only network module). No network here:
+we test the pure parsing/degradation paths and the no-endpoint short-circuit."""
+
+from agents.pge_eval import judge
+
+
+def test_parse_axis_valid_json():
+    out = judge._parse_axis('{"score": 0.8, "flags": ["redundant class"]}')
+    assert out["score"] == 0.8
+    assert out["flags"] == ["redundant class"]
+
+
+def test_parse_axis_embedded_in_prose():
+    out = judge._parse_axis('Here is my verdict: {"score": 1.0, "flags": []} done')
+    assert out["score"] == 1.0
+    assert out["flags"] == []
+
+
+def test_parse_axis_malformed_degrades():
+    out = judge._parse_axis("not json at all")
+    assert out["score"] is None
+    assert out["flags"]  # carries a parse-failure flag
+
+
+def test_parse_axis_null_score():
+    out = judge._parse_axis('{"score": null, "flags": ["x"]}')
+    assert out["score"] is None
+    assert out["flags"] == ["x"]
+
+
+def test_empty_axis():
+    a = judge._empty_axis("no endpoint")
+    assert a["score"] is None and a["flags"] == ["no endpoint"]
+
+
+def test_run_judge_no_endpoint_is_offline(monkeypatch):
+    # No endpoint -> short-circuit, never touches the network.
+    import agents.engine_base as eb
+
+    monkeypatch.setattr(
+        eb, "call_serving_endpoint",
+        lambda *a, **k: (_ for _ in ()).throw(AssertionError("network call")),
+    )
+    out = judge.run_judge(
+        host="", token="", endpoint_name="", ontology={"classes": []}, artifact={}
+    )
+    assert out["ontology"]["score"] is None
+    assert out["mapping"]["score"] is None
+
+
+def test_run_judge_failopen_when_endpoint_errors(monkeypatch):
+    # An endpoint that errors must degrade to empty axes, never raise.
+    import agents.engine_base as eb
+
+    def _boom(*a, **k):
+        raise RuntimeError("503 model overloaded")
+
+    monkeypatch.setattr(eb, "call_serving_endpoint", _boom)
+    out = judge.run_judge(
+        host="h", token="t", endpoint_name="ep",
+        ontology={"classes": [{"name": "A", "dataProperties": []}]},
+        artifact={"mapping_run_log": []},
+    )
+    assert out["ontology"]["score"] is None
+    assert out["mapping"]["score"] is None
diff --git a/tests/units/pge_eval/test_loaders.py b/tests/units/pge_eval/test_loaders.py
new file mode 100644
index 00000000..b60731b0
--- /dev/null
+++ b/tests/units/pge_eval/test_loaders.py
@@ -0,0 +1,65 @@
+"""Domain-agnostic live-run input loaders (used by `goals_eval.py run`)."""
+
+import json
+
+import pytest
+
+from agents.pge_eval import loaders
+
+
+def test_to_agent_shape_from_registry_shape():
+    reg = {
+        "classes": [
+            {"uri": "ex:A", "name": "A", "label": "A", "dataProperties": [{"name": "x"}]},
+        ],
+        "properties": [
+            {"uri": "ex:rel", "name": "rel", "type": "ObjectProperty", "domain": "A", "range": "A"},
+            {"uri": "ex:x", "name": "x", "type": "DatatypeProperty", "domain": "A", "range": "string"},
+        ],
+    }
+    out = loaders.to_agent_shape(reg)
+    assert [e["name"] for e in out["entities"]] == ["A"]
+    # Only the ObjectProperty becomes a relationship; domain/range resolve to URIs.
+    assert len(out["relationships"]) == 1
+    assert out["relationships"][0]["domain"] == "ex:A"
+
+
+def test_to_agent_shape_passthrough_when_already_agent_shape():
+    agent = {"entities": [{"uri": "ex:A", "name": "A"}], "relationships": []}
+    out = loaders.to_agent_shape(agent)
+    assert out["entities"][0]["name"] == "A"
+
+
+def test_load_run_inputs_registry_json_single_version(tmp_path):
+    dump = {"versions": {"7": {"ontology": {"classes": [{"uri": "ex:A", "name": "A"}], "properties": []},
+                               "metadata": {"tables": [{"name": "t", "columns": []}]}}}}
+    p = tmp_path / "dump.json"
+    p.write_text(json.dumps(dump))
+    ont, meta = loaders.load_run_inputs(registry_json=str(p))
+    assert ont["entities"][0]["name"] == "A"
+    assert meta["tables"][0]["name"] == "t"
+
+
+def test_load_run_inputs_requires_version_when_ambiguous(tmp_path):
+    dump = {"versions": {"1": {"ontology": {}, "metadata": {}}, "2": {"ontology": {}, "metadata": {}}}}
+    p = tmp_path / "dump.json"
+    p.write_text(json.dumps(dump))
+    with pytest.raises(ValueError, match="pass --version"):
+        loaders.load_run_inputs(registry_json=str(p))
+    # explicit version resolves
+    ont, meta = loaders.load_run_inputs(registry_json=str(p), version="2")
+    assert "entities" in ont
+
+
+def test_load_run_inputs_from_ontology_and_metadata_files(tmp_path):
+    op = tmp_path / "o.json"; mp = tmp_path / "m.json"
+    op.write_text(json.dumps({"entities": [{"uri": "ex:B", "name": "B"}], "relationships": []}))
+    mp.write_text(json.dumps({"tables": [{"name": "tb", "columns": []}]}))
+    ont, meta = loaders.load_run_inputs(ontology_path=str(op), metadata_path=str(mp))
+    assert ont["entities"][0]["name"] == "B"
+    assert meta["tables"][0]["name"] == "tb"
+
+
+def test_load_run_inputs_no_source_raises():
+    with pytest.raises(ValueError, match="needs an ontology source"):
+        loaders.load_run_inputs()
diff --git a/tests/units/pge_eval/test_mapping_metrics.py b/tests/units/pge_eval/test_mapping_metrics.py
new file mode 100644
index 00000000..5b35817c
--- /dev/null
+++ b/tests/units/pge_eval/test_mapping_metrics.py
@@ -0,0 +1,70 @@
+"""Stage-2 mapping metric tests (deterministic, no LLM)."""
+
+from agents.pge_eval.mapping_metrics import evaluate_mapping
+
+from tests.units.pge_eval import _fixtures as fx
+
+
+def test_clean_mapping_metrics():
+    metrics, extras = evaluate_mapping(fx.clean_artifact(), fx.clean_ontology())
+    assert metrics["entity_completeness"] == 1.0
+    assert metrics["relationship_completeness"] == 1.0
+    assert metrics["attribute_coverage"] == 1.0
+    assert metrics["dangling_target_pct_max"] == 0.0
+    assert metrics["dangling_source_pct_max"] == 0.0
+    assert metrics["id_integrity"] == 1.0
+    assert metrics["sql_exec_failures"] == 0
+    assert metrics["cross_source_band_compliance"] == 1.0
+    assert extras["band_active"] is False
+
+
+def test_dangling_target_max_picks_worst():
+    metrics, _ = evaluate_mapping(fx.artifact_with_dangling_fk(), fx.clean_ontology())
+    assert metrics["dangling_target_pct_max"] == 0.47
+
+
+def test_sql_exec_failure_counted():
+    metrics, _ = evaluate_mapping(fx.artifact_with_sql_failure(), fx.clean_ontology())
+    assert metrics["sql_exec_failures"] == 1
+
+
+def test_entity_completeness_drops_on_failure():
+    metrics, _ = evaluate_mapping(fx.artifact_with_sql_failure(), fx.clean_ontology())
+    # One of three entities failed -> expect about 2/3; only "< 1.0" is asserted.
+    assert metrics["entity_completeness"] < 1.0
+
+
+def test_id_integrity_detects_duplicates():
+    art = fx.clean_artifact()
+    art["mapping_evaluations"]["ex:Customer"]["metrics"]["distinct_id_count"] = 90
+    metrics, _ = evaluate_mapping(art, fx.clean_ontology())
+    assert metrics["id_integrity"] < 1.0
+
+
+def test_id_integrity_ignores_legitimately_empty_entity():
+    # A 0-row entity that passed SQL is id-vacuous: it must NOT drag id_integrity
+    # below 1.0 (that would RED a clean run on empty source data).
+    art = fx.clean_artifact()
+    art["mapping_evaluations"]["ex:Product"]["metrics"] = {
+        "row_count": 0, "distinct_id_count": 0, "null_id_count": 0,
+    }
+    metrics, _ = evaluate_mapping(art, fx.clean_ontology())
+    assert metrics["id_integrity"] == 1.0
+
+
+def test_attribute_coverage_partial():
+    art = fx.clean_artifact()
+    # Drop one attribute mapping from Customer (3 dp, now 2 mapped).
+    art["entity_mappings"][0]["attribute_mappings"].pop("email")
+    metrics, _ = evaluate_mapping(art, fx.clean_ontology())
+    assert metrics["attribute_coverage"] < 1.0
+
+
+def test_band_compliance_active_and_failing():
+    art = fx.clean_artifact()
+    art["mapping_evaluations"]["ex:placesOrder"]["metrics"].update(
+        {"expected_cross_source_overlap_band": [0.2, 0.4], "cross_source_overlap_pct": 0.9}
+    )
+    metrics, extras = evaluate_mapping(art, fx.clean_ontology())
+    assert extras["band_active"] is True
+    assert metrics["cross_source_band_compliance"] < 1.0
diff --git a/tests/units/pge_eval/test_no_domain_hardcoding.py b/tests/units/pge_eval/test_no_domain_hardcoding.py
new file mode 100644
index 00000000..01b3c201
--- /dev/null
+++ b/tests/units/pge_eval/test_no_domain_hardcoding.py
@@ -0,0 +1,73 @@
+"""The scorer must be usecase-agnostic: no maternity/domain identifiers.
+
+Guards the D1/D2 contract — a domain-specific token leaking into the scorer
+would bias the harness toward one usecase and reward overfitting.
+"""
+
+import re
+from pathlib import Path
+
+import pytest
+
+_ROOT = Path(__file__).resolve().parents[3]
+# Directory of the pge_eval package + the CLI entry point = "the scorer".
+_SCORER_PKG = _ROOT / "src" / "agents" / "pge_eval"
+_CLI = _ROOT / "scripts" / "goals_eval.py"
+
+# The PGE generator/evaluator PROMPTS that run for EVERY domain. The scorer is
+# the headline agnostic surface, but these prompts bias generation for all
+# domains, so they must stay domain-neutral too (illustrative examples only).
+_PROMPT_FILES = [
+    _ROOT / "src" / "agents" / "agent_owl_generator" / "engine.py",
+    _ROOT / "src" / "agents" / "agent_mapping_pge" / "generators" / "entity.py",
+    _ROOT / "src" / "agents" / "agent_mapping_pge" / "planner.py",
+]
+
+# Domain tokens that must never appear. Broadened well past the original
+# (trust_a/b/c, preg, maternity, nhs) to catch disguised NHS/CDM/SPR coupling.
+# Generic English words (patient/delivery/order) are deliberately excluded —
+# they collide with legitimate prose (e.g. "OWL delivery").
+_FORBIDDEN = [
+    r"trust_a",
+    r"trust_b",
+    r"trust_c",
+    r"\bpreg\b",
+    r"-preg-",
+    r"pregnancy",
+    r"maternity",
+    r"\bnhs\b",
+    r"\bspr\b",
+    r"caesar",
+    r"gestation",
+    r"antenatal",
+    r"postnatal",
+    r"apgar",
+    r"\bmother\b",
+]
+
+
+def _scorer_files():
+    """Return the scorer's source files: package modules (sorted) then the CLI."""
+    # sorted(): Path.glob order is filesystem-dependent; keep test ids stable.
+    return [*sorted(_SCORER_PKG.glob("*.py")), _CLI]
+
+
+@pytest.mark.parametrize("path", _scorer_files(), ids=lambda p: p.name)
+def test_no_domain_token_in_scorer_file(path):
+    text = path.read_text(encoding="utf-8").lower()
+    for pattern in _FORBIDDEN:
+        assert re.search(pattern, text) is None, (
+            f"domain token /{pattern}/ found in scorer file {path.name}"
+        )
+
+
+@pytest.mark.parametrize("path", _PROMPT_FILES, ids=lambda p: p.name)
+def test_no_domain_token_in_generator_prompt(path):
+    """PGE generation/evaluation prompts must stay usecase-agnostic — concrete
+    NHS/CDM/maternity vocabulary in a system prompt biases EVERY domain's run."""
+    text = path.read_text(encoding="utf-8").lower()
+    for pattern in _FORBIDDEN:
+        assert re.search(pattern, text) is None, (
+            f"domain token /{pattern}/ found in generator prompt {path.name} — "
+            "use a domain-neutral example instead"
+        )
diff --git a/tests/units/pge_eval/test_ontology_metrics.py b/tests/units/pge_eval/test_ontology_metrics.py
new file mode 100644
index 00000000..640a61ec
--- /dev/null
+++ b/tests/units/pge_eval/test_ontology_metrics.py
@@ -0,0 +1,83 @@
+"""Stage-1 ontology metric tests (deterministic, no LLM)."""
+
+import pytest
+
+from agents.pge_eval.ontology_metrics import evaluate_ontology
+from agents.pge_eval.normalize import is_surrogate_or_audit, normalize_name
+
+from tests.units.pge_eval import _fixtures as fx
+
+
+def test_clean_ontology_all_absolute_zero():
+    metrics, issues, _ = evaluate_ontology(fx.clean_ontology(), fx.clean_metadata())
+    assert metrics["orphan_class_count"] == 0
+    assert metrics["dangling_domain_range_count"] == 0
+    assert metrics["naming_violation_count"] == 0
+    assert metrics["duplicate_class_count"] == 0
+    assert metrics["table_footprint_coverage"] == 1.0
+    assert metrics["column_footprint_coverage"] >= 0.9
+
+
+def test_orphan_class_detected():
+    metrics, issues, _ = evaluate_ontology(fx.ontology_with_orphan(), fx.clean_metadata())
+    assert metrics["orphan_class_count"] == 1
+    assert any(i["check"] == "orphan_class_count" for i in issues)
+
+
+def test_dangling_range_detected():
+    metrics, issues, _ = evaluate_ontology(
+        fx.ontology_with_dangling_range(), fx.clean_metadata()
+    )
+    assert metrics["dangling_domain_range_count"] == 1
+    assert any(i["check"] == "dangling_domain_range_count" for i in issues)
+
+
+def test_naming_violation_detected():
+    metrics, _, _ = evaluate_ontology(
+        fx.ontology_with_naming_violation(), fx.clean_metadata()
+    )
+    assert metrics["naming_violation_count"] >= 1
+
+
+def test_duplicate_class_detected():
+    metrics, _, _ = evaluate_ontology(
+        fx.ontology_with_duplicate_class(), fx.clean_metadata()
+    )
+    assert metrics["duplicate_class_count"] == 1
+
+
+def test_table_coverage_drops_with_unmodelled_table():
+    meta = fx.clean_metadata()
+    meta["tables"].append({"name": "shipments", "columns": [{"name": "carrier"}]})
+    metrics, issues, _ = evaluate_ontology(fx.clean_ontology(), meta)
+    assert metrics["table_footprint_coverage"] < 1.0
+    assert any(
+        i["check"] == "table_footprint_coverage" for i in issues
+    )
+
+
+def test_surrogate_and_audit_columns_excluded():
+    assert is_surrogate_or_audit("id")
+    assert is_surrogate_or_audit("created_at")
+    assert is_surrogate_or_audit("customer_sk")
+    assert is_surrogate_or_audit("etl_load_ts")
+    assert not is_surrogate_or_audit("first_name")
+    assert not is_surrogate_or_audit("customer_id")  # FK can be meaningful
+
+
+def test_name_normalization():
+    assert normalize_name("first_name") == normalize_name("firstName") == "firstname"
+    assert normalize_name("Order Date") == "orderdate"
+
+
+def test_registry_shape_accepted():
+    # Same ontology in registry (classes/properties) shape must score identically.
+    registry = {
+        "classes": [
+            {"uri": "ex:A", "name": "A", "dataProperties": [{"name": "x"}]},
+        ],
+        "properties": [],
+    }
+    metrics, _, _ = evaluate_ontology(registry, {"tables": []})
+    # A has a data property -> not an orphan.
+    assert metrics["orphan_class_count"] == 0
diff --git a/tests/units/pge_eval/test_owl_evaluator_stage.py b/tests/units/pge_eval/test_owl_evaluator_stage.py
new file mode 100644
index 00000000..ec23bf47
--- /dev/null
+++ b/tests/units/pge_eval/test_owl_evaluator_stage.py
@@ -0,0 +1,69 @@
+"""The owl-generator Evaluator stage (§3.5) — deterministic Stage-1 checks
+feeding retry_hints, with a bounded retry cap."""
+
+from agents.agent_owl_generator import engine as owl_engine
+
+_CLEAN_TTL = """@prefix owl: <http://www.w3.org/2002/07/owl#> .
+@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
+@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
+@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .
+@prefix : <http://test.org/o#> .
+
+<http://test.org/o> a owl:Ontology .
+
+:Customer a owl:Class ; rdfs:label "Customer" .
+:Order a owl:Class ; rdfs:label "Order" .
+
+:placesOrder a owl:ObjectProperty ; rdfs:domain :Customer ; rdfs:range :Order .
+:firstName a owl:DatatypeProperty ; rdfs:domain :Customer ; rdfs:range xsd:string .
+:orderDate a owl:DatatypeProperty ; rdfs:domain :Order ; rdfs:range xsd:string .
+"""
+
+_ORPHAN_TTL = _CLEAN_TTL + """
+:Ghost a owl:Class ; rdfs:label "Ghost" .
+"""
+
+
+def test_clean_ontology_returns_no_retry_hint():
+    assert owl_engine._evaluate_ontology_stage(_CLEAN_TTL, {}, 1) is None
+
+
+def test_orphan_class_yields_retry_hint():
+    hint = owl_engine._evaluate_ontology_stage(_ORPHAN_TTL, {}, 1)
+    assert hint is not None
+    assert "Ghost" in hint
+    assert "orphan" in hint.lower()
+
+
+_PROSE_PREFIXED = (
+    "No database tables are available. I have what I need from the guidelines.\n\n"
+    + _ORPHAN_TTL
+)
+
+_FENCED = "```turtle\n" + _ORPHAN_TTL + "```"
+
+
+def test_prose_preamble_is_stripped_before_parsing():
+    # Regression: the model sometimes prepends a sentence before @prefix. The
+    # evaluator must clean it (like the downstream registry) and still run,
+    # not skip. Found via a live Chrome DevTools generation run.
+    hint = owl_engine._evaluate_ontology_stage(_PROSE_PREFIXED, {}, 1)
+    assert hint is not None
+    assert "Ghost" in hint
+
+
+def test_markdown_fenced_turtle_is_parsed():
+    hint = owl_engine._evaluate_ontology_stage(_FENCED, {}, 1)
+    assert hint is not None
+    assert "Ghost" in hint
+
+
+def test_parse_error_fails_open():
+    # Garbage in -> None (never blocks OWL delivery).
+    assert owl_engine._evaluate_ontology_stage("not turtle at all {{{", {}, 1) is None
+
+
+def test_evaluator_loop_is_bounded():
+    # The Evaluator retry cap must exist and stay small: >= 1 round, < 10 rounds.
+    cap = owl_engine.MAX_OWL_EVAL_ROUNDS
+    assert 1 <= cap < 10
diff --git a/tests/units/pge_eval/test_pipeline_metrics.py b/tests/units/pge_eval/test_pipeline_metrics.py
new file mode 100644
index 00000000..167c641d
--- /dev/null
+++ b/tests/units/pge_eval/test_pipeline_metrics.py
@@ -0,0 +1,39 @@
+"""Pipeline-level metric tests (coverage_loss + convergence)."""
+
+from agents.pge_eval.mapping_metrics import evaluate_mapping
+from agents.pge_eval.ontology_metrics import evaluate_ontology
+from agents.pge_eval.pipeline_metrics import evaluate_pipeline
+
+from tests.units.pge_eval import _fixtures as fx
+
+
+def _footprint_and_mapped(artifact, ontology, metadata):
+    _, _, footprint = evaluate_ontology(ontology, metadata)
+    _, extras = evaluate_mapping(artifact, ontology)
+    return footprint, extras["mapped_cols"]
+
+
+def test_coverage_loss_zero_when_all_surfaced_cols_mapped():
+    art = fx.clean_artifact()
+    fp, mapped = _footprint_and_mapped(art, fx.clean_ontology(), fx.clean_metadata())
+    pipeline = evaluate_pipeline(art, fp, mapped)
+    assert pipeline["coverage_loss"] == 0
+
+
+def test_coverage_loss_positive_when_mapping_drops_a_column():
+    art = fx.clean_artifact()
+    # Ontology surfaces email, but the mapping never binds it.
+    art["entity_mappings"][0]["attribute_mappings"].pop("email")
+    fp, mapped = _footprint_and_mapped(art, fx.clean_ontology(), fx.clean_metadata())
+    pipeline = evaluate_pipeline(art, fp, mapped)
+    assert pipeline["coverage_loss"] >= 1
+
+
+def test_convergence_fields_present():
+    art = fx.clean_artifact()
+    fp, mapped = _footprint_and_mapped(art, fx.clean_ontology(), fx.clean_metadata())
+    conv = evaluate_pipeline(art, fp, mapped)["convergence"]
+    assert conv["mean_generator_attempts"] == 1.0
+    assert conv["planner_reinvocations"] == 0
+    assert conv["total_tokens"] == 1400
+    assert conv["wall_clock_s"] == 42.5
diff --git a/tests/units/pge_eval/test_scorecard_verdict.py b/tests/units/pge_eval/test_scorecard_verdict.py
new file mode 100644
index 00000000..5e92fa6d
--- /dev/null
+++ b/tests/units/pge_eval/test_scorecard_verdict.py
@@ -0,0 +1,75 @@
+"""End-to-end scorecard verdict + exit-code tests (§3.6)."""
+
+import pytest
+
+from agents.pge_eval.scorecard import score_artifact
+
+from tests.units.pge_eval import _fixtures as fx
+
+
+def _score(artifact, **kw):
+    """Call score_artifact with fixed test defaults; explicit *kw* values win."""
+    defaults = {"no_judge": True, "use_baseline": False, "run_id": "t"}
+    defaults["timestamp"] = "2026-06-10T00:00:00Z"
+    options = {**defaults, **kw}
+    return score_artifact(artifact, **options)
+
+
+def test_clean_artifact_is_green_exit_zero():
+    sc = _score(fx.clean_artifact())
+    assert sc["verdict"] == "GREEN"
+    assert sc["exit_code"] == 0
+    # All Stage-1, Stage-2, pipeline metrics populated.
+    assert set(sc["stages"]) == {"ontology", "mapping", "pipeline"}
+    assert sc["stages"]["ontology"]["metrics"]["orphan_class_count"] == 0
+    assert sc["stages"]["mapping"]["metrics"]["id_integrity"] == 1.0
+    assert "coverage_loss" in sc["stages"]["pipeline"]
+
+
+def test_dangling_fk_artifact_is_red_exit_nonzero():
+    sc = _score(fx.artifact_with_dangling_fk())
+    assert sc["verdict"] == "RED"
+    assert sc["exit_code"] != 0
+    assert any(
+        f["metric"] == "mapping.dangling_target_pct_max"
+        for f in sc["gates"]["tier1_absolute"]["failures"]
+    )
+
+
+def test_orphan_class_artifact_is_red():
+    sc = _score(fx.artifact_with_orphan_class())
+    assert sc["verdict"] == "RED"
+    assert any(
+        f["metric"] == "ontology.orphan_class_count"
+        for f in sc["gates"]["tier1_absolute"]["failures"]
+    )
+
+
+def test_sql_failure_artifact_is_red():
+    sc = _score(fx.artifact_with_sql_failure())
+    assert sc["verdict"] == "RED"
+    assert any(
+        f["metric"] == "mapping.sql_exec_failures"
+        for f in sc["gates"]["tier1_absolute"]["failures"]
+    )
+
+
+def test_schema_version_and_digests_present():
+    sc = _score(fx.clean_artifact())
+    assert sc["schema_version"] == "1.0"
+    assert sc["inputs"]["source_metadata_digest"]
+    assert sc["inputs"]["ontology_digest"]
+    assert sc["inputs"]["endpoint"] is None  # no_judge
+
+
+def test_no_judge_makes_no_network_call(monkeypatch):
+    """--no-judge must perform ZERO calls to the serving endpoint."""
+    import agents.engine_base as eb
+
+    def _boom(*a, **k):
+        raise AssertionError("network call made despite --no-judge")
+
+    monkeypatch.setattr(eb, "call_serving_endpoint", _boom)
+    sc = _score(fx.clean_artifact(), no_judge=True)
+    assert sc["stages"]["ontology"]["judge"]["score"] is None
+    assert sc["stages"]["mapping"]["judge"]["score"] is None