diff --git a/.gitignore b/.gitignore index a6e56d75..d69b09a9 100644 --- a/.gitignore +++ b/.gitignore @@ -80,10 +80,11 @@ mlartifacts/ # Cookie jars / curl session dumps (may contain live session + CSRF tokens) cookies*.txt *.cookiejar -tests/e2e/_e2e_server.log +# Local git worktrees for isolated feature work +.worktrees/ + +tests/e2e/_e2e_server.log .claude/ .understand-anything/ - - diff --git a/README.md b/README.md index c4f2f99d..262e601e 100644 --- a/README.md +++ b/README.md @@ -256,6 +256,39 @@ require installing the optional extra: uv sync --extra pitfalls ``` +### PGE Intrinsic Evaluation (quality scorecard) + +The PGE pipeline (ontology + mapping generation) is scored by a +**usecase-agnostic, gold-free** scorecard (`src/agents/pge_eval/`). It uses +intrinsic structural/self-consistency metrics — table/column footprint +coverage, orphan classes, dangling domain/range, naming/duplicate hygiene, +mapping completeness, id-integrity, dangling-FK fractions — across three gate +tiers (absolute / ratio / self-baseline regression), plus an **advisory** +LLM-judge that never gates. No domain reference answer is encoded, so it works +for any domain. + +It runs in two places: + +- **In-app** — after you generate an ontology or mappings, a scorecard is + attached to the task result (`pge_scorecard`) and the verdict is shown in the + completion message (e.g. `… · quality GREEN`). Deterministic, no extra LLM. +- **CLI** — `scripts/goals_eval.py`: + + ```bash + # Score a captured artifact (offline, deterministic; --no-judge = zero network) + .venv/bin/python scripts/goals_eval.py score ARTIFACT [--no-judge] [--gate-ratios] + + # Run the pipeline live for ANY domain, then score it (domain-agnostic) + .venv/bin/python scripts/goals_eval.py run --registry-json REGISTRY_JSON [--version V] + .venv/bin/python scripts/goals_eval.py run --ontology ONTOLOGY --metadata METADATA + ``` + + Exit code is the verdict: `0` GREEN, non-zero RED.
+ +The OWL generator also gains an **Evaluator stage**: the Stage-1 deterministic +checks run in its generation loop and feed retry-hints back (a real PGE loop), +bounded by a retry cap. + ### Documentation Full documentation is available in [`docs/`](docs/README.md). For a comprehensive feature list and architecture details, see [INFO.md](docs/INFO.md). diff --git a/changelogs/v0.3.1/2026-05-28.log b/changelogs/v0.3.1/2026-05-28.log new file mode 100644 index 00000000..01b97cbb --- /dev/null +++ b/changelogs/v0.3.1/2026-05-28.log @@ -0,0 +1,371 @@ +# 2026-05-28 — v0.3.1 + +## feat(mapping-pge): Sprint 1 — scaffolding + contracts + deterministic evaluator + +Context: First sprint of the Planner -> Generator -> Evaluator (PGE) +redesign that will replace the single-loop ReAct +`agents/agent_auto_assignment`. This sprint is foundation-only: no LLM +code, nothing wired into Mapping.py, the old agent untouched. + +Changes: +1. `src/agents/agent_mapping_pge/__init__.py` — package marker re-exporting + all contract dataclasses. +2. `src/agents/agent_mapping_pge/contracts.py` — typed dataclasses + (`TableRoleCandidate`, `TableRole`, `CanonicalId`, `JoinKey`, `SkipItem`, + `MappingPlan`, `SourceModel`, `EvalFailure`, `EvalReport`, `RetryState`) + with `to_dict` / `from_dict` JSON round-trip. +3. `src/agents/agent_mapping_pge/evaluator/__init__.py` — re-exports the + two public evaluator entry points. +4. `src/agents/agent_mapping_pge/evaluator/report.py` — `build_report` + helper that derives `status` from the failure list. +5. `src/agents/agent_mapping_pge/evaluator/deterministic.py` — pure-Python + stage-1 evaluator: `evaluate_entity_mapping` and + `evaluate_relationship_mapping`. Caller injects an `execute_sql_fn` + returning `{"columns": [...], "rows": [...]}` so the evaluator is + trivially testable without a Databricks connection. +6. 
`tests/agents/agent_mapping_pge/test_deterministic_evaluator.py` — 14 + tests covering entity PASS/FAIL paths (row_count zero, duplicate ids, + null ids, unmapped attribute, declared-as-unmapped) and relationship + PASS/FAIL paths (3% and 47% dangling source, >50% bubbling, zero edges, + cross-source overlap band inside/outside). +7. `tests/agents/agent_mapping_pge/test_contracts.py` — 3 round-trip + smoke tests for `SourceModel`, `EvalReport`, `RetryState`. + +Modified files: +- src/agents/agent_mapping_pge/__init__.py (new) +- src/agents/agent_mapping_pge/contracts.py (new) +- src/agents/agent_mapping_pge/evaluator/__init__.py (new) +- src/agents/agent_mapping_pge/evaluator/report.py (new) +- src/agents/agent_mapping_pge/evaluator/deterministic.py (new) +- tests/agents/__init__.py (new) +- tests/agents/agent_mapping_pge/__init__.py (new) +- tests/agents/agent_mapping_pge/test_deterministic_evaluator.py (new) +- tests/agents/agent_mapping_pge/test_contracts.py (new) +- changelogs/v0.3.1/2026-05-28.log (this file) + +Test result: 17/17 new tests pass. Full suite: 2075 passed, 3 pre-existing +failures in `test_settings_lakebase_status.py` (unrelated to this change, +already failing on master). + +Deviation from spec: when an explicit +`expected_cross_source_overlap_band` is supplied to +`evaluate_relationship_mapping`, the `dangling_target_pct < 0.05` check +is skipped. Rationale: cross-source FKs are *expected* to be partial +(that is the wedge), so a band check is the correct semantic and the +strict dangling check would always fail. Bubble-on-source-dangling and +edge-count checks still apply. + + +## chore(mapping-pge): cache seam + bubble-demotion warning + test rename + +Context: Three targeted code-quality fixes on Sprint 1 ahead of the +Sprint 7 orchestrator. Scope is narrow — no contract / dataclass +changes, no behavioural changes beyond logging. + +Changes: +1. 
`src/agents/agent_mapping_pge/evaluator/deterministic.py` — added an + opt-in `id_universe_cache: Optional[Dict[str, set]] = None` kwarg to + `evaluate_relationship_mapping` (and to the private `_distinct_id_set` + helper). When provided, source/target id universes are looked up by + the entity mapping's SQL string and stored on miss. When `None`, + behaviour is unchanged. Cache is caller-managed; no module-level + state. Lets the Sprint 7 orchestrator avoid N×2 redundant entity + universe re-fetches when many relationships share endpoint classes. +2. `src/agents/agent_mapping_pge/evaluator/report.py` — `build_report` + now emits a `logger.warning` when the caller passed + `bubble_to_planner=True` but `status` resolves to PASS. The silent + demotion behaviour itself is unchanged. +3. `tests/agents/agent_mapping_pge/test_deterministic_evaluator.py` — + renamed misnamed `test_cross_source_band_pass_inside` (which actually + asserted FAIL on overlap outside the band) to + `test_cross_source_band_fail_when_outside`; assertions unchanged. + Added `test_relationship_evaluator_uses_id_universe_cache` proving + the cache short-circuits entity-SQL execution by counting wrapped + SQL calls. Added `test_build_report_warns_when_bubble_demoted` + asserting the warning fires under PASS+bubble demotion and stays + silent otherwise (PASS+no-bubble, FAIL+bubble). + +Modified files: +- src/agents/agent_mapping_pge/evaluator/deterministic.py +- src/agents/agent_mapping_pge/evaluator/report.py +- tests/agents/agent_mapping_pge/test_deterministic_evaluator.py +- changelogs/v0.3.1/2026-05-28.log (this file) + +Test result: 22/22 pass in `tests/agents/agent_mapping_pge/` (net +2 vs +prior 20: cache test, warning test). + + +## feat(mapping-pge): Sprint 2 — planner tools + +Context: Second sprint of the Planner -> Generator -> Evaluator (PGE) +redesign. 
Adds the Planner's tool surface: four OpenAI function-calling +tools the Planner LLM (Sprint 3) will use to probe source tables and +submit a validated `SourceModel`. No LLM code in this sprint either — +just the tool defs + handlers + a new context slot. + +Changes: +1. `src/agents/tools/planner.py` (new) — four tools matching the + `mapping.py` / `sql.py` convention: + * `sample_table(full_name, n=20)` — `SELECT * ORDER BY RAND() LIMIT n`, + n capped at 100. Stringifies values for the LLM-facing surface. + * `column_value_overlap(from_table, from_column, to_table, to_column)` — + one-sided overlap `|distinct(from) ∩ distinct(to)| / |distinct(from)|` + in a single CTE-based query; returns 0.0 with a note when the + denominator is zero. + * `distinct_count(full_name, column)` — row/distinct/null counts with + `is_unique` and `is_complete` flags derived in Python. + * `submit_source_model(model)` — terminal tool: round-trips `model` + through `SourceModel.from_dict` and stores the dataclass on + `ctx.source_model`. Catches `KeyError` / `TypeError` / `ValueError` + and returns `success: False` with the error message — never raises. + Module exports `PLANNER_TOOL_DEFINITIONS` and `PLANNER_TOOL_HANDLERS` + matching the `MAPPING_TOOL_*` shape. +2. `src/agents/tools/context.py` — added a single optional field + `source_model: Optional["SourceModel"] = None` with a string-forward-ref + and a `TYPE_CHECKING` import to avoid a circular import between + `agents.tools` and `agents.agent_mapping_pge`. +3. `tests/agents/test_planner_tools.py` (new) — 16 tests against a + `FakeClient` whose `execute_query` is a per-test closure. Covers each + tool's happy path, error path, and the boundary cases called out in + the Sprint 2 spec (n-cap at 100, division-by-zero guard, structural + validation only on submit). 
+ +Modified files: +- src/agents/tools/planner.py (new) +- src/agents/tools/context.py +- tests/agents/test_planner_tools.py (new) +- changelogs/v0.3.1/2026-05-28.log (this file) + +Test result: 38/38 pass in `tests/agents/` (22 Sprint 1 + 16 new). Full +suite unaffected — same 3 pre-existing failures in +`test_settings_lakebase_status.py` as before. + +## feat(mapping-pge): Sprint 3 — planner agent + +Context: Third sprint of the PGE redesign. The Planner is the first LLM- +backed stage: a single-invocation ReAct-style agent that consumes the +ontology + table metadata + imported documents, probes the source data via +the Sprint 2 tools (sample_table, column_value_overlap, distinct_count), +and emits a validated `SourceModel` via the terminal `submit_source_model` +tool. Re-invocations are driven by the orchestrator in Sprint 7 — the +Planner itself has no internal retry loop. The auto_assignment agent and +all Sprint 1 / Sprint 2 modules are untouched. + +Changes: +1. `src/agents/agent_mapping_pge/planner.py` (new) — `run_planner()` + entry point + `PlannerResult` / `PlannerStep` dataclasses. ReAct loop + mirrors `agents/agent_auto_assignment/engine.py` (same + `call_serving_endpoint` + `dispatch_tool` cycle, same 3-second + inter-iteration delay, same usage accumulation, same `@trace_agent` + decorator). Differences vs auto_assignment: smaller default + `max_iterations=25`; NO single-shot fallback on 400/422 (the Planner + needs tools); terminates immediately after a successful + `submit_source_model` (no chatty wrap-up turn). The system prompt + frames the role as "senior data architect", enumerates each tool's + purpose, lays out the canonical SourceModel workflow, and pins the + invariants the orchestrator depends on (URI existence, ordering + constraint on relationship_order, confidence range, `kind` enum). +2. 
`tests/agents/agent_mapping_pge/test_planner.py` (new) — 6 tests + exercising the five termination conditions (single-shot submit, + multi-step ReAct then submit, submit-failure-then-retry, free-text + without terminal, iteration budget exhaustion) and the step-recording + invariants. Uses a `FakeLLM` / `CyclingFakeLLM` stub injected via + `monkeypatch.setattr(planner_mod, "call_serving_endpoint", ...)` — + no real HTTP, no Databricks, no MLflow. + +Modified files: +- src/agents/agent_mapping_pge/planner.py (new) +- tests/agents/agent_mapping_pge/test_planner.py (new) +- changelogs/v0.3.1/2026-05-28.log (this file) + +Test result: 48/48 pass in `tests/agents/` (42 Sprint 1+2 baseline + 6 +new Sprint 3 tests). No edits to out-of-scope files (engine_base.py, +tracing.py, llm_utils.py, contracts.py, evaluator/, tools/planner.py, +or agent_auto_assignment/). Same pre-existing failures elsewhere in the +full suite as before. + +## feat(mapping-pge): Sprint 4 — EntityGenerator agent + unmapped_attributes + +Context: Sprint 4 of the PGE redesign. The EntityGenerator is a narrow, +focused LLM agent that maps ONE ontology class at a time. The orchestrator +(Sprint 7) calls it per item with a filtered SourceModel slice. It does +NOT see the full ontology or full metadata — only what's relevant to the +class being mapped. Same loop machinery as the Planner (Sprint 3) with a +smaller default budget (12 vs 25) and a narrower tool set. + +Changes: +1. `src/agents/agent_mapping_pge/generators/__init__.py` (new) — empty + package marker for the Generator submodule. +2. `src/agents/agent_mapping_pge/generators/entity.py` (new) — the + `run_entity_generator` agent. Builds a 3-tool surface (`execute_sql`, + `sample_table`, `submit_entity_mapping`), a per-class user prompt + carrying the ontology class + source-model slice + optional retry hint, + and a ReAct loop terminated by `submit_entity_mapping`.
The system + prompt lifts the SQL RULES FOR ENTITIES section from + `agent_auto_assignment/engine.py` and adds the slice-consumption rules + plus the NO SILENT DROPS invariant. +3. `src/agents/tools/mapping.py` — added `unmapped_attributes` kwarg to + `tool_submit_entity_mapping` and to its OpenAI function definition. + The field accepts either `[{"name", "reason"}]` dicts (preferred) or + bare strings (fallback); it persists on the mapping dict under the + same key. ~30 lines added; no behaviour change to existing callers. +4. `tests/agents/agent_mapping_pge/test_entity_generator.py` (new) — + 7 tests covering the three termination conditions + (`test_terminates_on_submit`, `test_text_without_terminal_fails`, + `test_exhausts_iteration_budget`), the multi-step ReAct trajectory + (`test_validates_sql_then_submits`), the new field round-trip + (`test_unmapped_attributes_round_trip` — both dict and string forms), + the retry-hint surfacing (`test_retry_hint_surfaces_in_user_prompt`), + and step-recording invariants (`test_records_steps`). Uses the same + `FakeLLM` / `CyclingFakeLLM` stub pattern as `test_planner.py`. +5. `tests/agents/test_mapping_tools.py` (new) — 3 minimal tests directly + covering `tool_submit_entity_mapping`: dict-form unmapped_attributes + round-trips, string-form round-trips, default value is an empty list. + These are the first direct tests for `agents.tools.mapping`; previously + it was exercised only indirectly via the auto-mapping agent. + +Modified files: +- src/agents/agent_mapping_pge/generators/__init__.py (new) +- src/agents/agent_mapping_pge/generators/entity.py (new) +- src/agents/tools/mapping.py (added unmapped_attributes plumbing) +- tests/agents/agent_mapping_pge/test_entity_generator.py (new) +- tests/agents/test_mapping_tools.py (new) +- changelogs/v0.3.1/2026-05-28.log (this file) + +Test result: 58/58 pass in `tests/agents/` (48 Sprint 1+2+3 baseline + +7 new Sprint 4 generator tests + 3 new direct tool tests).
No edits to +out-of-scope files (engine_base.py, tracing.py, llm_utils.py, +contracts.py, evaluator/, planner.py, agent_auto_assignment/, +tools/sql.py, tools/metadata.py, tools/documents.py, tools/ontology.py, +tools/context.py, tools/planner.py). Same pre-existing failures +elsewhere in the full suite as before (3 lakebase tests, 80 e2e errors, +all unrelated and present on baseline). + + +==================================================================== +Sprint 6 — Semantic Critic + submit_evaluation tool +==================================================================== + +Context: Sprint 6 of the mapping-PGE redesign — the Semantic Critic +(Evaluator stage 2). Sibling of the deterministic evaluator: runs only +when stage 1 PASSES and audits ONE submitted mapping for semantic +correctness. Bubble-to-planner signal sharpens around the wrong-table +vs wrong-column distinction: wrong column = Generator retry, wrong +table = Planner re-invocation. + +Changes: +1. `src/agents/tools/evaluation.py` (new) — `tool_submit_evaluation` + terminal handler + `SUBMIT_EVALUATION_DEF` OpenAI function definition. + Validates `status` in {"PASS", "FAIL"} (invalid → success=False, + loop continues), synthesises a generic `semantic_audit` failure when + status=FAIL with empty failures[] so the report stays coherent, and + demotes `bubble_to_planner=True` when status=PASS (mirrors + `evaluator.report.build_report`). Stamps the resulting `EvalReport` + (stage="semantic") onto `ctx.semantic_eval_report`. Exports + `EVALUATION_TOOL_DEFINITIONS` / `EVALUATION_TOOL_HANDLERS` aggregates. +2. `src/agents/tools/context.py` — added `semantic_eval_report: + Optional["EvalReport"] = None` using the same `TYPE_CHECKING` + forward-ref pattern already used for `source_model`. 5-line touch. +3. `src/agents/agent_mapping_pge/evaluator/critic.py` (new) — the + `run_critic` agent. 
4-tool surface (`sample_table`, + `get_documents_context`, `execute_sql`, `submit_evaluation`), under-3KB + system prompt with PASS / FAIL(no-bubble) / FAIL(bubble) rubric, + default `max_iterations=6`, 3-second inter-iteration sleep, MLflow + `@trace_agent` decorator, no single-shot fallback. The user prompt + surfaces AUDIT TARGET (kind/uri/label/comment; attributes for + entities, domain/range for relationships), SUBMITTED MAPPING, + PLANNER'S PREDICTION, STRUCTURAL CHECK METRICS (PASSED), and a + YOUR TASK reminder. +4. `tests/agents/test_evaluation_tool.py` (new) — 5 direct tests + for `tool_submit_evaluation`: valid PASS round-trip, valid FAIL with + failures round-trip, invalid status rejection (no report stamped), + plus 2 export-aggregate sanity checks. +5. `tests/agents/agent_mapping_pge/test_critic.py` (new) — 11 tests + covering the full Critic loop with a FakeLLM stub: PASS verdict, + FAIL-column (no bubble), FAIL-table (bubbles), PASS+bubble demotion, + FAIL with no failures synthesises one, invalid-status non-termination + then valid retry, text-only failure, iteration-budget exhaustion, + user prompt surfaces stage1 metrics, user prompt distinguishes + entity vs relationship (domain/range lines), and step-recording + invariants. + +Modified files: +- src/agents/tools/evaluation.py (new) +- src/agents/tools/context.py (added semantic_eval_report field) +- src/agents/agent_mapping_pge/evaluator/critic.py (new) +- tests/agents/test_evaluation_tool.py (new) +- tests/agents/agent_mapping_pge/test_critic.py (new) +- changelogs/v0.3.1/2026-05-28.log (this entry) + +Test result: 83/83 pass in `tests/agents/` (67 Sprint 1–5 baseline + +11 new Critic tests + 5 new evaluation-tool tests). No edits to +out-of-scope files: contracts.py, evaluator/{deterministic,report}.py, +generators/, planner.py, engine_base.py, tracing.py, llm_utils.py, +tools/{sql,metadata,documents,ontology,mapping,planner}.py, +agent_auto_assignment/. 
+ +## feat(mapping-pge): Sprint 8 — wire PGE engine into Mapping.py + remove legacy agent + +Context: Final sprint of the Planner -> Generator -> Evaluator redesign. +Sprints 1-7 built `agents/agent_mapping_pge/` with a drop-in `run_agent` +matching the legacy `agent_auto_assignment` signature. This sprint flips +the switch: `Mapping.py` and `AgentClient.py` now import from the new +engine, the PGE-specific extras (`source_model`, `mapping_evaluations`, +`mapping_run_log`) are persisted on the session, and the legacy package +is deleted. + +Changes: +1. `src/back/objects/mapping/Mapping.py` — switched the + `auto_assign_with_agent` import to `agents.agent_mapping_pge`. Updated + the TYPE_CHECKING alias likewise. Accumulated the new PGE-extra + fields across chunks in `run_auto_assign_task` and passed them + through `save_mappings_to_session` (single-item flow does the same + in `run_single_auto_assign_task`). +2. `src/back/objects/mapping/Mapping.py::save_mappings_to_session` — + extended the signature with three optional kwargs (`source_model`, + `mapping_evaluations`, `mapping_run_log`) and persisted them as + siblings of `entities`/`relationships` under the session + `assignment` bucket. `mapping_evaluations` merges by item key, + `mapping_run_log` appends; both stay backwards-compatible (None -> + no-op) so existing callers (and the R2RML parser) are unaffected. +3. `src/back/core/agents/AgentClient.py` — switched + `run_auto_assignment` to import the new engine; updated the + TYPE_CHECKING alias and docstring. +4. `src/agents/agent_auto_assignment/` — deleted (`__init__.py`, + `engine.py`, `tools.py`). The PGE pipeline is now the only mapping + agent in the repo. +5. Scrubbed stale "agent_auto_assignment" mentions from `agent_mapping_pge` + docstrings/comments (engine.py, planner.py, generators/entity.py, + generators/relationship.py, __init__.py) so `grep -rn + "agent_auto_assignment" src/ tests/` returns zero hits. 
+ +Modified files: +- src/back/objects/mapping/Mapping.py (import swap + PGE-extras persistence) +- src/back/core/agents/AgentClient.py (import swap) +- src/agents/agent_mapping_pge/__init__.py (docstring scrub) +- src/agents/agent_mapping_pge/engine.py (docstring scrub) +- src/agents/agent_mapping_pge/planner.py (docstring + comment scrub) +- src/agents/agent_mapping_pge/generators/entity.py (comment scrub) +- src/agents/agent_mapping_pge/generators/relationship.py (comment scrub) +- src/agents/agent_auto_assignment/__init__.py (deleted) +- src/agents/agent_auto_assignment/engine.py (deleted) +- src/agents/agent_auto_assignment/tools.py (deleted) +- changelogs/v0.3.1/2026-05-28.log (this entry) + +Test result: 99/99 pass in `tests/agents/` (PGE-agent suite intact). +40/40 pass in `tests/test_mapping_service.py` + +`tests/test_workflow_mapping.py`. Full suite: 2157 passed; the 3 +pre-existing `test_settings_lakebase_status.py` failures and the +Playwright `tests/e2e/` collection errors are unrelated to this +change (also failing on the prior Sprint 1-7 commits per earlier +changelog entries). + +Acceptance: +- `grep -rn "agent_auto_assignment" src/ tests/` returns zero hits. +- `from agents.agent_mapping_pge.engine import run_agent` is the new + import in `Mapping.py`. +- Three PGE extras reach the session via `assignment.source_model`, + `assignment.mapping_evaluations`, `assignment.mapping_run_log` + (durable through `session_path.write_text`). +- Public `Mapping.auto_assign_with_agent` signature unchanged. 
+ diff --git a/changelogs/v0.5.0/2026-06-02.log b/changelogs/v0.5.0/2026-06-02.log new file mode 100644 index 00000000..3581e0d3 --- /dev/null +++ b/changelogs/v0.5.0/2026-06-02.log @@ -0,0 +1,108 @@ +# v0.5.0 — 2026-06-02 + +## PGE generation quality: exhaustive attributes + value harmonization + +### Context +`newdomain` (ontobricks_pge_registry) was attribute-sparse: the OWL generator +emitted a curated ~28-property ontology subset (vs V1.1's ~53), so most classes +had empty `dataProperties` and the EntityGenerator produced ID+Label-only +entities — unfit as the KPI/analytical source of truth. Root cause is generation +QUALITY in two LLM-driven agents, not plumbing (the sync/finalize infra in +`Ontology.finalize_class_attributes` already works and runs at session load). +This change strengthens both generators so the PGE pipeline reproduces V1.1-level +coverage from source introspection. + +### Changes +1. `src/agents/agent_owl_generator/engine.py` — Part 1: exhaustive datatype-property + coverage. + - Added a `# ATTRIBUTE COVERAGE (CRITICAL — exhaustive, NOT curated)` section + to the system prompt: emit a DatatypeProperty for EVERY meaningful source + column per class across ALL covering trust tables; exclude only surrogate + keys / audit columns / FK columns; collapse cross-trust synonyms to one + property; lowerCamelCase `[a-z][A-Za-z0-9]*` names; rich clinical entities + warrant 6–11 properties; "2 props" is a floor not a target. + - Workflow now instructs `get_table_detail` on every covering table (so the + LLM sees the full column list past the 80-column get_metadata truncation). + - Softened the conflicting `GENERIC_GUIDELINES §2.1` minimalism line to defer + to the new coverage rule. +2. `src/agents/agent_mapping_pge/generators/entity.py` — Part 2: value harmonization + + regex safety. 
+ - Added a `VALUE HARMONIZATION` section: for coded attributes (method/status/ + type/mode/outcome), discover raw distinct values (`SELECT DISTINCT`) then map + to ONE canonical lowercase token set with a CASE expression aliased to the + clean attribute name, using the SAME tokens across all UNION branches + (delivery method → caesarean/instrumental/vaginal worked example). + - Added a `REGEX SAFETY` section: always `[0-9]`/`[a-z]`, NEVER `\d`/`\w`/`\s` + (the build strips a lone backslash → `\d` degrades to literal `d`). + - Workflow step 3 now references harmonization. +3. `src/agents/agent_mapping_pge/planner.py` — Part 2: multi-trust completeness. + - Added a COMPLETENESS rule to CANONICAL-KEY NORMALIZATION: include EVERY + covering trust table in `canonical_column_per_table`, not just the two + checked for overlap (omitting one drops 30–60% of instances and dangles + relationships). Planner already emits `[0-9]`-safe regex and mandates + multi-trust UNION — this reinforces full coverage during candidate discovery. +4. `tests/agents/agent_mapping_pge/test_entity_generator.py` — added + `test_system_prompt_mandates_value_harmonization` (canonical token set, + discover-before-harmonize, `[0-9]`-not-`\d` regex safety). + +### Modified files +- src/agents/agent_owl_generator/engine.py +- src/agents/agent_mapping_pge/generators/entity.py +- src/agents/agent_mapping_pge/planner.py +- tests/agents/agent_mapping_pge/test_entity_generator.py + +### Test result +`uv run --offline pytest tests/agents/agent_mapping_pge/ tests/units/ontology/test_owl_generator.py` +— 101 passed → 102 passed with the new harmonization test. No regressions. + +--- + +# v0.5.0 — 2026-06-03 + +## OWL generator output ceiling + live newdomain regeneration + +### Context +Exercising the v0.5.0 generation-quality changes end-to-end against the live +`ontobricks-pge` registry surfaced one defect and confirmed the rest. + +### Changes +1. 
`src/agents/agent_owl_generator/engine.py` — raised the LLM completion ceiling + from a hardcoded `max_tokens=4096` to a named `MAX_OUTPUT_TOKENS = 16000` + (both the main-loop and tools-unsupported fallback call sites). The exhaustive + `# ATTRIBUTE COVERAGE` prompt makes the Turtle output larger; at 4096 the final + statement was silently truncated (`finish_reason=length`) and OWL parsing failed. + Claude Opus supports large completions; 16k fits a full maternity ontology. + +### Live regeneration result (newdomain, ontobricks-pge / ontobricks_pge_registry) +Ran the real PGE pipeline against the live workspace (endpoint +`databricks-claude-opus-4-7`, warehouse `e6b70b0c07bbaa10`, +`fiifi_cdm_demo_catalog`): +- OWL generator → 17 classes, **60 datatype properties** (was 22), all KPI props. +- Mapping-PGE → 15 entities + 12 relationships, every item PASS the deterministic + evaluator, **72 attribute mappings**, multi-trust UNIONs + value harmonization, + `[0-9]`-safe canonical key, GROUP BY dedup. (ClinicalFinding / ClinicalProvider + honestly skipped — no source rows; they were ID+Label stubs in V1.1.) +- One targeted re-run of Pregnancy via a retry-hint to DERIVE `bookingGestationWeeks` + (= `DATEDIFF(lmp_date, booking_date)/7`, a computed feature with no source column). +- Overwrote `newdomain` v1 in Lakebase (rollback backup + `/tmp/newdomain_live_backup_2026-06-02.json`). + +### Independent verification (warehouse audit, round-tripped from Lakebase) +- Entity-join dangling: **0.00% across all 12 relationships** (worst case 0.00%). +- Delivery method mix vaginal 1227 / caesarean 849 / instrumental 297 (matches V1.1). +- Pregnancy outcome delivered 1658 / transferred 715 / ongoing 74; feeding_status + mixed 549 / formula 543 / breast 542; booking gestation avg 10.5 wks; Pregnancy + hub 2463 rows = 2463 distinct IDs. +- All 5 KPI columns present: deliveryMethod, outcomeStatus, bookingGestationWeeks, + contactType (=contact_type), feedingStatus (=feeding_status). 
+ +NOTE (next phase, out of scope here): the downstream `canonical.*` UC views + +metric views were built from the prior hand-copy's entity SQLs and must be +republished from the regenerated newdomain SQLs. + +### Modified files +- src/agents/agent_owl_generator/engine.py + +### Test result +`uv run --offline pytest tests/agents/agent_mapping_pge/ tests/units/ontology/test_owl_generator.py` +— 102 passed. No regressions. diff --git a/changelogs/v0.5.0/FiifiB_2026-06-16.log b/changelogs/v0.5.0/FiifiB_2026-06-16.log new file mode 100644 index 00000000..90d2ebaf --- /dev/null +++ b/changelogs/v0.5.0/FiifiB_2026-06-16.log @@ -0,0 +1,73 @@ +# v0.5.0 — 2026-06-16 (FiifiB) + +## PGE intrinsic-evaluation toolkit + ontology-gen Evaluator stage + in-app scorecard + +### Context +The PGE pipeline (ontology + mapping generation) had no measurable, gold-free +stopping condition. This adds a **usecase-agnostic intrinsic scorecard** +(`src/agents/pge_eval/`), wires it to run **in-app** after generation/mapping, +turns `agent_owl_generator` into a real PGE loop via a new Evaluator stage, and +ships a CLI. No golden/reference labels are encoded (correctness is measured by +internal consistency against the actual runtime inputs), so it works for any +domain — not just the NHS CDM/maternity demo it was developed against. + +### Changes +1. `src/agents/pge_eval/` (new package) — the scorer: + - `scorecard.py` — `score_artifact`, the single offline-testable scoring + core; emits the scorecard JSON + GREEN/RED verdict + exit code. + - `ontology_metrics.py` / `mapping_metrics.py` / `pipeline_metrics.py` — + Stage-1 (footprint coverage, orphan/dangling/naming/duplicate), Stage-2 + (entity/rel/attribute completeness, dangling-FK max, id-integrity, sql-exec + failures, conditional cross-source band), and pipeline (coverage_loss + + convergence, advisory/tracked only). + - `gates.py` — three tiers: absolute / ratio (`--gate-ratios`) / direction- + aware self-baseline regression. 
+ - `baseline.py` — Tier-3 self-baseline store in `logs/goals/` (baseline = most + recent GREEN; RED runs never baseline). + - `judge.py` — advisory LLM-judge; the only network path; never gates; skipped + under `--no-judge` (zero network). + - `normalize.py` / `loaders.py` — shape-agnostic ontology/metadata + normalisation + domain-agnostic live-run input loaders. + - `inapp.py` — fail-safe in-app hooks (deterministic, never raise). +2. `scripts/goals_eval.py` (new) — CLI with `score` / `run` subcommands and + `--no-judge` / `--gate-ratios`; `run` is domain-agnostic + (`--registry-json`/`--version` or `--ontology`/`--metadata`). Exit code = + verdict. +3. `src/agents/agent_owl_generator/engine.py` — new **Evaluator stage**: after + the (upstream) pitfall-tool loop settles, run the Stage-1 deterministic checks + and feed retry-hints back into generation, bounded by `MAX_OWL_EVAL_ROUNDS`; + cleans prose/markdown preambles before parsing; de-maternified the ATTRIBUTE + COVERAGE prompt (domain-neutral examples). +4. In-app scorecard hooks — `src/api/routers/internal/ontology.py` and + `src/back/objects/mapping/Mapping.py`: after generation/mapping, attach + `pge_scorecard` to the task result and append the verdict to the completion + message (kept alongside upstream's per-iteration generation score). +5. Generality hardening — de-maternified the mapping VALUE HARMONIZATION + (`generators/entity.py`) and planner CANONICAL-KEY NORMALIZATION + (`planner.py`) prompts; broadened `test_no_domain_hardcoding` to also scan the + generator prompt files. +6. `scripts/smoke_pge.py` — artifact dump embeds `ontology`/`metadata`/ + `elapsed_s` so score-only is self-contained. +7. Docs — Sphinx `docs/sphinx/api/agents.rst` "PGE Intrinsic Evaluation" section + (all 11 modules) + README "PGE Intrinsic Evaluation (quality scorecard)" + section. +8. 
Integrated 65 upstream `master` commits (mid-0.5.0) via merge `198f38e`; + re-landed the Evaluator stage onto upstream's new tool-driven pitfall loop. + +### Modified / added files +- src/agents/pge_eval/*.py (new: __init__, scorecard, normalize, ontology_metrics, + mapping_metrics, pipeline_metrics, gates, baseline, judge, inapp, loaders) +- scripts/goals_eval.py (new), scripts/smoke_pge.py +- src/agents/agent_owl_generator/engine.py +- src/agents/agent_mapping_pge/generators/entity.py, src/agents/agent_mapping_pge/planner.py +- src/api/routers/internal/ontology.py, src/back/objects/mapping/Mapping.py +- docs/sphinx/api/agents.rst, README.md +- tests/units/pge_eval/* (new), tests/agents/agent_mapping_pge/{test_entity_generator,test_planner}.py + +### Test result +- `pytest tests/units tests/agents` → **2528 passed, 11 skipped, 0 failed** + (post-merge with origin/master). +- Evidence captured during development: clean artifact → GREEN/exit 0; seeded + dangling-FK → RED/exit 1; Tier-3 regression → RED; `--no-judge` zero-network + verified; grep clean for trust_a/b/c, preg, maternity, NHS, spr in scorer + + generator prompts. diff --git a/changelogs/v0.5.0/FiifiB_2026-06-19.log b/changelogs/v0.5.0/FiifiB_2026-06-19.log new file mode 100644 index 00000000..19383ea0 --- /dev/null +++ b/changelogs/v0.5.0/FiifiB_2026-06-19.log @@ -0,0 +1,67 @@ +# v0.5.0 — PGE Run-Visualizer (make the mapping PGE loop demoable in the UI) + +## Context + +The mapping PGE pipeline (Planner → Generator → Evaluator → Critic) already +computes rich artifacts — the planner's `source_model`, per-item +`mapping_evaluations` (EvalReports), an attempt-by-attempt `mapping_run_log`, +and an intrinsic-eval `pge_scorecard` — but the UI surfaced almost none of it. +Auto-Map showed a progress bar, a flat per-item results table, and appended +the scorecard verdict to a toast string. 
The orchestration (planner picks, +generator attempts, evaluator/critic verdicts, retry hints, re-plan +escalations) was invisible, which made the agentic loop "hard to demo on the +OntoBricks UI." + +This change exposes the PGE loop in the Batch Auto-Map report without touching +any agent logic: a new client-side run-visualizer renders the loop from the +artifacts the engine already produces, and the two artifacts that were +persisted-to-session-only are now also surfaced on the polled task result. + +## Changes + +1. `src/back/objects/mapping/Mapping.py` — `run_auto_assign_task` (batch) and + `run_single_auto_assign_task` (single): added `source_model`, + `mapping_evaluations`, and `mapping_run_log` to the `complete_task` result + dict. These were already accumulated and persisted to the session via + `save_mappings_to_session`; they are now also returned on the polled task so + the UI can render the loop from `GET /tasks/{id}` with no extra round-trip. + `pge_scorecard` was already present (computed by `score_mapping_run`). +2. `src/front/static/mapping/js/mapping-pge-visualizer.js` — NEW. `PgeVisualizer` + module: renders the verdict pill + KPI strip (entity/relationship + completeness, id-integrity, sql-exec failures, coverage-loss), the three + gate tiers (absolute / ratio / regression), a per-item loop trace + (Generator › Evaluator › Critic chain with attempt numbers, retry hints, and + re-plan markers) enriched with per-item eval metrics, and a collapsible + planner source-model panel (table→class candidates with confidence bars, + canonical ids, join keys with overlap, planner skips). Fully defensive — any + field may be missing; renders nothing if there is no PGE payload. +3. `src/front/static/mapping/css/mapping-pge-visualizer.css` — NEW. Component + styles (ob-pge-* classes) for the card, KPI strip, gate chips, loop-trace + timeline, and source-model tables. Bootstrap 5.3 for everything else. +4. 
`src/front/static/mapping/js/mapping-autoassign.js` — stash `this.taskResult` + in both completion paths (live monitor + resumed task) and call + `PgeVisualizer.render(...)` from `showReport()`; clear it in `reset()`. +5. `src/front/templates/partials/mapping/_mapping_autoassign.html` — added the + `#autoAssignPgeVisualizer` mount container inside the report section. +6. `src/front/templates/mapping.html` — included the new CSS + JS assets. + +## Modified / new files + +- src/back/objects/mapping/Mapping.py (modified) +- src/front/static/mapping/js/mapping-pge-visualizer.js (new) +- src/front/static/mapping/css/mapping-pge-visualizer.css (new) +- src/front/static/mapping/js/mapping-autoassign.js (modified) +- src/front/templates/partials/mapping/_mapping_autoassign.html (modified) +- src/front/templates/mapping.html (modified) + +## Tests + +- `pytest tests/agents/agent_mapping_pge/` — 82 passed, 1 warning (no regression + from the task-result additions; the engine/contracts are untouched). +- `node --check` passes on both modified/new JS modules. +- `python -m py_compile` passes on Mapping.py. +- Visual verification: rendered the visualizer against a realistic synthetic + task result via headless Chrome (file:// harness). Confirmed: verdict pill, + KPI strip, three gate tiers, per-item loop chains (incl. a 2-attempt + null-id-hint retry and a re-plan escalation), per-item eval metrics, and the + planner source-model panel all render correctly. Screenshots captured. diff --git a/changelogs/v0.5.0/FiifiB_2026-06-20.log b/changelogs/v0.5.0/FiifiB_2026-06-20.log new file mode 100644 index 00000000..ccb59797 --- /dev/null +++ b/changelogs/v0.5.0/FiifiB_2026-06-20.log @@ -0,0 +1,42 @@ +# v0.5.0 — PGE Run-Visualizer: reasoning-wrap fix + live verification + +## Context + +Continuation of the PGE run-visualizer (see FiifiB_2026-06-19.log). Verified the +visualizer end-to-end with a real, live PGE auto-map run on the deployed app +(ontobricks-pgeviz). 
The live run surfaced one rendering issue not present in the +synthetic-data tests: the semantic critic's `reasoning` (carried in +`EvalReport.metrics.reasoning`) is long free text that rendered inline and +overflowed the viewport. + +## Changes + +1. `src/front/static/mapping/js/mapping-pge-visualizer.js` — `renderItem()`: + split per-item eval metrics into short scalar metrics (rendered inline as + before) and long free-text fields >60 chars such as `reasoning` (now rendered + as a wrapped `.ob-pge-reasoning` block instead of a nowrap inline span). +2. `src/front/static/mapping/css/mapping-pge-visualizer.css` — added + `.ob-pge-reasoning` (white-space: normal, overflow-wrap: anywhere) so long + critic reasoning wraps cleanly. + +## Modified files + +- src/front/static/mapping/js/mapping-pge-visualizer.js +- src/front/static/mapping/css/mapping-pge-visualizer.css + +## Tests / verification + +- `node --check` passes on the visualizer JS. +- LIVE end-to-end verification on deployed app `ontobricks-pgeviz` + (https://ontobricks-pgeviz-7474646666236453.aws.databricksapps.com): + loaded trust_a.maternity_episode metadata → applied a maternity ontology + (MaternityEpisode, Baby) → ran a real PGE auto-assign → both entities PASS in + 1 attempt. The deployed PgeVisualizer rendered the real run: stage chips, + RED verdict, KPI strip (entity/rel/id-integrity 100%, 0 sql failures, + 0% coverage-loss), gate tiers (Tier-1 absolute FAILED on orphan_class_count=2, + Tier-2/3 pass), per-item Generator→Evaluator→Critic chains with full wrapped + critic reasoning, per-item metrics, and the planner source-model (candidate + confidences 98%/90%, canonical-id analysis with real row counts 1228/776). + Zero console errors. Screenshot captured. +- Deployment used the dev-lakebase target against a dedicated Lakebase project + (ontobricks-pgeviz-db) for full isolation from shared production. 
diff --git a/docs/INFO.md b/docs/INFO.md index 1b68e3cb..06ba8196 100644 --- a/docs/INFO.md +++ b/docs/INFO.md @@ -323,7 +323,7 @@ src/ │ ├── llm_utils.py # Shared LLM call with retry │ ├── tools/ # Shared agent tools (context, metadata, SQL, …) │ ├── agent_owl_generator/ # OWL ontology generation agent -│ ├── agent_auto_assignment/ # Entity/relationship → SQL mapping agent +│ ├── agent_mapping_pge/ # Mapping PGE pipeline (Planner → Generator → Evaluator) │ ├── agent_auto_icon_assign/ # Emoji icon mapping agent │ └── agent_ontology_assistant/# Conversational assistant + ResponsesAgent wrapper │ diff --git a/docs/architecture.md b/docs/architecture.md index 5a87799a..5fc9baf0 100644 --- a/docs/architecture.md +++ b/docs/architecture.md @@ -20,7 +20,7 @@ Under the hood, SPARQL translates ontology mappings into Spark SQL — users nev | **User Interface** | Bootstrap 5.3 + OntoViz visual editor + Sigma.js / D3.js graph views | | **MCP Server** | Separate Databricks App (`mcp-ontobricks`) exposing knowledge-graph tools to LLM clients (Cursor, Claude Desktop, Playground) | | **FastAPI Application** | Routes → Domain Objects → Core layered architecture with GlobalConfigService, PermissionService, and BuildScheduler | -| **LLM Agents** | MLflow-traced agentic loops for ontology generation, auto-mapping, icon mapping, and conversational assistance | +| **LLM Agents** | MLflow-traced agentic loops for ontology generation, mapping (Planner/Generator/Evaluator), icon mapping, and conversational assistance | | **Reasoning Engine** | OWL 2 RL deductive closure, SWRL rules (compiled to SQL), graph reasoning, and constraint validation | | **Triple Store Backends** | Delta-backed view in Unity Catalog plus a pluggable Graph DB engine (currently Lakebase Postgres) via the `GraphDBFactory` pattern, with BFS, shortest path, and transitive closure built in | | **Databricks Platform** | Unity Catalog (metadata & governance), SQL Warehouse (query execution), UC Volumes (shared storage) | 
@@ -448,7 +448,7 @@ src/ │ ├── tracing.py # MLflow tracing setup & decorators │ ├── tools/ # Shared agent tools (ontology, mapping, metadata, SQL, etc.) │ ├── agent_owl_generator/ # OWL ontology generation agent -│ ├── agent_auto_assignment/ # Entity/relationship → SQL mapping agent +│ ├── agent_mapping_pge/ # Mapping PGE pipeline (Planner → Generator → Evaluator) │ ├── agent_auto_icon_assign/ # Emoji icon mapping agent │ └── agent_ontology_assistant/ # Conversational assistant + ResponsesAgent wrapper │ @@ -1512,36 +1512,41 @@ In addition to the UI-driven agents, OntoBricks provides an **MCP server** (`mcp --- -#### 2. Auto-Mapping Agent (`agent_auto_assignment`) +#### 2. Mapping PGE Pipeline (`agent_mapping_pge`) -**Purpose**: Autonomously map ontology entities and relationships to SQL queries against the domain's Databricks tables. The agent writes SQL, validates it by executing queries, and submits the finalized mappings. +**Purpose**: Autonomously map ontology entities and relationships to SQL queries against the domain's Databricks tables. The pipeline replaces the legacy single-loop agent with a Planner → Generator → Evaluator (PGE) decomposition: a global planner produces a typed `SourceModel`, narrow per-item generators emit SQL against that plan, and a two-stage evaluator gates every mapping with deterministic checks plus a semantic critic. -| Parameter | Value | -|-----------|-------| -| Max iterations | 60 (batch) / 15 (single-item) | -| LLM timeout | 180s | -| Max tokens | 2048 | -| Temperature | 0.1 | -| Iteration delay | 3s between LLM calls | -| Chunk size | 5 items per agent run (`AUTO_ASSIGN_CHUNK_SIZE`) | -| Chunk cooldown | 15s between chunks (`AUTO_ASSIGN_CHUNK_COOLDOWN`) | +**Components**: -**Workflow**: -1. Calls `get_ontology` to see entities, relationships, and their attributes -2. Calls `get_metadata` to understand available tables and columns -3. 
For each entity/relationship: - - Writes a SQL query using `execute_sql` to validate it - - Iterates on SQL errors until the query succeeds - - Calls `submit_entity_mapping` or `submit_relationship_mapping` to finalize -4. Repeats until all items are mapped or iteration limit is reached +| Component | Module | Role | +|-----------|--------|------| +| **Planner** | `agent_mapping_pge/planner.py::run_planner` | Single-invocation agent. Consumes ontology + table metadata + imported documents, probes data with `sample_table` / `column_value_overlap` / `distinct_count`, and emits a validated `SourceModel` (per-table role candidates, canonical-ID map, intra-trust and cross-source join keys, ordered mapping plan). | +| **EntityGenerator** | `agent_mapping_pge/generators/entity.py::run_entity_generator` | Narrow agent that maps ONE ontology class given a filtered `SourceModel` slice. Emits SQL with `AS ID` aliasing the canonical ID column; populates `attribute_mappings` or explicit `unmapped_attributes` (no silent drops). | +| **RelationshipGenerator** | `agent_mapping_pge/generators/relationship.py::run_relationship_generator` | Sibling generator for ontology properties. Constrains endpoint columns to the IDs the source/target entities were already mapped on. | +| **Deterministic Evaluator** | `agent_mapping_pge/evaluator/deterministic.py` | Pure-Python Stage 1 checks: `row_count`, distinct IDs, null IDs, dangling source/target percentages, cross-source overlap band. Fast and reproducible — gates Stage 2. | +| **Semantic Critic** | `agent_mapping_pge/evaluator/critic.py` | LLM agent that runs ONLY when Stage 1 passes. Audits semantic correctness with `sample_table`, `execute_sql`, `get_documents_context`. Submits a verdict via the `submit_evaluation` terminal tool. | +| **Orchestrator** | `agent_mapping_pge/engine.py::run_agent` | Drop-in replacement for the legacy `run_agent`. 
Persists the `SourceModel`, per-item `EvalReport`s, and a `mapping_run_log` on the session via `Mapping.save_mappings_to_session`. | +| **Contracts** | `agent_mapping_pge/contracts.py` | Pydantic models: `SourceModel`, `EntityMappingDraft`, `RelationshipMappingDraft`, `EvalReport`. | -**Tools used**: `get_ontology`, `get_metadata`, `execute_sql`, `submit_entity_mapping`, `submit_relationship_mapping` +**Per-item loop**: +1. Generator emits a draft mapping against the Planner's slice for the target class/property. +2. Deterministic Evaluator runs Stage 1 checks. On failure → return hints to the Generator. +3. If Stage 1 passes, Semantic Critic audits the draft and returns a verdict via `submit_evaluation`. +4. Up to 3 Generator → Evaluator attempts per item with hint-driven retry. +5. Persistent semantic or structural failure → bubble up to Planner; the orchestrator triggers a global replan (max 2 replans across the run). + +**Tools used**: +- Planner: `sample_table`, `column_value_overlap`, `distinct_count`, `submit_source_model`, `get_metadata`, `get_documents_context` +- Generators: `get_ontology`, `get_metadata`, `execute_sql`, `submit_entity_mapping`, `submit_relationship_mapping` +- Critic: `sample_table`, `execute_sql`, `get_documents_context`, `submit_evaluation` **Invoked by**: -- **Batch**: `POST /mapping/auto-assign/start` → background thread → TaskManager. Large jobs are split into chunks of `AUTO_ASSIGN_CHUNK_SIZE` items; each chunk runs its own agent loop with a `AUTO_ASSIGN_CHUNK_COOLDOWN` pause between chunks to avoid LLM rate limits (429 errors). Partial results accumulate across chunks. -- **Single-item**: `POST /mapping/auto-assign/single` → background thread → TaskManager (processes one entity or relationship) +- **Batch**: `POST /mapping/auto-assign/start` → background thread → TaskManager. +- **Single-item**: `POST /mapping/auto-assign/single` → background thread → TaskManager (processes one entity or relationship). 
+ +The public `Mapping.auto_assign_with_agent` API and the `on_step(msg, pct)` progress callback are unchanged — this is a transparent under-the-hood swap. New persisted artifacts (`source_model`, `mapping_evaluations`, `mapping_run_log`) are written to the session and also returned on the polled task result, where the Batch Auto-Map report's PGE run-visualizer renders them. -**Single-item mode**: The same agent engine is used with `max_iterations=15`. The ontology payload is scoped to the single target item. The frontend fires the request, polls `/tasks/{id}`, and saves the result directly to `MappingState.config` by URI — enabling concurrent auto-maps on different items. +**Single-item mode**: The orchestrator scopes the Planner's `SourceModel` to the target class/property and runs the same Generator → Evaluator loop. The frontend fires the request, polls `/tasks/{id}`, and saves the result directly to `MappingState.config` by URI — enabling concurrent maps on different items. --- @@ -1608,10 +1613,15 @@ All tools live in `src/agents/tools/` and follow a consistent pattern: | `get_table_detail` | `metadata.py` | Returns detailed schema for a specific table | OWL Generator | | `list_documents` | `documents.py` | Lists uploaded domain documents from Unity Catalog | OWL Generator | | `read_document` | `documents.py` | Reads content of a specific document | OWL Generator | -| `get_ontology` | `ontology.py` | Returns current ontology (entities, relationships, attributes) | Auto-Mapping, Icon Mapping, Ontology Assistant | -| `execute_sql` | `sql.py` | Executes a SQL query via Databricks SQL Warehouse | Auto-Mapping | -| `submit_entity_mapping` | `mapping.py` | Saves a validated entity → SQL mapping | Auto-Mapping | -| `submit_relationship_mapping` | `mapping.py` | Saves a validated relationship → SQL mapping | Auto-Mapping | +| `get_ontology` | `ontology.py` | Returns current ontology (entities, relationships, attributes) | Mapping PGE (Generators), Icon Mapping, Ontology Assistant | +| `execute_sql` | `sql.py` | Executes a SQL query via
Databricks SQL Warehouse | Mapping PGE (Generators, Critic) | +| `submit_entity_mapping` | `mapping.py` | Saves a validated entity → SQL mapping | Mapping PGE (EntityGenerator) | +| `submit_relationship_mapping` | `mapping.py` | Saves a validated relationship → SQL mapping | Mapping PGE (RelationshipGenerator) | +| `sample_table` | `planner.py` | Returns N sample rows from a table | Mapping PGE (Planner, Critic) | +| `column_value_overlap` | `planner.py` | Reports value overlap between two columns (cross-source join probe) | Mapping PGE (Planner) | +| `distinct_count` | `planner.py` | Returns the distinct-value count for a column | Mapping PGE (Planner) | +| `submit_source_model` | `planner.py` | Terminal tool — submits the Planner's validated `SourceModel` | Mapping PGE (Planner) | +| `submit_evaluation` | `evaluation.py` | Terminal tool — submits the Critic's `EvalReport` verdict | Mapping PGE (Critic) | | `assign_icons` | `icons.py` | Saves entity → emoji icon mapping | Icon Mapping | #### ToolContext @@ -1629,11 +1639,12 @@ class ToolContext: # OWL Generator uc_location: dict # Unity Catalog file location - # Auto-Mapping + # Mapping PGE (Planner, Generators, Critic) client: Any # DatabricksClient for SQL execution ontology: dict # Current ontology data entity_mappings: list # Accumulated entity mapping results relationship_mappings: list # Accumulated relationship mapping results + source_model: Any # Planner-emitted SourceModel (set after planning) # Icon Assign icon_results: dict # Accumulated icon assignments @@ -1645,7 +1656,7 @@ Each agent populates only the fields it needs; unused fields remain at their def ### Agent Engine Pattern -All three agents share the same engine structure (defined independently in each `engine.py`): +Each agent (OWL Generator, Icon Assign, Ontology Assistant, and the Planner, Generators, and Critic inside the Mapping PGE pipeline) shares the same ReAct-style engine structure (defined independently in each `engine.py`): #### Core 
Loop @@ -1674,7 +1685,7 @@ All three agents share the same engine structure (defined independently in each #### Fallback Mode -If the LLM endpoint returns HTTP 400/422 (indicating it doesn't support the `tools` parameter), the OWL Generator and Icon Assign agents automatically retry without tools, falling back to single-shot generation. The Auto-Mapping agent does not fall back because its workflow fundamentally requires tool calls (SQL execution, mapping submission). +If the LLM endpoint returns HTTP 400/422 (indicating it doesn't support the `tools` parameter), the OWL Generator and Icon Assign agents automatically retry without tools, falling back to single-shot generation. The Mapping PGE pipeline does not fall back because every stage (Planner, Generators, Critic) fundamentally requires tool calls (data probing, SQL execution, terminal submission tools). #### Task Integration diff --git a/docs/code_organization.md b/docs/code_organization.md index aa27fb3d..2fa35244 100644 --- a/docs/code_organization.md +++ b/docs/code_organization.md @@ -288,7 +288,7 @@ Examples under `src/agents/`: - **`agent_ontology_assistant`** — conversational edits to the loaded ontology (exposed via e.g. `ontology_assistant_chat` in `front/routes/ontology.py`). - **`agent_owl_generator`** — generates OWL from natural language (async task wrapper in ontology routes). - **`agent_auto_icon_assign`** — suggests emoji icons for entities. -- **`agent_auto_assignment`** — automated mapping or assignment workflows (see package for details). +- **`agent_mapping_pge`** — Mapping PGE pipeline (Planner → Generators → Evaluator) that maps ontology entities and relationships to Spark SQL. Composed of `planner.py`, `generators/{entity,relationship}.py`, and `evaluator/{deterministic,critic}.py`, orchestrated by `engine.py::run_agent`. 
Shared utilities include **`agents.engine_base`** (shared `AgentStep` data class, `call_serving_endpoint` for LLM calls, `dispatch_tool` for tool execution, `extract_message_content` for response parsing, and `accumulate_usage` for token tracking), **`agents.llm_utils`** (retry/backoff logic), **`agents.tools.context`** (`ToolContext` for domain/session-aware tool execution), and **`agents.tracing`** (initialized from app `lifespan` in `src/shared/fastapi/main.py` via `setup_tracing()`). diff --git a/docs/data-access.md b/docs/data-access.md index a70ebf2e..db9b3876 100644 --- a/docs/data-access.md +++ b/docs/data-access.md @@ -217,7 +217,7 @@ These agents do not query the triple store at runtime; they operate on the | Agent | Purpose | Tools call | Wrapper | Engine | |---|---|---|---|---| | `agent_owl_generator` | Build an OWL ontology from metadata + documents | `metadata.list_tables`, `metadata.preview_table`, `documents.read`, `ontology.write_owl` | REST + Spark SQL (samples) | `databricks-sql-connector` against UC tables, plus rdflib write | -| `agent_auto_assignment` | Map ontology entities to Spark SQL queries | `tables.list`, `tables.sample`, `mapping.write` | REST + Spark SQL (samples) | Same as above; output stored as R2RML | +| `agent_mapping_pge` | Map ontology entities and relationships to Spark SQL queries via a Planner → Generator → Evaluator pipeline | `tables.list`, `tables.sample`, `column_value_overlap`, `distinct_count`, `execute_sql`, `mapping.write`, `submit_evaluation` | REST + Spark SQL (samples + validation) | Same as above; output stored as R2RML | | `agent_auto_icon_assign` | Pick emojis for entities | Inspects ontology + metadata | REST | None — generation only | | `agent_ontology_assistant` | Conversational ontology editing | Dozens of tools mutating the in-session ontology | REST | Python ontology object model | | `agent_dtwin_chat` | Conversational graph querying | See §6 | REST + **GraphQL** + **SPARQL** | **Spark SQL** + **Cypher** 
(engine-side) | diff --git a/docs/deployment.md b/docs/deployment.md index 210ce8e5..3ab7f0f2 100644 --- a/docs/deployment.md +++ b/docs/deployment.md @@ -1299,7 +1299,7 @@ OntoBricks agents are instrumented with MLflow tracing. When deployed to Databri - `MLFLOW_TRACKING_URI=databricks` is set in `app.yaml` - Application startup in `src/shared/fastapi/main.py` calls `setup_tracing()`, which creates the `/Shared/ontobricks-agents` experiment -- Every agent call (OWL Generator, Auto-Mapping, Auto Icon Assign, Ontology Assistant) produces a span tree: +- Every agent call (OWL Generator, Mapping PGE pipeline — Planner, Generators, Critic — Auto Icon Assign, Ontology Assistant) produces a span tree: ``` AGENT (run_agent) diff --git a/docs/sphinx/api/agents.rst b/docs/sphinx/api/agents.rst index 311afa26..35450564 100644 --- a/docs/sphinx/api/agents.rst +++ b/docs/sphinx/api/agents.rst @@ -22,24 +22,59 @@ Tracing :undoc-members: :show-inheritance: -Auto Assignment Agent ---------------------- +Mapping PGE Pipeline +-------------------- -.. automodule:: agents.agent_auto_assignment +The Mapping PGE pipeline replaces the legacy single-loop auto-assignment agent +with a Planner → Generator → Evaluator decomposition: a global planner emits a +typed ``SourceModel``, narrow per-item generators produce SQL against that +plan, and a two-stage evaluator (deterministic checks + semantic critic) gates +every mapping. + +.. automodule:: agents.agent_mapping_pge :members: :undoc-members: :show-inheritance: -.. automodule:: agents.agent_auto_assignment.engine +.. automodule:: agents.agent_mapping_pge.engine :members: :undoc-members: :show-inheritance: -.. automodule:: agents.agent_auto_assignment.tools +.. automodule:: agents.agent_mapping_pge.planner + :members: + :undoc-members: + :show-inheritance: + +.. automodule:: agents.agent_mapping_pge.contracts + :members: + :undoc-members: + :show-inheritance: + +.. 
automodule:: agents.agent_mapping_pge.generators.entity + :members: + :undoc-members: + :show-inheritance: + +.. automodule:: agents.agent_mapping_pge.generators.relationship + :members: + :undoc-members: + :show-inheritance: + +.. automodule:: agents.agent_mapping_pge.evaluator.deterministic + :members: + :undoc-members: + :show-inheritance: + +.. automodule:: agents.agent_mapping_pge.evaluator.critic + :members: + :undoc-members: + :show-inheritance: + +.. automodule:: agents.agent_mapping_pge.evaluator.report :members: :undoc-members: :show-inheritance: - :exclude-members: ToolContext Auto Icon Assignment Agent -------------------------- @@ -168,6 +203,16 @@ Shared Tools :undoc-members: :show-inheritance: +.. automodule:: agents.tools.planner + :members: + :undoc-members: + :show-inheritance: + +.. automodule:: agents.tools.evaluation + :members: + :undoc-members: + :show-inheritance: + .. automodule:: agents.tools.sql :members: :undoc-members: @@ -177,3 +222,68 @@ Shared Tools :members: :undoc-members: :show-inheritance: + +PGE Intrinsic Evaluation +------------------------ + +A usecase-agnostic, gold-free scorecard for the PGE pipeline (ontology + +mapping generation). Intrinsic structural/self-consistency metrics plus an +advisory LLM-judge — no stored reference answer. The deterministic core +(``score_artifact``) ingests a captured ``AgentResult`` artifact and emits the +scorecard JSON with zero LLM calls; the in-app hooks run it live after +generation/mapping; the CLI lives in ``scripts/goals_eval.py``. + +.. automodule:: agents.pge_eval + :members: + :undoc-members: + :show-inheritance: + +.. automodule:: agents.pge_eval.scorecard + :members: + :undoc-members: + :show-inheritance: + +.. automodule:: agents.pge_eval.normalize + :members: + :undoc-members: + :show-inheritance: + +.. automodule:: agents.pge_eval.ontology_metrics + :members: + :undoc-members: + :show-inheritance: + +.. 
automodule:: agents.pge_eval.mapping_metrics + :members: + :undoc-members: + :show-inheritance: + +.. automodule:: agents.pge_eval.pipeline_metrics + :members: + :undoc-members: + :show-inheritance: + +.. automodule:: agents.pge_eval.gates + :members: + :undoc-members: + :show-inheritance: + +.. automodule:: agents.pge_eval.baseline + :members: + :undoc-members: + :show-inheritance: + +.. automodule:: agents.pge_eval.judge + :members: + :undoc-members: + :show-inheritance: + +.. automodule:: agents.pge_eval.inapp + :members: + :undoc-members: + :show-inheritance: + +.. automodule:: agents.pge_eval.loaders + :members: + :undoc-members: + :show-inheritance: diff --git a/scripts/goals_eval.py b/scripts/goals_eval.py new file mode 100644 index 00000000..f354170e --- /dev/null +++ b/scripts/goals_eval.py @@ -0,0 +1,238 @@ +"""goals_eval — OntoBricks PGE intrinsic-evaluation CLI. + +Two subcommands: + + score evaluate a captured AgentResult artifact (cheap, deterministic, + re-runnable). Consumes the JSON dumped by scripts/smoke_pge.py. + + $ .venv/bin/python scripts/goals_eval.py score artifact.json \ [--no-judge] [--gate-ratios] + + run run the mapping PGE pipeline live, dump an artifact, then score it. + A thin wrapper around score-only (D6). + + $ .venv/bin/python scripts/goals_eval.py run [--gate-ratios] \ [--no-judge] + +Flags: + --no-judge skip the advisory LLM-judge (the ONLY LLM/network path). + Deterministic metrics always run with zero LLM calls. + --gate-ratios promote Tier-2 ratio warnings to hard gates for this run. + +The process exit code is the scorecard verdict: 0 == GREEN, non-zero == RED. + +The scorecard is usecase-agnostic and uses no gold/reference labels. +""" + +import argparse +import json +import os +import sys +import time +from datetime import datetime, timezone +from pathlib import Path + +# Make ``src/`` importable without a packaged install.
+REPO_ROOT = Path(__file__).resolve().parents[1] +sys.path.insert(0, str(REPO_ROOT / "src")) + +from agents.pge_eval.baseline import DEFAULT_BASELINE_DIR, save_scorecard # noqa: E402 +from agents.pge_eval.scorecard import score_artifact # noqa: E402 + +LLM_ENDPOINT = os.environ.get("PGE_EVAL_ENDPOINT", "databricks-claude-opus-4-7") + + +def _now_ids(): + t = time.time() + dt = datetime.fromtimestamp(t, tz=timezone.utc) + # Microsecond-precise run_id so rapid successive runs never collide + # (a collision would make a run baseline against itself and skip Tier-3). + run_id = dt.strftime("%Y%m%dT%H%M%S_%f") + ts = dt.isoformat() + return run_id, ts + + +def _load_json(path: str) -> dict: + with open(path) as f: + return json.load(f) + + +def _resolve_judge_creds(args): + """Return (host, token, endpoint) for the judge, or (None, None, None). + + Only touched when the judge is enabled — keeps ``--no-judge`` offline. + """ + endpoint = args.endpoint or LLM_ENDPOINT + try: + from back.core.databricks.DatabricksClient import DatabricksClient + + client = DatabricksClient() + return client.host, client.token, endpoint + except Exception as exc: # noqa: BLE001 + print(f" (judge disabled — no Databricks credentials: {exc})", file=sys.stderr) + return None, None, None + + +def _emit(scorecard: dict, out_path: str) -> None: + text = json.dumps(scorecard, indent=2, default=str) + if out_path: + Path(out_path).parent.mkdir(parents=True, exist_ok=True) + with open(out_path, "w") as f: + f.write(text) + print(f"Scorecard written to {out_path}", file=sys.stderr) + print(text) + + +def _score_common(artifact, args, *, mode, ontology=None, metadata=None): + run_id, ts = _now_ids() + + host = token = endpoint = None + if not args.no_judge: + host, token, endpoint = _resolve_judge_creds(args) + if not host: + # No creds resolved → degrade to deterministic-only, still no net. 
+ args.no_judge = True + + scorecard = score_artifact( + artifact, + ontology=ontology, + metadata=metadata, + gate_ratios=args.gate_ratios, + no_judge=args.no_judge, + mode=mode, + run_id=run_id, + timestamp=ts, + endpoint=endpoint, + host=host, + token=token, + baseline_dir=args.baseline_dir, + use_baseline=not args.no_baseline, + ) + + if not args.no_save: + path = save_scorecard(scorecard, args.baseline_dir) + print(f" (scorecard persisted to {path})", file=sys.stderr) + + _emit(scorecard, args.out) + return scorecard + + +def cmd_score(args) -> int: + artifact = _load_json(args.artifact) + ontology = _load_json(args.ontology) if args.ontology else None + metadata = _load_json(args.metadata) if args.metadata else None + scorecard = _score_common( + artifact, args, mode="score-only", ontology=ontology, metadata=metadata + ) + return int(scorecard["exit_code"]) + + +def cmd_run(args) -> int: + """Live mode: run the mapping PGE for ANY domain, dump an artifact, score it. + + Domain-agnostic: the ontology + source metadata come from a registry export + (``--registry-json`` [+``--version``]) or plain JSON files (``--ontology`` + [+``--metadata``]) — nothing about any specific domain is hard-coded. 
+ """ + from back.core.databricks.DatabricksClient import DatabricksClient + from agents.agent_mapping_pge.engine import run_agent + from agents.pge_eval.loaders import load_run_inputs + + registry_json = args.registry_json or os.environ.get("PGE_EVAL_REGISTRY_JSON") + ontology, metadata = load_run_inputs( + registry_json=registry_json, + version=args.version, + ontology_path=args.ontology, + metadata_path=args.metadata, + ) + + client = DatabricksClient() + t0 = time.time() + result = run_agent( + host=client.host, + token=client.token, + endpoint_name=args.endpoint or LLM_ENDPOINT, + client=client, + metadata=metadata, + ontology=ontology, + documents=[], + on_step=lambda m, p: print(f" [{p:3d}%] {m}", file=sys.stderr), + skip_semantic_critic=args.no_judge, + ) + elapsed = time.time() - t0 + + artifact = { + "success": result.success, + "iterations": result.iterations, + "error": result.error, + "usage": result.usage, + "stats": result.stats, + "entity_mappings": result.entity_mappings, + "relationship_mappings": result.relationship_mappings, + "source_model": result.source_model, + "mapping_evaluations": result.mapping_evaluations, + "mapping_run_log": result.mapping_run_log, + "steps": [ + {"step_type": s.step_type, "tool_name": s.tool_name, "duration_ms": s.duration_ms} + for s in result.steps + ], + "ontology": ontology, + "metadata": metadata, + "elapsed_s": round(elapsed, 3), + } + scorecard = _score_common( + artifact, args, mode="live", ontology=ontology, metadata=metadata + ) + return int(scorecard["exit_code"]) + + +def _add_common_flags(p): + p.add_argument("--no-judge", action="store_true", + help="skip the advisory LLM-judge (no network calls)") + p.add_argument("--gate-ratios", action="store_true", + help="promote Tier-2 ratio warnings to hard gates") + p.add_argument("--endpoint", default=None, help="serving endpoint for the judge") + p.add_argument("--baseline-dir", dest="baseline_dir", default=DEFAULT_BASELINE_DIR, + help="directory for Tier-3 
self-baseline scorecards") + p.add_argument("--no-baseline", action="store_true", + help="skip the Tier-3 self-baseline regression gate") + p.add_argument("--no-save", action="store_true", + help="do not persist this scorecard to the baseline dir") + p.add_argument("--out", default=None, help="also write the scorecard JSON here") + + +def main(argv=None) -> int: + parser = argparse.ArgumentParser(prog="goals_eval", description=__doc__) + sub = parser.add_subparsers(dest="command", required=True) + + p_score = sub.add_parser("score", help="score a captured artifact") + p_score.add_argument("artifact", help="path to a smoke_pge AgentResult artifact JSON") + p_score.add_argument("--ontology", default=None, + help="ontology JSON (defaults to artifact['ontology'])") + p_score.add_argument("--metadata", default=None, + help="source metadata JSON (defaults to artifact['metadata'])") + _add_common_flags(p_score) + p_score.set_defaults(func=cmd_score) + + p_run = sub.add_parser("run", help="run the PGE pipeline live, then score it") + p_run.add_argument("--registry-json", dest="registry_json", default=None, + help="exported registry version dump for ANY domain " + "({versions:{:{ontology,metadata}}}); " + "defaults to $PGE_EVAL_REGISTRY_JSON") + p_run.add_argument("--version", default=None, + help="version key to pick from --registry-json " + "(required only when the dump has >1 version)") + p_run.add_argument("--ontology", default=None, + help="ontology JSON (registry or agent shape) — " + "alternative to --registry-json") + p_run.add_argument("--metadata", default=None, + help="source metadata JSON (used with --ontology)") + _add_common_flags(p_run) + p_run.set_defaults(func=cmd_run) + + args = parser.parse_args(argv) + return args.func(args) + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/scripts/smoke_pge.py b/scripts/smoke_pge.py new file mode 100644 index 00000000..02589881 --- /dev/null +++ b/scripts/smoke_pge.py @@ -0,0 +1,206 @@ +"""Smoke test: PGE 
pipeline on CDM V1.1 maternity ontology. + +Runs the new Planner/Generator/Evaluator pipeline against the live +fe-vm-fiifi-cdm-demo workspace. Compares per-item PASS/FAIL to the +V1.1 baseline (17 entities + 18 relationships already in registry). + +Usage from repo root with env vars set: + + .venv/bin/python scripts/smoke_pge.py [--items N] [--no-critic] + +--items N restrict to the first N entities (default: all 17, plus + all relationships whose endpoints are mapped) +--no-critic skip the semantic critic stage 2 (faster, cheaper) +--scope=entities only run entities (skip relationships) +""" + +import argparse +import json +import logging +import os +import sys +import time +from pathlib import Path + +# Make ``src/`` importable without a packaged install. +REPO_ROOT = Path(__file__).resolve().parents[1] +sys.path.insert(0, str(REPO_ROOT / "src")) + +# Route the OntoBricks loggers (which use back.core.logging.get_logger) to +# stdout at INFO so per-iteration agent traces appear in the smoke output. +logging.basicConfig( + level=logging.INFO, + format="%(asctime)s %(levelname)-5s %(name)s | %(message)s", + stream=sys.stdout, +) + +from agents.agent_mapping_pge.engine import run_agent # noqa: E402 +from back.core.databricks.DatabricksClient import DatabricksClient # noqa: E402 + +REGISTRY_JSON = "/tmp/V1_1.json" +LLM_ENDPOINT = "databricks-claude-opus-4-7" + + +def load_v1_1(): + with open(REGISTRY_JSON) as f: + doc = json.load(f) + v = doc["versions"]["1_1"] + return v["ontology"], v["metadata"], v["assignment"] + + +def to_agent_shape(ontology): + """Convert V1.1 ontology (classes + properties) to the agent's expected + {entities, relationships} shape. 
+ """ + classes = ontology.get("classes", []) + properties = ontology.get("properties", []) + + name_to_uri = {c["name"]: c["uri"] for c in classes if c.get("uri")} + + def resolve(short_or_uri): + if not short_or_uri: + return short_or_uri + if short_or_uri.startswith("http"): + return short_or_uri + return name_to_uri.get(short_or_uri, short_or_uri) + + entities = [] + for c in classes: + entities.append( + { + "uri": c.get("uri", ""), + "name": c.get("name", ""), + "label": c.get("label", ""), + "comment": c.get("comment", ""), + "parent": c.get("parent", ""), + "attributes": list(c.get("dataProperties", [])), + } + ) + + relationships = [] + for p in properties: + if p.get("type") != "ObjectProperty": + continue + relationships.append( + { + "uri": p.get("uri", ""), + "name": p.get("name", ""), + "label": p.get("label", p.get("name", "")), + "comment": p.get("comment", ""), + "domain": resolve(p.get("domain", "")), + "range": resolve(p.get("range", "")), + } + ) + return {"entities": entities, "relationships": relationships} + + +def filter_agent_ontology(agent_ont, item_limit, scope): + entities = agent_ont["entities"] + relationships = agent_ont["relationships"] + if item_limit is not None: + entities = entities[:item_limit] + kept_uris = {e["uri"] for e in entities} + if scope == "entities": + relationships = [] + else: + relationships = [ + r for r in relationships + if r["domain"] in kept_uris and r["range"] in kept_uris + ] + return {"entities": entities, "relationships": relationships} + + +def on_step(msg, pct): + print(f" [{pct:3d}%] {msg}", flush=True) + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("--items", type=int, default=None, help="Cap entity count") + parser.add_argument("--no-critic", action="store_true", help="Skip semantic critic") + parser.add_argument( + "--scope", choices=["all", "entities"], default="all", + ) + args = parser.parse_args() + + print(f"=== PGE smoke test — endpoint={LLM_ENDPOINT} ===") + 
print(f"items={args.items}, no-critic={args.no_critic}, scope={args.scope}") + print() + + print("Loading V1.1 ontology…") + raw_ont, metadata, baseline = load_v1_1() + agent_ont = to_agent_shape(raw_ont) + ontology = filter_agent_ontology(agent_ont, args.items, args.scope) + print(f" ontology: {len(ontology['entities'])} entities, {len(ontology['relationships'])} relationships") + print(f" metadata: {len(metadata.get('tables', []))} tables") + print(f" baseline: {len(baseline.get('entities', []))} entity mappings + " + f"{len(baseline.get('relationships', []))} relationships") + print() + + client = DatabricksClient() + print(f"DatabricksClient: host={client.host}, warehouse={client.warehouse_id}") + print() + + print("Invoking run_agent…") + t0 = time.time() + result = run_agent( + host=client.host, + token=client.token, + endpoint_name=LLM_ENDPOINT, + client=client, + metadata=metadata, + ontology=ontology, + documents=[], + on_step=on_step, + skip_semantic_critic=args.no_critic, + ) + elapsed = time.time() - t0 + print() + print(f"=== Run finished in {elapsed:.1f}s ===") + print(f"success={result.success}, iterations={result.iterations}, error={result.error!r}") + print(f"usage={result.usage}") + print() + + print("Per-item run log:") + for entry in result.mapping_run_log: + attempts = len(entry.get("attempts", [])) + print(f" {entry['kind']:<13} {entry['item']:<60} " + f"attempts={attempts} final={entry['final_status']}") + print() + + print(f"entity_mappings: {len(result.entity_mappings)} / {len(ontology['entities'])} " + f"(baseline {len(baseline.get('entities', []))})") + print(f"relationship_mappings: {len(result.relationship_mappings)} / " + f"{len(ontology['relationships'])} (baseline {len(baseline.get('relationships', []))})") + + # Dump the full result for inspection + out = { + "success": result.success, + "iterations": result.iterations, + "error": result.error, + "usage": result.usage, + "stats": result.stats, + "entity_mappings": 
result.entity_mappings, + "relationship_mappings": result.relationship_mappings, + "source_model": result.source_model, + "mapping_evaluations": result.mapping_evaluations, + "mapping_run_log": result.mapping_run_log, + "steps": [{"step_type": s.step_type, "tool_name": s.tool_name, "duration_ms": s.duration_ms} for s in result.steps], + # Embed the generated ontology + source metadata so the intrinsic + # evaluator (scripts/goals_eval.py score) can compute Stage-1 ontology + # metrics offline from this artifact alone. + "ontology": ontology, + "metadata": metadata, + "elapsed_s": round(elapsed, 3), + } + out_path = REPO_ROOT / "logs" / f"smoke_pge_{int(t0)}.json" + out_path.parent.mkdir(exist_ok=True) + with open(out_path, "w") as f: + json.dump(out, f, indent=2, default=str) + print(f"\nFull result written to {out_path}") + + return 0 if result.success else 1 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/src/agents/agent_auto_assignment/__init__.py b/src/agents/agent_auto_assignment/__init__.py deleted file mode 100644 index 7d98e82b..00000000 --- a/src/agents/agent_auto_assignment/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -""" -Auto-Mapping Agent – autonomous SQL mapping generation using MCP-style tools. -""" - -from agents.agent_auto_assignment.engine import run_agent, AgentResult # noqa: F401 diff --git a/src/agents/agent_auto_assignment/engine.py b/src/agents/agent_auto_assignment/engine.py deleted file mode 100644 index 6de853d1..00000000 --- a/src/agents/agent_auto_assignment/engine.py +++ /dev/null @@ -1,510 +0,0 @@ -""" -OntoBricks Auto-Mapping Agent Engine. - -Implements an agentic loop that uses the Databricks Foundation Model API -with function calling to autonomously map ontology entities and relationships -to SQL queries against domain tables. - -Fallback: if the LLM endpoint does not support the ``tools`` parameter the -engine transparently degrades to a single-shot generation (no tool calls). 
-""" - -import json -import time -from dataclasses import dataclass, field -from typing import Any, Callable, Dict, List, Optional - -import requests - -from back.core.logging import get_logger -from agents.agent_auto_assignment.tools import ( - ToolContext, - TOOL_DEFINITIONS, - TOOL_HANDLERS, -) -from agents.engine_base import ( - AgentStep, - call_serving_endpoint, - dispatch_tool, - extract_message_content, - accumulate_usage, -) -from agents.tracing import trace_agent - -logger = get_logger(__name__) - -MAX_ITERATIONS = 60 -LLM_TIMEOUT = 180 -_ITERATION_DELAY_SEC = 3 - -_TRACE_NAME = "auto_assignment" - - -# ===================================================== -# Data classes -# ===================================================== - - -@dataclass -class AgentResult: - """Outcome of a full auto-mapping agent run.""" - - success: bool - entity_mappings: list = field(default_factory=list) - relationship_mappings: list = field(default_factory=list) - steps: List[AgentStep] = field(default_factory=list) - iterations: int = 0 - error: str = "" - usage: Dict[str, int] = field(default_factory=dict) - stats: Dict[str, int] = field(default_factory=dict) - - -# ===================================================== -# System prompt -# ===================================================== - -SYSTEM_PROMPT = """\ -You are an expert data engineer. Your task is to map ontology entities \ -and relationships to SQL queries against Databricks tables. - -TOOLS -You have six tools: - • get_metadata – get imported table schemas (full names, columns, types) — no UC query - • get_documents_context – get imported domain documents to enrich domain context — no UC query - • get_ontology – get entities (with attributes) and relationships to map - • execute_sql – run a SQL query to validate it and see columns + sample data - • submit_entity_mapping – record a validated entity mapping - • submit_relationship_mapping – record a validated relationship mapping - -WORKFLOW -1. 
Call get_ontology AND get_metadata to understand what needs mapping and what data is available. -2. Call get_documents_context to read any imported documents — use them to enrich domain knowledge for better mapping decisions. -3. For EACH entity: - a. Compose a SELECT query using the table schemas. - b. Call execute_sql to validate the query works and see the columns. - c. If the query fails, fix the SQL and try execute_sql again. - d. Once validated, call submit_entity_mapping with the correct column assignments. -4. For EACH relationship: - a. Compose a SELECT query returning source and target identifiers. - b. Call execute_sql to validate the query. - c. Once validated, call submit_relationship_mapping. -5. After all mappings are submitted, output a brief summary. - -SQL RULES FOR ENTITIES (CRITICAL) -• Always use full table names from get_metadata (catalog.schema.table). -• The FIRST column MUST be aliased AS ID (the entity identifier). -• The SECOND column MUST be aliased AS Label (human-readable name). -• If the entity has attributes (non-empty "attributes" list), add one column per \ -attribute after ID and Label. -• If the entity has NO attributes, select ONLY ID and Label — no extra columns. -• If the same column serves as both an alias and an attribute, include it twice: \ -once with the alias (AS ID) and once with its original name. -• Add WHERE IS NOT NULL to filter null keys. -• Do NOT add LIMIT — the mapping query must return ALL rows. -• Do NOT use ORDER BY, CTEs, or subqueries unless absolutely necessary. -• Write simple, flat SELECT statements. - -COLUMN NAME QUOTING (CRITICAL) -• In SQL, ALWAYS wrap EVERY source column name in backticks: \ -`customer_id`, `name`, `first_name`, `column name`, `my-col`. -• Alias names (after AS) must NEVER be backtick-quoted: write AS ID, AS Label, \ -AS customer_name — NOT AS `ID`, NOT AS `Label`. 
-• When a source column name contains spaces or non-alphanumeric characters, alias \ -it to a safe snake_case name: `customer name` AS customer_name. -• The values you pass to submit_entity_mapping for id_column, label_column, and \ -attribute_mappings values are the alias names (no backticks). \ -Example: id_column="ID", label_column="Label", attribute_mappings={"name": "name"}. -• Never pass a value with backticks to id_column, label_column, source_id_column, \ -target_id_column, or attribute_mappings — always use the plain alias name. - -SQL RULES FOR RELATIONSHIPS (CRITICAL) -• SELECT exactly 2 columns: source identifier AS source_id, target identifier AS target_id. -• If both columns are in the SAME table, query only that table (no joins). -• Do NOT add LIMIT or ORDER BY. -• Apply the same always-backtick-quote rule as for entity SQL. - -ATTRIBUTE MAPPING -• In submit_entity_mapping, provide attribute_mappings: a JSON object mapping each \ -ontology attribute name to the corresponding SQL column name. -• Match by name similarity (e.g. ontology "firstName" → column "first_name"). -• Map ONLY attributes listed in the entity's "attributes" list from get_ontology. \ -If that list is empty, submit attribute_mappings: {} and do NOT add extra SQL columns. -• NEVER invent attribute mappings for columns not listed as ontology attributes. - -GENERAL RULES -• Process ALL entities and ALL relationships — do not skip any. -• If execute_sql fails, read the error and fix the SQL. -• You may batch multiple independent tool calls in a single response. -• Only ever pass row-returning queries (SELECT / WITH …) to execute_sql. \ -Never pass DESCRIBE, SHOW, EXPLAIN or other metadata statements — \ -use get_metadata for schema introspection instead. 
-• After submitting all mappings, output ONLY a brief text summary of what was mapped.""" - - -# ===================================================== -# Internal helpers -# ===================================================== - - -def _build_user_prompt(entities: List[dict], relationships: List[dict]) -> str: - parts = [] - parts.append( - f"Please map {len(entities)} entities and {len(relationships)} relationships " - "to SQL queries. Start by calling get_ontology, get_metadata, and get_documents_context " - "(documents enrich domain context for better mapping decisions)." - ) - if entities: - names = ", ".join(e.get("name", "?") for e in entities) - parts.append(f"Entities to map: {names}") - if relationships: - names = ", ".join(r.get("name", "?") for r in relationships) - parts.append(f"Relationships to map: {names}") - prompt = "\n".join(parts) - logger.debug("_build_user_prompt (%d chars):\n%s", len(prompt), prompt) - return prompt - - -# ===================================================== -# Public entry point -# ===================================================== - - -@trace_agent(name="auto_assignment") -def run_agent( - host: str, - token: str, - endpoint_name: str, - client: Any, - metadata: dict, - ontology: dict, - entity_mappings: Optional[list] = None, - relationship_mappings: Optional[list] = None, - documents: Optional[list] = None, - on_step: Optional[Callable[[str, int], None]] = None, - max_iterations: Optional[int] = None, -) -> AgentResult: - """Run the auto-mapping agent. - - The agent autonomously maps ontology entities and relationships to SQL - queries by composing, validating, and submitting mappings via tools. - - Args: - max_iterations: Override the default iteration budget. Use a smaller - value (e.g. 15) when mapping a single item to keep latency low. 
- """ - iteration_limit = max_iterations if max_iterations is not None else MAX_ITERATIONS - - entities = ontology.get("entities", []) - relationships = ontology.get("relationships", []) - total_items = len(entities) + len(relationships) - - logger.info( - "===== AUTO-ASSIGN AGENT START ===== endpoint=%s, entities=%d, relationships=%d, max_iter=%d", - endpoint_name, - len(entities), - len(relationships), - iteration_limit, - ) - logger.debug( - "run_agent: metadata tables=%d", len((metadata or {}).get("tables", [])) - ) - - ctx = ToolContext( - host=host.rstrip("/"), - token=token, - client=client, - metadata=metadata or {}, - ontology=ontology, - entity_mappings=list(entity_mappings or []), - relationships=list(relationship_mappings or []), - documents=list(documents or []), - ) - - result = AgentResult(success=False) - - # Build conversation - user_prompt = _build_user_prompt(entities, relationships) - messages: List[dict] = [ - {"role": "system", "content": SYSTEM_PROMPT}, - {"role": "user", "content": user_prompt}, - ] - logger.info( - "Agent conversation initialized: system=%d chars, user=%d chars", - len(SYSTEM_PROMPT), - len(user_prompt), - ) - - total_usage: Dict[str, int] = {"prompt_tokens": 0, "completion_tokens": 0} - current_iteration = 0 - - def _progress_pct() -> int: - mapped = len(ctx.entity_mappings) + len(ctx.relationships) - if total_items <= 0: - return 5 - return min(5 + int((mapped / total_items) * 90), 95) - - def notify(msg: str, *, pct: Optional[int] = None): - actual_pct = pct if pct is not None else _progress_pct() - logger.info("STEP [%d%%] %s", actual_pct, msg) - if on_step: - on_step(msg, actual_pct) - - notify("Starting auto-mapping agent…", pct=1) - - # ------------------------------------------------------------------ - # Agent loop - # ------------------------------------------------------------------ - tools_supported = True - - for iteration in range(iteration_limit): - # Delay between iterations to avoid "too many requests" rate 
limits - if iteration > 0: - logger.debug( - "Iteration %d: waiting %ds before LLM call (rate limit mitigation)", - iteration + 1, - _ITERATION_DELAY_SEC, - ) - time.sleep(_ITERATION_DELAY_SEC) - - current_iteration = iteration + 1 - logger.info( - "----- Iteration %d/%d — %d messages, %d entity mappings, %d rel mappings -----", - current_iteration, - iteration_limit, - len(messages), - len(ctx.entity_mappings), - len(ctx.relationships), - ) - mapped = len(ctx.entity_mappings) + len(ctx.relationships) - notify(f"Mapped {mapped}/{total_items} — thinking…") - - is_last = iteration >= iteration_limit - 1 - send_tools = TOOL_DEFINITIONS if (tools_supported and not is_last) else None - - t0 = time.time() - try: - llm_response = call_serving_endpoint( - host, - token, - endpoint_name, - messages, - tools=send_tools, - max_tokens=2048, - temperature=0.1, - timeout=LLM_TIMEOUT, - trace_name=_TRACE_NAME, - ) - except requests.exceptions.HTTPError as exc: - status = exc.response.status_code if exc.response is not None else "?" 
- logger.warning("Iteration %d: HTTPError status=%s", iteration + 1, status) - logger.debug( - "Iteration %d: HTTPError body: %.500s", - iteration + 1, - exc.response.text if exc.response is not None else "N/A", - ) - if exc.response is not None and status in (400, 422) and tools_supported: - logger.warning( - "Agent: endpoint rejected tools — falling back to direct mode" - ) - tools_supported = False - notify("Endpoint does not support tools – aborting.") - result.error = "LLM endpoint does not support function calling" - return result - result.error = f"LLM request failed: {exc}" - logger.error( - "Agent: LLM request failed at iteration %d: %s", iteration + 1, exc - ) - return result - except requests.exceptions.ReadTimeout: - result.error = f"LLM request timed out after {LLM_TIMEOUT}s" - logger.error("Agent: timeout at iteration %d", iteration + 1) - return result - except requests.exceptions.RequestException as exc: - result.error = f"LLM request failed: {exc}" - logger.error( - "Agent: request exception at iteration %d: %s", iteration + 1, exc - ) - return result - - elapsed_ms = int((time.time() - t0) * 1000) - logger.info("Iteration %d: LLM responded in %dms", iteration + 1, elapsed_ms) - - accumulate_usage(total_usage, llm_response.get("usage", {})) - - # Parse response - choice = llm_response.get("choices", [{}])[0] - finish_reason = choice.get("finish_reason", "?") - message = choice.get("message", {}) - tool_calls = message.get("tool_calls", []) - has_content = bool(message.get("content")) - logger.info( - "Iteration %d: finish_reason=%s, tool_calls=%d, has_content=%s", - iteration + 1, - finish_reason, - len(tool_calls), - has_content, - ) - - if tool_calls: - logger.info( - "Iteration %d: processing %d tool call(s): [%s]", - iteration + 1, - len(tool_calls), - ", ".join(tc.get("function", {}).get("name", "?") for tc in tool_calls), - ) - messages.append(message) - - for tc_idx, tc in enumerate(tool_calls, 1): - func = tc.get("function", {}) - tool_name 
= func.get("name", "") - raw_args = func.get("arguments", "{}") - tool_id = tc.get("id", "") - - try: - arguments = json.loads(raw_args) - except json.JSONDecodeError: - arguments = {} - - logger.info( - "Iteration %d: calling tool '%s' (%d/%d)", - iteration + 1, - tool_name, - tc_idx, - len(tool_calls), - ) - - if tool_name == "submit_entity_mapping": - name = arguments.get("class_name", "?") - notify(f"Mapping entity: {name}") - elif tool_name == "submit_relationship_mapping": - name = arguments.get("property_name", "?") - notify(f"Mapping relationship: {name}") - elif tool_name == "execute_sql": - sql_preview = arguments.get("sql", "")[:80] - notify(f"Validating SQL: {sql_preview}…") - elif tool_name == "get_metadata": - notify("Retrieving table metadata…") - elif tool_name == "get_documents_context": - notify("Retrieving imported documents…") - elif tool_name == "get_ontology": - notify("Retrieving ontology to map…") - else: - notify(f"Calling {tool_name}…") - - result.steps.append( - AgentStep( - step_type="tool_call", - content=json.dumps(arguments, default=str)[:500], - tool_name=tool_name, - ) - ) - - t1 = time.time() - tool_result = dispatch_tool( - TOOL_HANDLERS, ctx, tool_name, arguments, trace_name=_TRACE_NAME - ) - tool_ms = int((time.time() - t1) * 1000) - - logger.info( - "Iteration %d: tool '%s' returned %d chars in %dms", - iteration + 1, - tool_name, - len(tool_result), - tool_ms, - ) - - result.steps.append( - AgentStep( - step_type="tool_result", - content=( - (tool_result[:500] + "…") - if len(tool_result) > 500 - else tool_result - ), - tool_name=tool_name, - duration_ms=tool_ms, - ) - ) - - messages.append( - { - "role": "tool", - "tool_call_id": tool_id, - "content": tool_result, - } - ) - - mapped = len(ctx.entity_mappings) + len(ctx.relationships) - notify(f"Mapped {mapped}/{total_items} items") - logger.info( - "Iteration %d: tool calls done, conversation=%d messages, mappings=%d/%d", - iteration + 1, - len(messages), - mapped, - 
total_items, - ) - else: - # Agent produced text — should be the final summary - content = extract_message_content(llm_response) - logger.info( - "Iteration %d: agent produced text output — %d chars", - iteration + 1, - len(content), - ) - - result.steps.append( - AgentStep( - step_type="output", - content=(content[:500] + "…") if len(content) > 500 else content, - duration_ms=elapsed_ms, - ) - ) - - result.success = True - result.entity_mappings = ctx.entity_mappings - result.relationship_mappings = ctx.relationships - result.iterations = iteration + 1 - result.usage = total_usage - result.stats = { - "total": total_items, - "entities": len(ctx.entity_mappings), - "relationships": len(ctx.relationships), - } - - logger.info( - "===== AUTO-ASSIGN AGENT COMPLETE ===== iterations=%d, " - "entity_mappings=%d, rel_mappings=%d, " - "prompt_tokens=%d, completion_tokens=%d", - result.iterations, - len(ctx.entity_mappings), - len(ctx.relationships), - total_usage["prompt_tokens"], - total_usage["completion_tokens"], - ) - notify("Agent completed!", pct=100) - return result - - # Exhausted iterations — still return what we have - result.entity_mappings = ctx.entity_mappings - result.relationship_mappings = ctx.relationships - result.iterations = iteration_limit - result.usage = total_usage - result.stats = { - "total": total_items, - "entities": len(ctx.entity_mappings), - "relationships": len(ctx.relationships), - } - if ctx.entity_mappings or ctx.relationships: - result.success = True - result.error = f"Agent used all {iteration_limit} iterations but submitted partial mappings" - logger.warning( - "===== AUTO-ASSIGN AGENT PARTIAL ===== %s — entity=%d, rel=%d", - result.error, - len(ctx.entity_mappings), - len(ctx.relationships), - ) - else: - result.error = f"Agent reached maximum iterations ({iteration_limit}) without producing mappings" - logger.error("===== AUTO-ASSIGN AGENT FAILED ===== %s", result.error) - - return result diff --git 
a/src/agents/agent_auto_assignment/tools.py b/src/agents/agent_auto_assignment/tools.py deleted file mode 100644 index 94ceee49..00000000 --- a/src/agents/agent_auto_assignment/tools.py +++ /dev/null @@ -1,47 +0,0 @@ -""" -Tool assembly for the Auto-Mapping Agent. - -Composes the set of tools available to this agent from the shared -``agents.tools`` package. -""" - -from typing import Callable, Dict, List - -from agents.tools.context import ToolContext -from agents.tools.metadata import ( - GET_METADATA_DEF, - tool_get_metadata, -) -from agents.tools.ontology import ( - ONTOLOGY_TOOL_DEFINITIONS, - ONTOLOGY_TOOL_HANDLERS, -) -from agents.tools.sql import ( - SQL_TOOL_DEFINITIONS, - SQL_TOOL_HANDLERS, -) -from agents.tools.mapping import ( - MAPPING_TOOL_DEFINITIONS, - MAPPING_TOOL_HANDLERS, -) -from agents.tools.documents import ( - GET_DOCUMENTS_CONTEXT_DEF, - tool_get_documents_context, -) - -__all__ = ["ToolContext", "TOOL_DEFINITIONS", "TOOL_HANDLERS"] - -TOOL_DEFINITIONS: List[dict] = ( - [GET_METADATA_DEF, GET_DOCUMENTS_CONTEXT_DEF] - + ONTOLOGY_TOOL_DEFINITIONS - + SQL_TOOL_DEFINITIONS - + MAPPING_TOOL_DEFINITIONS -) - -TOOL_HANDLERS: Dict[str, Callable] = { - "get_metadata": tool_get_metadata, - "get_documents_context": tool_get_documents_context, - **ONTOLOGY_TOOL_HANDLERS, - **SQL_TOOL_HANDLERS, - **MAPPING_TOOL_HANDLERS, -} diff --git a/src/agents/agent_dtwin_chat/tools.py b/src/agents/agent_dtwin_chat/tools.py index 155bf6bd..2ad945a9 100644 --- a/src/agents/agent_dtwin_chat/tools.py +++ b/src/agents/agent_dtwin_chat/tools.py @@ -47,7 +47,11 @@ logger = get_logger(__name__) -_HTTP_TIMEOUT = 120 +# Interactive chat tools: warm Lakebase graph queries are sub-second; a long +# wait almost always means the autoscaling Lakebase instance is cold (scaled +# to zero) and waking. Fail fast with a graceful message rather than make the +# user wait minutes — a retry once the instance is warm succeeds quickly. 
+_HTTP_TIMEOUT = 60 _MAX_DEPTH = 1 _SPARQL_DANGEROUS = re.compile( r"\b(DROP|DELETE|INSERT|CREATE|CLEAR|LOAD|COPY|MOVE|ADD)\b", @@ -195,6 +199,11 @@ def tool_describe_entity( "depth": min(max(int(depth or _MAX_DEPTH), 1), 10), "limit": 500, "offset": 0, + # Cap BFS seeds: a broad search ("mother") otherwise seeds every match + # and the recursive traversal can run for minutes. The agent only needs + # a handful of concrete examples to describe, so 25 seeds is plenty and + # keeps the query fast. + "seed_limit": 25, } if search: params["search"] = search diff --git a/src/agents/agent_mapping_pge/__init__.py b/src/agents/agent_mapping_pge/__init__.py new file mode 100644 index 00000000..0da0713e --- /dev/null +++ b/src/agents/agent_mapping_pge/__init__.py @@ -0,0 +1,50 @@ +"""Planner -> Generator -> Evaluator (PGE) mapping agent. + +Three-stage mapping pipeline that replaces the prior single-loop ReAct +mapping agent: + +* **Planner** — proposes a :class:`SourceModel` (table roles, canonical IDs, + join keys, ordered mapping plan). +* **Generator** — produces individual entity/relationship mappings given the + plan. +* **Evaluator** — checks each submitted mapping; stage 1 is deterministic + (pure SQL counts), stage 2 is semantic. + +Sprint 1 lays the foundation: the typed contracts plus the deterministic +evaluator. Subsequent sprints add the LLM-backed Planner, Generator, +semantic Evaluator, and the orchestrating loop. 
+""" + +from agents.agent_mapping_pge.contracts import ( + CanonicalId, + EvalFailure, + EvalReport, + JoinKey, + MappingPlan, + RetryState, + SkipItem, + SourceModel, + TableRole, + TableRoleCandidate, +) +from agents.agent_mapping_pge.engine import ( + AgentResult, + AgentStep, + run_agent, +) + +__all__ = [ + "AgentResult", + "AgentStep", + "CanonicalId", + "EvalFailure", + "EvalReport", + "JoinKey", + "MappingPlan", + "RetryState", + "SkipItem", + "SourceModel", + "TableRole", + "TableRoleCandidate", + "run_agent", +] diff --git a/src/agents/agent_mapping_pge/contracts.py b/src/agents/agent_mapping_pge/contracts.py new file mode 100644 index 00000000..8172e431 --- /dev/null +++ b/src/agents/agent_mapping_pge/contracts.py @@ -0,0 +1,344 @@ +"""Typed contracts for the mapping PGE pipeline. + +These dataclasses are the load-bearing interface between Planner, Generator, +Evaluator, and the orchestrator (added in later sprints). All shapes here +are JSON round-trippable via ``to_dict()`` / ``from_dict()`` so they can be +persisted as artefacts, attached to MLflow traces, or shipped over the wire +to the UI. + +No LLM code lives here; this is a pure-data module. +""" + +from dataclasses import dataclass, field, fields, is_dataclass +from typing import Any, Dict, List, Optional + + +# ===================================================== +# SourceModel — Planner output +# ===================================================== + + +@dataclass +class TableRoleCandidate: + """A candidate ontology class for a given source table.""" + + uri: str + confidence: float # 0.0 .. 
@dataclass
class CanonicalId:
    """Which column identifies an ontology class in each of its source tables.

    ``canonical_column_per_table`` is keyed by full table name and gives the
    column to treat as the canonical identifier in that table (e.g. NHS
    number rather than the trust-local patient id). ``format_note`` is free
    text describing the identifier format.
    """

    ontology_class: str  # class URI
    canonical_column_per_table: Dict[str, str] = field(default_factory=dict)
    format_note: str = ""

    def to_dict(self) -> Dict[str, Any]:
        """Return a JSON-friendly dict; the column map is copied, not shared."""
        columns_copy = dict(self.canonical_column_per_table)
        payload: Dict[str, Any] = {"ontology_class": self.ontology_class}
        payload["canonical_column_per_table"] = columns_copy
        payload["format_note"] = self.format_note
        return payload

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "CanonicalId":
        """Rebuild an instance; only ``ontology_class`` is a required key."""
        class_uri = data["ontology_class"]
        columns = dict(data.get("canonical_column_per_table", {}))
        note = data.get("format_note", "")
        return cls(
            ontology_class=class_uri,
            canonical_column_per_table=columns,
            format_note=note,
        )
@dataclass
class SkipItem:
    """A single ontology entity/relationship the planner decided not to map.

    ``item`` holds the URI of the skipped entity or relationship and
    ``reason`` the planner's (optional) justification.
    """

    item: str  # uri
    reason: str = ""

    def to_dict(self) -> Dict[str, Any]:
        """Return the two fields as a plain dict."""
        payload: Dict[str, Any] = {}
        payload["item"] = self.item
        payload["reason"] = self.reason
        return payload

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "SkipItem":
        """Rebuild from a dict; ``item`` is required, ``reason`` defaults to ''."""
        uri = data["item"]
        why = data.get("reason", "")
        return cls(item=uri, reason=why)
@dataclass
class EvalFailure:
    """One failed check recorded in an :class:`EvalReport`.

    ``hint`` carries the actionable correction text that is fed back to the
    Generator on retry; keep it concrete and template-like rather than a
    free-form essay.
    """

    kind: str  # "structural" | "semantic"
    check: str  # e.g. "dangling_source_pct"
    expected: str  # e.g. "< 0.05"
    observed: str  # e.g. "0.47"
    hint: str = ""

    def to_dict(self) -> Dict[str, Any]:
        """Return all five fields as a plain dict, in declaration order."""
        names = ("kind", "check", "expected", "observed", "hint")
        return {name: getattr(self, name) for name in names}

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "EvalFailure":
        """Rebuild from a dict; every key except ``hint`` is required."""
        required = {
            name: data[name] for name in ("kind", "check", "expected", "observed")
        }
        return cls(hint=data.get("hint", ""), **required)
# Every contract type, in definition order. Iterated once at import time so a
# class that accidentally loses its ``@dataclass`` decorator fails loudly.
_ALL_CONTRACTS = (
    TableRoleCandidate,
    TableRole,
    CanonicalId,
    JoinKey,
    SkipItem,
    MappingPlan,
    SourceModel,
    EvalFailure,
    EvalReport,
    RetryState,
)
for _cls in _ALL_CONTRACTS:
    # Explicit raise rather than ``assert``: assert statements are removed
    # under ``python -O``, which would silently disable this check.
    if not is_dataclass(_cls):
        raise TypeError(f"{_cls.__name__} must be a dataclass")
    # touch ``fields`` to ensure all defaults are well-formed at import time.
    fields(_cls)
del _cls
+ +Control flow per item (entity or relationship) +============================================== + +1. Build a focused slice from the Planner's :class:`SourceModel`. +2. Run the appropriate Generator with ``retry_hint=None``. +3. Run the deterministic evaluator. On FAIL: + * if ``bubble_to_planner=True`` -> escalate to Planner (capped at 2 global + replans across the whole run); + * else retry the Generator with the first failure's hint. +4. On stage-1 PASS, run the semantic critic (unless ``skip_semantic_critic`` + is set). Same bubble / hint logic on FAIL. +5. After 3 unsuccessful attempts, the item is recorded as ``FAIL_BUDGET`` and + the orchestrator moves on to the next item. + +Step-log design +=============== + +``AgentResult.steps`` is a HIGH-LEVEL log — one entry per stage transition +(planner-start, generator-start, evaluator-result, critic-result, item-done). +The detailed per-tool steps emitted by each sub-agent stay on the sub-agent's +own result dataclass (``PlannerResult.steps``, ``EntityGenResult.steps``, …) +and are NOT merged into the orchestrator's ``steps`` field. This keeps the +top-level log readable in the UI; the persistence layer can attach sub-agent +step lists separately when needed. 
@dataclass
class AgentStep:
    """One observable step of the orchestrator's execution.

    Same shape as :class:`agents.engine_base.AgentStep` plus a few extra
    ``step_type`` values used by the PGE orchestrator:

    * ``"planner"`` / ``"generator"`` / ``"evaluator"`` / ``"critic"`` for
      stage transitions; the legacy ``"tool_call"`` / ``"tool_result"`` /
      ``"output"`` types remain valid so this struct is fully drop-in-
      compatible with the prior orchestrator.
    """

    # Stage tag for this step (see the list of accepted values above).
    step_type: str
    # Human-readable description of what happened in this step.
    content: str
    # Optional tool name; empty unless the producer supplies one.
    tool_name: str = ""
    # Elapsed time in milliseconds; stays 0 when the producer passes none.
    duration_ms: int = 0
The last three are PGE-specific extras the caller + can choose to persist. + """ + + success: bool + entity_mappings: list = field(default_factory=list) + relationship_mappings: list = field(default_factory=list) + steps: List[AgentStep] = field(default_factory=list) + iterations: int = 0 + error: str = "" + usage: Dict[str, int] = field(default_factory=dict) + stats: Dict[str, int] = field(default_factory=dict) + # PGE-specific extras + source_model: Optional[dict] = None + mapping_evaluations: Dict[str, dict] = field(default_factory=dict) + mapping_run_log: List[dict] = field(default_factory=list) + + +# ===================================================== +# Internal helpers +# ===================================================== + + +def _ontology_index(ontology: dict) -> Dict[str, dict]: + """Build ``uri -> entity dict`` for fast lookup by URI.""" + out: Dict[str, dict] = {} + for e in (ontology or {}).get("entities", []) or []: + uri = e.get("uri") or e.get("name") + if uri: + out[uri] = e + return out + + +def _relationship_index(ontology: dict) -> Dict[str, dict]: + """Build ``uri -> relationship dict`` for fast lookup by URI.""" + out: Dict[str, dict] = {} + for r in (ontology or {}).get("relationships", []) or []: + uri = r.get("uri") or r.get("name") + if uri: + out[uri] = r + return out + + +def _slice_for_entity(source_model: SourceModel, class_uri: str) -> dict: + """Render the SourceModel slice consumed by the EntityGenerator. + + The slice surfaces only what's relevant to one ontology class: + candidate tables, the canonical-ID per chosen table, and any joins + naming a candidate table on at least one side. 
+ """ + candidate_tables: List[dict] = [] + candidate_table_names: set = set() + for role in source_model.table_roles: + for cand in role.ontology_class_candidates: + if cand.uri == class_uri: + candidate_tables.append( + { + "table": role.table, + "confidence": cand.confidence, + "reason": cand.reason, + } + ) + candidate_table_names.add(role.table) + break # one entry per role is enough + + canonical_id_obj: Dict[str, Any] = { + "ontology_class": class_uri, + "canonical_column_per_table": {}, + "format_note": "", + } + for c in source_model.canonical_ids: + if c.ontology_class == class_uri: + canonical_id_obj = c.to_dict() + break + + relevant_joins: List[dict] = [] + for j in source_model.join_keys: + from_table = j.from_ref.split(".")[0] if j.from_ref else "" + to_table = j.to_ref.split(".")[0] if j.to_ref else "" + if any( + ft == from_table or ft.endswith("." + from_table) + for ft in candidate_table_names + ) or any( + tt == to_table or tt.endswith("." + to_table) + for tt in candidate_table_names + ): + relevant_joins.append(j.to_dict()) + + return { + "candidate_tables": candidate_tables, + "canonical_id": canonical_id_obj, + "relevant_joins": relevant_joins, + } + + +def _slice_for_relationship( + source_model: SourceModel, + property_uri: str, + source_entity_mapping: dict, + target_entity_mapping: dict, +) -> dict: + """Render the SourceModel slice consumed by the RelationshipGenerator. + + The slice surfaces every join key the Planner produced (the Generator + picks among them), plus the candidate-table list filtered to the + source/target classes when those classes are known. 
+ """ + src_class = (source_entity_mapping or {}).get("ontology_class") or ( + source_entity_mapping or {} + ).get("class_uri", "") + tgt_class = (target_entity_mapping or {}).get("ontology_class") or ( + target_entity_mapping or {} + ).get("class_uri", "") + endpoint_classes = {c for c in (src_class, tgt_class) if c} + + candidate_tables: List[dict] = [] + for role in source_model.table_roles: + for cand in role.ontology_class_candidates: + if not endpoint_classes or cand.uri in endpoint_classes: + candidate_tables.append( + { + "table": role.table, + "ontology_class": cand.uri, + "confidence": cand.confidence, + "reason": cand.reason, + } + ) + break + + relevant_joins = [j.to_dict() for j in source_model.join_keys] + + return { + "property_uri": property_uri, + "relevant_joins": relevant_joins, + "candidate_tables": candidate_tables, + } + + +def _wrap_execute_sql(client: Any) -> Callable[[str], dict]: + """Adapt ``client.execute_query`` to the evaluator's expected shape. + + The deterministic evaluator wants ``{"columns": [...], "rows": [{...}]}`` + with FULL rows. ``client.execute_query`` returns ``List[Dict[str, Any]]`` + — we promote that to the evaluator's shape and derive columns from the + first row. Calling the underlying client directly (rather than the + sampling ``tool_execute_sql``) is load-bearing: the deterministic + evaluator's count-based checks need real values, not stringified ones. 
+ """ + + def _run(sql: str) -> dict: + rows = client.execute_query(sql) or [] + if isinstance(rows, dict) and "rows" in rows: + return rows # client already returns the evaluator's shape + columns: List[str] = [] + if rows and isinstance(rows[0], dict): + columns = list(rows[0].keys()) + return {"columns": columns, "rows": list(rows)} + + return _run + + +def _first_hint(report: EvalReport) -> Optional[str]: + """Return the first failure's hint (or ``None`` when the report has none).""" + for f in report.failures: + if f.hint: + return f.hint + return None + + +def _resolve_endpoint_em( + ref: str, + by_uri: Dict[str, dict], + entity_index: Dict[str, dict], +) -> Optional[dict]: + """Best-effort lookup of an endpoint entity mapping. + + The ontology's ``domain`` / ``range`` may use either the entity's full + URI or its short name. We try direct lookup, then a name-match scan. + """ + if not ref: + return None + if ref in by_uri: + return by_uri[ref] + for uri, ent in entity_index.items(): + if ent.get("name") == ref or ent.get("label") == ref: + if uri in by_uri: + return by_uri[uri] + return None + + +# ===================================================== +# Public entry point +# ===================================================== + + +@trace_agent(name="mapping_pge_engine") +def run_agent( + host: str, + token: str, + endpoint_name: str, + client: Any, + metadata: dict, + ontology: dict, + entity_mappings: Optional[list] = None, + relationship_mappings: Optional[list] = None, + documents: Optional[list] = None, + on_step: Optional[Callable[[str, int], None]] = None, + max_iterations: Optional[int] = None, + *, + skip_semantic_critic: bool = False, +) -> AgentResult: + """Run the PGE mapping orchestrator. + + Drop-in replacement for the prior in-house single-loop mapping agent — + same positional/keyword signature, same :class:`AgentResult` shape. + + Args: + host: Databricks workspace URL. + token: Bearer token for the serving endpoint. 
+ endpoint_name: Foundation Model serving endpoint name. + client: Databricks SQL client exposing ``execute_query(sql)``. + metadata: Imported table metadata to hand to the Planner. + ontology: Ontology dict with ``entities`` and ``relationships``. + entity_mappings: Pre-seeded entity mappings (URI matched -> skipped). + relationship_mappings: Pre-seeded relationship mappings (likewise). + documents: Optional pre-loaded domain documents. + on_step: Optional progress callback ``(msg, pct)``. + max_iterations: Per-item override for the Generator's iteration cap. + Kept for API parity with the legacy engine; ``None`` uses each + sub-agent's default. + skip_semantic_critic: When ``True``, the orchestrator skips the + stage-2 critic and accepts every stage-1 PASS as a final PASS. + Production callers leave this ``False``; tests flip it ``True`` + to avoid LLM calls in the orchestrator's unit tests. + + Returns: + An :class:`AgentResult` with the submitted mappings, a high-level + ``steps`` log, per-item ``mapping_run_log``, and PGE-specific + extras (``source_model``, ``mapping_evaluations``). + """ + # ------------------------------------------------------------------ + # Per-call state lives entirely on this RunState object — no module- + # level mutables, so concurrent calls (and tests) cannot collide. + # ------------------------------------------------------------------ + state = _RunState( + host=host, + token=token, + endpoint_name=endpoint_name, + client=client, + metadata=metadata or {}, + ontology=ontology or {}, + documents=list(documents or []), + on_step=on_step, + max_iterations=max_iterations, + skip_semantic_critic=skip_semantic_critic, + ) + + # Pre-seeded mappings carry over verbatim — we never overwrite a URI the + # caller already mapped. 
+ pre_entity_list = list(entity_mappings or []) + pre_rel_list = list(relationship_mappings or []) + preseeded_entity_uris = { + m.get("ontology_class") or m.get("class_uri") or "" for m in pre_entity_list + } + preseeded_entity_uris.discard("") + preseeded_rel_uris = { + m.get("property") or m.get("property_uri") or "" for m in pre_rel_list + } + preseeded_rel_uris.discard("") + + state.entity_mappings.extend(pre_entity_list) + state.relationship_mappings.extend(pre_rel_list) + for m in pre_entity_list: + uri = m.get("ontology_class") or m.get("class_uri") + if uri: + state.entity_mapping_by_uri[uri] = m + + entities_in_scope = state.ontology.get("entities", []) or [] + relationships_in_scope = state.ontology.get("relationships", []) or [] + + logger.info( + "===== MAPPING-PGE ENGINE START ===== endpoint=%s, entities=%d, " + "relationships=%d, preseeded_entities=%d, preseeded_rels=%d, " + "skip_critic=%s", + endpoint_name, + len(entities_in_scope), + len(relationships_in_scope), + len(preseeded_entity_uris), + len(preseeded_rel_uris), + skip_semantic_critic, + ) + + # ------------------------------------------------------------------ + # 1. 
Planner + # ------------------------------------------------------------------ + state.notify("Planning…", pct=2) + state.add_step("planner", "planner-start") + + t0 = time.time() + try: + planner_result = run_planner( + host=host, + token=token, + endpoint_name=endpoint_name, + client=client, + metadata=state.metadata, + ontology=state.ontology, + documents=state.documents, + on_step=None, + ) + except Exception as exc: # noqa: BLE001 — surface any failure as run failure + logger.error("Planner raised an exception: %s", exc, exc_info=True) + return state.finalise(error=f"planner exception: {exc}") + + planner_ms = int((time.time() - t0) * 1000) + state.add_iterations(planner_result.iterations) + state.accumulate_usage(planner_result.usage) + + if not planner_result.success or planner_result.source_model is None: + state.add_step( + "planner", + f"planner-fail: {planner_result.error}", + duration_ms=planner_ms, + ) + logger.error("===== MAPPING-PGE ENGINE FAILED ===== planner failed") + state.notify("Planner failed — aborting.", pct=10) + return state.finalise( + error=f"planner failed: {planner_result.error or 'no source model'}" + ) + + state.source_model = planner_result.source_model + state.refresh_plan() + state.add_step( + "planner", + f"planner-done: entities={len(state.entity_order)}, " + f"relationships={len(state.relationship_order)}", + duration_ms=planner_ms, + ) + + # ------------------------------------------------------------------ + # 2. Walk the plan — entities first, then relationships. 
+ # ------------------------------------------------------------------ + state.entity_index = _ontology_index(state.ontology) + state.rel_index = _relationship_index(state.ontology) + state.execute_sql_fn = _wrap_execute_sql(client) + state.total_items_planned = len(state.entity_order) + len( + state.relationship_order + ) + + # ------------------ Entity walk ------------------ + for entity_uri in list(state.entity_order): + ontology_class = state.entity_index.get(entity_uri, {"uri": entity_uri}) + label = ontology_class.get("label") or ontology_class.get( + "name", entity_uri + ) + + if entity_uri in preseeded_entity_uris: + state.mapping_run_log.append( + { + "item": entity_uri, + "kind": "entity", + "attempts": [], + "final_status": "PRESEEDED", + } + ) + state.notify(f"Skipping pre-seeded {label}") + state.items_done += 1 + continue + + if entity_uri in state.skip_reasons: + state.mapping_run_log.append( + { + "item": entity_uri, + "kind": "entity", + "attempts": [], + "final_status": "SKIPPED", + } + ) + state.notify( + f"Skipped {label}: {state.skip_reasons[entity_uri]}" + ) + state.items_done += 1 + continue + + final_status, attempts_log, last_mapping, last_report = _run_entity_item( + state, ontology_class + ) + + state.mapping_run_log.append( + { + "item": entity_uri, + "kind": "entity", + "attempts": attempts_log, + "final_status": final_status, + } + ) + if final_status == "PASS" and last_mapping is not None: + state.entity_mappings.append(last_mapping) + state.entity_mapping_by_uri[entity_uri] = last_mapping + state.submitted_any = True + if last_report is not None: + state.mapping_evaluations[entity_uri] = last_report.to_dict() + state.notify(f"Mapped {label}") + state.items_done += 1 + + # ------------------ Relationship walk ------------------ + for property_uri in list(state.relationship_order): + prop = state.rel_index.get(property_uri, {"uri": property_uri}) + label = prop.get("label") or prop.get("name", property_uri) + + if property_uri in 
preseeded_rel_uris: + state.mapping_run_log.append( + { + "item": property_uri, + "kind": "relationship", + "attempts": [], + "final_status": "PRESEEDED", + } + ) + state.notify(f"Skipping pre-seeded {label}") + state.items_done += 1 + continue + + if property_uri in state.skip_reasons: + state.mapping_run_log.append( + { + "item": property_uri, + "kind": "relationship", + "attempts": [], + "final_status": "SKIPPED", + } + ) + state.notify(f"Skipped {label}: {state.skip_reasons[property_uri]}") + state.items_done += 1 + continue + + domain_ref = prop.get("domain", "") or "" + range_ref = prop.get("range", "") or "" + source_em = state.entity_mapping_by_uri.get( + domain_ref + ) or _resolve_endpoint_em( + domain_ref, state.entity_mapping_by_uri, state.entity_index + ) + target_em = state.entity_mapping_by_uri.get( + range_ref + ) or _resolve_endpoint_em( + range_ref, state.entity_mapping_by_uri, state.entity_index + ) + if source_em is None or target_em is None: + state.mapping_run_log.append( + { + "item": property_uri, + "kind": "relationship", + "attempts": [], + "final_status": "FAIL_BUDGET", + } + ) + state.add_step( + "evaluator", + f"relationship {property_uri}: endpoint mapping missing — skipped", + ) + state.notify(f"Cannot map {label}: endpoint entity not available") + state.items_done += 1 + continue + + final_status, attempts_log, last_mapping, last_report = _run_relationship_item( + state, prop, source_em, target_em + ) + + state.mapping_run_log.append( + { + "item": property_uri, + "kind": "relationship", + "attempts": attempts_log, + "final_status": final_status, + } + ) + if final_status == "PASS" and last_mapping is not None: + state.relationship_mappings.append(last_mapping) + state.submitted_any = True + if last_report is not None: + state.mapping_evaluations[property_uri] = last_report.to_dict() + state.notify(f"Mapped {label}") + state.items_done += 1 + + state.notify("Agent completed!", pct=100) + return state.finalise() + + +# 
===================================================== +# Run-scoped mutable state +# ===================================================== + + +@dataclass +class _RunState: + """Encapsulates per-call mutable state — keeps ``run_agent`` re-entrant. + + All counters, mapping lists, and accumulators that need to evolve as the + walk progresses live here so the orchestrator never relies on module- + level globals. This also keeps the per-item helpers (``_run_*_item``) + pure functions of state + item input. + """ + + host: str + token: str + endpoint_name: str + client: Any + metadata: dict + ontology: dict + documents: List[Any] + on_step: Optional[Callable[[str, int], None]] + max_iterations: Optional[int] + skip_semantic_critic: bool + + # Output accumulators + entity_mappings: List[dict] = field(default_factory=list) + relationship_mappings: List[dict] = field(default_factory=list) + entity_mapping_by_uri: Dict[str, dict] = field(default_factory=dict) + mapping_run_log: List[dict] = field(default_factory=list) + mapping_evaluations: Dict[str, dict] = field(default_factory=dict) + steps: List[AgentStep] = field(default_factory=list) + usage: Dict[str, int] = field( + default_factory=lambda: {"prompt_tokens": 0, "completion_tokens": 0} + ) + iterations: int = 0 + submitted_any: bool = False + + # Plan-derived state — refreshed on (re)plan. 
+ source_model: Optional[SourceModel] = None + entity_order: List[str] = field(default_factory=list) + relationship_order: List[str] = field(default_factory=list) + skip_reasons: Dict[str, str] = field(default_factory=dict) + planner_reinvocations: int = 0 + + # Walk progress + items_done: int = 0 + total_items_planned: int = 0 + + # Per-run caches & lookups + id_universe_cache: Dict[str, set] = field(default_factory=dict) + entity_index: Dict[str, dict] = field(default_factory=dict) + rel_index: Dict[str, dict] = field(default_factory=dict) + execute_sql_fn: Optional[Callable[[str], dict]] = None + + # -- helpers ---------------------------------------------------------- + + def add_step( + self, + step_type: str, + content: str, + *, + tool_name: str = "", + duration_ms: int = 0, + ) -> None: + self.steps.append( + AgentStep( + step_type=step_type, + content=content, + tool_name=tool_name, + duration_ms=duration_ms, + ) + ) + + def pct(self) -> int: + total = max(self.total_items_planned, 1) + return min(5 + int((self.items_done / total) * 90), 95) + + def notify(self, msg: str, *, pct: Optional[int] = None) -> None: + actual_pct = pct if pct is not None else self.pct() + logger.info("PGE STEP [%d%%] %s", actual_pct, msg) + if self.on_step: + self.on_step(msg, actual_pct) + + def add_iterations(self, n: int) -> None: + self.iterations += int(n or 0) + + def accumulate_usage(self, src: Dict[str, int]) -> None: + for k in ("prompt_tokens", "completion_tokens"): + self.usage[k] = self.usage.get(k, 0) + int((src or {}).get(k, 0)) + + def refresh_plan(self) -> None: + sm = self.source_model + if sm is None: + return + self.entity_order = list(sm.mapping_plan.entity_order) + self.relationship_order = list(sm.mapping_plan.relationship_order) + self.skip_reasons = {s.item: s.reason for s in sm.mapping_plan.skip} + + def replan_once(self) -> bool: + """Re-invoke the Planner once (subject to the global budget). 
+ + Returns ``True`` on success (and updates the plan in place), ``False`` + when the budget is exhausted or the new Planner run failed. + """ + if self.planner_reinvocations >= _PLANNER_REINVOCATION_BUDGET: + return False + self.planner_reinvocations += 1 + self.notify("Re-planning (escalated)…", pct=self.pct()) + self.add_step( + "planner", + f"replan-start (reinvocation #{self.planner_reinvocations})", + ) + t_rp = time.time() + try: + new_result = run_planner( + host=self.host, + token=self.token, + endpoint_name=self.endpoint_name, + client=self.client, + metadata=self.metadata, + ontology=self.ontology, + documents=self.documents, + on_step=None, + ) + except Exception as exc: # noqa: BLE001 + logger.error("Replan raised an exception: %s", exc, exc_info=True) + self.add_step("planner", f"replan-exception: {exc}") + return False + replan_ms = int((time.time() - t_rp) * 1000) + self.add_iterations(new_result.iterations) + self.accumulate_usage(new_result.usage) + if not new_result.success or new_result.source_model is None: + self.add_step( + "planner", + f"replan-fail: {new_result.error}", + duration_ms=replan_ms, + ) + return False + self.source_model = new_result.source_model + self.refresh_plan() + self.add_step("planner", "replan-done", duration_ms=replan_ms) + return True + + def finalise(self, *, error: str = "") -> AgentResult: + """Build the final :class:`AgentResult`.""" + result = AgentResult(success=False) + result.entity_mappings = list(self.entity_mappings) + result.relationship_mappings = list(self.relationship_mappings) + result.steps = list(self.steps) + result.iterations = self.iterations + result.usage = dict(self.usage) + result.mapping_run_log = list(self.mapping_run_log) + result.mapping_evaluations = dict(self.mapping_evaluations) + result.source_model = ( + self.source_model.to_dict() if self.source_model is not None else None + ) + result.stats = { + "total": len(self.entity_order) + len(self.relationship_order), + "entities": 
len(self.entity_mappings), + "relationships": len(self.relationship_mappings), + "planner_reinvocations": self.planner_reinvocations, + } + if error: + result.error = error + result.success = False + return result + + # Success when at least one mapping was submitted, OR when there was + # nothing to map (legitimate empty run). + nothing_to_map = ( + not self.entity_order and not self.relationship_order + ) + result.success = self.submitted_any or nothing_to_map + if not result.success: + result.error = ( + "no mappings submitted (all items failed or were skipped)" + ) + logger.info( + "===== MAPPING-PGE ENGINE COMPLETE ===== success=%s, entities=%d, " + "relationships=%d, iterations=%d, replans=%d", + result.success, + len(self.entity_mappings), + len(self.relationship_mappings), + self.iterations, + self.planner_reinvocations, + ) + return result + + +# ===================================================== +# Per-item walk helpers +# ===================================================== + + +def _run_entity_item( + state: _RunState, + ontology_class: dict, +) -> Tuple[str, List[dict], Optional[dict], Optional[EvalReport]]: + """Run the G->E loop for one entity. + + Returns ``(final_status, attempts_log, last_mapping, last_report)``. + The outer ``while True`` lets a successful replan restart the inner + retry budget fresh, which is the intent of the bubble-to-planner path. 
+ """ + class_uri = ontology_class.get("uri", "") + class_label = ontology_class.get("label") or ontology_class.get( + "name", class_uri + ) + attempts_log: List[dict] = [] + last_mapping: Optional[dict] = None + last_report: Optional[EvalReport] = None + + while True: + retry_hint: Optional[str] = None + bubble_requested = False + for attempt_idx in range(_PER_ITEM_GENERATOR_ATTEMPTS): + attempt_num = attempt_idx + 1 + slice_dict = _slice_for_entity(state.source_model, class_uri) + state.notify( + f"Mapping {class_label} (attempt {attempt_num}/{_PER_ITEM_GENERATOR_ATTEMPTS})…", + pct=state.pct(), + ) + state.add_step( + "generator", + f"entity-gen-start: {class_uri} attempt {attempt_num}", + ) + t_g = time.time() + try: + gen_result = run_entity_generator( + host=state.host, + token=state.token, + endpoint_name=state.endpoint_name, + client=state.client, + ontology_class=ontology_class, + source_model_slice=slice_dict, + retry_hint=retry_hint, + on_step=None, + **( + {"max_iterations": state.max_iterations} + if state.max_iterations is not None + else {} + ), + ) + except Exception as exc: # noqa: BLE001 + logger.error( + "EntityGenerator raised on %s attempt %d: %s", + class_uri, + attempt_num, + exc, + exc_info=True, + ) + attempts_log.append( + { + "attempt": attempt_num, + "generator_ms": int((time.time() - t_g) * 1000), + "stage1_status": "skipped", + "critic_status": "skipped", + "bubble": False, + "hint": None, + "error": f"generator exception: {exc}", + } + ) + continue + gen_ms = int((time.time() - t_g) * 1000) + state.add_iterations(gen_result.iterations) + state.accumulate_usage(gen_result.usage) + + if not gen_result.success or gen_result.mapping is None: + attempts_log.append( + { + "attempt": attempt_num, + "generator_ms": gen_ms, + "stage1_status": "skipped", + "critic_status": "skipped", + "bubble": False, + "hint": None, + "error": gen_result.error or "generator failed", + } + ) + state.add_step( + "generator", + f"entity-gen-fail: {class_uri} 
attempt {attempt_num}: " + f"{gen_result.error}", + duration_ms=gen_ms, + ) + retry_hint = gen_result.error or retry_hint + continue + + mapping = gen_result.mapping + last_mapping = mapping + + state.notify(f"Evaluating {class_label}…", pct=state.pct()) + t_e = time.time() + stage1_report = evaluate_entity_mapping( + mapping=mapping, + ontology_class=ontology_class, + execute_sql_fn=state.execute_sql_fn, + ) + eval_ms = int((time.time() - t_e) * 1000) + last_report = stage1_report + state.add_step( + "evaluator", + f"entity-stage1: {class_uri} status={stage1_report.status} " + f"bubble={stage1_report.bubble_to_planner}", + duration_ms=eval_ms, + ) + + if stage1_report.status == "FAIL": + hint = _first_hint(stage1_report) + bubble = bool(stage1_report.bubble_to_planner) + attempts_log.append( + { + "attempt": attempt_num, + "generator_ms": gen_ms, + "stage1_status": "FAIL", + "critic_status": "skipped", + "bubble": bubble, + "hint": hint, + } + ) + if bubble: + bubble_requested = True + break + retry_hint = hint or retry_hint + continue + + # Stage 1 PASS — optionally run the critic. 
+ if state.skip_semantic_critic: + attempts_log.append( + { + "attempt": attempt_num, + "generator_ms": gen_ms, + "stage1_status": "PASS", + "critic_status": "skipped", + "bubble": False, + "hint": None, + } + ) + return "PASS", attempts_log, mapping, stage1_report + + state.notify(f"Critiquing {class_label}…", pct=state.pct()) + t_c = time.time() + try: + critic_result = run_critic( + host=state.host, + token=state.token, + endpoint_name=state.endpoint_name, + client=state.client, + item_kind="entity", + item_uri=class_uri, + item_definition=ontology_class, + submitted_mapping=mapping, + source_model_slice=slice_dict, + stage1_metrics=dict(stage1_report.metrics), + ) + except Exception as exc: # noqa: BLE001 + logger.error( + "Critic raised on %s attempt %d: %s", + class_uri, + attempt_num, + exc, + exc_info=True, + ) + attempts_log.append( + { + "attempt": attempt_num, + "generator_ms": gen_ms, + "stage1_status": "PASS", + "critic_status": "skipped", + "bubble": False, + "hint": None, + "error": f"critic exception: {exc}", + } + ) + return "PASS", attempts_log, mapping, stage1_report + critic_ms = int((time.time() - t_c) * 1000) + state.add_iterations(critic_result.iterations) + state.accumulate_usage(critic_result.usage) + + critic_report = critic_result.report + state.add_step( + "critic", + f"entity-critic: {class_uri} status=" + f"{critic_report.status if critic_report else '?'} " + f"bubble=" + f"{critic_report.bubble_to_planner if critic_report else '?'}", + duration_ms=critic_ms, + ) + + if not critic_result.success or critic_report is None: + attempts_log.append( + { + "attempt": attempt_num, + "generator_ms": gen_ms, + "stage1_status": "PASS", + "critic_status": "skipped", + "bubble": False, + "hint": None, + "error": critic_result.error or "critic failed", + } + ) + return "PASS", attempts_log, mapping, stage1_report + + if critic_report.status == "PASS": + attempts_log.append( + { + "attempt": attempt_num, + "generator_ms": gen_ms, + "stage1_status": 
"PASS", + "critic_status": "PASS", + "bubble": False, + "hint": None, + } + ) + return "PASS", attempts_log, mapping, critic_report + + hint = _first_hint(critic_report) + bubble = bool(critic_report.bubble_to_planner) + attempts_log.append( + { + "attempt": attempt_num, + "generator_ms": gen_ms, + "stage1_status": "PASS", + "critic_status": "FAIL", + "bubble": bubble, + "hint": hint, + } + ) + last_report = critic_report + if bubble: + bubble_requested = True + break + retry_hint = hint or retry_hint + continue + + if bubble_requested: + if state.replan_once(): + continue # restart the item with the new plan + return "FAIL_BUBBLE", attempts_log, last_mapping, last_report + return "FAIL_BUDGET", attempts_log, last_mapping, last_report + + +def _run_relationship_item( + state: _RunState, + ontology_property: dict, + source_em: dict, + target_em: dict, +) -> Tuple[str, List[dict], Optional[dict], Optional[EvalReport]]: + """Run the G->E loop for one relationship. + + Returns ``(final_status, attempts_log, last_mapping, last_report)``. 
+ """ + property_uri = ontology_property.get("uri", "") + property_label = ontology_property.get("label") or ontology_property.get( + "name", property_uri + ) + attempts_log: List[dict] = [] + last_mapping: Optional[dict] = None + last_report: Optional[EvalReport] = None + + while True: + retry_hint: Optional[str] = None + bubble_requested = False + for attempt_idx in range(_PER_ITEM_GENERATOR_ATTEMPTS): + attempt_num = attempt_idx + 1 + slice_dict = _slice_for_relationship( + state.source_model, + property_uri, + source_em, + target_em, + ) + state.notify( + f"Mapping {property_label} (attempt {attempt_num}/" + f"{_PER_ITEM_GENERATOR_ATTEMPTS})…", + pct=state.pct(), + ) + state.add_step( + "generator", + f"rel-gen-start: {property_uri} attempt {attempt_num}", + ) + t_g = time.time() + try: + gen_result = run_relationship_generator( + host=state.host, + token=state.token, + endpoint_name=state.endpoint_name, + client=state.client, + ontology_property=ontology_property, + source_entity_mapping=source_em, + target_entity_mapping=target_em, + source_model_slice=slice_dict, + retry_hint=retry_hint, + on_step=None, + **( + {"max_iterations": state.max_iterations} + if state.max_iterations is not None + else {} + ), + ) + except Exception as exc: # noqa: BLE001 + logger.error( + "RelationshipGenerator raised on %s attempt %d: %s", + property_uri, + attempt_num, + exc, + exc_info=True, + ) + attempts_log.append( + { + "attempt": attempt_num, + "generator_ms": int((time.time() - t_g) * 1000), + "stage1_status": "skipped", + "critic_status": "skipped", + "bubble": False, + "hint": None, + "error": f"generator exception: {exc}", + } + ) + continue + gen_ms = int((time.time() - t_g) * 1000) + state.add_iterations(gen_result.iterations) + state.accumulate_usage(gen_result.usage) + + if not gen_result.success or gen_result.mapping is None: + attempts_log.append( + { + "attempt": attempt_num, + "generator_ms": gen_ms, + "stage1_status": "skipped", + "critic_status": "skipped", + 
"bubble": False, + "hint": None, + "error": gen_result.error or "generator failed", + } + ) + state.add_step( + "generator", + f"rel-gen-fail: {property_uri} attempt {attempt_num}: " + f"{gen_result.error}", + duration_ms=gen_ms, + ) + retry_hint = gen_result.error or retry_hint + continue + + mapping = gen_result.mapping + last_mapping = mapping + + state.notify(f"Evaluating {property_label}…", pct=state.pct()) + t_e = time.time() + stage1_report = evaluate_relationship_mapping( + mapping=mapping, + source_entity_mapping=source_em, + target_entity_mapping=target_em, + execute_sql_fn=state.execute_sql_fn, + id_universe_cache=state.id_universe_cache, + ) + eval_ms = int((time.time() - t_e) * 1000) + last_report = stage1_report + state.add_step( + "evaluator", + f"rel-stage1: {property_uri} status={stage1_report.status} " + f"bubble={stage1_report.bubble_to_planner}", + duration_ms=eval_ms, + ) + + if stage1_report.status == "FAIL": + hint = _first_hint(stage1_report) + bubble = bool(stage1_report.bubble_to_planner) + attempts_log.append( + { + "attempt": attempt_num, + "generator_ms": gen_ms, + "stage1_status": "FAIL", + "critic_status": "skipped", + "bubble": bubble, + "hint": hint, + } + ) + if bubble: + bubble_requested = True + break + retry_hint = hint or retry_hint + continue + + if state.skip_semantic_critic: + attempts_log.append( + { + "attempt": attempt_num, + "generator_ms": gen_ms, + "stage1_status": "PASS", + "critic_status": "skipped", + "bubble": False, + "hint": None, + } + ) + return "PASS", attempts_log, mapping, stage1_report + + state.notify(f"Critiquing {property_label}…", pct=state.pct()) + t_c = time.time() + try: + critic_result = run_critic( + host=state.host, + token=state.token, + endpoint_name=state.endpoint_name, + client=state.client, + item_kind="relationship", + item_uri=property_uri, + item_definition=ontology_property, + submitted_mapping=mapping, + source_model_slice=slice_dict, + stage1_metrics=dict(stage1_report.metrics), + ) + 
except Exception as exc: # noqa: BLE001 + logger.error( + "Critic raised on %s attempt %d: %s", + property_uri, + attempt_num, + exc, + exc_info=True, + ) + attempts_log.append( + { + "attempt": attempt_num, + "generator_ms": gen_ms, + "stage1_status": "PASS", + "critic_status": "skipped", + "bubble": False, + "hint": None, + "error": f"critic exception: {exc}", + } + ) + return "PASS", attempts_log, mapping, stage1_report + critic_ms = int((time.time() - t_c) * 1000) + state.add_iterations(critic_result.iterations) + state.accumulate_usage(critic_result.usage) + + critic_report = critic_result.report + state.add_step( + "critic", + f"rel-critic: {property_uri} status=" + f"{critic_report.status if critic_report else '?'} " + f"bubble=" + f"{critic_report.bubble_to_planner if critic_report else '?'}", + duration_ms=critic_ms, + ) + + if not critic_result.success or critic_report is None: + attempts_log.append( + { + "attempt": attempt_num, + "generator_ms": gen_ms, + "stage1_status": "PASS", + "critic_status": "skipped", + "bubble": False, + "hint": None, + "error": critic_result.error or "critic failed", + } + ) + return "PASS", attempts_log, mapping, stage1_report + + if critic_report.status == "PASS": + attempts_log.append( + { + "attempt": attempt_num, + "generator_ms": gen_ms, + "stage1_status": "PASS", + "critic_status": "PASS", + "bubble": False, + "hint": None, + } + ) + return "PASS", attempts_log, mapping, critic_report + + hint = _first_hint(critic_report) + bubble = bool(critic_report.bubble_to_planner) + attempts_log.append( + { + "attempt": attempt_num, + "generator_ms": gen_ms, + "stage1_status": "PASS", + "critic_status": "FAIL", + "bubble": bubble, + "hint": hint, + } + ) + last_report = critic_report + if bubble: + bubble_requested = True + break + retry_hint = hint or retry_hint + continue + + if bubble_requested: + if state.replan_once(): + continue + return "FAIL_BUBBLE", attempts_log, last_mapping, last_report + return "FAIL_BUDGET", 
attempts_log, last_mapping, last_report diff --git a/src/agents/agent_mapping_pge/evaluator/__init__.py b/src/agents/agent_mapping_pge/evaluator/__init__.py new file mode 100644 index 00000000..41e7ef5e --- /dev/null +++ b/src/agents/agent_mapping_pge/evaluator/__init__.py @@ -0,0 +1,18 @@ +"""Evaluator stage of the mapping PGE pipeline. + +Stage 1 (this module) is the *deterministic* evaluator — pure-Python checks +backed by SQL counts. Stage 2 (added in a later sprint) is the semantic +evaluator that uses an LLM to judge naming/semantic fidelity. + +The deterministic checks live in :mod:`agents.agent_mapping_pge.evaluator.deterministic`. +""" + +from agents.agent_mapping_pge.evaluator.deterministic import ( + evaluate_entity_mapping, + evaluate_relationship_mapping, +) + +__all__ = [ + "evaluate_entity_mapping", + "evaluate_relationship_mapping", +] diff --git a/src/agents/agent_mapping_pge/evaluator/critic.py b/src/agents/agent_mapping_pge/evaluator/critic.py new file mode 100644 index 00000000..a41fe1b0 --- /dev/null +++ b/src/agents/agent_mapping_pge/evaluator/critic.py @@ -0,0 +1,747 @@ +""" +OntoBricks Mapping-PGE Semantic Critic Agent. + +Sprint 6 of the Planner-Generator-Evaluator (PGE) redesign — stage 2 of the +Evaluator. Runs ONLY after the deterministic (stage-1) evaluator has passed. + +The Critic audits ONE submitted mapping for SEMANTIC correctness — issues that +pure structural checks cannot catch: + +* the WRONG TABLE was picked (e.g. ``antenatal_visits`` chosen to realise + the ``Delivery`` class), or +* the wrong COLUMN within the right table (e.g. ``appointment_date`` used + for ``deliveryDate``). + +The Critic's "bubble" signal is sharp: if the wrong TABLE was chosen, the +verdict bubbles to the Planner (which must revise the source model); if just +a wrong column inside the right table, the verdict stays with the Generator +which can retry against the same table. 
+ +The loop shape mirrors :mod:`agents.agent_mapping_pge.generators.entity` — +same ``call_serving_endpoint`` + ``dispatch_tool`` ReAct cycle, same 3-second +inter-iteration delay, same MLflow trace decorator. Differences: + +* Smaller default budget (6) — auditing is bounded work; if the Critic can't + conclude in 6 iterations, it defers (PASS with a reasoning note) rather + than falsely escalates. +* Different tool set: only ``sample_table``, ``execute_sql``, + ``get_documents_context``, and the terminal ``submit_evaluation``. +* No single-shot fallback — the Critic produces structured output through + ``submit_evaluation`` only. +""" + +import json +import time +from dataclasses import dataclass, field +from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional + +import requests + +if TYPE_CHECKING: + from agents.agent_mapping_pge.contracts import EvalReport + +from back.core.logging import get_logger +from agents.engine_base import ( + accumulate_usage, + call_serving_endpoint, + dispatch_tool, +) +from agents.tools.context import ToolContext +from agents.tools.documents import ( + GET_DOCUMENTS_CONTEXT_DEF, + tool_get_documents_context, +) +from agents.tools.evaluation import ( + EVALUATION_TOOL_DEFINITIONS, + EVALUATION_TOOL_HANDLERS, +) +from agents.tools.planner import ( + SAMPLE_TABLE_DEF, + tool_sample_table, +) +from agents.tools.sql import ( + SQL_TOOL_DEFINITIONS, + SQL_TOOL_HANDLERS, +) +from agents.tracing import trace_agent + +logger = get_logger(__name__) + +MAX_ITERATIONS = 6 +LLM_TIMEOUT = 180 +_ITERATION_DELAY_SEC = 3 +# See planner._MAX_TOKENS comment — same rationale for submit_evaluation. +_MAX_TOKENS = 50000 + +_TRACE_NAME = "mapping_pge_critic" + + +# ===================================================== +# Tool aggregation +# ===================================================== +# +# The Critic only needs: +# * sample_table – peek at actual values to verify the column +# picked really represents the ontology concept. 
+# * execute_sql – targeted probes for "is this column really +# what it claims" sanity checks. +# * get_documents_context – consult any imported domain glossary. +# * submit_evaluation – TERMINAL. +# +# We deliberately exclude: +# * get_metadata / get_ontology — the audit target is supplied in the user +# prompt; broad re-fetches just inflate context. +# * column_value_overlap / distinct_count — those are structural, already +# covered by the deterministic stage. +# * submit_source_model / submit_entity_mapping / submit_relationship_mapping +# — wrong stage. + +TOOL_DEFINITIONS: List[dict] = ( + [SAMPLE_TABLE_DEF, GET_DOCUMENTS_CONTEXT_DEF] + + SQL_TOOL_DEFINITIONS + + EVALUATION_TOOL_DEFINITIONS +) + +TOOL_HANDLERS: Dict[str, Callable] = { + "sample_table": tool_sample_table, + "get_documents_context": tool_get_documents_context, + **SQL_TOOL_HANDLERS, + **EVALUATION_TOOL_HANDLERS, +} + + +# ===================================================== +# Data classes +# ===================================================== + + +@dataclass +class CriticStep: + """One observable step of the Critic's execution. + + Mirrors :class:`agents.agent_mapping_pge.generators.entity.EntityGenStep` + so the orchestrator (Sprint 7) can render a per-audit timeline in the UI. + """ + + step_type: str # "tool_call" | "tool_result" | "output" + content: str + tool_name: str = "" + duration_ms: int = 0 + + +@dataclass +class CriticResult: + """Outcome of a single Critic invocation. + + ``report`` is populated when the agent terminated by submitting a verdict + via ``submit_evaluation``. ``success`` here is the agent-level success + flag — it indicates a *clean termination*, NOT a PASS verdict. A FAIL + verdict with ``bubble_to_planner=True`` still has ``success=True``. + ``success=False`` is reserved for budget exhaustion, text-only output, + and LLM/transport errors. 
+ """ + + success: bool + report: Optional["EvalReport"] = None + steps: List[CriticStep] = field(default_factory=list) + iterations: int = 0 + error: str = "" + usage: Dict[str, int] = field(default_factory=dict) + + +# ===================================================== +# System prompt +# ===================================================== +# +# Kept under 3KB. Frames the Critic as a senior data engineer auditing ONE +# submitted mapping for SEMANTIC correctness — the structural checks have +# already passed. The decision rubric (PASS / FAIL+no-bubble / FAIL+bubble) +# is load-bearing: it determines whether the orchestrator retries the +# Generator or re-invokes the Planner. + +SYSTEM_PROMPT = """\ +You are a senior data engineer auditing ONE submitted mapping for SEMANTIC \ +correctness. The structural checks (row counts, distinct IDs, dangling FKs) \ +have ALREADY PASSED — your job is to catch wrong-concept errors that pure \ +structural checks cannot see. + +WHAT YOU AUDIT +• Did the mapping pick the RIGHT TABLE for the ontology class/property? +• Do sampled values in the chosen column(s) actually represent what the \ +ontology attribute means? (e.g. "delivery_date" should be a delivery date, \ +not a booking date.) +• Does the column's semantics match the ontology comment / label? + +TOOLS +You have these tools: + • sample_table – Up to N random rows from a table. Use to peek at \ +actual values and check they match the concept. + • execute_sql – Targeted SQL for "is this column really what it \ +claims" probes (e.g. value ranges, distinct categories, null patterns). + • get_documents_context – Imported domain glossaries / data dictionaries. \ +Check against these when the column's role is non-obvious. + • submit_evaluation – TERMINAL. Call EXACTLY ONCE when you have a \ +confident verdict. + +DECISION RUBRIC +• PASS — sampled values, column semantics, and domain context all support \ +the mapping. status="PASS", failures=[], bubble_to_planner=false. 
+• FAIL with bubble_to_planner=false — the WRONG COLUMN was picked within \ +the RIGHT TABLE. The Generator can fix this on retry. Populate failures[] \ +with the specific column-level issue and a concrete hint. +• FAIL with bubble_to_planner=true — the WRONG TABLE was chosen entirely. \ +The Planner must revise the source model. Populate failures[] and set the \ +bubble flag. + +HINT DISCIPLINE +• Hints must be CONCRETE, ACTIONABLE, single-sentence corrections. +• Good column-level hint: "Sampled rows show `appointment_date` is the \ +booking date, not delivery date. Use `delivery_dttm` instead." +• Good table-level hint: "This mapping uses `antenatal_visits`, but the \ +chosen class is Delivery. Switch to the `labour_delivery` table." +• Bad hint (vague): "consider using a different column" +• Bad hint (chatty): "I think there might be an issue here, you should look \ +into it more carefully" + +HARD RULES +• You are bounded by max_iterations=6. Keep your audit FOCUSED — pick the \ +one or two probes that would change your verdict, not exhaustive ones. +• Call submit_evaluation EXACTLY ONCE. +• If you cannot determine a verdict within 6 iterations, submit PASS with a \ +reasoning note explaining the uncertainty. Do NOT bubble — better to defer \ +than to falsely escalate. +• Do not call get_metadata, get_ontology, column_value_overlap, \ +distinct_count, submit_source_model, submit_entity_mapping, or \ +submit_relationship_mapping — they are not available to you. The audit \ +target and structural metrics are already in the user message. 
+""" + + +# ===================================================== +# Internal helpers +# ===================================================== + + +def _format_entity_definition(item_definition: dict) -> List[str]: + """Lines for an entity (ontology class) audit target.""" + parts: List[str] = [] + label = item_definition.get("label") or item_definition.get("name", "") + comment = item_definition.get("comment", "") or "" + attributes = item_definition.get("attributes", []) or [] + + parts.append(f" label: {label}") + if comment: + parts.append(f" comment: {comment}") + if attributes: + parts.append(f" attributes ({len(attributes)}):") + for attr in attributes: + if isinstance(attr, dict): + a_name = attr.get("name") or attr.get("label") or attr.get("uri", "?") + a_type = attr.get("type") or attr.get("range") or "" + parts.append(f" - {a_name}" + (f" ({a_type})" if a_type else "")) + else: + parts.append(f" - {attr}") + return parts + + +def _format_relationship_definition(item_definition: dict) -> List[str]: + """Lines for a relationship (ontology property) audit target. + + Always emits explicit ``domain`` and ``range`` lines — these are what + differentiate a relationship audit from an entity audit, and the tests + pin them. 
+ """ + parts: List[str] = [] + label = item_definition.get("label") or item_definition.get("name", "") + comment = item_definition.get("comment", "") or "" + domain = item_definition.get("domain", "") or "" + range_class = item_definition.get("range", "") or "" + + parts.append(f" label: {label}") + if comment: + parts.append(f" comment: {comment}") + parts.append(f" domain: {domain}") + parts.append(f" range: {range_class}") + return parts + + +def _format_submitted_entity_mapping(submitted_mapping: dict) -> List[str]: + """Lines summarising an entity mapping under audit.""" + parts: List[str] = ["SUBMITTED MAPPING (entity)"] + parts.append(f" sql_query: {submitted_mapping.get('sql_query', '')}") + parts.append(f" id_column: {submitted_mapping.get('id_column', '')}") + parts.append(f" label_column: {submitted_mapping.get('label_column', '')}") + attr_map = submitted_mapping.get("attribute_mappings", {}) or {} + if attr_map: + parts.append(" attribute_mappings:") + for k, v in attr_map.items(): + parts.append(f" {k} -> {v}") + unmapped = submitted_mapping.get("unmapped_attributes", []) or [] + if unmapped: + parts.append(" unmapped_attributes:") + for u in unmapped: + if isinstance(u, dict): + parts.append( + f" - {u.get('name', '?')}: {u.get('reason', '')}" + ) + else: + parts.append(f" - {u}") + return parts + + +def _format_submitted_relationship_mapping(submitted_mapping: dict) -> List[str]: + """Lines summarising a relationship mapping under audit.""" + parts: List[str] = ["SUBMITTED MAPPING (relationship)"] + parts.append(f" sql_query: {submitted_mapping.get('sql_query', '')}") + parts.append( + f" source_id_column: {submitted_mapping.get('source_id_column', '')}" + ) + parts.append( + f" target_id_column: {submitted_mapping.get('target_id_column', '')}" + ) + parts.append( + f" source_class: {submitted_mapping.get('source_class', '') or submitted_mapping.get('domain', '')}" + ) + parts.append( + f" target_class: {submitted_mapping.get('target_class', '') or 
submitted_mapping.get('range_class', '')}" + ) + return parts + + +def _build_user_prompt( + item_kind: str, + item_uri: str, + item_definition: dict, + submitted_mapping: dict, + source_model_slice: dict, + stage1_metrics: dict, +) -> str: + """Render the audit user prompt. + + Structure: + 1. AUDIT TARGET — item_kind, URI, ontology metadata (label/comment, + attributes for entities; domain/range for relationships). + 2. SUBMITTED MAPPING — the actual mapping under audit. + 3. PLANNER'S PREDICTION — the slice the Planner curated for this item. + 4. STRUCTURAL CHECK METRICS (PASSED) — context from stage 1. + 5. YOUR TASK — short reminder of the rubric. + """ + parts: List[str] = [] + + parts.append("AUDIT TARGET") + parts.append(f" kind: {item_kind}") + parts.append(f" uri: {item_uri}") + if item_kind == "relationship": + parts.extend(_format_relationship_definition(item_definition or {})) + else: + parts.extend(_format_entity_definition(item_definition or {})) + + parts.append("") + if item_kind == "relationship": + parts.extend(_format_submitted_relationship_mapping(submitted_mapping or {})) + else: + parts.extend(_format_submitted_entity_mapping(submitted_mapping or {})) + + parts.append("") + parts.append("PLANNER'S PREDICTION") + parts.append(json.dumps(source_model_slice or {}, indent=2, default=str)) + + parts.append("") + parts.append("STRUCTURAL CHECK METRICS (PASSED)") + parts.append(json.dumps(stage1_metrics or {}, indent=2, default=str)) + + parts.append("") + parts.append("YOUR TASK") + parts.append( + "Audit the SEMANTIC correctness of the submitted mapping. Use " + "sample_table / execute_sql / get_documents_context as needed, then " + "call submit_evaluation EXACTLY ONCE with your verdict. Follow the " + "PASS / FAIL(no bubble) / FAIL(bubble) rubric in the system prompt." 
+ ) + + prompt = "\n".join(parts) + logger.debug( + "_build_user_prompt for %s=%s (%d chars):\n%s", + item_kind, + item_uri, + len(prompt), + prompt, + ) + return prompt + + +# ===================================================== +# Public entry point +# ===================================================== + + +@trace_agent(name="mapping_pge_critic") +def run_critic( + host: str, + token: str, + endpoint_name: str, + client: Any, + *, + item_kind: str, + item_uri: str, + item_definition: dict, + submitted_mapping: dict, + source_model_slice: dict, + stage1_metrics: dict, + documents: Optional[list] = None, + on_step: Optional[Callable[[str, int], None]] = None, + max_iterations: int = MAX_ITERATIONS, +) -> CriticResult: + """Run the Semantic Critic agent for one submitted mapping. + + The Critic autonomously audits ``submitted_mapping`` for semantic + correctness using ``sample_table`` / ``execute_sql`` / + ``get_documents_context``, then submits a verdict via the terminal + ``submit_evaluation`` tool. The resulting :class:`EvalReport` (stage + ``"semantic"``) is stored on ``ctx.semantic_eval_report`` and returned in + ``CriticResult.report``. + + Args: + host: Databricks workspace URL. + token: Bearer token for the serving endpoint. + endpoint_name: Foundation Model serving endpoint name. + client: Databricks SQL client (must expose ``execute_query(sql)``). + item_kind: ``"entity"`` or ``"relationship"``. + item_uri: The ontology class or property URI under audit. + item_definition: Full ontology dict for the item (label/comment, + plus attributes for entities or domain/range for relationships). + submitted_mapping: The mapping under audit (handler dict shape). + source_model_slice: The Planner's slice for this item. + stage1_metrics: Metrics from the deterministic evaluator, for + context. + documents: Optional pre-loaded domain documents — surfaced via + ``get_documents_context``. + on_step: Optional progress callback ``(msg, pct)`` for UI updates. 
+ max_iterations: Upper bound on tool-call iterations (default 6 — + smaller than the Generators because auditing is bounded work). + + Returns: + A :class:`CriticResult`. ``success`` is True iff the Critic + terminated by submitting a verdict; in that case ``report`` holds + the resulting :class:`EvalReport`. On failure (budget exhaustion, + text-only output, transport error), ``error`` explains why. + """ + iteration_limit = max_iterations if max_iterations is not None else MAX_ITERATIONS + + logger.info( + "===== CRITIC START ===== endpoint=%s, kind=%s, uri=%s, max_iter=%d", + endpoint_name, + item_kind, + item_uri, + iteration_limit, + ) + + ctx = ToolContext( + host=host.rstrip("/"), + token=token, + client=client, + # The audit target is in the user prompt; metadata/ontology are not + # needed by the Critic's tools. + metadata={}, + ontology={}, + documents=list(documents or []), + ) + + result = CriticResult(success=False) + + user_prompt = _build_user_prompt( + item_kind=item_kind, + item_uri=item_uri, + item_definition=item_definition or {}, + submitted_mapping=submitted_mapping or {}, + source_model_slice=source_model_slice or {}, + stage1_metrics=stage1_metrics or {}, + ) + messages: List[dict] = [ + {"role": "system", "content": SYSTEM_PROMPT}, + {"role": "user", "content": user_prompt}, + ] + logger.info( + "Critic conversation initialized: system=%d chars, user=%d chars", + len(SYSTEM_PROMPT), + len(user_prompt), + ) + + total_usage: Dict[str, int] = {"prompt_tokens": 0, "completion_tokens": 0} + + def _progress_pct(iteration_idx: int) -> int: + ratio = (iteration_idx + 1) / max(iteration_limit, 1) + return min(5 + int(ratio * 90), 95) + + def notify(msg: str, *, pct: Optional[int] = None) -> None: + actual_pct = pct if pct is not None else 5 + logger.info("CRITIC STEP [%d%%] %s", actual_pct, msg) + if on_step: + on_step(msg, actual_pct) + + notify(f"Auditing {item_kind} {item_uri}…", pct=1) + + # 
------------------------------------------------------------------ + # Agent loop + # ------------------------------------------------------------------ + for iteration in range(iteration_limit): + if iteration > 0: + logger.debug( + "Iteration %d: waiting %ds before LLM call (rate limit mitigation)", + iteration + 1, + _ITERATION_DELAY_SEC, + ) + time.sleep(_ITERATION_DELAY_SEC) + + current_iteration = iteration + 1 + pct = _progress_pct(iteration) + logger.info( + "----- Critic iteration %d/%d — %d messages, report=%s -----", + current_iteration, + iteration_limit, + len(messages), + "set" if ctx.semantic_eval_report is not None else "unset", + ) + notify( + f"Critic iteration {current_iteration}/{iteration_limit}…", + pct=pct, + ) + + t0 = time.time() + try: + llm_response = call_serving_endpoint( + host, + token, + endpoint_name, + messages, + tools=TOOL_DEFINITIONS, + max_tokens=_MAX_TOKENS, + temperature=0.1, + timeout=LLM_TIMEOUT, + trace_name=_TRACE_NAME, + ) + except requests.exceptions.HTTPError as exc: + status = exc.response.status_code if exc.response is not None else "?" 
+ logger.warning( + "Critic iteration %d: HTTPError status=%s", + current_iteration, + status, + ) + logger.debug( + "Critic iteration %d: HTTPError body: %.500s", + current_iteration, + exc.response.text if exc.response is not None else "N/A", + ) + if exc.response is not None and status in (400, 422): + result.error = "LLM endpoint does not support function calling" + result.iterations = current_iteration + result.usage = total_usage + logger.error( + "Critic: endpoint refused tools — cannot produce an evaluation" + ) + return result + result.error = f"LLM request failed: {exc}" + result.iterations = current_iteration + result.usage = total_usage + logger.error( + "Critic: LLM request failed at iteration %d: %s", + current_iteration, + exc, + ) + return result + except requests.exceptions.ReadTimeout: + result.error = f"LLM request timed out after {LLM_TIMEOUT}s" + result.iterations = current_iteration + result.usage = total_usage + logger.error("Critic: timeout at iteration %d", current_iteration) + return result + except requests.exceptions.RequestException as exc: + result.error = f"LLM request failed: {exc}" + result.iterations = current_iteration + result.usage = total_usage + logger.error( + "Critic: request exception at iteration %d: %s", + current_iteration, + exc, + ) + return result + + elapsed_ms = int((time.time() - t0) * 1000) + logger.info( + "Critic iteration %d: LLM responded in %dms", + current_iteration, + elapsed_ms, + ) + + accumulate_usage(total_usage, llm_response.get("usage", {})) + + choice = llm_response.get("choices", [{}])[0] + finish_reason = choice.get("finish_reason", "?") + message = choice.get("message", {}) + tool_calls = message.get("tool_calls", []) + has_content = bool(message.get("content")) + logger.info( + "Critic iteration %d: finish_reason=%s, tool_calls=%d, has_content=%s", + current_iteration, + finish_reason, + len(tool_calls), + has_content, + ) + + if not tool_calls: + # The Critic must terminate via 
submit_evaluation, never via + # free text. Text-only output is a failure. + content = (message.get("content") or "")[:500] + logger.warning( + "Critic iteration %d: produced text without submitting evaluation — %d chars", + current_iteration, + len(message.get("content") or ""), + ) + result.steps.append( + CriticStep( + step_type="output", + content=content, + duration_ms=elapsed_ms, + ) + ) + result.error = "critic produced text without submitting evaluation" + result.iterations = current_iteration + result.usage = total_usage + notify( + "Critic produced text without submitting evaluation.", + pct=pct, + ) + return result + + logger.info( + "Critic iteration %d: processing %d tool call(s): [%s]", + current_iteration, + len(tool_calls), + ", ".join( + tc.get("function", {}).get("name", "?") for tc in tool_calls + ), + ) + messages.append(message) + + terminal_success = False + for tc_idx, tc in enumerate(tool_calls, 1): + func = tc.get("function", {}) + tool_name = func.get("name", "") + raw_args = func.get("arguments", "{}") + tool_id = tc.get("id", "") + + try: + arguments = json.loads(raw_args) + except json.JSONDecodeError: + arguments = {} + + logger.info( + "Critic iteration %d: calling tool '%s' (%d/%d)", + current_iteration, + tool_name, + tc_idx, + len(tool_calls), + ) + + if tool_name == "submit_evaluation": + notify( + f"Submitting evaluation for {item_uri}…", pct=pct + ) + elif tool_name == "sample_table": + fn = arguments.get("full_name", "?") + notify(f"Sampling {fn}…", pct=pct) + elif tool_name == "execute_sql": + sql_preview = arguments.get("sql", "")[:80] + notify(f"Running SQL: {sql_preview}…", pct=pct) + elif tool_name == "get_documents_context": + notify("Retrieving documents…", pct=pct) + else: + notify(f"Calling {tool_name}…", pct=pct) + + result.steps.append( + CriticStep( + step_type="tool_call", + content=json.dumps(arguments, default=str)[:500], + tool_name=tool_name, + ) + ) + + t1 = time.time() + tool_result = dispatch_tool( + 
TOOL_HANDLERS, ctx, tool_name, arguments, trace_name=_TRACE_NAME + ) + tool_ms = int((time.time() - t1) * 1000) + + logger.info( + "Critic iteration %d: tool '%s' returned %d chars in %dms", + current_iteration, + tool_name, + len(tool_result), + tool_ms, + ) + + result.steps.append( + CriticStep( + step_type="tool_result", + content=( + (tool_result[:500] + "…") + if len(tool_result) > 500 + else tool_result + ), + tool_name=tool_name, + duration_ms=tool_ms, + ) + ) + + messages.append( + { + "role": "tool", + "tool_call_id": tool_id, + "content": tool_result, + } + ) + + # Detect terminal success: submit_evaluation returned success=True + # AND stamped an EvalReport onto the context. An invalid status + # (the handler returns success=False) does NOT terminate the + # loop — the agent continues so it can resubmit a valid verdict. + if tool_name == "submit_evaluation": + try: + parsed = json.loads(tool_result) + except json.JSONDecodeError: + parsed = {} + if ( + parsed.get("success") is True + and ctx.semantic_eval_report is not None + ): + terminal_success = True + logger.info( + "Critic iteration %d: submit_evaluation succeeded — terminating", + current_iteration, + ) + + if terminal_success: + result.success = True + result.report = ctx.semantic_eval_report + result.iterations = current_iteration + result.usage = total_usage + logger.info( + "===== CRITIC COMPLETE ===== uri=%s, status=%s, bubble=%s, " + "iterations=%d, prompt_tokens=%d, completion_tokens=%d", + item_uri, + result.report.status if result.report else "?", + result.report.bubble_to_planner if result.report else "?", + result.iterations, + total_usage["prompt_tokens"], + total_usage["completion_tokens"], + ) + notify(f"Critic verdict submitted for {item_uri}.", pct=100) + return result + + # Budget exhausted without a successful submit. 
+ result.iterations = iteration_limit + result.usage = total_usage + result.error = "critic exhausted iteration budget" + logger.error("===== CRITIC FAILED ===== %s", result.error) + notify(result.error, pct=95) + return result diff --git a/src/agents/agent_mapping_pge/evaluator/deterministic.py b/src/agents/agent_mapping_pge/evaluator/deterministic.py new file mode 100644 index 00000000..b8e72951 --- /dev/null +++ b/src/agents/agent_mapping_pge/evaluator/deterministic.py @@ -0,0 +1,539 @@ +"""Deterministic (stage-1) evaluator for submitted mappings. + +This module is pure-Python and has no LLM dependency. It runs the +submitted mapping's SQL through a caller-supplied ``execute_sql_fn`` and +checks structural invariants (row count, distinct id count, dangling +foreign-key fractions, etc.). + +``execute_sql_fn`` contract:: + + def execute_sql_fn(sql: str) -> dict +returning ``{"columns": [...], "rows": [{col: value, ...}, ...]}``. + +Important: this is the *full* result set, not the 3-row sample emitted by +:func:`agents.tools.sql.tool_execute_sql`. The orchestrator (Sprint 7) is +responsible for plugging in a runner that returns full rows — typically a +thin wrapper around ``DatabricksClient.execute_query``. + +All checks compute every metric even when some fail; the resulting +:class:`~agents.agent_mapping_pge.contracts.EvalReport` lists every failure +so the Generator/Planner can address them in one shot. +""" + +from typing import Any, Callable, Dict, List, Optional, Tuple + +from back.core.logging import get_logger +from agents.agent_mapping_pge.contracts import EvalFailure, EvalReport +from agents.agent_mapping_pge.evaluator.report import build_report + +logger = get_logger(__name__) + +# Thresholds for stage-1 checks. These are intentionally lax — the +# semantic evaluator (stage 2) catches subtler issues. 
# Dangling-FK thresholds used by the relationship evaluator: a dangling
# fraction at or above the FAIL threshold adds a failure row; a fraction
# strictly above the BUBBLE threshold can escalate the report to the Planner.
_DANGLING_FK_FAIL_THRESHOLD = 0.05
_DANGLING_FK_BUBBLE_THRESHOLD = 0.5


# Signature of the caller-injected SQL runner: takes the SQL text and returns
# a dict shaped like ``{"columns": [...], "rows": [...]}``.
SqlFn = Callable[[str], dict]


# =====================================================
# Helpers
# =====================================================


def _resolve_id_col(mapping: dict, fallback: str = "ID") -> str:
    """Return the column name that holds the entity identifier in the row dicts.

    Args:
        mapping: Submitted mapping dict; its ``id_column`` entry is used when
            present and truthy.
        fallback: Column name returned when ``id_column`` is missing or empty.

    Returns:
        The id column name to look up in each result row.
    """
    return mapping.get("id_column") or fallback


def _extract_id_values(rows: List[dict], id_col: str) -> List[Any]:
    """Pull the id_col value from each row; missing key -> ``None``."""
    return [r.get(id_col) for r in rows]


def _attribute_names(ontology_class: dict) -> List[str]:
    """Ontology attributes can come in a few shapes; normalise to a list of names.

    String items are taken as-is. Dict items contribute the first truthy value
    among ``name``, ``uri`` and ``label``. Anything else (and dicts with none
    of those keys) is silently ignored.
    """
    attrs = ontology_class.get("attributes") or []
    out: List[str] = []
    for a in attrs:
        if isinstance(a, str):
            out.append(a)
        elif isinstance(a, dict):
            name = a.get("name") or a.get("uri") or a.get("label")
            if name:
                out.append(name)
    return out


def _fail(
    *,
    check: str,
    expected: str,
    observed: str,
    hint: str,
    kind: str = "structural",
) -> EvalFailure:
    """Build one :class:`EvalFailure` row; ``kind`` defaults to ``"structural"``."""
    return EvalFailure(
        kind=kind, check=check, expected=expected, observed=observed, hint=hint
    )


class _SqlExecError(Exception):
    """A generated mapping's SQL parsed but failed at execution time.

    Wraps the underlying DB driver exception so the deterministic evaluator
    can convert it into an actionable FAIL — never let it crash the run.
    """


def _exec(execute_sql_fn: SqlFn, sql: str) -> dict:
    """Run SQL, normalising any driver-level failure into ``_SqlExecError``.

    Generated mappings routinely produce SQL that *parses* but fails at
    execution (UNION column-type mismatch, invalid CAST, unknown column).
    The PGE contract is that such errors become feedback for the generator,
    so they must surface as a FAIL report rather than an unhandled exception
    that aborts the whole agent run.

    Returns:
        Whatever ``execute_sql_fn`` returned, or ``{}`` when it returned a
        falsy value.

    Raises:
        _SqlExecError: If ``execute_sql_fn`` raised any ``Exception``; the
            original exception is chained as ``__cause__``.
    """
    try:
        return execute_sql_fn(sql) or {}
    except Exception as exc:  # noqa: BLE001 — any driver error becomes feedback
        raise _SqlExecError(str(exc)) from exc


def _compact_sql_error(sql_error: str, limit: int = 300) -> str:
    """Return the first line of a driver error, capped at ``limit`` characters.

    Empty, ``None`` and whitespace-only messages all collapse to
    ``"unknown error"``. The whitespace-only case matters: stripping it yields
    ``""``, whose ``splitlines()`` is an empty list, so indexing ``[0]``
    directly would raise ``IndexError`` inside the error-reporting path.
    """
    stripped = sql_error.strip() if sql_error else ""
    if not stripped:
        return "unknown error"
    return stripped.splitlines()[0][:limit]


def _sql_error_report(*, item: str, sql_error: str) -> EvalReport:
    """Build a FAIL report for a mapping whose SQL failed to execute.

    ``bubble_to_planner`` stays False: a runtime SQL error is the
    Generator's to fix (align types, correct columns), not a signal that the
    Planner's source model is wrong.

    Args:
        item: Class or property name used in the hint text.
        sql_error: Raw driver error text; only its first line (max 300
            characters) is kept.

    Returns:
        A deterministic-stage :class:`EvalReport` with a single
        ``sql_execution`` failure and a ``sql_error`` metric.
    """
    # Keep the hint compact — driver errors can be very long.
    err = _compact_sql_error(sql_error)
    return build_report(
        stage="deterministic",
        metrics={"sql_error": err},
        failures=[
            _fail(
                check="sql_execution",
                expected="SQL executes without error",
                observed="execution error",
                hint=(
                    f"The mapping SQL for '{item}' failed to execute: {err}. "
                    "Fix the SQL — e.g. align UNION branch column types with "
                    "explicit CAST (a common cause is one branch typing a "
                    "column as BIGINT and another as STRING/NULL), correct "
                    "column names, or use try_cast for malformed values."
                ),
            )
        ],
        bubble_to_planner=False,
    )
# =====================================================
# Entity evaluator
# =====================================================


def _summarise_ids(id_values: List[Any]) -> Tuple[int, int, int]:
    """Return ``(null_count, distinct_count, duplicate_count)`` for ``id_values``.

    ``distinct_count`` and ``duplicate_count`` are computed over the non-null
    values only, so a NULL id is never also counted as a duplicate.
    """
    non_null = [v for v in id_values if v is not None]
    distinct_count = len(set(non_null))
    return (
        len(id_values) - len(non_null),
        distinct_count,
        len(non_null) - distinct_count,
    )


def evaluate_entity_mapping(
    *,
    mapping: dict,
    ontology_class: dict,
    execute_sql_fn: SqlFn,
) -> EvalReport:
    """Run the stage-1 deterministic checks on a submitted entity mapping.

    Checks performed (all metrics are computed even when some checks fail):

    * ``row_count > 0``.
    * No duplicate ids among the non-null id values.
    * No NULL ids.
    * Every ontology attribute is either a key of ``attribute_mappings`` or
      listed in ``unmapped_attributes``.

    Args:
        mapping: Submitted entity mapping in the shape produced by
            ``tool_submit_entity_mapping``.
        ontology_class: The ontology-class dict the mapping targets; must
            expose an ``attributes`` list (each item being a name string or
            a dict with a ``name`` key).
        execute_sql_fn: Caller-supplied SQL runner — see module docstring.

    Returns:
        An :class:`EvalReport` summarising the metrics and any failures.
        ``bubble_to_planner`` is set when ``row_count == 0`` (typically
        means the mapping is querying the wrong table altogether). When the
        SQL itself fails to execute, a single ``sql_execution`` FAIL report
        is returned instead.
    """
    class_name = mapping.get("class_name") or ontology_class.get("name") or "?"
    # ``or ""`` also covers an explicit ``"sql_query": None``, which would
    # otherwise crash ``len(sql)`` in the log call below.
    sql = mapping.get("sql_query") or ""
    id_col = _resolve_id_col(mapping)
    logger.info(
        "evaluate_entity_mapping: class=%s, id_col=%s, sql_len=%d",
        class_name,
        id_col,
        len(sql),
    )

    try:
        result = _exec(execute_sql_fn, sql)
    except _SqlExecError as exc:
        logger.warning(
            "evaluate_entity_mapping: class=%s SQL failed to execute: %s",
            class_name,
            exc,
        )
        return _sql_error_report(item=class_name, sql_error=str(exc))
    rows = result.get("rows", []) or []
    row_count = len(rows)

    id_values = _extract_id_values(rows, id_col)
    null_id_count, distinct_id_count, duplicate_id_count = _summarise_ids(id_values)

    # ``unmapped_attributes`` items may be plain names or dicts carrying a
    # ``name`` key; normalise both to a set of strings.
    raw_unmapped = mapping.get("unmapped_attributes") or []
    declared_unmapped: set = set()
    for item in raw_unmapped:
        if isinstance(item, dict):
            name = item.get("name")
            if name:
                declared_unmapped.add(str(name))
        elif item is not None:
            declared_unmapped.add(str(item))
    declared_mapped = set((mapping.get("attribute_mappings") or {}).keys())
    all_attrs = _attribute_names(ontology_class)
    unmapped_attrs = [
        a for a in all_attrs if a not in declared_mapped and a not in declared_unmapped
    ]
    unmapped_pct = (len(unmapped_attrs) / len(all_attrs)) if all_attrs else 0.0

    metrics: Dict[str, Any] = {
        "row_count": row_count,
        "distinct_id_count": distinct_id_count,
        "null_id_count": null_id_count,
        "unmapped_attribute_pct": unmapped_pct,
        "unmapped_attributes": unmapped_attrs,
    }

    failures: List[EvalFailure] = []
    bubble = False

    if row_count == 0:
        failures.append(
            _fail(
                check="row_count",
                expected="> 0",
                observed="0",
                hint=(
                    f"Entity '{class_name}' SQL returned 0 rows. Check the FROM "
                    "table is correct and the WHERE clause is not over-filtering."
                ),
            )
        )
        bubble = True

    # Duplicates are measured among non-null ids only. Comparing the distinct
    # count with the raw row count would report every NULL id as a "duplicate"
    # and hand the Generator a misleading "Add DISTINCT" hint; NULL ids get
    # their own failure row below.
    if duplicate_id_count > 0:
        failures.append(
            _fail(
                check="distinct_id_count",
                expected=f"== row_count ({row_count})",
                observed=str(distinct_id_count),
                hint=(
                    f"{duplicate_id_count} duplicate '{id_col}' value(s) in entity '{class_name}'. "
                    "Add DISTINCT or use a stricter id column."
                ),
            )
        )

    if null_id_count > 0:
        failures.append(
            _fail(
                check="null_id_count",
                expected="== 0",
                observed=str(null_id_count),
                hint=(
                    f"{null_id_count} row(s) have NULL '{id_col}' in entity "
                    f"'{class_name}'. Add 'WHERE {id_col} IS NOT NULL' to the SQL."
                ),
            )
        )

    if unmapped_pct > 0:
        failures.append(
            _fail(
                check="unmapped_attribute_pct",
                expected="== 0",
                observed=f"{unmapped_pct:.3f}",
                hint=(
                    f"{len(unmapped_attrs)} attribute(s) of '{class_name}' are "
                    f"neither in attribute_mappings nor declared in "
                    f"unmapped_attributes: {unmapped_attrs}. Map them, or list "
                    "them explicitly under 'unmapped_attributes'."
                ),
            )
        )

    logger.info(
        "evaluate_entity_mapping: class=%s -> %s (%d failure(s), bubble=%s)",
        class_name,
        "PASS" if not failures else "FAIL",
        len(failures),
        bubble,
    )
    return build_report(
        stage="deterministic",
        metrics=metrics,
        failures=failures,
        bubble_to_planner=bubble,
    )
# =====================================================
# Relationship evaluator
# =====================================================


def _distinct_id_set(
    entity_mapping: dict,
    execute_sql_fn: SqlFn,
    id_universe_cache: Optional[Dict[str, set]] = None,
) -> set:
    """Materialise the set of valid ids for a given entity mapping.

    When ``id_universe_cache`` is provided it is consulted/populated keyed
    by the entity mapping's SQL string, avoiding redundant SQL execution
    across repeated calls that share endpoint entities.

    Args:
        entity_mapping: Entity mapping dict; ``sql_query`` and ``id_column``
            are read from it.
        execute_sql_fn: Caller-supplied SQL runner.
        id_universe_cache: Optional dict mutated in place (SQL text -> id set).

    Returns:
        The set of non-null id values found in the entity SQL's rows.

    Raises:
        _SqlExecError: If the entity SQL fails to execute (cache miss only).
    """
    sql = entity_mapping.get("sql_query", "")
    id_col = _resolve_id_col(entity_mapping)
    # NOTE(review): the cache key is the SQL text alone, while the cached set
    # depends on ``id_col`` too. Two mappings sharing identical SQL but
    # different ``id_column`` values would share one entry — confirm callers
    # never produce that combination.
    if id_universe_cache is not None and sql in id_universe_cache:
        return id_universe_cache[sql]
    result = _exec(execute_sql_fn, sql)  # may raise _SqlExecError
    rows = result.get("rows", []) or []
    ids = {r.get(id_col) for r in rows if r.get(id_col) is not None}
    if id_universe_cache is not None:
        id_universe_cache[sql] = ids
    return ids


def _resolve_edge_columns(mapping: dict) -> Tuple[str, str]:
    """Return ``(source_col, target_col)`` for a relationship mapping.

    Falls back to ``"source_id"`` / ``"target_id"`` when the mapping does not
    carry a truthy ``source_id_column`` / ``target_id_column``.
    """
    return (
        mapping.get("source_id_column") or "source_id",
        mapping.get("target_id_column") or "target_id",
    )


def evaluate_relationship_mapping(
    *,
    mapping: dict,
    source_entity_mapping: dict,
    target_entity_mapping: dict,
    execute_sql_fn: SqlFn,
    expected_cross_source_overlap_band: Optional[Tuple[float, float]] = None,
    id_universe_cache: Optional[Dict[str, set]] = None,
) -> EvalReport:
    """Run stage-1 deterministic checks on a relationship mapping.

    Checks:

    * ``total_edges > 0``
    * ``dangling_source_pct < 0.05`` — fraction of edge rows whose source id
      is NULL or does not exist in the source entity's id universe.
    * ``dangling_target_pct < 0.05`` — same for targets. This check is
      skipped when ``expected_cross_source_overlap_band`` is supplied; the
      band check replaces it.
    * If ``expected_cross_source_overlap_band`` is supplied, the realised
      ``overlap_pct`` (fraction of edges whose target id appears in the
      target entity universe) must fall inside the band (inclusive).

    ``bubble_to_planner`` is set when ``total_edges == 0``, when the source
    dangling fraction exceeds ``0.5``, or when the target dangling fraction
    exceeds ``0.5`` *and* the realised overlap is materially worse than the
    Planner predicted (either no band was supplied, or the band check
    itself failed). These cases typically indicate the relationship was
    built off the wrong join key.

    Args:
        mapping: Submitted relationship mapping; ``property_name`` (or
            ``property``), ``sql_query``, ``source_id_column`` and
            ``target_id_column`` are read from it.
        source_entity_mapping: Entity mapping whose SQL defines the valid
            source ids.
        target_entity_mapping: Entity mapping whose SQL defines the valid
            target ids.
        execute_sql_fn: Caller-supplied SQL runner — see module docstring.
        expected_cross_source_overlap_band: Optional ``(lo, hi)`` bounds for
            the realised overlap fraction.
        id_universe_cache: Optional caller-managed dict mapping an entity
            mapping's ``sql_query`` string to its materialised set of ids.
            When provided, repeated calls across relationships that share
            endpoint entities reuse cached id universes instead of
            re-running the entity SQL via ``execute_sql_fn``. When
            ``None`` (default) behaviour is unchanged — fetch fresh each
            call. No module-level state is involved.

    Returns:
        An :class:`EvalReport` for the ``deterministic`` stage. If the edge
        SQL or either entity SQL fails to execute, a single
        ``sql_execution`` FAIL report is returned instead.
    """
    name = mapping.get("property_name") or mapping.get("property") or "?"
    sql = mapping.get("sql_query", "")
    src_col, tgt_col = _resolve_edge_columns(mapping)
    logger.info(
        "evaluate_relationship_mapping: property=%s, src_col=%s, tgt_col=%s",
        name,
        src_col,
        tgt_col,
    )

    # All three SQL executions share one handler: a failure in the edge SQL or
    # in either endpoint entity's SQL yields the same sql_execution report.
    try:
        edges_result = _exec(execute_sql_fn, sql)
        edge_rows = edges_result.get("rows", []) or []
        total_edges = len(edge_rows)

        source_universe = _distinct_id_set(
            source_entity_mapping, execute_sql_fn, id_universe_cache
        )
        target_universe = _distinct_id_set(
            target_entity_mapping, execute_sql_fn, id_universe_cache
        )
    except _SqlExecError as exc:
        logger.warning(
            "evaluate_relationship_mapping: property=%s SQL failed to execute: %s",
            name,
            exc,
        )
        return _sql_error_report(item=name, sql_error=str(exc))

    src_values = [r.get(src_col) for r in edge_rows]
    tgt_values = [r.get(tgt_col) for r in edge_rows]

    if total_edges > 0:
        # A NULL endpoint counts as dangling, same as an id missing from the
        # corresponding entity universe.
        dangling_src = sum(
            1 for v in src_values if v is None or v not in source_universe
        )
        dangling_tgt = sum(
            1 for v in tgt_values if v is None or v not in target_universe
        )
        dangling_src_pct = dangling_src / total_edges
        dangling_tgt_pct = dangling_tgt / total_edges
        overlap_pct = 1.0 - dangling_tgt_pct
    else:
        # No edges: report zeros rather than dividing by zero. Note that the
        # band check below still runs against ``overlap_pct == 0.0``.
        dangling_src_pct = 0.0
        dangling_tgt_pct = 0.0
        overlap_pct = 0.0

    metrics: Dict[str, Any] = {
        "total_edges": total_edges,
        "dangling_source_pct": dangling_src_pct,
        "dangling_target_pct": dangling_tgt_pct,
        "cross_source_overlap_pct": overlap_pct,
        "source_universe_size": len(source_universe),
        "target_universe_size": len(target_universe),
    }

    failures: List[EvalFailure] = []
    bubble = False

    if total_edges == 0:
        failures.append(
            _fail(
                check="total_edges",
                expected="> 0",
                observed="0",
                hint=(
                    f"Relationship '{name}' produced 0 edges. Confirm the join "
                    "predicate is on the right columns and rows are not being "
                    "filtered away."
                ),
            )
        )
        bubble = True

    if total_edges > 0 and dangling_src_pct >= _DANGLING_FK_FAIL_THRESHOLD:
        failures.append(
            _fail(
                check="dangling_source_pct",
                expected=f"< {_DANGLING_FK_FAIL_THRESHOLD}",
                observed=f"{dangling_src_pct:.3f}",
                hint=(
                    f"{dangling_src_pct:.1%} of source_id values in relationship "
                    f"'{name}' are absent from the mapped source entity. The "
                    "source entity's id_column is usually an ALIAS for a derived "
                    "expression (e.g. CONCAT(regexp_extract(,'...'),'-x')). "
                    "Reproduce that exact id expression from the source entity's "
                    "SQL for source_id — do not select a raw/trust-local column."
                ),
            )
        )
        # Source-side dangling has no band-aware gate: above the bubble
        # threshold it always escalates to the Planner.
        if dangling_src_pct > _DANGLING_FK_BUBBLE_THRESHOLD:
            bubble = True

    # When an explicit cross-source overlap band is provided the relationship
    # is *expected* to be partial (e.g. trust_a-only IDs vs the cross-trust
    # canonical universe). In that case we trust the band check and skip
    # the standard ``dangling_target_pct`` strictness — the partiality is
    # the point. The catastrophic-dangling bubble below still fires, but
    # only when the band itself ALSO fails (i.e. the realised overlap is
    # materially worse than the Planner predicted).
    if (
        total_edges > 0
        and dangling_tgt_pct >= _DANGLING_FK_FAIL_THRESHOLD
        and expected_cross_source_overlap_band is None
    ):
        failures.append(
            _fail(
                check="dangling_target_pct",
                expected=f"< {_DANGLING_FK_FAIL_THRESHOLD}",
                observed=f"{dangling_tgt_pct:.3f}",
                hint=(
                    f"{dangling_tgt_pct:.1%} of target_id values in relationship "
                    f"'{name}' are absent from the mapped target entity. The "
                    "target entity's id_column is usually an ALIAS for a derived "
                    "expression; reproduce that exact id expression from the "
                    "target entity's SQL for target_id — not a raw join column."
                ),
            )
        )

    # ``band_failed`` is consumed by the bubble gate further down, so it must
    # be computed before that gate runs.
    band_failed = False
    if expected_cross_source_overlap_band is not None:
        lo, hi = expected_cross_source_overlap_band
        if not (lo <= overlap_pct <= hi):
            band_failed = True
            failures.append(
                _fail(
                    check="cross_source_overlap_pct",
                    expected=f"in [{lo:.3f}, {hi:.3f}]",
                    observed=f"{overlap_pct:.3f}",
                    hint=(
                        f"Cross-source overlap for '{name}' is {overlap_pct:.1%}, "
                        f"outside the expected band [{lo:.1%}, {hi:.1%}]. "
                        "Check the join key and the source/target trust assignments."
                    ),
                )
            )

    # Bubble-to-planner on catastrophic target-dangling, with a band-aware gate.
    #
    # * Band absent + dangling > 0.5: the strict dangling_target_pct failure
    #   above already fired; we just flip the bubble flag (no new row needed).
    # * Band present + band PASSED: the Planner predicted this overlap and
    #   was right — do NOT bubble, even if dangling > 0.5 (the partiality
    #   was expected).
    # * Band present + band FAILED + dangling > 0.5: the realised overlap
    #   is materially worse than predicted. Bubble, and emit a dedicated
    #   ``dangling_target_pct_catastrophic`` failure so the FAIL report has
    #   a concrete structural row alongside the band-check failure.
    if total_edges > 0 and dangling_tgt_pct > _DANGLING_FK_BUBBLE_THRESHOLD:
        if expected_cross_source_overlap_band is None:
            bubble = True
        elif band_failed:
            bubble = True
            failures.append(
                _fail(
                    check="dangling_target_pct_catastrophic",
                    expected=f"<= {_DANGLING_FK_BUBBLE_THRESHOLD}",
                    observed=f"{dangling_tgt_pct:.3f}",
                    hint=(
                        f"{dangling_tgt_pct:.1%} of target_id values in "
                        f"relationship '{name}' are absent from the mapped "
                        "target entity AND the realised overlap is outside "
                        "the predicted band. Re-plan the join key and the "
                        "source/target trust assignments."
                    ),
                )
            )

    logger.info(
        "evaluate_relationship_mapping: %s -> %s (%d failure(s), bubble=%s)",
        name,
        "PASS" if not failures else "FAIL",
        len(failures),
        bubble,
    )
    return build_report(
        stage="deterministic",
        metrics=metrics,
        failures=failures,
        bubble_to_planner=bubble,
    )
def build_report(
    *,
    stage: str,
    metrics: Dict[str, Any],
    failures: List[EvalFailure],
    bubble_to_planner: bool,
) -> EvalReport:
    """Compose an :class:`EvalReport` whose status follows from ``failures``.

    Args:
        stage: Evaluator stage label stored on the report.
        metrics: Metric mapping; a shallow copy is stored.
        failures: Failure rows; a shallow list copy is stored. An empty list
            yields status ``"PASS"``, anything else ``"FAIL"``.
        bubble_to_planner: Requested escalation flag. It is only honoured on
            a FAIL report; on a PASS report it is dropped with a warning.

    Returns:
        The assembled :class:`EvalReport`.
    """
    has_failures = bool(failures)

    # A bubble request without any failure is contradictory: log it so the
    # caller can be fixed, then fall through with the flag forced off.
    if bubble_to_planner and not has_failures:
        logger.warning(
            "build_report: bubble_to_planner=True but no failures → demoted "
            "to False; check caller logic"
        )

    return EvalReport(
        status="FAIL" if has_failures else "PASS",
        stage=stage,
        metrics=dict(metrics),
        failures=list(failures),
        bubble_to_planner=has_failures and bool(bubble_to_planner),
    )
+ +Each Generator is a narrow tool-calling agent that maps ONE ontology item +(class or relationship) at a time. The orchestrator (Sprint 7) calls them +per-item with a filtered slice of the Planner's :class:`SourceModel` — the +Generators never see the full ontology or full metadata, keeping each +decision cheap and local. + +* Sprint 4 — :mod:`agents.agent_mapping_pge.generators.entity`. +* Sprint 5 — :mod:`agents.agent_mapping_pge.generators.relationship`. +""" + +from agents.agent_mapping_pge.generators.entity import ( + EntityGenResult, + EntityGenStep, + run_entity_generator, +) +from agents.agent_mapping_pge.generators.relationship import ( + RelationshipGenResult, + RelationshipGenStep, + run_relationship_generator, +) + +__all__ = [ + "EntityGenResult", + "EntityGenStep", + "run_entity_generator", + "RelationshipGenResult", + "RelationshipGenStep", + "run_relationship_generator", +] diff --git a/src/agents/agent_mapping_pge/generators/entity.py b/src/agents/agent_mapping_pge/generators/entity.py new file mode 100644 index 00000000..2062f5e5 --- /dev/null +++ b/src/agents/agent_mapping_pge/generators/entity.py @@ -0,0 +1,852 @@ +""" +OntoBricks Mapping-PGE EntityGenerator Agent. + +Sprint 4 of the Planner-Generator-Evaluator (PGE) redesign. + +The EntityGenerator is a narrow, focused LLM agent that maps **one** ontology +class at a time. The orchestrator (Sprint 7) calls it per item with a +filtered slice of the Planner's :class:`SourceModel`: + +* the single ontology class to map, with its full attribute list, and +* a small SourceModel slice — only the candidate tables / canonical IDs / + joins that are relevant to *this* class. + +The Generator does NOT see the full ontology or full metadata. That is the +core design contract: keep its context bounded and each decision cheap. 
+ +The loop shape mirrors :mod:`agents.agent_mapping_pge.planner` — same +``call_serving_endpoint`` + ``dispatch_tool`` ReAct cycle, same 3-second +inter-iteration delay, same MLflow trace decorator — with these differences: + +* Smaller default budget (12 vs 25): mapping one class is bounded work. +* Different tool set: only ``execute_sql``, ``sample_table``, and the + terminal ``submit_entity_mapping``. The slice already carries every piece + of context the Generator needs. +* No single-shot fallback: if the endpoint refuses tools, the Generator + reports failure — it produces structured output through + ``submit_entity_mapping`` only. +* The "NO SILENT DROPS" invariant: every ontology attribute must be either + in ``attribute_mappings`` or in ``unmapped_attributes`` with a one-sentence + reason. The system prompt enforces this; the tool persists it. +""" + +import json +import time +from dataclasses import dataclass, field +from typing import Any, Callable, Dict, List, Optional + +import requests + +from back.core.logging import get_logger +from agents.engine_base import ( + call_serving_endpoint, + dispatch_tool, + accumulate_usage, +) +from agents.tools.context import ToolContext +from agents.tools.mapping import ( + MAPPING_TOOL_DEFINITIONS_BY_NAME, + MAPPING_TOOL_HANDLERS, +) +from agents.tools.planner import ( + SAMPLE_TABLE_DEF, + tool_sample_table, +) +from agents.tools.sql import ( + SQL_TOOL_DEFINITIONS, + SQL_TOOL_HANDLERS, +) +from agents.tracing import trace_agent + +logger = get_logger(__name__) + +MAX_ITERATIONS = 12 +LLM_TIMEOUT = 180 +_ITERATION_DELAY_SEC = 3 +# See planner._MAX_TOKENS comment — same rationale for the Generator's +# submit_entity_mapping JSON (SQL + attribute_mappings can be large). 
+_MAX_TOKENS = 50000 + +_TRACE_NAME = "mapping_pge_entity_generator" + + +# ===================================================== +# Tool aggregation +# ===================================================== +# +# The EntityGenerator only needs: +# * execute_sql – validate the composed SELECT before submitting. +# * sample_table – disambiguate when two candidate tables are equally +# plausible (e.g. same confidence in the slice). +# * submit_entity_mapping – TERMINAL. +# +# We deliberately exclude: +# * get_ontology / get_metadata / get_documents_context — the Planner's +# view; the slice already has what's needed. +# * column_value_overlap / distinct_count — those validate join keys and +# canonical IDs, which the Planner already locked in. +# * submit_relationship_mapping / submit_source_model — wrong stage. + +# Filter MAPPING_TOOL_DEFINITIONS down to just submit_entity_mapping. We +# look up by name from the by-name index in ``mapping.py`` rather than +# scanning the list inline. Sprint 5 will reuse the same pattern for +# ``submit_relationship_mapping``. +_SUBMIT_ENTITY_DEF: dict = MAPPING_TOOL_DEFINITIONS_BY_NAME["submit_entity_mapping"] + +TOOL_DEFINITIONS: List[dict] = ( + SQL_TOOL_DEFINITIONS + + [SAMPLE_TABLE_DEF] + + [_SUBMIT_ENTITY_DEF] +) + +TOOL_HANDLERS: Dict[str, Callable] = { + **SQL_TOOL_HANDLERS, + "sample_table": tool_sample_table, + "submit_entity_mapping": MAPPING_TOOL_HANDLERS["submit_entity_mapping"], +} + + +# ===================================================== +# Data classes +# ===================================================== + + +@dataclass +class EntityGenStep: + """One observable step of the EntityGenerator's execution. + + Mirrors :class:`agents.agent_mapping_pge.planner.PlannerStep` but is + scoped to the Generator so the orchestrator (Sprint 7) can render a + per-class timeline in the UI. 
+ """ + + step_type: str # "tool_call" | "tool_result" | "output" + content: str + tool_name: str = "" + duration_ms: int = 0 + + +@dataclass +class EntityGenResult: + """Outcome of a single EntityGenerator invocation. + + ``mapping`` holds the submitted entity-mapping dict (the same shape the + handler appends to ``ctx.entity_mappings``) when ``success`` is True. + """ + + success: bool + mapping: Optional[dict] = None + steps: List[EntityGenStep] = field(default_factory=list) + iterations: int = 0 + error: str = "" + usage: Dict[str, int] = field(default_factory=dict) + + +# ===================================================== +# System prompt +# ===================================================== +# +# The ENTITY SQL RULES section is lifted verbatim from the legacy in-house +# mapping agent (the section starting "SQL RULES FOR ENTITIES") because +# those rules are correct and load-bearing — every mapping query must +# follow them or downstream SPARQL translation breaks. +# +# The PGE-specific additions are the slice-consumption rules: pick the +# best candidate table from the slice, use the canonical ID exactly as +# the Planner specified it, and account for every ontology attribute. + +SYSTEM_PROMPT = """\ +You are a senior data engineer. Your job is to map ONE ontology class to a \ +single SQL SELECT query against a Databricks source table, validated against \ +real data via execute_sql, and submitted via submit_entity_mapping. + +YOU WILL BE GIVEN +• ontology_class: the class to map (uri, label, comment, attributes list). +• source_model_slice: a small JSON object the Planner already curated for \ +this class: + - candidate_tables[]: {table, confidence, reason} — the tables that could \ +realise this class. + - canonical_id.canonical_column_per_table[]: the expression that \ +MUST be aliased AS ID for each table. THIS VALUE MAY BE A BARE COLUMN \ +NAME ("CUSTOMER_ID") OR A FULL SQL EXPRESSION \ +("regexp_extract(order_ref, '([a-f0-9-]+-ord-[0-9]+)')"). 
Drop it \ +verbatim into the SELECT and alias it AS ID — do NOT rewrite it, do NOT \ +pick a different column, do NOT strip the function call. The Planner emits \ +SQL expressions when raw column values across sources are in different \ +formats and need to be normalized to a common canonical key. + - canonical_id.format_note: a one-sentence note describing the canonical \ +key (may be empty). Read it to understand what each row's ID represents. + - relevant_joins[]: optional — any joins the Planner thinks may apply. + +SINGLE-SOURCE vs CROSS-SOURCE (CRITICAL — read carefully) +The number of entries in canonical_id.canonical_column_per_table is the \ +authoritative signal for how to shape your SELECT: + + • If canonical_column_per_table has EXACTLY ONE table → single-source \ +class. Write a flat SELECT from that one table. Pick it from the matching \ +candidate_tables entry. + + • If canonical_column_per_table has TWO OR MORE tables → CROSS-SOURCE \ +class (e.g. the same customer or order realised across multiple sources). \ +You MUST emit a UNION ALL across ALL listed tables, NOT pick one. Each \ +branch uses that table's canonical-ID column AS ID. Picking just one would \ +produce an entity missing a large fraction of its real instances, and every \ +relationship pointing at it would then dangle. \ +This is the #1 failure mode the orchestrator catches — do not produce it. + + UNION shape (use exactly this pattern — substitute the canonical-ID \ +EXPRESSION exactly as the Planner specified it for that table; do NOT \ +rewrite it): + SELECT <canonical_id_expr_A> AS ID, <label_col_A> AS Label, <attribute_cols> \ + FROM <table_A> WHERE <canonical_id_expr_A> IS NOT NULL + UNION ALL + SELECT <canonical_id_expr_B> AS ID, <label_col_B> AS Label, <attribute_cols> \ + FROM <table_B> WHERE <canonical_id_expr_B> IS NOT NULL + UNION ALL + ... + + All branches must return the SAME columns in the SAME order, AND each \ +column must have the SAME TYPE in every branch. If a branch lacks a column \ +another branch has, project a NULL with a matching alias **cast to the same \ +type the real branch uses** (e.g. 
if branch A has ``ACCOUNT_ID`` typed \ +BIGINT, branch B must use ``CAST(NULL AS BIGINT) AS ACCOUNT_ID`` — not \ +``AS STRING``). When two branches hold the column with DIFFERENT types, cast \ +BOTH to a common type (``CAST(... AS STRING)`` is the safe default). A \ +``CAST_INVALID_INPUT`` / type-mismatch error from execute_sql always means a \ +column's types differ across branches — fix the casts, do not change the ID. + +TOOLS +You have three tools: + • execute_sql – Validate the composed SELECT before submitting. \ +The tool runs your query with a small LIMIT and returns columns + sample \ +rows; the persisted mapping has no LIMIT. + • sample_table – Up to N random rows from a table. Use only when \ +two candidate tables are equally plausible and you need to peek at real \ +values to disambiguate. + • submit_entity_mapping – TERMINAL. Call exactly once, after execute_sql \ +succeeds, with the full mapping payload. + +SQL RULES FOR ENTITIES (CRITICAL) +• Always use the full table name from the slice (catalog.schema.table). +• The FIRST column MUST be aliased AS ID — it MUST be the canonical-ID \ +column the slice specifies for the chosen table. +• The SECOND column MUST be aliased AS Label — pick the most human-readable \ +available column (typically ``name``, ``label``, ``display_name``, or \ +similar). If no human-readable column exists, fall back to the canonical \ +ID column itself aliased AS Label. +• Add one column per ontology data-property attribute you can satisfy from \ +the chosen table. Use the column's original name (no alias). +• If the same column serves as both an alias and an attribute, include it \ +twice: once with the alias (AS ID or AS Label) and once with its original \ +name so it appears in attribute_mappings. +• Add WHERE IS NOT NULL to filter null keys. When the ID is a \ +derived expression, also exclude empty extractions (e.g. \ +``WHERE regexp_extract(...) <> ''``). 
+• DEDUP COLLAPSED KEYS: when the canonical-ID is a derived EXPRESSION that \ +can repeat across rows (e.g. a ``<parent_core>-line`` key where several child rows \ +share one parent core), the same ID will appear on multiple rows and the \ +evaluator FAILs on "duplicate ID values". Make each node id unique: wrap the \ +UNION in ``SELECT ... FROM (<union_query>) GROUP BY ID`` (taking MAX(<attribute>) of each \ +attribute) or use ``SELECT DISTINCT`` when there are no attributes. The id \ +column must have exactly one row per distinct value. +• Do NOT add LIMIT — the persisted mapping query must return ALL rows. \ +execute_sql adds a small LIMIT internally for validation only. +• Do NOT use ORDER BY, CTEs, or subqueries unless absolutely necessary. +• Write simple, flat SELECT statements. + +REGEX SAFETY (CRITICAL — applies to EVERY regex you write) +• ALWAYS use explicit character classes: ``[0-9]`` for digits, ``[a-z]`` / \ +``[A-Za-z]`` for letters. NEVER use the backslash escapes ``\\d``, ``\\w``, \ +``\\s``. The OntoBricks build pipeline strips a lone backslash, so ``\\d`` \ +silently degrades to the literal ``d`` and the mapping breaks AFTER it has \ +already passed validation here. This applies to the canonical-ID expression \ +(use it verbatim from the slice — the Planner already emits ``[0-9]``) AND to \ +any CASE/RLIKE you write for value harmonization below. + +VALUE HARMONIZATION (controlled-vocabulary attributes) +Some attributes are CODED: the same real-world value is spelled differently \ +across sources (e.g. a status as 'A' / 'Active' / 'ACTIVE'; a category code as \ +'CS' / 'C-Section' / 'cs'; a flag as 'Y' / 'true' / '1'). A raw column copied \ +verbatim then has a source-fractured, un-aggregatable vocabulary and the KPI it \ +feeds is garbage. +When an attribute is a controlled vocabulary (the class/attribute name implies \ +a small fixed value set — method, status, type, mode, outcome, category, \ +classification — or sampling reveals a handful of distinct codes): + 1. 
DISCOVER the raw distinct values first. For each covering table run \ +``SELECT DISTINCT <column> FROM <table> LIMIT 50`` via execute_sql (or \ +sample_table). Do NOT guess the value set — harmonize what is actually there. + 2. Map every raw spelling to ONE canonical lowercase token with a CASE \ +expression aliased to the attribute's clean name. Use the SAME token set in \ +EVERY UNION branch so the entity carries one coherent vocabulary regardless of \ +source system. Domain-neutral example (a status attribute): + CASE + WHEN lower(STATUS_CODE) RLIKE '^(a|active|open)$' THEN 'active' + WHEN lower(STATUS_CODE) RLIKE '^(c|closed|done)$' THEN 'closed' + WHEN lower(STATUS_CODE) RLIKE '^(p|pending|hold)$' THEN 'pending' + ELSE NULL + END AS status + 3. Record it in attribute_mappings: ontology attribute name → the alias you \ +chose (e.g. "status" → "status"). + 4. This is a LEGITIMATE exception to "use the column's original name": a \ +harmonized attribute is a CASE expression aliased to the clean attribute name. \ +Plain, non-coded attributes (dates, numbers, names, free-text) still use their \ +original column name unaliased. + +ATTRIBUTE COVERAGE — NO SILENT DROPS (CRITICAL) +For EACH ontology attribute on the class, you must do ONE of: + (a) include a SQL column for it in the SELECT, AND add an entry to \ +attribute_mappings mapping the ontology attribute name to the SQL column \ +name (case-sensitive); OR + (b) add it to unmapped_attributes with a one-sentence reason, using the \ +shape {"name": "", "reason": ""}. + +You may NOT silently drop an attribute. The orchestrator will reject any \ +mapping where some ontology attributes appear in neither list. If a column \ +genuinely does not exist on the chosen table, that's an honest unmapped — \ +say so in the reason. + +WORKFLOW +1. Read the ontology class and the source_model_slice carefully. +2. COUNT the entries in canonical_id.canonical_column_per_table: + - one → single-source: pick that table, compose a flat SELECT. 
+ - two or more → cross-source: compose a UNION ALL across ALL of them \ +(see the SINGLE-SOURCE vs CROSS-SOURCE block above). Do NOT pick one. +3. Compose the SELECT (or UNION ALL) following the SQL RULES above. For \ +each branch, the value of canonical_column_per_table[] is what \ +gets aliased AS ID — drop it in verbatim. It may already be a SQL \ +expression (e.g. ``regexp_extract(...)``); do not rewrite it. For any coded \ +attribute, apply VALUE HARMONIZATION (sample the distinct values, then a CASE \ +to a shared canonical token set across all branches). +4. Call execute_sql to validate the SELECT. If it fails, READ the error and \ +fix the SQL (typically a typo'd column name, mismatched column lists in a \ +UNION, or wrong full_name). Retry as needed. Never submit an un-validated \ +query. +5. Once execute_sql succeeds, call submit_entity_mapping EXACTLY ONCE with: + class_uri, class_name, sql_query (no LIMIT), id_column, label_column, \ +attribute_mappings, unmapped_attributes. +6. That's the terminal step. Do not emit any free text after submitting. + +GENERAL RULES +• Only ever pass row-returning queries (SELECT / WITH …) to execute_sql. +• Do not call get_metadata, get_ontology, or any other tool — they are not \ +available to you. The slice carries everything you need. +• If a retry_hint is present at the top of the user message, treat it as \ +authoritative — your previous attempt failed for the reason stated and you \ +should NOT repeat the same mistake. +""" + + +# ===================================================== +# Internal helpers +# ===================================================== + + +def _build_user_prompt( + ontology_class: dict, + source_model_slice: dict, + retry_hint: Optional[str] = None, +) -> str: + """Render the per-class user prompt. + + The orchestrator hands us `ontology_class` and a focused + `source_model_slice`. 
We emit a structured prompt that: + * surfaces the retry hint up top if one was provided, + * lists the class metadata and attribute list explicitly so the LLM + cannot forget any attribute, and + * embeds the slice as JSON so the LLM can refer to it precisely. + """ + parts: List[str] = [] + + if retry_hint: + parts.append(f"RETRY HINT (authoritative): {retry_hint}") + parts.append("") + + class_uri = ontology_class.get("uri", "") + class_label = ontology_class.get("label") or ontology_class.get("name", "") + class_comment = ontology_class.get("comment", "") or "" + attributes = ontology_class.get("attributes", []) or [] + + attr_summary_lines: List[str] = [] + for attr in attributes: + if isinstance(attr, dict): + attr_name = attr.get("name") or attr.get("label") or attr.get("uri", "?") + attr_type = attr.get("type") or attr.get("range") or "" + attr_summary_lines.append( + f" - {attr_name}" + (f" ({attr_type})" if attr_type else "") + ) + else: + attr_summary_lines.append(f" - {attr}") + + parts.append("ONTOLOGY CLASS") + parts.append(f" uri: {class_uri}") + parts.append(f" label: {class_label}") + if class_comment: + parts.append(f" comment: {class_comment}") + if attr_summary_lines: + parts.append(" attributes ({} total):".format(len(attributes))) + parts.extend(attr_summary_lines) + else: + parts.append(" attributes: (none — only ID and Label required)") + + parts.append("") + parts.append("SOURCE MODEL SLICE") + parts.append(json.dumps(source_model_slice, indent=2, default=str)) + + parts.append("") + parts.append( + "Pick the best candidate table from the slice, compose a flat SELECT " + "following the SQL RULES, validate with execute_sql, then call " + "submit_entity_mapping exactly once. Every ontology attribute must " + "appear in either attribute_mappings or unmapped_attributes — no " + "silent drops." 
+ ) + + prompt = "\n".join(parts) + logger.debug( + "_build_user_prompt for class=%s (%d chars):\n%s", + class_uri, + len(prompt), + prompt, + ) + return prompt + + +# ===================================================== +# Public entry point +# ===================================================== + + +@trace_agent(name="mapping_pge_entity_generator") +def run_entity_generator( + host: str, + token: str, + endpoint_name: str, + client: Any, + *, + ontology_class: dict, + source_model_slice: dict, + retry_hint: Optional[str] = None, + on_step: Optional[Callable[[str, int], None]] = None, + max_iterations: int = MAX_ITERATIONS, +) -> EntityGenResult: + """Run the EntityGenerator agent for a single ontology class. + + The agent autonomously composes a SQL SELECT for ``ontology_class`` + against the candidate table(s) in ``source_model_slice``, validates the + SQL with ``execute_sql``, and submits the validated mapping via the + terminal ``submit_entity_mapping`` tool. + + Args: + host: Databricks workspace URL. + token: Bearer token for the serving endpoint. + endpoint_name: Foundation Model serving endpoint name. + client: Databricks SQL client (must expose ``execute_query(sql)``). + ontology_class: Full dict for the SINGLE class to map (uri, label, + comment, attributes list). + source_model_slice: Filtered SourceModel slice with candidate_tables, + canonical_id, and optional relevant_joins. + retry_hint: Optional one-sentence hint from the orchestrator's + previous-attempt evaluation. When present, surfaced at the top of + the user prompt. + on_step: Optional progress callback ``(msg, pct)`` for UI updates. + max_iterations: Upper bound on tool-call iterations (default 12 — + smaller than the Planner because the scope is one class). + + Returns: + An :class:`EntityGenResult`. ``success`` is True iff a mapping was + successfully submitted; in that case ``mapping`` holds the submitted + dict. On failure, ``error`` explains why and ``mapping`` is None. 
+ """ + iteration_limit = max_iterations if max_iterations is not None else MAX_ITERATIONS + + class_uri = (ontology_class or {}).get("uri", "") + class_label = ( + (ontology_class or {}).get("label") + or (ontology_class or {}).get("name", "") + ) + n_attrs = len(((ontology_class or {}).get("attributes") or [])) + n_candidates = len(((source_model_slice or {}).get("candidate_tables") or [])) + + logger.info( + "===== ENTITY GENERATOR START ===== endpoint=%s, class=%s (%s), " + "attributes=%d, candidate_tables=%d, retry_hint=%s, max_iter=%d", + endpoint_name, + class_label, + class_uri, + n_attrs, + n_candidates, + "yes" if retry_hint else "no", + iteration_limit, + ) + + ctx = ToolContext( + host=host.rstrip("/"), + token=token, + client=client, + # The slice subsumes metadata/ontology for this agent; the unified + # ToolContext still needs these fields, so we pass empty placeholders — + # the slice itself reaches the LLM through the user prompt instead. + metadata={}, + ontology={}, + documents=[], + ) + + result = EntityGenResult(success=False) + + user_prompt = _build_user_prompt( + ontology_class or {}, source_model_slice or {}, retry_hint=retry_hint + ) + messages: List[dict] = [ + {"role": "system", "content": SYSTEM_PROMPT}, + {"role": "user", "content": user_prompt}, + ] + logger.info( + "EntityGenerator conversation initialized: system=%d chars, user=%d chars", + len(SYSTEM_PROMPT), + len(user_prompt), + ) + + total_usage: Dict[str, int] = {"prompt_tokens": 0, "completion_tokens": 0} + + def _progress_pct(iteration_idx: int) -> int: + # Linear ramp 5 → 95 across the iteration budget. submit hits 100. 
+ ratio = (iteration_idx + 1) / max(iteration_limit, 1) + return min(5 + int(ratio * 90), 95) + + def notify(msg: str, *, pct: Optional[int] = None) -> None: + actual_pct = pct if pct is not None else 5 + logger.info("ENTITY GEN STEP [%d%%] %s", actual_pct, msg) + if on_step: + on_step(msg, actual_pct) + + notify(f"Generating mapping for {class_label or class_uri}…", pct=1) + + # Snapshot the pre-existing mapping count so we can detect "this run + # added a mapping" without relying on absolute counters. (The orchestrator + # in Sprint 7 may reuse a ToolContext across calls; today's `ctx` is + # fresh, but the assertion is cheap and future-proof.) + pre_run_mapping_count = len(ctx.entity_mappings) + + # ------------------------------------------------------------------ + # Agent loop + # ------------------------------------------------------------------ + for iteration in range(iteration_limit): + if iteration > 0: + logger.debug( + "Iteration %d: waiting %ds before LLM call (rate limit mitigation)", + iteration + 1, + _ITERATION_DELAY_SEC, + ) + time.sleep(_ITERATION_DELAY_SEC) + + current_iteration = iteration + 1 + pct = _progress_pct(iteration) + logger.info( + "----- EntityGenerator iteration %d/%d — %d messages, mapping=%s -----", + current_iteration, + iteration_limit, + len(messages), + "set" if len(ctx.entity_mappings) > pre_run_mapping_count else "unset", + ) + notify( + f"Mapping iteration {current_iteration}/{iteration_limit}…", + pct=pct, + ) + + t0 = time.time() + try: + llm_response = call_serving_endpoint( + host, + token, + endpoint_name, + messages, + tools=TOOL_DEFINITIONS, + max_tokens=_MAX_TOKENS, + temperature=0.1, + timeout=LLM_TIMEOUT, + trace_name=_TRACE_NAME, + ) + except requests.exceptions.HTTPError as exc: + status = exc.response.status_code if exc.response is not None else "?" 
+ logger.warning( + "EntityGenerator iteration %d: HTTPError status=%s", + current_iteration, + status, + ) + logger.debug( + "EntityGenerator iteration %d: HTTPError body: %.500s", + current_iteration, + exc.response.text if exc.response is not None else "N/A", + ) + if exc.response is not None and status in (400, 422): + result.error = "LLM endpoint does not support function calling" + result.iterations = current_iteration + result.usage = total_usage + logger.error( + "EntityGenerator: endpoint refused tools — cannot produce a mapping" + ) + return result + result.error = f"LLM request failed: {exc}" + result.iterations = current_iteration + result.usage = total_usage + logger.error( + "EntityGenerator: LLM request failed at iteration %d: %s", + current_iteration, + exc, + ) + return result + except requests.exceptions.ReadTimeout: + result.error = f"LLM request timed out after {LLM_TIMEOUT}s" + result.iterations = current_iteration + result.usage = total_usage + logger.error("EntityGenerator: timeout at iteration %d", current_iteration) + return result + except requests.exceptions.RequestException as exc: + result.error = f"LLM request failed: {exc}" + result.iterations = current_iteration + result.usage = total_usage + logger.error( + "EntityGenerator: request exception at iteration %d: %s", + current_iteration, + exc, + ) + return result + + elapsed_ms = int((time.time() - t0) * 1000) + logger.info( + "EntityGenerator iteration %d: LLM responded in %dms", + current_iteration, + elapsed_ms, + ) + + accumulate_usage(total_usage, llm_response.get("usage", {})) + + choice = llm_response.get("choices", [{}])[0] + finish_reason = choice.get("finish_reason", "?") + message = choice.get("message", {}) + tool_calls = message.get("tool_calls", []) + has_content = bool(message.get("content")) + logger.info( + "EntityGenerator iteration %d: finish_reason=%s, tool_calls=%d, has_content=%s", + current_iteration, + finish_reason, + len(tool_calls), + has_content, + ) + + if 
not tool_calls: + # The Generator must terminate via submit_entity_mapping, never + # via free text. + content = (message.get("content") or "")[:500] + logger.warning( + "EntityGenerator iteration %d: produced text without submitting mapping — %d chars", + current_iteration, + len(message.get("content") or ""), + ) + result.steps.append( + EntityGenStep( + step_type="output", + content=content, + duration_ms=elapsed_ms, + ) + ) + result.error = "entity generator produced text without submitting mapping" + result.iterations = current_iteration + result.usage = total_usage + notify( + "Entity generator produced text without submitting mapping.", + pct=pct, + ) + return result + + logger.info( + "EntityGenerator iteration %d: processing %d tool call(s): [%s]", + current_iteration, + len(tool_calls), + ", ".join( + tc.get("function", {}).get("name", "?") for tc in tool_calls + ), + ) + messages.append(message) + + terminal_success = False + for tc_idx, tc in enumerate(tool_calls, 1): + func = tc.get("function", {}) + tool_name = func.get("name", "") + raw_args = func.get("arguments", "{}") + tool_id = tc.get("id", "") + + try: + arguments = json.loads(raw_args) + except json.JSONDecodeError: + arguments = {} + + logger.info( + "EntityGenerator iteration %d: calling tool '%s' (%d/%d)", + current_iteration, + tool_name, + tc_idx, + len(tool_calls), + ) + + # Human-readable progress messages per tool. 
+ if tool_name == "submit_entity_mapping": + notify(f"Submitting mapping for {class_label or class_uri}…", pct=pct) + elif tool_name == "sample_table": + fn = arguments.get("full_name", "?") + notify(f"Sampling {fn}…", pct=pct) + elif tool_name == "execute_sql": + sql_preview = arguments.get("sql", "")[:80] + notify(f"Running SQL: {sql_preview}…", pct=pct) + else: + notify(f"Calling {tool_name}…", pct=pct) + + result.steps.append( + EntityGenStep( + step_type="tool_call", + content=json.dumps(arguments, default=str)[:500], + tool_name=tool_name, + ) + ) + + t1 = time.time() + tool_result = dispatch_tool( + TOOL_HANDLERS, ctx, tool_name, arguments, trace_name=_TRACE_NAME + ) + tool_ms = int((time.time() - t1) * 1000) + + logger.info( + "EntityGenerator iteration %d: tool '%s' returned %d chars in %dms", + current_iteration, + tool_name, + len(tool_result), + tool_ms, + ) + + result.steps.append( + EntityGenStep( + step_type="tool_result", + content=( + (tool_result[:500] + "…") + if len(tool_result) > 500 + else tool_result + ), + tool_name=tool_name, + duration_ms=tool_ms, + ) + ) + + messages.append( + { + "role": "tool", + "tool_call_id": tool_id, + "content": tool_result, + } + ) + + # Detect terminal success: submit_entity_mapping returned + # success=True AND a mapping for THIS class_uri is present in + # ctx.entity_mappings. A submit with a mismatched class_uri (the + # LLM mapped a different class than requested) is NOT terminal — + # we coach the LLM via a corrective tool message and let the loop + # continue so it can resubmit with the right URI. 
+ if tool_name == "submit_entity_mapping": + try: + parsed = json.loads(tool_result) + except json.JSONDecodeError: + parsed = {} + if parsed.get("success") is True: + matched = any( + m.get("ontology_class") == class_uri + for m in ctx.entity_mappings + ) + if matched: + terminal_success = True + logger.info( + "EntityGenerator iteration %d: submit_entity_mapping succeeded — terminating", + current_iteration, + ) + else: + submitted_uri = arguments.get("class_uri", "") + mismatch_msg = ( + f"submitted class_uri '{submitted_uri}' does not " + f"match requested class_uri '{class_uri}'; " + f"resubmit with class_uri='{class_uri}'" + ) + logger.warning( + "EntityGenerator iteration %d: submit_entity_mapping " + "class_uri mismatch — submitted=%s, requested=%s", + current_iteration, + submitted_uri, + class_uri, + ) + corrective_payload = json.dumps( + {"success": False, "error": mismatch_msg} + ) + # Replace the recorded tool_result step's content so + # the UI / trace reflects the corrective signal + # rather than the original (misleading) success + # response. + result.steps[-1] = EntityGenStep( + step_type="tool_result", + content=corrective_payload, + tool_name=tool_name, + duration_ms=result.steps[-1].duration_ms, + ) + # Replace the tool message just appended to + # ``messages`` so the LLM sees the corrective + # payload on the next turn (one tool message per + # tool_call_id — keep the protocol clean). + messages[-1] = { + "role": "tool", + "tool_call_id": tool_id, + "content": corrective_payload, + } + + if terminal_success: + # Pull the mapping for this class by strict URI match. The + # terminal-success guard above already verified an entry with + # this URI exists; if we somehow can't find one here that's an + # internal invariant violation, not a recoverable failure. 
+ submitted = next( + ( + m + for m in reversed(ctx.entity_mappings) + if m.get("ontology_class") == class_uri + ), + None, + ) + if submitted is None: + result.error = ( + "internal: submit succeeded but mapping not found for class_uri" + ) + result.iterations = current_iteration + result.usage = total_usage + logger.error( + "===== ENTITY GENERATOR FAILED ===== %s (class=%s)", + result.error, + class_uri, + ) + return result + result.success = True + result.mapping = submitted + result.iterations = current_iteration + result.usage = total_usage + logger.info( + "===== ENTITY GENERATOR COMPLETE ===== class=%s, iterations=%d, " + "prompt_tokens=%d, completion_tokens=%d", + class_uri, + result.iterations, + total_usage["prompt_tokens"], + total_usage["completion_tokens"], + ) + notify(f"Mapping for {class_label or class_uri} complete!", pct=100) + return result + + # Budget exhausted without a successful submit. + result.iterations = iteration_limit + result.usage = total_usage + result.error = "entity generator exhausted iteration budget" + logger.error("===== ENTITY GENERATOR FAILED ===== %s", result.error) + notify(result.error, pct=95) + return result diff --git a/src/agents/agent_mapping_pge/generators/relationship.py b/src/agents/agent_mapping_pge/generators/relationship.py new file mode 100644 index 00000000..7d32ad63 --- /dev/null +++ b/src/agents/agent_mapping_pge/generators/relationship.py @@ -0,0 +1,875 @@ +""" +OntoBricks Mapping-PGE RelationshipGenerator Agent. + +Sprint 5 of the Planner-Generator-Evaluator (PGE) redesign. + +The RelationshipGenerator is the sibling of :mod:`.entity` — same ReAct +loop shape and tooling discipline, narrower scope. 
It maps **one** ontology +property (relationship) at a time, given: + +* the property to map (uri, label, comment, domain, range), +* the source and target **entity mappings already produced by the + EntityGenerator** — crucially, the ``id_column`` each side mapped on, and +* a small SourceModel slice that surfaces the relevant join-key subgraph. + +The system prompt FORBIDS picking endpoint columns that do not match the +already-mapped entity IDs: the source/target endpoint columns are GIVEN. +This keeps relationships consistent with the entities they connect — if a +relationship's ``source_id`` doesn't match the source entity's ``id_column``, +the resulting SPARQL graph cannot join. + +The loop semantics mirror :mod:`.entity`: + +* Same default budget (12). +* Same 3-second inter-iteration delay. +* Same MLflow trace decorator. +* No single-shot fallback (terminate via tool call only). +* Strict ``property_uri`` match on terminal detection — a submit with the + wrong URI is coached via a corrective tool message, not accepted. +""" + +import json +import time +from dataclasses import dataclass, field +from typing import Any, Callable, Dict, List, Optional + +import requests + +from back.core.logging import get_logger +from agents.engine_base import ( + call_serving_endpoint, + dispatch_tool, + accumulate_usage, +) +from agents.tools.context import ToolContext +from agents.tools.mapping import ( + MAPPING_TOOL_DEFINITIONS_BY_NAME, + MAPPING_TOOL_HANDLERS, +) +from agents.tools.planner import ( + SAMPLE_TABLE_DEF, + tool_sample_table, +) +from agents.tools.sql import ( + SQL_TOOL_DEFINITIONS, + SQL_TOOL_HANDLERS, +) +from agents.tracing import trace_agent + +logger = get_logger(__name__) + +MAX_ITERATIONS = 12 +LLM_TIMEOUT = 180 +_ITERATION_DELAY_SEC = 3 +# See planner._MAX_TOKENS comment — large UNION ALL queries for cross-source +# relationships can exceed a small ceiling. 
+_MAX_TOKENS = 50000 + +_TRACE_NAME = "mapping_pge_relationship_generator" + + +# ===================================================== +# Tool aggregation +# ===================================================== +# +# The RelationshipGenerator only needs: +# * execute_sql – validate the composed two-column SELECT. +# * sample_table – peek at endpoint columns when the join is +# ambiguous (rare; usually unnecessary). +# * submit_relationship_mapping – TERMINAL. +# +# We deliberately exclude: +# * get_ontology / get_metadata / get_documents_context — wrong stage. +# * column_value_overlap / distinct_count — already locked by the Planner. +# * submit_source_model / submit_entity_mapping — wrong stage. + +_SUBMIT_RELATIONSHIP_DEF: dict = MAPPING_TOOL_DEFINITIONS_BY_NAME[ + "submit_relationship_mapping" +] + +TOOL_DEFINITIONS: List[dict] = ( + SQL_TOOL_DEFINITIONS + + [SAMPLE_TABLE_DEF] + + [_SUBMIT_RELATIONSHIP_DEF] +) + +TOOL_HANDLERS: Dict[str, Callable] = { + **SQL_TOOL_HANDLERS, + "sample_table": tool_sample_table, + "submit_relationship_mapping": MAPPING_TOOL_HANDLERS[ + "submit_relationship_mapping" + ], +} + + +# ===================================================== +# Data classes +# ===================================================== + + +@dataclass +class RelationshipGenStep: + """One observable step of the RelationshipGenerator's execution. + + Mirrors :class:`.entity.EntityGenStep` — scoped to the relationship + generator so the orchestrator (Sprint 7) can render a per-property + timeline in the UI. + """ + + step_type: str # "tool_call" | "tool_result" | "output" + content: str + tool_name: str = "" + duration_ms: int = 0 + + +@dataclass +class RelationshipGenResult: + """Outcome of a single RelationshipGenerator invocation. + + ``mapping`` holds the submitted relationship-mapping dict (the same + shape the handler appends to ``ctx.relationships``) when ``success`` is + True. 
+ """ + + success: bool + mapping: Optional[dict] = None + steps: List[RelationshipGenStep] = field(default_factory=list) + iterations: int = 0 + error: str = "" + usage: Dict[str, int] = field(default_factory=dict) + + +# ===================================================== +# System prompt +# ===================================================== +# +# The RELATIONSHIP SQL RULES section is lifted verbatim from the legacy +# in-house mapping agent (the section starting "SQL RULES FOR +# RELATIONSHIPS"). To those rules we add the Sprint 5 constraints: the +# source and target ID columns are GIVEN by the already-produced entity +# mappings; the LLM may not pick different endpoint columns. + +SYSTEM_PROMPT = """\ +You are a senior data engineer. Your job is to map ONE ontology property \ +(relationship) to a single SQL SELECT query against Databricks source \ +table(s), validated against real data via execute_sql, and submitted via \ +submit_relationship_mapping. + +YOU WILL BE GIVEN +• ontology_property: the property to map (uri, label, comment, domain, range). +• source_entity_mapping / target_entity_mapping: the ALREADY-MAPPED endpoint \ +entities — each with its class_uri, id_column, and the exact SQL it ran. \ +READ BOTH SQLs: they are the source of truth for your endpoint values. +• source_model_slice: relevant_joins[] {from_ref, to_ref, confidence, \ +overlap_pct, kind} and candidate_tables[] the Planner curated. Prefer \ +high-overlap, high-confidence joins. + +THE EDGE MUST CONNECT EXISTING NODES +An edge row is (source_id, target_id). Each value MUST already exist as a \ +node id in the corresponding entity, or it "dangles" and the mapping is \ +rejected (the evaluator fails any mapping with >5% dangling on either side, \ +unless the Planner predicted a cross-source band). Three traps cause almost \ +all dangling — avoid all three: + +TRAP 1 — id_column is an ALIAS FOR A DERIVED EXPRESSION, not a real column. 
+Each entity mints its id with ``SELECT <id_expression> AS <id_column>`` (the \ +id_column is usually just ``ID``). That expression is often a canonical-key \ +normalization, e.g.:: + + CONCAT(regexp_extract(EPISODE_ID, '([a-f0-9][a-f0-9-]+-preg-[0-9]+)', 1), '-baby') + +There is no ``ID`` column to select. You MUST **reproduce the entity's id \ +EXPRESSION verbatim** (copied from its SQL), applied to your table, for the \ +endpoint. A raw column (a ``*_id`` join key, a trust-local id) will NOT match \ +→ 100% dangling. + +TRAP 2 — building from a table only ONE endpoint entity covers. +The two entities may be sourced from different trusts (compare the FROM \ +tables in each entity's SQL). Their id universes overlap only on the trust(s) \ +BOTH cover. Build the edge from a table present in BOTH entities' FROM lists \ +(the shared-coverage table). Building from a table only the target covers \ +makes every source_id absent from the source → 100% source-dangling (and \ +vice-versa). + +TRAP 3 — column-name / alias mismatch on submit. +Your SELECT MUST alias the two columns exactly ``AS source_id`` and \ +``AS target_id``, and you MUST submit ``source_id_column="source_id"`` and \ +``target_id_column="target_id"``. These name the columns IN YOUR EDGE OUTPUT, \ +NOT the entity's id_column. If they disagree with your SELECT aliases the \ +evaluator reads nothing and every edge dangles. + +WORKED EXAMPLE — ``Baby --hasApgarScore--> Apgar Score`` +Baby is sourced from {trust_a.maternity_episode, trust_b.delivery}; Apgar \ +Score from {trust_a.maternity_episode, trust_c.maternity_event}. Shared \ +coverage = trust_a only (Trap 2). Both ids share the canonical pregnancy core \ +with role suffixes (Trap 1). 
So build from trust_a, reproducing both \ +expressions from one row:: + + SELECT CONCAT(regexp_extract(EPISODE_ID, '([a-f0-9][a-f0-9-]+-preg-[0-9]+)', 1), '-baby') AS source_id, + CONCAT(regexp_extract(EPISODE_ID, '([a-f0-9][a-f0-9-]+-preg-[0-9]+)', 1), '-apgar') AS target_id + FROM fiifi_cdm_demo_catalog.trust_a.maternity_episode + WHERE regexp_extract(EPISODE_ID, '([a-f0-9][a-f0-9-]+-preg-[0-9]+)', 1) <> '' + +Building from trust_c (Apgar's natural home) would dangle 100% on the Baby \ +side, because Baby has no trust_c rows. + +TOOLS + • execute_sql – validate / probe your SELECT (runs with a \ +small LIMIT; the persisted mapping has none). + • sample_table – peek at real values when a column is \ +ambiguous. + • submit_relationship_mapping – TERMINAL. Call EXACTLY ONCE, only after a \ +clean dangling probe (see WORKFLOW step 4). + +SQL RULES +• SELECT exactly two columns: `` AS source_id, AS target_id`` (Trap 1 + Trap 3). +• Build FROM a table both entities cover (Trap 2). Same-trust FK joins: one \ +table, no join. Cross-source: a UNION ALL of per-source SELECTs (each source \ +that holds both cores), or a JOIN on the shared canonical key. +• No LIMIT, no ORDER BY. Always full table names (catalog.schema.table). + +WORKFLOW +1. Read BOTH entity SQLs. Extract each entity's id EXPRESSION (the \ +``SELECT AS ``) and its set of FROM tables. +2. Pick a shared-coverage table (Trap 2). Compose the two-column SELECT, \ +setting source_id to the source entity's id EXPRESSION and target_id to the \ +target entity's id EXPRESSION, reproduced verbatim and aliased ``AS \ +source_id`` / ``AS target_id``. +3. Call execute_sql to confirm the query parses and returns two columns of \ +rows. Read any error and fix it; never submit an un-validated query. +4. SELF-VERIFY THE VALUES BEFORE SUBMITTING (MANDATORY GATE). 
Run this probe \ +via execute_sql: + + WITH rel AS (), + src AS (), + tgt AS () + SELECT + (SELECT COUNT(*) FROM rel) AS edges, + (SELECT COUNT(*) FROM rel r WHERE r.source_id NOT IN (SELECT ID FROM src)) AS dangling_src, + (SELECT COUNT(*) FROM rel r WHERE r.target_id NOT IN (SELECT ID FROM tgt)) AS dangling_tgt + + You may submit ONLY when ``dangling_src`` AND ``dangling_tgt`` are both 0 \ +(or a tiny fraction of edges). If either is high you hit Trap 1 or Trap 2 — \ +fix the endpoint expression or switch to the shared-coverage table, then \ +re-run this probe. Do NOT submit on an unrun or failing probe. +5. submit_relationship_mapping EXACTLY ONCE: property_uri, property_name, \ +sql_query (no LIMIT), source_id_column="source_id", target_id_column=\ +"target_id", domain, range_class. +6. Terminal — emit no free text after submitting. + +GENERAL RULES +• Only ever pass row-returning queries (SELECT / WITH …) to execute_sql. +• Do not call get_metadata, get_ontology, column_value_overlap, \ +distinct_count, submit_entity_mapping, or submit_source_model — they are \ +not available to you. The slice plus the entity mappings carry everything \ +you need. +• If a retry_hint is present at the top of the user message, treat it as \ +authoritative — your previous attempt failed for the reason stated; do NOT \ +repeat the same mistake. +""" + + +# ===================================================== +# Internal helpers +# ===================================================== + + +def _summarise_entity_mapping(em: dict, side: str) -> List[str]: + """One-block textual summary of a previously-produced entity mapping. + + Surfaces exactly the fields the LLM needs to constrain its endpoint + choice: the class_uri, the id_column it locked in, and the SQL it ran. + Anything else (label_column, attribute_mappings, …) is irrelevant to the + relationship task and is intentionally omitted to keep the prompt tight. 
+ """ + em = em or {} + class_uri = ( + em.get("ontology_class") or em.get("class_uri") or em.get("class") or "" + ) + id_column = em.get("id_column", "") + sql_query = em.get("sql_query", "") + return [ + f"{side.upper()} ENTITY MAPPING", + f" class_uri: {class_uri}", + f" id_column: {id_column}", + f" sql: {sql_query}", + ] + + +def _format_join(j: dict) -> str: + """Readable one-line rendering of a join entry from the slice. + + Defensive about missing fields — partial joins still render usefully so + a malformed slice doesn't blow up the prompt build. + """ + from_ref = j.get("from_ref", "?") + to_ref = j.get("to_ref", "?") + kind = j.get("kind", "?") + conf = j.get("confidence") + overlap = j.get("overlap_pct") + extras: List[str] = [] + if conf is not None: + try: + extras.append(f"confidence={float(conf):.2f}") + except (TypeError, ValueError): + extras.append(f"confidence={conf}") + if overlap is not None: + try: + extras.append(f"overlap_pct={float(overlap):.2f}") + except (TypeError, ValueError): + extras.append(f"overlap_pct={overlap}") + suffix = (" — " + ", ".join(extras)) if extras else "" + return f" - {from_ref} -> {to_ref} [{kind}]{suffix}" + + +def _build_user_prompt( + ontology_property: dict, + source_entity_mapping: dict, + target_entity_mapping: dict, + source_model_slice: dict, + retry_hint: Optional[str] = None, +) -> str: + """Render the per-property user prompt. + + Structure: + 1. retry_hint (if any) at the very top + 2. ontology property metadata + 3. source entity mapping summary (class_uri / id_column / sql) + 4. target entity mapping summary + 5. relevant joins (one line per join, readable) + 6. candidate_tables (raw JSON — small) + 7. a reminder block reiterating the two-column / endpoint-match rules + """ + parts: List[str] = [] + + if retry_hint: + parts.append("RETRY HINT (authoritative — your previous attempt FAILED):") + parts.append(retry_hint) + parts.append( + "DO NOT repeat the same column choice. 
If the hint mentions " + "'dangling' or 'canonical id': sample BOTH the candidate endpoint " + "column AND the entity's id_column, compare actual values, and " + "pick the column whose values overlap. Run the dangling-edge " + "probe (step 4 of WORKFLOW) BEFORE submitting this time.\n" + ) + + prop_uri = ontology_property.get("uri", "") + prop_label = ( + ontology_property.get("label") or ontology_property.get("name", "") + ) + prop_comment = ontology_property.get("comment", "") or "" + prop_domain = ontology_property.get("domain", "") or "" + prop_range = ontology_property.get("range", "") or "" + + parts.append("ONTOLOGY PROPERTY") + parts.append(f" uri: {prop_uri}") + parts.append(f" label: {prop_label}") + if prop_comment: + parts.append(f" comment: {prop_comment}") + parts.append(f" domain: {prop_domain}") + parts.append(f" range: {prop_range}") + + parts.append("") + parts.extend(_summarise_entity_mapping(source_entity_mapping, side="source")) + + parts.append("") + parts.extend(_summarise_entity_mapping(target_entity_mapping, side="target")) + + slice_obj = source_model_slice or {} + joins = slice_obj.get("relevant_joins") or [] + candidates = slice_obj.get("candidate_tables") or [] + + parts.append("") + parts.append("RELEVANT JOINS") + if joins: + for j in joins: + parts.append(_format_join(j)) + else: + parts.append(" (none surfaced by the Planner — fall back to a single-table SELECT if possible)") + + if candidates: + parts.append("") + parts.append("CANDIDATE TABLES") + parts.append(json.dumps(candidates, indent=2, default=str)) + + src_id = (source_entity_mapping or {}).get("id_column", "") + tgt_id = (target_entity_mapping or {}).get("id_column", "") + + parts.append("") + parts.append("REMINDERS (CRITICAL)") + parts.append( + " • The persisted SQL MUST return EXACTLY two columns aliased " + "AS source_id and AS target_id." 
@trace_agent(name="mapping_pge_relationship_generator")
def run_relationship_generator(
    host: str,
    token: str,
    endpoint_name: str,
    client: Any,
    *,
    ontology_property: dict,
    source_entity_mapping: dict,
    target_entity_mapping: dict,
    source_model_slice: dict,
    retry_hint: Optional[str] = None,
    on_step: Optional[Callable[[str, int], None]] = None,
    max_iterations: int = MAX_ITERATIONS,
) -> RelationshipGenResult:
    """Run the RelationshipGenerator agent for a single ontology property.

    The agent drives a tool-calling loop against the serving endpoint: the
    LLM composes a two-column SQL SELECT (``source_id`` / ``target_id``),
    probes it via ``execute_sql`` / ``sample_table``, and finishes by calling
    the terminal ``submit_relationship_mapping`` tool. The run succeeds only
    when that tool reports ``success`` AND a mapping whose ``property`` equals
    the requested URI is present in the tool context. A submit for a
    different URI is answered with a corrective tool message and the loop
    continues.

    Args:
        host: Databricks workspace URL.
        token: Bearer token for the serving endpoint.
        endpoint_name: Foundation Model serving endpoint name.
        client: Databricks SQL client handed to the tool context.
        ontology_property: Dict for the SINGLE property to map (uri, label,
            comment, domain, range).
        source_entity_mapping: The already-mapped source entity.
        target_entity_mapping: The already-mapped target entity.
        source_model_slice: SourceModel slice with ``relevant_joins`` and
            optional ``candidate_tables``.
        retry_hint: Optional hint from a previous failed attempt; surfaced at
            the top of the user prompt.
        on_step: Optional progress callback ``(msg, pct)``.
        max_iterations: Upper bound on LLM round-trips. ``None`` falls back
            to the module-level ``MAX_ITERATIONS``.

    Returns:
        A :class:`RelationshipGenResult`. On success ``mapping`` holds the
        submitted dict; on failure ``error`` explains why and ``mapping`` is
        ``None``. ``iterations`` and ``usage`` are filled in either way.
        Endpoint/transport errors are reported through ``error``, not raised.
    """
    iteration_limit = max_iterations if max_iterations is not None else MAX_ITERATIONS

    prop = ontology_property or {}
    model_slice = source_model_slice or {}
    property_uri = prop.get("uri", "")
    property_label = prop.get("label") or prop.get("name", "")
    n_joins = len(model_slice.get("relevant_joins") or [])
    n_candidates = len(model_slice.get("candidate_tables") or [])

    logger.info(
        "===== RELATIONSHIP GENERATOR START ===== endpoint=%s, property=%s (%s), "
        "joins=%d, candidate_tables=%d, retry_hint=%s, max_iter=%d",
        endpoint_name,
        property_label,
        property_uri,
        n_joins,
        n_candidates,
        "yes" if retry_hint else "no",
        iteration_limit,
    )

    ctx = ToolContext(
        host=host.rstrip("/"),
        token=token,
        client=client,
        # The slice + entity mappings subsume metadata/ontology for this
        # agent; the unified ToolContext still wants these fields, so we
        # leave them empty.
        metadata={},
        ontology={},
        documents=[],
    )

    result = RelationshipGenResult(success=False)

    user_prompt = _build_user_prompt(
        prop,
        source_entity_mapping or {},
        target_entity_mapping or {},
        model_slice,
        retry_hint=retry_hint,
    )
    messages: List[dict] = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": user_prompt},
    ]
    logger.info(
        "RelationshipGenerator conversation initialized: system=%d chars, user=%d chars",
        len(SYSTEM_PROMPT),
        len(user_prompt),
    )

    total_usage: Dict[str, int] = {"prompt_tokens": 0, "completion_tokens": 0}

    def _progress_pct(iteration_idx: int) -> int:
        # Map the iteration index onto 5..95 so 100 stays reserved for success.
        ratio = (iteration_idx + 1) / max(iteration_limit, 1)
        return min(5 + int(ratio * 90), 95)

    def notify(msg: str, *, pct: Optional[int] = None) -> None:
        actual_pct = pct if pct is not None else 5
        logger.info("RELATIONSHIP GEN STEP [%d%%] %s", actual_pct, msg)
        if on_step:
            on_step(msg, actual_pct)

    def fail(error: str, iterations: int) -> RelationshipGenResult:
        # Single place that stamps a failed result before returning it.
        result.error = error
        result.iterations = iterations
        result.usage = total_usage
        return result

    def parse_json_object(raw: Any) -> dict:
        # Tool arguments / tool results must be JSON objects. Anything else
        # (invalid JSON, null, a list, a bare string) degrades to {} so the
        # later ``.get`` calls cannot raise.
        if isinstance(raw, dict):
            return raw
        try:
            decoded = json.loads(raw)
        except (TypeError, ValueError):
            return {}
        return decoded if isinstance(decoded, dict) else {}

    notify(f"Generating mapping for {property_label or property_uri}…", pct=1)

    # Snapshot the pre-existing relationship count so we can detect "this
    # run added a mapping" without relying on absolute counters. Future-proof
    # for an orchestrator that reuses a ToolContext across calls.
    pre_run_count = len(ctx.relationships)

    # ------------------------------------------------------------------
    # Agent loop
    # ------------------------------------------------------------------
    for iteration in range(iteration_limit):
        if iteration > 0:
            logger.debug(
                "Iteration %d: waiting %ds before LLM call (rate limit mitigation)",
                iteration + 1,
                _ITERATION_DELAY_SEC,
            )
            time.sleep(_ITERATION_DELAY_SEC)

        current_iteration = iteration + 1
        pct = _progress_pct(iteration)
        logger.info(
            "----- RelationshipGenerator iteration %d/%d — %d messages, mapping=%s -----",
            current_iteration,
            iteration_limit,
            len(messages),
            "set" if len(ctx.relationships) > pre_run_count else "unset",
        )
        notify(
            f"Mapping iteration {current_iteration}/{iteration_limit}…",
            pct=pct,
        )

        # Monotonic clock: durations must not be skewed by wall-clock changes.
        t0 = time.monotonic()
        try:
            llm_response = call_serving_endpoint(
                host,
                token,
                endpoint_name,
                messages,
                tools=TOOL_DEFINITIONS,
                max_tokens=_MAX_TOKENS,
                temperature=0.1,
                timeout=LLM_TIMEOUT,
                trace_name=_TRACE_NAME,
            )
        except requests.exceptions.HTTPError as exc:
            status = exc.response.status_code if exc.response is not None else "?"
            logger.warning(
                "RelationshipGenerator iteration %d: HTTPError status=%s",
                current_iteration,
                status,
            )
            logger.debug(
                "RelationshipGenerator iteration %d: HTTPError body: %.500s",
                current_iteration,
                exc.response.text if exc.response is not None else "N/A",
            )
            if exc.response is not None and status in (400, 422):
                logger.error(
                    "RelationshipGenerator: endpoint refused tools — cannot produce a mapping"
                )
                return fail(
                    "LLM endpoint does not support function calling",
                    current_iteration,
                )
            logger.error(
                "RelationshipGenerator: LLM request failed at iteration %d: %s",
                current_iteration,
                exc,
            )
            return fail(f"LLM request failed: {exc}", current_iteration)
        except requests.exceptions.ReadTimeout:
            logger.error(
                "RelationshipGenerator: timeout at iteration %d", current_iteration
            )
            return fail(
                f"LLM request timed out after {LLM_TIMEOUT}s", current_iteration
            )
        except requests.exceptions.RequestException as exc:
            logger.error(
                "RelationshipGenerator: request exception at iteration %d: %s",
                current_iteration,
                exc,
            )
            return fail(f"LLM request failed: {exc}", current_iteration)

        elapsed_ms = int((time.monotonic() - t0) * 1000)
        logger.info(
            "RelationshipGenerator iteration %d: LLM responded in %dms",
            current_iteration,
            elapsed_ms,
        )

        accumulate_usage(total_usage, llm_response.get("usage") or {})

        # Responses may carry an empty ``choices`` list or explicit nulls for
        # ``message`` / ``tool_calls``; normalise them so the "no tool call"
        # branch below handles those cases instead of an exception.
        choices = llm_response.get("choices") or [{}]
        choice = choices[0] or {}
        finish_reason = choice.get("finish_reason", "?")
        message = choice.get("message") or {}
        tool_calls = message.get("tool_calls") or []
        has_content = bool(message.get("content"))
        logger.info(
            "RelationshipGenerator iteration %d: finish_reason=%s, tool_calls=%d, has_content=%s",
            current_iteration,
            finish_reason,
            len(tool_calls),
            has_content,
        )

        if not tool_calls:
            # The Generator must terminate via submit_relationship_mapping,
            # never via free text.
            full_text = message.get("content") or ""
            logger.warning(
                "RelationshipGenerator iteration %d: produced text without submitting mapping — %d chars",
                current_iteration,
                len(full_text),
            )
            result.steps.append(
                RelationshipGenStep(
                    step_type="output",
                    content=full_text[:500],
                    duration_ms=elapsed_ms,
                )
            )
            fail(
                "relationship generator produced text without submitting mapping",
                current_iteration,
            )
            notify(
                "Relationship generator produced text without submitting mapping.",
                pct=pct,
            )
            return result

        logger.info(
            "RelationshipGenerator iteration %d: processing %d tool call(s): [%s]",
            current_iteration,
            len(tool_calls),
            ", ".join(
                str((tc.get("function") or {}).get("name") or "?")
                for tc in tool_calls
            ),
        )
        messages.append(message)

        terminal_success = False
        for tc_idx, tc in enumerate(tool_calls, 1):
            func = tc.get("function") or {}
            tool_name = func.get("name") or ""
            tool_id = tc.get("id", "")
            arguments = parse_json_object(func.get("arguments", "{}"))

            logger.info(
                "RelationshipGenerator iteration %d: calling tool '%s' (%d/%d)",
                current_iteration,
                tool_name,
                tc_idx,
                len(tool_calls),
            )

            if tool_name == "submit_relationship_mapping":
                notify(
                    f"Submitting mapping for {property_label or property_uri}…",
                    pct=pct,
                )
            elif tool_name == "sample_table":
                fn = arguments.get("full_name", "?")
                notify(f"Sampling {fn}…", pct=pct)
            elif tool_name == "execute_sql":
                # ``sql`` may be absent or null; never slice a non-string.
                sql_preview = str(arguments.get("sql") or "")[:80]
                notify(f"Running SQL: {sql_preview}…", pct=pct)
            else:
                notify(f"Calling {tool_name}…", pct=pct)

            result.steps.append(
                RelationshipGenStep(
                    step_type="tool_call",
                    content=json.dumps(arguments, default=str)[:500],
                    tool_name=tool_name,
                )
            )

            t1 = time.monotonic()
            tool_result = dispatch_tool(
                TOOL_HANDLERS, ctx, tool_name, arguments, trace_name=_TRACE_NAME
            )
            tool_ms = int((time.monotonic() - t1) * 1000)

            logger.info(
                "RelationshipGenerator iteration %d: tool '%s' returned %d chars in %dms",
                current_iteration,
                tool_name,
                len(tool_result),
                tool_ms,
            )

            result.steps.append(
                RelationshipGenStep(
                    step_type="tool_result",
                    content=(
                        (tool_result[:500] + "…")
                        if len(tool_result) > 500
                        else tool_result
                    ),
                    tool_name=tool_name,
                    duration_ms=tool_ms,
                )
            )

            messages.append(
                {
                    "role": "tool",
                    "tool_call_id": tool_id,
                    "content": tool_result,
                }
            )

            # Detect terminal success: submit_relationship_mapping returned
            # success=True AND a mapping for THIS property_uri is present in
            # ctx.relationships. A submit with a mismatched property_uri is
            # NOT terminal — we coach the LLM via a corrective tool message
            # and let the loop continue.
            if tool_name != "submit_relationship_mapping":
                continue
            if parse_json_object(tool_result).get("success") is not True:
                continue

            matched = any(
                m.get("property") == property_uri for m in ctx.relationships
            )
            if matched:
                terminal_success = True
                logger.info(
                    "RelationshipGenerator iteration %d: submit_relationship_mapping succeeded — terminating",
                    current_iteration,
                )
                continue

            submitted_uri = arguments.get("property_uri", "")
            mismatch_msg = (
                f"submitted property_uri '{submitted_uri}' does "
                f"not match requested property_uri "
                f"'{property_uri}'; resubmit with "
                f"property_uri='{property_uri}'"
            )
            logger.warning(
                "RelationshipGenerator iteration %d: submit_relationship_mapping "
                "property_uri mismatch — submitted=%s, requested=%s",
                current_iteration,
                submitted_uri,
                property_uri,
            )
            corrective_payload = json.dumps(
                {"success": False, "error": mismatch_msg}
            )
            # Replace the recorded tool_result step's content so
            # the UI / trace shows the corrective signal.
            result.steps[-1] = RelationshipGenStep(
                step_type="tool_result",
                content=corrective_payload,
                tool_name=tool_name,
                duration_ms=result.steps[-1].duration_ms,
            )
            # Replace the tool message on the conversation so
            # the LLM sees the corrective payload next turn.
            messages[-1] = {
                "role": "tool",
                "tool_call_id": tool_id,
                "content": corrective_payload,
            }

        if terminal_success:
            # Pull the mapping for this property by strict URI match,
            # preferring the most recently added one.
            submitted = next(
                (
                    m
                    for m in reversed(ctx.relationships)
                    if m.get("property") == property_uri
                ),
                None,
            )
            if submitted is None:
                fail(
                    "internal: submit succeeded but mapping not found for property_uri",
                    current_iteration,
                )
                logger.error(
                    "===== RELATIONSHIP GENERATOR FAILED ===== %s (property=%s)",
                    result.error,
                    property_uri,
                )
                return result
            result.success = True
            result.mapping = submitted
            result.iterations = current_iteration
            result.usage = total_usage
            logger.info(
                "===== RELATIONSHIP GENERATOR COMPLETE ===== property=%s, iterations=%d, "
                "prompt_tokens=%d, completion_tokens=%d",
                property_uri,
                result.iterations,
                total_usage["prompt_tokens"],
                total_usage["completion_tokens"],
            )
            notify(
                f"Mapping for {property_label or property_uri} complete!", pct=100
            )
            return result

    # Budget exhausted without a successful submit.
    fail("relationship generator exhausted iteration budget", iteration_limit)
    logger.error("===== RELATIONSHIP GENERATOR FAILED ===== %s", result.error)
    notify(result.error, pct=95)
    return result
+It consumes the ontology, table metadata, and any imported domain documents, +probes the source data via the planner tools (sample_table, column_value_overlap, +distinct_count) plus the shared tools (get_metadata, get_ontology, +get_documents_context, execute_sql), and emits a validated +:class:`SourceModel` via the ``submit_source_model`` terminal tool. + +The loop semantics mirror the prior single-loop mapping agent — same +``call_serving_endpoint`` + ``dispatch_tool`` ReAct cycle, same 3-second +inter-iteration delay, same accumulated usage tracking, same MLflow trace +decorator — with two key differences: + +* No fallback to single-shot generation. If the endpoint refuses tools, the + Planner returns failure (the Planner *needs* tools — it produces structured + output through ``submit_source_model``). +* Smaller default iteration budget (50 instead of 60) — the Planner is more + focused than the auto-mapping agent. +""" + +import json +import time +from dataclasses import dataclass, field +from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional + +import requests + +if TYPE_CHECKING: + from agents.agent_mapping_pge.contracts import SourceModel + +from back.core.logging import get_logger +from agents.engine_base import ( + call_serving_endpoint, + dispatch_tool, + accumulate_usage, +) +from agents.tools.context import ToolContext +from agents.tools.documents import ( + GET_DOCUMENTS_CONTEXT_DEF, + tool_get_documents_context, +) +from agents.tools.metadata import ( + GET_METADATA_DEF, + tool_get_metadata, +) +from agents.tools.ontology import ( + ONTOLOGY_TOOL_DEFINITIONS, + ONTOLOGY_TOOL_HANDLERS, +) +from agents.tools.planner import ( + PLANNER_TOOL_DEFINITIONS, + PLANNER_TOOL_HANDLERS, +) +from agents.tools.sql import ( + SQL_TOOL_DEFINITIONS, + SQL_TOOL_HANDLERS, +) +from agents.tracing import trace_agent + +logger = get_logger(__name__) + +MAX_ITERATIONS = 50 +LLM_TIMEOUT = 180 +_ITERATION_DELAY_SEC = 3 + +# The submit_source_model JSON for a 
@dataclass
class PlannerStep:
    """A single recorded event in a Planner run.

    Instances are collected in :attr:`PlannerResult.steps` in the order they
    occur.
    """

    # One of: tool_call | tool_result | output
    step_type: str
    # Text payload recorded for the step.
    content: str
    # Name of the tool involved; empty when the step is not tool-related.
    tool_name: str = field(default="")
    # Elapsed time attributed to the step, in milliseconds.
    duration_ms: int = field(default=0)


@dataclass
class PlannerResult:
    """What one Planner invocation produced.

    ``source_model`` is set only when the LLM called ``submit_source_model``
    with a structurally valid payload; otherwise ``success`` is ``False`` and
    ``error`` carries a short reason.
    """

    # True when a valid SourceModel was submitted.
    success: bool
    # The submitted model, or None on failure.
    source_model: Optional["SourceModel"] = field(default=None)
    # Ordered trace of the run; a fresh list per instance.
    steps: List[PlannerStep] = field(default_factory=list)
    # Number of loop iterations consumed.
    iterations: int = field(default=0)
    # Short failure reason; empty on success.
    error: str = field(default="")
    # Token-usage counters; a fresh dict per instance.
    usage: Dict[str, int] = field(default_factory=dict)
Use to confirm a candidate canonical-ID column is \ +actually unique and complete. + • execute_sql – escape hatch for any check the four tools above \ +do not cover. Use sparingly — prefer the focused tools. + • submit_source_model – TERMINAL. Call exactly once, when the \ +SourceModel is complete and you are ready to hand off to the Generator. + +WORKFLOW +1. Call get_ontology AND get_metadata first to see what needs mapping and \ +what data is available. +2. Call get_documents_context to pick up any pre-loaded domain documents — \ +they often disambiguate column semantics. +3. For each table, decide which ontology class(es) it could realise — these \ +become table_roles[].ontology_class_candidates with a confidence and a one- \ +sentence reason. +4. For each ontology class, decide which column serves as its canonical \ +identifier in each table — record under canonical_ids[]. When you are \ +uncertain, run distinct_count to confirm uniqueness/completeness. +5. For each pair of tables that should join (intra-source FK or cross-source \ +value match), run column_value_overlap and only record join_keys[] when the \ +realised overlap_pct supports it. Use kind="same_trust_fk" for FK joins and \ +kind="cross_source_value_match" for value-matched joins across sources. \ +For any class mapped to 2+ tables, follow CANONICAL-KEY NORMALIZATION below \ +and PROVE the chosen keys overlap with normalized_value_overlap. +6. Build mapping_plan.entity_order so that BASE classes come first \ +(i.e. classes that are referenced by other classes through object properties \ +should be mapped before their referencers). Build \ +mapping_plan.relationship_order so that, by the time each relationship is \ +attempted, BOTH its domain and range classes have already appeared in \ +entity_order. List anything you cannot reasonably map under \ +mapping_plan.skip[] with a short reason. +7. Finally, call submit_source_model exactly once with the full JSON. 
The \ +call returns success=true when the model is structurally valid; if it \ +returns success=false, fix the indicated problem and call it again. + +CANONICAL-KEY NORMALIZATION (CRITICAL — this is the #1 cause of relationship dangling) +For any class whose canonical_id lists MORE THAN ONE table, run \ +column_value_overlap on a representative column pair to see whether the raw \ +values already share a format: + + • If overlap_pct > 0 → values are in compatible formats. Record bare \ +column names in canonical_column_per_table (e.g. ``"CUSTOMER_ID"``). \ +A UNION across the tables produces a coherent ID universe. + + • If overlap_pct == 0 → DO NOT conclude these are "different" or \ +"source-scoped" entities. When two tables both map to the SAME ontology \ +class, 0% overlap almost always means the SAME real-world key wrapped in \ +DIFFERENT source-local encodings (prefixes, suffixes, embedded sub-IDs). \ +Leaving them disjoint makes every relationship pointing at this class 100% \ +dangle — that is a FAILURE, not an acceptable outcome. You MUST normalize: + + STEP 1 — sample_table BOTH columns and read the raw values. Look for a \ +shared embedded substring across the sources — a stable inner identifier \ +(UUID, account number, ``...-ord-`` core) that appears in every source's \ +value with only the surrounding prefix/suffix differing. + + STEP 2 — write ONE scalar SQL expression PER TABLE that strips the \ +source-specific wrapping and exposes that shared core in an identical form. \ +Prefer extracting the shared core over stripping a single known prefix \ +(extraction is robust to multiple prefixes). 
When matching a hex/UUID core, \ +ALWAYS anchor the regex with a leading character class so a preceding dash \ +is not captured: + ✗ WRONG: regexp_extract(ORDER_REF, '([a-f0-9-]+-ord-[0-9]+)', 1) + → returns "--ord-1" (leading dash) — will NOT match + ✓ RIGHT: regexp_extract(ORDER_REF, '([a-f0-9][a-f0-9-]+-ord-[0-9]+)', 1) + → returns "-ord-1" + + STEP 3 — for a DERIVED / child key (e.g. an OrderLine, Shipment or Payment \ +that hangs off an order), DO NOT concatenate a suffix onto the RAW prefixed \ +local id — that re-introduces the source prefix and the keys stay disjoint. \ +Extract the shared core FIRST, then append the role suffix, so every source \ +yields the identical synthetic key: + ✗ WRONG: source_a "CONCAT(ORDER_REF, '-line')" (→ SA--ord-1-line) + source_b "line_id" (→ SB-LN-SB--ord-1) + ✓ RIGHT: source_a "CONCAT(regexp_extract(ORDER_REF, '([a-f0-9][a-f0-9-]+-ord-[0-9]+)', 1), '-line')" + source_b "CONCAT(regexp_extract(line_id, '([a-f0-9][a-f0-9-]+-ord-[0-9]+)', 1), '-line')" + (both → -ord-1-line) + + STEP 4 — PROVE IT. Call normalized_value_overlap with your two \ +expressions. It MUST return overlap_pct > 0. If it is still 0, your \ +expressions land in different value spaces — go back to STEP 1 and fix them. \ +Do NOT call submit_source_model with an unverified normalization. + + (If, after sampling, a table genuinely cannot expose the shared core at \ +all, omit that table from canonical_column_per_table and note why — but this \ +is rare; exhaust STEP 1–4 first.) + + • COMPLETENESS — list EVERY covering source. When more than one source table \ +realises the SAME class, include ALL of them in canonical_column_per_table, \ +not just the two you checked overlap on. The same real-world entity is \ +typically present across multiple sources (e.g. source_a AND source_b AND \ +source_c); omitting one drops a large fraction of that entity's real instances \ +and makes every relationship pointing at it partially dangle. 
During candidate \ +discovery (step 3) actively look for the class across all source schemas before \ +you settle on its canonical_ids. + + • Whatever expression you record, the EntityGenerator drops it verbatim \ +into the SELECT aliased AS ID. Bare column names and SQL expressions are \ +both valid here. + + • Always update format_note to one sentence describing what the canonical \ +key looks like (e.g. ``"-ord- core extracted from each \ +source's local order id"``). Downstream agents read this. + +SOURCEMODEL JSON SCHEMA (these key names are LOAD-BEARING — do not improvise) +The `model` argument to submit_source_model has exactly this shape: + +{ + "table_roles": [ + { + "table": "", // STRING — required key is "table" + "ontology_class_candidates": [ + {"uri": "", "confidence": 0.0, "reason": ""} + ] + } + ], + "canonical_ids": [ + { + "ontology_class": "", // STRING — required key is "ontology_class" + // VALUES may be either a bare column name OR a SQL expression that + // produces the canonical key for that table. Use a SQL expression + // when raw column values across the listed tables are in different + // formats (see CANONICAL-KEY NORMALIZATION below). + "canonical_column_per_table": {"": ""}, + "format_note": "" + } + ], + "join_keys": [ + { + "from_ref": "
.", // STRING — required key is "from_ref" + "to_ref": "
.", // STRING — required key is "to_ref" + "confidence": 0.0, + "overlap_pct": 0.0, + "kind": "same_trust_fk" // or "cross_source_value_match" + } + ], + "mapping_plan": { + "entity_order": ["", "..."], + "relationship_order": ["", "..."], + "skip": [ + {"item": "", "reason": ""} // required keys: "item", "reason" + ] + } +} + +Key-name traps to avoid: +• Use "table" (not "name", "table_name", "uri") in each table_roles[] entry. +• Use "ontology_class" (not "class", "uri") in each canonical_ids[] entry. +• Use "from_ref" / "to_ref" (not "from" / "to" / "source" / "target") in each join_keys[] entry. +• Use "item" (not "uri", "property") in each mapping_plan.skip[] entry. + +INVARIANTS (the orchestrator will enforce these) +• Every URI in entity_order MUST exist in the ontology AND have at least one \ +candidate in table_roles[].ontology_class_candidates. +• Every URI in relationship_order MUST reference a property whose domain \ +class and range class both appear in entity_order at an EARLIER position. +• All confidence values are floats in [0.0, 1.0]. +• kind on each join_key is EXACTLY one of: "same_trust_fk", \ +"cross_source_value_match". +• Call submit_source_model EXACTLY ONCE, at the end. Do not emit a free-text \ +summary afterwards — submit_source_model is the terminal step. + +GENERAL RULES +• Prefer the focused tools (sample_table, column_value_overlap, \ +normalized_value_overlap, distinct_count) over execute_sql. +• Validate candidate join keys with column_value_overlap before adding them \ +to join_keys[]. +• You may batch multiple independent tool calls in a single response. +• Only ever pass row-returning queries (SELECT / WITH …) to execute_sql. 
# =====================================================
# Internal helpers
# =====================================================

# UI progress text for tools whose message does not depend on the arguments.
_STATIC_TOOL_MESSAGES = {
    "submit_source_model": "Submitting source model…",
    "get_metadata": "Retrieving table metadata…",
    "get_ontology": "Retrieving ontology…",
    "get_documents_context": "Retrieving documents…",
    "column_value_overlap": "Checking column overlap…",
    "normalized_value_overlap": "Verifying canonical-key normalization…",
    "distinct_count": "Checking distinct count…",
}


def _scope_names(items: List[dict]) -> str:
    """Join the ``name`` of each item, substituting ``?`` for a missing/empty name."""
    return ", ".join(str(item.get("name") or "?") for item in items)


def _build_user_prompt(
    entities: List[dict], relationships: List[dict], n_tables: int
) -> str:
    """Build the opening user message that scopes the Planner's task.

    Args:
        entities: Ontology entity dicts; only their ``name`` is read.
        relationships: Ontology relationship dicts; only their ``name`` is read.
        n_tables: Number of source tables in the imported metadata.

    Returns:
        A newline-joined prompt: one summary sentence, then optional
        "Entities in scope" / "Relationships in scope" lines.
    """
    parts = [
        (
            f"Build a SourceModel for {n_tables} table(s), {len(entities)} ontology "
            f"entity/entities, and {len(relationships)} relationship(s). "
            "Start by calling get_ontology, get_metadata, and get_documents_context."
        )
    ]
    if entities:
        parts.append(f"Entities in scope: {_scope_names(entities)}")
    if relationships:
        parts.append(f"Relationships in scope: {_scope_names(relationships)}")
    prompt = "\n".join(parts)
    logger.debug("_build_user_prompt (%d chars):\n%s", len(prompt), prompt)
    return prompt


def _parse_tool_arguments(raw_args: Any, tool_name: str) -> dict:
    """Decode a tool call's ``arguments`` payload into a dict.

    Accepts a JSON string, an already-decoded dict, or a null/empty payload.
    Anything that is not a JSON object falls back to ``{}`` (the tool is still
    dispatched, as before) but the fallback is logged instead of being silent.
    """
    if isinstance(raw_args, dict):
        return raw_args
    try:
        parsed = json.loads(raw_args or "{}")
    except (TypeError, ValueError):  # JSONDecodeError is a ValueError
        logger.warning(
            "Planner: tool '%s' arguments are not valid JSON — using {}", tool_name
        )
        return {}
    if not isinstance(parsed, dict):
        logger.warning(
            "Planner: tool '%s' arguments are not a JSON object — using {}",
            tool_name,
        )
        return {}
    return parsed


def _tool_progress_message(tool_name: str, arguments: dict) -> str:
    """Return the human-readable progress line shown in the UI for a tool call."""
    if tool_name == "sample_table":
        return f"Sampling {arguments.get('full_name', '?')}…"
    if tool_name == "execute_sql":
        # str() guards against a non-string "sql" argument from the model.
        sql_preview = str(arguments.get("sql") or "")[:80]
        return f"Running SQL: {sql_preview}…"
    return _STATIC_TOOL_MESSAGES.get(tool_name, f"Calling {tool_name}…")


# =====================================================
# Public entry point
# =====================================================


@trace_agent(name="mapping_pge_planner")
def run_planner(
    host: str,
    token: str,
    endpoint_name: str,
    client: Any,
    metadata: dict,
    ontology: dict,
    *,
    documents: Optional[list] = None,
    on_step: Optional[Callable[[str, int], None]] = None,
    max_iterations: int = MAX_ITERATIONS,
) -> PlannerResult:
    """Run the Planner agent.

    The Planner autonomously produces a :class:`SourceModel` by exploring the
    ontology, metadata, documents, and source data via tool calls. It
    terminates as soon as it submits a structurally-valid SourceModel via the
    terminal ``submit_source_model`` tool.

    Args:
        host: Databricks workspace URL.
        token: Bearer token for the serving endpoint.
        endpoint_name: Foundation Model serving endpoint name.
        client: Databricks SQL client (must expose ``execute_query(sql)``).
        metadata: Imported domain metadata (``{"tables": [...]}``).
        ontology: Imported ontology (``{"entities": [...], "relationships": [...]}``).
        documents: Optional pre-loaded domain documents.
        on_step: Optional progress callback ``(msg, pct)`` for UI updates.
        max_iterations: Upper bound on tool-call iterations; ``None`` falls
            back to ``MAX_ITERATIONS``.

    Returns:
        A :class:`PlannerResult`. ``success`` is True iff a SourceModel was
        successfully submitted; in that case ``source_model`` holds the
        validated dataclass. On failure, ``error`` explains why and
        ``source_model`` is None. Request errors, an empty LLM response and a
        text-only answer all end the run as failures; none of them raise.
    """
    iteration_limit = max_iterations if max_iterations is not None else MAX_ITERATIONS

    entities = (ontology or {}).get("entities") or []
    relationships = (ontology or {}).get("relationships") or []
    n_tables = len((metadata or {}).get("tables") or [])

    logger.info(
        "===== PLANNER START ===== endpoint=%s, tables=%d, entities=%d, relationships=%d, max_iter=%d",
        endpoint_name,
        n_tables,
        len(entities),
        len(relationships),
        iteration_limit,
    )

    ctx = ToolContext(
        host=host.rstrip("/"),
        token=token,
        client=client,
        metadata=metadata or {},
        ontology=ontology or {},
        documents=list(documents or []),
    )

    result = PlannerResult(success=False)

    user_prompt = _build_user_prompt(entities, relationships, n_tables)
    messages: List[dict] = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": user_prompt},
    ]
    logger.info(
        "Planner conversation initialized: system=%d chars, user=%d chars",
        len(SYSTEM_PROMPT),
        len(user_prompt),
    )

    total_usage: Dict[str, int] = {"prompt_tokens": 0, "completion_tokens": 0}

    def _progress_pct(iteration_idx: int) -> int:
        # Linear ramp from 5 → 95 across the iteration budget. The terminal
        # submit_source_model call is what sets 100.
        ratio = (iteration_idx + 1) / max(iteration_limit, 1)
        return min(5 + int(ratio * 90), 95)

    def notify(msg: str, *, pct: Optional[int] = None):
        actual_pct = pct if pct is not None else 5
        logger.info("PLANNER STEP [%d%%] %s", actual_pct, msg)
        if on_step:
            on_step(msg, actual_pct)

    def fail(error: str, iterations: int) -> PlannerResult:
        # Single place that stamps a failed run onto the shared result object.
        result.error = error
        result.iterations = iterations
        result.usage = total_usage
        return result

    notify("Starting planner…", pct=1)

    # ------------------------------------------------------------------
    # Agent loop
    # ------------------------------------------------------------------
    for iteration in range(iteration_limit):
        # Rate-limit mitigation — same 3s delay as the legacy mapping agent.
        if iteration > 0:
            logger.debug(
                "Iteration %d: waiting %ds before LLM call (rate limit mitigation)",
                iteration + 1,
                _ITERATION_DELAY_SEC,
            )
            time.sleep(_ITERATION_DELAY_SEC)

        current_iteration = iteration + 1
        pct = _progress_pct(iteration)
        logger.info(
            "----- Planner iteration %d/%d — %d messages, source_model=%s -----",
            current_iteration,
            iteration_limit,
            len(messages),
            "set" if ctx.source_model is not None else "unset",
        )
        notify(f"Planning iteration {current_iteration}/{iteration_limit}…", pct=pct)

        t0 = time.time()
        try:
            llm_response = call_serving_endpoint(
                host,
                token,
                endpoint_name,
                messages,
                tools=TOOL_DEFINITIONS,
                max_tokens=_MAX_TOKENS,
                temperature=0.1,
                timeout=LLM_TIMEOUT,
                trace_name=_TRACE_NAME,
            )
        except requests.exceptions.HTTPError as exc:
            status = exc.response.status_code if exc.response is not None else "?"
            logger.warning(
                "Planner iteration %d: HTTPError status=%s", current_iteration, status
            )
            logger.debug(
                "Planner iteration %d: HTTPError body: %.500s",
                current_iteration,
                exc.response.text if exc.response is not None else "N/A",
            )
            # Tools are non-negotiable for the Planner — no single-shot fallback.
            if exc.response is not None and status in (400, 422):
                logger.error(
                    "Planner: endpoint refused tools — cannot produce a SourceModel"
                )
                return fail(
                    "LLM endpoint does not support function calling",
                    current_iteration,
                )
            logger.error(
                "Planner: LLM request failed at iteration %d: %s",
                current_iteration,
                exc,
            )
            return fail(f"LLM request failed: {exc}", current_iteration)
        except requests.exceptions.ReadTimeout:
            logger.error("Planner: timeout at iteration %d", current_iteration)
            return fail(
                f"LLM request timed out after {LLM_TIMEOUT}s", current_iteration
            )
        except requests.exceptions.RequestException as exc:
            logger.error(
                "Planner: request exception at iteration %d: %s",
                current_iteration,
                exc,
            )
            return fail(f"LLM request failed: {exc}", current_iteration)

        elapsed_ms = int((time.time() - t0) * 1000)
        logger.info(
            "Planner iteration %d: LLM responded in %dms", current_iteration, elapsed_ms
        )

        accumulate_usage(total_usage, llm_response.get("usage") or {})

        # An absent or empty "choices" list used to raise IndexError on [0];
        # report it as an ordinary failed run instead.
        choices = llm_response.get("choices") or []
        if not choices:
            logger.error(
                "Planner iteration %d: LLM response contained no choices",
                current_iteration,
            )
            return fail("LLM response contained no choices", current_iteration)

        choice = choices[0] or {}
        finish_reason = choice.get("finish_reason", "?")
        message = choice.get("message") or {}
        # "tool_calls" may be present but null; normalise to a list.
        tool_calls = message.get("tool_calls") or []
        has_content = bool(message.get("content"))
        logger.info(
            "Planner iteration %d: finish_reason=%s, tool_calls=%d, has_content=%s",
            current_iteration,
            finish_reason,
            len(tool_calls),
            has_content,
        )
        # A tool call truncated by the max_tokens ceiling produces malformed
        # arguments and the tool can't recover. Flag it loudly so future runs
        # don't silently waste iterations resubmitting the same broken JSON.
        if finish_reason == "length" and tool_calls:
            logger.error(
                "Planner iteration %d: finish_reason=length on a tool call — "
                "arguments were likely truncated. Consider bumping max_tokens.",
                current_iteration,
            )

        if not tool_calls:
            # The Planner must end with submit_source_model, not free text.
            # If we see text without a terminal call, that's a failure.
            full_text = message.get("content") or ""
            logger.warning(
                "Planner iteration %d: produced text without submitting source model — %d chars",
                current_iteration,
                len(full_text),
            )
            result.steps.append(
                PlannerStep(
                    step_type="output",
                    content=full_text[:500],
                    duration_ms=elapsed_ms,
                )
            )
            fail(
                "planner produced text without submitting source model",
                current_iteration,
            )
            notify("Planner produced text without submitting source model.", pct=pct)
            return result

        # Tool-call branch — dispatch each call and accumulate steps.
        logger.info(
            "Planner iteration %d: processing %d tool call(s): [%s]",
            current_iteration,
            len(tool_calls),
            ", ".join(
                (tc.get("function") or {}).get("name", "?") for tc in tool_calls
            ),
        )
        messages.append(message)

        terminal_success = False
        for tc_idx, tc in enumerate(tool_calls, 1):
            func = tc.get("function") or {}
            tool_name = func.get("name", "")
            tool_id = tc.get("id", "")
            arguments = _parse_tool_arguments(func.get("arguments"), tool_name)

            logger.info(
                "Planner iteration %d: calling tool '%s' (%d/%d)",
                current_iteration,
                tool_name,
                tc_idx,
                len(tool_calls),
            )

            # Human-readable progress messages per tool — same pattern as
            # the legacy mapping agent for UI consistency.
            notify(_tool_progress_message(tool_name, arguments), pct=pct)

            result.steps.append(
                PlannerStep(
                    step_type="tool_call",
                    content=json.dumps(arguments, default=str)[:500],
                    tool_name=tool_name,
                )
            )

            t1 = time.time()
            tool_result = dispatch_tool(
                TOOL_HANDLERS, ctx, tool_name, arguments, trace_name=_TRACE_NAME
            )
            tool_ms = int((time.time() - t1) * 1000)

            logger.info(
                "Planner iteration %d: tool '%s' returned %d chars in %dms",
                current_iteration,
                tool_name,
                len(tool_result),
                tool_ms,
            )

            result.steps.append(
                PlannerStep(
                    step_type="tool_result",
                    content=(
                        (tool_result[:500] + "…")
                        if len(tool_result) > 500
                        else tool_result
                    ),
                    tool_name=tool_name,
                    duration_ms=tool_ms,
                )
            )

            messages.append(
                {
                    "role": "tool",
                    "tool_call_id": tool_id,
                    "content": tool_result,
                }
            )

            # Detect terminal success: submit_source_model returned success=True
            # *and* stamped a SourceModel onto the context. We break *after*
            # appending the tool result so the orchestrator sees a complete
            # message trail in conversation/replay.
            if tool_name == "submit_source_model":
                try:
                    parsed = json.loads(tool_result)
                except (TypeError, ValueError):
                    parsed = {}
                if (
                    isinstance(parsed, dict)
                    and parsed.get("success") is True
                    and ctx.source_model is not None
                ):
                    terminal_success = True
                    logger.info(
                        "Planner iteration %d: submit_source_model succeeded — terminating",
                        current_iteration,
                    )

        if terminal_success:
            result.success = True
            result.source_model = ctx.source_model
            result.iterations = current_iteration
            result.usage = total_usage
            logger.info(
                "===== PLANNER COMPLETE ===== iterations=%d, "
                "prompt_tokens=%d, completion_tokens=%d",
                result.iterations,
                total_usage["prompt_tokens"],
                total_usage["completion_tokens"],
            )
            notify("Planner completed!", pct=100)
            return result

    # Exhausted the iteration budget without ever calling submit_source_model
    # successfully (or the LLM kept calling other tools forever).
    fail(
        "planner exhausted iteration budget without submitting source model",
        iteration_limit,
    )
    logger.error("===== PLANNER FAILED ===== %s", result.error)
    notify(result.error, pct=95)
    return result
+MAX_OUTPUT_TOKENS = 16000 + +# Bounded PGE retry cap for the Evaluator stage (§3.5): how many times the +# deterministic Stage-1 ontology checks may feed retry_hints back into +# generation before owl delivery proceeds regardless. +MAX_OWL_EVAL_ROUNDS = 2 _TRACE_NAME = "owl_generator" @@ -95,9 +107,12 @@ def _load_pitfall_rules() -> str: # WORKFLOW 1. Call get_metadata to understand the database schema. -2. Call list_documents to discover available documents. -3. Read relevant documents with read_document. -4. Output ONLY the final Turtle ontology as plain text (starting with @prefix). +2. Call get_table_detail on EVERY table you intend to map a class to — get_metadata + truncates wide tables at 80 columns, and you must see the FULL column list to give + each class exhaustive attribute coverage (see # ATTRIBUTE COVERAGE). +3. Call list_documents to discover available documents. +4. Read relevant documents with read_document. +5. Output ONLY the final Turtle ontology as plain text (starting with @prefix). # NAMING RULES (CRITICAL – NO EXCEPTIONS) • Classes: PascalCase (Customer, SalesOrder) @@ -127,6 +142,34 @@ def _load_pitfall_rules() -> str: • For EVERY DatatypeProperty you MUST declare rdfs:domain on the property itself (do not rely on owl:Restriction alone — the platform reads attributes from rdfs:domain) +# ATTRIBUTE COVERAGE (CRITICAL — exhaustive, NOT curated) +The downstream mapping pipeline can only bind a SQL column to a class when that +column has a matching owl:DatatypeProperty with rdfs:domain on the class. A class +with few datatype properties produces an ID+Label-only entity that is USELESS for +analytics. So model attributes EXHAUSTIVELY, not minimally: +• For EVERY class, emit a DatatypeProperty for EVERY meaningful source column that + describes an instance of that class — across ALL tables that realise the class. + A single class is often realised by several source tables (e.g. 
one per source + system, region, or tenant) that each hold the same real-world entity in a local + schema; UNION their columns mentally and cover the full set. Use get_table_detail + on each covering table to see every column. +• "Meaningful" = a genuine attribute of the entity: dates, measurements, codes, + scores, names, statuses, flags, free-text notes. EXCLUDE ONLY: surrogate/auto- + increment row keys with no analytical value, audit columns (created_at, updated_by, + etl_*, _ingest_*), and the foreign-key columns that ObjectProperty relationships + already carry. +• When two sources expose the SAME attribute under different column names + (e.g. total_amount vs TOTAL_AMT; status vs STATUS_CODE), emit ONE datatype + property — do NOT emit a per-source duplicate. The mapping layer reconciles the + source columns. +• Name datatype properties in lowerCamelCase derived from business meaning + (order_date → orderDate, TOTAL_AMT → totalAmount). + Use ONLY [a-z][A-Za-z0-9]* — never underscores, hyphens, or backslash escapes. +• The "at least 2 datatype properties" floor in the guidelines is a MINIMUM, not a + target. Rich, real-world entities (a transaction, an encounter, an event, a core + business object) typically warrant 6–11 datatype properties. Aim for full column + coverage, not a tidy subset. + # RELATIONSHIP RULES • NEVER create bidirectional relationships. • Between any two classes A and B create at most ONE ObjectProperty. @@ -160,10 +203,12 @@ def _load_pitfall_rules() -> str: ## 2. Class and property design rules For each **class** you create:[1][2][3][4] -1. Provide: - - A short, clear natural-language definition (1–2 sentences). - - At least 1 object property (unless the class is explicitly abstract). - - At least 2 datatype properties, when meaningful in the domain. +1. Provide: + - A short, clear natural-language definition (1–2 sentences). + - At least 1 object property (unless the class is explicitly abstract). 
# Stage-1 absolute (Tier-1) ontology defects that the Evaluator forces a
# retry on. Coverage ratios are computed and logged but are advisory at the
# generation stage (they are Tier-2 in the scorecard), so they do not by
# themselves trigger a regeneration — only hard structural defects do.
_EVAL_ABSOLUTE_CHECKS = (
    "orphan_class_count",
    "dangling_domain_range_count",
    "naming_violation_count",
    "duplicate_class_count",
)

# Upper bound on the number of defect hints fed back, to keep the prompt bounded.
_EVAL_MAX_HINTS = 12


def _evaluate_ontology_stage(
    turtle_text: str, metadata: dict, iteration: int
) -> Optional[str]:
    """Run the Stage-1 deterministic ontology checks (§3.2) on *turtle_text*.

    Parses the Turtle into the registry shape, runs the shared intrinsic
    checks, and returns a concrete ``retry_hint`` feedback string when any
    Tier-1 absolute defect (orphan class, dangling domain/range, naming
    violation, duplicate class) is present — turning owl-gen into a real
    PGE loop. Returns ``None`` when the ontology is structurally clean.

    Args:
        turtle_text: Raw generator output, possibly wrapped in prose/markdown.
        metadata: Imported table metadata; ``None`` is treated as ``{}``.
        iteration: 1-based generation iteration, used only in log lines.

    Returns:
        The feedback prompt listing up to ``_EVAL_MAX_HINTS`` defects, or
        ``None`` when there is nothing to fix or the check could not run.

    Fails open: any parse/dep error returns ``None`` so a check failure
    never blocks OWL delivery (mirrors the pitfall-tool check).
    """
    try:
        from back.core.w3c.owl.OntologyParser import OntologyParser
        from back.objects.ontology.Ontology import Ontology
        from agents.pge_eval.ontology_metrics import evaluate_ontology

        # The model sometimes prepends a prose sentence or wraps the Turtle in
        # a markdown fence; strip that the same way the downstream registry
        # does, so the Evaluator parses real output instead of skipping.
        turtle_text = Ontology.clean_owl_output(turtle_text)
        parser = OntologyParser(turtle_text)
        ontology = {
            "classes": parser.get_classes(),
            "properties": parser.get_properties(),
        }
        metrics, issues, _footprint = evaluate_ontology(ontology, metadata or {})
        logger.info(
            "Iteration %d: ontology evaluator — metrics=%s",
            iteration,
            metrics,
        )

        absolute_issues = [
            i for i in issues if i.get("check") in _EVAL_ABSOLUTE_CHECKS
        ]
        if not absolute_issues:
            logger.info(
                "Iteration %d: ontology evaluator — no Tier-1 defects", iteration
            )
            return None

        lines = [
            "The ontology you produced has structural defects. Fix ALL of them "
            "and output ONLY the corrected Turtle (no markdown, no comments, "
            "starting with @prefix declarations):\n"
        ]
        # Cap feedback to keep the prompt bounded.
        for issue in absolute_issues[:_EVAL_MAX_HINTS]:
            # A missing "hint" used to raise KeyError, which the fail-open
            # handler below turned into "no defects" — dropping every other
            # hint too. Fall back to naming the failed check instead.
            hint = issue.get("hint") or f"Resolve the '{issue.get('check')}' defect."
            lines.append(f" • {hint}")
        return "\n".join(lines)
    except Exception as exc:  # noqa: BLE001
        logger.warning(
            "Iteration %d: ontology evaluator skipped due to error: %s",
            iteration,
            exc,
        )
        return None
Evaluator stage (PGE loop) — after the pitfall-tool loop is + # clean/maxed, run the Stage-1 deterministic ontology checks (§3.2). + # On a Tier-1 structural defect, feed concrete retry_hints back to + # the generator, bounded by MAX_OWL_EVAL_ROUNDS. Only retry when + # there's another iteration left, so a usable ontology is never + # discarded by exhausting MAX_ITERATIONS. + # -------------------------------------------------------------- + eval_feedback = _evaluate_ontology_stage(content, ctx.metadata, iteration + 1) + if ( + eval_feedback + and _owl_eval_rounds < MAX_OWL_EVAL_ROUNDS + and iteration < MAX_ITERATIONS - 1 + ): + _owl_eval_rounds += 1 + notify( + f"Ontology defects found — eval round " + f"{_owl_eval_rounds}/{MAX_OWL_EVAL_ROUNDS}…" + ) + result.steps.append( + AgentStep( + step_type="evaluator", + content=eval_feedback[:200], + duration_ms=0, + ) + ) + messages.append({"role": "assistant", "content": content}) + messages.append({"role": "user", "content": eval_feedback}) + logger.info( + "Iteration %d: ontology evaluator found defects — eval round %d", + iteration + 1, + _owl_eval_rounds, + ) + continue # next iteration will produce corrected OWL + # ── Accept this text as the final OWL ──────────────────────────── result.success = True result.owl_content = content diff --git a/src/agents/pge_eval/__init__.py b/src/agents/pge_eval/__init__.py new file mode 100644 index 00000000..bd997701 --- /dev/null +++ b/src/agents/pge_eval/__init__.py @@ -0,0 +1,18 @@ +"""OntoBricks PGE intrinsic-evaluation toolkit. + +A usecase-agnostic, gold-free scorecard for the PGE pipeline (ontology +generation + mapping generation). Intrinsic structural/self-consistency +metrics + an advisory LLM-judge — never a stored reference answer (D1). + +Public surface: + +* :func:`agents.pge_eval.scorecard.score_artifact` — the offline-testable + scoring core (D6). 
"""Tier-3 self-baseline storage (§3.4).

Each scored run is persisted under ``logs/goals/``. The baseline for the
next run is the pipeline's own *most recent accepted* (GREEN) scorecard —
never a domain answer key. This is how "did it get worse" is detected
without gold labels.
"""

import glob
import json
import os
from typing import Any, Dict, List, Optional

DEFAULT_BASELINE_DIR = "logs/goals"
_PREFIX = "scorecard_"


def _sort_key(card: Dict[str, Any]) -> Any:
    """Order scorecards by ``(timestamp, run_id)``.

    Both parts are coerced to ``str`` so cards whose timestamps were stored
    with different types (e.g. a number next to an ISO string) still compare
    instead of raising ``TypeError`` during the sort.
    """
    return (str(card.get("timestamp") or ""), str(card.get("run_id") or ""))


def save_scorecard(
    scorecard: Dict[str, Any], baseline_dir: str = DEFAULT_BASELINE_DIR
) -> str:
    """Persist *scorecard* and return the path written.

    The file is ``<baseline_dir>/scorecard_<run_id>.json``; every character of
    the run id outside ``[alnum-_.]`` is replaced by ``_`` and a missing run id
    becomes ``run``. The directory is created when absent.
    """
    os.makedirs(baseline_dir, exist_ok=True)
    run_id = scorecard.get("run_id") or "run"
    safe = "".join(ch if ch.isalnum() or ch in "-_." else "_" for ch in str(run_id))
    path = os.path.join(baseline_dir, f"{_PREFIX}{safe}.json")
    # Explicit encoding: the platform default is not guaranteed to be UTF-8.
    with open(path, "w", encoding="utf-8") as f:
        json.dump(scorecard, f, indent=2, default=str)
    return path


def _load_all(baseline_dir: str) -> List[Dict[str, Any]]:
    """Load every readable scorecard object stored in *baseline_dir*.

    Unreadable, undecodable or non-JSON files are skipped, as are payloads
    that are valid JSON but not an object (callers rely on ``dict.get``).
    """
    # Escape the directory so glob metacharacters in it are taken literally.
    pattern = os.path.join(glob.escape(baseline_dir), f"{_PREFIX}*.json")
    cards: List[Dict[str, Any]] = []
    # Sorted so tie-breaking between equal sort keys is deterministic.
    for p in sorted(glob.glob(pattern)):
        try:
            with open(p, encoding="utf-8") as f:
                card = json.load(f)
        except (OSError, ValueError):  # JSONDecodeError / UnicodeDecodeError
            continue
        if isinstance(card, dict):
            cards.append(card)
    return cards


def load_baseline(
    baseline_dir: str = DEFAULT_BASELINE_DIR,
    *,
    exclude_run_id: Optional[str] = None,
) -> Optional[Dict[str, Any]]:
    """Return the most recent accepted (GREEN) scorecard, or ``None``.

    A RED run never becomes a baseline — otherwise a regression would
    silently reset the bar. ``exclude_run_id`` drops the current run so a
    scorecard never baselines against itself; when it is ``None`` nothing is
    excluded, so a GREEN card stored without a ``run_id`` remains eligible.
    """
    cards = [
        c
        for c in _load_all(baseline_dir)
        if c.get("verdict") == "GREEN"
        and (exclude_run_id is None or c.get("run_id") != exclude_run_id)
    ]
    if not cards:
        return None
    cards.sort(key=_sort_key)
    return cards[-1]
# Gate tiers: Tier 1 = absolute hard gates, Tier 2 = ratio thresholds (warn
# unless promoted), Tier 3 = regression against the last accepted baseline.
# No domain identifiers, table names, or counts are encoded; every threshold
# is a generic structural bar.

from typing import Any, Dict, List, Optional

DEFAULT_RATIO_THRESHOLD = 0.90

# Directionality for the Tier-3 regression gate.
HIGHER_BETTER = "higher_better"
LOWER_BETTER = "lower_better"


def _tier1_spec(
    stage: str,
    key: str,
    direction: str,
    op: str,
    bound: Any,
    tol: Any,
    conditional: Optional[str] = None,
) -> Dict[str, Any]:
    """Build one Tier-1 (absolute gate) metric spec."""
    spec: Dict[str, Any] = {
        "stage": stage,
        "key": key,
        "tier": 1,
        "direction": direction,
        "op": op,
        "bound": bound,
        "tol": tol,
    }
    if conditional is not None:
        spec["conditional"] = conditional
    return spec


def _tier2_spec(stage: str, key: str) -> Dict[str, Any]:
    """Build one Tier-2 (ratio threshold) metric spec."""
    return {
        "stage": stage,
        "key": key,
        "tier": 2,
        "direction": HIGHER_BETTER,
        "tol": 0.02,
    }


# Each spec: stage, key, tier, direction, plus tier-specific config.
# ``conditional`` marks a metric that only gates when active (set at runtime).
METRIC_SPECS: List[Dict[str, Any]] = [
    # ---- Tier-1 absolute (ontology) ----
    _tier1_spec("ontology", "orphan_class_count", LOWER_BETTER, "==", 0, 0),
    _tier1_spec("ontology", "dangling_domain_range_count", LOWER_BETTER, "==", 0, 0),
    _tier1_spec("ontology", "naming_violation_count", LOWER_BETTER, "==", 0, 0),
    _tier1_spec("ontology", "duplicate_class_count", LOWER_BETTER, "==", 0, 0),
    # ---- Tier-1 absolute (mapping) ----
    _tier1_spec("mapping", "dangling_target_pct_max", LOWER_BETTER, "<", 0.05, 0.01),
    _tier1_spec("mapping", "dangling_source_pct_max", LOWER_BETTER, "<", 0.05, 0.01),
    _tier1_spec("mapping", "id_integrity", HIGHER_BETTER, "==", 1.0, 0.0),
    _tier1_spec("mapping", "sql_exec_failures", LOWER_BETTER, "==", 0, 0),
    _tier1_spec(
        "mapping",
        "cross_source_band_compliance",
        HIGHER_BETTER,
        "==",
        1.0,
        0.0,
        conditional="band_active",
    ),
    # ---- Tier-2 ratio ----
    _tier2_spec("ontology", "table_footprint_coverage"),
    _tier2_spec("ontology", "column_footprint_coverage"),
    _tier2_spec("mapping", "entity_completeness"),
    _tier2_spec("mapping", "relationship_completeness"),
    _tier2_spec("mapping", "attribute_coverage"),
]

# Comparison used by each supported Tier-1 operator.
_COMPARATORS = {
    "==": lambda observed, bound: observed == bound,
    "<": lambda observed, bound: observed < bound,
    "<=": lambda observed, bound: observed <= bound,
    ">=": lambda observed, bound: observed >= bound,
}


def _metric_label(spec: Dict[str, Any]) -> str:
    """Return the ``<stage>.<key>`` name used in gate reports."""
    return f"{spec['stage']}.{spec['key']}"


def get_metric(stages: Dict[str, Any], stage: str, key: str) -> Any:
    """Return ``stages[stage]["metrics"][key]``, or ``None`` when any level is absent or empty."""
    stage_block = stages.get(stage, {}) or {}
    metrics = stage_block.get("metrics", {}) or {}
    return metrics.get(key)


def _abs_pass(op: str, value: float, bound: float) -> bool:
    """Check *value* against *bound* with *op*.

    A ``None`` value never passes. An operator outside ``==``, ``<``, ``<=``,
    ``>=`` raises ``ValueError`` (only reached for a non-``None`` value).
    """
    if value is None:
        return False
    compare = _COMPARATORS.get(op)
    if compare is None:
        raise ValueError(f"unknown op {op!r}")
    return compare(value, bound)


def evaluate_tier1(
    stages: Dict[str, Any], *, active_conditionals: Optional[Dict[str, bool]] = None
) -> Dict[str, Any]:
    """Apply every Tier-1 absolute gate and collect the ones that fail.

    A spec carrying ``conditional`` is skipped unless that flag is truthy in
    *active_conditionals*.

    Returns:
        ``{"passed": bool, "failures": [{"metric", "observed", "expected"}]}``.
    """
    switches = active_conditionals or {}
    failures: List[Dict[str, Any]] = []
    for spec in (s for s in METRIC_SPECS if s["tier"] == 1):
        flag = spec.get("conditional")
        if flag and not switches.get(flag, False):
            continue
        observed = get_metric(stages, spec["stage"], spec["key"])
        if _abs_pass(spec["op"], observed, spec["bound"]):
            continue
        failures.append(
            {
                "metric": _metric_label(spec),
                "observed": observed,
                "expected": f"{spec['op']} {spec['bound']}",
            }
        )
    return {"passed": not failures, "failures": failures}


def evaluate_tier2(
    stages: Dict[str, Any],
    *,
    gate_ratios: bool,
    threshold: float = DEFAULT_RATIO_THRESHOLD,
) -> Dict[str, Any]:
    """Flag every Tier-2 ratio metric that is missing or below *threshold*.

    Returns:
        ``{"gated", "passed", "warnings"}``. ``passed`` is always True unless
        *gate_ratios* is set, in which case any warning fails the tier.
    """
    warnings: List[Dict[str, Any]] = []
    for spec in (s for s in METRIC_SPECS if s["tier"] == 2):
        observed = get_metric(stages, spec["stage"], spec["key"])
        if observed is not None and not observed < threshold:
            continue
        warnings.append(
            {
                "metric": _metric_label(spec),
                "observed": observed,
                "expected": f">= {threshold}",
            }
        )
    # When --gate-ratios is set, the warnings become hard failures.
    return {
        "gated": gate_ratios,
        "passed": not gate_ratios or not warnings,
        "warnings": warnings,
    }


def evaluate_tier3(
    stages: Dict[str, Any],
    baseline: Optional[Dict[str, Any]],
) -> Dict[str, Any]:
    """Compare every Tier-1/Tier-2 metric against the baseline scorecard.

    A metric *regresses* when it moves the wrong way beyond its tolerance.
    Direction-aware: higher-better metrics regress on a drop, lower-better
    metrics regress on a rise. Metrics missing on either side are ignored.

    Returns:
        ``{"baseline_run_id", "passed", "regressions"}``; with no baseline the
        tier passes with an empty regression list.
    """
    if not baseline:
        return {"baseline_run_id": None, "passed": True, "regressions": []}

    previous = baseline.get("stages", {})
    regressions: List[Dict[str, Any]] = []
    for spec in METRIC_SPECS:
        stage_name, metric_key = spec["stage"], spec["key"]
        # A conditional metric (e.g. cross-source band compliance) only counts
        # as a regression when it was actively measured in BOTH runs — otherwise
        # the first real measurement after an inactive 1.0 looks like a drop.
        flag = spec.get("conditional")
        if flag:
            active_now = (stages.get(stage_name, {}) or {}).get(flag, False)
            active_before = (previous.get(stage_name, {}) or {}).get(flag, False)
            if not (active_now and active_before):
                continue
        current = get_metric(stages, stage_name, metric_key)
        reference = get_metric(previous, stage_name, metric_key)
        if current is None or reference is None:
            continue
        slack = spec.get("tol", 0)
        if spec["direction"] == HIGHER_BETTER:
            worse = current < reference - slack
        else:  # LOWER_BETTER
            worse = current > reference + slack
        if not worse:
            continue
        regressions.append(
            {
                "metric": _metric_label(spec),
                "observed": current,
                "baseline": reference,
                "direction": spec["direction"],
                "tolerance": slack,
            }
        )
    return {
        "baseline_run_id": baseline.get("run_id"),
        "passed": not regressions,
        "regressions": regressions,
    }
+""" + +from __future__ import annotations + +from datetime import datetime, timezone +from typing import Any, Dict, List, Optional + +from back.core.logging import get_logger + +logger = get_logger(__name__) + + +def _now(): + t = datetime.now(timezone.utc) + return t.strftime("%Y%m%dT%H%M%S_%f"), t.isoformat() + + +def _turtle_to_ontology(owl_content: str) -> Dict[str, Any]: + """Parse generated Turtle into the registry ontology shape. + + Lazy imports keep the pure scorecard modules free of back/ deps. + """ + from back.core.w3c.owl.OntologyParser import OntologyParser + from back.objects.ontology.Ontology import Ontology + + cleaned = Ontology.clean_owl_output(owl_content or "") + parser = OntologyParser(cleaned) + return {"classes": parser.get_classes(), "properties": parser.get_properties()} + + +def score_generated_ontology( + owl_content: str, + metadata: Optional[dict], +) -> Optional[Dict[str, Any]]: + """Score a freshly generated ontology (Stage-1 focus). Returns the §3.6 + scorecard dict, or ``None`` on any failure.""" + try: + from agents.pge_eval.scorecard import score_artifact + + ontology = _turtle_to_ontology(owl_content) + artifact = { + "ontology": ontology, + "metadata": metadata or {"tables": []}, + "mapping_run_log": [], + "mapping_evaluations": {}, + "entity_mappings": [], + } + run_id, ts = _now() + scorecard = score_artifact( + artifact, + no_judge=True, + use_baseline=False, + mode="live", + run_id=run_id, + timestamp=ts, + ) + logger.info( + "in-app ontology scorecard: verdict=%s (orphans=%s, dangling=%s, " + "naming=%s, dupes=%s)", + scorecard["verdict"], + scorecard["stages"]["ontology"]["metrics"]["orphan_class_count"], + scorecard["stages"]["ontology"]["metrics"]["dangling_domain_range_count"], + scorecard["stages"]["ontology"]["metrics"]["naming_violation_count"], + scorecard["stages"]["ontology"]["metrics"]["duplicate_class_count"], + ) + return scorecard + except Exception as exc: # noqa: BLE001 — scoring must never break 
generation + logger.warning("in-app ontology scoring failed (ignored): %s", exc) + return None + + +def score_mapping_run( + *, + ontology: dict, + metadata: Optional[dict], + mapping_run_log: Optional[List[dict]], + mapping_evaluations: Optional[Dict[str, dict]], + entity_mappings: Optional[List[dict]], + relationship_mappings: Optional[List[dict]] = None, + usage: Optional[dict] = None, +) -> Optional[Dict[str, Any]]: + """Score a completed mapping-PGE run (Stage-2 + Stage-1 + pipeline). + Returns the §3.6 scorecard dict, or ``None`` on any failure.""" + try: + from agents.pge_eval.scorecard import score_artifact + + artifact = { + "ontology": ontology or {}, + "metadata": metadata or {"tables": []}, + "mapping_run_log": mapping_run_log or [], + "mapping_evaluations": mapping_evaluations or {}, + "entity_mappings": entity_mappings or [], + "relationship_mappings": relationship_mappings or [], + "usage": usage or {}, + } + run_id, ts = _now() + scorecard = score_artifact( + artifact, + no_judge=True, + use_baseline=False, + mode="live", + run_id=run_id, + timestamp=ts, + ) + m = scorecard["stages"]["mapping"]["metrics"] + logger.info( + "in-app mapping scorecard: verdict=%s (entity_completeness=%s, " + "rel_completeness=%s, id_integrity=%s, sql_failures=%s)", + scorecard["verdict"], + m["entity_completeness"], + m["relationship_completeness"], + m["id_integrity"], + m["sql_exec_failures"], + ) + return scorecard + except Exception as exc: # noqa: BLE001 — scoring must never break mapping + logger.warning("in-app mapping scoring failed (ignored): %s", exc) + return None diff --git a/src/agents/pge_eval/judge.py b/src/agents/pge_eval/judge.py new file mode 100644 index 00000000..98479319 --- /dev/null +++ b/src/agents/pge_eval/judge.py @@ -0,0 +1,138 @@ +"""Advisory LLM-judge (§3.2 / §3.3, D5). + +The judge is **advisory only** — it never gates a run (Tier-exempt). It +emits a 0–1 score per axis plus flagged issues that inform ``retry_hint``s. 
+ +This module is the **only** place the scorer touches the network. The +deterministic metrics never import it, and the orchestrator only calls +:func:`run_judge` when ``--no-judge`` is NOT set — guaranteeing zero network +traffic in ``--no-judge`` mode. ``requests``/serving imports are lazy so +merely importing this module makes no connection either. + +The judge is usecase-agnostic: it asks generic coherence questions and is +handed the *actual* runtime ontology/mapping, never a reference answer. +""" + +from __future__ import annotations + +import json +from typing import Any, Dict, List, Optional + +from back.core.logging import get_logger + +logger = get_logger(__name__) + +_JUDGE_TIMEOUT = 60 +_MAX_TOKENS = 1024 + + +def _empty_axis(reason: str = "") -> Dict[str, Any]: + flags = [reason] if reason else [] + return {"score": None, "flags": flags} + + +def _ontology_summary(ontology: dict) -> str: + from agents.pge_eval.normalize import normalize_ontology + + norm = normalize_ontology(ontology) + classes = [ + f"{c['name']}({len(c.get('data_properties', []))} dp)" for c in norm.classes + ] + rels = [ + f"{op['name']}: {op.get('domain', '?')}->{op.get('range', '?')}" + for op in norm.object_properties + ] + return ( + f"Classes ({len(classes)}): {', '.join(classes[:60])}\n" + f"ObjectProperties ({len(rels)}): {', '.join(rels[:60])}" + ) + + +def _mapping_summary(artifact: dict) -> str: + log = artifact.get("mapping_run_log", []) or [] + lines = [ + f"{e.get('kind')}: {e.get('item')} -> {e.get('final_status')}" + for e in log[:80] + ] + return "\n".join(lines) + + +def _parse_axis(text: str) -> Dict[str, Any]: + """Pull a ``{"score": float, "flags": [str]}`` object out of LLM text.""" + try: + start = text.index("{") + end = text.rindex("}") + 1 + obj = json.loads(text[start:end]) + score = obj.get("score") + score = float(score) if score is not None else None + flags = [str(f) for f in (obj.get("flags") or [])] + return {"score": score, "flags": flags} + except 
(ValueError, TypeError, json.JSONDecodeError): + return _empty_axis("judge response could not be parsed") + + +def _ask(host: str, token: str, endpoint_name: str, system: str, user: str) -> Dict[str, Any]: + # Lazy import: no network dependency unless the judge actually runs. + from agents.engine_base import call_serving_endpoint, extract_message_content + + try: + resp = call_serving_endpoint( + host, + token, + endpoint_name, + [ + {"role": "system", "content": system}, + {"role": "user", "content": user}, + ], + tools=None, + max_tokens=_MAX_TOKENS, + temperature=0.0, + timeout=_JUDGE_TIMEOUT, + trace_name="pge_eval:judge", + ) + return _parse_axis(extract_message_content(resp)) + except Exception as exc: # noqa: BLE001 — advisory, must never crash scoring + logger.warning("pge_eval judge call failed (advisory, ignored): %s", exc) + return _empty_axis(f"judge unavailable: {exc}") + + +_ONTOLOGY_SYSTEM = ( + "You are an ontology reviewer. Judge whether the classes and properties " + "are coherent and non-redundant for the implied domain. Reply ONLY with a " + 'JSON object: {"score": <0..1 float>, "flags": ["short issue", ...]}. ' + "score=1 means fully coherent; flags list concrete redundancy/incoherence " + "issues. Do not compare against any reference ontology." +) + +_MAPPING_SYSTEM = ( + "You are a data-mapping reviewer. Given per-item mapping outcomes, judge " + "holistically what the mapping likely missed or got wrong. Reply ONLY with " + 'a JSON object: {"score": <0..1 float>, "flags": ["short issue", ...]}. ' + "score=1 means the mapping looks complete and correct." +) + + +def run_judge( + *, + host: str, + token: str, + endpoint_name: str, + ontology: dict, + artifact: dict, + stage1_issues: Optional[List[Dict[str, str]]] = None, +) -> Dict[str, Dict[str, Any]]: + """Run both advisory axes. Returns ``{"ontology": {...}, "mapping": {...}}``. + + Never raises; any failure degrades to an empty axis with a flag. 
+ """ + if not endpoint_name: + return {"ontology": _empty_axis("no endpoint"), "mapping": _empty_axis("no endpoint")} + + onto_user = _ontology_summary(ontology) + if stage1_issues: + onto_user += "\n\nDeterministic issues already found:\n" + "\n".join( + f"- {i['check']}: {i['observed']}" for i in stage1_issues[:20] + ) + ontology_axis = _ask(host, token, endpoint_name, _ONTOLOGY_SYSTEM, onto_user) + mapping_axis = _ask(host, token, endpoint_name, _MAPPING_SYSTEM, _mapping_summary(artifact)) + return {"ontology": ontology_axis, "mapping": mapping_axis} diff --git a/src/agents/pge_eval/loaders.py b/src/agents/pge_eval/loaders.py new file mode 100644 index 00000000..abf10c64 --- /dev/null +++ b/src/agents/pge_eval/loaders.py @@ -0,0 +1,130 @@ +"""Domain-agnostic input loaders for the live ``goals_eval.py run`` path. + +The score-only path is already usecase-agnostic (it ingests a captured +artifact). Live ``run`` previously reused the smoke-test loader, which +hard-pinned a single demo domain (a fixed ``/tmp`` dump + a fixed version key). +These helpers replace that: they load the ontology + source metadata for ANY +domain, from either an exported registry version dump or plain ontology/metadata +JSON files — no domain, table, or version is baked in. + +Pure functions (file IO + dict reshaping only) — no LLM, no DB, no domain +knowledge — so they are unit-testable offline. +""" + +from __future__ import annotations + +import json +from typing import Any, Dict, Optional, Tuple + + +def to_agent_shape(ontology: dict) -> dict: + """Convert a registry-shape ontology (``{classes, properties}``) to the + ``{entities, relationships}`` shape the mapping-PGE engine consumes. + + If the input is already in agent shape (has ``entities``) it is returned + unchanged. Fully generic — only field copying + domain/range resolution. 
+ """ + ontology = ontology or {} + if "entities" in ontology or "relationships" in ontology: + return { + "entities": list(ontology.get("entities", []) or []), + "relationships": list(ontology.get("relationships", []) or []), + } + + classes = ontology.get("classes", []) or [] + properties = ontology.get("properties", []) or [] + name_to_uri = {c["name"]: c["uri"] for c in classes if c.get("name") and c.get("uri")} + + def _resolve(ref: str) -> str: + if not ref or str(ref).startswith("http"): + return ref + return name_to_uri.get(ref, ref) + + entities = [ + { + "uri": c.get("uri", ""), + "name": c.get("name", ""), + "label": c.get("label", ""), + "comment": c.get("comment", ""), + "parent": c.get("parent", ""), + "attributes": list(c.get("dataProperties", []) or []), + } + for c in classes + ] + relationships = [ + { + "uri": p.get("uri", ""), + "name": p.get("name", ""), + "label": p.get("label", p.get("name", "")), + "comment": p.get("comment", ""), + "domain": _resolve(p.get("domain", "")), + "range": _resolve(p.get("range", "")), + } + for p in properties + if p.get("type", "ObjectProperty") == "ObjectProperty" + ] + return {"entities": entities, "relationships": relationships} + + +def _pick_version(versions: Dict[str, Any], version: Optional[str]) -> str: + """Choose a version key from a registry dump's ``versions`` map. + + Explicit ``version`` wins; otherwise the single version if there's exactly + one; otherwise raise asking the caller to disambiguate. Never guesses a + domain-specific default (the old code hard-coded ``"1_1"``). 
+ """ + if version is not None: + if version not in versions: + raise ValueError( + f"version {version!r} not in registry dump; available: " + f"{sorted(versions)}" + ) + return version + keys = list(versions) + if len(keys) == 1: + return keys[0] + raise ValueError( + f"registry dump has {len(keys)} versions {sorted(keys)}; " + "pass --version to choose one" + ) + + +def load_run_inputs( + *, + registry_json: Optional[str] = None, + version: Optional[str] = None, + ontology_path: Optional[str] = None, + metadata_path: Optional[str] = None, +) -> Tuple[dict, dict]: + """Resolve ``(ontology_agent_shape, metadata)`` for a live run, domain-agnostic. + + Exactly one source must be given: + + * ``registry_json`` (+ optional ``version``) — an exported registry version + dump shaped ``{"versions": {: {"ontology": ..., "metadata": ...}}}``. + * ``ontology_path`` (+ optional ``metadata_path``) — plain JSON files holding + the ontology (registry or agent shape) and source metadata. + """ + if registry_json: + with open(registry_json) as f: + doc = json.load(f) + versions = doc.get("versions") or {} + if not versions: + raise ValueError(f"{registry_json} has no 'versions' map") + ver = _pick_version(versions, version) + v = versions[ver] + return to_agent_shape(v.get("ontology", {})), (v.get("metadata", {}) or {}) + + if ontology_path: + with open(ontology_path) as f: + ontology = json.load(f) + metadata: dict = {} + if metadata_path: + with open(metadata_path) as f: + metadata = json.load(f) + return to_agent_shape(ontology), metadata + + raise ValueError( + "live run needs an ontology source: pass --registry-json (+--version) " + "or --ontology (+--metadata)" + ) diff --git a/src/agents/pge_eval/mapping_metrics.py b/src/agents/pge_eval/mapping_metrics.py new file mode 100644 index 00000000..ecc588f3 --- /dev/null +++ b/src/agents/pge_eval/mapping_metrics.py @@ -0,0 +1,181 @@ +"""Stage-2 — mapping-generation quality (deterministic, no LLM). 
+ +Computed from a captured PGE ``AgentResult`` artifact (the JSON dumped by +``scripts/smoke_pge.py``). Stage-2 reads two artifact fields: + +* ``mapping_run_log`` — authoritative per-item ``final_status`` (drives the + completeness ratios and pass/fail accounting). +* ``mapping_evaluations`` — the per-item deterministic ``EvalReport`` dicts + the run captured (drives the numeric metrics: dangling fractions, id + integrity, sql-execution failures). Defect detection keys off the + structured ``failures[].check`` field, never on prose. + +This makes score-only fully offline: no DB round-trip, no LLM, no network. +Live mode produces the same artifact first, then calls this. Nothing here +is domain-specific. +""" + +from __future__ import annotations + +import re +from typing import Any, Dict, List, Set, Tuple + +from agents.pge_eval.normalize import ( + NormalizedOntology, + local_name, + normalize_name, + normalize_ontology, +) + +_PASS_STATUSES = {"PASS", "PRESEEDED"} +_OUT_OF_SCOPE = {"SKIPPED", "FAIL_BUDGET"} + +_IDENT_RE = re.compile(r"[A-Za-z_][A-Za-z0-9_]*") + + +def _ratio(num: int, denom: int) -> float: + return round(num / denom, 6) if denom else 1.0 + + +def _is_rel_report(metrics: dict) -> bool: + return "dangling_target_pct" in metrics or "total_edges" in metrics + + +def _is_entity_report(metrics: dict) -> bool: + return "row_count" in metrics + + +def _has_sql_failure(report: dict) -> bool: + for f in report.get("failures", []) or []: + if f.get("check") == "sql_execution": + return True + return False + + +def _class_dp_counts(norm: NormalizedOntology) -> Dict[str, int]: + """Map every class identifier (uri + local + name) -> data-property count.""" + out: Dict[str, int] = {} + for c in norm.classes: + n = len(c.get("data_properties", [])) + for key in (c.get("uri"), c.get("name")): + if key: + out[key] = n + out[local_name(key)] = n + return out + + +def evaluate_mapping( + artifact: dict, + ontology: dict, +) -> Tuple[Dict[str, Any], Dict[str, 
Any]]: + """Run the deterministic Stage-2 checks. + + Returns ``(metrics, extras)`` where ``extras`` carries the mapped + column footprint reused by ``pipeline.coverage_loss``. + """ + norm = normalize_ontology(ontology) + run_log = artifact.get("mapping_run_log", []) or [] + evaluations = artifact.get("mapping_evaluations", {}) or {} + entity_mappings = artifact.get("entity_mappings", []) or [] + + # ---- completeness from the run log ----------------------------- + ent_inscope = ent_pass = 0 + rel_inscope = rel_pass = 0 + for entry in run_log: + status = entry.get("final_status", "") + if status in _OUT_OF_SCOPE: + continue + if entry.get("kind") == "entity": + ent_inscope += 1 + if status in _PASS_STATUSES: + ent_pass += 1 + elif entry.get("kind") == "relationship": + rel_inscope += 1 + if status in _PASS_STATUSES: + rel_pass += 1 + + entity_completeness = _ratio(ent_pass, ent_inscope) + relationship_completeness = _ratio(rel_pass, rel_inscope) + + # ---- numeric metrics from captured eval reports ---------------- + dangling_target_pcts: List[float] = [] + dangling_source_pcts: List[float] = [] + id_ok = id_total = 0 + sql_exec_failures = 0 + band_declared = band_compliant = 0 + + for report in evaluations.values(): + metrics = report.get("metrics", {}) or {} + if _has_sql_failure(report): + sql_exec_failures += 1 + if _is_rel_report(metrics): + dangling_target_pcts.append(float(metrics.get("dangling_target_pct", 0.0))) + dangling_source_pcts.append(float(metrics.get("dangling_source_pct", 0.0))) + band = metrics.get("expected_cross_source_overlap_band") + if band and isinstance(band, (list, tuple)) and len(band) == 2: + band_declared += 1 + lo, hi = float(band[0]), float(band[1]) + overlap = float(metrics.get("cross_source_overlap_pct", 0.0)) + if lo <= overlap <= hi: + band_compliant += 1 + if _is_entity_report(metrics): + row_count = int(metrics.get("row_count", 0)) + # A legitimately empty (0-row) entity is id-vacuous: it has no ids to + # be (non-)unique, 
so it neither passes nor fails id-integrity. + # Counting it as a failure would RED a clean run on empty source data. + if row_count == 0: + continue + id_total += 1 + distinct = int(metrics.get("distinct_id_count", 0)) + null_id = int(metrics.get("null_id_count", 0)) + if distinct == row_count and null_id == 0: + id_ok += 1 + + dangling_target_pct_max = round(max(dangling_target_pcts), 6) if dangling_target_pcts else 0.0 + dangling_source_pct_max = round(max(dangling_source_pcts), 6) if dangling_source_pcts else 0.0 + id_integrity = _ratio(id_ok, id_total) + + # cross_source_band_compliance is conditional: only active when >=1 band + # was declared. When inactive it reports 1.0 and is flagged so the gate + # skips it. + band_active = band_declared > 0 + cross_source_band_compliance = ( + _ratio(band_compliant, band_declared) if band_active else 1.0 + ) + + # ---- attribute coverage + mapped footprint --------------------- + dp_counts = _class_dp_counts(norm) + attrs_emitted = 0 + dp_denominator = 0 + mapped_cols: Set[str] = set() + counted_classes: Set[str] = set() + for em in entity_mappings: + am = em.get("attribute_mappings", {}) or {} + attrs_emitted += len(am) + cls = em.get("ontology_class") or em.get("class_name") or "" + if cls and cls not in counted_classes: + counted_classes.add(cls) + dp_denominator += dp_counts.get(cls, dp_counts.get(local_name(cls), 0)) + for value in am.values(): + for tok in _IDENT_RE.findall(str(value)): + k = normalize_name(tok) + if k: + mapped_cols.add(k) + + attribute_coverage = _ratio(attrs_emitted, dp_denominator) + + metrics_out: Dict[str, Any] = { + "entity_completeness": entity_completeness, + "relationship_completeness": relationship_completeness, + "attribute_coverage": attribute_coverage, + "dangling_target_pct_max": dangling_target_pct_max, + "dangling_source_pct_max": dangling_source_pct_max, + "id_integrity": id_integrity, + "sql_exec_failures": sql_exec_failures, + "cross_source_band_compliance": 
cross_source_band_compliance, + } + extras = { + "mapped_cols": mapped_cols, + "band_active": band_active, + } + return metrics_out, extras diff --git a/src/agents/pge_eval/normalize.py b/src/agents/pge_eval/normalize.py new file mode 100644 index 00000000..3e20baab --- /dev/null +++ b/src/agents/pge_eval/normalize.py @@ -0,0 +1,262 @@ +"""Shape normalisation + footprint helpers for the PGE intrinsic evaluator. + +Everything in this module is pure Python — no LLM, no DB, no domain +knowledge. It exists so the rest of the scorer can reason over one stable +in-memory shape regardless of whether the caller handed it the *agent* +ontology shape (``{entities, relationships}``), the *registry* ontology +shape (``{classes, properties}``), or raw source metadata. + +Design constraints (see docs/plans/2026-06-10-goal-loop-and-pge-eval-design.md): + +* **Usecase-agnostic.** No table name, identifier, or count from any + particular domain is encoded here. The only constants are generic + audit/surrogate column heuristics that hold for any relational source. +* **Deterministic.** Pure functions of their inputs; no randomness, no + wall-clock, no network. +""" + +from __future__ import annotations + +import re +from typing import Any, Dict, List, Optional, Set + + +# ===================================================== +# Name normalisation +# ===================================================== + + +def normalize_name(name: Optional[str]) -> str: + """Collapse a column / property / class name to a comparison key. + + Lower-cases and strips every non-alphanumeric character so that + ``first_name``, ``firstName`` and ``FirstName`` all collapse to + ``firstname``. This is the footprint-matching key used to decide + whether a source column "became" a data property without consulting + the mapping (Stage-1 is mapping-independent — see D2/D3). 
+ """ + if not name: + return "" + return re.sub(r"[^a-z0-9]", "", str(name).lower()) + + +def local_name(uri_or_name: Optional[str]) -> str: + """Return the local name of a URI/CURIE, or the value unchanged. + + ``http://x/Customer`` -> ``Customer``; ``ex:Customer`` -> ``Customer``; + ``Customer`` -> ``Customer``. + """ + if not uri_or_name: + return "" + s = str(uri_or_name) + for sep in ("#", "/"): + if sep in s: + s = s.rsplit(sep, 1)[-1] + if ":" in s and not s.startswith("http"): + s = s.rsplit(":", 1)[-1] + return s + + +# ===================================================== +# Audit / surrogate column heuristics (generic, not domain-specific) +# ===================================================== + +# Audit tokens that mark a column as non-analytical bookkeeping. These are +# generic ETL/CDC conventions, not tied to any domain. +_AUDIT_TOKENS = ( + "createdat", + "updatedat", + "createdon", + "updatedon", + "createdby", + "updatedby", + "modifiedat", + "modifiedby", + "deletedat", + "ingestedat", + "loadedat", + "loadts", + "etltimestamp", + "dwcreated", + "dwupdated", +) +_AUDIT_PREFIXES = ("etl", "ingest", "_ingest", "dw") +# Exact surrogate row-key names + suffixes for warehouse surrogate keys. +_SURROGATE_EXACT = ("id", "rowid", "rownum", "rownumber") +_SURROGATE_SUFFIXES = ("sk", "surrogatekey") + + +def is_surrogate_or_audit(column_name: str) -> bool: + """Heuristic: True when *column_name* is a surrogate row key or audit + column with no analytical value. + + The OWL generator is instructed to drop exactly these, so they are + excluded from coverage denominators (D3). Intentionally conservative: + it does NOT drop every ``*_id`` column (foreign keys can be meaningful), + only obvious surrogate keys and audit bookkeeping. 
+ """ + norm = normalize_name(column_name) + if not norm: + return True + if norm in _SURROGATE_EXACT: + return True + if any(norm.endswith(sfx) for sfx in _SURROGATE_SUFFIXES): + return True + if any(tok in norm for tok in _AUDIT_TOKENS): + return True + raw = re.sub(r"[^a-z0-9_]", "", str(column_name).lower()) + if any(raw.startswith(p) for p in _AUDIT_PREFIXES): + return True + return False + + +# ===================================================== +# Ontology normalisation +# ===================================================== + + +def _attr_names(raw_attrs: Any) -> List[str]: + """Normalise an attribute container to a flat list of name strings. + + Accepts the agent shape (list of str or ``{name|uri|label}`` dicts) and + the registry shape (list of ``{name|localName}`` dicts). + """ + out: List[str] = [] + for a in raw_attrs or []: + if isinstance(a, str): + out.append(a) + elif isinstance(a, dict): + name = a.get("name") or a.get("localName") or a.get("uri") or a.get("label") + if name: + out.append(local_name(name)) + return out + + +class NormalizedOntology: + """A flat, shape-agnostic view of a generated ontology. + + Attributes: + classes: list of ``{"name", "uri", "data_properties": [str]}``. + object_properties: list of ``{"name", "uri", "domain", "range"}`` + where domain/range are the raw refs as authored (URI or local). 
+ """ + + def __init__(self, classes: List[dict], object_properties: List[dict]): + self.classes = classes + self.object_properties = object_properties + + # --- derived sets, computed lazily but cheaply ------------------ + + @property + def class_resolution_set(self) -> Set[str]: + """Every token a domain/range ref could legitimately resolve to.""" + out: Set[str] = set() + for c in self.classes: + if c.get("uri"): + out.add(c["uri"]) + out.add(local_name(c["uri"])) + if c.get("name"): + out.add(c["name"]) + out.add(local_name(c["name"])) + return out + + @property + def all_data_property_keys(self) -> Set[str]: + """Normalised keys of every data property across every class.""" + keys: Set[str] = set() + for c in self.classes: + for dp in c.get("data_properties", []): + k = normalize_name(local_name(dp)) + if k: + keys.add(k) + return keys + + @property + def class_name_keys(self) -> Set[str]: + keys: Set[str] = set() + for c in self.classes: + k = normalize_name(local_name(c.get("name") or c.get("uri"))) + if k: + keys.add(k) + return keys + + +def normalize_ontology(ontology: dict) -> NormalizedOntology: + """Normalise either the agent shape or the registry shape. 
+ + * Agent shape: ``{"entities": [...], "relationships": [...]}`` + * Registry shape: ``{"classes": [...], "properties": [...]}`` + """ + ontology = ontology or {} + classes: List[dict] = [] + object_props: List[dict] = [] + + if "entities" in ontology or "relationships" in ontology: + for e in ontology.get("entities", []) or []: + classes.append( + { + "name": e.get("name") or local_name(e.get("uri")), + "uri": e.get("uri", ""), + "data_properties": _attr_names(e.get("attributes")), + } + ) + for r in ontology.get("relationships", []) or []: + object_props.append( + { + "name": r.get("name") or local_name(r.get("uri")), + "uri": r.get("uri", ""), + "domain": r.get("domain", ""), + "range": r.get("range", ""), + } + ) + else: + for c in ontology.get("classes", []) or []: + classes.append( + { + "name": c.get("name") or local_name(c.get("uri")), + "uri": c.get("uri", ""), + "data_properties": _attr_names(c.get("dataProperties")), + } + ) + for p in ontology.get("properties", []) or []: + if p.get("type") and p.get("type") != "ObjectProperty": + continue + object_props.append( + { + "name": p.get("name") or local_name(p.get("uri")), + "uri": p.get("uri", ""), + "domain": p.get("domain", ""), + "range": p.get("range", ""), + } + ) + + return NormalizedOntology(classes=classes, object_properties=object_props) + + +# ===================================================== +# Source-metadata normalisation +# ===================================================== + + +def normalize_metadata(metadata: dict) -> List[dict]: + """Return ``[{"name", "columns": [str]}]`` from domain metadata. + + Accepts the ``{"tables": [{"name"|"full_name", "columns": [...]}]}`` + shape produced by the metadata tools. Column entries may be plain + strings or ``{"name": ...}`` dicts. 
+ """ + out: List[dict] = [] + for t in (metadata or {}).get("tables", []) or []: + cols: List[str] = [] + for c in t.get("columns", []) or []: + if isinstance(c, str): + cols.append(c) + elif isinstance(c, dict) and c.get("name"): + cols.append(c["name"]) + out.append( + { + "name": t.get("full_name") or t.get("name") or "", + "columns": cols, + } + ) + return out diff --git a/src/agents/pge_eval/ontology_metrics.py b/src/agents/pge_eval/ontology_metrics.py new file mode 100644 index 00000000..21ba2c16 --- /dev/null +++ b/src/agents/pge_eval/ontology_metrics.py @@ -0,0 +1,270 @@ +"""Stage-1 — ontology-generation quality (deterministic, no LLM). + +Computed purely from the generated ontology + source metadata. No mapping +dependency (D2) and no LLM for the deterministic part (§3.2). The same +checks back the new owl-generator Evaluator stage (§3.5): each issue carries +a concrete ``hint`` that becomes a generator retry_hint. + +All metrics are usecase-agnostic: nothing about any particular domain is +hard-coded here. +""" + +from __future__ import annotations + +import re +from typing import Any, Dict, List, Set, Tuple + +from agents.pge_eval.normalize import ( + NormalizedOntology, + is_surrogate_or_audit, + local_name, + normalize_metadata, + normalize_name, + normalize_ontology, +) + +# Naming conventions (mirror the OWL generator's NAMING RULES, domain-free). 
+_CLASS_RE = re.compile(r"^[A-Z][A-Za-z0-9]*$") +_PROPERTY_RE = re.compile(r"^[a-z][A-Za-z0-9]*$") + + +def _issue(check: str, expected: str, observed: str, hint: str) -> Dict[str, str]: + return {"check": check, "expected": expected, "observed": observed, "hint": hint} + + +# ===================================================== +# Footprint computation (shared with pipeline.coverage_loss) +# ===================================================== + + +def _column_key(table_name: str, column_name: str) -> str: + return f"{normalize_name(table_name)}::{normalize_name(column_name)}" + + +def compute_footprint( + ontology: NormalizedOntology, tables: List[dict] +) -> Dict[str, Any]: + """Return the ontology footprint over the source metadata. + + A *column* is covered when its normalised name matches some data + property's normalised name. A *table* is covered when its name matches + a class name OR ≥1 of its non-surrogate columns is covered (D3). + + Surrogate/audit columns are excluded from the denominators. 
+ """ + dp_keys = ontology.all_data_property_keys + class_keys = ontology.class_name_keys + + total_columns = 0 + covered_columns: Set[str] = set() + total_tables = len(tables) + covered_tables: Set[str] = set() + + for t in tables: + tname = t["name"] + tkey = normalize_name(local_name(tname)) + table_is_covered = tkey in class_keys + for col in t["columns"]: + if is_surrogate_or_audit(col): + continue + total_columns += 1 + ckey = _column_key(tname, col) + if normalize_name(col) in dp_keys: + covered_columns.add(ckey) + table_is_covered = True + if table_is_covered: + covered_tables.add(tname) + + return { + "total_tables": total_tables, + "covered_tables": covered_tables, + "total_columns": total_columns, + "covered_columns": covered_columns, + } + + +# ===================================================== +# Stage-1 metrics + issues +# ===================================================== + + +def evaluate_ontology( + ontology: dict, + metadata: dict, +) -> Tuple[Dict[str, Any], List[Dict[str, str]], Dict[str, Any]]: + """Run the deterministic Stage-1 checks. + + Returns ``(metrics, issues, footprint)``: + + * ``metrics`` — the §3.2 metric block (ratios + absolute counts). + * ``issues`` — actionable failures (``check/expected/observed/hint``) + for the owl-gen Evaluator's retry_hints. + * ``footprint`` — covered tables/columns sets reused by + ``pipeline.coverage_loss``. 
+ """ + norm = normalize_ontology(ontology) + tables = normalize_metadata(metadata) + footprint = compute_footprint(norm, tables) + + issues: List[Dict[str, str]] = [] + + # ---- coverage ratios (Tier-2 warn) ----------------------------- + table_cov = ( + len(footprint["covered_tables"]) / footprint["total_tables"] + if footprint["total_tables"] + else 1.0 + ) + column_cov = ( + len(footprint["covered_columns"]) / footprint["total_columns"] + if footprint["total_columns"] + else 1.0 + ) + + uncovered_tables = [ + t["name"] + for t in tables + if t["name"] not in footprint["covered_tables"] + ] + for tname in uncovered_tables: + issues.append( + _issue( + "table_footprint_coverage", + "table maps to a class or contributes >=1 data property", + "no footprint", + f"source table '{tname}' has no class and contributes no data " + "property — model it as a class, attach its columns as data " + "properties on an existing class, or justify the omission.", + ) + ) + + # ---- orphan classes (Tier-1 absolute = 0) ---------------------- + related: Set[str] = set() + for op in norm.object_properties: + for ref in (op.get("domain"), op.get("range")): + if ref: + related.add(local_name(ref)) + related.add(str(ref)) + orphan_classes: List[str] = [] + for c in norm.classes: + has_props = bool(c.get("data_properties")) + name = c.get("name") or local_name(c.get("uri")) + in_rel = name in related or local_name(c.get("uri")) in related + if not has_props and not in_rel: + orphan_classes.append(name) + issues.append( + _issue( + "orphan_class_count", + "0 orphan classes", + name, + f"class '{name}' is an orphan (no data properties and no " + "object-property domain/range) — attach properties, relate " + "it to another class, or remove it.", + ) + ) + + # ---- dangling domain/range (Tier-1 absolute = 0) --------------- + resolvable = norm.class_resolution_set + dangling_dr: List[str] = [] + for op in norm.object_properties: + opname = op.get("name") or local_name(op.get("uri")) + for 
role in ("domain", "range"): + ref = op.get(role) + if not ref: + dangling_dr.append(f"{opname}.{role}") + issues.append( + _issue( + "dangling_domain_range_count", + f"ObjectProperty {role} resolves to a class", + f"{opname}.{role}=", + f"ObjectProperty '{opname}' has no {role} — declare an " + f"rdfs:{role} pointing at an existing class.", + ) + ) + continue + if ref not in resolvable and local_name(ref) not in resolvable: + dangling_dr.append(f"{opname}.{role}") + issues.append( + _issue( + "dangling_domain_range_count", + f"ObjectProperty {role} resolves to a class", + f"{opname}.{role}={local_name(ref)}", + f"ObjectProperty '{opname}' has {role} " + f"'{local_name(ref)}' which resolves to no class — fix " + "the reference or add the missing class.", + ) + ) + + # ---- naming violations (Tier-1 absolute = 0) ------------------- + naming_violations: List[str] = [] + for c in norm.classes: + nm = local_name(c.get("name") or c.get("uri")) + if nm and not _CLASS_RE.match(nm): + naming_violations.append(f"class:{nm}") + issues.append( + _issue( + "naming_violation_count", + "class name is PascalCase [A-Z][A-Za-z0-9]*", + nm, + f"class '{nm}' violates PascalCase — remove spaces / " + "underscores / hyphens and capitalise (e.g. 
sales_order -> " + "SalesOrder).", + ) + ) + for op in norm.object_properties: + nm = local_name(op.get("name") or op.get("uri")) + if nm and not _PROPERTY_RE.match(nm): + naming_violations.append(f"property:{nm}") + issues.append( + _issue( + "naming_violation_count", + "property name is lowerCamelCase [a-z][A-Za-z0-9]*", + nm, + f"property '{nm}' violates lowerCamelCase — use " + "[a-z][A-Za-z0-9]* with no underscores/hyphens/escapes.", + ) + ) + # data properties too + for c in norm.classes: + for dp in c.get("data_properties", []): + nm = local_name(dp) + if nm and not _PROPERTY_RE.match(nm): + naming_violations.append(f"dataproperty:{nm}") + issues.append( + _issue( + "naming_violation_count", + "data property name is lowerCamelCase", + nm, + f"data property '{nm}' violates lowerCamelCase — use " + "[a-z][A-Za-z0-9]* with no underscores/hyphens/escapes.", + ) + ) + + # ---- duplicate classes (Tier-1 absolute = 0) ------------------- + seen: Dict[str, int] = {} + for c in norm.classes: + key = normalize_name(local_name(c.get("name") or c.get("uri"))) + if not key: + continue + seen[key] = seen.get(key, 0) + 1 + duplicate_class_count = sum(n - 1 for n in seen.values() if n > 1) + for key, n in seen.items(): + if n > 1: + issues.append( + _issue( + "duplicate_class_count", + "0 duplicate class local names", + f"{key} x{n}", + f"{n} classes collapse to the local name '{key}' — merge " + "them or differentiate their names/definitions.", + ) + ) + + metrics: Dict[str, Any] = { + "table_footprint_coverage": round(table_cov, 6), + "column_footprint_coverage": round(column_cov, 6), + "orphan_class_count": len(orphan_classes), + "dangling_domain_range_count": len(dangling_dr), + "naming_violation_count": len(naming_violations), + "duplicate_class_count": duplicate_class_count, + } + return metrics, issues, footprint diff --git a/src/agents/pge_eval/pipeline_metrics.py b/src/agents/pge_eval/pipeline_metrics.py new file mode 100644 index 00000000..ac5ae87e --- /dev/null 
+++ b/src/agents/pge_eval/pipeline_metrics.py @@ -0,0 +1,85 @@ +"""Pipeline-level metrics (deterministic, no LLM). + +* ``coverage_loss`` — source concepts the ontology surfaced but that never + reached a mapping (ontology footprint − mapped footprint). The gap + between the two complementary denominators of D2. +* ``convergence`` — effort signals (mean generator attempts, planner + reinvocations, total tokens, wall-clock). + +Both pipeline metrics are **tracked/advisory only** — they are reported on the +scorecard for inspection and trend-watching but are not wired into any gate +tier (no ``METRIC_SPECS`` entry references the ``pipeline`` stage). Treat them +as observability, not pass/fail. + +Usecase-agnostic. +""" + +from __future__ import annotations + +from typing import Any, Dict, List, Set + +from agents.pge_eval.normalize import normalize_name + + +def _surfaced_column_keys(footprint: Dict[str, Any]) -> Set[str]: + """Normalised column-name keys of every ontology-covered column. + + ``footprint['covered_columns']`` holds ``table::col`` keys; the loss + comparison works at the column-name level so it matches the mapped + footprint (which has no reliable table qualifier). 
+ """ + out: Set[str] = set() + for key in footprint.get("covered_columns", set()): + col = key.split("::", 1)[-1] + if col: + out.add(col) + return out + + +def evaluate_pipeline( + artifact: dict, + ontology_footprint: Dict[str, Any], + mapped_cols: Set[str], +) -> Dict[str, Any]: + surfaced = _surfaced_column_keys(ontology_footprint) + lost = {c for c in surfaced if c not in mapped_cols} + coverage_loss = len(lost) + + # ---- convergence ----------------------------------------------- + run_log = artifact.get("mapping_run_log", []) or [] + attempt_counts: List[int] = [ + len(entry.get("attempts", []) or []) + for entry in run_log + if entry.get("attempts") + ] + mean_attempts = ( + round(sum(attempt_counts) / len(attempt_counts), 6) if attempt_counts else 0.0 + ) + + stats = artifact.get("stats", {}) or {} + planner_reinvocations = int( + stats.get("planner_reinvocations", artifact.get("planner_reinvocations", 0)) or 0 + ) + + usage = artifact.get("usage", {}) or {} + total_tokens = int( + usage.get("total_tokens", 0) + or (usage.get("prompt_tokens", 0) + usage.get("completion_tokens", 0)) + ) + + wall_clock_s = float(artifact.get("elapsed_s", 0.0) or 0.0) + if not wall_clock_s: + step_ms = sum( + int(s.get("duration_ms", 0) or 0) for s in artifact.get("steps", []) or [] + ) + wall_clock_s = round(step_ms / 1000.0, 3) + + return { + "coverage_loss": coverage_loss, + "convergence": { + "mean_generator_attempts": mean_attempts, + "planner_reinvocations": planner_reinvocations, + "total_tokens": total_tokens, + "wall_clock_s": wall_clock_s, + }, + } diff --git a/src/agents/pge_eval/scorecard.py b/src/agents/pge_eval/scorecard.py new file mode 100644 index 00000000..cdb4e05b --- /dev/null +++ b/src/agents/pge_eval/scorecard.py @@ -0,0 +1,161 @@ +"""Scorecard assembly + verdict (§3.6). 
+ +``score_artifact`` is the single offline-testable code path (D6): it ingests +a captured ``AgentResult`` artifact (plus the generated ontology and source +metadata) and emits the §3.6 scorecard JSON. Every deterministic metric is +computed with zero LLM calls; the advisory judge is the only network path +and runs only when ``no_judge`` is False. + +Live mode (``scripts/goals_eval.py run``) is a thin wrapper: it produces the +artifact first, then calls this. +""" + +from __future__ import annotations + +import hashlib +import json +from typing import Any, Dict, List, Optional + +from agents.pge_eval import gates as gates_mod +from agents.pge_eval.baseline import DEFAULT_BASELINE_DIR, load_baseline +from agents.pge_eval.mapping_metrics import evaluate_mapping +from agents.pge_eval.normalize import normalize_metadata, normalize_ontology +from agents.pge_eval.ontology_metrics import evaluate_ontology +from agents.pge_eval.pipeline_metrics import evaluate_pipeline + +SCHEMA_VERSION = "1.0" + + +def _digest(obj: Any) -> str: + payload = json.dumps(obj, sort_keys=True, default=str).encode("utf-8") + return hashlib.sha256(payload).hexdigest()[:16] + + +def _ontology_digest(ontology: dict) -> str: + norm = normalize_ontology(ontology) + sig = { + "classes": sorted( + (c.get("name", ""), tuple(sorted(c.get("data_properties", [])))) + for c in norm.classes + ), + "object_properties": sorted( + (op.get("name", ""), op.get("domain", ""), op.get("range", "")) + for op in norm.object_properties + ), + } + return _digest(sig) + + +def _metadata_digest(metadata: dict) -> str: + tables = normalize_metadata(metadata) + sig = sorted((t["name"], tuple(sorted(t["columns"]))) for t in tables) + return _digest(sig) + + +def _resolve_inputs(artifact: dict, ontology, metadata): + if ontology is None: + ontology = artifact.get("ontology") or {} + if metadata is None: + metadata = ( + artifact.get("metadata") + or artifact.get("source_metadata") + or {} + ) + return ontology, metadata + + 
+def score_artifact( + artifact: dict, + *, + ontology: Optional[dict] = None, + metadata: Optional[dict] = None, + gate_ratios: bool = False, + no_judge: bool = True, + mode: str = "score-only", + run_id: Optional[str] = None, + timestamp: Optional[str] = None, + endpoint: Optional[str] = None, + host: Optional[str] = None, + token: Optional[str] = None, + baseline_dir: str = DEFAULT_BASELINE_DIR, + baseline: Optional[Dict[str, Any]] = None, + use_baseline: bool = True, + ratio_threshold: float = gates_mod.DEFAULT_RATIO_THRESHOLD, +) -> Dict[str, Any]: + """Score a captured artifact and return the §3.6 scorecard dict. + + Deterministic unless ``no_judge`` is False. ``run_id``/``timestamp`` are + stamped by the caller (kept out of the deterministic core so unit tests + are reproducible). + """ + ontology, metadata = _resolve_inputs(artifact, ontology, metadata) + + onto_metrics, stage1_issues, footprint = evaluate_ontology(ontology, metadata) + map_metrics, map_extras = evaluate_mapping(artifact, ontology) + pipeline = evaluate_pipeline(artifact, footprint, map_extras["mapped_cols"]) + + # ---- advisory judge (only LLM path) ---------------------------- + if no_judge: + onto_judge = {"score": None, "flags": []} + map_judge = {"score": None, "flags": []} + else: + from agents.pge_eval.judge import run_judge + + verdicts = run_judge( + host=host or "", + token=token or "", + endpoint_name=endpoint or "", + ontology=ontology, + artifact=artifact, + stage1_issues=stage1_issues, + ) + onto_judge = verdicts["ontology"] + map_judge = verdicts["mapping"] + + stages = { + "ontology": {"metrics": onto_metrics, "judge": onto_judge}, + "mapping": { + "metrics": map_metrics, + "judge": map_judge, + # Persisted so Tier-3 can tell an inactive-1.0 band (no band declared) + # from an active measurement, and not flag the first real band reading + # as a regression. 
+ "band_active": bool(map_extras.get("band_active")), + }, + "pipeline": pipeline, + } + + # ---- gates ----------------------------------------------------- + active_conditionals = {"band_active": bool(map_extras.get("band_active"))} + tier1 = gates_mod.evaluate_tier1(stages, active_conditionals=active_conditionals) + tier2 = gates_mod.evaluate_tier2( + stages, gate_ratios=gate_ratios, threshold=ratio_threshold + ) + + if baseline is None and use_baseline: + baseline = load_baseline(baseline_dir, exclude_run_id=run_id) + tier3 = gates_mod.evaluate_tier3(stages, baseline) + + passed = tier1["passed"] and tier2["passed"] and tier3["passed"] + verdict = "GREEN" if passed else "RED" + + return { + "schema_version": SCHEMA_VERSION, + "run_id": run_id, + "timestamp": timestamp, + "mode": mode, + "inputs": { + "source_metadata_digest": _metadata_digest(metadata), + "ontology_digest": _ontology_digest(ontology), + "endpoint": None if no_judge else endpoint, + }, + "stages": stages, + "stage1_issues": stage1_issues, + "gates": { + "tier1_absolute": tier1, + "tier2_ratio": tier2, + "tier3_regression": tier3, + }, + "verdict": verdict, + "exit_code": 0 if verdict == "GREEN" else 1, + } diff --git a/src/agents/tools/context.py b/src/agents/tools/context.py index 3b88df82..6edf3919 100644 --- a/src/agents/tools/context.py +++ b/src/agents/tools/context.py @@ -6,7 +6,10 @@ """ from dataclasses import dataclass, field -from typing import Any, Optional +from typing import TYPE_CHECKING, Any, Optional + +if TYPE_CHECKING: + from agents.agent_mapping_pge.contracts import EvalReport, SourceModel @dataclass @@ -53,3 +56,13 @@ class ToolContext: dtwin_registry_params: dict = field(default_factory=dict) dtwin_domain_name: str = "" dtwin_ontology_labels: dict = field(default_factory=dict) # uri/name → display label + + # Mapping PGE planner output (``agent_mapping_pge``) – populated by the + # ``submit_source_model`` terminal tool. 
Forward-ref string typing avoids a + # circular import between ``agents.tools`` and ``agents.agent_mapping_pge``. + source_model: Optional["SourceModel"] = None + + # Mapping PGE semantic critic output (``agent_mapping_pge``) – populated by + # the ``submit_evaluation`` terminal tool of the Sprint 6 Critic agent. + # Same forward-ref pattern as ``source_model`` to avoid a circular import. + semantic_eval_report: Optional["EvalReport"] = None diff --git a/src/agents/tools/evaluation.py b/src/agents/tools/evaluation.py new file mode 100644 index 00000000..e4f62f94 --- /dev/null +++ b/src/agents/tools/evaluation.py @@ -0,0 +1,205 @@ +"""Terminal tool for the mapping-PGE Semantic Critic (Sprint 6). + +The Critic audits ONE submitted mapping for semantic correctness after the +deterministic (stage-1) evaluator has already passed. It submits its verdict +through ``submit_evaluation`` — the terminal tool defined here — which +constructs an :class:`EvalReport` (stage="semantic") and stamps it onto +``ctx.semantic_eval_report``. + +This module deliberately mirrors the shape of the other terminal tools +(``submit_source_model``, ``submit_entity_mapping``, …) — pure-Python handler +with a JSON-schema definition for OpenAI function calling, exported via +``EVALUATION_TOOL_DEFINITIONS`` / ``EVALUATION_TOOL_HANDLERS`` aggregates. +""" + +import json +from typing import Callable, Dict, List, Optional + +from back.core.logging import get_logger +from agents.tools.context import ToolContext + +logger = get_logger(__name__) + + +# ===================================================== +# OpenAI function-calling definition +# ===================================================== + +SUBMIT_EVALUATION_DEF: dict = { + "type": "function", + "function": { + "name": "submit_evaluation", + "description": ( + "Submit the final semantic evaluation. Terminal tool — call exactly once " + "when you have a confident verdict. status MUST be 'PASS' or 'FAIL'. 
" + "If failing, populate failures[] with at least one entry. " + "Set bubble_to_planner=true ONLY when the wrong TABLE was chosen " + "(not just a wrong column within the right table)." + ), + "parameters": { + "type": "object", + "properties": { + "status": {"type": "string", "enum": ["PASS", "FAIL"]}, + "failures": { + "type": "array", + "items": { + "type": "object", + "properties": { + "check": {"type": "string"}, + "expected": {"type": "string"}, + "observed": {"type": "string"}, + "hint": {"type": "string"}, + }, + "required": ["check", "expected", "observed", "hint"], + }, + "description": "Empty when status is PASS.", + }, + "bubble_to_planner": {"type": "boolean"}, + "reasoning": { + "type": "string", + "description": "One-paragraph summary of the audit reasoning.", + }, + }, + "required": ["status"], + }, + }, +} + + +# ===================================================== +# Handler +# ===================================================== + + +def tool_submit_evaluation( + ctx: ToolContext, + *, + status: str = "", + failures: Optional[list] = None, + bubble_to_planner: bool = False, + reasoning: str = "", + **_kwargs, +) -> str: + """Construct an EvalReport from the critic's submission and store on ctx. + + Contract: + * ``status`` MUST be one of ``"PASS"`` or ``"FAIL"`` — anything else is + rejected as a JSON error so the agent loop can coach the LLM and + continue (it does NOT terminate the loop). + * On ``FAIL`` with an empty ``failures`` list, a generic + ``semantic_audit`` failure is synthesised so the resulting report is + coherent (status=FAIL <=> failures non-empty, matching + :func:`evaluator.report.build_report` semantics). + * ``bubble_to_planner=True`` is demoted to False when status is PASS — + same invariant the deterministic evaluator's :func:`build_report` + enforces (a passing evaluation should not escalate). 
+ """ + logger.info( + "tool_submit_evaluation: status=%s, failures=%d, bubble=%s, reasoning=%d chars", + status, + len(failures or []), + bubble_to_planner, + len(reasoning or ""), + ) + + if status not in ("PASS", "FAIL"): + logger.warning("tool_submit_evaluation: invalid status=%r", status) + return json.dumps( + { + "success": False, + "error": f"invalid status: {status!r} (must be PASS or FAIL)", + } + ) + + # Lazy import — these contracts live in agent_mapping_pge and importing + # them at module load time would create a cycle through + # ``agents.tools.context``. + from agents.agent_mapping_pge.contracts import EvalFailure, EvalReport + + eval_failures: List[EvalFailure] = [] + for f in failures or []: + if not isinstance(f, dict): + continue + eval_failures.append( + EvalFailure( + kind="semantic", + check=str(f.get("check") or ""), + expected=str(f.get("expected") or ""), + observed=str(f.get("observed") or ""), + hint=str(f.get("hint") or ""), + ) + ) + + # status=PASS <=> failures empty. If the LLM submitted both, clamp the + # failures list and warn — keeping a passing report internally coherent. + if status == "PASS" and eval_failures: + logger.warning( + "tool_submit_evaluation: status=PASS with %d failures — clamping to []", + len(eval_failures), + ) + eval_failures = [] + + # If status=FAIL but no failures, synthesise a generic one so the report + # is coherent (status=FAIL <=> failures non-empty). + if status == "FAIL" and not eval_failures: + logger.debug( + "tool_submit_evaluation: synthesising semantic_audit failure for " + "FAIL with no failures[]" + ) + eval_failures.append( + EvalFailure( + kind="semantic", + check="semantic_audit", + expected="PASS", + observed="FAIL", + hint=reasoning or "critic returned FAIL without specific failures", + ) + ) + + # If status=PASS but bubble flag is True, demote — matches + # ``build_report``'s behaviour and the documented invariant: a passing + # evaluation does not escalate to the Planner. 
+ if status == "PASS" and bubble_to_planner: + logger.warning( + "tool_submit_evaluation: bubble_to_planner=True with status=PASS — " + "demoting to False" + ) + bubble_to_planner = False + + metrics: Dict[str, str] = {"reasoning": reasoning} if reasoning else {} + + report = EvalReport( + status=status, + stage="semantic", + metrics=metrics, + failures=eval_failures, + bubble_to_planner=bool(bubble_to_planner), + ) + ctx.semantic_eval_report = report + + logger.info( + "tool_submit_evaluation: stored EvalReport status=%s, failures=%d, bubble=%s", + report.status, + len(report.failures), + report.bubble_to_planner, + ) + + return json.dumps( + { + "success": True, + "status": status, + "failures": len(eval_failures), + "bubble_to_planner": report.bubble_to_planner, + } + ) + + +# ===================================================== +# Aggregates +# ===================================================== + +EVALUATION_TOOL_DEFINITIONS: List[dict] = [SUBMIT_EVALUATION_DEF] + +EVALUATION_TOOL_HANDLERS: Dict[str, Callable] = { + "submit_evaluation": tool_submit_evaluation, +} diff --git a/src/agents/tools/mapping.py b/src/agents/tools/mapping.py index e54ff80f..82b11279 100644 --- a/src/agents/tools/mapping.py +++ b/src/agents/tools/mapping.py @@ -35,15 +35,25 @@ def tool_submit_entity_mapping( id_column: str = "", label_column: str = "", attribute_mappings: Optional[dict] = None, + unmapped_attributes: Optional[list] = None, **_kwargs, ) -> str: - """Record a completed entity mapping.""" + """Record a completed entity mapping. + + ``unmapped_attributes`` lets the Generator stage declare ontology attributes + it intentionally did not map to a column, with a one-sentence ``reason``. + Items may be either bare strings (attribute name only) or dicts of shape + ``{"name": str, "reason": str}`` — the richer dict form is preferred for + downstream consumption but bare strings round-trip too. Anything else is + coerced to a string for safety. 
This enforces the PGE "no silent drops" + invariant: every ontology attribute is either in ``attribute_mappings`` or + in ``unmapped_attributes``. + """ # Normalise column names: strip any surrounding backticks the LLM may have added. id_column = _strip_backticks(id_column) label_column = _strip_backticks(label_column) if attribute_mappings: attribute_mappings = {k: _strip_backticks(v) for k, v in attribute_mappings.items()} - logger.info("tool_submit_entity_mapping: '%s' (uri=%s)", class_name, class_uri) if not class_uri or not sql_query: logger.warning("tool_submit_entity_mapping: missing required fields") @@ -55,6 +65,22 @@ def tool_submit_entity_mapping( .rstrip(";") ) + # Normalise ``unmapped_attributes`` — accept either form, persist as-is for + # dicts, leave bare strings as strings (validation/coverage is downstream). + normalised_unmapped: List = [] + for item in unmapped_attributes or []: + if isinstance(item, dict) and "name" in item: + normalised_unmapped.append( + { + "name": str(item.get("name", "")), + "reason": str(item.get("reason", "")), + } + ) + elif isinstance(item, str): + normalised_unmapped.append(item) + else: + normalised_unmapped.append(str(item)) + # Restrict attribute_mappings to attributes declared in the ontology for this entity. # This prevents the LLM from inventing mappings for columns that are not ontology # data properties (e.g. mapping all table columns when the entity has none). @@ -107,6 +133,7 @@ def tool_submit_entity_mapping( "id_column": id_column, "label_column": label_column, "attribute_mappings": filtered_mappings, + "unmapped_attributes": normalised_unmapped, } # Preserve user-set excluded_attributes across auto-map runs. 
if existing_excl: @@ -132,12 +159,14 @@ def tool_submit_entity_mapping( logger.debug("tool_submit_entity_mapping: appended new mapping") mapped_attrs = len(mapping["attribute_mappings"]) + unmapped_count = len(mapping["unmapped_attributes"]) logger.info( - "tool_submit_entity_mapping: '%s' recorded — ID=%s, Label=%s, %d attr(s) mapped", + "tool_submit_entity_mapping: '%s' recorded — ID=%s, Label=%s, %d attr(s) mapped, %d unmapped", class_name, id_column, label_column, mapped_attrs, + unmapped_count, ) return json.dumps( { @@ -146,6 +175,7 @@ def tool_submit_entity_mapping( "id_column": id_column, "label_column": label_column, "attributes_mapped": mapped_attrs, + "attributes_unmapped": unmapped_count, "total_entity_mappings": len(ctx.entity_mappings), } ) @@ -311,6 +341,30 @@ def _extract_label(value: str) -> str: ), "additionalProperties": {"type": "string"}, }, + "unmapped_attributes": { + "type": "array", + "description": ( + "Ontology attributes you intentionally did NOT map to a column, " + "each with a one-sentence reason. Use this to satisfy the " + 'no-silent-drops invariant. Preferred shape: ' + '[{"name": "apgarScore", "reason": "absent from source table"}]. ' + "Bare strings are also accepted but discouraged." + ), + "items": { + "type": "object", + "properties": { + "name": { + "type": "string", + "description": "Ontology attribute name.", + }, + "reason": { + "type": "string", + "description": "Why this attribute was not mapped.", + }, + }, + "required": ["name", "reason"], + }, + }, }, "required": [ "class_uri", @@ -347,11 +401,19 @@ def _extract_label(value: str) -> str: }, "source_id_column": { "type": "string", - "description": "Column name for the source entity identifier.", + "description": ( + "The output-column alias in sql_query that holds the " + "source id — alias it AS source_id and pass " + '"source_id" here (NOT the entity\'s id_column).' 
+ ), }, "target_id_column": { "type": "string", - "description": "Column name for the target entity identifier.", + "description": ( + "The output-column alias in sql_query that holds the " + "target id — alias it AS target_id and pass " + '"target_id" here (NOT the entity\'s id_column).' + ), }, "domain": { "type": "string", @@ -385,3 +447,10 @@ def _extract_label(value: str) -> str: "submit_entity_mapping": tool_submit_entity_mapping, "submit_relationship_mapping": tool_submit_relationship_mapping, } + +# Name-indexed view of MAPPING_TOOL_DEFINITIONS so callers needing a single +# definition (e.g. the EntityGenerator, which only exposes the entity submit +# tool) can look it up by name without re-scanning the list. +MAPPING_TOOL_DEFINITIONS_BY_NAME: Dict[str, dict] = { + d["function"]["name"]: d for d in MAPPING_TOOL_DEFINITIONS +} diff --git a/src/agents/tools/planner.py b/src/agents/tools/planner.py new file mode 100644 index 00000000..a1de9af9 --- /dev/null +++ b/src/agents/tools/planner.py @@ -0,0 +1,707 @@ +""" +Planner tools – used by the mapping-PGE Planner agent (Sprint 2+). + +Exposes the OpenAI function-calling tools that let the Planner LLM probe +source tables and submit a validated ``SourceModel`` artefact: + +* ``sample_table`` — N random rows from a table (n capped at 100). +* ``column_value_overlap`` — one-sided distinct-value overlap between two columns. +* ``normalized_value_overlap`` — same metric, but each side is a scalar SQL + expression, so canonical-key normalizations can be proven before commit. +* ``distinct_count`` — uniqueness / completeness of a candidate canonical id. +* ``submit_source_model`` — terminal tool: validates the candidate SourceModel + JSON against :class:`agents.agent_mapping_pge.contracts.SourceModel` and stores + the dataclass instance on :attr:`ToolContext.source_model`. + +All handlers return JSON strings (same convention as ``agents.tools.sql``) +and stringify scalar values for the LLM-facing surface. 
+""" + +import json +import re +from typing import Any, Callable, Dict, List, Optional, Tuple + +from back.core.logging import get_logger +from agents.tools.context import ToolContext + +logger = get_logger(__name__) + + +# Cap on ``n`` in ``sample_table`` to keep the LLM context bounded. +_SAMPLE_TABLE_MAX_N = 100 +_SAMPLE_TABLE_DEFAULT_N = 20 + + +# Permissive but injection-safe SQL identifier shape. We allow dots (for +# fully-qualified ``catalog.schema.table``) and backticks (for quoted +# identifiers), plus the usual alphanumerics + underscore. Anything else +# — semicolons, whitespace, quotes, comment markers — is rejected. +_IDENTIFIER_RE = re.compile(r"^[A-Za-z0-9_.`]+$") + + +# SQL keywords whose presence in a "normalization expression" indicates the +# string is no longer a scalar expression but a smuggled clause / subquery / +# DDL. A legitimate canonical-key expression (regexp_extract, regexp_replace, +# concat, substring, lower, upper, trim, coalesce, ||, string literals) needs +# none of these. Matched case-insensitively as whole words. +_EXPR_FORBIDDEN_WORDS = frozenset( + { + "select", + "from", + "where", + "join", + "union", + "intersect", + "except", + "insert", + "update", + "delete", + "drop", + "alter", + "create", + "grant", + "revoke", + "table", + "into", + "exec", + "execute", + "call", + "merge", + "values", + "having", + "group", + "order", + "limit", + } +) +_EXPR_WORD_RE = re.compile(r"[A-Za-z_][A-Za-z0-9_]*") + + +def _validate_safe_expression(expr: str, *, role: str) -> Optional[str]: + """Return None if ``expr`` is a safe scalar SQL expression; else an error. + + Unlike :func:`_validate_identifier`, this permits the parentheses, commas, + quotes and operators a canonical-key normalization needs (e.g. + ``regexp_extract(EPISODE_ID, '([a-f0-9][a-f0-9-]+-preg-\\d+)', 1)`` or + ``concat(regexp_extract(delivery_id, '...', 1), '-del')``). 
It still gets + interpolated into SQL via an f-string, so it is gated against the obvious + injection vectors: statement terminators, comment markers, and any SQL + keyword that would turn the scalar into a clause/subquery/DDL. + """ + if not isinstance(expr, str) or not expr.strip(): + return f"invalid {role}: must be a non-empty string" + if ";" in expr or "--" in expr or "/*" in expr or "*/" in expr: + return ( + f"invalid {role}: must not contain ';' or SQL comment markers " + f"(got {expr!r})" + ) + bad = sorted( + { + w.lower() + for w in _EXPR_WORD_RE.findall(expr) + if w.lower() in _EXPR_FORBIDDEN_WORDS + } + ) + if bad: + return ( + f"invalid {role}: a canonical-key expression must be a single scalar " + f"expression, not a clause/subquery. Forbidden keyword(s): " + f"{', '.join(bad)} (got {expr!r})" + ) + return None + + +def _validate_identifier(name: str, *, role: str) -> Optional[str]: + """Return None if ``name`` is a valid SQL identifier; else an error message. + + Used to gate identifiers that get interpolated into SQL via f-strings. + Even though today's callers are LLMs (not untrusted users), a hallucinated + identifier like ``t; DROP TABLE x`` or ``nhs FROM secrets--`` would + otherwise execute. + """ + if not isinstance(name, str) or not _IDENTIFIER_RE.fullmatch(name): + return f"invalid {role}: {name!r}" + return None + + +def _run_query( + ctx: ToolContext, + sql: str, + *, + tool_name: str, +) -> Tuple[Optional[List[Dict[str, Any]]], Optional[str]]: + """Execute the SQL via the client. Returns ``(rows, None)`` on success, + ``(None, error_str)`` on failure. On failure the SQL is logged at ERROR + level alongside the exception (previously only at DEBUG). 
+ """ + try: + result = ctx.client.execute_query(sql) + return result, None + except Exception as exc: + logger.error( + "%s: query failed: %s\nSQL: %s", tool_name, exc, sql, exc_info=True + ) + return None, str(exc) + + +# ===================================================== +# Tool implementations +# ===================================================== + + +def tool_sample_table( + ctx: ToolContext, *, full_name: str = "", n: Any = _SAMPLE_TABLE_DEFAULT_N, **_kwargs +) -> str: + """Return N random sample rows from ``full_name`` so the agent can see + real values (not just column types). ``n`` is capped at 100. + """ + logger.info("tool_sample_table: full_name=%s, n=%s", full_name, n) + if not full_name: + return json.dumps({"success": False, "error": "full_name is required"}) + + err = _validate_identifier(full_name, role="full_name") + if err is not None: + return json.dumps({"success": False, "error": err}) + + # Strict ``n`` parsing: a malformed value is a tool-call error, not a + # silent fallback. The default (when ``n`` is omitted) is already the int + # ``_SAMPLE_TABLE_DEFAULT_N``, so ``int(n)`` is a no-op in that case. 
+ try: + n_int = int(n) + except (TypeError, ValueError): + return json.dumps({"success": False, "error": f"invalid n: {n!r}"}) + capped_n = max(1, min(n_int, _SAMPLE_TABLE_MAX_N)) + + sql = f"SELECT * FROM {full_name} ORDER BY RAND() LIMIT {capped_n}" + logger.debug("tool_sample_table: SQL=%s", sql) + + rows, err = _run_query(ctx, sql, tool_name="tool_sample_table") + if err is not None: + return json.dumps({"success": False, "error": err}) + + rows = rows or [] + columns: List[str] = list(rows[0].keys()) if rows else [] + stringified_rows: List[List[Optional[str]]] = [] + for row in rows: + stringified_rows.append( + [str(row[c]) if row.get(c) is not None else None for c in columns] + ) + logger.info( + "tool_sample_table: %d row(s) × %d column(s)", + len(stringified_rows), + len(columns), + ) + return json.dumps( + { + "success": True, + "columns": columns, + "rows": stringified_rows, + "row_count": len(stringified_rows), + } + ) + + +def tool_column_value_overlap( + ctx: ToolContext, + *, + from_table: str = "", + from_column: str = "", + to_table: str = "", + to_column: str = "", + **_kwargs, +) -> str: + """Compute the one-sided overlap + ``|distinct(from) ∩ distinct(to)| / |distinct(from)|``. + + The numerator dedupes ``from`` before intersecting. Returns 0.0 (and a + note) when ``from_distinct_count`` is zero to avoid division by zero. 
+ """ + logger.info( + "tool_column_value_overlap: %s.%s ↔ %s.%s", + from_table, + from_column, + to_table, + to_column, + ) + if not (from_table and from_column and to_table and to_column): + return json.dumps( + { + "success": False, + "error": "from_table, from_column, to_table, to_column are all required", + } + ) + + for value, role in ( + (from_table, "from_table"), + (from_column, "from_column"), + (to_table, "to_table"), + (to_column, "to_column"), + ): + err = _validate_identifier(value, role=role) + if err is not None: + return json.dumps({"success": False, "error": err}) + + sql = ( + "WITH from_distinct AS (" + f" SELECT DISTINCT {from_column} AS v FROM {from_table} " + f" WHERE {from_column} IS NOT NULL" + ")," + " to_distinct AS (" + f" SELECT DISTINCT {to_column} AS v FROM {to_table} " + f" WHERE {to_column} IS NOT NULL" + ")," + " inter AS (" + " SELECT v FROM from_distinct INTERSECT SELECT v FROM to_distinct" + ") " + "SELECT (SELECT COUNT(*) FROM from_distinct) AS from_distinct_count, " + " (SELECT COUNT(*) FROM to_distinct) AS to_distinct_count, " + " (SELECT COUNT(*) FROM inter) AS intersection_count" + ) + logger.debug("tool_column_value_overlap: SQL=%s", sql) + + rows, err = _run_query(ctx, sql, tool_name="tool_column_value_overlap") + if err is not None: + return json.dumps({"success": False, "error": err}) + if not rows: + return json.dumps( + {"success": False, "error": "overlap query returned no rows"} + ) + + row = rows[0] + from_distinct = int(row.get("from_distinct_count", 0) or 0) + to_distinct = int(row.get("to_distinct_count", 0) or 0) + intersection = int(row.get("intersection_count", 0) or 0) + + if from_distinct == 0: + result: Dict[str, Any] = { + "success": True, + "overlap_pct": 0.0, + "from_distinct_count": 0, + "to_distinct_count": to_distinct, + "intersection_count": 0, + "note": ( + f"{from_table}.{from_column} has zero distinct non-null values; " + "overlap_pct defaulted to 0.0 (no division by zero)." 
+ ), + } + else: + result = { + "success": True, + "overlap_pct": intersection / from_distinct, + "from_distinct_count": from_distinct, + "to_distinct_count": to_distinct, + "intersection_count": intersection, + # Symmetric shape with the zero-denom branch: downstream consumers + # can read ``note`` unconditionally. + "note": "", + } + logger.info( + "tool_column_value_overlap: overlap_pct=%.4f (%d/%d)", + result["overlap_pct"], + intersection, + from_distinct, + ) + return json.dumps(result) + + +def tool_normalized_value_overlap( + ctx: ToolContext, + *, + from_table: str = "", + from_expr: str = "", + to_table: str = "", + to_expr: str = "", + **_kwargs, +) -> str: + """Like :func:`tool_column_value_overlap`, but each side is an arbitrary + scalar SQL *expression* rather than a bare column. + + This is the tool the Planner uses to PROVE a canonical-key normalization + works before committing it. When two tables that map to the same ontology + class have 0% raw-column overlap, the values are trust-local encodings of + the same key. The Planner proposes a normalization expression per table + (e.g. ``regexp_extract(EPISODE_ID, '([a-f0-9][a-f0-9-]+-preg-\\d+)', 1)``) + and calls this tool to confirm the expressions land in a common value + space (overlap_pct > 0). A still-zero overlap means the normalization is + wrong — fix it before submitting. 
+ """ + logger.info( + "tool_normalized_value_overlap: %s[%s] ↔ %s[%s]", + from_table, + from_expr, + to_table, + to_expr, + ) + if not (from_table and from_expr and to_table and to_expr): + return json.dumps( + { + "success": False, + "error": "from_table, from_expr, to_table, to_expr are all required", + } + ) + + for value, role in ((from_table, "from_table"), (to_table, "to_table")): + err = _validate_identifier(value, role=role) + if err is not None: + return json.dumps({"success": False, "error": err}) + for value, role in ((from_expr, "from_expr"), (to_expr, "to_expr")): + err = _validate_safe_expression(value, role=role) + if err is not None: + return json.dumps({"success": False, "error": err}) + + sql = ( + "WITH from_distinct AS (" + f" SELECT DISTINCT {from_expr} AS v FROM {from_table} " + f" WHERE {from_expr} IS NOT NULL AND {from_expr} <> ''" + ")," + " to_distinct AS (" + f" SELECT DISTINCT {to_expr} AS v FROM {to_table} " + f" WHERE {to_expr} IS NOT NULL AND {to_expr} <> ''" + ")," + " inter AS (" + " SELECT v FROM from_distinct INTERSECT SELECT v FROM to_distinct" + ") " + "SELECT (SELECT COUNT(*) FROM from_distinct) AS from_distinct_count, " + " (SELECT COUNT(*) FROM to_distinct) AS to_distinct_count, " + " (SELECT COUNT(*) FROM inter) AS intersection_count" + ) + logger.debug("tool_normalized_value_overlap: SQL=%s", sql) + + rows, err = _run_query(ctx, sql, tool_name="tool_normalized_value_overlap") + if err is not None: + return json.dumps({"success": False, "error": err}) + if not rows: + return json.dumps( + {"success": False, "error": "overlap query returned no rows"} + ) + + row = rows[0] + from_distinct = int(row.get("from_distinct_count", 0) or 0) + to_distinct = int(row.get("to_distinct_count", 0) or 0) + intersection = int(row.get("intersection_count", 0) or 0) + + if from_distinct == 0: + result: Dict[str, Any] = { + "success": True, + "overlap_pct": 0.0, + "from_distinct_count": 0, + "to_distinct_count": to_distinct, + 
"intersection_count": 0, + "note": ( + f"{from_expr} over {from_table} produced zero distinct non-empty " + "values; the expression likely does not match the data — revise it." + ), + } + else: + result = { + "success": True, + "overlap_pct": intersection / from_distinct, + "from_distinct_count": from_distinct, + "to_distinct_count": to_distinct, + "intersection_count": intersection, + "note": "", + } + logger.info( + "tool_normalized_value_overlap: overlap_pct=%.4f (%d/%d)", + result["overlap_pct"], + intersection, + from_distinct, + ) + return json.dumps(result) + + +def tool_distinct_count( + ctx: ToolContext, *, full_name: str = "", column: str = "", **_kwargs +) -> str: + """Report row / distinct / null counts for ``full_name.column`` and + derive ``is_unique`` and ``is_complete`` flags. + + * ``is_unique = distinct_count == row_count - null_count`` — i.e. the + non-null subset has no duplicates. + * ``is_complete = null_count == 0`` — no missing values. + """ + logger.info("tool_distinct_count: %s.%s", full_name, column) + if not (full_name and column): + return json.dumps( + {"success": False, "error": "full_name and column are required"} + ) + + for value, role in ((full_name, "full_name"), (column, "column")): + err = _validate_identifier(value, role=role) + if err is not None: + return json.dumps({"success": False, "error": err}) + + sql = ( + f"SELECT COUNT(*) AS row_count, " + f" COUNT(DISTINCT {column}) AS distinct_count, " + f" COUNT(*) - COUNT({column}) AS null_count " + f"FROM {full_name}" + ) + logger.debug("tool_distinct_count: SQL=%s", sql) + + rows, err = _run_query(ctx, sql, tool_name="tool_distinct_count") + if err is not None: + return json.dumps({"success": False, "error": err}) + if not rows: + return json.dumps( + {"success": False, "error": "distinct_count query returned no rows"} + ) + + row = rows[0] + row_count = int(row.get("row_count", 0) or 0) + distinct_count = int(row.get("distinct_count", 0) or 0) + null_count = 
int(row.get("null_count", 0) or 0) + non_null_rows = row_count - null_count + + result = { + "success": True, + "row_count": row_count, + "distinct_count": distinct_count, + "null_count": null_count, + "is_unique": distinct_count == non_null_rows, + "is_complete": null_count == 0, + } + logger.info( + "tool_distinct_count: rows=%d, distinct=%d, nulls=%d, unique=%s, complete=%s", + row_count, + distinct_count, + null_count, + result["is_unique"], + result["is_complete"], + ) + return json.dumps(result) + + +def tool_submit_source_model( + ctx: ToolContext, *, model: Optional[dict] = None, **_kwargs +) -> str: + """Terminal Planner tool: validate ``model`` against + :class:`SourceModel` and stash the dataclass on ``ctx.source_model``. + + Only structural validity is checked here (does ``SourceModel.from_dict`` + succeed?). Semantic checks — e.g. coverage against the live ontology — + are the orchestrator's responsibility. + """ + # Local import to keep ``agents.tools`` importable without + # ``agents.agent_mapping_pge`` (avoids circular imports during pkg init). + from agents.agent_mapping_pge.contracts import SourceModel + + logger.info("tool_submit_source_model: validating candidate model") + if model is None or not isinstance(model, dict): + return json.dumps( + {"success": False, "error": "model must be a JSON object"} + ) + + try: + source_model = SourceModel.from_dict(model) + except (KeyError, TypeError, ValueError) as exc: + # ``KeyError`` for missing required fields; ``TypeError`` / ``ValueError`` + # for bad coercions (e.g. confidence not float-parseable). 
+ logger.warning( + "tool_submit_source_model: validation failed: %s: %s", + type(exc).__name__, + exc, + ) + return json.dumps( + { + "success": False, + "error": f"SourceModel validation failed: {type(exc).__name__}: {exc}", + } + ) + + ctx.source_model = source_model + summary = { + "table_roles": len(source_model.table_roles), + "canonical_ids": len(source_model.canonical_ids), + "join_keys": len(source_model.join_keys), + "entity_order_len": len(source_model.mapping_plan.entity_order), + "relationship_order_len": len(source_model.mapping_plan.relationship_order), + } + logger.info("tool_submit_source_model: stored — %s", summary) + return json.dumps({"success": True, "summary": summary}) + + +# ===================================================== +# OpenAI function-calling definitions +# ===================================================== + + +SAMPLE_TABLE_DEF: dict = { + "type": "function", + "function": { + "name": "sample_table", + "description": ( + "Return up to N random sample rows from a table so you can see actual values " + "(not just column types). n defaults to 20 and is capped at 100." + ), + "parameters": { + "type": "object", + "properties": { + "full_name": { + "type": "string", + "description": "Fully-qualified table name (catalog.schema.table).", + }, + "n": { + "type": "integer", + "description": "Sample size (default 20, max 100).", + }, + }, + "required": ["full_name"], + }, + }, +} + + +COLUMN_VALUE_OVERLAP_DEF: dict = { + "type": "function", + "function": { + "name": "column_value_overlap", + "description": ( + "Compute the one-sided overlap |distinct(from) ∩ distinct(to)| / |distinct(from)|. " + "Use this to validate a candidate join key before committing it to the SourceModel." 
+ ), + "parameters": { + "type": "object", + "properties": { + "from_table": { + "type": "string", + "description": "Fully-qualified source table.", + }, + "from_column": { + "type": "string", + "description": "Column on the source side (numerator denominator).", + }, + "to_table": { + "type": "string", + "description": "Fully-qualified target table.", + }, + "to_column": { + "type": "string", + "description": "Column on the target side.", + }, + }, + "required": ["from_table", "from_column", "to_table", "to_column"], + }, + }, +} + + +NORMALIZED_VALUE_OVERLAP_DEF: dict = { + "type": "function", + "function": { + "name": "normalized_value_overlap", + "description": ( + "Same overlap metric as column_value_overlap, but each side is a " + "scalar SQL EXPRESSION instead of a bare column. Use this to PROVE a " + "canonical-key normalization before committing it: when two tables " + "that map to the same ontology class have 0% raw-column overlap, " + "propose a normalization expression per table (e.g. " + "regexp_extract(EPISODE_ID, '([a-f0-9][a-f0-9-]+-preg-\\d+)', 1)) and " + "call this to confirm overlap_pct > 0. A still-zero result means the " + "expression is wrong — fix it before submit_source_model. Expressions " + "must be a single scalar (functions/literals/operators only); " + "subqueries and SQL keywords are rejected." + ), + "parameters": { + "type": "object", + "properties": { + "from_table": { + "type": "string", + "description": "Fully-qualified source table.", + }, + "from_expr": { + "type": "string", + "description": ( + "Scalar SQL expression over the source table that " + "produces the canonical key (e.g. a regexp_extract / " + "concat). Bare column names are also accepted." 
+ ), + }, + "to_table": { + "type": "string", + "description": "Fully-qualified target table.", + }, + "to_expr": { + "type": "string", + "description": "Scalar SQL expression over the target table.", + }, + }, + "required": ["from_table", "from_expr", "to_table", "to_expr"], + }, + }, +} + + +DISTINCT_COUNT_DEF: dict = { + "type": "function", + "function": { + "name": "distinct_count", + "description": ( + "Report row_count / distinct_count / null_count for a column, with is_unique " + "and is_complete flags. Use this to vet a candidate canonical-ID column." + ), + "parameters": { + "type": "object", + "properties": { + "full_name": { + "type": "string", + "description": "Fully-qualified table name (catalog.schema.table).", + }, + "column": { + "type": "string", + "description": "Column to characterise.", + }, + }, + "required": ["full_name", "column"], + }, + }, +} + + +SUBMIT_SOURCE_MODEL_DEF: dict = { + "type": "function", + "function": { + "name": "submit_source_model", + "description": ( + "Terminal Planner tool. Submit the final SourceModel JSON (matching " + "SourceModel.to_dict() shape). Validates the structure and stores the " + "dataclass on the ToolContext for the Generator stage to consume." + ), + "parameters": { + "type": "object", + "properties": { + "model": { + "type": "object", + "description": ( + "JSON-encoded SourceModel with table_roles, canonical_ids, " + "join_keys, and mapping_plan." 
+ ), + } + }, + "required": ["model"], + }, + }, +} + + +# ===================================================== +# Aggregate exports +# ===================================================== + + +PLANNER_TOOL_DEFINITIONS: List[dict] = [ + SAMPLE_TABLE_DEF, + COLUMN_VALUE_OVERLAP_DEF, + NORMALIZED_VALUE_OVERLAP_DEF, + DISTINCT_COUNT_DEF, + SUBMIT_SOURCE_MODEL_DEF, +] + + +PLANNER_TOOL_HANDLERS: Dict[str, Callable] = { + "sample_table": tool_sample_table, + "column_value_overlap": tool_column_value_overlap, + "normalized_value_overlap": tool_normalized_value_overlap, + "distinct_count": tool_distinct_count, + "submit_source_model": tool_submit_source_model, +} diff --git a/src/api/routers/internal/dtwin.py b/src/api/routers/internal/dtwin.py index 65a78cdd..3256fe88 100644 --- a/src/api/routers/internal/dtwin.py +++ b/src/api/routers/internal/dtwin.py @@ -1458,10 +1458,34 @@ def _is_ready(ep: dict) -> bool: state = (ep.get("state") or "").upper() return state in ("READY", "TRUE", "UP") + def _is_tool_incompatible(name: str) -> bool: + """Reasoning-first models that reject function tools via the standard + /v1/chat/completions path (they require /v1/responses) — picking one + breaks Graph Chat, which is a tool-calling agent. Skip them in + auto-discovery. e.g. ``databricks-gpt-5-5`` returns HTTP 400 + "Function tools with reasoning_effort are not supported ... use + /v1/responses instead". + """ + n = (name or "").lower() + markers = ("gpt-5", "gpt5", "-o1", "-o3", "-o4-", "reasoning") + return any(m in n for m in markers) + + # Preferred: a tool-capable Databricks foundation model. for ep in endpoints: name = ep.get("name") or "" - if name.startswith("databricks-") and _is_ready(ep): + if ( + name.startswith("databricks-") + and _is_ready(ep) + and not _is_tool_incompatible(name) + ): return name + # Next: any ready endpoint that isn't a known tool-incompatible model. 
+ for ep in endpoints: + name = ep.get("name") or "" + if name and _is_ready(ep) and not _is_tool_incompatible(name): + return name + # Last resort: a ready endpoint even if it may be tool-incompatible + # (better to try than to return nothing). for ep in endpoints: if _is_ready(ep) and ep.get("name"): return ep["name"] @@ -1724,8 +1748,20 @@ async def dtwin_assistant_chat_stream( event_queue: asyncio.Queue = asyncio.Queue() def _on_event(step: AgentStep) -> None: - """Forward an AgentStep from the sync thread to the async generator.""" - asyncio.run_coroutine_threadsafe(event_queue.put(step), loop).result(timeout=10) + """Forward an AgentStep from the sync thread to the async generator. + + Best-effort: step events drive the live progress UI only — the final + reply is delivered separately via the ``done`` event. If the async + consumer is slow (slow SSE client, long-running tool), enqueueing must + NOT raise, or the timeout would crash the whole agent turn. Drop the + progress event instead and let the agent keep running. + """ + try: + asyncio.run_coroutine_threadsafe( + event_queue.put(step), loop + ).result(timeout=10) + except Exception as exc: # noqa: BLE001 — progress delivery is non-critical + logger.debug("GraphChat/stream: dropped progress event: %s", exc) async def _run_agent_task() -> None: try: @@ -1986,6 +2022,7 @@ async def dtwin_triples_find( depth: int = 1, limit: int = 1000, offset: int = 0, + seed_limit: int = 0, session_mgr: SessionManager = Depends(get_session_manager), settings: Settings = Depends(get_settings), ): @@ -2004,6 +2041,10 @@ async def dtwin_triples_find( depth = max(1, min(int(depth or 1), 10)) limit = max(1, min(int(limit or 1000), 10000)) offset = max(0, int(offset or 0)) + # 0 = unbounded (back-compat for callers that paginate over all matches); + # >0 caps BFS seeds so a broad search ("mother") can't seed hundreds of + # subjects and blow up the recursive traversal. 
+ seed_limit = max(0, min(int(seed_limit or 0), 1000)) domain = get_domain(session_mgr) table = effective_graph_name(domain) @@ -2044,6 +2085,7 @@ async def dtwin_triples_find( depth, search=search or "", entity_type=entity_type or "", + seed_limit=seed_limit, ) if not bfs_rows: diff --git a/src/api/routers/internal/ontology.py b/src/api/routers/internal/ontology.py index 2e6cf990..0785856a 100644 --- a/src/api/routers/internal/ontology.py +++ b/src/api/routers/internal/ontology.py @@ -240,11 +240,19 @@ async def export_owl(session_mgr: SessionManager = Depends(get_session_manager)) async def get_loaded_ontology( session_mgr: SessionManager = Depends(get_session_manager), ): - """Get currently loaded ontology from session.""" + """Get currently loaded ontology from session. + + Returns ``success: false`` (HTTP 200) when no ontology is loaded rather + than a 404 — "no ontology yet" is a normal empty state on pages that load + before a domain is selected (e.g. landing on Digital Twin / Graph Chat in + a fresh session). All callers branch on ``data.success``, so a 200 + empty-state is handled identically while avoiding noisy 404s in the + browser console and server logs. + """ domain = get_domain(session_mgr) if domain.get_classes(): return {"success": True, "ontology": domain.ontology} - raise NotFoundError("No ontology loaded") + return {"success": False, "ontology": None, "message": "No ontology loaded"} @router.post("/parse-owl") @@ -1671,6 +1679,8 @@ def on_step(msg: str): tm.advance_step(task.id, "Finalizing…") + # Upstream's per-iteration generation score (from the agent's + # pitfall-tool quality loop). 
iteration_summary = agent_result.iteration_summary or [] final_score = ( iteration_summary[-1]["score"] if iteration_summary else None @@ -1680,6 +1690,27 @@ def on_step(msg: str): and iteration_summary[-1]["status"] in ("passed", "max_rounds_reached") ) + # Run the PGE intrinsic evaluator in-app (deterministic, no extra + # LLM) — a complementary structural scorecard + GREEN/RED verdict. + # Never breaks generation: the import + call are guarded so even an + # import-time failure can't fail an already-good run. + scorecard = None + try: + from agents.pge_eval.inapp import score_generated_ontology + + scorecard = score_generated_ontology(owl_content, metadata) + except Exception as score_exc: # noqa: BLE001 + logger.warning("Wizard: in-app scoring unavailable: %s", score_exc) + + message = ( + f"Generated {stats.get('classes', 0)} classes, " + f"{stats.get('properties', 0)} properties " + f"({agent_result.iterations} agent iterations)" + + (f" — quality score {final_score}/100" if final_score is not None else "") + ) + if scorecard: + message += f" · PGE {scorecard['verdict']}" + tm.complete_task( task.id, result={ @@ -1691,13 +1722,9 @@ def on_step(msg: str): "iteration_summary": iteration_summary, "generation_score": final_score, "generation_converged": converged, + "pge_scorecard": scorecard, }, - message=( - f"Generated {stats.get('classes', 0)} classes, " - f"{stats.get('properties', 0)} properties " - f"({agent_result.iterations} agent iterations)" - + (f" — quality score {final_score}/100" if final_score is not None else "") - ), + message=message, ) except Exception as e: diff --git a/src/back/core/agents/AgentClient.py b/src/back/core/agents/AgentClient.py index 5d7ab263..de8a63a3 100644 --- a/src/back/core/agents/AgentClient.py +++ b/src/back/core/agents/AgentClient.py @@ -11,7 +11,7 @@ if TYPE_CHECKING: from agents.agent_owl_generator.engine import AgentResult - from agents.agent_auto_assignment.engine import AgentResult as AutoAssignAgentResult + from 
agents.agent_mapping_pge.engine import AgentResult as AutoAssignAgentResult from agents.agent_auto_icon_assign.engine import ( AgentResult as IconAssignAgentResult, ) @@ -109,13 +109,13 @@ def run_auto_assignment( max_iterations: Upper bound on agent refinement iterations. Returns: - Structured result from ``agents.agent_auto_assignment`` describing + Structured result from ``agents.agent_mapping_pge`` describing proposed mappings and per-item status. Raises: Exception: Propagates any failure raised by ``run_agent``. """ - from agents.agent_auto_assignment import run_agent + from agents.agent_mapping_pge import run_agent return run_agent( host=host, diff --git a/src/back/core/databricks/DatabricksAuth.py b/src/back/core/databricks/DatabricksAuth.py index 2f3e35df..cb568275 100644 --- a/src/back/core/databricks/DatabricksAuth.py +++ b/src/back/core/databricks/DatabricksAuth.py @@ -258,7 +258,25 @@ def can_use_cloud_fetch(self) -> bool: return True def probe_cloud_fetch_capability(self) -> Tuple[bool, str]: - """Issue a tiny ``SELECT 1`` with ``use_cloud_fetch=True`` and cache the outcome. + """Probe whether the runtime can actually download CloudFetch result + blobs from the storage host, and cache the outcome. + + Two-stage probe so a blocked-egress Apps sandbox is caught quickly + without burning 40 MB of bandwidth on every cache miss: + + 1. **TCP reachability** to known AWS CloudFetch storage hosts. The + Databricks Apps egress firewall blocks the whole + ``*.storage.cloud.databricks.com`` family at L3/L4, so a plain + TCP connect with a short timeout returns connection-refused + almost instantly. This is the fast, accurate path on AWS. + + 2. **SQL load-test** as a backstop: ``SELECT id FROM range(N)`` with + N large enough that the warehouse returns presigned-URL + CloudFetch links instead of inline Thrift rows. 
Has to be on + the order of millions of BIGINTs to clear the typical 10-20 MB + inline threshold — smaller queries get returned inline and the + probe never touches storage at all (this was the + original-probe bug, where ``SELECT 1`` always reported "ok"). Returns ``(capable, reason)``. The result is cached at the class level for ``_CLOUD_FETCH_PROBE_TTL_SECONDS`` so subsequent SQL @@ -276,6 +294,14 @@ def probe_cloud_fetch_capability(self) -> Tuple[bool, str]: self._record_cloud_fetch(False, prereq_msg) return False, prereq_msg + # ── Stage 1: direct TCP egress check ──────────────────────────── + tcp_ok, tcp_reason = self._probe_cloud_fetch_storage_egress() + if not tcp_ok: + self._record_cloud_fetch(False, tcp_reason) + logger.info("CloudFetch probe: not capable (%s)", tcp_reason) + return False, tcp_reason + + # ── Stage 2: SQL load-test (large enough to force CloudFetch) ── try: from databricks import sql @@ -292,20 +318,66 @@ def probe_cloud_fetch_capability(self) -> Tuple[bool, str]: elif self.token: probe_params["access_token"] = self.token + # 5M BIGINTs ≈ 40 MB raw, ~10-20 MB Arrow-compressed — over the + # typical warehouse inline threshold, so the warehouse returns + # CloudFetch presigned URLs which the connector downloads + # during ``fetchmany``. A blocked storage host raises there. 
+ probe_sql = "SELECT id FROM range(5000000)" with sql.connect(**probe_params) as conn: with conn.cursor() as cur: - cur.execute("SELECT 1") - cur.fetchall() - msg = "Probe SELECT 1 succeeded with use_cloud_fetch=True" + cur.execute(probe_sql) + cur.fetchmany(1) + msg = ( + "Probe SELECT id FROM range(5000000) succeeded " + "with use_cloud_fetch=True (TCP egress + CloudFetch reachable)" + ) self._record_cloud_fetch(True, msg) logger.info("CloudFetch probe: capable (%s)", msg) return True, msg except Exception as exc: # noqa: BLE001 - vendor/network surface - msg = f"Probe SELECT 1 failed with use_cloud_fetch=True: {exc}" + msg = ( + "Probe SELECT id FROM range(5000000) failed with " + f"use_cloud_fetch=True: {exc}" + ) self._record_cloud_fetch(False, msg) logger.info("CloudFetch probe: not capable (%s)", msg) return False, msg + # AWS CloudFetch presigned-URL storage hosts. Databricks Apps blocks + # the whole family at the L3/L4 egress firewall, so a TCP connect with + # a short timeout returns connection-refused almost instantly. We + # probe two common regions; if either is blocked we treat egress as + # blocked everywhere (Apps doesn't selectively allow some regions). + _CLOUD_FETCH_STORAGE_HOSTS = ( + "us-east-1.storage.cloud.databricks.com", + "us-west-2.storage.cloud.databricks.com", + ) + + def _probe_cloud_fetch_storage_egress(self) -> Tuple[bool, str]: + """TCP-connect to known CloudFetch storage hosts with a short + timeout. Returns ``(True, msg)`` only if every probe host is + reachable; the first failure is enough to declare egress blocked. + """ + import socket + + # Only applies to AWS workspaces; Azure CloudFetch uses a + # different storage host pattern. For non-AWS hosts, skip the + # TCP check and rely solely on the SQL load-test below. 
+ if "cloud.databricks.com" not in self.host: + return True, "Workspace is not AWS — skipping TCP egress probe" + + for host in self._CLOUD_FETCH_STORAGE_HOSTS: + try: + with socket.create_connection((host, 443), timeout=3): + pass + except (OSError, socket.timeout) as exc: + return ( + False, + f"CloudFetch storage host {host} unreachable " + f"(TCP egress blocked): {exc}", + ) + return True, "TCP egress to CloudFetch storage hosts is reachable" + def _record_cloud_fetch(self, capable: bool, reason: str) -> None: DatabricksAuth._cloud_fetch_cache[(self.host, self.warehouse_id)] = ( capable, diff --git a/src/back/core/triplestore/TripleStoreBackend.py b/src/back/core/triplestore/TripleStoreBackend.py index 2d6e9a26..c60bc1c9 100644 --- a/src/back/core/triplestore/TripleStoreBackend.py +++ b/src/back/core/triplestore/TripleStoreBackend.py @@ -314,6 +314,7 @@ def bfs_traversal( depth: int, search: str = "", entity_type: str = "", + seed_limit: int = 0, ) -> List[Dict[str, Any]]: """BFS traversal from seed entities. @@ -323,6 +324,14 @@ def bfs_traversal( *search* and *entity_type* are structured parameters for future non-SQL backends (Cypher, Gremlin) that cannot use raw SQL fragments. + *seed_limit* (when > 0) caps the number of seed entities the BFS + starts from. A broad search (e.g. "mother" matching every Mother) + otherwise seeds hundreds of subjects and the recursive OR-join + expansion over the whole graph becomes very expensive. Capping seeds + bounds the entire traversal+fetch pipeline — ideal for "describe a few + matching entities" (the Graph Chat agent's use), which never needs all + matches at once. + Returns rows with ``entity`` and ``min_lvl`` columns. 
""" edge_filters = ( @@ -332,9 +341,10 @@ def bfs_traversal( f"AND t.predicate != '{RDFS_LABEL}' " f"AND (t.object LIKE 'http://%' OR t.object LIKE 'https://%')" ) + seed_cap = f" LIMIT {int(seed_limit)}" if seed_limit and seed_limit > 0 else "" sql = ( f"WITH RECURSIVE seeds AS (\n" - f" SELECT DISTINCT subject AS entity FROM {self._sql_relation(table_name)}{seed_where}\n" + f" SELECT DISTINCT subject AS entity FROM {self._sql_relation(table_name)}{seed_where}{seed_cap}\n" f"), bfs(entity, lvl) AS (\n" f" SELECT entity, 0 FROM seeds\n" f" UNION ALL\n" diff --git a/src/back/objects/mapping/Mapping.py b/src/back/objects/mapping/Mapping.py index 61ebdf76..fc7ccbb5 100644 --- a/src/back/objects/mapping/Mapping.py +++ b/src/back/objects/mapping/Mapping.py @@ -27,7 +27,7 @@ _MAX_DOC_CHARS = 50_000 if TYPE_CHECKING: - from agents.agent_auto_assignment.engine import AgentResult as AutoAssignAgentResult + from agents.agent_mapping_pge.engine import AgentResult as AutoAssignAgentResult SINGLE_ITEM_MAX_ITERATIONS = 15 @@ -78,13 +78,18 @@ def auto_assign_with_agent( on_step: Optional[Callable[[str, int], None]] = None, max_iterations: Optional[int] = None, ) -> "AutoAssignAgentResult": - """Run ``agent_auto_assignment`` (blocking). + """Run the mapping-PGE agent (``agent_mapping_pge``) — blocking. + + Returns an :class:`AgentResult` with the standard ``entity_mappings`` + and ``relationship_mappings`` plus three PGE-specific extras + (``source_model``, ``mapping_evaluations``, ``mapping_run_log``) that + the caller can persist on the session. ``client`` is typically a :class:`~back.core.databricks.DatabricksClient` built with the domain warehouse. Call from a background thread when started from HTTP. 
""" - from agents.agent_auto_assignment import run_agent + from agents.agent_mapping_pge import run_agent return run_agent( host=host, @@ -165,6 +170,12 @@ def run_auto_assign_task( total_iterations = 0 total_usage = {"prompt_tokens": 0, "completion_tokens": 0} chunk_errors: List[str] = [] + # PGE-specific extras accumulated across chunks. Each chunk + # re-plans, so ``last_source_model`` reflects the most recent + # plan; per-item evaluations / run logs concatenate cleanly. + last_source_model: Optional[Dict[str, Any]] = None + merged_mapping_evaluations: Dict[str, Any] = {} + merged_mapping_run_log: List[Any] = [] for chunk_idx, chunk in enumerate(chunks): chunk_num = chunk_idx + 1 @@ -261,6 +272,19 @@ def on_step(msg: str, progress_pct: int = 0) -> None: for k in total_usage: total_usage[k] += agent_result.usage.get(k, 0) + # PGE extras — accumulate. The new engine returns these as + # dicts/lists (drop-in compatible). The legacy engine omitted + # them; ``getattr`` with defaults keeps us tolerant. 
+ chunk_source_model = getattr(agent_result, "source_model", None) + if chunk_source_model: + last_source_model = chunk_source_model + chunk_evals = getattr(agent_result, "mapping_evaluations", None) or {} + if chunk_evals: + merged_mapping_evaluations.update(chunk_evals) + chunk_run_log = getattr(agent_result, "mapping_run_log", None) or [] + if chunk_run_log: + merged_mapping_run_log.extend(chunk_run_log) + e_done = len(entity_mapping_by_uri) r_done = len(rel_mapping_by_uri) @@ -345,12 +369,37 @@ def on_step(msg: str, progress_pct: int = 0) -> None: all_relationship_mappings, existing_entity_mappings=entity_mappings, existing_relationship_mappings=relationship_mappings, + source_model=last_source_model, + mapping_evaluations=merged_mapping_evaluations or None, + mapping_run_log=merged_mapping_run_log or None, ) message = f"Completed: {e_count} entities, {r_count} relationships mapped" if chunk_errors: message += f" ({len(chunk_errors)} chunk(s) had errors)" + # Run the PGE intrinsic evaluator in-app on the completed mapping + # run (deterministic — re-uses the captured per-item evaluations, + # no extra LLM). Never breaks the run; the import + call are guarded + # so even an import-time failure can't fail an already-good run. 
+ scorecard = None + try: + from agents.pge_eval.inapp import score_mapping_run + + scorecard = score_mapping_run( + ontology={"entities": entities, "relationships": relationships}, + metadata=schema_context, + mapping_run_log=merged_mapping_run_log, + mapping_evaluations=merged_mapping_evaluations, + entity_mappings=all_entity_mappings, + relationship_mappings=all_relationship_mappings, + usage=total_usage, + ) + except Exception as score_exc: # noqa: BLE001 + logger.warning("Auto-assign: in-app scoring unavailable: %s", score_exc) + if scorecard: + message += f" · quality {scorecard['verdict']}" + tm.complete_task( task.id, result={ @@ -365,6 +414,15 @@ def on_step(msg: str, progress_pct: int = 0) -> None: "agent_steps": serialize_agent_steps(all_steps), "agent_iterations": total_iterations, "agent_usage": total_usage, + "pge_scorecard": scorecard, + # PGE run-visualizer payload — the planner's source model, + # per-item evaluator verdicts, and the attempt-by-attempt run + # log. Persisted to the session already; also surfaced here so + # the UI can render the planner→generator→evaluator→critic loop + # from the polled task result without a second round-trip. + "source_model": last_source_model, + "mapping_evaluations": merged_mapping_evaluations or None, + "mapping_run_log": merged_mapping_run_log or None, }, message=message, ) @@ -449,6 +507,11 @@ def on_step(msg: str, progress_pct: int = 0) -> None: tm.fail_task(task.id, "Agent completed but produced no mapping") return + # PGE extras from this single-item run — passed through verbatim. 
+ single_source_model = getattr(agent_result, "source_model", None) + single_evals = getattr(agent_result, "mapping_evaluations", None) or None + single_run_log = getattr(agent_result, "mapping_run_log", None) or None + if item_type == "entity": Mapping.save_mappings_to_session( session_id, @@ -456,6 +519,9 @@ def on_step(msg: str, progress_pct: int = 0) -> None: agent_result.entity_mappings, None, existing_entity_mappings=existing_entity_mappings, + source_model=single_source_model, + mapping_evaluations=single_evals, + mapping_run_log=single_run_log, ) else: Mapping.save_mappings_to_session( @@ -464,6 +530,9 @@ def on_step(msg: str, progress_pct: int = 0) -> None: None, agent_result.relationship_mappings, existing_relationship_mappings=existing_relationship_mappings, + source_model=single_source_model, + mapping_evaluations=single_evals, + mapping_run_log=single_run_log, ) tm.complete_task( @@ -472,6 +541,12 @@ def on_step(msg: str, progress_pct: int = 0) -> None: "item_type": item_type, "mapping": mapping, "iterations": agent_result.iterations, + # PGE run-visualizer payload (see batch path) — surface the + # single-item planner model, evaluator verdicts and run log + # so the UI renders the same loop view for one-off re-maps. 
+ "source_model": single_source_model, + "mapping_evaluations": single_evals, + "mapping_run_log": single_run_log, }, message=f"Assigned {item_type}: {item_name}", ) @@ -973,6 +1048,9 @@ def save_mappings_to_session( *, existing_entity_mappings: Optional[list] = None, existing_relationship_mappings: Optional[list] = None, + source_model: Optional[Dict[str, Any]] = None, + mapping_evaluations: Optional[Dict[str, Any]] = None, + mapping_run_log: Optional[List[Any]] = None, ) -> None: if not session_id: logger.warning("save_mappings_to_session: no session_id — skipping") @@ -1010,6 +1088,21 @@ def save_mappings_to_session( else: assignment["relationships"] = relationship_mappings + # Mapping-PGE extras — persisted alongside the assignment so the + # UI (future work) and downstream observability can surface + # planner state, per-item evaluation reports, and the per-item + # attempt log without re-running the agent. + if source_model is not None: + assignment["source_model"] = source_model + if mapping_evaluations is not None: + merged_evals = dict(assignment.get("mapping_evaluations") or {}) + merged_evals.update(mapping_evaluations) + assignment["mapping_evaluations"] = merged_evals + if mapping_run_log is not None: + existing_log = list(assignment.get("mapping_run_log") or []) + existing_log.extend(mapping_run_log) + assignment["mapping_run_log"] = existing_log + domain_node = bucket.setdefault("domain", {}) domain_node["assignment_changed"] = True diff --git a/src/front/static/mapping/css/mapping-pge-visualizer.css b/src/front/static/mapping/css/mapping-pge-visualizer.css new file mode 100644 index 00000000..c0c6dee5 --- /dev/null +++ b/src/front/static/mapping/css/mapping-pge-visualizer.css @@ -0,0 +1,147 @@ +/* PGE Run-Visualizer — surfaces the Planner→Generator→Evaluator→Critic loop. + * JS: /static/mapping/js/mapping-pge-visualizer.js + * Component-level classes only (ob-pge-*), Bootstrap 5.3 for everything else. 
*/ + +.ob-pge-card { + border: 1px solid var(--bs-border-color, #dee2e6); + border-radius: 0.5rem; + overflow: hidden; +} + +.ob-pge-header { + display: flex; + align-items: center; + justify-content: space-between; + gap: 0.75rem; + padding: 0.75rem 1rem; + background: linear-gradient(90deg, #f3f0ff 0%, #eef5ff 100%); + border-bottom: 1px solid var(--bs-border-color, #dee2e6); +} + +.ob-pge-stages { + display: flex; + align-items: center; + gap: 0.35rem; + font-weight: 600; + font-size: 0.95rem; + color: #343a40; + flex-wrap: wrap; +} +.ob-pge-stage-chip { + display: inline-flex; + align-items: center; + gap: 0.3rem; + padding: 0.15rem 0.55rem; + border-radius: 999px; + background: #fff; + border: 1px solid #d7d2f0; + font-size: 0.8rem; +} +.ob-pge-stage-arrow { color: #adb5bd; } + +/* Verdict pill */ +.ob-pge-verdict { + font-size: 0.85rem; + font-weight: 700; + letter-spacing: 0.02em; + padding: 0.3rem 0.7rem; + border-radius: 999px; +} +.ob-pge-verdict-green { background: #d1e7dd; color: #0f5132; } +.ob-pge-verdict-red { background: #f8d7da; color: #842029; } +.ob-pge-verdict-na { background: #e2e3e5; color: #41464b; } + +/* KPI strip */ +.ob-pge-kpis { + display: flex; + flex-wrap: wrap; + gap: 0.6rem; + padding: 0.85rem 1rem; + border-bottom: 1px solid var(--bs-border-color, #dee2e6); + background: #fcfcfd; +} +.ob-pge-kpi { + flex: 1 1 120px; + min-width: 110px; + background: #fff; + border: 1px solid #eef0f2; + border-radius: 0.4rem; + padding: 0.5rem 0.7rem; +} +.ob-pge-kpi-val { font-size: 1.15rem; font-weight: 700; line-height: 1.1; } +.ob-pge-kpi-lbl { font-size: 0.72rem; color: #6c757d; text-transform: uppercase; letter-spacing: 0.03em; } + +.ob-pge-gates { display: flex; gap: 0.4rem; flex-wrap: wrap; padding: 0.6rem 1rem 0.85rem; } +.ob-pge-gate { + display: inline-flex; align-items: center; gap: 0.35rem; + font-size: 0.78rem; padding: 0.25rem 0.6rem; border-radius: 0.35rem; + border: 1px solid transparent; +} +.ob-pge-gate-pass { background: #e8f6ee; 
color: #0f5132; border-color: #b6ddc6; } +.ob-pge-gate-fail { background: #fdecea; color: #842029; border-color: #f1b0b7; } + +/* Per-item loop trace */ +.ob-pge-item { + border-top: 1px solid #f0f1f3; + padding: 0.6rem 1rem; +} +.ob-pge-item:first-child { border-top: none; } +.ob-pge-item-head { + display: flex; align-items: center; gap: 0.5rem; + cursor: pointer; +} +.ob-pge-item-name { font-weight: 600; flex: 1 1 auto; } +.ob-pge-item-name code { font-weight: 600; color: #4530a8; background: none; padding: 0; } + +/* Attempt chain */ +.ob-pge-attempts { margin: 0.5rem 0 0.2rem; padding-left: 0.25rem; } +.ob-pge-attempt { + display: flex; align-items: flex-start; gap: 0.5rem; + padding: 0.35rem 0; + border-left: 2px solid #e9ecef; + padding-left: 0.75rem; + margin-left: 0.4rem; +} +.ob-pge-attempt-num { + flex: 0 0 auto; font-size: 0.72rem; color: #6c757d; + background: #f1f3f5; border-radius: 999px; padding: 0.05rem 0.45rem; margin-top: 0.1rem; +} +.ob-pge-chain { display: flex; align-items: center; gap: 0.3rem; flex-wrap: wrap; } +.ob-pge-step { + display: inline-flex; align-items: center; gap: 0.25rem; + font-size: 0.76rem; padding: 0.1rem 0.45rem; border-radius: 0.3rem; + border: 1px solid #e3e6ea; background: #fff; +} +.ob-pge-step-pass { border-color: #b6ddc6; background: #f0faf4; color: #0f5132; } +.ob-pge-step-fail { border-color: #f1b0b7; background: #fdf2f3; color: #842029; } +.ob-pge-step-skip { color: #868e96; } +.ob-pge-step-bubble { border-color: #ffe0a6; background: #fff8ec; color: #8a5a00; } +.ob-pge-arrow { color: #ced4da; font-size: 0.7rem; } +.ob-pge-hint { + font-size: 0.76rem; color: #6c4a00; background: #fff8ec; + border: 1px solid #ffe9c2; border-radius: 0.3rem; padding: 0.3rem 0.5rem; + margin: 0.25rem 0 0.1rem; display: block; +} +.ob-pge-metrics-inline { + font-size: 0.74rem; color: #495057; margin-top: 0.2rem; display: flex; flex-wrap: wrap; gap: 0.5rem; +} +.ob-pge-metrics-inline span { white-space: nowrap; } + +/* Long free-text eval 
fields (e.g. critic reasoning) — wrap, don't overflow. */ +.ob-pge-reasoning { + font-size: 0.78rem; + color: #495057; + margin-top: 0.3rem; + line-height: 1.4; + white-space: normal; + overflow-wrap: anywhere; +} +.ob-pge-reasoning .text-muted { font-weight: 600; } + +/* Source model panel */ +.ob-pge-sm-table { font-size: 0.8rem; } +.ob-pge-sm-table th { white-space: nowrap; } +.ob-pge-conf-bar { + display: inline-block; height: 6px; border-radius: 3px; background: #6f42c1; + vertical-align: middle; margin-right: 0.35rem; +} diff --git a/src/front/static/mapping/js/mapping-autoassign.js b/src/front/static/mapping/js/mapping-autoassign.js index 1b7546df..145f4d95 100644 --- a/src/front/static/mapping/js/mapping-autoassign.js +++ b/src/front/static/mapping/js/mapping-autoassign.js @@ -189,6 +189,7 @@ window.AutoAssignModule = { console.log('[AutoAssign] Task completed, applying results'); sessionStorage.removeItem(AUTO_ASSIGN_TASK_KEY); this.results = task.result.results || []; + this.taskResult = task.result; await this.saveMappingsFromTask(task.result); this.showReport(); await this.refreshMappingConfig(); @@ -449,6 +450,7 @@ window.AutoAssignModule = { if (task.result) { this.results = task.result.results || []; + this.taskResult = task.result; await this.saveMappingsFromTask(task.result); } @@ -773,6 +775,17 @@ window.AutoAssignModule = { `; }).join(''); + // Render the PGE run-visualizer (planner→generator→evaluator→critic + // loop + scorecard) from the captured task result. Defensive: never + // let a visualizer error break the report. 
+ if (window.PgeVisualizer) { + try { + PgeVisualizer.render(this.taskResult || {}, 'autoAssignPgeVisualizer'); + } catch (e) { + console.error('[AutoAssign] PGE visualizer render failed:', e); + } + } + // Show notification if (successCount > 0) { showNotification(`Auto-mapped ${successCount} item(s) successfully`, 'success', 3000); @@ -784,6 +797,9 @@ window.AutoAssignModule = { */ reset: function() { this.results = []; + this.taskResult = null; + const pgeEl = document.getElementById('autoAssignPgeVisualizer'); + if (pgeEl) { pgeEl.style.display = 'none'; pgeEl.innerHTML = ''; } document.getElementById('autoAssignProgressSection').style.display = 'none'; document.getElementById('autoAssignReportSection').style.display = 'none'; document.getElementById('startAutoAssignBtn').style.display = 'inline-block'; diff --git a/src/front/static/mapping/js/mapping-pge-visualizer.js b/src/front/static/mapping/js/mapping-pge-visualizer.js new file mode 100644 index 00000000..b0a730dc --- /dev/null +++ b/src/front/static/mapping/js/mapping-pge-visualizer.js @@ -0,0 +1,347 @@ +/* + * PGE Run-Visualizer + * ------------------- + * Renders the Planner → Generator → Evaluator → Critic loop from a completed + * auto-map task result. Consumes the PGE artifacts surfaced on `task.result`: + * - pge_scorecard : intrinsic-eval scorecard (verdict + gate tiers + metrics) + * - source_model : the Planner's output (table roles, canonical ids, joins, plan) + * - mapping_run_log[] : per-item attempt-by-attempt trace + * - mapping_evaluations{} : per-item final EvalReport (metrics + failures) + * + * Entirely defensive — any field may be missing (legacy engine, partial run). + * If there is nothing PGE-specific to show, render() hides the container. 
+ * + * Public API: PgeVisualizer.render(taskResult, containerId) + */ +const PgeVisualizer = (function () { + 'use strict'; + + // ---- small helpers ------------------------------------------------- + function esc(s) { + if (s === null || s === undefined) return ''; + return String(s) + .replace(/&/g, '&').replace(//g, '>') + .replace(/"/g, '"').replace(/'/g, '''); + } + + function humanize(key) { + return String(key) + .replace(/_/g, ' ') + .replace(/\bpct\b/gi, '%') + .replace(/\b(\w)/g, (m) => m.toUpperCase()); + } + + // Format a known ratio metric as a percentage (so an exact 1.0 reads + // "100%", not "1" — JS treats 1.0 as an integer, so fmtMetric alone can't). + function fmtRatio(v) { + if (typeof v !== 'number') return fmtMetric(v); + return (v * 100).toFixed(1).replace(/\.0$/, '') + '%'; + } + + // Format a metric value: floats in [0,1] become percentages. + function fmtMetric(v) { + if (typeof v === 'number') { + if (Number.isInteger(v)) return String(v); + if (v >= 0 && v <= 1) return (v * 100).toFixed(1) + '%'; + return v.toFixed(3); + } + if (Array.isArray(v)) return v.length ? v.join(', ') : '—'; + if (v === null || v === undefined) return '—'; + return esc(v); + } + + function shortUri(uri) { + if (!uri) return ''; + const s = String(uri); + const hashIdx = s.lastIndexOf('#'); + const slashIdx = s.lastIndexOf('/'); + const cut = Math.max(hashIdx, slashIdx); + return cut >= 0 && cut < s.length - 1 ? 
s.slice(cut + 1) : s; + } + + function statusBadge(status) { + const map = { + PASS: ['bg-success', 'check-circle-fill', 'Pass'], + PRESEEDED: ['bg-info', 'bookmark-check-fill', 'Pre-seeded'], + SKIPPED: ['bg-secondary', 'dash-circle-fill', 'Skipped'], + FAIL_BUDGET: ['bg-danger', 'x-circle-fill', 'Failed (budget)'], + FAIL_BUBBLE: ['bg-danger', 'x-circle-fill', 'Failed (re-plan)'], + }; + const [cls, icon, label] = map[status] || ['bg-secondary', 'question-circle', status || '?']; + return `${esc(label)}`; + } + + // ---- scorecard: verdict pill (goes inside the header) -------------- + function renderVerdictPill(sc) { + const verdict = (sc && sc.verdict) || 'N/A'; + const vClass = verdict === 'GREEN' ? 'ob-pge-verdict-green' + : verdict === 'RED' ? 'ob-pge-verdict-red' : 'ob-pge-verdict-na'; + const vIcon = verdict === 'GREEN' ? 'shield-check' + : verdict === 'RED' ? 'shield-exclamation' : 'shield'; + const label = sc ? verdict : 'no scorecard'; + return `${esc(label)}`; + } + + // ---- scorecard: KPI strip + gate tiers (go below the header) ------- + function renderScorecardBody(sc) { + if (!sc) return ''; + // KPI chips from the mapping stage (the most demo-relevant metrics). + const mapMetrics = (sc.stages && sc.stages.mapping && sc.stages.mapping.metrics) || {}; + // ratio metrics render as %, count metrics as integers. + const kpiKeys = [ + { k: 'entity_completeness', ratio: true }, + { k: 'relationship_completeness', ratio: true }, + { k: 'id_integrity', ratio: true }, + { k: 'sql_exec_failures', ratio: false }, + ]; + let kpis = kpiKeys + .filter((spec) => spec.k in mapMetrics) + .map((spec) => ` +
+
${spec.ratio ? fmtRatio(mapMetrics[spec.k]) : fmtMetric(mapMetrics[spec.k])}
+
${esc(humanize(spec.k))}
+
`).join(''); + // Pipeline coverage-loss is the anti-circularity metric — show it if present. + const pipe = (sc.stages && sc.stages.pipeline) || {}; + const pipeMetrics = pipe.metrics || pipe; + if (pipeMetrics && 'coverage_loss' in pipeMetrics) { + kpis += ` +
+
${fmtRatio(pipeMetrics.coverage_loss)}
+
Coverage Loss
+
`; + } + + const gates = sc.gates || {}; + function gateChip(label, tier) { + if (!tier) return ''; + const pass = tier.passed; + const detail = (tier.failures || tier.regressions || tier.warnings || []); + const cls = pass ? 'ob-pge-gate-pass' : 'ob-pge-gate-fail'; + const icon = pass ? 'check-lg' : 'exclamation-triangle-fill'; + const title = detail.length ? esc(detail.map((d) => (typeof d === 'string' ? d : (d.check || d.metric || JSON.stringify(d)))).join(' · ')) : ''; + return ` + ${esc(label)}${detail.length ? ` (${detail.length})` : ''}`; + } + const gatesHtml = ` +
+ ${gateChip('Tier 1 · absolute', gates.tier1_absolute)} + ${gateChip('Tier 2 · ratio', gates.tier2_ratio)} + ${gateChip('Tier 3 · regression', gates.tier3_regression)} +
`; + + return ` + ${kpis ? `
${kpis}
` : ''} + ${gatesHtml}`; + } + + // ---- planner source-model panel ------------------------------------ + function renderSourceModel(sm) { + if (!sm) return '

No planner source-model captured.

'; + let html = ''; + + const roles = sm.table_roles || []; + if (roles.length) { + html += '
Table → class candidates
'; + html += '
'; + roles.forEach((r) => { + const cands = (r.ontology_class_candidates || []).map((c) => { + const conf = typeof c.confidence === 'number' ? c.confidence : 0; + const w = Math.max(6, Math.round(conf * 40)); + return `
+ + ${esc(shortUri(c.uri))} + ${(conf * 100).toFixed(0)}%
`; + }).join(''); + html += ``; + }); + html += '
${esc(r.table)}${cands || ''}
'; + } + + const cids = sm.canonical_ids || []; + if (cids.length) { + html += '
Canonical identifiers
    '; + cids.forEach((c) => { + const perTable = c.canonical_column_per_table || {}; + const cols = Object.entries(perTable) + .map(([t, col]) => `${esc(shortUri(t))}${esc(col)}`).join(', '); + html += `
  • ${esc(shortUri(c.ontology_class))}: ${cols || '—'}${c.format_note ? ` (${esc(c.format_note)})` : ''}
  • `; + }); + html += '
'; + } + + const joins = sm.join_keys || []; + if (joins.length) { + html += '
Join keys
'; + joins.forEach((j) => { + html += ` + + `; + }); + html += '
FromToKindOverlap
${esc(j.from_ref)}${esc(j.to_ref)}${esc(j.kind)}${fmtMetric(j.overlap_pct)}
'; + } + + const plan = sm.mapping_plan || {}; + const skips = plan.skip || []; + if (skips.length) { + html += '
Planner skipped
    '; + skips.forEach((s) => { + html += `
  • ${esc(shortUri(s.item))} — ${esc(s.reason || 'no reason given')}
  • `; + }); + html += '
'; + } + return html || '

Planner produced an empty source-model.

'; + } + + // ---- per-item loop trace ------------------------------------------- + function renderAttempt(a) { + function step(label, status, extraClass) { + let cls = 'ob-pge-step'; + if (status === 'PASS') cls += ' ob-pge-step-pass'; + else if (status === 'FAIL') cls += ' ob-pge-step-fail'; + else if (status === 'skipped' || status === 'skip') cls += ' ob-pge-step-skip'; + if (extraClass) cls += ' ' + extraClass; + const label2 = status && status !== 'skipped' ? `${label}: ${status}` : label; + return `${esc(label2)}`; + } + const gen = `Generator`; + const stage1 = step('Evaluator', a.stage1_status); + const showCritic = a.critic_status && a.critic_status !== 'skipped'; + const critic = showCritic ? `${step('Critic', a.critic_status)}` : ''; + const bubble = a.bubble ? `re-plan` : ''; + const err = a.error ? `${esc(a.error)}` : ''; + const hint = a.hint ? `${esc(a.hint)}` : ''; + return ` +
+ #${esc(a.attempt)} +
+
+ ${gen}${stage1}${critic}${bubble} +
+ ${err}${hint} +
+
`; + } + + function renderItem(entry, evals, idx) { + const evalReport = evals[entry.item]; + const metrics = evalReport && evalReport.metrics ? evalReport.metrics : null; + let metricsInline = ''; + if (metrics) { + // Long free-text fields (e.g. the critic's reasoning) render as a + // wrapped block; short scalar metrics render inline. + const isLongText = (v) => typeof v === 'string' && v.length > 60; + const longKeys = Object.keys(metrics).filter((k) => isLongText(metrics[k])); + const scalarKeys = Object.keys(metrics) + .filter((k) => !Array.isArray(metrics[k]) && typeof metrics[k] !== 'object' && !isLongText(metrics[k])) + .slice(0, 6); + if (scalarKeys.length) { + metricsInline += `
` + + scalarKeys.map((k) => { + const val = /pct$|_pct|overlap/i.test(k) ? fmtRatio(metrics[k]) : fmtMetric(metrics[k]); + return `${esc(humanize(k))}: ${val}`; + }).join('') + + `
`; + } + metricsInline += longKeys.map((k) => + `
${esc(humanize(k))}: ${esc(metrics[k])}
` + ).join(''); + } + const attempts = entry.attempts || []; + const attemptsHtml = attempts.length + ? `
${attempts.map(renderAttempt).join('')}
` + : '
No generator attempts (pre-seeded or skipped).
'; + const kindIcon = entry.kind === 'relationship' ? 'arrow-left-right' : 'box'; + const collId = `obPgeItem${idx}`; + return ` +
+ +
+ ${attemptsHtml} + ${metricsInline} +
+
`; + } + + function renderTrace(runLog, evals) { + if (!runLog || !runLog.length) return '

No per-item run log captured.

'; + return runLog.map((e, i) => renderItem(e, evals || {}, i)).join(''); + } + + // ---- main entrypoint ----------------------------------------------- + function render(taskResult, containerId) { + const container = document.getElementById(containerId); + if (!container) return; + + const tr = taskResult || {}; + const sc = tr.pge_scorecard || null; + const sm = tr.source_model || null; + const runLog = tr.mapping_run_log || null; + const evals = tr.mapping_evaluations || {}; + + // Nothing PGE-specific → keep the container empty/hidden. + if (!sc && !sm && !(runLog && runLog.length)) { + container.innerHTML = ''; + container.style.display = 'none'; + return; + } + container.style.display = 'block'; + + const headerStages = ` +
+ Planner + + Generator + + Evaluator + + Critic +
`; + + const itemCount = runLog ? runLog.length : 0; + const accordion = ` +
+
+

+ +

+
+
${renderTrace(runLog, evals)}
+
+
+
+

+ +

+
+
${renderSourceModel(sm)}
+
+
+
`; + + container.innerHTML = ` +
+
+ ${headerStages} + ${renderVerdictPill(sc)} +
+ ${renderScorecardBody(sc)} + ${accordion} +
`; + } + + return { render: render }; +})(); + +// Expose globally (non-module script include). +window.PgeVisualizer = PgeVisualizer; diff --git a/src/front/static/query/js/query-chat.js b/src/front/static/query/js/query-chat.js index 6fa7fec9..7382dbb8 100644 --- a/src/front/static/query/js/query-chat.js +++ b/src/front/static/query/js/query-chat.js @@ -364,8 +364,10 @@ bodyEl.removeChild(stepsEl); } - // Render the final markdown reply - const reply = event.reply || '(no reply)'; + // Render the final markdown reply. Empty replies are normally routed + // to errorStreamingBubble by the caller; this is a defensive fallback. + const reply = (event.reply || '').trim() + || '_No response was generated. Please try again._'; bodyEl.innerHTML = renderMarkdown(reply); enhanceEntityLinks(bodyEl); @@ -479,12 +481,23 @@ const doneEvent = await _consumeStream(bubble, response); - if (doneEvent) { + if (doneEvent && (doneEvent.reply || '').trim()) { finalizeStreamingBubble(bubble, doneEvent); conversationHistory.push({ role: 'assistant', - content: doneEvent.reply || '', + content: doneEvent.reply, }); + } else if (doneEvent) { + // The turn completed but produced no text — e.g. a transient + // model error (success:false) or an empty generation. Surface + // an actionable message instead of a cryptic "(no reply)", and + // do NOT persist the empty turn to history. + errorStreamingBubble( + bubble, + doneEvent.success === false + ? "The assistant didn't return a response — the model may have hit a transient error. Please try again." + : 'No response was generated. Try rephrasing your question, or ask again.' 
+ ); } else { errorStreamingBubble(bubble, 'Stream ended without a final response.'); } diff --git a/src/front/templates/mapping.html b/src/front/templates/mapping.html index 60c063fe..5d893a5e 100644 --- a/src/front/templates/mapping.html +++ b/src/front/templates/mapping.html @@ -10,6 +10,7 @@ + {% endblock %} {% block content %} @@ -30,6 +31,7 @@ + diff --git a/src/front/templates/partials/mapping/_mapping_autoassign.html b/src/front/templates/partials/mapping/_mapping_autoassign.html index 5b839fb5..ca047d81 100644 --- a/src/front/templates/partials/mapping/_mapping_autoassign.html +++ b/src/front/templates/partials/mapping/_mapping_autoassign.html @@ -125,6 +125,10 @@

0

+ + +