diff --git a/changelogs/v0.5.2/FiifiB_2026-06-25.log b/changelogs/v0.5.2/FiifiB_2026-06-25.log new file mode 100644 index 00000000..8a23f7f6 --- /dev/null +++ b/changelogs/v0.5.2/FiifiB_2026-06-25.log @@ -0,0 +1,58 @@ +# 2026-06-25 — feat(ontology): PGE Evaluator stage for owl-generator + +## Context + +The owl-generator agent had a single-shot generation + a pitfall-tool fix loop, +but no deterministic Evaluator stage — so structural defects (orphan classes, +dangling domain/range, naming violations, duplicate classes) could survive into +the delivered ontology. This change turns owl-generation into a real +Planner→Generator→Evaluator (PGE) loop: after the pitfall loop settles, a +deterministic Stage-1 evaluator scores the ontology against the source metadata +and feeds concrete retry-hints back to the generator, bounded by a hard cap. + +The Evaluator reuses a small, usecase-agnostic ontology-metrics module +(`agents.pge_eval.ontology_metrics`) — gold-free, computed purely from the +generated ontology + source schema. Only the ontology slice of the metrics +package is introduced here; the full scorecard/CLI lands separately. + +## Changes + +1. `src/agents/agent_owl_generator/engine.py` + - Add `MAX_OWL_EVAL_ROUNDS` (bounded Evaluator retry cap) and + `_evaluate_ontology_stage()` — parses the Turtle, runs the deterministic + Tier-1 ontology checks, and returns a retry-hint string on hard defects + (orphan / dangling domain-range / naming / duplicate). Fails open: any + parse/dep error returns `None` so a check failure never blocks delivery. + - Wire the Evaluator into the agent loop after the pitfall loop; only retry + when an iteration remains, so a usable ontology is never discarded by + exhausting `MAX_ITERATIONS`. + - Raise `max_tokens` to `MAX_OUTPUT_TOKENS = 16000` so exhaustive attribute + coverage isn't silently truncated past the old 4096 ceiling. + - Strengthen the system prompt: `# ATTRIBUTE COVERAGE` section + a + `get_table_detail`-per-table workflow step driving exhaustive (not curated) + datatype-property coverage. +2. `src/agents/pge_eval/__init__.py` — new package (minimal root; importers + depend on the concrete submodule to avoid coupling to later modules). +3. `src/agents/pge_eval/normalize.py` — shared name/metadata/ontology + normalization primitives (stdlib-only). +4. `src/agents/pge_eval/ontology_metrics.py` — `evaluate_ontology()`: + deterministic Stage-1 checks + footprint coverage, no stored reference. +5. Tests: `tests/units/pge_eval/{__init__,_fixtures}.py`, + `test_ontology_metrics.py`, `test_owl_evaluator_stage.py`. + +## Modified / added files + +- M src/agents/agent_owl_generator/engine.py +- A src/agents/pge_eval/__init__.py +- A src/agents/pge_eval/normalize.py +- A src/agents/pge_eval/ontology_metrics.py +- A tests/units/pge_eval/__init__.py +- A tests/units/pge_eval/_fixtures.py +- A tests/units/pge_eval/test_ontology_metrics.py +- A tests/units/pge_eval/test_owl_evaluator_stage.py + +## Tests + +`uv run pytest tests/units/pge_eval/test_ontology_metrics.py +tests/units/pge_eval/test_owl_evaluator_stage.py +tests/units/ontology/test_owl_generator.py -q` → **39 passed**. diff --git a/src/agents/agent_owl_generator/engine.py b/src/agents/agent_owl_generator/engine.py index 9d755d48..c16f4ad1 100644 --- a/src/agents/agent_owl_generator/engine.py +++ b/src/agents/agent_owl_generator/engine.py @@ -37,6 +37,18 @@ MAX_ITERATIONS = 10 LLM_TIMEOUT = 180 +# Exhaustive per-class datatype-property coverage (see # ATTRIBUTE COVERAGE in +# the system prompt) makes the Turtle output large — a large domain ontology +# with dozens of classes and 50+ datatype properties runs well past the old 4096 +# ceiling, which silently truncated the final statement and broke parsing. +# Claude Opus supports large completions; 16k tokens fits an exhaustive +# domain ontology with headroom. +MAX_OUTPUT_TOKENS = 16000 + +# Bounded PGE retry cap for the Evaluator stage (§3.5): how many times the +# deterministic Stage-1 ontology checks may feed retry_hints back into +# generation before owl delivery proceeds regardless. +MAX_OWL_EVAL_ROUNDS = 2 _TRACE_NAME = "owl_generator" @@ -95,9 +107,12 @@ def _load_pitfall_rules() -> str: # WORKFLOW 1. Call get_metadata to understand the database schema. -2. Call list_documents to discover available documents. -3. Read relevant documents with read_document. -4. Output ONLY the final Turtle ontology as plain text (starting with @prefix). +2. Call get_table_detail on EVERY table you intend to map a class to — get_metadata + truncates wide tables at 80 columns, and you must see the FULL column list to give + each class exhaustive attribute coverage (see # ATTRIBUTE COVERAGE). +3. Call list_documents to discover available documents. +4. Read relevant documents with read_document. +5. Output ONLY the final Turtle ontology as plain text (starting with @prefix). # NAMING RULES (CRITICAL – NO EXCEPTIONS) • Classes: PascalCase (Customer, SalesOrder) @@ -127,6 +142,34 @@ def _load_pitfall_rules() -> str: • For EVERY DatatypeProperty you MUST declare rdfs:domain on the property itself (do not rely on owl:Restriction alone — the platform reads attributes from rdfs:domain) +# ATTRIBUTE COVERAGE (CRITICAL — exhaustive, NOT curated) +The downstream mapping pipeline can only bind a SQL column to a class when that +column has a matching owl:DatatypeProperty with rdfs:domain on the class. A class +with few datatype properties produces an ID+Label-only entity that is USELESS for +analytics. So model attributes EXHAUSTIVELY, not minimally: +• For EVERY class, emit a DatatypeProperty for EVERY meaningful source column that + describes an instance of that class — across ALL tables that realise the class. + A single class is often realised by several source tables (e.g. one per source + system, region, or tenant) that each hold the same real-world entity in a local + schema; UNION their columns mentally and cover the full set. Use get_table_detail + on each covering table to see every column. +• "Meaningful" = a genuine attribute of the entity: dates, measurements, codes, + scores, names, statuses, flags, free-text notes. EXCLUDE ONLY: surrogate/auto- + increment row keys with no analytical value, audit columns (created_at, updated_by, + etl_*, _ingest_*), and the foreign-key columns that ObjectProperty relationships + already carry. +• When two sources expose the SAME attribute under different column names + (e.g. total_amount vs TOTAL_AMT; status vs STATUS_CODE), emit ONE datatype + property — do NOT emit a per-source duplicate. The mapping layer reconciles the + source columns. +• Name datatype properties in lowerCamelCase derived from business meaning + (order_date → orderDate, TOTAL_AMT → totalAmount). + Use ONLY [a-z][A-Za-z0-9]* — never underscores, hyphens, or backslash escapes. +• The "at least 2 datatype properties" floor in the guidelines is a MINIMUM, not a + target. Rich, real-world entities (a transaction, an encounter, an event, a core + business object) typically warrant 6–11 datatype properties. Aim for full column + coverage, not a tidy subset. + # RELATIONSHIP RULES • NEVER create bidirectional relationships. • Between any two classes A and B create at most ONE ObjectProperty. @@ -160,10 +203,12 @@ def _load_pitfall_rules() -> str: ## 2. Class and property design rules For each **class** you create:[1][2][3][4] -1. Provide: - - A short, clear natural-language definition (1–2 sentences). - - At least 1 object property (unless the class is explicitly abstract). - - At least 2 datatype properties, when meaningful in the domain. +1. Provide: + - A short, clear natural-language definition (1–2 sentences). + - At least 1 object property (unless the class is explicitly abstract). + - Datatype properties covering EVERY meaningful source column for the class + (see "# ATTRIBUTE COVERAGE" in the system prompt — exhaustive, not curated; + 2 is a floor, full column coverage is the goal). 2. Naming conventions: - Classes: UpperCamelCase (e.g., `CustomerOrder`). - Object properties: lowerCamelCase verbs or verb-like phrases (e.g., `placesOrder`). @@ -241,6 +286,80 @@ def _parse_pitfall_tool_result(tool_result_json: str) -> Optional[Dict]: return None +# Stage-1 absolute (Tier-1) ontology defects that the Evaluator forces a +# retry on. Coverage ratios are computed and logged but are advisory at the +# generation stage (they are Tier-2 in the scorecard), so they do not by +# themselves trigger a regeneration — only hard structural defects do. +_EVAL_ABSOLUTE_CHECKS = ( + "orphan_class_count", + "dangling_domain_range_count", + "naming_violation_count", + "duplicate_class_count", +) + + +def _evaluate_ontology_stage( + turtle_text: str, metadata: dict, iteration: int +) -> Optional[str]: + """Run the Stage-1 deterministic ontology checks (§3.2) on *turtle_text*. + + Parses the Turtle into the registry shape, runs the shared intrinsic + checks, and returns a concrete ``retry_hint`` feedback string when any + Tier-1 absolute defect (orphan class, dangling domain/range, naming + violation, duplicate class) is present — turning owl-gen into a real + PGE loop. Returns ``None`` when the ontology is structurally clean. + + Fails open: any parse/dep error returns ``None`` so a check failure + never blocks OWL delivery (mirrors the pitfall-tool check). + """ + try: + from back.core.w3c.owl.OntologyParser import OntologyParser + from back.objects.ontology.Ontology import Ontology + from agents.pge_eval.ontology_metrics import evaluate_ontology + + # The model sometimes prepends a prose sentence or wraps the Turtle in + # a markdown fence; strip that the same way the downstream registry + # does, so the Evaluator parses real output instead of skipping. + turtle_text = Ontology.clean_owl_output(turtle_text) + parser = OntologyParser(turtle_text) + ontology = { + "classes": parser.get_classes(), + "properties": parser.get_properties(), + } + metrics, issues, _footprint = evaluate_ontology(ontology, metadata or {}) + logger.info( + "Iteration %d: ontology evaluator — metrics=%s", + iteration, + metrics, + ) + + absolute_issues = [ + i for i in issues if i.get("check") in _EVAL_ABSOLUTE_CHECKS + ] + if not absolute_issues: + logger.info( + "Iteration %d: ontology evaluator — no Tier-1 defects", iteration + ) + return None + + lines = [ + "The ontology you produced has structural defects. Fix ALL of them " + "and output ONLY the corrected Turtle (no markdown, no comments, " + "starting with @prefix declarations):\n" + ] + # Cap feedback to keep the prompt bounded. + for issue in absolute_issues[:12]: + lines.append(f" • {issue['hint']}") + return "\n".join(lines) + except Exception as exc: # noqa: BLE001 + logger.warning( + "Iteration %d: ontology evaluator skipped due to error: %s", + iteration, + exc, + ) + return None + + def _build_user_prompt( guidelines: str, options: dict, @@ -443,6 +562,7 @@ def notify(msg: str): # ------------------------------------------------------------------ tools_supported = True _owl_fix_rounds = 0 # pitfall-fix rounds consumed so far + _owl_eval_rounds = 0 # Evaluator (Stage-1 PGE) retry rounds consumed for iteration in range(MAX_ITERATIONS): logger.info( @@ -477,7 +597,7 @@ def notify(msg: str): endpoint_name, messages, tools=send_tools, - max_tokens=4096, + max_tokens=MAX_OUTPUT_TOKENS, temperature=0.1, timeout=LLM_TIMEOUT, trace_name=_TRACE_NAME, @@ -509,7 +629,7 @@ def notify(msg: str): endpoint_name, messages, tools=None, - max_tokens=4096, + max_tokens=MAX_OUTPUT_TOKENS, temperature=0.1, timeout=LLM_TIMEOUT, trace_name=_TRACE_NAME, @@ -749,6 +869,41 @@ def notify(msg: str): _owl_fix_rounds, max_fix_rounds, ) + # -------------------------------------------------------------- + # Evaluator stage (PGE loop) — after the pitfall-tool loop is + # clean/maxed, run the Stage-1 deterministic ontology checks (§3.2). + # On a Tier-1 structural defect, feed concrete retry_hints back to + # the generator, bounded by MAX_OWL_EVAL_ROUNDS. Only retry when + # there's another iteration left, so a usable ontology is never + # discarded by exhausting MAX_ITERATIONS. + # -------------------------------------------------------------- + eval_feedback = _evaluate_ontology_stage(content, ctx.metadata, iteration + 1) + if ( + eval_feedback + and _owl_eval_rounds < MAX_OWL_EVAL_ROUNDS + and iteration < MAX_ITERATIONS - 1 + ): + _owl_eval_rounds += 1 + notify( + f"Ontology defects found — eval round " + f"{_owl_eval_rounds}/{MAX_OWL_EVAL_ROUNDS}…" + ) + result.steps.append( + AgentStep( + step_type="evaluator", + content=eval_feedback[:200], + duration_ms=0, + ) + ) + messages.append({"role": "assistant", "content": content}) + messages.append({"role": "user", "content": eval_feedback}) + logger.info( + "Iteration %d: ontology evaluator found defects — eval round %d", + iteration + 1, + _owl_eval_rounds, + ) + continue # next iteration will produce corrected OWL + # ── Accept this text as the final OWL ──────────────────────────── result.success = True result.owl_content = content diff --git a/src/agents/pge_eval/__init__.py b/src/agents/pge_eval/__init__.py new file mode 100644 index 00000000..5a2089d9 --- /dev/null +++ b/src/agents/pge_eval/__init__.py @@ -0,0 +1,16 @@ +"""OntoBricks PGE intrinsic-evaluation primitives. + +This package holds usecase-agnostic, gold-free structural metrics for the PGE +pipeline. This PR introduces only the **ontology** slice consumed by the +owl-generator Evaluator stage: + +* :func:`agents.pge_eval.ontology_metrics.evaluate_ontology` — Stage-1 + deterministic ontology checks (orphan classes, dangling domain/range, + naming, duplicates, footprint coverage), computed purely from the generated + ontology + source metadata (no stored reference answer). + +The full scorecard (mapping metrics, gate tiers, baseline regression, LLM +judge, CLI) lands in a separate change. Importers should depend on the +concrete submodule (``agents.pge_eval.ontology_metrics``) rather than this +package root to avoid coupling to modules introduced later. +""" diff --git a/src/agents/pge_eval/normalize.py b/src/agents/pge_eval/normalize.py new file mode 100644 index 00000000..3e20baab --- /dev/null +++ b/src/agents/pge_eval/normalize.py @@ -0,0 +1,262 @@ +"""Shape normalisation + footprint helpers for the PGE intrinsic evaluator. + +Everything in this module is pure Python — no LLM, no DB, no domain +knowledge. It exists so the rest of the scorer can reason over one stable +in-memory shape regardless of whether the caller handed it the *agent* +ontology shape (``{entities, relationships}``), the *registry* ontology +shape (``{classes, properties}``), or raw source metadata. + +Design constraints (see docs/plans/2026-06-10-goal-loop-and-pge-eval-design.md): + +* **Usecase-agnostic.** No table name, identifier, or count from any + particular domain is encoded here. The only constants are generic + audit/surrogate column heuristics that hold for any relational source. +* **Deterministic.** Pure functions of their inputs; no randomness, no + wall-clock, no network. +""" + +from __future__ import annotations + +import re +from typing import Any, Dict, List, Optional, Set + + +# ===================================================== +# Name normalisation +# ===================================================== + + +def normalize_name(name: Optional[str]) -> str: + """Collapse a column / property / class name to a comparison key. + + Lower-cases and strips every non-alphanumeric character so that + ``first_name``, ``firstName`` and ``FirstName`` all collapse to + ``firstname``. This is the footprint-matching key used to decide + whether a source column "became" a data property without consulting + the mapping (Stage-1 is mapping-independent — see D2/D3). + """ + if not name: + return "" + return re.sub(r"[^a-z0-9]", "", str(name).lower()) + + +def local_name(uri_or_name: Optional[str]) -> str: + """Return the local name of a URI/CURIE, or the value unchanged. + + ``http://x/Customer`` -> ``Customer``; ``ex:Customer`` -> ``Customer``; + ``Customer`` -> ``Customer``. + """ + if not uri_or_name: + return "" + s = str(uri_or_name) + for sep in ("#", "/"): + if sep in s: + s = s.rsplit(sep, 1)[-1] + if ":" in s and not s.startswith("http"): + s = s.rsplit(":", 1)[-1] + return s + + +# ===================================================== +# Audit / surrogate column heuristics (generic, not domain-specific) +# ===================================================== + +# Audit tokens that mark a column as non-analytical bookkeeping. These are +# generic ETL/CDC conventions, not tied to any domain. +_AUDIT_TOKENS = ( + "createdat", + "updatedat", + "createdon", + "updatedon", + "createdby", + "updatedby", + "modifiedat", + "modifiedby", + "deletedat", + "ingestedat", + "loadedat", + "loadts", + "etltimestamp", + "dwcreated", + "dwupdated", +) +_AUDIT_PREFIXES = ("etl", "ingest", "_ingest", "dw") +# Exact surrogate row-key names + suffixes for warehouse surrogate keys. +_SURROGATE_EXACT = ("id", "rowid", "rownum", "rownumber") +_SURROGATE_SUFFIXES = ("sk", "surrogatekey") + + +def is_surrogate_or_audit(column_name: str) -> bool: + """Heuristic: True when *column_name* is a surrogate row key or audit + column with no analytical value. + + The OWL generator is instructed to drop exactly these, so they are + excluded from coverage denominators (D3). Intentionally conservative: + it does NOT drop every ``*_id`` column (foreign keys can be meaningful), + only obvious surrogate keys and audit bookkeeping. + """ + norm = normalize_name(column_name) + if not norm: + return True + if norm in _SURROGATE_EXACT: + return True + if any(norm.endswith(sfx) for sfx in _SURROGATE_SUFFIXES): + return True + if any(tok in norm for tok in _AUDIT_TOKENS): + return True + raw = re.sub(r"[^a-z0-9_]", "", str(column_name).lower()) + if any(raw.startswith(p) for p in _AUDIT_PREFIXES): + return True + return False + + +# ===================================================== +# Ontology normalisation +# ===================================================== + + +def _attr_names(raw_attrs: Any) -> List[str]: + """Normalise an attribute container to a flat list of name strings. + + Accepts the agent shape (list of str or ``{name|uri|label}`` dicts) and + the registry shape (list of ``{name|localName}`` dicts). + """ + out: List[str] = [] + for a in raw_attrs or []: + if isinstance(a, str): + out.append(a) + elif isinstance(a, dict): + name = a.get("name") or a.get("localName") or a.get("uri") or a.get("label") + if name: + out.append(local_name(name)) + return out + + +class NormalizedOntology: + """A flat, shape-agnostic view of a generated ontology. + + Attributes: + classes: list of ``{"name", "uri", "data_properties": [str]}``. + object_properties: list of ``{"name", "uri", "domain", "range"}`` + where domain/range are the raw refs as authored (URI or local). + """ + + def __init__(self, classes: List[dict], object_properties: List[dict]): + self.classes = classes + self.object_properties = object_properties + + # --- derived sets, computed lazily but cheaply ------------------ + + @property + def class_resolution_set(self) -> Set[str]: + """Every token a domain/range ref could legitimately resolve to.""" + out: Set[str] = set() + for c in self.classes: + if c.get("uri"): + out.add(c["uri"]) + out.add(local_name(c["uri"])) + if c.get("name"): + out.add(c["name"]) + out.add(local_name(c["name"])) + return out + + @property + def all_data_property_keys(self) -> Set[str]: + """Normalised keys of every data property across every class.""" + keys: Set[str] = set() + for c in self.classes: + for dp in c.get("data_properties", []): + k = normalize_name(local_name(dp)) + if k: + keys.add(k) + return keys + + @property + def class_name_keys(self) -> Set[str]: + keys: Set[str] = set() + for c in self.classes: + k = normalize_name(local_name(c.get("name") or c.get("uri"))) + if k: + keys.add(k) + return keys + + +def normalize_ontology(ontology: dict) -> NormalizedOntology: + """Normalise either the agent shape or the registry shape. + + * Agent shape: ``{"entities": [...], "relationships": [...]}`` + * Registry shape: ``{"classes": [...], "properties": [...]}`` + """ + ontology = ontology or {} + classes: List[dict] = [] + object_props: List[dict] = [] + + if "entities" in ontology or "relationships" in ontology: + for e in ontology.get("entities", []) or []: + classes.append( + { + "name": e.get("name") or local_name(e.get("uri")), + "uri": e.get("uri", ""), + "data_properties": _attr_names(e.get("attributes")), + } + ) + for r in ontology.get("relationships", []) or []: + object_props.append( + { + "name": r.get("name") or local_name(r.get("uri")), + "uri": r.get("uri", ""), + "domain": r.get("domain", ""), + "range": r.get("range", ""), + } + ) + else: + for c in ontology.get("classes", []) or []: + classes.append( + { + "name": c.get("name") or local_name(c.get("uri")), + "uri": c.get("uri", ""), + "data_properties": _attr_names(c.get("dataProperties")), + } + ) + for p in ontology.get("properties", []) or []: + if p.get("type") and p.get("type") != "ObjectProperty": + continue + object_props.append( + { + "name": p.get("name") or local_name(p.get("uri")), + "uri": p.get("uri", ""), + "domain": p.get("domain", ""), + "range": p.get("range", ""), + } + ) + + return NormalizedOntology(classes=classes, object_properties=object_props) + + +# ===================================================== +# Source-metadata normalisation +# ===================================================== + + +def normalize_metadata(metadata: dict) -> List[dict]: + """Return ``[{"name", "columns": [str]}]`` from domain metadata. + + Accepts the ``{"tables": [{"name"|"full_name", "columns": [...]}]}`` + shape produced by the metadata tools. Column entries may be plain + strings or ``{"name": ...}`` dicts. + """ + out: List[dict] = [] + for t in (metadata or {}).get("tables", []) or []: + cols: List[str] = [] + for c in t.get("columns", []) or []: + if isinstance(c, str): + cols.append(c) + elif isinstance(c, dict) and c.get("name"): + cols.append(c["name"]) + out.append( + { + "name": t.get("full_name") or t.get("name") or "", + "columns": cols, + } + ) + return out diff --git a/src/agents/pge_eval/ontology_metrics.py b/src/agents/pge_eval/ontology_metrics.py new file mode 100644 index 00000000..21ba2c16 --- /dev/null +++ b/src/agents/pge_eval/ontology_metrics.py @@ -0,0 +1,270 @@ +"""Stage-1 — ontology-generation quality (deterministic, no LLM). + +Computed purely from the generated ontology + source metadata. No mapping +dependency (D2) and no LLM for the deterministic part (§3.2). The same +checks back the new owl-generator Evaluator stage (§3.5): each issue carries +a concrete ``hint`` that becomes a generator retry_hint. + +All metrics are usecase-agnostic: nothing about any particular domain is +hard-coded here. +""" + +from __future__ import annotations + +import re +from typing import Any, Dict, List, Set, Tuple + +from agents.pge_eval.normalize import ( + NormalizedOntology, + is_surrogate_or_audit, + local_name, + normalize_metadata, + normalize_name, + normalize_ontology, +) + +# Naming conventions (mirror the OWL generator's NAMING RULES, domain-free). +_CLASS_RE = re.compile(r"^[A-Z][A-Za-z0-9]*$") +_PROPERTY_RE = re.compile(r"^[a-z][A-Za-z0-9]*$") + + +def _issue(check: str, expected: str, observed: str, hint: str) -> Dict[str, str]: + return {"check": check, "expected": expected, "observed": observed, "hint": hint} + + +# ===================================================== +# Footprint computation (shared with pipeline.coverage_loss) +# ===================================================== + + +def _column_key(table_name: str, column_name: str) -> str: + return f"{normalize_name(table_name)}::{normalize_name(column_name)}" + + +def compute_footprint( + ontology: NormalizedOntology, tables: List[dict] +) -> Dict[str, Any]: + """Return the ontology footprint over the source metadata. + + A *column* is covered when its normalised name matches some data + property's normalised name. A *table* is covered when its name matches + a class name OR ≥1 of its non-surrogate columns is covered (D3). + + Surrogate/audit columns are excluded from the denominators. + """ + dp_keys = ontology.all_data_property_keys + class_keys = ontology.class_name_keys + + total_columns = 0 + covered_columns: Set[str] = set() + total_tables = len(tables) + covered_tables: Set[str] = set() + + for t in tables: + tname = t["name"] + tkey = normalize_name(local_name(tname)) + table_is_covered = tkey in class_keys + for col in t["columns"]: + if is_surrogate_or_audit(col): + continue + total_columns += 1 + ckey = _column_key(tname, col) + if normalize_name(col) in dp_keys: + covered_columns.add(ckey) + table_is_covered = True + if table_is_covered: + covered_tables.add(tname) + + return { + "total_tables": total_tables, + "covered_tables": covered_tables, + "total_columns": total_columns, + "covered_columns": covered_columns, + } + + +# ===================================================== +# Stage-1 metrics + issues +# ===================================================== + + +def evaluate_ontology( + ontology: dict, + metadata: dict, +) -> Tuple[Dict[str, Any], List[Dict[str, str]], Dict[str, Any]]: + """Run the deterministic Stage-1 checks. + + Returns ``(metrics, issues, footprint)``: + + * ``metrics`` — the §3.2 metric block (ratios + absolute counts). + * ``issues`` — actionable failures (``check/expected/observed/hint``) + for the owl-gen Evaluator's retry_hints. + * ``footprint`` — covered tables/columns sets reused by + ``pipeline.coverage_loss``. + """ + norm = normalize_ontology(ontology) + tables = normalize_metadata(metadata) + footprint = compute_footprint(norm, tables) + + issues: List[Dict[str, str]] = [] + + # ---- coverage ratios (Tier-2 warn) ----------------------------- + table_cov = ( + len(footprint["covered_tables"]) / footprint["total_tables"] + if footprint["total_tables"] + else 1.0 + ) + column_cov = ( + len(footprint["covered_columns"]) / footprint["total_columns"] + if footprint["total_columns"] + else 1.0 + ) + + uncovered_tables = [ + t["name"] + for t in tables + if t["name"] not in footprint["covered_tables"] + ] + for tname in uncovered_tables: + issues.append( + _issue( + "table_footprint_coverage", + "table maps to a class or contributes >=1 data property", + "no footprint", + f"source table '{tname}' has no class and contributes no data " + "property — model it as a class, attach its columns as data " + "properties on an existing class, or justify the omission.", + ) + ) + + # ---- orphan classes (Tier-1 absolute = 0) ---------------------- + related: Set[str] = set() + for op in norm.object_properties: + for ref in (op.get("domain"), op.get("range")): + if ref: + related.add(local_name(ref)) + related.add(str(ref)) + orphan_classes: List[str] = [] + for c in norm.classes: + has_props = bool(c.get("data_properties")) + name = c.get("name") or local_name(c.get("uri")) + in_rel = name in related or local_name(c.get("uri")) in related + if not has_props and not in_rel: + orphan_classes.append(name) + issues.append( + _issue( + "orphan_class_count", + "0 orphan classes", + name, + f"class '{name}' is an orphan (no data properties and no " + "object-property domain/range) — attach properties, relate " + "it to another class, or remove it.", + ) + ) + + # ---- dangling domain/range (Tier-1 absolute = 0) --------------- + resolvable = norm.class_resolution_set + dangling_dr: List[str] = [] + for op in norm.object_properties: + opname = op.get("name") or local_name(op.get("uri")) + for role in ("domain", "range"): + ref = op.get(role) + if not ref: + dangling_dr.append(f"{opname}.{role}") + issues.append( + _issue( + "dangling_domain_range_count", + f"ObjectProperty {role} resolves to a class", + f"{opname}.{role}=", + f"ObjectProperty '{opname}' has no {role} — declare an " + f"rdfs:{role} pointing at an existing class.", + ) + ) + continue + if ref not in resolvable and local_name(ref) not in resolvable: + dangling_dr.append(f"{opname}.{role}") + issues.append( + _issue( + "dangling_domain_range_count", + f"ObjectProperty {role} resolves to a class", + f"{opname}.{role}={local_name(ref)}", + f"ObjectProperty '{opname}' has {role} " + f"'{local_name(ref)}' which resolves to no class — fix " + "the reference or add the missing class.", + ) + ) + + # ---- naming violations (Tier-1 absolute = 0) ------------------- + naming_violations: List[str] = [] + for c in norm.classes: + nm = local_name(c.get("name") or c.get("uri")) + if nm and not _CLASS_RE.match(nm): + naming_violations.append(f"class:{nm}") + issues.append( + _issue( + "naming_violation_count", + "class name is PascalCase [A-Z][A-Za-z0-9]*", + nm, + f"class '{nm}' violates PascalCase — remove spaces / " + "underscores / hyphens and capitalise (e.g. sales_order -> " + "SalesOrder).", + ) + ) + for op in norm.object_properties: + nm = local_name(op.get("name") or op.get("uri")) + if nm and not _PROPERTY_RE.match(nm): + naming_violations.append(f"property:{nm}") + issues.append( + _issue( + "naming_violation_count", + "property name is lowerCamelCase [a-z][A-Za-z0-9]*", + nm, + f"property '{nm}' violates lowerCamelCase — use " + "[a-z][A-Za-z0-9]* with no underscores/hyphens/escapes.", + ) + ) + # data properties too + for c in norm.classes: + for dp in c.get("data_properties", []): + nm = local_name(dp) + if nm and not _PROPERTY_RE.match(nm): + naming_violations.append(f"dataproperty:{nm}") + issues.append( + _issue( + "naming_violation_count", + "data property name is lowerCamelCase", + nm, + f"data property '{nm}' violates lowerCamelCase — use " + "[a-z][A-Za-z0-9]* with no underscores/hyphens/escapes.", + ) + ) + + # ---- duplicate classes (Tier-1 absolute = 0) ------------------- + seen: Dict[str, int] = {} + for c in norm.classes: + key = normalize_name(local_name(c.get("name") or c.get("uri"))) + if not key: + continue + seen[key] = seen.get(key, 0) + 1 + duplicate_class_count = sum(n - 1 for n in seen.values() if n > 1) + for key, n in seen.items(): + if n > 1: + issues.append( + _issue( + "duplicate_class_count", + "0 duplicate class local names", + f"{key} x{n}", + f"{n} classes collapse to the local name '{key}' — merge " + "them or differentiate their names/definitions.", + ) + ) + + metrics: Dict[str, Any] = { + "table_footprint_coverage": round(table_cov, 6), + "column_footprint_coverage": round(column_cov, 6), + "orphan_class_count": len(orphan_classes), + "dangling_domain_range_count": len(dangling_dr), + "naming_violation_count": len(naming_violations), + "duplicate_class_count": duplicate_class_count, + } + return metrics, issues, footprint diff --git a/tests/units/pge_eval/__init__.py b/tests/units/pge_eval/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/units/pge_eval/_fixtures.py b/tests/units/pge_eval/_fixtures.py new file mode 100644 index 00000000..71ee60d6 --- /dev/null +++ b/tests/units/pge_eval/_fixtures.py @@ -0,0 +1,178 @@ +"""Synthetic, usecase-agnostic fixtures for the PGE evaluator unit tests. + +Deliberately uses a generic e-commerce-ish toy domain (Customer / Order / +Product) so the tests prove the scorer is domain-free — none of these names +appear in the scorer code. +""" + +from copy import deepcopy + + +def clean_ontology() -> dict: + """Agent-shape ontology that is fully structurally clean.""" + return { + "entities": [ + { + "uri": "ex:Customer", + "name": "Customer", + "attributes": ["firstName", "lastName", "email"], + }, + { + "uri": "ex:Order", + "name": "Order", + "attributes": ["orderDate", "totalAmount"], + }, + { + "uri": "ex:Product", + "name": "Product", + "attributes": ["sku", "unitPrice"], + }, + ], + "relationships": [ + { + "uri": "ex:placesOrder", + "name": "placesOrder", + "domain": "ex:Customer", + "range": "ex:Order", + }, + { + "uri": "ex:containsProduct", + "name": "containsProduct", + "domain": "ex:Order", + "range": "ex:Product", + }, + ], + } + + +def clean_metadata() -> dict: + return { + "tables": [ + { + "name": "customers", + "columns": [ + {"name": "id"}, + {"name": "first_name"}, + {"name": "last_name"}, + {"name": "email"}, + {"name": "created_at"}, + ], + }, + { + "name": "orders", + "columns": [ + {"name": "id"}, + {"name": "order_date"}, + {"name": "total_amount"}, + ], + }, + { + "name": "products", + "columns": [ + {"name": "id"}, + {"name": "sku"}, + {"name": "unit_price"}, + ], + }, + ] + } + + +def clean_artifact() -> dict: + onto = clean_ontology() + meta = clean_metadata() + return { + "success": True, + "iterations": 3, + "usage": {"prompt_tokens": 1000, "completion_tokens": 400}, + "stats": {"planner_reinvocations": 0}, + "mapping_run_log": [ + {"item": "ex:Customer", "kind": "entity", "attempts": [{}], "final_status": "PASS"}, + {"item": "ex:Order", "kind": "entity", "attempts": [{}], "final_status": "PASS"}, + {"item": "ex:Product", "kind": "entity", "attempts": [{}], "final_status": "PASS"}, + {"item": "ex:placesOrder", "kind": "relationship", "attempts": [{}], "final_status": "PASS"}, + {"item": "ex:containsProduct", "kind": "relationship", "attempts": [{}], "final_status": "PASS"}, + ], + "mapping_evaluations": { + "ex:Customer": {"metrics": {"row_count": 100, "distinct_id_count": 100, "null_id_count": 0}, "failures": []}, + "ex:Order": {"metrics": {"row_count": 500, "distinct_id_count": 500, "null_id_count": 0}, "failures": []}, + "ex:Product": {"metrics": {"row_count": 50, "distinct_id_count": 50, "null_id_count": 0}, "failures": []}, + "ex:placesOrder": {"metrics": {"total_edges": 500, "dangling_source_pct": 0.0, "dangling_target_pct": 0.0}, "failures": []}, + "ex:containsProduct": {"metrics": {"total_edges": 800, "dangling_source_pct": 0.0, "dangling_target_pct": 0.0}, "failures": []}, + }, + "entity_mappings": [ + {"ontology_class": "ex:Customer", "attribute_mappings": {"firstName": "first_name", "lastName": "last_name", "email": "email"}}, + {"ontology_class": "ex:Order", "attribute_mappings": {"orderDate": "order_date", "totalAmount": "total_amount"}}, + {"ontology_class": "ex:Product", "attribute_mappings": {"sku": "sku", "unitPrice": "unit_price"}}, + ], + "relationship_mappings": [], + "steps": [{"step_type": "planner", "tool_name": "", "duration_ms": 1200}], + "ontology": onto, + "metadata": meta, + "elapsed_s": 42.5, + } + + +def artifact_with_dangling_fk() -> dict: + """Clean except one relationship has a dangling target FK > 5%.""" + art = clean_artifact() + art["mapping_evaluations"]["ex:placesOrder"]["metrics"]["dangling_target_pct"] = 0.47 + return art + + +def artifact_with_sql_failure() -> dict: + """Clean except one entity's SQL failed to execute.""" + art = clean_artifact() + art["mapping_evaluations"]["ex:Order"] = { + "metrics": {"sql_error": "UNION type mismatch"}, + "failures": [ + { + "check": "sql_execution", + "expected": "SQL executes without error", + "observed": "execution error", + "hint": "fix the SQL", + } + ], + } + # The entity drops out of PASS in the run log too (in-scope but failed). + for entry in art["mapping_run_log"]: + if entry["item"] == "ex:Order": + entry["final_status"] = "FAIL" + return art + + +def ontology_with_orphan() -> dict: + """Add a class with no data properties and no relationships.""" + onto = clean_ontology() + onto["entities"].append({"uri": "ex:Ghost", "name": "Ghost", "attributes": []}) + return onto + + +def artifact_with_orphan_class() -> dict: + art = clean_artifact() + art["ontology"] = ontology_with_orphan() + return art + + +def ontology_with_dangling_range() -> dict: + onto = clean_ontology() + onto["relationships"].append( + {"uri": "ex:refersTo", "name": "refersTo", "domain": "ex:Order", "range": "ex:Nonexistent"} + ) + return onto + + +def ontology_with_naming_violation() -> dict: + onto = clean_ontology() + onto["entities"].append( + {"uri": "ex:bad_class", "name": "bad_class", "attributes": ["someAttr"]} + ) + return onto + + +def ontology_with_duplicate_class() -> dict: + onto = clean_ontology() + onto["entities"].append( + {"uri": "ex:Customer2", "name": "Customer", "attributes": ["nickname"]} + ) + return onto diff --git a/tests/units/pge_eval/test_ontology_metrics.py b/tests/units/pge_eval/test_ontology_metrics.py new file mode 100644 index 00000000..640a61ec --- /dev/null +++ b/tests/units/pge_eval/test_ontology_metrics.py @@ -0,0 +1,83 @@ +"""Stage-1 ontology metric tests (deterministic, no LLM).""" + +import pytest + +from agents.pge_eval.ontology_metrics import evaluate_ontology +from agents.pge_eval.normalize import is_surrogate_or_audit, normalize_name + +from tests.units.pge_eval import _fixtures as fx + + +def test_clean_ontology_all_absolute_zero(): + metrics, issues, _ = evaluate_ontology(fx.clean_ontology(), fx.clean_metadata()) + assert metrics["orphan_class_count"] == 0 + assert metrics["dangling_domain_range_count"] == 0 + assert metrics["naming_violation_count"] == 0 + assert metrics["duplicate_class_count"] == 0 + assert metrics["table_footprint_coverage"] == 1.0 + assert metrics["column_footprint_coverage"] >= 0.9 + + +def test_orphan_class_detected(): + metrics, issues, _ = evaluate_ontology(fx.ontology_with_orphan(), fx.clean_metadata()) + assert metrics["orphan_class_count"] == 1 + assert any(i["check"] == "orphan_class_count" for i in issues) + + +def test_dangling_range_detected(): + metrics, issues, _ = evaluate_ontology( + fx.ontology_with_dangling_range(), fx.clean_metadata() + ) + assert metrics["dangling_domain_range_count"] == 1 + assert any(i["check"] == "dangling_domain_range_count" for i in issues) + + +def test_naming_violation_detected(): + metrics, _, _ = evaluate_ontology( + fx.ontology_with_naming_violation(), fx.clean_metadata() + ) + assert metrics["naming_violation_count"] >= 1 + + +def test_duplicate_class_detected(): + metrics, _, _ = evaluate_ontology( + fx.ontology_with_duplicate_class(), fx.clean_metadata() + ) + assert metrics["duplicate_class_count"] == 1 + + +def test_table_coverage_drops_with_unmodelled_table(): + meta = fx.clean_metadata() + meta["tables"].append({"name": "shipments", "columns": [{"name": "carrier"}]}) + metrics, issues, _ = evaluate_ontology(fx.clean_ontology(), meta) + assert metrics["table_footprint_coverage"] < 1.0 + assert any( + i["check"] == "table_footprint_coverage" for i in issues + ) + + +def test_surrogate_and_audit_columns_excluded(): + assert is_surrogate_or_audit("id") + assert is_surrogate_or_audit("created_at") + assert is_surrogate_or_audit("customer_sk") + assert is_surrogate_or_audit("etl_load_ts") + assert not is_surrogate_or_audit("first_name") + assert not is_surrogate_or_audit("customer_id") # FK can be meaningful + + +def test_name_normalization(): + assert normalize_name("first_name") == normalize_name("firstName") == "firstname" + assert normalize_name("Order Date") == "orderdate" + + +def test_registry_shape_accepted(): + # Same ontology in registry (classes/properties) shape must score identically. + registry = { + "classes": [ + {"uri": "ex:A", "name": "A", "dataProperties": [{"name": "x"}]}, + ], + "properties": [], + } + metrics, _, _ = evaluate_ontology(registry, {"tables": []}) + # A has a data property -> not an orphan. + assert metrics["orphan_class_count"] == 0 diff --git a/tests/units/pge_eval/test_owl_evaluator_stage.py b/tests/units/pge_eval/test_owl_evaluator_stage.py new file mode 100644 index 00000000..ec23bf47 --- /dev/null +++ b/tests/units/pge_eval/test_owl_evaluator_stage.py @@ -0,0 +1,69 @@ +"""The owl-generator Evaluator stage (§3.5) — deterministic Stage-1 checks +feeding retry_hints, with a bounded retry cap.""" + +from agents.agent_owl_generator import engine as owl_engine + +_CLEAN_TTL = """@prefix owl: . +@prefix rdf: . +@prefix rdfs: . +@prefix xsd: . +@prefix : . + + a owl:Ontology . + +:Customer a owl:Class ; rdfs:label "Customer" . +:Order a owl:Class ; rdfs:label "Order" . + +:placesOrder a owl:ObjectProperty ; rdfs:domain :Customer ; rdfs:range :Order . +:firstName a owl:DatatypeProperty ; rdfs:domain :Customer ; rdfs:range xsd:string . +:orderDate a owl:DatatypeProperty ; rdfs:domain :Order ; rdfs:range xsd:string . +""" + +_ORPHAN_TTL = _CLEAN_TTL + """ +:Ghost a owl:Class ; rdfs:label "Ghost" . +""" + + +def test_clean_ontology_returns_no_retry_hint(): + assert owl_engine._evaluate_ontology_stage(_CLEAN_TTL, {}, 1) is None + + +def test_orphan_class_yields_retry_hint(): + hint = owl_engine._evaluate_ontology_stage(_ORPHAN_TTL, {}, 1) + assert hint is not None + assert "Ghost" in hint + assert "orphan" in hint.lower() + + +_PROSE_PREFIXED = ( + "No database tables are available. I have what I need from the guidelines.\n\n" + + _ORPHAN_TTL +) + +_FENCED = "```turtle\n" + _ORPHAN_TTL + "```" + + +def test_prose_preamble_is_stripped_before_parsing(): + # Regression: the model sometimes prepends a sentence before @prefix. The + # evaluator must clean it (like the downstream registry) and still run, + # not skip. Found via a live Chrome DevTools generation run. + hint = owl_engine._evaluate_ontology_stage(_PROSE_PREFIXED, {}, 1) + assert hint is not None + assert "Ghost" in hint + + +def test_markdown_fenced_turtle_is_parsed(): + hint = owl_engine._evaluate_ontology_stage(_FENCED, {}, 1) + assert hint is not None + assert "Ghost" in hint + + +def test_parse_error_fails_open(): + # Garbage in -> None (never blocks OWL delivery). + assert owl_engine._evaluate_ontology_stage("not turtle at all {{{", {}, 1) is None + + +def test_evaluator_loop_is_bounded(): + # The Evaluator retry cap exists and is finite (real PGE discipline). + assert owl_engine.MAX_OWL_EVAL_ROUNDS >= 1 + assert owl_engine.MAX_OWL_EVAL_ROUNDS < 10