databrickslabs · FiifiB · Jun 25, 2026
@@ -0,0 +1,58 @@
+# 2026-06-25 — feat(ontology): PGE Evaluator stage for owl-generator
+
+## Context
+
+The owl-generator agent had a single-shot generation + a pitfall-tool fix loop,
+but no deterministic Evaluator stage — so structural defects (orphan classes,
+dangling domain/range, naming violations, duplicate classes) could survive into
+the delivered ontology. This change turns owl-generation into a real
+Planner→Generator→Evaluator (PGE) loop: after the pitfall loop settles, a
+deterministic Stage-1 evaluator scores the ontology against the source metadata
+and feeds concrete retry-hints back to the generator, bounded by a hard cap.
+
+The Evaluator is built on a small, use-case-agnostic ontology-metrics module
+(`agents.pge_eval.ontology_metrics`) — gold-free (no stored reference answer),
+computed purely from the generated ontology + source schema. Only the ontology
+slice of the metrics package is introduced here; the full scorecard/CLI lands separately.
+
+## Changes
+
+1. `src/agents/agent_owl_generator/engine.py`
+   - Add `MAX_OWL_EVAL_ROUNDS` (bounded Evaluator retry cap) and
+     `_evaluate_ontology_stage()` — parses the Turtle, runs the deterministic
+     Tier-1 ontology checks, and returns a retry-hint string on hard defects
+     (orphan / dangling domain-range / naming / duplicate). Fails open: any
+     exception returns `None`, so a check failure never blocks delivery.
+   - Wire the Evaluator into the agent loop after the pitfall loop; only retry
+     when an iteration remains, so a usable ontology is never discarded by
+     exhausting `MAX_ITERATIONS`.
+   - Raise `max_tokens` to `MAX_OUTPUT_TOKENS = 16000` so exhaustive attribute
+     coverage isn't silently truncated past the old 4096 ceiling.
+   - Strengthen the system prompt: `# ATTRIBUTE COVERAGE` section + a
+     `get_table_detail`-per-table workflow step driving exhaustive (not curated)
+     datatype-property coverage.
+2. `src/agents/pge_eval/__init__.py` — new package (minimal root; importers
+   depend on the concrete submodule to avoid coupling to later modules).
+3. `src/agents/pge_eval/normalize.py` — shared name/metadata/ontology
+   normalization primitives (stdlib-only).
+4. `src/agents/pge_eval/ontology_metrics.py` — `evaluate_ontology()`:
+   deterministic Stage-1 checks + footprint coverage, no stored reference.
+5. Tests: `tests/units/pge_eval/{__init__,_fixtures}.py`,
+   `test_ontology_metrics.py`, `test_owl_evaluator_stage.py`.
+
+## Modified / added files
+
+- M src/agents/agent_owl_generator/engine.py
+- A src/agents/pge_eval/__init__.py
+- A src/agents/pge_eval/normalize.py
+- A src/agents/pge_eval/ontology_metrics.py
+- A tests/units/pge_eval/__init__.py
+- A tests/units/pge_eval/_fixtures.py
+- A tests/units/pge_eval/test_ontology_metrics.py
+- A tests/units/pge_eval/test_owl_evaluator_stage.py
+
+## Tests
+
+`uv run pytest tests/units/pge_eval/test_ontology_metrics.py
+tests/units/pge_eval/test_owl_evaluator_stage.py
+tests/units/ontology/test_owl_generator.py -q` → **39 passed**.
@@ -37,6 +37,18 @@
 
 MAX_ITERATIONS = 10
 LLM_TIMEOUT = 180
+# Exhaustive per-class datatype-property coverage (see # ATTRIBUTE COVERAGE in
+# the system prompt) makes the Turtle output large — a large domain ontology
+# with dozens of classes and 50+ datatype properties runs well past the old
+# 4096-token ceiling, which silently truncated the final statement and broke
+# parsing. Claude Opus supports large completions; 16k tokens fits an
+# exhaustive domain ontology with headroom.
+MAX_OUTPUT_TOKENS = 16000
+
+# Bounded PGE retry cap for the Evaluator stage (§3.5): how many times the
+# deterministic Stage-1 ontology checks may feed retry_hints back into
+# generation before OWL delivery proceeds regardless of remaining defects.
+MAX_OWL_EVAL_ROUNDS = 2
 
 _TRACE_NAME = "owl_generator"
 
@@ -95,9 +107,12 @@ def _load_pitfall_rules() -> str:
 
 # WORKFLOW
 1. Call get_metadata to understand the database schema.
-2. Call list_documents to discover available documents.
-3. Read relevant documents with read_document.
-4. Output ONLY the final Turtle ontology as plain text (starting with @prefix).
+2. Call get_table_detail on EVERY table you intend to map a class to — get_metadata
+   truncates wide tables at 80 columns, and you must see the FULL column list to give
+   each class exhaustive attribute coverage (see # ATTRIBUTE COVERAGE).
+3. Call list_documents to discover available documents.
+4. Read relevant documents with read_document.
+5. Output ONLY the final Turtle ontology as plain text (starting with @prefix).
 
 # NAMING RULES (CRITICAL – NO EXCEPTIONS)
 • Classes: PascalCase (Customer, SalesOrder)
@@ -127,6 +142,34 @@ def _load_pitfall_rules() -> str:
 • For EVERY DatatypeProperty you MUST declare rdfs:domain on the property itself
   (do not rely on owl:Restriction alone — the platform reads attributes from rdfs:domain)
 
+# ATTRIBUTE COVERAGE (CRITICAL — exhaustive, NOT curated)
+The downstream mapping pipeline can only bind a SQL column to a class when that
+column has a matching owl:DatatypeProperty with rdfs:domain on the class. A class
+with few datatype properties produces an ID+Label-only entity that is USELESS for
+analytics. So model attributes EXHAUSTIVELY, not minimally:
+• For EVERY class, emit a DatatypeProperty for EVERY meaningful source column that
+  describes an instance of that class — across ALL tables that realise the class.
+  A single class is often realised by several source tables (e.g. one per source
+  system, region, or tenant) that each hold the same real-world entity in a local
+  schema; UNION their columns mentally and cover the full set. Use get_table_detail
+  on each covering table to see every column.
+• "Meaningful" = a genuine attribute of the entity: dates, measurements, codes,
+  scores, names, statuses, flags, free-text notes. EXCLUDE ONLY: surrogate/auto-
+  increment row keys with no analytical value, audit columns (created_at, updated_by,
+  etl_*, _ingest_*), and the foreign-key columns that ObjectProperty relationships
+  already carry.
+• When two sources expose the SAME attribute under different column names
+  (e.g. total_amount vs TOTAL_AMT; status vs STATUS_CODE), emit ONE datatype
+  property — do NOT emit a per-source duplicate. The mapping layer reconciles the
+  source columns.
+• Name datatype properties in lowerCamelCase derived from business meaning
+  (order_date → orderDate, TOTAL_AMT → totalAmount).
+  Use ONLY [a-z][A-Za-z0-9]* — never underscores, hyphens, or backslash escapes.
+• The "at least 2 datatype properties" floor in the guidelines is a MINIMUM, not a
+  target. Rich, real-world entities (a transaction, an encounter, an event, a core
+  business object) typically warrant 6–11 datatype properties. Aim for full column
+  coverage, not a tidy subset.
+
 # RELATIONSHIP RULES
 • NEVER create bidirectional relationships.
 • Between any two classes A and B create at most ONE ObjectProperty.
@@ -160,10 +203,12 @@ def _load_pitfall_rules() -> str:
 
 ## 2. Class and property design rules
 For each **class** you create:[1][2][3][4]
-1. Provide:  
-   - A short, clear natural-language definition (1–2 sentences).  
-   - At least 1 object property (unless the class is explicitly abstract).  
-   - At least 2 datatype properties, when meaningful in the domain.  
+1. Provide:
+   - A short, clear natural-language definition (1–2 sentences).
+   - At least 1 object property (unless the class is explicitly abstract).
+   - Datatype properties covering EVERY meaningful source column for the class
+     (see "# ATTRIBUTE COVERAGE" in the system prompt — exhaustive, not curated;
+     2 is a floor, full column coverage is the goal).
 2. Naming conventions:  
    - Classes: UpperCamelCase (e.g., `CustomerOrder`).  
    - Object properties: lowerCamelCase verbs or verb-like phrases (e.g., `placesOrder`).  
@@ -241,6 +286,80 @@ def _parse_pitfall_tool_result(tool_result_json: str) -> Optional[Dict]:
         return None
 
 
+# Stage-1 absolute (Tier-1) ontology defects that the Evaluator forces a
+# retry on.  Coverage ratios are computed and logged but are advisory at the
+# generation stage (they are Tier-2 in the scorecard), so they do not by
+# themselves trigger a regeneration — only hard structural defects do.
+_EVAL_ABSOLUTE_CHECKS = (
+    "orphan_class_count",
+    "dangling_domain_range_count",
+    "naming_violation_count",
+    "duplicate_class_count",
+)
+
+
+def _evaluate_ontology_stage(
+    turtle_text: str, metadata: dict, iteration: int
+) -> Optional[str]:
+    """Run the Stage-1 deterministic ontology checks (§3.2) on *turtle_text*.
+
+    Parses the Turtle into the registry shape, runs the shared intrinsic
+    checks, and returns a concrete ``retry_hint`` feedback string when any
+    Tier-1 absolute defect (orphan class, dangling domain/range, naming
+    violation, duplicate class) is present — turning owl-gen into a real
+    PGE loop.  Returns ``None`` when the ontology is structurally clean.
+
+    Fails open: any exception (import, parse, or evaluation) returns ``None``,
+    so a check failure never blocks OWL delivery (mirrors the pitfall-tool check).
+    """
+    try:
+        from back.core.w3c.owl.OntologyParser import OntologyParser
+        from back.objects.ontology.Ontology import Ontology
+        from agents.pge_eval.ontology_metrics import evaluate_ontology
+
+        # The model sometimes prepends a prose sentence or wraps the Turtle in
+        # a markdown fence; strip that the same way the downstream registry
+        # does, so the Evaluator parses real output instead of skipping.
+        turtle_text = Ontology.clean_owl_output(turtle_text)
+        parser = OntologyParser(turtle_text)
+        ontology = {
+            "classes": parser.get_classes(),
+            "properties": parser.get_properties(),
+        }
+        metrics, issues, _footprint = evaluate_ontology(ontology, metadata or {})
+        logger.info(
+            "Iteration %d: ontology evaluator — metrics=%s",
+            iteration,
+            metrics,
+        )
+
+        absolute_issues = [
+            i for i in issues if i.get("check") in _EVAL_ABSOLUTE_CHECKS
+        ]
+        if not absolute_issues:
+            logger.info(
+                "Iteration %d: ontology evaluator — no Tier-1 defects", iteration
+            )
+            return None
+
+        lines = [
+            "The ontology you produced has structural defects. Fix ALL of them "
+            "and output ONLY the corrected Turtle (no markdown, no comments, "
+            "starting with @prefix declarations):\n"
+        ]
+        # Send at most 12 hints to keep the prompt bounded; the rest are dropped.
+        for issue in absolute_issues[:12]:
+            lines.append(f"  • {issue['hint']}")
+        return "\n".join(lines)
+    except Exception as exc:  # noqa: BLE001
+        logger.warning(
+            "Iteration %d: ontology evaluator skipped due to error: %s",
+            iteration,
+            exc,
+        )
+        return None
+
+
 def _build_user_prompt(
     guidelines: str,
     options: dict,
@@ -443,6 +562,7 @@ def notify(msg: str):
     # ------------------------------------------------------------------
     tools_supported = True
     _owl_fix_rounds = 0   # pitfall-fix rounds consumed so far
+    _owl_eval_rounds = 0  # Evaluator (Stage-1 PGE) retry rounds consumed
 
     for iteration in range(MAX_ITERATIONS):
         logger.info(
@@ -477,7 +597,7 @@ def notify(msg: str):
                 endpoint_name,
                 messages,
                 tools=send_tools,
-                max_tokens=4096,
+                max_tokens=MAX_OUTPUT_TOKENS,
                 temperature=0.1,
                 timeout=LLM_TIMEOUT,
                 trace_name=_TRACE_NAME,
@@ -509,7 +629,7 @@ def notify(msg: str):
                         endpoint_name,
                         messages,
                         tools=None,
-                        max_tokens=4096,
+                        max_tokens=MAX_OUTPUT_TOKENS,
                         temperature=0.1,
                         timeout=LLM_TIMEOUT,
                         trace_name=_TRACE_NAME,
@@ -749,6 +869,41 @@ def notify(msg: str):
                         _owl_fix_rounds, max_fix_rounds,
                     )
 
+            # --------------------------------------------------------------
+            # Evaluator stage (PGE loop) — once the pitfall-tool loop is clean
+            # or out of rounds, run the Stage-1 deterministic checks (§3.2).
+            # On a Tier-1 structural defect, feed concrete retry_hints back to
+            # the generator, bounded by MAX_OWL_EVAL_ROUNDS. Only retry when
+            # there's another iteration left, so a usable ontology is never
+            # discarded by exhausting MAX_ITERATIONS.
+            # --------------------------------------------------------------
+            eval_feedback = _evaluate_ontology_stage(content, ctx.metadata, iteration + 1)
+            if (
+                eval_feedback
+                and _owl_eval_rounds < MAX_OWL_EVAL_ROUNDS
+                and iteration < MAX_ITERATIONS - 1
+            ):
+                _owl_eval_rounds += 1
+                notify(
+                    f"Ontology defects found — eval round "
+                    f"{_owl_eval_rounds}/{MAX_OWL_EVAL_ROUNDS}…"
+                )
+                result.steps.append(
+                    AgentStep(
+                        step_type="evaluator",
+                        content=eval_feedback[:200],
+                        duration_ms=0,
+                    )
+                )
+                messages.append({"role": "assistant", "content": content})
+                messages.append({"role": "user", "content": eval_feedback})
+                logger.info(
+                    "Iteration %d: ontology evaluator found defects — eval round %d",
+                    iteration + 1,
+                    _owl_eval_rounds,
+                )
+                continue   # next iteration will produce corrected OWL
+
             # ── Accept this text as the final OWL ────────────────────────────
             result.success = True
             result.owl_content = content

@@ -0,0 +1,16 @@
+"""OntoBricks PGE intrinsic-evaluation primitives.
+
+This package holds use-case-agnostic, gold-free structural metrics for the PGE
+pipeline.  It currently contains only the **ontology** slice consumed by the
+owl-generator Evaluator stage:
+
+* :func:`agents.pge_eval.ontology_metrics.evaluate_ontology` — Stage-1
+  deterministic ontology checks (orphan classes, dangling domain/range,
+  naming, duplicates, footprint coverage), computed purely from the generated
+  ontology + source metadata (no stored reference answer).
+
+The full scorecard (mapping metrics, gate tiers, baseline regression, LLM
+judge, CLI) lands in a separate change.  Importers should depend on the
+concrete submodule (``agents.pge_eval.ontology_metrics``) rather than this
+package root to avoid coupling to modules introduced later.
+"""