From 2f749e3681ea03480b2448b03509111ee3a3daa3 Mon Sep 17 00:00:00 2001 From: boringdata Date: Thu, 2 Apr 2026 07:22:41 +0000 Subject: [PATCH 1/4] feat: add OSI (Open Semantic Interchange) v0.1.1 compatibility Add bidirectional converter between BSL and OSI YAML format: - to_osi() / to_osi_yaml(): Export BSL models to OSI-compliant YAML - from_osi() / from_osi_yaml(): Import OSI YAML into BSL models - ai_context field on Dimension and Measure for LLM metadata - Expression translation between Ibis Deferred and SQL strings - BSL-specific metadata preserved via OSI custom_extensions - Round-trip tested: BSL->OSI->BSL and OSI->BSL->OSI Closes #226 Co-Authored-By: Claude Opus 4.6 (1M context) --- docs/osi-compatibility.md | 115 ++++ examples/flights_osi.yaml | 436 ++++++++++++ src/boring_semantic_layer/__init__.py | 10 + src/boring_semantic_layer/ops.py | 20 +- src/boring_semantic_layer/osi.py | 696 ++++++++++++++++++++ src/boring_semantic_layer/tests/test_osi.py | 551 ++++++++++++++++ src/boring_semantic_layer/yaml.py | 2 + 7 files changed, 1826 insertions(+), 4 deletions(-) create mode 100644 docs/osi-compatibility.md create mode 100644 examples/flights_osi.yaml create mode 100644 src/boring_semantic_layer/osi.py create mode 100644 src/boring_semantic_layer/tests/test_osi.py diff --git a/docs/osi-compatibility.md b/docs/osi-compatibility.md new file mode 100644 index 00000000..e03c56d3 --- /dev/null +++ b/docs/osi-compatibility.md @@ -0,0 +1,115 @@ +# BSL <> OSI Compatibility Gap Analysis + +**OSI Spec Version**: 0.1.1 (2025-12-11) +**BSL Version**: 0.3.11 + +## Overview + +The [Open Semantic Interchange (OSI)](https://github.com/open-semantic-interchange/OSI) spec defines +a vendor-neutral YAML format for semantic model definitions. This document maps BSL's current format +to OSI and identifies gaps that need bridging for full compatibility. 
+ +## Structural Differences + +| Aspect | OSI | BSL | Status | +|--------|-----|-----|--------| +| Top-level wrapper | `semantic_model: [{name, datasets, ...}]` | Flat model-per-key | DIFFERENT | +| Version field | `version: "0.1.1"` (required) | None | MISSING | +| Dataset/model | `datasets` array with `name` + `source` | Top-level keys with `table` | DIFFERENT | +| Fields | `fields` array (name + expression object) | `dimensions` dict with `_.expr` syntax | DIFFERENT | +| Metrics | `metrics` array at semantic_model level | `measures`/`calculated_measures` dicts at model level | DIFFERENT | +| Relationships | `relationships` array with `from`/`to` + column arrays | `joins` dict with `model`/`type`/`left_on`/`right_on` | DIFFERENT | + +## Field-Level Gaps + +### Missing in BSL (needed for OSI export) + +| OSI Field | Location | Description | Priority | +|-----------|----------|-------------|----------| +| `ai_context` | Every level | String or structured object with instructions/synonyms/examples | HIGH | +| `primary_key` | Dataset | Array of column names forming PK | MEDIUM | +| `unique_keys` | Dataset | Array of unique key arrays | LOW | +| `custom_extensions` | Every level | Vendor-specific metadata (`vendor_name` + `data` JSON) | LOW | +| `label` | Field | Categorization label (e.g., "filter") | LOW | +| Multi-dialect `expression` | Field/Metric | `dialects: [{dialect, expression}]` | MEDIUM | + +### BSL has, OSI doesn't + +| BSL Field | Description | Handling | +|-----------|-------------|----------| +| `profile` | Database connection config | Omit from OSI export | +| `filter` | Model-level filter expression | Store in `custom_extensions` | +| `is_entity` | Entity/PK marker on dimension | Map to `primary_key` on dataset | +| `is_event_timestamp` | Event timestamp marker | Store in `custom_extensions` | +| `smallest_time_grain` | Time granularity | Store in `custom_extensions` | +| `derived_dimensions` | Auto-derived time parts | Store in `custom_extensions` 
| +| `calculated_measures` | Derived metrics referencing other measures | Export as metrics with `custom_extensions` | +| `join.type` / `join.how` | Join cardinality + method | Inferred from relationship + `custom_extensions` | + +## Concept Mapping + +### Dimensions -> Fields + +``` +BSL dimension with is_time_dimension=true + -> OSI field with dimension.is_time=true + +BSL dimension with is_entity=true + -> OSI dataset.primary_key includes this field's column + +BSL dimension expression: _.column_name + -> OSI expression.dialects[{dialect: "ANSI_SQL", expression: "column_name"}] + +BSL computed dimension: _.first_name.concat(' ', _.last_name) + -> OSI expression: "first_name || ' ' || last_name" (SQL form) +``` + +### Measures -> Metrics + +``` +BSL measure: _.column.sum() + -> OSI metric expression: "SUM(dataset_name.column)" + +BSL measure: _.count() + -> OSI metric expression: "COUNT(*)" + +BSL calculated_measure: _.meas1 / _.meas2 + -> OSI metric with custom_extension noting it's derived +``` + +### Joins -> Relationships + +``` +BSL join: + carriers: + model: carriers + type: one + left_on: carrier + right_on: code + + -> OSI relationship: + name: flights_to_carriers + from: flights + to: carriers + from_columns: [carrier] + to_columns: [code] +``` + +## Implementation Plan + +### Phase 1: Bidirectional Converter (this PR) + +1. **`osi.py`** - New module with `to_osi()` and `from_osi()` functions +2. **`ai_context`** on `Dimension` and `Measure` - Optional field for round-trip fidelity +3. **Tests** - Round-trip conversion tests +4. **Example** - flights.yml converted to OSI format + +### Expression Translation Strategy + +BSL uses Ibis Deferred expressions (`_.column`), while OSI uses SQL strings. +Plain column references and simple aggregates (`sum`, `mean`, `max`, `min`, `nunique`, `count`) are translated by pattern matching. +Any other expression is passed through as its Ibis string form with the leading `_.` removed, so the SQL shown in this document for computed expressions is the intended target, not current output.
+ +**Simple**: `_.column_name` -> `"column_name"` +**Computed**: `_.first_name.concat(' ', _.last_name)` -> `"first_name || ' ' || last_name"` +**Aggregate**: `_.amount.sum()` -> `"SUM(amount)"` diff --git a/examples/flights_osi.yaml b/examples/flights_osi.yaml new file mode 100644 index 00000000..5d6260ae --- /dev/null +++ b/examples/flights_osi.yaml @@ -0,0 +1,436 @@ +# OSI (Open Semantic Interchange) version of the flights example +# Spec: https://github.com/open-semantic-interchange/OSI (v0.1.1) +# +# This is the OSI-compliant equivalent of flights.yml. +# Use from_osi_yaml() to load this into BSL: +# +# from boring_semantic_layer.osi import from_osi_yaml +# models = from_osi_yaml("flights_osi.yaml", tables=tables) + +version: "0.1.1" + +semantic_model: + - name: flights_analytics + description: Flight data analytics model with carriers, aircraft, and airports + ai_context: + instructions: > + Use this model for analyzing US domestic flight data including + delays, distances, carriers, aircraft, and airports. + examples: + - "What are the top 10 carriers by total flights?" + - "What is the average flight distance by origin airport?" 
+ - "Show me the distribution of departure delays" + + datasets: + # ---------------------------------------------------------------- + # Carriers + # ---------------------------------------------------------------- + - name: carriers + source: carriers_tbl + primary_key: [code] + description: Airline carrier information + fields: + - name: code + expression: + dialects: + - dialect: ANSI_SQL + expression: code + dimension: + is_time: false + description: Airline carrier code + + - name: name + expression: + dialects: + - dialect: ANSI_SQL + expression: name + dimension: + is_time: false + description: Full airline name + + - name: nickname + expression: + dialects: + - dialect: ANSI_SQL + expression: nickname + dimension: + is_time: false + + # ---------------------------------------------------------------- + # Aircraft Models + # ---------------------------------------------------------------- + - name: aircraft_models + source: aircraft_models_tbl + primary_key: [aircraft_model_code] + description: Aircraft model specifications + fields: + - name: aircraft_model_code + expression: + dialects: + - dialect: ANSI_SQL + expression: aircraft_model_code + dimension: + is_time: false + + - name: manufacturer + expression: + dialects: + - dialect: ANSI_SQL + expression: manufacturer + dimension: + is_time: false + + - name: model + expression: + dialects: + - dialect: ANSI_SQL + expression: model + dimension: + is_time: false + + - name: aircraft_type_id + expression: + dialects: + - dialect: ANSI_SQL + expression: aircraft_type_id + dimension: + is_time: false + + - name: aircraft_engine_type_id + expression: + dialects: + - dialect: ANSI_SQL + expression: aircraft_engine_type_id + dimension: + is_time: false + + - name: aircraft_category_id + expression: + dialects: + - dialect: ANSI_SQL + expression: aircraft_category_id + dimension: + is_time: false + + # ---------------------------------------------------------------- + # Aircraft (Individual Planes) + # 
---------------------------------------------------------------- + - name: aircraft + source: aircraft_tbl + primary_key: [tail_num] + description: Individual aircraft registry + fields: + - name: tail_num + expression: + dialects: + - dialect: ANSI_SQL + expression: tail_num + dimension: + is_time: false + + - name: aircraft_serial + expression: + dialects: + - dialect: ANSI_SQL + expression: aircraft_serial + dimension: + is_time: false + + - name: aircraft_model_code + expression: + dialects: + - dialect: ANSI_SQL + expression: aircraft_model_code + dimension: + is_time: false + + - name: aircraft_engine_code + expression: + dialects: + - dialect: ANSI_SQL + expression: aircraft_engine_code + dimension: + is_time: false + + - name: aircraft_type_id + expression: + dialects: + - dialect: ANSI_SQL + expression: aircraft_type_id + dimension: + is_time: false + + # ---------------------------------------------------------------- + # Airports + # ---------------------------------------------------------------- + - name: airports + source: airports_tbl + primary_key: [code] + description: Airport locations and information + ai_context: + synonyms: + - "airport" + - "airfield" + - "aerodrome" + fields: + - name: code + expression: + dialects: + - dialect: ANSI_SQL + expression: code + dimension: + is_time: false + + - name: city + expression: + dialects: + - dialect: ANSI_SQL + expression: city + dimension: + is_time: false + + - name: county + expression: + dialects: + - dialect: ANSI_SQL + expression: county + dimension: + is_time: false + + - name: state + expression: + dialects: + - dialect: ANSI_SQL + expression: state + dimension: + is_time: false + + - name: full_name + expression: + dialects: + - dialect: ANSI_SQL + expression: full_name + dimension: + is_time: false + + - name: latitude + expression: + dialects: + - dialect: ANSI_SQL + expression: latitude + dimension: + is_time: false + + - name: longitude + expression: + dialects: + - dialect: ANSI_SQL + 
expression: longitude + dimension: + is_time: false + + - name: elevation + expression: + dialects: + - dialect: ANSI_SQL + expression: elevation + dimension: + is_time: false + + # ---------------------------------------------------------------- + # Flights + # ---------------------------------------------------------------- + - name: flights + source: flights_tbl + description: Flight data with origin, destination, and metrics + ai_context: + synonyms: + - "flight records" + - "air travel" + - "flight segments" + fields: + - name: origin + expression: + dialects: + - dialect: ANSI_SQL + expression: origin + dimension: + is_time: false + description: Origin airport code + ai_context: + synonyms: + - "departure airport" + - "from" + + - name: destination + expression: + dialects: + - dialect: ANSI_SQL + expression: destination + dimension: + is_time: false + description: Destination airport code + ai_context: + synonyms: + - "arrival airport" + - "to" + + - name: carrier + expression: + dialects: + - dialect: ANSI_SQL + expression: carrier + dimension: + is_time: false + + - name: tail_num + expression: + dialects: + - dialect: ANSI_SQL + expression: tail_num + dimension: + is_time: false + + - name: dep_time + expression: + dialects: + - dialect: ANSI_SQL + expression: dep_time + dimension: + is_time: false + description: Departure timestamp + + - name: arr_time + expression: + dialects: + - dialect: ANSI_SQL + expression: arr_time + dimension: + is_time: true + description: Arrival timestamp + custom_extensions: + - vendor_name: COMMON + data: '{"is_event_timestamp": false, "smallest_time_grain": "TIME_GRAIN_DAY"}' + + - name: distance + expression: + dialects: + - dialect: ANSI_SQL + expression: distance + dimension: + is_time: false + description: Flight distance in miles + + relationships: + - name: flights_to_carriers + from: flights + to: carriers + from_columns: [carrier] + to_columns: [code] + + - name: aircraft_to_aircraft_models + from: aircraft + to: 
aircraft_models + from_columns: [aircraft_model_code] + to_columns: [aircraft_model_code] + + - name: flights_to_aircraft + from: flights + to: aircraft + from_columns: [tail_num] + to_columns: [tail_num] + + - name: flights_to_origin_airport + from: flights + to: airports + from_columns: [origin] + to_columns: [code] + + metrics: + - name: carrier_count + expression: + dialects: + - dialect: ANSI_SQL + expression: COUNT(*) + description: Number of carriers + + - name: model_count + expression: + dialects: + - dialect: ANSI_SQL + expression: COUNT(*) + description: Number of aircraft models + + - name: avg_seats + expression: + dialects: + - dialect: ANSI_SQL + expression: AVG(aircraft_models.seats) + description: Average number of seats + + - name: aircraft_count + expression: + dialects: + - dialect: ANSI_SQL + expression: COUNT(*) + description: Number of aircraft + + - name: airport_count + expression: + dialects: + - dialect: ANSI_SQL + expression: COUNT(*) + description: Number of airports + + - name: avg_elevation + expression: + dialects: + - dialect: ANSI_SQL + expression: AVG(airports.elevation) + description: Average airport elevation + + - name: flight_count + expression: + dialects: + - dialect: ANSI_SQL + expression: COUNT(*) + description: Total number of flights + ai_context: + synonyms: + - "number of flights" + - "total flights" + + - name: total_distance + expression: + dialects: + - dialect: ANSI_SQL + expression: SUM(flights.distance) + description: Total distance flown + + - name: avg_distance + expression: + dialects: + - dialect: ANSI_SQL + expression: AVG(flights.distance) + description: Average flight distance + + - name: max_distance + expression: + dialects: + - dialect: ANSI_SQL + expression: MAX(flights.distance) + description: Maximum flight distance + + - name: avg_delay + expression: + dialects: + - dialect: ANSI_SQL + expression: AVG(flights.dep_delay) + description: Average departure delay + + - name: max_delay + expression: + 
dialects: + - dialect: ANSI_SQL + expression: MAX(flights.dep_delay) + description: Maximum departure delay diff --git a/src/boring_semantic_layer/__init__.py b/src/boring_semantic_layer/__init__.py index fe92c358..828cebc3 100644 --- a/src/boring_semantic_layer/__init__.py +++ b/src/boring_semantic_layer/__init__.py @@ -43,6 +43,12 @@ from_config, from_yaml, ) +from .osi import ( + from_osi, + from_osi_yaml, + to_osi, + to_osi_yaml, +) __all__ = [ "to_semantic_table", @@ -57,6 +63,10 @@ "Measure", "from_config", "from_yaml", + "to_osi", + "to_osi_yaml", + "from_osi", + "from_osi_yaml", "MCPSemanticModel", "LangGraphBackend", "options", diff --git a/src/boring_semantic_layer/ops.py b/src/boring_semantic_layer/ops.py index 3702816f..55469dbd 100644 --- a/src/boring_semantic_layer/ops.py +++ b/src/boring_semantic_layer/ops.py @@ -626,22 +626,24 @@ def _infer_unnest(fn: Callable, table: Any) -> tuple[str, ...]: return () -def _extract_measure_metadata(fn_or_expr: Any) -> tuple[Any, str | None, tuple]: +def _extract_measure_metadata(fn_or_expr: Any) -> tuple[Any, str | None, tuple, Any]: """Extract metadata from various measure representations.""" if isinstance(fn_or_expr, dict): return ( fn_or_expr["expr"], fn_or_expr.get("description"), tuple(fn_or_expr.get("requires_unnest", [])), + fn_or_expr.get("ai_context"), ) elif isinstance(fn_or_expr, Measure): return ( fn_or_expr.expr, fn_or_expr.description, fn_or_expr.requires_unnest, + fn_or_expr.ai_context, ) else: - return (fn_or_expr, None, ()) + return (fn_or_expr, None, (), None) _AGG_METHODS = frozenset({"sum", "mean", "avg", "count", "min", "max"}) @@ -733,6 +735,7 @@ def _make_base_measure( expr: Any, description: str | None, requires_unnest: tuple, + ai_context: Any = None, ) -> Measure: """Create a base measure with proper callable wrapping using functional patterns.""" @@ -788,6 +791,7 @@ def wrapped_expr(t): description=description, requires_unnest=requires_unnest, original_expr=raw_expr, + 
ai_context=ai_context, ) if callable(expr): @@ -806,6 +810,7 @@ def wrapped_expr(t): description=description, requires_unnest=requires_unnest, original_expr=raw_expr, + ai_context=ai_context, ) else: return Measure( @@ -813,12 +818,13 @@ def wrapped_expr(t): description=description, requires_unnest=requires_unnest, original_expr=raw_expr, + ai_context=ai_context, ) def _classify_measure(fn_or_expr: Any, scope: Any) -> tuple[str, Any]: """Classify measure as 'calc' or 'base' with appropriate handling.""" - expr, description, requires_unnest = _extract_measure_metadata(fn_or_expr) + expr, description, requires_unnest, ai_context = _extract_measure_metadata(fn_or_expr) resolved = safe(lambda: _resolve_expr(expr, scope))().map( lambda val: ("calc", val) if _is_calculated_measure(val) else None @@ -833,7 +839,7 @@ def _classify_measure(fn_or_expr: Any, scope: Any) -> tuple[str, Any]: inferred_unnest = _infer_unnest(expr, table) requires_unnest = requires_unnest or inferred_unnest - return ("base", _make_base_measure(expr, description, requires_unnest)) + return ("base", _make_base_measure(expr, description, requires_unnest, ai_context)) def _build_json_definition( @@ -953,6 +959,7 @@ class Dimension: is_event_timestamp: bool = False smallest_time_grain: str | None = None derived_dimensions: tuple[str, ...] = () + ai_context: str | dict | None = None def __call__(self, table: ir.Table, _dims: dict | None = None) -> ir.Value: try: @@ -990,6 +997,8 @@ def to_json(self) -> Mapping[str, Any]: base["smallest_time_grain"] = self.smallest_time_grain if self.derived_dimensions: base["derived_dimensions"] = list(self.derived_dimensions) + if self.ai_context: + base["ai_context"] = self.ai_context return base def __hash__(self) -> int: @@ -1011,6 +1020,7 @@ class Measure: description: str | None = None requires_unnest: tuple[str, ...] 
= () # Internal: Arrays that must be unnested original_expr: Any = field(default=None, eq=False, hash=False) + ai_context: str | dict | None = None def __call__(self, table: ir.Table) -> ir.Value: return self.expr.resolve(table) if _is_deferred(self.expr) else self.expr(table) @@ -1026,6 +1036,8 @@ def to_json(self) -> Mapping[str, Any]: base["locality"] = self.locality if self.requires_unnest: base["requires_unnest"] = list(self.requires_unnest) + if self.ai_context: + base["ai_context"] = self.ai_context return base def __hash__(self) -> int: diff --git a/src/boring_semantic_layer/osi.py b/src/boring_semantic_layer/osi.py new file mode 100644 index 00000000..d2802f77 --- /dev/null +++ b/src/boring_semantic_layer/osi.py @@ -0,0 +1,696 @@ +""" +OSI (Open Semantic Interchange) converter for Boring Semantic Layer. + +Provides bidirectional conversion between BSL's semantic model format +and the OSI v0.1.1 YAML specification. + +See: https://github.com/open-semantic-interchange/OSI +""" + +from __future__ import annotations + +import re +from collections.abc import Mapping +from typing import Any + +from ibis import _ +from ibis.common.deferred import Deferred + +from .api import to_semantic_table +from .expr import SemanticModel, SemanticTable +from .ops import Dimension, Measure, SemanticTableOp, _is_deferred +from .utils import expr_to_ibis_string, safe_eval + +OSI_VERSION = "0.1.1" +BSL_VENDOR = "BSL" + + +# --------------------------------------------------------------------------- +# Expression helpers +# --------------------------------------------------------------------------- + + +def _deferred_to_sql(expr: Deferred) -> str: + """Convert an Ibis Deferred expression to a simple SQL-like string. 
+ + Handles common patterns: + _.column_name -> "column_name" + _.column.sum() -> "SUM(column)" + _.column.mean() -> "AVG(column)" + _.column.max() -> "MAX(column)" + _.column.min() -> "MIN(column)" + _.column.nunique() -> "COUNT(DISTINCT column)" + _.count() -> "COUNT(*)" + """ + s = str(expr) # e.g. "_.column_name" or "_.column.sum()" + return _ibis_string_to_sql(s) + + +def _ibis_string_to_sql(s: str) -> str: + """Convert an Ibis deferred string repr to SQL expression.""" + s = s.strip() + + # _.count() -> COUNT(*) + if s == "_.count()": + return "COUNT(*)" + + # _.col.agg() patterns + agg_map = { + "sum": "SUM", + "mean": "AVG", + "max": "MAX", + "min": "MIN", + } + for ibis_fn, sql_fn in agg_map.items(): + pattern = rf"^_\.(.+)\.{ibis_fn}\(\)$" + m = re.match(pattern, s) + if m: + return f"{sql_fn}({m.group(1)})" + + # _.col.nunique() -> COUNT(DISTINCT col) + m = re.match(r"^_\.(.+)\.nunique\(\)$", s) + if m: + return f"COUNT(DISTINCT {m.group(1)})" + + # Simple column reference: _.col -> col + m = re.match(r"^_\.(\w+)$", s) + if m: + return m.group(1) + + # Fallback: strip leading "_." and return as-is + if s.startswith("_."): + return s[2:] + return s + + +def _expr_to_sql_string(expr: Any) -> str | None: + """Best-effort conversion of a BSL expression to a SQL string.""" + if _is_deferred(expr): + return _deferred_to_sql(expr) + + # Try the ibis string extraction utility + from returns.result import Success + + result = expr_to_ibis_string(expr) + if isinstance(result, Success): + val = result.unwrap() + if val is not None: + return _ibis_string_to_sql(val) + + return None + + +def _sql_to_deferred(sql: str) -> Deferred: + """Convert a simple SQL expression back to an Ibis Deferred. 
+ + Handles: + "column_name" -> _.column_name + "SUM(column)" -> _.column.sum() + "AVG(column)" -> _.column.mean() + "COUNT(*)" -> _.count() + "COUNT(DISTINCT column)" -> _.column.nunique() + """ + sql = sql.strip() + + if sql == "COUNT(*)": + return safe_eval("_.count()", context={"_": _}).unwrap() + + # COUNT(DISTINCT col) + m = re.match(r"^COUNT\(DISTINCT\s+(\w+)\)$", sql, re.IGNORECASE) + if m: + return safe_eval(f"_.{m.group(1)}.nunique()", context={"_": _}).unwrap() + + # AGG(col) patterns + sql_to_ibis = {"SUM": "sum", "AVG": "mean", "MAX": "max", "MIN": "min"} + for sql_fn, ibis_fn in sql_to_ibis.items(): + m = re.match(rf"^{sql_fn}\((\w+)\)$", sql, re.IGNORECASE) + if m: + return safe_eval(f"_.{m.group(1)}.{ibis_fn}()", context={"_": _}).unwrap() + + # Simple column reference + if re.match(r"^\w+$", sql): + return safe_eval(f"_.{sql}", context={"_": _}).unwrap() + + # Fallback: try eval as-is with underscore prefix + try: + return safe_eval(f"_.{sql}", context={"_": _}).unwrap() + except Exception: + # Last resort: return as deferred column access + return safe_eval(f"_.{sql.split('.')[0] if '.' 
in sql else sql}", context={"_": _}).unwrap() + + +# --------------------------------------------------------------------------- +# Export: BSL -> OSI +# --------------------------------------------------------------------------- + + +def _make_osi_expression(sql_expr: str, dialect: str = "ANSI_SQL") -> dict: + """Create an OSI expression object.""" + return {"dialects": [{"dialect": dialect, "expression": sql_expr}]} + + +def _dimension_to_osi_field(name: str, dim: Dimension) -> dict: + """Convert a BSL Dimension to an OSI field dict.""" + sql = _expr_to_sql_string(dim.expr) + field: dict[str, Any] = { + "name": name, + "expression": _make_osi_expression(sql or name), + } + + if dim.is_time_dimension or dim.is_event_timestamp: + field["dimension"] = {"is_time": True} + else: + field["dimension"] = {"is_time": False} + + if dim.description: + field["description"] = dim.description + + if dim.ai_context: + field["ai_context"] = dim.ai_context + + # Store BSL-specific metadata in custom_extensions + bsl_data: dict[str, Any] = {} + if dim.is_entity: + bsl_data["is_entity"] = True + if dim.is_event_timestamp: + bsl_data["is_event_timestamp"] = True + if dim.smallest_time_grain: + bsl_data["smallest_time_grain"] = dim.smallest_time_grain + if dim.derived_dimensions: + bsl_data["derived_dimensions"] = list(dim.derived_dimensions) + + if bsl_data: + field["custom_extensions"] = [ + {"vendor_name": "COMMON", "data": _json_dumps(bsl_data)} + ] + + return field + + +def _measure_to_osi_metric( + name: str, measure: Measure, dataset_name: str | None = None +) -> dict: + """Convert a BSL Measure to an OSI metric dict.""" + sql = _expr_to_sql_string(measure.expr) + if sql and dataset_name: + # Prefix column references in aggregate functions with dataset name + # e.g. 
SUM(amount) -> SUM(dataset.amount) + sql = _prefix_columns_in_sql(sql, dataset_name) + + metric: dict[str, Any] = { + "name": name, + "expression": _make_osi_expression(sql or name), + } + + if measure.description: + metric["description"] = measure.description + + if measure.ai_context: + metric["ai_context"] = measure.ai_context + + return metric + + +def _prefix_columns_in_sql(sql: str, dataset: str) -> str: + """Add dataset prefix to bare column references inside aggregate functions. + + SUM(amount) -> SUM(dataset.amount) + COUNT(*) stays as COUNT(*) + COUNT(DISTINCT col) -> COUNT(DISTINCT dataset.col) + """ + if sql == "COUNT(*)": + return sql + + # Handle aggregate functions: FN(col) -> FN(dataset.col) + def _prefix_match(m: re.Match) -> str: + fn = m.group(1) + inner = m.group(2).strip() + # Handle DISTINCT keyword + if inner.upper().startswith("DISTINCT "): + col = inner[9:].strip() + if "." not in col: + return f"{fn}(DISTINCT {dataset}.{col})" + return m.group(0) + if "." not in inner and inner != "*": + return f"{fn}({dataset}.{inner})" + return m.group(0) + + return re.sub(r"(\w+)\(([^)]+)\)", _prefix_match, sql) + + +def _json_dumps(obj: Any) -> str: + """Serialize to JSON string for custom_extensions.""" + import json + + return json.dumps(obj) + + +def _extract_join_info(model: SemanticModel) -> list[dict]: + """Extract relationship info from a model's join chain. + + Returns list of OSI relationship dicts by inspecting the model's + SemanticJoinOp chain. 
+ """ + relationships = [] + op = model.op() + + # Walk up the op tree looking for SemanticJoinOp nodes + from .ops import SemanticJoinOp + + def _walk_joins(node, parent_name: str | None = None): + if isinstance(node, SemanticJoinOp): + rel: dict[str, Any] = { + "name": f"{_get_model_name(node.left)}_{_get_model_name(node.right)}", + "from": _get_model_name(node.left), + "to": _get_model_name(node.right), + } + # Try to extract join columns from the predicate + left_cols, right_cols = _extract_join_columns(node) + if left_cols and right_cols: + rel["from_columns"] = left_cols + rel["to_columns"] = right_cols + else: + rel["from_columns"] = ["unknown"] + rel["to_columns"] = ["unknown"] + + # Store cardinality in custom_extensions + if hasattr(node, "cardinality"): + rel["custom_extensions"] = [ + { + "vendor_name": "COMMON", + "data": _json_dumps({"cardinality": node.cardinality}), + } + ] + + relationships.append(rel) + _walk_joins(node.left, _get_model_name(node.left)) + _walk_joins(node.right, _get_model_name(node.right)) + elif isinstance(node, SemanticTableOp): + pass # Base case + elif hasattr(node, "table"): + _walk_joins(node.table if not hasattr(node.table, "op") else node.table.op()) + + _walk_joins(op) + return relationships + + +def _get_model_name(node) -> str: + """Extract model name from an op node.""" + if isinstance(node, SemanticTableOp): + return node.name or "unnamed" + if hasattr(node, "name") and node.name: + return node.name + if hasattr(node, "table"): + inner = node.table if not hasattr(node.table, "op") else node.table.op() + return _get_model_name(inner) + return "unnamed" + + +def _extract_join_columns(join_op) -> tuple[list[str], list[str]]: + """Try to extract column names from a join predicate. 
Returns (left_cols, right_cols).""" + # This is best-effort; join predicates are lambdas and hard to introspect + return [], [] + + +def to_osi( + models: dict[str, SemanticModel] | SemanticModel, + name: str = "semantic_model", + description: str | None = None, + ai_context: str | dict | None = None, +) -> dict[str, Any]: + """Convert BSL SemanticModel(s) to an OSI-compliant dict. + + Args: + models: A single SemanticModel or dict of name -> SemanticModel + name: Name for the OSI semantic model + description: Optional description + ai_context: Optional AI context + + Returns: + Dict that can be serialized to OSI YAML via yaml.dump() + + Example: + >>> from boring_semantic_layer import from_yaml + >>> from boring_semantic_layer.osi import to_osi + >>> models = from_yaml("flights.yml") + >>> osi = to_osi(models, name="flights_analytics") + >>> import yaml + >>> print(yaml.dump(osi, sort_keys=False)) + """ + if isinstance(models, (SemanticModel, SemanticTable)): + op = models.op() + model_name = op.name or "model" + models = {model_name: models} + + datasets = [] + all_metrics: list[dict] = [] + all_relationships: list[dict] = [] + seen_relationship_names: set[str] = set() + + for model_name, model in models.items(): + op = model.op() + + # --- Dataset --- + dataset: dict[str, Any] = {"name": model_name} + + # Source: try to get table name + try: + source_table = op.to_untagged() + if hasattr(source_table, "get_name"): + dataset["source"] = source_table.get_name() + elif hasattr(source_table, "op") and hasattr(source_table.op(), "name"): + dataset["source"] = source_table.op().name or model_name + else: + dataset["source"] = model_name + except Exception: + dataset["source"] = model_name + + # Primary key from entity dimensions + pk_cols = [] + dims = op.get_dimensions() + for dim_name, dim in dims.items(): + if dim.is_entity: + sql = _expr_to_sql_string(dim.expr) + pk_cols.append(sql or dim_name) + if pk_cols: + dataset["primary_key"] = pk_cols + + # Description + 
if op.description: + dataset["description"] = op.description + + # Fields from dimensions + fields = [] + for dim_name, dim in dims.items(): + fields.append(_dimension_to_osi_field(dim_name, dim)) + if fields: + dataset["fields"] = fields + + datasets.append(dataset) + + # --- Metrics from measures --- + measures = op.get_measures() + for meas_name, meas in measures.items(): + all_metrics.append(_measure_to_osi_metric(meas_name, meas, model_name)) + + # --- Metrics from calculated measures --- + calc_measures = op.get_calculated_measures() + for cm_name, cm_fn in calc_measures.items(): + metric: dict[str, Any] = { + "name": cm_name, + "expression": _make_osi_expression(cm_name), + } + if isinstance(cm_fn, Measure) and cm_fn.description: + metric["description"] = cm_fn.description + metric.setdefault("custom_extensions", []).append( + { + "vendor_name": "COMMON", + "data": _json_dumps({"bsl_type": "calculated_measure"}), + } + ) + all_metrics.append(metric) + + # --- Relationships from joins --- + rels = _extract_join_info(model) + for rel in rels: + if rel["name"] not in seen_relationship_names: + all_relationships.append(rel) + seen_relationship_names.add(rel["name"]) + + # Build the OSI document + semantic_model: dict[str, Any] = {"name": name, "datasets": datasets} + + if description: + semantic_model["description"] = description + if ai_context: + semantic_model["ai_context"] = ai_context + if all_relationships: + semantic_model["relationships"] = all_relationships + if all_metrics: + semantic_model["metrics"] = all_metrics + + return {"version": OSI_VERSION, "semantic_model": [semantic_model]} + + +def to_osi_yaml( + models: dict[str, SemanticModel] | SemanticModel, + name: str = "semantic_model", + description: str | None = None, + ai_context: str | dict | None = None, +) -> str: + """Convert BSL models to an OSI YAML string. + + Convenience wrapper around to_osi() that returns a formatted YAML string. 
+ """ + import yaml + + osi_dict = to_osi(models, name=name, description=description, ai_context=ai_context) + return yaml.dump(osi_dict, sort_keys=False, default_flow_style=False) + + +# --------------------------------------------------------------------------- +# Import: OSI -> BSL +# --------------------------------------------------------------------------- + + +def _parse_osi_expression(expr_obj: dict, prefer_dialect: str = "ANSI_SQL") -> str: + """Extract the SQL expression string from an OSI expression object. + + Prefers the specified dialect, falls back to the first available. + """ + dialects = expr_obj.get("dialects", []) + if not dialects: + raise ValueError("OSI expression has no dialects") + + # Try preferred dialect first + for d in dialects: + if d.get("dialect") == prefer_dialect: + return d["expression"] + + # Fallback to first + return dialects[0]["expression"] + + +def _osi_field_to_dimension(field: dict) -> tuple[str, Dimension]: + """Convert an OSI field dict to a (name, BSL Dimension) tuple.""" + name = field["name"] + sql_expr = _parse_osi_expression(field["expression"]) + deferred = _sql_to_deferred(sql_expr) + + kwargs: dict[str, Any] = { + "expr": deferred, + "description": field.get("description"), + } + + # Dimension metadata + dim_meta = field.get("dimension", {}) + if dim_meta.get("is_time"): + kwargs["is_time_dimension"] = True + + # AI context + if "ai_context" in field: + kwargs["ai_context"] = field["ai_context"] + + # BSL-specific from custom_extensions + import json + + for ext in field.get("custom_extensions", []): + if ext.get("vendor_name") == "COMMON": + try: + data = json.loads(ext["data"]) + if data.get("is_entity"): + kwargs["is_entity"] = True + if data.get("is_event_timestamp"): + kwargs["is_event_timestamp"] = True + if data.get("smallest_time_grain"): + kwargs["smallest_time_grain"] = data["smallest_time_grain"] + if data.get("derived_dimensions"): + kwargs["derived_dimensions"] = tuple(data["derived_dimensions"]) + 
except (json.JSONDecodeError, KeyError): + pass + + return name, Dimension(**kwargs) + + +def _osi_metric_to_measure(metric: dict) -> tuple[str, Measure]: + """Convert an OSI metric dict to a (name, BSL Measure) tuple.""" + name = metric["name"] + sql_expr = _parse_osi_expression(metric["expression"]) + + # Strip dataset prefix from column refs for BSL (which scopes per-model) + # e.g., "SUM(flights.distance)" -> "SUM(distance)" via deferred + sql_expr = _strip_dataset_prefix(sql_expr) + deferred = _sql_to_deferred(sql_expr) + + kwargs: dict[str, Any] = { + "expr": deferred, + "description": metric.get("description"), + } + + if "ai_context" in metric: + kwargs["ai_context"] = metric["ai_context"] + + return name, Measure(**kwargs) + + +def _strip_dataset_prefix(sql: str) -> str: + """Remove dataset.column prefixes from SQL, keeping just column name. + + SUM(flights.distance) -> SUM(distance) + COUNT(DISTINCT customers.id) -> COUNT(DISTINCT id) + """ + + def _strip_match(m: re.Match) -> str: + fn = m.group(1) + inner = m.group(2).strip() + # Handle DISTINCT + if inner.upper().startswith("DISTINCT "): + rest = inner[9:].strip() + if "." in rest: + col = rest.split(".")[-1] + return f"{fn}(DISTINCT {col})" + return m.group(0) + if "." in inner and inner != "*": + col = inner.split(".")[-1] + return f"{fn}({col})" + return m.group(0) + + return re.sub(r"(\w+)\(([^)]+)\)", _strip_match, sql) + + +def from_osi( + osi_config: dict[str, Any], + tables: Mapping[str, Any] | None = None, +) -> dict[str, SemanticModel]: + """Convert an OSI YAML dict to BSL SemanticModel instances. + + Args: + osi_config: Parsed OSI YAML dict (as returned by yaml.safe_load) + tables: Optional mapping of dataset names to ibis table expressions. + If not provided, models are created without backing tables + (metadata-only, useful for inspection but not query execution). 
+ + Returns: + Dict mapping model names to SemanticModel instances + + Example: + >>> import yaml + >>> from boring_semantic_layer.osi import from_osi + >>> with open("model.osi.yaml") as f: + ... osi = yaml.safe_load(f) + >>> models = from_osi(osi, tables={"flights": con.table("flights")}) + """ + tables = dict(tables) if tables else {} + + semantic_models = osi_config.get("semantic_model", []) + if not semantic_models: + raise ValueError("No semantic_model found in OSI config") + + result: dict[str, SemanticModel] = {} + + for sm in semantic_models: + datasets = sm.get("datasets", []) + metrics = sm.get("metrics", []) + relationships = sm.get("relationships", []) + + # Build a mapping of dataset_name -> metrics that reference it + # (by checking column prefixes in metric expressions) + dataset_names = {ds["name"] for ds in datasets} + + for ds in datasets: + ds_name = ds["name"] + + # Get or create table + if ds_name in tables: + table = tables[ds_name] + elif ds.get("source") and ds["source"] in tables: + table = tables[ds["source"]] + else: + # Create a dummy table from field schema for metadata-only use + table = _create_placeholder_table(ds) + if table is None: + continue + + # Create semantic model + model = to_semantic_table(table, name=ds_name, description=ds.get("description")) + + # Parse fields into dimensions + dimensions: dict[str, Dimension] = {} + for field in ds.get("fields", []): + dim_name, dim = _osi_field_to_dimension(field) + dimensions[dim_name] = dim + + if dimensions: + model = model.with_dimensions(**dimensions) + + # Find metrics that belong to this dataset + ds_measures: dict[str, Measure] = {} + for metric in metrics: + sql_expr = _parse_osi_expression(metric["expression"]) + # Check if metric references this dataset (or has no prefix) + if f"{ds_name}." in sql_expr or not any( + f"{other}." 
in sql_expr for other in dataset_names if other != ds_name + ): + meas_name, meas = _osi_metric_to_measure(metric) + ds_measures[meas_name] = meas + + if ds_measures: + model = model.with_measures(**ds_measures) + + result[ds_name] = model + + # Apply relationships as joins (if tables are provided) + if tables and relationships: + for rel in relationships: + from_ds = rel.get("from", "") + to_ds = rel.get("to", "") + if from_ds in result and to_ds in result: + from_cols = rel.get("from_columns", []) + to_cols = rel.get("to_columns", []) + if from_cols and to_cols and from_cols[0] != "unknown": + left_col = from_cols[0] + right_col = to_cols[0] + + def make_join_cond(lc, rc): + return lambda left, right: getattr(left, lc) == getattr(right, rc) + + result[from_ds] = result[from_ds].join_one( + result[to_ds], + on=make_join_cond(left_col, right_col), + ) + + return result + + +def from_osi_yaml( + yaml_path: str, + tables: Mapping[str, Any] | None = None, +) -> dict[str, SemanticModel]: + """Load BSL models from an OSI YAML file. + + Convenience wrapper around from_osi() that reads and parses the YAML file. + """ + from .utils import read_yaml_file + + osi_config = read_yaml_file(yaml_path) + return from_osi(osi_config, tables=tables) + + +def _create_placeholder_table(dataset: dict) -> Any: + """Create a placeholder ibis table from OSI field definitions. + + This allows metadata inspection without a real database connection. + Returns None if no fields are defined. 
+ """ + import ibis + + fields = dataset.get("fields", []) + if not fields: + return None + + # Create a simple schema from field names (all as string type as placeholder) + schema = {f["name"]: "string" for f in fields} + try: + return ibis.table(schema, name=dataset["name"]) + except Exception: + return None diff --git a/src/boring_semantic_layer/tests/test_osi.py b/src/boring_semantic_layer/tests/test_osi.py new file mode 100644 index 00000000..f1d52e46 --- /dev/null +++ b/src/boring_semantic_layer/tests/test_osi.py @@ -0,0 +1,551 @@ +"""Tests for OSI (Open Semantic Interchange) converter.""" + +import json + +import ibis +import pytest + +from boring_semantic_layer import ( + Dimension, + Measure, + to_semantic_table, +) +from boring_semantic_layer.osi import ( + OSI_VERSION, + _deferred_to_sql, + _ibis_string_to_sql, + _sql_to_deferred, + _strip_dataset_prefix, + from_osi, + to_osi, + to_osi_yaml, +) + + +# --------------------------------------------------------------------------- +# Expression conversion tests +# --------------------------------------------------------------------------- + + +class TestIbisStringToSql: + def test_simple_column(self): + assert _ibis_string_to_sql("_.column_name") == "column_name" + + def test_count(self): + assert _ibis_string_to_sql("_.count()") == "COUNT(*)" + + def test_sum(self): + assert _ibis_string_to_sql("_.amount.sum()") == "SUM(amount)" + + def test_mean(self): + assert _ibis_string_to_sql("_.amount.mean()") == "AVG(amount)" + + def test_max(self): + assert _ibis_string_to_sql("_.amount.max()") == "MAX(amount)" + + def test_min(self): + assert _ibis_string_to_sql("_.amount.min()") == "MIN(amount)" + + def test_nunique(self): + assert _ibis_string_to_sql("_.customer_id.nunique()") == "COUNT(DISTINCT customer_id)" + + +class TestDeferredToSql: + def test_simple_column(self): + from ibis import _ + + assert _deferred_to_sql(_.column_name) == "column_name" + + def test_count(self): + from ibis import _ + + assert 
_deferred_to_sql(_.count()) == "COUNT(*)" + + def test_sum(self): + from ibis import _ + + result = _deferred_to_sql(_.amount.sum()) + assert result == "SUM(amount)" + + def test_mean(self): + from ibis import _ + + result = _deferred_to_sql(_.amount.mean()) + assert result == "AVG(amount)" + + +class TestSqlToDeferred: + def test_simple_column(self): + d = _sql_to_deferred("column_name") + # Verify it's a Deferred by checking str representation + assert "column_name" in str(d) + + def test_count_star(self): + d = _sql_to_deferred("COUNT(*)") + assert "count" in str(d).lower() + + def test_sum(self): + d = _sql_to_deferred("SUM(amount)") + assert "sum" in str(d).lower() or "amount" in str(d).lower() + + def test_avg(self): + d = _sql_to_deferred("AVG(price)") + assert "mean" in str(d).lower() or "price" in str(d).lower() + + +class TestStripDatasetPrefix: + def test_sum_with_prefix(self): + assert _strip_dataset_prefix("SUM(flights.distance)") == "SUM(distance)" + + def test_count_star(self): + assert _strip_dataset_prefix("COUNT(*)") == "COUNT(*)" + + def test_count_distinct_with_prefix(self): + assert ( + _strip_dataset_prefix("COUNT(DISTINCT customers.id)") + == "COUNT(DISTINCT id)" + ) + + def test_no_prefix(self): + assert _strip_dataset_prefix("SUM(distance)") == "SUM(distance)" + + +# --------------------------------------------------------------------------- +# Export tests: BSL -> OSI +# --------------------------------------------------------------------------- + + +@pytest.fixture +def simple_model(): + """A simple BSL model with dimensions and measures.""" + table = ibis.table( + {"order_id": "int64", "customer_id": "int64", "amount": "float64", "created_at": "timestamp"}, + name="orders", + ) + model = to_semantic_table(table, name="orders", description="Order transactions") + model = model.with_dimensions( + order_id=Dimension(expr=ibis._.order_id, description="Order ID", is_entity=True), + customer_id=Dimension(expr=ibis._.customer_id, 
description="Customer ID"), + created_at=Dimension( + expr=ibis._.created_at, + description="Order creation timestamp", + is_time_dimension=True, + smallest_time_grain="TIME_GRAIN_DAY", + ), + ) + model = model.with_measures( + order_count=Measure(expr=ibis._.count(), description="Total orders"), + total_amount=Measure(expr=ibis._.amount.sum(), description="Total order amount"), + avg_amount=Measure(expr=ibis._.amount.mean(), description="Average order amount"), + ) + return model + + +@pytest.fixture +def model_with_ai_context(): + """A BSL model with ai_context on dimensions and measures.""" + table = ibis.table( + {"product_id": "int64", "name": "string", "price": "float64"}, + name="products", + ) + model = to_semantic_table(table, name="products", description="Product catalog") + model = model.with_dimensions( + product_id=Dimension( + expr=ibis._.product_id, + description="Product identifier", + is_entity=True, + ai_context={"synonyms": ["SKU", "item ID"]}, + ), + name=Dimension( + expr=ibis._.name, + description="Product name", + ai_context="Product display name shown to customers", + ), + ) + model = model.with_measures( + avg_price=Measure( + expr=ibis._.price.mean(), + description="Average product price", + ai_context={"synonyms": ["mean price", "price average"]}, + ), + ) + return model + + +class TestToOsi: + def test_basic_structure(self, simple_model): + osi = to_osi(simple_model, name="test_model") + assert osi["version"] == OSI_VERSION + assert len(osi["semantic_model"]) == 1 + sm = osi["semantic_model"][0] + assert sm["name"] == "test_model" + assert "datasets" in sm + + def test_dataset_fields(self, simple_model): + osi = to_osi(simple_model) + ds = osi["semantic_model"][0]["datasets"][0] + assert ds["name"] == "orders" + assert ds["description"] == "Order transactions" + assert "fields" in ds + field_names = {f["name"] for f in ds["fields"]} + assert "order_id" in field_names + assert "customer_id" in field_names + assert "created_at" in 
field_names + + def test_primary_key_from_entity(self, simple_model): + osi = to_osi(simple_model) + ds = osi["semantic_model"][0]["datasets"][0] + assert "primary_key" in ds + assert "order_id" in ds["primary_key"] + + def test_time_dimension(self, simple_model): + osi = to_osi(simple_model) + ds = osi["semantic_model"][0]["datasets"][0] + created_at = next(f for f in ds["fields"] if f["name"] == "created_at") + assert created_at["dimension"]["is_time"] is True + + def test_non_time_dimension(self, simple_model): + osi = to_osi(simple_model) + ds = osi["semantic_model"][0]["datasets"][0] + customer = next(f for f in ds["fields"] if f["name"] == "customer_id") + assert customer["dimension"]["is_time"] is False + + def test_metrics(self, simple_model): + osi = to_osi(simple_model) + sm = osi["semantic_model"][0] + assert "metrics" in sm + metric_names = {m["name"] for m in sm["metrics"]} + assert "order_count" in metric_names + assert "total_amount" in metric_names + assert "avg_amount" in metric_names + + def test_metric_expressions(self, simple_model): + osi = to_osi(simple_model) + sm = osi["semantic_model"][0] + count_metric = next(m for m in sm["metrics"] if m["name"] == "order_count") + expr = count_metric["expression"] + assert "dialects" in expr + assert len(expr["dialects"]) >= 1 + assert expr["dialects"][0]["dialect"] == "ANSI_SQL" + + def test_field_expression_format(self, simple_model): + osi = to_osi(simple_model) + ds = osi["semantic_model"][0]["datasets"][0] + order_id_field = next(f for f in ds["fields"] if f["name"] == "order_id") + expr = order_id_field["expression"] + assert "dialects" in expr + assert expr["dialects"][0]["dialect"] == "ANSI_SQL" + assert expr["dialects"][0]["expression"] == "order_id" + + def test_custom_extensions_for_bsl_metadata(self, simple_model): + osi = to_osi(simple_model) + ds = osi["semantic_model"][0]["datasets"][0] + order_id_field = next(f for f in ds["fields"] if f["name"] == "order_id") + assert "custom_extensions" 
in order_id_field + ext = order_id_field["custom_extensions"][0] + assert ext["vendor_name"] == "COMMON" + data = json.loads(ext["data"]) + assert data["is_entity"] is True + + def test_time_grain_in_custom_extensions(self, simple_model): + osi = to_osi(simple_model) + ds = osi["semantic_model"][0]["datasets"][0] + created_at_field = next(f for f in ds["fields"] if f["name"] == "created_at") + ext = created_at_field["custom_extensions"][0] + data = json.loads(ext["data"]) + assert data["smallest_time_grain"] == "TIME_GRAIN_DAY" + + def test_ai_context_on_dimensions(self, model_with_ai_context): + osi = to_osi(model_with_ai_context) + ds = osi["semantic_model"][0]["datasets"][0] + product_field = next(f for f in ds["fields"] if f["name"] == "product_id") + assert "ai_context" in product_field + assert product_field["ai_context"]["synonyms"] == ["SKU", "item ID"] + + def test_ai_context_on_metrics(self, model_with_ai_context): + osi = to_osi(model_with_ai_context) + sm = osi["semantic_model"][0] + avg_price = next(m for m in sm["metrics"] if m["name"] == "avg_price") + assert "ai_context" in avg_price + assert avg_price["ai_context"]["synonyms"] == ["mean price", "price average"] + + def test_with_description_and_ai_context(self, simple_model): + osi = to_osi( + simple_model, + name="my_model", + description="A test model", + ai_context={"instructions": "Use for order analysis"}, + ) + sm = osi["semantic_model"][0] + assert sm["description"] == "A test model" + assert sm["ai_context"]["instructions"] == "Use for order analysis" + + def test_multiple_models(self): + orders = ibis.table({"id": "int64", "amount": "float64"}, name="orders") + customers = ibis.table({"id": "int64", "name": "string"}, name="customers") + + m1 = to_semantic_table(orders, name="orders") + m1 = m1.with_dimensions(id=Dimension(expr=ibis._.id)) + m1 = m1.with_measures(total=Measure(expr=ibis._.amount.sum())) + + m2 = to_semantic_table(customers, name="customers") + m2 = 
m2.with_dimensions(id=Dimension(expr=ibis._.id)) + + osi = to_osi({"orders": m1, "customers": m2}, name="ecommerce") + assert len(osi["semantic_model"][0]["datasets"]) == 2 + + +class TestToOsiYaml: + def test_yaml_output(self, simple_model): + yaml_str = to_osi_yaml(simple_model, name="test") + assert "version:" in yaml_str + assert "semantic_model:" in yaml_str + assert "datasets:" in yaml_str + + +# --------------------------------------------------------------------------- +# Import tests: OSI -> BSL +# --------------------------------------------------------------------------- + + +@pytest.fixture +def osi_config(): + """A minimal OSI config dict.""" + return { + "version": "0.1.1", + "semantic_model": [ + { + "name": "test_model", + "description": "Test semantic model", + "datasets": [ + { + "name": "orders", + "source": "orders_table", + "primary_key": ["order_id"], + "description": "Order transactions", + "fields": [ + { + "name": "order_id", + "expression": { + "dialects": [ + {"dialect": "ANSI_SQL", "expression": "order_id"} + ] + }, + "dimension": {"is_time": False}, + "description": "Order identifier", + "custom_extensions": [ + { + "vendor_name": "COMMON", + "data": json.dumps({"is_entity": True}), + } + ], + }, + { + "name": "created_at", + "expression": { + "dialects": [ + {"dialect": "ANSI_SQL", "expression": "created_at"} + ] + }, + "dimension": {"is_time": True}, + "description": "Creation timestamp", + }, + { + "name": "amount", + "expression": { + "dialects": [ + {"dialect": "ANSI_SQL", "expression": "amount"} + ] + }, + "dimension": {"is_time": False}, + }, + ], + }, + ], + "metrics": [ + { + "name": "total_amount", + "expression": { + "dialects": [ + {"dialect": "ANSI_SQL", "expression": "SUM(orders.amount)"} + ] + }, + "description": "Total order amount", + }, + { + "name": "order_count", + "expression": { + "dialects": [ + {"dialect": "ANSI_SQL", "expression": "COUNT(*)"} + ] + }, + "description": "Number of orders", + }, + ], + } + ], + } + 
+ +class TestFromOsi: + def test_basic_import(self, osi_config): + models = from_osi(osi_config) + assert "orders" in models + + def test_model_description(self, osi_config): + models = from_osi(osi_config) + op = models["orders"].op() + assert op.description == "Order transactions" + + def test_dimensions_imported(self, osi_config): + models = from_osi(osi_config) + dims = models["orders"].op().get_dimensions() + assert "order_id" in dims + assert "created_at" in dims + assert "amount" in dims + + def test_dimension_descriptions(self, osi_config): + models = from_osi(osi_config) + dims = models["orders"].op().get_dimensions() + assert dims["order_id"].description == "Order identifier" + assert dims["created_at"].description == "Creation timestamp" + + def test_time_dimension_flag(self, osi_config): + models = from_osi(osi_config) + dims = models["orders"].op().get_dimensions() + assert dims["created_at"].is_time_dimension is True + assert dims["order_id"].is_time_dimension is False + + def test_entity_from_custom_extensions(self, osi_config): + models = from_osi(osi_config) + dims = models["orders"].op().get_dimensions() + assert dims["order_id"].is_entity is True + + def test_measures_imported(self, osi_config): + models = from_osi(osi_config) + measures = models["orders"].op().get_measures() + assert "total_amount" in measures + assert "order_count" in measures + + def test_measure_descriptions(self, osi_config): + models = from_osi(osi_config) + measures = models["orders"].op().get_measures() + assert measures["total_amount"].description == "Total order amount" + assert measures["order_count"].description == "Number of orders" + + def test_with_real_table(self, osi_config): + """Test import with a real DuckDB table backing.""" + con = ibis.duckdb.connect() + con.raw_sql( + "CREATE TABLE orders_table (order_id INT, created_at TIMESTAMP, amount DOUBLE)" + ) + table = con.table("orders_table") + models = from_osi(osi_config, tables={"orders": table}) + assert 
"orders" in models + + def test_ai_context_preserved(self): + config = { + "version": "0.1.1", + "semantic_model": [ + { + "name": "test", + "datasets": [ + { + "name": "items", + "source": "items", + "fields": [ + { + "name": "item_id", + "expression": { + "dialects": [ + {"dialect": "ANSI_SQL", "expression": "item_id"} + ] + }, + "ai_context": {"synonyms": ["SKU", "product_id"]}, + } + ], + } + ], + } + ], + } + models = from_osi(config) + dims = models["items"].op().get_dimensions() + assert dims["item_id"].ai_context == {"synonyms": ["SKU", "product_id"]} + + +# --------------------------------------------------------------------------- +# Round-trip tests +# --------------------------------------------------------------------------- + + +class TestRoundTrip: + def test_bsl_to_osi_to_bsl(self, simple_model): + """BSL -> OSI -> BSL preserves key semantics.""" + # Export + osi = to_osi(simple_model, name="round_trip_test") + + # Import back + models = from_osi(osi) + assert "orders" in models + + # Check dimensions preserved + orig_dims = simple_model.op().get_dimensions() + new_dims = models["orders"].op().get_dimensions() + assert set(orig_dims.keys()) == set(new_dims.keys()) + + # Check descriptions preserved + for name in orig_dims: + assert orig_dims[name].description == new_dims[name].description + + # Check time dimension flag preserved + assert new_dims["created_at"].is_time_dimension is True + assert new_dims["order_id"].is_time_dimension is False + + # Check entity flag preserved (via custom_extensions round-trip) + assert new_dims["order_id"].is_entity is True + + def test_bsl_to_osi_to_bsl_measures(self, simple_model): + """BSL -> OSI -> BSL preserves measure semantics.""" + osi = to_osi(simple_model) + models = from_osi(osi) + + orig_measures = simple_model.op().get_measures() + new_measures = models["orders"].op().get_measures() + + # All measures should be present + assert set(orig_measures.keys()) == set(new_measures.keys()) + + # Descriptions 
preserved + for name in orig_measures: + assert orig_measures[name].description == new_measures[name].description + + def test_osi_to_bsl_to_osi(self, osi_config): + """OSI -> BSL -> OSI preserves key structure.""" + # Import + models = from_osi(osi_config) + + # Export back + osi_out = to_osi(models, name="round_trip") + + # Check structure + assert osi_out["version"] == OSI_VERSION + ds_out = osi_out["semantic_model"][0]["datasets"][0] + assert ds_out["name"] == "orders" + assert ds_out["description"] == "Order transactions" + + # Field count preserved + ds_in = osi_config["semantic_model"][0]["datasets"][0] + assert len(ds_out["fields"]) == len(ds_in["fields"]) + + def test_ai_context_round_trip(self, model_with_ai_context): + """ai_context survives BSL -> OSI -> BSL round-trip.""" + osi = to_osi(model_with_ai_context) + models = from_osi(osi) + + dims = models["products"].op().get_dimensions() + assert dims["product_id"].ai_context == {"synonyms": ["SKU", "item ID"]} + assert dims["name"].ai_context == "Product display name shown to customers" + + measures = models["products"].op().get_measures() + assert measures["avg_price"].ai_context == {"synonyms": ["mean price", "price average"]} diff --git a/src/boring_semantic_layer/yaml.py b/src/boring_semantic_layer/yaml.py index 71af2bfc..25f92d46 100644 --- a/src/boring_semantic_layer/yaml.py +++ b/src/boring_semantic_layer/yaml.py @@ -30,6 +30,8 @@ def _parse_expression_config(name: str, config: str | dict, metric_type: str): extra_kwargs["is_time_dimension"] = config.get("is_time_dimension", False) extra_kwargs["smallest_time_grain"] = config.get("smallest_time_grain") extra_kwargs["derived_dimensions"] = tuple(config.get("derived_dimensions") or ()) + if "ai_context" in config: + extra_kwargs["ai_context"] = config["ai_context"] return config["expr"], config.get("description"), extra_kwargs else: raise ValueError(f"Invalid {metric_type} format for '{name}'. 
Must be a string or dict") From 387a6d5b4a76e11947c49caff72280c241af1787 Mon Sep 17 00:00:00 2001 From: boringdata Date: Thu, 2 Apr 2026 07:29:53 +0000 Subject: [PATCH 2/4] refactor: make from_yaml/from_config natively parse OSI YAML MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Instead of a separate from_osi() conversion layer, from_config() now auto-detects OSI format (version + semantic_model keys) and parses it directly. This means from_yaml("model.osi.yaml") just works — BSL natively speaks OSI. - OSI parsing logic moved from osi.py into yaml.py - osi.py slimmed to export-only (to_osi/to_osi_yaml) + expression helpers - from_osi/from_osi_yaml kept as thin aliases to from_config/from_yaml - Removed from_osi/from_osi_yaml from top-level __init__.py exports - Tests updated to use from_config for OSI import (the native path) - Added format detection tests Co-Authored-By: Claude Opus 4.6 (1M context) --- src/boring_semantic_layer/__init__.py | 4 - src/boring_semantic_layer/osi.py | 514 ++++---------------- src/boring_semantic_layer/tests/test_osi.py | 299 ++++-------- src/boring_semantic_layer/yaml.py | 380 ++++++++++++--- 4 files changed, 492 insertions(+), 705 deletions(-) diff --git a/src/boring_semantic_layer/__init__.py b/src/boring_semantic_layer/__init__.py index 828cebc3..608d05bd 100644 --- a/src/boring_semantic_layer/__init__.py +++ b/src/boring_semantic_layer/__init__.py @@ -44,8 +44,6 @@ from_yaml, ) from .osi import ( - from_osi, - from_osi_yaml, to_osi, to_osi_yaml, ) @@ -65,8 +63,6 @@ "from_yaml", "to_osi", "to_osi_yaml", - "from_osi", - "from_osi_yaml", "MCPSemanticModel", "LangGraphBackend", "options", diff --git a/src/boring_semantic_layer/osi.py b/src/boring_semantic_layer/osi.py index d2802f77..c7b734ab 100644 --- a/src/boring_semantic_layer/osi.py +++ b/src/boring_semantic_layer/osi.py @@ -1,8 +1,10 @@ """ -OSI (Open Semantic Interchange) converter for Boring Semantic Layer. 
+OSI (Open Semantic Interchange) support for Boring Semantic Layer. -Provides bidirectional conversion between BSL's semantic model format -and the OSI v0.1.1 YAML specification. +Export: ``to_osi`` / ``to_osi_yaml`` convert BSL models to OSI v0.1.1 YAML. +Import: ``from_yaml`` / ``from_config`` natively detect and parse OSI YAML + (no separate import step needed). ``from_osi`` / ``from_osi_yaml`` + are kept as convenience aliases. See: https://github.com/open-semantic-interchange/OSI """ @@ -13,71 +15,46 @@ from collections.abc import Mapping from typing import Any -from ibis import _ from ibis.common.deferred import Deferred -from .api import to_semantic_table from .expr import SemanticModel, SemanticTable from .ops import Dimension, Measure, SemanticTableOp, _is_deferred -from .utils import expr_to_ibis_string, safe_eval +from .utils import expr_to_ibis_string OSI_VERSION = "0.1.1" -BSL_VENDOR = "BSL" # --------------------------------------------------------------------------- -# Expression helpers +# Expression helpers (BSL -> SQL for OSI export) # --------------------------------------------------------------------------- def _deferred_to_sql(expr: Deferred) -> str: - """Convert an Ibis Deferred expression to a simple SQL-like string. - - Handles common patterns: - _.column_name -> "column_name" - _.column.sum() -> "SUM(column)" - _.column.mean() -> "AVG(column)" - _.column.max() -> "MAX(column)" - _.column.min() -> "MIN(column)" - _.column.nunique() -> "COUNT(DISTINCT column)" - _.count() -> "COUNT(*)" - """ - s = str(expr) # e.g. 
"_.column_name" or "_.column.sum()" - return _ibis_string_to_sql(s) + """Convert an Ibis Deferred expression to a simple SQL-like string.""" + return _ibis_string_to_sql(str(expr)) def _ibis_string_to_sql(s: str) -> str: """Convert an Ibis deferred string repr to SQL expression.""" s = s.strip() - # _.count() -> COUNT(*) if s == "_.count()": return "COUNT(*)" - # _.col.agg() patterns - agg_map = { - "sum": "SUM", - "mean": "AVG", - "max": "MAX", - "min": "MIN", - } + agg_map = {"sum": "SUM", "mean": "AVG", "max": "MAX", "min": "MIN"} for ibis_fn, sql_fn in agg_map.items(): - pattern = rf"^_\.(.+)\.{ibis_fn}\(\)$" - m = re.match(pattern, s) + m = re.match(rf"^_\.(.+)\.{ibis_fn}\(\)$", s) if m: return f"{sql_fn}({m.group(1)})" - # _.col.nunique() -> COUNT(DISTINCT col) m = re.match(r"^_\.(.+)\.nunique\(\)$", s) if m: return f"COUNT(DISTINCT {m.group(1)})" - # Simple column reference: _.col -> col m = re.match(r"^_\.(\w+)$", s) if m: return m.group(1) - # Fallback: strip leading "_." and return as-is if s.startswith("_."): return s[2:] return s @@ -88,7 +65,6 @@ def _expr_to_sql_string(expr: Any) -> str | None: if _is_deferred(expr): return _deferred_to_sql(expr) - # Try the ibis string extraction utility from returns.result import Success result = expr_to_ibis_string(expr) @@ -100,52 +76,17 @@ def _expr_to_sql_string(expr: Any) -> str | None: return None -def _sql_to_deferred(sql: str) -> Deferred: - """Convert a simple SQL expression back to an Ibis Deferred. 
- - Handles: - "column_name" -> _.column_name - "SUM(column)" -> _.column.sum() - "AVG(column)" -> _.column.mean() - "COUNT(*)" -> _.count() - "COUNT(DISTINCT column)" -> _.column.nunique() - """ - sql = sql.strip() - - if sql == "COUNT(*)": - return safe_eval("_.count()", context={"_": _}).unwrap() - - # COUNT(DISTINCT col) - m = re.match(r"^COUNT\(DISTINCT\s+(\w+)\)$", sql, re.IGNORECASE) - if m: - return safe_eval(f"_.{m.group(1)}.nunique()", context={"_": _}).unwrap() - - # AGG(col) patterns - sql_to_ibis = {"SUM": "sum", "AVG": "mean", "MAX": "max", "MIN": "min"} - for sql_fn, ibis_fn in sql_to_ibis.items(): - m = re.match(rf"^{sql_fn}\((\w+)\)$", sql, re.IGNORECASE) - if m: - return safe_eval(f"_.{m.group(1)}.{ibis_fn}()", context={"_": _}).unwrap() - - # Simple column reference - if re.match(r"^\w+$", sql): - return safe_eval(f"_.{sql}", context={"_": _}).unwrap() - - # Fallback: try eval as-is with underscore prefix - try: - return safe_eval(f"_.{sql}", context={"_": _}).unwrap() - except Exception: - # Last resort: return as deferred column access - return safe_eval(f"_.{sql.split('.')[0] if '.' 
in sql else sql}", context={"_": _}).unwrap() - - # --------------------------------------------------------------------------- -# Export: BSL -> OSI +# Export helpers # --------------------------------------------------------------------------- +def _json_dumps(obj: Any) -> str: + import json + return json.dumps(obj) + + def _make_osi_expression(sql_expr: str, dialect: str = "ANSI_SQL") -> dict: - """Create an OSI expression object.""" return {"dialects": [{"dialect": dialect, "expression": sql_expr}]} @@ -157,18 +98,13 @@ def _dimension_to_osi_field(name: str, dim: Dimension) -> dict: "expression": _make_osi_expression(sql or name), } - if dim.is_time_dimension or dim.is_event_timestamp: - field["dimension"] = {"is_time": True} - else: - field["dimension"] = {"is_time": False} + field["dimension"] = {"is_time": bool(dim.is_time_dimension or dim.is_event_timestamp)} if dim.description: field["description"] = dim.description - if dim.ai_context: field["ai_context"] = dim.ai_context - # Store BSL-specific metadata in custom_extensions bsl_data: dict[str, Any] = {} if dim.is_entity: bsl_data["is_entity"] = True @@ -187,45 +123,13 @@ def _dimension_to_osi_field(name: str, dim: Dimension) -> dict: return field -def _measure_to_osi_metric( - name: str, measure: Measure, dataset_name: str | None = None -) -> dict: - """Convert a BSL Measure to an OSI metric dict.""" - sql = _expr_to_sql_string(measure.expr) - if sql and dataset_name: - # Prefix column references in aggregate functions with dataset name - # e.g. 
SUM(amount) -> SUM(dataset.amount) - sql = _prefix_columns_in_sql(sql, dataset_name) - - metric: dict[str, Any] = { - "name": name, - "expression": _make_osi_expression(sql or name), - } - - if measure.description: - metric["description"] = measure.description - - if measure.ai_context: - metric["ai_context"] = measure.ai_context - - return metric - - def _prefix_columns_in_sql(sql: str, dataset: str) -> str: - """Add dataset prefix to bare column references inside aggregate functions. - - SUM(amount) -> SUM(dataset.amount) - COUNT(*) stays as COUNT(*) - COUNT(DISTINCT col) -> COUNT(DISTINCT dataset.col) - """ + """``SUM(amount)`` -> ``SUM(dataset.amount)``""" if sql == "COUNT(*)": return sql - # Handle aggregate functions: FN(col) -> FN(dataset.col) def _prefix_match(m: re.Match) -> str: - fn = m.group(1) - inner = m.group(2).strip() - # Handle DISTINCT keyword + fn, inner = m.group(1), m.group(2).strip() if inner.upper().startswith("DISTINCT "): col = inner[9:].strip() if "." not in col: @@ -238,78 +142,65 @@ def _prefix_match(m: re.Match) -> str: return re.sub(r"(\w+)\(([^)]+)\)", _prefix_match, sql) -def _json_dumps(obj: Any) -> str: - """Serialize to JSON string for custom_extensions.""" - import json +def _measure_to_osi_metric(name: str, measure: Measure, dataset_name: str | None = None) -> dict: + sql = _expr_to_sql_string(measure.expr) + if sql and dataset_name: + sql = _prefix_columns_in_sql(sql, dataset_name) - return json.dumps(obj) + metric: dict[str, Any] = { + "name": name, + "expression": _make_osi_expression(sql or name), + } + if measure.description: + metric["description"] = measure.description + if measure.ai_context: + metric["ai_context"] = measure.ai_context + return metric def _extract_join_info(model: SemanticModel) -> list[dict]: - """Extract relationship info from a model's join chain. 
+ """Extract relationship info from a model's join chain.""" + from .ops import SemanticJoinOp - Returns list of OSI relationship dicts by inspecting the model's - SemanticJoinOp chain. - """ - relationships = [] + relationships: list[dict] = [] op = model.op() - # Walk up the op tree looking for SemanticJoinOp nodes - from .ops import SemanticJoinOp - - def _walk_joins(node, parent_name: str | None = None): + def _name(node) -> str: + if isinstance(node, SemanticTableOp): + return node.name or "unnamed" + if hasattr(node, "name") and node.name: + return node.name + if hasattr(node, "table"): + inner = node.table if not hasattr(node.table, "op") else node.table.op() + return _name(inner) + return "unnamed" + + def _walk(node): if isinstance(node, SemanticJoinOp): rel: dict[str, Any] = { - "name": f"{_get_model_name(node.left)}_{_get_model_name(node.right)}", - "from": _get_model_name(node.left), - "to": _get_model_name(node.right), + "name": f"{_name(node.left)}_{_name(node.right)}", + "from": _name(node.left), + "to": _name(node.right), + "from_columns": ["unknown"], + "to_columns": ["unknown"], } - # Try to extract join columns from the predicate - left_cols, right_cols = _extract_join_columns(node) - if left_cols and right_cols: - rel["from_columns"] = left_cols - rel["to_columns"] = right_cols - else: - rel["from_columns"] = ["unknown"] - rel["to_columns"] = ["unknown"] - - # Store cardinality in custom_extensions if hasattr(node, "cardinality"): rel["custom_extensions"] = [ - { - "vendor_name": "COMMON", - "data": _json_dumps({"cardinality": node.cardinality}), - } + {"vendor_name": "COMMON", "data": _json_dumps({"cardinality": node.cardinality})} ] - relationships.append(rel) - _walk_joins(node.left, _get_model_name(node.left)) - _walk_joins(node.right, _get_model_name(node.right)) - elif isinstance(node, SemanticTableOp): - pass # Base case + _walk(node.left) + _walk(node.right) elif hasattr(node, "table"): - _walk_joins(node.table if not hasattr(node.table, 
"op") else node.table.op()) + _walk(node.table if not hasattr(node.table, "op") else node.table.op()) - _walk_joins(op) + _walk(op) return relationships -def _get_model_name(node) -> str: - """Extract model name from an op node.""" - if isinstance(node, SemanticTableOp): - return node.name or "unnamed" - if hasattr(node, "name") and node.name: - return node.name - if hasattr(node, "table"): - inner = node.table if not hasattr(node.table, "op") else node.table.op() - return _get_model_name(inner) - return "unnamed" - - -def _extract_join_columns(join_op) -> tuple[list[str], list[str]]: - """Try to extract column names from a join predicate. Returns (left_cols, right_cols).""" - # This is best-effort; join predicates are lambdas and hard to introspect - return [], [] +# --------------------------------------------------------------------------- +# Public: Export BSL -> OSI +# --------------------------------------------------------------------------- def to_osi( @@ -327,15 +218,14 @@ def to_osi( ai_context: Optional AI context Returns: - Dict that can be serialized to OSI YAML via yaml.dump() + Dict that can be serialized to OSI YAML via ``yaml.dump()`` + + Example:: - Example: >>> from boring_semantic_layer import from_yaml >>> from boring_semantic_layer.osi import to_osi >>> models = from_yaml("flights.yml") >>> osi = to_osi(models, name="flights_analytics") - >>> import yaml - >>> print(yaml.dump(osi, sort_keys=False)) """ if isinstance(models, (SemanticModel, SemanticTable)): op = models.op() @@ -350,10 +240,8 @@ def to_osi( for model_name, model in models.items(): op = model.op() - # --- Dataset --- dataset: dict[str, Any] = {"name": model_name} - # Source: try to get table name try: source_table = op.to_untagged() if hasattr(source_table, "get_name"): @@ -365,7 +253,6 @@ def to_osi( except Exception: dataset["source"] = model_name - # Primary key from entity dimensions pk_cols = [] dims = op.get_dimensions() for dim_name, dim in dims.items(): @@ -375,27 +262,19 
@@ def to_osi( if pk_cols: dataset["primary_key"] = pk_cols - # Description if op.description: dataset["description"] = op.description - # Fields from dimensions - fields = [] - for dim_name, dim in dims.items(): - fields.append(_dimension_to_osi_field(dim_name, dim)) + fields = [_dimension_to_osi_field(n, d) for n, d in dims.items()] if fields: dataset["fields"] = fields datasets.append(dataset) - # --- Metrics from measures --- - measures = op.get_measures() - for meas_name, meas in measures.items(): + for meas_name, meas in op.get_measures().items(): all_metrics.append(_measure_to_osi_metric(meas_name, meas, model_name)) - # --- Metrics from calculated measures --- - calc_measures = op.get_calculated_measures() - for cm_name, cm_fn in calc_measures.items(): + for cm_name, cm_fn in op.get_calculated_measures().items(): metric: dict[str, Any] = { "name": cm_name, "expression": _make_osi_expression(cm_name), @@ -403,23 +282,16 @@ def to_osi( if isinstance(cm_fn, Measure) and cm_fn.description: metric["description"] = cm_fn.description metric.setdefault("custom_extensions", []).append( - { - "vendor_name": "COMMON", - "data": _json_dumps({"bsl_type": "calculated_measure"}), - } + {"vendor_name": "COMMON", "data": _json_dumps({"bsl_type": "calculated_measure"})} ) all_metrics.append(metric) - # --- Relationships from joins --- - rels = _extract_join_info(model) - for rel in rels: + for rel in _extract_join_info(model): if rel["name"] not in seen_relationship_names: all_relationships.append(rel) seen_relationship_names.add(rel["name"]) - # Build the OSI document semantic_model: dict[str, Any] = {"name": name, "datasets": datasets} - if description: semantic_model["description"] = description if ai_context: @@ -438,228 +310,32 @@ def to_osi_yaml( description: str | None = None, ai_context: str | dict | None = None, ) -> str: - """Convert BSL models to an OSI YAML string. - - Convenience wrapper around to_osi() that returns a formatted YAML string. 
- """ + """Convert BSL models to an OSI YAML string.""" import yaml - - osi_dict = to_osi(models, name=name, description=description, ai_context=ai_context) - return yaml.dump(osi_dict, sort_keys=False, default_flow_style=False) + return yaml.dump( + to_osi(models, name=name, description=description, ai_context=ai_context), + sort_keys=False, + default_flow_style=False, + ) # --------------------------------------------------------------------------- -# Import: OSI -> BSL +# Convenience aliases — import delegates to from_config (which auto-detects) # --------------------------------------------------------------------------- -def _parse_osi_expression(expr_obj: dict, prefer_dialect: str = "ANSI_SQL") -> str: - """Extract the SQL expression string from an OSI expression object. - - Prefers the specified dialect, falls back to the first available. - """ - dialects = expr_obj.get("dialects", []) - if not dialects: - raise ValueError("OSI expression has no dialects") - - # Try preferred dialect first - for d in dialects: - if d.get("dialect") == prefer_dialect: - return d["expression"] - - # Fallback to first - return dialects[0]["expression"] - - -def _osi_field_to_dimension(field: dict) -> tuple[str, Dimension]: - """Convert an OSI field dict to a (name, BSL Dimension) tuple.""" - name = field["name"] - sql_expr = _parse_osi_expression(field["expression"]) - deferred = _sql_to_deferred(sql_expr) - - kwargs: dict[str, Any] = { - "expr": deferred, - "description": field.get("description"), - } - - # Dimension metadata - dim_meta = field.get("dimension", {}) - if dim_meta.get("is_time"): - kwargs["is_time_dimension"] = True - - # AI context - if "ai_context" in field: - kwargs["ai_context"] = field["ai_context"] - - # BSL-specific from custom_extensions - import json - - for ext in field.get("custom_extensions", []): - if ext.get("vendor_name") == "COMMON": - try: - data = json.loads(ext["data"]) - if data.get("is_entity"): - kwargs["is_entity"] = True - if 
data.get("is_event_timestamp"): - kwargs["is_event_timestamp"] = True - if data.get("smallest_time_grain"): - kwargs["smallest_time_grain"] = data["smallest_time_grain"] - if data.get("derived_dimensions"): - kwargs["derived_dimensions"] = tuple(data["derived_dimensions"]) - except (json.JSONDecodeError, KeyError): - pass - - return name, Dimension(**kwargs) - - -def _osi_metric_to_measure(metric: dict) -> tuple[str, Measure]: - """Convert an OSI metric dict to a (name, BSL Measure) tuple.""" - name = metric["name"] - sql_expr = _parse_osi_expression(metric["expression"]) - - # Strip dataset prefix from column refs for BSL (which scopes per-model) - # e.g., "SUM(flights.distance)" -> "SUM(distance)" via deferred - sql_expr = _strip_dataset_prefix(sql_expr) - deferred = _sql_to_deferred(sql_expr) - - kwargs: dict[str, Any] = { - "expr": deferred, - "description": metric.get("description"), - } - - if "ai_context" in metric: - kwargs["ai_context"] = metric["ai_context"] - - return name, Measure(**kwargs) - - -def _strip_dataset_prefix(sql: str) -> str: - """Remove dataset.column prefixes from SQL, keeping just column name. - - SUM(flights.distance) -> SUM(distance) - COUNT(DISTINCT customers.id) -> COUNT(DISTINCT id) - """ - - def _strip_match(m: re.Match) -> str: - fn = m.group(1) - inner = m.group(2).strip() - # Handle DISTINCT - if inner.upper().startswith("DISTINCT "): - rest = inner[9:].strip() - if "." in rest: - col = rest.split(".")[-1] - return f"{fn}(DISTINCT {col})" - return m.group(0) - if "." in inner and inner != "*": - col = inner.split(".")[-1] - return f"{fn}({col})" - return m.group(0) - - return re.sub(r"(\w+)\(([^)]+)\)", _strip_match, sql) - - def from_osi( osi_config: dict[str, Any], tables: Mapping[str, Any] | None = None, ) -> dict[str, SemanticModel]: - """Convert an OSI YAML dict to BSL SemanticModel instances. 
- - Args: - osi_config: Parsed OSI YAML dict (as returned by yaml.safe_load) - tables: Optional mapping of dataset names to ibis table expressions. - If not provided, models are created without backing tables - (metadata-only, useful for inspection but not query execution). + """Parse an OSI config dict into BSL models. - Returns: - Dict mapping model names to SemanticModel instances - - Example: - >>> import yaml - >>> from boring_semantic_layer.osi import from_osi - >>> with open("model.osi.yaml") as f: - ... osi = yaml.safe_load(f) - >>> models = from_osi(osi, tables={"flights": con.table("flights")}) + This is a convenience alias for ``from_config(osi_config, tables=tables)``. + You can also call ``from_config`` or ``from_yaml`` directly — they + auto-detect OSI format. """ - tables = dict(tables) if tables else {} - - semantic_models = osi_config.get("semantic_model", []) - if not semantic_models: - raise ValueError("No semantic_model found in OSI config") - - result: dict[str, SemanticModel] = {} - - for sm in semantic_models: - datasets = sm.get("datasets", []) - metrics = sm.get("metrics", []) - relationships = sm.get("relationships", []) - - # Build a mapping of dataset_name -> metrics that reference it - # (by checking column prefixes in metric expressions) - dataset_names = {ds["name"] for ds in datasets} - - for ds in datasets: - ds_name = ds["name"] - - # Get or create table - if ds_name in tables: - table = tables[ds_name] - elif ds.get("source") and ds["source"] in tables: - table = tables[ds["source"]] - else: - # Create a dummy table from field schema for metadata-only use - table = _create_placeholder_table(ds) - if table is None: - continue - - # Create semantic model - model = to_semantic_table(table, name=ds_name, description=ds.get("description")) - - # Parse fields into dimensions - dimensions: dict[str, Dimension] = {} - for field in ds.get("fields", []): - dim_name, dim = _osi_field_to_dimension(field) - dimensions[dim_name] = dim - - if 
dimensions: - model = model.with_dimensions(**dimensions) - - # Find metrics that belong to this dataset - ds_measures: dict[str, Measure] = {} - for metric in metrics: - sql_expr = _parse_osi_expression(metric["expression"]) - # Check if metric references this dataset (or has no prefix) - if f"{ds_name}." in sql_expr or not any( - f"{other}." in sql_expr for other in dataset_names if other != ds_name - ): - meas_name, meas = _osi_metric_to_measure(metric) - ds_measures[meas_name] = meas - - if ds_measures: - model = model.with_measures(**ds_measures) - - result[ds_name] = model - - # Apply relationships as joins (if tables are provided) - if tables and relationships: - for rel in relationships: - from_ds = rel.get("from", "") - to_ds = rel.get("to", "") - if from_ds in result and to_ds in result: - from_cols = rel.get("from_columns", []) - to_cols = rel.get("to_columns", []) - if from_cols and to_cols and from_cols[0] != "unknown": - left_col = from_cols[0] - right_col = to_cols[0] - - def make_join_cond(lc, rc): - return lambda left, right: getattr(left, lc) == getattr(right, rc) - - result[from_ds] = result[from_ds].join_one( - result[to_ds], - on=make_join_cond(left_col, right_col), - ) - - return result + from .yaml import from_config + return from_config(osi_config, tables=tables) def from_osi_yaml( @@ -668,29 +344,7 @@ def from_osi_yaml( ) -> dict[str, SemanticModel]: """Load BSL models from an OSI YAML file. - Convenience wrapper around from_osi() that reads and parses the YAML file. - """ - from .utils import read_yaml_file - - osi_config = read_yaml_file(yaml_path) - return from_osi(osi_config, tables=tables) - - -def _create_placeholder_table(dataset: dict) -> Any: - """Create a placeholder ibis table from OSI field definitions. - - This allows metadata inspection without a real database connection. - Returns None if no fields are defined. + This is a convenience alias for ``from_yaml(yaml_path, tables=tables)``. 
""" - import ibis - - fields = dataset.get("fields", []) - if not fields: - return None - - # Create a simple schema from field names (all as string type as placeholder) - schema = {f["name"]: "string" for f in fields} - try: - return ibis.table(schema, name=dataset["name"]) - except Exception: - return None + from .yaml import from_yaml + return from_yaml(yaml_path, tables=tables) diff --git a/src/boring_semantic_layer/tests/test_osi.py b/src/boring_semantic_layer/tests/test_osi.py index f1d52e46..164a9101 100644 --- a/src/boring_semantic_layer/tests/test_osi.py +++ b/src/boring_semantic_layer/tests/test_osi.py @@ -1,4 +1,9 @@ -"""Tests for OSI (Open Semantic Interchange) converter.""" +"""Tests for OSI (Open Semantic Interchange) support. + +Import tests use ``from_config`` (the native entry point) to verify that OSI +format is auto-detected and parsed correctly — no separate ``from_osi`` call +needed. Export tests use ``to_osi`` / ``to_osi_yaml``. +""" import json @@ -8,18 +13,21 @@ from boring_semantic_layer import ( Dimension, Measure, + from_config, to_semantic_table, ) from boring_semantic_layer.osi import ( OSI_VERSION, _deferred_to_sql, _ibis_string_to_sql, - _sql_to_deferred, - _strip_dataset_prefix, - from_osi, to_osi, to_osi_yaml, ) +from boring_semantic_layer.yaml import ( + _is_osi_config, + _sql_to_deferred, + _strip_dataset_prefix, +) # --------------------------------------------------------------------------- @@ -52,32 +60,21 @@ def test_nunique(self): class TestDeferredToSql: def test_simple_column(self): - from ibis import _ - - assert _deferred_to_sql(_.column_name) == "column_name" + assert _deferred_to_sql(ibis._.column_name) == "column_name" def test_count(self): - from ibis import _ - - assert _deferred_to_sql(_.count()) == "COUNT(*)" + assert _deferred_to_sql(ibis._.count()) == "COUNT(*)" def test_sum(self): - from ibis import _ - - result = _deferred_to_sql(_.amount.sum()) - assert result == "SUM(amount)" + assert 
_deferred_to_sql(ibis._.amount.sum()) == "SUM(amount)" def test_mean(self): - from ibis import _ - - result = _deferred_to_sql(_.amount.mean()) - assert result == "AVG(amount)" + assert _deferred_to_sql(ibis._.amount.mean()) == "AVG(amount)" class TestSqlToDeferred: def test_simple_column(self): d = _sql_to_deferred("column_name") - # Verify it's a Deferred by checking str representation assert "column_name" in str(d) def test_count_star(self): @@ -101,15 +98,29 @@ def test_count_star(self): assert _strip_dataset_prefix("COUNT(*)") == "COUNT(*)" def test_count_distinct_with_prefix(self): - assert ( - _strip_dataset_prefix("COUNT(DISTINCT customers.id)") - == "COUNT(DISTINCT id)" - ) + assert _strip_dataset_prefix("COUNT(DISTINCT customers.id)") == "COUNT(DISTINCT id)" def test_no_prefix(self): assert _strip_dataset_prefix("SUM(distance)") == "SUM(distance)" +# --------------------------------------------------------------------------- +# Format detection +# --------------------------------------------------------------------------- + + +class TestFormatDetection: + def test_detects_osi(self): + assert _is_osi_config({"version": "0.1.1", "semantic_model": []}) + + def test_rejects_bsl(self): + assert not _is_osi_config({"flights": {"table": "flights_tbl"}}) + + def test_rejects_partial(self): + assert not _is_osi_config({"version": "0.1.1"}) + assert not _is_osi_config({"semantic_model": []}) + + # --------------------------------------------------------------------------- # Export tests: BSL -> OSI # --------------------------------------------------------------------------- @@ -186,16 +197,12 @@ def test_dataset_fields(self, simple_model): ds = osi["semantic_model"][0]["datasets"][0] assert ds["name"] == "orders" assert ds["description"] == "Order transactions" - assert "fields" in ds field_names = {f["name"] for f in ds["fields"]} - assert "order_id" in field_names - assert "customer_id" in field_names - assert "created_at" in field_names + assert {"order_id", 
"customer_id", "created_at"} <= field_names def test_primary_key_from_entity(self, simple_model): osi = to_osi(simple_model) ds = osi["semantic_model"][0]["datasets"][0] - assert "primary_key" in ds assert "order_id" in ds["primary_key"] def test_time_dimension(self, simple_model): @@ -212,99 +219,71 @@ def test_non_time_dimension(self, simple_model): def test_metrics(self, simple_model): osi = to_osi(simple_model) - sm = osi["semantic_model"][0] - assert "metrics" in sm - metric_names = {m["name"] for m in sm["metrics"]} - assert "order_count" in metric_names - assert "total_amount" in metric_names - assert "avg_amount" in metric_names + metric_names = {m["name"] for m in osi["semantic_model"][0]["metrics"]} + assert {"order_count", "total_amount", "avg_amount"} <= metric_names def test_metric_expressions(self, simple_model): osi = to_osi(simple_model) - sm = osi["semantic_model"][0] - count_metric = next(m for m in sm["metrics"] if m["name"] == "order_count") - expr = count_metric["expression"] - assert "dialects" in expr - assert len(expr["dialects"]) >= 1 - assert expr["dialects"][0]["dialect"] == "ANSI_SQL" + count_metric = next(m for m in osi["semantic_model"][0]["metrics"] if m["name"] == "order_count") + assert count_metric["expression"]["dialects"][0]["dialect"] == "ANSI_SQL" def test_field_expression_format(self, simple_model): osi = to_osi(simple_model) ds = osi["semantic_model"][0]["datasets"][0] - order_id_field = next(f for f in ds["fields"] if f["name"] == "order_id") - expr = order_id_field["expression"] - assert "dialects" in expr - assert expr["dialects"][0]["dialect"] == "ANSI_SQL" - assert expr["dialects"][0]["expression"] == "order_id" + f = next(f for f in ds["fields"] if f["name"] == "order_id") + assert f["expression"]["dialects"][0] == {"dialect": "ANSI_SQL", "expression": "order_id"} def test_custom_extensions_for_bsl_metadata(self, simple_model): osi = to_osi(simple_model) ds = osi["semantic_model"][0]["datasets"][0] - order_id_field = 
next(f for f in ds["fields"] if f["name"] == "order_id") - assert "custom_extensions" in order_id_field - ext = order_id_field["custom_extensions"][0] - assert ext["vendor_name"] == "COMMON" - data = json.loads(ext["data"]) + f = next(f for f in ds["fields"] if f["name"] == "order_id") + data = json.loads(f["custom_extensions"][0]["data"]) assert data["is_entity"] is True def test_time_grain_in_custom_extensions(self, simple_model): osi = to_osi(simple_model) ds = osi["semantic_model"][0]["datasets"][0] - created_at_field = next(f for f in ds["fields"] if f["name"] == "created_at") - ext = created_at_field["custom_extensions"][0] - data = json.loads(ext["data"]) + f = next(f for f in ds["fields"] if f["name"] == "created_at") + data = json.loads(f["custom_extensions"][0]["data"]) assert data["smallest_time_grain"] == "TIME_GRAIN_DAY" def test_ai_context_on_dimensions(self, model_with_ai_context): osi = to_osi(model_with_ai_context) ds = osi["semantic_model"][0]["datasets"][0] - product_field = next(f for f in ds["fields"] if f["name"] == "product_id") - assert "ai_context" in product_field - assert product_field["ai_context"]["synonyms"] == ["SKU", "item ID"] + f = next(f for f in ds["fields"] if f["name"] == "product_id") + assert f["ai_context"]["synonyms"] == ["SKU", "item ID"] def test_ai_context_on_metrics(self, model_with_ai_context): osi = to_osi(model_with_ai_context) - sm = osi["semantic_model"][0] - avg_price = next(m for m in sm["metrics"] if m["name"] == "avg_price") - assert "ai_context" in avg_price - assert avg_price["ai_context"]["synonyms"] == ["mean price", "price average"] + m = next(m for m in osi["semantic_model"][0]["metrics"] if m["name"] == "avg_price") + assert m["ai_context"]["synonyms"] == ["mean price", "price average"] def test_with_description_and_ai_context(self, simple_model): - osi = to_osi( - simple_model, - name="my_model", - description="A test model", - ai_context={"instructions": "Use for order analysis"}, - ) + osi = 
to_osi(simple_model, name="my_model", description="A test", ai_context={"instructions": "test"}) sm = osi["semantic_model"][0] - assert sm["description"] == "A test model" - assert sm["ai_context"]["instructions"] == "Use for order analysis" + assert sm["description"] == "A test" + assert sm["ai_context"]["instructions"] == "test" def test_multiple_models(self): - orders = ibis.table({"id": "int64", "amount": "float64"}, name="orders") - customers = ibis.table({"id": "int64", "name": "string"}, name="customers") - - m1 = to_semantic_table(orders, name="orders") + m1 = to_semantic_table(ibis.table({"id": "int64", "amount": "float64"}, name="o"), name="orders") m1 = m1.with_dimensions(id=Dimension(expr=ibis._.id)) m1 = m1.with_measures(total=Measure(expr=ibis._.amount.sum())) - - m2 = to_semantic_table(customers, name="customers") + m2 = to_semantic_table(ibis.table({"id": "int64", "name": "string"}, name="c"), name="customers") m2 = m2.with_dimensions(id=Dimension(expr=ibis._.id)) - osi = to_osi({"orders": m1, "customers": m2}, name="ecommerce") assert len(osi["semantic_model"][0]["datasets"]) == 2 class TestToOsiYaml: def test_yaml_output(self, simple_model): - yaml_str = to_osi_yaml(simple_model, name="test") - assert "version:" in yaml_str - assert "semantic_model:" in yaml_str - assert "datasets:" in yaml_str + s = to_osi_yaml(simple_model, name="test") + assert "version:" in s + assert "semantic_model:" in s + assert "datasets:" in s # --------------------------------------------------------------------------- -# Import tests: OSI -> BSL +# Import tests: OSI parsed natively via from_config # --------------------------------------------------------------------------- @@ -316,7 +295,6 @@ def osi_config(): "semantic_model": [ { "name": "test_model", - "description": "Test semantic model", "datasets": [ { "name": "orders", @@ -326,37 +304,20 @@ def osi_config(): "fields": [ { "name": "order_id", - "expression": { - "dialects": [ - {"dialect": "ANSI_SQL", 
"expression": "order_id"} - ] - }, + "expression": {"dialects": [{"dialect": "ANSI_SQL", "expression": "order_id"}]}, "dimension": {"is_time": False}, "description": "Order identifier", - "custom_extensions": [ - { - "vendor_name": "COMMON", - "data": json.dumps({"is_entity": True}), - } - ], + "custom_extensions": [{"vendor_name": "COMMON", "data": json.dumps({"is_entity": True})}], }, { "name": "created_at", - "expression": { - "dialects": [ - {"dialect": "ANSI_SQL", "expression": "created_at"} - ] - }, + "expression": {"dialects": [{"dialect": "ANSI_SQL", "expression": "created_at"}]}, "dimension": {"is_time": True}, "description": "Creation timestamp", }, { "name": "amount", - "expression": { - "dialects": [ - {"dialect": "ANSI_SQL", "expression": "amount"} - ] - }, + "expression": {"dialects": [{"dialect": "ANSI_SQL", "expression": "amount"}]}, "dimension": {"is_time": False}, }, ], @@ -365,20 +326,12 @@ def osi_config(): "metrics": [ { "name": "total_amount", - "expression": { - "dialects": [ - {"dialect": "ANSI_SQL", "expression": "SUM(orders.amount)"} - ] - }, + "expression": {"dialects": [{"dialect": "ANSI_SQL", "expression": "SUM(orders.amount)"}]}, "description": "Total order amount", }, { "name": "order_count", - "expression": { - "dialects": [ - {"dialect": "ANSI_SQL", "expression": "COUNT(*)"} - ] - }, + "expression": {"dialects": [{"dialect": "ANSI_SQL", "expression": "COUNT(*)"}]}, "description": "Number of orders", }, ], @@ -387,90 +340,66 @@ def osi_config(): } -class TestFromOsi: +class TestFromConfig_OSI: + """Test that from_config auto-detects and parses OSI format.""" + def test_basic_import(self, osi_config): - models = from_osi(osi_config) + models = from_config(osi_config) assert "orders" in models def test_model_description(self, osi_config): - models = from_osi(osi_config) - op = models["orders"].op() - assert op.description == "Order transactions" + models = from_config(osi_config) + assert models["orders"].op().description == "Order 
transactions" def test_dimensions_imported(self, osi_config): - models = from_osi(osi_config) - dims = models["orders"].op().get_dimensions() - assert "order_id" in dims - assert "created_at" in dims - assert "amount" in dims + dims = from_config(osi_config)["orders"].op().get_dimensions() + assert {"order_id", "created_at", "amount"} == set(dims.keys()) def test_dimension_descriptions(self, osi_config): - models = from_osi(osi_config) - dims = models["orders"].op().get_dimensions() + dims = from_config(osi_config)["orders"].op().get_dimensions() assert dims["order_id"].description == "Order identifier" assert dims["created_at"].description == "Creation timestamp" def test_time_dimension_flag(self, osi_config): - models = from_osi(osi_config) - dims = models["orders"].op().get_dimensions() + dims = from_config(osi_config)["orders"].op().get_dimensions() assert dims["created_at"].is_time_dimension is True assert dims["order_id"].is_time_dimension is False def test_entity_from_custom_extensions(self, osi_config): - models = from_osi(osi_config) - dims = models["orders"].op().get_dimensions() + dims = from_config(osi_config)["orders"].op().get_dimensions() assert dims["order_id"].is_entity is True def test_measures_imported(self, osi_config): - models = from_osi(osi_config) - measures = models["orders"].op().get_measures() - assert "total_amount" in measures - assert "order_count" in measures + measures = from_config(osi_config)["orders"].op().get_measures() + assert {"total_amount", "order_count"} == set(measures.keys()) def test_measure_descriptions(self, osi_config): - models = from_osi(osi_config) - measures = models["orders"].op().get_measures() + measures = from_config(osi_config)["orders"].op().get_measures() assert measures["total_amount"].description == "Total order amount" - assert measures["order_count"].description == "Number of orders" def test_with_real_table(self, osi_config): - """Test import with a real DuckDB table backing.""" con = 
ibis.duckdb.connect() - con.raw_sql( - "CREATE TABLE orders_table (order_id INT, created_at TIMESTAMP, amount DOUBLE)" - ) - table = con.table("orders_table") - models = from_osi(osi_config, tables={"orders": table}) + con.raw_sql("CREATE TABLE orders_table (order_id INT, created_at TIMESTAMP, amount DOUBLE)") + models = from_config(osi_config, tables={"orders": con.table("orders_table")}) assert "orders" in models def test_ai_context_preserved(self): config = { "version": "0.1.1", - "semantic_model": [ - { - "name": "test", - "datasets": [ - { - "name": "items", - "source": "items", - "fields": [ - { - "name": "item_id", - "expression": { - "dialects": [ - {"dialect": "ANSI_SQL", "expression": "item_id"} - ] - }, - "ai_context": {"synonyms": ["SKU", "product_id"]}, - } - ], - } - ], - } - ], + "semantic_model": [{ + "name": "test", + "datasets": [{ + "name": "items", + "source": "items", + "fields": [{ + "name": "item_id", + "expression": {"dialects": [{"dialect": "ANSI_SQL", "expression": "item_id"}]}, + "ai_context": {"synonyms": ["SKU", "product_id"]}, + }], + }], + }], } - models = from_osi(config) - dims = models["items"].op().get_dimensions() + dims = from_config(config)["items"].op().get_dimensions() assert dims["item_id"].ai_context == {"synonyms": ["SKU", "product_id"]} @@ -482,66 +411,44 @@ def test_ai_context_preserved(self): class TestRoundTrip: def test_bsl_to_osi_to_bsl(self, simple_model): """BSL -> OSI -> BSL preserves key semantics.""" - # Export osi = to_osi(simple_model, name="round_trip_test") - - # Import back - models = from_osi(osi) + models = from_config(osi) # auto-detects OSI assert "orders" in models - # Check dimensions preserved orig_dims = simple_model.op().get_dimensions() new_dims = models["orders"].op().get_dimensions() assert set(orig_dims.keys()) == set(new_dims.keys()) - # Check descriptions preserved for name in orig_dims: assert orig_dims[name].description == new_dims[name].description - # Check time dimension flag preserved 
assert new_dims["created_at"].is_time_dimension is True - assert new_dims["order_id"].is_time_dimension is False - - # Check entity flag preserved (via custom_extensions round-trip) assert new_dims["order_id"].is_entity is True def test_bsl_to_osi_to_bsl_measures(self, simple_model): - """BSL -> OSI -> BSL preserves measure semantics.""" osi = to_osi(simple_model) - models = from_osi(osi) - - orig_measures = simple_model.op().get_measures() - new_measures = models["orders"].op().get_measures() + models = from_config(osi) - # All measures should be present - assert set(orig_measures.keys()) == set(new_measures.keys()) - - # Descriptions preserved - for name in orig_measures: - assert orig_measures[name].description == new_measures[name].description + orig = simple_model.op().get_measures() + new = models["orders"].op().get_measures() + assert set(orig.keys()) == set(new.keys()) + for name in orig: + assert orig[name].description == new[name].description def test_osi_to_bsl_to_osi(self, osi_config): - """OSI -> BSL -> OSI preserves key structure.""" - # Import - models = from_osi(osi_config) - - # Export back + models = from_config(osi_config) osi_out = to_osi(models, name="round_trip") - # Check structure assert osi_out["version"] == OSI_VERSION ds_out = osi_out["semantic_model"][0]["datasets"][0] assert ds_out["name"] == "orders" assert ds_out["description"] == "Order transactions" - - # Field count preserved ds_in = osi_config["semantic_model"][0]["datasets"][0] assert len(ds_out["fields"]) == len(ds_in["fields"]) def test_ai_context_round_trip(self, model_with_ai_context): - """ai_context survives BSL -> OSI -> BSL round-trip.""" osi = to_osi(model_with_ai_context) - models = from_osi(osi) + models = from_config(osi) dims = models["products"].op().get_dimensions() assert dims["product_id"].ai_context == {"synonyms": ["SKU", "item ID"]} diff --git a/src/boring_semantic_layer/yaml.py b/src/boring_semantic_layer/yaml.py index 25f92d46..6488baf4 100644 --- 
a/src/boring_semantic_layer/yaml.py +++ b/src/boring_semantic_layer/yaml.py @@ -1,7 +1,13 @@ """ YAML loader for Boring Semantic Layer models using the semantic API. + +Supports both BSL native YAML format and OSI (Open Semantic Interchange) +v0.1.1 format. The format is auto-detected based on the presence of +``version`` and ``semantic_model`` keys. """ +import json +import re from collections.abc import Mapping from typing import Any @@ -14,6 +20,276 @@ from .utils import read_yaml_file, safe_eval +# --------------------------------------------------------------------------- +# Format detection +# --------------------------------------------------------------------------- + + +def _is_osi_config(config: Mapping[str, Any]) -> bool: + """Return True if *config* looks like an OSI YAML document.""" + return "semantic_model" in config and "version" in config + + +# --------------------------------------------------------------------------- +# OSI expression helpers (SQL <-> Ibis Deferred) +# --------------------------------------------------------------------------- + + +def _sql_to_deferred(sql: str): + """Convert a simple SQL expression to an Ibis Deferred. 
+ + Handles: + "column_name" -> _.column_name + "SUM(column)" -> _.column.sum() + "AVG(column)" -> _.column.mean() + "COUNT(*)" -> _.count() + "COUNT(DISTINCT column)" -> _.column.nunique() + """ + sql = sql.strip() + + if sql == "COUNT(*)": + return safe_eval("_.count()", context={"_": _}).unwrap() + + # COUNT(DISTINCT col) + m = re.match(r"^COUNT\(DISTINCT\s+(\w+)\)$", sql, re.IGNORECASE) + if m: + return safe_eval(f"_.{m.group(1)}.nunique()", context={"_": _}).unwrap() + + # AGG(col) patterns + sql_to_ibis = {"SUM": "sum", "AVG": "mean", "MAX": "max", "MIN": "min"} + for sql_fn, ibis_fn in sql_to_ibis.items(): + m = re.match(rf"^{sql_fn}\((\w+)\)$", sql, re.IGNORECASE) + if m: + return safe_eval(f"_.{m.group(1)}.{ibis_fn}()", context={"_": _}).unwrap() + + # Simple column reference + if re.match(r"^\w+$", sql): + return safe_eval(f"_.{sql}", context={"_": _}).unwrap() + + # Fallback: try eval as-is with underscore prefix + try: + return safe_eval(f"_.{sql}", context={"_": _}).unwrap() + except Exception: + return safe_eval( + f"_.{sql.split('.')[0] if '.' in sql else sql}", context={"_": _} + ).unwrap() + + +def _parse_osi_expression(expr_obj: dict, prefer_dialect: str = "ANSI_SQL") -> str: + """Extract the SQL expression string from an OSI expression object.""" + dialects = expr_obj.get("dialects", []) + if not dialects: + raise ValueError("OSI expression has no dialects") + for d in dialects: + if d.get("dialect") == prefer_dialect: + return d["expression"] + return dialects[0]["expression"] + + +def _strip_dataset_prefix(sql: str) -> str: + """Remove dataset.column prefixes from SQL aggregates. + + ``SUM(flights.distance)`` -> ``SUM(distance)`` + """ + + def _strip_match(m: re.Match) -> str: + fn = m.group(1) + inner = m.group(2).strip() + if inner.upper().startswith("DISTINCT "): + rest = inner[9:].strip() + if "." in rest: + return f"{fn}(DISTINCT {rest.split('.')[-1]})" + return m.group(0) + if "." 
in inner and inner != "*": + return f"{fn}({inner.split('.')[-1]})" + return m.group(0) + + return re.sub(r"(\w+)\(([^)]+)\)", _strip_match, sql) + + +# --------------------------------------------------------------------------- +# OSI field / metric -> BSL Dimension / Measure +# --------------------------------------------------------------------------- + + +def _osi_field_to_dimension(field: dict) -> tuple[str, Dimension]: + """Convert an OSI field dict to a ``(name, Dimension)`` pair.""" + name = field["name"] + sql_expr = _parse_osi_expression(field["expression"]) + deferred = _sql_to_deferred(sql_expr) + + kwargs: dict[str, Any] = { + "expr": deferred, + "description": field.get("description"), + } + + dim_meta = field.get("dimension", {}) + if dim_meta.get("is_time"): + kwargs["is_time_dimension"] = True + + if "ai_context" in field: + kwargs["ai_context"] = field["ai_context"] + + # Recover BSL-specific metadata stored in custom_extensions + for ext in field.get("custom_extensions", []): + if ext.get("vendor_name") == "COMMON": + try: + data = json.loads(ext["data"]) + if data.get("is_entity"): + kwargs["is_entity"] = True + if data.get("is_event_timestamp"): + kwargs["is_event_timestamp"] = True + if data.get("smallest_time_grain"): + kwargs["smallest_time_grain"] = data["smallest_time_grain"] + if data.get("derived_dimensions"): + kwargs["derived_dimensions"] = tuple(data["derived_dimensions"]) + except (json.JSONDecodeError, KeyError): + pass + + return name, Dimension(**kwargs) + + +def _osi_metric_to_measure(metric: dict) -> tuple[str, Measure]: + """Convert an OSI metric dict to a ``(name, Measure)`` pair.""" + name = metric["name"] + sql_expr = _parse_osi_expression(metric["expression"]) + sql_expr = _strip_dataset_prefix(sql_expr) + deferred = _sql_to_deferred(sql_expr) + + kwargs: dict[str, Any] = { + "expr": deferred, + "description": metric.get("description"), + } + if "ai_context" in metric: + kwargs["ai_context"] = metric["ai_context"] + + 
return name, Measure(**kwargs) + + +# --------------------------------------------------------------------------- +# OSI config -> BSL models (called from from_config when OSI is detected) +# --------------------------------------------------------------------------- + + +def _create_placeholder_table(dataset: dict): + """Create a placeholder ibis table from OSI field definitions.""" + import ibis + + fields = dataset.get("fields", []) + if not fields: + return None + schema = {f["name"]: "string" for f in fields} + try: + return ibis.table(schema, name=dataset["name"]) + except Exception: + return None + + +def _from_osi_config( + config: Mapping[str, Any], + tables: Mapping[str, Any] | None = None, + profile: str | None = None, + profile_path: str | None = None, +) -> dict[str, SemanticModel]: + """Parse an OSI config dict into BSL SemanticModel instances. + + This is an internal entry-point invoked by :func:`from_config` when it + detects OSI format. Users should call ``from_config`` / ``from_yaml`` + directly — those work for *both* BSL and OSI files. 
+ """ + tables = dict(tables) if tables else {} + + # Load tables from profile if not provided + if not tables: + profile_config = profile or config.get("profile") + if profile_config or profile_path: + connection = get_connection( + profile_config or profile_path, + profile_file=profile_path if profile_config else None, + ) + tables = {name: connection.table(name) for name in connection.list_tables()} + + semantic_models = config.get("semantic_model", []) + if not semantic_models: + raise ValueError("No semantic_model found in OSI config") + + result: dict[str, SemanticModel] = {} + + for sm in semantic_models: + datasets = sm.get("datasets", []) + metrics = sm.get("metrics", []) + relationships = sm.get("relationships", []) + dataset_names = {ds["name"] for ds in datasets} + + for ds in datasets: + ds_name = ds["name"] + + # Resolve backing table + if ds_name in tables: + table = tables[ds_name] + elif ds.get("source") and ds["source"] in tables: + table = tables[ds["source"]] + else: + table = _create_placeholder_table(ds) + if table is None: + continue + + model = to_semantic_table( + table, name=ds_name, description=ds.get("description") + ) + + # Fields -> Dimensions + dimensions: dict[str, Dimension] = {} + for field in ds.get("fields", []): + dim_name, dim = _osi_field_to_dimension(field) + dimensions[dim_name] = dim + if dimensions: + model = model.with_dimensions(**dimensions) + + # Metrics -> Measures (assign to the dataset they reference) + ds_measures: dict[str, Measure] = {} + for metric in metrics: + sql_expr = _parse_osi_expression(metric["expression"]) + if f"{ds_name}." in sql_expr or not any( + f"{other}." 
in sql_expr + for other in dataset_names + if other != ds_name + ): + meas_name, meas = _osi_metric_to_measure(metric) + ds_measures[meas_name] = meas + if ds_measures: + model = model.with_measures(**ds_measures) + + result[ds_name] = model + + # Relationships -> Joins + if tables and relationships: + for rel in relationships: + from_ds = rel.get("from", "") + to_ds = rel.get("to", "") + if from_ds in result and to_ds in result: + from_cols = rel.get("from_columns", []) + to_cols = rel.get("to_columns", []) + if from_cols and to_cols and from_cols[0] != "unknown": + + def _make_join_cond(lc, rc): + return lambda left, right: getattr(left, lc) == getattr( + right, rc + ) + + result[from_ds] = result[from_ds].join_one( + result[to_ds], + on=_make_join_cond(from_cols[0], to_cols[0]), + ) + + return result + + +# --------------------------------------------------------------------------- +# BSL native YAML helpers +# --------------------------------------------------------------------------- + + def _parse_expression_config(name: str, config: str | dict, metric_type: str): """Extract expression string, description, and extra kwargs from config.""" if isinstance(config, str): @@ -322,6 +598,11 @@ def _load_table_for_yaml_model( return tables, tables[table_name] +# --------------------------------------------------------------------------- +# Public API +# --------------------------------------------------------------------------- + + def from_config( config: Mapping[str, Any], tables: Mapping[str, Any] | None = None, @@ -331,48 +612,27 @@ def from_config( """ Load semantic tables from a configuration dictionary. - This is useful when you have already loaded your configuration through - custom logic (e.g., Kedro catalog, external config management) and want - to construct SemanticTable objects without going through YAML file loading. + Accepts **both** BSL native YAML format and OSI (Open Semantic Interchange) + v0.1.1 format. 
The format is auto-detected: if the dict contains + ``version`` and ``semantic_model`` keys it is treated as OSI, otherwise + as BSL native. Args: - config: Configuration dictionary with model definitions + config: Configuration dictionary (BSL or OSI format) tables: Optional mapping of table names to ibis table expressions profile: Optional profile name to load tables from profile_path: Optional path to profile file Returns: Dict mapping model names to SemanticModel instances - - Example config format: - { - "flights": { - "table": "flights_tbl", - "description": "Flight data model", - "database": ["analytics", "prod"], # optional: catalog.schema - "dimensions": { - "origin": {"expr": "_.origin", "description": "Origin airport"}, - "destination": "_.destination", - }, - "measures": { - "flight_count": "_.count()", - "avg_distance": "_.distance.mean()", - }, - } - } - - The optional 'database' field can be a string or list for multi-part identifiers - (e.g., ["catalog", "schema"] for catalog.schema.table). This is passed to - ibis connection.table() and is useful for loading tables from different - databases/schemas under the same connection. - - Example usage with pre-loaded tables: - >>> import ibis - >>> con = ibis.duckdb.connect() - >>> flights_tbl = con.table("flights") - >>> config = {"flights": {"table": "flights_tbl", "dimensions": {...}}} - >>> models = from_config(config, tables={"flights_tbl": flights_tbl}) """ + # ---- Auto-detect OSI format ---- + if _is_osi_config(config): + return _from_osi_config( + config, tables=tables, profile=profile, profile_path=profile_path + ) + + # ---- BSL native format ---- tables = _load_tables_from_references(dict(tables) if tables else {}) # Load tables from profile if not provided @@ -453,12 +713,13 @@ def from_yaml( profile_path: str | None = None, ) -> dict[str, SemanticModel]: """ - Load semantic tables from a YAML file with optional profile-based table loading. + Load semantic tables from a YAML file. 
- This is a convenience wrapper around from_config() that loads the YAML file first. + Accepts **both** BSL native YAML format and OSI (Open Semantic Interchange) + v0.1.1 format. The format is auto-detected. Args: - yaml_path: Path to the YAML configuration file + yaml_path: Path to the YAML configuration file (BSL or OSI format) tables: Optional mapping of table names to ibis table expressions profile: Optional profile name to load tables from profile_path: Optional path to profile file @@ -466,45 +727,14 @@ def from_yaml( Returns: Dict mapping model names to SemanticModel instances - Example YAML format: - flights: - table: flights_tbl - description: "Flight data model" - database: # optional: for loading from specific database/schema - - analytics - - prod - dimensions: - origin: - expr: _.origin - description: "Origin airport code" - is_entity: true - destination: _.destination - carrier: _.carrier - arr_time: - expr: _.arr_time - description: "Arrival time" - is_event_timestamp: true - is_time_dimension: true - smallest_time_grain: "TIME_GRAIN_DAY" - measures: - flight_count: _.count() - avg_distance: _.distance.mean() - total_distance: - expr: _.distance.sum() - description: "Total distance flown" - calculated_measures: - avg_per_flight: - expr: _.total_distance / _.flight_count - description: "Average distance per flight" - pct_of_total: - expr: _.total_distance / _.all(_.total_distance) * 100 - description: "Percentage of total distance" - joins: - carriers: - model: carriers - type: one - left_on: carrier - right_on: code + Examples: + Load a BSL native YAML file:: + + models = from_yaml("flights.yml") + + Load an OSI YAML file:: + + models = from_yaml("flights_osi.yaml", tables=tables) """ yaml_configs = read_yaml_file(yaml_path) return from_config(yaml_configs, tables=tables, profile=profile, profile_path=profile_path) From 52638f0b8ff0b1b0f988a17f18bac1c91fb885ec Mon Sep 17 00:00:00 2001 From: boringdata Date: Thu, 2 Apr 2026 10:25:08 +0000 Subject: 
[PATCH 3/4] =?UTF-8?q?feat:=20close=20OSI=20import=20gaps=20?= =?UTF-8?q?=E2=80=94=20primary=5Fkey,=20ai=5Fcontext,=20label,=20multi-col?= =?UTF-8?q?umn=20joins?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Four fixes to make OSI import near-lossless: 1. primary_key -> is_entity: fields matching dataset.primary_key are automatically marked is_entity=True on import (no custom_extensions needed for standard OSI files) 2. Dataset-level ai_context: added to SemanticTableOp, threaded through to_semantic_table/SemanticModel/with_dimensions/with_measures. Stored as JSON string internally for ibis hashability, deserialized via get_ai_context(). Round-trips through to_osi export. 3. label on Dimension: new optional field, parsed on OSI import, emitted on export. Supports the OSI field.label categorization concept. 4. Multi-column relationship joins: all from_columns/to_columns pairs are now used to build compound join predicates, not just the first. All 56 OSI tests pass. Export validates against official OSI JSON schema. Co-Authored-By: Claude Opus 4.6 (1M context) --- src/boring_semantic_layer/api.py | 8 +- src/boring_semantic_layer/expr.py | 9 +++ src/boring_semantic_layer/ops.py | 20 +++++ src/boring_semantic_layer/osi.py | 5 ++ src/boring_semantic_layer/tests/test_osi.py | 88 +++++++++++++++++++++ src/boring_semantic_layer/yaml.py | 59 +++++++++++--- 6 files changed, 176 insertions(+), 13 deletions(-) diff --git a/src/boring_semantic_layer/api.py b/src/boring_semantic_layer/api.py index b3564c20..6bfb2003 100644 --- a/src/boring_semantic_layer/api.py +++ b/src/boring_semantic_layer/api.py @@ -18,7 +18,10 @@ def to_semantic_table( - ibis_table: ir.Table, name: str | None = None, description: str | None = None + ibis_table: ir.Table, + name: str | None = None, + description: str | None = None, + ai_context: str | dict | None = None, ) -> SemanticModel: """Create a SemanticModel from an Ibis table. 
@@ -26,6 +29,8 @@ def to_semantic_table( ibis_table: An Ibis table expression (can be regular ibis or xorq vendored ibis) name: Optional name for the semantic table description: Optional description for the semantic table + ai_context: Optional AI context (string or structured object with + instructions, synonyms, examples) per the OSI specification Returns: A new SemanticModel wrapping the table @@ -41,6 +46,7 @@ def to_semantic_table( calc_measures=None, name=name, description=description, + ai_context=ai_context, ) diff --git a/src/boring_semantic_layer/expr.py b/src/boring_semantic_layer/expr.py index b08f9984..8ff5a4d0 100644 --- a/src/boring_semantic_layer/expr.py +++ b/src/boring_semantic_layer/expr.py @@ -433,6 +433,7 @@ def __init__( calc_measures: Mapping[str, Any] | None = None, name: str | None = None, description: str | None = None, + ai_context: str | dict | None = None, _source_join: Any | None = None, ) -> None: # Keep tables in regular ibis - only convert to xorq at execution time if needed @@ -452,6 +453,11 @@ def __init__( derived_name = name or _derive_name(table) + # Serialize dict ai_context to JSON string for ibis hashability + import json as _json + + _ai_ctx = _json.dumps(ai_context, sort_keys=True) if isinstance(ai_context, dict) else ai_context + op = SemanticTableOp( table=table, dimensions=dims, @@ -459,6 +465,7 @@ def __init__( calc_measures=calc_meas, name=derived_name, description=description, + ai_context=_ai_ctx, _source_join=_source_join, ) @@ -525,6 +532,7 @@ def with_dimensions(self, **dims) -> SemanticModel: calc_measures=self.get_calculated_measures(), name=self.name, description=self.description, + ai_context=self.op().ai_context, ) def with_measures(self, **meas) -> SemanticModel: @@ -548,6 +556,7 @@ def with_measures(self, **meas) -> SemanticModel: calc_measures=new_calc_meas, name=self.name, description=self.description, + ai_context=self.op().ai_context, ) def join_one( diff --git a/src/boring_semantic_layer/ops.py 
b/src/boring_semantic_layer/ops.py index 55469dbd..73e62c1e 100644 --- a/src/boring_semantic_layer/ops.py +++ b/src/boring_semantic_layer/ops.py @@ -960,6 +960,7 @@ class Dimension: smallest_time_grain: str | None = None derived_dimensions: tuple[str, ...] = () ai_context: str | dict | None = None + label: str | None = None def __call__(self, table: ir.Table, _dims: dict | None = None) -> ir.Value: try: @@ -999,6 +1000,8 @@ def to_json(self) -> Mapping[str, Any]: base["derived_dimensions"] = list(self.derived_dimensions) if self.ai_context: base["ai_context"] = self.ai_context + if self.label: + base["label"] = self.label return base def __hash__(self) -> int: @@ -1059,10 +1062,25 @@ class SemanticTableOp(Relation): calc_measures: FrozenDict[str, Any] name: str | None = None description: str | None = None + ai_context: str | None = None # JSON string when dict; plain string otherwise _source_join: Any = field( default=None, repr=False ) # Track if this wraps a join (SemanticJoinOp) for optimization + def get_ai_context(self) -> str | dict | None: + """Return ai_context, deserializing JSON-encoded dicts.""" + val = self.ai_context + if val is None: + return None + try: + import json + parsed = json.loads(val) + if isinstance(parsed, dict): + return parsed + except (json.JSONDecodeError, TypeError, ValueError): + pass + return val + def __init__( self, table: ir.Table, @@ -1071,6 +1089,7 @@ def __init__( calc_measures: dict[str, Any] | FrozenDict[str, Any], name: str | None = None, description: str | None = None, + ai_context: str | dict | None = None, _source_join: Any = None, ) -> None: # Accept both regular ibis and xorq tables without conversion @@ -1086,6 +1105,7 @@ def __init__( else calc_measures, name=name, description=description, + ai_context=ai_context, _source_join=_source_join, ) diff --git a/src/boring_semantic_layer/osi.py b/src/boring_semantic_layer/osi.py index c7b734ab..c5c82f00 100644 --- a/src/boring_semantic_layer/osi.py +++ 
b/src/boring_semantic_layer/osi.py @@ -102,6 +102,8 @@ def _dimension_to_osi_field(name: str, dim: Dimension) -> dict: if dim.description: field["description"] = dim.description + if dim.label: + field["label"] = dim.label if dim.ai_context: field["ai_context"] = dim.ai_context @@ -264,6 +266,9 @@ def to_osi( if op.description: dataset["description"] = op.description + ds_ai_ctx = op.get_ai_context() + if ds_ai_ctx: + dataset["ai_context"] = ds_ai_ctx fields = [_dimension_to_osi_field(n, d) for n, d in dims.items()] if fields: diff --git a/src/boring_semantic_layer/tests/test_osi.py b/src/boring_semantic_layer/tests/test_osi.py index 164a9101..51ee8fed 100644 --- a/src/boring_semantic_layer/tests/test_osi.py +++ b/src/boring_semantic_layer/tests/test_osi.py @@ -403,6 +403,94 @@ def test_ai_context_preserved(self): assert dims["item_id"].ai_context == {"synonyms": ["SKU", "product_id"]} + def test_primary_key_sets_is_entity(self): + """primary_key field names should become is_entity=True dimensions.""" + config = { + "version": "0.1.1", + "semantic_model": [{ + "name": "test", + "datasets": [{ + "name": "users", + "source": "users", + "primary_key": ["user_id"], + "fields": [ + { + "name": "user_id", + "expression": {"dialects": [{"dialect": "ANSI_SQL", "expression": "user_id"}]}, + }, + { + "name": "email", + "expression": {"dialects": [{"dialect": "ANSI_SQL", "expression": "email"}]}, + }, + ], + }], + }], + } + dims = from_config(config)["users"].op().get_dimensions() + assert dims["user_id"].is_entity is True + assert dims["email"].is_entity is False + + def test_dataset_ai_context(self): + """Dataset-level ai_context should be stored on the model.""" + config = { + "version": "0.1.1", + "semantic_model": [{ + "name": "test", + "datasets": [{ + "name": "products", + "source": "products", + "ai_context": {"synonyms": ["items", "SKUs"]}, + "fields": [{ + "name": "id", + "expression": {"dialects": [{"dialect": "ANSI_SQL", "expression": "id"}]}, + }], + }], + }], + 
} + op = from_config(config)["products"].op() + assert op.get_ai_context() == {"synonyms": ["items", "SKUs"]} + + def test_field_label(self): + """Field label should be stored on the Dimension.""" + config = { + "version": "0.1.1", + "semantic_model": [{ + "name": "test", + "datasets": [{ + "name": "events", + "source": "events", + "fields": [{ + "name": "status", + "expression": {"dialects": [{"dialect": "ANSI_SQL", "expression": "status"}]}, + "label": "filter", + }], + }], + }], + } + dims = from_config(config)["events"].op().get_dimensions() + assert dims["status"].label == "filter" + + def test_label_round_trip(self): + """label survives BSL -> OSI -> BSL.""" + import ibis + t = ibis.table({"status": "string"}, name="events") + m = to_semantic_table(t, name="events") + m = m.with_dimensions(status=Dimension(expr=ibis._.status, label="filter")) + osi = to_osi(m) + models = from_config(osi) + assert models["events"].op().get_dimensions()["status"].label == "filter" + + def test_dataset_ai_context_round_trip(self): + """Dataset ai_context survives BSL -> OSI -> BSL.""" + import ibis + t = ibis.table({"id": "int64"}, name="items") + m = to_semantic_table(t, name="items", ai_context={"synonyms": ["products"]}) + m = m.with_dimensions(id=Dimension(expr=ibis._.id)) + osi = to_osi(m) + op = from_config(osi)["items"].op() + assert op.get_ai_context() == {"synonyms": ["products"]} + + # --------------------------------------------------------------------------- # Round-trip tests # --------------------------------------------------------------------------- diff --git a/src/boring_semantic_layer/yaml.py b/src/boring_semantic_layer/yaml.py index 6488baf4..a11945c3 100644 --- a/src/boring_semantic_layer/yaml.py +++ b/src/boring_semantic_layer/yaml.py @@ -112,8 +112,17 @@ def _strip_match(m: re.Match) -> str: # --------------------------------------------------------------------------- -def _osi_field_to_dimension(field: dict) -> tuple[str, Dimension]: - """Convert an OSI 
field dict to a ``(name, Dimension)`` pair.""" +def _osi_field_to_dimension( + field: dict, primary_key_cols: set[str] | None = None +) -> tuple[str, Dimension]: + """Convert an OSI field dict to a ``(name, Dimension)`` pair. + + Args: + field: OSI field definition. + primary_key_cols: Column names from the dataset's ``primary_key``. + If the field's expression matches one of these, the dimension + is marked ``is_entity=True``. + """ name = field["name"] sql_expr = _parse_osi_expression(field["expression"]) deferred = _sql_to_deferred(sql_expr) @@ -130,6 +139,13 @@ def _osi_field_to_dimension(field: dict) -> tuple[str, Dimension]: if "ai_context" in field: kwargs["ai_context"] = field["ai_context"] + if field.get("label"): + kwargs["label"] = field["label"] + + # Mark as entity if the field appears in the dataset's primary_key + if primary_key_cols and (sql_expr in primary_key_cols or name in primary_key_cols): + kwargs["is_entity"] = True + # Recover BSL-specific metadata stored in custom_extensions for ext in field.get("custom_extensions", []): if ext.get("vendor_name") == "COMMON": @@ -235,13 +251,19 @@ def _from_osi_config( continue model = to_semantic_table( - table, name=ds_name, description=ds.get("description") + table, + name=ds_name, + description=ds.get("description"), + ai_context=ds.get("ai_context"), ) + # primary_key column names for is_entity detection + pk_cols = set(ds.get("primary_key") or []) + # Fields -> Dimensions dimensions: dict[str, Dimension] = {} for field in ds.get("fields", []): - dim_name, dim = _osi_field_to_dimension(field) + dim_name, dim = _osi_field_to_dimension(field, pk_cols) dimensions[dim_name] = dim if dimensions: model = model.with_dimensions(**dimensions) @@ -262,7 +284,7 @@ def _from_osi_config( result[ds_name] = model - # Relationships -> Joins + # Relationships -> Joins (supports multi-column keys) if tables and relationships: for rel in relationships: from_ds = rel.get("from", "") @@ -270,16 +292,29 @@ def 
_from_osi_config( if from_ds in result and to_ds in result: from_cols = rel.get("from_columns", []) to_cols = rel.get("to_columns", []) - if from_cols and to_cols and from_cols[0] != "unknown": - - def _make_join_cond(lc, rc): - return lambda left, right: getattr(left, lc) == getattr( - right, rc - ) + if ( + from_cols + and to_cols + and len(from_cols) == len(to_cols) + and from_cols[0] != "unknown" + ): + + def _make_join_cond(lcols, rcols): + def cond(left, right): + pairs = [ + getattr(left, lc) == getattr(right, rc) + for lc, rc in zip(lcols, rcols) + ] + result = pairs[0] + for p in pairs[1:]: + result = result & p + return result + + return cond result[from_ds] = result[from_ds].join_one( result[to_ds], - on=_make_join_cond(from_cols[0], to_cols[0]), + on=_make_join_cond(from_cols, to_cols), ) return result From a57744bea194e8858d65cd8f2396d7860d1774b2 Mon Sep 17 00:00:00 2001 From: boringdata Date: Thu, 2 Apr 2026 18:51:31 +0000 Subject: [PATCH 4/4] fix: address all 6 Codex review findings P1 fixes: - Calculated measures: extract formula from closure/original_expr instead of emitting the metric name as a self-reference - Join key export: introspect lambda predicates by evaluating against mock tables and walking the Equals expression tree to extract column names, instead of hardcoding ["unknown"] - Unqualified metrics (COUNT(*)): only assign to the first dataset instead of duplicating across all datasets in multi-dataset imports P2 fixes: - Relationship cardinality: read from custom_extensions and use join_many() when cardinality is "many" instead of always join_one() - Expression fallback: return None for non-trivial Ibis expressions instead of stripping "_." 
prefix which leaks method syntax as invalid SQL P3 fixes: - BSL YAML measure ai_context: pass extra_kwargs["ai_context"] through to Measure() constructor instead of silently dropping it Co-Authored-By: Claude Opus 4.6 (1M context) --- src/boring_semantic_layer/osi.py | 91 +++++++++++++++++++++++++++++-- src/boring_semantic_layer/yaml.py | 72 ++++++++++++++++++------ 2 files changed, 139 insertions(+), 24 deletions(-) diff --git a/src/boring_semantic_layer/osi.py b/src/boring_semantic_layer/osi.py index c5c82f00..b4c1b3c2 100644 --- a/src/boring_semantic_layer/osi.py +++ b/src/boring_semantic_layer/osi.py @@ -55,9 +55,10 @@ def _ibis_string_to_sql(s: str) -> str: if m: return m.group(1) - if s.startswith("_."): - return s[2:] - return s + # For non-trivial expressions (concat, case, etc.) that don't match + # known patterns, return None to signal the caller that we can't produce + # valid SQL. Stripping "_." would leak Ibis method syntax. + return None def _expr_to_sql_string(expr: Any) -> str | None: @@ -160,6 +161,43 @@ def _measure_to_osi_metric(name: str, measure: Measure, dataset_name: str | None return metric +def _walk_predicate_for_columns( + op_node, left_cols: set[str], right_cols: set[str] +) -> tuple[list[str], list[str]]: + """Extract (from_columns, to_columns) from an ibis Equals expression tree.""" + from_list: list[str] = [] + to_list: list[str] = [] + + def _extract_eq(node): + """Handle Equals(Field, Field) nodes.""" + cls_name = type(node).__name__ + if cls_name == "Equals": + args = node.args + names = [] + for arg in args: + if hasattr(arg, "name") and isinstance(arg.name, str): + names.append(arg.name) + if len(names) == 2: + a, b = names + if a in left_cols and b in right_cols: + from_list.append(a) + to_list.append(b) + elif b in left_cols and a in right_cols: + from_list.append(b) + to_list.append(a) + elif cls_name == "And": + for arg in node.args: + _extract_eq(arg) + # Recurse into args that are ops + if not from_list: + for arg in 
getattr(node, "args", []): + if hasattr(arg, "args"): + _extract_eq(arg) + + _extract_eq(op_node) + return from_list, to_list + + def _extract_join_info(model: SemanticModel) -> list[dict]: """Extract relationship info from a model's join chain.""" from .ops import SemanticJoinOp @@ -177,14 +215,37 @@ def _name(node) -> str: return _name(inner) return "unnamed" + def _extract_join_cols(node) -> tuple[list[str], list[str]]: + """Try to extract join column names from a SemanticJoinOp predicate.""" + if node.on is None: + return [], [] + try: + # Build mock tables from left/right schemas + import ibis + left_tbl = node.left.to_expr() if hasattr(node.left, "to_expr") else None + right_tbl = node.right.to_expr() if hasattr(node.right, "to_expr") else None + if left_tbl is None or right_tbl is None: + return [], [] + # Evaluate the predicate to get an ibis expression + pred = node.on(left_tbl, right_tbl) + # Walk the expression tree to find Equals(field, field) patterns + op_node = pred.op() + from_cols, to_cols = _walk_predicate_for_columns( + op_node, set(left_tbl.columns), set(right_tbl.columns) + ) + return from_cols, to_cols + except Exception: + return [], [] + def _walk(node): if isinstance(node, SemanticJoinOp): + from_cols, to_cols = _extract_join_cols(node) rel: dict[str, Any] = { "name": f"{_name(node.left)}_{_name(node.right)}", "from": _name(node.left), "to": _name(node.right), - "from_columns": ["unknown"], - "to_columns": ["unknown"], + "from_columns": from_cols or ["unknown"], + "to_columns": to_cols or ["unknown"], } if hasattr(node, "cardinality"): rel["custom_extensions"] = [ @@ -280,9 +341,27 @@ def to_osi( all_metrics.append(_measure_to_osi_metric(meas_name, meas, model_name)) for cm_name, cm_fn in op.get_calculated_measures().items(): + # Try to extract the formula from the calculated measure + cm_expr_str = None + if isinstance(cm_fn, Measure) and cm_fn.original_expr is not None: + cm_expr_str = _expr_to_sql_string(cm_fn.original_expr) + if 
cm_expr_str is None and isinstance(cm_fn, Measure): + cm_expr_str = _expr_to_sql_string(cm_fn.expr) + # Try inspecting closure for the source expression string + if cm_expr_str is None and callable(cm_fn): + import inspect + try: + closure = inspect.getclosurevars(cm_fn) + if "source" in closure.nonlocals: + src = closure.nonlocals["source"] + # Convert _.meas_a / _.meas_b style to readable form + cm_expr_str = src.replace("_.", "") + except Exception: + pass + metric: dict[str, Any] = { "name": cm_name, - "expression": _make_osi_expression(cm_name), + "expression": _make_osi_expression(cm_expr_str or cm_name), } if isinstance(cm_fn, Measure) and cm_fn.description: metric["description"] = cm_fn.description diff --git a/src/boring_semantic_layer/yaml.py b/src/boring_semantic_layer/yaml.py index a11945c3..f2690d86 100644 --- a/src/boring_semantic_layer/yaml.py +++ b/src/boring_semantic_layer/yaml.py @@ -272,11 +272,8 @@ def _from_osi_config( ds_measures: dict[str, Measure] = {} for metric in metrics: sql_expr = _parse_osi_expression(metric["expression"]) - if f"{ds_name}." in sql_expr or not any( - f"{other}." in sql_expr - for other in dataset_names - if other != ds_name - ): + # Explicitly references this dataset + if f"{ds_name}." in sql_expr: meas_name, meas = _osi_metric_to_measure(metric) ds_measures[meas_name] = meas if ds_measures: @@ -284,7 +281,27 @@ def _from_osi_config( result[ds_name] = model - # Relationships -> Joins (supports multi-column keys) + # Second pass: assign unqualified metrics (no dataset. prefix) to the + # first dataset only, to avoid duplicating them across all datasets. + if datasets and metrics: + first_ds_name = datasets[0]["name"] + if first_ds_name in result: + unqualified: dict[str, Measure] = {} + for metric in metrics: + sql_expr = _parse_osi_expression(metric["expression"]) + # Skip if it explicitly references any dataset + if any(f"{dn}." 
in sql_expr for dn in dataset_names): + continue + meas_name, meas = _osi_metric_to_measure(metric) + # Only add if not already present on this model + if meas_name not in result[first_ds_name].op().get_measures(): + unqualified[meas_name] = meas + if unqualified: + result[first_ds_name] = result[first_ds_name].with_measures( + **unqualified + ) + + # Relationships -> Joins (supports multi-column keys + cardinality) if tables and relationships: for rel in relationships: from_ds = rel.get("from", "") @@ -305,17 +322,33 @@ def cond(left, right): getattr(left, lc) == getattr(right, rc) for lc, rc in zip(lcols, rcols) ] - result = pairs[0] + res = pairs[0] for p in pairs[1:]: - result = result & p - return result + res = res & p + return res return cond - result[from_ds] = result[from_ds].join_one( - result[to_ds], - on=_make_join_cond(from_cols, to_cols), - ) + # Detect cardinality from custom_extensions + cardinality = "one" + for ext in rel.get("custom_extensions", []): + if ext.get("vendor_name") == "COMMON": + try: + data = json.loads(ext["data"]) + if data.get("cardinality") in ("one", "many"): + cardinality = data["cardinality"] + except (json.JSONDecodeError, KeyError): + pass + + on_cond = _make_join_cond(from_cols, to_cols) + if cardinality == "many": + result[from_ds] = result[from_ds].join_many( + result[to_ds], on=on_cond + ) + else: + result[from_ds] = result[from_ds].join_one( + result[to_ds], on=on_cond + ) return result @@ -368,11 +401,14 @@ def _parse_dimension_or_measure( expr_str, description, extra_kwargs = _parse_expression_config(name, config, metric_type) deferred = safe_eval(expr_str, context={"_": _}).unwrap() base_kwargs = {"expr": deferred, "description": description} - return ( - Dimension(**base_kwargs, **extra_kwargs) - if metric_type == "dimension" - else Measure(**base_kwargs) - ) + if metric_type == "dimension": + return Dimension(**base_kwargs, **extra_kwargs) + else: + # Pass through ai_context for measures too + meas_kwargs = 
base_kwargs + if "ai_context" in extra_kwargs: + meas_kwargs["ai_context"] = extra_kwargs["ai_context"] + return Measure(**meas_kwargs) def _parse_calc_measure(name: str, config: str | dict) -> Measure: