Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 45 additions & 1 deletion src/boring_semantic_layer/measure_scope.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from __future__ import annotations

import difflib
from collections.abc import Iterable
from typing import Any

Expand All @@ -8,6 +9,17 @@
from toolz import curry


class UnknownMeasureRefError(AttributeError):
"""Raised when a calc-measure lambda references an unknown name.

Subclasses :class:`AttributeError` so existing code that ``except``\\ s
on attribute errors continues to work, but ``_classify_measure``
re-raises this specific subclass instead of swallowing it. Surfaces
typos at construction time with a "did you mean?" suggestion built
from the surrounding measure / column names.
"""


def _has_prefixed_columns(tbl, name: str) -> bool:
"""Check if table has columns with the given prefix (e.g., 'flights.' prefix)."""
if not hasattr(tbl, "columns"):
Expand Down Expand Up @@ -361,7 +373,39 @@ def __getattr__(self, name: str):
if _has_prefixed_columns(self.tbl, name):
return _ColumnPrefixProxy(self.tbl, name)

return _resolve_column_short_name(self.tbl, name)
# Fall through to ibis (covers Table methods like ``count``, ``filter``).
# If ibis rejects too, surface a typo suggestion rather than the opaque
# ibis AttributeError so the user can see a "did you mean?" hint.
try:
return _resolve_column_short_name(self.tbl, name)
except AttributeError:
suggestion = self._typo_suggestion(name)
if suggestion:
raise UnknownMeasureRefError(
f"{name!r} is not a known measure or column. {suggestion}"
) from None
raise

def _typo_suggestion(self, name: str) -> str | None:
# 0.80 catches single-character typos and case mistakes
# (``flight_konut`` vs ``flight_count`` ≈ 0.83) without flagging
# legitimate substring overlaps (``net_revenue`` vs
# ``total_net_revenue`` ≈ 0.79). Calibrated against real-world
# confusable measure names.
cutoff = 0.80
candidates: list[tuple[str, str]] = []
if self.known:
for match in difflib.get_close_matches(name, self.known, n=3, cutoff=cutoff):
candidates.append(("measure", match))
if hasattr(self.tbl, "columns"):
for match in difflib.get_close_matches(
name, list(self.tbl.columns), n=3, cutoff=cutoff
):
candidates.append(("column", match))
if not candidates:
return None
formatted = ", ".join(f"{kind} {match!r}" for kind, match in candidates)
return f"Did you mean: {formatted}?"

def __getitem__(self, name: str):
if self.post_agg:
Expand Down
66 changes: 55 additions & 11 deletions src/boring_semantic_layer/ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,21 @@ class _RenamedResolver:

Used during join predicate resolution to avoid ibis "Ambiguous field
reference" errors when left and right tables share column names.

This works around two upstream ibis behaviors that have no public
issue tracker entry yet but are pinned by
``test_upstream_ibis_pins.py``:

1. ``DerefMap`` raises ``IbisInputError: Ambiguous field reference``
when a predicate column-name appears in more than one relation
reachable from the join's LHS.
2. The default ``rname='{name}_right'`` collides on the third
table when 3+ joined relations share a column name, raising
``IntegrityError: Name collisions``.

When the pinning tests start failing — i.e. ibis no longer raises
these errors — the rename dance in ``SemanticJoinOp.to_untagged``
can be removed.
"""

__slots__ = ("_table", "_name_map")
Expand Down Expand Up @@ -663,13 +678,31 @@ def _extract_measure_metadata(
return (fn_or_expr, None, (), {})


_AGG_METHODS = frozenset({"sum", "mean", "avg", "count", "min", "max"})
_AGG_METHODS = frozenset({
# Standard reductions
"sum", "mean", "avg", "count", "min", "max",
# Statistical reductions
"var", "std", "median", "quantile",
# Approximate reductions
"approx_count_distinct", "approx_nunique", "approx_median",
# Distinct count
"nunique",
# Categorical reductions
"mode", "first", "last", "arbitrary",
# Boolean reductions
"any", "all",
# Collection reductions
"group_concat", "collect",
})


def _is_calculated_measure(val: Any) -> bool:
# A MethodCall with an aggregation method on a MeasureRef is a base measure:
# the column name matched a known measure name in MeasureScope, but the user
# is really defining a column aggregation (e.g. lambda t: t.flight_count.sum()).
# is really defining a column aggregation (e.g. lambda t: t.flight_count.sum()
# or t.distance.var()). ``_AGG_METHODS`` covers the ibis ``Reduction`` ops a
# user might reasonably reach for; methods outside the set fall through to
# the calc path.
if (
isinstance(val, MethodCall)
and val.method in _AGG_METHODS
Expand Down Expand Up @@ -843,18 +876,26 @@ def _classify_measure(
fn_or_expr: Any, scope: Any, measure_name: str | None = None
) -> tuple[str, Any]:
"""Classify measure as 'calc' or 'base' with appropriate handling."""
from .measure_scope import validate_calc_ast
from .measure_scope import UnknownMeasureRefError, validate_calc_ast

expr, description, requires_unnest, metadata = _extract_measure_metadata(fn_or_expr)

resolved = safe(lambda: _resolve_expr(expr, scope))().map(
lambda val: ("calc", val) if _is_calculated_measure(val) else None
)
# ``_resolve_expr`` may raise for legitimate base-measure shapes
# (e.g. lambdas that touch ibis methods MeasureScope can't reflect),
# so most exceptions are caught and the lambda falls through to
# base classification. ``UnknownMeasureRefError`` is the typo case
# though — surface it loudly instead of letting the lambda fail with
# an opaque error at execute time.
try:
resolved_value = _resolve_expr(expr, scope)
except UnknownMeasureRefError:
raise
except Exception:
resolved_value = None

if isinstance(resolved, Success) and resolved.unwrap() is not None:
kind, value = resolved.unwrap()
validate_calc_ast(value, measure_name)
return (kind, value)
if resolved_value is not None and _is_calculated_measure(resolved_value):
validate_calc_ast(resolved_value, measure_name)
return ("calc", resolved_value)

if not requires_unnest and callable(expr):
# All scopes (MeasureScope, ColumnScope) have tbl attribute
Expand Down Expand Up @@ -4014,7 +4055,10 @@ def to_untagged(self, parent_requirements: dict[str, set[str]] | None = None):
return left_tbl.join(right_tbl, how=self.how, rname=rname)

# Detect column name conflicts that cause ibis/xorq to raise
# "Ambiguous field reference" during predicate resolution.
# ``Ambiguous field reference`` during predicate resolution. The
# rename dance + ``_RenamedResolver`` below is a workaround for
# upstream ibis behaviors pinned by ``test_upstream_ibis_pins``;
# remove this branch when those tests fail.
conflicting = frozenset(left_tbl.columns) & frozenset(right_tbl.columns)

if not conflicting:
Expand Down
109 changes: 109 additions & 0 deletions src/boring_semantic_layer/tests/test_measure_reference_styles.py
Original file line number Diff line number Diff line change
Expand Up @@ -540,3 +540,112 @@ def test_calc_dtype_inference_with_inline_aggregation():
# Before infer_calc_dtype handled the rewrite path, this was silently
# dropped from values and the dtype was unknown.
assert "pct_of_total" in st.schema.names


# ---------------------------------------------------------------------------
# Broader base-measure recognition (#1)
# ---------------------------------------------------------------------------


@pytest.mark.parametrize(
"method, expected_kind",
[
("var", "base"),
("std", "base"),
("median", "base"),
("nunique", "base"),
("approx_nunique", "base"),
("first", "base"),
("last", "base"),
],
)
def test_extra_reductions_classified_as_base(method, expected_kind):
"""Measures named the same as a column then aggregated with a non-sum/mean
reduction should still be classified as base, not silently routed through
the calc compiler.
"""
con = ibis.duckdb.connect(":memory:")
data = pd.DataFrame({"carrier": ["AA", "AA", "UA"], "distance": [100, 200, 300]})
tbl = con.create_table(f"flights_{method}", data)

# Define a measure named ``distance`` (matching the column name) so that
# ``t.distance.<method>()`` resolves through the MeasureRef path that
# ``_AGG_METHODS`` guards.
st = to_semantic_table(tbl, f"flights_{method}").with_measures(
distance=lambda t: t.distance.sum(),
)
st = st.with_measures(
derived=lambda t, _m=method: getattr(t.distance, _m)(),
)

op = st.op()
base_names = set(op.get_measures().keys())
calc_names = set(op.get_calculated_measures().keys())
assert "derived" in base_names, (
f"{method!r} measure should classify as base, got calc"
)
assert "derived" not in calc_names


# ---------------------------------------------------------------------------
# Typo'd measure references (#2)
# ---------------------------------------------------------------------------


def test_typo_measure_ref_raises_with_suggestion():
"""A calc-measure lambda referencing a misspelled measure name should
fail loudly at construction with a "did you mean?" suggestion."""
from boring_semantic_layer.measure_scope import UnknownMeasureRefError

con = ibis.duckdb.connect(":memory:")
data = pd.DataFrame({"carrier": ["AA"], "distance": [100]})
tbl = con.create_table("flights_typo", data)

st = to_semantic_table(tbl, "flights_typo").with_measures(
flight_count=lambda t: t.count(),
total_distance=lambda t: t.distance.sum(),
)

with pytest.raises(UnknownMeasureRefError, match="flight_count"):
st.with_measures(
# ``flight_konut`` typo of ``flight_count``
ratio=lambda t: t.flight_konut / t.total_distance,
)


def test_typo_in_t_all_raises():
"""Same loud-fail behavior when the typo is the argument to ``t.all``."""
from boring_semantic_layer.measure_scope import UnknownMeasureRefError

con = ibis.duckdb.connect(":memory:")
data = pd.DataFrame({"carrier": ["AA"], "distance": [100]})
tbl = con.create_table("flights_typo_all", data)

st = to_semantic_table(tbl, "flights_typo_all").with_measures(
flight_count=lambda t: t.count(),
)

with pytest.raises(UnknownMeasureRefError, match="flight_count"):
st.with_measures(
ratio=lambda t: t.flight_count / t.all(t.flight_konut),
)


def test_substring_measure_name_does_not_trigger_typo():
"""Names that are substrings of other measures should not trip the typo
detector. ``net_revenue`` referenced from a measure that also defines
``total_net_revenue`` is legitimate, not a typo (similarity ≈ 0.79).
"""
from boring_semantic_layer.measure_scope import MeasureScope

# Probe MeasureScope directly: with measures named [net_revenue,
# total_net_revenue], looking up an unrelated name 'net_revenue'
# via getattr should NOT fire the typo path.
import ibis as i

tbl = i.table({"col": "int64"}, name="t")
scope = MeasureScope(_tbl=tbl, _known=("net_revenue", "total_net_revenue"))
# Asking for 'net_revenue' is fine — it's a known measure.
assert scope.net_revenue is not None
# Asking for 'col' is fine — it's a column.
assert scope.col is not None
116 changes: 116 additions & 0 deletions src/boring_semantic_layer/tests/test_upstream_ibis_pins.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
"""Pin upstream ibis behaviors that BSL works around.

Each test reproduces a specific ibis (or xorq-vendored ibis) behavior that
forced a workaround somewhere in BSL. The tests pass while ibis still
exhibits the behavior and fail when ibis is fixed — at which point the
corresponding BSL workaround can be deleted.

When a test in this file starts failing, do not "fix" it by adding the
workaround in: instead, locate the BSL workaround it pins (each test
docstring names the call site) and delete that workaround.

Each pin is parametrized over plain ``ibis`` and ``xorq.vendor.ibis``
because BSL's internal tables are xorq-vendored after construction-time
conversion, so the workaround actually fires against the xorq flavor —
but plain ibis is the upstream, and divergence between them is itself a
signal worth pinning.
"""

from __future__ import annotations

import pandas as pd
import pytest

import ibis as plain_ibis
from xorq.common.utils.ibis_utils import from_ibis


def _make_table(flavor: str, name: str, df: pd.DataFrame):
"""Create a table in the requested ibis flavor.

For ``"ibis"`` we create directly via the duckdb backend.
For ``"xorq"`` we create via plain ibis, then wrap with
``from_ibis`` so the resulting table is in ``xorq.vendor.ibis``.
"""
con = plain_ibis.duckdb.connect()
tbl = con.create_table(name, df)
if flavor == "xorq":
return from_ibis(tbl)
return tbl


@pytest.mark.parametrize("flavor", ["ibis", "xorq"])
def test_pin_three_way_join_default_rname_collides(flavor):
"""3+ joined relations sharing a column name collide on ``_right``.

Pinned BSL workaround: depth-based ``rname`` in
``SemanticJoinOp._rname_for_depth`` (``_right``, ``_right2``, …)
plus the ``conflicting`` rename block in ``SemanticJoinOp.to_untagged``.

When ibis auto-disambiguates collisions across nested joins, this
test will succeed without raising — at which point the depth
hack and rename block can be removed.
"""
flights = _make_table(
flavor,
f"fl_pin_{flavor}",
pd.DataFrame(
{"carrier": ["AA"], "origin": ["JFK"], "destination": ["LAX"]}
),
)
carriers = _make_table(
flavor,
f"ca_pin_{flavor}",
pd.DataFrame({"code": ["AA"], "name": ["American"]}),
)
airports1 = _make_table(
flavor,
f"ap1_pin_{flavor}",
pd.DataFrame({"code": ["JFK"], "city": ["NY"]}),
)
airports2 = _make_table(
flavor,
f"ap2_pin_{flavor}",
pd.DataFrame({"code": ["LAX"], "city": ["LA"]}),
)

fc = flights.join(carriers, flights.carrier == carriers.code)
fca = fc.join(airports1, fc.origin == airports1.code)
# Both ``fca`` and ``airports2`` carry a column called ``code``; ibis
# tries to suffix the right one as ``code_right``, which already exists
# from the previous join → IntegrityError. Plain ibis raises on schema
# materialization (``.columns``); xorq's vendored ibis raises at
# execute time. Force materialization to cover both.
with pytest.raises(Exception, match=r"(?i)collision|integrity"):
fca.join(airports2, fca.destination == airports2.code).execute()


@pytest.mark.parametrize("flavor", ["ibis", "xorq"])
def test_pin_self_reference_join_is_ambiguous_without_view(flavor):
"""Joining the same table twice without ``.view()`` is ambiguous.

Pinned BSL workaround: BSL relies on the user calling ``.view()`` (or
``to_semantic_table`` on a viewed table) when joining the same source
twice. The reconstruction path in ``serialization.reconstruct``
explicitly preserves ``SelfReference`` nodes for the same reason.

When ibis disambiguates self-references automatically, the
``SelfReference`` preservation in reconstruct can be relaxed.
"""
ports = _make_table(
flavor,
f"ports_pin_{flavor}",
pd.DataFrame({"code": ["JFK", "LAX"], "city": ["NY", "LA"]}),
)
flights = _make_table(
flavor,
f"fl_self_pin_{flavor}",
pd.DataFrame({"origin": ["JFK"], "dest": ["LAX"]}),
)

j1 = flights.join(ports, flights.origin == ports.code)
# Reusing ``ports`` (same identity) without ``.view()`` makes the
# second predicate's ``ports.code`` ambiguous. Error fires at
# predicate-resolution time during the second ``join`` call.
with pytest.raises(Exception, match=r"(?i)ambiguous"):
j1.join(ports, j1.dest == ports.code)
Loading