diff --git a/src/boring_semantic_layer/measure_scope.py b/src/boring_semantic_layer/measure_scope.py index 38ad485..d46749a 100644 --- a/src/boring_semantic_layer/measure_scope.py +++ b/src/boring_semantic_layer/measure_scope.py @@ -1,5 +1,6 @@ from __future__ import annotations +import difflib from collections.abc import Iterable from typing import Any @@ -8,6 +9,17 @@ from toolz import curry +class UnknownMeasureRefError(AttributeError): + """Raised when a calc-measure lambda references an unknown name. + + Subclasses :class:`AttributeError` so existing code that ``except``\\ s + on attribute errors continues to work, but ``_classify_measure`` + re-raises this specific subclass instead of swallowing it. Surfaces + typos at construction time with a "did you mean?" suggestion built + from the surrounding measure / column names. + """ + + def _has_prefixed_columns(tbl, name: str) -> bool: """Check if table has columns with the given prefix (e.g., 'flights.' prefix).""" if not hasattr(tbl, "columns"): @@ -361,7 +373,39 @@ def __getattr__(self, name: str): if _has_prefixed_columns(self.tbl, name): return _ColumnPrefixProxy(self.tbl, name) - return _resolve_column_short_name(self.tbl, name) + # Fall through to ibis (covers Table methods like ``count``, ``filter``). + # If ibis rejects too, surface a typo suggestion rather than the opaque + # ibis AttributeError so the user can see a "did you mean?" hint. + try: + return _resolve_column_short_name(self.tbl, name) + except AttributeError: + suggestion = self._typo_suggestion(name) + if suggestion: + raise UnknownMeasureRefError( + f"{name!r} is not a known measure or column. {suggestion}" + ) from None + raise + + def _typo_suggestion(self, name: str) -> str | None: + # 0.80 catches single-character typos and case mistakes + # (``flight_konut`` vs ``flight_count`` ≈ 0.83) without flagging + # legitimate substring overlaps (``net_revenue`` vs + # ``total_net_revenue`` ≈ 0.79). Calibrated against real-world + # confusable measure names. + cutoff = 0.80 + candidates: list[tuple[str, str]] = [] + if self.known: + for match in difflib.get_close_matches(name, self.known, n=3, cutoff=cutoff): + candidates.append(("measure", match)) + if hasattr(self.tbl, "columns"): + for match in difflib.get_close_matches( + name, list(self.tbl.columns), n=3, cutoff=cutoff + ): + candidates.append(("column", match)) + if not candidates: + return None + formatted = ", ".join(f"{kind} {match!r}" for kind, match in candidates) + return f"Did you mean: {formatted}?" def __getitem__(self, name: str): if self.post_agg: diff --git a/src/boring_semantic_layer/ops.py b/src/boring_semantic_layer/ops.py index ff2f534..c9f35ff 100644 --- a/src/boring_semantic_layer/ops.py +++ b/src/boring_semantic_layer/ops.py @@ -75,6 +75,21 @@ class _RenamedResolver: Used during join predicate resolution to avoid ibis "Ambiguous field reference" errors when left and right tables share column names. + + This works around two upstream ibis behaviors that have no public + issue tracker entry yet but are pinned by + ``test_upstream_ibis_pins.py``: + + 1. ``DerefMap`` raises ``IbisInputError: Ambiguous field reference`` + when a predicate column-name appears in more than one relation + reachable from the join's LHS. + 2. The default ``rname='{name}_right'`` collides on the third + table when 3+ joined relations share a column name, raising + ``IntegrityError: Name collisions``. + + When the pinning tests start failing — i.e. ibis no longer raises + these errors — the rename dance in ``SemanticJoinOp.to_untagged`` + can be removed. """ __slots__ = ("_table", "_name_map") @@ -663,13 +678,31 @@ def _extract_measure_metadata( return (fn_or_expr, None, (), {}) -_AGG_METHODS = frozenset({"sum", "mean", "avg", "count", "min", "max"}) +_AGG_METHODS = frozenset({ + # Standard reductions + "sum", "mean", "avg", "count", "min", "max", + # Statistical reductions + "var", "std", "median", "quantile", + # Approximate reductions + "approx_count_distinct", "approx_nunique", "approx_median", + # Distinct count + "nunique", + # Categorical reductions + "mode", "first", "last", "arbitrary", + # Boolean reductions + "any", "all", + # Collection reductions + "group_concat", "collect", +}) def _is_calculated_measure(val: Any) -> bool: # A MethodCall with an aggregation method on a MeasureRef is a base measure: # the column name matched a known measure name in MeasureScope, but the user - # is really defining a column aggregation (e.g. lambda t: t.flight_count.sum()). + # is really defining a column aggregation (e.g. lambda t: t.flight_count.sum() + # or t.distance.var()). ``_AGG_METHODS`` covers the ibis ``Reduction`` ops a + # user might reasonably reach for; methods outside the set fall through to + # the calc path. if ( isinstance(val, MethodCall) and val.method in _AGG_METHODS @@ -843,18 +876,26 @@ def _classify_measure( fn_or_expr: Any, scope: Any, measure_name: str | None = None ) -> tuple[str, Any]: """Classify measure as 'calc' or 'base' with appropriate handling.""" - from .measure_scope import validate_calc_ast + from .measure_scope import UnknownMeasureRefError, validate_calc_ast expr, description, requires_unnest, metadata = _extract_measure_metadata(fn_or_expr) - resolved = safe(lambda: _resolve_expr(expr, scope))().map( - lambda val: ("calc", val) if _is_calculated_measure(val) else None - ) + # ``_resolve_expr`` may raise for legitimate base-measure shapes + # (e.g. lambdas that touch ibis methods MeasureScope can't reflect), + # so most exceptions are caught and the lambda falls through to + # base classification. ``UnknownMeasureRefError`` is the typo case + # though — surface it loudly instead of letting the lambda fail with + # an opaque error at execute time. + try: + resolved_value = _resolve_expr(expr, scope) + except UnknownMeasureRefError: + raise + except Exception: + resolved_value = None - if isinstance(resolved, Success) and resolved.unwrap() is not None: - kind, value = resolved.unwrap() - validate_calc_ast(value, measure_name) - return (kind, value) + if resolved_value is not None and _is_calculated_measure(resolved_value): + validate_calc_ast(resolved_value, measure_name) + return ("calc", resolved_value) if not requires_unnest and callable(expr): # All scopes (MeasureScope, ColumnScope) have tbl attribute @@ -4014,7 +4055,10 @@ def to_untagged(self, parent_requirements: dict[str, set[str]] | None = None): return left_tbl.join(right_tbl, how=self.how, rname=rname) # Detect column name conflicts that cause ibis/xorq to raise - # "Ambiguous field reference" during predicate resolution. + # ``Ambiguous field reference`` during predicate resolution. The + # rename dance + ``_RenamedResolver`` below is a workaround for + # upstream ibis behaviors pinned by ``test_upstream_ibis_pins``; + # remove this branch when those tests fail. conflicting = frozenset(left_tbl.columns) & frozenset(right_tbl.columns) if not conflicting: diff --git a/src/boring_semantic_layer/tests/test_measure_reference_styles.py b/src/boring_semantic_layer/tests/test_measure_reference_styles.py index d741af9..93c0c6b 100644 --- a/src/boring_semantic_layer/tests/test_measure_reference_styles.py +++ b/src/boring_semantic_layer/tests/test_measure_reference_styles.py @@ -540,3 +540,112 @@ def test_calc_dtype_inference_with_inline_aggregation(): # Before infer_calc_dtype handled the rewrite path, this was silently # dropped from values and the dtype was unknown. assert "pct_of_total" in st.schema.names + + +# --------------------------------------------------------------------------- +# Broader base-measure recognition (#1) +# --------------------------------------------------------------------------- + + +@pytest.mark.parametrize( + "method, expected_kind", + [ + ("var", "base"), + ("std", "base"), + ("median", "base"), + ("nunique", "base"), + ("approx_nunique", "base"), + ("first", "base"), + ("last", "base"), + ], +) +def test_extra_reductions_classified_as_base(method, expected_kind): + """Measures named the same as a column then aggregated with a non-sum/mean + reduction should still be classified as base, not silently routed through + the calc compiler. + """ + con = ibis.duckdb.connect(":memory:") + data = pd.DataFrame({"carrier": ["AA", "AA", "UA"], "distance": [100, 200, 300]}) + tbl = con.create_table(f"flights_{method}", data) + + # Define a measure named ``distance`` (matching the column name) so that + # ``t.distance.()`` resolves through the MeasureRef path that + # ``_AGG_METHODS`` guards. + st = to_semantic_table(tbl, f"flights_{method}").with_measures( + distance=lambda t: t.distance.sum(), + ) + st = st.with_measures( + derived=lambda t, _m=method: getattr(t.distance, _m)(), + ) + + op = st.op() + base_names = set(op.get_measures().keys()) + calc_names = set(op.get_calculated_measures().keys()) + assert "derived" in base_names, ( + f"{method!r} measure should classify as base, got calc" + ) + assert "derived" not in calc_names + + +# --------------------------------------------------------------------------- +# Typo'd measure references (#2) +# --------------------------------------------------------------------------- + + +def test_typo_measure_ref_raises_with_suggestion(): + """A calc-measure lambda referencing a misspelled measure name should + fail loudly at construction with a "did you mean?" suggestion.""" + from boring_semantic_layer.measure_scope import UnknownMeasureRefError + + con = ibis.duckdb.connect(":memory:") + data = pd.DataFrame({"carrier": ["AA"], "distance": [100]}) + tbl = con.create_table("flights_typo", data) + + st = to_semantic_table(tbl, "flights_typo").with_measures( + flight_count=lambda t: t.count(), + total_distance=lambda t: t.distance.sum(), + ) + + with pytest.raises(UnknownMeasureRefError, match="flight_count"): + st.with_measures( + # ``flight_konut`` typo of ``flight_count`` + ratio=lambda t: t.flight_konut / t.total_distance, + ) + + +def test_typo_in_t_all_raises(): + """Same loud-fail behavior when the typo is the argument to ``t.all``.""" + from boring_semantic_layer.measure_scope import UnknownMeasureRefError + + con = ibis.duckdb.connect(":memory:") + data = pd.DataFrame({"carrier": ["AA"], "distance": [100]}) + tbl = con.create_table("flights_typo_all", data) + + st = to_semantic_table(tbl, "flights_typo_all").with_measures( + flight_count=lambda t: t.count(), + ) + + with pytest.raises(UnknownMeasureRefError, match="flight_count"): + st.with_measures( + ratio=lambda t: t.flight_count / t.all(t.flight_konut), + ) + + +def test_substring_measure_name_does_not_trigger_typo(): + """Names that are substrings of other measures should not trip the typo + detector. ``net_revenue`` referenced from a measure that also defines + ``total_net_revenue`` is legitimate, not a typo (similarity ≈ 0.79). + """ + from boring_semantic_layer.measure_scope import MeasureScope + + # Probe MeasureScope directly: with measures named [net_revenue, + # total_net_revenue], looking up an unrelated name 'net_revenue' + # via getattr should NOT fire the typo path. + import ibis as i + + tbl = i.table({"col": "int64"}, name="t") + scope = MeasureScope(_tbl=tbl, _known=("net_revenue", "total_net_revenue")) + # Asking for 'net_revenue' is fine — it's a known measure. + assert scope.net_revenue is not None + # Asking for 'col' is fine — it's a column. + assert scope.col is not None diff --git a/src/boring_semantic_layer/tests/test_upstream_ibis_pins.py b/src/boring_semantic_layer/tests/test_upstream_ibis_pins.py new file mode 100644 index 0000000..9ef2321 --- /dev/null +++ b/src/boring_semantic_layer/tests/test_upstream_ibis_pins.py @@ -0,0 +1,116 @@ +"""Pin upstream ibis behaviors that BSL works around. + +Each test reproduces a specific ibis (or xorq-vendored ibis) behavior that +forced a workaround somewhere in BSL. The tests pass while ibis still +exhibits the behavior and fail when ibis is fixed — at which point the +corresponding BSL workaround can be deleted. + +When a test in this file starts failing, do not "fix" it by adding the +workaround in: instead, locate the BSL workaround it pins (each test +docstring names the call site) and delete that workaround. + +Each pin is parametrized over plain ``ibis`` and ``xorq.vendor.ibis`` +because BSL's internal tables are xorq-vendored after construction-time +conversion, so the workaround actually fires against the xorq flavor — +but plain ibis is the upstream, and divergence between them is itself a +signal worth pinning. +""" + +from __future__ import annotations + +import pandas as pd +import pytest + +import ibis as plain_ibis +from xorq.common.utils.ibis_utils import from_ibis + + +def _make_table(flavor: str, name: str, df: pd.DataFrame): + """Create a table in the requested ibis flavor. + + For ``"ibis"`` we create directly via the duckdb backend. + For ``"xorq"`` we create via plain ibis, then wrap with + ``from_ibis`` so the resulting table is in ``xorq.vendor.ibis``. + """ + con = plain_ibis.duckdb.connect() + tbl = con.create_table(name, df) + if flavor == "xorq": + return from_ibis(tbl) + return tbl + + +@pytest.mark.parametrize("flavor", ["ibis", "xorq"]) +def test_pin_three_way_join_default_rname_collides(flavor): + """3+ joined relations sharing a column name collide on ``_right``. + + Pinned BSL workaround: depth-based ``rname`` in + ``SemanticJoinOp._rname_for_depth`` (``_right``, ``_right2``, …) + plus the ``conflicting`` rename block in ``SemanticJoinOp.to_untagged``. + + When ibis auto-disambiguates collisions across nested joins, this + test will succeed without raising — at which point the depth + hack and rename block can be removed. + """ + flights = _make_table( + flavor, + f"fl_pin_{flavor}", + pd.DataFrame( + {"carrier": ["AA"], "origin": ["JFK"], "destination": ["LAX"]} + ), + ) + carriers = _make_table( + flavor, + f"ca_pin_{flavor}", + pd.DataFrame({"code": ["AA"], "name": ["American"]}), + ) + airports1 = _make_table( + flavor, + f"ap1_pin_{flavor}", + pd.DataFrame({"code": ["JFK"], "city": ["NY"]}), + ) + airports2 = _make_table( + flavor, + f"ap2_pin_{flavor}", + pd.DataFrame({"code": ["LAX"], "city": ["LA"]}), + ) + + fc = flights.join(carriers, flights.carrier == carriers.code) + fca = fc.join(airports1, fc.origin == airports1.code) + # Both ``fca`` and ``airports2`` carry a column called ``code``; ibis + # tries to suffix the right one as ``code_right``, which already exists + # from the previous join → IntegrityError. Plain ibis raises on schema + # materialization (``.columns``); xorq's vendored ibis raises at + # execute time. Force materialization to cover both. + with pytest.raises(Exception, match=r"(?i)collision|integrity"): + fca.join(airports2, fca.destination == airports2.code).execute() + + +@pytest.mark.parametrize("flavor", ["ibis", "xorq"]) +def test_pin_self_reference_join_is_ambiguous_without_view(flavor): + """Joining the same table twice without ``.view()`` is ambiguous. + + Pinned BSL workaround: BSL relies on the user calling ``.view()`` (or + ``to_semantic_table`` on a viewed table) when joining the same source + twice. The reconstruction path in ``serialization.reconstruct`` + explicitly preserves ``SelfReference`` nodes for the same reason. + + When ibis disambiguates self-references automatically, the + ``SelfReference`` preservation in reconstruct can be relaxed. + """ + ports = _make_table( + flavor, + f"ports_pin_{flavor}", + pd.DataFrame({"code": ["JFK", "LAX"], "city": ["NY", "LA"]}), + ) + flights = _make_table( + flavor, + f"fl_self_pin_{flavor}", + pd.DataFrame({"origin": ["JFK"], "dest": ["LAX"]}), + ) + + j1 = flights.join(ports, flights.origin == ports.code) + # Reusing ``ports`` (same identity) without ``.view()`` makes the + # second predicate's ``ports.code`` ambiguous. Error fires at + # predicate-resolution time during the second ``join`` call. + with pytest.raises(Exception, match=r"(?i)ambiguous"): + j1.join(ports, j1.dest == ports.code)