Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 27 additions & 4 deletions CLAUDE.md

Large diffs are not rendered by default.

9 changes: 9 additions & 0 deletions docs/concepts/ingestion.md
Original file line number Diff line number Diff line change
Expand Up @@ -153,6 +153,15 @@ flags compose: `--demo` runs first (creating the Jaffle Shop datasource),
then the startup-ingest pass runs over every datasource including the
freshly-created demo.

Each per-datasource pass refreshes embeddings for the datasource doc,
every visible model and its visible children, **and every memory with at
least one canonical entity rooted at the datasource** (DEV-1416). A stale
`embeddings.db` (for example, one created without an `OPENAI_API_KEY` or
left out of date by a manual `memories.yaml` edit) is therefore repaired
by the next `--ingest-on-startup` run with no extra step. Per-memory
embedding failures surface as `IngestionError(model_name="memory:<id>", …)`
in the result's `errors` list.

### CLI

```bash
Expand Down
7 changes: 6 additions & 1 deletion docs/concepts/search.md
Original file line number Diff line number Diff line change
Expand Up @@ -267,7 +267,12 @@ sql-mode and query-backed models are silently skipped in v1.
(`EmbeddingService._apply_pending`) issues one batched
`get_embeddings_for_canonical_ids` for the hash-skip filter and one
batched `save_embeddings` for the persist step (DEV-1405) — refresh
cost is independent of subtree size.
cost is independent of subtree size. **Memories** are included in the
`slayer ingest` / `--ingest-on-startup` per-datasource refresh
(DEV-1416), filtered to memories with at least one canonical entity
rooted at the current datasource, so a stale `embeddings.db` can be
repaired by re-running ingest; no separate `slayer embeddings refresh`
step is required.
- **Cascade** semantics (DEV-1405 fix): `delete_embeddings_for_canonical`
matches the canonical id exactly OR as a strict dotted-path descendant
(`<root>.<...>`) — never as a character prefix. So `delete_memory(4)`
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[project]
name = "motley-slayer"
version = "0.6.7"
version = "0.6.8"
description = "A lightweight, agent-first semantic layer for AI agents"
requires-python = ">=3.11"
license = "MIT"
Expand Down
186 changes: 106 additions & 80 deletions slayer/core/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,38 +27,91 @@
_STRING_LITERAL_RE = re.compile(r"'[^']*'")


def _validate_model_name(name: str, context: str) -> str:
"""Reject model/query names containing ``__`` or ``.``.
class _SubstringRule:
    """Single source of truth for a forbidden substring inside a name.

    Each rule pairs the forbidden character / digraph with the rationale.
    Every validator that rejects the same substring uses the same rule
    so the wording (and the rejection rationale) lives in one place.

    Both constructor arguments are keyword-only:

    * ``substring`` -- the exact text that must not occur in a name.
    * ``reason`` -- rationale appended verbatim to the error message.
    """

    __slots__ = ("substring", "reason")

    def __init__(self, *, substring: str, reason: str) -> None:
        # ``"" in name`` is true for every string, so an empty substring
        # would turn the rule into "reject everything". Fail at definition
        # time instead of at the first validation.
        if not substring:
            raise ValueError(
                "_SubstringRule requires a non-empty 'substring'."
            )
        self.substring = substring
        self.reason = reason

    def __repr__(self) -> str:
        return (
            f"{type(self).__name__}(substring={self.substring!r}, "
            f"reason={self.reason!r})"
        )

    def check(self, name: str, context: str) -> None:
        """Raise ``ValueError`` if ``name`` contains the forbidden substring.

        Args:
            name: The candidate name to inspect.
            context: Label placed at the start of the error message
                (for example ``"Model name"``).

        Raises:
            ValueError: If ``self.substring`` occurs anywhere in ``name``.
        """
        if self.substring in name:
            raise ValueError(
                f"{context} '{name}' must not contain "
                f"{self.substring!r}; {self.reason}"
            )


# Shared forbidden-substring rules. Validators compose these instead of
# spelling out their own checks, so each rationale is written exactly once.

# ``__`` encodes join paths in generated SQL aliases.
_NO_DUNDER = _SubstringRule(
    substring="__",
    reason=(
        "double underscores are reserved for join path aliases "
        "in generated SQL."
    ),
)

# ``.`` separates canonical-id segments and dotted query references.
_NO_DOT = _SubstringRule(
    substring=".",
    reason=(
        "dots are the canonical-id namespace delimiter "
        "(``<ds>.<model>.<leaf>``) "
        "and the dotted-path reference syntax in queries."
    ),
)

# ``:`` is the aggregation separator and part of the memory id prefix.
_NO_COLON = _SubstringRule(
    substring=":",
    reason=(
        "colons are reserved as the aggregation separator "
        "(``revenue:sum``) "
        "and the ``memory:<int>`` canonical-id prefix."
    ),
)

# Forward and backward slashes share one rationale but are separate rules
# so the error message names the exact offending character.
_NO_FWD_SLASH = _SubstringRule(
    substring="/",
    reason="path separators break the storage layout.",
)
_NO_BACK_SLASH = _SubstringRule(
    substring="\\",
    reason="path separators break the storage layout.",
)

_NO_NUL = _SubstringRule(
    substring="\x00",
    reason="NUL bytes are filesystem-unsafe.",
)


def _require_non_empty_trimmed(v: str, context: str) -> None:
    """Reject empty / whitespace-only inputs and inputs with
    leading or trailing whitespace.

    Args:
        v: The value to validate.
        context: Label placed at the start of the error message
            (for example ``"Datasource 'name'"``).

    Raises:
        ValueError: If ``v`` is empty, consists only of whitespace, or
            differs from its stripped form.
    """
    stripped = v.strip() if v else ""
    if not stripped:
        raise ValueError(
            f"{context} must be a non-empty string; got {v!r}."
        )
    if stripped != v:
        raise ValueError(
            f"{context} must not have leading/trailing whitespace; "
            f"got {v!r}."
        )


def _validate_model_name(name: str, context: str) -> str:
    """Reject model/query names containing ``__``, ``.``, or ``:``.

    ``context`` is combined with the word ``name`` to form the label used
    in error messages. Returns ``name`` unchanged when every rule passes;
    the first violated rule raises ``ValueError``.
    """
    label = f"{context} name"
    # Order matters only for which message wins when several rules match.
    for rule in (_NO_DUNDER, _NO_DOT, _NO_COLON):
        rule.check(name=name, context=label)
    return name


def _validate_column_name(name: str, context: str) -> str:
    """Reject dimension/measure names containing ``.`` or ``:``.

    Dots are path syntax in queries (``customers.name``), not part of names.
    ``__`` is allowed — it encodes flattened join paths in virtual
    models created by ``_query_as_model`` (e.g., ``stores__name``).

    ``context`` is combined with the word ``name`` to form the label used
    in error messages. Returns ``name`` unchanged when both rules pass;
    otherwise the violated rule raises ``ValueError``.
    """
    label = f"{context} name"
    _NO_DOT.check(name=name, context=label)
    _NO_COLON.check(name=name, context=label)
    return name


Expand Down Expand Up @@ -360,37 +413,26 @@ def _validate_name(cls, v: str) -> str:
@field_validator("data_source")
@classmethod
def _validate_data_source_format(cls, v: str) -> str:
    """Apply format-only checks to ``data_source``.

    Raises:
        ValueError: If a non-empty value has leading/trailing whitespace
            or contains NUL, a forward slash, a backslash, a dot, or a
            colon.
    """
    # Format-only checks (run on every input). Emptiness is enforced
    # in ``_require_data_source_unless_query_backed`` so query-backed
    # models can be constructed before their cache populator fills in
    # ``data_source`` from the resolved virtual model. Whitespace-strip
    # mismatch and substring rules mirror ``DatasourceConfig.name`` so
    # the two canonical-id ingress points share validation logic via
    # the shared ``_NO_*`` rules.
    if not v:
        return v
    if v.strip() != v:
        raise ValueError(
            f"Model 'data_source' must not have leading/trailing "
            f"whitespace; got {v!r}."
        )
    label = "Model 'data_source'"
    _NO_NUL.check(name=v, context=label)
    _NO_FWD_SLASH.check(name=v, context=label)
    _NO_BACK_SLASH.check(name=v, context=label)
    _NO_DOT.check(name=v, context=label)
    _NO_COLON.check(name=v, context=label)
    return v

@model_validator(mode="after")
Expand Down Expand Up @@ -650,39 +692,23 @@ def _apply_schema_migrations_and_aliases(cls, data: Any) -> Any:
@field_validator("name")
@classmethod
def _validate_name(cls, v: str) -> str:
    """Validate a datasource name.

    Raises:
        ValueError: If the name is empty or whitespace-only, has
            leading/trailing whitespace, or contains NUL, a forward
            slash, a backslash, a dot, or a colon.
    """
    # Datasource names are the leading segment of every canonical-id
    # (``<ds>``, ``<ds>.<model>``, ``<ds>.<model>.<leaf>``) and a
    # path component in YAML storage (``datasources/<name>.yaml``,
    # ``models/<name>/...``). The substring rules are shared with
    # ``SlayerModel.data_source`` via the module-level ``_NO_*``
    # rules so the rationale lives in one place.
    #
    # ``__`` is intentionally NOT rejected: datasource names never
    # become SQL table aliases, so the join-path-alias reservation
    # that applies to model and query names doesn't apply here.
    label = "Datasource 'name'"
    _require_non_empty_trimmed(v=v, context=label)
    _NO_NUL.check(name=v, context=label)
    _NO_FWD_SLASH.check(name=v, context=label)
    _NO_BACK_SLASH.check(name=v, context=label)
    _NO_DOT.check(name=v, context=label)
    _NO_COLON.check(name=v, context=label)
    return v

def get_connection_string(self) -> str:
Expand Down
17 changes: 10 additions & 7 deletions slayer/core/query.py
Original file line number Diff line number Diff line change
Expand Up @@ -319,13 +319,16 @@ def _apply_schema_migrations(cls, data: Any) -> Any:

@field_validator("name")
@classmethod
def _validate_query_name(cls, v: Optional[str]) -> Optional[str]:
    """Validate an optional query name with the model-name rules.

    ``None`` is returned unchanged. Any other value is passed to
    ``_validate_model_name`` with the context ``"Query"``, which raises
    ``ValueError`` on a violation and otherwise returns the name.
    """
    # Share the same rejection rules as SlayerModel.name —
    # SlayerQuery names occupy the same naming space when persisted
    # as query-backed models. Rejects ``__`` (join-path alias
    # separator), ``.`` (dotted reference syntax), and ``:`` (DSL
    # aggregation separator).
    if v is None:
        return v
    # Kept as a function-scope import, as in the original.
    # NOTE(review): presumably this avoids an import cycle between
    # ``slayer.core.query`` and ``slayer.core.models`` — confirm.
    from slayer.core.models import _validate_model_name

    return _validate_model_name(v, "Query")
dimensions: Annotated[Optional[List[ColumnRef]], BeforeValidator(_coerce_dimensions)] = None
time_dimensions: Optional[List[TimeDimension]] = None
main_time_dimension: Optional[str] = None # Explicit time dimension for transforms (overrides auto-detection)
Expand Down
9 changes: 7 additions & 2 deletions slayer/embeddings/service.py
Original file line number Diff line number Diff line change
Expand Up @@ -252,9 +252,14 @@ async def _apply_pending(
try:
await self._storage.save_embeddings(rows)
except Exception as exc: # NOSONAR(S112) — best-effort persistence
# Include canonical ids so a caller doing failure
# attribution by entity (e.g. ``ingest_datasource_idempotent``
# tagging memory failures as ``model_name="memory:<id>"``)
# can see which rows did not land.
canonical_ids = ", ".join(r.canonical_id for r in rows)
warnings.append(
f"embedding batch persist failed "
f"({len(rows)} rows): {exc}"
f"embedding batch persist failed for "
f"{len(rows)} row(s) [{canonical_ids}]: {exc}"
)
_log.debug(
"EmbeddingService: refreshed=%d stale=%d total=%d warnings=%d",
Expand Down
Loading
Loading