From bb0e9c992789c93b715d5b880522ce94b72615e8 Mon Sep 17 00:00:00 2001 From: Peter Freytag Date: Fri, 6 Mar 2026 08:24:07 +0100 Subject: [PATCH] add pydantic entity models --- CLAUDE.md | 94 ++++++++++++++++++ PydanticEntityModels.md | 63 ++++++++++++ cmem_plugin_base/dataintegration/entity.py | 96 ++++--------------- .../dataintegration/typed_entities/file.py | 8 +- .../dataintegration/typed_entities/quads.py | 16 ++-- .../typed_entities/typed_entities.py | 4 +- .../dataintegration/utils/entity_builder.py | 2 +- tests/test_utils_build_entities_from_data.py | 80 ++++++++-------- 8 files changed, 232 insertions(+), 131 deletions(-) create mode 100644 CLAUDE.md create mode 100644 PydanticEntityModels.md diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 0000000..7ed160b --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1,94 @@ +# CLAUDE.md + +This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository. + +## Overview + +`cmem-plugin-base` is a Python library providing base classes for developing eccenca Corporate Memory (CMEM) DataIntegration plugins. It is consumed by other plugin packages, not used directly. + +## Commands + +Uses [task](https://taskfile.dev/) as the task runner and [poetry](https://python-poetry.org/) for dependency management. 
+ +```bash +task # List all available tasks +poetry install # Install dependencies + +task check # Run full test suite (linters + pytest) +task check:linters # Run ruff, mypy, deptry, trivy +task check:pytest # Run pytest with coverage +task check:ruff # Lint and format check only +task check:mypy # Type checking only +task format:fix # Auto-fix formatting and lint issues + +task build # Build wheel and tarball +task clean # Remove dist, caches +``` + +Run a single test file: +```bash +poetry run pytest tests/test_description.py +``` + +Run a specific test: +```bash +poetry run pytest tests/test_description.py::test_function_name +``` + +## Architecture + +### Plugin System + +Plugins are plain Python classes decorated with `@Plugin(...)` from `cmem_plugin_base.dataintegration.description`. Two base types exist: + +- **`WorkflowPlugin`** (`plugins.py`) — operator in a DI workflow; implements `execute(inputs, context) -> Entities | None` +- **`TransformPlugin`** (`plugins.py`) — transform operator; implements `transform(inputs) -> Sequence[str]` + +The `@Plugin` decorator registers the class in `Plugin.plugins` and generates a `PluginDescription` with metadata (label, description, categories, parameters, icon, actions). + +### Parameter Types + +`ParameterType[T]` (`types.py`) is the base class for all parameter types. Subclasses implement `from_string()` / `to_string()` and optionally `autocomplete()`. Built-in types live in `cmem_plugin_base/dataintegration/parameter/`: +- `choice.py` — enum-like choices +- `code.py` — code editor (SPARQL, Python, etc.) +- `dataset.py` — CMEM dataset reference +- `graph.py` — named graph URI with autocompletion +- `multiline.py` — multiline text +- `password.py` — encrypted password +- `resource.py` — DI project resource + +Parameter types are auto-inferred from `__init__` type annotations; use `PluginParameter(name, param_type=...)` to override. 
+ +### Context Classes + +Context objects are injected by DataIntegration at runtime; the classes in `context.py` are documentation stubs only (empty method bodies): + +- `SystemContext` — DI version, CMEM/DP/DI endpoints, encrypt/decrypt +- `UserContext` — OAuth token, user URI/label +- `TaskContext` — project_id, task_id +- `WorkflowContext` — workflow_id, execution status +- `ExecutionContext` — combines system, user, task, workflow, report +- `PluginContext` — available during plugin creation/update (no task/workflow) +- `ReportContext` / `ExecutionReport` — progress reporting during execution + +### Entity Model + +Data flows through plugins as `Entities` objects (`entity.py`): +- `Entity(uri, values)` — one instance; `values` is a sequence of sequences of strings +- `EntitySchema(type_uri, paths)` — schema with ordered `EntityPath` list +- `Entities(entities, schema)` — lazy iterator of entities + schema + +### Testing Utilities + +`cmem_plugin_base/testing.py` provides concrete implementations of all context classes for use in tests (`TestExecutionContext`, `TestPluginContext`, etc.). These fetch real OAuth tokens via `cmempy` — tests that instantiate them require a running CMEM environment configured via `.env`. + +### Discovery + +`discovery.py` provides `import_modules()` which walks installed packages under the `cmem` namespace and collects all `@Plugin`-decorated classes into a `PluginDiscoveryResult`. 
+ +## Code Style + +- Line length: 100 +- Ruff with `select = ["ALL"]` and specific ignores (see `pyproject.toml`) +- mypy strict; `context.py`, `plugins.py`, `types.py` have `disable_error_code = "empty-body"` because they contain abstract-style stub methods +- Python ≥ 3.13 required \ No newline at end of file diff --git a/PydanticEntityModels.md b/PydanticEntityModels.md new file mode 100644 index 0000000..6ccfddb --- /dev/null +++ b/PydanticEntityModels.md @@ -0,0 +1,63 @@ +## Overview of Changes + +### Goal +Convert the four entity classes in `entity.py` from plain Python classes to Pydantic `BaseModel` subclasses, removing handwritten boilerplate while preserving full backward compatibility. + +--- + +### `cmem_plugin_base/dataintegration/entity.py` + +**`EntityPath` → `BaseModel, frozen=True`** + +Before: manual `__init__`, `__eq__`, `__hash__`, `__repr__` (41 lines). +After: 3 field declarations (8 lines). + +`frozen=True` makes the model immutable and auto-generates `__hash__` and `__eq__`. This is correct because `EntityPath` is a value object — there's no reason to mutate a path after construction, and immutability enables safe use as dict keys or in sets. + +**`EntitySchema` → `BaseModel` (non-frozen)** + +Before: manual `__init__`, `__eq__`, `__hash__`, `__repr__` (50 lines) with a `None`→`EntityPath("")` conversion in `__init__`. +After: 4 field declarations (5 lines); the `path_to_root` default replaces the `None` sentinel logic. + +`frozen=True` was considered but dropped because `TypedEntitySchema` (in `typed_entities/typed_entities.py`) subclasses `EntitySchema` using a singleton pattern that sets `self._initialized` — which requires mutability. A frozen Pydantic model disallows `__setattr__` calls entirely, so the subclass would break. + +`paths` and `sub_schemata` are typed as `tuple[EntityPath, ...]` / `tuple["EntitySchema", ...] | None`. 
Pydantic v2 coerces list inputs to tuples automatically at runtime, which is why call sites can still pass lists — Pydantic normalises them. The stored value is always a tuple, ensuring consistent equality comparisons. + +**`Entity` → `BaseModel`** + +Before: manual `__init__` (2 lines of field assignment). +After: 2 field declarations + `arbitrary_types_allowed=True` config. + +`arbitrary_types_allowed=True` is set for the abstract `values: Sequence[Sequence[str]]` annotation; note that Pydantic v2 validates `Sequence` natively, so the flag is probably redundant and should be re-checked. + +**`Entities` — kept as plain Python class** + +Two incompatibilities prevented the Pydantic conversion: +1. `TypedEntities` subclasses `Entities` with a completely different `__init__` signature (`values: Iterator[T], schema: TypedEntitySchema[T]`), which clashes with Pydantic's model init mechanics. +2. The field name `schema` shadows Pydantic `BaseModel`'s deprecated `.schema()` classmethod, causing a `UserWarning` (no functional issue, but noisy). + +Since `Entities` had no `__eq__`/`__hash__`/`__repr__` boilerplate to begin with, there was nothing to gain by converting it. + +--- + +### Call-site fixes (required due to Pydantic not accepting positional args) + +Pydantic `BaseModel.__init__` requires keyword arguments. 
All positional constructor calls were updated: + +| File | Change | +|---|---| +| `tests/test_utils_build_entities_from_data.py` | `EntityPath("name", False, ...)` → `EntityPath(path="name", ...)` and `paths=[...]` → `paths=(...)` | +| `typed_entities/quads.py` | `EntityPath(path_uri(...))` → `EntityPath(path=path_uri(...))` | +| `typed_entities/file.py` | Same as quads | +| `typed_entities/typed_entities.py` | `super().__init__(type_uri, paths)` → `super().__init__(type_uri=type_uri, paths=tuple(paths))` | +| `utils/entity_builder.py` | `paths=schema_paths` → `paths=tuple(schema_paths)` | + +The `tuple()` wraps in the last two are also necessary for mypy: the field is typed `tuple[EntityPath, ...]`, so passing a `list` is a type error even though Pydantic would coerce it at runtime. Making the coercion explicit satisfies both mypy and the type contract. + +--- + +### Net result + +- `entity.py` reduced from 129 lines to 68 lines +- Zero handwritten `__init__`, `__eq__`, `__hash__`, or `__repr__` methods remain in the converted classes +- All 30 tests pass, mypy reports no errors, ruff reports no issues \ No newline at end of file diff --git a/cmem_plugin_base/dataintegration/entity.py b/cmem_plugin_base/dataintegration/entity.py index 79c756d..05d3586 100644 --- a/cmem_plugin_base/dataintegration/entity.py +++ b/cmem_plugin_base/dataintegration/entity.py @@ -2,8 +2,10 @@ from collections.abc import Iterator, Sequence +from pydantic import BaseModel, ConfigDict -class EntityPath: + +class EntityPath(BaseModel, frozen=True): """A path in a schema. :param is_relation: If true, values for this path must only contain URIs that @@ -13,35 +15,12 @@ class EntityPath: nested elements. 
""" - def __init__(self, path: str, is_relation: bool = False, is_single_value: bool = False) -> None: - self.path = path - self.is_relation = is_relation - self.is_single_value = is_single_value - - def __repr__(self) -> str: - """Get a string representation""" - obj = { - "path": self.path, - "is_relation": self.is_relation, - "is_single_value": self.is_single_value, - } - return f"EntityPath({obj})" - - def __eq__(self, other: object) -> bool: - """Compare""" - return ( - isinstance(other, EntityPath) - and self.path == other.path - and self.is_relation == other.is_relation - and self.is_single_value == other.is_single_value - ) - - def __hash__(self) -> int: - """Return a hash value based on its path, relation status, and single value status.""" - return hash((self.path, self.is_relation, self.is_single_value)) - - -class EntitySchema: + path: str + is_relation: bool = False + is_single_value: bool = False + + +class EntitySchema(BaseModel): """An entity schema. :param type_uri: The entity type @@ -51,49 +30,13 @@ class EntitySchema: :param sub_schemata: Nested entity schemata """ - def __init__( - self, - type_uri: str, - paths: Sequence[EntityPath], - path_to_root: EntityPath | None = None, - sub_schemata: Sequence["EntitySchema"] | None = None, - ) -> None: - self.type_uri = type_uri - self.paths = paths - if path_to_root is None: - self.path_to_root = EntityPath("") - else: - self.path_to_root = path_to_root - self.sub_schemata = sub_schemata - - def __repr__(self) -> str: - """Get a string representation""" - obj = {"type_uri": self.type_uri, "paths": self.paths, "path_to_root": self.path_to_root} - return f"EntitySchema({obj})" - - def __eq__(self, other: object) -> bool: - """Compare""" - return ( - isinstance(other, EntitySchema) - and self.type_uri == other.type_uri - and self.paths == other.paths - and self.path_to_root == other.path_to_root - and self.sub_schemata == other.sub_schemata - ) - - def __hash__(self) -> int: - """Return a hash value based 
on its attributes.""" - return hash( - ( - self.type_uri, - tuple(self.paths), - self.path_to_root, - tuple(self.sub_schemata) if self.sub_schemata is not None else None, - ) - ) - - -class Entity: + type_uri: str + paths: tuple[EntityPath, ...] + path_to_root: EntityPath = EntityPath(path="") + sub_schemata: tuple["EntitySchema", ...] | None = None + + +class Entity(BaseModel): """An Entity can represent an instance of any given concept. :param uri: The URI of this entity @@ -103,9 +46,10 @@ class Entity: TODO: uri generation """ - def __init__(self, uri: str, values: Sequence[Sequence[str]]) -> None: - self.uri = uri - self.values = values + model_config = ConfigDict(arbitrary_types_allowed=True) + + uri: str + values: Sequence[Sequence[str]] class Entities: diff --git a/cmem_plugin_base/dataintegration/typed_entities/file.py b/cmem_plugin_base/dataintegration/typed_entities/file.py index edcfbeb..b3c1c4c 100644 --- a/cmem_plugin_base/dataintegration/typed_entities/file.py +++ b/cmem_plugin_base/dataintegration/typed_entities/file.py @@ -274,10 +274,10 @@ def __init__(self): super().__init__( type_uri=type_uri("File"), paths=[ - EntityPath(path_uri("filePath"), is_single_value=True), - EntityPath(path_uri("fileType"), is_single_value=True), - EntityPath(path_uri("mimeType"), is_single_value=True), - EntityPath(path_uri("entryPath"), is_single_value=True), + EntityPath(path=path_uri("filePath"), is_single_value=True), + EntityPath(path=path_uri("fileType"), is_single_value=True), + EntityPath(path=path_uri("mimeType"), is_single_value=True), + EntityPath(path=path_uri("entryPath"), is_single_value=True), ], ) diff --git a/cmem_plugin_base/dataintegration/typed_entities/quads.py b/cmem_plugin_base/dataintegration/typed_entities/quads.py index 6a7051d..6dffea8 100644 --- a/cmem_plugin_base/dataintegration/typed_entities/quads.py +++ b/cmem_plugin_base/dataintegration/typed_entities/quads.py @@ -118,14 +118,14 @@ def __init__(self): super().__init__( 
type_uri=type_uri("Quad"), paths=[ - EntityPath(path_uri("quad/subject")), - EntityPath(path_uri("quad/subjectType")), - EntityPath(path_uri("quad/predicate")), - EntityPath(path_uri("quad/object")), - EntityPath(path_uri("quad/objectType")), - EntityPath(path_uri("quad/objectLanguage")), - EntityPath(path_uri("quad/objectDataType")), - EntityPath(path_uri("quad/graph")), + EntityPath(path=path_uri("quad/subject")), + EntityPath(path=path_uri("quad/subjectType")), + EntityPath(path=path_uri("quad/predicate")), + EntityPath(path=path_uri("quad/object")), + EntityPath(path=path_uri("quad/objectType")), + EntityPath(path=path_uri("quad/objectLanguage")), + EntityPath(path=path_uri("quad/objectDataType")), + EntityPath(path=path_uri("quad/graph")), ], ) diff --git a/cmem_plugin_base/dataintegration/typed_entities/typed_entities.py b/cmem_plugin_base/dataintegration/typed_entities/typed_entities.py index 7682922..70dfc55 100644 --- a/cmem_plugin_base/dataintegration/typed_entities/typed_entities.py +++ b/cmem_plugin_base/dataintegration/typed_entities/typed_entities.py @@ -21,10 +21,10 @@ def __new__(cls, *args, **kwargs) -> "TypedEntitySchema": # noqa: ANN002, ANN00 cls._instances[cls] = super().__new__(cls) return cls._instances[cls] - def __init__(self, type_uri: str, paths: Sequence[EntityPath]) -> None: + def __init__(self, type_uri: str, paths: Sequence[EntityPath] | tuple[EntityPath, ...]) -> None: # Check if this instance has already been initialized if not hasattr(self, "_initialized"): - super().__init__(type_uri, paths) + super().__init__(type_uri=type_uri, paths=tuple(paths)) self._initialized = True @abstractmethod diff --git a/cmem_plugin_base/dataintegration/utils/entity_builder.py b/cmem_plugin_base/dataintegration/utils/entity_builder.py index cd896ec..2bfc5a6 100644 --- a/cmem_plugin_base/dataintegration/utils/entity_builder.py +++ b/cmem_plugin_base/dataintegration/utils/entity_builder.py @@ -87,7 +87,7 @@ def _get_schema(data: dict | list) -> 
dict[str, EntitySchema] | None: ) schema = EntitySchema( type_uri="", - paths=schema_paths, + paths=tuple(schema_paths), ) path_to_schema_map[path] = schema return path_to_schema_map diff --git a/tests/test_utils_build_entities_from_data.py b/tests/test_utils_build_entities_from_data.py index c7c7218..7c64237 100644 --- a/tests/test_utils_build_entities_from_data.py +++ b/tests/test_utils_build_entities_from_data.py @@ -29,10 +29,10 @@ def test_single_object() -> None: assert len(entities.schema.paths) == 2 assert entities.schema == EntitySchema( type_uri="", - paths=[ - EntityPath("name", False, is_single_value=True), - EntityPath("email", False, is_single_value=True), - ], + paths=( + EntityPath(path="name", is_single_value=True), + EntityPath(path="email", is_single_value=True), + ), ) @@ -56,11 +56,11 @@ def test_single_object_one_level() -> None: assert len(entities.schema.paths) == 3 assert entities.schema == EntitySchema( type_uri="", - paths=[ - EntityPath("name", False, is_single_value=True), - EntityPath("email", False, is_single_value=True), - EntityPath("city", True, is_single_value=True), - ], + paths=( + EntityPath(path="name", is_single_value=True), + EntityPath(path="email", is_single_value=True), + EntityPath(path="city", is_relation=True, is_single_value=True), + ), ) # Validate sub entities assert entities.sub_entities is not None @@ -70,10 +70,10 @@ def test_single_object_one_level() -> None: assert _entity.values == [["San Francisco"], ["United States"]] assert _.schema == EntitySchema( type_uri="", - paths=[ - EntityPath("name", False, is_single_value=True), - EntityPath("country", False, is_single_value=True), - ], + paths=( + EntityPath(path="name", is_single_value=True), + EntityPath(path="country", is_single_value=True), + ), ) @@ -104,11 +104,11 @@ def test_single_object_one_level_array() -> None: assert len(entities.schema.paths) == 3 assert entities.schema == EntitySchema( type_uri="", - paths=[ - EntityPath("name", False, 
is_single_value=True), - EntityPath("email", False, is_single_value=True), - EntityPath("city", True, is_single_value=False), - ], + paths=( + EntityPath(path="name", is_single_value=True), + EntityPath(path="email", is_single_value=True), + EntityPath(path="city", is_relation=True, is_single_value=False), + ), ) # Validate sub entities assert entities.sub_entities is not None @@ -118,10 +118,10 @@ def test_single_object_one_level_array() -> None: assert len(_entity.values) == 2 assert _.schema == EntitySchema( type_uri="", - paths=[ - EntityPath("name", False, is_single_value=True), - EntityPath("country", False, is_single_value=True), - ], + paths=( + EntityPath(path="name", is_single_value=True), + EntityPath(path="country", is_single_value=True), + ), ) @@ -162,11 +162,11 @@ def test_single_object_two_level_array() -> None: assert len(entities.schema.paths) == 3 assert entities.schema == EntitySchema( type_uri="", - paths=[ - EntityPath("name", False, is_single_value=True), - EntityPath("email", False, is_single_value=True), - EntityPath("city", True, is_single_value=False), - ], + paths=( + EntityPath(path="name", is_single_value=True), + EntityPath(path="email", is_single_value=True), + EntityPath(path="city", is_relation=True, is_single_value=False), + ), ) # Validate sub entities assert entities.sub_entities is not None @@ -177,19 +177,19 @@ def test_single_object_two_level_array() -> None: assert city_entities.schema == EntitySchema( type_uri="", - paths=[ - EntityPath("name", False, is_single_value=True), - EntityPath("country", False, is_single_value=True), - EntityPath("geo_location", True, is_single_value=True), - ], + paths=( + EntityPath(path="name", is_single_value=True), + EntityPath(path="country", is_single_value=True), + EntityPath(path="geo_location", is_relation=True, is_single_value=True), + ), ) assert location_entities.schema == EntitySchema( type_uri="", - paths=[ - EntityPath("lat", False, is_single_value=True), - EntityPath("long", 
False, is_single_value=True), - ], + paths=( + EntityPath(path="lat", is_single_value=True), + EntityPath(path="long", is_single_value=True), + ), ) @@ -213,10 +213,10 @@ def test_array_object() -> None: assert len(entities.schema.paths) == 2 assert entities.schema == EntitySchema( type_uri="", - paths=[ - EntityPath("name", False, is_single_value=True), - EntityPath("email", False, is_single_value=True), - ], + paths=( + EntityPath(path="name", is_single_value=True), + EntityPath(path="email", is_single_value=True), + ), )