From 8012ba93cab36fbfab9ed338d5f32ff7f34d9d04 Mon Sep 17 00:00:00 2001 From: d-v-b-agent Date: Fri, 26 Jun 2026 15:39:59 +0000 Subject: [PATCH 1/7] Make the data type registry a plain Mapping[str, type] Replace the DataTypeRegistry frozen dataclass (which wrapped a dict in `contents` plus a `_lazy_load_list` and six methods) with a plain `dict[str, type[ZDType]]`. Resolution (`match_dtype`/`match_json`) and lifecycle (`register_data_type`, `unregister_data_type`, `load_data_type_entrypoints`) are now free functions over the mapping, each accepting an optional `registry=` so callers can operate on an isolated dict. This also fixes a latent bug: the old `_lazy_load()` was never called in `src/`, so data types advertised via the `zarr.data_type` entry point group were silently never loaded. `load_data_type_entrypoints()` is now invoked when the dtype package is imported. Co-Authored-By: Claude Opus 4.8 (1M context) Claude-Session: https://claude.ai/code/session_01XZHYrRBh54e7tFearnZ72r --- src/zarr/core/dtype/__init__.py | 26 ++- src/zarr/core/dtype/registry.py | 346 ++++++++++++++------------------ src/zarr/dtype.py | 4 + src/zarr/registry.py | 5 +- tests/test_dtype/conftest.py | 2 +- tests/test_dtype_registry.py | 53 ++--- 6 files changed, 212 insertions(+), 224 deletions(-) diff --git a/src/zarr/core/dtype/__init__.py b/src/zarr/core/dtype/__init__.py index d1dbd6e2c8..13e2f960cd 100644 --- a/src/zarr/core/dtype/__init__.py +++ b/src/zarr/core/dtype/__init__.py @@ -50,7 +50,15 @@ VariableLengthUTF8, VariableLengthUTF8JSON_V2, ) -from zarr.core.dtype.registry import DataTypeRegistry +from zarr.core.dtype.registry import ( + DataTypeRegistry, + data_type_registry, + load_data_type_entrypoints, + match_dtype, + match_json, + register_data_type, + unregister_data_type, +) from zarr.core.dtype.wrapper import TBaseDType, TBaseScalar, ZDType __all__ = [ @@ -97,12 +105,15 @@ "VariableLengthUTF8JSON_V2", "ZDType", "data_type_registry", + "load_data_type_entrypoints", + 
"match_dtype", + "match_json", "parse_data_type", "parse_dtype", + "register_data_type", + "unregister_data_type", ] -data_type_registry = DataTypeRegistry() - IntegerDType = Int8 | Int16 | Int32 | Int64 | UInt8 | UInt16 | UInt32 | UInt64 INTEGER_DTYPE: Final = Int8, Int16, Int32, Int64, UInt8, UInt16, UInt32, UInt64 @@ -157,7 +168,10 @@ for dtype in ANY_DTYPE: # mypy does not know that all the elements of ANY_DTYPE are subclasses of ZDType - data_type_registry.register(dtype._zarr_v3_name, dtype) # type: ignore[arg-type] + register_data_type(dtype) # type: ignore[arg-type] + +# Register any data types advertised by third-party packages via entry points. +load_data_type_entrypoints() # TODO: find a better name for this function @@ -174,7 +188,7 @@ def get_data_type_from_native_dtype(dtype: npt.DTypeLike) -> ZDType[TBaseDType, na_dtype = np.dtype(dtype) else: na_dtype = dtype - return data_type_registry.match_dtype(dtype=na_dtype) + return match_dtype(na_dtype) def get_data_type_from_json( @@ -184,7 +198,7 @@ def get_data_type_from_json( Given a JSON representation of a data type and a Zarr format version, attempt to create a ZDType instance from the registered ZDType classes. 
""" - return data_type_registry.match_json(dtype_spec, zarr_format=zarr_format) + return match_json(dtype_spec, zarr_format=zarr_format) def parse_data_type( diff --git a/src/zarr/core/dtype/registry.py b/src/zarr/core/dtype/registry.py index 0a9b2aa64a..f890c89365 100644 --- a/src/zarr/core/dtype/registry.py +++ b/src/zarr/core/dtype/registry.py @@ -1,210 +1,176 @@ from __future__ import annotations import contextlib -from dataclasses import dataclass, field -from typing import TYPE_CHECKING, Self +from importlib.metadata import entry_points as get_entry_points +from typing import TYPE_CHECKING import numpy as np from zarr.errors import DataTypeValidationError if TYPE_CHECKING: - from importlib.metadata import EntryPoint - from zarr.core.common import ZarrFormat from zarr.core.dtype.common import DTypeJSON from zarr.core.dtype.wrapper import TBaseDType, TBaseScalar, ZDType +# The data type registry is just a mapping from a canonical name to a ZDType class. +# Resolution logic (matching a native dtype or a JSON document to a registered type) lives in the +# free functions below rather than on the mapping itself, so the registry stays a plain dict that +# anyone can read, copy, or populate. +DataTypeRegistry = dict[str, "type[ZDType[TBaseDType, TBaseScalar]]"] + +# The global registry. Built-in data types are registered into this at import time (see +# ``zarr.core.dtype.__init__``), and third parties can add to it via ``register_data_type``. +data_type_registry: DataTypeRegistry = {} + + +def register_data_type( + cls: type[ZDType[TBaseDType, TBaseScalar]], + *, + name: str | None = None, + registry: DataTypeRegistry | None = None, +) -> None: + """ + Register a ZDType class under its canonical name. + + Parameters + ---------- + cls : type[ZDType] + The data type class to register. + name : str, optional + The name to register the class under. Defaults to ``cls._zarr_v3_name``. + registry : dict, optional + The registry to register into. 
Defaults to the global registry. + """ + if registry is None: + registry = data_type_registry + registry[name if name is not None else cls._zarr_v3_name] = cls + + +def unregister_data_type(name: str, *, registry: DataTypeRegistry | None = None) -> None: + """ + Remove a data type from the registry by its canonical name. + + Raises + ------ + KeyError + If the data type is not found in the registry. + """ + if registry is None: + registry = data_type_registry + del registry[name] -# This class is different from the other registry classes, which inherit from -# dict. IMO it's simpler to just do a dataclass. But long-term we should -# have just 1 registry class in use. -@dataclass(frozen=True, kw_only=True) -class DataTypeRegistry: + +def load_data_type_entrypoints(*, registry: DataTypeRegistry | None = None) -> None: + """ + Discover and register data types advertised via the ``zarr.data_type`` entry point group. """ - A registry for ZDType classes. + entry_points = get_entry_points() + for e in ( + *entry_points.select(group="zarr.data_type"), + *entry_points.select(group="zarr", name="data_type"), + ): + register_data_type(e.load(), registry=registry) - This registry is a mapping from Zarr data type names to their - corresponding ZDType classes. - Attributes +def match_dtype( + dtype: TBaseDType, *, registry: DataTypeRegistry | None = None +) -> ZDType[TBaseDType, TBaseScalar]: + """ + Match a native data type, e.g. a NumPy data type, to a registered ZDType. + + Parameters ---------- - contents : dict[str, type[ZDType[TBaseDType, TBaseScalar]]] - The mapping from Zarr data type names to their corresponding - ZDType classes. + dtype : TBaseDType + The native data type to match. + + Returns + ------- + ZDType[TBaseDType, TBaseScalar] + The matched ZDType corresponding to the provided NumPy data type. 
+ + Raises + ------ + ValueError + If the data type is a NumPy "Object" type, which is ambiguous, or if multiple + or no Zarr data types are found that match the provided dtype. + + Notes + ----- + This function attempts to resolve a Zarr data type from a given native data type. + If the dtype is a NumPy "Object" data type, it raises a ValueError, as this type + can represent multiple Zarr data types. In such cases, a specific Zarr data type + should be explicitly constructed instead of relying on dynamic resolution. + + If multiple matches are found, it will also raise a ValueError. In this case + conflicting data types must be unregistered, or the Zarr data type should be explicitly + constructed. + """ + if registry is None: + registry = data_type_registry + + if dtype == np.dtype("O"): + msg = ( + f"Zarr data type resolution from {dtype} failed. " + 'Attempted to resolve a zarr data type from a numpy "Object" data type, which is ' + 'ambiguous, as multiple zarr data types can be represented by the numpy "Object" ' + "data type. " + "In this case you should construct your array by providing a specific Zarr data " + 'type. For a list of Zarr data types that are compatible with the numpy "Object" ' + "data type, see https://github.com/zarr-developers/zarr-python/issues/3117" + ) + raise ValueError(msg) + matched: list[ZDType[TBaseDType, TBaseScalar]] = [] + for val in registry.values(): + # DataTypeValidationError means "this dtype doesn't match me", which is + # expected and suppressed. Other exceptions (e.g. ValueError for a dtype + # that matches the type but has an invalid configuration) are propagated + # to the caller. + with contextlib.suppress(DataTypeValidationError): + matched.append(val.from_native_dtype(dtype)) + if len(matched) == 1: + return matched[0] + elif len(matched) > 1: + msg = ( + f"Zarr data type resolution from {dtype} failed. " + f"Multiple data type wrappers found that match dtype '{dtype}': {matched}. 
" + "You should unregister one of these data types, or avoid Zarr data type inference " + "entirely by providing a specific Zarr data type when creating your array. " + "For more information, see https://github.com/zarr-developers/zarr-python/issues/3117" + ) + raise ValueError(msg) + raise ValueError(f"No Zarr data type found that matches dtype '{dtype!r}'") + + +def match_json( + data: DTypeJSON, *, zarr_format: ZarrFormat, registry: DataTypeRegistry | None = None +) -> ZDType[TBaseDType, TBaseScalar]: """ + Match a JSON representation of a data type to a registered ZDType. - contents: dict[str, type[ZDType[TBaseDType, TBaseScalar]]] = field( - default_factory=dict, init=False - ) - - _lazy_load_list: list[EntryPoint] = field(default_factory=list, init=False) - - def _lazy_load(self) -> None: - """ - Load all data types from the lazy load list and register them with - the registry. After loading, clear the lazy load list. - """ - for e in self._lazy_load_list: - self.register(e.load()._zarr_v3_name, e.load()) - - self._lazy_load_list.clear() - - def register(self: Self, key: str, cls: type[ZDType[TBaseDType, TBaseScalar]]) -> None: - """ - Register a data type with the registry. - - Parameters - ---------- - key : str - The Zarr V3 name of the data type. - cls : type[ZDType[TBaseDType, TBaseScalar]] - The class of the data type to register. - - Notes - ----- - This method is idempotent. If the data type is already registered, this - method does nothing. - """ - if key not in self.contents or self.contents[key] != cls: - self.contents[key] = cls - - def unregister(self, key: str) -> None: - """ - Unregister a data type from the registry. - - Parameters - ---------- - key : str - The key associated with the ZDType class to be unregistered. - - Returns - ------- - None - - Raises - ------ - KeyError - If the data type is not found in the registry. 
- """ - if key in self.contents: - del self.contents[key] - else: - raise KeyError(f"Data type '{key}' not found in registry.") - - def get(self, key: str) -> type[ZDType[TBaseDType, TBaseScalar]]: - """ - Retrieve a registered ZDType class by its key. - - Parameters - ---------- - key : str - The key associated with the desired ZDType class. - - Returns - ------- - type[ZDType[TBaseDType, TBaseScalar]] - The ZDType class registered under the given key. - - Raises - ------ - KeyError - If the key is not found in the registry. - """ - - return self.contents[key] - - def match_dtype(self, dtype: TBaseDType) -> ZDType[TBaseDType, TBaseScalar]: - """ - Match a native data type, e.g. a NumPy data type, to a registered ZDType. - - Parameters - ---------- - dtype : TBaseDType - The native data type to match. - - Returns - ------- - ZDType[TBaseDType, TBaseScalar] - The matched ZDType corresponding to the provided NumPy data type. - - Raises - ------ - ValueError - If the data type is a NumPy "Object" type, which is ambiguous, or if multiple - or no Zarr data types are found that match the provided dtype. - - Notes - ----- - This function attempts to resolve a Zarr data type from a given native data type. - If the dtype is a NumPy "Object" data type, it raises a ValueError, as this type - can represent multiple Zarr data types. In such cases, a specific Zarr data type - should be explicitly constructed instead of relying on dynamic resolution. - - If multiple matches are found, it will also raise a ValueError. In this case - conflicting data types must be unregistered, or the Zarr data type should be explicitly - constructed. - """ - - if dtype == np.dtype("O"): - msg = ( - f"Zarr data type resolution from {dtype} failed. " - 'Attempted to resolve a zarr data type from a numpy "Object" data type, which is ' - 'ambiguous, as multiple zarr data types can be represented by the numpy "Object" ' - "data type. 
" - "In this case you should construct your array by providing a specific Zarr data " - 'type. For a list of Zarr data types that are compatible with the numpy "Object"' - "data type, see https://github.com/zarr-developers/zarr-python/issues/3117" - ) - raise ValueError(msg) - matched: list[ZDType[TBaseDType, TBaseScalar]] = [] - for val in self.contents.values(): - # DataTypeValidationError means "this dtype doesn't match me", which is - # expected and suppressed. Other exceptions (e.g. ValueError for a dtype - # that matches the type but has an invalid configuration) are propagated - # to the caller. - with contextlib.suppress(DataTypeValidationError): - matched.append(val.from_native_dtype(dtype)) - if len(matched) == 1: - return matched[0] - elif len(matched) > 1: - msg = ( - f"Zarr data type resolution from {dtype} failed. " - f"Multiple data type wrappers found that match dtype '{dtype}': {matched}. " - "You should unregister one of these data types, or avoid Zarr data type inference " - "entirely by providing a specific Zarr data type when creating your array." - "For more information, see https://github.com/zarr-developers/zarr-python/issues/3117" - ) - raise ValueError(msg) - raise ValueError(f"No Zarr data type found that matches dtype '{dtype!r}'") - - def match_json( - self, data: DTypeJSON, *, zarr_format: ZarrFormat - ) -> ZDType[TBaseDType, TBaseScalar]: - """ - Match a JSON representation of a data type to a registered ZDType. - - Parameters - ---------- - data : DTypeJSON - The JSON representation of a data type to match. - zarr_format : ZarrFormat - The Zarr format version to consider when matching data types. - - Returns - ------- - ZDType[TBaseDType, TBaseScalar] - The matched ZDType corresponding to the JSON representation. - - Raises - ------ - ValueError - If no matching Zarr data type is found for the given JSON data. 
- """ - - for val in self.contents.values(): - try: - return val.from_json(data, zarr_format=zarr_format) - except DataTypeValidationError: - pass - raise ValueError(f"No Zarr data type found that matches {data!r}") + Parameters + ---------- + data : DTypeJSON + The JSON representation of a data type to match. + zarr_format : ZarrFormat + The Zarr format version to consider when matching data types. + + Returns + ------- + ZDType[TBaseDType, TBaseScalar] + The matched ZDType corresponding to the JSON representation. + + Raises + ------ + ValueError + If no matching Zarr data type is found for the given JSON data. + """ + if registry is None: + registry = data_type_registry + + for val in registry.values(): + try: + return val.from_json(data, zarr_format=zarr_format) + except DataTypeValidationError: + pass + raise ValueError(f"No Zarr data type found that matches {data!r}") diff --git a/src/zarr/dtype.py b/src/zarr/dtype.py index 0c271b6c90..4df21d8a8a 100644 --- a/src/zarr/dtype.py +++ b/src/zarr/dtype.py @@ -43,6 +43,8 @@ # so it doesn't show up in the docs parse_data_type, # noqa: F401 parse_dtype, + register_data_type, + unregister_data_type, ) from zarr.core.dtype.common import DTypeSpec_V2, check_dtype_spec_v2 @@ -90,6 +92,8 @@ "check_dtype_spec_v2", "data_type_registry", "parse_dtype", + "register_data_type", + "unregister_data_type", ] diff --git a/src/zarr/registry.py b/src/zarr/registry.py index 48f60fabd7..29a3228336 100644 --- a/src/zarr/registry.py +++ b/src/zarr/registry.py @@ -6,7 +6,6 @@ from typing import TYPE_CHECKING, Any from zarr.core.config import BadConfigError, config -from zarr.core.dtype import data_type_registry from zarr.errors import ZarrUserWarning if TYPE_CHECKING: @@ -98,8 +97,8 @@ def _collect_entrypoints() -> list[Registry[Any]]: _ndbuffer_registry.lazy_load_list.extend(entry_points.select(group="zarr.ndbuffer")) _ndbuffer_registry.lazy_load_list.extend(entry_points.select(group="zarr", name="ndbuffer")) - 
data_type_registry._lazy_load_list.extend(entry_points.select(group="zarr.data_type")) - data_type_registry._lazy_load_list.extend(entry_points.select(group="zarr", name="data_type")) + # Data types are loaded eagerly from entry points at import time of zarr.core.dtype + # (see load_data_type_entrypoints), so there is nothing to collect here. _chunk_key_encoding_registry.lazy_load_list.extend( entry_points.select(group="zarr.chunk_key_encoding") diff --git a/tests/test_dtype/conftest.py b/tests/test_dtype/conftest.py index 4c585bfdf6..47d9638b43 100644 --- a/tests/test_dtype/conftest.py +++ b/tests/test_dtype/conftest.py @@ -11,7 +11,7 @@ from zarr.core.dtype.wrapper import ZDType zdtype_examples: tuple[ZDType[Any, Any], ...] = () -for wrapper_cls in data_type_registry.contents.values(): +for wrapper_cls in data_type_registry.values(): if wrapper_cls is Struct: with warnings.catch_warnings(): warnings.simplefilter("ignore") diff --git a/tests/test_dtype_registry.py b/tests/test_dtype_registry.py index f0946014fc..f849b089c7 100644 --- a/tests/test_dtype_registry.py +++ b/tests/test_dtype_registry.py @@ -13,6 +13,11 @@ TBaseDType, TBaseScalar, get_data_type_from_json, + load_data_type_entrypoints, + match_dtype, + match_json, + register_data_type, + unregister_data_type, ) from zarr.core.dtype.common import unpack_dtype_json from zarr.dtype import ( # type: ignore[attr-defined] @@ -20,7 +25,6 @@ FixedLengthUTF32, VariableLengthUTF8, ZDType, - data_type_registry, parse_data_type, parse_dtype, ) @@ -33,7 +37,7 @@ @pytest.fixture def data_type_registry_fixture() -> DataTypeRegistry: - return DataTypeRegistry() + return {} class TestRegistry: @@ -42,23 +46,25 @@ def test_register(data_type_registry_fixture: DataTypeRegistry) -> None: """ Test that registering a dtype in a data type registry works. 
""" - data_type_registry_fixture.register(Bool._zarr_v3_name, Bool) - assert data_type_registry_fixture.get(Bool._zarr_v3_name) == Bool - assert isinstance(data_type_registry_fixture.match_dtype(np.dtype("bool")), Bool) + register_data_type(Bool, registry=data_type_registry_fixture) + assert data_type_registry_fixture[Bool._zarr_v3_name] == Bool + assert isinstance(match_dtype(np.dtype("bool"), registry=data_type_registry_fixture), Bool) @staticmethod def test_override(data_type_registry_fixture: DataTypeRegistry) -> None: """ Test that registering a new dtype with the same name works (overriding the previous one). """ - data_type_registry_fixture.register(Bool._zarr_v3_name, Bool) + register_data_type(Bool, registry=data_type_registry_fixture) class NewBool(Bool): def default_scalar(self) -> np.bool_: return np.True_ - data_type_registry_fixture.register(NewBool._zarr_v3_name, NewBool) - assert isinstance(data_type_registry_fixture.match_dtype(np.dtype("bool")), NewBool) + register_data_type(NewBool, registry=data_type_registry_fixture) + assert isinstance( + match_dtype(np.dtype("bool"), registry=data_type_registry_fixture), NewBool + ) @staticmethod @pytest.mark.parametrize( @@ -72,17 +78,19 @@ def test_match_dtype( """ Test that match_dtype resolves a numpy dtype into an instance of the correspond wrapper for that dtype. 
""" - data_type_registry_fixture.register(wrapper_cls._zarr_v3_name, wrapper_cls) - assert isinstance(data_type_registry_fixture.match_dtype(np.dtype(dtype_str)), wrapper_cls) + register_data_type(wrapper_cls, registry=data_type_registry_fixture) + assert isinstance( + match_dtype(np.dtype(dtype_str), registry=data_type_registry_fixture), wrapper_cls + ) @staticmethod def test_match_dtype_string_na_object_error( data_type_registry_fixture: DataTypeRegistry, ) -> None: - data_type_registry_fixture.register(VariableLengthUTF8._zarr_v3_name, VariableLengthUTF8) # type: ignore[arg-type] + register_data_type(VariableLengthUTF8, registry=data_type_registry_fixture) # type: ignore[arg-type] dtype: np.dtype[Any] = np.dtypes.StringDType(na_object=None) with pytest.raises(ValueError, match=r"Zarr data type resolution from StringDType.*failed"): - data_type_registry_fixture.match_dtype(dtype) + match_dtype(dtype, registry=data_type_registry_fixture) @staticmethod def test_unregistered_dtype(data_type_registry_fixture: DataTypeRegistry) -> None: @@ -93,10 +101,10 @@ def test_unregistered_dtype(data_type_registry_fixture: DataTypeRegistry) -> Non outside_dtype = np.dtype(outside_dtype_name) msg = f"No Zarr data type found that matches dtype '{outside_dtype!r}'" with pytest.raises(ValueError, match=re.escape(msg)): - data_type_registry_fixture.match_dtype(outside_dtype) + match_dtype(outside_dtype, registry=data_type_registry_fixture) with pytest.raises(KeyError): - data_type_registry_fixture.get(outside_dtype_name) + data_type_registry_fixture[outside_dtype_name] @staticmethod @pytest.mark.filterwarnings("ignore::zarr.core.dtype.common.UnstableSpecificationWarning") @@ -106,7 +114,7 @@ def test_registered_dtypes_match_dtype(zdtype: ZDType[TBaseDType, TBaseScalar]) Test that the registered dtypes can be retrieved from the registry. 
""" skip_object_dtype(zdtype) - assert data_type_registry.match_dtype(zdtype.to_native_dtype()) == zdtype + assert match_dtype(zdtype.to_native_dtype()) == zdtype @staticmethod @pytest.mark.filterwarnings("ignore::zarr.core.dtype.common.UnstableSpecificationWarning") @@ -115,10 +123,7 @@ def test_registered_dtypes_match_json( zdtype: ZDType[TBaseDType, TBaseScalar], zarr_format: ZarrFormat ) -> None: assert ( - data_type_registry.match_json( - zdtype.to_json(zarr_format=zarr_format), zarr_format=zarr_format - ) - == zdtype + match_json(zdtype.to_json(zarr_format=zarr_format), zarr_format=zarr_format) == zdtype ) @staticmethod @@ -137,29 +142,29 @@ def test_match_dtype_unique( skip_object_dtype(zdtype) for _cls in get_args(AnyDType): if _cls is not type(zdtype): - data_type_registry_fixture.register(_cls._zarr_v3_name, _cls) + register_data_type(_cls, registry=data_type_registry_fixture) dtype_instance = zdtype.to_native_dtype() msg = f"No Zarr data type found that matches dtype '{dtype_instance!r}'" with pytest.raises(ValueError, match=re.escape(msg)): - data_type_registry_fixture.match_dtype(dtype_instance) + match_dtype(dtype_instance, registry=data_type_registry_fixture) instance_dict = zdtype.to_json(zarr_format=zarr_format) msg = f"No Zarr data type found that matches {instance_dict!r}" with pytest.raises(ValueError, match=re.escape(msg)): - data_type_registry_fixture.match_json(instance_dict, zarr_format=zarr_format) + match_json(instance_dict, zarr_format=zarr_format, registry=data_type_registry_fixture) @pytest.mark.usefixtures("set_path") def test_entrypoint_dtype(zarr_format: ZarrFormat) -> None: from package_with_entrypoint import TestDataType - data_type_registry._lazy_load() + load_data_type_entrypoints() instance = TestDataType() dtype_json = instance.to_json(zarr_format=zarr_format) assert get_data_type_from_json(dtype_json, zarr_format=zarr_format) == instance - data_type_registry.unregister(TestDataType._zarr_v3_name) + 
unregister_data_type(TestDataType._zarr_v3_name) @pytest.mark.filterwarnings("ignore::zarr.core.dtype.common.UnstableSpecificationWarning") From 2779575ecce1d4c08945273da84d7a9a15cad95b Mon Sep 17 00:00:00 2001 From: d-v-b-agent Date: Fri, 26 Jun 2026 15:40:07 +0000 Subject: [PATCH 2/7] Provide generic dtype-JSON defaults on the ZDType base Make `to_json`, `_from_json_v2`, and `_from_json_v3` concrete on the ZDType base class instead of abstract. The defaults handle parameter-free data types: the Zarr V3 form is just the data type name, and the Zarr V2 form is that name plus an optional `object_codec_id` (a new base class var, default None, for object-backed types like variable-length strings). The V2 name defaults to the V3 name, which is what custom data types want. Built-in data types keep their existing NumPy-typestring-based overrides, so their behavior is unchanged. The win is for custom data types: a parameter-free dtype now inherits all dtype-JSON handling and no longer needs to implement `_check_json_v2`, `_from_json_v2`, `_from_json_v3`, or `to_json`. The custom_dtype example shrinks by ~90 lines accordingly. 
Co-Authored-By: Claude Opus 4.8 (1M context) Claude-Session: https://claude.ai/code/session_01XZHYrRBh54e7tFearnZ72r --- examples/custom_dtype/custom_dtype.py | 99 +++------------------------ src/zarr/core/dtype/wrapper.py | 50 ++++++++++++-- 2 files changed, 52 insertions(+), 97 deletions(-) diff --git a/examples/custom_dtype/custom_dtype.py b/examples/custom_dtype/custom_dtype.py index 53acb70f52..854f02f62b 100644 --- a/examples/custom_dtype/custom_dtype.py +++ b/examples/custom_dtype/custom_dtype.py @@ -15,16 +15,16 @@ import json import sys from pathlib import Path -from typing import ClassVar, Literal, Self, TypeGuard, overload +from typing import ClassVar, Literal, Self, TypeGuard import ml_dtypes # necessary to add extra dtypes to NumPy import numpy as np import pytest import zarr -from zarr.dtype import ZDType, check_dtype_spec_v2, data_type_registry +from zarr.dtype import ZDType, register_data_type from zarr.errors import DataTypeValidationError -from zarr.types import JSON, DTypeConfig_V2, DTypeJSON, ZarrFormat +from zarr.types import JSON, ZarrFormat # This is the int2 array data type int2_dtype_cls = type(np.dtype("int2")) @@ -39,11 +39,11 @@ class Int2(ZDType[int2_dtype_cls, int2_scalar_cls]): NumPy array of type int2) and the int2 scalar type (the ``dtype`` of the scalar value inside an int2 array). """ - # This field is as the key for the data type in the internal data type registry, and also - # as the identifier for the data type when serializaing the data type to disk for zarr v3 + # This is the key for the data type in the internal data type registry, and also the identifier + # for the data type when serializing it to disk. For a parameter-free data type like this one, + # ZDType uses it as the entire Zarr V3 representation and as the Zarr V2 ``name`` -- so we don't + # need to write any JSON (de)serialization for the data type itself; the base class handles it. 
_zarr_v3_name: ClassVar[Literal["int2"]] = "int2" - # this field will be used internally - _zarr_v2_name: ClassVar[Literal["int2"]] = "int2" # we bind a class variable to the native data type class so we can create instances of it dtype_cls = int2_dtype_cls @@ -61,89 +61,6 @@ def to_native_dtype(self: Self) -> int2_dtype_cls: """Create an int2 dtype instance from this ZDType""" return self.dtype_cls() - @classmethod - def _check_json_v2(cls, data: DTypeJSON) -> TypeGuard[DTypeConfig_V2[Literal["|b1"], None]]: - """ - Type check for Zarr v2-flavored JSON. - - This will check that the input is a dict like this: - .. code-block:: json - - { - "name": "int2", - "object_codec_id": None - } - - Note that this representation differs from the ``dtype`` field looks like in zarr v2 metadata. - Specifically, whatever goes into the ``dtype`` field in metadata is assigned to the ``name`` field here. - - See the Zarr docs for more information about the JSON encoding for data types. - """ - return ( - check_dtype_spec_v2(data) and data["name"] == "int2" and data["object_codec_id"] is None - ) - - @classmethod - def _check_json_v3(cls, data: DTypeJSON) -> TypeGuard[Literal["int2"]]: - """ - Type check for Zarr V3-flavored JSON. - - Checks that the input is the string "int2". - """ - return data == cls._zarr_v3_name - - @classmethod - def _from_json_v2(cls, data: DTypeJSON) -> Self: - """ - Create an instance of this ZDType from Zarr V3-flavored JSON. - """ - if cls._check_json_v2(data): - return cls() - # This first does a type check on the input, and if that passes we create an instance of the ZDType. - msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected the string {cls._zarr_v2_name!r}" - raise DataTypeValidationError(msg) - - @classmethod - def _from_json_v3(cls: type[Self], data: DTypeJSON) -> Self: - """ - Create an instance of this ZDType from Zarr V3-flavored JSON. 
- - This first does a type check on the input, and if that passes we create an instance of the ZDType. - """ - if cls._check_json_v3(data): - return cls() - msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected the string {cls._zarr_v3_name!r}" - raise DataTypeValidationError(msg) - - @overload - def to_json(self, zarr_format: Literal[2]) -> DTypeConfig_V2[Literal["int2"], None]: ... - - @overload - def to_json(self, zarr_format: Literal[3]) -> Literal["int2"]: ... - - def to_json( - self, zarr_format: ZarrFormat - ) -> DTypeConfig_V2[Literal["int2"], None] | Literal["int2"]: - """ - Serialize this ZDType to v2- or v3-flavored JSON - - If the zarr_format is 2, then return a dict like this: - .. code-block:: json - - { - "name": "int2", - "object_codec_id": None - } - - If the zarr_format is 3, then return the string "int2" - - """ - if zarr_format == 2: - return {"name": "int2", "object_codec_id": None} - if zarr_format == 3: - return self._zarr_v3_name - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - def _check_scalar(self, data: object) -> TypeGuard[int | ml_dtypes.int2]: """ Check if a python object is a valid int2-compatible scalar @@ -209,7 +126,7 @@ def from_json_scalar(self, data: JSON, *, zarr_format: ZarrFormat) -> ml_dtypes. 
# after defining dtype class, it must be registered with the data type registry so zarr can use it -data_type_registry.register(Int2._zarr_v3_name, Int2) +register_data_type(Int2) # this parametrized function will create arrays in zarr v2 and v3 using our new data type diff --git a/src/zarr/core/dtype/wrapper.py b/src/zarr/core/dtype/wrapper.py index 42d5d88473..4859f402d9 100644 --- a/src/zarr/core/dtype/wrapper.py +++ b/src/zarr/core/dtype/wrapper.py @@ -36,6 +36,9 @@ import numpy as np +from zarr.core.dtype.common import check_dtype_spec_v2 +from zarr.errors import DataTypeValidationError + if TYPE_CHECKING: from zarr.core.common import JSON, ZarrFormat from zarr.core.dtype.common import DTypeJSON, DTypeSpec_V2, DTypeSpec_V3 @@ -65,6 +68,10 @@ class variable, and it should generally be unique across different data types. # this class will create a native data type dtype_cls: ClassVar[type[TBaseDType]] _zarr_v3_name: ClassVar[str] + # For data types backed by the NumPy object dtype in Zarr V2 (e.g. variable-length strings), + # this is the id of the object codec used to encode values, e.g. "vlen-utf8". For all other + # data types it is None. It is used by the default Zarr V2 (de)serialization below. + object_codec_id: ClassVar[str | None] = None @classmethod def _check_native_dtype(cls: type[Self], dtype: TBaseDType) -> TypeGuard[DType]: @@ -124,14 +131,37 @@ def to_native_dtype(self: Self) -> DType: raise NotImplementedError # pragma: no cover @classmethod - @abstractmethod def _from_json_v2(cls: type[Self], data: DTypeJSON) -> Self: - raise NotImplementedError # pragma: no cover + """ + Construct an instance from the Zarr V2 JSON form of this data type. + + The default implementation handles parameter-free data types whose Zarr V2 ``name`` equals + their Zarr V3 name (the common case for custom data types). Data types that are + parametrized, or that use a NumPy type string as their Zarr V2 name, should override this. 
+ """ + if ( + check_dtype_spec_v2(data) + and data["name"] == cls._zarr_v3_name + and data["object_codec_id"] == cls.object_codec_id + ): + return cls() + raise DataTypeValidationError( + f"Invalid Zarr V2 JSON representation of {cls.__name__}: {data!r}" + ) @classmethod - @abstractmethod def _from_json_v3(cls: type[Self], data: DTypeJSON) -> Self: - raise NotImplementedError # pragma: no cover + """ + Construct an instance from the Zarr V3 JSON form of this data type. + + The default implementation handles parameter-free data types, whose Zarr V3 form is just + the data type name. Data types with a ``configuration`` object should override this. + """ + if data == cls._zarr_v3_name: + return cls() + raise DataTypeValidationError( + f"Invalid Zarr V3 JSON representation of {cls.__name__}: {data!r}" + ) @classmethod def from_json(cls: type[Self], data: DTypeJSON, *, zarr_format: ZarrFormat) -> Self: @@ -163,11 +193,15 @@ def to_json(self, zarr_format: Literal[2]) -> DTypeSpec_V2: ... @overload def to_json(self, zarr_format: Literal[3]) -> DTypeSpec_V3: ... - @abstractmethod def to_json(self, zarr_format: ZarrFormat) -> DTypeSpec_V2 | DTypeSpec_V3: """ Serialize this ZDType to JSON. + The default implementation handles parameter-free data types: in Zarr V3 the representation + is just the data type name, and in Zarr V2 it is that name plus an optional object codec id. + Data types with a ``configuration`` (Zarr V3), or that use a NumPy type string as their + Zarr V2 name, should override this. 
+ Parameters ---------- zarr_format : ZarrFormat @@ -178,7 +212,11 @@ def to_json(self, zarr_format: ZarrFormat) -> DTypeSpec_V2 | DTypeSpec_V3: DTypeJSON_V2 | DTypeJSON_V3 The JSON-serializable representation of the wrapped data type """ - raise NotImplementedError # pragma: no cover + if zarr_format == 2: + return {"name": self._zarr_v3_name, "object_codec_id": self.object_codec_id} + if zarr_format == 3: + return self._zarr_v3_name + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover @abstractmethod def _check_scalar(self, data: object) -> bool: From bb392662fdc947781eaac9cdb7b6f81215e7e92c Mon Sep 17 00:00:00 2001 From: d-v-b-agent Date: Fri, 26 Jun 2026 15:58:43 +0000 Subject: [PATCH 3/7] Migrate NumPy-native scalar dtypes onto shared JSON handling Split the ZDType base `to_json` into `_to_json_v2`/`_to_json_v3` hooks, and add a `NumpyNativeDTypeV2` mixin for data types whose Zarr V2 name is the NumPy type string of the wrapped dtype (which fully determines the dtype, including byte order). The bool, integer, float, and complex data types now inherit V2 (de)serialization from the mixin and V3 from the base, instead of each concrete class repeating ~50 lines of `_from_json_v2`/`_from_json_v3`/ `to_json`/`_check_json_*` boilerplate plus a `_zarr_v2_names` table. Behavior is byte-identical: `to_native_dtype().str` reproduces each type's existing V2 name (e.g. 
" Claude-Session: https://claude.ai/code/session_01XZHYrRBh54e7tFearnZ72r --- src/zarr/core/dtype/npy/bool.py | 135 +----- src/zarr/core/dtype/npy/common.py | 40 ++ src/zarr/core/dtype/npy/complex.py | 150 +----- src/zarr/core/dtype/npy/float.py | 128 +---- src/zarr/core/dtype/npy/int.py | 742 +---------------------------- src/zarr/core/dtype/wrapper.py | 16 +- 6 files changed, 68 insertions(+), 1143 deletions(-) diff --git a/src/zarr/core/dtype/npy/bool.py b/src/zarr/core/dtype/npy/bool.py index f92476a455..8dd3c07bdd 100644 --- a/src/zarr/core/dtype/npy/bool.py +++ b/src/zarr/core/dtype/npy/bool.py @@ -1,25 +1,23 @@ from __future__ import annotations from dataclasses import dataclass -from typing import TYPE_CHECKING, ClassVar, Literal, Self, TypeGuard, overload +from typing import TYPE_CHECKING, ClassVar, Literal, Self import numpy as np from zarr.core.dtype.common import ( - DTypeConfig_V2, - DTypeJSON, HasItemSize, - check_dtype_spec_v2, ) -from zarr.core.dtype.wrapper import TBaseDType, ZDType +from zarr.core.dtype.npy.common import NumpyNativeDTypeV2 from zarr.errors import DataTypeValidationError if TYPE_CHECKING: from zarr.core.common import JSON, ZarrFormat + from zarr.core.dtype.wrapper import TBaseDType @dataclass(frozen=True, kw_only=True, slots=True) -class Bool(ZDType[np.dtypes.BoolDType, np.bool_], HasItemSize): +class Bool(NumpyNativeDTypeV2[np.dtypes.BoolDType, np.bool_], HasItemSize): """ A Zarr data type for arrays containing booleans. 
@@ -45,7 +43,6 @@ class Bool(ZDType[np.dtypes.BoolDType, np.bool_], HasItemSize): """ _zarr_v3_name: ClassVar[Literal["bool"]] = "bool" - _zarr_v2_name: ClassVar[Literal["|b1"]] = "|b1" dtype_cls = np.dtypes.BoolDType @classmethod @@ -85,130 +82,6 @@ def to_native_dtype(self: Self) -> np.dtypes.BoolDType: """ return self.dtype_cls() - @classmethod - def _check_json_v2( - cls, - data: DTypeJSON, - ) -> TypeGuard[DTypeConfig_V2[Literal["|b1"], None]]: - """ - Check that the input is a valid JSON representation of a Bool. - - Parameters - ---------- - data : DTypeJSON - The JSON data to check. - - Returns - ------- - ``TypeGuard[DTypeConfig_V2[Literal["|b1"], None]]`` - True if the input is a valid JSON representation, False otherwise. - """ - return ( - check_dtype_spec_v2(data) - and data["name"] == cls._zarr_v2_name - and data["object_codec_id"] is None - ) - - @classmethod - def _check_json_v3(cls, data: DTypeJSON) -> TypeGuard[Literal["bool"]]: - """ - Check that the input is a valid JSON representation of this class in Zarr V3. - - Parameters - ---------- - data : DTypeJSON - The JSON data to check. - - Returns - ------- - bool - True if the input is a valid JSON representation, False otherwise. - """ - return data == cls._zarr_v3_name - - @classmethod - def _from_json_v2(cls, data: DTypeJSON) -> Self: - """ - Create an instance of Bool from Zarr V2-flavored JSON. - - Parameters - ---------- - data : DTypeJSON - The JSON data. - - Returns - ------- - Bool - An instance of Bool. - - Raises - ------ - DataTypeValidationError - If the input JSON is not a valid representation of this class. - """ - if cls._check_json_v2(data): - return cls() - msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected the string {cls._zarr_v2_name!r}" - raise DataTypeValidationError(msg) - - @classmethod - def _from_json_v3(cls: type[Self], data: DTypeJSON) -> Self: - """ - Create an instance of Bool from Zarr V3-flavored JSON. 
- - Parameters - ---------- - data : DTypeJSON - The JSON data. - - Returns - ------- - Bool - An instance of Bool. - - Raises - ------ - DataTypeValidationError - If the input JSON is not a valid representation of this class. - """ - if cls._check_json_v3(data): - return cls() - msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected the string {cls._zarr_v3_name!r}" - raise DataTypeValidationError(msg) - - @overload - def to_json(self, zarr_format: Literal[2]) -> DTypeConfig_V2[Literal["|b1"], None]: ... - - @overload - def to_json(self, zarr_format: Literal[3]) -> Literal["bool"]: ... - - def to_json( - self, zarr_format: ZarrFormat - ) -> DTypeConfig_V2[Literal["|b1"], None] | Literal["bool"]: - """ - Serialize this Bool instance to JSON. - - Parameters - ---------- - zarr_format : ZarrFormat - The Zarr format version (2 or 3). - - Returns - ------- - ``DTypeConfig_V2[Literal["|b1"], None] | Literal["bool"]`` - The JSON representation of the Bool instance. - - Raises - ------ - ValueError - If the zarr_format is not 2 or 3. - """ - if zarr_format == 2: - return {"name": self._zarr_v2_name, "object_codec_id": None} - elif zarr_format == 3: - return self._zarr_v3_name - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - def _check_scalar(self, data: object) -> bool: """ Check if the input can be cast to a boolean scalar. 
diff --git a/src/zarr/core/dtype/npy/common.py b/src/zarr/core/dtype/npy/common.py index f413f5f678..2e19a6a04e 100644 --- a/src/zarr/core/dtype/npy/common.py +++ b/src/zarr/core/dtype/npy/common.py @@ -10,6 +10,7 @@ Final, Literal, NewType, + Self, SupportsComplex, SupportsFloat, SupportsIndex, @@ -25,10 +26,49 @@ EndiannessStr, JSONFloatV2, JSONFloatV3, + check_dtype_spec_v2, ) +from zarr.core.dtype.wrapper import TBaseDType, TBaseScalar, ZDType +from zarr.errors import DataTypeValidationError if TYPE_CHECKING: from zarr.core.common import JSON, ZarrFormat + from zarr.core.dtype.common import DTypeJSON, DTypeSpec_V2 + + +class NumpyNativeDTypeV2[DType: TBaseDType, Scalar: TBaseScalar](ZDType[DType, Scalar]): + """ + Mixin for data types whose Zarr V2 representation is just the NumPy type string of the + wrapped dtype, e.g. ``" Self: + # These data types always have a string NumPy type string as their V2 name (a structured + # name, which is a sequence, can never match a scalar NumPy-native type). + if ( + check_dtype_spec_v2(data) + and isinstance(data["name"], str) + and data["object_codec_id"] is None + ): + try: + native_dtype = np.dtype(data["name"]) + except TypeError: + pass + else: + # from_native_dtype validates the dtype class (and byte order) and raises + # DataTypeValidationError if it does not match this data type. 
+ return cls.from_native_dtype(native_dtype) + raise DataTypeValidationError( + f"Invalid Zarr V2 JSON representation of {cls.__name__}: {data!r}" + ) + + def _to_json_v2(self) -> DTypeSpec_V2: + return {"name": self.to_native_dtype().str, "object_codec_id": None} + IntLike = SupportsInt | SupportsIndex | bytes | str FloatLike = SupportsIndex | SupportsFloat | bytes | str diff --git a/src/zarr/core/dtype/npy/complex.py b/src/zarr/core/dtype/npy/complex.py index 0286d42380..aec6d65d97 100644 --- a/src/zarr/core/dtype/npy/complex.py +++ b/src/zarr/core/dtype/npy/complex.py @@ -7,20 +7,17 @@ Literal, Self, TypeGuard, - overload, ) import numpy as np from zarr.core.dtype.common import ( - DTypeConfig_V2, - DTypeJSON, HasEndianness, HasItemSize, - check_dtype_spec_v2, ) from zarr.core.dtype.npy.common import ( ComplexLike, + NumpyNativeDTypeV2, check_json_complex_float_v2, check_json_complex_float_v3, complex_float_from_json_v2, @@ -30,25 +27,22 @@ endianness_to_numpy_str, get_endianness_from_numpy_dtype, ) -from zarr.core.dtype.wrapper import TBaseDType, ZDType from zarr.errors import DataTypeValidationError if TYPE_CHECKING: from zarr.core.common import JSON, ZarrFormat + from zarr.core.dtype.wrapper import TBaseDType @dataclass(frozen=True) class BaseComplex[ DType: np.dtypes.Complex64DType | np.dtypes.Complex128DType, Scalar: np.complex64 | np.complex128, -](ZDType[DType, Scalar], HasEndianness, HasItemSize): +](NumpyNativeDTypeV2[DType, Scalar], HasEndianness, HasItemSize): """ A base class for Zarr data types that wrap NumPy complex float data types. 
""" - # This attribute holds the possible zarr v2 JSON names for the data type - _zarr_v2_names: ClassVar[tuple[str, ...]] - @classmethod def from_native_dtype(cls, dtype: TBaseDType) -> Self: """ @@ -88,138 +82,6 @@ def to_native_dtype(self) -> DType: byte_order = endianness_to_numpy_str(self.endianness) return self.dtype_cls().newbyteorder(byte_order) # type: ignore[no-any-return,call-overload] - @classmethod - def _check_json_v2(cls, data: DTypeJSON) -> TypeGuard[DTypeConfig_V2[str, None]]: - """ - Check that the input is a valid JSON representation of this data type. - - The input data must be a mapping that contains a "name" key that is one of - the strings from cls._zarr_v2_names and an "object_codec_id" key that is None. - - Parameters - ---------- - data : DTypeJSON - The JSON data to check. - - Returns - ------- - bool - True if the input is a valid JSON representation, False otherwise. - """ - return ( - check_dtype_spec_v2(data) - and data["name"] in cls._zarr_v2_names - and data["object_codec_id"] is None - ) - - @classmethod - def _check_json_v3(cls, data: DTypeJSON) -> TypeGuard[str]: - """ - Check that the input is a valid JSON representation of this data type in Zarr V3. - - This method verifies that the provided data matches the expected Zarr V3 - representation, which is the string specified by the class-level attribute _zarr_v3_name. - - Parameters - ---------- - data : DTypeJSON - The JSON data to check. - - Returns - ------- - TypeGuard[str] - True if the input is a valid representation of this class in Zarr V3, False otherwise. - """ - - return data == cls._zarr_v3_name - - @classmethod - def _from_json_v2(cls, data: DTypeJSON) -> Self: - """ - Create an instance of this class from Zarr V2-flavored JSON. - - Parameters - ---------- - data : DTypeJSON - The JSON data. - - Returns - ------- - Self - An instance of this class. - - Raises - ------ - DataTypeValidationError - If the input JSON is not a valid representation of this class. 
- """ - if cls._check_json_v2(data): - # Going via numpy ensures that we get the endianness correct without - # annoying string parsing. - name = data["name"] - return cls.from_native_dtype(np.dtype(name)) - msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected one of the strings {cls._zarr_v2_names}." - raise DataTypeValidationError(msg) - - @classmethod - def _from_json_v3(cls, data: DTypeJSON) -> Self: - """ - Create an instance of this class from Zarr V3-flavored JSON. - - Parameters - ---------- - data : DTypeJSON - The JSON data. - - Returns - ------- - Self - An instance of this data type. - - Raises - ------ - DataTypeValidationError - If the input JSON is not a valid representation of this class. - """ - if cls._check_json_v3(data): - return cls() - msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected {cls._zarr_v3_name}." - raise DataTypeValidationError(msg) - - @overload - def to_json(self, zarr_format: Literal[2]) -> DTypeConfig_V2[str, None]: ... - - @overload - def to_json(self, zarr_format: Literal[3]) -> str: ... - - def to_json(self, zarr_format: ZarrFormat) -> DTypeConfig_V2[str, None] | str: - """ - Serialize this object to a JSON-serializable representation. - - Parameters - ---------- - zarr_format : ZarrFormat - The Zarr format version. Supported values are 2 and 3. - - Returns - ------- - DTypeConfig_V2[str, None] | str - If ``zarr_format`` is 2, a dictionary with ``"name"`` and ``"object_codec_id"`` keys is - returned. - If ``zarr_format`` is 3, a string representation of the complex data type is returned. - - Raises - ------ - ValueError - If `zarr_format` is not 2 or 3. 
- """ - - if zarr_format == 2: - return {"name": self.to_native_dtype().str, "object_codec_id": None} - elif zarr_format == 3: - return self._zarr_v3_name - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - def _check_scalar(self, data: object) -> TypeGuard[ComplexLike]: """ Check that the input is a scalar complex value. @@ -363,13 +225,10 @@ class Complex64(BaseComplex[np.dtypes.Complex64DType, np.complex64]): The numpy dtype class for this data type. _zarr_v3_name : ClassVar[Literal["complex64"]] The name of this data type in Zarr V3. - _zarr_v2_names : ClassVar[tuple[Literal[">c8"], Literal["c8"], Literal["c8", " int: @@ -398,13 +257,10 @@ class Complex128(BaseComplex[np.dtypes.Complex128DType, np.complex128], HasEndia The numpy dtype class for this data type. _zarr_v3_name : ClassVar[Literal["complex128"]] The name of this data type in Zarr V3. - _zarr_v2_names : ClassVar[tuple[Literal[">c16"], Literal["c16"], Literal["c16", " int: diff --git a/src/zarr/core/dtype/npy/float.py b/src/zarr/core/dtype/npy/float.py index d041416b81..89dc912505 100644 --- a/src/zarr/core/dtype/npy/float.py +++ b/src/zarr/core/dtype/npy/float.py @@ -1,19 +1,17 @@ from __future__ import annotations from dataclasses import dataclass -from typing import TYPE_CHECKING, ClassVar, Literal, Self, TypeGuard, overload +from typing import TYPE_CHECKING, Self, TypeGuard import numpy as np from zarr.core.dtype.common import ( - DTypeConfig_V2, - DTypeJSON, HasEndianness, HasItemSize, - check_dtype_spec_v2, ) from zarr.core.dtype.npy.common import ( FloatLike, + NumpyNativeDTypeV2, check_json_float_v2, check_json_float_v3, check_json_floatish_str, @@ -24,25 +22,22 @@ float_to_json_v3, get_endianness_from_numpy_dtype, ) -from zarr.core.dtype.wrapper import TBaseDType, ZDType from zarr.errors import DataTypeValidationError if TYPE_CHECKING: from zarr.core.common import JSON, ZarrFormat + from zarr.core.dtype.wrapper import TBaseDType @dataclass(frozen=True) 
class BaseFloat[ DType: np.dtypes.Float16DType | np.dtypes.Float32DType | np.dtypes.Float64DType, Scalar: np.float16 | np.float32 | np.float64, -](ZDType[DType, Scalar], HasEndianness, HasItemSize): +](NumpyNativeDTypeV2[DType, Scalar], HasEndianness, HasItemSize): """ A base class for Zarr data types that wrap NumPy float data types. """ - # This attribute holds the possible zarr v2 JSON names for the data type - _zarr_v2_names: ClassVar[tuple[str, ...]] - @classmethod def from_native_dtype(cls, dtype: TBaseDType) -> Self: """ @@ -76,118 +71,6 @@ def to_native_dtype(self) -> DType: byte_order = endianness_to_numpy_str(self.endianness) return self.dtype_cls().newbyteorder(byte_order) # type: ignore[no-any-return,call-overload] - @classmethod - def _check_json_v2(cls, data: DTypeJSON) -> TypeGuard[DTypeConfig_V2[str, None]]: - """ - Check that the input is a valid JSON representation of this data type. - - Parameters - ---------- - data : DTypeJSON - The JSON data. - - Returns - ------- - TypeGuard[DTypeConfig_V2[str, None]] - True if the input is a valid JSON representation of this data type, False otherwise. - """ - return ( - check_dtype_spec_v2(data) - and data["name"] in cls._zarr_v2_names - and data["object_codec_id"] is None - ) - - @classmethod - def _check_json_v3(cls, data: DTypeJSON) -> TypeGuard[str]: - """ - Check that the input is a valid JSON representation of this class in Zarr V3. - - Parameters - ---------- - data : DTypeJSON - The JSON data. - - Returns - ------- - TypeGuard[str] - True if the input is a valid JSON representation of this class, False otherwise. - """ - return data == cls._zarr_v3_name - - @classmethod - def _from_json_v2(cls, data: DTypeJSON) -> Self: - """ - Create an instance of this ZDType from Zarr v2-flavored JSON. - - Parameters - ---------- - data : DTypeJSON - The JSON data. - - Returns - ------- - Self - An instance of this data type. 
- """ - if cls._check_json_v2(data): - # Going via NumPy ensures that we get the endianness correct without - # annoying string parsing. - name = data["name"] - return cls.from_native_dtype(np.dtype(name)) - msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected one of the strings {cls._zarr_v2_names}." - raise DataTypeValidationError(msg) - - @classmethod - def _from_json_v3(cls, data: DTypeJSON) -> Self: - """ - Create an instance of this ZDType from Zarr v3-flavored JSON. - - Parameters - ---------- - data : DTypeJSON - The JSON data. - - Returns - ------- - Self - An instance of this data type. - """ - if cls._check_json_v3(data): - return cls() - msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected {cls._zarr_v3_name}." - raise DataTypeValidationError(msg) - - @overload - def to_json(self, zarr_format: Literal[2]) -> DTypeConfig_V2[str, None]: ... - - @overload - def to_json(self, zarr_format: Literal[3]) -> str: ... - - def to_json(self, zarr_format: ZarrFormat) -> DTypeConfig_V2[str, None] | str: - """ - Convert the wrapped data type to a JSON-serializable form. - - Parameters - ---------- - zarr_format : ZarrFormat - The zarr format version. - - Returns - ------- - DTypeConfig_V2[str, None] or str - The JSON-serializable representation of the wrapped data type. - - Raises - ------ - ValueError - If zarr_format is not 2 or 3. - """ - if zarr_format == 2: - return {"name": self.to_native_dtype().str, "object_codec_id": None} - elif zarr_format == 3: - return self._zarr_v3_name - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - def _check_scalar(self, data: object) -> TypeGuard[FloatLike]: """ Check that the input is a valid scalar value. 
@@ -347,7 +230,6 @@ class Float16(BaseFloat[np.dtypes.Float16DType, np.float16]): dtype_cls = np.dtypes.Float16DType _zarr_v3_name = "float16" - _zarr_v2_names: ClassVar[tuple[Literal[">f2"], Literal["f2", " int: @@ -384,7 +266,6 @@ class Float32(BaseFloat[np.dtypes.Float32DType, np.float32]): dtype_cls = np.dtypes.Float32DType _zarr_v3_name = "float32" - _zarr_v2_names: ClassVar[tuple[Literal[">f4"], Literal["f4", " int: @@ -421,7 +302,6 @@ class Float64(BaseFloat[np.dtypes.Float64DType, np.float64]): dtype_cls = np.dtypes.Float64DType _zarr_v3_name = "float64" - _zarr_v2_names: ClassVar[tuple[Literal[">f8"], Literal["f8", " int: diff --git a/src/zarr/core/dtype/npy/int.py b/src/zarr/core/dtype/npy/int.py index c18fd01dd8..c76d0369d4 100644 --- a/src/zarr/core/dtype/npy/int.py +++ b/src/zarr/core/dtype/npy/int.py @@ -9,30 +9,27 @@ SupportsIndex, SupportsInt, TypeGuard, - overload, ) import numpy as np from zarr.core.dtype.common import ( - DTypeConfig_V2, - DTypeJSON, HasEndianness, HasItemSize, - check_dtype_spec_v2, ) from zarr.core.dtype.npy.common import ( + NumpyNativeDTypeV2, check_json_int, check_json_intish_float, check_json_intish_str, endianness_to_numpy_str, get_endianness_from_numpy_dtype, ) -from zarr.core.dtype.wrapper import TBaseDType, ZDType from zarr.errors import DataTypeValidationError if TYPE_CHECKING: from zarr.core.common import JSON, ZarrFormat + from zarr.core.dtype.wrapper import TBaseDType _NumpyIntDType = ( np.dtypes.Int8DType @@ -55,7 +52,7 @@ class BaseInt[ DType: _NumpyIntDType, Scalar: np.int8 | np.int16 | np.int32 | np.int64 | np.uint8 | np.uint16 | np.uint32 | np.uint64, -](ZDType[DType, Scalar], HasItemSize): +](NumpyNativeDTypeV2[DType, Scalar], HasItemSize): """ A base class for integer data types in Zarr. @@ -63,53 +60,6 @@ class BaseInt[ in both Zarr v2 and v3 formats, as well as methods for checking and casting scalars. 
""" - _zarr_v2_names: ClassVar[tuple[str, ...]] - - @classmethod - def _check_json_v2(cls, data: object) -> TypeGuard[DTypeConfig_V2[str, None]]: - """ - Check that the input is a valid JSON representation of this integer data type in Zarr V2. - - This method verifies that the provided data matches the expected Zarr V2 representation - for this data type. The input data must be a mapping that contains a "name" key that is - one of the strings from cls._zarr_v2_names and an "object_codec_id" key that is None. - - Parameters - ---------- - data : object - The JSON data to check. - - Returns - ------- - TypeGuard[DTypeConfig_V2[str, None]] - True if the input is a valid representation of this class in Zarr V2, - False otherwise. - """ - - return ( - check_dtype_spec_v2(data) - and data["name"] in cls._zarr_v2_names - and data["object_codec_id"] is None - ) - - @classmethod - def _check_json_v3(cls, data: object) -> TypeGuard[str]: - """ - Check that the input is a valid JSON representation of this class in Zarr V3. - - Parameters - ---------- - data : object - The JSON data to check. - - Returns - ------- - TypeGuard[str] - True if the input is a valid representation of this class in Zarr v3, - False otherwise. - """ - return data == cls._zarr_v3_name - def _check_scalar(self, data: object) -> TypeGuard[IntLike]: """ Check if the input object is of an IntLike type. @@ -259,7 +209,6 @@ class Int8(BaseInt[np.dtypes.Int8DType, np.int8]): dtype_cls = np.dtypes.Int8DType _zarr_v3_name: ClassVar[Literal["int8"]] = "int8" - _zarr_v2_names: ClassVar[tuple[Literal["|i1"]]] = ("|i1",) @classmethod def from_native_dtype(cls, dtype: TBaseDType) -> Self: @@ -298,89 +247,6 @@ def to_native_dtype(self: Self) -> np.dtypes.Int8DType: """ return self.dtype_cls() - @classmethod - def _from_json_v2(cls, data: DTypeJSON) -> Self: - """ - Create an Int8 from Zarr V2-flavored JSON. - - Parameters - ---------- - data : DTypeJSON - The JSON data. 
- - Returns - ------- - Self - An instance of this data type. - - Raises - ------ - DataTypeValidationError - If the input JSON is not a valid representation of this class Int8. - """ - if cls._check_json_v2(data): - return cls() - msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected the string {cls._zarr_v2_names[0]!r}" - raise DataTypeValidationError(msg) - - @classmethod - def _from_json_v3(cls, data: DTypeJSON) -> Self: - """ - Create an Int8 from Zarr V3-flavored JSON. - - Parameters - ---------- - data : DTypeJSON - The JSON data. - - Returns - ------- - Self - An instance of this data type. - - Raises - ------ - DataTypeValidationError - If the input JSON is not a valid representation of this class Int8. - """ - if cls._check_json_v3(data): - return cls() - msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected the string {cls._zarr_v3_name!r}" - raise DataTypeValidationError(msg) - - @overload - def to_json(self, zarr_format: Literal[2]) -> DTypeConfig_V2[Literal["|i1"], None]: ... - - @overload - def to_json(self, zarr_format: Literal[3]) -> Literal["int8"]: ... - - def to_json( - self, zarr_format: ZarrFormat - ) -> DTypeConfig_V2[Literal["|i1"], None] | Literal["int8"]: - """ - Convert the data type to a JSON-serializable form. - - Parameters - ---------- - zarr_format : ZarrFormat - The Zarr format version. - - Returns - ------- - ``DTypeConfig_V2[Literal["|i1"], None] | Literal["int8"]`` - The JSON-serializable representation of the data type. - - Raises - ------ - ValueError - If the zarr_format is not 2 or 3. 
- """ - if zarr_format == 2: - return {"name": self._zarr_v2_names[0], "object_codec_id": None} - elif zarr_format == 3: - return self._zarr_v3_name - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - @property def item_size(self) -> int: """ @@ -415,7 +281,6 @@ class UInt8(BaseInt[np.dtypes.UInt8DType, np.uint8]): dtype_cls = np.dtypes.UInt8DType _zarr_v3_name: ClassVar[Literal["uint8"]] = "uint8" - _zarr_v2_names: ClassVar[tuple[Literal["|u1"]]] = ("|u1",) @classmethod def from_native_dtype(cls, dtype: TBaseDType) -> Self: @@ -440,93 +305,6 @@ def to_native_dtype(self: Self) -> np.dtypes.UInt8DType: return self.dtype_cls() - @classmethod - def _from_json_v2(cls, data: DTypeJSON) -> Self: - """ - Create an instance of this data type from Zarr V2-flavored JSON. - - Parameters - ---------- - data : DTypeJSON - The JSON data. - - Returns - ------- - Self - An instance of this data type. - - Raises - ------ - DataTypeValidationError - If the input JSON is not a valid representation of this class. - """ - - if cls._check_json_v2(data): - return cls() - msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected the string {cls._zarr_v2_names[0]!r}" - raise DataTypeValidationError(msg) - - @classmethod - def _from_json_v3(cls, data: DTypeJSON) -> Self: - """ - Create an instance of this data type from Zarr V3-flavored JSON. - - Parameters - ---------- - data : DTypeJSON - The JSON data. - - Returns - ------- - Self - An instance of this data type. - - Raises - ------ - DataTypeValidationError - If the input JSON is not a valid representation of this class. - """ - if cls._check_json_v3(data): - return cls() - msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected the string {cls._zarr_v3_name!r}" - raise DataTypeValidationError(msg) - - @overload - def to_json(self, zarr_format: Literal[2]) -> DTypeConfig_V2[Literal["|u1"], None]: ... 
- - @overload - def to_json(self, zarr_format: Literal[3]) -> Literal["uint8"]: ... - - def to_json( - self, zarr_format: ZarrFormat - ) -> DTypeConfig_V2[Literal["|u1"], None] | Literal["uint8"]: - """ - Convert the data type to a JSON-serializable form. - - Parameters - ---------- - zarr_format : ZarrFormat - The Zarr format version. Supported values are 2 and 3. - - Returns - ------- - ``DTypeConfig_V2[Literal["|u1"], None] | Literal["uint8"]`` - The JSON-serializable representation of the data type. - - Raises - ------ - ValueError - If `zarr_format` is not 2 or 3. - """ - if zarr_format == 2: - # For Zarr format version 2, return a dictionary with the name and object codec ID. - return {"name": self._zarr_v2_names[0], "object_codec_id": None} - elif zarr_format == 3: - # For Zarr format version 3, return the v3 name as a string. - return self._zarr_v3_name - # Raise an error if the zarr_format is neither 2 nor 3. - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - @property def item_size(self) -> int: """ @@ -562,7 +340,6 @@ class Int16(BaseInt[np.dtypes.Int16DType, np.int16], HasEndianness): dtype_cls = np.dtypes.Int16DType _zarr_v3_name: ClassVar[Literal["int16"]] = "int16" - _zarr_v2_names: ClassVar[tuple[Literal[">i2"], Literal["i2", " Self: @@ -603,93 +380,6 @@ def to_native_dtype(self) -> np.dtypes.Int16DType: # numpy 2.x stub: newbyteorder widens to base dtype, runtime preserves the concrete subclass return self.dtype_cls().newbyteorder(byte_order) # type: ignore[return-value] - @classmethod - def _from_json_v2(cls, data: DTypeJSON) -> Self: - """ - Create an instance of this data type from Zarr V2-flavored JSON. - - Parameters - ---------- - data : DTypeJSON - The JSON data. - - Returns - ------- - Self - An instance of this data type. - - Raises - ------ - DataTypeValidationError - If the input JSON is not a valid representation of this class. 
- """ - if cls._check_json_v2(data): - # Going via NumPy ensures that we get the endianness correct without - # annoying string parsing. - name = data["name"] - return cls.from_native_dtype(np.dtype(name)) - msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected one of the strings {cls._zarr_v2_names!r}." - raise DataTypeValidationError(msg) - - @classmethod - def _from_json_v3(cls, data: DTypeJSON) -> Self: - """ - Create an instance of this data type from Zarr V3-flavored JSON. - - Parameters - ---------- - data : DTypeJSON - The JSON data. - - Returns - ------- - Self - An instance of this data type. - - Raises - ------ - DataTypeValidationError - If the input JSON is not a valid representation of this class. - """ - if cls._check_json_v3(data): - return cls() - msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected the string {cls._zarr_v3_name!r}" - raise DataTypeValidationError(msg) - - @overload - def to_json(self, zarr_format: Literal[2]) -> DTypeConfig_V2[Literal[">i2", " Literal["int16"]: ... - - def to_json( - self, zarr_format: ZarrFormat - ) -> DTypeConfig_V2[Literal[">i2", "i2", " int: """ @@ -725,7 +415,6 @@ class UInt16(BaseInt[np.dtypes.UInt16DType, np.uint16], HasEndianness): dtype_cls = np.dtypes.UInt16DType _zarr_v3_name: ClassVar[Literal["uint16"]] = "uint16" - _zarr_v2_names: ClassVar[tuple[Literal[">u2"], Literal["u2", " Self: @@ -766,93 +455,6 @@ def to_native_dtype(self) -> np.dtypes.UInt16DType: # numpy 2.x stub: newbyteorder widens to base dtype, runtime preserves the concrete subclass return self.dtype_cls().newbyteorder(byte_order) # type: ignore[return-value] - @classmethod - def _from_json_v2(cls, data: DTypeJSON) -> Self: - """ - Create an instance of this data type from Zarr V2-flavored JSON. - - Parameters - ---------- - data : DTypeJSON - The JSON data. - - Returns - ------- - Self - An instance of this data type. 
- - Raises - ------ - DataTypeValidationError - If the input JSON is not a valid representation of this class. - """ - if cls._check_json_v2(data): - # Going via NumPy ensures that we get the endianness correct without - # annoying string parsing. - name = data["name"] - return cls.from_native_dtype(np.dtype(name)) - msg = f"Invalid JSON representation of UInt16. Got {data!r}, expected one of the strings {cls._zarr_v2_names}." - raise DataTypeValidationError(msg) - - @classmethod - def _from_json_v3(cls, data: DTypeJSON) -> Self: - """ - Create an instance of this data type from Zarr V3-flavored JSON. - - Parameters - ---------- - data : DTypeJSON - The JSON data. - - Returns - ------- - Self - An instance of this data type. - - Raises - ------ - DataTypeValidationError - If the input JSON is not a valid representation of this class. - """ - if cls._check_json_v3(data): - return cls() - msg = f"Invalid JSON representation of UInt16. Got {data!r}, expected the string {cls._zarr_v3_name!r}" - raise DataTypeValidationError(msg) - - @overload - def to_json(self, zarr_format: Literal[2]) -> DTypeConfig_V2[Literal[">u2", " Literal["uint16"]: ... - - def to_json( - self, zarr_format: ZarrFormat - ) -> DTypeConfig_V2[Literal[">u2", "u2", " int: """ @@ -888,7 +490,6 @@ class Int32(BaseInt[np.dtypes.Int32DType, np.int32], HasEndianness): dtype_cls = np.dtypes.Int32DType _zarr_v3_name: ClassVar[Literal["int32"]] = "int32" - _zarr_v2_names: ClassVar[tuple[Literal[">i4"], Literal["i4", " TypeGuard[np.dtypes.Int32DType]: @@ -950,93 +551,6 @@ def to_native_dtype(self: Self) -> np.dtypes.Int32DType: # numpy 2.x stub: newbyteorder widens to base dtype, runtime preserves the concrete subclass return self.dtype_cls().newbyteorder(byte_order) # type: ignore[return-value] - @classmethod - def _from_json_v2(cls, data: DTypeJSON) -> Self: - """ - Create an Int32 from Zarr V2-flavored JSON. - - Parameters - ---------- - data : DTypeJSON - The JSON data. 
- - Returns - ------- - Self - An instance of this data type. - - Raises - ------ - DataTypeValidationError - If the input JSON is not a valid representation of this class Int32. - """ - if cls._check_json_v2(data): - # Going via NumPy ensures that we get the endianness correct without - # annoying string parsing. - name = data["name"] - return cls.from_native_dtype(np.dtype(name)) - msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected one of the strings {cls._zarr_v2_names!r}." - raise DataTypeValidationError(msg) - - @classmethod - def _from_json_v3(cls, data: DTypeJSON) -> Self: - """ - Create an Int32 from Zarr V3-flavored JSON. - - Parameters - ---------- - data : DTypeJSON - The JSON data. - - Returns - ------- - Self - An instance of this data type. - - Raises - ------ - DataTypeValidationError - If the input JSON is not a valid representation of this class Int32. - """ - if cls._check_json_v3(data): - return cls() - msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected the string {cls._zarr_v3_name!r}" - raise DataTypeValidationError(msg) - - @overload - def to_json(self, zarr_format: Literal[2]) -> DTypeConfig_V2[Literal[">i4", " Literal["int32"]: ... - - def to_json( - self, zarr_format: ZarrFormat - ) -> DTypeConfig_V2[Literal[">i4", "i4", " int: """ @@ -1072,7 +586,6 @@ class UInt32(BaseInt[np.dtypes.UInt32DType, np.uint32], HasEndianness): dtype_cls = np.dtypes.UInt32DType _zarr_v3_name: ClassVar[Literal["uint32"]] = "uint32" - _zarr_v2_names: ClassVar[tuple[Literal[">u4"], Literal["u4", " TypeGuard[np.dtypes.UInt32DType]: @@ -1136,88 +649,6 @@ def to_native_dtype(self) -> np.dtypes.UInt32DType: # numpy 2.x stub: newbyteorder widens to base dtype, runtime preserves the concrete subclass return self.dtype_cls().newbyteorder(byte_order) # type: ignore[return-value] - @classmethod - def _from_json_v2(cls, data: DTypeJSON) -> Self: - """ - Create an instance of this data type from Zarr V2-flavored JSON. 
- - Parameters - ---------- - data : DTypeJSON - The JSON data. - - Returns - ------- - Self - An instance of this data type. - - Raises - ------ - DataTypeValidationError - If the input JSON is not a valid representation of this class 32-bit unsigned - integer. - """ - if cls._check_json_v2(data): - # Going via NumPy ensures that we get the endianness correct without - # annoying string parsing. - name = data["name"] - return cls.from_native_dtype(np.dtype(name)) - msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected one of the strings {cls._zarr_v2_names}." - raise DataTypeValidationError(msg) - - @classmethod - def _from_json_v3(cls, data: DTypeJSON) -> Self: - """ - Create an instance of this data type from Zarr V3-flavored JSON. - - Parameters - ---------- - data : DTypeJSON - The JSON data. - - Returns - ------- - Self - An instance of this data type. - - Raises - ------ - DataTypeValidationError - If the input JSON is not a valid representation of this class 32-bit unsigned - integer. - """ - if cls._check_json_v3(data): - return cls() - msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected the string {cls._zarr_v3_name!r}" - raise DataTypeValidationError(msg) - - @overload - def to_json(self, zarr_format: Literal[2]) -> DTypeConfig_V2[Literal[">u4", " Literal["uint32"]: ... 
- def to_json( - self, zarr_format: ZarrFormat - ) -> DTypeConfig_V2[Literal[">u4", "u4", " int: """ @@ -1253,7 +684,6 @@ class Int64(BaseInt[np.dtypes.Int64DType, np.int64], HasEndianness): dtype_cls = np.dtypes.Int64DType _zarr_v3_name: ClassVar[Literal["int64"]] = "int64" - _zarr_v2_names: ClassVar[tuple[Literal[">i8"], Literal["i8", " Self: @@ -1295,88 +725,6 @@ def to_native_dtype(self) -> np.dtypes.Int64DType: # numpy 2.x stub: newbyteorder widens to base dtype, runtime preserves the concrete subclass return self.dtype_cls().newbyteorder(byte_order) # type: ignore[return-value] - @classmethod - def _from_json_v2(cls, data: DTypeJSON) -> Self: - """ - Create an instance of this data type from Zarr V2-flavored JSON. - - Parameters - ---------- - data : DTypeJSON - The JSON data. - - Returns - ------- - Self - An instance of this data type. - - Raises - ------ - DataTypeValidationError - If the input JSON is not a valid representation of this class 64-bit signed - integer. - """ - if cls._check_json_v2(data): - # Going via NumPy ensures that we get the endianness correct without - # annoying string parsing. - name = data["name"] - return cls.from_native_dtype(np.dtype(name)) - msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected one of the strings {cls._zarr_v2_names}." - raise DataTypeValidationError(msg) - - @classmethod - def _from_json_v3(cls, data: DTypeJSON) -> Self: - """ - Create an instance of this data type from Zarr V3-flavored JSON. - - Parameters - ---------- - data : DTypeJSON - The JSON data. - - Returns - ------- - Self - An instance of this data type. - - Raises - ------ - DataTypeValidationError - If the input JSON is not a valid representation of this class 64-bit signed - integer. - """ - if cls._check_json_v3(data): - return cls() - msg = f"Invalid JSON representation of {cls.__name__}. 
Got {data!r}, expected the string {cls._zarr_v3_name!r}" - raise DataTypeValidationError(msg) - - @overload - def to_json(self, zarr_format: Literal[2]) -> DTypeConfig_V2[Literal[">i8", " Literal["int64"]: ... - def to_json( - self, zarr_format: ZarrFormat - ) -> DTypeConfig_V2[Literal[">i8", "i8", " int: """ @@ -1412,7 +760,6 @@ class UInt64(BaseInt[np.dtypes.UInt64DType, np.uint64], HasEndianness): dtype_cls = np.dtypes.UInt64DType _zarr_v3_name: ClassVar[Literal["uint64"]] = "uint64" - _zarr_v2_names: ClassVar[tuple[Literal[">u8"], Literal["u8", " np.dtypes.UInt64DType: """ @@ -1427,89 +774,6 @@ def to_native_dtype(self) -> np.dtypes.UInt64DType: # numpy 2.x stub: newbyteorder widens to base dtype, runtime preserves the concrete subclass return self.dtype_cls().newbyteorder(byte_order) # type: ignore[return-value] - @classmethod - def _from_json_v2(cls, data: DTypeJSON) -> Self: - """ - Create an instance of this data type from Zarr V2-flavored JSON. - - Parameters - ---------- - data : DTypeJSON - The JSON data. - - Returns - ------- - Self - An instance of this data type. - - Raises - ------ - DataTypeValidationError - If the input JSON is not a valid representation of this class unsigned 64-bit - integer. - """ - if cls._check_json_v2(data): - # Going via NumPy ensures that we get the endianness correct without - # annoying string parsing. - name = data["name"] - return cls.from_native_dtype(np.dtype(name)) - msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected one of the strings {cls._zarr_v2_names}." - raise DataTypeValidationError(msg) - - @classmethod - def _from_json_v3(cls, data: DTypeJSON) -> Self: - """ - Create an instance of this data type from Zarr V3-flavored JSON. - - Parameters - ---------- - data : DTypeJSON - The JSON data. - - Returns - ------- - Self - An instance of this data type. - - Raises - ------ - DataTypeValidationError - If the input JSON is not a valid representation of this class unsigned 64-bit - integer. 
- """ - if cls._check_json_v3(data): - return cls() - msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected the string {cls._zarr_v3_name!r}" - raise DataTypeValidationError(msg) - - @overload - def to_json(self, zarr_format: Literal[2]) -> DTypeConfig_V2[Literal[">u8", " Literal["uint64"]: ... - - def to_json( - self, zarr_format: ZarrFormat - ) -> DTypeConfig_V2[Literal[">u8", "u8", " Self: """ diff --git a/src/zarr/core/dtype/wrapper.py b/src/zarr/core/dtype/wrapper.py index 4859f402d9..2305fdd629 100644 --- a/src/zarr/core/dtype/wrapper.py +++ b/src/zarr/core/dtype/wrapper.py @@ -213,11 +213,23 @@ def to_json(self, zarr_format: ZarrFormat) -> DTypeSpec_V2 | DTypeSpec_V3: The JSON-serializable representation of the wrapped data type """ if zarr_format == 2: - return {"name": self._zarr_v3_name, "object_codec_id": self.object_codec_id} + return self._to_json_v2() if zarr_format == 3: - return self._zarr_v3_name + return self._to_json_v3() raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + def _to_json_v2(self) -> DTypeSpec_V2: + """ + Serialize this ZDType to its Zarr V2 JSON form. See ``to_json`` for the default behavior. + """ + return {"name": self._zarr_v3_name, "object_codec_id": self.object_codec_id} + + def _to_json_v3(self) -> DTypeSpec_V3: + """ + Serialize this ZDType to its Zarr V3 JSON form. See ``to_json`` for the default behavior. 
+ """ + return self._zarr_v3_name + @abstractmethod def _check_scalar(self, data: object) -> bool: """ From 078aa3c65c4cf724657ff199f2851a95d43128c5 Mon Sep 17 00:00:00 2001 From: d-v-b-agent Date: Sat, 27 Jun 2026 09:09:40 +0000 Subject: [PATCH 4/7] Migrate remaining built-in dtypes onto shared JSON handling Extend the shared dtype-JSON machinery to the object, fixed-length, raw, time, and structured data types: - Add an `ObjectCodecDTypeV2` mixin for parameter-free object dtypes whose Zarr V2 form is `{"name": "|O", "object_codec_id": }` (variable-length strings and bytes). - Fixed-length / raw / time dtypes (FixedLengthUTF32, NullTerminatedBytes, RawBytes, DateTime64, TimeDelta64) now inherit V2 from `NumpyNativeDTypeV2` (their V2 name is the NumPy type string, which round-trips the length/unit/ byte order via from_native_dtype) and implement only a small `_to_json_v3` hook for their V3 `configuration`. Their `_zarr_v2_names` tables are gone. - Structured/Struct keep their bespoke field-list V2/V3 logic but drop the `to_json` dispatcher in favor of `_to_json_v2`/`_to_json_v3` hooks. All output is byte-identical (verified per type, incl. the V3 unstable-spec warnings and the variable_length_bytes "bytes" alias). Net ~645 fewer lines. 
Co-Authored-By: Claude Opus 4.8 (1M context) Claude-Session: https://claude.ai/code/session_01XZHYrRBh54e7tFearnZ72r --- src/zarr/core/dtype/npy/bytes.py | 318 +++----------------------- src/zarr/core/dtype/npy/common.py | 23 ++ src/zarr/core/dtype/npy/string.py | 202 +--------------- src/zarr/core/dtype/npy/structured.py | 99 +++----- src/zarr/core/dtype/npy/time.py | 213 ++--------------- 5 files changed, 105 insertions(+), 750 deletions(-) diff --git a/src/zarr/core/dtype/npy/bytes.py b/src/zarr/core/dtype/npy/bytes.py index af8fa7a272..1fa3915979 100644 --- a/src/zarr/core/dtype/npy/bytes.py +++ b/src/zarr/core/dtype/npy/bytes.py @@ -1,9 +1,8 @@ from __future__ import annotations import base64 -import re from dataclasses import dataclass -from typing import ClassVar, Literal, Self, TypedDict, TypeGuard, cast, overload +from typing import TYPE_CHECKING, ClassVar, Literal, Self, TypedDict, TypeGuard, cast import numpy as np @@ -14,13 +13,18 @@ HasItemSize, HasLength, HasObjectCodec, - check_dtype_spec_v2, v3_unstable_dtype_warning, ) -from zarr.core.dtype.npy.common import check_json_str -from zarr.core.dtype.wrapper import TBaseDType, ZDType +from zarr.core.dtype.npy.common import ( + NumpyNativeDTypeV2, + ObjectCodecDTypeV2, + check_json_str, +) from zarr.errors import DataTypeValidationError +if TYPE_CHECKING: + from zarr.core.dtype.wrapper import TBaseDType + BytesLike = np.bytes_ | str | bytes | int @@ -166,7 +170,9 @@ class VariableLengthBytesJSON_V2(DTypeConfig_V2[Literal["|O"], Literal["vlen-byt @dataclass(frozen=True, kw_only=True) -class NullTerminatedBytes(ZDType[np.dtypes.BytesDType[int], np.bytes_], HasLength, HasItemSize): +class NullTerminatedBytes( + NumpyNativeDTypeV2[np.dtypes.BytesDType[int], np.bytes_], HasLength, HasItemSize +): """ A Zarr data type for arrays containing fixed-length null-terminated byte sequences. 
@@ -247,32 +253,6 @@ def to_native_dtype(self) -> np.dtypes.BytesDType[int]: return self.dtype_cls(self.length) - @classmethod - def _check_json_v2(cls, data: DTypeJSON) -> TypeGuard[NullterminatedBytesJSON_V2]: - """ - Check that the input is a valid JSON representation of NullTerminatedBytes in Zarr V2. - - The input data must be a mapping that contains a "name" key that matches the pattern - "|S" and an "object_codec_id" key that is None. - - Parameters - ---------- - data : DTypeJSON - The JSON data to check. - - Returns - ------- - bool - True if the input data is a valid representation, False otherwise. - """ - - return ( - check_dtype_spec_v2(data) - and isinstance(data["name"], str) - and re.match(r"^\|S\d+$", data["name"]) is not None - and data["object_codec_id"] is None - ) - @classmethod def _check_json_v3(cls, data: DTypeJSON) -> TypeGuard[NullTerminatedBytesJSON_V3]: """ @@ -298,37 +278,6 @@ def _check_json_v3(cls, data: DTypeJSON) -> TypeGuard[NullTerminatedBytesJSON_V3 and isinstance(data["configuration"]["length_bytes"], int) ) - @classmethod - def _from_json_v2(cls, data: DTypeJSON) -> Self: - """ - Create an instance of this class from Zarr V2-flavored JSON. - - This method checks if the input data is a valid representation of - this class in Zarr V2. If so, it returns a new instance of - this class with a ``length`` as specified in the input data. - - Parameters - ---------- - data : DTypeJSON - The JSON data to parse. - - Returns - ------- - Self - An instance of this data type. - - Raises - ------ - DataTypeValidationError - If the input data is not a valid representation of this class. - """ - - if cls._check_json_v2(data): - name = data["name"] - return cls(length=int(name[2:])) - msg = f"Invalid JSON representation of {cls.__name__}. 
Got {data!r}, expected a string like '|S1', '|S2', etc" - raise DataTypeValidationError(msg) - @classmethod def _from_json_v3(cls, data: DTypeJSON) -> Self: """ @@ -358,37 +307,12 @@ def _from_json_v3(cls, data: DTypeJSON) -> Self: msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected the string {cls._zarr_v3_name!r}" raise DataTypeValidationError(msg) - @overload - def to_json(self, zarr_format: Literal[2]) -> NullterminatedBytesJSON_V2: ... - - @overload - def to_json(self, zarr_format: Literal[3]) -> NullTerminatedBytesJSON_V3: ... - - def to_json( - self, zarr_format: ZarrFormat - ) -> DTypeConfig_V2[str, None] | NullTerminatedBytesJSON_V3: - """ - Generate a JSON representation of this data type. - - Parameters - ---------- - zarr_format : ZarrFormat - The zarr format version. - - Returns - ------- - NullterminatedBytesJSON_V2 | NullTerminatedBytesJSON_V3 - The JSON-serializable representation of the data type - """ - if zarr_format == 2: - return {"name": self.to_native_dtype().str, "object_codec_id": None} - elif zarr_format == 3: - v3_unstable_dtype_warning(self) - return { - "name": self._zarr_v3_name, - "configuration": {"length_bytes": self.length}, - } - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + def _to_json_v3(self) -> NullTerminatedBytesJSON_V3: + v3_unstable_dtype_warning(self) + return { + "name": self._zarr_v3_name, + "configuration": {"length_bytes": self.length}, + } def _check_scalar(self, data: object) -> TypeGuard[BytesLike]: """ @@ -542,7 +466,7 @@ def item_size(self) -> int: @dataclass(frozen=True, kw_only=True) -class RawBytes(ZDType[np.dtypes.VoidDType[int], np.void], HasLength, HasItemSize): +class RawBytes(NumpyNativeDTypeV2[np.dtypes.VoidDType[int], np.void], HasLength, HasItemSize): """ A Zarr data type for arrays containing fixed-length sequences of raw bytes. 
@@ -653,28 +577,6 @@ def to_native_dtype(self) -> np.dtypes.VoidDType[int]: # by invoking np.dtypes.VoidDType directly return cast("np.dtypes.VoidDType[int]", np.dtype(f"V{self.length}")) - @classmethod - def _check_json_v2(cls, data: DTypeJSON) -> TypeGuard[RawBytesJSON_V2]: - """ - Check that the input is a valid representation of this class in Zarr V2. - - Parameters - ---------- - data : DTypeJSON - The JSON data to check. - - Returns - ------- - True if the input is a valid representation of this class in Zarr V3, False otherwise. - - """ - return ( - check_dtype_spec_v2(data) - and isinstance(data["name"], str) - and re.match(r"^\|V\d+$", data["name"]) is not None - and data["object_codec_id"] is None - ) - @classmethod def _check_json_v3(cls, data: DTypeJSON) -> TypeGuard[RawBytesJSON_V3]: """ @@ -700,36 +602,6 @@ def _check_json_v3(cls, data: DTypeJSON) -> TypeGuard[RawBytesJSON_V3]: and isinstance(data["configuration"]["length_bytes"], int) ) - @classmethod - def _from_json_v2(cls, data: DTypeJSON) -> Self: - """ - Create an instance of RawBytes from Zarr V2-flavored JSON. - - This method checks if the input data is a valid representation of - RawBytes in Zarr V2. If so, it returns a new instance of - RawBytes with a ``length`` as specified in the input data. - - Parameters - ---------- - data : DTypeJSON - The JSON data to parse. - - Returns - ------- - Self - An instance of this data type. - - Raises - ------ - DataTypeValidationError - If the input data is not a valid representation of this class. - """ - if cls._check_json_v2(data): - name = data["name"] - return cls(length=int(name[2:])) - msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected a string like '|V1', '|V2', etc" - raise DataTypeValidationError(msg) - @classmethod def _from_json_v3(cls, data: DTypeJSON) -> Self: """ @@ -759,32 +631,9 @@ def _from_json_v3(cls, data: DTypeJSON) -> Self: msg = f"Invalid JSON representation of {cls.__name__}. 
Got {data!r}, expected the string {cls._zarr_v3_name!r}" raise DataTypeValidationError(msg) - @overload - def to_json(self, zarr_format: Literal[2]) -> RawBytesJSON_V2: ... - - @overload - def to_json(self, zarr_format: Literal[3]) -> RawBytesJSON_V3: ... - - def to_json(self, zarr_format: ZarrFormat) -> RawBytesJSON_V2 | RawBytesJSON_V3: - """ - Generate a JSON representation of this data type. - - Parameters - ---------- - zarr_format : ZarrFormat - The zarr format version. - - Returns - ------- - RawBytesJSON_V2 | RawBytesJSON_V3 - The JSON-serializable representation of the data type. - """ - if zarr_format == 2: - return {"name": self.to_native_dtype().str, "object_codec_id": None} - elif zarr_format == 3: - v3_unstable_dtype_warning(self) - return {"name": self._zarr_v3_name, "configuration": {"length_bytes": self.length}} - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + def _to_json_v3(self) -> RawBytesJSON_V3: + v3_unstable_dtype_warning(self) + return {"name": self._zarr_v3_name, "configuration": {"length_bytes": self.length}} def _check_scalar(self, data: object) -> TypeGuard[np.bytes_ | str | bytes | np.void]: """ @@ -936,7 +785,7 @@ def item_size(self) -> int: @dataclass(frozen=True, kw_only=True) -class VariableLengthBytes(ZDType[np.dtypes.ObjectDType, bytes], HasObjectCodec): +class VariableLengthBytes(ObjectCodecDTypeV2[np.dtypes.ObjectDType, bytes], HasObjectCodec): """ A Zarr data type for arrays containing variable-length sequences of bytes. @@ -1002,89 +851,13 @@ def to_native_dtype(self) -> np.dtypes.ObjectDType: """ return self.dtype_cls() - @classmethod - def _check_json_v2( - cls, - data: DTypeJSON, - ) -> TypeGuard[VariableLengthBytesJSON_V2]: - """ - Check that the input is a valid JSON representation of a NumPy O dtype, and that the - object codec id is appropriate for variable-length bytes strings. - - Parameters - ---------- - data : DTypeJSON - The JSON data to check. 
- - Returns - ------- - True if the input is a valid representation of this class in Zarr V2, False - otherwise. - """ - # Check that the input is a valid JSON representation of a Zarr v2 data type spec. - if not check_dtype_spec_v2(data): - return False - - # Check that the object codec id is appropriate for variable-length bytes strings. - if data["name"] != "|O": - return False - return data["object_codec_id"] == cls.object_codec_id - - @classmethod - def _check_json_v3(cls, data: DTypeJSON) -> TypeGuard[Literal["variable_length_bytes"]]: - """ - Check that the input is a valid JSON representation of this class in Zarr V3. - - Parameters - ---------- - data : DTypeJSON - The JSON data to check. - - Returns - ------- - TypeGuard[Literal["variable_length_bytes"]] - True if the input is a valid representation of this class in Zarr V3, False otherwise. - """ - - return data in (cls._zarr_v3_name, "bytes") - - @classmethod - def _from_json_v2(cls, data: DTypeJSON) -> Self: - """ - Create an instance of this VariableLengthBytes from Zarr V2-flavored JSON. - - This method checks if the input data is a valid representation of this class - in Zarr V2. If so, it returns a new instance this class. - - Parameters - ---------- - data : DTypeJSON - The JSON data to parse. - - Returns - ------- - Self - An instance of this data type. - - Raises - ------ - DataTypeValidationError - If the input data is not a valid representation of this class class. - """ - - if cls._check_json_v2(data): - return cls() - msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected the string '|O' and an object_codec_id of {cls.object_codec_id}" - raise DataTypeValidationError(msg) - @classmethod def _from_json_v3(cls, data: DTypeJSON) -> Self: """ Create an instance of VariableLengthBytes from Zarr V3-flavored JSON. - This method checks if the input data is a valid representation of - VariableLengthBytes in Zarr V3. If so, it returns a new instance of - VariableLengthBytes. 
+ In addition to the canonical ``variable_length_bytes`` name, the alias ``"bytes"`` is + accepted. Parameters ---------- @@ -1102,47 +875,14 @@ def _from_json_v3(cls, data: DTypeJSON) -> Self: If the input data is not a valid representation of this class. """ - if cls._check_json_v3(data): + if data in (cls._zarr_v3_name, "bytes"): return cls() msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected the string {cls._zarr_v3_name!r}" raise DataTypeValidationError(msg) - @overload - def to_json(self, zarr_format: Literal[2]) -> VariableLengthBytesJSON_V2: ... - - @overload - def to_json(self, zarr_format: Literal[3]) -> Literal["variable_length_bytes"]: ... - - def to_json( - self, zarr_format: ZarrFormat - ) -> VariableLengthBytesJSON_V2 | Literal["variable_length_bytes"]: - """ - Convert the variable-length bytes data type to a JSON-serializable form. - - Parameters - ---------- - zarr_format : ZarrFormat - The zarr format version. Accepted values are 2 and 3. - - Returns - ------- - ``DTypeConfig_V2[Literal["|O"], Literal["vlen-bytes"]] | Literal["variable_length_bytes"]`` - The JSON-serializable representation of the variable-length bytes data type. - For zarr_format 2, returns a dictionary with "name" and "object_codec_id". - For zarr_format 3, returns a string identifier "variable_length_bytes". - - Raises - ------ - ValueError - If zarr_format is not 2 or 3. 
- """ - - if zarr_format == 2: - return {"name": "|O", "object_codec_id": self.object_codec_id} - elif zarr_format == 3: - v3_unstable_dtype_warning(self) - return self._zarr_v3_name - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + def _to_json_v3(self) -> Literal["variable_length_bytes"]: + v3_unstable_dtype_warning(self) + return self._zarr_v3_name def default_scalar(self) -> bytes: """ diff --git a/src/zarr/core/dtype/npy/common.py b/src/zarr/core/dtype/npy/common.py index 2e19a6a04e..db7d51a97a 100644 --- a/src/zarr/core/dtype/npy/common.py +++ b/src/zarr/core/dtype/npy/common.py @@ -70,6 +70,29 @@ def _to_json_v2(self) -> DTypeSpec_V2: return {"name": self.to_native_dtype().str, "object_codec_id": None} +class ObjectCodecDTypeV2[DType: TBaseDType, Scalar: TBaseScalar](ZDType[DType, Scalar]): + """ + Mixin for parameter-free data types whose Zarr V2 representation is the NumPy "object" type + string ``"|O"`` together with an ``object_codec_id`` identifying the codec used to encode + values (e.g. ``"vlen-utf8"``). Subclasses must define ``object_codec_id``. 
+ """ + + @classmethod + def _from_json_v2(cls, data: DTypeJSON) -> Self: + if ( + check_dtype_spec_v2(data) + and data["name"] == "|O" + and data["object_codec_id"] == cls.object_codec_id + ): + return cls() + raise DataTypeValidationError( + f"Invalid Zarr V2 JSON representation of {cls.__name__}: {data!r}" + ) + + def _to_json_v2(self) -> DTypeSpec_V2: + return {"name": "|O", "object_codec_id": self.object_codec_id} + + IntLike = SupportsInt | SupportsIndex | bytes | str FloatLike = SupportsIndex | SupportsFloat | bytes | str ComplexLike = SupportsFloat | SupportsIndex | SupportsComplex | bytes | str | None diff --git a/src/zarr/core/dtype/npy/string.py b/src/zarr/core/dtype/npy/string.py index 3f84e8123f..cad681721a 100644 --- a/src/zarr/core/dtype/npy/string.py +++ b/src/zarr/core/dtype/npy/string.py @@ -1,6 +1,5 @@ from __future__ import annotations -import re from dataclasses import dataclass from typing import ( TYPE_CHECKING, @@ -10,7 +9,6 @@ Self, TypedDict, TypeGuard, - overload, runtime_checkable, ) @@ -24,14 +22,14 @@ HasItemSize, HasLength, HasObjectCodec, - check_dtype_spec_v2, ) from zarr.core.dtype.npy.common import ( + NumpyNativeDTypeV2, + ObjectCodecDTypeV2, check_json_str, endianness_to_numpy_str, get_endianness_from_numpy_dtype, ) -from zarr.core.dtype.wrapper import ZDType from zarr.errors import DataTypeValidationError if TYPE_CHECKING: @@ -104,7 +102,7 @@ class FixedLengthUTF32JSON_V3(NamedConfig[Literal["fixed_length_utf32"], LengthB @dataclass(frozen=True, kw_only=True) class FixedLengthUTF32( - ZDType[np.dtypes.StrDType[int], np.str_], HasEndianness, HasLength, HasItemSize + NumpyNativeDTypeV2[np.dtypes.StrDType[int], np.str_], HasEndianness, HasLength, HasItemSize ): """ A Zarr data type for arrays containing fixed-length UTF-32 strings. 
@@ -175,28 +173,6 @@ def to_native_dtype(self) -> np.dtypes.StrDType[int]: # numpy 2.x stub: newbyteorder widens to base dtype, runtime preserves the concrete subclass return self.dtype_cls(self.length).newbyteorder(byte_order) # type: ignore[return-value] - @classmethod - def _check_json_v2(cls, data: DTypeJSON) -> TypeGuard[FixedLengthUTF32JSON_V2]: - """ - Check that the input is a valid JSON representation of a NumPy U dtype. - - Parameters - ---------- - data : DTypeJSON - The JSON data. - - Returns - ------- - TypeGuard[FixedLengthUTF32JSON_V2] - Whether the input is a valid JSON representation of a NumPy U dtype. - """ - return ( - check_dtype_spec_v2(data) - and isinstance(data["name"], str) - and re.match(r"^[><]U\d+$", data["name"]) is not None - and data["object_codec_id"] is None - ) - @classmethod def _check_json_v3(cls, data: DTypeJSON) -> TypeGuard[FixedLengthUTF32JSON_V3]: """ @@ -222,59 +198,11 @@ def _check_json_v3(cls, data: DTypeJSON) -> TypeGuard[FixedLengthUTF32JSON_V3]: and isinstance(data["configuration"]["length_bytes"], int) ) - @overload - def to_json(self, zarr_format: Literal[2]) -> DTypeConfig_V2[str, None]: ... - - @overload - def to_json(self, zarr_format: Literal[3]) -> FixedLengthUTF32JSON_V3: ... - - def to_json( - self, zarr_format: ZarrFormat - ) -> DTypeConfig_V2[str, None] | FixedLengthUTF32JSON_V3: - """ - Convert the FixedLengthUTF32 instance to a JSON representation. - - Parameters - ---------- - zarr_format : ZarrFormat - The Zarr format to use. - - Returns - ------- - DTypeConfig_V2[str, None] | FixedLengthUTF32JSON_V3 - The JSON representation of the data type. 
- """ - if zarr_format == 2: - return {"name": self.to_native_dtype().str, "object_codec_id": None} - elif zarr_format == 3: - return { - "name": self._zarr_v3_name, - "configuration": {"length_bytes": self.length * self.code_point_bytes}, - } - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - - @classmethod - def _from_json_v2(cls, data: DTypeJSON) -> Self: - """ - Create a FixedLengthUTF32 from a JSON representation of a NumPy U dtype. - - Parameters - ---------- - data : DTypeJSON - The JSON data. - - Returns - ------- - Self - An instance of this data type. - """ - if cls._check_json_v2(data): - # Construct the NumPy dtype instead of string parsing. - name = data["name"] - return cls.from_native_dtype(np.dtype(name)) - raise DataTypeValidationError( - f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected a string representation of a NumPy U dtype." - ) + def _to_json_v3(self) -> FixedLengthUTF32JSON_V3: + return { + "name": self._zarr_v3_name, + "configuration": {"length_bytes": self.length * self.code_point_bytes}, + } @classmethod def _from_json_v3(cls, data: DTypeJSON) -> Self: @@ -450,7 +378,7 @@ class VariableLengthUTF8JSON_V2(DTypeConfig_V2[Literal["|O"], Literal["vlen-utf8 @dataclass(frozen=True, kw_only=True) -class VariableLengthUTF8(ZDType[np.dtypes.StringDType, str], HasObjectCodec): # type: ignore[type-var] +class VariableLengthUTF8(ObjectCodecDTypeV2[np.dtypes.StringDType, str], HasObjectCodec): # type: ignore[type-var] """ A Zarr data type for arrays containing variable-length UTF-8 strings. @@ -525,118 +453,6 @@ def to_native_dtype(self) -> np.dtypes.StringDType: """ return self.dtype_cls() - @classmethod - def _check_json_v2( - cls, - data: DTypeJSON, - ) -> TypeGuard[VariableLengthUTF8JSON_V2]: - """ - "Check if the input is a valid JSON representation of a variable-length UTF-8 string dtype - for Zarr v2." - - Parameters - ---------- - data : DTypeJSON - The JSON data to check. 
- - Returns - ------- - ``TypeGuard[VariableLengthUTF8JSON_V2]`` - Whether the input is a valid JSON representation of a NumPy "object" data type, and that the - object codec id is appropriate for variable-length UTF-8 strings. - """ - return ( - check_dtype_spec_v2(data) - and data["name"] == "|O" - and data["object_codec_id"] == cls.object_codec_id - ) - - @classmethod - def _check_json_v3(cls, data: DTypeJSON) -> TypeGuard[Literal["variable_length_utf8"]]: - """ - Check that the input is a valid JSON representation of this class in Zarr V3. - - Parameters - ---------- - data : DTypeJSON - The JSON data to check. - - Returns - ------- - TypeGuard[Literal["variable_length_utf8"]] - Whether the input is a valid JSON representation of a variable length UTF-8 string - data type. - """ - return data == cls._zarr_v3_name - - @classmethod - def _from_json_v2(cls, data: DTypeJSON) -> Self: - """ - Create an instance of this class from a JSON representation of a NumPy "object" dtype. - - Parameters - ---------- - data : DTypeJSON - The JSON data to create an instance from. - - Returns - ------- - Self - An instance of this data type. - """ - if cls._check_json_v2(data): - return cls() - msg = ( - f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected the string '|O'" - ) - raise DataTypeValidationError(msg) - - @classmethod - def _from_json_v3(cls, data: DTypeJSON) -> Self: - """ - Create an instance of this class from a JSON representation of a variable length UTF-8 - string data type. - - Parameters - ---------- - data : DTypeJSON - The JSON data to create an instance from. - - Returns - ------- - Self - An instance of this data type. - """ - if cls._check_json_v3(data): - return cls() - msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected {cls._zarr_v3_name}." - raise DataTypeValidationError(msg) - - @overload - def to_json(self, zarr_format: Literal[2]) -> VariableLengthUTF8JSON_V2: ... 
- @overload - def to_json(self, zarr_format: Literal[3]) -> Literal["string"]: ... - - def to_json(self, zarr_format: ZarrFormat) -> VariableLengthUTF8JSON_V2 | Literal["string"]: - """ - Convert this data type to a JSON representation. - - Parameters - ---------- - zarr_format : int - The zarr format to use for the JSON representation. - - Returns - ------- - ``VariableLengthUTF8JSON_V2 | Literal["string"]`` - The JSON representation of this data type. - """ - if zarr_format == 2: - return {"name": "|O", "object_codec_id": self.object_codec_id} - elif zarr_format == 3: - return self._zarr_v3_name - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - def default_scalar(self) -> str: """ Return the default scalar value for this data type. diff --git a/src/zarr/core/dtype/npy/structured.py b/src/zarr/core/dtype/npy/structured.py index b865998e52..f63df0acbb 100644 --- a/src/zarr/core/dtype/npy/structured.py +++ b/src/zarr/core/dtype/npy/structured.py @@ -2,7 +2,7 @@ from collections.abc import Sequence from dataclasses import dataclass -from typing import TYPE_CHECKING, ClassVar, Literal, Self, TypeGuard, cast, overload +from typing import TYPE_CHECKING, ClassVar, Literal, Self, TypeGuard, cast import numpy as np @@ -309,49 +309,20 @@ def _from_json_v3(cls, data: DTypeJSON) -> Self: msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected a JSON object with the key {cls._zarr_v3_name!r}" raise DataTypeValidationError(msg) - @overload - def to_json(self, zarr_format: Literal[2]) -> StructuredJSON_V2: ... - - @overload - def to_json(self, zarr_format: Literal[3]) -> StructuredJSON_V3: ... - - def to_json(self, zarr_format: ZarrFormat) -> StructuredJSON_V2 | StructuredJSON_V3: - """ - Convert the structured data type to a JSON-serializable form. - - Parameters - ---------- - zarr_format : ZarrFormat - The Zarr format version. Accepted values are 2 and 3. 
- - Returns - ------- - StructuredJSON_V2 | StructuredJSON_V3 - The JSON representation of the structured data type. - - Raises - ------ - ValueError - If the zarr_format is not 2 or 3. - """ - if zarr_format == 2: - fields = [ - [f_name, f_dtype.to_json(zarr_format=zarr_format)["name"]] - for f_name, f_dtype in self.fields - ] - return {"name": fields, "object_codec_id": None} - elif zarr_format == 3: - v3_unstable_dtype_warning(self) - fields = [ - [f_name, f_dtype.to_json(zarr_format=zarr_format)] # type: ignore[list-item] - for f_name, f_dtype in self.fields - ] - base_dict = { - "name": self._zarr_v3_name, - "configuration": {"fields": fields}, - } - return cast("StructuredJSON_V3", base_dict) - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + def _to_json_v2(self) -> StructuredJSON_V2: + fields = [ + [f_name, f_dtype.to_json(zarr_format=2)["name"]] for f_name, f_dtype in self.fields + ] + return {"name": fields, "object_codec_id": None} + + def _to_json_v3(self) -> StructuredJSON_V3: + v3_unstable_dtype_warning(self) + fields = [[f_name, f_dtype.to_json(zarr_format=3)] for f_name, f_dtype in self.fields] + base_dict = { + "name": self._zarr_v3_name, + "configuration": {"fields": fields}, + } + return cast("StructuredJSON_V3", base_dict) def _check_scalar(self, data: object) -> TypeGuard[StructuredScalarLike]: # TODO: implement something more precise here! @@ -575,30 +546,22 @@ def _from_json_v3(cls, data: DTypeJSON) -> Self: msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected a JSON object with the key {cls._zarr_v3_name!r}" raise DataTypeValidationError(msg) - @overload # type: ignore[override] - def to_json(self, zarr_format: Literal[2]) -> StructuredJSON_V2: ... - - @overload - def to_json(self, zarr_format: Literal[3]) -> StructJSON_V3: ... 
- - def to_json(self, zarr_format: ZarrFormat) -> StructuredJSON_V2 | StructJSON_V3: - if zarr_format == 2: - fields_v2 = [ - [f_name, f_dtype.to_json(zarr_format=zarr_format)["name"]] - for f_name, f_dtype in self.fields - ] - return {"name": fields_v2, "object_codec_id": None} - elif zarr_format == 3: - v3_unstable_dtype_warning(self) - fields_v3 = [ - {"name": f_name, "data_type": f_dtype.to_json(zarr_format=zarr_format)} - for f_name, f_dtype in self.fields - ] - return cast( - "StructJSON_V3", - {"name": self._zarr_v3_name, "configuration": {"fields": fields_v3}}, - ) - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + def _to_json_v2(self) -> StructuredJSON_V2: + fields_v2 = [ + [f_name, f_dtype.to_json(zarr_format=2)["name"]] for f_name, f_dtype in self.fields + ] + return {"name": fields_v2, "object_codec_id": None} + + def _to_json_v3(self) -> StructJSON_V3: # type: ignore[override] + v3_unstable_dtype_warning(self) + fields_v3 = [ + {"name": f_name, "data_type": f_dtype.to_json(zarr_format=3)} + for f_name, f_dtype in self.fields + ] + return cast( + "StructJSON_V3", + {"name": self._zarr_v3_name, "configuration": {"fields": fields_v3}}, + ) def from_json_scalar(self, data: JSON, *, zarr_format: ZarrFormat) -> np.void: """ diff --git a/src/zarr/core/dtype/npy/time.py b/src/zarr/core/dtype/npy/time.py index 4efa0be7bb..668e8c6a2a 100644 --- a/src/zarr/core/dtype/npy/time.py +++ b/src/zarr/core/dtype/npy/time.py @@ -11,7 +11,6 @@ TypeGuard, cast, get_args, - overload, ) import numpy as np @@ -23,20 +22,19 @@ DTypeJSON, HasEndianness, HasItemSize, - check_dtype_spec_v2, ) from zarr.core.dtype.npy.common import ( - DATETIME_UNIT, DateTimeUnit, + NumpyNativeDTypeV2, check_json_int, endianness_to_numpy_str, get_endianness_from_numpy_dtype, ) -from zarr.core.dtype.wrapper import TBaseDType, ZDType from zarr.errors import DataTypeValidationError if TYPE_CHECKING: from zarr.core.common import JSON, ZarrFormat + from 
zarr.core.dtype.wrapper import TBaseDType TimeDeltaLike = str | int | bytes | np.timedelta64 | timedelta | None DateTimeLike = str | int | bytes | np.datetime64 | datetime | None @@ -209,7 +207,7 @@ class DateTime64JSON_V2(DTypeConfig_V2[str, None]): class TimeDTypeBase[ DType: np.dtypes.TimeDelta64DType | np.dtypes.DateTime64DType, Scalar: np.timedelta64 | np.datetime64, -](ZDType[DType, Scalar], HasEndianness, HasItemSize): +](NumpyNativeDTypeV2[DType, Scalar], HasEndianness, HasItemSize): """ A base class for data types that represent time via the NumPy TimeDelta64 and DateTime64 data types. @@ -348,43 +346,8 @@ class TimeDelta64(TimeDTypeBase[np.dtypes.TimeDelta64DType, np.timedelta64], Has unit: DateTimeUnit = "generic" scale_factor: int = 1 _zarr_v3_name: ClassVar[Literal["numpy.timedelta64"]] = "numpy.timedelta64" - _zarr_v2_names: ClassVar[tuple[Literal[">m8"], Literal["m8", " TypeGuard[TimeDelta64JSON_V2]: - """ - Validate that the provided JSON input accurately represents a NumPy timedelta64 data type, - which could be in the form of strings like "m8[10s]". This method serves as a type - guard, helping to refine the type of unknown JSON input by confirming its adherence to the - expected format for NumPy timedelta64 data types. - - The JSON input should contain a "name" key with a value that matches the expected string - pattern for NumPy timedelta64 data types. The pattern includes an optional unit enclosed - within square brackets, following the base type identifier. - - Returns - ------- - bool - True if the JSON input is a valid representation of this class, - otherwise False. 
- """ - if not check_dtype_spec_v2(data): - return False - name = data["name"] - # match m[M], etc - # consider making this a standalone function - if not isinstance(name, str): - return False - if not name.startswith(cls._zarr_v2_names): - return False - if len(name) == 3: - # no unit, and - # we already checked that this string is either m8 - return True - else: - return name[4:-1].endswith(DATETIME_UNIT) and name[-1] == "]" - @classmethod def _check_json_v3(cls, data: DTypeJSON) -> TypeGuard[DateTime64JSON_V3]: """ @@ -404,35 +367,6 @@ def _check_json_v3(cls, data: DTypeJSON) -> TypeGuard[DateTime64JSON_V3]: and set(data["configuration"].keys()) == {"unit", "scale_factor"} ) - @classmethod - def _from_json_v2(cls, data: DTypeJSON) -> Self: - """ - Create a TimeDelta64 from a Zarr V2-flavored JSON. - - Parameters - ---------- - data : DTypeJSON - The JSON data. - - Returns - ------- - TimeDelta64 - An instance of TimeDelta64. - - Raises - ------ - DataTypeValidationError - If the input JSON is not a valid representation of this class. - """ - if cls._check_json_v2(data): - name = data["name"] - return cls.from_native_dtype(np.dtype(name)) - msg = ( - f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected a string " - f"representation of an instance of {cls.dtype_cls}" - ) - raise DataTypeValidationError(msg) - @classmethod def _from_json_v3(cls, data: DTypeJSON) -> Self: """ @@ -467,39 +401,11 @@ def _from_json_v3(cls, data: DTypeJSON) -> Self: ) raise DataTypeValidationError(msg) - @overload - def to_json(self, zarr_format: Literal[2]) -> TimeDelta64JSON_V2: ... - @overload - def to_json(self, zarr_format: Literal[3]) -> TimeDelta64JSON_V3: ... - - def to_json(self, zarr_format: ZarrFormat) -> TimeDelta64JSON_V2 | TimeDelta64JSON_V3: - """ - Serialize this data type to JSON. - - Parameters - ---------- - zarr_format : ZarrFormat - The Zarr format version (2 or 3). 
- - Returns - ------- - TimeDelta64JSON_V2 | TimeDelta64JSON_V3 - The JSON representation of the data type. - - Raises - ------ - ValueError - If the zarr_format is not 2 or 3. - """ - if zarr_format == 2: - name = self.to_native_dtype().str - return {"name": name, "object_codec_id": None} - elif zarr_format == 3: - return { - "name": self._zarr_v3_name, - "configuration": {"unit": self.unit, "scale_factor": self.scale_factor}, - } - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + def _to_json_v3(self) -> TimeDelta64JSON_V3: + return { + "name": self._zarr_v3_name, + "configuration": {"unit": self.unit, "scale_factor": self.scale_factor}, + } def _check_scalar(self, data: object) -> TypeGuard[TimeDeltaLike]: """ @@ -623,41 +529,10 @@ class DateTime64(TimeDTypeBase[np.dtypes.DateTime64DType, np.datetime64], HasEnd dtype_cls = np.dtypes.DateTime64DType # type: ignore[assignment] _zarr_v3_name: ClassVar[Literal["numpy.datetime64"]] = "numpy.datetime64" - _zarr_v2_names: ClassVar[tuple[Literal[">M8"], Literal["M8", " TypeGuard[DateTime64JSON_V2]: - """ - Check that the input is a valid JSON representation of this data type. - - Parameters - ---------- - data : DTypeJSON - The JSON data to check. - - Returns - ------- - TypeGuard[DateTime64JSON_V2] - True if the input is a valid JSON representation of a NumPy datetime64 data type, - otherwise False. 
- """ - if not check_dtype_spec_v2(data): - return False - name = data["name"] - if not isinstance(name, str): - return False - if not name.startswith(cls._zarr_v2_names): - return False - if len(name) == 3: - # no unit, and - # we already checked that this string is either M8 - return True - else: - return name[4:-1].endswith(DATETIME_UNIT) and name[-1] == "]" - @classmethod def _check_json_v3(cls, data: DTypeJSON) -> TypeGuard[DateTime64JSON_V3]: """ @@ -682,40 +557,6 @@ def _check_json_v3(cls, data: DTypeJSON) -> TypeGuard[DateTime64JSON_V3]: and set(data["configuration"].keys()) == {"unit", "scale_factor"} ) - @classmethod - def _from_json_v2(cls, data: DTypeJSON) -> Self: - """ - Create an instance of this data type from a Zarr V2-flavored JSON representation. - - This method checks if the provided JSON data is a valid representation of this class. - If valid, it creates an instance using the native NumPy dtype. Otherwise, it raises a - DataTypeValidationError. - - Parameters - ---------- - data : DTypeJSON - The JSON data to parse. - - Returns - ------- - Self - An instance of this data type. - - Raises - ------ - DataTypeValidationError - If the input JSON is not a valid representation of this class. - """ - - if cls._check_json_v2(data): - name = data["name"] - return cls.from_native_dtype(np.dtype(name)) - msg = ( - f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected a string " - f"representation of an instance of {cls.dtype_cls}" - ) - raise DataTypeValidationError(msg) - @classmethod def _from_json_v3(cls, data: DTypeJSON) -> Self: """ @@ -752,39 +593,11 @@ def _from_json_v3(cls, data: DTypeJSON) -> Self: ) raise DataTypeValidationError(msg) - @overload - def to_json(self, zarr_format: Literal[2]) -> DateTime64JSON_V2: ... - @overload - def to_json(self, zarr_format: Literal[3]) -> DateTime64JSON_V3: ... - - def to_json(self, zarr_format: ZarrFormat) -> DateTime64JSON_V2 | DateTime64JSON_V3: - """ - Serialize this data type to JSON. 
- - Parameters - ---------- - zarr_format : ZarrFormat - The Zarr format version (2 or 3). - - Returns - ------- - DateTime64JSON_V2 | DateTime64JSON_V3 - The JSON representation of the data type. - - Raises - ------ - ValueError - If the zarr_format is not 2 or 3. - """ - if zarr_format == 2: - name = self.to_native_dtype().str - return {"name": name, "object_codec_id": None} - elif zarr_format == 3: - return { - "name": self._zarr_v3_name, - "configuration": {"unit": self.unit, "scale_factor": self.scale_factor}, - } - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + def _to_json_v3(self) -> DateTime64JSON_V3: + return { + "name": self._zarr_v3_name, + "configuration": {"unit": self.unit, "scale_factor": self.scale_factor}, + } def _check_scalar(self, data: object) -> TypeGuard[DateTimeLike]: """ From a23991dab97041388529ece59f1417d77ad2b554 Mon Sep 17 00:00:00 2001 From: d-v-b-agent Date: Sat, 27 Jun 2026 09:41:20 +0000 Subject: [PATCH 5/7] Make data type aliasing declarative and uniform Introduce a single `_aliases` class var on the ZDType base (default empty) for alternative Zarr V3 names a data type accepts on input, plus `_zarr_v3_names()`/`_check_zarr_v3_name()` helpers used by the base `_from_json_v3` and every parametrized `_check_json_v3`. The canonical `_zarr_v3_name` is always what gets written out -- aliases are input-only. This replaces the previously scattered, hardcoded aliases with declarations on the types that have them: - VariableLengthBytes: "bytes" (deletes its custom `_from_json_v3`) - Struct: "structured" (legacy name) - VariableLengthUTF8: "str" (the user-facing `VLEN_UTF8_ALIAS` is now derived from the type's declared names, so there is a single source of truth) Adds tests documenting the complete alias surface across all data types and verifying that aliases resolve on input while serialization emits the canonical name. 
Co-Authored-By: Claude Opus 4.8 (1M context) Claude-Session: https://claude.ai/code/session_01XZHYrRBh54e7tFearnZ72r --- src/zarr/core/dtype/__init__.py | 10 +++--- src/zarr/core/dtype/npy/bytes.py | 34 ++----------------- src/zarr/core/dtype/npy/string.py | 3 +- src/zarr/core/dtype/npy/structured.py | 7 ++-- src/zarr/core/dtype/npy/time.py | 4 +-- src/zarr/core/dtype/wrapper.py | 20 +++++++++-- tests/test_dtype_registry.py | 48 +++++++++++++++++++++++++++ 7 files changed, 83 insertions(+), 43 deletions(-) diff --git a/src/zarr/core/dtype/__init__.py b/src/zarr/core/dtype/__init__.py index 13e2f960cd..4fc819c2a6 100644 --- a/src/zarr/core/dtype/__init__.py +++ b/src/zarr/core/dtype/__init__.py @@ -157,11 +157,11 @@ VariableLengthBytes, ) -# These are aliases for variable-length UTF-8 strings -# We handle them when a user requests a data type instead of using NumPy's dtype inferece because -# the default NumPy behavior -- to inspect the user-provided array data and choose -# an appropriately sized U dtype -- is unworkable for Zarr. -VLEN_UTF8_ALIAS: Final = ("str", str, "string") +# These are aliases for variable-length UTF-8 strings: the python ``str`` type plus the data +# type's declared Zarr V3 names ("string" and "str"). We handle them when a user requests a data +# type instead of using NumPy's dtype inference because the default NumPy behavior -- to inspect +# the user-provided array data and choose an appropriately sized U dtype -- is unworkable for Zarr. 
+VLEN_UTF8_ALIAS: Final = (str, *VariableLengthUTF8._zarr_v3_names()) # This type models inputs that can be coerced to a ZDType type ZDTypeLike = npt.DTypeLike | ZDType[TBaseDType, TBaseScalar] | Mapping[str, JSON] | str diff --git a/src/zarr/core/dtype/npy/bytes.py b/src/zarr/core/dtype/npy/bytes.py index 1fa3915979..a540d549e9 100644 --- a/src/zarr/core/dtype/npy/bytes.py +++ b/src/zarr/core/dtype/npy/bytes.py @@ -272,7 +272,7 @@ def _check_json_v3(cls, data: DTypeJSON) -> TypeGuard[NullTerminatedBytesJSON_V3 return ( isinstance(data, dict) and set(data.keys()) == {"name", "configuration"} - and data["name"] == cls._zarr_v3_name + and cls._check_zarr_v3_name(data["name"]) and isinstance(data["configuration"], dict) and "length_bytes" in data["configuration"] and isinstance(data["configuration"]["length_bytes"], int) @@ -596,7 +596,7 @@ def _check_json_v3(cls, data: DTypeJSON) -> TypeGuard[RawBytesJSON_V3]: return ( isinstance(data, dict) and set(data.keys()) == {"name", "configuration"} - and data["name"] == cls._zarr_v3_name + and cls._check_zarr_v3_name(data["name"]) and isinstance(data["configuration"], dict) and set(data["configuration"].keys()) == {"length_bytes"} and isinstance(data["configuration"]["length_bytes"], int) @@ -809,6 +809,7 @@ class VariableLengthBytes(ObjectCodecDTypeV2[np.dtypes.ObjectDType, bytes], HasO dtype_cls = np.dtypes.ObjectDType _zarr_v3_name: ClassVar[Literal["variable_length_bytes"]] = "variable_length_bytes" + _aliases: ClassVar[tuple[str, ...]] = ("bytes",) object_codec_id: ClassVar[Literal["vlen-bytes"]] = "vlen-bytes" @classmethod @@ -851,35 +852,6 @@ def to_native_dtype(self) -> np.dtypes.ObjectDType: """ return self.dtype_cls() - @classmethod - def _from_json_v3(cls, data: DTypeJSON) -> Self: - """ - Create an instance of VariableLengthBytes from Zarr V3-flavored JSON. - - In addition to the canonical ``variable_length_bytes`` name, the alias ``"bytes"`` is - accepted. 
- - Parameters - ---------- - data : DTypeJSON - The JSON data to parse. - - Returns - ------- - VariableLengthBytes - An instance of VariableLengthBytes. - - Raises - ------ - DataTypeValidationError - If the input data is not a valid representation of this class. - """ - - if data in (cls._zarr_v3_name, "bytes"): - return cls() - msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected the string {cls._zarr_v3_name!r}" - raise DataTypeValidationError(msg) - def _to_json_v3(self) -> Literal["variable_length_bytes"]: v3_unstable_dtype_warning(self) return self._zarr_v3_name diff --git a/src/zarr/core/dtype/npy/string.py b/src/zarr/core/dtype/npy/string.py index cad681721a..4fc90275d2 100644 --- a/src/zarr/core/dtype/npy/string.py +++ b/src/zarr/core/dtype/npy/string.py @@ -191,7 +191,7 @@ def _check_json_v3(cls, data: DTypeJSON) -> TypeGuard[FixedLengthUTF32JSON_V3]: return ( isinstance(data, dict) and set(data.keys()) == {"name", "configuration"} - and data["name"] == cls._zarr_v3_name + and cls._check_zarr_v3_name(data["name"]) and "configuration" in data and isinstance(data["configuration"], dict) and set(data["configuration"].keys()) == {"length_bytes"} @@ -403,6 +403,7 @@ class VariableLengthUTF8(ObjectCodecDTypeV2[np.dtypes.StringDType, str], HasObje dtype_cls = np.dtypes.StringDType # type: ignore[assignment] _zarr_v3_name: ClassVar[Literal["string"]] = "string" + _aliases: ClassVar[tuple[str, ...]] = ("str",) object_codec_id: ClassVar[Literal["vlen-utf8"]] = "vlen-utf8" @classmethod diff --git a/src/zarr/core/dtype/npy/structured.py b/src/zarr/core/dtype/npy/structured.py index f63df0acbb..fff852e328 100644 --- a/src/zarr/core/dtype/npy/structured.py +++ b/src/zarr/core/dtype/npy/structured.py @@ -264,7 +264,7 @@ def _check_json_v3(cls, data: DTypeJSON) -> TypeGuard[StructuredJSON_V3]: return ( isinstance(data, dict) and set(data.keys()) == {"name", "configuration"} - and data["name"] == cls._zarr_v3_name + and 
cls._check_zarr_v3_name(data["name"]) and isinstance(data["configuration"], dict) and set(data["configuration"].keys()) == {"fields"} ) @@ -515,13 +515,16 @@ class Struct(Structured): """ _zarr_v3_name: ClassVar[Literal["struct"]] = "struct" # type: ignore[assignment] + # "structured" is the legacy name for this data type; it is accepted on input but "struct" + # is always written out. + _aliases: ClassVar[tuple[str, ...]] = ("structured",) @classmethod def _check_json_v3(cls, data: DTypeJSON) -> TypeGuard[StructJSON_V3]: # type: ignore[override] return ( isinstance(data, dict) and set(data.keys()) == {"name", "configuration"} - and data["name"] in ("struct", "structured") + and cls._check_zarr_v3_name(data["name"]) and isinstance(data["configuration"], dict) and set(data["configuration"].keys()) == {"fields"} ) diff --git a/src/zarr/core/dtype/npy/time.py b/src/zarr/core/dtype/npy/time.py index 668e8c6a2a..87b656e09a 100644 --- a/src/zarr/core/dtype/npy/time.py +++ b/src/zarr/core/dtype/npy/time.py @@ -362,7 +362,7 @@ def _check_json_v3(cls, data: DTypeJSON) -> TypeGuard[DateTime64JSON_V3]: return ( isinstance(data, dict) and set(data.keys()) == {"name", "configuration"} - and data["name"] == cls._zarr_v3_name + and cls._check_zarr_v3_name(data["name"]) and isinstance(data["configuration"], dict) and set(data["configuration"].keys()) == {"unit", "scale_factor"} ) @@ -552,7 +552,7 @@ def _check_json_v3(cls, data: DTypeJSON) -> TypeGuard[DateTime64JSON_V3]: return ( isinstance(data, dict) and set(data.keys()) == {"name", "configuration"} - and data["name"] == cls._zarr_v3_name + and cls._check_zarr_v3_name(data["name"]) and isinstance(data["configuration"], dict) and set(data["configuration"].keys()) == {"unit", "scale_factor"} ) diff --git a/src/zarr/core/dtype/wrapper.py b/src/zarr/core/dtype/wrapper.py index 2305fdd629..572d0bdd5a 100644 --- a/src/zarr/core/dtype/wrapper.py +++ b/src/zarr/core/dtype/wrapper.py @@ -68,11 +68,26 @@ class variable, and it should 
generally be unique across different data types. # this class will create a native data type dtype_cls: ClassVar[type[TBaseDType]] _zarr_v3_name: ClassVar[str] + # Alternative Zarr V3 names that are *accepted* on input (in array metadata or as a + # user-provided data type string) in addition to ``_zarr_v3_name``. The canonical + # ``_zarr_v3_name`` is always what gets written out; aliases are input-only. Most data types + # have no aliases. + _aliases: ClassVar[tuple[str, ...]] = () # For data types backed by the NumPy object dtype in Zarr V2 (e.g. variable-length strings), # this is the id of the object codec used to encode values, e.g. "vlen-utf8". For all other # data types it is None. It is used by the default Zarr V2 (de)serialization below. object_codec_id: ClassVar[str | None] = None + @classmethod + def _zarr_v3_names(cls) -> tuple[str, ...]: + """All Zarr V3 names accepted for this data type: the canonical name plus any aliases.""" + return (cls._zarr_v3_name, *cls._aliases) + + @classmethod + def _check_zarr_v3_name(cls, name: object) -> bool: + """Whether ``name`` is this data type's canonical Zarr V3 name or one of its aliases.""" + return name in cls._zarr_v3_names() + @classmethod def _check_native_dtype(cls: type[Self], dtype: TBaseDType) -> TypeGuard[DType]: """ @@ -155,9 +170,10 @@ def _from_json_v3(cls: type[Self], data: DTypeJSON) -> Self: Construct an instance from the Zarr V3 JSON form of this data type. The default implementation handles parameter-free data types, whose Zarr V3 form is just - the data type name. Data types with a ``configuration`` object should override this. + the data type name (or one of its aliases). Data types with a ``configuration`` object + should override this. 
""" - if data == cls._zarr_v3_name: + if cls._check_zarr_v3_name(data): return cls() raise DataTypeValidationError( f"Invalid Zarr V3 JSON representation of {cls.__name__}: {data!r}" diff --git a/tests/test_dtype_registry.py b/tests/test_dtype_registry.py index f849b089c7..8bc86f56f3 100644 --- a/tests/test_dtype_registry.py +++ b/tests/test_dtype_registry.py @@ -23,8 +23,11 @@ from zarr.dtype import ( # type: ignore[attr-defined] Bool, FixedLengthUTF32, + Struct, + VariableLengthBytes, VariableLengthUTF8, ZDType, + data_type_registry, parse_data_type, parse_dtype, ) @@ -220,3 +223,48 @@ def test_parse_data_type( else: observed = dtype_parser_func(dtype_spec, zarr_format=zarr_format) assert observed == data_type + + +# The complete alias surface across every built-in data type. Aliases are alternative Zarr V3 +# names accepted on input; the canonical ``_zarr_v3_name`` is always what gets written out. Most +# data types have no aliases (an empty ``_aliases``). Update this mapping when adding aliases. 
+EXPECTED_ALIASES: dict[str, tuple[str, ...]] = { + "string": ("str",), + "variable_length_bytes": ("bytes",), + "struct": ("structured",), +} + + +def test_alias_surface_is_complete() -> None: + """Every registered data type's declared aliases match the documented alias surface.""" + observed = { + cls._zarr_v3_name: cls._aliases for cls in data_type_registry.values() if cls._aliases + } + assert observed == EXPECTED_ALIASES + + +@pytest.mark.filterwarnings("ignore::zarr.core.dtype.common.UnstableSpecificationWarning") +@pytest.mark.parametrize( + ("alias", "expected_cls"), + [ + ("str", VariableLengthUTF8), + ("string", VariableLengthUTF8), + ("bytes", VariableLengthBytes), + ], +) +def test_v3_name_alias_resolves(alias: str, expected_cls: type[ZDType[Any, Any]]) -> None: + """A Zarr V3 name alias resolves to its data type, and the data type writes the canonical name.""" + resolved = get_data_type_from_json(alias, zarr_format=3) + assert isinstance(resolved, expected_cls) + # Aliases are input-only: serialization always emits the canonical name, never the alias. 
+ assert resolved.to_json(zarr_format=3) == expected_cls._zarr_v3_name + assert resolved.to_json(zarr_format=3) != alias or alias == expected_cls._zarr_v3_name + + +@pytest.mark.filterwarnings("ignore::zarr.core.dtype.common.UnstableSpecificationWarning") +def test_v3_name_alias_parametrized() -> None: + """The ``structured`` legacy name resolves to ``Struct``, which writes ``struct``.""" + data = {"name": "structured", "configuration": {"fields": [["a", "int8"]]}} + resolved = get_data_type_from_json(data, zarr_format=3) + assert isinstance(resolved, Struct) + assert resolved.to_json(zarr_format=3)["name"] == "struct" From 5a43a52ea3718d5fef2a47f9d66ccc188a24d49a Mon Sep 17 00:00:00 2001 From: d-v-b-agent Date: Sat, 27 Jun 2026 11:27:32 +0000 Subject: [PATCH 6/7] Accept the Zarr V3 spec raw type name r<N> on input The Zarr V3 core spec names raw data types `r<N>`, where N is a bit count that must be a positive multiple of 8 (e.g. "r8", "r16"). zarr-python's own canonical V3 form for raw bytes is the more explicit `{"name": "raw_bytes", "configuration": {"length_bytes": ...}}`. Accept the spec `r<N>` form on input (parsing N bits into a byte length) so that raw arrays written by other Zarr V3 implementations can be read. This is an input-only alias: serialization still emits the canonical `raw_bytes` configuration form, consistent with how all other data type aliases behave.
Co-Authored-By: Claude Opus 4.8 (1M context) Claude-Session: https://claude.ai/code/session_01XZHYrRBh54e7tFearnZ72r --- src/zarr/core/dtype/npy/bytes.py | 24 ++++++++++++++++++++++++ tests/test_dtype_registry.py | 22 ++++++++++++++++++++++ 2 files changed, 46 insertions(+) diff --git a/src/zarr/core/dtype/npy/bytes.py b/src/zarr/core/dtype/npy/bytes.py index a540d549e9..a7a7b247cd 100644 --- a/src/zarr/core/dtype/npy/bytes.py +++ b/src/zarr/core/dtype/npy/bytes.py @@ -1,6 +1,7 @@ from __future__ import annotations import base64 +import re from dataclasses import dataclass from typing import TYPE_CHECKING, ClassVar, Literal, Self, TypedDict, TypeGuard, cast @@ -465,6 +466,25 @@ def item_size(self) -> int: return self.length +# The Zarr V3 core spec names raw data types ``r<N>``, where N is the number of *bits* and must be +# a multiple of 8 (e.g. "r8", "r16"). zarr-python's own canonical V3 form is the more explicit +# ``{"name": "raw_bytes", "configuration": {"length_bytes": ...}}``, but we accept the spec form on +# input so that raw arrays written by other Zarr V3 implementations can be read. +_RAW_BYTES_SPEC_NAME = re.compile(r"^r(\d+)$") + + +def _raw_bytes_spec_length_bytes(data: object) -> int | None: + """ + If ``data`` is a Zarr V3 core-spec raw data type name (e.g. ``"r8"``), return its length in + bytes; otherwise return None. The number in the name is a bit count and must be a multiple of 8. + """ + if isinstance(data, str) and (match := _RAW_BYTES_SPEC_NAME.match(data)) is not None: + num_bits = int(match.group(1)) + if num_bits > 0 and num_bits % 8 == 0: + return num_bits // 8 + return None + + @dataclass(frozen=True, kw_only=True) class RawBytes(NumpyNativeDTypeV2[np.dtypes.VoidDType[int], np.void], HasLength, HasItemSize): """ @@ -628,6 +648,10 @@ def _from_json_v3(cls, data: DTypeJSON) -> Self: """ if cls._check_json_v3(data): return cls(length=data["configuration"]["length_bytes"]) + # Also accept the Zarr V3 core spec form, e.g. "r8", "r16".
+ spec_length_bytes = _raw_bytes_spec_length_bytes(data) + if spec_length_bytes is not None: + return cls(length=spec_length_bytes) msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected the string {cls._zarr_v3_name!r}" raise DataTypeValidationError(msg) diff --git a/tests/test_dtype_registry.py b/tests/test_dtype_registry.py index 8bc86f56f3..2030264ee8 100644 --- a/tests/test_dtype_registry.py +++ b/tests/test_dtype_registry.py @@ -23,6 +23,7 @@ from zarr.dtype import ( # type: ignore[attr-defined] Bool, FixedLengthUTF32, + RawBytes, Struct, VariableLengthBytes, VariableLengthUTF8, @@ -268,3 +269,24 @@ def test_v3_name_alias_parametrized() -> None: resolved = get_data_type_from_json(data, zarr_format=3) assert isinstance(resolved, Struct) assert resolved.to_json(zarr_format=3)["name"] == "struct" + + +@pytest.mark.filterwarnings("ignore::zarr.core.dtype.common.UnstableSpecificationWarning") +@pytest.mark.parametrize(("spec_name", "length"), [("r8", 1), ("r16", 2), ("r64", 8)]) +def test_raw_bytes_v3_spec_name(spec_name: str, length: int) -> None: + """The Zarr V3 core spec raw name ``r<N>`` (N bits) is accepted on input as RawBytes.""" + resolved = get_data_type_from_json(spec_name, zarr_format=3) + assert isinstance(resolved, RawBytes) + assert resolved.length == length + # Aliases are input-only: RawBytes always writes its canonical configuration form.
+ assert resolved.to_json(zarr_format=3) == { + "name": "raw_bytes", + "configuration": {"length_bytes": length}, + } + + +@pytest.mark.parametrize("bad_name", ["r12", "r0", "r", "rabc", "r8x"]) +def test_raw_bytes_v3_spec_name_invalid(bad_name: str) -> None: + """Raw spec names with a non-positive or non-multiple-of-8 bit count do not resolve.""" + with pytest.raises(ValueError, match="No Zarr data type found"): + get_data_type_from_json(bad_name, zarr_format=3) From 7dc19781f9eeea23c7a3bc1c62e4ea9c1e8651d0 Mon Sep 17 00:00:00 2001 From: d-v-b-agent Date: Sun, 28 Jun 2026 10:22:35 +0000 Subject: [PATCH 7/7] Resolve data types by index instead of iterate-and-try; add strict/compatible modes Replace the "try every registered data type until one matches" resolution with normalize-then-look-up: - Native NumPy dtypes resolve through an index keyed on the dtype class (built per call from the registry, which stays a plain Mapping). Each class maps to a single data type, except NumPy's VoidDType, which is shared by the raw-bytes and structured types and disambiguated by `.fields`. The NumPy "Object" dtype remains a deliberate ambiguity and is refused. - Zarr V3 JSON resolves by name: canonical name, alias, or a parametric name (e.g. raw `r<N>`, via a new `_zarr_v3_name_pattern` class var). - Zarr V2 JSON resolves object-codec-backed types by `object_codec_id`, custom types by their registered name, and everything else through the native NumPy dtype. Add a `data_type_resolution` config option with two modes: "compatible" (default) makes a best-effort attempt to read wrong-but-parsable Zarr V2 type strings (e.g. ">u1", which NumPy normalizes to "|u1"), while "strict" accepts only spec-compliant, canonical data type metadata.
Co-Authored-By: Claude Opus 4.8 (1M context) Claude-Session: https://claude.ai/code/session_01XZHYrRBh54e7tFearnZ72r --- src/zarr/core/config.py | 5 ++ src/zarr/core/dtype/npy/bytes.py | 2 + src/zarr/core/dtype/registry.py | 150 ++++++++++++++++++++++++++++--- src/zarr/core/dtype/wrapper.py | 7 ++ tests/test_config.py | 1 + tests/test_dtype_registry.py | 34 +++++++ 6 files changed, 186 insertions(+), 13 deletions(-) diff --git a/src/zarr/core/config.py b/src/zarr/core/config.py index 08d2a50ace..e2243f11b3 100644 --- a/src/zarr/core/config.py +++ b/src/zarr/core/config.py @@ -93,6 +93,11 @@ def enable_gpu(self) -> ConfigSet: defaults=[ { "default_zarr_format": 3, + # How to interpret data type metadata. "compatible" makes a best-effort attempt to + # read wrong-but-parsable data type metadata (e.g. a Zarr V2 ``">u1"`` typestring, + # which NumPy accepts but normalizes to ``"|u1"``). "strict" accepts only + # spec-compliant, canonical data type metadata. + "data_type_resolution": "compatible", "array": { "order": "C", "write_empty_chunks": False, diff --git a/src/zarr/core/dtype/npy/bytes.py b/src/zarr/core/dtype/npy/bytes.py index a7a7b247cd..fe65150e53 100644 --- a/src/zarr/core/dtype/npy/bytes.py +++ b/src/zarr/core/dtype/npy/bytes.py @@ -517,6 +517,8 @@ class does not support structured data types. # so we have to tell mypy to ignore this here dtype_cls = np.dtypes.VoidDType # type: ignore[assignment] _zarr_v3_name: ClassVar[Literal["raw_bytes"]] = "raw_bytes" + # The Zarr V3 core spec writes raw data types as ``r<N>``; accept that form on input.
+ _zarr_v3_name_pattern: ClassVar[re.Pattern[str] | None] = _RAW_BYTES_SPEC_NAME def __post_init__(self) -> None: """ diff --git a/src/zarr/core/dtype/registry.py b/src/zarr/core/dtype/registry.py index f890c89365..08988e4bc0 100644 --- a/src/zarr/core/dtype/registry.py +++ b/src/zarr/core/dtype/registry.py @@ -1,8 +1,10 @@ from __future__ import annotations import contextlib +from collections import defaultdict +from collections.abc import Mapping from importlib.metadata import entry_points as get_entry_points -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, cast import numpy as np @@ -109,6 +111,8 @@ def match_dtype( if registry is None: registry = data_type_registry + # The NumPy "Object" dtype is a catch-all that can back many Zarr data types, so it cannot be + # uniquely resolved from the native dtype alone. if dtype == np.dtype("O"): msg = ( f"Zarr data type resolution from {dtype} failed. " @@ -120,17 +124,19 @@ def match_dtype( "data type, see https://github.com/zarr-developers/zarr-python/issues/3117" ) raise ValueError(msg) + # Resolve via an index keyed on the NumPy dtype class rather than scanning every registered + # data type. Each class maps to a single data type, except NumPy's VoidDType, which is shared + # by the raw-bytes and structured data types (disambiguated by from_native_dtype via .fields). + candidates = _native_dtype_index(registry).get(type(dtype), []) matched: list[ZDType[TBaseDType, TBaseScalar]] = [] - for val in registry.values(): - # DataTypeValidationError means "this dtype doesn't match me", which is - # expected and suppressed. Other exceptions (e.g. ValueError for a dtype - # that matches the type but has an invalid configuration) are propagated - # to the caller. + for cls in candidates: + # DataTypeValidationError means "this dtype doesn't match me", which is expected and + # suppressed. Other exceptions (e.g. ValueError for an unsupported configuration) propagate. 
with contextlib.suppress(DataTypeValidationError): - matched.append(val.from_native_dtype(dtype)) + matched.append(cls.from_native_dtype(dtype)) if len(matched) == 1: return matched[0] - elif len(matched) > 1: + if len(matched) > 1: msg = ( f"Zarr data type resolution from {dtype} failed. " f"Multiple data type wrappers found that match dtype '{dtype}': {matched}. " @@ -148,6 +154,15 @@ def match_json( """ Match a JSON representation of a data type to a registered ZDType. + Resolution normalizes the input to a data type *name* and looks that name up, rather than + trying every registered data type in turn: + + - Zarr V3: the canonical name, an alias, or a parametric name (e.g. raw ``r<N>``) is mapped + directly to its data type. + - Zarr V2: object-codec-backed types are identified by their ``object_codec_id``; everything + else is resolved through the native NumPy dtype (which also accepts wrong-but-parsable + type strings such as ``">u1"`` unless the ``data_type_resolution`` config is ``"strict"``). + Parameters ---------- data : DTypeJSON @@ -167,10 +182,119 @@ def match_json( """ if registry is None: registry = data_type_registry + if zarr_format == 2: + return _match_json_v2(data, registry) + if zarr_format == 3: + return _match_json_v3(data, registry) + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + + +def _no_match(data: object) -> ValueError: + return ValueError(f"No Zarr data type found that matches {data!r}") + + +def _native_dtype_index( + registry: DataTypeRegistry, +) -> dict[type[TBaseDType], list[type[ZDType[TBaseDType, TBaseScalar]]]]: + """ + Index a registry by the NumPy dtype class that each data type wraps. The index is small and + cheap to build, and is rebuilt per call so the registry itself stays a plain mapping.
+ """ + index: dict[type[TBaseDType], list[type[ZDType[TBaseDType, TBaseScalar]]]] = defaultdict(list) + for cls in registry.values(): + index[cls.dtype_cls].append(cls) + return index + + +def _resolve_v3_name( + name: str, registry: DataTypeRegistry +) -> type[ZDType[TBaseDType, TBaseScalar]] | None: + """Resolve a Zarr V3 data type name (canonical, alias, or parametric) to its data type.""" + if name in registry: + return registry[name] + for cls in registry.values(): + if name in cls._aliases: + return cls + pattern = cls._zarr_v3_name_pattern + if pattern is not None and pattern.match(name): + return cls + return None - for val in registry.values(): + +def _match_json_v3( + data: DTypeJSON, registry: DataTypeRegistry +) -> ZDType[TBaseDType, TBaseScalar]: + if isinstance(data, str): + name: object = data + elif isinstance(data, Mapping) and "name" in data: + name = data["name"] + else: + name = None + if isinstance(name, str): + cls = _resolve_v3_name(name, registry) + if cls is not None: + with contextlib.suppress(DataTypeValidationError): + return cls.from_json(data, zarr_format=3) + raise _no_match(data) + + +def _match_json_v2( + data: DTypeJSON, registry: DataTypeRegistry +) -> ZDType[TBaseDType, TBaseScalar]: + if not (isinstance(data, Mapping) and "name" in data and "object_codec_id" in data): + raise _no_match(data) + name = data["name"] + object_codec_id = data["object_codec_id"] + if object_codec_id is not None: + # Object-codec-backed data types (variable-length strings/bytes) are identified by the + # object codec id rather than the (always "|O") name. + for cls in registry.values(): + if cls.object_codec_id == object_codec_id: + with contextlib.suppress(DataTypeValidationError): + return cls.from_json(data, zarr_format=2) + raise _no_match(data) + if isinstance(name, str): + # First try interpreting the name as a NumPy type string (the usual Zarr V2 case). This + # also accepts wrong-but-parsable spellings (e.g. 
">u1", which NumPy normalizes to "|u1"); + # in strict mode such non-canonical names are rejected. + native_dtype: TBaseDType | None try: - return val.from_json(data, zarr_format=zarr_format) - except DataTypeValidationError: - pass - raise ValueError(f"No Zarr data type found that matches {data!r}") + native_dtype = np.dtype(name) + except TypeError: + native_dtype = None + if native_dtype is not None: + try: + zdtype = match_dtype(native_dtype, registry=registry) + except ValueError: + zdtype = None + if zdtype is not None: + if _resolution_mode() == "strict": + canonical = zdtype.to_json(zarr_format=2)["name"] + if name != canonical: + raise ValueError( + f"The Zarr V2 data type name {name!r} is not spec-compliant; the " + f"canonical name for this data type is {canonical!r}. Set the " + "'data_type_resolution' config option to 'compatible' to read it anyway." + ) + return zdtype + # Fall back to interpreting the name as a registered data type name. This covers data types + # (e.g. parameter-free custom data types) whose Zarr V2 name is their registered name rather + # than a NumPy type string. + named_cls = _resolve_v3_name(name, registry) + if named_cls is not None: + with contextlib.suppress(DataTypeValidationError): + return named_cls.from_json(data, zarr_format=2) + raise _no_match(data) + # Otherwise the name is a structured data type descriptor (a sequence of fields), which the + # raw-bytes / structured data types (NumPy VoidDType) know how to parse. 
+ for cls in _native_dtype_index(registry).get(cast("type[TBaseDType]", np.dtypes.VoidDType), []): + with contextlib.suppress(DataTypeValidationError): + return cls.from_json(data, zarr_format=2) + raise _no_match(data) + + +def _resolution_mode() -> str: + """The configured data type resolution mode: ``"strict"`` or ``"compatible"``.""" + from zarr.core.config import config + + return str(config.get("data_type_resolution")) diff --git a/src/zarr/core/dtype/wrapper.py b/src/zarr/core/dtype/wrapper.py index 572d0bdd5a..6c45095844 100644 --- a/src/zarr/core/dtype/wrapper.py +++ b/src/zarr/core/dtype/wrapper.py @@ -40,6 +40,8 @@ from zarr.errors import DataTypeValidationError if TYPE_CHECKING: + import re + from zarr.core.common import JSON, ZarrFormat from zarr.core.dtype.common import DTypeJSON, DTypeSpec_V2, DTypeSpec_V3 @@ -73,6 +75,11 @@ class variable, and it should generally be unique across different data types. # ``_zarr_v3_name`` is always what gets written out; aliases are input-only. Most data types # have no aliases. _aliases: ClassVar[tuple[str, ...]] = () + # A pattern for *parametric* Zarr V3 names that encode a parameter in the name itself rather + # than a fixed string or alias (e.g. the raw data type's ``r<N>``, where N is a bit count). + # Used during data type resolution to map such a name to this data type. Most data types have + # no parametric name (None). + _zarr_v3_name_pattern: ClassVar[re.Pattern[str] | None] = None # For data types backed by the NumPy object dtype in Zarr V2 (e.g. variable-length strings), # this is the id of the object codec used to encode values, e.g. "vlen-utf8". For all other # data types it is None. It is used by the default Zarr V2 (de)serialization below.
diff --git a/tests/test_config.py b/tests/test_config.py index a758378dc7..7a62001264 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -51,6 +51,7 @@ def test_config_defaults_set() -> None: == [ { "default_zarr_format": 3, + "data_type_resolution": "compatible", "array": { "order": "C", "write_empty_chunks": False, diff --git a/tests/test_dtype_registry.py b/tests/test_dtype_registry.py index 2030264ee8..e13ca2467d 100644 --- a/tests/test_dtype_registry.py +++ b/tests/test_dtype_registry.py @@ -23,8 +23,10 @@ from zarr.dtype import ( # type: ignore[attr-defined] Bool, FixedLengthUTF32, + Int16, RawBytes, Struct, + UInt8, VariableLengthBytes, VariableLengthUTF8, ZDType, @@ -290,3 +292,35 @@ def test_raw_bytes_v3_spec_name_invalid(bad_name: str) -> None: """Raw spec names with a non-positive or non-multiple-of-8 bit count do not resolve.""" with pytest.raises(ValueError, match="No Zarr data type found"): get_data_type_from_json(bad_name, zarr_format=3) + + +@pytest.mark.parametrize("mode", ["strict", "compatible"]) +def test_v2_wrong_but_parsable_typestring(mode: str) -> None: + """ + A wrong-but-parsable Zarr V2 type string (here ``">u1"``, which NumPy accepts but normalizes + to ``"|u1"``) is interpreted in compatibility mode and rejected in strict mode. 
+ """ + from zarr.core.config import config + + data = {"name": ">u1", "object_codec_id": None} + with config.set({"data_type_resolution": mode}): + if mode == "compatible": + assert isinstance(get_data_type_from_json(data, zarr_format=2), UInt8) + else: + with pytest.raises(ValueError, match="not spec-compliant"): + get_data_type_from_json(data, zarr_format=2) + + +@pytest.mark.parametrize("mode", ["strict", "compatible"]) +def test_v2_canonical_typestring_both_modes(mode: str) -> None: + """Canonical Zarr V2 type strings resolve in both modes, including legitimate byte order.""" + from zarr.core.config import config + + with config.set({"data_type_resolution": mode}): + assert isinstance( + get_data_type_from_json({"name": "|u1", "object_codec_id": None}, zarr_format=2), UInt8 + ) + # A byte-order prefix on a multibyte type is canonical, not "wrong-but-parsable". + assert isinstance( + get_data_type_from_json({"name": ">i2", "object_codec_id": None}, zarr_format=2), Int16 + )