diff --git a/examples/custom_dtype/custom_dtype.py b/examples/custom_dtype/custom_dtype.py index 53acb70f52..854f02f62b 100644 --- a/examples/custom_dtype/custom_dtype.py +++ b/examples/custom_dtype/custom_dtype.py @@ -15,16 +15,16 @@ import json import sys from pathlib import Path -from typing import ClassVar, Literal, Self, TypeGuard, overload +from typing import ClassVar, Literal, Self, TypeGuard import ml_dtypes # necessary to add extra dtypes to NumPy import numpy as np import pytest import zarr -from zarr.dtype import ZDType, check_dtype_spec_v2, data_type_registry +from zarr.dtype import ZDType, register_data_type from zarr.errors import DataTypeValidationError -from zarr.types import JSON, DTypeConfig_V2, DTypeJSON, ZarrFormat +from zarr.types import JSON, ZarrFormat # This is the int2 array data type int2_dtype_cls = type(np.dtype("int2")) @@ -39,11 +39,11 @@ class Int2(ZDType[int2_dtype_cls, int2_scalar_cls]): NumPy array of type int2) and the int2 scalar type (the ``dtype`` of the scalar value inside an int2 array). """ - # This field is as the key for the data type in the internal data type registry, and also - # as the identifier for the data type when serializaing the data type to disk for zarr v3 + # This is the key for the data type in the internal data type registry, and also the identifier + # for the data type when serializing it to disk. For a parameter-free data type like this one, + # ZDType uses it as the entire Zarr V3 representation and as the Zarr V2 ``name`` -- so we don't + # need to write any JSON (de)serialization for the data type itself; the base class handles it. 
_zarr_v3_name: ClassVar[Literal["int2"]] = "int2" - # this field will be used internally - _zarr_v2_name: ClassVar[Literal["int2"]] = "int2" # we bind a class variable to the native data type class so we can create instances of it dtype_cls = int2_dtype_cls @@ -61,89 +61,6 @@ def to_native_dtype(self: Self) -> int2_dtype_cls: """Create an int2 dtype instance from this ZDType""" return self.dtype_cls() - @classmethod - def _check_json_v2(cls, data: DTypeJSON) -> TypeGuard[DTypeConfig_V2[Literal["|b1"], None]]: - """ - Type check for Zarr v2-flavored JSON. - - This will check that the input is a dict like this: - .. code-block:: json - - { - "name": "int2", - "object_codec_id": None - } - - Note that this representation differs from the ``dtype`` field looks like in zarr v2 metadata. - Specifically, whatever goes into the ``dtype`` field in metadata is assigned to the ``name`` field here. - - See the Zarr docs for more information about the JSON encoding for data types. - """ - return ( - check_dtype_spec_v2(data) and data["name"] == "int2" and data["object_codec_id"] is None - ) - - @classmethod - def _check_json_v3(cls, data: DTypeJSON) -> TypeGuard[Literal["int2"]]: - """ - Type check for Zarr V3-flavored JSON. - - Checks that the input is the string "int2". - """ - return data == cls._zarr_v3_name - - @classmethod - def _from_json_v2(cls, data: DTypeJSON) -> Self: - """ - Create an instance of this ZDType from Zarr V3-flavored JSON. - """ - if cls._check_json_v2(data): - return cls() - # This first does a type check on the input, and if that passes we create an instance of the ZDType. - msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected the string {cls._zarr_v2_name!r}" - raise DataTypeValidationError(msg) - - @classmethod - def _from_json_v3(cls: type[Self], data: DTypeJSON) -> Self: - """ - Create an instance of this ZDType from Zarr V3-flavored JSON. 
- - This first does a type check on the input, and if that passes we create an instance of the ZDType. - """ - if cls._check_json_v3(data): - return cls() - msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected the string {cls._zarr_v3_name!r}" - raise DataTypeValidationError(msg) - - @overload - def to_json(self, zarr_format: Literal[2]) -> DTypeConfig_V2[Literal["int2"], None]: ... - - @overload - def to_json(self, zarr_format: Literal[3]) -> Literal["int2"]: ... - - def to_json( - self, zarr_format: ZarrFormat - ) -> DTypeConfig_V2[Literal["int2"], None] | Literal["int2"]: - """ - Serialize this ZDType to v2- or v3-flavored JSON - - If the zarr_format is 2, then return a dict like this: - .. code-block:: json - - { - "name": "int2", - "object_codec_id": None - } - - If the zarr_format is 3, then return the string "int2" - - """ - if zarr_format == 2: - return {"name": "int2", "object_codec_id": None} - if zarr_format == 3: - return self._zarr_v3_name - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - def _check_scalar(self, data: object) -> TypeGuard[int | ml_dtypes.int2]: """ Check if a python object is a valid int2-compatible scalar @@ -209,7 +126,7 @@ def from_json_scalar(self, data: JSON, *, zarr_format: ZarrFormat) -> ml_dtypes. # after defining dtype class, it must be registered with the data type registry so zarr can use it -data_type_registry.register(Int2._zarr_v3_name, Int2) +register_data_type(Int2) # this parametrized function will create arrays in zarr v2 and v3 using our new data type diff --git a/src/zarr/core/config.py b/src/zarr/core/config.py index 08d2a50ace..e2243f11b3 100644 --- a/src/zarr/core/config.py +++ b/src/zarr/core/config.py @@ -93,6 +93,11 @@ def enable_gpu(self) -> ConfigSet: defaults=[ { "default_zarr_format": 3, + # How to interpret data type metadata. "compatible" makes a best-effort attempt to + # read wrong-but-parsable data type metadata (e.g. 
a Zarr V2 ``">u1"`` typestring, + # which NumPy accepts but normalizes to ``"|u1"``). "strict" accepts only + # spec-compliant, canonical data type metadata. + "data_type_resolution": "compatible", "array": { "order": "C", "write_empty_chunks": False, diff --git a/src/zarr/core/dtype/__init__.py b/src/zarr/core/dtype/__init__.py index d1dbd6e2c8..4fc819c2a6 100644 --- a/src/zarr/core/dtype/__init__.py +++ b/src/zarr/core/dtype/__init__.py @@ -50,7 +50,15 @@ VariableLengthUTF8, VariableLengthUTF8JSON_V2, ) -from zarr.core.dtype.registry import DataTypeRegistry +from zarr.core.dtype.registry import ( + DataTypeRegistry, + data_type_registry, + load_data_type_entrypoints, + match_dtype, + match_json, + register_data_type, + unregister_data_type, +) from zarr.core.dtype.wrapper import TBaseDType, TBaseScalar, ZDType __all__ = [ @@ -97,12 +105,15 @@ "VariableLengthUTF8JSON_V2", "ZDType", "data_type_registry", + "load_data_type_entrypoints", + "match_dtype", + "match_json", "parse_data_type", "parse_dtype", + "register_data_type", + "unregister_data_type", ] -data_type_registry = DataTypeRegistry() - IntegerDType = Int8 | Int16 | Int32 | Int64 | UInt8 | UInt16 | UInt32 | UInt64 INTEGER_DTYPE: Final = Int8, Int16, Int32, Int64, UInt8, UInt16, UInt32, UInt64 @@ -146,18 +157,21 @@ VariableLengthBytes, ) -# These are aliases for variable-length UTF-8 strings -# We handle them when a user requests a data type instead of using NumPy's dtype inferece because -# the default NumPy behavior -- to inspect the user-provided array data and choose -# an appropriately sized U dtype -- is unworkable for Zarr. -VLEN_UTF8_ALIAS: Final = ("str", str, "string") +# These are aliases for variable-length UTF-8 strings: the python ``str`` type plus the data +# type's declared Zarr V3 names ("string" and "str"). 
We handle them when a user requests a data +# type instead of using NumPy's dtype inference because the default NumPy behavior -- to inspect +# the user-provided array data and choose an appropriately sized U dtype -- is unworkable for Zarr. +VLEN_UTF8_ALIAS: Final = (str, *VariableLengthUTF8._zarr_v3_names()) # This type models inputs that can be coerced to a ZDType type ZDTypeLike = npt.DTypeLike | ZDType[TBaseDType, TBaseScalar] | Mapping[str, JSON] | str for dtype in ANY_DTYPE: # mypy does not know that all the elements of ANY_DTYPE are subclasses of ZDType - data_type_registry.register(dtype._zarr_v3_name, dtype) # type: ignore[arg-type] + register_data_type(dtype) # type: ignore[arg-type] + +# Register any data types advertised by third-party packages via entry points. +load_data_type_entrypoints() # TODO: find a better name for this function @@ -174,7 +188,7 @@ def get_data_type_from_native_dtype(dtype: npt.DTypeLike) -> ZDType[TBaseDType, na_dtype = np.dtype(dtype) else: na_dtype = dtype - return data_type_registry.match_dtype(dtype=na_dtype) + return match_dtype(na_dtype) def get_data_type_from_json( @@ -184,7 +198,7 @@ def get_data_type_from_json( Given a JSON representation of a data type and a Zarr format version, attempt to create a ZDType instance from the registered ZDType classes. 
""" - return data_type_registry.match_json(dtype_spec, zarr_format=zarr_format) + return match_json(dtype_spec, zarr_format=zarr_format) def parse_data_type( diff --git a/src/zarr/core/dtype/npy/bool.py b/src/zarr/core/dtype/npy/bool.py index f92476a455..8dd3c07bdd 100644 --- a/src/zarr/core/dtype/npy/bool.py +++ b/src/zarr/core/dtype/npy/bool.py @@ -1,25 +1,23 @@ from __future__ import annotations from dataclasses import dataclass -from typing import TYPE_CHECKING, ClassVar, Literal, Self, TypeGuard, overload +from typing import TYPE_CHECKING, ClassVar, Literal, Self import numpy as np from zarr.core.dtype.common import ( - DTypeConfig_V2, - DTypeJSON, HasItemSize, - check_dtype_spec_v2, ) -from zarr.core.dtype.wrapper import TBaseDType, ZDType +from zarr.core.dtype.npy.common import NumpyNativeDTypeV2 from zarr.errors import DataTypeValidationError if TYPE_CHECKING: from zarr.core.common import JSON, ZarrFormat + from zarr.core.dtype.wrapper import TBaseDType @dataclass(frozen=True, kw_only=True, slots=True) -class Bool(ZDType[np.dtypes.BoolDType, np.bool_], HasItemSize): +class Bool(NumpyNativeDTypeV2[np.dtypes.BoolDType, np.bool_], HasItemSize): """ A Zarr data type for arrays containing booleans. @@ -45,7 +43,6 @@ class Bool(ZDType[np.dtypes.BoolDType, np.bool_], HasItemSize): """ _zarr_v3_name: ClassVar[Literal["bool"]] = "bool" - _zarr_v2_name: ClassVar[Literal["|b1"]] = "|b1" dtype_cls = np.dtypes.BoolDType @classmethod @@ -85,130 +82,6 @@ def to_native_dtype(self: Self) -> np.dtypes.BoolDType: """ return self.dtype_cls() - @classmethod - def _check_json_v2( - cls, - data: DTypeJSON, - ) -> TypeGuard[DTypeConfig_V2[Literal["|b1"], None]]: - """ - Check that the input is a valid JSON representation of a Bool. - - Parameters - ---------- - data : DTypeJSON - The JSON data to check. - - Returns - ------- - ``TypeGuard[DTypeConfig_V2[Literal["|b1"], None]]`` - True if the input is a valid JSON representation, False otherwise. 
- """ - return ( - check_dtype_spec_v2(data) - and data["name"] == cls._zarr_v2_name - and data["object_codec_id"] is None - ) - - @classmethod - def _check_json_v3(cls, data: DTypeJSON) -> TypeGuard[Literal["bool"]]: - """ - Check that the input is a valid JSON representation of this class in Zarr V3. - - Parameters - ---------- - data : DTypeJSON - The JSON data to check. - - Returns - ------- - bool - True if the input is a valid JSON representation, False otherwise. - """ - return data == cls._zarr_v3_name - - @classmethod - def _from_json_v2(cls, data: DTypeJSON) -> Self: - """ - Create an instance of Bool from Zarr V2-flavored JSON. - - Parameters - ---------- - data : DTypeJSON - The JSON data. - - Returns - ------- - Bool - An instance of Bool. - - Raises - ------ - DataTypeValidationError - If the input JSON is not a valid representation of this class. - """ - if cls._check_json_v2(data): - return cls() - msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected the string {cls._zarr_v2_name!r}" - raise DataTypeValidationError(msg) - - @classmethod - def _from_json_v3(cls: type[Self], data: DTypeJSON) -> Self: - """ - Create an instance of Bool from Zarr V3-flavored JSON. - - Parameters - ---------- - data : DTypeJSON - The JSON data. - - Returns - ------- - Bool - An instance of Bool. - - Raises - ------ - DataTypeValidationError - If the input JSON is not a valid representation of this class. - """ - if cls._check_json_v3(data): - return cls() - msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected the string {cls._zarr_v3_name!r}" - raise DataTypeValidationError(msg) - - @overload - def to_json(self, zarr_format: Literal[2]) -> DTypeConfig_V2[Literal["|b1"], None]: ... - - @overload - def to_json(self, zarr_format: Literal[3]) -> Literal["bool"]: ... - - def to_json( - self, zarr_format: ZarrFormat - ) -> DTypeConfig_V2[Literal["|b1"], None] | Literal["bool"]: - """ - Serialize this Bool instance to JSON. 
- - Parameters - ---------- - zarr_format : ZarrFormat - The Zarr format version (2 or 3). - - Returns - ------- - ``DTypeConfig_V2[Literal["|b1"], None] | Literal["bool"]`` - The JSON representation of the Bool instance. - - Raises - ------ - ValueError - If the zarr_format is not 2 or 3. - """ - if zarr_format == 2: - return {"name": self._zarr_v2_name, "object_codec_id": None} - elif zarr_format == 3: - return self._zarr_v3_name - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - def _check_scalar(self, data: object) -> bool: """ Check if the input can be cast to a boolean scalar. diff --git a/src/zarr/core/dtype/npy/bytes.py b/src/zarr/core/dtype/npy/bytes.py index af8fa7a272..fe65150e53 100644 --- a/src/zarr/core/dtype/npy/bytes.py +++ b/src/zarr/core/dtype/npy/bytes.py @@ -3,7 +3,7 @@ import base64 import re from dataclasses import dataclass -from typing import ClassVar, Literal, Self, TypedDict, TypeGuard, cast, overload +from typing import TYPE_CHECKING, ClassVar, Literal, Self, TypedDict, TypeGuard, cast import numpy as np @@ -14,13 +14,18 @@ HasItemSize, HasLength, HasObjectCodec, - check_dtype_spec_v2, v3_unstable_dtype_warning, ) -from zarr.core.dtype.npy.common import check_json_str -from zarr.core.dtype.wrapper import TBaseDType, ZDType +from zarr.core.dtype.npy.common import ( + NumpyNativeDTypeV2, + ObjectCodecDTypeV2, + check_json_str, +) from zarr.errors import DataTypeValidationError +if TYPE_CHECKING: + from zarr.core.dtype.wrapper import TBaseDType + BytesLike = np.bytes_ | str | bytes | int @@ -166,7 +171,9 @@ class VariableLengthBytesJSON_V2(DTypeConfig_V2[Literal["|O"], Literal["vlen-byt @dataclass(frozen=True, kw_only=True) -class NullTerminatedBytes(ZDType[np.dtypes.BytesDType[int], np.bytes_], HasLength, HasItemSize): +class NullTerminatedBytes( + NumpyNativeDTypeV2[np.dtypes.BytesDType[int], np.bytes_], HasLength, HasItemSize +): """ A Zarr data type for arrays containing fixed-length null-terminated 
byte sequences. @@ -247,32 +254,6 @@ def to_native_dtype(self) -> np.dtypes.BytesDType[int]: return self.dtype_cls(self.length) - @classmethod - def _check_json_v2(cls, data: DTypeJSON) -> TypeGuard[NullterminatedBytesJSON_V2]: - """ - Check that the input is a valid JSON representation of NullTerminatedBytes in Zarr V2. - - The input data must be a mapping that contains a "name" key that matches the pattern - "|S" and an "object_codec_id" key that is None. - - Parameters - ---------- - data : DTypeJSON - The JSON data to check. - - Returns - ------- - bool - True if the input data is a valid representation, False otherwise. - """ - - return ( - check_dtype_spec_v2(data) - and isinstance(data["name"], str) - and re.match(r"^\|S\d+$", data["name"]) is not None - and data["object_codec_id"] is None - ) - @classmethod def _check_json_v3(cls, data: DTypeJSON) -> TypeGuard[NullTerminatedBytesJSON_V3]: """ @@ -292,43 +273,12 @@ def _check_json_v3(cls, data: DTypeJSON) -> TypeGuard[NullTerminatedBytesJSON_V3 return ( isinstance(data, dict) and set(data.keys()) == {"name", "configuration"} - and data["name"] == cls._zarr_v3_name + and cls._check_zarr_v3_name(data["name"]) and isinstance(data["configuration"], dict) and "length_bytes" in data["configuration"] and isinstance(data["configuration"]["length_bytes"], int) ) - @classmethod - def _from_json_v2(cls, data: DTypeJSON) -> Self: - """ - Create an instance of this class from Zarr V2-flavored JSON. - - This method checks if the input data is a valid representation of - this class in Zarr V2. If so, it returns a new instance of - this class with a ``length`` as specified in the input data. - - Parameters - ---------- - data : DTypeJSON - The JSON data to parse. - - Returns - ------- - Self - An instance of this data type. - - Raises - ------ - DataTypeValidationError - If the input data is not a valid representation of this class. 
- """ - - if cls._check_json_v2(data): - name = data["name"] - return cls(length=int(name[2:])) - msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected a string like '|S1', '|S2', etc" - raise DataTypeValidationError(msg) - @classmethod def _from_json_v3(cls, data: DTypeJSON) -> Self: """ @@ -358,37 +308,12 @@ def _from_json_v3(cls, data: DTypeJSON) -> Self: msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected the string {cls._zarr_v3_name!r}" raise DataTypeValidationError(msg) - @overload - def to_json(self, zarr_format: Literal[2]) -> NullterminatedBytesJSON_V2: ... - - @overload - def to_json(self, zarr_format: Literal[3]) -> NullTerminatedBytesJSON_V3: ... - - def to_json( - self, zarr_format: ZarrFormat - ) -> DTypeConfig_V2[str, None] | NullTerminatedBytesJSON_V3: - """ - Generate a JSON representation of this data type. - - Parameters - ---------- - zarr_format : ZarrFormat - The zarr format version. - - Returns - ------- - NullterminatedBytesJSON_V2 | NullTerminatedBytesJSON_V3 - The JSON-serializable representation of the data type - """ - if zarr_format == 2: - return {"name": self.to_native_dtype().str, "object_codec_id": None} - elif zarr_format == 3: - v3_unstable_dtype_warning(self) - return { - "name": self._zarr_v3_name, - "configuration": {"length_bytes": self.length}, - } - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + def _to_json_v3(self) -> NullTerminatedBytesJSON_V3: + v3_unstable_dtype_warning(self) + return { + "name": self._zarr_v3_name, + "configuration": {"length_bytes": self.length}, + } def _check_scalar(self, data: object) -> TypeGuard[BytesLike]: """ @@ -541,8 +466,27 @@ def item_size(self) -> int: return self.length +# The Zarr V3 core spec names raw data types ``r<N>``, where N is the number of *bits* and must be +# a multiple of 8 (e.g. "r8", "r16"). 
zarr-python's own canonical V3 form is the more explicit +# ``{"name": "raw_bytes", "configuration": {"length_bytes": ...}}``, but we accept the spec form on +# input so that raw arrays written by other Zarr V3 implementations can be read. +_RAW_BYTES_SPEC_NAME = re.compile(r"^r(\d+)$") + + +def _raw_bytes_spec_length_bytes(data: object) -> int | None: + """ + If ``data`` is a Zarr V3 core-spec raw data type name (e.g. ``"r8"``), return its length in + bytes; otherwise return None. The number in the name is a bit count and must be a multiple of 8. + """ + if isinstance(data, str) and (match := _RAW_BYTES_SPEC_NAME.match(data)) is not None: + num_bits = int(match.group(1)) + if num_bits > 0 and num_bits % 8 == 0: + return num_bits // 8 + return None + + @dataclass(frozen=True, kw_only=True) -class RawBytes(ZDType[np.dtypes.VoidDType[int], np.void], HasLength, HasItemSize): +class RawBytes(NumpyNativeDTypeV2[np.dtypes.VoidDType[int], np.void], HasLength, HasItemSize): """ A Zarr data type for arrays containing fixed-length sequences of raw bytes. @@ -573,6 +517,8 @@ class does not support structured data types. # so we have to tell mypy to ignore this here dtype_cls = np.dtypes.VoidDType # type: ignore[assignment] _zarr_v3_name: ClassVar[Literal["raw_bytes"]] = "raw_bytes" + # The Zarr V3 core spec writes raw data types as ``r<N>``; accept that form on input. + _zarr_v3_name_pattern: ClassVar[re.Pattern[str] | None] = _RAW_BYTES_SPEC_NAME def __post_init__(self) -> None: """ @@ -653,28 +599,6 @@ def to_native_dtype(self) -> np.dtypes.VoidDType[int]: # by invoking np.dtypes.VoidDType directly return cast("np.dtypes.VoidDType[int]", np.dtype(f"V{self.length}")) - @classmethod - def _check_json_v2(cls, data: DTypeJSON) -> TypeGuard[RawBytesJSON_V2]: - """ - Check that the input is a valid representation of this class in Zarr V2. - - Parameters - ---------- - data : DTypeJSON - The JSON data to check. 
- - Returns - ------- - True if the input is a valid representation of this class in Zarr V3, False otherwise. - - """ - return ( - check_dtype_spec_v2(data) - and isinstance(data["name"], str) - and re.match(r"^\|V\d+$", data["name"]) is not None - and data["object_codec_id"] is None - ) - @classmethod def _check_json_v3(cls, data: DTypeJSON) -> TypeGuard[RawBytesJSON_V3]: """ @@ -694,42 +618,12 @@ def _check_json_v3(cls, data: DTypeJSON) -> TypeGuard[RawBytesJSON_V3]: return ( isinstance(data, dict) and set(data.keys()) == {"name", "configuration"} - and data["name"] == cls._zarr_v3_name + and cls._check_zarr_v3_name(data["name"]) and isinstance(data["configuration"], dict) and set(data["configuration"].keys()) == {"length_bytes"} and isinstance(data["configuration"]["length_bytes"], int) ) - @classmethod - def _from_json_v2(cls, data: DTypeJSON) -> Self: - """ - Create an instance of RawBytes from Zarr V2-flavored JSON. - - This method checks if the input data is a valid representation of - RawBytes in Zarr V2. If so, it returns a new instance of - RawBytes with a ``length`` as specified in the input data. - - Parameters - ---------- - data : DTypeJSON - The JSON data to parse. - - Returns - ------- - Self - An instance of this data type. - - Raises - ------ - DataTypeValidationError - If the input data is not a valid representation of this class. - """ - if cls._check_json_v2(data): - name = data["name"] - return cls(length=int(name[2:])) - msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected a string like '|V1', '|V2', etc" - raise DataTypeValidationError(msg) - @classmethod def _from_json_v3(cls, data: DTypeJSON) -> Self: """ @@ -756,35 +650,16 @@ def _from_json_v3(cls, data: DTypeJSON) -> Self: """ if cls._check_json_v3(data): return cls(length=data["configuration"]["length_bytes"]) + # Also accept the Zarr V3 core spec form, e.g. "r8", "r16". 
+ spec_length_bytes = _raw_bytes_spec_length_bytes(data) + if spec_length_bytes is not None: + return cls(length=spec_length_bytes) msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected the string {cls._zarr_v3_name!r}" raise DataTypeValidationError(msg) - @overload - def to_json(self, zarr_format: Literal[2]) -> RawBytesJSON_V2: ... - - @overload - def to_json(self, zarr_format: Literal[3]) -> RawBytesJSON_V3: ... - - def to_json(self, zarr_format: ZarrFormat) -> RawBytesJSON_V2 | RawBytesJSON_V3: - """ - Generate a JSON representation of this data type. - - Parameters - ---------- - zarr_format : ZarrFormat - The zarr format version. - - Returns - ------- - RawBytesJSON_V2 | RawBytesJSON_V3 - The JSON-serializable representation of the data type. - """ - if zarr_format == 2: - return {"name": self.to_native_dtype().str, "object_codec_id": None} - elif zarr_format == 3: - v3_unstable_dtype_warning(self) - return {"name": self._zarr_v3_name, "configuration": {"length_bytes": self.length}} - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + def _to_json_v3(self) -> RawBytesJSON_V3: + v3_unstable_dtype_warning(self) + return {"name": self._zarr_v3_name, "configuration": {"length_bytes": self.length}} def _check_scalar(self, data: object) -> TypeGuard[np.bytes_ | str | bytes | np.void]: """ @@ -936,7 +811,7 @@ def item_size(self) -> int: @dataclass(frozen=True, kw_only=True) -class VariableLengthBytes(ZDType[np.dtypes.ObjectDType, bytes], HasObjectCodec): +class VariableLengthBytes(ObjectCodecDTypeV2[np.dtypes.ObjectDType, bytes], HasObjectCodec): """ A Zarr data type for arrays containing variable-length sequences of bytes. 
@@ -960,6 +835,7 @@ class VariableLengthBytes(ZDType[np.dtypes.ObjectDType, bytes], HasObjectCodec): dtype_cls = np.dtypes.ObjectDType _zarr_v3_name: ClassVar[Literal["variable_length_bytes"]] = "variable_length_bytes" + _aliases: ClassVar[tuple[str, ...]] = ("bytes",) object_codec_id: ClassVar[Literal["vlen-bytes"]] = "vlen-bytes" @classmethod @@ -1002,147 +878,9 @@ def to_native_dtype(self) -> np.dtypes.ObjectDType: """ return self.dtype_cls() - @classmethod - def _check_json_v2( - cls, - data: DTypeJSON, - ) -> TypeGuard[VariableLengthBytesJSON_V2]: - """ - Check that the input is a valid JSON representation of a NumPy O dtype, and that the - object codec id is appropriate for variable-length bytes strings. - - Parameters - ---------- - data : DTypeJSON - The JSON data to check. - - Returns - ------- - True if the input is a valid representation of this class in Zarr V2, False - otherwise. - """ - # Check that the input is a valid JSON representation of a Zarr v2 data type spec. - if not check_dtype_spec_v2(data): - return False - - # Check that the object codec id is appropriate for variable-length bytes strings. - if data["name"] != "|O": - return False - return data["object_codec_id"] == cls.object_codec_id - - @classmethod - def _check_json_v3(cls, data: DTypeJSON) -> TypeGuard[Literal["variable_length_bytes"]]: - """ - Check that the input is a valid JSON representation of this class in Zarr V3. - - Parameters - ---------- - data : DTypeJSON - The JSON data to check. - - Returns - ------- - TypeGuard[Literal["variable_length_bytes"]] - True if the input is a valid representation of this class in Zarr V3, False otherwise. - """ - - return data in (cls._zarr_v3_name, "bytes") - - @classmethod - def _from_json_v2(cls, data: DTypeJSON) -> Self: - """ - Create an instance of this VariableLengthBytes from Zarr V2-flavored JSON. - - This method checks if the input data is a valid representation of this class - in Zarr V2. 
If so, it returns a new instance this class. - - Parameters - ---------- - data : DTypeJSON - The JSON data to parse. - - Returns - ------- - Self - An instance of this data type. - - Raises - ------ - DataTypeValidationError - If the input data is not a valid representation of this class class. - """ - - if cls._check_json_v2(data): - return cls() - msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected the string '|O' and an object_codec_id of {cls.object_codec_id}" - raise DataTypeValidationError(msg) - - @classmethod - def _from_json_v3(cls, data: DTypeJSON) -> Self: - """ - Create an instance of VariableLengthBytes from Zarr V3-flavored JSON. - - This method checks if the input data is a valid representation of - VariableLengthBytes in Zarr V3. If so, it returns a new instance of - VariableLengthBytes. - - Parameters - ---------- - data : DTypeJSON - The JSON data to parse. - - Returns - ------- - VariableLengthBytes - An instance of VariableLengthBytes. - - Raises - ------ - DataTypeValidationError - If the input data is not a valid representation of this class. - """ - - if cls._check_json_v3(data): - return cls() - msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected the string {cls._zarr_v3_name!r}" - raise DataTypeValidationError(msg) - - @overload - def to_json(self, zarr_format: Literal[2]) -> VariableLengthBytesJSON_V2: ... - - @overload - def to_json(self, zarr_format: Literal[3]) -> Literal["variable_length_bytes"]: ... - - def to_json( - self, zarr_format: ZarrFormat - ) -> VariableLengthBytesJSON_V2 | Literal["variable_length_bytes"]: - """ - Convert the variable-length bytes data type to a JSON-serializable form. - - Parameters - ---------- - zarr_format : ZarrFormat - The zarr format version. Accepted values are 2 and 3. 
- - Returns - ------- - ``DTypeConfig_V2[Literal["|O"], Literal["vlen-bytes"]] | Literal["variable_length_bytes"]`` - The JSON-serializable representation of the variable-length bytes data type. - For zarr_format 2, returns a dictionary with "name" and "object_codec_id". - For zarr_format 3, returns a string identifier "variable_length_bytes". - - Raises - ------ - ValueError - If zarr_format is not 2 or 3. - """ - - if zarr_format == 2: - return {"name": "|O", "object_codec_id": self.object_codec_id} - elif zarr_format == 3: - v3_unstable_dtype_warning(self) - return self._zarr_v3_name - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + def _to_json_v3(self) -> Literal["variable_length_bytes"]: + v3_unstable_dtype_warning(self) + return self._zarr_v3_name def default_scalar(self) -> bytes: """ diff --git a/src/zarr/core/dtype/npy/common.py b/src/zarr/core/dtype/npy/common.py index f413f5f678..db7d51a97a 100644 --- a/src/zarr/core/dtype/npy/common.py +++ b/src/zarr/core/dtype/npy/common.py @@ -10,6 +10,7 @@ Final, Literal, NewType, + Self, SupportsComplex, SupportsFloat, SupportsIndex, @@ -25,10 +26,72 @@ EndiannessStr, JSONFloatV2, JSONFloatV3, + check_dtype_spec_v2, ) +from zarr.core.dtype.wrapper import TBaseDType, TBaseScalar, ZDType +from zarr.errors import DataTypeValidationError if TYPE_CHECKING: from zarr.core.common import JSON, ZarrFormat + from zarr.core.dtype.common import DTypeJSON, DTypeSpec_V2 + + +class NumpyNativeDTypeV2[DType: TBaseDType, Scalar: TBaseScalar](ZDType[DType, Scalar]): + """ + Mixin for data types whose Zarr V2 representation is just the NumPy type string of the + wrapped dtype, e.g. ``"<c8"``, paired with an ``object_codec_id`` of None. + """ + + @classmethod + def _from_json_v2(cls, data: DTypeJSON) -> Self: + # These data types always have a plain NumPy type string as their V2 name (a structured + # name, which is a sequence, can never match a scalar NumPy-native type). 
+ if ( + check_dtype_spec_v2(data) + and isinstance(data["name"], str) + and data["object_codec_id"] is None + ): + try: + native_dtype = np.dtype(data["name"]) + except TypeError: + pass + else: + # from_native_dtype validates the dtype class (and byte order) and raises + # DataTypeValidationError if it does not match this data type. + return cls.from_native_dtype(native_dtype) + raise DataTypeValidationError( + f"Invalid Zarr V2 JSON representation of {cls.__name__}: {data!r}" + ) + + def _to_json_v2(self) -> DTypeSpec_V2: + return {"name": self.to_native_dtype().str, "object_codec_id": None} + + +class ObjectCodecDTypeV2[DType: TBaseDType, Scalar: TBaseScalar](ZDType[DType, Scalar]): + """ + Mixin for parameter-free data types whose Zarr V2 representation is the NumPy "object" type + string ``"|O"`` together with an ``object_codec_id`` identifying the codec used to encode + values (e.g. ``"vlen-utf8"``). Subclasses must define ``object_codec_id``. + """ + + @classmethod + def _from_json_v2(cls, data: DTypeJSON) -> Self: + if ( + check_dtype_spec_v2(data) + and data["name"] == "|O" + and data["object_codec_id"] == cls.object_codec_id + ): + return cls() + raise DataTypeValidationError( + f"Invalid Zarr V2 JSON representation of {cls.__name__}: {data!r}" + ) + + def _to_json_v2(self) -> DTypeSpec_V2: + return {"name": "|O", "object_codec_id": self.object_codec_id} + IntLike = SupportsInt | SupportsIndex | bytes | str FloatLike = SupportsIndex | SupportsFloat | bytes | str diff --git a/src/zarr/core/dtype/npy/complex.py b/src/zarr/core/dtype/npy/complex.py index 0286d42380..aec6d65d97 100644 --- a/src/zarr/core/dtype/npy/complex.py +++ b/src/zarr/core/dtype/npy/complex.py @@ -7,20 +7,17 @@ Literal, Self, TypeGuard, - overload, ) import numpy as np from zarr.core.dtype.common import ( - DTypeConfig_V2, - DTypeJSON, HasEndianness, HasItemSize, - check_dtype_spec_v2, ) from zarr.core.dtype.npy.common import ( ComplexLike, + NumpyNativeDTypeV2, 
check_json_complex_float_v2, check_json_complex_float_v3, complex_float_from_json_v2, @@ -30,25 +27,22 @@ endianness_to_numpy_str, get_endianness_from_numpy_dtype, ) -from zarr.core.dtype.wrapper import TBaseDType, ZDType from zarr.errors import DataTypeValidationError if TYPE_CHECKING: from zarr.core.common import JSON, ZarrFormat + from zarr.core.dtype.wrapper import TBaseDType @dataclass(frozen=True) class BaseComplex[ DType: np.dtypes.Complex64DType | np.dtypes.Complex128DType, Scalar: np.complex64 | np.complex128, -](ZDType[DType, Scalar], HasEndianness, HasItemSize): +](NumpyNativeDTypeV2[DType, Scalar], HasEndianness, HasItemSize): """ A base class for Zarr data types that wrap NumPy complex float data types. """ - # This attribute holds the possible zarr v2 JSON names for the data type - _zarr_v2_names: ClassVar[tuple[str, ...]] - @classmethod def from_native_dtype(cls, dtype: TBaseDType) -> Self: """ @@ -88,138 +82,6 @@ def to_native_dtype(self) -> DType: byte_order = endianness_to_numpy_str(self.endianness) return self.dtype_cls().newbyteorder(byte_order) # type: ignore[no-any-return,call-overload] - @classmethod - def _check_json_v2(cls, data: DTypeJSON) -> TypeGuard[DTypeConfig_V2[str, None]]: - """ - Check that the input is a valid JSON representation of this data type. - - The input data must be a mapping that contains a "name" key that is one of - the strings from cls._zarr_v2_names and an "object_codec_id" key that is None. - - Parameters - ---------- - data : DTypeJSON - The JSON data to check. - - Returns - ------- - bool - True if the input is a valid JSON representation, False otherwise. - """ - return ( - check_dtype_spec_v2(data) - and data["name"] in cls._zarr_v2_names - and data["object_codec_id"] is None - ) - - @classmethod - def _check_json_v3(cls, data: DTypeJSON) -> TypeGuard[str]: - """ - Check that the input is a valid JSON representation of this data type in Zarr V3. 
- - This method verifies that the provided data matches the expected Zarr V3 - representation, which is the string specified by the class-level attribute _zarr_v3_name. - - Parameters - ---------- - data : DTypeJSON - The JSON data to check. - - Returns - ------- - TypeGuard[str] - True if the input is a valid representation of this class in Zarr V3, False otherwise. - """ - - return data == cls._zarr_v3_name - - @classmethod - def _from_json_v2(cls, data: DTypeJSON) -> Self: - """ - Create an instance of this class from Zarr V2-flavored JSON. - - Parameters - ---------- - data : DTypeJSON - The JSON data. - - Returns - ------- - Self - An instance of this class. - - Raises - ------ - DataTypeValidationError - If the input JSON is not a valid representation of this class. - """ - if cls._check_json_v2(data): - # Going via numpy ensures that we get the endianness correct without - # annoying string parsing. - name = data["name"] - return cls.from_native_dtype(np.dtype(name)) - msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected one of the strings {cls._zarr_v2_names}." - raise DataTypeValidationError(msg) - - @classmethod - def _from_json_v3(cls, data: DTypeJSON) -> Self: - """ - Create an instance of this class from Zarr V3-flavored JSON. - - Parameters - ---------- - data : DTypeJSON - The JSON data. - - Returns - ------- - Self - An instance of this data type. - - Raises - ------ - DataTypeValidationError - If the input JSON is not a valid representation of this class. - """ - if cls._check_json_v3(data): - return cls() - msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected {cls._zarr_v3_name}." - raise DataTypeValidationError(msg) - - @overload - def to_json(self, zarr_format: Literal[2]) -> DTypeConfig_V2[str, None]: ... - - @overload - def to_json(self, zarr_format: Literal[3]) -> str: ... 
- - def to_json(self, zarr_format: ZarrFormat) -> DTypeConfig_V2[str, None] | str: - """ - Serialize this object to a JSON-serializable representation. - - Parameters - ---------- - zarr_format : ZarrFormat - The Zarr format version. Supported values are 2 and 3. - - Returns - ------- - DTypeConfig_V2[str, None] | str - If ``zarr_format`` is 2, a dictionary with ``"name"`` and ``"object_codec_id"`` keys is - returned. - If ``zarr_format`` is 3, a string representation of the complex data type is returned. - - Raises - ------ - ValueError - If `zarr_format` is not 2 or 3. - """ - - if zarr_format == 2: - return {"name": self.to_native_dtype().str, "object_codec_id": None} - elif zarr_format == 3: - return self._zarr_v3_name - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - def _check_scalar(self, data: object) -> TypeGuard[ComplexLike]: """ Check that the input is a scalar complex value. @@ -363,13 +225,10 @@ class Complex64(BaseComplex[np.dtypes.Complex64DType, np.complex64]): The numpy dtype class for this data type. _zarr_v3_name : ClassVar[Literal["complex64"]] The name of this data type in Zarr V3. - _zarr_v2_names : ClassVar[tuple[Literal[">c8"], Literal["c8"], Literal["c8", " int: @@ -398,13 +257,10 @@ class Complex128(BaseComplex[np.dtypes.Complex128DType, np.complex128], HasEndia The numpy dtype class for this data type. _zarr_v3_name : ClassVar[Literal["complex128"]] The name of this data type in Zarr V3. 
- _zarr_v2_names : ClassVar[tuple[Literal[">c16"], Literal["c16"], Literal["c16", " int: diff --git a/src/zarr/core/dtype/npy/float.py b/src/zarr/core/dtype/npy/float.py index d041416b81..89dc912505 100644 --- a/src/zarr/core/dtype/npy/float.py +++ b/src/zarr/core/dtype/npy/float.py @@ -1,19 +1,17 @@ from __future__ import annotations from dataclasses import dataclass -from typing import TYPE_CHECKING, ClassVar, Literal, Self, TypeGuard, overload +from typing import TYPE_CHECKING, Self, TypeGuard import numpy as np from zarr.core.dtype.common import ( - DTypeConfig_V2, - DTypeJSON, HasEndianness, HasItemSize, - check_dtype_spec_v2, ) from zarr.core.dtype.npy.common import ( FloatLike, + NumpyNativeDTypeV2, check_json_float_v2, check_json_float_v3, check_json_floatish_str, @@ -24,25 +22,22 @@ float_to_json_v3, get_endianness_from_numpy_dtype, ) -from zarr.core.dtype.wrapper import TBaseDType, ZDType from zarr.errors import DataTypeValidationError if TYPE_CHECKING: from zarr.core.common import JSON, ZarrFormat + from zarr.core.dtype.wrapper import TBaseDType @dataclass(frozen=True) class BaseFloat[ DType: np.dtypes.Float16DType | np.dtypes.Float32DType | np.dtypes.Float64DType, Scalar: np.float16 | np.float32 | np.float64, -](ZDType[DType, Scalar], HasEndianness, HasItemSize): +](NumpyNativeDTypeV2[DType, Scalar], HasEndianness, HasItemSize): """ A base class for Zarr data types that wrap NumPy float data types. 
""" - # This attribute holds the possible zarr v2 JSON names for the data type - _zarr_v2_names: ClassVar[tuple[str, ...]] - @classmethod def from_native_dtype(cls, dtype: TBaseDType) -> Self: """ @@ -76,118 +71,6 @@ def to_native_dtype(self) -> DType: byte_order = endianness_to_numpy_str(self.endianness) return self.dtype_cls().newbyteorder(byte_order) # type: ignore[no-any-return,call-overload] - @classmethod - def _check_json_v2(cls, data: DTypeJSON) -> TypeGuard[DTypeConfig_V2[str, None]]: - """ - Check that the input is a valid JSON representation of this data type. - - Parameters - ---------- - data : DTypeJSON - The JSON data. - - Returns - ------- - TypeGuard[DTypeConfig_V2[str, None]] - True if the input is a valid JSON representation of this data type, False otherwise. - """ - return ( - check_dtype_spec_v2(data) - and data["name"] in cls._zarr_v2_names - and data["object_codec_id"] is None - ) - - @classmethod - def _check_json_v3(cls, data: DTypeJSON) -> TypeGuard[str]: - """ - Check that the input is a valid JSON representation of this class in Zarr V3. - - Parameters - ---------- - data : DTypeJSON - The JSON data. - - Returns - ------- - TypeGuard[str] - True if the input is a valid JSON representation of this class, False otherwise. - """ - return data == cls._zarr_v3_name - - @classmethod - def _from_json_v2(cls, data: DTypeJSON) -> Self: - """ - Create an instance of this ZDType from Zarr v2-flavored JSON. - - Parameters - ---------- - data : DTypeJSON - The JSON data. - - Returns - ------- - Self - An instance of this data type. - """ - if cls._check_json_v2(data): - # Going via NumPy ensures that we get the endianness correct without - # annoying string parsing. - name = data["name"] - return cls.from_native_dtype(np.dtype(name)) - msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected one of the strings {cls._zarr_v2_names}." 
- raise DataTypeValidationError(msg) - - @classmethod - def _from_json_v3(cls, data: DTypeJSON) -> Self: - """ - Create an instance of this ZDType from Zarr v3-flavored JSON. - - Parameters - ---------- - data : DTypeJSON - The JSON data. - - Returns - ------- - Self - An instance of this data type. - """ - if cls._check_json_v3(data): - return cls() - msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected {cls._zarr_v3_name}." - raise DataTypeValidationError(msg) - - @overload - def to_json(self, zarr_format: Literal[2]) -> DTypeConfig_V2[str, None]: ... - - @overload - def to_json(self, zarr_format: Literal[3]) -> str: ... - - def to_json(self, zarr_format: ZarrFormat) -> DTypeConfig_V2[str, None] | str: - """ - Convert the wrapped data type to a JSON-serializable form. - - Parameters - ---------- - zarr_format : ZarrFormat - The zarr format version. - - Returns - ------- - DTypeConfig_V2[str, None] or str - The JSON-serializable representation of the wrapped data type. - - Raises - ------ - ValueError - If zarr_format is not 2 or 3. - """ - if zarr_format == 2: - return {"name": self.to_native_dtype().str, "object_codec_id": None} - elif zarr_format == 3: - return self._zarr_v3_name - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - def _check_scalar(self, data: object) -> TypeGuard[FloatLike]: """ Check that the input is a valid scalar value. 
@@ -347,7 +230,6 @@ class Float16(BaseFloat[np.dtypes.Float16DType, np.float16]): dtype_cls = np.dtypes.Float16DType _zarr_v3_name = "float16" - _zarr_v2_names: ClassVar[tuple[Literal[">f2"], Literal["f2", " int: @@ -384,7 +266,6 @@ class Float32(BaseFloat[np.dtypes.Float32DType, np.float32]): dtype_cls = np.dtypes.Float32DType _zarr_v3_name = "float32" - _zarr_v2_names: ClassVar[tuple[Literal[">f4"], Literal["f4", " int: @@ -421,7 +302,6 @@ class Float64(BaseFloat[np.dtypes.Float64DType, np.float64]): dtype_cls = np.dtypes.Float64DType _zarr_v3_name = "float64" - _zarr_v2_names: ClassVar[tuple[Literal[">f8"], Literal["f8", " int: diff --git a/src/zarr/core/dtype/npy/int.py b/src/zarr/core/dtype/npy/int.py index c18fd01dd8..c76d0369d4 100644 --- a/src/zarr/core/dtype/npy/int.py +++ b/src/zarr/core/dtype/npy/int.py @@ -9,30 +9,27 @@ SupportsIndex, SupportsInt, TypeGuard, - overload, ) import numpy as np from zarr.core.dtype.common import ( - DTypeConfig_V2, - DTypeJSON, HasEndianness, HasItemSize, - check_dtype_spec_v2, ) from zarr.core.dtype.npy.common import ( + NumpyNativeDTypeV2, check_json_int, check_json_intish_float, check_json_intish_str, endianness_to_numpy_str, get_endianness_from_numpy_dtype, ) -from zarr.core.dtype.wrapper import TBaseDType, ZDType from zarr.errors import DataTypeValidationError if TYPE_CHECKING: from zarr.core.common import JSON, ZarrFormat + from zarr.core.dtype.wrapper import TBaseDType _NumpyIntDType = ( np.dtypes.Int8DType @@ -55,7 +52,7 @@ class BaseInt[ DType: _NumpyIntDType, Scalar: np.int8 | np.int16 | np.int32 | np.int64 | np.uint8 | np.uint16 | np.uint32 | np.uint64, -](ZDType[DType, Scalar], HasItemSize): +](NumpyNativeDTypeV2[DType, Scalar], HasItemSize): """ A base class for integer data types in Zarr. @@ -63,53 +60,6 @@ class BaseInt[ in both Zarr v2 and v3 formats, as well as methods for checking and casting scalars. 
""" - _zarr_v2_names: ClassVar[tuple[str, ...]] - - @classmethod - def _check_json_v2(cls, data: object) -> TypeGuard[DTypeConfig_V2[str, None]]: - """ - Check that the input is a valid JSON representation of this integer data type in Zarr V2. - - This method verifies that the provided data matches the expected Zarr V2 representation - for this data type. The input data must be a mapping that contains a "name" key that is - one of the strings from cls._zarr_v2_names and an "object_codec_id" key that is None. - - Parameters - ---------- - data : object - The JSON data to check. - - Returns - ------- - TypeGuard[DTypeConfig_V2[str, None]] - True if the input is a valid representation of this class in Zarr V2, - False otherwise. - """ - - return ( - check_dtype_spec_v2(data) - and data["name"] in cls._zarr_v2_names - and data["object_codec_id"] is None - ) - - @classmethod - def _check_json_v3(cls, data: object) -> TypeGuard[str]: - """ - Check that the input is a valid JSON representation of this class in Zarr V3. - - Parameters - ---------- - data : object - The JSON data to check. - - Returns - ------- - TypeGuard[str] - True if the input is a valid representation of this class in Zarr v3, - False otherwise. - """ - return data == cls._zarr_v3_name - def _check_scalar(self, data: object) -> TypeGuard[IntLike]: """ Check if the input object is of an IntLike type. @@ -259,7 +209,6 @@ class Int8(BaseInt[np.dtypes.Int8DType, np.int8]): dtype_cls = np.dtypes.Int8DType _zarr_v3_name: ClassVar[Literal["int8"]] = "int8" - _zarr_v2_names: ClassVar[tuple[Literal["|i1"]]] = ("|i1",) @classmethod def from_native_dtype(cls, dtype: TBaseDType) -> Self: @@ -298,89 +247,6 @@ def to_native_dtype(self: Self) -> np.dtypes.Int8DType: """ return self.dtype_cls() - @classmethod - def _from_json_v2(cls, data: DTypeJSON) -> Self: - """ - Create an Int8 from Zarr V2-flavored JSON. - - Parameters - ---------- - data : DTypeJSON - The JSON data. 
- - Returns - ------- - Self - An instance of this data type. - - Raises - ------ - DataTypeValidationError - If the input JSON is not a valid representation of this class Int8. - """ - if cls._check_json_v2(data): - return cls() - msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected the string {cls._zarr_v2_names[0]!r}" - raise DataTypeValidationError(msg) - - @classmethod - def _from_json_v3(cls, data: DTypeJSON) -> Self: - """ - Create an Int8 from Zarr V3-flavored JSON. - - Parameters - ---------- - data : DTypeJSON - The JSON data. - - Returns - ------- - Self - An instance of this data type. - - Raises - ------ - DataTypeValidationError - If the input JSON is not a valid representation of this class Int8. - """ - if cls._check_json_v3(data): - return cls() - msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected the string {cls._zarr_v3_name!r}" - raise DataTypeValidationError(msg) - - @overload - def to_json(self, zarr_format: Literal[2]) -> DTypeConfig_V2[Literal["|i1"], None]: ... - - @overload - def to_json(self, zarr_format: Literal[3]) -> Literal["int8"]: ... - - def to_json( - self, zarr_format: ZarrFormat - ) -> DTypeConfig_V2[Literal["|i1"], None] | Literal["int8"]: - """ - Convert the data type to a JSON-serializable form. - - Parameters - ---------- - zarr_format : ZarrFormat - The Zarr format version. - - Returns - ------- - ``DTypeConfig_V2[Literal["|i1"], None] | Literal["int8"]`` - The JSON-serializable representation of the data type. - - Raises - ------ - ValueError - If the zarr_format is not 2 or 3. 
- """ - if zarr_format == 2: - return {"name": self._zarr_v2_names[0], "object_codec_id": None} - elif zarr_format == 3: - return self._zarr_v3_name - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - @property def item_size(self) -> int: """ @@ -415,7 +281,6 @@ class UInt8(BaseInt[np.dtypes.UInt8DType, np.uint8]): dtype_cls = np.dtypes.UInt8DType _zarr_v3_name: ClassVar[Literal["uint8"]] = "uint8" - _zarr_v2_names: ClassVar[tuple[Literal["|u1"]]] = ("|u1",) @classmethod def from_native_dtype(cls, dtype: TBaseDType) -> Self: @@ -440,93 +305,6 @@ def to_native_dtype(self: Self) -> np.dtypes.UInt8DType: return self.dtype_cls() - @classmethod - def _from_json_v2(cls, data: DTypeJSON) -> Self: - """ - Create an instance of this data type from Zarr V2-flavored JSON. - - Parameters - ---------- - data : DTypeJSON - The JSON data. - - Returns - ------- - Self - An instance of this data type. - - Raises - ------ - DataTypeValidationError - If the input JSON is not a valid representation of this class. - """ - - if cls._check_json_v2(data): - return cls() - msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected the string {cls._zarr_v2_names[0]!r}" - raise DataTypeValidationError(msg) - - @classmethod - def _from_json_v3(cls, data: DTypeJSON) -> Self: - """ - Create an instance of this data type from Zarr V3-flavored JSON. - - Parameters - ---------- - data : DTypeJSON - The JSON data. - - Returns - ------- - Self - An instance of this data type. - - Raises - ------ - DataTypeValidationError - If the input JSON is not a valid representation of this class. - """ - if cls._check_json_v3(data): - return cls() - msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected the string {cls._zarr_v3_name!r}" - raise DataTypeValidationError(msg) - - @overload - def to_json(self, zarr_format: Literal[2]) -> DTypeConfig_V2[Literal["|u1"], None]: ... 
- - @overload - def to_json(self, zarr_format: Literal[3]) -> Literal["uint8"]: ... - - def to_json( - self, zarr_format: ZarrFormat - ) -> DTypeConfig_V2[Literal["|u1"], None] | Literal["uint8"]: - """ - Convert the data type to a JSON-serializable form. - - Parameters - ---------- - zarr_format : ZarrFormat - The Zarr format version. Supported values are 2 and 3. - - Returns - ------- - ``DTypeConfig_V2[Literal["|u1"], None] | Literal["uint8"]`` - The JSON-serializable representation of the data type. - - Raises - ------ - ValueError - If `zarr_format` is not 2 or 3. - """ - if zarr_format == 2: - # For Zarr format version 2, return a dictionary with the name and object codec ID. - return {"name": self._zarr_v2_names[0], "object_codec_id": None} - elif zarr_format == 3: - # For Zarr format version 3, return the v3 name as a string. - return self._zarr_v3_name - # Raise an error if the zarr_format is neither 2 nor 3. - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - @property def item_size(self) -> int: """ @@ -562,7 +340,6 @@ class Int16(BaseInt[np.dtypes.Int16DType, np.int16], HasEndianness): dtype_cls = np.dtypes.Int16DType _zarr_v3_name: ClassVar[Literal["int16"]] = "int16" - _zarr_v2_names: ClassVar[tuple[Literal[">i2"], Literal["i2", " Self: @@ -603,93 +380,6 @@ def to_native_dtype(self) -> np.dtypes.Int16DType: # numpy 2.x stub: newbyteorder widens to base dtype, runtime preserves the concrete subclass return self.dtype_cls().newbyteorder(byte_order) # type: ignore[return-value] - @classmethod - def _from_json_v2(cls, data: DTypeJSON) -> Self: - """ - Create an instance of this data type from Zarr V2-flavored JSON. - - Parameters - ---------- - data : DTypeJSON - The JSON data. - - Returns - ------- - Self - An instance of this data type. - - Raises - ------ - DataTypeValidationError - If the input JSON is not a valid representation of this class. 
- """ - if cls._check_json_v2(data): - # Going via NumPy ensures that we get the endianness correct without - # annoying string parsing. - name = data["name"] - return cls.from_native_dtype(np.dtype(name)) - msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected one of the strings {cls._zarr_v2_names!r}." - raise DataTypeValidationError(msg) - - @classmethod - def _from_json_v3(cls, data: DTypeJSON) -> Self: - """ - Create an instance of this data type from Zarr V3-flavored JSON. - - Parameters - ---------- - data : DTypeJSON - The JSON data. - - Returns - ------- - Self - An instance of this data type. - - Raises - ------ - DataTypeValidationError - If the input JSON is not a valid representation of this class. - """ - if cls._check_json_v3(data): - return cls() - msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected the string {cls._zarr_v3_name!r}" - raise DataTypeValidationError(msg) - - @overload - def to_json(self, zarr_format: Literal[2]) -> DTypeConfig_V2[Literal[">i2", " Literal["int16"]: ... - - def to_json( - self, zarr_format: ZarrFormat - ) -> DTypeConfig_V2[Literal[">i2", "i2", " int: """ @@ -725,7 +415,6 @@ class UInt16(BaseInt[np.dtypes.UInt16DType, np.uint16], HasEndianness): dtype_cls = np.dtypes.UInt16DType _zarr_v3_name: ClassVar[Literal["uint16"]] = "uint16" - _zarr_v2_names: ClassVar[tuple[Literal[">u2"], Literal["u2", " Self: @@ -766,93 +455,6 @@ def to_native_dtype(self) -> np.dtypes.UInt16DType: # numpy 2.x stub: newbyteorder widens to base dtype, runtime preserves the concrete subclass return self.dtype_cls().newbyteorder(byte_order) # type: ignore[return-value] - @classmethod - def _from_json_v2(cls, data: DTypeJSON) -> Self: - """ - Create an instance of this data type from Zarr V2-flavored JSON. - - Parameters - ---------- - data : DTypeJSON - The JSON data. - - Returns - ------- - Self - An instance of this data type. 
- - Raises - ------ - DataTypeValidationError - If the input JSON is not a valid representation of this class. - """ - if cls._check_json_v2(data): - # Going via NumPy ensures that we get the endianness correct without - # annoying string parsing. - name = data["name"] - return cls.from_native_dtype(np.dtype(name)) - msg = f"Invalid JSON representation of UInt16. Got {data!r}, expected one of the strings {cls._zarr_v2_names}." - raise DataTypeValidationError(msg) - - @classmethod - def _from_json_v3(cls, data: DTypeJSON) -> Self: - """ - Create an instance of this data type from Zarr V3-flavored JSON. - - Parameters - ---------- - data : DTypeJSON - The JSON data. - - Returns - ------- - Self - An instance of this data type. - - Raises - ------ - DataTypeValidationError - If the input JSON is not a valid representation of this class. - """ - if cls._check_json_v3(data): - return cls() - msg = f"Invalid JSON representation of UInt16. Got {data!r}, expected the string {cls._zarr_v3_name!r}" - raise DataTypeValidationError(msg) - - @overload - def to_json(self, zarr_format: Literal[2]) -> DTypeConfig_V2[Literal[">u2", " Literal["uint16"]: ... - - def to_json( - self, zarr_format: ZarrFormat - ) -> DTypeConfig_V2[Literal[">u2", "u2", " int: """ @@ -888,7 +490,6 @@ class Int32(BaseInt[np.dtypes.Int32DType, np.int32], HasEndianness): dtype_cls = np.dtypes.Int32DType _zarr_v3_name: ClassVar[Literal["int32"]] = "int32" - _zarr_v2_names: ClassVar[tuple[Literal[">i4"], Literal["i4", " TypeGuard[np.dtypes.Int32DType]: @@ -950,93 +551,6 @@ def to_native_dtype(self: Self) -> np.dtypes.Int32DType: # numpy 2.x stub: newbyteorder widens to base dtype, runtime preserves the concrete subclass return self.dtype_cls().newbyteorder(byte_order) # type: ignore[return-value] - @classmethod - def _from_json_v2(cls, data: DTypeJSON) -> Self: - """ - Create an Int32 from Zarr V2-flavored JSON. - - Parameters - ---------- - data : DTypeJSON - The JSON data. 
- - Returns - ------- - Self - An instance of this data type. - - Raises - ------ - DataTypeValidationError - If the input JSON is not a valid representation of this class Int32. - """ - if cls._check_json_v2(data): - # Going via NumPy ensures that we get the endianness correct without - # annoying string parsing. - name = data["name"] - return cls.from_native_dtype(np.dtype(name)) - msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected one of the strings {cls._zarr_v2_names!r}." - raise DataTypeValidationError(msg) - - @classmethod - def _from_json_v3(cls, data: DTypeJSON) -> Self: - """ - Create an Int32 from Zarr V3-flavored JSON. - - Parameters - ---------- - data : DTypeJSON - The JSON data. - - Returns - ------- - Self - An instance of this data type. - - Raises - ------ - DataTypeValidationError - If the input JSON is not a valid representation of this class Int32. - """ - if cls._check_json_v3(data): - return cls() - msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected the string {cls._zarr_v3_name!r}" - raise DataTypeValidationError(msg) - - @overload - def to_json(self, zarr_format: Literal[2]) -> DTypeConfig_V2[Literal[">i4", " Literal["int32"]: ... - - def to_json( - self, zarr_format: ZarrFormat - ) -> DTypeConfig_V2[Literal[">i4", "i4", " int: """ @@ -1072,7 +586,6 @@ class UInt32(BaseInt[np.dtypes.UInt32DType, np.uint32], HasEndianness): dtype_cls = np.dtypes.UInt32DType _zarr_v3_name: ClassVar[Literal["uint32"]] = "uint32" - _zarr_v2_names: ClassVar[tuple[Literal[">u4"], Literal["u4", " TypeGuard[np.dtypes.UInt32DType]: @@ -1136,88 +649,6 @@ def to_native_dtype(self) -> np.dtypes.UInt32DType: # numpy 2.x stub: newbyteorder widens to base dtype, runtime preserves the concrete subclass return self.dtype_cls().newbyteorder(byte_order) # type: ignore[return-value] - @classmethod - def _from_json_v2(cls, data: DTypeJSON) -> Self: - """ - Create an instance of this data type from Zarr V2-flavored JSON. 
- - Parameters - ---------- - data : DTypeJSON - The JSON data. - - Returns - ------- - Self - An instance of this data type. - - Raises - ------ - DataTypeValidationError - If the input JSON is not a valid representation of this class 32-bit unsigned - integer. - """ - if cls._check_json_v2(data): - # Going via NumPy ensures that we get the endianness correct without - # annoying string parsing. - name = data["name"] - return cls.from_native_dtype(np.dtype(name)) - msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected one of the strings {cls._zarr_v2_names}." - raise DataTypeValidationError(msg) - - @classmethod - def _from_json_v3(cls, data: DTypeJSON) -> Self: - """ - Create an instance of this data type from Zarr V3-flavored JSON. - - Parameters - ---------- - data : DTypeJSON - The JSON data. - - Returns - ------- - Self - An instance of this data type. - - Raises - ------ - DataTypeValidationError - If the input JSON is not a valid representation of this class 32-bit unsigned - integer. - """ - if cls._check_json_v3(data): - return cls() - msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected the string {cls._zarr_v3_name!r}" - raise DataTypeValidationError(msg) - - @overload - def to_json(self, zarr_format: Literal[2]) -> DTypeConfig_V2[Literal[">u4", " Literal["uint32"]: ... 
- def to_json( - self, zarr_format: ZarrFormat - ) -> DTypeConfig_V2[Literal[">u4", "u4", " int: """ @@ -1253,7 +684,6 @@ class Int64(BaseInt[np.dtypes.Int64DType, np.int64], HasEndianness): dtype_cls = np.dtypes.Int64DType _zarr_v3_name: ClassVar[Literal["int64"]] = "int64" - _zarr_v2_names: ClassVar[tuple[Literal[">i8"], Literal["i8", " Self: @@ -1295,88 +725,6 @@ def to_native_dtype(self) -> np.dtypes.Int64DType: # numpy 2.x stub: newbyteorder widens to base dtype, runtime preserves the concrete subclass return self.dtype_cls().newbyteorder(byte_order) # type: ignore[return-value] - @classmethod - def _from_json_v2(cls, data: DTypeJSON) -> Self: - """ - Create an instance of this data type from Zarr V2-flavored JSON. - - Parameters - ---------- - data : DTypeJSON - The JSON data. - - Returns - ------- - Self - An instance of this data type. - - Raises - ------ - DataTypeValidationError - If the input JSON is not a valid representation of this class 64-bit signed - integer. - """ - if cls._check_json_v2(data): - # Going via NumPy ensures that we get the endianness correct without - # annoying string parsing. - name = data["name"] - return cls.from_native_dtype(np.dtype(name)) - msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected one of the strings {cls._zarr_v2_names}." - raise DataTypeValidationError(msg) - - @classmethod - def _from_json_v3(cls, data: DTypeJSON) -> Self: - """ - Create an instance of this data type from Zarr V3-flavored JSON. - - Parameters - ---------- - data : DTypeJSON - The JSON data. - - Returns - ------- - Self - An instance of this data type. - - Raises - ------ - DataTypeValidationError - If the input JSON is not a valid representation of this class 64-bit signed - integer. - """ - if cls._check_json_v3(data): - return cls() - msg = f"Invalid JSON representation of {cls.__name__}. 
Got {data!r}, expected the string {cls._zarr_v3_name!r}" - raise DataTypeValidationError(msg) - - @overload - def to_json(self, zarr_format: Literal[2]) -> DTypeConfig_V2[Literal[">i8", " Literal["int64"]: ... - def to_json( - self, zarr_format: ZarrFormat - ) -> DTypeConfig_V2[Literal[">i8", "i8", " int: """ @@ -1412,7 +760,6 @@ class UInt64(BaseInt[np.dtypes.UInt64DType, np.uint64], HasEndianness): dtype_cls = np.dtypes.UInt64DType _zarr_v3_name: ClassVar[Literal["uint64"]] = "uint64" - _zarr_v2_names: ClassVar[tuple[Literal[">u8"], Literal["u8", " np.dtypes.UInt64DType: """ @@ -1427,89 +774,6 @@ def to_native_dtype(self) -> np.dtypes.UInt64DType: # numpy 2.x stub: newbyteorder widens to base dtype, runtime preserves the concrete subclass return self.dtype_cls().newbyteorder(byte_order) # type: ignore[return-value] - @classmethod - def _from_json_v2(cls, data: DTypeJSON) -> Self: - """ - Create an instance of this data type from Zarr V2-flavored JSON. - - Parameters - ---------- - data : DTypeJSON - The JSON data. - - Returns - ------- - Self - An instance of this data type. - - Raises - ------ - DataTypeValidationError - If the input JSON is not a valid representation of this class unsigned 64-bit - integer. - """ - if cls._check_json_v2(data): - # Going via NumPy ensures that we get the endianness correct without - # annoying string parsing. - name = data["name"] - return cls.from_native_dtype(np.dtype(name)) - msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected one of the strings {cls._zarr_v2_names}." - raise DataTypeValidationError(msg) - - @classmethod - def _from_json_v3(cls, data: DTypeJSON) -> Self: - """ - Create an instance of this data type from Zarr V3-flavored JSON. - - Parameters - ---------- - data : DTypeJSON - The JSON data. - - Returns - ------- - Self - An instance of this data type. - - Raises - ------ - DataTypeValidationError - If the input JSON is not a valid representation of this class unsigned 64-bit - integer. 
- """ - if cls._check_json_v3(data): - return cls() - msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected the string {cls._zarr_v3_name!r}" - raise DataTypeValidationError(msg) - - @overload - def to_json(self, zarr_format: Literal[2]) -> DTypeConfig_V2[Literal[">u8", " Literal["uint64"]: ... - - def to_json( - self, zarr_format: ZarrFormat - ) -> DTypeConfig_V2[Literal[">u8", "u8", " Self: """ diff --git a/src/zarr/core/dtype/npy/string.py b/src/zarr/core/dtype/npy/string.py index 3f84e8123f..4fc90275d2 100644 --- a/src/zarr/core/dtype/npy/string.py +++ b/src/zarr/core/dtype/npy/string.py @@ -1,6 +1,5 @@ from __future__ import annotations -import re from dataclasses import dataclass from typing import ( TYPE_CHECKING, @@ -10,7 +9,6 @@ Self, TypedDict, TypeGuard, - overload, runtime_checkable, ) @@ -24,14 +22,14 @@ HasItemSize, HasLength, HasObjectCodec, - check_dtype_spec_v2, ) from zarr.core.dtype.npy.common import ( + NumpyNativeDTypeV2, + ObjectCodecDTypeV2, check_json_str, endianness_to_numpy_str, get_endianness_from_numpy_dtype, ) -from zarr.core.dtype.wrapper import ZDType from zarr.errors import DataTypeValidationError if TYPE_CHECKING: @@ -104,7 +102,7 @@ class FixedLengthUTF32JSON_V3(NamedConfig[Literal["fixed_length_utf32"], LengthB @dataclass(frozen=True, kw_only=True) class FixedLengthUTF32( - ZDType[np.dtypes.StrDType[int], np.str_], HasEndianness, HasLength, HasItemSize + NumpyNativeDTypeV2[np.dtypes.StrDType[int], np.str_], HasEndianness, HasLength, HasItemSize ): """ A Zarr data type for arrays containing fixed-length UTF-32 strings. 
@@ -175,28 +173,6 @@ def to_native_dtype(self) -> np.dtypes.StrDType[int]: # numpy 2.x stub: newbyteorder widens to base dtype, runtime preserves the concrete subclass return self.dtype_cls(self.length).newbyteorder(byte_order) # type: ignore[return-value] - @classmethod - def _check_json_v2(cls, data: DTypeJSON) -> TypeGuard[FixedLengthUTF32JSON_V2]: - """ - Check that the input is a valid JSON representation of a NumPy U dtype. - - Parameters - ---------- - data : DTypeJSON - The JSON data. - - Returns - ------- - TypeGuard[FixedLengthUTF32JSON_V2] - Whether the input is a valid JSON representation of a NumPy U dtype. - """ - return ( - check_dtype_spec_v2(data) - and isinstance(data["name"], str) - and re.match(r"^[><]U\d+$", data["name"]) is not None - and data["object_codec_id"] is None - ) - @classmethod def _check_json_v3(cls, data: DTypeJSON) -> TypeGuard[FixedLengthUTF32JSON_V3]: """ @@ -215,66 +191,18 @@ def _check_json_v3(cls, data: DTypeJSON) -> TypeGuard[FixedLengthUTF32JSON_V3]: return ( isinstance(data, dict) and set(data.keys()) == {"name", "configuration"} - and data["name"] == cls._zarr_v3_name + and cls._check_zarr_v3_name(data["name"]) and "configuration" in data and isinstance(data["configuration"], dict) and set(data["configuration"].keys()) == {"length_bytes"} and isinstance(data["configuration"]["length_bytes"], int) ) - @overload - def to_json(self, zarr_format: Literal[2]) -> DTypeConfig_V2[str, None]: ... - - @overload - def to_json(self, zarr_format: Literal[3]) -> FixedLengthUTF32JSON_V3: ... - - def to_json( - self, zarr_format: ZarrFormat - ) -> DTypeConfig_V2[str, None] | FixedLengthUTF32JSON_V3: - """ - Convert the FixedLengthUTF32 instance to a JSON representation. - - Parameters - ---------- - zarr_format : ZarrFormat - The Zarr format to use. - - Returns - ------- - DTypeConfig_V2[str, None] | FixedLengthUTF32JSON_V3 - The JSON representation of the data type. 
- """ - if zarr_format == 2: - return {"name": self.to_native_dtype().str, "object_codec_id": None} - elif zarr_format == 3: - return { - "name": self._zarr_v3_name, - "configuration": {"length_bytes": self.length * self.code_point_bytes}, - } - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - - @classmethod - def _from_json_v2(cls, data: DTypeJSON) -> Self: - """ - Create a FixedLengthUTF32 from a JSON representation of a NumPy U dtype. - - Parameters - ---------- - data : DTypeJSON - The JSON data. - - Returns - ------- - Self - An instance of this data type. - """ - if cls._check_json_v2(data): - # Construct the NumPy dtype instead of string parsing. - name = data["name"] - return cls.from_native_dtype(np.dtype(name)) - raise DataTypeValidationError( - f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected a string representation of a NumPy U dtype." - ) + def _to_json_v3(self) -> FixedLengthUTF32JSON_V3: + return { + "name": self._zarr_v3_name, + "configuration": {"length_bytes": self.length * self.code_point_bytes}, + } @classmethod def _from_json_v3(cls, data: DTypeJSON) -> Self: @@ -450,7 +378,7 @@ class VariableLengthUTF8JSON_V2(DTypeConfig_V2[Literal["|O"], Literal["vlen-utf8 @dataclass(frozen=True, kw_only=True) -class VariableLengthUTF8(ZDType[np.dtypes.StringDType, str], HasObjectCodec): # type: ignore[type-var] +class VariableLengthUTF8(ObjectCodecDTypeV2[np.dtypes.StringDType, str], HasObjectCodec): # type: ignore[type-var] """ A Zarr data type for arrays containing variable-length UTF-8 strings. 
@@ -475,6 +403,7 @@ class VariableLengthUTF8(ZDType[np.dtypes.StringDType, str], HasObjectCodec): # dtype_cls = np.dtypes.StringDType # type: ignore[assignment] _zarr_v3_name: ClassVar[Literal["string"]] = "string" + _aliases: ClassVar[tuple[str, ...]] = ("str",) object_codec_id: ClassVar[Literal["vlen-utf8"]] = "vlen-utf8" @classmethod @@ -525,118 +454,6 @@ def to_native_dtype(self) -> np.dtypes.StringDType: """ return self.dtype_cls() - @classmethod - def _check_json_v2( - cls, - data: DTypeJSON, - ) -> TypeGuard[VariableLengthUTF8JSON_V2]: - """ - "Check if the input is a valid JSON representation of a variable-length UTF-8 string dtype - for Zarr v2." - - Parameters - ---------- - data : DTypeJSON - The JSON data to check. - - Returns - ------- - ``TypeGuard[VariableLengthUTF8JSON_V2]`` - Whether the input is a valid JSON representation of a NumPy "object" data type, and that the - object codec id is appropriate for variable-length UTF-8 strings. - """ - return ( - check_dtype_spec_v2(data) - and data["name"] == "|O" - and data["object_codec_id"] == cls.object_codec_id - ) - - @classmethod - def _check_json_v3(cls, data: DTypeJSON) -> TypeGuard[Literal["variable_length_utf8"]]: - """ - Check that the input is a valid JSON representation of this class in Zarr V3. - - Parameters - ---------- - data : DTypeJSON - The JSON data to check. - - Returns - ------- - TypeGuard[Literal["variable_length_utf8"]] - Whether the input is a valid JSON representation of a variable length UTF-8 string - data type. - """ - return data == cls._zarr_v3_name - - @classmethod - def _from_json_v2(cls, data: DTypeJSON) -> Self: - """ - Create an instance of this class from a JSON representation of a NumPy "object" dtype. - - Parameters - ---------- - data : DTypeJSON - The JSON data to create an instance from. - - Returns - ------- - Self - An instance of this data type. - """ - if cls._check_json_v2(data): - return cls() - msg = ( - f"Invalid JSON representation of {cls.__name__}. 
Got {data!r}, expected the string '|O'" - ) - raise DataTypeValidationError(msg) - - @classmethod - def _from_json_v3(cls, data: DTypeJSON) -> Self: - """ - Create an instance of this class from a JSON representation of a variable length UTF-8 - string data type. - - Parameters - ---------- - data : DTypeJSON - The JSON data to create an instance from. - - Returns - ------- - Self - An instance of this data type. - """ - if cls._check_json_v3(data): - return cls() - msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected {cls._zarr_v3_name}." - raise DataTypeValidationError(msg) - - @overload - def to_json(self, zarr_format: Literal[2]) -> VariableLengthUTF8JSON_V2: ... - @overload - def to_json(self, zarr_format: Literal[3]) -> Literal["string"]: ... - - def to_json(self, zarr_format: ZarrFormat) -> VariableLengthUTF8JSON_V2 | Literal["string"]: - """ - Convert this data type to a JSON representation. - - Parameters - ---------- - zarr_format : int - The zarr format to use for the JSON representation. - - Returns - ------- - ``VariableLengthUTF8JSON_V2 | Literal["string"]`` - The JSON representation of this data type. - """ - if zarr_format == 2: - return {"name": "|O", "object_codec_id": self.object_codec_id} - elif zarr_format == 3: - return self._zarr_v3_name - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover - def default_scalar(self) -> str: """ Return the default scalar value for this data type. 
diff --git a/src/zarr/core/dtype/npy/structured.py b/src/zarr/core/dtype/npy/structured.py index b865998e52..fff852e328 100644 --- a/src/zarr/core/dtype/npy/structured.py +++ b/src/zarr/core/dtype/npy/structured.py @@ -2,7 +2,7 @@ from collections.abc import Sequence from dataclasses import dataclass -from typing import TYPE_CHECKING, ClassVar, Literal, Self, TypeGuard, cast, overload +from typing import TYPE_CHECKING, ClassVar, Literal, Self, TypeGuard, cast import numpy as np @@ -264,7 +264,7 @@ def _check_json_v3(cls, data: DTypeJSON) -> TypeGuard[StructuredJSON_V3]: return ( isinstance(data, dict) and set(data.keys()) == {"name", "configuration"} - and data["name"] == cls._zarr_v3_name + and cls._check_zarr_v3_name(data["name"]) and isinstance(data["configuration"], dict) and set(data["configuration"].keys()) == {"fields"} ) @@ -309,49 +309,20 @@ def _from_json_v3(cls, data: DTypeJSON) -> Self: msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected a JSON object with the key {cls._zarr_v3_name!r}" raise DataTypeValidationError(msg) - @overload - def to_json(self, zarr_format: Literal[2]) -> StructuredJSON_V2: ... - - @overload - def to_json(self, zarr_format: Literal[3]) -> StructuredJSON_V3: ... - - def to_json(self, zarr_format: ZarrFormat) -> StructuredJSON_V2 | StructuredJSON_V3: - """ - Convert the structured data type to a JSON-serializable form. - - Parameters - ---------- - zarr_format : ZarrFormat - The Zarr format version. Accepted values are 2 and 3. - - Returns - ------- - StructuredJSON_V2 | StructuredJSON_V3 - The JSON representation of the structured data type. - - Raises - ------ - ValueError - If the zarr_format is not 2 or 3. 
- """ - if zarr_format == 2: - fields = [ - [f_name, f_dtype.to_json(zarr_format=zarr_format)["name"]] - for f_name, f_dtype in self.fields - ] - return {"name": fields, "object_codec_id": None} - elif zarr_format == 3: - v3_unstable_dtype_warning(self) - fields = [ - [f_name, f_dtype.to_json(zarr_format=zarr_format)] # type: ignore[list-item] - for f_name, f_dtype in self.fields - ] - base_dict = { - "name": self._zarr_v3_name, - "configuration": {"fields": fields}, - } - return cast("StructuredJSON_V3", base_dict) - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + def _to_json_v2(self) -> StructuredJSON_V2: + fields = [ + [f_name, f_dtype.to_json(zarr_format=2)["name"]] for f_name, f_dtype in self.fields + ] + return {"name": fields, "object_codec_id": None} + + def _to_json_v3(self) -> StructuredJSON_V3: + v3_unstable_dtype_warning(self) + fields = [[f_name, f_dtype.to_json(zarr_format=3)] for f_name, f_dtype in self.fields] + base_dict = { + "name": self._zarr_v3_name, + "configuration": {"fields": fields}, + } + return cast("StructuredJSON_V3", base_dict) def _check_scalar(self, data: object) -> TypeGuard[StructuredScalarLike]: # TODO: implement something more precise here! @@ -544,13 +515,16 @@ class Struct(Structured): """ _zarr_v3_name: ClassVar[Literal["struct"]] = "struct" # type: ignore[assignment] + # "structured" is the legacy name for this data type; it is accepted on input but "struct" + # is always written out. 
+ _aliases: ClassVar[tuple[str, ...]] = ("structured",) @classmethod def _check_json_v3(cls, data: DTypeJSON) -> TypeGuard[StructJSON_V3]: # type: ignore[override] return ( isinstance(data, dict) and set(data.keys()) == {"name", "configuration"} - and data["name"] in ("struct", "structured") + and cls._check_zarr_v3_name(data["name"]) and isinstance(data["configuration"], dict) and set(data["configuration"].keys()) == {"fields"} ) @@ -575,30 +549,22 @@ def _from_json_v3(cls, data: DTypeJSON) -> Self: msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected a JSON object with the key {cls._zarr_v3_name!r}" raise DataTypeValidationError(msg) - @overload # type: ignore[override] - def to_json(self, zarr_format: Literal[2]) -> StructuredJSON_V2: ... - - @overload - def to_json(self, zarr_format: Literal[3]) -> StructJSON_V3: ... - - def to_json(self, zarr_format: ZarrFormat) -> StructuredJSON_V2 | StructJSON_V3: - if zarr_format == 2: - fields_v2 = [ - [f_name, f_dtype.to_json(zarr_format=zarr_format)["name"]] - for f_name, f_dtype in self.fields - ] - return {"name": fields_v2, "object_codec_id": None} - elif zarr_format == 3: - v3_unstable_dtype_warning(self) - fields_v3 = [ - {"name": f_name, "data_type": f_dtype.to_json(zarr_format=zarr_format)} - for f_name, f_dtype in self.fields - ] - return cast( - "StructJSON_V3", - {"name": self._zarr_v3_name, "configuration": {"fields": fields_v3}}, - ) - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + def _to_json_v2(self) -> StructuredJSON_V2: + fields_v2 = [ + [f_name, f_dtype.to_json(zarr_format=2)["name"]] for f_name, f_dtype in self.fields + ] + return {"name": fields_v2, "object_codec_id": None} + + def _to_json_v3(self) -> StructJSON_V3: # type: ignore[override] + v3_unstable_dtype_warning(self) + fields_v3 = [ + {"name": f_name, "data_type": f_dtype.to_json(zarr_format=3)} + for f_name, f_dtype in self.fields + ] + return cast( + "StructJSON_V3", + 
{"name": self._zarr_v3_name, "configuration": {"fields": fields_v3}}, + ) def from_json_scalar(self, data: JSON, *, zarr_format: ZarrFormat) -> np.void: """ diff --git a/src/zarr/core/dtype/npy/time.py b/src/zarr/core/dtype/npy/time.py index 4efa0be7bb..87b656e09a 100644 --- a/src/zarr/core/dtype/npy/time.py +++ b/src/zarr/core/dtype/npy/time.py @@ -11,7 +11,6 @@ TypeGuard, cast, get_args, - overload, ) import numpy as np @@ -23,20 +22,19 @@ DTypeJSON, HasEndianness, HasItemSize, - check_dtype_spec_v2, ) from zarr.core.dtype.npy.common import ( - DATETIME_UNIT, DateTimeUnit, + NumpyNativeDTypeV2, check_json_int, endianness_to_numpy_str, get_endianness_from_numpy_dtype, ) -from zarr.core.dtype.wrapper import TBaseDType, ZDType from zarr.errors import DataTypeValidationError if TYPE_CHECKING: from zarr.core.common import JSON, ZarrFormat + from zarr.core.dtype.wrapper import TBaseDType TimeDeltaLike = str | int | bytes | np.timedelta64 | timedelta | None DateTimeLike = str | int | bytes | np.datetime64 | datetime | None @@ -209,7 +207,7 @@ class DateTime64JSON_V2(DTypeConfig_V2[str, None]): class TimeDTypeBase[ DType: np.dtypes.TimeDelta64DType | np.dtypes.DateTime64DType, Scalar: np.timedelta64 | np.datetime64, -](ZDType[DType, Scalar], HasEndianness, HasItemSize): +](NumpyNativeDTypeV2[DType, Scalar], HasEndianness, HasItemSize): """ A base class for data types that represent time via the NumPy TimeDelta64 and DateTime64 data types. @@ -348,43 +346,8 @@ class TimeDelta64(TimeDTypeBase[np.dtypes.TimeDelta64DType, np.timedelta64], Has unit: DateTimeUnit = "generic" scale_factor: int = 1 _zarr_v3_name: ClassVar[Literal["numpy.timedelta64"]] = "numpy.timedelta64" - _zarr_v2_names: ClassVar[tuple[Literal[">m8"], Literal["m8", " TypeGuard[TimeDelta64JSON_V2]: - """ - Validate that the provided JSON input accurately represents a NumPy timedelta64 data type, - which could be in the form of strings like "m8[10s]". 
This method serves as a type - guard, helping to refine the type of unknown JSON input by confirming its adherence to the - expected format for NumPy timedelta64 data types. - - The JSON input should contain a "name" key with a value that matches the expected string - pattern for NumPy timedelta64 data types. The pattern includes an optional unit enclosed - within square brackets, following the base type identifier. - - Returns - ------- - bool - True if the JSON input is a valid representation of this class, - otherwise False. - """ - if not check_dtype_spec_v2(data): - return False - name = data["name"] - # match m[M], etc - # consider making this a standalone function - if not isinstance(name, str): - return False - if not name.startswith(cls._zarr_v2_names): - return False - if len(name) == 3: - # no unit, and - # we already checked that this string is either m8 - return True - else: - return name[4:-1].endswith(DATETIME_UNIT) and name[-1] == "]" - @classmethod def _check_json_v3(cls, data: DTypeJSON) -> TypeGuard[DateTime64JSON_V3]: """ @@ -399,40 +362,11 @@ def _check_json_v3(cls, data: DTypeJSON) -> TypeGuard[DateTime64JSON_V3]: return ( isinstance(data, dict) and set(data.keys()) == {"name", "configuration"} - and data["name"] == cls._zarr_v3_name + and cls._check_zarr_v3_name(data["name"]) and isinstance(data["configuration"], dict) and set(data["configuration"].keys()) == {"unit", "scale_factor"} ) - @classmethod - def _from_json_v2(cls, data: DTypeJSON) -> Self: - """ - Create a TimeDelta64 from a Zarr V2-flavored JSON. - - Parameters - ---------- - data : DTypeJSON - The JSON data. - - Returns - ------- - TimeDelta64 - An instance of TimeDelta64. - - Raises - ------ - DataTypeValidationError - If the input JSON is not a valid representation of this class. - """ - if cls._check_json_v2(data): - name = data["name"] - return cls.from_native_dtype(np.dtype(name)) - msg = ( - f"Invalid JSON representation of {cls.__name__}. 
Got {data!r}, expected a string " - f"representation of an instance of {cls.dtype_cls}" - ) - raise DataTypeValidationError(msg) - @classmethod def _from_json_v3(cls, data: DTypeJSON) -> Self: """ @@ -467,39 +401,11 @@ def _from_json_v3(cls, data: DTypeJSON) -> Self: ) raise DataTypeValidationError(msg) - @overload - def to_json(self, zarr_format: Literal[2]) -> TimeDelta64JSON_V2: ... - @overload - def to_json(self, zarr_format: Literal[3]) -> TimeDelta64JSON_V3: ... - - def to_json(self, zarr_format: ZarrFormat) -> TimeDelta64JSON_V2 | TimeDelta64JSON_V3: - """ - Serialize this data type to JSON. - - Parameters - ---------- - zarr_format : ZarrFormat - The Zarr format version (2 or 3). - - Returns - ------- - TimeDelta64JSON_V2 | TimeDelta64JSON_V3 - The JSON representation of the data type. - - Raises - ------ - ValueError - If the zarr_format is not 2 or 3. - """ - if zarr_format == 2: - name = self.to_native_dtype().str - return {"name": name, "object_codec_id": None} - elif zarr_format == 3: - return { - "name": self._zarr_v3_name, - "configuration": {"unit": self.unit, "scale_factor": self.scale_factor}, - } - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + def _to_json_v3(self) -> TimeDelta64JSON_V3: + return { + "name": self._zarr_v3_name, + "configuration": {"unit": self.unit, "scale_factor": self.scale_factor}, + } def _check_scalar(self, data: object) -> TypeGuard[TimeDeltaLike]: """ @@ -623,41 +529,10 @@ class DateTime64(TimeDTypeBase[np.dtypes.DateTime64DType, np.datetime64], HasEnd dtype_cls = np.dtypes.DateTime64DType # type: ignore[assignment] _zarr_v3_name: ClassVar[Literal["numpy.datetime64"]] = "numpy.datetime64" - _zarr_v2_names: ClassVar[tuple[Literal[">M8"], Literal["M8", " TypeGuard[DateTime64JSON_V2]: - """ - Check that the input is a valid JSON representation of this data type. - - Parameters - ---------- - data : DTypeJSON - The JSON data to check. 
- - Returns - ------- - TypeGuard[DateTime64JSON_V2] - True if the input is a valid JSON representation of a NumPy datetime64 data type, - otherwise False. - """ - if not check_dtype_spec_v2(data): - return False - name = data["name"] - if not isinstance(name, str): - return False - if not name.startswith(cls._zarr_v2_names): - return False - if len(name) == 3: - # no unit, and - # we already checked that this string is either M8 - return True - else: - return name[4:-1].endswith(DATETIME_UNIT) and name[-1] == "]" - @classmethod def _check_json_v3(cls, data: DTypeJSON) -> TypeGuard[DateTime64JSON_V3]: """ @@ -677,45 +552,11 @@ def _check_json_v3(cls, data: DTypeJSON) -> TypeGuard[DateTime64JSON_V3]: return ( isinstance(data, dict) and set(data.keys()) == {"name", "configuration"} - and data["name"] == cls._zarr_v3_name + and cls._check_zarr_v3_name(data["name"]) and isinstance(data["configuration"], dict) and set(data["configuration"].keys()) == {"unit", "scale_factor"} ) - @classmethod - def _from_json_v2(cls, data: DTypeJSON) -> Self: - """ - Create an instance of this data type from a Zarr V2-flavored JSON representation. - - This method checks if the provided JSON data is a valid representation of this class. - If valid, it creates an instance using the native NumPy dtype. Otherwise, it raises a - DataTypeValidationError. - - Parameters - ---------- - data : DTypeJSON - The JSON data to parse. - - Returns - ------- - Self - An instance of this data type. - - Raises - ------ - DataTypeValidationError - If the input JSON is not a valid representation of this class. - """ - - if cls._check_json_v2(data): - name = data["name"] - return cls.from_native_dtype(np.dtype(name)) - msg = ( - f"Invalid JSON representation of {cls.__name__}. 
Got {data!r}, expected a string " - f"representation of an instance of {cls.dtype_cls}" - ) - raise DataTypeValidationError(msg) - @classmethod def _from_json_v3(cls, data: DTypeJSON) -> Self: """ @@ -752,39 +593,11 @@ def _from_json_v3(cls, data: DTypeJSON) -> Self: ) raise DataTypeValidationError(msg) - @overload - def to_json(self, zarr_format: Literal[2]) -> DateTime64JSON_V2: ... - @overload - def to_json(self, zarr_format: Literal[3]) -> DateTime64JSON_V3: ... - - def to_json(self, zarr_format: ZarrFormat) -> DateTime64JSON_V2 | DateTime64JSON_V3: - """ - Serialize this data type to JSON. - - Parameters - ---------- - zarr_format : ZarrFormat - The Zarr format version (2 or 3). - - Returns - ------- - DateTime64JSON_V2 | DateTime64JSON_V3 - The JSON representation of the data type. - - Raises - ------ - ValueError - If the zarr_format is not 2 or 3. - """ - if zarr_format == 2: - name = self.to_native_dtype().str - return {"name": name, "object_codec_id": None} - elif zarr_format == 3: - return { - "name": self._zarr_v3_name, - "configuration": {"unit": self.unit, "scale_factor": self.scale_factor}, - } - raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + def _to_json_v3(self) -> DateTime64JSON_V3: + return { + "name": self._zarr_v3_name, + "configuration": {"unit": self.unit, "scale_factor": self.scale_factor}, + } def _check_scalar(self, data: object) -> TypeGuard[DateTimeLike]: """ diff --git a/src/zarr/core/dtype/registry.py b/src/zarr/core/dtype/registry.py index 0a9b2aa64a..08988e4bc0 100644 --- a/src/zarr/core/dtype/registry.py +++ b/src/zarr/core/dtype/registry.py @@ -1,210 +1,300 @@ from __future__ import annotations import contextlib -from dataclasses import dataclass, field -from typing import TYPE_CHECKING, Self +from collections import defaultdict +from collections.abc import Mapping +from importlib.metadata import entry_points as get_entry_points +from typing import TYPE_CHECKING, cast import numpy as np from 
zarr.errors import DataTypeValidationError if TYPE_CHECKING: - from importlib.metadata import EntryPoint - from zarr.core.common import ZarrFormat from zarr.core.dtype.common import DTypeJSON from zarr.core.dtype.wrapper import TBaseDType, TBaseScalar, ZDType +# The data type registry is just a mapping from a canonical name to a ZDType class. +# Resolution logic (matching a native dtype or a JSON document to a registered type) lives in the +# free functions below rather than on the mapping itself, so the registry stays a plain dict that +# anyone can read, copy, or populate. +DataTypeRegistry = dict[str, "type[ZDType[TBaseDType, TBaseScalar]]"] + +# The global registry. Built-in data types are registered into this at import time (see +# ``zarr.core.dtype.__init__``), and third parties can add to it via ``register_data_type``. +data_type_registry: DataTypeRegistry = {} + + +def register_data_type( + cls: type[ZDType[TBaseDType, TBaseScalar]], + *, + name: str | None = None, + registry: DataTypeRegistry | None = None, +) -> None: + """ + Register a ZDType class under its canonical name. + + Parameters + ---------- + cls : type[ZDType] + The data type class to register. + name : str, optional + The name to register the class under. Defaults to ``cls._zarr_v3_name``. + registry : dict, optional + The registry to register into. Defaults to the global registry. + """ + if registry is None: + registry = data_type_registry + registry[name if name is not None else cls._zarr_v3_name] = cls + + +def unregister_data_type(name: str, *, registry: DataTypeRegistry | None = None) -> None: + """ + Remove a data type from the registry by its canonical name. + + Raises + ------ + KeyError + If the data type is not found in the registry. 
+ """ + if registry is None: + registry = data_type_registry + del registry[name] + + +def load_data_type_entrypoints(*, registry: DataTypeRegistry | None = None) -> None: + """ + Discover and register data types advertised via the ``zarr.data_type`` entry point group. + """ + entry_points = get_entry_points() + for e in ( + *entry_points.select(group="zarr.data_type"), + *entry_points.select(group="zarr", name="data_type"), + ): + register_data_type(e.load(), registry=registry) + + +def match_dtype( + dtype: TBaseDType, *, registry: DataTypeRegistry | None = None +) -> ZDType[TBaseDType, TBaseScalar]: + """ + Match a native data type, e.g. a NumPy data type, to a registered ZDType. -# This class is different from the other registry classes, which inherit from -# dict. IMO it's simpler to just do a dataclass. But long-term we should -# have just 1 registry class in use. -@dataclass(frozen=True, kw_only=True) -class DataTypeRegistry: + Parameters + ---------- + dtype : TBaseDType + The native data type to match. + + Returns + ------- + ZDType[TBaseDType, TBaseScalar] + The matched ZDType corresponding to the provided NumPy data type. + + Raises + ------ + ValueError + If the data type is a NumPy "Object" type, which is ambiguous, or if multiple + or no Zarr data types are found that match the provided dtype. + + Notes + ----- + This function attempts to resolve a Zarr data type from a given native data type. + If the dtype is a NumPy "Object" data type, it raises a ValueError, as this type + can represent multiple Zarr data types. In such cases, a specific Zarr data type + should be explicitly constructed instead of relying on dynamic resolution. + + If multiple matches are found, it will also raise a ValueError. In this case + conflicting data types must be unregistered, or the Zarr data type should be explicitly + constructed. 
+ """ + if registry is None: + registry = data_type_registry + + # The NumPy "Object" dtype is a catch-all that can back many Zarr data types, so it cannot be + # uniquely resolved from the native dtype alone. + if dtype == np.dtype("O"): + msg = ( + f"Zarr data type resolution from {dtype} failed. " + 'Attempted to resolve a zarr data type from a numpy "Object" data type, which is ' + 'ambiguous, as multiple zarr data types can be represented by the numpy "Object" ' + "data type. " + "In this case you should construct your array by providing a specific Zarr data " + 'type. For a list of Zarr data types that are compatible with the numpy "Object"' + "data type, see https://github.com/zarr-developers/zarr-python/issues/3117" + ) + raise ValueError(msg) + # Resolve via an index keyed on the NumPy dtype class rather than scanning every registered + # data type. Each class maps to a single data type, except NumPy's VoidDType, which is shared + # by the raw-bytes and structured data types (disambiguated by from_native_dtype via .fields). + candidates = _native_dtype_index(registry).get(type(dtype), []) + matched: list[ZDType[TBaseDType, TBaseScalar]] = [] + for cls in candidates: + # DataTypeValidationError means "this dtype doesn't match me", which is expected and + # suppressed. Other exceptions (e.g. ValueError for an unsupported configuration) propagate. + with contextlib.suppress(DataTypeValidationError): + matched.append(cls.from_native_dtype(dtype)) + if len(matched) == 1: + return matched[0] + if len(matched) > 1: + msg = ( + f"Zarr data type resolution from {dtype} failed. " + f"Multiple data type wrappers found that match dtype '{dtype}': {matched}. " + "You should unregister one of these data types, or avoid Zarr data type inference " + "entirely by providing a specific Zarr data type when creating your array." 
+ "For more information, see https://github.com/zarr-developers/zarr-python/issues/3117" + ) + raise ValueError(msg) + raise ValueError(f"No Zarr data type found that matches dtype '{dtype!r}'") + + +def match_json( + data: DTypeJSON, *, zarr_format: ZarrFormat, registry: DataTypeRegistry | None = None +) -> ZDType[TBaseDType, TBaseScalar]: """ - A registry for ZDType classes. + Match a JSON representation of a data type to a registered ZDType. - This registry is a mapping from Zarr data type names to their - corresponding ZDType classes. + Resolution normalizes the input to a data type *name* and looks that name up, rather than + trying every registered data type in turn: - Attributes + - Zarr V3: the canonical name, an alias, or a parametric name (e.g. raw ``r``) is mapped + directly to its data type. + - Zarr V2: object-codec-backed types are identified by their ``object_codec_id``; everything + else is resolved through the native NumPy dtype (which also accepts wrong-but-parsable + type strings such as ``">u1"`` unless the ``data_type_resolution`` config is ``"strict"``). + + Parameters ---------- - contents : dict[str, type[ZDType[TBaseDType, TBaseScalar]]] - The mapping from Zarr data type names to their corresponding - ZDType classes. + data : DTypeJSON + The JSON representation of a data type to match. + zarr_format : ZarrFormat + The Zarr format version to consider when matching data types. + + Returns + ------- + ZDType[TBaseDType, TBaseScalar] + The matched ZDType corresponding to the JSON representation. + + Raises + ------ + ValueError + If no matching Zarr data type is found for the given JSON data. 
""" + if registry is None: + registry = data_type_registry + if zarr_format == 2: + return _match_json_v2(data, registry) + if zarr_format == 3: + return _match_json_v3(data, registry) + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + - contents: dict[str, type[ZDType[TBaseDType, TBaseScalar]]] = field( - default_factory=dict, init=False - ) - - _lazy_load_list: list[EntryPoint] = field(default_factory=list, init=False) - - def _lazy_load(self) -> None: - """ - Load all data types from the lazy load list and register them with - the registry. After loading, clear the lazy load list. - """ - for e in self._lazy_load_list: - self.register(e.load()._zarr_v3_name, e.load()) - - self._lazy_load_list.clear() - - def register(self: Self, key: str, cls: type[ZDType[TBaseDType, TBaseScalar]]) -> None: - """ - Register a data type with the registry. - - Parameters - ---------- - key : str - The Zarr V3 name of the data type. - cls : type[ZDType[TBaseDType, TBaseScalar]] - The class of the data type to register. - - Notes - ----- - This method is idempotent. If the data type is already registered, this - method does nothing. - """ - if key not in self.contents or self.contents[key] != cls: - self.contents[key] = cls - - def unregister(self, key: str) -> None: - """ - Unregister a data type from the registry. - - Parameters - ---------- - key : str - The key associated with the ZDType class to be unregistered. - - Returns - ------- - None - - Raises - ------ - KeyError - If the data type is not found in the registry. - """ - if key in self.contents: - del self.contents[key] - else: - raise KeyError(f"Data type '{key}' not found in registry.") - - def get(self, key: str) -> type[ZDType[TBaseDType, TBaseScalar]]: - """ - Retrieve a registered ZDType class by its key. - - Parameters - ---------- - key : str - The key associated with the desired ZDType class. 
- - Returns - ------- - type[ZDType[TBaseDType, TBaseScalar]] - The ZDType class registered under the given key. - - Raises - ------ - KeyError - If the key is not found in the registry. - """ - - return self.contents[key] - - def match_dtype(self, dtype: TBaseDType) -> ZDType[TBaseDType, TBaseScalar]: - """ - Match a native data type, e.g. a NumPy data type, to a registered ZDType. - - Parameters - ---------- - dtype : TBaseDType - The native data type to match. - - Returns - ------- - ZDType[TBaseDType, TBaseScalar] - The matched ZDType corresponding to the provided NumPy data type. - - Raises - ------ - ValueError - If the data type is a NumPy "Object" type, which is ambiguous, or if multiple - or no Zarr data types are found that match the provided dtype. - - Notes - ----- - This function attempts to resolve a Zarr data type from a given native data type. - If the dtype is a NumPy "Object" data type, it raises a ValueError, as this type - can represent multiple Zarr data types. In such cases, a specific Zarr data type - should be explicitly constructed instead of relying on dynamic resolution. - - If multiple matches are found, it will also raise a ValueError. In this case - conflicting data types must be unregistered, or the Zarr data type should be explicitly - constructed. - """ - - if dtype == np.dtype("O"): - msg = ( - f"Zarr data type resolution from {dtype} failed. " - 'Attempted to resolve a zarr data type from a numpy "Object" data type, which is ' - 'ambiguous, as multiple zarr data types can be represented by the numpy "Object" ' - "data type. " - "In this case you should construct your array by providing a specific Zarr data " - 'type. 
For a list of Zarr data types that are compatible with the numpy "Object"' - "data type, see https://github.com/zarr-developers/zarr-python/issues/3117" - ) - raise ValueError(msg) - matched: list[ZDType[TBaseDType, TBaseScalar]] = [] - for val in self.contents.values(): - # DataTypeValidationError means "this dtype doesn't match me", which is - # expected and suppressed. Other exceptions (e.g. ValueError for a dtype - # that matches the type but has an invalid configuration) are propagated - # to the caller. +def _no_match(data: object) -> ValueError: + return ValueError(f"No Zarr data type found that matches {data!r}") + + +def _native_dtype_index( + registry: DataTypeRegistry, +) -> dict[type[TBaseDType], list[type[ZDType[TBaseDType, TBaseScalar]]]]: + """ + Index a registry by the NumPy dtype class that each data type wraps. The index is small and + cheap to build, and is rebuilt per call so the registry itself stays a plain mapping. + """ + index: dict[type[TBaseDType], list[type[ZDType[TBaseDType, TBaseScalar]]]] = defaultdict(list) + for cls in registry.values(): + index[cls.dtype_cls].append(cls) + return index + + +def _resolve_v3_name( + name: str, registry: DataTypeRegistry +) -> type[ZDType[TBaseDType, TBaseScalar]] | None: + """Resolve a Zarr V3 data type name (canonical, alias, or parametric) to its data type.""" + if name in registry: + return registry[name] + for cls in registry.values(): + if name in cls._aliases: + return cls + pattern = cls._zarr_v3_name_pattern + if pattern is not None and pattern.match(name): + return cls + return None + + +def _match_json_v3( + data: DTypeJSON, registry: DataTypeRegistry +) -> ZDType[TBaseDType, TBaseScalar]: + if isinstance(data, str): + name: object = data + elif isinstance(data, Mapping) and "name" in data: + name = data["name"] + else: + name = None + if isinstance(name, str): + cls = _resolve_v3_name(name, registry) + if cls is not None: with contextlib.suppress(DataTypeValidationError): - 
matched.append(val.from_native_dtype(dtype)) - if len(matched) == 1: - return matched[0] - elif len(matched) > 1: - msg = ( - f"Zarr data type resolution from {dtype} failed. " - f"Multiple data type wrappers found that match dtype '{dtype}': {matched}. " - "You should unregister one of these data types, or avoid Zarr data type inference " - "entirely by providing a specific Zarr data type when creating your array." - "For more information, see https://github.com/zarr-developers/zarr-python/issues/3117" - ) - raise ValueError(msg) - raise ValueError(f"No Zarr data type found that matches dtype '{dtype!r}'") - - def match_json( - self, data: DTypeJSON, *, zarr_format: ZarrFormat - ) -> ZDType[TBaseDType, TBaseScalar]: - """ - Match a JSON representation of a data type to a registered ZDType. - - Parameters - ---------- - data : DTypeJSON - The JSON representation of a data type to match. - zarr_format : ZarrFormat - The Zarr format version to consider when matching data types. - - Returns - ------- - ZDType[TBaseDType, TBaseScalar] - The matched ZDType corresponding to the JSON representation. - - Raises - ------ - ValueError - If no matching Zarr data type is found for the given JSON data. - """ - - for val in self.contents.values(): + return cls.from_json(data, zarr_format=3) + raise _no_match(data) + + +def _match_json_v2( + data: DTypeJSON, registry: DataTypeRegistry +) -> ZDType[TBaseDType, TBaseScalar]: + if not (isinstance(data, Mapping) and "name" in data and "object_codec_id" in data): + raise _no_match(data) + name = data["name"] + object_codec_id = data["object_codec_id"] + if object_codec_id is not None: + # Object-codec-backed data types (variable-length strings/bytes) are identified by the + # object codec id rather than the (always "|O") name. 
+ for cls in registry.values(): + if cls.object_codec_id == object_codec_id: + with contextlib.suppress(DataTypeValidationError): + return cls.from_json(data, zarr_format=2) + raise _no_match(data) + if isinstance(name, str): + # First try interpreting the name as a NumPy type string (the usual Zarr V2 case). This + # also accepts wrong-but-parsable spellings (e.g. ">u1", which NumPy normalizes to "|u1"); + # in strict mode such non-canonical names are rejected. + native_dtype: TBaseDType | None + try: + native_dtype = np.dtype(name) + except TypeError: + native_dtype = None + if native_dtype is not None: try: - return val.from_json(data, zarr_format=zarr_format) - except DataTypeValidationError: - pass - raise ValueError(f"No Zarr data type found that matches {data!r}") + zdtype = match_dtype(native_dtype, registry=registry) + except ValueError: + zdtype = None + if zdtype is not None: + if _resolution_mode() == "strict": + canonical = zdtype.to_json(zarr_format=2)["name"] + if name != canonical: + raise ValueError( + f"The Zarr V2 data type name {name!r} is not spec-compliant; the " + f"canonical name for this data type is {canonical!r}. Set the " + "'data_type_resolution' config option to 'compatible' to read it anyway." + ) + return zdtype + # Fall back to interpreting the name as a registered data type name. This covers data types + # (e.g. parameter-free custom data types) whose Zarr V2 name is their registered name rather + # than a NumPy type string. + named_cls = _resolve_v3_name(name, registry) + if named_cls is not None: + with contextlib.suppress(DataTypeValidationError): + return named_cls.from_json(data, zarr_format=2) + raise _no_match(data) + # Otherwise the name is a structured data type descriptor (a sequence of fields), which the + # raw-bytes / structured data types (NumPy VoidDType) know how to parse. 
+ for cls in _native_dtype_index(registry).get(cast("type[TBaseDType]", np.dtypes.VoidDType), []): + with contextlib.suppress(DataTypeValidationError): + return cls.from_json(data, zarr_format=2) + raise _no_match(data) + + +def _resolution_mode() -> str: + """The configured data type resolution mode: ``"strict"`` or ``"compatible"``.""" + from zarr.core.config import config + + return str(config.get("data_type_resolution")) diff --git a/src/zarr/core/dtype/wrapper.py b/src/zarr/core/dtype/wrapper.py index 42d5d88473..6c45095844 100644 --- a/src/zarr/core/dtype/wrapper.py +++ b/src/zarr/core/dtype/wrapper.py @@ -36,7 +36,12 @@ import numpy as np +from zarr.core.dtype.common import check_dtype_spec_v2 +from zarr.errors import DataTypeValidationError + if TYPE_CHECKING: + import re + from zarr.core.common import JSON, ZarrFormat from zarr.core.dtype.common import DTypeJSON, DTypeSpec_V2, DTypeSpec_V3 @@ -65,6 +70,30 @@ class variable, and it should generally be unique across different data types. # this class will create a native data type dtype_cls: ClassVar[type[TBaseDType]] _zarr_v3_name: ClassVar[str] + # Alternative Zarr V3 names that are *accepted* on input (in array metadata or as a + # user-provided data type string) in addition to ``_zarr_v3_name``. The canonical + # ``_zarr_v3_name`` is always what gets written out; aliases are input-only. Most data types + # have no aliases. + _aliases: ClassVar[tuple[str, ...]] = () + # A pattern for *parametric* Zarr V3 names that encode a parameter in the name itself rather + # than a fixed string or alias (e.g. the raw data type's ``r<N>``, where N is a bit count). + # Used during data type resolution to map such a name to this data type. Most data types have + # no parametric name (None). + _zarr_v3_name_pattern: ClassVar[re.Pattern[str] | None] = None + # For data types backed by the NumPy object dtype in Zarr V2 (e.g. variable-length strings), + # this is the id of the object codec used to encode values, e.g.
"vlen-utf8". For all other + # data types it is None. It is used by the default Zarr V2 (de)serialization below. + object_codec_id: ClassVar[str | None] = None + + @classmethod + def _zarr_v3_names(cls) -> tuple[str, ...]: + """All Zarr V3 names accepted for this data type: the canonical name plus any aliases.""" + return (cls._zarr_v3_name, *cls._aliases) + + @classmethod + def _check_zarr_v3_name(cls, name: object) -> bool: + """Whether ``name`` is this data type's canonical Zarr V3 name or one of its aliases.""" + return name in cls._zarr_v3_names() @classmethod def _check_native_dtype(cls: type[Self], dtype: TBaseDType) -> TypeGuard[DType]: @@ -124,14 +153,38 @@ def to_native_dtype(self: Self) -> DType: raise NotImplementedError # pragma: no cover @classmethod - @abstractmethod def _from_json_v2(cls: type[Self], data: DTypeJSON) -> Self: - raise NotImplementedError # pragma: no cover + """ + Construct an instance from the Zarr V2 JSON form of this data type. + + The default implementation handles parameter-free data types whose Zarr V2 ``name`` equals + their Zarr V3 name (the common case for custom data types). Data types that are + parametrized, or that use a NumPy type string as their Zarr V2 name, should override this. + """ + if ( + check_dtype_spec_v2(data) + and data["name"] == cls._zarr_v3_name + and data["object_codec_id"] == cls.object_codec_id + ): + return cls() + raise DataTypeValidationError( + f"Invalid Zarr V2 JSON representation of {cls.__name__}: {data!r}" + ) @classmethod - @abstractmethod def _from_json_v3(cls: type[Self], data: DTypeJSON) -> Self: - raise NotImplementedError # pragma: no cover + """ + Construct an instance from the Zarr V3 JSON form of this data type. + + The default implementation handles parameter-free data types, whose Zarr V3 form is just + the data type name (or one of its aliases). Data types with a ``configuration`` object + should override this. 
+ """ + if cls._check_zarr_v3_name(data): + return cls() + raise DataTypeValidationError( + f"Invalid Zarr V3 JSON representation of {cls.__name__}: {data!r}" + ) @classmethod def from_json(cls: type[Self], data: DTypeJSON, *, zarr_format: ZarrFormat) -> Self: @@ -163,11 +216,15 @@ def to_json(self, zarr_format: Literal[2]) -> DTypeSpec_V2: ... @overload def to_json(self, zarr_format: Literal[3]) -> DTypeSpec_V3: ... - @abstractmethod def to_json(self, zarr_format: ZarrFormat) -> DTypeSpec_V2 | DTypeSpec_V3: """ Serialize this ZDType to JSON. + The default implementation handles parameter-free data types: in Zarr V3 the representation + is just the data type name, and in Zarr V2 it is that name plus an optional object codec id. + Data types with a ``configuration`` (Zarr V3), or that use a NumPy type string as their + Zarr V2 name, should override this. + Parameters ---------- zarr_format : ZarrFormat @@ -178,7 +235,23 @@ def to_json(self, zarr_format: ZarrFormat) -> DTypeSpec_V2 | DTypeSpec_V3: DTypeJSON_V2 | DTypeJSON_V3 The JSON-serializable representation of the wrapped data type """ - raise NotImplementedError # pragma: no cover + if zarr_format == 2: + return self._to_json_v2() + if zarr_format == 3: + return self._to_json_v3() + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover + + def _to_json_v2(self) -> DTypeSpec_V2: + """ + Serialize this ZDType to its Zarr V2 JSON form. See ``to_json`` for the default behavior. + """ + return {"name": self._zarr_v3_name, "object_codec_id": self.object_codec_id} + + def _to_json_v3(self) -> DTypeSpec_V3: + """ + Serialize this ZDType to its Zarr V3 JSON form. See ``to_json`` for the default behavior. 
+ """ + return self._zarr_v3_name @abstractmethod def _check_scalar(self, data: object) -> bool: diff --git a/src/zarr/dtype.py b/src/zarr/dtype.py index 0c271b6c90..4df21d8a8a 100644 --- a/src/zarr/dtype.py +++ b/src/zarr/dtype.py @@ -43,6 +43,8 @@ # so it doesn't show up in the docs parse_data_type, # noqa: F401 parse_dtype, + register_data_type, + unregister_data_type, ) from zarr.core.dtype.common import DTypeSpec_V2, check_dtype_spec_v2 @@ -90,6 +92,8 @@ "check_dtype_spec_v2", "data_type_registry", "parse_dtype", + "register_data_type", + "unregister_data_type", ] diff --git a/src/zarr/registry.py b/src/zarr/registry.py index 48f60fabd7..29a3228336 100644 --- a/src/zarr/registry.py +++ b/src/zarr/registry.py @@ -6,7 +6,6 @@ from typing import TYPE_CHECKING, Any from zarr.core.config import BadConfigError, config -from zarr.core.dtype import data_type_registry from zarr.errors import ZarrUserWarning if TYPE_CHECKING: @@ -98,8 +97,8 @@ def _collect_entrypoints() -> list[Registry[Any]]: _ndbuffer_registry.lazy_load_list.extend(entry_points.select(group="zarr.ndbuffer")) _ndbuffer_registry.lazy_load_list.extend(entry_points.select(group="zarr", name="ndbuffer")) - data_type_registry._lazy_load_list.extend(entry_points.select(group="zarr.data_type")) - data_type_registry._lazy_load_list.extend(entry_points.select(group="zarr", name="data_type")) + # Data types are loaded eagerly from entry points at import time of zarr.core.dtype + # (see load_data_type_entrypoints), so there is nothing to collect here. 
_chunk_key_encoding_registry.lazy_load_list.extend( entry_points.select(group="zarr.chunk_key_encoding") diff --git a/tests/test_config.py b/tests/test_config.py index a758378dc7..7a62001264 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -51,6 +51,7 @@ def test_config_defaults_set() -> None: == [ { "default_zarr_format": 3, + "data_type_resolution": "compatible", "array": { "order": "C", "write_empty_chunks": False, diff --git a/tests/test_dtype/conftest.py b/tests/test_dtype/conftest.py index 4c585bfdf6..47d9638b43 100644 --- a/tests/test_dtype/conftest.py +++ b/tests/test_dtype/conftest.py @@ -11,7 +11,7 @@ from zarr.core.dtype.wrapper import ZDType zdtype_examples: tuple[ZDType[Any, Any], ...] = () -for wrapper_cls in data_type_registry.contents.values(): +for wrapper_cls in data_type_registry.values(): if wrapper_cls is Struct: with warnings.catch_warnings(): warnings.simplefilter("ignore") diff --git a/tests/test_dtype_registry.py b/tests/test_dtype_registry.py index f0946014fc..e13ca2467d 100644 --- a/tests/test_dtype_registry.py +++ b/tests/test_dtype_registry.py @@ -13,11 +13,21 @@ TBaseDType, TBaseScalar, get_data_type_from_json, + load_data_type_entrypoints, + match_dtype, + match_json, + register_data_type, + unregister_data_type, ) from zarr.core.dtype.common import unpack_dtype_json from zarr.dtype import ( # type: ignore[attr-defined] Bool, FixedLengthUTF32, + Int16, + RawBytes, + Struct, + UInt8, + VariableLengthBytes, VariableLengthUTF8, ZDType, data_type_registry, @@ -33,7 +43,7 @@ @pytest.fixture def data_type_registry_fixture() -> DataTypeRegistry: - return DataTypeRegistry() + return {} class TestRegistry: @@ -42,23 +52,25 @@ def test_register(data_type_registry_fixture: DataTypeRegistry) -> None: """ Test that registering a dtype in a data type registry works. 
""" - data_type_registry_fixture.register(Bool._zarr_v3_name, Bool) - assert data_type_registry_fixture.get(Bool._zarr_v3_name) == Bool - assert isinstance(data_type_registry_fixture.match_dtype(np.dtype("bool")), Bool) + register_data_type(Bool, registry=data_type_registry_fixture) + assert data_type_registry_fixture[Bool._zarr_v3_name] == Bool + assert isinstance(match_dtype(np.dtype("bool"), registry=data_type_registry_fixture), Bool) @staticmethod def test_override(data_type_registry_fixture: DataTypeRegistry) -> None: """ Test that registering a new dtype with the same name works (overriding the previous one). """ - data_type_registry_fixture.register(Bool._zarr_v3_name, Bool) + register_data_type(Bool, registry=data_type_registry_fixture) class NewBool(Bool): def default_scalar(self) -> np.bool_: return np.True_ - data_type_registry_fixture.register(NewBool._zarr_v3_name, NewBool) - assert isinstance(data_type_registry_fixture.match_dtype(np.dtype("bool")), NewBool) + register_data_type(NewBool, registry=data_type_registry_fixture) + assert isinstance( + match_dtype(np.dtype("bool"), registry=data_type_registry_fixture), NewBool + ) @staticmethod @pytest.mark.parametrize( @@ -72,17 +84,19 @@ def test_match_dtype( """ Test that match_dtype resolves a numpy dtype into an instance of the correspond wrapper for that dtype. 
""" - data_type_registry_fixture.register(wrapper_cls._zarr_v3_name, wrapper_cls) - assert isinstance(data_type_registry_fixture.match_dtype(np.dtype(dtype_str)), wrapper_cls) + register_data_type(wrapper_cls, registry=data_type_registry_fixture) + assert isinstance( + match_dtype(np.dtype(dtype_str), registry=data_type_registry_fixture), wrapper_cls + ) @staticmethod def test_match_dtype_string_na_object_error( data_type_registry_fixture: DataTypeRegistry, ) -> None: - data_type_registry_fixture.register(VariableLengthUTF8._zarr_v3_name, VariableLengthUTF8) # type: ignore[arg-type] + register_data_type(VariableLengthUTF8, registry=data_type_registry_fixture) # type: ignore[arg-type] dtype: np.dtype[Any] = np.dtypes.StringDType(na_object=None) with pytest.raises(ValueError, match=r"Zarr data type resolution from StringDType.*failed"): - data_type_registry_fixture.match_dtype(dtype) + match_dtype(dtype, registry=data_type_registry_fixture) @staticmethod def test_unregistered_dtype(data_type_registry_fixture: DataTypeRegistry) -> None: @@ -93,10 +107,10 @@ def test_unregistered_dtype(data_type_registry_fixture: DataTypeRegistry) -> Non outside_dtype = np.dtype(outside_dtype_name) msg = f"No Zarr data type found that matches dtype '{outside_dtype!r}'" with pytest.raises(ValueError, match=re.escape(msg)): - data_type_registry_fixture.match_dtype(outside_dtype) + match_dtype(outside_dtype, registry=data_type_registry_fixture) with pytest.raises(KeyError): - data_type_registry_fixture.get(outside_dtype_name) + data_type_registry_fixture[outside_dtype_name] @staticmethod @pytest.mark.filterwarnings("ignore::zarr.core.dtype.common.UnstableSpecificationWarning") @@ -106,7 +120,7 @@ def test_registered_dtypes_match_dtype(zdtype: ZDType[TBaseDType, TBaseScalar]) Test that the registered dtypes can be retrieved from the registry. 
""" skip_object_dtype(zdtype) - assert data_type_registry.match_dtype(zdtype.to_native_dtype()) == zdtype + assert match_dtype(zdtype.to_native_dtype()) == zdtype @staticmethod @pytest.mark.filterwarnings("ignore::zarr.core.dtype.common.UnstableSpecificationWarning") @@ -115,10 +129,7 @@ def test_registered_dtypes_match_json( zdtype: ZDType[TBaseDType, TBaseScalar], zarr_format: ZarrFormat ) -> None: assert ( - data_type_registry.match_json( - zdtype.to_json(zarr_format=zarr_format), zarr_format=zarr_format - ) - == zdtype + match_json(zdtype.to_json(zarr_format=zarr_format), zarr_format=zarr_format) == zdtype ) @staticmethod @@ -137,29 +148,29 @@ def test_match_dtype_unique( skip_object_dtype(zdtype) for _cls in get_args(AnyDType): if _cls is not type(zdtype): - data_type_registry_fixture.register(_cls._zarr_v3_name, _cls) + register_data_type(_cls, registry=data_type_registry_fixture) dtype_instance = zdtype.to_native_dtype() msg = f"No Zarr data type found that matches dtype '{dtype_instance!r}'" with pytest.raises(ValueError, match=re.escape(msg)): - data_type_registry_fixture.match_dtype(dtype_instance) + match_dtype(dtype_instance, registry=data_type_registry_fixture) instance_dict = zdtype.to_json(zarr_format=zarr_format) msg = f"No Zarr data type found that matches {instance_dict!r}" with pytest.raises(ValueError, match=re.escape(msg)): - data_type_registry_fixture.match_json(instance_dict, zarr_format=zarr_format) + match_json(instance_dict, zarr_format=zarr_format, registry=data_type_registry_fixture) @pytest.mark.usefixtures("set_path") def test_entrypoint_dtype(zarr_format: ZarrFormat) -> None: from package_with_entrypoint import TestDataType - data_type_registry._lazy_load() + load_data_type_entrypoints() instance = TestDataType() dtype_json = instance.to_json(zarr_format=zarr_format) assert get_data_type_from_json(dtype_json, zarr_format=zarr_format) == instance - data_type_registry.unregister(TestDataType._zarr_v3_name) + 
unregister_data_type(TestDataType._zarr_v3_name) @pytest.mark.filterwarnings("ignore::zarr.core.dtype.common.UnstableSpecificationWarning") @@ -215,3 +226,101 @@ def test_parse_data_type( else: observed = dtype_parser_func(dtype_spec, zarr_format=zarr_format) assert observed == data_type + + +# The complete alias surface across every built-in data type. Aliases are alternative Zarr V3 +# names accepted on input; the canonical ``_zarr_v3_name`` is always what gets written out. Most +# data types have no aliases (an empty ``_aliases``). Update this mapping when adding aliases. +EXPECTED_ALIASES: dict[str, tuple[str, ...]] = { + "string": ("str",), + "variable_length_bytes": ("bytes",), + "struct": ("structured",), +} + + +def test_alias_surface_is_complete() -> None: + """Every registered data type's declared aliases match the documented alias surface.""" + observed = { + cls._zarr_v3_name: cls._aliases for cls in data_type_registry.values() if cls._aliases + } + assert observed == EXPECTED_ALIASES + + +@pytest.mark.filterwarnings("ignore::zarr.core.dtype.common.UnstableSpecificationWarning") +@pytest.mark.parametrize( + ("alias", "expected_cls"), + [ + ("str", VariableLengthUTF8), + ("string", VariableLengthUTF8), + ("bytes", VariableLengthBytes), + ], +) +def test_v3_name_alias_resolves(alias: str, expected_cls: type[ZDType[Any, Any]]) -> None: + """A Zarr V3 name alias resolves to its data type, and the data type writes the canonical name.""" + resolved = get_data_type_from_json(alias, zarr_format=3) + assert isinstance(resolved, expected_cls) + # Aliases are input-only: serialization always emits the canonical name, never the alias. 
+ assert resolved.to_json(zarr_format=3) == expected_cls._zarr_v3_name + assert resolved.to_json(zarr_format=3) != alias or alias == expected_cls._zarr_v3_name + + +@pytest.mark.filterwarnings("ignore::zarr.core.dtype.common.UnstableSpecificationWarning") +def test_v3_name_alias_parametrized() -> None: + """The ``structured`` legacy name resolves to ``Struct``, which writes ``struct``.""" + data = {"name": "structured", "configuration": {"fields": [["a", "int8"]]}} + resolved = get_data_type_from_json(data, zarr_format=3) + assert isinstance(resolved, Struct) + assert resolved.to_json(zarr_format=3)["name"] == "struct" + + +@pytest.mark.filterwarnings("ignore::zarr.core.dtype.common.UnstableSpecificationWarning") +@pytest.mark.parametrize(("spec_name", "length"), [("r8", 1), ("r16", 2), ("r64", 8)]) +def test_raw_bytes_v3_spec_name(spec_name: str, length: int) -> None: + """The Zarr V3 core spec raw name ``r<N>`` (N bits) is accepted on input as RawBytes.""" + resolved = get_data_type_from_json(spec_name, zarr_format=3) + assert isinstance(resolved, RawBytes) + assert resolved.length == length + # Aliases are input-only: RawBytes always writes its canonical configuration form. + assert resolved.to_json(zarr_format=3) == { + "name": "raw_bytes", + "configuration": {"length_bytes": length}, + } + + +@pytest.mark.parametrize("bad_name", ["r12", "r0", "r", "rabc", "r8x"]) +def test_raw_bytes_v3_spec_name_invalid(bad_name: str) -> None: + """Raw spec names with a non-positive or non-multiple-of-8 bit count do not resolve.""" + with pytest.raises(ValueError, match="No Zarr data type found"): + get_data_type_from_json(bad_name, zarr_format=3) + + +@pytest.mark.parametrize("mode", ["strict", "compatible"]) +def test_v2_wrong_but_parsable_typestring(mode: str) -> None: + """ + A wrong-but-parsable Zarr V2 type string (here ``">u1"``, which NumPy accepts but normalizes + to ``"|u1"``) is interpreted in compatibility mode and rejected in strict mode.
+ """ + from zarr.core.config import config + + data = {"name": ">u1", "object_codec_id": None} + with config.set({"data_type_resolution": mode}): + if mode == "compatible": + assert isinstance(get_data_type_from_json(data, zarr_format=2), UInt8) + else: + with pytest.raises(ValueError, match="not spec-compliant"): + get_data_type_from_json(data, zarr_format=2) + + +@pytest.mark.parametrize("mode", ["strict", "compatible"]) +def test_v2_canonical_typestring_both_modes(mode: str) -> None: + """Canonical Zarr V2 type strings resolve in both modes, including legitimate byte order.""" + from zarr.core.config import config + + with config.set({"data_type_resolution": mode}): + assert isinstance( + get_data_type_from_json({"name": "|u1", "object_codec_id": None}, zarr_format=2), UInt8 + ) + # A byte-order prefix on a multibyte type is canonical, not "wrong-but-parsable". + assert isinstance( + get_data_type_from_json({"name": ">i2", "object_codec_id": None}, zarr_format=2), Int16 + )