Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions changes/3285.feature.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
JSON metadata validation now delegates to ``msgspec.convert`` for the type
coercions it supports (``Literal`` membership, ``int`` / ``bool`` strictness,
list-to-tuple), replacing the per-field hand-written ``parse_*`` logic. A small
fallback validates the recursive JSON values msgspec cannot, now with an
explicit nesting-depth limit, and a latent generator-exhaustion bug in
``parse_storage_transformers`` is fixed. See #3285.
2 changes: 2 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ dependencies = [
'google-crc32c>=1.5',
'typing_extensions>=4.14',
'donfig>=0.8',
'msgspec>=0.19',
]

dynamic = [
Expand Down Expand Up @@ -271,6 +272,7 @@ extra-dependencies = [
'typing_extensions==4.14.*',
'donfig==0.8.*',
'obstore==0.5.*',
'msgspec==0.19.*',
]

[tool.hatch.envs.default]
Expand Down
31 changes: 17 additions & 14 deletions src/zarr/codecs/blosc.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,27 +104,30 @@ class BloscCname(metaclass=_DeprecatedStrEnumMeta):


def parse_typesize(data: JSON) -> int:
if isinstance(data, int):
if data > 0:
return data
else:
raise ValueError(
f"Value must be greater than 0. Got {data}, which is less or equal to 0."
)
raise TypeError(f"Value must be an int. Got {type(data)} instead.")
from zarr.core.json_parse import parse_field

parsed: int = parse_field(data, int, "typesize", error=TypeError)
if parsed > 0:
return parsed
else:
raise ValueError(
f"Value must be greater than 0. Got {parsed}, which is less or equal to 0."
)


# todo: real validation
def parse_clevel(data: JSON) -> int:
if isinstance(data, int):
return data
raise TypeError(f"Value should be an int. Got {type(data)} instead.")
from zarr.core.json_parse import parse_field

parsed: int = parse_field(data, int, "clevel", error=TypeError)
return parsed


def parse_blocksize(data: JSON) -> int:
if isinstance(data, int):
return data
raise TypeError(f"Value should be an int. Got {type(data)} instead.")
from zarr.core.json_parse import parse_field

parsed: int = parse_field(data, int, "blocksize", error=TypeError)
return parsed


def _parse_cname(data: object) -> BloscCnameLiteral:
Expand Down
11 changes: 6 additions & 5 deletions src/zarr/codecs/gzip.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,13 +19,14 @@


def parse_gzip_level(data: JSON) -> int:
if not isinstance(data, (int)):
raise TypeError(f"Expected int, got {type(data)}")
if data not in range(10):
from zarr.core.json_parse import parse_field

parsed: int = parse_field(data, int, "level", error=TypeError)
if parsed not in range(10):
raise ValueError(
f"Expected an integer from the inclusive range (0, 9). Got {data} instead."
f"Expected an integer from the inclusive range (0, 9). Got {parsed} instead."
)
return data
return parsed


@dataclass(frozen=True)
Expand Down
18 changes: 10 additions & 8 deletions src/zarr/codecs/zstd.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,17 +21,19 @@


def parse_zstd_level(data: JSON) -> int:
if isinstance(data, int):
if data >= 23:
raise ValueError(f"Value must be less than or equal to 22. Got {data} instead.")
return data
raise TypeError(f"Got value with type {type(data)}, but expected an int.")
from zarr.core.json_parse import parse_field

parsed: int = parse_field(data, int, "level", error=TypeError)
if parsed >= 23:
raise ValueError(f"Value must be less than or equal to 22. Got {parsed} instead.")
return parsed


def parse_checksum(data: JSON) -> bool:
if isinstance(data, bool):
return data
raise TypeError(f"Expected bool. Got {type(data)}.")
from zarr.core.json_parse import parse_field

parsed: bool = parse_field(data, bool, "checksum", error=TypeError)
return parsed


@dataclass(frozen=True)
Expand Down
6 changes: 3 additions & 3 deletions src/zarr/core/chunk_key_encodings.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,9 @@


def parse_separator(data: JSON) -> SeparatorLiteral:
if data not in (".", "/"):
raise ValueError(f"Expected an '.' or '/' separator. Got {data} instead.")
return cast("SeparatorLiteral", data)
from zarr.core.json_parse import parse_field

return cast("SeparatorLiteral", parse_field(data, Literal[".", "/"], "separator"))


class ChunkKeyEncodingParams(TypedDict):
Expand Down
27 changes: 15 additions & 12 deletions src/zarr/core/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,12 +124,15 @@ def parse_enum[E: Enum](data: object, cls: type[E]) -> E:


def parse_name(data: JSON, expected: str | None = None) -> str:
if isinstance(data, str):
if expected is None or data == expected:
return data
raise ValueError(f"Expected '{expected}'. Got {data} instead.")
else:
raise TypeError(f"Expected a string, got an instance of {type(data)}.")
from zarr.core.json_parse import convert

try:
data = cast("str", convert(data, str))
except (ValueError, TypeError) as exc:
raise TypeError(f"Expected a string, got an instance of {type(data)}.") from exc
if expected is None or data == expected:
return data
raise ValueError(f"Expected '{expected}'. Got {data} instead.")


def parse_configuration(data: JSON) -> JSON:
Expand Down Expand Up @@ -204,15 +207,15 @@ def parse_fill_value(data: Any) -> Any:


def parse_order(data: Any) -> Literal["C", "F"]:
if data in ("C", "F"):
return cast("Literal['C', 'F']", data)
raise ValueError(f"Expected one of ('C', 'F'), got {data} instead.")
from zarr.core.json_parse import parse_field

return cast("Literal['C', 'F']", parse_field(data, Literal["C", "F"], "order"))


def parse_bool(data: Any) -> bool:
if isinstance(data, bool):
return data
raise ValueError(f"Expected bool, got {data} instead.")
from zarr.core.json_parse import convert

return cast("bool", convert(data, bool))


def parse_int(data: Any) -> int:
Expand Down
7 changes: 3 additions & 4 deletions src/zarr/core/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -151,7 +151,6 @@ def enable_gpu(self) -> ConfigSet:


def parse_indexing_order(data: Any) -> Literal["C", "F"]:
if data in ("C", "F"):
return cast("Literal['C', 'F']", data)
msg = f"Expected one of ('C', 'F'), got {data} instead."
raise ValueError(msg)
from zarr.core.json_parse import parse_field

return cast("Literal['C', 'F']", parse_field(data, Literal["C", "F"], "order"))
17 changes: 9 additions & 8 deletions src/zarr/core/group.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,18 +85,19 @@

def parse_zarr_format(data: Any) -> ZarrFormat:
"""Parse the zarr_format field from metadata."""
if data in (2, 3):
return cast("ZarrFormat", data)
msg = f"Invalid zarr_format. Expected one of 2 or 3. Got {data}."
raise ValueError(msg)
from zarr.core.json_parse import parse_field

return cast("ZarrFormat", parse_field(data, Literal[2, 3], "zarr_format"))


def parse_node_type(data: Any) -> NodeType:
"""Parse the node_type field from metadata."""
if data in ("array", "group"):
return cast("Literal['array', 'group']", data)
msg = f"Invalid value for 'node_type'. Expected 'array' or 'group'. Got '{data}'."
raise MetadataValidationError(msg)
from zarr.core.json_parse import parse_field

return cast(
"Literal['array', 'group']",
parse_field(data, Literal["array", "group"], "node_type", error=MetadataValidationError),
)


# todo: convert None to empty dict
Expand Down
91 changes: 91 additions & 0 deletions src/zarr/core/json_parse.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
"""Helpers for validating JSON-decoded metadata.

Most JSON metadata validation is delegated to :func:`msgspec.convert`, which
handles the type coercions Zarr needs (``Literal`` membership, ``int``/``bool``
strictness, list-to-tuple, ``TypedDict`` with ``NotRequired``). :func:`convert`
is a thin wrapper that translates :class:`msgspec.ValidationError` into a plain
``ValueError``; :func:`parse_field` then re-raises that as the exception type
each caller expects, with the field name attached.

msgspec cannot handle two things in Zarr's metadata types:

* the recursive ``JSON`` / ``JSONValue`` aliases, which it rejects at
schema-build time, and
* PEP 728 ``extra_items=`` extension fields, which it silently drops.

:func:`validate_json_value` is the small hand-written fallback for the first of
those. See https://github.com/zarr-developers/zarr-python/issues/3285.
"""

from __future__ import annotations

from collections.abc import Mapping
from typing import TYPE_CHECKING, Any, Final, cast

import msgspec

if TYPE_CHECKING:
from zarr.core.common import JSON

# Names exported by a star-import of this module.
__all__ = ["MAX_JSON_DEPTH", "convert", "parse_field", "validate_json_value"]

# Default for the ``max_depth`` parameter of ``validate_json_value``. That
# function counts the top-level value as depth 0 and rejects a value only once
# its depth is strictly greater than this limit.
MAX_JSON_DEPTH: Final = 64
"""Maximum nesting depth accepted by :func:`validate_json_value`."""


def _type_name(type_: Any) -> str:
return getattr(type_, "__name__", None) or str(type_).replace("typing.", "")


def convert(value: object, type_: Any, *, strict: bool = True) -> Any:
    """Coerce ``value`` to ``type_`` with :func:`msgspec.convert`.

    ``strict`` is forwarded to msgspec unchanged. When msgspec rejects the
    value with :class:`msgspec.ValidationError`, a ``ValueError`` naming the
    expected type and the offending value is raised instead, chained to the
    msgspec error. The message carries no field name; callers that want field
    context add it themselves (see :func:`parse_field`).
    """
    try:
        converted = msgspec.convert(value, type_, strict=strict)
    except msgspec.ValidationError as cause:
        expected = _type_name(type_)
        raise ValueError(f"Expected instance of {expected}, got {value!r}.") from cause
    return converted


def parse_field(
    data: object, type_: Any, field: str, *, error: type[Exception] = ValueError
) -> Any:
    """Validate the metadata field ``field`` by converting ``data`` to ``type_``.

    Delegates to :func:`convert` and returns its result. If :func:`convert`
    raises ``ValueError``, an instance of ``error`` mentioning ``field`` is
    raised instead, chained to the original exception. Centralising this
    keeps the convert-then-re-raise pattern out of every per-field parser.
    """
    try:
        parsed = convert(data, type_)
    except ValueError as cause:
        message = f"Failed to parse input for {field!r}."
        raise error(message) from cause
    return parsed


def validate_json_value(value: object, *, max_depth: int = MAX_JSON_DEPTH, _depth: int = 0) -> JSON:
    """Verify that ``value`` is JSON-compatible and return the same object.

    Accepted values are ``None``, ``bool``, ``int``, ``float``, ``str``,
    lists/tuples of accepted values, and mappings with ``str`` keys and
    accepted values. Containers are checked recursively; ``_depth`` is the
    current recursion level and is not meant to be passed by callers.

    Raises ``ValueError`` when ``_depth`` exceeds ``max_depth``, and
    ``TypeError`` for a non-``str`` mapping key or any other unsupported value.
    """
    if _depth > max_depth:
        raise ValueError(f"JSON value nesting exceeds the maximum depth of {max_depth}.")

    # Scalars need no further inspection.
    is_scalar = value is None or isinstance(value, (bool, int, float, str))
    if is_scalar:
        return cast("JSON", value)

    child_depth = _depth + 1
    if isinstance(value, (list, tuple)):
        for element in value:
            validate_json_value(element, max_depth=max_depth, _depth=child_depth)
    elif isinstance(value, Mapping):
        for name, member in value.items():
            # Keys are checked before their values, entry by entry.
            if not isinstance(name, str):
                raise TypeError(f"JSON object keys must be str, got {type(name).__name__}.")
            validate_json_value(member, max_depth=max_depth, _depth=child_depth)
    else:
        raise TypeError(f"Value {value!r} is not a valid JSON value.")
    return cast("JSON", value)
8 changes: 5 additions & 3 deletions src/zarr/core/metadata/v2.py
Original file line number Diff line number Diff line change
Expand Up @@ -278,9 +278,11 @@ def parse_dtype(data: npt.DTypeLike) -> np.dtype[Any]:


def parse_zarr_format(data: object) -> Literal[2]:
if data == 2:
return 2
raise ValueError(f"Invalid value. Expected 2. Got {data}.")
from typing import Literal

from zarr.core.json_parse import parse_field

return cast("Literal[2]", parse_field(data, Literal[2], "zarr_format"))


def parse_filters(data: object) -> tuple[Numcodec, ...] | None:
Expand Down
34 changes: 20 additions & 14 deletions src/zarr/core/metadata/v3.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,17 +47,20 @@


def parse_zarr_format(data: object) -> Literal[3]:
if data == 3:
return 3
msg = f"Invalid value for 'zarr_format'. Expected '3'. Got '{data}'."
raise MetadataValidationError(msg)
from zarr.core.json_parse import parse_field

return cast(
"Literal[3]", parse_field(data, Literal[3], "zarr_format", error=MetadataValidationError)
)


def parse_node_type_array(data: object) -> Literal["array"]:
if data == "array":
return "array"
msg = f"Invalid value for 'node_type'. Expected 'array'. Got '{data}'."
raise NodeTypeValidationError(msg)
from zarr.core.json_parse import parse_field

return cast(
'Literal["array"]',
parse_field(data, Literal["array"], "node_type", error=NodeTypeValidationError),
)


def parse_codecs(data: object) -> tuple[Codec, ...]:
Expand Down Expand Up @@ -130,11 +133,12 @@ def parse_storage_transformers(data: object) -> tuple[dict[str, JSON], ...]:
"""
if data is None:
return ()
if isinstance(data, Iterable):
if len(tuple(data)) >= 1:
return data # type: ignore[return-value]
else:
return ()
if isinstance(data, Iterable) and not isinstance(data, (str, bytes)):
# Materialise once. The previous implementation called ``len(tuple(data))``
# and then returned ``data`` itself, which exhausted (and discarded) a
# one-shot iterable and could return a value typed as a tuple that was not
# actually a tuple.
return tuple(data)
raise TypeError(
f"Invalid storage_transformers. Expected an iterable of dicts. Got {type(data)} instead."
)
Expand Down Expand Up @@ -610,6 +614,8 @@ def to_buffer_dict(self, prototype: BufferPrototype) -> dict[str, Buffer]:

@classmethod
def from_dict(cls, data: dict[str, JSON]) -> Self:
from zarr.core.json_parse import validate_json_value

# make a copy because we are modifying the dict
_data = data.copy()

Expand Down Expand Up @@ -656,7 +662,7 @@ def from_dict(cls, data: dict[str, JSON]) -> Self:
chunk_grid=_data_typed["chunk_grid"], # type: ignore[arg-type]
chunk_key_encoding=_data_typed["chunk_key_encoding"], # type: ignore[arg-type]
codecs=_data_typed["codecs"],
attributes=_data_typed.get("attributes", {}), # type: ignore[arg-type]
attributes=validate_json_value(_data_typed.get("attributes", {})), # type: ignore[arg-type]
dimension_names=_data_typed.get("dimension_names", None),
fill_value=fill_value_parsed,
data_type=data_type,
Expand Down
Loading
Loading